Re: [Ext2-devel] [RFC] Adding multiple block allocation

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, Apr 28, 2005 at 12:14:24PM -0700, Mingming Cao wrote:
> Currently ext3_get_block()/ext3_new_block() only allocate one block at a
> time.  To allocate multiple blocks, the caller (for example, the ext3
> direct IO routine) has to invoke ext3_get_block() many times.  This is
> quite inefficient for sequential IO workloads.
> 
> The benefits of a real get_blocks() include:
> 1) Increases the possibility of getting contiguous blocks and reduces the
> possibility of fragmentation due to interleaved allocations from other
> threads (should be good for the non-reservation case).
> 2) Reduces CPU cycles spent in repeated get_block() calls.
> 3) Batches metadata updates and journaling in one shot.
> 4) Could possibly speed up future get_blocks() lookups by caching the last
> mapped blocks in the inode.
> 

And here is the patch to make mpage_writepages use get_blocks() for
multiple block lookup/allocation. It performs a radix-tree lookup of
contiguous pages and issues a single get_blocks call for the whole range.
It maintains an mpageio structure to track intermediate mapping state,
somewhat like the DIO code.

It does need some more testing, especially for the block_size < PAGE_SIZE
case. The JFS workaround can be dropped if the JFS get_blocks fix from
Dave Kleikamp is integrated.

Review feedback would be welcome.

Mingming,
Let me know if you have a chance to try this out with your patch.

Regards
Suparna

-- 
Suparna Bhattacharya ([email protected])
Linux Technology Center
IBM Software Lab, India


diff -urp -X dontdiff linux-2.6.12-rc3/fs/buffer.c linux-2.6.12-rc3-getblocks/fs/buffer.c
--- linux-2.6.12-rc3/fs/buffer.c	2005-04-21 05:33:15.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/fs/buffer.c	2005-04-22 15:08:33.000000000 +0530
@@ -2514,53 +2514,10 @@ EXPORT_SYMBOL(nobh_commit_write);
  * that it tries to operate without attaching bufferheads to
  * the page.
  */
-int nobh_writepage(struct page *page, get_block_t *get_block,
-			struct writeback_control *wbc)
+int nobh_writepage(struct page *page, get_blocks_t *get_blocks,
+		struct writeback_control *wbc, writepage_t bh_writepage_fn)
 {
-	struct inode * const inode = page->mapping->host;
-	loff_t i_size = i_size_read(inode);
-	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
-	unsigned offset;
-	void *kaddr;
-	int ret;
-
-	/* Is the page fully inside i_size? */
-	if (page->index < end_index)
-		goto out;
-
-	/* Is the page fully outside i_size? (truncate in progress) */
-	offset = i_size & (PAGE_CACHE_SIZE-1);
-	if (page->index >= end_index+1 || !offset) {
-		/*
-		 * The page may have dirty, unmapped buffers.  For example,
-		 * they may have been added in ext3_writepage().  Make them
-		 * freeable here, so the page does not leak.
-		 */
-#if 0
-		/* Not really sure about this  - do we need this ? */
-		if (page->mapping->a_ops->invalidatepage)
-			page->mapping->a_ops->invalidatepage(page, offset);
-#endif
-		unlock_page(page);
-		return 0; /* don't care */
-	}
-
-	/*
-	 * The page straddles i_size.  It must be zeroed out on each and every
-	 * writepage invocation because it may be mmapped.  "A file is mapped
-	 * in multiples of the page size.  For a file that is not a multiple of
-	 * the  page size, the remaining memory is zeroed when mapped, and
-	 * writes to that region are not written out to the file."
-	 */
-	kaddr = kmap_atomic(page, KM_USER0);
-	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
-	flush_dcache_page(page);
-	kunmap_atomic(kaddr, KM_USER0);
-out:
-	ret = mpage_writepage(page, get_block, wbc);
-	if (ret == -EAGAIN)
-		ret = __block_write_full_page(inode, page, get_block, wbc);
-	return ret;
+	return mpage_writepage(page, get_blocks, wbc, bh_writepage_fn);
 }
 EXPORT_SYMBOL(nobh_writepage);
 
diff -urp -X dontdiff linux-2.6.12-rc3/fs/ext2/inode.c linux-2.6.12-rc3-getblocks/fs/ext2/inode.c
--- linux-2.6.12-rc3/fs/ext2/inode.c	2005-04-21 05:33:15.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/fs/ext2/inode.c	2005-04-22 16:30:42.000000000 +0530
@@ -639,12 +639,6 @@ ext2_nobh_prepare_write(struct file *fil
 	return nobh_prepare_write(page,from,to,ext2_get_block);
 }
 
-static int ext2_nobh_writepage(struct page *page,
-			struct writeback_control *wbc)
-{
-	return nobh_writepage(page, ext2_get_block, wbc);
-}
-
 static sector_t ext2_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping,block,ext2_get_block);
@@ -662,6 +656,12 @@ ext2_get_blocks(struct inode *inode, sec
 	return ret;
 }
 
+static int ext2_nobh_writepage(struct page *page,
+			struct writeback_control *wbc)
+{
+	return nobh_writepage(page, ext2_get_blocks, wbc, ext2_writepage);
+}
+
 static ssize_t
 ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 			loff_t offset, unsigned long nr_segs)
@@ -676,7 +676,8 @@ ext2_direct_IO(int rw, struct kiocb *ioc
 static int
 ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
-	return mpage_writepages(mapping, wbc, ext2_get_block);
+        return __mpage_writepages(mapping, wbc, ext2_get_blocks,
+					ext2_writepage);
 }
 
 struct address_space_operations ext2_aops = {
diff -urp -X dontdiff linux-2.6.12-rc3/fs/ext3/inode.c linux-2.6.12-rc3-getblocks/fs/ext3/inode.c
--- linux-2.6.12-rc3/fs/ext3/inode.c	2005-04-21 05:33:15.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/fs/ext3/inode.c	2005-04-22 15:08:33.000000000 +0530
@@ -866,10 +866,10 @@ get_block:
 	return ret;
 }
 
-static int ext3_writepages_get_block(struct inode *inode, sector_t iblock,
-			struct buffer_head *bh, int create)
+static int ext3_writepages_get_blocks(struct inode *inode, sector_t iblock,
+		unsigned long max_blocks, struct buffer_head *bh, int create)
 {
-	return ext3_direct_io_get_blocks(inode, iblock, 1, bh, create);
+	return ext3_direct_io_get_blocks(inode, iblock, max_blocks, bh, create);
 }
 
 /*
@@ -1369,11 +1369,11 @@ ext3_writeback_writepages(struct address
 		return ret;
 	}
 
-        ret = __mpage_writepages(mapping, wbc, ext3_writepages_get_block,
+        ret = __mpage_writepages(mapping, wbc, ext3_writepages_get_blocks,
 					ext3_writeback_writepage_helper);
 
 	/*
-	 * Need to reaquire the handle since ext3_writepages_get_block()
+	 * Need to reacquire the handle since ext3_writepages_get_blocks()
 	 * can restart the handle
 	 */
 	handle = journal_current_handle();
@@ -1402,7 +1402,8 @@ static int ext3_writeback_writepage(stru
 	}
 
 	if (test_opt(inode->i_sb, NOBH))
-		ret = nobh_writepage(page, ext3_get_block, wbc);
+		ret = nobh_writepage(page, ext3_writepages_get_blocks, wbc,
+			ext3_writeback_writepage_helper);
 	else
 		ret = block_write_full_page(page, ext3_get_block, wbc);
 
diff -urp -X dontdiff linux-2.6.12-rc3/fs/ext3/super.c linux-2.6.12-rc3-getblocks/fs/ext3/super.c
--- linux-2.6.12-rc3/fs/ext3/super.c	2005-04-21 05:33:15.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/fs/ext3/super.c	2005-04-22 15:08:33.000000000 +0530
@@ -1321,6 +1321,7 @@ static int ext3_fill_super (struct super
 	sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
 
 	set_opt(sbi->s_mount_opt, RESERVATION);
+	set_opt(sbi->s_mount_opt, NOBH); /* temp: set nobh default */
 
 	if (!parse_options ((char *) data, sb, &journal_inum, NULL, 0))
 		goto failed_mount;
@@ -1567,6 +1568,7 @@ static int ext3_fill_super (struct super
 			printk(KERN_ERR "EXT3-fs: Journal does not support "
 			       "requested data journaling mode\n");
 			goto failed_mount3;
+		set_opt(sbi->s_mount_opt, NOBH); /* temp: set nobh default */
 		}
 	default:
 		break;
@@ -1584,6 +1586,7 @@ static int ext3_fill_super (struct super
 				"its supported only with writeback mode\n");
 			clear_opt(sbi->s_mount_opt, NOBH);
 		}
+		printk("NOBH option set\n");
 	}
 	/*
 	 * The journal_load will have done any necessary log recovery,
diff -urp -X dontdiff linux-2.6.12-rc3/fs/hfs/inode.c linux-2.6.12-rc3-getblocks/fs/hfs/inode.c
--- linux-2.6.12-rc3/fs/hfs/inode.c	2005-04-21 05:33:15.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/fs/hfs/inode.c	2005-04-22 15:08:33.000000000 +0530
@@ -124,7 +124,7 @@ static ssize_t hfs_direct_IO(int rw, str
 static int hfs_writepages(struct address_space *mapping,
 			  struct writeback_control *wbc)
 {
-	return mpage_writepages(mapping, wbc, hfs_get_block);
+	return mpage_writepages(mapping, wbc, hfs_get_blocks);
 }
 
 struct address_space_operations hfs_btree_aops = {
diff -urp -X dontdiff linux-2.6.12-rc3/fs/hfsplus/inode.c linux-2.6.12-rc3-getblocks/fs/hfsplus/inode.c
--- linux-2.6.12-rc3/fs/hfsplus/inode.c	2005-04-21 05:33:15.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/fs/hfsplus/inode.c	2005-04-22 15:08:33.000000000 +0530
@@ -121,7 +121,7 @@ static ssize_t hfsplus_direct_IO(int rw,
 static int hfsplus_writepages(struct address_space *mapping,
 			      struct writeback_control *wbc)
 {
-	return mpage_writepages(mapping, wbc, hfsplus_get_block);
+	return mpage_writepages(mapping, wbc, hfsplus_get_blocks);
 }
 
 struct address_space_operations hfsplus_btree_aops = {
diff -urp -X dontdiff linux-2.6.12-rc3/fs/jfs/inode.c linux-2.6.12-rc3-getblocks/fs/jfs/inode.c
--- linux-2.6.12-rc3/fs/jfs/inode.c	2005-04-21 05:33:15.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/fs/jfs/inode.c	2005-04-22 16:27:19.000000000 +0530
@@ -267,21 +267,41 @@ jfs_get_blocks(struct inode *ip, sector_
 	return rc;
 }
 
+static int
+jfs_mpage_get_blocks(struct inode *ip, sector_t lblock, unsigned long
+			max_blocks, struct buffer_head *bh_result, int create)
+{
+	/* 
+	 * FIXME: temporary workaround - return one block at a time until
+	 * we figure out why we see exposures with truncate when
+	 * allocating multiple blocks in one shot.
+	 */
+	return jfs_get_blocks(ip, lblock, 1, bh_result, create);
+}
+
 static int jfs_get_block(struct inode *ip, sector_t lblock,
 			 struct buffer_head *bh_result, int create)
 {
 	return jfs_get_blocks(ip, lblock, 1, bh_result, create);
 }
 
+static int jfs_bh_writepage(struct page *page,
+				struct writeback_control *wbc)
+{
+	return block_write_full_page(page, jfs_get_block, wbc);
+}
+
+
 static int jfs_writepage(struct page *page, struct writeback_control *wbc)
 {
-	return nobh_writepage(page, jfs_get_block, wbc);
+	return nobh_writepage(page, jfs_mpage_get_blocks, wbc, jfs_bh_writepage);
 }
 
 static int jfs_writepages(struct address_space *mapping,
 			struct writeback_control *wbc)
 {
-	return mpage_writepages(mapping, wbc, jfs_get_block);
+        return __mpage_writepages(mapping, wbc, jfs_mpage_get_blocks,
+					jfs_bh_writepage);
 }
 
 static int jfs_readpage(struct file *file, struct page *page)
diff -urp -X dontdiff linux-2.6.12-rc3/fs/mpage.c linux-2.6.12-rc3-getblocks/fs/mpage.c
--- linux-2.6.12-rc3/fs/mpage.c	2005-04-21 05:33:15.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/fs/mpage.c	2005-04-22 16:19:14.000000000 +0530
@@ -370,6 +370,67 @@ int mpage_readpage(struct page *page, ge
 }
 EXPORT_SYMBOL(mpage_readpage);
 
+struct mpageio {	/* writeback mapping state carried across pages */
+	struct bio *bio;			/* bio being built, or NULL */
+	struct buffer_head map_bh;		/* current mapped block run */
+	unsigned long block_in_file;		/* file block at map_bh start */
+	unsigned long final_block_in_request;	/* mapping stops before this */
+	sector_t block_in_bio;			/* last disk block put in bio */
+	int boundary;				/* a boundary block was seen */
+	sector_t boundary_block;		/* ... its disk block number */
+	struct block_device *boundary_bdev;	/* ... and its device */
+};
+
+/*
+ * Extends mio->map_bh with further disk-contiguous blocks from get_blocks()
+ * until the run reaches mio->final_block_in_request or spans PAGE_SIZE.
+ * Returns the total number of blocks now described by mio->map_bh.
+ */
+static unsigned long mpage_get_more_blocks(struct mpageio *mio,
+	struct inode *inode, get_blocks_t get_blocks)
+{
+	struct buffer_head map_bh = {.b_state = 0};
+	unsigned long mio_nblocks = mio->map_bh.b_size >> inode->i_blkbits;
+	unsigned long first_unmapped = mio->block_in_file + mio_nblocks;
+	unsigned long next_contig_block = mio->map_bh.b_blocknr + mio_nblocks;
+
+	while ((first_unmapped < mio->final_block_in_request) &&
+		(mio->map_bh.b_size < PAGE_SIZE)) {
+		unsigned long nr, i;
+
+		map_bh.b_state = 0;	/* no stale BH_New/BH_Boundary bits */
+		map_bh.b_size = 0;	/* no stale length from the last extent */
+		if (get_blocks(inode, first_unmapped,
+			mio->final_block_in_request - first_unmapped,
+			&map_bh, 1))
+			break;
+		nr = map_bh.b_size >> inode->i_blkbits;
+		if (!nr)	/* nothing mapped: stop rather than loop forever */
+			break;
+		if (mio_nblocks && ((map_bh.b_blocknr != next_contig_block) ||
+			map_bh.b_bdev != mio->map_bh.b_bdev))
+			break;	/* not contiguous with the run so far */
+		if (buffer_new(&map_bh))
+			for (i = 0; i < nr; i++)
+				unmap_underlying_metadata(map_bh.b_bdev,
+					map_bh.b_blocknr + i);
+		if (buffer_boundary(&map_bh)) {
+			mio->boundary = 1;
+			mio->boundary_block = map_bh.b_blocknr;
+			mio->boundary_bdev = map_bh.b_bdev;
+		}
+		if (mio_nblocks == 0) {
+			mio->map_bh.b_bdev = map_bh.b_bdev;
+			mio->map_bh.b_blocknr = map_bh.b_blocknr;
+		}
+		mio_nblocks += nr;
+		first_unmapped = mio->block_in_file + mio_nblocks;
+		next_contig_block = mio->map_bh.b_blocknr + mio_nblocks;
+		mio->map_bh.b_size += map_bh.b_size;
+	}
+	return mio_nblocks;
+}
+
 /*
  * Writing is not so simple.
  *
@@ -386,9 +447,9 @@ EXPORT_SYMBOL(mpage_readpage);
  * written, so it can intelligently allocate a suitably-sized BIO.  For now,
  * just allocate full-size (16-page) BIOs.
  */
-static struct bio *
-__mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
-	sector_t *last_block_in_bio, int *ret, struct writeback_control *wbc,
+static int
+__mpage_writepage(struct mpageio *mio, struct page *page,
+	get_blocks_t get_blocks, struct writeback_control *wbc,
 	writepage_t writepage_fn)
 {
 	struct address_space *mapping = page->mapping;
@@ -396,9 +457,8 @@ __mpage_writepage(struct bio *bio, struc
 	const unsigned blkbits = inode->i_blkbits;
 	unsigned long end_index;
 	const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
-	sector_t last_block;
+	sector_t last_block, blocks_to_skip;
 	sector_t block_in_file;
-	sector_t blocks[MAX_BUF_PER_PAGE];
 	unsigned page_block;
 	unsigned first_unmapped = blocks_per_page;
 	struct block_device *bdev = NULL;
@@ -406,8 +466,10 @@ __mpage_writepage(struct bio *bio, struc
 	sector_t boundary_block = 0;
 	struct block_device *boundary_bdev = NULL;
 	int length;
-	struct buffer_head map_bh;
 	loff_t i_size = i_size_read(inode);
+	struct buffer_head *map_bh = &mio->map_bh;
+	struct bio *bio = mio->bio;
+	int ret = 0;
 
 	if (page_has_buffers(page)) {
 		struct buffer_head *head = page_buffers(page);
@@ -435,10 +497,13 @@ __mpage_writepage(struct bio *bio, struc
 			if (!buffer_dirty(bh) || !buffer_uptodate(bh))
 				goto confused;
 			if (page_block) {
-				if (bh->b_blocknr != blocks[page_block-1] + 1)
+				if (bh->b_blocknr != map_bh->b_blocknr 
+					+ page_block)
 					goto confused;
+			} else {
+				map_bh->b_blocknr = bh->b_blocknr;
+				map_bh->b_size = PAGE_SIZE;
 			}
-			blocks[page_block++] = bh->b_blocknr;
 			boundary = buffer_boundary(bh);
 			if (boundary) {
 				boundary_block = bh->b_blocknr;
@@ -465,33 +530,30 @@ __mpage_writepage(struct bio *bio, struc
 	BUG_ON(!PageUptodate(page));
 	block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits);
 	last_block = (i_size - 1) >> blkbits;
-	map_bh.b_page = page;
-	for (page_block = 0; page_block < blocks_per_page; ) {
-
-		map_bh.b_state = 0;
-		if (get_block(inode, block_in_file, &map_bh, 1))
-			goto confused;
-		if (buffer_new(&map_bh))
-			unmap_underlying_metadata(map_bh.b_bdev,
-						map_bh.b_blocknr);
-		if (buffer_boundary(&map_bh)) {
-			boundary_block = map_bh.b_blocknr;
-			boundary_bdev = map_bh.b_bdev;
-		}
-		if (page_block) {
-			if (map_bh.b_blocknr != blocks[page_block-1] + 1)
-				goto confused;
-		}
-		blocks[page_block++] = map_bh.b_blocknr;
-		boundary = buffer_boundary(&map_bh);
-		bdev = map_bh.b_bdev;
-		if (block_in_file == last_block)
-			break;
-		block_in_file++;
+	blocks_to_skip = block_in_file - mio->block_in_file;
+	mio->block_in_file = block_in_file;
+	if (blocks_to_skip < (map_bh->b_size >> blkbits)) {
+		map_bh->b_blocknr += blocks_to_skip;
+		map_bh->b_size -= blocks_to_skip << blkbits;
+	} else {
+		map_bh->b_state = 0;
+		map_bh->b_size = 0;
+		if (mio->final_block_in_request > last_block)
+			mio->final_block_in_request = last_block;
+		mpage_get_more_blocks(mio, inode, get_blocks);
 	}
-	BUG_ON(page_block == 0);
+	if (map_bh->b_size < PAGE_SIZE)
+		goto confused;
 
-	first_unmapped = page_block;
+	if (mio->boundary && (mio->boundary_block < map_bh->b_blocknr 
+		+ blocks_per_page)) {
+		boundary = 1;
+		boundary_block = mio->boundary_block;
+		boundary_bdev = mio->boundary_bdev;
+	}
+		
+	bdev = map_bh->b_bdev;
+	first_unmapped = blocks_per_page;
 
 page_is_mapped:
 	end_index = i_size >> PAGE_CACHE_SHIFT;
@@ -518,12 +580,16 @@ page_is_mapped:
 	/*
 	 * This page will go to BIO.  Do we need to send this BIO off first?
 	 */
-	if (bio && *last_block_in_bio != blocks[0] - 1)
+	if (bio && mio->block_in_bio != map_bh->b_blocknr - 1)
 		bio = mpage_bio_submit(WRITE, bio);
 
 alloc_new:
 	if (bio == NULL) {
-		bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
+		/* 
+		 * Fixme: bio size can be limited to final_block - block, or
+		 * even mio->map_bh.b_size
+		 */
+		bio = mpage_alloc(bdev, map_bh->b_blocknr << (blkbits - 9),
 				bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH);
 		if (bio == NULL)
 			goto confused;
@@ -539,6 +605,9 @@ alloc_new:
 		bio = mpage_bio_submit(WRITE, bio);
 		goto alloc_new;
 	}
+	map_bh->b_blocknr += blocks_per_page;
+	map_bh->b_size -= PAGE_SIZE;
+	mio->block_in_file += blocks_per_page;
 
 	/*
 	 * OK, we have our BIO, so we can now mark the buffers clean.  Make
@@ -575,7 +644,8 @@ alloc_new:
 					boundary_block, 1 << blkbits);
 		}
 	} else {
-		*last_block_in_bio = blocks[blocks_per_page - 1];
+		/* we can pack more pages into the bio, don't submit yet */
+		mio->block_in_bio = map_bh->b_blocknr - 1;
 	}
 	goto out;
 
@@ -584,22 +654,23 @@ confused:
 		bio = mpage_bio_submit(WRITE, bio);
 
 	if (writepage_fn) {
-		*ret = (*writepage_fn)(page, wbc);
+		ret = (*writepage_fn)(page, wbc);
 	} else {
-		*ret = -EAGAIN;
+		ret = -EAGAIN;
 		goto out;
 	}
 	/*
 	 * The caller has a ref on the inode, so *mapping is stable
 	 */
-	if (*ret) {
-		if (*ret == -ENOSPC)
+	if (ret) {
+		if (ret == -ENOSPC)
 			set_bit(AS_ENOSPC, &mapping->flags);
 		else
 			set_bit(AS_EIO, &mapping->flags);
 	}
 out:
-	return bio;
+	mio->bio = bio;
+	return ret;
 }
 
 /**
@@ -625,20 +696,21 @@ out:
  */
 int
 mpage_writepages(struct address_space *mapping,
-		struct writeback_control *wbc, get_block_t get_block)
+		struct writeback_control *wbc, get_blocks_t get_blocks)
 {
-	return __mpage_writepages(mapping, wbc, get_block,
+	return __mpage_writepages(mapping, wbc, get_blocks,
 		mapping->a_ops->writepage);
 }
 
 int
 __mpage_writepages(struct address_space *mapping,
-		struct writeback_control *wbc, get_block_t get_block,
+		struct writeback_control *wbc, get_blocks_t get_blocks,
 		writepage_t writepage_fn)
 {
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
+	struct inode *inode = mapping->host;
+	const unsigned blkbits = inode->i_blkbits;
 	struct bio *bio = NULL;
-	sector_t last_block_in_bio = 0;
 	int ret = 0;
 	int done = 0;
 	int (*writepage)(struct page *page, struct writeback_control *wbc);
@@ -648,6 +720,9 @@ __mpage_writepages(struct address_space 
 	pgoff_t end = -1;		/* Inclusive */
 	int scanned = 0;
 	int is_range = 0;
+	struct mpageio mio = {
+		.bio = NULL
+	};
 
 	if (wbc->nonblocking && bdi_write_congested(bdi)) {
 		wbc->encountered_congestion = 1;
@@ -655,7 +730,7 @@ __mpage_writepages(struct address_space 
 	}
 
 	writepage = NULL;
-	if (get_block == NULL)
+	if (get_blocks == NULL)
 		writepage = mapping->a_ops->writepage;
 
 	pagevec_init(&pvec, 0);
@@ -672,12 +747,15 @@ __mpage_writepages(struct address_space 
 		scanned = 1;
 	}
 retry:
+	down_read(&inode->i_alloc_sem);
 	while (!done && (index <= end) &&
-			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-			PAGECACHE_TAG_DIRTY,
+			(nr_pages = pagevec_contig_lookup_tag(&pvec, mapping,
+			&index, PAGECACHE_TAG_DIRTY,
 			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
 		unsigned i;
 
+		mio.final_block_in_request = min(index, end) <<
+			(PAGE_CACHE_SHIFT - blkbits);
 		scanned = 1;
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
@@ -702,7 +780,7 @@ retry:
 				unlock_page(page);
 				continue;
 			}
-
+			
 			if (wbc->sync_mode != WB_SYNC_NONE)
 				wait_on_page_writeback(page);
 
@@ -723,9 +801,9 @@ retry:
 							&mapping->flags);
 				}
 			} else {
-				bio = __mpage_writepage(bio, page, get_block,
-						&last_block_in_bio, &ret, wbc,
-						writepage_fn);
+				ret = __mpage_writepage(&mio, page, get_blocks,
+						wbc, writepage_fn);
+				bio = mio.bio;
 			}
 			if (ret || (--(wbc->nr_to_write) <= 0))
 				done = 1;
@@ -737,6 +815,9 @@ retry:
 		pagevec_release(&pvec);
 		cond_resched();
 	}
+	
+	up_read(&inode->i_alloc_sem);
+
 	if (!scanned && !done) {
 		/*
 		 * We hit the last page and there is more work to be done: wrap
@@ -755,17 +836,23 @@ retry:
 EXPORT_SYMBOL(mpage_writepages);
 EXPORT_SYMBOL(__mpage_writepages);
 
-int mpage_writepage(struct page *page, get_block_t get_block,
-	struct writeback_control *wbc)
+int mpage_writepage(struct page *page, get_blocks_t get_blocks,
+		struct writeback_control *wbc, writepage_t writepage_fn)
 {
 	int ret = 0;
-	struct bio *bio;
-	sector_t last_block_in_bio = 0;
-
-	bio = __mpage_writepage(NULL, page, get_block,
-			&last_block_in_bio, &ret, wbc, NULL);
-	if (bio)
-		mpage_bio_submit(WRITE, bio);
+	struct address_space *mapping = page->mapping;
+	struct inode *inode = mapping->host;
+	const unsigned blkbits = inode->i_blkbits;
+	struct mpageio mio = {
+		/* single-page request: map no further than this page's end */
+		.final_block_in_request = (page->index + 1) << (PAGE_CACHE_SHIFT
+			- blkbits)
+	};
+
+	ret = __mpage_writepage(&mio, page, get_blocks, wbc, writepage_fn);
+	/* __mpage_writepage() may hand back an unsubmitted bio in mio */
+	if (mio.bio)
+		mpage_bio_submit(WRITE, mio.bio);
 
 	return ret;
 }
diff -urp -X dontdiff linux-2.6.12-rc3/include/linux/buffer_head.h linux-2.6.12-rc3-getblocks/include/linux/buffer_head.h
--- linux-2.6.12-rc3/include/linux/buffer_head.h	2005-04-21 05:33:16.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/include/linux/buffer_head.h	2005-04-22 15:08:33.000000000 +0530
@@ -203,8 +203,8 @@ int file_fsync(struct file *, struct den
 int nobh_prepare_write(struct page*, unsigned, unsigned, get_block_t*);
 int nobh_commit_write(struct file *, struct page *, unsigned, unsigned);
 int nobh_truncate_page(struct address_space *, loff_t);
-int nobh_writepage(struct page *page, get_block_t *get_block,
-                        struct writeback_control *wbc);
+int nobh_writepage(struct page *page, get_blocks_t *get_blocks,
+	struct writeback_control *wbc, writepage_t bh_writepage);
 
 
 /*
diff -urp -X dontdiff linux-2.6.12-rc3/include/linux/fs.h linux-2.6.12-rc3-getblocks/include/linux/fs.h
--- linux-2.6.12-rc3/include/linux/fs.h	2005-04-21 05:33:16.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/include/linux/fs.h	2005-04-22 15:08:33.000000000 +0530
@@ -304,6 +304,8 @@ struct address_space;
 struct writeback_control;
 struct kiocb;
 
+typedef int (writepage_t)(struct page *page, struct writeback_control *wbc);
+
 struct address_space_operations {
 	int (*writepage)(struct page *page, struct writeback_control *wbc);
 	int (*readpage)(struct file *, struct page *);
diff -urp -X dontdiff linux-2.6.12-rc3/include/linux/mpage.h linux-2.6.12-rc3-getblocks/include/linux/mpage.h
--- linux-2.6.12-rc3/include/linux/mpage.h	2005-04-21 05:33:16.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/include/linux/mpage.h	2005-04-22 15:08:33.000000000 +0530
@@ -11,17 +11,16 @@
  */
 
 struct writeback_control;
-typedef int (writepage_t)(struct page *page, struct writeback_control *wbc);
 
 int mpage_readpages(struct address_space *mapping, struct list_head *pages,
 				unsigned nr_pages, get_block_t get_block);
 int mpage_readpage(struct page *page, get_block_t get_block);
 int mpage_writepages(struct address_space *mapping,
-		struct writeback_control *wbc, get_block_t get_block);
-int mpage_writepage(struct page *page, get_block_t *get_block,
-		struct writeback_control *wbc);
+		struct writeback_control *wbc, get_blocks_t get_blocks);
+int mpage_writepage(struct page *page, get_blocks_t *get_blocks,
+		struct writeback_control *wbc, writepage_t writepage);
 int __mpage_writepages(struct address_space *mapping,
-		struct writeback_control *wbc, get_block_t get_block,
+		struct writeback_control *wbc, get_blocks_t get_blocks,
 		writepage_t writepage);
 
 static inline int
diff -urp -X dontdiff linux-2.6.12-rc3/include/linux/pagemap.h linux-2.6.12-rc3-getblocks/include/linux/pagemap.h
--- linux-2.6.12-rc3/include/linux/pagemap.h	2005-04-21 05:33:16.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/include/linux/pagemap.h	2005-04-22 15:08:33.000000000 +0530
@@ -73,7 +73,8 @@ extern struct page * find_or_create_page
 unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 			unsigned int nr_pages, struct page **pages);
 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
-			int tag, unsigned int nr_pages, struct page **pages);
+			int tag, unsigned int nr_pages, struct page **pages,
+			int contig);
 
 /*
  * Returns locked page at given index in given cache, creating it if needed.
diff -urp -X dontdiff linux-2.6.12-rc3/include/linux/pagevec.h linux-2.6.12-rc3-getblocks/include/linux/pagevec.h
--- linux-2.6.12-rc3/include/linux/pagevec.h	2005-04-21 05:33:16.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/include/linux/pagevec.h	2005-04-22 15:08:33.000000000 +0530
@@ -28,6 +28,9 @@ unsigned pagevec_lookup(struct pagevec *
 unsigned pagevec_lookup_tag(struct pagevec *pvec,
 		struct address_space *mapping, pgoff_t *index, int tag,
 		unsigned nr_pages);
+unsigned pagevec_contig_lookup_tag(struct pagevec *pvec,
+		struct address_space *mapping, pgoff_t *index, int tag,
+		unsigned nr_pages);
 
 static inline void pagevec_init(struct pagevec *pvec, int cold)
 {
diff -urp -X dontdiff linux-2.6.12-rc3/include/linux/radix-tree.h linux-2.6.12-rc3-getblocks/include/linux/radix-tree.h
--- linux-2.6.12-rc3/include/linux/radix-tree.h	2005-04-21 05:33:16.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/include/linux/radix-tree.h	2005-04-22 15:08:33.000000000 +0530
@@ -59,8 +59,18 @@ void *radix_tree_tag_clear(struct radix_
 int radix_tree_tag_get(struct radix_tree_root *root,
 			unsigned long index, int tag);
 unsigned int
-radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
-		unsigned long first_index, unsigned int max_items, int tag);
+__radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
+		unsigned long first_index, unsigned int max_items, int tag,
+		int contig);
+
+static inline unsigned int radix_tree_gang_lookup_tag(struct radix_tree_root
+		*root, void **results, unsigned long first_index,
+		unsigned int max_items, int tag)
+{
+	return __radix_tree_gang_lookup_tag(root, results, first_index,
+		max_items, tag, 0);
+}
+
 int radix_tree_tagged(struct radix_tree_root *root, int tag);
 
 static inline void radix_tree_preload_end(void)
diff -urp -X dontdiff linux-2.6.12-rc3/lib/radix-tree.c linux-2.6.12-rc3-getblocks/lib/radix-tree.c
--- linux-2.6.12-rc3/lib/radix-tree.c	2005-04-21 05:33:16.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/lib/radix-tree.c	2005-04-22 16:34:29.000000000 +0530
@@ -557,12 +557,13 @@ EXPORT_SYMBOL(radix_tree_gang_lookup);
  */
 static unsigned int
 __lookup_tag(struct radix_tree_root *root, void **results, unsigned long index,
-	unsigned int max_items, unsigned long *next_index, int tag)
+	unsigned int max_items, unsigned long *next_index, int tag, int contig)
 {
 	unsigned int nr_found = 0;
 	unsigned int shift;
 	unsigned int height = root->height;
 	struct radix_tree_node *slot;
+	unsigned long cindex = (contig && (*next_index)) ? *next_index : -1;
 
 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
 	slot = root->rnode;
@@ -575,6 +576,11 @@ __lookup_tag(struct radix_tree_root *roo
 				BUG_ON(slot->slots[i] == NULL);
 				break;
 			}
+			if (contig && index >= cindex) {
+				/* break in contiguity */
+				index = 0;
+				goto out;
+			}
 			index &= ~((1UL << shift) - 1);
 			index += 1UL << shift;
 			if (index == 0)
@@ -593,6 +599,10 @@ __lookup_tag(struct radix_tree_root *roo
 					results[nr_found++] = slot->slots[j];
 					if (nr_found == max_items)
 						goto out;
+				} else if (contig && nr_found) {
+					/* break in contiguity */
+					index = 0;
+					goto out;
 				}
 			}
 		}
@@ -618,29 +628,32 @@ out:
  *	returns the number of items which were placed at *@results.
  */
 unsigned int
-radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
-		unsigned long first_index, unsigned int max_items, int tag)
+__radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
+		unsigned long first_index, unsigned int max_items, int tag,
+		int contig)
 {
 	const unsigned long max_index = radix_tree_maxindex(root->height);
 	unsigned long cur_index = first_index;
+	unsigned long next_index = 0;	/* Index of next contiguous search */
 	unsigned int ret = 0;
 
 	while (ret < max_items) {
 		unsigned int nr_found;
-		unsigned long next_index;	/* Index of next search */
 
 		if (cur_index > max_index)
 			break;
 		nr_found = __lookup_tag(root, results + ret, cur_index,
-					max_items - ret, &next_index, tag);
+				max_items - ret, &next_index, tag, contig);
 		ret += nr_found;
 		if (next_index == 0)
 			break;
 		cur_index = next_index;
+		if (!nr_found)
+			next_index = 0;
 	}
 	return ret;
 }
-EXPORT_SYMBOL(radix_tree_gang_lookup_tag);
+EXPORT_SYMBOL(__radix_tree_gang_lookup_tag);
 
 /**
  *	radix_tree_delete    -    delete an item from a radix tree
diff -urp -X dontdiff linux-2.6.12-rc3/mm/filemap.c linux-2.6.12-rc3-getblocks/mm/filemap.c
--- linux-2.6.12-rc3/mm/filemap.c	2005-04-21 05:33:16.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/mm/filemap.c	2005-04-22 16:20:30.000000000 +0530
@@ -635,16 +635,19 @@ unsigned find_get_pages(struct address_s
 /*
  * Like find_get_pages, except we only return pages which are tagged with
  * `tag'.   We update *index to index the next page for the traversal.
+ * If 'contig' is 1, then we return only pages which are contiguous in the
+ * file.
  */
 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
-			int tag, unsigned int nr_pages, struct page **pages)
+			int tag, unsigned int nr_pages, struct page **pages,
+			int contig)
 {
 	unsigned int i;
 	unsigned int ret;
 
 	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
-				(void **)pages, *index, nr_pages, tag);
+	ret = __radix_tree_gang_lookup_tag(&mapping->page_tree,
+			(void **)pages, *index, nr_pages, tag, contig);
 	for (i = 0; i < ret; i++)
 		page_cache_get(pages[i]);
 	if (ret)
diff -urp -X dontdiff linux-2.6.12-rc3/mm/swap.c linux-2.6.12-rc3-getblocks/mm/swap.c
--- linux-2.6.12-rc3/mm/swap.c	2005-04-21 05:33:16.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/mm/swap.c	2005-04-22 15:08:33.000000000 +0530
@@ -384,7 +384,16 @@ unsigned pagevec_lookup_tag(struct pagev
 		pgoff_t *index, int tag, unsigned nr_pages)
 {
 	pvec->nr = find_get_pages_tag(mapping, index, tag,
-					nr_pages, pvec->pages);
+					nr_pages, pvec->pages, 0);
+	return pagevec_count(pvec);
+}
+
+unsigned int
+pagevec_contig_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
+		pgoff_t *index, int tag, unsigned nr_pages)
+{
+	pvec->nr = find_get_pages_tag(mapping, index, tag,
+					nr_pages, pvec->pages, 1);
 	return pagevec_count(pvec);
 }
 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[Index of Archives]     [Kernel Newbies]     [Netfilter]     [Bugtraq]     [Photo]     [Stuff]     [Gimp]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Video 4 Linux]     [Linux for the blind]     [Linux Resources]
  Powered by Linux