[rfc patch 1/2] direct-io: rewrite to handle non-512 aligned user address

This is a re-write of direct i/o so that it does not depend on the
user address being a multiple of 512.  This is necessary to allow
relaxing the address alignment check in the 2nd patch.

I have tested this on 2.6.13-rc2 with a variety of i/o sizes and
i/o offsets on ext3 without problems (with 512 alignment).  It even
makes the code smaller:

   text    data     bss     dec     hex filename
   7668       0       0    7668    1df4 direct-io.o original
   7160       0       0    7160    1bf8 direct-io.o patched

Any additional testing and/or comments welcome.

Signed-off-by: Daniel McNeil <daniel@osdl.org>

--- linux-2.6.12.orig/fs/direct-io.c	2005-06-28 15:26:50.000000000 -0700
+++ linux-2.6.12/fs/direct-io.c	2005-06-28 16:03:14.000000000 -0700
@@ -43,16 +43,6 @@
 #define DIO_PAGES	64
 
 /*
- * This code generally works in units of "dio_blocks".  A dio_block is
- * somewhere between the hard sector size and the filesystem block size.  it
- * is determined on a per-invocation basis.   When talking to the filesystem
- * we need to convert dio_blocks to fs_blocks by scaling the dio_block quantity
- * down by dio->blkfactor.  Similarly, fs-blocksize quantities are converted
- * to bio_block quantities by shifting left by blkfactor.
- *
- * If blkfactor is zero then the user's request was aligned to the filesystem's
- * blocksize.
- *
  * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
  * This determines whether we need to do the fancy locking which prevents
  * direct-IO from being able to read uninitialised disk blocks.  If its zero
@@ -69,28 +59,20 @@ struct dio {
 	loff_t i_size;			/* i_size when submitted */
 	int lock_type;			/* doesn't change */
 	unsigned blkbits;		/* doesn't change */
-	unsigned blkfactor;		/* When we're using an alignment which
-					   is finer than the filesystem's soft
-					   blocksize, this specifies how much
-					   finer.  blkfactor=2 means 1/4-block
-					   alignment.  Does not change */
 	unsigned start_zero_done;	/* flag: sub-blocksize zeroing has
 					   been performed at the start of a
 					   write */
 	int pages_in_io;		/* approximate total IO pages */
 	size_t	size;			/* total request size (doesn't change)*/
-	sector_t block_in_file;		/* Current offset into the underlying
-					   file in dio_block units. */
-	unsigned blocks_available;	/* At block_in_file.  changes */
-	sector_t final_block_in_request;/* doesn't change */
-	unsigned first_block_in_page;	/* doesn't change, Used only once */
+	unsigned long user_addr;	/* user buffer addr */
+	loff_t file_offset;		/* file offset of i/o */
+	loff_t final_file_offset;	/* file offset at end of i/o */
+	unsigned bytes_available;	/* bytes available */
 	int boundary;			/* prev block is at a boundary */
 	int reap_counter;		/* rate limit reaping */
 	get_blocks_t *get_blocks;	/* block mapping function */
 	dio_iodone_t *end_io;		/* IO completion function */
 	sector_t final_block_in_bio;	/* current final block in bio + 1 */
-	sector_t next_block_for_io;	/* next block to be put under IO,
-					   in dio_blocks units */
 	struct buffer_head map_bh;	/* last get_blocks() result */
 
 	/*
@@ -161,11 +143,11 @@ static int dio_refill_pages(struct dio *
 		NULL);				/* vmas */
 	up_read(&current->mm->mmap_sem);
 
-	if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) {
+	if (ret < 0 && dio->bytes_available && (dio->rw == WRITE)) {
 		/*
 		 * A memory fault, but the filesystem has some outstanding
-		 * mapped blocks.  We need to use those blocks up to avoid
-		 * leaking stale data in the file.
+		 * mapped blocks.  We need to write zeros to those blocks
+		 * to avoid leaking stale data in the file.
 		 */
 		if (dio->page_errors == 0)
 			dio->page_errors = ret;
@@ -480,8 +462,7 @@ static int dio_bio_reap(struct dio *dio)
 
 /*
  * Call into the fs to map some more disk blocks.  We record the current number
- * of available blocks at dio->blocks_available.  These are in units of the
- * fs blocksize, (1 << inode->i_blkbits).
+ * of available bytes at dio->bytes_available.
  *
  * The fs is allowed to map lots of blocks at once.  If it wants to do that,
  * it uses the passed inode-relative block number as the file offset, as usual.
@@ -498,8 +479,7 @@ static int dio_bio_reap(struct dio *dio)
  *
  * In the case of filesystem holes: the fs may return an arbitrarily-large
  * hole by returning an appropriate value in b_size and by clearing
- * buffer_mapped().  However the direct-io code will only process holes one
- * block at a time - it will repeatedly call get_blocks() as it walks the hole.
+ * buffer_mapped().
  */
 static int get_more_blocks(struct dio *dio)
 {
@@ -507,8 +487,6 @@ static int get_more_blocks(struct dio *d
 	struct buffer_head *map_bh = &dio->map_bh;
 	sector_t fs_startblk;	/* Into file, in filesystem-sized blocks */
 	unsigned long fs_count;	/* Number of filesystem-sized blocks */
-	unsigned long dio_count;/* Number of dio_block-sized blocks */
-	unsigned long blkmask;
 	int create;
 
 	/*
@@ -519,18 +497,14 @@ static int get_more_blocks(struct dio *d
 	if (ret == 0) {
 		map_bh->b_state = 0;
 		map_bh->b_size = 0;
-		BUG_ON(dio->block_in_file >= dio->final_block_in_request);
-		fs_startblk = dio->block_in_file >> dio->blkfactor;
-		dio_count = dio->final_block_in_request - dio->block_in_file;
-		fs_count = dio_count >> dio->blkfactor;
-		blkmask = (1 << dio->blkfactor) - 1;
-		if (dio_count & blkmask)	
-			fs_count++;
+		BUG_ON(dio->file_offset >= dio->final_file_offset);
+		fs_startblk = dio->file_offset >> dio->blkbits;
+		fs_count = ((dio->final_file_offset - 1) >> 9) -
+			   fs_startblk + 1;
 
 		create = dio->rw == WRITE;
 		if (dio->lock_type == DIO_LOCKING) {
-			if (dio->block_in_file < (i_size_read(dio->inode) >>
-							dio->blkbits))
+			if (dio->file_offset < dio->i_size)
 				create = 0;
 		} else if (dio->lock_type == DIO_NO_LOCKING) {
 			create = 0;
@@ -558,7 +532,7 @@ static int dio_new_bio(struct dio *dio, 
 	ret = dio_bio_reap(dio);
 	if (ret)
 		goto out;
-	sector = start_sector << (dio->blkbits - 9);
+	sector = start_sector;
 	nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev));
 	BUG_ON(nr_pages <= 0);
 	ret = dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
@@ -588,7 +562,7 @@ static int dio_bio_add_page(struct dio *
 			dio->pages_in_io--;
 		page_cache_get(dio->cur_page);
 		dio->final_block_in_bio = dio->cur_page_block +
-			(dio->cur_page_len >> dio->blkbits);
+			(dio->cur_page_len >> 9);
 		ret = 0;
 	} else {
 		ret = 1;
@@ -671,7 +645,7 @@ submit_page_section(struct dio *dio, str
 	if (	(dio->cur_page == page) &&
 		(dio->cur_page_offset + dio->cur_page_len == offset) &&
 		(dio->cur_page_block +
-			(dio->cur_page_len >> dio->blkbits) == blocknr)) {
+			(dio->cur_page_len >> 9) == blocknr)) {
 		dio->cur_page_len += len;
 
 		/*
@@ -727,44 +701,41 @@ static void clean_blockdev_aliases(struc
 /*
  * If we are not writing the entire block and get_block() allocated
  * the block for us, we need to fill-in the unused portion of the
- * block with zeros. This happens only if user-buffer, fileoffset or
- * io length is not filesystem block-size multiple.
+ * block with zeros. This happens only if fileoffset or io length
+ * is not filesystem block-size multiple.
  *
  * `end' is zero if we're doing the start of the IO, 1 at the end of the
  * IO.
  */
 static void dio_zero_block(struct dio *dio, int end)
 {
-	unsigned dio_blocks_per_fs_block;
-	unsigned this_chunk_blocks;	/* In dio_blocks */
 	unsigned this_chunk_bytes;
+	unsigned long blkmask = (1 << dio->blkbits) - 1;
+	unsigned fs_offset = dio->file_offset & blkmask;
+	int start;
 	struct page *page;
 
 	dio->start_zero_done = 1;
-	if (!dio->blkfactor || !buffer_new(&dio->map_bh))
+	if (fs_offset == 0 || !buffer_new(&dio->map_bh))
 		return;
 
-	dio_blocks_per_fs_block = 1 << dio->blkfactor;
-	this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1);
-
-	if (!this_chunk_blocks)
+	if (end == 0) {
+		/* start */
+		start = 0;
+		this_chunk_bytes = fs_offset;
+	} else {
+		/* end */
+		start = fs_offset;
+		this_chunk_bytes = dio->map_bh.b_size - fs_offset;
+	}
+	if (!this_chunk_bytes)
 		return;
 
-	/*
-	 * We need to zero out part of an fs block.  It is either at the
-	 * beginning or the end of the fs block.
-	 */
-	if (end) 
-		this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
-
-	this_chunk_bytes = this_chunk_blocks << dio->blkbits;
-
 	page = ZERO_PAGE(dio->curr_user_address);
 	if (submit_page_section(dio, page, 0, this_chunk_bytes, 
-				dio->next_block_for_io))
+				(dio->map_bh.b_blocknr << (dio->blkbits - 9)) +
+				(start >> 9)))
 		return;
-
-	dio->next_block_for_io += this_chunk_blocks;
 }
 
 /*
@@ -786,72 +757,82 @@ static void dio_zero_block(struct dio *d
 static int do_direct_IO(struct dio *dio)
 {
 	const unsigned blkbits = dio->blkbits;
-	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
 	struct page *page;
-	unsigned block_in_page;
+	unsigned long user_addr;
+	unsigned int fs_offset = 0;	/* offset in fs block */
+	unsigned int page_offset;	/* offset in page */
+	unsigned int bytes_this_chunk;
+	unsigned long blkmask;
 	struct buffer_head *map_bh = &dio->map_bh;
 	int ret = 0;
 
-	/* The I/O can start at any block offset within the first page */
-	block_in_page = dio->first_block_in_page;
+	user_addr = dio->user_addr;
+
+	blkmask = (1 << dio->blkbits) - 1;
+	while (dio->file_offset < dio->final_file_offset) {
+		unsigned b;
 
-	while (dio->block_in_file < dio->final_block_in_request) {
 		page = dio_get_page(dio);
 		if (IS_ERR(page)) {
 			ret = PTR_ERR(page);
 			goto out;
 		}
 
-		while (block_in_page < blocks_per_page) {
-			unsigned offset_in_page = block_in_page << blkbits;
-			unsigned this_chunk_bytes;	/* # of bytes mapped */
-			unsigned this_chunk_blocks;	/* # of blocks */
-			unsigned u;
-
-			if (dio->blocks_available == 0) {
+		/*
+		 * loop until done with this page 
+		 */
+		do {
+			page_offset = user_addr & (PAGE_SIZE - 1);
+			if (dio->bytes_available == 0) {
+				fs_offset = dio->file_offset & blkmask;
 				/*
 				 * Need to go and map some more disk
 				 */
-				unsigned long blkmask;
-				unsigned long dio_remainder;
 
 				ret = get_more_blocks(dio);
 				if (ret) {
 					page_cache_release(page);
 					goto out;
 				}
-				if (!buffer_mapped(map_bh))
-					goto do_holes;
 
-				dio->blocks_available =
-						map_bh->b_size >> dio->blkbits;
-				dio->next_block_for_io =
-					map_bh->b_blocknr << dio->blkfactor;
+				dio->bytes_available = map_bh->b_size;
+
 				if (buffer_new(map_bh))
 					clean_blockdev_aliases(dio);
 
-				if (!dio->blkfactor)
-					goto do_holes;
-
-				blkmask = (1 << dio->blkfactor) - 1;
-				dio_remainder = (dio->block_in_file & blkmask);
-
 				/*
-				 * If we are at the start of IO and that IO
-				 * starts partway into a fs-block,
-				 * dio_remainder will be non-zero.  If the IO
-				 * is a read then we can simply advance the IO
-				 * cursor to the first block which is to be
-				 * read.  But if the IO is a write and the
-				 * block was newly allocated we cannot do that;
-				 * the start of the fs block must be zeroed out
-				 * on-disk
+				 * If write i/o starts partway into fs_block
+				 * and the block is newly allocated,
+				 * we need to clear the beginning of the block,
+				 * but only for the 1st allocation for this dio.
+				 */
+				if (!dio->start_zero_done) 
+					dio_zero_block(dio, 0);
+				/*
+				 * Adjust bytes available, if starting
+				 * partway into fs_block.
 				 */
-				if (!buffer_new(map_bh))
-					dio->next_block_for_io += dio_remainder;
-				dio->blocks_available -= dio_remainder;
+				dio->bytes_available -= fs_offset;
 			}
-do_holes:
+			/*
+			 * At this point we know the file block to do i/o to.
+			 * Now we have to see how much of this chunk we can do.
+			 * If starting the i/o in the middle of the file
+			 * system block, bytes_available has already
+			 * been adjusted.  For unaligned user addr, we could
+			 * hit end of page before end of the fs block.
+			 */
+			bytes_this_chunk = dio->bytes_available;
+
+			b = PAGE_SIZE - page_offset;	/* bytes to page end */
+			if (bytes_this_chunk > b)
+				bytes_this_chunk = b;
+
+			/* check end of i/o */
+			b = dio->final_file_offset - dio->file_offset;
+			if (bytes_this_chunk > b)
+				bytes_this_chunk = b;
+
 			/* Handle holes */
 			if (!buffer_mapped(map_bh)) {
 				char *kaddr;
@@ -862,66 +843,41 @@ do_holes:
 					return -ENOTBLK;
 				}
 
-				if (dio->block_in_file >=
-					i_size_read(dio->inode)>>blkbits) {
+				if (dio->file_offset >= dio->i_size) {
 					/* We hit eof */
 					page_cache_release(page);
 					goto out;
 				}
+				/*
+				 * zero out the user buffer to end of page.
+				 */
 				kaddr = kmap_atomic(page, KM_USER0);
-				memset(kaddr + (block_in_page << blkbits),
-						0, 1 << blkbits);
+				memset(kaddr + page_offset, 0, bytes_this_chunk);
 				flush_dcache_page(page);
 				kunmap_atomic(kaddr, KM_USER0);
-				dio->block_in_file++;
-				block_in_page++;
-				goto next_block;
+			} else {
+				/* add some i/o */
+				ret = submit_page_section(dio, page,
+					page_offset, bytes_this_chunk,
+					(map_bh->b_blocknr << (dio->blkbits - 9))
+					+ (fs_offset >> 9));
+				if (ret) {
+					page_cache_release(page);
+					goto out;
+				}
 			}
 
-			/*
-			 * If we're performing IO which has an alignment which
-			 * is finer than the underlying fs, go check to see if
-			 * we must zero out the start of this block.
-			 */
-			if (unlikely(dio->blkfactor && !dio->start_zero_done))
-				dio_zero_block(dio, 0);
-
-			/*
-			 * Work out, in this_chunk_blocks, how much disk we
-			 * can add to this page
-			 */
-			this_chunk_blocks = dio->blocks_available;
-			u = (PAGE_SIZE - offset_in_page) >> blkbits;
-			if (this_chunk_blocks > u)
-				this_chunk_blocks = u;
-			u = dio->final_block_in_request - dio->block_in_file;
-			if (this_chunk_blocks > u)
-				this_chunk_blocks = u;
-			this_chunk_bytes = this_chunk_blocks << blkbits;
-			BUG_ON(this_chunk_bytes == 0);
-
-			dio->boundary = buffer_boundary(map_bh);
-			ret = submit_page_section(dio, page, offset_in_page,
-				this_chunk_bytes, dio->next_block_for_io);
-			if (ret) {
-				page_cache_release(page);
-				goto out;
-			}
-			dio->next_block_for_io += this_chunk_blocks;
+			dio->bytes_available -= bytes_this_chunk;
+			dio->file_offset += bytes_this_chunk;
+			user_addr += bytes_this_chunk;
+			fs_offset += bytes_this_chunk;
 
-			dio->block_in_file += this_chunk_blocks;
-			block_in_page += this_chunk_blocks;
-			dio->blocks_available -= this_chunk_blocks;
-next_block:
-			if (dio->block_in_file > dio->final_block_in_request)
-				BUG();
-			if (dio->block_in_file == dio->final_block_in_request)
+			if (dio->file_offset >= dio->final_file_offset)
 				break;
-		}
+		} while (user_addr & (PAGE_SIZE - 1));
 
 		/* Drop the ref which was taken in get_user_pages() */
 		page_cache_release(page);
-		block_in_page = 0;
 	}
 out:
 	return ret;
@@ -941,16 +897,17 @@ direct_io_worker(int rw, struct kiocb *i
 	ssize_t ret = 0;
 	ssize_t ret2;
 	size_t bytes;
+	unsigned long blkmask = (1 << blkbits) - 1;
+	unsigned fs_offset;
 
 	dio->bio = NULL;
 	dio->inode = inode;
 	dio->rw = rw;
 	dio->blkbits = blkbits;
-	dio->blkfactor = inode->i_blkbits - blkbits;
 	dio->start_zero_done = 0;
 	dio->size = 0;
-	dio->block_in_file = offset >> blkbits;
-	dio->blocks_available = 0;
+	dio->file_offset = offset;
+	dio->bytes_available = 0;
 	dio->cur_page = NULL;
 
 	dio->boundary = 0;
@@ -959,7 +916,6 @@ direct_io_worker(int rw, struct kiocb *i
 	dio->end_io = end_io;
 	dio->map_bh.b_private = NULL;
 	dio->final_block_in_bio = -1;
-	dio->next_block_for_io = -1;
 
 	dio->page_errors = 0;
 	dio->result = 0;
@@ -984,7 +940,8 @@ direct_io_worker(int rw, struct kiocb *i
 	 * In case of non-aligned buffers, we may need 2 more
 	 * pages since we need to zero out first and last block.
 	 */
-	if (unlikely(dio->blkfactor))
+	fs_offset = offset & blkmask;
+	if (unlikely(fs_offset))
 		dio->pages_in_io = 2;
 	else
 		dio->pages_in_io = 0;
@@ -997,13 +954,11 @@ direct_io_worker(int rw, struct kiocb *i
 	}
 
 	for (seg = 0; seg < nr_segs; seg++) {
+		loff_t start_offset;
 		user_addr = (unsigned long)iov[seg].iov_base;
 		dio->size += bytes = iov[seg].iov_len;
 
-		/* Index into the first page of the first block */
-		dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
-		dio->final_block_in_request = dio->block_in_file +
-						(bytes >> blkbits);
+		dio->final_file_offset = dio->file_offset + bytes;
 		/* Page fetching state */
 		dio->head = 0;
 		dio->tail = 0;
@@ -1015,13 +970,16 @@ direct_io_worker(int rw, struct kiocb *i
 			bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
 		}
 		dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+		dio->user_addr = 
 		dio->curr_user_address = user_addr;
+		start_offset = dio->file_offset;
 	
 		ret = do_direct_IO(dio);
 
-		dio->result += iov[seg].iov_len -
-			((dio->final_block_in_request - dio->block_in_file) <<
-					blkbits);
+		/*
+		 * Calc bytes transferred for this i/o
+		 */
+		dio->result += (dio->file_offset - start_offset);
 
 		if (ret) {
 			dio_cleanup(dio);
@@ -1198,7 +1156,7 @@ __blockdev_direct_IO(int rw, struct kioc
 			if (bdev)
 				 blkbits = bdev_blkbits;
 			blocksize_mask = (1 << blkbits) - 1;
-			if ((addr & blocksize_mask) || (size & blocksize_mask))  
+			if ((addr & blocksize_mask) || (size & blocksize_mask))
 				goto out;
 		}
 	}
@@ -1256,8 +1214,8 @@ __blockdev_direct_IO(int rw, struct kioc
 	dio->is_async = !is_sync_kiocb(iocb) && !((rw == WRITE) &&
 		(end > i_size_read(inode)));
 
-	retval = direct_io_worker(rw, iocb, inode, iov, offset,
-				nr_segs, blkbits, get_blocks, end_io, dio);
+	retval = direct_io_worker(rw, iocb, inode, iov, offset, nr_segs,
+				inode->i_blkbits, get_blocks, end_io, dio);
 
 	if (rw == READ && dio_lock_type == DIO_LOCKING)
 		reader_with_isem = 0;


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Prev by Date: Re: [PATCH] i386: Selectable Frequency of the Timer Interrupt
Next by Date: eepro100/e100 broken in 2.6.13-rc3
Previous by thread: console remains blanked
Next by thread: eepro100/e100 broken in 2.6.13-rc3
Index(es):
- Date
- Thread
[Index of Archives] [Kernel Newbies] [Netfilter] [Bugtraq] [Photo] [Gimp] [Yosemite News] [MIPS Linux] [ARM Linux] [Linux Security] [Linux RAID] [Video 4 Linux] [Linux for the blind]