[PATCH 020 of 35] Add bi_offset and allow a bio to reference only part of a bi_io_vec

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



To allow bi_io_vec sharing, a bio now can reference just part of the
io_vec.  In particular, the first bi_offset bytes are not included,
and exactly bi_size bytes are included, even if the bi_io_vec goes
beyond there.

bi_offset must be less than bv_len of the first bvec.

This patch only handles the ll_rw_blk usage of bios.  More
changes are needed (e.g. in md, dm, umem,...) before it is safe to
set bi_offset non-zero, or bi_size less than the sum of bv_len.

To make segment merging easier, we also store the actual length
of the bio_vec in the request:  last_idx, last_len.  These are
calculated in blk_recalc_rq_segments.


Signed-off-by: Neil Brown <[email protected]>

### Diffstat output
 ./block/ll_rw_blk.c      |   36 +++++++++++++++++++++------
 ./drivers/md/raid10.c    |    1 
 ./fs/bio.c               |    7 +++--
 ./include/linux/bio.h    |   62 ++++++++++++++++++++++++++++++++++++++++-------
 ./include/linux/blkdev.h |   25 ++++++++----------
 ./mm/bounce.c            |    1 
 6 files changed, 98 insertions(+), 34 deletions(-)

diff .prev/block/ll_rw_blk.c ./block/ll_rw_blk.c
--- .prev/block/ll_rw_blk.c	2007-07-31 11:21:06.000000000 +1000
+++ ./block/ll_rw_blk.c	2007-07-31 11:21:07.000000000 +1000
@@ -481,6 +481,16 @@ static inline struct request *start_orde
 	return rq;
 }
 
+static inline int rq_virt_mergeable(struct request *req,
+				    struct request *nxt)
+{
+	return BLK_VIRT_MERGEABLE(
+		req->biotail->bi_io_vec[req->last_idx].bv_page,
+		   req->last_len,
+		nxt->bio->bi_io_vec[0].bv_page,
+		   nxt->bio->bi_io_vec[0].bv_offset + nxt->bio->bi_offset);
+}
+
 int blk_do_ordered(struct request_queue *q, struct request **rqp)
 {
 	struct request *rq = *rqp;
@@ -1207,6 +1217,7 @@ static void blk_recalc_rq_segments(struc
 	unsigned int hw_size;
 	struct bio_vec bv;
 	struct bio_vec bvprv = {0};
+	int prvidx = 0;
 	int seg_size;
 	int hw_seg_size;
 	int cluster;
@@ -1242,6 +1253,7 @@ static void blk_recalc_rq_segments(struc
 			seg_size += bv.bv_len;
 			hw_seg_size += bv.bv_len;
 			bvprv = bv;
+			prvidx = i.i.i;
 			continue;
 		}
 new_segment:
@@ -1259,9 +1271,12 @@ new_hw_segment:
 
 		nr_phys_segs++;
 		bvprv = bv;
+		prvidx = i.i.i;
 		seg_size = bv.bv_len;
 		highprv = high;
 	}
+	rq->last_len = bvprv.bv_offset + bvprv.bv_len;
+	rq->last_idx = prvidx;
 
 	if (nr_hw_segs == 1 &&
 	    hw_seg_size > rq->hw_front_size)
@@ -1278,8 +1293,11 @@ static int blk_phys_contig_segment(struc
 	if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER)))
 		return 0;
 
-	if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(req->biotail),
-				   __BVEC_START(nxt->bio)))
+	if (!BLK_PHYS_MERGEABLE(
+		    req->biotail->bi_io_vec[req->last_idx].bv_page,
+		       req->last_len,
+		    nxt->bio->bi_io_vec[0].bv_page,
+		       nxt->bio->bi_io_vec[0].bv_offset + nxt->bio->bi_offset))
 		return 0;
 	if (req->biotail->bi_size + nxt->bio->bi_size > q->max_segment_size)
 		return 0;
@@ -1301,8 +1319,8 @@ static int blk_hw_contig_segment(struct 
 		blk_recount_segments(q, req->biotail);
 	if (unlikely(!bio_flagged(nxt->bio, BIO_SEG_VALID)))
 		blk_recount_segments(q, nxt->bio);
-	if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail),
-				   __BVEC_START(nxt->bio)) ||
+
+	if (!rq_virt_mergeable(req, nxt) ||
 	    BIOVEC_VIRT_OVERSIZE(req->hw_back_size +
 				 nxt->hw_front_size))
 		return 0;
@@ -1419,8 +1437,7 @@ static int ll_back_merge_fn(struct reque
 
 	len = req->hw_back_size + nreq->hw_front_size;
 	if (nreq->first_offset == 0 &&
-	    BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail),
-				  __BVEC_START(nreq->bio)) &&
+	    rq_virt_mergeable(req, nreq) &&
 	    !BIOVEC_VIRT_OVERSIZE(len)) {
 		int mergeable = ll_new_mergeable(q, req, nreq);
 
@@ -1453,8 +1470,7 @@ static int ll_front_merge_fn(struct requ
 
 	len = nreq->hw_back_size + req->hw_front_size;
 
-	if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(nreq->biotail),
-				  __BVEC_START(req->bio)) &&
+	if (rq_virt_mergeable(nreq, req) &&
 	    !BIOVEC_VIRT_OVERSIZE(len)) {
 		int mergeable = ll_new_mergeable(q, req, nreq);
 
@@ -2842,6 +2858,8 @@ static int attempt_merge(struct request_
 
 	req->biotail->bi_next = next->bio;
 	req->biotail = next->biotail;
+	req->last_idx = next->last_idx;
+	req->last_len = next->last_len;
 
 	req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;
 
@@ -2958,6 +2976,8 @@ static int __make_request(struct request
 			req->biotail->bi_next = bio;
 			req->biotail = bio;
 			req->hw_back_size = nreq.hw_back_size;
+			req->last_idx = nreq.last_idx;
+			req->last_len = nreq.last_len;
 			req->nr_sectors = req->hard_nr_sectors += nr_sectors;
 			req->ioprio = ioprio_best(req->ioprio, prio);
 			drive_stat_acct(req, nr_sectors, 0);

diff .prev/drivers/md/raid10.c ./drivers/md/raid10.c
--- .prev/drivers/md/raid10.c	2007-07-31 11:21:03.000000000 +1000
+++ ./drivers/md/raid10.c	2007-07-31 11:21:07.000000000 +1000
@@ -1285,6 +1285,7 @@ static void sync_request_write(mddev_t *
 		tbio->bi_rw = WRITE;
 		tbio->bi_private = r10_bio;
 		tbio->bi_sector = r10_bio->devs[i].addr;
+		tbio->bi_offset = 0;
 
 		for (j=0; j < vcnt ; j++) {
 			tbio->bi_io_vec[j].bv_offset = 0;

diff .prev/fs/bio.c ./fs/bio.c
--- .prev/fs/bio.c	2007-07-31 11:21:06.000000000 +1000
+++ ./fs/bio.c	2007-07-31 11:21:07.000000000 +1000
@@ -134,6 +134,7 @@ void bio_init(struct bio *bio)
 	bio->bi_vcnt = 0;
 	bio->bi_phys_segments = 0;
 	bio->bi_hw_segments = 0;
+	bio->bi_offset = 0;
 	bio->bi_size = 0;
 	bio->bi_max_vecs = 0;
 	bio->bi_end_io = NULL;
@@ -266,6 +267,7 @@ void __bio_clone(struct bio *bio, struct
 	bio->bi_rw = bio_src->bi_rw;
 	bio->bi_vcnt = bio_src->bi_vcnt;
 	bio->bi_size = bio_src->bi_size;
+	bio->bi_offset = bio_src->bi_offset;
 	bio_phys_segments(q, bio);
 	bio_hw_segments(q, bio);
 }
@@ -396,9 +398,8 @@ static int __bio_add_page(struct request
 	}
 
 	/* If we may be able to merge these biovecs, force a recount */
-	if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec) ||
-	    BIOVEC_VIRT_MERGEABLE(bvec-1, bvec)))
-		bio->bi_flags &= ~(1 << BIO_SEG_VALID);
+	/* NOTE: This looks inefficient, but will go away */
+	bio->bi_flags &= ~(1 << BIO_SEG_VALID);
 
 	bio->bi_vcnt++;
 	bio->bi_phys_segments++;

diff .prev/include/linux/bio.h ./include/linux/bio.h
--- .prev/include/linux/bio.h	2007-07-31 11:21:06.000000000 +1000
+++ ./include/linux/bio.h	2007-07-31 11:21:07.000000000 +1000
@@ -91,7 +91,13 @@ struct bio {
 	 */
 	unsigned short		bi_hw_segments;
 
-	unsigned int		bi_size;	/* residual I/O count */
+	/* This bio only refers to part of the data in bi_io_vec.
+	 * The first bi_offset bytes are not included, and anything beyond
+	 * the bi_size bytes that follow them is also ignored.
+	 * bi_offset must be less than bi_io_vec[0].bv_len;
+	 */
+	unsigned int		bi_offset;
+	unsigned int		bi_size;
 
 	unsigned int		bi_max_vecs;	/* max bvl_vecs we can hold */
 
@@ -184,13 +190,21 @@ struct bio {
 /*
  * allow arch override, for eg virtualized architectures (put in asm/io.h)
  */
-#ifndef BIOVEC_PHYS_MERGEABLE
-#define BIOVEC_PHYS_MERGEABLE(vec1, vec2)	\
-	((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2)))
+#ifndef BLK_PHYS_MERGEABLE
+#define BLK_PHYS_MERGEABLE(p1, end, p2, start)	\
+	((page_to_phys(p1)+end) == (page_to_phys(p2)+start))
 #endif
+#define BIOVEC_PHYS_MERGEABLE(vec1, vec2)	\
+	BLK_PHYS_MERGEABLE((vec1)->bv_page, (vec1)->bv_offset + (vec1)->bv_len, \
+			    (vec2)->bv_page, (vec2)->bv_offset)
 
+#define BLK_VIRT_MERGEABLE(p1, end, p2, start)	\
+	((((page_to_phys(p1)+end) | (page_to_phys(p2)+start)) 	\
+		& (BIO_VMERGE_BOUNDARY - 1)) == 0)
 #define BIOVEC_VIRT_MERGEABLE(vec1, vec2)	\
-	((((bvec_to_phys((vec1)) + (vec1)->bv_len) | bvec_to_phys((vec2))) & (BIO_VMERGE_BOUNDARY - 1)) == 0)
+	BLK_VIRT_MERGEABLE((vec1)->bv_page, (vec1)->bv_offset + (vec1)->bv_len,\
+			    (vec2)->bv_page, (vec2)->bv_offset)
+
 #define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \
 	(((addr1) | (mask)) == (((addr2) - 1) | (mask)))
 #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \
@@ -202,12 +216,42 @@ struct bio {
 
 struct bio_iterator {
 	int i;
+	int offset;
+	int size;
 };
-#define bio_for_each_segment(bvl, bio, i)				\
-	for (i.i = 0, bvl = *bio_iovec_idx((bio), i.i);			\
-	     i.i < (bio)->bi_vcnt;					\
-	     i.i++, bvl = *bio_iovec_idx((bio), i.i))
 
+/* This macro probably needs some explanation...
+ * Its purpose is to find all the effective segments in a bio
+ * missing the first 'offs' bytes.  We need to be sure to honour
+ * bi_offset which can cause us to skip part of the first segment,
+ * and bi_size which may cause us to stop before the end of bi_io_vec.
+ * The 'for' loop iterates through the segments in bi_io_vec until
+ * we have returned 'bi_size - offs' bytes.
+ * The 'if' sets up the 'bv' to return, adjusts the start if there
+ * is still some 'offset' to deal with, adjusts the length if
+ * we have come to the end, and avoids the call of the body (which
+ * follows this macro) if the size would be zero.
+ * It also keeps 'offset' and 'size' (in the iterator) up to date.
+ */
+#define bio_for_each_segment_offset(bv, bio, _i, offs)			\
+	for (_i.i = 0, _i.offset = (bio)->bi_offset + offs,		\
+		 _i.size = (bio)->bi_size - offs;			\
+	     _i.i < (bio)->bi_vcnt && _i.size > 0;			\
+	     _i.i++)							\
+		if (bv = *bio_iovec_idx((bio), _i.i),			\
+		    bv.bv_offset += _i.offset,				\
+		    bv.bv_len <= _i.offset				\
+		    ? (_i.offset -= bv.bv_len, 0)			\
+		    : (bv.bv_len -= _i.offset,				\
+		       _i.offset = 0,					\
+		       bv.bv_len < _i.size				\
+		       ? (_i.size -= bv.bv_len, 1)			\
+		       : (bv.bv_len = _i.size,				\
+			  _i.size = 0,					\
+			  bv.bv_len > 0)))
+
+#define bio_for_each_segment(bv, bio, __i)				\
+		bio_for_each_segment_offset(bv, bio, __i, 0)
 /*
  * get a reference to a bio, so it won't disappear. the intended use is
  * something like:

diff .prev/include/linux/blkdev.h ./include/linux/blkdev.h
--- .prev/include/linux/blkdev.h	2007-07-31 11:21:03.000000000 +1000
+++ ./include/linux/blkdev.h	2007-07-31 11:21:07.000000000 +1000
@@ -255,6 +255,13 @@ struct request {
 	struct bio *bio;
 	struct bio *biotail;
 	int first_offset;	/* offset into first bio in list */
+	int last_idx, last_len; /* idx and effective len of last
+				 * bio_vec in biotail.  last_len
+				 * is actually an offset in the page
+				 * of the end of the segment.
+				 * so it matches bv_offset+bv_len in
+				 * the simple case.
+				 */
 
 	struct hlist_node hash;	/* merge hash */
 	/*
@@ -647,7 +654,7 @@ static inline void blk_queue_bounce(stru
 #endif /* CONFIG_MMU */
 
 struct req_iterator {
-	int i;
+	struct bio_iterator i;
 	struct bio *bio;
 	int offset;
 };
@@ -655,21 +662,11 @@ struct req_iterator {
 	for (_iter.bio = (rq)->bio, _iter.offset = (rq)->first_offset;	       \
 	     _iter.bio;							       \
 	     _iter.bio = _iter.bio->bi_next, _iter.offset = 0) 		       \
-		for (_iter.i = 0;		 			       \
-		     _iter.i < _iter.bio->bi_vcnt;			       \
-		     _iter.i++						       \
-		)							       \
-			if (bvec = *bio_iovec_idx(_iter.bio, _iter.i),	       \
-			    bvec.bv_offset += _iter.offset,		       \
-			    bvec.bv_len <= _iter.offset			       \
-				? (_iter.offset -= bvec.bv_len, 0)	       \
-				: (bvec.bv_len -= _iter.offset,		       \
-				   _iter.offset = 0,			       \
-				   1))
-
+		bio_for_each_segment_offset(bvec, _iter.bio, _iter.i,	       \
+			_iter.offset)
 
 #define rq_iter_last(rq, _iter) (_iter.bio->bi_next == NULL && 	\
-				 _iter.i == _iter.bio->bi_vcnt - 1)
+				 _iter.i.i == _iter.bio->bi_vcnt - 1)
 
 extern int blk_register_queue(struct gendisk *disk);
 extern void blk_unregister_queue(struct gendisk *disk);

diff .prev/mm/bounce.c ./mm/bounce.c
--- .prev/mm/bounce.c	2007-07-31 11:21:06.000000000 +1000
+++ ./mm/bounce.c	2007-07-31 11:21:07.000000000 +1000
@@ -245,6 +245,7 @@ static void __blk_queue_bounce(struct re
 
 	bio->bi_vcnt = (*bio_orig)->bi_vcnt;
 	bio->bi_size = (*bio_orig)->bi_size;
+	bio->bi_offset = (*bio_orig)->bi_offset;
 
 	if (pool == page_pool) {
 		bio->bi_end_io = bounce_end_io_write;
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[Index of Archives]     [Kernel Newbies]     [Netfilter]     [Bugtraq]     [Photo]     [Stuff]     [Gimp]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Video 4 Linux]     [Linux for the blind]     [Linux Resources]
  Powered by Linux