[PATCH 026 of 35] Split any large bios that arrive at __make_request.

Now that bi_io_vec and bio can be shared, we can handle arbitrarily
large bios in __make_request by splitting them over multiple
requests.
If we do split a request, we mark both halves as "REQ_NOMERGE".
It is only really necessary to mark the first part as
 NO_BACK_MERGE
and the second part as
 NO_FRONT_MERGE
but that distinction isn't currently supported.

Note that we do not try to merge part of a large bio into
a neighbouring request.  That is a possible future enhancement.


Signed-off-by: Neil Brown <[email protected]>

### Diffstat output
 ./block/ll_rw_blk.c      |  122 +++++++++++++++++++++++++++++++++++++++--------
 ./include/linux/blkdev.h |    5 +
 2 files changed, 107 insertions(+), 20 deletions(-)

diff .prev/block/ll_rw_blk.c ./block/ll_rw_blk.c
--- .prev/block/ll_rw_blk.c	2007-07-31 11:21:15.000000000 +1000
+++ ./block/ll_rw_blk.c	2007-07-31 11:21:20.000000000 +1000
@@ -1221,13 +1221,21 @@ static void blk_recalc_rq_segments(struc
 	struct req_iterator i;
 	int high, highprv = 1;
 	struct request_queue *q = rq->q;
+	int curr_size = 0;
+	unsigned short max_sectors;
 
 	if (!rq->bio)
 		return;
 
+	if (unlikely(blk_pc_request(rq)))
+		max_sectors = q->max_hw_sectors;
+	else
+		max_sectors = q->max_sectors;
+
 	cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
 	hw_seg_size = seg_size = 0;
 	phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0;
+	rq->max_allowed_size = 0;
 	rq_for_each_segment(rq, i, bv) {
 		/*
 		 * the trick here is making sure that a high page is never
@@ -1249,9 +1257,7 @@ static void blk_recalc_rq_segments(struc
 
 			seg_size += bv.bv_len;
 			hw_seg_size += bv.bv_len;
-			bvprv = bv;
-			prvidx = i.i.i;
-			continue;
+			goto same_seg;
 		}
 new_segment:
 		if (BIOVEC_VIRT_MERGEABLE(&bvprv, &bv) &&
@@ -1267,11 +1273,19 @@ new_hw_segment:
 		}
 
 		nr_phys_segs++;
+		seg_size = bv.bv_len;
+same_seg:
+		curr_size += bv.bv_len;
 		bvprv = bv;
 		prvidx = i.i.i;
-		seg_size = bv.bv_len;
 		highprv = high;
+
+		if (curr_size <= (max_sectors << 9) &&
+		    nr_phys_segs <= q->max_phys_segments &&
+		    nr_hw_segs <= q->max_hw_segments)
+			rq->max_allowed_size = curr_size;
 	}
+
 	rq->last_len = bvprv.bv_offset + bvprv.bv_len;
 	rq->last_idx = prvidx;
 
@@ -2924,6 +2938,70 @@ static void init_request_from_bio(struct
 	blk_rq_bio_prep(req->q, req, bio);
 }
 
+static void rq_split(struct request *orig, struct request *new)
+{
+
+	/* 'orig' contains exactly one bio, and may refer to
+	 * some section in the middle of that bio.
+	 * Make 'new' refer to the beginning of that section, up
+	 * to orig->max_allowed_size bytes.
+	 * Remove from 'orig' everything that went into 'new'.
+	 * If 'orig' becomes empty, release its reference to the bio.
+	 */
+
+	new->cmd_type = orig->cmd_type;
+	new->cmd_flags |= orig->cmd_flags;
+	new->errors = 0;
+	new->hard_sector = new->sector = orig->hard_sector;
+	new->ioprio = orig->ioprio;
+	new->start_time = jiffies;
+	new->data_len = orig->data_len;
+	new->bio = orig->bio;
+	atomic_inc(&orig->bio->bi_iocnt);	/* 'new' takes its own ref */
+	new->biotail = orig->biotail;
+	new->current_nr_sectors = orig->current_nr_sectors;
+
+	new->buffer = orig->buffer;
+	new->rq_disk = orig->rq_disk;
+
+	if (orig->max_allowed_size == orig->hard_nr_sectors << 9) {
+		/* all of orig goes into new; orig is left empty */
+		new->nr_sectors = new->hard_nr_sectors
+			= orig->hard_nr_sectors;
+		new->nr_phys_segments = orig->nr_phys_segments;
+		new->nr_hw_segments = orig->nr_hw_segments;
+		new->hw_front_size = orig->hw_front_size;
+		new->hw_back_size = orig->hw_back_size;
+		new->last_len = orig->last_len;
+		new->last_idx = orig->last_idx;
+
+		orig->nr_sectors = orig->hard_nr_sectors  = 0;
+		atomic_dec(&orig->bio->bi_iocnt);	/* drop orig's ref */
+		orig->bio = NULL;
+	} else {
+		/* start of orig goes into new, rest stays in orig */
+		int offset;
+		new->nr_sectors = new->hard_nr_sectors
+			= (orig->max_allowed_size >> 9);
+		new->data_len = new->nr_sectors << 9;
+		new->biotail = NULL;
+		new->cmd_flags |= REQ_NOMERGE;
+
+		orig->nr_sectors = orig->hard_nr_sectors
+			-= orig->max_allowed_size >> 9;
+		orig->data_len = orig->nr_sectors << 9;
+		orig->sector = orig->hard_sector += orig->max_allowed_size >> 9;
+		offset = orig->first_offset + orig->max_allowed_size;
+		orig->first_offset = offset;
+		if (offset)
+			orig->cmd_flags |= REQ_NOMERGE;
+
+		blk_recalc_rq_segments(new);
+		BUG_ON(new->hard_nr_sectors != (new->max_allowed_size >> 9));
+		blk_recalc_rq_segments(orig);
+	}
+}
+
 static int __make_request(struct request_queue *q, struct bio *bio)
 {
 	struct request *req;
@@ -3029,24 +3107,28 @@ get_rq:
 	if (sync)
 		rw_flags |= REQ_RW_SYNC;
 
-	/*
-	 * Grab a free request. This is might sleep but can not fail.
-	 * Returns with the queue unlocked.
-	 */
-	req = get_request_wait(q, rw_flags, bio);
+	while (nreq.hard_nr_sectors) {
+		/*
+		 * Grab a free request. This might sleep but can
+		 * not fail.  Returns with the queue unlocked.
+		 */
+		req = get_request_wait(q, rw_flags, bio);
+		rq_split(&nreq, req);
 
-	/*
-	 * After dropping the lock and possibly sleeping here, our request
-	 * may now be mergeable after it had proven unmergeable (above).
-	 * We don't worry about that case for efficiency. It won't happen
-	 * often, and the elevators are able to handle it.
-	 */
-	init_request_from_bio(req, bio);
+		/*
+		 * After dropping the lock and possibly sleeping here,
+		 * our request may now be mergeable after it had
+		 * proven unmergeable (above).  We don't worry about
+		 * that case for efficiency. It won't happen often,
+		 * and the elevators are able to handle it.
+		 */
+
+		spin_lock_irq(q->queue_lock);
+		if (elv_queue_empty(q))
+			blk_plug_device(q);
+		add_request(q, req);
+	}
 
-	spin_lock_irq(q->queue_lock);
-	if (elv_queue_empty(q))
-		blk_plug_device(q);
-	add_request(q, req);
 out:
 	if (sync)
 		__generic_unplug_device(q);

diff .prev/include/linux/blkdev.h ./include/linux/blkdev.h
--- .prev/include/linux/blkdev.h	2007-07-31 11:21:15.000000000 +1000
+++ ./include/linux/blkdev.h	2007-07-31 11:21:20.000000000 +1000
@@ -262,6 +262,11 @@ struct request {
 				 * so it matches bv_offset+bv_len in
 				 * the simple case.
 				 */
+	int max_allowed_size;   /* If this number (in bytes) is less than
+				 * hard_nr_sectors  (in sectors), the request
+				 * is too big for the queue and must be
+				 * split.
+				 */
 
 	struct hlist_node hash;	/* merge hash */
 	/*
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Follow-Ups:
- Re: [PATCH 026 of 35] Split any large bios that arrive at __make_request.
  - From: Tejun Heo <[email protected]>

References:
- [PATCH 000 of 35] Refactor block layer to improve support for stacked devices.
  - From: NeilBrown <[email protected]>

Prev by Date: [PATCH 025 of 35] Treat rq->hard_nr_sectors as setting an overriding limit in the size of the request
Next by Date: [PATCH 028 of 35] Split arbitrarily large requests to md/raid0 and md/linear
Previous by thread: Re: [PATCH 025 of 35] Treat rq->hard_nr_sectors as setting an overriding limit in the size of the request
Next by thread: Re: [PATCH 026 of 35] Split any large bios that arrive at __make_request.
Index(es):
- Date
- Thread

[Index of Archives] [Kernel Newbies] [Netfilter] [Bugtraq] [Photo] [Stuff] [Gimp] [Yosemite News] [MIPS Linux] [ARM Linux] [Linux Security] [Linux RAID] [Video 4 Linux] [Linux for the blind] [Linux Resources]