[PATCH 003 of 006] raid5: Move compute block operations to a work queue

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This patch adds 'compute block' capabilities to the work queue.

Here are a few notes about the new flags R5_ComputeReq and
STRIPE_OP_COMPUTE_Recover:

Previously, when handle_stripe5 found a block that needed to be computed
it updated it in the same step.  Now that these operations are separated
(across multiple calls to handle_stripe5), a R5_ComputeReq flag is
needed to tell other parts of handle_stripe5 to treat the block under
computation as if it were up to date.  The order of events in the work
queue ensures that the block is indeed up to date before performing
further operations.

STRIPE_OP_COMPUTE_Recover was added to track when the parity block is
being computed due to a failed parity check.  This allows the code in
handle_stripe5 that produces requests for check_parity and compute_block
operations to be separate from the code that consumes the result.

Signed-off-by: Dan Williams <[email protected]>

 drivers/md/raid5.c         |  147 +++++++++++++++++++++++++++++++++++++--------
 include/linux/raid/raid5.h |    7 +-
 2 files changed, 129 insertions(+), 25 deletions(-)

===================================================================
Index: linux-2.6-raid/drivers/md/raid5.c
===================================================================
--- linux-2.6-raid.orig/drivers/md/raid5.c	2006-06-28 10:47:43.000000000 -0700
+++ linux-2.6-raid/drivers/md/raid5.c	2006-06-28 11:06:06.000000000 -0700
@@ -1263,7 +1263,9 @@
 			}
 		} else {
 			/* enter stage 1 of read modify write operation */
-			BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags));
+			BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
+				test_bit(R5_ComputeReq, &sh->dev[pd_idx].flags)));
+
 			set_bit(STRIPE_OP_RMW, &sh->state);
 			set_bit(STRIPE_OP_RMW_ParityPre, &sh->ops.state);
 			for (i=disks ; i-- ;) {
@@ -1272,7 +1274,8 @@
 					continue;
 
 				if (dev->towrite &&
-				    test_bit(R5_UPTODATE, &dev->flags)) {
+				    (test_bit(R5_UPTODATE, &dev->flags) ||
+				    test_bit(R5_ComputeReq, &dev->flags))) {
 					set_bit(R5_LOCKED, &dev->flags);
 					locked++;
 				}
@@ -1331,6 +1334,30 @@
 	return work_queued;
 }
 
+static int handle_compute_operations5(struct stripe_head *sh, int dd_idx)
+{
+	int work_queued = -EBUSY;
+
+	if (test_bit(STRIPE_OP_COMPUTE, &sh->state) &&
+		test_bit(STRIPE_OP_COMPUTE_Done, &sh->ops.state)) {
+		clear_bit(STRIPE_OP_COMPUTE, &sh->state);
+		clear_bit(STRIPE_OP_COMPUTE_Done, &sh->ops.state);
+		clear_bit(R5_ComputeReq, &sh->dev[dd_idx].flags);
+		work_queued = 0;
+	} else if (!test_bit(STRIPE_OP_COMPUTE, &sh->state)) {
+		set_bit(STRIPE_OP_COMPUTE, &sh->state);
+		set_bit(STRIPE_OP_COMPUTE_Prep, &sh->ops.state);
+		set_bit(R5_ComputeReq, &sh->dev[dd_idx].flags);
+		work_queued = 1;
+		sh->ops.pending++;
+	}
+
+	PRINTK("%s: stripe %llu work_queued: %d op_state: %lx dev[%d].flags: %lx\n",
+		__FUNCTION__, (unsigned long long)sh->sector,
+		work_queued, sh->ops.state, dd_idx, sh->dev[dd_idx].flags);
+
+	return work_queued;
+}
 
 /*
  * Each stripe/dev can have one or more bion attached.
@@ -1454,7 +1481,7 @@
 	int i, pd_idx = sh->pd_idx, disks = sh->disks, count = 1;
 	void *ptr[MAX_XOR_BLOCKS];
 	struct bio *chosen;
-	int overlap=0, work=0, written=0;
+	int overlap=0, work=0, written=0, compute=0, dd_idx=0;
 	unsigned long state, ops_state, ops_state_orig;
 
 	/* take a snapshot of what needs to be done at this point in time */
@@ -1463,6 +1490,51 @@
 	ops_state_orig = ops_state = sh->ops.state;
 	spin_unlock(&sh->lock);
 
+	if (test_bit(STRIPE_OP_COMPUTE, &state)) {
+		for (i=disks ; i-- ;) {
+			struct r5dev *dev = &sh->dev[i];
+			if (test_bit(R5_ComputeReq, &dev->flags)) {
+				dd_idx = i;
+				i = -1;
+				break;
+			}
+		}
+		BUG_ON(i >= 0);
+		PRINTK("%s: stripe %llu STRIPE_OP_COMPUTE op_state: %lx block: %d\n",
+			__FUNCTION__, (unsigned long long)sh->sector,
+			ops_state, dd_idx);
+		ptr[0] = page_address(sh->dev[dd_idx].page);
+
+		if (test_and_clear_bit(STRIPE_OP_COMPUTE_Prep, &ops_state)) {
+			memset(ptr[0], 0, STRIPE_SIZE);
+			set_bit(STRIPE_OP_COMPUTE_Parity, &ops_state);
+		}
+
+		if (test_and_clear_bit(STRIPE_OP_COMPUTE_Parity, &ops_state)) {
+			for (i = disks ; i--; ) {
+				struct r5dev *dev = &sh->dev[i];
+				void *p;
+				if (i == dd_idx)
+					continue;
+				p = page_address(dev->page);
+				if (test_bit(R5_UPTODATE, &dev->flags))
+					ptr[count++] = p;
+				else
+					printk(KERN_ERR "STRIPE_OP_COMPUTE %d, stripe %llu, %d"
+						" not present\n", dd_idx,
+						(unsigned long long)sh->sector, i);
+
+				check_xor();
+			}
+			if (count != 1)
+				xor_block(count, STRIPE_SIZE, ptr);
+
+			work++;
+			compute++;
+			set_bit(STRIPE_OP_COMPUTE_Done, &ops_state);
+		}
+	}
+
 	if (test_bit(STRIPE_OP_RMW, &state)) {
 		BUG_ON(test_bit(STRIPE_OP_RCW, &state));
 		PRINTK("%s: stripe %llu STRIPE_OP_RMW op_state: %lx\n",
@@ -1615,6 +1687,9 @@
 				wake_up(&sh->raid_conf->wait_for_overlap);
 		}
 
+	if (compute)
+		set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
+
 	sh->ops.pending -= work;
 	clear_bit(STRIPE_OP_QUEUED, &sh->state);
 	set_bit(STRIPE_HANDLE, &sh->state);
@@ -1857,25 +1932,32 @@
 	 * parity, or to satisfy requests
 	 * or to load a block that is being partially written.
 	 */
-	if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) {
+	if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding ||
+		test_bit(STRIPE_OP_COMPUTE, &sh->state)) {
 		for (i=disks; i--;) {
 			dev = &sh->dev[i];
-			if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
-			    (dev->toread ||
+			if (test_bit(R5_ComputeReq, &dev->flags) ||
+			    (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
+			     (dev->toread ||
 			     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
 			     syncing ||
 			     expanding ||
 			     (failed && (sh->dev[failed_num].toread ||
 					 (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags))))
-				    )
+				    ))
 				) {
 				/* we would like to get this block, possibly
 				 * by computing it, but we might not be able to
 				 */
-				if (uptodate == disks-1) {
-					PRINTK("Computing block %d\n", i);
-					compute_block(sh, i);
-					uptodate++;
+				if ((uptodate == disks-1) || test_bit(STRIPE_OP_COMPUTE, &sh->state)) {
+					handle_compute_operations5(sh, i);
+					if (uptodate == disks-1)
+						uptodate++;
+					/* Careful: from this point on 'uptodate' is in the eye of the
+					 * workqueue which services 'compute' operations before writes
+					 * and parity checks.  R5_ComputeReq flags blocks that will be
+					 * R5_UPTODATE in the work queue.
+					 */
 				} else if (test_bit(R5_Insync, &dev->flags)) {
 					set_bit(R5_LOCKED, &dev->flags);
 					set_bit(R5_Wantread, &dev->flags);
@@ -1898,8 +1980,7 @@
 	}
 
 	/* now to consider writing and what else, if anything should be read */
-	if (to_write || test_bit(STRIPE_OP_RCW, &sh->state) ||
-		test_bit(STRIPE_OP_RMW, &sh->state)) {
+	if (to_write || test_bit(STRIPE_OP_RCW, &sh->state) || test_bit(STRIPE_OP_RMW, &sh->state)) {
 		int rmw=0, rcw=0;
 		for (i=disks ; i--;) {
 			/* would I have to read this buffer for read_modify_write */
@@ -1910,7 +1991,7 @@
 || sh->bh_page[i]!=bh->b_page
 #endif
 				    ) &&
-			    !test_bit(R5_UPTODATE, &dev->flags)) {
+			    !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_ComputeReq, &dev->flags))) {
 				if (test_bit(R5_Insync, &dev->flags)
 /*				    && !(!mddev->insync && i == sh->pd_idx) */
 					)
@@ -1924,7 +2005,7 @@
 || sh->bh_page[i] != bh->b_page
 #endif
 				    ) &&
-			    !test_bit(R5_UPTODATE, &dev->flags)) {
+			    !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_ComputeReq, &dev->flags))) {
 				if (test_bit(R5_Insync, &dev->flags)) rcw++;
 				else rcw += 2*disks;
 			}
@@ -1937,7 +2018,8 @@
 			for (i=disks; i--;) {
 				dev = &sh->dev[i];
 				if ((dev->towrite || i == sh->pd_idx) &&
-				    !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
+				    !test_bit(R5_LOCKED, &dev->flags) &&
+				    !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_ComputeReq, &dev->flags)) &&
 				    test_bit(R5_Insync, &dev->flags)) {
 					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 					{
@@ -1956,7 +2038,8 @@
 			for (i=disks; i--;) {
 				dev = &sh->dev[i];
 				if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
-				    !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
+				    !test_bit(R5_LOCKED, &dev->flags) &&
+				    !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_ComputeReq, &dev->flags)) &&
 				    test_bit(R5_Insync, &dev->flags)) {
 					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 					{
@@ -2007,12 +2090,20 @@
 		int work_queued = 0, result = 0;
 
 		set_bit(STRIPE_HANDLE, &sh->state);
-		if (failed == 0) {
+		/* Take one of the following actions:
+		 * 1/ start a check parity operation if (uptodate == disks)
+		 * 2/ finish a check parity operation and act on the result
+		 * 3/ skip this section if we previously initiated a recovery
+		 *    operation
+		 */
+		if (failed == 0 && !test_bit(STRIPE_OP_COMPUTE_Recover, &sh->ops.state)) {
 			BUG_ON(!test_bit(STRIPE_OP_CHECK, &sh->state) &&
 				(uptodate != disks));
 			work_queued = handle_check_operations5(sh,
 							uptodate == disks);
-			result = test_and_clear_bit(STRIPE_OP_CHECK_IsZero, &sh->ops.state);
+			if (work_queued == 0)
+				result = test_and_clear_bit(STRIPE_OP_CHECK_IsZero,
+							&sh->ops.state);
 			if (work_queued > 0) {
 				uptodate--;
 			} else if (result && work_queued == 0) {
@@ -2024,15 +2115,25 @@
 					/* don't try to repair!! */
 					set_bit(STRIPE_INSYNC, &sh->state);
 				else {
-					compute_block(sh, sh->pd_idx);
-					uptodate++;
+					set_bit(STRIPE_OP_COMPUTE_Recover, &sh->ops.state);
+					handle_compute_operations5(sh, sh->pd_idx);
+					if (uptodate == disks-1)
+						uptodate++;
 				}
 			}
 		}
-		if (!test_bit(STRIPE_INSYNC, &sh->state) && work_queued == 0) {
+		/* Wait for check parity and compute block operations to complete
+		 * before write-back
+		 */
+		if (!test_bit(STRIPE_INSYNC, &sh->state) &&
+			!test_bit(STRIPE_OP_CHECK, &sh->state) &&
+			!test_bit(STRIPE_OP_COMPUTE, &sh->state)) {
 			/* either failed parity check, or recovery is happening */
-			if (failed==0)
+			if (failed==0) {
+				BUG_ON(!test_and_clear_bit(
+					STRIPE_OP_COMPUTE_Recover, &sh->ops.state));
 				failed_num = sh->pd_idx;
+			}
 			dev = &sh->dev[failed_num];
 			BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
 			BUG_ON(uptodate != disks);
Index: linux-2.6-raid/include/linux/raid/raid5.h
===================================================================
--- linux-2.6-raid.orig/include/linux/raid/raid5.h	2006-06-28 10:47:43.000000000 -0700
+++ linux-2.6-raid/include/linux/raid/raid5.h	2006-06-28 11:05:38.000000000 -0700
@@ -179,6 +179,7 @@
 #define	R5_ReWrite	9	/* have tried to over-write the readerror */
 #define	R5_Expanded	10	/* This block now has post-expand data */
 #define	R5_Consistent	11	/* Block is HW DMA-able without a cache flush */
+#define	R5_ComputeReq	12	/* compute_block in progress treat as uptodate */
 
 /*
  * Write method
@@ -238,15 +239,17 @@
 #define	STRIPE_OP_COMPUTE_Parity	15
 #define	STRIPE_OP_COMPUTE_End		16
 #define	STRIPE_OP_COMPUTE_Done		17
+#define	STRIPE_OP_COMPUTE_Recover	18
 
 /*
- * Bit mask for status bits set by the work queue thread
+ * Bit mask for status bits not to be cleared by the work queue thread
  */
 #define	STRIPE_OP_COMPLETION_MASK 	(1 << STRIPE_OP_RCW_Done |\
 						1 << STRIPE_OP_RMW_Done |\
 						1 << STRIPE_OP_CHECK_Done |\
 						1 << STRIPE_OP_CHECK_IsZero |\
-						1 << STRIPE_OP_COMPUTE_Done)
+						1 << STRIPE_OP_COMPUTE_Done |\
+						1 << STRIPE_OP_COMPUTE_Recover)
 /*
  * Plugging:
  *
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[Index of Archives]     [Kernel Newbies]     [Netfilter]     [Bugtraq]     [Photo]     [Stuff]     [Gimp]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Video 4 Linux]     [Linux for the blind]     [Linux Resources]
  Powered by Linux