[PATCH] mm: move dirty page balancing to pdflush entirely

From: Leonid Ananiev

Moving dirty page balancing from the user task to the kernel thread pdflush
entirely removes extra-long write(2) latencies and increases performance.

Signed-off-by: Leonid Ananiev <[email protected]>

	Writing a file block with write(2) sometimes takes a very long time.
The IOzone benchmark with a file size of 50% of RAM reports per-record write
latencies of up to 0.4 sec. See the sorted column 2 (usec) of the 'iozone -Q'
output:
$ sort -nk2 wol.dat | tail
    890676      43950       4096
     77040     292806       4096
    407812     346254       4096
   1015436     346382       4096
    632940     368620       4096
    278944     368910       4096
    890672     369545       4096
    761808     376660       4096
    152124     383555       4096
   1144300     400145       4096
Investigation shows that the long write(2) times are the result of
balance_dirty_pages() running. If any thread on the current CPU has written
ratelimit_pages and the dirty ratio has been reached, the current thread has
to find and write out all the extra dirty pages in the whole system, then
wait for the IO to finish, and repeat until the dirty ratio is acceptable
again. pdflush was only woken up after the user thread had done this
balancing itself.
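
Condensed, the pre-patch loop that every writer ran in its own context (a
rough sketch of the code removed in the hunk below, with the re-check after
writeback folded out) looks like this:

	for (;;) {
		get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, mapping);
		if (wbs.nr_dirty + wbs.nr_unstable + wbs.nr_writeback <= dirty_thresh)
			break;				/* back under the limit */
		dirty_exceeded = 1;
		writeback_inodes(&wbc);			/* the writer flushes pages itself */
		pages_written += write_chunk - wbc.nr_to_write;
		if (pages_written >= write_chunk)
			break;				/* we've done our duty */
		blk_congestion_wait(WRITE, HZ/10);	/* wait for IO, then retry */
	}

Every writer that crosses dirty_thresh pays for this scan-and-wait loop, which
is where the write(2) latencies of up to 0.4 sec shown above come from.
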
The proposed patch instead wakes up pdflush as soon as the low dirty-ratio
threshold is crossed and lets the task continue. Only if the high dirty ratio
is reached does the user task wait for IO to finish, so the writing process is
still throttled. As a result, dirty page writeback preparation, inode scanning
and journaling run concurrently with the user task while the dirty ratio is
between the low and high limits. That is why the default high dirty-ratio
limit is raised to 60%. After the patch the kernel exploits multiprocessing
more for performance, the extra throttling of the task is gone, and task run
time becomes more stable and predictable.
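
In outline, the patched balance_dirty_pages() therefore reduces to the
following (a condensed sketch of the hunk below, with the laptop_mode test
folded into the comments):

	get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, mapping);
	nr_dirty = wbs.nr_dirty + wbs.nr_unstable;

	if (writeback_in_progress(bdi)) {
		/* pdflush is already flushing this queue: only block the
		 * writer once the high limit (vm_dirty_ratio, now 60%) is
		 * exceeded, and only for a short congestion wait. */
		if (nr_dirty > dirty_thresh) {
			dirty_exceeded = 1;
			blk_congestion_wait(WRITE, HZ/50);
		}
	} else if (nr_dirty > background_thresh) {
		/* Low limit (dirty_background_ratio, 10%) crossed and nobody
		 * is flushing yet: wake pdflush and let the task continue.
		 * (In laptop mode the wakeup waits for the high limit.) */
		pdflush_operation(background_writeout, 0);
	}

The short HZ/50 congestion wait keeps the writer throttled above the high
limit without making it scan and write back inodes itself.
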
	The IOzone and Sysbench benchmarks with file sizes of 50% and 120% of
RAM on Pentium 4, Xeon and Itanium report write and mixed throughput increases
of about 25%. The IOzone write(2) latencies above 0.1 sec described above are
gone. The writeback_in_progress() condition is now tested earlier, so no extra
pdflush work items are created and the increase in the number of context
switches stays within the variation limits.


diff -rpu a/mm/page-writeback.c b/mm/page-writeback.c
--- a/mm/page-writeback.c	2006-06-02 16:59:27.000000000 +0400
+++ b/mm/page-writeback.c	2006-06-22 22:43:17.000000000 +0400
@@ -69,7 +69,7 @@ int dirty_background_ratio = 10;
 /*
  * The generator of dirty data starts writeback at this percentage
  */
-int vm_dirty_ratio = 40;
+int vm_dirty_ratio = 60;
 
 /*
  * The interval between `kupdate'-style writebacks, in jiffies
@@ -190,56 +190,14 @@ get_dirty_limits(struct writeback_state 
 static void balance_dirty_pages(struct address_space *mapping)
 {
 	struct writeback_state wbs;
-	long nr_reclaimable;
+	long nr_dirty;
 	long background_thresh;
 	long dirty_thresh;
-	unsigned long pages_written = 0;
-	unsigned long write_chunk = sync_writeback_pages();
 
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
-
-	for (;;) {
-		struct writeback_control wbc = {
-			.bdi		= bdi,
-			.sync_mode	= WB_SYNC_NONE,
-			.older_than_this = NULL,
-			.nr_to_write	= write_chunk,
-		};
-
-		get_dirty_limits(&wbs, &background_thresh,
-					&dirty_thresh, mapping);
-		nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable;
-		if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
-			break;
-
-		if (!dirty_exceeded)
-			dirty_exceeded = 1;
-
-		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
-		 * Unstable writes are a feature of certain networked
-		 * filesystems (i.e. NFS) in which data may have been
-		 * written to the server's write cache, but has not yet
-		 * been flushed to permanent storage.
-		 */
-		if (nr_reclaimable) {
-			writeback_inodes(&wbc);
-			get_dirty_limits(&wbs, &background_thresh,
-					&dirty_thresh, mapping);
-			nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable;
-			if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
-				break;
-			pages_written += write_chunk - wbc.nr_to_write;
-			if (pages_written >= write_chunk)
-				break;		/* We've done our duty */
-		}
-		blk_congestion_wait(WRITE, HZ/10);
-	}
-
-	if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh && dirty_exceeded)
-		dirty_exceeded = 0;
-
-	if (writeback_in_progress(bdi))
-		return;		/* pdflush is already working this queue */
+	get_dirty_limits(&wbs, &background_thresh,
+				&dirty_thresh, mapping);
+	nr_dirty = wbs.nr_dirty + wbs.nr_unstable;
 
 	/*
 	 * In laptop mode, we wait until hitting the higher threshold before
@@ -249,8 +207,13 @@ static void balance_dirty_pages(struct a
 	 * In normal mode, we start background writeout at the lower
 	 * background_thresh, to keep the amount of dirty memory low.
 	 */
-	if ((laptop_mode && pages_written) ||
-	     (!laptop_mode && (nr_reclaimable > background_thresh)))
+	if (writeback_in_progress(bdi)) {
+		if (nr_dirty > dirty_thresh) {
+			dirty_exceeded = 1;
+			blk_congestion_wait(WRITE, HZ/50);
+		}
+	} else if ((laptop_mode && (nr_dirty + wbs.nr_writeback > dirty_thresh)) ||
+	     (!laptop_mode && (nr_dirty > background_thresh)))
 		pdflush_operation(background_writeout, 0);
 }
 
@@ -325,6 +288,9 @@ void throttle_vm_writeout(void)
 static void background_writeout(unsigned long _min_pages)
 {
 	long min_pages = _min_pages;
+	struct writeback_state wbs;
+	long background_thresh;
+	long dirty_thresh;
 	struct writeback_control wbc = {
 		.bdi		= NULL,
 		.sync_mode	= WB_SYNC_NONE,
@@ -333,12 +299,11 @@ static void background_writeout(unsigned
 		.nonblocking	= 1,
 	};
 
+	get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL);
+	if (min_pages == 0) {
+		min_pages = wbs.nr_dirty + wbs.nr_unstable;
+	}
 	for ( ; ; ) {
-		struct writeback_state wbs;
-		long background_thresh;
-		long dirty_thresh;
-
-		get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL);
 		if (wbs.nr_dirty + wbs.nr_unstable < background_thresh
 				&& min_pages <= 0)
 			break;
@@ -347,13 +312,12 @@ static void background_writeout(unsigned
 		wbc.pages_skipped = 0;
 		writeback_inodes(&wbc);
 		min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
-		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
-			/* Wrote less than expected */
+		if (wbc.nr_to_write > 0 || wbc.encountered_congestion) {
 			blk_congestion_wait(WRITE, HZ/10);
-			if (!wbc.encountered_congestion)
-				break;
 		}
+		get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL);
 	}
+	dirty_exceeded = 0;
 }
 
 /*
@@ -361,14 +325,8 @@ static void background_writeout(unsigned
  * the whole world.  Returns 0 if a pdflush thread was dispatched.  Returns
  * -1 if all pdflush threads were busy.
  */
-int wakeup_pdflush(long nr_pages)
+inline int wakeup_pdflush(long nr_pages)
 {
-	if (nr_pages == 0) {
-		struct writeback_state wbs;
-
-		get_writeback_state(&wbs);
-		nr_pages = wbs.nr_dirty + wbs.nr_unstable;
-	}
 	return pdflush_operation(background_writeout, nr_pages);
 }
 