On Fri, Dec 28, 2007 at 03:01:11PM -0800, Andrew Morton wrote:
> On Thu, 27 Dec 2007 23:08:40 +0100 Sascha Warner <[email protected]> wrote:
>
> > Hi,
> >
> > I applied your patches to 2.6.24-rc6-mm1, but now I am faced with one
> > pdflush often using 100% CPU for a long time. There seem to be some rare
> > pauses from its 100% usage, however.
> >
> > On ~23 minutes uptime i have ~19 minutes pdflush runtime.
> >
> > This is on E6600, x86_64, 2 Gig RAM, SATA HDD, running on gentoo ~x64_64
> >
> > Let me know if you need more info.
Sascha, Thank you for the testing.
Replace the first patch with this one should help:
---
Subject: writeback: introduce s_more_io_wait for inodes to be re-synced after a while
Introduce super_block.s_more_io_wait and writeback_control.more_io_wait.
They are for inodes that for some reason cannot be synced immediately.
These inodes will be moved to s_more_io_wait and retried after a while(waiting
up to 0.1s). The normal lots-of-dirty-pages inodes will be moved to more_io
and be synced after other inodes in s_io have been serviced(no sleep).
The data flow is made more simple:
s_dirty --> s_io --> s_more_io/s_more_io_wait --+
^ |
| |
+----------------------------------+
- to fill s_io:
s_more_io +
s_dirty(expired) +
s_more_io_wait
---> s_io
- to drain s_io:
s_io -+--> clean inodes goto inode_in_use/inode_unused
|
+--> s_more_io
|
+--> s_more_io_wait
Obviously there's no ordering or starvation problems in the queues:
- s_dirty is now a strict FIFO queue
- inode.dirtied_when now really means the first dirty time
- once exipired, the dirty inode will stay in s_*io* queues until made clean
- the dirty inodes in s_*io* queues will be revisted in order: no starvation
Cc: Michael Rubin <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Signed-off-by: Fengguang Wu <[email protected]>
---
fs/fs-writeback.c | 26 ++++++++++++----
fs/super.c | 1
include/linux/fs.h | 1
include/linux/writeback.h | 1
mm/page-writeback.c | 56 ++++++++++++++++++++----------------
5 files changed, 55 insertions(+), 30 deletions(-)
--- linux-2.6.24-rc6-mm1.orig/fs/fs-writeback.c
+++ linux-2.6.24-rc6-mm1/fs/fs-writeback.c
@@ -172,6 +172,14 @@ static void requeue_io(struct inode *ino
list_move(&inode->i_list, &inode->i_sb->s_more_io);
}
+/*
+ * The inode should be retried after _sleeping_ for a while.
+ */
+static void requeue_io_wait(struct inode *inode)
+{
+ list_move(&inode->i_list, &inode->i_sb->s_more_io_wait);
+}
+
static void inode_sync_complete(struct inode *inode)
{
/*
@@ -206,13 +214,15 @@ static void queue_io(struct super_block
{
list_splice_init(&sb->s_more_io, sb->s_io.prev);
move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this);
+ list_splice_init(&sb->s_more_io_wait, sb->s_io.prev);
}
int sb_has_dirty_inodes(struct super_block *sb)
{
- return !list_empty(&sb->s_dirty) ||
- !list_empty(&sb->s_io) ||
- !list_empty(&sb->s_more_io);
+ return !list_empty(&sb->s_dirty) ||
+ !list_empty(&sb->s_io) ||
+ !list_empty(&sb->s_more_io) ||
+ !list_empty(&sb->s_more_io_wait);
}
EXPORT_SYMBOL(sb_has_dirty_inodes);
@@ -472,11 +482,15 @@ int generic_sync_sb_inodes(struct super_
iput(inode);
cond_resched();
spin_lock(&inode_lock);
- if (wbc->nr_to_write <= 0)
+ if (wbc->nr_to_write <= 0) {
+ wbc->more_io = 1;
break;
+ }
+ if (!list_empty(&sb->s_more_io))
+ wbc->more_io = 1;
+ if (!list_empty(&sb->s_more_io_wait))
+ wbc->more_io_wait = 1;
}
- if (!list_empty(&sb->s_more_io))
- wbc->more_io = 1;
spin_unlock(&inode_lock);
return ret; /* Leave any unwritten inodes on s_io */
}
--- linux-2.6.24-rc6-mm1.orig/mm/page-writeback.c
+++ linux-2.6.24-rc6-mm1/mm/page-writeback.c
@@ -543,6 +543,34 @@ void throttle_vm_writeout(gfp_t gfp_mask
}
/*
+ * Write back up to MAX_WRITEBACK_PAGES.
+ * Return true if there's no more work.
+ */
+static int writeback_some_pages(struct writeback_control *wbc, int nr)
+{
+ int all_done = 0;
+
+ wbc->more_io = 0;
+ wbc->more_io_wait = 0;
+ wbc->encountered_congestion = 0;
+ wbc->nr_to_write = nr;
+
+ writeback_inodes(wbc);
+
+ if (wbc->encountered_congestion)
+ congestion_wait(WRITE, HZ/10);
+
+ if (wbc->more_io)
+ ;
+ else if (wbc->more_io_wait)
+ congestion_wait(WRITE, HZ/10);
+ else
+ all_done = 1;
+
+ return all_done;
+}
+
+/*
* writeback at least _min_pages, and keep writing until the amount of dirty
* memory is less than the background threshold, or until we're all clean.
*/
@@ -553,7 +581,6 @@ static void background_writeout(unsigned
.bdi = NULL,
.sync_mode = WB_SYNC_NONE,
.older_than_this = NULL,
- .nr_to_write = 0,
.nonblocking = 1,
.range_cyclic = 1,
};
@@ -567,19 +594,9 @@ static void background_writeout(unsigned
global_page_state(NR_UNSTABLE_NFS) < background_thresh
&& min_pages <= 0)
break;
- wbc.more_io = 0;
- wbc.encountered_congestion = 0;
- wbc.nr_to_write = MAX_WRITEBACK_PAGES;
- wbc.pages_skipped = 0;
- writeback_inodes(&wbc);
+ if (writeback_some_pages(&wbc, MAX_WRITEBACK_PAGES))
+ break;
min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
- if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
- /* Wrote less than expected */
- if (wbc.encountered_congestion || wbc.more_io)
- congestion_wait(WRITE, HZ/10);
- else
- break;
- }
}
}
@@ -627,7 +644,6 @@ static void wb_kupdate(unsigned long arg
.bdi = NULL,
.sync_mode = WB_SYNC_NONE,
.older_than_this = &oldest_jif,
- .nr_to_write = 0,
.nonblocking = 1,
.for_kupdate = 1,
.range_cyclic = 1,
@@ -642,16 +658,8 @@ static void wb_kupdate(unsigned long arg
global_page_state(NR_UNSTABLE_NFS) +
(inodes_stat.nr_inodes - inodes_stat.nr_unused);
while (nr_to_write > 0) {
- wbc.more_io = 0;
- wbc.encountered_congestion = 0;
- wbc.nr_to_write = MAX_WRITEBACK_PAGES;
- writeback_inodes(&wbc);
- if (wbc.nr_to_write > 0) {
- if (wbc.encountered_congestion || wbc.more_io)
- congestion_wait(WRITE, HZ/10);
- else
- break; /* All the old data is written */
- }
+ if (writeback_some_pages(&wbc, MAX_WRITEBACK_PAGES))
+ break;
nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
}
if (time_before(next_jif, jiffies + HZ))
--- linux-2.6.24-rc6-mm1.orig/fs/super.c
+++ linux-2.6.24-rc6-mm1/fs/super.c
@@ -64,6 +64,7 @@ static struct super_block *alloc_super(s
INIT_LIST_HEAD(&s->s_dirty);
INIT_LIST_HEAD(&s->s_io);
INIT_LIST_HEAD(&s->s_more_io);
+ INIT_LIST_HEAD(&s->s_more_io_wait);
INIT_LIST_HEAD(&s->s_files);
INIT_LIST_HEAD(&s->s_instances);
INIT_HLIST_HEAD(&s->s_anon);
--- linux-2.6.24-rc6-mm1.orig/include/linux/fs.h
+++ linux-2.6.24-rc6-mm1/include/linux/fs.h
@@ -1011,6 +1011,7 @@ struct super_block {
struct list_head s_dirty; /* dirty inodes */
struct list_head s_io; /* parked for writeback */
struct list_head s_more_io; /* parked for more writeback */
+ struct list_head s_more_io_wait; /* parked for sleep-then-retry */
struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */
struct list_head s_files;
--- linux-2.6.24-rc6-mm1.orig/include/linux/writeback.h
+++ linux-2.6.24-rc6-mm1/include/linux/writeback.h
@@ -63,6 +63,7 @@ struct writeback_control {
unsigned for_writepages:1; /* This is a writepages() call */
unsigned range_cyclic:1; /* range_start is cyclic */
unsigned more_io:1; /* more io to be dispatched */
+ unsigned more_io_wait:1; /* more io to be dispatched after a while */
};
/*
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
[Index of Archives]
[Kernel Newbies]
[Netfilter]
[Bugtraq]
[Photo]
[Stuff]
[Gimp]
[Yosemite News]
[MIPS Linux]
[ARM Linux]
[Linux Security]
[Linux RAID]
[Video 4 Linux]
[Linux for the blind]
[Linux Resources]