Hi David,
Forgive my selective quoting...
David Chinner wrote:
Take a large file - say Size = 5x RAM or so - and then start
N threads running at offset (n / Size) where n = the thread
number. They each read (Size / N) and so typically don't overlap.
Throughput with increasing numbers of threads on a 24p altix
on an XFS filesystem on 2.6.15-rc5 looks like:
Loads tput
----- -------
1 789.59
2 1191.56
4 1724.63
8 1213.63
16 1057.03
32 744.73
Basically, we hit a scaling limitation between 4 and 8 threads. This was
consistent across I/O sizes from 4KB to 4MB. I took a simple 30s PC sample
profile:
Percent Routine
--------------------------
63.62 _write_lock_irqsave
15.66 _read_unlock_irq
So read_unlock_irq looks to be triggered by the mapping->tree_lock.
I think that the write_lock_irqsave() contention is from memory
reclaim (shrink_list()->try_to_release_page()-> ->releasepage()->
xfs_vm_releasepage()-> try_to_free_buffers()->clear_page_dirty()->
test_clear_page_dirty()-> write_lock_irqsave(&mapping->tree_lock...))
because page cache memory was full of this one file and demand is
causing them to be constantly recycled.
I'd say you're right.
tree_lock contention will be coming from a number of sources. reclaim,
as you say, will be a big one. mpage_readpages (from readahead) will
be another.
Then the read lock in find_get_page in generic_mapping_read will start
contending heavily with the writers and not get much concurrency.
I'm sure lockless (read-side) pagecache will help... not only will it
eliminate read_lock costs, but the reduced read contention should also
decrease write_lock contention and bouncing.
As well as lockless pagecache, I think we can batch tree_lock operations
in readahead. Would be interesting to see how much this patch helps.
--
SUSE Labs, Novell Inc.
Index: linux-2.6/fs/mpage.c
===================================================================
--- linux-2.6.orig/fs/mpage.c 2006-04-30 19:19:14.000000000 +1000
+++ linux-2.6/fs/mpage.c 2006-04-30 19:23:08.000000000 +1000
@@ -26,6 +26,7 @@
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
+#include <linux/swap.h>
/*
* I/O completion handler for multipage BIOs.
@@ -389,31 +390,57 @@
struct bio *bio = NULL;
unsigned page_idx;
sector_t last_block_in_bio = 0;
- struct pagevec lru_pvec;
+ struct pagevec pvec;
struct buffer_head map_bh;
unsigned long first_logical_block = 0;
clear_buffer_mapped(&map_bh);
- pagevec_init(&lru_pvec, 0);
+ pagevec_init(&pvec, 0);
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
struct page *page = list_entry(pages->prev, struct page, lru);
prefetchw(&page->flags);
list_del(&page->lru);
- if (!add_to_page_cache(page, mapping,
- page->index, GFP_KERNEL)) {
- bio = do_mpage_readpage(bio, page,
- nr_pages - page_idx,
- &last_block_in_bio, &map_bh,
- &first_logical_block,
- get_block);
- if (!pagevec_add(&lru_pvec, page))
- __pagevec_lru_add(&lru_pvec);
- } else {
- page_cache_release(page);
+
+ if (!pagevec_add(&pvec, page) || page_idx == nr_pages-1) {
+ int i = 0, in_cache;
+
+ if (radix_tree_preload(GFP_KERNEL))
+ goto pagevec_error;
+
+ write_lock_irq(&mapping->tree_lock);
+ for (; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+ unsigned long offset = page->index;
+
+ if (__add_to_page_cache(page, mapping, offset))
+ break; /* error */
+ }
+ write_unlock_irq(&mapping->tree_lock);
+ radix_tree_preload_end();
+
+ in_cache = i;
+ for (i = 0; i < in_cache; i++) {
+ struct page *page = pvec.pages[i];
+
+ bio = do_mpage_readpage(bio, page,
+ nr_pages - page_idx,
+ &last_block_in_bio, &map_bh,
+ &first_logical_block,
+ get_block);
+ lru_cache_add(page);
+ }
+
+pagevec_error:
+ for (; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+ page_cache_release(page);
+ }
+
+ pagevec_reinit(&pvec);
}
}
- pagevec_lru_add(&lru_pvec);
+
BUG_ON(!list_empty(pages));
if (bio)
mpage_bio_submit(READ, bio);
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c 2006-04-30 19:19:14.000000000 +1000
+++ linux-2.6/mm/filemap.c 2006-04-30 19:20:01.000000000 +1000
@@ -394,6 +394,21 @@
return err;
}
+int __add_to_page_cache(struct page *page, struct address_space *mapping,
+ pgoff_t offset)
+{
+ int error = radix_tree_insert(&mapping->page_tree, offset, page);
+ if (!error) {
+ page_cache_get(page);
+ SetPageLocked(page);
+ page->mapping = mapping;
+ page->index = offset;
+ mapping->nrpages++;
+ pagecache_acct(1);
+ }
+ return error;
+}
+
/*
* This function is used to add newly allocated pagecache pages:
* the page is new, so we can just run SetPageLocked() against it.
@@ -407,19 +422,10 @@
int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
if (error == 0) {
- write_lock_irq(&mapping->tree_lock);
- error = radix_tree_insert(&mapping->page_tree, offset, page);
- if (!error) {
- page_cache_get(page);
- SetPageLocked(page);
- page->mapping = mapping;
- page->index = offset;
- mapping->nrpages++;
- pagecache_acct(1);
- }
- write_unlock_irq(&mapping->tree_lock);
+ error = __add_to_page_cache(page, mapping, offset);
radix_tree_preload_end();
}
+
return error;
}
Index: linux-2.6/mm/readahead.c
===================================================================
--- linux-2.6.orig/mm/readahead.c 2006-04-30 19:19:14.000000000 +1000
+++ linux-2.6/mm/readahead.c 2006-04-30 19:23:19.000000000 +1000
@@ -14,6 +14,7 @@
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
+#include <linux/swap.h>
void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
{
@@ -164,37 +165,60 @@
EXPORT_SYMBOL(read_cache_pages);
-static int read_pages(struct address_space *mapping, struct file *filp,
+static void __pagevec_read_pages(struct file *filp,
+ struct address_space *mapping, struct pagevec *pvec)
+{
+ int i = 0, in_cache;
+
+ if (radix_tree_preload(GFP_KERNEL))
+ goto out_error;
+
+ write_lock_irq(&mapping->tree_lock);
+ for (; i < pagevec_count(pvec); i++) {
+ struct page *page = pvec->pages[i];
+ unsigned long offset = page->index;
+
+ if (__add_to_page_cache(page, mapping, offset))
+ break; /* error */
+ }
+ write_unlock_irq(&mapping->tree_lock);
+ radix_tree_preload_end();
+
+ in_cache = i;
+ for (i = 0; i < in_cache; i++) {
+ struct page *page = pvec->pages[i];
+ mapping->a_ops->readpage(filp, page);
+ lru_cache_add(page);
+ }
+
+out_error:
+ for (; i < pagevec_count(pvec); i++) {
+ struct page *page = pvec->pages[i];
+ page_cache_release(page);
+ }
+
+ pagevec_reinit(pvec);
+}
+
+static void read_pages(struct address_space *mapping, struct file *filp,
struct list_head *pages, unsigned nr_pages)
{
- unsigned page_idx;
- struct pagevec lru_pvec;
- int ret;
+ unsigned i;
+ struct pagevec pvec;
if (mapping->a_ops->readpages) {
- ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
- goto out;
+ mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
+ return;
}
- pagevec_init(&lru_pvec, 0);
- for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+ pagevec_init(&pvec, 0);
+ for (i = 0; i < nr_pages; i++) {
struct page *page = list_to_page(pages);
list_del(&page->lru);
- if (!add_to_page_cache(page, mapping,
- page->index, GFP_KERNEL)) {
- ret = mapping->a_ops->readpage(filp, page);
- if (ret != AOP_TRUNCATED_PAGE) {
- if (!pagevec_add(&lru_pvec, page))
- __pagevec_lru_add(&lru_pvec);
- continue;
- } /* else fall through to release */
- }
- page_cache_release(page);
+
+ if (!pagevec_add(&pvec, page) || i == nr_pages-1)
+ __pagevec_read_pages(filp, mapping, &pvec);
}
- pagevec_lru_add(&lru_pvec);
- ret = 0;
-out:
- return ret;
}
/*
Index: linux-2.6/include/linux/pagemap.h
===================================================================
--- linux-2.6.orig/include/linux/pagemap.h 2006-04-30 19:19:14.000000000 +1000
+++ linux-2.6/include/linux/pagemap.h 2006-04-30 19:20:01.000000000 +1000
@@ -97,6 +97,8 @@
extern int read_cache_pages(struct address_space *mapping,
struct list_head *pages, filler_t *filler, void *data);
+int __add_to_page_cache(struct page *page, struct address_space *mapping,
+ unsigned long index);
int add_to_page_cache(struct page *page, struct address_space *mapping,
unsigned long index, gfp_t gfp_mask);
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
[Index of Archives]
[Kernel Newbies]
[Netfilter]
[Bugtraq]
[Photo]
[Stuff]
[Gimp]
[Yosemite News]
[MIPS Linux]
[ARM Linux]
[Linux Security]
[Linux RAID]
[Video 4 Linux]
[Linux for the blind]
[Linux Resources]