Hello,
I have recently seen massive problems with the bounce buffer allocation
implementation in 2.6. My system got OOM with an ata_piix controller
(4GB bounce limit), 2 P4 CPUs and 8GB of RAM with a very simple
copy-compare test: About 800MB worth of data are copied with cp(1) in 7
synchronous jobs. Each job does a sync(1) after the copy.
All available ZONE_NORMAL memory is filled with bounce buffers in
fractions of a second after the sync(1) calls are issued.
The system goes OOM (becomes unusable, OOM killer strikes several times
against innocent processes).
The attached quick-and-dirty patch gives you an idea of what I figured
could be a useful workaround for that problem. It follows 2 main ideas:
1) When bounce buffer allocations fail, it is wrong to trigger the page
reclaim mechanism, which is likely to generate even more bounce buffer
requests. Thus there should be no wakeup_bdflush() call for the
page_pool.
2) Bounce buffer allocations should not simply use alloc_page() because
there is no limit for allocations that way. Rather, bounce buffer
allocations should stop if ZONE_NORMAL is full to a certain degree
(my patch stops at 51% - chosen arbitrarily).
For the case I described, the system behavior is clearly improved with
the patch. The system remains usable, and the OOM killer isn't triggered.
(The patch is against an RH enterprise kernel, but it applies against
almost all recent 2.6 kernels except 2.6.12-rcX).
Idea 1) is already implemented in a recent patch by Nick Piggin which
went into 2.6.12-rc5. I haven't tested yet if that patch alone fixes my
problem because I couldn't get 2.6.12-rc5 to run on my system. I doubt
it, though, because I started with idea 1) alone and it didn't help much.
Here is another idea how the bounce buffer behavior could be enhanced:
Introduce a new memory zone ZONE_4G with memory from 896M-4096M
and allocate bounce buffers preferably in that zone. Advantage: Bounce
buffers wouldn't interfere with valuable ZONE_NORMAL memory. But that may
be too much effort just for systems which have too much memory for their
IO controllers...
Regards
Martin
--
Martin Wilck Phone: +49 5251 8 15113
Fujitsu Siemens Computers Fax: +49 5251 8 20409
Heinz-Nixdorf-Ring 1 mailto:[email protected]
D-33106 Paderborn http://www.fujitsu-siemens.com/primergy
diff -rupX /root/diff-exludes linux-2.6.9-6.37.EL-orig/arch/i386/mm/pgtable.c linux-2.6.9-6.37.EL/arch/i386/mm/pgtable.c
--- linux-2.6.9-6.37.EL-orig/arch/i386/mm/pgtable.c 2005-04-11 14:19:03.000000000 +0200
+++ linux-2.6.9-6.37.EL/arch/i386/mm/pgtable.c 2005-05-31 10:27:57.000000000 +0200
@@ -24,6 +24,7 @@
#include <asm/tlbflush.h>
#include <asm/atomic_kmap.h>
+extern atomic_t n_bounce_pages;
void show_mem(void)
{
int total = 0, reserved = 0;
@@ -55,6 +56,7 @@ void show_mem(void)
printk("%d reserved pages\n",reserved);
printk("%d pages shared\n",shared);
printk("%d pages swap cached\n",cached);
+ printk("%d bounce buffers used\n",atomic_read(&n_bounce_pages));
}
EXPORT_SYMBOL_GPL(show_mem);
diff -rupX /root/diff-exludes linux-2.6.9-6.37.EL-orig/Makefile linux-2.6.9-6.37.EL/Makefile
--- linux-2.6.9-6.37.EL-orig/mm/highmem.c 2004-10-18 23:54:37.000000000 +0200
+++ linux-2.6.9-6.37.EL/mm/highmem.c 2005-06-03 15:24:03.000000000 +0200
@@ -28,18 +28,45 @@
#include <linux/highmem.h>
#include <asm/tlbflush.h>
-static mempool_t *page_pool, *isa_page_pool;
+mempool_t *page_pool, *isa_page_pool;
+atomic_t n_bounce_pages = ATOMIC_INIT(0);
+static unsigned int min_free_pages[MAX_NR_ZONES];
+
+static unsigned int zone_free_pages(int type)
+{
+ pg_data_t *pgdat;
+ unsigned int pages = 0;
+ for_each_pgdat(pgdat)
+ pages += pgdat->node_zones[type].free_pages;
+ return pages;
+}
+
+static int sufficient_lowpages(int gfp)
+{
+ int type = (gfp & __GFP_DMA ? ZONE_DMA : ZONE_NORMAL);
+
+ return (zone_free_pages(type) >= min_free_pages[type]);
+}
static void *page_pool_alloc(int gfp_mask, void *data)
{
int gfp = gfp_mask | (int) (long) data;
+ void *page;
+
+ if (!sufficient_lowpages(gfp))
+ return NULL;
- return alloc_page(gfp);
+ page = alloc_page(gfp);
+
+ if (page)
+ atomic_inc(&n_bounce_pages);
+ return page;
}
static void page_pool_free(void *page, void *data)
{
__free_page(page);
+ atomic_dec(&n_bounce_pages);
}
/*
@@ -212,20 +239,25 @@ void fastcall kunmap_high(struct page *p
EXPORT_SYMBOL(kunmap_high);
#define POOL_SIZE 64
+#define POOL_FRAC(x) ((x)/2)
+#define SMALL_POOL_FRAC(x) ((x)/100)
static __init int init_emergency_pool(void)
{
+ unsigned int n = zone_free_pages(ZONE_NORMAL);
struct sysinfo i;
si_meminfo(&i);
si_swapinfo(&i);
+ min_free_pages[ZONE_NORMAL] = POOL_FRAC(n);
if (!i.totalhigh)
return 0;
-
- page_pool = mempool_create(POOL_SIZE, page_pool_alloc, page_pool_free, NULL);
+ n = SMALL_POOL_FRAC(n);
+ n = (n > POOL_SIZE ? n : POOL_SIZE);
+ page_pool = mempool_create(n, page_pool_alloc, page_pool_free, NULL);
if (!page_pool)
BUG();
- printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
+ printk("highmem bounce pool size: %d pages, min: %u\n", n, min_free_pages[ZONE_NORMAL]);
return 0;
}
@@ -265,11 +297,12 @@ int init_emergency_isa_pool(void)
if (isa_page_pool)
return 0;
+ min_free_pages[ZONE_DMA] = POOL_FRAC(zone_free_pages(ZONE_DMA));
isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc, page_pool_free, (void *) __GFP_DMA);
if (!isa_page_pool)
BUG();
- printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
+ printk("isa bounce pool size: %d pages, min: %u\n", ISA_POOL_SIZE, min_free_pages[ZONE_DMA]);
return 0;
}
diff -rupX /root/diff-exludes linux-2.6.9-6.37.EL-orig/mm/mempool.c linux-2.6.9-6.37.EL/mm/mempool.c
--- linux-2.6.9-6.37.EL-orig/mm/mempool.c 2004-10-18 23:54:37.000000000 +0200
+++ linux-2.6.9-6.37.EL/mm/mempool.c 2005-06-03 15:35:01.000000000 +0200
@@ -15,6 +15,8 @@
#include <linux/blkdev.h>
#include <linux/writeback.h>
+extern mempool_t *page_pool, *isa_page_pool;
+
static void add_element(mempool_t *pool, void *element)
{
BUG_ON(pool->curr_nr >= pool->min_nr);
@@ -211,11 +213,12 @@ repeat_alloc:
if (likely(element != NULL))
return element;
}
-
+
/*
* Kick the VM at this point.
*/
- wakeup_bdflush(0);
+ if (pool != page_pool && pool != isa_page_pool)
+ wakeup_bdflush(0);
spin_lock_irqsave(&pool->lock, flags);
if (likely(pool->curr_nr)) {
[Index of Archives]
[Kernel Newbies]
[Netfilter]
[Bugtraq]
[Photo]
[Stuff]
[Gimp]
[Yosemite News]
[MIPS Linux]
[ARM Linux]
[Linux Security]
[Linux RAID]
[Video 4 Linux]
[Linux for the blind]
[Linux Resources]