On an embedded system, limiting the page cache can relieve memory
fragmentation. Below is a patch against 2.6.19 which limits both the
page cache of every opened file and the total page cache. When a limit
is reached, it releases the page cache pages that exceed the limit.
Index: include/linux/pagemap.h
===================================================================
--- include/linux/pagemap.h (revision 2628)
+++ include/linux/pagemap.h (working copy)
@@ -12,6 +12,7 @@
#include <asm/uaccess.h>
#include <linux/gfp.h>
+extern int total_pagecache_limit;
/*
* Bits in mapping->flags. The lower __GFP_BITS_SHIFT bits are the page
* allocation mode flags.
Index: include/linux/fs.h
===================================================================
--- include/linux/fs.h (revision 2628)
+++ include/linux/fs.h (working copy)
@@ -444,6 +444,10 @@
spinlock_t private_lock; /* for use by the address_space */
struct list_head private_list; /* ditto */
struct address_space *assoc_mapping; /* ditto */
+#ifdef CONFIG_LIMIT_PAGECACHE
+ unsigned long pages_limit;
+ struct list_head page_head;
+#endif
} __attribute__((aligned(sizeof(long))));
/*
* On most architectures that alignment is already the case; but
Index: include/linux/mm.h
===================================================================
--- include/linux/mm.h (revision 2628)
+++ include/linux/mm.h (working copy)
@@ -231,6 +231,9 @@
#else
#define VM_BUG_ON(condition) do { } while(0)
#endif
+#ifdef CONFIG_LIMIT_PAGECACHE
+ struct list_head page_list;
+#endif
/*
* Methods to modify the page usage count.
@@ -1030,7 +1033,21 @@
/* mm/page-writeback.c */
int write_one_page(struct page *page, int wait);
+/* possible outcome of pageout() */
+typedef enum {
+ /* failed to write page out, page is locked */
+ PAGE_KEEP,
+ /* move page to the active list, page is locked */
+ PAGE_ACTIVATE,
+ /* page has been sent to the disk successfully, page is unlocked */
+ PAGE_SUCCESS,
+ /* page is clean and locked */
+ PAGE_CLEAN,
+} pageout_t;
+
+pageout_t pageout(struct page *page, struct address_space *mapping);
+
/* readahead.c */
#define VM_MAX_READAHEAD 128 /* kbytes */
#define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */
Index: init/Kconfig
===================================================================
--- init/Kconfig (revision 2628)
+++ init/Kconfig (working copy)
@@ -419,6 +419,19 @@
option replaces shmem and tmpfs with the much simpler ramfs code,
which may be appropriate on small systems without swap.
+config LIMIT_PAGECACHE
+ bool "Limit page caches" if EMBEDDED
+
+config PAGECACHE_LIMIT
+ int "Page cache limit for every file in page unit"
+ depends on LIMIT_PAGECACHE
+ default 32
+
+config PAGECACHE_LIMIT_TOTAL
+ int "Total page cache limit in MB unit"
+ depends on LIMIT_PAGECACHE
+ default 10
+
choice
prompt "Page frame management algorithm"
default BUDDY
Index: fs/inode.c
===================================================================
--- fs/inode.c (revision 2628)
+++ fs/inode.c (working copy)
@@ -205,6 +205,10 @@
INIT_LIST_HEAD(&inode->inotify_watches);
mutex_init(&inode->inotify_mutex);
#endif
+#ifdef CONFIG_LIMIT_PAGECACHE
+ INIT_LIST_HEAD(&inode->i_data.page_head);
+ inode->i_data.pages_limit = CONFIG_PAGECACHE_LIMIT;
+#endif
}
EXPORT_SYMBOL(inode_init_once);
Index: mm/filemap.c
===================================================================
--- mm/filemap.c (revision 2628)
+++ mm/filemap.c (working copy)
@@ -18,6 +18,7 @@
#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
@@ -30,6 +31,9 @@
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/cpuset.h>
+#include <linux/rmap.h>
+#include <linux/buffer_head.h>
+#include <linux/page-flags.h>
#include "filemap.h"
#include "internal.h"
@@ -119,6 +123,9 @@
radix_tree_delete(&mapping->page_tree, page->index);
page->mapping = NULL;
mapping->nrpages--;
+#ifdef CONFIG_LIMIT_PAGECACHE
+ list_del_init(&page->page_list);
+#endif
__dec_zone_page_state(page, NR_FILE_PAGES);
}
@@ -169,6 +176,96 @@
return 0;
}
+#ifdef CONFIG_LIMIT_PAGECACHE
+static void balance_cache(struct address_space *mapping)
+{
+ /* Release half of the pages */
+ int count ;
+ int nr_released = 0;
+ struct page *page;
+ struct zone *zone= NULL;
+ struct pagevec freed_pvec;
+ struct list_head ret_list;
+
+ count = mapping->nrpages /2;
+ pagevec_init(&freed_pvec, 0);
+ INIT_LIST_HEAD(&ret_list);
+ lru_add_drain();
+ while(count-->0) {
+ page = list_entry(mapping->page_head.prev, struct page, page_list);
+ zone = page_zone(page);
+ TestClearPageLRU(page);
+ if (PageActive(page))
+ del_page_from_active_list(zone, page);
+ else
+ del_page_from_inactive_list(zone, page);
+
+ list_del_init(&page->page_list); /* Remove from current process's page list */
+ get_page(page);
+
+ if (TestSetPageLocked(page))
+ goto __keep;
+ if (PageWriteback(page))
+ goto __keep_locked;
+ if (page_referenced(page, 1))
+ goto __keep_locked;
+ if (PageDirty(page)) {
+ switch(pageout(page, mapping)) {
+ case PAGE_KEEP:
+ case PAGE_ACTIVATE:
+ goto __keep_locked;
+ case PAGE_SUCCESS:
+ if (PageWriteback(page) || PageDirty(page))
+ goto __keep;
+ if (TestSetPageLocked(page))
+ goto __keep;
+ if (PageDirty(page) || PageWriteback(page))
+ goto __keep_locked;
+ case PAGE_CLEAN:
+ ;
+ }
+ }
+
+ if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
+ goto __keep_locked;
+ if (!remove_mapping(mapping, page))
+ goto __keep_locked;
+
+ unlock_page(page);
+ nr_released++;
+ /* This page maybe in Active LRU */
+ ClearPageActive(page);
+ ClearPageUptodate(page);
+ if (!pagevec_add(&freed_pvec, page))
+ __pagevec_release_nonlru(&freed_pvec);
+ continue;
+__keep_locked:
+ unlock_page(page);
+__keep:
+ SetPageLRU(page);
+ if (PageActive(page)) {
+ add_page_to_active_list(zone, page);
+ } else {
+ add_page_to_inactive_list(zone, page);
+ }
+
+ list_add(&page->page_list, &ret_list);
+ }
+ while(!list_empty(&ret_list)) {
+ page = list_entry(ret_list.prev, struct page, page_list);
+ list_move_tail(&page->page_list, &mapping->page_head);
+ put_page(page);
+ }
+ if (pagevec_count(&freed_pvec))
+ __pagevec_release_nonlru(&freed_pvec);
+
+ if (global_page_state(NR_FILE_PAGES) > total_pagecache_limit)
+ if (zone) {
+ wakeup_kswapd(zone, 0);
+ }
+}
+#endif
+
/**
* __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
* @mapping: address space structure to write
@@ -448,6 +545,10 @@
page->mapping = mapping;
page->index = offset;
mapping->nrpages++;
+#ifdef CONFIG_LIMIT_PAGECACHE
+ list_add(&page->page_list, &mapping->page_head);
+#endif
+
__inc_zone_page_state(page, NR_FILE_PAGES);
}
write_unlock_irq(&mapping->tree_lock);
@@ -1085,6 +1186,10 @@
page_cache_release(cached_page);
if (filp)
file_accessed(filp);
+#ifdef CONFIG_LIMIT_PAGECACHE
+ if (mapping->nrpages >= mapping->pages_limit)
+ balance_cache(mapping);
+#endif
}
EXPORT_SYMBOL(do_generic_mapping_read);
@@ -2195,6 +2300,11 @@
if (cached_page)
page_cache_release(cached_page);
+#ifdef CONFIG_LIMIT_PAGECACHE
+ if (mapping->nrpages >= mapping->pages_limit)
+ balance_cache(mapping);
+#endif
+
/*
* For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
*/
Index: mm/vmscan.c
===================================================================
--- mm/vmscan.c (revision 2628)
+++ mm/vmscan.c (working copy)
@@ -116,6 +116,7 @@
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
+int total_pagecache_limit = CONFIG_PAGECACHE_LIMIT_TOTAL * 1024 / 4;
/*
* Add a shrinker callback to be called from the vm
@@ -292,23 +293,11 @@
unlock_page(page);
}
-/* possible outcome of pageout() */
-typedef enum {
- /* failed to write page out, page is locked */
- PAGE_KEEP,
- /* move page to the active list, page is locked */
- PAGE_ACTIVATE,
- /* page has been sent to the disk successfully, page is unlocked */
- PAGE_SUCCESS,
- /* page is clean and locked */
- PAGE_CLEAN,
-} pageout_t;
-
/*
* pageout is called by shrink_page_list() for each dirty page.
* Calls ->writepage().
*/
-static pageout_t pageout(struct page *page, struct address_space *mapping)
+pageout_t pageout(struct page *page, struct address_space *mapping)
{
/*
* If the page is dirty, only perform writeback if that write
@@ -1328,7 +1317,11 @@
order = pgdat->kswapd_max_order;
}
finish_wait(&pgdat->kswapd_wait, &wait);
- balance_pgdat(pgdat, order);
+ if (global_page_state(NR_FILE_PAGES) >= total_pagecache_limit)
+ balance_pgdat(pgdat, (global_page_state(NR_FILE_PAGES) \
+ - total_pagecache_limit), order);
+ else
+ balance_pgdat(pgdat, order);
}
return 0;
}
@@ -1344,8 +1337,10 @@
return;
pgdat = zone->zone_pgdat;
- if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
- return;
+ if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) {
+ if (global_page_state(NR_FILE_PAGES) < total_pagecache_limit)
+ return;
+ }
if (pgdat->kswapd_max_order < order)
pgdat->kswapd_max_order = order;
if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
[Index of Archives]
[Kernel Newbies]
[Netfilter]
[Bugtraq]
[Photo]
[Stuff]
[Gimp]
[Yosemite News]
[MIPS Linux]
[ARM Linux]
[Linux Security]
[Linux RAID]
[Video 4 Linux]
[Linux for the blind]
[Linux Resources]