[RFC][patch] mm: single pcp lists

Having a hot and a cold pcp list means that:

- cold pages are overlooked when a hot page is needed but none available.
- when the hot list spills, the spilled pages don't refill the cold list even if the cold list is low.

Use a single pcp list to solve both these problems.  Disallow cold page
allocation from taking hot pages though.

Index: linux-2.6/include/linux/mmzone.h
===================================================================
--- linux-2.6.orig/include/linux/mmzone.h
+++ linux-2.6/include/linux/mmzone.h
@@ -44,15 +44,13 @@ struct zone_padding {
 #define ZONE_PADDING(name)
 #endif
 
-struct per_cpu_pages {
+struct per_cpu_pageset {
+	struct list_head list;	/* the list of pages */
 	int count;		/* number of pages in the list */
+	int cold_count;		/* number of cold pages in the list */
 	int high;		/* high watermark, emptying needed */
 	int batch;		/* chunk size for buddy add/remove */
-	struct list_head list;	/* the list of pages */
-};
 
-struct per_cpu_pageset {
-	struct per_cpu_pages pcp[2];	/* 0: hot.  1: cold */
 #ifdef CONFIG_NUMA
 	unsigned long numa_hit;		/* allocated in intended node */
 	unsigned long numa_miss;	/* allocated in non intended node */
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -598,27 +598,24 @@ static int rmqueue_bulk(struct zone *zon
 void drain_remote_pages(void)
 {
 	struct zone *zone;
-	int i;
 	unsigned long flags;
 
-	local_irq_save(flags);
 	for_each_zone(zone) {
-		struct per_cpu_pageset *pset;
-
 		/* Do not drain local pagesets */
 		if (zone->zone_pgdat->node_id == numa_node_id())
 			continue;
 
-		pset = zone_pcp(zone, smp_processor_id());
-		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
-			struct per_cpu_pages *pcp;
-
-			pcp = &pset->pcp[i];
-			free_pages_bulk(zone, pcp->count, &pcp->list, 0);
-			pcp->count = 0;
+		local_irq_save(flags);
+		if (zone->zone_pgdat->node_id != numa_node_id()) {
+			struct per_cpu_pageset *pset;
+
+			pset = zone_pcp(zone, smp_processor_id());
+			free_pages_bulk(zone, pset->count, &pset->list, 0);
+			pset->cold_count = 0;
+			pset->count = 0;
 		}
+		local_irq_restore(flags);
 	}
-	local_irq_restore(flags);
 }
 #endif
 
@@ -627,21 +624,16 @@ static void __drain_pages(unsigned int c
 {
 	unsigned long flags;
 	struct zone *zone;
-	int i;
 
 	for_each_zone(zone) {
 		struct per_cpu_pageset *pset;
 
 		pset = zone_pcp(zone, cpu);
-		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
-			struct per_cpu_pages *pcp;
-
-			pcp = &pset->pcp[i];
-			local_irq_save(flags);
-			free_pages_bulk(zone, pcp->count, &pcp->list, 0);
-			pcp->count = 0;
-			local_irq_restore(flags);
-		}
+		local_irq_save(flags);
+		free_pages_bulk(zone, pset->count, &pset->list, 0);
+		pset->cold_count = 0;
+		pset->count = 0;
+		local_irq_restore(flags);
 	}
 }
 #endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
@@ -713,7 +705,7 @@ static void zone_statistics(struct zonel
 static void fastcall free_hot_cold_page(struct page *page, int cold)
 {
 	struct zone *zone = page_zone(page);
-	struct per_cpu_pages *pcp;
+	struct per_cpu_pageset *pset;
 	unsigned long flags;
 
 	arch_free_page(page, 0);
@@ -725,14 +717,22 @@ static void fastcall free_hot_cold_page(
 
 	kernel_map_pages(page, 1, 0);
 
-	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
+	pset = zone_pcp(zone, get_cpu());
 	local_irq_save(flags);
 	__inc_page_state(pgfree);
-	list_add(&page->lru, &pcp->list);
-	pcp->count++;
-	if (pcp->count >= pcp->high) {
-		free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
-		pcp->count -= pcp->batch;
+	pset->count++;
+	if (cold) {
+		pset->cold_count++;
+		list_add_tail(&page->lru, &pset->list);
+	} else {
+		list_add(&page->lru, &pset->list);
+	}
+
+	if (pset->count > pset->high) {
+		int count = pset->batch;
+		free_pages_bulk(zone, count, &pset->list, 0);
+		pset->cold_count -= min(count, pset->cold_count);
+		pset->count -= count;
 	}
 	local_irq_restore(flags);
 	put_cpu();
@@ -782,19 +782,30 @@ static struct page *buffered_rmqueue(str
 again:
 	cpu  = get_cpu();
 	if (likely(order == 0)) {
-		struct per_cpu_pages *pcp;
+		struct per_cpu_pageset *pset;
 
-		pcp = &zone_pcp(zone, cpu)->pcp[cold];
+		pset = zone_pcp(zone, cpu);
 		local_irq_save(flags);
-		if (!pcp->count) {
-			pcp->count += rmqueue_bulk(zone, 0,
-						pcp->batch, &pcp->list);
-			if (unlikely(!pcp->count))
+		if (!pset->count || (cold && !pset->cold_count &&
+				pset->count <= pset->high - (pset->high>>2))) {
+			int count;
+			count = rmqueue_bulk(zone, 0, pset->batch, &pset->list);
+			if (unlikely(!count))
 				goto failed;
+			pset->count += count;
+			pset->cold_count += count;
+		}
+
+		pset->count--;
+		if (cold) {
+			page = list_entry(pset->list.prev, struct page, lru);
+			if (pset->cold_count)
+				pset->cold_count--;
+		} else {
+			page = list_entry(pset->list.next, struct page, lru);
+			pset->cold_count = min(pset->cold_count, pset->count);
 		}
-		page = list_entry(pcp->list.next, struct page, lru);
 		list_del(&page->lru);
-		pcp->count--;
 	} else {
 		spin_lock_irqsave(&zone->lock, flags);
 		page = __rmqueue(zone, order);
@@ -1385,7 +1396,7 @@ void si_meminfo_node(struct sysinfo *val
 void show_free_areas(void)
 {
 	struct page_state ps;
-	int cpu, temperature;
+	int cpu;
 	unsigned long active;
 	unsigned long inactive;
 	unsigned long free;
@@ -1402,17 +1413,11 @@ void show_free_areas(void)
 			printk("\n");
 
 		for_each_online_cpu(cpu) {
-			struct per_cpu_pageset *pageset;
-
-			pageset = zone_pcp(zone, cpu);
+			struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
 
-			for (temperature = 0; temperature < 2; temperature++)
-				printk("cpu %d %s: high %d, batch %d used:%d\n",
-					cpu,
-					temperature ? "cold" : "hot",
-					pageset->pcp[temperature].high,
-					pageset->pcp[temperature].batch,
-					pageset->pcp[temperature].count);
+			printk("cpu %d: high %d, batch %d, pages %d, cold %d\n",
+				cpu, pset->high, pset->batch,
+				pset->count, pset->cold_count);
 		}
 	}
 
@@ -1845,23 +1850,14 @@ static int __cpuinit zone_batchsize(stru
 	return batch;
 }
 
-inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+static inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
 {
-	struct per_cpu_pages *pcp;
-
 	memset(p, 0, sizeof(*p));
-
-	pcp = &p->pcp[0];		/* hot */
-	pcp->count = 0;
-	pcp->high = 6 * batch;
-	pcp->batch = max(1UL, 1 * batch);
-	INIT_LIST_HEAD(&pcp->list);
-
-	pcp = &p->pcp[1];		/* cold*/
-	pcp->count = 0;
-	pcp->high = 2 * batch;
-	pcp->batch = max(1UL, batch/2);
-	INIT_LIST_HEAD(&pcp->list);
+	p->count = 0;
+	p->cold_count = 0;
+	p->high = 6 * batch;
+	p->batch = max(1UL, 1 * batch);
+	INIT_LIST_HEAD(&p->list);
 }
 
 /*
@@ -1869,16 +1865,13 @@ inline void setup_pageset(struct per_cpu
  * to the value high for the pageset p.
  */
 
-static void setup_pagelist_highmark(struct per_cpu_pageset *p,
+static void setup_pagelist_highmark(struct per_cpu_pageset *pset,
 				unsigned long high)
 {
-	struct per_cpu_pages *pcp;
-
-	pcp = &p->pcp[0]; /* hot list */
-	pcp->high = high;
-	pcp->batch = max(1UL, high/4);
-	if ((high/4) > (PAGE_SHIFT * 8))
-		pcp->batch = PAGE_SHIFT * 8;
+	pset->high = high;
+	pset->batch = max(1UL, high/4);
+	if (pset->batch > PAGE_SHIFT * 8)
+		pset->batch = PAGE_SHIFT * 8;
 }
 
 
@@ -2259,27 +2252,15 @@ static int zoneinfo_show(struct seq_file
 			   ")"
 			   "\n  pagesets");
 		for_each_online_cpu(i) {
-			struct per_cpu_pageset *pageset;
-			int j;
+			struct per_cpu_pageset *pset;
 
-			pageset = zone_pcp(zone, i);
-			for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
-				if (pageset->pcp[j].count)
-					break;
-			}
-			if (j == ARRAY_SIZE(pageset->pcp))
-				continue;
-			for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
-				seq_printf(m,
-					   "\n    cpu: %i pcp: %i"
-					   "\n              count: %i"
-					   "\n              high:  %i"
-					   "\n              batch: %i",
-					   i, j,
-					   pageset->pcp[j].count,
-					   pageset->pcp[j].high,
-					   pageset->pcp[j].batch);
-			}
+			pset = zone_pcp(zone, i);
+			seq_printf(m,
+				   "\n    cpu: %i, pcp"
+				   "\n              count: %i"
+				   "\n              high:  %i"
+				   "\n              batch: %i",
+				   i, pset->count, pset->high, pset->batch);
 #ifdef CONFIG_NUMA
 			seq_printf(m,
 				   "\n            numa_hit:       %lu"
@@ -2288,12 +2269,12 @@ static int zoneinfo_show(struct seq_file
 				   "\n            interleave_hit: %lu"
 				   "\n            local_node:     %lu"
 				   "\n            other_node:     %lu",
-				   pageset->numa_hit,
-				   pageset->numa_miss,
-				   pageset->numa_foreign,
-				   pageset->interleave_hit,
-				   pageset->local_node,
-				   pageset->other_node);
+				   pset->numa_hit,
+				   pset->numa_miss,
+				   pset->numa_foreign,
+				   pset->interleave_hit,
+				   pset->local_node,
+				   pset->other_node);
 #endif
 		}
 		seq_printf(m,
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Follow-Ups:
- Re: [RFC][patch] mm: single pcp lists
  - From: Joel Schopp <jschopp@austin.ibm.com>
- [patch] mm: pcp stats
  - From: Nick Piggin <npiggin@suse.de>
Prev by Date: Re: Mapping to 0x0
Next by Date: [patch] mm: pcp stats
Previous by thread: Mapping to 0x0
Next by thread: [patch] mm: pcp stats
Index(es):
- Date
- Thread
[Index of Archives] [Kernel Newbies] [Netfilter] [Bugtraq] [Photo] [Stuff] [Gimp] [Yosemite News] [MIPS Linux] [ARM Linux] [Linux Security] [Linux RAID] [Video 4 Linux] [Linux for the blind] [Linux Resources]