Pull the floating proportion code out of mm/page-writeback.c and make it a library: the mechanics move to lib/proportions.c, with the public API in include/linux/proportions.h.
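
A proportion domain is described by a prop_descriptor; each participant
embeds a prop_local. Usage follows the BDI conversion below; a minimal
sketch (the bdi->completions wiring lives in mm/backing-dev.c, error
handling omitted):

	struct prop_descriptor vm_completions;
	struct prop_global *pg;
	long numerator, denominator;

	/* boot: pick the period, 2^shift events */
	prop_descriptor_init(&vm_completions, calc_period_shift());

	/* once per participating object */
	prop_local_init(&bdi->completions);

	/* on each event: ++x_{j}, ++t */
	pg = prop_get_global(&vm_completions);
	__prop_inc(pg, &bdi->completions);
	prop_put_global(&vm_completions, pg);

	/* read p_{j} back as numerator/denominator */
	pg = prop_get_global(&vm_completions);
	prop_fraction(pg, &bdi->completions, &numerator, &denominator);
	prop_put_global(&vm_completions, pg);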
Signed-off-by: Peter Zijlstra <[email protected]>
---
include/linux/proportions.h | 82 ++++++++++++
lib/Makefile | 2
lib/proportions.c | 259 +++++++++++++++++++++++++++++++++++++++
mm/backing-dev.c | 5
mm/page-writeback.c | 290 ++------------------------------------------
5 files changed, 364 insertions(+), 274 deletions(-)
Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c 2007-05-10 10:52:24.000000000 +0200
+++ linux-2.6/mm/page-writeback.c 2007-05-10 11:06:12.000000000 +0200
@@ -116,93 +116,8 @@ static void background_writeout(unsigned
* because demand can/will vary over time. The length of this period itself is
* measured in page writeback completions.
*
- * DETAILS:
- *
- * The floating proportion is a time derivative with an exponentially decaying
- * history:
- *
- * p_{j} = \Sum_{i=0} (dx_{j}/dt_{-i}) / 2^(1+i)
- *
- * Where j is an element from {BDIs}, x_{j} is j's number of completions, and i
- * the time period over which the differential is taken. So d/dt_{-i} is the
- * differential over the i-th last period.
- *
- * The decaying history gives smooth transitions. The time differential carries
- * the notion of speed.
- *
- * The denominator is 2^(1+i) because we want the series to be normalised, ie.
- *
- * \Sum_{i=0} 1/2^(1+i) = 1
- *
- * Further more, if we measure time (t) in the same events as x; so that:
- *
- * t = \Sum_{j} x_{j}
- *
- * we get that:
- *
- * \Sum_{j} p_{j} = 1
- *
- * Writing this in an iterative fashion we get (dropping the 'd's):
- *
- * if (++x_{j}, ++t > period)
- * t /= 2;
- * for_each (j)
- * x_{j} /= 2;
- *
- * so that:
- *
- * p_{j} = x_{j} / t;
- *
- * We optimize away the '/= 2' for the global time delta by noting that:
- *
- * if (++t > period) t /= 2:
- *
- * Can be approximated by:
- *
- * period/2 + (++t % period/2)
- *
- * [ Furthermore, when we choose period to be 2^n it can be written in terms of
- * binary operations and wraparound artefacts disappear. ]
- *
- * Also note that this yields a natural counter of the elapsed periods:
- *
- * c = t / (period/2)
- *
- * [ Its monotonic increasing property can be applied to mitigate the wrap-
- * around issue. ]
- *
- * This allows us to do away with the loop over all BDIs on each period
- * expiration. By remembering the period count under which it was last
- * accessed as c_{j}, we can obtain the number of 'missed' cycles from:
- *
- * c - c_{j}
- *
- * We can then lazily catch up to the global period count every time we are
- * going to use x_{j}, by doing:
- *
- * x_{j} /= 2^(c - c_{j}), c_{j} = c
*/
-
-struct vm_completions_data {
- /*
- * The period over which we differentiate (in pages)
- *
- * period = 2^shift
- */
- int shift;
-
- /*
- * The total page writeback completion counter aka 'time'.
- *
- * Treated as an unsigned long; the lower 'shift - 1' bits are the
- * counter bits, the remaining upper bits the period counter.
- */
- struct percpu_counter completions;
-};
-
-static int vm_completions_index;
-static struct vm_completions_data vm_completions[2];
-static DEFINE_MUTEX(vm_completions_mutex);
+struct prop_descriptor vm_completions;
static unsigned long determine_dirtyable_memory(void);
@@ -219,85 +134,6 @@ static int calc_period_shift(void)
return 2 + ilog2(dirty_total - 1);
}
-static void vcd_init(void)
-{
- vm_completions[0].shift = calc_period_shift();
- percpu_counter_init(&vm_completions[0].completions, 0);
- percpu_counter_init(&vm_completions[1].completions, 0);
-}
-
-/*
- * We have two copies, and flip between them to make it seem like an atomic
- * update. The update is not really atomic wrt the completions counter, but
- * it is internally consistent with the bit layout depending on shift.
- *
- * We calculate the new shift, copy the completions count, move the bits around
- * and flip the index.
- */
-static void vcd_flip(void)
-{
- int index;
- int shift;
- int offset;
- u64 completions;
- unsigned long flags;
-
- mutex_lock(&vm_completions_mutex);
-
- index = vm_completions_index ^ 1;
- shift = calc_period_shift();
- offset = vm_completions[vm_completions_index].shift - shift;
- if (!offset)
- goto out;
-
- vm_completions[index].shift = shift;
-
- local_irq_save(flags);
- completions = percpu_counter_sum_signed(
- &vm_completions[vm_completions_index].completions);
-
- if (offset < 0)
- completions <<= -offset;
- else
- completions >>= offset;
-
- percpu_counter_set(&vm_completions[index].completions, completions);
-
- /*
- * ensure the new vcd is fully written before the switch
- */
- smp_wmb();
- vm_completions_index = index;
- local_irq_restore(flags);
-
- synchronize_rcu();
-
-out:
- mutex_unlock(&vm_completions_mutex);
-}
-
-/*
- * wrap the access to the data in an rcu_read_lock() section;
- * this is used to track the active references.
- */
-static struct vm_completions_data *get_vcd(void)
-{
- int index;
-
- rcu_read_lock();
- index = vm_completions_index;
- /*
- * match the wmb from vcd_flip()
- */
- smp_rmb();
- return &vm_completions[index];
-}
-
-static void put_vcd(struct vm_completions_data *vcd)
-{
- rcu_read_unlock();
-}
-
/*
* update the period when the dirty ratio changes.
*/
@@ -307,130 +143,38 @@ int dirty_ratio_handler(ctl_table *table
{
int old_ratio = vm_dirty_ratio;
int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
- if (ret == 0 && write && vm_dirty_ratio != old_ratio)
- vcd_flip();
- return ret;
-}
-
-/*
- * adjust the bdi local data to changes in the bit layout.
- */
-static void bdi_adjust_shift(struct backing_dev_info *bdi, int shift)
-{
- int offset = bdi->shift - shift;
-
- if (!offset)
- return;
-
- if (offset < 0)
- bdi->period <<= -offset;
- else
- bdi->period >>= offset;
-
- bdi->shift = shift;
-}
-
-/*
- * Catch up with missed period expirations.
- *
- * until (c_{j} == c)
- * x_{j} -= x_{j}/2;
- * c_{j}++;
- */
-static void bdi_writeout_norm(struct backing_dev_info *bdi,
- struct vm_completions_data *vcd)
-{
- unsigned long period = 1UL << (vcd->shift - 1);
- unsigned long period_mask = ~(period - 1);
- unsigned long global_period;
- unsigned long flags;
-
- global_period = percpu_counter_read(&vcd->completions);
- global_period &= period_mask;
-
- /*
- * Fast path - check if the local and global period count still match
- * outside of the lock.
- */
- if (bdi->period == global_period)
- return;
-
- spin_lock_irqsave(&bdi->lock, flags);
- bdi_adjust_shift(bdi, vcd->shift);
- /*
- * For each missed period, we half the local counter.
- * basically:
- * bdi_stat(bdi, BDI_COMPLETION) >> (global_period - bdi->period);
- *
- * but since the distributed nature of percpu counters make division
- * rather hard, use a regular subtraction loop. This is safe, because
- * BDI_COMPLETION will only every be incremented, hence the subtraction
- * can never result in a negative number.
- */
- while (bdi->period != global_period) {
- unsigned long val = bdi_stat_unsigned(bdi, BDI_COMPLETION);
- unsigned long half = (val + 1) >> 1;
-
- /*
- * Half of zero won't be much less, break out.
- * This limits the loop to shift iterations, even
- * if we missed a million.
- */
- if (!val)
- break;
-
- /*
- * Iff shift >32 half might exceed the limits of
- * the regular percpu_counter_mod.
- */
- __mod_bdi_stat64(bdi, BDI_COMPLETION, -half);
- bdi->period += period;
+ if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
+ int shift = calc_period_shift();
+ prop_change_shift(&vm_completions, shift);
}
- bdi->period = global_period;
- bdi->shift = vcd->shift;
- spin_unlock_irqrestore(&bdi->lock, flags);
+ return ret;
}
/*
* Increment the BDI's writeout completion count and the global writeout
* completion count. Called from test_clear_page_writeback().
- *
- * ++x_{j}, ++t
*/
static void __bdi_writeout_inc(struct backing_dev_info *bdi)
{
- struct vm_completions_data *vcd = get_vcd();
- /* Catch up with missed period expirations before using the counter. */
- bdi_writeout_norm(bdi, vcd);
- __inc_bdi_stat(bdi, BDI_COMPLETION);
-
- percpu_counter_mod(&vcd->completions, 1);
- put_vcd(vcd);
+ struct prop_global *pg = prop_get_global(&vm_completions);
+ __prop_inc(pg, &bdi->completions);
+ prop_put_global(&vm_completions, pg);
}
/*
* Obtain an accurate fraction of the BDI's portion.
- *
- * p_{j} = x_{j} / (period/2 + t % period/2)
*/
void bdi_writeout_fraction(struct backing_dev_info *bdi,
long *numerator, long *denominator)
{
- struct vm_completions_data *vcd = get_vcd();
- unsigned long period_2 = 1UL << (vcd->shift - 1);
- unsigned long counter_mask = period_2 - 1;
- unsigned long global_count;
-
if (bdi_cap_writeback_dirty(bdi)) {
- /* Catch up with the period expirations before use. */
- bdi_writeout_norm(bdi, vcd);
- *numerator = bdi_stat(bdi, BDI_COMPLETION);
- } else
+ struct prop_global *pg = prop_get_global(&vm_completions);
+ prop_fraction(pg, &bdi->completions, numerator, denominator);
+ prop_put_global(&vm_completions, pg);
+ } else {
*numerator = 0;
-
- global_count = percpu_counter_read(&vcd->completions);
- *denominator = period_2 + (global_count & counter_mask);
- put_vcd(vcd);
+ *denominator = 1;
+ }
}
/*
@@ -980,10 +724,14 @@ static struct notifier_block __cpuinitda
*/
void __init page_writeback_init(void)
{
+ int shift;
+
mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
writeback_set_ratelimit();
register_cpu_notifier(&ratelimit_nb);
- vcd_init();
+
+ shift = calc_period_shift();
+ prop_descriptor_init(&vm_completions, shift);
}
/**
Index: linux-2.6/lib/proportions.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/lib/proportions.c 2007-05-10 11:03:01.000000000 +0200
@@ -0,0 +1,259 @@
+/*
+ * Floating proportions
+ *
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <[email protected]>
+ *
+ * DETAILS:
+ *
+ * The floating proportion is a time derivative with an exponentially decaying
+ * history:
+ *
+ * p_{j} = \Sum_{i=0} (dx_{j}/dt_{-i}) / 2^(1+i)
+ *
+ * Where j is an element from {BDIs}, x_{j} is j's number of completions, and i
+ * the time period over which the differential is taken. So d/dt_{-i} is the
+ * differential over the i-th last period.
+ *
+ * The decaying history gives smooth transitions. The time differential carries
+ * the notion of speed.
+ *
+ * The denominator is 2^(1+i) because we want the series to be normalised, i.e.
+ *
+ * \Sum_{i=0} 1/2^(1+i) = 1
+ *
+ * Furthermore, if we measure time (t) in the same events as x, so that:
+ *
+ * t = \Sum_{j} x_{j}
+ *
+ * we get that:
+ *
+ * \Sum_{j} p_{j} = 1
+ *
+ * Writing this in an iterative fashion we get (dropping the 'd's):
+ *
+ * if (++x_{j}, ++t > period)
+ * t /= 2;
+ * for_each (j)
+ * x_{j} /= 2;
+ *
+ * so that:
+ *
+ * p_{j} = x_{j} / t;
+ *
+ * We optimize away the '/= 2' for the global time delta by noting that:
+ *
+ * if (++t > period) t /= 2;
+ *
+ * Can be approximated by:
+ *
+ * period/2 + (++t % period/2)
+ *
+ * [ Furthermore, when we choose period to be 2^n it can be written in terms of
+ * binary operations and wraparound artefacts disappear. ]
+ *
+ * Also note that this yields a natural counter of the elapsed periods:
+ *
+ * c = t / (period/2)
+ *
+ * [ Its monotonic increasing property can be applied to mitigate the wrap-
+ * around issue. ]
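+ *
+ * [ Worked example: with period = 2^4 = 16 we have period/2 = 8; at
+ *   t = 21 the denominator is 8 + (21 % 8) = 13, and c = 21 / 8 = 2
+ *   periods have elapsed. ]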
+ *
+ * This allows us to do away with the loop over all BDIs on each period
+ * expiration. By remembering the period count under which it was last
+ * accessed as c_{j}, we can obtain the number of 'missed' cycles from:
+ *
+ * c - c_{j}
+ *
+ * We can then lazily catch up to the global period count every time we are
+ * going to use x_{j}, by doing:
+ *
+ * x_{j} /= 2^(c - c_{j}), c_{j} = c
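+ *
+ * [ Worked example: c - c_{j} = 3 missed periods with x_{j} = 24
+ *   yields x_{j} = 24 / 2^3 = 3. ]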
+ */
+
+#include <linux/proportions.h>
+#include <linux/rcupdate.h>
+
+void prop_descriptor_init(struct prop_descriptor *pd, int shift)
+{
+ pd->index = 0;
+ pd->pg[0].shift = shift;
+ percpu_counter_init(&pd->pg[0].events, 0);
+ percpu_counter_init(&pd->pg[1].events, 0);
+ mutex_init(&pd->mutex);
+}
+
+/*
+ * We have two copies, and flip between them to make it seem like an atomic
+ * update. The update is not really atomic wrt the events counter, but
+ * it is internally consistent with the bit layout depending on shift.
+ *
+ * We copy the events count, move the bits around and flip the index.
+ */
+void prop_change_shift(struct prop_descriptor *pd, int shift)
+{
+ int index;
+ int offset;
+ u64 events;
+ unsigned long flags;
+
+ mutex_lock(&pd->mutex);
+
+ index = pd->index ^ 1;
+ offset = pd->pg[pd->index].shift - shift;
+ if (!offset)
+ goto out;
+
+ pd->pg[index].shift = shift;
+
+ local_irq_save(flags);
+ events = percpu_counter_sum_signed(
+ &pd->pg[pd->index].events);
+ if (offset < 0)
+ events <<= -offset;
+ else
+ events >>= offset;
+ percpu_counter_set(&pd->pg[index].events, events);
+
+ /*
+ * ensure the new pg is fully written before the switch
+ */
+ smp_wmb();
+ pd->index = index;
+ local_irq_restore(flags);
+
+ synchronize_rcu();
+
+out:
+ mutex_unlock(&pd->mutex);
+}
+
+/*
+ * wrap the access to the data in an rcu_read_lock() section;
+ * this is used to track the active references.
+ */
+struct prop_global *prop_get_global(struct prop_descriptor *pd)
+{
+ int index;
+
+ rcu_read_lock();
+ index = pd->index;
+ /*
+ * match the wmb from prop_change_shift()
+ */
+ smp_rmb();
+ return &pd->pg[index];
+}
+
+void prop_put_global(struct prop_descriptor *pd, struct prop_global *pg)
+{
+ rcu_read_unlock();
+}
+
+static void prop_adjust_shift(struct prop_local *pl, int new_shift)
+{
+ int offset = pl->shift - new_shift;
+
+ if (!offset)
+ return;
+
+ if (offset < 0)
+ pl->period <<= -offset;
+ else
+ pl->period >>= offset;
+
+ pl->shift = new_shift;
+}
+
+void prop_local_init(struct prop_local *pl)
+{
+ spin_lock_init(&pl->lock);
+ pl->shift = 0;
+ pl->period = 0;
+ percpu_counter_init(&pl->events, 0);
+}
+
+void prop_local_destroy(struct prop_local *pl)
+{
+ percpu_counter_destroy(&pl->events);
+}
+
+/*
+ * Catch up with missed period expirations.
+ *
+ * until (c_{j} == c)
+ * x_{j} -= x_{j}/2;
+ * c_{j}++;
+ */
+void prop_norm(struct prop_global *pg,
+ struct prop_local *pl)
+{
+ unsigned long period = 1UL << (pg->shift - 1);
+ unsigned long period_mask = ~(period - 1);
+ unsigned long global_period;
+ unsigned long flags;
+
+ global_period = percpu_counter_read(&pg->events);
+ global_period &= period_mask;
+
+ /*
+ * Fast path - check if the local and global period count still match
+ * outside of the lock.
+ */
+ if (pl->period == global_period)
+ return;
+
+ spin_lock_irqsave(&pl->lock, flags);
+ prop_adjust_shift(pl, pg->shift);
+ /*
+ * For each missed period, we halve the local counter;
+ * basically:
+ * pl->events >> (global_period - pl->period);
+ *
+ * but since the distributed nature of percpu counters makes division
+ * rather hard, use a regular subtraction loop. This is safe, because
+ * the events counter is only ever incremented, hence the subtraction
+ * can never result in a negative number.
+ */
+ while (pl->period != global_period) {
+ unsigned long val = percpu_counter_read(&pl->events);
+ unsigned long half = (val + 1) >> 1;
+
+ /*
+ * Half of zero won't be much less, break out.
+ * This limits the loop to shift iterations, even
+ * if we missed a million.
+ */
+ if (!val)
+ break;
+
+ /*
+ * If shift > 32, half might exceed the limits of
+ * the regular percpu_counter_mod.
+ */
+ percpu_counter_mod64(&pl->events, -half);
+ pl->period += period;
+ }
+ pl->period = global_period;
+ spin_unlock_irqrestore(&pl->lock, flags);
+}
+
+/*
+ * Obtain a fraction of this proportion
+ *
+ * p_{j} = x_{j} / (period/2 + t % period/2)
+ */
+void prop_fraction(struct prop_global *pg, struct prop_local *pl,
+ long *numerator, long *denominator)
+{
+ unsigned long period_2 = 1UL << (pg->shift - 1);
+ unsigned long counter_mask = period_2 - 1;
+ unsigned long global_count;
+
+ prop_norm(pg, pl);
+ *numerator = percpu_counter_read(&pl->events);
+
+ global_count = percpu_counter_read(&pg->events);
+ *denominator = period_2 + (global_count & counter_mask);
+}
+
+
Index: linux-2.6/include/linux/proportions.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/include/linux/proportions.h 2007-05-10 11:03:01.000000000 +0200
@@ -0,0 +1,82 @@
+/*
+ * Floating proportions
+ *
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <[email protected]>
+ *
+ * This file contains the public data structure and API definitions.
+ */
+
+#ifndef _LINUX_PROPORTIONS_H
+#define _LINUX_PROPORTIONS_H
+
+#include <linux/percpu_counter.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+
+struct prop_global {
+ /*
+ * The period over which we differentiate (in events)
+ *
+ * period = 2^shift
+ */
+ int shift;
+ /*
+ * The total event counter aka 'time'.
+ *
+ * Treated as an unsigned long; the lower 'shift - 1' bits are the
+ * counter bits, the remaining upper bits the period counter.
+ */
+ struct percpu_counter events;
+};
+
+/*
+ * global proportion descriptor
+ *
+ * this is needed to consistently flip prop_global structures.
+ */
+struct prop_descriptor {
+ int index;
+ struct prop_global pg[2];
+ struct mutex mutex;
+};
+
+void prop_descriptor_init(struct prop_descriptor *pd, int shift);
+void prop_change_shift(struct prop_descriptor *pd, int new_shift);
+struct prop_global *prop_get_global(struct prop_descriptor *pd);
+void prop_put_global(struct prop_descriptor *pd, struct prop_global *pg);
+
+struct prop_local {
+ /*
+ * the local events counter
+ */
+ struct percpu_counter events;
+
+ /*
+ * snapshot of the last seen global state
+ * and a lock protecting this state
+ */
+ int shift;
+ unsigned long period;
+ spinlock_t lock;
+};
+
+void prop_local_init(struct prop_local *pl);
+void prop_local_destroy(struct prop_local *pl);
+
+void prop_norm(struct prop_global *pg, struct prop_local *pl);
+
+/*
+ * ++x_{j}, ++t
+ */
+static inline
+void __prop_inc(struct prop_global *pg, struct prop_local *pl)
+{
+ prop_norm(pg, pl);
+ percpu_counter_mod(&pl->events, 1);
+ percpu_counter_mod(&pg->events, 1);
+}
+
+void prop_fraction(struct prop_global *pg, struct prop_local *pl,
+ long *numerator, long *denominator);
+
+#endif /* _LINUX_PROPORTIONS_H */
Index: linux-2.6/mm/backing-dev.c
===================================================================
--- linux-2.6.orig/mm/backing-dev.c 2007-05-10 10:52:24.000000000 +0200
+++ linux-2.6/mm/backing-dev.c 2007-05-10 11:03:01.000000000 +0200
@@ -12,9 +12,8 @@ void bdi_init(struct backing_dev_info *b
for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
percpu_counter_init(&bdi->bdi_stat[i], 0);
- spin_lock_init(&bdi->lock);
- bdi->period = 0;
bdi->dirty_exceeded = 0;
+ prop_local_init(&bdi->completions);
}
EXPORT_SYMBOL(bdi_init);
@@ -25,6 +24,8 @@ void bdi_destroy(struct backing_dev_info
for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
percpu_counter_destroy(&bdi->bdi_stat[i]);
+
+ prop_local_destroy(&bdi->completions);
}
EXPORT_SYMBOL(bdi_destroy);
Index: linux-2.6/lib/Makefile
===================================================================
--- linux-2.6.orig/lib/Makefile 2007-05-10 10:51:35.000000000 +0200
+++ linux-2.6/lib/Makefile 2007-05-10 11:03:01.000000000 +0200
@@ -5,7 +5,7 @@
lib-y := ctype.o string.o vsprintf.o cmdline.o \
rbtree.o radix-tree.o dump_stack.o \
idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \
- sha1.o irq_regs.o reciprocal_div.o
+ sha1.o irq_regs.o reciprocal_div.o proportions.o
lib-$(CONFIG_MMU) += ioremap.o pagewalk.o
lib-$(CONFIG_SMP) += cpumask.o