* [PATCH 1/2] lib: Proportions with flexible period
2012-05-03 22:39 [PATCH 0/2 v2] Flexible proportions for BDIs Jan Kara
@ 2012-05-03 22:39 ` Jan Kara
0 siblings, 0 replies; 14+ messages in thread
From: Jan Kara @ 2012-05-03 22:39 UTC (permalink / raw)
To: linux-mm; +Cc: Wu Fengguang, peterz, Jan Kara
Implement code computing proportions of events of different type (like code in
lib/proportions.c) but allowing periods to have different lengths. This allows
us to have aging periods of fixed wallclock time which gives better proportion
estimates given the hugely varying throughput of different devices - previous
measuring of aging period by number of events has the problem that a reasonable
period length for a system with low-end USB stick is not a reasonable period
length for a system with high-end storage array resulting either in too slow
proportion updates or too fluctuating proportion updates.
Signed-off-by: Jan Kara <jack@suse.cz>
---
include/linux/flex_proportions.h | 91 +++++++++++++++
lib/Makefile | 2 +-
lib/flex_proportions.c | 238 ++++++++++++++++++++++++++++++++++++++
3 files changed, 330 insertions(+), 1 deletions(-)
create mode 100644 include/linux/flex_proportions.h
create mode 100644 lib/flex_proportions.c
diff --git a/include/linux/flex_proportions.h b/include/linux/flex_proportions.h
new file mode 100644
index 0000000..0c3c63f
--- /dev/null
+++ b/include/linux/flex_proportions.h
@@ -0,0 +1,91 @@
+/*
+ * Floating proportions with flexible aging period
+ *
+ * Copyright (C) 2011, SUSE, Jan Kara <jack@suse.cz>
+ */
+
+#ifndef _LINUX_FLEX_PROPORTIONS_H
+#define _LINUX_FLEX_PROPORTIONS_H
+
+#include <linux/percpu_counter.h>
+#include <linux/spinlock.h>
+#include <linux/seqlock.h>
+
+/*
+ * ---- Global proportion definitions ----
+ */
+struct fprop_global {
+ /* Number of events in the current period */
+ struct percpu_counter events;
+ /* Current period */
+ unsigned int period;
+ /* Synchronization with period transitions */
+ seqcount_t sequence;
+};
+
+int fprop_global_init(struct fprop_global *p);
+void fprop_global_destroy(struct fprop_global *p);
+void fprop_new_period(struct fprop_global *p);
+
+/*
+ * ---- SINGLE ----
+ */
+struct fprop_local_single {
+ /* the local events counter */
+ unsigned long events;
+ /* Period in which we last updated events */
+ unsigned int period;
+ raw_spinlock_t lock; /* Protect period and numerator */
+};
+
+#define INIT_FPROP_LOCAL_SINGLE(name) \
+{ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \
+}
+
+int fprop_local_init_single(struct fprop_local_single *pl);
+void __fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl);
+void fprop_fraction_single(struct fprop_global *p,
+ struct fprop_local_single *pl, unsigned long *numerator,
+ unsigned long *denominator);
+
+static inline
+void fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __fprop_inc_single(p, pl);
+ local_irq_restore(flags);
+}
+
+/*
+ * ---- PERCPU ----
+ */
+struct fprop_local_percpu {
+ /* the local events counter */
+ struct percpu_counter events;
+ /* Period in which we last updated events */
+ unsigned int period;
+ raw_spinlock_t lock; /* Protect period and numerator */
+};
+
+int fprop_local_init_percpu(struct fprop_local_percpu *pl);
+void fprop_local_destroy_percpu(struct fprop_local_percpu *pl);
+void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl);
+void __fprop_inc_percpu_max(struct fprop_global *p, struct fprop_local_percpu *pl,
+ int max_frac);
+void fprop_fraction_percpu(struct fprop_global *p,
+ struct fprop_local_percpu *pl, unsigned long *numerator,
+ unsigned long *denominator);
+
+static inline
+void fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __fprop_inc_percpu(p, pl);
+ local_irq_restore(flags);
+}
+
+#endif
diff --git a/lib/Makefile b/lib/Makefile
index 18515f0..e144536 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -11,7 +11,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
rbtree.o radix-tree.o dump_stack.o timerqueue.o\
idr.o int_sqrt.o extable.o prio_tree.o \
sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
- proportions.o prio_heap.o ratelimit.o show_mem.o \
+ proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \
is_single_threaded.o plist.o decompress.o
lib-$(CONFIG_MMU) += ioremap.o
diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c
new file mode 100644
index 0000000..d3a2468
--- /dev/null
+++ b/lib/flex_proportions.c
@@ -0,0 +1,238 @@
+/*
+ * Floating proportions with flexible aging period
+ *
+ * Copyright (C) 2011, SUSE, Jan Kara <jack@suse.cz>
+ *
+ * The goal of this code is: Given different types of event, measure proportion
+ * of each type of event over time. The proportions are measured with
+ * exponentially decaying history to give smooth transitions. A formula
+ * expressing proportion of event of type 'j' is:
+ *
+ * p_{j} = (\Sum_{i>=0} x_{i,j}/2^{i+1})/(\Sum_{i>=0} x_i/2^{i+1})
+ *
+ * Where x_{i,j} is j's number of events in i-th last time period and x_i is
+ * total number of events in i-th last time period.
+ *
+ * Note that p_{j}'s are normalised, i.e.
+ *
+ * \Sum_{j} p_{j} = 1,
+ *
+ * This formula can be straightforwardly computed by maintaing denominator
+ * (let's call it 'd') and for each event type its numerator (let's call it
+ * 'n_j'). When an event of type 'j' happens, we simply need to do:
+ * n_j++; d++;
+ *
+ * When a new period is declared, we could do:
+ * d /= 2
+ * for each j
+ * n_j /= 2
+ *
+ * To avoid iteration over all event types, we instead shift numerator of event
+ * j lazily when someone asks for a proportion of event j or when event j
+ * occurs. This can bit trivially implemented by remembering last period in
+ * which something happened with proportion of type j.
+ */
+#include <linux/flex_proportions.h>
+
+int fprop_global_init(struct fprop_global *p)
+{
+ int err;
+
+ p->period = 0;
+ /* Use 1 to avoid dealing with periods with 0 events... */
+ err = percpu_counter_init(&p->events, 1);
+ if (err)
+ return err;
+ seqcount_init(&p->sequence);
+ return 0;
+}
+
+void fprop_global_destroy(struct fprop_global *p)
+{
+ percpu_counter_destroy(&p->events);
+}
+
+/*
+ * Declare new period. It is upto the caller to make sure two period
+ * transitions cannot happen in parallel.
+ */
+void fprop_new_period(struct fprop_global *p)
+{
+ u64 events = percpu_counter_sum(&p->events);
+
+ /*
+ * Don't do anything if there are no events.
+ */
+ if (events <= 1)
+ return;
+ write_seqcount_begin(&p->sequence);
+ /* We use addition to avoid losing events happening between sum and set. */
+ percpu_counter_add(&p->events, -(events >> 1));
+ p->period++;
+ write_seqcount_end(&p->sequence);
+}
+
+/*
+ * ---- SINGLE ----
+ */
+
+int fprop_local_init_single(struct fprop_local_single *pl)
+{
+ pl->events = 0;
+ pl->period = 0;
+ raw_spin_lock_init(&pl->lock);
+ return 0;
+}
+
+void fprop_local_destroy_single(struct fprop_local_single *pl)
+{
+}
+
+static void fprop_reflect_period_single(struct fprop_global *p,
+ struct fprop_local_single *pl)
+{
+ unsigned int period = p->period;
+ unsigned long flags;
+
+ /* Fast path - period didn't change */
+ if (pl->period == period)
+ return;
+ raw_spin_lock_irqsave(&pl->lock, flags);
+ /* Someone updated pl->period while we were spinning? */
+ if (pl->period >= period) {
+ raw_spin_unlock_irqrestore(&pl->lock, flags);
+ return;
+ }
+ /* Aging zeroed our fraction? */
+ if (period - pl->period < BITS_PER_LONG)
+ pl->events >>= period - pl->period;
+ else
+ pl->events = 0;
+ pl->period = period;
+ raw_spin_unlock_irqrestore(&pl->lock, flags);
+}
+
+/* Event of type pl happened */
+void __fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl)
+{
+ fprop_reflect_period_single(p, pl);
+ pl->events++;
+ percpu_counter_add(&p->events, 1);
+}
+
+/* Return fraction of events of type pl */
+void fprop_fraction_single(struct fprop_global *p,
+ struct fprop_local_single *pl,
+ unsigned long *numerator, unsigned long *denominator)
+{
+ unsigned int seq;
+ s64 den;
+
+ do {
+ seq = read_seqcount_begin(&p->sequence);
+ fprop_reflect_period_single(p, pl);
+ *numerator = pl->events;
+ den = percpu_counter_read(&p->events);
+ if (den <= 0)
+ den = percpu_counter_sum(&p->events);
+ *denominator = den;
+ } while (read_seqcount_retry(&p->sequence, seq));
+}
+
+/*
+ * ---- PERCPU ----
+ */
+#define PROP_BATCH (8*(1+ilog2(nr_cpu_ids)))
+
+int fprop_local_init_percpu(struct fprop_local_percpu *pl)
+{
+ int err;
+
+ err = percpu_counter_init(&pl->events, 0);
+ if (err)
+ return err;
+ pl->period = 0;
+ raw_spin_lock_init(&pl->lock);
+ return 0;
+}
+
+void fprop_local_destroy_percpu(struct fprop_local_percpu *pl)
+{
+ percpu_counter_destroy(&pl->events);
+}
+
+static void fprop_reflect_period_percpu(struct fprop_global *p,
+ struct fprop_local_percpu *pl)
+{
+ unsigned int period = p->period;
+ unsigned long flags;
+
+ /* Fast path - period didn't change */
+ if (pl->period == period)
+ return;
+ raw_spin_lock_irqsave(&pl->lock, flags);
+ /* Someone updated pl->period while we were spinning? */
+ if (pl->period >= period) {
+ raw_spin_unlock_irqrestore(&pl->lock, flags);
+ return;
+ }
+ /* Aging zeroed our fraction? */
+ if (period - pl->period < BITS_PER_LONG) {
+ s64 val = percpu_counter_read(&pl->events);
+
+ if (val < (nr_cpu_ids * PROP_BATCH))
+ val = percpu_counter_sum(&pl->events);
+
+ __percpu_counter_add(&pl->events,
+ -val + (val >> (period-pl->period)), PROP_BATCH);
+ } else
+ percpu_counter_set(&pl->events, 0);
+ pl->period = period;
+ raw_spin_unlock_irqrestore(&pl->lock, flags);
+}
+
+/* Event of type pl happened */
+void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl)
+{
+ fprop_reflect_period_percpu(p, pl);
+ __percpu_counter_add(&pl->events, 1, PROP_BATCH);
+ percpu_counter_add(&p->events, 1);
+}
+
+void fprop_fraction_percpu(struct fprop_global *p,
+ struct fprop_local_percpu *pl,
+ unsigned long *numerator, unsigned long *denominator)
+{
+ unsigned int seq;
+ s64 den;
+
+ do {
+ seq = read_seqcount_begin(&p->sequence);
+ fprop_reflect_period_percpu(p, pl);
+ *numerator = percpu_counter_read_positive(&pl->events);
+ den = percpu_counter_read(&p->events);
+ if (den <= 0)
+ den = percpu_counter_sum(&p->events);
+ *denominator = den;
+ } while (read_seqcount_retry(&p->sequence, seq));
+}
+
+/*
+ * Like __fprop_inc_percpu() except that event is counted only if the given
+ * type has fraction smaller than @max_frac/100
+ */
+void __fprop_inc_percpu_max(struct fprop_global *p,
+ struct fprop_local_percpu *pl, int max_frac)
+{
+ if (unlikely(max_frac < 100)) {
+ unsigned long numerator, denominator;
+
+ fprop_fraction_percpu(p, pl, &numerator, &denominator);
+ if (numerator > ((long long)denominator) * max_frac / 100)
+ return;
+ } else
+ fprop_reflect_period_percpu(p, pl);
+ __percpu_counter_add(&pl->events, 1, PROP_BATCH);
+ percpu_counter_add(&p->events, 1);
+}
+
--
1.7.1
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH 0/2 v3] Flexible proportions
@ 2012-05-15 15:43 Jan Kara
2012-05-15 15:43 ` [PATCH 1/2] lib: Proportions with flexible period Jan Kara
2012-05-15 15:43 ` [PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions Jan Kara
0 siblings, 2 replies; 14+ messages in thread
From: Jan Kara @ 2012-05-15 15:43 UTC (permalink / raw)
To: Wu Fengguang; +Cc: LKML, linux-mm, Peter Zijlstra
Hello,
here is the next iteration of my flexible proportions code. Changes since
v2 are:
* use timer instead of workqueue for triggering period switch
* arm timer only if aging didn't zero out all fractions, re-arm timer when
new event arrives again
* set period length to 3s
Some introduction for first time readers:
The idea of this patch set is to provide code for computing event proportions
where aging period is not dependent on the number of events happening (so
that aging works well both with fast storage and slow USB sticks in the same
system).
The basic idea is that we compute proportions as:
p_j = (\Sum_{i>=0} x_{i,j}/2^{i+1}) / (\Sum_{i>=0} x_i/2^{i+1})
Where x_{i,j} is j's number of events in i-th last time period and x_i is
total number of events in i-th last time period.
Note that when x_i's are all the same (as is the case with current
proportion code), this expression simplifies to the expression defining
current proportions which is:
p_j = \Sum_{i>=0} x_{i,j}/2^{i+1} / t
where t is the lenght of the aging period.
In fact, if we are in the middle of the period, the proportion computed by
the current code is:
p_j = (x_0 + \Sum_{i>=1} x_{i,j}/2^{i+1}) / (t' + t)
where t' is total number of events in the running period and t is the lenght
of the aging period. So there is event more similarity.
Similarly as with current proportion code, it is simple to compute update
proportion after several periods have elapsed. For each proportion we store
the numerator of our fraction and the number of period when the proportion
was last updated. In global proportion structure we compute the denominator
of the fraction which is the same for all event types. So catch up with missed
periods boils down to shifting the numerator by the number of missed periods
and that's it. For more details, please see the code.
I've also run a few tests (I've created a userspace wrapper to allow me to
run proportion code in userpace and arbitrarily generate events for it) to
compare the behavior of old and new code. You can see them at
http://beta.suse.com/private/jack/flex_proportions/ In all the tests new code
showed faster convergence to current event proportions (I tried to
realistically set period_shift for fixed proportions). Also in the last test
we see that if period_shift is decreased, then current proportions become more
sensitive to short term fluctuations in event rate so just decreasing
period_shift isn't a good solution to slower convergence. If anyone has other
idea what to try, I can do that - it should be simple enough to implement in
my testing tool.
Honza
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH 1/2] lib: Proportions with flexible period
2012-05-15 15:43 [PATCH 0/2 v3] Flexible proportions Jan Kara
@ 2012-05-15 15:43 ` Jan Kara
2012-05-17 21:35 ` Peter Zijlstra
` (2 more replies)
2012-05-15 15:43 ` [PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions Jan Kara
1 sibling, 3 replies; 14+ messages in thread
From: Jan Kara @ 2012-05-15 15:43 UTC (permalink / raw)
To: Wu Fengguang; +Cc: LKML, linux-mm, Peter Zijlstra, Jan Kara
Implement code computing proportions of events of different type (like code in
lib/proportions.c) but allowing periods to have different lengths. This allows
us to have aging periods of fixed wallclock time which gives better proportion
estimates given the hugely varying throughput of different devices - previous
measuring of aging period by number of events has the problem that a reasonable
period length for a system with low-end USB stick is not a reasonable period
length for a system with high-end storage array resulting either in too slow
proportion updates or too fluctuating proportion updates.
Signed-off-by: Jan Kara <jack@suse.cz>
---
include/linux/flex_proportions.h | 91 ++++++++++++++
lib/Makefile | 2 +-
lib/flex_proportions.c | 244 ++++++++++++++++++++++++++++++++++++++
3 files changed, 336 insertions(+), 1 deletions(-)
create mode 100644 include/linux/flex_proportions.h
create mode 100644 lib/flex_proportions.c
diff --git a/include/linux/flex_proportions.h b/include/linux/flex_proportions.h
new file mode 100644
index 0000000..59ab923
--- /dev/null
+++ b/include/linux/flex_proportions.h
@@ -0,0 +1,91 @@
+/*
+ * Floating proportions with flexible aging period
+ *
+ * Copyright (C) 2011, SUSE, Jan Kara <jack@suse.cz>
+ */
+
+#ifndef _LINUX_FLEX_PROPORTIONS_H
+#define _LINUX_FLEX_PROPORTIONS_H
+
+#include <linux/percpu_counter.h>
+#include <linux/spinlock.h>
+#include <linux/seqlock.h>
+
+/*
+ * ---- Global proportion definitions ----
+ */
+struct fprop_global {
+ /* Number of events in the current period */
+ struct percpu_counter events;
+ /* Current period */
+ unsigned int period;
+ /* Synchronization with period transitions */
+ seqcount_t sequence;
+};
+
+int fprop_global_init(struct fprop_global *p);
+void fprop_global_destroy(struct fprop_global *p);
+bool fprop_new_period(struct fprop_global *p);
+
+/*
+ * ---- SINGLE ----
+ */
+struct fprop_local_single {
+ /* the local events counter */
+ unsigned long events;
+ /* Period in which we last updated events */
+ unsigned int period;
+ raw_spinlock_t lock; /* Protect period and numerator */
+};
+
+#define INIT_FPROP_LOCAL_SINGLE(name) \
+{ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \
+}
+
+int fprop_local_init_single(struct fprop_local_single *pl);
+void __fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl);
+void fprop_fraction_single(struct fprop_global *p,
+ struct fprop_local_single *pl, unsigned long *numerator,
+ unsigned long *denominator);
+
+static inline
+void fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __fprop_inc_single(p, pl);
+ local_irq_restore(flags);
+}
+
+/*
+ * ---- PERCPU ----
+ */
+struct fprop_local_percpu {
+ /* the local events counter */
+ struct percpu_counter events;
+ /* Period in which we last updated events */
+ unsigned int period;
+ raw_spinlock_t lock; /* Protect period and numerator */
+};
+
+int fprop_local_init_percpu(struct fprop_local_percpu *pl);
+void fprop_local_destroy_percpu(struct fprop_local_percpu *pl);
+void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl);
+void __fprop_inc_percpu_max(struct fprop_global *p, struct fprop_local_percpu *pl,
+ int max_frac);
+void fprop_fraction_percpu(struct fprop_global *p,
+ struct fprop_local_percpu *pl, unsigned long *numerator,
+ unsigned long *denominator);
+
+static inline
+void fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __fprop_inc_percpu(p, pl);
+ local_irq_restore(flags);
+}
+
+#endif
diff --git a/lib/Makefile b/lib/Makefile
index 18515f0..e144536 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -11,7 +11,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
rbtree.o radix-tree.o dump_stack.o timerqueue.o\
idr.o int_sqrt.o extable.o prio_tree.o \
sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
- proportions.o prio_heap.o ratelimit.o show_mem.o \
+ proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \
is_single_threaded.o plist.o decompress.o
lib-$(CONFIG_MMU) += ioremap.o
diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c
new file mode 100644
index 0000000..b4fd603
--- /dev/null
+++ b/lib/flex_proportions.c
@@ -0,0 +1,244 @@
+/*
+ * Floating proportions with flexible aging period
+ *
+ * Copyright (C) 2011, SUSE, Jan Kara <jack@suse.cz>
+ *
+ * The goal of this code is: Given different types of event, measure proportion
+ * of each type of event over time. The proportions are measured with
+ * exponentially decaying history to give smooth transitions. A formula
+ * expressing proportion of event of type 'j' is:
+ *
+ * p_{j} = (\Sum_{i>=0} x_{i,j}/2^{i+1})/(\Sum_{i>=0} x_i/2^{i+1})
+ *
+ * Where x_{i,j} is j's number of events in i-th last time period and x_i is
+ * total number of events in i-th last time period.
+ *
+ * Note that p_{j}'s are normalised, i.e.
+ *
+ * \Sum_{j} p_{j} = 1,
+ *
+ * This formula can be straightforwardly computed by maintaing denominator
+ * (let's call it 'd') and for each event type its numerator (let's call it
+ * 'n_j'). When an event of type 'j' happens, we simply need to do:
+ * n_j++; d++;
+ *
+ * When a new period is declared, we could do:
+ * d /= 2
+ * for each j
+ * n_j /= 2
+ *
+ * To avoid iteration over all event types, we instead shift numerator of event
+ * j lazily when someone asks for a proportion of event j or when event j
+ * occurs. This can bit trivially implemented by remembering last period in
+ * which something happened with proportion of type j.
+ */
+#include <linux/flex_proportions.h>
+
+int fprop_global_init(struct fprop_global *p)
+{
+ int err;
+
+ p->period = 0;
+ /* Use 1 to avoid dealing with periods with 0 events... */
+ err = percpu_counter_init(&p->events, 1);
+ if (err)
+ return err;
+ seqcount_init(&p->sequence);
+ return 0;
+}
+
+void fprop_global_destroy(struct fprop_global *p)
+{
+ percpu_counter_destroy(&p->events);
+}
+
+/*
+ * Declare new period. It is upto the caller to make sure two period
+ * transitions cannot happen in parallel.
+ *
+ * The function returns true if the proportions are still defined and false
+ * if aging zeroed out all events. This can be used to detect whether declaring
+ * further periods has any effect.
+ */
+bool fprop_new_period(struct fprop_global *p)
+{
+ u64 events = percpu_counter_sum(&p->events);
+
+ /*
+ * Don't do anything if there are no events.
+ */
+ if (events <= 1)
+ return false;
+ write_seqcount_begin(&p->sequence);
+ /* We use addition to avoid losing events happening between sum and set. */
+ percpu_counter_add(&p->events, -(events >> 1));
+ p->period++;
+ write_seqcount_end(&p->sequence);
+
+ return true;
+}
+
+/*
+ * ---- SINGLE ----
+ */
+
+int fprop_local_init_single(struct fprop_local_single *pl)
+{
+ pl->events = 0;
+ pl->period = 0;
+ raw_spin_lock_init(&pl->lock);
+ return 0;
+}
+
+void fprop_local_destroy_single(struct fprop_local_single *pl)
+{
+}
+
+static void fprop_reflect_period_single(struct fprop_global *p,
+ struct fprop_local_single *pl)
+{
+ unsigned int period = p->period;
+ unsigned long flags;
+
+ /* Fast path - period didn't change */
+ if (pl->period == period)
+ return;
+ raw_spin_lock_irqsave(&pl->lock, flags);
+ /* Someone updated pl->period while we were spinning? */
+ if (pl->period >= period) {
+ raw_spin_unlock_irqrestore(&pl->lock, flags);
+ return;
+ }
+ /* Aging zeroed our fraction? */
+ if (period - pl->period < BITS_PER_LONG)
+ pl->events >>= period - pl->period;
+ else
+ pl->events = 0;
+ pl->period = period;
+ raw_spin_unlock_irqrestore(&pl->lock, flags);
+}
+
+/* Event of type pl happened */
+void __fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl)
+{
+ fprop_reflect_period_single(p, pl);
+ pl->events++;
+ percpu_counter_add(&p->events, 1);
+}
+
+/* Return fraction of events of type pl */
+void fprop_fraction_single(struct fprop_global *p,
+ struct fprop_local_single *pl,
+ unsigned long *numerator, unsigned long *denominator)
+{
+ unsigned int seq;
+ s64 den;
+
+ do {
+ seq = read_seqcount_begin(&p->sequence);
+ fprop_reflect_period_single(p, pl);
+ *numerator = pl->events;
+ den = percpu_counter_read(&p->events);
+ if (den <= 0)
+ den = percpu_counter_sum(&p->events);
+ *denominator = den;
+ } while (read_seqcount_retry(&p->sequence, seq));
+}
+
+/*
+ * ---- PERCPU ----
+ */
+#define PROP_BATCH (8*(1+ilog2(nr_cpu_ids)))
+
+int fprop_local_init_percpu(struct fprop_local_percpu *pl)
+{
+ int err;
+
+ err = percpu_counter_init(&pl->events, 0);
+ if (err)
+ return err;
+ pl->period = 0;
+ raw_spin_lock_init(&pl->lock);
+ return 0;
+}
+
+void fprop_local_destroy_percpu(struct fprop_local_percpu *pl)
+{
+ percpu_counter_destroy(&pl->events);
+}
+
+static void fprop_reflect_period_percpu(struct fprop_global *p,
+ struct fprop_local_percpu *pl)
+{
+ unsigned int period = p->period;
+ unsigned long flags;
+
+ /* Fast path - period didn't change */
+ if (pl->period == period)
+ return;
+ raw_spin_lock_irqsave(&pl->lock, flags);
+ /* Someone updated pl->period while we were spinning? */
+ if (pl->period >= period) {
+ raw_spin_unlock_irqrestore(&pl->lock, flags);
+ return;
+ }
+ /* Aging zeroed our fraction? */
+ if (period - pl->period < BITS_PER_LONG) {
+ s64 val = percpu_counter_read(&pl->events);
+
+ if (val < (nr_cpu_ids * PROP_BATCH))
+ val = percpu_counter_sum(&pl->events);
+
+ __percpu_counter_add(&pl->events,
+ -val + (val >> (period-pl->period)), PROP_BATCH);
+ } else
+ percpu_counter_set(&pl->events, 0);
+ pl->period = period;
+ raw_spin_unlock_irqrestore(&pl->lock, flags);
+}
+
+/* Event of type pl happened */
+void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl)
+{
+ fprop_reflect_period_percpu(p, pl);
+ __percpu_counter_add(&pl->events, 1, PROP_BATCH);
+ percpu_counter_add(&p->events, 1);
+}
+
+void fprop_fraction_percpu(struct fprop_global *p,
+ struct fprop_local_percpu *pl,
+ unsigned long *numerator, unsigned long *denominator)
+{
+ unsigned int seq;
+ s64 den;
+
+ do {
+ seq = read_seqcount_begin(&p->sequence);
+ fprop_reflect_period_percpu(p, pl);
+ *numerator = percpu_counter_read_positive(&pl->events);
+ den = percpu_counter_read(&p->events);
+ if (den <= 0)
+ den = percpu_counter_sum(&p->events);
+ *denominator = den;
+ } while (read_seqcount_retry(&p->sequence, seq));
+}
+
+/*
+ * Like __fprop_inc_percpu() except that event is counted only if the given
+ * type has fraction smaller than @max_frac/100
+ */
+void __fprop_inc_percpu_max(struct fprop_global *p,
+ struct fprop_local_percpu *pl, int max_frac)
+{
+ if (unlikely(max_frac < 100)) {
+ unsigned long numerator, denominator;
+
+ fprop_fraction_percpu(p, pl, &numerator, &denominator);
+ if (numerator > ((long long)denominator) * max_frac / 100)
+ return;
+ } else
+ fprop_reflect_period_percpu(p, pl);
+ __percpu_counter_add(&pl->events, 1, PROP_BATCH);
+ percpu_counter_add(&p->events, 1);
+}
+
--
1.7.1
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions
2012-05-15 15:43 [PATCH 0/2 v3] Flexible proportions Jan Kara
2012-05-15 15:43 ` [PATCH 1/2] lib: Proportions with flexible period Jan Kara
@ 2012-05-15 15:43 ` Jan Kara
2012-05-17 22:04 ` Peter Zijlstra
1 sibling, 1 reply; 14+ messages in thread
From: Jan Kara @ 2012-05-15 15:43 UTC (permalink / raw)
To: Wu Fengguang; +Cc: LKML, linux-mm, Peter Zijlstra, Jan Kara
Convert calculations of proportion of writeback each bdi does to new flexible
proportion code. That allows us to use aging period of fixed wallclock time
which gives better proportion estimates given the hugely varying throughput of
different devices.
Signed-off-by: Jan Kara <jack@suse.cz>
---
include/linux/backing-dev.h | 6 +-
mm/backing-dev.c | 5 +-
mm/page-writeback.c | 91 +++++++++++++++++++++++--------------------
3 files changed, 54 insertions(+), 48 deletions(-)
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index b1038bd..64a3617 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -10,7 +10,7 @@
#include <linux/percpu_counter.h>
#include <linux/log2.h>
-#include <linux/proportions.h>
+#include <linux/flex_proportions.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/sched.h>
@@ -89,11 +89,11 @@ struct backing_dev_info {
unsigned long dirty_ratelimit;
unsigned long balanced_dirty_ratelimit;
- struct prop_local_percpu completions;
+ struct fprop_local_percpu completions;
int dirty_exceeded;
unsigned int min_ratio;
- unsigned int max_ratio, max_prop_frac;
+ unsigned int max_ratio;
struct bdi_writeback wb; /* default writeback info for this bdi */
spinlock_t wb_lock; /* protects work_list */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index dd8e2aa..f3a2608 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -677,7 +677,6 @@ int bdi_init(struct backing_dev_info *bdi)
bdi->min_ratio = 0;
bdi->max_ratio = 100;
- bdi->max_prop_frac = PROP_FRAC_BASE;
spin_lock_init(&bdi->wb_lock);
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->work_list);
@@ -700,7 +699,7 @@ int bdi_init(struct backing_dev_info *bdi)
bdi->write_bandwidth = INIT_BW;
bdi->avg_write_bandwidth = INIT_BW;
- err = prop_local_init_percpu(&bdi->completions);
+ err = fprop_local_init_percpu(&bdi->completions);
if (err) {
err:
@@ -744,7 +743,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
percpu_counter_destroy(&bdi->bdi_stat[i]);
- prop_local_destroy_percpu(&bdi->completions);
+ fprop_local_destroy_percpu(&bdi->completions);
}
EXPORT_SYMBOL(bdi_destroy);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 26adea8..97c6396 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
#include <linux/syscalls.h>
#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
#include <linux/pagevec.h>
+#include <linux/timer.h>
#include <trace/events/writeback.h>
/*
@@ -135,7 +136,20 @@ unsigned long global_dirty_limit;
* measured in page writeback completions.
*
*/
-static struct prop_descriptor vm_completions;
+static struct fprop_global writeout_completions;
+
+static void writeout_period(unsigned long t);
+/* Timer for aging of writeout_completions */
+static struct timer_list writeout_period_timer =
+ TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
+bool writeout_period_timer_running = false;
+
+/*
+ * Length of period for aging writeout fractions of bdis. This is an
+ * arbitrarily chosen number. The longer the period, the slower fractions will
+ * reflect changes in current writeout rate.
+ */
+#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
/*
* Work out the current dirty-memory clamping and background writeout
@@ -322,34 +336,6 @@ bool zone_dirty_ok(struct zone *zone)
zone_page_state(zone, NR_WRITEBACK) <= limit;
}
-/*
- * couple the period to the dirty_ratio:
- *
- * period/2 ~ roundup_pow_of_two(dirty limit)
- */
-static int calc_period_shift(void)
-{
- unsigned long dirty_total;
-
- if (vm_dirty_bytes)
- dirty_total = vm_dirty_bytes / PAGE_SIZE;
- else
- dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) /
- 100;
- return 2 + ilog2(dirty_total - 1);
-}
-
-/*
- * update the period when the dirty threshold changes.
- */
-static void update_completion_period(void)
-{
- int shift = calc_period_shift();
- prop_change_shift(&vm_completions, shift);
-
- writeback_set_ratelimit();
-}
-
int dirty_background_ratio_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
@@ -383,7 +369,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
- update_completion_period();
+ writeback_set_ratelimit();
vm_dirty_bytes = 0;
}
return ret;
@@ -398,7 +384,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
- update_completion_period();
+ writeback_set_ratelimit();
vm_dirty_ratio = 0;
}
return ret;
@@ -411,8 +397,19 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
{
__inc_bdi_stat(bdi, BDI_WRITTEN);
- __prop_inc_percpu_max(&vm_completions, &bdi->completions,
- bdi->max_prop_frac);
+ __fprop_inc_percpu_max(&writeout_completions, &bdi->completions,
+ bdi->max_ratio);
+ /*
+ * This check is racy but it's not important which of the currently
+ * running events will arm the timer or even whether we lose the race
+ * with writeout_period() and writeout_period_timer_running will be
+ * false despite the timer being armed...
+ */
+ if (!writeout_period_timer_running) {
+ writeout_period_timer_running = true;
+ mod_timer(&writeout_period_timer,
+ jiffies + VM_COMPLETIONS_PERIOD_LEN);
+ }
}
void bdi_writeout_inc(struct backing_dev_info *bdi)
@@ -431,10 +428,25 @@ EXPORT_SYMBOL_GPL(bdi_writeout_inc);
static void bdi_writeout_fraction(struct backing_dev_info *bdi,
long *numerator, long *denominator)
{
- prop_fraction_percpu(&vm_completions, &bdi->completions,
+ fprop_fraction_percpu(&writeout_completions, &bdi->completions,
numerator, denominator);
}
+
+static void writeout_period(unsigned long t)
+{
+ if (fprop_new_period(&writeout_completions)) {
+ mod_timer(&writeout_period_timer,
+ jiffies + VM_COMPLETIONS_PERIOD_LEN);
+ } else {
+ /*
+ * Aging has zeroed all fractions. Stop wasting CPU on period
+ * updates.
+ */
+ writeout_period_timer_running = false;
+ }
+}
+
/*
* bdi_min_ratio keeps the sum of the minimum dirty shares of all
* registered backing devices, which, for obvious reasons, can not
@@ -471,12 +483,10 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
return -EINVAL;
spin_lock_bh(&bdi_lock);
- if (bdi->min_ratio > max_ratio) {
+ if (bdi->min_ratio > max_ratio)
ret = -EINVAL;
- } else {
+ else
bdi->max_ratio = max_ratio;
- bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
- }
spin_unlock_bh(&bdi_lock);
return ret;
@@ -1605,13 +1615,10 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
*/
void __init page_writeback_init(void)
{
- int shift;
-
writeback_set_ratelimit();
register_cpu_notifier(&ratelimit_nb);
- shift = calc_period_shift();
- prop_descriptor_init(&vm_completions, shift);
+ fprop_global_init(&writeout_completions);
}
/**
--
1.7.1
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH 1/2] lib: Proportions with flexible period
2012-05-15 15:43 ` [PATCH 1/2] lib: Proportions with flexible period Jan Kara
@ 2012-05-17 21:35 ` Peter Zijlstra
2012-05-18 14:33 ` Jan Kara
2012-05-17 21:56 ` Peter Zijlstra
2012-05-18 10:43 ` Peter Zijlstra
2 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2012-05-17 21:35 UTC (permalink / raw)
To: Jan Kara; +Cc: Wu Fengguang, LKML, linux-mm
On Tue, 2012-05-15 at 17:43 +0200, Jan Kara wrote:
> + if (numerator > ((long long)denominator) * max_frac / 100)
Does that even compile on 32bit archs?
Operator precedence is *,/ left-to-right, so that's:
long long t1 = (long long)denom * max_frac
long long t2 = t1 / 100;
Which is a 64bit signed division.
There's a reason I used that max_prop_frac thing you removed, it avoids
having to do the division at all and allows a mult and shift instead.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 1/2] lib: Proportions with flexible period
2012-05-15 15:43 ` [PATCH 1/2] lib: Proportions with flexible period Jan Kara
2012-05-17 21:35 ` Peter Zijlstra
@ 2012-05-17 21:56 ` Peter Zijlstra
2012-05-18 14:42 ` Jan Kara
2012-05-18 10:43 ` Peter Zijlstra
2 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2012-05-17 21:56 UTC (permalink / raw)
To: Jan Kara; +Cc: Wu Fengguang, LKML, linux-mm
On Tue, 2012-05-15 at 17:43 +0200, Jan Kara wrote:
> +void fprop_fraction_percpu(struct fprop_global *p,
> + struct fprop_local_percpu *pl,
> + unsigned long *numerator, unsigned long *denominator)
> +{
> + unsigned int seq;
> + s64 den;
> +
> + do {
> + seq = read_seqcount_begin(&p->sequence);
> + fprop_reflect_period_percpu(p, pl);
> + *numerator = percpu_counter_read_positive(&pl->events);
> + den = percpu_counter_read(&p->events);
> + if (den <= 0)
> + den = percpu_counter_sum(&p->events);
> + *denominator = den;
> + } while (read_seqcount_retry(&p->sequence, seq));
> +}
why not use percpu_counter_read_positive(&p->events) and ditch
percpu_counter_sum()? That sum can be terribly expensive..
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions
2012-05-15 15:43 ` [PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions Jan Kara
@ 2012-05-17 22:04 ` Peter Zijlstra
2012-05-18 14:24 ` Jan Kara
0 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2012-05-17 22:04 UTC (permalink / raw)
To: Jan Kara; +Cc: Wu Fengguang, LKML, linux-mm
On Tue, 2012-05-15 at 17:43 +0200, Jan Kara wrote:
> +static struct timer_list writeout_period_timer =
> + TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
So the problem with using a deferred timer is that it 'ignores' idle
time. So if a very busy period is followed by a real quiet period you'd
expect all the proportions to have aged to 0, but they won't have.
One way to solve that is to track a jiffies count of the last time the
timer triggered and compute the missed periods from that and extend
fprop_new_period() to deal with period increments of more than 1.
The other is of course to not use deferred timers.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 1/2] lib: Proportions with flexible period
2012-05-15 15:43 ` [PATCH 1/2] lib: Proportions with flexible period Jan Kara
2012-05-17 21:35 ` Peter Zijlstra
2012-05-17 21:56 ` Peter Zijlstra
@ 2012-05-18 10:43 ` Peter Zijlstra
2012-05-18 14:45 ` Jan Kara
2 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2012-05-18 10:43 UTC (permalink / raw)
To: Jan Kara; +Cc: Wu Fengguang, LKML, linux-mm
On Tue, 2012-05-15 at 17:43 +0200, Jan Kara wrote:
> +void __fprop_inc_percpu_max(struct fprop_global *p,
> + struct fprop_local_percpu *pl, int max_frac)
> +{
> + if (unlikely(max_frac < 100)) {
> + unsigned long numerator, denominator;
> +
> + fprop_fraction_percpu(p, pl, &numerator, &denominator);
> + if (numerator > ((long long)denominator) * max_frac / 100)
> + return;
Another thing, your fprop_fraction_percpu() can he horribly expensive
due to using _sum() (and to a lesser degree the retry), remember that
this function is called for _every_ page written out.
Esp. on the mega fast storage (multi-spindle or SSD) they're pushing cpu
limits as it is with iops, we should be very careful not to make it more
expensive than absolutely needed.
> + } else
> + fprop_reflect_period_percpu(p, pl);
> + __percpu_counter_add(&pl->events, 1, PROP_BATCH);
> + percpu_counter_add(&p->events, 1);
> +}
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions
2012-05-17 22:04 ` Peter Zijlstra
@ 2012-05-18 14:24 ` Jan Kara
2012-05-18 14:34 ` Peter Zijlstra
0 siblings, 1 reply; 14+ messages in thread
From: Jan Kara @ 2012-05-18 14:24 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Jan Kara, Wu Fengguang, LKML, linux-mm
On Fri 18-05-12 00:04:33, Peter Zijlstra wrote:
> On Tue, 2012-05-15 at 17:43 +0200, Jan Kara wrote:
> > +static struct timer_list writeout_period_timer =
> > + TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
>
> So the problem with using a deferred timer is that it 'ignores' idle
> time. So if a very busy period is followed by a real quiet period you'd
> expect all the proportions to have aged to 0, but they won't have.
Ah, I see. Thanks for warning me.
> One way to solve that is to track a jiffies count of the last time the
> timer triggered and compute the missed periods from that and extend
> fprop_new_period() to deal with period increments of more than 1.
Yeah, that should be easy enough so I'll try it that way since I presume
it's nicer to power usage to use deferred timers if it's reasonably
possible.
Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 1/2] lib: Proportions with flexible period
2012-05-17 21:35 ` Peter Zijlstra
@ 2012-05-18 14:33 ` Jan Kara
0 siblings, 0 replies; 14+ messages in thread
From: Jan Kara @ 2012-05-18 14:33 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Jan Kara, Wu Fengguang, LKML, linux-mm
On Thu 17-05-12 23:35:12, Peter Zijlstra wrote:
> On Tue, 2012-05-15 at 17:43 +0200, Jan Kara wrote:
> > + if (numerator > ((long long)denominator) * max_frac / 100)
>
> Does that even compile on 32bit archs?
>
> Operator precedence is *,/ left-to-right, so that's:
>
> long long t1 = (long long)denom * max_frac
> long long t2 = t1 / 100;
>
> Which is a 64bit signed division.
>
> There's a reason I used that max_prop_frac thing you removed, it avoids
> having to do the division at all and allows a mult and shift instead.
Yeah, I misunderstood it's purpose when I read the code originally. I'll
put it back to avoid the division since this is a hot path.
Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions
2012-05-18 14:24 ` Jan Kara
@ 2012-05-18 14:34 ` Peter Zijlstra
0 siblings, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2012-05-18 14:34 UTC (permalink / raw)
To: Jan Kara; +Cc: Wu Fengguang, LKML, linux-mm
On Fri, 2012-05-18 at 16:24 +0200, Jan Kara wrote:
> Yeah, that should be easy enough so I'll try it that way since I presume
> it's nicer to power usage to use deferred timers if it's reasonably
> possible.
Btw, your current scheme also drifts. Since you do jiffes + 3*HZ you
period might actually be longer if the timer got delayed.
If you keep an external jiffies count like:
unsigned long period_jiffies = jiffies;
void my_timer_func()
{
unsigned long delta = jiffies - period_jiffies;
unsigned long periods = delta / 3*HZ;
age(periods);
period_jiffies += 3*HZ * periods;
mod_timer(&my_timer, period_jiffies);
}
it all works without drift (+- bugs of course).
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 1/2] lib: Proportions with flexible period
2012-05-17 21:56 ` Peter Zijlstra
@ 2012-05-18 14:42 ` Jan Kara
0 siblings, 0 replies; 14+ messages in thread
From: Jan Kara @ 2012-05-18 14:42 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Jan Kara, Wu Fengguang, LKML, linux-mm
On Thu 17-05-12 23:56:45, Peter Zijlstra wrote:
> On Tue, 2012-05-15 at 17:43 +0200, Jan Kara wrote:
> > +void fprop_fraction_percpu(struct fprop_global *p,
> > + struct fprop_local_percpu *pl,
> > + unsigned long *numerator, unsigned long *denominator)
> > +{
> > + unsigned int seq;
> > + s64 den;
> > +
> > + do {
> > + seq = read_seqcount_begin(&p->sequence);
> > + fprop_reflect_period_percpu(p, pl);
> > + *numerator = percpu_counter_read_positive(&pl->events);
> > + den = percpu_counter_read(&p->events);
> > + if (den <= 0)
> > + den = percpu_counter_sum(&p->events);
> > + *denominator = den;
> > + } while (read_seqcount_retry(&p->sequence, seq));
> > +}
>
>
> why not use percpu_counter_read_positive(&p->events) and ditch
> percpu_counter_sum()? That sum can be terribly expensive..
Yes. I'm actually not sure why I used the _sum here... Thanks for
spotting this.
Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 1/2] lib: Proportions with flexible period
2012-05-18 10:43 ` Peter Zijlstra
@ 2012-05-18 14:45 ` Jan Kara
0 siblings, 0 replies; 14+ messages in thread
From: Jan Kara @ 2012-05-18 14:45 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Jan Kara, Wu Fengguang, LKML, linux-mm
On Fri 18-05-12 12:43:44, Peter Zijlstra wrote:
> On Tue, 2012-05-15 at 17:43 +0200, Jan Kara wrote:
> > +void __fprop_inc_percpu_max(struct fprop_global *p,
> > + struct fprop_local_percpu *pl, int max_frac)
> > +{
> > + if (unlikely(max_frac < 100)) {
> > + unsigned long numerator, denominator;
> > +
> > + fprop_fraction_percpu(p, pl, &numerator, &denominator);
> > + if (numerator > ((long long)denominator) * max_frac / 100)
> > + return;
>
> Another thing, your fprop_fraction_percpu() can he horribly expensive
> due to using _sum() (and to a lesser degree the retry), remember that
> this function is called for _every_ page written out.
The retry happens only when new period is declared while
fprop_fraction_percpu() is running. So that should be rather exceptional.
Regarding the _sum I agree, luckily that's easy enough to fix.
> Esp. on the mega fast storage (multi-spindle or SSD) they're pushing cpu
> limits as it is with iops, we should be very careful not to make it more
> expensive than absolutely needed.
Yup.
> > + } else
> > + fprop_reflect_period_percpu(p, pl);
> > + __percpu_counter_add(&pl->events, 1, PROP_BATCH);
> > + percpu_counter_add(&p->events, 1);
> > +}
Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH 1/2] lib: Proportions with flexible period
2012-05-24 16:59 [PATCH 0/2 v4] Flexible proportions Jan Kara
@ 2012-05-24 16:59 ` Jan Kara
0 siblings, 0 replies; 14+ messages in thread
From: Jan Kara @ 2012-05-24 16:59 UTC (permalink / raw)
To: Wu Fengguang; +Cc: Peter Zijlstra, linux-mm, LKML, Jan Kara
Implement code computing proportions of events of different type (like code in
lib/proportions.c) but allowing periods to have different lengths. This allows
us to have aging periods of fixed wallclock time which gives better proportion
estimates given the hugely varying throughput of different devices - previous
measuring of aging period by number of events has the problem that a reasonable
period length for a system with low-end USB stick is not a reasonable period
length for a system with high-end storage array resulting either in too slow
proportion updates or too fluctuating proportion updates.
Signed-off-by: Jan Kara <jack@suse.cz>
---
include/linux/flex_proportions.h | 100 ++++++++++++++
lib/Makefile | 2 +-
lib/flex_proportions.c | 267 ++++++++++++++++++++++++++++++++++++++
3 files changed, 368 insertions(+), 1 deletions(-)
create mode 100644 include/linux/flex_proportions.h
create mode 100644 lib/flex_proportions.c
diff --git a/include/linux/flex_proportions.h b/include/linux/flex_proportions.h
new file mode 100644
index 0000000..9ac291c
--- /dev/null
+++ b/include/linux/flex_proportions.h
@@ -0,0 +1,100 @@
+/*
+ * Floating proportions with flexible aging period
+ *
+ * Copyright (C) 2011, SUSE, Jan Kara <jack@suse.cz>
+ */
+
+#ifndef _LINUX_FLEX_PROPORTIONS_H
+#define _LINUX_FLEX_PROPORTIONS_H
+
+#include <linux/percpu_counter.h>
+#include <linux/spinlock.h>
+#include <linux/seqlock.h>
+
+/*
+ * When maximum proportion of some event type is specified, this is the
+ * precision with which we allow limitting. Note that this creates an upper
+ * bound on the number of events per period like
+ * ULLONG_MAX >> FPROP_FRAC_SHIFT.
+ */
+#define FPROP_FRAC_SHIFT 10
+#define FPROP_FRAC_BASE (1UL << FPROP_FRAC_SHIFT)
+
+/*
+ * ---- Global proportion definitions ----
+ */
+struct fprop_global {
+ /* Number of events in the current period */
+ struct percpu_counter events;
+ /* Current period */
+ unsigned int period;
+ /* Synchronization with period transitions */
+ seqcount_t sequence;
+};
+
+int fprop_global_init(struct fprop_global *p);
+void fprop_global_destroy(struct fprop_global *p);
+bool fprop_new_period(struct fprop_global *p, int periods);
+
+/*
+ * ---- SINGLE ----
+ */
+struct fprop_local_single {
+ /* the local events counter */
+ unsigned long events;
+ /* Period in which we last updated events */
+ unsigned int period;
+ raw_spinlock_t lock; /* Protect period and numerator */
+};
+
+#define INIT_FPROP_LOCAL_SINGLE(name) \
+{ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \
+}
+
+int fprop_local_init_single(struct fprop_local_single *pl);
+void __fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl);
+void fprop_fraction_single(struct fprop_global *p,
+ struct fprop_local_single *pl, unsigned long *numerator,
+ unsigned long *denominator);
+
+static inline
+void fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __fprop_inc_single(p, pl);
+ local_irq_restore(flags);
+}
+
+/*
+ * ---- PERCPU ----
+ */
+struct fprop_local_percpu {
+ /* the local events counter */
+ struct percpu_counter events;
+ /* Period in which we last updated events */
+ unsigned int period;
+ raw_spinlock_t lock; /* Protect period and numerator */
+};
+
+int fprop_local_init_percpu(struct fprop_local_percpu *pl);
+void fprop_local_destroy_percpu(struct fprop_local_percpu *pl);
+void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl);
+void __fprop_inc_percpu_max(struct fprop_global *p, struct fprop_local_percpu *pl,
+ int max_frac);
+void fprop_fraction_percpu(struct fprop_global *p,
+ struct fprop_local_percpu *pl, unsigned long *numerator,
+ unsigned long *denominator);
+
+static inline
+void fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __fprop_inc_percpu(p, pl);
+ local_irq_restore(flags);
+}
+
+#endif
diff --git a/lib/Makefile b/lib/Makefile
index 74290c9..8dd81cf 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -11,7 +11,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
rbtree.o radix-tree.o dump_stack.o timerqueue.o\
idr.o int_sqrt.o extable.o prio_tree.o \
sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
- proportions.o prio_heap.o ratelimit.o show_mem.o \
+ proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \
is_single_threaded.o plist.o decompress.o
lib-$(CONFIG_MMU) += ioremap.o
diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c
new file mode 100644
index 0000000..530dbc2
--- /dev/null
+++ b/lib/flex_proportions.c
@@ -0,0 +1,267 @@
+/*
+ * Floating proportions with flexible aging period
+ *
+ * Copyright (C) 2011, SUSE, Jan Kara <jack@suse.cz>
+ *
+ * The goal of this code is: Given different types of event, measure proportion
+ * of each type of event over time. The proportions are measured with
+ * exponentially decaying history to give smooth transitions. A formula
+ * expressing proportion of event of type 'j' is:
+ *
+ * p_{j} = (\Sum_{i>=0} x_{i,j}/2^{i+1})/(\Sum_{i>=0} x_i/2^{i+1})
+ *
+ * Where x_{i,j} is j's number of events in i-th last time period and x_i is
+ * total number of events in i-th last time period.
+ *
+ * Note that p_{j}'s are normalised, i.e.
+ *
+ * \Sum_{j} p_{j} = 1,
+ *
+ * This formula can be straightforwardly computed by maintaing denominator
+ * (let's call it 'd') and for each event type its numerator (let's call it
+ * 'n_j'). When an event of type 'j' happens, we simply need to do:
+ * n_j++; d++;
+ *
+ * When a new period is declared, we could do:
+ * d /= 2
+ * for each j
+ * n_j /= 2
+ *
+ * To avoid iteration over all event types, we instead shift numerator of event
+ * j lazily when someone asks for a proportion of event j or when event j
+ * occurs. This can bit trivially implemented by remembering last period in
+ * which something happened with proportion of type j.
+ */
+#include <linux/flex_proportions.h>
+
+int fprop_global_init(struct fprop_global *p)
+{
+ int err;
+
+ p->period = 0;
+ /* Use 1 to avoid dealing with periods with 0 events... */
+ err = percpu_counter_init(&p->events, 1);
+ if (err)
+ return err;
+ seqcount_init(&p->sequence);
+ return 0;
+}
+
+void fprop_global_destroy(struct fprop_global *p)
+{
+ percpu_counter_destroy(&p->events);
+}
+
+/*
+ * Declare @periods new periods. It is upto the caller to make sure period
+ * transitions cannot happen in parallel.
+ *
+ * The function returns true if the proportions are still defined and false
+ * if aging zeroed out all events. This can be used to detect whether declaring
+ * further periods has any effect.
+ */
+bool fprop_new_period(struct fprop_global *p, int periods)
+{
+ u64 events = percpu_counter_sum(&p->events);
+
+ /*
+ * Don't do anything if there are no events.
+ */
+ if (events <= 1)
+ return false;
+ write_seqcount_begin(&p->sequence);
+ if (periods < 64)
+ events -= events >> periods;
+ /* Use addition to avoid losing events happening between sum and set */
+ percpu_counter_add(&p->events, -events);
+ p->period += periods;
+ write_seqcount_end(&p->sequence);
+
+ return true;
+}
+
+/*
+ * ---- SINGLE ----
+ */
+
+int fprop_local_init_single(struct fprop_local_single *pl)
+{
+ pl->events = 0;
+ pl->period = 0;
+ raw_spin_lock_init(&pl->lock);
+ return 0;
+}
+
+void fprop_local_destroy_single(struct fprop_local_single *pl)
+{
+}
+
+static void fprop_reflect_period_single(struct fprop_global *p,
+ struct fprop_local_single *pl)
+{
+ unsigned int period = p->period;
+ unsigned long flags;
+
+ /* Fast path - period didn't change */
+ if (pl->period == period)
+ return;
+ raw_spin_lock_irqsave(&pl->lock, flags);
+ /* Someone updated pl->period while we were spinning? */
+ if (pl->period >= period) {
+ raw_spin_unlock_irqrestore(&pl->lock, flags);
+ return;
+ }
+ /* Aging zeroed our fraction? */
+ if (period - pl->period < BITS_PER_LONG)
+ pl->events >>= period - pl->period;
+ else
+ pl->events = 0;
+ pl->period = period;
+ raw_spin_unlock_irqrestore(&pl->lock, flags);
+}
+
+/* Event of type pl happened */
+void __fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl)
+{
+ fprop_reflect_period_single(p, pl);
+ pl->events++;
+ percpu_counter_add(&p->events, 1);
+}
+
+/* Return fraction of events of type pl */
+void fprop_fraction_single(struct fprop_global *p,
+ struct fprop_local_single *pl,
+ unsigned long *numerator, unsigned long *denominator)
+{
+ unsigned int seq;
+ s64 num, den;
+
+ do {
+ seq = read_seqcount_begin(&p->sequence);
+ fprop_reflect_period_single(p, pl);
+ num = pl->events;
+ den = percpu_counter_read_positive(&p->events);
+ } while (read_seqcount_retry(&p->sequence, seq));
+
+ /*
+ * Make fraction <= 1 and denominator > 0 even in presence of percpu
+ * counter errors
+ */
+ if (den <= num) {
+ if (num)
+ den = num;
+ else
+ den = 1;
+ }
+ *denominator = den;
+ *numerator = num;
+}
+
+/*
+ * ---- PERCPU ----
+ */
+#define PROP_BATCH (8*(1+ilog2(nr_cpu_ids)))
+
+int fprop_local_init_percpu(struct fprop_local_percpu *pl)
+{
+ int err;
+
+ err = percpu_counter_init(&pl->events, 0);
+ if (err)
+ return err;
+ pl->period = 0;
+ raw_spin_lock_init(&pl->lock);
+ return 0;
+}
+
+void fprop_local_destroy_percpu(struct fprop_local_percpu *pl)
+{
+ percpu_counter_destroy(&pl->events);
+}
+
+static void fprop_reflect_period_percpu(struct fprop_global *p,
+ struct fprop_local_percpu *pl)
+{
+ unsigned int period = p->period;
+ unsigned long flags;
+
+ /* Fast path - period didn't change */
+ if (pl->period == period)
+ return;
+ raw_spin_lock_irqsave(&pl->lock, flags);
+ /* Someone updated pl->period while we were spinning? */
+ if (pl->period >= period) {
+ raw_spin_unlock_irqrestore(&pl->lock, flags);
+ return;
+ }
+ /* Aging zeroed our fraction? */
+ if (period - pl->period < BITS_PER_LONG) {
+ s64 val = percpu_counter_read(&pl->events);
+
+ if (val < (nr_cpu_ids * PROP_BATCH))
+ val = percpu_counter_sum(&pl->events);
+
+ __percpu_counter_add(&pl->events,
+ -val + (val >> (period-pl->period)), PROP_BATCH);
+ } else
+ percpu_counter_set(&pl->events, 0);
+ pl->period = period;
+ raw_spin_unlock_irqrestore(&pl->lock, flags);
+}
+
+/* Event of type pl happened */
+void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl)
+{
+ fprop_reflect_period_percpu(p, pl);
+ __percpu_counter_add(&pl->events, 1, PROP_BATCH);
+ percpu_counter_add(&p->events, 1);
+}
+
+void fprop_fraction_percpu(struct fprop_global *p,
+ struct fprop_local_percpu *pl,
+ unsigned long *numerator, unsigned long *denominator)
+{
+ unsigned int seq;
+ s64 num, den;
+
+ do {
+ seq = read_seqcount_begin(&p->sequence);
+ fprop_reflect_period_percpu(p, pl);
+ num = percpu_counter_read_positive(&pl->events);
+ den = percpu_counter_read_positive(&p->events);
+ } while (read_seqcount_retry(&p->sequence, seq));
+
+ /*
+ * Make fraction <= 1 and denominator > 0 even in presence of percpu
+ * counter errors
+ */
+ if (den <= num) {
+ if (num)
+ den = num;
+ else
+ den = 1;
+ }
+ *denominator = den;
+ *numerator = num;
+}
+
+/*
+ * Like __fprop_inc_percpu() except that event is counted only if the given
+ * type has fraction smaller than @max_frac/FPROP_FRAC_BASE
+ */
+void __fprop_inc_percpu_max(struct fprop_global *p,
+ struct fprop_local_percpu *pl, int max_frac)
+{
+ if (unlikely(max_frac < FPROP_FRAC_BASE)) {
+ unsigned long numerator, denominator;
+
+ fprop_fraction_percpu(p, pl, &numerator, &denominator);
+ if (numerator >
+ (((u64)denominator) * max_frac) >> FPROP_FRAC_SHIFT)
+ return;
+ } else
+ fprop_reflect_period_percpu(p, pl);
+ __percpu_counter_add(&pl->events, 1, PROP_BATCH);
+ percpu_counter_add(&p->events, 1);
+}
+
--
1.7.1
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related [flat|nested] 14+ messages in thread
end of thread, other threads:[~2012-05-24 16:59 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-05-15 15:43 [PATCH 0/2 v3] Flexible proportions Jan Kara
2012-05-15 15:43 ` [PATCH 1/2] lib: Proportions with flexible period Jan Kara
2012-05-17 21:35 ` Peter Zijlstra
2012-05-18 14:33 ` Jan Kara
2012-05-17 21:56 ` Peter Zijlstra
2012-05-18 14:42 ` Jan Kara
2012-05-18 10:43 ` Peter Zijlstra
2012-05-18 14:45 ` Jan Kara
2012-05-15 15:43 ` [PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions Jan Kara
2012-05-17 22:04 ` Peter Zijlstra
2012-05-18 14:24 ` Jan Kara
2012-05-18 14:34 ` Peter Zijlstra
-- strict thread matches above, loose matches on Subject: below --
2012-05-24 16:59 [PATCH 0/2 v4] Flexible proportions Jan Kara
2012-05-24 16:59 ` [PATCH 1/2] lib: Proportions with flexible period Jan Kara
2012-05-03 22:39 [PATCH 0/2 v2] Flexible proportions for BDIs Jan Kara
2012-05-03 22:39 ` [PATCH 1/2] lib: Proportions with flexible period Jan Kara
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).