The Linux Kernel Mailing List
 help / color / mirror / Atom feed
* Re: [PATCH] isci: fix typo in deg_dbg message
From: Bart Van Assche @ 2016-11-14 22:23 UTC (permalink / raw)
  To: Colin King, Intel SCU Linux support, Artur Paszkiewicz,
	James E . J . Bottomley, Martin K . Petersen, linux-scsi
  Cc: linux-kernel
In-Reply-To: <20161112183026.9626-1-colin.king@canonical.com>

On 11/12/2016 10:30 AM, Colin King wrote:
> Trivial fix to typo "repsonse" to "response" in dev_dbg message.

Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>

^ permalink raw reply

* [PATCH V4 10/15] blk-throttle: add a simple idle detection
From: Shaohua Li @ 2016-11-14 22:22 UTC (permalink / raw)
  To: linux-block, linux-kernel; +Cc: Kernel-team, axboe, tj, vgoyal
In-Reply-To: <cover.1479161136.git.shli@fb.com>

A cgroup gets assigned a high limit, but the cgroup could never dispatch
enough IO to cross the high limit. In such case, the queue state machine
will remain in LIMIT_HIGH state and all other cgroups will be throttled
according to high limit. This is unfair for other cgroups. We should
treat the cgroup idle and upgrade the state machine to higher state.

We also have a downgrade logic. If the state machine upgrades because of
cgroup idle (real idle), the state machine will downgrade soon as the
cgroup is below its high limit. This isn't what we want. A more
complicated case is cgroup isn't idle when queue is in LIMIT_HIGH. But
when queue gets upgraded to higher state, other cgroups could dispatch
more IO and this cgroup can't dispatch enough IO, so the cgroup is below
its high limit and looks like idle (fake idle). In this case, the queue
should downgrade soon. The key to determine if we should do downgrade is
to detect if cgroup is truly idle.

Unfortunately it's very hard to determine if a cgroup is real idle. This
patch uses the 'think time check' idea from CFQ for the purpose. Please
note, the idea doesn't work for all workloads. For example, a workload
with io depth 8 has disk utilization 100%, hence think time is 0, eg,
not idle. But the workload can run higher bandwidth with io depth 16.
Compared to io depth 16, the io depth 8 workload is idle. We use the
idea to roughly determine if a cgroup is idle.

We treat a cgroup idle if its think time is above a threshold (by
default 50us for SSD and 1ms for HD). The idea is think time above the
threshold will start to harm performance. HD is much slower so a longer
think time is ok.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 block/bio.c               |  2 ++
 block/blk-throttle.c      | 72 ++++++++++++++++++++++++++++++++++++++++++++++-
 block/blk.h               |  2 ++
 include/linux/blk_types.h |  1 +
 4 files changed, 76 insertions(+), 1 deletion(-)

diff --git a/block/bio.c b/block/bio.c
index db85c57..7baa86d 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -30,6 +30,7 @@
 #include <linux/cgroup.h>
 
 #include <trace/events/block.h>
+#include "blk.h"
 
 /*
  * Test patch to inline a certain number of bi_io_vec's inside the bio
@@ -1759,6 +1760,7 @@ void bio_endio(struct bio *bio)
 		goto again;
 	}
 
+	blk_throtl_bio_endio(bio);
 	if (bio->bi_end_io)
 		bio->bi_end_io(bio);
 }
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 45a28c4..cb5fd85 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -21,6 +21,8 @@ static int throtl_quantum = 32;
 /* Throttling is performed over 100ms slice and after that slice is renewed */
 #define DFL_THROTL_SLICE (HZ / 10)
 #define MAX_THROTL_SLICE (HZ / 5)
+#define DFL_IDLE_THRESHOLD_SSD (50 * 1000) /* 50 us */
+#define DFL_IDLE_THRESHOLD_HD (1000 * 1000) /* 1 ms */
 
 static struct blkcg_policy blkcg_policy_throtl;
 
@@ -149,6 +151,10 @@ struct throtl_grp {
 	/* When did we start a new slice */
 	unsigned long slice_start[2];
 	unsigned long slice_end[2];
+
+	u64 last_finish_time;
+	u64 checked_last_finish_time;
+	u64 avg_ttime;
 };
 
 struct throtl_data
@@ -172,6 +178,8 @@ struct throtl_data
 	unsigned long high_downgrade_time;
 
 	unsigned int scale;
+
+	u64 idle_ttime_threshold;
 };
 
 static void throtl_pending_timer_fn(unsigned long arg);
@@ -1629,6 +1637,14 @@ static unsigned long tg_last_high_overflow_time(struct throtl_grp *tg)
 	return ret;
 }
 
+static bool throtl_tg_is_idle(struct throtl_grp *tg)
+{
+	/* cgroup is idle if average think time is more than threshold */
+	return ktime_get_ns() - tg->last_finish_time >
+		4 * tg->td->idle_ttime_threshold ||
+	       tg->avg_ttime > tg->td->idle_ttime_threshold;
+}
+
 static bool throtl_upgrade_check_one(struct throtl_grp *tg)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
@@ -1837,6 +1853,19 @@ static void throtl_downgrade_check(struct throtl_grp *tg)
 	tg->last_io_disp[WRITE] = 0;
 }
 
+static void blk_throtl_update_ttime(struct throtl_grp *tg)
+{
+	u64 now = ktime_get_ns();
+	u64 last_finish_time = tg->last_finish_time;
+
+	if (now <= last_finish_time || last_finish_time == 0 ||
+	    last_finish_time == tg->checked_last_finish_time)
+		return;
+
+	tg->avg_ttime = (tg->avg_ttime * 7 + now - last_finish_time) >> 3;
+	tg->checked_last_finish_time = last_finish_time;
+}
+
 bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 		    struct bio *bio)
 {
@@ -1848,6 +1877,13 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
 
+	if (tg->td->idle_ttime_threshold == -1) {
+		if (blk_queue_nonrot(q))
+			tg->td->idle_ttime_threshold = DFL_IDLE_THRESHOLD_SSD;
+		else
+			tg->td->idle_ttime_threshold = DFL_IDLE_THRESHOLD_HD;
+	}
+
 	/* see throtl_charge_bio() */
 	if ((bio->bi_opf & REQ_THROTTLED) || !tg->has_rules[rw])
 		goto out;
@@ -1857,6 +1893,11 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 	if (unlikely(blk_queue_bypass(q)))
 		goto out_unlock;
 
+	bio_associate_current(bio);
+	bio->bi_cg_private = q;
+
+	blk_throtl_update_ttime(tg);
+
 	sq = &tg->service_queue;
 
 again:
@@ -1917,7 +1958,6 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 
 	tg->last_high_overflow_time[rw] = jiffies;
 
-	bio_associate_current(bio);
 	tg->td->nr_queued[rw]++;
 	throtl_add_bio_tg(bio, qn, tg);
 	throttled = true;
@@ -1946,6 +1986,34 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 	return throttled;
 }
 
+void blk_throtl_bio_endio(struct bio *bio)
+{
+	struct blkcg *blkcg;
+	struct blkcg_gq *blkg;
+	struct throtl_grp *tg;
+	struct request_queue *q;
+
+	q = bio->bi_cg_private;
+	if (!q)
+		return;
+	bio->bi_cg_private = NULL;
+
+	rcu_read_lock();
+	blkcg = bio_blkcg(bio);
+	if (!blkcg)
+		goto end;
+	blkg = blkg_lookup(blkcg, q);
+	if (!blkg)
+		goto end;
+
+	tg = blkg_to_tg(blkg ?: q->root_blkg);
+
+	tg->last_finish_time = ktime_get_ns();
+
+end:
+	rcu_read_unlock();
+}
+
 /*
  * Dispatch all bios from all children tg's queued on @parent_sq.  On
  * return, @parent_sq is guaranteed to not have any active children tg's
@@ -2030,6 +2098,8 @@ int blk_throtl_init(struct request_queue *q)
 	td->limit_index = LIMIT_MAX;
 	td->high_upgrade_time = jiffies;
 	td->high_downgrade_time = jiffies;
+
+	td->idle_ttime_threshold = -1;
 	/* activate policy */
 	ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
 	if (ret)
diff --git a/block/blk.h b/block/blk.h
index 39c14dd..b433f35 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -292,10 +292,12 @@ extern void blk_throtl_exit(struct request_queue *q);
 extern ssize_t blk_throtl_slice_show(struct request_queue *q, char *page);
 extern ssize_t blk_throtl_slice_store(struct request_queue *q,
 	const char *page, size_t count);
+extern void blk_throtl_bio_endio(struct bio *bio);
 #else /* CONFIG_BLK_DEV_THROTTLING */
 static inline void blk_throtl_drain(struct request_queue *q) { }
 static inline int blk_throtl_init(struct request_queue *q) { return 0; }
 static inline void blk_throtl_exit(struct request_queue *q) { }
+static inline void blk_throtl_bio_endio(struct bio *bio) { }
 #endif /* CONFIG_BLK_DEV_THROTTLING */
 
 #endif /* BLK_INTERNAL_H */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index cd395ec..ff8dd24 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -59,6 +59,7 @@ struct bio {
 	 */
 	struct io_context	*bi_ioc;
 	struct cgroup_subsys_state *bi_css;
+	void *bi_cg_private;
 #endif
 	union {
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
-- 
2.9.3

^ permalink raw reply related

* [PATCH V4 08/15] blk-throttle: detect completed idle cgroup
From: Shaohua Li @ 2016-11-14 22:22 UTC (permalink / raw)
  To: linux-block, linux-kernel; +Cc: Kernel-team, axboe, tj, vgoyal
In-Reply-To: <cover.1479161136.git.shli@fb.com>

cgroup could be assigned a limit, but doesn't dispatch enough IO, eg the
cgroup is idle. When this happens, the cgroup doesn't hit its limit, so
we can't move the state machine to higher level and all cgroups will be
throttled to their lower limit, so we waste bandwidth. Detecting idle
cgroup is hard. This patch handles a simple case, a cgroup doesn't
dispatch any IO. We ignore such cgroup's limit, so other cgroups can use
the bandwidth.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 block/blk-throttle.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index e85b2b6..32cc6ec 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -144,6 +144,8 @@ struct throtl_grp {
 
 	unsigned long last_check_time;
 
+	unsigned long last_dispatch_time[2];
+
 	/* When did we start a new slice */
 	unsigned long slice_start[2];
 	unsigned long slice_end[2];
@@ -438,11 +440,14 @@ static void tg_update_has_rules(struct throtl_grp *tg)
 
 static void throtl_pd_online(struct blkg_policy_data *pd)
 {
+	struct throtl_grp *tg = pd_to_tg(pd);
 	/*
 	 * We don't want new groups to escape the limits of its ancestors.
 	 * Update has_rules[] after a new group is brought online.
 	 */
-	tg_update_has_rules(pd_to_tg(pd));
+	tg_update_has_rules(tg);
+	tg->last_dispatch_time[READ] = jiffies;
+	tg->last_dispatch_time[WRITE] = jiffies;
 }
 
 static void blk_throtl_update_valid_limit(struct throtl_data *td)
@@ -1611,6 +1616,12 @@ static bool throtl_upgrade_check_one(struct throtl_grp *tg)
 	if (write_limit && sq->nr_queued[WRITE] &&
 	    (!read_limit || sq->nr_queued[READ]))
 		return true;
+
+	if (time_after_eq(jiffies,
+	     tg->last_dispatch_time[READ] + tg->td->throtl_slice) &&
+	    time_after_eq(jiffies,
+	     tg->last_dispatch_time[WRITE] + tg->td->throtl_slice))
+		return true;
 	return false;
 }
 
@@ -1691,6 +1702,11 @@ static bool throtl_downgrade_check_one(struct throtl_grp *tg)
 	struct throtl_data *td = tg->td;
 	unsigned long now = jiffies;
 
+	if (time_after_eq(now, tg->last_dispatch_time[READ] +
+					td->throtl_slice) &&
+	    time_after_eq(now, tg->last_dispatch_time[WRITE] +
+					td->throtl_slice))
+		return false;
 	/*
 	 * If cgroup is below high limit, consider downgrade and throttle other
 	 * cgroups
@@ -1811,6 +1827,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 
 again:
 	while (true) {
+		tg->last_dispatch_time[rw] = jiffies;
 		if (tg->last_high_overflow_time[rw] == 0)
 			tg->last_high_overflow_time[rw] = jiffies;
 		throtl_downgrade_check(tg);
-- 
2.9.3

^ permalink raw reply related

* [PATCH V4 09/15] blk-throttle: make bandwidth change smooth
From: Shaohua Li @ 2016-11-14 22:22 UTC (permalink / raw)
  To: linux-block, linux-kernel; +Cc: Kernel-team, axboe, tj, vgoyal
In-Reply-To: <cover.1479161136.git.shli@fb.com>

When cgroups all reach high limit, cgroups can dispatch more IO. This
could make some cgroups dispatch more IO but others not, and even some
cgroups could dispatch less IO than their high limit. For example, cg1
high limit 10MB/s, cg2 limit 80MB/s, assume disk maximum bandwidth is
120M/s for the workload. Their bps could be something like this:

cg1/cg2 bps: T1: 10/80 -> T2: 60/60 -> T3: 10/80

At T1, all cgroups reach high limit, so they can dispatch more IO later.
Then cg1 dispatch more IO and cg2 has no room to dispatch enough IO. At
T2, cg2 only dispatches 60M/s. Since we detect cg2 dispatches less IO
than its high limit 80M/s, we downgrade the queue from LIMIT_MAX to
LIMIT_HIGH, then all cgroups are throttled to their high limit (T3). cg2
will have bandwidth below its high limit at most time.

The big problem here is we don't know the maximum bandwidth of the
workload, so we can't make smart decision to avoid the situation. This
patch makes cgroup bandwidth change smooth. After disk upgrades from
LIMIT_HIGH to LIMIT_MAX, we don't allow cgroups to use all bandwidth up to
their max limit immediately. Their bandwidth limit will be increased
gradually to avoid the above situation. So the above example will become
something like:

cg1/cg2 bps: 10/80 -> 15/105 -> 20/100 -> 25/95 -> 30/90 -> 35/85 -> 40/80
-> 45/75 -> 10/80

In this way cgroups bandwidth will be above their limit in majority
time, this still doesn't fully utilize disk bandwidth, but that's
something we pay for sharing.

Note this doesn't completely avoid cgroup running under its high limit.
The best way to guarantee cgroup doesn't run under its limit is to set
max limit. For example, if we set cg1 max limit to 40, cg2 will never
run under its high limit.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 block/blk-throttle.c | 42 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 38 insertions(+), 4 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 32cc6ec..45a28c4 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -170,6 +170,8 @@ struct throtl_data
 
 	unsigned long high_upgrade_time;
 	unsigned long high_downgrade_time;
+
+	unsigned int scale;
 };
 
 static void throtl_pending_timer_fn(unsigned long arg);
@@ -224,12 +226,27 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
 static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)
 {
 	struct blkcg_gq *blkg = tg_to_blkg(tg);
+	struct throtl_data *td;
 	uint64_t ret;
 
 	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
 		return -1;
-	ret = tg->bps[rw][tg->td->limit_index];
-	if (ret == -1 && tg->td->limit_index == LIMIT_HIGH)
+	td = tg->td;
+	ret = tg->bps[rw][td->limit_index];
+	if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_HIGH] != -1) {
+		uint64_t increase;
+
+		if (td->scale < 4096 && time_after_eq(jiffies,
+		    td->high_upgrade_time + td->scale * td->throtl_slice)) {
+			unsigned int time = jiffies - td->high_upgrade_time;
+
+			td->scale = time / td->throtl_slice;
+		}
+		increase = (tg->bps[rw][LIMIT_HIGH] >> 1) * td->scale;
+		ret = min(tg->bps[rw][LIMIT_MAX],
+			tg->bps[rw][LIMIT_HIGH] + increase);
+	}
+	if (ret == -1 && td->limit_index == LIMIT_HIGH)
 		return tg->bps[rw][LIMIT_MAX];
 
 	return ret;
@@ -238,12 +255,28 @@ static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)
 static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
 {
 	struct blkcg_gq *blkg = tg_to_blkg(tg);
+	struct throtl_data *td;
 	unsigned int ret;
 
 	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
 		return -1;
-	ret = tg->iops[rw][tg->td->limit_index];
-	if (ret == -1 && tg->td->limit_index == LIMIT_HIGH)
+	td = tg->td;
+	ret = tg->iops[rw][td->limit_index];
+	if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_HIGH] != -1) {
+		uint64_t increase;
+
+		if (td->scale < 4096 && time_after_eq(jiffies,
+		    td->high_upgrade_time + td->scale * td->throtl_slice)) {
+			unsigned int time = jiffies - td->high_upgrade_time;
+
+			td->scale = time / td->throtl_slice;
+		}
+
+		increase = (tg->iops[rw][LIMIT_HIGH] >> 1) * td->scale;
+		ret = min(tg->iops[rw][LIMIT_MAX],
+			tg->iops[rw][LIMIT_HIGH] + (unsigned int)increase);
+	}
+	if (ret == -1 && td->limit_index == LIMIT_HIGH)
 		return tg->iops[rw][LIMIT_MAX];
 	return ret;
 }
@@ -1676,6 +1709,7 @@ static void throtl_upgrade_state(struct throtl_data *td)
 
 	td->limit_index = LIMIT_MAX;
 	td->high_upgrade_time = jiffies;
+	td->scale = 0;
 	rcu_read_lock();
 	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
 		struct throtl_grp *tg = blkg_to_tg(blkg);
-- 
2.9.3

^ permalink raw reply related

* [PATCH V4 15/15] blk-throttle: add latency target support
From: Shaohua Li @ 2016-11-14 22:22 UTC (permalink / raw)
  To: linux-block, linux-kernel; +Cc: Kernel-team, axboe, tj, vgoyal
In-Reply-To: <cover.1479161136.git.shli@fb.com>

One hard problem adding .high limit is to detect idle cgroup. If one
cgroup doesn't dispatch enough IO against its high limit, we must have a
mechanism to determine if other cgroups dispatch more IO. We added the
think time detection mechanism before, but it doesn't work for all
workloads. Here we add a latency based approach.

We calculate the average request size and average latency of a cgroup.
Then we can calculate the target latency for the cgroup with the average
request size and the equation. In queue LIMIT_HIGH state, if a cgroup
doesn't dispatch enough IO against high limit but its average latency is
lower than its target latency, we treat the cgroup idle. In this case
other cgroups can dispatch more IO, eg, across their high limit.
Similarly in queue LIMIT_MAX state, if a cgroup doesn't dispatch enough
IO but its average latency is higher than its target latency, we treat
the cgroup busy. In this case, we should throttle other cgroups to make
the first cgroup's latency lower.

If cgroup's average request size is big (currently set to 128k), we
always treat the cgroup busy (the think time check is still effective
though).

Currently this latency target check is only for SSD as we can't
calculate the latency target for hard disk. And this is only for cgroup
leaf node so far.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 block/blk-throttle.c      | 58 ++++++++++++++++++++++++++++++++++++++++++++---
 include/linux/blk_types.h |  1 +
 2 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index ac4d9ea..d07f332 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -156,6 +156,12 @@ struct throtl_grp {
 	u64 last_finish_time;
 	u64 checked_last_finish_time;
 	u64 avg_ttime;
+
+	unsigned int bio_batch;
+	u64 total_latency;
+	u64 avg_latency;
+	u64 total_size;
+	u64 avg_size;
 };
 
 /* We measure latency for request size from 4k to 4k * ( 1 << 4) */
@@ -1734,12 +1740,30 @@ static unsigned long tg_last_high_overflow_time(struct throtl_grp *tg)
 	return ret;
 }
 
+static u64 throtl_target_latency(struct throtl_data *td,
+	struct throtl_grp *tg)
+{
+	if (td->line_slope == 0 || tg->latency_target == 0)
+		return 0;
+
+	/* latency_target + f(avg_size) - f(4k) */
+	return td->line_slope * ((tg->avg_size >> 10) - 4) +
+		tg->latency_target;
+}
+
 static bool throtl_tg_is_idle(struct throtl_grp *tg)
 {
-	/* cgroup is idle if average think time is more than threshold */
-	return ktime_get_ns() - tg->last_finish_time >
+	/*
+	 * cgroup is idle if:
+	 * 1. average think time is higher than threshold
+	 * 2. average request size is small and average latency is lower
+	 *    than target
+	 */
+	return (ktime_get_ns() - tg->last_finish_time >
 		4 * tg->td->idle_ttime_threshold ||
-	       tg->avg_ttime > tg->td->idle_ttime_threshold;
+		tg->avg_ttime > tg->td->idle_ttime_threshold) ||
+	       (tg->avg_latency && tg->avg_size && tg->avg_size <= 128 * 1024 &&
+		tg->avg_latency < throtl_target_latency(tg->td, tg));
 }
 
 static bool throtl_upgrade_check_one(struct throtl_grp *tg)
@@ -2123,6 +2147,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 	bio_associate_current(bio);
 	bio->bi_cg_private = q;
 	bio->bi_cg_size = bio_sectors(bio);
+	bio->bi_cg_enter_time = ktime_get_ns();
 
 	blk_throtl_update_ttime(tg);
 
@@ -2264,6 +2289,33 @@ void blk_throtl_bio_endio(struct bio *bio)
 		}
 	}
 
+	if (bio->bi_cg_enter_time && finish_time > bio->bi_cg_enter_time &&
+	    tg->latency_target) {
+		lat = finish_time - bio->bi_cg_enter_time;
+		tg->total_latency += lat;
+		tg->total_size += bio->bi_cg_size << 9;
+		tg->bio_batch++;
+	}
+
+	if (tg->bio_batch >= 8) {
+		int batch = tg->bio_batch;
+		u64 size = tg->total_size;
+
+		lat = tg->total_latency;
+
+		tg->bio_batch = 0;
+		tg->total_latency = 0;
+		tg->total_size = 0;
+
+		if (batch) {
+			do_div(lat, batch);
+			tg->avg_latency = (tg->avg_latency * 7 +
+				lat) >> 3;
+			do_div(size, batch);
+			tg->avg_size = (tg->avg_size * 7 + size) >> 3;
+		}
+	}
+
 end:
 	rcu_read_unlock();
 }
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 45bb437..fe87a20 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -61,6 +61,7 @@ struct bio {
 	struct cgroup_subsys_state *bi_css;
 	void *bi_cg_private;
 	u64 bi_cg_issue_time;
+	u64 bi_cg_enter_time;
 	sector_t bi_cg_size;
 #endif
 	union {
-- 
2.9.3

^ permalink raw reply related

* [PATCH V4 12/15] blk-throttle: ignore idle cgroup limit
From: Shaohua Li @ 2016-11-14 22:22 UTC (permalink / raw)
  To: linux-block, linux-kernel; +Cc: Kernel-team, axboe, tj, vgoyal
In-Reply-To: <cover.1479161136.git.shli@fb.com>

Last patch introduces a way to detect idle cgroup. We use it to make
upgrade/downgrade decision. And the new algorithm can detect completely
idle cgroup too, so we can delete the corresponding code.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 block/blk-throttle.c | 39 +++++++++++++++++++++++++--------------
 1 file changed, 25 insertions(+), 14 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index e403e88..01b494d 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -146,8 +146,7 @@ struct throtl_grp {
 
 	unsigned long last_check_time;
 
-	unsigned long last_dispatch_time[2];
-
+	int upgrade_check_batch;
 	/* When did we start a new slice */
 	unsigned long slice_start[2];
 	unsigned long slice_end[2];
@@ -487,8 +486,6 @@ static void throtl_pd_online(struct blkg_policy_data *pd)
 	 * Update has_rules[] after a new group is brought online.
 	 */
 	tg_update_has_rules(tg);
-	tg->last_dispatch_time[READ] = jiffies;
-	tg->last_dispatch_time[WRITE] = jiffies;
 }
 
 static void blk_throtl_update_valid_limit(struct throtl_data *td)
@@ -1667,9 +1664,8 @@ static bool throtl_upgrade_check_one(struct throtl_grp *tg)
 		return true;
 
 	if (time_after_eq(jiffies,
-	     tg->last_dispatch_time[READ] + tg->td->throtl_slice) &&
-	    time_after_eq(jiffies,
-	     tg->last_dispatch_time[WRITE] + tg->td->throtl_slice))
+		tg_last_high_overflow_time(tg) + tg->td->throtl_slice) &&
+	    throtl_tg_is_idle(tg))
 		return true;
 	return false;
 }
@@ -1718,6 +1714,24 @@ static bool throtl_can_upgrade(struct throtl_data *td,
 	return true;
 }
 
+static void throtl_upgrade_check(struct throtl_grp *tg)
+{
+	if (tg->td->limit_index != LIMIT_HIGH)
+		return;
+
+	if (!time_after_eq(jiffies,
+	     __tg_last_high_overflow_time(tg) + tg->td->throtl_slice))
+		return;
+
+	tg->upgrade_check_batch++;
+	if (tg->upgrade_check_batch < 16)
+		return;
+	tg->upgrade_check_batch = 0;
+
+	if (throtl_can_upgrade(tg->td, NULL))
+		throtl_upgrade_state(tg->td);
+}
+
 static void throtl_upgrade_state(struct throtl_data *td)
 {
 	struct cgroup_subsys_state *pos_css;
@@ -1752,18 +1766,15 @@ static bool throtl_downgrade_check_one(struct throtl_grp *tg)
 	struct throtl_data *td = tg->td;
 	unsigned long now = jiffies;
 
-	if (time_after_eq(now, tg->last_dispatch_time[READ] +
-					td->throtl_slice) &&
-	    time_after_eq(now, tg->last_dispatch_time[WRITE] +
-					td->throtl_slice))
-		return false;
 	/*
 	 * If cgroup is below high limit, consider downgrade and throttle other
 	 * cgroups
 	 */
 	if (time_after_eq(now, td->high_upgrade_time + td->throtl_slice) &&
 	    time_after_eq(now, tg_last_high_overflow_time(tg) +
-					td->throtl_slice))
+					td->throtl_slice) &&
+	    (!throtl_tg_is_idle(tg) ||
+	     !list_empty(&tg_to_blkg(tg)->blkcg->css.children)))
 		return true;
 	return false;
 }
@@ -1902,10 +1913,10 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 
 again:
 	while (true) {
-		tg->last_dispatch_time[rw] = jiffies;
 		if (tg->last_high_overflow_time[rw] == 0)
 			tg->last_high_overflow_time[rw] = jiffies;
 		throtl_downgrade_check(tg);
+		throtl_upgrade_check(tg);
 		/* throtl is FIFO - if bios are already queued, should queue */
 		if (sq->nr_queued[rw])
 			break;
-- 
2.9.3

^ permalink raw reply related

* [PATCH V4 13/15] blk-throttle: add a mechanism to estimate IO latency
From: Shaohua Li @ 2016-11-14 22:22 UTC (permalink / raw)
  To: linux-block, linux-kernel; +Cc: Kernel-team, axboe, tj, vgoyal
In-Reply-To: <cover.1479161136.git.shli@fb.com>

We try to set a latency target for each cgroup. The problem is latency
highly depends on request size, users can't configure the target for
every request size. The idea is users configure latency target for 4k
IO, we estimate the target latency for other request size IO.

To do this, we sample some data, eg, average latency for request size
4k, 8k, 16k, 32k, 64k. We then use an equation f(x) = a * x + b to fit
the data (x is request size in KB, f(x) is the latency). Then we can use
the equation to estimate IO target latency for any request.

To increase the chance of sampling, we actually collect data for any IO
size less than 64k, then calculate an average latency/size. This is ok
for line fit because the equation should work for average request
size/latency too.

But we shouldn't sample data at any time. If disk is congested, the
calculated data will not represent the disk's capability. Hence we only
do the sampling when block throttling is in the HIGH limit, with
assumption disk isn't congested in such state. If the assumption isn't
true, eg, high limit is too high, calculated latency target will be
higher.

How does the equation fit to actual data? I collected data from 4
different SSDs (one SATA, 3 NVMe). The error range is quite small. The
big difference between measured latency and calculated latency generally
comes from 4k IO. The biggest one has around 30% difference, which isn't
terrible as we don't need accurate latency target. We don't know if line
fit works for other SSDs though. For big request size latency, the error
range seems big. But this mechanism is to determine if we should
throttle IO (eg, if cgroup is idle). If cgroups average request size is
big, we can simply treat it as busy, hence we don't need the mechanism.

Hard disk is completely different. Latency depends on spindle seek
instead of request size. So this latency target feature is for SSD only.

The patch uses below algorithm to calculate the equation:
https://en.wikipedia.org/wiki/Simple_linear_regression

TODO: the latency sampling is better moving to request layer

Signed-off-by: Shaohua Li <shli@fb.com>
---
 block/blk-throttle.c      | 191 +++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/blk_types.h |   2 +
 2 files changed, 190 insertions(+), 3 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 01b494d..a05d351 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -156,6 +156,20 @@ struct throtl_grp {
 	u64 avg_ttime;
 };
 
+/* We measure latency for request size from 4k to 4k * ( 1 << 4) */
+#define LATENCY_BUCKET_SIZE 5
+
+struct latency_bucket {
+	u64 total_latency;
+	u64 total_size;
+	int samples;
+};
+
+struct avg_latency_bucket {
+	u64 latency;
+	u64 size;
+};
+
 struct throtl_data
 {
 	/* service tree for active throtl groups */
@@ -179,6 +193,12 @@ struct throtl_data
 	unsigned int scale;
 
 	u64 idle_ttime_threshold;
+
+	struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
+	struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
+	struct latency_bucket __percpu *latency_buckets;
+	s64 line_slope;
+	unsigned long last_calculate_time;
 };
 
 static void throtl_pending_timer_fn(unsigned long arg);
@@ -288,6 +308,19 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
 	return ret;
 }
 
+static int request_bucket_index(sector_t sectors)
+{
+	int i;
+
+	for (i = LATENCY_BUCKET_SIZE - 1; i >= 0; i--) {
+		if (sectors > (1 << (i + 3)))
+			break;
+	}
+	if (i == LATENCY_BUCKET_SIZE - 1)
+		return -1;
+	return i + 1;
+}
+
 /**
  * throtl_log - log debug message via blktrace
  * @sq: the service_queue being reported
@@ -1877,6 +1910,120 @@ static void blk_throtl_update_ttime(struct throtl_grp *tg)
 	tg->checked_last_finish_time = last_finish_time;
 }
 
+static void throtl_calculate_line_slope(struct throtl_data *td)
+{
+	struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
+	s64 sumX;
+	s64 sumY;
+	s64 sumXY;
+	s64 sumX2;
+	s64 xMean;
+	s64 yMean;
+	s64 denominator;
+	s64 slope;
+	int i, cpu;
+	int valid_lat;
+	u64 last_latency = 0;
+
+	if (!blk_queue_nonrot(td->queue))
+		return;
+	if (time_before(jiffies, td->last_calculate_time + HZ))
+		return;
+	td->last_calculate_time = jiffies;
+
+	memset(avg_latency, 0, sizeof(avg_latency));
+	for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+		struct latency_bucket *tmp = &td->tmp_buckets[i];
+
+		for_each_possible_cpu(cpu) {
+			struct latency_bucket *bucket;
+
+			bucket = per_cpu_ptr(td->latency_buckets, cpu);
+			tmp->total_latency += bucket[i].total_latency;
+			tmp->total_size += bucket[i].total_size;
+			tmp->samples += bucket[i].samples;
+			bucket[i].total_latency = 0;
+			bucket[i].total_size = 0;
+			bucket[i].samples = 0;
+		}
+
+		if (tmp->samples >= 32) {
+			u64 latency = tmp->total_latency;
+			u64 size = tmp->total_size;
+			int samples = tmp->samples;
+
+			tmp->total_latency = 0;
+			tmp->total_size = 0;
+			tmp->samples = 0;
+			do_div(size, samples);
+			if (size == 0 || size > (1 << (i + 12)))
+				continue;
+			avg_latency[i].size = size;
+			do_div(latency, samples);
+			if (latency == 0)
+				continue;
+			avg_latency[i].latency = latency;
+		}
+	}
+
+	valid_lat = 0;
+	for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+		if (!td->avg_buckets[i].latency && !avg_latency[i].latency)
+			continue;
+		valid_lat++;
+		if (!td->avg_buckets[i].latency) {
+			td->avg_buckets[i].latency = avg_latency[i].latency;
+			td->avg_buckets[i].size = avg_latency[i].size;
+			continue;
+		}
+		if (!avg_latency[i].latency)
+			continue;
+		/* make it smooth */
+		td->avg_buckets[i].latency = (td->avg_buckets[i].latency * 7 +
+			avg_latency[i].latency) >> 3;
+		td->avg_buckets[i].size = (td->avg_buckets[i].size * 7 +
+			avg_latency[i].size) >> 3;
+		/* filter out abnormal latency */
+		if (td->avg_buckets[i].latency <= last_latency) {
+			td->avg_buckets[i].latency = 0;
+			valid_lat--;
+		} else
+			last_latency = td->avg_buckets[i].latency;
+	}
+
+	if (valid_lat < 2)
+		return;
+
+	sumX = 0;
+	sumY = 0;
+	sumXY = 0;
+	sumX2 = 0;
+	for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+		u64 x, y;
+
+		if (td->avg_buckets[i].latency == 0)
+			continue;
+
+		x = td->avg_buckets[i].size >> 10;
+		y = td->avg_buckets[i].latency;
+		sumX += x;
+		sumY += y;
+
+		sumXY += x * y;
+		sumX2 += x * x;
+	}
+
+	xMean = sumX;
+	do_div(xMean, valid_lat);
+	yMean = sumY;
+	do_div(yMean, valid_lat);
+	denominator = sumX2 - sumX * xMean;
+
+	slope = sumXY - sumX * yMean;
+	do_div(slope, denominator);
+	td->line_slope = slope;
+}
+
 bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 		    struct bio *bio)
 {
@@ -1901,11 +2048,14 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 
 	spin_lock_irq(q->queue_lock);
 
+	throtl_calculate_line_slope(tg->td);
+
 	if (unlikely(blk_queue_bypass(q)))
 		goto out_unlock;
 
 	bio_associate_current(bio);
 	bio->bi_cg_private = q;
+	bio->bi_cg_size = bio_sectors(bio);
 
 	blk_throtl_update_ttime(tg);
 
@@ -1992,8 +2142,11 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 	 * don't want bios to leave with the flag set.  Clear the flag if
 	 * being issued.
 	 */
-	if (!throttled)
+	if (!throttled) {
+		if (blk_queue_nonrot(q))
+			bio->bi_cg_issue_time = ktime_get_ns();
 		bio->bi_opf &= ~REQ_THROTTLED;
+	}
 	return throttled;
 }
 
@@ -2003,6 +2156,9 @@ void blk_throtl_bio_endio(struct bio *bio)
 	struct blkcg_gq *blkg;
 	struct throtl_grp *tg;
 	struct request_queue *q;
+	struct throtl_data *td;
+	u64 finish_time;
+	u64 lat;
 
 	q = bio->bi_cg_private;
 	if (!q)
@@ -2019,7 +2175,27 @@ void blk_throtl_bio_endio(struct bio *bio)
 
 	tg = blkg_to_tg(blkg ?: q->root_blkg);
 
-	tg->last_finish_time = ktime_get_ns();
+	finish_time = ktime_get_ns();
+	tg->last_finish_time = finish_time;
+
+	td = tg->td;
+
+	if (bio->bi_cg_issue_time && finish_time > bio->bi_cg_issue_time) {
+		int index;
+
+		lat = finish_time - bio->bi_cg_issue_time;
+		index = request_bucket_index(bio->bi_cg_size);
+		if (index >= 0 && bio_op(bio) == REQ_OP_READ &&
+				td->limit_index == LIMIT_HIGH) {
+			struct latency_bucket *latency;
+
+			latency = get_cpu_ptr(td->latency_buckets);
+			latency[index].total_latency += lat;
+			latency[index].total_size += bio->bi_cg_size << 9;
+			latency[index].samples++;
+			put_cpu_ptr(td->latency_buckets);
+		}
+	}
 
 end:
 	rcu_read_unlock();
@@ -2097,6 +2273,12 @@ int blk_throtl_init(struct request_queue *q)
 	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
 	if (!td)
 		return -ENOMEM;
+	td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
+		LATENCY_BUCKET_SIZE, __alignof__(u64));
+	if (!td->latency_buckets) {
+		kfree(td);
+		return -ENOMEM;
+	}
 
 	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
 	throtl_service_queue_init(&td->service_queue);
@@ -2113,8 +2295,10 @@ int blk_throtl_init(struct request_queue *q)
 	td->idle_ttime_threshold = -1;
 	/* activate policy */
 	ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
-	if (ret)
+	if (ret) {
+		free_percpu(td->latency_buckets);
 		kfree(td);
+	}
 	return ret;
 }
 
@@ -2123,6 +2307,7 @@ void blk_throtl_exit(struct request_queue *q)
 	BUG_ON(!q->td);
 	throtl_shutdown_wq(q);
 	blkcg_deactivate_policy(q, &blkcg_policy_throtl);
+	free_percpu(q->td->latency_buckets);
 	kfree(q->td);
 }
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index ff8dd24..45bb437 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -60,6 +60,8 @@ struct bio {
 	struct io_context	*bi_ioc;
 	struct cgroup_subsys_state *bi_css;
 	void *bi_cg_private;
+	u64 bi_cg_issue_time;
+	sector_t bi_cg_size;
 #endif
 	union {
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
-- 
2.9.3

^ permalink raw reply related

* [PATCH V4 14/15] blk-throttle: add interface for per-cgroup target latency
From: Shaohua Li @ 2016-11-14 22:22 UTC (permalink / raw)
  To: linux-block, linux-kernel; +Cc: Kernel-team, axboe, tj, vgoyal
In-Reply-To: <cover.1479161136.git.shli@fb.com>

Add an interface for per-cgroup target latency. This latency is for a 4k
request.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 block/blk-throttle.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a05d351..ac4d9ea 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -147,6 +147,8 @@ struct throtl_grp {
 	unsigned long last_check_time;
 
 	int upgrade_check_batch;
+
+	u64 latency_target;
 	/* When did we start a new slice */
 	unsigned long slice_start[2];
 	unsigned long slice_end[2];
@@ -463,6 +465,7 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
 			tg->iops[rw][index] = -1;
 		}
 	}
+	/* target latency default 0, eg, always not meet */
 
 	return &tg->pd;
 }
@@ -1572,6 +1575,64 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 	return ret ?: nbytes;
 }
 
+static u64 tg_prfill_latency_target(struct seq_file *sf,
+	struct blkg_policy_data *pd, int off)
+{
+	struct throtl_grp *tg = pd_to_tg(pd);
+	const char *dname = blkg_dev_name(pd->blkg);
+
+	if (!dname)
+		return 0;
+	if (tg->latency_target == 0)
+		return 0;
+
+	seq_printf(sf, "%s 4k_lat=%llu\n", dname, tg->latency_target);
+	return 0;
+}
+
+static int tg_print_latency_target(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+		tg_prfill_latency_target, &blkcg_policy_throtl,
+		seq_cft(sf)->private, false);
+	return 0;
+}
+
+static ssize_t tg_set_latency_target(struct kernfs_open_file *of,
+				     char *buf, size_t nbytes, loff_t off)
+{
+	struct blkcg *blkcg = css_to_blkcg(of_css(of));
+	struct blkg_conf_ctx ctx;
+	struct throtl_grp *tg;
+	int ret = -EINVAL;
+	char tok[27];
+	char *p;
+	u64 val;
+
+	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
+	if (ret)
+		return ret;
+
+	tg = blkg_to_tg(ctx.blkg);
+
+	if (sscanf(ctx.body, "%26s", tok) != 1)
+		goto out_finish;
+
+	p = tok;
+	strsep(&p, "=");
+	if (!p || kstrtou64(p, 10, &val))
+		goto out_finish;
+
+	if (strcmp(tok, "4k_lat"))
+		goto out_finish;
+
+	tg->latency_target = val;
+	ret = 0;
+out_finish:
+	blkg_conf_finish(&ctx);
+	return ret ?: nbytes;
+}
+
 static struct cftype throtl_files[] = {
 	{
 		.name = "high",
@@ -1587,6 +1648,12 @@ static struct cftype throtl_files[] = {
 		.write = tg_set_limit,
 		.private = LIMIT_MAX,
 	},
+	{
+		.name = "latency_target",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = tg_print_latency_target,
+		.write = tg_set_latency_target,
+	},
 	{ }	/* terminate */
 };
 
-- 
2.9.3

^ permalink raw reply related

* Re: [PATCH] qla2xxx: do not abort all commands in the adapter during EEH recovery
From: Madhani, Himanshu @ 2016-11-14 22:07 UTC (permalink / raw)
  To: Mauricio Faria de Oliveira, qla2xxx-upstream@qlogic.com
  Cc: martin.petersen@oracle.com, jejb@linux.vnet.ibm.com,
	linux-scsi@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <1479158782-4544-1-git-send-email-mauricfo@linux.vnet.ibm.com>



On 11/14/16, 1:26 PM, "Mauricio Faria de Oliveira" <mauricfo@linux.vnet.ibm.com> wrote:

>The previous commit ("qla2xxx: fix invalid DMA access after command
>aborts in PCI device remove") introduced a regression during an EEH
>recovery, since the change to the qla2x00_abort_all_cmds() function
>calls qla2xxx_eh_abort(), which verifies the EEH recovery condition
>but handles it heavy-handed. (commit a465537ad1a4 "qla2xxx: Disable
>the adapter and skip error recovery in case of register disconnect.")
>
>This problem warrants a more general/optimistic solution right into
>qla2xxx_eh_abort()  (eg in case a real command abort arrives during
>EEH recovery, or if it takes long enough to trigger command aborts);
>but it's still worth to add a check to ensure the code added by the
>previous commit is correct and contained within its owner function.
>
>This commit just adds a 'if (!ha->flags.eeh_busy)' check around it.
>(ahem; a trivial fix for this -rc series; sorry for this oversight.)
>
>With it applied, both PCI device remove and EEH recovery works fine.
>
>Fixes: 1535aa75a3d8 ("scsi: qla2xxx: fix invalid DMA access after
>command aborts in PCI device remove")
>Signed-off-by: Mauricio Faria de Oliveira <mauricfo@linux.vnet.ibm.com>
>---
> drivers/scsi/qla2xxx/qla_os.c | 21 +++++++++++++--------
> 1 file changed, 13 insertions(+), 8 deletions(-)
>
>diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
>index 567fa080e261..56d6142852a5 100644
>--- a/drivers/scsi/qla2xxx/qla_os.c
>+++ b/drivers/scsi/qla2xxx/qla_os.c
>@@ -1456,15 +1456,20 @@ uint32_t qla2x00_isp_reg_stat(struct qla_hw_data *ha)
> 		for (cnt = 1; cnt < req->num_outstanding_cmds; cnt++) {
> 			sp = req->outstanding_cmds[cnt];
> 			if (sp) {
>-				/* Get a reference to the sp and drop the lock.
>-				 * The reference ensures this sp->done() call
>-				 * - and not the call in qla2xxx_eh_abort() -
>-				 * ends the SCSI command (with result 'res').
>+				/* Don't abort commands in adapter during EEH
>+				 * recovery as it's not accessible/responding.
> 				 */
>-				sp_get(sp);
>-				spin_unlock_irqrestore(&ha->hardware_lock, flags);
>-				qla2xxx_eh_abort(GET_CMD_SP(sp));
>-				spin_lock_irqsave(&ha->hardware_lock, flags);
>+				if (!ha->flags.eeh_busy) {
>+					/* Get a reference to the sp and drop the lock.
>+					 * The reference ensures this sp->done() call
>+					 * - and not the call in qla2xxx_eh_abort() -
>+					 * ends the SCSI command (with result 'res').
>+					 */
>+					sp_get(sp);
>+					spin_unlock_irqrestore(&ha->hardware_lock, flags);
>+					qla2xxx_eh_abort(GET_CMD_SP(sp));
>+					spin_lock_irqsave(&ha->hardware_lock, flags);
>+				}
> 				req->outstanding_cmds[cnt] = NULL;
> 				sp->done(vha, sp, res);
> 			}
>-- 
>1.8.3.1
>

Acked-by: Himanshu Madhani <himanshu.madhani@cavium.com>

Thanks,
Himanshu

^ permalink raw reply

* [PATCH V4 00/15] blk-throttle: add .high limit
From: Shaohua Li @ 2016-11-14 22:22 UTC (permalink / raw)
  To: linux-block, linux-kernel; +Cc: Kernel-team, axboe, tj, vgoyal

Hi,

The background is we don't have an ioscheduler for blk-mq yet, so we can't
prioritize processes/cgroups. This patch set tries to add basic arbitration
between cgroups with blk-throttle. It adds a new limit io.high for
blk-throttle. It's only for cgroup2.

io.max is hard-limit throttling: cgroups with a max limit never dispatch more
IO than their max limit. io.high, in contrast, is best-effort throttling:
cgroups with a high limit can run above their high limit at appropriate times.
Specifically, if all cgroups reach their high limit, all cgroups can run above
their high limit. If any cgroup runs under its high limit, all other cgroups
will run according to their high limit.

An example usage: we have a high prio cgroup with a high io.high limit and a
low prio cgroup with a low io.high limit. If the high prio cgroup isn't
running, the low prio cgroup can run above its high limit, so we don't waste
bandwidth. When the high prio cgroup runs and is below its high limit, the low
prio cgroup will run under its high limit. This protects the high prio cgroup
so that it gets more resources. If both cgroups reach their high limit, both
can run above their high limit (e.g., fully utilize disk bandwidth). None of
this can be done with the io.max limit.

The implementation is simple. The disk queue has 2 states LIMIT_HIGH and
LIMIT_MAX. In each disk state, we throttle cgroups according to the limit of
the state. That is io.high limit for LIMIT_HIGH state, io.max limit for
LIMIT_MAX. The disk state can be upgraded/downgraded between
LIMIT_HIGH/LIMIT_MAX according to the rule above. Initially disk state is
LIMIT_MAX. And if no cgroup sets io.high, the disk state will remain in
LIMIT_MAX state. Users with only io.max set will find nothing changed with the
patches.

The first 8 patches implement the basic framework: they add the interface and
handle the upgrade and downgrade logic. Patch 8 detects the special case where
a cgroup is completely idle; in this case, we ignore the cgroup's limit.
Patches 9-15 add more heuristics.

The basic framework has 2 major issues.
1. Fluctuation. When the state is upgraded from LIMIT_HIGH to LIMIT_MAX, a
cgroup's bandwidth can change dramatically, sometimes in unexpected ways.
For example, one cgroup's bandwidth will drop below its io.high limit very soon
after an upgrade. Patch 9 has more details about the issue.
2. Idle cgroup. A cgroup with an io.high limit doesn't always dispatch enough
IO. Under the upgrade rule above, the disk will remain in the LIMIT_HIGH state
and no other cgroup can dispatch IO above its high limit. Hence this is a waste
of disk bandwidth. Patch 10 has more details about the issue.

For issue 1, we make cgroup bandwidth increase smoothly after an upgrade. This
will reduce the chance that a cgroup's bandwidth rapidly drops under its high
limit. The smoothness means we could waste some bandwidth during the
transition, though. But we must pay something for sharing.

Issue 2 is very hard to solve. Patch 10 uses the 'think time check' idea
borrowed from CFQ to detect an idle cgroup. It's not perfect, e.g., it doesn't
work well for high IO depth workloads. But it's the best approach I have tried
so far, and in practice it works well. This definitely needs more tuning.

The big change in this version comes from patches 13-15. We add a latency
target for each cgroup. The goal is to solve issue 2. If a cgroup's average IO
latency exceeds its latency target, the cgroup is considered busy.

Please review, test and consider merging.

Thanks,
Shaohua

V3->V4:
- Add latency target for cgroup
- Fix bugs

V2->V3:
- Rebase
- Fix several bugs
- Make harddisk think time threshold bigger
http://marc.info/?l=linux-kernel&m=147552964708965&w=2

V1->V2:
- Drop io.low interface for simplicity and the interface isn't a must-have to
  prioritize cgroups.
- Remove the 'trial' logic, which creates too much fluctuation
- Add a new idle cgroup detection
- Other bug fixes and improvements
http://marc.info/?l=linux-block&m=147395674732335&w=2

V1:
http://marc.info/?l=linux-block&m=146292596425689&w=2


Shaohua Li (15):
  blk-throttle: prepare support multiple limits
  blk-throttle: add .high interface
  blk-throttle: configure bps/iops limit for cgroup in high limit
  blk-throttle: add upgrade logic for LIMIT_HIGH state
  blk-throttle: add downgrade logic
  blk-throttle: make sure expire time isn't too big
  blk-throttle: make throtl_slice tunable
  blk-throttle: detect completed idle cgroup
  blk-throttle: make bandwidth change smooth
  blk-throttle: add a simple idle detection
  blk-throttle: add interface to configure think time threshold
  blk-throttle: ignore idle cgroup limit
  blk-throttle: add a mechanism to estimate IO latency
  blk-throttle: add interface for per-cgroup target latency
  blk-throttle: add latency target support

 block/bio.c               |    2 +
 block/blk-sysfs.c         |   18 +
 block/blk-throttle.c      | 1035 ++++++++++++++++++++++++++++++++++++++++++---
 block/blk.h               |    9 +
 include/linux/blk_types.h |    4 +
 5 files changed, 1001 insertions(+), 67 deletions(-)

-- 
2.9.3

^ permalink raw reply

* [PATCH V4 05/15] blk-throttle: add downgrade logic
From: Shaohua Li @ 2016-11-14 22:22 UTC (permalink / raw)
  To: linux-block, linux-kernel; +Cc: Kernel-team, axboe, tj, vgoyal
In-Reply-To: <cover.1479161136.git.shli@fb.com>

When the queue state machine is in the LIMIT_MAX state, but a cgroup has been
below its high limit for some time, the queue should be downgraded to the
lower state, as that cgroup's high limit isn't met.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 block/blk-throttle.c | 188 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 188 insertions(+)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 34a75e5..d177252 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -136,6 +136,13 @@ struct throtl_grp {
 	/* Number of bio's dispatched in current slice */
 	unsigned int io_disp[2];
 
+	unsigned long last_high_overflow_time[2];
+
+	uint64_t last_bytes_disp[2];
+	unsigned int last_io_disp[2];
+
+	unsigned long last_check_time;
+
 	/* When did we start a new slice */
 	unsigned long slice_start[2];
 	unsigned long slice_end[2];
@@ -155,6 +162,9 @@ struct throtl_data
 	struct work_struct dispatch_work;
 	unsigned int limit_index;
 	bool limit_valid[LIMIT_CNT];
+
+	unsigned long high_upgrade_time;
+	unsigned long high_downgrade_time;
 };
 
 static void throtl_pending_timer_fn(unsigned long arg);
@@ -896,6 +906,8 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 	/* Charge the bio to the group */
 	tg->bytes_disp[rw] += bio->bi_iter.bi_size;
 	tg->io_disp[rw]++;
+	tg->last_bytes_disp[rw] += bio->bi_iter.bi_size;
+	tg->last_io_disp[rw]++;
 
 	/*
 	 * REQ_THROTTLED is used to prevent the same bio to be throttled
@@ -1510,6 +1522,65 @@ static struct blkcg_policy blkcg_policy_throtl = {
 	.pd_free_fn		= throtl_pd_free,
 };
 
+static unsigned long __tg_last_high_overflow_time(struct throtl_grp *tg)
+{
+	unsigned long rtime = -1, wtime = -1;
+
+	if (tg->bps[READ][LIMIT_HIGH] != -1 ||
+	    tg->iops[READ][LIMIT_HIGH] != -1 ||
+	    tg->bps[READ][LIMIT_MAX] != -1 ||
+	    tg->iops[READ][LIMIT_MAX] != -1)
+		rtime = tg->last_high_overflow_time[READ];
+	if (tg->bps[WRITE][LIMIT_HIGH] != -1 ||
+	    tg->iops[WRITE][LIMIT_HIGH] != -1 ||
+	    tg->bps[WRITE][LIMIT_MAX] != -1 ||
+	    tg->iops[WRITE][LIMIT_MAX] != -1)
+		wtime = tg->last_high_overflow_time[WRITE];
+	return min(rtime, wtime) == -1 ? 0 : min(rtime, wtime);
+}
+
+static unsigned long tg_last_high_overflow_time(struct throtl_grp *tg)
+{
+	struct throtl_service_queue *parent_sq;
+	struct throtl_grp *parent = tg;
+	unsigned long ret = __tg_last_high_overflow_time(tg);
+
+	while (true) {
+		parent_sq = parent->service_queue.parent_sq;
+		parent = sq_to_tg(parent_sq);
+		if (!parent)
+			break;
+		if (((parent->bps[READ][LIMIT_HIGH] != -1 &&
+		      parent->bps[READ][LIMIT_HIGH] >=
+		       tg->bps[READ][LIMIT_HIGH]) ||
+		     (parent->bps[READ][LIMIT_HIGH] == -1 &&
+		      parent->bps[READ][LIMIT_MAX] >=
+		       tg->bps[READ][LIMIT_HIGH])) &&
+		    ((parent->bps[WRITE][LIMIT_HIGH] != -1 &&
+		      parent->bps[WRITE][LIMIT_HIGH] >=
+		       tg->bps[WRITE][LIMIT_HIGH]) ||
+		     (parent->bps[WRITE][LIMIT_HIGH] == -1 &&
+		      parent->bps[WRITE][LIMIT_MAX] >=
+		       tg->bps[WRITE][LIMIT_HIGH])) &&
+		    ((parent->iops[READ][LIMIT_HIGH] != -1 &&
+		      parent->iops[READ][LIMIT_HIGH] >=
+		       tg->iops[READ][LIMIT_HIGH]) ||
+		     (parent->iops[READ][LIMIT_HIGH] == -1 &&
+		      parent->iops[READ][LIMIT_MAX] >=
+		       tg->iops[READ][LIMIT_HIGH])) &&
+		    ((parent->iops[WRITE][LIMIT_HIGH] != -1 &&
+		      parent->iops[WRITE][LIMIT_HIGH] >=
+		       tg->iops[WRITE][LIMIT_HIGH]) ||
+		     (parent->iops[WRITE][LIMIT_HIGH] == -1 &&
+		      parent->iops[WRITE][LIMIT_MAX] >=
+		       tg->iops[WRITE][LIMIT_HIGH])))
+			break;
+		if (time_after(__tg_last_high_overflow_time(parent), ret))
+			ret = __tg_last_high_overflow_time(parent);
+	}
+	return ret;
+}
+
 static bool throtl_upgrade_check_one(struct throtl_grp *tg)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
@@ -1557,6 +1628,9 @@ static bool throtl_can_upgrade(struct throtl_data *td,
 	if (td->limit_index != LIMIT_HIGH)
 		return false;
 
+	if (time_before(jiffies, td->high_downgrade_time + throtl_slice))
+		return false;
+
 	rcu_read_lock();
 	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
 		struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -1580,6 +1654,7 @@ static void throtl_upgrade_state(struct throtl_data *td)
 	struct blkcg_gq *blkg;
 
 	td->limit_index = LIMIT_MAX;
+	td->high_upgrade_time = jiffies;
 	rcu_read_lock();
 	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
 		struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -1595,6 +1670,111 @@ static void throtl_upgrade_state(struct throtl_data *td)
 	queue_work(kthrotld_workqueue, &td->dispatch_work);
 }
 
+static void throtl_downgrade_state(struct throtl_data *td, int new)
+{
+	td->limit_index = new;
+	td->high_downgrade_time = jiffies;
+}
+
+static bool throtl_downgrade_check_one(struct throtl_grp *tg)
+{
+	struct throtl_data *td = tg->td;
+	unsigned long now = jiffies;
+
+	/*
+	 * If cgroup is below high limit, consider downgrade and throttle other
+	 * cgroups
+	 */
+	if (time_after_eq(now, td->high_upgrade_time + throtl_slice) &&
+	    time_after_eq(now, tg_last_high_overflow_time(tg) + throtl_slice))
+		return true;
+	return false;
+}
+
+static bool throtl_downgrade_check_hierarchy(struct throtl_grp *tg)
+{
+	if (!throtl_downgrade_check_one(tg))
+		return false;
+	while (true) {
+		if (!tg || (cgroup_subsys_on_dfl(io_cgrp_subsys) &&
+			    !tg_to_blkg(tg)->parent))
+			break;
+
+		if (!throtl_downgrade_check_one(tg))
+			return false;
+		tg = sq_to_tg(tg->service_queue.parent_sq);
+	}
+	return true;
+}
+
+static void throtl_downgrade_check(struct throtl_grp *tg)
+{
+	uint64_t bps;
+	unsigned int iops;
+	unsigned long elapsed_time;
+	unsigned long now = jiffies;
+
+	if (tg->td->limit_index != LIMIT_MAX ||
+	    !tg->td->limit_valid[LIMIT_HIGH])
+		return;
+	if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
+		return;
+	if (time_after(tg->last_check_time + throtl_slice, now))
+		return;
+
+	if (time_before(now, tg_last_high_overflow_time(tg) + throtl_slice))
+		return;
+
+	elapsed_time = now - tg->last_check_time;
+	tg->last_check_time = now;
+
+	if (tg->bps[READ][LIMIT_HIGH] != -1 ||
+	    tg->bps[READ][LIMIT_MAX] != -1) {
+		bps = tg->last_bytes_disp[READ] * HZ;
+		do_div(bps, elapsed_time);
+		if (bps >= tg->bps[READ][LIMIT_HIGH] ||
+		    bps >= tg->bps[READ][LIMIT_MAX])
+			tg->last_high_overflow_time[READ] = now;
+	}
+
+	if (tg->bps[WRITE][LIMIT_HIGH] != -1 ||
+	    tg->bps[WRITE][LIMIT_MAX] != -1) {
+		bps = tg->last_bytes_disp[WRITE] * HZ;
+		do_div(bps, elapsed_time);
+		if (bps >= tg->bps[WRITE][LIMIT_HIGH] ||
+		    bps >= tg->bps[WRITE][LIMIT_MAX])
+			tg->last_high_overflow_time[WRITE] = now;
+	}
+
+	if (tg->iops[READ][LIMIT_HIGH] != -1 ||
+	    tg->iops[READ][LIMIT_MAX] != -1) {
+		iops = tg->last_io_disp[READ] * HZ / elapsed_time;
+		if (iops >= tg->iops[READ][LIMIT_HIGH] ||
+		    iops >= tg->iops[READ][LIMIT_MAX])
+			tg->last_high_overflow_time[READ] = now;
+	}
+
+	if (tg->iops[WRITE][LIMIT_HIGH] != -1 ||
+	    tg->iops[WRITE][LIMIT_MAX] != -1) {
+		iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
+		if (iops >= tg->iops[WRITE][LIMIT_HIGH] ||
+		    iops >= tg->iops[WRITE][LIMIT_MAX])
+			tg->last_high_overflow_time[WRITE] = now;
+	}
+
+	/*
+	 * If cgroup is below high limit, consider downgrade and throttle other
+	 * cgroups
+	 */
+	if (throtl_downgrade_check_hierarchy(tg))
+		throtl_downgrade_state(tg->td, LIMIT_HIGH);
+
+	tg->last_bytes_disp[READ] = 0;
+	tg->last_bytes_disp[WRITE] = 0;
+	tg->last_io_disp[READ] = 0;
+	tg->last_io_disp[WRITE] = 0;
+}
+
 bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 		    struct bio *bio)
 {
@@ -1619,12 +1799,16 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 
 again:
 	while (true) {
+		if (tg->last_high_overflow_time[rw] == 0)
+			tg->last_high_overflow_time[rw] = jiffies;
+		throtl_downgrade_check(tg);
 		/* throtl is FIFO - if bios are already queued, should queue */
 		if (sq->nr_queued[rw])
 			break;
 
 		/* if above limits, break to queue */
 		if (!tg_may_dispatch(tg, bio, NULL)) {
+			tg->last_high_overflow_time[rw] = jiffies;
 			if (throtl_can_upgrade(tg->td, tg)) {
 				throtl_upgrade_state(tg->td);
 				goto again;
@@ -1668,6 +1852,8 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 		   tg->io_disp[rw], tg_iops_limit(tg, rw),
 		   sq->nr_queued[READ], sq->nr_queued[WRITE]);
 
+	tg->last_high_overflow_time[rw] = jiffies;
+
 	bio_associate_current(bio);
 	tg->td->nr_queued[rw]++;
 	throtl_add_bio_tg(bio, qn, tg);
@@ -1778,6 +1964,8 @@ int blk_throtl_init(struct request_queue *q)
 
 	td->limit_valid[LIMIT_MAX] = true;
 	td->limit_index = LIMIT_MAX;
+	td->high_upgrade_time = jiffies;
+	td->high_downgrade_time = jiffies;
 	/* activate policy */
 	ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
 	if (ret)
-- 
2.9.3

^ permalink raw reply related

* [PATCH V4 02/15] blk-throttle: add .high interface
From: Shaohua Li @ 2016-11-14 22:22 UTC (permalink / raw)
  To: linux-block, linux-kernel; +Cc: Kernel-team, axboe, tj, vgoyal
In-Reply-To: <cover.1479161136.git.shli@fb.com>

Add a high limit for cgroups and the corresponding cgroup interface.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 block/blk-throttle.c | 132 ++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 103 insertions(+), 29 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 925aa1ed..a564215 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -84,6 +84,7 @@ enum tg_state_flags {
 #define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)
 
 enum {
+	LIMIT_HIGH,
 	LIMIT_MAX,
 	LIMIT_CNT,
 };
@@ -414,6 +415,46 @@ static void throtl_pd_online(struct blkg_policy_data *pd)
 	tg_update_has_rules(pd_to_tg(pd));
 }
 
+static void blk_throtl_update_valid_limit(struct throtl_data *td)
+{
+	struct cgroup_subsys_state *pos_css;
+	struct blkcg_gq *blkg;
+	bool high_valid = false;
+
+	rcu_read_lock();
+	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
+		struct throtl_grp *tg = blkg_to_tg(blkg);
+
+		if (tg->bps[READ][LIMIT_HIGH] != -1 ||
+		    tg->bps[WRITE][LIMIT_HIGH] != -1 ||
+		    tg->iops[READ][LIMIT_HIGH] != -1 ||
+		    tg->iops[WRITE][LIMIT_HIGH] != -1)
+			high_valid = true;
+	}
+	rcu_read_unlock();
+
+	if (high_valid)
+		td->limit_valid[LIMIT_HIGH] = true;
+	else
+		td->limit_valid[LIMIT_HIGH] = false;
+}
+
+static void throtl_pd_offline(struct blkg_policy_data *pd)
+{
+	struct throtl_grp *tg = pd_to_tg(pd);
+
+	tg->bps[READ][LIMIT_HIGH] = -1;
+	tg->bps[WRITE][LIMIT_HIGH] = -1;
+	tg->iops[READ][LIMIT_HIGH] = -1;
+	tg->iops[WRITE][LIMIT_HIGH] = -1;
+
+	blk_throtl_update_valid_limit(tg->td);
+
+	if (tg->td->limit_index == LIMIT_HIGH &&
+	    !tg->td->limit_valid[LIMIT_HIGH])
+		tg->td->limit_index = LIMIT_MAX;
+}
+
 static void throtl_pd_free(struct blkg_policy_data *pd)
 {
 	struct throtl_grp *tg = pd_to_tg(pd);
@@ -1283,7 +1324,7 @@ static struct cftype throtl_legacy_files[] = {
 	{ }	/* terminate */
 };
 
-static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd,
+static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
 			 int off)
 {
 	struct throtl_grp *tg = pd_to_tg(pd);
@@ -1292,36 +1333,32 @@ static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd,
 
 	if (!dname)
 		return 0;
-	if (tg->bps[READ][LIMIT_MAX] == -1 && tg->bps[WRITE][LIMIT_MAX] == -1 &&
-	    tg->iops[READ][LIMIT_MAX] == -1 && tg->iops[WRITE][LIMIT_MAX] == -1)
+	if (tg->bps[READ][off] == -1 && tg->bps[WRITE][off] == -1 &&
+	    tg->iops[READ][off] == -1 && tg->iops[WRITE][off] == -1)
 		return 0;
 
-	if (tg->bps[READ][LIMIT_MAX] != -1)
-		snprintf(bufs[0], sizeof(bufs[0]), "%llu",
-			tg->bps[READ][LIMIT_MAX]);
-	if (tg->bps[WRITE][LIMIT_MAX] != -1)
-		snprintf(bufs[1], sizeof(bufs[1]), "%llu",
-			tg->bps[WRITE][LIMIT_MAX]);
-	if (tg->iops[READ][LIMIT_MAX] != -1)
-		snprintf(bufs[2], sizeof(bufs[2]), "%u",
-			tg->iops[READ][LIMIT_MAX]);
-	if (tg->iops[WRITE][LIMIT_MAX] != -1)
-		snprintf(bufs[3], sizeof(bufs[3]), "%u",
-			tg->iops[WRITE][LIMIT_MAX]);
+	if (tg->bps[READ][off] != -1)
+		snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ][off]);
+	if (tg->bps[WRITE][off] != -1)
+		snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE][off]);
+	if (tg->iops[READ][off] != -1)
+		snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ][off]);
+	if (tg->iops[WRITE][off] != -1)
+		snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE][off]);
 
 	seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n",
 		   dname, bufs[0], bufs[1], bufs[2], bufs[3]);
 	return 0;
 }
 
-static int tg_print_max(struct seq_file *sf, void *v)
+static int tg_print_limit(struct seq_file *sf, void *v)
 {
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max,
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit,
 			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
 	return 0;
 }
 
-static ssize_t tg_set_max(struct kernfs_open_file *of,
+static ssize_t tg_set_limit(struct kernfs_open_file *of,
 			  char *buf, size_t nbytes, loff_t off)
 {
 	struct blkcg *blkcg = css_to_blkcg(of_css(of));
@@ -1329,6 +1366,7 @@ static ssize_t tg_set_max(struct kernfs_open_file *of,
 	struct throtl_grp *tg;
 	u64 v[4];
 	int ret;
+	int index = of_cft(of)->private;
 
 	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
 	if (ret)
@@ -1336,10 +1374,10 @@ static ssize_t tg_set_max(struct kernfs_open_file *of,
 
 	tg = blkg_to_tg(ctx.blkg);
 
-	v[0] = tg->bps[READ][LIMIT_MAX];
-	v[1] = tg->bps[WRITE][LIMIT_MAX];
-	v[2] = tg->iops[READ][LIMIT_MAX];
-	v[3] = tg->iops[WRITE][LIMIT_MAX];
+	v[0] = tg->bps[READ][index];
+	v[1] = tg->bps[WRITE][index];
+	v[2] = tg->iops[READ][index];
+	v[3] = tg->iops[WRITE][index];
 
 	while (true) {
 		char tok[27];	/* wiops=18446744073709551616 */
@@ -1376,11 +1414,37 @@ static ssize_t tg_set_max(struct kernfs_open_file *of,
 			goto out_finish;
 	}
 
-	tg->bps[READ][LIMIT_MAX] = v[0];
-	tg->bps[WRITE][LIMIT_MAX] = v[1];
-	tg->iops[READ][LIMIT_MAX] = v[2];
-	tg->iops[WRITE][LIMIT_MAX] = v[3];
-
+	if (index == LIMIT_MAX) {
+		if ((v[0] < tg->bps[READ][LIMIT_HIGH] &&
+		       tg->bps[READ][LIMIT_HIGH] != -1) ||
+		    (v[1] < tg->bps[WRITE][LIMIT_HIGH] &&
+		       tg->bps[WRITE][LIMIT_HIGH] != -1) ||
+		    (v[2] < tg->iops[READ][LIMIT_HIGH] &&
+		       tg->iops[READ][LIMIT_HIGH] != -1) ||
+		    (v[3] < tg->iops[WRITE][LIMIT_HIGH] &&
+		       tg->iops[WRITE][LIMIT_HIGH] != -1)) {
+			ret = -EINVAL;
+			goto out_finish;
+		}
+	} else if (index == LIMIT_HIGH) {
+		if ((v[0] > tg->bps[READ][LIMIT_MAX] && v[0] != -1) ||
+		    (v[1] > tg->bps[WRITE][LIMIT_MAX] && v[1] != -1) ||
+		    (v[2] > tg->iops[READ][LIMIT_MAX] && v[2] != -1) ||
+		    (v[3] > tg->iops[WRITE][LIMIT_MAX] && v[3] != -1)) {
+			ret = -EINVAL;
+			goto out_finish;
+		}
+	}
+	tg->bps[READ][index] = v[0];
+	tg->bps[WRITE][index] = v[1];
+	tg->iops[READ][index] = v[2];
+	tg->iops[WRITE][index] = v[3];
+
+	if (index == LIMIT_HIGH) {
+		blk_throtl_update_valid_limit(tg->td);
+		if (tg->td->limit_valid[LIMIT_HIGH])
+			tg->td->limit_index = LIMIT_HIGH;
+	}
 	tg_conf_updated(tg);
 	ret = 0;
 out_finish:
@@ -1390,10 +1454,18 @@ static ssize_t tg_set_max(struct kernfs_open_file *of,
 
 static struct cftype throtl_files[] = {
 	{
+		.name = "high",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = tg_print_limit,
+		.write = tg_set_limit,
+		.private = LIMIT_HIGH,
+	},
+	{
 		.name = "max",
 		.flags = CFTYPE_NOT_ON_ROOT,
-		.seq_show = tg_print_max,
-		.write = tg_set_max,
+		.seq_show = tg_print_limit,
+		.write = tg_set_limit,
+		.private = LIMIT_MAX,
 	},
 	{ }	/* terminate */
 };
@@ -1412,6 +1484,7 @@ static struct blkcg_policy blkcg_policy_throtl = {
 	.pd_alloc_fn		= throtl_pd_alloc,
 	.pd_init_fn		= throtl_pd_init,
 	.pd_online_fn		= throtl_pd_online,
+	.pd_offline_fn		= throtl_pd_offline,
 	.pd_free_fn		= throtl_pd_free,
 };
 
@@ -1591,6 +1664,7 @@ int blk_throtl_init(struct request_queue *q)
 	td->queue = q;
 
 	td->limit_valid[LIMIT_MAX] = true;
+	td->limit_index = LIMIT_MAX;
 	/* activate policy */
 	ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
 	if (ret)
-- 
2.9.3

^ permalink raw reply related

* [PATCH V4 03/15] blk-throttle: configure bps/iops limit for cgroup in high limit
From: Shaohua Li @ 2016-11-14 22:22 UTC (permalink / raw)
  To: linux-block, linux-kernel; +Cc: Kernel-team, axboe, tj, vgoyal
In-Reply-To: <cover.1479161136.git.shli@fb.com>

Each queue will have a state machine. Initially the queue is in the LIMIT_HIGH
state, which means all cgroups will be throttled according to their high
limit. After all cgroups with a high limit cross that limit, the queue state
gets upgraded to the LIMIT_MAX state.
Cgroups without a high limit will use their max limit as their high limit.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 block/blk-throttle.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a564215..ec53671 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -208,12 +208,29 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
 
 static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)
 {
-	return tg->bps[rw][tg->td->limit_index];
+	struct blkcg_gq *blkg = tg_to_blkg(tg);
+	uint64_t ret;
+
+	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
+		return -1;
+	ret = tg->bps[rw][tg->td->limit_index];
+	if (ret == -1 && tg->td->limit_index == LIMIT_HIGH)
+		return tg->bps[rw][LIMIT_MAX];
+
+	return ret;
 }
 
 static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
 {
-	return tg->iops[rw][tg->td->limit_index];
+	struct blkcg_gq *blkg = tg_to_blkg(tg);
+	unsigned int ret;
+
+	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
+		return -1;
+	ret = tg->iops[rw][tg->td->limit_index];
+	if (ret == -1 && tg->td->limit_index == LIMIT_HIGH)
+		return tg->iops[rw][LIMIT_MAX];
+	return ret;
 }
 
 /**
-- 
2.9.3

^ permalink raw reply related

* Re: [PATCH v6 6/9] tpm: fix the missing .owner in tpm_bios_measurements_ops
From: Jarkko Sakkinen @ 2016-11-14 22:22 UTC (permalink / raw)
  To: Nayna Jain
  Cc: tpmdd-devel, peterhuewe, tpmdd, jgunthorpe, linux-kernel,
	linux-security-module
In-Reply-To: <1479117656-12403-7-git-send-email-nayna@linux.vnet.ibm.com>

On Mon, Nov 14, 2016 at 05:00:53AM -0500, Nayna Jain wrote:
> This patch fixes the missing .owner field in
> tpm_bios_measurements_ops definition.
> 
> Suggested-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
> Signed-off-by: Nayna Jain <nayna@linux.vnet.ibm.com>

Reviewed-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>

/Jarkko

> ---
>  drivers/char/tpm/tpm_eventlog.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/char/tpm/tpm_eventlog.c b/drivers/char/tpm/tpm_eventlog.c
> index f8c42fe..5575ffc 100644
> --- a/drivers/char/tpm/tpm_eventlog.c
> +++ b/drivers/char/tpm/tpm_eventlog.c
> @@ -349,6 +349,7 @@ static int tpm_bios_measurements_open(struct inode *inode,
>  }
>  
>  static const struct file_operations tpm_bios_measurements_ops = {
> +	.owner = THIS_MODULE,
>  	.open = tpm_bios_measurements_open,
>  	.read = seq_read,
>  	.llseek = seq_lseek,
> -- 
> 2.5.0
> 

^ permalink raw reply

* Re: [RESEND/PATCH v6 3/3] clk: qcom: Add A53 clock driver
From: Stephen Boyd @ 2016-11-14 22:21 UTC (permalink / raw)
  To: Georgi Djakov
  Cc: Bjorn Andersson, mturquette, linux-clk, linux-kernel,
	linux-arm-msm, devicetree, Rob Herring
In-Reply-To: <549f87fe-7be9-14b4-8e34-86f7f8dad94e@linaro.org>

On 11/11, Georgi Djakov wrote:
> On 11/03/2016 08:28 PM, Bjorn Andersson wrote:
> >On Wed 02 Nov 15:55 PDT 2016, Stephen Boyd wrote:
> >
> >>On 11/02, Bjorn Andersson wrote:
> >>>On Thu 27 Oct 18:54 PDT 2016, Stephen Boyd wrote:
> >>>
> >>>>On 10/19, Georgi Djakov wrote:
> >>>>>Add a driver for the A53 Clock Controller. It is a hardware block that
> >>>>>implements a combined mux and half integer divider functionality. It can
> >>>>>choose between a fixed-rate clock or the dedicated A53 PLL. The source
> >>>>>and the divider can be set both at the same time.
> >>>>>
> >>>>>This is required for enabling CPU frequency scaling on platforms like
> >>>>>MSM8916.
> >>>>>
> >>>>
> >>>>Please Cc DT reviewers.
> >>>>
> >>>>>Signed-off-by: Georgi Djakov <georgi.djakov@linaro.org>
> >>>>>---
> >>>>> .../devicetree/bindings/clock/qcom,a53cc.txt       |  22 +++
> >>>>> drivers/clk/qcom/Kconfig                           |   8 ++
> >>>>> drivers/clk/qcom/Makefile                          |   1 +
> >>>>> drivers/clk/qcom/a53cc.c                           | 155 +++++++++++++++++++++
> >>>>> 4 files changed, 186 insertions(+)
> >>>>> create mode 100644 Documentation/devicetree/bindings/clock/qcom,a53cc.txt
> >>>>> create mode 100644 drivers/clk/qcom/a53cc.c
> >>>>>
> >>>>>diff --git a/Documentation/devicetree/bindings/clock/qcom,a53cc.txt b/Documentation/devicetree/bindings/clock/qcom,a53cc.txt
> >>>>>new file mode 100644
> >>>>>index 000000000000..a025f062f177
> >>>>>--- /dev/null
> >>>>>+++ b/Documentation/devicetree/bindings/clock/qcom,a53cc.txt
> >>>>>@@ -0,0 +1,22 @@
> >>>>>+Qualcomm A53 CPU Clock Controller Binding
> >>>>>+------------------------------------------------
> >>>>>+The A53 CPU Clock Controller is hardware, which provides a combined
> >>>>>+mux and divider functionality for the CPU clocks. It can choose between
> >>>>>+a fixed rate clock and the dedicated A53 PLL.
> >>>>>+
> >>>>>+Required properties :
> >>>>>+- compatible : shall contain:
> >>>>>+
> >>>>>+			"qcom,a53cc"
> >>>>>+
> >>>>>+- reg : shall contain base register location and length
> >>>>>+	of the APCS region
> >>>>>+- #clock-cells : shall contain 1
> >>>>>+
> >>>>>+Example:
> >>>>>+
> >>>>>+	apcs: syscon@b011000 {
> >>>>>+		compatible = "qcom,a53cc", "syscon";
> >>>>
> >>>>Why is it a syscon? Is that part used?
> >>>>
> >>>
> >>>I use the register at offset 8 for interrupting the other subsystems, so
> >>>this must be available as something I can poke.
> >>>
> >>>Which makes me think that this should be described as a "simple-mfd" and
> >>>"syscon" with the a53cc node as a child - grabbing the regmap of the
> >>>syscon parent, rather than ioremapping the same region again.
> >>>
> >>
> >>That's sort of a question for DT reviewers. The register space
> >>certainly seems like a free for all with a tilt toward power
> >>management of the CPU, similar to how this was done on Krait
> >>based designs.
> >>
> >
> >Right. But this kind of mashup block was the reason why simple-mfd was
> >put in place.
> >
> 
> Ok, thanks for the comments. Then I will make it look like this:
> 
> 	apcs: syscon@b011000 {
> 		compatible = "syscon", "simple-mfd";
> 		reg = <0x0b011000 0x1000>;
> 
> 		a53mux: clock {
> 			compatible = "qcom,msm8916-a53cc";
> 			#clock-cells = <1>;
> 		};
> 	};
> 
> Thanks,
> Georgi
> 
> >>I wonder why we didn't make up some provider/consumer binding for
> >>the "kicking" feature used by SMD/RPM code. Then this could be a
> >>clock provider and a "kick" provider (haha #kick-cells) and the
> >>usage of syscon/regmap wouldn't be mandatory.
> >>
> >
> >I did consider doing that, but had enough dependencies to put in place
> >as it was.
> >
> >I'm in favour of us inventing a kicker API and it's found outside our
> >use cases as well (e.g. virtio/rpmsg).
> >

I'd rather we did this kicker API as well. That way we don't need
to make a syscon and a simple-mfd to get software to work
properly. Don't other silicon vendors need a kicker API as well?
How are they kicking remote processors in other places? GPIOs?

-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project

^ permalink raw reply

* Re: [PATCH v6 2/9] tpm: replace symbolic permission with octal for securityfs files
From: Jarkko Sakkinen @ 2016-11-14 22:21 UTC (permalink / raw)
  To: Nayna Jain
  Cc: tpmdd-devel, peterhuewe, tpmdd, jgunthorpe, linux-kernel,
	linux-security-module
In-Reply-To: <1479117656-12403-3-git-send-email-nayna@linux.vnet.ibm.com>

On Mon, Nov 14, 2016 at 05:00:49AM -0500, Nayna Jain wrote:
> checkpatch.pl flags warning for symbolic permissions and suggests
> to replace with octal value.
> 
> This patch changes securityfs pseudo files permission
> to octal values in tpm_bios_log_setup().
> 
> Signed-off-by: Nayna Jain <nayna@linux.vnet.ibm.com>

Reviewed-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>

/Jarkko

> ---
>  drivers/char/tpm/tpm_eventlog.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/char/tpm/tpm_eventlog.c b/drivers/char/tpm/tpm_eventlog.c
> index 42b49c4..9467e31 100644
> --- a/drivers/char/tpm/tpm_eventlog.c
> +++ b/drivers/char/tpm/tpm_eventlog.c
> @@ -378,7 +378,7 @@ struct dentry **tpm_bios_log_setup(const char *name)
>  
>  	bin_file =
>  	    securityfs_create_file("binary_bios_measurements",
> -				   S_IRUSR | S_IRGRP, tpm_dir,
> +				   0440, tpm_dir,
>  				   (void *)&tpm_binary_b_measurements_seqops,
>  				   &tpm_bios_measurements_ops);
>  	if (is_bad(bin_file))
> @@ -386,7 +386,7 @@ struct dentry **tpm_bios_log_setup(const char *name)
>  
>  	ascii_file =
>  	    securityfs_create_file("ascii_bios_measurements",
> -				   S_IRUSR | S_IRGRP, tpm_dir,
> +				   0440, tpm_dir,
>  				   (void *)&tpm_ascii_b_measurements_seqops,
>  				   &tpm_bios_measurements_ops);
>  	if (is_bad(ascii_file))
> -- 
> 2.5.0
> 

^ permalink raw reply

* [PATCH v1 0/3] x86: SVM: add additional SVM NPF error and use HW GPA
From: Brijesh Singh @ 2016-11-14 22:04 UTC (permalink / raw)
  To: kvm; +Cc: rkrcmar, joro, x86, linux-kernel, mingo, hpa, pbonzini, tglx

This patch series is taken from SEV RFC series [1]. These patches do not
depend on the SEV feature and can be reviewed and merged on their own.

- Add support for additional SVM NPF error codes
- Add kvm_fast_pio_in support
- Use the hardware provided GPA instead of page walk

[1] http://marc.info/?l=linux-mm&m=147190814023863&w=2

Tom Lendacky (3):
      kvm: svm: Add support for additional SVM NPF error codes
      kvm: svm: Add kvm_fast_pio_in support
      kvm: svm: Use the hardware provided GPA instead of page walk


 arch/x86/include/asm/kvm_emulate.h |    3 ++
 arch/x86/include/asm/kvm_host.h    |   15 ++++++++-
 arch/x86/kvm/mmu.c                 |   20 +++++++++++-
 arch/x86/kvm/svm.c                 |   16 +++++++---
 arch/x86/kvm/x86.c                 |   60 +++++++++++++++++++++++++++++++++++-
 5 files changed, 106 insertions(+), 8 deletions(-)

-- 

Brijesh Singh

^ permalink raw reply

* Re: [PATCH v6 2/9] tpm: replace symbolic permission with octal for securityfs files
From: Jarkko Sakkinen @ 2016-11-14 22:18 UTC (permalink / raw)
  To: Nayna Jain
  Cc: tpmdd-devel, peterhuewe, tpmdd, jgunthorpe, linux-kernel,
	linux-security-module
In-Reply-To: <1479117656-12403-3-git-send-email-nayna@linux.vnet.ibm.com>

On Mon, Nov 14, 2016 at 05:00:49AM -0500, Nayna Jain wrote:
> checkpatch.pl flags warning for symbolic permissions and suggests
> to replace with octal value.
> 
> This patch changes securityfs pseudo files permission
> to octal values in tpm_bios_log_setup().
> 
> Signed-off-by: Nayna Jain <nayna@linux.vnet.ibm.com>

Reviewed-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>

/Jarkko

> ---
>  drivers/char/tpm/tpm_eventlog.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/char/tpm/tpm_eventlog.c b/drivers/char/tpm/tpm_eventlog.c
> index 42b49c4..9467e31 100644
> --- a/drivers/char/tpm/tpm_eventlog.c
> +++ b/drivers/char/tpm/tpm_eventlog.c
> @@ -378,7 +378,7 @@ struct dentry **tpm_bios_log_setup(const char *name)
>  
>  	bin_file =
>  	    securityfs_create_file("binary_bios_measurements",
> -				   S_IRUSR | S_IRGRP, tpm_dir,
> +				   0440, tpm_dir,
>  				   (void *)&tpm_binary_b_measurements_seqops,
>  				   &tpm_bios_measurements_ops);
>  	if (is_bad(bin_file))
> @@ -386,7 +386,7 @@ struct dentry **tpm_bios_log_setup(const char *name)
>  
>  	ascii_file =
>  	    securityfs_create_file("ascii_bios_measurements",
> -				   S_IRUSR | S_IRGRP, tpm_dir,
> +				   0440, tpm_dir,
>  				   (void *)&tpm_ascii_b_measurements_seqops,
>  				   &tpm_bios_measurements_ops);
>  	if (is_bad(ascii_file))
> -- 
> 2.5.0
> 

^ permalink raw reply

* [PATCH v1 3/3] kvm: svm: Use the hardware provided GPA instead of page walk
From: Brijesh Singh @ 2016-11-14 22:16 UTC (permalink / raw)
  To: kvm
  Cc: Thomas.Lendacky, brijesh.singh, rkrcmar, joro, x86, linux-kernel,
	mingo, hpa, pbonzini, tglx, bp
In-Reply-To: <147916172660.16347.15695649975899246333.stgit@brijesh-build-machine>

From: Tom Lendacky <thomas.lendacky@amd.com>

When a guest causes a NPF which requires emulation, KVM sometimes walks
the guest page tables to translate the GVA to a GPA. This is unnecessary
most of the time on AMD hardware since the hardware provides the GPA in
EXITINFO2.

The only exception cases involve string operations involving rep or
operations that use two memory locations. With rep, the GPA will only be
the value of the initial NPF and with dual memory locations we won't know
which memory address was translated into EXITINFO2.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
---
 arch/x86/include/asm/kvm_emulate.h |    3 +++
 arch/x86/include/asm/kvm_host.h    |    3 +++
 arch/x86/kvm/svm.c                 |    9 ++++++++-
 arch/x86/kvm/x86.c                 |   17 ++++++++++++++++-
 4 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index e9cd7be..2d1ac09 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -344,6 +344,9 @@ struct x86_emulate_ctxt {
 	struct read_cache mem_read;
 };
 
+/* String operation identifier (matches the definition in emulate.c) */
+#define CTXT_STRING_OP	(1 << 13)
+
 /* Repeat String Operation Prefix */
 #define REPE_PREFIX	0xf3
 #define REPNE_PREFIX	0xf2
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 77cb3f9..fd5b1c8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -668,6 +668,9 @@ struct kvm_vcpu_arch {
 
 	int pending_ioapic_eoi;
 	int pending_external_vector;
+
+	/* GPA available (AMD only) */
+	bool gpa_available;
 };
 
 struct kvm_lpage_info {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 5e64e656..b442c5a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -275,6 +275,9 @@ static int avic;
 module_param(avic, int, S_IRUGO);
 #endif
 
+/* EXITINFO2 contains valid GPA */
+static bool gpa_avail = true;
+
 /* AVIC VM ID bit masks and lock */
 static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR);
 static DEFINE_SPINLOCK(avic_vm_id_lock);
@@ -1055,8 +1058,10 @@ static __init int svm_hardware_setup(void)
 			goto err;
 	}
 
-	if (!boot_cpu_has(X86_FEATURE_NPT))
+	if (!boot_cpu_has(X86_FEATURE_NPT)) {
 		npt_enabled = false;
+		gpa_avail = false;
+	}
 
 	if (npt_enabled && !npt) {
 		printk(KERN_INFO "kvm: Nested Paging disabled\n");
@@ -4192,6 +4197,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
 		vcpu->arch.cr0 = svm->vmcb->save.cr0;
 	if (npt_enabled)
 		vcpu->arch.cr3 = svm->vmcb->save.cr3;
+	if (gpa_avail)
+		vcpu->arch.gpa_available = (exit_code == SVM_EXIT_NPF);
 
 	if (unlikely(svm->nested.exit_required)) {
 		nested_svm_vmexit(svm);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d02aeff..c290794 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4420,7 +4420,19 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
 		return 1;
 	}
 
-	*gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+	/*
+	 * If the exit was due to a NPF we may already have a GPA.
+	 * If the GPA is present, use it to avoid the GVA to GPA table
+	 * walk. Note, this cannot be used on string operations since
+	 * string operation using rep will only have the initial GPA
+	 * from when the NPF occurred.
+	 */
+	if (vcpu->arch.gpa_available &&
+	    !(vcpu->arch.emulate_ctxt.d & CTXT_STRING_OP))
+		*gpa = exception->address;
+	else
+		*gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access,
+						       exception);
 
 	if (*gpa == UNMAPPED_GVA)
 		return -1;
@@ -5542,6 +5554,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 	}
 
 restart:
+	/* Save the faulting GPA (cr2) in the address field */
+	ctxt->exception.address = cr2;
+
 	r = x86_emulate_insn(ctxt);
 
 	if (r == EMULATION_INTERCEPTED)

^ permalink raw reply related

* [PATCH v1 0/3] x86: SVM: add additional SVM NPF error and use HW GPA
From: Brijesh Singh @ 2016-11-14 22:15 UTC (permalink / raw)
  To: kvm
  Cc: Thomas.Lendacky, brijesh.singh, rkrcmar, joro, x86, linux-kernel,
	mingo, hpa, pbonzini, tglx, bp

(resending, forgot to add Tom Lendacky and Borislav Petkov in CC list)

This patch series is taken from SEV RFC series [1]. These patches do not
depend on the SEV feature and can be reviewed and merged on their own.

- Add support for additional SVM NPF error codes
- Add kvm_fast_pio_in support
- Use the hardware provided GPA instead of page walk

[1] http://marc.info/?l=linux-mm&m=147190814023863&w=2

Tom Lendacky (3):
      kvm: svm: Add support for additional SVM NPF error codes
      kvm: svm: Add kvm_fast_pio_in support
      kvm: svm: Use the hardware provided GPA instead of page walk


 arch/x86/include/asm/kvm_emulate.h |    3 ++
 arch/x86/include/asm/kvm_host.h    |   15 ++++++++-
 arch/x86/kvm/mmu.c                 |   20 +++++++++++-
 arch/x86/kvm/svm.c                 |   16 +++++++---
 arch/x86/kvm/x86.c                 |   60 +++++++++++++++++++++++++++++++++++-
 5 files changed, 106 insertions(+), 8 deletions(-)

-- 

Brijesh Singh

^ permalink raw reply

* Re: Long delays creating a netns after deleting one (possibly RCU related)
From: Eric W. Biederman @ 2016-11-14 22:12 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Cong Wang, Rolf Neugebauer, LKML, Linux Kernel Network Developers,
	Justin Cormack, Ian Campbell, netdev, Eric Dumazet
In-Reply-To: <20161114181425.GN4127@linux.vnet.ibm.com>

"Paul E. McKenney" <paulmck@linux.vnet.ibm.com> writes:

> On Mon, Nov 14, 2016 at 09:44:35AM -0800, Cong Wang wrote:
>> On Mon, Nov 14, 2016 at 8:24 AM, Paul E. McKenney
>> <paulmck@linux.vnet.ibm.com> wrote:
>> > On Sun, Nov 13, 2016 at 10:47:01PM -0800, Cong Wang wrote:
>> >> On Fri, Nov 11, 2016 at 4:55 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>> >> > On Fri, Nov 11, 2016 at 4:23 PM, Paul E. McKenney
>> >> > <paulmck@linux.vnet.ibm.com> wrote:
>> >> >>
>> >> >> Ah!  This net_mutex is different than RTNL.  Should synchronize_net() be
>> >> >> modified to check for net_mutex being held in addition to the current
>> >> >> checks for RTNL being held?
>> >> >>
>> >> >
>> >> > Good point!
>> >> >
>> >> > Like commit be3fc413da9eb17cce0991f214ab0, checking
>> >> > for net_mutex for this case seems to be an optimization, I assume
>> >> > synchronize_rcu_expedited() and synchronize_rcu() have the same
>> >> > behavior...
>> >>
>> >> Thinking a bit more, I think commit be3fc413da9eb17cce0991f
>> >> gets wrong on rtnl_is_locked(), the lock could be locked by other
>> >> process not by the current one, therefore it should be
>> >> lockdep_rtnl_is_held() which, however, is defined only when LOCKDEP
>> >> is enabled... Sigh.
>> >>
>> >> I don't see any better way than letting callers decide if they want the
>> >> expedited version or not, but this requires changes of all callers of
>> >> synchronize_net(). Hm.
>> >
>> > I must confess that I don't understand how it would help to use an
>> > expedited grace period when some other process is holding RTNL.
>> > In contrast, I do well understand how it helps when the current process
>> > is holding RTNL.
>> 
>> Yeah, this is exactly my point. And same for ASSERT_RTNL() which checks
>> rtnl_is_locked(), clearly we need to assert "it is held by the current process"
>> rather than "it is locked by whatever process".
>> 
>> But given *_is_held() is always defined by LOCKDEP, so we probably need
>> mutex to provide such a helper directly, mutex->owner is not always defined
>> either. :-/
>
> There is always the option of making acquisition and release set a per-task
> variable that can be tested.  (Where did I put that asbestos suit, anyway?)
>
> 							Thanx, Paul

synchronize_rcu_expedited is not enough if you have multiple network
devices in play.

Looking at the code it comes down to this commit, and it appears there
is a promise by Eric Dumazet to add rcu grace period combining.

Eric, since people are hitting noticeable stalls because of the rcu grace
period taking a long time, do you think you could look at this code path
a bit more?

commit 93d05d4a320cb16712bb3d57a9658f395d8cecb9
Author: Eric Dumazet <edumazet@google.com>
Date:   Wed Nov 18 06:31:03 2015 -0800

    net: provide generic busy polling to all NAPI drivers
    
    NAPI drivers no longer need to observe a particular protocol
    to benefit from busy polling (CONFIG_NET_RX_BUSY_POLL=y)
    
    napi_hash_add() and napi_hash_del() are automatically called
    from core networking stack, respectively from
    netif_napi_add() and netif_napi_del()
    
    This patch depends on free_netdev() and netif_napi_del() being
    called from process context, which seems to be the norm.
    
    Drivers might still prefer to call napi_hash_del() on their
    own, since they might combine all the rcu grace periods into
    a single one, knowing their NAPI structures lifetime, while
    core networking stack has no idea of a possible combining.
    
    Once this patch proves to not bring serious regressions,
    we will cleanup drivers to either remove napi_hash_del()
    or provide appropriate rcu grace periods combining.
    
    Signed-off-by: Eric Dumazet <edumazet@google.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>

Eric

^ permalink raw reply

* Re: [PATCH 1/2] staging: iio: ad7606: replace range/range_available with corresponding scale
From: Jonathan Cameron @ 2016-11-14 22:15 UTC (permalink / raw)
  To: Lars-Peter Clausen, Linus Walleij, Jonathan Cameron
  Cc: Eva Rachel Retuya, linux-iio@vger.kernel.org,
	linux-kernel@vger.kernel.org, Michael Hennerich, Hartmut Knaack,
	Peter Meerwald, Greg KH
In-Reply-To: <b59ff168-c82e-26ed-9192-a491d97c5d6d@metafoo.de>



On 14 November 2016 18:53:28 GMT+00:00, Lars-Peter Clausen <lars@metafoo.de> wrote:
>On 11/14/2016 05:58 PM, Linus Walleij wrote:
>> On Sat, Nov 12, 2016 at 3:24 PM, Jonathan Cameron <jic23@kernel.org>
>wrote:
>> 
>>> Is it just me who thought, we need a fixed GPI like a fixed
>regulator?
Probably didn't help clarity that I described it as an input pin whereas it's kind of like having an 
output pin whose state you can't change...

>>> Would allow this sort of fixed wiring to be simply defined.
>>>
>>> Linus, worth exploring?
>> 
>> So if fixed regulator is for a voltage provider, this would be
>> pretty much the inverse: deciding for a voltage range by switching
>> a GPIO.
>
>It's about figuring out the setting of a "GPIO" that can't be changed
>from
>software.
>
>Devices sometimes, instead of a configuration bus like I2C or SPI, use
>simple input pins, that can either be set to high or low, to allow
>software
>to set the state of the device. The GPIO API is typically used to configure
>these pins.
>
>This works fine as long as the pin is connected to a GPIO. But
>sometimes the
>system designer decides that a setting does not need to be
>configurable, in
>this case the pin will be tied to logic low or high directly on the PCB
>without any GPIO controller being involved.
>
>Sometimes a driver wants to know how the pin is wired up so it can
>report to
>userspace this part runs in the following mode and the mode can't be
>changed. In a sense it is like a reverse GPIO hog.
>
>Considering that this is a common usecase the question was how this can
>be
>implemented in a driver independent way to avoid code duplication and
>slightly different variations of what is effectively the same DT/ACPI
>binding.
>
>E.g. let's say for a configurable pin you use
>
>	range-gpio = <&gpio ...>;
>
>and for a static pin
>
>	range-gpio-fixed = <1>;
>
>Or something similar.
>
>--
>To unsubscribe from this list: send the line "unsubscribe linux-iio" in
>the body of a message to majordomo@vger.kernel.org
>More majordomo info at  http://vger.kernel.org/majordomo-info.html

-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.

^ permalink raw reply

* Re: [PATCH v2] cpufreq: conservative: Decrease frequency faster when the update deferred
From: Rafael J. Wysocki @ 2016-11-14 22:09 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Stratos Karafotis, Rafael J. Wysocki, Viresh Kumar,
	linux-pm@vger.kernel.org, LKML
In-Reply-To: <CAJZ5v0hO2T41qkPNETqJv7yfJ-=V34gqr_gEy7bNxg83e5HB+g@mail.gmail.com>

On Mon, Nov 14, 2016 at 10:59 PM, Rafael J. Wysocki <rafael@kernel.org> wrote:
> On Mon, Nov 14, 2016 at 10:46 PM, Stratos Karafotis
> <stratosk@semaphore.gr> wrote:
>>
>>
>> On 14/11/2016 10:44 μμ, Rafael J. Wysocki wrote:
>>> On Sat, Nov 12, 2016 at 10:04 PM, Stratos Karafotis
>>> <stratosk@semaphore.gr> wrote:
>>>> Conservative governor changes the CPU frequency in steps.
>>>> That means that if a CPU runs at max frequency, it will need several
>>>> sampling periods to return to min frequency when the workload
>>>> is finished.
>>>>
>>>> If the update function that calculates the load and target frequency
>>>> is deferred, the governor might need even more time to decrease the
>>>> frequency.
>>>>
>>>> This may have an impact on power consumption and after all conservative
>>>> should decrease the frequency if there is no workload at every sampling
>>>> rate.
>>>>
>>>> To resolve the above issue calculate the number of sampling periods
>>>> that the update is deferred. Considering that for each sampling period
>>>> conservative should drop the frequency by a freq_step because the
>>>> CPU was idle apply the proper subtraction to requested frequency.
>>>>
>>>> Below, the kernel trace with and without this patch. First an
>>>> intensive workload is applied on a specific CPU. Then the workload
>>>> is removed and the CPU goes to idle.
>>>>
>>>> WITHOUT
>>>>
>>>>      <idle>-0     [007] dN..   620.329153: cpu_idle: state=4294967295 cpu_id=7
>>>> kworker/7:2-556   [007] ....   620.350857: cpu_frequency: state=1700000 cpu_id=7
>>>> kworker/7:2-556   [007] ....   620.370856: cpu_frequency: state=1900000 cpu_id=7
>>>> kworker/7:2-556   [007] ....   620.390854: cpu_frequency: state=2100000 cpu_id=7
>>>> kworker/7:2-556   [007] ....   620.411853: cpu_frequency: state=2200000 cpu_id=7
>>>> kworker/7:2-556   [007] ....   620.432854: cpu_frequency: state=2400000 cpu_id=7
>>>> kworker/7:2-556   [007] ....   620.453854: cpu_frequency: state=2600000 cpu_id=7
>>>> kworker/7:2-556   [007] ....   620.494856: cpu_frequency: state=2900000 cpu_id=7
>>>> kworker/7:2-556   [007] ....   620.515856: cpu_frequency: state=3100000 cpu_id=7
>>>> kworker/7:2-556   [007] ....   620.536858: cpu_frequency: state=3300000 cpu_id=7
>>>> kworker/7:2-556   [007] ....   620.557857: cpu_frequency: state=3401000 cpu_id=7
>>>>      <idle>-0     [007] d...   669.591363: cpu_idle: state=4 cpu_id=7
>>>>      <idle>-0     [007] d...   669.591939: cpu_idle: state=4294967295 cpu_id=7
>>>>      <idle>-0     [007] d...   669.591980: cpu_idle: state=4 cpu_id=7
>>>>      <idle>-0     [007] dN..   669.591989: cpu_idle: state=4294967295 cpu_id=7
>>>> ...
>>>>      <idle>-0     [007] d...   670.201224: cpu_idle: state=4 cpu_id=7
>>>>      <idle>-0     [007] d...   670.221975: cpu_idle: state=4294967295 cpu_id=7
>>>> kworker/7:2-556   [007] ....   670.222016: cpu_frequency: state=3300000 cpu_id=7
>>>>      <idle>-0     [007] d...   670.222026: cpu_idle: state=4 cpu_id=7
>>>>      <idle>-0     [007] d...   670.234964: cpu_idle: state=4294967295 cpu_id=7
>>>> ...
>>>>      <idle>-0     [007] d...   670.801251: cpu_idle: state=4 cpu_id=7
>>>>      <idle>-0     [007] d...   671.236046: cpu_idle: state=4294967295 cpu_id=7
>>>> kworker/7:2-556   [007] ....   671.236073: cpu_frequency: state=3100000 cpu_id=7
>>>>      <idle>-0     [007] d...   671.236112: cpu_idle: state=4 cpu_id=7
>>>>      <idle>-0     [007] d...   671.393437: cpu_idle: state=4294967295 cpu_id=7
>>>> ...
>>>>      <idle>-0     [007] d...   671.401277: cpu_idle: state=4 cpu_id=7
>>>>      <idle>-0     [007] d...   671.404083: cpu_idle: state=4294967295 cpu_id=7
>>>> kworker/7:2-556   [007] ....   671.404111: cpu_frequency: state=2900000 cpu_id=7
>>>>      <idle>-0     [007] d...   671.404125: cpu_idle: state=4 cpu_id=7
>>>>      <idle>-0     [007] d...   671.404974: cpu_idle: state=4294967295 cpu_id=7
>>>> ...
>>>>      <idle>-0     [007] d...   671.501180: cpu_idle: state=4 cpu_id=7
>>>>      <idle>-0     [007] d...   671.995414: cpu_idle: state=4294967295 cpu_id=7
>>>> kworker/7:2-556   [007] ....   671.995459: cpu_frequency: state=2800000 cpu_id=7
>>>>      <idle>-0     [007] d...   671.995469: cpu_idle: state=4 cpu_id=7
>>>>      <idle>-0     [007] d...   671.996287: cpu_idle: state=4294967295 cpu_id=7
>>>> ...
>>>>      <idle>-0     [007] d...   672.001305: cpu_idle: state=4 cpu_id=7
>>>>      <idle>-0     [007] d...   672.078374: cpu_idle: state=4294967295 cpu_id=7
>>>> kworker/7:2-556   [007] ....   672.078410: cpu_frequency: state=2600000 cpu_id=7
>>>>      <idle>-0     [007] d...   672.078419: cpu_idle: state=4 cpu_id=7
>>>>      <idle>-0     [007] d...   672.158020: cpu_idle: state=4294967295 cpu_id=7
>>>> kworker/7:2-556   [007] ....   672.158040: cpu_frequency: state=2400000 cpu_id=7
>>>>      <idle>-0     [007] d...   672.158044: cpu_idle: state=4 cpu_id=7
>>>>      <idle>-0     [007] d...   672.160038: cpu_idle: state=4294967295 cpu_id=7
>>>> ...
>>>>      <idle>-0     [007] d...   672.234557: cpu_idle: state=4 cpu_id=7
>>>>      <idle>-0     [007] d...   672.237121: cpu_idle: state=4294967295 cpu_id=7
>>>> kworker/7:2-556   [007] ....   672.237174: cpu_frequency: state=2100000 cpu_id=7
>>>>      <idle>-0     [007] d...   672.237186: cpu_idle: state=4 cpu_id=7
>>>>      <idle>-0     [007] d...   672.237778: cpu_idle: state=4294967295 cpu_id=7
>>>> ...
>>>>      <idle>-0     [007] d...   672.267902: cpu_idle: state=4 cpu_id=7
>>>>      <idle>-0     [007] d...   672.269860: cpu_idle: state=4294967295 cpu_id=7
>>>> kworker/7:2-556   [007] ....   672.269906: cpu_frequency: state=1900000 cpu_id=7
>>>>      <idle>-0     [007] d...   672.269914: cpu_idle: state=4 cpu_id=7
>>>>      <idle>-0     [007] d...   672.271902: cpu_idle: state=4294967295 cpu_id=7
>>>> ...
>>>>      <idle>-0     [007] d...   672.751342: cpu_idle: state=4 cpu_id=7
>>>>      <idle>-0     [007] d...   672.823056: cpu_idle: state=4294967295 cpu_id=7
>>>> kworker/7:2-556   [007] ....   672.823095: cpu_frequency: state=1600000 cpu_id=7
>>>>
>>>> WITH
>>>>
>>>>      <idle>-0     [007] dN..  4380.928009: cpu_idle: state=4294967295 cpu_id=7
>>>> kworker/7:2-399   [007] ....  4380.949767: cpu_frequency: state=2000000 cpu_id=7
>>>> kworker/7:2-399   [007] ....  4380.969765: cpu_frequency: state=2200000 cpu_id=7
>>>> kworker/7:2-399   [007] ....  4381.009766: cpu_frequency: state=2500000 cpu_id=7
>>>> kworker/7:2-399   [007] ....  4381.029767: cpu_frequency: state=2600000 cpu_id=7
>>>> kworker/7:2-399   [007] ....  4381.049769: cpu_frequency: state=2800000 cpu_id=7
>>>> kworker/7:2-399   [007] ....  4381.069769: cpu_frequency: state=3000000 cpu_id=7
>>>> kworker/7:2-399   [007] ....  4381.089771: cpu_frequency: state=3100000 cpu_id=7
>>>> kworker/7:2-399   [007] ....  4381.109772: cpu_frequency: state=3400000 cpu_id=7
>>>> kworker/7:2-399   [007] ....  4381.129773: cpu_frequency: state=3401000 cpu_id=7
>>>>      <idle>-0     [007] d...  4428.226159: cpu_idle: state=1 cpu_id=7
>>>>      <idle>-0     [007] d...  4428.226176: cpu_idle: state=4294967295 cpu_id=7
>>>>      <idle>-0     [007] d...  4428.226181: cpu_idle: state=4 cpu_id=7
>>>>      <idle>-0     [007] d...  4428.227177: cpu_idle: state=4294967295 cpu_id=7
>>>> ...
>>>>      <idle>-0     [007] d...  4428.551640: cpu_idle: state=4 cpu_id=7
>>>>      <idle>-0     [007] d...  4428.649239: cpu_idle: state=4294967295 cpu_id=7
>>>> kworker/7:2-399   [007] ....  4428.649268: cpu_frequency: state=2800000 cpu_id=7
>>>>      <idle>-0     [007] d...  4428.649278: cpu_idle: state=4 cpu_id=7
>>>>      <idle>-0     [007] d...  4428.689856: cpu_idle: state=4294967295 cpu_id=7
>>>> ...
>>>>      <idle>-0     [007] d...  4428.799542: cpu_idle: state=4 cpu_id=7
>>>>      <idle>-0     [007] d...  4428.801683: cpu_idle: state=4294967295 cpu_id=7
>>>> kworker/7:2-399   [007] ....  4428.801748: cpu_frequency: state=1700000 cpu_id=7
>>>>      <idle>-0     [007] d...  4428.801761: cpu_idle: state=4 cpu_id=7
>>>>      <idle>-0     [007] d...  4428.806545: cpu_idle: state=4294967295 cpu_id=7
>>>> ...
>>>>      <idle>-0     [007] d...  4429.051880: cpu_idle: state=4 cpu_id=7
>>>>      <idle>-0     [007] d...  4429.086240: cpu_idle: state=4294967295 cpu_id=7
>>>> kworker/7:2-399   [007] ....  4429.086293: cpu_frequency: state=1600000 cpu_id=7
>>>>
>>>> Without the patch the CPU dropped to min frequency after 3.2s
>>>> With the patch applied the CPU dropped to min frequency after 0.86s
>>>>
>>>> Signed-off-by: Stratos Karafotis <stratosk@semaphore.gr>
>>>> ---
>>>>  v1 -> v2
>>>> - Use correct terminology in change log
>>>> - Change the member variable name from 'deferred_periods' to 'idle_periods'
>>>> - Fix format issue
>>>>
>>>>  drivers/cpufreq/cpufreq_conservative.c | 14 +++++++++++++-
>>>>  drivers/cpufreq/cpufreq_governor.c     | 18 +++++++++++++-----
>>>>  drivers/cpufreq/cpufreq_governor.h     |  1 +
>>>>  3 files changed, 27 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
>>>> index fa5ece3..d787772 100644
>>>> --- a/drivers/cpufreq/cpufreq_conservative.c
>>>> +++ b/drivers/cpufreq/cpufreq_conservative.c
>>>> @@ -73,7 +73,19 @@ static unsigned int cs_dbs_update(struct cpufreq_policy *policy)
>>>>          */
>>>>         if (cs_tuners->freq_step == 0)
>>>>                 goto out;
>>>> -
>>>> +       /*
>>>> +        * Decrease requested_freq for each idle period that we didn't
>>>> +        * update the frequency
>>>> +        */
>>>> +       if (policy_dbs->idle_periods < UINT_MAX) {
>>>> +               unsigned int freq_target = policy_dbs->idle_periods *
>>>> +                               get_freq_target(cs_tuners, policy);
>>>> +               if (requested_freq > freq_target)
>>>> +                       requested_freq -= freq_target;
>>>> +               else
>>>> +                       requested_freq = policy->min;
>>>> +               policy_dbs->idle_periods = UINT_MAX;
>>>> +       }
>>>>         /*
>>>>          * If requested_freq is out of range, it is likely that the limits
>>>>          * changed in the meantime, so fall back to current frequency in that
>>>> diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
>>>> index 3729474..1bc7137 100644
>>>> --- a/drivers/cpufreq/cpufreq_governor.c
>>>> +++ b/drivers/cpufreq/cpufreq_governor.c
>>>> @@ -117,7 +117,7 @@ unsigned int dbs_update(struct cpufreq_policy *policy)
>>>>         struct policy_dbs_info *policy_dbs = policy->governor_data;
>>>>         struct dbs_data *dbs_data = policy_dbs->dbs_data;
>>>>         unsigned int ignore_nice = dbs_data->ignore_nice_load;
>>>> -       unsigned int max_load = 0;
>>>> +       unsigned int max_load = 0, idle_periods = UINT_MAX;
>>>>         unsigned int sampling_rate, io_busy, j;
>>>>
>>>>         /*
>>>> @@ -163,8 +163,12 @@ unsigned int dbs_update(struct cpufreq_policy *policy)
>>>>                          * calls, so the previous load value can be used then.
>>>>                          */
>>>>                         load = j_cdbs->prev_load;
>>>> -               } else if (unlikely(time_elapsed > 2 * sampling_rate &&
>>>> -                                   j_cdbs->prev_load)) {
>>>> +               } else if (unlikely(time_elapsed > 2 * sampling_rate)) {
>>>> +                       unsigned int periods = time_elapsed / sampling_rate;
>>>> +
>>>> +                       if (periods < idle_periods)
>>>> +                               idle_periods = periods;
>>>> +
>>>>                         /*
>>>>                          * If the CPU had gone completely idle and a task has
>>>>                          * just woken up on this CPU now, it would be unfair to
>>>> @@ -189,8 +193,10 @@ unsigned int dbs_update(struct cpufreq_policy *policy)
>>>>                          * 'time_elapsed' (as compared to the sampling rate)
>>>>                          * indicates this scenario.
>>>>                          */
>>>> -                       load = j_cdbs->prev_load;
>>>> -                       j_cdbs->prev_load = 0;
>>>> +                       if (j_cdbs->prev_load) {
>>>> +                               load = j_cdbs->prev_load;
>>>> +                               j_cdbs->prev_load = 0;
>>>> +                       }
>>>>                 } else {
>>>>                         if (time_elapsed >= idle_time) {
>>>>                                 load = 100 * (time_elapsed - idle_time) / time_elapsed;
>>>> @@ -218,6 +224,8 @@ unsigned int dbs_update(struct cpufreq_policy *policy)
>>>>                 if (load > max_load)
>>>>                         max_load = load;
>>>>         }
>>>> +       policy_dbs->idle_periods = idle_periods;
>>>> +
>>>>         return max_load;
>>>>  }
>>>>  EXPORT_SYMBOL_GPL(dbs_update);
>>>
>>> I have a murky suspicion that the changes in dbs_update() are going to
>>> break something.  I need to recall what it was, though.
>>
>> The only change in dbs_update() is the calculation of 'idle_periods'.
>> If I don't miss something I left current functionality untouched.
>
> Well, not quite.  The else branch may now trigger when
> j_cdbs->prev_load is zero too which it didn't do before, AFAICS.

What I mean is that, before the change, the "else if" branch never
triggers when j_cdbs->prev_load is zero.  With the change it does, so
the final "else" branch will not cover the "j_cdbs->prev_load equal to
zero" case any more.  I'm not sure how much that matters ATM, though.

Sent too quickly, sorry.

Thanks,
Rafael

^ permalink raw reply

* Re: [PATCH] net/phy/vitesse: Configure RGMII skew on VSC8601, if needed
From: Alex @ 2016-11-14 21:54 UTC (permalink / raw)
  To: Florian Fainelli, David Miller; +Cc: gokhan, netdev, linux-kernel
In-Reply-To: <d567c69f-6b57-7083-9090-df01fb140e36@gmail.com>



On 11/14/2016 01:25 PM, Florian Fainelli wrote:
> On 11/14/2016 01:18 PM, David Miller wrote:
>> From: Alexandru Gagniuc <alex.g@adaptrum.com>
>> Date: Sat, 12 Nov 2016 15:32:13 -0800
>>
>>> +	if (phydev->interface == PHY_INTERFACE_MODE_RGMII_ID)
>>> +		ret = vsc8601_add_skew(phydev);
>>
>> I think you should use phy_interface_is_rgmii() here.
>>
>
> This would include all RGMII modes, here I think the intent is to check
> for PHY_INTERFACE_MODE_RGMII_ID and PHY_INTERFACE_MODE_RGMII_TXID (or
> RXID),

That is correct.

>  Alexandru, what direction does the skew settings apply to?

It applies a skew in both TX and RX directions.

Alex

^ permalink raw reply

* [GIT] Networking
From: David Miller @ 2016-11-14 22:08 UTC (permalink / raw)
  To: torvalds; +Cc: akpm, netdev, linux-kernel


1) Fix off by one wrt. indexing when dumping /proc/net/route entries, from
   Alexander Duyck.

2) Fix lockdep splats in iwlwifi, from Johannes Berg.

3) Cure panic when inserting certain netfilter rules when NFT_SET_HASH
   is disabled, from Liping Zhang.

4) Memory leak when nft_expr_clone() fails, also from Liping Zhang.

5) Disable UFO when path will apply IPSEC transformations, from Jakub
   Sitnicki.

6) Don't bogusly double cwnd in dctcp module, from Florian Westphal.

7) skb_checksum_help() should never actually use the value "0" for
   the resulting checksum, that has a special meaning, use CSUM_MANGLED_0
   instead.  From Eric Dumazet.

8) Per-tx/rx queue statistic strings are wrong in qed driver, fix from
   Yuval Mintz.

9) Fix SCTP reference counting of associations and transports in
   sctp_diag.  From Xin Long.

10) When we hit ip6tunnel_xmit() we could have come from an ipv4
    path in a previous layer or similar, so explicitly clear the
    ipv6 control block in the skb.  From Eli Cooper.

11) Fix bogus sleeping inside of inet_wait_for_connect(), from WANG
    Cong.

12) Correct device ID of T6 adapter in cxgb4 driver, from Hariprasad
    Shenai.

13) Fix potential access past the end of the skb page frag array in
    tcp_sendmsg().  From Eric Dumazet.

14) 'skb' can legitimately be NULL in inet{,6}_exact_dif_match(). Fix
    from David Ahern.

15) Don't return an error in tcp_sendmsg() if we wrote any bytes
    successfully, from Eric Dumazet.

16) Extraneous unlocks in netlink_diag_dump(), we removed the locking
    but forgot to purge these unlock calls. From Eric Dumazet.

17) Fix memory leak in error path of __genl_register_family().  We
    leak the attrbuf, from WANG Cong.

18) cgroupstats netlink policy table is mis-sized, from WANG Cong.

19) Several XDP bug fixes in mlx5, from Saeed Mahameed.

20) Fix several device refcount leaks in network drivers, from Johan
    Hovold.

21) icmp6_send() should use skb dst device not skb->dev to determine
    L3 routing domain.  From David Ahern.

22) ip_vs_genl_family sets maxattr incorrectly, from WANG Cong.

23) We leak new macvlan port in some cases of macvlan_common_newlink()
    errors.  Fix from Gao Feng.

24) Similar to the icmp6_send() fix, icmp_route_lookup() should determine
    L3 routing domain using skb_dst(skb)->dev not skb->dev.  Also
    from David Ahern.

25) Several fixes for route offloading and FIB notification handling
    in mlxsw driver, from Jiri Pirko.

26) Properly cap __skb_flow_dissect()'s return value, from Eric
    Dumazet.

27) Fix long standing regression in ipv4 redirect handling,
    wrt. validating the new neighbour's reachability.  From
    Stephen Suryaputra Lin.

28) If sk_filter() trims the packet excessively, handle it reasonably
    in tcp input instead of exploding.  From Eric Dumazet.

29) Fix handling of napi hash state when copying channels in sfc
    driver, from Bert Kenward.

Please pull, thanks a lot!

The following changes since commit 2a26d99b251b8625d27aed14e97fc10707a3a81f:

  Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net (2016-10-29 20:33:20 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git 

for you to fetch changes up to ac571de999e14b87890cb960ad6f03fbdde6abc8:

  mlxsw: spectrum_router: Flush FIB tables during fini (2016-11-14 16:45:16 -0500)

----------------------------------------------------------------
Alexander Duyck (1):
      fib_trie: Correct /proc/net/route off by one error

Allan Chou (1):
      Net Driver: Add Cypress GX3 VID=04b4 PID=3610.

Andy Gospodarek (1):
      bgmac: stop clearing DMA receive control register right after it is set

Arkadi Sharshevsky (1):
      mlxsw: spectrum_router: Correctly dump neighbour activity

Arnd Bergmann (3):
      brcmfmac: avoid maybe-uninitialized warning in brcmf_cfg80211_start_ap
      netfilter: ip_vs_sync: fix bogus maybe-uninitialized warning
      vxlan: hide unused local variable

Baoquan He (2):
      Revert "bnx2: Reset device during driver initialization"
      bnx2: Wait for in-flight DMA to complete at probe stage

Baruch Siach (1):
      net: bpqether.h: remove if_ether.h guard

Benjamin Poirier (1):
      bna: Add synchronization for tx ring.

Bert Kenward (1):
      sfc: clear napi_hash state when copying channels

Christophe Jaillet (1):
      net/mlx5: Simplify a test

Colin Ian King (2):
      net: ethernet: ixp4xx_eth: fix spelling mistake in debug message
      ps3_gelic: fix spelling mistake in debug message

Daniel Borkmann (2):
      bpf: fix htab map destruction when extra reserve is in use
      bpf: fix map not being uncharged during map creation failure

David Ahern (4):
      net: tcp: check skb is non-NULL for exact match on lookups
      net: icmp6_send should use dst dev to determine L3 domain
      net: icmp_route_lookup should use rt dev to determine L3 domain
      net: tcp response should set oif only if it is L3 master

David S. Miller (14):
      Merge tag 'wireless-drivers-for-davem-2016-10-30' of git://git.kernel.org/.../kvalo/wireless-drivers
      Merge branch 'sctp-hold-transport-fixes'
      Merge tag 'linux-can-fixes-for-4.9-20161031' of git://git.kernel.org/.../mkl/linux-can
      Merge branch 'xgene-coalescing-bugs'
      Merge branch 'mlx5-fixes'
      Merge branch 'phy-ref-leaks'
      Merge branch 'qcom-emac-pause'
      Merge git://git.kernel.org/.../pablo/nf
      Merge branch 'qed-fixes'
      Merge branch 'mlxsw-fixes'
      Merge branch 'fix-bpf_redirect'
      Merge branch 'bnxt_en-fixes'
      Merge branch 'mlxsw-fixes'
      Merge branch 'bnx2-kdump-fix'

Dongli Zhang (2):
      xen-netfront: do not cast grant table reference to signed short
      xen-netfront: cast grant table reference first to type int

Eli Cooper (2):
      ip6_tunnel: Clear IP6CB in ip6tunnel_xmit()
      ip6_udp_tunnel: remove unused IPCB related codes

Eric Dumazet (12):
      net: clear sk_err_soft in sk_clone_lock()
      net: mangle zero checksum in skb_checksum_help()
      tcp: fix potential memory corruption
      tcp: fix return value for partial writes
      dccp: do not release listeners too soon
      dccp: do not send reset to already closed sockets
      dccp: fix out of bound access in dccp_v4_err()
      netlink: netlink_diag_dump() runs without locks
      ipv6: dccp: fix out of bound access in dccp_v6_err()
      ipv6: dccp: add missing bind_conflict to dccp_ipv6_mapped
      net: __skb_flow_dissect() must cap its return value
      tcp: take care of truncations done by sk_filter()

Fabian Mewes (1):
      Documentation: networking: dsa: Update tagging protocols

Florian Fainelli (1):
      net: stmmac: Fix lack of link transition for fixed PHYs

Florian Westphal (5):
      netfilter: conntrack: avoid excess memory allocation
      dctcp: avoid bogus doubling of cwnd after loss
      netfilter: connmark: ignore skbs with magic untracked conntrack objects
      netfilter: conntrack: fix CT target for UNSPEC helpers
      netfilter: conntrack: refine gc worker heuristics

Gao Feng (1):
      driver: macvlan: Destroy new macvlan port if macvlan_common_newlink failed.

Guenter Roeck (1):
      r8152: Fix error path in open function

Guilherme G. Piccoli (1):
      ehea: fix operation state report

Haim Dreyfuss (1):
      iwlwifi: mvm: comply with fw_restart mod param on suspend

Hariprasad Shenai (1):
      cxgb4: correct device ID of T6 adapter

Huy Nguyen (1):
      net/mlx5: Fix invalid pointer reference when prof_sel parameter is invalid

Ido Schimmel (2):
      mlxsw: spectrum: Fix incorrect reuse of MID entries
      mlxsw: spectrum_router: Flush FIB tables during fini

Isaac Boukris (1):
      unix: escape all null bytes in abstract unix domain socket

Iyappan Subramanian (2):
      drivers: net: xgene: fix: Disable coalescing on v1 hardware
      drivers: net: xgene: fix: Coalescing values for v2 hardware

Jakub Sitnicki (1):
      ipv6: Don't use ufo handling on later transformed packets

Jiri Pirko (2):
      mlxsw: spectrum_router: Fix handling of neighbour structure
      mlxsw: spectrum_router: Ignore FIB notification events for non-init namespaces

Johan Hovold (4):
      phy: fix device reference leaks
      net: ethernet: ti: cpsw: fix device and of_node leaks
      net: ethernet: ti: davinci_emac: fix device reference leak
      net: hns: fix device reference leaks

Johannes Berg (1):
      iwlwifi: pcie: mark command queue lock with separate lockdep class

John Allen (1):
      ibmvnic: Start completion queue negotiation at server-provided optimum values

John W. Linville (1):
      netfilter: nf_tables: fix type mismatch with error return from nft_parse_u32_check

Kalle Valo (1):
      Merge tag 'iwlwifi-for-kalle-2015-10-25' of git://git.kernel.org/.../iwlwifi/iwlwifi-fixes

Lance Richardson (2):
      ipv4: allow local fragmentation in ip_finish_output_gso()
      ipv4: update comment to document GSO fragmentation cases.

Liping Zhang (6):
      netfilter: nft_dynset: fix panic if NFT_SET_HASH is not enabled
      netfilter: nf_tables: fix *leak* when expr clone fail
      netfilter: nf_tables: fix race when create new element in dynset
      netfilter: nf_tables: destroy the set if fail to add transaction
      netfilter: nft_dup: do not use sreg_dev if the user doesn't specify it
      netfilter: nf_tables: fix oops when inserting an element into a verdict map

Luca Coelho (4):
      iwlwifi: mvm: use ssize_t for len in iwl_debugfs_mem_read()
      iwlwifi: mvm: fix d3_test with unified D0/D3 images
      iwlwifi: pcie: fix SPLC structure parsing
      iwlwifi: mvm: fix netdetect starting/stopping for unified images

Lukas Resch (1):
      can: sja1000: plx_pci: Add support for Moxa CAN devices

Maciej Żenczykowski (1):
      net-ipv6: on device mtu change do not add mtu to mtu-less routes

Marcelo Ricardo Leitner (1):
      sctp: assign assoc_id earlier in __sctp_connect

Mark Lord (1):
      r8152: Fix broken RX checksums.

Martin KaFai Lau (2):
      bpf: Fix bpf_redirect to an ipip/ip6tnl dev
      bpf: Add test for bpf_redirect to ipip/ip6tnl

Mathias Krause (1):
      rtnl: reset calcit fptr in rtnl_unregister()

Michael Chan (2):
      bnxt_en: Fix ring arithmetic in bnxt_setup_tc().
      bnxt_en: Fix VF virtual link state.

Michael S. Tsirkin (1):
      virtio-net: drop legacy features in virtio 1 mode

Mike Frysinger (1):
      Revert "include/uapi/linux/atm_zatm.h: include linux/time.h"

Mintz, Yuval (2):
      qede: Fix statistics' strings for Tx/Rx queues
      qede: Correctly map aggregation replacement pages

Oliver Hartkopp (1):
      can: bcm: fix warning in bcm_connect/proc_register

Or Gerlitz (3):
      net/mlx5e: Disallow changing name-space for VF representors
      net/mlx5e: Handle matching on vlan priority for offloaded TC rules
      net/mlx5: E-Switch, Set the actions for offloaded rules properly

Rafał Miłecki (1):
      net: bgmac: fix reversed checks for clock control flag

Ram Amrani (2):
      qed: configure ll2 RoCE v1/v2 flavor correctly
      qed: Correct rdma params configuration

Russell King (1):
      net: mv643xx_eth: ensure coalesce settings survive read-modify-write

Saeed Mahameed (3):
      MAINTAINERS: Update MELLANOX MLX5 core VPI driver maintainers
      net/mlx5e: Fix XDP error path of mlx5e_open_channel()
      net/mlx5e: Re-arrange XDP SQ/CQ creation

Sara Sharon (1):
      iwlwifi: mvm: wake the wait queue when the RX sync counter is zero

Soheil Hassas Yeganeh (1):
      sock: fix sendmmsg for partial sendmsg

Stephen Suryaputra Lin (1):
      ipv4: use new_gw for redirect neigh lookup

Tariq Toukan (1):
      Revert "net/mlx4_en: Fix panic during reboot"

Thomas Falcon (2):
      ibmvnic: Unmap ibmvnic_statistics structure
      ibmvnic: Fix size of debugfs name buffer

Timur Tabi (3):
      net: qcom/emac: use correct value for SGMII_LN_UCDR_SO_GAIN_MODE0
      net: qcom/emac: configure the external phy to allow pause frames
      net: qcom/emac: enable flow control if requested

Ulrich Weber (1):
      netfilter: nf_conntrack_sip: extend request line validation

WANG Cong (4):
      inet: fix sleeping inside inet_wait_for_connect()
      genetlink: fix a memory leak on error path
      taskstats: fix the length of cgroupstats_cmd_get_policy
      ipvs: use IPVS_CMD_ATTR_MAX for family.maxattr

Xin Long (5):
      ipv6: add mtu lock check in __ip6_rt_update_pmtu
      sctp: hold transport instead of assoc in sctp_diag
      sctp: return back transport in __sctp_rcv_init_lookup
      sctp: hold transport instead of assoc when lookup assoc in rx path
      sctp: change sk state only when it has assocs in sctp_shutdown

Yotam Gigi (1):
      mlxsw: spectrum: Fix refcount bug on span entries

 Documentation/networking/dsa/dsa.txt                        |   3 +-
 MAINTAINERS                                                 |   1 +
 drivers/net/can/sja1000/plx_pci.c                           |  18 ++++++++++
 drivers/net/ethernet/apm/xgene/xgene_enet_hw.c              |  12 -------
 drivers/net/ethernet/apm/xgene/xgene_enet_hw.h              |   2 ++
 drivers/net/ethernet/apm/xgene/xgene_enet_main.c            |   3 +-
 drivers/net/ethernet/apm/xgene/xgene_enet_ring2.c           |  12 ++++---
 drivers/net/ethernet/broadcom/bgmac.c                       |   9 +++--
 drivers/net/ethernet/broadcom/bnx2.c                        |  48 +++++++++++++++++++-------
 drivers/net/ethernet/broadcom/bnxt/bnxt.c                   |  11 +++---
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c             |   4 +--
 drivers/net/ethernet/brocade/bna/bnad.c                     |   4 +--
 drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h          |   2 +-
 drivers/net/ethernet/hisilicon/hns/hnae.c                   |   8 ++++-
 drivers/net/ethernet/ibm/ehea/ehea_main.c                   |   2 ++
 drivers/net/ethernet/ibm/ibmvnic.c                          |  10 +++---
 drivers/net/ethernet/marvell/mv643xx_eth.c                  |   2 ++
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c              |   1 -
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c           |  31 +++++++++--------
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c            |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c             |   5 ++-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c  |   3 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c           |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/main.c              |   5 +--
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c              |   4 ++-
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h              |   2 +-
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c       | 134 ++++++++++++++++++++++++++++++++++++----------------------------------
 drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c    |  14 ++++----
 drivers/net/ethernet/qlogic/qed/qed_hsi.h                   |   3 --
 drivers/net/ethernet/qlogic/qed/qed_ll2.c                   |   1 +
 drivers/net/ethernet/qlogic/qed/qed_main.c                  |  17 +++++----
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c             |  25 +++++++++-----
 drivers/net/ethernet/qlogic/qede/qede_main.c                |   2 +-
 drivers/net/ethernet/qualcomm/emac/emac-mac.c               |  15 +++++---
 drivers/net/ethernet/qualcomm/emac/emac-sgmii.c             |   2 +-
 drivers/net/ethernet/sfc/efx.c                              |   3 ++
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c           |   7 ++++
 drivers/net/ethernet/ti/cpsw-phy-sel.c                      |   3 ++
 drivers/net/ethernet/ti/davinci_emac.c                      |  10 +++---
 drivers/net/ethernet/toshiba/ps3_gelic_wireless.c           |   2 +-
 drivers/net/ethernet/xscale/ixp4xx_eth.c                    |   3 +-
 drivers/net/macvlan.c                                       |  31 ++++++++++++-----
 drivers/net/phy/phy_device.c                                |   2 ++
 drivers/net/usb/ax88179_178a.c                              |  17 +++++++++
 drivers/net/usb/r8152.c                                     |  21 ++++++-----
 drivers/net/virtio_net.c                                    |  30 ++++++++++------
 drivers/net/vxlan.c                                         |   4 ++-
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c |   2 +-
 drivers/net/wireless/intel/iwlwifi/mvm/d3.c                 |  49 ++++++++++++++++++++------
 drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c            |   4 +--
 drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c           |   3 +-
 drivers/net/wireless/intel/iwlwifi/mvm/mvm.h                |   1 +
 drivers/net/wireless/intel/iwlwifi/mvm/ops.c                |   1 +
 drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c               |   3 +-
 drivers/net/wireless/intel/iwlwifi/mvm/scan.c               |  33 ++++++++++++++----
 drivers/net/wireless/intel/iwlwifi/pcie/drv.c               |  79 +++++++++++++++++++++++++-----------------
 drivers/net/wireless/intel/iwlwifi/pcie/tx.c                |   8 +++++
 drivers/net/xen-netfront.c                                  |   4 +--
 include/linux/ipv6.h                                        |   2 +-
 include/linux/netdevice.h                                   |  15 ++++++++
 include/net/ip.h                                            |   3 +-
 include/net/ip6_tunnel.h                                    |   1 +
 include/net/netfilter/nf_conntrack_labels.h                 |   3 +-
 include/net/netfilter/nf_tables.h                           |   8 +++--
 include/net/sctp/sctp.h                                     |   2 +-
 include/net/sock.h                                          |   4 +--
 include/net/tcp.h                                           |   3 +-
 include/uapi/linux/atm_zatm.h                               |   1 -
 include/uapi/linux/bpqether.h                               |   2 --
 kernel/bpf/hashtab.c                                        |   3 +-
 kernel/bpf/syscall.c                                        |   4 ++-
 kernel/taskstats.c                                          |   6 +++-
 net/can/bcm.c                                               |  32 ++++++++++++-----
 net/core/dev.c                                              |  19 ++++------
 net/core/filter.c                                           |  68 +++++++++++++++++++++++++++++++-----
 net/core/flow_dissector.c                                   |  11 ++++--
 net/core/rtnetlink.c                                        |   1 +
 net/core/sock.c                                             |   6 ++--
 net/dccp/ipv4.c                                             |  16 +++++----
 net/dccp/ipv6.c                                             |  19 +++++-----
 net/dccp/proto.c                                            |   4 +++
 net/ipv4/af_inet.c                                          |   9 +++--
 net/ipv4/fib_trie.c                                         |  21 +++++------
 net/ipv4/icmp.c                                             |   4 +--
 net/ipv4/ip_forward.c                                       |   2 +-
 net/ipv4/ip_output.c                                        |  25 ++++++++------
 net/ipv4/ip_tunnel_core.c                                   |  11 ------
 net/ipv4/ipmr.c                                             |   2 +-
 net/ipv4/netfilter/nft_dup_ipv4.c                           |   6 ++--
 net/ipv4/route.c                                            |   4 ++-
 net/ipv4/tcp.c                                              |   4 +--
 net/ipv4/tcp_dctcp.c                                        |  13 ++++++-
 net/ipv4/tcp_ipv4.c                                         |  19 +++++++++-
 net/ipv6/icmp.c                                             |   2 +-
 net/ipv6/ip6_output.c                                       |   2 +-
 net/ipv6/ip6_udp_tunnel.c                                   |   3 --
 net/ipv6/netfilter/nft_dup_ipv6.c                           |   6 ++--
 net/ipv6/route.c                                            |   4 +++
 net/ipv6/tcp_ipv6.c                                         |  14 +++++---
 net/netfilter/ipvs/ip_vs_ctl.c                              |   2 +-
 net/netfilter/ipvs/ip_vs_sync.c                             |   7 ++--
 net/netfilter/nf_conntrack_core.c                           |  49 +++++++++++++++++++++-----
 net/netfilter/nf_conntrack_helper.c                         |  11 ++++--
 net/netfilter/nf_conntrack_sip.c                            |   5 ++-
 net/netfilter/nf_tables_api.c                               |  18 ++++++----
 net/netfilter/nft_dynset.c                                  |  19 ++++++----
 net/netfilter/nft_set_hash.c                                |  19 +++++++---
 net/netfilter/nft_set_rbtree.c                              |   2 +-
 net/netfilter/xt_connmark.c                                 |   4 +--
 net/netlink/diag.c                                          |   5 +--
 net/netlink/genetlink.c                                     |   4 ++-
 net/sctp/input.c                                            |  35 +++++++++----------
 net/sctp/ipv6.c                                             |   2 +-
 net/sctp/socket.c                                           |  27 +++++++--------
 net/socket.c                                                |   2 ++
 net/unix/af_unix.c                                          |   3 +-
 samples/bpf/Makefile                                        |   4 +++
 samples/bpf/tc_l2_redirect.sh                               | 173 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 samples/bpf/tc_l2_redirect_kern.c                           | 236 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 samples/bpf/tc_l2_redirect_user.c                           |  73 +++++++++++++++++++++++++++++++++++++++
 120 files changed, 1358 insertions(+), 465 deletions(-)
 create mode 100755 samples/bpf/tc_l2_redirect.sh
 create mode 100644 samples/bpf/tc_l2_redirect_kern.c
 create mode 100644 samples/bpf/tc_l2_redirect_user.c

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox