* [PATCN net-next] net_sched: gen_estimator: complete rewrite of rate estimators
From: Eric Dumazet @ 2016-12-04 7:07 UTC (permalink / raw)
To: David Miller; +Cc: netdev, netfilter-devel
From: Eric Dumazet <edumazet@google.com>
1) Old code was hard to maintain, due to complex lock chains.
(We probably will be able to remove some kfree_rcu() in callers)
2) Using a single timer to update all estimators does not scale.
3) Code was buggy on 32bit kernel (WRITE_ONCE() on 64bit quantity
is not supposed to work well)
In this rewrite :
- I removed the RB tree that had to be scanned in
gen_estimator_active(). qdisc dumps should be much faster.
- Each estimator has its own timer.
- Estimations are maintained in net_rate_estimator structure,
instead of dirtying the qdisc. Minor, but part of the simplification.
- Reading the estimator uses RCU and a seqcount to provide proper
support for 32bit kernels.
- We reduce memory need when estimators are not used, since
we store a pointer, instead of the bytes/packets counters.
- xt_rateest_mt() no longer has to grab a spinlock.
(In the future, xt_rateest_tg() could be switched to per cpu counters)
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/net/act_api.h | 2
include/net/gen_stats.h | 17 -
include/net/netfilter/xt_rateest.h | 10
include/net/sch_generic.h | 2
net/core/gen_estimator.c | 298 +++++++++------------------
net/core/gen_stats.c | 17 -
net/ipv4/tcp_output.c | 8
net/netfilter/xt_RATEEST.c | 4
net/netfilter/xt_rateest.c | 28 +-
net/sched/act_api.c | 9
net/sched/act_police.c | 21 +
net/sched/sch_api.c | 2
net/sched/sch_cbq.c | 6
net/sched/sch_drr.c | 6
net/sched/sch_generic.c | 2
net/sched/sch_hfsc.c | 6
net/sched/sch_htb.c | 6
net/sched/sch_qfq.c | 8
18 files changed, 188 insertions(+), 264 deletions(-)
diff --git a/include/net/act_api.h b/include/net/act_api.h
index 9dddf77a69ccbcb003cfa66bcc0de337f78f3dae..1d716449209e4753a297c61a287077a1eb96e6d8 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -36,7 +36,7 @@ struct tc_action {
struct tcf_t tcfa_tm;
struct gnet_stats_basic_packed tcfa_bstats;
struct gnet_stats_queue tcfa_qstats;
- struct gnet_stats_rate_est64 tcfa_rate_est;
+ struct net_rate_estimator __rcu *tcfa_rate_est;
spinlock_t tcfa_lock;
struct rcu_head tcfa_rcu;
struct gnet_stats_basic_cpu __percpu *cpu_bstats;
diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
index 231e121cc7d9c72075e7e6dde3655d631f64a1c4..8b7aa370e7a4af61fcb71ed751dba72ebead6143 100644
--- a/include/net/gen_stats.h
+++ b/include/net/gen_stats.h
@@ -11,6 +11,8 @@ struct gnet_stats_basic_cpu {
struct u64_stats_sync syncp;
};
+struct net_rate_estimator;
+
struct gnet_dump {
spinlock_t * lock;
struct sk_buff * skb;
@@ -42,8 +44,7 @@ void __gnet_stats_copy_basic(const seqcount_t *running,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b);
int gnet_stats_copy_rate_est(struct gnet_dump *d,
- const struct gnet_stats_basic_packed *b,
- struct gnet_stats_rate_est64 *r);
+ struct net_rate_estimator __rcu **ptr);
int gnet_stats_copy_queue(struct gnet_dump *d,
struct gnet_stats_queue __percpu *cpu_q,
struct gnet_stats_queue *q, __u32 qlen);
@@ -53,16 +54,16 @@ int gnet_stats_finish_copy(struct gnet_dump *d);
int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
- struct gnet_stats_rate_est64 *rate_est,
+ struct net_rate_estimator __rcu **rate_est,
spinlock_t *stats_lock,
seqcount_t *running, struct nlattr *opt);
-void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
- struct gnet_stats_rate_est64 *rate_est);
+void gen_kill_estimator(struct net_rate_estimator __rcu **ptr);
int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
- struct gnet_stats_rate_est64 *rate_est,
+ struct net_rate_estimator __rcu **ptr,
spinlock_t *stats_lock,
seqcount_t *running, struct nlattr *opt);
-bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
- const struct gnet_stats_rate_est64 *rate_est);
+bool gen_estimator_active(struct net_rate_estimator __rcu **ptr);
+bool gen_estimator_read(struct net_rate_estimator __rcu **ptr,
+ struct gnet_stats_rate_est64 *sample);
#endif
diff --git a/include/net/netfilter/xt_rateest.h b/include/net/netfilter/xt_rateest.h
index 79f45e19f31eaa54deb8437ef4b883bec78def84..130e58361f998ecdf361910b196e69778224d955 100644
--- a/include/net/netfilter/xt_rateest.h
+++ b/include/net/netfilter/xt_rateest.h
@@ -1,19 +1,23 @@
#ifndef _XT_RATEEST_H
#define _XT_RATEEST_H
+#include <net/gen_stats.h>
+
struct xt_rateest {
/* keep lock and bstats on same cache line to speedup xt_rateest_tg() */
struct gnet_stats_basic_packed bstats;
spinlock_t lock;
- /* keep rstats and lock on same cache line to speedup xt_rateest_mt() */
- struct gnet_stats_rate_est64 rstats;
+
/* following fields not accessed in hot path */
+ unsigned int refcnt;
struct hlist_node list;
char name[IFNAMSIZ];
- unsigned int refcnt;
struct gnet_estimator params;
struct rcu_head rcu;
+
+ /* keep this field far away to speedup xt_rateest_mt() */
+ struct net_rate_estimator __rcu *rate_est;
};
struct xt_rateest *xt_rateest_lookup(const char *name);
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index e6aa0a249672a802dce583166f4f808154b77a96..498f81b229a4565e140f1a054ce931e80361e8b2 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -76,7 +76,7 @@ struct Qdisc {
struct netdev_queue *dev_queue;
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
struct gnet_stats_basic_cpu __percpu *cpu_bstats;
struct gnet_stats_queue __percpu *cpu_qstats;
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 0993844faeeaefb221ea619fd9d796600b82fbb9..d9cc2e0c1b8f18b417c91dd2f57e646b7c602a63 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -7,6 +7,7 @@
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * Eric Dumazet <edumazet@google.com>
*
* Changes:
* Jamal Hadi Salim - moved it to net/core and reshulfed
@@ -30,171 +31,79 @@
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
-#include <linux/rbtree.h>
#include <linux/slab.h>
+#include <linux/seqlock.h>
#include <net/sock.h>
#include <net/gen_stats.h>
-/*
- This code is NOT intended to be used for statistics collection,
- its purpose is to provide a base for statistical multiplexing
- for controlled load service.
- If you need only statistics, run a user level daemon which
- periodically reads byte counters.
-
- Unfortunately, rate estimation is not a very easy task.
- F.e. I did not find a simple way to estimate the current peak rate
- and even failed to formulate the problem 8)8)
-
- So I preferred not to built an estimator into the scheduler,
- but run this task separately.
- Ideally, it should be kernel thread(s), but for now it runs
- from timers, which puts apparent top bounds on the number of rated
- flows, has minimal overhead on small, but is enough
- to handle controlled load service, sets of aggregates.
-
- We measure rate over A=(1<<interval) seconds and evaluate EWMA:
-
- avrate = avrate*(1-W) + rate*W
-
- where W is chosen as negative power of 2: W = 2^(-ewma_log)
-
- The resulting time constant is:
-
- T = A/(-ln(1-W))
-
-
- NOTES.
-
- * avbps and avpps are scaled by 2^5.
- * both values are reported as 32 bit unsigned values. bps can
- overflow for fast links : max speed being 34360Mbit/sec
- * Minimal interval is HZ/4=250msec (it is the greatest common divisor
- for HZ=100 and HZ=1024 8)), maximal interval
- is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
- are too expensive, longer ones can be implemented
- at user level painlessly.
+/* This code is NOT intended to be used for statistics collection,
+ * its purpose is to provide a base for statistical multiplexing
+ * for controlled load service.
+ * If you need only statistics, run a user level daemon which
+ * periodically reads byte counters.
*/
-#define EST_MAX_INTERVAL 5
-
-struct gen_estimator {
- struct list_head list;
+struct net_rate_estimator {
struct gnet_stats_basic_packed *bstats;
- struct gnet_stats_rate_est64 *rate_est;
spinlock_t *stats_lock;
seqcount_t *running;
- int ewma_log;
+ struct gnet_stats_basic_cpu __percpu *cpu_bstats;
+ u8 ewma_log;
+ u8 intvl_log; /* period : (250ms << intvl_log) */
+
+ seqcount_t seq;
u32 last_packets;
- unsigned long avpps;
u64 last_bytes;
+
+ u64 avpps;
u64 avbps;
- struct rcu_head e_rcu;
- struct rb_node node;
- struct gnet_stats_basic_cpu __percpu *cpu_bstats;
- struct rcu_head head;
-};
-struct gen_estimator_head {
- unsigned long next_jiffies;
- struct timer_list timer;
- struct list_head list;
+ unsigned long next_jiffies;
+ struct timer_list timer;
+ struct rcu_head rcu;
};
-static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
-
-/* Protects against NULL dereference */
-static DEFINE_RWLOCK(est_lock);
-
-/* Protects against soft lockup during large deletion */
-static struct rb_root est_root = RB_ROOT;
-static DEFINE_SPINLOCK(est_tree_lock);
-
-static void est_timer(unsigned long arg)
+static void est_fetch_counters(struct net_rate_estimator *e,
+ struct gnet_stats_basic_packed *b)
{
- int idx = (int)arg;
- struct gen_estimator *e;
+ if (e->stats_lock)
+ spin_lock(e->stats_lock);
- rcu_read_lock();
- list_for_each_entry_rcu(e, &elist[idx].list, list) {
- struct gnet_stats_basic_packed b = {0};
- unsigned long rate;
- u64 brate;
-
- if (e->stats_lock)
- spin_lock(e->stats_lock);
- read_lock(&est_lock);
- if (e->bstats == NULL)
- goto skip;
-
- __gnet_stats_copy_basic(e->running, &b, e->cpu_bstats, e->bstats);
-
- brate = (b.bytes - e->last_bytes)<<(7 - idx);
- e->last_bytes = b.bytes;
- e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
- WRITE_ONCE(e->rate_est->bps, (e->avbps + 0xF) >> 5);
-
- rate = b.packets - e->last_packets;
- rate <<= (7 - idx);
- e->last_packets = b.packets;
- e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
- WRITE_ONCE(e->rate_est->pps, (e->avpps + 0xF) >> 5);
-skip:
- read_unlock(&est_lock);
- if (e->stats_lock)
- spin_unlock(e->stats_lock);
- }
+ __gnet_stats_copy_basic(e->running, b, e->cpu_bstats, e->bstats);
- if (!list_empty(&elist[idx].list)) {
- elist[idx].next_jiffies += ((HZ/4) << idx);
+ if (e->stats_lock)
+ spin_unlock(e->stats_lock);
- if (unlikely(time_after_eq(jiffies, elist[idx].next_jiffies))) {
- /* Ouch... timer was delayed. */
- elist[idx].next_jiffies = jiffies + 1;
- }
- mod_timer(&elist[idx].timer, elist[idx].next_jiffies);
- }
- rcu_read_unlock();
}
-static void gen_add_node(struct gen_estimator *est)
+static void est_timer(unsigned long arg)
{
- struct rb_node **p = &est_root.rb_node, *parent = NULL;
+ struct net_rate_estimator *est = (struct net_rate_estimator *)arg;
+ struct gnet_stats_basic_packed b;
+ u64 rate, brate;
- while (*p) {
- struct gen_estimator *e;
+ est_fetch_counters(est, &b);
+ brate = (b.bytes - est->last_bytes) << (8 - est->ewma_log);
+ brate -= (est->avbps >> est->ewma_log);
- parent = *p;
- e = rb_entry(parent, struct gen_estimator, node);
+ rate = (u64)(b.packets - est->last_packets) << (8 - est->ewma_log);
+ rate -= (est->avpps >> est->ewma_log);
- if (est->bstats > e->bstats)
- p = &parent->rb_right;
- else
- p = &parent->rb_left;
- }
- rb_link_node(&est->node, parent, p);
- rb_insert_color(&est->node, &est_root);
-}
-
-static
-struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats,
- const struct gnet_stats_rate_est64 *rate_est)
-{
- struct rb_node *p = est_root.rb_node;
+ write_seqcount_begin(&est->seq);
+ est->avbps += brate;
+ est->avpps += rate;
+ write_seqcount_end(&est->seq);
- while (p) {
- struct gen_estimator *e;
+ est->last_bytes = b.bytes;
+ est->last_packets = b.packets;
- e = rb_entry(p, struct gen_estimator, node);
+ est->next_jiffies += ((HZ/4) << est->intvl_log);
- if (bstats > e->bstats)
- p = p->rb_right;
- else if (bstats < e->bstats || rate_est != e->rate_est)
- p = p->rb_left;
- else
- return e;
+ if (unlikely(time_after_eq(jiffies, est->next_jiffies))) {
+ /* Ouch... timer was delayed. */
+ est->next_jiffies = jiffies + 1;
}
- return NULL;
+ mod_timer(&est->timer, est->next_jiffies);
}
/**
@@ -217,84 +126,76 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
*/
int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
- struct gnet_stats_rate_est64 *rate_est,
+ struct net_rate_estimator __rcu **rate_est,
spinlock_t *stats_lock,
seqcount_t *running,
struct nlattr *opt)
{
- struct gen_estimator *est;
struct gnet_estimator *parm = nla_data(opt);
- struct gnet_stats_basic_packed b = {0};
- int idx;
+ struct net_rate_estimator *old, *est;
+ struct gnet_stats_basic_packed b;
+ int intvl_log;
if (nla_len(opt) < sizeof(*parm))
return -EINVAL;
+ /* allowed timer periods are :
+ * -2 : 250ms, -1 : 500ms, 0 : 1 sec
+ * 1 : 2 sec, 2 : 4 sec, 3 : 8 sec
+ */
if (parm->interval < -2 || parm->interval > 3)
return -EINVAL;
est = kzalloc(sizeof(*est), GFP_KERNEL);
- if (est == NULL)
+ if (!est)
return -ENOBUFS;
- __gnet_stats_copy_basic(running, &b, cpu_bstats, bstats);
-
- idx = parm->interval + 2;
+ seqcount_init(&est->seq);
+ intvl_log = parm->interval + 2;
est->bstats = bstats;
- est->rate_est = rate_est;
est->stats_lock = stats_lock;
est->running = running;
est->ewma_log = parm->ewma_log;
- est->last_bytes = b.bytes;
- est->avbps = rate_est->bps<<5;
- est->last_packets = b.packets;
- est->avpps = rate_est->pps<<10;
+ est->intvl_log = intvl_log;
est->cpu_bstats = cpu_bstats;
- spin_lock_bh(&est_tree_lock);
- if (!elist[idx].timer.function) {
- INIT_LIST_HEAD(&elist[idx].list);
- setup_timer(&elist[idx].timer, est_timer, idx);
+ est_fetch_counters(est, &b);
+ est->last_bytes = b.bytes;
+ est->last_packets = b.packets;
+ old = rcu_dereference_protected(*rate_est, 1);
+ if (old) {
+ del_timer_sync(&old->timer);
+ est->avbps = old->avbps;
+ est->avpps = old->avpps;
}
- if (list_empty(&elist[idx].list)) {
- elist[idx].next_jiffies = jiffies + ((HZ/4) << idx);
- mod_timer(&elist[idx].timer, elist[idx].next_jiffies);
- }
- list_add_rcu(&est->list, &elist[idx].list);
- gen_add_node(est);
- spin_unlock_bh(&est_tree_lock);
+ est->next_jiffies = jiffies + ((HZ/4) << intvl_log);
+ setup_timer(&est->timer, est_timer, (unsigned long)est);
+ mod_timer(&est->timer, est->next_jiffies);
+ rcu_assign_pointer(*rate_est, est);
+ if (old)
+ kfree_rcu(old, rcu);
return 0;
}
EXPORT_SYMBOL(gen_new_estimator);
/**
* gen_kill_estimator - remove a rate estimator
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
+ * @est: rate estimator
*
- * Removes the rate estimator specified by &bstats and &rate_est.
+ * Removes the rate estimator.
*
- * Note : Caller should respect an RCU grace period before freeing stats_lock
*/
-void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
- struct gnet_stats_rate_est64 *rate_est)
+void gen_kill_estimator(struct net_rate_estimator __rcu **ptr)
{
- struct gen_estimator *e;
-
- spin_lock_bh(&est_tree_lock);
- while ((e = gen_find_node(bstats, rate_est))) {
- rb_erase(&e->node, &est_root);
+ struct net_rate_estimator *est;
- write_lock(&est_lock);
- e->bstats = NULL;
- write_unlock(&est_lock);
-
- list_del_rcu(&e->list);
- kfree_rcu(e, e_rcu);
+ est = xchg((__force struct net_rate_estimator **)ptr, NULL);
+ if (est) {
+ del_timer_sync(&est->timer);
+ kfree_rcu(est, rcu);
}
- spin_unlock_bh(&est_tree_lock);
}
EXPORT_SYMBOL(gen_kill_estimator);
@@ -314,33 +215,46 @@ EXPORT_SYMBOL(gen_kill_estimator);
*/
int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
- struct gnet_stats_rate_est64 *rate_est,
+ struct net_rate_estimator __rcu **ptr,
spinlock_t *stats_lock,
seqcount_t *running, struct nlattr *opt)
{
- gen_kill_estimator(bstats, rate_est);
- return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt);
+ return gen_new_estimator(bstats, cpu_bstats, ptr, stats_lock, running, opt);
}
EXPORT_SYMBOL(gen_replace_estimator);
/**
* gen_estimator_active - test if estimator is currently in use
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
+ * @rate_est: rate estimator
*
* Returns true if estimator is active, and false if not.
*/
-bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
- const struct gnet_stats_rate_est64 *rate_est)
+bool gen_estimator_active(struct net_rate_estimator __rcu **ptr)
{
- bool res;
+ return !!rcu_access_pointer(*ptr);
+}
+EXPORT_SYMBOL(gen_estimator_active);
- ASSERT_RTNL();
+bool gen_estimator_read(struct net_rate_estimator __rcu **ptr,
+ struct gnet_stats_rate_est64 *sample)
+{
+ struct net_rate_estimator *est;
+ unsigned seq;
+
+ rcu_read_lock();
+ est = rcu_dereference(*ptr);
+ if (!est) {
+ rcu_read_unlock();
+ return false;
+ }
- spin_lock_bh(&est_tree_lock);
- res = gen_find_node(bstats, rate_est) != NULL;
- spin_unlock_bh(&est_tree_lock);
+ do {
+ seq = read_seqcount_begin(&est->seq);
+ sample->bps = est->avbps >> 8;
+ sample->pps = est->avpps >> 8;
+ } while (read_seqcount_retry(&est->seq, seq));
- return res;
+ rcu_read_unlock();
+ return true;
}
-EXPORT_SYMBOL(gen_estimator_active);
+EXPORT_SYMBOL(gen_estimator_read);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 508e051304fb62627e61b5065b2325edd1b84f2e..55fd980568d855d9558afcdcf37d3d316ae20acc 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -205,18 +205,17 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
*/
int
gnet_stats_copy_rate_est(struct gnet_dump *d,
- const struct gnet_stats_basic_packed *b,
- struct gnet_stats_rate_est64 *r)
+ struct net_rate_estimator __rcu **ptr)
{
+ struct gnet_stats_rate_est64 sample;
struct gnet_stats_rate_est est;
int res;
- if (b && !gen_estimator_active(b, r))
+ if (!gen_estimator_read(ptr, &sample))
return 0;
-
- est.bps = min_t(u64, UINT_MAX, r->bps);
+ est.bps = min_t(u64, UINT_MAX, sample.bps);
/* we have some time before reaching 2^32 packets per second */
- est.pps = r->pps;
+ est.pps = sample.pps;
if (d->compat_tc_stats) {
d->tc_stats.bps = est.bps;
@@ -226,11 +225,11 @@ gnet_stats_copy_rate_est(struct gnet_dump *d,
if (d->tail) {
res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est),
TCA_STATS_PAD);
- if (res < 0 || est.bps == r->bps)
+ if (res < 0 || est.bps == sample.bps)
return res;
/* emit 64bit stats only if needed */
- return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r),
- TCA_STATS_PAD);
+ return gnet_stats_copy(d, TCA_STATS_RATE_EST64, &sample,
+ sizeof(sample), TCA_STATS_PAD);
}
return 0;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c7adcb57654ea57d1ba6702c91743cb7d2c74d28..859b60bfa86712031186fffc09c65bc43aa065dd 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2081,6 +2081,14 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
limit <<= factor;
if (atomic_read(&sk->sk_wmem_alloc) > limit) {
+ /* Special case where TX completion is delayed too much :
+ * If the skb we try to send is the first skb in write queue,
+ * then send it !
+ * No need to wait for TX completion to call us back.
+ */
+ if (skb == sk->sk_write_queue.next)
+ return false;
+
set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags);
/* It is possible TX completion already happened
* before we set TSQ_THROTTLED, so we must
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index dbd6c4a12b97435d2937c92fd79a035fd5828965..91a373a3f534de8d8641341c33c08d8fc49cbd29 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -63,7 +63,7 @@ void xt_rateest_put(struct xt_rateest *est)
mutex_lock(&xt_rateest_mutex);
if (--est->refcnt == 0) {
hlist_del(&est->list);
- gen_kill_estimator(&est->bstats, &est->rstats);
+ gen_kill_estimator(&est->rate_est);
/*
* gen_estimator est_timer() might access est->lock or bstats,
* wait a RCU grace period before freeing 'est'
@@ -132,7 +132,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
cfg.est.interval = info->interval;
cfg.est.ewma_log = info->ewma_log;
- ret = gen_new_estimator(&est->bstats, NULL, &est->rstats,
+ ret = gen_new_estimator(&est->bstats, NULL, &est->rate_est,
&est->lock, NULL, &cfg.opt);
if (ret < 0)
goto err2;
diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c
index 7720b036d76a84c949080c8c32827b604c80da64..1db02f6fca54d7eb5d60c2d8c5cb8ab11656fbcb 100644
--- a/net/netfilter/xt_rateest.c
+++ b/net/netfilter/xt_rateest.c
@@ -18,35 +18,33 @@ static bool
xt_rateest_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_rateest_match_info *info = par->matchinfo;
- struct gnet_stats_rate_est64 *r;
+ struct gnet_stats_rate_est64 sample = {0};
u_int32_t bps1, bps2, pps1, pps2;
bool ret = true;
- spin_lock_bh(&info->est1->lock);
- r = &info->est1->rstats;
+ gen_estimator_read(&info->est1->rate_est, &sample);
+
if (info->flags & XT_RATEEST_MATCH_DELTA) {
- bps1 = info->bps1 >= r->bps ? info->bps1 - r->bps : 0;
- pps1 = info->pps1 >= r->pps ? info->pps1 - r->pps : 0;
+ bps1 = info->bps1 >= sample.bps ? info->bps1 - sample.bps : 0;
+ pps1 = info->pps1 >= sample.pps ? info->pps1 - sample.pps : 0;
} else {
- bps1 = r->bps;
- pps1 = r->pps;
+ bps1 = sample.bps;
+ pps1 = sample.pps;
}
- spin_unlock_bh(&info->est1->lock);
if (info->flags & XT_RATEEST_MATCH_ABS) {
bps2 = info->bps2;
pps2 = info->pps2;
} else {
- spin_lock_bh(&info->est2->lock);
- r = &info->est2->rstats;
+ gen_estimator_read(&info->est2->rate_est, &sample);
+
if (info->flags & XT_RATEEST_MATCH_DELTA) {
- bps2 = info->bps2 >= r->bps ? info->bps2 - r->bps : 0;
- pps2 = info->pps2 >= r->pps ? info->pps2 - r->pps : 0;
+ bps2 = info->bps2 >= sample.bps ? info->bps2 - sample.bps : 0;
+ pps2 = info->pps2 >= sample.pps ? info->pps2 - sample.pps : 0;
} else {
- bps2 = r->bps;
- pps2 = r->pps;
+ bps2 = sample.bps;
+ pps2 = sample.pps;
}
- spin_unlock_bh(&info->est2->lock);
}
switch (info->mode) {
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index f893d180da1caa3b6dd1cc8773920beb1885f9b0..2095c83ce7730d49550103a8efad943d28815617 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -41,8 +41,7 @@ static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *p)
spin_lock_bh(&hinfo->lock);
hlist_del(&p->tcfa_head);
spin_unlock_bh(&hinfo->lock);
- gen_kill_estimator(&p->tcfa_bstats,
- &p->tcfa_rate_est);
+ gen_kill_estimator(&p->tcfa_rate_est);
/*
* gen_estimator est_timer() might access p->tcfa_lock
* or bstats, wait a RCU grace period before freeing p
@@ -237,8 +236,7 @@ EXPORT_SYMBOL(tcf_hash_check);
void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est)
{
if (est)
- gen_kill_estimator(&a->tcfa_bstats,
- &a->tcfa_rate_est);
+ gen_kill_estimator(&a->tcfa_rate_est);
call_rcu(&a->tcfa_rcu, free_tcf);
}
EXPORT_SYMBOL(tcf_hash_cleanup);
@@ -670,8 +668,7 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
goto errout;
if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 ||
- gnet_stats_copy_rate_est(&d, &p->tcfa_bstats,
- &p->tcfa_rate_est) < 0 ||
+ gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 ||
gnet_stats_copy_queue(&d, p->cpu_qstats,
&p->tcfa_qstats,
p->tcfa_qstats.qlen) < 0)
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index c990b73a6c85151862c30971bd3787d20dbcecd1..0ba91d1ce99480c4817684dbe0b4fde61811e03e 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -142,8 +142,7 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
goto failure_unlock;
} else if (tb[TCA_POLICE_AVRATE] &&
(ret == ACT_P_CREATED ||
- !gen_estimator_active(&police->tcf_bstats,
- &police->tcf_rate_est))) {
+ !gen_estimator_active(&police->tcf_rate_est))) {
err = -EINVAL;
goto failure_unlock;
}
@@ -216,13 +215,17 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
bstats_update(&police->tcf_bstats, skb);
tcf_lastuse_update(&police->tcf_tm);
- if (police->tcfp_ewma_rate &&
- police->tcf_rate_est.bps >= police->tcfp_ewma_rate) {
- police->tcf_qstats.overlimits++;
- if (police->tcf_action == TC_ACT_SHOT)
- police->tcf_qstats.drops++;
- spin_unlock(&police->tcf_lock);
- return police->tcf_action;
+ if (police->tcfp_ewma_rate) {
+ struct gnet_stats_rate_est64 sample;
+
+ if (!gen_estimator_read(&police->tcf_rate_est, &sample) ||
+ sample.bps >= police->tcfp_ewma_rate) {
+ police->tcf_qstats.overlimits++;
+ if (police->tcf_action == TC_ACT_SHOT)
+ police->tcf_qstats.drops++;
+ spin_unlock(&police->tcf_lock);
+ return police->tcf_action;
+ }
}
if (qdisc_pkt_len(skb) <= police->tcfp_mtu) {
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index f337f1bdd1d4a4cac738f9fe1fbd42e5984174fe..d7b93429f0cca61ff489536b4247f31f3690fdd8 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1395,7 +1395,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
&d, cpu_bstats, &q->bstats) < 0 ||
- gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
goto nla_put_failure;
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index beb554aa8cfbb4acb5d89511b5e0030b4c9cf26e..9ffe1c220b02848032474fa7b9b2c2f09d40b110 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -122,7 +122,7 @@ struct cbq_class {
psched_time_t penalized;
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
struct tc_cbq_xstats xstats;
struct tcf_proto __rcu *filter_list;
@@ -1346,7 +1346,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
return -1;
@@ -1405,7 +1405,7 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
tcf_destroy_chain(&cl->filter_list);
qdisc_destroy(cl->q);
qdisc_put_rtab(cl->R_tab);
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
if (cl != &q->link)
kfree(cl);
}
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 8af5c59eef848db47cc33a878296693dabb44955..bb4cbdf7500482b170eef6e7923cf2f2259e52b5 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -25,7 +25,7 @@ struct drr_class {
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
struct list_head alist;
struct Qdisc *qdisc;
@@ -142,7 +142,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl)
{
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
qdisc_destroy(cl->qdisc);
kfree(cl);
}
@@ -283,7 +283,7 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg,
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0)
return -1;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6cfb6e9038c28187cc888cebf004e61270f00871..6eb9c8e88519a36fc3ef82be7c7f1dbe0ef1e13c 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -709,7 +709,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
- gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
+ gen_kill_estimator(&qdisc->rate_est);
if (ops->reset)
ops->reset(qdisc);
if (ops->destroy)
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 000f1d36128e1abfc0657bce779a7df852f66384..3ffaa6fb0990f0aa31487a2f1829b1f1accf8b21 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -114,7 +114,7 @@ struct hfsc_class {
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
struct tcf_proto __rcu *filter_list; /* filter list */
unsigned int filter_cnt; /* filter count */
unsigned int level; /* class level in hierarchy */
@@ -1091,7 +1091,7 @@ hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
tcf_destroy_chain(&cl->filter_list);
qdisc_destroy(cl->qdisc);
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
if (cl != &q->root)
kfree(cl);
}
@@ -1348,7 +1348,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
xstats.rtwork = cl->cl_cumul;
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->qdisc->q.qlen) < 0)
return -1;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 9926fe4f3b6f7db9aeba22fbbfae3d47c4e2af60..760f39e7caeeb91b6c34d561f64f1ed97c884a32 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -111,7 +111,7 @@ struct htb_class {
unsigned int children;
struct htb_class *parent; /* parent class */
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
/*
* Written often fields
@@ -1145,7 +1145,7 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
return -1;
@@ -1228,7 +1228,7 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
WARN_ON(!cl->un.leaf.q);
qdisc_destroy(cl->un.leaf.q);
}
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
tcf_destroy_chain(&cl->filter_list);
kfree(cl);
}
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index ca0516e6f74356f47dffcc2262f233ee50f0c47c..f9e712ce2d15ce9280c31d2f75d62b84034ae51d 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -137,7 +137,7 @@ struct qfq_class {
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
struct Qdisc *qdisc;
struct list_head alist; /* Link for active-classes list. */
struct qfq_aggregate *agg; /* Parent aggregate. */
@@ -508,7 +508,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL);
if (new_agg == NULL) {
err = -ENOBUFS;
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
goto destroy_class;
}
sch_tree_lock(sch);
@@ -533,7 +533,7 @@ static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
struct qfq_sched *q = qdisc_priv(sch);
qfq_rm_from_agg(q, cl);
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
qdisc_destroy(cl->qdisc);
kfree(cl);
}
@@ -667,7 +667,7 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL,
&cl->qdisc->qstats, cl->qdisc->q.qlen) < 0)
return -1;
^ permalink raw reply related
* [PATCN v2 net-next] net_sched: gen_estimator: complete rewrite of rate estimators
From: Eric Dumazet @ 2016-12-04 7:18 UTC (permalink / raw)
To: David Miller; +Cc: netdev, netfilter-devel
In-Reply-To: <1480835273.18162.457.camel@edumazet-glaptop3.roam.corp.google.com>
From: Eric Dumazet <edumazet@google.com>
1) Old code was hard to maintain, due to complex lock chains.
(We probably will be able to remove some kfree_rcu() in callers)
2) Using a single timer to update all estimators does not scale.
3) Code was buggy on 32bit kernel (WRITE_ONCE() on 64bit quantity
is not supposed to work well)
In this rewrite :
- I removed the RB tree that had to be scanned in
gen_estimator_active(). qdisc dumps should be much faster.
- Each estimator has its own timer.
- Estimations are maintained in net_rate_estimator structure,
instead of dirtying the qdisc. Minor, but part of the simplification.
- Reading the estimator uses RCU and a seqcount to provide proper
support for 32bit kernels.
- We reduce memory need when estimators are not used, since
we store a pointer, instead of the bytes/packets counters.
- xt_rateest_mt() no longer has to grab a spinlock.
(In the future, xt_rateest_tg() could be switched to per cpu counters)
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
v2: removed unwanted changes to tcp_output.c
Renamed some parameters to please htmldoc
include/net/act_api.h | 2
include/net/gen_stats.h | 17 -
include/net/netfilter/xt_rateest.h | 10
include/net/sch_generic.h | 2
net/core/gen_estimator.c | 299 +++++++++------------------
net/core/gen_stats.c | 17 -
net/netfilter/xt_RATEEST.c | 4
net/netfilter/xt_rateest.c | 28 +-
net/sched/act_api.c | 9
net/sched/act_police.c | 21 +
net/sched/sch_api.c | 2
net/sched/sch_cbq.c | 6
net/sched/sch_drr.c | 6
net/sched/sch_generic.c | 2
net/sched/sch_hfsc.c | 6
net/sched/sch_htb.c | 6
net/sched/sch_qfq.c | 8
17 files changed, 181 insertions(+), 264 deletions(-)
diff --git a/include/net/act_api.h b/include/net/act_api.h
index 9dddf77a69ccbcb003cfa66bcc0de337f78f3dae..1d716449209e4753a297c61a287077a1eb96e6d8 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -36,7 +36,7 @@ struct tc_action {
struct tcf_t tcfa_tm;
struct gnet_stats_basic_packed tcfa_bstats;
struct gnet_stats_queue tcfa_qstats;
- struct gnet_stats_rate_est64 tcfa_rate_est;
+ struct net_rate_estimator __rcu *tcfa_rate_est;
spinlock_t tcfa_lock;
struct rcu_head tcfa_rcu;
struct gnet_stats_basic_cpu __percpu *cpu_bstats;
diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
index 231e121cc7d9c72075e7e6dde3655d631f64a1c4..8b7aa370e7a4af61fcb71ed751dba72ebead6143 100644
--- a/include/net/gen_stats.h
+++ b/include/net/gen_stats.h
@@ -11,6 +11,8 @@ struct gnet_stats_basic_cpu {
struct u64_stats_sync syncp;
};
+struct net_rate_estimator;
+
struct gnet_dump {
spinlock_t * lock;
struct sk_buff * skb;
@@ -42,8 +44,7 @@ void __gnet_stats_copy_basic(const seqcount_t *running,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b);
int gnet_stats_copy_rate_est(struct gnet_dump *d,
- const struct gnet_stats_basic_packed *b,
- struct gnet_stats_rate_est64 *r);
+ struct net_rate_estimator __rcu **ptr);
int gnet_stats_copy_queue(struct gnet_dump *d,
struct gnet_stats_queue __percpu *cpu_q,
struct gnet_stats_queue *q, __u32 qlen);
@@ -53,16 +54,16 @@ int gnet_stats_finish_copy(struct gnet_dump *d);
int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
- struct gnet_stats_rate_est64 *rate_est,
+ struct net_rate_estimator __rcu **rate_est,
spinlock_t *stats_lock,
seqcount_t *running, struct nlattr *opt);
-void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
- struct gnet_stats_rate_est64 *rate_est);
+void gen_kill_estimator(struct net_rate_estimator __rcu **ptr);
int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
- struct gnet_stats_rate_est64 *rate_est,
+ struct net_rate_estimator __rcu **ptr,
spinlock_t *stats_lock,
seqcount_t *running, struct nlattr *opt);
-bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
- const struct gnet_stats_rate_est64 *rate_est);
+bool gen_estimator_active(struct net_rate_estimator __rcu **ptr);
+bool gen_estimator_read(struct net_rate_estimator __rcu **ptr,
+ struct gnet_stats_rate_est64 *sample);
#endif
diff --git a/include/net/netfilter/xt_rateest.h b/include/net/netfilter/xt_rateest.h
index 79f45e19f31eaa54deb8437ef4b883bec78def84..130e58361f998ecdf361910b196e69778224d955 100644
--- a/include/net/netfilter/xt_rateest.h
+++ b/include/net/netfilter/xt_rateest.h
@@ -1,19 +1,23 @@
#ifndef _XT_RATEEST_H
#define _XT_RATEEST_H
+#include <net/gen_stats.h>
+
struct xt_rateest {
/* keep lock and bstats on same cache line to speedup xt_rateest_tg() */
struct gnet_stats_basic_packed bstats;
spinlock_t lock;
- /* keep rstats and lock on same cache line to speedup xt_rateest_mt() */
- struct gnet_stats_rate_est64 rstats;
+
/* following fields not accessed in hot path */
+ unsigned int refcnt;
struct hlist_node list;
char name[IFNAMSIZ];
- unsigned int refcnt;
struct gnet_estimator params;
struct rcu_head rcu;
+
+ /* keep this field far away to speedup xt_rateest_mt() */
+ struct net_rate_estimator __rcu *rate_est;
};
struct xt_rateest *xt_rateest_lookup(const char *name);
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index e6aa0a249672a802dce583166f4f808154b77a96..498f81b229a4565e140f1a054ce931e80361e8b2 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -76,7 +76,7 @@ struct Qdisc {
struct netdev_queue *dev_queue;
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
struct gnet_stats_basic_cpu __percpu *cpu_bstats;
struct gnet_stats_queue __percpu *cpu_qstats;
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 0993844faeeaefb221ea619fd9d796600b82fbb9..101b5d0e2142e6a813f6b524a59945379f02bcb3 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -7,6 +7,7 @@
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * Eric Dumazet <edumazet@google.com>
*
* Changes:
* Jamal Hadi Salim - moved it to net/core and reshulfed
@@ -30,171 +31,79 @@
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
-#include <linux/rbtree.h>
#include <linux/slab.h>
+#include <linux/seqlock.h>
#include <net/sock.h>
#include <net/gen_stats.h>
-/*
- This code is NOT intended to be used for statistics collection,
- its purpose is to provide a base for statistical multiplexing
- for controlled load service.
- If you need only statistics, run a user level daemon which
- periodically reads byte counters.
-
- Unfortunately, rate estimation is not a very easy task.
- F.e. I did not find a simple way to estimate the current peak rate
- and even failed to formulate the problem 8)8)
-
- So I preferred not to built an estimator into the scheduler,
- but run this task separately.
- Ideally, it should be kernel thread(s), but for now it runs
- from timers, which puts apparent top bounds on the number of rated
- flows, has minimal overhead on small, but is enough
- to handle controlled load service, sets of aggregates.
-
- We measure rate over A=(1<<interval) seconds and evaluate EWMA:
-
- avrate = avrate*(1-W) + rate*W
-
- where W is chosen as negative power of 2: W = 2^(-ewma_log)
-
- The resulting time constant is:
-
- T = A/(-ln(1-W))
-
-
- NOTES.
-
- * avbps and avpps are scaled by 2^5.
- * both values are reported as 32 bit unsigned values. bps can
- overflow for fast links : max speed being 34360Mbit/sec
- * Minimal interval is HZ/4=250msec (it is the greatest common divisor
- for HZ=100 and HZ=1024 8)), maximal interval
- is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
- are too expensive, longer ones can be implemented
- at user level painlessly.
+/* This code is NOT intended to be used for statistics collection,
+ * its purpose is to provide a base for statistical multiplexing
+ * for controlled load service.
+ * If you need only statistics, run a user level daemon which
+ * periodically reads byte counters.
*/
-#define EST_MAX_INTERVAL 5
-
-struct gen_estimator {
- struct list_head list;
+struct net_rate_estimator {
struct gnet_stats_basic_packed *bstats;
- struct gnet_stats_rate_est64 *rate_est;
spinlock_t *stats_lock;
seqcount_t *running;
- int ewma_log;
+ struct gnet_stats_basic_cpu __percpu *cpu_bstats;
+ u8 ewma_log;
+ u8 intvl_log; /* period : (250ms << intvl_log) */
+
+ seqcount_t seq;
u32 last_packets;
- unsigned long avpps;
u64 last_bytes;
+
+ u64 avpps;
u64 avbps;
- struct rcu_head e_rcu;
- struct rb_node node;
- struct gnet_stats_basic_cpu __percpu *cpu_bstats;
- struct rcu_head head;
-};
-struct gen_estimator_head {
- unsigned long next_jiffies;
- struct timer_list timer;
- struct list_head list;
+ unsigned long next_jiffies;
+ struct timer_list timer;
+ struct rcu_head rcu;
};
-static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
-
-/* Protects against NULL dereference */
-static DEFINE_RWLOCK(est_lock);
-
-/* Protects against soft lockup during large deletion */
-static struct rb_root est_root = RB_ROOT;
-static DEFINE_SPINLOCK(est_tree_lock);
-
-static void est_timer(unsigned long arg)
+static void est_fetch_counters(struct net_rate_estimator *e,
+ struct gnet_stats_basic_packed *b)
{
- int idx = (int)arg;
- struct gen_estimator *e;
+ if (e->stats_lock)
+ spin_lock(e->stats_lock);
- rcu_read_lock();
- list_for_each_entry_rcu(e, &elist[idx].list, list) {
- struct gnet_stats_basic_packed b = {0};
- unsigned long rate;
- u64 brate;
-
- if (e->stats_lock)
- spin_lock(e->stats_lock);
- read_lock(&est_lock);
- if (e->bstats == NULL)
- goto skip;
-
- __gnet_stats_copy_basic(e->running, &b, e->cpu_bstats, e->bstats);
-
- brate = (b.bytes - e->last_bytes)<<(7 - idx);
- e->last_bytes = b.bytes;
- e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
- WRITE_ONCE(e->rate_est->bps, (e->avbps + 0xF) >> 5);
-
- rate = b.packets - e->last_packets;
- rate <<= (7 - idx);
- e->last_packets = b.packets;
- e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
- WRITE_ONCE(e->rate_est->pps, (e->avpps + 0xF) >> 5);
-skip:
- read_unlock(&est_lock);
- if (e->stats_lock)
- spin_unlock(e->stats_lock);
- }
+ __gnet_stats_copy_basic(e->running, b, e->cpu_bstats, e->bstats);
- if (!list_empty(&elist[idx].list)) {
- elist[idx].next_jiffies += ((HZ/4) << idx);
+ if (e->stats_lock)
+ spin_unlock(e->stats_lock);
- if (unlikely(time_after_eq(jiffies, elist[idx].next_jiffies))) {
- /* Ouch... timer was delayed. */
- elist[idx].next_jiffies = jiffies + 1;
- }
- mod_timer(&elist[idx].timer, elist[idx].next_jiffies);
- }
- rcu_read_unlock();
}
-static void gen_add_node(struct gen_estimator *est)
+static void est_timer(unsigned long arg)
{
- struct rb_node **p = &est_root.rb_node, *parent = NULL;
+ struct net_rate_estimator *est = (struct net_rate_estimator *)arg;
+ struct gnet_stats_basic_packed b;
+ u64 rate, brate;
- while (*p) {
- struct gen_estimator *e;
+ est_fetch_counters(est, &b);
+ brate = (b.bytes - est->last_bytes) << (8 - est->ewma_log);
+ brate -= (est->avbps >> est->ewma_log);
- parent = *p;
- e = rb_entry(parent, struct gen_estimator, node);
+ rate = (u64)(b.packets - est->last_packets) << (8 - est->ewma_log);
+ rate -= (est->avpps >> est->ewma_log);
- if (est->bstats > e->bstats)
- p = &parent->rb_right;
- else
- p = &parent->rb_left;
- }
- rb_link_node(&est->node, parent, p);
- rb_insert_color(&est->node, &est_root);
-}
-
-static
-struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats,
- const struct gnet_stats_rate_est64 *rate_est)
-{
- struct rb_node *p = est_root.rb_node;
+ write_seqcount_begin(&est->seq);
+ est->avbps += brate;
+ est->avpps += rate;
+ write_seqcount_end(&est->seq);
- while (p) {
- struct gen_estimator *e;
+ est->last_bytes = b.bytes;
+ est->last_packets = b.packets;
- e = rb_entry(p, struct gen_estimator, node);
+ est->next_jiffies += ((HZ/4) << est->intvl_log);
- if (bstats > e->bstats)
- p = p->rb_right;
- else if (bstats < e->bstats || rate_est != e->rate_est)
- p = p->rb_left;
- else
- return e;
+ if (unlikely(time_after_eq(jiffies, est->next_jiffies))) {
+ /* Ouch... timer was delayed. */
+ est->next_jiffies = jiffies + 1;
}
- return NULL;
+ mod_timer(&est->timer, est->next_jiffies);
}
/**
@@ -217,84 +126,76 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
*/
int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
- struct gnet_stats_rate_est64 *rate_est,
+ struct net_rate_estimator __rcu **rate_est,
spinlock_t *stats_lock,
seqcount_t *running,
struct nlattr *opt)
{
- struct gen_estimator *est;
struct gnet_estimator *parm = nla_data(opt);
- struct gnet_stats_basic_packed b = {0};
- int idx;
+ struct net_rate_estimator *old, *est;
+ struct gnet_stats_basic_packed b;
+ int intvl_log;
if (nla_len(opt) < sizeof(*parm))
return -EINVAL;
+ /* allowed timer periods are :
+ * -2 : 250ms, -1 : 500ms, 0 : 1 sec
+ * 1 : 2 sec, 2 : 4 sec, 3 : 8 sec
+ */
if (parm->interval < -2 || parm->interval > 3)
return -EINVAL;
est = kzalloc(sizeof(*est), GFP_KERNEL);
- if (est == NULL)
+ if (!est)
return -ENOBUFS;
- __gnet_stats_copy_basic(running, &b, cpu_bstats, bstats);
-
- idx = parm->interval + 2;
+ seqcount_init(&est->seq);
+ intvl_log = parm->interval + 2;
est->bstats = bstats;
- est->rate_est = rate_est;
est->stats_lock = stats_lock;
est->running = running;
est->ewma_log = parm->ewma_log;
- est->last_bytes = b.bytes;
- est->avbps = rate_est->bps<<5;
- est->last_packets = b.packets;
- est->avpps = rate_est->pps<<10;
+ est->intvl_log = intvl_log;
est->cpu_bstats = cpu_bstats;
- spin_lock_bh(&est_tree_lock);
- if (!elist[idx].timer.function) {
- INIT_LIST_HEAD(&elist[idx].list);
- setup_timer(&elist[idx].timer, est_timer, idx);
+ est_fetch_counters(est, &b);
+ est->last_bytes = b.bytes;
+ est->last_packets = b.packets;
+ old = rcu_dereference_protected(*rate_est, 1);
+ if (old) {
+ del_timer_sync(&old->timer);
+ est->avbps = old->avbps;
+ est->avpps = old->avpps;
}
- if (list_empty(&elist[idx].list)) {
- elist[idx].next_jiffies = jiffies + ((HZ/4) << idx);
- mod_timer(&elist[idx].timer, elist[idx].next_jiffies);
- }
- list_add_rcu(&est->list, &elist[idx].list);
- gen_add_node(est);
- spin_unlock_bh(&est_tree_lock);
+ est->next_jiffies = jiffies + ((HZ/4) << intvl_log);
+ setup_timer(&est->timer, est_timer, (unsigned long)est);
+ mod_timer(&est->timer, est->next_jiffies);
+ rcu_assign_pointer(*rate_est, est);
+ if (old)
+ kfree_rcu(old, rcu);
return 0;
}
EXPORT_SYMBOL(gen_new_estimator);
/**
* gen_kill_estimator - remove a rate estimator
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
+ * @rate_est: rate estimator
*
- * Removes the rate estimator specified by &bstats and &rate_est.
+ * Removes the rate estimator.
*
- * Note : Caller should respect an RCU grace period before freeing stats_lock
*/
-void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
- struct gnet_stats_rate_est64 *rate_est)
+void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est)
{
- struct gen_estimator *e;
-
- spin_lock_bh(&est_tree_lock);
- while ((e = gen_find_node(bstats, rate_est))) {
- rb_erase(&e->node, &est_root);
+ struct net_rate_estimator *est;
- write_lock(&est_lock);
- e->bstats = NULL;
- write_unlock(&est_lock);
-
- list_del_rcu(&e->list);
- kfree_rcu(e, e_rcu);
+ est = xchg((__force struct net_rate_estimator **)rate_est, NULL);
+ if (est) {
+ del_timer_sync(&est->timer);
+ kfree_rcu(est, rcu);
}
- spin_unlock_bh(&est_tree_lock);
}
EXPORT_SYMBOL(gen_kill_estimator);
@@ -314,33 +215,47 @@ EXPORT_SYMBOL(gen_kill_estimator);
*/
int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
- struct gnet_stats_rate_est64 *rate_est,
+ struct net_rate_estimator __rcu **rate_est,
spinlock_t *stats_lock,
seqcount_t *running, struct nlattr *opt)
{
- gen_kill_estimator(bstats, rate_est);
- return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt);
+ return gen_new_estimator(bstats, cpu_bstats, rate_est,
+ stats_lock, running, opt);
}
EXPORT_SYMBOL(gen_replace_estimator);
/**
* gen_estimator_active - test if estimator is currently in use
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
+ * @rate_est: rate estimator
*
* Returns true if estimator is active, and false if not.
*/
-bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
- const struct gnet_stats_rate_est64 *rate_est)
+bool gen_estimator_active(struct net_rate_estimator __rcu **rate_est)
{
- bool res;
+ return !!rcu_access_pointer(*rate_est);
+}
+EXPORT_SYMBOL(gen_estimator_active);
- ASSERT_RTNL();
+bool gen_estimator_read(struct net_rate_estimator __rcu **rate_est,
+ struct gnet_stats_rate_est64 *sample)
+{
+ struct net_rate_estimator *est;
+ unsigned seq;
+
+ rcu_read_lock();
+ est = rcu_dereference(*rate_est);
+ if (!est) {
+ rcu_read_unlock();
+ return false;
+ }
- spin_lock_bh(&est_tree_lock);
- res = gen_find_node(bstats, rate_est) != NULL;
- spin_unlock_bh(&est_tree_lock);
+ do {
+ seq = read_seqcount_begin(&est->seq);
+ sample->bps = est->avbps >> 8;
+ sample->pps = est->avpps >> 8;
+ } while (read_seqcount_retry(&est->seq, seq));
- return res;
+ rcu_read_unlock();
+ return true;
}
-EXPORT_SYMBOL(gen_estimator_active);
+EXPORT_SYMBOL(gen_estimator_read);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 508e051304fb62627e61b5065b2325edd1b84f2e..55fd980568d855d9558afcdcf37d3d316ae20acc 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -205,18 +205,17 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
*/
int
gnet_stats_copy_rate_est(struct gnet_dump *d,
- const struct gnet_stats_basic_packed *b,
- struct gnet_stats_rate_est64 *r)
+ struct net_rate_estimator __rcu **ptr)
{
+ struct gnet_stats_rate_est64 sample;
struct gnet_stats_rate_est est;
int res;
- if (b && !gen_estimator_active(b, r))
+ if (!gen_estimator_read(ptr, &sample))
return 0;
-
- est.bps = min_t(u64, UINT_MAX, r->bps);
+ est.bps = min_t(u64, UINT_MAX, sample.bps);
/* we have some time before reaching 2^32 packets per second */
- est.pps = r->pps;
+ est.pps = sample.pps;
if (d->compat_tc_stats) {
d->tc_stats.bps = est.bps;
@@ -226,11 +225,11 @@ gnet_stats_copy_rate_est(struct gnet_dump *d,
if (d->tail) {
res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est),
TCA_STATS_PAD);
- if (res < 0 || est.bps == r->bps)
+ if (res < 0 || est.bps == sample.bps)
return res;
/* emit 64bit stats only if needed */
- return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r),
- TCA_STATS_PAD);
+ return gnet_stats_copy(d, TCA_STATS_RATE_EST64, &sample,
+ sizeof(sample), TCA_STATS_PAD);
}
return 0;
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index dbd6c4a12b97435d2937c92fd79a035fd5828965..91a373a3f534de8d8641341c33c08d8fc49cbd29 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -63,7 +63,7 @@ void xt_rateest_put(struct xt_rateest *est)
mutex_lock(&xt_rateest_mutex);
if (--est->refcnt == 0) {
hlist_del(&est->list);
- gen_kill_estimator(&est->bstats, &est->rstats);
+ gen_kill_estimator(&est->rate_est);
/*
* gen_estimator est_timer() might access est->lock or bstats,
* wait a RCU grace period before freeing 'est'
@@ -132,7 +132,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
cfg.est.interval = info->interval;
cfg.est.ewma_log = info->ewma_log;
- ret = gen_new_estimator(&est->bstats, NULL, &est->rstats,
+ ret = gen_new_estimator(&est->bstats, NULL, &est->rate_est,
&est->lock, NULL, &cfg.opt);
if (ret < 0)
goto err2;
diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c
index 7720b036d76a84c949080c8c32827b604c80da64..1db02f6fca54d7eb5d60c2d8c5cb8ab11656fbcb 100644
--- a/net/netfilter/xt_rateest.c
+++ b/net/netfilter/xt_rateest.c
@@ -18,35 +18,33 @@ static bool
xt_rateest_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_rateest_match_info *info = par->matchinfo;
- struct gnet_stats_rate_est64 *r;
+ struct gnet_stats_rate_est64 sample = {0};
u_int32_t bps1, bps2, pps1, pps2;
bool ret = true;
- spin_lock_bh(&info->est1->lock);
- r = &info->est1->rstats;
+ gen_estimator_read(&info->est1->rate_est, &sample);
+
if (info->flags & XT_RATEEST_MATCH_DELTA) {
- bps1 = info->bps1 >= r->bps ? info->bps1 - r->bps : 0;
- pps1 = info->pps1 >= r->pps ? info->pps1 - r->pps : 0;
+ bps1 = info->bps1 >= sample.bps ? info->bps1 - sample.bps : 0;
+ pps1 = info->pps1 >= sample.pps ? info->pps1 - sample.pps : 0;
} else {
- bps1 = r->bps;
- pps1 = r->pps;
+ bps1 = sample.bps;
+ pps1 = sample.pps;
}
- spin_unlock_bh(&info->est1->lock);
if (info->flags & XT_RATEEST_MATCH_ABS) {
bps2 = info->bps2;
pps2 = info->pps2;
} else {
- spin_lock_bh(&info->est2->lock);
- r = &info->est2->rstats;
+ gen_estimator_read(&info->est2->rate_est, &sample);
+
if (info->flags & XT_RATEEST_MATCH_DELTA) {
- bps2 = info->bps2 >= r->bps ? info->bps2 - r->bps : 0;
- pps2 = info->pps2 >= r->pps ? info->pps2 - r->pps : 0;
+ bps2 = info->bps2 >= sample.bps ? info->bps2 - sample.bps : 0;
+ pps2 = info->pps2 >= sample.pps ? info->pps2 - sample.pps : 0;
} else {
- bps2 = r->bps;
- pps2 = r->pps;
+ bps2 = sample.bps;
+ pps2 = sample.pps;
}
- spin_unlock_bh(&info->est2->lock);
}
switch (info->mode) {
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index f893d180da1caa3b6dd1cc8773920beb1885f9b0..2095c83ce7730d49550103a8efad943d28815617 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -41,8 +41,7 @@ static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *p)
spin_lock_bh(&hinfo->lock);
hlist_del(&p->tcfa_head);
spin_unlock_bh(&hinfo->lock);
- gen_kill_estimator(&p->tcfa_bstats,
- &p->tcfa_rate_est);
+ gen_kill_estimator(&p->tcfa_rate_est);
/*
* gen_estimator est_timer() might access p->tcfa_lock
* or bstats, wait a RCU grace period before freeing p
@@ -237,8 +236,7 @@ EXPORT_SYMBOL(tcf_hash_check);
void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est)
{
if (est)
- gen_kill_estimator(&a->tcfa_bstats,
- &a->tcfa_rate_est);
+ gen_kill_estimator(&a->tcfa_rate_est);
call_rcu(&a->tcfa_rcu, free_tcf);
}
EXPORT_SYMBOL(tcf_hash_cleanup);
@@ -670,8 +668,7 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
goto errout;
if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 ||
- gnet_stats_copy_rate_est(&d, &p->tcfa_bstats,
- &p->tcfa_rate_est) < 0 ||
+ gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 ||
gnet_stats_copy_queue(&d, p->cpu_qstats,
&p->tcfa_qstats,
p->tcfa_qstats.qlen) < 0)
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index c990b73a6c85151862c30971bd3787d20dbcecd1..0ba91d1ce99480c4817684dbe0b4fde61811e03e 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -142,8 +142,7 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
goto failure_unlock;
} else if (tb[TCA_POLICE_AVRATE] &&
(ret == ACT_P_CREATED ||
- !gen_estimator_active(&police->tcf_bstats,
- &police->tcf_rate_est))) {
+ !gen_estimator_active(&police->tcf_rate_est))) {
err = -EINVAL;
goto failure_unlock;
}
@@ -216,13 +215,17 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
bstats_update(&police->tcf_bstats, skb);
tcf_lastuse_update(&police->tcf_tm);
- if (police->tcfp_ewma_rate &&
- police->tcf_rate_est.bps >= police->tcfp_ewma_rate) {
- police->tcf_qstats.overlimits++;
- if (police->tcf_action == TC_ACT_SHOT)
- police->tcf_qstats.drops++;
- spin_unlock(&police->tcf_lock);
- return police->tcf_action;
+ if (police->tcfp_ewma_rate) {
+ struct gnet_stats_rate_est64 sample;
+
+ if (!gen_estimator_read(&police->tcf_rate_est, &sample) ||
+ sample.bps >= police->tcfp_ewma_rate) {
+ police->tcf_qstats.overlimits++;
+ if (police->tcf_action == TC_ACT_SHOT)
+ police->tcf_qstats.drops++;
+ spin_unlock(&police->tcf_lock);
+ return police->tcf_action;
+ }
}
if (qdisc_pkt_len(skb) <= police->tcfp_mtu) {
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index f337f1bdd1d4a4cac738f9fe1fbd42e5984174fe..d7b93429f0cca61ff489536b4247f31f3690fdd8 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1395,7 +1395,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
&d, cpu_bstats, &q->bstats) < 0 ||
- gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
goto nla_put_failure;
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index beb554aa8cfbb4acb5d89511b5e0030b4c9cf26e..9ffe1c220b02848032474fa7b9b2c2f09d40b110 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -122,7 +122,7 @@ struct cbq_class {
psched_time_t penalized;
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
struct tc_cbq_xstats xstats;
struct tcf_proto __rcu *filter_list;
@@ -1346,7 +1346,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
return -1;
@@ -1405,7 +1405,7 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
tcf_destroy_chain(&cl->filter_list);
qdisc_destroy(cl->q);
qdisc_put_rtab(cl->R_tab);
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
if (cl != &q->link)
kfree(cl);
}
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 8af5c59eef848db47cc33a878296693dabb44955..bb4cbdf7500482b170eef6e7923cf2f2259e52b5 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -25,7 +25,7 @@ struct drr_class {
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
struct list_head alist;
struct Qdisc *qdisc;
@@ -142,7 +142,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl)
{
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
qdisc_destroy(cl->qdisc);
kfree(cl);
}
@@ -283,7 +283,7 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg,
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0)
return -1;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6cfb6e9038c28187cc888cebf004e61270f00871..6eb9c8e88519a36fc3ef82be7c7f1dbe0ef1e13c 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -709,7 +709,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
- gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
+ gen_kill_estimator(&qdisc->rate_est);
if (ops->reset)
ops->reset(qdisc);
if (ops->destroy)
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 000f1d36128e1abfc0657bce779a7df852f66384..3ffaa6fb0990f0aa31487a2f1829b1f1accf8b21 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -114,7 +114,7 @@ struct hfsc_class {
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
struct tcf_proto __rcu *filter_list; /* filter list */
unsigned int filter_cnt; /* filter count */
unsigned int level; /* class level in hierarchy */
@@ -1091,7 +1091,7 @@ hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
tcf_destroy_chain(&cl->filter_list);
qdisc_destroy(cl->qdisc);
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
if (cl != &q->root)
kfree(cl);
}
@@ -1348,7 +1348,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
xstats.rtwork = cl->cl_cumul;
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->qdisc->q.qlen) < 0)
return -1;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 9926fe4f3b6f7db9aeba22fbbfae3d47c4e2af60..760f39e7caeeb91b6c34d561f64f1ed97c884a32 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -111,7 +111,7 @@ struct htb_class {
unsigned int children;
struct htb_class *parent; /* parent class */
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
/*
* Written often fields
@@ -1145,7 +1145,7 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
return -1;
@@ -1228,7 +1228,7 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
WARN_ON(!cl->un.leaf.q);
qdisc_destroy(cl->un.leaf.q);
}
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
tcf_destroy_chain(&cl->filter_list);
kfree(cl);
}
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index ca0516e6f74356f47dffcc2262f233ee50f0c47c..f9e712ce2d15ce9280c31d2f75d62b84034ae51d 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -137,7 +137,7 @@ struct qfq_class {
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
struct Qdisc *qdisc;
struct list_head alist; /* Link for active-classes list. */
struct qfq_aggregate *agg; /* Parent aggregate. */
@@ -508,7 +508,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL);
if (new_agg == NULL) {
err = -ENOBUFS;
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
goto destroy_class;
}
sch_tree_lock(sch);
@@ -533,7 +533,7 @@ static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
struct qfq_sched *q = qdisc_priv(sch);
qfq_rm_from_agg(q, cl);
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
qdisc_destroy(cl->qdisc);
kfree(cl);
}
@@ -667,7 +667,7 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL,
&cl->qdisc->qstats, cl->qdisc->q.qlen) < 0)
return -1;
^ permalink raw reply related
* Re: [PATCN net-next] net_sched: gen_estimator: complete rewrite of rate estimators
From: Eric Dumazet @ 2016-12-04 7:12 UTC (permalink / raw)
To: David Miller; +Cc: netdev, netfilter-devel
In-Reply-To: <1480835273.18162.457.camel@edumazet-glaptop3.roam.corp.google.com>
On Sat, 2016-12-03 at 23:07 -0800, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> 1) Old code was hard to maintain, due to complex lock chains.
> (We probably will be able to remove some kfree_rcu() in callers)
>
> 2) Using a single timer to update all estimators does not scale.
>
> 3) Code was buggy on 32bit kernel (WRITE_ONCE() on 64bit quantity
> is not supposed to work well)
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index c7adcb57654ea57d1ba6702c91743cb7d2c74d28..859b60bfa86712031186fffc09c65bc43aa065dd 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -2081,6 +2081,14 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
> limit <<= factor;
>
> if (atomic_read(&sk->sk_wmem_alloc) > limit) {
> + /* Special case where TX completion is delayed too much :
> + * If the skb we try to send is the first skb in write queue,
> + * then send it !
> + * No need to wait for TX completion to call us back.
> + */
> + if (skb == sk->sk_write_queue.next)
> + return false;
> +
Oups, this has nothing to do here. I will send a v2, sorry.
^ permalink raw reply
* [PATCH 1/1] net: ethernet: broadcom: fix improper return value
From: Pan Bian @ 2016-12-04 6:29 UTC (permalink / raw)
To: Yuval Mintz, Ariel Elior, everest-linux-l2, netdev; +Cc: linux-kernel, Pan Bian
From: Pan Bian <bianpan2016@163.com>
Marco BNX2X_ALLOC_AND_SET(arr, lbl, func) calls kmalloc() to allocate
memory, and jumps to label "lbl" if the allocation fails. Label "lbl"
first cleans memory and then returns variable rc. Before calling the
macro, the value of variable rc is 0. Because 0 means no error, the
callers of bnx2x_init_firmware() may be misled. This patch fixes the bug,
assigning "-ENOMEM" to rc before calling macro NX2X_ALLOC_AND_SET().
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=189141
Signed-off-by: Pan Bian <bianpan2016@163.com>
---
drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 0cee4c0..6f9fc20 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -13505,6 +13505,7 @@ static int bnx2x_init_firmware(struct bnx2x *bp)
/* Initialize the pointers to the init arrays */
/* Blob */
+ rc = -ENOMEM;
BNX2X_ALLOC_AND_SET(init_data, request_firmware_exit, be32_to_cpu_n);
/* Opcodes */
--
1.9.1
^ permalink raw reply related
* [PATCH 1/1] net: ethernet: qlogic: fix improper return value
From: Pan Bian @ 2016-12-04 6:15 UTC (permalink / raw)
To: Harish Patil, Manish Chopra, Dept-GELinuxNICDev, netdev
Cc: linux-kernel, Pan Bian
From: Pan Bian <bianpan2016@163.com>
When the call to qlcnic_alloc_mbx_args() fails, returning variable "err"
seems improper. With reference to the context, returing variable
"config" may be better.
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=189101
Signed-off-by: Pan Bian <bianpan2016@163.com>
---
drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
index bdbcd2b..21c4aca 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
@@ -3189,7 +3189,7 @@ int qlcnic_83xx_test_link(struct qlcnic_adapter *adapter)
err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_GET_LINK_STATUS);
if (err)
- return err;
+ return config;
err = qlcnic_issue_cmd(adapter, &cmd);
if (err) {
--
1.9.1
^ permalink raw reply related
* Re: [PATCH v3 07/13] net: ethernet: ti: cpts: clean up event list if event pool is empty
From: kbuild test robot @ 2016-12-04 6:09 UTC (permalink / raw)
To: Grygorii Strashko
Cc: kbuild-all-JC7UmRfGjtg, David S. Miller,
netdev-u79uwXL29TY76Z2rM5mHXA, Mugunthan V N, Richard Cochran,
Sekhar Nori, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
linux-omap-u79uwXL29TY76Z2rM5mHXA, Rob Herring,
devicetree-u79uwXL29TY76Z2rM5mHXA, Murali Karicheri, Wingman Kwok,
Thomas Gleixner, Grygorii Strashko
In-Reply-To: <20161202203023.25526-8-grygorii.strashko-l0cyMroinI0@public.gmane.org>
[-- Attachment #1: Type: text/plain, Size: 3916 bytes --]
Hi WingMan,
[auto build test ERROR on net/master]
[also build test ERROR on v4.9-rc7 next-20161202]
[cannot apply to net-next/master]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Grygorii-Strashko/net-ethernet-ti-cpts-switch-to-readl-writel_relaxed/20161204-010355
config: arm-omap2plus_defconfig (attached as .config)
compiler: arm-linux-gnueabi-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=arm
Note: the linux-review/Grygorii-Strashko/net-ethernet-ti-cpts-switch-to-readl-writel_relaxed/20161204-010355 HEAD f938805dce662197c057c75ac67849f60da87c9f builds fine.
It only hurts bisectibility.
All errors (new ones prefixed by >>):
In file included from include/linux/dma-mapping.h:6:0,
from include/linux/skbuff.h:34,
from include/linux/ip.h:20,
from include/linux/ptp_classify.h:26,
from drivers/net/ethernet/ti/cpts.c:25:
drivers/net/ethernet/ti/cpts.c: In function 'cpts_purge_events':
>> drivers/net/ethernet/ti/cpts.c:76:15: error: 'struct cpts' has no member named 'dev'
dev_dbg(cpts->dev, "cpts: event pool cleaned up %d\n", removed);
^
include/linux/device.h:1209:26: note: in definition of macro 'dev_dbg'
dev_printk(KERN_DEBUG, dev, format, ##arg); \
^~~
drivers/net/ethernet/ti/cpts.c: In function 'cpts_fifo_read':
drivers/net/ethernet/ti/cpts.c:94:16: error: 'struct cpts' has no member named 'dev'
dev_err(cpts->dev, "cpts: event pool empty\n");
^~
vim +76 drivers/net/ethernet/ti/cpts.c
19 */
20 #include <linux/err.h>
21 #include <linux/if.h>
22 #include <linux/hrtimer.h>
23 #include <linux/module.h>
24 #include <linux/net_tstamp.h>
> 25 #include <linux/ptp_classify.h>
26 #include <linux/time.h>
27 #include <linux/uaccess.h>
28 #include <linux/workqueue.h>
29 #include <linux/if_ether.h>
30 #include <linux/if_vlan.h>
31
32 #include "cpts.h"
33
34 #define cpts_read32(c, r) readl_relaxed(&c->reg->r)
35 #define cpts_write32(c, v, r) writel_relaxed(v, &c->reg->r)
36
37 static int event_expired(struct cpts_event *event)
38 {
39 return time_after(jiffies, event->tmo);
40 }
41
42 static int event_type(struct cpts_event *event)
43 {
44 return (event->high >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
45 }
46
47 static int cpts_fifo_pop(struct cpts *cpts, u32 *high, u32 *low)
48 {
49 u32 r = cpts_read32(cpts, intstat_raw);
50
51 if (r & TS_PEND_RAW) {
52 *high = cpts_read32(cpts, event_high);
53 *low = cpts_read32(cpts, event_low);
54 cpts_write32(cpts, EVENT_POP, event_pop);
55 return 0;
56 }
57 return -1;
58 }
59
60 static int cpts_purge_events(struct cpts *cpts)
61 {
62 struct list_head *this, *next;
63 struct cpts_event *event;
64 int removed = 0;
65
66 list_for_each_safe(this, next, &cpts->events) {
67 event = list_entry(this, struct cpts_event, list);
68 if (event_expired(event)) {
69 list_del_init(&event->list);
70 list_add(&event->list, &cpts->pool);
71 ++removed;
72 }
73 }
74
75 if (removed)
> 76 dev_dbg(cpts->dev, "cpts: event pool cleaned up %d\n", removed);
77 return removed ? 0 : -1;
78 }
79
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 28475 bytes --]
^ permalink raw reply
* [PATCH 1/1] net: ethernet: qlogic: set error code on failure
From: Pan Bian @ 2016-12-04 5:53 UTC (permalink / raw)
To: Yuval Mintz, Ariel Elior, everest-linux-l2, netdev; +Cc: linux-kernel, Pan Bian
From: Pan Bian <bianpan2016@163.com>
When calling dma_mapping_error(), the value of return variable rc is 0.
And when the call returns an unexpected value, rc is not set to a
negative errno. Thus, it will return 0 on the error path, and its
callers cannot detect the bug. This patch fixes the bug, assigning
"-ENOMEM" to err.
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=189041
Signed-off-by: Pan Bian <bianpan2016@163.com>
---
drivers/net/ethernet/qlogic/qed/qed_ll2.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index f95385c..62ae55b 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -1730,6 +1730,7 @@ static int qed_ll2_start_xmit(struct qed_dev *cdev, struct sk_buff *skb)
mapping))) {
DP_NOTICE(cdev,
"Unable to map frag - dropping packet\n");
+ rc = -ENOMEM;
goto err;
}
} else {
--
1.9.1
^ permalink raw reply related
* [PATCH 1/1] atm: fix improper return value
From: Pan Bian @ 2016-12-04 5:45 UTC (permalink / raw)
To: Chas Williams, linux-atm-general, netdev; +Cc: linux-kernel, Pan Bian
From: Pan Bian <bianpan2016@163.com>
It returns variable "error" when ioremap_nocache() returns a NULL
pointer. The value of "error" is 0 then, which will mislead the callers
to believe that there is no error. This patch fixes the bug, returning
"-ENOMEM".
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=189021
Signed-off-by: Pan Bian <bianpan2016@163.com>
---
drivers/atm/eni.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/atm/eni.c b/drivers/atm/eni.c
index f2aaf9e..40c2d56 100644
--- a/drivers/atm/eni.c
+++ b/drivers/atm/eni.c
@@ -1727,7 +1727,7 @@ static int eni_do_init(struct atm_dev *dev)
printk("\n");
printk(KERN_ERR DEV_LABEL "(itf %d): can't set up page "
"mapping\n",dev->number);
- return error;
+ return -ENOMEM;
}
eni_dev->ioaddr = base;
eni_dev->base_diff = real_base - (unsigned long) base;
--
1.9.1
^ permalink raw reply related
* Re: [net-next PATCH v4 1/6] net: virtio dynamically disable/enable LRO
From: Michael S. Tsirkin @ 2016-12-04 5:36 UTC (permalink / raw)
To: John Fastabend
Cc: daniel, shm, davem, tgraf, alexei.starovoitov, john.r.fastabend,
netdev, bblanco, brouer
In-Reply-To: <20161202204945.4331.2419.stgit@john-Precision-Tower-5810>
On Fri, Dec 02, 2016 at 12:49:45PM -0800, John Fastabend wrote:
> This adds support for dynamically setting the LRO feature flag. The
> message to control guest features in the backend uses the
> CTRL_GUEST_OFFLOADS msg type.
>
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---
> drivers/net/virtio_net.c | 45 ++++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 44 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index a21d93a..d814e7cb 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -1419,6 +1419,41 @@ static void virtnet_init_settings(struct net_device *dev)
> .set_settings = virtnet_set_settings,
> };
>
> +static int virtnet_set_features(struct net_device *netdev,
> + netdev_features_t features)
> +{
> + struct virtnet_info *vi = netdev_priv(netdev);
> + struct virtio_device *vdev = vi->vdev;
> + struct scatterlist sg;
> + u64 offloads = 0;
> +
> + if (features & NETIF_F_LRO)
> + offloads |= (1 << VIRTIO_NET_F_GUEST_TSO4) |
> + (1 << VIRTIO_NET_F_GUEST_TSO6);
> +
> + if (features & NETIF_F_RXCSUM)
> + offloads |= (1 << VIRTIO_NET_F_GUEST_CSUM);
> +
> + if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
> + sg_init_one(&sg, &offloads, sizeof(uint64_t));
> + if (!virtnet_send_command(vi,
> + VIRTIO_NET_CTRL_GUEST_OFFLOADS,
> + VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET,
> + &sg)) {
> + dev_warn(&netdev->dev,
> + "Failed to set guest offloads by virtnet command.\n");
> + return -EINVAL;
> + }
> + } else if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) &&
> + !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
No need for VIRTIO_F_VERSION_1 here.
> + dev_warn(&netdev->dev,
> + "No support for setting offloads pre version_1.\n");
> + return -EINVAL;
> + }
> +
> + return 0;
> +}
> +
> static const struct net_device_ops virtnet_netdev = {
> .ndo_open = virtnet_open,
> .ndo_stop = virtnet_close,
> @@ -1435,6 +1470,7 @@ static void virtnet_init_settings(struct net_device *dev)
> #ifdef CONFIG_NET_RX_BUSY_POLL
> .ndo_busy_poll = virtnet_busy_poll,
> #endif
> + .ndo_set_features = virtnet_set_features,
> };
>
> static void virtnet_config_changed_work(struct work_struct *work)
> @@ -1815,6 +1851,12 @@ static int virtnet_probe(struct virtio_device *vdev)
> if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
> dev->features |= NETIF_F_RXCSUM;
>
> + if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) &&
> + virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6)) {
> + dev->features |= NETIF_F_LRO;
> + dev->hw_features |= NETIF_F_LRO;
> + }
> +
> dev->vlan_features = dev->features;
>
> /* MTU range: 68 - 65535 */
> @@ -2057,7 +2099,8 @@ static int virtnet_restore(struct virtio_device *vdev)
> VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
> VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
> VIRTIO_NET_F_CTRL_MAC_ADDR, \
> - VIRTIO_NET_F_MTU
> + VIRTIO_NET_F_MTU, \
> + VIRTIO_NET_F_CTRL_GUEST_OFFLOADS
>
> static unsigned int features[] = {
> VIRTNET_FEATURES,
^ permalink raw reply
* [PATCH 1/1] net: irda: set error code on failures
From: Pan Bian @ 2016-12-04 5:27 UTC (permalink / raw)
To: Samuel Ortiz, netdev; +Cc: linux-kernel, Pan Bian
From: Pan Bian <bianpan2016@163.com>
When the calls to kzalloc() fail, the value of return variable ret may
be 0. 0 means success in this context. This patch fixes the bug,
assigning "-ENOMEM" to ret before calling kzalloc().
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=188971
Signed-off-by: Pan Bian <bianpan2016@163.com>
---
drivers/net/irda/irda-usb.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/net/irda/irda-usb.c b/drivers/net/irda/irda-usb.c
index a198946..8716b8c 100644
--- a/drivers/net/irda/irda-usb.c
+++ b/drivers/net/irda/irda-usb.c
@@ -1723,6 +1723,7 @@ static int irda_usb_probe(struct usb_interface *intf,
/* Don't change this buffer size and allocation without doing
* some heavy and complete testing. Don't ask why :-(
* Jean II */
+ ret = -ENOMEM;
self->speed_buff = kzalloc(IRDA_USB_SPEED_MTU, GFP_KERNEL);
if (!self->speed_buff)
goto err_out_3;
--
1.9.1
^ permalink raw reply related
* [PATCH 1/1] isdn: hisax: set error code on failure
From: Pan Bian @ 2016-12-04 5:15 UTC (permalink / raw)
To: Karsten Keil, netdev; +Cc: linux-kernel, Pan Bian
From: Pan Bian <bianpan2016@163.com>
In function hfc4s8s_probe(), the value of return variable err should be
negative on failures. However, when the call to request_region() returns
NULL, the value of err is 0. This patch fixes the bug, assiging
"-ENOMEM" to err on the path that request_region() fails.
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=188931
Signed-off-by: Pan Bian <bianpan2016@163.com>
---
drivers/isdn/hisax/hfc4s8s_l1.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/isdn/hisax/hfc4s8s_l1.c b/drivers/isdn/hisax/hfc4s8s_l1.c
index 9600cd7..3172cee 100644
--- a/drivers/isdn/hisax/hfc4s8s_l1.c
+++ b/drivers/isdn/hisax/hfc4s8s_l1.c
@@ -1499,6 +1499,7 @@ struct hfc4s8s_l1 {
printk(KERN_INFO
"HFC-4S/8S: failed to request address space at 0x%04x\n",
hw->iobase);
+ err = -ENOMEM;
goto out;
}
--
1.9.1
^ permalink raw reply related
* [PATCH net v2] ipv6: Allow IPv4-mapped address as next-hop
From: Erik Nordmark @ 2016-12-04 4:57 UTC (permalink / raw)
To: davem, nordmark; +Cc: netdev, Bob Gilligan
Made kernel accept IPv6 routes with IPv4-mapped address as next-hop.
It is possible to configure IP interfaces with IPv4-mapped addresses, and
one can add IPv6 routes for IPv4-mapped destinations/prefixes, yet prior
to this fix the kernel returned an EINVAL when attempting to add an IPv6
route with an IPv4-mapped address as a nexthop/gateway.
RFC 4798 (a proposed standard RFC) uses IPv4-mapped addresses as nexthops,
thus in order to support that type of address configuration the kernel
needs to allow IPv4-mapped addresses as nexthops.
Signed-off-by: Erik Nordmark <nordmark@arista.com>
Signed-off-by: Bob Gilligan <gilligan@arista.com>
---
v2 honoring minimum 1000 ft vertical separation between Thunderbird and patches (fixed whitespace issues)
net/ipv6/route.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1b57e11..86bdb02 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1995,8 +1995,11 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
It is very good, but in some (rare!) circumstances
(SIT, PtP, NBMA NOARP links) it is handy to allow
some exceptions. --ANK
+ We allow IPv4-mapped nexthops to support RFC4798-type
+ addressing
*/
- if (!(gwa_type & IPV6_ADDR_UNICAST))
+ if (!(gwa_type & (IPV6_ADDR_UNICAST |
+ IPV6_ADDR_MAPPED)))
goto out;
if (cfg->fc_table) {
--
1.8.1.4
^ permalink raw reply related
* Re: [PATCH 1/1] net: dcb: set error code on failures
From: David Miller @ 2016-12-04 4:55 UTC (permalink / raw)
To: bianpan201602; +Cc: netdev, linux-kernel, bianpan2016
In-Reply-To: <1480772948-7087-1-git-send-email-bianpan201602@163.com>
From: Pan Bian <bianpan201602@163.com>
Date: Sat, 3 Dec 2016 21:49:08 +0800
> From: Pan Bian <bianpan2016@163.com>
>
> In function dcbnl_cee_fill(), returns the value of variable err on
> errors. However, on some error paths (e.g. nla put fails), its value may
> be 0. It may be better to explicitly set a negative errno to variable
> err before returning.
>
> Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=188881
>
> Signed-off-by: Pan Bian <bianpan2016@163.com>
Applied, thanks.
^ permalink raw reply
* Re: [PATCH net-next] liquidio: 'imply' ptp instead of 'select'
From: David Miller @ 2016-12-04 4:39 UTC (permalink / raw)
To: arnd
Cc: felix.manlunas, tglx, david.daney, satananda.burla, rvatsavayi,
nicolas.pitre, sgoutham, netdev, linux-kernel
In-Reply-To: <20161202230451.1639318-1-arnd@arndb.de>
From: Arnd Bergmann <arnd@arndb.de>
Date: Sat, 3 Dec 2016 00:04:32 +0100
> ptp now depends on the optional POSIX_TIMERS setting and fails to build
> if we select it without that:
>
> warning: (LIQUIDIO_VF && TI_CPTS) selects PTP_1588_CLOCK which has unmet direct dependencies (NET && POSIX_TIMERS)
> warning: (LIQUIDIO_VF && TI_CPTS) selects PTP_1588_CLOCK which has unmet direct dependencies (NET && POSIX_TIMERS)
> ERROR: "posix_clock_unregister" [drivers/ptp/ptp.ko] undefined!
> ERROR: "posix_clock_register" [drivers/ptp/ptp.ko] undefined!
> ERROR: "pps_unregister_source" [drivers/ptp/ptp.ko] undefined!
> ERROR: "pps_event" [drivers/ptp/ptp.ko] undefined!
> ERROR: "pps_register_source" [drivers/ptp/ptp.ko] undefined!
>
> It seems that two patches have collided here, the build failure
> is a result of the combination. Changing the new option to 'imply'
> as well fixes it.
>
> Fixes: 111fc64a237f ("liquidio CN23XX: VF registration")
> Fixes: d1cbfd771ce8 ("ptp_clock: Allow for it to be optional")
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Like the kbuild robot, when I apply this it complains about 'imply' being
an unknown option.
I guess it worked for you because support for 'imply' exists in the -next
tree and gets pulled in from somewhere else.
In any event, as-is I cannot apply this.
^ permalink raw reply
* Re: [PATCH net v3] tcp: warn on bogus MSS and try to amend it
From: David Miller @ 2016-12-04 4:37 UTC (permalink / raw)
To: marcelo.leitner
Cc: netdev, jmaxwell37, alexandre.sidorenko, kuznet, jmorris,
yoshfuji, kaber, tlfalcon, brking, eric.dumazet
In-Reply-To: <83a3345f2fae904fa59794d59703d86851cda7d5.1480718620.git.marcelo.leitner@gmail.com>
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Fri, 2 Dec 2016 20:51:51 -0200
> @@ -144,7 +144,21 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
> */
> len = skb_shinfo(skb)->gso_size ? : skb->len;
> if (len >= icsk->icsk_ack.rcv_mss) {
> - icsk->icsk_ack.rcv_mss = len;
> + static bool __once __read_mostly;
> +
> + icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
> + tcp_sk(sk)->advmss);
> + if (icsk->icsk_ack.rcv_mss != len && !__once) {
> + struct net_device *dev;
> +
> + __once = true;
> +
> + rcu_read_lock();
> + dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
> + pr_warn_once("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
> + dev ? dev->name : "Unknown driver");
> + rcu_read_unlock();
> + }
This is almost ready to go.
Since you are doing the 'once' logic by hand, using pr_warn_once() is
redundant. And while you're at it, why not split this into a helper
function:
static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb)
{
static bool __once __read_mostly;
if (!__once) {
__once = true;
rcu_read_lock();
dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
dev ? dev->name : "Unknown driver");
rcu_read_unlock();
}
}
And then call that when icsk->icsk_ack.rcv_mss != len, you can even
put an unlikely() around the condition as well.
^ permalink raw reply
* Re: [PATCH net-next v5] ipv6 addrconf: Implemented enhanced DAD (RFC7527)
From: David Miller @ 2016-12-04 4:26 UTC (permalink / raw)
To: nordmark; +Cc: netdev, hannes, gilligan
In-Reply-To: <1480716008-11646-1-git-send-email-nordmark@arista.com>
From: Erik Nordmark <nordmark@arista.com>
Date: Fri, 2 Dec 2016 14:00:08 -0800
> Implemented RFC7527 Enhanced DAD.
> IPv6 duplicate address detection can fail if there is some temporary
> loopback of Ethernet frames. RFC7527 solves this by including a random
> nonce in the NS messages used for DAD, and if an NS is received with the
> same nonce it is assumed to be a looped back DAD probe and is ignored.
> RFC7527 is enabled by default. Can be disabled by setting both of
> conf/{all,interface}/enhanced_dad to zero.
>
> Signed-off-by: Erik Nordmark <nordmark@arista.com>
> Signed-off-by: Bob Gilligan <gilligan@arista.com>
> Reviewed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Applied, thanks.
^ permalink raw reply
* Re: [PATCHv2 net-next 0/4] MV88E6390 batch two
From: David Miller @ 2016-12-04 4:15 UTC (permalink / raw)
To: andrew; +Cc: vivien.didelot, netdev
In-Reply-To: <1480736119-12195-1-git-send-email-andrew@lunn.ch>
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 3 Dec 2016 04:35:15 +0100
> This is the second batch of patches adding support for the
> MV88e6390. They are not sufficient to make it work properly.
>
> The mv88e6390 has a much expanded set of priority maps. Refactor the
> existing code, and implement basic support for the new device.
>
> Similarly, the monitor control register has been reworked.
>
> The mv88e6390 has something odd in its EDSA tagging implementation,
> which means it is not possible to use it. So we need to use DSA
> tagging. This is the first device with EDSA support where we need to
> use DSA, and the code does not support this. So two patches refactor
> the existing code. The two different register definitions are
> separated out, and using DSA on an EDSA capable device is added.
...
Series applied.
^ permalink raw reply
* [PATCH 1/1] net: caif: remove ineffective check
From: Pan Bian @ 2016-12-04 4:15 UTC (permalink / raw)
To: Dmitry Tarnyagin, David S. Miller, Sergei Shtylyov, netdev
Cc: linux-kernel, Pan Bian
The check of the return value of sock_register() is ineffective.
"if(!err)" seems to be a typo. It is better to propagate the error code
to the callers of caif_sktinit_module(). This patch removes the check
statment and directly returns the result of sock_register().
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=188751
Signed-off-by: Pan Bian <bianpan2016@163.com>
---
net/caif/caif_socket.c | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index aa209b1..92cbbd2 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -1107,10 +1107,7 @@ static int caif_create(struct net *net, struct socket *sock, int protocol,
static int __init caif_sktinit_module(void)
{
- int err = sock_register(&caif_family_ops);
- if (!err)
- return err;
- return 0;
+ return sock_register(&caif_family_ops);
}
static void __exit caif_sktexit_module(void)
--
1.9.1
^ permalink raw reply related
* Re: [PATCH net] geneve: avoid use-after-free of skb->data
From: David Miller @ 2016-12-04 4:11 UTC (permalink / raw)
To: sd; +Cc: netdev, linville
In-Reply-To: <20161203003326.GA27610@bistromath.localdomain>
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Sat, 3 Dec 2016 01:33:26 +0100
> I'd like to try something based on static analysis. We'd need a way to
> tag cached pointers to skb->data (via ip_hdr() or whatever), and
> propagate the notion that pskb_expand_head() makes these cached
> pointers stale through layers of function calls. I don't know how
> feasible this is with the tools we have.
Perhaps create helpers that have some special attribute attached to
them like "skb_volatile" or whatever. ip_hdr() et al would go through
them.
Then the static analysis tool is told that pskb_expand_head() "kills"
all skb_volatile obtained values, and it could basically mark all such
variables as uninitialized.
^ permalink raw reply
* Re: [net-next PATCH v4 1/6] net: virtio dynamically disable/enable LRO
From: David Miller @ 2016-12-04 4:01 UTC (permalink / raw)
To: john.fastabend
Cc: daniel, mst, shm, tgraf, alexei.starovoitov, john.r.fastabend,
netdev, bblanco, brouer
In-Reply-To: <20161202204945.4331.2419.stgit@john-Precision-Tower-5810>
From: John Fastabend <john.fastabend@gmail.com>
Date: Fri, 02 Dec 2016 12:49:45 -0800
> + if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
> + sg_init_one(&sg, &offloads, sizeof(uint64_t));
> + if (!virtnet_send_command(vi,
> + VIRTIO_NET_CTRL_GUEST_OFFLOADS,
> + VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET,
> + &sg)) {
> + dev_warn(&netdev->dev,
> + "Failed to set guest offloads by virtnet command.\n");
> + return -EINVAL;
> + }
> + } else if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) &&
> + !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
Hmmm, to me this reads as:
if (X) {
...
else if (X && ...) {
I don't see how the second basic block can ever execute. If the virtio
has the VIRTIO_NET_F_CTRL_GUEST_OFFLOADS feature, we will execute only
the first basic block.
Maybe I misunderstand the logic for whatever reason.
^ permalink raw reply
* mlx5 search flow table
From: domingo montoya @ 2016-12-04 3:31 UTC (permalink / raw)
To: Linux Netdev List
Hello,
I was wondering if there was any way I could search what flow tables,
flow groups, flow rules exist for mlx5_core driver.
I am aware of the QUERY_* commands but I need to provide a valid
tableId or groupId to retrieve the information.
Let's say because of any bug, one of the flow table or flow group or
flow rule gets orphaned i.e., doesn't get deleted.
Is there any way to know that information, so as to explicity delete
those rules, tables, groups at a later point?
Thanks a lot!
Best Regards,
Domingo
^ permalink raw reply
* Re: mlx5 VST and VGT mode at the same time
From: domingo montoya @ 2016-12-04 3:28 UTC (permalink / raw)
To: Mohamad Haj Yahia; +Cc: Linux Netdev List
In-Reply-To: <CANDJFw5uTvgBPa5howOnsff82MvM=DcpEnor1ZELtSg8xwrMew@mail.gmail.com>
Thanks a lot Mohamad. This is really helpful.
On Mon, Aug 22, 2016 at 6:39 PM, Mohamad Haj Yahia
<mohamadhajyahia.mellanox@gmail.com> wrote:
> On Thu, Aug 18, 2016 at 12:41 PM, domingo montoya
> <reach.domingomontoya@gmail.com> wrote:
>> Hi All,
>>
>> Is there any way we can support both VST and VGT modes at the same time in mlx5?
>>
>> For e.g,
>>
>> If i send untagged packets from the VF, they should be tagged with the
>> VST vlan and the vlan be stripped for received packets.
>>
>> If i send tagged packets from the VF, they should be send as it and no
>> tag inserted for these and also the vlan tag not stripped for received
>> packets.
>>
>> Any way we can achieve this?
>>
>>
>> I understand that in the latest code these features are mutually exclusive.
>>
>> But if we have a requirement like this, any ideas on how to go about
>> implementing the same.
>>
>> Few observations:
>>
>> After going through the code, I figured out that for VST mode, we run
>> MODIFY_ESW_VPORT_CONTEXT and as part of this set the flag to strip the
>> vlan from the received packets. In case of VGT mode, because of this
>> command, the tags set by the VF driver also get stripped.
>>
>>
>>
>> Thanks a lot!
>>
>>
>> Best Regards,
>> Domingo
>
> Hi Domingo,
>
> Unfortunately there is a HW limitation that prevent VGT working
> besides VST on the same VF.
> Since the stripping feature is global attribute for all the VF
> incoming vlans, if we enable both modes you will see that the VGT
> traffic vlan also stripped and thus it will arrive to the VF as
> untagged.
> Because of this limitation we blocked the outgoing vlan tagged traffic
> from a VF that is in VST mode and also dropped incoming vlan tagged
> packets targeting that VF with a different vlan than the VF vlan-id.
> The VGT and VST mutual exclusive enforcement is done by VF ACL ingress
> and egress flow tables.
>
> Thanks,
> Mohamad
^ permalink raw reply
* (unknown),
From: Bob Biloxi @ 2016-12-04 3:26 UTC (permalink / raw)
To: netdev
subscribe linux-netdev
^ permalink raw reply
* Re: [PATCH net-next 2/4] mlx4: xdp: Allow raising MTU up to one page minus eth and vlan hdrs
From: Martin KaFai Lau @ 2016-12-04 3:19 UTC (permalink / raw)
To: Rick Jones
Cc: netdev, Alexei Starovoitov, Brenden Blanco, Daniel Borkmann,
David Miller, Saeed Mahameed, Tariq Toukan, Kernel Team
In-Reply-To: <ba7950ad-d469-5e98-6ed1-ded9f58dba6b@hpe.com>
On Fri, Dec 02, 2016 at 04:07:09PM -0800, Rick Jones wrote:
> On 12/02/2016 03:23 PM, Martin KaFai Lau wrote:
> >When XDP prog is attached, it is currently limiting
> >MTU to be FRAG_SZ0 - ETH_HLEN - (2 * VLAN_HLEN) which is 1514
> >in x86.
> >
> >AFAICT, since mlx4 is doing one page per packet for XDP,
> >we can at least raise the MTU limitation up to
> >PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) which this patch is
> >doing. It will be useful in the next patch which allows
> >XDP program to extend the packet by adding new header(s).
>
> Is mlx4 the only driver doing page-per-packet?
Sorry for the late reply. This allocation scheme is only effective
when XDP is active. AFAIK, only mlx4/5 supports XDP now.
^ permalink raw reply
* [PATCH v2 net-next 2/4] mlx4: xdp: Allow raising MTU up to one page minus eth and vlan hdrs
From: Martin KaFai Lau @ 2016-12-04 3:17 UTC (permalink / raw)
To: netdev
Cc: Alexei Starovoitov, Brenden Blanco, Daniel Borkmann, David Miller,
Jesper Dangaard Brouer, Saeed Mahameed, Tariq Toukan, Kernel Team
In-Reply-To: <1480821446-4122277-1-git-send-email-kafai@fb.com>
When XDP is active in mlx4, mlx4 is using one page/pkt.
At the same time (i.e. when XDP is active), it is currently
limiting MTU to be FRAG_SZ0 - ETH_HLEN - (2 * VLAN_HLEN)
which is 1514 in x86. AFAICT, we can at least raise the MTU
limit up to PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) which this
patch is doing. It will be useful in the next patch which
allows XDP program to extend the packet by adding new header(s).
Note: In the earlier XDP patches, there is already existing guard
to ensure the page/pkt scheme only applies when XDP is active
in mlx4.
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 28 +++++++++++-----
drivers/net/ethernet/mellanox/mlx4/en_rx.c | 46 ++++++++++++++------------
2 files changed, 44 insertions(+), 30 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 49a81f1fc1d6..311c14153b8b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -51,6 +51,8 @@
#include "mlx4_en.h"
#include "en_port.h"
+#define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN)))
+
int mlx4_en_setup_tc(struct net_device *dev, u8 up)
{
struct mlx4_en_priv *priv = netdev_priv(dev);
@@ -2249,6 +2251,19 @@ void mlx4_en_destroy_netdev(struct net_device *dev)
free_netdev(dev);
}
+static bool mlx4_en_check_xdp_mtu(struct net_device *dev, int mtu)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+
+ if (mtu > MLX4_EN_MAX_XDP_MTU) {
+ en_err(priv, "mtu:%d > max:%d when XDP prog is attached\n",
+ mtu, MLX4_EN_MAX_XDP_MTU);
+ return false;
+ }
+
+ return true;
+}
+
static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu)
{
struct mlx4_en_priv *priv = netdev_priv(dev);
@@ -2258,11 +2273,10 @@ static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu)
en_dbg(DRV, priv, "Change MTU called - current:%d new:%d\n",
dev->mtu, new_mtu);
- if (priv->tx_ring_num[TX_XDP] && MLX4_EN_EFF_MTU(new_mtu) > FRAG_SZ0) {
- en_err(priv, "MTU size:%d requires frags but XDP running\n",
- new_mtu);
- return -EOPNOTSUPP;
- }
+ if (priv->tx_ring_num[TX_XDP] &&
+ !mlx4_en_check_xdp_mtu(dev, new_mtu))
+ return -ENOTSUPP;
+
dev->mtu = new_mtu;
if (netif_running(dev)) {
@@ -2710,10 +2724,8 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog)
return 0;
}
- if (priv->num_frags > 1) {
- en_err(priv, "Cannot set XDP if MTU requires multiple frags\n");
+ if (!mlx4_en_check_xdp_mtu(dev, dev->mtu))
return -EOPNOTSUPP;
- }
tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
if (!tmp)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 6562f78b07f4..23e9d04d1ef4 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -1164,37 +1164,39 @@ static const int frag_sizes[] = {
void mlx4_en_calc_rx_buf(struct net_device *dev)
{
- enum dma_data_direction dma_dir = PCI_DMA_FROMDEVICE;
struct mlx4_en_priv *priv = netdev_priv(dev);
int eff_mtu = MLX4_EN_EFF_MTU(dev->mtu);
- int order = MLX4_EN_ALLOC_PREFER_ORDER;
- u32 align = SMP_CACHE_BYTES;
- int buf_size = 0;
int i = 0;
/* bpf requires buffers to be set up as 1 packet per page.
* This only works when num_frags == 1.
*/
if (priv->tx_ring_num[TX_XDP]) {
- dma_dir = PCI_DMA_BIDIRECTIONAL;
- /* This will gain efficient xdp frame recycling at the expense
- * of more costly truesize accounting
+ priv->frag_info[0].order = 0;
+ priv->frag_info[0].frag_size = eff_mtu;
+ priv->frag_info[0].frag_prefix_size = 0;
+ /* This will gain efficient xdp frame recycling at the
+ * expense of more costly truesize accounting
*/
- align = PAGE_SIZE;
- order = 0;
- }
-
- while (buf_size < eff_mtu) {
- priv->frag_info[i].order = order;
- priv->frag_info[i].frag_size =
- (eff_mtu > buf_size + frag_sizes[i]) ?
- frag_sizes[i] : eff_mtu - buf_size;
- priv->frag_info[i].frag_prefix_size = buf_size;
- priv->frag_info[i].frag_stride =
- ALIGN(priv->frag_info[i].frag_size, align);
- priv->frag_info[i].dma_dir = dma_dir;
- buf_size += priv->frag_info[i].frag_size;
- i++;
+ priv->frag_info[0].frag_stride = PAGE_SIZE;
+ priv->frag_info[0].dma_dir = PCI_DMA_BIDIRECTIONAL;
+ i = 1;
+ } else {
+ int buf_size = 0;
+
+ while (buf_size < eff_mtu) {
+ priv->frag_info[i].order = MLX4_EN_ALLOC_PREFER_ORDER;
+ priv->frag_info[i].frag_size =
+ (eff_mtu > buf_size + frag_sizes[i]) ?
+ frag_sizes[i] : eff_mtu - buf_size;
+ priv->frag_info[i].frag_prefix_size = buf_size;
+ priv->frag_info[i].frag_stride =
+ ALIGN(priv->frag_info[i].frag_size,
+ SMP_CACHE_BYTES);
+ priv->frag_info[i].dma_dir = PCI_DMA_FROMDEVICE;
+ buf_size += priv->frag_info[i].frag_size;
+ i++;
+ }
}
priv->num_frags = i;
--
2.5.1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox