* [PATCH 6/6] netfilter: nft_compat: don't use refcount_inc on newly allocated entry
From: Pablo Neira Ayuso @ 2019-02-05 19:04 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20190205190415.25041-1-pablo@netfilter.org>
From: Florian Westphal <fw@strlen.de>
When I moved the refcount to refcount_t type I missed the fact that
refcount_inc() will result in use-after-free warning with
CONFIG_REFCOUNT_FULL=y builds.
The correct fix would be to init the reference count to 1 at allocation
time, but, unfortunately we cannot do this, as we can't undo that
in case something else fails later in the batch.
So only solution I see is to special-case the 'new entry' condition
and replace refcount_inc() with a "delayed" refcount_set(1) in this case,
as done here.
The .activate callback can be removed to simplify things, we only
need to make sure that deactivate() decrements/unlinks the entry
from the list at end of transaction phase (commit or abort).
Fixes: 12c44aba6618 ("netfilter: nft_compat: use refcnt_t type for nft_xt reference count")
Reported-by: Jordan Glover <Golden_Miller83@protonmail.ch>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nft_compat.c | 62 +++++++++++++++++-----------------------------
1 file changed, 23 insertions(+), 39 deletions(-)
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 0732a2fc697c..fe64df848365 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -61,6 +61,21 @@ static struct nft_compat_net *nft_compat_pernet(struct net *net)
return net_generic(net, nft_compat_net_id);
}
+static void nft_xt_get(struct nft_xt *xt)
+{
+ /* refcount_inc() warns on 0 -> 1 transition, but we can't
+ * init the reference count to 1 in .select_ops -- we can't
+ * undo such an increase when another expression inside the same
+ * rule fails afterwards.
+ */
+ if (xt->listcnt == 0)
+ refcount_set(&xt->refcnt, 1);
+ else
+ refcount_inc(&xt->refcnt);
+
+ xt->listcnt++;
+}
+
static bool nft_xt_put(struct nft_xt *xt)
{
if (refcount_dec_and_test(&xt->refcnt)) {
@@ -291,7 +306,7 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return -EINVAL;
nft_xt = container_of(expr->ops, struct nft_xt, ops);
- refcount_inc(&nft_xt->refcnt);
+ nft_xt_get(nft_xt);
return 0;
}
@@ -504,7 +519,7 @@ __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return ret;
nft_xt = container_of(expr->ops, struct nft_xt, ops);
- refcount_inc(&nft_xt->refcnt);
+ nft_xt_get(nft_xt);
return 0;
}
@@ -558,45 +573,16 @@ nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
__nft_match_destroy(ctx, expr, nft_expr_priv(expr));
}
-static void nft_compat_activate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- struct list_head *h)
-{
- struct nft_xt *xt = container_of(expr->ops, struct nft_xt, ops);
-
- if (xt->listcnt == 0)
- list_add(&xt->head, h);
-
- xt->listcnt++;
-}
-
-static void nft_compat_activate_mt(const struct nft_ctx *ctx,
- const struct nft_expr *expr)
-{
- struct nft_compat_net *cn = nft_compat_pernet(ctx->net);
-
- nft_compat_activate(ctx, expr, &cn->nft_match_list);
-}
-
-static void nft_compat_activate_tg(const struct nft_ctx *ctx,
- const struct nft_expr *expr)
-{
- struct nft_compat_net *cn = nft_compat_pernet(ctx->net);
-
- nft_compat_activate(ctx, expr, &cn->nft_target_list);
-}
-
static void nft_compat_deactivate(const struct nft_ctx *ctx,
const struct nft_expr *expr,
enum nft_trans_phase phase)
{
struct nft_xt *xt = container_of(expr->ops, struct nft_xt, ops);
- if (phase == NFT_TRANS_COMMIT)
- return;
-
- if (--xt->listcnt == 0)
- list_del_init(&xt->head);
+ if (phase == NFT_TRANS_ABORT || phase == NFT_TRANS_COMMIT) {
+ if (--xt->listcnt == 0)
+ list_del_init(&xt->head);
+ }
}
static void
@@ -852,7 +838,6 @@ nft_match_select_ops(const struct nft_ctx *ctx,
nft_match->ops.eval = nft_match_eval;
nft_match->ops.init = nft_match_init;
nft_match->ops.destroy = nft_match_destroy;
- nft_match->ops.activate = nft_compat_activate_mt;
nft_match->ops.deactivate = nft_compat_deactivate;
nft_match->ops.dump = nft_match_dump;
nft_match->ops.validate = nft_match_validate;
@@ -870,7 +855,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,
nft_match->ops.size = matchsize;
- nft_match->listcnt = 1;
+ nft_match->listcnt = 0;
list_add(&nft_match->head, &cn->nft_match_list);
return &nft_match->ops;
@@ -957,7 +942,6 @@ nft_target_select_ops(const struct nft_ctx *ctx,
nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize));
nft_target->ops.init = nft_target_init;
nft_target->ops.destroy = nft_target_destroy;
- nft_target->ops.activate = nft_compat_activate_tg;
nft_target->ops.deactivate = nft_compat_deactivate;
nft_target->ops.dump = nft_target_dump;
nft_target->ops.validate = nft_target_validate;
@@ -968,7 +952,7 @@ nft_target_select_ops(const struct nft_ctx *ctx,
else
nft_target->ops.eval = nft_target_eval_xt;
- nft_target->listcnt = 1;
+ nft_target->listcnt = 0;
list_add(&nft_target->head, &cn->nft_target_list);
return &nft_target->ops;
--
2.11.0
^ permalink raw reply related
* [PATCH 4/6] netfilter: nf_tables: unbind set in rule from commit path
From: Pablo Neira Ayuso @ 2019-02-05 19:04 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20190205190415.25041-1-pablo@netfilter.org>
Anonymous sets that are bound to rules from the same transaction trigger
a kernel splat from the abort path due to double set list removal and
double free.
This patch updates the logic to search for the transaction that is
responsible for creating the set and disable the set list removal and
release, given the rule is now responsible for this. Lookup is reverse
since the transaction that adds the set is likely to be at the tail of
the list.
Moreover, this patch adds the unbind step to deliver the event from the
commit path. This should not be done from the worker thread, since we
have no guarantees of in-order delivery to the listener.
This patch removes the assumption that both activate and deactivate
callbacks need to be provided.
Fixes: cd5125d8f518 ("netfilter: nf_tables: split set destruction in deactivate and destroy phase")
Reported-by: Mikhail Morfikov <mmorfikov@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
include/net/netfilter/nf_tables.h | 17 ++++++--
net/netfilter/nf_tables_api.c | 85 +++++++++++++++++++--------------------
net/netfilter/nft_compat.c | 6 ++-
net/netfilter/nft_dynset.c | 18 ++++-----
net/netfilter/nft_immediate.c | 6 ++-
net/netfilter/nft_lookup.c | 18 ++++-----
net/netfilter/nft_objref.c | 18 ++++-----
7 files changed, 85 insertions(+), 83 deletions(-)
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 841835a387e1..b4984bbbe157 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -469,9 +469,7 @@ struct nft_set_binding {
int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
struct nft_set_binding *binding);
void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
- struct nft_set_binding *binding);
-void nf_tables_rebind_set(const struct nft_ctx *ctx, struct nft_set *set,
- struct nft_set_binding *binding);
+ struct nft_set_binding *binding, bool commit);
void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set);
/**
@@ -721,6 +719,13 @@ struct nft_expr_type {
#define NFT_EXPR_STATEFUL 0x1
#define NFT_EXPR_GC 0x2
+enum nft_trans_phase {
+ NFT_TRANS_PREPARE,
+ NFT_TRANS_ABORT,
+ NFT_TRANS_COMMIT,
+ NFT_TRANS_RELEASE
+};
+
/**
* struct nft_expr_ops - nf_tables expression operations
*
@@ -750,7 +755,8 @@ struct nft_expr_ops {
void (*activate)(const struct nft_ctx *ctx,
const struct nft_expr *expr);
void (*deactivate)(const struct nft_ctx *ctx,
- const struct nft_expr *expr);
+ const struct nft_expr *expr,
+ enum nft_trans_phase phase);
void (*destroy)(const struct nft_ctx *ctx,
const struct nft_expr *expr);
void (*destroy_clone)(const struct nft_ctx *ctx,
@@ -1323,12 +1329,15 @@ struct nft_trans_rule {
struct nft_trans_set {
struct nft_set *set;
u32 set_id;
+ bool bound;
};
#define nft_trans_set(trans) \
(((struct nft_trans_set *)trans->data)->set)
#define nft_trans_set_id(trans) \
(((struct nft_trans_set *)trans->data)->set_id)
+#define nft_trans_set_bound(trans) \
+ (((struct nft_trans_set *)trans->data)->bound)
struct nft_trans_chain {
bool update;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index fb07f6cfc719..5a92f23f179f 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -116,6 +116,23 @@ static void nft_trans_destroy(struct nft_trans *trans)
kfree(trans);
}
+static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set)
+{
+ struct net *net = ctx->net;
+ struct nft_trans *trans;
+
+ if (!nft_set_is_anonymous(set))
+ return;
+
+ list_for_each_entry_reverse(trans, &net->nft.commit_list, list) {
+ if (trans->msg_type == NFT_MSG_NEWSET &&
+ nft_trans_set(trans) == set) {
+ nft_trans_set_bound(trans) = true;
+ break;
+ }
+ }
+}
+
static int nf_tables_register_hook(struct net *net,
const struct nft_table *table,
struct nft_chain *chain)
@@ -211,18 +228,6 @@ static int nft_delchain(struct nft_ctx *ctx)
return err;
}
-/* either expr ops provide both activate/deactivate, or neither */
-static bool nft_expr_check_ops(const struct nft_expr_ops *ops)
-{
- if (!ops)
- return true;
-
- if (WARN_ON_ONCE((!ops->activate ^ !ops->deactivate)))
- return false;
-
- return true;
-}
-
static void nft_rule_expr_activate(const struct nft_ctx *ctx,
struct nft_rule *rule)
{
@@ -238,14 +243,15 @@ static void nft_rule_expr_activate(const struct nft_ctx *ctx,
}
static void nft_rule_expr_deactivate(const struct nft_ctx *ctx,
- struct nft_rule *rule)
+ struct nft_rule *rule,
+ enum nft_trans_phase phase)
{
struct nft_expr *expr;
expr = nft_expr_first(rule);
while (expr != nft_expr_last(rule) && expr->ops) {
if (expr->ops->deactivate)
- expr->ops->deactivate(ctx, expr);
+ expr->ops->deactivate(ctx, expr, phase);
expr = nft_expr_next(expr);
}
@@ -296,7 +302,7 @@ static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule)
nft_trans_destroy(trans);
return err;
}
- nft_rule_expr_deactivate(ctx, rule);
+ nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_PREPARE);
return 0;
}
@@ -1929,9 +1935,6 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
*/
int nft_register_expr(struct nft_expr_type *type)
{
- if (!nft_expr_check_ops(type->ops))
- return -EINVAL;
-
nfnl_lock(NFNL_SUBSYS_NFTABLES);
if (type->family == NFPROTO_UNSPEC)
list_add_tail_rcu(&type->list, &nf_tables_expressions);
@@ -2079,10 +2082,6 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx,
err = PTR_ERR(ops);
goto err1;
}
- if (!nft_expr_check_ops(ops)) {
- err = -EINVAL;
- goto err1;
- }
} else
ops = type->ops;
@@ -2511,7 +2510,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
static void nf_tables_rule_release(const struct nft_ctx *ctx,
struct nft_rule *rule)
{
- nft_rule_expr_deactivate(ctx, rule);
+ nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE);
nf_tables_rule_destroy(ctx, rule);
}
@@ -3708,39 +3707,30 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
bind:
binding->chain = ctx->chain;
list_add_tail_rcu(&binding->list, &set->bindings);
+ nft_set_trans_bind(ctx, set);
+
return 0;
}
EXPORT_SYMBOL_GPL(nf_tables_bind_set);
-void nf_tables_rebind_set(const struct nft_ctx *ctx, struct nft_set *set,
- struct nft_set_binding *binding)
-{
- if (list_empty(&set->bindings) && nft_set_is_anonymous(set) &&
- nft_is_active(ctx->net, set))
- list_add_tail_rcu(&set->list, &ctx->table->sets);
-
- list_add_tail_rcu(&binding->list, &set->bindings);
-}
-EXPORT_SYMBOL_GPL(nf_tables_rebind_set);
-
void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
- struct nft_set_binding *binding)
+ struct nft_set_binding *binding, bool event)
{
list_del_rcu(&binding->list);
- if (list_empty(&set->bindings) && nft_set_is_anonymous(set) &&
- nft_is_active(ctx->net, set))
+ if (list_empty(&set->bindings) && nft_set_is_anonymous(set)) {
list_del_rcu(&set->list);
+ if (event)
+ nf_tables_set_notify(ctx, set, NFT_MSG_DELSET,
+ GFP_KERNEL);
+ }
}
EXPORT_SYMBOL_GPL(nf_tables_unbind_set);
void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set)
{
- if (list_empty(&set->bindings) && nft_set_is_anonymous(set) &&
- nft_is_active(ctx->net, set)) {
- nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_ATOMIC);
+ if (list_empty(&set->bindings) && nft_set_is_anonymous(set))
nft_set_destroy(set);
- }
}
EXPORT_SYMBOL_GPL(nf_tables_destroy_set);
@@ -6535,6 +6525,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
nf_tables_rule_notify(&trans->ctx,
nft_trans_rule(trans),
NFT_MSG_DELRULE);
+ nft_rule_expr_deactivate(&trans->ctx,
+ nft_trans_rule(trans),
+ NFT_TRANS_COMMIT);
break;
case NFT_MSG_NEWSET:
nft_clear(net, nft_trans_set(trans));
@@ -6621,7 +6614,8 @@ static void nf_tables_abort_release(struct nft_trans *trans)
nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
break;
case NFT_MSG_NEWSET:
- nft_set_destroy(nft_trans_set(trans));
+ if (!nft_trans_set_bound(trans))
+ nft_set_destroy(nft_trans_set(trans));
break;
case NFT_MSG_NEWSETELEM:
nft_set_elem_destroy(nft_trans_elem_set(trans),
@@ -6682,7 +6676,9 @@ static int __nf_tables_abort(struct net *net)
case NFT_MSG_NEWRULE:
trans->ctx.chain->use--;
list_del_rcu(&nft_trans_rule(trans)->list);
- nft_rule_expr_deactivate(&trans->ctx, nft_trans_rule(trans));
+ nft_rule_expr_deactivate(&trans->ctx,
+ nft_trans_rule(trans),
+ NFT_TRANS_ABORT);
break;
case NFT_MSG_DELRULE:
trans->ctx.chain->use++;
@@ -6692,7 +6688,8 @@ static int __nf_tables_abort(struct net *net)
break;
case NFT_MSG_NEWSET:
trans->ctx.table->use--;
- list_del_rcu(&nft_trans_set(trans)->list);
+ if (!nft_trans_set_bound(trans))
+ list_del_rcu(&nft_trans_set(trans)->list);
break;
case NFT_MSG_DELSET:
trans->ctx.table->use++;
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 5eb269428832..0732a2fc697c 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -587,10 +587,14 @@ static void nft_compat_activate_tg(const struct nft_ctx *ctx,
}
static void nft_compat_deactivate(const struct nft_ctx *ctx,
- const struct nft_expr *expr)
+ const struct nft_expr *expr,
+ enum nft_trans_phase phase)
{
struct nft_xt *xt = container_of(expr->ops, struct nft_xt, ops);
+ if (phase == NFT_TRANS_COMMIT)
+ return;
+
if (--xt->listcnt == 0)
list_del_init(&xt->head);
}
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 07d4efd3d851..f1172f99752b 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -235,20 +235,17 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
return err;
}
-static void nft_dynset_activate(const struct nft_ctx *ctx,
- const struct nft_expr *expr)
-{
- struct nft_dynset *priv = nft_expr_priv(expr);
-
- nf_tables_rebind_set(ctx, priv->set, &priv->binding);
-}
-
static void nft_dynset_deactivate(const struct nft_ctx *ctx,
- const struct nft_expr *expr)
+ const struct nft_expr *expr,
+ enum nft_trans_phase phase)
{
struct nft_dynset *priv = nft_expr_priv(expr);
- nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+ if (phase == NFT_TRANS_PREPARE)
+ return;
+
+ nf_tables_unbind_set(ctx, priv->set, &priv->binding,
+ phase == NFT_TRANS_COMMIT);
}
static void nft_dynset_destroy(const struct nft_ctx *ctx,
@@ -296,7 +293,6 @@ static const struct nft_expr_ops nft_dynset_ops = {
.eval = nft_dynset_eval,
.init = nft_dynset_init,
.destroy = nft_dynset_destroy,
- .activate = nft_dynset_activate,
.deactivate = nft_dynset_deactivate,
.dump = nft_dynset_dump,
};
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index 0777a93211e2..3f6d1d2a6281 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -72,10 +72,14 @@ static void nft_immediate_activate(const struct nft_ctx *ctx,
}
static void nft_immediate_deactivate(const struct nft_ctx *ctx,
- const struct nft_expr *expr)
+ const struct nft_expr *expr,
+ enum nft_trans_phase phase)
{
const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+ if (phase == NFT_TRANS_COMMIT)
+ return;
+
return nft_data_release(&priv->data, nft_dreg_to_type(priv->dreg));
}
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index 227b2b15a19c..14496da5141d 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -121,20 +121,17 @@ static int nft_lookup_init(const struct nft_ctx *ctx,
return 0;
}
-static void nft_lookup_activate(const struct nft_ctx *ctx,
- const struct nft_expr *expr)
-{
- struct nft_lookup *priv = nft_expr_priv(expr);
-
- nf_tables_rebind_set(ctx, priv->set, &priv->binding);
-}
-
static void nft_lookup_deactivate(const struct nft_ctx *ctx,
- const struct nft_expr *expr)
+ const struct nft_expr *expr,
+ enum nft_trans_phase phase)
{
struct nft_lookup *priv = nft_expr_priv(expr);
- nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+ if (phase == NFT_TRANS_PREPARE)
+ return;
+
+ nf_tables_unbind_set(ctx, priv->set, &priv->binding,
+ phase == NFT_TRANS_COMMIT);
}
static void nft_lookup_destroy(const struct nft_ctx *ctx,
@@ -225,7 +222,6 @@ static const struct nft_expr_ops nft_lookup_ops = {
.size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)),
.eval = nft_lookup_eval,
.init = nft_lookup_init,
- .activate = nft_lookup_activate,
.deactivate = nft_lookup_deactivate,
.destroy = nft_lookup_destroy,
.dump = nft_lookup_dump,
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
index a3185ca2a3a9..ae178e914486 100644
--- a/net/netfilter/nft_objref.c
+++ b/net/netfilter/nft_objref.c
@@ -155,20 +155,17 @@ static int nft_objref_map_dump(struct sk_buff *skb, const struct nft_expr *expr)
return -1;
}
-static void nft_objref_map_activate(const struct nft_ctx *ctx,
- const struct nft_expr *expr)
-{
- struct nft_objref_map *priv = nft_expr_priv(expr);
-
- nf_tables_rebind_set(ctx, priv->set, &priv->binding);
-}
-
static void nft_objref_map_deactivate(const struct nft_ctx *ctx,
- const struct nft_expr *expr)
+ const struct nft_expr *expr,
+ enum nft_trans_phase phase)
{
struct nft_objref_map *priv = nft_expr_priv(expr);
- nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+ if (phase == NFT_TRANS_PREPARE)
+ return;
+
+ nf_tables_unbind_set(ctx, priv->set, &priv->binding,
+ phase == NFT_TRANS_COMMIT);
}
static void nft_objref_map_destroy(const struct nft_ctx *ctx,
@@ -185,7 +182,6 @@ static const struct nft_expr_ops nft_objref_map_ops = {
.size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)),
.eval = nft_objref_map_eval,
.init = nft_objref_map_init,
- .activate = nft_objref_map_activate,
.deactivate = nft_objref_map_deactivate,
.destroy = nft_objref_map_destroy,
.dump = nft_objref_map_dump,
--
2.11.0
^ permalink raw reply related
* [PATCH 5/6] netfilter: ipv6: Don't preserve original oif for loopback address
From: Pablo Neira Ayuso @ 2019-02-05 19:04 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20190205190415.25041-1-pablo@netfilter.org>
From: Eli Cooper <elicooper@gmx.com>
Commit 508b09046c0f ("netfilter: ipv6: Preserve link scope traffic
original oif") made ip6_route_me_harder() keep the original oif for
link-local and multicast packets. However, it also affected packets
for the loopback address because it used rt6_need_strict().
REDIRECT rules in the OUTPUT chain rewrite the destination to loopback
address; thus its oif should not be preserved. This commit fixes the bug
that redirected local packets are being dropped. Actually the packet was
not exactly dropped; Instead it was sent out to the original oif rather
than lo. When a packet with daddr ::1 is sent to the router, it is
effectively dropped.
Fixes: 508b09046c0f ("netfilter: ipv6: Preserve link scope traffic original oif")
Signed-off-by: Eli Cooper <elicooper@gmx.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/ipv6/netfilter.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 8b075f0bc351..6d0b1f3e927b 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -23,9 +23,11 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
struct sock *sk = sk_to_full_sk(skb->sk);
unsigned int hh_len;
struct dst_entry *dst;
+ int strict = (ipv6_addr_type(&iph->daddr) &
+ (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
struct flowi6 fl6 = {
.flowi6_oif = sk && sk->sk_bound_dev_if ? sk->sk_bound_dev_if :
- rt6_need_strict(&iph->daddr) ? skb_dst(skb)->dev->ifindex : 0,
+ strict ? skb_dst(skb)->dev->ifindex : 0,
.flowi6_mark = skb->mark,
.flowi6_uid = sock_net_uid(net, sk),
.daddr = iph->daddr,
--
2.11.0
^ permalink raw reply related
* [PATCH 1/6] selftests: netfilter: fix config fragment CONFIG_NF_TABLES_INET
From: Pablo Neira Ayuso @ 2019-02-05 19:04 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20190205190415.25041-1-pablo@netfilter.org>
From: Naresh Kamboju <naresh.kamboju@linaro.org>
In selftests the config fragment for netfilter was added as
NF_TABLES_INET=y and this patch correct it as CONFIG_NF_TABLES_INET=y
Signed-off-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
tools/testing/selftests/netfilter/config | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/netfilter/config b/tools/testing/selftests/netfilter/config
index 1017313e41a8..59caa8f71cd8 100644
--- a/tools/testing/selftests/netfilter/config
+++ b/tools/testing/selftests/netfilter/config
@@ -1,2 +1,2 @@
CONFIG_NET_NS=y
-NF_TABLES_INET=y
+CONFIG_NF_TABLES_INET=y
--
2.11.0
^ permalink raw reply related
* [PATCH 0/6] Netfilter fixes for net
From: Pablo Neira Ayuso @ 2019-02-05 19:04 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
Hi David,
The following patchset contains Netfilter fixes for net:
1) Use CONFIG_NF_TABLES_INET from seltests, not NF_TABLES_INET.
From Naresh Kamboju.
2) Add a test to cover masquerading and redirect case, from Florian
Westphal.
3) Two packets coming from the same socket may race to set up NAT,
ending up with different tuples and the packet losing race being
dropped. Update nf_conntrack_tuple_taken() to exercise clash
resolution for this case. From Martynas Pumputis and Florian
Westphal.
4) Unbind anonymous sets from the commit and abort path, this fixes
a splat due to double set list removal/release in case that the
transaction needs to be aborted.
5) Do not preserve original output interface for packets that are
redirected in the output chain when ip6_route_me_harder() is
called. Otherwise packets end up going not going to the loopback
device. From Eli Cooper.
6) Fix bogus splat in nft_compat with CONFIG_REFCOUNT_FULL=y, this
also simplifies the existing logic to deal with the list insertions
of the xtables extensions. From Florian Westphal.
Diffstat look rather larger than usual because of the new selftest, but
Florian and I consider that having tests soon into the tree is good to
improve coverage. If there's a different policy in this regard, please,
let me know.
You can pull these changes from:
git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git
Thanks!
----------------------------------------------------------------
The following changes since commit cfe4bd7a257f6d6f81d3458d8c9d9ec4957539e6:
sctp: check and update stream->out_curr when allocating stream_out (2019-02-03 14:27:47 -0800)
are available in the git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git HEAD
for you to fetch changes up to 947e492c0fc2132ae5fca081a9c2952ccaab0404:
netfilter: nft_compat: don't use refcount_inc on newly allocated entry (2019-02-05 14:10:33 +0100)
----------------------------------------------------------------
Eli Cooper (1):
netfilter: ipv6: Don't preserve original oif for loopback address
Florian Westphal (2):
selftests: netfilter: add simple masq/redirect test cases
netfilter: nft_compat: don't use refcount_inc on newly allocated entry
Martynas Pumputis (1):
netfilter: nf_nat: skip nat clash resolution for same-origin entries
Naresh Kamboju (1):
selftests: netfilter: fix config fragment CONFIG_NF_TABLES_INET
Pablo Neira Ayuso (1):
netfilter: nf_tables: unbind set in rule from commit path
include/net/netfilter/nf_tables.h | 17 +-
net/ipv6/netfilter.c | 4 +-
net/netfilter/nf_conntrack_core.c | 16 +
net/netfilter/nf_tables_api.c | 85 ++-
net/netfilter/nft_compat.c | 62 +--
net/netfilter/nft_dynset.c | 18 +-
net/netfilter/nft_immediate.c | 6 +-
net/netfilter/nft_lookup.c | 18 +-
net/netfilter/nft_objref.c | 18 +-
tools/testing/selftests/netfilter/Makefile | 2 +-
tools/testing/selftests/netfilter/config | 2 +-
tools/testing/selftests/netfilter/nft_nat.sh | 762 +++++++++++++++++++++++++++
12 files changed, 888 insertions(+), 122 deletions(-)
create mode 100755 tools/testing/selftests/netfilter/nft_nat.sh
^ permalink raw reply
* Re: r8169 Driver - Poor Network Performance Since Kernel 4.19
From: Heiner Kallweit @ 2019-02-05 18:53 UTC (permalink / raw)
To: David Chang; +Cc: Realtek linux nic maintainers, netdev
In-Reply-To: <856b3a75-5daf-6ce8-7fa3-0405e3cefe97@gmail.com>
By the way: I can't reproduce the issue on a RTL8168g.
So it doesn't seem to be an issue with generic code in the driver.
I would assume it's some kind of incompatibility between activated
chip settings (ASPM etc) and certain systems.
Heiner
On 05.02.2019 19:50, Heiner Kallweit wrote:
> Hi David,
>
> meanwhile there's the following bug report matching what reported.
> It's even the same chip version (RTL8168h).
> https://bugzilla.redhat.com/show_bug.cgi?id=1671958
>
> Symptom there is also a significant number of rx_missed packets.
> Could you try what I mentioned there last:
> Try building a kernel with the call to rtl_hw_aspm_clkreq_enable(tp, true) at the
> end of rtl_hw_start_8168h_1() being disabled.
>
> Heiner
>
>
> On 31.01.2019 03:32, David Chang wrote:
>> Hi,
>>
>> We had a similr case here.
>> - Realtek r8169 receive performance regression in kernel 4.19
>> https://bugzilla.suse.com/show_bug.cgi?id=1119649
>>
>> kernel: r8169 0000:01:00.0 eth0: RTL8168h/8111h, XID 54100880
>> The major symptom is there are many rx_missed count.
>>
>>
>> On Jan 30, 2019 at 20:15:45 +0100, Heiner Kallweit wrote:
>>> Hi Peter,
>>>
>>> recently I had somebody where pcie_aspm=off for whatever reason didn't
>>> do the trick, can you also check with pcie_aspm.policy=performance.
>>
>> We will give it a try later.
>>
>>> And please check with "ethtool -S <if>" whether the chip statistics
>>> show a significant number of errors.
>>>
>>> If this doesn't help you may have to bisect to find the offending commit.
>>
>> We had tried fallback driver to a few previous commits as following,
>> but with no luck.
>>
>> 9675931e6b65 r8169: re-enable MSI-X on RTL8168g (v4.19)
>> 098b01ad9837 r8169: don't include asm headers directly (v4.19-rc1)
>> a2965f12fde6 r8169: remove rtl8169_set_speed_xmii (v4.19-rc1)
>> 6fcf9b1d4d6c r8169: fix runtime suspend (v4.19-rc1)
>> e397286b8e89 r8169: remove TBI 1000BaseX support (v4.19-rc1)
>>
>> Thanks,
>> David Chang
>>
>>>
>>> Heiner
>>>
>>>
>>> On 30.01.2019 10:59, Peter Ceiley wrote:
>>>> Hi Heiner,
>>>>
>>>> I tried disabling the ASPM using the pcie_aspm=off kernel parameter
>>>> and this made no difference.
>>>>
>>>> I tried compiling the 4.18.16 r8169.c with the 4.19.18 source and
>>>> subsequently loaded the module in the running 4.19.18 kernel. I can
>>>> confirm that this immediately resolved the issue and access to the NFS
>>>> shares operated as expected.
>>>>
>>>> I presume this means it is an issue with the r8169 driver included in
>>>> 4.19 onwards?
>>>>
>>>> To answer your last questions:
>>>>
>>>> Base Board Information
>>>> Manufacturer: Alienware
>>>> Product Name: 0PGRP5
>>>> Version: A02
>>>>
>>>> ... and yes, the RTL8168 is the onboard network chip.
>>>>
>>>> Regards,
>>>>
>>>> Peter.
>>>>
>>>> On Tue, 29 Jan 2019 at 17:44, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>>>>
>>>>> Hi Peter,
>>>>>
>>>>> I think the vendor driver doesn't enable ASPM per default.
>>>>> So it's worth a try to disable ASPM in the BIOS or via sysfs.
>>>>> Few older systems seem to have issues with ASPM, what kind of
>>>>> system / mainboard are you using? The RTL8168 is the onboard
>>>>> network chip?
>>>>>
>>>>> Rgds, Heiner
>>>>>
>>>>>
>>>>> On 29.01.2019 07:20, Peter Ceiley wrote:
>>>>>> Hi Heiner,
>>>>>>
>>>>>> Thanks, I'll do some more testing. It might not be the driver - I
>>>>>> assumed it was due to the fact that using the r8168 driver 'resolves'
>>>>>> the issue. I'll see if I can test the r8169.c on top of 4.19 - this is
>>>>>> a good idea.
>>>>>>
>>>>>> Cheers,
>>>>>>
>>>>>> Peter.
>>>>>>
>>>>>> On Tue, 29 Jan 2019 at 17:16, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>>>>>>
>>>>>>> Hi Peter,
>>>>>>>
>>>>>>> at a first glance it doesn't look like a typical driver issue.
>>>>>>> What you could do:
>>>>>>>
>>>>>>> - Test the r8169.c from 4.18 on top of 4.19.
>>>>>>>
>>>>>>> - Check whether disabling ASPM (/sys/module/pcie_aspm) has an effect.
>>>>>>>
>>>>>>> - Bisect between 4.18 and 4.19 to find the offending commit.
>>>>>>>
>>>>>>> Any specific reason why you think root cause is in the driver and not
>>>>>>> elsewhere in the network subsystem?
>>>>>>>
>>>>>>> Heiner
>>>>>>>
>>>>>>>
>>>>>>> On 28.01.2019 23:10, Peter Ceiley wrote:
>>>>>>>> Hi Heiner,
>>>>>>>>
>>>>>>>> Thanks for getting back to me.
>>>>>>>>
>>>>>>>> No, I don't use jumbo packets.
>>>>>>>>
>>>>>>>> Bandwidth is *generally* good, and iperf results to my NAS provide
>>>>>>>> over 900 Mbits/s in both circumstances. The issue seems to appear when
>>>>>>>> establishing a connection and is most notable, for example, on my
>>>>>>>> mounted NFS shares where it takes seconds (up to 10's of seconds on
>>>>>>>> larger directories) to list the contents of each directory. Once a
>>>>>>>> transfer begins on a file, I appear to get good bandwidth.
>>>>>>>>
>>>>>>>> I'm unsure of the best scientific data to provide you in order to
>>>>>>>> troubleshoot this issue. Running the following
>>>>>>>>
>>>>>>>> netstat -s |grep retransmitted
>>>>>>>>
>>>>>>>> shows a steady increase in retransmitted segments each time I list the
>>>>>>>> contents of a remote directory, for example, running 'ls' on a
>>>>>>>> directory containing 345 media files did the following using kernel
>>>>>>>> 4.19.18:
>>>>>>>>
>>>>>>>> increased retransmitted segments by 21 and the 'time' command showed
>>>>>>>> the following:
>>>>>>>> real 0m19.867s
>>>>>>>> user 0m0.012s
>>>>>>>> sys 0m0.036s
>>>>>>>>
>>>>>>>> The same command shows no retransmitted segments running kernel
>>>>>>>> 4.18.16 and 'time' showed:
>>>>>>>> real 0m0.300s
>>>>>>>> user 0m0.004s
>>>>>>>> sys 0m0.007s
>>>>>>>>
>>>>>>>> ifconfig does not show any RX/TX errors nor dropped packets in either case.
>>>>>>>>
>>>>>>>> dmesg XID:
>>>>>>>> [ 2.979984] r8169 0000:03:00.0 eth0: RTL8168g/8111g,
>>>>>>>> f8:b1:56:fe:67:e0, XID 4c000800, IRQ 32
>>>>>>>>
>>>>>>>> # lspci -vv
>>>>>>>> 03:00.0 Ethernet controller: Realtek Semiconductor Co., Ltd.
>>>>>>>> RTL8111/8168/8411 PCI Express Gigabit Ethernet Controller (rev 0c)
>>>>>>>> Subsystem: Dell RTL8111/8168/8411 PCI Express Gigabit Ethernet Controller
>>>>>>>> Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop-
>>>>>>>> ParErr- Stepping- SERR- FastB2B- DisINTx+
>>>>>>>> Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort-
>>>>>>>> <TAbort- <MAbort- >SERR- <PERR- INTx-
>>>>>>>> Latency: 0, Cache Line Size: 64 bytes
>>>>>>>> Interrupt: pin A routed to IRQ 19
>>>>>>>> Region 0: I/O ports at d000 [size=256]
>>>>>>>> Region 2: Memory at f7b00000 (64-bit, non-prefetchable) [size=4K]
>>>>>>>> Region 4: Memory at f2100000 (64-bit, prefetchable) [size=16K]
>>>>>>>> Capabilities: [40] Power Management version 3
>>>>>>>> Flags: PMEClk- DSI- D1+ D2+ AuxCurrent=375mA
>>>>>>>> PME(D0+,D1+,D2+,D3hot+,D3cold+)
>>>>>>>> Status: D0 NoSoftRst+ PME-Enable- DSel=0 DScale=0 PME-
>>>>>>>> Capabilities: [50] MSI: Enable- Count=1/1 Maskable- 64bit+
>>>>>>>> Address: 0000000000000000 Data: 0000
>>>>>>>> Capabilities: [70] Express (v2) Endpoint, MSI 01
>>>>>>>> DevCap: MaxPayload 128 bytes, PhantFunc 0, Latency L0s
>>>>>>>> <512ns, L1 <64us
>>>>>>>> ExtTag- AttnBtn- AttnInd- PwrInd- RBE+ FLReset-
>>>>>>>> SlotPowerLimit 10.000W
>>>>>>>> DevCtl: CorrErr- NonFatalErr- FatalErr- UnsupReq-
>>>>>>>> RlxdOrd- ExtTag- PhantFunc- AuxPwr- NoSnoop-
>>>>>>>> MaxPayload 128 bytes, MaxReadReq 4096 bytes
>>>>>>>> DevSta: CorrErr+ NonFatalErr- FatalErr- UnsupReq- AuxPwr+ TransPend-
>>>>>>>> LnkCap: Port #0, Speed 2.5GT/s, Width x1, ASPM L0s L1, Exit
>>>>>>>> Latency L0s unlimited, L1 <64us
>>>>>>>> ClockPM+ Surprise- LLActRep- BwNot- ASPMOptComp+
>>>>>>>> LnkCtl: ASPM L1 Enabled; RCB 64 bytes Disabled- CommClk+
>>>>>>>> ExtSynch- ClockPM+ AutWidDis- BWInt- AutBWInt-
>>>>>>>> LnkSta: Speed 2.5GT/s (ok), Width x1 (ok)
>>>>>>>> TrErr- Train- SlotClk+ DLActive- BWMgmt- ABWMgmt-
>>>>>>>> DevCap2: Completion Timeout: Range ABCD, TimeoutDis+, LTR+,
>>>>>>>> OBFF Via message/WAKE#
>>>>>>>> AtomicOpsCap: 32bit- 64bit- 128bitCAS-
>>>>>>>> DevCtl2: Completion Timeout: 50us to 50ms, TimeoutDis-, LTR+,
>>>>>>>> OBFF Disabled
>>>>>>>> AtomicOpsCtl: ReqEn-
>>>>>>>> LnkCtl2: Target Link Speed: 2.5GT/s, EnterCompliance- SpeedDis-
>>>>>>>> Transmit Margin: Normal Operating Range,
>>>>>>>> EnterModifiedCompliance- ComplianceSOS-
>>>>>>>> Compliance De-emphasis: -6dB
>>>>>>>> LnkSta2: Current De-emphasis Level: -6dB,
>>>>>>>> EqualizationComplete-, EqualizationPhase1-
>>>>>>>> EqualizationPhase2-, EqualizationPhase3-, LinkEqualizationRequest-
>>>>>>>> Capabilities: [b0] MSI-X: Enable+ Count=4 Masked-
>>>>>>>> Vector table: BAR=4 offset=00000000
>>>>>>>> PBA: BAR=4 offset=00000800
>>>>>>>> Capabilities: [d0] Vital Product Data
>>>>>>>> pcilib: sysfs_read_vpd: read failed: Input/output error
>>>>>>>> Not readable
>>>>>>>> Capabilities: [100 v1] Advanced Error Reporting
>>>>>>>> UESta: DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt-
>>>>>>>> RxOF- MalfTLP- ECRC- UnsupReq- ACSViol-
>>>>>>>> UEMsk: DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt-
>>>>>>>> RxOF- MalfTLP- ECRC- UnsupReq- ACSViol-
>>>>>>>> UESvrt: DLP+ SDES+ TLP- FCP+ CmpltTO- CmpltAbrt- UnxCmplt-
>>>>>>>> RxOF+ MalfTLP+ ECRC- UnsupReq- ACSViol-
>>>>>>>> CESta: RxErr+ BadTLP+ BadDLLP+ Rollover- Timeout+ AdvNonFatalErr-
>>>>>>>> CEMsk: RxErr- BadTLP- BadDLLP- Rollover- Timeout- AdvNonFatalErr+
>>>>>>>> AERCap: First Error Pointer: 00, ECRCGenCap+ ECRCGenEn-
>>>>>>>> ECRCChkCap+ ECRCChkEn-
>>>>>>>> MultHdrRecCap- MultHdrRecEn- TLPPfxPres- HdrLogCap-
>>>>>>>> HeaderLog: 00000000 00000000 00000000 00000000
>>>>>>>> Capabilities: [140 v1] Virtual Channel
>>>>>>>> Caps: LPEVC=0 RefClk=100ns PATEntryBits=1
>>>>>>>> Arb: Fixed- WRR32- WRR64- WRR128-
>>>>>>>> Ctrl: ArbSelect=Fixed
>>>>>>>> Status: InProgress-
>>>>>>>> VC0: Caps: PATOffset=00 MaxTimeSlots=1 RejSnoopTrans-
>>>>>>>> Arb: Fixed- WRR32- WRR64- WRR128- TWRR128- WRR256-
>>>>>>>> Ctrl: Enable+ ID=0 ArbSelect=Fixed TC/VC=01
>>>>>>>> Status: NegoPending- InProgress-
>>>>>>>> Capabilities: [160 v1] Device Serial Number 01-00-00-00-68-4c-e0-00
>>>>>>>> Capabilities: [170 v1] Latency Tolerance Reporting
>>>>>>>> Max snoop latency: 71680ns
>>>>>>>> Max no snoop latency: 71680ns
>>>>>>>> Kernel driver in use: r8169
>>>>>>>> Kernel modules: r8169
>>>>>>>>
>>>>>>>> Please let me know if you have any other ideas in terms of testing.
>>>>>>>>
>>>>>>>> Thanks!
>>>>>>>>
>>>>>>>> Peter.
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> On Tue, 29 Jan 2019 at 05:28, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>>>>>>>>
>>>>>>>>> On 28.01.2019 12:13, Peter Ceiley wrote:
>>>>>>>>>> Hi,
>>>>>>>>>>
>>>>>>>>>> I have been experiencing very poor network performance since Kernel
>>>>>>>>>> 4.19 and I'm confident it's related to the r8169 driver.
>>>>>>>>>>
>>>>>>>>>> I have no issue with kernel versions 4.18 and prior. I am experiencing
>>>>>>>>>> this issue in kernels 4.19 and 4.20 (currently running/testing with
>>>>>>>>>> 4.20.4 & 4.19.18).
>>>>>>>>>>
>>>>>>>>>> If someone could guide me in the right direction, I'm happy to help
>>>>>>>>>> troubleshoot this issue. Note that I have been keeping an eye on one
>>>>>>>>>> issue related to loading of the PHY driver, however, my symptoms
>>>>>>>>>> differ in that I still have a network connection. I have attempted to
>>>>>>>>>> reload the driver on a running system, but this does not improve the
>>>>>>>>>> situation.
>>>>>>>>>>
>>>>>>>>>> Using the proprietary r8168 driver returns my device to proper working order.
>>>>>>>>>>
>>>>>>>>>> lshw shows:
>>>>>>>>>> description: Ethernet interface
>>>>>>>>>> product: RTL8111/8168/8411 PCI Express Gigabit Ethernet Controller
>>>>>>>>>> vendor: Realtek Semiconductor Co., Ltd.
>>>>>>>>>> physical id: 0
>>>>>>>>>> bus info: pci@0000:03:00.0
>>>>>>>>>> logical name: enp3s0
>>>>>>>>>> version: 0c
>>>>>>>>>> serial:
>>>>>>>>>> size: 1Gbit/s
>>>>>>>>>> capacity: 1Gbit/s
>>>>>>>>>> width: 64 bits
>>>>>>>>>> clock: 33MHz
>>>>>>>>>> capabilities: pm msi pciexpress msix vpd bus_master cap_list
>>>>>>>>>> ethernet physical tp aui bnc mii fibre 10bt 10bt-fd 100bt 100bt-fd
>>>>>>>>>> 1000bt-fd autonegotiation
>>>>>>>>>> configuration: autonegotiation=on broadcast=yes driver=r8169
>>>>>>>>>> duplex=full firmware=rtl8168g-2_0.0.1 02/06/13 ip=192.168.1.25
>>>>>>>>>> latency=0 link=yes multicast=yes port=MII speed=1Gbit/s
>>>>>>>>>> resources: irq:19 ioport:d000(size=256)
>>>>>>>>>> memory:f7b00000-f7b00fff memory:f2100000-f2103fff
>>>>>>>>>>
>>>>>>>>>> Kind Regards,
>>>>>>>>>>
>>>>>>>>>> Peter.
>>>>>>>>>>
>>>>>>>>> Hi Peter,
>>>>>>>>>
>>>>>>>>> the description "poor network performance" is quite vague, therefore:
>>>>>>>>>
>>>>>>>>> - Can you provide any measurements?
>>>>>>>>> - iperf results before and after
>>>>>>>>> - statistics about dropped packets (rx and/or tx)
>>>>>>>>> - Do you use jumbo packets?
>>>>>>>>>
>>>>>>>>> Also help would be a "lspci -vv" output for the network card and
>>>>>>>>> the dmesg output line with the chip XID.
>>>>>>>>>
>>>>>>>>> Heiner
>>>>>>>>
>>>>>>>
>>>>>>
>>>>>
>>>>
>>>
>>
>
^ permalink raw reply
* Re: r8169 Driver - Poor Network Performance Since Kernel 4.19
From: Heiner Kallweit @ 2019-02-05 18:50 UTC (permalink / raw)
To: David Chang; +Cc: Realtek linux nic maintainers, netdev
In-Reply-To: <20190131023240.GF25745@linux-kyyb.suse>
Hi David,
meanwhile there's the following bug report matching what reported.
It's even the same chip version (RTL8168h).
https://bugzilla.redhat.com/show_bug.cgi?id=1671958
Symptom there is also a significant number of rx_missed packets.
Could you try what I mentioned there last:
Try building a kernel with the call to rtl_hw_aspm_clkreq_enable(tp, true) at the
end of rtl_hw_start_8168h_1() being disabled.
Heiner
On 31.01.2019 03:32, David Chang wrote:
> Hi,
>
> We had a similr case here.
> - Realtek r8169 receive performance regression in kernel 4.19
> https://bugzilla.suse.com/show_bug.cgi?id=1119649
>
> kernel: r8169 0000:01:00.0 eth0: RTL8168h/8111h, XID 54100880
> The major symptom is there are many rx_missed count.
>
>
> On Jan 30, 2019 at 20:15:45 +0100, Heiner Kallweit wrote:
>> Hi Peter,
>>
>> recently I had somebody where pcie_aspm=off for whatever reason didn't
>> do the trick, can you also check with pcie_aspm.policy=performance.
>
> We will give it a try later.
>
>> And please check with "ethtool -S <if>" whether the chip statistics
>> show a significant number of errors.
>>
>> If this doesn't help you may have to bisect to find the offending commit.
>
> We had tried fallback driver to a few previous commits as following,
> but with no luck.
>
> 9675931e6b65 r8169: re-enable MSI-X on RTL8168g (v4.19)
> 098b01ad9837 r8169: don't include asm headers directly (v4.19-rc1)
> a2965f12fde6 r8169: remove rtl8169_set_speed_xmii (v4.19-rc1)
> 6fcf9b1d4d6c r8169: fix runtime suspend (v4.19-rc1)
> e397286b8e89 r8169: remove TBI 1000BaseX support (v4.19-rc1)
>
> Thanks,
> David Chang
>
>>
>> Heiner
>>
>>
>> On 30.01.2019 10:59, Peter Ceiley wrote:
>>> Hi Heiner,
>>>
>>> I tried disabling the ASPM using the pcie_aspm=off kernel parameter
>>> and this made no difference.
>>>
>>> I tried compiling the 4.18.16 r8169.c with the 4.19.18 source and
>>> subsequently loaded the module in the running 4.19.18 kernel. I can
>>> confirm that this immediately resolved the issue and access to the NFS
>>> shares operated as expected.
>>>
>>> I presume this means it is an issue with the r8169 driver included in
>>> 4.19 onwards?
>>>
>>> To answer your last questions:
>>>
>>> Base Board Information
>>> Manufacturer: Alienware
>>> Product Name: 0PGRP5
>>> Version: A02
>>>
>>> ... and yes, the RTL8168 is the onboard network chip.
>>>
>>> Regards,
>>>
>>> Peter.
>>>
>>> On Tue, 29 Jan 2019 at 17:44, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>>>
>>>> Hi Peter,
>>>>
>>>> I think the vendor driver doesn't enable ASPM per default.
>>>> So it's worth a try to disable ASPM in the BIOS or via sysfs.
>>>> Few older systems seem to have issues with ASPM, what kind of
>>>> system / mainboard are you using? The RTL8168 is the onboard
>>>> network chip?
>>>>
>>>> Rgds, Heiner
>>>>
>>>>
>>>> On 29.01.2019 07:20, Peter Ceiley wrote:
>>>>> Hi Heiner,
>>>>>
>>>>> Thanks, I'll do some more testing. It might not be the driver - I
>>>>> assumed it was due to the fact that using the r8168 driver 'resolves'
>>>>> the issue. I'll see if I can test the r8169.c on top of 4.19 - this is
>>>>> a good idea.
>>>>>
>>>>> Cheers,
>>>>>
>>>>> Peter.
>>>>>
>>>>> On Tue, 29 Jan 2019 at 17:16, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>>>>>
>>>>>> Hi Peter,
>>>>>>
>>>>>> at a first glance it doesn't look like a typical driver issue.
>>>>>> What you could do:
>>>>>>
>>>>>> - Test the r8169.c from 4.18 on top of 4.19.
>>>>>>
>>>>>> - Check whether disabling ASPM (/sys/module/pcie_aspm) has an effect.
>>>>>>
>>>>>> - Bisect between 4.18 and 4.19 to find the offending commit.
>>>>>>
>>>>>> Any specific reason why you think root cause is in the driver and not
>>>>>> elsewhere in the network subsystem?
>>>>>>
>>>>>> Heiner
>>>>>>
>>>>>>
>>>>>> On 28.01.2019 23:10, Peter Ceiley wrote:
>>>>>>> Hi Heiner,
>>>>>>>
>>>>>>> Thanks for getting back to me.
>>>>>>>
>>>>>>> No, I don't use jumbo packets.
>>>>>>>
>>>>>>> Bandwidth is *generally* good, and iperf results to my NAS provide
>>>>>>> over 900 Mbits/s in both circumstances. The issue seems to appear when
>>>>>>> establishing a connection and is most notable, for example, on my
>>>>>>> mounted NFS shares where it takes seconds (up to 10's of seconds on
>>>>>>> larger directories) to list the contents of each directory. Once a
>>>>>>> transfer begins on a file, I appear to get good bandwidth.
>>>>>>>
>>>>>>> I'm unsure of the best scientific data to provide you in order to
>>>>>>> troubleshoot this issue. Running the following
>>>>>>>
>>>>>>> netstat -s |grep retransmitted
>>>>>>>
>>>>>>> shows a steady increase in retransmitted segments each time I list the
>>>>>>> contents of a remote directory, for example, running 'ls' on a
>>>>>>> directory containing 345 media files did the following using kernel
>>>>>>> 4.19.18:
>>>>>>>
>>>>>>> increased retransmitted segments by 21 and the 'time' command showed
>>>>>>> the following:
>>>>>>> real 0m19.867s
>>>>>>> user 0m0.012s
>>>>>>> sys 0m0.036s
>>>>>>>
>>>>>>> The same command shows no retransmitted segments running kernel
>>>>>>> 4.18.16 and 'time' showed:
>>>>>>> real 0m0.300s
>>>>>>> user 0m0.004s
>>>>>>> sys 0m0.007s
>>>>>>>
>>>>>>> ifconfig does not show any RX/TX errors nor dropped packets in either case.
>>>>>>>
>>>>>>> dmesg XID:
>>>>>>> [ 2.979984] r8169 0000:03:00.0 eth0: RTL8168g/8111g,
>>>>>>> f8:b1:56:fe:67:e0, XID 4c000800, IRQ 32
>>>>>>>
>>>>>>> # lspci -vv
>>>>>>> 03:00.0 Ethernet controller: Realtek Semiconductor Co., Ltd.
>>>>>>> RTL8111/8168/8411 PCI Express Gigabit Ethernet Controller (rev 0c)
>>>>>>> Subsystem: Dell RTL8111/8168/8411 PCI Express Gigabit Ethernet Controller
>>>>>>> Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop-
>>>>>>> ParErr- Stepping- SERR- FastB2B- DisINTx+
>>>>>>> Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort-
>>>>>>> <TAbort- <MAbort- >SERR- <PERR- INTx-
>>>>>>> Latency: 0, Cache Line Size: 64 bytes
>>>>>>> Interrupt: pin A routed to IRQ 19
>>>>>>> Region 0: I/O ports at d000 [size=256]
>>>>>>> Region 2: Memory at f7b00000 (64-bit, non-prefetchable) [size=4K]
>>>>>>> Region 4: Memory at f2100000 (64-bit, prefetchable) [size=16K]
>>>>>>> Capabilities: [40] Power Management version 3
>>>>>>> Flags: PMEClk- DSI- D1+ D2+ AuxCurrent=375mA
>>>>>>> PME(D0+,D1+,D2+,D3hot+,D3cold+)
>>>>>>> Status: D0 NoSoftRst+ PME-Enable- DSel=0 DScale=0 PME-
>>>>>>> Capabilities: [50] MSI: Enable- Count=1/1 Maskable- 64bit+
>>>>>>> Address: 0000000000000000 Data: 0000
>>>>>>> Capabilities: [70] Express (v2) Endpoint, MSI 01
>>>>>>> DevCap: MaxPayload 128 bytes, PhantFunc 0, Latency L0s
>>>>>>> <512ns, L1 <64us
>>>>>>> ExtTag- AttnBtn- AttnInd- PwrInd- RBE+ FLReset-
>>>>>>> SlotPowerLimit 10.000W
>>>>>>> DevCtl: CorrErr- NonFatalErr- FatalErr- UnsupReq-
>>>>>>> RlxdOrd- ExtTag- PhantFunc- AuxPwr- NoSnoop-
>>>>>>> MaxPayload 128 bytes, MaxReadReq 4096 bytes
>>>>>>> DevSta: CorrErr+ NonFatalErr- FatalErr- UnsupReq- AuxPwr+ TransPend-
>>>>>>> LnkCap: Port #0, Speed 2.5GT/s, Width x1, ASPM L0s L1, Exit
>>>>>>> Latency L0s unlimited, L1 <64us
>>>>>>> ClockPM+ Surprise- LLActRep- BwNot- ASPMOptComp+
>>>>>>> LnkCtl: ASPM L1 Enabled; RCB 64 bytes Disabled- CommClk+
>>>>>>> ExtSynch- ClockPM+ AutWidDis- BWInt- AutBWInt-
>>>>>>> LnkSta: Speed 2.5GT/s (ok), Width x1 (ok)
>>>>>>> TrErr- Train- SlotClk+ DLActive- BWMgmt- ABWMgmt-
>>>>>>> DevCap2: Completion Timeout: Range ABCD, TimeoutDis+, LTR+,
>>>>>>> OBFF Via message/WAKE#
>>>>>>> AtomicOpsCap: 32bit- 64bit- 128bitCAS-
>>>>>>> DevCtl2: Completion Timeout: 50us to 50ms, TimeoutDis-, LTR+,
>>>>>>> OBFF Disabled
>>>>>>> AtomicOpsCtl: ReqEn-
>>>>>>> LnkCtl2: Target Link Speed: 2.5GT/s, EnterCompliance- SpeedDis-
>>>>>>> Transmit Margin: Normal Operating Range,
>>>>>>> EnterModifiedCompliance- ComplianceSOS-
>>>>>>> Compliance De-emphasis: -6dB
>>>>>>> LnkSta2: Current De-emphasis Level: -6dB,
>>>>>>> EqualizationComplete-, EqualizationPhase1-
>>>>>>> EqualizationPhase2-, EqualizationPhase3-, LinkEqualizationRequest-
>>>>>>> Capabilities: [b0] MSI-X: Enable+ Count=4 Masked-
>>>>>>> Vector table: BAR=4 offset=00000000
>>>>>>> PBA: BAR=4 offset=00000800
>>>>>>> Capabilities: [d0] Vital Product Data
>>>>>>> pcilib: sysfs_read_vpd: read failed: Input/output error
>>>>>>> Not readable
>>>>>>> Capabilities: [100 v1] Advanced Error Reporting
>>>>>>> UESta: DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt-
>>>>>>> RxOF- MalfTLP- ECRC- UnsupReq- ACSViol-
>>>>>>> UEMsk: DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt-
>>>>>>> RxOF- MalfTLP- ECRC- UnsupReq- ACSViol-
>>>>>>> UESvrt: DLP+ SDES+ TLP- FCP+ CmpltTO- CmpltAbrt- UnxCmplt-
>>>>>>> RxOF+ MalfTLP+ ECRC- UnsupReq- ACSViol-
>>>>>>> CESta: RxErr+ BadTLP+ BadDLLP+ Rollover- Timeout+ AdvNonFatalErr-
>>>>>>> CEMsk: RxErr- BadTLP- BadDLLP- Rollover- Timeout- AdvNonFatalErr+
>>>>>>> AERCap: First Error Pointer: 00, ECRCGenCap+ ECRCGenEn-
>>>>>>> ECRCChkCap+ ECRCChkEn-
>>>>>>> MultHdrRecCap- MultHdrRecEn- TLPPfxPres- HdrLogCap-
>>>>>>> HeaderLog: 00000000 00000000 00000000 00000000
>>>>>>> Capabilities: [140 v1] Virtual Channel
>>>>>>> Caps: LPEVC=0 RefClk=100ns PATEntryBits=1
>>>>>>> Arb: Fixed- WRR32- WRR64- WRR128-
>>>>>>> Ctrl: ArbSelect=Fixed
>>>>>>> Status: InProgress-
>>>>>>> VC0: Caps: PATOffset=00 MaxTimeSlots=1 RejSnoopTrans-
>>>>>>> Arb: Fixed- WRR32- WRR64- WRR128- TWRR128- WRR256-
>>>>>>> Ctrl: Enable+ ID=0 ArbSelect=Fixed TC/VC=01
>>>>>>> Status: NegoPending- InProgress-
>>>>>>> Capabilities: [160 v1] Device Serial Number 01-00-00-00-68-4c-e0-00
>>>>>>> Capabilities: [170 v1] Latency Tolerance Reporting
>>>>>>> Max snoop latency: 71680ns
>>>>>>> Max no snoop latency: 71680ns
>>>>>>> Kernel driver in use: r8169
>>>>>>> Kernel modules: r8169
>>>>>>>
>>>>>>> Please let me know if you have any other ideas in terms of testing.
>>>>>>>
>>>>>>> Thanks!
>>>>>>>
>>>>>>> Peter.
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> On Tue, 29 Jan 2019 at 05:28, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>>>>>>>
>>>>>>>> On 28.01.2019 12:13, Peter Ceiley wrote:
>>>>>>>>> Hi,
>>>>>>>>>
>>>>>>>>> I have been experiencing very poor network performance since Kernel
>>>>>>>>> 4.19 and I'm confident it's related to the r8169 driver.
>>>>>>>>>
>>>>>>>>> I have no issue with kernel versions 4.18 and prior. I am experiencing
>>>>>>>>> this issue in kernels 4.19 and 4.20 (currently running/testing with
>>>>>>>>> 4.20.4 & 4.19.18).
>>>>>>>>>
>>>>>>>>> If someone could guide me in the right direction, I'm happy to help
>>>>>>>>> troubleshoot this issue. Note that I have been keeping an eye on one
>>>>>>>>> issue related to loading of the PHY driver, however, my symptoms
>>>>>>>>> differ in that I still have a network connection. I have attempted to
>>>>>>>>> reload the driver on a running system, but this does not improve the
>>>>>>>>> situation.
>>>>>>>>>
>>>>>>>>> Using the proprietary r8168 driver returns my device to proper working order.
>>>>>>>>>
>>>>>>>>> lshw shows:
>>>>>>>>> description: Ethernet interface
>>>>>>>>> product: RTL8111/8168/8411 PCI Express Gigabit Ethernet Controller
>>>>>>>>> vendor: Realtek Semiconductor Co., Ltd.
>>>>>>>>> physical id: 0
>>>>>>>>> bus info: pci@0000:03:00.0
>>>>>>>>> logical name: enp3s0
>>>>>>>>> version: 0c
>>>>>>>>> serial:
>>>>>>>>> size: 1Gbit/s
>>>>>>>>> capacity: 1Gbit/s
>>>>>>>>> width: 64 bits
>>>>>>>>> clock: 33MHz
>>>>>>>>> capabilities: pm msi pciexpress msix vpd bus_master cap_list
>>>>>>>>> ethernet physical tp aui bnc mii fibre 10bt 10bt-fd 100bt 100bt-fd
>>>>>>>>> 1000bt-fd autonegotiation
>>>>>>>>> configuration: autonegotiation=on broadcast=yes driver=r8169
>>>>>>>>> duplex=full firmware=rtl8168g-2_0.0.1 02/06/13 ip=192.168.1.25
>>>>>>>>> latency=0 link=yes multicast=yes port=MII speed=1Gbit/s
>>>>>>>>> resources: irq:19 ioport:d000(size=256)
>>>>>>>>> memory:f7b00000-f7b00fff memory:f2100000-f2103fff
>>>>>>>>>
>>>>>>>>> Kind Regards,
>>>>>>>>>
>>>>>>>>> Peter.
>>>>>>>>>
>>>>>>>> Hi Peter,
>>>>>>>>
>>>>>>>> the description "poor network performance" is quite vague, therefore:
>>>>>>>>
>>>>>>>> - Can you provide any measurements?
>>>>>>>> - iperf results before and after
>>>>>>>> - statistics about dropped packets (rx and/or tx)
>>>>>>>> - Do you use jumbo packets?
>>>>>>>>
>>>>>>>> Also help would be a "lspci -vv" output for the network card and
>>>>>>>> the dmesg output line with the chip XID.
>>>>>>>>
>>>>>>>> Heiner
>>>>>>>
>>>>>>
>>>>>
>>>>
>>>
>>
>
^ permalink raw reply
* Re: [PATCH net-next v3] net: dsa: mv88e6xxx: Prevent suspend to RAM
From: Miquel Raynal @ 2019-02-05 18:47 UTC (permalink / raw)
To: Vivien Didelot
Cc: Andrew Lunn, Florian Fainelli, David S. Miller, netdev,
linux-kernel, Thomas Petazzoni, Gregory Clement, Antoine Tenart,
Maxime Chevallier, Nadav Haklai
In-Reply-To: <20190205112857.GB13620@t480s.localdomain>
Hi Vivien,
Vivien Didelot <vivien.didelot@gmail.com> wrote on Tue, 5 Feb 2019
11:28:57 -0500:
> Hi Miquel,
>
> On Tue, 5 Feb 2019 12:07:28 +0100, Miquel Raynal <miquel.raynal@bootlin.com> wrote:
>
> > +/* There is no suspend to RAM support at DSA level yet, the switch configuration
> > + * would be lost after a power cycle so prevent it to be suspended.
> > + */
> > +static int __maybe_unused mv88e6xxx_suspend(struct device *dev)
> > +{
> > + return -EOPNOTSUPP;
> > +}
> > +
> > +static int __maybe_unused mv88e6xxx_resume(struct device *dev)
> > +{
> > + return 0;
> > +}
>
> The code looks good but my only concern is -EOPNOTSUPP. In this
> context this code is specific to callbacks targeting bridge and
> switchdev, while the dev_pm_ops are completely parallel to DSA.
>
> It is intuitive but given Documentation/power/runtime_pm.txt, this
> will default to being interpreted as a fatal error, while -EBUSY
> seems to keep the device in an 'active' state in a saner way.
>
> I don't understand yet how to properly tell PM core that suspend to RAM
> isn't supported. If an error code different from -EAGAIN or -EBUSY
> is the way to go, I'm good with it:
I do share your concern and I went through the Documentation but I did
not find a unified way to tell the PM core the feature is unsupported.
By grepping code, I realized returning -EOPNOTSUPP was a recurrent
alternative so here we are. I also considered -EBUSY but it seems more
like a "I cannot right now" and -EAGAIN which is more a "try again
soon". Anyway, no matter the error code returned, I'm not sure if the PM
core actually cares?
> Reviewed-by: Vivien Didelot <vivien.didelot@gmail.com>
>
>
> Thanks,
>
> Vivien
Thanks,
Miquèl
^ permalink raw reply
* Re: [PATCH v2 2/2] r8169: Avoid pointer aliasing
From: Joe Perches @ 2019-02-05 18:42 UTC (permalink / raw)
To: David Miller, thierry.reding
Cc: hkallweit1, andrew, nic_swsd, netdev, linux-kernel
In-Reply-To: <20190204.192040.1074738183781998611.davem@davemloft.net>
On Mon, 2019-02-04 at 19:20 -0800, David Miller wrote:
> From: Thierry Reding <thierry.reding@gmail.com>
> Date: Mon, 4 Feb 2019 17:42:13 +0100
>
> > @@ -7316,7 +7325,7 @@ static int rtl_get_ether_clk(struct rtl8169_private *tp)
> > static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
> > {
> > const struct rtl_cfg_info *cfg = rtl_cfg_infos + ent->driver_data;
> > - u8 mac_addr[ETH_ALEN] __aligned(4) = {};
> > + u8 mac_addr[ETH_ALEN] = {};
> > struct rtl8169_private *tp;
>
> I agree with Heiner, you have to provide at least 2 byte alignment for this
> buffer due to the reasons he stated.
It's declared after a pointer so it is already is 2 byte aligned.
A lot of drivers wouldn't work otherwise.
^ permalink raw reply
* Re: [PATCH net-next 2/2] net: marvell: mvpp2: fix lack of link interrupts
From: David Miller @ 2019-02-05 18:41 UTC (permalink / raw)
To: rmk+kernel; +Cc: antoine.tenart, maxime.chevallier, baruch, netdev
In-Reply-To: <E1gqnmR-0007eF-87@rmk-PC.armlinux.org.uk>
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Mon, 04 Feb 2019 23:35:59 +0000
> Sven Auhagen reports that if he changes a SFP+ module for a SFP module
> on the Macchiatobin Single Shot, the link does not come back up. For
> Sven, it is as easy as:
>
> - Insert a SFP+ module connected, and use ping6 to verify link is up.
> - Remove SFP+ module
> - Insert SFP 1000base-X module use ping6 to verify link is up: Link
> up event did not trigger and the link is down
>
> but that doesn't show the problem for me. Locally, this has been
> reproduced by:
>
> - Boot with no modules.
> - Insert SFP+ module, confirm link is up.
> - Replace module with 25000base-X module. Confirm link is up.
> - Set remote end down, link is reported as dropped at both ends.
> - Set remote end up, link is reported up at remote end, but not local
> end due to lack of link interrupt.
>
> Fix this by setting up both GMAC and XLG interrupts for port 0, but
> only unmasking the appropriate interrupt according to the current mode
> set in the mac_config() method. However, only do the mask/unmask
> dance when we are really changing the link mode to avoid missing any
> link interrupts.
>
> Tested-by: Sven Auhagen <sven.auhagen@voleatech.de>
> Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Applied.
^ permalink raw reply
* Re: [PATCH net-next 1/2] net: marvell: mvpp2: use phy_interface_mode_is_8023z() helper
From: David Miller @ 2019-02-05 18:40 UTC (permalink / raw)
To: rmk+kernel; +Cc: antoine.tenart, maxime.chevallier, baruch, netdev
In-Reply-To: <E1gqnmM-0007dy-3M@rmk-PC.armlinux.org.uk>
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Mon, 04 Feb 2019 23:35:54 +0000
> Use the phy_interface_mode_is_8023z() helper for detecting interface
> modes that use 802.3z serial encoding. This is equivalent to testing
> for both 1000base-X and 2500base-X.
>
> Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Applied.
^ permalink raw reply
* Re: [PATCH v2] arm64: dts: lx2160aqds: Add mdio mux nodes
From: Li Yang @ 2019-02-05 18:38 UTC (permalink / raw)
To: Pankaj Bansal
Cc: shawnguo@kernel.org, andrew@lunn.ch, f.fainelli@gmail.com,
netdev@vger.kernel.org, linux-arm-kernel@lists.infradead.org,
David Miller
In-Reply-To: <VI1PR0401MB2496085A7A23DF5CE86A4657F16E0@VI1PR0401MB2496.eurprd04.prod.outlook.com>
On Tue, Feb 5, 2019 at 6:26 AM Pankaj Bansal <pankaj.bansal@nxp.com> wrote:
>
> Hi Shawn/Leo,
>
> If you have no more comments, can you please merge this path in your branch?
> in same branch in which you have accepted LX2160AQDS board patches.
It can be merged through the ARM SoC tree. But there are still
comments remain to be addressed.
>
> Regards,
> Pankaj Bansal
>
> > -----Original Message-----
> > From: David Miller [mailto:davem@davemloft.net]
> > Sent: Monday, 4 February, 2019 10:21 PM
> > To: Pankaj Bansal <pankaj.bansal@nxp.com>
> > Cc: shawnguo@kernel.org; Leo Li <leoyang.li@nxp.com>; andrew@lunn.ch;
> > f.fainelli@gmail.com; netdev@vger.kernel.org; linux-arm-
> > kernel@lists.infradead.org
> > Subject: Re: [PATCH v2] arm64: dts: lx2160aqds: Add mdio mux nodes
> >
> > From: Pankaj Bansal <pankaj.bansal@nxp.com>
> > Date: Mon, 4 Feb 2019 08:51:57 +0000
> >
> > > The two external MDIO buses used to communicate with phy devices that
> > > are external to SOC are muxed in LX2160AQDS board.
> > >
> > > These buses can be routed to any one of the eight IO slots on
> > > LX2160AQDS board depending on value in fpga register 0x54.
> > >
> > > Additionally the external MDIO1 is used to communicate to the onboard
> > > RGMII phy devices.
> > >
> > > The mdio1 is controlled by bits 4-7 of fpga register and mdio2 is
> > > controlled by bits 0-3 of fpga register.
> > >
> > > Signed-off-by: Pankaj Bansal <pankaj.bansal@nxp.com>
> >
> > Am I applying this to my networking tree or are the ARM folks taking this?
^ permalink raw reply
* Re: [PATCH v2] net: dsa: mv88e6xxx: Revise irq setup ordering
From: David Miller @ 2019-02-05 18:37 UTC (permalink / raw)
To: dave.anglin; +Cc: andrew, linux, vivien.didelot, f.fainelli, netdev
In-Reply-To: <824d011b-3692-69c3-5e2c-58e950a80abf@bell.net>
Something is really wrong with how your patches are submitted.
The patch shows up in between the commit message and your signoff
tags, also the patch appears to be mangled by your email client.
Please fix this before submitting any future work.
Thank you.
^ permalink raw reply
* Re: [PATCH v2] arm64: dts: lx2160aqds: Add mdio mux nodes
From: Li Yang @ 2019-02-05 18:37 UTC (permalink / raw)
To: Pankaj Bansal
Cc: Shawn Guo, Andrew Lunn, Florian Fainelli, netdev@vger.kernel.org,
linux-arm-kernel@lists.infradead.org
In-Reply-To: <20190204141641.18272-1-pankaj.bansal@nxp.com>
On Mon, Feb 4, 2019 at 2:53 AM Pankaj Bansal <pankaj.bansal@nxp.com> wrote:
>
> The two external MDIO buses used to communicate with phy devices that are
> external to SOC are muxed in LX2160AQDS board.
>
> These buses can be routed to any one of the eight IO slots on LX2160AQDS
> board depending on value in fpga register 0x54.
>
> Additionally the external MDIO1 is used to communicate to the onboard
> RGMII phy devices.
>
> The mdio1 is controlled by bits 4-7 of fpga register and mdio2 is
> controlled by bits 0-3 of fpga register.
>
> Signed-off-by: Pankaj Bansal <pankaj.bansal@nxp.com>
> ---
>
> Notes:
> V2:
> - removed unnecassary TODO statements
> - removed device_type from mdio nodes
> - change the case of hex number to lowercase
> - removed board specific comments from soc file
There are still some comments from V1 haven't been addressed.
>
> .../boot/dts/freescale/fsl-lx2160a-qds.dts | 115 +++++++++++++++++
> .../boot/dts/freescale/fsl-lx2160a.dtsi | 18 +++
> 2 files changed, 133 insertions(+)
>
> diff --git a/arch/arm64/boot/dts/freescale/fsl-lx2160a-qds.dts b/arch/arm64/boot/dts/freescale/fsl-lx2160a-qds.dts
> index 99a22abbe725..2c3020a72d41 100644
> --- a/arch/arm64/boot/dts/freescale/fsl-lx2160a-qds.dts
> +++ b/arch/arm64/boot/dts/freescale/fsl-lx2160a-qds.dts
> @@ -46,6 +46,121 @@
> &i2c0 {
> status = "okay";
>
> + fpga@66 {
> + compatible = "fsl,lx2160aqds-fpga", "fsl,fpga-qixis-i2c";
> + reg = <0x66>;
> + #address-cells = <1>;
> + #size-cells = <0>;
> +
> + mdio-mux-1@54 {
Still no compatible string defined for the node. Probably should be
"mdio-mux-mmioreg", "mdio-mux"
> + mdio-parent-bus = <&emdio1>;
> + reg = <0x54>; /* BRDCFG4 */
> + mux-mask = <0xf8>; /* EMI1_MDIO */
> + #address-cells=<1>;
> + #size-cells = <0>;
> +
> + mdio@0 {
> + reg = <0x00>;
> + #address-cells = <1>;
> + #size-cells = <0>;
> + };
> + mdio@40 {
> + reg = <0x40>;
> + #address-cells = <1>;
> + #size-cells = <0>;
> + };
> + mdio@c0 {
> + reg = <0xc0>;
> + #address-cells = <1>;
> + #size-cells = <0>;
> + };
> + mdio@c8 {
> + reg = <0xc8>;
> + #address-cells = <1>;
> + #size-cells = <0>;
> + };
> + mdio@d0 {
> + reg = <0xd0>;
> + #address-cells = <1>;
> + #size-cells = <0>;
> + };
> + mdio@d8 {
> + reg = <0xd8>;
> + #address-cells = <1>;
> + #size-cells = <0>;
> + };
> + mdio@e0 {
> + reg = <0xe0>;
> + #address-cells = <1>;
> + #size-cells = <0>;
> + };
> + mdio@e8 {
> + reg = <0xe8>;
> + #address-cells = <1>;
> + #size-cells = <0>;
> + };
> + mdio@f0 {
> + reg = <0xf0>;
> + #address-cells = <1>;
> + #size-cells = <0>;
> + };
> + mdio@f8 {
> + reg = <0xf8>;
> + #address-cells = <1>;
> + #size-cells = <0>;
> + };
> + };
> +
> + mdio-mux-2@54 {
> + mdio-parent-bus = <&emdio2>;
> + reg = <0x54>; /* BRDCFG4 */
> + mux-mask = <0x07>; /* EMI2_MDIO */
> + #address-cells=<1>;
> + #size-cells = <0>;
> +
> + mdio@0 {
> + reg = <0x00>;
> + #address-cells = <1>;
> + #size-cells = <0>;
> + };
> + mdio@1 {
> + reg = <0x01>;
> + #address-cells = <1>;
> + #size-cells = <0>;
> + };
> + mdio@2 {
> + reg = <0x02>;
> + #address-cells = <1>;
> + #size-cells = <0>;
> + };
> + mdio@3 {
> + reg = <0x03>;
> + #address-cells = <1>;
> + #size-cells = <0>;
> + };
> + mdio@4 {
> + reg = <0x04>;
> + #address-cells = <1>;
> + #size-cells = <0>;
> + };
> + mdio@5 {
> + reg = <0x05>;
> + #address-cells = <1>;
> + #size-cells = <0>;
> + };
> + mdio@6 {
> + reg = <0x06>;
> + #address-cells = <1>;
> + #size-cells = <0>;
> + };
> + mdio@7 {
> + reg = <0x07>;
> + #address-cells = <1>;
> + #size-cells = <0>;
> + };
> + };
> + };
> +
> i2c-mux@77 {
> compatible = "nxp,pca9547";
> reg = <0x77>;
> diff --git a/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi b/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi
> index a79f5c1ea56d..a74045ad22ad 100644
> --- a/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi
> +++ b/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi
> @@ -762,5 +762,23 @@
> <GIC_SPI 209 IRQ_TYPE_LEVEL_HIGH>;
> dma-coherent;
> };
> +
> + /* WRIOP0: 0x8b8_0000, E-MDIO1: 0x1_6000 */
> + emdio1: mdio@8b96000 {
> + compatible = "fsl,fman-memac-mdio";
> + reg = <0x0 0x8b96000 0x0 0x1000>;
> + #address-cells = <1>;
> + #size-cells = <0>;
> + little-endian; /* force the driver in LE mode */
Still doesn't fully align with the binding at
"Documentation/devicetree/bindings/powerpc/fsl/fman.txt".
It should either have the "interrupts" property for external MDIO or
"fsl,fman-internal-mdio" for internal MDIO.
> + };
> +
> + /* WRIOP0: 0x8b8_0000, E-MDIO2: 0x1_7000 */
> + emdio2: mdio@8b97000 {
> + compatible = "fsl,fman-memac-mdio";
> + reg = <0x0 0x8b97000 0x0 0x1000>;
> + #address-cells = <1>;
> + #size-cells = <0>;
> + little-endian; /* force the driver in LE mode */
> + };
> };
> };
> --
> 2.17.1
>
^ permalink raw reply
* Re: [PATCH v2] net: emac: remove IBM_EMAC_RX_SKB_HEADROOM
From: David Miller @ 2019-02-05 18:36 UTC (permalink / raw)
To: chunkeey; +Cc: netdev
In-Reply-To: <20190204215829.27063-1-chunkeey@gmail.com>
From: Christian Lamparter <chunkeey@gmail.com>
Date: Mon, 4 Feb 2019 22:58:29 +0100
> @@ -1195,6 +1195,27 @@ static inline int emac_alloc_rx_skb(struct emac_instance *dev, int slot,
> return 0;
> }
>
> +static inline int
> +emac_alloc_rx_skb(struct emac_instance *dev, int slot)
> +{
> + struct sk_buff *skb;
> +
> + skb = __netdev_alloc_skb_ip_align(dev->ndev, dev->rx_skb_size,
> + GFP_KERNEL);
> +
> + return __emac_prepare_rx_skb(skb, dev, slot);
> +}
> +
> +static inline int
> +emac_alloc_rx_skb_napi(struct emac_instance *dev, int slot)
> +{
> + struct sk_buff *skb;
> +
> + skb = napi_alloc_skb(&dev->mal->napi, dev->rx_skb_size);
> +
> + return __emac_prepare_rx_skb(skb, dev, slot);
> +}
> +
Do not use the inline keyword in foo.c files, let the compiler decide.
Thank you.
^ permalink raw reply
* Re: [PATCH net-next 0/3] nixge: Fixed-link support
From: David Miller @ 2019-02-05 18:34 UTC (permalink / raw)
To: moritz.fischer
Cc: netdev, devicetree, linux-kernel, alex.williams, andrew, robh+dt,
mdf
In-Reply-To: <20190204173040.5538-1-moritz.fischer@ettus.com>
From: Moritz Fischer <moritz.fischer@ettus.com>
Date: Mon, 4 Feb 2019 09:30:37 -0800
> From: Moritz Fischer <mdf@kernel.org>
>
> This series adds fixed-link support to nixge.
>
> The first patch corrects the binding to correctly reflect
> hardware that does not come with MDIO cores instantiated.
>
> The second patch adds fixed link support to the driver.
>
> The third patch updates the binding document with the now
> optional (formerly required) phy-handle property and references
> the fixed-link docs.
Series applied, thank you.
^ permalink raw reply
* Re: [PATCH net] rxrpc: bad unlock balance in rxrpc_recvmsg
From: David Miller @ 2019-02-05 3:19 UTC (permalink / raw)
To: edumazet; +Cc: netdev, eric.dumazet, dhowells, syzkaller
In-Reply-To: <20190204163606.236419-1-edumazet@google.com>
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 4 Feb 2019 08:36:06 -0800
> When either "goto wait_interrupted;" or "goto wait_error;"
> paths are taken, socket lock has already been released.
>
> This patch fixes following syzbot splat :
...
> Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: David Howells <dhowells@redhat.com>
> Reported-by: syzbot <syzkaller@googlegroups.com>
David H., please take a look at this.
I can apply it directly and queue it up for -stable, just let me
know.
Thanks.
^ permalink raw reply
* Re: [PATCH v2] net: phy: fixed-phy: Drop GPIO from fixed_phy_add()
From: David Miller @ 2019-02-05 2:33 UTC (permalink / raw)
To: linus.walleij; +Cc: andrew, f.fainelli, netdev, Laurent.pinchart
In-Reply-To: <20190204102618.20583-1-linus.walleij@linaro.org>
From: Linus Walleij <linus.walleij@linaro.org>
Date: Mon, 4 Feb 2019 11:26:18 +0100
> All users of the fixed_phy_add() pass -1 as GPIO number
> to the fixed phy driver, and all users of fixed_phy_register()
> pass -1 as GPIO number as well, except for the device
> tree MDIO bus.
>
> Any new users should create a proper device and pass the
> GPIO as a descriptor associated with the device so delete
> the GPIO argument from the calls and drop the code looking
> requesting a GPIO in fixed_phy_add().
>
> In fixed phy_register(), investigate the "fixed-link"
> node and pick the GPIO descriptor from "link-gpios" if
> this property exists. Move the corresponding code out
> of of_mdio.c as the fixed phy code anyways requires
> OF to be in use.
>
> Tested-by: Andrew Lunn <andrew@lunn.ch>
> Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
This looks like net-next material, so that's where I have applied
this.
Thanks.
^ permalink raw reply
* Re: [PATCH v2 2/2] r8169: Avoid pointer aliasing
From: David Miller @ 2019-02-05 3:20 UTC (permalink / raw)
To: thierry.reding; +Cc: hkallweit1, andrew, nic_swsd, netdev, linux-kernel
In-Reply-To: <20190204164213.30727-2-thierry.reding@gmail.com>
From: Thierry Reding <thierry.reding@gmail.com>
Date: Mon, 4 Feb 2019 17:42:13 +0100
> @@ -7316,7 +7325,7 @@ static int rtl_get_ether_clk(struct rtl8169_private *tp)
> static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
> {
> const struct rtl_cfg_info *cfg = rtl_cfg_infos + ent->driver_data;
> - u8 mac_addr[ETH_ALEN] __aligned(4) = {};
> + u8 mac_addr[ETH_ALEN] = {};
> struct rtl8169_private *tp;
I agree with Heiner, you have to provide at least 2 byte alignment for this
buffer due to the reasons he stated.
^ permalink raw reply
* Re: [PATCH] net: dsa: slave: Don't propagate flag changes on down slave interfaces
From: David Miller @ 2019-02-05 2:29 UTC (permalink / raw)
To: f.fainelli; +Cc: rdong.ge, andrew, vivien.didelot, netdev
In-Reply-To: <7f8fadc6-1bb5-03b5-4f5e-a407e9116399@gmail.com>
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Sat, 2 Feb 2019 09:05:11 -0800
> Le 2/2/19 à 6:29 AM, Rundong Ge a écrit :
>> The unbalance of master's promiscuity or allmulti will happen after ifdown
>> and ifup a slave interface which is in a bridge.
>>
>> When we ifdown a slave interface , both the 'dsa_slave_close' and
>> 'dsa_slave_change_rx_flags' will clear the master's flags. The flags
>> of master will be decrease twice.
>> In the other hand, if we ifup the slave interface again, since the
>> slave's flags were cleared the 'dsa_slave_open' won't set the master's
>> flag, only 'dsa_slave_change_rx_flags' that triggered by 'br_add_if'
>> will set the master's flags. The flags of master is increase once.
>>
>> Only propagating flag changes when a slave interface is up makes
>> sure this does not happen. The 'vlan_dev_change_rx_flags' had the
>> same problem and was fixed, and changes here follows that fix.
>
> VLAN code under net/8021q/vlan_dev.c::vlan_dev_change_rx_flags() appears
> to do the same thing that you are proposing, so this looks fine to me.
> Since that is a bugfix, we should probably add:
>
> Fixes: 91da11f870f0 ("net: Distributed Switch Architecture protocol
> support")
Applied with Fixes tag added, and queued up for -stable.
Thanks.
^ permalink raw reply
* Re: [PATCH 00/12] net: Introduce ndo_get_port_parent_id()
From: David Miller @ 2019-02-05 2:24 UTC (permalink / raw)
To: f.fainelli; +Cc: netdev
In-Reply-To: <20190204233633.20421-1-f.fainelli@gmail.com>
Florian, your email headers, particularly the CC: list, were too huge.
So vger.kernel.org blocked them.
Please trim the CC: list down and make it more reasonable.
Thank you.
^ permalink raw reply
* Re: [ovs-dev] [PATCH net-next V2 1/1] openvswitch: Declare ovs key structures using macros
From: Gregory Rose @ 2019-02-05 18:22 UTC (permalink / raw)
To: Eli Britstein, David Miller, yihung.wei@gmail.com
Cc: dev@openvswitch.org, netdev@vger.kernel.org,
simon.horman@netronome.com
In-Reply-To: <4f57a34c-55a3-ea39-b9cc-2e1dbebc99c1@mellanox.com>
On 2/5/2019 4:02 AM, Eli Britstein wrote:
> On 2/4/2019 10:07 PM, David Miller wrote:
>> From: Yi-Hung Wei <yihung.wei@gmail.com>
>> Date: Mon, 4 Feb 2019 10:47:18 -0800
>>
>>> For example, to see how 'struct ovs_key_ipv6' is defined, now we need
>>> to trace how OVS_KEY_IPV6_FIELDS is defined, and how OVS_KEY_FIELD_ARR
>>> and OVS_KEY_FIELD defined. I think it makes the header file to be
>>> more complicated.
>> I completely agree.
>>
>> Unless this is totally unavoidable, I do not want to apply a patch
>> which makes reading and auditing the networking code more difficult.
> This technique is discussed for example in
> https://stackoverflow.com/questions/6635851/real-world-use-of-x-macros,
> and I found existing examples of using it in the kernel tree:
>
> __BPF_FUNC_MAPPER in commit ebb676daa1a34 ("bpf: Print function name in
> addition to function id")
>
> __AAL_STAT_ITEMS and __SONET_ITEMS in commit 607ca46e97a1b ("UAPI:
> (Scripted) Disintegrate include/linux"), the successor of commit
> 1da177e4c3f4 ("Linux-2.6.12-rc2"). I didn't dig deeper to the past.
>
> I can agree it makes that H file a bit more complicated, but for sure
> less than ## macros that are widely used.
>
> However, I think the alternatives of generating such defines by some
> scripts, or having the fields in more than one place are even worse, so
> it is a kind of unavoidable.
Why is using a script to generate defines for the requirements of your
application or driver so bad? Your patch
turns openvswitch.h into some hardly readable code - using a script and
having a machine output the macros
your application or driver needs seems like a much better alternative to me.
- Greg
>
> Please reconsider regarding applying this patch.
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
^ permalink raw reply
* [linux-next:master 5947/6329] ERROR: "check_mmu_context" [arch/mips/kvm/kvm.ko] undefined!
From: kbuild test robot @ 2019-02-05 18:14 UTC (permalink / raw)
To: David S. Miller; +Cc: kbuild-all, netdev
[-- Attachment #1: Type: text/plain, Size: 1069 bytes --]
tree: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git master
head: 1ff5403385648b1554fd1aeffffdeec71d9cd41c
commit: bde8ef804c0dcf62b73cba01832e45f03f27694c [5947/6329] Merge remote-tracking branch 'net-next/master'
config: mips-malta_kvm_defconfig (attached as .config)
compiler: mipsel-linux-gnu-gcc (Debian 8.2.0-11) 8.2.0
reproduce:
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
git checkout bde8ef804c0dcf62b73cba01832e45f03f27694c
# save the attached .config to linux build tree
GCC_VERSION=8.2.0 make.cross ARCH=mips
All errors (new ones prefixed by >>):
>> ERROR: "check_mmu_context" [arch/mips/kvm/kvm.ko] undefined!
>> ERROR: "check_switch_mmu_context" [arch/mips/kvm/kvm.ko] undefined!
>> ERROR: "get_new_mmu_context" [arch/mips/kvm/kvm.ko] undefined!
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 19546 bytes --]
^ permalink raw reply
* [RFC bpf-next 7/7] net: flow_dissector: pass net argument to the eth_get_headlen
From: Stanislav Fomichev @ 2019-02-05 17:36 UTC (permalink / raw)
To: netdev; +Cc: davem, ast, daniel, simon.horman, willemb, Stanislav Fomichev
In-Reply-To: <20190205173629.160717-1-sdf@google.com>
This is an example and for RFC only, just to show what the end result
would look like. In a proper submission, we need to introduce new
helper and convert each driver one by one to make sure we don't
break 'git bisect'.
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +-
drivers/net/ethernet/hisilicon/hns/hns_enet.c | 3 ++-
drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 3 ++-
drivers/net/ethernet/intel/fm10k/fm10k_main.c | 2 +-
drivers/net/ethernet/intel/i40e/i40e_txrx.c | 3 ++-
drivers/net/ethernet/intel/iavf/iavf_txrx.c | 2 +-
drivers/net/ethernet/intel/ice/ice_txrx.c | 2 +-
drivers/net/ethernet/intel/igb/igb_main.c | 2 +-
drivers/net/ethernet/intel/igc/igc_main.c | 2 +-
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +-
drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 3 ++-
drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 3 ++-
drivers/net/tun.c | 3 ++-
include/linux/etherdevice.h | 2 +-
net/ethernet/eth.c | 5 +++--
15 files changed, 23 insertions(+), 16 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 6a512871176b..efc3a0455967 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -889,7 +889,7 @@ static struct sk_buff *bnxt_rx_page_skb(struct bnxt *bp,
DMA_ATTR_WEAK_ORDERING);
if (unlikely(!payload))
- payload = eth_get_headlen(data_ptr, len);
+ payload = eth_get_headlen(bp->dev, data_ptr, len);
skb = napi_alloc_skb(&rxr->bnapi->napi, payload);
if (!skb) {
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
index 60e7d7ae3787..caf075a7449c 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
@@ -603,7 +603,8 @@ static int hns_nic_poll_rx_skb(struct hns_nic_ring_data *ring_data,
} else {
ring->stats.seg_pkt_cnt++;
- pull_len = eth_get_headlen(va, HNS_RX_HEAD_SIZE);
+ pull_len = eth_get_headlen(dev_net(ndev), va,
+ HNS_RX_HEAD_SIZE);
memcpy(__skb_put(skb, pull_len), va,
ALIGN(pull_len, sizeof(long)));
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 40b17223ee41..81308c7c7d1e 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -2433,7 +2433,8 @@ static int hns3_alloc_skb(struct hns3_enet_ring *ring, int length,
ring->stats.seg_pkt_cnt++;
u64_stats_update_end(&ring->syncp);
- ring->pull_len = eth_get_headlen(va, HNS3_RX_HEAD_SIZE);
+ ring->pull_len = eth_get_headlen(dev_net(netdev), va,
+ HNS3_RX_HEAD_SIZE);
__skb_put(skb, ring->pull_len);
hns3_nic_reuse_page(skb, ring->frag_num++, ring, ring->pull_len,
desc_cb);
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
index 6fd15a734324..b52dfad58b71 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
@@ -278,7 +278,7 @@ static bool fm10k_add_rx_frag(struct fm10k_rx_buffer *rx_buffer,
/* we need the header to contain the greater of either ETH_HLEN or
* 60 bytes if the skb->len is less than 60 for skb_pad.
*/
- pull_len = eth_get_headlen(va, FM10K_RX_HDR_LEN);
+ pull_len = eth_get_headlen(skb_net(skb), va, FM10K_RX_HDR_LEN);
/* align pull length to size of long to optimize memcpy performance */
memcpy(__skb_put(skb, pull_len), va, ALIGN(pull_len, sizeof(long)));
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index a7e14e98889f..7990f80736e2 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2035,7 +2035,8 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring,
/* Determine available headroom for copy */
headlen = size;
if (headlen > I40E_RX_HDR_SIZE)
- headlen = eth_get_headlen(xdp->data, I40E_RX_HDR_SIZE);
+ headlen = eth_get_headlen(skb_net(skb), xdp->data,
+ I40E_RX_HDR_SIZE);
/* align pull length to size of long to optimize memcpy performance */
memcpy(__skb_put(skb, headlen), xdp->data,
diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
index 9b4d7cec2e18..d3c7fd521fd7 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
@@ -1315,7 +1315,7 @@ static struct sk_buff *iavf_construct_skb(struct iavf_ring *rx_ring,
/* Determine available headroom for copy */
headlen = size;
if (headlen > IAVF_RX_HDR_SIZE)
- headlen = eth_get_headlen(va, IAVF_RX_HDR_SIZE);
+ headlen = eth_get_headlen(skb_net(skb), va, IAVF_RX_HDR_SIZE);
/* align pull length to size of long to optimize memcpy performance */
memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 2357fcac996b..0876ddbc495c 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -708,7 +708,7 @@ static void ice_pull_tail(struct sk_buff *skb)
/* we need the header to contain the greater of either ETH_HLEN or
* 60 bytes if the skb->len is less than 60 for skb_pad.
*/
- pull_len = eth_get_headlen(va, ICE_RX_HDR_SIZE);
+ pull_len = eth_get_headlen(skb_net(skb), va, ICE_RX_HDR_SIZE);
/* align pull length to size of long to optimize memcpy performance */
skb_copy_to_linear_data(skb, va, ALIGN(pull_len, sizeof(long)));
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index dfa357b1a9d6..924d91696a01 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -8064,7 +8064,7 @@ static struct sk_buff *igb_construct_skb(struct igb_ring *rx_ring,
/* Determine available headroom for copy */
headlen = size;
if (headlen > IGB_RX_HDR_LEN)
- headlen = eth_get_headlen(va, IGB_RX_HDR_LEN);
+ headlen = eth_get_headlen(skb_net(skb), va, IGB_RX_HDR_LEN);
/* align pull length to size of long to optimize memcpy performance */
memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));
diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index f20183037fb2..71e14f309189 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -1148,7 +1148,7 @@ static struct sk_buff *igc_construct_skb(struct igc_ring *rx_ring,
/* Determine available headroom for copy */
headlen = size;
if (headlen > IGC_RX_HDR_LEN)
- headlen = eth_get_headlen(va, IGC_RX_HDR_LEN);
+ headlen = eth_get_headlen(skb_net(skb), va, IGC_RX_HDR_LEN);
/* align pull length to size of long to optimize memcpy performance */
memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index b53087a980ef..9e9e6e5e679a 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1799,7 +1799,7 @@ static void ixgbe_pull_tail(struct ixgbe_ring *rx_ring,
* we need the header to contain the greater of either ETH_HLEN or
* 60 bytes if the skb->len is less than 60 for skb_pad.
*/
- pull_len = eth_get_headlen(va, IXGBE_RX_HDR_SIZE);
+ pull_len = eth_get_headlen(skb_net(skb), va, IXGBE_RX_HDR_SIZE);
/* align pull length to size of long to optimize memcpy performance */
skb_copy_to_linear_data(skb, va, ALIGN(pull_len, sizeof(long)));
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 49e23afa05a2..c45be7487d57 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -895,7 +895,8 @@ struct sk_buff *ixgbevf_construct_skb(struct ixgbevf_ring *rx_ring,
/* Determine available headroom for copy */
headlen = size;
if (headlen > IXGBEVF_RX_HDR_SIZE)
- headlen = eth_get_headlen(xdp->data, IXGBEVF_RX_HDR_SIZE);
+ headlen = eth_get_headlen(skb_net(skb), xdp->data,
+ IXGBEVF_RX_HDR_SIZE);
/* align pull length to size of long to optimize memcpy performance */
memcpy(__skb_put(skb, headlen), xdp->data,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 598ad7e4d5c9..b9fc28787f44 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -167,7 +167,8 @@ static inline u16 mlx5e_calc_min_inline(enum mlx5_inline_modes mode,
case MLX5_INLINE_MODE_NONE:
return 0;
case MLX5_INLINE_MODE_TCP_UDP:
- hlen = eth_get_headlen(skb->data, skb_headlen(skb));
+ hlen = eth_get_headlen(skb_net(skb), skb->data,
+ skb_headlen(skb));
if (hlen == ETH_HLEN && !skb_vlan_tag_present(skb))
hlen += VLAN_HLEN;
break;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 18656c4094b3..26063a8daf75 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1961,7 +1961,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
if (frags) {
/* Exercise flow dissector code path. */
- u32 headlen = eth_get_headlen(skb->data, skb_headlen(skb));
+ u32 headlen = eth_get_headlen(skb_net(skb), skb->data,
+ skb_headlen(skb));
if (unlikely(headlen > skb_headlen(skb))) {
this_cpu_inc(tun->pcpu_stats->rx_dropped);
diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 2c0af7b00715..2e2b5ed30bfc 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -33,7 +33,7 @@ struct device;
int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr);
unsigned char *arch_get_platform_mac_address(void);
int nvmem_get_mac_address(struct device *dev, void *addrbuf);
-u32 eth_get_headlen(void *data, unsigned int max_len);
+u32 eth_get_headlen(struct net *net, void *data, unsigned int max_len);
__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
extern const struct header_ops eth_header_ops;
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 155d55025bfc..6fbfd6e41548 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -119,13 +119,14 @@ EXPORT_SYMBOL(eth_header);
/**
* eth_get_headlen - determine the length of header for an ethernet frame
+ * @net: pointer to device network namespace
* @data: pointer to start of frame
* @len: total length of frame
*
* Make a best effort attempt to pull the length for all of the headers for
* a given frame in a linear buffer.
*/
-u32 eth_get_headlen(void *data, unsigned int len)
+u32 eth_get_headlen(struct net *net, void *data, unsigned int len)
{
const unsigned int flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG;
const struct ethhdr *eth = (const struct ethhdr *)data;
@@ -136,7 +137,7 @@ u32 eth_get_headlen(void *data, unsigned int len)
return len;
/* parse any remaining L2/L3 headers, check for L4 */
- if (!skb_flow_dissect_flow_keys_basic(NULL, NULL, &keys, data,
+ if (!skb_flow_dissect_flow_keys_basic(net, NULL, &keys, data,
eth->h_proto, sizeof(*eth),
len, flags))
return max_t(u32, keys.control.thoff, sizeof(*eth));
--
2.20.1.611.gfbb209baf1-goog
^ permalink raw reply related
* [RFC bpf-next 6/7] selftests/bpf: add flow dissector bpf_skb_load_bytes helper test
From: Stanislav Fomichev @ 2019-02-05 17:36 UTC (permalink / raw)
To: netdev; +Cc: davem, ast, daniel, simon.horman, willemb, Stanislav Fomichev
In-Reply-To: <20190205173629.160717-1-sdf@google.com>
With the on-stack skb, we want to make sure we don't trigger any
shinfo access. Add small test which tries to read the data past
the packet boundary.
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
tools/testing/selftests/bpf/test_progs.c | 49 ++++++++++++++++++++++++
1 file changed, 49 insertions(+)
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index c52bd90fbb34..c12f61efc427 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -1995,6 +1995,54 @@ static void test_flow_dissector(void)
bpf_object__close(obj);
}
+static void test_flow_dissector_load_bytes(void)
+{
+ struct bpf_flow_keys flow_keys;
+ __u32 duration, retval, size;
+ struct bpf_insn prog[] = {
+ // BPF_REG_1 - 1st argument: context
+ // BPF_REG_2 - 2nd argument: offset, start at last byte + 1
+ BPF_MOV64_IMM(BPF_REG_2, sizeof(pkt_v4)),
+ // BPF_REG_3 - 3rd argument: destination, reserve byte on stack
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_3, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -1),
+ // BPF_REG_4 - 4th argument: copy one byte
+ BPF_MOV64_IMM(BPF_REG_4, 1),
+ // bpf_skb_load_bytes(ctx, sizeof(pkt_v4), ptr, 1)
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_skb_load_bytes),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ // if (ret == 0) return BPF_DROP (2)
+ BPF_MOV64_IMM(BPF_REG_0, BPF_DROP),
+ BPF_EXIT_INSN(),
+ // if (ret != 0) return BPF_OK (0)
+ BPF_MOV64_IMM(BPF_REG_0, BPF_OK),
+ BPF_EXIT_INSN(),
+ };
+ int fd, err;
+
+ /* make sure bpf_skb_load_bytes helper doesn't cause any
+ * problems when used with the fake skb in the flow
+ * dissector (try to read past the last byte)
+ */
+ fd = bpf_load_program(BPF_PROG_TYPE_FLOW_DISSECTOR, prog,
+ ARRAY_SIZE(prog), "GPL", 0, NULL, 0);
+ CHECK(fd < 0,
+ "flow_dissector-bpf_skb_load_bytes-load",
+ "fd %d errno %d\n",
+ fd, errno);
+
+ err = bpf_prog_test_run(fd, 1, &pkt_v4, sizeof(pkt_v4),
+ &flow_keys, &size, &retval, &duration);
+ CHECK(size != sizeof(flow_keys) || err || retval != 1,
+ "flow_dissector-bpf_skb_load_bytes",
+ "err %d errno %d retval %d duration %d size %u/%lu\n",
+ err, errno, retval, duration, size, sizeof(flow_keys));
+
+ if (fd >= -1)
+ close(fd);
+}
+
static void *test_spin_lock(void *arg)
{
__u32 duration, retval;
@@ -2136,6 +2184,7 @@ int main(void)
test_queue_stack_map(QUEUE);
test_queue_stack_map(STACK);
test_flow_dissector();
+ test_flow_dissector_load_bytes();
test_spinlock();
test_map_lock();
--
2.20.1.611.gfbb209baf1-goog
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox