* [RFC PATCH net-next 2/6 v2] net/sched: cls_flower: add match on ct info
From: Paul Blakey @ 2019-01-29 8:02 UTC (permalink / raw)
To: Guy Shattah, Marcelo Leitner, Aaron Conole, John Hurley,
Simon Horman, Justin Pettit, Gregory Rose, Eelco Chaudron,
Flavio Leitner, Florian Westphal, Jiri Pirko, Rashid Khan,
Sushil Kulkarni, Andy Gospodarek, Roi Dayan, Yossi Kuperman,
Or Gerlitz, Rony Efraim, davem@davemloft.net, netdev
Cc: Paul Blakey
In-Reply-To: <1548748926-23822-2-git-send-email-paulb@mellanox.com>
New match on ct state, mark, and label from ct_info on the skb.
This can be set via sending the packet to ct via the ct action.
Signed-off-by: Paul Blakey <paulb@mellanox.com>
---
include/uapi/linux/pkt_cls.h | 17 ++++++
net/sched/cls_flower.c | 126 +++++++++++++++++++++++++++++++++++++++++--
2 files changed, 140 insertions(+), 3 deletions(-)
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 02ac251..121f1ef 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -497,11 +497,28 @@ enum {
TCA_FLOWER_KEY_PORT_DST_MIN, /* be16 */
TCA_FLOWER_KEY_PORT_DST_MAX, /* be16 */
+ TCA_FLOWER_KEY_CT_STATE,
+ TCA_FLOWER_KEY_CT_STATE_MASK,
+ TCA_FLOWER_KEY_CT_ZONE,
+ TCA_FLOWER_KEY_CT_ZONE_MASK,
+ TCA_FLOWER_KEY_CT_MARK,
+ TCA_FLOWER_KEY_CT_MARK_MASK,
+ TCA_FLOWER_KEY_CT_LABELS,
+ TCA_FLOWER_KEY_CT_LABELS_MASK,
+
__TCA_FLOWER_MAX,
};
#define TCA_FLOWER_MAX (__TCA_FLOWER_MAX - 1)
+
+#define TCA_FLOWER_KEY_CT_FLAGS_NEW 0x01 /* Beginning of a new connection. */
+#define TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED 0x02 /* Part of an existing connection. */
+#define TCA_FLOWER_KEY_CT_FLAGS_RELATED 0x04 /* Related to an established connection. */
+#define TCA_FLOWER_KEY_CT_FLAGS_INVALID 0x10 /* Could not track connection. */
+#define TCA_FLOWER_KEY_CT_FLAGS_TRACKED 0x20 /* Conntrack has occurred. */
+
+
enum {
TCA_FLOWER_KEY_ENC_OPTS_UNSPEC,
TCA_FLOWER_KEY_ENC_OPTS_GENEVE, /* Nested
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index f6aa57f..bf74a31 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -29,6 +29,9 @@
#include <net/dst.h>
#include <net/dst_metadata.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+
struct fl_flow_key {
int indev_ifindex;
struct flow_dissector_key_control control;
@@ -57,6 +60,11 @@ struct fl_flow_key {
struct flow_dissector_key_enc_opts enc_opts;
struct flow_dissector_key_ports tp_min;
struct flow_dissector_key_ports tp_max;
+
+ u8 ct_state;
+ u16 ct_zone;
+ u32 ct_mark;
+ u32 ct_labels[NF_CT_LABELS_MAX_SIZE / sizeof(u32)];
} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
struct fl_flow_mask_range {
@@ -265,19 +273,55 @@ static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
return __fl_lookup(mask, mkey);
}
+static u8 fl_ct_get_state(enum ip_conntrack_info ctinfo)
+{
+ u8 ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED;
+
+ switch (ctinfo) {
+ case IP_CT_ESTABLISHED:
+ case IP_CT_ESTABLISHED_REPLY:
+ ct_state |= TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED;
+ break;
+ case IP_CT_RELATED:
+ case IP_CT_RELATED_REPLY:
+ ct_state |= TCA_FLOWER_KEY_CT_FLAGS_RELATED;
+ break;
+ case IP_CT_NEW:
+ ct_state |= TCA_FLOWER_KEY_CT_FLAGS_NEW;
+ break;
+ default:
+ break;
+ }
+
+ return ct_state;
+}
+
static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
struct cls_fl_head *head = rcu_dereference_bh(tp->root);
- struct cls_fl_filter *f;
- struct fl_flow_mask *mask;
- struct fl_flow_key skb_key;
+ enum ip_conntrack_info ctinfo;
struct fl_flow_key skb_mkey;
+ struct fl_flow_key skb_key;
+ struct fl_flow_mask *mask;
+ struct nf_conn_labels *cl;
+ struct cls_fl_filter *f;
+ struct nf_conn *ct;
list_for_each_entry_rcu(mask, &head->masks, list) {
fl_clear_masked_range(&skb_key, mask);
skb_key.indev_ifindex = skb->skb_iif;
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct) {
+ skb_key.ct_state = fl_ct_get_state(ctinfo);
+ skb_key.ct_zone = ct->zone.id;
+ skb_key.ct_mark = ct->mark;
+
+ cl = nf_ct_labels_find(ct);
+ if (cl)
+ memcpy(skb_key.ct_labels, cl->bits, sizeof(skb_key.ct_labels));
+ }
/* skb_flow_dissect() does not set n_proto in case an unknown
* protocol, so do it rather here.
*/
@@ -562,6 +606,14 @@ static void *fl_get(struct tcf_proto *tp, u32 handle)
[TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 },
[TCA_FLOWER_KEY_ENC_OPTS] = { .type = NLA_NESTED },
[TCA_FLOWER_KEY_ENC_OPTS_MASK] = { .type = NLA_NESTED },
+ [TCA_FLOWER_KEY_CT_STATE] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_CT_STATE_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_CT_ZONE] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_CT_ZONE_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_CT_MARK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_CT_MARK_MASK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_CT_LABELS] = { .type = NLA_UNSPEC, .len = 16 },
+ [TCA_FLOWER_KEY_CT_LABELS_MASK] = { .type = NLA_UNSPEC, .len = 16 },
};
static const struct nla_policy
@@ -872,6 +924,36 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
return 0;
}
+static int fl_set_key_ct(struct nlattr **tb, struct fl_flow_key *key,
+ struct fl_flow_key *mask,
+ struct netlink_ext_ack *extack)
+{
+ size_t label_len = 0;
+
+ if (tb[TCA_FLOWER_KEY_CT_STATE]) {
+ key->ct_state = nla_get_u8(tb[TCA_FLOWER_KEY_CT_STATE]);
+ mask->ct_state = nla_get_u8(tb[TCA_FLOWER_KEY_CT_STATE_MASK]);
+ }
+
+ if (tb[TCA_FLOWER_KEY_CT_ZONE_MASK]) {
+ key->ct_zone = nla_get_u16(tb[TCA_FLOWER_KEY_CT_ZONE]);
+ mask->ct_zone = nla_get_u16(tb[TCA_FLOWER_KEY_CT_ZONE_MASK]);
+ }
+
+ if (tb[TCA_FLOWER_KEY_CT_MARK_MASK]) {
+ key->ct_mark = nla_get_u32(tb[TCA_FLOWER_KEY_CT_MARK]);
+ mask->ct_mark = nla_get_u32(tb[TCA_FLOWER_KEY_CT_MARK_MASK]);
+ }
+
+ if (tb[TCA_FLOWER_KEY_CT_LABELS_MASK]) {
+ label_len = nla_len(tb[TCA_FLOWER_KEY_CT_LABELS]);
+ memcpy(key->ct_labels, nla_data(tb[TCA_FLOWER_KEY_CT_LABELS]), label_len);
+ memcpy(mask->ct_labels, nla_data(tb[TCA_FLOWER_KEY_CT_LABELS_MASK]), label_len);
+ }
+
+ return 0;
+}
+
static int fl_set_key(struct net *net, struct nlattr **tb,
struct fl_flow_key *key, struct fl_flow_key *mask,
struct netlink_ext_ack *extack)
@@ -1082,6 +1164,10 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
return ret;
}
+ ret = fl_set_key_ct(tb, key, mask, extack);
+ if (ret)
+ return ret;
+
if (tb[TCA_FLOWER_KEY_FLAGS])
ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags);
@@ -1761,6 +1847,37 @@ static int fl_dump_key_geneve_opt(struct sk_buff *skb,
return -EMSGSIZE;
}
+static int fl_dump_key_ct(struct sk_buff *skb,
+ struct fl_flow_key *key,
+ struct fl_flow_key *mask)
+{
+ if(fl_dump_key_val(skb, &key->ct_state, TCA_FLOWER_KEY_CT_STATE,
+ &mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK,
+ sizeof(key->ct_state)))
+ goto nla_put_failure;
+
+ if (fl_dump_key_val(skb, &key->ct_zone, TCA_FLOWER_KEY_CT_ZONE,
+ &mask->ct_zone, TCA_FLOWER_KEY_CT_ZONE_MASK,
+ sizeof(key->ct_zone)))
+ goto nla_put_failure;
+
+ if (fl_dump_key_val(skb, &key->ct_mark, TCA_FLOWER_KEY_CT_MARK,
+ &mask->ct_mark, TCA_FLOWER_KEY_CT_MARK_MASK,
+ sizeof(key->ct_mark)))
+ goto nla_put_failure;
+
+ if (fl_dump_key_val(skb, &key->ct_labels, TCA_FLOWER_KEY_CT_LABELS,
+ &mask->ct_labels, TCA_FLOWER_KEY_CT_LABELS_MASK,
+ sizeof(key->ct_labels)))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+
static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type,
struct flow_dissector_key_enc_opts *enc_opts)
{
@@ -1994,6 +2111,9 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net,
fl_dump_key_enc_opt(skb, &key->enc_opts, &mask->enc_opts))
goto nla_put_failure;
+ if (fl_dump_key_ct(skb, key, mask))
+ goto nla_put_failure;
+
if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags))
goto nla_put_failure;
--
1.8.3.1
^ permalink raw reply related
* [RFC PATCH net-next 3/6 v2] net/sched: cls_flower: Add ematch support
From: Paul Blakey @ 2019-01-29 8:02 UTC (permalink / raw)
To: Guy Shattah, Marcelo Leitner, Aaron Conole, John Hurley,
Simon Horman, Justin Pettit, Gregory Rose, Eelco Chaudron,
Flavio Leitner, Florian Westphal, Jiri Pirko, Rashid Khan,
Sushil Kulkarni, Andy Gospodarek, Roi Dayan, Yossi Kuperman,
Or Gerlitz, Rony Efraim, davem@davemloft.net, netdev
Cc: Paul Blakey
In-Reply-To: <1548748926-23822-2-git-send-email-paulb@mellanox.com>
TODO: handle EEXist.
Signed-off-by: Paul Blakey <paulb@mellanox.com>
---
include/uapi/linux/pkt_cls.h | 2 ++
net/sched/cls_flower.c | 22 ++++++++++++++++++----
2 files changed, 20 insertions(+), 4 deletions(-)
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 121f1ef..d848d6d 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -506,6 +506,8 @@ enum {
TCA_FLOWER_KEY_CT_LABELS,
TCA_FLOWER_KEY_CT_LABELS_MASK,
+ TCA_FLOWER_EMATCHES,
+
__TCA_FLOWER_MAX,
};
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index bf74a31..f11fda0 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -104,6 +104,7 @@ struct cls_fl_filter {
struct rhash_head ht_node;
struct fl_flow_key mkey;
struct tcf_exts exts;
+ struct tcf_ematch_tree ematches;
struct tcf_result res;
struct fl_flow_key key;
struct list_head list;
@@ -332,10 +333,14 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
fl_set_masked_key(&skb_mkey, &skb_key, mask);
f = fl_lookup(mask, &skb_mkey, &skb_key);
- if (f && !tc_skip_sw(f->flags)) {
- *res = f->res;
- return tcf_exts_exec(skb, &f->exts, res);
- }
+ if (!f || tc_skip_sw(f->flags))
+ continue;
+
+ if (!tcf_em_tree_match(skb, &f->ematches, NULL))
+ continue;
+
+ *res = f->res;
+ return tcf_exts_exec(skb, &f->exts, res);
}
return -1;
}
@@ -388,6 +393,7 @@ static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask,
static void __fl_destroy_filter(struct cls_fl_filter *f)
{
tcf_exts_destroy(&f->exts);
+ tcf_em_tree_destroy(&f->ematches);
tcf_exts_put_net(&f->exts);
kfree(f);
}
@@ -523,6 +529,7 @@ static void *fl_get(struct tcf_proto *tp, u32 handle)
static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
[TCA_FLOWER_UNSPEC] = { .type = NLA_UNSPEC },
[TCA_FLOWER_CLASSID] = { .type = NLA_U32 },
+ [TCA_FLOWER_EMATCHES] = { .type = NLA_NESTED },
[TCA_FLOWER_INDEV] = { .type = NLA_STRING,
.len = IFNAMSIZ },
[TCA_FLOWER_KEY_ETH_DST] = { .len = ETH_ALEN },
@@ -1348,6 +1355,10 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp,
if (err < 0)
return err;
+ err = tcf_em_tree_validate(tp, tb[TCA_FLOWER_EMATCHES], &f->ematches);
+ if (err < 0)
+ return err;
+
if (tb[TCA_FLOWER_CLASSID]) {
f->res.classid = nla_get_u32(tb[TCA_FLOWER_CLASSID]);
tcf_bind_filter(tp, &f->res, base);
@@ -2143,6 +2154,9 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
nla_put_u32(skb, TCA_FLOWER_CLASSID, f->res.classid))
goto nla_put_failure;
+ if (tcf_em_tree_dump(skb, &f->ematches, TCA_FLOWER_EMATCHES) < 0)
+ goto nla_put_failure;
+
key = &f->key;
mask = &f->mask->key;
--
1.8.3.1
^ permalink raw reply related
* [RFC PATCH net-next 6/6 v2] net/sched: act_ct: Add tc recirc id set/del support
From: Paul Blakey @ 2019-01-29 8:02 UTC (permalink / raw)
To: Guy Shattah, Marcelo Leitner, Aaron Conole, John Hurley,
Simon Horman, Justin Pettit, Gregory Rose, Eelco Chaudron,
Flavio Leitner, Florian Westphal, Jiri Pirko, Rashid Khan,
Sushil Kulkarni, Andy Gospodarek, Roi Dayan, Yossi Kuperman,
Or Gerlitz, Rony Efraim, davem@davemloft.net, netdev
Cc: Paul Blakey
In-Reply-To: <1548748926-23822-2-git-send-email-paulb@mellanox.com>
Set or clears (free) the skb tc recirc id extension.
If used with OVS, OVS can clear this recirc id after it reads it.
Signed-off-by: Paul Blakey <paulb@mellanox.com>
---
include/net/tc_act/tc_ct.h | 2 ++
include/uapi/linux/tc_act/tc_ct.h | 2 ++
net/sched/act_ct.c | 18 ++++++++++++++++++
3 files changed, 22 insertions(+)
diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h
index 4a16375..6ea19d8 100644
--- a/include/net/tc_act/tc_ct.h
+++ b/include/net/tc_act/tc_ct.h
@@ -16,6 +16,8 @@ struct tcf_ct {
u32 mark_mask;
u16 zone;
bool commit;
+ uint32_t set_recirc;
+ bool del_recirc;
};
#define to_ct(a) ((struct tcf_ct *)a)
diff --git a/include/uapi/linux/tc_act/tc_ct.h b/include/uapi/linux/tc_act/tc_ct.h
index 6dbd771..2279a9b 100644
--- a/include/uapi/linux/tc_act/tc_ct.h
+++ b/include/uapi/linux/tc_act/tc_ct.h
@@ -14,6 +14,8 @@ struct tc_ct {
__u32 labels_mask[4];
__u32 mark;
__u32 mark_mask;
+ uint32_t set_recirc;
+ bool del_recirc;
bool commit;
};
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 61155cc..7822385 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -117,6 +117,11 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
u_int8_t family;
bool cached;
+ if (ca->del_recirc) {
+ skb_ext_del(skb, SKB_EXT_TC_RECIRC_ID);
+ return ca->tcf_action;
+ }
+
/* The conntrack module expects to be working at L3. */
nh_ofs = skb_network_offset(skb);
skb_pull_rcsum(skb, nh_ofs);
@@ -243,6 +248,15 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
skb_postpush_rcsum(skb, skb->data, nh_ofs);
spin_unlock(&ca->tcf_lock);
+
+ if (ca->set_recirc) {
+ u32 recirc = ca->set_recirc;
+ uint32_t *recircp = skb_ext_add(skb, SKB_EXT_TC_RECIRC_ID);
+
+ if (recircp)
+ *recircp = recirc;
+ }
+
return ca->tcf_action;
drop:
@@ -305,6 +319,8 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
ci->net = net;
ci->commit = parm->commit;
ci->zone = parm->zone;
+ ci->set_recirc = parm->set_recirc;
+ ci->del_recirc = parm->del_recirc;
#if !IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
if (parm->mark_mask) {
NL_SET_ERR_MSG_MOD(extack, "Mark not supported by kernel config");
@@ -378,6 +394,8 @@ static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a,
opt.mark_mask = ci->mark_mask,
memcpy(opt.labels, ci->labels, sizeof(opt.labels));
memcpy(opt.labels_mask, ci->labels_mask, sizeof(opt.labels_mask));
+ opt.set_recirc = ci->set_recirc;
+ opt.del_recirc = ci->del_recirc;
if (nla_put(skb, TCA_CT_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
--
1.8.3.1
^ permalink raw reply related
* [RFC PATCH net-next 5/6 v2] net/sched: em_meta: add match on tc recirc id skb extension
From: Paul Blakey @ 2019-01-29 8:02 UTC (permalink / raw)
To: Guy Shattah, Marcelo Leitner, Aaron Conole, John Hurley,
Simon Horman, Justin Pettit, Gregory Rose, Eelco Chaudron,
Flavio Leitner, Florian Westphal, Jiri Pirko, Rashid Khan,
Sushil Kulkarni, Andy Gospodarek, Roi Dayan, Yossi Kuperman,
Or Gerlitz, Rony Efraim, davem@davemloft.net, netdev
Cc: Paul Blakey
In-Reply-To: <1548748926-23822-2-git-send-email-paulb@mellanox.com>
Signed-off-by: Paul Blakey <paulb@mellanox.com>
---
include/uapi/linux/tc_ematch/tc_em_meta.h | 1 +
net/sched/em_meta.c | 8 ++++++++
2 files changed, 9 insertions(+)
diff --git a/include/uapi/linux/tc_ematch/tc_em_meta.h b/include/uapi/linux/tc_ematch/tc_em_meta.h
index cf30b5b..25a6fb8 100644
--- a/include/uapi/linux/tc_ematch/tc_em_meta.h
+++ b/include/uapi/linux/tc_ematch/tc_em_meta.h
@@ -81,6 +81,7 @@ enum {
TCF_META_ID_SK_WRITE_PENDING,
TCF_META_ID_VLAN_TAG,
TCF_META_ID_RXHASH,
+ TCF_META_ID_TCRECIRC,
__TCF_META_ID_MAX
};
#define TCF_META_ID_MAX (__TCF_META_ID_MAX - 1)
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index d6e9711..041e0b3 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -245,6 +245,13 @@ static inline int var_dev(struct net_device *dev, struct meta_obj *dst)
dst->value = skb->tc_index;
}
+META_COLLECTOR(int_tcrecirc)
+{
+ uint32_t *recirc = skb_ext_find(skb, SKB_EXT_TC_RECIRC_ID);
+
+ dst->value = recirc? *recirc : 0;
+}
+
/**************************************************************************
* Routing
**************************************************************************/
@@ -638,6 +645,7 @@ struct meta_ops {
[META_ID(MACLEN)] = META_FUNC(int_maclen),
[META_ID(NFMARK)] = META_FUNC(int_mark),
[META_ID(TCINDEX)] = META_FUNC(int_tcindex),
+ [META_ID(TCRECIRC)] = META_FUNC(int_tcrecirc),
[META_ID(RTCLASSID)] = META_FUNC(int_rtclassid),
[META_ID(RTIIF)] = META_FUNC(int_rtiif),
[META_ID(SK_FAMILY)] = META_FUNC(int_sk_family),
--
1.8.3.1
^ permalink raw reply related
* Re: [PATCH net-next 00/24] sctp: support SCTP_FUTURE/CURRENT/ALL_ASSOC
From: Neil Horman @ 2019-01-29 8:03 UTC (permalink / raw)
To: David Miller; +Cc: lucien.xin, netdev, linux-sctp, marcelo.leitner
In-Reply-To: <20190128.104109.78917221225730035.davem@davemloft.net>
On Mon, Jan 28, 2019 at 10:41:09AM -0800, David Miller wrote:
> From: Neil Horman <nhorman@tuxdriver.com>
> Date: Mon, 28 Jan 2019 04:44:44 -0500
>
> > Sorry, I'm traveling at the moment, but I'll review this as soon as
> > I'm back on the ground
>
> I'll hold this until you get a chance to review Neil.
>
Thank you. I'm hoping to be someplace I can get through this tonight.
Neil
^ permalink raw reply
* RE: [PATCH] ucc_geth: Reset BQL queue when stopping device
From: Mathias Thore @ 2019-01-29 8:07 UTC (permalink / raw)
To: Li Yang
Cc: Christophe Leroy, netdev@vger.kernel.org,
linuxppc-dev@lists.ozlabs.org, David Gounaris, Joakim Tjernlund
In-Reply-To: <CADRPPNS3M=j4DDMtq_Mb+Xo1M71_yM+_N7n48u-z5pSSzdBoXg@mail.gmail.com>
Is there a scenario where we are clearing the TX ring but don't want to reset the BQL TX queue?
I think it makes sense to keep it in ucc_geth_free_tx since the reason it is needed isn't the timeout per se, but rather the clearing of the TX ring. This way, it will be performed no matter why the driver ends up calling this function.
-----Original Message-----
From: Li Yang [mailto:leoyang.li@nxp.com]
Sent: Monday, 28 January 2019 22:37
To: Mathias Thore <Mathias.Thore@infinera.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>; netdev@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; David Gounaris <David.Gounaris@infinera.com>; Joakim Tjernlund <Joakim.Tjernlund@infinera.com>
Subject: Re: [PATCH] ucc_geth: Reset BQL queue when stopping device
On Mon, Jan 28, 2019 at 8:37 AM Mathias Thore <Mathias.Thore@infinera.com> wrote:
>
> Hi,
>
>
> This is what we observed: there was a storm on the medium so that our controller could not do its TX, resulting in timeout. When timeout occurs, the driver clears all descriptors from the TX queue. The function called in this patch is used to reflect this clearing also in the BQL layer. Without it, the controller would get stuck, unable to perform TX, even several minutes after the storm had ended. Bringing the device down and then up again would solve the problem, but this patch also solves it automatically.
The explanation makes sense. So this should only be required in the timeout scenario instead of other clean up scenarios like device shutdown? If so, it probably it will be better to be done in ucc_geth_timeout_work()?
>
>
> Some other drivers do the same, for example e1000e driver calls netdev_reset_queue in its e1000_clean_tx_ring function. It is possible that other drivers should do the same; I have no way of verifying this.
>
>
> Regards,
>
> Mathias
>
> --
>
>
> From: Christophe Leroy <christophe.leroy@c-s.fr>
> Sent: Monday, January 28, 2019 10:48 AM
> To: Mathias Thore; leoyang.li@nxp.com; netdev@vger.kernel.org;
> linuxppc-dev@lists.ozlabs.org; David Gounaris; Joakim Tjernlund
> Subject: Re: [PATCH] ucc_geth: Reset BQL queue when stopping device
>
>
> CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you recognize the sender and know the content is safe.
>
>
> Hi,
>
> Le 28/01/2019 à 10:07, Mathias Thore a écrit :
> > After a timeout event caused by for example a broadcast storm, when
> > the MAC and PHY are reset, the BQL TX queue needs to be reset as
> > well. Otherwise, the device will exhibit severe performance issues
> > even after the storm has ended.
>
> What are the symptomns ?
>
> Is this reset needed on any network driver in that case, or is it
> something particular for the ucc_geth ?
> For instance, the freescale fs_enet doesn't have that reset. Should it
> have it too ?
>
> Christophe
>
> >
> > Co-authored-by: David Gounaris <david.gounaris@infinera.com>
> > Signed-off-by: Mathias Thore <mathias.thore@infinera.com>
> > ---
> > drivers/net/ethernet/freescale/ucc_geth.c | 2 ++
> > 1 file changed, 2 insertions(+)
> >
> > diff --git a/drivers/net/ethernet/freescale/ucc_geth.c
> > b/drivers/net/ethernet/freescale/ucc_geth.c
> > index c3d539e209ed..eb3e65e8868f 100644
> > --- a/drivers/net/ethernet/freescale/ucc_geth.c
> > +++ b/drivers/net/ethernet/freescale/ucc_geth.c
> > @@ -1879,6 +1879,8 @@ static void ucc_geth_free_tx(struct ucc_geth_private *ugeth)
> > u16 i, j;
> > u8 __iomem *bd;
> >
> > + netdev_reset_queue(ugeth->ndev);
> > +
> > ug_info = ugeth->ug_info;
> > uf_info = &ug_info->uf_info;
> >
> >
>
^ permalink raw reply
* Re: [PATCH net-next] net: udp Allow CHECKSUM_UNNECESSARY packets to do GRO.
From: maowenan @ 2019-01-29 8:08 UTC (permalink / raw)
To: Tom Herbert
Cc: Linux Kernel Network Developers, David S. Miller, Eric Dumazet
In-Reply-To: <CALx6S37iaSDWZTQj5-9obh+ErxLE2dNYTcTO+sGs6R-bQTofRg@mail.gmail.com>
On 2019/1/29 14:24, Tom Herbert wrote:
> On Mon, Jan 28, 2019 at 10:04 PM maowenan <maowenan@huawei.com> wrote:
>>
>>
>>
>> On 2019/1/29 12:01, Tom Herbert wrote:
>>> On Mon, Jan 28, 2019 at 7:00 PM maowenan <maowenan@huawei.com> wrote:
>>>>
>>>> Hi all,
>>>> Do you have any comments about this change?
>>>>
>>>>
>>>> On 2019/1/23 11:33, Mao Wenan wrote:
>>>>> When udp4_gro_receive() get one packet that uh->check=0,
>>>>> skb_gro_checksum_validate_zero_check() will set the
>>>>> skb->ip_summed = CHECKSUM_UNNECESSARY;
>>>>> skb->csum_level = 0;
>>>>> Then udp_gro_receive() will flush the packet which is not CHECKSUM_PARTIAL,
>>>>> It is not our expect, because check=0 in udp header indicates this
>>>>> packet is no need to caculate checksum, we should go further to do GRO.
>>>>>
>>>>> This patch changes the value of csum_cnt according to skb->csum_level.
>>>>> ---
>>>>> include/linux/netdevice.h | 1 +
>>>>> 1 file changed, 1 insertion(+)
>>>>>
>>>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>>>> index 1377d08..9c819f1 100644
>>>>> --- a/include/linux/netdevice.h
>>>>> +++ b/include/linux/netdevice.h
>>>>> @@ -2764,6 +2764,7 @@ static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb)
>>>>> * during GRO. This saves work if we fallback to normal path.
>>>>> */
>>>>> __skb_incr_checksum_unnecessary(skb);
>>>>> + NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
>>>
>>> That doesn't look right. This would be reinitializing the GRO
>>> checksums from the beginning.
>>>
>>>>> }
>>>>> }
>>>>>
>>>>>
>>>>
>>> I assume the code is bailing on this conditional:
>>>
>>> if (NAPI_GRO_CB(skb)->encap_mark ||
>>> (skb->ip_summed != CHECKSUM_PARTIAL &&
>>> NAPI_GRO_CB(skb)->csum_cnt == 0 &&
>>> !NAPI_GRO_CB(skb)->csum_valid) ||
>>> !udp_sk(sk)->gro_receive)
>>> goto out_unlock;
>>>
>>> I am trying to remember why this needs to check csum_cnt. If there was
>>> a csum_cnt for the UDP csum being zero from checksum-unnecessary, it
>>> was consumed by skb_gro_checksum_validate_zero_check in UDP4 GRO
>>> received.
>>
>> We have met the scene about two VMs in different host with vxlan packets, when udp4_gro_receive receives
>> one packet with ip_summed=CHECKSUM_NONE,csum_cnt=0,csum_valid=0,and udp->check=0, then skb_gro_checksum_validate_zero_check()->
>> skb_gro_incr_csum_unnecessary() validate it and set ip_summed=CHECKSUM_UNNECESSARY,csum_level=0, but csum_cnt and csum_valid
>> keep zero value. Then it will be flushed in udp_gro_receive(), the codes as you have showed.
>>
>> so I think it forgets to modify csum_cnt since csum_level is changed in skb_gro_incr_csum_unnecessary()->__skb_incr_checksum_unnecessary().
>>
> Yes, but the csum_level is changing since we've gone beyond the
> checksums initially reported inc checksum-unnecessary. GRO csum_cnt is
> initialized to skb->csum_level + 1 at the start of GRO processing.
>
> If I recall, the rule is that UDP GRO requires at least one non-zero
> checksum to be verified. The idea is that if we end up computing
> packet checksums on the host for inner checksums like TCP during GRO,
> then that's negating the performance benefits of GRO. Had UDP check
> not been zero then we would do checksum unnecessary conversion and so
> csum_valid would be set for the remainded of GRO processing. The
> existing code is following the rule I believe, so this may be working
> as intended.
Do you have any suggestion if I need do GRO as udp->check is zero?
My previous modification which works fine as below:
if (NAPI_GRO_CB(skb)->encap_mark ||
(skb->ip_summed != CHECKSUM_PARTIAL &&
+ skb->ip_summed != CHECKSUM_UNNECESSARY &&
NAPI_GRO_CB(skb)->csum_cnt == 0 &&
!NAPI_GRO_CB(skb)->csum_valid) ||
!udp_sk(sk)->gro_receive)
goto out_unlock;
>
> Tom
>
>>>
>>> .
>>>
>>
>
> .
>
^ permalink raw reply
* Re: [PATCH 06/33] netfilter: nf_tables: Support RULE_ID reference in new rule
From: Florian Westphal @ 2019-01-29 8:09 UTC (permalink / raw)
To: Cong Wang
Cc: Pablo Neira Ayuso, NetFilter, David Miller,
Linux Kernel Network Developers
In-Reply-To: <CAM_iQpWg0s8yaJskGyTko=S0GmfbXa4T--WTHA9gJ8xk0LAikQ@mail.gmail.com>
Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > Note that NFTA_RULE_POSITION takes precedence over
> > NFTA_RULE_POSITION_ID, so if the former is present, the latter is
> > ignored.
>
> It looks like you forgot to add NFTA_RULE_POSITION_ID into
> nft_rule_policy[]?
Classic...
Thanks for spotting this, I'll send a patch.
^ permalink raw reply
* Re: [PATCH v4 bpf-next 1/9] bpf: introduce bpf_spin_lock
From: Peter Zijlstra @ 2019-01-29 8:59 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Alexei Starovoitov, davem, daniel, jakub.kicinski, netdev,
kernel-team, mingo, will.deacon, Paul McKenney, jannh
In-Reply-To: <20190128213710.vjxnc2eq5rsisgfx@ast-mbp>
On Mon, Jan 28, 2019 at 01:37:12PM -0800, Alexei Starovoitov wrote:
> On Mon, Jan 28, 2019 at 09:43:10AM +0100, Peter Zijlstra wrote:
> > Isn't that still broken? AFAIU networking progs can happen in task
> > context (TX) and SoftIRQ context (RX), which can nest.
>
> Sure. sendmsg side of networking can be interrupted by napi receive.
> Both can have bpf progs attached at different points, but napi won't run
> when bpf prog is running, because bpf prog disables preemption.
Disabling preemption is not sufficient, it needs to have done
local_bh_disable(), which isn't unlikely given this is all networking
code.
Anyway, if you're sure that's all good, I'll trust you on that. It's
been a very _very_ long time since I looked at the networking code.
^ permalink raw reply
* Re: [PATCH net-next v2 1/2] net: dsa: mv88e6xxx: Save switch rules
From: Miquel Raynal @ 2019-01-29 9:01 UTC (permalink / raw)
To: Andrew Lunn
Cc: Florian Fainelli, Vivien Didelot, David S. Miller, netdev,
linux-kernel, Thomas Petazzoni, Gregory Clement, Antoine Tenart,
Maxime Chevallier, Nadav Haklai
In-Reply-To: <20190128174246.GD28759@lunn.ch>
Hi Andrew,
Andrew Lunn <andrew@lunn.ch> wrote on Mon, 28 Jan 2019 18:42:46 +0100:
> On Mon, Jan 28, 2019 at 04:57:49PM +0100, Miquel Raynal wrote:
> > Hi Andrew,
> >
> > Thanks for helping!
> >
> > Andrew Lunn <andrew@lunn.ch> wrote on Mon, 28 Jan 2019 15:44:17 +0100:
> >
> > > > I don't see where VLAN and bridge information are cached, can you point
> > > > me to the relevant locations?
> > >
> > > Miquèl
> > >
> > > The bridge should have all that information. You need to ask it to
> > > enumerate the current configuration and replay it to the switch.
> > >
> > > There might be something in the Mellanox driver you can copy? But i've
> > > not looked, i'm just guessing.
> >
> > I am still searching but so far I did not find a mechanism reading the
> > configuration of the bridge out of a 'net' object. Indeed there are
> > multiple lists with the configuration but they are all 'mellanox'
> > objects, they do not belong to the core.
>
> Hi Miquèl
>
> Look at how iproute2 works. How does the bridge command enumerate the
> fdb and mdb's? How does bridge vlan show work? bridge link show? See
> if you can use this infrastructure within the kernel.
Thanks!
>
> > > We also need to think about how we are going to test this. There is a
> > > lot of state information in a switch. So we are going to need some
> > > pretty good tests to show we have recreated all of it.
> >
> > My understanding of all this is rather short, until know I used what
> > you proposed in the v1 of this series but I am all ears if I need to
> > add anything to my test list.
>
> What you probably need is a generic DSA test suite, with a number of
> hardware devices, with different generations of mv88e6xxx devices, and
> ideally different sf2, kzs, etc switches. Setup a configuration and
> test is works correctly. Suspend, resume, and test is still works. And
> you probably need to go through a number of cycles of suspend/resume.
> And you are going to need to maintain that for a number of years,
> testing every release, to see what breaks as we add new features and
> new devices.
I am very sorry but I kind of disagree with the above proposal. Usually
contributors try to write the best solution with the help of the
community, test on the hardware they have in hand and propose the
changes. I cannot bond on a 10-years involvement in testing several
switches over the releases.
Today, there is no S2RAM support for switches. First, I proposed to add
suspend/resume callbacks to the mv88e6xxx driver - just enough to avoid
crashing the kernel. It was reported that the configuration was lost so
I wrote a rule-saving mechanism to replay the rules at resume. I was
told that this mechanism would best fit in the DSA core directly. I am
open to do that, I don't think it is that much work. But it is also
required that I use as less memory as possible. This is going to take
more time but I think I can do it as well. At least for a minimal set of
configuration.
Then, why not let other people improve things as they need? IIUC Switch
S2RAM does not work at all, I may try to improve the situation but I
do not have the abilities nor the time to do it exhaustively for every
piece of hardware and every situation.
>
> There also needs to be some though put into what happens when the
> network changes while the switch is suspended. A port looses its link,
> a port comes up, an SFP module is ejected, and SFP module is
> inserted. The PTP grand master moves, etc. I hope the usual mechanisms
> just work, but it all needs testing.
Is this really specific to switches? I know it is an issue and I
understand you would prefer not to support S2RAM at all rather than
addressing part of it, but isn't it better to support the simplest
situation first, than supporting nothing at all?
Thanks Andrew for your guidance and help anyway,
Miquèl
^ permalink raw reply
* Re: Request for stable backport: stmmac: Use correct values in TQS/RQS fields
From: Niklas Cassel @ 2019-01-29 9:09 UTC (permalink / raw)
To: davem, stable; +Cc: netdev, vinod.koul
In-Reply-To: <20181219095516.GA7747@centauri.lan>
Adding stable since I see Greg's Sign-off-by on recent backports
to this driver.
On Wed, Dec 19, 2018 at 10:55:16AM +0100, Niklas Cassel wrote:
> Hello David,
>
> I can observe a netdev watchdog timeout on kernel 4.14.78 when using stmmac
> with multiple tx queues.
>
> Backporting the following commit:
>
> commit 52a76235d0c4dd259cd0df503afed4757c04ba1d
> Author: Jose Abreu <Jose.Abreu@synopsys.com>
> Date: Fri Oct 13 10:58:36 2017 +0100
>
> net: stmmac: Use correct values in TQS/RQS fields
>
> Currently we are using all the available fifo size in RQS and
> TQS fields. This will not work correctly in multi-queues IP's
> because total fifo size must be splitted to the enabled queues.
>
> Correct this by computing the available fifo size per queue and
> setting the right value in TQS and RQS fields.
>
> Signed-off-by: Jose Abreu <joabreu@synopsys.com>
> Cc: David S. Miller <davem@davemloft.net>
> Cc: Joao Pinto <jpinto@synopsys.com>
> Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
> Cc: Alexandre Torgue <alexandre.torgue@st.com>
> Signed-off-by: David S. Miller <davem@davemloft.net>
>
>
> resolves the issue.
>
> The fix was first included in v4.15
> $ git tag --contains 52a76235d0c4dd259cd0df503afed4757c04ba1d
> v4.15
> v4.15-rc1
> v4.15-rc2
>
> Could you please queue it up for 4.14 stable?
>
>
> Kind regards,
> Niklas
^ permalink raw reply
* Re: bpf memory model. Was: [PATCH v4 bpf-next 1/9] bpf: introduce bpf_spin_lock
From: Peter Zijlstra @ 2019-01-29 9:16 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Alexei Starovoitov, davem, daniel, jakub.kicinski, netdev,
kernel-team, mingo, will.deacon, Paul McKenney, jannh
In-Reply-To: <20190128215623.6eqskzhklydhympa@ast-mbp>
On Mon, Jan 28, 2019 at 01:56:24PM -0800, Alexei Starovoitov wrote:
> On Mon, Jan 28, 2019 at 10:24:08AM +0100, Peter Zijlstra wrote:
> > Ah, but the loop won't be in the BPF program itself. The BPF program
> > would only have had the BPF_SPIN_LOCK instruction, the JIT them emits
> > code similar to queued_spin_lock()/queued_spin_unlock() (or calls to
> > out-of-line versions of them).
>
> As I said we considered exactly that and such approach has a lot of downsides
> comparing with the helper approach.
> Pretty much every time new feature is added we're evaluating whether it
> should be new instruction or new helper. 99% of the time we go with new helper.
Ah; it seems I'm confused on helper vs instruction. As in, I've no idea
what a helper is.
> > There isn't anything that mandates the JIT uses the exact same locking
> > routines the interpreter does, is there?
>
> sure. This bpf_spin_lock() helper can be optimized whichever way the kernel wants.
> Like bpf_map_lookup_elem() call is _inlined_ by the verifier for certain map types.
> JITs don't even need to do anything. It looks like function call from bpf prog
> point of view, but in JITed code it is a sequence of native instructions.
>
> Say tomorrow we find out that bpf_prog->bpf_spin_lock()->queued_spin_lock()
> takes too much time then we can inline fast path of queued_spin_lock
> directly into bpf prog and save function call cost.
OK, so then the JIT can optimize helpers. Would it not make sense to
have the simple test-and-set spinlock in the generic code and have the
JITs use arch_spinlock_t where appropriate?
^ permalink raw reply
* general protection fault in __xfrm_policy_bysel_ctx
From: syzbot @ 2019-01-29 9:22 UTC (permalink / raw)
To: davem, herbert, linux-kernel, netdev, steffen.klassert,
syzkaller-bugs
Hello,
syzbot found the following crash on:
HEAD commit: 085c4c7dd2b6 net: lmc: remove -I. header search path
git tree: net-next
console output: https://syzkaller.appspot.com/x/log.txt?x=12347128c00000
kernel config: https://syzkaller.appspot.com/x/.config?x=505743eba4e4f68
dashboard link: https://syzkaller.appspot.com/bug?extid=e6e1fe9148cffa18cf97
compiler: gcc (GCC) 9.0.0 20181231 (experimental)
Unfortunately, I don't have any reproducer for this crash yet.
IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+e6e1fe9148cffa18cf97@syzkaller.appspotmail.com
netlink: 104 bytes leftover after parsing attributes in process
`syz-executor0'.
device sit0 left promiscuous mode
kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] PREEMPT SMP KASAN
netlink: 'syz-executor1': attribute type 2 has an invalid length.
CPU: 1 PID: 6717 Comm: syz-executor0 Not tainted 5.0.0-rc3+ #24
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
RIP: 0010:__xfrm_policy_bysel_ctx.constprop.0+0xe4/0x470
net/xfrm/xfrm_policy.c:1618
Code: 00 e8 50 40 c3 fa 49 83 ec 08 0f 84 df 02 00 00 e8 41 40 c3 fa 49 8d
bc 24 19 02 00 00 48 89 f8 48 89 fa 48 c1 e8 03 83 e2 07 <42> 0f b6 04 38
38 d0 7f 08 84 c0 0f 85 eb 02 00 00 45 0f b6 ac 24
kobject: 'loop4' (00000000b3bda7f3): kobject_uevent_env
RSP: 0018:ffff888065bff0e8 EFLAGS: 00010206
RAX: 0000005800000042 RBX: 0000000000000000 RCX: ffffc90005deb000
RDX: 0000000000000003 RSI: ffffffff86bebeaf RDI: 000002c000000213
RBP: ffff888065bff148 R08: ffff8880568ce080 R09: 0000000000000000
R10: ffff8880568ce080 R11: 0000000000000000 R12: 000002bffffffffa
R13: 00000000000000ff R14: ffff88806bae3290 R15: dffffc0000000000
kobject: 'loop4' (00000000b3bda7f3): fill_kobj_path: path
= '/devices/virtual/block/loop4'
FS: 00007f39763ec700(0000) GS:ffff8880ae700000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000000000073c000 CR3: 0000000089628000 CR4: 00000000001406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
xfrm_policy_bysel_ctx+0x944/0x1020 net/xfrm/xfrm_policy.c:1664
xfrm_get_policy+0x67b/0x1160 net/xfrm/xfrm_user.c:1887
xfrm_user_rcv_msg+0x458/0x8d0 net/xfrm/xfrm_user.c:2663
netlink_rcv_skb+0x17d/0x410 net/netlink/af_netlink.c:2485
xfrm_netlink_rcv+0x70/0x90 net/xfrm/xfrm_user.c:2671
netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline]
netlink_unicast+0x574/0x770 net/netlink/af_netlink.c:1336
netlink_sendmsg+0xa05/0xf90 net/netlink/af_netlink.c:1925
sock_sendmsg_nosec net/socket.c:621 [inline]
sock_sendmsg+0xdd/0x130 net/socket.c:631
___sys_sendmsg+0x7ec/0x910 net/socket.c:2116
__sys_sendmsg+0x112/0x270 net/socket.c:2154
__do_sys_sendmsg net/socket.c:2163 [inline]
__se_sys_sendmsg net/socket.c:2161 [inline]
__x64_sys_sendmsg+0x78/0xb0 net/socket.c:2161
do_syscall_64+0x1a3/0x800 arch/x86/entry/common.c:290
entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x458099
Code: 6d b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7
48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff
ff 0f 83 3b b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007f39763ebc78 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000458099
RDX: 0000000000000000 RSI: 000000002014f000 RDI: 0000000000000003
RBP: 000000000073bf00 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00007f39763ec6d4
R13: 00000000004c56af R14: 00000000004d9420 R15: 00000000ffffffff
Modules linked in:
---[ end trace 6bda763f27a7462f ]---
RIP: 0010:__xfrm_policy_bysel_ctx.constprop.0+0xe4/0x470
net/xfrm/xfrm_policy.c:1618
Code: 00 e8 50 40 c3 fa 49 83 ec 08 0f 84 df 02 00 00 e8 41 40 c3 fa 49 8d
bc 24 19 02 00 00 48 89 f8 48 89 fa 48 c1 e8 03 83 e2 07 <42> 0f b6 04 38
38 d0 7f 08 84 c0 0f 85 eb 02 00 00 45 0f b6 ac 24
RSP: 0018:ffff888065bff0e8 EFLAGS: 00010206
RAX: 0000005800000042 RBX: 0000000000000000 RCX: ffffc90005deb000
RDX: 0000000000000003 RSI: ffffffff86bebeaf RDI: 000002c000000213
RBP: ffff888065bff148 R08: ffff8880568ce080 R09: 0000000000000000
R10: ffff8880568ce080 R11: 0000000000000000 R12: 000002bffffffffa
R13: 00000000000000ff R14: ffff88806bae3290 R15: dffffc0000000000
FS: 00007f39763ec700(0000) GS:ffff8880ae700000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000000000073c000 CR3: 0000000089628000 CR4: 00000000001406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.
syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#bug-status-tracking for how to communicate with
syzbot.
^ permalink raw reply
* INFO: task hung in vhost_init_device_iotlb
From: syzbot @ 2019-01-29 9:22 UTC (permalink / raw)
To: jasowang, kvm, linux-kernel, mst, netdev, syzkaller-bugs,
virtualization
Hello,
syzbot found the following crash on:
HEAD commit: 983542434e6b Merge tag 'edac_fix_for_5.0' of git://git.ker..
git tree: upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=17476498c00000
kernel config: https://syzkaller.appspot.com/x/.config?x=505743eba4e4f68
dashboard link: https://syzkaller.appspot.com/bug?extid=40e28a8bd59d10ed0c42
compiler: gcc (GCC) 9.0.0 20181231 (experimental)
Unfortunately, I don't have any reproducer for this crash yet.
IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+40e28a8bd59d10ed0c42@syzkaller.appspotmail.com
protocol 88fb is buggy, dev hsr_slave_1
protocol 88fb is buggy, dev hsr_slave_0
protocol 88fb is buggy, dev hsr_slave_1
protocol 88fb is buggy, dev hsr_slave_0
protocol 88fb is buggy, dev hsr_slave_1
INFO: task syz-executor5:9417 blocked for more than 140 seconds.
Not tainted 5.0.0-rc3+ #48
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
syz-executor5 D27576 9417 8469 0x00000004
Call Trace:
context_switch kernel/sched/core.c:2831 [inline]
__schedule+0x897/0x1e60 kernel/sched/core.c:3472
schedule+0xfe/0x350 kernel/sched/core.c:3516
protocol 88fb is buggy, dev hsr_slave_0
protocol 88fb is buggy, dev hsr_slave_1
schedule_preempt_disabled+0x13/0x20 kernel/sched/core.c:3574
__mutex_lock_common kernel/locking/mutex.c:1002 [inline]
__mutex_lock+0xa3b/0x1670 kernel/locking/mutex.c:1072
mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:1087
vhost_init_device_iotlb+0x124/0x280 drivers/vhost/vhost.c:1606
vhost_net_set_features drivers/vhost/net.c:1674 [inline]
vhost_net_ioctl+0x1282/0x1c00 drivers/vhost/net.c:1739
vfs_ioctl fs/ioctl.c:46 [inline]
file_ioctl fs/ioctl.c:509 [inline]
do_vfs_ioctl+0x107b/0x17d0 fs/ioctl.c:696
ksys_ioctl+0xab/0xd0 fs/ioctl.c:713
__do_sys_ioctl fs/ioctl.c:720 [inline]
__se_sys_ioctl fs/ioctl.c:718 [inline]
__x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:718
do_syscall_64+0x1a3/0x800 arch/x86/entry/common.c:290
protocol 88fb is buggy, dev hsr_slave_0
protocol 88fb is buggy, dev hsr_slave_1
entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x458099
Code: Bad RIP value.
RSP: 002b:00007efd7ca9bc78 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000458099
RDX: 0000000020000080 RSI: 000000004008af00 RDI: 0000000000000003
RBP: 000000000073bfa0 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00007efd7ca9c6d4
R13: 00000000004c295b R14: 00000000004d5280 R15: 00000000ffffffff
INFO: task syz-executor5:9418 blocked for more than 140 seconds.
Not tainted 5.0.0-rc3+ #48
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
syz-executor5 D27800 9418 8469 0x00000004
Call Trace:
context_switch kernel/sched/core.c:2831 [inline]
__schedule+0x897/0x1e60 kernel/sched/core.c:3472
schedule+0xfe/0x350 kernel/sched/core.c:3516
schedule_preempt_disabled+0x13/0x20 kernel/sched/core.c:3574
__mutex_lock_common kernel/locking/mutex.c:1002 [inline]
__mutex_lock+0xa3b/0x1670 kernel/locking/mutex.c:1072
mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:1087
vhost_net_set_owner drivers/vhost/net.c:1697 [inline]
vhost_net_ioctl+0x426/0x1c00 drivers/vhost/net.c:1754
vfs_ioctl fs/ioctl.c:46 [inline]
file_ioctl fs/ioctl.c:509 [inline]
do_vfs_ioctl+0x107b/0x17d0 fs/ioctl.c:696
ksys_ioctl+0xab/0xd0 fs/ioctl.c:713
__do_sys_ioctl fs/ioctl.c:720 [inline]
__se_sys_ioctl fs/ioctl.c:718 [inline]
__x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:718
do_syscall_64+0x1a3/0x800 arch/x86/entry/common.c:290
entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x458099
Code: Bad RIP value.
RSP: 002b:00007efd7ca7ac78 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000458099
RDX: 0000000000000000 RSI: 000040010000af01 RDI: 0000000000000003
RBP: 000000000073c040 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00007efd7ca7b6d4
R13: 00000000004c33a4 R14: 00000000004d5e80 R15: 00000000ffffffff
Showing all locks held in the system:
1 lock held by khungtaskd/1040:
#0: 00000000b7479fbe (rcu_read_lock){....}, at:
debug_show_all_locks+0xc6/0x41d kernel/locking/lockdep.c:4389
1 lock held by rsyslogd/8285:
#0: 000000006d9ccf7d (&f->f_pos_lock){+.+.}, at: __fdget_pos+0x1b3/0x1f0
fs/file.c:795
2 locks held by getty/8406:
#0: 00000000052e805b (&tty->ldisc_sem){++++}, at:
ldsem_down_read+0x33/0x40 drivers/tty/tty_ldsem.c:341
#1: 00000000b90dc267 (&ldata->atomic_read_lock){+.+.}, at:
n_tty_read+0x30a/0x1eb0 drivers/tty/n_tty.c:2154
2 locks held by getty/8407:
#0: 000000009fdef632 (&tty->ldisc_sem){++++}, at:
ldsem_down_read+0x33/0x40 drivers/tty/tty_ldsem.c:341
#1: 00000000ff2b1a16 (&ldata->atomic_read_lock){+.+.}, at:
n_tty_read+0x30a/0x1eb0 drivers/tty/n_tty.c:2154
2 locks held by getty/8408:
#0: 00000000e48a8e78 (&tty->ldisc_sem){++++}, at:
ldsem_down_read+0x33/0x40 drivers/tty/tty_ldsem.c:341
#1: 000000008fcf2060 (&ldata->atomic_read_lock){+.+.}, at:
n_tty_read+0x30a/0x1eb0 drivers/tty/n_tty.c:2154
2 locks held by getty/8409:
#0: 0000000063f3f4f5 (&tty->ldisc_sem){++++}, at:
ldsem_down_read+0x33/0x40 drivers/tty/tty_ldsem.c:341
#1: 000000001dc973ca (&ldata->atomic_read_lock){+.+.}, at:
n_tty_read+0x30a/0x1eb0 drivers/tty/n_tty.c:2154
2 locks held by getty/8410:
#0: 00000000f3c14150 (&tty->ldisc_sem){++++}, at:
ldsem_down_read+0x33/0x40 drivers/tty/tty_ldsem.c:341
#1: 000000007987cec5 (&ldata->atomic_read_lock){+.+.}, at:
n_tty_read+0x30a/0x1eb0 drivers/tty/n_tty.c:2154
2 locks held by getty/8411:
#0: 00000000d04f4305 (&tty->ldisc_sem){++++}, at:
ldsem_down_read+0x33/0x40 drivers/tty/tty_ldsem.c:341
#1: 000000003f47e3a6 (&ldata->atomic_read_lock){+.+.}, at:
n_tty_read+0x30a/0x1eb0 drivers/tty/n_tty.c:2154
2 locks held by getty/8412:
#0: 0000000082430560 (&tty->ldisc_sem){++++}, at:
ldsem_down_read+0x33/0x40 drivers/tty/tty_ldsem.c:341
#1: 0000000094609d81 (&ldata->atomic_read_lock){+.+.}, at:
n_tty_read+0x30a/0x1eb0 drivers/tty/n_tty.c:2154
2 locks held by syz-executor5/9417:
#0: 0000000020a0f0a1 (&dev->mutex#4){+.+.}, at: vhost_net_set_features
drivers/vhost/net.c:1668 [inline]
#0: 0000000020a0f0a1 (&dev->mutex#4){+.+.}, at:
vhost_net_ioctl+0x204/0x1c00 drivers/vhost/net.c:1739
#1: 00000000a7b5872b (&vq->mutex){+.+.}, at:
vhost_init_device_iotlb+0x124/0x280 drivers/vhost/vhost.c:1606
1 lock held by syz-executor5/9418:
#0: 0000000020a0f0a1 (&dev->mutex#4){+.+.}, at: vhost_net_set_owner
drivers/vhost/net.c:1697 [inline]
#0: 0000000020a0f0a1 (&dev->mutex#4){+.+.}, at:
vhost_net_ioctl+0x426/0x1c00 drivers/vhost/net.c:1754
1 lock held by vhost-9408/9413:
=============================================
NMI backtrace for cpu 0
CPU: 0 PID: 1040 Comm: khungtaskd Not tainted 5.0.0-rc3+ #48
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x1db/0x2d0 lib/dump_stack.c:113
nmi_cpu_backtrace.cold+0x63/0xa4 lib/nmi_backtrace.c:101
nmi_trigger_cpumask_backtrace+0x1be/0x236 lib/nmi_backtrace.c:62
arch_trigger_cpumask_backtrace+0x14/0x20 arch/x86/kernel/apic/hw_nmi.c:38
trigger_all_cpu_backtrace include/linux/nmi.h:146 [inline]
check_hung_uninterruptible_tasks kernel/hung_task.c:203 [inline]
watchdog+0xbbb/0x1170 kernel/hung_task.c:287
kthread+0x357/0x430 kernel/kthread.c:246
ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:352
Sending NMI from CPU 0 to CPUs 1:
NMI backtrace for cpu 1
CPU: 1 PID: 7 Comm: kworker/u4:0 Not tainted 5.0.0-rc3+ #48
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
Workqueue: bat_events batadv_nc_worker
RIP: 0010:__sanitizer_cov_trace_const_cmp1+0x15/0x20 kernel/kcov.c:174
Code: 00 48 89 e5 48 8b 4d 08 e8 18 ff ff ff 5d c3 66 0f 1f 44 00 00 55 40
0f b6 d6 40 0f b6 f7 bf 01 00 00 00 48 89 e5 48 8b 4d 08 <e8> f6 fe ff ff
5d c3 0f 1f 40 00 55 0f b7 d6 0f b7 f7 bf 03 00 00
RSP: 0018:ffff8880a947f8a8 EFLAGS: 00000246
RAX: ffff8880a94701c0 RBX: ffff8880a05efc40 RCX: ffffffff87d36c97
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000001
RBP: ffff8880a947f8a8 R08: ffff8880a94701c0 R09: ffffed1015ce5b90
R10: ffffed1015ce5b8f R11: ffff8880ae72dc7b R12: 0000000000000000
R13: 0000000000000000 R14: 000000000000019e R15: dffffc0000000000
FS: 0000000000000000(0000) GS:ffff8880ae700000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffffffff600400 CR3: 00000000a005a000 CR4: 00000000001426e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
rcu_read_unlock include/linux/rcupdate.h:657 [inline]
batadv_nc_purge_orig_hash net/batman-adv/network-coding.c:423 [inline]
batadv_nc_worker+0x2f7/0x920 net/batman-adv/network-coding.c:730
process_one_work+0xd0c/0x1ce0 kernel/workqueue.c:2153
worker_thread+0x143/0x14a0 kernel/workqueue.c:2296
kthread+0x357/0x430 kernel/kthread.c:246
ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:352
---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.
syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#bug-status-tracking for how to communicate with
syzbot.
^ permalink raw reply
* Re: [PATCH net 1/1] bonding: fix PACKET_ORIGDEV regression on bonding masters
From: Maciej Żenczykowski @ 2019-01-29 9:39 UTC (permalink / raw)
To: Michal Soltys
Cc: David Miller, Linux NetDev, Jay Vosburgh, Vincent Bernat,
Mahesh Bandewar, Chonggang Li
In-Reply-To: <20f00da1-74a3-90a2-6759-186f6b415685@ziu.info>
Sounds good to me.
On Mon, Jan 28, 2019 at 5:47 PM Michal Soltys <soltys@ziu.info> wrote:
>
> On 19/01/18 07:58, Maciej Żenczykowski wrote:
> > I'm not sure there's a truly good answer.
> >
> > Also note, that there's subtle differences between af_packet sockets
> > tied to ALL protocols vs tied to specific protocols (vs none/0 of
> > course).
> > However ETH_P_ALL protocol raw sockets need to be avoided if at all
> > possible due to perf impact.
> > Certainly we don't want any of these created outside of debugging.
> >
> > I believe we only cared about the utterly unbound to any device case
> > working (ie. delivering all LLDP packets all nics are receiving).
> > So that a single simple daemon could collect and export all the link
> > information.
> > [btw. now that I know about the PACKET_ORIGDEV option, I do -
> > unsurprisingly - see our daemon sets it]
> > We really didn't want the complexity of having to bind to individual
> > interfaces and having to try to dynamically adjust the set of raw
> > sockets.
> > (not to mention that less raw sockets is always good, also it's never
> > actually entirely clear which interfaces would need to be monitored
> > due to the preponderance of tunnels/macvlan/ipvlan/veth/dummy and
> > other virtual interface types)
> >
> > I think from a logic perspective:
> >
> > - if you bind to slave (physical interface) you should see *all*
> > packets on that interface,
> > regardless of whether the slave is active or not, and whether the
> > packet is link-local or not.
> > ie. this should show you exactly what nic is receiving (& sending:
> > PACKET_OUTGOING)
> > I should see *exactly* one copy of any packet.
> > This mode has to be useful for debugging.
> > This includes seeing packets which aren't even destined for me if nic
> > receives them.
> > (ie. destined to unicast/multicast mac we'd filter out: PACKET_OTHERHOST)
> > Although by itself this socket's existence shouldn't affect nic mac
> > filters and promiscuous mode
> > (I believe there are all sorts of various additional socket options to
> > change those).
> >
> > - if you don't bind to an interface then I think I'd expect to see
> > packets delivered to stack + link local packets
> > - ie. should not see non-link-local packets discarded due to being
> > on inactive slaves
> > similarly to how I should not see packets filtered out by virtue
> > of mac not matching interface mac filters
> > - IMHO you should see *all* link local packets arriving at system
> > (ie. the original change's purpose)
> > [and I think, but am not certain, I shouldn't need to use any
> > socket options to register the link local macs,
> > although glancing at code I do think the daemon uses
> > PACKET_ADD_MEMBERSHIP to register lldp mac,
> > so perhaps filtering should apply as normal?]
> > - with PACKET_ORIGDEV they should always show up as from the
> > physical interfaces (ie. slaves)
> > - without PACKET_ORIGDEV:
> > - non link local packets should show up as from bonding/team master
>
> > - link local packets on active slaves could show up on master or
> > slave (probably master is required ifirc some earlier fix???)
>
> As far as I can see, they (LLCs coming on active slave) will show via
> master as from master (w/o the socket option) or as from slave (with).
> Think Vincent's tests confirmed it.
>
> > - link local packets on inactive slaves should show up on slave -
> > and definitely not on master
> > - I don't think I should ever see any PACKET_OTHERHOST packets
> >
> > - if you bind to master you should see packets from active slaves (ie.
> > those that will get delivered to stack)
> > - clearly you should not see non-link-local inactive slave packets
> > (they'll be dropped)
>
> > - behaviour wrt. link local packets is more dicey... (I believe
> > somewhere in these threads patches there was some description of what
> > standard requires, but I don't off the top of my head remember)
>
> They must be seen, otherwise bonds attached to a linux bridge (I assume
> enslaving an interface to a bridge essentially counts as bind) will be
> blind to them (among those - e.g. to spanning tree information - this
> was what originally caused issues for me last year). Even the bridge
> code goes to extra length to not carelessly consume stp packets, if it's
> not actively participating in stp (of any kind). Aside that, certain
> [recent] bridge features like group_fwd_mask would be non-functional on
> bond ports as well.
>
> As for standard (the one I quoted in the oldest thread) expected the
> link-local packet to be both readable via master and slaves depending on
> need (though it didn't go into exact gory details). Your current patch I
> think cover all possible cases quite greacefully.
>
> > - for an active-backup bond it would seem logical to see only active
> > links link local
> > - for a multiple-active aggregate bond I'd be fine with seeing none,
> > or all active slaves link local packets - I guess even though the
> > later is confusing it makes sense
> > - it might be okay to see link-local inactive slave packets, but I
> > think I'd prefer not to (could be configurable though) - this would
> > seem confusing/wrong to me.
> > - and again I don't think I should see any PACKET_OTHERHOST packets here...
> > - PACKET_ORIGDEV as above...
> >
> > I gave the above a fair amount of thought... but I'm not guaranteeing
> > I didn't make typos, or write something utterly stupid or
> > unimplementable or not how stuff should work for other reasons...
> > Comments welcome.
> >
> > I think this continues to be in line with my proposal from earlier in
> > the thread?
>
> (sorry for late reply)
>
> Yes, pretty much spot on. With some confirmation comments above.
>
> I did bridging tests yesterday (2 LACP bonds attached via separate
> switches treating linux bridge as a shared segment in mstp scenario) and
> all is working fine on my side as well.
>
> If everyone agrees with the proposed code, I will submit v2 patch with
> added comment explaining basic logic (or you could submit if you prefer).
>
> >
> > On Thu, Jan 17, 2019 at 4:27 PM Michal Soltys <soltys@ziu.info> wrote:
> >>
> >> On 19/01/14 03:01, Maciej Żenczykowski wrote:
> >> > So I don't remember the specifics...
> >> >
> >> > (note I'm writing this all from memory without looking it up/testing
> >> > it - I may be utterly wrong or dreaming)
> >> >
> >> > But I seem to recall that the core problem we were trying to solve was
> >> > that a daemon listening
> >> > on an AF_PACKET ethertype 88CC [LLDP] socket not bound to any device
> >> > would not receive LLDP packets
> >> > arriving on inactive bond slaves (either active-backup or lag).
> >> >
> >> > [inactive = link/carrier up, but not part of active aggregator]
> >> >
> >> > This made monitoring for miscabling harder (IFIRC the only non kernel
> >> > fix was to get the daemon to create
> >> > a separate AF_PACKET/88CC socket bound to every physical interface in
> >> > the system, or monitor for
> >> > inactive slaves and add extra packet sockets as needed).
> >> >
> >> > They would get re-parented to the master and then since the slave was
> >> > inactive they would be considered RX_HANDLER_EXACT match only and not
> >> > match the * interface.
> >> >
> >> > Honestly I wasn't aware of PACKET_ORIGDEV, although I don't think it
> >> > helps in this case - AFAICR the packets never made it to the packet
> >> > socket.
> >> >
> >> > Perhaps going from:
> >> > /* don't change skb->dev for link-local packets */
> >> > if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) return RX_HANDLER_PASS;
> >> > if (bond_should_deliver_exact_match(skb, slave, bond)) return
> >> > RX_HANDLER_EXACT;
> >> >
> >> > to something more like:
> >> > if (bond_should_deliver_exact_match(skb, slave, bond)) {
> >> > /* don't change skb->dev for link-local packets on inactive slaves */
> >> > if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) return RX_HANDLER_PASS;
> >> > return RX_HANDLER_EXACT;
> >> > }
> >>
> >> Having checked the code (if I get the flow correctly), one
> >> thing/question - currently with Mahesh's fixes, not bound LLDP listener
> >> will receive all packets - both from active and inactive slaves directly
> >> (as the check for suppression is done after the link-local check).
> >>
> >> The version above will do the suppression check first - so all inactive
> >> slaves - excluding non-multi/non-broad ALB - will pass it and return
> >> RX_HANDLER_PASS if the packet is link-local. So those will be available
> >> w/o binding, but active slaves' packets will be available via master
> >> device (but with working PACKET_ORIGDEV now - so slave device can be
> >> retrieved easily). This is fine in your scenario I presume ?
> >>
> >
>
^ permalink raw reply
* Re: Request for stable backport: stmmac: Use correct values in TQS/RQS fields
From: Greg KH @ 2019-01-29 9:41 UTC (permalink / raw)
To: Niklas Cassel; +Cc: davem, stable, netdev, vinod.koul
In-Reply-To: <20190129090955.GA16354@centauri.lan>
On Tue, Jan 29, 2019 at 10:09:55AM +0100, Niklas Cassel wrote:
> Adding stable since I see Greg's Sign-off-by on recent backports
> to this driver.
>
>
> On Wed, Dec 19, 2018 at 10:55:16AM +0100, Niklas Cassel wrote:
> > Hello David,
> >
> > I can observe a netdev watchdog timeout on kernel 4.14.78 when using stmmac
> > with multiple tx queues.
> >
> > Backporting the following commit:
> >
> > commit 52a76235d0c4dd259cd0df503afed4757c04ba1d
> > Author: Jose Abreu <Jose.Abreu@synopsys.com>
> > Date: Fri Oct 13 10:58:36 2017 +0100
> >
> > net: stmmac: Use correct values in TQS/RQS fields
> >
> > Currently we are using all the available fifo size in RQS and
> > TQS fields. This will not work correctly in multi-queues IP's
> > because total fifo size must be splitted to the enabled queues.
> >
> > Correct this by computing the available fifo size per queue and
> > setting the right value in TQS and RQS fields.
> >
> > Signed-off-by: Jose Abreu <joabreu@synopsys.com>
> > Cc: David S. Miller <davem@davemloft.net>
> > Cc: Joao Pinto <jpinto@synopsys.com>
> > Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
> > Cc: Alexandre Torgue <alexandre.torgue@st.com>
> > Signed-off-by: David S. Miller <davem@davemloft.net>
> >
> >
> > resolves the issue.
> >
> > The fix was first included in v4.15
> > $ git tag --contains 52a76235d0c4dd259cd0df503afed4757c04ba1d
> > v4.15
> > v4.15-rc1
> > v4.15-rc2
> >
> > Could you please queue it up for 4.14 stable?
Netdev doesn't care about 4.14 anymore, so I'll just take it, thanks.
greg k-h
^ permalink raw reply
* Re: general protection fault in __xfrm_policy_bysel_ctx
From: Florian Westphal @ 2019-01-29 9:41 UTC (permalink / raw)
To: syzbot
Cc: davem, herbert, linux-kernel, netdev, steffen.klassert,
syzkaller-bugs
In-Reply-To: <0000000000007b129305809553da@google.com>
syzbot <syzbot+e6e1fe9148cffa18cf97@syzkaller.appspotmail.com> wrote:
> Hello,
>
> syzbot found the following crash on:
>
> HEAD commit: 085c4c7dd2b6 net: lmc: remove -I. header search path
> git tree: net-next
> console output: https://syzkaller.appspot.com/x/log.txt?x=12347128c00000
> kernel config: https://syzkaller.appspot.com/x/.config?x=505743eba4e4f68
> dashboard link: https://syzkaller.appspot.com/bug?extid=e6e1fe9148cffa18cf97
> compiler: gcc (GCC) 9.0.0 20181231 (experimental)
>
> Unfortunately, I don't have any reproducer for this crash yet.
net-next doesn't contain the fixes for the rbtree fallout yet, so
this might already be fixed (fingers crossed).
^ permalink raw reply
* [net-next] cxgb4/cxgb4vf: Program hash region for {t4/t4vf}_change_mac()
From: Arjun Vynipadath @ 2019-01-29 9:48 UTC (permalink / raw)
To: netdev, davem; +Cc: nirranjan, indranil, dt, leedom, Arjun Vynipadath
{t4/t4_vf}_change_mac() API's were only doing additions to MPS_TCAM.
This will fail, when the number of tcam entries is limited particularly
in vf's.
This fix programs hash region with the mac address, when TCAM
addtion fails for {t4/t4vf}_change_mac(). Since the locally maintained
driver list for hash entries is shared across mac_{sync/unsync}(),
added an extra parameter if_mac to track the address added thorugh
{t4/t4vf}_change_mac()
Signed-off-by: Arjun Vynipadath <arjun@chelsio.com>
---
drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 3 +-
drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 71 ++++++++++++---
drivers/net/ethernet/chelsio/cxgb4vf/adapter.h | 3 +-
.../net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c | 100 +++++++++++++++------
4 files changed, 137 insertions(+), 40 deletions(-)
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 2d1ca92..568715a 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -568,7 +568,7 @@ struct sge_rspq;
struct port_info {
struct adapter *adapter;
u16 viid;
- s16 xact_addr_filt; /* index of exact MAC address filter */
+ int xact_addr_filt; /* index of exact MAC address filter */
u16 rss_size; /* size of VI's RSS table slice */
s8 mdio_addr;
enum fw_port_type port_type;
@@ -860,6 +860,7 @@ struct doorbell_stats {
struct hash_mac_addr {
struct list_head list;
u8 addr[ETH_ALEN];
+ unsigned int iface_mac;
};
struct uld_msix_bmap {
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index c664a19..e2ad7df 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -433,6 +433,60 @@ static int set_rxmode(struct net_device *dev, int mtu, bool sleep_ok)
}
/**
+ * cxgb4_change_mac - Update match filter for a MAC address.
+ * @pi: the port_info
+ * @viid: the VI id
+ * @tcam_idx: TCAM index of existing filter for old value of MAC address,
+ * or -1
+ * @addr: the new MAC address value
+ * @persist: whether a new MAC allocation should be persistent
+ * @add_smt: if true also add the address to the HW SMT
+ *
+ * Modifies an MPS filter and sets it to the new MAC address if
+ * @tcam_idx >= 0, or adds the MAC address to a new filter if
+ * @tcam_idx < 0. In the latter case the address is added persistently
+ * if @persist is %true.
+ * Addresses are programmed to hash region, if tcam runs out of entries.
+ *
+ */
+static inline int cxgb4_change_mac(struct port_info *pi, unsigned int viid,
+ int *tcam_idx, const u8 *addr,
+ bool persist, u8 *smt_idx)
+{
+ struct adapter *adapter = pi->adapter;
+ struct hash_mac_addr *entry, *new_entry;
+ int ret;
+
+ ret = t4_change_mac(adapter, adapter->mbox, viid,
+ *tcam_idx, addr, persist, smt_idx);
+ /* We ran out of TCAM entries. try programming hash region. */
+ if (ret == -ENOMEM) {
+ /* If the MAC address to be updated is in the hash addr
+ * list, update it from the list
+ */
+ list_for_each_entry(entry, &adapter->mac_hlist, list) {
+ if (entry->iface_mac) {
+ ether_addr_copy(entry->addr, addr);
+ goto set_hash;
+ }
+ }
+ new_entry = kzalloc(sizeof(*new_entry), GFP_ATOMIC);
+ if (!new_entry)
+ return -ENOMEM;
+ ether_addr_copy(new_entry->addr, addr);
+ new_entry->iface_mac = true;
+ list_add_tail(&new_entry->list, &adapter->mac_hlist);
+set_hash:
+ ret = cxgb4_set_addr_hash(pi);
+ } else if (ret >= 0) {
+ *tcam_idx = ret;
+ ret = 0;
+ }
+
+ return ret;
+}
+
+/*
* link_start - enable a port
* @dev: the port to enable
*
@@ -450,15 +504,9 @@ static int link_start(struct net_device *dev)
*/
ret = t4_set_rxmode(pi->adapter, mb, pi->viid, dev->mtu, -1, -1, -1,
!!(dev->features & NETIF_F_HW_VLAN_CTAG_RX), true);
- if (ret == 0) {
- ret = t4_change_mac(pi->adapter, mb, pi->viid,
- pi->xact_addr_filt, dev->dev_addr, true,
- &pi->smt_idx);
- if (ret >= 0) {
- pi->xact_addr_filt = ret;
- ret = 0;
- }
- }
+ if (ret == 0)
+ ret = cxgb4_change_mac(pi, pi->viid, &pi->xact_addr_filt,
+ dev->dev_addr, true, &pi->smt_idx);
if (ret == 0)
ret = t4_link_l1cfg(pi->adapter, mb, pi->tx_chan,
&pi->link_cfg);
@@ -2839,9 +2887,8 @@ static int cxgb_set_mac_addr(struct net_device *dev, void *p)
if (!is_valid_ether_addr(addr->sa_data))
return -EADDRNOTAVAIL;
- ret = t4_change_mac(pi->adapter, pi->adapter->pf, pi->viid,
- pi->xact_addr_filt, addr->sa_data, true,
- &pi->smt_idx);
+ ret = cxgb4_change_mac(pi, pi->viid, &pi->xact_addr_filt,
+ addr->sa_data, true, &pi->smt_idx);
if (ret < 0)
return ret;
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/adapter.h b/drivers/net/ethernet/chelsio/cxgb4vf/adapter.h
index 5883f09..26f48a1 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/adapter.h
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/adapter.h
@@ -94,7 +94,7 @@ struct port_info {
struct adapter *adapter; /* our adapter */
u32 vlan_id; /* vlan id for VST */
u16 viid; /* virtual interface ID */
- s16 xact_addr_filt; /* index of our MAC address filter */
+ int xact_addr_filt; /* index of our MAC address filter */
u16 rss_size; /* size of VI's RSS table slice */
u8 pidx; /* index into adapter port[] */
s8 mdio_addr;
@@ -352,6 +352,7 @@ struct sge {
struct hash_mac_addr {
struct list_head list;
u8 addr[ETH_ALEN];
+ unsigned int iface_mac;
};
struct mbox_list {
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
index 2fab87e..a8f78ea 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
@@ -236,6 +236,74 @@ void t4vf_os_portmod_changed(struct adapter *adapter, int pidx)
"inserted\n", dev->name, pi->mod_type);
}
+static inline int cxgb4vf_set_addr_hash(struct port_info *pi)
+{
+ struct adapter *adapter = pi->adapter;
+ u64 vec = 0;
+ bool ucast = false;
+ struct hash_mac_addr *entry;
+
+ /* Calculate the hash vector for the updated list and program it */
+ list_for_each_entry(entry, &adapter->mac_hlist, list) {
+ ucast |= is_unicast_ether_addr(entry->addr);
+ vec |= (1ULL << hash_mac_addr(entry->addr));
+ }
+ return t4vf_set_addr_hash(adapter, pi->viid, ucast, vec, false);
+}
+
+/**
+ * cxgb4vf_change_mac - Update match filter for a MAC address.
+ * @pi: the port_info
+ * @viid: the VI id
+ * @tcam_idx: TCAM index of existing filter for old value of MAC address,
+ * or -1
+ * @addr: the new MAC address value
+ * @persist: whether a new MAC allocation should be persistent
+ * @add_smt: if true also add the address to the HW SMT
+ *
+ * Modifies an MPS filter and sets it to the new MAC address if
+ * @tcam_idx >= 0, or adds the MAC address to a new filter if
+ * @tcam_idx < 0. In the latter case the address is added persistently
+ * if @persist is %true.
+ * Addresses are programmed to hash region, if tcam runs out of entries.
+ *
+ */
+static inline int cxgb4vf_change_mac(struct port_info *pi, unsigned int viid,
+ int *tcam_idx, const u8 *addr,
+ bool persistent)
+{
+ struct hash_mac_addr *new_entry, *entry;
+ struct adapter *adapter = pi->adapter;
+ int ret;
+
+ ret = t4vf_change_mac(adapter, viid, *tcam_idx, addr, persistent);
+ /* We ran out of TCAM entries. try programming hash region. */
+ if (ret == -ENOMEM) {
+ /* If the MAC address to be updated is in the hash addr
+ * list, update it from the list
+ */
+ list_for_each_entry(entry, &adapter->mac_hlist, list) {
+ if (entry->iface_mac) {
+ ether_addr_copy(entry->addr, addr);
+ goto set_hash;
+ }
+ }
+ new_entry = kzalloc(sizeof(*new_entry), GFP_ATOMIC);
+ if (!new_entry)
+ return -ENOMEM;
+ ether_addr_copy(new_entry->addr, addr);
+ new_entry->iface_mac = true;
+ list_add_tail(&new_entry->list, &adapter->mac_hlist);
+set_hash:
+ ret = cxgb4vf_set_addr_hash(pi);
+ } else if (ret >= 0) {
+ *tcam_idx = ret;
+ ret = 0;
+ }
+
+ return ret;
+}
+
/*
* Net device operations.
* ======================
@@ -259,14 +327,10 @@ static int link_start(struct net_device *dev)
*/
ret = t4vf_set_rxmode(pi->adapter, pi->viid, dev->mtu, -1, -1, -1, 1,
true);
- if (ret == 0) {
- ret = t4vf_change_mac(pi->adapter, pi->viid,
- pi->xact_addr_filt, dev->dev_addr, true);
- if (ret >= 0) {
- pi->xact_addr_filt = ret;
- ret = 0;
- }
- }
+ if (ret == 0)
+ ret = cxgb4vf_change_mac(pi, pi->viid,
+ &pi->xact_addr_filt,
+ dev->dev_addr, true);
/*
* We don't need to actually "start the link" itself since the
@@ -863,21 +927,6 @@ static struct net_device_stats *cxgb4vf_get_stats(struct net_device *dev)
return ns;
}
-static inline int cxgb4vf_set_addr_hash(struct port_info *pi)
-{
- struct adapter *adapter = pi->adapter;
- u64 vec = 0;
- bool ucast = false;
- struct hash_mac_addr *entry;
-
- /* Calculate the hash vector for the updated list and program it */
- list_for_each_entry(entry, &adapter->mac_hlist, list) {
- ucast |= is_unicast_ether_addr(entry->addr);
- vec |= (1ULL << hash_mac_addr(entry->addr));
- }
- return t4vf_set_addr_hash(adapter, pi->viid, ucast, vec, false);
-}
-
static int cxgb4vf_mac_sync(struct net_device *netdev, const u8 *mac_addr)
{
struct port_info *pi = netdev_priv(netdev);
@@ -1159,13 +1208,12 @@ static int cxgb4vf_set_mac_addr(struct net_device *dev, void *_addr)
if (!is_valid_ether_addr(addr->sa_data))
return -EADDRNOTAVAIL;
- ret = t4vf_change_mac(pi->adapter, pi->viid, pi->xact_addr_filt,
- addr->sa_data, true);
+ ret = cxgb4vf_change_mac(pi, pi->viid, &pi->xact_addr_filt,
+ addr->sa_data, true);
if (ret < 0)
return ret;
memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
- pi->xact_addr_filt = ret;
return 0;
}
--
2.9.5
^ permalink raw reply related
* [PATCH v2] net: macb: Apply RXUBR workaround only to versions with errata
From: Harini Katakam @ 2019-01-29 9:50 UTC (permalink / raw)
To: nicolas.ferre, davem, claudiu.beznea, brandon.streiff
Cc: netdev, linux-kernel, michal.simek, harinikatakamlinux,
harini.katakam
The interrupt handler contains a workaround for RX hang applicable
to Zynq and AT91RM9200 only. Subsequent versions do not need this
workaround. This workaround unnecessarily resets RX whenever RX used
bit read is observed, which can be often under heavy traffic. There
is no other action performed on RX UBR interrupt. Hence introduce a
CAPS mask; enable this interrupt and workaround only on affected
versions.
Signed-off-by: Harini Katakam <harini.katakam@xilinx.com>
---
v2 changes:
- Added caps mask in correct AT91RM9200 config
- Disabled RXUBR for devices other than AT91RM9200 and Zynq and
kept the interrupt and reset workaround for these two.
- Updated comment in interrupt handler.
Changes from RFC:
- Use CAPS mask instead introducing and errata field.
- Use check only on RX reset part; ISR should still be cleared.
drivers/net/ethernet/cadence/macb.h | 3 +++
drivers/net/ethernet/cadence/macb_main.c | 28 +++++++++++++++++-----------
2 files changed, 20 insertions(+), 11 deletions(-)
diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h
index 3d45f4c..9bbaad9 100644
--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
@@ -643,6 +643,7 @@
#define MACB_CAPS_JUMBO 0x00000020
#define MACB_CAPS_GEM_HAS_PTP 0x00000040
#define MACB_CAPS_BD_RD_PREFETCH 0x00000080
+#define MACB_CAPS_NEEDS_RSTONUBR 0x00000100
#define MACB_CAPS_FIFO_MODE 0x10000000
#define MACB_CAPS_GIGABIT_MODE_AVAILABLE 0x20000000
#define MACB_CAPS_SG_DISABLED 0x40000000
@@ -1214,6 +1215,8 @@ struct macb {
int rx_bd_rd_prefetch;
int tx_bd_rd_prefetch;
+
+ u32 rx_intr_mask;
};
#ifdef CONFIG_MACB_USE_HWSTAMP
diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index 66cc792..2b28826 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -56,8 +56,7 @@
/* level of occupied TX descriptors under which we wake up TX process */
#define MACB_TX_WAKEUP_THRESH(bp) (3 * (bp)->tx_ring_size / 4)
-#define MACB_RX_INT_FLAGS (MACB_BIT(RCOMP) | MACB_BIT(RXUBR) \
- | MACB_BIT(ISR_ROVR))
+#define MACB_RX_INT_FLAGS (MACB_BIT(RCOMP) | MACB_BIT(ISR_ROVR))
#define MACB_TX_ERR_FLAGS (MACB_BIT(ISR_TUND) \
| MACB_BIT(ISR_RLE) \
| MACB_BIT(TXERR))
@@ -1270,7 +1269,7 @@ static int macb_poll(struct napi_struct *napi, int budget)
queue_writel(queue, ISR, MACB_BIT(RCOMP));
napi_reschedule(napi);
} else {
- queue_writel(queue, IER, MACB_RX_INT_FLAGS);
+ queue_writel(queue, IER, bp->rx_intr_mask);
}
}
@@ -1288,7 +1287,7 @@ static void macb_hresp_error_task(unsigned long data)
u32 ctrl;
for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
- queue_writel(queue, IDR, MACB_RX_INT_FLAGS |
+ queue_writel(queue, IDR, bp->rx_intr_mask |
MACB_TX_INT_FLAGS |
MACB_BIT(HRESP));
}
@@ -1318,7 +1317,7 @@ static void macb_hresp_error_task(unsigned long data)
/* Enable interrupts */
queue_writel(queue, IER,
- MACB_RX_INT_FLAGS |
+ bp->rx_intr_mask |
MACB_TX_INT_FLAGS |
MACB_BIT(HRESP));
}
@@ -1372,14 +1371,14 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
(unsigned int)(queue - bp->queues),
(unsigned long)status);
- if (status & MACB_RX_INT_FLAGS) {
+ if (status & bp->rx_intr_mask) {
/* There's no point taking any more interrupts
* until we have processed the buffers. The
* scheduling call may fail if the poll routine
* is already scheduled, so disable interrupts
* now.
*/
- queue_writel(queue, IDR, MACB_RX_INT_FLAGS);
+ queue_writel(queue, IDR, bp->rx_intr_mask);
if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
queue_writel(queue, ISR, MACB_BIT(RCOMP));
@@ -1412,8 +1411,9 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
/* There is a hardware issue under heavy load where DMA can
* stop, this causes endless "used buffer descriptor read"
* interrupts but it can be cleared by re-enabling RX. See
- * the at91 manual, section 41.3.1 or the Zynq manual
- * section 16.7.4 for details.
+ * the at91rm9200 manual, section 41.3.1 or the Zynq manual
+ * section 16.7.4 for details. RXUBR is only enabled for
+ * these two versions.
*/
if (status & MACB_BIT(RXUBR)) {
ctrl = macb_readl(bp, NCR);
@@ -2259,7 +2259,7 @@ static void macb_init_hw(struct macb *bp)
/* Enable interrupts */
queue_writel(queue, IER,
- MACB_RX_INT_FLAGS |
+ bp->rx_intr_mask |
MACB_TX_INT_FLAGS |
MACB_BIT(HRESP));
}
@@ -3907,6 +3907,7 @@ static const struct macb_config sama5d4_config = {
};
static const struct macb_config emac_config = {
+ .caps = MACB_CAPS_NEEDS_RSTONUBR,
.clk_init = at91ether_clk_init,
.init = at91ether_init,
};
@@ -3928,7 +3929,8 @@ static const struct macb_config zynqmp_config = {
};
static const struct macb_config zynq_config = {
- .caps = MACB_CAPS_GIGABIT_MODE_AVAILABLE | MACB_CAPS_NO_GIGABIT_HALF,
+ .caps = MACB_CAPS_GIGABIT_MODE_AVAILABLE | MACB_CAPS_NO_GIGABIT_HALF |
+ MACB_CAPS_NEEDS_RSTONUBR,
.dma_burst_length = 16,
.clk_init = macb_clk_init,
.init = macb_init,
@@ -4083,6 +4085,10 @@ static int macb_probe(struct platform_device *pdev)
macb_dma_desc_get_size(bp);
}
+ bp->rx_intr_mask = MACB_RX_INT_FLAGS;
+ if (bp->caps & MACB_CAPS_NEEDS_RSTONUBR)
+ bp->rx_intr_mask |= MACB_BIT(RXUBR);
+
mac = of_get_mac_address(np);
if (mac) {
ether_addr_copy(bp->dev->dev_addr, mac);
--
2.7.4
^ permalink raw reply related
* [PATCH net-next] cxgb4vf: Update port information in cxgb4vf_open()
From: Arjun Vynipadath @ 2019-01-29 9:50 UTC (permalink / raw)
To: netdev, davem; +Cc: nirranjan, indranil, dt, leedom, Arjun Vynipadath
It's possible that the basic port information could have
changed since we first read it.
Signed-off-by: Arjun Vynipadath <arjun@chelsio.com>
---
drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
index a8f78ea..3e37815 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
@@ -855,6 +855,13 @@ static int cxgb4vf_open(struct net_device *dev)
return err;
}
+ /* It's possible that the basic port information could have
+ * changed since we first read it.
+ */
+ err = t4vf_update_port_info(pi);
+ if (err < 0)
+ return err;
+
/*
* Note that this interface is up and start everything up ...
*/
--
2.9.5
^ permalink raw reply related
* [PATCH v2] i40e: replace switch-statement to speed-up retpoline-enabled builds
From: bjorn.topel @ 2019-01-29 9:57 UTC (permalink / raw)
To: intel-wired-lan
Cc: Björn Töpel, pmenzel, brouer, magnus.karlsson,
magnus.karlsson, netdev
From: Björn Töpel <bjorn.topel@intel.com>
GCC will generate jump tables for switch-statements with more than 5
case statements. An entry into the jump table is an indirect call,
which means that for CONFIG_RETPOLINE builds, this is rather
expensive.
This commit replaces the switch-statement that acts on the XDP program
result with an if-clause.
The if-clause was also refactored into a common function that can be
used by AF_XDP zero-copy and non-zero-copy code.
Performance prior this patch:
$ sudo ./xdp_rxq_info --dev enp134s0f0 --action XDP_DROP
Running XDP on dev:enp134s0f0 (ifindex:7) action:XDP_DROP options:no_touch
XDP stats CPU pps issue-pps
XDP-RX CPU 20 18983018 0
XDP-RX CPU total 18983018
RXQ stats RXQ:CPU pps issue-pps
rx_queue_index 20:20 18983012 0
rx_queue_index 20:sum 18983012
$ sudo ./xdpsock -i enp134s0f0 -q 20 -n 2 -z -r
sock0@enp134s0f0:20 rxdrop
pps pkts 2.00
rx 14,641,496 144,751,092
tx 0 0
And after:
$ sudo ./xdp_rxq_info --dev enp134s0f0 --action XDP_DROP
Running XDP on dev:enp134s0f0 (ifindex:7) action:XDP_DROP options:no_touch
XDP stats CPU pps issue-pps
XDP-RX CPU 20 24000986 0
XDP-RX CPU total 24000986
RXQ stats RXQ:CPU pps issue-pps
rx_queue_index 20:20 24000985 0
rx_queue_index 20:sum 24000985
+26%
$ sudo ./xdpsock -i enp134s0f0 -q 20 -n 2 -z -r
sock0@enp134s0f0:20 rxdrop
pps pkts 2.00
rx 17,623,578 163,503,263
tx 0 0
+20%
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
v1->v2:
* Fixed build error on alpha "error: implicit declaration of function
'unlikely'; did you mean 'inline'? " (kbuild test robot)
* Improved commit message (Paul Menzel)
---
drivers/net/ethernet/intel/i40e/i40e_txrx.c | 32 ++++---------------
.../ethernet/intel/i40e/i40e_txrx_common.h | 27 ++++++++++++++++
drivers/net/ethernet/intel/i40e/i40e_xsk.c | 24 ++------------
3 files changed, 36 insertions(+), 47 deletions(-)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index a7e14e98889f..4f530427ce61 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2,7 +2,7 @@
/* Copyright(c) 2013 - 2018 Intel Corporation. */
#include <linux/prefetch.h>
-#include <linux/bpf_trace.h>
+#include <linux/compiler.h>
#include <net/xdp.h>
#include "i40e.h"
#include "i40e_trace.h"
@@ -2195,41 +2195,23 @@ int i40e_xmit_xdp_tx_ring(struct xdp_buff *xdp, struct i40e_ring *xdp_ring)
static struct sk_buff *i40e_run_xdp(struct i40e_ring *rx_ring,
struct xdp_buff *xdp)
{
- int err, result = I40E_XDP_PASS;
- struct i40e_ring *xdp_ring;
struct bpf_prog *xdp_prog;
+ int result;
u32 act;
rcu_read_lock();
xdp_prog = READ_ONCE(rx_ring->xdp_prog);
- if (!xdp_prog)
+ if (!xdp_prog) {
+ result = I40E_XDP_PASS;
goto xdp_out;
+ }
prefetchw(xdp->data_hard_start); /* xdp_frame write */
act = bpf_prog_run_xdp(xdp_prog, xdp);
- switch (act) {
- case XDP_PASS:
- break;
- case XDP_TX:
- xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index];
- result = i40e_xmit_xdp_tx_ring(xdp, xdp_ring);
- break;
- case XDP_REDIRECT:
- err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
- result = !err ? I40E_XDP_REDIR : I40E_XDP_CONSUMED;
- break;
- default:
- bpf_warn_invalid_xdp_action(act);
- /* fall through */
- case XDP_ABORTED:
- trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
- /* fall through -- handle aborts by dropping packet */
- case XDP_DROP:
- result = I40E_XDP_CONSUMED;
- break;
- }
+ i40e_xdp_do_action(act, &result, rx_ring, xdp, xdp_prog);
+
xdp_out:
rcu_read_unlock();
return ERR_PTR(-result);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
index 8af0e99c6c0d..8cc4d8365f9e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
@@ -4,6 +4,8 @@
#ifndef I40E_TXRX_COMMON_
#define I40E_TXRX_COMMON_
+#include <linux/bpf_trace.h>
+
void i40e_fd_handle_status(struct i40e_ring *rx_ring,
union i40e_rx_desc *rx_desc, u8 prog_id);
int i40e_xmit_xdp_tx_ring(struct xdp_buff *xdp, struct i40e_ring *xdp_ring);
@@ -88,4 +90,29 @@ void i40e_xsk_clean_rx_ring(struct i40e_ring *rx_ring);
void i40e_xsk_clean_tx_ring(struct i40e_ring *tx_ring);
bool i40e_xsk_any_rx_ring_enabled(struct i40e_vsi *vsi);
+static inline void i40e_xdp_do_action(u32 act, int *result,
+ struct i40e_ring *rx_ring,
+ struct xdp_buff *xdp,
+ struct bpf_prog *xdp_prog)
+{
+ struct i40e_ring *xdp_ring;
+ int err;
+
+ if (act == XDP_TX) {
+ xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index];
+ *result = i40e_xmit_xdp_tx_ring(xdp, xdp_ring);
+ } else if (act == XDP_REDIRECT) {
+ err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
+ *result = !err ? I40E_XDP_REDIR : I40E_XDP_CONSUMED;
+ } else if (act == XDP_PASS) {
+ *result = I40E_XDP_PASS;
+ } else if (act == XDP_DROP) {
+ *result = I40E_XDP_CONSUMED;
+ } else {
+ if (act != XDP_ABORTED)
+ bpf_warn_invalid_xdp_action(act);
+ trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
+ *result = I40E_XDP_CONSUMED;
+ }
+}
#endif /* I40E_TXRX_COMMON_ */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index 96d849460d9b..c9d58f49f7a2 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -210,9 +210,8 @@ int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem,
**/
static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
{
- int err, result = I40E_XDP_PASS;
- struct i40e_ring *xdp_ring;
struct bpf_prog *xdp_prog;
+ int result;
u32 act;
rcu_read_lock();
@@ -222,26 +221,7 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
xdp_prog = READ_ONCE(rx_ring->xdp_prog);
act = bpf_prog_run_xdp(xdp_prog, xdp);
xdp->handle += xdp->data - xdp->data_hard_start;
- switch (act) {
- case XDP_PASS:
- break;
- case XDP_TX:
- xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index];
- result = i40e_xmit_xdp_tx_ring(xdp, xdp_ring);
- break;
- case XDP_REDIRECT:
- err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
- result = !err ? I40E_XDP_REDIR : I40E_XDP_CONSUMED;
- break;
- default:
- bpf_warn_invalid_xdp_action(act);
- case XDP_ABORTED:
- trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
- /* fallthrough -- handle aborts by dropping packet */
- case XDP_DROP:
- result = I40E_XDP_CONSUMED;
- break;
- }
+ i40e_xdp_do_action(act, &result, rx_ring, xdp, xdp_prog);
rcu_read_unlock();
return result;
}
--
2.19.1
^ permalink raw reply related
* [PATCH net-next] MAINTAINERS: update cxgb4 and cxgb3 maintainer
From: Arjun Vynipadath @ 2019-01-29 10:08 UTC (permalink / raw)
To: netdev, davem; +Cc: nirranjan, indranil, dt, leedom, Arjun Vynipadath
Vishal Kulkarni will be the new maintainer for Chelsio cxgb3/cxgb4
drivers.
Signed-off-by: Arjun Vynipadath <arjun@chelsio.com>
---
MAINTAINERS | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index a0245fd..071f1f5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4122,7 +4122,7 @@ S: Maintained
F: drivers/media/dvb-frontends/cxd2820r*
CXGB3 ETHERNET DRIVER (CXGB3)
-M: Arjun Vynipadath <arjun@chelsio.com>
+M: Vishal Kulkarni <vishal@chelsio.com>
L: netdev@vger.kernel.org
W: http://www.chelsio.com
S: Supported
@@ -4151,7 +4151,7 @@ S: Supported
F: drivers/crypto/chelsio
CXGB4 ETHERNET DRIVER (CXGB4)
-M: Arjun Vynipadath <arjun@chelsio.com>
+M: Vishal Kulkarni <vishal@chelsio.com>
L: netdev@vger.kernel.org
W: http://www.chelsio.com
S: Supported
--
2.9.5
^ permalink raw reply related
* Re: [PATCH 3/7] sh_eth: offload RX checksum on R7S72100
From: Sergei Shtylyov @ 2019-01-29 10:37 UTC (permalink / raw)
To: Simon Horman; +Cc: netdev, David S. Miller, linux-renesas-soc, linux-sh
In-Reply-To: <20190129080004.blc3jrt5y2j7dauo@verge.net.au>
Hello!
On 01/29/2019 11:00 AM, Simon Horman wrote:
>>>> The RZ/A1H (R7S721000) SoC manual describes the Ether MAC's RX checksum
>>>> offload the same way as it's implemented in the EtherAVB MACs...
>>>>
>>>> Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
>>>
>>> Regarding this and the remaining patches in this series,
>>> which add rx-csum offload support in the sh_eth driver for
>>> various SoCs: has this been tested?
>>
>> As I said, I've only tested it on R8A77980.
And still hoping Geert would be able to test on R8A7740.
>
> Thanks, I missed that.
>
> As you may have guessed the implication of my question is that
> IMHO it would be best only to add this feature to SoCs where
> it has been tested.
You don't trust the manuals? :-)
MBR, Sergei
^ permalink raw reply
* Re: [PATCH 0/7] sh_eth: implement simple RX checksum offload
From: Sergei Shtylyov @ 2019-01-29 11:06 UTC (permalink / raw)
To: Heiner Kallweit, netdev, David S. Miller; +Cc: linux-renesas-soc, linux-sh
In-Reply-To: <62dbdf72-4e3f-6c3e-901f-5e8f6de8dec3@gmail.com>
Hello!
On 01/27/2019 08:52 PM, Heiner Kallweit wrote:
>> Here's a set of 7 patches against DaveM's 'net-next.git' repo. I'm implemeting
>> the simple RX checksum offload (like was done for the 'ravb' driver by Simon
>> Horman); it was only tested on the R8A77980 SoC, the other SoCs should just
>> work (according to their manuals)...
>>
>> [1/7] sh_eth: rename sh_eth_cpu_data::hw_checksum
>> [2/7] sh_eth: RX checksum offload support
>> [3/7] sh_eth: offload RX checksum on R7S72100
>> [4/7] sh_eth: offload RX checksum on R8A7740
>> [5/7] sh_eth: offload RX checksum on R8A77980
>> [6/7] sh_eth: offload RX checksum on SH7734
>> [7/7] sh_eth: offload RX checksum on SH7763
>>
>> MBR, Sergei
>>
> Hi Sergei,
>
> the formatting of the patch series isn't in line with the netdev standards.
> See here: https://www.kernel.org/doc/html/latest/networking/netdev-FAQ.html
>
> - cover letter isn't generated by git
Sorry, I don't use git for development. However, I fail to see where this
is requested in the above FAQ.
>
> - That's not ok
> --- *net-next.orig/*drivers/net/ethernet/renesas/sh_eth.h
> +++ *net-next/*drivers/net/ethernet/renesas/sh_eth.h
>
> - patches miss net / net-next annotation
I noted the applicable repo in the cover letter. I know I should use
the subject... but I just keep forgetting about this requirement. :-)
> Heiner
MBR, Sergei
^ permalink raw reply
* Re: [PATCH v2] i40e: replace switch-statement to speed-up retpoline-enabled builds
From: Daniel Borkmann @ 2019-01-29 11:17 UTC (permalink / raw)
To: bjorn.topel, intel-wired-lan
Cc: Björn Töpel, pmenzel, brouer, magnus.karlsson,
magnus.karlsson, netdev, alexei.starovoitov, davem
In-Reply-To: <20190129095754.9390-1-bjorn.topel@gmail.com>
On 01/29/2019 10:57 AM, bjorn.topel@gmail.com wrote:
> From: Björn Töpel <bjorn.topel@intel.com>
>
> GCC will generate jump tables for switch-statements with more than 5
> case statements. An entry into the jump table is an indirect call,
> which means that for CONFIG_RETPOLINE builds, this is rather
> expensive.
>
> This commit replaces the switch-statement that acts on the XDP program
> result with an if-clause.
>
> The if-clause was also refactored into a common function that can be
> used by AF_XDP zero-copy and non-zero-copy code.
>
> Performance prior this patch:
> $ sudo ./xdp_rxq_info --dev enp134s0f0 --action XDP_DROP
> Running XDP on dev:enp134s0f0 (ifindex:7) action:XDP_DROP options:no_touch
> XDP stats CPU pps issue-pps
> XDP-RX CPU 20 18983018 0
> XDP-RX CPU total 18983018
>
> RXQ stats RXQ:CPU pps issue-pps
> rx_queue_index 20:20 18983012 0
> rx_queue_index 20:sum 18983012
>
> $ sudo ./xdpsock -i enp134s0f0 -q 20 -n 2 -z -r
> sock0@enp134s0f0:20 rxdrop
> pps pkts 2.00
> rx 14,641,496 144,751,092
> tx 0 0
>
> And after:
> $ sudo ./xdp_rxq_info --dev enp134s0f0 --action XDP_DROP
> Running XDP on dev:enp134s0f0 (ifindex:7) action:XDP_DROP options:no_touch
> XDP stats CPU pps issue-pps
> XDP-RX CPU 20 24000986 0
> XDP-RX CPU total 24000986
>
> RXQ stats RXQ:CPU pps issue-pps
> rx_queue_index 20:20 24000985 0
> rx_queue_index 20:sum 24000985
>
> +26%
>
> $ sudo ./xdpsock -i enp134s0f0 -q 20 -n 2 -z -r
> sock0@enp134s0f0:20 rxdrop
> pps pkts 2.00
> rx 17,623,578 163,503,263
> tx 0 0
>
> +20%
>
> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Looks good. Given the performance improvements, wondering in general whether
it would make sense to raise the default limit for generating jump tables if
we have CONFIG_RETPOLINE enabled; as in:
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 9c5a67d..33495a9 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -217,6 +217,8 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
# Avoid indirect branches in kernel to deal with Spectre
ifdef CONFIG_RETPOLINE
KBUILD_CFLAGS += $(RETPOLINE_CFLAGS)
+ # Avoid generating slow indirect jumps for small number of switch cases
+ KBUILD_CFLAGS += --param case-values-threshold=12
endif
archscripts: scripts_basic
That would likely bloat the kernel a bit also in slow-path places where it
would not be needed, but it would generically catch majority of cases. I'll
run some experiments later today (but in any case that should not block this
patch here).
Cheers,
Daniel
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox