* [PATCH net-next v4 3/4] bpf: BPF for lightweight tunnel infrastructure
From: Thomas Graf @ 2016-11-30 16:10 UTC (permalink / raw)
To: davem; +Cc: netdev, alexei.starovoitov, daniel, tom, roopa, hannes
In-Reply-To: <cover.1480522144.git.tgraf@suug.ch>
Registers new BPF program types which correspond to the LWT hooks:
- BPF_PROG_TYPE_LWT_IN => dst_input()
- BPF_PROG_TYPE_LWT_OUT => dst_output()
- BPF_PROG_TYPE_LWT_XMIT => lwtunnel_xmit()
The separate program types are required to differentiate between the
capabilities each LWT hook allows:
* Programs attached to dst_input() or dst_output() are restricted and
may only read the data of an skb. This prevents modification and
possible invalidation of already validated packet headers on receive
and the construction of illegal headers while the IP headers are
still being assembled.
* Programs attached to lwtunnel_xmit() are allowed to modify packet
content as well as prepending an L2 header via a newly introduced
helper bpf_skb_change_head(). This is safe as lwtunnel_xmit() is
invoked after the IP header has been assembled completely.
All BPF programs receive an skb with L3 headers attached and may return
one of the following error codes:
BPF_OK - Continue routing as per nexthop
BPF_DROP - Drop skb and return EPERM
BPF_REDIRECT - Redirect skb to device as per redirect() helper.
(Only valid in lwtunnel_xmit() context)
The return codes are binary compatible with their TC_ACT_
relatives to ease compatibility.
Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
include/linux/filter.h | 2 +-
include/uapi/linux/bpf.h | 32 +++-
include/uapi/linux/lwtunnel.h | 23 +++
kernel/bpf/verifier.c | 14 +-
net/Kconfig | 8 +
net/core/Makefile | 1 +
net/core/filter.c | 173 ++++++++++++++++++
net/core/lwt_bpf.c | 396 ++++++++++++++++++++++++++++++++++++++++++
net/core/lwtunnel.c | 2 +
9 files changed, 646 insertions(+), 5 deletions(-)
create mode 100644 net/core/lwt_bpf.c
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 7f246a2..7ba6446 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -438,7 +438,7 @@ struct xdp_buff {
};
/* compute the linear packet data range [data, data_end) which
- * will be accessed by cls_bpf and act_bpf programs
+ * will be accessed by cls_bpf, act_bpf and lwt programs
*/
static inline void bpf_compute_data_end(struct sk_buff *skb)
{
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1370a9d..22ac827 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -101,6 +101,9 @@ enum bpf_prog_type {
BPF_PROG_TYPE_XDP,
BPF_PROG_TYPE_PERF_EVENT,
BPF_PROG_TYPE_CGROUP_SKB,
+ BPF_PROG_TYPE_LWT_IN,
+ BPF_PROG_TYPE_LWT_OUT,
+ BPF_PROG_TYPE_LWT_XMIT,
};
enum bpf_attach_type {
@@ -409,6 +412,16 @@ union bpf_attr {
*
* int bpf_get_numa_node_id()
* Return: Id of current NUMA node.
+ *
+ * int bpf_skb_change_head()
+ * Grows headroom of skb and adjusts MAC header offset accordingly.
+ * Will extend/reallocate as required automatically.
+ * May change skb data pointer and will thus invalidate any check
+ * performed for direct packet access.
+ * @skb: pointer to skb
+ * @len: length of header to be pushed in front
+ * @flags: Flags (unused for now)
+ * Return: 0 on success or negative error
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -453,7 +466,8 @@ union bpf_attr {
FN(skb_pull_data), \
FN(csum_update), \
FN(set_hash_invalid), \
- FN(get_numa_node_id),
+ FN(get_numa_node_id), \
+ FN(skb_change_head),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
@@ -537,6 +551,22 @@ struct bpf_tunnel_key {
__u32 tunnel_label;
};
+/* Generic BPF return codes which all BPF program types may support.
+ * The values are binary compatible with their TC_ACT_* counterparts to
+ * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT
+ * programs.
+ *
+ * XDP is handled separately, see XDP_*.
+ */
+enum bpf_ret_code {
+ BPF_OK = 0,
+ /* 1 reserved */
+ BPF_DROP = 2,
+ /* 3-6 reserved */
+ BPF_REDIRECT = 7,
+ /* >127 are reserved for prog type specific return codes */
+};
+
/* User return codes for XDP prog type.
* A valid XDP program must return one of these defined values. All other
* return codes are reserved for future use. Unknown return codes will result
diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
index 453cc62..92724cba 100644
--- a/include/uapi/linux/lwtunnel.h
+++ b/include/uapi/linux/lwtunnel.h
@@ -10,6 +10,7 @@ enum lwtunnel_encap_types {
LWTUNNEL_ENCAP_ILA,
LWTUNNEL_ENCAP_IP6,
LWTUNNEL_ENCAP_SEG6,
+ LWTUNNEL_ENCAP_BPF,
__LWTUNNEL_ENCAP_MAX,
};
@@ -43,4 +44,26 @@ enum lwtunnel_ip6_t {
#define LWTUNNEL_IP6_MAX (__LWTUNNEL_IP6_MAX - 1)
+enum {
+ LWT_BPF_PROG_UNSPEC,
+ LWT_BPF_PROG_FD,
+ LWT_BPF_PROG_NAME,
+ __LWT_BPF_PROG_MAX,
+};
+
+#define LWT_BPF_PROG_MAX (__LWT_BPF_PROG_MAX - 1)
+
+enum {
+ LWT_BPF_UNSPEC,
+ LWT_BPF_IN,
+ LWT_BPF_OUT,
+ LWT_BPF_XMIT,
+ LWT_BPF_XMIT_HEADROOM,
+ __LWT_BPF_MAX,
+};
+
+#define LWT_BPF_MAX (__LWT_BPF_MAX - 1)
+
+#define LWT_BPF_MAX_HEADROOM 256
+
#endif /* _UAPI_LWTUNNEL_H_ */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8740c5f..8135cb1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -633,12 +633,19 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
#define MAX_PACKET_OFF 0xffff
static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
- const struct bpf_call_arg_meta *meta)
+ const struct bpf_call_arg_meta *meta,
+ enum bpf_access_type t)
{
switch (env->prog->type) {
+ case BPF_PROG_TYPE_LWT_IN:
+ case BPF_PROG_TYPE_LWT_OUT:
+ /* dst_input() and dst_output() can't write for now */
+ if (t == BPF_WRITE)
+ return false;
case BPF_PROG_TYPE_SCHED_CLS:
case BPF_PROG_TYPE_SCHED_ACT:
case BPF_PROG_TYPE_XDP:
+ case BPF_PROG_TYPE_LWT_XMIT:
if (meta)
return meta->pkt_access;
@@ -837,7 +844,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
err = check_stack_read(state, off, size, value_regno);
}
} else if (state->regs[regno].type == PTR_TO_PACKET) {
- if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) {
+ if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
verbose("cannot write into packet\n");
return -EACCES;
}
@@ -970,7 +977,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
return 0;
}
- if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) {
+ if (type == PTR_TO_PACKET &&
+ !may_access_direct_pkt_data(env, meta, BPF_READ)) {
verbose("helper access to the packet is not allowed\n");
return -EACCES;
}
diff --git a/net/Kconfig b/net/Kconfig
index 7b6cd34..a100500 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -402,6 +402,14 @@ config LWTUNNEL
weight tunnel endpoint. Tunnel encapsulation parameters are stored
with light weight tunnel state associated with fib routes.
+config LWTUNNEL_BPF
+ bool "Execute BPF program as route nexthop action"
+ depends on LWTUNNEL
+ default y if LWTUNNEL=y
+ ---help---
+ Allows to run BPF programs as a nexthop action following a route
+ lookup for incoming and outgoing packets.
+
config DST_CACHE
bool
default n
diff --git a/net/core/Makefile b/net/core/Makefile
index d6508c2..f6761b6 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
+obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
obj-$(CONFIG_DST_CACHE) += dst_cache.o
obj-$(CONFIG_HWBM) += hwbm.o
obj-$(CONFIG_NET_DEVLINK) += devlink.o
diff --git a/net/core/filter.c b/net/core/filter.c
index 698a262..1c4d0fa 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1689,6 +1689,12 @@ static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
u32 flags)
{
+ /* Verify that a link layer header is carried */
+ if (unlikely(skb->mac_header >= skb->network_header)) {
+ kfree_skb(skb);
+ return -ERANGE;
+ }
+
bpf_push_mac_rcsum(skb);
return flags & BPF_F_INGRESS ?
__bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
@@ -2188,12 +2194,53 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = {
.arg3_type = ARG_ANYTHING,
};
+/* bpf_skb_change_head() helper: grow the headroom of @skb by @head_room
+ * bytes, zero the newly pushed area and reset the MAC header to the new
+ * start of data.
+ *
+ * @flags is reserved and must be 0.
+ *
+ * Returns 0 on success, -EINVAL for non-zero flags or an invalid resulting
+ * length, or the negative error reported by skb_cow() when the headroom
+ * could not be provided.
+ */
+BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
+	   u64, flags)
+{
+	u32 max_len = __bpf_skb_max_len(skb);
+	u32 new_len = skb->len + head_room;
+	int ret;
+
+	/* Reject reserved flags, a non-GSO skb growing beyond max_len and
+	 * u32 wrap-around of the new length.
+	 */
+	if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) ||
+		     new_len < skb->len))
+		return -EINVAL;
+
+	ret = skb_cow(skb, head_room);
+	if (likely(!ret)) {
+		/* Idea for this helper is that we currently only
+		 * allow to expand on mac header. This means that
+		 * skb->protocol network header, etc, stay as is.
+		 * Compared to bpf_skb_change_tail(), we're more
+		 * flexible due to not needing to linearize or
+		 * reset GSO. Intention for this helper is to be
+		 * used by an L3 skb that needs to push mac header
+		 * for redirection into L2 device.
+		 */
+		__skb_push(skb, head_room);
+		memset(skb->data, 0, head_room);
+		skb_reset_mac_header(skb);
+	}
+
+	/* Refresh the cached data_end regardless of the outcome, then
+	 * propagate a skb_cow() failure instead of reporting success for
+	 * a header that was never pushed.
+	 */
+	bpf_compute_data_end(skb);
+	return ret;
+}
+
+static const struct bpf_func_proto bpf_skb_change_head_proto = {
+ .func = bpf_skb_change_head,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
bool bpf_helper_changes_skb_data(void *func)
{
if (func == bpf_skb_vlan_push ||
func == bpf_skb_vlan_pop ||
func == bpf_skb_store_bytes ||
func == bpf_skb_change_proto ||
+ func == bpf_skb_change_head ||
func == bpf_skb_change_tail ||
func == bpf_skb_pull_data ||
func == bpf_l3_csum_replace ||
@@ -2639,6 +2686,68 @@ cg_skb_func_proto(enum bpf_func_id func_id)
}
}
+static const struct bpf_func_proto *
+lwt_inout_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_pull_data:
+ return &bpf_skb_pull_data_proto;
+ case BPF_FUNC_csum_diff:
+ return &bpf_csum_diff_proto;
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_proto;
+ case BPF_FUNC_get_route_realm:
+ return &bpf_get_route_realm_proto;
+ case BPF_FUNC_get_hash_recalc:
+ return &bpf_get_hash_recalc_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_skb_event_output_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_skb_under_cgroup:
+ return &bpf_skb_under_cgroup_proto;
+ default:
+ return sk_filter_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
+lwt_xmit_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_get_tunnel_key:
+ return &bpf_skb_get_tunnel_key_proto;
+ case BPF_FUNC_skb_set_tunnel_key:
+ return bpf_get_skb_set_tunnel_proto(func_id);
+ case BPF_FUNC_skb_get_tunnel_opt:
+ return &bpf_skb_get_tunnel_opt_proto;
+ case BPF_FUNC_skb_set_tunnel_opt:
+ return bpf_get_skb_set_tunnel_proto(func_id);
+ case BPF_FUNC_redirect:
+ return &bpf_redirect_proto;
+ case BPF_FUNC_clone_redirect:
+ return &bpf_clone_redirect_proto;
+ case BPF_FUNC_skb_change_tail:
+ return &bpf_skb_change_tail_proto;
+ case BPF_FUNC_skb_change_head:
+ return &bpf_skb_change_head_proto;
+ case BPF_FUNC_skb_store_bytes:
+ return &bpf_skb_store_bytes_proto;
+ case BPF_FUNC_csum_update:
+ return &bpf_csum_update_proto;
+ case BPF_FUNC_l3_csum_replace:
+ return &bpf_l3_csum_replace_proto;
+ case BPF_FUNC_l4_csum_replace:
+ return &bpf_l4_csum_replace_proto;
+ case BPF_FUNC_set_hash_invalid:
+ return &bpf_set_hash_invalid_proto;
+ default:
+ return lwt_inout_func_proto(func_id);
+ }
+}
+
static bool __is_valid_access(int off, int size, enum bpf_access_type type)
{
if (off < 0 || off >= sizeof(struct __sk_buff))
@@ -2676,6 +2785,39 @@ static bool sk_filter_is_valid_access(int off, int size,
return __is_valid_access(off, size, type);
}
+static bool lwt_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ enum bpf_reg_type *reg_type)
+{
+ switch (off) {
+ case offsetof(struct __sk_buff, tc_classid):
+ return false;
+ }
+
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case offsetof(struct __sk_buff, mark):
+ case offsetof(struct __sk_buff, priority):
+ case offsetof(struct __sk_buff, cb[0]) ...
+ offsetof(struct __sk_buff, cb[4]):
+ break;
+ default:
+ return false;
+ }
+ }
+
+ switch (off) {
+ case offsetof(struct __sk_buff, data):
+ *reg_type = PTR_TO_PACKET;
+ break;
+ case offsetof(struct __sk_buff, data_end):
+ *reg_type = PTR_TO_PACKET_END;
+ break;
+ }
+
+ return __is_valid_access(off, size, type);
+}
+
static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
const struct bpf_prog *prog)
{
@@ -3007,6 +3149,19 @@ static const struct bpf_verifier_ops cg_skb_ops = {
.convert_ctx_access = sk_filter_convert_ctx_access,
};
+static const struct bpf_verifier_ops lwt_inout_ops = {
+ .get_func_proto = lwt_inout_func_proto,
+ .is_valid_access = lwt_is_valid_access,
+ .convert_ctx_access = sk_filter_convert_ctx_access,
+};
+
+static const struct bpf_verifier_ops lwt_xmit_ops = {
+ .get_func_proto = lwt_xmit_func_proto,
+ .is_valid_access = lwt_is_valid_access,
+ .convert_ctx_access = sk_filter_convert_ctx_access,
+ .gen_prologue = tc_cls_act_prologue,
+};
+
static struct bpf_prog_type_list sk_filter_type __read_mostly = {
.ops = &sk_filter_ops,
.type = BPF_PROG_TYPE_SOCKET_FILTER,
@@ -3032,6 +3187,21 @@ static struct bpf_prog_type_list cg_skb_type __read_mostly = {
.type = BPF_PROG_TYPE_CGROUP_SKB,
};
+static struct bpf_prog_type_list lwt_in_type __read_mostly = {
+ .ops = &lwt_inout_ops,
+ .type = BPF_PROG_TYPE_LWT_IN,
+};
+
+static struct bpf_prog_type_list lwt_out_type __read_mostly = {
+ .ops = &lwt_inout_ops,
+ .type = BPF_PROG_TYPE_LWT_OUT,
+};
+
+static struct bpf_prog_type_list lwt_xmit_type __read_mostly = {
+ .ops = &lwt_xmit_ops,
+ .type = BPF_PROG_TYPE_LWT_XMIT,
+};
+
static int __init register_sk_filter_ops(void)
{
bpf_register_prog_type(&sk_filter_type);
@@ -3039,6 +3209,9 @@ static int __init register_sk_filter_ops(void)
bpf_register_prog_type(&sched_act_type);
bpf_register_prog_type(&xdp_type);
bpf_register_prog_type(&cg_skb_type);
+ bpf_register_prog_type(&lwt_in_type);
+ bpf_register_prog_type(&lwt_out_type);
+ bpf_register_prog_type(&lwt_xmit_type);
return 0;
}
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
new file mode 100644
index 0000000..71bb3e2
--- /dev/null
+++ b/net/core/lwt_bpf.c
@@ -0,0 +1,396 @@
+/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <net/lwtunnel.h>
+
+struct bpf_lwt_prog {
+ struct bpf_prog *prog;
+ char *name;
+};
+
+struct bpf_lwt {
+ struct bpf_lwt_prog in;
+ struct bpf_lwt_prog out;
+ struct bpf_lwt_prog xmit;
+ int family;
+};
+
+#define MAX_PROG_NAME 256
+
+static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
+{
+ return (struct bpf_lwt *)lwt->data;
+}
+
+#define NO_REDIRECT false
+#define CAN_REDIRECT true
+
+static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
+ struct dst_entry *dst, bool can_redirect)
+{
+ int ret;
+
+ /* Preempt disable is needed to protect per-cpu redirect_info between
+ * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
+ * access to maps strictly require a rcu_read_lock() for protection,
+ * mixing with BH RCU lock doesn't work.
+ */
+ preempt_disable();
+ rcu_read_lock();
+ bpf_compute_data_end(skb);
+ ret = bpf_prog_run_save_cb(lwt->prog, skb);
+ rcu_read_unlock();
+
+ switch (ret) {
+ case BPF_OK:
+ break;
+
+ case BPF_REDIRECT:
+ if (unlikely(!can_redirect)) {
+ pr_warn_once("Illegal redirect return code in prog %s\n",
+ lwt->name ? : "<unknown>");
+ ret = BPF_OK;
+ } else {
+ ret = skb_do_redirect(skb);
+ if (ret == 0)
+ ret = BPF_REDIRECT;
+ }
+ break;
+
+ case BPF_DROP:
+ kfree_skb(skb);
+ ret = -EPERM;
+ break;
+
+ default:
+ pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret);
+ kfree_skb(skb);
+ ret = -EINVAL;
+ break;
+ }
+
+ preempt_enable();
+
+ return ret;
+}
+
+/* dst_input() hook: run the LWT_IN program attached to the dst (if any)
+ * and then continue with the input handler saved in lwtstate->orig_input.
+ *
+ * Returns the negative error from run_lwt_bpf() when the program run
+ * failed (the skb has been freed there), -EINVAL when no original input
+ * handler was recorded (the skb is freed here), otherwise the result of
+ * orig_input().
+ */
+static int bpf_input(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct bpf_lwt *bpf;
+	int ret;
+
+	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+	if (bpf->in.prog) {
+		/* Input programs are not allowed to redirect. */
+		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
+		if (ret < 0)
+			return ret;
+	}
+
+	if (unlikely(!dst->lwtstate->orig_input)) {
+		/* Report the input program's name; this is the input hook. */
+		pr_warn_once("orig_input not set on dst for prog %s\n",
+			     bpf->in.name);
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+
+	return dst->lwtstate->orig_input(skb);
+}
+
+static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct bpf_lwt *bpf;
+ int ret;
+
+ bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+ if (bpf->out.prog) {
+ ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (unlikely(!dst->lwtstate->orig_output)) {
+ pr_warn_once("orig_output not set on dst for prog %s\n",
+ bpf->out.name);
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return dst->lwtstate->orig_output(net, sk, skb);
+}
+
+static int xmit_check_hhlen(struct sk_buff *skb)
+{
+ int hh_len = skb_dst(skb)->dev->hard_header_len;
+
+ if (skb_headroom(skb) < hh_len) {
+ int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));
+
+ if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC))
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int bpf_xmit(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct bpf_lwt *bpf;
+
+ bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+ if (bpf->xmit.prog) {
+ int ret;
+
+ ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
+ switch (ret) {
+ case BPF_OK:
+ /* If the header was expanded, headroom might be too
+ * small for L2 header to come, expand as needed.
+ */
+ ret = xmit_check_hhlen(skb);
+ if (unlikely(ret))
+ return ret;
+
+ return LWTUNNEL_XMIT_CONTINUE;
+ case BPF_REDIRECT:
+ return LWTUNNEL_XMIT_DONE;
+ default:
+ return ret;
+ }
+ }
+
+ return LWTUNNEL_XMIT_CONTINUE;
+}
+
+static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog)
+{
+ if (prog->prog)
+ bpf_prog_put(prog->prog);
+
+ kfree(prog->name);
+}
+
+static void bpf_destroy_state(struct lwtunnel_state *lwt)
+{
+ struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
+
+ bpf_lwt_prog_destroy(&bpf->in);
+ bpf_lwt_prog_destroy(&bpf->out);
+ bpf_lwt_prog_destroy(&bpf->xmit);
+}
+
+static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
+ [LWT_BPF_PROG_FD] = { .type = NLA_U32, },
+ [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
+ .len = MAX_PROG_NAME },
+};
+
+static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
+ enum bpf_prog_type type)
+{
+ struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
+ struct bpf_prog *p;
+ int ret;
+ u32 fd;
+
+ ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
+ return -EINVAL;
+
+ prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL);
+ if (!prog->name)
+ return -ENOMEM;
+
+ fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
+ p = bpf_prog_get_type(fd, type);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ prog->prog = p;
+
+ return 0;
+}
+
+static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
+ [LWT_BPF_IN] = { .type = NLA_NESTED, },
+ [LWT_BPF_OUT] = { .type = NLA_NESTED, },
+ [LWT_BPF_XMIT] = { .type = NLA_NESTED, },
+ [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 },
+};
+
+static int bpf_build_state(struct net_device *dev, struct nlattr *nla,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts)
+{
+ struct nlattr *tb[LWT_BPF_MAX + 1];
+ struct lwtunnel_state *newts;
+ struct bpf_lwt *bpf;
+ int ret;
+
+ if (family != AF_INET && family != AF_INET6)
+ return -EAFNOSUPPORT;
+
+ ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT])
+ return -EINVAL;
+
+ newts = lwtunnel_state_alloc(sizeof(*bpf));
+ if (!newts)
+ return -ENOMEM;
+
+ newts->type = LWTUNNEL_ENCAP_BPF;
+ bpf = bpf_lwt_lwtunnel(newts);
+
+ if (tb[LWT_BPF_IN]) {
+ newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
+ ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in,
+ BPF_PROG_TYPE_LWT_IN);
+ if (ret < 0)
+ goto errout;
+ }
+
+ if (tb[LWT_BPF_OUT]) {
+ newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+ ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out,
+ BPF_PROG_TYPE_LWT_OUT);
+ if (ret < 0)
+ goto errout;
+ }
+
+ if (tb[LWT_BPF_XMIT]) {
+ newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
+ ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit,
+ BPF_PROG_TYPE_LWT_XMIT);
+ if (ret < 0)
+ goto errout;
+ }
+
+ if (tb[LWT_BPF_XMIT_HEADROOM]) {
+ u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]);
+
+ if (headroom > LWT_BPF_MAX_HEADROOM) {
+ ret = -ERANGE;
+ goto errout;
+ }
+
+ newts->headroom = headroom;
+ }
+
+ bpf->family = family;
+ *ts = newts;
+
+ return 0;
+
+errout:
+ bpf_destroy_state(newts);
+ kfree(newts);
+ return ret;
+}
+
+static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr,
+ struct bpf_lwt_prog *prog)
+{
+ struct nlattr *nest;
+
+ if (!prog->prog)
+ return 0;
+
+ nest = nla_nest_start(skb, attr);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (prog->name &&
+ nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name))
+ return -EMSGSIZE;
+
+ return nla_nest_end(skb, nest);
+}
+
+static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt)
+{
+ struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
+
+ if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 ||
+ bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 ||
+ bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0)
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ int nest_len = nla_total_size(sizeof(struct nlattr)) +
+ nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */
+ 0;
+
+ return nest_len + /* LWT_BPF_IN */
+ nest_len + /* LWT_BPF_OUT */
+ nest_len + /* LWT_BPF_XMIT */
+ 0;
+}
+
+/* Compare two program slots by name.
+ *
+ * Returns 0 when both slots are unnamed or carry equal names, non-zero
+ * otherwise. File-local: the only caller is bpf_encap_cmp().
+ */
+static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
+{
+	/* FIXME:
+	 * The LWT state is currently rebuilt for delete requests which
+	 * results in a new bpf_prog instance. Comparing names for now.
+	 */
+	if (!a->name && !b->name)
+		return 0;
+
+	/* Exactly one side is unnamed: treat as different. */
+	if (!a->name || !b->name)
+		return 1;
+
+	return strcmp(a->name, b->name);
+}
+
+static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a);
+ struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b);
+
+ return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) ||
+ bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) ||
+ bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit);
+}
+
+static const struct lwtunnel_encap_ops bpf_encap_ops = {
+ .build_state = bpf_build_state,
+ .destroy_state = bpf_destroy_state,
+ .input = bpf_input,
+ .output = bpf_output,
+ .xmit = bpf_xmit,
+ .fill_encap = bpf_fill_encap_info,
+ .get_encap_size = bpf_encap_nlsize,
+ .cmp_encap = bpf_encap_cmp,
+};
+
+static int __init bpf_lwt_init(void)
+{
+ return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
+}
+
+subsys_initcall(bpf_lwt_init)
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 03976e9..a5d4e86 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -41,6 +41,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
return "ILA";
case LWTUNNEL_ENCAP_SEG6:
return "SEG6";
+ case LWTUNNEL_ENCAP_BPF:
+ return "BPF";
case LWTUNNEL_ENCAP_IP6:
case LWTUNNEL_ENCAP_IP:
case LWTUNNEL_ENCAP_NONE:
--
2.7.4
^ permalink raw reply related
* [PATCH net-next v4 2/4] route: Set lwtstate for local traffic and cached input dsts
From: Thomas Graf @ 2016-11-30 16:10 UTC (permalink / raw)
To: davem; +Cc: netdev, alexei.starovoitov, daniel, tom, roopa, hannes
In-Reply-To: <cover.1480522144.git.tgraf@suug.ch>
A route on the output path hitting a RTN_LOCAL route will keep the dst
associated on its way through the loopback device. On the receive path,
the dst_input() call will thus invoke the input handler of the route
created in the output path. Thus, lwt redirection for input must be done
for dsts allocated in the output path as well.
Also, if a route is cached in the input path, the allocated dst should
respect lwtunnel configuration on the nexthop as well.
Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
net/ipv4/route.c | 39 ++++++++++++++++++++++++++-------------
1 file changed, 26 insertions(+), 13 deletions(-)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 0c39b62..f4c4d7a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1602,6 +1602,19 @@ static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
spin_unlock_bh(&fnhe_lock);
}
+static void set_lwt_redirect(struct rtable *rth)
+{
+ if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
+ rth->dst.lwtstate->orig_output = rth->dst.output;
+ rth->dst.output = lwtunnel_output;
+ }
+
+ if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
+ rth->dst.lwtstate->orig_input = rth->dst.input;
+ rth->dst.input = lwtunnel_input;
+ }
+}
+
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
const struct fib_result *res,
@@ -1691,14 +1704,7 @@ static int __mkroute_input(struct sk_buff *skb,
rth->dst.input = ip_forward;
rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
- if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
- rth->dst.lwtstate->orig_output = rth->dst.output;
- rth->dst.output = lwtunnel_output;
- }
- if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
- rth->dst.lwtstate->orig_input = rth->dst.input;
- rth->dst.input = lwtunnel_input;
- }
+ set_lwt_redirect(rth);
skb_dst_set(skb, &rth->dst);
out:
err = 0;
@@ -1925,8 +1931,18 @@ out: return err;
rth->dst.error= -err;
rth->rt_flags &= ~RTCF_LOCAL;
}
+
if (do_cache) {
- if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
+ struct fib_nh *nh = &FIB_RES_NH(res);
+
+ rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
+ if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
+ WARN_ON(rth->dst.input == lwtunnel_input);
+ rth->dst.lwtstate->orig_input = rth->dst.input;
+ rth->dst.input = lwtunnel_input;
+ }
+
+ if (unlikely(!rt_cache_route(nh, rth))) {
rth->dst.flags |= DST_NOCACHE;
rt_add_uncached_list(rth);
}
@@ -2154,10 +2170,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
}
rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
- if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
- rth->dst.lwtstate->orig_output = rth->dst.output;
- rth->dst.output = lwtunnel_output;
- }
+ set_lwt_redirect(rth);
return rth;
}
--
2.7.4
^ permalink raw reply related
* [PATCH net-next v4 1/4] route: Set orig_output when redirecting to lwt on locally generated traffic
From: Thomas Graf @ 2016-11-30 16:10 UTC (permalink / raw)
To: davem; +Cc: netdev, alexei.starovoitov, daniel, tom, roopa, hannes
In-Reply-To: <cover.1480522144.git.tgraf@suug.ch>
orig_output for IPv4 was only set for dsts which hit an input route.
Set it consistently for locally generated traffic as well to allow
lwt to continue the dst_output() path as configured by the nexthop.
Fixes: 2536862311d ("lwt: Add support to redirect dst.input")
Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
net/ipv4/route.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d37fc6f..0c39b62 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2154,8 +2154,10 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
}
rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
- if (lwtunnel_output_redirect(rth->dst.lwtstate))
+ if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
+ rth->dst.lwtstate->orig_output = rth->dst.output;
rth->dst.output = lwtunnel_output;
+ }
return rth;
}
--
2.7.4
^ permalink raw reply related
* [PATCH net-next v4 0/4] bpf: BPF for lightweight tunnel encapsulation
From: Thomas Graf @ 2016-11-30 16:10 UTC (permalink / raw)
To: davem; +Cc: netdev, alexei.starovoitov, daniel, tom, roopa, hannes
This series implements BPF program invocation from dst entries via the
lightweight tunnels infrastructure. The BPF program can be attached to
lwtunnel_input(), lwtunnel_output() or lwtunnel_xmit() and see an L3
skb as context. Programs attached to input and output are read-only.
Programs attached to lwtunnel_xmit() can modify packet content, push headers
and redirect packets.
The facility can be used to:
- Collect statistics and generate sampling data for a subset of traffic
based on the dst utilized by the packet thus allowing to extend the
existing realms.
- Apply additional per route/dst filters to prohibit certain outgoing
or incoming packets based on BPF filters. In particular, this allows
to maintain per dst custom state across multiple packets in BPF maps
and apply filters based on statistics and behaviour observed over time.
- Attachment of L2 headers at transmit where resolving the L2 address
is not required.
- Possibly many more.
v3 -> v4:
- Bumped LWT_BPF_MAX_HEADROOM from 128 to 256 (Alexei)
- Renamed bpf_skb_push() helper to bpf_skb_change_head() to relate to
existing bpf_skb_change_tail() helper (Alexei/Daniel)
- Added check in __bpf_redirect_common() to verify that program added a
link header before redirecting to a l2 device. Adding the check to
lwt-bpf code was considered but dropped due to massive code required
due to retrieval of net_device via per-cpu redirect buffer. A test
case was added to cover the scenario when a program directs to an l2
device without adding an appropriate l2 header.
(Alexei)
- Prohibited access to tc_classid (Daniel)
- Collapsed bpf_verifier_ops instance for lwt in/out as they are
identical (Daniel)
- Some cosmetic changes
v2 -> v3:
- Added real world sample lwt_len_hist_kern.c which demonstrates how to
collect a histogram on packet sizes for all packets flowing through
a number of routes.
- Restricted output to be read-only. Since the header can no longer
be modified, the rerouting functionality has been removed again.
- Added test case which cover destructive modification of packet data.
v1 -> v2:
- Added new BPF_LWT_REROUTE return code for program to indicate
that new route lookup should be performed. Suggested by Tom.
- New sample to illustrate rerouting
- New patch 05: Recursion limit for lwtunnel_output for the case
when user creates circular dst redirection. Also resolves the
issue for ILA.
- Fix to ensure headroom for potential future L2 header is still
guaranteed
Thomas Graf (4):
route: Set orig_output when redirecting to lwt on locally generated
traffic
route: Set lwtstate for local traffic and cached input dsts
bpf: BPF for lightweight tunnel infrastructure
bpf: Add tests and samples for LWT-BPF
include/linux/filter.h | 2 +-
include/uapi/linux/bpf.h | 32 +++-
include/uapi/linux/lwtunnel.h | 23 +++
kernel/bpf/verifier.c | 14 +-
net/Kconfig | 8 +
net/core/Makefile | 1 +
net/core/filter.c | 173 +++++++++++++++++
net/core/lwt_bpf.c | 396 +++++++++++++++++++++++++++++++++++++++
net/core/lwtunnel.c | 2 +
net/ipv4/route.c | 37 ++--
samples/bpf/Makefile | 4 +
samples/bpf/bpf_helpers.h | 4 +
samples/bpf/lwt_len_hist.sh | 37 ++++
samples/bpf/lwt_len_hist_kern.c | 82 +++++++++
samples/bpf/lwt_len_hist_user.c | 76 ++++++++
samples/bpf/test_lwt_bpf.c | 253 +++++++++++++++++++++++++
samples/bpf/test_lwt_bpf.sh | 399 ++++++++++++++++++++++++++++++++++++++++
17 files changed, 1527 insertions(+), 16 deletions(-)
create mode 100644 net/core/lwt_bpf.c
create mode 100755 samples/bpf/lwt_len_hist.sh
create mode 100644 samples/bpf/lwt_len_hist_kern.c
create mode 100644 samples/bpf/lwt_len_hist_user.c
create mode 100644 samples/bpf/test_lwt_bpf.c
create mode 100755 samples/bpf/test_lwt_bpf.sh
--
2.7.4
^ permalink raw reply
* Re: [patch net / RFC] net: fec: increase frame size limitation to actually available buffer
From: Zefir Kurtisi @ 2016-11-30 16:05 UTC (permalink / raw)
To: Nikita Yushchenko, David S. Miller, Fugang Duan, Troy Kisky,
Andrew Lunn, Eric Nelson, Philippe Reynes, Johannes Berg, netdev
Cc: Chris Healy, Fabio Estevam, linux-kernel
In-Reply-To: <1480444528-30054-1-git-send-email-nikita.yoush@cogentembedded.com>
On 11/29/2016 07:35 PM, Nikita Yushchenko wrote:
> Fec driver uses Rx buffers of 2k, but programs hardware to limit
> incoming frames to 1522 bytes. This raises issues when FEC device
> is used with DSA (since DSA tag can make frame larger), and also
> disallows manual sending and receiving larger frames.
>
> This patch removes the limitation, allowing Rx size up to entire buffer.
> At the same time possible Tx size is increased as well, because hardware
> uses the same register field to limit Rx and Tx.
>
> Signed-off-by: Nikita Yushchenko <nikita.yoush@cogentembedded.com>
> ---
I recently did similar changes to the freescale/gianfar to prevent fragmentation
of (E)DSA frames [1].
With the fragmentation kicking in, I noticed a bug in the scatter-gather logic
which caused the frame-size of fragmented frames to be reported wrong [2]. Not
sure if the fec driver has SG capabilities and if this issue also applies, you
might want to double-check.
Cheers,
Zefir
[1] http://marc.info/?l=linux-netdev&m=147187438313519&w=2
[2] http://marc.info/?l=linux-netdev&m=147187430213484&w=2
^ permalink raw reply
* Re: [PATCH net-next V3 0/9] liquidio VF operations
From: David Miller @ 2016-11-30 16:03 UTC (permalink / raw)
To: rvatsavayi; +Cc: netdev
In-Reply-To: <1480380881-19255-1-git-send-email-rvatsavayi@caviumnetworks.com>
From: Raghu Vatsavayi <rvatsavayi@caviumnetworks.com>
Date: Mon, 28 Nov 2016 16:54:32 -0800
> This patchseries adds support for VF device specific operations
> like mailbox, queues and register access. This V3 patchset also
> has changes based on comments from earlier versions:
>
> 1) Removed extra 'void *' casting.
> 2) Fixed all cross compilations issues reported on S390 and Powerpc
> architectures.
>
> Please apply the patches in following order as these patches depend
> on each other.
Series applied, thanks.
^ permalink raw reply
* Re: [PATCH] net: ethernet: ti: cpsw: fix ASSERT_RTNL() warning during resume
From: Dave Gerlach @ 2016-11-30 16:03 UTC (permalink / raw)
To: Ivan Khoronzhuk, Grygorii Strashko
Cc: David S. Miller, netdev, Mugunthan V N, Sekhar Nori, linux-kernel,
linux-omap
In-Reply-To: <20161130001857.GB18604@khorivan>
On 11/29/2016 06:18 PM, Ivan Khoronzhuk wrote:
> On Tue, Nov 29, 2016 at 04:27:03PM -0600, Grygorii Strashko wrote:
>> netif_set_real_num_tx/rx_queues() are required to be called with rtnl_lock
>> taken, otherwise ASSERT_RTNL() warning will be triggered - which happens
>> now during System resume from suspend:
>> cpsw_resume()
>> |- cpsw_ndo_open()
>> |- netif_set_real_num_tx/rx_queues()
>> |- ASSERT_RTNL();
>>
>> Hence, fix it by surrounding cpsw_ndo_open() by rtnl_lock/unlock() calls.
>>
>> Cc: Dave Gerlach <d-gerlach@ti.com>
>> Cc: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
>> Fixes: commit e05107e6b747 ("net: ethernet: ti: cpsw: add multi queue support")
>> Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
> Reviewed-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
>
Tested-by: Dave Gerlach <d-gerlach@ti.com>
>> ---
>> drivers/net/ethernet/ti/cpsw.c | 4 ++++
>> 1 file changed, 4 insertions(+)
>>
>> diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
>> index ae1ec6a..fd6c03b 100644
>> --- a/drivers/net/ethernet/ti/cpsw.c
>> +++ b/drivers/net/ethernet/ti/cpsw.c
>> @@ -2944,6 +2944,8 @@ static int cpsw_resume(struct device *dev)
>> /* Select default pin state */
>> pinctrl_pm_select_default_state(dev);
>>
>> + /* shut up ASSERT_RTNL() warning in netif_set_real_num_tx/rx_queues */
>> + rtnl_lock();
>> if (cpsw->data.dual_emac) {
>> int i;
>>
>> @@ -2955,6 +2957,8 @@ static int cpsw_resume(struct device *dev)
>> if (netif_running(ndev))
>> cpsw_ndo_open(ndev);
>> }
>> + rtnl_unlock();
>> +
>> return 0;
>> }
>> #endif
>> --
>> 2.10.1
>>
^ permalink raw reply
* Re: [PATCH net] openvswitch: Fix skb leak in IPv6 reassembly.
From: David Miller @ 2016-11-30 16:02 UTC (permalink / raw)
To: diproiettod; +Cc: netdev, fw, pshelar, joe
In-Reply-To: <20161128234353.4262-1-diproiettod@ovn.org>
From: Daniele Di Proietto <diproiettod@ovn.org>
Date: Mon, 28 Nov 2016 15:43:53 -0800
> If nf_ct_frag6_gather() returns an error other than -EINPROGRESS, it
> means that we still have a reference to the skb. We should free it
> before returning from handle_fragments, as stated in the comment above.
>
> Fixes: daaa7d647f81 ("netfilter: ipv6: avoid nf_iterate recursion")
> CC: Florian Westphal <fw@strlen.de>
> CC: Pravin B Shelar <pshelar@ovn.org>
> CC: Joe Stringer <joe@ovn.org>
> Signed-off-by: Daniele Di Proietto <diproiettod@ovn.org>
Applied, thanks.
^ permalink raw reply
* Re: Regression: [PATCH] mlx4: give precise rx/tx bytes/packets counters
From: Eric Dumazet @ 2016-11-30 15:58 UTC (permalink / raw)
To: Jesper Dangaard Brouer; +Cc: David Miller, netdev, Tariq Toukan
In-Reply-To: <20161130150839.5203ece0@redhat.com>
On Wed, 2016-11-30 at 15:08 +0100, Jesper Dangaard Brouer wrote:
> On Fri, 25 Nov 2016 07:46:20 -0800 Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> > From: Eric Dumazet <edumazet@google.com>
>
> Ended up in net-next as:
>
> commit 40931b85113dad7881d49e8759e5ad41d30a5e6c
> Author: Eric Dumazet <edumazet@google.com>
> Date: Fri Nov 25 07:46:20 2016 -0800
>
> mlx4: give precise rx/tx bytes/packets counters
>
> mlx4 stats are chaotic because a deferred work queue is responsible
> to update them every 250 ms.
>
> Likely after this patch I get this crash (below), when rebooting my machine.
> Looks like a device removal order thing.
> Tested with net-next at commit 93ba22225504.
>
> [...]
> [ 1967.248453] mlx5_core 0000:02:00.1: Shutdown was called
> [ 1967.854556] mlx5_core 0000:02:00.0: Shutdown was called
> [ 1968.443015] e1000e: EEE TX LPI TIMER: 00000011
> [ 1968.484676] sd 3:0:0:0: [sda] Synchronizing SCSI cache
> [ 1968.528354] mlx4_core 0000:01:00.0: mlx4_shutdown was called
> [ 1968.534054] mlx4_en: mlx4p1: Close port called
> [ 1968.571156] mlx4_en 0000:01:00.0: removed PHC
> [ 1968.575677] mlx4_en: mlx4p2: Close port called
> [ 1969.506602] BUG: unable to handle kernel NULL pointer dereference at 0000000000000d08
> [ 1969.514530] IP: [<ffffffffa0127ca4>] mlx4_en_fold_software_stats.part.1+0x34/0xb0 [mlx4_en]
> [ 1969.522963] PGD 0 [ 1969.524803]
> [ 1969.526332] Oops: 0000 [#1] PREEMPT SMP
> [ 1969.530201] Modules linked in: coretemp kvm_intel kvm irqbypass intel_cstate mxm_wmi i2c_i801 intel_rapl_perf i2c_smbus sg pcspkr i2c_core shpchp nfsd wmi video acpi_pad auth_rpcgss oid_registry nfs_acl lockd grace sunrpc ip_tables x_tables mlx4_en e1000e mlx5_core ptp serio_raw sd_mod mlx4_core pps_core devlink hid_generic
> [ 1969.559616] CPU: 3 PID: 3104 Comm: kworker/3:1 Not tainted 4.9.0-rc6-net-next3-01390-g93ba22225504 #12
> [ 1969.568984] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z97 Extreme4, BIOS P2.10 05/12/2015
> [ 1969.578877] Workqueue: events linkwatch_event
> [ 1969.583285] task: ffff8803f42a0000 task.stack: ffff88040b2d0000
> [ 1969.589238] RIP: 0010:[<ffffffffa0127ca4>] [<ffffffffa0127ca4>] mlx4_en_fold_software_stats.part.1+0x34/0xb0 [mlx4_en]
> [ 1969.600102] RSP: 0018:ffff88040b2d3bd8 EFLAGS: 00010282
> [ 1969.605442] RAX: ffff8803f432efc8 RBX: ffff8803f4320000 RCX: 0000000000000000
> [ 1969.612604] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8803f4320000
> [ 1969.619772] RBP: ffff88040b2d3bd8 R08: 000000000000000c R09: ffff8803f432f000
> [ 1969.626938] R10: 0000000000000000 R11: ffff88040d64ac00 R12: ffff8803e5aff8dc
> [ 1969.634104] R13: ffff8803f4320a28 R14: ffff8803e5aff800 R15: 0000000000000000
> [ 1969.641273] FS: 0000000000000000(0000) GS:ffff88041fac0000(0000) knlGS:0000000000000000
> [ 1969.649422] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 1969.655197] CR2: 0000000000000d08 CR3: 0000000001c07000 CR4: 00000000001406e0
> [ 1969.662366] Stack:
> [ 1969.664412] ffff88040b2d3be8 ffffffffa0127f8e ffff88040b2d3c10 ffffffffa012a23b
> [ 1969.671948] ffff8803e5aff8dc ffff8803f4320000 ffff8803f4320000 ffff88040b2d3c30
> [ 1969.679478] ffffffff8160ae29 ffff8803e5aff8d8 ffff8804088ff300 ffff88040b2d3c58
> [ 1969.687001] Call Trace:
> [ 1969.689484] [<ffffffffa0127f8e>] mlx4_en_fold_software_stats+0x1e/0x20 [mlx4_en]
> [ 1969.697026] [<ffffffffa012a23b>] mlx4_en_get_stats64+0x2b/0x50 [mlx4_en]
> [ 1969.703844] [<ffffffff8160ae29>] dev_get_stats+0x39/0xa0
> [ 1969.709274] [<ffffffff81622470>] rtnl_fill_stats+0x40/0x130
> [ 1969.714968] [<ffffffff8162631b>] rtnl_fill_ifinfo+0x55b/0x1010
> [ 1969.720921] [<ffffffff816285d3>] rtmsg_ifinfo_build_skb+0x73/0xd0
> [ 1969.727136] [<ffffffff81628646>] rtmsg_ifinfo.part.25+0x16/0x50
> [ 1969.733176] [<ffffffff81628698>] rtmsg_ifinfo+0x18/0x20
> [ 1969.738522] [<ffffffff8160e947>] netdev_state_change+0x47/0x50
> [ 1969.744478] [<ffffffff81629018>] linkwatch_do_dev+0x38/0x50
> [ 1969.750170] [<ffffffff81629257>] __linkwatch_run_queue+0xe7/0x160
> [ 1969.756385] [<ffffffff816292f5>] linkwatch_event+0x25/0x30
> [ 1969.761991] [<ffffffff8107b3cb>] process_one_work+0x15b/0x460
> [ 1969.767857] [<ffffffff8107b71e>] worker_thread+0x4e/0x480
> [ 1969.773378] [<ffffffff8107b6d0>] ? process_one_work+0x460/0x460
> [ 1969.779420] [<ffffffff8107b6d0>] ? process_one_work+0x460/0x460
> [ 1969.785460] [<ffffffff810811ea>] kthread+0xca/0xe0
> [ 1969.790372] [<ffffffff81081120>] ? kthread_worker_fn+0x120/0x120
> [ 1969.796495] [<ffffffff817302d2>] ret_from_fork+0x22/0x30
> [ 1969.801924] Code: 00 00 55 48 89 e5 85 d2 0f 84 90 00 00 00 83 ea 01 31 c9 31 f6 48 8d 87 c0 ef 00 00 4c 8d 8c d7 c8 ef 00 00 48 8b 10 48 83 c0 08 <4c> 8b 82 08 0d 00 00 48 8b 92 00 0d 00 00 4c 01 c6 48 01 d1 4c
> [ 1969.821969] RIP [<ffffffffa0127ca4>] mlx4_en_fold_software_stats.part.1+0x34/0xb0 [mlx4_en]
> [ 1969.830486] RSP <ffff88040b2d3bd8>
> [ 1969.834002] CR2: 0000000000000d08
> [ 1969.837440] ---[ end trace 80b9fbc1e7baed9b ]---
> [ 1969.842102] Kernel panic - not syncing: Fatal exception in interrupt
> [ 1969.848520] Kernel Offset: disabled
> [ 1969.852050] ---[ end Kernel panic - not syncing: Fatal exception in interrupt
> (END)
Hi Jesper.
Thanks for the report.
Then we have a bug in the driver, deleting some memory too soon.
If we depend on having proper stats at device dismantle, we need to keep
around the TX/RX ring where counters are, until the point these stats
won't be required.
^ permalink raw reply
* Re: [PATCH net] ipv6: Allow IPv4-mapped address as next-hop
From: David Miller @ 2016-11-30 15:58 UTC (permalink / raw)
To: nordmark; +Cc: kuznet, jmorris, yoshfuji, kaber, netdev, gilligan
In-Reply-To: <76e59d76-7905-bb14-43d6-d4aaba51f0f9@arista.com>
From: Erik Nordmark <nordmark@arista.com>
Date: Mon, 28 Nov 2016 15:27:05 -0800
> @@ -1995,8 +1995,11 @@ static struct rt6_info *ip6_route_info_c
> It is very good, but in some (rare!) circumstances
> (SIT, PtP, NBMA NOARP links) it is handy to allow
> some exceptions. --ANK
> + We allow IPv4-mapped nexthops to support RFC4798-type
> + addressing
> */
This is definitely not indented and formatted correctly, please fix.
^ permalink raw reply
* Re: [PATCH net-next v2] ipv6 addrconf: Implemented enhanced DAD (RFC7527)
From: David Miller @ 2016-11-30 15:58 UTC (permalink / raw)
To: nordmark; +Cc: kuznet, jmorris, yoshfuji, kaber, netdev, gilligan, hannes
In-Reply-To: <4936dca6-c55d-809e-a032-f78e23ce6b49@arista.com>
From: Erik Nordmark <nordmark@arista.com>
Date: Mon, 28 Nov 2016 15:26:05 -0800
> Implemented RFC7527 Enhanced DAD.
> IPv6 duplicate address detection can fail if there is some temporary
> loopback of Ethernet frames. RFC7527 solves this by including a random
> nonce in the NS messages used for DAD, and if an NS is received with
> the
> same nonce it is assumed to be a looped back DAD probe and is ignored.
> RFC7527 is enabled by default. Can be disabled by setting both of
> conf/{all,interface}/enhanced_dad to zero.
>
> Signed-off-by: Erik Nordmark<nordmark@arista.com>
> Signed-off-by: Bob Gilligan<gilligan@arista.com>"
> ---
>
> v2: renamed sysctl and made it default to true, plus minor code review
> fixes
This doesn't apply to net-next, please respin.
Also, please format your signoffs correctly, there must be a space
between your name and the <> enclosed email address.
I always wonder why people elide spaces when formatting text, is
there a global shortage of space characters that I am unaware of?
^ permalink raw reply
* Re: [WIP] net+mlx4: auto doorbell
From: Eric Dumazet @ 2016-11-30 15:56 UTC (permalink / raw)
To: Jesper Dangaard Brouer
Cc: Rick Jones, netdev, Saeed Mahameed, Tariq Toukan, Achiad Shochat
In-Reply-To: <20161130123810.581ba21f@redhat.com>
On Wed, 2016-11-30 at 12:38 +0100, Jesper Dangaard Brouer wrote:
> I've played with a somewhat similar patch (from Achiad Shochat) for
> mlx5 (attached). While it gives huge improvements, the problem I ran
> into was that; TX performance became a function of the TX completion
> time/interrupt and could easily be throttled if configured too
> high/slow.
>
> Can your patch be affected by this too?
Like all TX business, you should know this Jesper.
No need to constantly remind us something very well known.
>
> Adjustable via:
> ethtool -C mlx5p2 tx-usecs 16 tx-frames 32
>
Generally speaking these settings impact TX, throughput/latencies.
Since NIC IRQ then trigger softirqs, we already know that IRQ handling
is critical, and some tuning can help, depending on particular load.
Now the trick is when you want a generic kernel, being deployed on hosts
shared by millions of jobs.
>
> On Mon, 28 Nov 2016 22:58:36 -0800 Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> > I have a WIP, that increases pktgen rate by 75 % on mlx4 when bulking is
> > not used.
> >
> > lpaa23:~# echo 0 >/sys/class/net/eth0/doorbell_opt
> > lpaa23:~# sar -n DEV 1 10|grep eth0
> [...]
> > Average: eth0 9.50 5707925.60 0.99 585285.69 0.00 0.00 0.50
> > lpaa23:~# echo 1 >/sys/class/net/eth0/doorbell_opt
> > lpaa23:~# sar -n DEV 1 10|grep eth0
> [...]
> > Average: eth0 2.40 9985214.90 0.31 1023874.60 0.00 0.00 0.50
>
> This +75% number is pktgen without "burst", and definitely shows that
> your patch activates xmit_more.
> What is the pps performance number when using pktgen "burst" option?
About the same really. About all packets now get the xmit_more effect.
>
>
> > And about 11 % improvement on a mono-flow UDP_STREAM test.
> >
> > skb_set_owner_w() is now the most consuming function.
> >
> >
> > lpaa23:~# ./udpsnd -4 -H 10.246.7.152 -d 2 &
> > [1] 13696
> > lpaa23:~# echo 0 >/sys/class/net/eth0/doorbell_opt
> > lpaa23:~# sar -n DEV 1 10|grep eth0
> [...]
> > Average: eth0 9.00 1307380.50 1.00 308356.18 0.00 0.00 0.50
> > lpaa23:~# echo 3 >/sys/class/net/eth0/doorbell_opt
> > lpaa23:~# sar -n DEV 1 10|grep eth0
> [...]
> > Average: eth0 3.10 1459558.20 0.44 344267.57 0.00 0.00 0.50
>
> The +11% number seems consistent with my perf observations that approx
> 12% was "fakely" spent on the xmit spin_lock.
>
>
> [...]
> > diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
> > index 4b597dca5c52..affebb435679 100644
> > --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
> > +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
> [...]
> > -static inline bool mlx4_en_is_tx_ring_full(struct mlx4_en_tx_ring *ring)
> > +static inline bool mlx4_en_is_tx_ring_full(const struct mlx4_en_tx_ring *ring)
> > {
> > - return ring->prod - ring->cons > ring->full_size;
> > + return READ_ONCE(ring->prod) - READ_ONCE(ring->cons) > ring->full_size;
> > }
> [...]
>
> > @@ -1033,6 +1058,14 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
> > }
> > send_doorbell = !skb->xmit_more || netif_xmit_stopped(ring->tx_queue);
> >
> > + /* Doorbell avoidance : We can omit doorbell if we know a TX completion
> > + * will happen shortly.
> > + */
> > + if (send_doorbell &&
> > + dev->doorbell_opt &&
> > + (s32)(READ_ONCE(ring->prod_bell) - READ_ONCE(ring->ncons)) > 0)
>
> It would be nice with a function call with an appropriate name, instead
> of an open-coded queue size check. I'm also confused by the "ncons" name.
>
> > + send_doorbell = false;
> > +
> [...]
>
> > diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
> > index 574bcbb1b38f..c3fd0deda198 100644
> > --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
> > +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
> > @@ -280,6 +280,7 @@ struct mlx4_en_tx_ring {
> > */
> > u32 last_nr_txbb;
> > u32 cons;
> > + u32 ncons;
>
> Maybe we can find a better name than "ncons" ?
That's because 'cons' in this driver really means 'old cons'
and new cons = old cons + last_nr_txbb;
>
> > unsigned long wake_queue;
> > struct netdev_queue *tx_queue;
> > u32 (*free_tx_desc)(struct mlx4_en_priv *priv,
> > @@ -290,6 +291,7 @@ struct mlx4_en_tx_ring {
> >
> > /* cache line used and dirtied in mlx4_en_xmit() */
> > u32 prod ____cacheline_aligned_in_smp;
> > + u32 prod_bell;
>
> Good descriptive variable name.
>
> > unsigned int tx_dropped;
> > unsigned long bytes;
> > unsigned long packets;
>
>
^ permalink raw reply
* Re: [WIP] net+mlx4: auto doorbell
From: Eric Dumazet @ 2016-11-30 15:50 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Jesper Dangaard Brouer, Rick Jones, netdev@vger.kernel.org,
Saeed Mahameed, Tariq Toukan
In-Reply-To: <CAADnVQLSxkeOkZMAq=PRdPevSLTC6un-pj_QKD1wNeFZSYY-Sw@mail.gmail.com>
On Tue, 2016-11-29 at 23:28 -0800, Alexei Starovoitov wrote:
> On Mon, Nov 28, 2016 at 10:58 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > {
> > @@ -496,8 +531,13 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev,
> > wmb();
> >
> > /* we want to dirty this cache line once */
> > - ACCESS_ONCE(ring->last_nr_txbb) = last_nr_txbb;
> > - ACCESS_ONCE(ring->cons) = ring_cons + txbbs_skipped;
> > + WRITE_ONCE(ring->last_nr_txbb, last_nr_txbb);
> > + ring_cons += txbbs_skipped;
> > + WRITE_ONCE(ring->cons, ring_cons);
> > + WRITE_ONCE(ring->ncons, ring_cons + last_nr_txbb);
> > +
> > + if (dev->doorbell_opt)
> > + mlx4_en_xmit_doorbell(dev, ring);
> ...
> > + /* Doorbell avoidance : We can omit doorbell if we know a TX completion
> > + * will happen shortly.
> > + */
> > + if (send_doorbell &&
> > + dev->doorbell_opt &&
> > + (s32)(READ_ONCE(ring->prod_bell) - READ_ONCE(ring->ncons)) > 0)
> > + send_doorbell = false;
>
> this is awesome idea!
> I'm missing though why you need all these read_once/write_once.
> It's a tx ring that is already protected by tx queue lock
> or completion is happening without grabbing the lock?
Yes, TX completion runs lockless, on any good/recent NIC driver.
> Then how do we make sure that we don't race here
> and indeed above prod_bell - ncons > 0 condition is safe?
That is why I have a cmpxchg()
> Good description of algorithm would certainly help ;)
Sure, when it is ready ;)
^ permalink raw reply
* Re: Aw: Re: [PATCH] mlx4: give precise rx/tx bytes/packets counters
From: Eric Dumazet @ 2016-11-30 15:47 UTC (permalink / raw)
To: David Laight; +Cc: Lino Sanfilippo, David Miller, netdev, Tariq Toukan
In-Reply-To: <063D6719AE5E284EB5DD2968C1650D6DB0230455@AcuExch.aculab.com>
On Wed, 2016-11-30 at 15:28 +0000, David Laight wrote:
> Are you sure??
Yes I am
> Last I looked gcc seemed to convert 'foo++' to 'foo = foo + 1' before
> generating any code.
Your gcc might need a refresh then.
> It might then optimise that back to a memory increment, but that would
> also happen if you'd coded the latter form.
>
> > Which is kind of unfortunate, given it is the fast path.
> >
> > Better add a comment, like :
> >
> > /* We should use WRITE_ONCE() to pair with the READ_ONCE() found in xxxx()
> > * But gcc would generate non optimal code.
> > */
>
> Actually while READ_ONCE() is generally useful - to get a snapshot of a changing value.
>
> WRITE_ONCE() isn't a pairing - the compiler is highly unlikely to write a
> location twice.
WOW. I can not believe what you just said.
We had numerous bugs because the compiler was writing temporary
computations to the location. Just take a look at git history to find
some gems.
> You might want an annotation to ensure it doesn't assume it can read the value
> back (write through volatile pointer). But that has nothing to do with how readers behave.
Completely wrong.
^ permalink raw reply
* Re: [WIP] net+mlx4: auto doorbell
From: Eric Dumazet @ 2016-11-30 15:44 UTC (permalink / raw)
To: Saeed Mahameed
Cc: Jesper Dangaard Brouer, Rick Jones, Linux Netdev List,
Saeed Mahameed, Tariq Toukan
In-Reply-To: <CALzJLG_G3G-_jFFD0xiAOfR-k69rhb_2ia4LHZroE-oEFxApnQ@mail.gmail.com>
On Wed, 2016-11-30 at 15:50 +0200, Saeed Mahameed wrote:
> On Tue, Nov 29, 2016 at 8:58 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > On Mon, 2016-11-21 at 10:10 -0800, Eric Dumazet wrote:
> >
> >
> >> Not sure if this has been tried before, but the doorbell avoidance could
> >> be done by the driver itself, because it knows a TX completion will come
> >> shortly (well... if softirqs are not delayed too much !)
> >>
> >> Doorbell would be forced only if :
> >>
> >> ( "skb->xmit_more is not set" AND "TX engine is not 'started yet'" )
> >> OR
> >> ( too many [1] packets were put in TX ring buffer, no point deferring
> >> more)
> >>
> >> Start the pump, but once it is started, let the doorbells be done by
> >> TX completion.
> >>
> >> ndo_start_xmit and TX completion handler would have to maintain a shared
> >> state describing if packets were ready but doorbell deferred.
> >>
> >>
> >> Note that TX completion means "if at least one packet was drained",
> >> otherwise busy polling, constantly calling napi->poll() would force a
> >> doorbell too soon for devices sharing a NAPI for both RX and TX.
> >>
> >> But then, maybe busy poll would like to force a doorbell...
> >>
> >> I could try these ideas on mlx4 shortly.
> >>
> >>
> >> [1] limit could be derived from active "ethtool -c" params, eg tx-frames
> >
> > I have a WIP, that increases pktgen rate by 75 % on mlx4 when bulking is
> > not used.
>
> Hi Eric, Nice Idea indeed and we need something like this,
> today we almost don't exploit the TX bulking at all.
>
> But please see below, i am not sure different contexts should share
> the doorbell ringing, it is really risky.
>
> > drivers/net/ethernet/mellanox/mlx4/en_rx.c | 2
> > drivers/net/ethernet/mellanox/mlx4/en_tx.c | 90 +++++++++++------
> > drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 4
> > include/linux/netdevice.h | 1
> > net/core/net-sysfs.c | 18 +++
> > 5 files changed, 83 insertions(+), 32 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> > index 6562f78b07f4..fbea83218fc0 100644
> > --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> > +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> > @@ -1089,7 +1089,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
> >
> > if (polled) {
> > if (doorbell_pending)
> > - mlx4_en_xmit_doorbell(priv->tx_ring[TX_XDP][cq->ring]);
> > + mlx4_en_xmit_doorbell(dev, priv->tx_ring[TX_XDP][cq->ring]);
> >
> > mlx4_cq_set_ci(&cq->mcq);
> > wmb(); /* ensure HW sees CQ consumer before we post new buffers */
> > diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
> > index 4b597dca5c52..affebb435679 100644
> > --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
> > +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
> > @@ -67,7 +67,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
> > ring->size = size;
> > ring->size_mask = size - 1;
> > ring->sp_stride = stride;
> > - ring->full_size = ring->size - HEADROOM - MAX_DESC_TXBBS;
> > + ring->full_size = ring->size - HEADROOM - 2*MAX_DESC_TXBBS;
> >
> > tmp = size * sizeof(struct mlx4_en_tx_info);
> > ring->tx_info = kmalloc_node(tmp, GFP_KERNEL | __GFP_NOWARN, node);
> > @@ -193,6 +193,7 @@ int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
> > ring->sp_cqn = cq;
> > ring->prod = 0;
> > ring->cons = 0xffffffff;
> > + ring->ncons = 0;
> > ring->last_nr_txbb = 1;
> > memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info));
> > memset(ring->buf, 0, ring->buf_size);
> > @@ -227,9 +228,9 @@ void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv,
> > MLX4_QP_STATE_RST, NULL, 0, 0, &ring->sp_qp);
> > }
> >
> > -static inline bool mlx4_en_is_tx_ring_full(struct mlx4_en_tx_ring *ring)
> > +static inline bool mlx4_en_is_tx_ring_full(const struct mlx4_en_tx_ring *ring)
> > {
> > - return ring->prod - ring->cons > ring->full_size;
> > + return READ_ONCE(ring->prod) - READ_ONCE(ring->cons) > ring->full_size;
> > }
> >
> > static void mlx4_en_stamp_wqe(struct mlx4_en_priv *priv,
> > @@ -374,6 +375,7 @@ int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
> >
> > /* Skip last polled descriptor */
> > ring->cons += ring->last_nr_txbb;
> > + ring->ncons += ring->last_nr_txbb;
> > en_dbg(DRV, priv, "Freeing Tx buf - cons:0x%x prod:0x%x\n",
> > ring->cons, ring->prod);
> >
> > @@ -389,6 +391,7 @@ int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
> > !!(ring->cons & ring->size), 0,
> > 0 /* Non-NAPI caller */);
> > ring->cons += ring->last_nr_txbb;
> > + ring->ncons += ring->last_nr_txbb;
> > cnt++;
> > }
> >
> > @@ -401,6 +404,38 @@ int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
> > return cnt;
> > }
> >
> > +void mlx4_en_xmit_doorbell(const struct net_device *dev,
> > + struct mlx4_en_tx_ring *ring)
> > +{
> > +
> > + if (dev->doorbell_opt & 1) {
> > + u32 oval = READ_ONCE(ring->prod_bell);
> > + u32 nval = READ_ONCE(ring->prod);
> > +
> > + if (oval == nval)
> > + return;
> > +
> > + /* I can not tell yet if a cmpxchg() is needed or not */
> > + if (dev->doorbell_opt & 2)
> > + WRITE_ONCE(ring->prod_bell, nval);
> > + else
> > + if (cmpxchg(&ring->prod_bell, oval, nval) != oval)
> > + return;
> > + }
> > + /* Since there is no iowrite*_native() that writes the
> > + * value as is, without byteswapping - using the one
> > + * the doesn't do byteswapping in the relevant arch
> > + * endianness.
> > + */
> > +#if defined(__LITTLE_ENDIAN)
> > + iowrite32(
> > +#else
> > + iowrite32be(
> > +#endif
> > + ring->doorbell_qpn,
> > + ring->bf.uar->map + MLX4_SEND_DOORBELL);
> > +}
> > +
> > static bool mlx4_en_process_tx_cq(struct net_device *dev,
> > struct mlx4_en_cq *cq, int napi_budget)
> > {
> > @@ -496,8 +531,13 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev,
> > wmb();
> >
> > /* we want to dirty this cache line once */
> > - ACCESS_ONCE(ring->last_nr_txbb) = last_nr_txbb;
> > - ACCESS_ONCE(ring->cons) = ring_cons + txbbs_skipped;
> > + WRITE_ONCE(ring->last_nr_txbb, last_nr_txbb);
> > + ring_cons += txbbs_skipped;
> > + WRITE_ONCE(ring->cons, ring_cons);
> > + WRITE_ONCE(ring->ncons, ring_cons + last_nr_txbb);
> > +
> > + if (dev->doorbell_opt)
> > + mlx4_en_xmit_doorbell(dev, ring);
> >
> > if (ring->free_tx_desc == mlx4_en_recycle_tx_desc)
> > return done < budget;
> > @@ -725,29 +765,14 @@ static void mlx4_bf_copy(void __iomem *dst, const void *src,
> > __iowrite64_copy(dst, src, bytecnt / 8);
> > }
> >
> > -void mlx4_en_xmit_doorbell(struct mlx4_en_tx_ring *ring)
> > -{
> > - wmb();
>
> you missed/removed this "wmb()" in the new function, why ? where did
> you compensate for it ?
I removed it because I had a cmpxchg() there if the barrier was needed.
My patch is a WIP, where you can set the bit 2 to ask to replace the
cmpxchg() by a simple write, only for performance testing/comparisons.
>
> > - /* Since there is no iowrite*_native() that writes the
> > - * value as is, without byteswapping - using the one
> > - * the doesn't do byteswapping in the relevant arch
> > - * endianness.
> > - */
> > -#if defined(__LITTLE_ENDIAN)
> > - iowrite32(
> > -#else
> > - iowrite32be(
> > -#endif
> > - ring->doorbell_qpn,
> > - ring->bf.uar->map + MLX4_SEND_DOORBELL);
> > -}
> >
> > static void mlx4_en_tx_write_desc(struct mlx4_en_tx_ring *ring,
> > struct mlx4_en_tx_desc *tx_desc,
> > union mlx4_wqe_qpn_vlan qpn_vlan,
> > int desc_size, int bf_index,
> > __be32 op_own, bool bf_ok,
> > - bool send_doorbell)
> > + bool send_doorbell,
> > + const struct net_device *dev, int nr_txbb)
> > {
> > tx_desc->ctrl.qpn_vlan = qpn_vlan;
> >
> > @@ -761,6 +786,7 @@ static void mlx4_en_tx_write_desc(struct mlx4_en_tx_ring *ring,
> >
> > wmb();
> >
> > + ring->prod += nr_txbb;
> > mlx4_bf_copy(ring->bf.reg + ring->bf.offset, &tx_desc->ctrl,
> > desc_size);
> >
> > @@ -773,8 +799,9 @@ static void mlx4_en_tx_write_desc(struct mlx4_en_tx_ring *ring,
> > */
> > dma_wmb();
> > tx_desc->ctrl.owner_opcode = op_own;
> > + ring->prod += nr_txbb;
> > if (send_doorbell)
> > - mlx4_en_xmit_doorbell(ring);
> > + mlx4_en_xmit_doorbell(dev, ring);
> > else
> > ring->xmit_more++;
> > }
> > @@ -1017,8 +1044,6 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
> > op_own |= cpu_to_be32(MLX4_WQE_CTRL_IIP);
> > }
> >
> > - ring->prod += nr_txbb;
> > -
> > /* If we used a bounce buffer then copy descriptor back into place */
> > if (unlikely(bounce))
> > tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size);
> > @@ -1033,6 +1058,14 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
> > }
> > send_doorbell = !skb->xmit_more || netif_xmit_stopped(ring->tx_queue);
> >
> > + /* Doorbell avoidance : We can omit doorbell if we know a TX completion
> > + * will happen shortly.
> > + */
> > + if (send_doorbell &&
> > + dev->doorbell_opt &&
> > + (s32)(READ_ONCE(ring->prod_bell) - READ_ONCE(ring->ncons)) > 0)
>
> Alexei already expressed his worries about synchronization, and I
> think here (in this exact line) sits the problem,
> what about if exactly at this point the TX completion handler just
> finished and rang the last doorbell,
> you didn't write the new TX descriptor yet (mlx4_en_tx_write_desc), so
> the last doorbell from the CQ handler missed it.
> even if you wrote the TX descriptor before the doorbell decision here,
> you will need a memory barrier to make sure the HW sees
> the new packet, which was typically done before ringing the doorbell.
My patch is a WIP, meaning it is not completed ;)
Surely we can find a non racy way to handle this.
> All in all, this is risky business :), the right way to go is to
> force the upper layer to use xmit-more and delay doorbells/use bulking
> but from the same context
> (xmit routine). For example see Achiad's suggestion (attached in
> Jesper's response), he used stop queue to force the stack to queue up
> packets (TX bulking)
> which would set xmit-more and will use the next completion to release
> the "stopped" ring TXQ rather than hit the doorbell on behalf of it.
Well, you depend on having a higher level queue like a qdisc.
Some users do not use a qdisc.
If you stop the queue, they no longer can send anything -> drops.
T
^ permalink raw reply
* Re: [patch net-next v3 11/12] mlxsw: spectrum_router: Request a dump of FIB tables during init
From: Hannes Frederic Sowa @ 2016-11-30 15:37 UTC (permalink / raw)
To: Jiri Pirko, netdev
Cc: davem, idosch, eladr, yotamg, nogahf, arkadis, ogerlitz, roopa,
dsa, nikolay, andy, vivien.didelot, andrew, f.fainelli,
alexander.h.duyck, kaber
In-Reply-To: <1480500546-2544-12-git-send-email-jiri@resnulli.us>
On 30.11.2016 11:09, Jiri Pirko wrote:
> From: Ido Schimmel <idosch@mellanox.com>
>
> Make sure the device has a complete view of the FIB tables by invoking
> their dump during module init.
>
> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
> ---
> .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 23 ++++++++++++++++++++++
> 1 file changed, 23 insertions(+)
>
> diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
> index 14bed1d..d176047 100644
> --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
> +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
> @@ -2027,8 +2027,23 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
> return NOTIFY_DONE;
> }
>
> +static void mlxsw_sp_router_fib_dump_flush(struct notifier_block *nb)
> +{
> + struct mlxsw_sp *mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb);
> +
> + /* Flush pending FIB notifications and then flush the device's
> + * table before requesting another dump. Do that with RTNL held,
> + * as FIB notification block is already registered.
> + */
> + mlxsw_core_flush_owq();
> + rtnl_lock();
> + mlxsw_sp_router_fib_flush(mlxsw_sp);
> + rtnl_unlock();
> +}
> +
> int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
> {
> + fib_dump_cb_t *cb = mlxsw_sp_router_fib_dump_flush;
> int err;
>
> INIT_LIST_HEAD(&mlxsw_sp->router.nexthop_neighs_list);
> @@ -2048,8 +2063,16 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
>
> mlxsw_sp->fib_nb.notifier_call = mlxsw_sp_router_fib_event;
> register_fib_notifier(&mlxsw_sp->fib_nb);
Sorry to pick in here again:
There is a race here. You need to protect the registration of the fib
notifier as well by the sequence counter. Updates here are not ordered
in relation to this code below.
I think just move the register notification into the fib_notifier_dump
function, rename it to fib_notifier_init and use it here:
> + if (!fib_notifier_dump(&mlxsw_sp->fib_nb, &init_net, cb)) {
> + err = -EBUSY;
> + goto err_fib_notifier_dump;
> + }
> +
> return 0;
Thanks,
Hannes
^ permalink raw reply
* [PATCH net-next 7/8] net/mlx5e: Save the represntor netdevice as part of the representor
From: Hadar Hen Zion @ 2016-11-30 14:41 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Saeed Mahameed, Jiri Pirko, Amir Vadai, Or Gerlitz,
Roi Dayan, Hadar Hen Zion
In-Reply-To: <1480516895-29545-1-git-send-email-hadarh@mellanox.com>
Replace the representor private data with a net_device pointer holding the
representor netdevice, instead of a void pointer holding mlx5e_priv.
It will be used by a new eswitch service function that returns the uplink
representor netdevice.
Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +-
drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 15 ++++++++-------
drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 3 ++-
.../net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 12 +++++++++++-
4 files changed, 22 insertions(+), 10 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 6b492ca..37c0d84 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3796,7 +3796,7 @@ static void mlx5e_nic_enable(struct mlx5e_priv *priv)
rep.load = mlx5e_nic_rep_load;
rep.unload = mlx5e_nic_rep_unload;
rep.vport = FDB_UPLINK_VPORT;
- rep.priv_data = priv;
+ rep.netdev = netdev;
mlx5_eswitch_register_vport_rep(esw, 0, &rep);
}
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 9b1e351..0868677 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -208,7 +208,8 @@ int mlx5e_add_sqs_fwd_rules(struct mlx5e_priv *priv)
int mlx5e_nic_rep_load(struct mlx5_eswitch *esw, struct mlx5_eswitch_rep *rep)
{
- struct mlx5e_priv *priv = rep->priv_data;
+ struct net_device *netdev = rep->netdev;
+ struct mlx5e_priv *priv = netdev_priv(netdev);
if (test_bit(MLX5E_STATE_OPENED, &priv->state))
return mlx5e_add_sqs_fwd_rules(priv);
@@ -226,7 +227,8 @@ void mlx5e_remove_sqs_fwd_rules(struct mlx5e_priv *priv)
void mlx5e_nic_rep_unload(struct mlx5_eswitch *esw,
struct mlx5_eswitch_rep *rep)
{
- struct mlx5e_priv *priv = rep->priv_data;
+ struct net_device *netdev = rep->netdev;
+ struct mlx5e_priv *priv = netdev_priv(netdev);
if (test_bit(MLX5E_STATE_OPENED, &priv->state))
mlx5e_remove_sqs_fwd_rules(priv);
@@ -555,7 +557,7 @@ int mlx5e_vport_rep_load(struct mlx5_eswitch *esw,
return -EINVAL;
}
- rep->priv_data = netdev_priv(netdev);
+ rep->netdev = netdev;
err = mlx5e_attach_netdev(esw->dev, netdev);
if (err) {
@@ -577,7 +579,7 @@ int mlx5e_vport_rep_load(struct mlx5_eswitch *esw,
mlx5e_detach_netdev(esw->dev, netdev);
err_destroy_netdev:
- mlx5e_destroy_netdev(esw->dev, rep->priv_data);
+ mlx5e_destroy_netdev(esw->dev, netdev_priv(netdev));
return err;
@@ -586,10 +588,9 @@ int mlx5e_vport_rep_load(struct mlx5_eswitch *esw,
void mlx5e_vport_rep_unload(struct mlx5_eswitch *esw,
struct mlx5_eswitch_rep *rep)
{
- struct mlx5e_priv *priv = rep->priv_data;
- struct net_device *netdev = priv->netdev;
+ struct net_device *netdev = rep->netdev;
unregister_netdev(netdev);
mlx5e_detach_netdev(esw->dev, netdev);
- mlx5e_destroy_netdev(esw->dev, priv);
+ mlx5e_destroy_netdev(esw->dev, netdev_priv(netdev));
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index cf1aa56..8661dd3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -186,7 +186,7 @@ struct mlx5_eswitch_rep {
struct mlx5_eswitch_rep *rep);
u16 vport;
u8 hw_id[ETH_ALEN];
- void *priv_data;
+ struct net_device *netdev;
struct mlx5_flow_handle *vport_rx_rule;
struct list_head vport_sqs_list;
@@ -318,6 +318,7 @@ void mlx5_eswitch_register_vport_rep(struct mlx5_eswitch *esw,
struct mlx5_eswitch_rep *rep);
void mlx5_eswitch_unregister_vport_rep(struct mlx5_eswitch *esw,
int vport_index);
+struct net_device *mlx5_eswitch_get_uplink_netdev(struct mlx5_eswitch *esw);
int mlx5_eswitch_add_vlan_action(struct mlx5_eswitch *esw,
struct mlx5_esw_flow_attr *attr);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 5c01550..466e161 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -970,7 +970,7 @@ void mlx5_eswitch_register_vport_rep(struct mlx5_eswitch *esw,
rep->load = __rep->load;
rep->unload = __rep->unload;
rep->vport = __rep->vport;
- rep->priv_data = __rep->priv_data;
+ rep->netdev = __rep->netdev;
ether_addr_copy(rep->hw_id, __rep->hw_id);
INIT_LIST_HEAD(&rep->vport_sqs_list);
@@ -990,3 +990,13 @@ void mlx5_eswitch_unregister_vport_rep(struct mlx5_eswitch *esw,
rep->valid = false;
}
+
+struct net_device *mlx5_eswitch_get_uplink_netdev(struct mlx5_eswitch *esw)
+{
+#define UPLINK_REP_INDEX 0
+ struct mlx5_esw_offload *offloads = &esw->offloads;
+ struct mlx5_eswitch_rep *rep;
+
+ rep = &offloads->vport_reps[UPLINK_REP_INDEX];
+ return rep->netdev;
+}
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next 5/8] net/sched: cls_flower: Add offload support using egress Hardware device
From: Hadar Hen Zion @ 2016-11-30 14:41 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Saeed Mahameed, Jiri Pirko, Amir Vadai, Or Gerlitz,
Roi Dayan, Hadar Hen Zion
In-Reply-To: <1480516895-29545-1-git-send-email-hadarh@mellanox.com>
In order to support hardware offloading when the device given by the tc
rule is different from the underlying hardware device, extract the mirred
(egress) device from the tc action when a filter is added, using the new
tc_action_ops callback, get_dev().
Flower caches the information about the mirred device and uses it for
calling ndo_setup_tc in filter change, update stats and delete.
Calling ndo_setup_tc of the mirred (egress) device instead of the
ingress device will allow a resolution between the software ingress
device and the underlying hardware device.
The resolution will take place inside the offloading driver using the
'egress_dev' flag added to the tc_to_netdev struct, which is provided to
the offloading driver.
Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
include/linux/netdevice.h | 1 +
include/net/pkt_cls.h | 2 ++
net/sched/cls_api.c | 22 ++++++++++++++++++++++
net/sched/cls_flower.c | 41 ++++++++++++++++++++++++-----------------
4 files changed, 49 insertions(+), 17 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4ffcd87..00c351c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -802,6 +802,7 @@ struct tc_to_netdev {
struct tc_cls_matchall_offload *cls_mall;
struct tc_cls_bpf_offload *cls_bpf;
};
+ bool egress_dev;
};
/* These structures hold the attributes of xdp state that are being passed
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 45ad9aa..f0a0514 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -171,6 +171,8 @@ void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,
struct tcf_exts *src);
int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts);
int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts);
+int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts,
+ struct net_device **hw_dev);
/**
* struct tcf_pkt_info - packet information
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index b05d4a2..e7aeab9 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -682,6 +682,28 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)
}
EXPORT_SYMBOL(tcf_exts_dump_stats);
+int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts,
+ struct net_device **hw_dev)
+{
+ const struct tc_action *a;
+ LIST_HEAD(actions);
+
+ if (tc_no_actions(exts))
+ return -EINVAL;
+
+ tcf_exts_to_list(exts, &actions);
+ list_for_each_entry(a, &actions, list) {
+ if (a->ops->get_dev) {
+ a->ops->get_dev(a, dev_net(dev), hw_dev);
+ break;
+ }
+ }
+ if (*hw_dev)
+ return 0;
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(tcf_exts_get_dev);
+
static int __init tc_filter_init(void)
{
rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, NULL);
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 13b349f..1cacfa5 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -78,6 +78,8 @@ struct cls_fl_filter {
u32 handle;
u32 flags;
struct rcu_head rcu;
+ struct tc_to_netdev tc;
+ struct net_device *hw_dev;
};
static unsigned short int fl_mask_range(const struct fl_flow_mask *mask)
@@ -203,9 +205,9 @@ static void fl_destroy_filter(struct rcu_head *head)
static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f)
{
- struct net_device *dev = tp->q->dev_queue->dev;
struct tc_cls_flower_offload offload = {0};
- struct tc_to_netdev tc;
+ struct net_device *dev = f->hw_dev;
+ struct tc_to_netdev *tc = &f->tc;
if (!tc_can_offload(dev, tp))
return;
@@ -213,10 +215,10 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f)
offload.command = TC_CLSFLOWER_DESTROY;
offload.cookie = (unsigned long)f;
- tc.type = TC_SETUP_CLSFLOWER;
- tc.cls_flower = &offload;
+ tc->type = TC_SETUP_CLSFLOWER;
+ tc->cls_flower = &offload;
- dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+ dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, tc);
}
static int fl_hw_replace_filter(struct tcf_proto *tp,
@@ -226,11 +228,17 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
{
struct net_device *dev = tp->q->dev_queue->dev;
struct tc_cls_flower_offload offload = {0};
- struct tc_to_netdev tc;
+ struct tc_to_netdev *tc = &f->tc;
int err;
- if (!tc_can_offload(dev, tp))
- return tc_skip_sw(f->flags) ? -EINVAL : 0;
+ if (!tc_can_offload(dev, tp)) {
+ if (tcf_exts_get_dev(dev, &f->exts, &f->hw_dev))
+ return tc_skip_sw(f->flags) ? -EINVAL : 0;
+ dev = f->hw_dev;
+ tc->egress_dev = true;
+ } else {
+ f->hw_dev = dev;
+ }
offload.command = TC_CLSFLOWER_REPLACE;
offload.cookie = (unsigned long)f;
@@ -239,23 +247,22 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
offload.key = &f->key;
offload.exts = &f->exts;
- tc.type = TC_SETUP_CLSFLOWER;
- tc.cls_flower = &offload;
+ tc->type = TC_SETUP_CLSFLOWER;
+ tc->cls_flower = &offload;
err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
- &tc);
+ tc);
if (tc_skip_sw(f->flags))
return err;
-
return 0;
}
static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
{
- struct net_device *dev = tp->q->dev_queue->dev;
struct tc_cls_flower_offload offload = {0};
- struct tc_to_netdev tc;
+ struct net_device *dev = f->hw_dev;
+ struct tc_to_netdev *tc = &f->tc;
if (!tc_can_offload(dev, tp))
return;
@@ -264,10 +271,10 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
offload.cookie = (unsigned long)f;
offload.exts = &f->exts;
- tc.type = TC_SETUP_CLSFLOWER;
- tc.cls_flower = &offload;
+ tc->type = TC_SETUP_CLSFLOWER;
+ tc->cls_flower = &offload;
- dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+ dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, tc);
}
static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f)
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next 8/8] net/mlx5e: Support adding ingress tc rule when egress device flag is set
From: Hadar Hen Zion @ 2016-11-30 14:41 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Saeed Mahameed, Jiri Pirko, Amir Vadai, Or Gerlitz,
Roi Dayan, Hadar Hen Zion
In-Reply-To: <1480516895-29545-1-git-send-email-hadarh@mellanox.com>
When ndo_setup_tc is called with an egress_dev flag set, it means that
the ndo call was executed on the mirred action (egress) device and not
on the ingress device.
In order to support this kind of ndo_setup_tc call, and insert the
correct decap rule to the hardware, the uplink device on the same eswitch
should be found.
Currently, we use this resolution between the mirred device and the
uplink on the same eswitch to offload vxlan shared device decap rules.
Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 0868677..8503788 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -289,6 +289,14 @@ static int mlx5e_rep_ndo_setup_tc(struct net_device *dev, u32 handle,
if (TC_H_MAJ(handle) != TC_H_MAJ(TC_H_INGRESS))
return -EOPNOTSUPP;
+ if (tc->egress_dev) {
+ struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+ struct net_device *uplink_dev = mlx5_eswitch_get_uplink_netdev(esw);
+
+ return uplink_dev->netdev_ops->ndo_setup_tc(uplink_dev, handle,
+ proto, tc);
+ }
+
switch (tc->type) {
case TC_SETUP_CLSFLOWER:
switch (tc->cls_flower->command) {
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next 6/8] net/mlx5e: Bring back representor's ndos that were accidentally removed
From: Hadar Hen Zion @ 2016-11-30 14:41 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Saeed Mahameed, Jiri Pirko, Amir Vadai, Or Gerlitz,
Roi Dayan, Hadar Hen Zion
In-Reply-To: <1480516895-29545-1-git-send-email-hadarh@mellanox.com>
The VF Representor udp tunnel ndo entries were removed by mistake,
restore them.
Fixes: 370bad0f9a52 ('net/mlx5e: Support HW (offloaded) and SW counters for SRIOV switchdev mode')
Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 5e33f6b..9b1e351 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -384,6 +384,8 @@ int mlx5e_get_offload_stats(int attr_id, const struct net_device *dev,
.ndo_get_phys_port_name = mlx5e_rep_get_phys_port_name,
.ndo_setup_tc = mlx5e_rep_ndo_setup_tc,
.ndo_get_stats64 = mlx5e_rep_get_stats,
+ .ndo_udp_tunnel_add = mlx5e_add_vxlan_port,
+ .ndo_udp_tunnel_del = mlx5e_del_vxlan_port,
.ndo_has_offload_stats = mlx5e_has_offload_stats,
.ndo_get_offload_stats = mlx5e_get_offload_stats,
};
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next 2/8] net/sched: cls_flower: Try to offload only if skip_hw flag isn't set
From: Hadar Hen Zion @ 2016-11-30 14:41 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Saeed Mahameed, Jiri Pirko, Amir Vadai, Or Gerlitz,
Roi Dayan, Hadar Hen Zion
In-Reply-To: <1480516895-29545-1-git-send-email-hadarh@mellanox.com>
Check skip_hw flag isn't set before calling
fl_hw_{replace/destroy}_filter and fl_hw_update_stats functions.
Replace the call to tc_should_offload with tc_can_offload.
tc_can_offload only checks if the device supports offloading, the check for
skip_hw flag is done earlier in the flow.
Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
net/sched/cls_flower.c | 35 ++++++++++++++++++++---------------
1 file changed, 20 insertions(+), 15 deletions(-)
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index e8dd09a..5e70f65 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -207,7 +207,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie)
struct tc_cls_flower_offload offload = {0};
struct tc_to_netdev tc;
- if (!tc_should_offload(dev, tp, 0))
+ if (!tc_can_offload(dev, tp))
return;
offload.command = TC_CLSFLOWER_DESTROY;
@@ -231,7 +231,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
struct tc_to_netdev tc;
int err;
- if (!tc_should_offload(dev, tp, flags))
+ if (!tc_can_offload(dev, tp))
return tc_skip_sw(flags) ? -EINVAL : 0;
offload.command = TC_CLSFLOWER_REPLACE;
@@ -259,7 +259,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
struct tc_cls_flower_offload offload = {0};
struct tc_to_netdev tc;
- if (!tc_should_offload(dev, tp, 0))
+ if (!tc_can_offload(dev, tp))
return;
offload.command = TC_CLSFLOWER_STATS;
@@ -275,7 +275,8 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f)
{
list_del_rcu(&f->list);
- fl_hw_destroy_filter(tp, (unsigned long)f);
+ if (!tc_skip_hw(f->flags))
+ fl_hw_destroy_filter(tp, (unsigned long)f);
tcf_unbind_filter(tp, &f->res);
call_rcu(&f->rcu, fl_destroy_filter);
}
@@ -743,20 +744,23 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
goto errout;
}
- err = fl_hw_replace_filter(tp,
- &head->dissector,
- &mask.key,
- &fnew->key,
- &fnew->exts,
- (unsigned long)fnew,
- fnew->flags);
- if (err)
- goto errout;
+ if (!tc_skip_hw(fnew->flags)) {
+ err = fl_hw_replace_filter(tp,
+ &head->dissector,
+ &mask.key,
+ &fnew->key,
+ &fnew->exts,
+ (unsigned long)fnew,
+ fnew->flags);
+ if (err)
+ goto errout;
+ }
if (fold) {
rhashtable_remove_fast(&head->ht, &fold->ht_node,
head->ht_params);
- fl_hw_destroy_filter(tp, (unsigned long)fold);
+ if (!tc_skip_hw(fold->flags))
+ fl_hw_destroy_filter(tp, (unsigned long)fold);
}
*arg = (unsigned long) fnew;
@@ -879,7 +883,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
goto nla_put_failure;
}
- fl_hw_update_stats(tp, f);
+ if (!tc_skip_hw(f->flags))
+ fl_hw_update_stats(tp, f);
if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST,
mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK,
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next 4/8] net/sched: act_mirred: Add new tc_action_ops get_dev()
From: Hadar Hen Zion @ 2016-11-30 14:41 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Saeed Mahameed, Jiri Pirko, Amir Vadai, Or Gerlitz,
Roi Dayan, Hadar Hen Zion
In-Reply-To: <1480516895-29545-1-git-send-email-hadarh@mellanox.com>
Add support for a new tc_action_ops callback.
get_dev is a general operation which allows getting the underlying
device when trying to offload a tc rule.
In the case of the mirred action, the returned device is the mirred (egress)
device.
Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
---
include/net/act_api.h | 2 ++
net/sched/act_mirred.c | 12 ++++++++++++
2 files changed, 14 insertions(+)
diff --git a/include/net/act_api.h b/include/net/act_api.h
index d8eae87..9dddf77 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -119,6 +119,8 @@ struct tc_action_ops {
int (*walk)(struct net *, struct sk_buff *,
struct netlink_callback *, int, const struct tc_action_ops *);
void (*stats_update)(struct tc_action *, u64, u32, u64);
+ int (*get_dev)(const struct tc_action *a, struct net *net,
+ struct net_device **mirred_dev);
};
struct tc_action_net {
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 1af7baa..bb09ba3 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -315,6 +315,17 @@ static int mirred_device_event(struct notifier_block *unused,
.notifier_call = mirred_device_event,
};
+static int tcf_mirred_device(const struct tc_action *a, struct net *net,
+ struct net_device **mirred_dev)
+{
+ int ifindex = tcf_mirred_ifindex(a);
+
+ *mirred_dev = __dev_get_by_index(net, ifindex);
+ if (!mirred_dev)
+ return -EINVAL;
+ return 0;
+}
+
static struct tc_action_ops act_mirred_ops = {
.kind = "mirred",
.type = TCA_ACT_MIRRED,
@@ -327,6 +338,7 @@ static int mirred_device_event(struct notifier_block *unused,
.walk = tcf_mirred_walker,
.lookup = tcf_mirred_search,
.size = sizeof(struct tcf_mirred),
+ .get_dev = tcf_mirred_device,
};
static __net_init int mirred_init_net(struct net *net)
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next 3/8] net/sched: cls_flower: Provide a filter to replace/destroy hardware filter functions
From: Hadar Hen Zion @ 2016-11-30 14:41 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Saeed Mahameed, Jiri Pirko, Amir Vadai, Or Gerlitz,
Roi Dayan, Hadar Hen Zion
In-Reply-To: <1480516895-29545-1-git-send-email-hadarh@mellanox.com>
Instead of providing many arguments to fl_hw_{replace/destroy}_filter
functions, just provide cls_fl_filter struct that includes all the relevant
args.
This patch doesn't add any new functionality.
Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
net/sched/cls_flower.c | 27 +++++++++++----------------
1 file changed, 11 insertions(+), 16 deletions(-)
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 5e70f65..13b349f 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -201,7 +201,7 @@ static void fl_destroy_filter(struct rcu_head *head)
kfree(f);
}
-static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie)
+static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f)
{
struct net_device *dev = tp->q->dev_queue->dev;
struct tc_cls_flower_offload offload = {0};
@@ -211,7 +211,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie)
return;
offload.command = TC_CLSFLOWER_DESTROY;
- offload.cookie = cookie;
+ offload.cookie = (unsigned long)f;
tc.type = TC_SETUP_CLSFLOWER;
tc.cls_flower = &offload;
@@ -222,9 +222,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie)
static int fl_hw_replace_filter(struct tcf_proto *tp,
struct flow_dissector *dissector,
struct fl_flow_key *mask,
- struct fl_flow_key *key,
- struct tcf_exts *actions,
- unsigned long cookie, u32 flags)
+ struct cls_fl_filter *f)
{
struct net_device *dev = tp->q->dev_queue->dev;
struct tc_cls_flower_offload offload = {0};
@@ -232,14 +230,14 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
int err;
if (!tc_can_offload(dev, tp))
- return tc_skip_sw(flags) ? -EINVAL : 0;
+ return tc_skip_sw(f->flags) ? -EINVAL : 0;
offload.command = TC_CLSFLOWER_REPLACE;
- offload.cookie = cookie;
+ offload.cookie = (unsigned long)f;
offload.dissector = dissector;
offload.mask = mask;
- offload.key = key;
- offload.exts = actions;
+ offload.key = &f->key;
+ offload.exts = &f->exts;
tc.type = TC_SETUP_CLSFLOWER;
tc.cls_flower = &offload;
@@ -247,7 +245,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
&tc);
- if (tc_skip_sw(flags))
+ if (tc_skip_sw(f->flags))
return err;
return 0;
@@ -276,7 +274,7 @@ static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f)
{
list_del_rcu(&f->list);
if (!tc_skip_hw(f->flags))
- fl_hw_destroy_filter(tp, (unsigned long)f);
+ fl_hw_destroy_filter(tp, f);
tcf_unbind_filter(tp, &f->res);
call_rcu(&f->rcu, fl_destroy_filter);
}
@@ -748,10 +746,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
err = fl_hw_replace_filter(tp,
&head->dissector,
&mask.key,
- &fnew->key,
- &fnew->exts,
- (unsigned long)fnew,
- fnew->flags);
+ fnew);
if (err)
goto errout;
}
@@ -760,7 +755,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
rhashtable_remove_fast(&head->ht, &fold->ht_node,
head->ht_params);
if (!tc_skip_hw(fold->flags))
- fl_hw_destroy_filter(tp, (unsigned long)fold);
+ fl_hw_destroy_filter(tp, fold);
}
*arg = (unsigned long) fnew;
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next 1/8] net/sched: Add separate check for skip_hw flag
From: Hadar Hen Zion @ 2016-11-30 14:41 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Saeed Mahameed, Jiri Pirko, Amir Vadai, Or Gerlitz,
Roi Dayan, Hadar Hen Zion
In-Reply-To: <1480516895-29545-1-git-send-email-hadarh@mellanox.com>
Distinguish between two possible cases:
1. Not offloading tc rule since the user sets 'skip_hw' flag.
2. Not offloading tc rule since the device doesn't support offloading.
This patch doesn't add any new functionality.
Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
---
include/net/pkt_cls.h | 19 +++++++++++++++----
1 file changed, 15 insertions(+), 4 deletions(-)
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 767b03a..45ad9aa 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -425,16 +425,14 @@ struct tc_cls_u32_offload {
};
};
-static inline bool tc_should_offload(const struct net_device *dev,
- const struct tcf_proto *tp, u32 flags)
+static inline bool tc_can_offload(const struct net_device *dev,
+ const struct tcf_proto *tp)
{
const struct Qdisc *sch = tp->q;
const struct Qdisc_class_ops *cops = sch->ops->cl_ops;
if (!(dev->features & NETIF_F_HW_TC))
return false;
- if (flags & TCA_CLS_FLAGS_SKIP_HW)
- return false;
if (!dev->netdev_ops->ndo_setup_tc)
return false;
if (cops && cops->tcf_cl_offload)
@@ -443,6 +441,19 @@ static inline bool tc_should_offload(const struct net_device *dev,
return true;
}
+static inline bool tc_skip_hw(u32 flags)
+{
+ return (flags & TCA_CLS_FLAGS_SKIP_HW) ? true : false;
+}
+
+static inline bool tc_should_offload(const struct net_device *dev,
+ const struct tcf_proto *tp, u32 flags)
+{
+ if (tc_skip_hw(flags))
+ return false;
+ return tc_can_offload(dev, tp);
+}
+
static inline bool tc_skip_sw(u32 flags)
{
return (flags & TCA_CLS_FLAGS_SKIP_SW) ? true : false;
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next 0/8] Offloading tc rules using underlying Hardware device
From: Hadar Hen Zion @ 2016-11-30 14:41 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Saeed Mahameed, Jiri Pirko, Amir Vadai, Or Gerlitz,
Roi Dayan, Hadar Hen Zion
This series adds flower classifier support for offloading tc rules when the
Software ingress device is different from the Hardware ingress device,
such as when dealing with IP tunnels.
The first two patches are small fixes to flower, checking that the skip_hw flag
isn't set before calling the Hardware offloading functions which will try to
offload the rule.
The next two patches are infrastructure patches, a preparation for the fifth
patch, which adds support in flower to offload rules when the ingress
device is not a Hardware device and therefore can't offload.
In this case ndo_setup_tc is called with the mirred (egress) device.
The last three patches add mlx5e support to offload rules using the new
"egress_device" flag.
Thanks,
Hadar
Hadar Hen Zion (8):
net/sched: Add separate check for skip_hw flag
net/sched: cls_flower: Try to offload only if skip_hw flag isn't set
net/sched: cls_flower: Provide a filter to replace/destroy hardware
filter functions
net/sched: act_mirred: Add new tc_action_ops get_dev()
net/sched: cls_flower: Add offload support using egress Hardware
device
net/mlx5e: Bring back representor's ndos that were accidentally
removed
net/mlx5e: Save the represntor netdevice as part of the representor
net/mlx5e: Support adding ingress tc rule when egress device flag is
set
drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +-
drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 25 +++++--
drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 3 +-
.../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 12 ++-
include/linux/netdevice.h | 1 +
include/net/act_api.h | 2 +
include/net/pkt_cls.h | 21 +++++-
net/sched/act_mirred.c | 12 +++
net/sched/cls_api.c | 22 ++++++
net/sched/cls_flower.c | 87 ++++++++++++----------
10 files changed, 133 insertions(+), 54 deletions(-)
--
1.8.3.1
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox