* Re: [bpf-next RFC 1/3] flow_dissector: implements flow dissector BPF hook
From: Song Liu @ 2018-08-16 22:40 UTC (permalink / raw)
To: Petar Penkov
Cc: Networking, David S . Miller, Alexei Starovoitov, Daniel Borkmann,
simon.horman, Petar Penkov, Willem de Bruijn
In-Reply-To: <20180816164423.14368-2-peterpenkov96@gmail.com>
On Thu, Aug 16, 2018 at 9:44 AM, Petar Penkov <peterpenkov96@gmail.com> wrote:
> From: Petar Penkov <ppenkov@google.com>
>
> Adds a hook for programs of type BPF_PROG_TYPE_FLOW_DISSECTOR and
> attach type BPF_FLOW_DISSECTOR that is executed in the flow dissector
> path. The BPF program is kept as a global variable so it is
> accessible to all flow dissectors.
>
> Signed-off-by: Petar Penkov <ppenkov@google.com>
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> ---
> include/linux/bpf_types.h | 1 +
> include/linux/skbuff.h | 7 +
> include/net/flow_dissector.h | 16 +++
> include/uapi/linux/bpf.h | 14 +-
> kernel/bpf/syscall.c | 8 ++
> kernel/bpf/verifier.c | 2 +
> net/core/filter.c | 157 ++++++++++++++++++++++
> net/core/flow_dissector.c | 76 +++++++++++
> tools/bpf/bpftool/prog.c | 1 +
> tools/include/uapi/linux/bpf.h | 5 +-
> tools/lib/bpf/libbpf.c | 2 +
> tools/testing/selftests/bpf/bpf_helpers.h | 3 +
> 12 files changed, 290 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> index cd26c090e7c0..22083712dd18 100644
> --- a/include/linux/bpf_types.h
> +++ b/include/linux/bpf_types.h
> @@ -32,6 +32,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
> #ifdef CONFIG_INET
> BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport)
> #endif
> +BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector)
>
> BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
> BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 17a13e4785fc..ce0e863f02a2 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -243,6 +243,8 @@ struct scatterlist;
> struct pipe_inode_info;
> struct iov_iter;
> struct napi_struct;
> +struct bpf_prog;
> +union bpf_attr;
>
> #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
> struct nf_conntrack {
> @@ -1192,6 +1194,11 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
> const struct flow_dissector_key *key,
> unsigned int key_count);
>
> +int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
> + struct bpf_prog *prog);
> +
> +int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr);
> +
> bool __skb_flow_dissect(const struct sk_buff *skb,
> struct flow_dissector *flow_dissector,
> void *target_container,
> diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
> index 6a4586dcdede..edb919d320c1 100644
> --- a/include/net/flow_dissector.h
> +++ b/include/net/flow_dissector.h
> @@ -270,6 +270,22 @@ __be32 flow_get_u32_dst(const struct flow_keys *flow);
> extern struct flow_dissector flow_keys_dissector;
> extern struct flow_dissector flow_keys_basic_dissector;
>
> +/* struct bpf_flow_dissect_cb:
> + *
> + * This struct is used to pass parameters to BPF programs of type
> + * BPF_PROG_TYPE_FLOW_DISSECTOR. Before such a program is run, the caller sets
> + * the control block of the skb to be a struct of this type. The first field is
> + * used to communicate the next header offset between the BPF programs and the
> + * first value of it is passed from the kernel. The last two fields are used for
> + * writing out flow keys.
> + */
> +struct bpf_flow_dissect_cb {
> + u16 nhoff;
> + u16 unused;
> + void *target_container;
> + struct flow_dissector *flow_dissector;
> +};
> +
> /* struct flow_keys_digest:
> *
> * This structure is used to hold a digest of the full flow keys. This is a
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 66917a4eba27..8bc0fdab685d 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -152,6 +152,7 @@ enum bpf_prog_type {
> BPF_PROG_TYPE_LWT_SEG6LOCAL,
> BPF_PROG_TYPE_LIRC_MODE2,
> BPF_PROG_TYPE_SK_REUSEPORT,
> + BPF_PROG_TYPE_FLOW_DISSECTOR,
> };
>
> enum bpf_attach_type {
> @@ -172,6 +173,7 @@ enum bpf_attach_type {
> BPF_CGROUP_UDP4_SENDMSG,
> BPF_CGROUP_UDP6_SENDMSG,
> BPF_LIRC_MODE2,
> + BPF_FLOW_DISSECTOR,
> __MAX_BPF_ATTACH_TYPE
> };
>
> @@ -2141,6 +2143,15 @@ union bpf_attr {
> * request in the skb.
> * Return
> * 0 on success, or a negative error in case of failure.
> + *
> + * int bpf_flow_dissector_write_keys(const struct sk_buff *skb, const void *from, u32 len, enum flow_dissector_key_id key_id)
> + * Description
> + * Try to write *len* bytes from the source pointer into the offset
> + * of the key with id *key_id*. If *len* is different from the
> + * size of the key, an error is returned. If the key is not used,
> + * this function exits with no effect and code 0.
> + * Return
> + * 0 on success, negative error in case of failure.
> */
> #define __BPF_FUNC_MAPPER(FN) \
> FN(unspec), \
> @@ -2226,7 +2237,8 @@ union bpf_attr {
> FN(get_current_cgroup_id), \
> FN(get_local_storage), \
> FN(sk_select_reuseport), \
> - FN(skb_ancestor_cgroup_id),
> + FN(skb_ancestor_cgroup_id), \
> + FN(flow_dissector_write_keys),
>
> /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> * function eBPF program intends to call
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 43727ed0d94a..a06568841a92 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -1616,6 +1616,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
> case BPF_LIRC_MODE2:
> ptype = BPF_PROG_TYPE_LIRC_MODE2;
> break;
> + case BPF_FLOW_DISSECTOR:
> + ptype = BPF_PROG_TYPE_FLOW_DISSECTOR;
> + break;
> default:
> return -EINVAL;
> }
> @@ -1637,6 +1640,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
> case BPF_PROG_TYPE_LIRC_MODE2:
> ret = lirc_prog_attach(attr, prog);
> break;
> + case BPF_PROG_TYPE_FLOW_DISSECTOR:
> + ret = skb_flow_dissector_bpf_prog_attach(attr, prog);
> + break;
> default:
> ret = cgroup_bpf_prog_attach(attr, ptype, prog);
> }
> @@ -1689,6 +1695,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
> return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL);
> case BPF_LIRC_MODE2:
> return lirc_prog_detach(attr);
> + case BPF_FLOW_DISSECTOR:
> + return skb_flow_dissector_bpf_prog_detach(attr);
> default:
> return -EINVAL;
> }
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index ca90679a7fe5..6d3f268fa8e0 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -1321,6 +1321,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
> case BPF_PROG_TYPE_LWT_XMIT:
> case BPF_PROG_TYPE_SK_SKB:
> case BPF_PROG_TYPE_SK_MSG:
> + case BPF_PROG_TYPE_FLOW_DISSECTOR:
> if (meta)
> return meta->pkt_access;
>
> @@ -3976,6 +3977,7 @@ static bool may_access_skb(enum bpf_prog_type type)
> case BPF_PROG_TYPE_SOCKET_FILTER:
> case BPF_PROG_TYPE_SCHED_CLS:
> case BPF_PROG_TYPE_SCHED_ACT:
> + case BPF_PROG_TYPE_FLOW_DISSECTOR:
> return true;
> default:
> return false;
> diff --git a/net/core/filter.c b/net/core/filter.c
> index fd423ce3da34..03d3037e6508 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -4820,6 +4820,111 @@ bool bpf_helper_changes_pkt_data(void *func)
> return false;
> }
>
> +BPF_CALL_4(bpf_flow_dissector_write_keys, const struct sk_buff *, skb,
> + const void *, from, u32, len, enum flow_dissector_key_id, key_id)
> +{
> + struct bpf_flow_dissect_cb *cb;
> + void *dest;
> +
> + cb = (struct bpf_flow_dissect_cb *)bpf_skb_cb(skb);
> +
> + /* Make sure the dissector actually uses the key. It is not an error if
> + * it does not, but we should not continue past this point in that case
> + */
> + if (!dissector_uses_key(cb->flow_dissector, key_id))
> + return 0;
> +
> + /* Make sure the length is correct */
> + switch (key_id) {
> + case FLOW_DISSECTOR_KEY_CONTROL:
> + case FLOW_DISSECTOR_KEY_ENC_CONTROL:
> + if (len != sizeof(struct flow_dissector_key_control))
> + return -EINVAL;
> + break;
> + case FLOW_DISSECTOR_KEY_BASIC:
> + if (len != sizeof(struct flow_dissector_key_basic))
> + return -EINVAL;
> + break;
> + case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
> + case FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS:
> + if (len != sizeof(struct flow_dissector_key_ipv4_addrs))
> + return -EINVAL;
> + break;
> + case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
> + case FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS:
> + if (len != sizeof(struct flow_dissector_key_ipv6_addrs))
> + return -EINVAL;
> + break;
> + case FLOW_DISSECTOR_KEY_ICMP:
> + if (len != sizeof(struct flow_dissector_key_icmp))
> + return -EINVAL;
> + break;
> + case FLOW_DISSECTOR_KEY_PORTS:
> + case FLOW_DISSECTOR_KEY_ENC_PORTS:
> + if (len != sizeof(struct flow_dissector_key_ports))
> + return -EINVAL;
> + break;
> + case FLOW_DISSECTOR_KEY_ETH_ADDRS:
> + if (len != sizeof(struct flow_dissector_key_eth_addrs))
> + return -EINVAL;
> + break;
> + case FLOW_DISSECTOR_KEY_TIPC:
> + if (len != sizeof(struct flow_dissector_key_tipc))
> + return -EINVAL;
> + break;
> + case FLOW_DISSECTOR_KEY_ARP:
> + if (len != sizeof(struct flow_dissector_key_arp))
> + return -EINVAL;
> + break;
> + case FLOW_DISSECTOR_KEY_VLAN:
> + case FLOW_DISSECTOR_KEY_CVLAN:
> + if (len != sizeof(struct flow_dissector_key_vlan))
> + return -EINVAL;
> + break;
> + case FLOW_DISSECTOR_KEY_FLOW_LABEL:
> + if (len != sizeof(struct flow_dissector_key_tags))
> + return -EINVAL;
> + break;
> + case FLOW_DISSECTOR_KEY_GRE_KEYID:
> + case FLOW_DISSECTOR_KEY_ENC_KEYID:
> + case FLOW_DISSECTOR_KEY_MPLS_ENTROPY:
> + if (len != sizeof(struct flow_dissector_key_keyid))
> + return -EINVAL;
> + break;
> + case FLOW_DISSECTOR_KEY_MPLS:
> + if (len != sizeof(struct flow_dissector_key_mpls))
> + return -EINVAL;
> + break;
> + case FLOW_DISSECTOR_KEY_TCP:
> + if (len != sizeof(struct flow_dissector_key_tcp))
> + return -EINVAL;
> + break;
> + case FLOW_DISSECTOR_KEY_IP:
> + case FLOW_DISSECTOR_KEY_ENC_IP:
> + if (len != sizeof(struct flow_dissector_key_ip))
> + return -EINVAL;
> + break;
> + default:
> + return -EINVAL;
> + }
> +
> + dest = skb_flow_dissector_target(cb->flow_dissector, key_id,
> + cb->target_container);
> +
> + memcpy(dest, from, len);
> + return 0;
> +}
> +
> +static const struct bpf_func_proto bpf_flow_dissector_write_keys_proto = {
> + .func = bpf_flow_dissector_write_keys,
> + .gpl_only = false,
> + .ret_type = RET_INTEGER,
> + .arg1_type = ARG_PTR_TO_CTX,
> + .arg2_type = ARG_PTR_TO_MEM,
> + .arg3_type = ARG_CONST_SIZE,
> + .arg4_type = ARG_ANYTHING,
> +};
> +
> static const struct bpf_func_proto *
> bpf_base_func_proto(enum bpf_func_id func_id)
> {
> @@ -5100,6 +5205,19 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> }
> }
>
> +static const struct bpf_func_proto *
> +flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> +{
> + switch (func_id) {
> + case BPF_FUNC_skb_load_bytes:
> + return &bpf_skb_load_bytes_proto;
> + case BPF_FUNC_flow_dissector_write_keys:
> + return &bpf_flow_dissector_write_keys_proto;
> + default:
> + return bpf_base_func_proto(func_id);
> + }
> +}
> +
> static const struct bpf_func_proto *
> lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> {
> @@ -5738,6 +5856,35 @@ static bool sk_msg_is_valid_access(int off, int size,
> return true;
> }
>
> +static bool flow_dissector_is_valid_access(int off, int size,
> + enum bpf_access_type type,
> + const struct bpf_prog *prog,
> + struct bpf_insn_access_aux *info)
> +{
> + if (type == BPF_WRITE) {
> + switch (off) {
> + case bpf_ctx_range(struct __sk_buff, cb[0]):
> + break;
> + default:
> + return false;
> + }
> + }
> +
> + switch (off) {
> + case bpf_ctx_range(struct __sk_buff, data):
> + info->reg_type = PTR_TO_PACKET;
> + break;
> + case bpf_ctx_range(struct __sk_buff, data_end):
> + info->reg_type = PTR_TO_PACKET_END;
> + break;
> + case bpf_ctx_range_till(struct __sk_buff, family, local_port):
> + case bpf_ctx_range_till(struct __sk_buff, cb[1], cb[4]):
> + return false;
> + }
> +
> + return bpf_skb_is_valid_access(off, size, type, prog, info);
> +}
> +
> static u32 bpf_convert_ctx_access(enum bpf_access_type type,
> const struct bpf_insn *si,
> struct bpf_insn *insn_buf,
> @@ -6995,6 +7142,16 @@ const struct bpf_verifier_ops sk_msg_verifier_ops = {
> const struct bpf_prog_ops sk_msg_prog_ops = {
> };
>
> +const struct bpf_verifier_ops flow_dissector_verifier_ops = {
> + .get_func_proto = flow_dissector_func_proto,
> + .is_valid_access = flow_dissector_is_valid_access,
> + .convert_ctx_access = bpf_convert_ctx_access,
> + .gen_ld_abs = bpf_gen_ld_abs,
> +};
> +
> +const struct bpf_prog_ops flow_dissector_prog_ops = {
> +};
> +
> int sk_detach_filter(struct sock *sk)
> {
> int ret = -ENOENT;
> diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
> index ce9eeeb7c024..767daa231f04 100644
> --- a/net/core/flow_dissector.c
> +++ b/net/core/flow_dissector.c
> @@ -25,6 +25,11 @@
> #include <net/flow_dissector.h>
> #include <scsi/fc/fc_fcoe.h>
> #include <uapi/linux/batadv_packet.h>
> +#include <linux/bpf.h>
> +
> +/* BPF program accessible by all flow dissectors */
> +static struct bpf_prog __rcu *flow_dissector_prog;
> +static DEFINE_MUTEX(flow_dissector_mutex);
>
> static void dissector_set_key(struct flow_dissector *flow_dissector,
> enum flow_dissector_key_id key_id)
> @@ -62,6 +67,40 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
> }
> EXPORT_SYMBOL(skb_flow_dissector_init);
>
> +int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
> + struct bpf_prog *prog)
> +{
> + struct bpf_prog *attached;
> +
> + mutex_lock(&flow_dissector_mutex);
> + attached = rcu_dereference_protected(flow_dissector_prog,
> + lockdep_is_held(&flow_dissector_mutex));
> + if (attached) {
> + /* Only one BPF program can be attached at a time */
> + mutex_unlock(&flow_dissector_mutex);
> + return -EEXIST;
> + }
> + rcu_assign_pointer(flow_dissector_prog, prog);
> + mutex_unlock(&flow_dissector_mutex);
> + return 0;
> +}
> +
> +int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
> +{
> + struct bpf_prog *attached;
> +
> + mutex_lock(&flow_dissector_mutex);
> + attached = rcu_dereference_protected(flow_dissector_prog,
> + lockdep_is_held(&flow_dissector_mutex));
> + if (!flow_dissector_prog) {
> + mutex_unlock(&flow_dissector_mutex);
> + return -EINVAL;
> + }
> + bpf_prog_put(attached);
> + RCU_INIT_POINTER(flow_dissector_prog, NULL);
> + mutex_unlock(&flow_dissector_mutex);
> + return 0;
> +}
> /**
> * skb_flow_get_be16 - extract be16 entity
> * @skb: sk_buff to extract from
> @@ -619,6 +658,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
> struct flow_dissector_key_vlan *key_vlan;
> enum flow_dissect_ret fdret;
> enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX;
> + struct bpf_prog *attached;
> int num_hdrs = 0;
> u8 ip_proto = 0;
> bool ret;
> @@ -658,6 +698,42 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
> FLOW_DISSECTOR_KEY_BASIC,
> target_container);
>
> + rcu_read_lock();
> + attached = rcu_dereference(flow_dissector_prog);
> + if (attached) {
> + /* Note that even though the const qualifier is discarded
> + * throughout the execution of the BPF program, all changes(the
> + * control block) are reverted after the BPF program returns.
> + * Therefore, __skb_flow_dissect does not alter the skb.
> + */
> + struct bpf_flow_dissect_cb *cb;
> + u8 cb_saved[BPF_SKB_CB_LEN];
> + u32 result;
> +
> + cb = (struct bpf_flow_dissect_cb *)(bpf_skb_cb((struct sk_buff *)skb));
> +
> + /* Save Control Block */
> + memcpy(cb_saved, cb, sizeof(cb_saved));
> + memset(cb, 0, sizeof(cb_saved));
> +
> + /* Pass parameters to the BPF program */
> + cb->nhoff = nhoff;
> + cb->target_container = target_container;
> + cb->flow_dissector = flow_dissector;
> +
> + bpf_compute_data_pointers((struct sk_buff *)skb);
> + result = BPF_PROG_RUN(attached, skb);
> +
> + /* Restore state */
> + memcpy(cb, cb_saved, sizeof(cb_saved));
> +
> + key_control->thoff = min_t(u16, key_control->thoff,
> + skb ? skb->len : hlen);
> + rcu_read_unlock();
> + return result == BPF_OK;
> + }
If the BPF program cannot handle certain protocol, shall we fall back
to the built-in logic? Otherwise, all BPF programs need to have some
code for all protocols.
Song
> + rcu_read_unlock();
> +
> if (dissector_uses_key(flow_dissector,
> FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
> struct ethhdr *eth = eth_hdr(skb);
> diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
> index dce960d22106..b1cd3bc8db70 100644
> --- a/tools/bpf/bpftool/prog.c
> +++ b/tools/bpf/bpftool/prog.c
> @@ -74,6 +74,7 @@ static const char * const prog_type_name[] = {
> [BPF_PROG_TYPE_RAW_TRACEPOINT] = "raw_tracepoint",
> [BPF_PROG_TYPE_CGROUP_SOCK_ADDR] = "cgroup_sock_addr",
> [BPF_PROG_TYPE_LIRC_MODE2] = "lirc_mode2",
> + [BPF_PROG_TYPE_FLOW_DISSECTOR] = "flow_dissector",
> };
>
> static void print_boot_time(__u64 nsecs, char *buf, unsigned int size)
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 66917a4eba27..acd74a0dd063 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -152,6 +152,7 @@ enum bpf_prog_type {
> BPF_PROG_TYPE_LWT_SEG6LOCAL,
> BPF_PROG_TYPE_LIRC_MODE2,
> BPF_PROG_TYPE_SK_REUSEPORT,
> + BPF_PROG_TYPE_FLOW_DISSECTOR,
> };
>
> enum bpf_attach_type {
> @@ -172,6 +173,7 @@ enum bpf_attach_type {
> BPF_CGROUP_UDP4_SENDMSG,
> BPF_CGROUP_UDP6_SENDMSG,
> BPF_LIRC_MODE2,
> + BPF_FLOW_DISSECTOR,
> __MAX_BPF_ATTACH_TYPE
> };
>
> @@ -2226,7 +2228,8 @@ union bpf_attr {
> FN(get_current_cgroup_id), \
> FN(get_local_storage), \
> FN(sk_select_reuseport), \
> - FN(skb_ancestor_cgroup_id),
> + FN(skb_ancestor_cgroup_id), \
> + FN(flow_dissector_write_keys),
>
> /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> * function eBPF program intends to call
> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> index 2abd0f112627..0c749ce1b717 100644
> --- a/tools/lib/bpf/libbpf.c
> +++ b/tools/lib/bpf/libbpf.c
> @@ -1502,6 +1502,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type)
> case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
> case BPF_PROG_TYPE_LIRC_MODE2:
> case BPF_PROG_TYPE_SK_REUSEPORT:
> + case BPF_PROG_TYPE_FLOW_DISSECTOR:
> return false;
> case BPF_PROG_TYPE_UNSPEC:
> case BPF_PROG_TYPE_KPROBE:
> @@ -2121,6 +2122,7 @@ static const struct {
> BPF_PROG_SEC("sk_skb", BPF_PROG_TYPE_SK_SKB),
> BPF_PROG_SEC("sk_msg", BPF_PROG_TYPE_SK_MSG),
> BPF_PROG_SEC("lirc_mode2", BPF_PROG_TYPE_LIRC_MODE2),
> + BPF_PROG_SEC("flow_dissector", BPF_PROG_TYPE_FLOW_DISSECTOR),
> BPF_SA_PROG_SEC("cgroup/bind4", BPF_CGROUP_INET4_BIND),
> BPF_SA_PROG_SEC("cgroup/bind6", BPF_CGROUP_INET6_BIND),
> BPF_SA_PROG_SEC("cgroup/connect4", BPF_CGROUP_INET4_CONNECT),
> diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
> index e4be7730222d..4204c496a04f 100644
> --- a/tools/testing/selftests/bpf/bpf_helpers.h
> +++ b/tools/testing/selftests/bpf/bpf_helpers.h
> @@ -143,6 +143,9 @@ static unsigned long long (*bpf_skb_cgroup_id)(void *ctx) =
> (void *) BPF_FUNC_skb_cgroup_id;
> static unsigned long long (*bpf_skb_ancestor_cgroup_id)(void *ctx, int level) =
> (void *) BPF_FUNC_skb_ancestor_cgroup_id;
> +static int (*bpf_flow_dissector_write_keys)(void *ctx, void *src, int len,
> + int key) =
> + (void *) BPF_FUNC_flow_dissector_write_keys;
>
> /* llvm builtin functions that eBPF C program may use to
> * emit BPF_LD_ABS and BPF_LD_IND instructions
> --
> 2.18.0.865.gffc8e1a3cd6-goog
>
^ permalink raw reply
* Re: [bpf-next RFC 1/3] flow_dissector: implements flow dissector BPF hook
From: Willem de Bruijn @ 2018-08-16 22:37 UTC (permalink / raw)
To: ecree
Cc: Petar Penkov, Network Development, David Miller,
Alexei Starovoitov, Daniel Borkmann, simon.horman, Petar Penkov,
Willem de Bruijn
In-Reply-To: <cb06e7c6-2042-1243-fe6e-3e1e33e4b14d@solarflare.com>
On Thu, Aug 16, 2018 at 2:51 PM Edward Cree <ecree@solarflare.com> wrote:
>
> On 16/08/18 17:44, Petar Penkov wrote:
> > From: Petar Penkov <ppenkov@google.com>
> >
> > Adds a hook for programs of type BPF_PROG_TYPE_FLOW_DISSECTOR and
> > attach type BPF_FLOW_DISSECTOR that is executed in the flow dissector
> > path. The BPF program is kept as a global variable so it is
> > accessible to all flow dissectors.
> >
> > Signed-off-by: Petar Penkov <ppenkov@google.com>
> > Signed-off-by: Willem de Bruijn <willemb@google.com>
> > ---
>
> This looks really great.
>
> > +int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
> > +{
> > + struct bpf_prog *attached;
> > +
> > + mutex_lock(&flow_dissector_mutex);
> > + attached = rcu_dereference_protected(flow_dissector_prog,
> > + lockdep_is_held(&flow_dissector_mutex));
> > + if (!flow_dissector_prog) {
> > + mutex_unlock(&flow_dissector_mutex);
> > + return -EINVAL;
> Wouldn't -ENOENT be more usual here (as the counterpart to -EEXIST in
> the skb_flow_dissector_bpf_prog_attach() version just above)?
Absolutely. That better matches bpf_detach behavior, too.
^ permalink raw reply
* Re: [PATCH bpf 0/5] BPF sockmap and ulp fixes
From: Alexei Starovoitov @ 2018-08-16 22:06 UTC (permalink / raw)
To: Daniel Borkmann; +Cc: john.fastabend, netdev
In-Reply-To: <20180816194910.9040-1-daniel@iogearbox.net>
On Thu, Aug 16, 2018 at 09:49:05PM +0200, Daniel Borkmann wrote:
> Batch of various fixes related to BPF sockmap and ULP, including
> adding module alias to restrict module requests, races and memory
> leaks in sockmap code. For details please refer to the individual
> patches. Thanks!
Applied to bpf tree, Thanks everyone
^ permalink raw reply
* KINDLY REPLY diplmosesd@gmail.com URGENTLY
From: MR MOSES @ 2018-08-16 21:50 UTC (permalink / raw)
To: Recipients
KINDLY REPLY diplmosesd@gmail.com URGENTLY
^ permalink raw reply
* Re: [PATCH bpf 5/5] bpf, sockmap: fix sock_map_ctx_update_elem race with exist/noexist
From: Song Liu @ 2018-08-16 21:51 UTC (permalink / raw)
To: Daniel Borkmann; +Cc: Alexei Starovoitov, John Fastabend, Networking
In-Reply-To: <20180816194910.9040-6-daniel@iogearbox.net>
On Thu, Aug 16, 2018 at 12:49 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> The current code in sock_map_ctx_update_elem() allows for BPF_EXIST
> and BPF_NOEXIST map update flags. While on array-like maps this approach
> is rather uncommon, e.g. bpf_fd_array_map_update_elem() and others
> enforce map update flags to be BPF_ANY such that xchg() can be used
> directly, the current implementation in sock map does not guarantee
> that such operation with BPF_EXIST / BPF_NOEXIST is atomic.
>
> The initial test does a READ_ONCE(stab->sock_map[i]) to fetch the
> socket from the slot which is then tested for NULL / non-NULL. However
> later after __sock_map_ctx_update_elem(), the actual update is done
> through osock = xchg(&stab->sock_map[i], sock). Problem is that in
> the meantime a different CPU could have updated / deleted a socket
> on that specific slot and thus flag contraints won't hold anymore.
>
> I've been thinking whether best would be to just break UAPI and do
> an enforcement of BPF_ANY to check if someone actually complains,
> however trouble is that already in BPF kselftest we use BPF_NOEXIST
> for the map update, and therefore it might have been copied into
> applications already. The fix to keep the current behavior intact
> would be to add a map lock similar to the sock hash bucket lock only
> for covering the whole map.
>
> Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support")
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Song Liu <songliubraving@fb.com>
> ---
> kernel/bpf/sockmap.c | 106 +++++++++++++++++++++++++++------------------------
> 1 file changed, 57 insertions(+), 49 deletions(-)
>
> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
> index 921cb6b..98e621a 100644
> --- a/kernel/bpf/sockmap.c
> +++ b/kernel/bpf/sockmap.c
> @@ -58,6 +58,7 @@ struct bpf_stab {
> struct bpf_map map;
> struct sock **sock_map;
> struct bpf_sock_progs progs;
> + raw_spinlock_t lock;
> };
>
> struct bucket {
> @@ -89,9 +90,9 @@ enum smap_psock_state {
>
> struct smap_psock_map_entry {
> struct list_head list;
> + struct bpf_map *map;
> struct sock **entry;
> struct htab_elem __rcu *hash_link;
> - struct bpf_htab __rcu *htab;
> };
>
> struct smap_psock {
> @@ -343,13 +344,18 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
> e = psock_map_pop(sk, psock);
> while (e) {
> if (e->entry) {
> - osk = cmpxchg(e->entry, sk, NULL);
> + struct bpf_stab *stab = container_of(e->map, struct bpf_stab, map);
> +
> + raw_spin_lock_bh(&stab->lock);
> + osk = *e->entry;
> if (osk == sk) {
> + *e->entry = NULL;
> smap_release_sock(psock, sk);
> }
> + raw_spin_unlock_bh(&stab->lock);
> } else {
> struct htab_elem *link = rcu_dereference(e->hash_link);
> - struct bpf_htab *htab = rcu_dereference(e->htab);
> + struct bpf_htab *htab = container_of(e->map, struct bpf_htab, map);
> struct hlist_head *head;
> struct htab_elem *l;
> struct bucket *b;
> @@ -1642,6 +1648,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
> return ERR_PTR(-ENOMEM);
>
> bpf_map_init_from_attr(&stab->map, attr);
> + raw_spin_lock_init(&stab->lock);
>
> /* make sure page count doesn't overflow */
> cost = (u64) stab->map.max_entries * sizeof(struct sock *);
> @@ -1716,14 +1723,15 @@ static void sock_map_free(struct bpf_map *map)
> * and a grace period expire to ensure psock is really safe to remove.
> */
> rcu_read_lock();
> + raw_spin_lock_bh(&stab->lock);
> for (i = 0; i < stab->map.max_entries; i++) {
> struct smap_psock *psock;
> struct sock *sock;
>
> - sock = xchg(&stab->sock_map[i], NULL);
> + sock = stab->sock_map[i];
> if (!sock)
> continue;
> -
> + stab->sock_map[i] = NULL;
> psock = smap_psock_sk(sock);
> /* This check handles a racing sock event that can get the
> * sk_callback_lock before this case but after xchg happens
> @@ -1735,6 +1743,7 @@ static void sock_map_free(struct bpf_map *map)
> smap_release_sock(psock, sock);
> }
> }
> + raw_spin_unlock_bh(&stab->lock);
> rcu_read_unlock();
>
> sock_map_remove_complete(stab);
> @@ -1778,14 +1787,16 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
> if (k >= map->max_entries)
> return -EINVAL;
>
> - sock = xchg(&stab->sock_map[k], NULL);
> + raw_spin_lock_bh(&stab->lock);
> + sock = stab->sock_map[k];
> + stab->sock_map[k] = NULL;
> + raw_spin_unlock_bh(&stab->lock);
> if (!sock)
> return -EINVAL;
>
> psock = smap_psock_sk(sock);
> if (!psock)
> - goto out;
> -
> + return 0;
> if (psock->bpf_parse) {
> write_lock_bh(&sock->sk_callback_lock);
> smap_stop_sock(psock, sock);
> @@ -1793,7 +1804,6 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
> }
> smap_list_map_remove(psock, &stab->sock_map[k]);
> smap_release_sock(psock, sock);
> -out:
> return 0;
> }
>
> @@ -1829,11 +1839,9 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
> static int __sock_map_ctx_update_elem(struct bpf_map *map,
> struct bpf_sock_progs *progs,
> struct sock *sock,
> - struct sock **map_link,
> void *key)
> {
> struct bpf_prog *verdict, *parse, *tx_msg;
> - struct smap_psock_map_entry *e = NULL;
> struct smap_psock *psock;
> bool new = false;
> int err = 0;
> @@ -1906,14 +1914,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
> new = true;
> }
>
> - if (map_link) {
> - e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
> - if (!e) {
> - err = -ENOMEM;
> - goto out_free;
> - }
> - }
> -
> /* 3. At this point we have a reference to a valid psock that is
> * running. Attach any BPF programs needed.
> */
> @@ -1935,17 +1935,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
> write_unlock_bh(&sock->sk_callback_lock);
> }
>
> - /* 4. Place psock in sockmap for use and stop any programs on
> - * the old sock assuming its not the same sock we are replacing
> - * it with. Because we can only have a single set of programs if
> - * old_sock has a strp we can stop it.
> - */
> - if (map_link) {
> - e->entry = map_link;
> - spin_lock_bh(&psock->maps_lock);
> - list_add_tail(&e->list, &psock->maps);
> - spin_unlock_bh(&psock->maps_lock);
> - }
> return err;
> out_free:
> smap_release_sock(psock, sock);
> @@ -1956,7 +1945,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
> }
> if (tx_msg)
> bpf_prog_put(tx_msg);
> - kfree(e);
> return err;
> }
>
> @@ -1966,36 +1954,57 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
> {
> struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
> struct bpf_sock_progs *progs = &stab->progs;
> - struct sock *osock, *sock;
> + struct sock *osock, *sock = skops->sk;
> + struct smap_psock_map_entry *e;
> + struct smap_psock *psock;
> u32 i = *(u32 *)key;
> int err;
>
> if (unlikely(flags > BPF_EXIST))
> return -EINVAL;
> -
> if (unlikely(i >= stab->map.max_entries))
> return -E2BIG;
>
> - sock = READ_ONCE(stab->sock_map[i]);
> - if (flags == BPF_EXIST && !sock)
> - return -ENOENT;
> - else if (flags == BPF_NOEXIST && sock)
> - return -EEXIST;
> + e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
> + if (!e)
> + return -ENOMEM;
>
> - sock = skops->sk;
> - err = __sock_map_ctx_update_elem(map, progs, sock, &stab->sock_map[i],
> - key);
> + err = __sock_map_ctx_update_elem(map, progs, sock, key);
> if (err)
> goto out;
>
> - osock = xchg(&stab->sock_map[i], sock);
> - if (osock) {
> - struct smap_psock *opsock = smap_psock_sk(osock);
> + /* psock guaranteed to be present. */
> + psock = smap_psock_sk(sock);
> + raw_spin_lock_bh(&stab->lock);
> + osock = stab->sock_map[i];
> + if (osock && flags == BPF_NOEXIST) {
> + err = -EEXIST;
> + goto out_unlock;
> + }
> + if (!osock && flags == BPF_EXIST) {
> + err = -ENOENT;
> + goto out_unlock;
> + }
> +
> + e->entry = &stab->sock_map[i];
> + e->map = map;
> + spin_lock_bh(&psock->maps_lock);
> + list_add_tail(&e->list, &psock->maps);
> + spin_unlock_bh(&psock->maps_lock);
>
> - smap_list_map_remove(opsock, &stab->sock_map[i]);
> - smap_release_sock(opsock, osock);
> + stab->sock_map[i] = sock;
> + if (osock) {
> + psock = smap_psock_sk(osock);
> + smap_list_map_remove(psock, &stab->sock_map[i]);
> + smap_release_sock(psock, osock);
> }
> + raw_spin_unlock_bh(&stab->lock);
> + return 0;
> +out_unlock:
> + smap_release_sock(psock, sock);
> + raw_spin_unlock_bh(&stab->lock);
> out:
> + kfree(e);
> return err;
> }
>
> @@ -2358,7 +2367,7 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
> b = __select_bucket(htab, hash);
> head = &b->head;
>
> - err = __sock_map_ctx_update_elem(map, progs, sock, NULL, key);
> + err = __sock_map_ctx_update_elem(map, progs, sock, key);
> if (err)
> goto err;
>
> @@ -2384,8 +2393,7 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
> }
>
> rcu_assign_pointer(e->hash_link, l_new);
> - rcu_assign_pointer(e->htab,
> - container_of(map, struct bpf_htab, map));
> + e->map = map;
> spin_lock_bh(&psock->maps_lock);
> list_add_tail(&e->list, &psock->maps);
> spin_unlock_bh(&psock->maps_lock);
> --
> 2.9.5
>
^ permalink raw reply
* Re: [Intel-wired-lan] [PATCH next-queue 0/8] ixgbe/ixgbevf: IPsec offload support for VFs
From: Shannon Nelson @ 2018-08-16 21:36 UTC (permalink / raw)
To: Alexander Duyck; +Cc: intel-wired-lan, Jeff Kirsher, Steffen Klassert, Netdev
In-Reply-To: <CAKgT0UfjB35yWMALtLUh-dU78VUW95=8MMn8rLcnKiTQxOB-Eg@mail.gmail.com>
On 8/16/2018 2:15 PM, Alexander Duyck wrote:
> On Tue, Aug 14, 2018 at 10:10 AM Shannon Nelson
> <shannon.nelson@oracle.com> wrote:
>>
>> On 8/14/2018 8:30 AM, Alexander Duyck wrote:
>>> On Mon, Aug 13, 2018 at 11:43 AM Shannon Nelson
>>> <shannon.nelson@oracle.com> wrote:
>>>>
>>>> This set of patches implements IPsec hardware offload for VF devices in
>>>> Intel's 10Gbe x540 family of Ethernet devices.
>>
>> [...]
>>
>>>
>>> So the one question I would have about this patch set is what happens
>>> if you are setting up a ipsec connection between the PF and one of the
>>> VFs on the same port/function? Do the ipsec offloads get translated
>>> across the Tx loopback or do they end up causing issues? Specifically
>>> I would be interested in seeing the results of a test either between
>>> two VFs, or the PF and one of the VFs on the same port.
>>>
>>> - Alex
>>>
>>
>> There is definitely something funky in the internal switch connection,
>> as messages going from PF to VF with an offloaded encryption don't seem
>> to get received by the VF, at least when in a VEB setup. If I only set
>> up offloads on the Rx on both PF and VF, and don't offload the Tx, then
>> things work.
>>
>> I don't have a setup to test this, but I suspect that in a VEPA
>> configuration, with packets going out to the switch and turned around
>> back in, the Tx encryption offload would happen as expected.
>>
>> sln
>
> We should probably look at adding at least one patch to the set then
> that disables IPsec Tx offload if SR-IOV is enabled with VEB so that
> we don't end up breaking connections should a VF be migrated from a
> remote system to a local one that it is connected to.
>
> - Alex
>
The problem with this is that someone could set up an IPsec connection
on the PF for Tx and Rx use, then set num_vfs, start some VFs, and we
still can end up in the same place. I don't think we want to disallow
all Tx IPsec offload.
Maybe we can catch it in ixgbe_ipsec_offload_ok()? If it can find that
the dest mac is on the internal switch, perhaps it can NAK the Tx
offload? That would force the XFRM xmit code to do a regular SW encrypt
before sending the packet. I'll look into this.
sln
^ permalink raw reply
* RE: [PATCH v2 net-next 0/8] net: dsa: microchip: Modify KSZ9477 DSA driver in preparation to add other KSZ switch drivers
From: Tristram.Ha @ 2018-08-16 21:34 UTC (permalink / raw)
To: f.fainelli
Cc: arkadis, UNGLinuxDriver, netdev, andrew, pavel, ruediger.schmitt
In-Reply-To: <1227c661-14c0-eb93-b487-f2e478f77fb8@gmail.com>
> -----Original Message-----
> From: Florian Fainelli <f.fainelli@gmail.com>
> Sent: Wednesday, August 15, 2018 5:29 PM
> To: Tristram Ha - C24268 <Tristram.Ha@microchip.com>; Andrew Lunn
> <andrew@lunn.ch>; Pavel Machek <pavel@ucw.cz>; Ruediger Schmitt
> <ruediger.schmitt@philips.com>
> Cc: Arkadi Sharshevsky <arkadis@mellanox.com>; UNGLinuxDriver
> <UNGLinuxDriver@microchip.com>; netdev@vger.kernel.org
> Subject: Re: [PATCH v2 net-next 0/8] net: dsa: microchip: Modify KSZ9477
> DSA driver in preparation to add other KSZ switch drivers
>
> On 12/05/2017 05:46 PM, Tristram.Ha@microchip.com wrote:
> > From: Tristram Ha <Tristram.Ha@microchip.com>
> >
> > This series of patches is to modify the original KSZ9477 DSA driver so
> > that other KSZ switch drivers can be added and use the common code.
> >
> > There are several steps to accomplish this achievement. First is to
> > rename some function names with a prefix to indicate chip specific
> > function. Second is to move common code into header that can be shared.
> > Last is to modify tag_ksz.c so that it can handle many tail tag formats
> > used by different KSZ switch drivers.
> >
> > ksz_common.c will contain the common code used by all KSZ switch drivers.
> > ksz9477.c will contain KSZ9477 code from the original ksz_common.c.
> > ksz9477_spi.c is renamed from ksz_spi.c.
> > ksz9477_reg.h is renamed from ksz_9477_reg.h.
> > ksz_common.h is added to provide common code access to KSZ switch
> > drivers.
> > ksz_spi.h is added to provide common SPI access functions to KSZ SPI
> > drivers.
>
> Is something gating this series from getting included? It's been nearly
> 8 months now and this has not been include nor resubmitted, any plans to
> rebase that patch series and work towards inclusion in net-next when it
> opens back again?
>
> Thank you!
Sorry for the long delay. I will restart my kernel submission effort next month
after finishing the work on current development project.
^ permalink raw reply
* Re: [PATCH bpf 4/5] bpf, sockmap: fix map elem deletion race with smap_stop_sock
From: Song Liu @ 2018-08-16 21:30 UTC (permalink / raw)
To: Daniel Borkmann; +Cc: Alexei Starovoitov, John Fastabend, Networking
In-Reply-To: <20180816194910.9040-5-daniel@iogearbox.net>
On Thu, Aug 16, 2018 at 12:49 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> The smap_start_sock() and smap_stop_sock() are each protected under
> the sock->sk_callback_lock from their call-sites except in the case
> of sock_map_delete_elem() where we drop the old socket from the map
> slot. This is racy because the same sock could be part of multiple
> sock maps, so we run smap_stop_sock() in parallel, and given at that
> point psock->strp_enabled might be true on both CPUs, we might for
> example wrongly restore the sk->sk_data_ready / sk->sk_write_space.
> Therefore, hold the sock->sk_callback_lock as well on delete. Looks
> like 2f857d04601a ("bpf: sockmap, remove STRPARSER map_flags and add
> multi-map support") had this right, but later on e9db4ef6bf4c ("bpf:
> sockhash fix omitted bucket lock in sock_close") removed it again
> from delete leaving this smap_stop_sock() instance unprotected.
>
> Fixes: e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close")
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Song Liu <songliubraving@fb.com>
> ---
> kernel/bpf/sockmap.c | 5 ++++-
> 1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
> index 94a324b..921cb6b 100644
> --- a/kernel/bpf/sockmap.c
> +++ b/kernel/bpf/sockmap.c
> @@ -1786,8 +1786,11 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
> if (!psock)
> goto out;
>
> - if (psock->bpf_parse)
> + if (psock->bpf_parse) {
> + write_lock_bh(&sock->sk_callback_lock);
> smap_stop_sock(psock, sock);
> + write_unlock_bh(&sock->sk_callback_lock);
> + }
> smap_list_map_remove(psock, &stab->sock_map[k]);
> smap_release_sock(psock, sock);
> out:
> --
> 2.9.5
>
^ permalink raw reply
* Re: [PATCH bpf 3/5] bpf, sockmap: fix leakage of smap_psock_map_entry
From: Song Liu @ 2018-08-16 21:27 UTC (permalink / raw)
To: Daniel Borkmann; +Cc: Alexei Starovoitov, John Fastabend, Networking
In-Reply-To: <20180816194910.9040-4-daniel@iogearbox.net>
On Thu, Aug 16, 2018 at 12:49 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> While working on sockmap I noticed that we do not always kfree the
> struct smap_psock_map_entry list elements which track psocks attached
> to maps. In the case of sock_hash_ctx_update_elem(), these map entries
> are allocated outside of __sock_map_ctx_update_elem() with their
> linkage to the socket hash table filled. In the case of sock array,
> the map entries are allocated inside of __sock_map_ctx_update_elem()
> and added with their linkage to the psock->maps. Both additions are
> under psock->maps_lock each.
>
> Now, we drop these elements from their psock->maps list in a few
> occasions: i) in sock array via smap_list_map_remove() when an entry
> is either deleted from the map from user space, or updated via
> user space or BPF program where we drop the old socket at that map
> slot, or the sock array is freed via sock_map_free() and drops all
> its elements; ii) for sock hash via smap_list_hash_remove() in exactly
> the same occasions as just described for sock array; iii) in the
> bpf_tcp_close() where we remove the elements from the list via
> psock_map_pop() and iterate over them dropping themselves from either
> sock array or sock hash; and last but not least iv) once again in
> smap_gc_work() which is a callback for deferring the work once the
> psock refcount hit zero and thus the socket is being destroyed.
>
> Problem is that the only case where we kfree() the list entry is
> in case iv), which at that point should have an empty list in
> normal cases. So in cases from i) to iii) we unlink the elements
> without freeing where they go out of reach from us. Hence fix is
> to properly kfree() them as well to stop the leakage. Given these
> are all handled under psock->maps_lock there is no need for deferred
> RCU freeing.
>
> I later also ran with kmemleak detector and it confirmed the finding
> as well where in the state before the fix the object goes unreferenced
> while after the patch no kmemleak report related to BPF showed up.
>
> [...]
> unreferenced object 0xffff880378eadae0 (size 64):
> comm "test_sockmap", pid 2225, jiffies 4294720701 (age 43.504s)
> hex dump (first 32 bytes):
> 00 01 00 00 00 00 ad de 00 02 00 00 00 00 ad de ................
> 50 4d 75 5d 03 88 ff ff 00 00 00 00 00 00 00 00 PMu]............
> backtrace:
> [<000000005225ac3c>] sock_map_ctx_update_elem.isra.21+0xd8/0x210
> [<0000000045dd6d3c>] bpf_sock_map_update+0x29/0x60
> [<00000000877723aa>] ___bpf_prog_run+0x1e1f/0x4960
> [<000000002ef89e83>] 0xffffffffffffffff
> unreferenced object 0xffff880378ead240 (size 64):
> comm "test_sockmap", pid 2225, jiffies 4294720701 (age 43.504s)
> hex dump (first 32 bytes):
> 00 01 00 00 00 00 ad de 00 02 00 00 00 00 ad de ................
> 00 44 75 5d 03 88 ff ff 00 00 00 00 00 00 00 00 .Du]............
> backtrace:
> [<000000005225ac3c>] sock_map_ctx_update_elem.isra.21+0xd8/0x210
> [<0000000030e37a3a>] sock_map_update_elem+0x125/0x240
> [<000000002e5ce36e>] map_update_elem+0x4eb/0x7b0
> [<00000000db453cc9>] __x64_sys_bpf+0x1f9/0x360
> [<0000000000763660>] do_syscall_64+0x9a/0x300
> [<00000000422a2bb2>] entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [<000000002ef89e83>] 0xffffffffffffffff
> [...]
>
> Fixes: e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close")
> Fixes: 54fedb42c653 ("bpf: sockmap, fix smap_list_map_remove when psock is in many maps")
> Fixes: 2f857d04601a ("bpf: sockmap, remove STRPARSER map_flags and add multi-map support")
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Song Liu <songliubraving@fb.com>
> ---
> kernel/bpf/sockmap.c | 9 +++++++--
> 1 file changed, 7 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
> index 0c1a696..94a324b 100644
> --- a/kernel/bpf/sockmap.c
> +++ b/kernel/bpf/sockmap.c
> @@ -370,6 +370,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
> }
> raw_spin_unlock_bh(&b->lock);
> }
> + kfree(e);
> e = psock_map_pop(sk, psock);
> }
> rcu_read_unlock();
> @@ -1675,8 +1676,10 @@ static void smap_list_map_remove(struct smap_psock *psock,
>
> spin_lock_bh(&psock->maps_lock);
> list_for_each_entry_safe(e, tmp, &psock->maps, list) {
> - if (e->entry == entry)
> + if (e->entry == entry) {
> list_del(&e->list);
> + kfree(e);
> + }
> }
> spin_unlock_bh(&psock->maps_lock);
> }
> @@ -1690,8 +1693,10 @@ static void smap_list_hash_remove(struct smap_psock *psock,
> list_for_each_entry_safe(e, tmp, &psock->maps, list) {
> struct htab_elem *c = rcu_dereference(e->hash_link);
>
> - if (c == hash_link)
> + if (c == hash_link) {
> list_del(&e->list);
> + kfree(e);
> + }
> }
> spin_unlock_bh(&psock->maps_lock);
> }
> --
> 2.9.5
>
^ permalink raw reply
* Re: [PATCH bpf 2/5] tcp, ulp: fix leftover icsk_ulp_ops preventing sock from reattach
From: Song Liu @ 2018-08-16 21:26 UTC (permalink / raw)
To: Daniel Borkmann; +Cc: Alexei Starovoitov, John Fastabend, Networking
In-Reply-To: <20180816194910.9040-3-daniel@iogearbox.net>
On Thu, Aug 16, 2018 at 12:49 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> I found that in BPF sockmap programs once we either delete a socket
> from the map or we updated a map slot and the old socket was purged
> from the map that these socket can never get reattached into a map
> even though their related psock has been dropped entirely at that
> point.
>
> Reason is that tcp_cleanup_ulp() leaves the old icsk->icsk_ulp_ops
> intact, so that on the next tcp_set_ulp_id() the kernel returns an
> -EEXIST thinking there is still some active ULP attached.
>
> BPF sockmap is the only one that has this issue as the other user,
> kTLS, only calls tcp_cleanup_ulp() from tcp_v4_destroy_sock() whereas
> sockmap semantics allow dropping the socket from the map with all
> related psock state being cleaned up.
>
> Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks")
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Song Liu <songliubraving@fb.com>
> ---
> net/ipv4/tcp_ulp.c | 2 ++
> 1 file changed, 2 insertions(+)
>
> diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
> index 7dd44b6..a5995bb 100644
> --- a/net/ipv4/tcp_ulp.c
> +++ b/net/ipv4/tcp_ulp.c
> @@ -129,6 +129,8 @@ void tcp_cleanup_ulp(struct sock *sk)
> if (icsk->icsk_ulp_ops->release)
> icsk->icsk_ulp_ops->release(sk);
> module_put(icsk->icsk_ulp_ops->owner);
> +
> + icsk->icsk_ulp_ops = NULL;
> }
>
> /* Change upper layer protocol for socket */
> --
> 2.9.5
>
^ permalink raw reply
* Re: [PATCH bpf 1/5] tcp, ulp: add alias for all ulp modules
From: Song Liu @ 2018-08-16 21:25 UTC (permalink / raw)
To: Daniel Borkmann; +Cc: Alexei Starovoitov, John Fastabend, Networking
In-Reply-To: <20180816194910.9040-2-daniel@iogearbox.net>
On Thu, Aug 16, 2018 at 12:49 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> Lets not turn the TCP ULP lookup into an arbitrary module loader as
> we only intend to load ULP modules through this mechanism, not other
> unrelated kernel modules:
>
> [root@bar]# cat foo.c
> #include <sys/types.h>
> #include <sys/socket.h>
> #include <linux/tcp.h>
> #include <linux/in.h>
>
> int main(void)
> {
> int sock = socket(PF_INET, SOCK_STREAM, 0);
> setsockopt(sock, IPPROTO_TCP, TCP_ULP, "sctp", sizeof("sctp"));
> return 0;
> }
>
> [root@bar]# gcc foo.c -O2 -Wall
> [root@bar]# lsmod | grep sctp
> [root@bar]# ./a.out
> [root@bar]# lsmod | grep sctp
> sctp 1077248 4
> libcrc32c 16384 3 nf_conntrack,nf_nat,sctp
> [root@bar]#
>
> Fix it by adding module alias to TCP ULP modules, so probing module
> via request_module() will be limited to tcp-ulp-[name]. The existing
> modules like kTLS will load fine given tcp-ulp-tls alias, but others
> will fail to load:
>
> [root@bar]# lsmod | grep sctp
> [root@bar]# ./a.out
> [root@bar]# lsmod | grep sctp
> [root@bar]#
>
> Sockmap is not affected from this since it's either built-in or not.
>
> Fixes: 734942cc4ea6 ("tcp: ULP infrastructure")
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Song Liu <songliubraving@fb.com>
> ---
> include/net/tcp.h | 4 ++++
> net/ipv4/tcp_ulp.c | 2 +-
> net/tls/tls_main.c | 1 +
> 3 files changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index d196901..770917d 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -2065,6 +2065,10 @@ int tcp_set_ulp_id(struct sock *sk, const int ulp);
> void tcp_get_available_ulp(char *buf, size_t len);
> void tcp_cleanup_ulp(struct sock *sk);
>
> +#define MODULE_ALIAS_TCP_ULP(name) \
> + __MODULE_INFO(alias, alias_userspace, name); \
> + __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name)
> +
> /* Call BPF_SOCK_OPS program that returns an int. If the return value
> * is < 0, then the BPF op failed (for example if the loaded BPF
> * program does not support the chosen operation or there is no BPF
> diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
> index 622caa4..7dd44b6 100644
> --- a/net/ipv4/tcp_ulp.c
> +++ b/net/ipv4/tcp_ulp.c
> @@ -51,7 +51,7 @@ static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
> #ifdef CONFIG_MODULES
> if (!ulp && capable(CAP_NET_ADMIN)) {
> rcu_read_unlock();
> - request_module("%s", name);
> + request_module("tcp-ulp-%s", name);
> rcu_read_lock();
> ulp = tcp_ulp_find(name);
> }
> diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
> index b09867c..93c0c22 100644
> --- a/net/tls/tls_main.c
> +++ b/net/tls/tls_main.c
> @@ -45,6 +45,7 @@
> MODULE_AUTHOR("Mellanox Technologies");
> MODULE_DESCRIPTION("Transport Layer Security Support");
> MODULE_LICENSE("Dual BSD/GPL");
> +MODULE_ALIAS_TCP_ULP("tls");
>
> enum {
> TLSV4,
> --
> 2.9.5
>
^ permalink raw reply
* Re: [Intel-wired-lan] [PATCH next-queue 0/8] ixgbe/ixgbevf: IPsec offload support for VFs
From: Alexander Duyck @ 2018-08-16 21:15 UTC (permalink / raw)
To: Shannon Nelson; +Cc: intel-wired-lan, Jeff Kirsher, Steffen Klassert, Netdev
In-Reply-To: <5ff4527d-7557-10f4-f41e-3e618f0e5863@oracle.com>
On Tue, Aug 14, 2018 at 10:10 AM Shannon Nelson
<shannon.nelson@oracle.com> wrote:
>
> On 8/14/2018 8:30 AM, Alexander Duyck wrote:
> > On Mon, Aug 13, 2018 at 11:43 AM Shannon Nelson
> > <shannon.nelson@oracle.com> wrote:
> >>
> >> This set of patches implements IPsec hardware offload for VF devices in
> >> Intel's 10Gbe x540 family of Ethernet devices.
>
> [...]
>
> >
> > So the one question I would have about this patch set is what happens
> > if you are setting up a ipsec connection between the PF and one of the
> > VFs on the same port/function? Do the ipsec offloads get translated
> > across the Tx loopback or do they end up causing issues? Specifically
> > I would be interested in seeing the results of a test either between
> > two VFs, or the PF and one of the VFs on the same port.
> >
> > - Alex
> >
>
> There is definitely something funky in the internal switch connection,
> as messages going from PF to VF with an offloaded encryption don't seem
> to get received by the VF, at least when in a VEB setup. If I only set
> up offloads on the Rx on both PF and VF, and don't offload the Tx, then
> things work.
>
> I don't have a setup to test this, but I suspect that in a VEPA
> configuration, with packets going out to the switch and turned around
> back in, the Tx encryption offload would happen as expected.
>
> sln
We should probably look at adding at least one patch to the set then
that disables IPsec Tx offload if SR-IOV is enabled with VEB so that
we don't end up breaking connections should a VF be migrated from a
remote system to a local one that it is connected to.
- Alex
^ permalink raw reply
* [PATCH] Revert "net/smc: Replace ib_query_gid with rdma_get_gid_attr"
From: Jason Gunthorpe @ 2018-08-16 20:31 UTC (permalink / raw)
To: Parav Pandit, Leon Romanovsky, Ursula Braun; +Cc: linux-rdma, netdev
This reverts commit ddb457c6993babbcdd41fca638b870d2a2fc3941.
The include rdma/ib_cache.h is kept, and we have to add a memset
to the compat wrapper to avoid compiler warnings in gcc-7
This revert is done to avoid extensive merge conflicts with SMC
changes in netdev during the 4.19 merge window.
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
include/rdma/ib_cache.h | 1 +
net/smc/smc_core.c | 19 ++++++++++---------
net/smc/smc_ib.c | 24 ++++++++++--------------
3 files changed, 21 insertions(+), 23 deletions(-)
As discussed before, the above patch to SMC in the rdma.git causes too
many merge conflicts, I am reverting it prior to sending the pull
request for RDMA and instead relying on the ibv_query_gid() compat
wrapper that has been in linux-next for some time.
Parav, please respin this patch against this branch:
https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/log/?h=wip/jgg-for-next
Thanks,
Jason
diff --git a/include/rdma/ib_cache.h b/include/rdma/ib_cache.h
index a4ce441f36f0ad..3e11e7cc60b745 100644
--- a/include/rdma/ib_cache.h
+++ b/include/rdma/ib_cache.h
@@ -143,6 +143,7 @@ static inline __deprecated int ib_query_gid(struct ib_device *device,
{
const struct ib_gid_attr *attr;
+ memset(attr_out, 0, sizeof(*attr_out));
attr = rdma_get_gid_attr(device, port_num, index);
if (IS_ERR(attr))
return PTR_ERR(attr);
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index d99a75f75e42be..15bad268f37d8b 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -451,7 +451,8 @@ static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
static int smc_link_determine_gid(struct smc_link_group *lgr)
{
struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
- const struct ib_gid_attr *gattr;
+ struct ib_gid_attr gattr;
+ union ib_gid gid;
int i;
if (!lgr->vlan_id) {
@@ -461,18 +462,18 @@ static int smc_link_determine_gid(struct smc_link_group *lgr)
for (i = 0; i < lnk->smcibdev->pattr[lnk->ibport - 1].gid_tbl_len;
i++) {
- gattr = rdma_get_gid_attr(lnk->smcibdev->ibdev, lnk->ibport, i);
- if (IS_ERR(gattr))
+ if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid,
+ &gattr))
continue;
- if (gattr->ndev) {
- if (is_vlan_dev(gattr->ndev) &&
- vlan_dev_vlan_id(gattr->ndev) == lgr->vlan_id) {
- lnk->gid = gattr->gid;
- rdma_put_gid_attr(gattr);
+ if (gattr.ndev) {
+ if (is_vlan_dev(gattr.ndev) &&
+ vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id) {
+ lnk->gid = gid;
+ dev_put(gattr.ndev);
return 0;
}
+ dev_put(gattr.ndev);
}
- rdma_put_gid_attr(gattr);
}
return -ENODEV;
}
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 74f29f814ec1f9..117b05f1a49475 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -373,21 +373,17 @@ void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev,
static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport)
{
- const struct ib_gid_attr *gattr;
- int rc = 0;
+ struct ib_gid_attr gattr;
+ int rc;
- gattr = rdma_get_gid_attr(smcibdev->ibdev, ibport, 0);
- if (IS_ERR(gattr))
- return PTR_ERR(gattr);
- if (!gattr->ndev) {
- rc = -ENODEV;
- goto done;
- }
- smcibdev->gid[ibport - 1] = gattr->gid;
- memcpy(smcibdev->mac[ibport - 1], gattr->ndev->dev_addr, ETH_ALEN);
-done:
- rdma_put_gid_attr(gattr);
- return rc;
+ rc = ib_query_gid(smcibdev->ibdev, ibport, 0,
+ &smcibdev->gid[ibport - 1], &gattr);
+ if (rc || !gattr.ndev)
+ return -ENODEV;
+
+ memcpy(smcibdev->mac[ibport - 1], gattr.ndev->dev_addr, ETH_ALEN);
+ dev_put(gattr.ndev);
+ return 0;
}
/* Create an identifier unique for this instance of SMC-R.
--
2.18.0
^ permalink raw reply related
* [PATCH bpf 5/5] bpf, sockmap: fix sock_map_ctx_update_elem race with exist/noexist
From: Daniel Borkmann @ 2018-08-16 19:49 UTC (permalink / raw)
To: alexei.starovoitov; +Cc: john.fastabend, netdev, Daniel Borkmann
In-Reply-To: <20180816194910.9040-1-daniel@iogearbox.net>
The current code in sock_map_ctx_update_elem() allows for BPF_EXIST
and BPF_NOEXIST map update flags. While on array-like maps this approach
is rather uncommon, e.g. bpf_fd_array_map_update_elem() and others
enforce map update flags to be BPF_ANY such that xchg() can be used
directly, the current implementation in sock map does not guarantee
that such operation with BPF_EXIST / BPF_NOEXIST is atomic.
The initial test does a READ_ONCE(stab->sock_map[i]) to fetch the
socket from the slot which is then tested for NULL / non-NULL. However
later after __sock_map_ctx_update_elem(), the actual update is done
through osock = xchg(&stab->sock_map[i], sock). Problem is that in
the meantime a different CPU could have updated / deleted a socket
on that specific slot and thus flag contraints won't hold anymore.
I've been thinking whether best would be to just break UAPI and do
an enforcement of BPF_ANY to check if someone actually complains,
however trouble is that already in BPF kselftest we use BPF_NOEXIST
for the map update, and therefore it might have been copied into
applications already. The fix to keep the current behavior intact
would be to add a map lock similar to the sock hash bucket lock only
for covering the whole map.
Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
---
kernel/bpf/sockmap.c | 106 +++++++++++++++++++++++++++------------------------
1 file changed, 57 insertions(+), 49 deletions(-)
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 921cb6b..98e621a 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -58,6 +58,7 @@ struct bpf_stab {
struct bpf_map map;
struct sock **sock_map;
struct bpf_sock_progs progs;
+ raw_spinlock_t lock;
};
struct bucket {
@@ -89,9 +90,9 @@ enum smap_psock_state {
struct smap_psock_map_entry {
struct list_head list;
+ struct bpf_map *map;
struct sock **entry;
struct htab_elem __rcu *hash_link;
- struct bpf_htab __rcu *htab;
};
struct smap_psock {
@@ -343,13 +344,18 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
e = psock_map_pop(sk, psock);
while (e) {
if (e->entry) {
- osk = cmpxchg(e->entry, sk, NULL);
+ struct bpf_stab *stab = container_of(e->map, struct bpf_stab, map);
+
+ raw_spin_lock_bh(&stab->lock);
+ osk = *e->entry;
if (osk == sk) {
+ *e->entry = NULL;
smap_release_sock(psock, sk);
}
+ raw_spin_unlock_bh(&stab->lock);
} else {
struct htab_elem *link = rcu_dereference(e->hash_link);
- struct bpf_htab *htab = rcu_dereference(e->htab);
+ struct bpf_htab *htab = container_of(e->map, struct bpf_htab, map);
struct hlist_head *head;
struct htab_elem *l;
struct bucket *b;
@@ -1642,6 +1648,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&stab->map, attr);
+ raw_spin_lock_init(&stab->lock);
/* make sure page count doesn't overflow */
cost = (u64) stab->map.max_entries * sizeof(struct sock *);
@@ -1716,14 +1723,15 @@ static void sock_map_free(struct bpf_map *map)
* and a grace period expire to ensure psock is really safe to remove.
*/
rcu_read_lock();
+ raw_spin_lock_bh(&stab->lock);
for (i = 0; i < stab->map.max_entries; i++) {
struct smap_psock *psock;
struct sock *sock;
- sock = xchg(&stab->sock_map[i], NULL);
+ sock = stab->sock_map[i];
if (!sock)
continue;
-
+ stab->sock_map[i] = NULL;
psock = smap_psock_sk(sock);
/* This check handles a racing sock event that can get the
* sk_callback_lock before this case but after xchg happens
@@ -1735,6 +1743,7 @@ static void sock_map_free(struct bpf_map *map)
smap_release_sock(psock, sock);
}
}
+ raw_spin_unlock_bh(&stab->lock);
rcu_read_unlock();
sock_map_remove_complete(stab);
@@ -1778,14 +1787,16 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
if (k >= map->max_entries)
return -EINVAL;
- sock = xchg(&stab->sock_map[k], NULL);
+ raw_spin_lock_bh(&stab->lock);
+ sock = stab->sock_map[k];
+ stab->sock_map[k] = NULL;
+ raw_spin_unlock_bh(&stab->lock);
if (!sock)
return -EINVAL;
psock = smap_psock_sk(sock);
if (!psock)
- goto out;
-
+ return 0;
if (psock->bpf_parse) {
write_lock_bh(&sock->sk_callback_lock);
smap_stop_sock(psock, sock);
@@ -1793,7 +1804,6 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
}
smap_list_map_remove(psock, &stab->sock_map[k]);
smap_release_sock(psock, sock);
-out:
return 0;
}
@@ -1829,11 +1839,9 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
static int __sock_map_ctx_update_elem(struct bpf_map *map,
struct bpf_sock_progs *progs,
struct sock *sock,
- struct sock **map_link,
void *key)
{
struct bpf_prog *verdict, *parse, *tx_msg;
- struct smap_psock_map_entry *e = NULL;
struct smap_psock *psock;
bool new = false;
int err = 0;
@@ -1906,14 +1914,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
new = true;
}
- if (map_link) {
- e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
- if (!e) {
- err = -ENOMEM;
- goto out_free;
- }
- }
-
/* 3. At this point we have a reference to a valid psock that is
* running. Attach any BPF programs needed.
*/
@@ -1935,17 +1935,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
write_unlock_bh(&sock->sk_callback_lock);
}
- /* 4. Place psock in sockmap for use and stop any programs on
- * the old sock assuming its not the same sock we are replacing
- * it with. Because we can only have a single set of programs if
- * old_sock has a strp we can stop it.
- */
- if (map_link) {
- e->entry = map_link;
- spin_lock_bh(&psock->maps_lock);
- list_add_tail(&e->list, &psock->maps);
- spin_unlock_bh(&psock->maps_lock);
- }
return err;
out_free:
smap_release_sock(psock, sock);
@@ -1956,7 +1945,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
}
if (tx_msg)
bpf_prog_put(tx_msg);
- kfree(e);
return err;
}
@@ -1966,36 +1954,57 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
{
struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
struct bpf_sock_progs *progs = &stab->progs;
- struct sock *osock, *sock;
+ struct sock *osock, *sock = skops->sk;
+ struct smap_psock_map_entry *e;
+ struct smap_psock *psock;
u32 i = *(u32 *)key;
int err;
if (unlikely(flags > BPF_EXIST))
return -EINVAL;
-
if (unlikely(i >= stab->map.max_entries))
return -E2BIG;
- sock = READ_ONCE(stab->sock_map[i]);
- if (flags == BPF_EXIST && !sock)
- return -ENOENT;
- else if (flags == BPF_NOEXIST && sock)
- return -EEXIST;
+ e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
+ if (!e)
+ return -ENOMEM;
- sock = skops->sk;
- err = __sock_map_ctx_update_elem(map, progs, sock, &stab->sock_map[i],
- key);
+ err = __sock_map_ctx_update_elem(map, progs, sock, key);
if (err)
goto out;
- osock = xchg(&stab->sock_map[i], sock);
- if (osock) {
- struct smap_psock *opsock = smap_psock_sk(osock);
+ /* psock guaranteed to be present. */
+ psock = smap_psock_sk(sock);
+ raw_spin_lock_bh(&stab->lock);
+ osock = stab->sock_map[i];
+ if (osock && flags == BPF_NOEXIST) {
+ err = -EEXIST;
+ goto out_unlock;
+ }
+ if (!osock && flags == BPF_EXIST) {
+ err = -ENOENT;
+ goto out_unlock;
+ }
+
+ e->entry = &stab->sock_map[i];
+ e->map = map;
+ spin_lock_bh(&psock->maps_lock);
+ list_add_tail(&e->list, &psock->maps);
+ spin_unlock_bh(&psock->maps_lock);
- smap_list_map_remove(opsock, &stab->sock_map[i]);
- smap_release_sock(opsock, osock);
+ stab->sock_map[i] = sock;
+ if (osock) {
+ psock = smap_psock_sk(osock);
+ smap_list_map_remove(psock, &stab->sock_map[i]);
+ smap_release_sock(psock, osock);
}
+ raw_spin_unlock_bh(&stab->lock);
+ return 0;
+out_unlock:
+ smap_release_sock(psock, sock);
+ raw_spin_unlock_bh(&stab->lock);
out:
+ kfree(e);
return err;
}
@@ -2358,7 +2367,7 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
b = __select_bucket(htab, hash);
head = &b->head;
- err = __sock_map_ctx_update_elem(map, progs, sock, NULL, key);
+ err = __sock_map_ctx_update_elem(map, progs, sock, key);
if (err)
goto err;
@@ -2384,8 +2393,7 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
}
rcu_assign_pointer(e->hash_link, l_new);
- rcu_assign_pointer(e->htab,
- container_of(map, struct bpf_htab, map));
+ e->map = map;
spin_lock_bh(&psock->maps_lock);
list_add_tail(&e->list, &psock->maps);
spin_unlock_bh(&psock->maps_lock);
--
2.9.5
^ permalink raw reply related
* [PATCH bpf 4/5] bpf, sockmap: fix map elem deletion race with smap_stop_sock
From: Daniel Borkmann @ 2018-08-16 19:49 UTC (permalink / raw)
To: alexei.starovoitov; +Cc: john.fastabend, netdev, Daniel Borkmann
In-Reply-To: <20180816194910.9040-1-daniel@iogearbox.net>
The smap_start_sock() and smap_stop_sock() are each protected under
the sock->sk_callback_lock from their call-sites except in the case
of sock_map_delete_elem() where we drop the old socket from the map
slot. This is racy because the same sock could be part of multiple
sock maps, so we run smap_stop_sock() in parallel, and given at that
point psock->strp_enabled might be true on both CPUs, we might for
example wrongly restore the sk->sk_data_ready / sk->sk_write_space.
Therefore, hold the sock->sk_callback_lock as well on delete. Looks
like 2f857d04601a ("bpf: sockmap, remove STRPARSER map_flags and add
multi-map support") had this right, but later on e9db4ef6bf4c ("bpf:
sockhash fix omitted bucket lock in sock_close") removed it again
from delete leaving this smap_stop_sock() instance unprotected.
Fixes: e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
---
kernel/bpf/sockmap.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 94a324b..921cb6b 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1786,8 +1786,11 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
if (!psock)
goto out;
- if (psock->bpf_parse)
+ if (psock->bpf_parse) {
+ write_lock_bh(&sock->sk_callback_lock);
smap_stop_sock(psock, sock);
+ write_unlock_bh(&sock->sk_callback_lock);
+ }
smap_list_map_remove(psock, &stab->sock_map[k]);
smap_release_sock(psock, sock);
out:
--
2.9.5
^ permalink raw reply related
* [PATCH bpf 1/5] tcp, ulp: add alias for all ulp modules
From: Daniel Borkmann @ 2018-08-16 19:49 UTC (permalink / raw)
To: alexei.starovoitov; +Cc: john.fastabend, netdev, Daniel Borkmann
In-Reply-To: <20180816194910.9040-1-daniel@iogearbox.net>
Lets not turn the TCP ULP lookup into an arbitrary module loader as
we only intend to load ULP modules through this mechanism, not other
unrelated kernel modules:
[root@bar]# cat foo.c
#include <sys/types.h>
#include <sys/socket.h>
#include <linux/tcp.h>
#include <linux/in.h>
int main(void)
{
int sock = socket(PF_INET, SOCK_STREAM, 0);
setsockopt(sock, IPPROTO_TCP, TCP_ULP, "sctp", sizeof("sctp"));
return 0;
}
[root@bar]# gcc foo.c -O2 -Wall
[root@bar]# lsmod | grep sctp
[root@bar]# ./a.out
[root@bar]# lsmod | grep sctp
sctp 1077248 4
libcrc32c 16384 3 nf_conntrack,nf_nat,sctp
[root@bar]#
Fix it by adding module alias to TCP ULP modules, so probing module
via request_module() will be limited to tcp-ulp-[name]. The existing
modules like kTLS will load fine given tcp-ulp-tls alias, but others
will fail to load:
[root@bar]# lsmod | grep sctp
[root@bar]# ./a.out
[root@bar]# lsmod | grep sctp
[root@bar]#
Sockmap is not affected from this since it's either built-in or not.
Fixes: 734942cc4ea6 ("tcp: ULP infrastructure")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
---
include/net/tcp.h | 4 ++++
net/ipv4/tcp_ulp.c | 2 +-
net/tls/tls_main.c | 1 +
3 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index d196901..770917d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2065,6 +2065,10 @@ int tcp_set_ulp_id(struct sock *sk, const int ulp);
void tcp_get_available_ulp(char *buf, size_t len);
void tcp_cleanup_ulp(struct sock *sk);
+#define MODULE_ALIAS_TCP_ULP(name) \
+ __MODULE_INFO(alias, alias_userspace, name); \
+ __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name)
+
/* Call BPF_SOCK_OPS program that returns an int. If the return value
* is < 0, then the BPF op failed (for example if the loaded BPF
* program does not support the chosen operation or there is no BPF
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
index 622caa4..7dd44b6 100644
--- a/net/ipv4/tcp_ulp.c
+++ b/net/ipv4/tcp_ulp.c
@@ -51,7 +51,7 @@ static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
#ifdef CONFIG_MODULES
if (!ulp && capable(CAP_NET_ADMIN)) {
rcu_read_unlock();
- request_module("%s", name);
+ request_module("tcp-ulp-%s", name);
rcu_read_lock();
ulp = tcp_ulp_find(name);
}
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index b09867c..93c0c22 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -45,6 +45,7 @@
MODULE_AUTHOR("Mellanox Technologies");
MODULE_DESCRIPTION("Transport Layer Security Support");
MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS_TCP_ULP("tls");
enum {
TLSV4,
--
2.9.5
^ permalink raw reply related
* [PATCH bpf 3/5] bpf, sockmap: fix leakage of smap_psock_map_entry
From: Daniel Borkmann @ 2018-08-16 19:49 UTC (permalink / raw)
To: alexei.starovoitov; +Cc: john.fastabend, netdev, Daniel Borkmann
In-Reply-To: <20180816194910.9040-1-daniel@iogearbox.net>
While working on sockmap I noticed that we do not always kfree the
struct smap_psock_map_entry list elements which track psocks attached
to maps. In the case of sock_hash_ctx_update_elem(), these map entries
are allocated outside of __sock_map_ctx_update_elem() with their
linkage to the socket hash table filled. In the case of sock array,
the map entries are allocated inside of __sock_map_ctx_update_elem()
and added with their linkage to the psock->maps. Both additions are
under psock->maps_lock each.
Now, we drop these elements from their psock->maps list in a few
occasions: i) in sock array via smap_list_map_remove() when an entry
is either deleted from the map from user space, or updated via
user space or BPF program where we drop the old socket at that map
slot, or the sock array is freed via sock_map_free() and drops all
its elements; ii) for sock hash via smap_list_hash_remove() in exactly
the same occasions as just described for sock array; iii) in the
bpf_tcp_close() where we remove the elements from the list via
psock_map_pop() and iterate over them dropping themselves from either
sock array or sock hash; and last but not least iv) once again in
smap_gc_work() which is a callback for deferring the work once the
psock refcount hit zero and thus the socket is being destroyed.
Problem is that the only case where we kfree() the list entry is
in case iv), which at that point should have an empty list in
normal cases. So in cases from i) to iii) we unlink the elements
without freeing where they go out of reach from us. Hence fix is
to properly kfree() them as well to stop the leakage. Given these
are all handled under psock->maps_lock there is no need for deferred
RCU freeing.
I later also ran with kmemleak detector and it confirmed the finding
as well where in the state before the fix the object goes unreferenced
while after the patch no kmemleak report related to BPF showed up.
[...]
unreferenced object 0xffff880378eadae0 (size 64):
comm "test_sockmap", pid 2225, jiffies 4294720701 (age 43.504s)
hex dump (first 32 bytes):
00 01 00 00 00 00 ad de 00 02 00 00 00 00 ad de ................
50 4d 75 5d 03 88 ff ff 00 00 00 00 00 00 00 00 PMu]............
backtrace:
[<000000005225ac3c>] sock_map_ctx_update_elem.isra.21+0xd8/0x210
[<0000000045dd6d3c>] bpf_sock_map_update+0x29/0x60
[<00000000877723aa>] ___bpf_prog_run+0x1e1f/0x4960
[<000000002ef89e83>] 0xffffffffffffffff
unreferenced object 0xffff880378ead240 (size 64):
comm "test_sockmap", pid 2225, jiffies 4294720701 (age 43.504s)
hex dump (first 32 bytes):
00 01 00 00 00 00 ad de 00 02 00 00 00 00 ad de ................
00 44 75 5d 03 88 ff ff 00 00 00 00 00 00 00 00 .Du]............
backtrace:
[<000000005225ac3c>] sock_map_ctx_update_elem.isra.21+0xd8/0x210
[<0000000030e37a3a>] sock_map_update_elem+0x125/0x240
[<000000002e5ce36e>] map_update_elem+0x4eb/0x7b0
[<00000000db453cc9>] __x64_sys_bpf+0x1f9/0x360
[<0000000000763660>] do_syscall_64+0x9a/0x300
[<00000000422a2bb2>] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[<000000002ef89e83>] 0xffffffffffffffff
[...]
Fixes: e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close")
Fixes: 54fedb42c653 ("bpf: sockmap, fix smap_list_map_remove when psock is in many maps")
Fixes: 2f857d04601a ("bpf: sockmap, remove STRPARSER map_flags and add multi-map support")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
---
kernel/bpf/sockmap.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 0c1a696..94a324b 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -370,6 +370,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
}
raw_spin_unlock_bh(&b->lock);
}
+ kfree(e);
e = psock_map_pop(sk, psock);
}
rcu_read_unlock();
@@ -1675,8 +1676,10 @@ static void smap_list_map_remove(struct smap_psock *psock,
spin_lock_bh(&psock->maps_lock);
list_for_each_entry_safe(e, tmp, &psock->maps, list) {
- if (e->entry == entry)
+ if (e->entry == entry) {
list_del(&e->list);
+ kfree(e);
+ }
}
spin_unlock_bh(&psock->maps_lock);
}
@@ -1690,8 +1693,10 @@ static void smap_list_hash_remove(struct smap_psock *psock,
list_for_each_entry_safe(e, tmp, &psock->maps, list) {
struct htab_elem *c = rcu_dereference(e->hash_link);
- if (c == hash_link)
+ if (c == hash_link) {
list_del(&e->list);
+ kfree(e);
+ }
}
spin_unlock_bh(&psock->maps_lock);
}
--
2.9.5
^ permalink raw reply related
* [PATCH bpf 2/5] tcp, ulp: fix leftover icsk_ulp_ops preventing sock from reattach
From: Daniel Borkmann @ 2018-08-16 19:49 UTC (permalink / raw)
To: alexei.starovoitov; +Cc: john.fastabend, netdev, Daniel Borkmann
In-Reply-To: <20180816194910.9040-1-daniel@iogearbox.net>
I found that in BPF sockmap programs once we either delete a socket
from the map or we updated a map slot and the old socket was purged
from the map that these socket can never get reattached into a map
even though their related psock has been dropped entirely at that
point.
Reason is that tcp_cleanup_ulp() leaves the old icsk->icsk_ulp_ops
intact, so that on the next tcp_set_ulp_id() the kernel returns an
-EEXIST thinking there is still some active ULP attached.
BPF sockmap is the only one that has this issue as the other user,
kTLS, only calls tcp_cleanup_ulp() from tcp_v4_destroy_sock() whereas
sockmap semantics allow dropping the socket from the map with all
related psock state being cleaned up.
Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
---
net/ipv4/tcp_ulp.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
index 7dd44b6..a5995bb 100644
--- a/net/ipv4/tcp_ulp.c
+++ b/net/ipv4/tcp_ulp.c
@@ -129,6 +129,8 @@ void tcp_cleanup_ulp(struct sock *sk)
if (icsk->icsk_ulp_ops->release)
icsk->icsk_ulp_ops->release(sk);
module_put(icsk->icsk_ulp_ops->owner);
+
+ icsk->icsk_ulp_ops = NULL;
}
/* Change upper layer protocol for socket */
--
2.9.5
^ permalink raw reply related
* [PATCH bpf 0/5] BPF sockmap and ulp fixes
From: Daniel Borkmann @ 2018-08-16 19:49 UTC (permalink / raw)
To: alexei.starovoitov; +Cc: john.fastabend, netdev, Daniel Borkmann
Batch of various fixes related to BPF sockmap and ULP, including
adding module alias to restrict module requests, races and memory
leaks in sockmap code. For details please refer to the individual
patches. Thanks!
Daniel Borkmann (5):
tcp, ulp: add alias for all ulp modules
tcp, ulp: fix leftover icsk_ulp_ops preventing sock from reattach
bpf, sockmap: fix leakage of smap_psock_map_entry
bpf, sockmap: fix map elem deletion race with smap_stop_sock
bpf, sockmap: fix sock_map_ctx_update_elem race with exist/noexist
include/net/tcp.h | 4 ++
kernel/bpf/sockmap.c | 120 +++++++++++++++++++++++++++++----------------------
net/ipv4/tcp_ulp.c | 4 +-
net/tls/tls_main.c | 1 +
4 files changed, 76 insertions(+), 53 deletions(-)
--
2.9.5
^ permalink raw reply
* Re: phylib: Any PHY which reports link up during autoneg ?
From: Heiner Kallweit @ 2018-08-16 19:46 UTC (permalink / raw)
To: Florian Fainelli, Andrew Lunn; +Cc: netdev@vger.kernel.org
In-Reply-To: <45f80146-f041-d6e6-a78b-b997456e03e1@gmail.com>
On 16.08.2018 21:21, Florian Fainelli wrote:
> On 08/16/2018 12:15 PM, Heiner Kallweit wrote:
>> When reading through the state machine code in phy.c I wondered whether
>> there is any PHY which reports the link as up during autonegotiation.
>> (It's about handling PHY_AN in the state machine, once we know the link
>> is up we still check whether aneg was completed. Is this needed?)
>>
>> At least the PHY's I have access to all report the link as down when
>> autonegotiating.
>> I checked also clause 22 of 802.3u, however found no clear definition
>> when a link should be considered up. There it's stated that it's
>> PHY-dependent.
>>
>> Can you shed any light on this?
>
> I think the answer is no, there are no PHYs that can report a link UP
> during auto-negotiation that is, between the time you ask for an
> auto-negotiation to occur/restart, and the PHY reporting that it is
> done. By definition, auto-negotiation needs to figure out the common
> denominator between link partners, if, and only if that process
> completes (ANEG done), then the link should be UP.
>
> If the link was reported UP during auto-negotiation, it would not be
> possible to be deterministic about:
>
> - which link parameters changed between negotiation attempts
> - whether the link is DOWN due to a physical disconnection or a failure
> to agree on parameters
>
Thanks for the clarification.
My confusion was mainly about the case of an autoneg restart if there's
an established connection. Because autoneg parameters are exchanged
by sideband signaling (link pulses), normal traffic may not be affected.
Except autoneg requires the rx/tx units to switch to some autoneg
processing mode.
Of course at one point in time, when both link partners have agreed
to a parameter set, they have to switch the mode, resulting in at
least a small window where link is down.
^ permalink raw reply
* Re: [PATCH v3] net/mlx5e: Delete unneeded function argument
From: David Miller @ 2018-08-16 19:28 UTC (permalink / raw)
To: yuval.shaia; +Cc: saeedm, leon, netdev, linux-rdma
In-Reply-To: <20180816090220.4537-1-yuval.shaia@oracle.com>
From: Yuval Shaia <yuval.shaia@oracle.com>
Date: Thu, 16 Aug 2018 12:02:20 +0300
> priv argument is not used by the function, delete it.
>
> Fixes: a89842811ea98 ("net/mlx5e: Merge per priority stats groups")
> Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
> ---
> v1 -> v2:
> * Remove blank line as pointed by Leon.
>
> v2 -> v3:
> * Change prefix to mlx5e
Applied, thank you.
^ permalink raw reply
* Re: [PATCH] net: dsa: add support for ksz9897 ethernet switch
From: David Miller @ 2018-08-16 19:25 UTC (permalink / raw)
To: prabhakar.csengg
Cc: Woojung.Huh, UNGLinuxDriver, andrew, vivien.didelot, f.fainelli,
netdev, devicetree
In-Reply-To: <1534348283-12790-1-git-send-email-prabhakar.csengg@gmail.com>
From: Lad Prabhakar <prabhakar.csengg@gmail.com>
Date: Wed, 15 Aug 2018 16:51:23 +0100
> From: "Lad, Prabhakar" <prabhakar.csengg@gmail.com>
>
> ksz9477 is superset of ksz9xx series, driver just works
> out of the box for ksz9897 chip with this patch.
>
> Signed-off-by: Lad, Prabhakar <prabhakar.csengg@gmail.com>
Since this is just adding chip IDs and such, this can go in now.
Applied, thanks.
^ permalink raw reply
* Re: [PATCH v3 net-next] veth: Free queues on link delete
From: David Miller @ 2018-08-16 19:22 UTC (permalink / raw)
To: makita.toshiaki; +Cc: netdev, dsahern, dsahern
In-Reply-To: <1534320449-2433-1-git-send-email-makita.toshiaki@lab.ntt.co.jp>
From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Date: Wed, 15 Aug 2018 17:07:29 +0900
> David Ahern reported memory leak in veth.
...
> veth_rq allocated in veth_newlink() was not freed on dellink.
>
> We need to free up them after veth_close() so that any packets will not
> reference the queues afterwards. Thus free them in veth_dev_free() in
> the same way as freeing stats structure (vstats).
>
> Also move queues allocation to veth_dev_init() to be in line with stats
> allocation.
>
> Fixes: 638264dc90227 ("veth: Support per queue XDP ring")
> Reported-by: David Ahern <dsahern@gmail.com>
> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Applied, thank you.
^ permalink raw reply
* Re: phylib: Any PHY which reports link up during autoneg ?
From: Florian Fainelli @ 2018-08-16 19:21 UTC (permalink / raw)
To: Heiner Kallweit, Andrew Lunn; +Cc: netdev@vger.kernel.org
In-Reply-To: <9996748f-c274-e1de-6a5a-962f2035d5c2@gmail.com>
On 08/16/2018 12:15 PM, Heiner Kallweit wrote:
> When reading through the state machine code in phy.c I wondered whether
> there is any PHY which reports the link as up during autonegotiation.
> (It's about handling PHY_AN in the state machine, once we know the link
> is up we still check whether aneg was completed. Is this needed?)
>
> At least the PHY's I have access to all report the link as down when
> autonegotiating.
> I checked also clause 22 of 802.3u, however found no clear definition
> when a link should be considered up. There it's stated that it's
> PHY-dependent.
>
> Can you shed any light on this?
I think the answer is no, there are no PHYs that can report a link UP
during auto-negotiation that is, between the time you ask for an
auto-negotiation to occur/restart, and the PHY reporting that it is
done. By definition, auto-negotiation needs to figure out the common
denominator between link partners, if, and only if that process
completes (ANEG done), then the link should be UP.
If the link was reported UP during auto-negotiation, it would not be
possible to be deterministic about:
- which link parameters changed between negotiation attempts
- whether the link is DOWN due to a physical disconnection or a failure
to agree on parameters
--
Florian
^ permalink raw reply
* Re: [PATCH] r8169: don't use MSI-X on RTL8106e
From: Heiner Kallweit @ 2018-08-16 19:18 UTC (permalink / raw)
To: Jian-Hong Pan, David Miller; +Cc: netdev@vger.kernel.org
In-Reply-To: <ff5ea624-0bd5-ef9e-9c02-deb4c1de601b@gmail.com>
On 16.08.2018 20:59, Heiner Kallweit wrote:
>> From: Jian-Hong Pan <jian-hong@endlessm.com>
>>
>> Found the ethernet network on ASUS X441UAR doesn't come back on resume
>> from suspend when using MSI-X. The chip is RTL8106e - version 39.
>>
> The patch itself looks good, just the commit message is wrong in one
> place and a little bit long.
>
Patch should also be annotated "net", and it misses a "Fixes" tag.
>> asus@endless:~$ dmesg | grep r8169
>> [ 21.848357] libphy: r8169: probed
>> [ 21.848473] r8169 0000:02:00.0 eth0: RTL8106e, 0c:9d:92:32:67:b4, XID
>> 44900000, IRQ 127
>> [ 22.518860] r8169 0000:02:00.0 enp2s0: renamed from eth0
>> [ 29.458041] Generic PHY r8169-200:00: attached PHY driver [Generic
>> PHY] (mii_bus:phy_addr=r8169-200:00, irq=IGNORE)
>> [ 63.227398] r8169 0000:02:00.0 enp2s0: Link is Up - 100Mbps/Full -
>> flow control off
>> [ 124.514648] Generic PHY r8169-200:00: attached PHY driver [Generic
>> PHY] (mii_bus:phy_addr=r8169-200:00, irq=IGNORE)
>>
>> Here is the ethernet controller in detail:
>>
>> asus@endless:~$ sudo lspci -nnvs 02:00.0
>> [sudo] password for asus:
>> 02:00.0 Ethernet controller [0200]: Realtek Semiconductor Co., Ltd.
>> RTL8101/2/6E PCI Express Fast/Gigabit Ethernet controller [10ec:8136]
>> (rev 07)
>> Subsystem: ASUSTeK Computer Inc. RTL810xE PCI Express Fast
>> Ethernet controller [1043:200f]
>> Flags: bus master, fast devsel, latency 0, IRQ 16
>> I/O ports at e000 [size=256]
>> Memory at ef100000 (64-bit, non-prefetchable) [size=4K]
>> Memory at e0000000 (64-bit, prefetchable) [size=16K]
>> Capabilities: [40] Power Management version 3
>> Capabilities: [50] MSI: Enable- Count=1/1 Maskable- 64bit+
>> Capabilities: [70] Express Endpoint, MSI 01
>> Capabilities: [b0] MSI-X: Enable+ Count=4 Masked-
>> Capabilities: [d0] Vital Product Data
>> Capabilities: [100] Advanced Error Reporting
>> Capabilities: [140] Virtual Channel
>> Capabilities: [160] Device Serial Number 01-00-00-00-36-4c-e0-00
>> Capabilities: [170] Latency Tolerance Reporting
>> Kernel driver in use: r8169
>> Kernel modules: r8169
>>
>> Here is the system interrupt table:
>>
>> asus@endless:~$ cat /proc/interrupts
>> CPU0 CPU1 CPU2 CPU3
>> 0: 22 0 0 0 IO-APIC 2-edge
>> timer
>> 1: 157 42 0 0 IO-APIC 1-edge
>> i8042
>> 8: 0 0 1 0 IO-APIC 8-edge
>> rtc0
>> 9: 10 13 0 0 IO-APIC 9-fasteoi
>> acpi
>> 16: 0 0 0 0 IO-APIC 16-fasteoi
>> i2c_designware.0, i801_smbus
>> 17: 2445 0 3453 0 IO-APIC 17-fasteoi
>> i2c_designware.1, rtl_pci
>> 109: 2 0 0 1 IO-APIC 109-fasteoi
>> FTE1200:00
>> 120: 0 0 0 0 PCI-MSI 458752-edge
>> PCIe PME
>> 121: 0 0 0 0 PCI-MSI 466944-edge
>> PCIe PME
>> 122: 0 0 0 0 PCI-MSI 468992-edge
>> PCIe PME
>> 123: 1465 0 0 21263 PCI-MSI 376832-edge
>> ahci[0000:00:17.0]
>> 124: 0 530 0 0 PCI-MSI 327680-edge
>> xhci_hcd
>> 125: 5204 0 0 0 PCI-MSI 32768-edge
>> i915
>> 126: 0 0 149 0 PCI-MSI 514048-edge
>> snd_hda_intel:card0
>> 127: 0 0 337 0 PCI-MSI 1048576-edge
>> enp2s0
>> NMI: 0 0 0 0 Non-maskable
>> interrupts
>> LOC: 45049 39474 38978 46677 Local timer
>> interrupts
>> SPU: 0 0 0 0 Spurious interrupts
>> PMI: 0 0 0 0 Performance
>> monitoring interrupts
>> IWI: 619 8 0 1 IRQ work interrupts
>> RTR: 6 0 0 0 APIC ICR read
>> retries
>> RES: 4918 4436 3835 2943 Rescheduling
>> interrupts
>> CAL: 1399 1478 1598 1465 Function call
>> interrupts
>> TLB: 608 513 723 559 TLB shootdowns
>> TRM: 0 0 0 0 Thermal event
>> interrupts
>> THR: 0 0 0 0 Threshold APIC
>> interrupts
>> DFR: 0 0 0 0 Deferred Error APIC
>> interrupts
>> MCE: 0 0 0 0 Machine check
>> exceptions
>> MCP: 3 4 4 4 Machine check polls
>> ERR: 0
>> MIS: 0
>> PIN: 0 0 0 0 Posted-interrupt
>> notification event
>> NPI: 0 0 0 0 Nested
>> posted-interrupt event
>> PIW: 0 0 0 0 Posted-interrupt
>> wakeup event
>>
>> It is the IRQ 127 - PCI-MSI used by enp2s0. However, lspci lists MSI is
>> disabled and MSI-X is enabled which conflicts to the interrupt table.
>>
> Both types of interrupts, MSI and MSI-X, are listed with irq chip name
> "PCI-MSI", because MSI-X is treated as a sub-feature of MSI.
> Therefore the output of /proc/interrupts doesn't allow to tell whether
> a MSI or MSI-X interrupt is used, and as a consequence there is no such
> conflict.
> Indeed only lspci provides the information whether MSI or MSI-X is used.
>
>> Falling back to MSI fixes the issue.
>>
>> Here is the test result with this patch in dmesg:
>>
>> asus@endless:~$ dmesg | grep r8169
>> [ 22.017477] libphy: r8169: probed
>> [ 22.017735] r8169 0000:02:00.0 eth0: RTL8106e, 0c:9d:92:32:67:b4, XID
>> 44900000, IRQ 127
>> [ 22.041489] r8169 0000:02:00.0 enp2s0: renamed from eth0
>> [ 29.138312] Generic PHY r8169-200:00: attached PHY driver [Generic
>> PHY] (mii_bus:phy_addr=r8169-200:00, irq=IGNORE)
>> [ 30.927359] r8169 0000:02:00.0 enp2s0: Link is Up - 100Mbps/Full -
>> flow control off
>> [ 289.998077] r8169 0000:02:00.0 enp2s0: Link is Up - 100Mbps/Full -
>> flow control off
>> [ 290.508084] Generic PHY r8169-200:00: attached PHY driver [Generic
>> PHY] (mii_bus:phy_addr=r8169-200:00, irq=IGNORE)
>> [ 290.745690] r8169 0000:02:00.0 enp2s0: Link is Down
>> [ 292.367717] r8169 0000:02:00.0 enp2s0: Link is Up - 100Mbps/Full -
>> flow control off
>>
>> lspci lists MSI is enabled and MSI-X is disabled with this patch:
>>
>> asus@endless:~/linux-net$ sudo lspci -nnvs 02:00.0
>> [sudo] password for asus:
>> 02:00.0 Ethernet controller [0200]: Realtek Semiconductor Co., Ltd.
>> RTL8101/2/6E PCI Express Fast/Gigabit Ethernet controller [10ec:8136]
>> (rev 07)
>> Subsystem: ASUSTeK Computer Inc. RTL810xE PCI Express Fast
>> Ethernet controller [1043:200f]
>> Flags: bus master, fast devsel, latency 0, IRQ 127
>> I/O ports at e000 [size=256]
>> Memory at ef100000 (64-bit, non-prefetchable) [size=4K]
>> Memory at e0000000 (64-bit, prefetchable) [size=16K]
>> Capabilities: [40] Power Management version 3
>> Capabilities: [50] MSI: Enable+ Count=1/1 Maskable- 64bit+
>> Capabilities: [70] Express Endpoint, MSI 01
>> Capabilities: [b0] MSI-X: Enable- Count=4 Masked-
>> Capabilities: [d0] Vital Product Data
>> Capabilities: [100] Advanced Error Reporting
>> Capabilities: [140] Virtual Channel
>> Capabilities: [160] Device Serial Number 01-00-00-00-36-4c-e0-00
>> Capabilities: [170] Latency Tolerance Reporting
>> Kernel driver in use: r8169
>> Kernel modules: r8169
>>
>> Signed-off-by: Jian-Hong Pan <jian-hong@endlessm.com>
>> ---
>> drivers/net/ethernet/realtek/r8169.c | 9 ++++++---
>> 1 file changed, 6 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
>> index 0d9c3831838f..0efa977c422d 100644
>> --- a/drivers/net/ethernet/realtek/r8169.c
>> +++ b/drivers/net/ethernet/realtek/r8169.c
>> @@ -7071,17 +7071,20 @@ static int rtl_alloc_irq(struct rtl8169_private *tp)
>> {
>> unsigned int flags;
>>
>> - if (tp->mac_version <= RTL_GIGA_MAC_VER_06) {
>> + switch (tp->mac_version) {
>> + case RTL_GIGA_MAC_VER_01 ... RTL_GIGA_MAC_VER_06:
>> RTL_W8(tp, Cfg9346, Cfg9346_Unlock);
>> RTL_W8(tp, Config2, RTL_R8(tp, Config2) & ~MSIEnable);
>> RTL_W8(tp, Cfg9346, Cfg9346_Lock);
>> flags = PCI_IRQ_LEGACY;
>> - } else if (tp->mac_version == RTL_GIGA_MAC_VER_40) {
>> + break;
>> + case RTL_GIGA_MAC_VER_39 ... RTL_GIGA_MAC_VER_40:
>> /* This version was reported to have issues with resume
>> * from suspend when using MSI-X
>> */
>> flags = PCI_IRQ_LEGACY | PCI_IRQ_MSI;
>> - } else {
>> + break;
>> + default:
>> flags = PCI_IRQ_ALL_TYPES;
>> }
>>
>>
>
>
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox