Netdev List
 help / color / mirror / Atom feed
* [PATCH][next] ath10k: wmi: Use struct_size() helper in ath10k_wmi_alloc_skb()
From: Gustavo A. R. Silva @ 2020-06-16 22:51 UTC (permalink / raw)
  To: Kalle Valo, David S. Miller, Jakub Kicinski
  Cc: ath10k, linux-wireless, netdev, linux-kernel, Gustavo A. R. Silva

Make use of the struct_size() helper instead of an open-coded version
in order to avoid any potential type mistakes. Also, remove unnecessary
variable _len_.

This code was detected with the help of Coccinelle and, audited and
fixed manually.

Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
---
 drivers/net/wireless/ath/ath10k/wmi.c | 32 +++++++--------------------
 1 file changed, 8 insertions(+), 24 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c
index a81a1ab2de19..b89681394a15 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.c
+++ b/drivers/net/wireless/ath/ath10k/wmi.c
@@ -6551,7 +6551,7 @@ static struct sk_buff *ath10k_wmi_op_gen_init(struct ath10k *ar)
 	struct wmi_init_cmd *cmd;
 	struct sk_buff *buf;
 	struct wmi_resource_config config = {};
-	u32 len, val;
+	u32 val;
 
 	config.num_vdevs = __cpu_to_le32(TARGET_NUM_VDEVS);
 	config.num_peers = __cpu_to_le32(TARGET_NUM_PEERS);
@@ -6603,10 +6603,7 @@ static struct sk_buff *ath10k_wmi_op_gen_init(struct ath10k *ar)
 	config.num_msdu_desc = __cpu_to_le32(TARGET_NUM_MSDU_DESC);
 	config.max_frag_entries = __cpu_to_le32(TARGET_MAX_FRAG_ENTRIES);
 
-	len = sizeof(*cmd) +
-	      (sizeof(struct host_memory_chunk) * ar->wmi.num_mem_chunks);
-
-	buf = ath10k_wmi_alloc_skb(ar, len);
+	buf = ath10k_wmi_alloc_skb(ar, struct_size(cmd, mem_chunks.items, ar->wmi.num_mem_chunks));
 	if (!buf)
 		return ERR_PTR(-ENOMEM);
 
@@ -6624,7 +6621,7 @@ static struct sk_buff *ath10k_wmi_10_1_op_gen_init(struct ath10k *ar)
 	struct wmi_init_cmd_10x *cmd;
 	struct sk_buff *buf;
 	struct wmi_resource_config_10x config = {};
-	u32 len, val;
+	u32 val;
 
 	config.num_vdevs = __cpu_to_le32(TARGET_10X_NUM_VDEVS);
 	config.num_peers = __cpu_to_le32(TARGET_10X_NUM_PEERS);
@@ -6668,10 +6665,7 @@ static struct sk_buff *ath10k_wmi_10_1_op_gen_init(struct ath10k *ar)
 	config.num_msdu_desc = __cpu_to_le32(TARGET_10X_NUM_MSDU_DESC);
 	config.max_frag_entries = __cpu_to_le32(TARGET_10X_MAX_FRAG_ENTRIES);
 
-	len = sizeof(*cmd) +
-	      (sizeof(struct host_memory_chunk) * ar->wmi.num_mem_chunks);
-
-	buf = ath10k_wmi_alloc_skb(ar, len);
+	buf = ath10k_wmi_alloc_skb(ar, struct_size(cmd, mem_chunks.items, ar->wmi.num_mem_chunks));
 	if (!buf)
 		return ERR_PTR(-ENOMEM);
 
@@ -6689,7 +6683,7 @@ static struct sk_buff *ath10k_wmi_10_2_op_gen_init(struct ath10k *ar)
 	struct wmi_init_cmd_10_2 *cmd;
 	struct sk_buff *buf;
 	struct wmi_resource_config_10x config = {};
-	u32 len, val, features;
+	u32 val, features;
 
 	config.num_vdevs = __cpu_to_le32(TARGET_10X_NUM_VDEVS);
 	config.num_peer_keys = __cpu_to_le32(TARGET_10X_NUM_PEER_KEYS);
@@ -6741,10 +6735,7 @@ static struct sk_buff *ath10k_wmi_10_2_op_gen_init(struct ath10k *ar)
 	config.num_msdu_desc = __cpu_to_le32(TARGET_10X_NUM_MSDU_DESC);
 	config.max_frag_entries = __cpu_to_le32(TARGET_10X_MAX_FRAG_ENTRIES);
 
-	len = sizeof(*cmd) +
-	      (sizeof(struct host_memory_chunk) * ar->wmi.num_mem_chunks);
-
-	buf = ath10k_wmi_alloc_skb(ar, len);
+	buf = ath10k_wmi_alloc_skb(ar, struct_size(cmd, mem_chunks.items, ar->wmi.num_mem_chunks));
 	if (!buf)
 		return ERR_PTR(-ENOMEM);
 
@@ -6776,7 +6767,6 @@ static struct sk_buff *ath10k_wmi_10_4_op_gen_init(struct ath10k *ar)
 	struct wmi_init_cmd_10_4 *cmd;
 	struct sk_buff *buf;
 	struct wmi_resource_config_10_4 config = {};
-	u32 len;
 
 	config.num_vdevs = __cpu_to_le32(ar->max_num_vdevs);
 	config.num_peers = __cpu_to_le32(ar->max_num_peers);
@@ -6838,10 +6828,7 @@ static struct sk_buff *ath10k_wmi_10_4_op_gen_init(struct ath10k *ar)
 	config.iphdr_pad_config = __cpu_to_le32(TARGET_10_4_IPHDR_PAD_CONFIG);
 	config.qwrap_config = __cpu_to_le32(TARGET_10_4_QWRAP_CONFIG);
 
-	len = sizeof(*cmd) +
-	      (sizeof(struct host_memory_chunk) * ar->wmi.num_mem_chunks);
-
-	buf = ath10k_wmi_alloc_skb(ar, len);
+	buf = ath10k_wmi_alloc_skb(ar, struct_size(cmd, mem_chunks.items, ar->wmi.num_mem_chunks));
 	if (!buf)
 		return ERR_PTR(-ENOMEM);
 
@@ -7549,12 +7536,9 @@ ath10k_wmi_op_gen_scan_chan_list(struct ath10k *ar,
 	struct sk_buff *skb;
 	struct wmi_channel_arg *ch;
 	struct wmi_channel *ci;
-	int len;
 	int i;
 
-	len = sizeof(*cmd) + arg->n_channels * sizeof(struct wmi_channel);
-
-	skb = ath10k_wmi_alloc_skb(ar, len);
+	skb = ath10k_wmi_alloc_skb(ar, struct_size(cmd, chan_info, arg->n_channels));
 	if (!skb)
 		return ERR_PTR(-EINVAL);
 
-- 
2.27.0


^ permalink raw reply related

* Re: [PATCH net v3 1/2] net/sched: act_gate: fix NULL dereference in tcf_gate_init()
From: Vladimir Oltean @ 2020-06-16 22:51 UTC (permalink / raw)
  To: Davide Caratti; +Cc: Po Liu, Cong Wang, David S. Miller, netdev
In-Reply-To: <d20f15f7241892ab7f042a8361103287aaf6a83f.1592338450.git.dcaratti@redhat.com>

On Tue, 16 Jun 2020 at 23:25, Davide Caratti <dcaratti@redhat.com> wrote:
>
> it is possible to see a KASAN use-after-free, immediately followed by a
> NULL dereference crash, with the following command:
>
>  # tc action add action gate index 3 cycle-time 100000000ns \
>  > cycle-time-ext 100000000ns clockid CLOCK_TAI
>
>  BUG: KASAN: use-after-free in tcf_action_init_1+0x8eb/0x960
>  Write of size 1 at addr ffff88810a5908bc by task tc/883
>
>  CPU: 0 PID: 883 Comm: tc Not tainted 5.7.0+ #188
>  Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014
>  Call Trace:
>   dump_stack+0x75/0xa0
>   print_address_description.constprop.6+0x1a/0x220
>   kasan_report.cold.9+0x37/0x7c
>   tcf_action_init_1+0x8eb/0x960
>   tcf_action_init+0x157/0x2a0
>   tcf_action_add+0xd9/0x2f0
>   tc_ctl_action+0x2a3/0x39d
>   rtnetlink_rcv_msg+0x5f3/0x920
>   netlink_rcv_skb+0x120/0x380
>   netlink_unicast+0x439/0x630
>   netlink_sendmsg+0x714/0xbf0
>   sock_sendmsg+0xe2/0x110
>   ____sys_sendmsg+0x5b4/0x890
>   ___sys_sendmsg+0xe9/0x160
>   __sys_sendmsg+0xd3/0x170
>   do_syscall_64+0x9a/0x370
>   entry_SYSCALL_64_after_hwframe+0x44/0xa9
>
> [...]
>
>  KASAN: null-ptr-deref in range [0x0000000000000070-0x0000000000000077]
>  CPU: 0 PID: 883 Comm: tc Tainted: G    B             5.7.0+ #188
>  Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014
>  RIP: 0010:tcf_action_fill_size+0xa3/0xf0
>  [....]
>  RSP: 0018:ffff88813a48f250 EFLAGS: 00010212
>  RAX: dffffc0000000000 RBX: 0000000000000094 RCX: ffffffffa47c3eb6
>  RDX: 000000000000000e RSI: 0000000000000008 RDI: 0000000000000070
>  RBP: ffff88810a590800 R08: 0000000000000004 R09: ffffed1027491e03
>  R10: 0000000000000003 R11: ffffed1027491e03 R12: 0000000000000000
>  R13: 0000000000000000 R14: dffffc0000000000 R15: ffff88810a590800
>  FS:  00007f62cae8ce40(0000) GS:ffff888147c00000(0000) knlGS:0000000000000000
>  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>  CR2: 00007f62c9d20a10 CR3: 000000013a52a000 CR4: 0000000000340ef0
>  Call Trace:
>   tcf_action_init+0x172/0x2a0
>   tcf_action_add+0xd9/0x2f0
>   tc_ctl_action+0x2a3/0x39d
>   rtnetlink_rcv_msg+0x5f3/0x920
>   netlink_rcv_skb+0x120/0x380
>   netlink_unicast+0x439/0x630
>   netlink_sendmsg+0x714/0xbf0
>   sock_sendmsg+0xe2/0x110
>   ____sys_sendmsg+0x5b4/0x890
>   ___sys_sendmsg+0xe9/0x160
>   __sys_sendmsg+0xd3/0x170
>   do_syscall_64+0x9a/0x370
>   entry_SYSCALL_64_after_hwframe+0x44/0xa9
>
> this is caused by the test on 'cycletime_ext', that is still unassigned
> when the action is newly created. This makes the action .init() return 0
> without calling tcf_idr_insert(), hence the UAF + crash.
>
> rework the logic that prevents zero values of cycle-time, as follows:
>
> 1) 'tcfg_cycletime_ext' seems to be unused in the action software path,
>    and it was already possible by other means to obtain non-zero
>    cycletime and zero cycletime-ext. So, removing that test should not
>    cause any damage.
> 2) while at it, we must prevent overwriting configuration data with wrong
>    ones: use a temporary variable for 'tcfg_cycletime', and validate it
>    preserving the original semantic (that allowed computing the cycle
>    time as the sum of all intervals, when not specified by
>    TCA_GATE_CYCLE_TIME).
> 3) remove the test on 'tcfg_cycletime', no more useful, and avoid
>    returning -EFAULT, which did not seem an appropriate return value for
>    a wrong netlink attribute.
>
> v3: fix uninitialized 'cycletime' (thanks to Vladimir Oltean)
> v2: remove useless 'return;' at the end of void gate_get_start_time()
>
> Fixes: a51c328df310 ("net: qos: introduce a gate control flow action")
> CC: Ivan Vecera <ivecera@redhat.com>
> Signed-off-by: Davide Caratti <dcaratti@redhat.com>
> ---

Acked-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Tested-by: Vladimir Oltean <vladimir.oltean@nxp.com>


>  net/sched/act_gate.c | 36 +++++++++++++-----------------------
>  1 file changed, 13 insertions(+), 23 deletions(-)
>
> diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c
> index 9c628591f452..94e560c2f81d 100644
> --- a/net/sched/act_gate.c
> +++ b/net/sched/act_gate.c
> @@ -32,7 +32,7 @@ static ktime_t gate_get_time(struct tcf_gate *gact)
>         return KTIME_MAX;
>  }
>
> -static int gate_get_start_time(struct tcf_gate *gact, ktime_t *start)
> +static void gate_get_start_time(struct tcf_gate *gact, ktime_t *start)
>  {
>         struct tcf_gate_params *param = &gact->param;
>         ktime_t now, base, cycle;
> @@ -43,18 +43,13 @@ static int gate_get_start_time(struct tcf_gate *gact, ktime_t *start)
>
>         if (ktime_after(base, now)) {
>                 *start = base;
> -               return 0;
> +               return;
>         }
>
>         cycle = param->tcfg_cycletime;
>
> -       /* cycle time should not be zero */
> -       if (!cycle)
> -               return -EFAULT;
> -
>         n = div64_u64(ktime_sub_ns(now, base), cycle);
>         *start = ktime_add_ns(base, (n + 1) * cycle);
> -       return 0;
>  }
>
>  static void gate_start_timer(struct tcf_gate *gact, ktime_t start)
> @@ -287,12 +282,12 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla,
>         enum tk_offsets tk_offset = TK_OFFS_TAI;
>         struct nlattr *tb[TCA_GATE_MAX + 1];
>         struct tcf_chain *goto_ch = NULL;
> +       u64 cycletime = 0, basetime = 0;
>         struct tcf_gate_params *p;
>         s32 clockid = CLOCK_TAI;
>         struct tcf_gate *gact;
>         struct tc_gate *parm;
>         int ret = 0, err;
> -       u64 basetime = 0;
>         u32 gflags = 0;
>         s32 prio = -1;
>         ktime_t start;
> @@ -375,11 +370,8 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla,
>         spin_lock_bh(&gact->tcf_lock);
>         p = &gact->param;
>
> -       if (tb[TCA_GATE_CYCLE_TIME]) {
> -               p->tcfg_cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]);
> -               if (!p->tcfg_cycletime_ext)
> -                       goto chain_put;
> -       }
> +       if (tb[TCA_GATE_CYCLE_TIME])
> +               cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]);
>
>         if (tb[TCA_GATE_ENTRY_LIST]) {
>                 err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack);
> @@ -387,14 +379,19 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla,
>                         goto chain_put;
>         }
>
> -       if (!p->tcfg_cycletime) {
> +       if (!cycletime) {
>                 struct tcfg_gate_entry *entry;
>                 ktime_t cycle = 0;
>
>                 list_for_each_entry(entry, &p->entries, list)
>                         cycle = ktime_add_ns(cycle, entry->interval);
> -               p->tcfg_cycletime = cycle;
> +               cycletime = cycle;
> +               if (!cycletime) {
> +                       err = -EINVAL;
> +                       goto chain_put;
> +               }
>         }
> +       p->tcfg_cycletime = cycletime;
>
>         if (tb[TCA_GATE_CYCLE_TIME_EXT])
>                 p->tcfg_cycletime_ext =
> @@ -408,14 +405,7 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla,
>         gact->tk_offset = tk_offset;
>         hrtimer_init(&gact->hitimer, clockid, HRTIMER_MODE_ABS_SOFT);
>         gact->hitimer.function = gate_timer_func;
> -
> -       err = gate_get_start_time(gact, &start);
> -       if (err < 0) {
> -               NL_SET_ERR_MSG(extack,
> -                              "Internal error: failed get start time");
> -               release_entry_list(&p->entries);
> -               goto chain_put;
> -       }
> +       gate_get_start_time(gact, &start);
>
>         gact->current_close_time = start;
>         gact->current_gate_status = GATE_ACT_GATE_OPEN | GATE_ACT_PENDING;
> --
> 2.26.2
>

^ permalink raw reply

* [PATCH bpf v4 1/2] bpf: don't return EINVAL from {get,set}sockopt when optlen > PAGE_SIZE
From: Stanislav Fomichev @ 2020-06-16 22:52 UTC (permalink / raw)
  To: netdev, bpf; +Cc: davem, ast, daniel, Stanislav Fomichev, David Laight

Attaching to these hooks can break iptables because its optval is
usually quite big, or at least bigger than the current PAGE_SIZE limit.
David also mentioned some SCTP options can be big (around 256k).

There are two possible ways to fix it:
1. Increase the limit to match iptables max optval. There is, however,
   no clear upper limit. Technically, iptables can accept up to
   512M of data (not sure how practical it is though).

2. Bypass the value (don't expose to BPF) if it's too big and trigger
   BPF only with level/optname so BPF can still decide whether
   to allow/deny big sockopts.

The initial attempt was implemented using strategy #1. Due to
listed shortcomings, let's switch to strategy #2. When there is
legitimate a real use-case for iptables/SCTP, we can consider increasing
 the PAGE_SIZE limit.

To support the cases where len(optval) > PAGE_SIZE we can
leverage upcoming sleepable BPF work by providing a helper
which can do copy_from_user (sleepable) at the given offset
from the original large buffer.

v4:
* use temporary buffer to avoid optval == optval_end == NULL;
  this removes the corner case in the verifier that might assume
  non-zero PTR_TO_PACKET/PTR_TO_PACKET_END.

v3:
* don't increase the limit, bypass the argument

v2:
* proper comments formatting (Jakub Kicinski)

Fixes: 0d01da6afc54 ("bpf: implement getsockopt and setsockopt hooks")
Cc: David Laight <David.Laight@ACULAB.COM>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 include/linux/filter.h |  1 +
 kernel/bpf/cgroup.c    | 31 +++++++++++++++++++++++++------
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 259377723603..f4565a70f8ba 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1276,6 +1276,7 @@ struct bpf_sockopt_kern {
 	s32		optname;
 	s32		optlen;
 	s32		retval;
+	u8		optval_too_large;
 };
 
 #endif /* __LINUX_FILTER_H__ */
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 4d76f16524cc..be78c01bf459 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1276,9 +1276,18 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
 
 static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
 {
-	if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0)
+	if (unlikely(max_optlen < 0))
 		return -EINVAL;
 
+	if (unlikely(max_optlen > PAGE_SIZE)) {
+		/* We don't expose optvals that are greater than PAGE_SIZE
+		 * to the BPF program.
+		 */
+		ctx->optval = &ctx->optval_too_large;
+		ctx->optval_end = &ctx->optval_too_large;
+		return 0;
+	}
+
 	ctx->optval = kzalloc(max_optlen, GFP_USER);
 	if (!ctx->optval)
 		return -ENOMEM;
@@ -1288,9 +1297,15 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
 	return 0;
 }
 
+static int sockopt_has_optval(struct bpf_sockopt_kern *ctx)
+{
+	return ctx->optval != &ctx->optval_too_large;
+}
+
 static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
 {
-	kfree(ctx->optval);
+	if (sockopt_has_optval(ctx))
+		kfree(ctx->optval);
 }
 
 int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
@@ -1325,7 +1340,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 
 	ctx.optlen = *optlen;
 
-	if (copy_from_user(ctx.optval, optval, *optlen) != 0) {
+	if (sockopt_has_optval(&ctx) &&
+	    copy_from_user(ctx.optval, optval, *optlen) != 0) {
 		ret = -EFAULT;
 		goto out;
 	}
@@ -1354,7 +1370,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 		*level = ctx.level;
 		*optname = ctx.optname;
 		*optlen = ctx.optlen;
-		*kernel_optval = ctx.optval;
+		if (sockopt_has_optval(&ctx))
+			*kernel_optval = ctx.optval;
 	}
 
 out:
@@ -1407,7 +1424,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 		if (ctx.optlen > max_optlen)
 			ctx.optlen = max_optlen;
 
-		if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) {
+		if (sockopt_has_optval(&ctx) &&
+		    copy_from_user(ctx.optval, optval, ctx.optlen) != 0) {
 			ret = -EFAULT;
 			goto out;
 		}
@@ -1436,7 +1454,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 		goto out;
 	}
 
-	if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
+	if ((sockopt_has_optval(&ctx) &&
+	     copy_to_user(optval, ctx.optval, ctx.optlen)) ||
 	    put_user(ctx.optlen, optlen)) {
 		ret = -EFAULT;
 		goto out;
-- 
2.27.0.290.gba653c62da-goog


^ permalink raw reply related

* [PATCH bpf v4 2/2] selftests/bpf: make sure optvals > PAGE_SIZE are bypassed
From: Stanislav Fomichev @ 2020-06-16 22:52 UTC (permalink / raw)
  To: netdev, bpf; +Cc: davem, ast, daniel, Stanislav Fomichev
In-Reply-To: <20200616225256.246769-1-sdf@google.com>

We are relying on the fact, that we can pass > sizeof(int) optvals
to the SOL_IP+IP_FREEBIND option (the kernel will take first 4 bytes).
In the BPF program, we return EPERM if optval is greater than optval_end
(implemented via PTR_TO_PACKET/PTR_TO_PACKET_END) and rely on the verifier
to enforce the fact that this data can not be touched.

Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 .../selftests/bpf/prog_tests/sockopt_sk.c     | 26 +++++++++++++++++++
 .../testing/selftests/bpf/progs/sockopt_sk.c  | 20 ++++++++++++++
 2 files changed, 46 insertions(+)

diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
index 2061a6beac0f..eae1c8a1fee0 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
@@ -13,6 +13,7 @@ static int getsetsockopt(void)
 		char cc[16]; /* TCP_CA_NAME_MAX */
 	} buf = {};
 	socklen_t optlen;
+	char *big_buf;
 
 	fd = socket(AF_INET, SOCK_STREAM, 0);
 	if (fd < 0) {
@@ -78,6 +79,31 @@ static int getsetsockopt(void)
 		goto err;
 	}
 
+	/* IP_FREEBIND - BPF can't access optval when optlen > PAGE_SIZE */
+
+	optlen = getpagesize() * 2;
+	big_buf = calloc(1, optlen);
+	if (!big_buf) {
+		log_err("Couldn't allocate two pages");
+		goto err;
+	}
+
+	err = setsockopt(fd, SOL_IP, IP_FREEBIND, big_buf, optlen);
+	if (err != 0) {
+		log_err("Failed to call setsockopt, ret=%d", err);
+		free(big_buf);
+		goto err;
+	}
+
+	err = getsockopt(fd, SOL_IP, IP_FREEBIND, big_buf, &optlen);
+	if (err != 0) {
+		log_err("Failed to call getsockopt, ret=%d", err);
+		free(big_buf);
+		goto err;
+	}
+
+	free(big_buf);
+
 	/* SO_SNDBUF is overwritten */
 
 	buf.u32 = 0x01010101;
diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c
index d5a5eeb5fb52..933a2ef9c930 100644
--- a/tools/testing/selftests/bpf/progs/sockopt_sk.c
+++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c
@@ -51,6 +51,16 @@ int _getsockopt(struct bpf_sockopt *ctx)
 		return 1;
 	}
 
+	if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) {
+		if (optval > optval_end) {
+			/* For optval > PAGE_SIZE, the actual data
+			 * is not provided.
+			 */
+			return 0; /* EPERM, unexpected data size */
+		}
+		return 1;
+	}
+
 	if (ctx->level != SOL_CUSTOM)
 		return 0; /* EPERM, deny everything except custom level */
 
@@ -112,6 +122,16 @@ int _setsockopt(struct bpf_sockopt *ctx)
 		return 1;
 	}
 
+	if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) {
+		if (optval > optval_end) {
+			/* For optval > PAGE_SIZE, the actual data
+			 * is not provided.
+			 */
+			return 0; /* EPERM, unexpected data size */
+		}
+		return 1;
+	}
+
 	if (ctx->level != SOL_CUSTOM)
 		return 0; /* EPERM, deny everything except custom level */
 
-- 
2.27.0.290.gba653c62da-goog


^ permalink raw reply related

* Re: [PATCH net v3 2/2] net/sched: act_gate: fix configuration of the periodic timer
From: Vladimir Oltean @ 2020-06-16 22:52 UTC (permalink / raw)
  To: Davide Caratti; +Cc: Po Liu, Cong Wang, David S. Miller, netdev
In-Reply-To: <bca565004b9fd7ff7aba09f41aa65dab2085154b.1592338450.git.dcaratti@redhat.com>

On Tue, 16 Jun 2020 at 23:25, Davide Caratti <dcaratti@redhat.com> wrote:
>
> assigning a dummy value of 'clock_id' to avoid cancellation of the cycle
> timer before its initialization was a temporary solution, and we still
> need to handle the case where act_gate timer parameters are changed by
> commands like the following one:
>
>  # tc action replace action gate <parameters>
>
> the fix consists in the following items:
>
> 1) remove the workaround assignment of 'clock_id', and init the list of
>    entries before the first error path after IDR atomic check/allocation
> 2) validate 'clock_id' earlier: there is no need to do IDR atomic
>    check/allocation if we know that 'clock_id' is a bad value
> 3) use a dedicated function, 'gate_setup_timer()', to ensure that the
>    timer is cancelled and re-initialized on action overwrite, and also
>    ensure we initialize the timer in the error path of tcf_gate_init()
>
> v3: improve comment in the error path of tcf_gate_init() (thanks to
>     Vladimir Oltean)
> v2: avoid 'goto' in gate_setup_timer (thanks to Cong Wang)
>
> CC: Ivan Vecera <ivecera@redhat.com>
> Fixes: a01c245438c5 ("net/sched: fix a couple of splats in the error path of tfc_gate_init()")
> Fixes: a51c328df310 ("net: qos: introduce a gate control flow action")
> Signed-off-by: Davide Caratti <dcaratti@redhat.com>
> ---

Acked-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Tested-by: Vladimir Oltean <vladimir.oltean@nxp.com>

>  net/sched/act_gate.c | 90 +++++++++++++++++++++++++++-----------------
>  1 file changed, 55 insertions(+), 35 deletions(-)
>
> diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c
> index 94e560c2f81d..323ae7f6315d 100644
> --- a/net/sched/act_gate.c
> +++ b/net/sched/act_gate.c
> @@ -272,6 +272,27 @@ static int parse_gate_list(struct nlattr *list_attr,
>         return err;
>  }
>
> +static void gate_setup_timer(struct tcf_gate *gact, u64 basetime,
> +                            enum tk_offsets tko, s32 clockid,
> +                            bool do_init)
> +{
> +       if (!do_init) {
> +               if (basetime == gact->param.tcfg_basetime &&
> +                   tko == gact->tk_offset &&
> +                   clockid == gact->param.tcfg_clockid)
> +                       return;
> +
> +               spin_unlock_bh(&gact->tcf_lock);
> +               hrtimer_cancel(&gact->hitimer);
> +               spin_lock_bh(&gact->tcf_lock);
> +       }
> +       gact->param.tcfg_basetime = basetime;
> +       gact->param.tcfg_clockid = clockid;
> +       gact->tk_offset = tko;
> +       hrtimer_init(&gact->hitimer, clockid, HRTIMER_MODE_ABS_SOFT);
> +       gact->hitimer.function = gate_timer_func;
> +}
> +
>  static int tcf_gate_init(struct net *net, struct nlattr *nla,
>                          struct nlattr *est, struct tc_action **a,
>                          int ovr, int bind, bool rtnl_held,
> @@ -303,6 +324,27 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla,
>         if (!tb[TCA_GATE_PARMS])
>                 return -EINVAL;
>
> +       if (tb[TCA_GATE_CLOCKID]) {
> +               clockid = nla_get_s32(tb[TCA_GATE_CLOCKID]);
> +               switch (clockid) {
> +               case CLOCK_REALTIME:
> +                       tk_offset = TK_OFFS_REAL;
> +                       break;
> +               case CLOCK_MONOTONIC:
> +                       tk_offset = TK_OFFS_MAX;
> +                       break;
> +               case CLOCK_BOOTTIME:
> +                       tk_offset = TK_OFFS_BOOT;
> +                       break;
> +               case CLOCK_TAI:
> +                       tk_offset = TK_OFFS_TAI;
> +                       break;
> +               default:
> +                       NL_SET_ERR_MSG(extack, "Invalid 'clockid'");
> +                       return -EINVAL;
> +               }
> +       }
> +
>         parm = nla_data(tb[TCA_GATE_PARMS]);
>         index = parm->index;
>
> @@ -326,10 +368,6 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla,
>                 tcf_idr_release(*a, bind);
>                 return -EEXIST;
>         }
> -       if (ret == ACT_P_CREATED) {
> -               to_gate(*a)->param.tcfg_clockid = -1;
> -               INIT_LIST_HEAD(&(to_gate(*a)->param.entries));
> -       }
>
>         if (tb[TCA_GATE_PRIORITY])
>                 prio = nla_get_s32(tb[TCA_GATE_PRIORITY]);
> @@ -340,33 +378,14 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla,
>         if (tb[TCA_GATE_FLAGS])
>                 gflags = nla_get_u32(tb[TCA_GATE_FLAGS]);
>
> -       if (tb[TCA_GATE_CLOCKID]) {
> -               clockid = nla_get_s32(tb[TCA_GATE_CLOCKID]);
> -               switch (clockid) {
> -               case CLOCK_REALTIME:
> -                       tk_offset = TK_OFFS_REAL;
> -                       break;
> -               case CLOCK_MONOTONIC:
> -                       tk_offset = TK_OFFS_MAX;
> -                       break;
> -               case CLOCK_BOOTTIME:
> -                       tk_offset = TK_OFFS_BOOT;
> -                       break;
> -               case CLOCK_TAI:
> -                       tk_offset = TK_OFFS_TAI;
> -                       break;
> -               default:
> -                       NL_SET_ERR_MSG(extack, "Invalid 'clockid'");
> -                       goto release_idr;
> -               }
> -       }
> +       gact = to_gate(*a);
> +       if (ret == ACT_P_CREATED)
> +               INIT_LIST_HEAD(&gact->param.entries);
>
>         err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
>         if (err < 0)
>                 goto release_idr;
>
> -       gact = to_gate(*a);
> -
>         spin_lock_bh(&gact->tcf_lock);
>         p = &gact->param;
>
> @@ -397,14 +416,10 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla,
>                 p->tcfg_cycletime_ext =
>                         nla_get_u64(tb[TCA_GATE_CYCLE_TIME_EXT]);
>
> +       gate_setup_timer(gact, basetime, tk_offset, clockid,
> +                        ret == ACT_P_CREATED);
>         p->tcfg_priority = prio;
> -       p->tcfg_basetime = basetime;
> -       p->tcfg_clockid = clockid;
>         p->tcfg_flags = gflags;
> -
> -       gact->tk_offset = tk_offset;
> -       hrtimer_init(&gact->hitimer, clockid, HRTIMER_MODE_ABS_SOFT);
> -       gact->hitimer.function = gate_timer_func;
>         gate_get_start_time(gact, &start);
>
>         gact->current_close_time = start;
> @@ -433,6 +448,13 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla,
>         if (goto_ch)
>                 tcf_chain_put_by_act(goto_ch);
>  release_idr:
> +       /* action is not inserted in any list: it's safe to init hitimer
> +        * without taking tcf_lock.
> +        */
> +       if (ret == ACT_P_CREATED)
> +               gate_setup_timer(gact, gact->param.tcfg_basetime,
> +                                gact->tk_offset, gact->param.tcfg_clockid,
> +                                true);
>         tcf_idr_release(*a, bind);
>         return err;
>  }
> @@ -443,9 +465,7 @@ static void tcf_gate_cleanup(struct tc_action *a)
>         struct tcf_gate_params *p;
>
>         p = &gact->param;
> -       if (p->tcfg_clockid != -1)
> -               hrtimer_cancel(&gact->hitimer);
> -
> +       hrtimer_cancel(&gact->hitimer);
>         release_entry_list(&p->entries);
>  }
>
> --
> 2.26.2
>

^ permalink raw reply

* [net 0/3][pull request] Intel Wired LAN Driver Updates 2020-06-16
From: Jeff Kirsher @ 2020-06-16 22:53 UTC (permalink / raw)
  To: davem; +Cc: Jeff Kirsher, netdev, nhorman, sassmann

This series contains fixes to e1000 and e1000e.

Chen fixes an e1000e issue where systems could be waken via WoL, even
though the user has disabled the wakeup bit via sysfs.

Vaibhav Gupta updates the e1000 driver to clean up the legacy Power
Management hooks.

Arnd Bergmann cleans up the inconsistent use CONFIG_PM_SLEEP
preprocessor tags, which also resolves the compiler warnings about the
possibility of unused structure.

The following are changes since commit b8ad540dd4e40566c520dff491fc06c71ae6b989:
  mptcp: fix memory leak in mptcp_subflow_create_socket()
and are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/net-queue 1GbE

Arnd Bergmann (1):
  e1000e: fix unused-function warning

Chen Yu (1):
  e1000e: Do not wake up the system via WOL if device wakeup is disabled

Vaibhav Gupta (1):
  e1000: use generic power management

 drivers/net/ethernet/intel/e1000/e1000_main.c | 49 +++++--------------
 drivers/net/ethernet/intel/e1000e/netdev.c    | 30 ++++++------
 2 files changed, 28 insertions(+), 51 deletions(-)

-- 
2.26.2


^ permalink raw reply

* [net 3/3] e1000e: fix unused-function warning
From: Jeff Kirsher @ 2020-06-16 22:53 UTC (permalink / raw)
  To: davem; +Cc: Arnd Bergmann, netdev, nhorman, sassmann, Aaron Brown,
	Jeff Kirsher
In-Reply-To: <20200616225354.2744572-1-jeffrey.t.kirsher@intel.com>

From: Arnd Bergmann <arnd@arndb.de>

The CONFIG_PM_SLEEP #ifdef checks in this file are inconsistent,
leading to a warning about sometimes unused function:

drivers/net/ethernet/intel/e1000e/netdev.c:137:13: error: unused function 'e1000e_check_me' [-Werror,-Wunused-function]

Rather than adding more #ifdefs, just remove them completely
and mark the PM functions as __maybe_unused to let the compiler
work it out on it own.

Fixes: e086ba2fccda ("e1000e: disable s0ix entry and exit flows for ME systems")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/e1000e/netdev.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index e2ad3f38c75c..6f6479ca1267 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -6349,7 +6349,6 @@ static void e1000e_flush_lpic(struct pci_dev *pdev)
 	pm_runtime_put_sync(netdev->dev.parent);
 }
 
-#ifdef CONFIG_PM_SLEEP
 /* S0ix implementation */
 static void e1000e_s0ix_entry_flow(struct e1000_adapter *adapter)
 {
@@ -6571,7 +6570,6 @@ static void e1000e_s0ix_exit_flow(struct e1000_adapter *adapter)
 	mac_data &= ~E1000_CTRL_EXT_FORCE_SMBUS;
 	ew32(CTRL_EXT, mac_data);
 }
-#endif /* CONFIG_PM_SLEEP */
 
 static int e1000e_pm_freeze(struct device *dev)
 {
@@ -6875,7 +6873,6 @@ static int e1000e_pm_thaw(struct device *dev)
 	return rc;
 }
 
-#ifdef CONFIG_PM
 static int __e1000_resume(struct pci_dev *pdev)
 {
 	struct net_device *netdev = pci_get_drvdata(pdev);
@@ -6941,8 +6938,7 @@ static int __e1000_resume(struct pci_dev *pdev)
 	return 0;
 }
 
-#ifdef CONFIG_PM_SLEEP
-static int e1000e_pm_suspend(struct device *dev)
+static __maybe_unused int e1000e_pm_suspend(struct device *dev)
 {
 	struct net_device *netdev = pci_get_drvdata(to_pci_dev(dev));
 	struct e1000_adapter *adapter = netdev_priv(netdev);
@@ -6966,7 +6962,7 @@ static int e1000e_pm_suspend(struct device *dev)
 	return rc;
 }
 
-static int e1000e_pm_resume(struct device *dev)
+static __maybe_unused int e1000e_pm_resume(struct device *dev)
 {
 	struct net_device *netdev = pci_get_drvdata(to_pci_dev(dev));
 	struct e1000_adapter *adapter = netdev_priv(netdev);
@@ -6985,9 +6981,8 @@ static int e1000e_pm_resume(struct device *dev)
 
 	return e1000e_pm_thaw(dev);
 }
-#endif /* CONFIG_PM_SLEEP */
 
-static int e1000e_pm_runtime_idle(struct device *dev)
+static __maybe_unused int e1000e_pm_runtime_idle(struct device *dev)
 {
 	struct net_device *netdev = dev_get_drvdata(dev);
 	struct e1000_adapter *adapter = netdev_priv(netdev);
@@ -7003,7 +6998,7 @@ static int e1000e_pm_runtime_idle(struct device *dev)
 	return -EBUSY;
 }
 
-static int e1000e_pm_runtime_resume(struct device *dev)
+static __maybe_unused int e1000e_pm_runtime_resume(struct device *dev)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
 	struct net_device *netdev = pci_get_drvdata(pdev);
@@ -7020,7 +7015,7 @@ static int e1000e_pm_runtime_resume(struct device *dev)
 	return rc;
 }
 
-static int e1000e_pm_runtime_suspend(struct device *dev)
+static __maybe_unused int e1000e_pm_runtime_suspend(struct device *dev)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
 	struct net_device *netdev = pci_get_drvdata(pdev);
@@ -7045,7 +7040,6 @@ static int e1000e_pm_runtime_suspend(struct device *dev)
 
 	return 0;
 }
-#endif /* CONFIG_PM */
 
 static void e1000_shutdown(struct pci_dev *pdev)
 {
-- 
2.26.2


^ permalink raw reply related

* [net 1/3] e1000e: Do not wake up the system via WOL if device wakeup is disabled
From: Jeff Kirsher @ 2020-06-16 22:53 UTC (permalink / raw)
  To: davem
  Cc: Chen Yu, netdev, nhorman, sassmann, Rafael J. Wysocki,
	Andy Shevchenko, Stable, Aaron Brown, Jeff Kirsher
In-Reply-To: <20200616225354.2744572-1-jeffrey.t.kirsher@intel.com>

From: Chen Yu <yu.c.chen@intel.com>

Currently the system will be woken up via WOL(Wake On LAN) even if the
device wakeup ability has been disabled via sysfs:
 cat /sys/devices/pci0000:00/0000:00:1f.6/power/wakeup
 disabled

The system should not be woken up if the user has explicitly
disabled the wake up ability for this device.

This patch clears the WOL ability of this network device if the
user has disabled the wake up ability in sysfs.

Fixes: bc7f75fa9788 ("[E1000E]: New pci-express e1000 driver")
Reported-by: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: <Stable@vger.kernel.org>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/e1000e/netdev.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index a279f4fa9962..e2ad3f38c75c 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -6611,11 +6611,17 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool runtime)
 	struct net_device *netdev = pci_get_drvdata(pdev);
 	struct e1000_adapter *adapter = netdev_priv(netdev);
 	struct e1000_hw *hw = &adapter->hw;
-	u32 ctrl, ctrl_ext, rctl, status;
-	/* Runtime suspend should only enable wakeup for link changes */
-	u32 wufc = runtime ? E1000_WUFC_LNKC : adapter->wol;
+	u32 ctrl, ctrl_ext, rctl, status, wufc;
 	int retval = 0;
 
+	/* Runtime suspend should only enable wakeup for link changes */
+	if (runtime)
+		wufc = E1000_WUFC_LNKC;
+	else if (device_may_wakeup(&pdev->dev))
+		wufc = adapter->wol;
+	else
+		wufc = 0;
+
 	status = er32(STATUS);
 	if (status & E1000_STATUS_LU)
 		wufc &= ~E1000_WUFC_LNKC;
@@ -6672,7 +6678,7 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool runtime)
 	if (adapter->hw.phy.type == e1000_phy_igp_3) {
 		e1000e_igp3_phy_powerdown_workaround_ich8lan(&adapter->hw);
 	} else if (hw->mac.type >= e1000_pch_lpt) {
-		if (!(wufc & (E1000_WUFC_EX | E1000_WUFC_MC | E1000_WUFC_BC)))
+		if (wufc && !(wufc & (E1000_WUFC_EX | E1000_WUFC_MC | E1000_WUFC_BC)))
 			/* ULP does not support wake from unicast, multicast
 			 * or broadcast.
 			 */
-- 
2.26.2


^ permalink raw reply related

* [net 2/3] e1000: use generic power management
From: Jeff Kirsher @ 2020-06-16 22:53 UTC (permalink / raw)
  To: davem; +Cc: Vaibhav Gupta, netdev, nhorman, sassmann, Aaron Brown,
	Jeff Kirsher
In-Reply-To: <20200616225354.2744572-1-jeffrey.t.kirsher@intel.com>

From: Vaibhav Gupta <vaibhavgupta40@gmail.com>

With legacy PM hooks, it was the responsibility of a driver to manage PCI
states and also the device's power state. The generic approach is to let PCI
core handle the work.

e1000_suspend() calls __e1000_shutdown() to perform intermediate tasks.
__e1000_shutdown() modifies the value of "wake" (device should be wakeup
enabled or not), responsible for controlling the flow of legacy PM.

Since, PCI core has no idea about the value of "wake", new code for generic
PM may produce unexpected results. Thus, use "device_set_wakeup_enable()"
to wakeup-enable the device accordingly.

Signed-off-by: Vaibhav Gupta <vaibhavgupta40@gmail.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/e1000/e1000_main.c | 49 +++++--------------
 1 file changed, 13 insertions(+), 36 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
index d9fa4600f745..4b2de08137be 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_main.c
+++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
@@ -151,10 +151,8 @@ static int e1000_vlan_rx_kill_vid(struct net_device *netdev,
 				  __be16 proto, u16 vid);
 static void e1000_restore_vlan(struct e1000_adapter *adapter);
 
-#ifdef CONFIG_PM
-static int e1000_suspend(struct pci_dev *pdev, pm_message_t state);
-static int e1000_resume(struct pci_dev *pdev);
-#endif
+static int __maybe_unused e1000_suspend(struct device *dev);
+static int __maybe_unused e1000_resume(struct device *dev);
 static void e1000_shutdown(struct pci_dev *pdev);
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
@@ -179,16 +177,16 @@ static const struct pci_error_handlers e1000_err_handler = {
 	.resume = e1000_io_resume,
 };
 
+static SIMPLE_DEV_PM_OPS(e1000_pm_ops, e1000_suspend, e1000_resume);
+
 static struct pci_driver e1000_driver = {
 	.name     = e1000_driver_name,
 	.id_table = e1000_pci_tbl,
 	.probe    = e1000_probe,
 	.remove   = e1000_remove,
-#ifdef CONFIG_PM
-	/* Power Management Hooks */
-	.suspend  = e1000_suspend,
-	.resume   = e1000_resume,
-#endif
+	.driver = {
+		.pm = &e1000_pm_ops,
+	},
 	.shutdown = e1000_shutdown,
 	.err_handler = &e1000_err_handler
 };
@@ -5060,9 +5058,6 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool *enable_wake)
 	struct e1000_hw *hw = &adapter->hw;
 	u32 ctrl, ctrl_ext, rctl, status;
 	u32 wufc = adapter->wol;
-#ifdef CONFIG_PM
-	int retval = 0;
-#endif
 
 	netif_device_detach(netdev);
 
@@ -5076,12 +5071,6 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool *enable_wake)
 		e1000_down(adapter);
 	}
 
-#ifdef CONFIG_PM
-	retval = pci_save_state(pdev);
-	if (retval)
-		return retval;
-#endif
-
 	status = er32(STATUS);
 	if (status & E1000_STATUS_LU)
 		wufc &= ~E1000_WUFC_LNKC;
@@ -5142,37 +5131,26 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool *enable_wake)
 	return 0;
 }
 
-#ifdef CONFIG_PM
-static int e1000_suspend(struct pci_dev *pdev, pm_message_t state)
+static int __maybe_unused e1000_suspend(struct device *dev)
 {
 	int retval;
+	struct pci_dev *pdev = to_pci_dev(dev);
 	bool wake;
 
 	retval = __e1000_shutdown(pdev, &wake);
-	if (retval)
-		return retval;
-
-	if (wake) {
-		pci_prepare_to_sleep(pdev);
-	} else {
-		pci_wake_from_d3(pdev, false);
-		pci_set_power_state(pdev, PCI_D3hot);
-	}
+	device_set_wakeup_enable(dev, wake);
 
-	return 0;
+	return retval;
 }
 
-static int e1000_resume(struct pci_dev *pdev)
+static int __maybe_unused e1000_resume(struct device *dev)
 {
+	struct pci_dev *pdev = to_pci_dev(dev);
 	struct net_device *netdev = pci_get_drvdata(pdev);
 	struct e1000_adapter *adapter = netdev_priv(netdev);
 	struct e1000_hw *hw = &adapter->hw;
 	u32 err;
 
-	pci_set_power_state(pdev, PCI_D0);
-	pci_restore_state(pdev);
-	pci_save_state(pdev);
-
 	if (adapter->need_ioport)
 		err = pci_enable_device(pdev);
 	else
@@ -5209,7 +5187,6 @@ static int e1000_resume(struct pci_dev *pdev)
 
 	return 0;
 }
-#endif
 
 static void e1000_shutdown(struct pci_dev *pdev)
 {
-- 
2.26.2


^ permalink raw reply related

* Re: [PATCH 4/5] Huawei BMA: Adding Huawei BMA driver: cdev_veth_drv
From: Andrew Lunn @ 2020-06-16 22:56 UTC (permalink / raw)
  To: yunaixin03610; +Cc: netdev, yunaixin
In-Reply-To: <20200616020528.1389-1-yunaixin03610@163.com>

> This cdev_veth_drv driver is one of the communication drivers used
> by BMA software. It depends on the host_edma_drv driver. It will
> create a char device once loaded, offering interfaces (open, close,
> read, write and poll) to BMA to send/receive RESTful messages
> between BMC software. When the message is longer than 1KB, it will
> be cut into packets of 1KB. The other side, BMC's cdev_veth driver,
> will assemble these packets back into original mesages.

Hi Yunaixin

Interfaces like this nearly always get a NACK.

Is the user space code using this API open source?

   Thanks
	Andrew


^ permalink raw reply

* [PATCH][next] net: stmmac: selftests: Use struct_size() helper in kzalloc()
From: Gustavo A. R. Silva @ 2020-06-16 23:03 UTC (permalink / raw)
  To: Giuseppe Cavallaro, Alexandre Torgue, Jose Abreu, David S. Miller,
	Jakub Kicinski, Maxime Coquelin
  Cc: netdev, linux-stm32, linux-arm-kernel, linux-kernel,
	Gustavo A. R. Silva

Make use of the struct_size() helper instead of an open-coded version
in order to avoid any potential type mistakes.

This code was detected with the help of Coccinelle and, audited and
fixed manually.

Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c
index e6696495f126..e113b1376fdd 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c
@@ -1094,7 +1094,7 @@ static int stmmac_test_rxp(struct stmmac_priv *priv)
 	if (!priv->dma_cap.frpsel)
 		return -EOPNOTSUPP;
 
-	sel = kzalloc(sizeof(*sel) + nk * sizeof(struct tc_u32_key), GFP_KERNEL);
+	sel = kzalloc(struct_size(sel, keys, nk), GFP_KERNEL);
 	if (!sel)
 		return -ENOMEM;
 
-- 
2.27.0


^ permalink raw reply related

* Re: [PATCH v4 0/3] mm, treewide: Rename kzfree() to kfree_sensitive()
From: David Sterba @ 2020-06-16 23:01 UTC (permalink / raw)
  To: Joe Perches
  Cc: Waiman Long, Andrew Morton, David Howells, Jarkko Sakkinen,
	James Morris, Serge E. Hallyn, Linus Torvalds, Matthew Wilcox,
	David Rientjes, Michal Hocko, Johannes Weiner, Dan Carpenter,
	David Sterba, Jason A . Donenfeld, linux-mm, keyrings,
	linux-kernel, linux-crypto, linux-pm, linux-stm32, linux-amlogic,
	linux-mediatek, linuxppc-dev, virtualization, netdev, linux-ppp,
	wireguard, linux-wireless, devel, linux-scsi, target-devel,
	linux-btrfs, linux-cifs, linux-fscrypt, ecryptfs, kasan-dev,
	linux-bluetooth, linux-wpan, linux-sctp, linux-nfs,
	tipc-discussion, linux-security-module, linux-integrity
In-Reply-To: <fe3b9a437be4aeab3bac68f04193cb6daaa5bee4.camel@perches.com>

On Tue, Jun 16, 2020 at 11:53:50AM -0700, Joe Perches wrote:
> On Mon, 2020-06-15 at 21:57 -0400, Waiman Long wrote:
> >  v4:
> >   - Break out the memzero_explicit() change as suggested by Dan Carpenter
> >     so that it can be backported to stable.
> >   - Drop the "crypto: Remove unnecessary memzero_explicit()" patch for
> >     now as there can be a bit more discussion on what is best. It will be
> >     introduced as a separate patch later on after this one is merged.
> 
> To this larger audience and last week without reply:
> https://lore.kernel.org/lkml/573b3fbd5927c643920e1364230c296b23e7584d.camel@perches.com/
> 
> Are there _any_ fastpath uses of kfree or vfree?

I'd consider kfree performance critical for cases where it is called
under locks. If possible the kfree is moved outside of the critical
section, but we have rbtrees or lists that get deleted under locks and
restructuring the code to do eg. splice and free it outside of the lock
is not always possible.

^ permalink raw reply

* Re: [PATCH bpf v3 1/2] bpf: don't return EINVAL from {get,set}sockopt when optlen > PAGE_SIZE
From: Alexei Starovoitov @ 2020-06-16 23:03 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Netdev, bpf, David Miller, Alexei Starovoitov, Daniel Borkmann,
	David Laight
In-Reply-To: <CAKH8qBvUv_OwjFA70JQfL-rET662okH87QYyeivbybCPwCEJEQ@mail.gmail.com>

On Mon, Jun 15, 2020 at 09:41:38AM -0700, Stanislav Fomichev wrote:
> On Fri, Jun 12, 2020 at 8:50 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> [ .. ]
> > > > It's probably ok, but makes me uneasy about verifier consequences.
> > > > ctx->optval is PTR_TO_PACKET and it's a valid pointer from verifier pov.
> > > > Do we have cases already where PTR_TO_PACKET == PTR_TO_PACKET_END ?
> > > > I don't think we have such tests. I guess bpf prog won't be able to read
> > > > anything and nothing will crash, but having PTR_TO_PACKET that is
> > > > actually NULL would be an odd special case to keep in mind for everyone
> > > > who will work on the verifier from now on.
> > > >
> > > > Also consider bpf prog that simply reads something small like 4 bytes.
> > > > IP_FREEBIND sockopt (like your selftest in the patch 2) will have
> > > > those 4 bytes, so it's natural for the prog to assume that it can read it.
> > > > It will have
> > > > p = ctx->optval;
> > > > if (p + 4 > ctx->optval_end)
> > > >  /* goto out and don't bother logging, since that never happens */
> > > > *(u32*)p;
> > > >
> > > > but 'clever' user space would pass long optlen and prog suddenly
> > > > 'not seeing' the sockopt. It didn't crash, but debugging would be
> > > > surprising.
> > > >
> > > > I feel it's better to copy the first 4k and let the program see it.
> > > Agreed with the IP_FREEBIND example wrt observability, however it's
> > > not clear what to do with the cropped buffer if the bpf program
> > > modifies it.
> > >
> > > Consider that huge iptables setsockopts where the usespace passes
> > > PAGE_SIZE*10 optlen with real data and bpf prog sees only part of it.
> > > Now, if the bpf program modifies the buffer (say, flips some byte), we
> > > are back to square one. We either have to silently discard that buffer
> > > or reallocate/merge. My reasoning with data == NULL, is that at least
> > > there is a clear signal that the program can't access the data (and
> > > can look at optlen to see if the original buffer is indeed non-zero
> > > and maybe deny such requests?).
> > > At this point I'm really starting to think that maybe we should just
> > > vmalloc everything that is >PAGE_SIZE and add a sysclt to limit an
> > > upper bound :-/
> > > I'll try to think about this a bit more over the weekend.
> >
> > Yeah. Tough choices.
> > We can also detect in the verifier whether program accessed ctx->optval
> > and skip alloc/copy if program didn't touch it, but I suspect in most
> > case the program would want to read it.
> > I think vmallocing what optlen said is DoS-able. It's better to
> > stick with single page.
> > Let's keep brainstorming.
> Btw, can we use sleepable bpf for that? As in, do whatever I suggested
> in these patches (don't expose optval>PAGE_SIZE via context), but add
> a new helper where you can say 'copy x bytes from y offset of the
> original optval' (the helper will do sleepable copy_form_user).
> That way we have a clean signal to the BPF that the value is too big
> (optval==optval_end==NULL) and the user can fallback to the helper to
> inspect the value. We can also provide another helper to export new
> value for this case.

sleepable will be read-only and with toctou.
I guess this patch is the least evil then ?
But I'm confused with the test in patch 2.
Why does it do 'if (optval > optval_end)' ?
How is that possible when patch 1 makes them equal.
may be another idea:
allocate 4k, copy first 4k into it, but keep ctx.optlen as original.
if bpf prog reads optval and finds it ok in setsockopt,
it can set ctx.optlen = 0
which would mean run the rest of setsockopt handling with original
'__user *optval' and ignore the buffer that was passed to bpf prog.
In case of ctx.optlen < 4k the behavior won't change from bpf prog
and from kernel pov.
When ctx.optlen > 4k and prog didn't adjust it
__cgroup_bpf_run_filter_setsockopt will do ret = -EFAULT;
and reject sockopt.
So bpf prog would be force to do something with large optvals.
yet it will be able to examine first 4k bytes of it.
It's a bit of change in behavior, but I don't think bpf prog is
doing ctx.optlen = 0, since that's more or less the same as
doing ctx.optlen = -2.
or may be use some other indicator ?
And do something similar with getsockopt.

^ permalink raw reply

* Re: [PATCH bpf v4 1/2] bpf: don't return EINVAL from {get,set}sockopt when optlen > PAGE_SIZE
From: Alexei Starovoitov @ 2020-06-16 23:05 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Network Development, bpf, David S. Miller, Alexei Starovoitov,
	Daniel Borkmann, David Laight
In-Reply-To: <20200616225256.246769-1-sdf@google.com>

On Tue, Jun 16, 2020 at 3:53 PM Stanislav Fomichev <sdf@google.com> wrote:
>
> Attaching to these hooks can break iptables because its optval is
> usually quite big, or at least bigger than the current PAGE_SIZE limit.
> David also mentioned some SCTP options can be big (around 256k).
>
> There are two possible ways to fix it:
> 1. Increase the limit to match iptables max optval. There is, however,
>    no clear upper limit. Technically, iptables can accept up to
>    512M of data (not sure how practical it is though).
>
> 2. Bypass the value (don't expose to BPF) if it's too big and trigger
>    BPF only with level/optname so BPF can still decide whether
>    to allow/deny big sockopts.
>
> The initial attempt was implemented using strategy #1. Due to
> listed shortcomings, let's switch to strategy #2. When there is
> legitimate a real use-case for iptables/SCTP, we can consider increasing
>  the PAGE_SIZE limit.
>
> To support the cases where len(optval) > PAGE_SIZE we can
> leverage upcoming sleepable BPF work by providing a helper
> which can do copy_from_user (sleepable) at the given offset
> from the original large buffer.
>
> v4:
> * use temporary buffer to avoid optval == optval_end == NULL;
>   this removes the corner case in the verifier that might assume
>   non-zero PTR_TO_PACKET/PTR_TO_PACKET_END.

just replied with another idea in v3 thread...

^ permalink raw reply

* Re: [PATCH bpf v4 2/2] selftests/bpf: make sure optvals > PAGE_SIZE are bypassed
From: Alexei Starovoitov @ 2020-06-16 23:06 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Network Development, bpf, David S. Miller, Alexei Starovoitov,
	Daniel Borkmann
In-Reply-To: <20200616225256.246769-2-sdf@google.com>

On Tue, Jun 16, 2020 at 3:53 PM Stanislav Fomichev <sdf@google.com> wrote:
> +       if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) {
> +               if (optval > optval_end) {

same issue as before ?
see reply in v3.

> +                       /* For optval > PAGE_SIZE, the actual data
> +                        * is not provided.
> +                        */
> +                       return 0; /* EPERM, unexpected data size */
> +               }
> +               return 1;
> +       }
> +

^ permalink raw reply

* Re: [PATCH 5/5] Huawei BMA: Adding Huawei BMA driver: host_kbox_drv
From: Andrew Lunn @ 2020-06-16 23:11 UTC (permalink / raw)
  To: yunaixin03610; +Cc: netdev, yunaixin
In-Reply-To: <20200616020554.1443-1-yunaixin03610@163.com>

> The host_kbox_drv driver serves the function of a black box. When a
> panic or mce event happen to the system, it will record the event
> time, system's status and system logs and send them to BMC before
> the OS shutdown. This driver depends on the host_edms_drv driver.

This does not belong in net, or a network driver.

It is also very difficult to do anything with a panicing kernel. Which
is what kexec on panic is all about. Get into a fresh kernel, and use
that kernel to dump the status of the crashed kernel.

So i doubt you will find a home anywhere in the kernel for any of this
code.

	Andrew

^ permalink raw reply

* Re: [PATCH bpf 2/2] selftests/bpf: add variable-length data concatenation pattern test
From: Andrii Nakryiko @ 2020-06-16 23:14 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Andrii Nakryiko, bpf, Networking, Alexei Starovoitov, Kernel Team,
	Christoph Hellwig
In-Reply-To: <dd14f356-44bc-0ff0-a089-ce9fb9936c62@iogearbox.net>

On Tue, Jun 16, 2020 at 3:23 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 6/16/20 11:27 PM, Andrii Nakryiko wrote:
> > On Tue, Jun 16, 2020 at 1:21 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >> On 6/16/20 7:04 AM, Andrii Nakryiko wrote:
> >>> Add selftest that validates variable-length data reading and concatentation
> >>> with one big shared data array. This is a common pattern in production use for
> >>> monitoring and tracing applications, that potentially can read a lot of data,
> >>> but usually reads much less. Such pattern allows to determine precisely what
> >>> amount of data needs to be sent over perfbuf/ringbuf and maximize efficiency.
> >>>
> >>> This is the first BPF selftest that at all looks at and tests
> >>> bpf_probe_read_str()-like helper's return value, closing a major gap in BPF
> >>> testing. It surfaced the problem with bpf_probe_read_kernel_str() returning
> >>> 0 on success, instead of amount of bytes successfully read.
> >>>
> >>> Signed-off-by: Andrii Nakryiko <andriin@fb.com>
> >>
> >> Fix looks good, but I'm seeing an issue in the selftest on my side. With latest
> >> Clang/LLVM I'm getting:
> >>
> >> # ./test_progs -t varlen
> >> #86 varlen:OK
> >> Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED
> >>
> >> All good, however, the test_progs-no_alu32 fails for me with:
> >
> > Yeah, same here. It's due to Clang emitting unnecessary bit shifts
> > because bpf_probe_read_kernel_str() is defined as returning 32-bit
> > int. I have a patch ready locally, just waiting for bpf-next to open,
> > which switches those helpers to return long, which auto-matically
> > fixes this test.
> >
> > If it's not a problem, I'd just wait for that patch to go into
> > bpf-next. If not, I can sprinkle bits of assembly magic around to
> > force the kernel to do those bitshifts earlier. But I figured having
> > test_progs-no_alu32 failing one selftest temporarily wasn't too bad.
>
> Given {net,bpf}-next will open up soon, another option could be to take in the fix
> itself to bpf and selftest would be submitted together with your other improvement;
> any objections?
>

Yeah, no objections.

> Thanks,
> Daniel

^ permalink raw reply

* Re: [net 0/3][pull request] Intel Wired LAN Driver Updates 2020-06-16
From: David Miller @ 2020-06-16 23:16 UTC (permalink / raw)
  To: jeffrey.t.kirsher; +Cc: netdev, nhorman, sassmann
In-Reply-To: <20200616225354.2744572-1-jeffrey.t.kirsher@intel.com>

From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Tue, 16 Jun 2020 15:53:51 -0700

> This series contains fixes to e1000 and e1000e.
> 
> Chen fixes an e1000e issue where systems could be waken via WoL, even
> though the user has disabled the wakeup bit via sysfs.
> 
> Vaibhav Gupta updates the e1000 driver to clean up the legacy Power
> Management hooks.
> 
> Arnd Bergmann cleans up the inconsistent use CONFIG_PM_SLEEP
> preprocessor tags, which also resolves the compiler warnings about the
> possibility of unused structure.
> 
> The following are changes since commit b8ad540dd4e40566c520dff491fc06c71ae6b989:
>   mptcp: fix memory leak in mptcp_subflow_create_socket()
> and are available in the git repository at:
>   git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/net-queue 1GbE

Pulled, thanks Jeff.

^ permalink raw reply

* Re: [PATCH bpf v3 1/2] bpf: don't return EINVAL from {get,set}sockopt when optlen > PAGE_SIZE
From: Stanislav Fomichev @ 2020-06-16 23:20 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Netdev, bpf, David Miller, Alexei Starovoitov, Daniel Borkmann,
	David Laight
In-Reply-To: <20200616230355.hzipb7hly3fo5moc@ast-mbp.dhcp.thefacebook.com>

On Tue, Jun 16, 2020 at 4:04 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Jun 15, 2020 at 09:41:38AM -0700, Stanislav Fomichev wrote:
> > On Fri, Jun 12, 2020 at 8:50 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > [ .. ]
> > > > > It's probably ok, but makes me uneasy about verifier consequences.
> > > > > ctx->optval is PTR_TO_PACKET and it's a valid pointer from verifier pov.
> > > > > Do we have cases already where PTR_TO_PACKET == PTR_TO_PACKET_END ?
> > > > > I don't think we have such tests. I guess bpf prog won't be able to read
> > > > > anything and nothing will crash, but having PTR_TO_PACKET that is
> > > > > actually NULL would be an odd special case to keep in mind for everyone
> > > > > who will work on the verifier from now on.
> > > > >
> > > > > Also consider bpf prog that simply reads something small like 4 bytes.
> > > > > IP_FREEBIND sockopt (like your selftest in the patch 2) will have
> > > > > those 4 bytes, so it's natural for the prog to assume that it can read it.
> > > > > It will have
> > > > > p = ctx->optval;
> > > > > if (p + 4 > ctx->optval_end)
> > > > >  /* goto out and don't bother logging, since that never happens */
> > > > > *(u32*)p;
> > > > >
> > > > > but 'clever' user space would pass long optlen and prog suddenly
> > > > > 'not seeing' the sockopt. It didn't crash, but debugging would be
> > > > > surprising.
> > > > >
> > > > > I feel it's better to copy the first 4k and let the program see it.
> > > > Agreed with the IP_FREEBIND example wrt observability, however it's
> > > > not clear what to do with the cropped buffer if the bpf program
> > > > modifies it.
> > > >
> > > > Consider that huge iptables setsockopts where the usespace passes
> > > > PAGE_SIZE*10 optlen with real data and bpf prog sees only part of it.
> > > > Now, if the bpf program modifies the buffer (say, flips some byte), we
> > > > are back to square one. We either have to silently discard that buffer
> > > > or reallocate/merge. My reasoning with data == NULL, is that at least
> > > > there is a clear signal that the program can't access the data (and
> > > > can look at optlen to see if the original buffer is indeed non-zero
> > > > and maybe deny such requests?).
> > > > At this point I'm really starting to think that maybe we should just
> > > > vmalloc everything that is >PAGE_SIZE and add a sysclt to limit an
> > > > upper bound :-/
> > > > I'll try to think about this a bit more over the weekend.
> > >
> > > Yeah. Tough choices.
> > > We can also detect in the verifier whether program accessed ctx->optval
> > > and skip alloc/copy if program didn't touch it, but I suspect in most
> > > case the program would want to read it.
> > > I think vmallocing what optlen said is DoS-able. It's better to
> > > stick with single page.
> > > Let's keep brainstorming.
> > Btw, can we use sleepable bpf for that? As in, do whatever I suggested
> > in these patches (don't expose optval>PAGE_SIZE via context), but add
> > a new helper where you can say 'copy x bytes from y offset of the
> > original optval' (the helper will do sleepable copy_form_user).
> > That way we have a clean signal to the BPF that the value is too big
> > (optval==optval_end==NULL) and the user can fallback to the helper to
> > inspect the value. We can also provide another helper to export new
> > value for this case.
>
> sleepable will be read-only and with toctou.
> I guess this patch is the least evil then ?
> But I'm confused with the test in patch 2.
> Why does it do 'if (optval > optval_end)' ?
> How is that possible when patch 1 makes them equal.
Right, it should really be 'optval < optval_end', good point!
I want to make sure in the BPF program that we can't access the buffer
(essentially return EPERM to the userpace so the test fails).
Will fix in a follow up.

> may be another idea:
> allocate 4k, copy first 4k into it, but keep ctx.optlen as original.
> if bpf prog reads optval and finds it ok in setsockopt,
> it can set ctx.optlen = 0
> which would mean run the rest of setsockopt handling with original
> '__user *optval' and ignore the buffer that was passed to bpf prog.
> In case of ctx.optlen < 4k the behavior won't change from bpf prog
> and from kernel pov.
> When ctx.optlen > 4k and prog didn't adjust it
> __cgroup_bpf_run_filter_setsockopt will do ret = -EFAULT;
> and reject sockopt.
> So bpf prog would be force to do something with large optvals.
> yet it will be able to examine first 4k bytes of it.
> It's a bit of change in behavior, but I don't think bpf prog is
> doing ctx.optlen = 0, since that's more or less the same as
> doing ctx.optlen = -2.
> or may be use some other indicator ?
> And do something similar with getsockopt.
Good suggestion! That sounds doable in theory, let me try and see if I
hit something unexpected.
I suppose trimming provided optval to zero (optlen=0) is not something
that sensible programs would do?

In this case please ignore the v4 I posted recently!

^ permalink raw reply

* Re: [PATCH bpf v4 1/2] bpf: don't return EINVAL from {get,set}sockopt when optlen > PAGE_SIZE
From: Stanislav Fomichev @ 2020-06-16 23:20 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Network Development, bpf, David S. Miller, Alexei Starovoitov,
	Daniel Borkmann, David Laight
In-Reply-To: <CAADnVQLS7=UmT9ivyuUiq8i9ZJRUyPNhN0dvdeiF32sUi=A3NQ@mail.gmail.com>

On Tue, Jun 16, 2020 at 4:05 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Jun 16, 2020 at 3:53 PM Stanislav Fomichev <sdf@google.com> wrote:
> >
> > Attaching to these hooks can break iptables because its optval is
> > usually quite big, or at least bigger than the current PAGE_SIZE limit.
> > David also mentioned some SCTP options can be big (around 256k).
> >
> > There are two possible ways to fix it:
> > 1. Increase the limit to match iptables max optval. There is, however,
> >    no clear upper limit. Technically, iptables can accept up to
> >    512M of data (not sure how practical it is though).
> >
> > 2. Bypass the value (don't expose to BPF) if it's too big and trigger
> >    BPF only with level/optname so BPF can still decide whether
> >    to allow/deny big sockopts.
> >
> > The initial attempt was implemented using strategy #1. Due to
> > listed shortcomings, let's switch to strategy #2. When there is
> > legitimate a real use-case for iptables/SCTP, we can consider increasing
> >  the PAGE_SIZE limit.
> >
> > To support the cases where len(optval) > PAGE_SIZE we can
> > leverage upcoming sleepable BPF work by providing a helper
> > which can do copy_from_user (sleepable) at the given offset
> > from the original large buffer.
> >
> > v4:
> > * use temporary buffer to avoid optval == optval_end == NULL;
> >   this removes the corner case in the verifier that might assume
> >   non-zero PTR_TO_PACKET/PTR_TO_PACKET_END.
>
> just replied with another idea in v3 thread...
Yeah, sorry about that, posted 5 mins before your reply :-( Sorry for the noise.

^ permalink raw reply

* [GIT] Networking
From: David Miller @ 2020-06-16 23:25 UTC (permalink / raw)
  To: torvalds; +Cc: akpm, netdev, linux-kernel


1) Don't get per-cpu pointer with preemption enabled in nft_set_pipapo,
   fix from Stefano Brivio.

2) Fix memory leak in ctnetlink, from Pablo Neira Ayuso.

3) Multiple definitions of MPTCP_PM_MAX_ADDR, from Geliang Tang.

4) Accidently disabling NAPI in non-error paths of macb_open(), from
   Charles Keepax.

5) Fix races between alx_stop and alx_remove, from Zekun Shen.

6) We forget to re-enable SRIOV during resume in bnxt_en driver,
   from Michael Chan.

7) Fix memory leak in ipv6_mc_destroy_dev(), from Wang Hai.

8) rxtx stats use wrong index in mvpp2 driver, from Sven Auhagen.

9) Fix memory leak in mptcp_subflow_create_socket error path,
   from Wei Yongjun.

10) We should not adjust the TCP window advertised when sending dup
    acks in non-SACK mode, because it won't be counted as a dup by the
    sender if the window size changes.  From Eric Dumazet.

11) Destroy the right number of queues during remove in mvpp2 driver,
    from Sven Auhagen.

12) Various WOL and PM fixes to e1000 driver, from Chen Yu, Vaibhav
    Gupta, and Arnd Bergmann.

Please pull, thanks a lot!

The following changes since commit b3a9e3b9622ae10064826dccb4f7a52bd88c7407:

  Linux 5.8-rc1 (2020-06-14 12:45:04 -0700)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net 

for you to fetch changes up to c9f66b43ee27409e1b614434d87e0e722efaa5f2:

  Merge branch '1GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/net-queue (2020-06-16 16:16:24 -0700)

----------------------------------------------------------------
Aditya Pakki (2):
      test_objagg: Fix potential memory leak in error handling
      rocker: fix incorrect error handling in dma_rings_init

Alaa Hleihel (2):
      net/sched: act_ct: Make tcf_ct_flow_table_restore_skb inline
      netfilter: flowtable: Make nf_flow_table_offload_add/del_cb inline

Arnd Bergmann (1):
      e1000e: fix unused-function warning

Bartosz Golaszewski (1):
      net: ethernet: mtk-star-emac: simplify interrupt handling

Charles Keepax (1):
      net: macb: Only disable NAPI on the actual error path

Chen Yu (1):
      e1000e: Do not wake up the system via WOL if device wakeup is disabled

Colin Ian King (1):
      net: axienet: fix spelling mistake in comment "Exteneded" -> "extended"

David S. Miller (4):
      Merge git://git.kernel.org/.../pablo/nf
      Merge branch 'bnxt_en-Bug-fixes'
      Merge branch 'remove-dependency-between-mlx5-act_ct-nf_flow_table'
      Merge branch '1GbE' of git://git.kernel.org/.../jkirsher/net-queue

Eric Dumazet (1):
      tcp: grow window for OOO packets only for SACK flows

Geliang Tang (2):
      mptcp: drop MPTCP_PM_MAX_ADDR
      mptcp: use list_first_entry_or_null

Ido Schimmel (1):
      mlxsw: spectrum: Adjust headroom buffers for 8x ports

Ka-Cheong Poon (1):
      net/rds: NULL pointer de-reference in rds_ib_add_one()

Martin (1):
      bareudp: Fixed configuration to avoid having garbage values

Michael Chan (3):
      bnxt_en: Simplify bnxt_resume().
      bnxt_en: Re-enable SRIOV during resume.
      bnxt_en: Fix AER reset logic on 57500 chips.

Pablo Neira Ayuso (2):
      netfilter: ctnetlink: memleak in filter initialization error path
      netfilter: nf_tables: hook list memleak in flowtable deletion

Sergei Shtylyov (1):
      MAINTAINERS: switch to my private email for Renesas Ethernet drivers

Stefano Brivio (2):
      netfilter: nft_set_rbtree: Don't account for expired elements on insertion
      netfilter: nft_set_pipapo: Disable preemption before getting per-CPU pointer

Sven Auhagen (2):
      mvpp2: ethtool rxtx stats fix
      mvpp2: remove module bugfix

Thomas Falcon (1):
      ibmvnic: Harden device login requests

Tim Harvey (1):
      lan743x: add MODULE_DEVICE_TABLE for module loading alias

Vaibhav Gupta (1):
      e1000: use generic power management

Vasundhara Volam (1):
      bnxt_en: Return from timer if interface is not in open state.

Vladimir Oltean (2):
      MAINTAINERS: merge entries for felix and ocelot drivers
      net: dsa: sja1105: fix PTP timestamping with large tc-taprio cycles

Wang Hai (1):
      mld: fix memory leak in ipv6_mc_destroy_dev()

Wang Qing (1):
      qlcnic: Use kobj_to_dev() instead

Wei Yongjun (1):
      mptcp: fix memory leak in mptcp_subflow_create_socket()

Zekun Shen (1):
      net: alx: fix race condition in alx_remove

 MAINTAINERS                                            |  30 ++++++++++-------------
 drivers/net/bareudp.c                                  |   2 ++
 drivers/net/dsa/sja1105/sja1105_ptp.c                  |   8 +++---
 drivers/net/ethernet/atheros/alx/main.c                |   9 ++++---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c              |  35 +++++++++++++--------------
 drivers/net/ethernet/cadence/macb_main.c               |   9 +++----
 drivers/net/ethernet/ibm/ibmvnic.c                     |  21 +++++++++++++---
 drivers/net/ethernet/intel/e1000/e1000_main.c          |  49 ++++++++++---------------------------
 drivers/net/ethernet/intel/e1000e/netdev.c             |  30 +++++++++++------------
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c        |  11 ++++++---
 drivers/net/ethernet/mediatek/mtk_star_emac.c          | 118 ++++++++++++++++++++++-------------------------------------------------------------------
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c         |   2 ++
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h         |  13 ++++++++++
 drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c |   1 +
 drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c    |   1 +
 drivers/net/ethernet/microchip/lan743x_main.c          |   2 ++
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_sysfs.c      |  34 +++++++++++++-------------
 drivers/net/ethernet/rocker/rocker_main.c              |   4 +--
 drivers/net/ethernet/xilinx/xilinx_axienet.h           |   2 +-
 include/net/netfilter/nf_flow_table.h                  |  49 ++++++++++++++++++++++++++++++++++---
 include/net/tc_act/tc_ct.h                             |  11 ++++++++-
 lib/test_objagg.c                                      |   4 +--
 net/ipv4/tcp_input.c                                   |  12 +++++++--
 net/ipv6/mcast.c                                       |   1 +
 net/mptcp/protocol.h                                   |   7 +-----
 net/mptcp/subflow.c                                    |   4 ++-
 net/netfilter/nf_conntrack_netlink.c                   |  32 ++++++++++++++++--------
 net/netfilter/nf_flow_table_core.c                     |  45 ----------------------------------
 net/netfilter/nf_tables_api.c                          |  31 ++++++++++++++++++------
 net/netfilter/nft_set_pipapo.c                         |   6 ++++-
 net/netfilter/nft_set_rbtree.c                         |  21 ++++++++++------
 net/rds/ib.h                                           |   8 +++++-
 net/sched/act_ct.c                                     |  11 ---------
 33 files changed, 309 insertions(+), 314 deletions(-)

^ permalink raw reply

* [PATCH net 0/3] Fix VLAN checks for SJA1105 DSA tc-flower filters
From: Vladimir Oltean @ 2020-06-16 23:58 UTC (permalink / raw)
  To: netdev; +Cc: andrew, f.fainelli, vivien.didelot, kuba

From: Vladimir Oltean <vladimir.oltean@nxp.com>

This fixes a ridiculous situation where the driver, in VLAN-unaware
mode, would refuse accepting any tc filter:

tc filter replace dev sw1p3 ingress flower skip_sw \
	dst_mac 42:be:24:9b:76:20 \
	action gate (...)
Error: sja1105: Can only gate based on {DMAC, VID, PCP}.

tc filter replace dev sw1p3 ingress protocol 802.1Q flower skip_sw \
	vlan_id 1 vlan_prio 0 dst_mac 42:be:24:9b:76:20 \
	action gate (...)
Error: sja1105: Can only gate based on DMAC.

So, without changing the VLAN awareness state, it says it doesn't want
VLAN-aware rules, and it doesn't want VLAN-unaware rules either. One
would say it's in Schrodinger's state...

Now, the situation has been made worse by commit 7f14937facdc ("net:
dsa: sja1105: keep the VLAN awareness state in a driver variable"),
which made VLAN awareness a ternary attribute, but after inspecting the
code from before that patch with a truth table, it looks like the
logical bug was there even before.

While attempting to fix this, I also noticed some leftover debugging
code in one of the places that needed to be fixed. It would have
appeared in the context of patch 3/3 anyway, so I decided to create a
patch that removes it.

Vladimir Oltean (3):
  net: dsa: sja1105: remove debugging code in sja1105_vl_gate
  net: dsa: sja1105: fix checks for VLAN state in redirect action
  net: dsa: sja1105: fix checks for VLAN state in gate action

 drivers/net/dsa/sja1105/sja1105_vl.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

-- 
2.25.1


^ permalink raw reply

* [PATCH net 1/3] net: dsa: sja1105: remove debugging code in sja1105_vl_gate
From: Vladimir Oltean @ 2020-06-16 23:58 UTC (permalink / raw)
  To: netdev; +Cc: andrew, f.fainelli, vivien.didelot, kuba
In-Reply-To: <20200616235843.756413-1-olteanv@gmail.com>

From: Vladimir Oltean <vladimir.oltean@nxp.com>

This shouldn't be there.

Fixes: 834f8933d5dd ("net: dsa: sja1105: implement tc-gate using time-triggered virtual links")
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
---
 drivers/net/dsa/sja1105/sja1105_vl.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/net/dsa/sja1105/sja1105_vl.c b/drivers/net/dsa/sja1105/sja1105_vl.c
index bdfd6c4e190d..32eca3e660e1 100644
--- a/drivers/net/dsa/sja1105/sja1105_vl.c
+++ b/drivers/net/dsa/sja1105/sja1105_vl.c
@@ -588,14 +588,10 @@ int sja1105_vl_gate(struct sja1105_private *priv, int port,
 
 	if (priv->vlan_state == SJA1105_VLAN_UNAWARE &&
 	    key->type != SJA1105_KEY_VLAN_UNAWARE_VL) {
-		dev_err(priv->ds->dev, "1: vlan state %d key type %d\n",
-			priv->vlan_state, key->type);
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Can only gate based on DMAC");
 		return -EOPNOTSUPP;
 	} else if (key->type != SJA1105_KEY_VLAN_AWARE_VL) {
-		dev_err(priv->ds->dev, "2: vlan state %d key type %d\n",
-			priv->vlan_state, key->type);
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Can only gate based on {DMAC, VID, PCP}");
 		return -EOPNOTSUPP;
-- 
2.25.1


^ permalink raw reply related

* [PATCH net 3/3] net: dsa: sja1105: fix checks for VLAN state in gate action
From: Vladimir Oltean @ 2020-06-16 23:58 UTC (permalink / raw)
  To: netdev; +Cc: andrew, f.fainelli, vivien.didelot, kuba
In-Reply-To: <20200616235843.756413-1-olteanv@gmail.com>

From: Vladimir Oltean <vladimir.oltean@nxp.com>

This action requires the VLAN awareness state of the switch to be of the
same type as the key that's being added:

- If the switch is unaware of VLAN, then the tc filter key must only
  contain the destination MAC address.
- If the switch is VLAN-aware, the key must also contain the VLAN ID and
  PCP.

But this check doesn't work unless we verify the VLAN awareness state on
both the "if" and the "else" branches.

Fixes: 834f8933d5dd ("net: dsa: sja1105: implement tc-gate using time-triggered virtual links")
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
---
 drivers/net/dsa/sja1105/sja1105_vl.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/dsa/sja1105/sja1105_vl.c b/drivers/net/dsa/sja1105/sja1105_vl.c
index a176f39a052b..0056f9c1e471 100644
--- a/drivers/net/dsa/sja1105/sja1105_vl.c
+++ b/drivers/net/dsa/sja1105/sja1105_vl.c
@@ -593,7 +593,9 @@ int sja1105_vl_gate(struct sja1105_private *priv, int port,
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Can only gate based on DMAC");
 		return -EOPNOTSUPP;
-	} else if (key->type != SJA1105_KEY_VLAN_AWARE_VL) {
+	} else if ((priv->vlan_state == SJA1105_VLAN_BEST_EFFORT ||
+		    priv->vlan_state == SJA1105_VLAN_FILTERING_FULL) &&
+		   key->type != SJA1105_KEY_VLAN_AWARE_VL) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Can only gate based on {DMAC, VID, PCP}");
 		return -EOPNOTSUPP;
-- 
2.25.1


^ permalink raw reply related

* [PATCH net 2/3] net: dsa: sja1105: fix checks for VLAN state in redirect action
From: Vladimir Oltean @ 2020-06-16 23:58 UTC (permalink / raw)
  To: netdev; +Cc: andrew, f.fainelli, vivien.didelot, kuba
In-Reply-To: <20200616235843.756413-1-olteanv@gmail.com>

From: Vladimir Oltean <vladimir.oltean@nxp.com>

This action requires the VLAN awareness state of the switch to be of the
same type as the key that's being added:

- If the switch is unaware of VLAN, then the tc filter key must only
  contain the destination MAC address.
- If the switch is VLAN-aware, the key must also contain the VLAN ID and
  PCP.

But this check doesn't work unless we verify the VLAN awareness state on
both the "if" and the "else" branches.

Fixes: dfacc5a23e22 ("net: dsa: sja1105: support flow-based redirection via virtual links")
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
---
 drivers/net/dsa/sja1105/sja1105_vl.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/dsa/sja1105/sja1105_vl.c b/drivers/net/dsa/sja1105/sja1105_vl.c
index 32eca3e660e1..a176f39a052b 100644
--- a/drivers/net/dsa/sja1105/sja1105_vl.c
+++ b/drivers/net/dsa/sja1105/sja1105_vl.c
@@ -342,7 +342,9 @@ int sja1105_vl_redirect(struct sja1105_private *priv, int port,
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Can only redirect based on DMAC");
 		return -EOPNOTSUPP;
-	} else if (key->type != SJA1105_KEY_VLAN_AWARE_VL) {
+	} else if ((priv->vlan_state == SJA1105_VLAN_BEST_EFFORT ||
+		    priv->vlan_state == SJA1105_VLAN_FILTERING_FULL) &&
+		   key->type != SJA1105_KEY_VLAN_AWARE_VL) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Can only redirect based on {DMAC, VID, PCP}");
 		return -EOPNOTSUPP;
-- 
2.25.1


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox