* [PATCH bpf 4/7] bpf: fix cg_skb types to hint access type in may_access_direct_pkt_data
From: Daniel Borkmann @ 2018-10-24 20:05 UTC (permalink / raw)
To: ast; +Cc: netdev, Daniel Borkmann, Song Liu
In-Reply-To: <20181024200549.8516-1-daniel@iogearbox.net>
Commit b39b5f411dcf ("bpf: add cg_skb_is_valid_access for
BPF_PROG_TYPE_CGROUP_SKB") added direct packet access for skbs in
cg_skb program types, however allowed access type was not added to
the may_access_direct_pkt_data() helper. Therefore the latter always
returns false. This is not directly an issue, it just means writes
are unconditionally disabled (which is correct) but also reads.
Latter is relevant in this function when BPF helpers may read direct
packet data which is unconditionally disabled then. Fix it by properly
adding BPF_PROG_TYPE_CGROUP_SKB to may_access_direct_pkt_data().
Fixes: b39b5f411dcf ("bpf: add cg_skb_is_valid_access for BPF_PROG_TYPE_CGROUP_SKB")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: Song Liu <songliubraving@fb.com>
---
kernel/bpf/verifier.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b0cc8f2..5fc9a65 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1393,6 +1393,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
case BPF_PROG_TYPE_LWT_SEG6LOCAL:
case BPF_PROG_TYPE_SK_REUSEPORT:
case BPF_PROG_TYPE_FLOW_DISSECTOR:
+ case BPF_PROG_TYPE_CGROUP_SKB:
if (t == BPF_WRITE)
return false;
/* fallthrough */
--
2.9.5
^ permalink raw reply related
* [PATCH bpf 3/7] bpf: fix direct packet access for flow dissector progs
From: Daniel Borkmann @ 2018-10-24 20:05 UTC (permalink / raw)
To: ast; +Cc: netdev, Daniel Borkmann, Petar Penkov
In-Reply-To: <20181024200549.8516-1-daniel@iogearbox.net>
Commit d58e468b1112 ("flow_dissector: implements flow dissector BPF
hook") added direct packet access for skbs in may_access_direct_pkt_data()
function where this enables read and write access to the skb->data. This
is buggy because without a prologue generator such as bpf_unclone_prologue()
we would allow for writing into cloned skbs. Original intention might have
been to only allow read access where this is not needed (similar as the
flow_dissector_func_proto() indicates which enables only bpf_skb_load_bytes()
as well), therefore this patch fixes it to restrict to read-only.
Fixes: d58e468b1112 ("flow_dissector: implements flow dissector BPF hook")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: Petar Penkov <ppenkov@google.com>
---
kernel/bpf/verifier.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 98fa0be..b0cc8f2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1387,21 +1387,23 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
enum bpf_access_type t)
{
switch (env->prog->type) {
+ /* Program types only with direct read access go here! */
case BPF_PROG_TYPE_LWT_IN:
case BPF_PROG_TYPE_LWT_OUT:
case BPF_PROG_TYPE_LWT_SEG6LOCAL:
case BPF_PROG_TYPE_SK_REUSEPORT:
- /* dst_input() and dst_output() can't write for now */
+ case BPF_PROG_TYPE_FLOW_DISSECTOR:
if (t == BPF_WRITE)
return false;
/* fallthrough */
+
+ /* Program types with direct read + write access go here! */
case BPF_PROG_TYPE_SCHED_CLS:
case BPF_PROG_TYPE_SCHED_ACT:
case BPF_PROG_TYPE_XDP:
case BPF_PROG_TYPE_LWT_XMIT:
case BPF_PROG_TYPE_SK_SKB:
case BPF_PROG_TYPE_SK_MSG:
- case BPF_PROG_TYPE_FLOW_DISSECTOR:
if (meta)
return meta->pkt_access;
--
2.9.5
^ permalink raw reply related
* [PATCH bpf 1/7] bpf: fix test suite to enable all unpriv program types
From: Daniel Borkmann @ 2018-10-24 20:05 UTC (permalink / raw)
To: ast; +Cc: netdev, Daniel Borkmann
In-Reply-To: <20181024200549.8516-1-daniel@iogearbox.net>
Given BPF_PROG_TYPE_CGROUP_SKB program types are also valid in an
unprivileged setting, lets not omit these tests and potentially
have issues fall through the cracks. Make this more obvious by
adding a small test_as_unpriv() helper.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
tools/testing/selftests/bpf/test_verifier.c | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 769d68a..8e1a79d 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -4891,6 +4891,8 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},
.result = ACCEPT,
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "R3 pointer comparison prohibited",
.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
},
{
@@ -5146,6 +5148,7 @@ static struct bpf_test tests[] = {
.fixup_cgroup_storage = { 1 },
.result = REJECT,
.errstr = "get_local_storage() doesn't support non-zero flags",
+ .errstr_unpriv = "R2 leaks addr into helper function",
.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
},
{
@@ -5261,6 +5264,7 @@ static struct bpf_test tests[] = {
.fixup_percpu_cgroup_storage = { 1 },
.result = REJECT,
.errstr = "get_local_storage() doesn't support non-zero flags",
+ .errstr_unpriv = "R2 leaks addr into helper function",
.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
},
{
@@ -14050,6 +14054,13 @@ static void get_unpriv_disabled()
fclose(fd);
}
+static bool test_as_unpriv(struct bpf_test *test)
+{
+ return !test->prog_type ||
+ test->prog_type == BPF_PROG_TYPE_SOCKET_FILTER ||
+ test->prog_type == BPF_PROG_TYPE_CGROUP_SKB;
+}
+
static int do_test(bool unpriv, unsigned int from, unsigned int to)
{
int i, passes = 0, errors = 0, skips = 0;
@@ -14060,10 +14071,10 @@ static int do_test(bool unpriv, unsigned int from, unsigned int to)
/* Program types that are not supported by non-root we
* skip right away.
*/
- if (!test->prog_type && unpriv_disabled) {
+ if (test_as_unpriv(test) && unpriv_disabled) {
printf("#%d/u %s SKIP\n", i, test->descr);
skips++;
- } else if (!test->prog_type) {
+ } else if (test_as_unpriv(test)) {
if (!unpriv)
set_admin(false);
printf("#%d/u %s ", i, test->descr);
--
2.9.5
^ permalink raw reply related
* [PATCH bpf 0/7] Batch of direct packet access fixes for BPF
From: Daniel Borkmann @ 2018-10-24 20:05 UTC (permalink / raw)
To: ast; +Cc: netdev, Daniel Borkmann
Several fixes to get direct packet access in order from verifier
side. Also test suite fix to run cg_skb as unpriv and an improvement
to make direct packet write less error prone in future.
Thanks!
Daniel Borkmann (7):
bpf: fix test suite to enable all unpriv program types
bpf: disallow direct packet access for unpriv in cg_skb
bpf: fix direct packet access for flow dissector progs
bpf: fix cg_skb types to hint access type in may_access_direct_pkt_data
bpf: fix direct packet write into pop/peek helpers
bpf: fix leaking uninitialized memory on pop/peek helpers
bpf: make direct packet write unclone more robust
kernel/bpf/helpers.c | 2 --
kernel/bpf/queue_stack_maps.c | 2 ++
kernel/bpf/verifier.c | 13 ++++++++++---
net/core/filter.c | 17 +++++++++++++++++
tools/testing/selftests/bpf/test_verifier.c | 15 +++++++++++++--
5 files changed, 42 insertions(+), 7 deletions(-)
--
2.9.5
^ permalink raw reply
* [PATCH net 4/4] net: rtnl_dump_all needs to propagate error from dumpit function
From: David Ahern @ 2018-10-24 19:59 UTC (permalink / raw)
To: netdev, davem; +Cc: lirongqing, David Ahern
In-Reply-To: <20181024195902.17479-1-dsahern@kernel.org>
From: David Ahern <dsahern@gmail.com>
If an address, route or netconf dump request is sent for AF_UNSPEC, then
rtnl_dump_all is used to do the dump across all address families. If one
of the dumpit functions fails (e.g., invalid attributes in the dump
request) then rtnl_dump_all needs to propagate that error so the user
gets an appropriate response instead of just getting no data.
Fixes: effe67926624 ("net: Enable kernel side filtering of route dumps")
Fixes: 5fcd266a9f64 ("net/ipv4: Add support for dumping addresses for a specific device")
Fixes: 6371a71f3a3b ("net/ipv6: Add support for dumping addresses for a specific device")
Signed-off-by: David Ahern <dsahern@gmail.com>
---
net/core/rtnetlink.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 0958c7be2c22..f679c7a7d761 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3333,6 +3333,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
int idx;
int s_idx = cb->family;
int type = cb->nlh->nlmsg_type - RTM_BASE;
+ int ret = 0;
if (s_idx == 0)
s_idx = 1;
@@ -3365,12 +3366,13 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
cb->prev_seq = 0;
cb->seq = 0;
}
- if (dumpit(skb, cb))
+ ret = dumpit(skb, cb);
+ if (ret < 0)
break;
}
cb->family = idx;
- return skb->len;
+ return skb->len ? : ret;
}
struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
--
2.11.0
^ permalink raw reply related
* [PATCH net 3/4] net: Don't return invalid table id error when dumping all families
From: David Ahern @ 2018-10-24 19:59 UTC (permalink / raw)
To: netdev, davem; +Cc: lirongqing, David Ahern
In-Reply-To: <20181024195902.17479-1-dsahern@kernel.org>
From: David Ahern <dsahern@gmail.com>
When doing a route dump across all address families, do not error out
if the table does not exist. This allows a route dump for AF_UNSPEC
with a table id that may only exist for some of the families.
Do return the table does not exist error if dumping routes for a
specific family and the table does not exist.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
include/net/ip_fib.h | 1 +
net/ipv4/fib_frontend.c | 4 ++++
net/ipv4/ipmr.c | 3 +++
net/ipv6/ip6_fib.c | 3 +++
net/ipv6/ip6mr.c | 3 +++
5 files changed, 14 insertions(+)
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index e8d9456bf36e..c5969762a8f4 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -226,6 +226,7 @@ struct fib_dump_filter {
u32 table_id;
/* filter_set is an optimization that an entry is set */
bool filter_set;
+ bool dump_all_families;
unsigned char protocol;
unsigned char rt_type;
unsigned int flags;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 5bf653f36911..6df95be96311 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -829,6 +829,7 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
return -EINVAL;
}
+ filter->dump_all_families = (rtm->rtm_family == AF_UNSPEC);
filter->flags = rtm->rtm_flags;
filter->protocol = rtm->rtm_protocol;
filter->rt_type = rtm->rtm_type;
@@ -899,6 +900,9 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
if (filter.table_id) {
tb = fib_get_table(net, filter.table_id);
if (!tb) {
+ if (filter.dump_all_families)
+ return skb->len;
+
NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist");
return -ENOENT;
}
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 7a3e2acda94c..a6defbec4f1b 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2542,6 +2542,9 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
mrt = ipmr_get_table(sock_net(skb->sk), filter.table_id);
if (!mrt) {
+ if (filter.dump_all_families)
+ return skb->len;
+
NL_SET_ERR_MSG(cb->extack, "ipv4: MR table does not exist");
return -ENOENT;
}
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 2a058b408a6a..1b8bc008b53b 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -620,6 +620,9 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
if (arg.filter.table_id) {
tb = fib6_get_table(net, arg.filter.table_id);
if (!tb) {
+ if (arg.filter.dump_all_families)
+ return skb->len;
+
NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist");
return -ENOENT;
}
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index c3317ffb09eb..e2ea691e42c6 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2473,6 +2473,9 @@ static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
mrt = ip6mr_get_table(sock_net(skb->sk), filter.table_id);
if (!mrt) {
+ if (filter.dump_all_families)
+ return skb->len;
+
NL_SET_ERR_MSG_MOD(cb->extack, "MR table does not exist");
return -ENOENT;
}
--
2.11.0
^ permalink raw reply related
* [PATCH net 1/4] net/ipv4: Put target net when address dump fails due to bad attributes
From: David Ahern @ 2018-10-24 19:58 UTC (permalink / raw)
To: netdev, davem; +Cc: lirongqing, David Ahern
In-Reply-To: <20181024195902.17479-1-dsahern@kernel.org>
From: David Ahern <dsahern@gmail.com>
If tgt_net is set based on IFA_TARGET_NETNSID attribute in the dump
request, make sure all error paths call put_net.
Fixes: 5fcd266a9f64 ("net/ipv4: Add support for dumping addresses for a specific device")
Fixes: c33078e3dfb1 ("net/ipv4: Update inet_dump_ifaddr for strict data checking")
Reported-by: Li RongQing <lirongqing@baidu.com>
Signed-off-by: David Ahern <dsahern@gmail.com>
---
net/ipv4/devinet.c | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 63d5b58fbfdb..9250b309c742 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1761,7 +1761,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
struct net_device *dev;
struct in_device *in_dev;
struct hlist_head *head;
- int err;
+ int err = 0;
s_h = cb->args[0];
s_idx = idx = cb->args[1];
@@ -1771,12 +1771,15 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
err = inet_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net,
skb->sk, cb);
if (err < 0)
- return err;
+ goto put_tgt_net;
+ err = 0;
if (fillargs.ifindex) {
dev = __dev_get_by_index(tgt_net, fillargs.ifindex);
- if (!dev)
- return -ENODEV;
+ if (!dev) {
+ err = -ENODEV;
+ goto put_tgt_net;
+ }
in_dev = __in_dev_get_rtnl(dev);
if (in_dev) {
@@ -1821,7 +1824,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
if (fillargs.netnsid >= 0)
put_net(tgt_net);
- return skb->len;
+ return err < 0 ? err : skb->len;
}
static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
--
2.11.0
^ permalink raw reply related
* [PATCH net 2/4] net/ipv6: Put target net when address dump fails due to bad attributes
From: David Ahern @ 2018-10-24 19:59 UTC (permalink / raw)
To: netdev, davem; +Cc: lirongqing, David Ahern
In-Reply-To: <20181024195902.17479-1-dsahern@kernel.org>
From: David Ahern <dsahern@gmail.com>
If tgt_net is set based on IFA_TARGET_NETNSID attribute in the dump
request, make sure all error paths call put_net.
Fixes: 6371a71f3a3b ("net/ipv6: Add support for dumping addresses for a specific device")
Fixes: ed6eff11790a ("net/ipv6: Update inet6_dump_addr for strict data checking")
Signed-off-by: David Ahern <dsahern@gmail.com>
---
net/ipv6/addrconf.c | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 45b84dd5c4eb..7eb09c86fa13 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -5089,23 +5089,25 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
struct net_device *dev;
struct inet6_dev *idev;
struct hlist_head *head;
+ int err = 0;
s_h = cb->args[0];
s_idx = idx = cb->args[1];
s_ip_idx = cb->args[2];
if (cb->strict_check) {
- int err;
-
err = inet6_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net,
skb->sk, cb);
if (err < 0)
- return err;
+ goto put_tgt_net;
+ err = 0;
if (fillargs.ifindex) {
dev = __dev_get_by_index(tgt_net, fillargs.ifindex);
- if (!dev)
- return -ENODEV;
+ if (!dev) {
+ err = -ENODEV;
+ goto put_tgt_net;
+ }
idev = __in6_dev_get(dev);
if (idev) {
err = in6_dump_addrs(idev, skb, cb, s_ip_idx,
@@ -5144,7 +5146,7 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
if (fillargs.netnsid >= 0)
put_net(tgt_net);
- return skb->len;
+ return err < 0 ? err : skb->len;
}
static int inet6_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
--
2.11.0
^ permalink raw reply related
* [PATCH net 0/4] net: Fixups for recent dump filtering changes
From: David Ahern @ 2018-10-24 19:58 UTC (permalink / raw)
To: netdev, davem; +Cc: lirongqing, David Ahern
From: David Ahern <dsahern@gmail.com>
Li RongQing noted that tgt_net is leaked in ipv4 due to the recent change
to handle address dumps for a specific device. The report also applies to
ipv6 and other error paths. Patches 1 and 2 fix those leaks.
Patch 3 stops route dumps from erroring out when dumping across address
families and a table id is given. This is needed in preparation for
patch 4.
Patch 4 updates the rtnl_dump_all to handle a failure in one of the dumpit
functions. At the moment, if an address dump returns an error the dump all
loop breaks but the error is dropped. The result can be no data is returned
and no error either leaving the user wondering about the addresses.
Patches were tested with a modified iproute2 to add invalid data to the
dump request causing each specific failure path to be hit in addition
to positive testing that it works as it should when given valid data.
David Ahern (4):
net/ipv4: Put target net when address dump fails due to bad attributes
net/ipv6: Put target net when address dump fails due to bad attributes
net: Don't return invalid table id error when dumping all families
net: rtnl_dump_all needs to propagate error from dumpit function
include/net/ip_fib.h | 1 +
net/core/rtnetlink.c | 6 ++++--
net/ipv4/devinet.c | 13 ++++++++-----
net/ipv4/fib_frontend.c | 4 ++++
net/ipv4/ipmr.c | 3 +++
net/ipv6/addrconf.c | 14 ++++++++------
net/ipv6/ip6_fib.c | 3 +++
net/ipv6/ip6mr.c | 3 +++
8 files changed, 34 insertions(+), 13 deletions(-)
--
2.11.0
^ permalink raw reply
* Re: Fw: [Bug 201423] New: eth0: hw csum failure
From: Andre Tomt @ 2018-10-24 19:41 UTC (permalink / raw)
To: Eric Dumazet, Eric Dumazet
Cc: Stephen Hemminger, netdev, rossi.f, Dimitris Michailidis
In-Reply-To: <e2c4e4a2-5e51-4df0-a34f-8a24b67ef55f@tomt.net>
On 21.10.2018 15:34, Andre Tomt wrote:
> On 20.10.2018 00:25, Eric Dumazet wrote:
>> On 10/19/2018 02:58 PM, Eric Dumazet wrote:
>>> On 10/16/2018 06:00 AM, Eric Dumazet wrote:
>>>> On Mon, Oct 15, 2018 at 11:30 PM Andre Tomt <andre@tomt.net> wrote:
>>>>> I've seen similar on several systems with mlx4 cards when using
>>>>> 4.18.x -
>>>>> that is hw csum failure followed by some backtrace.
>>>>>
>>>>> Only seems to happen on systems dealing with quite a bit of UDP.
>>>>>
>>>>
>>>> Strange, because mlx4 on IPv6+UDP should not use CHECKSUM_COMPLETE,
>>>> but CHECKSUM_UNNECESSARY
>>>>
>>>> I would be nice to track this a bit further, maybe by providing the
>>>> full packet content.
>>>>
> <snip>
>>>
>>> As a matter of fact Dimitris found the issue in the patch and is
>>> working on a fix involving csum_block_sub()
>>>
>>> Problems comes from trimming an odd number of bytes.
>>
>> More exactly, trimming bytes starting at an odd offset.
>
> No hw csum failures here since I deployed Dimitris fix on top of 4.18.16
> 32 hours ago.
>
> Thanks
It eventually showed up again with mlx4, on 4.18.16 + fix and also on
4.19. I still do not have a useful packet capture.
It is running a torrent client serving up various linux distributions.
> [116116.994519] p0xe0: hw csum failure
> [116116.994550] CPU: 5 PID: 0 Comm: swapper/5 Not tainted 4.19.0-1 #1
> [116116.994551] Hardware name: Supermicro Super Server/X10SRL-F, BIOS 2.0b 05/02/2017
> [116116.994555] Call Trace:
> [116116.994558] <IRQ>
> [116116.994567] dump_stack+0x5c/0x7b
> [116116.994574] __skb_gro_checksum_complete+0x9a/0xa0
> [116116.994580] udp6_gro_receive+0x211/0x290
> [116116.994585] ipv6_gro_receive+0x1b1/0x3a0
> [116116.994588] dev_gro_receive+0x3a0/0x620
> [116116.994590] ? __build_skb+0x25/0xe0
> [116116.994592] napi_gro_frags+0xa8/0x220
> [116116.994598] mlx4_en_process_rx_cq+0xa01/0xb40 [mlx4_en]
> [116116.994611] ? mlx4_cq_completion+0x23/0x70 [mlx4_core]
> [116116.994621] ? mlx4_eq_int+0x373/0xc80 [mlx4_core]
> [116116.994629] mlx4_en_poll_rx_cq+0x55/0xf0 [mlx4_en]
> [116116.994635] net_rx_action+0xe0/0x2e0
> [116116.994641] __do_softirq+0xd8/0x2ff
> [116116.994646] irq_exit+0xbd/0xd0
> [116116.994650] do_IRQ+0x85/0xd0
> [116116.994656] common_interrupt+0xf/0xf
> [116116.994659] </IRQ>
> [116116.994665] RIP: 0010:cpuidle_enter_state+0xb3/0x310
> [116116.994668] Code: 31 ff e8 e0 e0 bb ff 45 84 f6 74 17 9c 58 0f 1f 44 00 00 f6 c4 02 0f 85 3f 02 00 00 31 ff e8 64 cc c0 ff fb 66 0f 1f 44 00 00 <4c> 29 fb 48 ba cf f7 53 e3 a5 9b c4 20 48 89 d8 48 c1 fb 3f 48 f7
> [116116.994669] RSP: 0018:ffff924a0635bea8 EFLAGS: 00000246 ORIG_RAX: ffffffffffffffda
> [116116.994671] RAX: ffff9016ffb60fc0 RBX: 0000699b9835d616 RCX: 000000000000001f
> [116116.994673] RDX: 0000699b9835d616 RSI: 00000000229837f7 RDI: 0000000000000000
> [116116.994674] RBP: 0000000000000001 R08: 0000000000000002 R09: 0000000000020840
> [116116.994675] R10: ffff924a0635be88 R11: 0000000000000367 R12: ffff9016ffb69aa8
> [116116.994676] R13: ffffffffa50ac638 R14: 0000000000000000 R15: 0000699b981c63b9
> [116116.994680] ? cpuidle_enter_state+0x90/0x310
> [116116.994685] do_idle+0x1d0/0x240
> [116116.994687] cpu_startup_entry+0x5f/0x70
> [116116.994690] start_secondary+0x185/0x1a0
> [116116.994693] secondary_startup_64+0xa4/0xb0
> [116116.994709] p0xe0: hw csum failure
> [116116.994739] CPU: 5 PID: 0 Comm: swapper/5 Not tainted 4.19.0-1 #1
> [116116.994740] Hardware name: Supermicro Super Server/X10SRL-F, BIOS 2.0b 05/02/2017
> [116116.994741] Call Trace:
> [116116.994743] <IRQ>
> [116116.994746] dump_stack+0x5c/0x7b
> [116116.994751] __skb_checksum_complete+0xb8/0xd0
> [116116.994755] __udp6_lib_rcv+0xa0e/0xa20
> [116116.994764] ? nft_do_chain_inet+0x7a/0xd0 [nf_tables]
> [116116.994768] ? nft_do_chain_inet+0x7a/0xd0 [nf_tables]
> [116116.994771] ip6_input_finish+0xc0/0x460
> [116116.994774] ip6_input+0x2b/0x90
> [116116.994776] ? ip6_make_skb+0x1b0/0x1b0
> [116116.994778] ipv6_rcv+0x54/0xb0
> [116116.994781] __netif_receive_skb_one_core+0x42/0x50
> [116116.994784] netif_receive_skb_internal+0x24/0xb0
> [116116.994786] napi_gro_frags+0x171/0x220
> [116116.994790] mlx4_en_process_rx_cq+0xa01/0xb40 [mlx4_en]
> [116116.994798] ? mlx4_cq_completion+0x23/0x70 [mlx4_core]
> [116116.994803] ? mlx4_eq_int+0x373/0xc80 [mlx4_core]
> [116116.994806] mlx4_en_poll_rx_cq+0x55/0xf0 [mlx4_en]
> [116116.994808] net_rx_action+0xe0/0x2e0
> [116116.994810] __do_softirq+0xd8/0x2ff
> [116116.994812] irq_exit+0xbd/0xd0
> [116116.994814] do_IRQ+0x85/0xd0
> [116116.994816] common_interrupt+0xf/0xf
> [116116.994818] </IRQ>
> [116116.994821] RIP: 0010:cpuidle_enter_state+0xb3/0x310
> [116116.994823] Code: 31 ff e8 e0 e0 bb ff 45 84 f6 74 17 9c 58 0f 1f 44 00 00 f6 c4 02 0f 85 3f 02 00 00 31 ff e8 64 cc c0 ff fb 66 0f 1f 44 00 00 <4c> 29 fb 48 ba cf f7 53 e3 a5 9b c4 20 48 89 d8 48 c1 fb 3f 48 f7
> [116116.994824] RSP: 0018:ffff924a0635bea8 EFLAGS: 00000246 ORIG_RAX: ffffffffffffffda
> [116116.994825] RAX: ffff9016ffb60fc0 RBX: 0000699b9835d616 RCX: 000000000000001f
> [116116.994826] RDX: 0000699b9835d616 RSI: 00000000229837f7 RDI: 0000000000000000
> [116116.994827] RBP: 0000000000000001 R08: 0000000000000002 R09: 0000000000020840
> [116116.994828] R10: ffff924a0635be88 R11: 0000000000000367 R12: ffff9016ffb69aa8
> [116116.994829] R13: ffffffffa50ac638 R14: 0000000000000000 R15: 0000699b981c63b9
> [116116.994832] ? cpuidle_enter_state+0x90/0x310
> [116116.994835] do_idle+0x1d0/0x240
> [116116.994837] cpu_startup_entry+0x5f/0x70
> [116116.994838] start_secondary+0x185/0x1a0
> [116116.994840] secondary_startup_64+0xa4/0xb0
^ permalink raw reply
* Regression in 4.19 net/phy/realtek: garbled sysfs output
From: Holger Hoffstätte @ 2018-10-24 19:36 UTC (permalink / raw)
To: Netdev, Jassi Brar, David S. Miller
Hi,
Since 4.19 r8169 depends on phylib:
$lsmod | grep r8169
r8169 81920 0
libphy 57344 2 r8169,realtek
Unfortunately this now gives me the following sysfs error:
$cd /sys/module/realtek/drivers
$ls -l
ls: cannot access 'mdio_bus:RTL8201F 10/100Mbps Ethernet': No such file or directory
total 0
lrwxrwxrwx 1 root root 0 Oct 24 21:09 'mdio_bus:RTL8201CP Ethernet' -> '../../../bus/mdio_bus/drivers/RTL8201CP Ethernet'
l????????? ? ? ? ? ? 'mdio_bus:RTL8201F 10/100Mbps Ethernet'
lrwxrwxrwx 1 root root 0 Oct 24 21:09 'mdio_bus:RTL8211 Gigabit Ethernet' -> '../../../bus/mdio_bus/drivers/RTL8211 Gigabit Ethernet'
[..]
Apparently the forward slash in "10/100Mbps Ethernet" is interpreted as
directory separator that leads nowhere, and was introduced in commit
513588dd44b ("net: phy: realtek: add RTL8201F phy-id and functions").
Would it be acceptable to change the name simply to "RTL8201F Ethernet"?
thanks,
Holger
^ permalink raw reply
* [PATCH] netfilter: conntrack: fix calculation of next bucket number in early_drop
From: Vasily Khoruzhick @ 2018-10-25 3:48 UTC (permalink / raw)
To: Pablo Neira Ayuso, Jozsef Kadlecsik, Florian Westphal,
David S. Miller, netfilter-devel, coreteam, netdev, linux-kernel,
Dmitry Safonov
Cc: Vasily Khoruzhick, stable
If there's no entry to drop in bucket that corresponds to the hash,
early_drop() should look for it in other buckets. But since it increments
hash instead of bucket number, it actually looks in the same bucket 8
times: hsize is 16k by default (14 bits) and hash is 32-bit value, so
reciprocal_scale(hash, hsize) returns the same value for hash..hash+7 in
most cases.
Fix it by increasing bucket number instead of hash and rename _hash
to bucket to avoid future confusion.
Fixes: 3e86638e9a0b ("netfilter: conntrack: consider ct netns in early_drop logic")
Cc: <stable@vger.kernel.org> # v4.7+
Signed-off-by: Vasily Khoruzhick <vasilykh@arista.com>
---
net/netfilter/nf_conntrack_core.c | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index ca1168d67fac..a04af246b184 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1073,19 +1073,22 @@ static unsigned int early_drop_list(struct net *net,
return drops;
}
-static noinline int early_drop(struct net *net, unsigned int _hash)
+static noinline int early_drop(struct net *net, unsigned int hash)
{
unsigned int i;
for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
struct hlist_nulls_head *ct_hash;
- unsigned int hash, hsize, drops;
+ unsigned int bucket, hsize, drops;
rcu_read_lock();
nf_conntrack_get_ht(&ct_hash, &hsize);
- hash = reciprocal_scale(_hash++, hsize);
+ if (!i)
+ bucket = reciprocal_scale(hash, hsize);
+ else
+ bucket = (bucket + 1) % hsize;
- drops = early_drop_list(net, &ct_hash[hash]);
+ drops = early_drop_list(net, &ct_hash[bucket]);
rcu_read_unlock();
if (drops) {
--
2.19.1
^ permalink raw reply related
* Re: [PATCH] Change judgment len position
From: Willy Tarreau @ 2018-10-25 3:20 UTC (permalink / raw)
To: Joe Perches
Cc: Al Viro, Fengguang Wu, David Miller, wanghaifine, netdev, LKML,
LKP, Philip Li
In-Reply-To: <4664b8b350ed35ee24746fd34fb0e600ced776a5.camel@perches.com>
On Wed, Oct 24, 2018 at 07:41:52PM -0700, Joe Perches wrote:
> The Code of Conduct, if it exists at all, should apply
> to all of the kernel.
>
> And no, as I have previously posted, I don't agree with
> it nor the method that was used to introduce it.
>
> But it does exist.
> Its splatter affects us all.
Then just like any rule, start to use it as a guideline and not as
something extremely strict to apply to others. If someone feels
offended he can complain, there's no reason for suggesting that
maybe someone else could have felt offended, otherwise we'll end
up with a new code of thinking to explain how to think what others
could feel like...
Willy
^ permalink raw reply
* Re: [PATCH] Change judgment len position
From: Joe Perches @ 2018-10-25 2:41 UTC (permalink / raw)
To: Al Viro
Cc: Fengguang Wu, Willy Tarreau, David Miller, wanghaifine, netdev,
LKML, LKP, Philip Li
In-Reply-To: <20181025022027.GG32577@ZenIV.linux.org.uk>
On Thu, 2018-10-25 at 03:20 +0100, Al Viro wrote:
> On Wed, Oct 24, 2018 at 06:16:31PM -0700, Joe Perches wrote:
> > On Thu, 2018-10-25 at 09:11 +0800, Fengguang Wu wrote:
> > > CC Philip and LKP team.
> > > > Please try to make your first patch in drivers/staging
> > > > to get familiar with submitting patches to the kernel.
> > > > https://kernelnewbies.org/FirstKernelPatch
> > > >
> > > > Maybe there's utility in creating a filtering and auto-reply
> > > > tool for first time patch submitters for all the vger mailing
> > > > lists using some combination of previously known submitters
> > > > and the 0-day robot to point those first time submitters of
> > > > defective patches to kernelnewbies and staging.
> > >
> > > Yeah good idea. That feature can be broken into 2 parts:
> > >
> > > - an email script, which could be added to Linux scripts/ dir
> > > - maintain records for telling whether someone is first-time patch submitters
> >
> > Maybe run checkpatch on those first-time submitter patches too.
>
> OK, now I'm certain that you are trolling...
Nope, the process suggestions above are sincere.
> Joe, what really pisses me off is that it's actually at the expense of original
> poster (who had nothing to do with the CoCup)
CoCup? No doubt pronounced cock-up.
> *and* an invitation for a certain
> variety of kooks. In probably vain hope of heading that off, here's the
> summary of what happened _before_ Joe started to stir the shit:
>
> * code in question is, indeed, (slightly) bogus in mainline.
> It reads as "reject negative values for length, truncate positive ones to 4",
> but in reality it's "treat everything outside of 0..4 as 4". It's not broken
> per se, but it's certainly misleading.
> * one possible fix would be to drop the "reject negative values"
> completely, another - to move checking for negatives before the truncation.
> Patch tried to do the latter.
Umm, I suggested an appropriate mechanism to fix the patch
in this thread immediately after reading it.
> Code of Conduct is garbage, but neither Dave nor the author
> of this patch had anything to do with that mess. If you want to make a point,
> do so without shit splatter hitting innocent bystanders - people tend to
> get very annoyed by that kind of thing, and with a damn good reason.
The Code of Conduct, if it exists at all, should apply
to all of the kernel.
And no, as I have previously posted, I don't agree with
it nor the method that was used to introduce it.
But it does exist.
Its splatter affects us all.
^ permalink raw reply
* Re: [PATCH v2 bpf] bpf: devmap: fix wrong interface selection in notifier_call
From: Song Liu @ 2018-10-24 17:56 UTC (permalink / raw)
To: ap420073; +Cc: Daniel Borkmann, Alexei Starovoitov, Networking, John Fastabend
In-Reply-To: <20181024111517.13361-1-ap420073@gmail.com>
On Wed, Oct 24, 2018 at 4:16 AM Taehee Yoo <ap420073@gmail.com> wrote:
>
> The dev_map_notification() removes interface in devmap if
> unregistering interface's ifindex is same.
> But only checking ifindex is not enough because other netns can have
> same ifindex. so that wrong interface selection could occurred.
> Hence netdev pointer comparison code is added.
>
> v2: compare netdev pointer instead of using net_eq() (Daniel Borkmann)
> v1: Initial patch
>
> Fixes: 2ddf71e23cc2 ("net: add notifier hooks for devmap bpf map")
> Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Acked-by: Song Liu <songliubraving@fb.com>
> ---
> kernel/bpf/devmap.c | 3 +--
> 1 file changed, 1 insertion(+), 2 deletions(-)
>
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index 141710b82a6c..191b79948424 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -512,8 +512,7 @@ static int dev_map_notification(struct notifier_block *notifier,
> struct bpf_dtab_netdev *dev, *odev;
>
> dev = READ_ONCE(dtab->netdev_map[i]);
> - if (!dev ||
> - dev->dev->ifindex != netdev->ifindex)
> + if (!dev || netdev != dev->dev)
> continue;
> odev = cmpxchg(&dtab->netdev_map[i], dev, NULL);
> if (dev == odev)
> --
> 2.17.1
>
^ permalink raw reply
* Re: [PATCH] Change judgment len position
From: Al Viro @ 2018-10-25 2:20 UTC (permalink / raw)
To: Joe Perches
Cc: Fengguang Wu, Willy Tarreau, David Miller, wanghaifine, netdev,
LKML, LKP, Philip Li
In-Reply-To: <dda151c160e42aeb07d158d9c7cd4c1a3341ab5f.camel@perches.com>
On Wed, Oct 24, 2018 at 06:16:31PM -0700, Joe Perches wrote:
> On Thu, 2018-10-25 at 09:11 +0800, Fengguang Wu wrote:
> > CC Philip and LKP team.
> > > Please try to make your first patch in drivers/staging
> > > to get familiar with submitting patches to the kernel.
> > > https://kernelnewbies.org/FirstKernelPatch
> > >
> > > Maybe there's utility in creating a filtering and auto-reply
> > > tool for first time patch submitters for all the vger mailing
> > > lists using some combination of previously known submitters
> > > and the 0-day robot to point those first time submitters of
> > > defective patches to kernelnewbies and staging.
> >
> > Yeah good idea. That feature can be broken into 2 parts:
> >
> > - an email script, which could be added to Linux scripts/ dir
> > - maintain records for telling whether someone is first-time patch submitters
>
> Maybe run checkpatch on those first-time submitter patches too.
OK, now I'm certain that you are trolling...
Joe, what really pisses me off is that it's actually at the expense of original
poster (who had nothing to do with the CoCup) *and* an invitation for a certain
variety of kooks. In probably vain hope of heading that off, here's the
summary of what happened _before_ Joe started to stir the shit:
* code in question is, indeed, (slightly) bogus in mainline.
It reads as "reject negative values for length, truncate positive ones to 4",
but in reality it's "treat everything outside of 0..4 as 4". It's not broken
per se, but it's certainly misleading.
* one possible fix would be to drop the "reject negative values"
completely, another - to move checking for negatives before the truncation.
Patch tried to do the latter.
* the author of the patch has moved the check *too* early -
before the value being tested is even obtained. It's obviously wrong - kernel
newbie or not.
* I sincerely doubt that it was an attempt to introduce a backdoor,
albeit one would've been created if that patch went in as-is. Genuine braino
is much more likely, and we'd all done such.
* such brainos can be surprisingly hard to spot in one's code.
It's too obviously wrong, in a sense - you know what you've meant to write
and you keep seeing that instead of what you've actually written. If you
are really unlucky, that might end up with a few days worth of debugging,
with eventual embarrassed "how the fuck have I managed not to notice that
previous twenty times I went over this function today?" Been there,
done that, and so has everyone who'd actually written anything (other
than the worthless screeds, that is).
* one thing the author of that patch could be blamed for (and most
of us have fucked up that way at one point or another) is not testing the
effect of the damn thing. Modification is local, the change of behaviour -
simple and triggering that code is also trivial. Checking that the patch
has done what you expect it to do would be simple and would've caught the
problem.
`
Code of Conduct is garbage, but neither Dave nor the author
of this patch had anything to do with that mess. If you want to make a point,
do so without shit splatter hitting innocent bystanders - people tend to
get very annoyed by that kind of thing, and with a damn good reason.
Sheesh...
^ permalink raw reply
* Re: [PATCH 2/3] net: ethernet: Remove dummy runtime PM callbacks from Renesas drivers
From: Sergei Shtylyov @ 2018-10-24 17:44 UTC (permalink / raw)
To: Jarkko Nikula, linux-pm
Cc: linux-i2c, Wolfram Sang, netdev, David S . Miller,
linux-renesas-soc, linux-usb, Yoshihiro Shimoda
In-Reply-To: <20181024135134.28456-3-jarkko.nikula@linux.intel.com>
Hello!
On 10/24/2018 04:51 PM, Jarkko Nikula wrote:
> Platform drivers don't need dummy runtime PM callbacks that just return
> success in order to have runtime PM happening. This has changed since
> following commits:
>
> 05aa55dddb9e ("PM / Runtime: Lenient generic runtime pm callbacks")
> 543f2503a956 ("PM / platform_bus: Allow runtime PM by default")
> 8b313a38ecff ("PM / Platform: Use generic runtime PM callbacks directly")
Wow, these are old! :-)
> Signed-off-by: Jarkko Nikula <jarkko.nikula@linux.intel.com>
> ---
> Only build tested.
Seems long overdue...
[...]
Reviewed-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
MBR, Sergei
^ permalink raw reply
* Re: [PATCH] Change judgment len position
From: Joe Perches @ 2018-10-25 1:16 UTC (permalink / raw)
To: Fengguang Wu
Cc: Willy Tarreau, David Miller, wanghaifine, netdev, LKML, LKP,
Philip Li
In-Reply-To: <20181025011111.bhbstq5wtrwk26b5@wfg-t540p.sh.intel.com>
On Thu, 2018-10-25 at 09:11 +0800, Fengguang Wu wrote:
> CC Philip and LKP team.
> > Please try to make your first patch in drivers/staging
> > to get familiar with submitting patches to the kernel.
> > https://kernelnewbies.org/FirstKernelPatch
> >
> > Maybe there's utility in creating a filtering and auto-reply
> > tool for first time patch submitters for all the vger mailing
> > lists using some combination of previously known submitters
> > and the 0-day robot to point those first time submitters of
> > defective patches to kernelnewbies and staging.
>
> Yeah good idea. That feature can be broken into 2 parts:
>
> - an email script, which could be added to Linux scripts/ dir
> - maintain records for telling whether someone is first-time patch submitters
Maybe run checkpatch on those first-time submitter patches too.
^ permalink raw reply
* Re: [PATCH] Change judgment len position
From: Fengguang Wu @ 2018-10-25 1:11 UTC (permalink / raw)
To: Joe Perches
Cc: Willy Tarreau, David Miller, wanghaifine, netdev, LKML, LKP,
Philip Li
In-Reply-To: <49ec92564284b5beb0a151bce1d537b51340d0a8.camel@perches.com>
CC Philip and LKP team.
On Wed, Oct 24, 2018 at 04:46:18PM -0700, Joe Perches wrote:
>(adding Fengguang Wu and lkp)
>
>On Wed, 2018-10-24 at 22:48 +0200, Willy Tarreau wrote:
>> When you read this patch from an apparent first-time contributor (no
>> trace in either LKML, netdev or even google), the level of assurance
>> in the commit message is pretty good, showing that he's not at all a
>> beginner, which doesn't match at all the type of error seen in the
>> code, which doesn't even need to be compiled to see that it will emit
>> a warning and not work as advertised.
>
>Which to me is more of an indication of beginner-ness.
>
>> When you factor in the fact that the code opens a big but very discrete
>> vulnerability, I tend to think that in fact we're not facing a newbie
>> at all but someone deliberately trying to inject a subtle backdoor into
>> the kernel and disguise it as a vague bug fix,
>
>That seems unlikely as it introduces a compilation warning.
>
>A real intentional backdoor from a nefarious source would
>likely be completely correct about compilation and use the
>typical commit subject style.
>
>Anyway, who know for certain right now.
>
>I would have suggested David reply with only his second sentence
>and maybe a pointer to kernelnewbies or staging.
>
>Just something like:
>
> nack: 'len' has no value before the get_user() call.
>
> Please try to make your first patch in drivers/staging
> to get familiar with submitting patches to the kernel.
> https://kernelnewbies.org/FirstKernelPatch
>
>Maybe there's utility in creating a filtering and auto-reply
>tool for first time patch submitters for all the vger mailing
>lists using some combination of previously known submitters
>and the 0-day robot to point those first time submitters of
>defective patches to kernelnewbies and staging.
Yeah good idea. That feature can be broken into 2 parts:
- an email script, which could be added to Linux scripts/ dir
- maintain records for telling whether someone is first-time patch submitters
Thanks,
Fengguang
^ permalink raw reply
* Re: [PATCH] Change judgment len position
From: Al Viro @ 2018-10-25 1:03 UTC (permalink / raw)
To: Joe Perches; +Cc: David Miller, wanghaifine, netdev, LKML
In-Reply-To: <61d94f2a5563db4d6580c8385c3b93c8eeb3669a.camel@perches.com>
On Wed, Oct 24, 2018 at 11:28:11AM -0700, Joe Perches wrote:
> On Wed, 2018-10-24 at 10:10 -0700, David Miller wrote:
> > From: Wang Hai <wanghaifine@gmail.com>
> > Date: Wed, 24 Oct 2018 23:47:29 +0800
> >
> > > To determine whether len is less than zero, it should be put before
> > > the function min_t, because the return value of min_t is not likely
> > > to be less than zero.
> []
> > > diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
[snip obviously broken patch]
> > You can't be serious?
>
> I'm not personally taken aback by this but
> there is the new Code of
> Conduct to consider.
>
> John McEnroe earned quite a bit of his
> reputation as an 'enfant terrible' via a
> similar statement.
>
> https://www.youtube.com/watch?v=t0hK1wyrrAU
>
> Perhaps a different word choice next time in
> reply to submitters of ill-considered and/or
> defective patches could be useful.
Please tell me we are being Poe'd...
^ permalink raw reply
* Re: [PATCH RFC v3 0/3] Rlimit for module space
From: Edgecombe, Rick P @ 2018-10-25 1:01 UTC (permalink / raw)
To: daniel@iogearbox.net, jeyu@kernel.org, ard.biesheuvel@linaro.org,
mhocko@kernel.org
Cc: linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org,
jannh@google.com, keescook@chromium.org, arjan@linux.intel.com,
netdev@vger.kernel.org, tglx@linutronix.de,
linux-mips@linux-mips.org, linux-s390@vger.kernel.org,
x86@kernel.org, kristen@linux.intel.com, Dock, Deneen T,
catalin.marinas@arm.com, mingo@redhat.com, will.deacon@arm.com,
"kernel-hardening@lists.openwall
In-Reply-To: <d7cb6a8c-b7d6-5c82-6721-2b5387da673f@iogearbox.net>
On Wed, 2018-10-24 at 18:04 +0200, Daniel Borkmann wrote:
> [ +Alexei, netdev ]
>
> On 10/24/2018 05:07 PM, Jessica Yu wrote:
> > +++ Ard Biesheuvel [23/10/18 08:54 -0300]:
> > > On 22 October 2018 at 20:06, Edgecombe, Rick P
> > > <rick.p.edgecombe@intel.com> wrote:
>
> [...]
> > > I think it is wrong to conflate the two things. Limiting the number of
> > > BPF allocations and the limiting number of module allocations are two
> > > separate things, and the fact that BPF reuses module_alloc() out of
> > > convenience does not mean a single rlimit for both is appropriate.
> >
> > Hm, I think Ard has a good point. AFAIK, and correct me if I'm wrong,
> > users of module_alloc() i.e. kprobes, ftrace, bpf, seem to use it
> > because it is an easy way to obtain executable kernel memory (and
> > depending on the needs of the architecture, being additionally
> > reachable via relative branches) during runtime. The side effect is
> > that all these users share the "module" memory space, even though this
> > memory region is not exclusively used by modules (well, personally I
> > think it technically should be, because seeing module_alloc() usage
> > outside of the module loader is kind of a misuse of the module API and
> > it's confusing for people who don't know the reason behind its usage
> > outside of the module loader).
> >
> > Right now I'm not sure if it makes sense to impose a blanket limit on
> > all module_alloc() allocations when the real motivation behind the
> > rlimit is related to BPF, i.e., to stop unprivileged users from
> > hogging up all the vmalloc space for modules with JITed BPF filters.
> > So the rlimit has more to do with limiting the memory usage of BPF
> > filters than it has to do with modules themselves.
> >
> > I think Ard's suggestion of having a separate bpf_alloc/free API makes
> > a lot of sense if we want to keep track of bpf-related allocations
> > (and then the rlimit would be enforced for those). Maybe part of the
> > module mapping space could be carved out for bpf filters (e.g. have
> > BPF_VADDR, BPF_VSIZE, etc like how we have it for modules), or
> > continue sharing the region but explicitly define a separate bpf_alloc
> > API, depending on an architecture's needs. What do people think?
>
> Hmm, I think here are several issues mixed up at the same time which is just
> very confusing, imho:
>
> 1) The fact that there are several non-module users of module_alloc()
> as Jessica notes such as kprobes, ftrace, bpf, for example. While all of
> them are not being modules, they all need to alloc some piece of executable
> memory. It's nothing new, this exists for 7 years now since 0a14842f5a3c
> ("net: filter: Just In Time compiler for x86-64") from BPF side; effectively
> that is even /before/ eBPF existed. Having some different API perhaps for all
> these users seems to make sense if the goal is not to interfere with modules
> themselves. It might also help as a benefit to potentially increase that
> memory pool if we're hitting limits at scale which would not be a concern
> for normal kernel modules since there's usually just a very few of them
> needed (unlike dynamically tracing various kernel parts 24/7 w/ or w/o BPF,
> running BPF-seccomp policies, networking BPF policies, etc which need to
> scale w/ application or container deployment; so this is of much more
> dynamic and unpredictable nature).
>
> 2) Then there is rlimit which is proposing to have a "fairer" share among
> unprivileged users. I'm not fully sure yet whether rlimit is actually a
> nice usable interface for all this. I'd agree that something is needed
> on that regard, but I also tend to like Michal Hocko's cgroup proposal,
> iow, to have such resource control as part of memory cgroup could be
> something to consider _especially_ since it already _exists_ for vmalloc()
> backed memory so this should be not much different than that. It sounds
> like 2) comes on top of 1).
FWIW, cgroups seems like a better solution than rlimit to me too. Maybe you all
already know, but it looks like the cgroups vmalloc charge is done in the main
page allocator and counts against the whole kernel memory limit. A user may want
to have a higher kernel limit than the module space size, so it seems it isn't
enough by itself and some new limit would need to be added.
As for what the limit should be, I wonder if some of the disagreement is just
from the name "module space".
There is a limited resource of physical memory, so we have limits for it. There
is a limited resource of CPU time, so we have limits for it. If there is a
limited resource for preferred address space for executable code, why not just
continue that trend? If other forms of unprivileged JIT come along, would it be
better to have N limits for each type? Request_module probably can't fill the
space, but technically there are already 2 unprivileged users. So IMHO, its a
more forward looking solution.
If there are some usage/architecture combos that don't need the preferred space
they can allocate in vmalloc and have it not count against the preferred space
limit but still count against the cgroups kernel memory limit.
Another benefit of centralizing the allocation of the "executable memory
preferred space" is KASLR. Right now that is only done in module_alloc and so
all users of it get randomized. If they all call vmalloc by themselves they will
just use the normal allocator.
Anyway, it seems like either type of limit (BPF JIT or all module space) will
solve the problem equally well today.
> 3) Last but not least, there's a short term fix which is needed independently
> of 1) and 2) and should be done immediately which is to account for
> unprivileged users and restrict them based on a global configurable
> limit such that privileged use keeps functioning, and 2) could enforce
> based on the global upper limit, for example. Pending fix is under
> https://patchwork.ozlabs.org/patch/987971/ which we intend to ship to
> Linus as soon as possible as short term fix. Then something like memcg
> can be considered on top since it seems this makes most sense from a
> usability point.
>
> Thanks a lot,
> Daniel
^ permalink raw reply
* Re: [PATCH] Change judgment len position
From: Eric Dumazet @ 2018-10-25 0:50 UTC (permalink / raw)
To: Joe Perches, David Miller
Cc: w, wanghaifine, netdev, linux-kernel, fengguang.wu, lkp
In-Reply-To: <825268591809f66eb475c3b41c327809a304388f.camel@perches.com>
On 10/24/2018 05:23 PM, Joe Perches wrote:
>
> "You can't be serious?" is kind?
Context maybe ? As a non native, I do not see why it is an offense.
I would like very much we discuss about patches here, not about whatever
interpretation you or anyone could make from any answers.
Recipe for disaster in linux community :
1) Write a bot, sending one wrong patch every hour, with a random "From:" name
and email address.
Yeah, can you believe a bot can actually 'Signed-off-by:' a patch ???
2) Hundred of emails sent, from reviewers, annoyed maintainers, and flames
because the proper words for newbies were not _carefully_ chosen.
So just wait for an answer from the supposed patch author, and see what happens next.
Thank you.
^ permalink raw reply
* Re: [PATCH ghak90 (was ghak32) V4 03/10] audit: log container info of syscalls
From: Richard Guy Briggs @ 2018-10-25 0:42 UTC (permalink / raw)
To: Paul Moore
Cc: containers, linux-api, linux-audit, linux-fsdevel, linux-kernel,
netdev, netfilter-devel, ebiederm, luto, carlos, dhowells, viro,
simo, Eric Paris, Serge Hallyn
In-Reply-To: <CAHC9VhRF1vsxM-k0Lw-9NqS9b9rgXRRBu0tq4ajdFpMqUxiH4A@mail.gmail.com>
On 2018-10-24 16:55, Paul Moore wrote:
> On Wed, Oct 24, 2018 at 11:15 AM Richard Guy Briggs <rgb@redhat.com> wrote:
> > On 2018-10-19 19:16, Paul Moore wrote:
> > > On Sun, Aug 5, 2018 at 4:32 AM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > > Create a new audit record AUDIT_CONTAINER to document the audit
> > > > container identifier of a process if it is present.
> > > >
> > > > Called from audit_log_exit(), syscalls are covered.
> > > >
> > > > A sample raw event:
> > > > type=SYSCALL msg=audit(1519924845.499:257): arch=c000003e syscall=257 success=yes exit=3 a0=ffffff9c a1=56374e1cef30 a2=241 a3=1b6 items=2 ppid=606 pid=635 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=3 comm="bash" exe="/usr/bin/bash" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key="tmpcontainerid"
> > > > type=CWD msg=audit(1519924845.499:257): cwd="/root"
> > > > type=PATH msg=audit(1519924845.499:257): item=0 name="/tmp/" inode=13863 dev=00:27 mode=041777 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:tmp_t:s0 nametype= PARENT cap_fp=0000000000000000 cap_fi=0000000000000000 cap_fe=0 cap_fver=0
> > > > type=PATH msg=audit(1519924845.499:257): item=1 name="/tmp/tmpcontainerid" inode=17729 dev=00:27 mode=0100644 ouid=0 ogid=0 rdev=00:00 obj=unconfined_u:object_r:user_tmp_t:s0 nametype=CREATE cap_fp=0000000000000000 cap_fi=0000000000000000 cap_fe=0 cap_fver=0
> > > > type=PROCTITLE msg=audit(1519924845.499:257): proctitle=62617368002D6300736C65657020313B206563686F2074657374203E202F746D702F746D70636F6E7461696E65726964
> > > > type=CONTAINER msg=audit(1519924845.499:257): op=task contid=123458
> > > >
> > > > See: https://github.com/linux-audit/audit-kernel/issues/90
> > > > See: https://github.com/linux-audit/audit-userspace/issues/51
> > > > See: https://github.com/linux-audit/audit-testsuite/issues/64
> > > > See: https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID
> > > > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > > > Acked-by: Serge Hallyn <serge@hallyn.com>
> > > > Acked-by: Steve Grubb <sgrubb@redhat.com>
> > > > ---
> > > > include/linux/audit.h | 7 +++++++
> > > > include/uapi/linux/audit.h | 1 +
> > > > kernel/audit.c | 24 ++++++++++++++++++++++++
> > > > kernel/auditsc.c | 3 +++
> > > > 4 files changed, 35 insertions(+)
> > >
> > > ...
> > >
> > > > @@ -2045,6 +2045,30 @@ void audit_log_session_info(struct audit_buffer *ab)
> > > > audit_log_format(ab, " auid=%u ses=%u", auid, sessionid);
> > > > }
> > > >
> > > > +/*
> > > > + * audit_log_contid - report container info
> > > > + * @tsk: task to be recorded
> > > > + * @context: task or local context for record
> > > > + * @op: contid string description
> > > > + */
> > > > +int audit_log_contid(struct task_struct *tsk,
> > > > + struct audit_context *context, char *op)
> > > > +{
> > > > + struct audit_buffer *ab;
> > > > +
> > > > + if (!audit_contid_set(tsk))
> > > > + return 0;
> > > > + /* Generate AUDIT_CONTAINER record with container ID */
> > > > + ab = audit_log_start(context, GFP_KERNEL, AUDIT_CONTAINER);
> > > > + if (!ab)
> > > > + return -ENOMEM;
> > > > + audit_log_format(ab, "op=%s contid=%llu",
> > > > + op, audit_get_contid(tsk));
> > > > + audit_log_end(ab);
> > > > + return 0;
> > > > +}
> > > > +EXPORT_SYMBOL(audit_log_contid);
> > >
> > > As discussed in the previous iteration of the patch, I prefer
> > > AUDIT_CONTAINER_ID here over AUDIT_CONTAINER. If you feel strongly
> > > about keeping it as-is with AUDIT_CONTAINER I suppose I could live
> > > with that, but it is isn't my first choice.
> >
> > I don't have a strong opinion on this one, mildly preferring the shorter
> > one only because it is shorter.
>
> We already have multiple AUDIT_CONTAINER* record types, so it seems as
> though we should use "AUDIT_CONTAINER" as a prefix of sorts, rather
> than a type itself.
I'm fine with that. I'd still like to hear Steve's input. He had
stronger opinions than me.
> > > However, I do care about the "op" field in this record. It just
> > > doesn't make any sense; the way you are using it it is more of a
> > > context field than an operations field, and even then why is the
> > > context important from a logging and/or security perspective? Drop it
> > > please.
> >
> > I'll rename it to whatever you like. I'd suggest "ref=". The reason I
> > think it is important is there are multiple sources that aren't always
> > obvious from the other records to which it is associated. In the case
> > of ptrace and signals, there can be many target tasks listed (OBJ_PID)
> > with no other way to distinguish the matching audit container identifier
> > records all for one event. This is in addition to the default syscall
> > container identifier record. I'm not currently happy with the text
> > content to link the two, but that should be solvable (most obvious is
> > taret PID). Throwing away this information seems shortsighted.
>
> It would be helpful if you could generate real audit events
> demonstrating the problems you are describing, as well as a more
> standard syscall event, so we can discuss some possible solutions.
If the auditted process is in a container and it ptraces or signals
another process in a container, there will be two AUDIT_CONTAINER
records for the same event that won't be identified as to which record
belongs to which process or other record (SYSCALL vs 1+ OBJ_PID
records). There could be many signals recorded, each with their own
OBJ_PID record. The first is stored in the audit context and additional
ones are stored in a chained struct that can accommodate 16 entries each.
(See audit_signal_info(), __audit_ptrace().)
(As a side note, on code inspection it appears that a signal target
would get overwritten by a ptrace action if they were to happen in that
order.)
> paul moore
- RGB
--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
^ permalink raw reply
* Re: [PATCH] Change judgment len position
From: Joe Perches @ 2018-10-25 0:23 UTC (permalink / raw)
To: David Miller; +Cc: w, wanghaifine, netdev, linux-kernel, fengguang.wu, lkp
In-Reply-To: <20181024.170213.632063387857625082.davem@davemloft.net>
On Wed, 2018-10-24 at 17:02 -0700, David Miller wrote:
> From: Joe Perches <joe@perches.com>
> Date: Wed, 24 Oct 2018 16:46:18 -0700
>
> > I would have suggested David reply with only his second sentence
> > and maybe a pointer to kernelnewbies or staging.is
>
> I maintain that I was not out of line with my comment.
>
> I sought a second opinion from Greg KH and others, and they agree with
> me that I was still kind with my choice of words.
"You can't be serious?" is kind?
> I think you're taking things too far, and I will simply not stand for
> this overreaching judgement upon my behavior on this list which is
> completely not justified.
Even above here, concision is generally better.
overreaching and completely are probably not reasonable.
openness and defensiveness are somewhat antithetic.
>From the new Code of Conduct (which I don't even approve)
* Gracefully accepting constructive criticism.
cheers, Joe
^ permalink raw reply
* [PATCH 2/2] Bluetooth: btqcomsmd: use HCI_QUIRK_USE_BDADDR_PROPERTY
From: Matthias Kaehlcke @ 2018-10-25 0:21 UTC (permalink / raw)
To: Marcel Holtmann, Johan Hedberg, David S . Miller, Loic Poulain
Cc: linux-bluetooth, netdev, linux-kernel, Balakrishna Godavarthi,
Brian Norris, Dmitry Grinberg, Matthias Kaehlcke
In-Reply-To: <20181025002134.256791-1-mka@chromium.org>
Use the HCI_QUIRK_USE_BDADDR_PROPERTY quirk to let the HCI
core handle the reading of 'local-bd-address'. With this there
is no need to set HCI_QUIRK_INVALID_BDADDR, the case of a
non-existing or invalid fwnode property is handled by the core
code.
Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
---
I couldn't actually test the changes in this driver since I
don't have a device with this controller. Could someone
from Qualcomm help with this?
---
drivers/bluetooth/btqcomsmd.c | 28 +++-------------------------
1 file changed, 3 insertions(+), 25 deletions(-)
diff --git a/drivers/bluetooth/btqcomsmd.c b/drivers/bluetooth/btqcomsmd.c
index 7df3eed1ef5e..e5841602c4f1 100644
--- a/drivers/bluetooth/btqcomsmd.c
+++ b/drivers/bluetooth/btqcomsmd.c
@@ -125,23 +125,10 @@ static int btqcomsmd_setup(struct hci_dev *hdev)
return PTR_ERR(skb);
kfree_skb(skb);
- /* Devices do not have persistent storage for BD address. If no
- * BD address has been retrieved during probe, mark the device
- * as having an invalid BD address.
+ /* Devices do not have persistent storage for BD address. Retrieve
+ * it from the firmware node property.
*/
- if (!bacmp(&btq->bdaddr, BDADDR_ANY)) {
- set_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks);
- return 0;
- }
-
- /* When setting a configured BD address fails, mark the device
- * as having an invalid BD address.
- */
- err = qca_set_bdaddr_rome(hdev, &btq->bdaddr);
- if (err) {
- set_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks);
- return 0;
- }
+ set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks);
return 0;
}
@@ -169,15 +156,6 @@ static int btqcomsmd_probe(struct platform_device *pdev)
if (IS_ERR(btq->cmd_channel))
return PTR_ERR(btq->cmd_channel);
- /* The local-bd-address property is usually injected by the
- * bootloader which has access to the allocated BD address.
- */
- if (!of_property_read_u8_array(pdev->dev.of_node, "local-bd-address",
- (u8 *)&btq->bdaddr, sizeof(bdaddr_t))) {
- dev_info(&pdev->dev, "BD address %pMR retrieved from device-tree",
- &btq->bdaddr);
- }
-
hdev = hci_alloc_dev();
if (!hdev)
return -ENOMEM;
--
2.19.1.568.g152ad8e336-goog
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox