* [bpf-next PATCH 2/2] bpf: use --cgroup in test_suite if supplied
From: John Fastabend @ 2018-08-28 16:10 UTC (permalink / raw)
To: alexei.starovoitov, daniel; +Cc: netdev
In-Reply-To: <20180828160921.24004.71893.stgit@john-Precision-Tower-5810>
If the user supplies a --cgroup value in the arguments when running
the test_suite go ahaead and run the self tests there. I use this
to test with multiple cgroup users.
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
tools/testing/selftests/bpf/test_sockmap.c | 53 ++++++++++++++++------------
1 file changed, 31 insertions(+), 22 deletions(-)
diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index a0e77c6..ac7de38 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -1345,9 +1345,9 @@ static int populate_progs(char *bpf_file)
return 0;
}
-static int __test_suite(char *bpf_file)
+static int __test_suite(int cg_fd, char *bpf_file)
{
- int cg_fd, err;
+ int err, cleanup = cg_fd;
err = populate_progs(bpf_file);
if (err < 0) {
@@ -1355,22 +1355,24 @@ static int __test_suite(char *bpf_file)
return err;
}
- if (setup_cgroup_environment()) {
- fprintf(stderr, "ERROR: cgroup env failed\n");
- return -EINVAL;
- }
-
- cg_fd = create_and_get_cgroup(CG_PATH);
if (cg_fd < 0) {
- fprintf(stderr,
- "ERROR: (%i) open cg path failed: %s\n",
- cg_fd, optarg);
- return cg_fd;
- }
+ if (setup_cgroup_environment()) {
+ fprintf(stderr, "ERROR: cgroup env failed\n");
+ return -EINVAL;
+ }
+
+ cg_fd = create_and_get_cgroup(CG_PATH);
+ if (cg_fd < 0) {
+ fprintf(stderr,
+ "ERROR: (%i) open cg path failed: %s\n",
+ cg_fd, optarg);
+ return cg_fd;
+ }
- if (join_cgroup(CG_PATH)) {
- fprintf(stderr, "ERROR: failed to join cgroup\n");
- return -EINVAL;
+ if (join_cgroup(CG_PATH)) {
+ fprintf(stderr, "ERROR: failed to join cgroup\n");
+ return -EINVAL;
+ }
}
/* Tests basic commands and APIs with range of iov values */
@@ -1391,20 +1393,24 @@ static int __test_suite(char *bpf_file)
out:
printf("Summary: %i PASSED %i FAILED\n", passed, failed);
- cleanup_cgroup_environment();
- close(cg_fd);
+ if (cleanup < 0) {
+ cleanup_cgroup_environment();
+ close(cg_fd);
+ }
return err;
}
-static int test_suite(void)
+static int test_suite(int cg_fd)
{
int err;
- err = __test_suite(BPF_SOCKMAP_FILENAME);
+ err = __test_suite(cg_fd, BPF_SOCKMAP_FILENAME);
if (err)
goto out;
- err = __test_suite(BPF_SOCKHASH_FILENAME);
+ err = __test_suite(cg_fd, BPF_SOCKHASH_FILENAME);
out:
+ if (cg_fd > -1)
+ close(cg_fd);
return err;
}
@@ -1417,7 +1423,7 @@ int main(int argc, char **argv)
int test = PING_PONG;
if (argc < 2)
- return test_suite();
+ return test_suite(-1);
while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:",
long_options, &longindex)) != -1) {
@@ -1483,6 +1489,9 @@ int main(int argc, char **argv)
}
}
+ if (argc <= 3 && cg_fd)
+ return test_suite(cg_fd);
+
if (!cg_fd) {
fprintf(stderr, "%s requires cgroup option: --cgroup <path>\n",
argv[0]);
^ permalink raw reply related
* [bpf-next PATCH 1/2] bpf: sockmap test remove shutdown() calls
From: John Fastabend @ 2018-08-28 16:10 UTC (permalink / raw)
To: alexei.starovoitov, daniel; +Cc: netdev
In-Reply-To: <20180828160921.24004.71893.stgit@john-Precision-Tower-5810>
Currently, we do a shutdown(sk, SHUT_RDWR) on both peer sockets and
a shutdown on the sender as well. However, this is incorrect and can
occasionally cause issues if you happen to have bad timing. First
peer1 or peer2 may still be in use depending on the test and timing.
Second we really should only be closing the read side and/or write
side depending on if the test is receiving or sending.
But, really none of this is needed just remove the shutdown calls.
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
tools/testing/selftests/bpf/test_sockmap.c | 3 ---
1 file changed, 3 deletions(-)
diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 0c7d9e5..a0e77c6 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -469,8 +469,6 @@ static int sendmsg_test(struct sockmap_options *opt)
fprintf(stderr,
"msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n",
iov_count, iov_buf, cnt, err);
- shutdown(p2, SHUT_RDWR);
- shutdown(p1, SHUT_RDWR);
if (s.end.tv_sec - s.start.tv_sec) {
sent_Bps = sentBps(s);
recvd_Bps = recvdBps(s);
@@ -500,7 +498,6 @@ static int sendmsg_test(struct sockmap_options *opt)
fprintf(stderr,
"msg_loop_tx: iov_count %i iov_buf %i cnt %i err %i\n",
iov_count, iov_buf, cnt, err);
- shutdown(c1, SHUT_RDWR);
if (s.end.tv_sec - s.start.tv_sec) {
sent_Bps = sentBps(s);
recvd_Bps = recvdBps(s);
^ permalink raw reply related
* [bpf-next PATCH 0/2] bpf: test_sockmap updates
From: John Fastabend @ 2018-08-28 16:10 UTC (permalink / raw)
To: alexei.starovoitov, daniel; +Cc: netdev
Two small test sockmap updates for bpf-next. These help me run some
additional tests with test_sockmap.
---
John Fastabend (2):
bpf: sockmap test remove shutdown() calls
bpf: use --cgroup in test_suite if supplied
tools/testing/selftests/bpf/test_sockmap.c | 56 ++++++++++++++++------------
1 file changed, 31 insertions(+), 25 deletions(-)
^ permalink raw reply
* Re: Oops running iptables -F OUTPUT
From: Ard Biesheuvel @ 2018-08-28 16:09 UTC (permalink / raw)
To: Nicholas Piggin
Cc: Andreas Schwab, <netdev@vger.kernel.org>, linuxppc-dev,
Jessica Yu, Michael Ellerman, Will Deacon, Ingo Molnar,
Andrew Morton, linux-arch
In-Reply-To: <CAKv+Gu8ROLamoxFZzbqrzSHq1cUQg5hn02HrKGB_0AT=EcBJpg@mail.gmail.com>
On 28 August 2018 at 15:56, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> Hello Andreas, Nick,
>
> On 28 August 2018 at 06:06, Nicholas Piggin <nicholas.piggin@gmail.com> wrote:
>> On Mon, 27 Aug 2018 19:11:01 +0200
>> Andreas Schwab <schwab@linux-m68k.org> wrote:
>>
>>> I'm getting this Oops when running iptables -F OUTPUT:
>>>
>>> [ 91.139409] Unable to handle kernel paging request for data at address 0xd0000001fff12f34
>>> [ 91.139414] Faulting instruction address: 0xd0000000016a5718
>>> [ 91.139419] Oops: Kernel access of bad area, sig: 11 [#1]
>>> [ 91.139426] BE SMP NR_CPUS=2 PowerMac
>>> [ 91.139434] Modules linked in: iptable_filter ip_tables x_tables bpfilter nfsd auth_rpcgss lockd grace nfs_acl sunrpc tun af_packet snd_aoa_codec_tas snd_aoa_fabric_layout snd_aoa snd_aoa_i2sbus snd_aoa_soundbus snd_pcm_oss snd_pcm snd_seq snd_timer snd_seq_device snd_mixer_oss snd sungem sr_mod firewire_ohci cdrom sungem_phy soundcore firewire_core pata_macio crc_itu_t sg hid_generic usbhid linear md_mod ohci_pci ohci_hcd ehci_pci ehci_hcd usbcore usb_common dm_snapshot dm_bufio dm_mirror dm_region_hash dm_log dm_mod sata_svw
>>> [ 91.139522] CPU: 1 PID: 3620 Comm: iptables Not tainted 4.19.0-rc1 #1
>>> [ 91.139526] NIP: d0000000016a5718 LR: d0000000016a569c CTR: c0000000006f560c
>>> [ 91.139531] REGS: c0000001fa577670 TRAP: 0300 Not tainted (4.19.0-rc1)
>>> [ 91.139534] MSR: 900000000200b032 <SF,HV,VEC,EE,FP,ME,IR,DR,RI> CR: 84002484 XER: 20000000
>>> [ 91.139553] DAR: d0000001fff12f34 DSISR: 40000000 IRQMASK: 0
>>> GPR00: d0000000016a569c c0000001fa5778f0 d0000000016b0400 0000000000000000
>>> GPR04: 0000000000000002 0000000000000000 80000001fa46418e c0000001fa0d05c8
>>> GPR08: d0000000016b0400 d00037fffff13000 00000001ff3e7000 d0000000016a6fb8
>>> GPR12: c0000000006f560c c00000000ffff780 0000000000000000 0000000000000000
>>> GPR16: 0000000011635010 00003fffa1b7aa68 0000000000000000 0000000000000000
>>> GPR20: 0000000000000003 0000000010013918 00000000116350c0 c000000000b88990
>>> GPR24: c000000000b88ba4 0000000000000000 d0000001fff12f34 0000000000000000
>>> GPR28: d0000000016b8000 c0000001fa20f400 c0000001fa20f440 0000000000000000
>>> [ 91.139627] NIP [d0000000016a5718] .alloc_counters.isra.10+0xbc/0x140 [ip_tables]
>>> [ 91.139634] LR [d0000000016a569c] .alloc_counters.isra.10+0x40/0x140 [ip_tables]
>>> [ 91.139638] Call Trace:
>>> [ 91.139645] [c0000001fa5778f0] [d0000000016a569c] .alloc_counters.isra.10+0x40/0x140 [ip_tables] (unreliable)
>>> [ 91.139655] [c0000001fa5779b0] [d0000000016a5b54] .do_ipt_get_ctl+0x110/0x2ec [ip_tables]
>>> [ 91.139666] [c0000001fa577aa0] [c0000000006233e0] .nf_getsockopt+0x68/0x88
>>> [ 91.139674] [c0000001fa577b40] [c000000000631608] .ip_getsockopt+0xbc/0x128
>>> [ 91.139682] [c0000001fa577bf0] [c00000000065adf4] .raw_getsockopt+0x18/0x5c
>>> [ 91.139690] [c0000001fa577c60] [c0000000005b5f60] .sock_common_getsockopt+0x2c/0x40
>>> [ 91.139697] [c0000001fa577cd0] [c0000000005b3394] .__sys_getsockopt+0xa4/0xd0
>>> [ 91.139704] [c0000001fa577d80] [c0000000005b5ab0] .__se_sys_socketcall+0x238/0x2b4
>>> [ 91.139712] [c0000001fa577e30] [c00000000000a31c] system_call+0x5c/0x70
>>> [ 91.139716] Instruction dump:
>>> [ 91.139721] 39290040 7d3d4a14 7fbe4840 409cff98 81380000 2b890001 419d000c 393e0060
>>> [ 91.139736] 48000010 7d57c82a e93e0060 7d295214 <815a0000> 794807e1 41e20010 7c210b78
>>> [ 91.139752] ---[ end trace f5d1d5431651845d ]---
>>
>> This is due to 7290d58095 ("module: use relative references for
>> __ksymtab entries"). This part of kernel/module.c -
>>
>> /* Divert to percpu allocation if a percpu var. */
>> if (sym[i].st_shndx == info->index.pcpu)
>> secbase = (unsigned long)mod_percpu(mod);
>> else
>> secbase = info->sechdrs[sym[i].st_shndx].sh_addr;
>> sym[i].st_value += secbase;
>>
>> Causes the distance to the target to exceed 32-bits on powerpc, so
>> it doesn't fit in a rel32 reloc. Not sure how other archs cope.
>>
>
> Apologies for the breakage. It does indeed appear to affect all
> architectures, and I'm a bit puzzled why you are the first one to spot
> it.
>
> I will try to find a clean way to special case the per-CPU variable
> __ksymtab references in the generic module code, and if that is too
> cumbersome, we can switch to 64-bit relative references (or rather,
> native word size relative references) instead. Or revert the whole
> thing ...
OK, after a bit of digging, and confirming that the arm64
implementation works as expected (its module loader actually detects
overflows of the 32-bit place relative relocations, so the problem
definitely does not occur there), I think I found the explanation why
this occurs on powerpc and not on x86 or arm64.
Could you please check whether this change makes the issue go away?
(whitespace damage courtesy of Gmail)
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 6a501b25dd85..57d09d5ceb1a 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -779,7 +779,6 @@ EXPORT_SYMBOL(__per_cpu_offset);
void __init setup_per_cpu_areas(void)
{
- const size_t dyn_size = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
size_t atom_size;
unsigned long delta;
unsigned int cpu;
@@ -795,7 +794,9 @@ void __init setup_per_cpu_areas(void)
else
atom_size = 1 << 20;
- rc = pcpu_embed_first_chunk(0, dyn_size, atom_size, pcpu_cpu_distance,
+ rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
+ PERCPU_DYNAMIC_RESERVE,
+ atom_size, pcpu_cpu_distance,
pcpu_fc_alloc, pcpu_fc_free);
if (rc < 0)
panic("cannot initialize percpu area (err=%d)", rc);
The git log does not explain why power deviates from x86 and arm64 in
the way it initializes the percpu areas.
^ permalink raw reply related
* Re: [PATCH net] sctp: hold transport before accessing its asoc in sctp_transport_get_next
From: Xin Long @ 2018-08-28 16:08 UTC (permalink / raw)
To: Neil Horman; +Cc: network dev, linux-sctp, davem, Marcelo Ricardo Leitner
In-Reply-To: <20180827130803.GA12418@hmswarspite.think-freely.org>
On Mon, Aug 27, 2018 at 9:08 PM Neil Horman <nhorman@tuxdriver.com> wrote:
>
> On Mon, Aug 27, 2018 at 06:38:31PM +0800, Xin Long wrote:
> > As Marcelo noticed, in sctp_transport_get_next, it is iterating over
> > transports but then also accessing the association directly, without
> > checking any refcnts before that, which can cause an use-after-free
> > Read.
> >
> > So fix it by holding transport before accessing the association. With
> > that, sctp_transport_hold calls can be removed in the later places.
> >
> > Fixes: 626d16f50f39 ("sctp: export some apis or variables for sctp_diag and reuse some for proc")
> > Reported-by: syzbot+fe62a0c9aa6a85c6de16@syzkaller.appspotmail.com
> > Signed-off-by: Xin Long <lucien.xin@gmail.com>
> > ---
> > net/sctp/proc.c | 4 ----
> > net/sctp/socket.c | 22 +++++++++++++++-------
> > 2 files changed, 15 insertions(+), 11 deletions(-)
> >
> > diff --git a/net/sctp/proc.c b/net/sctp/proc.c
> > index ef5c9a8..4d6f1c8 100644
> > --- a/net/sctp/proc.c
> > +++ b/net/sctp/proc.c
> > @@ -264,8 +264,6 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
> > }
> >
> > transport = (struct sctp_transport *)v;
> > - if (!sctp_transport_hold(transport))
> > - return 0;
> > assoc = transport->asoc;
> > epb = &assoc->base;
> > sk = epb->sk;
> > @@ -322,8 +320,6 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v)
> > }
> >
> > transport = (struct sctp_transport *)v;
> > - if (!sctp_transport_hold(transport))
> > - return 0;
> > assoc = transport->asoc;
> >
> > list_for_each_entry_rcu(tsp, &assoc->peer.transport_addr_list,
> > diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> > index e96b15a..aa76586 100644
> > --- a/net/sctp/socket.c
> > +++ b/net/sctp/socket.c
> > @@ -5005,9 +5005,14 @@ struct sctp_transport *sctp_transport_get_next(struct net *net,
> > break;
> > }
> >
> > + if (!sctp_transport_hold(t))
> > + continue;
> > +
> > if (net_eq(sock_net(t->asoc->base.sk), net) &&
> > t->asoc->peer.primary_path == t)
> > break;
> > +
> > + sctp_transport_put(t);
> > }
> >
> > return t;
> > @@ -5017,13 +5022,18 @@ struct sctp_transport *sctp_transport_get_idx(struct net *net,
> > struct rhashtable_iter *iter,
> > int pos)
> > {
> > - void *obj = SEQ_START_TOKEN;
> > + struct sctp_transport *t;
> >
> > - while (pos && (obj = sctp_transport_get_next(net, iter)) &&
> > - !IS_ERR(obj))
> > - pos--;
> > + if (!pos)
> > + return SEQ_START_TOKEN;
> >
> > - return obj;
> > + while ((t = sctp_transport_get_next(net, iter)) && !IS_ERR(t)) {
> > + if (!--pos)
> > + break;
> > + sctp_transport_put(t);
> > + }
> > +
> > + return t;
> > }
> >
> > int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *),
> > @@ -5082,8 +5092,6 @@ int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *),
> >
> > tsp = sctp_transport_get_idx(net, &hti, *pos + 1);
> > for (; !IS_ERR_OR_NULL(tsp); tsp = sctp_transport_get_next(net, &hti)) {
> > - if (!sctp_transport_hold(tsp))
> > - continue;
> > ret = cb(tsp, p);
> > if (ret)
> > break;
> > --
> > 2.1.0
> >
> >
> Acked-by: Neil Horman <nhorman@tuxdriver.com>
>
> Additionally, its not germaine to this particular fix, but why are we still
> using that pos variable in sctp_transport_get_idx? With the conversion to
> rhashtables, it doesn't seem particularly useful anymore.
For proc, seems so, hti is saved into seq->private.
But for diag, "hti" in sctp_for_each_transport() is a local variable.
do you think where we can save it?
^ permalink raw reply
* Waiting for
From: Ruby @ 2018-08-28 13:50 UTC (permalink / raw)
To: netdev
We provide photoshop services to some of the companies from around the
world.
We have worked on tons of images ever since our team establishment in 2009.
Many online retail companies use our services for retouching electronics,
jewelry, apparels, furniture
etc. by getting the images of their products enhanced.
Here are the details of what we provide:
Clipping path;
Deep etch process
Image masking
Remove background
Portrait retouching
Jewelry retouching
Fashion retouching
Please reply back for further info.
We can provide testing for your photos if needed.
Thanks,
Ruby
^ permalink raw reply
* Re: [PATCH net] net: bcmgenet: use MAC link status for fixed phy
From: Florian Fainelli @ 2018-08-28 19:40 UTC (permalink / raw)
To: Doug Berger, David S. Miller; +Cc: netdev, linux-kernel
In-Reply-To: <1535484795-31871-1-git-send-email-opendmb@gmail.com>
On 08/28/2018 12:33 PM, Doug Berger wrote:
> When using the fixed PHY with GENET (e.g. MOCA) the PHY link
> status can be determined from the internal link status captured
> by the MAC. This allows the PHY state machine to use the correct
> link state with the fixed PHY even if MAC link event interrupts
> are missed when the net device is opened.
>
> Fixes: 8d88c6e ("net: bcmgenet: enable MoCA link state change detection")
The 12-digit sha1 for that commit would be 8d88c6ebb34c
> Signed-off-by: Doug Berger <opendmb@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Thanks Doug!
> ---
> drivers/net/ethernet/broadcom/genet/bcmgenet.h | 3 +++
> drivers/net/ethernet/broadcom/genet/bcmmii.c | 10 ++++++++--
> 2 files changed, 11 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.h b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
> index b773bc0..14b49612a 100644
> --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.h
> +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
> @@ -186,6 +186,9 @@ struct bcmgenet_mib_counters {
> #define UMAC_MAC1 0x010
> #define UMAC_MAX_FRAME_LEN 0x014
>
> +#define UMAC_MODE 0x44
> +#define MODE_LINK_STATUS (1 << 5)
> +
> #define UMAC_EEE_CTRL 0x064
> #define EN_LPI_RX_PAUSE (1 << 0)
> #define EN_LPI_TX_PFC (1 << 1)
> diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c
> index 5333274..4241ae9 100644
> --- a/drivers/net/ethernet/broadcom/genet/bcmmii.c
> +++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c
> @@ -115,8 +115,14 @@ void bcmgenet_mii_setup(struct net_device *dev)
> static int bcmgenet_fixed_phy_link_update(struct net_device *dev,
> struct fixed_phy_status *status)
> {
> - if (dev && dev->phydev && status)
> - status->link = dev->phydev->link;
> + struct bcmgenet_priv *priv;
> + u32 reg;
> +
> + if (dev && dev->phydev && status) {
> + priv = netdev_priv(dev);
> + reg = bcmgenet_umac_readl(priv, UMAC_MODE);
> + status->link = !!(reg & MODE_LINK_STATUS);
> + }
>
> return 0;
> }
>
--
Florian
^ permalink raw reply
* net-next is OPEN...
From: David Miller @ 2018-08-28 15:43 UTC (permalink / raw)
To: netdev
You know the drill...
http://vger.kernel.org/~davem/net-next.html
^ permalink raw reply
* Re: [PATCH 1/2] net/ibm/emac: wrong emac_calc_base call was used by typo
From: Christian Lamparter @ 2018-08-28 19:35 UTC (permalink / raw)
To: Ivan Mikhaylov; +Cc: netdev, David S . Miller, linux-kernel
In-Reply-To: <20180827164336.8815-1-ivan@de.ibm.com>
On Monday, August 27, 2018 6:43:35 PM CEST Ivan Mikhaylov wrote:
> __emac_calc_base_mr1 was used instead of __emac4_calc_base_mr1
> by copy-paste mistake for emac4syn.
>
> Fixes: 45d6e545505fd32edb812f085be7de45b6a5c0af ("net/ibm/emac: add 8192 rx/tx fifo size")
> Signed-off-by: Ivan Mikhaylov <ivan@de.ibm.com>
Always nice to see :) .
I do have a ot question though:
Since you are working for IBM and probably have access to all the
wonderful docs and the working hardware: Would you consider to be
the emac/emac4/emac-sync maintainer? From what I can parse from
the /MAINTAINERS file, there isn't currently anyone listed.
Thanks,
Christian
^ permalink raw reply
* [PATCH net] net: bcmgenet: use MAC link status for fixed phy
From: Doug Berger @ 2018-08-28 19:33 UTC (permalink / raw)
To: David S. Miller; +Cc: Florian Fainelli, netdev, linux-kernel, Doug Berger
When using the fixed PHY with GENET (e.g. MOCA) the PHY link
status can be determined from the internal link status captured
by the MAC. This allows the PHY state machine to use the correct
link state with the fixed PHY even if MAC link event interrupts
are missed when the net device is opened.
Fixes: 8d88c6e ("net: bcmgenet: enable MoCA link state change detection")
Signed-off-by: Doug Berger <opendmb@gmail.com>
---
drivers/net/ethernet/broadcom/genet/bcmgenet.h | 3 +++
drivers/net/ethernet/broadcom/genet/bcmmii.c | 10 ++++++++--
2 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.h b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
index b773bc0..14b49612a 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.h
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
@@ -186,6 +186,9 @@ struct bcmgenet_mib_counters {
#define UMAC_MAC1 0x010
#define UMAC_MAX_FRAME_LEN 0x014
+#define UMAC_MODE 0x44
+#define MODE_LINK_STATUS (1 << 5)
+
#define UMAC_EEE_CTRL 0x064
#define EN_LPI_RX_PAUSE (1 << 0)
#define EN_LPI_TX_PFC (1 << 1)
diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c
index 5333274..4241ae9 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmmii.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c
@@ -115,8 +115,14 @@ void bcmgenet_mii_setup(struct net_device *dev)
static int bcmgenet_fixed_phy_link_update(struct net_device *dev,
struct fixed_phy_status *status)
{
- if (dev && dev->phydev && status)
- status->link = dev->phydev->link;
+ struct bcmgenet_priv *priv;
+ u32 reg;
+
+ if (dev && dev->phydev && status) {
+ priv = netdev_priv(dev);
+ reg = bcmgenet_umac_readl(priv, UMAC_MODE);
+ status->link = !!(reg & MODE_LINK_STATUS);
+ }
return 0;
}
--
2.7.4
^ permalink raw reply related
* Re: [PATCH] iprule: Fix destination prefix output
From: Luca Boccassi @ 2018-08-28 15:38 UTC (permalink / raw)
To: Stefan Bader, netdev; +Cc: Stephen Hemminger, Christian Ehrhardt
In-Reply-To: <1535466449-24190-1-git-send-email-stefan.bader@canonical.com>
[-- Attachment #1: Type: text/plain, Size: 1174 bytes --]
On Tue, 2018-08-28 at 16:27 +0200, Stefan Bader wrote:
> When adding support for JSON output the new code for printing
> the destination prefix adds a stray blank character before
> the bitmask. This causes some user-space parsing to fail.
>
> Current output:
> ...: from x.x.x.x/l to y.y.y.y /l
> Previous output:
> ...: from x.x.x.x/l to y.y.y.y/l
>
> Fixes: 0dd4ccc5 "iprule: add json support"
> Signed-off-by: Stefan Bader <stefan.bader@canonical.com>
> ---
> ip/iprule.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/ip/iprule.c b/ip/iprule.c
> index 8b94214..744d6d8 100644
> --- a/ip/iprule.c
> +++ b/ip/iprule.c
> @@ -239,7 +239,7 @@ int print_rule(const struct sockaddr_nl *who,
> struct nlmsghdr *n, void *arg)
>
> print_string(PRINT_FP, NULL, "to ", NULL);
> print_color_string(PRINT_ANY, ifa_family_color(frh-
> >family),
> - "dst", "%s ", dst);
> + "dst", "%s", dst);
> if (frh->dst_len != host_len)
> print_uint(PRINT_ANY, "dstlen", "/%u ", frh-
> >dst_len);
> else
Acked-by: Luca Boccassi <bluca@debian.org>
--
Kind regards,
Luca Boccassi
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 488 bytes --]
^ permalink raw reply
* Re: [PATCH net 0/3] ipv6: fix error path of inet6_init()
From: Xin Long @ 2018-08-28 15:30 UTC (permalink / raw)
To: Sabrina Dubroca; +Cc: netdev
In-Reply-To: <cover.1535451234.git.sd@queasysnail.net>
----- Original Message -----
> The error path of inet6_init() can trigger multiple kernel panics,
> mostly due to wrong ordering of cleanups. This series fixes those
> issues.
>
> Sabrina Dubroca (3):
> ipv6: fix cleanup ordering for ip6_mr failure
> ipv6: fix cleanup ordering for pingv6 registration
> net: rtnl: return early from rtnl_unregister_all when protocol isn't
> registered
>
> net/core/rtnetlink.c | 4 ++++
> net/ipv6/af_inet6.c | 10 +++++-----
> 2 files changed, 9 insertions(+), 5 deletions(-)
>
> --
> 2.18.0
>
>
Series Reviewed-by: Xin Long <lucien.xin@gmail.com>
^ permalink raw reply
* Re: [PATCH RFT] net: dsa: Allow configuring CPU port VLANs
From: Florian Fainelli @ 2018-08-28 19:17 UTC (permalink / raw)
To: Maxim Uvarov
Cc: Ilias Apalodimas, petrm, netdev, jiri, Andrew Lunn,
Vivien Didelot, David Miller, kernel list
In-Reply-To: <CAJGZr0JaqBt0BdoCAmjnMT9zN8cjBnJKJJGzWrThTjBKtwp-vA@mail.gmail.com>
On 08/28/2018 12:08 PM, Maxim Uvarov wrote:
> вт, 28 авг. 2018 г. в 20:00, Florian Fainelli <f.fainelli@gmail.com>:
>>
>> On 08/28/2018 01:32 AM, Ilias Apalodimas wrote:
>>> On Fri, Aug 10, 2018 at 04:58:10PM -0700, Florian Fainelli wrote:
>>>> On 06/25/2018 02:17 AM, Ilias Apalodimas wrote:
>>>>> On Mon, Jun 25, 2018 at 12:13:10PM +0300, Petr Machata wrote:
>>>>>> Florian Fainelli <f.fainelli@gmail.com> writes:
>>>>>>
>>>>>>> if (netif_is_bridge_master(vlan->obj.orig_dev))
>>>>>>> - return -EOPNOTSUPP;
>>>>>>> + info.port = dp->cpu_dp->index;
>>>>>>
>>>>>> The condition above will trigger also when a VLAN is added on a member
>>>>>> port, and there's no other port with that VLAN. In that case the VLAN
>>>>>> comes without the BRIDGE_VLAN_INFO_BRENTRY flag. In mlxsw we have this
>>>>>> to get the bridge VLANs:
>>>>>>
>>>>>> if (netif_is_bridge_master(orig_dev)) {
>>>>>> [...]
>>>>>> if ((vlan->flags & BRIDGE_VLAN_INFO_BRENTRY) &&
>>>>>> [...]
>>>>>>
>>>>>> This doesn't appear to be done in DSA unless I'm missing something.
>>>>> Petr's right. This will trigger for VLANs added on 'not cpu ports' if the VLAN
>>>>> is not already a member.
>>>>>
>>>>> This command has BRIDGE_VLAN_INFO_BRENTRY set:
>>>>> bridge vlan add dev br0 vid 100 pvid untagged self
>>>>> I had the same issue on my CPSW RFC and solved it
>>>>> exactly the same was as Petr suggested.
>>>>
>>>> Humm, there must be something obvious I am missing, but the following
>>>> don't exactly result in what I would expect after adding a check for
>>>> vlan->flags & BRIDGE_VLAN_INFO_BRENTRY:
>>>>
>>>> brctl addbr br0
>>>> echo 1 > /sys/class/net/br0/bridge/vlan_filtering
>>>> brctl addif br0 lan1
>>>>
>>>> #1 results in lan1 being programmed with VID 1, PVID, untagged, but not
>>>> the CPU port. I would have sort of expected that the bridge layer would
>>>> also push the configuration to br0/CPU port since this is the default VLAN:
>>>>
>>>> bridge vlan show dev br0
>>>> port vlan ids
>>>> br0 1 PVID Egress Untagged
>>>>
>>>> But it does not.
>>>>
>>>> bridge vlan add vid 2 dev lan1
>>>>
>>>> #2 same thing, results in only lan1 being programmed with VID 2, tagged
>>>> but that is expected because we are creating the VLAN only for the
>>>> user-facing port.
>>>>
>>>> bridge vlan add vid 3 dev br0 self
>>>>
>>>> #3 results in the CPU port being programmed with VID 3, tagged, again,
>>>> this is expected because we are only programming the bridge master/CPU
>>>> port here.
>>>>
>>>> Does #1 also happen for cpsw and mlxsw or do you actually get events
>>>> about the bridge's default VLAN configuration? Or does the switch driver
>>>> actually need to obtain that at the time the port is enslaved somehow?
>>> As long as ports are attached you get the events (one event per attached port
>>> iirc)
>>> if the event is checked against BRIDGE_VLAN_INFO_BRENTRY, the only way to add a
>>> VLAN to the cpu port is via 'bridge vlan add vid 3 dev br0 self'
>>
>> Do we have a guarantee that upon port enslavement, whatever default_pvid
>> is configured on the bridge master device also happens to be the port's
>> default_pvid settings as well?
>
> I think default pvid is per port thing. I.e. each port can have it's
> own pvid (i.e. it will tag with vlan id not tagged incoming packet to
> that port),
We are talking about the bridge master device's default_pvid which can
be set prior to any port being enslaved into the bridge. As of today, if
you enslave a port of a switch into a bridge, you need to properly
configure the CPU/management port as well otherwise things just wont' be
working. At the time we enslave the first port into the bridge, there is
no notification AFAICT that is generated to tell us about what the
bridge master device's default_pvid is.
> I did not exactly understand use case. With adding vlan filtering to
> cpu port you filter out packets from other vlan groups to cpu port.
> This might be useful
> only for multicast packes or missing fbd entry on some dsa port. Is
> filtering multicast a main problem to solve here?
> Linux is missing vlan ingress policy. I.e. filtering (echo 1 >
> /sys/br0/vlan_filter) has to be case of 3 policies: secure (default
> now), check and fallback. With current secure mode it
> might work, but with check mode it will be needed to add all vlans to
> cpu port. Btw, on some hardware vlan ingress policies are also per
> port, not per bridge.
The general use case is that the CPU port on switches that have such a
thing is just a normal port on which you should be able to configure
exactly the VLAN membership and attributes.
With DSA switches today, we cannot do that, because there is no network
interface exposed for the CPU port (and there should not be one), so
when you target the bridge master device, e.g: br0, we can generate
events towards the switch driver that map to the CPU port.
There are many reasons for trying to do that, if we don't support such a
thing, then we need to have the CPU port be part of all VLAN IDs that
get added to ports, as a tagged member (because if untagged, you can't
differentiate traffic anymore).
Regarding your suggestion, we could certainly change vlan_filtering to
take several values:
0: disabled
1: secure
2: check
Or something like that.
--
Florian
^ permalink raw reply
* Re: [PATCH RFT] net: dsa: Allow configuring CPU port VLANs
From: Maxim Uvarov @ 2018-08-28 19:08 UTC (permalink / raw)
To: Florian Fainelli
Cc: Ilias Apalodimas, petrm, netdev, jiri, Andrew Lunn,
Vivien Didelot, David Miller, kernel list
In-Reply-To: <da1a9896-94aa-589c-3fd2-023b1586bb15@gmail.com>
вт, 28 авг. 2018 г. в 20:00, Florian Fainelli <f.fainelli@gmail.com>:
>
> On 08/28/2018 01:32 AM, Ilias Apalodimas wrote:
> > On Fri, Aug 10, 2018 at 04:58:10PM -0700, Florian Fainelli wrote:
> >> On 06/25/2018 02:17 AM, Ilias Apalodimas wrote:
> >>> On Mon, Jun 25, 2018 at 12:13:10PM +0300, Petr Machata wrote:
> >>>> Florian Fainelli <f.fainelli@gmail.com> writes:
> >>>>
> >>>>> if (netif_is_bridge_master(vlan->obj.orig_dev))
> >>>>> - return -EOPNOTSUPP;
> >>>>> + info.port = dp->cpu_dp->index;
> >>>>
> >>>> The condition above will trigger also when a VLAN is added on a member
> >>>> port, and there's no other port with that VLAN. In that case the VLAN
> >>>> comes without the BRIDGE_VLAN_INFO_BRENTRY flag. In mlxsw we have this
> >>>> to get the bridge VLANs:
> >>>>
> >>>> if (netif_is_bridge_master(orig_dev)) {
> >>>> [...]
> >>>> if ((vlan->flags & BRIDGE_VLAN_INFO_BRENTRY) &&
> >>>> [...]
> >>>>
> >>>> This doesn't appear to be done in DSA unless I'm missing something.
> >>> Petr's right. This will trigger for VLANs added on 'not cpu ports' if the VLAN
> >>> is not already a member.
> >>>
> >>> This command has BRIDGE_VLAN_INFO_BRENTRY set:
> >>> bridge vlan add dev br0 vid 100 pvid untagged self
> >>> I had the same issue on my CPSW RFC and solved it
> >>> exactly the same was as Petr suggested.
> >>
> >> Humm, there must be something obvious I am missing, but the following
> >> don't exactly result in what I would expect after adding a check for
> >> vlan->flags & BRIDGE_VLAN_INFO_BRENTRY:
> >>
> >> brctl addbr br0
> >> echo 1 > /sys/class/net/br0/bridge/vlan_filtering
> >> brctl addif br0 lan1
> >>
> >> #1 results in lan1 being programmed with VID 1, PVID, untagged, but not
> >> the CPU port. I would have sort of expected that the bridge layer would
> >> also push the configuration to br0/CPU port since this is the default VLAN:
> >>
> >> bridge vlan show dev br0
> >> port vlan ids
> >> br0 1 PVID Egress Untagged
> >>
> >> But it does not.
> >>
> >> bridge vlan add vid 2 dev lan1
> >>
> >> #2 same thing, results in only lan1 being programmed with VID 2, tagged
> >> but that is expected because we are creating the VLAN only for the
> >> user-facing port.
> >>
> >> bridge vlan add vid 3 dev br0 self
> >>
> >> #3 results in the CPU port being programmed with VID 3, tagged, again,
> >> this is expected because we are only programming the bridge master/CPU
> >> port here.
> >>
> >> Does #1 also happen for cpsw and mlxsw or do you actually get events
> >> about the bridge's default VLAN configuration? Or does the switch driver
> >> actually need to obtain that at the time the port is enslaved somehow?
> > As long as ports are attached you get the events (one event per attached port
> > iirc)
> > if the event is checked against BRIDGE_VLAN_INFO_BRENTRY, the only way to add a
> > VLAN to the cpu port is via 'bridge vlan add vid 3 dev br0 self'
>
> Do we have a guarantee that upon port enslavement, whatever default_pvid
> is configured on the bridge master device also happens to be the port's
> default_pvid settings as well?
I think default pvid is per port thing. I.e. each port can have it's
own pvid (i.e. it will tag with vlan id not tagged incoming packet to
that port),
I did not exactly understand use case. With adding vlan filtering to
cpu port you filter out packets from other vlan groups to cpu port.
This might be useful
only for multicast packes or missing fbd entry on some dsa port. Is
filtering multicast a main problem to solve here?
Linux is missing vlan ingress policy. I.e. filtering (echo 1 >
/sys/br0/vlan_filter) has to be case of 3 policies: secure (default
now), check and fallback. With current secure mode it
might work, but with check mode it will be needed to add all vlans to
cpu port. Btw, on some hardware vlan ingress policies are also per
port, not per bridge.
Best regards,
Maxim.
> --
> Florian
--
Best regards,
Maxim Uvarov
^ permalink raw reply
* [PATCH 3/3] xen-netback: handle page straddling in xenvif_set_hash_mapping()
From: Jan Beulich @ 2018-08-28 15:00 UTC (permalink / raw)
To: Paul Durrant, Wei Liu; +Cc: davem, xen-devel, netdev
In-Reply-To: <5B85622302000078001E2A35@prv1-mh.provo.novell.com>
There's no guarantee that the mapping array doesn't cross a page
boundary. Use a second grant copy operation if necessary.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
drivers/net/xen-netback/hash.c | 25 ++++++++++++++++++-------
1 file changed, 18 insertions(+), 7 deletions(-)
--- 4.19-rc1-xen-netback-set-hash-mapping.orig/drivers/net/xen-netback/hash.c
+++ 4.19-rc1-xen-netback-set-hash-mapping/drivers/net/xen-netback/hash.c
@@ -334,28 +334,39 @@ u32 xenvif_set_hash_mapping(struct xenvi
u32 off)
{
u32 *mapping = vif->hash.mapping[!vif->hash.mapping_sel];
- struct gnttab_copy copy_op = {
+ unsigned int nr = 1;
+ struct gnttab_copy copy_op[2] = {{
.source.u.ref = gref,
.source.domid = vif->domid,
.dest.domid = DOMID_SELF,
.len = len * sizeof(*mapping),
.flags = GNTCOPY_source_gref
- };
+ }};
if ((off + len < off) || (off + len > vif->hash.size) ||
len > XEN_PAGE_SIZE / sizeof(*mapping))
return XEN_NETIF_CTRL_STATUS_INVALID_PARAMETER;
- copy_op.dest.u.gmfn = virt_to_gfn(mapping + off);
- copy_op.dest.offset = xen_offset_in_page(mapping + off);
+ copy_op[0].dest.u.gmfn = virt_to_gfn(mapping + off);
+ copy_op[0].dest.offset = xen_offset_in_page(mapping + off);
+ if (copy_op[0].dest.offset + copy_op[0].len > XEN_PAGE_SIZE) {
+ copy_op[1] = copy_op[0];
+ copy_op[1].source.offset = XEN_PAGE_SIZE - copy_op[0].dest.offset;
+ copy_op[1].dest.u.gmfn = virt_to_gfn(mapping + off + len);
+ copy_op[1].dest.offset = 0;
+ copy_op[1].len = copy_op[0].len - copy_op[1].source.offset;
+ copy_op[0].len = copy_op[1].source.offset;
+ nr = 2;
+ }
memcpy(mapping, vif->hash.mapping[vif->hash.mapping_sel],
vif->hash.size * sizeof(*mapping));
- if (copy_op.len != 0) {
- gnttab_batch_copy(©_op, 1);
+ if (copy_op[0].len != 0) {
+ gnttab_batch_copy(copy_op, nr);
- if (copy_op.status != GNTST_okay)
+ if (copy_op[0].status != GNTST_okay ||
+ copy_op[nr - 1].status != GNTST_okay)
return XEN_NETIF_CTRL_STATUS_INVALID_PARAMETER;
}
^ permalink raw reply
* [PATCH 2/3] xen-netback: validate queue numbers in xenvif_set_hash_mapping()
From: Jan Beulich @ 2018-08-28 14:59 UTC (permalink / raw)
To: Paul Durrant, Wei Liu; +Cc: davem, xen-devel, netdev
In-Reply-To: <5B85622302000078001E2A35@prv1-mh.provo.novell.com>
Checking them before the grant copy means nothing as to the validity of
the incoming request. As we shouldn't make the new data live before
having validated it, introduce a second instance of the mapping array.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
drivers/net/xen-netback/common.h | 3 ++-
drivers/net/xen-netback/hash.c | 20 ++++++++++++++------
drivers/net/xen-netback/interface.c | 3 ++-
3 files changed, 18 insertions(+), 8 deletions(-)
--- 4.19-rc1-xen-netback-set-hash-mapping.orig/drivers/net/xen-netback/common.h
+++ 4.19-rc1-xen-netback-set-hash-mapping/drivers/net/xen-netback/common.h
@@ -241,8 +241,9 @@ struct xenvif_hash_cache {
struct xenvif_hash {
unsigned int alg;
u32 flags;
+ bool mapping_sel;
u8 key[XEN_NETBK_MAX_HASH_KEY_SIZE];
- u32 mapping[XEN_NETBK_MAX_HASH_MAPPING_SIZE];
+ u32 mapping[2][XEN_NETBK_MAX_HASH_MAPPING_SIZE];
unsigned int size;
struct xenvif_hash_cache cache;
};
--- 4.19-rc1-xen-netback-set-hash-mapping.orig/drivers/net/xen-netback/hash.c
+++ 4.19-rc1-xen-netback-set-hash-mapping/drivers/net/xen-netback/hash.c
@@ -324,7 +324,8 @@ u32 xenvif_set_hash_mapping_size(struct
return XEN_NETIF_CTRL_STATUS_INVALID_PARAMETER;
vif->hash.size = size;
- memset(vif->hash.mapping, 0, sizeof(u32) * size);
+ memset(vif->hash.mapping[vif->hash.mapping_sel], 0,
+ sizeof(u32) * size);
return XEN_NETIF_CTRL_STATUS_SUCCESS;
}
@@ -332,7 +333,7 @@ u32 xenvif_set_hash_mapping_size(struct
u32 xenvif_set_hash_mapping(struct xenvif *vif, u32 gref, u32 len,
u32 off)
{
- u32 *mapping = vif->hash.mapping;
+ u32 *mapping = vif->hash.mapping[!vif->hash.mapping_sel];
struct gnttab_copy copy_op = {
.source.u.ref = gref,
.source.domid = vif->domid,
@@ -348,9 +349,8 @@ u32 xenvif_set_hash_mapping(struct xenvi
copy_op.dest.u.gmfn = virt_to_gfn(mapping + off);
copy_op.dest.offset = xen_offset_in_page(mapping + off);
- while (len-- != 0)
- if (mapping[off++] >= vif->num_queues)
- return XEN_NETIF_CTRL_STATUS_INVALID_PARAMETER;
+ memcpy(mapping, vif->hash.mapping[vif->hash.mapping_sel],
+ vif->hash.size * sizeof(*mapping));
if (copy_op.len != 0) {
gnttab_batch_copy(©_op, 1);
@@ -359,6 +359,12 @@ u32 xenvif_set_hash_mapping(struct xenvi
return XEN_NETIF_CTRL_STATUS_INVALID_PARAMETER;
}
+ while (len-- != 0)
+ if (mapping[off++] >= vif->num_queues)
+ return XEN_NETIF_CTRL_STATUS_INVALID_PARAMETER;
+
+ vif->hash.mapping_sel = !vif->hash.mapping_sel;
+
return XEN_NETIF_CTRL_STATUS_SUCCESS;
}
@@ -410,6 +416,8 @@ void xenvif_dump_hash_info(struct xenvif
}
if (vif->hash.size != 0) {
+ const u32 *mapping = vif->hash.mapping[vif->hash.mapping_sel];
+
seq_puts(m, "\nHash Mapping:\n");
for (i = 0; i < vif->hash.size; ) {
@@ -422,7 +430,7 @@ void xenvif_dump_hash_info(struct xenvif
seq_printf(m, "[%4u - %4u]: ", i, i + n - 1);
for (j = 0; j < n; j++, i++)
- seq_printf(m, "%4u ", vif->hash.mapping[i]);
+ seq_printf(m, "%4u ", mapping[i]);
seq_puts(m, "\n");
}
--- 4.19-rc1-xen-netback-set-hash-mapping.orig/drivers/net/xen-netback/interface.c
+++ 4.19-rc1-xen-netback-set-hash-mapping/drivers/net/xen-netback/interface.c
@@ -162,7 +162,8 @@ static u16 xenvif_select_queue(struct ne
if (size == 0)
return skb_get_hash_raw(skb) % dev->real_num_tx_queues;
- return vif->hash.mapping[skb_get_hash_raw(skb) % size];
+ return vif->hash.mapping[vif->hash.mapping_sel]
+ [skb_get_hash_raw(skb) % size];
}
static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
^ permalink raw reply
* KASAN: use-after-free Read in vhost_work_queue
From: syzbot @ 2018-08-28 14:44 UTC (permalink / raw)
To: jasowang, kvm, linux-kernel, mst, netdev, syzkaller-bugs,
virtualization
Hello,
syzbot found the following crash on:
HEAD commit: 33e17876ea4e Merge branch 'akpm' (patches from Andrew)
git tree: upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=12d8a20a400000
kernel config: https://syzkaller.appspot.com/x/.config?x=40e5d6b26b73cd5b
dashboard link: https://syzkaller.appspot.com/bug?extid=d5a0a170c5069658b141
compiler: gcc (GCC) 8.0.1 20180413 (experimental)
userspace arch: i386
Unfortunately, I don't have any reproducer for this crash yet.
IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+d5a0a170c5069658b141@syzkaller.appspotmail.com
==================================================================
BUG: KASAN: use-after-free in vhost_work_queue+0xc3/0xe0
drivers/vhost/vhost.c:258
Read of size 8 at addr ffff880193862068 by task syz-executor7/22100
CPU: 0 PID: 22100 Comm: syz-executor7 Not tainted 4.18.0+ #108
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113
print_address_description+0x6c/0x20b mm/kasan/report.c:256
kasan_report_error mm/kasan/report.c:354 [inline]
kasan_report.cold.7+0x242/0x30d mm/kasan/report.c:412
__asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433
vhost_work_queue+0xc3/0xe0 drivers/vhost/vhost.c:258
vhost_transport_send_pkt+0x28a/0x380 drivers/vhost/vsock.c:227
virtio_transport_send_pkt_info+0x31d/0x460
net/vmw_vsock/virtio_transport_common.c:190
virtio_transport_shutdown+0x1b1/0x270
net/vmw_vsock/virtio_transport_common.c:604
vsock_send_shutdown net/vmw_vsock/af_vsock.c:451 [inline]
vsock_shutdown+0x229/0x290 net/vmw_vsock/af_vsock.c:849
__sys_shutdown+0x15c/0x2c0 net/socket.c:1964
__do_sys_shutdown net/socket.c:1972 [inline]
__se_sys_shutdown net/socket.c:1970 [inline]
__ia32_sys_shutdown+0x54/0x80 net/socket.c:1970
do_syscall_32_irqs_on arch/x86/entry/common.c:326 [inline]
do_fast_syscall_32+0x34d/0xfb2 arch/x86/entry/common.c:397
entry_SYSENTER_compat+0x70/0x7f arch/x86/entry/entry_64_compat.S:139
RIP: 0023:0xf7fa4ca9
Code: 55 08 8b 88 64 cd ff ff 8b 98 68 cd ff ff 89 c8 85 d2 74 02 89 0a 5b
5d c3 8b 04 24 c3 8b 1c 24 c3 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90
90 90 90 eb 0d 90 90 90 90 90 90 90 90 90 90 90 90
RSP: 002b:00000000f5f7f0cc EFLAGS: 00000296 ORIG_RAX: 0000000000000175
RAX: ffffffffffffffda RBX: 0000000000000009 RCX: 0000000000000000
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
Allocated by task 22094:
save_stack+0x43/0xd0 mm/kasan/kasan.c:448
set_track mm/kasan/kasan.c:460 [inline]
kasan_kmalloc+0xc4/0xe0 mm/kasan/kasan.c:553
__do_kmalloc_node mm/slab.c:3682 [inline]
__kmalloc_node+0x47/0x70 mm/slab.c:3689
kmalloc_node include/linux/slab.h:555 [inline]
kvmalloc_node+0xb9/0xf0 mm/util.c:423
kvmalloc include/linux/mm.h:577 [inline]
vhost_vsock_dev_open+0xa2/0x5a0 drivers/vhost/vsock.c:511
misc_open+0x3ca/0x560 drivers/char/misc.c:141
chrdev_open+0x25a/0x770 fs/char_dev.c:417
do_dentry_open+0x49c/0x1140 fs/open.c:771
vfs_open+0xa0/0xd0 fs/open.c:880
do_last fs/namei.c:3418 [inline]
path_openat+0x12fb/0x5300 fs/namei.c:3534
do_filp_open+0x255/0x380 fs/namei.c:3564
do_sys_open+0x584/0x720 fs/open.c:1063
__do_compat_sys_openat fs/open.c:1109 [inline]
__se_compat_sys_openat fs/open.c:1107 [inline]
__ia32_compat_sys_openat+0x98/0xf0 fs/open.c:1107
do_syscall_32_irqs_on arch/x86/entry/common.c:326 [inline]
do_fast_syscall_32+0x34d/0xfb2 arch/x86/entry/common.c:397
entry_SYSENTER_compat+0x70/0x7f arch/x86/entry/entry_64_compat.S:139
Freed by task 22093:
save_stack+0x43/0xd0 mm/kasan/kasan.c:448
set_track mm/kasan/kasan.c:460 [inline]
__kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:521
kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
__cache_free mm/slab.c:3498 [inline]
kfree+0xd9/0x210 mm/slab.c:3813
kvfree+0x61/0x70 mm/util.c:449
vhost_vsock_free drivers/vhost/vsock.c:499 [inline]
vhost_vsock_dev_release+0x4fd/0x750 drivers/vhost/vsock.c:604
__fput+0x36e/0x8c0 fs/file_table.c:278
____fput+0x15/0x20 fs/file_table.c:309
task_work_run+0x1e8/0x2a0 kernel/task_work.c:113
tracehook_notify_resume include/linux/tracehook.h:193 [inline]
exit_to_usermode_loop+0x318/0x380 arch/x86/entry/common.c:166
prepare_exit_to_usermode arch/x86/entry/common.c:197 [inline]
syscall_return_slowpath arch/x86/entry/common.c:268 [inline]
do_syscall_32_irqs_on arch/x86/entry/common.c:341 [inline]
do_fast_syscall_32+0xcd5/0xfb2 arch/x86/entry/common.c:397
entry_SYSENTER_compat+0x70/0x7f arch/x86/entry/entry_64_compat.S:139
The buggy address belongs to the object at ffff880193861fc0
which belongs to the cache kmalloc-65536 of size 65536
The buggy address is located 168 bytes inside of
65536-byte region [ffff880193861fc0, ffff880193871fc0)
The buggy address belongs to the page:
page:ffffea00064e1800 count:1 mapcount:0 mapping:ffff8801dac02500 index:0x0
compound_mapcount: 0
flags: 0x2fffc0000008100(slab|head)
raw: 02fffc0000008100 ffffea00064c7008 ffffea00064e5008 ffff8801dac02500
raw: 0000000000000000 ffff880193861fc0 0000000100000001 0000000000000000
page dumped because: kasan: bad access detected
Memory state around the buggy address:
ffff880193861f00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
ffff880193861f80: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb
> ffff880193862000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
^
ffff880193862080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff880193862100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
==================================================================
---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.
syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#bug-status-tracking for how to communicate with
syzbot.
^ permalink raw reply
* [PATCH] iprule: Fix destination prefix output
From: Stefan Bader @ 2018-08-28 14:27 UTC (permalink / raw)
To: netdev; +Cc: Stephen Hemminger, Luca, Christian Ehrhardt
When adding support for JSON output the new code for printing
the destination prefix adds a stray blank character before
the bitmask. This causes some user-space parsing to fail.
Current output:
...: from x.x.x.x/l to y.y.y.y /l
Previous output:
...: from x.x.x.x/l to y.y.y.y/l
Fixes: 0dd4ccc5 "iprule: add json support"
Signed-off-by: Stefan Bader <stefan.bader@canonical.com>
---
ip/iprule.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ip/iprule.c b/ip/iprule.c
index 8b94214..744d6d8 100644
--- a/ip/iprule.c
+++ b/ip/iprule.c
@@ -239,7 +239,7 @@ int print_rule(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
print_string(PRINT_FP, NULL, "to ", NULL);
print_color_string(PRINT_ANY, ifa_family_color(frh->family),
- "dst", "%s ", dst);
+ "dst", "%s", dst);
if (frh->dst_len != host_len)
print_uint(PRINT_ANY, "dstlen", "/%u ", frh->dst_len);
else
--
2.7.4
^ permalink raw reply related
* [PATCH net 2/2] tc-testing: add test-cases for numeric and invalid control action
From: Paolo Abeni @ 2018-08-28 14:24 UTC (permalink / raw)
To: netdev
Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko, David S. Miller,
Davide Caratti
In-Reply-To: <cover.1535465984.git.pabeni@redhat.com>
Only the police action allows us to specify an arbitrary numeric value
for the control action. This change introduces an explicit test case
for the above feature and then leverage it for testing the kernel behavior
for invalid control actions (reject).
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
.../tc-testing/tc-tests/actions/police.json | 48 +++++++++++++++++++
1 file changed, 48 insertions(+)
diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/police.json b/tools/testing/selftests/tc-testing/tc-tests/actions/police.json
index f03763d81617..30f9b54bd666 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/actions/police.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/police.json
@@ -312,6 +312,54 @@
"$TC actions flush action police"
]
},
+ {
+ "id": "6aaf",
+ "name": "Add police actions with conform-exceed control pass/pipe [with numeric values]",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 3mbit burst 250k conform-exceed 0/3 index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action police index 1",
+ "matchPattern": "action order [0-9]*: police 0x1 rate 3Mbit burst 250Kb mtu 2Kb action pass/pipe",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "29b1",
+ "name": "Add police actions with conform-exceed control <invalid>/drop",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 3mbit burst 250k conform-exceed 10/drop index 1",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions ls action police",
+ "matchPattern": "action order [0-9]*: police 0x1 rate 3Mbit burst 250Kb mtu 2Kb action ",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
{
"id": "c26f",
"name": "Add police action with invalid peakrate value",
--
2.17.1
^ permalink raw reply related
* [PATCH net 1/2] net_sched: reject unknown tcfa_action values
From: Paolo Abeni @ 2018-08-28 14:24 UTC (permalink / raw)
To: netdev
Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko, David S. Miller,
Davide Caratti
In-Reply-To: <cover.1535465984.git.pabeni@redhat.com>
After the commit 802bfb19152c ("net/sched: user-space can't set
unknown tcfa_action values"), unknown tcfa_action values are
converted to TC_ACT_UNSPEC, but the common agreement is instead
rejecting such configurations.
This change also introduce an helper to simplify the destruction
of a single action, avoding code duplication.
Fixes: 802bfb19152c ("net/sched: user-space can't set unknown tcfa_action values")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
net/sched/act_api.c | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index db83dac1e7f4..8614f2c282e8 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -662,6 +662,13 @@ int tcf_action_destroy(struct tc_action *actions[], int bind)
return ret;
}
+int tcf_action_destroy_one(struct tc_action *a, int bind)
+{
+ struct tc_action *actions[] = { a, NULL };
+
+ return tcf_action_destroy(actions, bind);
+}
+
static int tcf_action_put(struct tc_action *p)
{
return __tcf_action_put(p, false);
@@ -881,17 +888,16 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN)) {
err = tcf_action_goto_chain_init(a, tp);
if (err) {
- struct tc_action *actions[] = { a, NULL };
-
- tcf_action_destroy(actions, bind);
NL_SET_ERR_MSG(extack, "Failed to init TC action chain");
+ tcf_action_destroy_one(a, bind);
return ERR_PTR(err);
}
}
if (!tcf_action_valid(a->tcfa_action)) {
NL_SET_ERR_MSG(extack, "invalid action value, using TC_ACT_UNSPEC instead");
- a->tcfa_action = TC_ACT_UNSPEC;
+ tcf_action_destroy_one(a, bind);
+ return ERR_PTR(-EINVAL);
}
return a;
--
2.17.1
^ permalink raw reply related
* [PATCH net 0/2] net_sched: reject unknown tcfa_action values
From: Paolo Abeni @ 2018-08-28 14:24 UTC (permalink / raw)
To: netdev
Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko, David S. Miller,
Davide Caratti
As agreed some time ago, this changeset reject unknown tcfa_action values,
instead of changing such values under the hood.
A tdc test is included to verify the new behavior.
Paolo Abeni (2):
net_sched: reject unknown tcfa_action values
tc-testing: add test-cases for numeric and invalid control action
net/sched/act_api.c | 14 ++++--
.../tc-testing/tc-tests/actions/police.json | 48 +++++++++++++++++++
2 files changed, 58 insertions(+), 4 deletions(-)
--
2.17.1
^ permalink raw reply
* [PATCH bpf] bpf: fix several offset tests in bpf_msg_pull_data
From: Daniel Borkmann @ 2018-08-28 14:15 UTC (permalink / raw)
To: alexei.starovoitov; +Cc: john.fastabend, netdev, Daniel Borkmann
While recently going over bpf_msg_pull_data(), I noticed three
issues which are fixed in here:
1) When we attempt to find the first scatterlist element (sge)
for the start offset, we add len to the offset before we check
for start < offset + len, whereas it should come after when
we iterate to the next sge to accumulate the offsets. For
example, given a start offset of 12 with a sge length of 8
for the first sge in the list would lead us to determine this
sge as the first sge thinking it covers first 16 bytes where
start is located, whereas start sits in subsequent sges so
we would end up pulling in the wrong data.
2) After figuring out the starting sge, we have a short-cut test
in !msg->sg_copy[i] && bytes <= len. This checks whether it's
not needed to make the page at the sge private where we can
just exit by updating msg->data and msg->data_end. However,
the length test is not fully correct. bytes <= len checks
whether the requested bytes (end - start offsets) fit into the
sge's length. The part that is missing is that start must not
be sge length aligned. Meaning, the start offset into the sge
needs to be accounted as well on top of the requested bytes
as otherwise we can access the sge out of bounds. For example
the sge could have length of 8, our requested bytes could have
length of 8, but at a start offset of 4, so we also would need
to pull in 4 bytes of the next sge, when we jump to the out
label we do set msg->data to sg_virt(&sg[i]) + start - offset
and msg->data_end to msg->data + bytes which would be oob.
3) The subsequent bytes < copy test for finding the last sge has
the same issue as in point 2) but also it tests for less than
rather than less or equal to. Meaning if the sge length is of
8 and requested bytes of 8 while having the start aligned with
the sge, we would unnecessarily go and pull in the next sge as
well to make it private.
Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
---
net/core/filter.c | 14 +++++++++-----
1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/net/core/filter.c b/net/core/filter.c
index 7a24309..ec4d67c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2286,10 +2286,10 @@ BPF_CALL_4(bpf_msg_pull_data,
struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
{
unsigned int len = 0, offset = 0, copy = 0;
+ int bytes = end - start, bytes_sg_total;
struct scatterlist *sg = msg->sg_data;
int first_sg, last_sg, i, shift;
unsigned char *p, *to, *from;
- int bytes = end - start;
struct page *page;
if (unlikely(flags || end <= start))
@@ -2299,9 +2299,9 @@ BPF_CALL_4(bpf_msg_pull_data,
i = msg->sg_start;
do {
len = sg[i].length;
- offset += len;
if (start < offset + len)
break;
+ offset += len;
i++;
if (i == MAX_SKB_FRAGS)
i = 0;
@@ -2310,7 +2310,11 @@ BPF_CALL_4(bpf_msg_pull_data,
if (unlikely(start >= offset + len))
return -EINVAL;
- if (!msg->sg_copy[i] && bytes <= len)
+ /* The start may point into the sg element so we need to also
+ * account for the headroom.
+ */
+ bytes_sg_total = start - offset + bytes;
+ if (!msg->sg_copy[i] && bytes_sg_total <= len)
goto out;
first_sg = i;
@@ -2330,12 +2334,12 @@ BPF_CALL_4(bpf_msg_pull_data,
i++;
if (i == MAX_SKB_FRAGS)
i = 0;
- if (bytes < copy)
+ if (bytes_sg_total <= copy)
break;
} while (i != msg->sg_end);
last_sg = i;
- if (unlikely(copy < end - start))
+ if (unlikely(bytes_sg_total > copy))
return -EINVAL;
page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC, get_order(copy));
--
2.9.5
^ permalink raw reply related
* WARNING from tcp.c
From: Adam Mitchell @ 2018-08-28 14:11 UTC (permalink / raw)
To: netdev
Can anyone help me understand why a busy database server gets these
kernel warnings? It comes from this WARN_ON macro, apparently because
the socket is still owned by the user process. Why would that happen?
WARN_ON(sock_owned_by_user(sk));
[726780.788201] WARNING: CPU: 15 PID: 52245 at net/ipv4/tcp.c:2278
tcp_close+0x40f/0x430
[726780.794947] Modules linked in: binfmt_misc nf_conntrack_netlink
nfnetlink_queue tcp_diag inet_diag isofs ip6table_mangle ip6table_raw
ip6table_nat nf_nat_ipv6 iptable_security xt_CT iptable_raw
iptable_nat nf_nat_ipv4 nf_nat iptable_mangle xt_pkttype xt_NFLOG
nfnetlink_log xt_u32 xt_multiport xt_set xt_conntrack
ip_set_hash_netport ip_set_hash_ipport ip_set_hash_net ip_set_hash_ip
ip_set nfnetlink nf_conntrack_proto_gre nf_conntrack_ipv6
nf_defrag_ipv6 ip6table_filter ip6_tables xt_LOG nf_conntrack_tftp
nf_conntrack_ftp nf_conntrack_ipv4 nf_defrag_ipv4 nf_conntrack
iptable_filter zfs(PO) zunicode(PO) zavl(PO) icp(PO) sb_edac
intel_powerclamp crct10dif_pclmul crc32_pclmul ghash_clmulni_intel
pcbc aesni_intel crypto_simd glue_helper cryptd cirrus snd_seq
zcommon(PO) ttm intel_rapl_perf snd_seq_device
[726780.848696] drm_kms_helper znvpair(PO) snd_pcm snd_timer spl(O)
drm snd soundcore syscopyarea sysfillrect pcspkr sysimgblt input_leds
fb_sys_fops i2c_piix4 ip_tables xfs libcrc32c ata_generic pata_acpi
ata_piix xen_blkfront crc32c_intel libata ena(O) serio_raw floppy
sunrpc
[726780.863916] CPU: 15 PID: 52245 Comm: mysqld Tainted: P W O
4.16.13-1.el7.elrepo.x86_64 #1
[726780.869486] Hardware name: Xen HVM domU, BIOS 4.2.amazon 08/24/2006
[726780.873756] RIP: 0010:tcp_close+0x40f/0x430
[726780.877049] RSP: 0018:ffffc90063a87dd0 EFLAGS: 00010202
[726780.880913] RAX: 0000000000000001 RBX: ffff8839cbb4b300 RCX:
0000000000000001
[726780.885708] RDX: 0000000000400001 RSI: 0000000000023540 RDI:
000000000000002b
[726780.890412] RBP: ffffc90063a87df0 R08: 0000000000000000 R09:
0000000000000101
[726780.895192] R10: 00000000000003ff R11: 0000000000002057 R12:
ffff8839cbb4b388
[726780.900029] R13: 0000000000000009 R14: ffff8839cbb4b3c8 R15:
ffff883c5ed14900
[726780.904777] FS: 00007f6acd467700(0000) GS:ffff883c8b1c0000(0000)
knlGS:0000000000000000
[726780.909964] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[726780.914078] CR2: 00007f6cd5d430c8 CR3: 0000003c7e6b6005 CR4:
00000000001606e0
[726780.918837] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
0000000000000000
[726780.923433] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
0000000000000400
[726780.927882] Call Trace:
[726780.930410] inet_release+0x42/0x70
[726780.933423] inet6_release+0x30/0x40
[726780.936439] sock_release+0x25/0x80
[726780.939421] sock_close+0x12/0x20
[726780.942342] __fput+0xea/0x220
[726780.945170] ____fput+0xe/0x10
[726780.947963] task_work_run+0x8c/0xb0
[726780.951052] exit_to_usermode_loop+0x6b/0x95
[726780.954480] do_syscall_64+0x182/0x1b0
[726780.957586] entry_SYSCALL_64_after_hwframe+0x3d/0xa2
[726780.961288] RIP: 0033:0x7fd82b27085d
[726780.964349] RSP: 002b:00007f6acd466b50 EFLAGS: 00000293 ORIG_RAX:
0000000000000003
[726780.969280] RAX: 0000000000000000 RBX: 00007f65b2418220 RCX:
00007fd82b27085d
[726780.973987] RDX: 0000000000000003 RSI: 00007f6d5e9990c0 RDI:
00000000000002d8
[726780.978689] RBP: 00007f6acd466c00 R08: 0000000001622848 R09:
00000000000001f8
[726780.983373] R10: 0000000000000000 R11: 0000000000000293 R12:
0000000000000000
[726780.988049] R13: 00007f6d5e9990c0 R14: 0000000001d87cc0 R15:
00000000000002d8
[726780.992700] Code: ff 48 8b 43 28 31 f6 48 89 df 48 8b 40 10 e8 49
2d 4e 00 48 8b 43 30 48 8b 80 98 01 00 00 65 48 ff 80 90 01 00 00 e9
49 ff ff ff <0f> 0b e9 ab fc ff ff 48 8b 43 28 31 f6 48 89 df 48 8b 40
10 e8
[726781.004156] ---[ end trace 8525f27644ac4631 ]---
^ permalink raw reply
* Re: [PATCH bpf-next 01/11] xdp: implement convert_to_xdp_frame for MEM_TYPE_ZERO_COPY
From: Jesper Dangaard Brouer @ 2018-08-28 14:11 UTC (permalink / raw)
To: Björn Töpel
Cc: magnus.karlsson, magnus.karlsson, alexander.h.duyck,
alexander.duyck, ast, daniel, netdev, jesse.brandeburg,
anjali.singhai, peter.waskiewicz.jr, Björn Töpel,
michael.lundkvist, willemdebruijn.kernel, john.fastabend,
jakub.kicinski, neerav.parikh, mykyta.iziumtsev, francois.ozog,
ilias.apalodimas, brian.brooks, u9012063, pavel, qi.z.zhang,
brouer
In-Reply-To: <20180828124435.30578-2-bjorn.topel@gmail.com>
On Tue, 28 Aug 2018 14:44:25 +0200
Björn Töpel <bjorn.topel@gmail.com> wrote:
> From: Björn Töpel <bjorn.topel@intel.com>
>
> This commit adds proper MEM_TYPE_ZERO_COPY support for
> convert_to_xdp_frame. Converting a MEM_TYPE_ZERO_COPY xdp_buff to an
> xdp_frame is done by transforming the MEM_TYPE_ZERO_COPY buffer into a
> MEM_TYPE_PAGE_ORDER0 frame. This is costly, and in the future it might
> make sense to implement a more sophisticated thread-safe alloc/free
> scheme for MEM_TYPE_ZERO_COPY, so that no allocation and copy is
> required in the fast-path.
This is going to be slow. Especially the dev_alloc_page() call, which
for small frames is likely going to be slower than the data copy.
I guess this is a good first step, but I do hope we will circle back and
optimize this later. (It would also be quite easy to use
MEM_TYPE_PAGE_POOL instead to get page recycling in devmap redirect case).
I would have liked the MEM_TYPE_ZERO_COPY frame to travel one level
deeper into the redirect-core code. Allowing devmap to send these
frame without copy, and allow cpumap to do the dev_alloc_page() call
(+copy) on the remote CPU.
> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
> ---
> include/net/xdp.h | 5 +++--
> net/core/xdp.c | 39 +++++++++++++++++++++++++++++++++++++++
> 2 files changed, 42 insertions(+), 2 deletions(-)
>
> diff --git a/include/net/xdp.h b/include/net/xdp.h
> index 76b95256c266..0d5c6fb4b2e2 100644
> --- a/include/net/xdp.h
> +++ b/include/net/xdp.h
> @@ -91,6 +91,8 @@ static inline void xdp_scrub_frame(struct xdp_frame *frame)
> frame->dev_rx = NULL;
> }
>
> +struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp);
> +
> /* Convert xdp_buff to xdp_frame */
> static inline
> struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
> @@ -99,9 +101,8 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
> int metasize;
> int headroom;
>
> - /* TODO: implement clone, copy, use "native" MEM_TYPE */
> if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY)
> - return NULL;
> + return xdp_convert_zc_to_xdp_frame(xdp);
>
> /* Assure headroom is available for storing info */
> headroom = xdp->data - xdp->data_hard_start;
> diff --git a/net/core/xdp.c b/net/core/xdp.c
> index 89b6785cef2a..be6cb2f0e722 100644
> --- a/net/core/xdp.c
> +++ b/net/core/xdp.c
> @@ -398,3 +398,42 @@ void xdp_attachment_setup(struct xdp_attachment_info *info,
> info->flags = bpf->flags;
> }
> EXPORT_SYMBOL_GPL(xdp_attachment_setup);
> +
> +struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp)
> +{
> + unsigned int metasize, headroom, totsize;
> + void *addr, *data_to_copy;
> + struct xdp_frame *xdpf;
> + struct page *page;
> +
> + /* Clone into a MEM_TYPE_PAGE_ORDER0 xdp_frame. */
> + metasize = xdp_data_meta_unsupported(xdp) ? 0 :
> + xdp->data - xdp->data_meta;
> + headroom = xdp->data - xdp->data_hard_start;
> + totsize = xdp->data_end - xdp->data + metasize;
> +
> + if (sizeof(*xdpf) + totsize > PAGE_SIZE)
> + return NULL;
> +
> + page = dev_alloc_page();
> + if (!page)
> + return NULL;
> +
> + addr = page_to_virt(page);
> + xdpf = addr;
> + memset(xdpf, 0, sizeof(*xdpf));
> +
> + addr += sizeof(*xdpf);
> + data_to_copy = metasize ? xdp->data_meta : xdp->data;
> + memcpy(addr, data_to_copy, totsize);
> +
> + xdpf->data = addr + metasize;
> + xdpf->len = totsize - metasize;
> + xdpf->headroom = 0;
> + xdpf->metasize = metasize;
> + xdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
> +
> + xdp_return_buff(xdp);
> + return xdpf;
> +}
> +EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame);
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
LinkedIn: http://www.linkedin.com/in/brouer
^ permalink raw reply
* [PATCH net-next 2/2] ethtool: drop get_settings and set_settings callbacks
From: Michal Kubecek @ 2018-08-28 17:56 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Andrew Lunn, Florian Fainelli, Russell King,
linux-arm-kernel, linux-kernel
In-Reply-To: <cover.1535477409.git.mkubecek@suse.cz>
Since [gs]et_settings ethtool_ops callbacks have been deprecated in
February 2016, all in tree NIC drivers have been converted to provide
[gs]et_link_ksettings() and out of tree drivers have had enough time to do
the same.
Drop get_settings() and set_settings() and implement both ETHTOOL_[GS]SET
and ETHTOOL_[GS]LINKSETTINGS only using [gs]et_link_ksettings().
Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
---
Documentation/ABI/testing/sysfs-class-net | 4 +-
include/linux/ethtool.h | 33 ++---
include/uapi/linux/ethtool.h | 15 +-
net/core/ethtool.c | 158 +++++-----------------
4 files changed, 50 insertions(+), 160 deletions(-)
diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net
index 2f1788111cd9..e2e0fe553ad8 100644
--- a/Documentation/ABI/testing/sysfs-class-net
+++ b/Documentation/ABI/testing/sysfs-class-net
@@ -117,7 +117,7 @@ Description:
full: full duplex
Note: This attribute is only valid for interfaces that implement
- the ethtool get_settings method (mostly Ethernet).
+ the ethtool get_link_ksettings method (mostly Ethernet).
What: /sys/class/net/<iface>/flags
Date: April 2005
@@ -224,7 +224,7 @@ Description:
an integer representing the link speed in Mbits/sec.
Note: this attribute is only valid for interfaces that implement
- the ethtool get_settings method (mostly Ethernet ).
+ the ethtool get_link_ksettings method (mostly Ethernet).
What: /sys/class/net/<iface>/tx_queue_len
Date: April 2005
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index f8a2245b70ac..afd9596ce636 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -183,14 +183,6 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
/**
* struct ethtool_ops - optional netdev operations
- * @get_settings: DEPRECATED, use %get_link_ksettings/%set_link_ksettings
- * API. Get various device settings including Ethernet link
- * settings. The @cmd parameter is expected to have been cleared
- * before get_settings is called. Returns a negative error code
- * or zero.
- * @set_settings: DEPRECATED, use %get_link_ksettings/%set_link_ksettings
- * API. Set various device settings including Ethernet link
- * settings. Returns a negative error code or zero.
* @get_drvinfo: Report driver/device information. Should only set the
* @driver, @version, @fw_version and @bus_info fields. If not
* implemented, the @driver and @bus_info fields will be filled in
@@ -297,19 +289,16 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
* a TX queue has this number, return -EINVAL. If only a RX queue or a TX
* queue has this number, ignore the inapplicable fields.
* Returns a negative error code or zero.
- * @get_link_ksettings: When defined, takes precedence over the
- * %get_settings method. Get various device settings
- * including Ethernet link settings. The %cmd and
- * %link_mode_masks_nwords fields should be ignored (use
- * %__ETHTOOL_LINK_MODE_MASK_NBITS instead of the latter), any
- * change to them will be overwritten by kernel. Returns a
- * negative error code or zero.
- * @set_link_ksettings: When defined, takes precedence over the
- * %set_settings method. Set various device settings including
- * Ethernet link settings. The %cmd and %link_mode_masks_nwords
- * fields should be ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS
- * instead of the latter), any change to them will be overwritten
- * by kernel. Returns a negative error code or zero.
+ * @get_link_ksettings: Get various device settings including Ethernet link
+ * settings. The %cmd and %link_mode_masks_nwords fields should be
+ * ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS instead of the latter),
+ * any change to them will be overwritten by kernel. Returns a negative
+ * error code or zero.
+ * @set_link_ksettings: Set various device settings including Ethernet link
+ * settings. The %cmd and %link_mode_masks_nwords fields should be
+ * ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS instead of the latter),
+ * any change to them will be overwritten by kernel. Returns a negative
+ * error code or zero.
* @get_fecparam: Get the network device Forward Error Correction parameters.
* @set_fecparam: Set the network device Forward Error Correction parameters.
* @get_ethtool_phy_stats: Return extended statistics about the PHY device.
@@ -329,8 +318,6 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
* of the generic netdev features interface.
*/
struct ethtool_ops {
- int (*get_settings)(struct net_device *, struct ethtool_cmd *);
- int (*set_settings)(struct net_device *, struct ethtool_cmd *);
void (*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *);
int (*get_regs_len)(struct net_device *);
void (*get_regs)(struct net_device *, struct ethtool_regs *, void *);
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index dc69391d2bba..c8f8e2455bf3 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -91,10 +91,6 @@
* %ETHTOOL_GSET to get the current values before making specific
* changes and then applying them with %ETHTOOL_SSET.
*
- * Drivers that implement set_settings() should validate all fields
- * other than @cmd that are not described as read-only or deprecated,
- * and must ignore all fields described as read-only.
- *
* Deprecated fields should be ignored by both users and drivers.
*/
struct ethtool_cmd {
@@ -1800,14 +1796,9 @@ enum ethtool_reset_flags {
* rejected.
*
* Deprecated %ethtool_cmd fields transceiver, maxtxpkt and maxrxpkt
- * are not available in %ethtool_link_settings. Until all drivers are
- * converted to ignore them or to the new %ethtool_link_settings API,
- * for both queries and changes, users should always try
- * %ETHTOOL_GLINKSETTINGS first, and if it fails with -ENOTSUPP stick
- * only to %ETHTOOL_GSET and %ETHTOOL_SSET consistently. If it
- * succeeds, then users should stick to %ETHTOOL_GLINKSETTINGS and
- * %ETHTOOL_SLINKSETTINGS (which would support drivers implementing
- * either %ethtool_cmd or %ethtool_link_settings).
+ * are not available in %ethtool_link_settings. These fields will be
+ * always set to zero in %ETHTOOL_GSET reply and %ETHTOOL_SSET will
+ * fail if any of them is set to non-zero value.
*
* Users should assume that all fields not marked read-only are
* writable and subject to validation by the driver. They should use
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index c9993c6c2fd4..9d4e56d97080 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -539,47 +539,17 @@ struct ethtool_link_usettings {
} link_modes;
};
-/* Internal kernel helper to query a device ethtool_link_settings.
- *
- * Backward compatibility note: for compatibility with legacy drivers
- * that implement only the ethtool_cmd API, this has to work with both
- * drivers implementing get_link_ksettings API and drivers
- * implementing get_settings API. When drivers implement get_settings
- * and report ethtool_cmd deprecated fields
- * (transceiver/maxrxpkt/maxtxpkt), these fields are silently ignored
- * because the resulting struct ethtool_link_settings does not report them.
- */
+/* Internal kernel helper to query a device ethtool_link_settings. */
int __ethtool_get_link_ksettings(struct net_device *dev,
struct ethtool_link_ksettings *link_ksettings)
{
- int err;
- struct ethtool_cmd cmd;
-
ASSERT_RTNL();
- if (dev->ethtool_ops->get_link_ksettings) {
- memset(link_ksettings, 0, sizeof(*link_ksettings));
- return dev->ethtool_ops->get_link_ksettings(dev,
- link_ksettings);
- }
-
- /* driver doesn't support %ethtool_link_ksettings API. revert to
- * legacy %ethtool_cmd API, unless it's not supported either.
- * TODO: remove when ethtool_ops::get_settings disappears internally
- */
- if (!dev->ethtool_ops->get_settings)
+ if (!dev->ethtool_ops->get_link_ksettings)
return -EOPNOTSUPP;
- memset(&cmd, 0, sizeof(cmd));
- cmd.cmd = ETHTOOL_GSET;
- err = dev->ethtool_ops->get_settings(dev, &cmd);
- if (err < 0)
- return err;
-
- /* we ignore deprecated fields transceiver/maxrxpkt/maxtxpkt
- */
- convert_legacy_settings_to_link_ksettings(link_ksettings, &cmd);
- return err;
+ memset(link_ksettings, 0, sizeof(*link_ksettings));
+ return dev->ethtool_ops->get_link_ksettings(dev, link_ksettings);
}
EXPORT_SYMBOL(__ethtool_get_link_ksettings);
@@ -635,16 +605,7 @@ store_link_ksettings_for_user(void __user *to,
return 0;
}
-/* Query device for its ethtool_link_settings.
- *
- * Backward compatibility note: this function must fail when driver
- * does not implement ethtool::get_link_ksettings, even if legacy
- * ethtool_ops::get_settings is implemented. This tells new versions
- * of ethtool that they should use the legacy API %ETHTOOL_GSET for
- * this driver, so that they can correctly access the ethtool_cmd
- * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver
- * implements ethtool_ops::get_settings anymore.
- */
+/* Query device for its ethtool_link_settings. */
static int ethtool_get_link_ksettings(struct net_device *dev,
void __user *useraddr)
{
@@ -652,7 +613,6 @@ static int ethtool_get_link_ksettings(struct net_device *dev,
struct ethtool_link_ksettings link_ksettings;
ASSERT_RTNL();
-
if (!dev->ethtool_ops->get_link_ksettings)
return -EOPNOTSUPP;
@@ -699,16 +659,7 @@ static int ethtool_get_link_ksettings(struct net_device *dev,
return store_link_ksettings_for_user(useraddr, &link_ksettings);
}
-/* Update device ethtool_link_settings.
- *
- * Backward compatibility note: this function must fail when driver
- * does not implement ethtool::set_link_ksettings, even if legacy
- * ethtool_ops::set_settings is implemented. This tells new versions
- * of ethtool that they should use the legacy API %ETHTOOL_SSET for
- * this driver, so that they can correctly update the ethtool_cmd
- * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver
- * implements ethtool_ops::get_settings anymore.
- */
+/* Update device ethtool_link_settings. */
static int ethtool_set_link_ksettings(struct net_device *dev,
void __user *useraddr)
{
@@ -746,51 +697,31 @@ static int ethtool_set_link_ksettings(struct net_device *dev,
/* Query device for its ethtool_cmd settings.
*
- * Backward compatibility note: for compatibility with legacy ethtool,
- * this has to work with both drivers implementing get_link_ksettings
- * API and drivers implementing get_settings API. When drivers
- * implement get_link_ksettings and report higher link mode bits, a
- * kernel warning is logged once (with name of 1st driver/device) to
- * recommend user to upgrade ethtool, but the command is successful
- * (only the lower link mode bits reported back to user).
+ * Backward compatibility note: for compatibility with legacy ethtool, this is
+ * now implemented via get_link_ksettings. When driver reports higher link mode
+ * bits, a kernel warning is logged once (with name of 1st driver/device) to
+ * recommend user to upgrade ethtool, but the command is successful (only the
+ * lower link mode bits reported back to user). Deprecated fields from
+ * ethtool_cmd (transceiver/maxrxpkt/maxtxpkt) are always set to zero.
*/
static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
{
+ struct ethtool_link_ksettings link_ksettings;
struct ethtool_cmd cmd;
+ int err;
ASSERT_RTNL();
+ if (!dev->ethtool_ops->get_link_ksettings)
+ return -EOPNOTSUPP;
- if (dev->ethtool_ops->get_link_ksettings) {
- /* First, use link_ksettings API if it is supported */
- int err;
- struct ethtool_link_ksettings link_ksettings;
-
- memset(&link_ksettings, 0, sizeof(link_ksettings));
- err = dev->ethtool_ops->get_link_ksettings(dev,
- &link_ksettings);
- if (err < 0)
- return err;
- convert_link_ksettings_to_legacy_settings(&cmd,
- &link_ksettings);
-
- /* send a sensible cmd tag back to user */
- cmd.cmd = ETHTOOL_GSET;
- } else {
- /* driver doesn't support %ethtool_link_ksettings
- * API. revert to legacy %ethtool_cmd API, unless it's
- * not supported either.
- */
- int err;
-
- if (!dev->ethtool_ops->get_settings)
- return -EOPNOTSUPP;
+ memset(&link_ksettings, 0, sizeof(link_ksettings));
+ err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings);
+ if (err < 0)
+ return err;
+ convert_link_ksettings_to_legacy_settings(&cmd, &link_ksettings);
- memset(&cmd, 0, sizeof(cmd));
- cmd.cmd = ETHTOOL_GSET;
- err = dev->ethtool_ops->get_settings(dev, &cmd);
- if (err < 0)
- return err;
- }
+ /* send a sensible cmd tag back to user */
+ cmd.cmd = ETHTOOL_GSET;
if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
return -EFAULT;
@@ -800,48 +731,29 @@ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
/* Update device link settings with given ethtool_cmd.
*
- * Backward compatibility note: for compatibility with legacy ethtool,
- * this has to work with both drivers implementing set_link_ksettings
- * API and drivers implementing set_settings API. When drivers
- * implement set_link_ksettings and user's request updates deprecated
- * ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel
- * warning is logged once (with name of 1st driver/device) to
- * recommend user to upgrade ethtool, and the request is rejected.
+ * Backward compatibility note: for compatibility with legacy ethtool, this is
+ * now always implemented via set_link_settings. When user's request updates
+ * deprecated ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel
+ * warning is logged once (with name of 1st driver/device) to recommend user to
+ * upgrade ethtool, and the request is rejected.
*/
static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
{
+ struct ethtool_link_ksettings link_ksettings;
struct ethtool_cmd cmd;
ASSERT_RTNL();
if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
return -EFAULT;
-
- /* first, try new %ethtool_link_ksettings API. */
- if (dev->ethtool_ops->set_link_ksettings) {
- struct ethtool_link_ksettings link_ksettings;
-
- if (!convert_legacy_settings_to_link_ksettings(&link_ksettings,
- &cmd))
- return -EINVAL;
-
- link_ksettings.base.cmd = ETHTOOL_SLINKSETTINGS;
- link_ksettings.base.link_mode_masks_nwords
- = __ETHTOOL_LINK_MODE_MASK_NU32;
- return dev->ethtool_ops->set_link_ksettings(dev,
- &link_ksettings);
- }
-
- /* legacy %ethtool_cmd API */
-
- /* TODO: return -EOPNOTSUPP when ethtool_ops::get_settings
- * disappears internally
- */
-
- if (!dev->ethtool_ops->set_settings)
+ if (!dev->ethtool_ops->set_link_ksettings)
return -EOPNOTSUPP;
- return dev->ethtool_ops->set_settings(dev, &cmd);
+ if (!convert_legacy_settings_to_link_ksettings(&link_ksettings, &cmd))
+ return -EINVAL;
+ link_ksettings.base.link_mode_masks_nwords =
+ __ETHTOOL_LINK_MODE_MASK_NU32;
+ return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
}
static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
--
2.18.0
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox