* [Patch net 3/3] team: use a larger struct for mac address
From: Cong Wang @ 2017-04-26 5:03 UTC (permalink / raw)
To: netdev; +Cc: andreyknvl, Cong Wang, Jiri Pirko
In-Reply-To: <1493183003-884-1-git-send-email-xiyou.wangcong@gmail.com>
IPv6 tunnels use sizeof(struct in6_addr) as dev->addr_len,
but in many places especially bonding, we use struct sockaddr
to copy and set mac addr, this could lead to stack out-of-bounds
access.
Fix it by using a larger address storage.
Reported-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
drivers/net/team/team.c | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 85c0124..88878f1 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -60,10 +60,13 @@ static struct team_port *team_port_get_rtnl(const struct net_device *dev)
static int __set_port_dev_addr(struct net_device *port_dev,
const unsigned char *dev_addr)
{
- struct sockaddr addr;
+ struct {
+ unsigned short type;
+ unsigned char addr[MAX_ADDR_LEN];
+ } addr;
- memcpy(addr.sa_data, dev_addr, port_dev->addr_len);
- addr.sa_family = port_dev->type;
+ memcpy(addr.addr, dev_addr, port_dev->addr_len);
+ addr.type = port_dev->type;
return dev_set_mac_address(port_dev, &addr);
}
--
2.5.5
^ permalink raw reply related
* [Patch net 2/3] bonding: use a larger struct for mac address
From: Cong Wang @ 2017-04-26 5:03 UTC (permalink / raw)
To: netdev; +Cc: andreyknvl, Cong Wang, Jay Vosburgh
In-Reply-To: <1493183003-884-1-git-send-email-xiyou.wangcong@gmail.com>
IPv6 tunnels use sizeof(struct in6_addr) as dev->addr_len,
but in many places especially bonding, we use struct sockaddr
to copy and set mac addr, this could lead to stack out-of-bounds
access.
Fix it by using a larger address storage.
Reported-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Jay Vosburgh <j.vosburgh@gmail.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
drivers/net/bonding/bond_alb.c | 8 ++++----
drivers/net/bonding/bond_main.c | 17 +++++++++--------
drivers/net/bonding/bonding_priv.h | 6 ++++++
3 files changed, 19 insertions(+), 12 deletions(-)
diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index c80b023..0038266 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -39,7 +39,7 @@
#include <asm/byteorder.h>
#include <net/bonding.h>
#include <net/bond_alb.h>
-
+#include "bonding_priv.h"
static const u8 mac_bcast[ETH_ALEN + 2] __long_aligned = {
@@ -1234,7 +1234,7 @@ static int alb_set_mac_address(struct bonding *bond, void *addr)
{
struct slave *slave, *rollback_slave;
struct list_head *iter;
- struct sockaddr sa;
+ struct bond_mac_addr sa;
char tmp_addr[ETH_ALEN];
int res;
@@ -1257,8 +1257,8 @@ static int alb_set_mac_address(struct bonding *bond, void *addr)
return 0;
unwind:
- memcpy(sa.sa_data, bond->dev->dev_addr, bond->dev->addr_len);
- sa.sa_family = bond->dev->type;
+ memcpy(sa.addr, bond->dev->dev_addr, bond->dev->addr_len);
+ sa.type = bond->dev->type;
/* unwind from head to the slave that failed */
bond_for_each_slave(bond, rollback_slave, iter) {
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 8a4ba8b..7a9bd30 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1330,7 +1330,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
struct bonding *bond = netdev_priv(bond_dev);
const struct net_device_ops *slave_ops = slave_dev->netdev_ops;
struct slave *new_slave = NULL, *prev_slave;
- struct sockaddr addr;
+ struct bond_mac_addr addr;
int link_reporting;
int res = 0, i;
@@ -1488,8 +1488,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
/* Set slave to master's mac address. The application already
* set the master's mac address to that of the first slave
*/
- memcpy(addr.sa_data, bond_dev->dev_addr, bond_dev->addr_len);
- addr.sa_family = slave_dev->type;
+ memcpy(addr.addr, bond_dev->dev_addr, bond_dev->addr_len);
+ addr.type = slave_dev->type;
res = dev_set_mac_address(slave_dev, &addr);
if (res) {
netdev_dbg(bond_dev, "Error %d calling set_mac_address\n", res);
@@ -1773,8 +1773,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
* MAC if this slave's MAC is in use by the bond, or at
* least print a warning.
*/
- ether_addr_copy(addr.sa_data, new_slave->perm_hwaddr);
- addr.sa_family = slave_dev->type;
+ ether_addr_copy(addr.addr, new_slave->perm_hwaddr);
+ addr.type = slave_dev->type;
dev_set_mac_address(slave_dev, &addr);
}
@@ -3619,7 +3619,8 @@ static int bond_set_mac_address(struct net_device *bond_dev, void *addr)
{
struct bonding *bond = netdev_priv(bond_dev);
struct slave *slave, *rollback_slave;
- struct sockaddr *sa = addr, tmp_sa;
+ struct sockaddr *sa = addr;
+ struct bond_mac_addr tmp_sa;
struct list_head *iter;
int res = 0;
@@ -3659,8 +3660,8 @@ static int bond_set_mac_address(struct net_device *bond_dev, void *addr)
return 0;
unwind:
- memcpy(tmp_sa.sa_data, bond_dev->dev_addr, bond_dev->addr_len);
- tmp_sa.sa_family = bond_dev->type;
+ memcpy(tmp_sa.addr, bond_dev->dev_addr, bond_dev->addr_len);
+ tmp_sa.type = bond_dev->type;
/* unwind from head to the slave that failed */
bond_for_each_slave(bond, rollback_slave, iter) {
diff --git a/drivers/net/bonding/bonding_priv.h b/drivers/net/bonding/bonding_priv.h
index 5a4d81a..8b3a7e3 100644
--- a/drivers/net/bonding/bonding_priv.h
+++ b/drivers/net/bonding/bonding_priv.h
@@ -22,4 +22,10 @@
#define bond_version DRV_DESCRIPTION ": v" DRV_VERSION " (" DRV_RELDATE ")\n"
+/* Must be compatible with struct sockaddr */
+struct bond_mac_addr {
+ unsigned short type;
+ unsigned char addr[MAX_ADDR_LEN];
+};
+
#endif
--
2.5.5
^ permalink raw reply related
* [Patch net 1/3] net: check mac address length for dev_set_mac_address()
From: Cong Wang @ 2017-04-26 5:03 UTC (permalink / raw)
To: netdev; +Cc: andreyknvl, Cong Wang
In-Reply-To: <1493183003-884-1-git-send-email-xiyou.wangcong@gmail.com>
dev_set_mac_address() accepts a struct sockaddr pointer as
input but we have various types of mac addresse whose lengths
are up to MAX_ADDR_LEN, this is confusing.
Make it void like ->ndo_set_mac_address() and let callers check
its length before calling it. It is too late to fix dev_ifsioc()
due to API compatiblity, so just reject those larger than
sizeof(struct sockaddr).
This is also a preparation for the following patches.
Reported-by: Andrey Konovalov <andreyknvl@google.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
include/linux/netdevice.h | 2 +-
net/core/dev.c | 10 +++++++---
net/core/dev_ioctl.c | 2 ++
3 files changed, 10 insertions(+), 4 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 97456b25..40e674c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3271,7 +3271,7 @@ int dev_set_alias(struct net_device *, const char *, size_t);
int dev_change_net_namespace(struct net_device *, struct net *, const char *);
int dev_set_mtu(struct net_device *, int);
void dev_set_group(struct net_device *, int);
-int dev_set_mac_address(struct net_device *, struct sockaddr *);
+int dev_set_mac_address(struct net_device *, void *);
int dev_change_carrier(struct net_device *, bool new_carrier);
int dev_get_phys_port_id(struct net_device *dev,
struct netdev_phys_item_id *ppid);
diff --git a/net/core/dev.c b/net/core/dev.c
index 533a6d6..cc670e1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6610,13 +6610,17 @@ EXPORT_SYMBOL(dev_set_group);
/**
* dev_set_mac_address - Change Media Access Control Address
* @dev: device
- * @sa: new address
+ * @addr: new address, whose type could be either struct sockaddr or
+ * any other compatible type whose length is up to MAX_ADDR_LEN depending
+ * on the dev->addr_len. Callers should check if its length is smaller than
+ * dev->addr_len!!
*
* Change the hardware (MAC) address of the device
*/
-int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
+int dev_set_mac_address(struct net_device *dev, void *addr)
{
const struct net_device_ops *ops = dev->netdev_ops;
+ struct sockaddr *sa = addr;
int err;
if (!ops->ndo_set_mac_address)
@@ -6625,7 +6629,7 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
- err = ops->ndo_set_mac_address(dev, sa);
+ err = ops->ndo_set_mac_address(dev, addr);
if (err)
return err;
dev->addr_assign_type = NET_ADDR_SET;
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index b94b1d2..11f262e 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -261,6 +261,8 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
return dev_set_mtu(dev, ifr->ifr_mtu);
case SIOCSIFHWADDR:
+ if (dev->addr_len > sizeof(struct sockaddr))
+ return -EINVAL;
return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
case SIOCSIFHWBROADCAST:
--
2.5.5
^ permalink raw reply related
* [Patch net 0/3] net: fix a stack out-of-bound access
From: Cong Wang @ 2017-04-26 5:03 UTC (permalink / raw)
To: netdev; +Cc: andreyknvl, Cong Wang
This patchset fixes the following kernel bug reported by Andrey:
BUG: KASAN: stack-out-of-bounds in bond_enslave+0xe0a/0x4ef0 at addr
ffff8800666b7792
Write of size 16 by task a.out/3894
page:ffffea000199adc0 count:0 mapcount:0 mapping: (null) index:0x0
flags: 0x100000000000000()
raw: 0100000000000000 0000000000000000 0000000000000000 00000000ffffffff
raw: 0000000000000000 ffffea000199ade0 0000000000000000 0000000000000000
page dumped because: kasan: bad access detected
CPU: 1 PID: 3894 Comm: a.out Not tainted 4.11.0-rc7+ #251
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:16
dump_stack+0x292/0x398 lib/dump_stack.c:52
kasan_report_error mm/kasan/report.c:212
kasan_report+0x4d8/0x510 mm/kasan/report.c:347
check_memory_region_inline mm/kasan/kasan.c:326
check_memory_region+0x139/0x190 mm/kasan/kasan.c:333
memcpy+0x37/0x50 mm/kasan/kasan.c:369
bond_enslave+0xe0a/0x4ef0 drivers/net/bonding/bond_main.c:1491
bond_do_ioctl+0xb5d/0xec0 drivers/net/bonding/bond_main.c:3449
dev_ifsioc+0x53f/0x9f0 net/core/dev_ioctl.c:338
dev_ioctl+0x249/0x1160 net/core/dev_ioctl.c:532
sock_do_ioctl+0x94/0xb0 net/socket.c:913
sock_ioctl+0x28f/0x440 net/socket.c:1004
vfs_ioctl fs/ioctl.c:45
do_vfs_ioctl+0x1bf/0x1780 fs/ioctl.c:685
SYSC_ioctl fs/ioctl.c:700
SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691
entry_SYSCALL_64_fastpath+0x1f/0xc2 arch/x86/entry/entry_64.S:204
RIP: 0033:0x7fc764b31b79
RSP: 002b:00007ffc1d73aa58 EFLAGS: 00000206 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 00007ffc1d73abb0 RCX: 00007fc764b31b79
RDX: 00000000209eafd8 RSI: 0000008000008990 RDI: 0000000000000003
RBP: 00000000004004e0 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000206 R12: 0000000000000000
R13: 00007ffc1d73abb0 R14: 0000000000000000 R15: 0000000000000000
Please see each patch for details.
Cong Wang (3):
net: check mac address length for dev_set_mac_address()
bonding: use a larger struct for mac address
team: use a larger struct for mac address
drivers/net/bonding/bond_alb.c | 8 ++++----
drivers/net/bonding/bond_main.c | 17 +++++++++--------
drivers/net/bonding/bonding_priv.h | 6 ++++++
drivers/net/team/team.c | 9 ++++++---
include/linux/netdevice.h | 2 +-
net/core/dev.c | 10 +++++++---
net/core/dev_ioctl.c | 2 ++
7 files changed, 35 insertions(+), 19 deletions(-)
--
2.5.5
^ permalink raw reply
* Multiple Txqs with One Qdisc
From: Michael Ma @ 2017-04-26 4:46 UTC (permalink / raw)
To: Linux Kernel Network Developers
Hi -
As I mentioned in a previous thread, we're implementing a qdisc
similar to mqprio which can associate multiple txqs to one qdisc, so
that we can work around the root qdisc bottleneck. I think this should
be in general fine since without MQ, root qdisc is associated with
multiple txqs anyway. However we encountered kernel panic as I
mentioned in the "corrupted skb" thread.
Any thought on potential issue of doing this? I'm asking this because
mqprio only associates one txq with one qdisc which I think must have
been done this way for a reason.
Would appreciate if anyone can shed some light on this.
Thanks,
Michael
^ permalink raw reply
* Re: Corrupted SKB
From: Michael Ma @ 2017-04-26 4:42 UTC (permalink / raw)
To: Cong Wang; +Cc: Linux Kernel Network Developers, jin.oyj
In-Reply-To: <CAAmHdhy0N2VttXNXL+S+4G=4=mf4ihpW7KsNWUYpiOFXez3B7w@mail.gmail.com>
2017-04-18 21:46 GMT-07:00 Michael Ma <make0818@gmail.com>:
> 2017-04-18 16:12 GMT-07:00 Cong Wang <xiyou.wangcong@gmail.com>:
>> On Mon, Apr 17, 2017 at 5:39 PM, Michael Ma <make0818@gmail.com> wrote:
>>> Hi -
>>>
>>> We've implemented a "glue" qdisc similar to mqprio which can associate
>>> one qdisc to multiple txqs as the root qdisc. Reference count of the
>>> child qdiscs have been adjusted properly in this case so that it
>>> represents the number of txqs it has been attached to. However when
>>> sending packets we saw the skb from dequeue_skb() corrupted with the
>>> following call stack:
>>>
>>> [exception RIP: netif_skb_features+51]
>>> RIP: ffffffff815292b3 RSP: ffff8817f6987940 RFLAGS: 00010246
>>>
>>> #9 [ffff8817f6987968] validate_xmit_skb at ffffffff815294aa
>>> #10 [ffff8817f69879a0] validate_xmit_skb at ffffffff8152a0d9
>>> #11 [ffff8817f69879b0] __qdisc_run at ffffffff8154a193
>>> #12 [ffff8817f6987a00] dev_queue_xmit at ffffffff81529e03
>>>
>>> It looks like the skb has already been released since its dev pointer
>>> field is invalid.
>>>
>>> Any clue on how this can be investigated further? My current thought
>>> is to add some instrumentation to the place where skb is released and
>>> analyze whether there is any race condition happening there. However
>>
>> Either dropwatch or perf could do the work to instrument kfree_skb().
>
> Thanks - will try it out.
I'm using perf to collect the callstack for kfree_skb and trying to
correlate that with the corrupted SKB address however when system
crashes the perf.data file is also corrupted - how can I view this
file in case the system crashes before perf exits?
>>
>>> by looking through the existing code I think the case where one root
>>> qdisc is associated with multiple txqs already exists (when mqprio is
>>> not used) so not sure why it won't work when we group txqs and assign
>>> each group a root qdisc. Any insight on this issue would be much
>>> appreciated!
>>
>> How do you implement ->attach()? How does it work with netdev_pick_tx()?
>
> attach() essentially grafts the default qdisc(pfifo) to each "txq
> group" represented by a TC class. For netdev_pick_txq() we use classid
> of the socket to select a class based on a "class id base" and the
> class to txq mapping defined together with this glue qdisc - it's
> pretty much the same as mqprio with the difference of mapping one
> class to multiple txqs and selecting the txq through a hash.
^ permalink raw reply
* (unknown),
From: prasad padiyar @ 2017-04-26 3:57 UTC (permalink / raw)
To: netdev
subscribe linux-netdev
^ permalink raw reply
* Re: llvm-objdump...
From: Alexei Starovoitov @ 2017-04-26 3:48 UTC (permalink / raw)
To: David Miller; +Cc: daniel, netdev
In-Reply-To: <20170425.131337.2175176102097155053.davem@davemloft.net>
On 4/25/17 10:13 AM, David Miller wrote:
>
> I think there are some endianness issues ;-)
>
> davem@patience:~/src/GIT/net-next/tools/testing/selftests/bpf$ llvm-objdump -S x.o
nice host name ;)
> x.o: file format ELF64-BPF
>
> Disassembly of section test1:
> process:
> 0: b7 00 00 00 00 00 00 02 r0 = 33554432
> 1: 61 21 00 50 00 00 00 00 r1 = *(u32 *)(r2 + 20480)
>
> That first instruction should be "r0 = 2"
hmm. I haven't tested it on big endian.
When last time s390 folks tested samples/bpf with llvm we didn't even
have automatic -march=bpf in llvm, so they used -march=bpfeb.
There was no llvm-objdump support either.
llvm side does this:
tatic Triple::ArchType parseBPFArch(StringRef ArchName) {
if (ArchName.equals("bpf")) {
if (sys::IsLittleEndianHost)
return Triple::bpfel;
else
return Triple::bpfeb;
} else if (ArchName.equals("bpf_be") || ArchName.equals("bpfeb")) {
return Triple::bpfeb;
} else if (ArchName.equals("bpf_le") || ArchName.equals("bpfel")) {
return Triple::bpfel;
It works for clang and for llvm.
I thought llvm-objdump should infer triple from elf file
and do the 'right thing'... hmm
could you please test it with -g and see whether dwarf is still
correct in .o ?
llvm-objdump -S should print original C code next to asm.
Hope bpf dwarf is not broken on big-endian...
^ permalink raw reply
* Re: more test_progs...
From: Alexei Starovoitov @ 2017-04-26 3:42 UTC (permalink / raw)
To: David Miller; +Cc: daniel, netdev
In-Reply-To: <20170425.125217.1962662516948420246.davem@davemloft.net>
On 4/25/17 9:52 AM, David Miller wrote:
>
> 10: (15) if r3 == 0xdd86 goto pc+9
> R0=imm2,min_value=2,max_value=2 R1=pkt(id=0,off=0,r=14) R2=pkt_end R3=inv R4=pkt(id=0,off=14,r=14) R5=inv56 R10=fp
>
> Hmmm, endianness looks wrong here. "-target bpf" defaults to the
> endianness of whatever cpu that llvm was built for, right?
yeah. so here the code comes from:
} else if (eth->h_proto == _htons(ETH_P_IPV6)) {
and in the beginning of that .c file:
#define _htons __builtin_bswap16
^^^ that's the bug.
It should have been nop for sparc.
We cannot use htons() from system header, since it uses x86 inline asm.
> R0=imm2,min_value=2,max_value=2 R1=inv56,min_value=6,max_value=6 R2=pkt_end R3=imm3,min_value=3,max_value=3 R4=pkt(id=1,off=14,r=34) R5=pkt(id=1,off=34,r=34) R10=fp
> 36: (07) r4 += 18
> 37: (2d) if r4 > r2 goto pc+5
> R0=imm2,min_value=2,max_value=2 R1=inv56,min_value=6,max_value=6 R2=pkt_end R3=imm3,min_value=3,max_value=3 R4=pkt(id=1,off=32,r=34) R5=pkt(id=1,off=34,r=34) R10=fp
> 38: (b7) r0 = 0
> 39: (69) r1 = *(u16 *)(r4 +0)
> Unknown alignment. Only byte-sized access allowed in packet access.
>
> And this seems to load the urgent pointer as a u16 which is what the verifier rejects.
>
> Oh I see, this is guarded by CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
yes. exactly.
Your analysis is correct and offending 16-bit load is this C code:
if (tcp->urg_ptr == 123)
the parsing code before that line deals with variable length ip header:
tcp = (struct tcphdr *)((void *)(iph) + ihl_len);
and at that point verifier cannot track the alignment of the pointer
anymore. It knows 'iph' alignment, but as soon as some variable is added
to it cannot assume good alignment anymore and from there on it
allows >1 byte accesses only on HAVE_EFFICIENT_UNALIGNED_ACCESS
architectures.
That's the 'if' that you found:
'if (reg->id && size != 1) {'
ref->id > 0 means that some variable was added to ptr_to_packet.
That sucks for sparc, of course. I don't have good suggestions for
workaround other than marking tcphdr as packed :(
From code efficiency point of view it still will be faster than
LD_ABS insn.
Another idea is to try to track that ihl_len variable is also
somehow multiple of 4, but it will be quite complicated on
the verifier side in its existing architecture and maybe? worth
waiting until the verifier has proper dataflow analysis.
^ permalink raw reply
* Re: hmmm...
From: David Miller @ 2017-04-26 3:38 UTC (permalink / raw)
To: ast; +Cc: netdev
In-Reply-To: <af3781d2-f8af-1c6e-b3a0-163d8c1de75a@fb.com>
From: Alexei Starovoitov <ast@fb.com>
Date: Tue, 25 Apr 2017 19:56:06 -0700
> On 4/25/17 6:52 PM, David Miller wrote:
>>
>> Alexei, I found something strange on my computer :-)
>>
>> [davem@localhost binutils]$ ./objdump -d x.o
>
> No way! :) I thought it will take weeks!
> Ship it. Ship it. Ship it.
> Cannot wait to pull.
> This is awesome. Thanks a ton!
Relax, it is in a very raw state still. :)
> What is the mnemonic for 32-bit alu ?
It is of the form "xxx32". Here is opcodes table below.
I think there are no formal mnenomics defined anywhere yet right?
So just tell me what adjustments you want to make.
#include "sysdep.h"
#include <stdio.h>
#include "opcode/bpf.h"
#define BPF_OPC_ALU64 0x07
#define BPF_OPC_DW 0x18
#define BPF_OPC_XADD 0xc0
#define BPF_OPC_MOV 0xb0
#define BPF_OPC_ARSH 0xc0
#define BPF_OPC_END 0xd0
#define BPF_OPC_TO_LE 0x00
#define BPF_OPC_TO_BE 0x08
#define BPF_OPC_JNE 0x50
#define BPF_OPC_JSGT 0x60
#define BPF_OPC_JSGE 0x70
#define BPF_OPC_CALL 0x80
#define BPF_OPC_EXIT 0x90
#define BPF_OPC_LD 0x00
#define BPF_OPC_LDX 0x01
#define BPF_OPC_ST 0x02
#define BPF_OPC_STX 0x03
#define BPF_OPC_ALU 0x04
#define BPF_OPC_JMP 0x05
#define BPF_OPC_RET 0x06
#define BPF_OPC_MISC 0x07
#define BPF_OPC_W 0x00
#define BPF_OPC_H 0x08
#define BPF_OPC_B 0x10
#define BPF_OPC_IMM 0x00
#define BPF_OPC_ABS 0x20
#define BPF_OPC_IND 0x40
#define BPF_OPC_MEM 0x60
#define BPF_OPC_LEL 0x80
#define BPF_OPC_MSH 0xa0
#define BPF_OPC_ADD 0x00
#define BPF_OPC_SUB 0x10
#define BPF_OPC_MUL 0x20
#define BPF_OPC_DIV 0x30
#define BPF_OPC_OR 0x40
#define BPF_OPC_AND 0x50
#define BPF_OPC_LSH 0x60
#define BPF_OPC_RSH 0x70
#define BPF_OPC_NEG 0x80
#define BPF_OPC_MOD 0x90
#define BPF_OPC_XOR 0xa0
#define BPF_OPC_JA 0x00
#define BPF_OPC_JEQ 0x10
#define BPF_OPC_JGT 0x20
#define BPF_OPC_JGE 0x30
#define BPF_OPC_JSET 0x40
#define BPF_OPC_K 0x00
#define BPF_OPC_X 0x08
const struct bpf_opcode bpf_opcodes[] = {
{ "mov32", BPF_OPC_ALU | BPF_OPC_MOV | BPF_OPC_X, "1,2" },
{ "mov", BPF_OPC_ALU64 | BPF_OPC_MOV | BPF_OPC_X, "1,2" },
{ "add32", BPF_OPC_ALU | BPF_OPC_ADD | BPF_OPC_X, "1,2" },
{ "add", BPF_OPC_ALU64 | BPF_OPC_ADD | BPF_OPC_X, "1,2" },
{ "sub32", BPF_OPC_ALU | BPF_OPC_SUB | BPF_OPC_X, "1,2" },
{ "sub", BPF_OPC_ALU64 | BPF_OPC_SUB | BPF_OPC_X, "1,2" },
{ "and32", BPF_OPC_ALU | BPF_OPC_AND | BPF_OPC_X, "1,2" },
{ "and", BPF_OPC_ALU64 | BPF_OPC_AND | BPF_OPC_X, "1,2" },
{ "or32", BPF_OPC_ALU | BPF_OPC_OR | BPF_OPC_X, "1,2" },
{ "or", BPF_OPC_ALU64 | BPF_OPC_OR | BPF_OPC_X, "1,2" },
{ "xor32", BPF_OPC_ALU | BPF_OPC_XOR | BPF_OPC_X, "1,2" },
{ "xor", BPF_OPC_ALU64 | BPF_OPC_XOR | BPF_OPC_X, "1,2" },
{ "mul32", BPF_OPC_ALU | BPF_OPC_MUL | BPF_OPC_X, "1,2" },
{ "mul", BPF_OPC_ALU64 | BPF_OPC_MUL | BPF_OPC_X, "1,2" },
{ "div32", BPF_OPC_ALU | BPF_OPC_DIV | BPF_OPC_X, "1,2" },
{ "div", BPF_OPC_ALU64 | BPF_OPC_DIV | BPF_OPC_X, "1,2" },
{ "mod32", BPF_OPC_ALU | BPF_OPC_MOD | BPF_OPC_X, "1,2" },
{ "mod", BPF_OPC_ALU64 | BPF_OPC_MOD | BPF_OPC_X, "1,2" },
{ "lsh32", BPF_OPC_ALU | BPF_OPC_LSH | BPF_OPC_X, "1,2" },
{ "lsh", BPF_OPC_ALU64 | BPF_OPC_LSH | BPF_OPC_X, "1,2" },
{ "rsh32", BPF_OPC_ALU | BPF_OPC_RSH | BPF_OPC_X, "1,2" },
{ "rsh", BPF_OPC_ALU64 | BPF_OPC_RSH | BPF_OPC_X, "1,2" },
{ "arsh32", BPF_OPC_ALU | BPF_OPC_ARSH | BPF_OPC_X, "1,2" },
{ "arsh", BPF_OPC_ALU64 | BPF_OPC_ARSH | BPF_OPC_X, "1,2" },
{ "neg32", BPF_OPC_ALU | BPF_OPC_NEG | BPF_OPC_X, "1" },
{ "neg", BPF_OPC_ALU64 | BPF_OPC_NEG | BPF_OPC_X, "1" },
{ "endbe", BPF_OPC_ALU | BPF_OPC_END | BPF_OPC_TO_BE, "1,i" },
{ "endle", BPF_OPC_ALU | BPF_OPC_END | BPF_OPC_TO_LE, "1,i" },
{ "mov32", BPF_OPC_ALU | BPF_OPC_MOV | BPF_OPC_K, "1,i" },
{ "mov", BPF_OPC_ALU64 | BPF_OPC_MOV | BPF_OPC_K, "1,i" },
{ "add32", BPF_OPC_ALU | BPF_OPC_ADD | BPF_OPC_K, "1,i" },
{ "add", BPF_OPC_ALU64 | BPF_OPC_ADD | BPF_OPC_K, "1,i" },
{ "sub32", BPF_OPC_ALU | BPF_OPC_SUB | BPF_OPC_K, "1,i" },
{ "sub", BPF_OPC_ALU64 | BPF_OPC_SUB | BPF_OPC_K, "1,i" },
{ "and32", BPF_OPC_ALU | BPF_OPC_AND | BPF_OPC_K, "1,i" },
{ "and", BPF_OPC_ALU64 | BPF_OPC_AND | BPF_OPC_K, "1,i" },
{ "or32", BPF_OPC_ALU | BPF_OPC_XOR | BPF_OPC_K, "1,i" },
{ "or", BPF_OPC_ALU64 | BPF_OPC_XOR | BPF_OPC_K, "1,i" },
{ "xor32", BPF_OPC_ALU | BPF_OPC_OR | BPF_OPC_K, "1,i" },
{ "xor", BPF_OPC_ALU64 | BPF_OPC_OR | BPF_OPC_K, "1,i" },
{ "mul32", BPF_OPC_ALU | BPF_OPC_MUL | BPF_OPC_K, "1,i" },
{ "mul", BPF_OPC_ALU64 | BPF_OPC_MUL | BPF_OPC_K, "1,i" },
{ "div32", BPF_OPC_ALU | BPF_OPC_DIV | BPF_OPC_K, "1,i" },
{ "div", BPF_OPC_ALU64 | BPF_OPC_DIV | BPF_OPC_K, "1,i" },
{ "mod32", BPF_OPC_ALU | BPF_OPC_MOD | BPF_OPC_K, "1,i" },
{ "mod", BPF_OPC_ALU64 | BPF_OPC_MOD | BPF_OPC_K, "1,i" },
{ "lsh32", BPF_OPC_ALU | BPF_OPC_LSH | BPF_OPC_K, "1,i" },
{ "lsh", BPF_OPC_ALU64 | BPF_OPC_LSH | BPF_OPC_K, "1,i" },
{ "rsh32", BPF_OPC_ALU | BPF_OPC_RSH | BPF_OPC_K, "1,i" },
{ "rsh", BPF_OPC_ALU64 | BPF_OPC_RSH | BPF_OPC_K, "1,i" },
{ "arsh32", BPF_OPC_ALU | BPF_OPC_ARSH | BPF_OPC_K, "1,i" },
{ "arsh", BPF_OPC_ALU64 | BPF_OPC_ARSH | BPF_OPC_K, "1,i" },
{ "ja", BPF_OPC_JMP | BPF_OPC_JA, "L" },
{ "jeq", BPF_OPC_JMP | BPF_OPC_JEQ | BPF_OPC_X, "1,2,L" },
{ "jgt", BPF_OPC_JMP | BPF_OPC_JGT | BPF_OPC_X, "1,2,L" },
{ "jge", BPF_OPC_JMP | BPF_OPC_JGE | BPF_OPC_X, "1,2,L" },
{ "jne", BPF_OPC_JMP | BPF_OPC_JNE | BPF_OPC_X, "1,2,L" },
{ "jsgt", BPF_OPC_JMP | BPF_OPC_JSGT | BPF_OPC_X, "1,2,L" },
{ "jsge", BPF_OPC_JMP | BPF_OPC_JSGE | BPF_OPC_X, "1,2,L" },
{ "jset", BPF_OPC_JMP | BPF_OPC_JSET | BPF_OPC_X, "1,2,L" },
{ "jeq", BPF_OPC_JMP | BPF_OPC_JEQ | BPF_OPC_K, "1,i,L" },
{ "jgt", BPF_OPC_JMP | BPF_OPC_JGT | BPF_OPC_K, "1,i,L" },
{ "jge", BPF_OPC_JMP | BPF_OPC_JGE | BPF_OPC_K, "1,i,L" },
{ "jne", BPF_OPC_JMP | BPF_OPC_JNE | BPF_OPC_K, "1,i,L" },
{ "jsgt", BPF_OPC_JMP | BPF_OPC_JSGT | BPF_OPC_K, "1,i,L" },
{ "jsge", BPF_OPC_JMP | BPF_OPC_JSGE | BPF_OPC_K, "1,i,L" },
{ "jset", BPF_OPC_JMP | BPF_OPC_JSET | BPF_OPC_K, "1,i,L" },
{ "call", BPF_OPC_JMP | BPF_OPC_CALL, "C" },
{ "tailcall",BPF_OPC_JMP | BPF_OPC_CALL | BPF_OPC_X, "C" },
{ "exit", BPF_OPC_JMP | BPF_OPC_EXIT, "" },
{ "ldimm64", BPF_OPC_LD | BPF_OPC_IMM | BPF_OPC_DW, "1,D" },
{ "ldxw", BPF_OPC_LDX | BPF_OPC_MEM | BPF_OPC_W, "1,[2+O]" },
{ "ldxh", BPF_OPC_LDX | BPF_OPC_MEM | BPF_OPC_H, "1,[2+O]" },
{ "ldxb", BPF_OPC_LDX | BPF_OPC_MEM | BPF_OPC_B, "1,[2+O]" },
{ "ldxdw", BPF_OPC_LDX | BPF_OPC_MEM | BPF_OPC_DW, "1,[2+O]" },
{ "stw", BPF_OPC_ST | BPF_OPC_MEM | BPF_OPC_W, "[1+O],i" },
{ "sth", BPF_OPC_ST | BPF_OPC_MEM | BPF_OPC_H, "[1+O],i" },
{ "stb", BPF_OPC_ST | BPF_OPC_MEM | BPF_OPC_B, "[1+O],i" },
{ "stdw", BPF_OPC_ST | BPF_OPC_MEM | BPF_OPC_DW, "[1+O],i" },
{ "stw", BPF_OPC_STX | BPF_OPC_MEM | BPF_OPC_W, "[1+O],2" },
{ "sth", BPF_OPC_STX | BPF_OPC_MEM | BPF_OPC_H, "[1+O],2" },
{ "stb", BPF_OPC_STX | BPF_OPC_MEM | BPF_OPC_B, "[1+O],2" },
{ "stdw", BPF_OPC_STX | BPF_OPC_MEM | BPF_OPC_DW, "[1+O],2" },
{ "xaddw", BPF_OPC_STX | BPF_OPC_XADD | BPF_OPC_W, "[1+O],2" },
{ "xadddw", BPF_OPC_STX | BPF_OPC_XADD | BPF_OPC_DW, "[1+O],2" },
};
const int bpf_num_opcodes = ((sizeof bpf_opcodes)/(sizeof bpf_opcodes[0]));
^ permalink raw reply
* [net-next] net: update comment for netif_dormant() function
From: Zhang Shengju @ 2017-04-26 3:05 UTC (permalink / raw)
To: netdev, davem
This patch updates the comment for netif_dormant() function to reflect
the intended usage.
Signed-off-by: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
---
include/linux/netdevice.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5d5267f..b8c8e5b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3400,10 +3400,10 @@ static inline void netif_dormant_off(struct net_device *dev)
}
/**
- * netif_dormant - test if carrier present
+ * netif_dormant - test if device is dormant
* @dev: network device
*
- * Check if carrier is present on device
+ * Check if device is dormant.
*/
static inline bool netif_dormant(const struct net_device *dev)
{
--
1.8.3.1
^ permalink raw reply related
* Re: xdp_redirect ifindex vs port. Was: best API for returning/setting egress port?
From: John Fastabend @ 2017-04-26 3:07 UTC (permalink / raw)
To: Alexei Starovoitov, Jesper Dangaard Brouer
Cc: Daniel Borkmann, Andy Gospodarek, Daniel Borkmann,
Alexei Starovoitov, netdev@vger.kernel.org,
xdp-newbies@vger.kernel.org
In-Reply-To: <20170426002610.eihwmz4knbmrolfw@ast-mbp.dhcp.thefacebook.com>
On 17-04-25 05:26 PM, Alexei Starovoitov wrote:
> On Tue, Apr 25, 2017 at 11:34:53AM +0200, Jesper Dangaard Brouer wrote:
>>> Note the very first bpf patchset years ago contained the port table
>>> abstraction. ovs has concept of vports as well. These two very
>>> different projects needed port table to provide a layer of
>>> indirection between ifindex==netdev and virtual port number.
>>> This is still the case and I'd like to see this port table to be
>>> implemented for both cls_bpf and xdp. In that sense xdp is not
>>> special.
>>
>> Glad to hear you want to see this implemented, I will start coding on
>> this then. Good point with cls_bpf, I was planning to make this port
>> table strongly connected to XDP, guess I should also think of cls_bpf.
>
> perfect.
> I think we should try to make all additions to bpf networking world
> to be usable for both tc and xdp, since both are actively used and
> it wouldn't be great to have cool feature for one, but not the other.
> I think port table is an excellent candidate that applies to both.
+1
Jesper, I was working up the code for the redirect piece for ixgbe and
virtio, please use this as a base for your virtual port number table. I'll
push an update onto github tomorrow. I think the table should drop in fairly
nicely.
One piece that isn't clear to me is how do you plan to instantiate and
program this table. Is it a new static bpf map that is created any time we see
the redirect command? I think this would be preferred.
>
>> I'm not worried about the DROP case, I agree that is fine (as you also
>> say). The problem is unintentionally sending a packet to a wrong
>> ifindex. This is clearly an eBPF program error, BUT with XDP this
>> becomes a very hard to debug program error. With TC-redirect/cls_bpf
>> we can tcpdump the packets, with XDP there is no visibility into this
>> happening (the NSA is going to love this "feature"). Maybe we could add
>> yet-another tracepoint to allow debugging this. My proposal that we
>> simply remove the possibility for such program errors, by as you say
>> move the validation from run-time into static insertion-time, via a
>> port table.
>
> I think lack of tcpdump-like debugging in xdp is a separate issue.
> As I was saying in the other thread we have trivial 'xdpdump' kern+user
> app that emits pcap file, but it's too specific to how we use
> tail_calls+prog_array in our xdp setup. I'm working on the program
> chaining that will be generic and allow us transparently add multiple
> xdp or tc progs to the same attachment point and will allow us to
> do 'xdpdump' at any point of this pipeline, so debugging of what
> happened to the packet will be easier and done in the same way
> for both tc and xdp.
> btw in our experience working with both tc and xdp the tc+bpf was
> actually harder to use and more bug prone.
>
Nice, the tcpdump-like debugging looks interesting.
^ permalink raw reply
* Re: [PATCH net v2 0/3] net: hns: bug fix for HNS driver
From: lipeng (Y) @ 2017-04-26 3:05 UTC (permalink / raw)
To: David Miller, yankejian
Cc: salil.mehta, yisen.zhuang, huangdaode, zhouhuiru, netdev,
charles.chenxin, linuxarm
In-Reply-To: <20170424.122448.1775404720510848518.davem@davemloft.net>
On 2017/4/25 0:24, David Miller wrote:
> From: Yankejian <yankejian@huawei.com>
> Date: Fri, 21 Apr 2017 15:44:41 +0800
>
>> From: lipeng <lipeng321@huawei.com>
>>
>> This series adds support defered probe when mdio or mbigen module
>> insmod behind HNS driver, and fixes a bug that a skb has been
>> freed, but it may be still used in driver.
>>
>> change log:
>> V1 -> V2:
>> 1. Return appropriate errno in hns_mac_register_phy;
> At a minimum you're going to have to expand your commit message in
> patch #1 so that it has more detail and explains the situation better.
> .
OK, will add more detail and explains for #1 patch.
thanks
^ permalink raw reply
* Re: hmmm...
From: Alexei Starovoitov @ 2017-04-26 2:56 UTC (permalink / raw)
To: David Miller; +Cc: netdev
In-Reply-To: <20170425.215232.727894789143301373.davem@davemloft.net>
On 4/25/17 6:52 PM, David Miller wrote:
>
> Alexei, I found something strange on my computer :-)
>
> [davem@localhost binutils]$ ./objdump -d x.o
No way! :) I thought it will take weeks!
Ship it. Ship it. Ship it.
Cannot wait to pull.
This is awesome. Thanks a ton!
What is the mnemonic for 32-bit alu ?
^ permalink raw reply
* Re: Bug and configuration MPLS error?
From: David Ahern @ 2017-04-26 2:22 UTC (permalink / raw)
To: Алексей Болдырев,
netdev
In-Reply-To: <184081493141314@web16g.yandex.ru>
On 4/25/17 11:28 AM, Алексей Болдырев wrote:
> 226 sysctl -w net.mpls.conf.lo.input=1
> 227 sysctl -w net.mpls.platform_labels=1048575
> 228 ip link add veth0 type veth peer name veth1
> 229 ip link add veth2 type veth peer name veth3
> 230 sysctl -w net.mpls.conf.veth0.input=1
> 231 sysctl -w net.mpls.conf.veth2.input=1
> 232 ifconfig veth0 10.3.3.1 netmask 255.255.255.0
> 233 ifconfig veth2 10.4.4.1 netmask 255.255.255.0
> 234 ip netns add host1
> 235 ip netns add host2
> 236 ip link set veth1 netns host1
> 237 ip link set veth3 netns host2
> 238 ip netns exec host1 ifconfig veth1 10.3.3.2 netmask 255.255.255.0 up
> 239 ip netns exec host2 ifconfig veth3 10.4.4.2 netmask 255.255.255.0 up
> 240 ip netns exec host1 ip route add 10.10.10.2/32 encap mpls 112 via inet 10.3.3.1
> 241 ip netns exec host2 ip route add 10.10.10.1/32 encap mpls 111 via inet 10.4.4.1
> 242 ip -f mpls route add 111 via inet 10.3.3.2
> 243 ip -f mpls route add 112 via inet 10.4.4.2
your setup is incomplete.
# ip netns exec host2 ping 10.10.10.1
PING 10.10.10.1 (10.10.10.1) 56(84) bytes of data.
^C
--- 10.10.10.1 ping statistics ---
2 packets transmitted, 0 received, 100% packet loss, time 1038ms
If you run tcpdump on veth1 in host1 you see the packets come in but no
response:
# ip netns exec host1 tcpdump -n -i veth1
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on veth1, link-type EN10MB (Ethernet), capture size 262144 bytes
19:20:24.599529 IP6 fe80::347d:e3ff:fe93:944b > ff02::2: ICMP6, router
solicitation, length 16
19:20:27.413901 IP 10.4.4.2 > 10.10.10.1: ICMP echo request, id 978, seq
1, length 64
19:20:28.439574 IP 10.4.4.2 > 10.10.10.1: ICMP echo request, id 978, seq
2, length 64
and the lack of response is b/c:
1. host1 has no address for 10.10.10.1 and
2. even if it did, there is no return route to 10.4.4.2:
# ip -netns host1 ro ls
10.3.3.0/24 dev veth1 proto kernel scope link src 10.3.3.2
10.10.10.2 encap mpls 112 via 10.3.3.1 dev veth1
^ permalink raw reply
* hmmm...
From: David Miller @ 2017-04-26 1:52 UTC (permalink / raw)
To: ast; +Cc: netdev
Alexei, I found something strange on my computer :-)
[davem@localhost binutils]$ ./objdump -d x.o
x.o: file format elf64-bpf
Disassembly of section test1:
0000000000000000 <process>:
0: b7 00 00 00 00 00 00 02 mov r0, 2
8: 61 21 00 50 00 00 00 00 ldxw r2, [r1+80]
10: 61 11 00 4c 00 00 00 00 ldxw r1, [r1+76]
18: bf 41 00 00 00 00 00 00 mov r4, r1
20: 07 40 00 00 00 00 00 0e add r4, 14
28: 2d 42 00 25 00 00 00 00 jgt r4, r2, 148 <LBB0_11>
30: 71 51 00 0d 00 00 00 00 ldxb r5, [r1+13]
38: 71 31 00 0c 00 00 00 00 ldxb r3, [r1+12]
40: 67 30 00 00 00 00 00 08 lsh r3, 8
48: 4f 35 00 00 00 00 00 00 or r3, r5
50: 15 30 00 09 00 00 dd 86 jeq r3, 56710, 90 <process+0x90>
58: 55 30 00 1d 00 00 00 08 jne r3, 8, 138 <LBB0_6+0x70>
60: bf 31 00 00 00 00 00 00 mov r3, r1
68: 07 30 00 00 00 00 00 22 add r3, 34
70: 2d 32 00 1c 00 00 00 00 jgt r3, r2, 148 <LBB0_11>
78: b7 30 00 00 00 00 00 03 mov r3, 3
80: 71 54 00 00 00 00 00 00 ldxb r5, [r4+0]
88: 67 50 00 00 00 00 00 02 lsh r5, 2
90: 57 50 00 00 00 00 00 3c and r5, 60
98: 05 00 00 05 00 00 00 00 ja b8 <LBB0_5+0x18>
00000000000000a0 <LBB0_5>:
a0: b7 50 00 00 00 00 00 28 mov r5, 40
a8: b7 30 00 00 00 00 00 00 mov r3, 0
b0: bf 41 00 00 00 00 00 00 mov r4, r1
b8: 07 40 00 00 00 00 00 36 add r4, 54
c0: 2d 42 00 12 00 00 00 00 jgt r4, r2, 148 <LBB0_11>
00000000000000c8 <LBB0_6>:
c8: bf 41 00 00 00 00 00 00 mov r4, r1
d0: 0f 45 00 00 00 00 00 00 add r4, r5
d8: 07 40 00 00 00 00 00 0e add r4, 14
e0: 15 40 00 0c 00 00 00 00 jeq r4, 0, 138 <LBB0_6+0x70>
e8: bf 54 00 00 00 00 00 00 mov r5, r4
f0: 07 50 00 00 00 00 00 14 add r5, 20
f8: 2d 52 00 0b 00 00 00 00 jgt r5, r2, 148 <LBB0_11>
100: 0f 13 00 00 00 00 00 00 add r1, r3
108: 71 11 00 14 00 00 00 00 ldxb r1, [r1+20]
110: 57 10 00 00 00 00 00 ff and r1, 255
118: 55 10 00 07 00 00 00 06 jne r1, 6, 148 <LBB0_11>
120: 07 40 00 00 00 00 00 12 add r4, 18
128: 2d 42 00 05 00 00 00 00 jgt r4, r2, 148 <LBB0_11>
130: b7 00 00 00 00 00 00 00 mov r0, 0
138: 69 14 00 00 00 00 00 00 ldxh r1, [r4+0]
140: 15 10 00 02 00 00 00 7b jeq r1, 123, 148 <LBB0_11>
0000000000000148 <LBB0_11>:
148: 18 00 00 00 ff ff ff ff ldimm64 r0, 4294967295
150: 00 00 00 00 00 00 00 00
0000000000000158 <LBB0_12>:
158: 95 00 00 00 00 00 00 00 exit
[davem@localhost binutils]$
^ permalink raw reply
* linux-next: manual merge of the net-next tree with the kbuild tree
From: Stephen Rothwell @ 2017-04-26 1:09 UTC (permalink / raw)
To: David Miller, Networking, Masahiro Yamada
Cc: Linux-Next Mailing List, Linux Kernel Mailing List,
Nicolas Dichtel, Gerard Garcia
Hi all,
Today's linux-next merge of the net-next tree got a conflict in:
include/uapi/linux/Kbuild
between commit:
65017bab8a9e ("uapi: export all headers under uapi directories")
from the kbuild tree and commit:
0b2e66448ba2 ("VSOCK: Add vsockmon device")
from the net-next tree.
I fixed it up (I just used the kbuild tree version as new entries are not
needed any more in this file) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging. You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.
--
Cheers,
Stephen Rothwell
^ permalink raw reply
* Re: [PATCH v2 2/2] ARM: dts: Add the ethernet and ethernet PHY to the cygnus core DT.
From: Andrew Lunn @ 2017-04-26 0:49 UTC (permalink / raw)
To: Eric Anholt
Cc: Florian Fainelli, Vivien Didelot, netdev, Rob Herring,
Mark Rutland, devicetree, linux-arm-kernel, linux-kernel,
bcm-kernel-feedback-list, Ray Jui, Scott Branden, Jon Mason
In-Reply-To: <20170425235357.7690-2-eric@anholt.net>
> + eth0: ethernet@18042000 {
> + compatible = "brcm,amac";
> + reg = <0x18042000 0x1000>,
> + <0x18110000 0x1000>;
> + reg-names = "amac_base", "idm_base";
> + interrupts = <GIC_SPI 110 IRQ_TYPE_LEVEL_HIGH>;
> + max-speed = <1000>;
Hi Eric
Sorry i missed this the first time. Does this Ethernet controller do >
1Gbps? Does this max-speed do anything useful?
Andrew
^ permalink raw reply
* Re: TCP fast open using experimental TCP option?
From: Tom Herbert @ 2017-04-26 0:39 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Linux Kernel Network Developers, Jerry Chu
In-Reply-To: <1493165636.6453.56.camel@edumazet-glaptop3.roam.corp.google.com>
On Tue, Apr 25, 2017 at 5:13 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Tue, 2017-04-25 at 12:08 -0700, Tom Herbert wrote:
>> Looks like TCP fast open was using experimental TCP option at some. Is
>> this still needed? Technically this violates usage requirements of
>> experimental options. Can this be removed now since there is now an
>> assigned option number for TFO?
>>
>> case TCPOPT_EXP:
>> /* Fast Open option shares code 254 using a
>> * 16 bits magic number.
>> */
>> if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
>> get_unaligned_be16(ptr) ==
>> TCPOPT_FASTOPEN_MAGIC)
>> tcp_parse_fastopen_option(opsize -
>> TCPOLEN_EXP_FASTOPEN_BASE,
>> ptr + 2, th->syn, foc, true);
>> break;
>
> Hi Tom
>
> Client side was updated in linux-4.1 only two years ago.
>
> We lack counters telling how often the experimental option is used.
>
> RFC6994 ( 5. Migration to Assigned Options ) guidelines are properly
> met.
>
> Not sure why we should hurry ?
>
An IETFer was complaining that Linux indiscriminately violates TCP
RFCs and gave the use of experimental options as example. Yuchung
pointed out that this use is actually conformant to the spec so we're
good (thanks Yuchung!).
Tom
>
^ permalink raw reply
* [PATCH] ipv6: check raw payload size correctly in ioctl
From: Jamie Bainbridge @ 2017-04-26 0:43 UTC (permalink / raw)
To: David S. Miller, Alexey Kuznetsov, James Morris,
Hideaki YOSHIFUJI, Patrick McHardy, netdev
Cc: Jamie Bainbridge
In situations where an skb is paged, the transport header pointer and
tail pointer can be the same because the skb contents are in frags.
This results in ioctl(SIOCINQ/FIONREAD) incorrectly returning a
length of 0 when the length to receive is actually greater than zero.
skb->len is already correctly set in ip6_input_finish() with
pskb_pull(), so use skb->len as it always returns the correct result
for both linear and paged data.
Signed-off-by: Jamie Bainbridge <jbainbri@redhat.com>
---
net/ipv6/raw.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index f174e76e6505d4045e940c9fceef765d2aaa937d..0da6a12b5472e322d679572c7244e5c9bc467741 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -1178,8 +1178,7 @@ static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg)
spin_lock_bh(&sk->sk_receive_queue.lock);
skb = skb_peek(&sk->sk_receive_queue);
if (skb)
- amount = skb_tail_pointer(skb) -
- skb_transport_header(skb);
+ amount = skb->len;
spin_unlock_bh(&sk->sk_receive_queue.lock);
return put_user(amount, (int __user *)arg);
}
--
1.8.3.1
^ permalink raw reply related
* Re: [PATCH net] net: batman-adv: Fix possible memleaks when fail to register_netdevice
From: Gao Feng @ 2017-04-26 0:41 UTC (permalink / raw)
To: 'Sven Eckelmann',
b.a.t.m.a.n-ZwoEplunGu2X36UT3dwllkB+6BGkLq7r
Cc: mareklindner-rVWd3aGhH2z5bpWLKbzFeg,
netdev-u79uwXL29TY76Z2rM5mHXA, a, 'Gao Feng',
davem-fT/PcQaiUtIeIZ0/mPfg9Q
In-Reply-To: <1756616.320MP6AHYH@bentobox>
> From: Sven Eckelmann [mailto:sven-KaDOiPu9UxWEi8DpZVb4nw@public.gmane.org]
> Sent: Tuesday, April 25, 2017 9:53 PM
> On Dienstag, 25. April 2017 20:03:20 CEST gfree.wind-H32Fclmsjq1BDgjK7y7TUQ@public.gmane.org wrote:
> > From: Gao Feng <fgao-KlmEoCYek3zQT0dZR+AlfA@public.gmane.org>
> >
> > Because the func batadv_softif_init_late allocate some resources and
> > it would be invoked in register_netdevice. So we need to invoke the
> > func batadv_softif_free instead of free_netdev to cleanup when fail to
> > register_netdevice.
>
> I could be wrong, but shouldn't the destructor be replaced with
free_netdevice
> and the batadv_softif_free (without the free_netdev) used as ndo_uninit?
The
> line you've changed should then be kept as free_netdevice.
>
> At least this seems to be important when using rtnl_newlink() instead of
the
> legacy sysfs netdev stuff from batman-adv. rtnl_newlink() would also only
call
> free_netdevice and thus also not run batadv_softif_free. This seems to be
only
> fixable by calling ndo_uninit.
>
> Kind regards,
> Sven
Sorry, I don't get you.
The net_dev is created in this func batadv_softif_create.
Why couldn't invoke batadv_softif_free to cleanup when fail to
register_netdevice?
Best Regards
Feng
^ permalink raw reply
* [PATCH net-next] tcp: memset ca_priv data to 0 properly
From: Wei Wang @ 2017-04-26 0:38 UTC (permalink / raw)
To: netdev, David Miller; +Cc: Eric Dumazet, Yuchung Cheng, Neal Cardwell, Wei Wang
From: Wei Wang <weiwan@google.com>
Always zero out ca_priv data in tcp_assign_congestion_control() so that
ca_priv data is cleared out during socket creation.
Also always zero out ca_priv data in tcp_reinit_congestion_control() so
that when cc algorithm is changed, ca_priv data is cleared out as well.
We should still zero out ca_priv data even in TCP_CLOSE state because
user could call connect() on AF_UNSPEC to disconnect the socket and
leave it in TCP_CLOSE state and later call setsockopt() to switch cc
algorithm on this socket.
Fixes: 2b0a8c9ee ("tcp: add CDG congestion control")
Reported-by: Andrey Konovalov <andreyknvl@google.com>
Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
---
net/ipv4/tcp_cong.c | 11 +++--------
1 file changed, 3 insertions(+), 8 deletions(-)
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 79c4817abc94..6e3c512054a6 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -168,12 +168,8 @@ void tcp_assign_congestion_control(struct sock *sk)
}
out:
rcu_read_unlock();
+ memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
- /* Clear out private data before diag gets it and
- * the ca has not been initialized.
- */
- if (ca->get_info)
- memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
if (ca->flags & TCP_CONG_NEEDS_ECN)
INET_ECN_xmit(sk);
else
@@ -200,11 +196,10 @@ static void tcp_reinit_congestion_control(struct sock *sk,
tcp_cleanup_congestion_control(sk);
icsk->icsk_ca_ops = ca;
icsk->icsk_ca_setsockopt = 1;
+ memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
- if (sk->sk_state != TCP_CLOSE) {
- memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+ if (sk->sk_state != TCP_CLOSE)
tcp_init_congestion_control(sk);
- }
}
/* Manage refcounts on socket close. */
--
2.13.0.rc0.306.g87b477812d-goog
^ permalink raw reply related
* Re: xdp_redirect ifindex vs port. Was: best API for returning/setting egress port?
From: Alexei Starovoitov @ 2017-04-26 0:26 UTC (permalink / raw)
To: Jesper Dangaard Brouer
Cc: Daniel Borkmann, Andy Gospodarek, Daniel Borkmann,
Alexei Starovoitov, netdev@vger.kernel.org,
xdp-newbies@vger.kernel.org, John Fastabend
In-Reply-To: <20170425113453.5c72080f@redhat.com>
On Tue, Apr 25, 2017 at 11:34:53AM +0200, Jesper Dangaard Brouer wrote:
> > Note the very first bpf patchset years ago contained the port table
> > abstraction. ovs has concept of vports as well. These two very
> > different projects needed port table to provide a layer of
> > indirection between ifindex==netdev and virtual port number.
> > This is still the case and I'd like to see this port table to be
> > implemented for both cls_bpf and xdp. In that sense xdp is not
> > special.
>
> Glad to hear you want to see this implemented, I will start coding on
> this then. Good point with cls_bpf, I was planning to make this port
> table strongly connected to XDP, guess I should also think of cls_bpf.
perfect.
I think we should try to make all additions to bpf networking world
to be usable for both tc and xdp, since both are actively used and
it wouldn't be great to have cool feature for one, but not the other.
I think port table is an excellent candidate that applies to both.
> I'm not worried about the DROP case, I agree that is fine (as you also
> say). The problem is unintentionally sending a packet to a wrong
> ifindex. This is clearly an eBPF program error, BUT with XDP this
> becomes a very hard to debug program error. With TC-redirect/cls_bpf
> we can tcpdump the packets, with XDP there is no visibility into this
> happening (the NSA is going to love this "feature"). Maybe we could add
> yet-another tracepoint to allow debugging this. My proposal that we
> simply remove the possibility for such program errors, by as you say
> move the validation from run-time into static insertion-time, via a
> port table.
I think lack of tcpdump-like debugging in xdp is a separate issue.
As I was saying in the other thread we have trivial 'xdpdump' kern+user
app that emits pcap file, but it's too specific to how we use
tail_calls+prog_array in our xdp setup. I'm working on the program
chaining that will be generic and allow us transparently add multiple
xdp or tc progs to the same attachment point and will allow us to
do 'xdpdump' at any point of this pipeline, so debugging of what
happened to the packet will be easier and done in the same way
for both tc and xdp.
btw in our experience working with both tc and xdp the tc+bpf was
actually harder to use and more bug prone.
^ permalink raw reply
* Re: [PATCH net-next] virtio-net: on tx, only call napi_disable if tx napi is on
From: Jason Wang @ 2017-04-26 0:17 UTC (permalink / raw)
To: Willem de Bruijn, netdev; +Cc: Willem de Bruijn, virtualization, davem, mst
In-Reply-To: <20170425195917.54209-1-willemdebruijn.kernel@gmail.com>
On 2017年04月26日 03:59, Willem de Bruijn wrote:
> From: Willem de Bruijn <willemb@google.com>
>
> As of tx napi, device down (`ip link set dev $dev down`) hangs unless
> tx napi is enabled. Else napi_enable is not called, so napi_disable
> will spin on test_and_set_bit NAPI_STATE_SCHED.
>
> Only call napi_disable if tx napi is enabled.
>
> Fixes: 5a719c2552ca ("virtio-net: transmit napi")
> Reported-by: Jason Wang <jasowang@redhat.com>
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> ---
> drivers/net/virtio_net.c | 10 ++++++++--
> 1 file changed, 8 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 003143835766..82f1c3a73345 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -997,6 +997,12 @@ static void virtnet_napi_tx_enable(struct virtnet_info *vi,
> return virtnet_napi_enable(vq, napi);
> }
>
> +static void virtnet_napi_tx_disable(struct napi_struct *napi)
> +{
> + if (napi->weight)
> + napi_disable(napi);
> +}
> +
> static void refill_work(struct work_struct *work)
> {
> struct virtnet_info *vi =
> @@ -1445,7 +1451,7 @@ static int virtnet_close(struct net_device *dev)
>
> for (i = 0; i < vi->max_queue_pairs; i++) {
> napi_disable(&vi->rq[i].napi);
> - napi_disable(&vi->sq[i].napi);
> + virtnet_napi_tx_disable(&vi->sq[i].napi);
> }
>
> return 0;
> @@ -1803,7 +1809,7 @@ static void virtnet_freeze_down(struct virtio_device *vdev)
> if (netif_running(vi->dev)) {
> for (i = 0; i < vi->max_queue_pairs; i++) {
> napi_disable(&vi->rq[i].napi);
> - napi_disable(&vi->sq[i].napi);
> + virtnet_napi_tx_disable(&vi->sq[i].napi);
> }
> }
> }
Acked-by: Jason Wang <jasowang@redhat.com>
Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Get back to me
From: Ashraf Basit @ 2017-04-26 0:10 UTC (permalink / raw)
To: netdev
Good day. Did you receive the business proposal I sent to you yesterday? I was waiting for your reply but I am not sure if you receive the message. If for some reason you did not receive my previous email, I can resend the message to you. Please confirm as this is very urgent and important.
Regards,
Ashraf.
ashrafbas@secsuremailer.com
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox