* Re: [PATCH net-next,v2 3/4] net: flow_offload: mangle action at byte level
From: Vlad Buslov @ 2019-09-04 12:48 UTC (permalink / raw)
To: Pablo Neira Ayuso
Cc: netfilter-devel@vger.kernel.org, davem@davemloft.net,
netdev@vger.kernel.org, jakub.kicinski@netronome.com,
jiri@resnulli.us, Saeed Mahameed, vishal@chelsio.com, Vlad Buslov
In-Reply-To: <20190903164513.15462-4-pablo@netfilter.org>
On Tue 03 Sep 2019 at 19:45, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> The flow mangle action is originally modeled after the tc pedit action,
> this has a number of shortcomings:
>
> 1) The tc pedit offset must be set on the 32-bits boundaries. Many
> protocol header field offsets are not aligned to 32-bits, eg. port
> destination, port source and ethernet destination. This patch adjusts
> the offset accordingly and trim off length in these case, so drivers get
> an exact offset and length to the header fields.
>
> 2) The maximum mangle length is one word of 32-bits, hence you need to
> up to four actions to mangle an IPv6 address. This patch coalesces
> consecutive tc pedit actions into one single action so drivers can
> configure the IPv6 mangling in one go. Ethernet address fields now
> require one single action instead of two too.
>
> The following drivers have been updated accordingly to use this new
> mangle action layout:
>
> 1) The cxgb4 driver does not need to split protocol field matching
> larger than one 32-bit words into multiple definitions. Instead one
> single definition per protocol field is enough. Checking for
> transport protocol ports is also simplified.
>
> 2) The mlx5 driver logic to disallow IPv4 ttl and IPv6 hoplimit fields
> becomes more simple too.
>
> 3) The nfp driver uses the nfp_fl_set_helper() function to configure the
> payload mangling. The memchr_inv() function is used to check for
> proper initialization of the value and mask. The driver has been
> updated to refer to the exact protocol header offsets too.
>
> As a result, this patch reduces code complexity on the driver side at
> the cost of adding ~100 LOC at the core to perform offset and length
> adjustment; and to coalesce consecutive actions.
>
> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> ---
> .../net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c | 162 +++++-----------
> .../net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h | 40 ++--
> drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 90 +++------
> drivers/net/ethernet/netronome/nfp/flower/action.c | 203 ++++++++++-----------
> include/net/flow_offload.h | 7 +-
> net/sched/cls_api.c | 145 ++++++++++++---
> 6 files changed, 309 insertions(+), 338 deletions(-)
[...]
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
> index f29895b3a947..b7b88bc22cf7 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
> @@ -2201,19 +2201,24 @@ static int pedit_header_offsets[] = {
>
> #define pedit_header(_ph, _htype) ((void *)(_ph) + pedit_header_offsets[_htype])
>
> -static int set_pedit_val(u8 hdr_type, u32 mask, u32 val, u32 offset,
> +static int set_pedit_val(u8 hdr_type, const struct flow_action_entry *act,
> struct pedit_headers_action *hdrs)
> {
> - u32 *curr_pmask, *curr_pval;
> + u32 offset = act->mangle.offset;
> + u8 *curr_pmask, *curr_pval;
> + int i;
>
> - curr_pmask = (u32 *)(pedit_header(&hdrs->masks, hdr_type) + offset);
> - curr_pval = (u32 *)(pedit_header(&hdrs->vals, hdr_type) + offset);
> + curr_pmask = (u8 *)(pedit_header(&hdrs->masks, hdr_type) + offset);
> + curr_pval = (u8 *)(pedit_header(&hdrs->vals, hdr_type) + offset);
>
> - if (*curr_pmask & mask) /* disallow acting twice on the same location */
> - goto out_err;
> + for (i = 0; i < act->mangle.len; i++) {
> + /* disallow acting twice on the same location */
> + if (curr_pmask[i] & act->mangle.mask[i])
> + goto out_err;
>
> - *curr_pmask |= mask;
> - *curr_pval |= val;
> + curr_pmask[i] |= act->mangle.mask[i];
> + curr_pval[i] |= act->mangle.val[i];
> + }
>
> return 0;
>
> @@ -2487,7 +2492,6 @@ static int parse_tc_pedit_action(struct mlx5e_priv *priv,
> {
> u8 cmd = (act->id == FLOW_ACTION_MANGLE) ? 0 : 1;
> int err = -EOPNOTSUPP;
> - u32 mask, val, offset;
> u8 htype;
>
> htype = act->mangle.htype;
> @@ -2504,11 +2508,7 @@ static int parse_tc_pedit_action(struct mlx5e_priv *priv,
> goto out_err;
> }
>
> - mask = act->mangle.mask;
> - val = act->mangle.val;
> - offset = act->mangle.offset;
> -
> - err = set_pedit_val(htype, mask, val, offset, &hdrs[cmd]);
> + err = set_pedit_val(htype, act, &hdrs[cmd]);
> if (err)
> goto out_err;
>
> @@ -2589,50 +2589,18 @@ static bool csum_offload_supported(struct mlx5e_priv *priv,
> return true;
> }
>
> -struct ip_ttl_word {
> - __u8 ttl;
> - __u8 protocol;
> - __sum16 check;
> -};
> -
> -struct ipv6_hoplimit_word {
> - __be16 payload_len;
> - __u8 nexthdr;
> - __u8 hop_limit;
> -};
> -
> static bool is_action_keys_supported(const struct flow_action_entry *act)
> {
> - u32 mask, offset;
> - u8 htype;
> + u32 offset = act->mangle.offset;
> + u8 htype = act->mangle.htype;
>
> - htype = act->mangle.htype;
> - offset = act->mangle.offset;
> - mask = act->mangle.mask;
> - /* For IPv4 & IPv6 header check 4 byte word,
> - * to determine that modified fields
> - * are NOT ttl & hop_limit only.
> - */
> - if (htype == FLOW_ACT_MANGLE_HDR_TYPE_IP4) {
> - struct ip_ttl_word *ttl_word =
> - (struct ip_ttl_word *)&mask;
> -
> - if (offset != offsetof(struct iphdr, ttl) ||
> - ttl_word->protocol ||
> - ttl_word->check) {
> - return true;
> - }
> - } else if (htype == FLOW_ACT_MANGLE_HDR_TYPE_IP6) {
> - struct ipv6_hoplimit_word *hoplimit_word =
> - (struct ipv6_hoplimit_word *)&mask;
> -
> - if (offset != offsetof(struct ipv6hdr, payload_len) ||
> - hoplimit_word->payload_len ||
> - hoplimit_word->nexthdr) {
> - return true;
> - }
> - }
> - return false;
> + if ((htype == FLOW_ACT_MANGLE_HDR_TYPE_IP4 &&
> + offset == offsetof(struct iphdr, ttl)) ||
> + (htype == FLOW_ACT_MANGLE_HDR_TYPE_IP6 &&
> + offset == offsetof(struct ipv6hdr, hop_limit)))
> + return false;
> +
> + return true;
> }
With this change is_action_keys_supported() incorrectly returns true for
non-IP{4|6} mangles. I guess naming of the functions doesn't help
because it should be something like is_action_iphdr_keys_supported()...
Anyway, this results following rule to be incorrectly rejected by
driver:
tc filter add dev ens1f0_0 protocol ip parent ffff: prio 3
flower dst_mac e4:1d:2d:fd:8b:02 skip_sw
action pedit ex munge eth src set 11:22:33:44:55:66 munge eth dst set
aa:bb:cc:dd:ee:ff pipe
action csum ip pipe
action tunnel_key set id 98 src_ip 2.2.2.2 dst_ip 2.2.2.3 dst_port 1234
action mirred egress redirect dev vxlan1
The pedit action is rejected by conditional that follows the loop in
modify_header_match_supported() which calls is_action_keys_supported().
With this change modify_ip_header==true (even though the pedit only
modifies eth header), which causes failure because ip proto is not
supported:
Error: mlx5_core: can't offload re-write of non TCP/UDP.
ERROR: [ 3345.830338] can't offload re-write of ip proto 0
^ permalink raw reply
* pull-request: can-next 2019-09-04 j1939
From: Marc Kleine-Budde @ 2019-09-04 12:29 UTC (permalink / raw)
To: netdev
Cc: davem, kernel, linux-can, Oliver Hartkopp, Bastian Stender,
Elenita Hinds, Kurt Van Dijck, Maxime Jayat, Robin van der Gracht,
Oleksij Rempel, David Jander
[-- Attachment #1.1: Type: text/plain, Size: 6099 bytes --]
Hello David,
this is a pull request for net-next/master consisting of 21 patches.
the first 12 patches are by me and target the CAN core infrastructure.
They clean up the names of variables , structs and struct members,
convert can_rx_register() to use max() instead of open coding it and
remove unneeded code from the can_pernet_exit() callback.
The next three patches are also by me and they introduce and make use of
the CAN midlayer private structure. It is used to hold protocol specific
per device data structures.
The next patch is by Oleksij Rempel, switches the
&net->can.rcvlists_lock from a spin_lock() to a spin_lock_bh(), so that
it can be used from NAPI (soft IRQ) context.
The next 4 patches are by Kurt Van Dijck, he first updates his email
address via mailmap and then extends sockaddr_can to include j1939
members.
The final patch is the collective effort of many entities (The j1939
authors: Oliver Hartkopp, Bastian Stender, Elenita Hinds, kbuild test
robot, Kurt Van Dijck, Maxime Jayat, Robin van der Gracht, Oleksij
Rempel, Marc Kleine-Budde). It adds support of SAE J1939 protocol to the
CAN networking stack.
SAE J1939 is the vehicle bus recommended practice used for communication
and diagnostics among vehicle components. Originating in the car and
heavy-duty truck industry in the United States, it is now widely used in
other parts of the world.
regards,
Marc
P.S.: This pull request doesn't invalidate my last pull request:
"pull-request: can-next 2019-09-03".
---
The following changes since commit 2c1f9e26344483e2c74e80ef708d9c7fd2e543f4:
Merge branch '100GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue (2019-09-03 21:51:25 -0700)
are available in the Git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can-next.git tags/linux-can-next-for-5.4-20190904
for you to fetch changes up to 9d71dd0c70099914fcd063135da3c580865e924c:
can: add support of SAE J1939 protocol (2019-09-04 14:22:33 +0200)
----------------------------------------------------------------
linux-can-next-for-5.4-20190904
----------------------------------------------------------------
Kurt Van Dijck (4):
mailmap: update email address
can: introduce CAN_REQUIRED_SIZE macro
can: add socket type for CAN_J1939
can: extend sockaddr_can to include j1939 members
Marc Kleine-Budde (15):
can: netns: give structs holding the CAN statistics a sensible name
can: netns: give members of struct netns_can holding the statistics a sensible name
can: af_can: give variables holding CAN statistics a sensible name
can: proc: give variables holding CAN statistics a sensible name
can: netns: remove "can_" prefix from members struct netns_can
can: af_can: give variable holding the CAN per device receive lists a sensible name
can: proc: give variable holding the CAN per device receive lists a sensible name
can: af_can: rename find_rcv_list() to can_rcv_list_find()
can: af_can: rename find_dev_rcv_lists() to can_dev_rcv_lists_find()
can: af_can: give variable holding the CAN receiver and the receiver list a sensible name
can: af_can: can_rx_register(): use max() instead of open coding it
can: af_can: can_pernet_exit(): no need to iterate over and cleanup registered CAN devices
can: introduce CAN midlayer private and allocate it automatically
can: make use of preallocated can_ml_priv for per device struct can_dev_rcv_lists
can: af_can: remove NULL-ptr checks from users of can_dev_rcv_lists_find()
Oleksij Rempel (1):
can: af_can: use spin_lock_bh() for &net->can.rcvlists_lock
The j1939 authors (1):
can: add support of SAE J1939 protocol
.mailmap | 1 +
Documentation/networking/index.rst | 1 +
Documentation/networking/j1939.rst | 422 ++++++++
MAINTAINERS | 10 +
drivers/net/can/dev.c | 24 +-
drivers/net/can/slcan.c | 6 +-
drivers/net/can/vcan.c | 7 +-
drivers/net/can/vxcan.c | 4 +-
include/linux/can/can-ml.h | 68 ++
include/linux/can/core.h | 8 +
include/net/netns/can.h | 14 +-
include/uapi/linux/can.h | 20 +-
include/uapi/linux/can/j1939.h | 99 ++
net/can/Kconfig | 2 +
net/can/Makefile | 2 +
net/can/af_can.c | 302 +++---
net/can/af_can.h | 19 +-
net/can/bcm.c | 4 +-
net/can/j1939/Kconfig | 15 +
net/can/j1939/Makefile | 10 +
net/can/j1939/address-claim.c | 230 ++++
net/can/j1939/bus.c | 333 ++++++
net/can/j1939/j1939-priv.h | 338 ++++++
net/can/j1939/main.c | 403 +++++++
net/can/j1939/socket.c | 1160 +++++++++++++++++++++
net/can/j1939/transport.c | 2027 ++++++++++++++++++++++++++++++++++++
net/can/proc.c | 163 +--
net/can/raw.c | 4 +-
28 files changed, 5398 insertions(+), 298 deletions(-)
create mode 100644 Documentation/networking/j1939.rst
create mode 100644 include/linux/can/can-ml.h
create mode 100644 include/uapi/linux/can/j1939.h
create mode 100644 net/can/j1939/Kconfig
create mode 100644 net/can/j1939/Makefile
create mode 100644 net/can/j1939/address-claim.c
create mode 100644 net/can/j1939/bus.c
create mode 100644 net/can/j1939/j1939-priv.h
create mode 100644 net/can/j1939/main.c
create mode 100644 net/can/j1939/socket.c
create mode 100644 net/can/j1939/transport.c
--
Pengutronix e.K. | Marc Kleine-Budde |
Industrial Linux Solutions | Phone: +49-231-2826-924 |
Vertretung West/Dortmund | Fax: +49-5121-206917-5555 |-
Amtsgericht Hildesheim, HRA 2686 | http://www.pengutronix.de |
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]
^ permalink raw reply
* Re: [PATCH] net/skbuff: silence warnings under memory pressure
From: Qian Cai @ 2019-09-04 12:28 UTC (permalink / raw)
To: Michal Hocko
Cc: Sergey Senozhatsky, Eric Dumazet, davem, netdev, linux-mm,
linux-kernel, Petr Mladek, Sergey Senozhatsky, Steven Rostedt
In-Reply-To: <20190904120707.GU3838@dhcp22.suse.cz>
On Wed, 2019-09-04 at 14:07 +0200, Michal Hocko wrote:
> On Wed 04-09-19 07:59:17, Qian Cai wrote:
> > On Wed, 2019-09-04 at 10:25 +0200, Michal Hocko wrote:
> > > On Wed 04-09-19 16:00:42, Sergey Senozhatsky wrote:
> > > > On (09/04/19 15:41), Sergey Senozhatsky wrote:
> > > > > But the thing is different in case of dump_stack() + show_mem() +
> > > > > some other output. Because now we ratelimit not a single printk()
> > > > > line,
> > > > > but hundreds of them. The ratelimit becomes - 10 * $$$ lines in 5
> > > > > seconds
> > > > > (IOW, now we talk about thousands of lines).
> > > >
> > > > And on devices with slow serial consoles this can be somewhat close to
> > > > "no ratelimit". *Suppose* that warn_alloc() adds 700 lines each time.
> > > > Within 5 seconds we can call warn_alloc() 10 times, which will add 7000
> > > > lines to the logbuf. If printk() can evict only 6000 lines in 5 seconds
> > > > then we have a growing number of pending logbuf messages.
> > >
> > > Yes, ratelimit is problematic when the ratelimited operation is slow. I
> > > guess that is a well known problem and we would need to rework both the
> > > api and the implementation to make it work in those cases as well.
> > > Essentially we need to make the ratelimit act as a gatekeeper to an
> > > operation section - something like a critical section except you can
> > > tolerate more code executions but not too many. So effectively
> > >
> > > start_throttle(rate, number);
> > > /* here goes your operation */
> > > end_throttle();
> > >
> > > one operation is not considered done until the whole section ends.
> > > Or something along those lines.
> > >
> > > In this particular case we can increase the rate limit parameters of
> > > course but I think that longterm we need a better api.
> >
> > The problem is when a system is under heavy memory pressure, everything is
> > becoming slower, so I don't know how to come up with a sane default for rate
> > limit parameters as a generic solution that would work for every machine out
> > there. Sure, it is possible to set a limit as low as possible that would
> > work
> > for the majority of systems apart from people may complain that they are now
> > missing important warnings, but using __GFP_NOWARN in this code would work
> > for
> > all systems. You could even argument there is even a separate benefit that
> > it
> > could reduce the noise-level overall from those build_skb() allocation
> > failures
> > as it has a fall-back mechanism anyway.
>
> As Vlastimil already pointed out, __GFP_NOWARN would hide that reserves
> might be configured too low.
Tune "min_free_kbytes" is also an unreliable solution and situational as the
same reason mentioned previously. It may also need a lot of testing to find out
the right value of it on one particular system.
"
When there is a heavy memory pressure, the system is trying hard to reclaim
memory to fill up the watermark. However, the IO is slow to page out, but the
memory pressure keep draining atomic reservoir, and some of those skb_build()
will fail eventually.
Only if there is a fast IO, it will finish swapping sooner and then invoke the
OOM to end the memory pressure.
"
It also have a drawback that "waste" precious memory resources, as allocations
other than GPF_ATOMIC are unable to use those reserved memory anymore.
^ permalink raw reply
* Re: RE: Is bug 200755 in anyone's queue??
From: Mark KEATON @ 2019-09-04 12:00 UTC (permalink / raw)
To: Steve Zabele, Willem de Bruijn
Cc: Eric Dumazet, Network Development, shum@canndrew.org,
vladimir116@gmail.com, saifi.khan@strikr.in, Daniel Borkmann,
on2k16nm@gmail.com, Stephen Hemminger
In-Reply-To: <00aa01d5630b$7e062660$7a127320$@net>
Hi Willem,
I am the person who commented on the original bug report in bugzilla.
In communicating with Steve just now about possible solutions that maintain the efficiency that you are after, what would you think of the following: keep two lists of UDP sockets, those connected and those not connected, and always searching the connected list first. If the connected list is empty, then the lookup can quickly use the not connected list to find a socket for load balancing. If there are connected sockets, then only those connected sockets are searched first for an exact match.
Another option might be to do it with a single list if the connected sockets are all at the beginning of the list. This would require the two separate lookups to start at different points in the list.
Thoughts?
Thanks!
Mark
> On Sep 4, 2019, at 6:28 AM, Steve Zabele <zabele@comcast.net> wrote:
>
> Hi Willem,
>
> Thanks for continuing to poke at this, much appreciated!
>
>> As for the BPF program: good point on accessing the udp port when
>> skb->data is already beyond the header.
>
>> Programs of type sk_filter can use bpf_skb_load_bytes(_relative).
>> Which I think will work, but have not tested.
>
> Please note that the test code was intentionally set up to make testing as simple as possible. Hence the source addresses for the multiple UDP sessions were identical -- but that is not the general case. In the general case a connected and bound socket should be associated with exactly one five tuple (source and dest addresses, source and destination ports, and protocol.
>
> So a 'connect bpf' would actually need access to the IP addresses as well, not just the ports. To do this, the load bytes call required negative arguments, which failed miserably when we tried it.
>
> In any event, there remains the issue of figuring out which index to return when a match is detected since the index is not the same as the file descriptor value and in fact can change as file descriptors are added and deleted. If I understand the kernel mechanism correctly, the operation is something like this. When you add the first one, its assigned to the first slot; when you add the second its assigned to the second slot; when you delete the first one, the second is moved to the first slot) so tracking this requires figuring out the order stored in the socket array within the kernel, and updating the bpf whenever something changes. I don't know if it's even possible to query which slot a given
>
> So we think handling this with a bpf is really not viable.
>
> One thing worth mentioning is that the connect mechanism here is meant to (at least used to) work the same as connect does with TCP. Bind sets the expected/required local address and port; connect sets the expected/required remote address and port -- so a socket file descriptor becomes associated with exactly one five-tuple. That's how it's worked for several decades anyway.
>
> Thanks again!!!
>
> Steve
>
> -----Original Message-----
> From: Willem de Bruijn [mailto:willemdebruijn.kernel@gmail.com]
> Sent: Tuesday, September 03, 2019 1:56 PM
> Cc: Eric Dumazet; Steve Zabele; Network Development; shum@canndrew.org; vladimir116@gmail.com; saifi.khan@strikr.in; Daniel Borkmann; on2k16nm@gmail.com; Stephen Hemminger
> Subject: Re: Is bug 200755 in anyone's queue??
>
> On Fri, Aug 30, 2019 at 4:30 PM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
>>
>> On Fri, Aug 30, 2019 at 4:54 AM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>>
>>>
>>>
>>> On 8/29/19 9:26 PM, Willem de Bruijn wrote:
>>>
>>>> SO_REUSEPORT was not intended to be used in this way. Opening
>>>> multiple connected sockets with the same local port.
>>>>
>>>> But since the interface allowed connect after joining a group, and
>>>> that is being used, I guess that point is moot. Still, I'm a bit
>>>> surprised that it ever worked as described.
>>>>
>>>> Also note that the default distribution algorithm is not round robin
>>>> assignment, but hash based. So multiple consecutive datagrams arriving
>>>> at the same socket is not unexpected.
>>>>
>>>> I suspect that this quick hack might "work". It seemed to on the
>>>> supplied .c file:
>>>>
>>>> score = compute_score(sk, net, saddr, sport,
>>>> daddr, hnum, dif, sdif);
>>>> if (score > badness) {
>>>> - if (sk->sk_reuseport) {
>>>> + if (sk->sk_reuseport && !sk->sk_state !=
>>>> TCP_ESTABLISHED) {
>>
>> This won't work for a mix of connected and connectionless sockets, of
>> course (even ignoring the typo), as it only skips reuseport on the
>> connected sockets.
>>
>>>>
>>>> But a more robust approach, that also works on existing kernels, is to
>>>> swap the default distribution algorithm with a custom BPF based one (
>>>> SO_ATTACH_REUSEPORT_EBPF).
>>>>
>>>
>>> Yes, I suspect that reuseport could still be used by to load-balance incoming packets
>>> targetting the same 4-tuple.
>>>
>>> So all sockets would have the same score, and we would select the first socket in
>>> the list (if not applying reuseport hashing)
>>
>> Can you elaborate a bit?
>>
>> One option I see is to record in struct sock_reuseport if any port in
>> the group is connected and, if so, don't return immediately on the
>> first reuseport_select_sock hit, but continue the search for a higher
>> scoring connected socket.
>>
>> Or do return immediately, but do this refined search in
>> reuseport_select_sock itself, as it has a reference to all sockets in the
>> group in sock_reuseport->socks[]. Instead of the straightforward hash.
>
> That won't work, as reuseport_select_sock does not have access to
> protocol specific data, notably inet_dport.
>
> Unfortunately, what I've come up with so far is not concise and slows
> down existing reuseport lookup in a busy port table slot. Note that it
> is needed for both ipv4 and ipv6.
>
> Do not break out of the port table slot early, but continue to search
> for a higher scored match even after matching a reuseport:
>
> "
> @@ -413,28 +413,39 @@ static struct sock *udp4_lib_lookup2(struct net *net,
> struct udp_hslot *hslot2,
> struct sk_buff *skb)
> {
> + struct sock *reuseport_result = NULL;
> struct sock *sk, *result;
> + int reuseport_score = 0;
> int score, badness;
> u32 hash = 0;
>
> result = NULL;
> badness = 0;
> udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
> score = compute_score(sk, net, saddr, sport,
> daddr, hnum, dif, sdif);
> if (score > badness) {
> - if (sk->sk_reuseport) {
> + if (sk->sk_reuseport &&
> + sk->sk_state != TCP_ESTABLISHED &&
> + !reuseport_result) {
> hash = udp_ehashfn(net, daddr, hnum,
> saddr, sport);
> - result = reuseport_select_sock(sk, hash, skb,
> + reuseport_result =
> reuseport_select_sock(sk, hash, skb,
> sizeof(struct udphdr));
> - if (result)
> - return result;
> + if (reuseport_result)
> + reuseport_score = score;
> + continue;
> }
> badness = score;
> result = sk;
> }
> }
> +
> + if (badness < reuseport_score)
> + result = reuseport_result;
> +
> return result;
> "
>
> To break out after the first reuseport hit when it is safe, i.e., when
> it holds no connected sockets, requires adding this state to struct
> reuseport_sock at __ip4_datagram_connect. And modify
> reuseport_select_sock to read this. At least, I have not found a more
> elegant solution.
>
>> Steve, Re: your point on a scalable QUIC server. That is an
>> interesting case certainly. Opening a connected socket per flow adds
>> both memory and port table pressure. I once looked into an SO_TXONLY
>> udp socket option that does not hash connected sockets into the port
>> table. In effect receiving on a small set of listening sockets (e.g.,
>> one per cpu) and sending over separate tx-only sockets. That still
>> introduces unnecessary memory allocation. OTOH it amortizes some
>> operations, such as route lookup.
>>
>> Anyway, that does not fix the immediate issue you reported when using
>> SO_REUSEPORT as described.
>
> As for the BPF program: good point on accessing the udp port when
> skb->data is already beyond the header.
>
> Programs of type sk_filter can use bpf_skb_load_bytes(_relative).
> Which I think will work, but have not tested.
>
> As of kernel 4.19 programs of type BPF_PROG_TYPE_SK_REUSEPORT can be
> attached (with CAP_SYS_ADMIN). See
> tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c for an
> example that parses udp headers with bpf_skb_load_bytes.
>
^ permalink raw reply
* Re: Is bug 200755 in anyone's queue??
From: Eric Dumazet @ 2019-09-04 12:23 UTC (permalink / raw)
To: Mark KEATON, Steve Zabele, Willem de Bruijn
Cc: Network Development, shum@canndrew.org, vladimir116@gmail.com,
saifi.khan@strikr.in, Daniel Borkmann, on2k16nm@gmail.com,
Stephen Hemminger
In-Reply-To: <4242994D-E2CF-499A-848A-7B14CE536E33@raytheon.com>
On 9/4/19 2:00 PM, Mark KEATON wrote:
> Hi Willem,
>
> I am the person who commented on the original bug report in bugzilla.
>
> In communicating with Steve just now about possible solutions that maintain the efficiency that you are after, what would you think of the following: keep two lists of UDP sockets, those connected and those not connected, and always searching the connected list first.
This was my suggestion.
Note that this requires adding yet another hash table, and yet another lookup
(another cache line miss per incoming packet)
This lookup will slow down DNS and QUIC servers, or any application solely using not connected sockets.
The word 'quick' you use is slightly misleading, since a change like that is a trade off.
Some applications might become faster, while others become slower.
Another issue is that a connect() can follow a bind(), we would need to rehash sockets
from one table to another. (Or add another set of anchors in UDP sockets, so that sockets can be in all the hash tables)
If the connected list is empty, then the lookup can quickly use the not connected list to find a socket for load balancing. If there are connected sockets, then only those connected sockets are searched first for an exact match.
>
> Another option might be to do it with a single list if the connected sockets are all at the beginning of the list. This would require the two separate lookups to start at different points in the list.
>
> Thoughts?
>
> Thanks!
> Mark
>
>
>> On Sep 4, 2019, at 6:28 AM, Steve Zabele <zabele@comcast.net> wrote:
>>
>> Hi Willem,
>>
>> Thanks for continuing to poke at this, much appreciated!
>>
>>> As for the BPF program: good point on accessing the udp port when
>>> skb->data is already beyond the header.
>>
>>> Programs of type sk_filter can use bpf_skb_load_bytes(_relative).
>>> Which I think will work, but have not tested.
>>
>> Please note that the test code was intentionally set up to make testing as simple as possible. Hence the source addresses for the multiple UDP sessions were identical -- but that is not the general case. In the general case a connected and bound socket should be associated with exactly one five tuple (source and dest addresses, source and destination ports, and protocol.
>>
>> So a 'connect bpf' would actually need access to the IP addresses as well, not just the ports. To do this, the load bytes call required negative arguments, which failed miserably when we tried it.
>>
>> In any event, there remains the issue of figuring out which index to return when a match is detected since the index is not the same as the file descriptor value and in fact can change as file descriptors are added and deleted. If I understand the kernel mechanism correctly, the operation is something like this. When you add the first one, its assigned to the first slot; when you add the second its assigned to the second slot; when you delete the first one, the second is moved to the first slot) so tracking this requires figuring out the order stored in the socket array within the kernel, and updating the bpf whenever something changes. I don't know if it's even possible to query which slot a given
>>
>> So we think handling this with a bpf is really not viable.
>>
>> One thing worth mentioning is that the connect mechanism here is meant to (at least used to) work the same as connect does with TCP. Bind sets the expected/required local address and port; connect sets the expected/required remote address and port -- so a socket file descriptor becomes associated with exactly one five-tuple. That's how it's worked for several decades anyway.
>>
>> Thanks again!!!
>>
>> Steve
>>
>> -----Original Message-----
>> From: Willem de Bruijn [mailto:willemdebruijn.kernel@gmail.com]
>> Sent: Tuesday, September 03, 2019 1:56 PM
>> Cc: Eric Dumazet; Steve Zabele; Network Development; shum@canndrew.org; vladimir116@gmail.com; saifi.khan@strikr.in; Daniel Borkmann; on2k16nm@gmail.com; Stephen Hemminger
>> Subject: Re: Is bug 200755 in anyone's queue??
>>
>> On Fri, Aug 30, 2019 at 4:30 PM Willem de Bruijn
>> <willemdebruijn.kernel@gmail.com> wrote:
>>>
>>> On Fri, Aug 30, 2019 at 4:54 AM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>>>
>>>>
>>>>
>>>> On 8/29/19 9:26 PM, Willem de Bruijn wrote:
>>>>
>>>>> SO_REUSEPORT was not intended to be used in this way. Opening
>>>>> multiple connected sockets with the same local port.
>>>>>
>>>>> But since the interface allowed connect after joining a group, and
>>>>> that is being used, I guess that point is moot. Still, I'm a bit
>>>>> surprised that it ever worked as described.
>>>>>
>>>>> Also note that the default distribution algorithm is not round robin
>>>>> assignment, but hash based. So multiple consecutive datagrams arriving
>>>>> at the same socket is not unexpected.
>>>>>
>>>>> I suspect that this quick hack might "work". It seemed to on the
>>>>> supplied .c file:
>>>>>
>>>>> score = compute_score(sk, net, saddr, sport,
>>>>> daddr, hnum, dif, sdif);
>>>>> if (score > badness) {
>>>>> - if (sk->sk_reuseport) {
>>>>> + if (sk->sk_reuseport && !sk->sk_state !=
>>>>> TCP_ESTABLISHED) {
>>>
>>> This won't work for a mix of connected and connectionless sockets, of
>>> course (even ignoring the typo), as it only skips reuseport on the
>>> connected sockets.
>>>
>>>>>
>>>>> But a more robust approach, that also works on existing kernels, is to
>>>>> swap the default distribution algorithm with a custom BPF based one (
>>>>> SO_ATTACH_REUSEPORT_EBPF).
>>>>>
>>>>
>>>> Yes, I suspect that reuseport could still be used by to load-balance incoming packets
>>>> targetting the same 4-tuple.
>>>>
>>>> So all sockets would have the same score, and we would select the first socket in
>>>> the list (if not applying reuseport hashing)
>>>
>>> Can you elaborate a bit?
>>>
>>> One option I see is to record in struct sock_reuseport if any port in
>>> the group is connected and, if so, don't return immediately on the
>>> first reuseport_select_sock hit, but continue the search for a higher
>>> scoring connected socket.
>>>
>>> Or do return immediately, but do this refined search in
>>> reuseport_select_sock itself, as it has a reference to all sockets in the
>>> group in sock_reuseport->socks[]. Instead of the straightforward hash.
>>
>> That won't work, as reuseport_select_sock does not have access to
>> protocol specific data, notably inet_dport.
>>
>> Unfortunately, what I've come up with so far is not concise and slows
>> down existing reuseport lookup in a busy port table slot. Note that it
>> is needed for both ipv4 and ipv6.
>>
>> Do not break out of the port table slot early, but continue to search
>> for a higher scored match even after matching a reuseport:
>>
>> "
>> @@ -413,28 +413,39 @@ static struct sock *udp4_lib_lookup2(struct net *net,
>> struct udp_hslot *hslot2,
>> struct sk_buff *skb)
>> {
>> + struct sock *reuseport_result = NULL;
>> struct sock *sk, *result;
>> + int reuseport_score = 0;
>> int score, badness;
>> u32 hash = 0;
>>
>> result = NULL;
>> badness = 0;
>> udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
>> score = compute_score(sk, net, saddr, sport,
>> daddr, hnum, dif, sdif);
>> if (score > badness) {
>> - if (sk->sk_reuseport) {
>> + if (sk->sk_reuseport &&
>> + sk->sk_state != TCP_ESTABLISHED &&
>> + !reuseport_result) {
>> hash = udp_ehashfn(net, daddr, hnum,
>> saddr, sport);
>> - result = reuseport_select_sock(sk, hash, skb,
>> + reuseport_result =
>> reuseport_select_sock(sk, hash, skb,
>> sizeof(struct udphdr));
>> - if (result)
>> - return result;
>> + if (reuseport_result)
>> + reuseport_score = score;
>> + continue;
>> }
>> badness = score;
>> result = sk;
>> }
>> }
>> +
>> + if (badness < reuseport_score)
>> + result = reuseport_result;
>> +
>> return result;
>> "
>>
>> To break out after the first reuseport hit when it is safe, i.e., when
>> it holds no connected sockets, requires adding this state to struct
>> reuseport_sock at __ip4_datagram_connect. And modify
>> reuseport_select_sock to read this. At least, I have not found a more
>> elegant solution.
>>
>>> Steve, Re: your point on a scalable QUIC server. That is an
>>> interesting case certainly. Opening a connected socket per flow adds
>>> both memory and port table pressure. I once looked into an SO_TXONLY
>>> udp socket option that does not hash connected sockets into the port
>>> table. In effect receiving on a small set of listening sockets (e.g.,
>>> one per cpu) and sending over separate tx-only sockets. That still
>>> introduces unnecessary memory allocation. OTOH it amortizes some
>>> operations, such as route lookup.
>>>
>>> Anyway, that does not fix the immediate issue you reported when using
>>> SO_REUSEPORT as described.
>>
>> As for the BPF program: good point on accessing the udp port when
>> skb->data is already beyond the header.
>>
>> Programs of type sk_filter can use bpf_skb_load_bytes(_relative).
>> Which I think will work, but have not tested.
>>
>> As of kernel 4.19 programs of type BPF_PROG_TYPE_SK_REUSEPORT can be
>> attached (with CAP_SYS_ADMIN). See
>> tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c for an
>> example that parses udp headers with bpf_skb_load_bytes.
>>
^ permalink raw reply
* Re: [PATCH bpf 2/2] libbpf: remove dependency on barrier.h in xsk.h
From: Magnus Karlsson @ 2019-09-04 12:21 UTC (permalink / raw)
To: Yauheni Kaliuta; +Cc: Magnus Karlsson, bpf, Network Development
In-Reply-To: <xuny36hc6ypx.fsf@redhat.com>
On Wed, Sep 4, 2019 at 2:19 PM Yauheni Kaliuta
<yauheni.kaliuta@redhat.com> wrote:
>
> Hi, Magnus!
>
> >>>>> On Wed, 4 Sep 2019 12:25:13 +0200, Magnus Karlsson wrote:
> > On Wed, Sep 4, 2019 at 8:56 AM Yauheni Kaliuta
> > <yauheni.kaliuta@redhat.com> wrote:
> >>
> >> Hi, Magnus!
> >>
> >> >>>>> On Wed, 4 Sep 2019 08:39:24 +0200, Magnus Karlsson wrote:
> >>
> >> > On Wed, Sep 4, 2019 at 7:32 AM Yauheni Kaliuta
> >> > <yauheni.kaliuta@redhat.com> wrote:
> >> >>
> >> >> Hi, Magnus!
> >> >>
> >> >> >>>>> On Tue, 9 Apr 2019 08:44:13 +0200, Magnus Karlsson wrote:
> >> >>
> >> >> > The use of smp_rmb() and smp_wmb() creates a Linux header dependency
> >> >> > on barrier.h that is uneccessary in most parts. This patch implements
> >> >> > the two small defines that are needed from barrier.h. As a bonus, the
> >> >> > new implementations are faster than the default ones as they default
> >> >> > to sfence and lfence for x86, while we only need a compiler barrier in
> >> >> > our case. Just as it is when the same ring access code is compiled in
> >> >> > the kernel.
> >> >>
> >> >> > Fixes: 1cad07884239 ("libbpf: add support for using AF_XDP sockets")
> >> >> > Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
> >> >> > ---
> >> >> > tools/lib/bpf/xsk.h | 19 +++++++++++++++++--
> >> >> > 1 file changed, 17 insertions(+), 2 deletions(-)
> >> >>
> >> >> > diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h
> >> >> > index 3638147..317b44f 100644
> >> >> > --- a/tools/lib/bpf/xsk.h
> >> >> > +++ b/tools/lib/bpf/xsk.h
> >> >> > @@ -39,6 +39,21 @@ DEFINE_XSK_RING(xsk_ring_cons);
> >> >> > struct xsk_umem;
> >> >> > struct xsk_socket;
> >> >>
> >> >> > +#if !defined bpf_smp_rmb && !defined bpf_smp_wmb
> >> >> > +# if defined(__i386__) || defined(__x86_64__)
> >> >> > +# define bpf_smp_rmb() asm volatile("" : : : "memory")
> >> >> > +# define bpf_smp_wmb() asm volatile("" : : : "memory")
> >> >> > +# elif defined(__aarch64__)
> >> >> > +# define bpf_smp_rmb() asm volatile("dmb ishld" : : : "memory")
> >> >> > +# define bpf_smp_wmb() asm volatile("dmb ishst" : : : "memory")
> >> >> > +# elif defined(__arm__)
> >> >> > +# define bpf_smp_rmb() asm volatile("dmb ish" : : : "memory")
> >> >> > +# define bpf_smp_wmb() asm volatile("dmb ishst" : : : "memory")
> >> >> > +# else
> >> >> > +# error Architecture not supported by the XDP socket code in libbpf.
> >> >> > +# endif
> >> >> > +#endif
> >> >> > +
> >> >>
> >> >> What about other architectures then?
> >>
> >> > AF_XDP has not been tested on anything else, as far as I
> >> > know. But contributions that extend it to more archs are
> >> > very welcome.
> >>
> >> Well, I'll may be try to fetch something from barrier.h's
> >> (since I cannot consider myself as a specialist in the area),
> >> but at the moment the patch breaks the build on that arches.
>
> > Do you have a specific architecture in mind and do you have
> > some board/server (of that architecture) you could test AF_XDP
> > on?
>
> I do care about s390 and ppc64 and I can run tests for them.
Perfect!. Thanks.
/Magnus
>
> [...]
>
> --
> WBR,
> Yauheni Kaliuta
^ permalink raw reply
* Re: [PATCH bpf 2/2] libbpf: remove dependency on barrier.h in xsk.h
From: Yauheni Kaliuta @ 2019-09-04 12:19 UTC (permalink / raw)
To: Magnus Karlsson; +Cc: Magnus Karlsson, bpf, Network Development
In-Reply-To: <CAJ8uoz3jhr+VUmtjotW07mnDkYLgOYYO2HpV9hOv3i8B4=Z_CQ@mail.gmail.com>
Hi, Magnus!
>>>>> On Wed, 4 Sep 2019 12:25:13 +0200, Magnus Karlsson wrote:
> On Wed, Sep 4, 2019 at 8:56 AM Yauheni Kaliuta
> <yauheni.kaliuta@redhat.com> wrote:
>>
>> Hi, Magnus!
>>
>> >>>>> On Wed, 4 Sep 2019 08:39:24 +0200, Magnus Karlsson wrote:
>>
>> > On Wed, Sep 4, 2019 at 7:32 AM Yauheni Kaliuta
>> > <yauheni.kaliuta@redhat.com> wrote:
>> >>
>> >> Hi, Magnus!
>> >>
>> >> >>>>> On Tue, 9 Apr 2019 08:44:13 +0200, Magnus Karlsson wrote:
>> >>
>> >> > The use of smp_rmb() and smp_wmb() creates a Linux header dependency
>> >> > on barrier.h that is uneccessary in most parts. This patch implements
>> >> > the two small defines that are needed from barrier.h. As a bonus, the
>> >> > new implementations are faster than the default ones as they default
>> >> > to sfence and lfence for x86, while we only need a compiler barrier in
>> >> > our case. Just as it is when the same ring access code is compiled in
>> >> > the kernel.
>> >>
>> >> > Fixes: 1cad07884239 ("libbpf: add support for using AF_XDP sockets")
>> >> > Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
>> >> > ---
>> >> > tools/lib/bpf/xsk.h | 19 +++++++++++++++++--
>> >> > 1 file changed, 17 insertions(+), 2 deletions(-)
>> >>
>> >> > diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h
>> >> > index 3638147..317b44f 100644
>> >> > --- a/tools/lib/bpf/xsk.h
>> >> > +++ b/tools/lib/bpf/xsk.h
>> >> > @@ -39,6 +39,21 @@ DEFINE_XSK_RING(xsk_ring_cons);
>> >> > struct xsk_umem;
>> >> > struct xsk_socket;
>> >>
>> >> > +#if !defined bpf_smp_rmb && !defined bpf_smp_wmb
>> >> > +# if defined(__i386__) || defined(__x86_64__)
>> >> > +# define bpf_smp_rmb() asm volatile("" : : : "memory")
>> >> > +# define bpf_smp_wmb() asm volatile("" : : : "memory")
>> >> > +# elif defined(__aarch64__)
>> >> > +# define bpf_smp_rmb() asm volatile("dmb ishld" : : : "memory")
>> >> > +# define bpf_smp_wmb() asm volatile("dmb ishst" : : : "memory")
>> >> > +# elif defined(__arm__)
>> >> > +# define bpf_smp_rmb() asm volatile("dmb ish" : : : "memory")
>> >> > +# define bpf_smp_wmb() asm volatile("dmb ishst" : : : "memory")
>> >> > +# else
>> >> > +# error Architecture not supported by the XDP socket code in libbpf.
>> >> > +# endif
>> >> > +#endif
>> >> > +
>> >>
>> >> What about other architectures then?
>>
>> > AF_XDP has not been tested on anything else, as far as I
>> > know. But contributions that extend it to more archs are
>> > very welcome.
>>
>> Well, I'll may be try to fetch something from barrier.h's
>> (since I cannot consider myself as a specialist in the area),
>> but at the moment the patch breaks the build on that arches.
> Do you have a specific architecture in mind and do you have
> some board/server (of that architecture) you could test AF_XDP
> on?
I do care about s390 and ppc64 and I can run tests for them.
[...]
--
WBR,
Yauheni Kaliuta
^ permalink raw reply
* Re: [PATCH] net/skbuff: silence warnings under memory pressure
From: Qian Cai @ 2019-09-04 12:14 UTC (permalink / raw)
To: Sergey Senozhatsky, Michal Hocko
Cc: Eric Dumazet, davem, netdev, linux-mm, linux-kernel, Petr Mladek,
Sergey Senozhatsky, Steven Rostedt
In-Reply-To: <20190904074312.GA25744@jagdpanzerIV>
On Wed, 2019-09-04 at 16:43 +0900, Sergey Senozhatsky wrote:
> On (09/04/19 16:19), Sergey Senozhatsky wrote:
> > Hmm. I need to look at this more... wake_up_klogd() queues work only once
> > on particular CPU: irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
> >
> > bool irq_work_queue()
> > {
> > /* Only queue if not already pending */
> > if (!irq_work_claim(work))
> > return false;
> >
> > __irq_work_queue_local(work);
> > }
>
> Plus one more check - waitqueue_active(&log_wait). printk() adds
> pending irq_work only if there is a user-space process sleeping on
> log_wait and irq_work is not already scheduled. If the syslog is
> active or there is noone to wakeup then we don't queue irq_work.
Another possibility for this potential livelock is that those printk() from
warn_alloc(), dump_stack() and show_mem() increase the time it needs to process
build_skb() allocation failures significantly under memory pressure. As the
result, ksoftirqd() could be rescheduled during that time via a different CPU
(this is a large x86 NUMA system anyway),
[83605.577256][ C31] run_ksoftirqd+0x1f/0x40
[83605.577256][ C31] smpboot_thread_fn+0x255/0x440
[83605.577256][ C31] kthread+0x1df/0x200
[83605.577256][ C31] ret_from_fork+0x35/0x40
In addition, those printk() will deal with console drivers or even a networking
console, so it is probably not unusual that it could call irq_exit()-
>__do_softirq() at one point and then this livelock.
^ permalink raw reply
* Re: [PATCH] net/skbuff: silence warnings under memory pressure
From: Michal Hocko @ 2019-09-04 12:07 UTC (permalink / raw)
To: Qian Cai
Cc: Sergey Senozhatsky, Eric Dumazet, davem, netdev, linux-mm,
linux-kernel, Petr Mladek, Sergey Senozhatsky, Steven Rostedt
In-Reply-To: <1567598357.5576.70.camel@lca.pw>
On Wed 04-09-19 07:59:17, Qian Cai wrote:
> On Wed, 2019-09-04 at 10:25 +0200, Michal Hocko wrote:
> > On Wed 04-09-19 16:00:42, Sergey Senozhatsky wrote:
> > > On (09/04/19 15:41), Sergey Senozhatsky wrote:
> > > > But the thing is different in case of dump_stack() + show_mem() +
> > > > some other output. Because now we ratelimit not a single printk() line,
> > > > but hundreds of them. The ratelimit becomes - 10 * $$$ lines in 5 seconds
> > > > (IOW, now we talk about thousands of lines).
> > >
> > > And on devices with slow serial consoles this can be somewhat close to
> > > "no ratelimit". *Suppose* that warn_alloc() adds 700 lines each time.
> > > Within 5 seconds we can call warn_alloc() 10 times, which will add 7000
> > > lines to the logbuf. If printk() can evict only 6000 lines in 5 seconds
> > > then we have a growing number of pending logbuf messages.
> >
> > Yes, ratelimit is problematic when the ratelimited operation is slow. I
> > guess that is a well known problem and we would need to rework both the
> > api and the implementation to make it work in those cases as well.
> > Essentially we need to make the ratelimit act as a gatekeeper to an
> > operation section - something like a critical section except you can
> > tolerate more code executions but not too many. So effectively
> >
> > start_throttle(rate, number);
> > /* here goes your operation */
> > end_throttle();
> >
> > one operation is not considered done until the whole section ends.
> > Or something along those lines.
> >
> > In this particular case we can increase the rate limit parameters of
> > course but I think that longterm we need a better api.
>
> The problem is when a system is under heavy memory pressure, everything is
> becoming slower, so I don't know how to come up with a sane default for rate
> limit parameters as a generic solution that would work for every machine out
> there. Sure, it is possible to set a limit as low as possible that would work
> for the majority of systems apart from people may complain that they are now
> missing important warnings, but using __GFP_NOWARN in this code would work for
> all systems. You could even argument there is even a separate benefit that it
> could reduce the noise-level overall from those build_skb() allocation failures
> as it has a fall-back mechanism anyway.
As Vlastimil already pointed out, __GFP_NOWARN would hide that reserves
might be configured too low.
--
Michal Hocko
SUSE Labs
^ permalink raw reply
* Re: [PATCH] net/skbuff: silence warnings under memory pressure
From: Qian Cai @ 2019-09-04 11:59 UTC (permalink / raw)
To: Michal Hocko, Sergey Senozhatsky
Cc: Eric Dumazet, davem, netdev, linux-mm, linux-kernel, Petr Mladek,
Sergey Senozhatsky, Steven Rostedt
In-Reply-To: <20190904082540.GI3838@dhcp22.suse.cz>
On Wed, 2019-09-04 at 10:25 +0200, Michal Hocko wrote:
> On Wed 04-09-19 16:00:42, Sergey Senozhatsky wrote:
> > On (09/04/19 15:41), Sergey Senozhatsky wrote:
> > > But the thing is different in case of dump_stack() + show_mem() +
> > > some other output. Because now we ratelimit not a single printk() line,
> > > but hundreds of them. The ratelimit becomes - 10 * $$$ lines in 5 seconds
> > > (IOW, now we talk about thousands of lines).
> >
> > And on devices with slow serial consoles this can be somewhat close to
> > "no ratelimit". *Suppose* that warn_alloc() adds 700 lines each time.
> > Within 5 seconds we can call warn_alloc() 10 times, which will add 7000
> > lines to the logbuf. If printk() can evict only 6000 lines in 5 seconds
> > then we have a growing number of pending logbuf messages.
>
> Yes, ratelimit is problematic when the ratelimited operation is slow. I
> guess that is a well known problem and we would need to rework both the
> api and the implementation to make it work in those cases as well.
> Essentially we need to make the ratelimit act as a gatekeeper to an
> operation section - something like a critical section except you can
> tolerate more code executions but not too many. So effectively
>
> start_throttle(rate, number);
> /* here goes your operation */
> end_throttle();
>
> one operation is not considered done until the whole section ends.
> Or something along those lines.
>
> In this particular case we can increase the rate limit parameters of
> course but I think that longterm we need a better api.
The problem is when a system is under heavy memory pressure, everything is
becoming slower, so I don't know how to come up with a sane default for rate
limit parameters as a generic solution that would work for every machine out
there. Sure, it is possible to set a limit as low as possible that would work
for the majority of systems apart from people may complain that they are now
missing important warnings, but using __GFP_NOWARN in this code would work for
all systems. You could even argument there is even a separate benefit that it
could reduce the noise-level overall from those build_skb() allocation failures
as it has a fall-back mechanism anyway.
^ permalink raw reply
* Re: net: hsr: remove a redundant null check before kfree_skb
From: Markus Elfring @ 2019-09-04 11:55 UTC (permalink / raw)
To: zhong jiang, Arvid Brodin, David S. Miller, netdev
Cc: linux-kernel, kernel-janitors
In-Reply-To: <1567566558-7764-1-git-send-email-zhongjiang@huawei.com>
> kfree_skb has taken the null pointer into account.
I suggest to take another look also at information around
a similar update suggestion.
net-hsr: Delete unnecessary checks before the function call "kfree_skb"
https://lkml.org/lkml/2015/11/14/120
https://lore.kernel.org/patchwork/patch/617878/
https://lore.kernel.org/r/5647A77E.6040501@users.sourceforge.net/
https://lkml.org/lkml/2015/11/24/433
https://lore.kernel.org/r/56546951.9080101@alten.se/
Regards,
Markus
^ permalink raw reply
* [PATCH bpf-next v3 4/4] xsk: lock the control mutex in sock_diag interface
From: Björn Töpel @ 2019-09-04 11:49 UTC (permalink / raw)
To: ast, daniel, netdev
Cc: Björn Töpel, magnus.karlsson, magnus.karlsson, bpf,
jonathan.lemon, syzbot+c82697e3043781e08802, hdanton, i.maximets
In-Reply-To: <20190904114913.17217-1-bjorn.topel@gmail.com>
From: Björn Töpel <bjorn.topel@intel.com>
When accessing the members of an XDP socket, the control mutex should
be held. This commit fixes that.
Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Fixes: a36b38aa2af6 ("xsk: add sock_diag interface for AF_XDP")
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
net/xdp/xsk_diag.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c
index 9986a759fe06..f59791ba43a0 100644
--- a/net/xdp/xsk_diag.c
+++ b/net/xdp/xsk_diag.c
@@ -97,6 +97,7 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb,
msg->xdiag_ino = sk_ino;
sock_diag_save_cookie(sk, msg->xdiag_cookie);
+ mutex_lock(&xs->mutex);
if ((req->xdiag_show & XDP_SHOW_INFO) && xsk_diag_put_info(xs, nlskb))
goto out_nlmsg_trim;
@@ -117,10 +118,12 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb,
sock_diag_put_meminfo(sk, nlskb, XDP_DIAG_MEMINFO))
goto out_nlmsg_trim;
+ mutex_unlock(&xs->mutex);
nlmsg_end(nlskb, nlh);
return 0;
out_nlmsg_trim:
+ mutex_unlock(&xs->mutex);
nlmsg_cancel(nlskb, nlh);
return -EMSGSIZE;
}
--
2.20.1
^ permalink raw reply related
* [PATCH bpf-next v3 3/4] xsk: use state member for socket synchronization
From: Björn Töpel @ 2019-09-04 11:49 UTC (permalink / raw)
To: ast, daniel, netdev
Cc: Björn Töpel, magnus.karlsson, magnus.karlsson, bpf,
jonathan.lemon, syzbot+c82697e3043781e08802, hdanton, i.maximets
In-Reply-To: <20190904114913.17217-1-bjorn.topel@gmail.com>
From: Björn Töpel <bjorn.topel@intel.com>
Prior the state variable was introduced by Ilya, the dev member was
used to determine whether the socket was bound or not. However, when
dev was read, proper SMP barriers and READ_ONCE were missing. In order
to address the missing barriers and READ_ONCE, we start using the
state variable as a point of synchronization. The state member
read/write is paired with proper SMP barriers, and from this follows
that the members described above does not need READ_ONCE if used in
conjunction with state check.
In all syscalls and the xsk_rcv path we check if state is
XSK_BOUND. If that is the case we do a SMP read barrier, and this
implies that the dev, umem and all rings are correctly setup. Note
that no READ_ONCE are needed for these variable if used when state is
XSK_BOUND (plus the read barrier).
To summarize: The members struct xdp_sock members dev, queue_id, umem,
fq, cq, tx, rx, and state were read lock-less, with incorrect barriers
and missing {READ, WRITE}_ONCE. Now, umem, fq, cq, tx, rx, and state
are read lock-less. When these members are updated, WRITE_ONCE is
used. When read, READ_ONCE are only used when read outside the control
mutex (e.g. mmap) or, not synchronized with the state member
(XSK_BOUND plus smp_rmb())
Note that dev and queue_id do not need a WRITE_ONCE or READ_ONCE, due
to the introduce state synchronization (XSK_BOUND plus smp_rmb()).
Introducing the state check also fixes a race, found by syzcaller, in
xsk_poll() where umem could be accessed when stale.
Suggested-by: Hillf Danton <hdanton@sina.com>
Reported-by: syzbot+c82697e3043781e08802@syzkaller.appspotmail.com
Fixes: 77cd0d7b3f25 ("xsk: add support for need_wakeup flag in AF_XDP rings")
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
net/xdp/xsk.c | 54 +++++++++++++++++++++++++++++++++++++--------------
1 file changed, 39 insertions(+), 15 deletions(-)
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 8c9056f06989..c2f1af3b6a7c 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -186,10 +186,23 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
return err;
}
+static bool xsk_is_bound(struct xdp_sock *xs)
+{
+ if (READ_ONCE(xs->state) == XSK_BOUND) {
+ /* Matches smp_wmb() in bind(). */
+ smp_rmb();
+ return true;
+ }
+ return false;
+}
+
int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
u32 len;
+ if (!xsk_is_bound(xs))
+ return -EINVAL;
+
if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
return -EINVAL;
@@ -387,7 +400,7 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
struct sock *sk = sock->sk;
struct xdp_sock *xs = xdp_sk(sk);
- if (unlikely(!xs->dev))
+ if (unlikely(!xsk_is_bound(xs)))
return -ENXIO;
if (unlikely(!(xs->dev->flags & IFF_UP)))
return -ENETDOWN;
@@ -403,10 +416,15 @@ static unsigned int xsk_poll(struct file *file, struct socket *sock,
struct poll_table_struct *wait)
{
unsigned int mask = datagram_poll(file, sock, wait);
- struct sock *sk = sock->sk;
- struct xdp_sock *xs = xdp_sk(sk);
- struct net_device *dev = xs->dev;
- struct xdp_umem *umem = xs->umem;
+ struct xdp_sock *xs = xdp_sk(sock->sk);
+ struct net_device *dev;
+ struct xdp_umem *umem;
+
+ if (unlikely(!xsk_is_bound(xs)))
+ return mask;
+
+ dev = xs->dev;
+ umem = xs->umem;
if (umem->need_wakeup)
dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id,
@@ -442,10 +460,9 @@ static void xsk_unbind_dev(struct xdp_sock *xs)
{
struct net_device *dev = xs->dev;
- if (!dev || xs->state != XSK_BOUND)
+ if (xs->state != XSK_BOUND)
return;
-
- xs->state = XSK_UNBOUND;
+ WRITE_ONCE(xs->state, XSK_UNBOUND);
/* Wait for driver to stop using the xdp socket. */
xdp_del_sk_umem(xs->umem, xs);
@@ -520,7 +537,9 @@ static int xsk_release(struct socket *sock)
local_bh_enable();
xsk_delete_from_maps(xs);
+ mutex_lock(&xs->mutex);
xsk_unbind_dev(xs);
+ mutex_unlock(&xs->mutex);
xskq_destroy(xs->rx);
xskq_destroy(xs->tx);
@@ -632,12 +651,12 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
}
umem_xs = xdp_sk(sock->sk);
- if (!umem_xs->umem) {
- /* No umem to inherit. */
+ if (!xsk_is_bound(umem_xs)) {
err = -EBADF;
sockfd_put(sock);
goto out_unlock;
- } else if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
+ }
+ if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
err = -EINVAL;
sockfd_put(sock);
goto out_unlock;
@@ -671,10 +690,15 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
xdp_add_sk_umem(xs->umem, xs);
out_unlock:
- if (err)
+ if (err) {
dev_put(dev);
- else
- xs->state = XSK_BOUND;
+ } else {
+ /* Matches smp_rmb() in bind() for shared umem
+ * sockets, and xsk_is_bound().
+ */
+ smp_wmb();
+ WRITE_ONCE(xs->state, XSK_BOUND);
+ }
out_release:
mutex_unlock(&xs->mutex);
rtnl_unlock();
@@ -927,7 +951,7 @@ static int xsk_mmap(struct file *file, struct socket *sock,
unsigned long pfn;
struct page *qpg;
- if (xs->state != XSK_READY)
+ if (READ_ONCE(xs->state) != XSK_READY)
return -EBUSY;
if (offset == XDP_PGOFF_RX_RING) {
--
2.20.1
^ permalink raw reply related
* [PATCH bpf-next v3 2/4] xsk: avoid store-tearing when assigning umem
From: Björn Töpel @ 2019-09-04 11:49 UTC (permalink / raw)
To: ast, daniel, netdev
Cc: Björn Töpel, magnus.karlsson, magnus.karlsson, bpf,
jonathan.lemon, syzbot+c82697e3043781e08802, hdanton, i.maximets
In-Reply-To: <20190904114913.17217-1-bjorn.topel@gmail.com>
From: Björn Töpel <bjorn.topel@intel.com>
The umem member of struct xdp_sock is read outside of the control
mutex, in the mmap implementation, and needs a WRITE_ONCE to avoid
potential store-tearing.
Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Fixes: 423f38329d26 ("xsk: add umem fill queue support and mmap")
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
net/xdp/xsk.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 271d8d3fb11e..8c9056f06989 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -644,7 +644,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
}
xdp_get_umem(umem_xs->umem);
- xs->umem = umem_xs->umem;
+ WRITE_ONCE(xs->umem, umem_xs->umem);
sockfd_put(sock);
} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
err = -EINVAL;
@@ -751,7 +751,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
/* Make sure umem is ready before it can be seen by others */
smp_wmb();
- xs->umem = umem;
+ WRITE_ONCE(xs->umem, umem);
mutex_unlock(&xs->mutex);
return 0;
}
--
2.20.1
^ permalink raw reply related
* [PATCH bpf-next v3 1/4] xsk: avoid store-tearing when assigning queues
From: Björn Töpel @ 2019-09-04 11:49 UTC (permalink / raw)
To: ast, daniel, netdev
Cc: Björn Töpel, magnus.karlsson, magnus.karlsson, bpf,
jonathan.lemon, syzbot+c82697e3043781e08802, hdanton, i.maximets
In-Reply-To: <20190904114913.17217-1-bjorn.topel@gmail.com>
From: Björn Töpel <bjorn.topel@intel.com>
Use WRITE_ONCE when doing the store of tx, rx, fq, and cq, to avoid
potential store-tearing. These members are read outside of the control
mutex in the mmap implementation.
Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Fixes: 37b076933a8e ("xsk: add missing write- and data-dependency barrier")
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
net/xdp/xsk.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 187fd157fcff..271d8d3fb11e 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -434,7 +434,7 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
/* Make sure queue is ready before it can be seen by others */
smp_wmb();
- *queue = q;
+ WRITE_ONCE(*queue, q);
return 0;
}
--
2.20.1
^ permalink raw reply related
* [PATCH bpf-next v3 0/4] xsk: various CPU barrier and {READ, WRITE}_ONCE
From: Björn Töpel @ 2019-09-04 11:49 UTC (permalink / raw)
To: ast, daniel, netdev
Cc: Björn Töpel, magnus.karlsson, magnus.karlsson, bpf,
bjorn.topel, jonathan.lemon, syzbot+c82697e3043781e08802, hdanton,
i.maximets
This is a four patch series of various barrier, {READ, WRITE}_ONCE
cleanups in the AF_XDP socket code. More details can be found in the
corresponding commit message. Previous revisions: v1 [4] and v2 [5].
For an AF_XDP socket, most control plane operations are done under the
control mutex (struct xdp_sock, mutex), but there are some places
where members of the struct is read outside the control mutex. The
dev, queue_id members are set in bind() and cleared at cleanup. The
umem, fq, cq, tx, rx, and state member are all assigned in various
places, e.g. bind() and setsockopt(). When the members are assigned,
they are protected by the control mutex, but since they are read
outside the mutex, a WRITE_ONCE is required to avoid store-tearing on
the read-side.
Prior the state variable was introduced by Ilya, the dev member was
used to determine whether the socket was bound or not. However, when
dev was read, proper SMP barriers and READ_ONCE were missing. In order
to address the missing barriers and READ_ONCE, we start using the
state variable as a point of synchronization. The state member
read/write is paired with proper SMP barriers, and from this follows
that the members described above does not need READ_ONCE statements if
used in conjunction with state check.
To summarize: The members struct xdp_sock members dev, queue_id, umem,
fq, cq, tx, rx, and state were read lock-less, with incorrect barriers
and missing {READ, WRITE}_ONCE. After this series umem, fq, cq, tx,
rx, and state are read lock-less. When these members are updated,
WRITE_ONCE is used. When read, READ_ONCE are only used when read
outside the control mutex (e.g. mmap) or, not synchronized with the
state member (XSK_BOUND plus smp_rmb())
Thanks,
Björn
[1] https://lore.kernel.org/bpf/beef16bb-a09b-40f1-7dd0-c323b4b89b17@iogearbox.net/
[2] https://lwn.net/Articles/793253/
[3] https://github.com/google/ktsan/wiki/READ_ONCE-and-WRITE_ONCE
[4] https://lore.kernel.org/bpf/20190822091306.20581-1-bjorn.topel@gmail.com/
[5] https://lore.kernel.org/bpf/20190826061053.15996-1-bjorn.topel@gmail.com/
v2->v3:
Minor restructure of commits.
Improve cover and commit messages. (Daniel)
v1->v2:
Removed redundant dev check. (Jonathan)
Björn Töpel (4):
xsk: avoid store-tearing when assigning queues
xsk: avoid store-tearing when assigning umem
xsk: use state member for socket synchronization
xsk: lock the control mutex in sock_diag interface
net/xdp/xsk.c | 60 ++++++++++++++++++++++++++++++++--------------
net/xdp/xsk_diag.c | 3 +++
2 files changed, 45 insertions(+), 18 deletions(-)
--
2.20.1
^ permalink raw reply
* Re: [PATCH] rtl_nic: add firmware rtl8125a-3
From: Josh Boyer @ 2019-09-04 11:05 UTC (permalink / raw)
To: Hau
Cc: Heiner Kallweit, Linux Firmware, nic_swsd, netdev@vger.kernel.org,
Hayes Wang
In-Reply-To: <80377ECBC5453840BA8C7155328B5377F227C3A6@RTITMBSVM03.realtek.com.tw>
On Fri, Aug 30, 2019 at 11:56 AM Hau <hau@realtek.com> wrote:
>
> > On 27.08.2019 14:08, Josh Boyer wrote:
> > > On Mon, Aug 26, 2019 at 6:23 PM Heiner Kallweit <hkallweit1@gmail.com>
> > wrote:
> > >>
> > >> This adds firmware rtl8125a-3 for Realtek's 2.5Gbps chip RTL8125.
> > >>
> > >> Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
> > >> ---
> > >> Firmware file was provided by Realtek and they asked me to submit it.
> > >
> > > Can we get a Signed-off-by from someone at Realtek then?
> > >
> > Hi Hau,
> >
> > can you reply and add your Signed-off-by?
> > I saw that all the RTL8168 firmware was submitted by Hayes Wang.
> >
> > > josh
> > >
> > Heiner
> >
>
> Signed-off-by: Chunhao Lin <hau@realtek.com>
Thank you. Applied and pushed out.
josh
>
> > >> The related extension to r8169 driver will be submitted in the next days.
> > >> ---
> > >> WHENCE | 3 +++
> > >> rtl_nic/rtl8125a-3.fw | Bin 0 -> 3456 bytes
> > >> 2 files changed, 3 insertions(+)
> > >> create mode 100644 rtl_nic/rtl8125a-3.fw
> > >>
> > >> diff --git a/WHENCE b/WHENCE
> > >> index fb12924..dbec18a 100644
> > >> --- a/WHENCE
> > >> +++ b/WHENCE
> > >> @@ -2906,6 +2906,9 @@ Version: 0.0.2
> > >> File: rtl_nic/rtl8107e-2.fw
> > >> Version: 0.0.2
> > >>
> > >> +File: rtl_nic/rtl8125a-3.fw
> > >> +Version: 0.0.1
> > >> +
> > >> Licence:
> > >> * Copyright © 2011-2013, Realtek Semiconductor Corporation
> > >> *
> > >> diff --git a/rtl_nic/rtl8125a-3.fw b/rtl_nic/rtl8125a-3.fw new file
> > >> mode 100644 index
> > >>
> > 0000000000000000000000000000000000000000..fac635263f92e8d9734456b75
> > 93
> > >> 2b2088edd5ef9
> > >> GIT binary patch
> > >> literal 3456
> > >>
> > zcmb7G4@{Kj8Gr9M&hw<l39l3>p*MPE#webM6qsqQiuC(hXPi?+wDL#V(Z*4#K%
> > FD{
> > >>
> > zd#PH+tTJbqZG?a`)*5G*m3F2zmN`7hx;2*IKiVu;;~Hwa_E@^5+Z^ooav$qyWa|!|
> > >> z{J!V^^FHtMe%~vE5S!~a<<HMqSUGn=c_2HGJ>M6|pO=$6Z+-
> > !F`d2|JjuT=?tX!6t
> > >> zo1eG1ysol-V@-
> > KgU0(T?#@f6Efr9d!!2E*zz=G_mCu@CyK;goi!iD+b1yk6B2%b&6
> > >> z7edS;xkzqO0?9-2l9EW0ltM}+rIFG}86+PmljJ95fhB~6Z~~0y3JYX}?Z^&0uqy1t
> > >>
> > zny?FN!)}}nC*Ym12kkF<@t&E4_=v=ynF9AnDz2DmaE+uRv6r!*@!^m!6NmhMO
> > zq8r
> > >> zcySgS;n{HZ&ViVjO+Em7C<o$9E{w7~#?6OA6vF-
> > CA|!?$MBkPmUNaZNIZ}kPTZ-Wd
> > >> z8F2PIf~lcpz}RxchgOhZNnAy~1V!<ssEIFw^W*gx{=)|N&sV@7s=~F-
> > YS=QKK)AC8
> > >>
> > z;l`&BHaB5(q!u4F*5UicX4Dw<xZc@_xQwl|*+!ct+H9u{FeB7V|DE*TO<fCht<>$I
> > >> zZZG}Y@U*d?z6a>rPW?gZU!wjH^_^&crVIA-
> > hauiRf}*mc@O*L%b>cXFD^8&Io|9br
> > >>
> > zFS+(#A<NUl=QsF#3U7Megl)!Y%#NIan9+;JM$V!#)QA3t5H6bia7TWJOXlz4ioA=<
> > >> z<^?z-1MK-A9Fa>HXt;u_<`7nle1Pws`y;xZ4b$%$RvXt*Vtj-(#xP2a8yGS^#rwu*
> > >> z7&RlXNB)8`;|q+Lf8+C)SZDkL{T(+MYZPk@p$0naYDvhUdK;W-
> > &~&K<5x2TvCa4ES
> > >>
> > zJSq_Os=`o`>Ti(hqM4#RkyLfbOj8MwbamOxQ0|CNT`@D2E8<rJ4O!}{IZMSyW~=P
> > b
> > >> z9LCG0O+ej0lA~sw%T-
> > ;^=BaOn@)@g8tu_{^65~O&#t5oXW3d`Ciq!i?u^KfEWsf|f
> > >>
> > z%8X@d%v{dr6>6QaQuTMNV*C=d)+lAYWhy1Kp7A%(ze4qPRH@`pHTfshkXfVRB
> > 2TgY
> > >>
> > zO=`+Wt(q39Q=W61)m5XOc8&DkO5CPgp(bTNzg>y9p~8jDs$zJj5<R=s;Oi~Q^>M4
> > (
> > >>
> > zG`vTNq`hitUz@`B_Nx)&fWp3Z<>))8?4g&GICDrH_jW38Z<lJiahS3rlpR$<98=@x
> > >> z6ix)=-SB%nIOXwTePI%gc_p@upI;Gdo~F;Tmw)5`y%^_39nX1Ki-J6~L1FKz7NSjv
> > >> z_`nkPz3?L$w%rohr-(vgBF1<i;qBEnDP75AC6X+Z8dD{_;JbnNV+;3L_)`miW?_eg
> > >> z-4-4vre2o#VTHs{P{PCaz|dle45F5=t7KfRt3*ceh=r$#TAy7mkxtwHW#j)EHgmp)
> > >> zO)hO_GH>cb5{`=!(@#j)h_-jxgCneiD9Hcd;ix`Q>rX~yy2c{beS3_|m>9m87;BeK
> > >> z9^<~->L`kd5sZmZuw?QWNw>vliHU)j7&EQ4-f1m1#+c~6-
> > j5U9k9~bn**#pV?$N|-
> > >> zq$Nr8T#$I{J?c3t2VJ-F9=Aj_^{tkE`!wz?XF~Vaag5Xw_4`&lQTP1kQQmM$_|B0(
> > >>
> > zOkxdl@0Tb}k#O<ZNq?P7^BP@o5?P$tDMUYUDdm~Ohjk2MA!9p<P0Z~eCa@+
> > uv7NOF
> > >> zV)t~$Ac`@GiSytUlb?qfh~`cEKhXXQ`gkQS+oQgJX8gniiK%+szlqBBQMQ+LjIoYA
> > >> z7Pea0V&QHJcUss?e1U!-
> > enM;`#@W7FhmW$!&b34|Z>oiU3%_IGY6~}8_<=_875$5K
> > >> z-p<>elW^0nO-Xb$v)?-<5?*G%4-
> > kJO5#)Zu+T&X8&rjZA4DRX1phN*@#Ln3Z5`&x>
> > >> zM^Cgz;x5{-ci}0VyQ8FT#^zi&IL{9H?xHU!(>!!e${wT4$GqBa3H>xiG*Va3d7hBl
> > >> z$-
> > lAV)_T3WsTa}QwwT;{)^yFKtnb^bPxFJExzl=W&oejIewS6dj^AhH>tVh&*5~e`
> > >>
> > zja9dg7?{KsZ_$^!dgjpQL9gf03g&nvnzMv6vp=S9DYVsnn`y<1qkR|cAF7hLwo&4_
> > >> z$0b%#ug_6NWfb*$Il}q#a(!Ob6=ePMXrpB-
> > v<#HJEP3C!v!5@<?;Dn1MU0<*f8Qxz
> > >>
> > zAjU7*@~eyS)8C3a`2}PA^u1Eoi5NfK?_u^^&&MtM&ozI+{=2wFEidM}?R=ite~tg7
> > >> zpYGYyoP*`0xugV=o@1RyFwcMDQ>Obe;jicCb=vBh2MhU4<KB$Vk2NcQ<(`-
> > qXixrA
> > >> z8^49!$+$sGAily`auZvb-$fkYEIElgD0dJaC)$bQXUsw`@urBL?<RiJ1G^-)6L-BT
> > >> z@#-RprR2w-
> > mqwqPKV$B{bA8MiN1HEyEpap~@gZZ_*el!TTw^JF*(K5a0Q2#@rtsUg
> > >> zt6SovowDhaJ<ob6YnhO-UOUgSow9UNkB-
> > M#+sN~3dy@Qhi9cEVf73Z`N^D^5uW{WK
> > >> zME%~Yvas61t;E=S%Z@SO6V|;&h-
> > h!3cbdD!=(z6g@jH#a_vpS&+;={={DQpi2<!K6
> > >> DeEY6F
> > >>
> > >> literal 0
> > >> HcmV?d00001
> > >>
> > >> --
> > >> 2.23.0
> > >>
> > >
> >
> >
> > ------Please consider the environment before printing this e-mail.
^ permalink raw reply
* Re: [PATCH 1/2] linux/kernel.h: add yesno(), onoff(), enableddisabled(), plural() helpers
From: Jani Nikula @ 2019-09-04 10:47 UTC (permalink / raw)
To: Rasmus Villemoes, linux-kernel
Cc: Joonas Lahtinen, Rodrigo Vivi, intel-gfx, Vishal Kulkarni, netdev,
Greg Kroah-Hartman, linux-usb, Andrew Morton, Julia Lawall
In-Reply-To: <dcdf1abc-7b8e-1f42-a955-0438b90fe9dc@rasmusvillemoes.dk>
On Wed, 04 Sep 2019, Rasmus Villemoes <linux@rasmusvillemoes.dk> wrote:
> On 03/09/2019 15.37, Jani Nikula wrote:
>
>> While the main goal here is to abstract recurring patterns, and slightly
>> clean up the code base by not open coding the ternary operators, there
>> are also some space savings to be had via better string constant
>> pooling.
>
> Eh, no? The linker does that across translation units anyway - moreover,
> given that you make them static inlines, "yes" and "no" will still live
> in .rodata.strX.Y in each individual TU that uses the yesno() helper.
I should've been more careful there; this allows us to do better
constant pooling but does not actually deliver on that here. You'd need
to return pointers to strings in the kernel image. The linker can't
constant pool across modules by itself.
Anyway, that was not the main point here.
> The enableddisabled() is a mouthful, perhaps the helpers should have an
> underscore between the choices
>
> yes_no()
> enabled_disabled()
> on_off()
>
> ?
I'm replacing existing functions that are being used in the kernel
already.
$ git grep "[^a-zA-Z0-9_]\(yesno\|onoff\|enableddisabled\)(" | wc -l
241
Not keen on renaming all of them.
>> drivers/gpu/drm/i915/i915_utils.h | 15 -------------
>> .../ethernet/chelsio/cxgb4/cxgb4_debugfs.c | 11 ----------
>> drivers/usb/core/config.c | 5 -----
>> drivers/usb/core/generic.c | 5 -----
>> include/linux/kernel.h | 21 +++++++++++++++++++
>
> Pet peeve: Can we please stop using linux/kernel.h as a dumping ground
> for every little utility/helper? That makes each and every translation
> unit in the kernel slightly larger, hence slower to compile. Please make
> a linux/string-choice.h and put them there.
*grin*
In the absense of a natural candidate in include/linux/*.h, I thought
shoving them to kernel.h would provoke the best feedback on where to put
them. A new string-choice.h works for me, thanks. :)
BR,
Jani.
--
Jani Nikula, Intel Open Source Graphics Center
^ permalink raw reply
* RE: Is bug 200755 in anyone's queue??
From: Steve Zabele @ 2019-09-04 10:28 UTC (permalink / raw)
To: 'Willem de Bruijn'
Cc: 'Eric Dumazet', 'Network Development', shum,
vladimir116, saifi.khan, 'Daniel Borkmann', on2k16nm,
'Stephen Hemminger', mark.keaton
In-Reply-To: <CA+FuTSdi=tw=N4X2f+paFNM7KHqBgNkV_se-ykZ0+WoA7q0AhQ@mail.gmail.com>
Hi Willem,
Thanks for continuing to poke at this, much appreciated!
>As for the BPF program: good point on accessing the udp port when
>skb->data is already beyond the header.
> Programs of type sk_filter can use bpf_skb_load_bytes(_relative).
>Which I think will work, but have not tested.
Please note that the test code was intentionally set up to make testing as simple as possible. Hence the source addresses for the multiple UDP sessions were identical -- but that is not the general case. In the general case a connected and bound socket should be associated with exactly one five tuple (source and dest addresses, source and destination ports, and protocol.
So a 'connect bpf' would actually need access to the IP addresses as well, not just the ports. To do this, the load bytes call required negative arguments, which failed miserably when we tried it.
In any event, there remains the issue of figuring out which index to return when a match is detected since the index is not the same as the file descriptor value and in fact can change as file descriptors are added and deleted. If I understand the kernel mechanism correctly, the operation is something like this. When you add the first one, its assigned to the first slot; when you add the second its assigned to the second slot; when you delete the first one, the second is moved to the first slot) so tracking this requires figuring out the order stored in the socket array within the kernel, and updating the bpf whenever something changes. I don't know if it's even possible to query which slot a given
So we think handling this with a bpf is really not viable.
One thing worth mentioning is that the connect mechanism here is meant to (at least used to) work the same as connect does with TCP. Bind sets the expected/required local address and port; connect sets the expected/required remote address and port -- so a socket file descriptor becomes associated with exactly one five-tuple. That's how it's worked for several decades anyway.
Thanks again!!!
Steve
-----Original Message-----
From: Willem de Bruijn [mailto:willemdebruijn.kernel@gmail.com]
Sent: Tuesday, September 03, 2019 1:56 PM
Cc: Eric Dumazet; Steve Zabele; Network Development; shum@canndrew.org; vladimir116@gmail.com; saifi.khan@strikr.in; Daniel Borkmann; on2k16nm@gmail.com; Stephen Hemminger
Subject: Re: Is bug 200755 in anyone's queue??
On Fri, Aug 30, 2019 at 4:30 PM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> On Fri, Aug 30, 2019 at 4:54 AM Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> >
> >
> > On 8/29/19 9:26 PM, Willem de Bruijn wrote:
> >
> > > SO_REUSEPORT was not intended to be used in this way. Opening
> > > multiple connected sockets with the same local port.
> > >
> > > But since the interface allowed connect after joining a group, and
> > > that is being used, I guess that point is moot. Still, I'm a bit
> > > surprised that it ever worked as described.
> > >
> > > Also note that the default distribution algorithm is not round robin
> > > assignment, but hash based. So multiple consecutive datagrams arriving
> > > at the same socket is not unexpected.
> > >
> > > I suspect that this quick hack might "work". It seemed to on the
> > > supplied .c file:
> > >
> > > score = compute_score(sk, net, saddr, sport,
> > > daddr, hnum, dif, sdif);
> > > if (score > badness) {
> > > - if (sk->sk_reuseport) {
> > > + if (sk->sk_reuseport && !sk->sk_state !=
> > > TCP_ESTABLISHED) {
>
> This won't work for a mix of connected and connectionless sockets, of
> course (even ignoring the typo), as it only skips reuseport on the
> connected sockets.
>
> > >
> > > But a more robust approach, that also works on existing kernels, is to
> > > swap the default distribution algorithm with a custom BPF based one (
> > > SO_ATTACH_REUSEPORT_EBPF).
> > >
> >
> > Yes, I suspect that reuseport could still be used by to load-balance incoming packets
> > targetting the same 4-tuple.
> >
> > So all sockets would have the same score, and we would select the first socket in
> > the list (if not applying reuseport hashing)
>
> Can you elaborate a bit?
>
> One option I see is to record in struct sock_reuseport if any port in
> the group is connected and, if so, don't return immediately on the
> first reuseport_select_sock hit, but continue the search for a higher
> scoring connected socket.
>
> Or do return immediately, but do this refined search in
> reuseport_select_sock itself, as it has a reference to all sockets in the
> group in sock_reuseport->socks[]. Instead of the straightforward hash.
That won't work, as reuseport_select_sock does not have access to
protocol specific data, notably inet_dport.
Unfortunately, what I've come up with so far is not concise and slows
down existing reuseport lookup in a busy port table slot. Note that it
is needed for both ipv4 and ipv6.
Do not break out of the port table slot early, but continue to search
for a higher scored match even after matching a reuseport:
"
@@ -413,28 +413,39 @@ static struct sock *udp4_lib_lookup2(struct net *net,
struct udp_hslot *hslot2,
struct sk_buff *skb)
{
+ struct sock *reuseport_result = NULL;
struct sock *sk, *result;
+ int reuseport_score = 0;
int score, badness;
u32 hash = 0;
result = NULL;
badness = 0;
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
score = compute_score(sk, net, saddr, sport,
daddr, hnum, dif, sdif);
if (score > badness) {
- if (sk->sk_reuseport) {
+ if (sk->sk_reuseport &&
+ sk->sk_state != TCP_ESTABLISHED &&
+ !reuseport_result) {
hash = udp_ehashfn(net, daddr, hnum,
saddr, sport);
- result = reuseport_select_sock(sk, hash, skb,
+ reuseport_result =
reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr));
- if (result)
- return result;
+ if (reuseport_result)
+ reuseport_score = score;
+ continue;
}
badness = score;
result = sk;
}
}
+
+ if (badness < reuseport_score)
+ result = reuseport_result;
+
return result;
"
To break out after the first reuseport hit when it is safe, i.e., when
it holds no connected sockets, requires adding this state to struct
reuseport_sock at __ip4_datagram_connect. And modify
reuseport_select_sock to read this. At least, I have not found a more
elegant solution.
> Steve, Re: your point on a scalable QUIC server. That is an
> interesting case certainly. Opening a connected socket per flow adds
> both memory and port table pressure. I once looked into an SO_TXONLY
> udp socket option that does not hash connected sockets into the port
> table. In effect receiving on a small set of listening sockets (e.g.,
> one per cpu) and sending over separate tx-only sockets. That still
> introduces unnecessary memory allocation. OTOH it amortizes some
> operations, such as route lookup.
>
> Anyway, that does not fix the immediate issue you reported when using
> SO_REUSEPORT as described.
As for the BPF program: good point on accessing the udp port when
skb->data is already beyond the header.
Programs of type sk_filter can use bpf_skb_load_bytes(_relative).
Which I think will work, but have not tested.
As of kernel 4.19 programs of type BPF_PROG_TYPE_SK_REUSEPORT can be
attached (with CAP_SYS_ADMIN). See
tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c for an
example that parses udp headers with bpf_skb_load_bytes.
^ permalink raw reply
* Re: [PATCH bpf 2/2] libbpf: remove dependency on barrier.h in xsk.h
From: Magnus Karlsson @ 2019-09-04 10:25 UTC (permalink / raw)
To: Yauheni Kaliuta; +Cc: Magnus Karlsson, bpf, Network Development
In-Reply-To: <xunyftlc7dn8.fsf@redhat.com>
On Wed, Sep 4, 2019 at 8:56 AM Yauheni Kaliuta
<yauheni.kaliuta@redhat.com> wrote:
>
> Hi, Magnus!
>
> >>>>> On Wed, 4 Sep 2019 08:39:24 +0200, Magnus Karlsson wrote:
>
> > On Wed, Sep 4, 2019 at 7:32 AM Yauheni Kaliuta
> > <yauheni.kaliuta@redhat.com> wrote:
> >>
> >> Hi, Magnus!
> >>
> >> >>>>> On Tue, 9 Apr 2019 08:44:13 +0200, Magnus Karlsson wrote:
> >>
> >> > The use of smp_rmb() and smp_wmb() creates a Linux header dependency
> >> > on barrier.h that is uneccessary in most parts. This patch implements
> >> > the two small defines that are needed from barrier.h. As a bonus, the
> >> > new implementations are faster than the default ones as they default
> >> > to sfence and lfence for x86, while we only need a compiler barrier in
> >> > our case. Just as it is when the same ring access code is compiled in
> >> > the kernel.
> >>
> >> > Fixes: 1cad07884239 ("libbpf: add support for using AF_XDP sockets")
> >> > Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
> >> > ---
> >> > tools/lib/bpf/xsk.h | 19 +++++++++++++++++--
> >> > 1 file changed, 17 insertions(+), 2 deletions(-)
> >>
> >> > diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h
> >> > index 3638147..317b44f 100644
> >> > --- a/tools/lib/bpf/xsk.h
> >> > +++ b/tools/lib/bpf/xsk.h
> >> > @@ -39,6 +39,21 @@ DEFINE_XSK_RING(xsk_ring_cons);
> >> > struct xsk_umem;
> >> > struct xsk_socket;
> >>
> >> > +#if !defined bpf_smp_rmb && !defined bpf_smp_wmb
> >> > +# if defined(__i386__) || defined(__x86_64__)
> >> > +# define bpf_smp_rmb() asm volatile("" : : : "memory")
> >> > +# define bpf_smp_wmb() asm volatile("" : : : "memory")
> >> > +# elif defined(__aarch64__)
> >> > +# define bpf_smp_rmb() asm volatile("dmb ishld" : : : "memory")
> >> > +# define bpf_smp_wmb() asm volatile("dmb ishst" : : : "memory")
> >> > +# elif defined(__arm__)
> >> > +# define bpf_smp_rmb() asm volatile("dmb ish" : : : "memory")
> >> > +# define bpf_smp_wmb() asm volatile("dmb ishst" : : : "memory")
> >> > +# else
> >> > +# error Architecture not supported by the XDP socket code in libbpf.
> >> > +# endif
> >> > +#endif
> >> > +
> >>
> >> What about other architectures then?
>
> > AF_XDP has not been tested on anything else, as far as I
> > know. But contributions that extend it to more archs are very
> > welcome.
>
> Well, I'll may be try to fetch something from barrier.h's (since
> I cannot consider myself as a specialist in the area), but at the
> moment the patch breaks the build on that arches.
Do you have a specific architecture in mind and do you have some
board/server (of that architecture) you could test AF_XDP on?
> > /Magnus
>
> >>
> >> > static inline __u64 *xsk_ring_prod__fill_addr(struct xsk_ring_prod *fill,
> >> > __u32 idx)
> >> > {
> >> > @@ -119,7 +134,7 @@ static inline void xsk_ring_prod__submit(struct xsk_ring_prod *prod, size_t nb)
> >> > /* Make sure everything has been written to the ring before signalling
> >> > * this to the kernel.
> >> > */
> >> > - smp_wmb();
> >> > + bpf_smp_wmb();
> >>
> >> > *prod->producer += nb;
> >> > }
> >> > @@ -133,7 +148,7 @@ static inline size_t xsk_ring_cons__peek(struct xsk_ring_cons *cons,
> >> > /* Make sure we do not speculatively read the data before
> >> > * we have received the packet buffers from the ring.
> >> > */
> >> > - smp_rmb();
> >> > + bpf_smp_rmb();
> >>
> >> > *idx = cons->cached_cons;
> cons-> cached_cons += entries;
> >> > --
> >> > 2.7.4
> >>
> >>
> >> --
> >> WBR,
> >> Yauheni Kaliuta
>
> --
> WBR,
> Yauheni Kaliuta
^ permalink raw reply
* Re: [PATCH net] net: sonic: remove dev_kfree_skb before return NETDEV_TX_BUSY
From: Eric Dumazet @ 2019-09-04 10:19 UTC (permalink / raw)
To: Mao Wenan, tsbogend, davem; +Cc: netdev, linux-kernel, kernel-janitors
In-Reply-To: <20190904094211.117454-1-maowenan@huawei.com>
On 9/4/19 11:42 AM, Mao Wenan wrote:
> When dma_map_single is failed to map buffer, skb can't be freed
> before sonic driver return to stack with NETDEV_TX_BUSY, because
> this skb may be requeued to qdisc, it might trigger use-after-free.
>
> Fixes: d9fb9f384292 ("*sonic/natsemi/ns83829: Move the National Semi-conductor drivers")
> Signed-off-by: Mao Wenan <maowenan@huawei.com>
> ---
> drivers/net/ethernet/natsemi/sonic.c | 1 -
> 1 file changed, 1 deletion(-)
>
> diff --git a/drivers/net/ethernet/natsemi/sonic.c b/drivers/net/ethernet/natsemi/sonic.c
> index d0a01e8f000a..248a8f22a33b 100644
> --- a/drivers/net/ethernet/natsemi/sonic.c
> +++ b/drivers/net/ethernet/natsemi/sonic.c
> @@ -233,7 +233,6 @@ static int sonic_send_packet(struct sk_buff *skb, struct net_device *dev)
> laddr = dma_map_single(lp->device, skb->data, length, DMA_TO_DEVICE);
> if (!laddr) {
> printk(KERN_ERR "%s: failed to map tx DMA buffer.\n", dev->name);
> - dev_kfree_skb(skb);
> return NETDEV_TX_BUSY;
> }
>
>
That is the wrong way to fix this bug.
What guarantee do we have that the mapping operation will succeed next time we attempt
the transmit (and the dma_map_single() operation) ?
NETDEV_TX_BUSY is very dangerous, this might trigger an infinite loop.
I would rather leave the dev_kfree_skb(skb), and return NETDEV_TX_OK
Also the printk(KERN_ERR ...) should be replaced by pr_err_ratelimited(...)
NETDEV_TX_BUSY really should only be used by drivers that call netif_tx_stop_queue()
at the wrong moment.
^ permalink raw reply
* Re: [PATCH net] net: sonic: remove dev_kfree_skb before return NETDEV_TX_BUSY
From: Thomas Bogendoerfer @ 2019-09-04 9:50 UTC (permalink / raw)
To: Mao Wenan; +Cc: davem, netdev, linux-kernel, kernel-janitors
In-Reply-To: <20190904094211.117454-1-maowenan@huawei.com>
On Wed, Sep 04, 2019 at 05:42:11PM +0800, Mao Wenan wrote:
> When dma_map_single is failed to map buffer, skb can't be freed
> before sonic driver return to stack with NETDEV_TX_BUSY, because
> this skb may be requeued to qdisc, it might trigger use-after-free.
>
> Fixes: d9fb9f384292 ("*sonic/natsemi/ns83829: Move the National Semi-conductor drivers")
> Signed-off-by: Mao Wenan <maowenan@huawei.com>
> ---
> drivers/net/ethernet/natsemi/sonic.c | 1 -
> 1 file changed, 1 deletion(-)
>
> diff --git a/drivers/net/ethernet/natsemi/sonic.c b/drivers/net/ethernet/natsemi/sonic.c
> index d0a01e8f000a..248a8f22a33b 100644
> --- a/drivers/net/ethernet/natsemi/sonic.c
> +++ b/drivers/net/ethernet/natsemi/sonic.c
> @@ -233,7 +233,6 @@ static int sonic_send_packet(struct sk_buff *skb, struct net_device *dev)
> laddr = dma_map_single(lp->device, skb->data, length, DMA_TO_DEVICE);
> if (!laddr) {
> printk(KERN_ERR "%s: failed to map tx DMA buffer.\n", dev->name);
> - dev_kfree_skb(skb);
> return NETDEV_TX_BUSY;
> }
Reviewed-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Thomas.
--
Crap can work. Given enough thrust pigs will fly, but it's not necessarily a
good idea. [ RFC1925, 2.3 ]
^ permalink raw reply
* Re: [PATCH net-next v3] net: openvswitch: Set OvS recirc_id from tc chain index
From: Davide Caratti @ 2019-09-04 9:47 UTC (permalink / raw)
To: Paul Blakey, Pravin B Shelar, netdev, David S. Miller,
Justin Pettit, Simon Horman, Marcelo Ricardo Leitner, Vlad Buslov
Cc: Jiri Pirko, Roi Dayan, Yossi Kuperman, Rony Efraim, Oz Shlomo
In-Reply-To: <1567517015-10778-2-git-send-email-paulb@mellanox.com>
On Tue, 2019-09-03 at 16:23 +0300, Paul Blakey wrote:
> Offloaded OvS datapath rules are translated one to one to tc rules,
> for example the following simplified OvS rule:
>
> recirc_id(0),in_port(dev1),eth_type(0x0800),ct_state(-trk) actions:ct(),recirc(2)
>
> Will be translated to the following tc rule:
>
> $ tc filter add dev dev1 ingress \
> prio 1 chain 0 proto ip \
> flower tcp ct_state -trk \
> action ct pipe \
> action goto chain 2
hello Paul!
one small question:
[... ]
> index 43f5b7e..2fdc746 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -274,7 +274,10 @@ struct tcf_result {
> unsigned long class;
> u32 classid;
> };
> - const struct tcf_proto *goto_tp;
> + struct {
> + const struct tcf_proto *goto_tp;
> + u32 goto_index;
I don't understand why we need to store another copy of the chain index in
'res.goto_index'.
(see below)
[...]
> index 3397122..c393604 100644
> --- a/net/sched/act_api.c
> +++ b/net/sched/act_api.c
> @@ -27,6 +27,7 @@ static void tcf_action_goto_chain_exec(const struct tc_action *a,
> {
> const struct tcf_chain *chain = rcu_dereference_bh(a->goto_chain);
>
> + res->goto_index = chain->index;
I see "a->goto_chain" is used to read the chain index, but I think it's
not needed _ because the chain index is encoded together with the "goto
chain" control action.
> res->goto_tp = rcu_dereference_bh(chain->filter_chain);
> }
>
> diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
> index 671ca90..dd147be 100644
> --- a/net/sched/cls_api.c
> +++ b/net/sched/cls_api.c
> @@ -1514,6 +1514,18 @@ int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
> goto reset;
> } else if (unlikely(TC_ACT_EXT_CMP(err, TC_ACT_GOTO_CHAIN))) {
> first_tp = res->goto_tp;
> +
> +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
> + {
> + struct tc_skb_ext *ext;
> +
> + ext = skb_ext_add(skb, TC_SKB_EXT);
> + if (WARN_ON_ONCE(!ext))
> + return TC_ACT_SHOT;
> +
> + ext->chain = res->goto_index;
the value of 'res->goto_index' is already encoded in the control action
'err' (masked with TC_ACT_EXT_VAL_MASK), since TC_ACT_GOTO_CHAIN bits are
not zero.
you can just get rid of res->goto_index, and just do:
ext->chain = err & TC_ACT_EXT_VAL_MASK;
am I missing something?
thanks!
--
davide
^ permalink raw reply
* [PATCH net-next] r8152: adjust the settings of ups flags
From: Hayes Wang @ 2019-09-04 9:34 UTC (permalink / raw)
To: netdev; +Cc: nic_swsd, linux-kernel, Hayes Wang
The UPS feature only works for runtime suspend, so UPS flags only
need to be set before enabling runtime suspend. Therefore, I create
a struct to record relative information, and use it before runtime
suspend.
All chips could record such information, even though not all of
them support the feature of UPS. Then, some functions could be
combined.
Signed-off-by: Hayes Wang <hayeswang@realtek.com>
---
drivers/net/usb/r8152.c | 215 +++++++++++++++++++++++-----------------
1 file changed, 125 insertions(+), 90 deletions(-)
diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index 778d27d1fb15..86c403d12b6b 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -445,18 +445,18 @@
#define UPS_FLAGS_250M_CKDIV BIT(2)
#define UPS_FLAGS_EN_ALDPS BIT(3)
#define UPS_FLAGS_CTAP_SHORT_DIS BIT(4)
-#define UPS_FLAGS_SPEED_MASK (0xf << 16)
#define ups_flags_speed(x) ((x) << 16)
#define UPS_FLAGS_EN_EEE BIT(20)
#define UPS_FLAGS_EN_500M_EEE BIT(21)
#define UPS_FLAGS_EN_EEE_CKDIV BIT(22)
+#define UPS_FLAGS_EEE_PLLOFF_100 BIT(23)
#define UPS_FLAGS_EEE_PLLOFF_GIGA BIT(24)
#define UPS_FLAGS_EEE_CMOD_LV_EN BIT(25)
#define UPS_FLAGS_EN_GREEN BIT(26)
#define UPS_FLAGS_EN_FLOW_CTR BIT(27)
enum spd_duplex {
- NWAY_10M_HALF = 1,
+ NWAY_10M_HALF,
NWAY_10M_FULL,
NWAY_100M_HALF,
NWAY_100M_FULL,
@@ -749,6 +749,23 @@ struct r8152 {
void (*autosuspend_en)(struct r8152 *tp, bool enable);
} rtl_ops;
+ struct ups_info {
+ u32 _10m_ckdiv:1;
+ u32 _250m_ckdiv:1;
+ u32 aldps:1;
+ u32 lite_mode:2;
+ u32 speed_duplex:4;
+ u32 eee:1;
+ u32 eee_lite:1;
+ u32 eee_ckdiv:1;
+ u32 eee_plloff_100:1;
+ u32 eee_plloff_giga:1;
+ u32 eee_cmod_lv:1;
+ u32 green:1;
+ u32 flow_control:1;
+ u32 ctap_short_off:1;
+ } ups_info;
+
atomic_t rx_count;
bool eee_en;
@@ -2857,14 +2874,76 @@ static void r8153_u2p3en(struct r8152 *tp, bool enable)
ocp_write_word(tp, MCU_TYPE_USB, USB_U2P3_CTRL, ocp_data);
}
-static void r8153b_ups_flags_w1w0(struct r8152 *tp, u32 set, u32 clear)
+static void r8153b_ups_flags(struct r8152 *tp)
{
- u32 ocp_data;
+ u32 ups_flags = 0;
+
+ if (tp->ups_info.green)
+ ups_flags |= UPS_FLAGS_EN_GREEN;
+
+ if (tp->ups_info.aldps)
+ ups_flags |= UPS_FLAGS_EN_ALDPS;
+
+ if (tp->ups_info.eee)
+ ups_flags |= UPS_FLAGS_EN_EEE;
+
+ if (tp->ups_info.flow_control)
+ ups_flags |= UPS_FLAGS_EN_FLOW_CTR;
+
+ if (tp->ups_info.eee_ckdiv)
+ ups_flags |= UPS_FLAGS_EN_EEE_CKDIV;
+
+ if (tp->ups_info.eee_cmod_lv)
+ ups_flags |= UPS_FLAGS_EEE_CMOD_LV_EN;
+
+ if (tp->ups_info._10m_ckdiv)
+ ups_flags |= UPS_FLAGS_EN_10M_CKDIV;
+
+ if (tp->ups_info.eee_plloff_100)
+ ups_flags |= UPS_FLAGS_EEE_PLLOFF_100;
+
+ if (tp->ups_info.eee_plloff_giga)
+ ups_flags |= UPS_FLAGS_EEE_PLLOFF_GIGA;
+
+ if (tp->ups_info._250m_ckdiv)
+ ups_flags |= UPS_FLAGS_250M_CKDIV;
+
+ if (tp->ups_info.ctap_short_off)
+ ups_flags |= UPS_FLAGS_CTAP_SHORT_DIS;
+
+ switch (tp->ups_info.speed_duplex) {
+ case NWAY_10M_HALF:
+ ups_flags |= ups_flags_speed(1);
+ break;
+ case NWAY_10M_FULL:
+ ups_flags |= ups_flags_speed(2);
+ break;
+ case NWAY_100M_HALF:
+ ups_flags |= ups_flags_speed(3);
+ break;
+ case NWAY_100M_FULL:
+ ups_flags |= ups_flags_speed(4);
+ break;
+ case NWAY_1000M_FULL:
+ ups_flags |= ups_flags_speed(5);
+ break;
+ case FORCE_10M_HALF:
+ ups_flags |= ups_flags_speed(6);
+ break;
+ case FORCE_10M_FULL:
+ ups_flags |= ups_flags_speed(7);
+ break;
+ case FORCE_100M_HALF:
+ ups_flags |= ups_flags_speed(8);
+ break;
+ case FORCE_100M_FULL:
+ ups_flags |= ups_flags_speed(9);
+ break;
+ default:
+ break;
+ }
- ocp_data = ocp_read_dword(tp, MCU_TYPE_USB, USB_UPS_FLAGS);
- ocp_data &= ~clear;
- ocp_data |= set;
- ocp_write_dword(tp, MCU_TYPE_USB, USB_UPS_FLAGS, ocp_data);
+ ocp_write_dword(tp, MCU_TYPE_USB, USB_UPS_FLAGS, ups_flags);
}
static void r8153b_green_en(struct r8152 *tp, bool enable)
@@ -2885,7 +2964,7 @@ static void r8153b_green_en(struct r8152 *tp, bool enable)
data |= GREEN_ETH_EN;
sram_write(tp, SRAM_GREEN_CFG, data);
- r8153b_ups_flags_w1w0(tp, UPS_FLAGS_EN_GREEN, 0);
+ tp->ups_info.green = enable;
}
static u16 r8153_phy_status(struct r8152 *tp, u16 desired)
@@ -2915,6 +2994,8 @@ static void r8153b_ups_en(struct r8152 *tp, bool enable)
u32 ocp_data = ocp_read_byte(tp, MCU_TYPE_USB, USB_POWER_CUT);
if (enable) {
+ r8153b_ups_flags(tp);
+
ocp_data |= UPS_EN | USP_PREWAKE | PHASE2_EN;
ocp_write_byte(tp, MCU_TYPE_USB, USB_POWER_CUT, ocp_data);
@@ -3223,16 +3304,8 @@ static void r8153_eee_en(struct r8152 *tp, bool enable)
ocp_write_word(tp, MCU_TYPE_PLA, PLA_EEE_CR, ocp_data);
ocp_reg_write(tp, OCP_EEE_CFG, config);
-}
-
-static void r8153b_eee_en(struct r8152 *tp, bool enable)
-{
- r8153_eee_en(tp, enable);
- if (enable)
- r8153b_ups_flags_w1w0(tp, UPS_FLAGS_EN_EEE, 0);
- else
- r8153b_ups_flags_w1w0(tp, 0, UPS_FLAGS_EN_EEE);
+ tp->ups_info.eee = enable;
}
static void rtl_eee_enable(struct r8152 *tp, bool enable)
@@ -3254,21 +3327,13 @@ static void rtl_eee_enable(struct r8152 *tp, bool enable)
case RTL_VER_04:
case RTL_VER_05:
case RTL_VER_06:
- if (enable) {
- r8153_eee_en(tp, true);
- ocp_reg_write(tp, OCP_EEE_ADV, tp->eee_adv);
- } else {
- r8153_eee_en(tp, false);
- ocp_reg_write(tp, OCP_EEE_ADV, 0);
- }
- break;
case RTL_VER_08:
case RTL_VER_09:
if (enable) {
- r8153b_eee_en(tp, true);
+ r8153_eee_en(tp, true);
ocp_reg_write(tp, OCP_EEE_ADV, tp->eee_adv);
} else {
- r8153b_eee_en(tp, false);
+ r8153_eee_en(tp, false);
ocp_reg_write(tp, OCP_EEE_ADV, 0);
}
break;
@@ -3284,6 +3349,8 @@ static void r8152b_enable_fc(struct r8152 *tp)
anar = r8152_mdio_read(tp, MII_ADVERTISE);
anar |= ADVERTISE_PAUSE_CAP | ADVERTISE_PAUSE_ASYM;
r8152_mdio_write(tp, MII_ADVERTISE, anar);
+
+ tp->ups_info.flow_control = true;
}
static void rtl8152_disable(struct r8152 *tp)
@@ -3477,22 +3544,8 @@ static void r8153_aldps_en(struct r8152 *tp, bool enable)
break;
}
}
-}
-
-static void r8153b_aldps_en(struct r8152 *tp, bool enable)
-{
- r8153_aldps_en(tp, enable);
-
- if (enable)
- r8153b_ups_flags_w1w0(tp, UPS_FLAGS_EN_ALDPS, 0);
- else
- r8153b_ups_flags_w1w0(tp, 0, UPS_FLAGS_EN_ALDPS);
-}
-static void r8153b_enable_fc(struct r8152 *tp)
-{
- r8152b_enable_fc(tp);
- r8153b_ups_flags_w1w0(tp, UPS_FLAGS_EN_FLOW_CTR, 0);
+ tp->ups_info.aldps = enable;
}
static void r8153_hw_phy_cfg(struct r8152 *tp)
@@ -3569,11 +3622,11 @@ static u32 r8152_efuse_read(struct r8152 *tp, u8 addr)
static void r8153b_hw_phy_cfg(struct r8152 *tp)
{
- u32 ocp_data, ups_flags = 0;
+ u32 ocp_data;
u16 data;
/* disable ALDPS before updating the PHY parameters */
- r8153b_aldps_en(tp, false);
+ r8153_aldps_en(tp, false);
/* disable EEE before updating the PHY parameters */
rtl_eee_enable(tp, false);
@@ -3621,28 +3674,27 @@ static void r8153b_hw_phy_cfg(struct r8152 *tp)
data = ocp_reg_read(tp, OCP_POWER_CFG);
data |= EEE_CLKDIV_EN;
ocp_reg_write(tp, OCP_POWER_CFG, data);
+ tp->ups_info.eee_ckdiv = true;
data = ocp_reg_read(tp, OCP_DOWN_SPEED);
data |= EN_EEE_CMODE | EN_EEE_1000 | EN_10M_CLKDIV;
ocp_reg_write(tp, OCP_DOWN_SPEED, data);
+ tp->ups_info.eee_cmod_lv = true;
+ tp->ups_info._10m_ckdiv = true;
+ tp->ups_info.eee_plloff_giga = true;
ocp_reg_write(tp, OCP_SYSCLK_CFG, 0);
ocp_reg_write(tp, OCP_SYSCLK_CFG, clk_div_expo(5));
-
- ups_flags |= UPS_FLAGS_EN_10M_CKDIV | UPS_FLAGS_250M_CKDIV |
- UPS_FLAGS_EN_EEE_CKDIV | UPS_FLAGS_EEE_CMOD_LV_EN |
- UPS_FLAGS_EEE_PLLOFF_GIGA;
+ tp->ups_info._250m_ckdiv = true;
r8153_patch_request(tp, false);
}
- r8153b_ups_flags_w1w0(tp, ups_flags, 0);
-
if (tp->eee_en)
rtl_eee_enable(tp, true);
- r8153b_aldps_en(tp, true);
- r8153b_enable_fc(tp);
+ r8153_aldps_en(tp, true);
+ r8152b_enable_fc(tp);
r8153_u2p3en(tp, true);
set_bit(PHY_RESET, &tp->flags);
@@ -3793,18 +3845,9 @@ static void rtl8153_disable(struct r8152 *tp)
r8153_aldps_en(tp, true);
}
-static void rtl8153b_disable(struct r8152 *tp)
-{
- r8153b_aldps_en(tp, false);
- rtl_disable(tp);
- rtl_reset_bmu(tp);
- r8153b_aldps_en(tp, true);
-}
-
static int rtl8152_set_speed(struct r8152 *tp, u8 autoneg, u16 speed, u8 duplex)
{
u16 bmcr, anar, gbcr;
- enum spd_duplex speed_duplex;
int ret = 0;
anar = r8152_mdio_read(tp, MII_ADVERTISE);
@@ -3821,43 +3864,46 @@ static int rtl8152_set_speed(struct r8152 *tp, u8 autoneg, u16 speed, u8 duplex)
if (speed == SPEED_10) {
bmcr = 0;
anar |= ADVERTISE_10HALF | ADVERTISE_10FULL;
- speed_duplex = FORCE_10M_HALF;
+ if (duplex == DUPLEX_FULL)
+ tp->ups_info.speed_duplex = FORCE_10M_FULL;
+ else
+ tp->ups_info.speed_duplex = FORCE_10M_HALF;
} else if (speed == SPEED_100) {
bmcr = BMCR_SPEED100;
anar |= ADVERTISE_100HALF | ADVERTISE_100FULL;
- speed_duplex = FORCE_100M_HALF;
+ if (duplex == DUPLEX_FULL)
+ tp->ups_info.speed_duplex = FORCE_100M_FULL;
+ else
+ tp->ups_info.speed_duplex = FORCE_100M_HALF;
} else if (speed == SPEED_1000 && tp->mii.supports_gmii) {
bmcr = BMCR_SPEED1000;
gbcr |= ADVERTISE_1000FULL | ADVERTISE_1000HALF;
- speed_duplex = NWAY_1000M_FULL;
+ tp->ups_info.speed_duplex = NWAY_1000M_FULL;
} else {
ret = -EINVAL;
goto out;
}
- if (duplex == DUPLEX_FULL) {
+ if (duplex == DUPLEX_FULL)
bmcr |= BMCR_FULLDPLX;
- if (speed != SPEED_1000)
- speed_duplex++;
- }
} else {
if (speed == SPEED_10) {
if (duplex == DUPLEX_FULL) {
anar |= ADVERTISE_10HALF | ADVERTISE_10FULL;
- speed_duplex = NWAY_10M_FULL;
+ tp->ups_info.speed_duplex = NWAY_10M_FULL;
} else {
anar |= ADVERTISE_10HALF;
- speed_duplex = NWAY_10M_HALF;
+ tp->ups_info.speed_duplex = NWAY_10M_HALF;
}
} else if (speed == SPEED_100) {
if (duplex == DUPLEX_FULL) {
anar |= ADVERTISE_10HALF | ADVERTISE_10FULL;
anar |= ADVERTISE_100HALF | ADVERTISE_100FULL;
- speed_duplex = NWAY_100M_FULL;
+ tp->ups_info.speed_duplex = NWAY_100M_FULL;
} else {
anar |= ADVERTISE_10HALF;
anar |= ADVERTISE_100HALF;
- speed_duplex = NWAY_100M_HALF;
+ tp->ups_info.speed_duplex = NWAY_100M_HALF;
}
} else if (speed == SPEED_1000 && tp->mii.supports_gmii) {
if (duplex == DUPLEX_FULL) {
@@ -3869,7 +3915,7 @@ static int rtl8152_set_speed(struct r8152 *tp, u8 autoneg, u16 speed, u8 duplex)
anar |= ADVERTISE_100HALF;
gbcr |= ADVERTISE_1000HALF;
}
- speed_duplex = NWAY_1000M_FULL;
+ tp->ups_info.speed_duplex = NWAY_1000M_FULL;
} else {
ret = -EINVAL;
goto out;
@@ -3887,17 +3933,6 @@ static int rtl8152_set_speed(struct r8152 *tp, u8 autoneg, u16 speed, u8 duplex)
r8152_mdio_write(tp, MII_ADVERTISE, anar);
r8152_mdio_write(tp, MII_BMCR, bmcr);
- switch (tp->version) {
- case RTL_VER_08:
- case RTL_VER_09:
- r8153b_ups_flags_w1w0(tp, ups_flags_speed(speed_duplex),
- UPS_FLAGS_SPEED_MASK);
- break;
-
- default:
- break;
- }
-
if (bmcr & BMCR_RESET) {
int i;
@@ -3982,12 +4017,12 @@ static void rtl8153b_up(struct r8152 *tp)
r8153b_u1u2en(tp, false);
r8153_u2p3en(tp, false);
- r8153b_aldps_en(tp, false);
+ r8153_aldps_en(tp, false);
r8153_first_init(tp);
ocp_write_dword(tp, MCU_TYPE_USB, USB_RX_BUF_TH, RX_THR_B);
- r8153b_aldps_en(tp, true);
+ r8153_aldps_en(tp, true);
r8153_u2p3en(tp, true);
r8153b_u1u2en(tp, true);
}
@@ -4002,9 +4037,9 @@ static void rtl8153b_down(struct r8152 *tp)
r8153b_u1u2en(tp, false);
r8153_u2p3en(tp, false);
r8153b_power_cut_en(tp, false);
- r8153b_aldps_en(tp, false);
+ r8153_aldps_en(tp, false);
r8153_enter_oob(tp);
- r8153b_aldps_en(tp, true);
+ r8153_aldps_en(tp, true);
}
static bool rtl8152_in_nway(struct r8152 *tp)
@@ -5383,7 +5418,7 @@ static int rtl_ops_init(struct r8152 *tp)
case RTL_VER_09:
ops->init = r8153b_init;
ops->enable = rtl8153_enable;
- ops->disable = rtl8153b_disable;
+ ops->disable = rtl8153_disable;
ops->up = rtl8153b_up;
ops->down = rtl8153b_down;
ops->unload = rtl8153b_unload;
--
2.21.0
^ permalink raw reply related
* [PATCH net] net: sonic: remove dev_kfree_skb before return NETDEV_TX_BUSY
From: Mao Wenan @ 2019-09-04 9:42 UTC (permalink / raw)
To: tsbogend, davem; +Cc: netdev, linux-kernel, kernel-janitors, Mao Wenan
When dma_map_single is failed to map buffer, skb can't be freed
before sonic driver return to stack with NETDEV_TX_BUSY, because
this skb may be requeued to qdisc, it might trigger use-after-free.
Fixes: d9fb9f384292 ("*sonic/natsemi/ns83829: Move the National Semi-conductor drivers")
Signed-off-by: Mao Wenan <maowenan@huawei.com>
---
drivers/net/ethernet/natsemi/sonic.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/drivers/net/ethernet/natsemi/sonic.c b/drivers/net/ethernet/natsemi/sonic.c
index d0a01e8f000a..248a8f22a33b 100644
--- a/drivers/net/ethernet/natsemi/sonic.c
+++ b/drivers/net/ethernet/natsemi/sonic.c
@@ -233,7 +233,6 @@ static int sonic_send_packet(struct sk_buff *skb, struct net_device *dev)
laddr = dma_map_single(lp->device, skb->data, length, DMA_TO_DEVICE);
if (!laddr) {
printk(KERN_ERR "%s: failed to map tx DMA buffer.\n", dev->name);
- dev_kfree_skb(skb);
return NETDEV_TX_BUSY;
}
--
2.20.1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox