Netdev List
 help / color / mirror / Atom feed
* [PATCH v3 2/4] Export usbnet_cdc_update_filter
From: Miguel Rodríguez Pérez @ 2018-07-01  9:05 UTC (permalink / raw)
  To: oliver, linux-usb, netdev, gregkh; +Cc: Miguel Rodríguez Pérez
In-Reply-To: <20180701090553.7776-1-miguel@det.uvigo.gal>

This makes the function avaiable to other drivers, like cdn_ncm.

Signed-off-by: Miguel Rodríguez Pérez <miguel@det.uvigo.gal>
---
 drivers/net/usb/cdc_ether.c | 3 ++-
 include/linux/usb/usbnet.h  | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c
index 815ed0dc18fe..54472ab77b90 100644
--- a/drivers/net/usb/cdc_ether.c
+++ b/drivers/net/usb/cdc_ether.c
@@ -75,7 +75,7 @@ static const u8 mbm_guid[16] = {
 	0xa6, 0x07, 0xc0, 0xff, 0xcb, 0x7e, 0x39, 0x2a,
 };
 
-static void usbnet_cdc_update_filter(struct usbnet *dev)
+void usbnet_cdc_update_filter(struct usbnet *dev)
 {
 	struct net_device *net = dev->net;
 
@@ -99,6 +99,7 @@ static void usbnet_cdc_update_filter(struct usbnet *dev)
 			NULL,
 			0);
 }
+EXPORT_SYMBOL_GPL(usbnet_cdc_update_filter);
 
 /* probes control interface, claims data interface, collects the bulk
  * endpoints, activates data interface (if needed), maybe sets MTU.
diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h
index e2ec3582e549..7821cf1dcd60 100644
--- a/include/linux/usb/usbnet.h
+++ b/include/linux/usb/usbnet.h
@@ -286,4 +286,5 @@ extern void usbnet_update_max_qlen(struct usbnet *dev);
 extern void usbnet_get_stats64(struct net_device *dev,
 			       struct rtnl_link_stats64 *stats);
 
+extern void usbnet_cdc_update_filter(struct usbnet *);
 #endif /* __LINUX_USB_USBNET_H */
-- 
2.17.1

^ permalink raw reply related

* [PATCH v3 1/4] Simplify usbnet_cdc_update_filter
From: Miguel Rodríguez Pérez @ 2018-07-01  9:05 UTC (permalink / raw)
  To: oliver, linux-usb, netdev, gregkh; +Cc: Miguel Rodríguez Pérez
In-Reply-To: <20180701090553.7776-1-miguel@det.uvigo.gal>

Remove some unneded varibles to make the code easier to read
and, replace the generic usb_control_msg function for the
more specific usbnet_write_cmd.

Signed-off-by: Miguel Rodríguez Pérez <miguel@det.uvigo.gal>
---
 drivers/net/usb/cdc_ether.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c
index 178b956501a7..815ed0dc18fe 100644
--- a/drivers/net/usb/cdc_ether.c
+++ b/drivers/net/usb/cdc_ether.c
@@ -77,9 +77,7 @@ static const u8 mbm_guid[16] = {
 
 static void usbnet_cdc_update_filter(struct usbnet *dev)
 {
-	struct cdc_state	*info = (void *) &dev->data;
-	struct usb_interface	*intf = info->control;
-	struct net_device	*net = dev->net;
+	struct net_device *net = dev->net;
 
 	u16 cdc_filter = USB_CDC_PACKET_TYPE_DIRECTED
 			| USB_CDC_PACKET_TYPE_BROADCAST;
@@ -93,16 +91,13 @@ static void usbnet_cdc_update_filter(struct usbnet *dev)
 	if (!netdev_mc_empty(net) || (net->flags & IFF_ALLMULTI))
 		cdc_filter |= USB_CDC_PACKET_TYPE_ALL_MULTICAST;
 
-	usb_control_msg(dev->udev,
-			usb_sndctrlpipe(dev->udev, 0),
+	usbnet_write_cmd(dev,
 			USB_CDC_SET_ETHERNET_PACKET_FILTER,
-			USB_TYPE_CLASS | USB_RECIP_INTERFACE,
+			USB_TYPE_CLASS | USB_DIR_OUT | USB_RECIP_INTERFACE,
 			cdc_filter,
-			intf->cur_altsetting->desc.bInterfaceNumber,
+			dev->intf->cur_altsetting->desc.bInterfaceNumber,
 			NULL,
-			0,
-			USB_CTRL_SET_TIMEOUT
-		);
+			0);
 }
 
 /* probes control interface, claims data interface, collects the bulk
-- 
2.17.1

^ permalink raw reply related

* [PATCH v3 0/4] usbnet: Admit multicast traffic in cdc ncm devices
From: Miguel Rodríguez Pérez @ 2018-07-01  9:05 UTC (permalink / raw)
  To: oliver, linux-usb, netdev, gregkh; +Cc: Miguel Rodríguez Pérez
In-Reply-To: <20180701081550.GA7048@kroah.com>

I have reworked the patches according to the previous round of revisions.
I hope everything is formated correctly this time.

Miguel Rodríguez Pérez (4):
  Simplify usbnet_cdc_update_filter
  Export usbnet_cdc_update_filter
  Replace the way cdc_ncm hooks into usbnet_change_mtu
  Hook into set_rx_mode to admit multicast traffic

 drivers/net/usb/cdc_ether.c | 18 +++++++-----------
 drivers/net/usb/cdc_ncm.c   | 16 ++++++----------
 include/linux/usb/usbnet.h  |  1 +
 3 files changed, 14 insertions(+), 21 deletions(-)

-- 
2.17.1

^ permalink raw reply

* [PATCHv2 net] ipvlan: call dev_change_flags when ipvlan mode is reset
From: Hangbin Liu @ 2018-07-01  8:21 UTC (permalink / raw)
  To: netdev
  Cc: Stefano Brivio, Paolo Abeni, David Miller, Mahesh Bandewar,
	Cong Wang, Sabrina Dubroca, Hangbin Liu
In-Reply-To: <1529330677-15328-1-git-send-email-liuhangbin@gmail.com>

After we change the ipvlan mode from l3 to l2, or vice versa, we only
reset IFF_NOARP flag, but don't flush the ARP table cache, which will
cause eth->h_dest to be equal to eth->h_source in ipvlan_xmit_mode_l2().
Then the message will not come out of host.

Here is the reproducer on local host:

ip link set eth1 up
ip addr add 192.168.1.1/24 dev eth1
ip link add link eth1 ipvlan1 type ipvlan mode l3

ip netns add net1
ip link set ipvlan1 netns net1
ip netns exec net1 ip link set ipvlan1 up
ip netns exec net1 ip addr add 192.168.2.1/24 dev ipvlan1

ip route add 192.168.2.0/24 via 192.168.1.2
ping 192.168.2.2 -c 2

ip netns exec net1 ip link set ipvlan1 type ipvlan mode l2
ping 192.168.2.2 -c 2

Add the same configuration on remote host. After we set the mode to l2,
we could find that the src/dst MAC addresses are the same on eth1:

21:26:06.648565 00:b7:13:ad:d3:05 > 00:b7:13:ad:d3:05, ethertype IPv4 (0x0800), length 98: (tos 0x0, ttl 64, id 58356, offset 0, flags [DF], proto ICMP (1), length 84)
    192.168.2.1 > 192.168.2.2: ICMP echo request, id 22686, seq 1, length 64

Fix this by calling dev_change_flags(), which will call netdevice notifier
with flag change info.

v2:
a) As pointed out by Wang Cong, check return value for dev_change_flags() when
change dev flags.
b) As suggested by Stefano and Sabrina, move flags setting before l3mdev_ops.
So we don't need to redo ipvlan_{, un}register_nf_hook() again in err path.

Reported-by: Jianlin Shi <jishi@redhat.com>
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Fixes: 2ad7bf3638411 ("ipvlan: Initial check-in of the IPVLAN driver.")
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
---
 drivers/net/ipvlan/ipvlan_main.c | 36 ++++++++++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index 4377c26..f126bc1 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -75,10 +75,23 @@ static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval)
 {
 	struct ipvl_dev *ipvlan;
 	struct net_device *mdev = port->dev;
-	int err = 0;
+	unsigned int flags;
+	int err;
 
 	ASSERT_RTNL();
 	if (port->mode != nval) {
+		list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
+			flags = ipvlan->dev->flags;
+			if (nval == IPVLAN_MODE_L3 || nval == IPVLAN_MODE_L3S) {
+				err = dev_change_flags(ipvlan->dev,
+						       flags | IFF_NOARP);
+			} else {
+				err = dev_change_flags(ipvlan->dev,
+						       flags & ~IFF_NOARP);
+			}
+			if (unlikely(err))
+				goto fail;
+		}
 		if (nval == IPVLAN_MODE_L3S) {
 			/* New mode is L3S */
 			err = ipvlan_register_nf_hook(read_pnet(&port->pnet));
@@ -86,21 +99,28 @@ static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval)
 				mdev->l3mdev_ops = &ipvl_l3mdev_ops;
 				mdev->priv_flags |= IFF_L3MDEV_MASTER;
 			} else
-				return err;
+				goto fail;
 		} else if (port->mode == IPVLAN_MODE_L3S) {
 			/* Old mode was L3S */
 			mdev->priv_flags &= ~IFF_L3MDEV_MASTER;
 			ipvlan_unregister_nf_hook(read_pnet(&port->pnet));
 			mdev->l3mdev_ops = NULL;
 		}
-		list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
-			if (nval == IPVLAN_MODE_L3 || nval == IPVLAN_MODE_L3S)
-				ipvlan->dev->flags |= IFF_NOARP;
-			else
-				ipvlan->dev->flags &= ~IFF_NOARP;
-		}
 		port->mode = nval;
 	}
+	return 0;
+
+fail:
+	/* Undo the flags changes that have been done so far. */
+	list_for_each_entry_continue_reverse(ipvlan, &port->ipvlans, pnode) {
+		flags = ipvlan->dev->flags;
+		if (port->mode == IPVLAN_MODE_L3 ||
+		    port->mode == IPVLAN_MODE_L3S)
+			dev_change_flags(ipvlan->dev, flags | IFF_NOARP);
+		else
+			dev_change_flags(ipvlan->dev, flags & ~IFF_NOARP);
+	}
+
 	return err;
 }
 
-- 
2.5.5

^ permalink raw reply related

* Re: [PATCH v2 1/4] Simplify usbnet_cdc_update_filter
From: Greg KH @ 2018-07-01  8:15 UTC (permalink / raw)
  To: Miguel Rodríguez Pérez; +Cc: linux-usb, netdev
In-Reply-To: <634ae5d9-c969-15a6-1831-10bd0719fe59@det.uvigo.gal>

On Sat, Jun 30, 2018 at 07:32:23PM +0200, Miguel Rodríguez Pérez wrote:
> Remove some unneeded varibles to make the code easier to read
> and, replace the generic usb_control_msg function for the
> more specific usbnet_write_cmd.
> ---
>  drivers/net/usb/cdc_ether.c | 15 +++++----------
>  1 file changed, 5 insertions(+), 10 deletions(-)

None of your patches have a signed-off-by line, and your subject needs
work as well (see other patches for how to do this with the correct
prefix.)

Also work on cc: the correct people, scripts/get_maintainer.pl on the
patch will tell you that, and also use 'git send-email' to properly
thread them.

thanks,

greg k-h

^ permalink raw reply

* Re: [PATCH net-next 5/5] net: gemini: Indicate that we can handle jumboframes
From: Andrew Lunn @ 2018-07-01  7:58 UTC (permalink / raw)
  To: Linus Walleij
  Cc: netdev, David S . Miller, Michał Mirosław, Janos Laube,
	Paulius Zaleckas, linux-arm-kernel, Hans Ulli Kroll,
	Florian Fainelli
In-Reply-To: <20180630161806.24312-5-linus.walleij@linaro.org>

On Sat, Jun 30, 2018 at 06:18:06PM +0200, Linus Walleij wrote:
> The hardware supposedly handles frames up to 10236 bytes and
> implements .ndo_change_mtu() so accept 10236 minus the ethernet
> header for a VLAN tagged frame on the netdevices.
> 
> Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
> ---
>  drivers/net/ethernet/cortina/gemini.c | 5 +++++
>  1 file changed, 5 insertions(+)
> 
> diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c
> index 79324bbfd768..ae475393e4ac 100644
> --- a/drivers/net/ethernet/cortina/gemini.c
> +++ b/drivers/net/ethernet/cortina/gemini.c
> @@ -2473,6 +2473,11 @@ static int gemini_ethernet_port_probe(struct platform_device *pdev)
>  
>  	netdev->hw_features = GMAC_OFFLOAD_FEATURES;
>  	netdev->features |= GMAC_OFFLOAD_FEATURES | NETIF_F_GRO;
> +	/* We can handle jumbo frames up to 10236 bytes so, let's accept
> +	 * payloads of 10236 bytes minus VLAN and ethernet header
> +	 */
> +	netdev->min_mtu = 256;
> +	netdev->max_mtu = 10236 - VLAN_ETH_HLEN;

Hi Linus

The commit message does not mention the min mtu you set here. Where
does 256 come from?

Thanks
     Andrew

^ permalink raw reply

* Re: [PATCH net-next 2/5] net: gemini: Improve connection prints
From: Andrew Lunn @ 2018-07-01  7:54 UTC (permalink / raw)
  To: Linus Walleij
  Cc: netdev, David S . Miller, Michał Mirosław, Janos Laube,
	Paulius Zaleckas, linux-arm-kernel, Hans Ulli Kroll,
	Florian Fainelli
In-Reply-To: <20180630161806.24312-2-linus.walleij@linaro.org>

On Sat, Jun 30, 2018 at 06:18:03PM +0200, Linus Walleij wrote:
> Instead of just specify that a PHY is connected at some
> speed, also specify which one. This is helpful with several
> PHYs on the system.

Hi Linus

Could you just use phy_print_status()

      Andrew

^ permalink raw reply

* Re: [PATCH net-next 0/2] tcp: fix high tail latencies in DCTCP
From: Lawrence Brakmo @ 2018-07-01  4:09 UTC (permalink / raw)
  To: Neal Cardwell
  Cc: Netdev, Kernel Team, Blake Matheny, Alexei Starovoitov,
	Yuchung Cheng, Steve Ibanez, Eric Dumazet, Yousuk Seung
In-Reply-To: <CADVnQymszmasDGXjuibNaRyzavau73VPtZ6btELcO5fNrbuKOQ@mail.gmail.com>

On 6/30/18, 5:26 PM, "netdev-owner@vger.kernel.org on behalf of Neal Cardwell" <netdev-owner@vger.kernel.org on behalf of ncardwell@google.com> wrote:

    On Fri, Jun 29, 2018 at 9:48 PM Lawrence Brakmo <brakmo@fb.com> wrote:
    >
    > When have observed high tail latencies when using DCTCP for RPCs as
    > compared to using Cubic. For example, in one setup there are 2 hosts
    > sending to a 3rd one, with each sender having 3 flows (1 stream,
    > 1 1MB back-to-back RPCs and 1 10KB back-to-back RPCs). The following
    > table shows the 99% and 99.9% latencies for both Cubic and dctcp:
    >
    >            Cubic 99%  Cubic 99.9%   dctcp 99%    dctcp 99.9%
    >   1MB RPCs    2.6ms       5.5ms         43ms          208ms
    >  10KB RPCs    1.1ms       1.3ms         53ms          212ms
    >
    > Looking at tcpdump traces showed that there are two causes for the
    > latency.
    >
    >   1) RTOs caused by the receiver sending a dup ACK and not ACKing
    >      the last (and only) packet sent.
    >   2) Delaying ACKs when the sender has a cwnd of 1, so everything
    >      pauses for the duration of the delayed ACK.
    >
    > The first patch fixes the cause of the dup ACKs, not updating DCTCP
    > state when an ACK that was initially delayed has been sent with a
    > data packet.
    >
    > The second patch insures that an ACK is sent immediately when a
    > CWR marked packet arrives.
    >
    > With the patches the latencies for DCTCP now look like:
    >
    >            dctcp 99%  dctcp 99.9%
    >   1MB RPCs    4.8ms       6.5ms
    >  10KB RPCs    143us       184us
    >
    > Note that while the 1MB RPCs tail latencies are higher than Cubic's,
    > the 10KB latencies are much smaller than Cubic's. These patches fix
    > issues on the receiver, but tcpdump traces indicate there is an
    > opportunity to also fix an issue at the sender that adds about 3ms
    > to the tail latencies.
    >
    > The following trace shows the issue that tiggers an RTO (fixed by these patches):
    >
    >    Host A sends the last packets of the request
    >    Host B receives them, and the last packet is marked with congestion (CE)
    >    Host B sends ACKs for packets not marked with congestion
    >    Host B sends data packet with reply and ACK for packet marked with
    >           congestion (TCP flag ECE)
    >    Host A receives ACKs with no ECE flag
    >    Host A receives data packet with ACK for the last packet of request
    >           and which has TCP ECE bit set
    >    Host A sends 1st data packet of the next request with TCP flag CWR
    >    Host B receives the packet (as seen in tcpdump at B), no CE flag
    >    Host B sends a dup ACK that also has the TCP ECE flag
    >    Host A RTO timer fires!
    >    Host A to send the next packet
    >    Host A receives an ACK for everything it has sent (i.e. Host B
    >           did receive 1st packet of request)
    >    Host A send more packets…
    >
    > [PATCH net-next 1/2] tcp: notify when a delayed ack is sent
    > [PATCH net-next 2/2] tcp: ack immediately when a cwr packet arrives
    
    Thanks, Larry!
    
    I suspect there is a broader problem with "DCTCP-style precise ECE
    ACKs" that this patch series does not address.
    
    AFAICT the DCTCP "historical" ACKs for ECE precision, generated in
    dctcp_ce_state_0_to_1() and dctcp_ce_state_1_to_0(), violate the
    assumptions of the pre-existing delayed ACK state machine. They
    violate those assumptions by rewinding tp->rcv_nxt backwards and then
    calling tcp_send_ack(sk). But the existing code path to send an ACK
    assumes that any ACK transmission always sends an ACK that accounts
    for all received data, so that after sending an ACK we can cancel the
    delayed ACK timer. So it seems with DCTCP we can have buggy sequences
    where the DCTCP historical ACK causes us to forget that we need to
    schedule a delayed ACK (which will force the sender to RTO, as shown
    in the trace):
    
    tcp_event_data_recv()
     inet_csk_schedule_ack()  // remember that we need an ACK
     tcp_ecn_check_ce()
      tcp_ca_event() // with CA_EVENT_ECN_IS_CE or CA_EVENT_ECN_NO_CE
        dctcp_cwnd_event()
          dctcp_ce_state_0_to_1() or dctcp_ce_state_1_to_0()
           if (... && ca->delayed_ack_reserved)
              tp->rcv_nxt = ca->prior_rcv_nxt;
              tcp_send_ack(sk);     // send an ACK, but for old data!
                tcp_transmit_skb()
                  tcp_event_ack_sent()
                    inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
                        // forget that we need a delayed ACK!
    
    AFAICT the first patch in this series, to force an immediate ACK on
    any packet with a CWR, papers over this issue of the forgotten delayed
    ACK by forcing an immediate ack in __tcp_ack_snd_check(). This ensures
    that at least for CWR packets the forgotten delayed ACK does not
    matter. But for packets without CWR I believe the buggy "forgotten
    delayed ACK" sequence above is still possible, and could still plague
    DCTCP with high tail latencies in some cases. I suspect that after
    your CWR-forces-ACK patch it may not be jumping out in performance
    test results because (a) if there is no CWR involved then usually the
    cwnd is high enough that missing delayed ACK does not cause a stall,
    and (b) if there is a CWR involved then your patch forces an immediate
    ACK. But AFAICT that forgotten delayed ACK bug is still there.
    
    What do folks think?
    
    neal

    
Neal, just to clarify, the 2nd patch is the one that ACKs immediately after receiving a CWR marked packet. The 1st patch eliminates the problem that was causing RTOs due to DCTCP being in the wrong state (thinking there was a delayed ACK pending when there was not). With just the 1st patch I do not see any RTOs, only delays due to delayed ACKs (which are fixed with the 2nd patch).

You are correct in that the dctcp code to send old ACKs can result in missing sending an ACK. However, it is for the new packet not for the previously delayed ACK since that is the ACK that is sent when the dctcp code (dctcp_ce_state_..()) calls tcp_send_ack().

When a new data packet arrives, tcp_event_data_recv() is called. This calls inet_csk_schedule_ack(sk) which sets icsk_ack.pending to remember that an ack needs to be sent. If dctcp_ce_state_..() ends up calling tcp_send_ack(), it will ultimately clear icsk_ack.pending (it is done in inet_csk_clear_xmit_timer()). As a result, tcp_ack_snd_check(), which is called after dctcp_ce_state_..() is called, will just return after checking that icsk_ack.pending is zero. That is, not ACK will be sent.

If the cwnd is greater than 1, then missing one ACK will not have a major impact because the next packet will trigger sending an ACK (although it may be delayed). This is why my original patch of making sure the cwnd is at least 2 solved the problem (but did not fix the main causes).

This patch set fixes two issues in the current code that were causing the high percentile latencies when using DCTCP and RPCs.

The bug you pointed out does exists, but is orthogonal to the issues fixed by my current patch set. I will leave this patch set as is and do another patch for the bug you pointed out.

Thanks Neal!

Larry

^ permalink raw reply

* Re: [PATCH net] tcp: prevent bogus FRTO undos with non-SACK flows
From: Neal Cardwell @ 2018-07-01  1:56 UTC (permalink / raw)
  To: Ilpo Järvinen; +Cc: Netdev, Yuchung Cheng, Eric Dumazet, Michal Kubecek
In-Reply-To: <alpine.DEB.2.20.1806292220150.26571@whs-18.cs.helsinki.fi>

On Fri, Jun 29, 2018 at 3:52 PM Ilpo Järvinen <ilpo.jarvinen@helsinki.fi> wrote:
> > +.005 < . 1:1(0) ack 2001 win 257
>
> Why did the receiver send a cumulative ACK only for 2001?

Sorry, you are right Ilpo. Upon further reflection, the packetdrill
scenario I posted is not a realistic one, and I agree we should not
worry about it.

As I mentioned, I ran your patch through all our team's TCP
packetdrill tests, and it passes all of the tests. One of our tests
needed updating, because if there is a non-SACK connection with a
spurious RTO due to a delayed flight of ACKs then the FRTO undo now
happens one ACK later (when we get an ACK that doesn't cover a
retransmit). But that seems fine to me.

I also cooked the new packetdrill test below to explicitly cover this
case you are addressing (please let me know if you have an alternate
suggestion).

Tested-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>

Thanks!
neal

---

    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
   +0 bind(3, ..., ...) = 0
   +0 listen(3, 1) = 0

   +0 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7>
   +0 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 8>
 +.02 < . 1:1(0) ack 1 win 257
   +0 accept(3, ..., ...) = 4

// Send 3 packets. First is really lost. And the dupacks
// for the data packets that arrived at the reciver are slow in arriving.
   +0 write(4, ..., 3000) = 3000
   +0 > P. 1:3001(3000) ack 1

// RTO and retransmit head. This fills a real loss.
 +.22 > . 1:1001(1000) ack 1

// Dupacks for packets 2 and 3 arrive.
+.02  < . 1:1(0) ack 1 win 257
   +0 < . 1:1(0) ack 1 win 257

// The cumulative ACK for all the data arrives. We do not undo, because
// this is a non-SACK connection, and retransmitted data was ACKed.
// It's good that there's no FRTO undo, since a packet was really lost.
// Because this is non-SACK, tcp_try_undo_recovery() holds CA_Loss
// until something beyond high_seq is ACKed.
+.005 < . 1:1(0) ack 3001 win 257
   +0 %{ assert tcpi_ca_state == TCP_CA_Loss, tcpi_ca_state }%
   +0 %{ assert tcpi_snd_cwnd == 4, tcpi_snd_cwnd }%
   +0 %{ assert tcpi_snd_ssthresh == 7, tcpi_snd_ssthresh }%

^ permalink raw reply

* Re: [PATCH net-next 2/2] tcp: ack immediately when a cwr packet arrives
From: Lawrence Brakmo @ 2018-07-01  1:46 UTC (permalink / raw)
  To: Neal Cardwell
  Cc: Netdev, Kernel Team, Blake Matheny, Alexei Starovoitov,
	Yuchung Cheng, Steve Ibanez, Eric Dumazet
In-Reply-To: <CADVnQykfQRLVcaE042SoOESv=LwWMnVPhsHn+8eP9FXNTEpOXA@mail.gmail.com>

On 6/30/18, 11:23 AM, "Neal Cardwell" <ncardwell@google.com> wrote:

    On Fri, Jun 29, 2018 at 9:48 PM Lawrence Brakmo <brakmo@fb.com> wrote:
    >
    > We observed high 99 and 99.9% latencies when doing RPCs with DCTCP. The
    > problem is triggered when the last packet of a request arrives CE
    > marked. The reply will carry the ECE mark causing TCP to shrink its cwnd
    > to 1 (because there are no packets in flight). When the 1st packet of
    > the next request arrives it was sometimes delayed adding up to 40ms to
    > the latency.
    >
    > This patch insures that CWR makred data packets arriving will be acked
    > immediately.
    >
    > Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
    > ---
    
    Thanks, Larry. Ensuring that CWR-marked data packets arriving will be
    acked immediately sounds like a good goal to me.
    
    I am wondering if we can have a simpler fix.
    
    The dctcp_ce_state_0_to_1() code is setting the TCP_ECN_DEMAND_CWR
    bit in ecn_flags, which disables the code in __tcp_ecn_check_ce() that
    would have otherwise used the tcp_enter_quickack_mode() mechanism to
    force an ACK:
    
    __tcp_ecn_check_ce():
    ...
    case INET_ECN_CE:
      if (tcp_ca_needs_ecn(sk))
        tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
           // -> dctcp_ce_state_0_to_1()
           //     ->  tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
    
      if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
        /* Better not delay acks, sender can have a very low cwnd */
        tcp_enter_quickack_mode(sk, 1);
        tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
      }
      tp->ecn_flags |= TCP_ECN_SEEN;
      break;
    
    So it seems like the bug here may be that dctcp_ce_state_0_to_1()  is
    setting the TCP_ECN_DEMAND_CWR  bit in ecn_flags, when really it
    should let its caller, __tcp_ecn_check_ce() set TCP_ECN_DEMAND_CWR, in
    which case the code would already properly force an immediate ACK.
    
    So, what if we use this fix instead (not compiled, not tested):
    
    diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
    index 5f5e5936760e..4fecd2824edb 100644
    --- a/net/ipv4/tcp_dctcp.c
    +++ b/net/ipv4/tcp_dctcp.c
    @@ -152,8 +152,6 @@ static void dctcp_ce_state_0_to_1(struct sock *sk)
    
            ca->prior_rcv_nxt = tp->rcv_nxt;
            ca->ce_state = 1;
    -
    -       tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
     }
    
     static void dctcp_ce_state_1_to_0(struct sock *sk)
    
    What do you think?
    
    neal

I see two issues, one is that entering quickack mode as you mentioned does not insure that it will still be on when the CWR arrives. The second issue is that the problem occurs right after the receiver sends a small reply which results in entering pingpong mode right before the sender starts the new request by sending just one packet (i.e. forces delayed ack).

I compiled and tested your patch. Both 99 and 99.9 percentile latencies are around 40ms. Looking at the packet traces shows that some CWR marked packets are not being ack immediately (delayed by 40ms).

Larry

    

^ permalink raw reply

* Re: pull-request: bpf 2018-07-01
From: David Miller @ 2018-07-01  0:28 UTC (permalink / raw)
  To: daniel; +Cc: ast, netdev
In-Reply-To: <20180630235637.22331-1-daniel@iogearbox.net>

From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sun,  1 Jul 2018 01:56:37 +0200

> The following pull-request contains BPF updates for your *net* tree.
> 
> The main changes are:
 ...
> Please consider pulling these changes from:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git

Pulled, thanks Daniel.

^ permalink raw reply

* Re: [PATCH net-next 0/2] tcp: fix high tail latencies in DCTCP
From: Neal Cardwell @ 2018-07-01  0:26 UTC (permalink / raw)
  To: Lawrence Brakmo
  Cc: Netdev, Kernel Team, bmatheny, ast, Yuchung Cheng, Steve Ibanez,
	Eric Dumazet, Yousuk Seung
In-Reply-To: <20180630014815.2881895-1-brakmo@fb.com>

On Fri, Jun 29, 2018 at 9:48 PM Lawrence Brakmo <brakmo@fb.com> wrote:
>
> When have observed high tail latencies when using DCTCP for RPCs as
> compared to using Cubic. For example, in one setup there are 2 hosts
> sending to a 3rd one, with each sender having 3 flows (1 stream,
> 1 1MB back-to-back RPCs and 1 10KB back-to-back RPCs). The following
> table shows the 99% and 99.9% latencies for both Cubic and dctcp:
>
>            Cubic 99%  Cubic 99.9%   dctcp 99%    dctcp 99.9%
>   1MB RPCs    2.6ms       5.5ms         43ms          208ms
>  10KB RPCs    1.1ms       1.3ms         53ms          212ms
>
> Looking at tcpdump traces showed that there are two causes for the
> latency.
>
>   1) RTOs caused by the receiver sending a dup ACK and not ACKing
>      the last (and only) packet sent.
>   2) Delaying ACKs when the sender has a cwnd of 1, so everything
>      pauses for the duration of the delayed ACK.
>
> The first patch fixes the cause of the dup ACKs, not updating DCTCP
> state when an ACK that was initially delayed has been sent with a
> data packet.
>
> The second patch insures that an ACK is sent immediately when a
> CWR marked packet arrives.
>
> With the patches the latencies for DCTCP now look like:
>
>            dctcp 99%  dctcp 99.9%
>   1MB RPCs    4.8ms       6.5ms
>  10KB RPCs    143us       184us
>
> Note that while the 1MB RPCs tail latencies are higher than Cubic's,
> the 10KB latencies are much smaller than Cubic's. These patches fix
> issues on the receiver, but tcpdump traces indicate there is an
> opportunity to also fix an issue at the sender that adds about 3ms
> to the tail latencies.
>
> The following trace shows the issue that tiggers an RTO (fixed by these patches):
>
>    Host A sends the last packets of the request
>    Host B receives them, and the last packet is marked with congestion (CE)
>    Host B sends ACKs for packets not marked with congestion
>    Host B sends data packet with reply and ACK for packet marked with
>           congestion (TCP flag ECE)
>    Host A receives ACKs with no ECE flag
>    Host A receives data packet with ACK for the last packet of request
>           and which has TCP ECE bit set
>    Host A sends 1st data packet of the next request with TCP flag CWR
>    Host B receives the packet (as seen in tcpdump at B), no CE flag
>    Host B sends a dup ACK that also has the TCP ECE flag
>    Host A RTO timer fires!
>    Host A to send the next packet
>    Host A receives an ACK for everything it has sent (i.e. Host B
>           did receive 1st packet of request)
>    Host A send more packets…
>
> [PATCH net-next 1/2] tcp: notify when a delayed ack is sent
> [PATCH net-next 2/2] tcp: ack immediately when a cwr packet arrives

Thanks, Larry!

I suspect there is a broader problem with "DCTCP-style precise ECE
ACKs" that this patch series does not address.

AFAICT the DCTCP "historical" ACKs for ECE precision, generated in
dctcp_ce_state_0_to_1() and dctcp_ce_state_1_to_0(), violate the
assumptions of the pre-existing delayed ACK state machine. They
violate those assumptions by rewinding tp->rcv_nxt backwards and then
calling tcp_send_ack(sk). But the existing code path to send an ACK
assumes that any ACK transmission always sends an ACK that accounts
for all received data, so that after sending an ACK we can cancel the
delayed ACK timer. So it seems with DCTCP we can have buggy sequences
where the DCTCP historical ACK causes us to forget that we need to
schedule a delayed ACK (which will force the sender to RTO, as shown
in the trace):

tcp_event_data_recv()
 inet_csk_schedule_ack()  // remember that we need an ACK
 tcp_ecn_check_ce()
  tcp_ca_event() // with CA_EVENT_ECN_IS_CE or CA_EVENT_ECN_NO_CE
    dctcp_cwnd_event()
      dctcp_ce_state_0_to_1() or dctcp_ce_state_1_to_0()
       if (... && ca->delayed_ack_reserved)
          tp->rcv_nxt = ca->prior_rcv_nxt;
          tcp_send_ack(sk);     // send an ACK, but for old data!
            tcp_transmit_skb()
              tcp_event_ack_sent()
                inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
                    // forget that we need a delayed ACK!

AFAICT the first patch in this series, to force an immediate ACK on
any packet with a CWR, papers over this issue of the forgotten delayed
ACK by forcing an immediate ack in __tcp_ack_snd_check(). This ensures
that at least for CWR packets the forgotten delayed ACK does not
matter. But for packets without CWR I believe the buggy "forgotten
delayed ACK" sequence above is still possible, and could still plague
DCTCP with high tail latencies in some cases. I suspect that after
your CWR-forces-ACK patch it may not be jumping out in performance
test results because (a) if there is no CWR involved then usually the
cwnd is high enough that missing delayed ACK does not cause a stall,
and (b) if there is a CWR involved then your patch forces an immediate
ACK. But AFAICT that forgotten delayed ACK bug is still there.

What do folks think?

neal

^ permalink raw reply

* Re: [bpf PATCH v5 0/4] BPF fixes for sockhash
From: Daniel Borkmann @ 2018-06-30 23:58 UTC (permalink / raw)
  To: John Fastabend, ast, kafai; +Cc: netdev
In-Reply-To: <20180630131506.17344.7833.stgit@john-Precision-Tower-5810>

On 06/30/2018 03:17 PM, John Fastabend wrote:
> This addresses two syzbot issues that lead to identifying (by Eric and
> Wei) a class of bugs where we don't correctly check for IPv4/v6
> sockets and their associated state. The second issue was a locking
> omission in sockhash.
> 
> The first patch addresses IPv6 socks and fixing an error where
> sockhash would overwrite the prot pointer with IPv4 prot. To fix
> this build similar solution to TLS ULP. Although we continue to
> allow socks in all states not just ESTABLISH in this patch set
> because as Martin points out there should be no issue with this
> on the sockmap ULP because we don't use the ctx in this code. Once
> multiple ULPs coexist we may need to revisit this. However we
> can do this in *next trees.
> 
> The other issue syzbot found that the tcp_close() handler missed
> locking the hash bucket lock which could result in corrupting the
> sockhash bucket list if delete and close ran at the same time. 
> And also the smap_list_remove() routine was not working correctly
> at all. This was not caught in my testing because in general my
> tests (to date at least lets add some more robust selftest in
> bpf-next) do things in the "expected" order, create map, add socks,
> delete socks, then tear down maps. The tests we have that do the
> ops out of this order where only working on single maps not multi-
> maps so we never saw the issue. Thanks syzbot. The fix is to
> restructure the tcp_close() lock handling. And fix the obvious
> bug in smap_list_remove().
> 
> Finally, during review I noticed the release handler was omitted
> from the upstream code (patch 4) due to an incorrect merge conflict
> fix when I ported the code to latest bpf-next before submitting.
> This would leave references to the map around if the user never
> closes the map.
> 
> v3: rework patches, dropping ESTABLISH check and adding rcu
>     annotation along with the smap_list_remove fix
> 
> v4: missed one more case where maps was being accessed without
>     the sk_callback_lock, spoted by Martin as well.
> 
> v5: changed to use a specific lock for maps and reduced callback
>     lock so that it is only used to gaurd sk callbacks. I think
>     this makes the logic a bit cleaner and avoids confusion
>     ovoer what each lock is doing.
> 
> Also big thanks to Martin for thorough review he caught at least
> one case where I missed a rcu_call().

Applied it to bpf, thanks John!

^ permalink raw reply

* pull-request: bpf 2018-07-01
From: Daniel Borkmann @ 2018-06-30 23:56 UTC (permalink / raw)
  To: davem; +Cc: daniel, ast, netdev

Hi David,

The following pull-request contains BPF updates for your *net* tree.

The main changes are:

1) A bpf_fib_lookup() helper fix to change the API before freeze to
   return an encoding of the FIB lookup result and return the nexthop
   device index in the params struct (instead of device index as return
   code that we had before), from David.

2) Various BPF JIT fixes to address syzkaller fallout, that is, do not
   reject progs when set_memory_*() fails since it could still be RO.
   Also arm32 JIT was not using bpf_jit_binary_lock_ro() API which was
   an issue, and a memory leak in s390 JIT found during review, from
   Daniel.

3) Multiple fixes for sockmap/hash to address most of the syzkaller
   triggered bugs. Usage with IPv6 was crashing, a GPF in bpf_tcp_close(),
   a missing sock_map_release() routine to hook up to callbacks, and a
   fix for an omitted bucket lock in sock_close(), from John.

4) Two bpftool fixes to remove duplicated error message on program load,
   and another one to close the libbpf object after program load. One
   additional fix for nfp driver's BPF offload to avoid stopping offload
   completely if replace of program failed, from Jakub.

5) Couple of BPF selftest fixes that bail out in some of the test
   scripts if the user does not have the right privileges, from Jeffrin.

6) Fixes in test_bpf for s390 when CONFIG_BPF_JIT_ALWAYS_ON is set
   where we need to set the flag that some of the test cases are expected
   to fail, from Kleber.

7) Fix to detangle BPF_LIRC_MODE2 dependency from CONFIG_CGROUP_BPF
   since it has no relation to it and lirc2 users often have configs
   without cgroups enabled and thus would not be able to use it, from Sean.

8) Fix a selftest failure in sockmap by removing a useless setrlimit()
   call that would set a too low limit where at the same time we are
   already including bpf_rlimit.h that does the job, from Yonghong.

9) Fix BPF selftest config with missing missing NET_SCHED, from Anders.

Please consider pulling these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git

Thanks a lot!

----------------------------------------------------------------

The following changes since commit 3739a21e0ef6ac06f46bd38e81daa95e8cb462bc:

  selftests: net: add tcp_inq to gitignore (2018-06-21 15:02:32 +0900)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git 

for you to fetch changes up to bf2b866a2fe2d74558fe4b7bdf63a4bc0afbdf70:

  Merge branch 'bpf-sockmap-fixes' (2018-07-01 01:21:33 +0200)

----------------------------------------------------------------
Alexei Starovoitov (1):
      Merge branch 'bpf-fixes'

Anders Roxell (1):
      selftests: bpf: add missing NET_SCHED to config

Daniel Borkmann (5):
      Merge branch 'bpf-bpftool-fixes'
      bpf, arm32: fix to use bpf_jit_binary_lock_ro api
      bpf, s390: fix potential memleak when later bpf_jit_prog fails
      bpf: undo prog rejection on read-only lock failure
      Merge branch 'bpf-sockmap-fixes'

David Ahern (1):
      bpf: Change bpf_fib_lookup to return lookup status

Jakub Kicinski (3):
      tools: bpftool: remove duplicated error message on prog load
      tools: bpftool: remember to close the libbpf object after prog load
      nfp: bpf: don't stop offload if replace failed

Jeffrin Jose T (3):
      selftests: bpf: notification about privilege required to run test_kmod.sh testing script
      selftests: bpf: notification about privilege required to run test_lirc_mode2.sh testing script
      selftests: bpf: notification about privilege required to run test_lwt_seg6local.sh testing script

John Fastabend (4):
      bpf: sockmap, fix crash when ipv6 sock is added
      bpf: sockmap, fix smap_list_map_remove when psock is in many maps
      bpf: sockhash fix omitted bucket lock in sock_close
      bpf: sockhash, add release routine

Kleber Sacilotto de Souza (1):
      test_bpf: flag tests that cannot be jited on s390

Sean Young (1):
      bpf: fix attach type BPF_LIRC_MODE2 dependency wrt CONFIG_CGROUP_BPF

Yonghong Song (1):
      tools/bpf: fix test_sockmap failure

 arch/arm/net/bpf_jit_32.c                         |   2 +-
 arch/s390/net/bpf_jit_comp.c                      |   1 +
 drivers/media/rc/bpf-lirc.c                       |  14 +-
 drivers/net/ethernet/netronome/nfp/bpf/main.c     |   6 +-
 include/linux/bpf-cgroup.h                        |  26 +++
 include/linux/bpf.h                               |   8 +
 include/linux/bpf_lirc.h                          |   5 +-
 include/linux/filter.h                            |  56 +----
 include/uapi/linux/bpf.h                          |  28 ++-
 kernel/bpf/cgroup.c                               |  54 +++++
 kernel/bpf/core.c                                 |  30 +--
 kernel/bpf/sockmap.c                              | 254 ++++++++++++++++------
 kernel/bpf/syscall.c                              |  99 ++-------
 lib/test_bpf.c                                    |  20 ++
 net/core/filter.c                                 |  86 +++++---
 samples/bpf/xdp_fwd_kern.c                        |   8 +-
 tools/bpf/bpftool/prog.c                          |  12 +-
 tools/testing/selftests/bpf/config                |   1 +
 tools/testing/selftests/bpf/test_kmod.sh          |   9 +
 tools/testing/selftests/bpf/test_lirc_mode2.sh    |   9 +
 tools/testing/selftests/bpf/test_lwt_seg6local.sh |   9 +
 tools/testing/selftests/bpf/test_sockmap.c        |   6 -
 22 files changed, 449 insertions(+), 294 deletions(-)

^ permalink raw reply

* Re: [PATCH bpf] bpf: hash_map: decrement counter on error
From: Daniel Borkmann @ 2018-06-30 23:20 UTC (permalink / raw)
  To: Mauricio Vasquez B, Alexei Starovoitov; +Cc: netdev
In-Reply-To: <1530276500-29765-1-git-send-email-mauricio.vasquez@polito.it>

On 06/29/2018 02:48 PM, Mauricio Vasquez B wrote:
> Decrement the number of elements in the map in case the allocation
> of a new node fails.
> 
> Signed-off-by: Mauricio Vasquez B <mauricio.vasquez@polito.it>

Thanks for the fix, Mauricio!

Could you reply with a Fixes: tag in order to track the commit originally
introducing this bug?

Thanks,
Daniel

^ permalink raw reply

* Re: [PATCH bpf-next 0/8] tools: bpf: updates to bpftool and libbpf
From: Daniel Borkmann @ 2018-06-30 23:03 UTC (permalink / raw)
  To: Jakub Kicinski, alexei.starovoitov; +Cc: oss-drivers, netdev
In-Reply-To: <20180628214142.11268-1-jakub.kicinski@netronome.com>

On 06/28/2018 11:41 PM, Jakub Kicinski wrote:
> Hi!
> 
> Set of random updates to bpftool and libbpf.  I'm preparing for
> extending bpftool prog load, but there is a good number of
> improvements that can be made before bpf -> bpf-next merge
> helping to keep the later patch set to a manageable size as well.
> 
> First patch is a bpftool build speed improvement.  Next missing
> program types are added to libbpf program type detection by section
> name.  The ability to load programs from '.text' section is restored
> when ELF file doesn't contain any pseudo calls.
> 
> In bpftool I remove my Author comments as unnecessary sign of vanity.
> Last but not least missing option is added to bash completions and
> processing of options in bash completions is improved.

Applied to bpf-next, thanks Jakub!

^ permalink raw reply

* [PATCH net] ipv6: sr: fix passing wrong flags to crypto_alloc_shash()
From: Eric Biggers @ 2018-06-30 22:26 UTC (permalink / raw)
  To: netdev, David S . Miller; +Cc: David Lebrun, linux-crypto, Eric Biggers

From: Eric Biggers <ebiggers@google.com>

The 'mask' argument to crypto_alloc_shash() uses the CRYPTO_ALG_* flags,
not 'gfp_t'.  So don't pass GFP_KERNEL to it.

Fixes: bf355b8d2c30 ("ipv6: sr: add core files for SR HMAC support")
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 net/ipv6/seg6_hmac.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
index 33fb35cbfac13..558fe8cc6d438 100644
--- a/net/ipv6/seg6_hmac.c
+++ b/net/ipv6/seg6_hmac.c
@@ -373,7 +373,7 @@ static int seg6_hmac_init_algo(void)
 			return -ENOMEM;
 
 		for_each_possible_cpu(cpu) {
-			tfm = crypto_alloc_shash(algo->name, 0, GFP_KERNEL);
+			tfm = crypto_alloc_shash(algo->name, 0, 0);
 			if (IS_ERR(tfm))
 				return PTR_ERR(tfm);
 			p_tfm = per_cpu_ptr(algo->tfms, cpu);
-- 
2.18.0

^ permalink raw reply related

* [PATCH net-next] r8169: remove old PHY reset hack
From: Heiner Kallweit @ 2018-06-30 22:25 UTC (permalink / raw)
  To: Realtek linux nic maintainers, David Miller; +Cc: netdev@vger.kernel.org

This hack (affecting the non-PCIe models only) was introduced in 2004
to deal with link negotiation failures in 1GBit mode. Based on a
comment in the r8169 vendor driver I assume the issue affects RTL8169sb
in combination with particular 1GBit switch models.

Resetting the PHY every 10s and hoping that one fine day we will make
it to establish the link seems to be very hacky to me. I'd say:
If 1GBit doesn't work reliably in a users environment then the user
should remove 1GBit from the advertised modes, e.g. by using
ethtool -s <if> advertise <10/100 modes>

If the issue affects one chip version only and that with most link
partners, then we could also think of removing 1GBit from the
advertised modes for this chip version in the driver.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
 drivers/net/ethernet/realtek/r8169.c | 57 +---------------------------
 1 file changed, 1 insertion(+), 56 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 72a7778b..b0a06902 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -80,7 +80,6 @@ static const int multicast_filter_limit = 32;
 #define R8169_RX_RING_BYTES	(NUM_RX_DESC * sizeof(struct RxDesc))
 
 #define RTL8169_TX_TIMEOUT	(6*HZ)
-#define RTL8169_PHY_TIMEOUT	(10*HZ)
 
 /* write/read MMIO register */
 #define RTL_W8(tp, reg, val8)	writeb((val8), tp->mmio_addr + (reg))
@@ -703,7 +702,6 @@ enum rtl_flag {
 	RTL_FLAG_TASK_ENABLED,
 	RTL_FLAG_TASK_SLOW_PENDING,
 	RTL_FLAG_TASK_RESET_PENDING,
-	RTL_FLAG_TASK_PHY_PENDING,
 	RTL_FLAG_MAX
 };
 
@@ -731,7 +729,6 @@ struct rtl8169_private {
 	dma_addr_t RxPhyAddr;
 	void *Rx_databuff[NUM_RX_DESC];	/* Rx data buffers */
 	struct ring_info tx_skb[NUM_TX_DESC];	/* Tx data buffers */
-	struct timer_list timer;
 	u16 cp_cmd;
 
 	u16 event_slow;
@@ -1788,20 +1785,7 @@ static int rtl8169_set_speed_xmii(struct net_device *dev,
 static int rtl8169_set_speed(struct net_device *dev,
 			     u8 autoneg, u16 speed, u8 duplex, u32 advertising)
 {
-	struct rtl8169_private *tp = netdev_priv(dev);
-	int ret;
-
-	ret = rtl8169_set_speed_xmii(dev, autoneg, speed, duplex, advertising);
-	if (ret < 0)
-		goto out;
-
-	if (netif_running(dev) && (autoneg == AUTONEG_ENABLE) &&
-	    (advertising & ADVERTISED_1000baseT_Full) &&
-	    !pci_is_pcie(tp->pci_dev)) {
-		mod_timer(&tp->timer, jiffies + RTL8169_PHY_TIMEOUT);
-	}
-out:
-	return ret;
+	return rtl8169_set_speed_xmii(dev, autoneg, speed, duplex, advertising);
 }
 
 static netdev_features_t rtl8169_fix_features(struct net_device *dev,
@@ -1888,8 +1872,6 @@ static int rtl8169_set_link_ksettings(struct net_device *dev,
 	    cmd->link_modes.advertising))
 		return -EINVAL;
 
-	del_timer_sync(&tp->timer);
-
 	rtl_lock_work(tp);
 	rc = rtl8169_set_speed(dev, cmd->base.autoneg, cmd->base.speed,
 			       cmd->base.duplex, advertising);
@@ -4293,44 +4275,12 @@ static void rtl_hw_phy_config(struct net_device *dev)
 	}
 }
 
-static void rtl_phy_work(struct rtl8169_private *tp)
-{
-	struct timer_list *timer = &tp->timer;
-	unsigned long timeout = RTL8169_PHY_TIMEOUT;
-
-	if (rtl8169_xmii_reset_pending(tp)) {
-		/*
-		 * A busy loop could burn quite a few cycles on nowadays CPU.
-		 * Let's delay the execution of the timer for a few ticks.
-		 */
-		timeout = HZ/10;
-		goto out_mod_timer;
-	}
-
-	if (rtl8169_xmii_link_ok(tp))
-		return;
-
-	netif_dbg(tp, link, tp->dev, "PHY reset until link up\n");
-
-	rtl8169_xmii_reset_enable(tp);
-
-out_mod_timer:
-	mod_timer(timer, jiffies + timeout);
-}
-
 static void rtl_schedule_task(struct rtl8169_private *tp, enum rtl_flag flag)
 {
 	if (!test_and_set_bit(flag, tp->wk.flags))
 		schedule_work(&tp->wk.work);
 }
 
-static void rtl8169_phy_timer(struct timer_list *t)
-{
-	struct rtl8169_private *tp = from_timer(tp, t, timer);
-
-	rtl_schedule_task(tp, RTL_FLAG_TASK_PHY_PENDING);
-}
-
 DECLARE_RTL_COND(rtl_phy_reset_cond)
 {
 	return rtl8169_xmii_reset_pending(tp);
@@ -6909,7 +6859,6 @@ static void rtl_task(struct work_struct *work)
 		/* XXX - keep rtl_slow_event_work() as first element. */
 		{ RTL_FLAG_TASK_SLOW_PENDING,	rtl_slow_event_work },
 		{ RTL_FLAG_TASK_RESET_PENDING,	rtl_reset_work },
-		{ RTL_FLAG_TASK_PHY_PENDING,	rtl_phy_work }
 	};
 	struct rtl8169_private *tp =
 		container_of(work, struct rtl8169_private, wk.work);
@@ -6982,8 +6931,6 @@ static void rtl8169_down(struct net_device *dev)
 {
 	struct rtl8169_private *tp = netdev_priv(dev);
 
-	del_timer_sync(&tp->timer);
-
 	napi_disable(&tp->napi);
 	netif_stop_queue(dev);
 
@@ -7694,8 +7641,6 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	tp->event_slow = cfg->event_slow;
 	tp->coalesce_info = cfg->coalesce_info;
 
-	timer_setup(&tp->timer, rtl8169_phy_timer, 0);
-
 	tp->rtl_fw = RTL_FIRMWARE_UNKNOWN;
 
 	tp->counters = dmam_alloc_coherent (&pdev->dev, sizeof(*tp->counters),
-- 
2.18.0

^ permalink raw reply related

* Re: pull-request: mac80211-next 2018-06-29
From: Johannes Berg @ 2018-06-30 20:45 UTC (permalink / raw)
  To: David Miller
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20180630.211728.888180353537471428.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>

On Sat, 2018-06-30 at 21:17 +0900, David Miller wrote:
> From: Johannes Berg <johannes-cdvu00un1VgdHxzADdlk8Q@public.gmane.org>
> Date: Fri, 29 Jun 2018 13:59:05 +0200
> 
> > Here's the next round of -next updates. This includes the promised
> > HE stuff but it's not all that big in the end, most of it is just
> > additions to the protocol header file :-)
> 
> :-)
> 
> > Please pull and let me know if there's any problem.
> 
> There was a small merge conflict in net/mac80211/scan.c, I arranged
> it such that the kcalloc() conversion was preserved.

Huh, sorry, hadn't seen that massive patch, I see it now.

> Please send me any necessary fixups if I didn't do it properly.

No need, looks fine. I diffed it to my tree and the only change is the
kcalloc conversion, as expected.

Thanks!

johannes

^ permalink raw reply

* [PATCH][next] netdevsim: fix sa_idx out of bounds check
From: Colin King @ 2018-06-30 20:39 UTC (permalink / raw)
  To: Jakub Kicinski, David S . Miller, netdev; +Cc: kernel-janitors, linux-kernel

From: Colin Ian King <colin.king@canonical.com>

Currently if sa_idx is equal to NSIM_IPSEC_MAX_SA_COUNT then
an out-of-bounds read on ipsec->sa will occur. Fix the
incorrect bounds check by using >= rather than >.

Detected by CoverityScan, CID#1470226 ("Out-of-bounds-read")

Fixes: 7699353da875 ("netdevsim: add ipsec offload testing")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
 drivers/net/netdevsim/ipsec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/netdevsim/ipsec.c b/drivers/net/netdevsim/ipsec.c
index ceff544510b9..2dcf6cc269d0 100644
--- a/drivers/net/netdevsim/ipsec.c
+++ b/drivers/net/netdevsim/ipsec.c
@@ -249,7 +249,7 @@ bool nsim_ipsec_tx(struct netdevsim *ns, struct sk_buff *skb)
 	}
 
 	sa_idx = xs->xso.offload_handle & ~NSIM_IPSEC_VALID;
-	if (unlikely(sa_idx > NSIM_IPSEC_MAX_SA_COUNT)) {
+	if (unlikely(sa_idx >= NSIM_IPSEC_MAX_SA_COUNT)) {
 		netdev_err(ns->netdev, "bad sa_idx=%d max=%d\n",
 			   sa_idx, NSIM_IPSEC_MAX_SA_COUNT);
 		return false;
-- 
2.17.1

^ permalink raw reply related

* Re: Anyone know if strongswan works with vrf?
From: David Ahern @ 2018-06-30 20:24 UTC (permalink / raw)
  To: Ben Greear, netdev
In-Reply-To: <95b83150-a6dc-f43b-f605-bc7c7f5986e8@candelatech.com>

On 6/29/18 4:10 PM, Ben Greear wrote:
> Hello,
> 
> We're trying to create lots of strongswan VPN tunnels on network devices
> bound to different VRFs.  We are using Fedora-24 on the client side,
> with a 4.16.15+ kernel
> and updated 'ip' package, etc.
> 
> So far, no luck getting it to work.
> 
> Any idea if this is supported or not?

Kernel side xfrm code does work; been a couple of years since I tried
it. As I recall strongswan needs an update.

Looking at the 'ip xfrm' based scripts, you need to add 'sel dev ${VRF}'
to the state. eg.,

VRF="sel dev blue"

ip xfrm state add src ${REMIP} dst ${MYIP} \
    proto esp spi 0x02122b77 reqid 0 mode tunnel \
    replay-window 4 replay-oseq 0x4 \
    auth-trunc 'hmac(md5)' 0xd94fcfea65fddf21dc6e0d24a0253508 96 \
    enc 'cbc(des3_ede)' 0xfc46c20f8048be9725930ff3fb07ac2a91f0347dffeacf62 \
    ${VRF}

^ permalink raw reply

* Re: [net-next 01/12] net/mlx5e: Add UDP GSO support
From: Boris Pismenny @ 2018-06-30 19:40 UTC (permalink / raw)
  To: Willem de Bruijn, Saeed Mahameed
  Cc: David Miller, Network Development, ogerlitz, yossiku,
	Alexander Duyck
In-Reply-To: <ccec1043-2223-b9d8-e73e-3e20987c634d@mellanox.com>


On 06/30/18 19:06, Boris Pismenny wrote:
>
>
> On 06/30/18 01:19, Willem de Bruijn wrote:
>> On Fri, Jun 29, 2018 at 2:24 AM Saeed Mahameed <saeedm@mellanox.com> 
>> wrote:
>>> From: Boris Pismenny <borisp@mellanox.com>
>>>
>>> This patch enables UDP GSO support. We enable this by using two WQEs
>>> the first is a UDP LSO WQE for all segments with equal length, and the
>>> second is for the last segment in case it has different length.
>>> Due to HW limitation, before sending, we must adjust the packet 
>>> length fields.
>>>
>>> We measure performance between two Intel(R) Xeon(R) CPU E5-2643 v2 
>>> @3.50GHz
>>> machines connected back-to-back with Connectx4-Lx (40Gbps) NICs.
>>> We compare single stream UDP, UDP GSO and UDP GSO with offload.
>>> Performance:
>>>                  | MSS (bytes)   | Throughput (Gbps)     | CPU 
>>> utilization (%)
>>> UDP GSO offload | 1472          | 35.6                  | 8%
>>> UDP GSO         | 1472          | 25.5                  | 17%
>>> UDP             | 1472          | 10.2                  | 17%
>>> UDP GSO offload | 1024          | 35.6                  | 8%
>>> UDP GSO         | 1024          | 19.2                  | 17%
>>> UDP             | 1024          | 5.7                   | 17%
>>> UDP GSO offload | 512           | 33.8                  | 16%
>>> UDP GSO         | 512           | 10.4                  | 17%
>>> UDP             | 512           | 3.5                   | 17%
>> Very nice results :)
> Thanks. We expect to have 100Gbps results by netdev0x12.
>
>>> +static void mlx5e_udp_gso_prepare_last_skb(struct sk_buff *skb,
>>> +                                          struct sk_buff *nskb,
>>> +                                          int remaining)
>>> +{
>>> +       int bytes_needed = remaining, remaining_headlen, 
>>> remaining_page_offset;
>>> +       int headlen = skb_transport_offset(skb) + sizeof(struct 
>>> udphdr);
>>> +       int payload_len = remaining + sizeof(struct udphdr);
>>> +       int k = 0, i, j;
>>> +
>>> +       skb_copy_bits(skb, 0, nskb->data, headlen);
>>> +       nskb->dev = skb->dev;
>>> +       skb_reset_mac_header(nskb);
>>> +       skb_set_network_header(nskb, skb_network_offset(skb));
>>> +       skb_set_transport_header(nskb, skb_transport_offset(skb));
>>> +       skb_set_tail_pointer(nskb, headlen);
>>> +
>>> +       /* How many frags do we need? */
>>> +       for (i = skb_shinfo(skb)->nr_frags - 1; i >= 0; i--) {
>>> +               bytes_needed -= 
>>> skb_frag_size(&skb_shinfo(skb)->frags[i]);
>>> +               k++;
>>> +               if (bytes_needed <= 0)
>>> +                       break;
>>> +       }
>>> +
>>> +       /* Fill the first frag and split it if necessary */
>>> +       j = skb_shinfo(skb)->nr_frags - k;
>>> +       remaining_page_offset = -bytes_needed;
>>> +       skb_fill_page_desc(nskb, 0,
>>> + skb_shinfo(skb)->frags[j].page.p,
>>> + skb_shinfo(skb)->frags[j].page_offset + remaining_page_offset,
>>> +                          skb_shinfo(skb)->frags[j].size - 
>>> remaining_page_offset);
>>> +
>>> +       skb_frag_ref(skb, j);
>>> +
>>> +       /* Fill the rest of the frags */
>>> +       for (i = 1; i < k; i++) {
>>> +               j = skb_shinfo(skb)->nr_frags - k + i;
>>> +
>>> +               skb_fill_page_desc(nskb, i,
>>> + skb_shinfo(skb)->frags[j].page.p,
>>> + skb_shinfo(skb)->frags[j].page_offset,
>>> + skb_shinfo(skb)->frags[j].size);
>>> +               skb_frag_ref(skb, j);
>>> +       }
>>> +       skb_shinfo(nskb)->nr_frags = k;
>>> +
>>> +       remaining_headlen = remaining - skb->data_len;
>>> +
>>> +       /* headlen contains remaining data? */
>>> +       if (remaining_headlen > 0)
>>> +               skb_copy_bits(skb, skb->len - remaining, nskb->data 
>>> + headlen,
>>> +                             remaining_headlen);
>>> +       nskb->len = remaining + headlen;
>>> +       nskb->data_len =  payload_len - sizeof(struct udphdr) +
>>> +               max_t(int, 0, remaining_headlen);
>>> +       nskb->protocol = skb->protocol;
>>> +       if (nskb->protocol == htons(ETH_P_IP)) {
>>> +               ip_hdr(nskb)->id = htons(ntohs(ip_hdr(nskb)->id) +
>>> + skb_shinfo(skb)->gso_segs);
>>> +               ip_hdr(nskb)->tot_len =
>>> +                       htons(payload_len + sizeof(struct iphdr));
>>> +       } else {
>>> +               ipv6_hdr(nskb)->payload_len = htons(payload_len);
>>> +       }
>>> +       udp_hdr(nskb)->len = htons(payload_len);
>>> +       skb_shinfo(nskb)->gso_size = 0;
>>> +       nskb->ip_summed = skb->ip_summed;
>>> +       nskb->csum_start = skb->csum_start;
>>> +       nskb->csum_offset = skb->csum_offset;
>>> +       nskb->queue_mapping = skb->queue_mapping;
>>> +}
>>> +
>>> +/* might send skbs and update wqe and pi */
>>> +struct sk_buff *mlx5e_udp_gso_handle_tx_skb(struct net_device *netdev,
>>> +                                           struct mlx5e_txqsq *sq,
>>> +                                           struct sk_buff *skb,
>>> +                                           struct mlx5e_tx_wqe **wqe,
>>> +                                           u16 *pi)
>>> +{
>>> +       int payload_len = skb_shinfo(skb)->gso_size + sizeof(struct 
>>> udphdr);
>>> +       int headlen = skb_transport_offset(skb) + sizeof(struct 
>>> udphdr);
>>> +       int remaining = (skb->len - headlen) % 
>>> skb_shinfo(skb)->gso_size;
>>> +       struct sk_buff *nskb;
>>> +
>>> +       if (skb->protocol == htons(ETH_P_IP))
>>> +               ip_hdr(skb)->tot_len = htons(payload_len + 
>>> sizeof(struct iphdr));
>>> +       else
>>> +               ipv6_hdr(skb)->payload_len = htons(payload_len);
>>> +       udp_hdr(skb)->len = htons(payload_len);
>>> +       if (!remaining)
>>> +               return skb;
>>> +
>>> +       nskb = alloc_skb(max_t(int, headlen, headlen + remaining - 
>>> skb->data_len), GFP_ATOMIC);
>>> +       if (unlikely(!nskb)) {
>>> +               sq->stats->dropped++;
>>> +               return NULL;
>>> +       }
>>> +
>>> +       mlx5e_udp_gso_prepare_last_skb(skb, nskb, remaining);
>>> +
>>> +       skb_shinfo(skb)->gso_segs--;
>>> +       pskb_trim(skb, skb->len - remaining);
>>> +       mlx5e_sq_xmit(sq, skb, *wqe, *pi);
>>> +       mlx5e_sq_fetch_wqe(sq, wqe, pi);
>>> +       return nskb;
>>> +}
>> The device driver seems to be implementing the packet split here
>> similar to NETIF_F_GSO_PARTIAL. When advertising the right flag, the
>> stack should be able to do that for you and pass two packets to the
>> driver.
>
> We've totally missed that!
> Thank you. I'll fix and resubmit.

I've noticed that we could get cleaner code in our driver if we remove 
these two lines from net/ipv4/udp_offload.c:
if (skb_is_gso(segs))
              mss *= skb_shinfo(segs)->gso_segs;

I think that this is correct in case of GSO_PARTIAL segmentation for the 
following reasons:
1. After this change the UDP payload field is consistent with the IP 
header payload length field. Currently, IPv4 length is 1500 and UDP 
total length is the full unsegmented length.
2. AFAIU, in GSO_PARTIAL no tunnel headers should be modified except the 
IP ID field, including the UDP length field.
What do you think?

^ permalink raw reply

* Re: [PATCH] [v2] infiniband: i40iw, nes: don't use wall time for TCP sequence numbers
From: kbuild test robot @ 2018-06-30 19:00 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: kbuild-all, Faisal Latif, Shiraz Saleem, Doug Ledford,
	Jason Gunthorpe, David S. Miller, Arnd Bergmann, Henry Orosco,
	Tatyana Nikolova, Mustafa Ismail, Bart Van Assche, Yuval Shaia,
	linux-rdma, linux-kernel, netdev
In-Reply-To: <20180627132628.915978-1-arnd@arndb.de>

[-- Attachment #1: Type: text/plain, Size: 6089 bytes --]

Hi Arnd,

I love your patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v4.18-rc2 next-20180629]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Arnd-Bergmann/infiniband-i40iw-nes-don-t-use-wall-time-for-TCP-sequence-numbers/20180627-221142
config: x86_64-randconfig-s2-06302231 (attached as .config)
compiler: gcc-6 (Debian 6.4.0-9) 6.4.0 20171026
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All errors (new ones prefixed by >>):

   drivers/infiniband/hw/i40iw/i40iw_cm.o: In function `i40iw_make_cm_node':
>> drivers/infiniband/hw/i40iw/i40iw_cm.c:2232: undefined reference to `secure_tcpv6_seq'

vim +2232 drivers/infiniband/hw/i40iw/i40iw_cm.c

  2153	
  2154	/**
  2155	 * i40iw_make_cm_node - create a new instance of a cm node
  2156	 * @cm_core: cm's core
  2157	 * @iwdev: iwarp device structure
  2158	 * @cm_info: quad info for connection
  2159	 * @listener: passive connection's listener
  2160	 */
  2161	static struct i40iw_cm_node *i40iw_make_cm_node(
  2162					   struct i40iw_cm_core *cm_core,
  2163					   struct i40iw_device *iwdev,
  2164					   struct i40iw_cm_info *cm_info,
  2165					   struct i40iw_cm_listener *listener)
  2166	{
  2167		struct i40iw_cm_node *cm_node;
  2168		int oldarpindex;
  2169		int arpindex;
  2170		struct net_device *netdev = iwdev->netdev;
  2171	
  2172		/* create an hte and cm_node for this instance */
  2173		cm_node = kzalloc(sizeof(*cm_node), GFP_ATOMIC);
  2174		if (!cm_node)
  2175			return NULL;
  2176	
  2177		/* set our node specific transport info */
  2178		cm_node->ipv4 = cm_info->ipv4;
  2179		cm_node->vlan_id = cm_info->vlan_id;
  2180		if ((cm_node->vlan_id == I40IW_NO_VLAN) && iwdev->dcb)
  2181			cm_node->vlan_id = 0;
  2182		cm_node->tos = cm_info->tos;
  2183		cm_node->user_pri = cm_info->user_pri;
  2184		if (listener) {
  2185			if (listener->tos != cm_info->tos)
  2186				i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_DCB,
  2187					    "application TOS[%d] and remote client TOS[%d] mismatch\n",
  2188					     listener->tos, cm_info->tos);
  2189			cm_node->tos = max(listener->tos, cm_info->tos);
  2190			cm_node->user_pri = rt_tos2priority(cm_node->tos);
  2191			i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_DCB, "listener: TOS:[%d] UP:[%d]\n",
  2192				    cm_node->tos, cm_node->user_pri);
  2193		}
  2194		memcpy(cm_node->loc_addr, cm_info->loc_addr, sizeof(cm_node->loc_addr));
  2195		memcpy(cm_node->rem_addr, cm_info->rem_addr, sizeof(cm_node->rem_addr));
  2196		cm_node->loc_port = cm_info->loc_port;
  2197		cm_node->rem_port = cm_info->rem_port;
  2198	
  2199		cm_node->mpa_frame_rev = iwdev->mpa_version;
  2200		cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO;
  2201		cm_node->ird_size = I40IW_MAX_IRD_SIZE;
  2202		cm_node->ord_size = I40IW_MAX_ORD_SIZE;
  2203	
  2204		cm_node->listener = listener;
  2205		cm_node->cm_id = cm_info->cm_id;
  2206		ether_addr_copy(cm_node->loc_mac, netdev->dev_addr);
  2207		spin_lock_init(&cm_node->retrans_list_lock);
  2208		cm_node->ack_rcvd = false;
  2209	
  2210		atomic_set(&cm_node->ref_count, 1);
  2211		/* associate our parent CM core */
  2212		cm_node->cm_core = cm_core;
  2213		cm_node->tcp_cntxt.loc_id = I40IW_CM_DEF_LOCAL_ID;
  2214		cm_node->tcp_cntxt.rcv_wscale = I40IW_CM_DEFAULT_RCV_WND_SCALE;
  2215		cm_node->tcp_cntxt.rcv_wnd =
  2216				I40IW_CM_DEFAULT_RCV_WND_SCALED >> I40IW_CM_DEFAULT_RCV_WND_SCALE;
  2217		if (cm_node->ipv4) {
  2218			cm_node->tcp_cntxt.loc_seq_num = secure_tcp_seq(htonl(cm_node->loc_addr[0]),
  2219								htonl(cm_node->rem_addr[0]),
  2220								htons(cm_node->loc_port),
  2221								htons(cm_node->rem_port));
  2222			cm_node->tcp_cntxt.mss = iwdev->vsi.mtu - I40IW_MTU_TO_MSS_IPV4;
  2223		} else {
  2224			__be32 loc[4] = {
  2225				htonl(cm_node->loc_addr[0]), htonl(cm_node->loc_addr[1]),
  2226				htonl(cm_node->loc_addr[2]), htonl(cm_node->loc_addr[3])
  2227			};
  2228			__be32 rem[4] = {
  2229				htonl(cm_node->rem_addr[0]), htonl(cm_node->rem_addr[1]),
  2230				htonl(cm_node->rem_addr[2]), htonl(cm_node->rem_addr[3])
  2231			};
> 2232			cm_node->tcp_cntxt.loc_seq_num = secure_tcpv6_seq(loc, rem,
  2233								htons(cm_node->loc_port),
  2234								htons(cm_node->rem_port));
  2235			cm_node->tcp_cntxt.mss = iwdev->vsi.mtu - I40IW_MTU_TO_MSS_IPV6;
  2236		}
  2237	
  2238		cm_node->iwdev = iwdev;
  2239		cm_node->dev = &iwdev->sc_dev;
  2240	
  2241		if ((cm_node->ipv4 &&
  2242		     i40iw_ipv4_is_loopback(cm_node->loc_addr[0], cm_node->rem_addr[0])) ||
  2243		     (!cm_node->ipv4 && i40iw_ipv6_is_loopback(cm_node->loc_addr,
  2244							       cm_node->rem_addr))) {
  2245			arpindex = i40iw_arp_table(iwdev,
  2246						   cm_node->rem_addr,
  2247						   false,
  2248						   NULL,
  2249						   I40IW_ARP_RESOLVE);
  2250		} else {
  2251			oldarpindex = i40iw_arp_table(iwdev,
  2252						      cm_node->rem_addr,
  2253						      false,
  2254						      NULL,
  2255						      I40IW_ARP_RESOLVE);
  2256			if (cm_node->ipv4)
  2257				arpindex = i40iw_addr_resolve_neigh(iwdev,
  2258								    cm_info->loc_addr[0],
  2259								    cm_info->rem_addr[0],
  2260								    oldarpindex);
  2261			else if (IS_ENABLED(CONFIG_IPV6))
  2262				arpindex = i40iw_addr_resolve_neigh_ipv6(iwdev,
  2263									 cm_info->loc_addr,
  2264									 cm_info->rem_addr,
  2265									 oldarpindex);
  2266			else
  2267				arpindex = -EINVAL;
  2268		}
  2269		if (arpindex < 0) {
  2270			i40iw_pr_err("cm_node arpindex\n");
  2271			kfree(cm_node);
  2272			return NULL;
  2273		}
  2274		ether_addr_copy(cm_node->rem_mac, iwdev->arp_table[arpindex].mac_addr);
  2275		i40iw_add_hte_node(cm_core, cm_node);
  2276		cm_core->stats_nodes_created++;
  2277		return cm_node;
  2278	}
  2279	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 31721 bytes --]

^ permalink raw reply

* Re: [PATCH net-next 2/2] tcp: ack immediately when a cwr packet arrives
From: Neal Cardwell @ 2018-06-30 18:23 UTC (permalink / raw)
  To: Lawrence Brakmo
  Cc: Netdev, Kernel Team, bmatheny, ast, Yuchung Cheng, Steve Ibanez,
	Eric Dumazet
In-Reply-To: <20180630014815.2881895-3-brakmo@fb.com>

On Fri, Jun 29, 2018 at 9:48 PM Lawrence Brakmo <brakmo@fb.com> wrote:
>
> We observed high 99 and 99.9% latencies when doing RPCs with DCTCP. The
> problem is triggered when the last packet of a request arrives CE
> marked. The reply will carry the ECE mark causing TCP to shrink its cwnd
> to 1 (because there are no packets in flight). When the 1st packet of
> the next request arrives it was sometimes delayed adding up to 40ms to
> the latency.
>
> This patch insures that CWR makred data packets arriving will be acked
> immediately.
>
> Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
> ---

Thanks, Larry. Ensuring that CWR-marked data packets arriving will be
acked immediately sounds like a good goal to me.

I am wondering if we can have a simpler fix.

The dctcp_ce_state_0_to_1() code is setting the TCP_ECN_DEMAND_CWR
bit in ecn_flags, which disables the code in __tcp_ecn_check_ce() that
would have otherwise used the tcp_enter_quickack_mode() mechanism to
force an ACK:

__tcp_ecn_check_ce():
...
case INET_ECN_CE:
  if (tcp_ca_needs_ecn(sk))
    tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
       // -> dctcp_ce_state_0_to_1()
       //     ->  tp->ecn_flags |= TCP_ECN_DEMAND_CWR;

  if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
    /* Better not delay acks, sender can have a very low cwnd */
    tcp_enter_quickack_mode(sk, 1);
    tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
  }
  tp->ecn_flags |= TCP_ECN_SEEN;
  break;

So it seems like the bug here may be that dctcp_ce_state_0_to_1()  is
setting the TCP_ECN_DEMAND_CWR  bit in ecn_flags, when really it
should let its caller, __tcp_ecn_check_ce() set TCP_ECN_DEMAND_CWR, in
which case the code would already properly force an immediate ACK.

So, what if we use this fix instead (not compiled, not tested):

diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 5f5e5936760e..4fecd2824edb 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -152,8 +152,6 @@ static void dctcp_ce_state_0_to_1(struct sock *sk)

        ca->prior_rcv_nxt = tp->rcv_nxt;
        ca->ce_state = 1;
-
-       tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
 }

 static void dctcp_ce_state_1_to_0(struct sock *sk)

What do you think?

neal

^ permalink raw reply related

* [PATCH v2 4/4] Hook into set_rx_mode to admit multicast traffic
From: Miguel Rodríguez Pérez @ 2018-06-30 17:32 UTC (permalink / raw)
  To: linux-usb, netdev

We set set_rx_mode to usbnet_cdc_update_filter provided
by cdc_ether that simply admits all multicast traffic
if there is more than one multicast filter configured.
---
 drivers/net/usb/cdc_ncm.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c
index d6b51e2b9495..fda0af0b5d3c 100644
--- a/drivers/net/usb/cdc_ncm.c
+++ b/drivers/net/usb/cdc_ncm.c
@@ -1652,6 +1652,7 @@ static const struct driver_info cdc_ncm_info = {
 	.status = cdc_ncm_status,
 	.rx_fixup = cdc_ncm_rx_fixup,
 	.tx_fixup = cdc_ncm_tx_fixup,
+	.set_rx_mode = usbnet_cdc_update_filter,
 };

 /* Same as cdc_ncm_info, but with FLAG_WWAN */
@@ -1665,6 +1666,7 @@ static const struct driver_info wwan_info = {
 	.status = cdc_ncm_status,
 	.rx_fixup = cdc_ncm_rx_fixup,
 	.tx_fixup = cdc_ncm_tx_fixup,
+	.set_rx_mode = usbnet_cdc_update_filter,
 };

 /* Same as wwan_info, but with FLAG_NOARP  */
@@ -1678,6 +1680,7 @@ static const struct driver_info wwan_noarp_info = {
 	.status = cdc_ncm_status,
 	.rx_fixup = cdc_ncm_rx_fixup,
 	.tx_fixup = cdc_ncm_tx_fixup,
+	.set_rx_mode = usbnet_cdc_update_filter,
 };

 static const struct usb_device_id cdc_devs[] = {
-- 
2.17.1

-- 
Miguel Rodríguez Pérez
Laboratorio de Redes
EE Telecomunicación – Universidade de Vigo

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox