Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next v2 4/9] bnxt_en: Add EEE setup code.
From: Michael Chan @ 2016-04-05 18:08 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <1459879743-16960-1-git-send-email-michael.chan@broadcom.com>

1. Add bnxt_hwrm_set_eee() function to set up EEE firmware parameters based
on the bp->eee settings.
2. The new function bnxt_eee_config_ok() will check if EEE parameters need
to be modified due to autoneg changes.
3. bnxt_hwrm_set_link_setting() gains a new parameter to update EEE.  If the
parameter is set, it will call bnxt_hwrm_set_eee().

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c         | 58 ++++++++++++++++++++++-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h         |  2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c |  4 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.h |  1 +
 4 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 7442e20..2c3c795 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -4711,7 +4711,30 @@ int bnxt_hwrm_set_pause(struct bnxt *bp)
 	return rc;
 }
 
-int bnxt_hwrm_set_link_setting(struct bnxt *bp, bool set_pause)
+static void bnxt_hwrm_set_eee(struct bnxt *bp,
+			      struct hwrm_port_phy_cfg_input *req)
+{
+	struct ethtool_eee *eee = &bp->eee;
+
+	if (eee->eee_enabled) {
+		u16 eee_speeds;
+		u32 flags = PORT_PHY_CFG_REQ_FLAGS_EEE_ENABLE;
+
+		if (eee->tx_lpi_enabled)
+			flags |= PORT_PHY_CFG_REQ_FLAGS_EEE_TX_LPI_ENABLE;
+		else
+			flags |= PORT_PHY_CFG_REQ_FLAGS_EEE_TX_LPI_DISABLE;
+
+		req->flags |= cpu_to_le32(flags);
+		eee_speeds = bnxt_get_fw_auto_link_speeds(eee->advertised);
+		req->eee_link_speed_mask = cpu_to_le16(eee_speeds);
+		req->tx_lpi_timer = cpu_to_le32(eee->tx_lpi_timer);
+	} else {
+		req->flags |= cpu_to_le32(PORT_PHY_CFG_REQ_FLAGS_EEE_DISABLE);
+	}
+}
+
+int bnxt_hwrm_set_link_setting(struct bnxt *bp, bool set_pause, bool set_eee)
 {
 	struct hwrm_port_phy_cfg_input req = {0};
 
@@ -4720,14 +4743,42 @@ int bnxt_hwrm_set_link_setting(struct bnxt *bp, bool set_pause)
 		bnxt_hwrm_set_pause_common(bp, &req);
 
 	bnxt_hwrm_set_link_common(bp, &req);
+
+	if (set_eee)
+		bnxt_hwrm_set_eee(bp, &req);
 	return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
 }
 
+static bool bnxt_eee_config_ok(struct bnxt *bp)
+{
+	struct ethtool_eee *eee = &bp->eee;
+	struct bnxt_link_info *link_info = &bp->link_info;
+
+	if (!(bp->flags & BNXT_FLAG_EEE_CAP))
+		return true;
+
+	if (eee->eee_enabled) {
+		u32 advertising =
+			_bnxt_fw_to_ethtool_adv_spds(link_info->advertising, 0);
+
+		if (!(link_info->autoneg & BNXT_AUTONEG_SPEED)) {
+			eee->eee_enabled = 0;
+			return false;
+		}
+		if (eee->advertised & ~advertising) {
+			eee->advertised = advertising & eee->supported;
+			return false;
+		}
+	}
+	return true;
+}
+
 static int bnxt_update_phy_setting(struct bnxt *bp)
 {
 	int rc;
 	bool update_link = false;
 	bool update_pause = false;
+	bool update_eee = false;
 	struct bnxt_link_info *link_info = &bp->link_info;
 
 	rc = bnxt_update_link(bp, true);
@@ -4757,8 +4808,11 @@ static int bnxt_update_phy_setting(struct bnxt *bp)
 			update_link = true;
 	}
 
+	if (!bnxt_eee_config_ok(bp))
+		update_eee = true;
+
 	if (update_link)
-		rc = bnxt_hwrm_set_link_setting(bp, update_pause);
+		rc = bnxt_hwrm_set_link_setting(bp, update_pause, update_eee);
 	else if (update_pause)
 		rc = bnxt_hwrm_set_pause(bp);
 	if (rc) {
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 5e83405..a981e2c 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1112,7 +1112,7 @@ int hwrm_send_message_silent(struct bnxt *, void *, u32, int);
 int bnxt_hwrm_set_coal(struct bnxt *);
 int bnxt_hwrm_func_qcaps(struct bnxt *);
 int bnxt_hwrm_set_pause(struct bnxt *);
-int bnxt_hwrm_set_link_setting(struct bnxt *, bool);
+int bnxt_hwrm_set_link_setting(struct bnxt *, bool, bool);
 int bnxt_open_nic(struct bnxt *, bool, bool);
 int bnxt_close_nic(struct bnxt *, bool, bool);
 int bnxt_get_max_rings(struct bnxt *, int *, int *, bool);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index bdc6220..14f0520 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -763,7 +763,7 @@ static u32 bnxt_get_fw_speed(struct net_device *dev, u16 ethtool_speed)
 	return 0;
 }
 
-static u16 bnxt_get_fw_auto_link_speeds(u32 advertising)
+u16 bnxt_get_fw_auto_link_speeds(u32 advertising)
 {
 	u16 fw_speed_mask = 0;
 
@@ -840,7 +840,7 @@ static int bnxt_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 	}
 
 	if (netif_running(dev))
-		rc = bnxt_hwrm_set_link_setting(bp, set_pause);
+		rc = bnxt_hwrm_set_link_setting(bp, set_pause, false);
 
 set_setting_exit:
 	return rc;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.h
index e061f8f..3abc03b 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.h
@@ -14,5 +14,6 @@ extern const struct ethtool_ops bnxt_ethtool_ops;
 
 u32 _bnxt_fw_to_ethtool_adv_spds(u16, u8);
 u32 bnxt_fw_to_ethtool_speed(u16);
+u16 bnxt_get_fw_auto_link_speeds(u32);
 
 #endif
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next v2 2/9] bnxt_en: Improve flow control autoneg with Firmware 1.2.1 interface.
From: Michael Chan @ 2016-04-05 18:08 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <1459879743-16960-1-git-send-email-michael.chan@broadcom.com>

Make use of the new AUTONEG_PAUSE bit in the new interface to better
control autoneg flow control settings, independent of RX and TX
advertisement settings.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c         | 28 +++++++++++++++++++----
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 10 ++++----
 2 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index bfe98cb..2b5a541 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -4557,6 +4557,9 @@ static void
 bnxt_hwrm_set_pause_common(struct bnxt *bp, struct hwrm_port_phy_cfg_input *req)
 {
 	if (bp->link_info.autoneg & BNXT_AUTONEG_FLOW_CTRL) {
+		if (bp->hwrm_spec_code >= 0x10201)
+			req->auto_pause =
+				PORT_PHY_CFG_REQ_AUTO_PAUSE_AUTONEG_PAUSE;
 		if (bp->link_info.req_flow_ctrl & BNXT_LINK_PAUSE_RX)
 			req->auto_pause |= PORT_PHY_CFG_REQ_AUTO_PAUSE_RX;
 		if (bp->link_info.req_flow_ctrl & BNXT_LINK_PAUSE_TX)
@@ -4570,6 +4573,11 @@ bnxt_hwrm_set_pause_common(struct bnxt *bp, struct hwrm_port_phy_cfg_input *req)
 			req->force_pause |= PORT_PHY_CFG_REQ_FORCE_PAUSE_TX;
 		req->enables |=
 			cpu_to_le32(PORT_PHY_CFG_REQ_ENABLES_FORCE_PAUSE);
+		if (bp->hwrm_spec_code >= 0x10201) {
+			req->auto_pause = req->force_pause;
+			req->enables |= cpu_to_le32(
+				PORT_PHY_CFG_REQ_ENABLES_AUTO_PAUSE);
+		}
 	}
 }
 
@@ -4656,7 +4664,8 @@ static int bnxt_update_phy_setting(struct bnxt *bp)
 		return rc;
 	}
 	if ((link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL) &&
-	    link_info->auto_pause_setting != link_info->req_flow_ctrl)
+	    (link_info->auto_pause_setting & BNXT_LINK_PAUSE_BOTH) !=
+	    link_info->req_flow_ctrl)
 		update_pause = true;
 	if (!(link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL) &&
 	    link_info->force_pause_setting != link_info->req_flow_ctrl)
@@ -5825,15 +5834,24 @@ static int bnxt_probe_phy(struct bnxt *bp)
 
 	/*initialize the ethool setting copy with NVM settings */
 	if (BNXT_AUTO_MODE(link_info->auto_mode)) {
-		link_info->autoneg = BNXT_AUTONEG_SPEED |
-				     BNXT_AUTONEG_FLOW_CTRL;
+		link_info->autoneg = BNXT_AUTONEG_SPEED;
+		if (bp->hwrm_spec_code >= 0x10201) {
+			if (link_info->auto_pause_setting &
+			    PORT_PHY_CFG_REQ_AUTO_PAUSE_AUTONEG_PAUSE)
+				link_info->autoneg |= BNXT_AUTONEG_FLOW_CTRL;
+		} else {
+			link_info->autoneg |= BNXT_AUTONEG_FLOW_CTRL;
+		}
 		link_info->advertising = link_info->auto_link_speeds;
-		link_info->req_flow_ctrl = link_info->auto_pause_setting;
 	} else {
 		link_info->req_link_speed = link_info->force_link_speed;
 		link_info->req_duplex = link_info->duplex_setting;
-		link_info->req_flow_ctrl = link_info->force_pause_setting;
 	}
+	if (link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL)
+		link_info->req_flow_ctrl =
+			link_info->auto_pause_setting & BNXT_LINK_PAUSE_BOTH;
+	else
+		link_info->req_flow_ctrl = link_info->force_pause_setting;
 	return rc;
 }
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index f103f9b..99b1740 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -874,7 +874,9 @@ static int bnxt_set_pauseparam(struct net_device *dev,
 			return -EINVAL;
 
 		link_info->autoneg |= BNXT_AUTONEG_FLOW_CTRL;
-		link_info->req_flow_ctrl |= BNXT_LINK_PAUSE_BOTH;
+		if (bp->hwrm_spec_code >= 0x10201)
+			link_info->req_flow_ctrl =
+				PORT_PHY_CFG_REQ_AUTO_PAUSE_AUTONEG_PAUSE;
 	} else {
 		/* when transition from auto pause to force pause,
 		 * force a link change
@@ -882,17 +884,13 @@ static int bnxt_set_pauseparam(struct net_device *dev,
 		if (link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL)
 			link_info->force_link_chng = true;
 		link_info->autoneg &= ~BNXT_AUTONEG_FLOW_CTRL;
-		link_info->req_flow_ctrl &= ~BNXT_LINK_PAUSE_BOTH;
+		link_info->req_flow_ctrl = 0;
 	}
 	if (epause->rx_pause)
 		link_info->req_flow_ctrl |= BNXT_LINK_PAUSE_RX;
-	else
-		link_info->req_flow_ctrl &= ~BNXT_LINK_PAUSE_RX;
 
 	if (epause->tx_pause)
 		link_info->req_flow_ctrl |= BNXT_LINK_PAUSE_TX;
-	else
-		link_info->req_flow_ctrl &= ~BNXT_LINK_PAUSE_TX;
 
 	if (netif_running(dev))
 		rc = bnxt_hwrm_set_pause(bp);
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH net-next 1/8] perf: optimize perf_fetch_caller_regs
From: Alexei Starovoitov @ 2016-04-05 17:41 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Steven Rostedt, David S . Miller, Ingo Molnar, Daniel Borkmann,
	Arnaldo Carvalho de Melo, Wang Nan, Josef Bacik, Brendan Gregg,
	netdev, linux-kernel, kernel-team
In-Reply-To: <20160405120626.GM3448@twins.programming.kicks-ass.net>

On 4/5/16 5:06 AM, Peter Zijlstra wrote:
> On Mon, Apr 04, 2016 at 09:52:47PM -0700, Alexei Starovoitov wrote:
>> avoid memset in perf_fetch_caller_regs, since it's the critical path of all tracepoints.
>> It's called from perf_sw_event_sched, perf_event_task_sched_in and all of perf_trace_##call
>> with this_cpu_ptr(&__perf_regs[..]) which are zero initialized by perpcu_alloc
>
> It's not actually allocated; but because it's a static uninitialized
> variable we get .bss like behaviour and the initial value is copied to
> all CPUs when the per-cpu allocator thingy bootstraps SMP IIRC.

yes, it's .bss-like in a special section. I think static percpu still
goes through some fancy boot time init similar to dynamic.
What I tried to emphasize is that either static or dynamic percpu areas
are guaranteed to be zero initialized.

>> and
>> subsequent call to perf_arch_fetch_caller_regs initializes the same fields on all archs,
>> so we can safely drop memset from all of the above cases and
>
> Indeed.
>
>> move it into
>> perf_ftrace_function_call that calls it with stack allocated pt_regs.
>
> Hmm, is there a reason that's still on-stack instead of using the
> per-cpu thing, Steve?
>
>> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
>
> In any case,
>
> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Thanks for the quick review.

^ permalink raw reply

* Re: [net PATCH v2 2/2] ipv4/GRO: Make GRO conform to RFC 6864
From: Tom Herbert @ 2016-04-05 17:38 UTC (permalink / raw)
  To: Edward Cree
  Cc: Herbert Xu, Alexander Duyck, Alexander Duyck, Jesse Gross,
	Eric Dumazet, Netdev, David Miller
In-Reply-To: <5703F0A6.5010201@solarflare.com>

On Tue, Apr 5, 2016 at 2:06 PM, Edward Cree <ecree@solarflare.com> wrote:
> On 05/04/16 16:36, Tom Herbert wrote:
>> I thought about that some. It seems like we would want to do both GRO
>> and retain all the individual packets in the skb so that we could use
>> those for forwarding instead of GSO as I think you're saying.
> I didn't quite mean that, I meant just pass around the skb list, don't
> do GRO at all.  The receiving TCP stack ends up just getting called
> several times, in quick succession, without I$ loss from network stack
> traversal in between.
>
>> This
>> would work great in the plain forwarding case, but one problem
>> is what to do if the host modifies the super packet (for instance when
>> forwarding over a tunnel we might add encapsulation header). This
>> should work in GSO (although we need to address the limitations around
>> 1 encap level), not sure this is easy if we need to add a header to
>> each packet in a batch.
> This is indeed a problem with what I was proposing; perhaps the answer
> is that as you process these SKB lists you also update something like a
> GRO CB, then if you do decide to transform the packets you can coalesce
> them at that point.  But doing 'the rest' of GRO might cost as much as
> just transforming all the packets, in which case you only win if you want
> to transform them multiple times.
> And if we assume that we're going to be touching all the headers anyway,
> it probably doesn't cost us much to transform all the packets in the list
> since our code and data are both hot in cache.  Well, the code is cold
> for the first packet in the list, but equally it's cold for the
> superpacket in the current approach.
>
> If this is as performant as GRO in the normal (non-forwarding) receive
> case (and that's a *big* if, which can only be resolved by trying it), it
> might make sense to just not have GRO, while TSO only gets used for
> locally-generated sends, and for the case where you're forwarding between
> networks with different MTUs (e.g. from a 64k VM netdevice to the wire).
>
> What do you think?  Am I crazy?  (More than my usual?)
>
It's not clear to me how important optimizing the GRO to GSO
forwarding case really is. We definitely need GRO for locally received
packets, but how much benefit do we really get when applying GRO to
forwarding and does that really outweigh the latency we're adding in
the forwarding path to do GRO? Also, once we implement high
performance forwarding in XDP there really is no need to consider
GRO, we'll do some sort of I/O batching for sure but that would be
batching packets to same TX queue not same destination.

Tom

> -Ed

^ permalink raw reply

* Re: [PATCH v4 net-next 01/15] nfp: correct RX buffer length calculation
From: Jakub Kicinski @ 2016-04-05 17:24 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20160405.113917.846241936652070162.davem@davemloft.net>

On Tue, 05 Apr 2016 11:39:17 -0400 (EDT), David Miller wrote:
> From: Jakub Kicinski <jakub.kicinski@netronome.com>
> Date: Fri,  1 Apr 2016 22:06:37 +0100
> 
> > When calculating the RX buffer length we need to account for
> > up to 2 VLAN tags and up to 8 MPLS labels.  Rounding up to 1k
> > is a relic of a distant past and can be removed.  While at
> > it also remove trivial print statement.
> > 
> > Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>  
> 
> I disagree with the MPLS aspect of this change.
> 
> VLAN is special, in that when the hardware supports VLAN properly, the
> VLAN header doesn't eat into the MTU and is sort of "transparent".
> 
> But MPLS doesn't work that way.
> 
> MPLS is in the main frame and takes up MTU space.

Makes sense.  RFC3032 counts MPLS label stack as Frame Payload.
 
> Therefore I see no reason to increase the buffer length by 8 * MPLS
> which is just a ridiculous amount of wasted space.
> 
> I'm not applying this without at least some more explanations about
> why exactly you need to account for these values in the commit message.

It's just what FW guys asked me for.  I'll try to find out what their
reasoning was.  

I have a patch queued up which gets rid of unconditionally reserving
64B for firmware prepend, I guess that made me too excited to pay
attention to the fact the accounting for MPLS is indeed questionable...

^ permalink raw reply

* Re: [RFC PATCH 6/6] ppp: add rtnetlink device creation support
From: walter harms @ 2016-04-05 17:18 UTC (permalink / raw)
  To: Guillaume Nault; +Cc: netdev, linux-ppp, Paul Mackerras, David Miller
In-Reply-To: <dc5fd88775581dba073d0051bbfb4c2c9c5c2d23.1459807527.git.g.nault@alphalink.fr>



Am 05.04.2016 02:56, schrieb Guillaume Nault:
> Define PPP device handlers for use with rtnetlink.
> The only PPP specific attribute is IFLA_PPP_DEV_FD. It is mandatory and
> contains the file descriptor of the associated /dev/ppp instance (the
> file descriptor which would have been used for ioctl(PPPIOCNEWUNIT) in
> the ioctl-based API). The PPP device is removed when this file
> descriptor is released (same behaviour as with ioctl based PPP
> devices).
> 
> PPP devices created with the rtnetlink API behave like the ones created
> with ioctl(PPPIOCNEWUNIT). In particular existing ioctls work the same
> way, no matter how the PPP device was created.
> 
> However, there are a few differences between rtnl and ioctl based PPP
> devices. Rtnl based PPP devices can be removed with RTM_DELLINK
> messages (e.g. with "ip link del"), while the ones created with
> ioctl(PPPIOCNEWUNIT) can't.
> The interface name is also built differently: the number following the
> "ppp" prefix corresponds to the PPP unit number for ioctl based
> devices, while it is just an unrelated incrementing index for rtnl
> ones.
> 
> Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
> ---
>  drivers/net/ppp/ppp_generic.c | 143 +++++++++++++++++++++++++++++++++++++-----
>  include/uapi/linux/if_link.h  |   8 +++
>  2 files changed, 136 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
> index 516f8dc..ae40368 100644
> --- a/drivers/net/ppp/ppp_generic.c
> +++ b/drivers/net/ppp/ppp_generic.c
> @@ -46,6 +46,7 @@
>  #include <linux/device.h>
>  #include <linux/mutex.h>
>  #include <linux/slab.h>
> +#include <linux/file.h>
>  #include <asm/unaligned.h>
>  #include <net/slhc_vj.h>
>  #include <linux/atomic.h>
> @@ -185,7 +186,9 @@ struct channel {
>  
>  struct ppp_config {
>  	struct file *file;
> +	s32 fd;
>  	s32 unit;
> +	bool ifname_is_set;
>  };
>  
>  /*
> @@ -286,6 +289,7 @@ static int unit_get(struct idr *p, void *ptr);
>  static int unit_set(struct idr *p, void *ptr, int n);
>  static void unit_put(struct idr *p, int n);
>  static void *unit_find(struct idr *p, int n);
> +static void ppp_setup(struct net_device *dev);
>  
>  static const struct net_device_ops ppp_netdev_ops;
>  
> @@ -989,7 +993,7 @@ static struct pernet_operations ppp_net_ops = {
>  	.size = sizeof(struct ppp_net),
>  };
>  
> -static int ppp_unit_register(struct ppp *ppp, int unit)
> +static int ppp_unit_register(struct ppp *ppp, int unit, bool ifname_is_set)
>  {
>  	struct ppp_net *pn = ppp_pernet(ppp->ppp_net);
>  	int ret;
> @@ -1019,7 +1023,8 @@ static int ppp_unit_register(struct ppp *ppp, int unit)
>  	}
>  	ppp->file.index = ret;
>  
> -	snprintf(ppp->dev->name, IFNAMSIZ, "ppp%i", ppp->file.index);
> +	if (!ifname_is_set)
> +		snprintf(ppp->dev->name, IFNAMSIZ, "ppp%i", ppp->file.index);
>  
>  	ret = register_netdevice(ppp->dev);
>  	if (ret < 0)
> @@ -1043,12 +1048,39 @@ static int ppp_dev_configure(struct net *src_net, struct net_device *dev,
>  			     const struct ppp_config *conf)
>  {
>  	struct ppp *ppp = netdev_priv(dev);
> +	struct file *file;
>  	int indx;
> +	int err;
> +
> +	if (conf->fd < 0) {
> +		file = conf->file;
> +		if (!file) {
> +			err = -EBADF;
> +			goto out;

why not just return -EBADF;

> +		}
> +	} else {
> +		file = fget(conf->fd);
> +		if (!file) {
> +			err = -EBADF;
> +			goto out;
	
why not just return -EBADF;
> +		}

just my 2 cents,

re,
 wh

> +
> +		if (file->f_op != &ppp_device_fops) {
> +			err = -EBADF;
> +			goto out;
> +		}
> +	}
> +
> +	mutex_lock(&ppp_mutex);
> +	if (file->private_data) {
> +		err = -ENOTTY;
> +		goto out_mutex;
> +	}
>  
>  	ppp->dev = dev;
>  	ppp->mru = PPP_MRU;
>  	ppp->ppp_net = src_net;
> -	ppp->owner = conf->file;
> +	ppp->owner = file;
>  
>  	init_ppp_file(&ppp->file, INTERFACE);
>  	ppp->file.hdrlen = PPP_HDRLEN - 2; /* don't count proto bytes */
> @@ -1067,9 +1099,88 @@ static int ppp_dev_configure(struct net *src_net, struct net_device *dev,
>  	ppp->active_filter = NULL;
>  #endif /* CONFIG_PPP_FILTER */
>  
> -	return ppp_unit_register(ppp, conf->unit);
> +	err = ppp_unit_register(ppp, conf->unit, conf->ifname_is_set);
> +	if (err < 0)
> +		goto out_mutex;
> +
> +	file->private_data = &ppp->file;
> +
> +out_mutex:
> +	mutex_unlock(&ppp_mutex);
> +out:
> +	if (conf->fd >= 0 && file)
> +		fput(file);
> +
> +	return err;
>  }
>  
> +static const struct nla_policy ppp_nl_policy[IFLA_PPP_MAX + 1] = {
> +	[IFLA_PPP_DEV_FD]       = { .type = NLA_S32 },
> +};
> +
> +static int ppp_nl_validate(struct nlattr *tb[], struct nlattr *data[])
> +{
> +	if (!data)
> +		return -EINVAL;
> +
> +	if (!data[IFLA_PPP_DEV_FD])
> +		return -EINVAL;
> +	if (nla_get_s32(data[IFLA_PPP_DEV_FD]) < 0)
> +		return -EBADF;
> +
> +	return 0;
> +}
> +
> +static int ppp_nl_newlink(struct net *src_net, struct net_device *dev,
> +			  struct nlattr *tb[], struct nlattr *data[])
> +{
> +	struct ppp_config conf = {
> +		.file = NULL,
> +		.unit = -1,
> +		.ifname_is_set = true,
> +	};
> +
> +	conf.fd = nla_get_s32(data[IFLA_PPP_DEV_FD]);
> +
> +	return ppp_dev_configure(src_net, dev, &conf);
> +}
> +
> +static void ppp_nl_dellink(struct net_device *dev, struct list_head *head)
> +{
> +	unregister_netdevice_queue(dev, head);
> +}
> +
> +static size_t ppp_nl_get_size(const struct net_device *dev)
> +{
> +	return 0;
> +}
> +
> +static int ppp_nl_fill_info(struct sk_buff *skb, const struct net_device *dev)
> +{
> +	return 0;
> +}
> +
> +static struct net *ppp_nl_get_link_net(const struct net_device *dev)
> +{
> +	struct ppp *ppp = netdev_priv(dev);
> +
> +	return ppp->ppp_net;
> +}
> +
> +static struct rtnl_link_ops ppp_link_ops __read_mostly = {
> +	.kind		= "ppp",
> +	.maxtype	= IFLA_PPP_MAX,
> +	.policy		= ppp_nl_policy,
> +	.priv_size	= sizeof(struct ppp),
> +	.setup		= ppp_setup,
> +	.validate	= ppp_nl_validate,
> +	.newlink	= ppp_nl_newlink,
> +	.dellink	= ppp_nl_dellink,
> +	.get_size	= ppp_nl_get_size,
> +	.fill_info	= ppp_nl_fill_info,
> +	.get_link_net	= ppp_nl_get_link_net,
> +};
> +
>  #define PPP_MAJOR	108
>  
>  /* Called at boot time if ppp is compiled into the kernel,
> @@ -1098,11 +1209,19 @@ static int __init ppp_init(void)
>  		goto out_chrdev;
>  	}
>  
> +	err = rtnl_link_register(&ppp_link_ops);
> +	if (err) {
> +		pr_err("failed to register rtnetlink PPP handler\n");
> +		goto out_class;
> +	}
> +
>  	/* not a big deal if we fail here :-) */
>  	device_create(ppp_class, NULL, MKDEV(PPP_MAJOR, 0), NULL, "ppp");
>  
>  	return 0;
>  
> +out_class:
> +	class_destroy(ppp_class);
>  out_chrdev:
>  	unregister_chrdev(PPP_MAJOR, "ppp");
>  out_net:
> @@ -2846,7 +2965,9 @@ static int ppp_create_interface(struct net *net, struct file *file, int *unit)
>  {
>  	struct ppp_config conf = {
>  		.file = file,
> +		.fd = -1,
>  		.unit = *unit,
> +		.ifname_is_set = false,
>  	};
>  	struct net_device *dev;
>  	struct ppp *ppp;
> @@ -2861,27 +2982,17 @@ static int ppp_create_interface(struct net *net, struct file *file, int *unit)
>  	dev_net_set(dev, net);
>  
>  	rtnl_lock();
> -	mutex_lock(&ppp_mutex);
> -	if (file->private_data) {
> -		err = -ENOTTY;
> -		goto err_dev;
> -	}
> -
>  	err = ppp_dev_configure(net, dev, &conf);
>  	if (err < 0)
>  		goto err_dev;
> +	rtnl_unlock();
>  
>  	ppp = netdev_priv(dev);
>  	*unit = ppp->file.index;
> -	file->private_data = &ppp->file;
> -
> -	mutex_unlock(&ppp_mutex);
> -	rtnl_unlock();
>  
>  	return 0;
>  
>  err_dev:
> -	mutex_unlock(&ppp_mutex);
>  	rtnl_unlock();
>  	free_netdev(dev);
>  err:
> @@ -3074,6 +3185,7 @@ static void __exit ppp_cleanup(void)
>  	/* should never happen */
>  	if (atomic_read(&ppp_unit_count) || atomic_read(&channel_count))
>  		pr_err("PPP: removing module but units remain!\n");
> +	rtnl_link_unregister(&ppp_link_ops);
>  	unregister_chrdev(PPP_MAJOR, "ppp");
>  	device_destroy(ppp_class, MKDEV(PPP_MAJOR, 0));
>  	class_destroy(ppp_class);
> @@ -3132,4 +3244,5 @@ EXPORT_SYMBOL(ppp_register_compressor);
>  EXPORT_SYMBOL(ppp_unregister_compressor);
>  MODULE_LICENSE("GPL");
>  MODULE_ALIAS_CHARDEV(PPP_MAJOR, 0);
> +MODULE_ALIAS_RTNL_LINK("ppp");
>  MODULE_ALIAS("devname:ppp");
> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
> index c488066..f238de9 100644
> --- a/include/uapi/linux/if_link.h
> +++ b/include/uapi/linux/if_link.h
> @@ -515,6 +515,14 @@ enum {
>  };
>  #define IFLA_GENEVE_MAX	(__IFLA_GENEVE_MAX - 1)
>  
> +/* PPP section */
> +enum {
> +	IFLA_PPP_UNSPEC,
> +	IFLA_PPP_DEV_FD,
> +	__IFLA_PPP_MAX,
> +};
> +#define IFLA_PPP_MAX (__IFLA_PPP_MAX - 1)
> +
>  /* Bonding section */
>  
>  enum {

^ permalink raw reply

* Re: [net PATCH v2 2/2] ipv4/GRO: Make GRO conform to RFC 6864
From: Edward Cree @ 2016-04-05 17:06 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Herbert Xu, Alexander Duyck, Alexander Duyck, Jesse Gross,
	Eric Dumazet, Netdev, David Miller
In-Reply-To: <CALx6S37p7sBZ_=xDpmW90-Aqs2sgnOjcjdvYJhEGZ7Y2=6+gcg@mail.gmail.com>

On 05/04/16 16:36, Tom Herbert wrote:
> I thought about that some. It seems like we would want to do both GRO
> and retain all the individual packets in the skb so that we could use
> those for forwarding instead of GSO as I think you're saying.
I didn't quite mean that, I meant just pass around the skb list, don't
do GRO at all.  The receiving TCP stack ends up just getting called
several times, in quick succession, without I$ loss from network stack
traversal in between.

> This
> would would work great in the plain forwarding case, but one problem
> is what to do if the host modifies the super packet (for instance when
> forwarding over a tunnel we might add encapsulation header). This
> should work in GSO (although we need to address the limitations around
> 1 encap level), not sure this is easy if we need to add a header to
> each packet in a batch.
This is indeed a problem with what I was proposing; perhaps the answer
is that as you process these SKB lists you also update something like a
GRO CB, then if you do decide to transform the packets you can coalesce
them at that point.  But doing 'the rest' of GRO might cost as much as
just transforming all the packets, in which case you only win if you want
to transform them multiple times.
And if we assume that we're going to be touching all the headers anyway,
it probably doesn't cost us much to transform all the packets in the list
since our code and data are both hot in cache.  Well, the code is cold
for the first packet in the list, but equally it's cold for the
superpacket in the current approach.

If this is as performant as GRO in the normal (non-forwarding) receive
case (and that's a *big* if, which can only be resolved by trying it), it
might make sense to just not have GRO, while TSO only gets used for
locally-generated sends, and for the case where you're forwarding between
networks with different MTUs (e.g. from a 64k VM netdevice to the wire).

What do you think?  Am I crazy?  (More than my usual?)

-Ed

^ permalink raw reply

* Re: [PATCH net 4/4] lib/test_bpf: Add additional BPF_ADD tests
From: Naveen N. Rao @ 2016-04-05 16:51 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: linux-kernel, linuxppc-dev, netdev, Daniel Borkmann,
	David S. Miller, Ananth N Mavinakayanahalli, Michael Ellerman,
	Paul Mackerras
In-Reply-To: <5703E7B2.6050706@fb.com>

On 2016/04/05 09:28AM, Alexei Starovoitov wrote:
> On 4/5/16 3:02 AM, Naveen N. Rao wrote:
> >Some of these tests proved useful with the powerpc eBPF JIT port due to
> >sign-extended 16-bit immediate loads. Though some of these aspects get
> >covered in other tests, it is better to have explicit tests so as to
> >quickly tag the precise problem.
> >
> >Cc: Alexei Starovoitov <ast@fb.com>
> >Cc: Daniel Borkmann <daniel@iogearbox.net>
> >Cc: "David S. Miller" <davem@davemloft.net>
> >Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
> >Cc: Michael Ellerman <mpe@ellerman.id.au>
> >Cc: Paul Mackerras <paulus@samba.org>
> >Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
> 
> Makes sense. Looks like ppc jit will be using quite a bit of
> available ppc instructions. Nice.
> 
> I'm assuming all these new tests passed with x64 jit?

Yes, all these tests pass on x86_64.

- Naveen

^ permalink raw reply

* Re: [PATCH net 2/4] lib/test_bpf: Add tests for unsigned BPF_JGT
From: Naveen N. Rao @ 2016-04-05 16:50 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: linux-kernel, linuxppc-dev, netdev, Paul Mackerras,
	David S. Miller, Daniel Borkmann
In-Reply-To: <5703E5C5.5060901@fb.com>

On 2016/04/05 09:20AM, Alexei Starovoitov wrote:
> On 4/5/16 3:02 AM, Naveen N. Rao wrote:
> >Unsigned Jump-if-Greater-Than.
> >
> >Cc: Alexei Starovoitov <ast@fb.com>
> >Cc: Daniel Borkmann <daniel@iogearbox.net>
> >Cc: "David S. Miller" <davem@davemloft.net>
> >Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
> >Cc: Michael Ellerman <mpe@ellerman.id.au>
> >Cc: Paul Mackerras <paulus@samba.org>
> >Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
> 
> I think some of the tests already cover it, but extra tests are
> always great.
> Acked-by: Alexei Starovoitov <ast@kernel.org>
> 
> I think the whole set belongs in net-next.
> Next time you submit the patches please say [PATCH net-next] in subject.
> [PATCH net] is for bugfixes only.

Ah, sure. Thanks for the review!

- Naveen

^ permalink raw reply

* Re: [RFC PATCH 4/5] mlx4: add support for fast rx drop bpf program
From: Or Gerlitz @ 2016-04-05 14:15 UTC (permalink / raw)
  To: Alexei Starovoitov, Eric Dumazet
  Cc: Jesper Dangaard Brouer, Brenden Blanco, davem, netdev, tom,
	daniel, john.fastabend, Eran Ben Elisha, Rana Shahout,
	Matan Barak
In-Reply-To: <20160404185010.GD68392@ast-mbp.thefacebook.com>

On 4/4/2016 9:50 PM, Alexei Starovoitov wrote:
> On Mon, Apr 04, 2016 at 08:22:03AM -0700, Eric Dumazet wrote:
>> A single flow is able to use 40Gbit on those 40Gbit NIC, so there is not
>> a single 10GB trunk used for a given flow.
>>
>> This 14Mpps thing seems to be a queue limitation on mlx4.
> yeah, could be queueing related. Multiple cpus can send ~30Mpps of the same 64 byte packet,
> but mlx4 can only receive 14.5Mpps. Odd.
>
> Or (and other mellanox guys), what is really going on inside 40G nic?

Hi Alexei,

Not that I know everything that goes inside there, and not that if I 
knew it all I could have posted that here (I heard HWs sometimes have 
IP)... but, anyway, as for your questions:

ConnectX3 40Gbs NIC can receive > 10Gbs packet-worthy (14.5M) in single 
ring and Mellanox
100Gbs NICs can receive > 25Gbs packet-worthy (37.5M) in single ring, 
people that use DPDK (...) even see these numbers and AFAIU we now 
attempt to see that in the kernel with XDP :)

I realize that we might have some issues in the mlx4 driver reporting on 
HW drops. Eran (cc-ed) and Co are looking into that.

In parallel to doing so, I would suggest that you do some experiments that 
might shed some more light, if on the TX side you do

$ ./pktgen_sample03_burst_single_flow.sh -i $DEV -d $IP -m $MAC -t 4

On the RX side,  skip RSS and force the packets that match that traffic 
pattern to go to (say) ring (==action) 0

$ ethtool -U $DEV flow-type ip4 dst-mac $MAC dst-ip $IP action 0 loc 0

to go back to RSS remove the rule

$ ethtool -U $DEV delete action 0

FWIW (not that I see how it helps you now), you can do HW drop on the RX 
side with ring -1

$ ethtool -U $DEV flow-type ip4 dst-mac $MAC dst-ip $IP action -1 loc 0

Or.

^ permalink raw reply

* Re: [net PATCH v2 2/2] ipv4/GRO: Make GRO conform to RFC 6864
From: Alexander Duyck @ 2016-04-05 16:45 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Herbert Xu, Alexander Duyck, Tom Herbert, Jesse Gross,
	Eric Dumazet, Netdev, David Miller
In-Reply-To: <1459873806.6473.358.camel@edumazet-glaptop3.roam.corp.google.com>

On Tue, Apr 5, 2016 at 9:30 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Tue, 2016-04-05 at 08:52 -0700, Alexander Duyck wrote:
>
>>
>> I disagree I think it will have to be part of the default
>> configuration.  The problem is the IP ID is quickly becoming
>> meaningless.  When you consider that a 40Gb/s link can wrap the IP ID
>> value nearly 50 times a second using a 1500 MTU the IP ID field should
>> just be ignored anyway because you cannot guarantee that it will be
>> unique without limiting the Tx window size.  That was the whole point
>> of RFC6864.  Basically the IP ID field is so small that as we push
>> into the higher speeds you cannot guarantee that the field will have
>> any meaning so for any case where you don't need to use it you
>> shouldn't because it will likely not provide enough useful data.
>
> Just because a few flows reach 40Gbit , we should remind that vast
> majority of the Internet runs with < 50Mbits flows.
>
> I prefer the argument of IPv6 not having ID ;)

Okay, maybe I'll try to use that argument more often then.. :-)

> We should do our best to keep interoperability, this is the selling
> point.
>
> And quite frankly your last patch makes perfect sense to me :

Yes.  It was a compromise, though I might still have to go through and
refine it more.  It might make sense for the IP header associated with
the TCP flow, but for outer headers it actually is worse because we
end up blocking several different possibilities.  What I might need to
do is capture the state of the DF bit as we work a flow up through the
stack and once it is in the list of GRO SKBs use that DF bit as a flag
to indicate if we support a incrementing or fixed pattern for the
values.  That way tunnels can optionally ignore the IP ID if the DF
bit is set since their values may not be as clean as that of TCP.

> The aggregation is done only if the TCP headers of consecutive packets
> matches. So who cares of IPv4 ID really ?
> This is a very minor detail. The possible gains outperform the
> theoretical 'problem'
>
> GRO already reorder flows, it never had a guarantee of being 'invisible'
> as Herbert claims.

I can see what he is trying to get at.  I just think it is a bit too
strict on the interpretation of what values have to be maintained.  My
plan going forward is to add a sysctl that will probably allow us some
wiggle room in regards to IP ID for GRO and GSO so that when it is
disabled we will not perform GSO partial nor allow for repeating IP ID
in GRO on devices that cannot get the IP ID right.

- Alex

^ permalink raw reply

* Re: [PATCH net-next 0/3] udp: support SO_PEEK_OF
From: Willem de Bruijn @ 2016-04-05 16:43 UTC (permalink / raw)
  To: Network Development
  Cc: David Miller, Sam Kumar, Eric Dumazet, Willem de Bruijn
In-Reply-To: <1459874301-92389-1-git-send-email-willemdebruijn.kernel@gmail.com>

> Support peeking at a non-zero offset for UDP sockets. Match the
> existing behavior on Unix datagram sockets.
>
> 1/3 makes the sk_peek_offset functions safe to use outside locks
> 2/3 removes udp headers before enqueue, to simplify offset arithmetic
> 3/3 introduces SO_PEEK_OFF support, with Unix socket peek semantics.

Please ignore this cover letter. I sent it too soon. It is superseded
by a v2 patch set.

^ permalink raw reply

* [PATCH net] ipv6: Count in extension headers in skb->network_header
From: Jakub Sitnicki @ 2016-04-05 16:41 UTC (permalink / raw)
  To: netdev; +Cc: David S. Miller, Ji Jianwen, Hannes Frederic Sowa

When sending a UDPv6 message longer than MTU, account for the length
of fragmentable IPv6 extension headers in skb->network_header offset.
Same as we do in alloc_new_skb path in __ip6_append_data().

This ensures that later on __ip6_make_skb() will make space in
headroom for fragmentable extension headers:

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));

Prevents a splat due to skb_under_panic:

skbuff: skb_under_panic: text:ffffffff8143397b len:2126 put:14 \
head:ffff880005bacf50 data:ffff880005bacf4a tail:0x48 end:0xc0 dev:lo
------------[ cut here ]------------
kernel BUG at net/core/skbuff.c:104!
invalid opcode: 0000 [#1] KASAN
CPU: 0 PID: 160 Comm: reproducer Not tainted 4.6.0-rc2 #65
[...]
Call Trace:
 [<ffffffff813eb7b9>] skb_push+0x79/0x80
 [<ffffffff8143397b>] eth_header+0x2b/0x100
 [<ffffffff8141e0d0>] neigh_resolve_output+0x210/0x310
 [<ffffffff814eab77>] ip6_finish_output2+0x4a7/0x7c0
 [<ffffffff814efe3a>] ip6_output+0x16a/0x280
 [<ffffffff815440c1>] ip6_local_out+0xb1/0xf0
 [<ffffffff814f1115>] ip6_send_skb+0x45/0xd0
 [<ffffffff81518836>] udp_v6_send_skb+0x246/0x5d0
 [<ffffffff8151985e>] udpv6_sendmsg+0xa6e/0x1090
[...]

Reported-by: Ji Jianwen <jiji@redhat.com>
Signed-off-by: Jakub Sitnicki <jkbs@redhat.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
---

Can be reproduced by sending a UDPv6 message longer than MTU when
Destination Options are present, as shown below.

Original reproducer has been developed by Ji Jianwen.  Cut down
version included.

# ip link set dev lo mtu 1500
# ./reproducer 0 1024		# works
# ./reproducer 8 1024		# works
# ./reproducer 64 1024		# works
# ./reproducer 0 2048		# works
# ./reproducer 8 2048		# crash
# ./reproducer 64 2048		# crash


/* reproducer.c */

#include <sys/types.h>
#include <sys/socket.h>
#include <netdb.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Attach an IPv6 Destination Options header of len bytes to socket s via
 * the IPV6_DSTOPTS socket option.
 *
 * The header carries a single PadN option covering everything after the
 * first four bytes.  len must be a positive multiple of 8.  Allocation or
 * setsockopt() failure prints a diagnostic and exits with status 1.
 */
static void set_dstopts(int s, int len)
{
	char *dstopts;
	int r;

	/* Bytes 1..3 are written below, so anything shorter than 8 overflows. */
	assert(len >= 8);
	assert(len % 8 == 0);

	dstopts = calloc(len, 1);
	if (!dstopts) {
		perror("calloc");
		exit(1);
	}

	/* Byte 0 is left at the zero calloc() put there. */
	dstopts[1] = len / 8 - 1; /* Hdr Ext Len */
	dstopts[2] = 1;		  /* PadN Option */
	/* NOTE(review): stored in one char, so len above 259 wraps - confirm intended */
	dstopts[3] = len - 4;	  /* Opt Data Len */

	r = setsockopt(s, IPPROTO_IPV6, IPV6_DSTOPTS, dstopts, len);
	if (r < 0) {
		perror("setsockopt");
		exit(1);
	}

	free(dstopts);
}

/*
 * Send a single message of len bytes, each set to 'A', on socket s to the
 * address held in ai.
 *
 * A negative len, an allocation failure or a sendmsg() failure prints a
 * diagnostic and exits with status 1.
 */
static void do_send(int s, const struct addrinfo *ai, int len)
{
	struct msghdr msg;
	struct iovec iov[1];
	char *data;
	int r;

	/* A negative length would silently become a huge size_t below. */
	if (len < 0) {
		fprintf(stderr, "do_send: invalid length %d\n", len);
		exit(1);
	}

	/* Request at least one byte: malloc(0) is allowed to return NULL. */
	data = malloc(len > 0 ? (size_t)len : 1);
	if (!data) {
		perror("malloc");
		exit(1);
	}
	memset(data, 'A', len);

	memset(&msg, 0, sizeof(msg));
	iov[0].iov_base = data;
	iov[0].iov_len = len;

	msg.msg_name = ai->ai_addr;
	msg.msg_namelen = ai->ai_addrlen;
	msg.msg_iov = iov;
	msg.msg_iovlen = 1;
	msg.msg_control = NULL;
	msg.msg_controllen = 0;

	r = sendmsg(s, &msg, 0);
	if (r < 0) {
		perror("sendmsg");
		exit(1);
	}

	free(data);
}

/*
 * Usage: reproducer <dstopts-bytes> <data-bytes>
 *
 * Resolves [::1]:12345, opens a UDP socket for it, installs a Destination
 * Options header of <dstopts-bytes> bytes when that value is positive, and
 * sends one message of <data-bytes> bytes.
 *
 * Returns 0 on success and 1 on a usage, resolver or socket error.  The
 * helpers exit with status 1 themselves on failure.
 */
int main(int argc, char *argv[])
{
	struct addrinfo *ai = NULL;
	int dstopts_len, data_len;
	int r, s;

	if (argc != 3) {
		fprintf(stderr, "Usage: %s <dstopts-bytes> <data-bytes>\n",
			argv[0]);
		return 1;
	}

	dstopts_len = atoi(argv[1]);
	data_len = atoi(argv[2]);

	/*
	 * Report failures explicitly instead of via assert(): with NDEBUG the
	 * checks would disappear and ai would be dereferenced while NULL.
	 */
	r = getaddrinfo("::1", "12345", NULL, &ai);
	if (r != 0) {
		fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(r));
		return 1;
	}

	s = socket(ai->ai_family, SOCK_DGRAM, IPPROTO_UDP);
	if (s == -1) {
		perror("socket");
		freeaddrinfo(ai);
		return 1;
	}

	if (dstopts_len > 0)
		set_dstopts(s, dstopts_len);
	do_send(s, ai, data_len);

	freeaddrinfo(ai);

	return 0;
}


 net/ipv6/ip6_output.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 9428345..bc972e7 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1090,8 +1090,8 @@ static inline int ip6_ufo_append_data(struct sock *sk,
 			int getfrag(void *from, char *to, int offset, int len,
 			int odd, struct sk_buff *skb),
 			void *from, int length, int hh_len, int fragheaderlen,
-			int transhdrlen, int mtu, unsigned int flags,
-			const struct flowi6 *fl6)
+			int exthdrlen, int transhdrlen, int mtu,
+			unsigned int flags, const struct flowi6 *fl6)
 
 {
 	struct sk_buff *skb;
@@ -1116,7 +1116,7 @@ static inline int ip6_ufo_append_data(struct sock *sk,
 		skb_put(skb, fragheaderlen + transhdrlen);
 
 		/* initialize network header pointer */
-		skb_reset_network_header(skb);
+		skb_set_network_header(skb, exthdrlen);
 
 		/* initialize protocol header pointer */
 		skb->transport_header = skb->network_header + fragheaderlen;
@@ -1358,7 +1358,7 @@ emsgsize:
 	    (rt->dst.dev->features & NETIF_F_UFO) &&
 	    (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
 		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
-					  hh_len, fragheaderlen,
+					  hh_len, fragheaderlen, exthdrlen,
 					  transhdrlen, mtu, flags, fl6);
 		if (err)
 			goto error;
-- 
2.5.5

^ permalink raw reply related

* [PATCH net-next v2 3/3] udp: enable MSG_PEEK at non-zero offset
From: Willem de Bruijn @ 2016-04-05 16:41 UTC (permalink / raw)
  To: netdev; +Cc: davem, samanthakumar, edumazet, willemb
In-Reply-To: <1459874476-92838-1-git-send-email-willemdebruijn.kernel@gmail.com>

From: samanthakumar <samanthakumar@google.com>

Enable peeking at UDP datagrams at the offset specified with socket
option SOL_SOCKET/SO_PEEK_OFF. Peek at any datagram in the queue, up
to the end of the given datagram.

Implement the SO_PEEK_OFF semantics introduced in commit ef64a54f6e55
("sock: Introduce the SO_PEEK_OFF sock option"). Increase the offset
on peek, decrease it on regular reads.

When peeking, always checksum the packet immediately, to avoid
recomputation on subsequent peeks and final read.

The socket lock is not held for the duration of udp_recvmsg, so
peek and read operations can run concurrently. Only the last store
to sk_peek_off is preserved.

Signed-off-by: Sam Kumar <samanthakumar@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 include/linux/skbuff.h |  7 ++++++-
 include/net/sock.h     |  2 ++
 net/core/datagram.c    |  9 ++++++---
 net/core/sock.c        |  9 +++++++++
 net/ipv4/af_inet.c     |  1 +
 net/ipv4/udp.c         | 22 +++++++++++-----------
 net/ipv6/af_inet6.c    |  1 +
 net/ipv6/udp.c         | 22 +++++++++++-----------
 8 files changed, 47 insertions(+), 26 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 15d0df9..0073812 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2949,7 +2949,12 @@ int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
 				 struct iov_iter *from, int len);
 int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm);
 void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
-void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb);
+void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len);
+static inline void skb_free_datagram_locked(struct sock *sk,
+					    struct sk_buff *skb)
+{
+	__skb_free_datagram_locked(sk, skb, 0);
+}
 int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags);
 int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len);
 int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len);
diff --git a/include/net/sock.h b/include/net/sock.h
index b759989..1decb7a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -457,6 +457,8 @@ struct sock {
 #define SK_CAN_REUSE	1
 #define SK_FORCE_REUSE	2
 
+int sk_set_peek_off(struct sock *sk, int val);
+
 static inline int sk_peek_offset(struct sock *sk, int flags)
 {
 	if (unlikely(flags & MSG_PEEK)) {
diff --git a/net/core/datagram.c b/net/core/datagram.c
index fa9dc64..b7de71f 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -301,16 +301,19 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(skb_free_datagram);
 
-void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
+void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)
 {
 	bool slow;
 
 	if (likely(atomic_read(&skb->users) == 1))
 		smp_rmb();
-	else if (likely(!atomic_dec_and_test(&skb->users)))
+	else if (likely(!atomic_dec_and_test(&skb->users))) {
+		sk_peek_offset_bwd(sk, len);
 		return;
+	}
 
 	slow = lock_sock_fast(sk);
+	sk_peek_offset_bwd(sk, len);
 	skb_orphan(skb);
 	sk_mem_reclaim_partial(sk);
 	unlock_sock_fast(sk, slow);
@@ -318,7 +321,7 @@ void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
 	/* skb is now orphaned, can be freed outside of locked section */
 	__kfree_skb(skb);
 }
-EXPORT_SYMBOL(skb_free_datagram_locked);
+EXPORT_SYMBOL(__skb_free_datagram_locked);
 
 /**
  *	skb_kill_datagram - Free a datagram skbuff forcibly
diff --git a/net/core/sock.c b/net/core/sock.c
index e12197b..2ce76e8 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2187,6 +2187,15 @@ void __sk_mem_reclaim(struct sock *sk, int amount)
 }
 EXPORT_SYMBOL(__sk_mem_reclaim);
 
+int sk_set_peek_off(struct sock *sk, int val)
+{
+	if (val < 0)
+		return -EINVAL;
+
+	sk->sk_peek_off = val;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sk_set_peek_off);
 
 /*
  * Set of default routines for initialising struct proto_ops when
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 9e48199..a38b991 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -948,6 +948,7 @@ const struct proto_ops inet_dgram_ops = {
 	.recvmsg	   = inet_recvmsg,
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = inet_sendpage,
+	.set_peek_off	   = sk_set_peek_off,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_sock_common_setsockopt,
 	.compat_getsockopt = compat_sock_common_getsockopt,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index cf747e8..d80312d 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1294,7 +1294,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
 	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
 	struct sk_buff *skb;
 	unsigned int ulen, copied;
-	int peeked, off = 0;
+	int peeked, peeking, off;
 	int err;
 	int is_udplite = IS_UDPLITE(sk);
 	bool checksum_valid = false;
@@ -1304,15 +1304,16 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
 		return ip_recv_error(sk, msg, len, addr_len);
 
 try_again:
+	peeking = off = sk_peek_offset(sk, flags);
 	skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
 				  &peeked, &off, &err);
 	if (!skb)
-		goto out;
+		return err;
 
 	ulen = skb->len;
 	copied = len;
-	if (copied > ulen)
-		copied = ulen;
+	if (copied > ulen - off)
+		copied = ulen - off;
 	else if (copied < ulen)
 		msg->msg_flags |= MSG_TRUNC;
 
@@ -1322,16 +1323,16 @@ try_again:
 	 * coverage checksum (UDP-Lite), do it before the copy.
 	 */
 
-	if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) {
+	if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) {
 		checksum_valid = !udp_lib_checksum_complete(skb);
 		if (!checksum_valid)
 			goto csum_copy_err;
 	}
 
 	if (checksum_valid || skb_csum_unnecessary(skb))
-		err = skb_copy_datagram_msg(skb, 0, msg, copied);
+		err = skb_copy_datagram_msg(skb, off, msg, copied);
 	else {
-		err = skb_copy_and_csum_datagram_msg(skb, 0, msg);
+		err = skb_copy_and_csum_datagram_msg(skb, off, msg);
 
 		if (err == -EINVAL)
 			goto csum_copy_err;
@@ -1344,7 +1345,8 @@ try_again:
 			UDP_INC_STATS_USER(sock_net(sk),
 					   UDP_MIB_INERRORS, is_udplite);
 		}
-		goto out_free;
+		skb_free_datagram_locked(sk, skb);
+		return err;
 	}
 
 	if (!peeked)
@@ -1368,9 +1370,7 @@ try_again:
 	if (flags & MSG_TRUNC)
 		err = ulen;
 
-out_free:
-	skb_free_datagram_locked(sk, skb);
-out:
+	__skb_free_datagram_locked(sk, skb, peeking ? -err : err);
 	return err;
 
 csum_copy_err:
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index b11c37c..2b78aad 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -561,6 +561,7 @@ const struct proto_ops inet6_dgram_ops = {
 	.recvmsg	   = inet_recvmsg,		/* ok		*/
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = sock_no_sendpage,
+	.set_peek_off	   = sk_set_peek_off,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_sock_common_setsockopt,
 	.compat_getsockopt = compat_sock_common_getsockopt,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 84c8d7b..87bd7af 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -357,7 +357,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	struct inet_sock *inet = inet_sk(sk);
 	struct sk_buff *skb;
 	unsigned int ulen, copied;
-	int peeked, off = 0;
+	int peeked, peeking, off;
 	int err;
 	int is_udplite = IS_UDPLITE(sk);
 	bool checksum_valid = false;
@@ -371,15 +371,16 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
 
 try_again:
+	peeking = off = sk_peek_offset(sk, flags);
 	skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
 				  &peeked, &off, &err);
 	if (!skb)
-		goto out;
+		return err;
 
 	ulen = skb->len;
 	copied = len;
-	if (copied > ulen)
-		copied = ulen;
+	if (copied > ulen - off)
+		copied = ulen - off;
 	else if (copied < ulen)
 		msg->msg_flags |= MSG_TRUNC;
 
@@ -391,16 +392,16 @@ try_again:
 	 * coverage checksum (UDP-Lite), do it before the copy.
 	 */
 
-	if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) {
+	if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) {
 		checksum_valid = !udp_lib_checksum_complete(skb);
 		if (!checksum_valid)
 			goto csum_copy_err;
 	}
 
 	if (checksum_valid || skb_csum_unnecessary(skb))
-		err = skb_copy_datagram_msg(skb, 0, msg, copied);
+		err = skb_copy_datagram_msg(skb, off, msg, copied);
 	else {
-		err = skb_copy_and_csum_datagram_msg(skb, 0, msg);
+		err = skb_copy_and_csum_datagram_msg(skb, off, msg);
 		if (err == -EINVAL)
 			goto csum_copy_err;
 	}
@@ -417,7 +418,8 @@ try_again:
 						    UDP_MIB_INERRORS,
 						    is_udplite);
 		}
-		goto out_free;
+		skb_free_datagram_locked(sk, skb);
+		return err;
 	}
 	if (!peeked) {
 		if (is_udp4)
@@ -465,9 +467,7 @@ try_again:
 	if (flags & MSG_TRUNC)
 		err = ulen;
 
-out_free:
-	skb_free_datagram_locked(sk, skb);
-out:
+	__skb_free_datagram_locked(sk, skb, peeking ? -err : err);
 	return err;
 
 csum_copy_err:
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* [PATCH net-next v2 2/3] udp: remove headers from UDP packets before queueing
From: Willem de Bruijn @ 2016-04-05 16:41 UTC (permalink / raw)
  To: netdev; +Cc: davem, samanthakumar, edumazet, willemb
In-Reply-To: <1459874476-92838-1-git-send-email-willemdebruijn.kernel@gmail.com>

From: samanthakumar <samanthakumar@google.com>

Remove UDP transport headers before queueing packets for reception.
This change simplifies a follow-up patch to add MSG_PEEK support.

Signed-off-by: Sam Kumar <samanthakumar@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 include/net/sock.h |  1 +
 include/net/udp.h  |  9 +++++++++
 net/core/sock.c    | 19 +++++++++++++------
 net/ipv4/udp.c     | 20 +++++++++++---------
 net/ipv6/udp.c     | 12 +++++++-----
 5 files changed, 41 insertions(+), 20 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 09aec75..b759989 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1864,6 +1864,7 @@ void sk_reset_timer(struct sock *sk, struct timer_list *timer,
 
 void sk_stop_timer(struct sock *sk, struct timer_list *timer);
 
+int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
 
 int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb);
diff --git a/include/net/udp.h b/include/net/udp.h
index d870ec1..a0b0da9 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -158,6 +158,15 @@ static inline __sum16 udp_v4_check(int len, __be32 saddr,
 void udp_set_csum(bool nocheck, struct sk_buff *skb,
 		  __be32 saddr, __be32 daddr, int len);
 
+static inline void udp_csum_pull_header(struct sk_buff *skb)
+{
+	if (skb->ip_summed == CHECKSUM_NONE)
+		skb->csum = csum_partial(udp_hdr(skb), sizeof(struct udphdr),
+					 skb->csum);
+	skb_pull_rcsum(skb, sizeof(struct udphdr));
+	UDP_SKB_CB(skb)->cscov -= sizeof(struct udphdr);
+}
+
 struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
 				 struct udphdr *uh);
 int udp_gro_complete(struct sk_buff *skb, int nhoff);
diff --git a/net/core/sock.c b/net/core/sock.c
index 2f517ea..e12197b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -402,9 +402,8 @@ static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 }
 
 
-int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
-	int err;
 	unsigned long flags;
 	struct sk_buff_head *list = &sk->sk_receive_queue;
 
@@ -414,10 +413,6 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		return -ENOMEM;
 	}
 
-	err = sk_filter(sk, skb);
-	if (err)
-		return err;
-
 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 		atomic_inc(&sk->sk_drops);
 		return -ENOBUFS;
@@ -440,6 +435,18 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		sk->sk_data_ready(sk);
 	return 0;
 }
+EXPORT_SYMBOL(__sock_queue_rcv_skb);
+
+int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	int err;
+
+	err = sk_filter(sk, skb);
+	if (err)
+		return err;
+
+	return __sock_queue_rcv_skb(sk, skb);
+}
 EXPORT_SYMBOL(sock_queue_rcv_skb);
 
 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 355bdb2..cf747e8 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1309,7 +1309,7 @@ try_again:
 	if (!skb)
 		goto out;
 
-	ulen = skb->len - sizeof(struct udphdr);
+	ulen = skb->len;
 	copied = len;
 	if (copied > ulen)
 		copied = ulen;
@@ -1329,11 +1329,9 @@ try_again:
 	}
 
 	if (checksum_valid || skb_csum_unnecessary(skb))
-		err = skb_copy_datagram_msg(skb, sizeof(struct udphdr),
-					    msg, copied);
+		err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	else {
-		err = skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr),
-						     msg);
+		err = skb_copy_and_csum_datagram_msg(skb, 0, msg);
 
 		if (err == -EINVAL)
 			goto csum_copy_err;
@@ -1500,7 +1498,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		sk_incoming_cpu_update(sk);
 	}
 
-	rc = sock_queue_rcv_skb(sk, skb);
+	rc = __sock_queue_rcv_skb(sk, skb);
 	if (rc < 0) {
 		int is_udplite = IS_UDPLITE(sk);
 
@@ -1616,10 +1614,14 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		}
 	}
 
-	if (rcu_access_pointer(sk->sk_filter) &&
-	    udp_lib_checksum_complete(skb))
-		goto csum_error;
+	if (rcu_access_pointer(sk->sk_filter)) {
+		if (udp_lib_checksum_complete(skb))
+			goto csum_error;
+		if (sk_filter(sk, skb))
+			goto drop;
+	}
 
+	udp_csum_pull_header(skb);
 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 		UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
 				 is_udplite);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 78a7dfd..84c8d7b 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -376,7 +376,7 @@ try_again:
 	if (!skb)
 		goto out;
 
-	ulen = skb->len - sizeof(struct udphdr);
+	ulen = skb->len;
 	copied = len;
 	if (copied > ulen)
 		copied = ulen;
@@ -398,10 +398,9 @@ try_again:
 	}
 
 	if (checksum_valid || skb_csum_unnecessary(skb))
-		err = skb_copy_datagram_msg(skb, sizeof(struct udphdr),
-					    msg, copied);
+		err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	else {
-		err = skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr), msg);
+		err = skb_copy_and_csum_datagram_msg(skb, 0, msg);
 		if (err == -EINVAL)
 			goto csum_copy_err;
 	}
@@ -554,7 +553,7 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		sk_incoming_cpu_update(sk);
 	}
 
-	rc = sock_queue_rcv_skb(sk, skb);
+	rc = __sock_queue_rcv_skb(sk, skb);
 	if (rc < 0) {
 		int is_udplite = IS_UDPLITE(sk);
 
@@ -648,8 +647,11 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	if (rcu_access_pointer(sk->sk_filter)) {
 		if (udp_lib_checksum_complete(skb))
 			goto csum_error;
+		if (sk_filter(sk, skb))
+			goto drop;
 	}
 
+	udp_csum_pull_header(skb);
 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 		UDP6_INC_STATS_BH(sock_net(sk),
 				  UDP_MIB_RCVBUFERRORS, is_udplite);
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* [PATCH net-next v2 1/3] sock: convert sk_peek_offset functions to WRITE_ONCE
From: Willem de Bruijn @ 2016-04-05 16:41 UTC (permalink / raw)
  To: netdev; +Cc: davem, samanthakumar, edumazet, willemb
In-Reply-To: <1459874476-92838-1-git-send-email-willemdebruijn.kernel@gmail.com>

From: Willem de Bruijn <willemb@google.com>

Make the peek offset interface safe to use in lockless environments.
Use READ_ONCE and WRITE_ONCE to avoid race conditions between testing
and updating the peek offset.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 include/net/sock.h | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 310c436..09aec75 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -459,26 +459,28 @@ struct sock {
 
 static inline int sk_peek_offset(struct sock *sk, int flags)
 {
-	if ((flags & MSG_PEEK) && (sk->sk_peek_off >= 0))
-		return sk->sk_peek_off;
-	else
-		return 0;
+	if (unlikely(flags & MSG_PEEK)) {
+		s32 off = READ_ONCE(sk->sk_peek_off);
+		if (off >= 0)
+			return off;
+	}
+
+	return 0;
 }
 
 static inline void sk_peek_offset_bwd(struct sock *sk, int val)
 {
-	if (sk->sk_peek_off >= 0) {
-		if (sk->sk_peek_off >= val)
-			sk->sk_peek_off -= val;
-		else
-			sk->sk_peek_off = 0;
+	s32 off = READ_ONCE(sk->sk_peek_off);
+
+	if (unlikely(off >= 0)) {
+		off = max_t(s32, off - val, 0);
+		WRITE_ONCE(sk->sk_peek_off, off);
 	}
 }
 
 static inline void sk_peek_offset_fwd(struct sock *sk, int val)
 {
-	if (sk->sk_peek_off >= 0)
-		sk->sk_peek_off += val;
+	sk_peek_offset_bwd(sk, -val);
 }
 
 /*
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* [PATCH net-next v2 0/3] udp: support SO_PEEK_OFF
From: Willem de Bruijn @ 2016-04-05 16:41 UTC (permalink / raw)
  To: netdev; +Cc: davem, samanthakumar, edumazet, willemb

From: Willem de Bruijn <willemb@google.com>

Support peeking at a non-zero offset for UDP sockets. Match the
existing behavior on Unix datagram sockets.

1/3 makes the sk_peek_offset functions safe to use outside locks
2/3 removes udp headers before enqueue, to simplify offset arithmetic
3/3 introduces SO_PEEK_OFF support, with Unix socket peek semantics.

Changes
  v1->v2
    - squash patches 3 and 4

Willem de Bruijn (1):
  sock: convert sk_peek_offset functions to WRITE_ONCE

samanthakumar (2):
  udp: remove headers from UDP packets before queueing
  udp: enable MSG_PEEK at non-zero offset

 include/linux/skbuff.h |  7 ++++++-
 include/net/sock.h     | 27 ++++++++++++++++-----------
 include/net/udp.h      |  9 +++++++++
 net/core/datagram.c    |  9 ++++++---
 net/core/sock.c        | 28 ++++++++++++++++++++++------
 net/ipv4/af_inet.c     |  1 +
 net/ipv4/udp.c         | 38 ++++++++++++++++++++------------------
 net/ipv6/af_inet6.c    |  1 +
 net/ipv6/udp.c         | 30 ++++++++++++++++--------------
 9 files changed, 97 insertions(+), 53 deletions(-)

-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply

* [PATCH net-next 0/3] udp: support SO_PEEK_OF
From: Willem de Bruijn @ 2016-04-05 16:38 UTC (permalink / raw)
  To: netdev; +Cc: davem, samanthakumar, edumazet, willemb

From: Willem de Bruijn <willemb@google.com>

Support peeking at a non-zero offset for UDP sockets. Match the
existing behavior on Unix datagram sockets.

1/3 makes the sk_peek_offset functions safe to use outside locks
2/3 removes udp headers before enqueue, to simplify offset arithmetic
3/3 introduces SO_PEEK_OFF support, with Unix socket peek semantics.

Changes
  v1->v2
    - squash patches 3 and 4

Willem de Bruijn (1):
  sock: convert sk_peek_offset functions to WRITE_ONCE

samanthakumar (2):
  udp: remove headers from UDP packets before queueing
  udp: enable MSG_PEEK at non-zero offset

 include/linux/skbuff.h |  7 ++++++-
 include/net/sock.h     | 27 ++++++++++++++++-----------
 include/net/udp.h      |  9 +++++++++
 net/core/datagram.c    |  9 ++++++---
 net/core/sock.c        | 28 ++++++++++++++++++++++------
 net/ipv4/af_inet.c     |  1 +
 net/ipv4/udp.c         | 38 ++++++++++++++++++++------------------
 net/ipv6/af_inet6.c    |  1 +
 net/ipv6/udp.c         | 30 ++++++++++++++++--------------
 9 files changed, 97 insertions(+), 53 deletions(-)

-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply

* Re: [net PATCH v2 2/2] ipv4/GRO: Make GRO conform to RFC 6864
From: Eric Dumazet @ 2016-04-05 16:30 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Herbert Xu, Alexander Duyck, Tom Herbert, Jesse Gross,
	Eric Dumazet, Netdev, David Miller
In-Reply-To: <CAKgT0Uc=9MMzSHWgZib5+EwPCQokjSSnC9vfOtWKULvmwUQpdQ@mail.gmail.com>

On Tue, 2016-04-05 at 08:52 -0700, Alexander Duyck wrote:

> 
> I disagree I think it will have to be part of the default
> configuration.  The problem is the IP ID is quickly becoming
> meaningless.  When you consider that a 40Gb/s link can wrap the IP ID
> value nearly 50 times a second using a 1500 MTU the IP ID field should
> just be ignored anyway because you cannot guarantee that it will be
> unique without limiting the Tx window size.  That was the whole point
> of RFC6864.  Basically the IP ID field is so small that as we push
> into the higher speeds you cannot guarantee that the field will have
> any meaning so for any case where you don't need to use it you
> shouldn't because it will likely not provide enough useful data.

Just because a few flows reach 40Gbit , we should remind that vast
majority of the Internet runs with < 50Mbits flows.

I prefer the argument of IPv6 not having ID ;)

We should do our best to keep interoperability, this is the selling
point. 

And quite frankly your last patch makes perfect sense to me :

The aggregation is done only if the TCP headers of consecutive packets
matches. So who cares of IPv4 ID really ?
This is a very minor detail. The possible gains outperform the
theoretical 'problem'

GRO already reorder flows, it never had a guarantee of being 'invisible'
as Herbert claims.

^ permalink raw reply

* Re: [PATCH net 4/4] lib/test_bpf: Add additional BPF_ADD tests
From: Alexei Starovoitov @ 2016-04-05 16:28 UTC (permalink / raw)
  To: Naveen N. Rao, linux-kernel, linuxppc-dev, netdev
  Cc: Daniel Borkmann, David S. Miller, Ananth N Mavinakayanahalli,
	Michael Ellerman, Paul Mackerras
In-Reply-To: <9b71e280481a9f84cd7dbb9e767fd08e6f4c0aef.1459850410.git.naveen.n.rao@linux.vnet.ibm.com>

On 4/5/16 3:02 AM, Naveen N. Rao wrote:
> Some of these tests proved useful with the powerpc eBPF JIT port due to
> sign-extended 16-bit immediate loads. Though some of these aspects get
> covered in other tests, it is better to have explicit tests so as to
> quickly tag the precise problem.
>
> Cc: Alexei Starovoitov <ast@fb.com>
> Cc: Daniel Borkmann <daniel@iogearbox.net>
> Cc: "David S. Miller" <davem@davemloft.net>
> Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Paul Mackerras <paulus@samba.org>
> Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>

Makes sense. Looks like ppc jit will be using quite a bit of
available ppc instructions. Nice.

I'm assuming all these new tests passed with x64 jit?

Acked-by: Alexei Starovoitov <ast@kernel.org>

^ permalink raw reply

* Re: [PATCH net 3/4] lib/test_bpf: Add test to check for result of 32-bit add that overflows
From: Alexei Starovoitov @ 2016-04-05 16:21 UTC (permalink / raw)
  To: Naveen N. Rao, linux-kernel, linuxppc-dev, netdev
  Cc: Daniel Borkmann, David S. Miller, Ananth N Mavinakayanahalli,
	Michael Ellerman, Paul Mackerras
In-Reply-To: <9c799e59e71c022271c6287769d45a32f29de4bd.1459850410.git.naveen.n.rao@linux.vnet.ibm.com>

On 4/5/16 3:02 AM, Naveen N. Rao wrote:
> BPF_ALU32 and BPF_ALU64 tests for adding two 32-bit values that results in
> 32-bit overflow.
>
> Cc: Alexei Starovoitov <ast@fb.com>
> Cc: Daniel Borkmann <daniel@iogearbox.net>
> Cc: "David S. Miller" <davem@davemloft.net>
> Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Paul Mackerras <paulus@samba.org>
> Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>

Acked-by: Alexei Starovoitov <ast@kernel.org>

^ permalink raw reply

* [PATCH net-next 3/3] tipc: reduce transmission rate of reset messages when link is down
From: Jon Maloy @ 2016-04-05 16:20 UTC (permalink / raw)
  To: davem; +Cc: Jon Maloy, netdev, Paul Gortmaker, tipc-discussion
In-Reply-To: <1459873255-32354-1-git-send-email-jon.maloy@ericsson.com>

When a link is down, it will continuously try to re-establish contact
with the peer by sending out a RESET or an ACTIVATE message at each
timeout interval. The default value for this interval is currently
375 ms. This is wasteful, and may become a problem in very large
clusters with dozens or hundreds of nodes being down simultaneously.

We now introduce a simple backoff algorithm for these cases. The
first five messages are sent at default rate; thereafter a message
is sent only every 16th timer interval.

This will cover the vast majority of link recycling cases, since the
endpoint starting last will transmit at the higher speed, and the link
should normally be established well before the rate needs to be
reduced.

The only case where we will see a degradation of link re-establishment
is when the endpoints remain intact, and a glitch in the transmission
media is causing the link reset. We will then experience a worst-case
re-establishing time of 6 seconds, something we deem acceptable.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
---
 net/tipc/link.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/tipc/link.c b/net/tipc/link.c
index 7d2bb3e..42cdbd1 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -140,6 +140,7 @@ struct tipc_link {
 	char if_name[TIPC_MAX_IF_NAME];
 	u32 priority;
 	char net_plane;
+	u16 rst_cnt;
 
 	/* Failover/synch */
 	u16 drop_point;
@@ -701,8 +702,6 @@ static void link_profile_stats(struct tipc_link *l)
 
 /* tipc_link_timeout - perform periodic task as instructed from node timeout
  */
-/* tipc_link_timeout - perform periodic task as instructed from node timeout
- */
 int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq)
 {
 	int rc = 0;
@@ -730,11 +729,13 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq)
 		l->silent_intv_cnt++;
 		break;
 	case LINK_RESET:
-		xmit = true;
+		if ((l->rst_cnt++ <= 4) || !(l->rst_cnt % 16))
+			xmit = true;
 		mtyp = RESET_MSG;
 		break;
 	case LINK_ESTABLISHING:
-		xmit = true;
+		if ((l->rst_cnt++ <= 4) || !(l->rst_cnt % 16))
+			xmit = true;
 		mtyp = ACTIVATE_MSG;
 		break;
 	case LINK_PEER_RESET:
@@ -833,6 +834,7 @@ void tipc_link_reset(struct tipc_link *l)
 	l->rcv_nxt = 1;
 	l->acked = 0;
 	l->silent_intv_cnt = 0;
+	l->rst_cnt = 0;
 	l->stats.recv_info = 0;
 	l->stale_count = 0;
 	l->bc_peer_is_up = false;
-- 
1.9.1


------------------------------------------------------------------------------

^ permalink raw reply related

* [PATCH net-next 2/3] tipc: stricter filtering of packets in bearer layer
From: Jon Maloy @ 2016-04-05 16:20 UTC (permalink / raw)
  To: davem; +Cc: Jon Maloy, netdev, Paul Gortmaker, tipc-discussion
In-Reply-To: <1459873255-32354-1-git-send-email-jon.maloy@ericsson.com>

Resetting a bearer/interface, with the consequence of resetting all its
pertaining links, is not an atomic action. This becomes particularly
evident in very large clusters, where a lot of traffic may happen on the
remaining links while we are busy shutting them down. In extreme cases,
we may even see links being re-created and re-established before we are
finished with the job.

To solve this, we now introduce a solution where we temporarily detach
the bearer from the interface when the bearer is reset. This inhibits
all packet reception, while sending still is possible. For the latter,
we use the fact that the device's user pointer now is zero to filter out
which packets can be sent during this situation; i.e., outgoing RESET
messages only.  This filtering serves to speed up the neighbors'
detection of the loss event, and saves us from unnecessary probing.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
---
 net/tipc/bearer.c | 50 +++++++++++++++++++++++++++++++++-----------------
 net/tipc/msg.h    |  5 +++++
 2 files changed, 38 insertions(+), 17 deletions(-)

diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 20566e9..6f11c62 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -337,23 +337,16 @@ static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b)
  */
 static void bearer_disable(struct net *net, struct tipc_bearer *b)
 {
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
-	u32 i;
+	struct tipc_net *tn = tipc_net(net);
+	int bearer_id = b->identity;
 
 	pr_info("Disabling bearer <%s>\n", b->name);
 	b->media->disable_media(b);
-
-	tipc_node_delete_links(net, b->identity);
+	tipc_node_delete_links(net, bearer_id);
 	RCU_INIT_POINTER(b->media_ptr, NULL);
 	if (b->link_req)
 		tipc_disc_delete(b->link_req);
-
-	for (i = 0; i < MAX_BEARERS; i++) {
-		if (b == rtnl_dereference(tn->bearer_list[i])) {
-			RCU_INIT_POINTER(tn->bearer_list[i], NULL);
-			break;
-		}
-	}
+	RCU_INIT_POINTER(tn->bearer_list[bearer_id], NULL);
 	kfree_rcu(b, rcu);
 }
 
@@ -396,7 +389,7 @@ void tipc_disable_l2_media(struct tipc_bearer *b)
 
 /**
  * tipc_l2_send_msg - send a TIPC packet out over an L2 interface
- * @buf: the packet to be sent
+ * @skb: the packet to be sent
  * @b: the bearer through which the packet is to be sent
  * @dest: peer destination address
  */
@@ -405,17 +398,21 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb,
 {
 	struct net_device *dev;
 	int delta;
+	void *tipc_ptr;
 
 	dev = (struct net_device *)rcu_dereference_rtnl(b->media_ptr);
 	if (!dev)
 		return 0;
 
+	/* Send RESET message even if bearer is detached from device */
+	tipc_ptr = rtnl_dereference(dev->tipc_ptr);
+	if (unlikely(!tipc_ptr && !msg_is_reset(buf_msg(skb))))
+		goto drop;
+
 	delta = dev->hard_header_len - skb_headroom(skb);
 	if ((delta > 0) &&
-	    pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
-		kfree_skb(skb);
-		return 0;
-	}
+	    pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC))
+		goto drop;
 
 	skb_reset_network_header(skb);
 	skb->dev = dev;
@@ -424,6 +421,9 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb,
 			dev->dev_addr, skb->len);
 	dev_queue_xmit(skb);
 	return 0;
+drop:
+	kfree_skb(skb);
+	return 0;
 }
 
 int tipc_bearer_mtu(struct net *net, u32 bearer_id)
@@ -549,9 +549,18 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt,
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 	struct net *net = dev_net(dev);
+	struct tipc_net *tn = tipc_net(net);
 	struct tipc_bearer *b;
+	int i;
 
 	b = rtnl_dereference(dev->tipc_ptr);
+	if (!b) {
+		for (i = 0; i < MAX_BEARERS; b = NULL, i++) {
+			b = rtnl_dereference(tn->bearer_list[i]);
+			if (b && (b->media_ptr == dev))
+				break;
+		}
+	}
 	if (!b)
 		return NOTIFY_DONE;
 
@@ -561,13 +570,20 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt,
 	case NETDEV_CHANGE:
 		if (netif_carrier_ok(dev))
 			break;
+	case NETDEV_UP:
+		rcu_assign_pointer(dev->tipc_ptr, b);
+		break;
 	case NETDEV_GOING_DOWN:
+		RCU_INIT_POINTER(dev->tipc_ptr, NULL);
+		synchronize_net();
+		tipc_reset_bearer(net, b);
+		break;
 	case NETDEV_CHANGEMTU:
 		tipc_reset_bearer(net, b);
 		break;
 	case NETDEV_CHANGEADDR:
 		b->media->raw2addr(b, &b->addr,
-				       (char *)dev->dev_addr);
+				   (char *)dev->dev_addr);
 		tipc_reset_bearer(net, b);
 		break;
 	case NETDEV_UNREGISTER:
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 55778a0..f34f639 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -779,6 +779,11 @@ static inline bool msg_peer_node_is_up(struct tipc_msg *m)
 	return msg_redundant_link(m);
 }
 
+static inline bool msg_is_reset(struct tipc_msg *hdr)
+{
+	return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG);
+}
+
 struct sk_buff *tipc_buf_acquire(u32 size);
 bool tipc_msg_validate(struct sk_buff *skb);
 bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err);
-- 
1.9.1


------------------------------------------------------------------------------

^ permalink raw reply related

* [PATCH net-next 1/3] tipc: eliminate buffer leak in bearer layer
From: Jon Maloy @ 2016-04-05 16:20 UTC (permalink / raw)
  To: davem; +Cc: Jon Maloy, netdev, Paul Gortmaker, tipc-discussion
In-Reply-To: <1459873255-32354-1-git-send-email-jon.maloy@ericsson.com>

When enabling a bearer we create a 'neighbor discoverer' instance by
calling the function tipc_disc_create() before the bearer is actually
registered in the list of enabled bearers. Because of this, the very
first discovery broadcast message, created by the mentioned function,
is lost, since it cannot find any valid bearer to use. Furthermore,
the used send function, tipc_bearer_xmit_skb(), does not free the given
buffer when it cannot find a bearer, resulting in the leak of exactly
one send buffer each time a bearer is enabled.

This commit fixes this problem by introducing two changes:

1) Instead of attempting to send the discovery message directly, we let
   tipc_disc_create() return the discovery buffer to the calling
   function, tipc_enable_bearer(), so that the latter can send it
   when the enabling sequence is finished.

2) In tipc_bearer_xmit_skb(), as well as in the two other transmit
   functions at the bearer layer, we now free the indicated buffer or
   buffer chain when a valid bearer cannot be found.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
---
 net/tipc/bearer.c   | 51 ++++++++++++++++++++++++++-------------------------
 net/tipc/discover.c |  7 ++-----
 net/tipc/discover.h |  2 +-
 3 files changed, 29 insertions(+), 31 deletions(-)

diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 27a5406..20566e9 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -205,6 +205,7 @@ static int tipc_enable_bearer(struct net *net, const char *name,
 	struct tipc_bearer *b;
 	struct tipc_media *m;
 	struct tipc_bearer_names b_names;
+	struct sk_buff *skb;
 	char addr_string[16];
 	u32 bearer_id;
 	u32 with_this_prio;
@@ -301,7 +302,7 @@ restart:
 	b->net_plane = bearer_id + 'A';
 	b->priority = priority;
 
-	res = tipc_disc_create(net, b, &b->bcast_addr);
+	res = tipc_disc_create(net, b, &b->bcast_addr, &skb);
 	if (res) {
 		bearer_disable(net, b);
 		pr_warn("Bearer <%s> rejected, discovery object creation failed\n",
@@ -310,7 +311,8 @@ restart:
 	}
 
 	rcu_assign_pointer(tn->bearer_list[bearer_id], b);
-
+	if (skb)
+		tipc_bearer_xmit_skb(net, bearer_id, skb, &b->bcast_addr);
 	pr_info("Enabled bearer <%s>, discovery domain %s, priority %u\n",
 		name,
 		tipc_addr_string_fill(addr_string, disc_domain), priority);
@@ -450,6 +452,8 @@ void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id,
 	b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
 	if (likely(b))
 		b->media->send_msg(net, skb, b, dest);
+	else
+		kfree_skb(skb);
 	rcu_read_unlock();
 }
 
@@ -468,11 +472,11 @@ void tipc_bearer_xmit(struct net *net, u32 bearer_id,
 
 	rcu_read_lock();
 	b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
-	if (likely(b)) {
-		skb_queue_walk_safe(xmitq, skb, tmp) {
-			__skb_dequeue(xmitq);
-			b->media->send_msg(net, skb, b, dst);
-		}
+	if (unlikely(!b))
+		__skb_queue_purge(xmitq);
+	skb_queue_walk_safe(xmitq, skb, tmp) {
+		__skb_dequeue(xmitq);
+		b->media->send_msg(net, skb, b, dst);
 	}
 	rcu_read_unlock();
 }
@@ -490,14 +494,14 @@ void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id,
 
 	rcu_read_lock();
 	b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
-	if (likely(b)) {
-		skb_queue_walk_safe(xmitq, skb, tmp) {
-			hdr = buf_msg(skb);
-			msg_set_non_seq(hdr, 1);
-			msg_set_mc_netid(hdr, net_id);
-			__skb_dequeue(xmitq);
-			b->media->send_msg(net, skb, b, &b->bcast_addr);
-		}
+	if (unlikely(!b))
+		__skb_queue_purge(xmitq);
+	skb_queue_walk_safe(xmitq, skb, tmp) {
+		hdr = buf_msg(skb);
+		msg_set_non_seq(hdr, 1);
+		msg_set_mc_netid(hdr, net_id);
+		__skb_dequeue(xmitq);
+		b->media->send_msg(net, skb, b, &b->bcast_addr);
 	}
 	rcu_read_unlock();
 }
@@ -513,24 +517,21 @@ void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id,
  * ignores packets sent using interface multicast, and traffic sent to other
  * nodes (which can happen if interface is running in promiscuous mode).
  */
-static int tipc_l2_rcv_msg(struct sk_buff *buf, struct net_device *dev,
+static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev,
 			   struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct tipc_bearer *b;
 
 	rcu_read_lock();
 	b = rcu_dereference_rtnl(dev->tipc_ptr);
-	if (likely(b)) {
-		if (likely(buf->pkt_type <= PACKET_BROADCAST)) {
-			buf->next = NULL;
-			tipc_rcv(dev_net(dev), buf, b);
-			rcu_read_unlock();
-			return NET_RX_SUCCESS;
-		}
+	if (likely(b && (skb->pkt_type <= PACKET_BROADCAST))) {
+		skb->next = NULL;
+		tipc_rcv(dev_net(dev), skb, b);
+		rcu_read_unlock();
+		return NET_RX_SUCCESS;
 	}
 	rcu_read_unlock();
-
-	kfree_skb(buf);
+	kfree_skb(skb);
 	return NET_RX_DROP;
 }
 
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index f1e738e..ad9d477 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -268,10 +268,9 @@ exit:
  * Returns 0 if successful, otherwise -errno.
  */
 int tipc_disc_create(struct net *net, struct tipc_bearer *b,
-		     struct tipc_media_addr *dest)
+		     struct tipc_media_addr *dest, struct sk_buff **skb)
 {
 	struct tipc_link_req *req;
-	struct sk_buff *skb;
 
 	req = kmalloc(sizeof(*req), GFP_ATOMIC);
 	if (!req)
@@ -293,9 +292,7 @@ int tipc_disc_create(struct net *net, struct tipc_bearer *b,
 	setup_timer(&req->timer, disc_timeout, (unsigned long)req);
 	mod_timer(&req->timer, jiffies + req->timer_intv);
 	b->link_req = req;
-	skb = skb_clone(req->buf, GFP_ATOMIC);
-	if (skb)
-		tipc_bearer_xmit_skb(net, req->bearer_id, skb, &req->dest);
+	*skb = skb_clone(req->buf, GFP_ATOMIC);
 	return 0;
 }
 
diff --git a/net/tipc/discover.h b/net/tipc/discover.h
index c9b1277..b80a335 100644
--- a/net/tipc/discover.h
+++ b/net/tipc/discover.h
@@ -40,7 +40,7 @@
 struct tipc_link_req;
 
 int tipc_disc_create(struct net *net, struct tipc_bearer *b_ptr,
-		     struct tipc_media_addr *dest);
+		     struct tipc_media_addr *dest, struct sk_buff **skb);
 void tipc_disc_delete(struct tipc_link_req *req);
 void tipc_disc_reset(struct net *net, struct tipc_bearer *b_ptr);
 void tipc_disc_add_dest(struct tipc_link_req *req);
-- 
1.9.1


------------------------------------------------------------------------------

^ permalink raw reply related

* [PATCH net-next 0/3] tipc: some small fixes
From: Jon Maloy @ 2016-04-05 16:20 UTC (permalink / raw)
  To: davem; +Cc: Jon Maloy, netdev, Paul Gortmaker, tipc-discussion

When running TIPC in large clusters we experience behavior that
may potentially become problematic in the future. This series
picks some low-hanging fruit in this regard, and also fixes a
couple of other minor issues.

Jon Maloy (3):
  tipc: eliminate buffer leak in bearer layer
  tipc: stricter filtering of packets in bearer layer
  tipc: reduce transmission rate of reset messages when link is down

 net/tipc/bearer.c   | 101 ++++++++++++++++++++++++++++++----------------------
 net/tipc/discover.c |   7 ++--
 net/tipc/discover.h |   2 +-
 net/tipc/link.c     |  10 +++---
 net/tipc/msg.h      |   5 +++
 5 files changed, 73 insertions(+), 52 deletions(-)

-- 
1.9.1


------------------------------------------------------------------------------

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox