Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [patch net-next v2 10/10] net: sched: add termination action to allow goto chain
From: Jamal Hadi Salim @ 2017-05-16 12:46 UTC (permalink / raw)
  To: Daniel Borkmann, Jiri Pirko
  Cc: netdev, davem, xiyou.wangcong, dsa, edumazet, stephen,
	alexander.h.duyck, simon.horman, mlxsw, alexei.starovoitov
In-Reply-To: <591AB168.30202@iogearbox.net>

On 17-05-16 03:59 AM, Daniel Borkmann wrote:
> On 05/16/2017 06:43 AM, Jiri Pirko wrote:
>> Mon, May 15, 2017 at 10:02:08PM CEST, daniel@iogearbox.net wrote:

[..]
>>>
>>> Given this goto chain feature is pretty much only interesting for hw
>>> offloads, can we move this further away from the sw fast path to not
>>> add up to the cost per packet? (I doubt anyone is using
>>> TC_ACT_RECLASSIFY
>>> in sw as well ...)
>>
>> I don't think so. First of all, the whole thing would be broken then in
>> sw. It is useful to have it in sw, at least for testing reasons.
>> So I would leave the unlikely and add it to the second check as well.
>
> Ok, lets go with that then, thanks!

Side comment:
I dont think this is only useful for h/w offloads;->

cheers,
jamal

^ permalink raw reply

* [PATCH v3 net-next 4/7] net: add new control message for incoming HW-timestamped packets
From: Miroslav Lichvar @ 2017-05-16 12:44 UTC (permalink / raw)
  To: netdev; +Cc: Richard Cochran, Willem de Bruijn
In-Reply-To: <20170516124425.6294-1-mlichvar@redhat.com>

Add SOF_TIMESTAMPING_OPT_PKTINFO option to request a new control message
for incoming packets with hardware timestamps. It contains the index of
the real interface which received the packet and the length of the
packet at layer 2.

The index is useful with bonding, bridges and other interfaces, where
IP_PKTINFO doesn't allow applications to determine which PHC made the
timestamp. With the L2 length (and link speed) it is possible to
transpose preamble timestamps to trailer timestamps, which are used in
the NTP protocol.

While this information could be provided by two new socket options
independently from timestamping, it doesn't look like they would be very
useful. With this option any performance impact is limited to hardware
timestamping.

Use dev_get_by_napi_id() to get the device and its index. On kernels
with disabled CONFIG_NET_RX_BUSY_POLL or drivers not using NAPI, a zero
index will be returned in the control message.

CC: Richard Cochran <richardcochran@gmail.com>
CC: Willem de Bruijn <willemb@google.com>
Signed-off-by: Miroslav Lichvar <mlichvar@redhat.com>
---
 Documentation/networking/timestamping.txt |  9 +++++++++
 include/uapi/asm-generic/socket.h         |  2 ++
 include/uapi/linux/net_tstamp.h           | 10 +++++++++-
 net/socket.c                              | 27 ++++++++++++++++++++++++++-
 4 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt
index 96f5069..600c6bf 100644
--- a/Documentation/networking/timestamping.txt
+++ b/Documentation/networking/timestamping.txt
@@ -193,6 +193,15 @@ SOF_TIMESTAMPING_OPT_STATS:
   the transmit timestamps, such as how long a certain block of
   data was limited by peer's receiver window.
 
+SOF_TIMESTAMPING_OPT_PKTINFO:
+
+  Enable the SCM_TIMESTAMPING_PKTINFO control message for incoming
+  packets with hardware timestamps. The message contains struct
+  scm_ts_pktinfo, which supplies the index of the real interface which
+  received the packet and its length at layer 2. A valid (non-zero)
+  interface index will be returned only if CONFIG_NET_RX_BUSY_POLL is
+  enabled and the driver is using NAPI.
+
 New applications are encouraged to pass SOF_TIMESTAMPING_OPT_ID to
 disambiguate timestamps and SOF_TIMESTAMPING_OPT_TSONLY to operate
 regardless of the setting of sysctl net.core.tstamp_allow_data.
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 2b48856..a5f6e81 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -100,4 +100,6 @@
 
 #define SO_COOKIE		57
 
+#define SCM_TIMESTAMPING_PKTINFO	58
+
 #endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h
index 0749fb1..f2fb455 100644
--- a/include/uapi/linux/net_tstamp.h
+++ b/include/uapi/linux/net_tstamp.h
@@ -9,6 +9,7 @@
 #ifndef _NET_TIMESTAMPING_H
 #define _NET_TIMESTAMPING_H
 
+#include <linux/types.h>
 #include <linux/socket.h>   /* for SO_TIMESTAMPING */
 
 /* SO_TIMESTAMPING gets an integer bit field comprised of these values */
@@ -26,8 +27,9 @@ enum {
 	SOF_TIMESTAMPING_OPT_CMSG = (1<<10),
 	SOF_TIMESTAMPING_OPT_TSONLY = (1<<11),
 	SOF_TIMESTAMPING_OPT_STATS = (1<<12),
+	SOF_TIMESTAMPING_OPT_PKTINFO = (1<<13),
 
-	SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_STATS,
+	SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_PKTINFO,
 	SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) |
 				 SOF_TIMESTAMPING_LAST
 };
@@ -130,4 +132,10 @@ enum hwtstamp_rx_filters {
 	HWTSTAMP_FILTER_NTP_ALL,
 };
 
+/* SCM_TIMESTAMPING_PKTINFO control message */
+struct scm_ts_pktinfo {
+	__u32 if_index;
+	__u32 pkt_length;
+};
+
 #endif /* _NET_TIMESTAMPING_H */
diff --git a/net/socket.c b/net/socket.c
index c2564eb..ee1f4ec 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -662,6 +662,27 @@ static bool skb_is_err_queue(const struct sk_buff *skb)
 	return skb->pkt_type == PACKET_OUTGOING;
 }
 
+static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb)
+{
+	struct scm_ts_pktinfo ts_pktinfo;
+	struct net_device *orig_dev;
+	int ifindex = 0;
+
+	if (!skb_mac_header_was_set(skb))
+		return;
+
+	rcu_read_lock();
+	orig_dev = dev_get_by_napi_id(skb_napi_id(skb));
+	if (orig_dev)
+		ifindex = orig_dev->ifindex;
+	rcu_read_unlock();
+
+	ts_pktinfo.if_index = ifindex;
+	ts_pktinfo.pkt_length = skb->len - skb_mac_offset(skb);
+	put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_PKTINFO,
+		 sizeof(ts_pktinfo), &ts_pktinfo);
+}
+
 /*
  * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP)
  */
@@ -699,8 +720,12 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 		empty = 0;
 	if (shhwtstamps &&
 	    (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
-	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2))
+	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2)) {
 		empty = 0;
+		if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) &&
+		    !skb_is_err_queue(skb))
+			put_ts_pktinfo(msg, skb);
+	}
 	if (!empty) {
 		put_cmsg(msg, SOL_SOCKET,
 			 SCM_TIMESTAMPING, sizeof(tss), &tss);
-- 
2.9.3

^ permalink raw reply related

* [PATCH v3 net-next 7/7] net: ethernet: update drivers to make both SW and HW TX timestamps
From: Miroslav Lichvar @ 2017-05-16 12:44 UTC (permalink / raw)
  To: netdev; +Cc: Richard Cochran, Willem de Bruijn
In-Reply-To: <20170516124425.6294-1-mlichvar@redhat.com>

Some drivers were calling the skb_tx_timestamp() function only when
a hardware timestamp was not requested. Now that applications can use
the SOF_TIMESTAMPING_OPT_TX_SWHW option to request both software and
hardware timestamps, the drivers need to be modified to unconditionally
call skb_tx_timestamp().

CC: Richard Cochran <richardcochran@gmail.com>
CC: Willem de Bruijn <willemb@google.com>
Signed-off-by: Miroslav Lichvar <mlichvar@redhat.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c          | 3 +--
 drivers/net/ethernet/intel/e1000e/netdev.c        | 4 ++--
 drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c   | 3 +--
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 6 ++----
 4 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 89b21d7..5a2ad9c 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -1391,8 +1391,7 @@ static void xgbe_prep_tx_tstamp(struct xgbe_prv_data *pdata,
 		spin_unlock_irqrestore(&pdata->tstamp_lock, flags);
 	}
 
-	if (!XGMAC_GET_BITS(packet->attributes, TX_PACKET_ATTRIBUTES, PTP))
-		skb_tx_timestamp(skb);
+	skb_tx_timestamp(skb);
 }
 
 static void xgbe_prep_vlan(struct sk_buff *skb, struct xgbe_packet_data *packet)
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 0ff9295..6ed3bc4 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -5868,10 +5868,10 @@ static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb,
 			adapter->tx_hwtstamp_skb = skb_get(skb);
 			adapter->tx_hwtstamp_start = jiffies;
 			schedule_work(&adapter->tx_hwtstamp_work);
-		} else {
-			skb_tx_timestamp(skb);
 		}
 
+		skb_tx_timestamp(skb);
+
 		netdev_sent_queue(netdev, skb->len);
 		e1000_tx_queue(tx_ring, tx_flags, count);
 		/* Make sure there is space in the ring for the next send. */
diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c
index 1e59435..89831ad 100644
--- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c
+++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c
@@ -1418,8 +1418,7 @@ static netdev_tx_t sxgbe_xmit(struct sk_buff *skb, struct net_device *dev)
 		priv->hw->desc->tx_enable_tstamp(first_desc);
 	}
 
-	if (!tqueue->hwts_tx_en)
-		skb_tx_timestamp(skb);
+	skb_tx_timestamp(skb);
 
 	priv->hw->dma->enable_dma_transmission(priv->ioaddr, txq_index);
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index cce862b..27c12e7 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2880,8 +2880,7 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
 		priv->xstats.tx_set_ic_bit++;
 	}
 
-	if (!priv->hwts_tx_en)
-		skb_tx_timestamp(skb);
+	skb_tx_timestamp(skb);
 
 	if (unlikely((skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) &&
 		     priv->hwts_tx_en)) {
@@ -3084,8 +3083,7 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 		priv->xstats.tx_set_ic_bit++;
 	}
 
-	if (!priv->hwts_tx_en)
-		skb_tx_timestamp(skb);
+	skb_tx_timestamp(skb);
 
 	/* Ready to fill the first descriptor and set the OWN bit w/o any
 	 * problems because all the descriptors are actually ready to be
-- 
2.9.3

^ permalink raw reply related

* [PATCH v3 net-next 3/7] net: add function to retrieve original skb device using NAPI ID
From: Miroslav Lichvar @ 2017-05-16 12:44 UTC (permalink / raw)
  To: netdev; +Cc: Richard Cochran, Willem de Bruijn
In-Reply-To: <20170516124425.6294-1-mlichvar@redhat.com>

Since commit b68581778cd0 ("net: Make skb->skb_iif always track
skb->dev") skbs don't have the original index of the interface which
received the packet. This information is now needed for a new control
message related to hardware timestamping.

Instead of adding a new field to skb, we can find the device by the NAPI
ID if it is available, i.e. CONFIG_NET_RX_BUSY_POLL is enabled and the
driver is using NAPI. Add dev_get_by_napi_id() and also skb_napi_id() to
hide the CONFIG_NET_RX_BUSY_POLL ifdef.

CC: Richard Cochran <richardcochran@gmail.com>
CC: Willem de Bruijn <willemb@google.com>
Suggested-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Miroslav Lichvar <mlichvar@redhat.com>
---
 include/linux/netdevice.h |  1 +
 include/linux/skbuff.h    |  9 +++++++++
 net/core/dev.c            | 26 ++++++++++++++++++++++++++
 3 files changed, 36 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3f39d27..b6c36d5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2456,6 +2456,7 @@ static inline int dev_recursion_level(void)
 struct net_device *dev_get_by_index(struct net *net, int ifindex);
 struct net_device *__dev_get_by_index(struct net *net, int ifindex);
 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
+struct net_device *dev_get_by_napi_id(unsigned int napi_id);
 int netdev_get_name(struct net *net, char *name, int ifindex);
 int dev_restart(struct net_device *dev);
 int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb);
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a098d95..42dd430 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -915,6 +915,15 @@ static inline bool skb_pkt_type_ok(u32 ptype)
 	return ptype <= PACKET_OTHERHOST;
 }
 
+static inline unsigned int skb_napi_id(const struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	return skb->napi_id;
+#else
+	return 0;
+#endif
+}
+
 void kfree_skb(struct sk_buff *skb);
 void kfree_skb_list(struct sk_buff *segs);
 void skb_tx_error(struct sk_buff *skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index fca407b..dcbc84e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -161,6 +161,7 @@ static int netif_rx_internal(struct sk_buff *skb);
 static int call_netdevice_notifiers_info(unsigned long val,
 					 struct net_device *dev,
 					 struct netdev_notifier_info *info);
+static struct napi_struct *napi_by_id(unsigned int napi_id);
 
 /*
  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
@@ -865,6 +866,31 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex)
 EXPORT_SYMBOL(dev_get_by_index);
 
 /**
+ *	dev_get_by_napi_id - find a device by napi_id
+ *	@napi_id: ID of the NAPI struct
+ *
+ *	Search for an interface by NAPI ID. Returns %NULL if the device
+ *	is not found or a pointer to the device. The device has not had
+ *	its reference counter increased so the caller must be careful
+ *	about locking. The caller must hold RCU lock.
+ */
+
+struct net_device *dev_get_by_napi_id(unsigned int napi_id)
+{
+	struct napi_struct *napi;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	if (napi_id < MIN_NAPI_ID)
+		return NULL;
+
+	napi = napi_by_id(napi_id);
+
+	return napi ? napi->dev : NULL;
+}
+EXPORT_SYMBOL(dev_get_by_napi_id);
+
+/**
  *	netdev_get_name - get a netdevice name, knowing its ifindex.
  *	@net: network namespace
  *	@name: a pointer to the buffer where the name will be stored.
-- 
2.9.3

^ permalink raw reply related

* [PATCH v3 net-next 6/7] net: allow simultaneous SW and HW transmit timestamping
From: Miroslav Lichvar @ 2017-05-16 12:44 UTC (permalink / raw)
  To: netdev; +Cc: Richard Cochran, Willem de Bruijn
In-Reply-To: <20170516124425.6294-1-mlichvar@redhat.com>

Add SOF_TIMESTAMPING_OPT_TX_SWHW option to allow an outgoing packet to
be looped to the socket's error queue with a software timestamp even
when a hardware transmit timestamp is expected to be provided by the
driver.

Applications using this option will receive two separate messages from
the error queue, one with a software timestamp and the other with a
hardware timestamp. As the hardware timestamp is saved to the shared skb
info, which may happen before the first message with software timestamp
is received by the application, the hardware timestamp is copied to the
SCM_TIMESTAMPING control message only when the skb has no software
timestamp or it is an incoming packet.

While changing sw_tx_timestamp(), inline it in skb_tx_timestamp() as
there are no other users.

CC: Richard Cochran <richardcochran@gmail.com>
CC: Willem de Bruijn <willemb@google.com>
Signed-off-by: Miroslav Lichvar <mlichvar@redhat.com>
---
 Documentation/networking/timestamping.txt | 14 ++++++++++++--
 include/linux/skbuff.h                    | 10 ++--------
 include/uapi/linux/net_tstamp.h           |  3 ++-
 net/core/skbuff.c                         |  4 ++++
 net/socket.c                              | 11 +++++++++++
 5 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt
index 600c6bf..55b0007 100644
--- a/Documentation/networking/timestamping.txt
+++ b/Documentation/networking/timestamping.txt
@@ -202,6 +202,14 @@ SOF_TIMESTAMPING_OPT_PKTINFO:
   interface index will be returned only if CONFIG_NET_RX_BUSY_POLL is
   enabled and the driver is using NAPI.
 
+SOF_TIMESTAMPING_OPT_TX_SWHW:
+
+  Request both hardware and software timestamps for outgoing packets
+  when SOF_TIMESTAMPING_TX_HARDWARE and SOF_TIMESTAMPING_TX_SOFTWARE
+  are enabled at the same time. If both timestamps are generated,
+  two separate messages will be looped to the socket's error queue,
+  each containing just one timestamp.
+
 New applications are encouraged to pass SOF_TIMESTAMPING_OPT_ID to
 disambiguate timestamps and SOF_TIMESTAMPING_OPT_TSONLY to operate
 regardless of the setting of sysctl net.core.tstamp_allow_data.
@@ -321,8 +329,10 @@ struct scm_timestamping {
 };
 
 The structure can return up to three timestamps. This is a legacy
-feature. Only one field is non-zero at any time. Most timestamps
-are passed in ts[0]. Hardware timestamps are passed in ts[2].
+feature. Most timestamps are passed in ts[0]. Hardware timestamps
+are passed in ts[2]. Incoming packets may have timestamps in both
+ts[0] and ts[2], but for outgoing packets only one field is non-zero
+at any time.
 
 ts[1] used to hold hardware timestamps converted to system time.
 Instead, expose the hardware clock device on the NIC directly as
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 42dd430..3824549 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3307,13 +3307,6 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
 void skb_tstamp_tx(struct sk_buff *orig_skb,
 		   struct skb_shared_hwtstamps *hwtstamps);
 
-static inline void sw_tx_timestamp(struct sk_buff *skb)
-{
-	if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP &&
-	    !(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS))
-		skb_tstamp_tx(skb, NULL);
-}
-
 /**
  * skb_tx_timestamp() - Driver hook for transmit timestamping
  *
@@ -3329,7 +3322,8 @@ static inline void sw_tx_timestamp(struct sk_buff *skb)
 static inline void skb_tx_timestamp(struct sk_buff *skb)
 {
 	skb_clone_tx_timestamp(skb);
-	sw_tx_timestamp(skb);
+	if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP)
+		skb_tstamp_tx(skb, NULL);
 }
 
 /**
diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h
index f2fb455..3cab0ab 100644
--- a/include/uapi/linux/net_tstamp.h
+++ b/include/uapi/linux/net_tstamp.h
@@ -28,8 +28,9 @@ enum {
 	SOF_TIMESTAMPING_OPT_TSONLY = (1<<11),
 	SOF_TIMESTAMPING_OPT_STATS = (1<<12),
 	SOF_TIMESTAMPING_OPT_PKTINFO = (1<<13),
+	SOF_TIMESTAMPING_OPT_TX_SWHW = (1<<14),
 
-	SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_PKTINFO,
+	SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_TX_SWHW,
 	SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) |
 				 SOF_TIMESTAMPING_LAST
 };
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 346d3e8..68c02df 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3875,6 +3875,10 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
 	if (!sk)
 		return;
 
+	if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
+	    skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
+		return;
+
 	tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY;
 	if (!skb_may_tx_timestamp(sk, tsonly))
 		return;
diff --git a/net/socket.c b/net/socket.c
index 879df37..e5a5f89 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -662,6 +662,16 @@ static bool skb_is_err_queue(const struct sk_buff *skb)
 	return skb->pkt_type == PACKET_OUTGOING;
 }
 
+/* On transmit, software and hardware timestamps are returned independently.
+ * As the two skb clones share the hardware timestamp, which may be updated
+ * before the software timestamp is received, a hardware TX timestamp may be
+ * returned only if there is no software TX timestamp.
+ */
+static bool skb_is_swtx_tstamp(const struct sk_buff *skb)
+{
+	return skb->tstamp && skb_is_err_queue(skb);
+}
+
 static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb)
 {
 	struct scm_ts_pktinfo ts_pktinfo;
@@ -721,6 +731,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 		empty = 0;
 	if (shhwtstamps &&
 	    (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
+	    !skb_is_swtx_tstamp(skb) &&
 	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2)) {
 		empty = 0;
 		if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) &&
-- 
2.9.3

^ permalink raw reply related

* [PATCH v3 net-next 0/7] Extend socket timestamping API
From: Miroslav Lichvar @ 2017-05-16 12:44 UTC (permalink / raw)
  To: netdev

Changes v2->v3:
- modified struct scm_ts_pktinfo to use fixed-width integer types
- added WARN_ON_ONCE for missing RCU lock in dev_get_by_napi_id()
- modified dev_get_by_napi_id() to not return dev in unexpected branch
- modified recv to return SCM_TIMESTAMPING_PKTINFO even if the interface
  index is unknown

Changes v1->v2:
- added separate patch for new NAPI functions 
- split code from __sock_recv_timestamp() for better readability
- fixed RCU locking
- fixed compiler warning (missing case in switch in first patch)
- inline sw_tx_timestamp() in its only user

Changes RFC->v1:
- reworked SOF_TIMESTAMPING_OPT_PKTINFO patch to not add new fields to
  skb shared info (net device is now looked up by napi_id), not require
  any changes in drivers, and restrict the cmsg to incoming packets
- renamed SOF_TIMESTAMPING_OPT_MULTIMSG to SOF_TIMESTAMPING_OPT_TX_SWHW
  and fixed its description
- moved struct scm_ts_pktinfo from errqueue.h to net_tstamp.h as it
  can't be received from the error queue anymore
- improved commit descriptions and removed incorrect comment

This patchset adds new options to the timestamping API that will be
useful for NTP implementations and possibly other applications.

The first patch specifies a timestamp filter for NTP packets. The second
patch updates drivers that can timestamp all packets, or need to list
the filter as unsupported. There is no attempt to add the support to the
phyter driver.

The third patch adds two helper functions working with NAPI ID, which is
needed by the next patch. The fourth patch adds a new option to get a
new control message with the L2 length and interface index for incoming
packets with hardware timestamps.

The fifth patch fixes the code to not make a false software TX timestamp
when hardware timestamping is enabled. The sixth patch depends on this
fix.

The sixth patch adds a new option to request both software and hardware
timestamps for outgoing packets. The seventh patch updates drivers that
assumed software timestamping cannot be used together with hardware
timestamping.

The patches have been tested on x86_64 machines with igb and e1000e
drivers.

Miroslav Lichvar (7):
  net: define receive timestamp filter for NTP
  net: ethernet: update drivers to handle HWTSTAMP_FILTER_NTP_ALL
  net: add function to retrieve original skb device using NAPI ID
  net: add new control message for incoming HW-timestamped packets
  net: don't make false software transmit timestamps
  net: allow simultaneous SW and HW transmit timestamping
  net: ethernet: update drivers to make both SW and HW TX timestamps

 Documentation/networking/timestamping.txt          | 23 ++++++++++--
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c           |  4 +--
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c   |  1 +
 drivers/net/ethernet/cavium/liquidio/lio_main.c    |  1 +
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c |  1 +
 drivers/net/ethernet/cavium/octeon/octeon_mgmt.c   |  1 +
 drivers/net/ethernet/intel/e1000e/netdev.c         |  5 +--
 drivers/net/ethernet/intel/i40e/i40e_ptp.c         |  1 +
 drivers/net/ethernet/intel/igb/igb_ptp.c           |  1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c       |  1 +
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c     |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_clock.c |  1 +
 drivers/net/ethernet/neterion/vxge/vxge-main.c     |  1 +
 drivers/net/ethernet/qlogic/qede/qede_ptp.c        |  1 +
 drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c    |  3 +-
 drivers/net/ethernet/sfc/ef10.c                    |  1 +
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  |  7 ++--
 drivers/net/ethernet/ti/cpsw.c                     |  1 +
 drivers/net/ethernet/tile/tilegx.c                 |  1 +
 include/linux/netdevice.h                          |  1 +
 include/linux/skbuff.h                             | 19 +++++-----
 include/uapi/asm-generic/socket.h                  |  2 ++
 include/uapi/linux/net_tstamp.h                    | 14 +++++++-
 net/core/dev.c                                     | 26 ++++++++++++++
 net/core/dev_ioctl.c                               |  1 +
 net/core/skbuff.c                                  |  4 +++
 net/socket.c                                       | 41 ++++++++++++++++++++--
 27 files changed, 141 insertions(+), 23 deletions(-)

-- 
2.9.3

^ permalink raw reply

* [PATCH v3 net-next 5/7] net: don't make false software transmit timestamps
From: Miroslav Lichvar @ 2017-05-16 12:44 UTC (permalink / raw)
  To: netdev; +Cc: Richard Cochran, Willem de Bruijn
In-Reply-To: <20170516124425.6294-1-mlichvar@redhat.com>

If software timestamping is enabled by the SO_TIMESTAMP(NS) option
when a message without timestamp is already waiting in the queue, the
__sock_recv_timestamp() function will read the current time to make a
timestamp in order to always have something for the application.

However, this applies also to outgoing packets looped back to the error
queue when hardware timestamping is enabled by the SO_TIMESTAMPING
option. A software transmit timestamp made after the actual transmission
is added to messages with hardware timestamps.

Modify the function to save the current time as a software timestamp
only if it's for a received packet (i.e. it's not in the error queue).

CC: Richard Cochran <richardcochran@gmail.com>
CC: Willem de Bruijn <willemb@google.com>
Signed-off-by: Miroslav Lichvar <mlichvar@redhat.com>
---
 net/socket.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/socket.c b/net/socket.c
index ee1f4ec..879df37 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -689,7 +689,8 @@ static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb)
 void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 	struct sk_buff *skb)
 {
-	int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP);
+	int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP) &&
+				   !skb_is_err_queue(skb);
 	struct scm_timestamping tss;
 	int empty = 1;
 	struct skb_shared_hwtstamps *shhwtstamps =
-- 
2.9.3

^ permalink raw reply related

* [PATCH v3 net-next 2/7] net: ethernet: update drivers to handle HWTSTAMP_FILTER_NTP_ALL
From: Miroslav Lichvar @ 2017-05-16 12:44 UTC (permalink / raw)
  To: netdev; +Cc: Richard Cochran, Willem de Bruijn
In-Reply-To: <20170516124425.6294-1-mlichvar@redhat.com>

Include HWTSTAMP_FILTER_NTP_ALL in net_hwtstamp_validate() as a valid
filter and update drivers which can timestamp all packets, or which
explicitly list unsupported filters instead of using a default case, to
handle the filter.

CC: Richard Cochran <richardcochran@gmail.com>
CC: Willem de Bruijn <willemb@google.com>
Signed-off-by: Miroslav Lichvar <mlichvar@redhat.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c           | 1 +
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c   | 1 +
 drivers/net/ethernet/cavium/liquidio/lio_main.c    | 1 +
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 1 +
 drivers/net/ethernet/cavium/octeon/octeon_mgmt.c   | 1 +
 drivers/net/ethernet/intel/e1000e/netdev.c         | 1 +
 drivers/net/ethernet/intel/i40e/i40e_ptp.c         | 1 +
 drivers/net/ethernet/intel/igb/igb_ptp.c           | 1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c       | 1 +
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c     | 1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_clock.c | 1 +
 drivers/net/ethernet/neterion/vxge/vxge-main.c     | 1 +
 drivers/net/ethernet/qlogic/qede/qede_ptp.c        | 1 +
 drivers/net/ethernet/sfc/ef10.c                    | 1 +
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 1 +
 drivers/net/ethernet/ti/cpsw.c                     | 1 +
 drivers/net/ethernet/tile/tilegx.c                 | 1 +
 net/core/dev_ioctl.c                               | 3 +--
 18 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index c772420..89b21d7 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -1268,6 +1268,7 @@ static int xgbe_set_hwtstamp_settings(struct xgbe_prv_data *pdata,
 	case HWTSTAMP_FILTER_NONE:
 		break;
 
+	case HWTSTAMP_FILTER_NTP_ALL:
 	case HWTSTAMP_FILTER_ALL:
 		XGMAC_SET_BITS(mac_tscr, MAC_TSCR, TSENALL, 1);
 		XGMAC_SET_BITS(mac_tscr, MAC_TSCR, TSENA, 1);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index a851f95..2f30b1a 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -15351,6 +15351,7 @@ int bnx2x_configure_ptp_filters(struct bnx2x *bp)
 		break;
 	case HWTSTAMP_FILTER_ALL:
 	case HWTSTAMP_FILTER_SOME:
+	case HWTSTAMP_FILTER_NTP_ALL:
 		bp->rx_filter = HWTSTAMP_FILTER_NONE;
 		break;
 	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index 927617c..7a0ef5b 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -3020,6 +3020,7 @@ static int hwtstamp_ioctl(struct net_device *netdev, struct ifreq *ifr)
 	case HWTSTAMP_FILTER_PTP_V2_EVENT:
 	case HWTSTAMP_FILTER_PTP_V2_SYNC:
 	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+	case HWTSTAMP_FILTER_NTP_ALL:
 		conf.rx_filter = HWTSTAMP_FILTER_ALL;
 		break;
 	default:
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index 34c7782..15e21b5 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -2100,6 +2100,7 @@ static int hwtstamp_ioctl(struct net_device *netdev, struct ifreq *ifr)
 	case HWTSTAMP_FILTER_PTP_V2_EVENT:
 	case HWTSTAMP_FILTER_PTP_V2_SYNC:
 	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+	case HWTSTAMP_FILTER_NTP_ALL:
 		conf.rx_filter = HWTSTAMP_FILTER_ALL;
 		break;
 	default:
diff --git a/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c b/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c
index a213868..2887bca 100644
--- a/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c
+++ b/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c
@@ -755,6 +755,7 @@ static int octeon_mgmt_ioctl_hwtstamp(struct net_device *netdev,
 	case HWTSTAMP_FILTER_PTP_V2_EVENT:
 	case HWTSTAMP_FILTER_PTP_V2_SYNC:
 	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+	case HWTSTAMP_FILTER_NTP_ALL:
 		p->has_rx_tstamp = have_hw_timestamps;
 		config.rx_filter = HWTSTAMP_FILTER_ALL;
 		if (p->has_rx_tstamp) {
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index b367972..0ff9295 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -3680,6 +3680,7 @@ static int e1000e_config_hwtstamp(struct e1000_adapter *adapter,
 		 * Delay Request messages but not both so fall-through to
 		 * time stamp all packets.
 		 */
+	case HWTSTAMP_FILTER_NTP_ALL:
 	case HWTSTAMP_FILTER_ALL:
 		is_l2 = true;
 		is_l4 = true;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ptp.c b/drivers/net/ethernet/intel/i40e/i40e_ptp.c
index 18c1cc0..0efff18 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ptp.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ptp.c
@@ -562,6 +562,7 @@ static int i40e_ptp_set_timestamp_mode(struct i40e_pf *pf,
 			config->rx_filter = HWTSTAMP_FILTER_PTP_V2_L2_EVENT;
 		}
 		break;
+	case HWTSTAMP_FILTER_NTP_ALL:
 	case HWTSTAMP_FILTER_ALL:
 	default:
 		return -ERANGE;
diff --git a/drivers/net/ethernet/intel/igb/igb_ptp.c b/drivers/net/ethernet/intel/igb/igb_ptp.c
index 7a3fd4d..d333d6d 100644
--- a/drivers/net/ethernet/intel/igb/igb_ptp.c
+++ b/drivers/net/ethernet/intel/igb/igb_ptp.c
@@ -941,6 +941,7 @@ static int igb_ptp_set_timestamp_mode(struct igb_adapter *adapter,
 		is_l4 = true;
 		break;
 	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+	case HWTSTAMP_FILTER_NTP_ALL:
 	case HWTSTAMP_FILTER_ALL:
 		/* 82576 cannot timestamp all packets, which it needs to do to
 		 * support both V1 Sync and Delay_Req messages
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
index ef0635e..d44c728 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
@@ -883,6 +883,7 @@ static int ixgbe_ptp_set_timestamp_mode(struct ixgbe_adapter *adapter,
 				   IXGBE_FLAG_RX_HWTSTAMP_IN_REGISTER);
 		break;
 	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+	case HWTSTAMP_FILTER_NTP_ALL:
 	case HWTSTAMP_FILTER_ALL:
 		/* The X550 controller is capable of timestamping all packets,
 		 * which allows it to accept any filter.
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 94fab20..8243674 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2375,6 +2375,7 @@ static int mlx4_en_hwtstamp_set(struct net_device *dev, struct ifreq *ifr)
 	case HWTSTAMP_FILTER_PTP_V2_EVENT:
 	case HWTSTAMP_FILTER_PTP_V2_SYNC:
 	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+	case HWTSTAMP_FILTER_NTP_ALL:
 		config.rx_filter = HWTSTAMP_FILTER_ALL;
 		break;
 	default:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c
index e706a87..e294944 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c
@@ -128,6 +128,7 @@ int mlx5e_hwstamp_set(struct net_device *dev, struct ifreq *ifr)
 	case HWTSTAMP_FILTER_PTP_V2_EVENT:
 	case HWTSTAMP_FILTER_PTP_V2_SYNC:
 	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+	case HWTSTAMP_FILTER_NTP_ALL:
 		/* Disable CQE compression */
 		netdev_warn(dev, "Disabling cqe compression");
 		err = mlx5e_modify_rx_cqe_compression_locked(priv, false);
diff --git a/drivers/net/ethernet/neterion/vxge/vxge-main.c b/drivers/net/ethernet/neterion/vxge/vxge-main.c
index 6a4310a..50ea69d 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-main.c
+++ b/drivers/net/ethernet/neterion/vxge/vxge-main.c
@@ -3218,6 +3218,7 @@ static int vxge_hwtstamp_set(struct vxgedev *vdev, void __user *data)
 	case HWTSTAMP_FILTER_PTP_V2_EVENT:
 	case HWTSTAMP_FILTER_PTP_V2_SYNC:
 	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+	case HWTSTAMP_FILTER_NTP_ALL:
 		if (vdev->devh->config.hwts_en != VXGE_HW_HWTS_ENABLE)
 			return -EFAULT;
 
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ptp.c b/drivers/net/ethernet/qlogic/qede/qede_ptp.c
index 24f06e2..9b2280b 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ptp.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ptp.c
@@ -244,6 +244,7 @@ static int qede_ptp_cfg_filters(struct qede_dev *edev)
 		break;
 	case HWTSTAMP_FILTER_ALL:
 	case HWTSTAMP_FILTER_SOME:
+	case HWTSTAMP_FILTER_NTP_ALL:
 		ptp->rx_filter = HWTSTAMP_FILTER_NONE;
 		rx_filter = QED_PTP_FILTER_ALL;
 		break;
diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c
index 78efb28..ad9c4de 100644
--- a/drivers/net/ethernet/sfc/ef10.c
+++ b/drivers/net/ethernet/sfc/ef10.c
@@ -6068,6 +6068,7 @@ static int efx_ef10_ptp_set_ts_config(struct efx_nic *efx,
 	case HWTSTAMP_FILTER_PTP_V2_EVENT:
 	case HWTSTAMP_FILTER_PTP_V2_SYNC:
 	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+	case HWTSTAMP_FILTER_NTP_ALL:
 		init->rx_filter = HWTSTAMP_FILTER_ALL;
 		rc = efx_ptp_change_mode(efx, true, 0);
 		if (!rc)
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index a74c481..cce862b 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -644,6 +644,7 @@ static int stmmac_hwtstamp_ioctl(struct net_device *dev, struct ifreq *ifr)
 			ptp_over_ethernet = PTP_TCR_TSIPENA;
 			break;
 
+		case HWTSTAMP_FILTER_NTP_ALL:
 		case HWTSTAMP_FILTER_ALL:
 			/* time stamp any incoming packet */
 			config.rx_filter = HWTSTAMP_FILTER_ALL;
diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index f4d7aec..37fc165 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -1734,6 +1734,7 @@ static int cpsw_hwtstamp_set(struct net_device *dev, struct ifreq *ifr)
 	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
 	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
 	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
+	case HWTSTAMP_FILTER_NTP_ALL:
 		return -ERANGE;
 	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
 	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
index 7c634bc..aec9538 100644
--- a/drivers/net/ethernet/tile/tilegx.c
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -512,6 +512,7 @@ static int tile_hwtstamp_set(struct net_device *dev, struct ifreq *rq)
 	case HWTSTAMP_FILTER_PTP_V2_EVENT:
 	case HWTSTAMP_FILTER_PTP_V2_SYNC:
 	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+	case HWTSTAMP_FILTER_NTP_ALL:
 		config.rx_filter = HWTSTAMP_FILTER_ALL;
 		break;
 	default:
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 8f036a7..77f04e7 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -225,9 +225,8 @@ static int net_hwtstamp_validate(struct ifreq *ifr)
 	case HWTSTAMP_FILTER_PTP_V2_EVENT:
 	case HWTSTAMP_FILTER_PTP_V2_SYNC:
 	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
-		rx_filter_valid = 1;
-		break;
 	case HWTSTAMP_FILTER_NTP_ALL:
+		rx_filter_valid = 1;
 		break;
 	}
 
-- 
2.9.3

^ permalink raw reply related

* [PATCH v3 net-next 1/7] net: define receive timestamp filter for NTP
From: Miroslav Lichvar @ 2017-05-16 12:44 UTC (permalink / raw)
  To: netdev; +Cc: Richard Cochran, Willem de Bruijn
In-Reply-To: <20170516124425.6294-1-mlichvar@redhat.com>

Add HWTSTAMP_FILTER_NTP_ALL to the hwtstamp_rx_filters enum for
timestamping of NTP packets. There is currently only one driver
(phyter) that could support it directly.

CC: Richard Cochran <richardcochran@gmail.com>
CC: Willem de Bruijn <willemb@google.com>
Signed-off-by: Miroslav Lichvar <mlichvar@redhat.com>
---
 include/uapi/linux/net_tstamp.h | 3 +++
 net/core/dev_ioctl.c            | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h
index 464dcca..0749fb1 100644
--- a/include/uapi/linux/net_tstamp.h
+++ b/include/uapi/linux/net_tstamp.h
@@ -125,6 +125,9 @@ enum hwtstamp_rx_filters {
 	HWTSTAMP_FILTER_PTP_V2_SYNC,
 	/* PTP v2/802.AS1, any layer, Delay_req packet */
 	HWTSTAMP_FILTER_PTP_V2_DELAY_REQ,
+
+	/* NTP, UDP, all versions and packet modes */
+	HWTSTAMP_FILTER_NTP_ALL,
 };
 
 #endif /* _NET_TIMESTAMPING_H */
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index b94b1d2..8f036a7 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -227,6 +227,8 @@ static int net_hwtstamp_validate(struct ifreq *ifr)
 	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
 		rx_filter_valid = 1;
 		break;
+	case HWTSTAMP_FILTER_NTP_ALL:
+		break;
 	}
 
 	if (!tx_type_valid || !rx_filter_valid)
-- 
2.9.3

^ permalink raw reply related

* Re: [PATCH] rt2x00: improve calling conventions for register accessors
From: Stanislaw Gruszka @ 2017-05-16 12:43 UTC (permalink / raw)
  To: Johannes Berg
  Cc: David Miller, arnd, helmut.schaa, kvalo, daniel, dev,
	pozega.tomislav, vasilugin, roman, linux-wireless, netdev,
	linux-kernel
In-Reply-To: <1494935936.15923.0.camel@sipsolutions.net>

On Tue, May 16, 2017 at 01:58:56PM +0200, Johannes Berg wrote:
> On Tue, 2017-05-16 at 13:55 +0200, Stanislaw Gruszka wrote:
> > 
> > In rt2x00 driver we use poor convention in other kind of registers
> > accessors like bbp, mac, eeprom. I dislike to changing only rfcsr
> > accessors and leaving others in the old way. And changing all
> > accessors would be massive and error prone change, which I'm not
> > prefer either.
> 
> That's a stupid argument, but for the sake of it - the conversion can
> easily be done with coccinelle/spatch without being "error prone".

Sure, but still I think it would be preferable to fix newly added
rt2800_bw_filter_calibration() function, instead of ancient rfcsr
accessors.

Stanislaw  

^ permalink raw reply

* Re: [PATCH v2 30/53] net: fix some identation issues at kernel-doc markups
From: David Howells @ 2017-05-16 12:38 UTC (permalink / raw)
  To: Mauro Carvalho Chehab
  Cc: dhowells, Linux Doc Mailing List, Mauro Carvalho Chehab,
	linux-kernel, Jonathan Corbet, David Woodhouse, Brian Norris,
	Boris Brezillon, Marek Vasut, Richard Weinberger, Cyrille Pitchen,
	linux-mtd, Andrew Lunn, Florian Fainelli, David S. Miller,
	Eric Dumazet, Paolo Abeni, Al Viro, Hannes Frederic Sowa,
	Alexander Duyck <alexa
In-Reply-To: <d651983dde41a854e25664d98cbfc999d55785a8.1494935649.git.mchehab@s-opensource.com>

Mauro Carvalho Chehab <mchehab@s-opensource.com> wrote:

> - *	calling skb_free_datagram). Returns NULL with *err set to
> + *	calling skb_free_datagram). Returns NULL with @err set to

I think this should be *@err or @*err.  err is not itself changed.

David

^ permalink raw reply

* Re: [PATCH v2 1/3] bpf: Use 1<<16 as ceiling for immediate alignment in verifier.
From: Edward Cree @ 2017-05-16 12:37 UTC (permalink / raw)
  To: David Miller, daniel; +Cc: ast, alexei.starovoitov, netdev
In-Reply-To: <20170515.120431.1588221938554447723.davem@davemloft.net>

On 15/05/17 17:04, David Miller wrote:
> If we use 1<<31, then sequences like:
>
> 	R1 = 0
> 	R1 <<= 2
>
> do silly things.
Hmm.  It might be a bit late for this, but I wonder if, instead of handling
 alignments as (1 << align), you could store them as -(1 << align), i.e.
 leading 1s followed by 'align' 0s.
Now the alignment of 0 is 0 (really 1 << 32), which doesn't change when
 left-shifted some more.  Shifts of other numbers' alignments also do the
 right thing, e.g. align(6) << 2 = (-2) << 2 = -8 = align(6 << 2).  Of
 course you do all this in unsigned, to make sure right shifts work.
This also makes other arithmetic simple to track; for instance, align(a + b)
 is at worst align(a) | align(b).  (Of course, this bound isn't tight.)
A number is 2^(n+1)-aligned if the 2^n bit of its alignment is cleared.
Considered as unsigned numbers, smaller values are stricter alignments.

-Ed

^ permalink raw reply

* RE: [PATCH] net: fec: select queue depending on VLAN priority
From: Andy Duan @ 2017-05-16 12:30 UTC (permalink / raw)
  To: Stefan Agner
  Cc: David Miller, andrew@lunn.ch, festevam@gmail.com,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <2f01cfbfae700463dc09433064006429@agner.ch>

From: Stefan Agner <stefan@agner.ch> Sent: Monday, May 15, 2017 1:39 PM
>To: Andy Duan <fugang.duan@nxp.com>
>Cc: David Miller <davem@davemloft.net>; andrew@lunn.ch;
>festevam@gmail.com; netdev@vger.kernel.org; linux-
>kernel@vger.kernel.org
>Subject: Re: [PATCH] net: fec: select queue depending on VLAN priority
>
>On 2017-05-10 21:49, Andy Duan wrote:
>> From: Stefan Agner <stefan@agner.ch> Sent: Thursday, May 11, 2017
>> 12:08 PM
>>>To: Andy Duan <fugang.duan@nxp.com>
>>>Cc: David Miller <davem@davemloft.net>; andrew@lunn.ch;
>>>festevam@gmail.com; netdev@vger.kernel.org; linux-
>>>kernel@vger.kernel.org
>>>Subject: Re: [PATCH] net: fec: select queue depending on VLAN priority
>>>
>>>On 2017-05-09 19:42, Andy Duan wrote:
>>>> From: David Miller <davem@davemloft.net> Sent: Tuesday, May 09, 2017
>>>> 9:39 PM
>>>>>To: stefan@agner.ch
>>>>>Cc: Andy Duan <fugang.duan@nxp.com>; andrew@lunn.ch;
>>>>>festevam@gmail.com; netdev@vger.kernel.org; linux-
>>>>>kernel@vger.kernel.org
>>>>>Subject: Re: [PATCH] net: fec: select queue depending on VLAN
>>>>>priority
>>>>>
>>>>>From: Stefan Agner <stefan@agner.ch>
>>>>>Date: Mon,  8 May 2017 22:37:08 -0700
>>>>>
>>>>>> Since the addition of the multi queue code with commit
>>>>>> 59d0f7465644
>>>>>> ("net: fec: init multi queue date structure") the queue selection
>>>>>> has been handelt by the default transmit queue selection
>>>>>> implementation which tries to evenly distribute the traffic across
>>>>>> all available queues. This selection presumes that the queues are
>>>>>> using an equal priority, however, the queues 1 and 2 are actually
>>>>>> of higher priority (the classification of the queues is enabled in
>>>fec_enet_enable_ring).
>>>>>>
>>>>>> This can lead to net scheduler warnings and continuous TX ring
>>>>>> dumps when exercising the system with iperf.
>>>>>>
>>>>>> Use only queue 0 for all common traffic (no VLAN and P802.1p
>>>>>> priority
>>>>>> 0 and 1) and route level 2-7 through queue 1 and 2.
>>>>>>
>>>>>> Signed-off-by: Fugang Duan <fugang.duan@nxp.com>
>>>>>> Fixes: 59d0f7465644 ("net: fec: init multi queue date structure")
>>>>>
>>>>>If the queues are used for prioritization, and it does not have
>>>>>multiple normal priority level queues, multiqueue is not what the
>>>>>driver should have implemented.
>>>> Firstly, HW multiple queues support:
>>>> 	- Traffic-shaping bandwidth distribution supports credit-based and
>>>> round-robin-based policies. Either policy can be combined with
>>>> time-based shaping.
>>>> 	- AVB (Audio Video Bridging, IEEE 802.1Qav) features:
>>>> 		* Credit-based bandwidth distribution policy can be combined
>>>with
>>>> time-based shaping
>>>> 		* AVB endpoint talker and listener support
>>>> 		* Support for arbitration between different priority traffic (for
>>>> example, AVB class A, AVB class B, and non-AVB) Round-robin-based
>>>> policies:
>>>> 	It has the same priority for three queues: In the round-robin QoS
>>>> scheme, each queue is given an equal opportunity to transmit one
>>>> frame. For example, if queue n has a frame to transmit, the queue
>>>> transmits its frame. After queue n has transmitted its frame, or if
>>>> queue n does not have a frame to transmit, queue n+1 is then allowed
>>>> to transmit its frame, and so on.
>>>>
>>>> Credit-based policies:
>>>> 	The AVB credit based shaper acts independently, per class, to
>>>> control the bandwidth distribution between normal traffic and
>>>> time-sensitive traffic with respect to the total link bandwidth available.
>>>> 	Credit based shaper conbined with time-based shaping:
>>>> 		- priority: ClassA queue > ClassB queue > best effort
>>>> 		- ensure the queue bandwidth as user set based on time-
>>>based shaping
>>>> algorithms (transmitter transmit frame from three queue in turn
>>>> based on time-based shaping algorithms)
>>>> 	And in real AVB case,  each streaming can be independent, and are
>>>> fixed on related queue. Then driver level should implement
>>>> .ndo_select_queue() to put the streaming into related queue. That is
>>>> what the patch did.
>>>>
>>>> The current driver config the three queue to credit-based policies
>>>> (AVB), the patch seems no problem for the implementation. Do you
>>>> have any suggestion ?
>>>>
>>>
>>>I tried using the round robin mode by adding this:
>>>
>>>+       /* Set Round-Robin policy */
>>>+       writel(1, fep->hwp + FEC_QOS_SCHEME);
>>>
>>>After a while I got the warning non the less:
>>>
>>>WARNING: CPU: 0 PID: 0 at net/sched/sch_generic.c:316
>>>dev_watchdog+0x248/0x24c NETDEV WATCHDOG: eth0 (fec): transmit
>queue
>>>2 timed out Modules linked in:
>>>CPU: 0 PID: 0 Comm: swapper/0 Not tainted
>>>4.11.0-rc1-00058-g56d22eced8bc- dirty #377 Hardware name: Freescale
>>>i.MX7 Dual (Device Tree) [<c02266b0>]
>>>(unwind_backtrace) from [<c0222d7c>] (show_stack+0x10/0x14)
>>>[<c0222d7c>]
>>>(show_stack) from [<c04d4098>] (dump_stack+0x78/0x8c) [<c04d4098>]
>>>(dump_stack) from [<c0236548>] (__warn+0xe8/0x100) [<c0236548>]
>>>(__warn) from [<c0236598>] (warn_slowpath_fmt+0x38/0x48)
>[<c0236598>]
>>>(warn_slowpath_fmt) from [<c0805904>]
>>>(dev_watchdog+0x248/0x24c)
>>>[<c0805904>] (dev_watchdog) from [<c028a0e8>]
>>>(call_timer_fn+0x28/0x98) [<c028a0e8>] (call_timer_fn) from
>>>[<c028a1f8>] (expire_timers+0xa0/0xac) [<c028a1f8>] (expire_timers)
>>>from [<c028a2a0>]
>>>(run_timer_softirq+0x9c/0x194)
>>>[<c028a2a0>] (run_timer_softirq) from [<c023aaf8>]
>>>(__do_softirq+0x114/0x234)
>>>[<c023aaf8>] (__do_softirq) from [<c023aee4>] (irq_exit+0xcc/0x108)
>>>[<c023aee4>] (irq_exit) from [<c0279920>]
>>>(__handle_domain_irq+0x80/0xec)
>>>[<c0279920>] (__handle_domain_irq) from [<c0201544>]
>>>(gic_handle_irq+0x48/0x8c)
>>>[<c0201544>] (gic_handle_irq) from [<c0223838>] (__irq_svc+0x58/0x8c)
>>>Exception stack(0xc1001f28 to 0xc1001f70)
>>>1f20:                   00000001 00000000 00000000 c022fdc0 c1000000
>>>c1003d80
>>>1f40: c1003d34 c0e72ed0 c0bd9b04 c1001f80 00000000 00000000 00000000
>>>c1001f78
>>>1f60: c022048c c0220490 60000013 ffffffff [<c0223838>] (__irq_svc)
>>>from [<c0220490>] (arch_cpu_idle+0x38/0x3c) [<c0220490>]
>>>(arch_cpu_idle) from [<c026ec60>] (do_idle+0x170/0x204) [<c026ec60>]
>>>(do_idle) from [<c026efac>] (cpu_startup_entry+0x18/0x1c) [<c026efac>]
>>>(cpu_startup_entry) from [<c0e00c88>]
>>>(start_kernel+0x394/0x3a0)
>>>---[ end trace a474f341d40e0705 ]---
>>>fec 30be0000.ethernet eth0: TX ring dump
>>>
>>>I disabled the regular ring dump and only printed one line. It seems
>>>to come up every 2 seconds, and checking cat /proc/interrupts showed
>>>that queue 2 stayed at its last value (3562218):
>>>
>>> 58:    3091320     GIC-0 150 Level     30be0000.ethernet
>>> 59:    3562218     GIC-0 151 Level     30be0000.ethernet
>>> 60:   13377922     GIC-0 152 Level     30be0000.ethernet
>>
>> Pls check ENETx_DMAnCFG[16] whether is set, and disable time-based
>> shaping for round robin.
>
>When I do not set ENETx_DMAnCFG[16] (DMA_CLASS_EN), then networking
>stops working (I cannot mount NFS).
>
>Time-based is disabled by default I guess?

No, you should clear ENETx_DMAnCFG[15:0],  and set ENETx_DMAnCFG[16].

Andy

^ permalink raw reply

* Re: [patch net-next v2 04/10] net: sched: replace nprio by a bool to make the function more readable
From: Jiri Pirko @ 2017-05-16 12:25 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: netdev, davem, xiyou.wangcong, dsa, edumazet, stephen, daniel,
	alexander.h.duyck, simon.horman, mlxsw
In-Reply-To: <0553194d-9d76-bb39-4b29-c3c734a6ff44@mojatatu.com>

Tue, May 16, 2017 at 02:09:25PM CEST, jhs@mojatatu.com wrote:
>On 17-05-15 04:38 AM, Jiri Pirko wrote:
>
>> diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
>> index 88ec1a1..0e49e6e 100644
>> --- a/net/sched/cls_api.c
>> +++ b/net/sched/cls_api.c
>> @@ -271,7 +271,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
>>  	struct tcmsg *t;
>>  	u32 protocol;
>>  	u32 prio;
>> -	u32 nprio;
>> +	bool prio_allocate;
>
>prio_allocated? (past tense seems more sensible)

No, it actually tell if the prio should be allocated:

                if (prio_allocate)
                        prio = tcf_auto_prio(tcf_chain_tp_prev(&chain_info));

I believe it is accurate.

^ permalink raw reply

* Re: [patch net-next v2 02/10] net: sched: introduce tcf block infractructure
From: Jiri Pirko @ 2017-05-16 12:23 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: netdev, davem, xiyou.wangcong, dsa, edumazet, stephen, daniel,
	alexander.h.duyck, simon.horman, mlxsw
In-Reply-To: <33ea772b-d35b-ae08-4137-b63185c2f590@mojatatu.com>

Tue, May 16, 2017 at 02:07:25PM CEST, jhs@mojatatu.com wrote:
>
>Jiri,
>
>I am sorry i am tied up elsewhere but will respond in chunks.
>
>On 17-05-15 04:38 AM, Jiri Pirko wrote:
>
>
>>  static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
>>  {
>>  	struct qdisc_skb_cb *qcb;
>
>
>> +int tcf_block_get(struct tcf_block **p_block,
>> +		  struct tcf_proto __rcu **p_filter_chain)
>> +{
>> +	struct tcf_block *block = kzalloc(sizeof(*block), GFP_KERNEL);
>> +
>> +	if (!block)
>> +		return -ENOMEM;
>> +	block->p_filter_chain = p_filter_chain;
>> +	*p_block = block;
>> +	return 0;
>> +}
>
>tcf_block_get() sounds odd. tcf_block_create()?

I used get/put because I plan to allow sharing of block between qdiscs
in future. Then there will be a refcount.


>
>> +EXPORT_SYMBOL(tcf_block_get);
>> +
>> +void tcf_block_put(struct tcf_block *block)
>> +{
>> +	if (!block)
>> +		return;
>> +	tcf_destroy_chain(block->p_filter_chain);
>> +	kfree(block);
>> +}
>
>tcf_destroy_block()?
>
>[..]
>
>> +	error = tcf_block_get(&flow->block, &flow->filter_list);
>> +	if (error) {
>> +		kfree(flow);
>> +		goto err_out;
>> +	}
>> +
>>  	flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
>>  	if (!flow->q)
>>  		flow->q = &noop_qdisc;
>> @@ -346,14 +353,13 @@ static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker)
>>  	}
>>  }
>> 
>> -static struct tcf_proto __rcu **atm_tc_find_tcf(struct Qdisc *sch,
>> -						unsigned long cl)
>> +static struct tcf_block *atm_tc_tcf_block(struct Qdisc *sch, unsigned long cl)
>
>Any reason you removed the verb "find" from all these calls?
>eg above: better to have atm_tc_tcf_block_find()?

Yeah, I was thinking about it. The thing is, the callback does not do
any lookup so "find" is not accurate. Also without "find" this is
shorter so I decided for this naming variant.



>
>cheers,
>jamal

^ permalink raw reply

* [PATCH net-next] bnx2x: Remove open coded carrier check
From: Leon Romanovsky @ 2017-05-16 12:20 UTC (permalink / raw)
  To: Yuval Mintz, David S . Miller; +Cc: netdev, Leon Romanovsky

From: Leon Romanovsky <leonro@mellanox.com>

There is inline function to test if carrier present,
so it makes open-coded solution redundant.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index a851f95c307a..7414ffd70c90 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -10303,7 +10303,7 @@ static void bnx2x_sp_rtnl_task(struct work_struct *work)
 	}
 	if (test_and_clear_bit(BNX2X_SP_RTNL_VFPF_CHANNEL_DOWN,
 			       &bp->sp_rtnl_state)){
-		if (!test_bit(__LINK_STATE_NOCARRIER, &bp->dev->state)) {
+		if (netif_carrier_ok(bp->dev)) {
 			bnx2x_tx_disable(bp);
 			BNX2X_ERR("PF indicated channel is not servicable anymore. This means this VF device is no longer operational\n");
 		}

^ permalink raw reply related

* [PATCH v2 30/53] net: fix some identation issues at kernel-doc markups
From: Mauro Carvalho Chehab @ 2017-05-16 12:16 UTC (permalink / raw)
  To: Linux Doc Mailing List
  Cc: Mauro Carvalho Chehab, Mauro Carvalho Chehab, linux-kernel,
	Jonathan Corbet, David Woodhouse, Brian Norris, Boris Brezillon,
	Marek Vasut, Richard Weinberger, Cyrille Pitchen, linux-mtd,
	Andrew Lunn, Florian Fainelli, David S. Miller, Eric Dumazet,
	Paolo Abeni, Al Viro, Hannes Frederic Sowa, Alexander Duyck
In-Reply-To: <cover.1494935649.git.mchehab@s-opensource.com>

Sphinx is very pedantic with regards to identation and
escape sequences:

  ./include/net/sock.h:1967: ERROR: Unexpected indentation.
  ./include/net/sock.h:1969: ERROR: Unexpected indentation.
  ./include/net/sock.h:1970: WARNING: Block quote ends without a blank line; unexpected unindent.
  ./include/net/sock.h:1971: WARNING: Block quote ends without a blank line; unexpected unindent.
  ./include/net/sock.h:2268: WARNING: Inline emphasis start-string without end-string.
  ./net/core/sock.c:2686: ERROR: Unexpected indentation.
  ./net/core/sock.c:2687: WARNING: Block quote ends without a blank line; unexpected unindent.
  ./net/core/datagram.c:182: WARNING: Inline emphasis start-string without end-string.
  ./include/linux/netdevice.h:1444: ERROR: Unexpected indentation.
  ./drivers/net/phy/phy.c:381: ERROR: Unexpected indentation.
  ./drivers/net/phy/phy.c:382: WARNING: Block quote ends without a blank line; unexpected unindent.

- Fix spacing where needed;
- Properly escape constants;
- Use a literal block for a race description.

No functional changes.

Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 drivers/net/phy/phy.c     | 1 +
 include/linux/netdevice.h | 9 +++++----
 include/net/sock.h        | 9 ++++-----
 net/core/datagram.c       | 2 +-
 net/core/sock.c           | 7 +++++--
 5 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 82ab8fb82587..709b8be53443 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -377,6 +377,7 @@ static void phy_sanitize_settings(struct phy_device *phydev)
  * @cmd: ethtool_cmd
  *
  * A few notes about parameter checking:
+ *
  * - We don't set port or transceiver, so we don't care what they
  *   were set to.
  * - phy_start_aneg() will make sure forced settings are sane, and
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9c23bd2efb56..56d54b6fac45 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1433,13 +1433,14 @@ enum netdev_priv_flags {
 
 /**
  *	struct net_device - The DEVICE structure.
- *		Actually, this whole structure is a big mistake.  It mixes I/O
- *		data with strictly "high-level" data, and it has to know about
- *		almost every data structure used in the INET module.
+ *
+ *	Actually, this whole structure is a big mistake.  It mixes I/O
+ *	data with strictly "high-level" data, and it has to know about
+ *	almost every data structure used in the INET module.
  *
  *	@name:	This is the first field of the "visible" part of this structure
  *		(i.e. as seen by users in the "Space.c" file).  It is the name
- *	 	of the interface.
+ *		of the interface.
  *
  *	@name_hlist: 	Device name hash chain, please keep it close to name[]
  *	@ifalias:	SNMP alias
diff --git a/include/net/sock.h b/include/net/sock.h
index 66349e49d468..9ca99b5c1328 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1953,11 +1953,10 @@ static inline bool sk_has_allocations(const struct sock *sk)
  * The purpose of the skwq_has_sleeper and sock_poll_wait is to wrap the memory
  * barrier call. They were added due to the race found within the tcp code.
  *
- * Consider following tcp code paths:
+ * Consider following tcp code paths::
  *
- * CPU1                  CPU2
- *
- * sys_select            receive packet
+ *   CPU1                CPU2
+ *   sys_select          receive packet
  *   ...                 ...
  *   __add_wait_queue    update tp->rcv_nxt
  *   ...                 ...
@@ -2264,7 +2263,7 @@ void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags);
  * @tsflags:	timestamping flags to use
  * @tx_flags:	completed with instructions for time stamping
  *
- * Note : callers should take care of initial *tx_flags value (usually 0)
+ * Note: callers should take care of initial ``*tx_flags`` value (usually 0)
  */
 static inline void sock_tx_timestamp(const struct sock *sk, __u16 tsflags,
 				     __u8 *tx_flags)
diff --git a/net/core/datagram.c b/net/core/datagram.c
index db1866f2ffcf..4dd594741b6d 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -181,7 +181,7 @@ static struct sk_buff *skb_set_peeked(struct sk_buff *skb)
  *
  *	This function will lock the socket if a skb is returned, so
  *	the caller needs to unlock the socket in that case (usually by
- *	calling skb_free_datagram). Returns NULL with *err set to
+ *	calling skb_free_datagram). Returns NULL with @err set to
  *	-EAGAIN if no data was available or to some other value if an
  *	error was detected.
  *
diff --git a/net/core/sock.c b/net/core/sock.c
index 79c6aee6af9b..6adc69edfdd6 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2682,9 +2682,12 @@ EXPORT_SYMBOL(release_sock);
  * @sk: socket
  *
  * This version should be used for very small section, where process wont block
- * return false if fast path is taken
+ * return false if fast path is taken:
+ *
  *   sk_lock.slock locked, owned = 0, BH disabled
- * return true if slow path is taken
+ *
+ * return true if slow path is taken:
+ *
  *   sk_lock.slock unlocked, owned = 1, BH enabled
  */
 bool lock_sock_fast(struct sock *sk)
-- 
2.9.3


^ permalink raw reply related

* Re: [PATCH] ravb: add wake-on-lan support via magic packet
From: Niklas Söderlund @ 2017-05-16 12:16 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Simon Horman, Sergei Shtylyov, netdev@vger.kernel.org,
	Linux-Renesas
In-Reply-To: <CAMuHMdV9fjYvV=axAkp5WQbXTbZSXg5+g=HNubzPYSRwmStiWg@mail.gmail.com>

Hi,

On 2017-05-16 13:36:21 +0200, Geert Uytterhoeven wrote:
> Hi Simon,
> 
> On Tue, May 16, 2017 at 1:01 PM, Simon Horman <horms@verge.net.au> wrote:
> > On Tue, May 16, 2017 at 11:07:34AM +0200, Geert Uytterhoeven wrote:
> >> On Tue, May 16, 2017 at 11:02 AM, Niklas Söderlund
> >> <niklas.soderlund@ragnatech.se> wrote:
> >> >> > Whit all this being said I still like to withdraw this patch as I found
> >> >> > another fault with it, ravb_wol_restore() will unconditionally be called
> >> >> > while ravb_wol_setup() will only be called if netif_running(ndev). This
> >> >> > is en easy fix and I will send out a v2 once we figure out what to do
> >> >> > about the clock.
> >> >>
> >> >> The clock issue is external to the ravb driver. If it works with
> >> >> s2idle, it should
> >> >> be OK.
> >> >
> >> > Do you think it's a good idea to move ahead with a v2 of the ravb WoL
> >> > patch to fix the unrelated issue and aim for it to be picked up prior to
> >> > suspend/resume support is added to the CPG/MSSR?
> >>
> >> Sure.
> >>
> >> It can still be used on R-Car Gen2, where we're not s*d by mandatory PSCI.
> >
> > Is there some way for - e.g. the driver - to not enable WoL on Gen3 SoCs
> > until the clock issues is sorted out? I'm quite happy to enable features
> 
> "priv->chip_id != RCAR_GEN3". However, as we don't have RAVB enabled
> on any R-Car Gen2 board, its use is limited.

I agree that it's not so useful to enable this on Gen2 only.

> 
> > where they work; not so much where they don't.
> 
> Agreed.
> 
> One workaround could be to disable/enable the module clock in the WoL
> resume path, to make sure it is enabled.  Once the enable count reaches
> 0, CCF will know it's disabled, and will really enable next time.
> You may need a double disable/double enable though, without testing I
> don't know remember the enable count is 1 or 2 at that point (due to PM
> runtime).

I thought about this but it feels like such a hack I did not dare 
suggest it :-) But at the same time it would be nice to enable WoL for 
the s2idle use-case where it works. Only resume from PSCI with WoL 
enabled that is broken, and WoL in PSCI suspend will never work :-)

How about I add another patch in v2 on-top of this that adds the clock 
disable/enable hack? That way it's clear that this is a workaround and 
once we have support for suspend/resume in CPG/MSSR just that patch can 
be reverted? Or is it cleaner to fold it in to this patch with a big 
comment that this is a workaround? Or is it maybe better to hold of on 
this until CPG/MSSR supports suspend/resume?

> 
> Gr{oetje,eeting}s,
> 
>                         Geert
> 
> --
> Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org
> 
> In personal conversations with technical people, I call myself a hacker. But
> when I'm talking to journalists I just say "programmer" or something like that.
>                                 -- Linus Torvalds

-- 
Regards,
Niklas Söderlund

^ permalink raw reply

* Re: [patch net-next v2 04/10] net: sched: replace nprio by a bool to make the function more readable
From: Jamal Hadi Salim @ 2017-05-16 12:09 UTC (permalink / raw)
  To: Jiri Pirko, netdev
  Cc: davem, xiyou.wangcong, dsa, edumazet, stephen, daniel,
	alexander.h.duyck, simon.horman, mlxsw
In-Reply-To: <20170515083857.3615-5-jiri@resnulli.us>

On 17-05-15 04:38 AM, Jiri Pirko wrote:

> diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
> index 88ec1a1..0e49e6e 100644
> --- a/net/sched/cls_api.c
> +++ b/net/sched/cls_api.c
> @@ -271,7 +271,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
>  	struct tcmsg *t;
>  	u32 protocol;
>  	u32 prio;
> -	u32 nprio;
> +	bool prio_allocate;

prio_allocated? (past tense seems more sensible)

cheers,
jamal

^ permalink raw reply

* Re: [patch net-next v2 02/10] net: sched: introduce tcf block infractructure
From: Jamal Hadi Salim @ 2017-05-16 12:07 UTC (permalink / raw)
  To: Jiri Pirko, netdev
  Cc: davem, xiyou.wangcong, dsa, edumazet, stephen, daniel,
	alexander.h.duyck, simon.horman, mlxsw
In-Reply-To: <20170515083857.3615-3-jiri@resnulli.us>


Jiri,

I am sorry i am tied up elsewhere but will respond in chunks.

On 17-05-15 04:38 AM, Jiri Pirko wrote:


>  static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
>  {
>  	struct qdisc_skb_cb *qcb;


> +int tcf_block_get(struct tcf_block **p_block,
> +		  struct tcf_proto __rcu **p_filter_chain)
> +{
> +	struct tcf_block *block = kzalloc(sizeof(*block), GFP_KERNEL);
> +
> +	if (!block)
> +		return -ENOMEM;
> +	block->p_filter_chain = p_filter_chain;
> +	*p_block = block;
> +	return 0;
> +}

tcf_block_get() sounds odd. tcf_block_create()?

> +EXPORT_SYMBOL(tcf_block_get);
> +
> +void tcf_block_put(struct tcf_block *block)
> +{
> +	if (!block)
> +		return;
> +	tcf_destroy_chain(block->p_filter_chain);
> +	kfree(block);
> +}

tcf_destroy_block()?

[..]

> +	error = tcf_block_get(&flow->block, &flow->filter_list);
> +	if (error) {
> +		kfree(flow);
> +		goto err_out;
> +	}
> +
>  	flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
>  	if (!flow->q)
>  		flow->q = &noop_qdisc;
> @@ -346,14 +353,13 @@ static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker)
>  	}
>  }
>
> -static struct tcf_proto __rcu **atm_tc_find_tcf(struct Qdisc *sch,
> -						unsigned long cl)
> +static struct tcf_block *atm_tc_tcf_block(struct Qdisc *sch, unsigned long cl)

Any reason you removed the verb "find" from all these calls?
eg above: better to have atm_tc_tcf_block_find()?

cheers,
jamal

^ permalink raw reply

* Re: [PATCH] rt2x00: improve calling conventions for register accessors
From: Johannes Berg @ 2017-05-16 11:58 UTC (permalink / raw)
  To: Stanislaw Gruszka, David Miller
  Cc: arnd, helmut.schaa, kvalo, daniel, dev, pozega.tomislav,
	vasilugin, roman, linux-wireless, netdev, linux-kernel
In-Reply-To: <20170516115511.GA4230@redhat.com>

On Tue, 2017-05-16 at 13:55 +0200, Stanislaw Gruszka wrote:
> 
> In rt2x00 driver we use poor convention in other kind of registers
> accessors like bbp, mac, eeprom. I dislike to changing only rfcsr
> accessors and leaving others in the old way. And changing all
> accessors would be massive and error prone change, which I'm not
> prefer either.

That's a stupid argument, but for the sake of it - the conversion can
easily be done with coccinelle/spatch without being "error prone".

johannes

^ permalink raw reply

* Re: [PATCH] rt2x00: improve calling conventions for register accessors
From: Stanislaw Gruszka @ 2017-05-16 11:55 UTC (permalink / raw)
  To: David Miller
  Cc: arnd, helmut.schaa, kvalo, daniel, dev, johannes.berg,
	pozega.tomislav, vasilugin, roman, linux-wireless, netdev,
	linux-kernel
In-Reply-To: <20170515.103951.2305484593464882104.davem@davemloft.net>

On Mon, May 15, 2017 at 10:39:51AM -0400, David Miller wrote:
> From: Stanislaw Gruszka <sgruszka@redhat.com>
> Date: Mon, 15 May 2017 16:28:01 +0200
> 
> > On Mon, May 15, 2017 at 03:46:55PM +0200, Arnd Bergmann wrote:
> >> With CONFIG_KASAN enabled and gcc-7, we get a warning about rather high
> >> stack usage (with a private patch set I have to turn on this warning,
> >> which I intend to get into the next kernel release):
> >> 
> >> wireless/ralink/rt2x00/rt2800lib.c: In function 'rt2800_bw_filter_calibration':
> >> wireless/ralink/rt2x00/rt2800lib.c:7990:1: error: the frame size of 2144 bytes is larger than 1536 bytes [-Werror=frame-larger-than=]
> >> 
> >> The problem is that KASAN inserts a redzone around each local variable that
> >> gets passed by reference, and the newly added function has a lot of them.
> >> We can easily avoid that here by changing the calling convention to have
> >> the output as the return value of the function. This should also results in
> >> smaller object code, saving around 4KB in .text with KASAN, or 2KB without
> >> KASAN.
> >> 
> >> Fixes: 41977e86c984 ("rt2x00: add support for MT7620")
> >> Signed-off-by: Arnd Bergmann <arnd@arndb.de>
> >> ---
> >>  drivers/net/wireless/ralink/rt2x00/rt2800lib.c | 319 +++++++++++++------------
> >>  1 file changed, 164 insertions(+), 155 deletions(-)
> > 
> > We have read(, &val) calling convention since forever in rt2x00 and that
> > was never a problem. I dislike to change that now to make some tools
> > happy, I think problem should be fixed in the tools instead.
> 
> Passing return values by reference is and always has been a really
> poor way to achieve what these functions are doing.
> 
> And frankly, whilst the tool could see what's going on here better, we
> should be making code easier rather than more difficult to audit.
> 
> I am therefore very much in favor of Arnd's change.
> 
> This isn't even a situation where there are multiple return values,
> such as needing to signal an error and return an unsigned value at the
> same time.
> 
> These functions return _one_ value, and therefore they should be
> returned as a true return value.

In rt2x00 driver we use poor convention in other kind of registers
accessors like bbp, mac, eeprom. I dislike to changing only rfcsr
accessors and leaving others in the old way. And changing all accessors
would be massive and error prone change, which I'm not prefer either.

Arnd, could this be fixed by refactoring rt2800_bw_filter_calibration()
function (which is enormous and definitely should be split into smaller
subroutines) ? If not, I would accept this patch.

Thanks
Stanislaw

^ permalink raw reply

* Re: [PATCH] ravb: add wake-on-lan support via magic packet
From: Geert Uytterhoeven @ 2017-05-16 11:36 UTC (permalink / raw)
  To: Simon Horman
  Cc: Niklas Söderlund, Sergei Shtylyov, netdev@vger.kernel.org,
	Linux-Renesas
In-Reply-To: <20170516110145.GA27247@verge.net.au>

Hi Simon,

On Tue, May 16, 2017 at 1:01 PM, Simon Horman <horms@verge.net.au> wrote:
> On Tue, May 16, 2017 at 11:07:34AM +0200, Geert Uytterhoeven wrote:
>> On Tue, May 16, 2017 at 11:02 AM, Niklas Söderlund
>> <niklas.soderlund@ragnatech.se> wrote:
>> >> > Whit all this being said I still like to withdraw this patch as I found
>> >> > another fault with it, ravb_wol_restore() will unconditionally be called
>> >> > while ravb_wol_setup() will only be called if netif_running(ndev). This
>> >> > is en easy fix and I will send out a v2 once we figure out what to do
>> >> > about the clock.
>> >>
>> >> The clock issue is external to the ravb driver. If it works with
>> >> s2idle, it should
>> >> be OK.
>> >
>> > Do you think it's a good idea to move ahead with a v2 of the ravb WoL
>> > patch to fix the unrelated issue and aim for it to be picked up prior to
>> > suspend/resume support is added to the CPG/MSSR?
>>
>> Sure.
>>
>> It can still be used on R-Car Gen2, where we're not s*d by mandatory PSCI.
>
> Is there some way for - e.g. the driver - to not enable WoL on Gen3 SoCs
> until the clock issues is sorted out? I'm quite happy to enable features

"priv->chip_id != RCAR_GEN3". However, as we don't have RAVB enabled
on any R-Car Gen2 board, its use is limited.

> where they work; not so much where they don't.

Agreed.

One workaround could be to disable/enable the module clock in the WoL
resume path, to make sure it is enabled.  Once the enable count reaches
0, CCF will know it's disabled, and will really enable next time.
You may need a double disable/double enable though, without testing I
don't know remember the enable count is 1 or 2 at that point (due to PM
runtime).

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* [PATCH] [net, 4.12] mlx5e: add CONFIG_INET dependency
From: Arnd Bergmann @ 2017-05-16 11:27 UTC (permalink / raw)
  To: Saeed Mahameed, Matan Barak, Leon Romanovsky
  Cc: Arnd Bergmann, David S. Miller, Erez Shitrit, Or Gerlitz,
	netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

We now reference the arp_tbl, which requires IPv4 support to be
enabled in the kernel, otherwise we get a link error:

drivers/net/built-in.o: In function `mlx5e_tc_update_neigh_used_value':
(.text+0x16afec): undefined reference to `arp_tbl'
drivers/net/built-in.o: In function `mlx5e_rep_neigh_init':
en_rep.c:(.text+0x16c16d): undefined reference to `arp_tbl'
drivers/net/built-in.o: In function `mlx5e_rep_netevent_event':
en_rep.c:(.text+0x16cbb5): undefined reference to `arp_tbl'

This adds a Kconfig dependency for it.

Fixes: 232c001398ae ("net/mlx5e: Add support to neighbour update flow")
Signed-off-by: Arnd Bergmann <arnd-r2nGTMty4D4@public.gmane.org>
---
Note: the mlxsw driver has referenced the same symbol for a while,
but had an implicit dependency on CONFIG_INET since it also depends
on SWITCHDEV.

 drivers/net/ethernet/mellanox/mlx5/core/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
index fc52d742b7f7..27251a78075c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -13,7 +13,7 @@ config MLX5_CORE
 
 config MLX5_CORE_EN
 	bool "Mellanox Technologies ConnectX-4 Ethernet support"
-	depends on NETDEVICES && ETHERNET && PCI && MLX5_CORE
+	depends on NETDEVICES && ETHERNET && INET && PCI && MLX5_CORE
 	depends on IPV6=y || IPV6=n || MLX5_CORE=m
 	imply PTP_1588_CLOCK
 	default n
-- 
2.9.0

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH v2 net-next] tcp: internal implementation for pacing
From: Eric Dumazet @ 2017-05-16 11:24 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, Eric Dumazet, Eric Dumazet, Neal Cardwell, Yuchung Cheng,
	Van Jacobson, Jerry Chu

BBR congestion control depends on pacing, and pacing is
currently handled by sch_fq packet scheduler for performance reasons,
and also because implemening pacing with FQ was convenient to truly
avoid bursts.

However there are many cases where this packet scheduler constraint
is not practical.
- Many linux hosts are not focusing on handling thousands of TCP
  flows in the most efficient way.
- Some routers use fq_codel or other AQM, but still would like
  to use BBR for the few TCP flows they initiate/terminate.

This patch implements an automatic fallback to internal pacing.

Pacing is requested either by BBR or use of SO_MAX_PACING_RATE option.

If sch_fq happens to be in the egress path, pacing is delegated to
the qdisc, otherwise pacing is done by TCP itself.

One advantage of pacing from TCP stack is to get more precise rtt
estimations, and less work done from TX completion, since TCP Small
queue limits are not generally hit. Setups with single TX queue but
many cpus might even benefit from this.

Note that unlike sch_fq, we do not take into account header sizes.
Taking care of these headers would add additional complexity for
no practical differences in behavior.

Some performance numbers using 800 TCP_STREAM flows rate limited to
~48 Mbit per second on 40Gbit NIC.

If MQ+pfifo_fast is used on the NIC :

$ sar -n DEV 1 5 | grep eth
14:48:44         eth0 725743.00 2932134.00  46776.76 4335184.68      0.00      0.00      1.00
14:48:45         eth0 725349.00 2932112.00  46751.86 4335158.90      0.00      0.00      0.00
14:48:46         eth0 725101.00 2931153.00  46735.07 4333748.63      0.00      0.00      0.00
14:48:47         eth0 725099.00 2931161.00  46735.11 4333760.44      0.00      0.00      1.00
14:48:48         eth0 725160.00 2931731.00  46738.88 4334606.07      0.00      0.00      0.00
Average:         eth0 725290.40 2931658.20  46747.54 4334491.74      0.00      0.00      0.40
$ vmstat 1 5
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 4  0      0 259825920  45644 2708324    0    0    21     2  247   98  0  0 100  0  0
 4  0      0 259823744  45644 2708356    0    0     0     0 2400825 159843  0 19 81  0  0
 0  0      0 259824208  45644 2708072    0    0     0     0 2407351 159929  0 19 81  0  0
 1  0      0 259824592  45644 2708128    0    0     0     0 2405183 160386  0 19 80  0  0
 1  0      0 259824272  45644 2707868    0    0     0    32 2396361 158037  0 19 81  0  0

Now use MQ+FQ :

lpaa23:~# echo fq >/proc/sys/net/core/default_qdisc
lpaa23:~# tc qdisc replace dev eth0 root mq

$ sar -n DEV 1 5 | grep eth
14:49:57         eth0 678614.00 2727930.00  43739.13 4033279.14      0.00      0.00      0.00
14:49:58         eth0 677620.00 2723971.00  43674.69 4027429.62      0.00      0.00      1.00
14:49:59         eth0 676396.00 2719050.00  43596.83 4020125.02      0.00      0.00      0.00
14:50:00         eth0 675197.00 2714173.00  43518.62 4012938.90      0.00      0.00      1.00
14:50:01         eth0 676388.00 2719063.00  43595.47 4020171.64      0.00      0.00      0.00
Average:         eth0 676843.00 2720837.40  43624.95 4022788.86      0.00      0.00      0.40
$ vmstat 1 5
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 2  0      0 259832240  46008 2710912    0    0    21     2  223  192  0  1 99  0  0
 1  0      0 259832896  46008 2710744    0    0     0     0 1702206 198078  0 17 82  0  0
 0  0      0 259830272  46008 2710596    0    0     0     0 1696340 197756  1 17 83  0  0
 4  0      0 259829168  46024 2710584    0    0    16     0 1688472 197158  1 17 82  0  0
 3  0      0 259830224  46024 2710408    0    0     0     0 1692450 197212  0 18 82  0  0

As expected, number of interrupts per second is very different.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Van Jacobson <vanj@google.com>
Cc: Jerry Chu <hkchu@google.com>
---
v2: added one missing kdoc for sk_pacing_status (kbuild test robot report)
 include/linux/tcp.h   |  2 ++
 include/net/sock.h    |  9 +++++-
 include/net/tcp.h     |  3 ++
 net/core/sock.c       |  4 +++
 net/ipv4/tcp_bbr.c    |  9 +++---
 net/ipv4/tcp_output.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_timer.c  |  3 ++
 net/sched/sch_fq.c    |  8 ++++++
 8 files changed, 113 insertions(+), 5 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index b6d5adcee8fcb611de202993623cc80274d262e4..22854f0284347a3bb047709478525ee5a9dd9b36 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -293,6 +293,8 @@ struct tcp_sock {
 	u32	sacked_out;	/* SACK'd packets			*/
 	u32	fackets_out;	/* FACK'd packets			*/
 
+	struct hrtimer	pacing_timer;
+
 	/* from STCP, retrans queue hinting */
 	struct sk_buff* lost_skb_hint;
 	struct sk_buff *retransmit_skb_hint;
diff --git a/include/net/sock.h b/include/net/sock.h
index f33e3d134e0b7f66329f2122d7acc8b396c1787b..28d33b08524fda31d50f97366e2a4fc6f9bff402 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -253,6 +253,7 @@ struct sock_common {
   *	@sk_ll_usec: usecs to busypoll when there is no data
   *	@sk_allocation: allocation mode
   *	@sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
+  *	@sk_pacing_status: Pacing status (requested, handled by sch_fq)
   *	@sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE)
   *	@sk_sndbuf: size of send buffer in bytes
   *	@sk_padding: unused element for alignment
@@ -396,7 +397,7 @@ struct sock {
 	__s32			sk_peek_off;
 	int			sk_write_pending;
 	__u32			sk_dst_pending_confirm;
-	/* Note: 32bit hole on 64bit arches */
+	u32			sk_pacing_status; /* see enum sk_pacing */
 	long			sk_sndtimeo;
 	struct timer_list	sk_timer;
 	__u32			sk_priority;
@@ -475,6 +476,12 @@ struct sock {
 	struct rcu_head		sk_rcu;
 };
 
+enum sk_pacing {
+	SK_PACING_NONE		= 0,
+	SK_PACING_NEEDED	= 1,
+	SK_PACING_FQ		= 2,
+};
+
 #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data)))
 
 #define rcu_dereference_sk_user_data(sk)	rcu_dereference(__sk_user_data((sk)))
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 38a7427ae902e35973a8b7fa0e95ff602ede0e87..b4dc93dae98c2d175ccadce150083705d237555e 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -574,6 +574,7 @@ void tcp_fin(struct sock *sk);
 void tcp_init_xmit_timers(struct sock *);
 static inline void tcp_clear_xmit_timers(struct sock *sk)
 {
+	hrtimer_cancel(&tcp_sk(sk)->pacing_timer);
 	inet_csk_clear_xmit_timers(sk);
 }
 
@@ -1945,4 +1946,6 @@ static inline void tcp_listendrop(const struct sock *sk)
 	__NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS);
 }
 
+enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer);
+
 #endif	/* _TCP_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index e43e71d7856b385111cd4c4b1bd835a78c670c60..93d011e35b8349954db6918055c2f90ae473d254 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1041,6 +1041,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 #endif
 
 	case SO_MAX_PACING_RATE:
+		if (val != ~0U)
+			cmpxchg(&sk->sk_pacing_status,
+				SK_PACING_NONE,
+				SK_PACING_NEEDED);
 		sk->sk_max_pacing_rate = val;
 		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
 					 sk->sk_max_pacing_rate);
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index b89bce4c721eed530f5cfc725b759147b38cef42..92b045c72163def1c1d6aa0f2002760186aa5dc3 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -52,10 +52,9 @@
  * There is a public e-mail list for discussing BBR development and testing:
  *   https://groups.google.com/forum/#!forum/bbr-dev
  *
- * NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled,
- * since pacing is integral to the BBR design and implementation.
- * BBR without pacing would not function properly, and may incur unnecessary
- * high packet loss rates.
+ * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled,
+ * otherwise TCP stack falls back to an internal pacing using one high
+ * resolution timer per TCP socket and may use more resources.
  */
 #include <linux/module.h>
 #include <net/tcp.h>
@@ -830,6 +829,8 @@ static void bbr_init(struct sock *sk)
 	bbr->cycle_idx = 0;
 	bbr_reset_lt_bw_sampling(sk);
 	bbr_reset_startup_mode(sk);
+
+	cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
 }
 
 static u32 bbr_sndbuf_expand(struct sock *sk)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 4858e190f6ac130c9441f58cb8944cc82bf67270..a32172d69a03cbe76b45ec3094222f6c3a73e27d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -904,6 +904,72 @@ void tcp_wfree(struct sk_buff *skb)
 	sk_free(sk);
 }
 
+/* Note: Called under hard irq.
+ * We can not call TCP stack right away.
+ */
+enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
+{
+	struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
+	struct sock *sk = (struct sock *)tp;
+	unsigned long nval, oval;
+
+	for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
+		struct tsq_tasklet *tsq;
+		bool empty;
+
+		if (oval & TSQF_QUEUED)
+			break;
+
+		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
+		nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
+		if (nval != oval)
+			continue;
+
+		if (!atomic_inc_not_zero(&sk->sk_wmem_alloc))
+			break;
+		/* queue this socket to tasklet queue */
+		tsq = this_cpu_ptr(&tsq_tasklet);
+		empty = list_empty(&tsq->head);
+		list_add(&tp->tsq_node, &tsq->head);
+		if (empty)
+			tasklet_schedule(&tsq->tasklet);
+		break;
+	}
+	return HRTIMER_NORESTART;
+}
+
+/* BBR congestion control needs pacing.
+ * Same remark for SO_MAX_PACING_RATE.
+ * sch_fq packet scheduler is efficiently handling pacing,
+ * but is not always installed/used.
+ * Return true if TCP stack should pace packets itself.
+ */
+static bool tcp_needs_internal_pacing(const struct sock *sk)
+{
+	return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED;
+}
+
+static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
+{
+	u64 len_ns;
+	u32 rate;
+
+	if (!tcp_needs_internal_pacing(sk))
+		return;
+	rate = sk->sk_pacing_rate;
+	if (!rate || rate == ~0U)
+		return;
+
+	/* Should account for header sizes as sch_fq does,
+	 * but lets make things simple.
+	 */
+	len_ns = (u64)skb->len * NSEC_PER_SEC;
+	do_div(len_ns, rate);
+	hrtimer_start(&tcp_sk(sk)->pacing_timer,
+		      ktime_add_ns(ktime_get(), len_ns),
+		      HRTIMER_MODE_ABS_PINNED);
+}
+
 /* This routine actually transmits TCP packets queued in by
  * tcp_do_sendmsg().  This is used by both the initial
  * transmission and possible later retransmissions.
@@ -1034,6 +1100,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	if (skb->len != tcp_header_size) {
 		tcp_event_data_sent(tp, sk);
 		tp->data_segs_out += tcp_skb_pcount(skb);
+		tcp_internal_pacing(sk, skb);
 	}
 
 	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
@@ -2086,6 +2153,12 @@ static int tcp_mtu_probe(struct sock *sk)
 	return -1;
 }
 
+static bool tcp_pacing_check(const struct sock *sk)
+{
+	return tcp_needs_internal_pacing(sk) &&
+	       hrtimer_active(&tcp_sk(sk)->pacing_timer);
+}
+
 /* TCP Small Queues :
  * Control number of packets in qdisc/devices to two packets / or ~1 ms.
  * (These limits are doubled for retransmits)
@@ -2210,6 +2283,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;
 
+		if (tcp_pacing_check(sk))
+			break;
+
 		tso_segs = tcp_init_tso_segs(skb, mss_now);
 		BUG_ON(!tso_segs);
 
@@ -2878,6 +2954,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 
 		if (skb == tcp_send_head(sk))
 			break;
+
+		if (tcp_pacing_check(sk))
+			break;
+
 		/* we could do better than to assign each time */
 		if (!hole)
 			tp->retransmit_skb_hint = skb;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 14672543cf0bd27bc59976d5cec38d2d3bbcdd2c..86934bcf685a65ec3af3d22f1801ffa33eea76e2 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -710,4 +710,7 @@ void tcp_init_xmit_timers(struct sock *sk)
 {
 	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
 				  &tcp_keepalive_timer);
+	hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC,
+		     HRTIMER_MODE_ABS_PINNED);
+	tcp_sk(sk)->pacing_timer.function = tcp_pace_kick;
 }
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index b488721a0059adb24aea47240afa0164a6e467a9..147fde73a0f566e8f6a26718adf176ef3943afa0 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -390,9 +390,17 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		q->stat_tcp_retrans++;
 	qdisc_qstats_backlog_inc(sch, skb);
 	if (fq_flow_is_detached(f)) {
+		struct sock *sk = skb->sk;
+
 		fq_flow_add_tail(&q->new_flows, f);
 		if (time_after(jiffies, f->age + q->flow_refill_delay))
 			f->credit = max_t(u32, f->credit, q->quantum);
+		if (sk && q->rate_enable) {
+			if (unlikely(smp_load_acquire(&sk->sk_pacing_status) !=
+				     SK_PACING_FQ))
+				smp_store_release(&sk->sk_pacing_status,
+						  SK_PACING_FQ);
+		}
 		q->inactive_flows--;
 	}
 
-- 
2.13.0.303.g4ebf302169-goog

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox