Netdev List
 help / color / mirror / Atom feed
* [PATCH v1 net-next 10/14] igb: Refactor igb_offload_cbs()
From: Jesus Sanchez-Palencia @ 2018-06-27 21:59 UTC (permalink / raw)
  To: netdev
  Cc: tglx, jan.altenberg, vinicius.gomes, kurt.kanzenbach, henrik,
	richardcochran, levi.pearson, ilias.apalodimas, ivan.khoronzhuk,
	mlichvar, willemb, jhs, xiyou.wangcong, jiri,
	Jesus Sanchez-Palencia
In-Reply-To: <20180627215950.6719-1-jesus.sanchez-palencia@intel.com>

Split code into a separate function (igb_offload_apply()) that will be
used by ETF offload implementation.

Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 8c90f1e51add..c30ab7b260cc 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -2474,6 +2474,19 @@ igb_features_check(struct sk_buff *skb, struct net_device *dev,
 	return features;
 }
 
+static void igb_offload_apply(struct igb_adapter *adapter, s32 queue)
+{
+	if (!is_fqtss_enabled(adapter)) {
+		enable_fqtss(adapter, true);
+		return;
+	}
+
+	igb_config_tx_modes(adapter, queue);
+
+	if (!is_any_cbs_enabled(adapter))
+		enable_fqtss(adapter, false);
+}
+
 static int igb_offload_cbs(struct igb_adapter *adapter,
 			   struct tc_cbs_qopt_offload *qopt)
 {
@@ -2494,15 +2507,7 @@ static int igb_offload_cbs(struct igb_adapter *adapter,
 	if (err)
 		return err;
 
-	if (is_fqtss_enabled(adapter)) {
-		igb_config_tx_modes(adapter, qopt->queue);
-
-		if (!is_any_cbs_enabled(adapter))
-			enable_fqtss(adapter, false);
-
-	} else {
-		enable_fqtss(adapter, true);
-	}
+	igb_offload_apply(adapter, qopt->queue);
 
 	return 0;
 }
-- 
2.17.1

^ permalink raw reply related

* [PATCH v1 net-next 11/14] igb: Add support for ETF offload
From: Jesus Sanchez-Palencia @ 2018-06-27 21:59 UTC (permalink / raw)
  To: netdev
  Cc: tglx, jan.altenberg, vinicius.gomes, kurt.kanzenbach, henrik,
	richardcochran, levi.pearson, ilias.apalodimas, ivan.khoronzhuk,
	mlichvar, willemb, jhs, xiyou.wangcong, jiri,
	Jesus Sanchez-Palencia
In-Reply-To: <20180627215950.6719-1-jesus.sanchez-palencia@intel.com>

Implement HW offload support for SO_TXTIME through igb's Launchtime
feature. This is done by extending igb_setup_tc() so it supports
TC_SETUP_QDISC_ETF and configuring i210 so time based transmit
arbitration is enabled.

The FQTSS transmission mode added before is extended so strict
priority (SP) queues wait for stream reservation (SR) ones.
igb_config_tx_modes() is extended so it can support enabling/disabling
Launchtime following the previous approach used for the credit-based
shaper (CBS).

As the previous flow, FQTSS transmission mode is enabled automatically
by the driver once Launchtime (or CBS, as before) is enabled.
Similarly, it's automatically disabled when the feature is disabled
for the last queue that had it setup on.

The driver just consumes the transmit times from the skbuffs directly,
so no special handling is done in case an 'invalid' time is provided.
We assume this has been handled by the ETF qdisc already.

Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
---
 .../net/ethernet/intel/igb/e1000_defines.h    |  16 ++
 drivers/net/ethernet/intel/igb/igb.h          |   1 +
 drivers/net/ethernet/intel/igb/igb_main.c     | 139 +++++++++++++++---
 3 files changed, 139 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/e1000_defines.h b/drivers/net/ethernet/intel/igb/e1000_defines.h
index 252440a418dc..8a28f3388f69 100644
--- a/drivers/net/ethernet/intel/igb/e1000_defines.h
+++ b/drivers/net/ethernet/intel/igb/e1000_defines.h
@@ -1048,6 +1048,22 @@
 #define E1000_TQAVCTRL_XMIT_MODE	BIT(0)
 #define E1000_TQAVCTRL_DATAFETCHARB	BIT(4)
 #define E1000_TQAVCTRL_DATATRANARB	BIT(8)
+#define E1000_TQAVCTRL_DATATRANTIM	BIT(9)
+#define E1000_TQAVCTRL_SP_WAIT_SR	BIT(10)
+/* Fetch Time Delta - bits 31:16
+ *
+ * This field holds the value to be reduced from the launch time for
+ * fetch time decision. The FetchTimeDelta value is defined in 32 ns
+ * granularity.
+ *
+ * This field is 16 bits wide, and so the maximum value is:
+ *
+ * 65535 * 32 = 2097120 ~= 2.1 msec
+ *
+ * XXX: We are configuring the max value here since we couldn't come up
+ * with a reason for not doing so.
+ */
+#define E1000_TQAVCTRL_FETCHTIME_DELTA	(0xFFFF << 16)
 
 /* TX Qav Credit Control fields */
 #define E1000_TQAVCC_IDLESLOPE_MASK	0xFFFF
diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index 9643b5b3d444..ca54e268d157 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -262,6 +262,7 @@ struct igb_ring {
 	u16 count;			/* number of desc. in the ring */
 	u8 queue_index;			/* logical index of the ring*/
 	u8 reg_idx;			/* physical index of the ring */
+	bool launchtime_enable;		/* true if LaunchTime is enabled */
 	bool cbs_enable;		/* indicates if CBS is enabled */
 	s32 idleslope;			/* idleSlope in kbps */
 	s32 sendslope;			/* sendSlope in kbps */
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index c30ab7b260cc..9b9a6a6227e0 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -1666,13 +1666,26 @@ static bool is_any_cbs_enabled(struct igb_adapter *adapter)
 	return false;
 }
 
+static bool is_any_txtime_enabled(struct igb_adapter *adapter)
+{
+	int i;
+
+	for (i = 0; i < adapter->num_tx_queues; i++) {
+		if (adapter->tx_ring[i]->launchtime_enable)
+			return true;
+	}
+
+	return false;
+}
+
 /**
  *  igb_config_tx_modes - Configure "Qav Tx mode" features on igb
  *  @adapter: pointer to adapter struct
  *  @queue: queue number
  *
- *  Configure CBS for a given hardware queue. Parameters are retrieved
- *  from the correct Tx ring, so igb_save_cbs_params() should be used
+ *  Configure CBS and Launchtime for a given hardware queue.
+ *  Parameters are retrieved from the correct Tx ring, so
+ *  igb_save_cbs_params() and igb_save_txtime_params() should be used
  *  for setting those correctly prior to this function being called.
  **/
 static void igb_config_tx_modes(struct igb_adapter *adapter, int queue)
@@ -1686,6 +1699,19 @@ static void igb_config_tx_modes(struct igb_adapter *adapter, int queue)
 	WARN_ON(hw->mac.type != e1000_i210);
 	WARN_ON(queue < 0 || queue > 1);
 
+	/* If any of the Qav features is enabled, configure queues as SR and
+	 * with HIGH PRIO. If none is, then configure them with LOW PRIO and
+	 * as SP.
+	 */
+	if (ring->cbs_enable || ring->launchtime_enable) {
+		set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_HIGH);
+		set_queue_mode(hw, queue, QUEUE_MODE_STREAM_RESERVATION);
+	} else {
+		set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_LOW);
+		set_queue_mode(hw, queue, QUEUE_MODE_STRICT_PRIORITY);
+	}
+
+	/* If CBS is enabled, set DataTranARB and config its parameters. */
 	if (ring->cbs_enable || queue == 0) {
 		/* i210 does not allow the queue 0 to be in the Strict
 		 * Priority mode while the Qav mode is enabled, so,
@@ -1702,9 +1728,6 @@ static void igb_config_tx_modes(struct igb_adapter *adapter, int queue)
 			ring->hicredit = ETH_FRAME_LEN;
 		}
 
-		set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_HIGH);
-		set_queue_mode(hw, queue, QUEUE_MODE_STREAM_RESERVATION);
-
 		/* Always set data transfer arbitration to credit-based
 		 * shaper algorithm on TQAVCTRL if CBS is enabled for any of
 		 * the queues.
@@ -1780,8 +1803,6 @@ static void igb_config_tx_modes(struct igb_adapter *adapter, int queue)
 		wr32(E1000_I210_TQAVHC(queue),
 		     0x80000000 + ring->hicredit * 0x7735);
 	} else {
-		set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_LOW);
-		set_queue_mode(hw, queue, QUEUE_MODE_STRICT_PRIORITY);
 
 		/* Set idleSlope to zero. */
 		tqavcc = rd32(E1000_I210_TQAVCC(queue));
@@ -1802,17 +1823,61 @@ static void igb_config_tx_modes(struct igb_adapter *adapter, int queue)
 		}
 	}
 
+	/* If LaunchTime is enabled, set DataTranTIM. */
+	if (ring->launchtime_enable) {
+		/* Always set DataTranTIM on TQAVCTRL if LaunchTime is enabled
+		 * for any of the SR queues, and configure fetchtime delta.
+		 * XXX NOTE:
+		 *     - LaunchTime will be enabled for all SR queues.
+		 *     - A fixed offset can be added relative to the launch
+		 *       time of all packets if configured at reg LAUNCH_OS0.
+		 *       We are keeping it as 0 for now (default value).
+		 */
+		tqavctrl = rd32(E1000_I210_TQAVCTRL);
+		tqavctrl |= E1000_TQAVCTRL_DATATRANTIM |
+		       E1000_TQAVCTRL_FETCHTIME_DELTA;
+		wr32(E1000_I210_TQAVCTRL, tqavctrl);
+	} else {
+		/* If Launchtime is not enabled for any SR queues anymore,
+		 * then clear DataTranTIM on TQAVCTRL and clear fetchtime delta,
+		 * effectively disabling Launchtime.
+		 */
+		if (!is_any_txtime_enabled(adapter)) {
+			tqavctrl = rd32(E1000_I210_TQAVCTRL);
+			tqavctrl &= ~E1000_TQAVCTRL_DATATRANTIM;
+			tqavctrl &= ~E1000_TQAVCTRL_FETCHTIME_DELTA;
+			wr32(E1000_I210_TQAVCTRL, tqavctrl);
+		}
+	}
+
 	/* XXX: In i210 controller the sendSlope and loCredit parameters from
 	 * CBS are not configurable by software so we don't do any 'controller
 	 * configuration' in respect to these parameters.
 	 */
 
-	netdev_dbg(netdev, "CBS %s: queue %d idleslope %d sendslope %d hiCredit %d locredit %d\n",
-		   (ring->cbs_enable) ? "enabled" : "disabled", queue,
+	netdev_dbg(netdev, "Qav Tx mode: cbs %s, launchtime %s, queue %d \
+			    idleslope %d sendslope %d hiCredit %d \
+			    locredit %d\n",
+		   (ring->cbs_enable) ? "enabled" : "disabled",
+		   (ring->launchtime_enable) ? "enabled" : "disabled", queue,
 		   ring->idleslope, ring->sendslope, ring->hicredit,
 		   ring->locredit);
 }
 
+static int igb_save_txtime_params(struct igb_adapter *adapter, int queue,
+				  bool enable)
+{
+	struct igb_ring *ring;
+
+	if (queue < 0 || queue > adapter->num_tx_queues)
+		return -EINVAL;
+
+	ring = adapter->tx_ring[queue];
+	ring->launchtime_enable = enable;
+
+	return 0;
+}
+
 static int igb_save_cbs_params(struct igb_adapter *adapter, int queue,
 			       bool enable, int idleslope, int sendslope,
 			       int hicredit, int locredit)
@@ -1856,10 +1921,11 @@ static void igb_setup_tx_mode(struct igb_adapter *adapter)
 		int i, max_queue;
 
 		/* Configure TQAVCTRL register: set transmit mode to 'Qav',
-		 * set data fetch arbitration to 'round robin'.
+		 * set data fetch arbitration to 'round robin', set SP_WAIT_SR
+		 * so SP queues wait for SR ones.
 		 */
 		val = rd32(E1000_I210_TQAVCTRL);
-		val |= E1000_TQAVCTRL_XMIT_MODE;
+		val |= E1000_TQAVCTRL_XMIT_MODE | E1000_TQAVCTRL_SP_WAIT_SR;
 		val &= ~E1000_TQAVCTRL_DATAFETCHARB;
 		wr32(E1000_I210_TQAVCTRL, val);
 
@@ -2483,7 +2549,7 @@ static void igb_offload_apply(struct igb_adapter *adapter, s32 queue)
 
 	igb_config_tx_modes(adapter, queue);
 
-	if (!is_any_cbs_enabled(adapter))
+	if (!is_any_cbs_enabled(adapter) && !is_any_txtime_enabled(adapter))
 		enable_fqtss(adapter, false);
 }
 
@@ -2756,6 +2822,30 @@ static int igb_setup_tc_block(struct igb_adapter *adapter,
 	}
 }
 
+static int igb_offload_txtime(struct igb_adapter *adapter,
+			      struct tc_etf_qopt_offload *qopt)
+{
+	struct e1000_hw *hw = &adapter->hw;
+	int err;
+
+	/* Launchtime offloading is only supported by i210 controller. */
+	if (hw->mac.type != e1000_i210)
+		return -EOPNOTSUPP;
+
+	/* Launchtime offloading is only supported by queues 0 and 1. */
+	if (qopt->queue < 0 || qopt->queue > 1)
+		return -EINVAL;
+
+	err = igb_save_txtime_params(adapter, qopt->queue, qopt->enable);
+
+	if (err)
+		return err;
+
+	igb_offload_apply(adapter, qopt->queue);
+
+	return 0;
+}
+
 static int igb_setup_tc(struct net_device *dev, enum tc_setup_type type,
 			void *type_data)
 {
@@ -2766,6 +2856,8 @@ static int igb_setup_tc(struct net_device *dev, enum tc_setup_type type,
 		return igb_offload_cbs(adapter, type_data);
 	case TC_SETUP_BLOCK:
 		return igb_setup_tc_block(adapter, type_data);
+	case TC_SETUP_QDISC_ETF:
+		return igb_offload_txtime(adapter, type_data);
 
 	default:
 		return -EOPNOTSUPP;
@@ -5586,11 +5678,14 @@ static void igb_set_itr(struct igb_q_vector *q_vector)
 	}
 }
 
-static void igb_tx_ctxtdesc(struct igb_ring *tx_ring, u32 vlan_macip_lens,
-			    u32 type_tucmd, u32 mss_l4len_idx)
+static void igb_tx_ctxtdesc(struct igb_ring *tx_ring,
+			    struct igb_tx_buffer *first,
+			    u32 vlan_macip_lens, u32 type_tucmd,
+			    u32 mss_l4len_idx)
 {
 	struct e1000_adv_tx_context_desc *context_desc;
 	u16 i = tx_ring->next_to_use;
+	struct timespec64 ts;
 
 	context_desc = IGB_TX_CTXTDESC(tx_ring, i);
 
@@ -5605,9 +5700,18 @@ static void igb_tx_ctxtdesc(struct igb_ring *tx_ring, u32 vlan_macip_lens,
 		mss_l4len_idx |= tx_ring->reg_idx << 4;
 
 	context_desc->vlan_macip_lens	= cpu_to_le32(vlan_macip_lens);
-	context_desc->seqnum_seed	= 0;
 	context_desc->type_tucmd_mlhl	= cpu_to_le32(type_tucmd);
 	context_desc->mss_l4len_idx	= cpu_to_le32(mss_l4len_idx);
+
+	/* We assume there is always a valid tx time available. Invalid times
+	 * should have been handled by the upper layers.
+	 */
+	if (tx_ring->launchtime_enable) {
+		ts = ns_to_timespec64(first->skb->tstamp);
+		context_desc->seqnum_seed = cpu_to_le32(ts.tv_nsec / 32);
+	} else {
+		context_desc->seqnum_seed = 0;
+	}
 }
 
 static int igb_tso(struct igb_ring *tx_ring,
@@ -5690,7 +5794,8 @@ static int igb_tso(struct igb_ring *tx_ring,
 	vlan_macip_lens |= (ip.hdr - skb->data) << E1000_ADVTXD_MACLEN_SHIFT;
 	vlan_macip_lens |= first->tx_flags & IGB_TX_FLAGS_VLAN_MASK;
 
-	igb_tx_ctxtdesc(tx_ring, vlan_macip_lens, type_tucmd, mss_l4len_idx);
+	igb_tx_ctxtdesc(tx_ring, first, vlan_macip_lens,
+			type_tucmd, mss_l4len_idx);
 
 	return 1;
 }
@@ -5745,7 +5850,7 @@ static void igb_tx_csum(struct igb_ring *tx_ring, struct igb_tx_buffer *first)
 	vlan_macip_lens |= skb_network_offset(skb) << E1000_ADVTXD_MACLEN_SHIFT;
 	vlan_macip_lens |= first->tx_flags & IGB_TX_FLAGS_VLAN_MASK;
 
-	igb_tx_ctxtdesc(tx_ring, vlan_macip_lens, type_tucmd, 0);
+	igb_tx_ctxtdesc(tx_ring, first, vlan_macip_lens, type_tucmd, 0);
 }
 
 #define IGB_SET_FLAG(_input, _flag, _result) \
-- 
2.17.1

^ permalink raw reply related

* [PATCH v1 net-next 12/14] igb: Only call skb_tx_timestamp after descriptors are ready
From: Jesus Sanchez-Palencia @ 2018-06-27 21:59 UTC (permalink / raw)
  To: netdev
  Cc: tglx, jan.altenberg, vinicius.gomes, kurt.kanzenbach, henrik,
	richardcochran, levi.pearson, ilias.apalodimas, ivan.khoronzhuk,
	mlichvar, willemb, jhs, xiyou.wangcong, jiri,
	Jesus Sanchez-Palencia
In-Reply-To: <20180627215950.6719-1-jesus.sanchez-palencia@intel.com>

Currently, skb_tx_timestamp() is being called before the DMA
descriptors are prepared in igb_xmit_frame_ring(), which happens
during either the igb_tso() or igb_tx_csum() calls.

Given that now the skb->tstamp might be used to carry the timestamp
for SO_TXTIME, we must only call skb_tx_timestamp() after the
information has been copied into the DMA tx_ring.

Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 9b9a6a6227e0..0d72f2417143 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -6138,8 +6138,6 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
 		}
 	}
 
-	skb_tx_timestamp(skb);
-
 	if (skb_vlan_tag_present(skb)) {
 		tx_flags |= IGB_TX_FLAGS_VLAN;
 		tx_flags |= (skb_vlan_tag_get(skb) << IGB_TX_FLAGS_VLAN_SHIFT);
@@ -6155,6 +6153,8 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
 	else if (!tso)
 		igb_tx_csum(tx_ring, first);
 
+	skb_tx_timestamp(skb);
+
 	if (igb_tx_map(tx_ring, first, hdr_len))
 		goto cleanup_tx_tstamp;
 
-- 
2.17.1

^ permalink raw reply related

* [PATCH v1 net-next 13/14] net/sched: Enforce usage of CLOCK_TAI for sch_etf
From: Jesus Sanchez-Palencia @ 2018-06-27 21:59 UTC (permalink / raw)
  To: netdev
  Cc: tglx, jan.altenberg, vinicius.gomes, kurt.kanzenbach, henrik,
	richardcochran, levi.pearson, ilias.apalodimas, ivan.khoronzhuk,
	mlichvar, willemb, jhs, xiyou.wangcong, jiri,
	Jesus Sanchez-Palencia
In-Reply-To: <20180627215950.6719-1-jesus.sanchez-palencia@intel.com>

The qdisc and the SO_TXTIME ABIs allow for a clockid to be configured,
but it's been decided that usage of CLOCK_TAI should be enforced until
we decide to allow for other clockids to be used. The rationale here is
that PTP times are usually in the TAI scale, thus no other clocks should
be necessary.

For now, the qdisc will return EINVAL if any clocks other than
CLOCK_TAI are used.

Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
---
 net/sched/sch_etf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
index cd6cb5b69228..5514a8aa3bd5 100644
--- a/net/sched/sch_etf.c
+++ b/net/sched/sch_etf.c
@@ -56,8 +56,8 @@ static inline int validate_input_params(struct tc_etf_qopt *qopt,
 		return -ENOTSUPP;
 	}
 
-	if (qopt->clockid >= MAX_CLOCKS) {
-		NL_SET_ERR_MSG(extack, "Invalid clockid");
+	if (qopt->clockid != CLOCK_TAI) {
+		NL_SET_ERR_MSG(extack, "Invalid clockid. CLOCK_TAI must be used");
 		return -EINVAL;
 	}
 
-- 
2.17.1

^ permalink raw reply related

* [PATCH v1 net-next 09/14] igb: Only change Tx arbitration when CBS is on
From: Jesus Sanchez-Palencia @ 2018-06-27 21:59 UTC (permalink / raw)
  To: netdev
  Cc: tglx, jan.altenberg, vinicius.gomes, kurt.kanzenbach, henrik,
	richardcochran, levi.pearson, ilias.apalodimas, ivan.khoronzhuk,
	mlichvar, willemb, jhs, xiyou.wangcong, jiri,
	Jesus Sanchez-Palencia
In-Reply-To: <20180627215950.6719-1-jesus.sanchez-palencia@intel.com>

Currently the data transmission arbitration algorithm - DataTranARB
field on TQAVCTRL reg - is always set to CBS when the Tx mode is
changed from legacy to 'Qav' mode.

Make that configuration a bit more granular in preparation for the
upcoming Launchtime enabling patches, since CBS and Launchtime can be
enabled separately. That is achieved by moving the DataTranARB setup
to igb_config_tx_modes() instead.

Similarly, when disabling CBS we must check if it has been disabled
for all queues, and clear the DataTranARB accordingly.

Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 49 +++++++++++++++--------
 1 file changed, 33 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 15f6b9c57ccf..8c90f1e51add 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -1654,6 +1654,18 @@ static void set_queue_mode(struct e1000_hw *hw, int queue, enum queue_mode mode)
 	wr32(E1000_I210_TQAVCC(queue), val);
 }
 
+static bool is_any_cbs_enabled(struct igb_adapter *adapter)
+{
+	int i;
+
+	for (i = 0; i < adapter->num_tx_queues; i++) {
+		if (adapter->tx_ring[i]->cbs_enable)
+			return true;
+	}
+
+	return false;
+}
+
 /**
  *  igb_config_tx_modes - Configure "Qav Tx mode" features on igb
  *  @adapter: pointer to adapter struct
@@ -1668,7 +1680,7 @@ static void igb_config_tx_modes(struct igb_adapter *adapter, int queue)
 	struct igb_ring *ring = adapter->tx_ring[queue];
 	struct net_device *netdev = adapter->netdev;
 	struct e1000_hw *hw = &adapter->hw;
-	u32 tqavcc;
+	u32 tqavcc, tqavctrl;
 	u16 value;
 
 	WARN_ON(hw->mac.type != e1000_i210);
@@ -1693,6 +1705,14 @@ static void igb_config_tx_modes(struct igb_adapter *adapter, int queue)
 		set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_HIGH);
 		set_queue_mode(hw, queue, QUEUE_MODE_STREAM_RESERVATION);
 
+		/* Always set data transfer arbitration to credit-based
+		 * shaper algorithm on TQAVCTRL if CBS is enabled for any of
+		 * the queues.
+		 */
+		tqavctrl = rd32(E1000_I210_TQAVCTRL);
+		tqavctrl |= E1000_TQAVCTRL_DATATRANARB;
+		wr32(E1000_I210_TQAVCTRL, tqavctrl);
+
 		/* According to i210 datasheet section 7.2.7.7, we should set
 		 * the 'idleSlope' field from TQAVCC register following the
 		 * equation:
@@ -1770,6 +1790,16 @@ static void igb_config_tx_modes(struct igb_adapter *adapter, int queue)
 
 		/* Set hiCredit to zero. */
 		wr32(E1000_I210_TQAVHC(queue), 0);
+
+		/* If CBS is not enabled for any queues anymore, then return to
+		 * the default state of Data Transmission Arbitration on
+		 * TQAVCTRL.
+		 */
+		if (!is_any_cbs_enabled(adapter)) {
+			tqavctrl = rd32(E1000_I210_TQAVCTRL);
+			tqavctrl &= ~E1000_TQAVCTRL_DATATRANARB;
+			wr32(E1000_I210_TQAVCTRL, tqavctrl);
+		}
 	}
 
 	/* XXX: In i210 controller the sendSlope and loCredit parameters from
@@ -1803,18 +1833,6 @@ static int igb_save_cbs_params(struct igb_adapter *adapter, int queue,
 	return 0;
 }
 
-static bool is_any_cbs_enabled(struct igb_adapter *adapter)
-{
-	int i;
-
-	for (i = 0; i < adapter->num_tx_queues; i++) {
-		if (adapter->tx_ring[i]->cbs_enable)
-			return true;
-	}
-
-	return false;
-}
-
 /**
  *  igb_setup_tx_mode - Switch to/from Qav Tx mode when applicable
  *  @adapter: pointer to adapter struct
@@ -1838,11 +1856,10 @@ static void igb_setup_tx_mode(struct igb_adapter *adapter)
 		int i, max_queue;
 
 		/* Configure TQAVCTRL register: set transmit mode to 'Qav',
-		 * set data fetch arbitration to 'round robin' and set data
-		 * transfer arbitration to 'credit shaper algorithm.
+		 * set data fetch arbitration to 'round robin'.
 		 */
 		val = rd32(E1000_I210_TQAVCTRL);
-		val |= E1000_TQAVCTRL_XMIT_MODE | E1000_TQAVCTRL_DATATRANARB;
+		val |= E1000_TQAVCTRL_XMIT_MODE;
 		val &= ~E1000_TQAVCTRL_DATAFETCHARB;
 		wr32(E1000_I210_TQAVCTRL, val);
 
-- 
2.17.1

^ permalink raw reply related

* [PATCH v1 net-next 14/14] net/sched: Make etf report drops on error_queue
From: Jesus Sanchez-Palencia @ 2018-06-27 21:59 UTC (permalink / raw)
  To: netdev
  Cc: tglx, jan.altenberg, vinicius.gomes, kurt.kanzenbach, henrik,
	richardcochran, levi.pearson, ilias.apalodimas, ivan.khoronzhuk,
	mlichvar, willemb, jhs, xiyou.wangcong, jiri,
	Jesus Sanchez-Palencia
In-Reply-To: <20180627215950.6719-1-jesus.sanchez-palencia@intel.com>

Use the socket error queue for reporting dropped packets if the
socket has enabled that feature through the SO_TXTIME API.

Packets are dropped either on enqueue() if they aren't accepted by the
qdisc or on dequeue() if the system misses their deadline. Those are
reported as different errors so applications can react accordingly.

Userspace can retrieve the errors through the socket error queue and the
corresponding cmsg interfaces. A struct sock_extended_err* is used for
returning the error data, and the packet's timestamp can be retrieved by
adding both ee_data and ee_info fields as e.g.:

    ((__u64) serr->ee_data << 32) + serr->ee_info

This feature is disabled by default and must be explicitly enabled by
applications. Enabling it can bring some overhead for the Tx cycles
of the application.

Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
---
 include/linux/socket.h        |  4 +++-
 include/net/sock.h            |  1 +
 include/uapi/linux/errqueue.h |  2 ++
 net/sched/sch_etf.c           | 37 +++++++++++++++++++++++++++++++++--
 4 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/include/linux/socket.h b/include/linux/socket.h
index ca476b7a8ff0..75e11d29b32a 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -85,7 +85,9 @@ struct cmsghdr {
 
 struct sock_txtime {
 	clockid_t	clockid;	/* reference clockid */
-	u16		flags;		/* bit 0: txtime in deadline_mode */
+	u16		flags;		/* bit 0: txtime in deadline_mode
+					 * bit 1: report drops on sk err queue
+					 */
 };
 
 /*
diff --git a/include/net/sock.h b/include/net/sock.h
index 73f4404e49e4..e681a45cfe7e 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -473,6 +473,7 @@ struct sock {
 	u16			sk_clockid;
 	u16			sk_txtime_flags;
 #define SK_TXTIME_DEADLINE_MASK	BIT(0)
+#define SK_TXTIME_RECV_ERR_MASK	BIT(1)
 
 	struct socket		*sk_socket;
 	void			*sk_user_data;
diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h
index dc64cfaf13da..66fd5e443c94 100644
--- a/include/uapi/linux/errqueue.h
+++ b/include/uapi/linux/errqueue.h
@@ -25,6 +25,8 @@ struct sock_extended_err {
 #define SO_EE_OFFENDER(ee)	((struct sockaddr*)((ee)+1))
 
 #define SO_EE_CODE_ZEROCOPY_COPIED	1
+#define SO_EE_CODE_TXTIME_INVALID_PARAM	2
+#define SO_EE_CODE_TXTIME_MISSED	3
 
 /**
  *	struct scm_timestamping - timestamps exposed through cmsg
diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
index 5514a8aa3bd5..166f4b72875b 100644
--- a/net/sched/sch_etf.c
+++ b/net/sched/sch_etf.c
@@ -11,6 +11,7 @@
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/errno.h>
+#include <linux/errqueue.h>
 #include <linux/rbtree.h>
 #include <linux/skbuff.h>
 #include <linux/posix-timers.h>
@@ -124,6 +125,35 @@ static void reset_watchdog(struct Qdisc *sch)
 	qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next));
 }
 
+static void report_sock_error(struct sk_buff *skb, u32 err, u8 code)
+{
+	struct sock_exterr_skb *serr;
+	ktime_t txtime = skb->tstamp;
+
+	if (!skb->sk || !(skb->sk->sk_txtime_flags & SK_TXTIME_RECV_ERR_MASK))
+		return;
+
+	skb = skb_clone_sk(skb);
+	if (!skb)
+		return;
+
+	sock_hold(skb->sk);
+
+	serr = SKB_EXT_ERR(skb);
+	serr->ee.ee_errno = err;
+	serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
+	serr->ee.ee_type = 0;
+	serr->ee.ee_code = code;
+	serr->ee.ee_pad = 0;
+	serr->ee.ee_data = (txtime >> 32); /* high part of tstamp */
+	serr->ee.ee_info = txtime; /* low part of tstamp */
+
+	if (sock_queue_err_skb(skb->sk, skb))
+		kfree_skb(skb);
+
+	sock_put(skb->sk);
+}
+
 static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
 				      struct sk_buff **to_free)
 {
@@ -131,8 +161,10 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
 	struct rb_node **p = &q->head.rb_node, *parent = NULL;
 	ktime_t txtime = nskb->tstamp;
 
-	if (!is_packet_valid(sch, nskb))
+	if (!is_packet_valid(sch, nskb)) {
+		report_sock_error(nskb, EINVAL, SO_EE_CODE_TXTIME_INVALID_PARAM);
 		return qdisc_drop(nskb, sch, to_free);
+	}
 
 	while (*p) {
 		struct sk_buff *skb;
@@ -175,6 +207,8 @@ static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb,
 	if (drop) {
 		struct sk_buff *to_free = NULL;
 
+		report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED);
+
 		qdisc_drop(skb, sch, &to_free);
 		kfree_skb_list(to_free);
 		qdisc_qstats_overlimit(sch);
@@ -200,7 +234,6 @@ static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch)
 	now = q->get_time();
 
 	/* Drop if packet has expired while in queue. */
-	/* FIXME: Must return error on the socket's error queue */
 	if (ktime_before(skb->tstamp, now)) {
 		timesortedlist_erase(sch, skb, true);
 		skb = NULL;
-- 
2.17.1

^ permalink raw reply related

* [PATCH v1 net-next 08/14] igb: Refactor igb_configure_cbs()
From: Jesus Sanchez-Palencia @ 2018-06-27 21:59 UTC (permalink / raw)
  To: netdev
  Cc: tglx, jan.altenberg, vinicius.gomes, kurt.kanzenbach, henrik,
	richardcochran, levi.pearson, ilias.apalodimas, ivan.khoronzhuk,
	mlichvar, willemb, jhs, xiyou.wangcong, jiri,
	Jesus Sanchez-Palencia
In-Reply-To: <20180627215950.6719-1-jesus.sanchez-palencia@intel.com>

Make this function retrieve what it needs from the Tx ring being
addressed since it already relies on what had been saved on it before.
Also, since this function will be used by the upcoming Launchtime
patches rename it to better reflect its intention. Note that
Launchtime is not part of what 802.1Qav specifies, but the i210
datasheet refers to this set of functionality as "Qav Transmission
Mode".

Here we also perform a tiny refactor at is_any_cbs_enabled(), and add
further documentation to igb_setup_tx_mode().

Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 60 +++++++++++------------
 1 file changed, 28 insertions(+), 32 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index f1e3397bd405..15f6b9c57ccf 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -1655,23 +1655,17 @@ static void set_queue_mode(struct e1000_hw *hw, int queue, enum queue_mode mode)
 }
 
 /**
- *  igb_configure_cbs - Configure Credit-Based Shaper (CBS)
+ *  igb_config_tx_modes - Configure "Qav Tx mode" features on igb
  *  @adapter: pointer to adapter struct
  *  @queue: queue number
- *  @enable: true = enable CBS, false = disable CBS
- *  @idleslope: idleSlope in kbps
- *  @sendslope: sendSlope in kbps
- *  @hicredit: hiCredit in bytes
- *  @locredit: loCredit in bytes
  *
- *  Configure CBS for a given hardware queue. When disabling, idleslope,
- *  sendslope, hicredit, locredit arguments are ignored. Returns 0 if
- *  success. Negative otherwise.
+ *  Configure CBS for a given hardware queue. Parameters are retrieved
+ *  from the correct Tx ring, so igb_save_cbs_params() should be used
+ *  for setting those correctly prior to this function being called.
  **/
-static void igb_configure_cbs(struct igb_adapter *adapter, int queue,
-			      bool enable, int idleslope, int sendslope,
-			      int hicredit, int locredit)
+static void igb_config_tx_modes(struct igb_adapter *adapter, int queue)
 {
+	struct igb_ring *ring = adapter->tx_ring[queue];
 	struct net_device *netdev = adapter->netdev;
 	struct e1000_hw *hw = &adapter->hw;
 	u32 tqavcc;
@@ -1680,7 +1674,7 @@ static void igb_configure_cbs(struct igb_adapter *adapter, int queue,
 	WARN_ON(hw->mac.type != e1000_i210);
 	WARN_ON(queue < 0 || queue > 1);
 
-	if (enable || queue == 0) {
+	if (ring->cbs_enable || queue == 0) {
 		/* i210 does not allow the queue 0 to be in the Strict
 		 * Priority mode while the Qav mode is enabled, so,
 		 * instead of disabling strict priority mode, we give
@@ -1690,10 +1684,10 @@ static void igb_configure_cbs(struct igb_adapter *adapter, int queue,
 		 * Queue0 QueueMode must be set to 1b when
 		 * TransmitMode is set to Qav."
 		 */
-		if (queue == 0 && !enable) {
+		if (queue == 0 && !ring->cbs_enable) {
 			/* max "linkspeed" idleslope in kbps */
-			idleslope = 1000000;
-			hicredit = ETH_FRAME_LEN;
+			ring->idleslope = 1000000;
+			ring->hicredit = ETH_FRAME_LEN;
 		}
 
 		set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_HIGH);
@@ -1756,14 +1750,15 @@ static void igb_configure_cbs(struct igb_adapter *adapter, int queue,
 		 *       calculated value, so the resulting bandwidth might
 		 *       be slightly higher for some configurations.
 		 */
-		value = DIV_ROUND_UP_ULL(idleslope * 61034ULL, 1000000);
+		value = DIV_ROUND_UP_ULL(ring->idleslope * 61034ULL, 1000000);
 
 		tqavcc = rd32(E1000_I210_TQAVCC(queue));
 		tqavcc &= ~E1000_TQAVCC_IDLESLOPE_MASK;
 		tqavcc |= value;
 		wr32(E1000_I210_TQAVCC(queue), tqavcc);
 
-		wr32(E1000_I210_TQAVHC(queue), 0x80000000 + hicredit * 0x7735);
+		wr32(E1000_I210_TQAVHC(queue),
+		     0x80000000 + ring->hicredit * 0x7735);
 	} else {
 		set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_LOW);
 		set_queue_mode(hw, queue, QUEUE_MODE_STRICT_PRIORITY);
@@ -1783,8 +1778,9 @@ static void igb_configure_cbs(struct igb_adapter *adapter, int queue,
 	 */
 
 	netdev_dbg(netdev, "CBS %s: queue %d idleslope %d sendslope %d hiCredit %d locredit %d\n",
-		   (enable) ? "enabled" : "disabled", queue,
-		   idleslope, sendslope, hicredit, locredit);
+		   (ring->cbs_enable) ? "enabled" : "disabled", queue,
+		   ring->idleslope, ring->sendslope, ring->hicredit,
+		   ring->locredit);
 }
 
 static int igb_save_cbs_params(struct igb_adapter *adapter, int queue,
@@ -1809,19 +1805,25 @@ static int igb_save_cbs_params(struct igb_adapter *adapter, int queue,
 
 static bool is_any_cbs_enabled(struct igb_adapter *adapter)
 {
-	struct igb_ring *ring;
 	int i;
 
 	for (i = 0; i < adapter->num_tx_queues; i++) {
-		ring = adapter->tx_ring[i];
-
-		if (ring->cbs_enable)
+		if (adapter->tx_ring[i]->cbs_enable)
 			return true;
 	}
 
 	return false;
 }
 
+/**
+ *  igb_setup_tx_mode - Switch to/from Qav Tx mode when applicable
+ *  @adapter: pointer to adapter struct
+ *
+ *  Configure TQAVCTRL register switching the controller's Tx mode
+ *  if FQTSS mode is enabled or disabled. Additionally, will issue
+ *  a call to igb_config_tx_modes() per queue so any previously saved
+ *  Tx parameters are applied.
+ **/
 static void igb_setup_tx_mode(struct igb_adapter *adapter)
 {
 	struct net_device *netdev = adapter->netdev;
@@ -1881,11 +1883,7 @@ static void igb_setup_tx_mode(struct igb_adapter *adapter)
 			    adapter->num_tx_queues : I210_SR_QUEUES_NUM;
 
 		for (i = 0; i < max_queue; i++) {
-			struct igb_ring *ring = adapter->tx_ring[i];
-
-			igb_configure_cbs(adapter, i, ring->cbs_enable,
-					  ring->idleslope, ring->sendslope,
-					  ring->hicredit, ring->locredit);
+			igb_config_tx_modes(adapter, i);
 		}
 	} else {
 		wr32(E1000_RXPBS, I210_RXPBSIZE_DEFAULT);
@@ -2480,9 +2478,7 @@ static int igb_offload_cbs(struct igb_adapter *adapter,
 		return err;
 
 	if (is_fqtss_enabled(adapter)) {
-		igb_configure_cbs(adapter, qopt->queue, qopt->enable,
-				  qopt->idleslope, qopt->sendslope,
-				  qopt->hicredit, qopt->locredit);
+		igb_config_tx_modes(adapter, qopt->queue);
 
 		if (!is_any_cbs_enabled(adapter))
 			enable_fqtss(adapter, false);
-- 
2.17.1

^ permalink raw reply related

* [PATCH v1 net-next 06/14] net/sched: Introduce the ETF Qdisc
From: Jesus Sanchez-Palencia @ 2018-06-27 21:59 UTC (permalink / raw)
  To: netdev
  Cc: tglx, jan.altenberg, vinicius.gomes, kurt.kanzenbach, henrik,
	richardcochran, levi.pearson, ilias.apalodimas, ivan.khoronzhuk,
	mlichvar, willemb, jhs, xiyou.wangcong, jiri,
	Jesus Sanchez-Palencia
In-Reply-To: <20180627215950.6719-1-jesus.sanchez-palencia@intel.com>

From: Vinicius Costa Gomes <vinicius.gomes@intel.com>

The ETF (Earliest TxTime First) qdisc uses the information added
earlier in this series (the socket option SO_TXTIME and the new
role of sk_buff->tstamp) to schedule packets transmission based
on absolute time.

For some workloads, just bandwidth enforcement is not enough, and
precise control of the transmission of packets is necessary.

Example:

$ tc qdisc replace dev enp2s0 parent root handle 100 mqprio num_tc 3 \
           map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 queues 1@0 1@1 2@2 hw 0

$ tc qdisc add dev enp2s0 parent 100:1 etf delta 100000 \
           clockid CLOCK_REALTIME

In this example, the Qdisc will provide SW best-effort for the control
of the transmission time to the network adapter, the time stamp in the
socket will be in reference to the clockid CLOCK_REALTIME and packets
will leave the qdisc "delta" (100000) nanoseconds before its transmission
time.

The ETF qdisc will buffer packets sorted by their txtime. It will drop
packets on enqueue() if their skbuff clockid does not match the clock
reference of the Qdisc. Moreover, on dequeue(), a packet will be dropped
if it expires while being enqueued.

The qdisc also supports the SO_TXTIME deadline mode. For this mode, it
will dequeue a packet as soon as possible and change the skb timestamp
to 'now' during etf_dequeue().

Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
---
 include/linux/netdevice.h      |   1 +
 include/uapi/linux/pkt_sched.h |  17 ++
 net/sched/Kconfig              |  11 +
 net/sched/Makefile             |   1 +
 net/sched/sch_etf.c            | 385 +++++++++++++++++++++++++++++++++
 5 files changed, 415 insertions(+)
 create mode 100644 net/sched/sch_etf.c

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c6b377a15869..7f650bdc6ec3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -793,6 +793,7 @@ enum tc_setup_type {
 	TC_SETUP_QDISC_RED,
 	TC_SETUP_QDISC_PRIO,
 	TC_SETUP_QDISC_MQ,
+	TC_SETUP_QDISC_ETF,
 };
 
 /* These structures hold the attributes of bpf state that are being passed
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 37b5096ae97b..9d6fd2004a03 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -934,4 +934,21 @@ enum {
 
 #define TCA_CBS_MAX (__TCA_CBS_MAX - 1)
 
+
+/* ETF */
+struct tc_etf_qopt {
+	__s32 delta;
+	__s32 clockid;
+	__u32 flags;
+#define TC_ETF_DEADLINE_MODE_ON	BIT(0)
+};
+
+enum {
+	TCA_ETF_UNSPEC,
+	TCA_ETF_PARMS,
+	__TCA_ETF_MAX,
+};
+
+#define TCA_ETF_MAX (__TCA_ETF_MAX - 1)
+
 #endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index a01169fb5325..fcc89706745b 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -183,6 +183,17 @@ config NET_SCH_CBS
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_cbs.
 
+config NET_SCH_ETF
+	tristate "Earliest TxTime First (ETF)"
+	help
+	  Say Y here if you want to use the Earliest TxTime First (ETF) packet
+	  scheduling algorithm.
+
+	  See the top of <file:net/sched/sch_etf.c> for more details.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_etf.
+
 config NET_SCH_GRED
 	tristate "Generic Random Early Detection (GRED)"
 	---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8811d3804878..9a5a7077d217 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_NET_SCH_FQ)	+= sch_fq.o
 obj-$(CONFIG_NET_SCH_HHF)	+= sch_hhf.o
 obj-$(CONFIG_NET_SCH_PIE)	+= sch_pie.o
 obj-$(CONFIG_NET_SCH_CBS)	+= sch_cbs.o
+obj-$(CONFIG_NET_SCH_ETF)	+= sch_etf.o
 
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
new file mode 100644
index 000000000000..5f01a285f399
--- /dev/null
+++ b/net/sched/sch_etf.c
@@ -0,0 +1,385 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* net/sched/sch_etf.c  Earliest TxTime First queueing discipline.
+ *
+ * Authors:	Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
+ *		Vinicius Costa Gomes <vinicius.gomes@intel.com>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/rbtree.h>
+#include <linux/skbuff.h>
+#include <linux/posix-timers.h>
+#include <net/netlink.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+#include <net/sock.h>
+
+#define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON)
+
+struct etf_sched_data {
+	bool deadline_mode;
+	int clockid;
+	int queue;
+	s32 delta; /* in ns */
+	ktime_t last; /* The txtime of the last skb sent to the netdevice. */
+	struct rb_root head;
+	struct qdisc_watchdog watchdog;
+	ktime_t (*get_time)(void);
+};
+
+static const struct nla_policy etf_policy[TCA_ETF_MAX + 1] = {
+	[TCA_ETF_PARMS]	= { .len = sizeof(struct tc_etf_qopt) },
+};
+
+static inline int validate_input_params(struct tc_etf_qopt *qopt,
+					struct netlink_ext_ack *extack)
+{
+	/* Check if params comply to the following rules:
+	 *	* Clockid and delta must be valid.
+	 *
+	 *	* Dynamic clockids are not supported.
+	 *
+	 *	* Delta must be a positive integer.
+	 */
+	if (qopt->clockid < 0) {
+		NL_SET_ERR_MSG(extack, "Dynamic clockids are not supported");
+		return -ENOTSUPP;
+	}
+
+	if (qopt->clockid >= MAX_CLOCKS) {
+		NL_SET_ERR_MSG(extack, "Invalid clockid");
+		return -EINVAL;
+	}
+
+	if (qopt->delta < 0) {
+		NL_SET_ERR_MSG(extack, "Delta must be positive");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	ktime_t txtime = nskb->tstamp;
+	struct sock *sk = nskb->sk;
+	ktime_t now;
+
+	if (!sk)
+		return false;
+
+	if (!sock_flag(sk, SOCK_TXTIME))
+		return false;
+
+	/* We don't perform crosstimestamping.
+	 * Drop if packet's clockid differs from qdisc's.
+	 */
+	if (sk->sk_clockid != q->clockid)
+		return false;
+
+	if ((sk->sk_txtime_flags & SK_TXTIME_DEADLINE_MASK) !=
+	    q->deadline_mode)
+		return false;
+
+	now = q->get_time();
+	if (ktime_before(txtime, now) || ktime_before(txtime, q->last))
+		return false;
+
+	return true;
+}
+
+static struct sk_buff *etf_peek_timesortedlist(struct Qdisc *sch)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	struct rb_node *p;
+
+	p = rb_first(&q->head);
+	if (!p)
+		return NULL;
+
+	return rb_to_skb(p);
+}
+
+static void reset_watchdog(struct Qdisc *sch)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb = etf_peek_timesortedlist(sch);
+	ktime_t next;
+
+	if (!skb)
+		return;
+
+	next = ktime_sub_ns(skb->tstamp, q->delta);
+	qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next));
+}
+
+static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
+				      struct sk_buff **to_free)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	struct rb_node **p = &q->head.rb_node, *parent = NULL;
+	ktime_t txtime = nskb->tstamp;
+
+	if (!is_packet_valid(sch, nskb))
+		return qdisc_drop(nskb, sch, to_free);
+
+	while (*p) {
+		struct sk_buff *skb;
+
+		parent = *p;
+		skb = rb_to_skb(parent);
+		if (ktime_after(txtime, skb->tstamp))
+			p = &parent->rb_right;
+		else
+			p = &parent->rb_left;
+	}
+	rb_link_node(&nskb->rbnode, parent, p);
+	rb_insert_color(&nskb->rbnode, &q->head);
+
+	qdisc_qstats_backlog_inc(sch, nskb);
+	sch->q.qlen++;
+
+	/* Now we may need to re-arm the qdisc watchdog for the next packet. */
+	reset_watchdog(sch);
+
+	return NET_XMIT_SUCCESS;
+}
+
+static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb,
+				 bool drop)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+
+	rb_erase(&skb->rbnode, &q->head);
+
+	/* The rbnode field in the skb re-uses these fields, now that
+	 * we are done with the rbnode, reset them.
+	 */
+	skb->next = NULL;
+	skb->prev = NULL;
+	skb->dev = qdisc_dev(sch);
+
+	qdisc_qstats_backlog_dec(sch, skb);
+
+	if (drop) {
+		struct sk_buff *to_free = NULL;
+
+		qdisc_drop(skb, sch, &to_free);
+		kfree_skb_list(to_free);
+		qdisc_qstats_overlimit(sch);
+	} else {
+		qdisc_bstats_update(sch, skb);
+
+		q->last = skb->tstamp;
+	}
+
+	sch->q.qlen--;
+}
+
+static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb;
+	ktime_t now, next;
+
+	skb = etf_peek_timesortedlist(sch);
+	if (!skb)
+		return NULL;
+
+	now = q->get_time();
+
+	/* Drop if packet has expired while in queue. */
+	/* FIXME: Must return error on the socket's error queue */
+	if (ktime_before(skb->tstamp, now)) {
+		timesortedlist_erase(sch, skb, true);
+		skb = NULL;
+		goto out;
+	}
+
+	/* When in deadline mode, dequeue as soon as possible and change the
+	 * txtime from deadline to (now + delta).
+	 */
+	if (q->deadline_mode) {
+		timesortedlist_erase(sch, skb, false);
+		skb->tstamp = now;
+		goto out;
+	}
+
+	next = ktime_sub_ns(skb->tstamp, q->delta);
+
+	/* Dequeue only if now is within the [txtime - delta, txtime] range. */
+	if (ktime_after(now, next))
+		timesortedlist_erase(sch, skb, false);
+	else
+		skb = NULL;
+
+out:
+	/* Now we may need to re-arm the qdisc watchdog for the next packet. */
+	reset_watchdog(sch);
+
+	return skb;
+}
+
+static int etf_init(struct Qdisc *sch, struct nlattr *opt,
+		    struct netlink_ext_ack *extack)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	struct nlattr *tb[TCA_ETF_MAX + 1];
+	struct tc_etf_qopt *qopt;
+	int err;
+
+	if (!opt) {
+		NL_SET_ERR_MSG(extack,
+			       "Missing ETF qdisc options which are mandatory");
+		return -EINVAL;
+	}
+
+	err = nla_parse_nested(tb, TCA_ETF_MAX, opt, etf_policy, extack);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_ETF_PARMS]) {
+		NL_SET_ERR_MSG(extack, "Missing mandatory ETF parameters");
+		return -EINVAL;
+	}
+
+	qopt = nla_data(tb[TCA_ETF_PARMS]);
+
+	pr_debug("delta %d clockid %d deadline %s\n",
+		 qopt->delta, qopt->clockid,
+		 DEADLINE_MODE_IS_ON(qopt) ? "on" : "off");
+
+	err = validate_input_params(qopt, extack);
+	if (err < 0)
+		return err;
+
+	q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);
+
+	/* Everything went OK, save the parameters used. */
+	q->delta = qopt->delta;
+	q->clockid = qopt->clockid;
+	q->deadline_mode = DEADLINE_MODE_IS_ON(qopt);
+
+	switch (q->clockid) {
+	case CLOCK_REALTIME:
+		q->get_time = ktime_get_real;
+		break;
+	case CLOCK_MONOTONIC:
+		q->get_time = ktime_get;
+		break;
+	case CLOCK_BOOTTIME:
+		q->get_time = ktime_get_boottime;
+		break;
+	case CLOCK_TAI:
+		q->get_time = ktime_get_clocktai;
+		break;
+	default:
+		NL_SET_ERR_MSG(extack, "Clockid is not supported");
+		return -ENOTSUPP;
+	}
+
+	qdisc_watchdog_init_clockid(&q->watchdog, sch, q->clockid);
+
+	return 0;
+}
+
+static void timesortedlist_clear(struct Qdisc *sch)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	struct rb_node *p = rb_first(&q->head);
+
+	while (p) {
+		struct sk_buff *skb = rb_to_skb(p);
+
+		p = rb_next(p);
+
+		rb_erase(&skb->rbnode, &q->head);
+		rtnl_kfree_skbs(skb, skb);
+		sch->q.qlen--;
+	}
+}
+
+static void etf_reset(struct Qdisc *sch)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+
+	/* Only cancel watchdog if it's been initialized. */
+	if (q->watchdog.qdisc == sch)
+		qdisc_watchdog_cancel(&q->watchdog);
+
+	/* No matter which mode we are on, it's safe to clear both lists. */
+	timesortedlist_clear(sch);
+	__qdisc_reset_queue(&sch->q);
+
+	sch->qstats.backlog = 0;
+	sch->q.qlen = 0;
+
+	q->last = 0;
+}
+
+static void etf_destroy(struct Qdisc *sch)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+
+	/* Only cancel watchdog if it's been initialized. */
+	if (q->watchdog.qdisc == sch)
+		qdisc_watchdog_cancel(&q->watchdog);
+}
+
+static int etf_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	struct tc_etf_qopt opt = { };
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (!nest)
+		goto nla_put_failure;
+
+	opt.delta = q->delta;
+	opt.clockid = q->clockid;
+	if (q->deadline_mode)
+		opt.flags |= TC_ETF_DEADLINE_MODE_ON;
+
+	if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	return nla_nest_end(skb, nest);
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
+static struct Qdisc_ops etf_qdisc_ops __read_mostly = {
+	.id		=	"etf",
+	.priv_size	=	sizeof(struct etf_sched_data),
+	.enqueue	=	etf_enqueue_timesortedlist,
+	.dequeue	=	etf_dequeue_timesortedlist,
+	.peek		=	etf_peek_timesortedlist,
+	.init		=	etf_init,
+	.reset		=	etf_reset,
+	.destroy	=	etf_destroy,
+	.dump		=	etf_dump,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init etf_module_init(void)
+{
+	return register_qdisc(&etf_qdisc_ops);
+}
+
+static void __exit etf_module_exit(void)
+{
+	unregister_qdisc(&etf_qdisc_ops);
+}
+module_init(etf_module_init)
+module_exit(etf_module_exit)
+MODULE_LICENSE("GPL");
-- 
2.17.1

^ permalink raw reply related

* [PATCH v1 net-next 03/14] net: ipv4: Hook into time based transmission
From: Jesus Sanchez-Palencia @ 2018-06-27 21:59 UTC (permalink / raw)
  To: netdev
  Cc: tglx, jan.altenberg, vinicius.gomes, kurt.kanzenbach, henrik,
	richardcochran, levi.pearson, ilias.apalodimas, ivan.khoronzhuk,
	mlichvar, willemb, jhs, xiyou.wangcong, jiri, Richard Cochran,
	Jesus Sanchez-Palencia
In-Reply-To: <20180627215950.6719-1-jesus.sanchez-palencia@intel.com>

Add a transmit_time field to struct inet_cork, then copy the
timestamp from the CMSG cookie at ip_setup_cork() so we can
safely copy it into the skb later during __ip_make_skb().

For the raw fast path, just perform the copy at raw_send_hdrinc().

Signed-off-by: Richard Cochran <rcochran@linutronix.de>
Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
---
 include/net/inet_sock.h | 1 +
 net/ipv4/ip_output.c    | 3 +++
 net/ipv4/raw.c          | 2 ++
 net/ipv4/udp.c          | 1 +
 4 files changed, 7 insertions(+)

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 83d5b3c2ac42..314be484c696 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -148,6 +148,7 @@ struct inet_cork {
 	__s16			tos;
 	char			priority;
 	__u16			gso_size;
+	u64			transmit_time;
 };
 
 struct inet_cork_full {
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index b3308e9d9762..904a54a090e9 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1153,6 +1153,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
 	cork->tos = ipc->tos;
 	cork->priority = ipc->priority;
 	cork->tx_flags = ipc->tx_flags;
+	cork->transmit_time = ipc->sockc.transmit_time;
 
 	return 0;
 }
@@ -1413,6 +1414,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
 
 	skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
 	skb->mark = sk->sk_mark;
+	skb->tstamp = cork->transmit_time;
 	/*
 	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
 	 * on dst refcount
@@ -1495,6 +1497,7 @@ struct sk_buff *ip_make_skb(struct sock *sk,
 	cork->flags = 0;
 	cork->addr = 0;
 	cork->opt = NULL;
+	cork->transmit_time = 0;
 	err = ip_setup_cork(sk, cork, ipc, rtp);
 	if (err)
 		return ERR_PTR(err);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index abb3c9490c55..446af7be2b55 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -381,6 +381,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
 
 	skb->priority = sk->sk_priority;
 	skb->mark = sk->sk_mark;
+	skb->tstamp = sockc->transmit_time;
 	skb_dst_set(skb, &rt->dst);
 	*rtp = NULL;
 
@@ -562,6 +563,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	}
 
 	ipc.sockc.tsflags = sk->sk_tsflags;
+	ipc.sockc.transmit_time = 0;
 	ipc.addr = inet->inet_saddr;
 	ipc.opt = NULL;
 	ipc.tx_flags = 0;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 9bb27df4dac5..0ab2c13bc7a1 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -978,6 +978,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	}
 
 	ipc.sockc.tsflags = sk->sk_tsflags;
+	ipc.sockc.transmit_time = 0;
 	ipc.addr = inet->inet_saddr;
 	ipc.oif = sk->sk_bound_dev_if;
 	ipc.gso_size = up->gso_size;
-- 
2.17.1

^ permalink raw reply related

* Re: [RFC PATCH] net, mm: account sock objects to kmemcg
From: Eric Dumazet @ 2018-06-27 22:06 UTC (permalink / raw)
  To: Shakeel Butt, eric.dumazet
  Cc: Andrew Morton, Johannes Weiner, Vladimir Davydov, Greg Thelen,
	Roman Gushchin, davem, Eric Dumazet, Kirill Tkhai, LKML, netdev,
	Linux MM
In-Reply-To: <CALvZod4JTY8T19L-q+E1LLFVGFso6ea1MACKdFsed8dM-3AvYQ@mail.gmail.com>



On 06/27/2018 03:03 PM, Shakeel Butt wrote:

> 
> This will opt-in all the sock kmem_caches which I think is better and
> much smaller change. Should I resend this or do you want to send the
> patch?
>

Please send a V2, with maybe some updated changelog ;)

Thanks !

^ permalink raw reply

* [PATCH v1 iproute2 1/2] uapi pkt_sched: Add etf info - DO NOT COMMIT
From: Jesus Sanchez-Palencia @ 2018-06-27 22:09 UTC (permalink / raw)
  To: netdev
  Cc: jhs, kurt.kanzenbach, xiyou.wangcong, jiri, vinicius.gomes,
	Jesus Sanchez-Palencia

This should come from the next uapi headers update.
Sending it now just as a convenience so anyone can build tc with etf
and taprio support.

Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
---
 include/uapi/linux/pkt_sched.h | 66 ++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 37b5096a..4d5a5bd3 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -934,4 +934,70 @@ enum {
 
 #define TCA_CBS_MAX (__TCA_CBS_MAX - 1)
 
+
+/* ETF */
+struct tc_etf_qopt {
+	__s32 delta;
+	__s32 clockid;
+	__u32 flags;
+#define TC_ETF_DEADLINE_MODE_ON	BIT(0)
+#define TC_ETF_OFFLOAD_ON	BIT(1)
+};
+
+enum {
+	TCA_ETF_UNSPEC,
+	TCA_ETF_PARMS,
+	__TCA_ETF_MAX,
+};
+
+#define TCA_ETF_MAX (__TCA_ETF_MAX - 1)
+
+/* TAPRIO */
+enum {
+	TC_TAPRIO_CMD_SET_GATES = 0x00,
+	TC_TAPRIO_CMD_SET_AND_HOLD = 0x01,
+	TC_TAPRIO_CMD_SET_AND_RELEASE = 0x02,
+};
+
+enum {
+	TCA_TAPRIO_SCHED_ENTRY_UNSPEC,
+	TCA_TAPRIO_SCHED_ENTRY_INDEX, /* u32 */
+	TCA_TAPRIO_SCHED_ENTRY_CMD, /* u8 */
+	TCA_TAPRIO_SCHED_ENTRY_GATE_MASK, /* u32 */
+	TCA_TAPRIO_SCHED_ENTRY_INTERVAL, /* u32 */
+	__TCA_TAPRIO_SCHED_ENTRY_MAX,
+};
+#define TCA_TAPRIO_SCHED_ENTRY_MAX (__TCA_TAPRIO_SCHED_ENTRY_MAX - 1)
+
+/* The format for schedule entry list is:
+ * [TCA_TAPRIO_SCHED_ENTRY_LIST]
+ *   [TCA_TAPRIO_SCHED_ENTRY]
+ *     [TCA_TAPRIO_SCHED_ENTRY_CMD]
+ *     [TCA_TAPRIO_SCHED_ENTRY_GATES]
+ *     [TCA_TAPRIO_SCHED_ENTRY_INTERVAL]
+ */
+enum {
+	TCA_TAPRIO_SCHED_UNSPEC,
+	TCA_TAPRIO_SCHED_ENTRY,
+	__TCA_TAPRIO_SCHED_MAX,
+};
+
+#define TCA_TAPRIO_SCHED_MAX (__TCA_TAPRIO_SCHED_MAX - 1)
+
+enum {
+	TCA_TAPRIO_ATTR_UNSPEC,
+	TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */
+	TCA_TAPRIO_ATTR_PREEMPT_MASK, /* which traffic classes are preemptible, u32 */
+	TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST, /* nested of entry */
+	TCA_TAPRIO_ATTR_SCHED_BASE_TIME, /* s64 */
+	TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, /* s64 */
+	TCA_TAPRIO_ATTR_SCHED_EXTENSION_TIME, /* s64 */
+	TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY, /*  */
+	TCA_TAPRIO_ATTR_SCHED_CLOCKID, /* s32 */
+	TCA_TAPRIO_PAD,
+	__TCA_TAPRIO_ATTR_MAX,
+};
+
+#define TCA_TAPRIO_ATTR_MAX (__TCA_TAPRIO_ATTR_MAX - 1)
+
 #endif
-- 
2.17.1

^ permalink raw reply related

* [PATCH v1 iproute2 2/2] tc: Add support for the ETF Qdisc
From: Jesus Sanchez-Palencia @ 2018-06-27 22:09 UTC (permalink / raw)
  To: netdev
  Cc: jhs, kurt.kanzenbach, xiyou.wangcong, jiri, vinicius.gomes,
	Jesus Sanchez-Palencia
In-Reply-To: <20180627220912.7148-1-jesus.sanchez-palencia@intel.com>

From: Vinicius Costa Gomes <vinicius.gomes@intel.com>

The "Earliest TxTime First" (ETF) queueing discipline allows precise
control of the transmission time of packets by providing a sorted
time-based scheduling of packets.

The syntax is:

tc qdisc add dev DEV parent NODE etf delta <DELTA>
                     clockid <CLOCKID> [offload] [deadline_mode]

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
---
 tc/Makefile |   1 +
 tc/q_etf.c  | 168 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 169 insertions(+)
 create mode 100644 tc/q_etf.c

diff --git a/tc/Makefile b/tc/Makefile
index dfd00267..4525c0fb 100644
--- a/tc/Makefile
+++ b/tc/Makefile
@@ -71,6 +71,7 @@ TCMODULES += q_clsact.o
 TCMODULES += e_bpf.o
 TCMODULES += f_matchall.o
 TCMODULES += q_cbs.o
+TCMODULES += q_etf.o
 
 TCSO :=
 ifeq ($(TC_CONFIG_ATM),y)
diff --git a/tc/q_etf.c b/tc/q_etf.c
new file mode 100644
index 00000000..5db1dd6f
--- /dev/null
+++ b/tc/q_etf.c
@@ -0,0 +1,168 @@
+/*
+ * q_etf.c		Earliest TxTime First (ETF).
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Vinicius Costa Gomes <vinicius.gomes@intel.com>
+ *		Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "utils.h"
+#include "tc_util.h"
+
+#define CLOCKID_INVALID (-1)
+static void explain(void)
+{
+	fprintf(stderr, "Usage: ... etf delta NANOS clockid CLOCKID [offload] [deadline_mode]\n");
+	fprintf(stderr, "CLOCKID must be a valid SYS-V id (i.e. CLOCK_TAI)\n");
+}
+
+static void explain1(const char *arg, const char *val)
+{
+	fprintf(stderr, "etf: illegal value for \"%s\": \"%s\"\n", arg, val);
+}
+
+static void explain_clockid(const char *val)
+{
+	fprintf(stderr, "etf: illegal value for \"clockid\": \"%s\".\n", val);
+	fprintf(stderr, "It must be a valid SYS-V id (i.e. CLOCK_TAI)");
+}
+
+static int get_clockid(__s32 *val, const char *arg)
+{
+	const struct static_clockid {
+		const char *name;
+		clockid_t clockid;
+	} clockids_sysv[] = {
+		{ "CLOCK_REALTIME", CLOCK_REALTIME },
+		{ "CLOCK_TAI", CLOCK_TAI },
+		{ "CLOCK_BOOTTIME", CLOCK_BOOTTIME },
+		{ "CLOCK_MONOTONIC", CLOCK_MONOTONIC },
+		{ NULL }
+	};
+
+	const struct static_clockid *c;
+
+	for (c = clockids_sysv; c->name; c++) {
+		if (strncasecmp(c->name, arg, 25) == 0) {
+			*val = c->clockid;
+
+			return 0;
+		}
+	}
+
+	return -1;
+}
+
+
+static int etf_parse_opt(struct qdisc_util *qu, int argc,
+			 char **argv, struct nlmsghdr *n, const char *dev)
+{
+	struct tc_etf_qopt opt = {
+		.clockid = CLOCKID_INVALID,
+	};
+	struct rtattr *tail;
+
+	while (argc > 0) {
+		if (matches(*argv, "offload") == 0) {
+			if (opt.flags & TC_ETF_OFFLOAD_ON) {
+				fprintf(stderr, "etf: duplicate \"offload\" specification\n");
+				return -1;
+			}
+
+			opt.flags |= TC_ETF_OFFLOAD_ON;
+		} else if (matches(*argv, "deadline_mode") == 0) {
+			if (opt.flags & TC_ETF_DEADLINE_MODE_ON) {
+				fprintf(stderr, "etf: duplicate \"deadline_mode\" specification\n");
+				return -1;
+			}
+
+			opt.flags |= TC_ETF_DEADLINE_MODE_ON;
+		} else if (matches(*argv, "delta") == 0) {
+			NEXT_ARG();
+			if (opt.delta) {
+				fprintf(stderr, "etf: duplicate \"delta\" specification\n");
+				return -1;
+			}
+			if (get_s32(&opt.delta, *argv, 0)) {
+				explain1("delta", *argv);
+				return -1;
+			}
+		} else if (matches(*argv, "clockid") == 0) {
+			NEXT_ARG();
+			if (opt.clockid != CLOCKID_INVALID) {
+				fprintf(stderr, "etf: duplicate \"clockid\" specification\n");
+				return -1;
+			}
+			if (get_clockid(&opt.clockid, *argv)) {
+				explain_clockid(*argv);
+				return -1;
+			}
+		} else if (strcmp(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			fprintf(stderr, "etf: unknown parameter \"%s\"\n", *argv);
+			explain();
+			return -1;
+		}
+		argc--; argv++;
+	}
+
+	tail = NLMSG_TAIL(n);
+	addattr_l(n, 1024, TCA_OPTIONS, NULL, 0);
+	addattr_l(n, 2024, TCA_ETF_PARMS, &opt, sizeof(opt));
+	tail->rta_len = (void *) NLMSG_TAIL(n) - (void *) tail;
+	return 0;
+}
+
+static int etf_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
+{
+	struct rtattr *tb[TCA_ETF_MAX+1];
+	struct tc_etf_qopt *qopt;
+
+	if (opt == NULL)
+		return 0;
+
+	parse_rtattr_nested(tb, TCA_ETF_MAX, opt);
+
+	if (tb[TCA_ETF_PARMS] == NULL)
+		return -1;
+
+	qopt = RTA_DATA(tb[TCA_ETF_PARMS]);
+	if (RTA_PAYLOAD(tb[TCA_ETF_PARMS])  < sizeof(*qopt))
+		return -1;
+
+	if (qopt->clockid == CLOCKID_INVALID)
+		print_string(PRINT_ANY, "clockid", "clockid %s ", "invalid");
+	else
+		print_uint(PRINT_ANY, "clockid", "clockid %d ", qopt->clockid);
+
+	print_uint(PRINT_ANY, "delta", "delta %d ", qopt->delta);
+	print_string(PRINT_ANY, "offload", "offload %s ",
+				(qopt->flags & TC_ETF_OFFLOAD_ON) ? "on" : "off");
+	print_string(PRINT_ANY, "deadline_mode", "deadline_mode %s",
+				(qopt->flags & TC_ETF_DEADLINE_MODE_ON) ? "on" : "off");
+
+	return 0;
+}
+
+struct qdisc_util etf_qdisc_util = {
+	.id		= "etf",
+	.parse_qopt	= etf_parse_opt,
+	.print_qopt	= etf_print_opt,
+};
-- 
2.17.1

^ permalink raw reply related

* [PATCH v2] net, mm: account sock objects to kmemcg
From: Shakeel Butt @ 2018-06-27 22:16 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Johannes Weiner, Vladimir Davydov, Greg Thelen, Roman Gushchin,
	David S . Miller, Eric Dumazet, Kirill Tkhai, linux-kernel,
	netdev, linux-mm, Shakeel Butt

Currently the kernel accounts the memory for network traffic through
mem_cgroup_[un]charge_skmem() interface. However the memory accounted
only includes the truesize of sk_buff which does not include the size of
sock objects. In our production environment, with opt-out kmem
accounting, the sock kmem caches (TCP[v6], UDP[v6], RAW[v6], UNIX) are
among the top most charged kmem caches and consume a significant amount
of memory which can not be left as system overhead. So, this patch
converts the kmem caches of all sock objects to SLAB_ACCOUNT.

Signed-off-by: Shakeel Butt <shakeelb@google.com>
Suggested-by: Eric Dumazet <edumazet@google.com>
---
Changelog since v1:
- Instead of specific sock kmem_caches, convert all sock kmem_caches to
  use SLAB_ACCOUNT.

 net/core/sock.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index bcc41829a16d..9e8f65585b81 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3243,7 +3243,8 @@ static int req_prot_init(const struct proto *prot)
 
 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
 					   rsk_prot->obj_size, 0,
-					   prot->slab_flags, NULL);
+					   SLAB_ACCOUNT | prot->slab_flags,
+					   NULL);
 
 	if (!rsk_prot->slab) {
 		pr_crit("%s: Can't create request sock SLAB cache!\n",
@@ -3258,7 +3259,8 @@ int proto_register(struct proto *prot, int alloc_slab)
 	if (alloc_slab) {
 		prot->slab = kmem_cache_create_usercopy(prot->name,
 					prot->obj_size, 0,
-					SLAB_HWCACHE_ALIGN | prot->slab_flags,
+					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
+					prot->slab_flags,
 					prot->useroffset, prot->usersize,
 					NULL);
 
@@ -3281,6 +3283,7 @@ int proto_register(struct proto *prot, int alloc_slab)
 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
 						  prot->twsk_prot->twsk_obj_size,
 						  0,
+						  SLAB_ACCOUNT |
 						  prot->slab_flags,
 						  NULL);
 			if (prot->twsk_prot->twsk_slab == NULL)
-- 
2.18.0.rc2.346.g013aa6912e-goog

^ permalink raw reply related

* Re: [PATCH v1 net-next 02/14] net: Add a new socket option for a future transmit time.
From: Eric Dumazet @ 2018-06-27 22:16 UTC (permalink / raw)
  To: Jesus Sanchez-Palencia, netdev
  Cc: tglx, jan.altenberg, vinicius.gomes, kurt.kanzenbach, henrik,
	richardcochran, levi.pearson, ilias.apalodimas, ivan.khoronzhuk,
	mlichvar, willemb, jhs, xiyou.wangcong, jiri, Richard Cochran
In-Reply-To: <20180627215950.6719-3-jesus.sanchez-palencia@intel.com>



On 06/27/2018 02:59 PM, Jesus Sanchez-Palencia wrote:
> From: Richard Cochran <rcochran@linutronix.de>
> 
> This patch introduces SO_TXTIME. User space enables this option in
> order to pass a desired future transmit time in a CMSG when calling
> sendmsg(2). The argument to this socket option is a 6-bytes long struct
> defined as:
> 
> struct sock_txtime {
> 	clockid_t 	clockid;
> 	u16		flags;
> };

Note that sizeof(struct sock_txtime) is 8, not 6, because of alignments.

This means that your implementation of getsockopt(... SO_TXTIME )
is probably leaking two bytes of kernel stack to user space.

^ permalink raw reply

* Re: [PATCH net-next v2] tcp: force cwnd at least 2 in tcp_cwnd_reduction
From: Yuchung Cheng @ 2018-06-27 22:27 UTC (permalink / raw)
  To: Lawrence Brakmo
  Cc: Neal Cardwell, Matt Mathis, Netdev, Kernel Team, Blake Matheny,
	Alexei Starovoitov, Eric Dumazet, Wei Wang
In-Reply-To: <9745C5DD-BB86-4A7D-B607-4B8AA0E07245@fb.com>

On Wed, Jun 27, 2018 at 1:00 PM, Lawrence Brakmo <brakmo@fb.com> wrote:
>
>
> From: <netdev-owner@vger.kernel.org> on behalf of Yuchung Cheng
> <ycheng@google.com>
> Date: Wednesday, June 27, 2018 at 9:59 AM
> To: Neal Cardwell <ncardwell@google.com>
> Cc: Lawrence Brakmo <brakmo@fb.com>, Matt Mathis <mattmathis@google.com>,
> Netdev <netdev@vger.kernel.org>, Kernel Team <Kernel-team@fb.com>, Blake
> Matheny <bmatheny@fb.com>, Alexei Starovoitov <ast@fb.com>, Eric Dumazet
> <eric.dumazet@gmail.com>, Wei Wang <weiwan@google.com>
> Subject: Re: [PATCH net-next v2] tcp: force cwnd at least 2 in
> tcp_cwnd_reduction
>
>
>
> On Wed, Jun 27, 2018 at 8:24 AM, Neal Cardwell <ncardwell@google.com> wrote:
>
> On Tue, Jun 26, 2018 at 10:34 PM Lawrence Brakmo <brakmo@fb.com> wrote:
>
> The only issue is if it is safe to always use 2 or if it is better to
>
> use min(2, snd_ssthresh) (which could still trigger the problem).
>
>
>
> Always using 2 SGTM. I don't think we need min(2, snd_ssthresh), as
>
> that should be the same as just 2, since:
>
>
>
> (a) RFCs mandate ssthresh should not be below 2, e.g.
>
> https://urldefense.proofpoint.com/v2/url?u=https-3A__tools.ietf.org_html_rfc5681&d=DwIBaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=pq_Mqvzfy-C8ltkgyx1u_g&m=PZQC-6NGqK6QEVrf1WhlAD3mQt7tK8aqrfQGp93dJy4&s=WH9fXpDBHC31NCixiiMvhOb2UXCuJIxUiY4IXyJTgpc&e=
> page 7:
>
>
>
>   ssthresh = max (FlightSize / 2, 2*SMSS)            (4)
>
>
>
> (b) The main loss-based CCs used in Linux (CUBIC, Reno, DCTCP) respect
>
> that constraint, and always have an ssthresh of at least 2.
>
>
>
> And if some CC misbehaves and uses a lower ssthresh, then taking
>
> min(2, snd_ssthresh) will trigger problems, as you note.
>
>
>
> +       tp->snd_cwnd = max((int)tcp_packets_in_flight(tp) + sndcnt, 2);
>
>
>
> AFAICT this does seem like it will make the sender behavior more
>
> aggressive in cases with high loss and/or a very low per-flow
>
> fair-share.
>
>
>
> Old:
>
>
>
> o send N packets
>
> o receive SACKs for last 3 packets
>
> o fast retransmit packet 1
>
> o using ACKs, slow-start upward
>
>
>
> New:
>
>
>
> o send N packets
>
> o receive SACKs for last 3 packets
>
> o fast retransmit packets 1 and 2
>
> o using ACKs, slow-start upward
>
>
>
> In the extreme case, if the available fair share is less than 2
>
> packets, whereas inflight would have oscillated between 1 packet and 2
>
> packets with the existing code, it now seems like with this commit the
>
> inflight will now hover at 2. It seems like this would have
>
> significantly higher losses than we had with the existing code.
>
> I share similar concern. Note that this function is used by most
>
> existing congestion control modules beside DCTCP so I am more cautious
>
> of changing this to address DCTCP issue.
>
>
>
> Theoretically it could happen with any CC, but I have only seen it
> consistently with DCTCP.
>
> One option, as I mentioned to Neal, is to use 2 only when cwnd reduction is
> caused by
>
> ECN signal.
>
>
>
> One problem that DCTCP paper notices when cwnd = 1 is still too big
>
> when the bottleneck
>
> is shared by many flows (e.g. incast). It specifically suggest
>
> changing the lower-bound of 2 in the spec to 1. (Section 8.2).
>
> https://urldefense.proofpoint.com/v2/url?u=https-3A__www.usenix.org_system_files_conference_nsdi15_nsdi15-2Dpaper-2Djudd.pdf&d=DwIBaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=pq_Mqvzfy-C8ltkgyx1u_g&m=PZQC-6NGqK6QEVrf1WhlAD3mQt7tK8aqrfQGp93dJy4&s=fO8p5khks-sGO_lwEvKicWtFz_q9LI-ecwZcS1nyZ7w&e=
>
>
>
> One could even envision using even lower values than 1, where we send one
> packet
>
> every 2 RTTs. However, I do not think this is a problem that needs to be
> fixed.
>
>
>
> I am curious about the differences you observe in 4.11 and 4.16. I
>
> wasn't aware of any (significant) change in tcp_cwnd_reduction / PRR
>
> algorithm between 4.11 and 4.16. Also the receiver should not delay
>
> ACKs if it has out-of-order packet or receiving CE data packets. This
>
> means the delayed ACK is by tail losses and the last data received
>
> carries no CE mark: seems a less common scenario?
>
>
>
> I have the feeling the differences between 4.11 and 4.16 are due to
> accounting
>
> changes and not change in CC behaviors.
If the difference is not due to change in PRR, I don't think we
should change PRR to address that until we have some evidence.

Could you share some traces?

>
>
>
> The packet and ACK are not lost. I captured pcaps on both hosts and could
> see
>
> cases when the data packet arrived, but the ACK was not sent before the
> packet was
>
> retransmitted. I saw this behavior multiple times. In many cases the ACK did
> not show
>
> up in the pcap within the 40ms one would usually expect with a delayed ACK.
>
> If delayed-ACK is the problem, we probably should fix the receiver to
>
> delay ACK more intelligently, not the sender. weiwan@google.com is
>
> working on it.
>
>
>
> Not sure if it is delayed-ACK or something caused the ACK to not be sent
> since it did
>
> not show up within 40ms as one would usually expect for a delayed ACK.
>
>
>
>
>
>
>
> This may or may not be OK in practice, but IMHO it is worth mentioning
>
> and discussing.
>
>
>
> neal
>
>

^ permalink raw reply

* Re: [RFC v2, net-next, PATCH 4/4] net/cpsw_switchdev: add switchdev mode of operation on cpsw driver
From: Grygorii Strashko @ 2018-06-27 23:03 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Ilias Apalodimas, Ivan Vecera, Florian Fainelli, Andrew Lunn,
	Networking, ivan.khoronzhuk, Sekhar Nori,
	Jiří Pírko, Francois Ozog, yogeshs, spatton,
	Jose.Abreu
In-Reply-To: <CAK8P3a1fi=C-60J7qMCRsOWYM9_e3YN7hf4hZuiSbPCvPBRbtA@mail.gmail.com>



On 06/27/2018 03:40 PM, Arnd Bergmann wrote:
> On Wed, Jun 27, 2018 at 9:18 PM, Grygorii Strashko
> <grygorii.strashko@ti.com> wrote:
>> On 06/22/2018 02:45 AM, Ilias Apalodimas wrote:
>>> On Thu, Jun 21, 2018 at 05:31:31PM +0200, Arnd Bergmann wrote:
>>>> On Thu, Jun 21, 2018 at 2:45 PM, Ilias Apalodimas
>>>> <ilias.apalodimas@linaro.org> wrote:
>>>>> On Thu, Jun 21, 2018 at 02:19:55PM +0200, Ivan Vecera wrote:
>>>>
>>>
>>> If people like this idea, i can send a V3 with these changes.
>>
>> Nop. I do not think this is good idea, because "dual_mac" mode has very strict
>> meaning and requirements. In "dual_mac" mode both port should be teated and work
>> as *separate network devices" (like two, not connected PCI eth cards) - the fact that
>> it's implemented on top of hw, which can do packet switching doesn't matter here and just a
>> technical solution.
>> Main requirements:
>> 1) No packet forwarding is allowed inside hw under any circumstances, only Linux
>>     Host SW can consume or forward packets
>> 2) One interface should not block another inside CPSW hw which implies special FIFOs/HW
>>   configuration
> 
> Could you explain the reasoning behind those requirements? I honestly don't
> see what difference it makes, given that a new driver with switchdev support
> would look exactly like the dual_emac mode as long as you don't add the
> two interfaces into a bridge, and the user-visible behavior is already required
> to be the same.

Am not aware of all details - it's custom filtering/routing/firewalling applications.
(Like Industrial Ethernet (EtherCAT) to Ethernet converter on one port and
 another port is for control/monitoring purposes)
And yes, it looks similar. But, as I mentioned, dual_mac mode required CPSW to be
configured differently and reconfiguration during attaching to the bridge
is very (very) problematic - first, FIFOs/HW configuration not expected to be done on the fly,
second vlans 1/2 reserved for this mode while bridge uses vid 1 by default.
In dual_mac mode port just switched to promiscuous mode when attached to the bridge.
Using kernel configuration option will break multi-platform support as
all CPSW instances will start behaving as switch.  

> 
>> As per, above switchdev functionality doesn't make too much sense in "dual_mac" mode and
>> introducing dual meaning for this mode is not a good choice either.
>>
>> Again, as discussed, option 4 is considered as preferred.
> 
> Do you mean creating an incompatible binding that could be implemented by
> the same driver, or duplicating the driver with one copy of the old binding
> and one copy for the new binding?

The idea is to keep dual_mac and one port mode (60% of use cases) as is
while create new driver for two port switch mode by refactoring existing driver and 
re-using generic parts as max as possible. Also, update bindings as there are
a lot of ancient and obsolete DT definitions which still supported for compatibility
reasons. And, yes, possibility to mix dual_mac and switchdev in one driver is
considered, but postponed as it hard to implement now, and as the main target is
to drop custom ioctl and switch to standard Linux interfaces for switch.
One step at time.

-- 
regards,
-grygorii

^ permalink raw reply

* [PATCH net] tcp: fix Fast Open key endianness
From: Yuchung Cheng @ 2018-06-27 23:04 UTC (permalink / raw)
  To: davem; +Cc: netdev, edumazet, ncardwell, Yuchung Cheng

Fast Open key could be stored in different endian based on the CPU.
Previously hosts in different endianness in a server farm using
the same key config (sysctl value) would produce different cookies.
This patch fixes it by always storing it as little endian to keep
same API for LE hosts.

Reported-by: Daniele Iamartino <danielei@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
---
 net/ipv4/sysctl_net_ipv4.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d06247ba08b2..af0a857d8352 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -265,8 +265,9 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
 	    ipv4.sysctl_tcp_fastopen);
 	struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
 	struct tcp_fastopen_context *ctxt;
-	int ret;
 	u32  user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */
+	__le32 key[4];
+	int ret, i;
 
 	tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL);
 	if (!tbl.data)
@@ -275,11 +276,14 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
 	rcu_read_lock();
 	ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
 	if (ctxt)
-		memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
+		memcpy(key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
 	else
-		memset(user_key, 0, sizeof(user_key));
+		memset(key, 0, sizeof(key));
 	rcu_read_unlock();
 
+	for (i = 0; i < ARRAY_SIZE(key); i++)
+		user_key[i] = le32_to_cpu(key[i]);
+
 	snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x",
 		user_key[0], user_key[1], user_key[2], user_key[3]);
 	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
@@ -290,13 +294,17 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
 			ret = -EINVAL;
 			goto bad_key;
 		}
-		tcp_fastopen_reset_cipher(net, NULL, user_key,
+
+		for (i = 0; i < ARRAY_SIZE(user_key); i++)
+			key[i] = cpu_to_le32(user_key[i]);
+
+		tcp_fastopen_reset_cipher(net, NULL, key,
 					  TCP_FASTOPEN_KEY_LENGTH);
 	}
 
 bad_key:
 	pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n",
-	       user_key[0], user_key[1], user_key[2], user_key[3],
+		user_key[0], user_key[1], user_key[2], user_key[3],
 	       (char *)tbl.data, ret);
 	kfree(tbl.data);
 	return ret;
-- 
2.18.0.rc2.346.g013aa6912e-goog

^ permalink raw reply related

* Re: [PATCH 0/6] offload Linux LAG devices to the TC datapath
From: Jakub Kicinski @ 2018-06-27 23:08 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: Or Gerlitz, John Hurley, Jiri Pirko, Linux Netdev List,
	ASAP_Direct_Dev, Simon Horman, Andy Gospodarek
In-Reply-To: <CAJ3xEMg=UDooJtZ4amhJAF1ohHHzLFm7qgWrrkhMmff3G-RstQ@mail.gmail.com>

On Wed, 27 Jun 2018 23:07:29 +0300, Or Gerlitz wrote:
> On Wed, Jun 27, 2018 at 1:31 AM, Jakub Kicinski
> <jakub.kicinski@netronome.com> wrote:
> > On Tue, 26 Jun 2018 17:57:08 +0300, Or Gerlitz wrote:  
> 
> >> 2. re the egress side of things. Some NIC HWs can't just use LAG
> >> as the egress port destination of an ACL (tc rule) and the HW rule
> >> needs to be duplicated to both HW ports. So... in that case, you
> >> see the HW driver doing the duplication (:() or we can somehow
> >> make it happen from user-space?  
> 
> > It's the TC core that does the duplication.  Drivers which don't need
> > the duplication (e.g. mlxsw) will not register a new callback for each
> > port on which shared block is bound.  They will keep one list of rules,
> > and a list of ports that those rules apply to.  
> 
> [snip]
> 
> > Drivers which need duplication (multiplication) (all NICs?) have to
> > register a new callback for each port bound to a shared block.  And TC
> > will call those drivers as many times as they have callbacks registered
> > == as many times as they have ports bound to the block.  Each time
> > callback is invoked the driver will figure out the ingress port based
> > on the cb_priv and use <ingress, cookie> as the key in its rule table
> > (or have a separate rule table per ingress port).  
> 
> [snip snip]
> 
> > I may be wrong, but I think you split the rules tables per port for mlx5  
> 
> correct,  currently I have a rule table per physical port.
> 
> > So again you just register a callback every time shared block is bound,
> > and then TC core will send add/remove rule commands down to the driver,
> > relaying existing rules as well if needed.  
> 
> Let's see, the NIC uplink rep port devices were bounded (say) by ovs to
> a shared-block because they are the lower devices (hate the slavish jargon)
> of a bond device.
> 
> Next, the TC stack will invoke the callback over these ports, when ingress
> rule is added on the bond.
> 
> But we are talking on ingress rule set on a non-uplink rep (VF rep) port,
> where bonding is the egress of the rule. I guess the callback which you probably
> refer to (you hinted there below) is the egdev one, correct? you are suggesting
> that bonding will do egdev registration... I am a bit confused.

Ah, you really meant egress.  We don't have this problem, but yes, I
think you could register an egdev callback for each lower.  You won't
get the nice rule replay from egdev as of today, though :(

> > Does that clarify things or were you asking more about the active
> > passive thing John mentioned or some way to save rule space?  
> 
> no (didn't refer to active-passive) and no (didn't look to save rule space)
> yes for active-active in a HW that needs duplicated rules (NICs).
> 
> >> 3. for the case of overlay networks, e.g OVS based vxlan tunnel, the
> >> ingress (decap) rule is set on the vxlan device. Jakub, you mentioned
> >> a possible kernel patch to the HW (nfp, mlx5) drivers to have them bind
> >> to the tunnel device for ingress rules. If we have agreed way to identify
> >> uplink representors, can we do that from ovs too?  
> >
> > I'm not sure, there can be multiple tunnel devices.  Plus we really
> > want to know the tunnel type, e.g. vxlan vs geneve, so simple shared
> > block propagation will probably not cut it.  If that's what you're
> > referring to.  
> 
> isn't knowing the tunnel type already missing today? I saw you
> started patches the tunnel key set action for Geneve, does upstream
> + the patches you sent works or more is missing to get geneve encap
> through the TC stack?

Yes, knowing tunnel type missing today, but hopefully it won't be once
we get to redesign of egdev :)  Today we only support decap on standard
ports :/  Encap is fine, though.  FWIW Geneve already works on the nfp,
the work from Simon & Pieter we posted is adding support for the
options.

^ permalink raw reply

* Re: [PATCH v1 net-next 02/14] net: Add a new socket option for a future transmit time.
From: Jesus Sanchez-Palencia @ 2018-06-27 23:07 UTC (permalink / raw)
  To: Eric Dumazet, netdev
  Cc: tglx, jan.altenberg, vinicius.gomes, kurt.kanzenbach, henrik,
	richardcochran, ilias.apalodimas, ivan.khoronzhuk, mlichvar,
	willemb, jhs, xiyou.wangcong, jiri
In-Reply-To: <c133ef6d-120c-52fb-e6bb-f33da5badb27@gmail.com>

Hi Eric,


On 06/27/2018 03:16 PM, Eric Dumazet wrote:
> 
> 
> On 06/27/2018 02:59 PM, Jesus Sanchez-Palencia wrote:
>> From: Richard Cochran <rcochran@linutronix.de>
>>
>> This patch introduces SO_TXTIME. User space enables this option in
>> order to pass a desired future transmit time in a CMSG when calling
>> sendmsg(2). The argument to this socket option is a 6-bytes long struct
>> defined as:
>>
>> struct sock_txtime {
>> 	clockid_t 	clockid;
>> 	u16		flags;
>> };
> 
> Note that sizeof(struct sock_txtime) is 8, not 6, because of alignments.


Oh yeah, sure.


> 
> This means that your implementation of getsockopt(... SO_TXTIME )
> is probably leaking two bytes of kernel stack to user space.

I'm failing to see how... There is a memset() in sock.c:1147 clearing all the 8
bytes that we later use to (explicitly) assign each member of the struct. Aren't
the 2 extra bytes sanitized, then? What have I missed?


Thanks,
Jesus

^ permalink raw reply

* Re: [PATCH v1 net-next 12/14] igb: Only call skb_tx_timestamp after descriptors are ready
From: Eric Dumazet @ 2018-06-27 23:56 UTC (permalink / raw)
  To: Jesus Sanchez-Palencia, netdev
  Cc: tglx, jan.altenberg, vinicius.gomes, kurt.kanzenbach, henrik,
	richardcochran, levi.pearson, ilias.apalodimas, ivan.khoronzhuk,
	mlichvar, willemb, jhs, xiyou.wangcong, jiri
In-Reply-To: <20180627215950.6719-13-jesus.sanchez-palencia@intel.com>



On 06/27/2018 02:59 PM, Jesus Sanchez-Palencia wrote:
> Currently, skb_tx_timestamp() is being called before the DMA
> descriptors are prepared in igb_xmit_frame_ring(), which happens
> during either the igb_tso() or igb_tx_csum() calls.
> 
> Given that now the skb->tstamp might be used to carry the timestamp
> for SO_TXTIME, we must only call skb_tx_timestamp() after the
> information has been copied into the DMA tx_ring.


Since when this skb->tstamp use happened ?

If this is in patch 11/14 (igb: Add support for ETF offload), then you should either :

1) Squash this into 11/14

2) swap 11 and 12 patch, so that this change is done before "igb: Add support for ETF offload"  

Otherwise a bisection could fail badly.

^ permalink raw reply

* Re: [RFC PATCH RESEND] tcp: avoid F-RTO if SACK and timestamps are disabled
From: Yuchung Cheng @ 2018-06-27 23:56 UTC (permalink / raw)
  To: Ilpo Järvinen; +Cc: Michal Kubecek, netdev, Eric Dumazet, LKML
In-Reply-To: <alpine.DEB.2.20.1806151230340.29120@whs-18.cs.helsinki.fi>

On Fri, Jun 15, 2018 at 3:35 AM, Ilpo Järvinen
<ilpo.jarvinen@helsinki.fi> wrote:
> On Fri, 15 Jun 2018, Michal Kubecek wrote:
>
>> On Fri, Jun 15, 2018 at 11:05:03AM +0300, Ilpo Järvinen wrote:
>> > On Thu, 14 Jun 2018, Michal Kubecek wrote:
>> >
>> > My point was that the new data segment bursts that occur if the sender
>> > isn't application limited indicate that there's something going wrong
>> > with FRTO. And that wrong is also what is causing that RTO loop because
>> > the sender doesn't see the previous FRTO recovery on second RTO. With
>> > my FRTO undo fix, (new_recovery || icsk->icsk_retransmits) will be false
>> > and that will prevent the RTO loop.
>>
>> Yes, it would prevent the loop in this case (except it would be a bit
>> later, after second RTO rather than after first).
>
> Hmm, I'm actually wrong about the new data missing bit I think. After
> reading more code I'm quite sure conventional RTO recovery is triggered
> right away (as long as that bogus undo that ends the recovery wouldn't
> occur first like it does without my fix). So no second RTO would be
> needed.
>
>> But I'm not convinced
>> the logic of the patch is correct. If I understand it correctly, it
>> essentially changes "presumption of innocence" (if we get an ack past
>> what we retransmitted, we assume it was received earlier - i.e. would
>> have been sacked before if SACK was in use) to "presumption of guilt"
>> (whenever a retransmitted segment is acked, we assume nothing else acked
>> with it was received earlier). Or that you trade false negatives for
>> false positives.
>
> FRTO depends on knowing for sure what packet (original pre-RTO one or
> something that was transmitted post-RTO) triggered the ACK. If FRTO
> isn't sure that the ACK was generated by a post-RTO packet, it must
> not assume innocence! This change in practice affects just the time while
> the segment rexmitted by RTO is still there, that is, processing in step 2
> (if we get a cumulative ACK beyond it because the next loss is not for the
> subsequent segment but for some later segment, FLAG_ORIG_SACK_ACKED is set
> and we'll incorrectly do step 3b while still in FRTO has only reached step
> 2 for real; this is fixed by my patch). ...The decision about
> positive/negative only occurs _after_ that in step 3.
>
>> Maybe I understand it wrong but it seems that you de facto prevent
>> Step (3b) from ever happening in non-SACK case.
>
> Only if any of skb that was ACKed had been retransmitted. There shouldn't
> be any in step 3 because the RTO rexmit was ACKed (and also because
> of how new_recovery variable works wrt. earlier recoveries). Thus, in
> step 3 the fix won't clear FLAG_ORIG_SACK_ACKED flag allowing FRTO to
> detect spurious RTOs using step 3b.
>
>> > > > No! The window should not update window on ACKs the receiver intends to
>> > > > designate as "duplicate ACKs". That is not without some potential cost
>> > > > though as it requires delaying window updates up to the next cumulative
>> > > > ACK. In the non-SACK series one of the changes is fixing this for
>> > > > non-SACK Linux TCP flows.
>> > >
>> > > That sounds like a reasonable change (at least at the first glance,
>> > > I didn't think about it too deeply) but even if we fix Linux stack to
>> > > behave like this, we cannot force everyone else to do the same.
>> >
>> > Unfortunately I don't know what the other stacks besides Linux do. But
>> > for Linux, the cause for the changing receiver window is the receiver
>> > window auto-tuning and I'm not sure if other stacks have a similar
>> > feature (or if that affects (almost) all ACKs like in Linux).
>>
>> The capture from my previous e-mail and some others I have seen indicate
>> that at least some implementations do not take care to never change
>> window size when responding to an out-of-order segment.  That means that
>> even if we change linux TCP this way (we might still need to send
>> a separate window update in some cases), we still cannot rely on others
>> doing the same.
>
> Those implementations ignore what is a duplicate ACK (RFC5681, which
> is also pointed into by RFC5682 for its defination):
>   DUPLICATE ACKNOWLEDGMENT: An acknowledgment is considered a
>       "duplicate" ... (e)
>       the advertised window in the incoming acknowledgment equals the
>       advertised window in the last incoming acknowledgment.
>
> Not sending duplicate ACKs also means that fast recovery will not work
> for those flows but that may not show up performance wise as long as you
> have enough capacity for any unnecessary rexmits the forced RTO recovery
> is going to do. RTO recovery may actually improve completion times for
> non-SACK flows as NewReno recovers only one lost pkt/RTT where as RTO
> recovery needs log(outstanding packets) RTTs at worst. For a large number
> of losses in a window, the log is going to win.
>
>> I checked the capture attached to my previous e-mail again and there is
>> one thing where our F-RTO implementation (in 4.4, at least) is wrong,
>> IMHO. While the first ACK after "new data" (sent in (2b)) was a window
>> update (and therefore not dupack by definition) so that we could take
>> neither (3a) nor (3b), in some iterations there were further acks which
>> did not change window size. The text just before Step 1 says
>>
>>   The F-RTO algorithm does not specify actions for receiving
>>   a segment that neither acknowledges new data nor is a duplicate
>>   acknowledgment.  The TCP sender SHOULD ignore such segments and
>>   wait for a segment that either acknowledges new data or is
>>   a duplicate acknowledgment.
>>
>> My understanding is that this means that while the first ack after new
>> data is correctly ignored, the following ack which preserves window size
>> should be recognized as a dupack and (3a) should be taken.
>
> Linux FRTO never gets that far (without my fix) if the ACK in step 2
> covers beyond the RTO rexmit because 3b is prematurely invoked, that's
> why you never see what would occur if 3a is taken. TCP thinks it's not
> recovering anymore and therefore can send only new data (if there's some
> available).
>
> This is what I tried to tell earlier, with new data there you see there's
> something else wrong too with FRTO besides the RTO loop.
agreed. Ilpo do you mind re-submitting your fix
https://patchwork.ozlabs.org/patch/883654/ (IIRC I already acked-by)

tcptest suite may have to wait due to some internal workload Neal is juggling.

>
>
> --
>  i.

^ permalink raw reply

* Re: [PATCH v1 net-next 02/14] net: Add a new socket option for a future transmit time.
From: Eric Dumazet @ 2018-06-28  0:05 UTC (permalink / raw)
  To: Jesus Sanchez-Palencia, Eric Dumazet, netdev
  Cc: tglx, jan.altenberg, vinicius.gomes, kurt.kanzenbach, henrik,
	richardcochran, ilias.apalodimas, ivan.khoronzhuk, mlichvar,
	willemb, jhs, xiyou.wangcong, jiri
In-Reply-To: <b29a6c7c-7428-b98c-3a18-0ac892bfcda8@intel.com>



On 06/27/2018 04:07 PM, Jesus Sanchez-Palencia wrote:
 
> I'm failing to see how... There is a memset() in sock.c:1147 clearing all the 8
> bytes that we later use to (explicitly) assign each member of the struct. Aren't
> the 2 extra bytes sanitized, then? What have I missed?

Nothing, it seems I missed the memset(), it was not seen in the context of your patch
and I have not checked the whole function.

^ permalink raw reply

* [BISECTED] [4.17.0-rc6] IPv6 link-local address not getting added
From: Sowmini Varadhan @ 2018-06-28  0:35 UTC (permalink / raw)
  To: dsahern; +Cc: netdev


Hi David,

An IPv6 regression has been introduced in 4.17.0-rc6 by
  8308f3f net/ipv6: Add support for specifying metric of connected routes

The regression is that some interfaces on my test machine come
up with link-local addrs but the fe80 prefix is missing.
After this bug, I cannot send any packets to anyone onlink 
(including my routers). 

Here are the symptoms:

When everything is fine, "ip -6 route|grep eno" shows

2606:b400:400:18c8::/64 dev eno1 proto ra metric 100  pref medium
fe80::5:73ff:fea0:52d dev eno1 proto static metric 100  pref medium
fe80::/64 dev eno1 proto kernel metric 256  pref medium
fe80::/64 dev eno3 proto kernel metric 256  pref medium
fe80::/64 dev eno4 proto kernel metric 256  pref medium
default via fe80::5:73ff:fea0:52d dev eno1 proto static metric 100  pref medium

But after 8308f3f, I only find

# ip -6 route|grep eno
2606:b400:400:18c8::/64 dev eno1 proto ra metric 100  pref medium
fe80::5:73ff:fea0:52d dev eno1 proto static metric 100  pref medium
fe80::/64 dev eno1 proto kernel metric 256  pref medium
default via fe80::5:73ff:fea0:52d dev eno1 proto static metric 100  pref medium

(note that eno2 is not enabled in my config, so its absence is expected)

Please have a look, thanks.
--Sowmini

^ permalink raw reply

* Re: [PATCH net-next] net: qmi_wwan: Add pass through mode
From: Subash Abhinov Kasiviswanathan @ 2018-06-28  0:38 UTC (permalink / raw)
  To: Bjørn Mork; +Cc: dnlplm, dcbw, davem, netdev, Aleksander Morgado
In-Reply-To: <87a7rg8xqv.fsf@miraculix.mork.no>

> The concepte looks fine to me, but I have a few comments to the
> implementation below.
> 
> First: I missed the last part of the discussion around automatic
> detection of passthrough mode.  Could you give us a short summary of 
> the
> alternatives you tried and why they were dropped?

Hi Bjørn

The other approach was to explicitly check for the rx_handler of the 
qmi_wwan
netdev and if it was attached by rmnet driver (rmnet_rx_handler).
If it was indeed attached, then the packet would be queued to stack. I 
had
dropped this option since it would mean that atleast the rmnet 
rx_handler would
have to be exported and exposed through a new header. This also brings a
tight coupling between qmi_wwan and rmnet driver.

> IIUC, userspace will be responsible for doing something like this to 
> set
> up a rmnet map interface:
> 
>  1) set the qmi_wwan netdev mode to raw-ip using sysfs
>  2) set the qmi_wwan netdev mode to pass-through using syfs
>  3) bind an rmnet netdev to the qmi_wwan netdev using netlink
>  4) configure the device for raw-ip using qmi
>  5) configure the device for map using qmi
> 
> I've just had a quick glance at this, but this function looks like an
> almost exact copy of the raw_ip_store().  Why not share that code
> instead of copying it?
> 
> And while you're at it:  There is nothing preventing us from turning on
> raw-ip here instead of failing if it is off, us there?  I.e. why not
> set QMI_WWAN_FLAG_RAWIP when QMI_WWAN_FLAG_PASS_THROUGH is being set,
> and clear QMI_WWAN_FLAG_PASS_THROUGH when QMI_WWAN_FLAG_RAWIP is being
> cleared?
> 
> You're missing the inverse relationship, aren't you?  There is nothing
> preventing the user from turning off raw-ip again after setting
> pass-through.
> 
> Do we really need all the notifier stuff here? You don't change the
> qmi_wwan netdev since that's already taken care of when setting
> QMI_WWAN_FLAG_RAWIP.
> 
> AFAICS, QMI_WWAN_FLAG_PASS_THROUGH can be changed without any locking,
> notifications or netdev state restrictions. All it does is change the
> behaviour on rx, and there is no reason that can't be applied from the
> next packet even on a running interface.
> 
> We could make pass_through_store just call raw_ip_store (or the part
> of it which matters, factored out into a separate function), if
> necessary. The rest of pass_through_store just sets or clears the flag.
> There is no need to do more than that, is there?
> 
> And as noted above, raw_ip_store must also clear
> QMI_WWAN_FLAG_PASS_THROUGH if clearing QMI_WWAN_FLAG_RAWIP.
> 
> There is no need testing for rawip here, since you enforce that in
> pass_through_store().

I can make these changes. With this, steps 1 and 2 are combined.

-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project

^ permalink raw reply

* Re: [net-next PATCH v4 1/7] net: Refactor XPS for CPUs and Rx queues
From: Nambiar, Amritha @ 2018-06-28  0:47 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Linux Kernel Network Developers, David S. Miller, Alexander Duyck,
	Willem de Bruijn, Sridhar Samudrala, Alexander Duyck,
	Eric Dumazet, Hannes Frederic Sowa
In-Reply-To: <CALx6S36b54XVuuEkd6JoDJYb+mzJ1K0X7G512EJZk_D3V2b0_w@mail.gmail.com>

On 6/26/2018 3:53 PM, Tom Herbert wrote:
> On Mon, Jun 25, 2018 at 11:04 AM, Amritha Nambiar
> <amritha.nambiar@intel.com> wrote:
>> Refactor XPS code to support Tx queue selection based on
>> CPU(s) map or Rx queue(s) map.
>>
>> Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
>> ---
>>  include/linux/cpumask.h   |   11 ++
>>  include/linux/netdevice.h |  100 +++++++++++++++++++++
>>  net/core/dev.c            |  211 ++++++++++++++++++++++++++++++---------------
>>  net/core/net-sysfs.c      |    4 -
>>  4 files changed, 246 insertions(+), 80 deletions(-)
>>
>> diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
>> index bf53d89..57f20a0 100644
>> --- a/include/linux/cpumask.h
>> +++ b/include/linux/cpumask.h
>> @@ -115,12 +115,17 @@ extern struct cpumask __cpu_active_mask;
>>  #define cpu_active(cpu)                ((cpu) == 0)
>>  #endif
>>
>> -/* verify cpu argument to cpumask_* operators */
>> -static inline unsigned int cpumask_check(unsigned int cpu)
>> +static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits)
>>  {
>>  #ifdef CONFIG_DEBUG_PER_CPU_MAPS
>> -       WARN_ON_ONCE(cpu >= nr_cpumask_bits);
>> +       WARN_ON_ONCE(cpu >= bits);
>>  #endif /* CONFIG_DEBUG_PER_CPU_MAPS */
>> +}
>> +
>> +/* verify cpu argument to cpumask_* operators */
>> +static inline unsigned int cpumask_check(unsigned int cpu)
>> +{
>> +       cpu_max_bits_warn(cpu, nr_cpumask_bits);
>>         return cpu;
>>  }
>>
>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> index 3ec9850..c534f03 100644
>> --- a/include/linux/netdevice.h
>> +++ b/include/linux/netdevice.h
>> @@ -730,10 +730,15 @@ struct xps_map {
>>   */
>>  struct xps_dev_maps {
>>         struct rcu_head rcu;
>> -       struct xps_map __rcu *cpu_map[0];
>> +       struct xps_map __rcu *attr_map[0]; /* Either CPUs map or RXQs map */
>>  };
>> -#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) +         \
>> +
>> +#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) +     \
>>         (nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))
>> +
>> +#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\
>> +       (_rxqs * (_tcs) * sizeof(struct xps_map *)))
>> +
>>  #endif /* CONFIG_XPS */
>>
>>  #define TC_MAX_QUEUE   16
>> @@ -1909,7 +1914,8 @@ struct net_device {
>>         int                     watchdog_timeo;
>>
>>  #ifdef CONFIG_XPS
>> -       struct xps_dev_maps __rcu *xps_maps;
>> +       struct xps_dev_maps __rcu *xps_cpus_map;
>> +       struct xps_dev_maps __rcu *xps_rxqs_map;
>>  #endif
>>  #ifdef CONFIG_NET_CLS_ACT
>>         struct mini_Qdisc __rcu *miniq_egress;
>> @@ -3258,6 +3264,94 @@ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
>>  #ifdef CONFIG_XPS
>>  int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>                         u16 index);
>> +int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
>> +                         u16 index, bool is_rxqs_map);
>> +
>> +/**
>> + *     attr_test_mask - Test a CPU or Rx queue set in a cpumask/rx queues mask
>> + *     @j: CPU/Rx queue index
>> + *     @mask: bitmask of all cpus/rx queues
>> + *     @nr_bits: number of bits in the bitmask
>> + *
>> + * Test if a CPU or Rx queue index is set in a mask of all CPU/Rx queues.
>> + */
>> +static inline bool attr_test_mask(unsigned long j, const unsigned long *mask,
>> +                                 unsigned int nr_bits)
>> +{
>> +       cpu_max_bits_warn(j, nr_bits);
>> +       return test_bit(j, mask);
>> +}
>> +
>> +/**
>> + *     attr_test_online - Test for online CPU/Rx queue
>> + *     @j: CPU/Rx queue index
>> + *     @online_mask: bitmask for CPUs/Rx queues that are online
>> + *     @nr_bits: number of bits in the bitmask
>> + *
>> + * Returns true if a CPU/Rx queue is online.
>> + */
>> +static inline bool attr_test_online(unsigned long j,
>> +                                   const unsigned long *online_mask,
>> +                                   unsigned int nr_bits)
>> +{
>> +       cpu_max_bits_warn(j, nr_bits);
>> +
>> +       if (online_mask)
>> +               return test_bit(j, online_mask);
>> +
>> +       if (j >= 0 && j < nr_bits)
> 
> j is unsigned so j >= 0 is superfluous.
> 
>> +               return true;
>> +
>> +       return false;
> 
> Could just do:
> 
> return (j < nr_bits);

Will fix.

> 
>> +}
>> +
>> +/**
>> + *     attrmask_next - get the next CPU/Rx queue in a cpumask/Rx queues mask
>> + *     @n: CPU/Rx queue index
>> + *     @srcp: the cpumask/Rx queue mask pointer
>> + *     @nr_bits: number of bits in the bitmask
>> + *
>> + * Returns >= nr_bits if no further CPUs/Rx queues set.
>> + */
>> +static inline unsigned int attrmask_next(int n, const unsigned long *srcp,
>> +                                        unsigned int nr_bits)
>> +{
>> +       /* -1 is a legal arg here. */
>> +       if (n != -1)
>> +               cpu_max_bits_warn(n, nr_bits);
>> +
>> +       if (srcp)
>> +               return find_next_bit(srcp, nr_bits, n + 1);
>> +
>> +       return n + 1;
>> +}
>> +
>> +/**
>> + *     attrmask_next_and - get the next CPU/Rx queue in *src1p & *src2p
>> + *     @n: CPU/Rx queue index
>> + *     @src1p: the first CPUs/Rx queues mask pointer
>> + *     @src2p: the second CPUs/Rx queues mask pointer
>> + *     @nr_bits: number of bits in the bitmask
>> + *
>> + * Returns >= nr_bits if no further CPUs/Rx queues set in both.
>> + */
>> +static inline int attrmask_next_and(int n, const unsigned long *src1p,
>> +                                   const unsigned long *src2p,
>> +                                   unsigned int nr_bits)
>> +{
>> +       /* -1 is a legal arg here. */
>> +       if (n != -1)
>> +               cpu_max_bits_warn(n, nr_bits);
>> +
>> +       if (src1p && src2p)
>> +               return find_next_and_bit(src1p, src2p, nr_bits, n + 1);
>> +       else if (src1p)
>> +               return find_next_bit(src1p, nr_bits, n + 1);
>> +       else if (src2p)
>> +               return find_next_bit(src2p, nr_bits, n + 1);
>> +
>> +       return n + 1;
>> +}
>>  #else
>>  static inline int netif_set_xps_queue(struct net_device *dev,
>>                                       const struct cpumask *mask,
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index a5aa1c7..2552556 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -2092,7 +2092,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
>>         int pos;
>>
>>         if (dev_maps)
>> -               map = xmap_dereference(dev_maps->cpu_map[tci]);
>> +               map = xmap_dereference(dev_maps->attr_map[tci]);
>>         if (!map)
>>                 return false;
>>
>> @@ -2105,7 +2105,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
>>                         break;
>>                 }
>>
>> -               RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
>> +               RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
>>                 kfree_rcu(map, rcu);
>>                 return false;
>>         }
>> @@ -2135,31 +2135,58 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
>>         return active;
>>  }
>>
>> +static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
>> +                          struct xps_dev_maps *dev_maps, unsigned int nr_ids,
>> +                          u16 offset, u16 count, bool is_rxqs_map)
>> +{
>> +       bool active = false;
>> +       int i, j;
>> +
>> +       for (j = -1; j = attrmask_next(j, mask, nr_ids),
>> +            j < nr_ids;)
>> +               active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
>> +                                              count);
>> +       if (!active) {
>> +               if (is_rxqs_map) {
>> +                       RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
>> +               } else {
>> +                       RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
>> +
>> +                       for (i = offset + (count - 1); count--; i--)
>> +                               netdev_queue_numa_node_write(
>> +                                       netdev_get_tx_queue(dev, i),
>> +                                                       NUMA_NO_NODE);
>> +               }
>> +               kfree_rcu(dev_maps, rcu);
>> +       }
>> +}
>> +
>>  static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
>>                                    u16 count)
>>  {
>> +       const unsigned long *possible_mask = NULL;
>>         struct xps_dev_maps *dev_maps;
>> -       int cpu, i;
>> -       bool active = false;
>> +       unsigned int nr_ids;
>>
>>         mutex_lock(&xps_map_mutex);
>> -       dev_maps = xmap_dereference(dev->xps_maps);
>>
>> -       if (!dev_maps)
>> -               goto out_no_maps;
>> -
>> -       for_each_possible_cpu(cpu)
>> -               active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
>> -                                              offset, count);
>> +       dev_maps = xmap_dereference(dev->xps_rxqs_map);
>> +       if (dev_maps) {
>> +               nr_ids = dev->num_rx_queues;
>> +               clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset,
>> +                              count, true);
>>
>> -       if (!active) {
>> -               RCU_INIT_POINTER(dev->xps_maps, NULL);
>> -               kfree_rcu(dev_maps, rcu);
>>         }
>>
>> -       for (i = offset + (count - 1); count--; i--)
>> -               netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
>> -                                            NUMA_NO_NODE);
>> +       dev_maps = xmap_dereference(dev->xps_cpus_map);
>> +       if (!dev_maps)
>> +               goto out_no_maps;
>> +
>> +       if (num_possible_cpus() > 1)
>> +               possible_mask = cpumask_bits(cpu_possible_mask);
>> +       nr_ids = nr_cpu_ids;
>> +       clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
>> +                      false);
>>
>>  out_no_maps:
>>         mutex_unlock(&xps_map_mutex);
>> @@ -2170,8 +2197,8 @@ static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
>>         netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
>>  }
>>
>> -static struct xps_map *expand_xps_map(struct xps_map *map,
>> -                                     int cpu, u16 index)
>> +static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
>> +                                     u16 index, bool is_rxqs_map)
>>  {
>>         struct xps_map *new_map;
>>         int alloc_len = XPS_MIN_MAP_ALLOC;
>> @@ -2183,7 +2210,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
>>                 return map;
>>         }
>>
>> -       /* Need to add queue to this CPU's existing map */
>> +       /* Need to add tx-queue to this CPU's/rx-queue's existing map */
>>         if (map) {
>>                 if (pos < map->alloc_len)
>>                         return map;
>> @@ -2191,9 +2218,14 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
>>                 alloc_len = map->alloc_len * 2;
>>         }
>>
>> -       /* Need to allocate new map to store queue on this CPU's map */
>> -       new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
>> -                              cpu_to_node(cpu));
>> +       /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
>> +        *  map
>> +        */
>> +       if (is_rxqs_map)
>> +               new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
>> +       else
>> +               new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
>> +                                      cpu_to_node(attr_index));
>>         if (!new_map)
>>                 return NULL;
>>
>> @@ -2205,14 +2237,16 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
>>         return new_map;
>>  }
>>
>> -int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>> -                       u16 index)
>> +int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
>> +                         u16 index, bool is_rxqs_map)
>>  {
>> +       const unsigned long *online_mask = NULL, *possible_mask = NULL;
>>         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
>> -       int i, cpu, tci, numa_node_id = -2;
>> +       int i, j, tci, numa_node_id = -2;
>>         int maps_sz, num_tc = 1, tc = 0;
>>         struct xps_map *map, *new_map;
>>         bool active = false;
>> +       unsigned int nr_ids;
>>
>>         if (dev->num_tc) {
>>                 num_tc = dev->num_tc;
>> @@ -2221,16 +2255,27 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>                         return -EINVAL;
>>         }
>>
>> -       maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
>> -       if (maps_sz < L1_CACHE_BYTES)
>> -               maps_sz = L1_CACHE_BYTES;
>> -
>>         mutex_lock(&xps_map_mutex);
>> +       if (is_rxqs_map) {
>> +               maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
>> +               dev_maps = xmap_dereference(dev->xps_rxqs_map);
>> +               nr_ids = dev->num_rx_queues;
>> +       } else {
>> +               maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
>> +               if (num_possible_cpus() > 1) {
>> +                       online_mask = cpumask_bits(cpu_online_mask);
>> +                       possible_mask = cpumask_bits(cpu_possible_mask);
>> +               }
>> +               dev_maps = xmap_dereference(dev->xps_cpus_map);
>> +               nr_ids = nr_cpu_ids;
>> +       }
>>
>> -       dev_maps = xmap_dereference(dev->xps_maps);
>> +       if (maps_sz < L1_CACHE_BYTES)
>> +               maps_sz = L1_CACHE_BYTES;
>>
>>         /* allocate memory for queue storage */
>> -       for_each_cpu_and(cpu, cpu_online_mask, mask) {
>> +       for (j = -1; j = attrmask_next_and(j, online_mask, mask, nr_ids),
>> +            j < nr_ids;) {
>>                 if (!new_dev_maps)
>>                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
>>                 if (!new_dev_maps) {
>> @@ -2238,73 +2283,81 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>                         return -ENOMEM;
>>                 }
>>
>> -               tci = cpu * num_tc + tc;
>> -               map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
>> +               tci = j * num_tc + tc;
>> +               map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
>>                                  NULL;
>>
>> -               map = expand_xps_map(map, cpu, index);
>> +               map = expand_xps_map(map, j, index, is_rxqs_map);
>>                 if (!map)
>>                         goto error;
>>
>> -               RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
>> +               RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
>>         }
>>
>>         if (!new_dev_maps)
>>                 goto out_no_new_maps;
>>
>> -       for_each_possible_cpu(cpu) {
>> +       for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
>> +            j < nr_ids;) {
>>                 /* copy maps belonging to foreign traffic classes */
>> -               for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
>> +               for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
>>                         /* fill in the new device map from the old device map */
>> -                       map = xmap_dereference(dev_maps->cpu_map[tci]);
>> -                       RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
>> +                       map = xmap_dereference(dev_maps->attr_map[tci]);
>> +                       RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
>>                 }
>>
>>                 /* We need to explicitly update tci as prevous loop
>>                  * could break out early if dev_maps is NULL.
>>                  */
>> -               tci = cpu * num_tc + tc;
>> +               tci = j * num_tc + tc;
>>
>> -               if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
>> -                       /* add queue to CPU maps */
>> +               if (attr_test_mask(j, mask, nr_ids) &&
>> +                   attr_test_online(j, online_mask, nr_ids)) {
>> +                       /* add tx-queue to CPU/rx-queue maps */
>>                         int pos = 0;
>>
>> -                       map = xmap_dereference(new_dev_maps->cpu_map[tci]);
>> +                       map = xmap_dereference(new_dev_maps->attr_map[tci]);
>>                         while ((pos < map->len) && (map->queues[pos] != index))
>>                                 pos++;
>>
>>                         if (pos == map->len)
>>                                 map->queues[map->len++] = index;
>>  #ifdef CONFIG_NUMA
>> -                       if (numa_node_id == -2)
>> -                               numa_node_id = cpu_to_node(cpu);
>> -                       else if (numa_node_id != cpu_to_node(cpu))
>> -                               numa_node_id = -1;
> 
> Seems like there should be a comment here about meaning of -2 and -1
> in NUMA node. Better yet, seems like there should be constants defined
> for these special values. Maybe something to clean up in the future.

Will have a separate patch (not part of this series) for this.

> 
>> +                       if (!is_rxqs_map) {
>> +                               if (numa_node_id == -2)
>> +                                       numa_node_id = cpu_to_node(j);
>> +                               else if (numa_node_id != cpu_to_node(j))
>> +                                       numa_node_id = -1;
>> +                       }
>>  #endif
>>                 } else if (dev_maps) {
>>                         /* fill in the new device map from the old device map */
>> -                       map = xmap_dereference(dev_maps->cpu_map[tci]);
>> -                       RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
>> +                       map = xmap_dereference(dev_maps->attr_map[tci]);
>> +                       RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
>>                 }
>>
>>                 /* copy maps belonging to foreign traffic classes */
>>                 for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
>>                         /* fill in the new device map from the old device map */
>> -                       map = xmap_dereference(dev_maps->cpu_map[tci]);
>> -                       RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
>> +                       map = xmap_dereference(dev_maps->attr_map[tci]);
>> +                       RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
>>                 }
>>         }
>>
>> -       rcu_assign_pointer(dev->xps_maps, new_dev_maps);
>> +       if (is_rxqs_map)
>> +               rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
>> +       else
>> +               rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
>>
>>         /* Cleanup old maps */
>>         if (!dev_maps)
>>                 goto out_no_old_maps;
>>
>> -       for_each_possible_cpu(cpu) {
>> -               for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
>> -                       new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
>> -                       map = xmap_dereference(dev_maps->cpu_map[tci]);
>> +       for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
>> +            j < nr_ids;) {
>> +               for (i = num_tc, tci = j * num_tc; i--; tci++) {
>> +                       new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
>> +                       map = xmap_dereference(dev_maps->attr_map[tci]);
>>                         if (map && map != new_map)
>>                                 kfree_rcu(map, rcu);
>>                 }
>> @@ -2317,19 +2370,23 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>         active = true;
>>
>>  out_no_new_maps:
>> -       /* update Tx queue numa node */
>> -       netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
>> -                                    (numa_node_id >= 0) ? numa_node_id :
>> -                                    NUMA_NO_NODE);
>> +       if (!is_rxqs_map) {
>> +               /* update Tx queue numa node */
>> +               netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
>> +                                            (numa_node_id >= 0) ?
>> +                                            numa_node_id : NUMA_NO_NODE);
>> +       }
>>
>>         if (!dev_maps)
>>                 goto out_no_maps;
>>
>> -       /* removes queue from unused CPUs */
>> -       for_each_possible_cpu(cpu) {
>> -               for (i = tc, tci = cpu * num_tc; i--; tci++)
>> +       /* removes tx-queue from unused CPUs/rx-queues */
>> +       for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
>> +            j < nr_ids;) {
>> +               for (i = tc, tci = j * num_tc; i--; tci++)
>>                         active |= remove_xps_queue(dev_maps, tci, index);
>> -               if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
>> +               if (!attr_test_mask(j, mask, nr_ids) ||
>> +                   !attr_test_online(j, online_mask, nr_ids))
>>                         active |= remove_xps_queue(dev_maps, tci, index);
>>                 for (i = num_tc - tc, tci++; --i; tci++)
>>                         active |= remove_xps_queue(dev_maps, tci, index);
>> @@ -2337,7 +2394,10 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>
>>         /* free map if not active */
>>         if (!active) {
>> -               RCU_INIT_POINTER(dev->xps_maps, NULL);
>> +               if (is_rxqs_map)
>> +                       RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
>> +               else
>> +                       RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
>>                 kfree_rcu(dev_maps, rcu);
>>         }
>>
>> @@ -2347,11 +2407,12 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>         return 0;
>>  error:
>>         /* remove any maps that we added */
>> -       for_each_possible_cpu(cpu) {
>> -               for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
>> -                       new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
>> +       for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
>> +            j < nr_ids;) {
>> +               for (i = num_tc, tci = j * num_tc; i--; tci++) {
>> +                       new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
>>                         map = dev_maps ?
>> -                             xmap_dereference(dev_maps->cpu_map[tci]) :
>> +                             xmap_dereference(dev_maps->attr_map[tci]) :
>>                               NULL;
>>                         if (new_map && new_map != map)
>>                                 kfree(new_map);
>> @@ -2363,6 +2424,12 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>         kfree(new_dev_maps);
>>         return -ENOMEM;
>>  }
>> +
>> +int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>> +                       u16 index)
>> +{
>> +       return __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
>> +}
>>  EXPORT_SYMBOL(netif_set_xps_queue);
>>
>>  #endif
>> @@ -3384,7 +3451,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>>         int queue_index = -1;
>>
>>         rcu_read_lock();
>> -       dev_maps = rcu_dereference(dev->xps_maps);
>> +       dev_maps = rcu_dereference(dev->xps_cpus_map);
>>         if (dev_maps) {
>>                 unsigned int tci = skb->sender_cpu - 1;
>>
>> @@ -3393,7 +3460,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>>                         tci += netdev_get_prio_tc_map(dev, skb->priority);
>>                 }
>>
>> -               map = rcu_dereference(dev_maps->cpu_map[tci]);
>> +               map = rcu_dereference(dev_maps->attr_map[tci]);
>>                 if (map) {
>>                         if (map->len == 1)
>>                                 queue_index = map->queues[0];
>> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
>> index bb7e80f..b39987c 100644
>> --- a/net/core/net-sysfs.c
>> +++ b/net/core/net-sysfs.c
>> @@ -1227,13 +1227,13 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
>>                 return -ENOMEM;
>>
>>         rcu_read_lock();
>> -       dev_maps = rcu_dereference(dev->xps_maps);
>> +       dev_maps = rcu_dereference(dev->xps_cpus_map);
>>         if (dev_maps) {
>>                 for_each_possible_cpu(cpu) {
>>                         int i, tci = cpu * num_tc + tc;
>>                         struct xps_map *map;
>>
>> -                       map = rcu_dereference(dev_maps->cpu_map[tci]);
>> +                       map = rcu_dereference(dev_maps->attr_map[tci]);
>>                         if (!map)
>>                                 continue;
>>
>>
> 
> Acked-by: Tom Herbert <tom@quantonium.net>
> 

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox