Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 0/2] hv_netvsc: minor enhancements
From: Stephen Hemminger @ 2018-03-16 22:44 UTC (permalink / raw)
  To: kys, haiyangz, sthemmin; +Cc: devel, netdev

A couple of small things for net-next

Stephen Hemminger (2):
  hv_netvsc: pass netvsc_device to rndis halt
  hv_netvsc: add trace points

 drivers/net/hyperv/Makefile       |   2 +-
 drivers/net/hyperv/netvsc.c       |  26 +++++-
 drivers/net/hyperv/netvsc_trace.c |   7 ++
 drivers/net/hyperv/netvsc_trace.h | 182 ++++++++++++++++++++++++++++++++++++++
 drivers/net/hyperv/rndis_filter.c |  12 ++-
 5 files changed, 223 insertions(+), 6 deletions(-)
 create mode 100644 drivers/net/hyperv/netvsc_trace.c
 create mode 100644 drivers/net/hyperv/netvsc_trace.h

-- 
2.16.2

^ permalink raw reply

* [PATCH 2/5] ixgbevf: Add support for XDP_TX action
From: Tony Nguyen @ 2018-03-16 22:34 UTC (permalink / raw)
  To: intel-wired-lan; +Cc: Tony Nguyen, john.fastabend, netdev
In-Reply-To: <20180316223406.7295-1-anthony.l.nguyen@intel.com>

This implements the XDP_TX action, modeled on the ixgbe implementation.
However, instead of using the CPU id to determine which XDP queue to
use, this uses the index of the RX queue the packet was received on,
which is similar to i40e. Doing this eliminates ixgbe's restriction
that the number of CPUs not exceed the number of XDP queues.

Also, based on the number of queues available, the number of TX queues
may be reduced when an XDP program is loaded in order to accommodate the
XDP queues.

Based largely on commit 33fdc82f0883 ("ixgbe: add support for XDP_TX
action")

Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
---
 drivers/net/ethernet/intel/ixgbevf/ethtool.c      |  35 ++-
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h      |  20 +-
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 271 +++++++++++++++++++---
 3 files changed, 294 insertions(+), 32 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ethtool.c b/drivers/net/ethernet/intel/ixgbevf/ethtool.c
index 4946a62c70a4..da8c0e299a37 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ethtool.c
@@ -269,7 +269,7 @@ static int ixgbevf_set_ringparam(struct net_device *netdev,
 	struct ixgbevf_adapter *adapter = netdev_priv(netdev);
 	struct ixgbevf_ring *tx_ring = NULL, *rx_ring = NULL;
 	u32 new_rx_count, new_tx_count;
-	int i, err = 0;
+	int i, j, err = 0;
 
 	if ((ring->rx_mini_pending) || (ring->rx_jumbo_pending))
 		return -EINVAL;
@@ -293,15 +293,19 @@ static int ixgbevf_set_ringparam(struct net_device *netdev,
 	if (!netif_running(adapter->netdev)) {
 		for (i = 0; i < adapter->num_tx_queues; i++)
 			adapter->tx_ring[i]->count = new_tx_count;
+		for (i = 0; i < adapter->num_xdp_queues; i++)
+			adapter->xdp_ring[i]->count = new_tx_count;
 		for (i = 0; i < adapter->num_rx_queues; i++)
 			adapter->rx_ring[i]->count = new_rx_count;
 		adapter->tx_ring_count = new_tx_count;
+		adapter->xdp_ring_count = new_tx_count;
 		adapter->rx_ring_count = new_rx_count;
 		goto clear_reset;
 	}
 
 	if (new_tx_count != adapter->tx_ring_count) {
-		tx_ring = vmalloc(adapter->num_tx_queues * sizeof(*tx_ring));
+		tx_ring = vmalloc((adapter->num_tx_queues +
+				   adapter->num_xdp_queues) * sizeof(*tx_ring));
 		if (!tx_ring) {
 			err = -ENOMEM;
 			goto clear_reset;
@@ -324,6 +328,24 @@ static int ixgbevf_set_ringparam(struct net_device *netdev,
 				goto clear_reset;
 			}
 		}
+
+		for (j = 0; j < adapter->num_xdp_queues; i++, j++) {
+			/* clone ring and setup updated count */
+			tx_ring[i] = *adapter->xdp_ring[j];
+			tx_ring[i].count = new_tx_count;
+			err = ixgbevf_setup_tx_resources(&tx_ring[i]);
+			if (err) {
+				while (i) {
+					i--;
+					ixgbevf_free_tx_resources(&tx_ring[i]);
+				}
+
+				vfree(tx_ring);
+				tx_ring = NULL;
+
+				goto clear_reset;
+			}
+		}
 	}
 
 	if (new_rx_count != adapter->rx_ring_count) {
@@ -368,6 +390,12 @@ static int ixgbevf_set_ringparam(struct net_device *netdev,
 		}
 		adapter->tx_ring_count = new_tx_count;
 
+		for (j = 0; j < adapter->num_xdp_queues; i++, j++) {
+			ixgbevf_free_tx_resources(adapter->xdp_ring[j]);
+			*adapter->xdp_ring[j] = tx_ring[i];
+		}
+		adapter->xdp_ring_count = new_tx_count;
+
 		vfree(tx_ring);
 		tx_ring = NULL;
 	}
@@ -390,7 +418,8 @@ static int ixgbevf_set_ringparam(struct net_device *netdev,
 clear_reset:
 	/* free Tx resources if Rx error is encountered */
 	if (tx_ring) {
-		for (i = 0; i < adapter->num_tx_queues; i++)
+		for (i = 0;
+		     i < adapter->num_tx_queues + adapter->num_xdp_queues; i++)
 			ixgbevf_free_tx_resources(&tx_ring[i]);
 		vfree(tx_ring);
 	}
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
index 6f6b4a157dff..f7126462f0eb 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
@@ -52,7 +52,11 @@
 struct ixgbevf_tx_buffer {
 	union ixgbe_adv_tx_desc *next_to_watch;
 	unsigned long time_stamp;
-	struct sk_buff *skb;
+	union {
+		struct sk_buff *skb;
+		/* XDP uses address ptr on irq_clean */
+		void *data;
+	};
 	unsigned int bytecount;
 	unsigned short gso_segs;
 	__be16 protocol;
@@ -95,8 +99,16 @@ enum ixgbevf_ring_state_t {
 	__IXGBEVF_RX_BUILD_SKB_ENABLED,
 	__IXGBEVF_TX_DETECT_HANG,
 	__IXGBEVF_HANG_CHECK_ARMED,
+	__IXGBEVF_TX_XDP_RING,
 };
 
+#define ring_is_xdp(ring) \
+		test_bit(__IXGBEVF_TX_XDP_RING, &(ring)->state)
+#define set_ring_xdp(ring) \
+		set_bit(__IXGBEVF_TX_XDP_RING, &(ring)->state)
+#define clear_ring_xdp(ring) \
+		clear_bit(__IXGBEVF_TX_XDP_RING, &(ring)->state)
+
 struct ixgbevf_ring {
 	struct ixgbevf_ring *next;
 	struct ixgbevf_q_vector *q_vector;	/* backpointer to q_vector */
@@ -139,6 +151,7 @@ struct ixgbevf_ring {
 
 #define MAX_RX_QUEUES IXGBE_VF_MAX_RX_QUEUES
 #define MAX_TX_QUEUES IXGBE_VF_MAX_TX_QUEUES
+#define MAX_XDP_QUEUES IXGBE_VF_MAX_TX_QUEUES
 #define IXGBEVF_MAX_RSS_QUEUES		2
 #define IXGBEVF_82599_RETA_SIZE		128	/* 128 entries */
 #define IXGBEVF_X550_VFRETA_SIZE	64	/* 64 entries */
@@ -339,6 +352,10 @@ struct ixgbevf_adapter {
 	u32 eims_enable_mask;
 	u32 eims_other;
 
+	/* XDP */
+	int num_xdp_queues;
+	struct ixgbevf_ring *xdp_ring[MAX_XDP_QUEUES];
+
 	/* TX */
 	int num_tx_queues;
 	struct ixgbevf_ring *tx_ring[MAX_TX_QUEUES]; /* One per active queue */
@@ -373,6 +390,7 @@ struct ixgbevf_adapter {
 	unsigned long state;
 	u64 tx_busy;
 	unsigned int tx_ring_count;
+	unsigned int xdp_ring_count;
 	unsigned int rx_ring_count;
 
 	u8 __iomem *io_addr; /* Mainly for iounmap use */
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 2696b5a6806f..309f549808e4 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -324,7 +324,10 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector *q_vector,
 		total_packets += tx_buffer->gso_segs;
 
 		/* free the skb */
-		napi_consume_skb(tx_buffer->skb, napi_budget);
+		if (ring_is_xdp(tx_ring))
+			page_frag_free(tx_buffer->data);
+		else
+			napi_consume_skb(tx_buffer->skb, napi_budget);
 
 		/* unmap skb header data */
 		dma_unmap_single(tx_ring->dev,
@@ -388,7 +391,7 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector *q_vector,
 
 		eop_desc = tx_ring->tx_buffer_info[i].next_to_watch;
 
-		pr_err("Detected Tx Unit Hang\n"
+		pr_err("Detected Tx Unit Hang%s\n"
 		       "  Tx Queue             <%d>\n"
 		       "  TDH, TDT             <%x>, <%x>\n"
 		       "  next_to_use          <%x>\n"
@@ -398,6 +401,7 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector *q_vector,
 		       "  eop_desc->wb.status  <%x>\n"
 		       "  time_stamp           <%lx>\n"
 		       "  jiffies              <%lx>\n",
+		       ring_is_xdp(tx_ring) ? " XDP" : "",
 		       tx_ring->queue_index,
 		       IXGBE_READ_REG(hw, IXGBE_VFTDH(tx_ring->reg_idx)),
 		       IXGBE_READ_REG(hw, IXGBE_VFTDT(tx_ring->reg_idx)),
@@ -405,7 +409,9 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector *q_vector,
 		       eop_desc, (eop_desc ? eop_desc->wb.status : 0),
 		       tx_ring->tx_buffer_info[i].time_stamp, jiffies);
 
-		netif_stop_subqueue(tx_ring->netdev, tx_ring->queue_index);
+		if (!ring_is_xdp(tx_ring))
+			netif_stop_subqueue(tx_ring->netdev,
+					    tx_ring->queue_index);
 
 		/* schedule immediate reset if we believe we hung */
 		ixgbevf_tx_timeout_reset(adapter);
@@ -413,6 +419,9 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector *q_vector,
 		return true;
 	}
 
+	if (ring_is_xdp(tx_ring))
+		return !!budget;
+
 #define TX_WAKE_THRESHOLD (DESC_NEEDED * 2)
 	if (unlikely(total_packets && netif_carrier_ok(tx_ring->netdev) &&
 		     (ixgbevf_desc_unused(tx_ring) >= TX_WAKE_THRESHOLD))) {
@@ -963,11 +972,78 @@ static struct sk_buff *ixgbevf_build_skb(struct ixgbevf_ring *rx_ring,
 
 #define IXGBEVF_XDP_PASS 0
 #define IXGBEVF_XDP_CONSUMED 1
+#define IXGBEVF_XDP_TX 2
+
+static int ixgbevf_xmit_xdp_ring(struct ixgbevf_ring *ring,
+				 struct xdp_buff *xdp)
+{
+	struct ixgbevf_tx_buffer *tx_buffer;
+	union ixgbe_adv_tx_desc *tx_desc;
+	u32 len, cmd_type;
+	dma_addr_t dma;
+	u16 i;
+
+	len = xdp->data_end - xdp->data;
+
+	if (unlikely(!ixgbevf_desc_unused(ring)))
+		return IXGBEVF_XDP_CONSUMED;
+
+	dma = dma_map_single(ring->dev, xdp->data, len, DMA_TO_DEVICE);
+	if (dma_mapping_error(ring->dev, dma))
+		return IXGBEVF_XDP_CONSUMED;
+
+	/* record the location of the first descriptor for this packet */
+	tx_buffer = &ring->tx_buffer_info[ring->next_to_use];
+	tx_buffer->bytecount = len;
+	tx_buffer->gso_segs = 1;
+	tx_buffer->protocol = 0;
+
+	i = ring->next_to_use;
+	tx_desc = IXGBEVF_TX_DESC(ring, i);
+
+	dma_unmap_len_set(tx_buffer, len, len);
+	dma_unmap_addr_set(tx_buffer, dma, dma);
+	tx_buffer->data = xdp->data;
+	tx_desc->read.buffer_addr = cpu_to_le64(dma);
+
+	/* put descriptor type bits */
+	cmd_type = IXGBE_ADVTXD_DTYP_DATA |
+		   IXGBE_ADVTXD_DCMD_DEXT |
+		   IXGBE_ADVTXD_DCMD_IFCS;
+	cmd_type |= len | IXGBE_TXD_CMD;
+	tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type);
+	tx_desc->read.olinfo_status =
+			cpu_to_le32((len << IXGBE_ADVTXD_PAYLEN_SHIFT) |
+				    IXGBE_ADVTXD_CC);
+
+	/* Force memory writes to complete before letting h/w know there
+	 * are new descriptors to fetch.  (Only applicable for weak-ordered
+	 * memory model archs, such as IA-64).
+	 *
+	 * We also need this memory barrier to make certain all of the
+	 * status bits have been updated before next_to_watch is written.
+	 */
+	wmb();
 
-static struct sk_buff *ixgbevf_run_xdp(struct ixgbevf_ring  *rx_ring,
+	/* set next_to_watch value indicating a packet is present */
+	i++;
+	if (i == ring->count)
+		i = 0;
+
+	tx_buffer->next_to_watch = tx_desc;
+	ring->next_to_use = i;
+
+	/* notify HW of packet */
+	ixgbevf_write_tail(ring, i);
+	return IXGBEVF_XDP_TX;
+}
+
+static struct sk_buff *ixgbevf_run_xdp(struct ixgbevf_adapter *adapter,
+				       struct ixgbevf_ring  *rx_ring,
 				       struct xdp_buff *xdp)
 {
 	int result = IXGBEVF_XDP_PASS;
+	struct ixgbevf_ring *xdp_ring;
 	struct bpf_prog *xdp_prog;
 	u32 act;
 
@@ -981,10 +1057,13 @@ static struct sk_buff *ixgbevf_run_xdp(struct ixgbevf_ring  *rx_ring,
 	switch (act) {
 	case XDP_PASS:
 		break;
+	case XDP_TX:
+		xdp_ring = adapter->xdp_ring[rx_ring->queue_index];
+		result = ixgbevf_xmit_xdp_ring(xdp_ring, xdp);
+		break;
 	default:
 		bpf_warn_invalid_xdp_action(act);
 		/* fallthrough */
-	case XDP_TX:
 	case XDP_ABORTED:
 		trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
 		/* fallthrough -- handle aborts by dropping packet */
@@ -997,11 +1076,29 @@ static struct sk_buff *ixgbevf_run_xdp(struct ixgbevf_ring  *rx_ring,
 	return ERR_PTR(-result);
 }
 
+static void ixgbevf_rx_buffer_flip(struct ixgbevf_ring *rx_ring,
+				   struct ixgbevf_rx_buffer *rx_buffer,
+				   unsigned int size)
+{
+#if (PAGE_SIZE < 8192)
+	unsigned int truesize = ixgbevf_rx_pg_size(rx_ring) / 2;
+
+	rx_buffer->page_offset ^= truesize;
+#else
+	unsigned int truesize = ring_uses_build_skb(rx_ring) ?
+				SKB_DATA_ALIGN(IXGBEVF_SKB_PAD + size) :
+				SKB_DATA_ALIGN(size);
+
+	rx_buffer->page_offset += truesize;
+#endif
+}
+
 static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
 				struct ixgbevf_ring *rx_ring,
 				int budget)
 {
 	unsigned int total_rx_bytes = 0, total_rx_packets = 0;
+	struct ixgbevf_adapter *adapter = q_vector->adapter;
 	u16 cleaned_count = ixgbevf_desc_unused(rx_ring);
 	struct sk_buff *skb = rx_ring->skb;
 	struct xdp_buff xdp;
@@ -1041,13 +1138,17 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
 					      ixgbevf_rx_offset(rx_ring);
 			xdp.data_end = xdp.data + size;
 
-			skb = ixgbevf_run_xdp(rx_ring, &xdp);
+			skb = ixgbevf_run_xdp(adapter, rx_ring, &xdp);
 		}
 
 		if (IS_ERR(skb)) {
+			if (PTR_ERR(skb) == -IXGBEVF_XDP_TX)
+				ixgbevf_rx_buffer_flip(rx_ring, rx_buffer,
+						       size);
+			else
+				rx_buffer->pagecnt_bias++;
 			total_rx_packets++;
 			total_rx_bytes += size;
-			rx_buffer->pagecnt_bias++;
 		} else if (skb) {
 			ixgbevf_add_rx_frag(rx_ring, rx_buffer, skb, size);
 		} else if (ring_uses_build_skb(rx_ring)) {
@@ -1608,6 +1709,8 @@ static void ixgbevf_configure_tx(struct ixgbevf_adapter *adapter)
 	/* Setup the HW Tx Head and Tail descriptor pointers */
 	for (i = 0; i < adapter->num_tx_queues; i++)
 		ixgbevf_configure_tx_ring(adapter, adapter->tx_ring[i]);
+	for (i = 0; i < adapter->num_xdp_queues; i++)
+		ixgbevf_configure_tx_ring(adapter, adapter->xdp_ring[i]);
 }
 
 #define IXGBE_SRRCTL_BSIZEHDRSIZE_SHIFT	2
@@ -2239,7 +2342,10 @@ static void ixgbevf_clean_tx_ring(struct ixgbevf_ring *tx_ring)
 		union ixgbe_adv_tx_desc *eop_desc, *tx_desc;
 
 		/* Free all the Tx ring sk_buffs */
-		dev_kfree_skb_any(tx_buffer->skb);
+		if (ring_is_xdp(tx_ring))
+			page_frag_free(tx_buffer->data);
+		else
+			dev_kfree_skb_any(tx_buffer->skb);
 
 		/* unmap skb header data */
 		dma_unmap_single(tx_ring->dev,
@@ -2307,6 +2413,8 @@ static void ixgbevf_clean_all_tx_rings(struct ixgbevf_adapter *adapter)
 
 	for (i = 0; i < adapter->num_tx_queues; i++)
 		ixgbevf_clean_tx_ring(adapter->tx_ring[i]);
+	for (i = 0; i < adapter->num_xdp_queues; i++)
+		ixgbevf_clean_tx_ring(adapter->xdp_ring[i]);
 }
 
 void ixgbevf_down(struct ixgbevf_adapter *adapter)
@@ -2345,6 +2453,13 @@ void ixgbevf_down(struct ixgbevf_adapter *adapter)
 				IXGBE_TXDCTL_SWFLSH);
 	}
 
+	for (i = 0; i < adapter->num_xdp_queues; i++) {
+		u8 reg_idx = adapter->xdp_ring[i]->reg_idx;
+
+		IXGBE_WRITE_REG(hw, IXGBE_VFTXDCTL(reg_idx),
+				IXGBE_TXDCTL_SWFLSH);
+	}
+
 	if (!pci_channel_offline(adapter->pdev))
 		ixgbevf_reset(adapter);
 
@@ -2442,6 +2557,7 @@ static void ixgbevf_set_num_queues(struct ixgbevf_adapter *adapter)
 	/* Start with base case */
 	adapter->num_rx_queues = 1;
 	adapter->num_tx_queues = 1;
+	adapter->num_xdp_queues = 0;
 
 	spin_lock_bh(&adapter->mbx_lock);
 
@@ -2463,8 +2579,13 @@ static void ixgbevf_set_num_queues(struct ixgbevf_adapter *adapter)
 		case ixgbe_mbox_api_11:
 		case ixgbe_mbox_api_12:
 		case ixgbe_mbox_api_13:
+			if (adapter->xdp_prog &&
+			    hw->mac.max_tx_queues == rss)
+				rss = rss > 3 ? 2 : 1;
+
 			adapter->num_rx_queues = rss;
 			adapter->num_tx_queues = rss;
+			adapter->num_xdp_queues = adapter->xdp_prog ? rss : 0;
 		default:
 			break;
 		}
@@ -2521,6 +2642,8 @@ static void ixgbevf_add_ring(struct ixgbevf_ring *ring,
  * @v_idx: index of vector in adapter struct
  * @txr_count: number of Tx rings for q vector
  * @txr_idx: index of first Tx ring to assign
+ * @xdp_count: total number of XDP rings to allocate
+ * @xdp_idx: index of first XDP ring to allocate
  * @rxr_count: number of Rx rings for q vector
  * @rxr_idx: index of first Rx ring to assign
  *
@@ -2528,13 +2651,15 @@ static void ixgbevf_add_ring(struct ixgbevf_ring *ring,
  **/
 static int ixgbevf_alloc_q_vector(struct ixgbevf_adapter *adapter, int v_idx,
 				  int txr_count, int txr_idx,
+				  int xdp_count, int xdp_idx,
 				  int rxr_count, int rxr_idx)
 {
 	struct ixgbevf_q_vector *q_vector;
+	int reg_idx = txr_idx + xdp_idx;
 	struct ixgbevf_ring *ring;
 	int ring_count, size;
 
-	ring_count = txr_count + rxr_count;
+	ring_count = txr_count + xdp_count + rxr_count;
 	size = sizeof(*q_vector) + (sizeof(*ring) * ring_count);
 
 	/* allocate q_vector and rings */
@@ -2567,7 +2692,7 @@ static int ixgbevf_alloc_q_vector(struct ixgbevf_adapter *adapter, int v_idx,
 		/* apply Tx specific ring traits */
 		ring->count = adapter->tx_ring_count;
 		ring->queue_index = txr_idx;
-		ring->reg_idx = txr_idx;
+		ring->reg_idx = reg_idx;
 
 		/* assign ring to adapter */
 		 adapter->tx_ring[txr_idx] = ring;
@@ -2575,6 +2700,36 @@ static int ixgbevf_alloc_q_vector(struct ixgbevf_adapter *adapter, int v_idx,
 		/* update count and index */
 		txr_count--;
 		txr_idx++;
+		reg_idx++;
+
+		/* push pointer to next ring */
+		ring++;
+	}
+
+	while (xdp_count) {
+		/* assign generic ring traits */
+		ring->dev = &adapter->pdev->dev;
+		ring->netdev = adapter->netdev;
+
+		/* configure backlink on ring */
+		ring->q_vector = q_vector;
+
+		/* update q_vector Tx values */
+		ixgbevf_add_ring(ring, &q_vector->tx);
+
+		/* apply Tx specific ring traits */
+		ring->count = adapter->tx_ring_count;
+		ring->queue_index = xdp_idx;
+		ring->reg_idx = reg_idx;
+		set_ring_xdp(ring);
+
+		/* assign ring to adapter */
+		adapter->xdp_ring[xdp_idx] = ring;
+
+		/* update count and index */
+		xdp_count--;
+		xdp_idx++;
+		reg_idx++;
 
 		/* push pointer to next ring */
 		ring++;
@@ -2624,8 +2779,12 @@ static void ixgbevf_free_q_vector(struct ixgbevf_adapter *adapter, int v_idx)
 	struct ixgbevf_q_vector *q_vector = adapter->q_vector[v_idx];
 	struct ixgbevf_ring *ring;
 
-	ixgbevf_for_each_ring(ring, q_vector->tx)
-		adapter->tx_ring[ring->queue_index] = NULL;
+	ixgbevf_for_each_ring(ring, q_vector->tx) {
+		if (ring_is_xdp(ring))
+			adapter->xdp_ring[ring->queue_index] = NULL;
+		else
+			adapter->tx_ring[ring->queue_index] = NULL;
+	}
 
 	ixgbevf_for_each_ring(ring, q_vector->rx)
 		adapter->rx_ring[ring->queue_index] = NULL;
@@ -2651,15 +2810,16 @@ static int ixgbevf_alloc_q_vectors(struct ixgbevf_adapter *adapter)
 	int q_vectors = adapter->num_msix_vectors - NON_Q_VECTORS;
 	int rxr_remaining = adapter->num_rx_queues;
 	int txr_remaining = adapter->num_tx_queues;
-	int rxr_idx = 0, txr_idx = 0, v_idx = 0;
+	int xdp_remaining = adapter->num_xdp_queues;
+	int rxr_idx = 0, txr_idx = 0, xdp_idx = 0, v_idx = 0;
 	int err;
 
-	if (q_vectors >= (rxr_remaining + txr_remaining)) {
+	if (q_vectors >= (rxr_remaining + txr_remaining + xdp_remaining)) {
 		for (; rxr_remaining; v_idx++, q_vectors--) {
 			int rqpv = DIV_ROUND_UP(rxr_remaining, q_vectors);
 
 			err = ixgbevf_alloc_q_vector(adapter, v_idx,
-						     0, 0, rqpv, rxr_idx);
+						     0, 0, 0, 0, rqpv, rxr_idx);
 			if (err)
 				goto err_out;
 
@@ -2672,9 +2832,11 @@ static int ixgbevf_alloc_q_vectors(struct ixgbevf_adapter *adapter)
 	for (; q_vectors; v_idx++, q_vectors--) {
 		int rqpv = DIV_ROUND_UP(rxr_remaining, q_vectors);
 		int tqpv = DIV_ROUND_UP(txr_remaining, q_vectors);
+		int xqpv = DIV_ROUND_UP(xdp_remaining, q_vectors);
 
 		err = ixgbevf_alloc_q_vector(adapter, v_idx,
 					     tqpv, txr_idx,
+					     xqpv, xdp_idx,
 					     rqpv, rxr_idx);
 
 		if (err)
@@ -2685,6 +2847,8 @@ static int ixgbevf_alloc_q_vectors(struct ixgbevf_adapter *adapter)
 		rxr_idx += rqpv;
 		txr_remaining -= tqpv;
 		txr_idx += tqpv;
+		xdp_remaining -= xqpv;
+		xdp_idx += xqpv;
 	}
 
 	return 0;
@@ -2756,9 +2920,10 @@ static int ixgbevf_init_interrupt_scheme(struct ixgbevf_adapter *adapter)
 		goto err_alloc_q_vectors;
 	}
 
-	hw_dbg(&adapter->hw, "Multiqueue %s: Rx Queue count = %u, Tx Queue count = %u\n",
-	       (adapter->num_rx_queues > 1) ? "Enabled" :
-	       "Disabled", adapter->num_rx_queues, adapter->num_tx_queues);
+	hw_dbg(&adapter->hw, "Multiqueue %s: Rx Queue count = %u, Tx Queue count = %u XDP Queue count %u\n",
+	       (adapter->num_rx_queues > 1) ? "Enabled" : "Disabled",
+	       adapter->num_rx_queues, adapter->num_tx_queues,
+	       adapter->num_xdp_queues);
 
 	set_bit(__IXGBEVF_DOWN, &adapter->state);
 
@@ -2779,6 +2944,7 @@ static int ixgbevf_init_interrupt_scheme(struct ixgbevf_adapter *adapter)
 static void ixgbevf_clear_interrupt_scheme(struct ixgbevf_adapter *adapter)
 {
 	adapter->num_tx_queues = 0;
+	adapter->num_xdp_queues = 0;
 	adapter->num_rx_queues = 0;
 
 	ixgbevf_free_q_vectors(adapter);
@@ -2986,6 +3152,8 @@ static void ixgbevf_check_hang_subtask(struct ixgbevf_adapter *adapter)
 	if (netif_carrier_ok(adapter->netdev)) {
 		for (i = 0; i < adapter->num_tx_queues; i++)
 			set_check_for_tx_hang(adapter->tx_ring[i]);
+		for (i = 0; i < adapter->num_xdp_queues; i++)
+			set_check_for_tx_hang(adapter->xdp_ring[i]);
 	}
 
 	/* get one bit for every active Tx/Rx interrupt vector */
@@ -3157,6 +3325,9 @@ static void ixgbevf_free_all_tx_resources(struct ixgbevf_adapter *adapter)
 	for (i = 0; i < adapter->num_tx_queues; i++)
 		if (adapter->tx_ring[i]->desc)
 			ixgbevf_free_tx_resources(adapter->tx_ring[i]);
+	for (i = 0; i < adapter->num_xdp_queues; i++)
+		if (adapter->xdp_ring[i]->desc)
+			ixgbevf_free_tx_resources(adapter->xdp_ring[i]);
 }
 
 /**
@@ -3207,7 +3378,7 @@ int ixgbevf_setup_tx_resources(struct ixgbevf_ring *tx_ring)
  **/
 static int ixgbevf_setup_all_tx_resources(struct ixgbevf_adapter *adapter)
 {
-	int i, err = 0;
+	int i, j = 0, err = 0;
 
 	for (i = 0; i < adapter->num_tx_queues; i++) {
 		err = ixgbevf_setup_tx_resources(adapter->tx_ring[i]);
@@ -3217,11 +3388,22 @@ static int ixgbevf_setup_all_tx_resources(struct ixgbevf_adapter *adapter)
 		goto err_setup_tx;
 	}
 
+	for (j = 0; j < adapter->num_xdp_queues; j++) {
+		err = ixgbevf_setup_tx_resources(adapter->xdp_ring[j]);
+		if (!err)
+			continue;
+		hw_dbg(&adapter->hw, "Allocation for XDP Queue %u failed\n", j);
+		break;
+	}
+
 	return 0;
 err_setup_tx:
 	/* rewind the index freeing the rings as we go */
+	while (j--)
+		ixgbevf_free_tx_resources(adapter->xdp_ring[j]);
 	while (i--)
 		ixgbevf_free_tx_resources(adapter->tx_ring[i]);
+
 	return err;
 }
 
@@ -4114,6 +4296,23 @@ static void ixgbevf_shutdown(struct pci_dev *pdev)
 	ixgbevf_suspend(pdev, PMSG_SUSPEND);
 }
 
+static void ixgbevf_get_tx_ring_stats(struct rtnl_link_stats64 *stats,
+				      const struct ixgbevf_ring *ring)
+{
+	u64 bytes, packets;
+	unsigned int start;
+
+	if (ring) {
+		do {
+			start = u64_stats_fetch_begin_irq(&ring->syncp);
+			bytes = ring->stats.bytes;
+			packets = ring->stats.packets;
+		} while (u64_stats_fetch_retry_irq(&ring->syncp, start));
+		stats->tx_bytes += bytes;
+		stats->tx_packets += packets;
+	}
+}
+
 static void ixgbevf_get_stats(struct net_device *netdev,
 			      struct rtnl_link_stats64 *stats)
 {
@@ -4141,13 +4340,12 @@ static void ixgbevf_get_stats(struct net_device *netdev,
 
 	for (i = 0; i < adapter->num_tx_queues; i++) {
 		ring = adapter->tx_ring[i];
-		do {
-			start = u64_stats_fetch_begin_irq(&ring->syncp);
-			bytes = ring->stats.bytes;
-			packets = ring->stats.packets;
-		} while (u64_stats_fetch_retry_irq(&ring->syncp, start));
-		stats->tx_bytes += bytes;
-		stats->tx_packets += packets;
+		ixgbevf_get_tx_ring_stats(stats, ring);
+	}
+
+	for (i = 0; i < adapter->num_xdp_queues; i++) {
+		ring = adapter->xdp_ring[i];
+		ixgbevf_get_tx_ring_stats(stats, ring);
 	}
 	rcu_read_unlock();
 }
@@ -4201,8 +4399,25 @@ static int ixgbevf_xdp_setup(struct net_device *dev, struct bpf_prog *prog)
 	}
 
 	old_prog = xchg(&adapter->xdp_prog, prog);
-	for (i = 0; i < adapter->num_rx_queues; i++)
-		xchg(&adapter->rx_ring[i]->xdp_prog, adapter->xdp_prog);
+
+	/* If transitioning XDP modes reconfigure rings */
+	if (!!prog != !!old_prog) {
+		/* Hardware has to reinitialize queues and interrupts to
+		 * match packet buffer alignment. Unfortunately, the
+		 * hardware is not flexible enough to do this dynamically.
+		 */
+		if (netif_running(dev))
+			ixgbevf_close(dev);
+
+		ixgbevf_clear_interrupt_scheme(adapter);
+		ixgbevf_init_interrupt_scheme(adapter);
+
+		if (netif_running(dev))
+			ixgbevf_open(dev);
+	} else {
+		for (i = 0; i < adapter->num_rx_queues; i++)
+			xchg(&adapter->rx_ring[i]->xdp_prog, adapter->xdp_prog);
+	}
 
 	if (old_prog)
 		bpf_prog_put(old_prog);
-- 
2.13.6

^ permalink raw reply related

* [PATCH 4/5] ixgbevf: Add support for meta data
From: Tony Nguyen @ 2018-03-16 22:34 UTC (permalink / raw)
  To: intel-wired-lan; +Cc: Tony Nguyen, john.fastabend, netdev
In-Reply-To: <20180316223406.7295-1-anthony.l.nguyen@intel.com>

Add support for XDP meta data when using build skb.

Based on commit 366a88fe2f40 ("bpf, ixgbe: add meta data support")

Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 29 +++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 5167e81e0cf1..3d9033f26eff 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -889,6 +889,20 @@ struct sk_buff *ixgbevf_construct_skb(struct ixgbevf_ring *rx_ring,
 #if L1_CACHE_BYTES < 128
 	prefetch(xdp->data + L1_CACHE_BYTES);
 #endif
+	/* Note, we get here by enabling legacy-rx via:
+	 *
+	 *    ethtool --set-priv-flags <dev> legacy-rx on
+	 *
+	 * In this mode, we currently get 0 extra XDP headroom as
+	 * opposed to having legacy-rx off, where we process XDP
+	 * packets going to stack via ixgbevf_build_skb().
+	 *
+	 * For ixgbevf_construct_skb() mode it means that the
+	 * xdp->data_meta will always point to xdp->data, since
+	 * the helper cannot expand the head. Should this ever
+	 * change in future for legacy-rx mode on, then let's also
+	 * add xdp->data_meta handling here.
+	 */
 
 	/* allocate a skb to store the frags */
 	skb = napi_alloc_skb(&rx_ring->q_vector->napi, IXGBEVF_RX_HDR_SIZE);
@@ -936,6 +950,7 @@ static struct sk_buff *ixgbevf_build_skb(struct ixgbevf_ring *rx_ring,
 					 struct xdp_buff *xdp,
 					 union ixgbe_adv_rx_desc *rx_desc)
 {
+	unsigned int metasize = xdp->data - xdp->data_meta;
 #if (PAGE_SIZE < 8192)
 	unsigned int truesize = ixgbevf_rx_pg_size(rx_ring) / 2;
 #else
@@ -945,10 +960,14 @@ static struct sk_buff *ixgbevf_build_skb(struct ixgbevf_ring *rx_ring,
 #endif
 	struct sk_buff *skb;
 
-	/* prefetch first cache line of first page */
-	prefetch(xdp->data);
+	/* Prefetch first cache line of first page. If xdp->data_meta
+	 * is unused, this points to xdp->data, otherwise, we likely
+	 * have a consumer accessing first few bytes of meta data,
+	 * and then actual data.
+	 */
+	prefetch(xdp->data_meta);
 #if L1_CACHE_BYTES < 128
-	prefetch(xdp->data + L1_CACHE_BYTES);
+	prefetch(xdp->data_meta + L1_CACHE_BYTES);
 #endif
 
 	/* build an skb around the page buffer */
@@ -959,6 +978,8 @@ static struct sk_buff *ixgbevf_build_skb(struct ixgbevf_ring *rx_ring,
 	/* update pointers within the skb to store the data */
 	skb_reserve(skb, xdp->data - xdp->data_hard_start);
 	__skb_put(skb, xdp->data_end - xdp->data);
+	if (metasize)
+		skb_metadata_set(skb, metasize);
 
 	/* update buffer offset */
 #if (PAGE_SIZE < 8192)
@@ -1126,7 +1147,7 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
 		if (!skb) {
 			xdp.data = page_address(rx_buffer->page) +
 				   rx_buffer->page_offset;
-			xdp_set_data_meta_invalid(&xdp);
+			xdp.data_meta = xdp.data;
 			xdp.data_hard_start = xdp.data -
 					      ixgbevf_rx_offset(rx_ring);
 			xdp.data_end = xdp.data + size;
-- 
2.13.6

^ permalink raw reply related

* [PATCH 3/5] ixgbevf: Delay tail write for XDP packets
From: Tony Nguyen @ 2018-03-16 22:34 UTC (permalink / raw)
  To: intel-wired-lan; +Cc: Tony Nguyen, john.fastabend, netdev
In-Reply-To: <20180316223406.7295-1-anthony.l.nguyen@intel.com>

Current XDP implementation hits the tail on every XDP_TX; change the
driver to only hit the tail after packet processing is complete.

Based on commit 7379f97a4fce ("ixgbe: delay tail write to every 'n'
packets")

Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 30 ++++++++++++++---------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 309f549808e4..5167e81e0cf1 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -1016,14 +1016,8 @@ static int ixgbevf_xmit_xdp_ring(struct ixgbevf_ring *ring,
 			cpu_to_le32((len << IXGBE_ADVTXD_PAYLEN_SHIFT) |
 				    IXGBE_ADVTXD_CC);
 
-	/* Force memory writes to complete before letting h/w know there
-	 * are new descriptors to fetch.  (Only applicable for weak-ordered
-	 * memory model archs, such as IA-64).
-	 *
-	 * We also need this memory barrier to make certain all of the
-	 * status bits have been updated before next_to_watch is written.
-	 */
-	wmb();
+	/* Avoid any potential race with cleanup */
+	smp_wmb();
 
 	/* set next_to_watch value indicating a packet is present */
 	i++;
@@ -1033,8 +1027,6 @@ static int ixgbevf_xmit_xdp_ring(struct ixgbevf_ring *ring,
 	tx_buffer->next_to_watch = tx_desc;
 	ring->next_to_use = i;
 
-	/* notify HW of packet */
-	ixgbevf_write_tail(ring, i);
 	return IXGBEVF_XDP_TX;
 }
 
@@ -1101,6 +1093,7 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
 	struct ixgbevf_adapter *adapter = q_vector->adapter;
 	u16 cleaned_count = ixgbevf_desc_unused(rx_ring);
 	struct sk_buff *skb = rx_ring->skb;
+	bool xdp_xmit = false;
 	struct xdp_buff xdp;
 
 	xdp.rxq = &rx_ring->xdp_rxq;
@@ -1142,11 +1135,13 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
 		}
 
 		if (IS_ERR(skb)) {
-			if (PTR_ERR(skb) == -IXGBEVF_XDP_TX)
+			if (PTR_ERR(skb) == -IXGBEVF_XDP_TX) {
+				xdp_xmit = true;
 				ixgbevf_rx_buffer_flip(rx_ring, rx_buffer,
 						       size);
-			else
+			} else {
 				rx_buffer->pagecnt_bias++;
+			}
 			total_rx_packets++;
 			total_rx_bytes += size;
 		} else if (skb) {
@@ -1208,6 +1203,17 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
 	/* place incomplete frames back on ring for completion */
 	rx_ring->skb = skb;
 
+	if (xdp_xmit) {
+		struct ixgbevf_ring *xdp_ring =
+			adapter->xdp_ring[rx_ring->queue_index];
+
+		/* Force memory writes to complete before letting h/w
+		 * know there are new descriptors to fetch.
+		 */
+		wmb();
+		ixgbevf_write_tail(xdp_ring, xdp_ring->next_to_use);
+	}
+
 	u64_stats_update_begin(&rx_ring->syncp);
 	rx_ring->stats.packets += total_rx_packets;
 	rx_ring->stats.bytes += total_rx_bytes;
-- 
2.13.6

^ permalink raw reply related

* [PATCH 1/5] ixgbevf: Add XDP support for pass and drop actions
From: Tony Nguyen @ 2018-03-16 22:34 UTC (permalink / raw)
  To: intel-wired-lan; +Cc: Tony Nguyen, john.fastabend, netdev
In-Reply-To: <20180316223406.7295-1-anthony.l.nguyen@intel.com>

Implement XDP_PASS and XDP_DROP based on the ixgbe implementation.

Based largely on commit 924708081629 ("ixgbe: add XDP support for pass and
drop actions").

Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
---
 drivers/net/ethernet/intel/ixgbevf/ethtool.c      |   9 +-
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h      |  10 +-
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 201 ++++++++++++++++++----
 3 files changed, 178 insertions(+), 42 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ethtool.c b/drivers/net/ethernet/intel/ixgbevf/ethtool.c
index e7623fed42da..4946a62c70a4 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ethtool.c
@@ -1,7 +1,7 @@
 /*******************************************************************************
 
   Intel 82599 Virtual Function driver
-  Copyright(c) 1999 - 2015 Intel Corporation.
+  Copyright(c) 1999 - 2018 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify it
   under the terms and conditions of the GNU General Public License,
@@ -336,8 +336,13 @@ static int ixgbevf_set_ringparam(struct net_device *netdev,
 		for (i = 0; i < adapter->num_rx_queues; i++) {
 			/* clone ring and setup updated count */
 			rx_ring[i] = *adapter->rx_ring[i];
+
+			/* Clear copied XDP RX-queue info */
+			memset(&rx_ring[i].xdp_rxq, 0,
+			       sizeof(rx_ring[i].xdp_rxq));
+
 			rx_ring[i].count = new_rx_count;
-			err = ixgbevf_setup_rx_resources(&rx_ring[i]);
+			err = ixgbevf_setup_rx_resources(adapter, &rx_ring[i]);
 			if (err) {
 				while (i) {
 					i--;
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
index 7778c30b6ca5..6f6b4a157dff 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
@@ -2,7 +2,7 @@
 /*******************************************************************************
 
   Intel 82599 Virtual Function driver
-  Copyright(c) 1999 - 2015 Intel Corporation.
+  Copyright(c) 1999 - 2018 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify it
   under the terms and conditions of the GNU General Public License,
@@ -35,6 +35,7 @@
 #include <linux/netdevice.h>
 #include <linux/if_vlan.h>
 #include <linux/u64_stats_sync.h>
+#include <net/xdp.h>
 
 #include "vf.h"
 
@@ -100,6 +101,7 @@ struct ixgbevf_ring {
 	struct ixgbevf_ring *next;
 	struct ixgbevf_q_vector *q_vector;	/* backpointer to q_vector */
 	struct net_device *netdev;
+	struct bpf_prog *xdp_prog;
 	struct device *dev;
 	void *desc;			/* descriptor ring memory */
 	dma_addr_t dma;			/* phys. address of descriptor ring */
@@ -120,7 +122,7 @@ struct ixgbevf_ring {
 		struct ixgbevf_tx_queue_stats tx_stats;
 		struct ixgbevf_rx_queue_stats rx_stats;
 	};
-
+	struct xdp_rxq_info xdp_rxq;
 	u64 hw_csum_rx_error;
 	u8 __iomem *tail;
 	struct sk_buff *skb;
@@ -357,6 +359,7 @@ struct ixgbevf_adapter {
 
 	/* OS defined structs */
 	struct net_device *netdev;
+	struct bpf_prog *xdp_prog;
 	struct pci_dev *pdev;
 
 	/* structs defined in ixgbe_vf.h */
@@ -443,7 +446,8 @@ void ixgbevf_down(struct ixgbevf_adapter *adapter);
 void ixgbevf_reinit_locked(struct ixgbevf_adapter *adapter);
 void ixgbevf_reset(struct ixgbevf_adapter *adapter);
 void ixgbevf_set_ethtool_ops(struct net_device *netdev);
-int ixgbevf_setup_rx_resources(struct ixgbevf_ring *);
+int ixgbevf_setup_rx_resources(struct ixgbevf_adapter *adapter,
+			       struct ixgbevf_ring *rx_ring);
 int ixgbevf_setup_tx_resources(struct ixgbevf_ring *);
 void ixgbevf_free_rx_resources(struct ixgbevf_ring *);
 void ixgbevf_free_tx_resources(struct ixgbevf_ring *);
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 4da449e0a4ba..2696b5a6806f 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -1,7 +1,7 @@
 /*******************************************************************************
 
   Intel 82599 Virtual Function driver
-  Copyright(c) 1999 - 2015 Intel Corporation.
+  Copyright(c) 1999 - 2018 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify it
   under the terms and conditions of the GNU General Public License,
@@ -50,6 +50,9 @@
 #include <linux/if_vlan.h>
 #include <linux/prefetch.h>
 #include <net/mpls.h>
+#include <linux/bpf.h>
+#include <linux/bpf_trace.h>
+#include <linux/atomic.h>
 
 #include "ixgbevf.h"
 
@@ -552,19 +555,21 @@ struct ixgbevf_rx_buffer *ixgbevf_get_rx_buffer(struct ixgbevf_ring *rx_ring,
 }
 
 static void ixgbevf_put_rx_buffer(struct ixgbevf_ring *rx_ring,
-				  struct ixgbevf_rx_buffer *rx_buffer)
+				  struct ixgbevf_rx_buffer *rx_buffer,
+				  struct sk_buff *skb)
 {
 	if (ixgbevf_can_reuse_rx_page(rx_buffer)) {
 		/* hand second half of page back to the ring */
 		ixgbevf_reuse_rx_page(rx_ring, rx_buffer);
 	} else {
-		/* We are not reusing the buffer so unmap it and free
-		 * any references we are holding to it
-		 */
-		dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma,
-				     ixgbevf_rx_pg_size(rx_ring),
-				     DMA_FROM_DEVICE,
-				     IXGBEVF_RX_DMA_ATTR);
+		if (IS_ERR(skb))
+			/* We are not reusing the buffer so unmap it and free
+			 * any references we are holding to it
+			 */
+			dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma,
+					     ixgbevf_rx_pg_size(rx_ring),
+					     DMA_FROM_DEVICE,
+					     IXGBEVF_RX_DMA_ATTR);
 		__page_frag_cache_drain(rx_buffer->page,
 					rx_buffer->pagecnt_bias);
 	}
@@ -737,6 +742,10 @@ static bool ixgbevf_cleanup_headers(struct ixgbevf_ring *rx_ring,
 				    union ixgbe_adv_rx_desc *rx_desc,
 				    struct sk_buff *skb)
 {
+	/* XDP packets use error pointer so abort at this point */
+	if (IS_ERR(skb))
+		return true;
+
 	/* verify that the packet does not have any known errors */
 	if (unlikely(ixgbevf_test_staterr(rx_desc,
 					  IXGBE_RXDADV_ERR_FRAME_ERR_MASK))) {
@@ -853,22 +862,23 @@ static void ixgbevf_add_rx_frag(struct ixgbevf_ring *rx_ring,
 static
 struct sk_buff *ixgbevf_construct_skb(struct ixgbevf_ring *rx_ring,
 				      struct ixgbevf_rx_buffer *rx_buffer,
-				      union ixgbe_adv_rx_desc *rx_desc,
-				      unsigned int size)
+				      struct xdp_buff *xdp,
+				      union ixgbe_adv_rx_desc *rx_desc)
 {
-	void *va = page_address(rx_buffer->page) + rx_buffer->page_offset;
+	unsigned int size = xdp->data_end - xdp->data;
 #if (PAGE_SIZE < 8192)
 	unsigned int truesize = ixgbevf_rx_pg_size(rx_ring) / 2;
 #else
-	unsigned int truesize = SKB_DATA_ALIGN(size);
+	unsigned int truesize = SKB_DATA_ALIGN(xdp->data_end -
+					       xdp->data_hard_start);
 #endif
 	unsigned int headlen;
 	struct sk_buff *skb;
 
 	/* prefetch first cache line of first page */
-	prefetch(va);
+	prefetch(xdp->data);
 #if L1_CACHE_BYTES < 128
-	prefetch(va + L1_CACHE_BYTES);
+	prefetch(xdp->data + L1_CACHE_BYTES);
 #endif
 
 	/* allocate a skb to store the frags */
@@ -879,16 +889,18 @@ struct sk_buff *ixgbevf_construct_skb(struct ixgbevf_ring *rx_ring,
 	/* Determine available headroom for copy */
 	headlen = size;
 	if (headlen > IXGBEVF_RX_HDR_SIZE)
-		headlen = eth_get_headlen(va, IXGBEVF_RX_HDR_SIZE);
+		headlen = eth_get_headlen(xdp->data, IXGBEVF_RX_HDR_SIZE);
 
 	/* align pull length to size of long to optimize memcpy performance */
-	memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));
+	memcpy(__skb_put(skb, headlen), xdp->data,
+	       ALIGN(headlen, sizeof(long)));
 
 	/* update all of the pointers */
 	size -= headlen;
 	if (size) {
 		skb_add_rx_frag(skb, 0, rx_buffer->page,
-				(va + headlen) - page_address(rx_buffer->page),
+				(xdp->data + headlen) -
+					page_address(rx_buffer->page),
 				size, truesize);
 #if (PAGE_SIZE < 8192)
 		rx_buffer->page_offset ^= truesize;
@@ -912,32 +924,32 @@ static inline void ixgbevf_irq_enable_queues(struct ixgbevf_adapter *adapter,
 
 static struct sk_buff *ixgbevf_build_skb(struct ixgbevf_ring *rx_ring,
 					 struct ixgbevf_rx_buffer *rx_buffer,
-					 union ixgbe_adv_rx_desc *rx_desc,
-					 unsigned int size)
+					 struct xdp_buff *xdp,
+					 union ixgbe_adv_rx_desc *rx_desc)
 {
-	void *va = page_address(rx_buffer->page) + rx_buffer->page_offset;
 #if (PAGE_SIZE < 8192)
 	unsigned int truesize = ixgbevf_rx_pg_size(rx_ring) / 2;
 #else
 	unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) +
-				SKB_DATA_ALIGN(IXGBEVF_SKB_PAD + size);
+				SKB_DATA_ALIGN(xdp->data_end -
+					       xdp->data_hard_start);
 #endif
 	struct sk_buff *skb;
 
 	/* prefetch first cache line of first page */
-	prefetch(va);
+	prefetch(xdp->data);
 #if L1_CACHE_BYTES < 128
-	prefetch(va + L1_CACHE_BYTES);
+	prefetch(xdp->data + L1_CACHE_BYTES);
 #endif
 
-	/* build an skb to around the page buffer */
-	skb = build_skb(va - IXGBEVF_SKB_PAD, truesize);
+	/* build an skb around the page buffer */
+	skb = build_skb(xdp->data_hard_start, truesize);
 	if (unlikely(!skb))
 		return NULL;
 
 	/* update pointers within the skb to store the data */
-	skb_reserve(skb, IXGBEVF_SKB_PAD);
-	__skb_put(skb, size);
+	skb_reserve(skb, xdp->data - xdp->data_hard_start);
+	__skb_put(skb, xdp->data_end - xdp->data);
 
 	/* update buffer offset */
 #if (PAGE_SIZE < 8192)
@@ -948,6 +960,43 @@ static struct sk_buff *ixgbevf_build_skb(struct ixgbevf_ring *rx_ring,
 
 	return skb;
 }
+
+#define IXGBEVF_XDP_PASS 0
+#define IXGBEVF_XDP_CONSUMED 1
+
+static struct sk_buff *ixgbevf_run_xdp(struct ixgbevf_ring  *rx_ring,
+				       struct xdp_buff *xdp)
+{
+	int result = IXGBEVF_XDP_PASS;
+	struct bpf_prog *xdp_prog;
+	u32 act;
+
+	rcu_read_lock();
+	xdp_prog = READ_ONCE(rx_ring->xdp_prog);
+
+	if (!xdp_prog)
+		goto xdp_out;
+
+	act = bpf_prog_run_xdp(xdp_prog, xdp);
+	switch (act) {
+	case XDP_PASS:
+		break;
+	default:
+		bpf_warn_invalid_xdp_action(act);
+		/* fallthrough */
+	case XDP_TX:
+	case XDP_ABORTED:
+		trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
+		/* fallthrough -- handle aborts by dropping packet */
+	case XDP_DROP:
+		result = IXGBEVF_XDP_CONSUMED;
+		break;
+	}
+xdp_out:
+	rcu_read_unlock();
+	return ERR_PTR(-result);
+}
+
 static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
 				struct ixgbevf_ring *rx_ring,
 				int budget)
@@ -955,10 +1004,13 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
 	unsigned int total_rx_bytes = 0, total_rx_packets = 0;
 	u16 cleaned_count = ixgbevf_desc_unused(rx_ring);
 	struct sk_buff *skb = rx_ring->skb;
+	struct xdp_buff xdp;
+
+	xdp.rxq = &rx_ring->xdp_rxq;
 
 	while (likely(total_rx_packets < budget)) {
-		union ixgbe_adv_rx_desc *rx_desc;
 		struct ixgbevf_rx_buffer *rx_buffer;
+		union ixgbe_adv_rx_desc *rx_desc;
 		unsigned int size;
 
 		/* return some buffers to hardware, one at a time is too slow */
@@ -981,14 +1033,30 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
 		rx_buffer = ixgbevf_get_rx_buffer(rx_ring, size);
 
 		/* retrieve a buffer from the ring */
-		if (skb)
+		if (!skb) {
+			xdp.data = page_address(rx_buffer->page) +
+				   rx_buffer->page_offset;
+			xdp_set_data_meta_invalid(&xdp);
+			xdp.data_hard_start = xdp.data -
+					      ixgbevf_rx_offset(rx_ring);
+			xdp.data_end = xdp.data + size;
+
+			skb = ixgbevf_run_xdp(rx_ring, &xdp);
+		}
+
+		if (IS_ERR(skb)) {
+			total_rx_packets++;
+			total_rx_bytes += size;
+			rx_buffer->pagecnt_bias++;
+		} else if (skb) {
 			ixgbevf_add_rx_frag(rx_ring, rx_buffer, skb, size);
-		else if (ring_uses_build_skb(rx_ring))
+		} else if (ring_uses_build_skb(rx_ring)) {
 			skb = ixgbevf_build_skb(rx_ring, rx_buffer,
-						rx_desc, size);
-		else
+						&xdp, rx_desc);
+		} else {
 			skb = ixgbevf_construct_skb(rx_ring, rx_buffer,
-						    rx_desc, size);
+						    &xdp, rx_desc);
+		}
 
 		/* exit if we failed to retrieve a buffer */
 		if (!skb) {
@@ -997,7 +1065,7 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
 			break;
 		}
 
-		ixgbevf_put_rx_buffer(rx_ring, rx_buffer);
+		ixgbevf_put_rx_buffer(rx_ring, rx_buffer, skb);
 		cleaned_count++;
 
 		/* fetch next buffer in frame if non-eop */
@@ -3159,11 +3227,13 @@ static int ixgbevf_setup_all_tx_resources(struct ixgbevf_adapter *adapter)
 
 /**
  * ixgbevf_setup_rx_resources - allocate Rx resources (Descriptors)
+ * @adapter: board private structure
  * @rx_ring: Rx descriptor ring (for a specific queue) to setup
  *
  * Returns 0 on success, negative on failure
  **/
-int ixgbevf_setup_rx_resources(struct ixgbevf_ring *rx_ring)
+int ixgbevf_setup_rx_resources(struct ixgbevf_adapter *adapter,
+			       struct ixgbevf_ring *rx_ring)
 {
 	int size;
 
@@ -3184,6 +3254,13 @@ int ixgbevf_setup_rx_resources(struct ixgbevf_ring *rx_ring)
 	if (!rx_ring->desc)
 		goto err;
 
+	/* XDP RX-queue info */
+	if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, adapter->netdev,
+			     rx_ring->queue_index) < 0)
+		goto err;
+
+	rx_ring->xdp_prog = adapter->xdp_prog;
+
 	return 0;
 err:
 	vfree(rx_ring->rx_buffer_info);
@@ -3207,7 +3284,7 @@ static int ixgbevf_setup_all_rx_resources(struct ixgbevf_adapter *adapter)
 	int i, err = 0;
 
 	for (i = 0; i < adapter->num_rx_queues; i++) {
-		err = ixgbevf_setup_rx_resources(adapter->rx_ring[i]);
+		err = ixgbevf_setup_rx_resources(adapter, adapter->rx_ring[i]);
 		if (!err)
 			continue;
 		hw_dbg(&adapter->hw, "Allocation for Rx Queue %u failed\n", i);
@@ -3232,6 +3309,8 @@ void ixgbevf_free_rx_resources(struct ixgbevf_ring *rx_ring)
 {
 	ixgbevf_clean_rx_ring(rx_ring);
 
+	rx_ring->xdp_prog = NULL;
+	xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
 	vfree(rx_ring->rx_buffer_info);
 	rx_ring->rx_buffer_info = NULL;
 
@@ -3918,6 +3997,12 @@ static int ixgbevf_change_mtu(struct net_device *netdev, int new_mtu)
 	int max_frame = new_mtu + ETH_HLEN + ETH_FCS_LEN;
 	int ret;
 
+	/* prevent MTU being changed to a size unsupported by XDP */
+	if (adapter->xdp_prog) {
+		dev_warn(&adapter->pdev->dev, "MTU cannot be changed while XDP program is loaded\n");
+		return -EPERM;
+	}
+
 	spin_lock_bh(&adapter->mbx_lock);
 	/* notify the PF of our intent to use this size of frame */
 	ret = hw->mac.ops.set_rlpml(hw, max_frame);
@@ -4101,6 +4186,47 @@ ixgbevf_features_check(struct sk_buff *skb, struct net_device *dev,
 	return features;
 }
 
+static int ixgbevf_xdp_setup(struct net_device *dev, struct bpf_prog *prog)
+{
+	int i, frame_size = dev->mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN;
+	struct ixgbevf_adapter *adapter = netdev_priv(dev);
+	struct bpf_prog *old_prog;
+
+	/* verify ixgbevf ring attributes are sufficient for XDP */
+	for (i = 0; i < adapter->num_rx_queues; i++) {
+		struct ixgbevf_ring *ring = adapter->rx_ring[i];
+
+		if (frame_size > ixgbevf_rx_bufsz(ring))
+			return -EINVAL;
+	}
+
+	old_prog = xchg(&adapter->xdp_prog, prog);
+	for (i = 0; i < adapter->num_rx_queues; i++)
+		xchg(&adapter->rx_ring[i]->xdp_prog, adapter->xdp_prog);
+
+	if (old_prog)
+		bpf_prog_put(old_prog);
+
+	return 0;
+}
+
+static int ixgbevf_xdp(struct net_device *dev, struct netdev_bpf *xdp)
+{
+	struct ixgbevf_adapter *adapter = netdev_priv(dev);
+
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return ixgbevf_xdp_setup(dev, xdp->prog);
+	case XDP_QUERY_PROG:
+		xdp->prog_attached = !!(adapter->xdp_prog);
+		xdp->prog_id = adapter->xdp_prog ?
+			       adapter->xdp_prog->aux->id : 0;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
 static const struct net_device_ops ixgbevf_netdev_ops = {
 	.ndo_open		= ixgbevf_open,
 	.ndo_stop		= ixgbevf_close,
@@ -4117,6 +4243,7 @@ static const struct net_device_ops ixgbevf_netdev_ops = {
 	.ndo_poll_controller	= ixgbevf_netpoll,
 #endif
 	.ndo_features_check	= ixgbevf_features_check,
+	.ndo_bpf		= ixgbevf_xdp,
 };
 
 static void ixgbevf_assign_netdev_ops(struct net_device *dev)
-- 
2.13.6

^ permalink raw reply related

* [PATCH 5/5] ixgbevf: Add XDP queue stats reporting
From: Tony Nguyen @ 2018-03-16 22:34 UTC (permalink / raw)
  To: intel-wired-lan; +Cc: Tony Nguyen, john.fastabend, netdev
In-Reply-To: <20180316223406.7295-1-anthony.l.nguyen@intel.com>

XDP stats are included in TX stats; however, they are not
reported in TX queue stats since they are set up on different
queues.  Add reporting for XDP queue stats to provide
consistency between the total stats and per queue stats.

Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
---
 drivers/net/ethernet/intel/ixgbevf/ethtool.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ethtool.c b/drivers/net/ethernet/intel/ixgbevf/ethtool.c
index da8c0e299a37..8e7d6c6f5c92 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ethtool.c
@@ -82,6 +82,7 @@ static struct ixgbe_stats ixgbevf_gstrings_stats[] = {
 
 #define IXGBEVF_QUEUE_STATS_LEN ( \
 	(((struct ixgbevf_adapter *)netdev_priv(netdev))->num_tx_queues + \
+	 ((struct ixgbevf_adapter *)netdev_priv(netdev))->num_xdp_queues + \
 	 ((struct ixgbevf_adapter *)netdev_priv(netdev))->num_rx_queues) * \
 	 (sizeof(struct ixgbevf_stats) / sizeof(u64)))
 #define IXGBEVF_GLOBAL_STATS_LEN ARRAY_SIZE(ixgbevf_gstrings_stats)
@@ -491,6 +492,23 @@ static void ixgbevf_get_ethtool_stats(struct net_device *netdev,
 		i += 2;
 	}
 
+	/* populate XDP queue data */
+	for (j = 0; j < adapter->num_xdp_queues; j++) {
+		ring = adapter->xdp_ring[j];
+		if (!ring) {
+			data[i++] = 0;
+			data[i++] = 0;
+			continue;
+		}
+
+		do {
+			start = u64_stats_fetch_begin_irq(&ring->syncp);
+			data[i] = ring->stats.packets;
+			data[i + 1] = ring->stats.bytes;
+		} while (u64_stats_fetch_retry_irq(&ring->syncp, start));
+		i += 2;
+	}
+
 	/* populate Rx queue data */
 	for (j = 0; j < adapter->num_rx_queues; j++) {
 		ring = adapter->rx_ring[j];
@@ -534,6 +552,12 @@ static void ixgbevf_get_strings(struct net_device *netdev, u32 stringset,
 			sprintf(p, "tx_queue_%u_bytes", i);
 			p += ETH_GSTRING_LEN;
 		}
+		for (i = 0; i < adapter->num_xdp_queues; i++) {
+			sprintf(p, "xdp_queue_%u_packets", i);
+			p += ETH_GSTRING_LEN;
+			sprintf(p, "xdp_queue_%u_bytes", i);
+			p += ETH_GSTRING_LEN;
+		}
 		for (i = 0; i < adapter->num_rx_queues; i++) {
 			sprintf(p, "rx_queue_%u_packets", i);
 			p += ETH_GSTRING_LEN;
-- 
2.13.6

^ permalink raw reply related

* [PATCH 0/5] Enable XDP for ixgbevf
From: Tony Nguyen @ 2018-03-16 22:34 UTC (permalink / raw)
  To: intel-wired-lan; +Cc: Tony Nguyen, john.fastabend, netdev

This patch series implements support for XDP on ixgbevf;
it is mainly based on the ixgbe implementation and supports
the following actions: XDP_PASS, XDP_DROP, and XDP_TX.

Tony Nguyen (5):
  ixgbevf: Add XDP support for pass and drop actions
  ixgbevf: Add support for XDP_TX action
  ixgbevf: Delay tail write for XDP packets
  ixgbevf: Add support for meta data
  ixgbevf: Add XDP queue stats reporting

 drivers/net/ethernet/intel/ixgbevf/ethtool.c      |  68 ++-
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h      |  30 +-
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 489 +++++++++++++++++++---
 3 files changed, 518 insertions(+), 69 deletions(-)

-- 
2.13.6

^ permalink raw reply

* [PATCH net-next] liquidio: Added support for trusted VF
From: Felix Manlunas @ 2018-03-16 22:40 UTC (permalink / raw)
  To: davem
  Cc: netdev, raghu.vatsavayi, derek.chickles, satananda.burla,
	intiyaz.basha, felix.manlunas

From: Intiyaz Basha <intiyaz.basha@cavium.com>

When a VF is trusted, all promiscuous traffic will only be sent to that VF.
In normal operation promiscuous traffic is sent to the PF. There can be
only one trusted VF per PF.

Signed-off-by: Intiyaz Basha <intiyaz.basha@cavium.com>
Acked-by: Satanand Burla <satananda.burla@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
---
 drivers/net/ethernet/cavium/liquidio/lio_main.c    | 125 +++++++++++++++++++++
 .../net/ethernet/cavium/liquidio/liquidio_common.h |   7 ++
 .../net/ethernet/cavium/liquidio/octeon_device.h   |   2 +
 3 files changed, 134 insertions(+)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index 140085b..c14b87a 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -91,6 +91,12 @@ static int octeon_console_debug_enabled(u32 console)
  */
 #define LIO_SYNC_OCTEON_TIME_INTERVAL_MS 60000
 
+struct lio_trusted_vf_ctx {
+	wait_queue_head_t wc;
+	int cond;
+	int status;
+};
+
 struct liquidio_rx_ctl_context {
 	int octeon_id;
 
@@ -3265,10 +3271,128 @@ static int liquidio_get_vf_config(struct net_device *netdev, int vfidx,
 	ether_addr_copy(&ivi->mac[0], macaddr);
 	ivi->vlan = oct->sriov_info.vf_vlantci[vfidx] & VLAN_VID_MASK;
 	ivi->qos = oct->sriov_info.vf_vlantci[vfidx] >> VLAN_PRIO_SHIFT;
+	if (oct->sriov_info.trusted_vf.active &&
+	    oct->sriov_info.trusted_vf.id == vfidx)
+		ivi->trusted = true;
+	else
+		ivi->trusted = false;
 	ivi->linkstate = oct->sriov_info.vf_linkstate[vfidx];
 	return 0;
 }
 
+static void trusted_vf_callback(struct octeon_device *oct_dev,
+				u32 status, void *ptr)
+{
+	struct octeon_soft_command *sc = (struct octeon_soft_command *)ptr;
+	struct lio_trusted_vf_ctx *ctx;
+
+	ctx = (struct lio_trusted_vf_ctx *)sc->ctxptr;
+	ctx->status = status;
+	WRITE_ONCE(ctx->cond, 1);
+
+	/* This barrier is required to be sure that the response has
+	 * been written fully before waking up the handler
+	 */
+	wmb();
+
+	wake_up_interruptible(&ctx->wc);
+}
+
+static int liquidio_send_vf_trust_cmd(struct lio *lio, int vfidx, bool trusted)
+{
+	struct octeon_device *oct = lio->oct_dev;
+	struct lio_trusted_vf_ctx *ctx;
+	struct octeon_soft_command *sc;
+	int ctx_size, retval;
+
+	ctx_size = sizeof(struct lio_trusted_vf_ctx);
+	sc = octeon_alloc_soft_command(oct, 0, 0, ctx_size);
+
+	ctx  = (struct lio_trusted_vf_ctx *)sc->ctxptr;
+	WRITE_ONCE(ctx->cond, 0);
+	init_waitqueue_head(&ctx->wc);
+
+	sc->iq_no = lio->linfo.txpciq[0].s.q_no;
+
+	/* vfidx is 0 based, but vf_num (param1) is 1 based */
+	octeon_prepare_soft_command(oct, sc, OPCODE_NIC,
+				    OPCODE_NIC_SET_TRUSTED_VF, 0, vfidx + 1,
+				    trusted);
+
+	sc->callback = trusted_vf_callback;
+	sc->callback_arg = sc;
+	sc->wait_time = 1000;
+
+	retval = octeon_send_soft_command(oct, sc);
+	if (retval == IQ_SEND_FAILED) {
+		retval = -1;
+	} else {
+		/* Sleep on a wait queue till the cond flag indicates that the
+		 * response arrived or timed-out.
+		 */
+		if (sleep_cond(&ctx->wc, &ctx->cond) == -EINTR)
+			return -1;
+
+		retval = ctx->status;
+	}
+
+	octeon_free_soft_command(oct, sc);
+
+	return retval;
+}
+
+static int liquidio_set_vf_trust(struct net_device *netdev, int vfidx,
+				 bool setting)
+{
+	struct lio *lio = GET_LIO(netdev);
+	struct octeon_device *oct = lio->oct_dev;
+
+	if (strcmp(oct->fw_info.liquidio_firmware_version, "1.7.1") < 0) {
+		/* trusted vf is not supported by firmware older than 1.7.1 */
+		return -EOPNOTSUPP;
+	}
+
+	if (vfidx < 0 || vfidx >= oct->sriov_info.num_vfs_alloced) {
+		netif_info(lio, drv, lio->netdev, "Invalid vfidx %d\n", vfidx);
+		return -EINVAL;
+	}
+
+	if (setting) {
+		/* Set */
+
+		if (oct->sriov_info.trusted_vf.active &&
+		    oct->sriov_info.trusted_vf.id == vfidx)
+			return 0;
+
+		if (oct->sriov_info.trusted_vf.active) {
+			netif_info(lio, drv, lio->netdev, "More than one trusted VF is not allowed\n");
+			return -EPERM;
+		}
+	} else {
+		/* Clear */
+
+		if (!oct->sriov_info.trusted_vf.active)
+			return 0;
+	}
+
+	if (!liquidio_send_vf_trust_cmd(lio, vfidx, setting)) {
+		if (setting) {
+			oct->sriov_info.trusted_vf.id = vfidx;
+			oct->sriov_info.trusted_vf.active = true;
+		} else {
+			oct->sriov_info.trusted_vf.active = false;
+		}
+
+		netif_info(lio, drv, lio->netdev, "VF %u is %strusted\n", vfidx,
+			   setting ? "" : "not ");
+	} else {
+		netif_info(lio, drv, lio->netdev, "Failed to set VF trusted\n");
+		return -1;
+	}
+
+	return 0;
+}
+
 static int liquidio_set_vf_link_state(struct net_device *netdev, int vfidx,
 				      int linkstate)
 {
@@ -3399,6 +3523,7 @@ static const struct net_device_ops lionetdevops = {
 	.ndo_set_vf_mac		= liquidio_set_vf_mac,
 	.ndo_set_vf_vlan	= liquidio_set_vf_vlan,
 	.ndo_get_vf_config	= liquidio_get_vf_config,
+	.ndo_set_vf_trust	= liquidio_set_vf_trust,
 	.ndo_set_vf_link_state  = liquidio_set_vf_link_state,
 };
 
diff --git a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
index ecc1682..82a783d 100644
--- a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
+++ b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
@@ -84,6 +84,7 @@ enum octeon_tag_type {
 #define OPCODE_NIC_IF_CFG              0x09
 #define OPCODE_NIC_VF_DRV_NOTICE       0x0A
 #define OPCODE_NIC_INTRMOD_PARAMS      0x0B
+#define OPCODE_NIC_SET_TRUSTED_VF	0x13
 #define OPCODE_NIC_SYNC_OCTEON_TIME	0x14
 #define VF_DRV_LOADED                  1
 #define VF_DRV_REMOVED                -1
@@ -918,6 +919,12 @@ union oct_nic_if_cfg {
 	} s;
 };
 
+struct lio_trusted_vf {
+	uint64_t active: 1;
+	uint64_t id : 8;
+	uint64_t reserved: 55;
+};
+
 struct lio_time {
 	s64 sec;   /* seconds */
 	s64 nsec;  /* nanoseconds */
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.h b/drivers/net/ethernet/cavium/liquidio/octeon_device.h
index 63b0c75..91937cc 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_device.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.h
@@ -370,6 +370,8 @@ struct octeon_sriov_info {
 
 	u32	sriov_enabled;
 
+	struct lio_trusted_vf	trusted_vf;
+
 	/*lookup table that maps DPI ring number to VF pci_dev struct pointer*/
 	struct pci_dev *dpiring_to_vfpcidev_lut[MAX_POSSIBLE_VFS];
 

^ permalink raw reply related

* Re: BUG_ON triggered in skb_segment
From: Yonghong Song @ 2018-03-16 22:37 UTC (permalink / raw)
  To: Eric Dumazet, Alexei Starovoitov, Steffen Klassert,
	Daniel Borkmann
  Cc: netdev, Martin Lau, saeedm, diptanu
In-Reply-To: <d6d140dd-2fc9-e67d-41f7-ef78385271da@gmail.com>


Eric and Daniel,

I have tried to fix this issue, but was not really successful.
I tried two hacks:
   . if skb_headlen(list_skb) is not 0, we just pull
     skb_headlen(list_skb) bytes from the skb to make
     skb_headlen(list_skb) = 0, or
   . if skb_headlen(list_skb) is not 0, we go to the beginning of
     the outer loop which will allocate another nskb for this list_skb.

Both approaches removed the BUG and the packet was able to reach the
remote host. Upon receiving the packet, however, the remote host sent a
reset packet back, so the connection was eventually closed. I did not
debug this further.

Considering it is tricky to change skb_segment, I hacked the test_bpf
kernel module to reproduce the issue. The change reflects the GSO packet
structure I got from mlx5. Maybe you could take a look and suggest a fix
or a direction for how to move forward.

Thanks!

============= PATCH  ===============

-bash-4.2$ git show
commit 41681ab51f85b4a0ea3416a0a62d6bde74f3af4b
Author: Yonghong Song <yhs@fb.com>
Date:   Fri Mar 16 15:10:02 2018 -0700

     [hack] hack test_bpf module to trigger BUG_ON in skb_segment.

     "modprobe test_bpf" will have the following errors:
     ...
     [   98.149165] ------------[ cut here ]------------
     [   98.159362] kernel BUG at net/core/skbuff.c:3667!
     [   98.169756] invalid opcode: 0000 [#1] SMP PTI
     [   98.179370] Modules linked in:
     [   98.179371]  test_bpf(+)
     ...

     The BUG happens in function skb_segment:
     ...
     3665                 while (pos < offset + len) {
     3666                         if (i >= nfrags) {
     3667                                 BUG_ON(skb_headlen(list_skb));
     3668
     3669                                 i = 0;
     3670                                 nfrags = skb_shinfo(list_skb)->nr_frags;
     3671                                 frag = skb_shinfo(list_skb)->frags;
     3672                                 frag_skb = list_skb;
     3673
     3674                                 BUG_ON(!nfrags);
     ...

     The skbs are constructed to mimic what mlx5 may generate.
     The packet size/header may not mimic real cases in production. But
     the processing flow is similar.

     Signed-off-by: Yonghong Song <yhs@fb.com>

diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index 2efb213..d36a991 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -6574,6 +6574,67 @@ static bool exclude_test(int test_id)
         return test_id < test_range[0] || test_id > test_range[1];
  }

+static struct sk_buff *build_test_skb(void) {
+       u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+       u32 headroom = NET_SKB_PAD + NET_IP_ALIGN + ETH_HLEN;
+       struct sk_buff *skb[2];
+       void *data[2], *page;
+       int i, data_size = 8;
+
+       page = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+       if (!page)
+               return NULL;
+
+       for (i = 0; i < 2; i++) {
+               data[i] = kzalloc(headroom + tailroom + data_size, 
GFP_KERNEL);
+               if (!data[i])
+                       return NULL;
+               skb[i] = build_skb(data[i], 0);
+               if (!skb[i]) {
+                       kfree(data[i]);
+                       return NULL;
+               }
+               skb_reserve(skb[i], headroom);
+               skb_put(skb[i], data_size);
+               skb[i]->protocol = htons(ETH_P_IP);
+               skb_reset_network_header(skb[i]);
+               skb_set_mac_header(skb[i], -ETH_HLEN);
+
+               skb_add_rx_frag(skb[i], skb_shinfo(skb[i])->nr_frags,
+                        page, 0, 64, 64);
+       }
+
+       /* setup shinfo */
+       skb_shinfo(skb[0])->gso_size = 1448;
+       skb_shinfo(skb[0])->gso_type = SKB_GSO_TCPV4;
+       skb_shinfo(skb[0])->gso_type |= SKB_GSO_DODGY;
+       skb_shinfo(skb[0])->gso_segs = 0;
+       skb_shinfo(skb[0])->frag_list = skb[1];
+
+       /* adjust skb[0]'s len */
+       skb[0]->len += skb[1]->len;
+       skb[0]->data_len += skb[1]->data_len;
+       skb[0]->truesize += skb[1]->truesize;
+
+       return skb[0];
+}
+
+static void test_skb_segment(void) {
+       netdev_features_t features;
+       struct sk_buff *skb;
+
+       features = NETIF_F_SG | NETIF_F_GSO_PARTIAL | NETIF_F_IP_CSUM | 
NETIF_F_IPV6_CSUM;
+       features |= NETIF_F_RXCSUM;
+       skb = build_test_skb();
+       if (!skb)
+               pr_info("Failed in test_skb_segment:build_test_skb!");
+
+       if (skb_segment(skb, features))
+               pr_info("Success in test_skb_segment!");
+       else
+               pr_info("Failed in test_skb_segment!");
+}
+
  static __init int test_bpf(void)
  {
         int i, err_cnt = 0, pass_cnt = 0;
@@ -6631,7 +6692,8 @@ static int __init test_bpf_init(void)
         if (ret < 0)
                 return ret;

-       ret = test_bpf();
+       // ret = test_bpf();
+       test_skb_segment();

         destroy_bpf_tests();
         return ret;


=============  END ================


On 3/13/18 6:15 PM, Eric Dumazet wrote:
> 
> 
> On 03/13/2018 05:35 PM, Eric Dumazet wrote:
>>
>>
>> On 03/13/2018 05:26 PM, Eric Dumazet wrote:
>>>
>>>
>>> On 03/13/2018 05:04 PM, Alexei Starovoitov wrote:
>>>> On 3/13/18 4:27 PM, Eric Dumazet wrote:
>>>>>
>>>>>
>>>>> On 03/13/2018 04:09 PM, Alexei Starovoitov wrote:
>>>>>
>>>>>> we have bpf_skb_proto_6_to_4() that was used by cilium for long time.
>>>>>> It's not clear why it's not crashing there, but we cannot just
>>>>>> reject changing proto in bpf programs now.
>>>>>> We have to fix whatever needs to be fixed in skb_segment
>>>>>> (if bug is there) or fix whatever necessary on mlx5 side.
>>>>>> In bpf helper we mark it as SKB_GSO_DODGY just like packets coming
>>>>>> through virtio would do, so if skb_segment() needs to do something
>>>>>> special with skb the SKB_GSO_DODGY flag is already there.
>>>>>
>>>>> 'Fixing' skb_segment(), I did that a long time ago and Herbert Xu was
>>>>> not happy with the fix and provided something else.
>>>>
>>>> any link to your old patches and discussion?
>>>>
>>>> I think since mlx4 can do tso on them and the packets came out
>>>> correct on the wire, there is nothing fundamentally wrong with
>>>> changing gso_size. Just tricky to teach skb_segment.
>>>>
>>>
>>> The world is not mlx4 only. Some NIC will ask skb_segment() fallback 
>>> segmentation for various reasons (like skb->len above a given limit 
>>> like 16KB)
>>>
>>> Maybe 
>>> https://urldefense.proofpoint.com/v2/url?u=https-3A__www.spinics.net_lists_netdev_msg255549.html&d=DwIDaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=DA8e1B5r073vIqRrFz7MRA&m=x9lG2k53B7AFsnNx6t2DgbXt06J-sLznZIHk6YdGoGg&s=0Nxx2G8PtDAEMCFAvQ7kxYTXVr9aHdOolP1KB_lnmes&e= 
>>>
>>
>>
>> Herbert patch :
>>
>> commit 9d8506cc2d7ea1f911c72c100193a3677f6668c3
>> Author: Herbert Xu <herbert@gondor.apana.org.au>
>> Date:   Thu Nov 21 11:10:04 2013 -0800
>>
>>      gso: handle new frag_list of frags GRO packets
>>
> 
> I found my initial patch.
> 
> https://urldefense.proofpoint.com/v2/url?u=https-3A__www.spinics.net_lists_netdev_msg255452.html&d=DwIDaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=DA8e1B5r073vIqRrFz7MRA&m=x9lG2k53B7AFsnNx6t2DgbXt06J-sLznZIHk6YdGoGg&s=VuWRpUdJwBwTxpnMNZYgKvQANLL5UA7hZnTFZsQlK6c&e= 
> 
> 
> 

^ permalink raw reply related

* Re: [PATCH v3 18/18] infiniband: cxgb4: Eliminate duplicate barriers on weakly-ordered archs
From: Jason Gunthorpe @ 2018-03-16 22:13 UTC (permalink / raw)
  To: Steve Wise
  Cc: 'Sinan Kaya', netdev, timur, sulrich, linux-arm-msm,
	linux-arm-kernel, 'Steve Wise', 'Doug Ledford',
	linux-rdma, linux-kernel, 'Michael Werner',
	'Casey Leedom'
In-Reply-To: <003601d3bd6a$783d6970$68b83c50$@opengridcomputing.com>

On Fri, Mar 16, 2018 at 04:05:10PM -0500, Steve Wise wrote:
> > Code includes wmb() followed by writel(). writel() already has a barrier
> on
> > some architectures like arm64.
> > 
> > This ends up CPU observing two barriers back to back before executing the
> > register write.
> > 
> > Since code already has an explicit barrier call, changing writel() to
> > writel_relaxed().
> > 
> > Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
> 
> NAK - This isn't correct for PowerPC.  For PowerPC, writeX_relaxed() is just
> writeX().  

?? Why is changing writex() to writeX() a NAK then?

> I was just looking at this with Chelsio developers, and they said the
> writeX() should be replaced with __raw_writeX(), not writeX_relaxed(), to
> get rid of the extra barrier for all architectures.

That doesn't seem semantically sane.

__raw_writeX() should not appear in driver code, IMHO. Only the arch
code can know what the exact semantics of that accessor are..

If ppc can't use writel_relaxed to optimize then we probably need yet
another io accessor semantic defined :(

Jason

^ permalink raw reply

* Re: [PATCH 0/2] net: phy: relax error checking when creating sysfs link netdev->phydev
From: Grygorii Strashko @ 2018-03-16 22:08 UTC (permalink / raw)
  To: Andrew Lunn, Florian Fainelli
  Cc: David S. Miller, netdev, Greg Kroah-Hartman, Sekhar Nori,
	linux-kernel, linux-omap
In-Reply-To: <20180316211459.GC8735@lunn.ch>



On 03/16/2018 04:14 PM, Andrew Lunn wrote:
>> I agree, let's not have you run into circles, let's just use your
>> patches as they are since they fix the problem and are not intrusive in
>> any way.
> 
> Agreed, this is too complex, for little gain.
> 

Thanks. v2 posted.

-- 
regards,
-grygorii

^ permalink raw reply

* [PATCH v2 2/2] net: phy: relax error checking when creating sysfs link netdev->phydev
From: Grygorii Strashko @ 2018-03-16 22:08 UTC (permalink / raw)
  To: David S. Miller, netdev, Andrew Lunn, Florian Fainelli,
	Greg Kroah-Hartman
  Cc: Sekhar Nori, linux-kernel, linux-omap, Grygorii Strashko
In-Reply-To: <20180316220835.30006-1-grygorii.strashko@ti.com>

Some ethernet drivers (like TI CPSW) may connect and manage >1 Net PHYs per
one netdevice, as result such drivers will produce warning during system
boot and fail to connect second phy to netdevice when PHYLIB framework
will try to create sysfs link netdev->phydev for second PHY
in phy_attach_direct(), because sysfs link with the same name has been
created already for the first PHY. As result, second CPSW external
port will become unusable.

Fix it by relaxing error checking when PHYLIB framework is creating sysfs
link netdev->phydev in phy_attach_direct(), suppressing warning by using
sysfs_create_link_nowarn() and adding error message instead.
After this change links (phy->netdev and netdev->phy) creation failure is not
fatal any more and system can continue working, which fixes TI CPSW issue.

Cc: Florian Fainelli <f.fainelli@gmail.com>
Cc: Andrew Lunn <andrew@lunn.ch>
Fixes: a3995460491d ("net: phy: Relax error checking on sysfs_create_link()")
Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
---
 drivers/net/phy/phy_device.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 478405e..fe16f58 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1012,10 +1012,17 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
 	err = sysfs_create_link(&phydev->mdio.dev.kobj, &dev->dev.kobj,
 				"attached_dev");
 	if (!err) {
-		err = sysfs_create_link(&dev->dev.kobj, &phydev->mdio.dev.kobj,
-					"phydev");
-		if (err)
-			goto error;
+		err = sysfs_create_link_nowarn(&dev->dev.kobj,
+					       &phydev->mdio.dev.kobj,
+					       "phydev");
+		if (err) {
+			dev_err(&dev->dev, "could not add device link to %s err %d\n",
+				kobject_name(&phydev->mdio.dev.kobj),
+				err);
+			/* non-fatal - some net drivers can use one netdevice
+			 * with more then one phy
+			 */
+		}

 		phydev->sysfs_links = true;
 	}
-- 
2.10.5

^ permalink raw reply related

* [PATCH v2 1/2] sysfs: symlink: export sysfs_create_link_nowarn()
From: Grygorii Strashko @ 2018-03-16 22:08 UTC (permalink / raw)
  To: David S. Miller, netdev, Andrew Lunn, Florian Fainelli,
	Greg Kroah-Hartman
  Cc: Sekhar Nori, linux-kernel, linux-omap, Grygorii Strashko
In-Reply-To: <20180316220835.30006-1-grygorii.strashko@ti.com>

The sysfs_create_link_nowarn() is going to be used in phylib framework in
subsequent patch which can be built as module. Hence, export
sysfs_create_link_nowarn() to avoid build errors.

Cc: Florian Fainelli <f.fainelli@gmail.com>
Cc: Andrew Lunn <andrew@lunn.ch>
Fixes: a3995460491d ("net: phy: Relax error checking on sysfs_create_link()")
Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
---
"Fixes" added as there is dependency this and subsequent patch.
 fs/sysfs/symlink.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 8664db2..215c225 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -106,6 +106,7 @@ int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target,
 {
 	return sysfs_do_create_link(kobj, target, name, 0);
 }
+EXPORT_SYMBOL_GPL(sysfs_create_link_nowarn);
 
 /**
  *	sysfs_delete_link - remove symlink in object's directory.
-- 
2.10.5

^ permalink raw reply related

* [PATCH v2 0/2] net: phy: relax error checking when creating sysfs link netdev->phydev
From: Grygorii Strashko @ 2018-03-16 22:08 UTC (permalink / raw)
  To: David S. Miller, netdev, Andrew Lunn, Florian Fainelli,
	Greg Kroah-Hartman
  Cc: Sekhar Nori, linux-kernel, linux-omap, Grygorii Strashko

Some ethernet drivers (like TI CPSW) may connect and manage >1 Net PHYs per
one netdevice, as result such drivers will produce warning during system
boot and fail to connect second phy to netdevice when PHYLIB framework
will try to create sysfs link netdev->phydev for second PHY
in phy_attach_direct(), because sysfs link with the same name has been
created already for the first PHY.
As a result, the second CPSW external port will become unusable.
This regression was introduced by commits:
5568363f0cb3 ("net: phy: Create sysfs reciprocal links for attached_dev/phydev")
a3995460491d ("net: phy: Relax error checking on sysfs_create_link()")

Patch 1: exports sysfs_create_link_nowarn() function as preparation for Patch 2.
Patch 2: relaxes error checking when PHYLIB framework is creating sysfs
link netdev->phydev in phy_attach_direct(), suppresses warning by using
sysfs_create_link_nowarn() and adds error message instead, so links creation
failure is not fatal any more and system can continue working,
which fixes TI CPSW issue and makes boot logs accessible
in case of NFS boot, for example.

This can be stable material 4.13+.

Changes in v2:
- commit messages updated.

v1: 
 https://patchwork.ozlabs.org/cover/886058/

Cc: Florian Fainelli <f.fainelli@gmail.com>
Cc: Andrew Lunn <andrew@lunn.ch>
Grygorii Strashko (2):
  sysfs: symlink: export sysfs_create_link_nowarn()
  net: phy: relax error checking when creating sysfs link netdev->phydev

 drivers/net/phy/phy_device.c | 15 +++++++++++----
 fs/sysfs/symlink.c           |  1 +
 2 files changed, 12 insertions(+), 4 deletions(-)

-- 
2.10.5

^ permalink raw reply

* Re: [PATCH RFC 0/2] Add support for warnings to extack
From: Jakub Kicinski @ 2018-03-16 22:05 UTC (permalink / raw)
  To: Marcelo Ricardo Leitner; +Cc: netdev, Alexander Aring, Jiri Pirko, David Ahern
In-Reply-To: <cover.1521226621.git.marcelo.leitner@gmail.com>

CC: David Ahern <dsahern@gmail.com>

On Fri, 16 Mar 2018 16:23:08 -0300, Marcelo Ricardo Leitner wrote:
> Currently we have the limitation that warnings cannot be reported through
> extack. For example, when tc flower failed to get offloaded but got
> installed on software datapath. The hardware failure is not fatal and
> thus extack is not even shared with the driver, so the error is simply
> omitted from any logging.
> 
> The idea here is to allow such kind of warnings to get through and be
> available for the sysadmin or the tool managing such commands (like Open
> vSwitch), so that if this happens, we will have such log message in a
> file later.
> 
> The first patch extends extack to support more than one message and with
> different log level (currently only error and warning). The second
> shares extack with the drivers regardless of skip_sw.
> 
> The iproute patch also follows.
> 
> This kernel change is backward compatible with older iproute because
> iproute will only process the last message, which should be the error
> one in case of failure, or a warning if it succeeded.
> 
> The iproute change is compatible with older kernels because it will find
> only one message to be processed and will handle it properly.
> 
> With these patches, this is now possible:
> # tc qdisc add dev p7p1 ingress
> # tc filter add dev p7p1 parent ffff: protocol ip prio 1 flower \
> 	src_mac ec:13:db:00:00:00 dst_mac ec:14:c2:00:00:00 \
> 	src_ip 56.0.0.0 dst_ip 55.0.0.0 action drop
> Warning: TC offload is disabled on net device.
> # echo $?
> 0

IMHO this set does more and less than is required to solve the
problem.  

The way I understand it is we don't want HW offload errors/warnings to
be printed to unsuspecting users who didn't specify any skip_* flags.
What carries the message and whether it's explicitly marked as warning
or error does not change the fact that a user of the SW fwd path may not
want to be bothered by offload warnings.

There may well be value in the ability to report multiple messages.  But
for opt-in warning messages I would be leaning towards:

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index e828d31be5dae0ae8c69016dfde50379296484aa..7cec393bb47974b48a6d510b8aa84534a7a98594 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -705,8 +705,7 @@ tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common,
 	cls_common->chain_index = tp->chain->index;
 	cls_common->protocol = tp->protocol;
 	cls_common->prio = tp->prio;
-	if (tc_skip_sw(flags))
+	if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_OFFLOAD_VERBOSE)
		cls_common->extack = extack;
 }
 
 enum tc_fl_command {

That is admittedly quite conservative.  Esp. in case of flower, cls_bpf
is used in SW far more than HW, not to mention qdisc offload (although
flag would be different there)!

^ permalink raw reply related

* [PATCH v2 net 2/2] vmxnet3: use correct flag to indicate LRO feature
From: Ronak Doshi @ 2018-03-16 21:49 UTC (permalink / raw)
  To: netdev
  Cc: rachel_lunnon, Ronak Doshi, Shrikrishna Khare, VMware, Inc.,
	open list

'Commit 45dac1d6ea04 ("vmxnet3: Changes for vmxnet3 adapter version 2
(fwd)")' introduced a flag "lro" in structure vmxnet3_adapter which is
used to indicate whether LRO is enabled or not. However, the patch
did not set the flag and hence it was never exercised.

So, when LRO is enabled, it resulted in poor TCP performance due to
delayed acks. This issue is seen with packets which are larger than
the mss getting a delayed ack rather than an immediate ack, thus
resulting in high latency.

This patch removes the lro flag and directly uses device features
against NETIF_F_LRO to check if lro is enabled.

Fixes: 45dac1d6ea04 ("vmxnet3: Changes for vmxnet3 adapter version 2 (fwd)")
Reported-by: Rachel Lunnon <rachel_lunnon@stormagic.com>
Signed-off-by: Ronak Doshi <doshir@vmware.com>
Acked-by: Shrikrishna Khare <skhare@vmware.com>
---
Changes in v2:
 - Added "Fixes:" tag for the commit which introduced this issue

 drivers/net/vmxnet3/vmxnet3_drv.c | 3 ++-
 drivers/net/vmxnet3/vmxnet3_int.h | 5 ++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index b466a422b72d..e04937f44f33 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -1473,7 +1473,8 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
 			vmxnet3_rx_csum(adapter, skb,
 					(union Vmxnet3_GenericDesc *)rcd);
 			skb->protocol = eth_type_trans(skb, adapter->netdev);
-			if (!rcd->tcp || !adapter->lro)
+			if (!rcd->tcp ||
+			    !(adapter->netdev->features & NETIF_F_LRO))
 				goto not_lro;
 
 			if (segCnt != 0 && mss != 0) {
diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h
index b94fdfd0b6f1..99387a4a20a8 100644
--- a/drivers/net/vmxnet3/vmxnet3_int.h
+++ b/drivers/net/vmxnet3/vmxnet3_int.h
@@ -69,10 +69,10 @@
 /*
  * Version numbers
  */
-#define VMXNET3_DRIVER_VERSION_STRING   "1.4.12.0-k"
+#define VMXNET3_DRIVER_VERSION_STRING   "1.4.13.0-k"
 
 /* a 32-bit int, each byte encode a verion number in VMXNET3_DRIVER_VERSION */
-#define VMXNET3_DRIVER_VERSION_NUM      0x01040c00
+#define VMXNET3_DRIVER_VERSION_NUM      0x01040d00
 
 #if defined(CONFIG_PCI_MSI)
 	/* RSS only makes sense if MSI-X is supported. */
@@ -343,7 +343,6 @@ struct vmxnet3_adapter {
 	u8                              version;
 
 	bool				rxcsum;
-	bool				lro;
 
 #ifdef VMXNET3_RSS
 	struct UPT1_RSSConf		*rss_conf;
-- 
2.11.0

^ permalink raw reply related

* [PATCH v2 net 1/2] vmxnet3: avoid xmit reset due to a race in vmxnet3
From: Ronak Doshi @ 2018-03-16 21:47 UTC (permalink / raw)
  To: netdev; +Cc: ntanaka, Ronak Doshi, Shrikrishna Khare, VMware, Inc., open list

The field txNumDeferred is used by the driver to keep track of the number
of packets it has pushed to the emulation. The driver increments it on
pushing the packet to the emulation and the emulation resets it to 0 at
the end of the transmit.

There is a possibility of a race either when (a) ESX is under heavy load or
(b) workload inside VM is of low packet rate.

This race results in xmit hangs when network coalescing is disabled. This
change creates a local copy of txNumDeferred and uses it to perform ring
arithmetic.

Reported-by: Noriho Tanaka <ntanaka@vmware.com>
Signed-off-by: Ronak Doshi <doshir@vmware.com>
Acked-by: Shrikrishna Khare <skhare@vmware.com>
---
Changes in v2:
 - Used lowercase letters for local variables

 drivers/net/vmxnet3/vmxnet3_drv.c | 13 ++++++++-----
 drivers/net/vmxnet3/vmxnet3_int.h |  4 ++--
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index 8b39c160743d..b466a422b72d 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -977,6 +977,8 @@ vmxnet3_tq_xmit(struct sk_buff *skb, struct vmxnet3_tx_queue *tq,
 {
 	int ret;
 	u32 count;
+	int num_pkts;
+	int tx_num_deferred;
 	unsigned long flags;
 	struct vmxnet3_tx_ctx ctx;
 	union Vmxnet3_GenericDesc *gdesc;
@@ -1075,12 +1077,12 @@ vmxnet3_tq_xmit(struct sk_buff *skb, struct vmxnet3_tx_queue *tq,
 #else
 	gdesc = ctx.sop_txd;
 #endif
+	tx_num_deferred = le32_to_cpu(tq->shared->txNumDeferred);
 	if (ctx.mss) {
 		gdesc->txd.hlen = ctx.eth_ip_hdr_size + ctx.l4_hdr_size;
 		gdesc->txd.om = VMXNET3_OM_TSO;
 		gdesc->txd.msscof = ctx.mss;
-		le32_add_cpu(&tq->shared->txNumDeferred, (skb->len -
-			     gdesc->txd.hlen + ctx.mss - 1) / ctx.mss);
+		num_pkts = (skb->len - gdesc->txd.hlen + ctx.mss - 1) / ctx.mss;
 	} else {
 		if (skb->ip_summed == CHECKSUM_PARTIAL) {
 			gdesc->txd.hlen = ctx.eth_ip_hdr_size;
@@ -1091,8 +1093,10 @@ vmxnet3_tq_xmit(struct sk_buff *skb, struct vmxnet3_tx_queue *tq,
 			gdesc->txd.om = 0;
 			gdesc->txd.msscof = 0;
 		}
-		le32_add_cpu(&tq->shared->txNumDeferred, 1);
+		num_pkts = 1;
 	}
+	le32_add_cpu(&tq->shared->txNumDeferred, num_pkts);
+	tx_num_deferred += num_pkts;
 
 	if (skb_vlan_tag_present(skb)) {
 		gdesc->txd.ti = 1;
@@ -1118,8 +1122,7 @@ vmxnet3_tq_xmit(struct sk_buff *skb, struct vmxnet3_tx_queue *tq,
 
 	spin_unlock_irqrestore(&tq->tx_lock, flags);
 
-	if (le32_to_cpu(tq->shared->txNumDeferred) >=
-					le32_to_cpu(tq->shared->txThreshold)) {
+	if (tx_num_deferred >= le32_to_cpu(tq->shared->txThreshold)) {
 		tq->shared->txNumDeferred = 0;
 		VMXNET3_WRITE_BAR0_REG(adapter,
 				       VMXNET3_REG_TXPROD + tq->qid * 8,
diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h
index 5ba222920e80..b94fdfd0b6f1 100644
--- a/drivers/net/vmxnet3/vmxnet3_int.h
+++ b/drivers/net/vmxnet3/vmxnet3_int.h
@@ -69,10 +69,10 @@
 /*
  * Version numbers
  */
-#define VMXNET3_DRIVER_VERSION_STRING   "1.4.11.0-k"
+#define VMXNET3_DRIVER_VERSION_STRING   "1.4.12.0-k"
 
 /* a 32-bit int, each byte encode a verion number in VMXNET3_DRIVER_VERSION */
-#define VMXNET3_DRIVER_VERSION_NUM      0x01040b00
+#define VMXNET3_DRIVER_VERSION_NUM      0x01040c00
 
 #if defined(CONFIG_PCI_MSI)
 	/* RSS only makes sense if MSI-X is supported. */
-- 
2.11.0

^ permalink raw reply related

* Re: [PATCH v3 18/18] infiniband: cxgb4: Eliminate duplicate barriers on weakly-ordered archs
From: Sinan Kaya @ 2018-03-16 21:46 UTC (permalink / raw)
  To: Steve Wise, netdev, timur, sulrich
  Cc: linux-arm-msm, linux-arm-kernel, 'Steve Wise',
	'Doug Ledford', 'Jason Gunthorpe', linux-rdma,
	linux-kernel, 'Michael Werner', 'Casey Leedom'
In-Reply-To: <003601d3bd6a$783d6970$68b83c50$@opengridcomputing.com>

On 3/16/2018 5:05 PM, Steve Wise wrote:
>> Code includes wmb() followed by writel(). writel() already has a barrier
> on
>> some architectures like arm64.
>>
>> This ends up CPU observing two barriers back to back before executing the
>> register write.
>>
>> Since code already has an explicit barrier call, changing writel() to
>> writel_relaxed().
>>
>> Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
> 
> NAK - This isn't correct for PowerPC.  For PowerPC, writeX_relaxed() is just
> writeX().  
> 
> I was just looking at this with Chelsio developers, and they said the
> writeX() should be replaced with __raw_writeX(), not writeX_relaxed(), to
> get rid of the extra barrier for all architectures.

OK. I can do that but isn't the problem at PowerPC adaptation?

/*
 * We don't do relaxed operations yet, at least not with this semantic
 */
#define readb_relaxed(addr)	readb(addr)
#define readw_relaxed(addr)	readw(addr)
#define readl_relaxed(addr)	readl(addr)
#define readq_relaxed(addr)	readq(addr)
#define writeb_relaxed(v, addr)	writeb(v, addr)
#define writew_relaxed(v, addr)	writew(v, addr)
#define writel_relaxed(v, addr)	writel(v, addr)
#define writeq_relaxed(v, addr)	writeq(v, addr)

Why don't we fix the PowerPC's relaxed operators? Is that a bigger task?

From an API perspective both __raw_writeX() and writeX_relaxed() are correct.
It is just that PowerPC doesn't seem to follow the definition yet.

^ permalink raw reply

* Re: [pci PATCH v7 0/5] Add support for unmanaged SR-IOV
From: Don Dutile @ 2018-03-16 21:42 UTC (permalink / raw)
  To: Alexander Duyck, bhelgaas, alexander.h.duyck, linux-pci
  Cc: virtio-dev, kvm, netdev, dan.daly, linux-kernel, linux-nvme,
	keith.busch, netanel, mheyne, liang-min.wang, mark.d.rustad,
	dwmw2, hch, dwmw
In-Reply-To: <20180315183449.3102.64791.stgit@localhost.localdomain>

On 03/15/2018 02:40 PM, Alexander Duyck wrote:
> This series is meant to add support for SR-IOV on devices when the VFs are
> not managed by the kernel. Examples of recent patches attempting to do this
> include:
> virtio - https://patchwork.kernel.org/patch/10241225/
> pci-stub - https://patchwork.kernel.org/patch/10109935/
> vfio - https://patchwork.kernel.org/patch/10103353/
> uio - https://patchwork.kernel.org/patch/9974031/
> 
> Since this is quickly blowing up into a multi-driver problem it is probably
> best to implement this solution as generically as possible.
> 
> This series is an attempt to do that. What we do with this patch set is
> provide a generic framework to enable SR-IOV in the case that the PF driver
> doesn't support managing the VFs itself.
> 
> I based my patch set originally on the patch by Mark Rustad but there isn't
> much left after going through and cleaning out the bits that were no longer
> needed, and after incorporating the feedback from David Miller. At this point
> the only items to be fully reused was his patch description which is now
> present in patch 3 of the set.
> 
> This solution is limited in scope to just adding support for devices that
> provide no functionality for SR-IOV other than allocating the VFs by
> calling pci_enable_sriov. Previous sets had included patches for VFIO, but
> for now I am dropping that as the scope of that work is larger than I
> think I can take on at this time.
> 
> v2: Reduced scope back to just virtio_pci and vfio-pci
>      Broke into 3 patch set from single patch
>      Changed autoprobe behavior to always set when num_vfs is set non-zero
> v3: Updated Documentation to clarify when sriov_unmanaged_autoprobe is used
>      Wrapped vfio_pci_sriov_configure to fix build errors w/o SR-IOV in kernel
> v4: Dropped vfio-pci patch
>      Added ena and nvme to drivers now using pci_sriov_configure_unmanaged
>      Dropped pci_disable_sriov call in virtio_pci to be consistent with ena
> v5: Dropped sriov_unmanaged_autoprobe and pci_sriov_configure_unmanaged
>      Added new patch that enables pci_sriov_configure_simple
>      Updated drivers to use pci_sriov_configure_simple
> v6: Defined pci_sriov_configure_simple as NULL when SR-IOV is not enabled
>      Updated drivers to drop "#ifdef" checks for IOV
>      Added pci-pf-stub as place for PF-only drivers to add support
> v7: Dropped pci_id table explanation from pci-pf-stub driver
>      Updated pci_sriov_configure_simple to drop need for err value
>      Fixed comment explaining why pci_sriov_configure_simple is NULL
> 
> Cc: Mark Rustad <mark.d.rustad@intel.com>
> Cc: Maximilian Heyne <mheyne@amazon.de>
> Cc: Liang-Min Wang <liang-min.wang@intel.com>
> Cc: David Woodhouse <dwmw@amazon.co.uk>
> 
> ---
> 
> Alexander Duyck (5):
>        pci: Add pci_sriov_configure_simple for PFs that don't manage VF resources
>        virtio_pci: Add support for unmanaged SR-IOV on virtio_pci devices
>        ena: Migrate over to unmanaged SR-IOV support
>        nvme: Migrate over to unmanaged SR-IOV support
>        pci-pf-stub: Add PF driver stub for PFs that function only to enable VFs
> 
> 
>   drivers/net/ethernet/amazon/ena/ena_netdev.c |   28 -------------
>   drivers/nvme/host/pci.c                      |   20 ----------
>   drivers/pci/Kconfig                          |   12 ++++++
>   drivers/pci/Makefile                         |    2 +
>   drivers/pci/iov.c                            |   31 +++++++++++++++
>   drivers/pci/pci-pf-stub.c                    |   54 ++++++++++++++++++++++++++
>   drivers/virtio/virtio_pci_common.c           |    1
>   include/linux/pci.h                          |    3 +
>   include/linux/pci_ids.h                      |    2 +
>   9 files changed, 107 insertions(+), 46 deletions(-)
>   create mode 100644 drivers/pci/pci-pf-stub.c
> 
> --
> 
For what it's worth.

Good, simpler start for this type of support/effort.
Thanks for the multiple versions to get to this point.

Reviewed-by: Donald Dutile <ddutile@redhat.com>

^ permalink raw reply

* Re: [RFC 2/2] page_frag_cache: Store metadata in struct page
From: Eric Dumazet @ 2018-03-16 21:32 UTC (permalink / raw)
  To: Matthew Wilcox, Alexander Duyck
  Cc: Alexander Duyck, linux-mm, Netdev, Matthew Wilcox
In-Reply-To: <20180316210500.GH27498@bombadil.infradead.org>



On 03/16/2018 02:05 PM, Matthew Wilcox wrote:
  
> 
> Obviously if the problem turns out to be the cacheline thrashing rather
> than the call to page_to_virt, then this is pointless to test.
> 
>> I won't be able to test the patches until next week, but I expect I
>> will probably see a noticeable regression when performing a small
>> packet routing test.
> 
> I really appreciate you being willing to try this for me.  I need to
> get myself a dual-socket machine to test things like this.
>

It seems my prior mail/answer was lost.

Issue is cacheline thrashing indeed, particularly on PowerPC (64 KB pages)

^ permalink raw reply

* [PATCH RFC v2 7/7] net: phy: remove phy_stop_machine
From: Heiner Kallweit @ 2018-03-16 21:25 UTC (permalink / raw)
  To: Florian Fainelli, Andrew Lunn, Geert Uytterhoeven; +Cc: netdev@vger.kernel.org
In-Reply-To: <6618f53c-778a-cb12-deb4-c618a728b43e@gmail.com>

Now that the functionality of phy_stop() was integrated to __phy_stop()
we can remove phy_stop_machine().

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
v2:
- no changes
---
 drivers/net/phy/phy.c        | 18 ------------------
 drivers/net/phy/phy_device.c |  2 --
 include/linux/phy.h          |  1 -
 3 files changed, 21 deletions(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 6ffb9952..b6a24ab8 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -565,24 +565,6 @@ void phy_trigger_machine(struct phy_device *phydev, bool sync)
 	queue_delayed_work(system_power_efficient_wq, &phydev->state_queue, 0);
 }
 
-/**
- * phy_stop_machine - stop the PHY state machine tracking
- * @phydev: target phy_device struct
- *
- * Description: Stops the state machine delayed workqueue, sets the
- *   state to UP (unless it wasn't up yet). This function must be
- *   called BEFORE phy_detach.
- */
-void phy_stop_machine(struct phy_device *phydev)
-{
-	cancel_delayed_work_sync(&phydev->state_queue);
-
-	mutex_lock(&phydev->lock);
-	if (phydev->state > PHY_UP && phydev->state != PHY_HALTED)
-		phydev->state = PHY_UP;
-	mutex_unlock(&phydev->lock);
-}
-
 /**
  * phy_error - enter HALTED state for this PHY device
  * @phydev: target phy_device struct
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index a33dec37..a0c39b4d 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -831,8 +831,6 @@ void phy_disconnect(struct phy_device *phydev)
 	if (phydev->irq > 0)
 		phy_stop_interrupts(phydev);
 
-	phy_stop_machine(phydev);
-
 	phydev->adjust_link = NULL;
 
 	phy_detach(phydev);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 780c2690..5c953bd3 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1023,7 +1023,6 @@ int phy_drivers_register(struct phy_driver *new_driver, int n,
 void phy_state_machine(struct work_struct *work);
 void phy_change_work(struct work_struct *work);
 void phy_mac_interrupt(struct phy_device *phydev);
-void phy_stop_machine(struct phy_device *phydev);
 void phy_trigger_machine(struct phy_device *phydev, bool sync);
 int phy_ethtool_sset(struct phy_device *phydev, struct ethtool_cmd *cmd);
 void phy_ethtool_ksettings_get(struct phy_device *phydev,
-- 
2.16.2

- Address error reported by Geert by changing call to phy_link_down()

^ permalink raw reply related

* [PATCH RFC v2 6/7] net: phy: use new function phy_stop_suspending in mdio_bus_phy_suspend
From: Heiner Kallweit @ 2018-03-16 21:24 UTC (permalink / raw)
  To: Florian Fainelli, Andrew Lunn, Geert Uytterhoeven; +Cc: netdev@vger.kernel.org
In-Reply-To: <6618f53c-778a-cb12-deb4-c618a728b43e@gmail.com>

Use new function phy_stop_suspending() in mdio_bus_phy_suspend() to also
disable interrupts and set link state to down.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
v2:
- no changes
---
 drivers/net/phy/phy_device.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index ed97f152..a33dec37 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -139,6 +139,7 @@ static bool mdio_bus_phy_needs_start(struct phy_device *phydev)
 static int mdio_bus_phy_suspend(struct device *dev)
 {
 	struct phy_device *phydev = to_phy_device(dev);
+	int ret = 0;
 
 	/* We must stop the state machine manually, otherwise it stops out of
 	 * control, possibly with the phydev->lock held. Upon resume, netdev
@@ -146,9 +147,11 @@ static int mdio_bus_phy_suspend(struct device *dev)
 	 * lead to a deadlock.
 	 */
 	if (phydev->attached_dev && phydev->adjust_link)
-		phy_stop_machine(phydev);
+		phy_stop_suspending(phydev);
+	else
+		ret = phy_suspend(phydev);
 
-	return phy_suspend(phydev);
+	return ret;
 }
 
 static int mdio_bus_phy_resume(struct device *dev)
-- 
2.16.2

^ permalink raw reply related

* [PATCH RFC v2 5/7] net: phy: make phy_stop synchronous
From: Heiner Kallweit @ 2018-03-16 21:23 UTC (permalink / raw)
  To: Florian Fainelli, Andrew Lunn, Geert Uytterhoeven; +Cc: netdev@vger.kernel.org
In-Reply-To: <6618f53c-778a-cb12-deb4-c618a728b43e@gmail.com>

Currently phy_stop() just sets the state to PHY_HALTED and relies on the
state machine to do the remaining work. It can take up to 1s until the
state machine runs again, which causes issues in situations where e.g.
driver / device is brought down directly after executing phy_stop().

Fix this by executing all phy_stop() activities synchronously.

Add a function phy_stop_suspending() which does basically the same as
phy_stop() and just adopts the state adjustment logic from
phy_stop_machine() to inform the resume callback about the status of
the PHY before suspending.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
Changes in v2:
- Extend documenting comment for phy_stop_suspending
- Address error reported by Geert by changing call to phy_link_down()
  to not trigger a linkwatch event
---
 drivers/net/phy/phy.c | 70 ++++++++++++++++++++++++++++++++++++++-------------
 include/linux/phy.h   |  1 +
 2 files changed, 53 insertions(+), 18 deletions(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 0ca1672a..6ffb9952 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -737,21 +737,45 @@ int phy_stop_interrupts(struct phy_device *phydev)
 }
 EXPORT_SYMBOL(phy_stop_interrupts);
 
-/**
- * phy_stop - Bring down the PHY link, and stop checking the status
- * @phydev: target phy_device struct
- */
-void phy_stop(struct phy_device *phydev)
+static void phy_link_up(struct phy_device *phydev)
+{
+	phydev->phy_link_change(phydev, true, true);
+	phy_led_trigger_change_speed(phydev);
+}
+
+static void phy_link_down(struct phy_device *phydev, bool do_carrier)
+{
+	phydev->phy_link_change(phydev, false, do_carrier);
+	phy_led_trigger_change_speed(phydev);
+}
+
+static void __phy_stop(struct phy_device *phydev, bool suspending)
 {
 	mutex_lock(&phydev->lock);
 
 	if (PHY_HALTED == phydev->state)
 		goto out_unlock;
 
+	/* stop state machine */
+	cancel_delayed_work_sync(&phydev->state_queue);
+
 	if (phy_interrupt_is_valid(phydev))
 		phy_disable_interrupts(phydev);
 
-	phydev->state = PHY_HALTED;
+	if (phydev->link) {
+		phydev->link = 0;
+		if (phydev->adjust_link)
+			phy_link_down(phydev, false);
+	}
+
+	phy_suspend(phydev);
+
+	if (suspending) {
+		if (phydev->state > PHY_UP && phydev->state != PHY_HALTED)
+			phydev->state = PHY_UP;
+	} else {
+		phydev->state = PHY_HALTED;
+	}
 
 out_unlock:
 	mutex_unlock(&phydev->lock);
@@ -761,8 +785,30 @@ void phy_stop(struct phy_device *phydev)
 	 * will not reenable interrupts.
 	 */
 }
+
+/**
+ * phy_stop - Bring down the PHY link, and stop checking the status
+ * @phydev: target phy_device struct
+ */
+void phy_stop(struct phy_device *phydev)
+{
+	__phy_stop(phydev, false);
+}
 EXPORT_SYMBOL(phy_stop);
 
+/**
+ * phy_stop_suspending - Bring down the PHY link, preparing for system suspend
+ * @phydev: target phy_device struct
+ *
+ * Description: Basically the same as phy_stop(), just sets the state to UP
+ * (unless it wasn't up yet)
+ */
+void phy_stop_suspending(struct phy_device *phydev)
+{
+	__phy_stop(phydev, true);
+}
+EXPORT_SYMBOL(phy_stop_suspending);
+
 /**
  * phy_start - start or restart a PHY device
  * @phydev: target phy_device struct
@@ -804,18 +850,6 @@ void phy_start(struct phy_device *phydev)
 }
 EXPORT_SYMBOL(phy_start);
 
-static void phy_link_up(struct phy_device *phydev)
-{
-	phydev->phy_link_change(phydev, true, true);
-	phy_led_trigger_change_speed(phydev);
-}
-
-static void phy_link_down(struct phy_device *phydev, bool do_carrier)
-{
-	phydev->phy_link_change(phydev, false, do_carrier);
-	phy_led_trigger_change_speed(phydev);
-}
-
 /**
  * phy_state_machine - Handle the state machine
  * @work: work_struct that describes the work to be done
diff --git a/include/linux/phy.h b/include/linux/phy.h
index bc7aa93c..780c2690 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -941,6 +941,7 @@ void phy_disconnect(struct phy_device *phydev);
 void phy_detach(struct phy_device *phydev);
 void phy_start(struct phy_device *phydev);
 void phy_stop(struct phy_device *phydev);
+void phy_stop_suspending(struct phy_device *phydev);
 int phy_start_aneg(struct phy_device *phydev);
 int phy_aneg_done(struct phy_device *phydev);
 
-- 
2.16.2

^ permalink raw reply related

* [PATCH RFC v2 4/7] net: phy: remove phy_start_machine
From: Heiner Kallweit @ 2018-03-16 21:15 UTC (permalink / raw)
  To: Florian Fainelli, Andrew Lunn, Geert Uytterhoeven; +Cc: netdev@vger.kernel.org
In-Reply-To: <6618f53c-778a-cb12-deb4-c618a728b43e@gmail.com>

Now that phy_start() has integrated the functionality of
phy_start_machine(), the latter can be removed.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
v2:
- no changes
---
 drivers/net/phy/phy.c        | 16 ----------------
 drivers/net/phy/phy_device.c |  1 -
 drivers/net/phy/phylink.c    |  1 -
 include/linux/phy.h          |  1 -
 4 files changed, 19 deletions(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 0aef35ef..0ca1672a 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -546,22 +546,6 @@ int phy_start_aneg(struct phy_device *phydev)
 }
 EXPORT_SYMBOL(phy_start_aneg);
 
-/**
- * phy_start_machine - start PHY state machine tracking
- * @phydev: the phy_device struct
- *
- * Description: The PHY infrastructure can run a state machine
- *   which tracks whether the PHY is starting up, negotiating,
- *   etc.  This function starts the delayed workqueue which tracks
- *   the state of the PHY. If you want to maintain your own state machine,
- *   do not call this function.
- */
-void phy_start_machine(struct phy_device *phydev)
-{
-	queue_delayed_work(system_power_efficient_wq, &phydev->state_queue, HZ);
-}
-EXPORT_SYMBOL_GPL(phy_start_machine);
-
 /**
  * phy_trigger_machine - trigger the state machine to run
  *
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index c934725b..ed97f152 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -769,7 +769,6 @@ int phy_connect_direct(struct net_device *dev, struct phy_device *phydev,
 		return rc;
 
 	phy_prepare_link(phydev, handler);
-	phy_start_machine(phydev);
 	if (phydev->irq > 0)
 		phy_start_interrupts(phydev);
 
diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 51a011a3..402d0889 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -694,7 +694,6 @@ static int phylink_bringup_phy(struct phylink *pl, struct phy_device *phy)
 		   __ETHTOOL_LINK_MODE_MASK_NBITS, pl->supported,
 		   phy->advertising);
 
-	phy_start_machine(phy);
 	if (phy->irq > 0)
 		phy_start_interrupts(phy);
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 68127b00..bc7aa93c 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1022,7 +1022,6 @@ int phy_drivers_register(struct phy_driver *new_driver, int n,
 void phy_state_machine(struct work_struct *work);
 void phy_change_work(struct work_struct *work);
 void phy_mac_interrupt(struct phy_device *phydev);
-void phy_start_machine(struct phy_device *phydev);
 void phy_stop_machine(struct phy_device *phydev);
 void phy_trigger_machine(struct phy_device *phydev, bool sync);
 int phy_ethtool_sset(struct phy_device *phydev, struct ethtool_cmd *cmd);
-- 
2.16.2

^ permalink raw reply related

* [PATCH RFC v2 3/7] net: phy: resume PHY only if needed in mdio_bus_phy_suspend
From: Heiner Kallweit @ 2018-03-16 21:15 UTC (permalink / raw)
  To: Florian Fainelli, Andrew Lunn, Geert Uytterhoeven; +Cc: netdev@vger.kernel.org
In-Reply-To: <6618f53c-778a-cb12-deb4-c618a728b43e@gmail.com>

Currently the PHY is unconditionally resumed in mdio_bus_phy_resume().
In cases where the PHY was sleeping before suspending or if somebody else
takes care of resuming later, this is not needed and wastes energy.

Also start the state machine only if it's used by the driver (indicated
by the adjust_link callback being defined).

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
v2:
- rename variable in mdio_bus_phy_needs_start
---
 drivers/net/phy/phy_device.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 85ebb969..c934725b 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -124,6 +124,18 @@ static bool phy_may_suspend(struct phy_device *phydev)
 }
 
 #ifdef CONFIG_PM
+
+static bool mdio_bus_phy_needs_start(struct phy_device *phydev)
+{
+	bool needs_start;
+
+	mutex_lock(&phydev->lock);
+	needs_start = phydev->state == PHY_UP && phydev->adjust_link;
+	mutex_unlock(&phydev->lock);
+
+	return needs_start;
+}
+
 static int mdio_bus_phy_suspend(struct device *dev)
 {
 	struct phy_device *phydev = to_phy_device(dev);
@@ -142,16 +154,17 @@ static int mdio_bus_phy_suspend(struct device *dev)
 static int mdio_bus_phy_resume(struct device *dev)
 {
 	struct phy_device *phydev = to_phy_device(dev);
-	int ret;
+	int ret = 0;
 
-	ret = phy_resume(phydev);
-	if (ret < 0)
-		return ret;
+	if (!phydev->attached_dev)
+		return 0;
 
-	if (phydev->attached_dev && phydev->adjust_link)
-		phy_start_machine(phydev);
+	if (mdio_bus_phy_needs_start(phydev))
+		phy_start(phydev);
+	else if (!phydev->adjust_link)
+		ret = phy_resume(phydev);
 
-	return 0;
+	return ret;
 }
 
 static int mdio_bus_phy_restore(struct device *dev)
@@ -171,7 +184,8 @@ static int mdio_bus_phy_restore(struct device *dev)
 	phydev->link = 0;
 	phydev->state = PHY_UP;
 
-	phy_start_machine(phydev);
+	if (mdio_bus_phy_needs_start(phydev))
+		phy_start(phydev);
 
 	return 0;
 }
-- 
2.16.2

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox