Netdev List
 help / color / mirror / Atom feed
* [PATCH v3 net-next 02/11] ibmvnic: Replace is_closed with state field
From: Nathan Fontenot @ 2017-05-02 18:52 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170502185006.28214.89072.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

Replace the is_closed flag in the ibmvnic adapter strcut with a
more comprehensive state field that tracks the current state of
the driver.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |   20 +++++++++++++-------
 drivers/net/ethernet/ibm/ibmvnic.h |   12 ++++++++++--
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index c67f1d6..40a8ba0 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -669,7 +669,9 @@ static int ibmvnic_open(struct net_device *netdev)
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
 	int i, rc;
 
-	if (adapter->is_closed) {
+	adapter->state = VNIC_OPENING;
+
+	if (adapter->state == VNIC_CLOSED) {
 		rc = ibmvnic_init(adapter);
 		if (rc)
 			return rc;
@@ -704,7 +706,7 @@ static int ibmvnic_open(struct net_device *netdev)
 		release_resources(adapter);
 	} else {
 		netif_tx_start_all_queues(netdev);
-		adapter->is_closed = false;
+		adapter->state = VNIC_OPEN;
 	}
 
 	return rc;
@@ -733,7 +735,7 @@ static int ibmvnic_close(struct net_device *netdev)
 	int rc = 0;
 	int i;
 
-	adapter->closing = true;
+	adapter->state = VNIC_CLOSING;
 	disable_sub_crqs(adapter);
 
 	if (adapter->napi) {
@@ -748,8 +750,7 @@ static int ibmvnic_close(struct net_device *netdev)
 
 	release_resources(adapter);
 
-	adapter->is_closed = true;
-	adapter->closing = false;
+	adapter->state = VNIC_CLOSED;
 	return rc;
 }
 
@@ -1860,7 +1861,8 @@ static int pending_scrq(struct ibmvnic_adapter *adapter,
 {
 	union sub_crq *entry = &scrq->msgs[scrq->cur];
 
-	if (entry->generic.first & IBMVNIC_CRQ_CMD_RSP || adapter->closing)
+	if (entry->generic.first & IBMVNIC_CRQ_CMD_RSP ||
+	    adapter->state == VNIC_CLOSING)
 		return 1;
 	else
 		return 0;
@@ -3353,6 +3355,7 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
 		return -ENOMEM;
 
 	adapter = netdev_priv(netdev);
+	adapter->state = VNIC_PROBING;
 	dev_set_drvdata(&dev->dev, netdev);
 	adapter->vdev = dev;
 	adapter->netdev = netdev;
@@ -3380,7 +3383,6 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
 	}
 
 	netdev->mtu = adapter->req_mtu - ETH_HLEN;
-	adapter->is_closed = false;
 
 	rc = register_netdev(netdev);
 	if (rc) {
@@ -3390,6 +3392,7 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
 	}
 	dev_info(&dev->dev, "ibmvnic registered\n");
 
+	adapter->state = VNIC_PROBED;
 	return 0;
 }
 
@@ -3398,12 +3401,15 @@ static int ibmvnic_remove(struct vio_dev *dev)
 	struct net_device *netdev = dev_get_drvdata(&dev->dev);
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
 
+	adapter->state = VNIC_REMOVING;
 	unregister_netdev(netdev);
 
 	release_resources(adapter);
 	release_sub_crqs(adapter);
 	release_crq_queue(adapter);
 
+	adapter->state = VNIC_REMOVED;
+
 	free_netdev(netdev);
 	dev_set_drvdata(&dev->dev, NULL);
 
diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h
index a69979f..03a866f 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.h
+++ b/drivers/net/ethernet/ibm/ibmvnic.h
@@ -913,6 +913,15 @@ struct ibmvnic_error_buff {
 	__be32 error_id;
 };
 
+enum vnic_state {VNIC_PROBING = 1,
+		 VNIC_PROBED,
+		 VNIC_OPENING,
+		 VNIC_OPEN,
+		 VNIC_CLOSING,
+		 VNIC_CLOSED,
+		 VNIC_REMOVING,
+		 VNIC_REMOVED};
+
 struct ibmvnic_adapter {
 	struct vio_dev *vdev;
 	struct net_device *netdev;
@@ -962,7 +971,6 @@ struct ibmvnic_adapter {
 	u64 promisc;
 
 	struct ibmvnic_tx_pool *tx_pool;
-	bool closing;
 	struct completion init_done;
 	int init_done_rc;
 
@@ -1011,5 +1019,5 @@ struct ibmvnic_adapter {
 	struct work_struct ibmvnic_xport;
 	struct tasklet_struct tasklet;
 	bool failover;
-	bool is_closed;
+	enum vnic_state state;
 };

^ permalink raw reply related

* [PATCH v3 net-next 03/11] ibmvnic: Updated reset handling
From: Nathan Fontenot @ 2017-05-02 18:52 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170502185006.28214.89072.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

The ibmvnic driver has multiple handlers for resetting the driver
depending on the reason the reset is needed (failover, lpm,
fatal erors,...). All of the reset handlers do essentially the same
thing, this patch moves this work to a common reset handler.

By doing this we also allow the driver to better handle situations
where we can get a reset while handling a reset.

The updated reset handling works by adding a reset work item to the
list of resets and then scheduling work to perform the reset. This
step is necessary because we can receive a reset in interrupt context
and we want to handle the reset out of interrupt context.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |  413 +++++++++++++++++++++++-------------
 drivers/net/ethernet/ibm/ibmvnic.h |   19 +-
 2 files changed, 275 insertions(+), 157 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 40a8ba0..b3d9b28 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -194,7 +194,8 @@ static void free_long_term_buff(struct ibmvnic_adapter *adapter,
 	if (!ltb->buff)
 		return;
 
-	if (!adapter->failover)
+	if (adapter->reset_reason != VNIC_RESET_FAILOVER &&
+	    adapter->reset_reason != VNIC_RESET_MOBILITY)
 		send_request_unmap(adapter, ltb->map_id);
 	dma_free_coherent(dev, ltb->size, ltb->buff, ltb->addr);
 }
@@ -292,9 +293,6 @@ static void replenish_pools(struct ibmvnic_adapter *adapter)
 {
 	int i;
 
-	if (adapter->migrated)
-		return;
-
 	adapter->replenish_task_cycles++;
 	for (i = 0; i < be32_to_cpu(adapter->login_rsp_buf->num_rxadd_subcrqs);
 	     i++) {
@@ -569,11 +567,6 @@ static int set_link_state(struct ibmvnic_adapter *adapter, u8 link_state)
 	bool resend;
 	int rc;
 
-	if (adapter->logical_link_state == link_state) {
-		netdev_dbg(netdev, "Link state already %d\n", link_state);
-		return 0;
-	}
-
 	netdev_err(netdev, "setting link state %d\n", link_state);
 	memset(&crq, 0, sizeof(crq));
 	crq.logical_link_state.first = IBMVNIC_CRQ_CMD;
@@ -664,27 +657,13 @@ static int init_resources(struct ibmvnic_adapter *adapter)
 	return rc;
 }
 
-static int ibmvnic_open(struct net_device *netdev)
+static int __ibmvnic_open(struct net_device *netdev)
 {
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
+	enum vnic_state prev_state = adapter->state;
 	int i, rc;
 
 	adapter->state = VNIC_OPENING;
-
-	if (adapter->state == VNIC_CLOSED) {
-		rc = ibmvnic_init(adapter);
-		if (rc)
-			return rc;
-	}
-
-	rc = ibmvnic_login(netdev);
-	if (rc)
-		return rc;
-
-	rc = init_resources(adapter);
-	if (rc)
-		return rc;
-
 	replenish_pools(adapter);
 
 	for (i = 0; i < adapter->req_rx_queues; i++)
@@ -693,22 +672,65 @@ static int ibmvnic_open(struct net_device *netdev)
 	/* We're ready to receive frames, enable the sub-crq interrupts and
 	 * set the logical link state to up
 	 */
-	for (i = 0; i < adapter->req_rx_queues; i++)
-		enable_scrq_irq(adapter, adapter->rx_scrq[i]);
+	for (i = 0; i < adapter->req_rx_queues; i++) {
+		if (prev_state == VNIC_CLOSED)
+			enable_irq(adapter->rx_scrq[i]->irq);
+		else
+			enable_scrq_irq(adapter, adapter->rx_scrq[i]);
+	}
 
-	for (i = 0; i < adapter->req_tx_queues; i++)
-		enable_scrq_irq(adapter, adapter->tx_scrq[i]);
+	for (i = 0; i < adapter->req_tx_queues; i++) {
+		if (prev_state == VNIC_CLOSED)
+			enable_irq(adapter->tx_scrq[i]->irq);
+		else
+			enable_scrq_irq(adapter, adapter->tx_scrq[i]);
+	}
 
 	rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_UP);
 	if (rc) {
 		for (i = 0; i < adapter->req_rx_queues; i++)
 			napi_disable(&adapter->napi[i]);
 		release_resources(adapter);
-	} else {
-		netif_tx_start_all_queues(netdev);
-		adapter->state = VNIC_OPEN;
+		return rc;
+	}
+	
+	netif_tx_start_all_queues(netdev);
+
+	if (prev_state == VNIC_CLOSED) {
+		for (i = 0; i < adapter->req_rx_queues; i++)
+			napi_schedule(&adapter->napi[i]);
 	}
 
+	adapter->state = VNIC_OPEN;
+	return rc;
+}
+
+static int ibmvnic_open(struct net_device *netdev)
+{
+	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
+	int rc;
+
+	mutex_lock(&adapter->reset_lock);
+
+	if (adapter->state != VNIC_CLOSED) {
+		rc = ibmvnic_login(netdev);
+		if (rc) {
+			mutex_unlock(&adapter->reset_lock);
+			return rc;
+		}
+
+		rc = init_resources(adapter);
+		if (rc) {
+			netdev_err(netdev, "failed to initialize resources\n");
+			release_resources(adapter);
+			mutex_unlock(&adapter->reset_lock);
+			return rc;
+		}
+	}
+
+	rc = __ibmvnic_open(netdev);
+	mutex_unlock(&adapter->reset_lock);
+
 	return rc;
 }
 
@@ -729,13 +751,14 @@ static void disable_sub_crqs(struct ibmvnic_adapter *adapter)
 	}
 }
 
-static int ibmvnic_close(struct net_device *netdev)
+static int __ibmvnic_close(struct net_device *netdev)
 {
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
 	int rc = 0;
 	int i;
 
 	adapter->state = VNIC_CLOSING;
+	netif_tx_stop_all_queues(netdev);
 	disable_sub_crqs(adapter);
 
 	if (adapter->napi) {
@@ -743,17 +766,24 @@ static int ibmvnic_close(struct net_device *netdev)
 			napi_disable(&adapter->napi[i]);
 	}
 
-	if (!adapter->failover)
-		netif_tx_stop_all_queues(netdev);
-
 	rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_DN);
 
-	release_resources(adapter);
-
 	adapter->state = VNIC_CLOSED;
 	return rc;
 }
 
+static int ibmvnic_close(struct net_device *netdev)
+{
+	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
+	int rc;
+
+	mutex_lock(&adapter->reset_lock);
+	rc = __ibmvnic_close(netdev);
+	mutex_unlock(&adapter->reset_lock);
+
+	return rc;
+}
+
 /**
  * build_hdr_data - creates L2/L3/L4 header data buffer
  * @hdr_field - bitfield determining needed headers
@@ -915,7 +945,7 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 	handle_array = (u64 *)((u8 *)(adapter->login_rsp_buf) +
 				   be32_to_cpu(adapter->login_rsp_buf->
 					       off_txsubm_subcrqs));
-	if (adapter->migrated) {
+	if (adapter->resetting) {
 		if (!netif_subqueue_stopped(netdev, skb))
 			netif_stop_subqueue(netdev, queue_num);
 		dev_kfree_skb_any(skb);
@@ -1107,18 +1137,185 @@ static int ibmvnic_set_mac(struct net_device *netdev, void *p)
 	return 0;
 }
 
-static void ibmvnic_tx_timeout(struct net_device *dev)
+/**
+ * do_reset returns zero if we are able to keep processing reset events, or
+ * non-zero if we hit a fatal error and must halt.
+ */
+static int do_reset(struct ibmvnic_adapter *adapter,
+		    struct ibmvnic_rwi *rwi, u32 reset_state)
 {
-	struct ibmvnic_adapter *adapter = netdev_priv(dev);
-	int rc;
+	struct net_device *netdev = adapter->netdev;
+	int i, rc;
+
+	netif_carrier_off(netdev);
+	adapter->reset_reason = rwi->reset_reason;
+
+	if (rwi->reset_reason == VNIC_RESET_MOBILITY) {
+		rc = ibmvnic_reenable_crq_queue(adapter);
+		if (rc)
+			return 0;
+	}
+
+	rc = __ibmvnic_close(netdev);
+	if (rc)
+		return rc;
+
+	/* remove the closed state so when we call open it appears
+	 * we are coming from the probed state.
+	 */
+	adapter->state = VNIC_PROBED;
 
-	/* Adapter timed out, resetting it */
+	release_resources(adapter);
 	release_sub_crqs(adapter);
-	rc = ibmvnic_reset_crq(adapter);
+	release_crq_queue(adapter);
+
+	rc = ibmvnic_init(adapter);
 	if (rc)
-		dev_err(&adapter->vdev->dev, "Adapter timeout, reset failed\n");
-	else
-		ibmvnic_send_crq_init(adapter);
+		return 0;
+
+	/* If the adapter was in PROBE state prior to the reset, exit here. */
+	if (reset_state == VNIC_PROBED)
+		return 0;
+
+	rc = ibmvnic_login(netdev);
+	if (rc) {
+		adapter->state = VNIC_PROBED;
+		return 0;
+	}
+
+	rtnl_lock();
+	rc = init_resources(adapter);
+	rtnl_unlock();
+	if (rc)
+		return rc;
+
+	if (reset_state == VNIC_CLOSED)
+		return 0;
+
+	rc = __ibmvnic_open(netdev);
+	if (rc) {
+		if (list_empty(&adapter->rwi_list))
+			adapter->state = VNIC_CLOSED;
+		else
+			adapter->state = reset_state;
+
+		return 0;
+	}
+
+	netif_carrier_on(netdev);
+
+	/* kick napi */
+	for (i = 0; i < adapter->req_rx_queues; i++)
+		napi_schedule(&adapter->napi[i]);
+
+	return 0;
+}
+
+static struct ibmvnic_rwi *get_next_rwi(struct ibmvnic_adapter *adapter)
+{
+	struct ibmvnic_rwi *rwi;
+
+	mutex_lock(&adapter->rwi_lock);
+
+	if (!list_empty(&adapter->rwi_list)) {
+		rwi = list_first_entry(&adapter->rwi_list, struct ibmvnic_rwi,
+				       list);
+		list_del(&rwi->list);
+	} else {
+		rwi = NULL;
+	}
+
+	mutex_unlock(&adapter->rwi_lock);
+	return rwi;
+}
+
+static void free_all_rwi(struct ibmvnic_adapter *adapter)
+{
+	struct ibmvnic_rwi *rwi;
+
+	rwi = get_next_rwi(adapter);
+	while (rwi) {
+		kfree(rwi);
+		rwi = get_next_rwi(adapter);
+	}
+}
+
+static void __ibmvnic_reset(struct work_struct *work)
+{
+	struct ibmvnic_rwi *rwi;
+	struct ibmvnic_adapter *adapter;
+	struct net_device *netdev;
+	u32 reset_state;
+	int rc;
+
+	adapter = container_of(work, struct ibmvnic_adapter, ibmvnic_reset);
+	netdev = adapter->netdev;
+
+	mutex_lock(&adapter->reset_lock);
+	adapter->resetting = true;
+	reset_state = adapter->state;
+
+	rwi = get_next_rwi(adapter);
+	while (rwi) {
+		rc = do_reset(adapter, rwi, reset_state);
+		kfree(rwi);
+		if (rc)
+			break;
+
+		rwi = get_next_rwi(adapter);
+	}
+
+	if (rc) {
+		free_all_rwi(adapter);
+		return;
+	}
+
+	adapter->resetting = false;
+	mutex_unlock(&adapter->reset_lock);
+}
+
+static void ibmvnic_reset(struct ibmvnic_adapter *adapter,
+			  enum ibmvnic_reset_reason reason)
+{
+	struct ibmvnic_rwi *rwi, *tmp;
+	struct net_device *netdev = adapter->netdev;
+	struct list_head *entry;
+
+	if (adapter->state == VNIC_REMOVING ||
+	    adapter->state == VNIC_REMOVED) {
+		netdev_dbg(netdev, "Adapter removing, skipping reset\n");
+		return;
+	}
+
+	mutex_lock(&adapter->rwi_lock);
+
+	list_for_each(entry, &adapter->rwi_list) {
+		tmp = list_entry(entry, struct ibmvnic_rwi, list);
+		if (tmp->reset_reason == reason) {
+			netdev_err(netdev, "Matching reset found, skipping\n");
+			mutex_unlock(&adapter->rwi_lock);
+			return;
+		}
+	}
+
+	rwi = kzalloc(sizeof(*rwi), GFP_KERNEL);
+	if (!rwi) {
+		mutex_unlock(&adapter->rwi_lock);
+		ibmvnic_close(netdev);
+		return;
+	}
+
+	rwi->reset_reason = reason;
+	list_add_tail(&rwi->list, &adapter->rwi_list);
+	mutex_unlock(&adapter->rwi_lock);
+	schedule_work(&adapter->ibmvnic_reset);
+}
+
+static void ibmvnic_tx_timeout(struct net_device *dev)
+{
+	struct ibmvnic_adapter *adapter = netdev_priv(dev);
+
+	ibmvnic_reset(adapter, VNIC_RESET_TIMEOUT);
 }
 
 static void remove_buff_from_pool(struct ibmvnic_adapter *adapter,
@@ -2000,18 +2197,6 @@ static int ibmvnic_send_crq_init(struct ibmvnic_adapter *adapter)
 	return ibmvnic_send_crq(adapter, &crq);
 }
 
-static int ibmvnic_send_crq_init_complete(struct ibmvnic_adapter *adapter)
-{
-	union ibmvnic_crq crq;
-
-	memset(&crq, 0, sizeof(crq));
-	crq.generic.first = IBMVNIC_CRQ_INIT_CMD;
-	crq.generic.cmd = IBMVNIC_CRQ_INIT_COMPLETE;
-	netdev_dbg(adapter->netdev, "Sending CRQ init complete\n");
-
-	return ibmvnic_send_crq(adapter, &crq);
-}
-
 static int send_version_xchg(struct ibmvnic_adapter *adapter)
 {
 	union ibmvnic_crq crq;
@@ -2509,6 +2694,9 @@ static void handle_error_indication(union ibmvnic_crq *crq,
 
 	if (be32_to_cpu(crq->error_indication.error_id))
 		request_error_information(adapter, crq);
+
+	if (crq->error_indication.flags & IBMVNIC_FATAL_ERROR)
+		ibmvnic_reset(adapter, VNIC_RESET_FATAL);
 }
 
 static void handle_change_mac_rsp(union ibmvnic_crq *crq,
@@ -2897,26 +3085,6 @@ static void handle_query_cap_rsp(union ibmvnic_crq *crq,
 	}
 }
 
-static void ibmvnic_xport_event(struct work_struct *work)
-{
-	struct ibmvnic_adapter *adapter = container_of(work,
-						       struct ibmvnic_adapter,
-						       ibmvnic_xport);
-	struct device *dev = &adapter->vdev->dev;
-	long rc;
-
-	release_sub_crqs(adapter);
-	if (adapter->migrated) {
-		rc = ibmvnic_reenable_crq_queue(adapter);
-		if (rc)
-			dev_err(dev, "Error after enable rc=%ld\n", rc);
-		adapter->migrated = false;
-		rc = ibmvnic_send_crq_init(adapter);
-		if (rc)
-			dev_err(dev, "Error sending init rc=%ld\n", rc);
-	}
-}
-
 static void ibmvnic_handle_crq(union ibmvnic_crq *crq,
 			       struct ibmvnic_adapter *adapter)
 {
@@ -2934,12 +3102,6 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq,
 		switch (gen_crq->cmd) {
 		case IBMVNIC_CRQ_INIT:
 			dev_info(dev, "Partner initialized\n");
-			/* Send back a response */
-			rc = ibmvnic_send_crq_init_complete(adapter);
-			if (!rc)
-				schedule_work(&adapter->vnic_crq_init);
-			else
-				dev_err(dev, "Can't send initrsp rc=%ld\n", rc);
 			break;
 		case IBMVNIC_CRQ_INIT_COMPLETE:
 			dev_info(dev, "Partner initialization complete\n");
@@ -2950,19 +3112,18 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq,
 		}
 		return;
 	case IBMVNIC_CRQ_XPORT_EVENT:
+		netif_carrier_off(netdev);
 		if (gen_crq->cmd == IBMVNIC_PARTITION_MIGRATED) {
-			dev_info(dev, "Re-enabling adapter\n");
-			adapter->migrated = true;
-			schedule_work(&adapter->ibmvnic_xport);
+			dev_info(dev, "Migrated, re-enabling adapter\n");
+			ibmvnic_reset(adapter, VNIC_RESET_MOBILITY);
 		} else if (gen_crq->cmd == IBMVNIC_DEVICE_FAILOVER) {
 			dev_info(dev, "Backing device failover detected\n");
-			netif_carrier_off(netdev);
-			adapter->failover = true;
+			ibmvnic_reset(adapter, VNIC_RESET_FAILOVER);
 		} else {
 			/* The adapter lost the connection */
 			dev_err(dev, "Virtual Adapter failed (rc=%d)\n",
 				gen_crq->cmd);
-			schedule_work(&adapter->ibmvnic_xport);
+			ibmvnic_reset(adapter, VNIC_RESET_FATAL);
 		}
 		return;
 	case IBMVNIC_CRQ_CMD_RSP:
@@ -3243,64 +3404,6 @@ static int init_crq_queue(struct ibmvnic_adapter *adapter)
 	return retrc;
 }
 
-static void handle_crq_init_rsp(struct work_struct *work)
-{
-	struct ibmvnic_adapter *adapter = container_of(work,
-						       struct ibmvnic_adapter,
-						       vnic_crq_init);
-	struct device *dev = &adapter->vdev->dev;
-	struct net_device *netdev = adapter->netdev;
-	unsigned long timeout = msecs_to_jiffies(30000);
-	bool restart = false;
-	int rc;
-
-	if (adapter->failover) {
-		release_sub_crqs(adapter);
-		if (netif_running(netdev)) {
-			netif_tx_disable(netdev);
-			ibmvnic_close(netdev);
-			restart = true;
-		}
-	}
-
-	reinit_completion(&adapter->init_done);
-	send_version_xchg(adapter);
-	if (!wait_for_completion_timeout(&adapter->init_done, timeout)) {
-		dev_err(dev, "Passive init timeout\n");
-		goto task_failed;
-	}
-
-	netdev->mtu = adapter->req_mtu - ETH_HLEN;
-
-	if (adapter->failover) {
-		adapter->failover = false;
-		if (restart) {
-			rc = ibmvnic_open(netdev);
-			if (rc)
-				goto restart_failed;
-		}
-		netif_carrier_on(netdev);
-		return;
-	}
-
-	rc = register_netdev(netdev);
-	if (rc) {
-		dev_err(dev,
-			"failed to register netdev rc=%d\n", rc);
-		goto register_failed;
-	}
-	dev_info(dev, "ibmvnic registered\n");
-
-	return;
-
-restart_failed:
-	dev_err(dev, "Failed to restart ibmvnic, rc=%d\n", rc);
-register_failed:
-	release_sub_crqs(adapter);
-task_failed:
-	dev_err(dev, "Passive initialization was not successful\n");
-}
-
 static int ibmvnic_init(struct ibmvnic_adapter *adapter)
 {
 	struct device *dev = &adapter->vdev->dev;
@@ -3359,7 +3462,6 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
 	dev_set_drvdata(&dev->dev, netdev);
 	adapter->vdev = dev;
 	adapter->netdev = netdev;
-	adapter->failover = false;
 
 	ether_addr_copy(adapter->mac_addr, mac_addr_p);
 	ether_addr_copy(netdev->dev_addr, adapter->mac_addr);
@@ -3368,14 +3470,17 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
 	netdev->ethtool_ops = &ibmvnic_ethtool_ops;
 	SET_NETDEV_DEV(netdev, &dev->dev);
 
-	INIT_WORK(&adapter->vnic_crq_init, handle_crq_init_rsp);
-	INIT_WORK(&adapter->ibmvnic_xport, ibmvnic_xport_event);
-
 	spin_lock_init(&adapter->stats_lock);
 
 	INIT_LIST_HEAD(&adapter->errors);
 	spin_lock_init(&adapter->error_list_lock);
 
+	INIT_WORK(&adapter->ibmvnic_reset, __ibmvnic_reset);
+	INIT_LIST_HEAD(&adapter->rwi_list);
+	mutex_init(&adapter->reset_lock);
+	mutex_init(&adapter->rwi_lock);
+	adapter->resetting = false;
+
 	rc = ibmvnic_init(adapter);
 	if (rc) {
 		free_netdev(netdev);
@@ -3403,6 +3508,7 @@ static int ibmvnic_remove(struct vio_dev *dev)
 
 	adapter->state = VNIC_REMOVING;
 	unregister_netdev(netdev);
+	mutex_lock(&adapter->reset_lock);
 
 	release_resources(adapter);
 	release_sub_crqs(adapter);
@@ -3410,6 +3516,7 @@ static int ibmvnic_remove(struct vio_dev *dev)
 
 	adapter->state = VNIC_REMOVED;
 
+	mutex_unlock(&adapter->reset_lock);
 	free_netdev(netdev);
 	dev_set_drvdata(&dev->dev, NULL);
 
diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h
index 03a866f..4702b48 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.h
+++ b/drivers/net/ethernet/ibm/ibmvnic.h
@@ -922,6 +922,16 @@ enum vnic_state {VNIC_PROBING = 1,
 		 VNIC_REMOVING,
 		 VNIC_REMOVED};
 
+enum ibmvnic_reset_reason {VNIC_RESET_FAILOVER = 1,
+			   VNIC_RESET_MOBILITY,
+			   VNIC_RESET_FATAL,
+			   VNIC_RESET_TIMEOUT};
+
+struct ibmvnic_rwi {
+	enum ibmvnic_reset_reason reset_reason;
+	struct list_head list;
+};
+
 struct ibmvnic_adapter {
 	struct vio_dev *vdev;
 	struct net_device *netdev;
@@ -931,7 +941,6 @@ struct ibmvnic_adapter {
 	dma_addr_t ip_offload_tok;
 	struct ibmvnic_control_ip_offload_buffer ip_offload_ctrl;
 	dma_addr_t ip_offload_ctrl_tok;
-	bool migrated;
 	u32 msg_enable;
 
 	/* Statistics */
@@ -1015,9 +1024,11 @@ struct ibmvnic_adapter {
 	__be64 tx_rx_desc_req;
 	u8 map_id;
 
-	struct work_struct vnic_crq_init;
-	struct work_struct ibmvnic_xport;
 	struct tasklet_struct tasklet;
-	bool failover;
 	enum vnic_state state;
+	enum ibmvnic_reset_reason reset_reason;
+	struct mutex reset_lock, rwi_lock;
+	struct list_head rwi_list;
+	struct work_struct ibmvnic_reset;
+	bool resetting;
 };

^ permalink raw reply related

* [PATCH v3 net-next 04/11] ibmvnic: Delete napi's when releasing driver resources
From: Nathan Fontenot @ 2017-05-02 18:52 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170502185006.28214.89072.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

The napi structs allocated at drivier initializatio need to be
free'ed when releasing the drivers resources.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |    9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index b3d9b28..9f2686d 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -552,11 +552,20 @@ static int ibmvnic_login(struct net_device *netdev)
 
 static void release_resources(struct ibmvnic_adapter *adapter)
 {
+	int i;
+
 	release_tx_pools(adapter);
 	release_rx_pools(adapter);
 
 	release_stats_token(adapter);
 	release_error_buffers(adapter);
+
+	if (adapter->napi) {
+		for (i = 0; i < adapter->req_rx_queues; i++) {
+			if (&adapter->napi[i])
+				netif_napi_del(&adapter->napi[i]);
+		}
+	}
 }
 
 static int set_link_state(struct ibmvnic_adapter *adapter, u8 link_state)

^ permalink raw reply related

* [PATCH v3 net-next 05/11] ibmvnic: Whitespace correction in release_rx_pools
From: Nathan Fontenot @ 2017-05-02 18:52 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170502185006.28214.89072.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 9f2686d..d20d884 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -348,7 +348,7 @@ static void release_rx_pools(struct ibmvnic_adapter *adapter)
 		free_long_term_buff(adapter, &rx_pool->long_term_buff);
 
 		if (!rx_pool->rx_buff)
-		continue;
+			continue;
 
 		for (j = 0; j < rx_pool->size; j++) {
 			if (rx_pool->rx_buff[j].skb) {

^ permalink raw reply related

* [PATCH v3 net-next 06/11] ibmvnic: Clean up tx pools when closing
From: Nathan Fontenot @ 2017-05-02 18:52 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170502185006.28214.89072.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

When closing the ibmvnic driver, most notably during the reset
path, the tx pools need to be cleaned to ensure there are no
hanging skbs that need to be free'ed.

The need for this was found during debugging a loss of network
traffic after handling a driver reset. The underlying cause was
some skbs in the tx pool that were never free'ed. As a
result the upper network layers never tried a re-send since it
believed the driver still had the skb.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |   30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index d20d884..0400ae7 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -760,6 +760,34 @@ static void disable_sub_crqs(struct ibmvnic_adapter *adapter)
 	}
 }
 
+static void clean_tx_pools(struct ibmvnic_adapter *adapter)
+{
+	struct ibmvnic_tx_pool *tx_pool;
+	u64 tx_entries;
+	int tx_scrqs;
+	int i, j;
+
+	if (!adapter->tx_pool)
+		return;
+
+	tx_scrqs = be32_to_cpu(adapter->login_rsp_buf->num_txsubm_subcrqs);
+	tx_entries = adapter->req_tx_entries_per_subcrq;
+
+	/* Free any remaining skbs in the tx buffer pools */
+	for (i = 0; i < tx_scrqs; i++) {
+		tx_pool = &adapter->tx_pool[i];
+		if (!tx_pool)
+			continue;
+
+		for (j = 0; j < tx_entries; j++) {
+			if (tx_pool->tx_buff[j].skb) {
+				dev_kfree_skb_any(tx_pool->tx_buff[j].skb);
+				tx_pool->tx_buff[j].skb = NULL;
+			}
+		}
+	}
+}
+
 static int __ibmvnic_close(struct net_device *netdev)
 {
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
@@ -768,6 +796,8 @@ static int __ibmvnic_close(struct net_device *netdev)
 
 	adapter->state = VNIC_CLOSING;
 	netif_tx_stop_all_queues(netdev);
+
+	clean_tx_pools(adapter);
 	disable_sub_crqs(adapter);
 
 	if (adapter->napi) {

^ permalink raw reply related

* Re: TPACKET_V3 timeout bug?
From: chetan loke @ 2017-05-02 15:04 UTC (permalink / raw)
  To: Guy Harris; +Cc: Andrew Lunn, Sowmini Varadhan, netdev, tcpdump-workers
In-Reply-To: <251BA382-D82C-4931-A98F-A23AD88F9811@alum.mit.edu>

On Sat, Apr 15, 2017 at 7:41 PM, Guy Harris <guy@alum.mit.edu> wrote:
> On Apr 15, 2017, at 7:10 PM, Andrew Lunn <andrew@lunn.ch> wrote:
>
>> Do you think this is a kernel problem, libpcap problem, or an
>> application problem?
>

Its clearly a kernel regression.

If you look at if_packet.h, I have explicitly called out all the cases
for the return/status codes. When I first merged the functionality in
3.11(or 3.12 I think) I had the logic in place to retire the block
with or without packets in it. I think there was one case where we
wouldn't wake up userspace. Someone checked in a fix for that. Now I
am not sure the regression happened as part of that bug fix or
sometime later. If you diff 3.12 against the latest you will find the
regression. Look for prb_retire_rx_blk_timer_expired().


> An application problem.  See my response on netdev; the timeout (which is provided by the kernel's capture mechanism, in most cases) is to make sure packets don't stay stuck in the kernel's packet buffer for an indefinite period of time, it's *not* to make sure a thread capturing packets doesn't stay blocked for an indefinite period of time.  Whether the timer goes off even if no

I cannot speak on behalf of user-space wrappers developed around
tpacket_v3 but the intention(from the kernel POV) of the block_timer
*is* to unblock the capture/user process/thread so that it does NOT
stay blocked for an indefinite period of time. The header explicitly
specifies that contract.

I'm tied up for the next 3 weeks. However I can work on it after
that(unless someone else has cycles for it now).

Chetan

^ permalink raw reply

* [PATCH v3 net-next 07/11] ibmvnic: Wait for any pending scrqs entries at driver close
From: Nathan Fontenot @ 2017-05-02 18:52 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170502185006.28214.89072.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

When closing the ibmvnic driver we need to wait for any pending
sub crq entries to ensure they are handled.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |   47 +++++++++++++++++++++---------------
 1 file changed, 27 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 0400ae7..4da7080 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -743,23 +743,6 @@ static int ibmvnic_open(struct net_device *netdev)
 	return rc;
 }
 
-static void disable_sub_crqs(struct ibmvnic_adapter *adapter)
-{
-	int i;
-
-	if (adapter->tx_scrq) {
-		for (i = 0; i < adapter->req_tx_queues; i++)
-			if (adapter->tx_scrq[i])
-				disable_irq(adapter->tx_scrq[i]->irq);
-	}
-
-	if (adapter->rx_scrq) {
-		for (i = 0; i < adapter->req_rx_queues; i++)
-			if (adapter->rx_scrq[i])
-				disable_irq(adapter->rx_scrq[i]->irq);
-	}
-}
-
 static void clean_tx_pools(struct ibmvnic_adapter *adapter)
 {
 	struct ibmvnic_tx_pool *tx_pool;
@@ -797,15 +780,39 @@ static int __ibmvnic_close(struct net_device *netdev)
 	adapter->state = VNIC_CLOSING;
 	netif_tx_stop_all_queues(netdev);
 
-	clean_tx_pools(adapter);
-	disable_sub_crqs(adapter);
-
 	if (adapter->napi) {
 		for (i = 0; i < adapter->req_rx_queues; i++)
 			napi_disable(&adapter->napi[i]);
 	}
 
+	clean_tx_pools(adapter);
+
+	if (adapter->tx_scrq) {
+		for (i = 0; i < adapter->req_tx_queues; i++)
+			if (adapter->tx_scrq[i]->irq)
+				disable_irq(adapter->tx_scrq[i]->irq);
+	}
+
 	rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_DN);
+	if (rc)
+		return rc;
+
+	if (adapter->rx_scrq) {
+		for (i = 0; i < adapter->req_rx_queues; i++) {
+			int retries = 10;
+
+			while (pending_scrq(adapter, adapter->rx_scrq[i])) {
+				retries--;
+				mdelay(100);
+
+				if (retries == 0)
+					break;
+			}
+	
+			if (adapter->rx_scrq[i]->irq)
+				disable_irq(adapter->rx_scrq[i]->irq);
+		}
+	}
 
 	adapter->state = VNIC_CLOSED;
 	return rc;

^ permalink raw reply related

* [PATCH v3 net-next 08/11] ibmvnic: Check for driver reset first in ibmvnic_xmit
From: Nathan Fontenot @ 2017-05-02 18:52 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170502185006.28214.89072.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

Move the check for the driver resetting to the first thing
in ibmvnic_xmit().

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |   12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 4da7080..2cf1b64 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -985,12 +985,6 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 	int index = 0;
 	int ret = 0;
 
-	tx_pool = &adapter->tx_pool[queue_num];
-	tx_scrq = adapter->tx_scrq[queue_num];
-	txq = netdev_get_tx_queue(netdev, skb_get_queue_mapping(skb));
-	handle_array = (u64 *)((u8 *)(adapter->login_rsp_buf) +
-				   be32_to_cpu(adapter->login_rsp_buf->
-					       off_txsubm_subcrqs));
 	if (adapter->resetting) {
 		if (!netif_subqueue_stopped(netdev, skb))
 			netif_stop_subqueue(netdev, queue_num);
@@ -1002,6 +996,12 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 		goto out;
 	}
 
+	tx_pool = &adapter->tx_pool[queue_num];
+	tx_scrq = adapter->tx_scrq[queue_num];
+	txq = netdev_get_tx_queue(netdev, skb_get_queue_mapping(skb));
+	handle_array = (u64 *)((u8 *)(adapter->login_rsp_buf) +
+		be32_to_cpu(adapter->login_rsp_buf->off_txsubm_subcrqs));
+
 	index = tx_pool->free_map[tx_pool->consumer_index];
 	offset = index * adapter->req_mtu;
 	dst = tx_pool->long_term_buff.buff + offset;

^ permalink raw reply related

* [PATCH v3 net-next 10/11] ibmvnic: Record SKB RX queue during poll
From: Nathan Fontenot @ 2017-05-02 18:53 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170502185006.28214.89072.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

From: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>

Map each RX SKB to the RX queue associated with the driver's RX SCRQ.
This should improve the RX CPU load balancing issues seen by the
performance team.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |    1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 4b2e128..4a2b99f 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1428,6 +1428,7 @@ static int ibmvnic_poll(struct napi_struct *napi, int budget)
 
 		skb_put(skb, length);
 		skb->protocol = eth_type_trans(skb, netdev);
+		skb_record_rx_queue(skb, scrq_num);
 
 		if (flags & IBMVNIC_IP_CHKSUM_GOOD &&
 		    flags & IBMVNIC_TCP_UDP_CHKSUM_GOOD) {

^ permalink raw reply related

* [PATCH v3 net-next 11/11] ibmvnic: Move queue restarting in ibmvnic_tx_complete
From: Nathan Fontenot @ 2017-05-02 18:53 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170502185006.28214.89072.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

Restart of the subqueue should occur outside of the loop processing
any tx buffers instead of doing this in the middle of the loop.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---

v2: Use __netif_subqueue_stopped() instead of netif_subqueue_stopped()
to avoid possible using un-initialized skb variable.
---
 drivers/net/ethernet/ibm/ibmvnic.c |   22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 4a2b99f..4c5de60 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1809,19 +1809,8 @@ static int ibmvnic_complete_tx(struct ibmvnic_adapter *adapter,
 			}
 
 			if (txbuff->last_frag) {
-				if (atomic_sub_return(next->tx_comp.num_comps,
-						      &scrq->used) <=
-				    (adapter->req_tx_entries_per_subcrq / 2) &&
-				    netif_subqueue_stopped(adapter->netdev,
-							   txbuff->skb)) {
-					netif_wake_subqueue(adapter->netdev,
-							    scrq->pool_index);
-					netdev_dbg(adapter->netdev,
-						   "Started queue %d\n",
-						   scrq->pool_index);
-				}
-
 				dev_kfree_skb_any(txbuff->skb);
+				txbuff->skb = NULL;
 			}
 
 			adapter->tx_pool[pool].free_map[adapter->tx_pool[pool].
@@ -1832,6 +1821,15 @@ static int ibmvnic_complete_tx(struct ibmvnic_adapter *adapter,
 		}
 		/* remove tx_comp scrq*/
 		next->tx_comp.first = 0;
+
+		if (atomic_sub_return(next->tx_comp.num_comps, &scrq->used) <=
+		    (adapter->req_tx_entries_per_subcrq / 2) &&
+		    __netif_subqueue_stopped(adapter->netdev,
+					     scrq->pool_index)) {
+			netif_wake_subqueue(adapter->netdev, scrq->pool_index);
+			netdev_info(adapter->netdev, "Started queue %d\n",
+				    scrq->pool_index);
+		}
 	}
 
 	enable_scrq_irq(adapter, scrq);

^ permalink raw reply related

* [PATCH v3 net-next 09/11] ibmvnic: Continue skb processing after skb completion error
From: Nathan Fontenot @ 2017-05-02 18:53 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170502185006.28214.89072.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

There is not a need to stop processing skbs if we encounter a
skb that has a receive completion error.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 2cf1b64..4b2e128 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1404,7 +1404,7 @@ static int ibmvnic_poll(struct napi_struct *napi, int budget)
 			/* free the entry */
 			next->rx_comp.first = 0;
 			remove_buff_from_pool(adapter, rx_buff);
-			break;
+			continue;
 		}
 
 		length = be32_to_cpu(next->rx_comp.len);

^ permalink raw reply related

* Re: [PATCH net-next ] tcp: add new parameter tcp_inherit_buffsize to control the initial buffer size for new passive connetion
From: Eric Dumazet @ 2017-05-02 15:05 UTC (permalink / raw)
  To: 单卫; +Cc: netdev, David Miller
In-Reply-To: <CAPYxyxJYXA_maME_qnshnFdzwDUkU1PvgDBQMfH32EerfQJZxA@mail.gmail.com>

On Tue, 2017-05-02 at 12:57 +0800, 单卫 wrote:
> Current sndbuff/rcvbuff value of new passive connetion inherit from
> listen socket.
> After then, tcp_init_buffer_space() initial them with init_cwnd and 
> tcp_should_expand_sndbuf() adjust them according the new cwnd.
> 
> 
> But, For Operation & Maintenance engineer, can't control
> sndbuff/rcvbuff with 
> tcp_wmem/tcp_rmem sysctl parameter online. They need to migrate flows
> out 
> and migrate them in after restart services and turn sndbuff/rcvfuff
> up.
> 
> 
> This patch is useful for online servcie setting tcp_inherit_buffsize
> with 0.
> By default, keep be consistent whth before.
> 
> 
> Signed-off-by: Shan Wei <shanwei88@gmail.com>
> ---
>  Documentation/networking/ip-sysctl.txt |    8 ++++++++
>  include/net/netns/ipv4.h               |    1 +
>  net/ipv4/sysctl_net_ipv4.c             |    7 +++++++
>  net/ipv4/tcp_ipv4.c                    |    1 +
>  net/ipv4/tcp_minisocks.c               |    6 ++++++
>  5 files changed, 23 insertions(+), 0 deletions(-)
> 
> 
> diff --git a/Documentation/networking/ip-sysctl.txt
> b/Documentation/networking/ip-sysctl.txt
> index 974ab47..3292bbf 100644
> --- a/Documentation/networking/ip-sysctl.txt
> +++ b/Documentation/networking/ip-sysctl.txt
> @@ -308,6 +308,14 @@ tcp_frto - INTEGER
>  
>   By default it's enabled with a non-zero value. 0 disables F-RTO.
>  
> +tcp_inherit_buffsize - BOOLEAN
> + For a new passive TCP connection, can use current tcp_wmem/tcp_rmem
> + parameter to set initial snd/rcv buffer size. This is useful for
> online
> + services which no need to be restarted just set it with 0. By
> default,
> + new passive connection inherits snd/rcv buffer size from lister
> socket.
> +
> + Default: 1
> +
>  tcp_invalid_ratelimit - INTEGER
>   Limit the maximal rate for sending duplicate acknowledgments
>   in response to incoming TCP packets that are for an existing
> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> index cd686c4..1fc85bd 100644
> --- a/include/net/netns/ipv4.h
> +++ b/include/net/netns/ipv4.h
> @@ -124,6 +124,7 @@ struct netns_ipv4 {
>   int sysctl_tcp_tw_reuse;
>   struct inet_timewait_death_row tcp_death_row;
>   int sysctl_max_syn_backlog;
> + int sysctl_tcp_inherit_buffsize;
>  
>  #ifdef CONFIG_NET_L3_MASTER_DEV
>   int sysctl_udp_l3mdev_accept;
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index 86957e9..58a8bec 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -1078,6 +1078,13 @@ static int
> proc_tfo_blackhole_detect_timeout(struct ctl_table *table,
>   .mode = 0644,
>   .proc_handler = proc_dointvec
>   },
> + {
> + .procname = "tcp_inherit_buffsize",
> + .data = &init_net.ipv4.sysctl_tcp_inherit_buffsize,
> + .maxlen = sizeof(int),
> + .mode = 0644,
> + .proc_handler = proc_dointvec
> + },
>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>   {
>   .procname = "fib_multipath_use_neigh",
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index cbbafe5..7a8a2bb 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -2457,6 +2457,7 @@ static int __net_init tcp_sk_init(struct net
> *net)
>   net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
>  
>   net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
> + net->ipv4.sysctl_tcp_inherit_buffsize = 1;
>  
>   return 0;
>  fail:
> diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
> index 8f6373b..6090d7e 100644
> --- a/net/ipv4/tcp_minisocks.c
> +++ b/net/ipv4/tcp_minisocks.c
> @@ -474,6 +474,12 @@ struct sock *tcp_create_openreq_child(const
> struct sock *sk,
>   tcp_init_xmit_timers(newsk);
>   newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
>  
> + if (!(sock_net(newsk)->ipv4.sysctl_tcp_inherit_buffsize) &&
> + !(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) {
> + newsk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
> + newsk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
> + }
> +

Hi Shan

1) Your patch never reached netdev, because it was sent in HTML format.

2) During Linus merge window, net-next is closed

I am not really convinced that we need this with TCP autotuning anyway.
Initial value of sk_sndbuf and sk_rcvbuf is really a hint.

How often do you really tweak /proc/sys/net/ipv4 files in production.

Please provide more information, like what actual values you change back
and forth.

Thanks.

^ permalink raw reply

* RE: [PATCH net-next 9/9] ipvlan: introduce individual MAC addresses
From: Chiappero, Marco @ 2017-05-02 15:08 UTC (permalink / raw)
  To: 'Dan Williams', netdev@vger.kernel.org
  Cc: David S . Miller, Kirsher, Jeffrey T, Duyck, Alexander H,
	Grandhi, Sainath, Mahesh Bandewar
In-Reply-To: <1493310219.27948.5.camel@redhat.com>


> -----Original Message-----
> From: Dan Williams [mailto:dcbw@redhat.com]
> On Thu, 2017-04-27 at 11:20 -0500, Dan Williams wrote:
> > On Thu, 2017-04-27 at 15:51 +0100, Marco Chiappero wrote:
> > > Currently all the slave devices belonging to the same port inherit
> > > their MAC address from its master device. This patch removes this
> > > limitation and allows every slave device to obtain a unique MAC
> > > address, by default randomly generated at creation time.
> > >
> > > Moreover it is now possible to correctly modify the MAC address at
> > > any time, fixing an existing bug as MAC address changes on the
> > > master were not reflected on the slaves. It also avoids multiple
> > > interfaces sharing the same IPv6 link-local address.
> >
> > How is this different than macvlan now?

The same way it was before. The purpose of the patch is to make possible to change the MAC address on slaves, not to change the external behavior of ipvlan: ipvlan will still behave as ipvlan, macvlan will still behave as macvlan.

>>  Why would you use unique
> > addressed ipvlan instances instead of macvlan?  Wouldn't the same
> > problems around external switches not expecting multiple MACs from the
> > same switch port apply now to ipvlan?

No. I'm willing to rework the commit messages if this point is not clear.
The idea is to effectively NAT communications, which is more or less the whole concept behind ipvlan.

> > The whole point of ipvlan AIUI was to get around macvlan problems
> > related to multiple MACs on the same port.
> 
> Another issue is the unicast MAC limits on cards.  ipvlan is now much more likely
> to hit the unicast MAC limit of the NIC and thus trigger promiscuous mode and
> the resulting performance drop, where before it would not.

The outside world will still see and use only one unicast MAC address, belonging to the master interface. No need to store any other unicast MAC in the filter.

Multicast filters will work as before, the only difference is that now slaves have different IPv6 link-local addresses, with an associated L2 multicast address). If this turns out to be an issue it's possible to keep the same MAC address by default but still allow it change. Or maybe force the same default link-local address somehow.
 
However, if the MAC filter is really the issue, I strongly encourage you to start working with HW vendors in order to better support modern data center workloads as a long term solution.

> > Also, I think the IPv6 thing you mention is incorrect and has long
> > since been solved.  Originally, ipvlan did not include a "dev_id"
> > property that differened between child interfaces, and thus the IID of
> > the each interface was the same.  That has now been fixed, and each
> > ipvlan slave should now have a different IID and thus a different
> > link-
> > local address.

Yes, thank you for pointing it out. I started this change a few months ago when such fix not present and simply rebased lately without noticing it.

Please let me know if you have further questions.

Thank you,
Marco
--------------------------------------------------------------
Intel Research and Development Ireland Limited
Registered in Ireland
Registered Office: Collinstown Industrial Park, Leixlip, County Kildare
Registered Number: 308263


This e-mail and any attachments may contain confidential material for the sole
use of the intended recipient(s). Any review or distribution by others is
strictly prohibited. If you are not the intended recipient, please contact the
sender and delete all copies.

^ permalink raw reply

* RE: [PATCH net-next 0/9] support unique MAC addresses for slave devices
From: Chiappero, Marco @ 2017-05-02 15:12 UTC (permalink / raw)
  To: maheshb@google.com, linux-netdev
  Cc: David S . Miller, Kirsher, Jeffrey T, Duyck, Alexander H,
	Grandhi, Sainath
In-Reply-To: <CAF2d9jjJ9J1B3fN3Ojkq=T5CVT-360-uY+-kpedTGr_XRcv0kw@mail.gmail.com>


> -----Original Message-----
> From: Mahesh Bandewar (महेश बंडेवार)
> 
> On Thu, Apr 27, 2017 at 7:51 AM, Marco Chiappero
> <marco.chiappero@intel.com> wrote:
> > Currently every slave device gets assigned the same MAC address, by
> > having it copied from the master interface. Since some code paths
> > depend on this identity, changing the MAC address on slave interfaces
> > is not supported. However identical MAC addresses can pose problems to
> > management and orchestration software that correctly expect network
> > interfaces on the same segment to have unique addresses.
> >
> Please understand that there are two distinct drivers IPvlan and MACvlan. They
> both exist together for good reasons and are trying to cater for different needs. I
> would love to combine them together if we don't mess / miss the goodies each
> of them have to offer... otherwise *NO*! Having said that if management /
> orchestration software has problems then clearly you should not use IPvlan for
> that use case.

I don't want to start a conversation on whether that software is the problem or not, which is probably arguable anyway, but their combination with ipvlan is a perfectly valid use case, the same way macvlan is.
Generally speaking, it's very reasonable and indeed common to look up interfaces by their MAC address. Being able to change the MAC address is also a reasonable expectation, especially for a soft interface which does not even use unicast L2 addresses for forwarding decisions.

> > Patches 1-8 include style fixes and refactoring (patch 9 depends upon)
> > that improve the overal quality and make the intruduction of the
> > feature straightforward.
> >
> Lots of this fall into I-say-potato-you-say-... category. My way of thinking /
> organizing code is different than yours and you don't have to like mine and I
> don't have to like yours.

A few patches are definitely subject to personal taste, I have no problem dropping them although I still see improvements in readability. Having that said:
- patch 1 is based on checkpatch.pl, so it's substantially objective.
- patches 3 and 4 are well motivated and driven by very specific reasons: both improve readability, remove unnecessary jumps and 4 in particular removes a good amount of duplicated code (52 insertions, 135 deletions).

> > Patch 9 enables slave devices to own unique MAC addresses and change
> > such addresses live, fixing lack of support and a related bug, as MAC
> > address changes on master were not propagated to slave devices.
> > In order to preserve the main peculiarity of this driver, that is
> > exposing only a single MAC address for outbound traffic, frames
> > egressing from master are now effectively masquerated when working in
> > L2 mode.
> >
> This enhancement is, however, coming via packet-header rewrite for every
> Tx/Rx packet which defeats the purpose. 

This only applies to N-S traffic in L2 mode, but nothing prevents you from reversing the logic by modifying ipvlan_hard_header and moving the rewrite to E-W traffic instead. I've made this decision in order to fully emulate an L2 network for slaves and make the master interface appear as a sort of L2 NAT. Alternatively you can limit yourself to the mangling done upfront in ipvlan_hard_header, although when sniffing from a slave things will appear confusing.

Now, if by "purpose" you mean performance implications in L2 mode, I doubt a small memcpy done while handling the skb has a major impact. In fact from the few performance tests I run I could not see any noticeable difference. I'd be happy to investigate this further with additional benchmarks in order to make a more informed decision.

L3 and L3s modes are not affected at all (actually they are on the reception side but this can be modified), so this change can be introduced at no cost here.

> The only good thing that came in light
> is the mac-addr change propagation from master issue; but if the fix is coming as
> a side-effect of header rewrite then it's not an acceptable fix either. 

It's not at all a side-effect of the header rewrite, it's rather the opposite. Either you force MAC addresses to stay in sync or let them vary individually, which is what I'm proposing for the reasons mentioned earlier. How you achieve this is a different story.

Of course, the former case can be fixed by simply doing what macvlan does in passthru mode, that is, sync up the slave(s) upon address change on the master via observer. The latter case can be addressed in one of the above ways. I would like to have such conversation especially in light of the following comments from you:

392         /* TODO Probably use a different field than dev_addr so that the
393          * mac-address on the virtual device is portable and can be carried
394          * while the packets use the mac-addr on the physical device.
395          */

536         /* TODO Probably put random address here to be presented to the
537          * world but keep using the physical-dev address for the outgoing
538          * packets.
539          */

> This can be
> simply fixed by changing a line in ipvlan_hard_header().

This is conceptually the same approach as above, you would still need to mangle ARP and ND frames for N-S traffic to make it work though, which is the biggest part of patch 9. 

Please, review the patch and elaborate specific concerns.

Thank you,
Marco
--------------------------------------------------------------
Intel Research and Development Ireland Limited
Registered in Ireland
Registered Office: Collinstown Industrial Park, Leixlip, County Kildare
Registered Number: 308263


This e-mail and any attachments may contain confidential material for the sole
use of the intended recipient(s). Any review or distribution by others is
strictly prohibited. If you are not the intended recipient, please contact the
sender and delete all copies.

^ permalink raw reply

* Re: bpf_test_finish()
From: David Miller @ 2017-05-02 15:14 UTC (permalink / raw)
  To: ast; +Cc: daniel, netdev
In-Reply-To: <a7f18cfe-049c-d580-8c15-5d09b6860c48@fb.com>

From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 1 May 2017 21:46:07 -0700

> I'll send a patch first thing tomorrow unless Daniel beats me to it.
> We have kattr there as well which has the whole bpf_attr copied into
> kernel memory already. Should have taken data_out from there and
> passed into bpf_test_finish().

I happen to be hacking on this file for another reason right now so
I'll work on a fix for this.  So don't worry about it.

^ permalink raw reply

* [RFC PATCH 00/17] latest qdisc patch series
From: John Fastabend @ 2017-05-02 15:24 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, john.fastabend

This is my latest series of patches to remove the locking requirement
from qdisc logic. 

I still have a couple issues to resolve main problem at the moment is
pfifo_fast qdisc without contention running under mq or mqprio is
actually slower by a few 100k pps with pktgen tests. I am trying to
sort out how to get the performance back now.

The main difference in these patches is to recognize that parking
packets on the gso slot and bad txq cases are really edge cases and
can be handled with locked queue operations. This simplifies the
patches and avoids racing with netif scheduler. If you hit these paths
it means either there is a driver overrun, should be very rare with
bql, or a TCP session has migrated cores with outstanding packets.

Patches 16, 17 were an attempt to resolve the performance degradation
in the uncontended case. The idea is to use the qdisc lock around the
dequeue operations so that we only need to take a single lock for the
entire bulk operation and can consume packets out of skb_array without
a spin_lock.

Another potential issue, I think, is if we have multiple packets
on the bad_txq we should bulk out of the bad_txq and not jump to
bulking out of the "normal" dequeue qdisc op.

I've mostly tested with pktgen at this point but have done some basic
netperf tests and both seem to be working.

I am not going to be able to work on this for a few days so I figured
it might be worth getting some feedback if there is any. Any thoughts
on how to squeeze a few extra pps out of this would be very useful. I
would like to avoid having a degradation in the micro-benchmark if
possible. Further, it should be possible best I can tell.

Thanks!
John

---

John Fastabend (17):
      net: sched: cleanup qdisc_run and __qdisc_run semantics
      net: sched: allow qdiscs to handle locking
      net: sched: remove remaining uses for qdisc_qlen in xmit path
      net: sched: provide per cpu qstat helpers
      net: sched: a dflt qdisc may be used with per cpu stats
      net: sched: explicit locking in gso_cpu fallback
      net: sched: drop qdisc_reset from dev_graft_qdisc
      net: sched: support skb_bad_tx with lockless qdisc
      net: sched: check for frozen queue before skb_bad_txq check
      net: sched: qdisc_qlen for per cpu logic
      net: sched: helper to sum qlen
      net: sched: add support for TCQ_F_NOLOCK subqueues to sch_mq
      net: sched: add support for TCQ_F_NOLOCK subqueues to sch_mqprio
      net: skb_array: expose peek API
      net: sched: pfifo_fast use skb_array
      net: skb_array additions for unlocked consumer
      net: sched: lock once per bulk dequeue


 include/linux/skb_array.h |   10 +
 include/net/gen_stats.h   |    3 
 include/net/pkt_sched.h   |   10 +
 include/net/sch_generic.h |   82 ++++++++-
 net/core/dev.c            |   31 +++
 net/core/gen_stats.c      |    9 +
 net/sched/sch_api.c       |    3 
 net/sched/sch_generic.c   |  400 +++++++++++++++++++++++++++++++++------------
 net/sched/sch_mq.c        |   25 ++-
 net/sched/sch_mqprio.c    |   61 ++++---
 10 files changed, 470 insertions(+), 164 deletions(-)

^ permalink raw reply

* [RFC PATCH 01/17] net: sched: cleanup qdisc_run and __qdisc_run semantics
From: John Fastabend @ 2017-05-02 15:25 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, john.fastabend
In-Reply-To: <20170502151445.8871.43089.stgit@john-Precision-Tower-5810>

Currently __qdisc_run calls qdisc_run_end() but does not call
qdisc_run_begin(). This makes it hard to track pairs of
qdisc_run_{begin,end} across function calls.

To simplify reading these code paths and simpler code this
patch moves begin/end calls into qdisc_run().

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Eric Dumazet <edumazet@google.com>
---
 include/net/pkt_sched.h |    4 +++-
 net/core/dev.c          |    5 +++--
 net/sched/sch_generic.c |    2 --
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index bec46f6..b7f5776 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -109,8 +109,10 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 
 static inline void qdisc_run(struct Qdisc *q)
 {
-	if (qdisc_run_begin(q))
+	if (qdisc_run_begin(q)) {
 		__qdisc_run(q);
+		qdisc_run_end(q);
+	}
 }
 
 int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
diff --git a/net/core/dev.c b/net/core/dev.c
index db6e315..3169074 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3099,9 +3099,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 				contended = false;
 			}
 			__qdisc_run(q);
-		} else
-			qdisc_run_end(q);
+		}
 
+		qdisc_run_end(q);
 		rc = NET_XMIT_SUCCESS;
 	} else {
 		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
@@ -3111,6 +3111,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 				contended = false;
 			}
 			__qdisc_run(q);
+			qdisc_run_end(q);
 		}
 	}
 	spin_unlock(root_lock);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 52a2c55..ef33bf8 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -262,8 +262,6 @@ void __qdisc_run(struct Qdisc *q)
 			break;
 		}
 	}
-
-	qdisc_run_end(q);
 }
 
 unsigned long dev_trans_start(struct net_device *dev)

^ permalink raw reply related

* Re: [PATCH net-next iproute2 1/3] netlink: import netlink message parsing from kernel
From: Stephen Hemminger @ 2017-05-02 15:25 UTC (permalink / raw)
  To: David Ahern; +Cc: netdev, jakub.kicinski
In-Reply-To: <1493695105-9418-2-git-send-email-dsa@cumulusnetworks.com>

On Mon,  1 May 2017 20:18:23 -0700
David Ahern <dsa@cumulusnetworks.com> wrote:

> include/nlattr.h is pulled from include/net/netlink.h.
> lib/nlattr.c is pulled from lib/nlattr.c
> 
> Signed-off-by: David Ahern <dsa@cumulusnetworks.com>

No. I am not taking a third way of parsing netlink attributes.


Please either use existing netlink attribute code in libnetlink.h
(rta_getattr_u32 etc) or use libmnl like devlink.

^ permalink raw reply

* [RFC PATCH 02/17] net: sched: allow qdiscs to handle locking
From: John Fastabend @ 2017-05-02 15:25 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, john.fastabend
In-Reply-To: <20170502151445.8871.43089.stgit@john-Precision-Tower-5810>

This patch adds a flag for queueing disciplines to indicate the stack
does not need to use the qdisc lock to protect operations. This can
be used to build lockless scheduling algorithms and improving
performance.

The flag is checked in the tx path and the qdisc lock is only taken
if it is not set. For now use a conditional if statement. Later we
could be more aggressive if it proves worthwhile and use a static key
or wrap this in a likely().

Also the lockless case drops the TCQ_F_CAN_BYPASS logic. The reason
for this is synchronizing a qlen counter across threads proves to
cost more than doing the enqueue/dequeue operations when tested with
pktgen.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 include/net/sch_generic.h |    1 +
 net/core/dev.c            |   26 ++++++++++++++++++++++----
 net/sched/sch_generic.c   |   30 ++++++++++++++++++++----------
 3 files changed, 43 insertions(+), 14 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 22e5209..f5c8613 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -67,6 +67,7 @@ struct Qdisc {
 				      * qdisc_tree_decrease_qlen() should stop.
 				      */
 #define TCQ_F_INVISIBLE		0x80 /* invisible by default in dump */
+#define TCQ_F_NOLOCK		0x100 /* qdisc does not require locking */
 	u32			limit;
 	const struct Qdisc_ops	*ops;
 	struct qdisc_size_table	__rcu *stab;
diff --git a/net/core/dev.c b/net/core/dev.c
index 3169074..6d2db37 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3069,6 +3069,21 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 	int rc;
 
 	qdisc_calculate_pkt_len(skb, q);
+
+	if (q->flags & TCQ_F_NOLOCK) {
+		if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
+			__qdisc_drop(skb, &to_free);
+			rc = NET_XMIT_DROP;
+		} else {
+			rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+			__qdisc_run(q);
+		}
+
+		if (unlikely(to_free))
+			kfree_skb_list(to_free);
+		return rc;
+	}
+
 	/*
 	 * Heuristic to force contended enqueues to serialize on a
 	 * separate lock before trying to get qdisc main lock.
@@ -3896,19 +3911,22 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
 
 		while (head) {
 			struct Qdisc *q = head;
-			spinlock_t *root_lock;
+			spinlock_t *root_lock = NULL;
 
 			head = head->next_sched;
 
-			root_lock = qdisc_lock(q);
-			spin_lock(root_lock);
+			if (!(q->flags & TCQ_F_NOLOCK)) {
+				root_lock = qdisc_lock(q);
+				spin_lock(root_lock);
+			}
 			/* We need to make sure head->next_sched is read
 			 * before clearing __QDISC_STATE_SCHED
 			 */
 			smp_mb__before_atomic();
 			clear_bit(__QDISC_STATE_SCHED, &q->state);
 			qdisc_run(q);
-			spin_unlock(root_lock);
+			if (root_lock)
+				spin_unlock(root_lock);
 		}
 	}
 }
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index ef33bf8..39653e8 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -170,7 +170,8 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 	int ret = NETDEV_TX_BUSY;
 
 	/* And release qdisc */
-	spin_unlock(root_lock);
+	if (root_lock)
+		spin_unlock(root_lock);
 
 	/* Note that we validate skb (GSO, checksum, ...) outside of locks */
 	if (validate)
@@ -183,10 +184,13 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 
 		HARD_TX_UNLOCK(dev, txq);
 	} else {
-		spin_lock(root_lock);
+		if (root_lock)
+			spin_lock(root_lock);
 		return qdisc_qlen(q);
 	}
-	spin_lock(root_lock);
+
+	if (root_lock)
+		spin_lock(root_lock);
 
 	if (dev_xmit_complete(ret)) {
 		/* Driver sent out skb successfully or skb was consumed */
@@ -227,9 +231,9 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
  */
 static inline int qdisc_restart(struct Qdisc *q, int *packets)
 {
+	spinlock_t *root_lock = NULL;
 	struct netdev_queue *txq;
 	struct net_device *dev;
-	spinlock_t *root_lock;
 	struct sk_buff *skb;
 	bool validate;
 
@@ -238,7 +242,9 @@ static inline int qdisc_restart(struct Qdisc *q, int *packets)
 	if (unlikely(!skb))
 		return 0;
 
-	root_lock = qdisc_lock(q);
+	if (!(q->flags & TCQ_F_NOLOCK))
+		root_lock = qdisc_lock(q);
+
 	dev = qdisc_dev(q);
 	txq = skb_get_tx_queue(dev, skb);
 
@@ -875,14 +881,18 @@ static bool some_qdisc_is_busy(struct net_device *dev)
 
 		dev_queue = netdev_get_tx_queue(dev, i);
 		q = dev_queue->qdisc_sleeping;
-		root_lock = qdisc_lock(q);
 
-		spin_lock_bh(root_lock);
+		if (q->flags & TCQ_F_NOLOCK) {
+			val = test_bit(__QDISC_STATE_SCHED, &q->state);
+		} else {
+			root_lock = qdisc_lock(q);
+			spin_lock_bh(root_lock);
 
-		val = (qdisc_is_running(q) ||
-		       test_bit(__QDISC_STATE_SCHED, &q->state));
+			val = (qdisc_is_running(q) ||
+			       test_bit(__QDISC_STATE_SCHED, &q->state));
 
-		spin_unlock_bh(root_lock);
+			spin_unlock_bh(root_lock);
+		}
 
 		if (val)
 			return true;

^ permalink raw reply related

* [RFC PATCH 03/17] net: sched: remove remaining uses for qdisc_qlen in xmit path
From: John Fastabend @ 2017-05-02 15:25 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, john.fastabend
In-Reply-To: <20170502151445.8871.43089.stgit@john-Precision-Tower-5810>

sch_direct_xmit() uses qdisc_qlen as a return value but all call sites
of the routine only check if it is zero or not. Simplify the logic so
that we don't need to return an actual queue length value.

This introduces a case now where sch_direct_xmit would have returned
a qlen of zero but now it returns true. However in this case all
call sites of sch_direct_xmit will implement a dequeue() and get
a null skb and abort. This trades tracking qlen in the hotpath for
an extra dequeue operation. Overall this seems to be good for
performance.

Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 include/net/pkt_sched.h |    6 +++---
 net/sched/sch_generic.c |   23 ++++++++++++-----------
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index b7f5776..69b90a3 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -101,9 +101,9 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
 void qdisc_put_rtab(struct qdisc_rate_table *tab);
 void qdisc_put_stab(struct qdisc_size_table *tab);
 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc);
-int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
-		    struct net_device *dev, struct netdev_queue *txq,
-		    spinlock_t *root_lock, bool validate);
+bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
+		     struct net_device *dev, struct netdev_queue *txq,
+		     spinlock_t *root_lock, bool validate);
 
 void __qdisc_run(struct Qdisc *q);
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 39653e8..2f66b1f 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -160,12 +160,12 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
  * only one CPU can execute this function.
  *
  * Returns to the caller:
- *				0  - queue is empty or throttled.
- *				>0 - queue is not empty.
+ *				false  - hardware queue frozen backoff
+ *				true   - feel free to send more pkts
  */
-int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
-		    struct net_device *dev, struct netdev_queue *txq,
-		    spinlock_t *root_lock, bool validate)
+bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
+		     struct net_device *dev, struct netdev_queue *txq,
+		     spinlock_t *root_lock, bool validate)
 {
 	int ret = NETDEV_TX_BUSY;
 
@@ -186,7 +186,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 	} else {
 		if (root_lock)
 			spin_lock(root_lock);
-		return qdisc_qlen(q);
+		return true;
 	}
 
 	if (root_lock)
@@ -194,18 +194,19 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 
 	if (dev_xmit_complete(ret)) {
 		/* Driver sent out skb successfully or skb was consumed */
-		ret = qdisc_qlen(q);
+		ret = true;
 	} else {
 		/* Driver returned NETDEV_TX_BUSY - requeue skb */
 		if (unlikely(ret != NETDEV_TX_BUSY))
 			net_warn_ratelimited("BUG %s code %d qlen %d\n",
 					     dev->name, ret, q->q.qlen);
 
-		ret = dev_requeue_skb(skb, q);
+		dev_requeue_skb(skb, q);
+		ret = false;
 	}
 
 	if (ret && netif_xmit_frozen_or_stopped(txq))
-		ret = 0;
+		ret = false;
 
 	return ret;
 }
@@ -229,7 +230,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
  *				>0 - queue is not empty.
  *
  */
-static inline int qdisc_restart(struct Qdisc *q, int *packets)
+static inline bool qdisc_restart(struct Qdisc *q, int *packets)
 {
 	spinlock_t *root_lock = NULL;
 	struct netdev_queue *txq;
@@ -240,7 +241,7 @@ static inline int qdisc_restart(struct Qdisc *q, int *packets)
 	/* Dequeue packet */
 	skb = dequeue_skb(q, &validate, packets);
 	if (unlikely(!skb))
-		return 0;
+		return false;
 
 	if (!(q->flags & TCQ_F_NOLOCK))
 		root_lock = qdisc_lock(q);

^ permalink raw reply related

* [RFC PATCH 04/17] net: sched: provide per cpu qstat helpers
From: John Fastabend @ 2017-05-02 15:26 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, john.fastabend
In-Reply-To: <20170502151445.8871.43089.stgit@john-Precision-Tower-5810>

The per cpu qstats support was added with per cpu bstat support which
is currently used by the ingress qdisc. This patch adds a set of
helpers needed to make other qdiscs that use qstats per cpu as well.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 include/net/sch_generic.h |   35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f5c8613..9531b81 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -590,12 +590,39 @@ static inline void qdisc_qstats_backlog_dec(struct Qdisc *sch,
 	sch->qstats.backlog -= qdisc_pkt_len(skb);
 }
 
+static inline void qdisc_qstats_cpu_backlog_dec(struct Qdisc *sch,
+						const struct sk_buff *skb)
+{
+	this_cpu_sub(sch->cpu_qstats->backlog, qdisc_pkt_len(skb));
+}
+
 static inline void qdisc_qstats_backlog_inc(struct Qdisc *sch,
 					    const struct sk_buff *skb)
 {
 	sch->qstats.backlog += qdisc_pkt_len(skb);
 }
 
+static inline void qdisc_qstats_cpu_backlog_inc(struct Qdisc *sch,
+						const struct sk_buff *skb)
+{
+	this_cpu_add(sch->cpu_qstats->backlog, qdisc_pkt_len(skb));
+}
+
+static inline void qdisc_qstats_cpu_qlen_inc(struct Qdisc *sch)
+{
+	this_cpu_inc(sch->cpu_qstats->qlen);
+}
+
+static inline void qdisc_qstats_cpu_qlen_dec(struct Qdisc *sch)
+{
+	this_cpu_dec(sch->cpu_qstats->qlen);
+}
+
+static inline void qdisc_qstats_cpu_requeues_inc(struct Qdisc *sch)
+{
+	this_cpu_inc(sch->cpu_qstats->requeues);
+}
+
 static inline void __qdisc_qstats_drop(struct Qdisc *sch, int count)
 {
 	sch->qstats.drops += count;
@@ -800,6 +827,14 @@ static inline void rtnl_qdisc_drop(struct sk_buff *skb, struct Qdisc *sch)
 	qdisc_qstats_drop(sch);
 }
 
+static inline int qdisc_drop_cpu(struct sk_buff *skb, struct Qdisc *sch,
+				 struct sk_buff **to_free)
+{
+	__qdisc_drop(skb, to_free);
+	qdisc_qstats_cpu_drop(sch);
+
+	return NET_XMIT_DROP;
+}
 
 static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch,
 			     struct sk_buff **to_free)

^ permalink raw reply related

* [RFC PATCH 05/17] net: sched: a dflt qdisc may be used with per cpu stats
From: John Fastabend @ 2017-05-02 15:26 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, john.fastabend
In-Reply-To: <20170502151445.8871.43089.stgit@john-Precision-Tower-5810>

Enable dflt qdisc support for per cpu stats before this patch a dflt
qdisc was required to use the global statistics qstats and bstats.

This adds a static flags field to qdisc_ops that is propagated
into qdisc->flags in qdisc allocate call. This allows the allocation
block to completely allocate the qdisc object so we don't have
dangling allocations after qdisc init.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 include/net/sch_generic.h |    1 +
 net/sched/sch_generic.c   |   16 ++++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 9531b81..83952af 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -172,6 +172,7 @@ struct Qdisc_ops {
 	const struct Qdisc_class_ops	*cl_ops;
 	char			id[IFNAMSIZ];
 	int			priv_size;
+	unsigned int		static_flags;
 
 	int 			(*enqueue)(struct sk_buff *skb,
 					   struct Qdisc *sch,
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 2f66b1f..8a03738 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -625,6 +625,19 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 	qdisc_skb_head_init(&sch->q);
 	spin_lock_init(&sch->q.lock);
 
+	if (ops->static_flags & TCQ_F_CPUSTATS) {
+		sch->cpu_bstats =
+			netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
+		if (!sch->cpu_bstats)
+			goto errout1;
+
+		sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
+		if (!sch->cpu_qstats) {
+			free_percpu(sch->cpu_bstats);
+			goto errout1;
+		}
+	}
+
 	spin_lock_init(&sch->busylock);
 	lockdep_set_class(&sch->busylock,
 			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
@@ -634,6 +647,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 			  dev->qdisc_running_key ?: &qdisc_running_key);
 
 	sch->ops = ops;
+	sch->flags = ops->static_flags;
 	sch->enqueue = ops->enqueue;
 	sch->dequeue = ops->dequeue;
 	sch->dev_queue = dev_queue;
@@ -641,6 +655,8 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 	atomic_set(&sch->refcnt, 1);
 
 	return sch;
+errout1:
+	kfree(p);
 errout:
 	return ERR_PTR(err);
 }

^ permalink raw reply related

* [RFC PATCH 06/17] net: sched: explicit locking in gso_cpu fallback
From: John Fastabend @ 2017-05-02 15:26 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, john.fastabend
In-Reply-To: <20170502151445.8871.43089.stgit@john-Precision-Tower-5810>

This work is preparing the qdisc layer to support egress lockless
qdiscs. If we are running the egress qdisc lockless in the case we
overrun the netdev, for whatever reason, the netdev returns a busy
error code and the skb is parked on the gso_skb pointer. With many
cores all hitting this case at once its possible to have multiple
sk_buffs here so we turn gso_skb into a queue.

This should be the edge case and if we see this frequently then
the netdev/qdisc layer needs to back off.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 include/net/sch_generic.h |   20 +++++++----
 net/sched/sch_generic.c   |   81 ++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 80 insertions(+), 21 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 83952af..3021bbb 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -85,7 +85,7 @@ struct Qdisc {
 	/*
 	 * For performance sake on SMP, we put highly modified fields at the end
 	 */
-	struct sk_buff		*gso_skb ____cacheline_aligned_in_smp;
+	struct sk_buff_head	gso_skb ____cacheline_aligned_in_smp;
 	struct qdisc_skb_head	q;
 	struct gnet_stats_basic_packed bstats;
 	seqcount_t		running;
@@ -754,26 +754,30 @@ static inline struct sk_buff *qdisc_peek_head(struct Qdisc *sch)
 /* generic pseudo peek method for non-work-conserving qdisc */
 static inline struct sk_buff *qdisc_peek_dequeued(struct Qdisc *sch)
 {
+	struct sk_buff *skb = skb_peek(&sch->gso_skb);
+
 	/* we can reuse ->gso_skb because peek isn't called for root qdiscs */
-	if (!sch->gso_skb) {
-		sch->gso_skb = sch->dequeue(sch);
-		if (sch->gso_skb) {
+	if (!skb) {
+		skb = sch->dequeue(sch);
+
+		if (skb) {
+			__skb_queue_head(&sch->gso_skb, skb);
 			/* it's still part of the queue */
-			qdisc_qstats_backlog_inc(sch, sch->gso_skb);
+			qdisc_qstats_backlog_inc(sch, skb);
 			sch->q.qlen++;
 		}
 	}
 
-	return sch->gso_skb;
+	return skb;
 }
 
 /* use instead of qdisc->dequeue() for all qdiscs queried with ->peek() */
 static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch)
 {
-	struct sk_buff *skb = sch->gso_skb;
+	struct sk_buff *skb = skb_peek(&sch->gso_skb);
 
 	if (skb) {
-		sch->gso_skb = NULL;
+		skb = __skb_dequeue(&sch->gso_skb);
 		qdisc_qstats_backlog_dec(sch, skb);
 		sch->q.qlen--;
 	} else {
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 8a03738..f179fc9 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -44,10 +44,9 @@
  * - ingress filtering is also serialized via qdisc root lock
  * - updates to tree and tree walking are only done under the rtnl mutex.
  */
-
-static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
+static inline int __dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
 {
-	q->gso_skb = skb;
+	__skb_queue_head(&q->gso_skb, skb);
 	q->qstats.requeues++;
 	qdisc_qstats_backlog_inc(q, skb);
 	q->q.qlen++;	/* it's still part of the queue */
@@ -56,6 +55,30 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
 	return 0;
 }
 
+static inline int dev_requeue_skb_locked(struct sk_buff *skb, struct Qdisc *q)
+{
+	spinlock_t *lock = qdisc_lock(q);
+
+	spin_lock(lock);
+	__skb_queue_tail(&q->gso_skb, skb);
+	spin_unlock(lock);
+
+	qdisc_qstats_cpu_requeues_inc(q);
+	qdisc_qstats_cpu_backlog_inc(q, skb);
+	qdisc_qstats_cpu_qlen_inc(q);
+	__netif_schedule(q);
+
+	return 0;
+}
+
+static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
+{
+	if (q->flags & TCQ_F_NOLOCK)
+		return dev_requeue_skb_locked(skb, q);
+	else
+		return __dev_requeue_skb(skb, q);
+}
+
 static void try_bulk_dequeue_skb(struct Qdisc *q,
 				 struct sk_buff *skb,
 				 const struct netdev_queue *txq,
@@ -111,23 +134,50 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
 static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
 				   int *packets)
 {
-	struct sk_buff *skb = q->gso_skb;
 	const struct netdev_queue *txq = q->dev_queue;
+	struct sk_buff *skb;
 
 	*packets = 1;
-	if (unlikely(skb)) {
+	if (unlikely(!skb_queue_empty(&q->gso_skb))) {
+		spinlock_t *lock = NULL;
+
+		if (q->flags & TCQ_F_NOLOCK) {
+			lock = qdisc_lock(q);
+			spin_lock(lock);
+		}
+
+		skb = skb_peek(&q->gso_skb);
+
+		/* skb may be null if another cpu pulls gso_skb off in between
+		 * empty check and lock.
+		 */
+		if (!skb) {
+			if (lock)
+				spin_unlock(lock);
+			goto validate;
+		}
+
 		/* skb in gso_skb were already validated */
 		*validate = false;
 		/* check the reason of requeuing without tx lock first */
 		txq = skb_get_tx_queue(txq->dev, skb);
 		if (!netif_xmit_frozen_or_stopped(txq)) {
-			q->gso_skb = NULL;
-			qdisc_qstats_backlog_dec(q, skb);
-			q->q.qlen--;
+			skb = __skb_dequeue(&q->gso_skb);
+			if (qdisc_is_percpu_stats(q)) {
+				qdisc_qstats_cpu_backlog_dec(q, skb);
+				qdisc_qstats_cpu_qlen_dec(q);
+			} else {
+				qdisc_qstats_backlog_dec(q, skb);
+				q->q.qlen--;
+			}
 		} else
 			skb = NULL;
+
+		if (lock)
+			spin_unlock(lock);
 		return skb;
 	}
+validate:
 	*validate = true;
 	skb = q->skb_bad_txq;
 	if (unlikely(skb)) {
@@ -622,6 +672,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 		sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
 		sch->padded = (char *) sch - (char *) p;
 	}
+	__skb_queue_head_init(&sch->gso_skb);
 	qdisc_skb_head_init(&sch->q);
 	spin_lock_init(&sch->q.lock);
 
@@ -690,6 +741,7 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
 void qdisc_reset(struct Qdisc *qdisc)
 {
 	const struct Qdisc_ops *ops = qdisc->ops;
+	struct sk_buff *skb, *tmp;
 
 	if (ops->reset)
 		ops->reset(qdisc);
@@ -697,10 +749,9 @@ void qdisc_reset(struct Qdisc *qdisc)
 	kfree_skb(qdisc->skb_bad_txq);
 	qdisc->skb_bad_txq = NULL;
 
-	if (qdisc->gso_skb) {
-		kfree_skb_list(qdisc->gso_skb);
-		qdisc->gso_skb = NULL;
-	}
+	skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp)
+		kfree_skb_list(skb);
+
 	qdisc->q.qlen = 0;
 }
 EXPORT_SYMBOL(qdisc_reset);
@@ -720,6 +771,7 @@ static void qdisc_rcu_free(struct rcu_head *head)
 void qdisc_destroy(struct Qdisc *qdisc)
 {
 	const struct Qdisc_ops  *ops = qdisc->ops;
+	struct sk_buff *skb, *tmp;
 
 	if (qdisc->flags & TCQ_F_BUILTIN ||
 	    !atomic_dec_and_test(&qdisc->refcnt))
@@ -739,7 +791,9 @@ void qdisc_destroy(struct Qdisc *qdisc)
 	module_put(ops->owner);
 	dev_put(qdisc_dev(qdisc));
 
-	kfree_skb_list(qdisc->gso_skb);
+	skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp)
+		kfree_skb_list(skb);
+
 	kfree_skb(qdisc->skb_bad_txq);
 	/*
 	 * gen_estimator est_timer() might access qdisc->q.lock,
@@ -971,6 +1025,7 @@ static void dev_init_scheduler_queue(struct net_device *dev,
 
 	rcu_assign_pointer(dev_queue->qdisc, qdisc);
 	dev_queue->qdisc_sleeping = qdisc;
+	__skb_queue_head_init(&qdisc->gso_skb);
 }
 
 void dev_init_scheduler(struct net_device *dev)

^ permalink raw reply related

* [RFC PATCH 07/17] net: sched: drop qdisc_reset from dev_graft_qdisc
From: John Fastabend @ 2017-05-02 15:27 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, john.fastabend
In-Reply-To: <20170502151445.8871.43089.stgit@john-Precision-Tower-5810>

In qdisc_graft_qdisc a "new" qdisc is attached and the 'qdisc_destroy'
operation is called on the old qdisc. The destroy operation will wait
a rcu grace period and call qdisc_rcu_free(). At which point
gso_cpu_skb is free'd along with all stats so no need to zero stats
and gso_cpu_skb from the graft operation itself.

Further after dropping the qdisc locks we can not continue to call
qdisc_reset before waiting an rcu grace period so that the qdisc is
detached from all cpus. By removing the qdisc_reset() here we get
the correct property of waiting an rcu grace period and letting the
qdisc_destroy operation clean up the qdisc correctly.

Note, a refcnt greater than 1 would cause the destroy operation to
be aborted however if this ever happened the reference to the qdisc
would be lost and we would have a memory leak.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 net/sched/sch_generic.c |   28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index f179fc9..37cd54e 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -813,10 +813,6 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
 	root_lock = qdisc_lock(oqdisc);
 	spin_lock_bh(root_lock);
 
-	/* Prune old scheduler */
-	if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
-		qdisc_reset(oqdisc);
-
 	/* ... and graft new one */
 	if (qdisc == NULL)
 		qdisc = &noop_qdisc;
@@ -971,6 +967,16 @@ static bool some_qdisc_is_busy(struct net_device *dev)
 	return false;
 }
 
+static void dev_qdisc_reset(struct net_device *dev,
+			    struct netdev_queue *dev_queue,
+			    void *none)
+{
+	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
+
+	if (qdisc)
+		qdisc_reset(qdisc);
+}
+
 /**
  * 	dev_deactivate_many - deactivate transmissions on several devices
  * 	@head: list of devices to deactivate
@@ -981,7 +987,6 @@ static bool some_qdisc_is_busy(struct net_device *dev)
 void dev_deactivate_many(struct list_head *head)
 {
 	struct net_device *dev;
-	bool sync_needed = false;
 
 	list_for_each_entry(dev, head, close_list) {
 		netdev_for_each_tx_queue(dev, dev_deactivate_queue,
@@ -991,20 +996,25 @@ void dev_deactivate_many(struct list_head *head)
 					     &noop_qdisc);
 
 		dev_watchdog_down(dev);
-		sync_needed |= !dev->dismantle;
 	}
 
 	/* Wait for outstanding qdisc-less dev_queue_xmit calls.
 	 * This is avoided if all devices are in dismantle phase :
 	 * Caller will call synchronize_net() for us
 	 */
-	if (sync_needed)
-		synchronize_net();
+	synchronize_net();
 
 	/* Wait for outstanding qdisc_run calls. */
-	list_for_each_entry(dev, head, close_list)
+	list_for_each_entry(dev, head, close_list) {
 		while (some_qdisc_is_busy(dev))
 			yield();
+		/* The new qdisc is assigned at this point so we can safely
+		 * unwind stale skb lists and qdisc statistics
+		 */
+		netdev_for_each_tx_queue(dev, dev_qdisc_reset, NULL);
+		if (dev_ingress_queue(dev))
+			dev_qdisc_reset(dev, dev_ingress_queue(dev), NULL);
+	}
 }
 
 void dev_deactivate(struct net_device *dev)

^ permalink raw reply related

* [RFC PATCH 08/17] net: sched: support skb_bad_tx with lockless qdisc
From: John Fastabend @ 2017-05-02 15:27 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, john.fastabend
In-Reply-To: <20170502151445.8871.43089.stgit@john-Precision-Tower-5810>

Similar to how gso is handled skb_bad_tx needs to be per cpu to handle
lockless qdisc with multiple writer/producers.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 include/net/sch_generic.h |    2 -
 net/sched/sch_generic.c   |  100 ++++++++++++++++++++++++++++++++++++---------
 2 files changed, 82 insertions(+), 20 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 3021bbb..4288222 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -92,7 +92,7 @@ struct Qdisc {
 	struct gnet_stats_queue	qstats;
 	unsigned long		state;
 	struct Qdisc            *next_sched;
-	struct sk_buff		*skb_bad_txq;
+	struct sk_buff_head	skb_bad_txq;
 	struct rcu_head		rcu_head;
 	int			padded;
 	atomic_t		refcnt;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 37cd54e..d4194c2 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -44,6 +44,68 @@
  * - ingress filtering is also serialized via qdisc root lock
  * - updates to tree and tree walking are only done under the rtnl mutex.
  */
+
+static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
+{
+	const struct netdev_queue *txq = q->dev_queue;
+	spinlock_t *lock = NULL;
+	struct sk_buff *skb;
+
+	if (q->flags & TCQ_F_NOLOCK) {
+		lock = qdisc_lock(q);
+		spin_lock(lock);
+	}
+
+	skb = skb_peek(&q->skb_bad_txq);
+	if (skb) {
+		/* check the reason of requeuing without tx lock first */
+		txq = skb_get_tx_queue(txq->dev, skb);
+		if (!netif_xmit_frozen_or_stopped(txq)) {
+			skb = __skb_dequeue(&q->skb_bad_txq);
+			if (qdisc_is_percpu_stats(q)) {
+				qdisc_qstats_cpu_backlog_dec(q, skb);
+				qdisc_qstats_cpu_qlen_dec(q);
+			} else {
+				qdisc_qstats_backlog_dec(q, skb);
+				q->q.qlen--;
+			}
+		} else {
+			skb = NULL;
+		}
+	}
+
+	if (lock)
+		spin_unlock(lock);
+
+	return skb;
+}
+
+static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *q)
+{
+	struct sk_buff *skb = skb_peek(&q->skb_bad_txq);
+
+	if (unlikely(skb))
+		skb = __skb_dequeue_bad_txq(q);
+
+	return skb;
+}
+
+static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q,
+					     struct sk_buff *skb)
+{
+	spinlock_t *lock = NULL;
+
+	if (q->flags & TCQ_F_NOLOCK) {
+		lock = qdisc_lock(q);
+		spin_lock(lock);
+	}
+
+	__skb_queue_tail(&q->skb_bad_txq, skb);
+
+	if (lock)
+		spin_unlock(lock);
+}
+
 static inline int __dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
 {
 	__skb_queue_head(&q->gso_skb, skb);
@@ -116,9 +178,15 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
 		if (!nskb)
 			break;
 		if (unlikely(skb_get_queue_mapping(nskb) != mapping)) {
-			q->skb_bad_txq = nskb;
-			qdisc_qstats_backlog_inc(q, nskb);
-			q->q.qlen++;
+			qdisc_enqueue_skb_bad_txq(q, nskb);
+
+			if (qdisc_is_percpu_stats(q)) {
+				qdisc_qstats_cpu_backlog_inc(q, nskb);
+				qdisc_qstats_cpu_qlen_inc(q);
+			} else {
+				qdisc_qstats_backlog_inc(q, nskb);
+				q->q.qlen++;
+			}
 			break;
 		}
 		skb->next = nskb;
@@ -179,18 +247,9 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
 	}
 validate:
 	*validate = true;
-	skb = q->skb_bad_txq;
-	if (unlikely(skb)) {
-		/* check the reason of requeuing without tx lock first */
-		txq = skb_get_tx_queue(txq->dev, skb);
-		if (!netif_xmit_frozen_or_stopped(txq)) {
-			q->skb_bad_txq = NULL;
-			qdisc_qstats_backlog_dec(q, skb);
-			q->q.qlen--;
-			goto bulk;
-		}
-		return NULL;
-	}
+	skb = qdisc_dequeue_skb_bad_txq(q);
+	if (unlikely(skb))
+		goto bulk;
 	if (!(q->flags & TCQ_F_ONETXQUEUE) ||
 	    !netif_xmit_frozen_or_stopped(txq))
 		skb = q->dequeue(q);
@@ -673,6 +732,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 		sch->padded = (char *) sch - (char *) p;
 	}
 	__skb_queue_head_init(&sch->gso_skb);
+	__skb_queue_head_init(&sch->skb_bad_txq);
 	qdisc_skb_head_init(&sch->q);
 	spin_lock_init(&sch->q.lock);
 
@@ -746,12 +806,12 @@ void qdisc_reset(struct Qdisc *qdisc)
 	if (ops->reset)
 		ops->reset(qdisc);
 
-	kfree_skb(qdisc->skb_bad_txq);
-	qdisc->skb_bad_txq = NULL;
-
 	skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp)
 		kfree_skb_list(skb);
 
+	skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp)
+		kfree_skb_list(skb);
+
 	qdisc->q.qlen = 0;
 }
 EXPORT_SYMBOL(qdisc_reset);
@@ -793,8 +853,9 @@ void qdisc_destroy(struct Qdisc *qdisc)
 
 	skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp)
 		kfree_skb_list(skb);
+	skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp)
+		kfree_skb_list(skb);
 
-	kfree_skb(qdisc->skb_bad_txq);
 	/*
 	 * gen_estimator est_timer() might access qdisc->q.lock,
 	 * wait a RCU grace period before freeing qdisc.
@@ -1036,6 +1097,7 @@ static void dev_init_scheduler_queue(struct net_device *dev,
 	rcu_assign_pointer(dev_queue->qdisc, qdisc);
 	dev_queue->qdisc_sleeping = qdisc;
 	__skb_queue_head_init(&qdisc->gso_skb);
+	__skb_queue_head_init(&qdisc->skb_bad_txq);
 }
 
 void dev_init_scheduler(struct net_device *dev)

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox