Netdev List
 help / color / mirror / Atom feed
* [PATCH v4 net-next 03/11] ibmvnic: Updated reset handling
From: Nathan Fontenot @ 2017-05-03 18:04 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170503180246.29742.64039.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

The ibmvnic driver has multiple handlers for resetting the driver
depending on the reason the reset is needed (failover, lpm,
fatal erors,...). All of the reset handlers do essentially the same
thing, this patch moves this work to a common reset handler.

By doing this we also allow the driver to better handle situations
where we can get a reset while handling a reset.

The updated reset handling works by adding a reset work item to the
list of resets and then scheduling work to perform the reset. This
step is necessary because we can receive a reset in interrupt context
and we want to handle the reset out of interrupt context.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |  413 +++++++++++++++++++++++-------------
 drivers/net/ethernet/ibm/ibmvnic.h |   19 +-
 2 files changed, 275 insertions(+), 157 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 40a8ba0..a7c7a94 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -194,7 +194,8 @@ static void free_long_term_buff(struct ibmvnic_adapter *adapter,
 	if (!ltb->buff)
 		return;
 
-	if (!adapter->failover)
+	if (adapter->reset_reason != VNIC_RESET_FAILOVER &&
+	    adapter->reset_reason != VNIC_RESET_MOBILITY)
 		send_request_unmap(adapter, ltb->map_id);
 	dma_free_coherent(dev, ltb->size, ltb->buff, ltb->addr);
 }
@@ -292,9 +293,6 @@ static void replenish_pools(struct ibmvnic_adapter *adapter)
 {
 	int i;
 
-	if (adapter->migrated)
-		return;
-
 	adapter->replenish_task_cycles++;
 	for (i = 0; i < be32_to_cpu(adapter->login_rsp_buf->num_rxadd_subcrqs);
 	     i++) {
@@ -569,11 +567,6 @@ static int set_link_state(struct ibmvnic_adapter *adapter, u8 link_state)
 	bool resend;
 	int rc;
 
-	if (adapter->logical_link_state == link_state) {
-		netdev_dbg(netdev, "Link state already %d\n", link_state);
-		return 0;
-	}
-
 	netdev_err(netdev, "setting link state %d\n", link_state);
 	memset(&crq, 0, sizeof(crq));
 	crq.logical_link_state.first = IBMVNIC_CRQ_CMD;
@@ -664,27 +657,13 @@ static int init_resources(struct ibmvnic_adapter *adapter)
 	return rc;
 }
 
-static int ibmvnic_open(struct net_device *netdev)
+static int __ibmvnic_open(struct net_device *netdev)
 {
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
+	enum vnic_state prev_state = adapter->state;
 	int i, rc;
 
 	adapter->state = VNIC_OPENING;
-
-	if (adapter->state == VNIC_CLOSED) {
-		rc = ibmvnic_init(adapter);
-		if (rc)
-			return rc;
-	}
-
-	rc = ibmvnic_login(netdev);
-	if (rc)
-		return rc;
-
-	rc = init_resources(adapter);
-	if (rc)
-		return rc;
-
 	replenish_pools(adapter);
 
 	for (i = 0; i < adapter->req_rx_queues; i++)
@@ -693,22 +672,65 @@ static int ibmvnic_open(struct net_device *netdev)
 	/* We're ready to receive frames, enable the sub-crq interrupts and
 	 * set the logical link state to up
 	 */
-	for (i = 0; i < adapter->req_rx_queues; i++)
-		enable_scrq_irq(adapter, adapter->rx_scrq[i]);
+	for (i = 0; i < adapter->req_rx_queues; i++) {
+		if (prev_state == VNIC_CLOSED)
+			enable_irq(adapter->rx_scrq[i]->irq);
+		else
+			enable_scrq_irq(adapter, adapter->rx_scrq[i]);
+	}
 
-	for (i = 0; i < adapter->req_tx_queues; i++)
-		enable_scrq_irq(adapter, adapter->tx_scrq[i]);
+	for (i = 0; i < adapter->req_tx_queues; i++) {
+		if (prev_state == VNIC_CLOSED)
+			enable_irq(adapter->tx_scrq[i]->irq);
+		else
+			enable_scrq_irq(adapter, adapter->tx_scrq[i]);
+	}
 
 	rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_UP);
 	if (rc) {
 		for (i = 0; i < adapter->req_rx_queues; i++)
 			napi_disable(&adapter->napi[i]);
 		release_resources(adapter);
-	} else {
-		netif_tx_start_all_queues(netdev);
-		adapter->state = VNIC_OPEN;
+		return rc;
 	}
 
+	netif_tx_start_all_queues(netdev);
+
+	if (prev_state == VNIC_CLOSED) {
+		for (i = 0; i < adapter->req_rx_queues; i++)
+			napi_schedule(&adapter->napi[i]);
+	}
+
+	adapter->state = VNIC_OPEN;
+	return rc;
+}
+
+static int ibmvnic_open(struct net_device *netdev)
+{
+	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
+	int rc;
+
+	mutex_lock(&adapter->reset_lock);
+
+	if (adapter->state != VNIC_CLOSED) {
+		rc = ibmvnic_login(netdev);
+		if (rc) {
+			mutex_unlock(&adapter->reset_lock);
+			return rc;
+		}
+
+		rc = init_resources(adapter);
+		if (rc) {
+			netdev_err(netdev, "failed to initialize resources\n");
+			release_resources(adapter);
+			mutex_unlock(&adapter->reset_lock);
+			return rc;
+		}
+	}
+
+	rc = __ibmvnic_open(netdev);
+	mutex_unlock(&adapter->reset_lock);
+
 	return rc;
 }
 
@@ -729,13 +751,14 @@ static void disable_sub_crqs(struct ibmvnic_adapter *adapter)
 	}
 }
 
-static int ibmvnic_close(struct net_device *netdev)
+static int __ibmvnic_close(struct net_device *netdev)
 {
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
 	int rc = 0;
 	int i;
 
 	adapter->state = VNIC_CLOSING;
+	netif_tx_stop_all_queues(netdev);
 	disable_sub_crqs(adapter);
 
 	if (adapter->napi) {
@@ -743,17 +766,24 @@ static int ibmvnic_close(struct net_device *netdev)
 			napi_disable(&adapter->napi[i]);
 	}
 
-	if (!adapter->failover)
-		netif_tx_stop_all_queues(netdev);
-
 	rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_DN);
 
-	release_resources(adapter);
-
 	adapter->state = VNIC_CLOSED;
 	return rc;
 }
 
+static int ibmvnic_close(struct net_device *netdev)
+{
+	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
+	int rc;
+
+	mutex_lock(&adapter->reset_lock);
+	rc = __ibmvnic_close(netdev);
+	mutex_unlock(&adapter->reset_lock);
+
+	return rc;
+}
+
 /**
  * build_hdr_data - creates L2/L3/L4 header data buffer
  * @hdr_field - bitfield determining needed headers
@@ -915,7 +945,7 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 	handle_array = (u64 *)((u8 *)(adapter->login_rsp_buf) +
 				   be32_to_cpu(adapter->login_rsp_buf->
 					       off_txsubm_subcrqs));
-	if (adapter->migrated) {
+	if (adapter->resetting) {
 		if (!netif_subqueue_stopped(netdev, skb))
 			netif_stop_subqueue(netdev, queue_num);
 		dev_kfree_skb_any(skb);
@@ -1107,18 +1137,185 @@ static int ibmvnic_set_mac(struct net_device *netdev, void *p)
 	return 0;
 }
 
-static void ibmvnic_tx_timeout(struct net_device *dev)
+/**
+ * do_reset returns zero if we are able to keep processing reset events, or
+ * non-zero if we hit a fatal error and must halt.
+ */
+static int do_reset(struct ibmvnic_adapter *adapter,
+		    struct ibmvnic_rwi *rwi, u32 reset_state)
 {
-	struct ibmvnic_adapter *adapter = netdev_priv(dev);
-	int rc;
+	struct net_device *netdev = adapter->netdev;
+	int i, rc;
+
+	netif_carrier_off(netdev);
+	adapter->reset_reason = rwi->reset_reason;
+
+	if (rwi->reset_reason == VNIC_RESET_MOBILITY) {
+		rc = ibmvnic_reenable_crq_queue(adapter);
+		if (rc)
+			return 0;
+	}
+
+	rc = __ibmvnic_close(netdev);
+	if (rc)
+		return rc;
+
+	/* remove the closed state so when we call open it appears
+	 * we are coming from the probed state.
+	 */
+	adapter->state = VNIC_PROBED;
 
-	/* Adapter timed out, resetting it */
+	release_resources(adapter);
 	release_sub_crqs(adapter);
-	rc = ibmvnic_reset_crq(adapter);
+	release_crq_queue(adapter);
+
+	rc = ibmvnic_init(adapter);
 	if (rc)
-		dev_err(&adapter->vdev->dev, "Adapter timeout, reset failed\n");
-	else
-		ibmvnic_send_crq_init(adapter);
+		return 0;
+
+	/* If the adapter was in PROBE state prior to the reset, exit here. */
+	if (reset_state == VNIC_PROBED)
+		return 0;
+
+	rc = ibmvnic_login(netdev);
+	if (rc) {
+		adapter->state = VNIC_PROBED;
+		return 0;
+	}
+
+	rtnl_lock();
+	rc = init_resources(adapter);
+	rtnl_unlock();
+	if (rc)
+		return rc;
+
+	if (reset_state == VNIC_CLOSED)
+		return 0;
+
+	rc = __ibmvnic_open(netdev);
+	if (rc) {
+		if (list_empty(&adapter->rwi_list))
+			adapter->state = VNIC_CLOSED;
+		else
+			adapter->state = reset_state;
+
+		return 0;
+	}
+
+	netif_carrier_on(netdev);
+
+	/* kick napi */
+	for (i = 0; i < adapter->req_rx_queues; i++)
+		napi_schedule(&adapter->napi[i]);
+
+	return 0;
+}
+
+static struct ibmvnic_rwi *get_next_rwi(struct ibmvnic_adapter *adapter)
+{
+	struct ibmvnic_rwi *rwi;
+
+	mutex_lock(&adapter->rwi_lock);
+
+	if (!list_empty(&adapter->rwi_list)) {
+		rwi = list_first_entry(&adapter->rwi_list, struct ibmvnic_rwi,
+				       list);
+		list_del(&rwi->list);
+	} else {
+		rwi = NULL;
+	}
+
+	mutex_unlock(&adapter->rwi_lock);
+	return rwi;
+}
+
+static void free_all_rwi(struct ibmvnic_adapter *adapter)
+{
+	struct ibmvnic_rwi *rwi;
+
+	rwi = get_next_rwi(adapter);
+	while (rwi) {
+		kfree(rwi);
+		rwi = get_next_rwi(adapter);
+	}
+}
+
+static void __ibmvnic_reset(struct work_struct *work)
+{
+	struct ibmvnic_rwi *rwi;
+	struct ibmvnic_adapter *adapter;
+	struct net_device *netdev;
+	u32 reset_state;
+	int rc;
+
+	adapter = container_of(work, struct ibmvnic_adapter, ibmvnic_reset);
+	netdev = adapter->netdev;
+
+	mutex_lock(&adapter->reset_lock);
+	adapter->resetting = true;
+	reset_state = adapter->state;
+
+	rwi = get_next_rwi(adapter);
+	while (rwi) {
+		rc = do_reset(adapter, rwi, reset_state);
+		kfree(rwi);
+		if (rc)
+			break;
+
+		rwi = get_next_rwi(adapter);
+	}
+
+	if (rc) {
+		free_all_rwi(adapter);
+		return;
+	}
+
+	adapter->resetting = false;
+	mutex_unlock(&adapter->reset_lock);
+}
+
+static void ibmvnic_reset(struct ibmvnic_adapter *adapter,
+			  enum ibmvnic_reset_reason reason)
+{
+	struct ibmvnic_rwi *rwi, *tmp;
+	struct net_device *netdev = adapter->netdev;
+	struct list_head *entry;
+
+	if (adapter->state == VNIC_REMOVING ||
+	    adapter->state == VNIC_REMOVED) {
+		netdev_dbg(netdev, "Adapter removing, skipping reset\n");
+		return;
+	}
+
+	mutex_lock(&adapter->rwi_lock);
+
+	list_for_each(entry, &adapter->rwi_list) {
+		tmp = list_entry(entry, struct ibmvnic_rwi, list);
+		if (tmp->reset_reason == reason) {
+			netdev_err(netdev, "Matching reset found, skipping\n");
+			mutex_unlock(&adapter->rwi_lock);
+			return;
+		}
+	}
+
+	rwi = kzalloc(sizeof(*rwi), GFP_KERNEL);
+	if (!rwi) {
+		mutex_unlock(&adapter->rwi_lock);
+		ibmvnic_close(netdev);
+		return;
+	}
+
+	rwi->reset_reason = reason;
+	list_add_tail(&rwi->list, &adapter->rwi_list);
+	mutex_unlock(&adapter->rwi_lock);
+	schedule_work(&adapter->ibmvnic_reset);
+}
+
+static void ibmvnic_tx_timeout(struct net_device *dev)
+{
+	struct ibmvnic_adapter *adapter = netdev_priv(dev);
+
+	ibmvnic_reset(adapter, VNIC_RESET_TIMEOUT);
 }
 
 static void remove_buff_from_pool(struct ibmvnic_adapter *adapter,
@@ -2000,18 +2197,6 @@ static int ibmvnic_send_crq_init(struct ibmvnic_adapter *adapter)
 	return ibmvnic_send_crq(adapter, &crq);
 }
 
-static int ibmvnic_send_crq_init_complete(struct ibmvnic_adapter *adapter)
-{
-	union ibmvnic_crq crq;
-
-	memset(&crq, 0, sizeof(crq));
-	crq.generic.first = IBMVNIC_CRQ_INIT_CMD;
-	crq.generic.cmd = IBMVNIC_CRQ_INIT_COMPLETE;
-	netdev_dbg(adapter->netdev, "Sending CRQ init complete\n");
-
-	return ibmvnic_send_crq(adapter, &crq);
-}
-
 static int send_version_xchg(struct ibmvnic_adapter *adapter)
 {
 	union ibmvnic_crq crq;
@@ -2509,6 +2694,9 @@ static void handle_error_indication(union ibmvnic_crq *crq,
 
 	if (be32_to_cpu(crq->error_indication.error_id))
 		request_error_information(adapter, crq);
+
+	if (crq->error_indication.flags & IBMVNIC_FATAL_ERROR)
+		ibmvnic_reset(adapter, VNIC_RESET_FATAL);
 }
 
 static void handle_change_mac_rsp(union ibmvnic_crq *crq,
@@ -2897,26 +3085,6 @@ static void handle_query_cap_rsp(union ibmvnic_crq *crq,
 	}
 }
 
-static void ibmvnic_xport_event(struct work_struct *work)
-{
-	struct ibmvnic_adapter *adapter = container_of(work,
-						       struct ibmvnic_adapter,
-						       ibmvnic_xport);
-	struct device *dev = &adapter->vdev->dev;
-	long rc;
-
-	release_sub_crqs(adapter);
-	if (adapter->migrated) {
-		rc = ibmvnic_reenable_crq_queue(adapter);
-		if (rc)
-			dev_err(dev, "Error after enable rc=%ld\n", rc);
-		adapter->migrated = false;
-		rc = ibmvnic_send_crq_init(adapter);
-		if (rc)
-			dev_err(dev, "Error sending init rc=%ld\n", rc);
-	}
-}
-
 static void ibmvnic_handle_crq(union ibmvnic_crq *crq,
 			       struct ibmvnic_adapter *adapter)
 {
@@ -2934,12 +3102,6 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq,
 		switch (gen_crq->cmd) {
 		case IBMVNIC_CRQ_INIT:
 			dev_info(dev, "Partner initialized\n");
-			/* Send back a response */
-			rc = ibmvnic_send_crq_init_complete(adapter);
-			if (!rc)
-				schedule_work(&adapter->vnic_crq_init);
-			else
-				dev_err(dev, "Can't send initrsp rc=%ld\n", rc);
 			break;
 		case IBMVNIC_CRQ_INIT_COMPLETE:
 			dev_info(dev, "Partner initialization complete\n");
@@ -2950,19 +3112,18 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq,
 		}
 		return;
 	case IBMVNIC_CRQ_XPORT_EVENT:
+		netif_carrier_off(netdev);
 		if (gen_crq->cmd == IBMVNIC_PARTITION_MIGRATED) {
-			dev_info(dev, "Re-enabling adapter\n");
-			adapter->migrated = true;
-			schedule_work(&adapter->ibmvnic_xport);
+			dev_info(dev, "Migrated, re-enabling adapter\n");
+			ibmvnic_reset(adapter, VNIC_RESET_MOBILITY);
 		} else if (gen_crq->cmd == IBMVNIC_DEVICE_FAILOVER) {
 			dev_info(dev, "Backing device failover detected\n");
-			netif_carrier_off(netdev);
-			adapter->failover = true;
+			ibmvnic_reset(adapter, VNIC_RESET_FAILOVER);
 		} else {
 			/* The adapter lost the connection */
 			dev_err(dev, "Virtual Adapter failed (rc=%d)\n",
 				gen_crq->cmd);
-			schedule_work(&adapter->ibmvnic_xport);
+			ibmvnic_reset(adapter, VNIC_RESET_FATAL);
 		}
 		return;
 	case IBMVNIC_CRQ_CMD_RSP:
@@ -3243,64 +3404,6 @@ static int init_crq_queue(struct ibmvnic_adapter *adapter)
 	return retrc;
 }
 
-static void handle_crq_init_rsp(struct work_struct *work)
-{
-	struct ibmvnic_adapter *adapter = container_of(work,
-						       struct ibmvnic_adapter,
-						       vnic_crq_init);
-	struct device *dev = &adapter->vdev->dev;
-	struct net_device *netdev = adapter->netdev;
-	unsigned long timeout = msecs_to_jiffies(30000);
-	bool restart = false;
-	int rc;
-
-	if (adapter->failover) {
-		release_sub_crqs(adapter);
-		if (netif_running(netdev)) {
-			netif_tx_disable(netdev);
-			ibmvnic_close(netdev);
-			restart = true;
-		}
-	}
-
-	reinit_completion(&adapter->init_done);
-	send_version_xchg(adapter);
-	if (!wait_for_completion_timeout(&adapter->init_done, timeout)) {
-		dev_err(dev, "Passive init timeout\n");
-		goto task_failed;
-	}
-
-	netdev->mtu = adapter->req_mtu - ETH_HLEN;
-
-	if (adapter->failover) {
-		adapter->failover = false;
-		if (restart) {
-			rc = ibmvnic_open(netdev);
-			if (rc)
-				goto restart_failed;
-		}
-		netif_carrier_on(netdev);
-		return;
-	}
-
-	rc = register_netdev(netdev);
-	if (rc) {
-		dev_err(dev,
-			"failed to register netdev rc=%d\n", rc);
-		goto register_failed;
-	}
-	dev_info(dev, "ibmvnic registered\n");
-
-	return;
-
-restart_failed:
-	dev_err(dev, "Failed to restart ibmvnic, rc=%d\n", rc);
-register_failed:
-	release_sub_crqs(adapter);
-task_failed:
-	dev_err(dev, "Passive initialization was not successful\n");
-}
-
 static int ibmvnic_init(struct ibmvnic_adapter *adapter)
 {
 	struct device *dev = &adapter->vdev->dev;
@@ -3359,7 +3462,6 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
 	dev_set_drvdata(&dev->dev, netdev);
 	adapter->vdev = dev;
 	adapter->netdev = netdev;
-	adapter->failover = false;
 
 	ether_addr_copy(adapter->mac_addr, mac_addr_p);
 	ether_addr_copy(netdev->dev_addr, adapter->mac_addr);
@@ -3368,14 +3470,17 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
 	netdev->ethtool_ops = &ibmvnic_ethtool_ops;
 	SET_NETDEV_DEV(netdev, &dev->dev);
 
-	INIT_WORK(&adapter->vnic_crq_init, handle_crq_init_rsp);
-	INIT_WORK(&adapter->ibmvnic_xport, ibmvnic_xport_event);
-
 	spin_lock_init(&adapter->stats_lock);
 
 	INIT_LIST_HEAD(&adapter->errors);
 	spin_lock_init(&adapter->error_list_lock);
 
+	INIT_WORK(&adapter->ibmvnic_reset, __ibmvnic_reset);
+	INIT_LIST_HEAD(&adapter->rwi_list);
+	mutex_init(&adapter->reset_lock);
+	mutex_init(&adapter->rwi_lock);
+	adapter->resetting = false;
+
 	rc = ibmvnic_init(adapter);
 	if (rc) {
 		free_netdev(netdev);
@@ -3403,6 +3508,7 @@ static int ibmvnic_remove(struct vio_dev *dev)
 
 	adapter->state = VNIC_REMOVING;
 	unregister_netdev(netdev);
+	mutex_lock(&adapter->reset_lock);
 
 	release_resources(adapter);
 	release_sub_crqs(adapter);
@@ -3410,6 +3516,7 @@ static int ibmvnic_remove(struct vio_dev *dev)
 
 	adapter->state = VNIC_REMOVED;
 
+	mutex_unlock(&adapter->reset_lock);
 	free_netdev(netdev);
 	dev_set_drvdata(&dev->dev, NULL);
 
diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h
index 03a866f..4702b48 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.h
+++ b/drivers/net/ethernet/ibm/ibmvnic.h
@@ -922,6 +922,16 @@ enum vnic_state {VNIC_PROBING = 1,
 		 VNIC_REMOVING,
 		 VNIC_REMOVED};
 
+enum ibmvnic_reset_reason {VNIC_RESET_FAILOVER = 1,
+			   VNIC_RESET_MOBILITY,
+			   VNIC_RESET_FATAL,
+			   VNIC_RESET_TIMEOUT};
+
+struct ibmvnic_rwi {
+	enum ibmvnic_reset_reason reset_reason;
+	struct list_head list;
+};
+
 struct ibmvnic_adapter {
 	struct vio_dev *vdev;
 	struct net_device *netdev;
@@ -931,7 +941,6 @@ struct ibmvnic_adapter {
 	dma_addr_t ip_offload_tok;
 	struct ibmvnic_control_ip_offload_buffer ip_offload_ctrl;
 	dma_addr_t ip_offload_ctrl_tok;
-	bool migrated;
 	u32 msg_enable;
 
 	/* Statistics */
@@ -1015,9 +1024,11 @@ struct ibmvnic_adapter {
 	__be64 tx_rx_desc_req;
 	u8 map_id;
 
-	struct work_struct vnic_crq_init;
-	struct work_struct ibmvnic_xport;
 	struct tasklet_struct tasklet;
-	bool failover;
 	enum vnic_state state;
+	enum ibmvnic_reset_reason reset_reason;
+	struct mutex reset_lock, rwi_lock;
+	struct list_head rwi_list;
+	struct work_struct ibmvnic_reset;
+	bool resetting;
 };

^ permalink raw reply related

* [PATCH v4 net-next 04/11] ibmvnic: Delete napi's when releasing driver resources
From: Nathan Fontenot @ 2017-05-03 18:04 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170503180246.29742.64039.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

The napi structs allocated at drivier initializatio need to be
free'ed when releasing the drivers resources.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |    9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index a7c7a94..d52d98c 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -552,11 +552,20 @@ static int ibmvnic_login(struct net_device *netdev)
 
 static void release_resources(struct ibmvnic_adapter *adapter)
 {
+	int i;
+
 	release_tx_pools(adapter);
 	release_rx_pools(adapter);
 
 	release_stats_token(adapter);
 	release_error_buffers(adapter);
+
+	if (adapter->napi) {
+		for (i = 0; i < adapter->req_rx_queues; i++) {
+			if (&adapter->napi[i])
+				netif_napi_del(&adapter->napi[i]);
+		}
+	}
 }
 
 static int set_link_state(struct ibmvnic_adapter *adapter, u8 link_state)

^ permalink raw reply related

* [PATCH v4 net-next 05/11] ibmvnic: Whitespace correction in release_rx_pools
From: Nathan Fontenot @ 2017-05-03 18:04 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170503180246.29742.64039.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index d52d98c..bbbd57e 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -348,7 +348,7 @@ static void release_rx_pools(struct ibmvnic_adapter *adapter)
 		free_long_term_buff(adapter, &rx_pool->long_term_buff);
 
 		if (!rx_pool->rx_buff)
-		continue;
+			continue;
 
 		for (j = 0; j < rx_pool->size; j++) {
 			if (rx_pool->rx_buff[j].skb) {

^ permalink raw reply related

* [PATCH v4 net-next 06/11] ibmvnic: Clean up tx pools when closing
From: Nathan Fontenot @ 2017-05-03 18:04 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170503180246.29742.64039.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

When closing the ibmvnic driver, most notably during the reset
path, the tx pools need to be cleaned to ensure there are no
hanging skbs that need to be free'ed.

The need for this was found during debugging a loss of network
traffic after handling a driver reset. The underlying cause was
some skbs in the tx pool that were never free'ed. As a
result the upper network layers never tried a re-send since it
believed the driver still had the skb.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |   30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index bbbd57e..2297cf2 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -760,6 +760,34 @@ static void disable_sub_crqs(struct ibmvnic_adapter *adapter)
 	}
 }
 
+static void clean_tx_pools(struct ibmvnic_adapter *adapter)
+{
+	struct ibmvnic_tx_pool *tx_pool;
+	u64 tx_entries;
+	int tx_scrqs;
+	int i, j;
+
+	if (!adapter->tx_pool)
+		return;
+
+	tx_scrqs = be32_to_cpu(adapter->login_rsp_buf->num_txsubm_subcrqs);
+	tx_entries = adapter->req_tx_entries_per_subcrq;
+
+	/* Free any remaining skbs in the tx buffer pools */
+	for (i = 0; i < tx_scrqs; i++) {
+		tx_pool = &adapter->tx_pool[i];
+		if (!tx_pool)
+			continue;
+
+		for (j = 0; j < tx_entries; j++) {
+			if (tx_pool->tx_buff[j].skb) {
+				dev_kfree_skb_any(tx_pool->tx_buff[j].skb);
+				tx_pool->tx_buff[j].skb = NULL;
+			}
+		}
+	}
+}
+
 static int __ibmvnic_close(struct net_device *netdev)
 {
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
@@ -768,6 +796,8 @@ static int __ibmvnic_close(struct net_device *netdev)
 
 	adapter->state = VNIC_CLOSING;
 	netif_tx_stop_all_queues(netdev);
+
+	clean_tx_pools(adapter);
 	disable_sub_crqs(adapter);
 
 	if (adapter->napi) {

^ permalink raw reply related

* [PATCH v4 net-next 07/11] ibmvnic: Wait for any pending scrqs entries at driver close
From: Nathan Fontenot @ 2017-05-03 18:05 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170503180246.29742.64039.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

When closing the ibmvnic driver we need to wait for any pending
sub crq entries to ensure they are handled.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |   47 +++++++++++++++++++++---------------
 1 file changed, 27 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 2297cf2..a312bc1 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -743,23 +743,6 @@ static int ibmvnic_open(struct net_device *netdev)
 	return rc;
 }
 
-static void disable_sub_crqs(struct ibmvnic_adapter *adapter)
-{
-	int i;
-
-	if (adapter->tx_scrq) {
-		for (i = 0; i < adapter->req_tx_queues; i++)
-			if (adapter->tx_scrq[i])
-				disable_irq(adapter->tx_scrq[i]->irq);
-	}
-
-	if (adapter->rx_scrq) {
-		for (i = 0; i < adapter->req_rx_queues; i++)
-			if (adapter->rx_scrq[i])
-				disable_irq(adapter->rx_scrq[i]->irq);
-	}
-}
-
 static void clean_tx_pools(struct ibmvnic_adapter *adapter)
 {
 	struct ibmvnic_tx_pool *tx_pool;
@@ -797,15 +780,39 @@ static int __ibmvnic_close(struct net_device *netdev)
 	adapter->state = VNIC_CLOSING;
 	netif_tx_stop_all_queues(netdev);
 
-	clean_tx_pools(adapter);
-	disable_sub_crqs(adapter);
-
 	if (adapter->napi) {
 		for (i = 0; i < adapter->req_rx_queues; i++)
 			napi_disable(&adapter->napi[i]);
 	}
 
+	clean_tx_pools(adapter);
+
+	if (adapter->tx_scrq) {
+		for (i = 0; i < adapter->req_tx_queues; i++)
+			if (adapter->tx_scrq[i]->irq)
+				disable_irq(adapter->tx_scrq[i]->irq);
+	}
+
 	rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_DN);
+	if (rc)
+		return rc;
+
+	if (adapter->rx_scrq) {
+		for (i = 0; i < adapter->req_rx_queues; i++) {
+			int retries = 10;
+
+			while (pending_scrq(adapter, adapter->rx_scrq[i])) {
+				retries--;
+				mdelay(100);
+
+				if (retries == 0)
+					break;
+			}
+
+			if (adapter->rx_scrq[i]->irq)
+				disable_irq(adapter->rx_scrq[i]->irq);
+		}
+	}
 
 	adapter->state = VNIC_CLOSED;
 	return rc;

^ permalink raw reply related

* [PATCH v4 net-next 08/11] ibmvnic: Check for driver reset first in ibmvnic_xmit
From: Nathan Fontenot @ 2017-05-03 18:05 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170503180246.29742.64039.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

Move the check for the driver resetting to the first thing
in ibmvnic_xmit().

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |   12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index a312bc1..f1ee377 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -985,12 +985,6 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 	int index = 0;
 	int ret = 0;
 
-	tx_pool = &adapter->tx_pool[queue_num];
-	tx_scrq = adapter->tx_scrq[queue_num];
-	txq = netdev_get_tx_queue(netdev, skb_get_queue_mapping(skb));
-	handle_array = (u64 *)((u8 *)(adapter->login_rsp_buf) +
-				   be32_to_cpu(adapter->login_rsp_buf->
-					       off_txsubm_subcrqs));
 	if (adapter->resetting) {
 		if (!netif_subqueue_stopped(netdev, skb))
 			netif_stop_subqueue(netdev, queue_num);
@@ -1002,6 +996,12 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 		goto out;
 	}
 
+	tx_pool = &adapter->tx_pool[queue_num];
+	tx_scrq = adapter->tx_scrq[queue_num];
+	txq = netdev_get_tx_queue(netdev, skb_get_queue_mapping(skb));
+	handle_array = (u64 *)((u8 *)(adapter->login_rsp_buf) +
+		be32_to_cpu(adapter->login_rsp_buf->off_txsubm_subcrqs));
+
 	index = tx_pool->free_map[tx_pool->consumer_index];
 	offset = index * adapter->req_mtu;
 	dst = tx_pool->long_term_buff.buff + offset;

^ permalink raw reply related

* [PATCH v4 net-next 09/11] ibmvnic: Continue skb processing after skb completion error
From: Nathan Fontenot @ 2017-05-03 18:05 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170503180246.29742.64039.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

There is not a need to stop processing skbs if we encounter a
skb that has a receive completion error.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index f1ee377..00c9d5a 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1404,7 +1404,7 @@ static int ibmvnic_poll(struct napi_struct *napi, int budget)
 			/* free the entry */
 			next->rx_comp.first = 0;
 			remove_buff_from_pool(adapter, rx_buff);
-			break;
+			continue;
 		}
 
 		length = be32_to_cpu(next->rx_comp.len);

^ permalink raw reply related

* Re: [PATCH] net: ipv6: Fix warning of freeing alive inet6 address
From: Mike Manning @ 2017-05-03 14:16 UTC (permalink / raw)
  To: netdev; +Cc: Andrey Konovalov
In-Reply-To: <1493749857-18032-1-git-send-email-mmanning@brocade.com>

On reflection, please put this on hold subject to testing with syzkaller. I have not had a repro of the issue and so the fix even though harmless may not be effective.

Thanks
Mike

On 02/05/17 19:30, Mike Manning wrote:
> While this is not reproducible manually, Andrey's syzkaller program hit
> the warning "IPv6: Freeing alive inet6 address" with this part trace:
> 
> inet6_ifa_finish_destroy+0x12e/0x190 c:894
> in6_ifa_put ./include/net/addrconf.h:330
> addrconf_dad_work+0x4e9/0x1040 net/ipv6/addrconf.c:3963
> 
> The fix is to call in6_ifa_put() for the inet6_ifaddr before rather
> than after calling addrconf_ifdown(), as the latter may remove it from
> the address hash table.
> 
> Fixes: 85b51b12115c ("net: ipv6: Remove addresses for failures with strict DAD")
> Reported-by: Andrey Konovalov <andreyknvl@google.com>
> Signed-off-by: Mike Manning <mmanning@brocade.com>
> ---
>  net/ipv6/addrconf.c | 6 +++++-
>  1 file changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
> index 80ce478..361993a 100644
> --- a/net/ipv6/addrconf.c
> +++ b/net/ipv6/addrconf.c
> @@ -3902,8 +3902,11 @@ static void addrconf_dad_work(struct work_struct *w)
>  	} else if (action == DAD_ABORT) {
>  		in6_ifa_hold(ifp);
>  		addrconf_dad_stop(ifp, 1);
> -		if (disable_ipv6)
> +		if (disable_ipv6) {
> +			in6_ifa_put(ifp);
>  			addrconf_ifdown(idev->dev, 0);
> +			goto unlock;
> +		}
>  		goto out;
>  	}
>  
> @@ -3950,6 +3953,7 @@ static void addrconf_dad_work(struct work_struct *w)
>  		      ifp->dad_nonce);
>  out:
>  	in6_ifa_put(ifp);
> +unlock:
>  	rtnl_unlock();
>  }
>  
> 

^ permalink raw reply

* [PATCH v4 net-next 10/11] ibmvnic: Record SKB RX queue during poll
From: Nathan Fontenot @ 2017-05-03 18:05 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170503180246.29742.64039.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

From: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>

Map each RX SKB to the RX queue associated with the driver's RX SCRQ.
This should improve the RX CPU load balancing issues seen by the
performance team.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |    1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 00c9d5a..1b6268c 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1428,6 +1428,7 @@ static int ibmvnic_poll(struct napi_struct *napi, int budget)
 
 		skb_put(skb, length);
 		skb->protocol = eth_type_trans(skb, netdev);
+		skb_record_rx_queue(skb, scrq_num);
 
 		if (flags & IBMVNIC_IP_CHKSUM_GOOD &&
 		    flags & IBMVNIC_TCP_UDP_CHKSUM_GOOD) {

^ permalink raw reply related

* [PATCH v4 net-next 11/11] ibmvnic: Move queue restarting in ibmvnic_tx_complete
From: Nathan Fontenot @ 2017-05-03 18:05 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170503180246.29742.64039.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

Restart of the subqueue should occur outside of the loop processing
any tx buffers instead of doing this in the middle of the loop.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---

v2: Use __netif_subqueue_stopped() instead of netif_subqueue_stopped()
to avoid possible using un-initialized skb variable.
---
 drivers/net/ethernet/ibm/ibmvnic.c |   22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 1b6268c..4f2d329 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1809,19 +1809,8 @@ static int ibmvnic_complete_tx(struct ibmvnic_adapter *adapter,
 			}
 
 			if (txbuff->last_frag) {
-				if (atomic_sub_return(next->tx_comp.num_comps,
-						      &scrq->used) <=
-				    (adapter->req_tx_entries_per_subcrq / 2) &&
-				    netif_subqueue_stopped(adapter->netdev,
-							   txbuff->skb)) {
-					netif_wake_subqueue(adapter->netdev,
-							    scrq->pool_index);
-					netdev_dbg(adapter->netdev,
-						   "Started queue %d\n",
-						   scrq->pool_index);
-				}
-
 				dev_kfree_skb_any(txbuff->skb);
+				txbuff->skb = NULL;
 			}
 
 			adapter->tx_pool[pool].free_map[adapter->tx_pool[pool].
@@ -1832,6 +1821,15 @@ static int ibmvnic_complete_tx(struct ibmvnic_adapter *adapter,
 		}
 		/* remove tx_comp scrq*/
 		next->tx_comp.first = 0;
+
+		if (atomic_sub_return(next->tx_comp.num_comps, &scrq->used) <=
+		    (adapter->req_tx_entries_per_subcrq / 2) &&
+		    __netif_subqueue_stopped(adapter->netdev,
+					     scrq->pool_index)) {
+			netif_wake_subqueue(adapter->netdev, scrq->pool_index);
+			netdev_info(adapter->netdev, "Started queue %d\n",
+				    scrq->pool_index);
+		}
 	}
 
 	enable_scrq_irq(adapter, scrq);

^ permalink raw reply related

* Re: [PATCH] net/sched: remove redundant null check on head
From: walter harms @ 2017-05-03 14:21 UTC (permalink / raw)
  To: Colin King
  Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko, netdev, kernel-janitors
In-Reply-To: <20170503135040.32023-1-colin.king@canonical.com>



Am 03.05.2017 15:50, schrieb Colin King:
> From: Colin Ian King <colin.king@canonical.com>
> 
> head is previously null checked and so the 2nd null check on head
> is redundant and therefore can be removed.
> 
> Detected by CoverityScan, CID#1399505 ("Logically dead code")
> 
> Signed-off-by: Colin Ian King <colin.king@canonical.com>
> ---
>  net/sched/cls_matchall.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
> 
> diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
> index 2efb36c08f2a..dee469fed967 100644
> --- a/net/sched/cls_matchall.c
> +++ b/net/sched/cls_matchall.c
> @@ -203,8 +203,7 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
>  
>  	*arg = (unsigned long) head;
>  	rcu_assign_pointer(tp->root, new);
> -	if (head)
> -		call_rcu(&head->rcu, mall_destroy_rcu);
> +	call_rcu(&head->rcu, mall_destroy_rcu);
>  	return 0;
>  
>  err_replace_hw_filter:


if someone cares .. the err=0 in the the line above the patch is also useless
because err is never used again. Merging both if will save 1 indent level.

 err = mall_replace_hw_filter(tp, new, (unsigned long) new);
 if (err && tc_skip_sw(flags) )
	goto errout;


just my impression,
re
 wh

^ permalink raw reply

* Re: [PATCH] net: ipv6: Fix warning of freeing alive inet6 address
From: David Miller @ 2017-05-03 14:22 UTC (permalink / raw)
  To: mmanning; +Cc: netdev, andreyknvl
In-Reply-To: <1fb7b6de-ddb8-0207-0567-91d1ed0aad83@brocade.com>

From: Mike Manning <mmanning@brocade.com>
Date: Wed, 3 May 2017 15:16:20 +0100

> On reflection, please put this on hold subject to testing with
> syzkaller. I have not had a repro of the issue and so the fix even
> though harmless may not be effective.

Ok.

^ permalink raw reply

* [PATCH] ath5k: fix memory leak on buf on failed eeprom read
From: Colin King @ 2017-05-03 14:26 UTC (permalink / raw)
  To: Jiri Slaby, Nick Kossifidis, Luis R . Rodriguez, Kalle Valo,
	linux-wireless
  Cc: kernel-janitors, netdev

From: Colin Ian King <colin.king@canonical.com>

The AR5K_EEPROM_READ macro returns with -EIO if a read error
occurs causing a memory leak on the allocated buffer buf. Fix
this by explicitly calling ath5k_hw_nvram_read and exiting on
the via the freebuf label that performs the necessary free'ing
of buf when a read error occurs.

Detected by CoverityScan, CID#1248782 ("Resource Leak")

Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
 drivers/net/wireless/ath/ath5k/debug.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath5k/debug.c b/drivers/net/wireless/ath/ath5k/debug.c
index d068df520e7a..bd7f6d7b199e 100644
--- a/drivers/net/wireless/ath/ath5k/debug.c
+++ b/drivers/net/wireless/ath/ath5k/debug.c
@@ -938,7 +938,10 @@ static int open_file_eeprom(struct inode *inode, struct file *file)
 	}
 
 	for (i = 0; i < eesize; ++i) {
-		AR5K_EEPROM_READ(i, val);
+		if (!ath5k_hw_nvram_read(ah, i, &val)) {
+			ret = -EIO;
+			goto freebuf;
+		}
 		buf[i] = val;
 	}
 
-- 
2.11.0

^ permalink raw reply related

* Re: [PATCH] net: ethernet: stmmac: properly set PS bit in MII configurations during reset
From: Corentin Labbe @ 2017-05-03 14:30 UTC (permalink / raw)
  To: Giuseppe CAVALLARO; +Cc: Thomas Petazzoni, Alexandre Torgue, netdev, stable
In-Reply-To: <deea272a-f0a9-ad0b-4ad6-387723938314@st.com>

On Wed, May 03, 2017 at 10:13:56AM +0200, Giuseppe CAVALLARO wrote:
> Hello Thomas
> 
> this was initially set by using the hw->link.port; both the core_init 
> and adjust callback
> should invoke the hook and tuning the PS bit according to the speed and 
> mode.
> So maybe the ->set_ps is superfluous and you could reuse the existent hook
> 
> let me know
> 
> Regards
> peppe
> 

I just see that GMAC_CONTROL and MAC_CTRL_REG are the same, so why not create a custom adjust_link for each dwmac type ?
This will permit to call it instead of set_ps() and remove lots of if (has_gmac) and co in stmmac_adjust_link()
Basicly replace all between "ctrl = readl()... and writel(ctrl)" by a sot of priv->hw->mac->adjust_link()

It will also help a lot for my dwmac-sun8i inclusion (since I add some if has_sun8i:))

Regards

> On 4/27/2017 11:45 AM, Thomas Petazzoni wrote:
> > On the SPEAr600 SoC, which has the dwmac1000 variant of the IP block,
> > the DMA reset never succeeds when a MII PHY is used (no problem with a
> > GMII PHY). The dwmac_dma_reset() function sets the
> > DMA_BUS_MODE_SFT_RESET bit in the DMA_BUS_MODE register, and then
> > polls until this bit clears. When a MII PHY is used, with the current
> > driver, this bit never clears and the driver therefore doesn't work.
> >
> > The reason is that the PS bit of the GMAC_CONTROL register should be
> > correctly configured for the DMA reset to work. When the PS bit is 0,
> > it tells the MAC we have a GMII PHY, when the PS bit is 1, it tells
> > the MAC we have a MII PHY.
> >
> > Doing a DMA reset clears all registers, so the PS bit is cleared as
> > well. This makes the DMA reset work fine with a GMII PHY. However,
> > with MII PHY, the PS bit should be set.
> >
> > We have identified this issue thanks to two SPEAr600 platform:
> >
> >   - One equipped with a GMII PHY, with which the existing driver was
> >     working fine.
> >
> >   - One equipped with a MII PHY, where the current driver fails because
> >     the DMA reset times out.
> >
> > This patch fixes the problem for the MII PHY configuration, and has
> > been tested with a GMII PHY configuration as well.
> >
> > In terms of implement, since the ->reset() hook is implemented in the
> > DMA related code, we do not want to touch directly from this function
> > the MAC registers. Therefore, a ->set_ps() hook has been added to
> > stmmac_ops, which gets called between the moment the reset is asserted
> > and the polling loop waiting for the reset bit to clear.
> >
> > In order for this ->set_ps() hook to decide what to do, we pass it the
> > "struct mac_device_info" so it can access the MAC registers, and the
> > PHY interface type so it knows if we're using a MII PHY or not.
> >
> > The ->set_ps() hook is only implemented for the dwmac1000 case.
> >
> > Signed-off-by: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
> > Cc: <stable@vger.kernel.org>
> > ---
> > Do not hesitate to suggest ideas for alternative implementations, I'm
> > not sure if the current proposal is the one that fits best with the
> > current design of the driver.
> > ---
> >   drivers/net/ethernet/stmicro/stmmac/common.h         | 12 +++++++++---
> >   drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c | 16 ++++++++++++++++
> >   drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h     |  3 ++-
> >   drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c     |  7 ++++++-
> >   drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h      |  3 ++-
> >   drivers/net/ethernet/stmicro/stmmac/dwmac_lib.c      |  6 +++++-
> >   drivers/net/ethernet/stmicro/stmmac/stmmac_main.c    |  3 ++-
> >   7 files changed, 42 insertions(+), 8 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
> > index 04d9245..d576f95 100644
> > --- a/drivers/net/ethernet/stmicro/stmmac/common.h
> > +++ b/drivers/net/ethernet/stmicro/stmmac/common.h
> > @@ -407,10 +407,13 @@ struct stmmac_desc_ops {
> >   extern const struct stmmac_desc_ops enh_desc_ops;
> >   extern const struct stmmac_desc_ops ndesc_ops;
> >   
> > +struct mac_device_info;
> > +
> >   /* Specific DMA helpers */
> >   struct stmmac_dma_ops {
> >   	/* DMA core initialization */
> > -	int (*reset)(void __iomem *ioaddr);
> > +	int (*reset)(void __iomem *ioaddr, struct mac_device_info *hw,
> > +		     phy_interface_t interface);
> >   	void (*init)(void __iomem *ioaddr, struct stmmac_dma_cfg *dma_cfg,
> >   		     u32 dma_tx, u32 dma_rx, int atds);
> >   	/* Configure the AXI Bus Mode Register */
> > @@ -445,12 +448,15 @@ struct stmmac_dma_ops {
> >   	void (*enable_tso)(void __iomem *ioaddr, bool en, u32 chan);
> >   };
> >   
> > -struct mac_device_info;
> > -
> >   /* Helpers to program the MAC core */
> >   struct stmmac_ops {
> >   	/* MAC core initialization */
> >   	void (*core_init)(struct mac_device_info *hw, int mtu);
> > +	/* Set port select. Called between asserting DMA reset and
> > +	 * waiting for the reset bit to clear.
> > +	 */
> > +	void (*set_ps)(struct mac_device_info *hw,
> > +		       phy_interface_t interface);
> >   	/* Enable and verify that the IPC module is supported */
> >   	int (*rx_ipc)(struct mac_device_info *hw);
> >   	/* Enable RX Queues */
> > diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
> > index 19b9b308..dfcbb5b 100644
> > --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
> > +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
> > @@ -75,6 +75,21 @@ static void dwmac1000_core_init(struct mac_device_info *hw, int mtu)
> >   #endif
> >   }
> >   
> > +static void dwmac1000_set_ps(struct mac_device_info *hw,
> > +			     phy_interface_t interface)
> > +{
> > +	void __iomem *ioaddr = hw->pcsr;
> > +	u32 value = readl(ioaddr + GMAC_CONTROL);
> > +
> > +	/* When a MII PHY is used, we must set the PS bit for the DMA
> > +	 * reset to succeed.
> > +	 */
> > +	if (interface == PHY_INTERFACE_MODE_MII)
> > +		value |= GMAC_CONTROL_PS;
> > +
> > +	writel(value, ioaddr + GMAC_CONTROL);
> > +}
> > +
> >   static int dwmac1000_rx_ipc_enable(struct mac_device_info *hw)
> >   {
> >   	void __iomem *ioaddr = hw->pcsr;
> > @@ -488,6 +503,7 @@ static void dwmac1000_debug(void __iomem *ioaddr, struct stmmac_extra_stats *x)
> >   
> >   static const struct stmmac_ops dwmac1000_ops = {
> >   	.core_init = dwmac1000_core_init,
> > +	.set_ps = dwmac1000_set_ps,
> >   	.rx_ipc = dwmac1000_rx_ipc_enable,
> >   	.dump_regs = dwmac1000_dump_regs,
> >   	.host_irq_status = dwmac1000_irq_status,
> > diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h
> > index 1b06df7..e9c6c49 100644
> > --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h
> > +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h
> > @@ -183,7 +183,8 @@
> >   #define DMA_CHAN0_DBG_STAT_RPS		GENMASK(11, 8)
> >   #define DMA_CHAN0_DBG_STAT_RPS_SHIFT	8
> >   
> > -int dwmac4_dma_reset(void __iomem *ioaddr);
> > +int dwmac4_dma_reset(void __iomem *ioaddr, struct mac_device_info *hw,
> > +		     phy_interface_t interface);
> >   void dwmac4_enable_dma_transmission(void __iomem *ioaddr, u32 tail_ptr);
> >   void dwmac4_enable_dma_irq(void __iomem *ioaddr);
> >   void dwmac410_enable_dma_irq(void __iomem *ioaddr);
> > diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c
> > index c7326d5..485eecb 100644
> > --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c
> > +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c
> > @@ -14,7 +14,8 @@
> >   #include "dwmac4_dma.h"
> >   #include "dwmac4.h"
> >   
> > -int dwmac4_dma_reset(void __iomem *ioaddr)
> > +int dwmac4_dma_reset(void __iomem *ioaddr, struct mac_device_info *hw,
> > +		     phy_interface_t interface)
> >   {
> >   	u32 value = readl(ioaddr + DMA_BUS_MODE);
> >   	int limit;
> > @@ -22,6 +23,10 @@ int dwmac4_dma_reset(void __iomem *ioaddr)
> >   	/* DMA SW reset */
> >   	value |= DMA_BUS_MODE_SFT_RESET;
> >   	writel(value, ioaddr + DMA_BUS_MODE);
> > +
> > +	if (hw->mac->set_ps)
> > +		hw->mac->set_ps(hw, interface);
> > +
> >   	limit = 10;
> >   	while (limit--) {
> >   		if (!(readl(ioaddr + DMA_BUS_MODE) & DMA_BUS_MODE_SFT_RESET))
> > diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h b/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h
> > index 56e485f..25ae028 100644
> > --- a/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h
> > +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h
> > @@ -144,6 +144,7 @@ void dwmac_dma_stop_tx(void __iomem *ioaddr);
> >   void dwmac_dma_start_rx(void __iomem *ioaddr);
> >   void dwmac_dma_stop_rx(void __iomem *ioaddr);
> >   int dwmac_dma_interrupt(void __iomem *ioaddr, struct stmmac_extra_stats *x);
> > -int dwmac_dma_reset(void __iomem *ioaddr);
> > +int dwmac_dma_reset(void __iomem *ioaddr, struct mac_device_info *hw,
> > +		    phy_interface_t interface);
> >   
> >   #endif /* __DWMAC_DMA_H__ */
> > diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac_lib.c b/drivers/net/ethernet/stmicro/stmmac/dwmac_lib.c
> > index e60bfca..1a17df5 100644
> > --- a/drivers/net/ethernet/stmicro/stmmac/dwmac_lib.c
> > +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac_lib.c
> > @@ -23,7 +23,8 @@
> >   
> >   #define GMAC_HI_REG_AE		0x80000000
> >   
> > -int dwmac_dma_reset(void __iomem *ioaddr)
> > +int dwmac_dma_reset(void __iomem *ioaddr, struct mac_device_info *hw,
> > +		    phy_interface_t interface)
> >   {
> >   	u32 value = readl(ioaddr + DMA_BUS_MODE);
> >   	int err;
> > @@ -32,6 +33,9 @@ int dwmac_dma_reset(void __iomem *ioaddr)
> >   	value |= DMA_BUS_MODE_SFT_RESET;
> >   	writel(value, ioaddr + DMA_BUS_MODE);
> >   
> > +	if (hw->mac->set_ps)
> > +		hw->mac->set_ps(hw, interface);
> > +
> >   	err = readl_poll_timeout(ioaddr + DMA_BUS_MODE, value,
> >   				 !(value & DMA_BUS_MODE_SFT_RESET),
> >   				 100000, 10000);
> > diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> > index 4498a38..66bc218 100644
> > --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> > +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> > @@ -1585,7 +1585,8 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv)
> >   	if (priv->extend_desc && (priv->mode == STMMAC_RING_MODE))
> >   		atds = 1;
> >   
> > -	ret = priv->hw->dma->reset(priv->ioaddr);
> > +	ret = priv->hw->dma->reset(priv->ioaddr, priv->hw,
> > +				   priv->plat->interface);
> >   	if (ret) {
> >   		dev_err(priv->device, "Failed to reset the dma\n");
> >   		return ret;
> 
> 

^ permalink raw reply

* Re: net/smc and the RDMA core
From: Ursula Braun @ 2017-05-03 14:40 UTC (permalink / raw)
  To: Bart Van Assche, hch-jcswGhMUV9g@public.gmane.org,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <1493750358.2552.13.camel-XdAiOPVOjttBDgjK7y7TUQ@public.gmane.org>



On 05/02/2017 08:39 PM, Bart Van Assche wrote:
> On Tue, 2017-05-02 at 14:25 +0200, Ursula Braun wrote:
>> if you can point out specific issues, we will be happy to work with you
>> to get them addressed!
> 
> Hello Ursula,
> 
> My list of issues that I would like to see addressed can be found below. Doug,
> Christoph and others may have additional inputs. The issues that have not yet
> been mentioned in other e-mails are:
> - The SMC driver only supports one RDMA transport type (RoCE v1) but
>   none of the other RDMA transport types (RoCE v2, IB and iWARP). New
>   RDMA drivers should support all RDMA transport types transparently.
>   The traditional approach to support multiple RDMA transport types is
>   by using the RDMA/CM to establish connections.
> - The implementation of the SMC driver only supports RoCEv1. This is
>   a very unfortunate choice because:
>   - RoCEv1 is not routable and hence is limited to a single Ethernet
>     broadcast domain.
>   - RoCEv1 packets escape a whole bunch of mechanisms that only work
>     for IP packets. Firewalls pass all RoCEv1 packets and switches
>     do not restrict RoCEv1 packets to a single VLAN. This means that
>     if the network configuration is changed after an SMC connection
>     has been set up such that IP communication between the endpoints
>     of an SMC connection is blocked that the SMC RoCEv1 packets will
>     not be blocked by the network equipment of which the configuration
>     has just been changed.
> - As already mentioned by Christoph, the SMC implementation uses RDMA
>   calls that probably will be deprecated soon (ib_create_cq()) and
>   should use the RDMA R/W API instead of building sge lists itself.
> 
> Bart.
> 
Thanks for your list, we will take care of it!

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH v2 net] xfrm: fix stack access out of bounds with CONFIG_XFRM_SUB_POLICY
From: Sabrina Dubroca @ 2017-05-03 14:43 UTC (permalink / raw)
  To: netdev; +Cc: Sabrina Dubroca, Steffen Klassert, Herbert Xu

When CONFIG_XFRM_SUB_POLICY=y, xfrm_dst stores a copy of the flowi for
that dst. Unfortunately, the code that allocates and fills this copy
doesn't care about what type of flowi (flowi, flowi4, flowi6) gets
passed. In multiple code paths (from raw_sendmsg, from TCP when
replying to a FIN, in vxlan, geneve, and gre), the flowi that gets
passed to xfrm is actually an on-stack flowi4, so we end up reading
stuff from the stack past the end of the flowi4 struct.

Since xfrm_dst->origin isn't used anywhere following commit
ca116922afa8 ("xfrm: Eliminate "fl" and "pol" args to
xfrm_bundle_ok()."), just get rid of it.  xfrm_dst->partner isn't used
either, so get rid of that too.

Fixes: 9d6ec938019c ("ipv4: Use flowi4 in public route lookup interfaces.")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
---
v2: change Fixes tag to the commit that actually introduced the issue

 include/net/xfrm.h     | 10 ----------
 net/xfrm/xfrm_policy.c | 47 -----------------------------------------------
 2 files changed, 57 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 6793a30c66b1..7e7e2b0d2915 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -979,10 +979,6 @@ struct xfrm_dst {
 	struct flow_cache_object flo;
 	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
 	int num_pols, num_xfrms;
-#ifdef CONFIG_XFRM_SUB_POLICY
-	struct flowi *origin;
-	struct xfrm_selector *partner;
-#endif
 	u32 xfrm_genid;
 	u32 policy_genid;
 	u32 route_mtu_cached;
@@ -998,12 +994,6 @@ static inline void xfrm_dst_destroy(struct xfrm_dst *xdst)
 	dst_release(xdst->route);
 	if (likely(xdst->u.dst.xfrm))
 		xfrm_state_put(xdst->u.dst.xfrm);
-#ifdef CONFIG_XFRM_SUB_POLICY
-	kfree(xdst->origin);
-	xdst->origin = NULL;
-	kfree(xdst->partner);
-	xdst->partner = NULL;
-#endif
 }
 #endif
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index b00a1d5a7f52..ed4e52d95172 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1797,43 +1797,6 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 	goto out;
 }
 
-#ifdef CONFIG_XFRM_SUB_POLICY
-static int xfrm_dst_alloc_copy(void **target, const void *src, int size)
-{
-	if (!*target) {
-		*target = kmalloc(size, GFP_ATOMIC);
-		if (!*target)
-			return -ENOMEM;
-	}
-
-	memcpy(*target, src, size);
-	return 0;
-}
-#endif
-
-static int xfrm_dst_update_parent(struct dst_entry *dst,
-				  const struct xfrm_selector *sel)
-{
-#ifdef CONFIG_XFRM_SUB_POLICY
-	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
-	return xfrm_dst_alloc_copy((void **)&(xdst->partner),
-				   sel, sizeof(*sel));
-#else
-	return 0;
-#endif
-}
-
-static int xfrm_dst_update_origin(struct dst_entry *dst,
-				  const struct flowi *fl)
-{
-#ifdef CONFIG_XFRM_SUB_POLICY
-	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
-	return xfrm_dst_alloc_copy((void **)&(xdst->origin), fl, sizeof(*fl));
-#else
-	return 0;
-#endif
-}
-
 static int xfrm_expand_policies(const struct flowi *fl, u16 family,
 				struct xfrm_policy **pols,
 				int *num_pols, int *num_xfrms)
@@ -1905,16 +1868,6 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
 
 	xdst = (struct xfrm_dst *)dst;
 	xdst->num_xfrms = err;
-	if (num_pols > 1)
-		err = xfrm_dst_update_parent(dst, &pols[1]->selector);
-	else
-		err = xfrm_dst_update_origin(dst, fl);
-	if (unlikely(err)) {
-		dst_free(dst);
-		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
-		return ERR_PTR(err);
-	}
-
 	xdst->num_pols = num_pols;
 	memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);
 	xdst->policy_genid = atomic_read(&pols[0]->genid);
-- 
2.12.2

^ permalink raw reply related

* [PATCH net] ah: use crypto_memneq to check the ICV
From: Sabrina Dubroca @ 2017-05-03 14:57 UTC (permalink / raw)
  To: netdev; +Cc: Sabrina Dubroca, Steffen Klassert, Herbert Xu

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
---
 net/ipv4/ah4.c | 5 +++--
 net/ipv6/ah6.c | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 22377c8ff14b..207350b30f88 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -1,5 +1,6 @@
 #define pr_fmt(fmt) "IPsec: " fmt
 
+#include <crypto/algapi.h>
 #include <crypto/hash.h>
 #include <linux/err.h>
 #include <linux/module.h>
@@ -277,7 +278,7 @@ static void ah_input_done(struct crypto_async_request *base, int err)
 	auth_data = ah_tmp_auth(work_iph, ihl);
 	icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
 
-	err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0;
+	err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0;
 	if (err)
 		goto out;
 
@@ -413,7 +414,7 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 		goto out_free;
 	}
 
-	err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0;
+	err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0;
 	if (err)
 		goto out_free;
 
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index dda6035e3b84..ac747b13a8dc 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -25,6 +25,7 @@
 
 #define pr_fmt(fmt) "IPv6: " fmt
 
+#include <crypto/algapi.h>
 #include <crypto/hash.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -481,7 +482,7 @@ static void ah6_input_done(struct crypto_async_request *base, int err)
 	auth_data = ah_tmp_auth(work_iph, hdr_len);
 	icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
 
-	err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0;
+	err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0;
 	if (err)
 		goto out;
 
@@ -627,7 +628,7 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb)
 		goto out_free;
 	}
 
-	err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0;
+	err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0;
 	if (err)
 		goto out_free;
 
-- 
2.12.2

^ permalink raw reply related

* Re: [PATCH] net: ipv6: Fix warning of freeing alive inet6 address
From: Andrey Konovalov @ 2017-05-03 14:58 UTC (permalink / raw)
  To: Mike Manning; +Cc: netdev
In-Reply-To: <1fb7b6de-ddb8-0207-0567-91d1ed0aad83@brocade.com>

On Wed, May 3, 2017 at 4:16 PM, Mike Manning <mmanning@brocade.com> wrote:
> On reflection, please put this on hold subject to testing with syzkaller. I have not had a repro of the issue and so the fix even though harmless may not be effective.

Hi Mike,

I didn't see your patch, you think you might have forgotten to add me
to recipients.

Did you try building the kernel with the provided config and then
running the reproducer?

Would you like me to test if this patch fixes the warning?

Thanks!

>
> Thanks
> Mike
>
> On 02/05/17 19:30, Mike Manning wrote:
>> While this is not reproducible manually, Andrey's syzkaller program hit
>> the warning "IPv6: Freeing alive inet6 address" with this part trace:
>>
>> inet6_ifa_finish_destroy+0x12e/0x190 c:894
>> in6_ifa_put ./include/net/addrconf.h:330
>> addrconf_dad_work+0x4e9/0x1040 net/ipv6/addrconf.c:3963
>>
>> The fix is to call in6_ifa_put() for the inet6_ifaddr before rather
>> than after calling addrconf_ifdown(), as the latter may remove it from
>> the address hash table.
>>
>> Fixes: 85b51b12115c ("net: ipv6: Remove addresses for failures with strict DAD")
>> Reported-by: Andrey Konovalov <andreyknvl@google.com>
>> Signed-off-by: Mike Manning <mmanning@brocade.com>
>> ---
>>  net/ipv6/addrconf.c | 6 +++++-
>>  1 file changed, 5 insertions(+), 1 deletion(-)
>>
>> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
>> index 80ce478..361993a 100644
>> --- a/net/ipv6/addrconf.c
>> +++ b/net/ipv6/addrconf.c
>> @@ -3902,8 +3902,11 @@ static void addrconf_dad_work(struct work_struct *w)
>>       } else if (action == DAD_ABORT) {
>>               in6_ifa_hold(ifp);
>>               addrconf_dad_stop(ifp, 1);
>> -             if (disable_ipv6)
>> +             if (disable_ipv6) {
>> +                     in6_ifa_put(ifp);
>>                       addrconf_ifdown(idev->dev, 0);
>> +                     goto unlock;
>> +             }
>>               goto out;
>>       }
>>
>> @@ -3950,6 +3953,7 @@ static void addrconf_dad_work(struct work_struct *w)
>>                     ifp->dad_nonce);
>>  out:
>>       in6_ifa_put(ifp);
>> +unlock:
>>       rtnl_unlock();
>>  }
>>
>>
>

^ permalink raw reply

* [PATCH] ipv4, ipv6: ensure raw socket message is big enough to hold an IP header
From: Alexander Potapenko @ 2017-05-03 15:06 UTC (permalink / raw)
  To: dvyukov, kcc, edumazet, davem, kuznet; +Cc: linux-kernel, netdev

raw_send_hdrinc() and rawv6_send_hdrinc() expect that the buffer copied
from the userspace contains the IPv4/IPv6 header, so if too few bytes are
copied, parts of the header may remain uninitialized.

This bug has been detected with KMSAN.

Signed-off-by: Alexander Potapenko <glider@google.com>
---
The previous version of this patch was called "ipv6: ensure message length
for raw socket is at least sizeof(ipv6hdr)"

For the record, the KMSAN report:

==================================================================
BUG: KMSAN: use of unitialized memory in nf_ct_frag6_gather+0xf5a/0x44a0
inter: 0
CPU: 0 PID: 1036 Comm: probe Not tainted 4.11.0-rc5+ #2455
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:16
 dump_stack+0x143/0x1b0 lib/dump_stack.c:52
 kmsan_report+0x16b/0x1e0 mm/kmsan/kmsan.c:1078
 __kmsan_warning_32+0x5c/0xa0 mm/kmsan/kmsan_instr.c:510
 nf_ct_frag6_gather+0xf5a/0x44a0 net/ipv6/netfilter/nf_conntrack_reasm.c:577
 ipv6_defrag+0x1d9/0x280 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c:68
 nf_hook_entry_hookfn ./include/linux/netfilter.h:102
 nf_hook_slow+0x13f/0x3c0 net/netfilter/core.c:310
 nf_hook ./include/linux/netfilter.h:212
 NF_HOOK ./include/linux/netfilter.h:255
 rawv6_send_hdrinc net/ipv6/raw.c:673
 rawv6_sendmsg+0x2fcb/0x41a0 net/ipv6/raw.c:919
 inet_sendmsg+0x3f8/0x6d0 net/ipv4/af_inet.c:762
 sock_sendmsg_nosec net/socket.c:633
 sock_sendmsg net/socket.c:643
 SYSC_sendto+0x6a5/0x7c0 net/socket.c:1696
 SyS_sendto+0xbc/0xe0 net/socket.c:1664
 do_syscall_64+0x72/0xa0 arch/x86/entry/common.c:285
 entry_SYSCALL64_slow_path+0x25/0x25 arch/x86/entry/entry_64.S:246
RIP: 0033:0x436e03
RSP: 002b:00007ffce48baf38 EFLAGS: 00000246 ORIG_RAX: 000000000000002c
RAX: ffffffffffffffda RBX: 00000000004002b0 RCX: 0000000000436e03
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000003
RBP: 00007ffce48baf90 R08: 00007ffce48baf50 R09: 000000000000001c
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 0000000000401790 R14: 0000000000401820 R15: 0000000000000000
origin: 00000000d9400053
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 kmsan_save_stack_with_flags mm/kmsan/kmsan.c:362
 kmsan_internal_poison_shadow+0xb1/0x1a0 mm/kmsan/kmsan.c:257
 kmsan_poison_shadow+0x6d/0xc0 mm/kmsan/kmsan.c:270
 slab_alloc_node mm/slub.c:2735
 __kmalloc_node_track_caller+0x1f4/0x390 mm/slub.c:4341
 __kmalloc_reserve net/core/skbuff.c:138
 __alloc_skb+0x2cd/0x740 net/core/skbuff.c:231
 alloc_skb ./include/linux/skbuff.h:933
 alloc_skb_with_frags+0x209/0xbc0 net/core/skbuff.c:4678
 sock_alloc_send_pskb+0x9ff/0xe00 net/core/sock.c:1903
 sock_alloc_send_skb+0xe4/0x100 net/core/sock.c:1920
 rawv6_send_hdrinc net/ipv6/raw.c:638
 rawv6_sendmsg+0x2918/0x41a0 net/ipv6/raw.c:919
 inet_sendmsg+0x3f8/0x6d0 net/ipv4/af_inet.c:762
 sock_sendmsg_nosec net/socket.c:633
 sock_sendmsg net/socket.c:643
 SYSC_sendto+0x6a5/0x7c0 net/socket.c:1696
 SyS_sendto+0xbc/0xe0 net/socket.c:1664
 do_syscall_64+0x72/0xa0 arch/x86/entry/common.c:285
 return_from_SYSCALL_64+0x0/0x6a arch/x86/entry/entry_64.S:246
==================================================================

, triggered by the following syscalls:
  socket(PF_INET6, SOCK_RAW, IPPROTO_RAW) = 3
  sendto(3, NULL, 0, 0, {sa_family=AF_INET6, sin6_port=htons(0), inet_pton(AF_INET6, "ff00::", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, 28) = -1 EPERM

A similar report is triggered in net/ipv4/raw.c if we use a PF_INET socket
instead of a PF_INET6 one.
---

 net/ipv4/raw.c | 3 +++
 net/ipv6/raw.c | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 9d943974de2b..bdffad875691 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -358,6 +358,9 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
 			       rt->dst.dev->mtu);
 		return -EMSGSIZE;
 	}
+	if (length < sizeof(struct iphdr))
+		return -EINVAL;
+
 	if (flags&MSG_PROBE)
 		goto out;
 
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index f174e76e6505..805464668bd8 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -632,6 +632,8 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
 		ipv6_local_error(sk, EMSGSIZE, fl6, rt->dst.dev->mtu);
 		return -EMSGSIZE;
 	}
+	if (length < sizeof(struct ipv6hdr))
+		return -EINVAL;
 	if (flags&MSG_PROBE)
 		goto out;
 
-- 
2.13.0.rc1.294.g07d810a77f-goog

^ permalink raw reply related

* Re: [PATCH] net: ethernet: stmmac: properly set PS bit in MII configurations during reset
From: Thomas Petazzoni @ 2017-05-03 15:11 UTC (permalink / raw)
  To: Giuseppe CAVALLARO; +Cc: Alexandre Torgue, netdev, stable
In-Reply-To: <deea272a-f0a9-ad0b-4ad6-387723938314@st.com>

Hello Giuseppe,

On Wed, 3 May 2017 10:13:56 +0200, Giuseppe CAVALLARO wrote:

> this was initially set by using the hw->link.port; both the core_init 
> and adjust callback
> should invoke the hook and tuning the PS bit according to the speed and 
> mode.

But this doesn't work: core_init and adjust callbacks are called too
late / not at the appropriate time.

Here, I need the PS to be set after asserting the DMA reset bit but
*before* polling for the DMA reset bit to clear.

I.e, I need:

int dwmac_dma_reset(void __iomem *ioaddr, struct mac_device_info *hw,
                    phy_interface_t interface)
{
        u32 value = readl(ioaddr + DMA_BUS_MODE);
        int limit;

        /* DMA SW reset */
        value |= DMA_BUS_MODE_SFT_RESET;
        writel(value, ioaddr + DMA_BUS_MODE);

        /* ===> PS must be set here when the PHY interface is MII */

        limit = 10;
        while (limit--) {
                if (!(readl(ioaddr + DMA_BUS_MODE) & DMA_BUS_MODE_SFT_RESET))
                        break;
                mdelay(10);
        }

        if (limit < 0)
                return -EBUSY;

        return 0;
}

Setting PS *before* asserting the DMA reset bit doesn't work, because
asserting the DMA reset bit clears all bits in all registers.

Setting PS *after* waiting for the DMA reset bit to clear doesn't work,
because this bit never clears if the PS configuration is not correct
with the regard to the PHY being used (which was my original problem).

Am I missing something here?

Best regards,

Thomas Petazzoni
-- 
Thomas Petazzoni, CTO, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

^ permalink raw reply

* Re: [PATCH] net/sched: remove redundant null check on head
From: Jiri Pirko @ 2017-05-03 15:12 UTC (permalink / raw)
  To: walter harms
  Cc: Colin King, Jamal Hadi Salim, Cong Wang, netdev, kernel-janitors
In-Reply-To: <5909E75B.30305@bfs.de>

Wed, May 03, 2017 at 04:21:15PM CEST, wharms@bfs.de wrote:
>
>
>Am 03.05.2017 15:50, schrieb Colin King:
>> From: Colin Ian King <colin.king@canonical.com>
>> 
>> head is previously null checked and so the 2nd null check on head
>> is redundant and therefore can be removed.
>> 
>> Detected by CoverityScan, CID#1399505 ("Logically dead code")
>> 
>> Signed-off-by: Colin Ian King <colin.king@canonical.com>
>> ---
>>  net/sched/cls_matchall.c | 3 +--
>>  1 file changed, 1 insertion(+), 2 deletions(-)
>> 
>> diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
>> index 2efb36c08f2a..dee469fed967 100644
>> --- a/net/sched/cls_matchall.c
>> +++ b/net/sched/cls_matchall.c
>> @@ -203,8 +203,7 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
>>  
>>  	*arg = (unsigned long) head;
>>  	rcu_assign_pointer(tp->root, new);
>> -	if (head)
>> -		call_rcu(&head->rcu, mall_destroy_rcu);
>> +	call_rcu(&head->rcu, mall_destroy_rcu);
>>  	return 0;
>>  
>>  err_replace_hw_filter:
>
>
>if someone cares .. the err=0 in the the line above the patch is also useless
>because err is never used again. Merging both if will save 1 indent level.
>
> err = mall_replace_hw_filter(tp, new, (unsigned long) new);
> if (err && tc_skip_sw(flags) )
>	goto errout;


true, however unrelated to this patch :)

Please submit as a separate patch.

Thanks.

^ permalink raw reply

* Re: [PATCH v4 net-next 00/11] ibmvnic: Updated reset handler and code fixes
From: David Miller @ 2017-05-03 15:33 UTC (permalink / raw)
  To: nfont; +Cc: netdev, brking, jallen, muvic, tlfalcon
In-Reply-To: <20170503180246.29742.64039.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Wed, 03 May 2017 14:04:20 -0400

> This set of patches multiple code fixes and a new rest handler
> for the ibmvnic driver. In order to implement the new reset handler
> for the ibmvnic driver resource initialization needed to be moved to
> its own routine, a state variable is introduced to replace the
> various is_* flags in the driver, and a new routine to handle the
> assorted reasons the driver can be reset.

Series applied, thank you.

^ permalink raw reply

* Re: [patch iproute2 v2 1/2] devlink: Change netlink attribute validation
From: Greg Rose @ 2017-05-03 15:43 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev@vger.kernel.org, arkadis, Stephen Hemminger,
	David S. Miller, mlxsw, David Ahern
In-Reply-To: <1493810723-2536-2-git-send-email-jiri@resnulli.us>

On Wed, May 3, 2017 at 4:25 AM, Jiri Pirko <jiri@resnulli.us> wrote:
> From: Arkadi Sharshevsky <arkadis@mellanox.com>
>
> Currently the netlink attribute resolving is done by a sequence of
> if's. Change the attribute resolving to table lookup.
>
> Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
> ---
>  devlink/devlink.c | 103 ++++++++++++++++--------------------------------------
>  1 file changed, 30 insertions(+), 73 deletions(-)
>
> diff --git a/devlink/devlink.c b/devlink/devlink.c
> index e90226e..35220d8 100644
> --- a/devlink/devlink.c
> +++ b/devlink/devlink.c
> @@ -232,88 +232,45 @@ static bool dl_no_arg(struct dl *dl)
>         return dl_argc(dl) == 0;
>  }
>
> +static const enum mnl_attr_data_type devlink_policy[DEVLINK_ATTR_MAX + 1] = {
> +       [DEVLINK_ATTR_BUS_NAME] = MNL_TYPE_NUL_STRING,
> +       [DEVLINK_ATTR_DEV_NAME] = MNL_TYPE_NUL_STRING,
> +       [DEVLINK_ATTR_PORT_INDEX] = MNL_TYPE_U32,
> +       [DEVLINK_ATTR_PORT_TYPE] = MNL_TYPE_U16,
> +       [DEVLINK_ATTR_PORT_DESIRED_TYPE] = MNL_TYPE_U16,
> +       [DEVLINK_ATTR_PORT_NETDEV_IFINDEX] = MNL_TYPE_U32,
> +       [DEVLINK_ATTR_PORT_NETDEV_NAME] = MNL_TYPE_NUL_STRING,
> +       [DEVLINK_ATTR_PORT_IBDEV_NAME] = MNL_TYPE_NUL_STRING,
> +       [DEVLINK_ATTR_SB_INDEX] = MNL_TYPE_U32,
> +       [DEVLINK_ATTR_SB_SIZE] = MNL_TYPE_U32,
> +       [DEVLINK_ATTR_SB_INGRESS_POOL_COUNT] = MNL_TYPE_U16,
> +       [DEVLINK_ATTR_SB_EGRESS_POOL_COUNT] = MNL_TYPE_U16,
> +       [DEVLINK_ATTR_SB_INGRESS_TC_COUNT] = MNL_TYPE_U16,
> +       [DEVLINK_ATTR_SB_EGRESS_TC_COUNT] = MNL_TYPE_U16,
> +       [DEVLINK_ATTR_SB_POOL_INDEX] = MNL_TYPE_U16,
> +       [DEVLINK_ATTR_SB_POOL_TYPE] = MNL_TYPE_U8,
> +       [DEVLINK_ATTR_SB_POOL_SIZE] = MNL_TYPE_U32,
> +       [DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE] = MNL_TYPE_U8,
> +       [DEVLINK_ATTR_SB_THRESHOLD] = MNL_TYPE_U32,
> +       [DEVLINK_ATTR_SB_TC_INDEX] = MNL_TYPE_U16,
> +       [DEVLINK_ATTR_SB_OCC_CUR] = MNL_TYPE_U32,
> +       [DEVLINK_ATTR_SB_OCC_MAX] = MNL_TYPE_U32,
> +       [DEVLINK_ATTR_ESWITCH_MODE] = MNL_TYPE_U16,
> +       [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = MNL_TYPE_U8,
> +};
> +
>  static int attr_cb(const struct nlattr *attr, void *data)
>  {
>         const struct nlattr **tb = data;
>         int type;
>
> -       type = mnl_attr_get_type(attr);
> -
>         if (mnl_attr_type_valid(attr, DEVLINK_ATTR_MAX) < 0)
>                 return MNL_CB_ERROR;
>
> -       if (type == DEVLINK_ATTR_BUS_NAME &&
> -           mnl_attr_validate(attr, MNL_TYPE_NUL_STRING) < 0)
> -               return MNL_CB_ERROR;
> -       if (type == DEVLINK_ATTR_DEV_NAME &&
> -           mnl_attr_validate(attr, MNL_TYPE_NUL_STRING) < 0)
> -               return MNL_CB_ERROR;
> -       if (type == DEVLINK_ATTR_PORT_INDEX &&
> -           mnl_attr_validate(attr, MNL_TYPE_U32) < 0)
> -               return MNL_CB_ERROR;
> -       if (type == DEVLINK_ATTR_PORT_TYPE &&
> -           mnl_attr_validate(attr, MNL_TYPE_U16) < 0)
> -               return MNL_CB_ERROR;
> -       if (type == DEVLINK_ATTR_PORT_DESIRED_TYPE &&
> -           mnl_attr_validate(attr, MNL_TYPE_U16) < 0)
> -               return MNL_CB_ERROR;
> -       if (type == DEVLINK_ATTR_PORT_NETDEV_IFINDEX &&
> -           mnl_attr_validate(attr, MNL_TYPE_U32) < 0)
> -               return MNL_CB_ERROR;
> -       if (type == DEVLINK_ATTR_PORT_NETDEV_NAME &&
> -           mnl_attr_validate(attr, MNL_TYPE_NUL_STRING) < 0)
> -               return MNL_CB_ERROR;
> -       if (type == DEVLINK_ATTR_PORT_IBDEV_NAME &&
> -           mnl_attr_validate(attr, MNL_TYPE_NUL_STRING) < 0)
> -               return MNL_CB_ERROR;
> -       if (type == DEVLINK_ATTR_SB_INDEX &&
> -           mnl_attr_validate(attr, MNL_TYPE_U32) < 0)
> -               return MNL_CB_ERROR;
> -       if (type == DEVLINK_ATTR_SB_SIZE &&
> -           mnl_attr_validate(attr, MNL_TYPE_U32) < 0)
> -               return MNL_CB_ERROR;
> -       if (type == DEVLINK_ATTR_SB_INGRESS_POOL_COUNT &&
> -           mnl_attr_validate(attr, MNL_TYPE_U16) < 0)
> -               return MNL_CB_ERROR;
> -       if (type == DEVLINK_ATTR_SB_EGRESS_POOL_COUNT &&
> -           mnl_attr_validate(attr, MNL_TYPE_U16) < 0)
> -               return MNL_CB_ERROR;
> -       if (type == DEVLINK_ATTR_SB_INGRESS_TC_COUNT &&
> -           mnl_attr_validate(attr, MNL_TYPE_U16) < 0)
> -               return MNL_CB_ERROR;
> -       if (type == DEVLINK_ATTR_SB_EGRESS_TC_COUNT &&
> -           mnl_attr_validate(attr, MNL_TYPE_U16) < 0)
> -               return MNL_CB_ERROR;
> -       if (type == DEVLINK_ATTR_SB_POOL_INDEX &&
> -           mnl_attr_validate(attr, MNL_TYPE_U16) < 0)
> -               return MNL_CB_ERROR;
> -       if (type == DEVLINK_ATTR_SB_POOL_TYPE &&
> -           mnl_attr_validate(attr, MNL_TYPE_U8) < 0)
> -               return MNL_CB_ERROR;
> -       if (type == DEVLINK_ATTR_SB_POOL_SIZE &&
> -           mnl_attr_validate(attr, MNL_TYPE_U32) < 0)
> -               return MNL_CB_ERROR;
> -       if (type == DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE &&
> -           mnl_attr_validate(attr, MNL_TYPE_U8) < 0)
> -               return MNL_CB_ERROR;
> -       if (type == DEVLINK_ATTR_SB_THRESHOLD &&
> -           mnl_attr_validate(attr, MNL_TYPE_U32) < 0)
> -               return MNL_CB_ERROR;
> -       if (type == DEVLINK_ATTR_SB_TC_INDEX &&
> -           mnl_attr_validate(attr, MNL_TYPE_U16) < 0)
> -               return MNL_CB_ERROR;
> -       if (type == DEVLINK_ATTR_SB_OCC_CUR &&
> -           mnl_attr_validate(attr, MNL_TYPE_U32) < 0)
> -               return MNL_CB_ERROR;
> -       if (type == DEVLINK_ATTR_SB_OCC_MAX &&
> -           mnl_attr_validate(attr, MNL_TYPE_U32) < 0)
> -               return MNL_CB_ERROR;
> -       if (type == DEVLINK_ATTR_ESWITCH_MODE &&
> -           mnl_attr_validate(attr, MNL_TYPE_U16) < 0)
> -               return MNL_CB_ERROR;
> -       if (type == DEVLINK_ATTR_ESWITCH_INLINE_MODE &&
> -           mnl_attr_validate(attr, MNL_TYPE_U8) < 0)
> +       type = mnl_attr_get_type(attr);
> +       if (mnl_attr_validate(attr, devlink_policy[type]) < 0)
>                 return MNL_CB_ERROR;
> +
>         tb[type] = attr;
>         return MNL_CB_OK;
>  }
> --
> 2.7.4
>

Much better!!!  LGTM.

Reviewed-by: Greg Rose <gvrose8192@gmail.com>

^ permalink raw reply

* Re: x86: warning: kernel stack regs has bad 'bp' value
From: Andrey Konovalov @ 2017-05-03 15:50 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: Thomas Gleixner, Ingo Molnar, H. Peter Anvin, x86, LKML,
	Vlad Yasevich, Neil Horman, David S. Miller, linux-sctp, netdev,
	Marcelo Ricardo Leitner, Dmitry Vyukov, Kostya Serebryany,
	syzkaller, Eric Dumazet, Cong Wang
In-Reply-To: <20170503133006.7jv4qhnnbsv7ueam@treble>

On Wed, May 3, 2017 at 3:30 PM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> On Wed, May 03, 2017 at 02:48:28PM +0200, Andrey Konovalov wrote:
>> Hi,
>>
>> I've got the following error report while fuzzing the kernel with syzkaller.
>>
>> On commit 89c9fea3c8034cdb2fd745f551cde0b507fd6893 (4.11.0+).
>>
>> A reproducer and .config are attached.
>>
>> The reproducer open SCTP sockets and sends data to it in a loop.
>> I'm not sure whether this is an issue with SCTP or with something else.
>>
>> WARNING: kernel stack regs at ffff8800686869f8 in a.out:4933 has bad
>> 'bp' value c3fc855a10167ec0
>
> Hi Andrey,
>
> Can you test this patch?

Hi Josh,

This seems to be fixing the reports caused by the reproducers.

I'll keep fuzzing though with syzkaller to make sure.

Thanks!

>
>
> diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
> index 7e48807..45a53df 100644
> --- a/arch/x86/lib/csum-copy_64.S
> +++ b/arch/x86/lib/csum-copy_64.S
> @@ -55,7 +55,7 @@ ENTRY(csum_partial_copy_generic)
>         movq  %r12, 3*8(%rsp)
>         movq  %r14, 4*8(%rsp)
>         movq  %r13, 5*8(%rsp)
> -       movq  %rbp, 6*8(%rsp)
> +       movq  %r15, 6*8(%rsp)
>
>         movq  %r8, (%rsp)
>         movq  %r9, 1*8(%rsp)
> @@ -74,7 +74,7 @@ ENTRY(csum_partial_copy_generic)
>         /* main loop. clear in 64 byte blocks */
>         /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
>         /* r11: temp3, rdx: temp4, r12 loopcnt */
> -       /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */
> +       /* r10: temp5, r15: temp6, r14 temp7, r13 temp8 */
>         .p2align 4
>  .Lloop:
>         source
> @@ -89,7 +89,7 @@ ENTRY(csum_partial_copy_generic)
>         source
>         movq  32(%rdi), %r10
>         source
> -       movq  40(%rdi), %rbp
> +       movq  40(%rdi), %r15
>         source
>         movq  48(%rdi), %r14
>         source
> @@ -103,7 +103,7 @@ ENTRY(csum_partial_copy_generic)
>         adcq  %r11, %rax
>         adcq  %rdx, %rax
>         adcq  %r10, %rax
> -       adcq  %rbp, %rax
> +       adcq  %r15, %rax
>         adcq  %r14, %rax
>         adcq  %r13, %rax
>
> @@ -121,7 +121,7 @@ ENTRY(csum_partial_copy_generic)
>         dest
>         movq %r10, 32(%rsi)
>         dest
> -       movq %rbp, 40(%rsi)
> +       movq %r15, 40(%rsi)
>         dest
>         movq %r14, 48(%rsi)
>         dest
> @@ -203,7 +203,7 @@ ENTRY(csum_partial_copy_generic)
>         movq 3*8(%rsp), %r12
>         movq 4*8(%rsp), %r14
>         movq 5*8(%rsp), %r13
> -       movq 6*8(%rsp), %rbp
> +       movq 6*8(%rsp), %r15
>         addq $7*8, %rsp
>         ret
>

^ permalink raw reply

* Re: [PATCH 1/2] PCI: Add new PCIe Fabric End Node flag, PCI_DEV_FLAGS_NO_RELAXED_ORDERING
From: Alexander Duyck @ 2017-05-03 16:02 UTC (permalink / raw)
  To: Casey Leedom
  Cc: Raj, Ashok, Bjorn Helgaas, leedom@gmail.com, Michael Werner,
	Ganesh GR, Arjun V., Asit K Mallick, Patrick J Cramer,
	Suravee Suthikulpanit, Bob Shaw, h, Ding Tianhong, Mark Rutland,
	Amir Ancel, Gabriele Paoloni, Catalin Marinas, Will Deacon,
	LinuxArm, David Laight, Jeff Kirsher <jeffrey.t.kir
In-Reply-To: <MWHPR12MB160095DF53CE981C923FE65DC8160@MWHPR12MB1600.namprd12.prod.outlook.com>

On Tue, May 2, 2017 at 9:30 PM, Casey Leedom <leedom@chelsio.com> wrote:
> | From: Alexander Duyck <alexander.duyck@gmail.com>
> | Date: Tuesday, May 2, 2017 11:10 AM
> | ...
> | So for example, in the case of x86 it seems like there are multiple
> | root complexes that have issues, and the gains for enabling it with
> | standard DMA to host memory are small. As such we may want to default
> | it to off via the architecture specific PCIe code and then look at
> | having "white-list" cases where we enable it for things like
> | peer-to-peer accesses. In the case of SPARC we could look at
> | defaulting it to on, and only "black-list" any cases where there might
> | be issues since SPARC relies on this in a significant way for
> | performance. In the case of ARM and other architectures it is a bit of
> | a toss-up. I would say we could just default it to on for now and
> | "black-list" anything that doesn't work, or we could go the other way
> | like I suggested for x86. It all depends on what the ARM community
> | would want to agree on for this. I would say unless it makes a
> | significant difference like it does in the case of SPARC we are
> | probably better off just defaulting it to off.
>
>   Sorry, I forgot to respond to this earlier when someone was rushing me out
> to a customer dinner.
>
>   I'm unaware of any other x86 Root Complex Port that has a problem with
> Relaxed Ordering other than the performance issue with the current Intel
> implementation.  Ashok tells me that Intel is in the final stages of putting
> together a technical notice on this issue but I don't know when that will
> come out.  Hopefully that will shed much more light on the cause and actual
> use of Relaxed Ordering when directed to Coherent Memory on current and past
> Intel platforms.  (Note that the performance bug seems to limit us to
> ~75-85Gb/s DMA Write speed to Coherent Host Memory.)

So my concern isn't so much about existing issues as it is about where
is the advantage in enabling it. We have had support in the Intel
hardware for enabling relaxed ordering for about 10 years. In all that
time I have yet to see an x86 platform that sees any real benefit from
enabling it for standard DMA. That is why my preference would be to
leave it disabled by default on x86 and we white list it in at some
point when hardware shows that there is a benefit to be had for
enabling it.

>   The only other Device that I currently know of which has issues with
> Relaxed Ordering TLPs directed at it, is also a Root Complex Port on the new
> AMD A1100 ARM SoC ("SEATTLE").  There we have an actual bug which could lead
> to Data Corruption.
>
>   But I don't know anything about other x86 Root Complex Ports having issues
> with this -- we've been using it ever since commit ef306b50b from December
> 2010.

So the question I would have for you then, is what benefits have you
seen from enabling it on x86? In our case we haven't seen any for
transactions that go through the root complex. If you are seeing
benefits would I be correct in assuming it is for your peer-to-peer
case or were there some x86 platforms that showed gains?

>   Also, I'm not aware of any performance data which has been gathered on the
> use of Relaxed Ordering when directed at Host Memory.  From your note, it
> sounds like it's important on SPARC architectures.  But it could conceivably
> be important on any architecture or Root Complex/Memory Controller
> implementation.  We use it to direct Ingress Packet Data to Free List
> Buffers, followed by a TLP without Relaxed Ordering directed at a Host
> Memory Message Queue.  The idea here is to give the Root Complex options on
> which DMA Memory Write TLPs to process in order to optimize data placement
> in memory.  And with the next Ethernet speed bump to 200Gb/s this may be
> even more important.

The operative term here is "may be". That is my concern. We are
leaving this enabled by default and there are known hardware that have
issues that can be pretty serious. I'm not saying we have to disable
it and keep it disabled, but I would like to see us intelligently
enable this feature so that it is enabled on the platforms that show
benefit and disabled on the ones that don't.

My biggest concern with all this is introducing regressions as drivers
like igb and ixgbe are used on a wide range of platforms beyond even
what is covered by x86 and I would prefer not to suddenly have a
deluge of bugs to sort out triggered by us enabling relaxed ordering
on platforms that historically have not had it enabled on them.

>   Basically, I'd hate to come up with a solution where we write off the use
> of Relaxed Ordering for DMA Writes to Host Memory.  I don't think you're
> suggesting that, but there are a number of x86 Root Complex implementations
> out there -- and some like the new AMD Ryzen have yet to be tested -- as
> well as other architectures.

I'm not saying that we have to write off the use of Relaxed Ordering,
what I am saying is that we should probably be more judicious about
how we go about enabling it. If a platform and/or architecture has no
benefit to enabling it what is the point in adding the possible risk?

Hopefully the AMD Ryzen platform has already been tested and doesn't
need a quirk to disable relaxed ordering. Really it shouldn't fall on
the likes of us to be testing for those kind of things.

>   And, as Ashok and I have both nothed, any solution we come up with needs
> to cope with a heterogeneous situation where, on the same PCIe Fabric, it
> may be necessesary/desireable to support Relaxed Ordering TLPs directed at
> some End Nodes but not others.
>
> Casey

It sounds like we are more or less in agreement. My only concern is
really what we default this to. On x86 I would say we could probably
default this to disabled for existing platforms since my understanding
is that relaxed ordering doesn't provide much benefit on what is out
there right now when performing DMA through the root complex. As far
as peer-to-peer I would say we should probably look at enabling the
ability to have Relaxed Ordering enabled for some channels but not
others. In those cases the hardware needs to be smart enough to allow
for you to indicate you want it disabled by default for most of your
DMA channels, and then enabled for the select channels that are
handling the peer-to-peer traffic.

- Alex

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox