Netdev List
 help / color / mirror / Atom feed
* [PATCH v3 net-next 11/11] ibmvnic: Move queue restarting in ibmvnic_tx_complete
From: Nathan Fontenot @ 2017-05-02 18:53 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170502185006.28214.89072.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

Restart of the subqueue should occur outside of the loop processing
any tx buffers instead of doing this in the middle of the loop.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---

v2: Use __netif_subqueue_stopped() instead of netif_subqueue_stopped()
to avoid possible using un-initialized skb variable.
---
 drivers/net/ethernet/ibm/ibmvnic.c |   22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 4a2b99f..4c5de60 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1809,19 +1809,8 @@ static int ibmvnic_complete_tx(struct ibmvnic_adapter *adapter,
 			}
 
 			if (txbuff->last_frag) {
-				if (atomic_sub_return(next->tx_comp.num_comps,
-						      &scrq->used) <=
-				    (adapter->req_tx_entries_per_subcrq / 2) &&
-				    netif_subqueue_stopped(adapter->netdev,
-							   txbuff->skb)) {
-					netif_wake_subqueue(adapter->netdev,
-							    scrq->pool_index);
-					netdev_dbg(adapter->netdev,
-						   "Started queue %d\n",
-						   scrq->pool_index);
-				}
-
 				dev_kfree_skb_any(txbuff->skb);
+				txbuff->skb = NULL;
 			}
 
 			adapter->tx_pool[pool].free_map[adapter->tx_pool[pool].
@@ -1832,6 +1821,15 @@ static int ibmvnic_complete_tx(struct ibmvnic_adapter *adapter,
 		}
 		/* remove tx_comp scrq*/
 		next->tx_comp.first = 0;
+
+		if (atomic_sub_return(next->tx_comp.num_comps, &scrq->used) <=
+		    (adapter->req_tx_entries_per_subcrq / 2) &&
+		    __netif_subqueue_stopped(adapter->netdev,
+					     scrq->pool_index)) {
+			netif_wake_subqueue(adapter->netdev, scrq->pool_index);
+			netdev_info(adapter->netdev, "Started queue %d\n",
+				    scrq->pool_index);
+		}
 	}
 
 	enable_scrq_irq(adapter, scrq);

^ permalink raw reply related

* [PATCH v3 net-next 10/11] ibmvnic: Record SKB RX queue during poll
From: Nathan Fontenot @ 2017-05-02 18:53 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170502185006.28214.89072.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

From: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>

Map each RX SKB to the RX queue associated with the driver's RX SCRQ.
This should improve the RX CPU load balancing issues seen by the
performance team.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |    1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 4b2e128..4a2b99f 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1428,6 +1428,7 @@ static int ibmvnic_poll(struct napi_struct *napi, int budget)
 
 		skb_put(skb, length);
 		skb->protocol = eth_type_trans(skb, netdev);
+		skb_record_rx_queue(skb, scrq_num);
 
 		if (flags & IBMVNIC_IP_CHKSUM_GOOD &&
 		    flags & IBMVNIC_TCP_UDP_CHKSUM_GOOD) {

^ permalink raw reply related

* [PATCH v3 net-next 08/11] ibmvnic: Check for driver reset first in ibmvnic_xmit
From: Nathan Fontenot @ 2017-05-02 18:52 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170502185006.28214.89072.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

Move the check for the driver resetting to the first thing
in ibmvnic_xmit().

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |   12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 4da7080..2cf1b64 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -985,12 +985,6 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 	int index = 0;
 	int ret = 0;
 
-	tx_pool = &adapter->tx_pool[queue_num];
-	tx_scrq = adapter->tx_scrq[queue_num];
-	txq = netdev_get_tx_queue(netdev, skb_get_queue_mapping(skb));
-	handle_array = (u64 *)((u8 *)(adapter->login_rsp_buf) +
-				   be32_to_cpu(adapter->login_rsp_buf->
-					       off_txsubm_subcrqs));
 	if (adapter->resetting) {
 		if (!netif_subqueue_stopped(netdev, skb))
 			netif_stop_subqueue(netdev, queue_num);
@@ -1002,6 +996,12 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 		goto out;
 	}
 
+	tx_pool = &adapter->tx_pool[queue_num];
+	tx_scrq = adapter->tx_scrq[queue_num];
+	txq = netdev_get_tx_queue(netdev, skb_get_queue_mapping(skb));
+	handle_array = (u64 *)((u8 *)(adapter->login_rsp_buf) +
+		be32_to_cpu(adapter->login_rsp_buf->off_txsubm_subcrqs));
+
 	index = tx_pool->free_map[tx_pool->consumer_index];
 	offset = index * adapter->req_mtu;
 	dst = tx_pool->long_term_buff.buff + offset;

^ permalink raw reply related

* [PATCH v3 net-next 07/11] ibmvnic: Wait for any pending scrqs entries at driver close
From: Nathan Fontenot @ 2017-05-02 18:52 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170502185006.28214.89072.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

When closing the ibmvnic driver we need to wait for any pending
sub crq entries to ensure they are handled.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |   47 +++++++++++++++++++++---------------
 1 file changed, 27 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 0400ae7..4da7080 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -743,23 +743,6 @@ static int ibmvnic_open(struct net_device *netdev)
 	return rc;
 }
 
-static void disable_sub_crqs(struct ibmvnic_adapter *adapter)
-{
-	int i;
-
-	if (adapter->tx_scrq) {
-		for (i = 0; i < adapter->req_tx_queues; i++)
-			if (adapter->tx_scrq[i])
-				disable_irq(adapter->tx_scrq[i]->irq);
-	}
-
-	if (adapter->rx_scrq) {
-		for (i = 0; i < adapter->req_rx_queues; i++)
-			if (adapter->rx_scrq[i])
-				disable_irq(adapter->rx_scrq[i]->irq);
-	}
-}
-
 static void clean_tx_pools(struct ibmvnic_adapter *adapter)
 {
 	struct ibmvnic_tx_pool *tx_pool;
@@ -797,15 +780,39 @@ static int __ibmvnic_close(struct net_device *netdev)
 	adapter->state = VNIC_CLOSING;
 	netif_tx_stop_all_queues(netdev);
 
-	clean_tx_pools(adapter);
-	disable_sub_crqs(adapter);
-
 	if (adapter->napi) {
 		for (i = 0; i < adapter->req_rx_queues; i++)
 			napi_disable(&adapter->napi[i]);
 	}
 
+	clean_tx_pools(adapter);
+
+	if (adapter->tx_scrq) {
+		for (i = 0; i < adapter->req_tx_queues; i++)
+			if (adapter->tx_scrq[i]->irq)
+				disable_irq(adapter->tx_scrq[i]->irq);
+	}
+
 	rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_DN);
+	if (rc)
+		return rc;
+
+	if (adapter->rx_scrq) {
+		for (i = 0; i < adapter->req_rx_queues; i++) {
+			int retries = 10;
+
+			while (pending_scrq(adapter, adapter->rx_scrq[i])) {
+				retries--;
+				mdelay(100);
+
+				if (retries == 0)
+					break;
+			}
+	
+			if (adapter->rx_scrq[i]->irq)
+				disable_irq(adapter->rx_scrq[i]->irq);
+		}
+	}
 
 	adapter->state = VNIC_CLOSED;
 	return rc;

^ permalink raw reply related

* Re: TPACKET_V3 timeout bug?
From: chetan loke @ 2017-05-02 15:04 UTC (permalink / raw)
  To: Guy Harris; +Cc: Andrew Lunn, Sowmini Varadhan, netdev, tcpdump-workers
In-Reply-To: <251BA382-D82C-4931-A98F-A23AD88F9811@alum.mit.edu>

On Sat, Apr 15, 2017 at 7:41 PM, Guy Harris <guy@alum.mit.edu> wrote:
> On Apr 15, 2017, at 7:10 PM, Andrew Lunn <andrew@lunn.ch> wrote:
>
>> Do you think this is a kernel problem, libpcap problem, or an
>> application problem?
>

Its clearly a kernel regression.

If you look at if_packet.h, I have explicitly called out all the cases
for the return/status codes. When I first merged the functionality in
3.11(or 3.12 I think) I had the logic in place to retire the block
with or without packets in it. I think there was one case where we
wouldn't wake up userspace. Someone checked in a fix for that. Now I
am not sure the regression happened as part of that bug fix or
sometime later. If you diff 3.12 against the latest you will find the
regression. Look for prb_retire_rx_blk_timer_expired().


> An application problem.  See my response on netdev; the timeout (which is provided by the kernel's capture mechanism, in most cases) is to make sure packets don't stay stuck in the kernel's packet buffer for an indefinite period of time, it's *not* to make sure a thread capturing packets doesn't stay blocked for an indefinite period of time.  Whether the timer goes off even if no

I cannot speak on behalf of user-space wrappers developed around
tpacket_v3 but the intention(from the kernel POV) of the block_timer
*is* to unblock the capture/user process/thread so that it does NOT
stay blocked for an indefinite period of time. The header explicitly
specifies that contract.

I'm tied up for the next 3 weeks. However I can work on it after
that(unless someone else has cycles for it now).

Chetan

^ permalink raw reply

* [PATCH v3 net-next 06/11] ibmvnic: Clean up tx pools when closing
From: Nathan Fontenot @ 2017-05-02 18:52 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170502185006.28214.89072.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

When closing the ibmvnic driver, most notably during the reset
path, the tx pools need to be cleaned to ensure there are no
hanging skbs that need to be free'ed.

The need for this was found during debugging a loss of network
traffic after handling a driver reset. The underlying cause was
some skbs in the tx pool that were never free'ed. As a
result the upper network layers never tried a re-send since it
believed the driver still had the skb.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |   30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index d20d884..0400ae7 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -760,6 +760,34 @@ static void disable_sub_crqs(struct ibmvnic_adapter *adapter)
 	}
 }
 
+static void clean_tx_pools(struct ibmvnic_adapter *adapter)
+{
+	struct ibmvnic_tx_pool *tx_pool;
+	u64 tx_entries;
+	int tx_scrqs;
+	int i, j;
+
+	if (!adapter->tx_pool)
+		return;
+
+	tx_scrqs = be32_to_cpu(adapter->login_rsp_buf->num_txsubm_subcrqs);
+	tx_entries = adapter->req_tx_entries_per_subcrq;
+
+	/* Free any remaining skbs in the tx buffer pools */
+	for (i = 0; i < tx_scrqs; i++) {
+		tx_pool = &adapter->tx_pool[i];
+		if (!tx_pool)
+			continue;
+
+		for (j = 0; j < tx_entries; j++) {
+			if (tx_pool->tx_buff[j].skb) {
+				dev_kfree_skb_any(tx_pool->tx_buff[j].skb);
+				tx_pool->tx_buff[j].skb = NULL;
+			}
+		}
+	}
+}
+
 static int __ibmvnic_close(struct net_device *netdev)
 {
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
@@ -768,6 +796,8 @@ static int __ibmvnic_close(struct net_device *netdev)
 
 	adapter->state = VNIC_CLOSING;
 	netif_tx_stop_all_queues(netdev);
+
+	clean_tx_pools(adapter);
 	disable_sub_crqs(adapter);
 
 	if (adapter->napi) {

^ permalink raw reply related

* [PATCH v3 net-next 05/11] ibmvnic: Whitespace correction in release_rx_pools
From: Nathan Fontenot @ 2017-05-02 18:52 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170502185006.28214.89072.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 9f2686d..d20d884 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -348,7 +348,7 @@ static void release_rx_pools(struct ibmvnic_adapter *adapter)
 		free_long_term_buff(adapter, &rx_pool->long_term_buff);
 
 		if (!rx_pool->rx_buff)
-		continue;
+			continue;
 
 		for (j = 0; j < rx_pool->size; j++) {
 			if (rx_pool->rx_buff[j].skb) {

^ permalink raw reply related

* [PATCH v3 net-next 04/11] ibmvnic: Delete napi's when releasing driver resources
From: Nathan Fontenot @ 2017-05-02 18:52 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170502185006.28214.89072.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

The napi structs allocated at drivier initializatio need to be
free'ed when releasing the drivers resources.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |    9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index b3d9b28..9f2686d 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -552,11 +552,20 @@ static int ibmvnic_login(struct net_device *netdev)
 
 static void release_resources(struct ibmvnic_adapter *adapter)
 {
+	int i;
+
 	release_tx_pools(adapter);
 	release_rx_pools(adapter);
 
 	release_stats_token(adapter);
 	release_error_buffers(adapter);
+
+	if (adapter->napi) {
+		for (i = 0; i < adapter->req_rx_queues; i++) {
+			if (&adapter->napi[i])
+				netif_napi_del(&adapter->napi[i]);
+		}
+	}
 }
 
 static int set_link_state(struct ibmvnic_adapter *adapter, u8 link_state)

^ permalink raw reply related

* [PATCH v3 net-next 03/11] ibmvnic: Updated reset handling
From: Nathan Fontenot @ 2017-05-02 18:52 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170502185006.28214.89072.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

The ibmvnic driver has multiple handlers for resetting the driver
depending on the reason the reset is needed (failover, lpm,
fatal erors,...). All of the reset handlers do essentially the same
thing, this patch moves this work to a common reset handler.

By doing this we also allow the driver to better handle situations
where we can get a reset while handling a reset.

The updated reset handling works by adding a reset work item to the
list of resets and then scheduling work to perform the reset. This
step is necessary because we can receive a reset in interrupt context
and we want to handle the reset out of interrupt context.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |  413 +++++++++++++++++++++++-------------
 drivers/net/ethernet/ibm/ibmvnic.h |   19 +-
 2 files changed, 275 insertions(+), 157 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 40a8ba0..b3d9b28 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -194,7 +194,8 @@ static void free_long_term_buff(struct ibmvnic_adapter *adapter,
 	if (!ltb->buff)
 		return;
 
-	if (!adapter->failover)
+	if (adapter->reset_reason != VNIC_RESET_FAILOVER &&
+	    adapter->reset_reason != VNIC_RESET_MOBILITY)
 		send_request_unmap(adapter, ltb->map_id);
 	dma_free_coherent(dev, ltb->size, ltb->buff, ltb->addr);
 }
@@ -292,9 +293,6 @@ static void replenish_pools(struct ibmvnic_adapter *adapter)
 {
 	int i;
 
-	if (adapter->migrated)
-		return;
-
 	adapter->replenish_task_cycles++;
 	for (i = 0; i < be32_to_cpu(adapter->login_rsp_buf->num_rxadd_subcrqs);
 	     i++) {
@@ -569,11 +567,6 @@ static int set_link_state(struct ibmvnic_adapter *adapter, u8 link_state)
 	bool resend;
 	int rc;
 
-	if (adapter->logical_link_state == link_state) {
-		netdev_dbg(netdev, "Link state already %d\n", link_state);
-		return 0;
-	}
-
 	netdev_err(netdev, "setting link state %d\n", link_state);
 	memset(&crq, 0, sizeof(crq));
 	crq.logical_link_state.first = IBMVNIC_CRQ_CMD;
@@ -664,27 +657,13 @@ static int init_resources(struct ibmvnic_adapter *adapter)
 	return rc;
 }
 
-static int ibmvnic_open(struct net_device *netdev)
+static int __ibmvnic_open(struct net_device *netdev)
 {
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
+	enum vnic_state prev_state = adapter->state;
 	int i, rc;
 
 	adapter->state = VNIC_OPENING;
-
-	if (adapter->state == VNIC_CLOSED) {
-		rc = ibmvnic_init(adapter);
-		if (rc)
-			return rc;
-	}
-
-	rc = ibmvnic_login(netdev);
-	if (rc)
-		return rc;
-
-	rc = init_resources(adapter);
-	if (rc)
-		return rc;
-
 	replenish_pools(adapter);
 
 	for (i = 0; i < adapter->req_rx_queues; i++)
@@ -693,22 +672,65 @@ static int ibmvnic_open(struct net_device *netdev)
 	/* We're ready to receive frames, enable the sub-crq interrupts and
 	 * set the logical link state to up
 	 */
-	for (i = 0; i < adapter->req_rx_queues; i++)
-		enable_scrq_irq(adapter, adapter->rx_scrq[i]);
+	for (i = 0; i < adapter->req_rx_queues; i++) {
+		if (prev_state == VNIC_CLOSED)
+			enable_irq(adapter->rx_scrq[i]->irq);
+		else
+			enable_scrq_irq(adapter, adapter->rx_scrq[i]);
+	}
 
-	for (i = 0; i < adapter->req_tx_queues; i++)
-		enable_scrq_irq(adapter, adapter->tx_scrq[i]);
+	for (i = 0; i < adapter->req_tx_queues; i++) {
+		if (prev_state == VNIC_CLOSED)
+			enable_irq(adapter->tx_scrq[i]->irq);
+		else
+			enable_scrq_irq(adapter, adapter->tx_scrq[i]);
+	}
 
 	rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_UP);
 	if (rc) {
 		for (i = 0; i < adapter->req_rx_queues; i++)
 			napi_disable(&adapter->napi[i]);
 		release_resources(adapter);
-	} else {
-		netif_tx_start_all_queues(netdev);
-		adapter->state = VNIC_OPEN;
+		return rc;
+	}
+	
+	netif_tx_start_all_queues(netdev);
+
+	if (prev_state == VNIC_CLOSED) {
+		for (i = 0; i < adapter->req_rx_queues; i++)
+			napi_schedule(&adapter->napi[i]);
 	}
 
+	adapter->state = VNIC_OPEN;
+	return rc;
+}
+
+static int ibmvnic_open(struct net_device *netdev)
+{
+	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
+	int rc;
+
+	mutex_lock(&adapter->reset_lock);
+
+	if (adapter->state != VNIC_CLOSED) {
+		rc = ibmvnic_login(netdev);
+		if (rc) {
+			mutex_unlock(&adapter->reset_lock);
+			return rc;
+		}
+
+		rc = init_resources(adapter);
+		if (rc) {
+			netdev_err(netdev, "failed to initialize resources\n");
+			release_resources(adapter);
+			mutex_unlock(&adapter->reset_lock);
+			return rc;
+		}
+	}
+
+	rc = __ibmvnic_open(netdev);
+	mutex_unlock(&adapter->reset_lock);
+
 	return rc;
 }
 
@@ -729,13 +751,14 @@ static void disable_sub_crqs(struct ibmvnic_adapter *adapter)
 	}
 }
 
-static int ibmvnic_close(struct net_device *netdev)
+static int __ibmvnic_close(struct net_device *netdev)
 {
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
 	int rc = 0;
 	int i;
 
 	adapter->state = VNIC_CLOSING;
+	netif_tx_stop_all_queues(netdev);
 	disable_sub_crqs(adapter);
 
 	if (adapter->napi) {
@@ -743,17 +766,24 @@ static int ibmvnic_close(struct net_device *netdev)
 			napi_disable(&adapter->napi[i]);
 	}
 
-	if (!adapter->failover)
-		netif_tx_stop_all_queues(netdev);
-
 	rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_DN);
 
-	release_resources(adapter);
-
 	adapter->state = VNIC_CLOSED;
 	return rc;
 }
 
+static int ibmvnic_close(struct net_device *netdev)
+{
+	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
+	int rc;
+
+	mutex_lock(&adapter->reset_lock);
+	rc = __ibmvnic_close(netdev);
+	mutex_unlock(&adapter->reset_lock);
+
+	return rc;
+}
+
 /**
  * build_hdr_data - creates L2/L3/L4 header data buffer
  * @hdr_field - bitfield determining needed headers
@@ -915,7 +945,7 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 	handle_array = (u64 *)((u8 *)(adapter->login_rsp_buf) +
 				   be32_to_cpu(adapter->login_rsp_buf->
 					       off_txsubm_subcrqs));
-	if (adapter->migrated) {
+	if (adapter->resetting) {
 		if (!netif_subqueue_stopped(netdev, skb))
 			netif_stop_subqueue(netdev, queue_num);
 		dev_kfree_skb_any(skb);
@@ -1107,18 +1137,185 @@ static int ibmvnic_set_mac(struct net_device *netdev, void *p)
 	return 0;
 }
 
-static void ibmvnic_tx_timeout(struct net_device *dev)
+/**
+ * do_reset returns zero if we are able to keep processing reset events, or
+ * non-zero if we hit a fatal error and must halt.
+ */
+static int do_reset(struct ibmvnic_adapter *adapter,
+		    struct ibmvnic_rwi *rwi, u32 reset_state)
 {
-	struct ibmvnic_adapter *adapter = netdev_priv(dev);
-	int rc;
+	struct net_device *netdev = adapter->netdev;
+	int i, rc;
+
+	netif_carrier_off(netdev);
+	adapter->reset_reason = rwi->reset_reason;
+
+	if (rwi->reset_reason == VNIC_RESET_MOBILITY) {
+		rc = ibmvnic_reenable_crq_queue(adapter);
+		if (rc)
+			return 0;
+	}
+
+	rc = __ibmvnic_close(netdev);
+	if (rc)
+		return rc;
+
+	/* remove the closed state so when we call open it appears
+	 * we are coming from the probed state.
+	 */
+	adapter->state = VNIC_PROBED;
 
-	/* Adapter timed out, resetting it */
+	release_resources(adapter);
 	release_sub_crqs(adapter);
-	rc = ibmvnic_reset_crq(adapter);
+	release_crq_queue(adapter);
+
+	rc = ibmvnic_init(adapter);
 	if (rc)
-		dev_err(&adapter->vdev->dev, "Adapter timeout, reset failed\n");
-	else
-		ibmvnic_send_crq_init(adapter);
+		return 0;
+
+	/* If the adapter was in PROBE state prior to the reset, exit here. */
+	if (reset_state == VNIC_PROBED)
+		return 0;
+
+	rc = ibmvnic_login(netdev);
+	if (rc) {
+		adapter->state = VNIC_PROBED;
+		return 0;
+	}
+
+	rtnl_lock();
+	rc = init_resources(adapter);
+	rtnl_unlock();
+	if (rc)
+		return rc;
+
+	if (reset_state == VNIC_CLOSED)
+		return 0;
+
+	rc = __ibmvnic_open(netdev);
+	if (rc) {
+		if (list_empty(&adapter->rwi_list))
+			adapter->state = VNIC_CLOSED;
+		else
+			adapter->state = reset_state;
+
+		return 0;
+	}
+
+	netif_carrier_on(netdev);
+
+	/* kick napi */
+	for (i = 0; i < adapter->req_rx_queues; i++)
+		napi_schedule(&adapter->napi[i]);
+
+	return 0;
+}
+
+static struct ibmvnic_rwi *get_next_rwi(struct ibmvnic_adapter *adapter)
+{
+	struct ibmvnic_rwi *rwi;
+
+	mutex_lock(&adapter->rwi_lock);
+
+	if (!list_empty(&adapter->rwi_list)) {
+		rwi = list_first_entry(&adapter->rwi_list, struct ibmvnic_rwi,
+				       list);
+		list_del(&rwi->list);
+	} else {
+		rwi = NULL;
+	}
+
+	mutex_unlock(&adapter->rwi_lock);
+	return rwi;
+}
+
+static void free_all_rwi(struct ibmvnic_adapter *adapter)
+{
+	struct ibmvnic_rwi *rwi;
+
+	rwi = get_next_rwi(adapter);
+	while (rwi) {
+		kfree(rwi);
+		rwi = get_next_rwi(adapter);
+	}
+}
+
+static void __ibmvnic_reset(struct work_struct *work)
+{
+	struct ibmvnic_rwi *rwi;
+	struct ibmvnic_adapter *adapter;
+	struct net_device *netdev;
+	u32 reset_state;
+	int rc;
+
+	adapter = container_of(work, struct ibmvnic_adapter, ibmvnic_reset);
+	netdev = adapter->netdev;
+
+	mutex_lock(&adapter->reset_lock);
+	adapter->resetting = true;
+	reset_state = adapter->state;
+
+	rwi = get_next_rwi(adapter);
+	while (rwi) {
+		rc = do_reset(adapter, rwi, reset_state);
+		kfree(rwi);
+		if (rc)
+			break;
+
+		rwi = get_next_rwi(adapter);
+	}
+
+	if (rc) {
+		free_all_rwi(adapter);
+		return;
+	}
+
+	adapter->resetting = false;
+	mutex_unlock(&adapter->reset_lock);
+}
+
+static void ibmvnic_reset(struct ibmvnic_adapter *adapter,
+			  enum ibmvnic_reset_reason reason)
+{
+	struct ibmvnic_rwi *rwi, *tmp;
+	struct net_device *netdev = adapter->netdev;
+	struct list_head *entry;
+
+	if (adapter->state == VNIC_REMOVING ||
+	    adapter->state == VNIC_REMOVED) {
+		netdev_dbg(netdev, "Adapter removing, skipping reset\n");
+		return;
+	}
+
+	mutex_lock(&adapter->rwi_lock);
+
+	list_for_each(entry, &adapter->rwi_list) {
+		tmp = list_entry(entry, struct ibmvnic_rwi, list);
+		if (tmp->reset_reason == reason) {
+			netdev_err(netdev, "Matching reset found, skipping\n");
+			mutex_unlock(&adapter->rwi_lock);
+			return;
+		}
+	}
+
+	rwi = kzalloc(sizeof(*rwi), GFP_KERNEL);
+	if (!rwi) {
+		mutex_unlock(&adapter->rwi_lock);
+		ibmvnic_close(netdev);
+		return;
+	}
+
+	rwi->reset_reason = reason;
+	list_add_tail(&rwi->list, &adapter->rwi_list);
+	mutex_unlock(&adapter->rwi_lock);
+	schedule_work(&adapter->ibmvnic_reset);
+}
+
+static void ibmvnic_tx_timeout(struct net_device *dev)
+{
+	struct ibmvnic_adapter *adapter = netdev_priv(dev);
+
+	ibmvnic_reset(adapter, VNIC_RESET_TIMEOUT);
 }
 
 static void remove_buff_from_pool(struct ibmvnic_adapter *adapter,
@@ -2000,18 +2197,6 @@ static int ibmvnic_send_crq_init(struct ibmvnic_adapter *adapter)
 	return ibmvnic_send_crq(adapter, &crq);
 }
 
-static int ibmvnic_send_crq_init_complete(struct ibmvnic_adapter *adapter)
-{
-	union ibmvnic_crq crq;
-
-	memset(&crq, 0, sizeof(crq));
-	crq.generic.first = IBMVNIC_CRQ_INIT_CMD;
-	crq.generic.cmd = IBMVNIC_CRQ_INIT_COMPLETE;
-	netdev_dbg(adapter->netdev, "Sending CRQ init complete\n");
-
-	return ibmvnic_send_crq(adapter, &crq);
-}
-
 static int send_version_xchg(struct ibmvnic_adapter *adapter)
 {
 	union ibmvnic_crq crq;
@@ -2509,6 +2694,9 @@ static void handle_error_indication(union ibmvnic_crq *crq,
 
 	if (be32_to_cpu(crq->error_indication.error_id))
 		request_error_information(adapter, crq);
+
+	if (crq->error_indication.flags & IBMVNIC_FATAL_ERROR)
+		ibmvnic_reset(adapter, VNIC_RESET_FATAL);
 }
 
 static void handle_change_mac_rsp(union ibmvnic_crq *crq,
@@ -2897,26 +3085,6 @@ static void handle_query_cap_rsp(union ibmvnic_crq *crq,
 	}
 }
 
-static void ibmvnic_xport_event(struct work_struct *work)
-{
-	struct ibmvnic_adapter *adapter = container_of(work,
-						       struct ibmvnic_adapter,
-						       ibmvnic_xport);
-	struct device *dev = &adapter->vdev->dev;
-	long rc;
-
-	release_sub_crqs(adapter);
-	if (adapter->migrated) {
-		rc = ibmvnic_reenable_crq_queue(adapter);
-		if (rc)
-			dev_err(dev, "Error after enable rc=%ld\n", rc);
-		adapter->migrated = false;
-		rc = ibmvnic_send_crq_init(adapter);
-		if (rc)
-			dev_err(dev, "Error sending init rc=%ld\n", rc);
-	}
-}
-
 static void ibmvnic_handle_crq(union ibmvnic_crq *crq,
 			       struct ibmvnic_adapter *adapter)
 {
@@ -2934,12 +3102,6 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq,
 		switch (gen_crq->cmd) {
 		case IBMVNIC_CRQ_INIT:
 			dev_info(dev, "Partner initialized\n");
-			/* Send back a response */
-			rc = ibmvnic_send_crq_init_complete(adapter);
-			if (!rc)
-				schedule_work(&adapter->vnic_crq_init);
-			else
-				dev_err(dev, "Can't send initrsp rc=%ld\n", rc);
 			break;
 		case IBMVNIC_CRQ_INIT_COMPLETE:
 			dev_info(dev, "Partner initialization complete\n");
@@ -2950,19 +3112,18 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq,
 		}
 		return;
 	case IBMVNIC_CRQ_XPORT_EVENT:
+		netif_carrier_off(netdev);
 		if (gen_crq->cmd == IBMVNIC_PARTITION_MIGRATED) {
-			dev_info(dev, "Re-enabling adapter\n");
-			adapter->migrated = true;
-			schedule_work(&adapter->ibmvnic_xport);
+			dev_info(dev, "Migrated, re-enabling adapter\n");
+			ibmvnic_reset(adapter, VNIC_RESET_MOBILITY);
 		} else if (gen_crq->cmd == IBMVNIC_DEVICE_FAILOVER) {
 			dev_info(dev, "Backing device failover detected\n");
-			netif_carrier_off(netdev);
-			adapter->failover = true;
+			ibmvnic_reset(adapter, VNIC_RESET_FAILOVER);
 		} else {
 			/* The adapter lost the connection */
 			dev_err(dev, "Virtual Adapter failed (rc=%d)\n",
 				gen_crq->cmd);
-			schedule_work(&adapter->ibmvnic_xport);
+			ibmvnic_reset(adapter, VNIC_RESET_FATAL);
 		}
 		return;
 	case IBMVNIC_CRQ_CMD_RSP:
@@ -3243,64 +3404,6 @@ static int init_crq_queue(struct ibmvnic_adapter *adapter)
 	return retrc;
 }
 
-static void handle_crq_init_rsp(struct work_struct *work)
-{
-	struct ibmvnic_adapter *adapter = container_of(work,
-						       struct ibmvnic_adapter,
-						       vnic_crq_init);
-	struct device *dev = &adapter->vdev->dev;
-	struct net_device *netdev = adapter->netdev;
-	unsigned long timeout = msecs_to_jiffies(30000);
-	bool restart = false;
-	int rc;
-
-	if (adapter->failover) {
-		release_sub_crqs(adapter);
-		if (netif_running(netdev)) {
-			netif_tx_disable(netdev);
-			ibmvnic_close(netdev);
-			restart = true;
-		}
-	}
-
-	reinit_completion(&adapter->init_done);
-	send_version_xchg(adapter);
-	if (!wait_for_completion_timeout(&adapter->init_done, timeout)) {
-		dev_err(dev, "Passive init timeout\n");
-		goto task_failed;
-	}
-
-	netdev->mtu = adapter->req_mtu - ETH_HLEN;
-
-	if (adapter->failover) {
-		adapter->failover = false;
-		if (restart) {
-			rc = ibmvnic_open(netdev);
-			if (rc)
-				goto restart_failed;
-		}
-		netif_carrier_on(netdev);
-		return;
-	}
-
-	rc = register_netdev(netdev);
-	if (rc) {
-		dev_err(dev,
-			"failed to register netdev rc=%d\n", rc);
-		goto register_failed;
-	}
-	dev_info(dev, "ibmvnic registered\n");
-
-	return;
-
-restart_failed:
-	dev_err(dev, "Failed to restart ibmvnic, rc=%d\n", rc);
-register_failed:
-	release_sub_crqs(adapter);
-task_failed:
-	dev_err(dev, "Passive initialization was not successful\n");
-}
-
 static int ibmvnic_init(struct ibmvnic_adapter *adapter)
 {
 	struct device *dev = &adapter->vdev->dev;
@@ -3359,7 +3462,6 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
 	dev_set_drvdata(&dev->dev, netdev);
 	adapter->vdev = dev;
 	adapter->netdev = netdev;
-	adapter->failover = false;
 
 	ether_addr_copy(adapter->mac_addr, mac_addr_p);
 	ether_addr_copy(netdev->dev_addr, adapter->mac_addr);
@@ -3368,14 +3470,17 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
 	netdev->ethtool_ops = &ibmvnic_ethtool_ops;
 	SET_NETDEV_DEV(netdev, &dev->dev);
 
-	INIT_WORK(&adapter->vnic_crq_init, handle_crq_init_rsp);
-	INIT_WORK(&adapter->ibmvnic_xport, ibmvnic_xport_event);
-
 	spin_lock_init(&adapter->stats_lock);
 
 	INIT_LIST_HEAD(&adapter->errors);
 	spin_lock_init(&adapter->error_list_lock);
 
+	INIT_WORK(&adapter->ibmvnic_reset, __ibmvnic_reset);
+	INIT_LIST_HEAD(&adapter->rwi_list);
+	mutex_init(&adapter->reset_lock);
+	mutex_init(&adapter->rwi_lock);
+	adapter->resetting = false;
+
 	rc = ibmvnic_init(adapter);
 	if (rc) {
 		free_netdev(netdev);
@@ -3403,6 +3508,7 @@ static int ibmvnic_remove(struct vio_dev *dev)
 
 	adapter->state = VNIC_REMOVING;
 	unregister_netdev(netdev);
+	mutex_lock(&adapter->reset_lock);
 
 	release_resources(adapter);
 	release_sub_crqs(adapter);
@@ -3410,6 +3516,7 @@ static int ibmvnic_remove(struct vio_dev *dev)
 
 	adapter->state = VNIC_REMOVED;
 
+	mutex_unlock(&adapter->reset_lock);
 	free_netdev(netdev);
 	dev_set_drvdata(&dev->dev, NULL);
 
diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h
index 03a866f..4702b48 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.h
+++ b/drivers/net/ethernet/ibm/ibmvnic.h
@@ -922,6 +922,16 @@ enum vnic_state {VNIC_PROBING = 1,
 		 VNIC_REMOVING,
 		 VNIC_REMOVED};
 
+enum ibmvnic_reset_reason {VNIC_RESET_FAILOVER = 1,
+			   VNIC_RESET_MOBILITY,
+			   VNIC_RESET_FATAL,
+			   VNIC_RESET_TIMEOUT};
+
+struct ibmvnic_rwi {
+	enum ibmvnic_reset_reason reset_reason;
+	struct list_head list;
+};
+
 struct ibmvnic_adapter {
 	struct vio_dev *vdev;
 	struct net_device *netdev;
@@ -931,7 +941,6 @@ struct ibmvnic_adapter {
 	dma_addr_t ip_offload_tok;
 	struct ibmvnic_control_ip_offload_buffer ip_offload_ctrl;
 	dma_addr_t ip_offload_ctrl_tok;
-	bool migrated;
 	u32 msg_enable;
 
 	/* Statistics */
@@ -1015,9 +1024,11 @@ struct ibmvnic_adapter {
 	__be64 tx_rx_desc_req;
 	u8 map_id;
 
-	struct work_struct vnic_crq_init;
-	struct work_struct ibmvnic_xport;
 	struct tasklet_struct tasklet;
-	bool failover;
 	enum vnic_state state;
+	enum ibmvnic_reset_reason reset_reason;
+	struct mutex reset_lock, rwi_lock;
+	struct list_head rwi_list;
+	struct work_struct ibmvnic_reset;
+	bool resetting;
 };

^ permalink raw reply related

* [PATCH v3 net-next 02/11] ibmvnic: Replace is_closed with state field
From: Nathan Fontenot @ 2017-05-02 18:52 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170502185006.28214.89072.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

Replace the is_closed flag in the ibmvnic adapter strcut with a
more comprehensive state field that tracks the current state of
the driver.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |   20 +++++++++++++-------
 drivers/net/ethernet/ibm/ibmvnic.h |   12 ++++++++++--
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index c67f1d6..40a8ba0 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -669,7 +669,9 @@ static int ibmvnic_open(struct net_device *netdev)
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
 	int i, rc;
 
-	if (adapter->is_closed) {
+	adapter->state = VNIC_OPENING;
+
+	if (adapter->state == VNIC_CLOSED) {
 		rc = ibmvnic_init(adapter);
 		if (rc)
 			return rc;
@@ -704,7 +706,7 @@ static int ibmvnic_open(struct net_device *netdev)
 		release_resources(adapter);
 	} else {
 		netif_tx_start_all_queues(netdev);
-		adapter->is_closed = false;
+		adapter->state = VNIC_OPEN;
 	}
 
 	return rc;
@@ -733,7 +735,7 @@ static int ibmvnic_close(struct net_device *netdev)
 	int rc = 0;
 	int i;
 
-	adapter->closing = true;
+	adapter->state = VNIC_CLOSING;
 	disable_sub_crqs(adapter);
 
 	if (adapter->napi) {
@@ -748,8 +750,7 @@ static int ibmvnic_close(struct net_device *netdev)
 
 	release_resources(adapter);
 
-	adapter->is_closed = true;
-	adapter->closing = false;
+	adapter->state = VNIC_CLOSED;
 	return rc;
 }
 
@@ -1860,7 +1861,8 @@ static int pending_scrq(struct ibmvnic_adapter *adapter,
 {
 	union sub_crq *entry = &scrq->msgs[scrq->cur];
 
-	if (entry->generic.first & IBMVNIC_CRQ_CMD_RSP || adapter->closing)
+	if (entry->generic.first & IBMVNIC_CRQ_CMD_RSP ||
+	    adapter->state == VNIC_CLOSING)
 		return 1;
 	else
 		return 0;
@@ -3353,6 +3355,7 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
 		return -ENOMEM;
 
 	adapter = netdev_priv(netdev);
+	adapter->state = VNIC_PROBING;
 	dev_set_drvdata(&dev->dev, netdev);
 	adapter->vdev = dev;
 	adapter->netdev = netdev;
@@ -3380,7 +3383,6 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
 	}
 
 	netdev->mtu = adapter->req_mtu - ETH_HLEN;
-	adapter->is_closed = false;
 
 	rc = register_netdev(netdev);
 	if (rc) {
@@ -3390,6 +3392,7 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
 	}
 	dev_info(&dev->dev, "ibmvnic registered\n");
 
+	adapter->state = VNIC_PROBED;
 	return 0;
 }
 
@@ -3398,12 +3401,15 @@ static int ibmvnic_remove(struct vio_dev *dev)
 	struct net_device *netdev = dev_get_drvdata(&dev->dev);
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
 
+	adapter->state = VNIC_REMOVING;
 	unregister_netdev(netdev);
 
 	release_resources(adapter);
 	release_sub_crqs(adapter);
 	release_crq_queue(adapter);
 
+	adapter->state = VNIC_REMOVED;
+
 	free_netdev(netdev);
 	dev_set_drvdata(&dev->dev, NULL);
 
diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h
index a69979f..03a866f 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.h
+++ b/drivers/net/ethernet/ibm/ibmvnic.h
@@ -913,6 +913,15 @@ struct ibmvnic_error_buff {
 	__be32 error_id;
 };
 
+enum vnic_state {VNIC_PROBING = 1,
+		 VNIC_PROBED,
+		 VNIC_OPENING,
+		 VNIC_OPEN,
+		 VNIC_CLOSING,
+		 VNIC_CLOSED,
+		 VNIC_REMOVING,
+		 VNIC_REMOVED};
+
 struct ibmvnic_adapter {
 	struct vio_dev *vdev;
 	struct net_device *netdev;
@@ -962,7 +971,6 @@ struct ibmvnic_adapter {
 	u64 promisc;
 
 	struct ibmvnic_tx_pool *tx_pool;
-	bool closing;
 	struct completion init_done;
 	int init_done_rc;
 
@@ -1011,5 +1019,5 @@ struct ibmvnic_adapter {
 	struct work_struct ibmvnic_xport;
 	struct tasklet_struct tasklet;
 	bool failover;
-	bool is_closed;
+	enum vnic_state state;
 };

^ permalink raw reply related

* [PATCH v3 net-next 01/11] ibmvnic: Move resource initialization to its own routine
From: Nathan Fontenot @ 2017-05-02 18:52 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170502185006.28214.89072.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

Move all of the calls to initialize resources for the driver to
a separate routine.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |   71 ++++++++++++++++++++----------------
 1 file changed, 39 insertions(+), 32 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 4fcd2f0..c67f1d6 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -624,22 +624,10 @@ static int set_real_num_queues(struct net_device *netdev)
 	return rc;
 }
 
-static int ibmvnic_open(struct net_device *netdev)
+static int init_resources(struct ibmvnic_adapter *adapter)
 {
-	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
-	struct device *dev = &adapter->vdev->dev;
-	int rc = 0;
-	int i;
-
-	if (adapter->is_closed) {
-		rc = ibmvnic_init(adapter);
-		if (rc)
-			return rc;
-	}
-
-	rc = ibmvnic_login(netdev);
-	if (rc)
-		return rc;
+	struct net_device *netdev = adapter->netdev;
+	int i, rc;
 
 	rc = set_real_num_queues(netdev);
 	if (rc)
@@ -647,7 +635,7 @@ static int ibmvnic_open(struct net_device *netdev)
 
 	rc = init_sub_crq_irqs(adapter);
 	if (rc) {
-		dev_err(dev, "failed to initialize sub crq irqs\n");
+		netdev_err(netdev, "failed to initialize sub crq irqs\n");
 		return -1;
 	}
 
@@ -659,25 +647,47 @@ static int ibmvnic_open(struct net_device *netdev)
 	adapter->napi = kcalloc(adapter->req_rx_queues,
 				sizeof(struct napi_struct), GFP_KERNEL);
 	if (!adapter->napi)
-		goto ibmvnic_open_fail;
+		return -ENOMEM;
+
 	for (i = 0; i < adapter->req_rx_queues; i++) {
 		netif_napi_add(netdev, &adapter->napi[i], ibmvnic_poll,
 			       NAPI_POLL_WEIGHT);
-		napi_enable(&adapter->napi[i]);
 	}
 
 	send_map_query(adapter);
 
 	rc = init_rx_pools(netdev);
 	if (rc)
-		goto ibmvnic_open_fail;
+		return rc;
 
 	rc = init_tx_pools(netdev);
+	return rc;
+}
+
+static int ibmvnic_open(struct net_device *netdev)
+{
+	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
+	int i, rc;
+
+	if (adapter->is_closed) {
+		rc = ibmvnic_init(adapter);
+		if (rc)
+			return rc;
+	}
+
+	rc = ibmvnic_login(netdev);
 	if (rc)
-		goto ibmvnic_open_fail;
+		return rc;
+
+	rc = init_resources(adapter);
+	if (rc)
+		return rc;
 
 	replenish_pools(adapter);
 
+	for (i = 0; i < adapter->req_rx_queues; i++)
+		napi_enable(&adapter->napi[i]);
+
 	/* We're ready to receive frames, enable the sub-crq interrupts and
 	 * set the logical link state to up
 	 */
@@ -688,19 +698,16 @@ static int ibmvnic_open(struct net_device *netdev)
 		enable_scrq_irq(adapter, adapter->tx_scrq[i]);
 
 	rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_UP);
-	if (rc)
-		goto ibmvnic_open_fail;
-
-	netif_tx_start_all_queues(netdev);
-	adapter->is_closed = false;
-
-	return 0;
+	if (rc) {
+		for (i = 0; i < adapter->req_rx_queues; i++)
+			napi_disable(&adapter->napi[i]);
+		release_resources(adapter);
+	} else {
+		netif_tx_start_all_queues(netdev);
+		adapter->is_closed = false;
+	}
 
-ibmvnic_open_fail:
-	for (i = 0; i < adapter->req_rx_queues; i++)
-		napi_disable(&adapter->napi[i]);
-	release_resources(adapter);
-	return -ENOMEM;
+	return rc;
 }
 
 static void disable_sub_crqs(struct ibmvnic_adapter *adapter)

^ permalink raw reply related

* [PATCH v3 net-next 00/11] ibmvnic: Updated reset handler and code fixes
From: Nathan Fontenot @ 2017-05-02 18:52 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon

This set of patches multiple code fixes and a new rest handler
for the ibmvnic driver. In order to implement the new reset handler
for the ibmvnic driver resource initialization needed to be moved to
its own routine, a state variable is introduced to replace the
various is_* flags in the driver, and a new routine to handle the
assorted reasons the driver can be reset.

v3 updates:

Patch 10/11: Correct patch subject line to be a description of the patch.

v2 updates:

Patch 11/11: Use __netif_subqueue_stopped() instead of
netif_subqueue_stopped() to avoid possible use of an un-initialized
skb variable.

---

Nathan Fontenot (10):
      ibmvnic: Move resource initialization to its own routine
      ibmvnic: Replace is_closed with state field
      ibmvnic: Updated reset handling
      ibmvnic: Delete napi's when releasing driver resources
      ibmvnic: Whitespace correction in release_rx_pools
      ibmvnic: Clean up tx pools when closing
      ibmvnic: Wait for any pending scrqs entries at driver close
      ibmvnic: Check for driver reset first in ibmvnic_xmit
      ibmvnic: Continue skb processing after skb completion error
      ibmvnic: Move queue restarting in ibmvnic_tx_complete

Thomas Falcon (1):
      ibmvnic: Record SKB RX queue during poll


 drivers/net/ethernet/ibm/ibmvnic.c |  563 +++++++++++++++++++++++-------------
 drivers/net/ethernet/ibm/ibmvnic.h |   31 ++
 2 files changed, 389 insertions(+), 205 deletions(-)

^ permalink raw reply

* [PATCH] selftests: bpf: Use bpf_endian.h in test_xdp.c
From: David Miller @ 2017-05-02 14:54 UTC (permalink / raw)
  To: netdev; +Cc: ast, daniel


This fixes the testcase on big-endian.

Signed-off-by: David S. Miller <davem@davemloft.net>
---

Committed to net-next, I almost have test_progs working fully
on sparc with a small verifier hack...

 tools/testing/selftests/bpf/test_xdp.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_xdp.c b/tools/testing/selftests/bpf/test_xdp.c
index 9a33b03..5e7df8b 100644
--- a/tools/testing/selftests/bpf/test_xdp.c
+++ b/tools/testing/selftests/bpf/test_xdp.c
@@ -17,10 +17,9 @@
 #include <linux/pkt_cls.h>
 #include <sys/socket.h>
 #include "bpf_helpers.h"
+#include "bpf_endian.h"
 #include "test_iptunnel_common.h"
 
-#define htons __builtin_bswap16
-#define ntohs __builtin_bswap16
 int _version SEC("version") = 1;
 
 struct bpf_map_def SEC("maps") rxcnt = {
@@ -104,7 +103,7 @@ static __always_inline int handle_ipv4(struct xdp_md *xdp)
 	vip.family = AF_INET;
 	vip.daddr.v4 = iph->daddr;
 	vip.dport = dport;
-	payload_len = ntohs(iph->tot_len);
+	payload_len = bpf_ntohs(iph->tot_len);
 
 	tnl = bpf_map_lookup_elem(&vip2tnl, &vip);
 	/* It only does v4-in-v4 */
@@ -126,7 +125,7 @@ static __always_inline int handle_ipv4(struct xdp_md *xdp)
 	    iph + 1 > data_end)
 		return XDP_DROP;
 
-	set_ethhdr(new_eth, old_eth, tnl, htons(ETH_P_IP));
+	set_ethhdr(new_eth, old_eth, tnl, bpf_htons(ETH_P_IP));
 
 	iph->version = 4;
 	iph->ihl = sizeof(*iph) >> 2;
@@ -134,7 +133,7 @@ static __always_inline int handle_ipv4(struct xdp_md *xdp)
 	iph->protocol = IPPROTO_IPIP;
 	iph->check = 0;
 	iph->tos = 0;
-	iph->tot_len = htons(payload_len + sizeof(*iph));
+	iph->tot_len = bpf_htons(payload_len + sizeof(*iph));
 	iph->daddr = tnl->daddr.v4;
 	iph->saddr = tnl->saddr.v4;
 	iph->ttl = 8;
@@ -195,12 +194,12 @@ static __always_inline int handle_ipv6(struct xdp_md *xdp)
 	    ip6h + 1 > data_end)
 		return XDP_DROP;
 
-	set_ethhdr(new_eth, old_eth, tnl, htons(ETH_P_IPV6));
+	set_ethhdr(new_eth, old_eth, tnl, bpf_htons(ETH_P_IPV6));
 
 	ip6h->version = 6;
 	ip6h->priority = 0;
 	memset(ip6h->flow_lbl, 0, sizeof(ip6h->flow_lbl));
-	ip6h->payload_len = htons(ntohs(payload_len) + sizeof(*ip6h));
+	ip6h->payload_len = bpf_htons(bpf_ntohs(payload_len) + sizeof(*ip6h));
 	ip6h->nexthdr = IPPROTO_IPV6;
 	ip6h->hop_limit = 8;
 	memcpy(ip6h->saddr.s6_addr32, tnl->saddr.v6, sizeof(tnl->saddr.v6));
@@ -224,9 +223,9 @@ int _xdp_tx_iptunnel(struct xdp_md *xdp)
 
 	h_proto = eth->h_proto;
 
-	if (h_proto == htons(ETH_P_IP))
+	if (h_proto == bpf_htons(ETH_P_IP))
 		return handle_ipv4(xdp);
-	else if (h_proto == htons(ETH_P_IPV6))
+	else if (h_proto == bpf_htons(ETH_P_IPV6))
 
 		return handle_ipv6(xdp);
 	else
-- 
2.1.2.532.g19b5d50

^ permalink raw reply related

* Re: [PATCH v1 1/3] bnx2x: Replace custom scnprintf()
From: David Miller @ 2017-05-02 14:42 UTC (permalink / raw)
  To: andy.shevchenko; +Cc: Yuval.Mintz, andriy.shevchenko, netdev, Ariel.Elior
In-Reply-To: <CAHp75VfX71nmjzr23WWUO6ycGcvyw1sNkdTLhX-L=Pbk2jDPLg@mail.gmail.com>

From: Andy Shevchenko <andy.shevchenko@gmail.com>
Date: Tue, 2 May 2017 11:45:50 +0300

> On Sun, Apr 30, 2017 at 6:32 PM, David Miller <davem@davemloft.net> wrote:
>> It is an essential part of all patch series.
> 
> I hope it doesn't apply for the single patch(es).

No it does not.

^ permalink raw reply

* Re: net/smc and the RDMA core
From: Doug Ledford @ 2017-05-02 14:34 UTC (permalink / raw)
  To: Ursula Braun, Bart Van Assche, hch-jcswGhMUV9g@public.gmane.org,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <dce14470-06f4-8da3-6894-cd724eac3447-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>


[-- Attachment #1.1: Type: text/plain, Size: 5819 bytes --]

On 5/2/2017 8:34 AM, Ursula Braun wrote:
> On 05/01/2017 07:29 PM, Bart Van Assche wrote:
>> On Mon, 2017-05-01 at 18:33 +0200, Christoph Hellwig wrote:
>>> Hi Ursual, hi netdev reviewers,
>>>
>>> how did the smc protocol manage to get merged without any review 
>>> on linux-rdma at all?  As the results it seems it's very substandard
>>> in terms of RDMA API usage, e.g. it neither uses the proper CQ API
>>> nor the RDMA R/W API, and other will probably find additional issues
>>> as well.
>>
>> Hello Dave and Ursula,
>>
>> It seems very rude to me to have merged the SMC protocol driver without
>> having involved the linux-rdma community. Anyway, I have the following
>> questions for Dave and Ursula:
>> * Since the Linux kernel is standards based: where can we find the standard
>>   that defines the SMC wire protocol? If this protocol has not been
>>   standardized yet: in what file (other than *.[ch]) in the Linux kernel
>>   tree has this protocol been documented?
> 
> Hello Bart,
> 
> The protocol is standardized, see: http://www.rfc-editor.org/info/rfc7609.

No, SMC-R is *not* standardized.  It is informationally publicized.  In
other words, you guys dumped what you considered your standard out and
rfc-editor published it under the "information" status category.  To be
an Internet Standard requires more work, and additionally requires a
period of time for people to comment on this proposal.  Given that this
is for a linux RDMA communications layer, advertising this RFC on the
linux-rdma@ mailing list seems the absolute *minimum* advertising that
should be done during the comment phase (at least as relates to the RDMA
portion of this protocol, which this protocol is first and foremost an
RDMA protocol with only TCP as a control/fallback).  It's been 18 months
since you published this, and this is the first that it's been brought
to the linux-rdma@ mailing list to my knowledge.

Anyway, the RFC is informational, not a standard, and as it stands, I'm
pretty sure the RDMA community would have a number of comments on it
before it were allowed to become a standard, including the fact that it
is currently exclusive to RoCEv1 which is, as far as the RDMA community
is concerned, a deprecated RoCE mode.  Adding a new protocol that only
supports this is just simply short sighted.  Had we been consulted
before this was merged, I have no doubt that we would have had a
considerable list of review requirements.  But, we are where we are, so
the issue is how to move forward.  Moving it to staging doesn't seem so
inappropriate in this particular case...

> I described this and some more protocol details in my patch series
> overview mail, see for instance:
>      http://marc.info/?l=linux-s390&m=148397751211964&w=2
> 
> This description explains the reasons to come up with SMC-R.
> 
>> * What are the differences between the SMC protocol, the SDP protocol and
>>   the rsockets protocol? How do existing implementations for these protocols
>>   compare to each other from a performance point of view? If no performance
>>   comparison between these protocols is available, shouldn't the performance
>>   of these protocols have been compared with each other before a review of
>>   the SMC driver even started?
>> * What are the reasons why the SDP driver was never accepted upstream? Do
>>   the arguments why SDP was not accepted upstream also apply to the SMC
>>   driver (SDP = Sockets Direct Protocol)?
>> * Since SMC has to be selected by specifying AF_SMC, how are users expected
>>   to specify whether AF_INET, AF_INET6 or yet another address family should
>>   be used to set up a connection between SMC
>> endpoints?
> 
> The IPv6 support in SMC-R is on our todo-list.
> 
>> * Is the SMC driver limited to RoCE? Are you aware that the rsockets library
>>   supports multiple transport layers (RoCE, IB and iWARP)?
> 
> For now, only RoCE is supported. Other transports might be added in the future.

We generally don't allow this type of partial support in RDMA code if we
can avoid it.  There are means in place in the RDMA subsystem to hide
the differences between the different RDMA types and we highly frown on
any code that doesn't deal with all of the fabrics.  Old code might get
grandfathered in if it had good reason for being specific to a fabric,
but not new code.  Fixing this would have been high on our list of
review items.  At a minimum supporting iWARP and all flavors of RoCE
would have required as these are all Ethernet devices at their heart and
should all support the required TCP control channel and such.  We
*might* have been more lenient on IB and OPA since they don't have a
native TCP network device, but since IPoIB works on both, even that
isn't really a reason not to support them.  The real issue is the much
more difficult issue of namespaces and SELinux, which is different
between TCP sockets and IB/OPA connections.  That would have been the
difficult part to get right, but as we haven't investigated it, we
really don't know if it would have been solvable in a reasonable fashion.

>> * Since functionality that is similar what the SMC driver provides already
>>   exists in user space (rsockets), why has this functionality been
>>   reimplemented as a kernel driver (SMC)?
>>
>> Bart.
>>
> 
> Regards, Ursula
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 


-- 
Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
    GPG Key ID: B826A3330E572FDD
    Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57 2FDD


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 884 bytes --]

^ permalink raw reply

* Re: [PATCH] net: dsa: mv88e6xxx: fix mdio bus name when using devicetree
From: Sylvain Lemieux @ 2017-05-02 14:26 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: vivien.didelot, netdev
In-Reply-To: <20170501152054.GD1285@lunn.ch>

Hi Andrew,

On Mon, 2017-05-01 at 17:20 +0200, Andrew Lunn wrote:
> On Mon, May 01, 2017 at 10:54:04AM -0400, Sylvain Lemieux wrote:
> > From: Liam Beguin <lbeguin@tycoint.com>
> > 
> > mv88e6xxx_mdio_register automatically generates mdio buses for each switch
> > discovered in the devicetree. When switch nodes are embedded in other nodes,
> > this can cause sysfs naming collisions since full_name may be truncated.
> > 
> > Only use devicetree node name instead of the full devicetree path
> > as the mdio bus name.
> 
> Hi Sylvain
> 
> I'm not sure this is a good idea. It probably breaks my boards:
> 
> :/sys/class/mdio_bus# ls
> !mdio-mux!mdio@1!switch@0!mdio	0.1  0.4  400d0000.ethernet-1  fixed-0
> !mdio-mux!mdio@2!switch@0!mdio	0.2  0.8  400d1000.ethernet-2  mv88e6xxx-0
> 
> np->name is not unique, where as np->full_name is unique.
> 
> However, i can understand your problem with truncation. Maybe a better
> solution is to detect if truncation is going to happen. If so, use a
> concatenation of a hash of np->full_name, and the right hand part of
> np->full_name?
> 
Thanks for the feedback.

I am currently busy on something else; I should be able to look at this
next week or the following.

Regards,
Sylvain

> 	Andrew

^ permalink raw reply

* [PATCH net v4 12/12] net: batman-adv: Fix one possbile memleak when fail to register_netdevice
From: gfree.wind @ 2017-05-02 14:17 UTC (permalink / raw)
  To: davem, jiri, mareklindner, sw, a, kuznet, jmorris, yoshfuji,
	kaber, steffen.klassert, herbert, netdev
  Cc: Gao Feng
In-Reply-To: <cover.1493699451.git.gfree.wind@foxmail.com>

From: Gao Feng <gfree.wind@foxmail.com>

The batman-adv allocates some resources in its ndo_init func, and free
them in its destructor func. Then there is one memleak that some errors
happen after register_netdevice invokes the ndo_init callback. Because
the destructor would not be invoked to free the resources.

Now create one new func batadv_destructor_free to free the mem in
the destructor, and add ndo_uninit func also invokes it when fail to
register the batman-adv device.

It's not only free all resources, but also follow the original desgin
that the resources are freed in the destructor normally after
register the device successfully.

Signed-off-by: Gao Feng <gfree.wind@foxmail.com>
---
 net/batman-adv/soft-interface.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index d042c99..626cbf6 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -878,6 +878,19 @@ static int batadv_softif_init_late(struct net_device *dev)
 	return ret;
 }
 
+static void batadv_destructor_free(struct net_device *dev)
+{
+	batadv_debugfs_del_meshif(dev);
+	batadv_mesh_free(dev);
+}
+
+static void batadv_softif_uninit(struct net_device *dev)
+{
+	/* dev is not registered, perform the free instead of destructor */
+	if (dev->reg_state == NETREG_UNINITIALIZED)
+		batadv_destructor_free(dev);
+}
+
 /**
  * batadv_softif_slave_add - Add a slave interface to a batadv_soft_interface
  * @dev: batadv_soft_interface used as master interface
@@ -933,6 +946,7 @@ static int batadv_softif_slave_del(struct net_device *dev,
 
 static const struct net_device_ops batadv_netdev_ops = {
 	.ndo_init = batadv_softif_init_late,
+	.ndo_uninit = batadv_softif_uninit,
 	.ndo_open = batadv_interface_open,
 	.ndo_stop = batadv_interface_release,
 	.ndo_get_stats = batadv_interface_stats,
@@ -953,9 +967,7 @@ static int batadv_softif_slave_del(struct net_device *dev,
  */
 static void batadv_softif_free(struct net_device *dev)
 {
-	batadv_debugfs_del_meshif(dev);
-	batadv_mesh_free(dev);
-
+	batadv_destructor_free(dev);
 	/* some scheduled RCU callbacks need the bat_priv struct to accomplish
 	 * their tasks. Wait for them all to be finished before freeing the
 	 * netdev and its private data (bat_priv)
-- 
1.9.1

^ permalink raw reply related

* Re: sparc64 and ARM64 JIT bug (was Re: LLVM 4.0 code generation bug)
From: Daniel Borkmann @ 2017-05-02 14:11 UTC (permalink / raw)
  To: David Miller, ast; +Cc: netdev, xi.wang, catalin.marinas
In-Reply-To: <20170501.230234.787989809925411599.davem@davemloft.net>

On 05/02/2017 05:02 AM, David Miller wrote:
> From: Alexei Starovoitov <ast@fb.com>
> Date: Mon, 1 May 2017 19:39:33 -0700
>
>> On 5/1/17 7:31 PM, David Miller wrote:
>>>
>>> If the last BPF instruction before exit is a ldimm64, branches to the
>>> exit point at the wrong location.
>>>
>>> Here is what I get from test_pkt_access.c on sparc:
>>>
>>> 0000000000000000 <process>:
>>>     0:	b7 00 00 00 00 00 00 02 	mov	r0, 2
>>>     8:	61 21 00 50 00 00 00 00 	ldw	r2, [r1+80]
>>>    10:	61 11 00 4c 00 00 00 00 	ldw	r1, [r1+76]
>>>    18:	bf 41 00 00 00 00 00 00 	mov	r4, r1
>>>    20:	07 40 00 00 00 00 00 0e 	add	r4, 14
>>>    28:	2d 42 00 25 00 00 00 00 	jgt	r4, r2, 148 <LBB0_11>
>>>   ...
>>> 0000000000000148 <LBB0_11>:
>>>   148:	18 00 00 00 ff ff ff ff 	ldimm64	r0, 4294967295
>>>   150:	00 00 00 00 00 00 00 00
>>>
>>> 0000000000000158 <LBB0_12>:
>>>   158:	95 00 00 00 00 00 00 00 	exit	
>   ...
>> looks fine to me. it jumps to 0x158,
>> since offset 0 is the next insn after jump which is 0x30
>> That's how classic bpf defined jumps.
>
> Ok, it seems that both arm64 and sparc64's JIT handle the above
> situation improperly.
>
> They both work by recording the instruction offsets in an array which
> is indexed off by one.  It it built like this:
>
> 	for (i = 0; i < prog->len; i++) {
> 		const struct bpf_insn *insn = &prog->insnsi[i];
> 		int ret;
>
> 		ret = build_insn(insn, ctx);
> 		ctx->offset[i] = ctx->idx;
>
> 		if (ret > 0) {
> 			i++;
> 			continue;
> 		}
> 		if (ret)
> 			return ret;
> 	}
>
> That is, we record the JIT'd instruction offset for BPF instruction
> 'idx' in array entry 'idx - 1'.
>
> Then when we emit a relative branch, we lookup the destination offset
> using "ctx->offset[this_insn_idx + insn->off]"
>
> And this works most of the time.  It doesn't work for the scenerio
> above, because 'idx - 1' is not necessarily the index of the previous
> BPF instruction.  Instead, that might point to the second half of an
> lddimm64 instruction.
>
> This bug was introduced by commit
> 8eee539ddea09bccae2426f09b0ba6a18b72b691 ("arm64: bpf: fix
> out-of-bounds read in bpf2a64_offset()") and I copied the logic into
> sparc64 :-)

I'll take a look at the arm64 one since I have a node at hand, and add
tests into lib/test_bpf.c as well later today.

^ permalink raw reply

* Re: net/ipv6: warning in inet6_ifa_finish_destroy
From: Mike Manning @ 2017-05-02 14:04 UTC (permalink / raw)
  To: Cong Wang, Andrey Konovalov
  Cc: David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev, LKML, Eric Dumazet,
	David Ahern, Dmitry Vyukov, Kostya Serebryany, syzkaller
In-Reply-To: <CAM_iQpUTx_nDtZ63TRVGnqu5L33NzTE8XFMjaxSD_i4ghGFQ7g@mail.gmail.com>

On 28/04/17 21:39, Cong Wang wrote:
> On Fri, Apr 28, 2017 at 6:08 AM, Andrey Konovalov <andreyknvl@google.com> wrote:
>> Hi,
>>
>> I've got the following error report while fuzzing the kernel with syzkaller.
>>
>> On commit 5a7ad1146caa895ad718a534399e38bd2ba721b7 (4.11-rc8).
>>
>> C reproducer and .config are attached.
>> It takes 1-2 minutes of running the reproducer to trigger the issue.
>>
>> ------------[ cut here ]------------
>> WARNING: CPU: 0 PID: 21 at net/ipv6/addrconf.c:894
>> inet6_ifa_finish_destroy+0x12e/0x190
>> Modules linked in:
>> CPU: 0 PID: 21 Comm: kworker/0:1 Not tainted 4.11.0-rc8+ #296
>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>> Workqueue: ipv6_addrconf addrconf_dad_work
>> Call Trace:
>>  __dump_stack lib/dump_stack.c:16
>>  dump_stack+0x292/0x398 lib/dump_stack.c:52
>>  __warn+0x19f/0x1e0 kernel/panic.c:549
>>  warn_slowpath_null+0x2c/0x40 kernel/panic.c:584
>>  inet6_ifa_finish_destroy+0x12e/0x190 c:894
>>  in6_ifa_put ./include/net/addrconf.h:330
>>  addrconf_dad_work+0x4e9/0x1040 net/ipv6/addrconf.c:3963
> 
> 
> I don't look too much, but a quick glance shows in the following
> path:
> 
>         } else if (action == DAD_ABORT) {
>                 in6_ifa_hold(ifp);
>                 addrconf_dad_stop(ifp, 1);
>                 if (disable_ipv6)
>                         addrconf_ifdown(idev->dev, 0);
>                 goto out;
>         }
> 
> the inet6_addr could be removed from hash table in
> addrconf_ifdown() before calling in6_ifa_put(). which causes
> this warning.
> 

git describe 85b51b12115c79cce7ea1ced6c0bd0339a165d3f --contains
v4.8-rc5~34^2~32

This fix was introduced in 4.8, so it is interesting that this problem is only showing up now for 4.11. Also, it is not reproducible manually, i.e. DAD failure with disable_ipv6 works just fine without triggering this warning, with or without keeping IPv6 addresses on admin down.

I will go ahead with putting out a fix so that in6_ifa_put() precedes the call to addrconf_ifdown() in this case.

Thanks for the heads up on this,
Mike

^ permalink raw reply

* Re: IPsec PFP support on linux
From: Sowmini Varadhan @ 2017-05-02 14:05 UTC (permalink / raw)
  To: Paul Wouters
  Cc: steffen.klassert, borisp, swan, netdev, ilant, linux-crypto,
	herbert
In-Reply-To: <B4EE88ED-B403-42C3-A6D8-1A2526F1DA67@nohats.ca>

On (05/02/17 09:58), Paul Wouters wrote:
>    I think you want to use Opportunistic IPsec, eg see 
>    https://libreswan.org/wiki/HOWTO:_Opportunistic_IPsec

I dont think that what I want is opportunistic ipsec..

taking an extreme example, I can set up the ipsec tunnels with 
esp-null for *.5001 and 5001.* (and that's how I'm able to
trigger quick mode in my experiment).

But I want each 5-tuple to/from 5001 to get its own SPI. That
is not happening in my case. 

If I used OE, I would *never* get a per-5-tuple SPI, right?
(I can give this a try later today but the rfc definition of OE is
not what I have in mind- I really want PFP, not OE).

>    Note that IKEv2 also allows you to define one connection and instantiate
>     a connection based on the trigger packet whose src/dst proto/port are
>    included in the IKEv2 packet as traffic selectors. See RFC7296 and
>    "narrowing".

yes, the rfc's permit this, I'm not sure how to set up my 
SPD for this though.

^ permalink raw reply

* Re: IPsec PFP support on linux
From: Paul Wouters @ 2017-05-02 13:58 UTC (permalink / raw)
  To: Sowmini Varadhan
  Cc: steffen.klassert, borisp, swan, netdev, ilant, linux-crypto,
	herbert
In-Reply-To: <20170502123238.GE5843@oracle.com>


[-- Attachment #1.1: Type: text/plain, Size: 2979 bytes --]

I think you want to use Opportunistic IPsec, eg see 

https://libreswan.org/wiki/HOWTO:_Opportunistic_IPsec

Note that IKEv2 also allows you to define one connection and instantiate  a connection based on the trigger packet whose src/dst proto/port are included in the IKEv2 packet as traffic selectors. See RFC7296 and "narrowing".

Paul


Sent from my iPhone

> On May 2, 2017, at 08:32, Sowmini Varadhan <sowmini.varadhan@oracle.com> wrote:
> 
> I have a question about linux support for IPsec PFP (as defined in
> rfc 4301). I am assuming this exists, and is accessible from uspace,
> in which case I need some hints on how to set it up.
> 
> Assuming I have a server listening at port 5001 that I want to
> secure via ipsec. Suppose I want to make sure that each TCP/UDP 5-tuple
> sending packets to port 5001 gets its own SA.
> 
> RFC4301 has this:
> 
>       - SPD-S: For traffic that is to be protected using IPsec, the
>         entry consists of the values of the selectors that apply to the
>         traffic to be protected via AH or ESP, controls on how to
>         create SAs based on these selectors, ...
> 
> and further down
>      If IPsec processing is specified for
>      an entry, a "populate from packet" (PFP) flag may be asserted for
>      one or more of the selectors in the SPD entry (Local IP address;
>      Remote IP address; Next Layer Protocol; and, depending on Next
>      Layer Protocol, Local port and Remote port, or ICMP type/code, or
>      Mobility Header type).  If asserted for a given selector X, the
>      flag indicates that the SA to be created should take its value for
>      X from the value in the packet.  Otherwise, the SA should take its
>      value(s) for X from the value(s) in the SPD entry.
> 
> A google search produces a discarded patch
>  http://marc.info/?l=linux-netdev&m=119746758904140
> but its not clear to me how to set this up (if PFP works fine,
> as suggested by Herbert's response above)
> 
> I tried experimenting with IP_XFRM_POLICY from a simple udp client but
> (a) that seems to require a SPI and reqid to set up the SPD 
> (b) I see the SADB_ACQUIRE upcall being triggered after the local port
>    is bound (and SADB entry is set up for the lport).  But ike phase2
>    does not converge for the lport specific sadb added
>    by the bind (even in quick mode)
> 
> My understanding is that pluto shoud be generating spi's to make sure
> they are sufficiently unique/random etc. so (a) makes me think I'm
> either not setting this up or not using this correctly.
> 
> Any hints/sample code/RTFMs would be helpful (documentation for
> IP_XFRM_POLICY seems scant, afaict). I'd be happy to share my 
> udp client program, if it can provide more context to my question.
> 
> --Sowmini
> 
> _______________________________________________
> Swan mailing list
> Swan@lists.libreswan.org
> https://lists.libreswan.org/mailman/listinfo/swan

[-- Attachment #1.2: Type: text/html, Size: 4716 bytes --]

[-- Attachment #2: Type: text/plain, Size: 0 bytes --]



^ permalink raw reply

* RE: [PATCH 3/4] net: macb: Add hardware PTP support
From: Rafal Ozieblo @ 2017-05-02 13:57 UTC (permalink / raw)
  To: Richard Cochran
  Cc: David Miller, nicolas.ferre@atmel.com, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, devicetree@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org,
	harinikatakamlinux@gmail.com, harini.katakam@xilinx.com,
	Andrei.Pistirica@microchip.com
In-Reply-To: <20170414182850.GB7751@localhost.localdomain>

> From: Richard Cochran [mailto:richardcochran@gmail.com]
> Sent: 14 kwietnia 2017 20:29
> To: Rafal Ozieblo <rafalo@cadence.com>
> Cc: David Miller <davem@davemloft.net>; nicolas.ferre@atmel.com;
> netdev@vger.kernel.org; linux-kernel@vger.kernel.org;
> devicetree@vger.kernel.org; linux-arm-kernel@lists.infradead.org;
> harinikatakamlinux@gmail.com; harini.katakam@xilinx.com;
> Andrei.Pistirica@microchip.com
> Subject: Re: [PATCH 3/4] net: macb: Add hardware PTP support
> 
> On Thu, Apr 13, 2017 at 02:39:23PM +0100, Rafal Ozieblo wrote:
(...)
> > +static int gem_tsu_get_time(struct macb *bp, struct timespec64 *ts)
> > +{
> > +	long first, second;
> > +	u32 secl, sech;
> > +	unsigned long flags;
> > +
> > +	if (!bp || !ts)
> > +		return -EINVAL;
> 
> Useless test.
Sorry for me being too carefulness, I'll remove all tests.
> 
(...)
> > +static int gem_ptp_gettime(struct ptp_clock_info *ptp, struct timespec64
> *ts)
> > +{
> > +	struct macb *bp = container_of(ptp, struct macb, ptp_clock_info);
> > +
> > +	if (!ptp || !ts)
> > +		return -EINVAL;
> 
> Useles test.
> 
> What is the point of this wrapper function anyhow?  Please remove it.
gem_ptp_gettime() is assigned in ptp_clock_info and it has to have 
ptp_clock_info pointer as first parameter. gem_tsu_get_time() is used in
the source code but with macb pointer.
Do you want me to do something like:
gem_ptp_gettime(macb->ptp, ts);
and first would be getting macb pointer from ptp ?
struct macb *bp = container_of(ptp, struct macb, ptp_clock_info);
> 
> > +
> > +	gem_tsu_get_time(bp, ts);
> > +	return 0;
> > +}
> > +
> > +static int gem_ptp_settime(struct ptp_clock_info *ptp,
> > +		const struct timespec64 *ts)
> > +{
> > +	struct macb *bp = container_of(ptp, struct macb, ptp_clock_info);
> > +
> > +	if (!ptp || !ts)
> > +		return -EINVAL;
> 
> Another useless function.
ditto
> 
> > +	gem_tsu_set_time(bp, ts);
> > +	return 0;
> > +}
> > +
> > +static int gem_ptp_enable(struct ptp_clock_info *ptp,
> > +			  struct ptp_clock_request *rq, int on)
> > +{
> > +	struct macb *bp = container_of(ptp, struct macb, ptp_clock_info);
> > +
> > +	if (!ptp || !rq)
> > +		return -EINVAL;
> 
> Sigh.
> 
> > +
> > +	switch (rq->type) {
> > +	case PTP_CLK_REQ_EXTTS:	/* Toggle TSU match interrupt */
> > +		if (on)
> > +			macb_writel(bp, IER, MACB_BIT(TCI));
> 
> No locking to protect IER and IDE?
There is no need.
> 
> > +		else
> > +			macb_writel(bp, IDR, MACB_BIT(TCI));
> > +		break;
> > +	case PTP_CLK_REQ_PEROUT: /* Toggle Periodic output */
> > +		return -EOPNOTSUPP;
> > +		/* break; */
> > +	case PTP_CLK_REQ_PPS:	/* Toggle TSU periodic (second)
> interrupt */
> > +		if (on)
> > +			macb_writel(bp, IER, MACB_BIT(SRI));
> > +		else
> > +			macb_writel(bp, IDR, MACB_BIT(SRI));
> > +		break;
> > +	default:
> > +		break;
> > +	}
> > +	return 0;
> > +}
> > +
(...)
> > --
> > 2.4.5
> >
> 
> Thanks,
> Richard

^ permalink raw reply

* [PATCH net v4 11/12] net: vlan: Fix one possbile memleak when fail to register_netdevice
From: gfree.wind @ 2017-05-02 13:41 UTC (permalink / raw)
  To: davem, jiri, mareklindner, sw, a, kuznet, jmorris, yoshfuji,
	kaber, steffen.klassert, herbert, netdev
  Cc: Gao Feng
In-Reply-To: <cover.1493699451.git.gfree.wind@foxmail.com>

From: Gao Feng <gfree.wind@foxmail.com>

The vlan driver allocates some resources in its ndo_init func, and
free some of them in its destructor func. Then there is one memleak
that some errors happen after register_netdevice invokes the ndo_init
callback. Because only the ndo_uninit callback is invoked in the error
handler of register_netdevice, but destructor not.

Now create one new func vlan_destructor_free to free the mem in the
destructor, and ndo_uninit func also invokes it when fail to register
the vlan device.

It's not only free all resources, but also follow the original desgin
that the resources are freed in the destructor normally after
register the device successfully.

Signed-off-by: Gao Feng <gfree.wind@foxmail.com>
---
 net/8021q/vlan_dev.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index e97ab82..adcaa39 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -608,6 +608,14 @@ static int vlan_dev_init(struct net_device *dev)
 	return 0;
 }
 
+static void vlan_destructor_free(struct net_device *dev)
+{
+	struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+
+	free_percpu(vlan->vlan_pcpu_stats);
+	vlan->vlan_pcpu_stats = NULL;
+}
+
 static void vlan_dev_uninit(struct net_device *dev)
 {
 	struct vlan_priority_tci_mapping *pm;
@@ -620,6 +628,10 @@ static void vlan_dev_uninit(struct net_device *dev)
 			kfree(pm);
 		}
 	}
+
+	/* dev is not registered, perform the free instead of destructor */
+	if (dev->reg_state == NETREG_UNINITIALIZED)
+		vlan_destructor_free(dev);
 }
 
 static netdev_features_t vlan_dev_fix_features(struct net_device *dev,
@@ -803,10 +815,7 @@ static int vlan_dev_get_iflink(const struct net_device *dev)
 
 static void vlan_dev_free(struct net_device *dev)
 {
-	struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
-
-	free_percpu(vlan->vlan_pcpu_stats);
-	vlan->vlan_pcpu_stats = NULL;
+	vlan_destructor_free(dev);
 	free_netdev(dev);
 }
 
-- 
1.9.1

^ permalink raw reply related

* [PATCH net v4 10/12] net: sit: Fix one possbile memleak when fail to register_netdevice
From: gfree.wind @ 2017-05-02 13:39 UTC (permalink / raw)
  To: davem, jiri, mareklindner, sw, a, kuznet, jmorris, yoshfuji,
	kaber, steffen.klassert, herbert, netdev
  Cc: Gao Feng
In-Reply-To: <cover.1493699451.git.gfree.wind@foxmail.com>

From: Gao Feng <gfree.wind@foxmail.com>

The ipip6 allocates some resources in its ndo_init func, and
free some of them in its destructor func. Then there is one memleak
that some errors happen after register_netdevice invokes the ndo_init
callback. Because only the ndo_uninit callback is invoked in the error
handler of register_netdevice, but destructor not.

Now create one new func ipip6_destructor_free to free the mem in
the destructor, and ndo_uninit func also invokes it when fail to
register the ipip6 device.

It's not only free all resources, but also follow the original desgin
that the resources are freed in the destructor normally after
register the device successfully.

Signed-off-by: Gao Feng <gfree.wind@foxmail.com>
---
 net/ipv6/sit.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 99853c6..28c1649 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -464,6 +464,14 @@ static void prl_list_destroy_rcu(struct rcu_head *head)
 	return ok;
 }
 
+static void ipip6_destructor_free(struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+
+	dst_cache_destroy(&tunnel->dst_cache);
+	free_percpu(dev->tstats);
+}
+
 static void ipip6_tunnel_uninit(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
@@ -477,6 +485,10 @@ static void ipip6_tunnel_uninit(struct net_device *dev)
 	}
 	dst_cache_reset(&tunnel->dst_cache);
 	dev_put(dev);
+
+	/* dev is not registered, perform the free instead of destructor */
+	if (dev->reg_state == NETREG_UNINITIALIZED)
+		ipip6_destructor_free(dev);
 }
 
 static int ipip6_err(struct sk_buff *skb, u32 info)
@@ -1329,10 +1341,7 @@ static bool ipip6_valid_ip_proto(u8 ipproto)
 
 static void ipip6_dev_free(struct net_device *dev)
 {
-	struct ip_tunnel *tunnel = netdev_priv(dev);
-
-	dst_cache_destroy(&tunnel->dst_cache);
-	free_percpu(dev->tstats);
+	ipip6_destructor_free(dev);
 	free_netdev(dev);
 }
 
-- 
1.9.1

^ permalink raw reply related

* [PATCH net v4 09/12] net: ip_tunnel: Fix one possbile memleak when fail to register_netdevice
From: gfree.wind @ 2017-05-02 13:37 UTC (permalink / raw)
  To: davem, jiri, mareklindner, sw, a, kuznet, jmorris, yoshfuji,
	kaber, steffen.klassert, herbert, netdev
  Cc: Gao Feng
In-Reply-To: <cover.1493699451.git.gfree.wind@foxmail.com>

From: Gao Feng <gfree.wind@foxmail.com>

The ip_tunnel allocates some resources in its ndo_init func, and
free some of them in its destructor func. Then there is one memleak
that some errors happen after register_netdevice invokes the ndo_init
callback. Because only the ndo_uninit callback is invoked in the error
handler of register_netdevice, but destructor not.

Now create one new func ip_tunnel_destructor_free to free the mem in
the destructor, and ndo_uninit func also invokes it when fail to
register the ip_tunnel device.

It's not only free all resources, but also follow the original desgin
that the resources are freed in the destructor normally after
register the device successfully.

Signed-off-by: Gao Feng <gfree.wind@foxmail.com>
---
 net/ipv4/ip_tunnel.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 823abae..96a3005 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -954,13 +954,18 @@ int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
 
-static void ip_tunnel_dev_free(struct net_device *dev)
+static void ip_tunnel_destructor_free(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 
 	gro_cells_destroy(&tunnel->gro_cells);
 	dst_cache_destroy(&tunnel->dst_cache);
 	free_percpu(dev->tstats);
+}
+
+static void ip_tunnel_dev_free(struct net_device *dev)
+{
+	ip_tunnel_destructor_free(dev);
 	free_netdev(dev);
 }
 
@@ -1192,6 +1197,10 @@ void ip_tunnel_uninit(struct net_device *dev)
 		ip_tunnel_del(itn, netdev_priv(dev));
 
 	dst_cache_reset(&tunnel->dst_cache);
+
+	/* dev is not registered, perform the free instead of destructor */
+	if (dev->reg_state == NETREG_UNINITIALIZED)
+		ip_tunnel_destructor_free(dev);
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
 
-- 
1.9.1

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox