Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 09/15] netvsc: optimize netvsc_send_pkt
From: Stephen Hemminger @ 2017-05-03 23:01 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger
In-Reply-To: <20170503230117.20070-1-sthemmin@microsoft.com>

Hand optimize netvsc_send_pkt by adding likely/unlikely.
Also don't print pointer in warning message, instead dump info.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
---
 drivers/net/hyperv/netvsc.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 59ca5fd6797d..d9bd1a2db4db 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -751,9 +751,9 @@ static u32 netvsc_copy_to_send_buf(struct netvsc_device *net_device,
 	return msg_size;
 }
 
-static inline int netvsc_send_pkt(
+static int netvsc_send_pkt(
 	struct hv_device *device,
-	struct hv_netvsc_packet *packet,
+	const struct hv_netvsc_packet *packet,
 	struct netvsc_device *net_device,
 	struct hv_page_buffer **pb,
 	struct sk_buff *skb)
@@ -766,7 +766,6 @@ static inline int netvsc_send_pkt(
 	struct netdev_queue *txq = netdev_get_tx_queue(ndev, packet->q_idx);
 	u64 req_id;
 	int ret;
-	struct hv_page_buffer *pgbuf;
 	u32 ring_avail = hv_ringbuf_avail_percent(out_channel);
 
 	nvmsg.hdr.msg_type = NVSP_MSG1_TYPE_SEND_RNDIS_PKT;
@@ -788,10 +787,12 @@ static inline int netvsc_send_pkt(
 
 	req_id = (ulong)skb;
 
-	if (out_channel->rescind)
+	if (unlikely(out_channel->rescind))
 		return -ENODEV;
 
 	if (packet->page_buf_cnt) {
+		struct hv_page_buffer *pgbuf;
+
 		pgbuf = packet->cp_partial ? (*pb) +
 			packet->rmsg_pgcnt : (*pb);
 		ret = vmbus_sendpacket_pagebuffer_ctl(out_channel,
@@ -809,20 +810,23 @@ static inline int netvsc_send_pkt(
 					   VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
 	}
 
-	if (ret == 0) {
-		atomic_inc_return(&nvchan->queue_sends);
+	if (likely(ret == 0)) {
+		atomic_inc(&nvchan->queue_sends);
 
 		if (ring_avail < RING_AVAIL_PERCENT_LOWATER)
 			netif_tx_stop_queue(txq);
-	} else if (ret == -EAGAIN) {
+	} else if (likely(ret == -EAGAIN)) {
 		netif_tx_stop_queue(txq);
 		if (atomic_read(&nvchan->queue_sends) < 1) {
 			netif_tx_wake_queue(txq);
 			ret = -ENOSPC;
 		}
 	} else {
-		netdev_err(ndev, "Unable to send packet %p ret %d\n",
-			   packet, ret);
+		if (net_ratelimit())
+			netdev_warn(ndev,
+				    "Unable to send packet qid %u index %d len %u (%d)\n",
+				    packet->q_idx, packet->send_buf_index,
+				    packet->total_data_buflen, ret);
 	}
 
 	return ret;
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 05/15] netvsc: optimize avail percent calculation
From: Stephen Hemminger @ 2017-05-03 23:01 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger
In-Reply-To: <20170503230117.20070-1-sthemmin@microsoft.com>

Only need to look at write space (not read space) when computing
percent available.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
---
 drivers/net/hyperv/netvsc.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 385809512c9f..ee5f8c520977 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -598,14 +598,13 @@ void netvsc_device_remove(struct hv_device *device)
  * Get the percentage of available bytes to write in the ring.
  * The return value is in range from 0 to 100.
  */
-static inline u32 hv_ringbuf_avail_percent(
-		struct hv_ring_buffer_info *ring_info)
+static inline
+u32 hv_ringbuf_avail_percent(const struct vmbus_channel *channel)
 {
-	u32 avail_read, avail_write;
+	const struct hv_ring_buffer_info *ring_info = &channel->outbound;
+	u32 avail_write = hv_get_bytes_to_write(ring_info);
 
-	hv_get_ringbuffer_availbytes(ring_info, &avail_read, &avail_write);
-
-	return avail_write * 100 / ring_info->ring_datasize;
+	return (avail_write * 100) / ring_info->ring_datasize;
 }
 
 static inline void netvsc_free_send_slot(struct netvsc_device *net_device,
@@ -655,7 +654,7 @@ static void netvsc_send_tx_complete(struct netvsc_device *net_device,
 		wake_up(&net_device->wait_drain);
 
 	if (netif_tx_queue_stopped(netdev_get_tx_queue(ndev, q_idx)) &&
-	    (hv_ringbuf_avail_percent(&channel->outbound) > RING_AVAIL_PERCENT_HIWATER ||
+	    (hv_ringbuf_avail_percent(channel) > RING_AVAIL_PERCENT_HIWATER ||
 	     queue_sends < 1))
 		netif_tx_wake_queue(netdev_get_tx_queue(ndev, q_idx));
 }
@@ -764,7 +763,7 @@ static inline int netvsc_send_pkt(
 	u64 req_id;
 	int ret;
 	struct hv_page_buffer *pgbuf;
-	u32 ring_avail = hv_ringbuf_avail_percent(&out_channel->outbound);
+	u32 ring_avail = hv_ringbuf_avail_percent(out_channel);
 
 	nvmsg.hdr.msg_type = NVSP_MSG1_TYPE_SEND_RNDIS_PKT;
 	if (skb != NULL) {
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 10/15] netvsc: replace modulus with mask for alignment
From: Stephen Hemminger @ 2017-05-03 23:01 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger
In-Reply-To: <20170503230117.20070-1-sthemmin@microsoft.com>

Since packet alignment (pkt_align) is always a power of 2, it
is safe to replace expensive divide with shift.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
---
 drivers/net/hyperv/netvsc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index d9bd1a2db4db..767ff20d659e 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -722,7 +722,7 @@ static u32 netvsc_copy_to_send_buf(struct netvsc_device *net_device,
 	int i;
 	u32 msg_size = 0;
 	u32 padding = 0;
-	u32 remain = packet->total_data_buflen % net_device->pkt_align;
+	u32 remain = packet->total_data_buflen & (net_device->pkt_align - 1);
 	u32 page_count = packet->cp_partial ? packet->rmsg_pgcnt :
 		packet->page_buf_cnt;
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 11/15] netvsc: reduce unnecessary memset
From: Stephen Hemminger @ 2017-05-03 23:01 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger
In-Reply-To: <20170503230117.20070-1-sthemmin@microsoft.com>

Only part of the headroom maybe used on typical packet. Avoid doing memset
of whole area.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
---
 drivers/net/hyperv/netvsc_drv.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index ef3a3a46790f..d9a690ad7cd5 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -195,6 +195,7 @@ static void *init_ppi_data(struct rndis_message *msg, u32 ppi_size,
 	ppi->size = ppi_size;
 	ppi->type = pkt_type;
 	ppi->ppi_offset = sizeof(struct rndis_per_packet_info);
+	memset(ppi + 1, 0, ppi_size - sizeof(struct rndis_per_packet_info));
 
 	rndis_pkt->per_pkt_info_len += ppi_size;
 
@@ -461,12 +462,12 @@ static int netvsc_start_xmit(struct sk_buff *skb, struct net_device *net)
 
 	rndis_msg = (struct rndis_message *)skb->head;
 
-	memset(rndis_msg, 0, RNDIS_AND_PPI_SIZE);
-
 	/* Add the rndis header */
 	rndis_msg->ndis_msg_type = RNDIS_MSG_PACKET;
 	rndis_msg->msg_len = packet->total_data_buflen;
+
 	rndis_pkt = &rndis_msg->msg.pkt;
+	memset(rndis_pkt, 0, sizeof(*rndis_pkt));
 	rndis_pkt->data_offset = sizeof(struct rndis_packet);
 	rndis_pkt->data_len = packet->total_data_buflen;
 	rndis_pkt->per_pkt_info_offset = sizeof(struct rndis_packet);
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 12/15] netvsc: size receive completion ring based on receive area
From: Stephen Hemminger @ 2017-05-03 23:01 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger
In-Reply-To: <20170503230117.20070-1-sthemmin@microsoft.com>

Now that receive area is parameterized, also need to adjust the
size of the ring for receive completions based on receive area.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
---
 drivers/net/hyperv/hyperv_net.h   |  4 +---
 drivers/net/hyperv/netvsc.c       | 22 ++++++++++++++--------
 drivers/net/hyperv/rndis_filter.c |  5 ++---
 3 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 21666df4cd35..5627003bd8b6 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -653,13 +653,11 @@ struct recv_comp_data {
 	u32 status;
 };
 
-/* Netvsc Receive Slots Max */
-#define NETVSC_RECVSLOT_MAX (NETVSC_RECEIVE_BUFFER_SIZE / ETH_DATA_LEN + 1)
-
 struct multi_recv_comp {
 	void *buf; /* queued receive completions */
 	u32 first; /* first data entry */
 	u32 next; /* next entry for writing */
+	u32 size; /* number of slots in ring */
 };
 
 struct netvsc_stats {
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 767ff20d659e..56e0721f703c 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -61,16 +61,22 @@ void netvsc_switch_datapath(struct net_device *ndev, bool vf)
 			       VM_PKT_DATA_INBAND, 0);
 }
 
-static struct netvsc_device *alloc_net_device(void)
+static struct netvsc_device *alloc_net_device(u32 recvslot_max)
 {
 	struct netvsc_device *net_device;
+	struct multi_recv_comp *mrc;
 
 	net_device = kzalloc(sizeof(struct netvsc_device), GFP_KERNEL);
 	if (!net_device)
 		return NULL;
 
-	net_device->chan_table[0].mrc.buf
-		= vzalloc(NETVSC_RECVSLOT_MAX * sizeof(struct recv_comp_data));
+	mrc = &net_device->chan_table[0].mrc;
+	mrc->size = recvslot_max;
+	mrc->buf = vzalloc(recvslot_max * sizeof(struct recv_comp_data));
+	if (!mrc->buf) {
+		kfree(net_device);
+		return NULL;
+	}
 
 	init_waitqueue_head(&net_device->wait_drain);
 	net_device->destroy = false;
@@ -993,10 +999,10 @@ static inline void count_recv_comp_slot(struct netvsc_device *nvdev, u16 q_idx,
 	u32 first = mrc->first;
 	u32 next = mrc->next;
 
-	*filled = (first > next) ? NETVSC_RECVSLOT_MAX - first + next :
+	*filled = (first > next) ? mrc->size - first + next :
 		  next - first;
 
-	*avail = NETVSC_RECVSLOT_MAX - *filled - 1;
+	*avail = mrc->size - *filled - 1;
 }
 
 /* Read the first filled slot, no change to index */
@@ -1022,7 +1028,7 @@ static inline void put_recv_comp_slot(struct netvsc_device *nvdev, u16 q_idx)
 	struct multi_recv_comp *mrc = &nvdev->chan_table[q_idx].mrc;
 	int num_recv;
 
-	mrc->first = (mrc->first + 1) % NETVSC_RECVSLOT_MAX;
+	mrc->first = (mrc->first + 1) % mrc->size;
 
 	num_recv = atomic_dec_return(&nvdev->num_outstanding_recvs);
 
@@ -1077,7 +1083,7 @@ static inline struct recv_comp_data *get_recv_comp_slot(
 
 	next = mrc->next;
 	rcd = mrc->buf + next * sizeof(struct recv_comp_data);
-	mrc->next = (next + 1) % NETVSC_RECVSLOT_MAX;
+	mrc->next = (next + 1) % mrc->size;
 
 	atomic_inc(&nvdev->num_outstanding_recvs);
 
@@ -1315,7 +1321,7 @@ int netvsc_device_add(struct hv_device *device,
 	struct net_device *ndev = hv_get_drvdata(device);
 	struct net_device_context *net_device_ctx = netdev_priv(ndev);
 
-	net_device = alloc_net_device();
+	net_device = alloc_net_device(device_info->recv_buf_size / ETH_DATA_LEN + 1);
 	if (!net_device)
 		return -ENOMEM;
 
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index f9d5b0b8209a..bfeaa0549f7f 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -996,9 +996,8 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc)
 		return;
 
 	nvchan = nvscdev->chan_table + chn_index;
-	nvchan->mrc.buf
-		= vzalloc(NETVSC_RECVSLOT_MAX * sizeof(struct recv_comp_data));
-
+	nvchan->mrc.size = nvscdev->recv_buf_size / ETH_DATA_LEN + 1;
+	nvchan->mrc.buf = vzalloc(nvchan->mrc.size * sizeof(struct recv_comp_data));
 	if (!nvchan->mrc.buf)
 		return;
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 13/15] netvsc: convert open count from atomic to refcount
From: Stephen Hemminger @ 2017-05-03 23:01 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger
In-Reply-To: <20170503230117.20070-1-sthemmin@microsoft.com>

Refcount provides wraparond protection.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
---
 drivers/net/hyperv/hyperv_net.h   | 2 +-
 drivers/net/hyperv/netvsc.c       | 3 ++-
 drivers/net/hyperv/rndis_filter.c | 4 ++--
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 5627003bd8b6..29555317ca05 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -772,7 +772,7 @@ struct netvsc_device {
 
 	atomic_t num_outstanding_recvs;
 
-	atomic_t open_cnt;
+	refcount_t open_cnt;
 
 	struct netvsc_channel chan_table[VRSS_CHANNEL_MAX];
 
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 56e0721f703c..eb9f3e517fa5 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -80,7 +80,8 @@ static struct netvsc_device *alloc_net_device(u32 recvslot_max)
 
 	init_waitqueue_head(&net_device->wait_drain);
 	net_device->destroy = false;
-	atomic_set(&net_device->open_cnt, 0);
+
+	refcount_set(&net_device->open_cnt, 0);
 	net_device->max_pkt = RNDIS_MAX_PKT_DEFAULT;
 	net_device->pkt_align = RNDIS_PKT_ALIGN_DEFAULT;
 	init_completion(&net_device->channel_init_wait);
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index bfeaa0549f7f..2a89bbd6e42b 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -1259,7 +1259,7 @@ int rndis_filter_open(struct netvsc_device *nvdev)
 	if (!nvdev)
 		return -EINVAL;
 
-	if (atomic_inc_return(&nvdev->open_cnt) != 1)
+	if (refcount_inc_not_zero(&nvdev->open_cnt))
 		return 0;
 
 	return rndis_filter_open_device(nvdev->extension);
@@ -1270,7 +1270,7 @@ int rndis_filter_close(struct netvsc_device *nvdev)
 	if (!nvdev)
 		return -EINVAL;
 
-	if (atomic_dec_return(&nvdev->open_cnt) != 0)
+	if (refcount_dec_not_one(&nvdev->open_cnt))
 		return 0;
 
 	return rndis_filter_close_device(nvdev->extension);
-- 
2.11.0

^ permalink raw reply related

* [PATCH 15/15] netvsc: use vzalloc_node for receive completion data
From: Stephen Hemminger @ 2017-05-03 23:01 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger
In-Reply-To: <20170503230117.20070-1-sthemmin@microsoft.com>

Put the receive completion ring on the NUMA node of the CPU
assigned to the channel.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
---
 drivers/net/hyperv/hyperv_net.h   |  3 +++
 drivers/net/hyperv/netvsc.c       | 30 +++++++++++++++++++++++-------
 drivers/net/hyperv/rndis_filter.c |  8 ++++----
 3 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index a4417100a040..779c77a1638b 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -668,6 +668,9 @@ static inline bool recv_complete_ring_empty(const struct multi_recv_comp *mrc)
 	return mrc->read == mrc->write;
 }
 
+int netvsc_recv_comp_alloc(const struct vmbus_channel *chan,
+			   struct multi_recv_comp *mrc, u32 recv_slots);
+
 struct netvsc_stats {
 	u64 packets;
 	u64 bytes;
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 2938f1a2b765..ee42aa56460c 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -61,19 +61,33 @@ void netvsc_switch_datapath(struct net_device *ndev, bool vf)
 			       VM_PKT_DATA_INBAND, 0);
 }
 
-static struct netvsc_device *alloc_net_device(u32 recvslot_max)
+int netvsc_recv_comp_alloc(const struct vmbus_channel *channel,
+			   struct multi_recv_comp *mrc, u32 recv_slots)
+{
+	int node = cpu_to_node(channel->target_cpu);
+	size_t size = recv_slots * sizeof(struct recv_comp_data);
+
+	mrc->ring = vzalloc_node(size, node);
+	if (!mrc->ring)
+		mrc->ring = vzalloc(size);
+	if (!mrc->ring)
+		return -ENOMEM;
+
+	mrc->size = recv_slots;
+	return 0;
+}
+
+static struct netvsc_device *alloc_net_device(const struct vmbus_channel *chan,
+					      u32 recvslot_max)
 {
 	struct netvsc_device *net_device;
-	struct multi_recv_comp *mrc;
 
 	net_device = kzalloc(sizeof(struct netvsc_device), GFP_KERNEL);
 	if (!net_device)
 		return NULL;
 
-	mrc = &net_device->chan_table[0].mrc;
-	mrc->size = recvslot_max;
-	mrc->ring = vzalloc(recvslot_max * sizeof(struct recv_comp_data));
-	if (!mrc->ring) {
+	if (netvsc_recv_comp_alloc(chan, &net_device->chan_table[0].mrc,
+				   recvslot_max) != 0) {
 		kfree(net_device);
 		return NULL;
 	}
@@ -1246,7 +1260,9 @@ int netvsc_device_add(struct hv_device *device,
 	struct net_device *ndev = hv_get_drvdata(device);
 	struct net_device_context *net_device_ctx = netdev_priv(ndev);
 
-	net_device = alloc_net_device(device_info->recv_buf_size / ETH_DATA_LEN + 1);
+	net_device = alloc_net_device(device->channel,
+				      device_info->recv_buf_size
+				      / ETH_DATA_LEN + 1);
 	if (!net_device)
 		return -ENOMEM;
 
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 1b8ce9bc0ce7..da4992b26397 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -996,10 +996,10 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc)
 		return;
 
 	nvchan = nvscdev->chan_table + chn_index;
-	nvchan->mrc.size = nvscdev->recv_buf_size / ETH_DATA_LEN + 1;
-	nvchan->mrc.ring = vzalloc(nvchan->mrc.size
-				   * sizeof(struct recv_comp_data));
-	if (!nvchan->mrc.ring)
+	ret = netvsc_recv_comp_alloc(new_sc,
+				     &nvchan->mrc,
+				     nvscdev->recv_buf_size / ETH_DATA_LEN + 1);
+	if (ret)
 		return;
 
 	/* Because the device uses NAPI, all the interrupt batching and
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 14/15] netvsc: optimize receive completions
From: Stephen Hemminger @ 2017-05-03 23:01 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger
In-Reply-To: <20170503230117.20070-1-sthemmin@microsoft.com>

Handle receive completions better:
 * format message directly in ring rather than in different bookkeeping structure
 * eliminate atomic operation
 * get rid of modulus (divide) on ring wrap
 * avoid potential stall if ring gets full
 * don't make ring element opaque

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
---
 drivers/net/hyperv/hyperv_net.h   |  16 +++-
 drivers/net/hyperv/netvsc.c       | 168 +++++++++++---------------------------
 drivers/net/hyperv/rndis_filter.c |  11 +--
 3 files changed, 64 insertions(+), 131 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 29555317ca05..a4417100a040 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -650,16 +650,24 @@ struct multi_send_data {
 
 struct recv_comp_data {
 	u64 tid; /* transaction id */
-	u32 status;
+	struct  {
+		struct nvsp_message_header hdr;
+		u32 status;
+	} msg __packed;
 };
 
 struct multi_recv_comp {
-	void *buf; /* queued receive completions */
-	u32 first; /* first data entry */
-	u32 next; /* next entry for writing */
+	struct recv_comp_data *ring;
+	u32 read;
+	u32 write;
 	u32 size; /* number of slots in ring */
 };
 
+static inline bool recv_complete_ring_empty(const struct multi_recv_comp *mrc)
+{
+	return mrc->read == mrc->write;
+}
+
 struct netvsc_stats {
 	u64 packets;
 	u64 bytes;
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index eb9f3e517fa5..2938f1a2b765 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -72,8 +72,8 @@ static struct netvsc_device *alloc_net_device(u32 recvslot_max)
 
 	mrc = &net_device->chan_table[0].mrc;
 	mrc->size = recvslot_max;
-	mrc->buf = vzalloc(recvslot_max * sizeof(struct recv_comp_data));
-	if (!mrc->buf) {
+	mrc->ring = vzalloc(recvslot_max * sizeof(struct recv_comp_data));
+	if (!mrc->ring) {
 		kfree(net_device);
 		return NULL;
 	}
@@ -96,7 +96,7 @@ static void free_netvsc_device(struct rcu_head *head)
 	int i;
 
 	for (i = 0; i < VRSS_CHANNEL_MAX; i++)
-		vfree(nvdev->chan_table[i].mrc.buf);
+		vfree(nvdev->chan_table[i].mrc.ring);
 
 	kfree(nvdev);
 }
@@ -974,120 +974,51 @@ int netvsc_send(struct hv_device *device,
 	return ret;
 }
 
-static int netvsc_send_recv_completion(struct vmbus_channel *channel,
-				       u64 transaction_id, u32 status)
-{
-	struct nvsp_message recvcompMessage;
-	int ret;
-
-	recvcompMessage.hdr.msg_type =
-				NVSP_MSG1_TYPE_SEND_RNDIS_PKT_COMPLETE;
-
-	recvcompMessage.msg.v1_msg.send_rndis_pkt_complete.status = status;
-
-	/* Send the completion */
-	ret = vmbus_sendpacket(channel, &recvcompMessage,
-			       sizeof(struct nvsp_message_header) + sizeof(u32),
-			       transaction_id, VM_PKT_COMP, 0);
-
-	return ret;
-}
-
-static inline void count_recv_comp_slot(struct netvsc_device *nvdev, u16 q_idx,
-					u32 *filled, u32 *avail)
-{
-	struct multi_recv_comp *mrc = &nvdev->chan_table[q_idx].mrc;
-	u32 first = mrc->first;
-	u32 next = mrc->next;
-
-	*filled = (first > next) ? mrc->size - first + next :
-		  next - first;
-
-	*avail = mrc->size - *filled - 1;
-}
 
-/* Read the first filled slot, no change to index */
-static inline struct recv_comp_data *read_recv_comp_slot(struct netvsc_device
-							 *nvdev, u16 q_idx)
+/* Check and send pending recv completions */
+static int send_receive_comp(struct netvsc_device *nvdev,
+			     struct vmbus_channel *channel, u16 q_idx)
 {
 	struct multi_recv_comp *mrc = &nvdev->chan_table[q_idx].mrc;
-	u32 filled, avail;
 
-	if (unlikely(!mrc->buf))
-		return NULL;
+	while (!recv_complete_ring_empty(mrc)) {
+		struct recv_comp_data *rcd = mrc->ring + mrc->read;
+		int ret;
 
-	count_recv_comp_slot(nvdev, q_idx, &filled, &avail);
-	if (!filled)
-		return NULL;
+		ret = vmbus_sendpacket(channel, &rcd->msg, sizeof(rcd->msg),
+				       rcd->tid, VM_PKT_COMP, 0);
 
-	return mrc->buf + mrc->first * sizeof(struct recv_comp_data);
-}
+		/* if ring to host gets full, retry later */
+		if (unlikely(ret != 0))
+			return ret;
 
-/* Put the first filled slot back to available pool */
-static inline void put_recv_comp_slot(struct netvsc_device *nvdev, u16 q_idx)
-{
-	struct multi_recv_comp *mrc = &nvdev->chan_table[q_idx].mrc;
-	int num_recv;
-
-	mrc->first = (mrc->first + 1) % mrc->size;
-
-	num_recv = atomic_dec_return(&nvdev->num_outstanding_recvs);
+		if (++mrc->read == mrc->size)
+			mrc->read = 0;
+	}
 
-	if (nvdev->destroy && num_recv == 0)
+	/* ring now empty */
+	if (unlikely(nvdev->destroy))
 		wake_up(&nvdev->wait_drain);
+	return 0;
 }
 
-/* Check and send pending recv completions */
-static void netvsc_chk_recv_comp(struct netvsc_device *nvdev,
-				 struct vmbus_channel *channel, u16 q_idx)
-{
-	struct recv_comp_data *rcd;
-	int ret;
-
-	while (true) {
-		rcd = read_recv_comp_slot(nvdev, q_idx);
-		if (!rcd)
-			break;
-
-		ret = netvsc_send_recv_completion(channel, rcd->tid,
-						  rcd->status);
-		if (ret)
-			break;
-
-		put_recv_comp_slot(nvdev, q_idx);
-	}
-}
-
-#define NETVSC_RCD_WATERMARK 80
-
 /* Get next available slot */
-static inline struct recv_comp_data *get_recv_comp_slot(
-	struct netvsc_device *nvdev, struct vmbus_channel *channel, u16 q_idx)
+static struct recv_comp_data *
+get_recv_comp_slot(struct netvsc_device *nvdev,
+		   struct vmbus_channel *channel, u16 q_idx)
 {
 	struct multi_recv_comp *mrc = &nvdev->chan_table[q_idx].mrc;
-	u32 filled, avail, next;
 	struct recv_comp_data *rcd;
+	u32 next = mrc->write;
 
-	if (unlikely(!nvdev->recv_section))
-		return NULL;
-
-	if (unlikely(!mrc->buf))
-		return NULL;
-
-	if (atomic_read(&nvdev->num_outstanding_recvs) >
-	    nvdev->recv_section->num_sub_allocs * NETVSC_RCD_WATERMARK / 100)
-		netvsc_chk_recv_comp(nvdev, channel, q_idx);
+	if (++next == mrc->size)
+		next = 0;
 
-	count_recv_comp_slot(nvdev, q_idx, &filled, &avail);
-	if (!avail)
+	if (unlikely(next == mrc->read))
 		return NULL;
 
-	next = mrc->next;
-	rcd = mrc->buf + next * sizeof(struct recv_comp_data);
-	mrc->next = (next + 1) % mrc->size;
-
-	atomic_inc(&nvdev->num_outstanding_recvs);
-
+	rcd = mrc->ring + mrc->write;
+	mrc->write = next;
 	return rcd;
 }
 
@@ -1104,9 +1035,8 @@ static int netvsc_receive(struct net_device *ndev,
 	u16 q_idx = channel->offermsg.offer.sub_channel_index;
 	char *recv_buf = net_device->recv_buf;
 	u32 status = NVSP_STAT_SUCCESS;
-	int i;
-	int count = 0;
-	int ret;
+	struct recv_comp_data *rcd;
+	int i, count = 0;
 
 	/* Make sure this is a valid nvsp packet */
 	if (unlikely(nvsp->hdr.msg_type != NVSP_MSG1_TYPE_SEND_RNDIS_PKT)) {
@@ -1137,25 +1067,16 @@ static int netvsc_receive(struct net_device *ndev,
 					      channel, data, buflen);
 	}
 
-	if (net_device->chan_table[q_idx].mrc.buf) {
-		struct recv_comp_data *rcd;
-
-		rcd = get_recv_comp_slot(net_device, channel, q_idx);
-		if (rcd) {
-			rcd->tid = vmxferpage_packet->d.trans_id;
-			rcd->status = status;
-		} else {
-			netdev_err(ndev, "Recv_comp full buf q:%hd, tid:%llx\n",
-				   q_idx, vmxferpage_packet->d.trans_id);
-		}
+	rcd = get_recv_comp_slot(net_device, channel, q_idx);
+	if (likely(rcd)) {
+		rcd->tid = vmxferpage_packet->d.trans_id;
+		rcd->msg.hdr.msg_type = NVSP_MSG1_TYPE_SEND_RNDIS_PKT_COMPLETE;
+		rcd->msg.status = status;
 	} else {
-		ret = netvsc_send_recv_completion(channel,
-						  vmxferpage_packet->d.trans_id,
-						  status);
-		if (ret)
-			netdev_err(ndev, "Recv_comp q:%hd, tid:%llx, err:%d\n",
-				   q_idx, vmxferpage_packet->d.trans_id, ret);
+		netdev_err(ndev, "Recv_comp full buf q:%hd, tid:%llx\n",
+			   q_idx, vmxferpage_packet->d.trans_id);
 	}
+
 	return count;
 }
 
@@ -1258,6 +1179,9 @@ int netvsc_poll(struct napi_struct *napi, int budget)
 	struct netvsc_device *net_device = net_device_to_netvsc_device(ndev);
 	int work_done = 0;
 
+	/* If ring has leftover completions flush them now */
+	send_receive_comp(net_device, channel, q_idx);
+
 	/* If starting a new interval */
 	if (!nvchan->desc)
 		nvchan->desc = hv_pkt_iter_first(channel);
@@ -1270,14 +1194,14 @@ int netvsc_poll(struct napi_struct *napi, int budget)
 
 	hv_pkt_iter_close(channel);
 
-	netvsc_chk_recv_comp(net_device, channel, q_idx);
-
-	/* If receive ring was exhausted
+	/* If all receive completions sent to host
+	 * and budget was not used up
 	 * and not doing busy poll
 	 * then re-enable host interrupts
 	 *  and reschedule if ring is not empty.
 	 */
-	if (work_done < budget &&
+	if (send_receive_comp(net_device, channel, q_idx) == 0 &&
+	    work_done < budget &&
 	    napi_complete_done(napi, work_done) &&
 	    hv_end_read(&channel->inbound) != 0) {
 		/* special case if new messages are available */
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 2a89bbd6e42b..1b8ce9bc0ce7 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -901,12 +901,12 @@ static bool netvsc_device_idle(const struct netvsc_device *nvdev)
 {
 	int i;
 
-	if (atomic_read(&nvdev->num_outstanding_recvs) > 0)
-		return false;
-
 	for (i = 0; i < nvdev->num_chn; i++) {
 		const struct netvsc_channel *nvchan = &nvdev->chan_table[i];
 
+		if (!recv_complete_ring_empty(&nvchan->mrc))
+			return false;
+
 		if (atomic_read(&nvchan->queue_sends) > 0)
 			return false;
 	}
@@ -997,8 +997,9 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc)
 
 	nvchan = nvscdev->chan_table + chn_index;
 	nvchan->mrc.size = nvscdev->recv_buf_size / ETH_DATA_LEN + 1;
-	nvchan->mrc.buf = vzalloc(nvchan->mrc.size * sizeof(struct recv_comp_data));
-	if (!nvchan->mrc.buf)
+	nvchan->mrc.ring = vzalloc(nvchan->mrc.size
+				   * sizeof(struct recv_comp_data));
+	if (!nvchan->mrc.ring)
 		return;
 
 	/* Because the device uses NAPI, all the interrupt batching and
-- 
2.11.0

^ permalink raw reply related

* Re: [PATCH net-next 00/15] netvsc: misc patches
From: David Miller @ 2017-05-03 23:35 UTC (permalink / raw)
  To: stephen; +Cc: netdev, sthemmin
In-Reply-To: <20170503230117.20070-1-sthemmin@microsoft.com>

From: Stephen Hemminger <stephen@networkplumber.org>
Date: Wed,  3 May 2017 16:01:02 -0700

> Mostly these are performance related. There is also one bug fix for
> incorrect handling of NAPI on device removal

Stephen, the net-next tree is closed as we are in the merge window.

Please resubmit this when the net-next tree opens back up.

Thank you.

^ permalink raw reply

* Re: net/ipv6: GPF in rt6_device_match
From: Cong Wang @ 2017-05-03 23:35 UTC (permalink / raw)
  To: David Ahern
  Cc: Andrey Konovalov, David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev, LKML, Dmitry Vyukov,
	Kostya Serebryany, Eric Dumazet, syzkaller
In-Reply-To: <dece4ab2-86b0-1649-5e34-1a9fa1240792@gmail.com>

On Wed, May 3, 2017 at 3:09 PM, David Ahern <dsahern@gmail.com> wrote:
> On 5/3/17 4:02 PM, Cong Wang wrote:
>> On Wed, May 3, 2017 at 11:22 AM, David Ahern <dsahern@gmail.com> wrote:
>>> On 5/3/17 11:02 AM, Cong Wang wrote:
>>>> A quick glance shows we need to simply check local->rt6i_idev
>>>> since we do the same check for sprt right above.
>>>
>>> As I recall, rt6i_idev is set for all routes except null_entry and it is
>>> not set on null_entry only because of initialization order.
>>
>> Are you sure?
>>
>>         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
>>                 net->ipv6.ip6_null_entry->dst.dev = dev;
>>                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
>> #ifdef CONFIG_IPV6_MULTIPLE_TABLES
>>                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
>>                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
>>                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
>>                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
>> #endif
>>         }
>>
>> Loopback is the first one to register, so null entry is set to
>> loopback dev initially. Similar for init_net.
>>
>
> Why don't you add a printk and see ;-)

Ah, we need:

@@ -4024,7 +4027,7 @@ static struct pernet_operations ip6_route_net_late_ops = {

 static struct notifier_block ip6_route_dev_notifier = {
        .notifier_call = ip6_route_dev_notify,
-       .priority = 0,
+       .priority = -10, /* Must be called after addrconf_notify!! */
 };

^ permalink raw reply

* [RFC] iproute: Add support for extended ack to rtnl_talk
From: Stephen Hemminger @ 2017-05-03 23:56 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger

Add support for extended ack error reporting via libmnl. This
is a better alternative to use existing library and not copy/paste
code from the kernel. Also make arguments const where possible.

Add a new function rtnl_talk_extack that takes a callback as an input
arg. If a netlink response contains extack attributes, the callback is
is invoked with the the err string, offset in the message and a pointer
to the message returned by the kernel.

Adding a new function allows commands to be moved over to the
extended error reporting over time.

For feedback, compile tested only.

---
 include/libnetlink.h |   6 +++
 lib/Makefile         |   7 ++++
 lib/libnetlink.c     | 110 +++++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 116 insertions(+), 7 deletions(-)

diff --git a/include/libnetlink.h b/include/libnetlink.h
index c43ab0a2d9d9..32d5177cd1ab 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -66,6 +66,9 @@ typedef int (*rtnl_listen_filter_t)(const struct sockaddr_nl *,
 				    struct rtnl_ctrl_data *,
 				    struct nlmsghdr *n, void *);
 
+typedef int (*nl_ext_ack_fn_t)(const char *errmsg, uint32_t off,
+			       const struct nlmsghdr *inner_nlh);
+
 struct rtnl_dump_filter_arg {
 	rtnl_filter_t filter;
 	void *arg1;
@@ -82,6 +85,9 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
 int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 	      struct nlmsghdr *answer, size_t len)
 	__attribute__((warn_unused_result));
+int rtnl_talk_extack(struct rtnl_handle *rtnl, struct nlmsghdr *n,
+	      struct nlmsghdr *answer, size_t len, nl_ext_ack_fn_t errfn)
+	__attribute__((warn_unused_result));
 int rtnl_talk_suppress_rtnl_errmsg(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 				   struct nlmsghdr *answer, size_t len)
 	__attribute__((warn_unused_result));
diff --git a/lib/Makefile b/lib/Makefile
index 1d24ca24b9a3..f81888cca974 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -4,6 +4,13 @@ ifeq ($(IP_CONFIG_SETNS),y)
 	CFLAGS += -DHAVE_SETNS
 endif
 
+ifeq ($(HAVE_MNL),y)
+	CFLAGS += $(shell $(PKG_CONFIG) libmnl --cflags)
+	LDLIBS += $(shell $(PKG_CONFIG) libmnl --libs)
+else
+@warn "libmnl required for error support"
+endif
+
 CFLAGS += -fPIC
 
 UTILOBJ = utils.o rt_names.o ll_types.o ll_proto.o ll_addr.o \
diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index 5b75b2db4e0b..20ec7fe20d7c 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -36,6 +36,79 @@
 
 int rcvbuf = 1024 * 1024;
 
+#ifdef HAVE_LIBMNL
+#include <libmnl/libmnl.h>
+
+static const enum mnl_attr_data_type extack_policy[NLMSGERR_ATTR_MAX + 1] = {
+	[NLMSGERR_ATTR_MSG]	= MNL_TYPE_NUL_STRING,
+	[NLMSGERR_ATTR_OFFS]	= MNL_TYPE_U32,
+};
+
+static int err_attr_cb(const struct nlattr *attr, void *data)
+{
+	const struct nlattr **tb = data;
+	uint16_t type;
+
+	if (mnl_attr_type_valid(attr, NLMSGERR_ATTR_MAX) < 0)
+		return MNL_CB_ERROR;
+
+	type = mnl_attr_get_type(attr);
+	if (mnl_attr_validate(attr, extack_policy[type]) < 0)
+		return MNL_CB_ERROR;
+
+
+	tb[type] = attr;
+	return MNL_CB_OK;
+}
+
+
+/* dump netlink extended ack error message */
+static int nl_dump_ext_err(const struct nlmsghdr *nlh, nl_ext_ack_fn_t errfn)
+{
+	struct nlattr *tb[NLMSGERR_ATTR_MAX + 1];
+	const struct nlmsgerr *err = mnl_nlmsg_get_payload(nlh);
+	const struct nlmsghdr *err_nlh = NULL;
+	unsigned int hlen = sizeof(*err);
+	const char *errmsg = NULL;
+	uint32_t off = 0;
+
+	if (!errfn)
+		return 0;
+
+	/* no TLVs, nothing to do here */
+	if (!(nlh->nlmsg_flags & NLM_F_ACK_TLVS))
+		return 0;
+
+	/* if NLM_F_CAPPED is set then the inner err msg was capped */
+	if (!(nlh->nlmsg_flags & NLM_F_CAPPED))
+		hlen += mnl_nlmsg_get_payload_len(&err->msg);
+
+	mnl_attr_parse(nlh, hlen, err_attr_cb, tb);
+
+	if (tb[NLMSGERR_ATTR_MSG])
+		errmsg = mnl_attr_get_str(tb[NLMSGERR_ATTR_MSG]);
+
+	if (tb[NLMSGERR_ATTR_OFFS]) {
+		off = mnl_attr_get_u32(tb[NLMSGERR_ATTR_OFFS]);
+
+		if (off > nlh->nlmsg_len) {
+			fprintf(stderr,
+				"Invalid offset for NLMSGERR_ATTR_OFFS\n");
+			off = 0;
+		} else if (!(nlh->nlmsg_flags & NLM_F_CAPPED))
+			err_nlh = &err->msg;
+	}
+
+	return errfn(errmsg, off, err_nlh);
+}
+#else
+/* No extended error ack without libmnl */
+static int nl_dump_ext_err(const struct nlmsghdr *nlh, nl_ext_ack_fn_t errfn)
+{
+	return 0;
+}
+#endif
+
 void rtnl_close(struct rtnl_handle *rth)
 {
 	if (rth->fd >= 0) {
@@ -49,6 +122,7 @@ int rtnl_open_byproto(struct rtnl_handle *rth, unsigned int subscriptions,
 {
 	socklen_t addr_len;
 	int sndbuf = 32768;
+	int one = 1;
 
 	memset(rth, 0, sizeof(*rth));
 
@@ -71,6 +145,11 @@ int rtnl_open_byproto(struct rtnl_handle *rth, unsigned int subscriptions,
 		return -1;
 	}
 
+	if (setsockopt(rth->fd, SOL_NETLINK, NETLINK_EXT_ACK,
+		       &one, sizeof(one)) < 0) {
+		/* debug/verbose message that it is not supported */
+	}
+
 	memset(&rth->local, 0, sizeof(rth->local));
 	rth->local.nl_family = AF_NETLINK;
 	rth->local.nl_groups = subscriptions;
@@ -417,9 +496,19 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
 	return rtnl_dump_filter_l(rth, a);
 }
 
+static void rtnl_talk_error(struct nlmsghdr *h, struct nlmsgerr *err,
+			    nl_ext_ack_fn_t errfn)
+{
+	if (nl_dump_ext_err(h, errfn))
+		return;
+
+	fprintf(stderr, "RTNETLINK answers: %s\n",
+		strerror(-err->error));
+}
+
 static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 		       struct nlmsghdr *answer, size_t maxlen,
-		       bool show_rtnl_err)
+		       bool show_rtnl_err, nl_ext_ack_fn_t errfn)
 {
 	int status;
 	unsigned int seq;
@@ -506,10 +595,10 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 					return 0;
 				}
 
-				if (rtnl->proto != NETLINK_SOCK_DIAG && show_rtnl_err)
-					fprintf(stderr,
-						"RTNETLINK answers: %s\n",
-						strerror(-err->error));
+				if (rtnl->proto != NETLINK_SOCK_DIAG &&
+				    show_rtnl_err)
+					rtnl_talk_error(h, err, errfn);
+
 				errno = -err->error;
 				return -1;
 			}
@@ -541,13 +630,20 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 	      struct nlmsghdr *answer, size_t maxlen)
 {
-	return __rtnl_talk(rtnl, n, answer, maxlen, true);
+	return __rtnl_talk(rtnl, n, answer, maxlen, true, NULL);
+}
+
+int rtnl_talk_extack(struct rtnl_handle *rtnl, struct nlmsghdr *n,
+		     struct nlmsghdr *answer, size_t maxlen,
+		     nl_ext_ack_fn_t errfn)
+{
+	return __rtnl_talk(rtnl, n, answer, maxlen, true, errfn);
 }
 
 int rtnl_talk_suppress_rtnl_errmsg(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 				   struct nlmsghdr *answer, size_t maxlen)
 {
-	return __rtnl_talk(rtnl, n, answer, maxlen, false);
+	return __rtnl_talk(rtnl, n, answer, maxlen, false, NULL);
 }
 
 int rtnl_listen_all_nsid(struct rtnl_handle *rth)
-- 
2.11.0

^ permalink raw reply related

* [PATCH net] netvsc: make sure napi enabled before vmbus_open
From: Stephen Hemminger @ 2017-05-03 23:59 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger

This fixes a race where vmbus callback for new packet arriving
could occur before NAPI is initialized.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
---
 drivers/net/hyperv/netvsc.c       | 8 +++++---
 drivers/net/hyperv/rndis_filter.c | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 15749d359e60..652453d9fb08 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -1322,6 +1322,10 @@ int netvsc_device_add(struct hv_device *device,
 		nvchan->channel = device->channel;
 	}
 
+	/* Enable NAPI handler before init callbacks */
+	netif_napi_add(ndev, &net_device->chan_table[0].napi,
+		       netvsc_poll, NAPI_POLL_WEIGHT);
+
 	/* Open the channel */
 	ret = vmbus_open(device->channel, ring_size * PAGE_SIZE,
 			 ring_size * PAGE_SIZE, NULL, 0,
@@ -1329,6 +1333,7 @@ int netvsc_device_add(struct hv_device *device,
 			 net_device->chan_table);
 
 	if (ret != 0) {
+		netif_napi_del(&net_device->chan_table[0].napi);
 		netdev_err(ndev, "unable to open channel: %d\n", ret);
 		goto cleanup;
 	}
@@ -1336,9 +1341,6 @@ int netvsc_device_add(struct hv_device *device,
 	/* Channel is opened */
 	netdev_dbg(ndev, "hv_netvsc channel opened successfully\n");
 
-	/* Enable NAPI handler for init callbacks */
-	netif_napi_add(ndev, &net_device->chan_table[0].napi,
-		       netvsc_poll, NAPI_POLL_WEIGHT);
 	napi_enable(&net_device->chan_table[0].napi);
 
 	/* Writing nvdev pointer unlocks netvsc_send(), make sure chn_table is
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index ab92c3c95951..f9d5b0b8209a 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -1018,7 +1018,7 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc)
 	if (ret == 0)
 		napi_enable(&nvchan->napi);
 	else
-		netdev_err(ndev, "sub channel open failed (%d)\n", ret);
+		netif_napi_del(&nvchan->napi);
 
 	if (refcount_dec_and_test(&nvscdev->sc_offered))
 		complete(&nvscdev->channel_init_wait);
-- 
2.11.0

^ permalink raw reply related

* Re: [PATCH v4 net-next 06/10] net/ncsi: Ethtool operation to get NCSI hw statistics
From: Gavin Shan @ 2017-05-04  0:05 UTC (permalink / raw)
  To: David Miller; +Cc: andrew, gwshan, netdev, joe, kubakici, f.fainelli
In-Reply-To: <20170503.091823.747316360571887178.davem@davemloft.net>

On Wed, May 03, 2017 at 09:18:23AM -0400, David Miller wrote:
>From: Andrew Lunn <andrew@lunn.ch>
>Date: Wed, 3 May 2017 14:47:22 +0200
>
>> On Wed, May 03, 2017 at 02:44:37PM +1000, Gavin Shan wrote:
>>> This adds ethtool command (ETHTOOL_GNCSISTATS) to retrieve the
>>> NCSI hardware statistics.
>> 
>> Hi Gavin
>> 
>> I've not been following this patchset, so maybe i'm about to ask a
>> question which has already been asked and answered.
>> 
>> Why cannot use just use ethtool -S for this?
>
>Indeed, when I said to use ethtool for these NCSI hw stats I meant
>that the "ethtool -S" be used, not some new ethtool command.
>

Thanks for the comments. Feel free to ask any questions which would
make the code clear and better. There are couple of factors I thought
separate command is better than ETHTOOL_GSTATS: The statistic items from
ETHTOOL_GSTATS are variable, meaning the kernel needs provide the layout
of the statistic items via ETHTOOL_GSSET_INFO and ETHTOOL_GSTRINGS.
NCSI HW statistics aren't following this and their layout is fixed.
Besides, statistics for ETHTOOL_GSTATS are maintained in local MAC,
but NCSI HW statistics are collected from NIC on far-end. So they're
different from this point. Lastly, the NCSI software statistics needs
separate command. It'd better to have separate command for HW statistics
as well, to make things consistent.

Cheers,
Gavin

^ permalink raw reply

* Re: [PATCH v4 net-next 00/10] net/ncsi: Add debugging functionality
From: Gavin Shan @ 2017-05-04  0:06 UTC (permalink / raw)
  To: David Miller; +Cc: gwshan, netdev, joe, kubakici, f.fainelli
In-Reply-To: <20170503.012518.74146567285105473.davem@davemloft.net>

On Wed, May 03, 2017 at 01:25:18AM -0400, David Miller wrote:
>
>Sorry, the net-next tree is closed right now as we are in the merge
>window.
>
>Please resubmit this when the net-next tree opens back up.
>
>Thank you.
>

Sorry that I didn't catch the merge window. Thanks for the tip. I'll
resubmit after net-next is open again.

Cheers,
Gavin

^ permalink raw reply

* Re: [PATCH v4 net-next 00/10] net/ncsi: Add debugging functionality
From: Gavin Shan @ 2017-05-04  0:09 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: Gavin Shan, netdev, joe, kubakici, f.fainelli, davem
In-Reply-To: <20170503125802.GG8029@lunn.ch>

On Wed, May 03, 2017 at 02:58:02PM +0200, Andrew Lunn wrote:
>On Wed, May 03, 2017 at 02:44:31PM +1000, Gavin Shan wrote:
>> This series supports NCSI debugging infrastructure by adding several
>> ethtool commands and one debugfs file. It was inspired by the reported
>> issues: No available package and channel are probed successfully.
>> Obviously, we don't have a debugging infrastructure for NCSI stack yet.
>> 
>> The first 3 patches, fixing some issues, aren't relevant to the
>> subject. 
>
>...
>
>> PATCH[9,10] fix two issues found by the debugging functionality.
>
>Hi Gavin
>
>If they are real fixes, you should submit them separately. Please
>include Fixes: tags, and indicate if they need to be included in
>-stable.
>
>David will probably accept fixes while the merge window is closed.
>
>	Andrew
>

Andrew, thanks a lot for the good tip. Yeah, I agree and will follow
in next respin. I think the patches fixing issues should be rebased
to "net" and reposted?

git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git

Cheers,
Gavin

^ permalink raw reply

* Re: [PATCH v4 net-next 06/10] net/ncsi: Ethtool operation to get NCSI hw statistics
From: David Miller @ 2017-05-04  0:16 UTC (permalink / raw)
  To: gwshan; +Cc: andrew, netdev, joe, kubakici, f.fainelli
In-Reply-To: <20170504000534.GA11287@gwshan>

From: Gavin Shan <gwshan@linux.vnet.ibm.com>
Date: Thu, 4 May 2017 10:05:34 +1000

> On Wed, May 03, 2017 at 09:18:23AM -0400, David Miller wrote:
>>From: Andrew Lunn <andrew@lunn.ch>
>>Date: Wed, 3 May 2017 14:47:22 +0200
>>
>>> On Wed, May 03, 2017 at 02:44:37PM +1000, Gavin Shan wrote:
>>>> This adds ethtool command (ETHTOOL_GNCSISTATS) to retrieve the
>>>> NCSI hardware statistics.
>>> 
>>> Hi Gavin
>>> 
>>> I've not been following this patchset, so maybe i'm about to ask a
>>> question which has already been asked and answered.
>>> 
>>> Why cannot use just use ethtool -S for this?
>>
>>Indeed, when I said to use ethtool for these NCSI hw stats I meant
>>that the "ethtool -S" be used, not some new ethtool command.
>>
> 
> Thanks for the comments. Feel free to ask any questions which would
> make the code clear and better. There are couple of factors I thought
> separate command is better than ETHTOOL_GSTATS: The statistic items from
> ETHTOOL_GSTATS are variable, meaning the kernel needs provide the layout
> of the statistic items via ETHTOOL_GSSET_INFO and ETHTOOL_GSTRINGS.
> NCSI HW statistics aren't following this and their layout is fixed.
> Besides, statistics for ETHTOOL_GSTATS are maintained in local MAC,
> but NCSI HW statistics are collected from NIC on far-end. So they're
> different from this point. Lastly, the NCSI software statistics needs
> separate command. It'd better to have separate command for HW statistics
> as well, to make things consistent.

ETHTOOL_GSTATS are for device specific statistics.  Whether they are
fixed in your implementation or not.

^ permalink raw reply

* Re: [PATCH v4 net-next 06/10] net/ncsi: Ethtool operation to get NCSI hw statistics
From: Andrew Lunn @ 2017-05-04  0:34 UTC (permalink / raw)
  To: Gavin Shan; +Cc: David Miller, netdev, joe, kubakici, f.fainelli
In-Reply-To: <20170504000534.GA11287@gwshan>

On Thu, May 04, 2017 at 10:05:34AM +1000, Gavin Shan wrote:
> On Wed, May 03, 2017 at 09:18:23AM -0400, David Miller wrote:
> >From: Andrew Lunn <andrew@lunn.ch>
> >Date: Wed, 3 May 2017 14:47:22 +0200
> >
> >> On Wed, May 03, 2017 at 02:44:37PM +1000, Gavin Shan wrote:
> >>> This adds ethtool command (ETHTOOL_GNCSISTATS) to retrieve the
> >>> NCSI hardware statistics.
> >> 
> >> Hi Gavin
> >> 
> >> I've not been following this patchset, so maybe i'm about to ask a
> >> question which has already been asked and answered.
> >> 
> >> Why cannot use just use ethtool -S for this?
> >
> >Indeed, when I said to use ethtool for these NCSI hw stats I meant
> >that the "ethtool -S" be used, not some new ethtool command.
> >
> 
> Thanks for the comments. Feel free to ask any questions which would
> make the code clear and better. There are couple of factors I thought
> separate command is better than ETHTOOL_GSTATS: The statistic items from
> ETHTOOL_GSTATS are variable, meaning the kernel needs provide the layout
> of the statistic items via ETHTOOL_GSSET_INFO and ETHTOOL_GSTRINGS.
> NCSI HW statistics aren't following this and their layout is fixed.

That does not stop you from using it for a fixed layout. And what
happens when a new version of the standard is released with more
statistics?

> Besides, statistics for ETHTOOL_GSTATS are maintained in local MAC,
> but NCSI HW statistics are collected from NIC on far-end. So they're
> different from this point. 

You might want to take a look at what is happening with Ethernet
switches. Again, we have two sets of statistics for each port on the
switch. We have the statistics from the hardware, and statistics from
the software. Frames that come in one interface and the hardware
forwards out another interface are not seen by the software. But
frames which originate/terminate in the host, or the switch does not
know what to do with and so are passed to the host, are seen by the
software. Mellanox can tell you more. Your local and remote are not
that different.

> Lastly, the NCSI software statistics needs separate command. It'd
> better to have separate command for HW statistics as well, to make
> things consistent.

Are software statistics defined in DSP0222? Since they are software, i
kind of expect you want the flexibility to add more later. The
ETHTOOL_GSSET_INFO and ETHTOOL_GSTRINGS gives you that flexibility,
without having to change ethtool.

	Andrew

^ permalink raw reply

* Re: [PATCH v4 net-next 06/10] net/ncsi: Ethtool operation to get NCSI hw statistics
From: Gavin Shan @ 2017-05-04  0:38 UTC (permalink / raw)
  To: David Miller; +Cc: gwshan, andrew, netdev, joe, kubakici, f.fainelli
In-Reply-To: <20170503.201643.358209122385716637.davem@davemloft.net>

On Wed, May 03, 2017 at 08:16:43PM -0400, David Miller wrote:
>From: Gavin Shan <gwshan@linux.vnet.ibm.com>
>Date: Thu, 4 May 2017 10:05:34 +1000
>> On Wed, May 03, 2017 at 09:18:23AM -0400, David Miller wrote:
>>>From: Andrew Lunn <andrew@lunn.ch>
>>>Date: Wed, 3 May 2017 14:47:22 +0200
>>>
>>>> On Wed, May 03, 2017 at 02:44:37PM +1000, Gavin Shan wrote:
>>>>> This adds ethtool command (ETHTOOL_GNCSISTATS) to retrieve the
>>>>> NCSI hardware statistics.
>>>> 
>>>> Hi Gavin
>>>> 
>>>> I've not been following this patchset, so maybe i'm about to ask a
>>>> question which has already been asked and answered.
>>>> 
>>>> Why cannot use just use ethtool -S for this?
>>>
>>>Indeed, when I said to use ethtool for these NCSI hw stats I meant
>>>that the "ethtool -S" be used, not some new ethtool command.
>>>
>> 
>> Thanks for the comments. Feel free to ask any questions which would
>> make the code clear and better. There are couple of factors I thought
>> separate command is better than ETHTOOL_GSTATS: The statistic items from
>> ETHTOOL_GSTATS are variable, meaning the kernel needs provide the layout
>> of the statistic items via ETHTOOL_GSSET_INFO and ETHTOOL_GSTRINGS.
>> NCSI HW statistics aren't following this and their layout is fixed.
>> Besides, statistics for ETHTOOL_GSTATS are maintained in local MAC,
>> but NCSI HW statistics are collected from NIC on far-end. So they're
>> different from this point. Lastly, the NCSI software statistics needs
>> separate command. It'd better to have separate command for HW statistics
>> as well, to make things consistent.
>
>ETHTOOL_GSTATS are for device specific statistics.  Whether they are
>fixed in your implementation or not.
>

For NCSI HW statistics, they're always fixed. It's assured by NCSI
specification. It seems my explanation isn't convincing enough. I'm
fine to convey NCSI HW statistics through ETHTOOL_GSTATS (ETHTOOL_GSSET_INFO
and ETHTOOL_GSTRINGS). Dave, please confirm it's what you're happy to
see?

Cheers,
Gavin

^ permalink raw reply

* Re: [PATCH v4 net-next 04/10] net/ncsi: Ethtool operation to get NCSI topology
From: Andrew Lunn @ 2017-05-04  0:49 UTC (permalink / raw)
  To: Gavin Shan; +Cc: netdev, joe, kubakici, f.fainelli, davem
In-Reply-To: <1493786681-27468-5-git-send-email-gwshan@linux.vnet.ibm.com>

> +void ncsi_ethtool_register_dev(struct net_device *dev)
> +{
> +	struct ethtool_ops *ops;
> +
> +	ops = (struct ethtool_ops *)(dev->ethtool_ops);

Why do you need the cast here?

Ah, is it because net_device has:

          const struct ethtool_ops *ethtool_ops;

i.e. you are casting off the const.

> +	if (!ops)
> +		return;
> +
> +	ops->get_ncsi_channels = ncsi_get_channels;

and here you modify it. Which is going to blow up, because it will be
in a read only segment and should throw an opps when you write to it?

You need to export ncsi_get_channels, and let the underlying driver
add it to its own ethtool_ops.

   Andrew

^ permalink raw reply

* Re: [PATCH v4 net-next 06/10] net/ncsi: Ethtool operation to get NCSI hw statistics
From: Gavin Shan @ 2017-05-04  0:55 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: Gavin Shan, David Miller, netdev, joe, kubakici, f.fainelli
In-Reply-To: <20170504003452.GO8029@lunn.ch>

On Thu, May 04, 2017 at 02:34:52AM +0200, Andrew Lunn wrote:
>On Thu, May 04, 2017 at 10:05:34AM +1000, Gavin Shan wrote:
>> On Wed, May 03, 2017 at 09:18:23AM -0400, David Miller wrote:
>> >From: Andrew Lunn <andrew@lunn.ch>
>> >Date: Wed, 3 May 2017 14:47:22 +0200
>> >
>> >> On Wed, May 03, 2017 at 02:44:37PM +1000, Gavin Shan wrote:
>> >>> This adds ethtool command (ETHTOOL_GNCSISTATS) to retrieve the
>> >>> NCSI hardware statistics.
>> >> 
>> >> Hi Gavin
>> >> 
>> >> I've not been following this patchset, so maybe i'm about to ask a
>> >> question which has already been asked and answered.
>> >> 
>> >> Why cannot use just use ethtool -S for this?
>> >
>> >Indeed, when I said to use ethtool for these NCSI hw stats I meant
>> >that the "ethtool -S" be used, not some new ethtool command.
>> >
>> 
>> Thanks for the comments. Feel free to ask any questions which would
>> make the code clear and better. There are couple of factors I thought
>> separate command is better than ETHTOOL_GSTATS: The statistic items from
>> ETHTOOL_GSTATS are variable, meaning the kernel needs provide the layout
>> of the statistic items via ETHTOOL_GSSET_INFO and ETHTOOL_GSTRINGS.
>> NCSI HW statistics aren't following this and their layout is fixed.
>
>That does not stop you from using it for a fixed layout. And what
>happens when a new version of the standard is released with more
>statistics?
>

Correct, I don't see the difference between NCSI spec 1.01 and 1.1.0.
So I assume it's pretty stable, but still have potential to change.

I agree to collect NCSI HW statistic with ETHTOOL_GSTATS as I said
in another thread.

>> Besides, statistics for ETHTOOL_GSTATS are maintained in local MAC,
>> but NCSI HW statistics are collected from NIC on far-end. So they're
>> different from this point. 
>
>You might want to take a look at what is happening with Ethernet
>switches. Again, we have two sets of statistics for each port on the
>switch. We have the statistics from the hardware, and statistics from
>the software. Frames that come in one interface and the hardware
>forwards out another interface are not seen by the software. But
>frames which originate/terminate in the host, or the switch does not
>know what to do with and so are passed to the host, are seen by the
>software. Mellanox can tell you more. Your local and remote are not
>that different.
>

Yeah, it makes sense. Thanks for making an example here.

>> Lastly, the NCSI software statistics needs separate command. It'd
>> better to have separate command for HW statistics as well, to make
>> things consistent.
>
>Are software statistics defined in DSP0222? Since they are software, i
>kind of expect you want the flexibility to add more later. The
>ETHTOOL_GSSET_INFO and ETHTOOL_GSTRINGS gives you that flexibility,
>without having to change ethtool.
>

No, they're purely software implementation, not defined in DSP0222.
These statistics count the NCSI packets seen by software, not hardware.
It's used for diagnosis.

It's a nice suggestion to convey SW statistic with ETHTOOL_GSTATS,
ETHTOOL_GSSET_INFO and ETHTOOL_GSTRINGS, for better flexibility. I'll
follow this in next respin.

Cheers,
Gavin

^ permalink raw reply

* Re: FEC on i.MX 7 transmit queue timeout
From: Stefan Agner @ 2017-05-04  1:21 UTC (permalink / raw)
  To: Andy Duan; +Cc: fugang.duan, festevam, netdev, netdev-owner
In-Reply-To: <ab1fc814-4a81-9eb0-b20b-2cd325158ba8@nxp.com>

Hi Andy,

On 2017-04-20 19:48, Andy Duan wrote:
> On 2017年04月20日 07:15, Stefan Agner wrote:
>> I tested again with imx6sx-fec compatible string. I could reproduce it
>> on a Colibri with i.MX 7Dual. But not always: It really depends whether
>> queue 2 is counting up or not. Just after boot, I check /proc/interrupts
>> twice, if queue 2 is counting it will happen!
>>
>> But if only queue 0 is mostly in use, then it seems to work just fine.
> If your case is only running best effort like tcp/udp, you can re-set 
> the "fsl,num-tx-queues" and "fsl,num-rx-queues" to 1 in board dts file.
> Other two queues are for AVB audio/video queues, they have high priority 
> than queue 0. If running iperf tcp test on the three queues, then
> the tcp segment may be out-of-order that cause net watchdog timeout.
>>
>> I also tried i.MX 7Dual SabreSD here, and the same thing. I had to
>> reboot 3 times, then queue 2 was counting:
>>   57:          8     GIC-0 150 Level     30be0000.ethernet
>>   58:      20137     GIC-0 151 Level     30be0000.ethernet
>>   59:       9269     GIC-0 152 Level     30be0000.ethernet
>>
>> It took me about 40 minutes on Sabre until it happened, and I had to
>> force it using iperf, but then I got the ring dumps:
> My board had ran more than 47 hours with nfs rootfs in 4.11.0-rc6, but 
> not running iperf.
> I am testing with iperf.

Any update on this issue?

When using iperf (server) on the board with Linux 4.11 the issue appears
within a few iperf iterations on a Sabre (TO 1.2, Board Rev C, if that
matters)...

root@colibri-imx7:~# iperf -s
------------------------------------------------------------
Server listening on TCP port 5001
TCP window size: 85.3 KByte (default)
------------------------------------------------------------
[  4] local 192.168.10.70 port 5001 connected with 192.168.10.1 port
60524
random: crng init done
[ ID] Interval       Transfer     Bandwidth
[  4]  0.0-10.0 sec  1.06 GBytes   909 Mbits/sec
[  5] local 192.168.10.70 port 5001 connected with 192.168.10.1 port
60528
[  5]  0.0-10.0 sec  1.07 GBytes   919 Mbits/sec
[  4] local 192.168.10.70 port 5001 connected with 192.168.10.1 port
60562
------------[ cut here ]------------
WARNING: CPU: 0 PID: 0 at net/sched/sch_generic.c:316
dev_watchdog+0x248/0x24c
NETDEV WATCHDOG: eth0 (fec): transmit queue 1 timed out
Modules linked in:
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.11.0 #360
Hardware name: Freescale i.MX7 Dual (Device Tree)
[<c0226930>] (unwind_backtrace) from [<c0222ffc>] (show_stack+0x10/0x14)
[<c0222ffc>] (show_stack) from [<c04d4f78>] (dump_stack+0x78/0x8c)
[<c04d4f78>] (dump_stack) from [<c0236b38>] (__warn+0xe8/0x100)
[<c0236b38>] (__warn) from [<c0236b88>] (warn_slowpath_fmt+0x38/0x48)
[<c0236b88>] (warn_slowpath_fmt) from [<c0806154>]
(dev_watchdog+0x248/0x24c)
[<c0806154>] (dev_watchdog) from [<c028a9a0>] (call_timer_fn+0x28/0x98)
[<c028a9a0>] (call_timer_fn) from [<c028aab0>] (expire_timers+0xa0/0xac)
[<c028aab0>] (expire_timers) from [<c028ab58>]
(run_timer_softirq+0x9c/0x194)
[<c028ab58>] (run_timer_softirq) from [<c023b110>]
(__do_softirq+0x114/0x234)
[<c023b110>] (__do_softirq) from [<c023b4fc>] (irq_exit+0xcc/0x108)
[<c023b4fc>] (irq_exit) from [<c027a1a0>]
(__handle_domain_irq+0x80/0xec)
[<c027a1a0>] (__handle_domain_irq) from [<c0201544>]
(gic_handle_irq+0x48/0x8c)
[<c0201544>] (gic_handle_irq) from [<c0223ab8>] (__irq_svc+0x58/0x8c)
Exception stack(0xc1001f28 to 0xc1001f70)
1f20:                   00000001 00000000 00000000 c0230060 c1000000
c1003d80
1f40: c1003d34 c0e72f50 c0bd9a04 c1001f80 00000000 00000000 0000320a
c1001f78
1f60: c022070c c0220710 600e0013 ffffffff
[<c0223ab8>] (__irq_svc) from [<c0220710>] (arch_cpu_idle+0x38/0x3c)
[<c0220710>] (arch_cpu_idle) from [<c026f4e0>] (do_idle+0x170/0x204)
[<c026f4e0>] (do_idle) from [<c026f82c>] (cpu_startup_entry+0x18/0x1c)
[<c026f82c>] (cpu_startup_entry) from [<c0e00c88>]
(start_kernel+0x394/0x3a0)
---[ end trace 86a38600d1b9e2a5 ]---
fec 30be0000.ethernet eth0: TX ring dump
Nr     SC     addr       len  SKB
  0    0x1c00 0x00000000   42   (null)
  1  H 0x1c00 0x00000000   86   (null)

^ permalink raw reply

* Re: [4.9.13] use after free in ipv4_mtu
From: Daniel J Blueman @ 2017-05-04  1:30 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Netdev, David Miller, Stephen Hemminger
In-Reply-To: <1488807904.9415.375.camel@edumazet-glaptop3.roam.corp.google.com>

On 6 March 2017 at 21:45, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Mon, 2017-03-06 at 14:33 +0800, Daniel J Blueman wrote:
>> On 2 March 2017 at 21:28, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> > On Thu, 2017-03-02 at 05:08 -0800, Eric Dumazet wrote:
>> >
>> >> Thanks for the report !
>> >>
>> >> This patch should solve this precise issue, but we need more work.
>> >>
>> >> We need to audit all __sk_dst_get() and make sure they are inside an
>> >> rcu_read_lock()/rcu_read_unlock() section.
>> >>
>> >> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
>> >> index 22548b5f05cbe5a655e0c53df2d31c5cc2e8a702..517963e1cb6eb9d70fcd71f44262813c3378759f 100644
>> >> --- a/net/ipv4/tcp_output.c
>> >> +++ b/net/ipv4/tcp_output.c
>> >> @@ -1459,7 +1459,7 @@ EXPORT_SYMBOL(tcp_sync_mss);
>> >>  unsigned int tcp_current_mss(struct sock *sk)
>> >>  {
>> >>       const struct tcp_sock *tp = tcp_sk(sk);
>> >> -     const struct dst_entry *dst = __sk_dst_get(sk);
>> >> +     const struct dst_entry *dst;
>> >>       u32 mss_now;
>> >>       unsigned int header_len;
>> >>       struct tcp_out_options opts;
>> >> @@ -1467,11 +1467,14 @@ unsigned int tcp_current_mss(struct sock *sk)
>> >>
>> >>       mss_now = tp->mss_cache;
>> >>
>> >> +     rcu_read_lock();
>> >> +     dst = __sk_dst_get(sk);
>> >>       if (dst) {
>> >>               u32 mtu = dst_mtu(dst);
>> >>               if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
>> >>                       mss_now = tcp_sync_mss(sk, mtu);
>> >>       }
>> >> +     rcu_read_unlock();
>> >>
>> >>       header_len = tcp_established_options(sk, NULL, &opts, &md5) +
>> >>                    sizeof(struct tcphdr);
>> >
>> > Normally TCP sockets sk_dst_cache can only be changed if the thread
>> > doing the change owns the socket.
>> >
>> > I have not yet understood which point was breaking the rule yet.
>>
>> Great work Eric! I have been unable to reproduce the KASAN warning
>> with this patch.
>>
>> Reported-by: Daniel J Blueman <daniel@quora.org>
>> Tested-by: Daniel J Blueman <daniel@quora.org>
>>
>> I do change the network queueing discipline and related at runtime [1]
>> which may be triggering this, though I did think I saw the KASAN
>> report only after resuming from suspend. rf(un)kill and other tweaking
>> may have been involved too.
>>
>> Thanks,
>>   Dan
>
> Thanks Daniel, but this bandaid patch should not be needed.
>
> Somehow another point in the stack is at fault and needs to be
> identified.
>
> Otherwise we'll keep adding works around.
>
> Since net-next is soon to be re-opened, I will submit patches adding
> more lockdep assisted checks.

I am still seeing the same KASAN issue on 4.9.25; let me know if any
debug or testing would help. Thanks Eric!

https://quora.org/linux/ipv4_mtu/vmlinux
https://quora.org/linux/ipv4_mtu/config

[19515.180104] BUG: KASAN: use-after-free in ipv4_mtu+0x25d/0x320 at
addr ffff88040d6fecc4
[19515.180108] Read of size 4 by task Socket Thread/3299
[19515.180113] CPU: 7 PID: 3299 Comm: Socket Thread Tainted: G    BU
       4.9.25-debug+ #5
[19515.180115] Hardware name: Dell Inc. XPS 15 9550/0N7TVV, BIOS
1.2.21 02/17/2017
[19515.180127]  ffff8803f55a77d0 ffffffff9ce404b1 ffff88042d803800
ffff88040d6fecc0
[19515.180133]  ffff8803f55a77f8 ffffffff9c7e2751 ffff8803f55a7890
ffff88040d6fecc0
[19515.180137]  ffff88042d803800 ffff8803f55a7880 ffffffff9c7e29ea
ffff8803f55a7828
[19515.180140] Call Trace:
[19515.180148]  [<ffffffff9ce404b1>] dump_stack+0x63/0x82
[19515.180154]  [<ffffffff9c7e2751>] kasan_object_err+0x21/0x70
[19515.180158]  [<ffffffff9c7e29ea>] kasan_report_error+0x1fa/0x500
[19515.180162]  [<ffffffff9dced48d>] ? schedule+0x8d/0x1a0
[19515.180168]  [<ffffffff9c4f561e>] ? get_futex_key+0x64e/0xbb0
[19515.180174]  [<ffffffff9c40a1ab>] ? update_cfs_rq_load_avg+0x89b/0x1520
[19515.180177]  [<ffffffff9c7e2e31>] __asan_report_load4_noabort+0x61/0x70
[19515.180180]  [<ffffffff9d9ace2d>] ? ipv4_mtu+0x25d/0x320
[19515.180182]  [<ffffffff9d9ace2d>] ipv4_mtu+0x25d/0x320
[19515.180187]  [<ffffffff9da44693>] tcp_current_mss+0x133/0x270
[19515.180190]  [<ffffffff9da44560>] ? tcp_mtu_to_mss+0x280/0x280
[19515.180193]  [<ffffffff9c40eeb9>] ? select_idle_sibling+0x29/0xaa0
[19515.180195]  [<ffffffff9c4f6284>] ? futex_wait_setup+0x1d4/0x300
[19515.180198]  [<ffffffff9c424f81>] ? enqueue_task_fair+0x261/0x2760
[19515.180201]  [<ffffffff9d9f99e4>] tcp_send_mss+0x24/0x2b0
[19515.180203]  [<ffffffff9da09809>] tcp_sendmsg+0x3d9/0x3530
[19515.180208]  [<ffffffff9c3f018d>] ? ttwu_do_wakeup+0x1d/0x2c0
[19515.180211]  [<ffffffff9c3f0539>] ? ttwu_do_activate+0x109/0x180
[19515.180213]  [<ffffffff9da09430>] ? tcp_sendpage+0x1cc0/0x1cc0
[19515.180216]  [<ffffffff9c3f3a27>] ? wake_up_q+0x87/0xe0
[19515.180219]  [<ffffffff9c4fb707>] ? do_futex+0xf87/0x1730
[19515.180222]  [<ffffffff9dac9d90>] ? inet_gso_segment+0x1340/0x1340
[19515.180224]  [<ffffffff9daca1b0>] ? inet_recvmsg+0x420/0x420
[19515.180226]  [<ffffffff9daca3c7>] inet_sendmsg+0x217/0x3c0
[19515.180228]  [<ffffffff9daca1b0>] ? inet_recvmsg+0x420/0x420
[19515.180232]  [<ffffffff9d87752a>] sock_sendmsg+0xba/0xf0
[19515.180234]  [<ffffffff9d8785ef>] SYSC_sendto+0x1df/0x330
[19515.180236]  [<ffffffff9d878410>] ? SYSC_connect+0x2d0/0x2d0
[19515.180243]  [<ffffffff9d872794>] ? sock_ioctl+0x284/0x3a0
[19515.180248]  [<ffffffff9c87e3ef>] ? do_vfs_ioctl+0x1af/0xf60
[19515.180250]  [<ffffffff9c87e240>] ? ioctl_preallocate+0x1e0/0x1e0
[19515.180253]  [<ffffffff9c4fbfa4>] ? SyS_futex+0xf4/0x2a0
[19515.180259]  [<ffffffff9c8436b4>] ? vfs_read+0x254/0x2f0
[19515.180261]  [<ffffffff9c4fbeb0>] ? do_futex+0x1730/0x1730
[19515.180265]  [<ffffffff9c84705c>] ? SyS_read+0x11c/0x1c0
[19515.180267]  [<ffffffff9d87a31e>] SyS_sendto+0xe/0x10
[19515.180271]  [<ffffffff9dcf897b>] entry_SYSCALL_64_fastpath+0x1e/0xad
[19515.180274] Object at ffff88040d6fecc0, in cache kmalloc-64 size: 64
[19515.180277] Allocated:
[19515.180279] PID = 2326
[19515.180287]  save_stack_trace+0x1b/0x20
[19515.180291]  save_stack+0x46/0xd0
[19515.180293]  kasan_kmalloc+0xad/0xe0
[19515.180295]  kmem_cache_alloc_trace+0xe8/0x1e0
[19515.180298]  fib_create_info+0x917/0x3fc0
[19515.180300]  fib_table_insert+0x1d4/0x1c90
[19515.180302]  inet_rtm_newroute+0xfb/0x180
[19515.180306]  rtnetlink_rcv_msg+0x249/0x6a0
[19515.180312]  netlink_rcv_skb+0x247/0x350
[19515.180314]  rtnetlink_rcv+0x28/0x30
[19515.180316]  netlink_unicast+0x419/0x5c0
[19515.180318]  netlink_sendmsg+0x8a7/0xb60
[19515.180319]  sock_sendmsg+0xba/0xf0
[19515.180321]  ___sys_sendmsg+0x697/0x860
[19515.180323]  __sys_sendmsg+0xd5/0x170
[19515.180324]  SyS_sendmsg+0x12/0x20
[19515.180327]  entry_SYSCALL_64_fastpath+0x1e/0xad
[19515.180327] Freed:
[19515.180329] PID = 0
[19515.180333]  save_stack_trace+0x1b/0x20
[19515.180340]  save_stack+0x46/0xd0
[19515.180345]  kasan_slab_free+0x71/0xb0
[19515.180347]  kfree+0x8c/0x1a0
[19515.180353]  free_fib_info_rcu+0x558/0x760
[19515.180358]  rcu_process_callbacks+0x968/0x1000
[19515.180361]  __do_softirq+0x1a9/0x538
[19515.180361] Memory state around the buggy address:
[19515.180367]  ffff88040d6feb80: fc fc fc fc 00 00 00 00 00 00 00 00
fc fc fc fc
[19515.180369]  ffff88040d6fec00: fb fb fb fb fb fb fb fb fc fc fc fc
fb fb fb fb
[19515.180371] >ffff88040d6fec80: fb fb fb fb fc fc fc fc fb fb fb fb
fb fb fb fb
[19515.180373]                                            ^
[19515.180375]  ffff88040d6fed00: fc fc fc fc fb fb fb fb fb fb fb fb
fc fc fc fc
[19515.180378]  ffff88040d6fed80: 00 00 00 00 00 00 00 00 fc fc fc fc
fb fb fb fb
-- 
Daniel J Blueman

^ permalink raw reply

* Re: [PATCH v4 net-next 04/10] net/ncsi: Ethtool operation to get NCSI topology
From: Gavin Shan @ 2017-05-04  1:36 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: Gavin Shan, netdev, joe, kubakici, f.fainelli, davem
In-Reply-To: <20170504004933.GP8029@lunn.ch>

On Thu, May 04, 2017 at 02:49:33AM +0200, Andrew Lunn wrote:
>> +void ncsi_ethtool_register_dev(struct net_device *dev)
>> +{
>> +	struct ethtool_ops *ops;
>> +
>> +	ops = (struct ethtool_ops *)(dev->ethtool_ops);
>
>Why do you need the cast here?
>
>Ah, is it because net_device has:
>
>          const struct ethtool_ops *ethtool_ops;
>
>i.e. you are casting off the const.
>
>> +	if (!ops)
>> +		return;
>> +
>> +	ops->get_ncsi_channels = ncsi_get_channels;
>
>and here you modify it. Which is going to blow up, because it will be
>in a read only segment and should throw an opps when you write to it?
>
>You need to export ncsi_get_channels, and let the underlying driver
>add it to its own ethtool_ops.
>

I didn't see oops because of this on the ARM board where I did the
testing. The intention was to make ethtool invisible to drivers, for
simplicity. However, we need to expose ethtool to driver when the HW
and SW statistics are conveyed by ETHTOOL_GSTATS as agree'd in another
thread.

Cheers,
Gavin

^ permalink raw reply

* Re: Maximum MPLS labels on Linux network stack
From: David Ahern @ 2017-05-04  2:22 UTC (permalink / raw)
  To: Joe Stringer,
	Алексей Болдырев
  Cc: netdev
In-Reply-To: <CAPWQB7FFwBFL9h7JoJMF4FuK7JXLrHfo5yg4aHJ4iczSdNU_bg@mail.gmail.com>

On 5/3/17 2:21 PM, Joe Stringer wrote:
>> • 8192 MPLS labels
>>
>> Especially interested in the figure 8192 MPLS Labels.

The 8k labels has to be 8k individual routes with a single label (or a
few labels in the stack for the route). In that case you can set
net.mpls.platforms_labels to 10001 and install routes with label values
up to 10000.

^ permalink raw reply

* Re: [PATCH 2/6] wl1251: Use request_firmware_prefer_user() for loading NVS calibration data
From: Luis R. Rodriguez @ 2017-05-04  2:28 UTC (permalink / raw)
  To: Arend Van Spriel
  Cc: Luis R. Rodriguez, Pavel Machek, Daniel Wagner, Tom Gundersen,
	Pali Rohár, Ming Lei, Greg Kroah-Hartman, Kalle Valo,
	David Gnedt, Michal Kazior, Tony Lindgren, Sebastian Reichel,
	Ivaylo Dimitrov, Aaro Koskinen, Takashi Iwai, David Woodhouse,
	Bjorn Andersson, Grazvydas Ignotas, linux-kernel, linux-wireless
In-Reply-To: <936bf348-58ac-882c-a433-83f209862deb@broadcom.com>

On Wed, May 03, 2017 at 09:02:20PM +0200, Arend Van Spriel wrote:
> On 3-1-2017 18:59, Luis R. Rodriguez wrote:
> > On Mon, Dec 26, 2016 at 05:35:59PM +0100, Pavel Machek wrote:
> >>
> >> Right question is "should we solve it without user-space help"?
> >>
> >> Answer is no, too. Way too complex. Yes, it would be nice if hardware
> >> was designed in such a way that getting calibration data from kernel
> >> is easy, and if you design hardware, please design it like that. But
> >> N900 is not designed like that and getting the calibration through
> >> userspace looks like only reasonable solution.
> > 
> > Arend seems to have a better alternative in mind possible for other
> > devices which *can* probably pull of doing this easily and nicely,
> > given the nasty history of the usermode helper crap we should not
> > in any way discourage such efforts.
> > 
> > Arend -- please look at the firmware cache, it not a hash but a hash
> > table for an O(1) lookups would be a welcomed change, then it could
> > be repurposed for what you describe, I think the only difference is
> > you'd perhaps want a custom driver hook to fetch the calibration data
> > so the driver does whatever it needs.
> 
> Hi Luis,
> 
> I let my idea catch dust on the shelf for a while. 

:) BTW did you get to check out Daniel Wagner and Tom Gundersen's firmware work
[0] ? This can provide a proper clear fallback mechanism which *also* helps
address the race between "critical mount points ready" problem we had discussed
long ago. IIRC it did this by having two modes of operation a best effort-mode
and a final-mode. The final-mode would only be used once all the real rootfs is
ready. Userspace decides when to kick / signal firmwared to switch to final-mode
as only userspace will know for sure when that is ready. The best-effort mode
would be used in initramfs. There is also no need for a "custom fallback", the
uevent fallback mechanism can be relied upon alone. Custom tools can just fork
off and do something similar to firmward then in terms of architecture. This is
a form of fallback mechanism I think I'd be happy to see enabled on the new
driver data API. But of course, I recall also liking what you had in mind as well
so would be happy to see more alternatives! The cleaner the better !

[0] https://github.com/teg/firmwared

> Actually had a couple
> of patches ready, but did get around testing them. So I wanted to rebase
> them on your linux-next tree. I bumped into the umh lock thing and was
> wondering why direct filesystem access was under that lock as well.

Indeed, it was an odd thing.

> In your tree I noticed a fix for that. 

Yup!

It took a lot of git archeology to reach a sound approach forward which makes
me feel comfortable without regressing the kernel, this should not regress
the kernel.

> The more reason to base my work on
> top of your firmware_class changes. Now my question is what is the best
> branch to choose, because you have a "few" in that repo to choose from ;-)

I have a series of topical changes, and I rebase onto the latest linux-next
regularly before posting patches, if 0-day finds issues, I dish out a next
try2 or tryX iteration until all issues are fixed. So in this case you'd
just go for the latest driver-data-$(next_date) and if there is a try
postfix use the latest tryX.

In this case 20170501-driver-data-try2, this is based on linux-next tag
next-20170501. If you have issues booting on that next tag though you
could instead try the v4.11-rc8-driver-data-try3 branch based on v4.11-rc8.
But naturally patches ultimately should be based on the latest linux-next
tag for actual submission.

PS. after my changes the fallback mechanism can easily be shoved into its
own file, not sure if that helps with how clean of a solution your work
will be.

  Luis

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox