Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 2/5] VSOCK: support fill data to mergeable rx buffer in host
From: jiangyiwen @ 2018-11-05  7:45 UTC (permalink / raw)
  To: stefanha, Jason Wang; +Cc: netdev, kvm, virtualization

When vhost supports the VIRTIO_VSOCK_F_MRG_RXBUF feature,
it will merge big packets into the rx vq.

Signed-off-by: Yiwen Jiang <jiangyiwen@huawei.com>
---
 drivers/vhost/vsock.c             | 117 +++++++++++++++++++++++++++++++-------
 include/linux/virtio_vsock.h      |   1 +
 include/uapi/linux/virtio_vsock.h |   5 ++
 3 files changed, 102 insertions(+), 21 deletions(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 34bc3ab..648be39 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -22,7 +22,8 @@
 #define VHOST_VSOCK_DEFAULT_HOST_CID	2

 enum {
-	VHOST_VSOCK_FEATURES = VHOST_FEATURES,
+	VHOST_VSOCK_FEATURES = VHOST_FEATURES |
+			(1ULL << VIRTIO_VSOCK_F_MRG_RXBUF),
 };

 /* Used to track all the vhost_vsock instances on the system. */
@@ -80,6 +81,68 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
 	return vsock;
 }

+static int get_rx_bufs(struct vhost_virtqueue *vq,
+		struct vring_used_elem *heads, int datalen,
+		unsigned *iovcount, unsigned int quota)
+{
+	unsigned int out, in;
+	int seg = 0;
+	int headcount = 0;
+	unsigned d;
+	int ret;
+	/*
+	 * len is always initialized before use since we are always called with
+	 * datalen > 0.
+	 */
+	u32 uninitialized_var(len);
+
+	while (datalen > 0 && headcount < quota) {
+		if (unlikely(seg >= UIO_MAXIOV)) {
+			ret = -ENOBUFS;
+			goto err;
+		}
+
+		ret = vhost_get_vq_desc(vq, vq->iov + seg,
+				ARRAY_SIZE(vq->iov) - seg, &out,
+				&in, NULL, NULL);
+		if (unlikely(ret < 0))
+			goto err;
+
+		d = ret;
+		if (d == vq->num) {
+			ret = 0;
+			goto err;
+		}
+
+		if (unlikely(out || in <= 0)) {
+			vq_err(vq, "unexpected descriptor format for RX: "
+					"out %d, in %d\n", out, in);
+			ret = -EINVAL;
+			goto err;
+		}
+
+		heads[headcount].id = cpu_to_vhost32(vq, d);
+		len = iov_length(vq->iov + seg, in);
+		heads[headcount].len = cpu_to_vhost32(vq, len);
+		datalen -= len;
+		++headcount;
+		seg += in;
+	}
+
+	heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
+	*iovcount = seg;
+
+	/* Detect overrun */
+	if (unlikely(datalen > 0)) {
+		ret = UIO_MAXIOV + 1;
+		goto err;
+	}
+	return headcount;
+err:
+	vhost_discard_vq_desc(vq, headcount);
+	return ret;
+}
+
 static void
 vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 			    struct vhost_virtqueue *vq)
@@ -87,22 +150,34 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
 	struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];
 	bool added = false;
 	bool restart_tx = false;
+	int mergeable;
+	size_t vsock_hlen;

 	mutex_lock(&vq->mutex);

 	if (!vq->private_data)
 		goto out;

+	mergeable = vhost_has_feature(vq, VIRTIO_VSOCK_F_MRG_RXBUF);
+	/*
+	 * Guest fill page for rx vq in mergeable case, so it will not
+	 * allocate pkt structure, we should reserve size of pkt in advance.
+	 */
+	if (likely(mergeable))
+		vsock_hlen = sizeof(struct virtio_vsock_pkt);
+	else
+		vsock_hlen = sizeof(struct virtio_vsock_hdr);
+
 	/* Avoid further vmexits, we're already processing the virtqueue */
 	vhost_disable_notify(&vsock->dev, vq);

 	for (;;) {
 		struct virtio_vsock_pkt *pkt;
 		struct iov_iter iov_iter;
-		unsigned out, in;
+		unsigned out = 0, in = 0;
 		size_t nbytes;
 		size_t len;
-		int head;
+		s16 headcount;

 		spin_lock_bh(&vsock->send_pkt_list_lock);
 		if (list_empty(&vsock->send_pkt_list)) {
@@ -116,16 +191,9 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
 		list_del_init(&pkt->list);
 		spin_unlock_bh(&vsock->send_pkt_list_lock);

-		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
-					 &out, &in, NULL, NULL);
-		if (head < 0) {
-			spin_lock_bh(&vsock->send_pkt_list_lock);
-			list_add(&pkt->list, &vsock->send_pkt_list);
-			spin_unlock_bh(&vsock->send_pkt_list_lock);
-			break;
-		}
-
-		if (head == vq->num) {
+		headcount = get_rx_bufs(vq, vq->heads, vsock_hlen + pkt->len,
+				&in, likely(mergeable) ? UIO_MAXIOV : 1);
+		if (headcount <= 0) {
 			spin_lock_bh(&vsock->send_pkt_list_lock);
 			list_add(&pkt->list, &vsock->send_pkt_list);
 			spin_unlock_bh(&vsock->send_pkt_list_lock);
@@ -133,19 +201,13 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
 			/* We cannot finish yet if more buffers snuck in while
 			 * re-enabling notify.
 			 */
-			if (unlikely(vhost_enable_notify(&vsock->dev, vq))) {
+			if (!headcount && unlikely(vhost_enable_notify(&vsock->dev, vq))) {
 				vhost_disable_notify(&vsock->dev, vq);
 				continue;
 			}
 			break;
 		}

-		if (out) {
-			virtio_transport_free_pkt(pkt);
-			vq_err(vq, "Expected 0 output buffers, got %u\n", out);
-			break;
-		}
-
 		len = iov_length(&vq->iov[out], in);
 		iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len);

@@ -156,6 +218,19 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
 			break;
 		}

+		if (likely(mergeable)) {
+			pkt->mrg_rxbuf_hdr.num_buffers = cpu_to_le16(headcount);
+			nbytes = copy_to_iter(&pkt->mrg_rxbuf_hdr,
+					sizeof(pkt->mrg_rxbuf_hdr), &iov_iter);
+			if (nbytes != sizeof(pkt->mrg_rxbuf_hdr)) {
+				virtio_transport_free_pkt(pkt);
+				vq_err(vq, "Faulted on copying rxbuf hdr\n");
+				break;
+			}
+			iov_iter_advance(&iov_iter, (vsock_hlen -
+					sizeof(pkt->mrg_rxbuf_hdr) - sizeof(pkt->hdr)));
+		}
+
 		nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);
 		if (nbytes != pkt->len) {
 			virtio_transport_free_pkt(pkt);
@@ -163,7 +238,7 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
 			break;
 		}

-		vhost_add_used(vq, head, sizeof(pkt->hdr) + pkt->len);
+		vhost_add_used_n(vq, vq->heads, headcount);
 		added = true;

 		if (pkt->reply) {
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index bf84418..da9e1fe 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -50,6 +50,7 @@ struct virtio_vsock_sock {

 struct virtio_vsock_pkt {
 	struct virtio_vsock_hdr	hdr;
+	struct virtio_vsock_mrg_rxbuf_hdr mrg_rxbuf_hdr;
 	struct work_struct work;
 	struct list_head list;
 	/* socket refcnt not held, only use for cancellation */
diff --git a/include/uapi/linux/virtio_vsock.h b/include/uapi/linux/virtio_vsock.h
index 1d57ed3..2292f30 100644
--- a/include/uapi/linux/virtio_vsock.h
+++ b/include/uapi/linux/virtio_vsock.h
@@ -63,6 +63,11 @@ struct virtio_vsock_hdr {
 	__le32	fwd_cnt;
 } __attribute__((packed));

+/* It add mergeable rx buffers feature */
+struct virtio_vsock_mrg_rxbuf_hdr {
+	__le16  num_buffers;    /* number of mergeable rx buffers */
+} __attribute__((packed));
+
 enum virtio_vsock_type {
 	VIRTIO_VSOCK_TYPE_STREAM = 1,
 };
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH 4/5] VSOCK: modify default rx buf size to improve performance
From: jiangyiwen @ 2018-11-05  7:47 UTC (permalink / raw)
  To: stefanha, Jason Wang; +Cc: netdev, kvm, virtualization

Since VSOCK already supports mergeable rx buffers, it can
balance performance against guest memory usage, so
we can increase the default rx buffer size to improve
performance.

Signed-off-by: Yiwen Jiang <jiangyiwen@huawei.com>
---
 include/linux/virtio_vsock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index 6be3cd7..594e720 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -10,7 +10,7 @@
 #define VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE	128
 #define VIRTIO_VSOCK_DEFAULT_BUF_SIZE		(1024 * 256)
 #define VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE	(1024 * 256)
-#define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE	(1024 * 4)
+#define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE	(1024 * 64)
 #define VIRTIO_VSOCK_MAX_BUF_SIZE		0xFFFFFFFFUL
 #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE		(1024 * 64)
 /* virtio_vsock_pkt + max_pkt_len(default MAX_PKT_BUF_SIZE) */
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH 5/5] VSOCK: batch sending rx buffer to increase bandwidth
From: jiangyiwen @ 2018-11-05  7:48 UTC (permalink / raw)
  To: stefanha, Jason Wang; +Cc: netdev, kvm, virtualization

Sending rx buffers in batches can improve total bandwidth.

Signed-off-by: Yiwen Jiang <jiangyiwen@huawei.com>
---
 drivers/vhost/vsock.c | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 648be39..a587ddc 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -148,10 +148,12 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 			    struct vhost_virtqueue *vq)
 {
 	struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];
-	bool added = false;
 	bool restart_tx = false;
 	int mergeable;
 	size_t vsock_hlen;
+	int batch_count = 0;
+
+#define VHOST_VSOCK_BATCH 16

 	mutex_lock(&vq->mutex);

@@ -191,8 +193,9 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 		list_del_init(&pkt->list);
 		spin_unlock_bh(&vsock->send_pkt_list_lock);

-		headcount = get_rx_bufs(vq, vq->heads, vsock_hlen + pkt->len,
-				&in, likely(mergeable) ? UIO_MAXIOV : 1);
+		headcount = get_rx_bufs(vq, vq->heads + batch_count,
+				vsock_hlen + pkt->len, &in,
+				likely(mergeable) ? UIO_MAXIOV : 1);
 		if (headcount <= 0) {
 			spin_lock_bh(&vsock->send_pkt_list_lock);
 			list_add(&pkt->list, &vsock->send_pkt_list);
@@ -238,8 +241,12 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 			break;
 		}

-		vhost_add_used_n(vq, vq->heads, headcount);
-		added = true;
+		batch_count += headcount;
+		if (batch_count > VHOST_VSOCK_BATCH) {
+			vhost_add_used_and_signal_n(&vsock->dev, vq,
+					vq->heads, batch_count);
+			batch_count = 0;
+		}

 		if (pkt->reply) {
 			int val;
@@ -258,8 +265,11 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,

 		virtio_transport_free_pkt(pkt);
 	}
-	if (added)
-		vhost_signal(&vsock->dev, vq);
+
+	if (batch_count) {
+		vhost_add_used_and_signal_n(&vsock->dev, vq,
+				vq->heads, batch_count);
+	}

 out:
 	mutex_unlock(&vq->mutex);
-- 
1.8.3.1

^ permalink raw reply related

* Forgive my intentions if this email comes to you as a surprise,
From: MR JHORGE JAMES @ 2018-11-05  7:56 UTC (permalink / raw)


 Dear friend,

 Forgive my indignation if this message comes to you as a surprise. I
got your contact When i was searching for a foreign reliable partner I
am formal Director Central bank of Gabon, Presently i work with UNITED
BANK FOR AFRICA (BOA) ECO-WAS as telex managing Director.

bank (B.O.A). In my department we discovered an abandoned sum of $37.5
million U.S.A dollars. In an account that belongs to one of our
foreign customer who died along with all his family in the Asia Earth
Quake Disaster(TSUNAMI DISASTER INDONESIA / INDIA.

Since we got information about his death, unfortunately i learn that
all his supposed next of kin or relation died along side leaving
nobody behind for the claim. In respect to the provision of a foreign
account ($15 million dollars) for you and ($20 million dollars) for
me. Then we give the remain ($2.5 million dollars) to orphanage.
There after i will visit your country for disbursement according to
the percentages indicated.

(FILL THIS FORM BELLOW PLEASE AND RESEND IT TO ME).

1) Your Full Name
    2) Your Age
    3) Marital Status
    4) Your Cell Phone Number
    5) Your Fax Number
    6) Your Country
    7) Your Occupation
    8) Sex
    9) Your Religion
for security reasons You have to keep everything secret as to enable
the transfer to move very smoothly in to the account you will prove to
the bank. I am waiting for your immediate response as you receive this
mail. Extend my sincere greetings to your entire family. God bless you
and bye for now.
  this is my private email you can contact me on (jhor6767@outlook.com).
Thanks for your maximum co-operation,
Yours Sincerely,

^ permalink raw reply

* [PATCH] xfrm: Fix bucket count reported to userspace
From: Benjamin Poirier @ 2018-11-05  8:00 UTC (permalink / raw)
  To: Steffen Klassert, Jamal Hadi Salim; +Cc: Herbert Xu, David S. Miller, netdev

sadhcnt is reported by `ip -s xfrm state count` as "buckets count", not the
hash mask.

Fixes: 28d8909bc790 ("[XFRM]: Export SAD info.")
Signed-off-by: Benjamin Poirier <bpoirier@suse.com>
---
 net/xfrm/xfrm_state.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index b669262682c9..12cdb350c456 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -788,7 +788,7 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si)
 {
 	spin_lock_bh(&net->xfrm.xfrm_state_lock);
 	si->sadcnt = net->xfrm.state_num;
-	si->sadhcnt = net->xfrm.state_hmask;
+	si->sadhcnt = net->xfrm.state_hmask + 1;
 	si->sadhmcnt = xfrm_state_hashmax;
 	spin_unlock_bh(&net->xfrm.xfrm_state_lock);
 }
-- 
2.19.0

^ permalink raw reply related

* Re: [PATCH 0/5] Use common cordic algorithm for b43
From: Arend van Spriel @ 2018-11-05  8:17 UTC (permalink / raw)
  To: Kalle Valo, Priit Laes
  Cc: linux-wireless, b43-dev, netdev, linux-kernel,
	brcm80211-dev-list.pdl, brcm80211-dev-list
In-Reply-To: <87muqoar5i.fsf@purkki.adurom.net>

On 11/5/2018 9:02 AM, Kalle Valo wrote:
> Also I don't see MAINTAINERS entry for cordic.[c|h], that would be good
> to have as well.

We added the cordic library functions during brcm80211 staging cleanup. 
We can add it to MAINTAINERS file.

Regards,
Arend

^ permalink raw reply

* [PATCH] net: alx: make alx_drv_name static
From: Rasmus Villemoes @ 2018-11-05 17:52 UTC (permalink / raw)
  To: Jay Cliburn, Chris Snook, David S. Miller
  Cc: Rasmus Villemoes, netdev, linux-kernel

alx_drv_name is not used outside main.c, so there's no reason for it to
have external linkage.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
---
 drivers/net/ethernet/atheros/alx/alx.h  | 1 -
 drivers/net/ethernet/atheros/alx/main.c | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/atheros/alx/alx.h b/drivers/net/ethernet/atheros/alx/alx.h
index 78c5de467426..9d0e74f6b089 100644
--- a/drivers/net/ethernet/atheros/alx/alx.h
+++ b/drivers/net/ethernet/atheros/alx/alx.h
@@ -140,6 +140,5 @@ struct alx_priv {
 };
 
 extern const struct ethtool_ops alx_ethtool_ops;
-extern const char alx_drv_name[];
 
 #endif
diff --git a/drivers/net/ethernet/atheros/alx/main.c b/drivers/net/ethernet/atheros/alx/main.c
index 7968c644ad86..c131cfc1b79d 100644
--- a/drivers/net/ethernet/atheros/alx/main.c
+++ b/drivers/net/ethernet/atheros/alx/main.c
@@ -49,7 +49,7 @@
 #include "hw.h"
 #include "reg.h"
 
-const char alx_drv_name[] = "alx";
+static const char alx_drv_name[] = "alx";
 
 static void alx_free_txbuf(struct alx_tx_queue *txq, int entry)
 {
-- 
2.19.1.6.gbde171bbf5

^ permalink raw reply related

* Re: Kernel 4.19 network performance - forwarding/routing normal users traffic
From: Tariq Toukan @ 2018-11-05  8:42 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, Aaron Lu
  Cc: Saeed Mahameed, pstaszewski@itcare.pl, eric.dumazet@gmail.com,
	netdev@vger.kernel.org, Tariq Toukan, ilias.apalodimas@linaro.org,
	yoel@kviknet.dk, mgorman@techsingularity.net
In-Reply-To: <20181103135325.01a7b5d6@redhat.com>



On 03/11/2018 2:53 PM, Jesper Dangaard Brouer wrote:
> 
> On Fri, 2 Nov 2018 22:20:24 +0800 Aaron Lu <aaron.lu@intel.com> wrote:
> 
>> On Fri, Nov 02, 2018 at 12:40:37PM +0100, Jesper Dangaard Brouer wrote:
>>> On Fri, 2 Nov 2018 13:23:56 +0800
>>> Aaron Lu <aaron.lu@intel.com> wrote:
>>>    
>>>> On Thu, Nov 01, 2018 at 08:23:19PM +0000, Saeed Mahameed wrote:
>>>>> On Thu, 2018-11-01 at 23:27 +0800, Aaron Lu wrote:
>>>>>> On Thu, Nov 01, 2018 at 10:22:13AM +0100, Jesper Dangaard Brouer
>>>>>> wrote:
>>>>>> ... ...
>>>>>>> Section copied out:
>>>>>>>
>>>>>>>    mlx5e_poll_tx_cq
>>>>>>>    |
>>>>>>>     --16.34%--napi_consume_skb
>>>>>>>               |
>>>>>>>               |--12.65%--__free_pages_ok
>>>>>>>               |          |
>>>>>>>               |           --11.86%--free_one_page
>>>>>>>               |                     |
>>>>>>>               |                     |--10.10%
>>>>>>> --queued_spin_lock_slowpath
>>>>>>>               |                     |
>>>>>>>               |                      --0.65%--_raw_spin_lock
>>>>>>
>>>>>> This callchain looks like it is freeing higher order pages than order
>>>>>> 0:
>>>>>> __free_pages_ok is only called for pages whose order are bigger than
>>>>>> 0.
>>>>>
>>>>> mlx5 rx uses only order 0 pages, so i don't know where these high order
>>>>> tx SKBs are coming from..
>>>>
>>>> Perhaps here:
>>>> __netdev_alloc_skb(), __napi_alloc_skb(), __netdev_alloc_frag() and
>>>> __napi_alloc_frag() will all call page_frag_alloc(), which will use
>>>> __page_frag_cache_refill() to get an order 3 page if possible, or fall
>>>> back to an order 0 page if order 3 page is not available.
>>>>
>>>> I'm not sure if your workload will use the above code path though.
>>>
>>> TL;DR: this is order-0 pages (code-walk trough proof below)
>>>
>>> To Aaron, the network stack *can* call __free_pages_ok() with order-0
>>> pages, via:
>>>
>>> static void skb_free_head(struct sk_buff *skb)
>>> {
>>> 	unsigned char *head = skb->head;
>>>
>>> 	if (skb->head_frag)
>>> 		skb_free_frag(head);
>>> 	else
>>> 		kfree(head);
>>> }
>>>
>>> static inline void skb_free_frag(void *addr)
>>> {
>>> 	page_frag_free(addr);
>>> }
>>>
>>> /*
>>>   * Frees a page fragment allocated out of either a compound or order 0 page.
>>>   */
>>> void page_frag_free(void *addr)
>>> {
>>> 	struct page *page = virt_to_head_page(addr);
>>>
>>> 	if (unlikely(put_page_testzero(page)))
>>> 		__free_pages_ok(page, compound_order(page));
>>> }
>>> EXPORT_SYMBOL(page_frag_free);
>>
>> I think here is a problem - order 0 pages are freed directly to buddy,
>> bypassing per-cpu-pages. This might be the reason lock contention
>> appeared on free path.
> 
> OMG - you just found a significant issue with the network stacks
> interaction with the page allocator!  This explains why I could not get
> the PCP (Per-Cpu-Pages) system to have good performance, in my
> performance networking benchmarks. As we are basically only using the
> alloc side of PCP, and not the free side.
>   We have spend years adding different driver level recycle tricks to
> avoid this code path getting activated, exactly because it is rather
> slow and problematic that we hit this zone->lock.
> 

Oh! It has been behaving this way for too long.
Good catch!

>> Can someone apply below diff and see if lock contention is gone?
> 
> I have also applied and tested this patch, and yes the lock contention
> is gone.  As mentioned is it rather difficult to hit this code path, as
> the driver page recycle mechanism tries to hide/avoid it, but mlx5 +
> page_pool + CPU-map recycling have a known weakness that bypass the
> driver page recycle scheme (that I've not fixed yet).  I observed a 7%
> speedup for this micro benchmark.
> 

Great news. I also have a benchmark that uses order-0 pages and stresses 
the zone-lock. I'll test your patch during this week.

>   
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index e2ef1c17942f..65c0ae13215a 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -4554,8 +4554,14 @@ void page_frag_free(void *addr)
>>   {
>>   	struct page *page = virt_to_head_page(addr);
>>   
>> -	if (unlikely(put_page_testzero(page)))
>> -		__free_pages_ok(page, compound_order(page));
>> +	if (unlikely(put_page_testzero(page))) {
>> +		unsigned int order = compound_order(page);
>> +
>> +		if (order == 0)
>> +			free_unref_page(page);
>> +		else
>> +			__free_pages_ok(page, order);
>> +	}
>>   }
>>   EXPORT_SYMBOL(page_frag_free);
> 
> Thank you Aaron for spotting this!!!
> 
Thanks Aaron :) !!

Does it conflict with your recent work that optimizes order-0 allocation?

^ permalink raw reply

* RE: [PATCH net-next 5/6] net/ncsi: Reset channel state in ncsi_start_dev()
From: Justin.Lee1 @ 2018-11-05 18:01 UTC (permalink / raw)
  To: sam, netdev; +Cc: davem, linux-kernel, openbmc
In-Reply-To: <c4f0fdcc971ca258539899a8b15755b96b2353f5.camel@mendozajonas.com>



> On Tue, 2018-10-30 at 21:26 +0000, Justin.Lee1@Dell.com wrote:
> > > +int ncsi_reset_dev(struct ncsi_dev *nd)
> > > +{
> > > +	struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd);
> > > +	struct ncsi_channel *nc, *active;
> > > +	struct ncsi_package *np;
> > > +	unsigned long flags;
> > > +	bool enabled;
> > > +	int state;
> > > +
> > > +	active = NULL;
> > > +	NCSI_FOR_EACH_PACKAGE(ndp, np) {
> > > +		NCSI_FOR_EACH_CHANNEL(np, nc) {
> > > +			spin_lock_irqsave(&nc->lock, flags);
> > > +			enabled = nc->monitor.enabled;
> > > +			state = nc->state;
> > > +			spin_unlock_irqrestore(&nc->lock, flags);
> > > +
> > > +			if (enabled)
> > > +				ncsi_stop_channel_monitor(nc);
> > > +			if (state == NCSI_CHANNEL_ACTIVE) {
> > > +				active = nc;
> > > +				break;
> > 
> > Is the original intention to process the channel one by one?
> > If it is the case, there are two loops and we might need to use
> > "goto found" instead.
> 
> Yes we'll need to break out of the package loop here as well.
> 
> > 
> > > +			}
> > > +		}
> > > +	}
> > > +
> > 
> > found: ?
> > 
> > > +	if (!active) {
> > > +		/* Done */
> > > +		spin_lock_irqsave(&ndp->lock, flags);
> > > +		ndp->flags &= ~NCSI_DEV_RESET;
> > > +		spin_unlock_irqrestore(&ndp->lock, flags);
> > > +		return ncsi_choose_active_channel(ndp);
> > > +	}
> > > +
> > > +	spin_lock_irqsave(&ndp->lock, flags);
> > > +	ndp->flags |= NCSI_DEV_RESET;
> > > +	ndp->active_channel = active;
> > > +	ndp->active_package = active->package;
> > > +	spin_unlock_irqrestore(&ndp->lock, flags);
> > > +
> > > +	nd->state = ncsi_dev_state_suspend;
> > > +	schedule_work(&ndp->work);
> > > +	return 0;
> > > +}
> > 
> > Also similar issue in ncsi_choose_active_channel() function below.
> > 
> > > @@ -916,32 +1045,49 @@ static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp)
> > >  
> > >  			ncm = &nc->modes[NCSI_MODE_LINK];
> > >  			if (ncm->data[2] & 0x1) {
> > > -				spin_unlock_irqrestore(&nc->lock, flags);
> > >  				found = nc;
> > > -				goto out;
> > > +				with_link = true;
> > >  			}
> > >  
> > > -			spin_unlock_irqrestore(&nc->lock, flags);
> > > +			/* If multi_channel is enabled configure all valid
> > > +			 * channels whether or not they currently have link
> > > +			 * so they will have AENs enabled.
> > > +			 */
> > > +			if (with_link || np->multi_channel) {
> > 
> > I notice that there is a case that we will misconfigure the interface.
> > For example below, multi-channel is not enable for package 1.
> > But we enable the channel for ncsi2 below (package 1 channel 0) as that interface is the first
> > channel for that package with link.
> 
> I don't think I see the issue here; multi-channel is not set on package
> 1, but both channels are in the channel whitelist. Channel 0 is
> configured since it's the first found on package 1, and channel 1 is not
> since channel 0 is already found. Are you expecting something different?
>  

The setting is that multi-package is enabled for both package 0 and 1.
Multi-channel is only enabled for package 0.

> > 
> > cat /sys/kernel/debug/ncsi_protocol/ncsi_device_
> > IFIDX IFNAME NAME   PID CID RX TX MP MC WP WC PC CS PS LS RU CR NQ HA
> > =====================================================================
> >   2   eth2   ncsi0  000 000 1  1  1  1  1  1  0  2  1  1  1  1  0  1
> >   2   eth2   ncsi1  000 001 1  0  1  1  1  1  0  2  1  1  1  1  0  1
> >   2   eth2   ncsi2  001 000 1  0  1  0  1  1  0  2  1  1  1  1  0  1

I was replying to the wrong old email and it might have caused a bit of confusion.
The first 1 means the channel is enabled for package 1 channel 0 (ncsi2).
For eth2, we already have ncsi0 as the active channel with TX enabled.
I would think that since that package doesn't have multi-channel enabled,
we should not enable the channel for ncsi2. The problem is that package 1 doesn't
have multi-channel enabled and it believes it needs to enable one channel for its package,
but it isn't aware that the other package already has one active channel.

> >   2   eth2   ncsi3  001 001 0  0  1  0  1  1  0  1  0  1  1  1  0  1
> > =====================================================================
> > MP: Multi-mode Package     WP: Whitelist Package
> > MC: Multi-mode Channel     WC: Whitelist Channel
> > PC: Primary Channel        CS: Channel State IA/A/IV 1/2/3
> > PS: Poll Status            LS: Link Status
> > RU: Running                CR: Carrier OK
> > NQ: Queue Stopped          HA: Hardware Arbitration
> > 
> > I temporally change to the following to avoid that.
> > 			if ((with_link &&
> > 			     !np->multi_channel &&
> > 			     list_empty(&ndp->channel_queue)) || np->multi_channel) {
> > 
> > > +				spin_lock_irqsave(&ndp->lock, flags);
> > > +				list_add_tail_rcu(&nc->link,
> > > +						  &ndp->channel_queue);
> > > +				spin_unlock_irqrestore(&ndp->lock, flags);
> > > +
> > > +				netdev_dbg(ndp->ndev.dev,
> > > +					   "NCSI: Channel %u added to queue (link %s)\n",
> > > +					   nc->id,
> > > +					   ncm->data[2] & 0x1 ? "up" : "down");
> > > +			}
> > > +
> > > +			spin_unlock_irqrestore(&nc->lock, cflags);
> > > +
> > > +			if (with_link && !np->multi_channel)
> > > +				break;
> > 
> > Similar issue here. As we are using break, so each package will configure one active TX.
> > 
> 
> I believe this is handled properly in ncsi_channel_is_tx() in the most
> recent revision.

I saw this issue with the last revision. I was using the wrong email to reply.

> 
> > >  		}
> > > +		if (with_link && !ndp->multi_package)
> > > +			break;
> > >  	}
> > >  
> > > -	if (!found) {
> > > +	if (list_empty(&ndp->channel_queue) && found) {
> > > +		netdev_info(ndp->ndev.dev,
> > > +			    "NCSI: No channel with link found, configuring channel %u\n",
> > > +			    found->id);
> > > +		spin_lock_irqsave(&ndp->lock, flags);
> > > +		list_add_tail_rcu(&found->link, &ndp->channel_queue);
> > > +		spin_unlock_irqrestore(&ndp->lock, flags);
> > > +	} else if (!found) {
> > >  		netdev_warn(ndp->ndev.dev,
> > > -			    "NCSI: No channel found with link\n");
> > > +			    "NCSI: No channel found to configure!\n");
> > >  		ncsi_report_link(ndp, true);
> > >  		return -ENODEV;
> > >  	}
> > 
> > Also, for deselect package handler function, do we want to set to inactive here?
> > If we just change the state, the cached data still keeps the old value. If the new 
> > ncsi_reset_dev() function is handling one by one, can we skip this part?
> 
> Technically yes we could skip the state change here since
> ncsi_reset_dev() will have already done it. However if we send a DP
> command via some other means then it is probably best to ensure we treat
> all channels on that package as inactive.

When I tested, if I didn't comment out the state change in the response handler,
the ncsi_reset_dev() function did not handle it properly: some channels got into the
invisible state and in the end we lost those selectable channels.

> 
> > 
> > static int ncsi_rsp_handler_dp(struct ncsi_request *nr)
> > {
> > 	struct ncsi_rsp_pkt *rsp;
> > 	struct ncsi_dev_priv *ndp = nr->ndp;
> > 	struct ncsi_package *np;
> > 	struct ncsi_channel *nc;
> > 	unsigned long flags;
> > 
> > 	/* Find the package */
> > 	rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
> > 	ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
> > 				      &np, NULL);
> > 	if (!np)
> > 		return -ENODEV;
> > 
> > 	/* Change state of all channels attached to the package */
> > 	NCSI_FOR_EACH_CHANNEL(np, nc) {
> > 		spin_lock_irqsave(&nc->lock, flags);
> > 		nc->state = NCSI_CHANNEL_INACTIVE;
> > 
> > 		spin_unlock_irqrestore(&nc->lock, flags);
> > 	}
> > 
> > 	return 0;
> > }
> > 
> > 



^ permalink raw reply

* Re: Kernel 4.19 network performance - forwarding/routing normal users traffic
From: Aaron Lu @ 2018-11-05  8:48 UTC (permalink / raw)
  To: Tariq Toukan
  Cc: Jesper Dangaard Brouer, Saeed Mahameed, pstaszewski@itcare.pl,
	eric.dumazet@gmail.com, netdev@vger.kernel.org,
	ilias.apalodimas@linaro.org, yoel@kviknet.dk,
	mgorman@techsingularity.net
In-Reply-To: <a01c44c2-bb52-e575-62c0-e990b38bda53@mellanox.com>

On Mon, Nov 05, 2018 at 08:42:33AM +0000, Tariq Toukan wrote:
> 
> On 03/11/2018 2:53 PM, Jesper Dangaard Brouer wrote:
> > 
> > On Fri, 2 Nov 2018 22:20:24 +0800 Aaron Lu <aaron.lu@intel.com> wrote:
> >>
> >> I think here is a problem - order 0 pages are freed directly to buddy,
> >> bypassing per-cpu-pages. This might be the reason lock contention
> >> appeared on free path.
> > 
> > OMG - you just found a significant issue with the network stacks
> > interaction with the page allocator!  This explains why I could not get
> > the PCP (Per-Cpu-Pages) system to have good performance, in my
> > performance networking benchmarks. As we are basically only using the
> > alloc side of PCP, and not the free side.
> >   We have spend years adding different driver level recycle tricks to
> > avoid this code path getting activated, exactly because it is rather
> > slow and problematic that we hit this zone->lock.
> > 
> 
> Oh! It has been behaving this way for too long.
> Good catch!

Thanks.

> >> Can someone apply below diff and see if lock contention is gone?
> > 
> > I have also applied and tested this patch, and yes the lock contention
> > is gone.  As mentioned is it rather difficult to hit this code path, as
> > the driver page recycle mechanism tries to hide/avoid it, but mlx5 +
> > page_pool + CPU-map recycling have a known weakness that bypass the
> > driver page recycle scheme (that I've not fixed yet).  I observed a 7%
> > speedup for this micro benchmark.
> > 
> 
> Great news. I also have a benchmark that uses order-0 pages and stresses 
> the zone-lock. I'll test your patch during this week.

Note this patch only helps when order-0 pages are freed through
page_frag_free().

I'll send a formal patch later.

> >   
> >> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> >> index e2ef1c17942f..65c0ae13215a 100644
> >> --- a/mm/page_alloc.c
> >> +++ b/mm/page_alloc.c
> >> @@ -4554,8 +4554,14 @@ void page_frag_free(void *addr)
> >>   {
> >>   	struct page *page = virt_to_head_page(addr);
> >>   
> >> -	if (unlikely(put_page_testzero(page)))
> >> -		__free_pages_ok(page, compound_order(page));
> >> +	if (unlikely(put_page_testzero(page))) {
> >> +		unsigned int order = compound_order(page);
> >> +
> >> +		if (order == 0)
> >> +			free_unref_page(page);
> >> +		else
> >> +			__free_pages_ok(page, order);
> >> +	}
> >>   }
> >>   EXPORT_SYMBOL(page_frag_free);
> > 
> > Thank you Aaron for spotting this!!!
> > 
> Thanks Aaron :) !!
> 
> Does it conflict with your recent work that optimizes order-0 allocation?

No, it doesn't. This patch optimizes code outside of the zone lock (by reducing
the need to take the zone lock) while my recent work optimizes code inside
the zone lock :-)

^ permalink raw reply

* Re: [PATCH] net: skbuff.h: remove unnecessary unlikely()
From: David Miller @ 2018-11-05 18:09 UTC (permalink / raw)
  To: tiny.windzz
  Cc: edumazet, willemb, dja, ast, sbrivio, pabeni, linux-kernel,
	netdev
In-Reply-To: <CAEExFWuGN_R=7B4ueAVA8hVoix8ko8zQSXzHxZB5gjwP7jOjpg@mail.gmail.com>

From: Frank Lee <tiny.windzz@gmail.com>
Date: Mon, 5 Nov 2018 21:21:50 +0800

> add netdev@vger.kernel.org
> -- Yangtao

Sorry, you can't do it like that.

You have to make a formal, fresh, posting to netdev with your patch.

Thank you.

^ permalink raw reply

* Re: linux-next: Tree for Nov 5 (net/ipv6/af_inet6)
From: David Miller @ 2018-11-05 18:12 UTC (permalink / raw)
  To: rdunlap; +Cc: sfr, linux-next, linux-kernel, netdev, 0xeffeff
In-Reply-To: <2ad190b3-0b0d-972b-2a6e-16abf4a81c5b@infradead.org>

From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 5 Nov 2018 08:21:28 -0800

> On 11/4/18 9:51 PM, Stephen Rothwell wrote:
>> Hi all,
>> 
>> Changes since 20181102:
>> 
>> Non-merge commits (relative to Linus' tree): 418
> 
> on i386 or x86_64:
> 
> ld: net/ipv6/af_inet6.o: in function `inet6_init':
> af_inet6.c:(.init.text+0x285): undefined reference to `ipv6_anycast_init'
> ld: af_inet6.c:(.init.text+0x376): undefined reference to `ipv6_anycast_cleanup'
> 
> 
> Full randconfig file is attached (for i386).

Jeff, please fix this.

^ permalink raw reply

* Re: [PATCH V2 2/7] net: lorawan: Add LoRaWAN socket module
From: David Miller @ 2018-11-05 18:16 UTC (permalink / raw)
  To: starnight
  Cc: afaerber, netdev, linux-arm-kernel, linux-kernel, marcel,
	dollar.chen, ken.yu, linux-wpan, stefan
In-Reply-To: <20181105165544.5215-3-starnight@g.ncu.edu.tw>

From: Jian-Hong Pan <starnight@g.ncu.edu.tw>
Date: Tue,  6 Nov 2018 00:55:40 +0800

> +static inline struct lrw_mac_cb * mac_cb(struct sk_buff *skb)

"mac_cb()" is pretty generic for a name, and leads to namespace pollution,
please use lrw_mac_cb() or similar.

> +static inline struct dgram_sock *
> +dgram_sk(const struct sock *sk)
> +{
> +	return container_of(sk, struct dgram_sock, sk);
> +}
> +
> +static inline struct net_device *
> +lrw_get_dev_by_addr(struct net *net, u32 devaddr)

Never use inline for functions in a foo.c file, let the compiler decide.

> +{
> +	struct net_device *ndev = NULL;
> +	__be32 be_addr = cpu_to_be32(devaddr);

Always order local variables from longest to shortest line.

> +static int
> +dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
> +	      int noblock, int flags, int *addr_len)
> +{
> +	struct sk_buff *skb;
> +	size_t copied = 0;
> +	DECLARE_SOCKADDR(struct sockaddr_lorawan *, saddr, msg->msg_name);
> +	int err;

Likewise.

I'm not going to point out every single place where you have made these
two errors.

Please audit your entire submission and fix the problems wherever they
occur.

Thank you.

^ permalink raw reply

* [PATCH 1/2] mm/page_alloc: free order-0 pages through PCP in page_frag_free()
From: Aaron Lu @ 2018-11-05  8:58 UTC (permalink / raw)
  To: linux-mm, linux-kernel, netdev
  Cc: Andrew Morton, Paweł Staszewski, Jesper Dangaard Brouer,
	Eric Dumazet, Tariq Toukan, Ilias Apalodimas, Yoel Caspersen,
	Mel Gorman, Saeed Mahameed, Michal Hocko, Vlastimil Babka,
	Dave Hansen

page_frag_free() calls __free_pages_ok() to free the page back to
Buddy. This is OK for high order page, but for order-0 pages, it
misses the optimization opportunity of using Per-Cpu-Pages and can
cause zone lock contention when called frequently.

Paweł Staszewski recently shared his result of 'how Linux kernel
handles normal traffic'[1] and from perf data, Jesper Dangaard Brouer
found the lock contention comes from page allocator:

  mlx5e_poll_tx_cq
  |
   --16.34%--napi_consume_skb
             |
             |--12.65%--__free_pages_ok
             |          |
             |           --11.86%--free_one_page
             |                     |
             |                     |--10.10%--queued_spin_lock_slowpath
             |                     |
             |                      --0.65%--_raw_spin_lock
             |
             |--1.55%--page_frag_free
             |
              --1.44%--skb_release_data

Jesper explained how it happened: mlx5 driver RX-page recycle
mechanism is not effective in this workload and pages have to go
through the page allocator. The lock contention happens during
mlx5 DMA TX completion cycle. And the page allocator cannot keep
up at these speeds.[2]

I thought that __free_pages_ok() was mostly freeing high order
pages and that this was lock contention for high order pages,
but Jesper explained in detail that __free_pages_ok() here is
actually freeing order-0 pages because mlx5 is using order-0 pages
to satisfy its page pool allocation request.[3]

The free path as pointed out by Jesper is:
skb_free_head()
  -> skb_free_frag()
    -> page_frag_free()
And the pages being freed on this path are order-0 pages.

Fix this by doing similar things as in __page_frag_cache_drain() -
send the being freed page to PCP if it's an order-0 page, or
directly to Buddy if it is a high order page.

With this change, Paweł hasn't noticed lock contention yet in
his workload and Jesper has noticed a 7% performance improvement
using a micro benchmark and lock contention is gone.

[1]: https://www.spinics.net/lists/netdev/msg531362.html
[2]: https://www.spinics.net/lists/netdev/msg531421.html
[3]: https://www.spinics.net/lists/netdev/msg531556.html
Reported-by: Paweł Staszewski <pstaszewski@itcare.pl>
Analysed-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Aaron Lu <aaron.lu@intel.com>
---
 mm/page_alloc.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ae31839874b8..91a9a6af41a2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4555,8 +4555,14 @@ void page_frag_free(void *addr)
 {
 	struct page *page = virt_to_head_page(addr);
 
-	if (unlikely(put_page_testzero(page)))
-		__free_pages_ok(page, compound_order(page));
+	if (unlikely(put_page_testzero(page))) {
+		unsigned int order = compound_order(page);
+
+		if (order == 0)
+			free_unref_page(page);
+		else
+			__free_pages_ok(page, order);
+	}
 }
 EXPORT_SYMBOL(page_frag_free);
 
-- 
2.17.2

^ permalink raw reply related

* [PATCH 2/2] mm/page_alloc: use a single function to free page
From: Aaron Lu @ 2018-11-05  8:58 UTC (permalink / raw)
  To: linux-mm, linux-kernel, netdev
  Cc: Andrew Morton, Paweł Staszewski, Jesper Dangaard Brouer,
	Eric Dumazet, Tariq Toukan, Ilias Apalodimas, Yoel Caspersen,
	Mel Gorman, Saeed Mahameed, Michal Hocko, Vlastimil Babka,
	Dave Hansen
In-Reply-To: <20181105085820.6341-1-aaron.lu@intel.com>

We free pages in multiple places, most of them doing similar
things, and a common function can be used to reduce code duplication.

It also avoids a bug being fixed in one function but left in another.

Signed-off-by: Aaron Lu <aaron.lu@intel.com>
---
 mm/page_alloc.c | 37 +++++++++++++++++--------------------
 1 file changed, 17 insertions(+), 20 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 91a9a6af41a2..2b330296e92a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4425,9 +4425,17 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(get_zeroed_page);
 
-void __free_pages(struct page *page, unsigned int order)
+/*
+ * Free a page by reducing its ref count by @nr.
+ * If its refcount reaches 0, then according to its order:
+ * order0: send to PCP;
+ * high order: directly send to Buddy.
+ */
+static inline void free_the_page(struct page *page, unsigned int order, int nr)
 {
-	if (put_page_testzero(page)) {
+	VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
+
+	if (page_ref_sub_and_test(page, nr)) {
 		if (order == 0)
 			free_unref_page(page);
 		else
@@ -4435,6 +4443,11 @@ void __free_pages(struct page *page, unsigned int order)
 	}
 }
 
+void __free_pages(struct page *page, unsigned int order)
+{
+	free_the_page(page, order, 1);
+}
+
 EXPORT_SYMBOL(__free_pages);
 
 void free_pages(unsigned long addr, unsigned int order)
@@ -4481,16 +4494,7 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 
 void __page_frag_cache_drain(struct page *page, unsigned int count)
 {
-	VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
-
-	if (page_ref_sub_and_test(page, count)) {
-		unsigned int order = compound_order(page);
-
-		if (order == 0)
-			free_unref_page(page);
-		else
-			__free_pages_ok(page, order);
-	}
+	free_the_page(page, compound_order(page), count);
 }
 EXPORT_SYMBOL(__page_frag_cache_drain);
 
@@ -4555,14 +4559,7 @@ void page_frag_free(void *addr)
 {
 	struct page *page = virt_to_head_page(addr);
 
-	if (unlikely(put_page_testzero(page))) {
-		unsigned int order = compound_order(page);
-
-		if (order == 0)
-			free_unref_page(page);
-		else
-			__free_pages_ok(page, order);
-	}
+	free_the_page(page, compound_order(page), 1);
 }
 EXPORT_SYMBOL(page_frag_free);
 
-- 
2.17.2

^ permalink raw reply related

* Re: [PATCH] staging: net: ipv4: tcp_westwood: fixed warnings and checks
From: David Miller @ 2018-11-05 18:20 UTC (permalink / raw)
  To: suraj1998; +Cc: edumazet, kuznet, yoshfuji, netdev, linux-kernel
In-Reply-To: <1541425985-31869-1-git-send-email-suraj1998@gmail.com>

From: Suraj Singh <suraj1998@gmail.com>
Date: Mon,  5 Nov 2018 19:23:05 +0530

> Fixed warnings and checks for TCP Westwood
> 
> Signed-off-by: Suraj Singh <suraj1998@gmail.com>

Why 'staging' in the subject line?

^ permalink raw reply

* Re: Kernel 4.19 network performance - forwarding/routing normal users traffic
From: Jesper Dangaard Brouer @ 2018-11-05  9:10 UTC (permalink / raw)
  To: Aaron Lu
  Cc: Saeed Mahameed, pstaszewski@itcare.pl, eric.dumazet@gmail.com,
	netdev@vger.kernel.org, Tariq Toukan, ilias.apalodimas@linaro.org,
	yoel@kviknet.dk, mgorman@techsingularity.net, brouer,
	Jérôme Glisse
In-Reply-To: <20181105062836.GB4502@intel.com>

On Mon, 5 Nov 2018 14:28:36 +0800
Aaron Lu <aaron.lu@intel.com> wrote:

> On Sat, Nov 03, 2018 at 01:53:25PM +0100, Jesper Dangaard Brouer wrote:
> > 
> > On Fri, 2 Nov 2018 22:20:24 +0800 Aaron Lu <aaron.lu@intel.com> wrote:
> >   
> > > On Fri, Nov 02, 2018 at 12:40:37PM +0100, Jesper Dangaard Brouer wrote:  
> > > > On Fri, 2 Nov 2018 13:23:56 +0800
> > > > Aaron Lu <aaron.lu@intel.com> wrote:
> > > >     
> > > > > On Thu, Nov 01, 2018 at 08:23:19PM +0000, Saeed Mahameed wrote:    
> > > > > > On Thu, 2018-11-01 at 23:27 +0800, Aaron Lu wrote:      
> > > > > > > On Thu, Nov 01, 2018 at 10:22:13AM +0100, Jesper Dangaard Brouer
> > > > > > > wrote:
> > > > > > > ... ...      
> > > > > > > > Section copied out:
> > > > > > > > 
> > > > > > > >   mlx5e_poll_tx_cq
> > > > > > > >   |          
> > > > > > > >    --16.34%--napi_consume_skb
> > > > > > > >              |          
> > > > > > > >              |--12.65%--__free_pages_ok
> > > > > > > >              |          |          
> > > > > > > >              |           --11.86%--free_one_page
> > > > > > > >              |                     |          
> > > > > > > >              |                     |--10.10%
> > > > > > > > --queued_spin_lock_slowpath
> > > > > > > >              |                     |          
> > > > > > > >              |                      --0.65%--_raw_spin_lock      
> > > > > > > 
> > > > > > > This callchain looks like it is freeing higher order pages than order
> > > > > > > 0:
> > > > > > > __free_pages_ok is only called for pages whose order are bigger than
> > > > > > > 0.      
> > > > > > 
> > > > > > mlx5 rx uses only order 0 pages, so i don't know where these high order
> > > > > > tx SKBs are coming from..       
> > > > > 
> > > > > Perhaps here:
> > > > > __netdev_alloc_skb(), __napi_alloc_skb(), __netdev_alloc_frag() and
> > > > > __napi_alloc_frag() will all call page_frag_alloc(), which will use
> > > > > __page_frag_cache_refill() to get an order 3 page if possible, or fall
> > > > > back to an order 0 page if order 3 page is not available.
> > > > > 
> > > > > I'm not sure if your workload will use the above code path though.    
> > > > 
> > > > TL;DR: this is order-0 pages (code-walk through proof below)
> > > > 
> > > > To Aaron, the network stack *can* call __free_pages_ok() with order-0
> > > > pages, via:
> > > > 
> > > > static void skb_free_head(struct sk_buff *skb)
> > > > {
> > > > 	unsigned char *head = skb->head;
> > > > 
> > > > 	if (skb->head_frag)
> > > > 		skb_free_frag(head);
> > > > 	else
> > > > 		kfree(head);
> > > > }
> > > > 
> > > > static inline void skb_free_frag(void *addr)
> > > > {
> > > > 	page_frag_free(addr);
> > > > }
> > > > 
> > > > /*
> > > >  * Frees a page fragment allocated out of either a compound or order 0 page.
> > > >  */
> > > > void page_frag_free(void *addr)
> > > > {
> > > > 	struct page *page = virt_to_head_page(addr);
> > > > 
> > > > 	if (unlikely(put_page_testzero(page)))
> > > > 		__free_pages_ok(page, compound_order(page));
> > > > }
> > > > EXPORT_SYMBOL(page_frag_free);    
> > > 
> > > I think here is a problem - order 0 pages are freed directly to buddy,
> > > bypassing per-cpu-pages. This might be the reason lock contention
> > > appeared on free path.   
> > 
> > OMG - you just found a significant issue with the network stacks
> > interaction with the page allocator!  This explains why I could not get
> > the PCP (Per-Cpu-Pages) system to have good performance, in my
> > performance networking benchmarks. As we are basically only using the
> > alloc side of PCP, and not the free side.  
> 
> Exactly.
> 
> >  We have spent years adding different driver level recycle tricks to
> > avoid this code path getting activated, exactly because it is rather
> > slow and problematic that we hit this zone->lock.  
> 
> I can see when this code path is hit, it causes unnecessary taking of
> zone lock for order-0 pages and cause lock contention.
> 
> >   
> > > Can someone apply below diff and see if lock contention is gone?  
> > 
> > I have also applied and tested this patch, and yes the lock contention
> > is gone.  As mentioned is it rather difficult to hit this code path, as
> > the driver page recycle mechanism tries to hide/avoid it, but mlx5 +
> > page_pool + CPU-map recycling have a known weakness that bypass the
> > driver page recycle scheme (that I've not fixed yet).  I observed a 7%
> > speedup for this micro benchmark.  
> 
> Good to know this, I will prepare a formal patch.

I wonder if this code is still missing something. I was looking at
using the put_devmap_managed_page() infrastructure, but I realized that
page_frag_free() is also skipping this code path.  I guess I can add
it later when I show/prove (performance wise) that this is a good idea
(as we currently don't have any users).


> > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > > index e2ef1c17942f..65c0ae13215a 100644
> > > --- a/mm/page_alloc.c
> > > +++ b/mm/page_alloc.c
> > > @@ -4554,8 +4554,14 @@ void page_frag_free(void *addr)
> > >  {
> > >  	struct page *page = virt_to_head_page(addr);
> > >  
> > > -	if (unlikely(put_page_testzero(page)))
> > > -		__free_pages_ok(page, compound_order(page));
> > > +	if (unlikely(put_page_testzero(page))) {
> > > +		unsigned int order = compound_order(page);
> > > +
> > > +		if (order == 0)
> > > +			free_unref_page(page);
> > > +		else
> > > +			__free_pages_ok(page, order);
> > > +	}
> > >  }
> > >  EXPORT_SYMBOL(page_frag_free);  
> > 
> > Thank you Aaron for spotting this!!!  
> 
> Which is impossible without your analysis :-)



-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [PATCH 5/5] b43: Drop internal cordic algorithm implementation
From: Arend van Spriel @ 2018-11-05  9:11 UTC (permalink / raw)
  To: Kalle Valo, Priit Laes
  Cc: linux-kernel, David S. Miller, linux-wireless, b43-dev, netdev
In-Reply-To: <87y3a7gaag.fsf@codeaurora.org>

On 11/5/2018 10:09 AM, Kalle Valo wrote:
> Priit Laes <plaes@plaes.org> writes:
>
>> Signed-off-by: Priit Laes <plaes@plaes.org>
>
> No empty commit logs, please.
>
> And IMHO you could fold patch 5 into patch 4.

Similarly 2 and 3.

Regards,
Arend

^ permalink raw reply

* Re: [PATCH 0/5] VSOCK: support mergeable rx buffer in vhost-vsock
From: Jason Wang @ 2018-11-05  9:21 UTC (permalink / raw)
  To: jiangyiwen, stefanha; +Cc: netdev, kvm, virtualization
In-Reply-To: <5BDFF49C.3040603@huawei.com>


On 2018/11/5 下午3:43, jiangyiwen wrote:
> Now vsock only supports sending/receiving small packets, so it can't
> achieve high performance. As previously discussed with Jason Wang, I
> revisited the vhost-net idea of mergeable rx buffers and implemented
> mergeable rx buffers in vhost-vsock. This allows a big packet to be
> scattered into different buffers and improves performance significantly.
>
> I write a tool to test the vhost-vsock performance, mainly send big
> packet(64K) included guest->Host and Host->Guest. The result as
> follows:
>
> Before performance:
>                Single socket            Multiple sockets(Max Bandwidth)
> Guest->Host   ~400MB/s                 ~480MB/s
> Host->Guest   ~1450MB/s                ~1600MB/s
>
> After performance:
>                Single socket            Multiple sockets(Max Bandwidth)
> Guest->Host   ~1700MB/s                ~2900MB/s
> Host->Guest   ~1700MB/s                ~2900MB/s
>
>  From the test results, the performance is improved obviously, and guest
> memory will not be wasted.


Hi:

Thanks for the patches and the numbers are really impressive.

But instead of duplicating code between vsock and net, I was considering
using virtio-net as a transport for vsock. Then we would get all the
existing features like batching, mergeable rx buffers and multiqueue.
Would you consider this idea? Thoughts?


>
> ---
>
> Yiwen Jiang (5):
>    VSOCK: support fill mergeable rx buffer in guest
>    VSOCK: support fill data to mergeable rx buffer in host
>    VSOCK: support receive mergeable rx buffer in guest
>    VSOCK: modify default rx buf size to improve performance
>    VSOCK: batch sending rx buffer to increase bandwidth
>
>   drivers/vhost/vsock.c                   | 135 +++++++++++++++++++++++------
>   include/linux/virtio_vsock.h            |  15 +++-
>   include/uapi/linux/virtio_vsock.h       |   5 ++
>   net/vmw_vsock/virtio_transport.c        | 147 ++++++++++++++++++++++++++------
>   net/vmw_vsock/virtio_transport_common.c |  59 +++++++++++--
>   5 files changed, 300 insertions(+), 61 deletions(-)
>

^ permalink raw reply

* [PATCH v2 2/3] brcmsmac: Use cordic-related macros from common cordic library
From: Priit Laes @ 2018-11-05 19:37 UTC (permalink / raw)
  To: linux-kernel
  Cc: Arend van Spriel, Franky Lin, Hante Meuleman, Chi-Hsien Lin,
	Wright Feng, Kalle Valo, David S. Miller, linux-wireless,
	brcm80211-dev-list.pdl, brcm80211-dev-list, netdev
In-Reply-To: <cover.9ec98292c595a111ae78f31383ff8618c9cb04da.1541446422.git-series.plaes@plaes.org>

The current driver includes a macro that is available from the general
cordic library. Use that and drop the unused duplicate and unneeded
internal definitions.

Signed-off-by: Priit Laes <plaes@plaes.org>

---
v2: Use single patch instead of change/removal patches.
---
 drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_int.h | 7 +-------
 drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_lcn.c | 4 ++--
 drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_n.c   | 4 ++--
 3 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_int.h b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_int.h
index 4960f7d..e9e8337 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_int.h
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_int.h
@@ -220,13 +220,6 @@ enum phy_cal_mode {
 #define BB_MULT_MASK		0x0000ffff
 #define BB_MULT_VALID_MASK	0x80000000
 
-#define CORDIC_AG	39797
-#define	CORDIC_NI	18
-#define	FIXED(X)	((s32)((X) << 16))
-
-#define	FLOAT(X) \
-	(((X) >= 0) ? ((((X) >> 15) + 1) >> 1) : -((((-(X)) >> 15) + 1) >> 1))
-
 #define PHY_CHAIN_TX_DISABLE_TEMP	115
 #define PHY_HYSTERESIS_DELTATEMP	5
 
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_lcn.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_lcn.c
index 9fb0d9f..e78a93a 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_lcn.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_lcn.c
@@ -3447,8 +3447,8 @@ wlc_lcnphy_start_tx_tone(struct brcms_phy *pi, s32 f_kHz, u16 max_val,
 
 		theta += rot;
 
-		i_samp = (u16) (FLOAT(tone_samp.i * max_val) & 0x3ff);
-		q_samp = (u16) (FLOAT(tone_samp.q * max_val) & 0x3ff);
+		i_samp = (u16)(CORDIC_FLOAT(tone_samp.i * max_val) & 0x3ff);
+		q_samp = (u16)(CORDIC_FLOAT(tone_samp.q * max_val) & 0x3ff);
 		data_buf[t] = (i_samp << 10) | q_samp;
 	}
 
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_n.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_n.c
index a57f271..f4f5e90 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_n.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_n.c
@@ -23089,8 +23089,8 @@ wlc_phy_gen_load_samples_nphy(struct brcms_phy *pi, u32 f_kHz, u16 max_val,
 
 		theta += rot;
 
-		tone_buf[t].q = (s32) FLOAT(tone_buf[t].q * max_val);
-		tone_buf[t].i = (s32) FLOAT(tone_buf[t].i * max_val);
+		tone_buf[t].q = (s32)CORDIC_FLOAT(tone_buf[t].q * max_val);
+		tone_buf[t].i = (s32)CORDIC_FLOAT(tone_buf[t].i * max_val);
 	}
 
 	wlc_phy_loadsampletable_nphy(pi, tone_buf, num_samps);
-- 
git-series 0.9.1

^ permalink raw reply related

* [PATCH v2 3/3] b43: Use cordic algorithm from kernel library
From: Priit Laes @ 2018-11-05 19:37 UTC (permalink / raw)
  To: linux-kernel; +Cc: Kalle Valo, David S. Miller, linux-wireless, b43-dev, netdev
In-Reply-To: <cover.9ec98292c595a111ae78f31383ff8618c9cb04da.1541446422.git-series.plaes@plaes.org>

The kernel library has a common cordic algorithm which is identical
to the internally implemented one, so use it and drop the duplicate
implementation.

Signed-off-by: Priit Laes <plaes@plaes.org>

---
v2: Merge the update/removal patches into single patch.
---
 drivers/net/wireless/broadcom/b43/Kconfig      |  1 +-
 drivers/net/wireless/broadcom/b43/phy_common.c | 47 +-------------------
 drivers/net/wireless/broadcom/b43/phy_common.h |  9 +----
 drivers/net/wireless/broadcom/b43/phy_lp.c     | 13 ++---
 drivers/net/wireless/broadcom/b43/phy_n.c      | 13 ++---
 5 files changed, 15 insertions(+), 68 deletions(-)

diff --git a/drivers/net/wireless/broadcom/b43/Kconfig b/drivers/net/wireless/broadcom/b43/Kconfig
index fba8560..3e41457 100644
--- a/drivers/net/wireless/broadcom/b43/Kconfig
+++ b/drivers/net/wireless/broadcom/b43/Kconfig
@@ -4,6 +4,7 @@ config B43
 	select BCMA if B43_BCMA
 	select SSB if B43_SSB
 	select FW_LOADER
+	select CORDIC
 	---help---
 	  b43 is a driver for the Broadcom 43xx series wireless devices.
 
diff --git a/drivers/net/wireless/broadcom/b43/phy_common.c b/drivers/net/wireless/broadcom/b43/phy_common.c
index 85f2ca9..98c4fa5 100644
--- a/drivers/net/wireless/broadcom/b43/phy_common.c
+++ b/drivers/net/wireless/broadcom/b43/phy_common.c
@@ -604,50 +604,3 @@ void b43_phy_force_clock(struct b43_wldev *dev, bool force)
 #endif
 	}
 }
-
-/* http://bcm-v4.sipsolutions.net/802.11/PHY/Cordic */
-struct b43_c32 b43_cordic(int theta)
-{
-	static const u32 arctg[] = {
-		2949120, 1740967, 919879, 466945, 234379, 117304,
-		  58666,   29335,  14668,   7334,   3667,   1833,
-		    917,     458,    229,    115,     57,     29,
-	};
-	u8 i;
-	s32 tmp;
-	s8 signx = 1;
-	u32 angle = 0;
-	struct b43_c32 ret = { .i = 39797, .q = 0, };
-
-	while (theta > (180 << 16))
-		theta -= (360 << 16);
-	while (theta < -(180 << 16))
-		theta += (360 << 16);
-
-	if (theta > (90 << 16)) {
-		theta -= (180 << 16);
-		signx = -1;
-	} else if (theta < -(90 << 16)) {
-		theta += (180 << 16);
-		signx = -1;
-	}
-
-	for (i = 0; i <= 17; i++) {
-		if (theta > angle) {
-			tmp = ret.i - (ret.q >> i);
-			ret.q += ret.i >> i;
-			ret.i = tmp;
-			angle += arctg[i];
-		} else {
-			tmp = ret.i + (ret.q >> i);
-			ret.q -= ret.i >> i;
-			ret.i = tmp;
-			angle -= arctg[i];
-		}
-	}
-
-	ret.i *= signx;
-	ret.q *= signx;
-
-	return ret;
-}
diff --git a/drivers/net/wireless/broadcom/b43/phy_common.h b/drivers/net/wireless/broadcom/b43/phy_common.h
index 57a1ad8..4213cac 100644
--- a/drivers/net/wireless/broadcom/b43/phy_common.h
+++ b/drivers/net/wireless/broadcom/b43/phy_common.h
@@ -7,13 +7,6 @@
 
 struct b43_wldev;
 
-/* Complex number using 2 32-bit signed integers */
-struct b43_c32 { s32 i, q; };
-
-#define CORDIC_CONVERT(value)	(((value) >= 0) ? \
-				 ((((value) >> 15) + 1) >> 1) : \
-				 -((((-(value)) >> 15) + 1) >> 1))
-
 /* PHY register routing bits */
 #define B43_PHYROUTE			0x0C00 /* PHY register routing bits mask */
 #define  B43_PHYROUTE_BASE		0x0000 /* Base registers */
@@ -450,6 +443,4 @@ bool b43_is_40mhz(struct b43_wldev *dev);
 
 void b43_phy_force_clock(struct b43_wldev *dev, bool force);
 
-struct b43_c32 b43_cordic(int theta);
-
 #endif /* LINUX_B43_PHY_COMMON_H_ */
diff --git a/drivers/net/wireless/broadcom/b43/phy_lp.c b/drivers/net/wireless/broadcom/b43/phy_lp.c
index 6922cbb..1718e3b 100644
--- a/drivers/net/wireless/broadcom/b43/phy_lp.c
+++ b/drivers/net/wireless/broadcom/b43/phy_lp.c
@@ -23,6 +23,7 @@
 
 */
 
+#include <linux/cordic.h>
 #include <linux/slab.h>
 
 #include "b43.h"
@@ -1780,9 +1781,9 @@ static void lpphy_start_tx_tone(struct b43_wldev *dev, s32 freq, u16 max)
 {
 	struct b43_phy_lp *lpphy = dev->phy.lp;
 	u16 buf[64];
-	int i, samples = 0, angle = 0;
+	int i, samples = 0, theta = 0;
 	int rotation = (((36 * freq) / 20) << 16) / 100;
-	struct b43_c32 sample;
+	struct cordic_iq sample;
 
 	lpphy->tx_tone_freq = freq;
 
@@ -1798,10 +1799,10 @@ static void lpphy_start_tx_tone(struct b43_wldev *dev, s32 freq, u16 max)
 	}
 
 	for (i = 0; i < samples; i++) {
-		sample = b43_cordic(angle);
-		angle += rotation;
-		buf[i] = CORDIC_CONVERT((sample.i * max) & 0xFF) << 8;
-		buf[i] |= CORDIC_CONVERT((sample.q * max) & 0xFF);
+		sample = cordic_calc_iq(theta);
+		theta += rotation;
+		buf[i] = CORDIC_FLOAT((sample.i * max) & 0xFF) << 8;
+		buf[i] |= CORDIC_FLOAT((sample.q * max) & 0xFF);
 	}
 
 	b43_lptab_write_bulk(dev, B43_LPTAB16(5, 0), samples, buf);
diff --git a/drivers/net/wireless/broadcom/b43/phy_n.c b/drivers/net/wireless/broadcom/b43/phy_n.c
index 44ab080..1f9378a 100644
--- a/drivers/net/wireless/broadcom/b43/phy_n.c
+++ b/drivers/net/wireless/broadcom/b43/phy_n.c
@@ -23,6 +23,7 @@
 
 */
 
+#include <linux/cordic.h>
 #include <linux/delay.h>
 #include <linux/slab.h>
 #include <linux/types.h>
@@ -1513,7 +1514,7 @@ static void b43_radio_init2055(struct b43_wldev *dev)
 
 /* http://bcm-v4.sipsolutions.net/802.11/PHY/N/LoadSampleTable */
 static int b43_nphy_load_samples(struct b43_wldev *dev,
-					struct b43_c32 *samples, u16 len) {
+					struct cordic_iq *samples, u16 len) {
 	struct b43_phy_n *nphy = dev->phy.n;
 	u16 i;
 	u32 *data;
@@ -1544,7 +1545,7 @@ static u16 b43_nphy_gen_load_samples(struct b43_wldev *dev, u32 freq, u16 max,
 {
 	int i;
 	u16 bw, len, rot, angle;
-	struct b43_c32 *samples;
+	struct cordic_iq *samples;
 
 	bw = b43_is_40mhz(dev) ? 40 : 20;
 	len = bw << 3;
@@ -1561,7 +1562,7 @@ static u16 b43_nphy_gen_load_samples(struct b43_wldev *dev, u32 freq, u16 max,
 		len = bw << 1;
 	}
 
-	samples = kcalloc(len, sizeof(struct b43_c32), GFP_KERNEL);
+	samples = kcalloc(len, sizeof(struct cordic_iq), GFP_KERNEL);
 	if (!samples) {
 		b43err(dev->wl, "allocation for samples generation failed\n");
 		return 0;
@@ -1570,10 +1571,10 @@ static u16 b43_nphy_gen_load_samples(struct b43_wldev *dev, u32 freq, u16 max,
 	angle = 0;
 
 	for (i = 0; i < len; i++) {
-		samples[i] = b43_cordic(angle);
+		samples[i] = cordic_calc_iq(angle);
 		angle += rot;
-		samples[i].q = CORDIC_CONVERT(samples[i].q * max);
-		samples[i].i = CORDIC_CONVERT(samples[i].i * max);
+		samples[i].q = CORDIC_FLOAT(samples[i].q * max);
+		samples[i].i = CORDIC_FLOAT(samples[i].i * max);
 	}
 
 	i = b43_nphy_load_samples(dev, samples, len);
-- 
git-series 0.9.1

^ permalink raw reply related

* Re: may I ignore "net/core/rtnetlink.c:3156:1: warning: the frame size of 1280 bytes ..."?
From: David Ahern @ 2018-11-05 19:57 UTC (permalink / raw)
  To: Toralf Förster, netdev; +Cc: Linux Kernel
In-Reply-To: <0cfb892c-b358-4bd6-f7c9-071a0039bc71@gmx.de>

On 11/4/18 9:14 AM, Toralf Förster wrote:
> compiling recent kernel (4.18.x, 4.19.1) at my server I do still get :
> 
> 
> net/core/rtnetlink.c: In function ‘rtnl_newlink’:
> net/core/rtnetlink.c:3156:1: warning: the frame size of 1280 bytes is larger than 1024 bytes [-Wframe-larger-than=]
> 
> 
> with "gcc version 7.3.0 (Gentoo Hardened 7.3.0-r3 p1.4) " and do wonder whether it is safe to ignore it?
> 
> 

I believe the warning is coming from this part of rtnl_newlink():

        if (1) {
                struct nlattr *attr[RTNL_MAX_TYPE + 1];
                struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1];
                struct nlattr **data = NULL;
                struct nlattr **slave_data = NULL;
                struct net *dest_net, *link_net = NULL;

The heavy hitters are:
#define RTNL_MAX_TYPE           49
#define RTNL_SLAVE_MAX_TYPE     36

attr and slave_attr would amount to 696 bytes of that 1280. The earlier
defined:

struct nlattr *tb[IFLA_MAX+1];

Would be another 416, so those 3 are 1112 bytes of the warning.

I have been using CONFIG_FRAME_WARN=2048 for a while without a problem.

^ permalink raw reply

* Re: [PATCH] net: move ‘__zerocopy_sg_from_iter’ prototype to header file <linux/skbuff.h>
From: Mathieu Malaterre @ 2018-11-05 20:20 UTC (permalink / raw)
  To: David S. Miller; +Cc: LKML, netdev
In-Reply-To: <20181102.233357.1129560140046922258.davem@davemloft.net>

On Sat, Nov 3, 2018 at 7:34 AM David Miller <davem@davemloft.net> wrote:
>
> From: Mathieu Malaterre <malat@debian.org>
> Date: Wed, 31 Oct 2018 12:34:59 +0100
>
> > This makes it clear the function is part of the API. Also this will
> > remove a warning triggered at W=1:
> >
> >   net/core/datagram.c:581:5: warning: no previous prototype for ‘__zerocopy_sg_from_iter’ [-Wmissing-prototypes]
> >
> > Signed-off-by: Mathieu Malaterre <malat@debian.org>
>
> It's not part of the "API", and it shouldn't even be exported to
> modules.
>
> Only net/core/skbuff.c calls it, and that is never modular.

OK. I got confused with the EXPORT_SYMBOL(). I'll re-send moving
__zerocopy_sg_from_iter to skbuff.c.

Thanks

^ permalink raw reply

* CFS for Netdev 0x13 open!
From: Jamal Hadi Salim @ 2018-11-05 20:42 UTC (permalink / raw)
  To: people
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	netfilter-devel-u79uwXL29TY76Z2rM5mHXA, linux-wireless,
	lwn-T1hC0tSOHrs

We are pleased to announce the opening of call for submissions(CFS)
for Netdev 0x13.
For overview of topics, submissions and requirements please visit:
https://netdevconf.org/0x13/submit-proposal.html

Sessions are selected via a blind review process carried out by
the Program Committee and Shepherd. Please refer to:
https://www.netdevconf.org/0x13/pc_review.html

Important dates:
Closing of CFS: Tue, January 15, 2019.
Acceptance or rejection notification by: Fri, January 18, 2019
Conference dates:  Wed, March 20 to Fri, March 22.

cheers,
jamal

^ permalink raw reply

* SCTP on RH 5.7
From: David Laight @ 2018-11-05 11:48 UTC (permalink / raw)
  To: netdev@vger.kernel.org

Why do our customers insist on trying to use SCTP on RH 5.7 with its
ancient 2.6.18 kernel?
Not surprising they are getting issues!

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox