Netdev List
 help / color / mirror / Atom feed
* Re: Netlink mmap tx security?
From: David Miller @ 2014-10-14 20:00 UTC (permalink / raw)
  To: luto; +Cc: torvalds, kaber, netdev
In-Reply-To: <CALCETrUYBed_TBacxoDPMphM5y=iqxho2isrDruOQi=pmK2yoQ@mail.gmail.com>

From: Andy Lutomirski <luto@amacapital.net>
Date: Tue, 14 Oct 2014 12:33:43 -0700

> For full honesty, there is now the machinery developed for memfd
> sealing, but I can't imagine that this is ever faster than just
> copying the buffer.

I don't have much motivation to even check if it's a worthwhile
pursuit at this point.  

Someone who wants to can :-)

> I think that the NETLINK_SKB_TX declaration in include/linux/netlink.h
> should probably go, too.  And there's the last parameter to
> netlink_set_ring, too, and possibly even the nlk->tx_ring struct
> itself.

Agreed on all counts, here is the new patch:

====================
[PATCH] netlink: Remove TX mmap support.

There is no reasonable manner in which to absolutely make sure that another
thread of control cannot write to the pages in the mmap()'d area and thus
make sure that netlink messages do not change underneath us after we've
performed verifications.

Reported-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h  |   5 +-
 net/netlink/af_netlink.c | 161 +++++------------------------------------------
 net/netlink/af_netlink.h |   1 -
 3 files changed, 16 insertions(+), 151 deletions(-)

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 9e572da..57080a9 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -17,9 +17,8 @@ static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb)
 
 enum netlink_skb_flags {
 	NETLINK_SKB_MMAPED	= 0x1,	/* Packet data is mmaped */
-	NETLINK_SKB_TX		= 0x2,	/* Packet was sent by userspace */
-	NETLINK_SKB_DELIVERED	= 0x4,	/* Packet was delivered */
-	NETLINK_SKB_DST		= 0x8,	/* Dst set in sendto or sendmsg */
+	NETLINK_SKB_DELIVERED	= 0x2,	/* Packet was delivered */
+	NETLINK_SKB_DST		= 0x4,	/* Dst set in sendto or sendmsg */
 };
 
 struct netlink_skb_parms {
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index c416725..07ef0c9 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -289,11 +289,6 @@ static bool netlink_rx_is_mmaped(struct sock *sk)
 	return nlk_sk(sk)->rx_ring.pg_vec != NULL;
 }
 
-static bool netlink_tx_is_mmaped(struct sock *sk)
-{
-	return nlk_sk(sk)->tx_ring.pg_vec != NULL;
-}
-
 static __pure struct page *pgvec_to_page(const void *addr)
 {
 	if (is_vmalloc_addr(addr))
@@ -359,7 +354,7 @@ err1:
 }
 
 static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req,
-			    bool closing, bool tx_ring)
+			    bool closing)
 {
 	struct netlink_sock *nlk = nlk_sk(sk);
 	struct netlink_ring *ring;
@@ -368,8 +363,8 @@ static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req,
 	unsigned int order = 0;
 	int err;
 
-	ring  = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
-	queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
+	ring  = &nlk->rx_ring;
+	queue = &sk->sk_receive_queue;
 
 	if (!closing) {
 		if (atomic_read(&nlk->mapped))
@@ -476,11 +471,9 @@ static int netlink_mmap(struct file *file, struct socket *sock,
 	mutex_lock(&nlk->pg_vec_lock);
 
 	expected = 0;
-	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
-		if (ring->pg_vec == NULL)
-			continue;
+	ring = &nlk->rx_ring;
+	if (ring->pg_vec)
 		expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE;
-	}
 
 	if (expected == 0)
 		goto out;
@@ -490,10 +483,8 @@ static int netlink_mmap(struct file *file, struct socket *sock,
 		goto out;
 
 	start = vma->vm_start;
-	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
-		if (ring->pg_vec == NULL)
-			continue;
-
+	ring = &nlk->rx_ring;
+	if (ring->pg_vec) {
 		for (i = 0; i < ring->pg_vec_len; i++) {
 			struct page *page;
 			void *kaddr = ring->pg_vec[i];
@@ -662,13 +653,6 @@ static unsigned int netlink_poll(struct file *file, struct socket *sock,
 	}
 	spin_unlock_bh(&sk->sk_receive_queue.lock);
 
-	spin_lock_bh(&sk->sk_write_queue.lock);
-	if (nlk->tx_ring.pg_vec) {
-		if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED))
-			mask |= POLLOUT | POLLWRNORM;
-	}
-	spin_unlock_bh(&sk->sk_write_queue.lock);
-
 	return mask;
 }
 
@@ -698,104 +682,6 @@ static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk,
 	NETLINK_CB(skb).sk = sk;
 }
 
-static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg,
-				u32 dst_portid, u32 dst_group,
-				struct sock_iocb *siocb)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct netlink_ring *ring;
-	struct nl_mmap_hdr *hdr;
-	struct sk_buff *skb;
-	unsigned int maxlen;
-	bool excl = true;
-	int err = 0, len = 0;
-
-	/* Netlink messages are validated by the receiver before processing.
-	 * In order to avoid userspace changing the contents of the message
-	 * after validation, the socket and the ring may only be used by a
-	 * single process, otherwise we fall back to copying.
-	 */
-	if (atomic_long_read(&sk->sk_socket->file->f_count) > 2 ||
-	    atomic_read(&nlk->mapped) > 1)
-		excl = false;
-
-	mutex_lock(&nlk->pg_vec_lock);
-
-	ring   = &nlk->tx_ring;
-	maxlen = ring->frame_size - NL_MMAP_HDRLEN;
-
-	do {
-		hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID);
-		if (hdr == NULL) {
-			if (!(msg->msg_flags & MSG_DONTWAIT) &&
-			    atomic_read(&nlk->tx_ring.pending))
-				schedule();
-			continue;
-		}
-		if (hdr->nm_len > maxlen) {
-			err = -EINVAL;
-			goto out;
-		}
-
-		netlink_frame_flush_dcache(hdr);
-
-		if (likely(dst_portid == 0 && dst_group == 0 && excl)) {
-			skb = alloc_skb_head(GFP_KERNEL);
-			if (skb == NULL) {
-				err = -ENOBUFS;
-				goto out;
-			}
-			sock_hold(sk);
-			netlink_ring_setup_skb(skb, sk, ring, hdr);
-			NETLINK_CB(skb).flags |= NETLINK_SKB_TX;
-			__skb_put(skb, hdr->nm_len);
-			netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
-			atomic_inc(&ring->pending);
-		} else {
-			skb = alloc_skb(hdr->nm_len, GFP_KERNEL);
-			if (skb == NULL) {
-				err = -ENOBUFS;
-				goto out;
-			}
-			__skb_put(skb, hdr->nm_len);
-			memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, hdr->nm_len);
-			netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
-		}
-
-		netlink_increment_head(ring);
-
-		NETLINK_CB(skb).portid	  = nlk->portid;
-		NETLINK_CB(skb).dst_group = dst_group;
-		NETLINK_CB(skb).creds	  = siocb->scm->creds;
-
-		err = security_netlink_send(sk, skb);
-		if (err) {
-			kfree_skb(skb);
-			goto out;
-		}
-
-		if (unlikely(dst_group)) {
-			atomic_inc(&skb->users);
-			netlink_broadcast(sk, skb, dst_portid, dst_group,
-					  GFP_KERNEL);
-		}
-		err = netlink_unicast(sk, skb, dst_portid,
-				      msg->msg_flags & MSG_DONTWAIT);
-		if (err < 0)
-			goto out;
-		len += err;
-
-	} while (hdr != NULL ||
-		 (!(msg->msg_flags & MSG_DONTWAIT) &&
-		  atomic_read(&nlk->tx_ring.pending)));
-
-	if (len > 0)
-		err = len;
-out:
-	mutex_unlock(&nlk->pg_vec_lock);
-	return err;
-}
-
 static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb)
 {
 	struct nl_mmap_hdr *hdr;
@@ -842,10 +728,8 @@ static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb)
 #else /* CONFIG_NETLINK_MMAP */
 #define netlink_skb_is_mmaped(skb)	false
 #define netlink_rx_is_mmaped(sk)	false
-#define netlink_tx_is_mmaped(sk)	false
 #define netlink_mmap			sock_no_mmap
 #define netlink_poll			datagram_poll
-#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, siocb)	0
 #endif /* CONFIG_NETLINK_MMAP */
 
 static void netlink_skb_destructor(struct sk_buff *skb)
@@ -864,16 +748,11 @@ static void netlink_skb_destructor(struct sk_buff *skb)
 		hdr = netlink_mmap_hdr(skb);
 		sk = NETLINK_CB(skb).sk;
 
-		if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) {
-			netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
-			ring = &nlk_sk(sk)->tx_ring;
-		} else {
-			if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
-				hdr->nm_len = 0;
-				netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
-			}
-			ring = &nlk_sk(sk)->rx_ring;
+		if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
+			hdr->nm_len = 0;
+			netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
 		}
+		ring = &nlk_sk(sk)->rx_ring;
 
 		WARN_ON(atomic_read(&ring->pending) == 0);
 		atomic_dec(&ring->pending);
@@ -921,10 +800,7 @@ static void netlink_sock_destruct(struct sock *sk)
 
 		memset(&req, 0, sizeof(req));
 		if (nlk->rx_ring.pg_vec)
-			netlink_set_ring(sk, &req, true, false);
-		memset(&req, 0, sizeof(req));
-		if (nlk->tx_ring.pg_vec)
-			netlink_set_ring(sk, &req, true, true);
+			netlink_set_ring(sk, &req, true);
 	}
 #endif /* CONFIG_NETLINK_MMAP */
 
@@ -2165,8 +2041,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 		err = 0;
 		break;
 #ifdef CONFIG_NETLINK_MMAP
-	case NETLINK_RX_RING:
-	case NETLINK_TX_RING: {
+	case NETLINK_RX_RING: {
 		struct nl_mmap_req req;
 
 		/* Rings might consume more memory than queue limits, require
@@ -2178,8 +2053,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 			return -EINVAL;
 		if (copy_from_user(&req, optval, sizeof(req)))
 			return -EFAULT;
-		err = netlink_set_ring(sk, &req, false,
-				       optname == NETLINK_TX_RING);
+		err = netlink_set_ring(sk, &req, false);
 		break;
 	}
 #endif /* CONFIG_NETLINK_MMAP */
@@ -2295,13 +2169,6 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
 			goto out;
 	}
 
-	if (netlink_tx_is_mmaped(sk) &&
-	    msg->msg_iov->iov_base == NULL) {
-		err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group,
-					   siocb);
-		goto out;
-	}
-
 	err = -EMSGSIZE;
 	if (len > sk->sk_sndbuf - 32)
 		goto out;
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index b20a173..4741b88 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -45,7 +45,6 @@ struct netlink_sock {
 #ifdef CONFIG_NETLINK_MMAP
 	struct mutex		pg_vec_lock;
 	struct netlink_ring	rx_ring;
-	struct netlink_ring	tx_ring;
 	atomic_t		mapped;
 #endif /* CONFIG_NETLINK_MMAP */
 
-- 
1.7.11.7

^ permalink raw reply related

* Re: [PATCH] dsa: mv88e6171: Fix tag_protocol check
From: Andrew Lunn @ 2014-10-14 19:59 UTC (permalink / raw)
  To: Guenter Roeck
  Cc: David S. Miller, Florian Fainelli, netdev, linux-kernel,
	Andrew Lunn
In-Reply-To: <1413310864-16830-1-git-send-email-linux@roeck-us.net>

On Tue, Oct 14, 2014 at 11:21:04AM -0700, Guenter Roeck wrote:
> tag_protocol is now an enum, so drivers have to check against it.
> 
> Cc: Andrew Lunn <andrew@lunn.ch>
> Signed-off-by: Guenter Roeck <linux@roeck-us.net>

Acked-by: Andrew Lunn <andrew@lunn.ch>

Thanks for fixing this.

       Andrew

> ---
>  drivers/net/dsa/mv88e6171.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/dsa/mv88e6171.c b/drivers/net/dsa/mv88e6171.c
> index 6365e30..1020a7a 100644
> --- a/drivers/net/dsa/mv88e6171.c
> +++ b/drivers/net/dsa/mv88e6171.c
> @@ -206,7 +206,7 @@ static int mv88e6171_setup_port(struct dsa_switch *ds, int p)
>  	 */
>  	val = 0x0433;
>  	if (dsa_is_cpu_port(ds, p)) {
> -		if (ds->dst->tag_protocol == htons(ETH_P_EDSA))
> +		if (ds->dst->tag_protocol == DSA_TAG_PROTO_EDSA)
>  			val |= 0x3300;
>  		else
>  			val |= 0x0100;
> -- 
> 1.9.1
> 

^ permalink raw reply

* Re: [PATCH linux v3 1/1] fs/proc: use a rb tree for the directory entries
From: Eric W. Biederman @ 2014-10-14 19:56 UTC (permalink / raw)
  To: nicolas.dichtel
  Cc: netdev, linux-kernel, davem, akpm, adobriyan, rui.xiang, viro,
	oleg, gorcunov, kirill.shutemov, grant.likely, tytso,
	Linus Torvalds, Andrew Morton
In-Reply-To: <543BB42B.30505@6wind.com>

Nicolas Dichtel <nicolas.dichtel@6wind.com> writes:

> Le 07/10/2014 11:02, Nicolas Dichtel a écrit :
>> The current implementation for the directories in /proc is using a single
>> linked list. This is slow when handling directories with large numbers of
>> entries (eg netdevice-related entries when lots of tunnels are opened).
>>
>> This patch replaces this linked list by a red-black tree.
>>
>> Here are some numbers:
>>
>> dummy30000.batch contains 30 000 times 'link add type dummy'.
>>
>> Before the patch:
>> $ time ip -b dummy30000.batch
>> real	2m31.950s
>> user	0m0.440s
>> sys	2m21.440s
>> $ time rmmod dummy
>> real	1m35.764s
>> user	0m0.000s
>> sys	1m24.088s
>>
>> After the patch:
>> $ time ip -b dummy30000.batch
>> real	2m0.874s
>> user	0m0.448s
>> sys	1m49.720s
>> $ time rmmod dummy
>> real	1m13.988s
>> user	0m0.000s
>> sys	1m1.008s
>>
>> The idea of improving this part was suggested by
>> Thierry Herbelot <thierry.herbelot@6wind.com>.
>>
>> Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
>> Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
>> ---
>
> I'm not sure who is in charge of taking this patch. Should I resend it to
> someone else or is it already included in a tree?

There are a couple of things going on here.

This patch came out at the beginning of the merge window which is a time
when everything that was ready and well tested ahead of time gets
merged.

Your numbers don't look too bad, so I expect this code is ready to go
(although I am a smidge disappointed in the small size of the
performance improvement), my quick read through earlier did not show
anything scary.   Do you have any idea why going from O(N^2) algorithm
to a O(NlogN) algorithm showed such a small performance improvement with
30,000 entries?

Normally proc is looked at by a group of folks me, Alexey Dobriyan, and
Al Viro all sort of tag team taking care of the proc infrastructure with
(except for Al) Andrew Morton typically taking the patches and merging
them.

I am currently in the middle of a move so I don't have the time to carry
this change or do much of anything until I am settled again.

What I would recommend is verifying your patch works against v3.18-rc1
at the begginning of next week and sending the code to Andrew Morton.

Eric

^ permalink raw reply

* Re: [net PATCH 0/3] bug fixes for davinci_cpdma and cpsw drivers
From: David Miller @ 2014-10-14 19:35 UTC (permalink / raw)
  To: mugunthanvnm; +Cc: netdev
In-Reply-To: <1413219067-15328-1-git-send-email-mugunthanvnm@ti.com>

From: Mugunthan V N <mugunthanvnm@ti.com>
Date: Mon, 13 Oct 2014 22:21:04 +0530

> Mugunthan V N (3):
>   drivers: net: davinci_cpdma: remove kfree on objects allocated with
>     devm_* apis
>   drivers: net: davinci_cpdma: remove spinlock as SOFTIRQ-unsafe lock
>     order detected
>   drivers: net: cpsw: remove child devices while driver detach

Series applied, thanks.

^ permalink raw reply

* [Patch net] rds: avoid calling sock_kfree_s() on allocation failure
From: Cong Wang @ 2014-10-14 19:35 UTC (permalink / raw)
  To: netdev; +Cc: davem, rds-devel, Chien Yen, Stephen Hemminger, Cong Wang,
	Cong Wang

From: Cong Wang <cwang@twopensource.com>

It is okay to free a NULL pointer but not okay to mischarge the socket optmem
accounting. Compile test only.

Reported-by: rucsoftsec@gmail.com
Cc: Chien Yen <chien.yen@oracle.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Cong Wang <cwang@twopensource.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>

---
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 4e37c1c..40084d8 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -564,12 +564,12 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 
 	if (rs->rs_bound_addr == 0) {
 		ret = -ENOTCONN; /* XXX not a great errno */
-		goto out;
+		goto out_ret;
 	}
 
 	if (args->nr_local > UIO_MAXIOV) {
 		ret = -EMSGSIZE;
-		goto out;
+		goto out_ret;
 	}
 
 	/* Check whether to allocate the iovec area */
@@ -578,7 +578,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 		iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL);
 		if (!iovs) {
 			ret = -ENOMEM;
-			goto out;
+			goto out_ret;
 		}
 	}
 
@@ -696,6 +696,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 	if (iovs != iovstack)
 		sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size);
 	kfree(pages);
+out_ret:
 	if (ret)
 		rds_rdma_free_op(op);
 	else

^ permalink raw reply related

* Re: [PATCH net-next] tg3: Add skb->xmit_more support
From: David Miller @ 2014-10-14 19:34 UTC (permalink / raw)
  To: prashant; +Cc: netdev, dborkman, mchan
In-Reply-To: <1413217302-15396-1-git-send-email-prashant@broadcom.com>

From: Prashant Sreedharan <prashant@broadcom.com>
Date: Mon, 13 Oct 2014 09:21:42 -0700

> Ring TX doorbell only if xmit_more is not set or the queue is stopped.
> 
> Suggested-by: Daniel Borkmann <dborkman@redhat.com>
> Signed-off-by: Prashant Sreedharan <prashant@broadcom.com>
> Signed-off-by: Michael Chan <mchan@broadcom.com>

Applied, thanks.

^ permalink raw reply

* Re: Netlink mmap tx security?
From: Andy Lutomirski @ 2014-10-14 19:33 UTC (permalink / raw)
  To: David Miller; +Cc: Linus Torvalds, Patrick McHardy, Network Development
In-Reply-To: <20141014.151949.1967601568480255495.davem@davemloft.net>

On Tue, Oct 14, 2014 at 12:19 PM, David Miller <davem@davemloft.net> wrote:
> From: Andy Lutomirski <luto@amacapital.net>
> Date: Sat, 11 Oct 2014 15:29:17 -0700
>
>> On May 12, 2014 3:08 PM, "Andy Lutomirski" <luto@amacapital.net> wrote:
>>>
>>> [moving to netdev -- this is much lower impact than I thought, since
>>> you can't set up a netlink mmap ring without global CAP_NET_ADMIN]
>>
>> Did anything ever happen here?  Despite not being a privilege
>> escalation in the normal sense, it's still a bug, and it's still a
>> fairly easy bypass of module signatures.
>
> Andy, please review:
>
> ====================
> [PATCH] netlink: Remove TX mmap support.
>
> There is no reasonable manner in which to absolutely make sure that another
> thread of control cannot write to the pages in the mmap()'d area and thus
> make sure that netlink messages do not change underneath us after we've
> performed verifications.

For full honesty, there is now the machinery developed for memfd
sealing, but I can't imagine that this is ever faster than just
copying the buffer.

I think that the NETLINK_SKB_TX declaration in include/linux/netlink.h
should probably go, too.  And there's the last parameter to
netlink_set_ring, too, and possibly even the nlk->tx_ring struct
itself.

--Andy

>
> Reported-by: Andy Lutomirski <luto@amacapital.net>
> Signed-off-by: David S. Miller <davem@davemloft.net>
> ---
>  net/netlink/af_netlink.c | 135 ++---------------------------------------------
>  1 file changed, 5 insertions(+), 130 deletions(-)
>
> diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
> index c416725..771e6c0 100644
> --- a/net/netlink/af_netlink.c
> +++ b/net/netlink/af_netlink.c
> @@ -289,11 +289,6 @@ static bool netlink_rx_is_mmaped(struct sock *sk)
>         return nlk_sk(sk)->rx_ring.pg_vec != NULL;
>  }
>
> -static bool netlink_tx_is_mmaped(struct sock *sk)
> -{
> -       return nlk_sk(sk)->tx_ring.pg_vec != NULL;
> -}
> -
>  static __pure struct page *pgvec_to_page(const void *addr)
>  {
>         if (is_vmalloc_addr(addr))
> @@ -662,13 +657,6 @@ static unsigned int netlink_poll(struct file *file, struct socket *sock,
>         }
>         spin_unlock_bh(&sk->sk_receive_queue.lock);
>
> -       spin_lock_bh(&sk->sk_write_queue.lock);
> -       if (nlk->tx_ring.pg_vec) {
> -               if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED))
> -                       mask |= POLLOUT | POLLWRNORM;
> -       }
> -       spin_unlock_bh(&sk->sk_write_queue.lock);
> -
>         return mask;
>  }
>
> @@ -698,104 +686,6 @@ static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk,
>         NETLINK_CB(skb).sk = sk;
>  }
>
> -static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg,
> -                               u32 dst_portid, u32 dst_group,
> -                               struct sock_iocb *siocb)
> -{
> -       struct netlink_sock *nlk = nlk_sk(sk);
> -       struct netlink_ring *ring;
> -       struct nl_mmap_hdr *hdr;
> -       struct sk_buff *skb;
> -       unsigned int maxlen;
> -       bool excl = true;
> -       int err = 0, len = 0;
> -
> -       /* Netlink messages are validated by the receiver before processing.
> -        * In order to avoid userspace changing the contents of the message
> -        * after validation, the socket and the ring may only be used by a
> -        * single process, otherwise we fall back to copying.
> -        */
> -       if (atomic_long_read(&sk->sk_socket->file->f_count) > 2 ||
> -           atomic_read(&nlk->mapped) > 1)
> -               excl = false;
> -
> -       mutex_lock(&nlk->pg_vec_lock);
> -
> -       ring   = &nlk->tx_ring;
> -       maxlen = ring->frame_size - NL_MMAP_HDRLEN;
> -
> -       do {
> -               hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID);
> -               if (hdr == NULL) {
> -                       if (!(msg->msg_flags & MSG_DONTWAIT) &&
> -                           atomic_read(&nlk->tx_ring.pending))
> -                               schedule();
> -                       continue;
> -               }
> -               if (hdr->nm_len > maxlen) {
> -                       err = -EINVAL;
> -                       goto out;
> -               }
> -
> -               netlink_frame_flush_dcache(hdr);
> -
> -               if (likely(dst_portid == 0 && dst_group == 0 && excl)) {
> -                       skb = alloc_skb_head(GFP_KERNEL);
> -                       if (skb == NULL) {
> -                               err = -ENOBUFS;
> -                               goto out;
> -                       }
> -                       sock_hold(sk);
> -                       netlink_ring_setup_skb(skb, sk, ring, hdr);
> -                       NETLINK_CB(skb).flags |= NETLINK_SKB_TX;
> -                       __skb_put(skb, hdr->nm_len);
> -                       netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
> -                       atomic_inc(&ring->pending);
> -               } else {
> -                       skb = alloc_skb(hdr->nm_len, GFP_KERNEL);
> -                       if (skb == NULL) {
> -                               err = -ENOBUFS;
> -                               goto out;
> -                       }
> -                       __skb_put(skb, hdr->nm_len);
> -                       memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, hdr->nm_len);
> -                       netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
> -               }
> -
> -               netlink_increment_head(ring);
> -
> -               NETLINK_CB(skb).portid    = nlk->portid;
> -               NETLINK_CB(skb).dst_group = dst_group;
> -               NETLINK_CB(skb).creds     = siocb->scm->creds;
> -
> -               err = security_netlink_send(sk, skb);
> -               if (err) {
> -                       kfree_skb(skb);
> -                       goto out;
> -               }
> -
> -               if (unlikely(dst_group)) {
> -                       atomic_inc(&skb->users);
> -                       netlink_broadcast(sk, skb, dst_portid, dst_group,
> -                                         GFP_KERNEL);
> -               }
> -               err = netlink_unicast(sk, skb, dst_portid,
> -                                     msg->msg_flags & MSG_DONTWAIT);
> -               if (err < 0)
> -                       goto out;
> -               len += err;
> -
> -       } while (hdr != NULL ||
> -                (!(msg->msg_flags & MSG_DONTWAIT) &&
> -                 atomic_read(&nlk->tx_ring.pending)));
> -
> -       if (len > 0)
> -               err = len;
> -out:
> -       mutex_unlock(&nlk->pg_vec_lock);
> -       return err;
> -}
> -
>  static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb)
>  {
>         struct nl_mmap_hdr *hdr;
> @@ -842,10 +732,8 @@ static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb)
>  #else /* CONFIG_NETLINK_MMAP */
>  #define netlink_skb_is_mmaped(skb)     false
>  #define netlink_rx_is_mmaped(sk)       false
> -#define netlink_tx_is_mmaped(sk)       false
>  #define netlink_mmap                   sock_no_mmap
>  #define netlink_poll                   datagram_poll
> -#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, siocb)    0
>  #endif /* CONFIG_NETLINK_MMAP */
>
>  static void netlink_skb_destructor(struct sk_buff *skb)
> @@ -864,16 +752,11 @@ static void netlink_skb_destructor(struct sk_buff *skb)
>                 hdr = netlink_mmap_hdr(skb);
>                 sk = NETLINK_CB(skb).sk;
>
> -               if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) {
> -                       netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
> -                       ring = &nlk_sk(sk)->tx_ring;
> -               } else {
> -                       if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
> -                               hdr->nm_len = 0;
> -                               netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
> -                       }
> -                       ring = &nlk_sk(sk)->rx_ring;
> +               if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
> +                       hdr->nm_len = 0;
> +                       netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
>                 }
> +               ring = &nlk_sk(sk)->rx_ring;
>
>                 WARN_ON(atomic_read(&ring->pending) == 0);
>                 atomic_dec(&ring->pending);
> @@ -2165,8 +2048,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
>                 err = 0;
>                 break;
>  #ifdef CONFIG_NETLINK_MMAP
> -       case NETLINK_RX_RING:
> -       case NETLINK_TX_RING: {
> +       case NETLINK_RX_RING: {
>                 struct nl_mmap_req req;
>
>                 /* Rings might consume more memory than queue limits, require
> @@ -2295,13 +2177,6 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
>                         goto out;
>         }
>
> -       if (netlink_tx_is_mmaped(sk) &&
> -           msg->msg_iov->iov_base == NULL) {
> -               err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group,
> -                                          siocb);
> -               goto out;
> -       }
> -
>         err = -EMSGSIZE;
>         if (len > sk->sk_sndbuf - 32)
>                 goto out;
> --
> 1.7.11.7
>



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply

* Re: [patch net repost] ipv4: fix nexthop attlen check in fib_nh_match
From: David Miller @ 2014-10-14 19:33 UTC (permalink / raw)
  To: jiri; +Cc: netdev, kuznet, jmorris, yoshfuji, kaber, edumazet, tgraf
In-Reply-To: <1413210850-14456-1-git-send-email-jiri@resnulli.us>

From: Jiri Pirko <jiri@resnulli.us>
Date: Mon, 13 Oct 2014 16:34:10 +0200

> fib_nh_match does not match nexthops correctly. Example:
> 
> ip route add 172.16.10/24 nexthop via 192.168.122.12 dev eth0 \
>                           nexthop via 192.168.122.13 dev eth0
> ip route del 172.16.10/24 nexthop via 192.168.122.14 dev eth0 \
>                           nexthop via 192.168.122.15 dev eth0
> 
> Del command is successful and route is removed. After this patch
> applied, the route is correctly matched and result is:
> RTNETLINK answers: No such process
> 
> Please consider this for stable trees as well.
> 
> Fixes: 4e902c57417c4 ("[IPv4]: FIB configuration using struct fib_config")
> Signed-off-by: Jiri Pirko <jiri@resnulli.us>
> Acked-by: Eric Dumazet <edumazet@google.com>
> ---
> reposted with example (it was missing for some reason in the original post)

Applied and queued up for -stable, thanks Jiri!

^ permalink raw reply

* Re: Fwd: micrel: ksz8051 badly detected as ksz8031
From: Angelo Dureghello @ 2014-10-14 19:33 UTC (permalink / raw)
  To: Florian Fainelli, netdev@vger.kernel.org
In-Reply-To: <543D7507.3060008@gmail.com>

Hi Florian,

> On 10/14/2014 10:24 AM, Angelo Dureghello wrote:
>> Dear,
>>
>> have to apologize for the confusion, previous patch is not the proper fix,
>> since it is not solving completely the issue.
>>
>> And also, i mainly misunderstood the issue.
>>
>> The issue i am experiencing is :
>>
>> https://lkml.org/lkml/2013/9/18/259
>>
>> Mainly, i have Micrel chip marked KSZ8051(RNL), but the product Id in the
>> silicon is KSZ8031 and linux detects it as KSZ8031.
>> The attmept to mdio boot override register kill the Micrel functionality.
> Ok, so basically your bootloader does something that Linux does, and
> once Linux boots, it will reset the PHY to put it in a known state. If
> you can snoop the MDIO read/writes done in your bootloader environment,
> that might help narrow down the issue.
>
Bootloader is u-boot and seems it uses generic PHY setup, and so it works.

Linux at boot detects the phy and does a soft_reset. If the detection 
sets the
driver as for KSZ8031, ethernet/link will not work, becouse micrel.c uses
the incorrect config_init function, attempts to write to the bootstrap
override register, that can't be written for KSZ8051, and so puts the micrel
chip in a broken state.

The guy in this link (https://lkml.org/lkml/2013/9/18/259) seems are 
discussing a
better solution.

I patched my linux as below, but this is a fast fixup for me:

diff -rupN drivers/net/phy/phy_device.c 
../linux-3.17/drivers/net/phy/phy_device.c
--- drivers/net/phy/phy_device.c        2014-10-14 21:05:56.191117190 +0200
+++ ../linux-3.17/drivers/net/phy/phy_device.c  2014-10-05 
21:23:04.000000000 +0200
@@ -310,19 +310,6 @@ static int get_phy_id(struct mii_bus *bu

         *phy_id |= (phy_reg & 0xffff);

-       /*
-        * Angelo - Barix
-        * Micrel produced chips marked KSZ8051 but with KSZ8031 id code
-        * in the silicon. After getting crazy to understand why in 
recent kernel
-        * the ethenret was not workeing, i find it out.
-        *
-        * From the schematic, we assume to use KSZ8051
-        * I hardcode the fix here for ipam390.
-        */
-#if CONFIG_MACH_BARIX_IPAM390
-       if (*phy_id == 0x00221556) *phy_id = 0x00221550;
-#endif
-
         return 0;
  }

Regards
angelo

^ permalink raw reply

* Re: [PATCH net] tcp: TCP Small Queues and strange attractors
From: David Miller @ 2014-10-14 19:33 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1413206867.9362.100.camel@edumazet-glaptop2.roam.corp.google.com>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 13 Oct 2014 06:27:47 -0700

> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 8d4eac793700..4a7e97811d71 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -839,26 +839,38 @@ void tcp_wfree(struct sk_buff *skb)
>  {
 ...
>  		local_irq_restore(flags);
> -	} else {
> -		sock_wfree(skb);
> +		return;
>  	}
> +out:
> +	sk_free(sk);
>  }
>  

Why do we need to release the socket here?

^ permalink raw reply

* Re: [PATCH net] tcp: fix tcp_ack() performance problem
From: David Miller @ 2014-10-14 19:30 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, willemb, ncardwell, ycheng, vanj
In-Reply-To: <1413065849.9362.72.camel@edumazet-glaptop2.roam.corp.google.com>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sat, 11 Oct 2014 15:17:29 -0700

> From: Eric Dumazet <edumazet@google.com>
> 
> We worked hard to improve tcp_ack() performance, by not accessing
> skb_shinfo() in fast path (cd7d8498c9a5 tcp: change tcp_skb_pcount()
> location)
> 
> We still have one spurious access because of ACK timestamping,
> added in commit e1c8a607b281 ("net-timestamp: ACK timestamp for
> bytestreams")
> 
> By checking if sk_tsflags has SOF_TIMESTAMPING_TX_ACK set,
> we can avoid two cache line misses for the common case.
> 
> While we are at it, add two prefetchw() :
> 
> One in tcp_ack() to bring skb at the head of write queue.
> 
> One in tcp_clean_rtx_queue() loop to bring following skb,
> as we will delete skb from the write queue and dirty skb->next->prev.
> 
> Add a couple of [un]likely() clauses.
> 
> After this patch, tcp_ack() is no longer the most consuming
> function in tcp stack.
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Applied, thanks.

^ permalink raw reply

* Re: [PATCH linux v3 1/1] fs/proc: use a rb tree for the directory entries
From: David Miller @ 2014-10-14 19:30 UTC (permalink / raw)
  To: nicolas.dichtel
  Cc: netdev, linux-kernel, ebiederm, akpm, adobriyan, rui.xiang, viro,
	oleg, gorcunov, kirill.shutemov, grant.likely, tytso, torvalds
In-Reply-To: <543BB42B.30505@6wind.com>

From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Mon, 13 Oct 2014 13:14:51 +0200

> I'm not sure who is in charge of taking this patch. Should I resend
> it to someone else or is it already included in a tree?

Just want to make it clear that I don't intend to take this via
the networking tree.

^ permalink raw reply

* Re: Netlink mmap tx security?
From: David Miller @ 2014-10-14 19:19 UTC (permalink / raw)
  To: luto; +Cc: torvalds, kaber, netdev
In-Reply-To: <CALCETrWfQe5H2Ht7cjCQLfUw+XUcRvga_H93esaWpAp37=noZg@mail.gmail.com>

From: Andy Lutomirski <luto@amacapital.net>
Date: Sat, 11 Oct 2014 15:29:17 -0700

> On May 12, 2014 3:08 PM, "Andy Lutomirski" <luto@amacapital.net> wrote:
>>
>> [moving to netdev -- this is much lower impact than I thought, since
>> you can't set up a netlink mmap ring without global CAP_NET_ADMIN]
> 
> Did anything ever happen here?  Despite not being a privilege
> escalation in the normal sense, it's still a bug, and it's still a
> fairly easy bypass of module signatures.

Andy, please review:

====================
[PATCH] netlink: Remove TX mmap support.

There is no reasonable manner in which to absolutely make sure that another
thread of control cannot write to the pages in the mmap()'d area and thus
make sure that netlink messages do not change underneath us after we've
performed verifications.

Reported-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/af_netlink.c | 135 ++---------------------------------------------
 1 file changed, 5 insertions(+), 130 deletions(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index c416725..771e6c0 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -289,11 +289,6 @@ static bool netlink_rx_is_mmaped(struct sock *sk)
 	return nlk_sk(sk)->rx_ring.pg_vec != NULL;
 }
 
-static bool netlink_tx_is_mmaped(struct sock *sk)
-{
-	return nlk_sk(sk)->tx_ring.pg_vec != NULL;
-}
-
 static __pure struct page *pgvec_to_page(const void *addr)
 {
 	if (is_vmalloc_addr(addr))
@@ -662,13 +657,6 @@ static unsigned int netlink_poll(struct file *file, struct socket *sock,
 	}
 	spin_unlock_bh(&sk->sk_receive_queue.lock);
 
-	spin_lock_bh(&sk->sk_write_queue.lock);
-	if (nlk->tx_ring.pg_vec) {
-		if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED))
-			mask |= POLLOUT | POLLWRNORM;
-	}
-	spin_unlock_bh(&sk->sk_write_queue.lock);
-
 	return mask;
 }
 
@@ -698,104 +686,6 @@ static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk,
 	NETLINK_CB(skb).sk = sk;
 }
 
-static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg,
-				u32 dst_portid, u32 dst_group,
-				struct sock_iocb *siocb)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct netlink_ring *ring;
-	struct nl_mmap_hdr *hdr;
-	struct sk_buff *skb;
-	unsigned int maxlen;
-	bool excl = true;
-	int err = 0, len = 0;
-
-	/* Netlink messages are validated by the receiver before processing.
-	 * In order to avoid userspace changing the contents of the message
-	 * after validation, the socket and the ring may only be used by a
-	 * single process, otherwise we fall back to copying.
-	 */
-	if (atomic_long_read(&sk->sk_socket->file->f_count) > 2 ||
-	    atomic_read(&nlk->mapped) > 1)
-		excl = false;
-
-	mutex_lock(&nlk->pg_vec_lock);
-
-	ring   = &nlk->tx_ring;
-	maxlen = ring->frame_size - NL_MMAP_HDRLEN;
-
-	do {
-		hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID);
-		if (hdr == NULL) {
-			if (!(msg->msg_flags & MSG_DONTWAIT) &&
-			    atomic_read(&nlk->tx_ring.pending))
-				schedule();
-			continue;
-		}
-		if (hdr->nm_len > maxlen) {
-			err = -EINVAL;
-			goto out;
-		}
-
-		netlink_frame_flush_dcache(hdr);
-
-		if (likely(dst_portid == 0 && dst_group == 0 && excl)) {
-			skb = alloc_skb_head(GFP_KERNEL);
-			if (skb == NULL) {
-				err = -ENOBUFS;
-				goto out;
-			}
-			sock_hold(sk);
-			netlink_ring_setup_skb(skb, sk, ring, hdr);
-			NETLINK_CB(skb).flags |= NETLINK_SKB_TX;
-			__skb_put(skb, hdr->nm_len);
-			netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
-			atomic_inc(&ring->pending);
-		} else {
-			skb = alloc_skb(hdr->nm_len, GFP_KERNEL);
-			if (skb == NULL) {
-				err = -ENOBUFS;
-				goto out;
-			}
-			__skb_put(skb, hdr->nm_len);
-			memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, hdr->nm_len);
-			netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
-		}
-
-		netlink_increment_head(ring);
-
-		NETLINK_CB(skb).portid	  = nlk->portid;
-		NETLINK_CB(skb).dst_group = dst_group;
-		NETLINK_CB(skb).creds	  = siocb->scm->creds;
-
-		err = security_netlink_send(sk, skb);
-		if (err) {
-			kfree_skb(skb);
-			goto out;
-		}
-
-		if (unlikely(dst_group)) {
-			atomic_inc(&skb->users);
-			netlink_broadcast(sk, skb, dst_portid, dst_group,
-					  GFP_KERNEL);
-		}
-		err = netlink_unicast(sk, skb, dst_portid,
-				      msg->msg_flags & MSG_DONTWAIT);
-		if (err < 0)
-			goto out;
-		len += err;
-
-	} while (hdr != NULL ||
-		 (!(msg->msg_flags & MSG_DONTWAIT) &&
-		  atomic_read(&nlk->tx_ring.pending)));
-
-	if (len > 0)
-		err = len;
-out:
-	mutex_unlock(&nlk->pg_vec_lock);
-	return err;
-}
-
 static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb)
 {
 	struct nl_mmap_hdr *hdr;
@@ -842,10 +732,8 @@ static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb)
 #else /* CONFIG_NETLINK_MMAP */
 #define netlink_skb_is_mmaped(skb)	false
 #define netlink_rx_is_mmaped(sk)	false
-#define netlink_tx_is_mmaped(sk)	false
 #define netlink_mmap			sock_no_mmap
 #define netlink_poll			datagram_poll
-#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, siocb)	0
 #endif /* CONFIG_NETLINK_MMAP */
 
 static void netlink_skb_destructor(struct sk_buff *skb)
@@ -864,16 +752,11 @@ static void netlink_skb_destructor(struct sk_buff *skb)
 		hdr = netlink_mmap_hdr(skb);
 		sk = NETLINK_CB(skb).sk;
 
-		if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) {
-			netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
-			ring = &nlk_sk(sk)->tx_ring;
-		} else {
-			if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
-				hdr->nm_len = 0;
-				netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
-			}
-			ring = &nlk_sk(sk)->rx_ring;
+		if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
+			hdr->nm_len = 0;
+			netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
 		}
+		ring = &nlk_sk(sk)->rx_ring;
 
 		WARN_ON(atomic_read(&ring->pending) == 0);
 		atomic_dec(&ring->pending);
@@ -2165,8 +2048,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 		err = 0;
 		break;
 #ifdef CONFIG_NETLINK_MMAP
-	case NETLINK_RX_RING:
-	case NETLINK_TX_RING: {
+	case NETLINK_RX_RING: {
 		struct nl_mmap_req req;
 
 		/* Rings might consume more memory than queue limits, require
@@ -2295,13 +2177,6 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
 			goto out;
 	}
 
-	if (netlink_tx_is_mmaped(sk) &&
-	    msg->msg_iov->iov_base == NULL) {
-		err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group,
-					   siocb);
-		goto out;
-	}
-
 	err = -EMSGSIZE;
 	if (len > sk->sk_sndbuf - 32)
 		goto out;
-- 
1.7.11.7

^ permalink raw reply related

* Re: Fwd: micrel: ksz8051 badly detected as ksz8031
From: Florian Fainelli @ 2014-10-14 19:09 UTC (permalink / raw)
  To: Angelo Dureghello, netdev@vger.kernel.org
In-Reply-To: <543D5C5B.9010703@gmail.com>

On 10/14/2014 10:24 AM, Angelo Dureghello wrote:
> Dear,
> 
> have to apologize for the confusion, previous patch is not the proper fix,
> since it is not solving completely the issue.
> 
> And also, i mainly misunderstood the issue.
> 
> The issue i am experiencing is :
> 
> https://lkml.org/lkml/2013/9/18/259
> 
> Mainly, i have Micrel chip marked KSZ8051(RNL), but the product Id in the
> silicon is KSZ8031 and linux detects it as KSZ8031.
> The attmept to mdio boot override register kill the Micrel functionality.

Ok, so basically your bootloader does something that Linux does, and
once Linux boots, it will reset the PHY to put it in a known state. If
you can snoop the MDIO read/writes done in your bootloader environment,
that might help narrow down the issue.

> 
> So i just replaced the phy_id code (hardcoded) in the code mdio detecion
> routine.

According to the link you posted above, the problem is that two
different PHY chips have the PHY ID, so why would you need to hardcode
the detection here?

> 
> Also, i am not giving to the Micrel any external 50Mhz clock, but
> as per default, the Micrel is giving the clock out to the davinci-emac.
> So no fixups are needed for my case.
> 
> But i still have a last issue now: i see link is up 100Mbit, but no
> packets are really sent, and nothing is received.
> Led links are up.

Do you have access to the Ethernet MAC counters using 'ethtool -S'? That
might tell you whether the problem is between the MAC and PHY, or at the
PHY level. If the PHY maintain its own set of counters (fairly unlikely)
can you try reading those and see if that works?

What about pinmux settings, Ethernet MAC settings etc... are they all
correct? Is there any Ethernet MAC back-pressure mechanism that can be
read to know whether the DMA engine is stuck transmitting?

Thanks

> 
> Time zone set
> Starting network...
> davinci_mdio davinci_mdio.0: resetting idled controller
> net eth0: attached PHY driver [Micrel KSZ8051]
> (mii_bus:phy_addr=davinci_mdio-0:00, id=221550)
> udhcpc (v1.20.2) started
> Sending discover...
> davinci_emac davinci_emac.1 eth0: Link is Up - 100Mbps/Full - flow
> control off
> Sending discover...
> Sending discover...
> No lease, failing
> ....
> 
> 
> [root@barix ~]# ifconfig
> eth0      Link encap:Ethernet  HWaddr 00:08:E1:03:2A:C5
>           inet6 addr: fe80::208:e1ff:fe03:2ac5/64 Scope:Link
>           UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
>           RX packets:0 errors:0 dropped:0 overruns:0 frame:0
>           TX packets:13 errors:0 dropped:0 overruns:0 carrier:0
>           collisions:0 txqueuelen:1000
>           RX bytes:0 (0.0 B)  TX bytes:2258 (2.2 KiB)
>           Interrupt:33
> 
> lo        Link encap:Local Loopback
>           inet addr:127.0.0.1  Mask:255.0.0.0
>           inet6 addr: ::1/128 Scope:Host
>           UP LOOPBACK RUNNING  MTU:65536  Metric:1
>           RX packets:0 errors:0 dropped:0 overruns:0 frame:0
>           TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
>           collisions:0 txqueuelen:0
>           RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)
> 
> Packets seems sent but they are not sent at all (checking from WS)
> and no packets are received at the same time.
> 
> Reagrds,
> Angelo
> 
> 
> 
> -- 
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 00/12] Coverity patches for drivers/isdn
From: David Miller @ 2014-10-14 19:05 UTC (permalink / raw)
  To: tilman; +Cc: netdev, davej, hjlipp, isdn, isdn4linux
In-Reply-To: <cover.1413021630.git.tilman@imap.cc>

From: Tilman Schmidt <tilman@imap.cc>
Date: Sat, 11 Oct 2014 13:46:29 +0200 (CEST)

> Here's a series of patches for the ISDN CAPI subsystem and the
> Gigaset ISDN driver.
> Patches 1 to 7 are specific fixes for Coverity warnings.
> Patches 8 to 11 fix related problems with the handling of invalid
> CAPI command codes I noticed while working on this.
> Patch 12 fixes an unrelated problem I noticed during the subsequent
> regression tests.
> It would be great if these could still be merged.

Series applied, thanks.

^ permalink raw reply

* Re: something is wrong in commit 971f10eca1 - tcp: better TCP_SKB_CB layout to reduce cache line misses
From: David Miller @ 2014-10-14 19:03 UTC (permalink / raw)
  To: cwang; +Cc: kkolasa, eric.dumazet, netdev, edumazet
In-Reply-To: <CAHA+R7N9rNUH4aytFG3YLt8nzpERqmf34FgaWbe5R7P0AaWuZw@mail.gmail.com>

From: Cong Wang <cwang@twopensource.com>
Date: Tue, 14 Oct 2014 11:59:25 -0700

> On Tue, Oct 14, 2014 at 6:25 AM, Krzysztof Kolasa <kkolasa@winsoft.pl> wrote:
>> W dniu 14.10.2014 o 02:09, Cong Wang pisze:
>>
>>> On Mon, Oct 13, 2014 at 4:59 PM, Cong Wang <cwang@twopensource.com> wrote:
>>>>
>>>> Probably not related with this bug, but with regarding to the
>>>> offending commit, what's the point of the memmove() in tcp_v4_rcv()
>>>> since ip_rcv() already clears IPCB()?
>>>
>>> Oh, ip options are actually saved in ip_rcv_finish()... Hmm, looks scary
>>> to play with variable-length array with memmove()....
>>>
>> On my other old laptop with 32bit kernel next and graphics card Intel 945GM
>> just after the revert commit working OK,
>> before, after login to gnome shell in some seconds decorations disappear
>>
>> 32 bit Ubuntu 12.04.5 LTS, gnome shell, kernel source next 14-10-2014
>>
>> Can anyone confirm this ?
>>
> 
> Sorry, believe it or not, for me it is hard to find a 32bit machine even VM. :)
> 
> Could the attached patch by any chance help? I noticed cookie_v4_check()
> still uses IPCB() instead of TCPCB() at TCP layer.

Eric, please review.

^ permalink raw reply

* Re: [PATCH RFC v4 net 1/3] ipv6: Remove BACKTRACK macro
From: David Miller @ 2014-10-14 19:01 UTC (permalink / raw)
  To: kafai; +Cc: netdev, hannes
In-Reply-To: <1412966888-31384-2-git-send-email-kafai@fb.com>

From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 10 Oct 2014 11:48:06 -0700

> +struct fib6_node *fib6_backtrack(struct fib6_node *fn,
> +				 struct in6_addr *saddr);
> +

I am completely mystified why you did this, could you explain the
logic?  I want to know what drove you to make this exported.

I marked it static in my example patch, and there is no caller outside
of route.c

Doing this also eliminates inlining opportunitites.

Please keep this private inside of route.c

^ permalink raw reply

* ixgbe: Question about Flow Control on 10G
From: dom @ 2014-10-14 19:01 UTC (permalink / raw)
  To: donald.c.skidmore, netdev

Hi

I have a question about the ixgbe driver's handling of 'ethtool -a ethX' 
when the NIC is using fibre.

Specifically I don't understand the code introduced by this commit:

commit 73d80953dfd1d5a92948005798c857c311c2834b
Author: Don Skidmore <donald.c.skidmore@intel.com>
Date:   Wed Jul 31 02:19:24 2013 +0000
Subject: ixgbe: fix fc autoneg ethtool reporting.

The function introduced the function:
        ixgbe_device_supports_autoneg_fc()

which gets called by ixgbe_get_pauseparam()/ixgbe_set_pauseparam().

specifically there is a  case in ixgbe_device_supports_autoneg_fc()

     case ixgbe_media_type_fiber_qsfp:
     case ixgbe_media_type_fiber:
         hw->mac.ops.check_link(hw, &speed, &link_up, false);
         /* if link is down, assume supported */
         if (link_up)
             supported = speed == IXGBE_LINK_SPEED_1GB_FULL ?
                 true : false;

If link_up=1 then why is supported only true for a 
speed=IXGBE_LINK_SPEED_1GB_FULL ?

Why is Flow Control not  supported for IXGBE_LINK_SPEED_10GB_FULL ?

I have search through the 82599 spec, but I can't find a reason.
Please can anyone help me understand ?

Thanks
dom

^ permalink raw reply

* Re: something is wrong in commit 971f10eca1 - tcp: better TCP_SKB_CB layout to reduce cache line misses
From: Cong Wang @ 2014-10-14 18:59 UTC (permalink / raw)
  To: Krzysztof Kolasa; +Cc: Eric Dumazet, netdev, edumazet, David Miller
In-Reply-To: <543D2435.8090002@winsoft.pl>

[-- Attachment #1: Type: text/plain, Size: 1044 bytes --]

On Tue, Oct 14, 2014 at 6:25 AM, Krzysztof Kolasa <kkolasa@winsoft.pl> wrote:
> W dniu 14.10.2014 o 02:09, Cong Wang pisze:
>
>> On Mon, Oct 13, 2014 at 4:59 PM, Cong Wang <cwang@twopensource.com> wrote:
>>>
>>> Probably not related with this bug, but with regarding to the
>>> offending commit, what's the point of the memmove() in tcp_v4_rcv()
>>> since ip_rcv() already clears IPCB()?
>>
>> Oh, ip options are actually saved in ip_rcv_finish()... Hmm, looks scary
>> to play with variable-length array with memmove()....
>>
> On my other old laptop with 32bit kernel next and graphics card Intel 945GM
> just after the revert commit working OK,
> before, after login to gnome shell in some seconds decorations disappear
>
> 32 bit Ubuntu 12.04.5 LTS, gnome shell, kernel source next 14-10-2014
>
> Can anyone confirm this ?
>

Sorry, believe it or not, for me it is hard to find a 32bit machine even VM. :)

Could the attached patch by any chance help? I noticed cookie_v4_check()
still uses IPCB() instead of TCPCB() at TCP layer.

Thanks.

[-- Attachment #2: tcp-cb.diff --]
[-- Type: text/plain, Size: 3524 bytes --]

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 74efeda..5d11012 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -468,8 +468,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb);
 /* From syncookies.c */
 int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
 		      u32 cookie);
-struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
-			     struct ip_options *opt);
+struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb);
 #ifdef CONFIG_SYN_COOKIES
 
 /* Syncookies use a monotonic timer which increments every 60 seconds.
@@ -530,6 +529,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority);
 int tcp_send_synack(struct sock *);
 bool tcp_syn_flood_action(struct sock *sk, const struct sk_buff *skb,
 			  const char *proto);
+struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb);
 void tcp_push_one(struct sock *, unsigned int mss_now);
 void tcp_send_ack(struct sock *sk);
 void tcp_send_delayed_ack(struct sock *sk);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 0431a8f..d346303 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -255,9 +255,9 @@ bool cookie_check_timestamp(struct tcp_options_received *tcp_opt,
 }
 EXPORT_SYMBOL(cookie_check_timestamp);
 
-struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
-			     struct ip_options *opt)
+struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 {
+	struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
 	struct tcp_options_received tcp_opt;
 	struct inet_request_sock *ireq;
 	struct tcp_request_sock *treq;
@@ -317,15 +317,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	/* We throwed the options of the initial SYN away, so we hope
 	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
 	 */
-	if (opt && opt->optlen) {
-		int opt_size = sizeof(struct ip_options_rcu) + opt->optlen;
-
-		ireq->opt = kmalloc(opt_size, GFP_ATOMIC);
-		if (ireq->opt != NULL && ip_options_echo(&ireq->opt->opt, skb)) {
-			kfree(ireq->opt);
-			ireq->opt = NULL;
-		}
-	}
+	ireq->opt = tcp_v4_save_options(skb);
 
 	if (security_inet_conn_request(sk, skb, req)) {
 		reqsk_free(req);
@@ -344,7 +336,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark,
 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
 			   inet_sk_flowi_flags(sk),
-			   (opt && opt->srr) ? opt->faddr : ireq->ir_rmt_addr,
+			   opt->srr ? opt->faddr : ireq->ir_rmt_addr,
 			   ireq->ir_loc_addr, th->source, th->dest);
 	security_req_classify_flow(req, flowi4_to_flowi(&fl4));
 	rt = ip_route_output_key(sock_net(sk), &fl4);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 552e87e..e02d586 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -883,7 +883,7 @@ EXPORT_SYMBOL(tcp_syn_flood_action);
 /*
  * Save and compile IPv4 options into the request_sock if needed.
  */
-static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
+struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
 {
 	const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
 	struct ip_options_rcu *dopt = NULL;
@@ -1428,7 +1428,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
 
 #ifdef CONFIG_SYN_COOKIES
 	if (!th->syn)
-		sk = cookie_v4_check(sk, skb, &TCP_SKB_CB(skb)->header.h4.opt);
+		sk = cookie_v4_check(sk, skb);
 #endif
 	return sk;
 }

^ permalink raw reply related

* Re: [PATCH net-next RFC 0/3] virtio-net: Conditionally enable tx interrupt
From: David Miller @ 2014-10-14 18:53 UTC (permalink / raw)
  To: jasowang-H+wXaHxf7aLQT0dZR+AlfA
  Cc: rusty-8n+1lVoiYb80n/F98K4Iww, mst-H+wXaHxf7aLQT0dZR+AlfA,
	virtualization-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, kvm-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1413011806-3813-1-git-send-email-jasowang-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

From: Jason Wang <jasowang-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Date: Sat, 11 Oct 2014 15:16:43 +0800

> We free old transmitted packets in ndo_start_xmit() currently, so any
> packet must be orphaned also there. This was used to reduce the overhead of
> tx interrupt to achieve better performance. But this may not work for some
> protocols such as TCP stream. TCP depends on the value of sk_wmem_alloc to
> implement various optimization for small packets stream such as TCP small
> queue and auto corking. But orphaning packets early in ndo_start_xmit()
> disable such things more or less since sk_wmem_alloc was not accurate. This
> lead extra low throughput for TCP stream of small writes.
> 
> This series tries to solve this issue by enable tx interrupts for all TCP
> packets other than the ones with push bit or pure ACK. This is done through
> the support of urgent descriptor which can force an interrupt for a
> specified packet. If tx interrupt was enabled for a packet, there's no need
> to orphan it in ndo_start_xmit(), we can free it tx napi which is scheduled
> by tx interrupt. Then sk_wmem_alloc was more accurate than before and TCP
> can batch more for small write. More larger skb was produced by TCP in this
> case to improve both throughput and cpu utilization.
> 
> Test shows great improvements on small write tcp streams. For most of the
> other cases, the throughput and cpu utilization are the same in the
> past. Only few cases, more cpu utilization was noticed which needs more
> investigation.
> 
> Review and comments are welcomed.

I think proper accounting and queueing (at all levels, not just TCP
sockets) is more important than trying to skim a bunch of cycles by
avoiding TX interrupts.

Having an event to free the SKB is absolutely essential for the stack
to operate correctly.

And with virtio-net you don't even have the excuse of "the HW
unfortunately doesn't have an appropriate TX event."

So please don't play games, and instead use TX interrupts all the
time.  You can mitigate them in various ways, but don't turn them on
selectively based upon traffic type, that's terrible.

You can even use ->xmit_more to defer the TX interrupt indication to
the final TX packet in the chain.

^ permalink raw reply

* Re: [PATCH] net: fec: ptp: fix convergence issue to support LinuxPTP stack
From: Richard Cochran @ 2014-10-14 18:49 UTC (permalink / raw)
  To: Frank.Li@freescale.com
  Cc: David Miller, fugang.duan@freescale.com, netdev@vger.kernel.org,
	bhutchings@solarflare.com
In-Reply-To: <6b1256b73b7a4f978573a77ec3e36831@BLUPR03MB486.namprd03.prod.outlook.com>

On Tue, Oct 14, 2014 at 06:43:51PM +0000, Frank.Li@freescale.com wrote:
> Only MX6 SX. Only MX6 SX added FEC_QUIRK_BUG_CAPTURE.

But what about this comment:

+/* ENET Block Guide/ Chapter for the iMX6SLX (PELE) address one issue:
+ * Incorrect behavior for ENET_ATCR[Capture and Restart Bits]. These bits will
+ * always read a value zero. When these bits are set to 1'b1, these should hold
+ * value 1'b1 until the counter value is capture in the register clock domain.
+ */

It sounds like the bits are "sticky" until the counter value has been
latched. Therefore you need to check the bits before reading the
counter, but the code does not do this for the case of !SX.

Thanks,
Richard

^ permalink raw reply

* Re: [PATCH v3 0/3] Enable FEC pps feather
From: David Miller @ 2014-10-14 18:45 UTC (permalink / raw)
  To: b45643
  Cc: richardcochran, netdev, shawn.guo, bhutchings, R49496, b38611,
	b20596, stephen
In-Reply-To: <1412918130-18830-1-git-send-email-b45643@freescale.com>

From: Luwei Zhou <b45643@freescale.com>
Date: Fri, 10 Oct 2014 13:15:27 +0800

> Change from v2 to v3:
> 	-Using the default channel 0 to be PPS channel not PTP_PIN_SET/GETFUNC interface.
> 	-Using the linux definition of NSEC_PER_SEC.
> 
> Change from v1 to v2:
> 	- Fix the potential 32-bit multiplication overflow issue.
> 	- Optimize the hareware adjustment code to improve efficiency as Richard suggested
> 	- Use ptp PTP_PIN_SET/GETFUNC interface to set PPS channel not device tree
> 	and add PTP_PF_PPS enumeration
> 	- Modify comments style

Series applied, thanks.

^ permalink raw reply

* Re: [PATCH v3 0/3] Enable FEC pps feather
From: Richard Cochran @ 2014-10-14 18:45 UTC (permalink / raw)
  To: Luwei Zhou
  Cc: davem, netdev, shawn.guo, bhutchings, R49496, b38611, b20596,
	stephen
In-Reply-To: <1412918130-18830-1-git-send-email-b45643@freescale.com>


I had to think a lot about PPS feathers
and wonder what color they might be.

Finally I got it: you meant "feature" :^)

Thanks,
Richard

^ permalink raw reply

* RE: [PATCH] net: fec: ptp: fix convergence issue to support LinuxPTP stack
From: Frank.Li @ 2014-10-14 18:43 UTC (permalink / raw)
  To: Richard Cochran
  Cc: David Miller, fugang.duan@freescale.com, netdev@vger.kernel.org,
	bhutchings@solarflare.com
In-Reply-To: <20141014183933.GA14216@localhost.localdomain>



> -----Original Message-----
> From: Richard Cochran [mailto:richardcochran@gmail.com]
> Sent: Tuesday, October 14, 2014 1:40 PM
> To: Li Frank-B20596
> Cc: David Miller; Duan Fugang-B38611; netdev@vger.kernel.org;
> bhutchings@solarflare.com
> Subject: Re: [PATCH] net: fec: ptp: fix convergence issue to support LinuxPTP
> stack
> 
> Frank,
> 
> On Tue, Oct 14, 2014 at 06:27:28PM +0000, Frank.Li@freescale.com wrote:
> > Fugang's patch fix the problem existed in MX6SX platform.
> 
> You say that Fugang's patch fixes a quirk in the SX device.
> 
> But he said that the user's manual tells us to wait for some status bits to
> clear. (See the third line, with the 'while' key word).
> 
> > > > On Tue, Oct 14, 2014 at 01:39:47PM +0800, Fugang Duan wrote:
> > > >> IEEE 1588 module has one hw issue in capturing the ATVR register.
> > > >> According to the user manual it is:
> > > >> 		ENET0->ATCR |= ENET_ATCR_CAPTURE_MASK;
> > > >> 		while(ENET0->ATCR & ENET_ATCR_CAPTURE_MASK);
> > > >> 		ts_counter_ns = ENET0->ATVR;
> 
> That applies to all IMX devices, doesn't it?
> In any case, Fugang's change log is not clear.

Only MX6 SX. Only MX6 SX added FEC_QUIRK_BUG_CAPTURE.

.driver_data = FEC_QUIRK_ENET_MAC | FEC_QUIRK_HAS_GBIT |
 				FEC_QUIRK_HAS_BUFDESC_EX | FEC_QUIRK_HAS_CSUM |
 				FEC_QUIRK_HAS_VLAN | FEC_QUIRK_HAS_AVB |
-				FEC_QUIRK_ERR007885,
+				FEC_QUIRK_ERR007885 | FEC_QUIRK_BUG_CAPTURE,

> 
> Confused,
> Richard

^ permalink raw reply

* [PATCH RFC net-next] sfc: add support for skb->xmit_more
From: Edward Cree @ 2014-10-14 18:41 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Edward Cree, davem, nikolay, netdev, Shradha Shah, Jon Cooper,
	linux-net-drivers
In-Reply-To: <543C2588.6060604@redhat.com>

Don't ring the doorbell, and don't do PIO.  This will also prevent
 TX Push, because there will be more than one buffer waiting when
 the doorbell is rung.

Signed-off-by: Edward Cree <ecree@solarflare.com>
---

 drivers/net/ethernet/sfc/nic.h | 29 +++++++++++++++++++++++-----
 drivers/net/ethernet/sfc/tx.c  | 43 +++++++++++++++++-------------------------
 2 files changed, 41 insertions(+), 31 deletions(-)

diff --git a/drivers/net/ethernet/sfc/nic.h b/drivers/net/ethernet/sfc/nic.h
index 60f8514..f77cce0 100644
--- a/drivers/net/ethernet/sfc/nic.h
+++ b/drivers/net/ethernet/sfc/nic.h
@@ -71,9 +71,17 @@ efx_tx_desc(struct efx_tx_queue *tx_queue, unsigned int index)
 	return ((efx_qword_t *) (tx_queue->txd.buf.addr)) + index;
 }
 
-/* Report whether the NIC considers this TX queue empty, given the
- * write_count used for the last doorbell push.  May return false
- * negative.
+/* Get partner of a TX queue, seen as part of the same net core queue */
+static struct efx_tx_queue *efx_tx_queue_partner(struct efx_tx_queue *tx_queue)
+{
+	if (tx_queue->queue & EFX_TXQ_TYPE_OFFLOAD)
+		return tx_queue - EFX_TXQ_TYPE_OFFLOAD;
+	else
+		return tx_queue + EFX_TXQ_TYPE_OFFLOAD;
+}
+
+/* Report whether this TX queue would be empty for the given write_count.
+ * May return false negative.
  */
 static inline bool __efx_nic_tx_is_empty(struct efx_tx_queue *tx_queue,
 					 unsigned int write_count)
@@ -86,9 +94,18 @@ static inline bool __efx_nic_tx_is_empty(struct efx_tx_queue *tx_queue,
 	return ((empty_read_count ^ write_count) & ~EFX_EMPTY_COUNT_VALID) == 0;
 }
 
-static inline bool efx_nic_tx_is_empty(struct efx_tx_queue *tx_queue)
+/* Decide whether we can use TX PIO, ie. write packet data directly into
+ * a buffer on the device.  This can reduce latency at the expense of
+ * throughput, so we only do this if both hardware and software TX rings
+ * are empty.  This also ensures that only one packet at a time can be
+ * using the PIO buffer.
+ */
+static inline bool efx_nic_may_tx_pio(struct efx_tx_queue *tx_queue)
 {
-	return __efx_nic_tx_is_empty(tx_queue, tx_queue->write_count);
+	struct efx_tx_queue *partner = efx_tx_queue_partner(tx_queue);
+	return tx_queue->piobuf &&
+	       __efx_nic_tx_is_empty(tx_queue, tx_queue->insert_count) &&
+	       __efx_nic_tx_is_empty(partner, partner->insert_count);
 }
 
 /* Decide whether to push a TX descriptor to the NIC vs merely writing
@@ -96,6 +113,8 @@ static inline bool efx_nic_tx_is_empty(struct efx_tx_queue *tx_queue)
  * descriptor to an empty queue, but is otherwise pointless.  Further,
  * Falcon and Siena have hardware bugs (SF bug 33851) that may be
  * triggered if we don't check this.
+ * We use the write_count used for the last doorbell push, to get the
+ * NIC's view of the tx queue.
  */
 static inline bool efx_nic_may_push_tx_desc(struct efx_tx_queue *tx_queue,
 					    unsigned int write_count)
diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
index 3206098..aaf2987 100644
--- a/drivers/net/ethernet/sfc/tx.c
+++ b/drivers/net/ethernet/sfc/tx.c
@@ -132,15 +132,6 @@ unsigned int efx_tx_max_skb_descs(struct efx_nic *efx)
 	return max_descs;
 }
 
-/* Get partner of a TX queue, seen as part of the same net core queue */
-static struct efx_tx_queue *efx_tx_queue_partner(struct efx_tx_queue *tx_queue)
-{
-	if (tx_queue->queue & EFX_TXQ_TYPE_OFFLOAD)
-		return tx_queue - EFX_TXQ_TYPE_OFFLOAD;
-	else
-		return tx_queue + EFX_TXQ_TYPE_OFFLOAD;
-}
-
 static void efx_tx_maybe_stop_queue(struct efx_tx_queue *txq1)
 {
 	/* We need to consider both queues that the net core sees as one */
@@ -344,6 +335,7 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
 	struct efx_nic *efx = tx_queue->efx;
 	struct device *dma_dev = &efx->pci_dev->dev;
 	struct efx_tx_buffer *buffer;
+	unsigned int old_insert_count = tx_queue->insert_count;
 	skb_frag_t *fragment;
 	unsigned int len, unmap_len = 0;
 	dma_addr_t dma_addr, unmap_addr = 0;
@@ -351,8 +343,6 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
 	unsigned short dma_flags;
 	int i = 0;
 
-	EFX_BUG_ON_PARANOID(tx_queue->write_count != tx_queue->insert_count);
-
 	if (skb_shinfo(skb)->gso_size)
 		return efx_enqueue_skb_tso(tx_queue, skb);
 
@@ -369,9 +359,8 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
 
 	/* Consider using PIO for short packets */
 #ifdef EFX_USE_PIO
-	if (skb->len <= efx_piobuf_size && tx_queue->piobuf &&
-	    efx_nic_tx_is_empty(tx_queue) &&
-	    efx_nic_tx_is_empty(efx_tx_queue_partner(tx_queue))) {
+	if (skb->len <= efx_piobuf_size && !skb->xmit_more &&
+	    efx_nic_may_tx_pio(tx_queue)) {
 		buffer = efx_enqueue_skb_pio(tx_queue, skb);
 		dma_flags = EFX_TX_BUF_OPTION;
 		goto finish_packet;
@@ -439,13 +428,14 @@ finish_packet:
 
 	netdev_tx_sent_queue(tx_queue->core_txq, skb->len);
 
+	efx_tx_maybe_stop_queue(tx_queue);
+
 	/* Pass off to hardware */
-	efx_nic_push_buffers(tx_queue);
+	if (!skb->xmit_more || netif_xmit_stopped(tx_queue->core_txq))
+		efx_nic_push_buffers(tx_queue);
 
 	tx_queue->tx_packets++;
 
-	efx_tx_maybe_stop_queue(tx_queue);
-
 	return NETDEV_TX_OK;
 
  dma_err:
@@ -458,7 +448,7 @@ finish_packet:
 	dev_kfree_skb_any(skb);
 
 	/* Work backwards until we hit the original insert pointer value */
-	while (tx_queue->insert_count != tx_queue->write_count) {
+	while (tx_queue->insert_count != old_insert_count) {
 		unsigned int pkts_compl = 0, bytes_compl = 0;
 		--tx_queue->insert_count;
 		buffer = __efx_tx_queue_get_insert_buffer(tx_queue);
@@ -989,12 +979,13 @@ static int efx_tso_put_header(struct efx_tx_queue *tx_queue,
 /* Remove buffers put into a tx_queue.  None of the buffers must have
  * an skb attached.
  */
-static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue)
+static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue,
+			       unsigned int insert_count)
 {
 	struct efx_tx_buffer *buffer;
 
 	/* Work backwards until we hit the original insert pointer value */
-	while (tx_queue->insert_count != tx_queue->write_count) {
+	while (tx_queue->insert_count != insert_count) {
 		--tx_queue->insert_count;
 		buffer = __efx_tx_queue_get_insert_buffer(tx_queue);
 		efx_dequeue_buffer(tx_queue, buffer, NULL, NULL);
@@ -1258,14 +1249,13 @@ static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue,
 			       struct sk_buff *skb)
 {
 	struct efx_nic *efx = tx_queue->efx;
+	unsigned int old_insert_count = tx_queue->insert_count;
 	int frag_i, rc;
 	struct tso_state state;
 
 	/* Find the packet protocol and sanity-check it */
 	state.protocol = efx_tso_check_protocol(skb);
 
-	EFX_BUG_ON_PARANOID(tx_queue->write_count != tx_queue->insert_count);
-
 	rc = tso_start(&state, efx, skb);
 	if (rc)
 		goto mem_err;
@@ -1308,11 +1298,12 @@ static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue,
 
 	netdev_tx_sent_queue(tx_queue->core_txq, skb->len);
 
-	/* Pass off to hardware */
-	efx_nic_push_buffers(tx_queue);
-
 	efx_tx_maybe_stop_queue(tx_queue);
 
+	/* Pass off to hardware */
+	if (!skb->xmit_more || netif_xmit_stopped(tx_queue->core_txq))
+		efx_nic_push_buffers(tx_queue);
+
 	tx_queue->tso_bursts++;
 	return NETDEV_TX_OK;
 
@@ -1336,6 +1327,6 @@ static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue,
 		dma_unmap_single(&efx->pci_dev->dev, state.header_dma_addr,
 				 state.header_unmap_len, DMA_TO_DEVICE);
 
-	efx_enqueue_unwind(tx_queue);
+	efx_enqueue_unwind(tx_queue, old_insert_count);
 	return NETDEV_TX_OK;
 }
-- 
1.7.11.7

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox