Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH] phy: Add support for VSC8234
From: Andy Fleming @ 2011-10-16  6:49 UTC (permalink / raw)
  To: linux-kernel; +Cc: Kumar Gala, netdev

No functional changes other than to recognize this PHYID.

Signed-off-by: Andy Fleming <afleming@freescale.com>
Signed-off-by: Kumar Gala <kumar.gala@freescale.com>
Signed-off-by: Ben Collins <ben.c@servergy.com>
Cc: netdev@vger.kernel.org
---
 drivers/net/phy/vitesse.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/drivers/net/phy/vitesse.c b/drivers/net/phy/vitesse.c
index 2585c38..b760ba1 100644
--- a/drivers/net/phy/vitesse.c
+++ b/drivers/net/phy/vitesse.c
@@ -54,6 +54,7 @@
 #define MII_VSC8221_AUXCONSTAT_INIT	0x0004 /* need to set this bit? */
 #define MII_VSC8221_AUXCONSTAT_RESERVED	0x0004
 
+#define PHY_ID_VSC8234			0x000fc620
 #define PHY_ID_VSC8244			0x000fc6c0
 #define PHY_ID_VSC8221			0x000fc550
 
@@ -119,7 +120,8 @@ static int vsc82xx_config_intr(struct phy_device *phydev)
 
 	if (phydev->interrupts == PHY_INTERRUPT_ENABLED)
 		err = phy_write(phydev, MII_VSC8244_IMASK,
-			phydev->drv->phy_id == PHY_ID_VSC8244 ?
+			((phydev->drv->phy_id == PHY_ID_VSC8234) ||
+			 (phydev->drv->phy_id == PHY_ID_VSC8244)) ?
 				MII_VSC8244_IMASK_MASK :
 				MII_VSC8221_IMASK_MASK);
 	else {
@@ -165,6 +167,19 @@ static struct phy_driver vsc82xx_driver[] = {
 	.config_intr	= &vsc82xx_config_intr,
 	.driver		= { .owner = THIS_MODULE,},
 }, {
+	/* Vitesse 8234 */
+	.phy_id		= PHY_ID_VSC8234,
+	.phy_id_mask	= 0x000ffff0,
+	.name		= "Vitesse VSC8234",
+	.features	= PHY_GBIT_FEATURES,
+	.flags		= PHY_HAS_INTERRUPT,
+	.config_init	= &vsc8221_config_init,
+	.config_aneg	= &genphy_config_aneg,
+	.read_status	= &genphy_read_status,
+	.ack_interrupt	= &vsc824x_ack_interrupt,
+	.config_intr	= &vsc82xx_config_intr,
+	.driver		= { .owner = THIS_MODULE,},
+}, {
 	/* Vitesse 8221 */
 	.phy_id		= PHY_ID_VSC8221,
 	.phy_id_mask	= 0x000ffff0,
-- 
1.8.1.2

^ permalink raw reply related

* Re: [PATCH net-next] net: ipv6: inet6_connection_sock.h needs flowi
From: David Miller @ 2011-10-15 22:41 UTC (permalink / raw)
  To: christoph.paasch; +Cc: netdev
In-Reply-To: <1318671264-19684-1-git-send-email-christoph.paasch@uclouvain.be>

From: Christoph Paasch <christoph.paasch@uclouvain.be>
Date: Sat, 15 Oct 2011 12:34:24 +0300

> Otherwise we have a compiler-warning in c-files not including net/flow.h
> before inet6_connection_sock.h .
> 
> Signed-off-by: Christoph Paasch <christoph.paasch@uclouvain.be>

Example?  I've never seen this warning.

^ permalink raw reply

* net-next [PATCH 1/1] ipv4: compat_ioctl is local to af_inet.c, make it static
From: Gerrit Renker @ 2011-10-15 19:26 UTC (permalink / raw)
  To: netdev

ipv4: compat_ioctl is local to af_inet.c, make it static

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/ipv4/af_inet.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -893,7 +893,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 EXPORT_SYMBOL(inet_ioctl);
 
 #ifdef CONFIG_COMPAT
-int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 {
 	struct sock *sk = sock->sk;
 	int err = -ENOIOCTLCMD;

^ permalink raw reply

* sky2: only 10Mb/s
From: Pavel Matěja @ 2011-10-15 17:58 UTC (permalink / raw)
  To: netdev

Hi,
I tested new kernel and I have found out I have only 10Mb/s link instead of 
100Mb/s to my router.
If I did the git bisect right it was caused by commit 
4fb99cd6ac4fe6d03a334a6f4ebb2bbfc4b479ed

My card is (lspci -v):
05:00.0 Ethernet controller: Marvell Technology Group Ltd. Yukon Optima 
88E8059 [PCIe Gigabit Ethernet Controller with AVB] (rev 11)
        Subsystem: ASUSTeK Computer Inc. Device 8439
        Flags: bus master, fast devsel, latency 0, IRQ 80
        Memory at fe6fc000 (64-bit, non-prefetchable) [size=16K]
        I/O ports at b800 [size=256]
        Expansion ROM at fe6c0000 [disabled] [size=128K]
        Capabilities: [48] Power Management version 3
        Capabilities: [5c] MSI: Enable+ Count=1/1 Maskable- 64bit+
        Capabilities: [c0] Express Legacy Endpoint, MSI 00
        Capabilities: [100] Advanced Error Reporting
        Capabilities: [130] Device Serial Number a5-89-6d-ff-ff-30-cf-20
        Kernel driver in use: sky2

Can anybody help me?
-- 
Pavel Mateja

^ permalink raw reply

* Flow classifier proto-dst and TOS (and proto-src)
From: Dan Siemon @ 2011-10-15 16:51 UTC (permalink / raw)
  To: netdev

[-- Attachment #1: Type: text/plain, Size: 1840 bytes --]

cls_flow.c: flow_get_proto_dst()

The proto-dst key returns the destination port for UDP, TCP and a few
other protocols [see proto_ports_offset()]. For ICMP and IPIP it falls
back to:

return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;

Since Linux maintains a dst_entry for each TOS value this causes the
returned value to be affected by the TOS which is unexpected and
probably broken.

Is there a reason why this doesn't return 0 for protocols that don't
have a notion of source and destination ports? It seems very odd to me
that a value which is not at all related to the traffic on the wire is
returned for this key.

There is a somewhat similar situation with flow_get_proto_src(). Here
the fallback value is:

return addr_fold(skb->sk);

It looks like this is 0 when the traffic doesn't originate locally and
even for local traffic I don't understand why the use of an effectively
random number here is useful.

For a long winded explanation of how I discovered this see:
http://www.coverfire.com/archives/2011/10/15/linux-flow-classifier-proto-dst-and-tos/

Below is a simple patch which makes these functions fall back to
returning 0 when the protocol doesn't have the notion of ports.

Signed-off-by: Dan Siemon <dan@coverfire.com>
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 6994214..7527e61 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -150,7 +150,7 @@ static u32 flow_get_proto_src(struct sk_buff *skb)
 	}
 	}

-	return addr_fold(skb->sk);
+	return 0;
 }

 static u32 flow_get_proto_dst(struct sk_buff *skb)
@@ -192,7 +192,7 @@ static u32 flow_get_proto_dst(struct sk_buff *skb)
 	}
 	}

-	return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;
+	return 0;
 }

static u32 flow_get_iif(const struct sk_buff *skb)

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply related

* dj lighting equipment price list
From: Picasa Web Albums @ 2011-10-15 14:39 UTC (permalink / raw)
  To: netdev

[-- Attachment #1: Type: text/plain, Size: 381 bytes --]

hello, sir

we are sorry to take your time

we are mainly producing stage lighting equipment, include led moving head  
lighting and laserlighting , par lighting and effect lighting,here is our  
price list from our factory, if you have any interest, please reply us  
soonest


mr.wu

DJ- EUROPE LIGHTING MANUFACTORY CO.LTD
WWW.DJ-EUROPE.COM
TEL; 0757-63386561 FAX: 0757-63386562

[-- Attachment #2: YTU.jpg --]
[-- Type: image/jpeg, Size: 44349 bytes --]

^ permalink raw reply

* [PATCH net-next] net: ipv6: inet6_connection_sock.h needs flowi
From: Christoph Paasch @ 2011-10-15  9:34 UTC (permalink / raw)
  To: davem; +Cc: netdev, Christoph Paasch

Otherwise we have a compiler-warning in c-files not including net/flow.h
before inet6_connection_sock.h .

Signed-off-by: Christoph Paasch <christoph.paasch@uclouvain.be>
---
 include/net/inet6_connection_sock.h |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h
index 3207e58..2a86a84 100644
--- a/include/net/inet6_connection_sock.h
+++ b/include/net/inet6_connection_sock.h
@@ -21,6 +21,7 @@ struct request_sock;
 struct sk_buff;
 struct sock;
 struct sockaddr;
+struct flowi;
 
 extern int inet6_csk_bind_conflict(const struct sock *sk,
 				   const struct inet_bind_bucket *tb);
-- 
1.7.5.4

^ permalink raw reply related

* Kernel panic from tg3 net driver
From: Ari Savolainen @ 2011-10-15  7:50 UTC (permalink / raw)
  To: David S. Miller, netdev, linux-kernel

Hi,

I get this panic when I try to print from a virtual machine:

https://docs.google.com/leaf?id=0B7LPWLwa6EUaODIxYTY2YmQtNWJlZS00M2ViLTk5ZmEtNDM2ZTZmNzE2MDEz&hl=fi

I tried to bisect it, but couldn't finish, because after the last step
the boot process got stuck right after selecting the kernel in grub
and I ran out of time:

git bisect start
# bad: [322a8b034003c0d46d39af85bf24fee27b902f48] Linux 3.1-rc1
git bisect bad 322a8b034003c0d46d39af85bf24fee27b902f48
# good: [02f8c6aee8df3cdc935e9bdd4f2d020306035dbe] Linux 3.0
git bisect good 02f8c6aee8df3cdc935e9bdd4f2d020306035dbe
# bad: [0003230e8200699860f0b10af524dc47bf8aecad] Merge branch
'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6
git bisect bad 0003230e8200699860f0b10af524dc47bf8aecad
# bad: [72f96e0e38d7e29ba16dcfd824ecaebe38b8293e] Merge branch
'for-linus-core' of
git://git.kernel.org/pub/scm/linux/kernel/git/nab/target-pending
git bisect bad 72f96e0e38d7e29ba16dcfd824ecaebe38b8293e
# good: [204d1641d200709c759d8c269458cbc7de378c40] Merge branch
'master' of git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-next-2.6
into for-davem
git bisect good 204d1641d200709c759d8c269458cbc7de378c40
# bad: [415b3334a21aa67806c52d1acf4e72e14f7f402f] icmp: Fix regression
in nexthop resolution during replies.
git bisect bad 415b3334a21aa67806c52d1acf4e72e14f7f402f
# bad: [95a943c162d74b20d869917bdf5df11293c35b63] Merge branch
'master' of git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-next-2.6
into for-davem
git bisect bad 95a943c162d74b20d869917bdf5df11293c35b63

In the first bad kernel (3.1-rc1) there was this in the log:

[  105.612095]
[  105.612096] ===================================================
[  105.612100] [ INFO: suspicious rcu_dereference_check() usage. ]
[  105.612101] ---------------------------------------------------
[  105.612103] include/net/dst.h:91 invoked rcu_dereference_check()
without protection!
[  105.612105]
[  105.612106] other info that might help us debug this:
[  105.612106]
[  105.612108]
[  105.612108] rcu_scheduler_active = 1, debug_locks = 0
[  105.612110] 1 lock held by dnsmasq/2618:
[  105.612111]  #0:  (rtnl_mutex){+.+.+.}, at: [<ffffffff815df8c7>]
rtnl_lock+0x17/0x20
[  105.612120]
[  105.612121] stack backtrace:
[  105.612123] Pid: 2618, comm: dnsmasq Not tainted 3.1.0-rc1 #41
[  105.612125] Call Trace:
[  105.612129]  [<ffffffff810ccdcb>] lockdep_rcu_dereference+0xbb/0xc0
[  105.612132]  [<ffffffff815dc5a9>] neigh_update+0x4f9/0x5f0
[  105.612135]  [<ffffffff815da001>] ? neigh_lookup+0xe1/0x220
[  105.612139]  [<ffffffff81639298>] arp_req_set+0xb8/0x230
[  105.612142]  [<ffffffff8163a59f>] arp_ioctl+0x1bf/0x310
[  105.612146]  [<ffffffff810baa40>] ? lock_hrtimer_base.isra.26+0x30/0x60
[  105.612150]  [<ffffffff8163fb75>] inet_ioctl+0x85/0x90
[  105.612154]  [<ffffffff815b5520>] sock_do_ioctl+0x30/0x70
[  105.612157]  [<ffffffff815b55d3>] sock_ioctl+0x73/0x280
[  105.612162]  [<ffffffff811b7698>] do_vfs_ioctl+0x98/0x570
[  105.612165]  [<ffffffff811a5c40>] ? fget_light+0x340/0x3a0
[  105.612168]  [<ffffffff811b7bbf>] sys_ioctl+0x4f/0x80
[  105.612172]  [<ffffffff816fdcab>] system_call_fastpath+0x16/0x1b

^ permalink raw reply

* Re: [PATCH net-next] tcp: reduce memory needs of out of order queue
From: Eric Dumazet @ 2011-10-15  6:54 UTC (permalink / raw)
  To: David Miller; +Cc: rick.jones2, netdev
In-Reply-To: <20111014.191845.232827637484150228.davem@davemloft.net>

Le vendredi 14 octobre 2011 à 19:18 -0400, David Miller a écrit :
> From: Rick Jones <rick.jones2@hp.com>
> Date: Fri, 14 Oct 2011 15:12:04 -0700
> 
> > From just a very quick look it looks like tcp_v[46]_rcv is called,
> > finds that the socket is owned by the user, attempts to add to the
> > backlog, but the path called by sk_add_backlog does not seem to make
> > any attempts to compress things, so when the quantity of data is <<
> > the truesize it starts tossing babies out with the bathwater.
> 
> This is why I don't believe the right fix is to add bandaids all
> around the TCP layer.
> 
> The wastage has to be avoided at a higher level.

We can't do that at a higher level without smart hardware (like NIU) or
adding a copy.

It's a tradeoff between space and speed.

Most drivers have to allocate a large skb1 and post it to hardware to
receive a frame (Unknown length, only max length is known)

Some drivers have a copybreak feature, doing a copy of small incoming
frames into a smaller skb2 (skb2->truesize < skb1->truesize)

This strategy does save memory for small frames, but not for 1500-byte
frames.

I think the problem is in TCP layer (and maybe in other protocols) :

1) Either tune rcvbuf to allow more memory to be used, for a particular
tcp window,

   Or lower TCP window to allow less packets in flight for a given
rcvbuf.

2) TCP COLLAPSE already is trying to reduce memory costs of a tcp socket
with many packets in OFO queue. But fixing 1) would make these collapses
never happen in the first place. People wanting high TCP bandwidth
[ with say more than 500 in-flight packets per session ] can certainly
afford having enough memory.

^ permalink raw reply

* Re: [PATCH net-next] tcp: reduce memory needs of out of order queue
From: Eric Dumazet @ 2011-10-15  6:39 UTC (permalink / raw)
  To: Rick Jones; +Cc: David Miller, netdev
In-Reply-To: <4E98B3B4.20406@hp.com>

Le vendredi 14 octobre 2011 à 15:12 -0700, Rick Jones a écrit :

Thanks Rick


> So, a test as above from a system running 2.6.38-11-generic to a system 
> running 3.0.0-12-generic.  On the sender we have:
> 
> raj@tardy:~/netperf2_trunk$ netstat -s > before; src/netperf -H 
> raj-8510w.americas.hpqcorp.net -t tcp_rr -- -b 256 -D -o 
> throughput,local_transport_retrans,remote_transport_retrans,lss_size_end,rsr_size_end 
> ; netstat -s > after
> MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET 
> to internal-host.americas.hpqcorp.net (16.89.245.115) port 0 AF_INET : 
> nodelay : first burst 256
> Throughput,Local Transport Retransmissions,Remote Transport 
> Retransmissions,Local Send Socket Size Final,Remote Recv Socket Size Final
> 76752.43,274,0,16384,98304
> 
> 274 retransmissions at the sender.  The "beforeafter" of that on the sender:
> 
> raj@tardy:~/netperf2_trunk$ cat delta.send

> Tcp:
>      2 active connections openings
>      0 passive connection openings
>      0 failed connection attempts
>      0 connection resets received
>      0 connections established
>      766727 segments received
>      734408 segments send out

>      274 segments retransmited

	Exactly the count of dropped frames because of receiver sk_rmem_alloc +
backlog.len hitting receiver sk_rcvbuf

/*
 * Report whether queueing @skb would push the socket past its receive
 * buffer limit. The current usage is the backlog length plus the memory
 * already charged to the receive queue (sk_rmem_alloc); the skb is
 * accounted at its truesize, not its payload length.
 *
 * Returns true when usage + skb->truesize exceeds sk->sk_rcvbuf.
 */
static inline bool sk_rcvqueues_full(const struct sock *sk, const struct sk_buff *skb)
{
        /* Combined footprint of the backlog and the receive queue. */
        unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc);

        return qsize + skb->truesize > sk->sk_rcvbuf;
}

/*
 * Append @skb to the socket backlog, subject to the receive buffer limit.
 *
 * Returns -ENOBUFS without queueing anything when sk_rcvqueues_full()
 * reports that the skb does not fit; otherwise queues the skb, charges its
 * truesize to sk_backlog.len and returns 0. The result is marked
 * __must_check, so callers have to handle the failure case.
 */
static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb)
{
        /* Refuse the skb rather than exceed sk_rcvbuf. */
        if (sk_rcvqueues_full(sk, skb))
                return -ENOBUFS;

        __sk_add_backlog(sk, skb);
        /* Account the full truesize so later limit checks see this skb. */
        sk->sk_backlog.len += skb->truesize;
        return 0;
}

In very old kernels, we had no limit on backlog, so we could queue a lot
of extra skbs in it and eventually consume all kernel memory (OOM)

refs : commit c377411f249 (net: sk_add_backlog() take rmem_alloc into
account)
	commit 6b03a53a5ab7 (tcp: use limited socket backlog)

	commit 8eae939f14003 (net: add limit for socket backlog )

Now we enforce a limit, better to choose a correct limit / tcpwindow
combination so that normal traffic doesn't trigger drops at receiver

>      0 bad segments received.
>      0 resets sent
> Udp:
>      7 packets received
>      0 packets to unknown port received.
>      0 packet receive errors
>      7 packets sent
> UdpLite:
> TcpExt:
>      0 packets pruned from receive queue because of socket buffer overrun
>      0 ICMP packets dropped because they were out-of-window
>      0 TCP sockets finished time wait in fast timer
>      2 delayed acks sent
>      0 delayed acks further delayed because of locked socket
>      Quick ack mode was activated 0 times
>      170856 packets directly queued to recvmsg prequeue.
>      1204 bytes directly in process context from backlog
>      170678 bytes directly received in process context from prequeue
>      592090 packet headers predicted
>      170626 packets header predicted and directly queued to user
>      1375 acknowledgments not containing data payload received
>      174911 predicted acknowledgments
>      150 times recovered from packet loss by selective acknowledgements
>      0 congestion windows recovered without slow start by DSACK
>      0 congestion windows recovered without slow start after partial ack
>      299 TCP data loss events
>      TCPLostRetransmit: 9
>      0 timeouts after reno fast retransmit
>      0 timeouts after SACK recovery
>      253 fast retransmits
>      14 forward retransmits
>      6 retransmits in slow start
>      0 other TCP timeouts
>      1 SACK retransmits failed
>      0 times receiver scheduled too late for direct processing
>      0 packets collapsed in receive queue due to low socket buffer
>      0 DSACKs sent for old packets
>      0 DSACKs received
>      0 connections reset due to unexpected data
>      0 connections reset due to early user close
>      0 connections aborted due to timeout
>      0 times unabled to send RST due to no memory
>      TCPDSACKIgnoredOld: 0
>      TCPDSACKIgnoredNoUndo: 0
>      TCPSackShifted: 0
>      TCPSackMerged: 1031
>      TCPSackShiftFallback: 240
>      TCPBacklogDrop: 0
>      IPReversePathFilter: 0
> IpExt:
>      InMcastPkts: 0
>      OutMcastPkts: 0
>      InBcastPkts: 1
>      InOctets: -1012182764
>      OutOctets: -1436530450
>      InMcastOctets: 0
>      OutMcastOctets: 0
>      InBcastOctets: 147
> 
> and then the deltas on the receiver:
> 
> raj@raj-8510w:~/netperf2_trunk$ cat delta.recv
> Ip:
>      734669 total packets received
>      0 with invalid addresses
>      0 forwarded
>      0 incoming packets discarded
>      734669 incoming packets delivered
>      766696 requests sent out
>      0 dropped because of missing route
> Icmp:
>      0 ICMP messages received
>      0 input ICMP message failed.
>      ICMP input histogram:
>          destination unreachable: 0
>      0 ICMP messages sent
>      0 ICMP messages failed
>      ICMP output histogram:
> IcmpMsg:
>          InType3: 0
> Tcp:
>      0 active connections openings
>      2 passive connection openings
>      0 failed connection attempts
>      0 connection resets received
>      0 connections established
>      734651 segments received
>      766695 segments send out
>      0 segments retransmited
>      0 bad segments received.
>      0 resets sent
> Udp:
>      1 packets received
>      0 packets to unknown port received.
>      0 packet receive errors
>      1 packets sent
> UdpLite:
> TcpExt:
>      28 packets pruned from receive queue because of socket buffer overrun
>      0 delayed acks sent
>      0 delayed acks further delayed because of locked socket
>      19 packets directly queued to recvmsg prequeue.
>      0 bytes directly in process context from backlog
>      667 bytes directly received in process context from prequeue
>      727842 packet headers predicted
>      9 packets header predicted and directly queued to user
>      161 acknowledgments not containing data payload received
>      229704 predicted acknowledgments


>      6774 packets collapsed in receive queue due to low socket buffer
>      TCPBacklogDrop: 276

	Yes, these two counters explain all.

	1) "6774 packets collapsed in receive queue due to low socket buffer"

We spend a _lot_ of cpu time in the "collapsing" process: taking several
skbs and building a compound one (using one PAGE and trying to fill all the
available bytes in it with contiguous parts).

Doing this work is of course a last desperate attempt before the much more
painful:

	2) TCPBacklogDrop: 276

	We plain drop incoming messages because too much kernel memory is used
by the socket.

> IpExt:
>      InMcastPkts: 0
>      OutMcastPkts: 0
>      InBcastPkts: 17
>      OutBcastPkts: 0
>      InOctets: 38973144
>      OutOctets: 40673137
>      InMcastOctets: 0
>      OutMcastOctets: 0
>      InBcastOctets: 1816
>      OutBcastOctets: 0
> 
> this is an otherwise clean network, no errors reported by ifconfig or 
> ethtool -S, and the packet rate was well within the limits of 1 GbE and 
> the ProCurve 2724 switch between the two systems.
> 
>  From just a very quick look it looks like tcp_v[46]_rcv is called, 
> finds that the socket is owned by the user, attempts to add to the 
> backlog, but the path called by sk_add_backlog does not seem to make any 
> attempts to compress things, so when the quantity of data is << the 
> truesize it starts tossing babies out with the bathwater.
> 

Rick, could you redo the test, using following bit on receiver :

echo 1 >/proc/sys/net/ipv4/tcp_adv_win_scale

If you still have collapses/retransmits, you then could try :

echo -2 >/proc/sys/net/ipv4/tcp_adv_win_scale

Thanks !

^ permalink raw reply

* [PATCH 3/3] x25: Prevent skb overreads when checking call user data
From: Matthew Daley @ 2011-10-15  4:45 UTC (permalink / raw)
  To: netdev; +Cc: Eric Dumazet, Andrew Hendry, Matthew Daley, stable
In-Reply-To: <1318653905-13716-1-git-send-email-mattjd@gmail.com>

x25_find_listener does not check that the amount of call user data given
in the skb is big enough in per-socket comparisons, hence buffer
overreads may occur.  Fix this by adding a check.

Signed-off-by: Matthew Daley <mattjd@gmail.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Andrew Hendry <andrew.hendry@gmail.com>
Cc: stable <stable@kernel.org>
---
 net/x25/af_x25.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index aa567b0..5f03e4e 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -295,7 +295,8 @@ static struct sock *x25_find_listener(struct x25_address *addr,
 			 * Found a listening socket, now check the incoming
 			 * call user data vs this sockets call user data
 			 */
-			if(skb->len > 0 && x25_sk(s)->cudmatchlength > 0) {
+			if (x25_sk(s)->cudmatchlength > 0 &&
+				skb->len >= x25_sk(s)->cudmatchlength) {
 				if((memcmp(x25_sk(s)->calluserdata.cuddata,
 					skb->data,
 					x25_sk(s)->cudmatchlength)) == 0) {
-- 
1.7.2.5

^ permalink raw reply related

* [PATCH 2/3] x25: Handle undersized/fragmented skbs
From: Matthew Daley @ 2011-10-15  4:45 UTC (permalink / raw)
  To: netdev; +Cc: Eric Dumazet, Andrew Hendry, Matthew Daley, stable
In-Reply-To: <1318653905-13716-1-git-send-email-mattjd@gmail.com>

There are multiple locations in the X.25 packet layer where a skb is
assumed to be of at least a certain size and that all its data is
currently available at skb->data.  These assumptions are not checked,
hence buffer overreads may occur.  Use pskb_may_pull to check these
minimal size assumptions and ensure that data is available at skb->data
when necessary, as well as use skb_copy_bits where needed.

Signed-off-by: Matthew Daley <mattjd@gmail.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Andrew Hendry <andrew.hendry@gmail.com>
Cc: stable <stable@kernel.org>
---
 net/x25/af_x25.c         |   31 ++++++++++++++++++++++++-------
 net/x25/x25_dev.c        |    6 ++++++
 net/x25/x25_facilities.c |   10 ++++++----
 net/x25/x25_in.c         |   40 +++++++++++++++++++++++++++++++++++-----
 net/x25/x25_link.c       |    3 +++
 net/x25/x25_subr.c       |   14 +++++++++++++-
 6 files changed, 87 insertions(+), 17 deletions(-)

diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index a4bd172..aa567b0 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -91,7 +91,7 @@ int x25_parse_address_block(struct sk_buff *skb,
 	int needed;
 	int rc;
 
-	if (skb->len < 1) {
+	if (!pskb_may_pull(skb, 1)) {
 		/* packet has no address block */
 		rc = 0;
 		goto empty;
@@ -100,7 +100,7 @@ int x25_parse_address_block(struct sk_buff *skb,
 	len = *skb->data;
 	needed = 1 + (len >> 4) + (len & 0x0f);
 
-	if (skb->len < needed) {
+	if (!pskb_may_pull(skb, needed)) {
 		/* packet is too short to hold the addresses it claims
 		   to hold */
 		rc = -1;
@@ -951,10 +951,10 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
 	 *
 	 *	Facilities length is mandatory in call request packets
 	 */
-	if (skb->len < 1)
+	if (!pskb_may_pull(skb, 1))
 		goto out_clear_request;
 	len = skb->data[0] + 1;
-	if (skb->len < len)
+	if (!pskb_may_pull(skb, len))
 		goto out_clear_request;
 	skb_pull(skb,len);
 
@@ -965,6 +965,13 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
 		goto out_clear_request;
 
 	/*
+	 *	Get all the call user data so it can be used in
+	 *	x25_find_listener and skb_copy_from_linear_data up ahead.
+	 */
+	if (!pskb_may_pull(skb, skb->len))
+		goto out_clear_request;
+
+	/*
 	 *	Find a listener for the particular address/cud pair.
 	 */
 	sk = x25_find_listener(&source_addr,skb);
@@ -1172,6 +1179,9 @@ static int x25_sendmsg(struct kiocb *iocb, struct socket *sock,
 	 *	byte of the user data is the logical value of the Q Bit.
 	 */
 	if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) {
+		if (!pskb_may_pull(skb, 1))
+			goto out_kfree_skb;
+
 		qbit = skb->data[0];
 		skb_pull(skb, 1);
 	}
@@ -1250,7 +1260,9 @@ static int x25_recvmsg(struct kiocb *iocb, struct socket *sock,
 	struct x25_sock *x25 = x25_sk(sk);
 	struct sockaddr_x25 *sx25 = (struct sockaddr_x25 *)msg->msg_name;
 	size_t copied;
-	int qbit;
+	int qbit, header_len = x25->neighbour->extended ?
+		X25_EXT_MIN_LEN : X25_STD_MIN_LEN;
+
 	struct sk_buff *skb;
 	unsigned char *asmptr;
 	int rc = -ENOTCONN;
@@ -1271,6 +1283,9 @@ static int x25_recvmsg(struct kiocb *iocb, struct socket *sock,
 
 		skb = skb_dequeue(&x25->interrupt_in_queue);
 
+		if (!pskb_may_pull(skb, X25_STD_MIN_LEN))
+			goto out_free_dgram;
+
 		skb_pull(skb, X25_STD_MIN_LEN);
 
 		/*
@@ -1291,10 +1306,12 @@ static int x25_recvmsg(struct kiocb *iocb, struct socket *sock,
 		if (!skb)
 			goto out;
 
+		if (!pskb_may_pull(skb, header_len))
+			goto out_free_dgram;
+
 		qbit = (skb->data[0] & X25_Q_BIT) == X25_Q_BIT;
 
-		skb_pull(skb, x25->neighbour->extended ?
-				X25_EXT_MIN_LEN : X25_STD_MIN_LEN);
+		skb_pull(skb, header_len);
 
 		if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) {
 			asmptr  = skb_push(skb, 1);
diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c
index e547ca1..fa2b418 100644
--- a/net/x25/x25_dev.c
+++ b/net/x25/x25_dev.c
@@ -32,6 +32,9 @@ static int x25_receive_data(struct sk_buff *skb, struct x25_neigh *nb)
 	unsigned short frametype;
 	unsigned int lci;
 
+	if (!pskb_may_pull(skb, X25_STD_MIN_LEN))
+		return 0;
+
 	frametype = skb->data[2];
 	lci = ((skb->data[0] << 8) & 0xF00) + ((skb->data[1] << 0) & 0x0FF);
 
@@ -115,6 +118,9 @@ int x25_lapb_receive_frame(struct sk_buff *skb, struct net_device *dev,
 		goto drop;
 	}
 
+	if (!pskb_may_pull(skb, 1))
+		return 0;
+
 	switch (skb->data[0]) {
 
 	case X25_IFACE_DATA:
diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c
index f77e4e7..36384a1 100644
--- a/net/x25/x25_facilities.c
+++ b/net/x25/x25_facilities.c
@@ -44,7 +44,7 @@
 int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities,
 		struct x25_dte_facilities *dte_facs, unsigned long *vc_fac_mask)
 {
-	unsigned char *p = skb->data;
+	unsigned char *p;
 	unsigned int len;
 
 	*vc_fac_mask = 0;
@@ -60,14 +60,16 @@ int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities,
 	memset(dte_facs->called_ae, '\0', sizeof(dte_facs->called_ae));
 	memset(dte_facs->calling_ae, '\0', sizeof(dte_facs->calling_ae));
 
-	if (skb->len < 1)
+	if (!pskb_may_pull(skb, 1))
 		return 0;
 
-	len = *p++;
+	len = skb->data[0];
 
-	if (len >= skb->len)
+	if (!pskb_may_pull(skb, 1 + len))
 		return -1;
 
+	p = skb->data + 1;
+
 	while (len > 0) {
 		switch (*p & X25_FAC_CLASS_MASK) {
 		case X25_FAC_CLASS_A:
diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c
index 63488fd..a49cd4e 100644
--- a/net/x25/x25_in.c
+++ b/net/x25/x25_in.c
@@ -107,6 +107,8 @@ static int x25_state1_machine(struct sock *sk, struct sk_buff *skb, int frametyp
 		/*
 		 *	Parse the data in the frame.
 		 */
+		if (!pskb_may_pull(skb, X25_STD_MIN_LEN))
+			goto out_clear;
 		skb_pull(skb, X25_STD_MIN_LEN);
 
 		len = x25_parse_address_block(skb, &source_addr,
@@ -130,9 +132,8 @@ static int x25_state1_machine(struct sock *sk, struct sk_buff *skb, int frametyp
 			if (skb->len > X25_MAX_CUD_LEN)
 				goto out_clear;
 
-			skb_copy_from_linear_data(skb,
-						  x25->calluserdata.cuddata,
-						  skb->len);
+			skb_copy_bits(skb, 0, x25->calluserdata.cuddata,
+				skb->len);
 			x25->calluserdata.cudlength = skb->len;
 		}
 		if (!sock_flag(sk, SOCK_DEAD))
@@ -140,6 +141,9 @@ static int x25_state1_machine(struct sock *sk, struct sk_buff *skb, int frametyp
 		break;
 	}
 	case X25_CLEAR_REQUEST:
+		if (!pskb_may_pull(skb, X25_STD_MIN_LEN + 2))
+			goto out_clear;
+
 		x25_write_internal(sk, X25_CLEAR_CONFIRMATION);
 		x25_disconnect(sk, ECONNREFUSED, skb->data[3], skb->data[4]);
 		break;
@@ -167,6 +171,9 @@ static int x25_state2_machine(struct sock *sk, struct sk_buff *skb, int frametyp
 	switch (frametype) {
 
 		case X25_CLEAR_REQUEST:
+			if (!pskb_may_pull(skb, X25_STD_MIN_LEN + 2))
+				goto out_clear;
+
 			x25_write_internal(sk, X25_CLEAR_CONFIRMATION);
 			x25_disconnect(sk, 0, skb->data[3], skb->data[4]);
 			break;
@@ -180,6 +187,11 @@ static int x25_state2_machine(struct sock *sk, struct sk_buff *skb, int frametyp
 	}
 
 	return 0;
+
+out_clear:
+	x25_write_internal(sk, X25_CLEAR_REQUEST);
+	x25_start_t23timer(sk);
+	return 0;
 }
 
 /*
@@ -209,6 +221,9 @@ static int x25_state3_machine(struct sock *sk, struct sk_buff *skb, int frametyp
 			break;
 
 		case X25_CLEAR_REQUEST:
+			if (!pskb_may_pull(skb, X25_STD_MIN_LEN + 2))
+				goto out_clear;
+
 			x25_write_internal(sk, X25_CLEAR_CONFIRMATION);
 			x25_disconnect(sk, 0, skb->data[3], skb->data[4]);
 			break;
@@ -307,6 +322,12 @@ static int x25_state3_machine(struct sock *sk, struct sk_buff *skb, int frametyp
 	}
 
 	return queued;
+
+out_clear:
+	x25_write_internal(sk, X25_CLEAR_REQUEST);
+	x25->state = X25_STATE_2;
+	x25_start_t23timer(sk);
+	return 0;
 }
 
 /*
@@ -316,13 +337,13 @@ static int x25_state3_machine(struct sock *sk, struct sk_buff *skb, int frametyp
  */
 static int x25_state4_machine(struct sock *sk, struct sk_buff *skb, int frametype)
 {
+	struct x25_sock *x25 = x25_sk(sk);
+
 	switch (frametype) {
 
 		case X25_RESET_REQUEST:
 			x25_write_internal(sk, X25_RESET_CONFIRMATION);
 		case X25_RESET_CONFIRMATION: {
-			struct x25_sock *x25 = x25_sk(sk);
-
 			x25_stop_timer(sk);
 			x25->condition = 0x00;
 			x25->va        = 0;
@@ -334,6 +355,9 @@ static int x25_state4_machine(struct sock *sk, struct sk_buff *skb, int frametyp
 			break;
 		}
 		case X25_CLEAR_REQUEST:
+			if (!pskb_may_pull(skb, X25_STD_MIN_LEN + 2))
+				goto out_clear;
+
 			x25_write_internal(sk, X25_CLEAR_CONFIRMATION);
 			x25_disconnect(sk, 0, skb->data[3], skb->data[4]);
 			break;
@@ -343,6 +367,12 @@ static int x25_state4_machine(struct sock *sk, struct sk_buff *skb, int frametyp
 	}
 
 	return 0;
+
+out_clear:
+	x25_write_internal(sk, X25_CLEAR_REQUEST);
+	x25->state = X25_STATE_2;
+	x25_start_t23timer(sk);
+	return 0;
 }
 
 /* Higher level upcall for a LAPB frame */
diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c
index 037958f..4acacf3 100644
--- a/net/x25/x25_link.c
+++ b/net/x25/x25_link.c
@@ -90,6 +90,9 @@ void x25_link_control(struct sk_buff *skb, struct x25_neigh *nb,
 		break;
 
 	case X25_DIAGNOSTIC:
+		if (!pskb_may_pull(skb, X25_STD_MIN_LEN + 4))
+			break;
+
 		printk(KERN_WARNING "x25: diagnostic #%d - %02X %02X %02X\n",
 		       skb->data[3], skb->data[4],
 		       skb->data[5], skb->data[6]);
diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c
index 24a342e..5170d52 100644
--- a/net/x25/x25_subr.c
+++ b/net/x25/x25_subr.c
@@ -269,7 +269,11 @@ int x25_decode(struct sock *sk, struct sk_buff *skb, int *ns, int *nr, int *q,
 	       int *d, int *m)
 {
 	struct x25_sock *x25 = x25_sk(sk);
-	unsigned char *frame = skb->data;
+	unsigned char *frame;
+
+	if (!pskb_may_pull(skb, X25_STD_MIN_LEN))
+		return X25_ILLEGAL;
+	frame = skb->data;
 
 	*ns = *nr = *q = *d = *m = 0;
 
@@ -294,6 +298,10 @@ int x25_decode(struct sock *sk, struct sk_buff *skb, int *ns, int *nr, int *q,
 		if (frame[2] == X25_RR  ||
 		    frame[2] == X25_RNR ||
 		    frame[2] == X25_REJ) {
+			if (!pskb_may_pull(skb, X25_EXT_MIN_LEN))
+				return X25_ILLEGAL;
+			frame = skb->data;
+
 			*nr = (frame[3] >> 1) & 0x7F;
 			return frame[2];
 		}
@@ -308,6 +316,10 @@ int x25_decode(struct sock *sk, struct sk_buff *skb, int *ns, int *nr, int *q,
 
 	if (x25->neighbour->extended) {
 		if ((frame[2] & 0x01) == X25_DATA) {
+			if (!pskb_may_pull(skb, X25_EXT_MIN_LEN))
+				return X25_ILLEGAL;
+			frame = skb->data;
+
 			*q  = (frame[0] & X25_Q_BIT) == X25_Q_BIT;
 			*d  = (frame[0] & X25_D_BIT) == X25_D_BIT;
 			*m  = (frame[3] & X25_EXT_M_BIT) == X25_EXT_M_BIT;
-- 
1.7.2.5

^ permalink raw reply related

* [PATCH 1/3] x25: Validate incoming call user data lengths
From: Matthew Daley @ 2011-10-15  4:45 UTC (permalink / raw)
  To: netdev; +Cc: Eric Dumazet, Andrew Hendry, Matthew Daley, stable
In-Reply-To: <1318653905-13716-1-git-send-email-mattjd@gmail.com>

X.25 call user data is being copied in its entirety from incoming messages
without consideration to the size of the destination buffers, leading to
possible buffer overflows. Validate incoming call user data lengths before
these copies are performed.

It appears this issue was noticed some time ago, however nothing seemed to
come of it: see http://www.spinics.net/lists/linux-x25/msg00043.html and
commit 8db09f26f912f7c90c764806e804b558da520d4f.

Signed-off-by: Matthew Daley <mattjd@gmail.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Tested-by: Andrew Hendry <andrew.hendry@gmail.com>
Cc: stable <stable@kernel.org>
---
 net/x25/af_x25.c |    6 ++++++
 net/x25/x25_in.c |    3 +++
 2 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index d306154..a4bd172 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -959,6 +959,12 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
 	skb_pull(skb,len);
 
 	/*
+	 *	Ensure that the amount of call user data is valid.
+	 */
+	if (skb->len > X25_MAX_CUD_LEN)
+		goto out_clear_request;
+
+	/*
 	 *	Find a listener for the particular address/cud pair.
 	 */
 	sk = x25_find_listener(&source_addr,skb);
diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c
index 0b073b5..63488fd 100644
--- a/net/x25/x25_in.c
+++ b/net/x25/x25_in.c
@@ -127,6 +127,9 @@ static int x25_state1_machine(struct sock *sk, struct sk_buff *skb, int frametyp
 		 *	Copy any Call User Data.
 		 */
 		if (skb->len > 0) {
+			if (skb->len > X25_MAX_CUD_LEN)
+				goto out_clear;
+
 			skb_copy_from_linear_data(skb,
 						  x25->calluserdata.cuddata,
 						  skb->len);
-- 
1.7.2.5

^ permalink raw reply related

* x25: Fix multiple buffer overruns/overreads
From: Matthew Daley @ 2011-10-15  4:45 UTC (permalink / raw)
  To: netdev; +Cc: Eric Dumazet, Andrew Hendry

This patchset fixes several buffer overruns/overreads in the X.25
packet layer. The first patch fixes a particularly nasty remote-triggerable
buffer overflow, while the rest fix skb overreads on undersized/fragmented
skbs.

Matthew Daley (3):
      x25: Validate incoming call user data lengths
      x25: Handle undersized/fragmented skbs
      x25: Prevent skb overreads when checking call user data

 net/x25/af_x25.c         |   40 ++++++++++++++++++++++++++++++++--------
 net/x25/x25_dev.c        |    6 ++++++
 net/x25/x25_facilities.c |   10 ++++++----
 net/x25/x25_in.c         |   43 ++++++++++++++++++++++++++++++++++++++-----
 net/x25/x25_link.c       |    3 +++
 net/x25/x25_subr.c       |   14 +++++++++++++-
 6 files changed, 98 insertions(+), 18 deletions(-)

^ permalink raw reply

* Re: [PATCH net-next] skbuff: update sk truesize in pskb_expand_head
From: David Miller @ 2011-10-15  4:02 UTC (permalink / raw)
  To: roy.qing.li; +Cc: netdev
In-Reply-To: <CAJFZqHyvu5419zaJ6Ro+s2d06LZ3mBqbAPuGF=3K6JAUK_vHqA@mail.gmail.com>

From: RongQing Li <roy.qing.li@gmail.com>
Date: Sat, 15 Oct 2011 11:52:22 +0800

> About incorrectly impact on socket, I am consider it, but I still no
> idea about it.

Sockets attach themselves to packets, and assign a destructor.

This destructor atomically decrements the receive or send buffer
space used, and amount decremented is skb->truesize.

Therefore if you change skb->truesize on such an SKB, the wrong
amount will be decremented when the destructor is called.

This is a very fundamental aspect of SKB handling, perhaps you should
familiarize yourself with SKBs a little bit more before modifying code
which manages them.

^ permalink raw reply

* Re: [PATCH net-next] skbuff: update sk truesize in pskb_expand_head
From: RongQing Li @ 2011-10-15  3:54 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <1318578829.2533.100.camel@edumazet-laptop>

2011/10/14 Eric Dumazet <eric.dumazet@gmail.com>:
> Le vendredi 14 octobre 2011 à 15:39 +0800, roy.qing.li@gmail.com a
> écrit :
>
> I dont believe this is needed or complete patch.
>
> Callers that need an updated truesize do the adjustment.
>

If we think the size which needs to adjust is small, we can ignore it,
I accept it.

If we let the caller adjust it, I think it is hard for the caller to work out the adjustment.

Do you think the below adjustment is needed?

void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
		int size)
{
	skb_fill_page_desc(skb, i, page, off, size);
	skb->len += size;
	skb->data_len += size;
-	skb->truesize += size;
+	skb->truesize += PAGE_SIZE;
}

Thanks
-Qing

^ permalink raw reply

* Re: [PATCH net-next] skbuff: update sk truesize in pskb_expand_head
From: RongQing Li @ 2011-10-15  3:52 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20111014.034822.2255593021765068562.davem@davemloft.net>

2011/10/14 David Miller <davem@davemloft.net>:
> From: roy.qing.li@gmail.com
> Date: Fri, 14 Oct 2011 15:39:30 +0800
>
>> when pskb_expand_head reallocates header of &sk_buff, the sk
>> truesize should be updated simultaneously
>>
>> Signed-off-by: RongQing.Li <roy.qing.li@gmail.com>
>
> I know you did not test this patch at all.
>
> You can't modify the truesize because packets passed to this routine
> are often attached to a socket, thus if you change the truesize the
> socket memory accounting will be adjusted incorrectly later when the
> SKB is freed up.
>
> Most SKB modifying functions have to operate with this restriction.
>
>

I am sorry I do not know how to test it in fact.

But I believe the adjustment is right here.
As for the incorrect impact on the socket, I am considering it, but I still
have no idea about it.

Thanks.

^ permalink raw reply

* Re: [PATCH net-next] tcp: reduce memory needs of out of order queue
From: David Miller @ 2011-10-14 23:18 UTC (permalink / raw)
  To: rick.jones2; +Cc: eric.dumazet, netdev
In-Reply-To: <4E98B3B4.20406@hp.com>

From: Rick Jones <rick.jones2@hp.com>
Date: Fri, 14 Oct 2011 15:12:04 -0700

> From just a very quick look it looks like tcp_v[46]_rcv is called,
> finds that the socket is owned by the user, attempts to add to the
> backlog, but the path called by sk_add_backlog does not seem to make
> any attempts to compress things, so when the quantity of data is <<
> the truesize it starts tossing babies out with the bathwater.

This is why I don't believe the right fix is to add bandaids all
around the TCP layer.

The wastage has to be avoided at a higher level.

^ permalink raw reply

* Re: [PATCH] net: ipv6: Allow netlink to set IPv6 address scope
From: Lorenzo Colitti @ 2011-10-14 22:32 UTC (permalink / raw)
  To: Brian Haley; +Cc: maze, yoshfuji, netdev
In-Reply-To: <4E98981D.6080908@hp.com>

On Fri, Oct 14, 2011 at 13:14, Brian Haley <brian.haley@hp.com> wrote:
> Playing devil's advocate here, isn't this a brain-dead ISP?  If they're
> giving you a global IPv6 address you should get Internet connectivity
> with it.  If not, you probably knew it up front, or you're going to find
> another provider that does.  It's like they're giving you a site-local
> address...

I wouldn't say they're a brain-dead carrier, because they also give you
true IPv6 connectivity on another interface. The phone will deactivate
this interface when bringing up wifi, because wifi is usually cheaper and
faster. However, the carrier interface needs to stay up all the time
in order to do such things as provisioning, send and receive SMS
messages, handle voice over IP calls, and so on. So you will have
cases where the phone's primary Internet connection is over wifi, but
the phone still has a global unicast IPv6 address on the carrier interface.

> So are you talking about being able to dynamically change the scope
> of an address?  Wifi comes up - change provider addresses to host-
> local, wifi goes down - change it back to global.  That looks like a
> hack.

What I'm suggesting is to have the carrier interface be created with
site scope and stay up forever (or as long as the phone is on the cell
network). If an application wants to use the carrier interface, it will create
host routes that explicitly specify the carrier interface and the source
address of the carrier interface.

Applications that don't use the carrier interface will not have to do
anything special; if you set the carrier interface to site scope,
the kernel should just do the right thing.

> A default route with only a link-local address isn't very useful.  Will the
> kernel ever use this interface with the global address of your carrier -
> isn't it going to prefer the interface that address is configured on?

No, it will use the carrier address, because RFC3484 says "avoid
deprecated addresses" (rule 3) before it says "prefer outgoing interface"
(rule 5).

> That's a pretty small window where the address is in tentative state (< 2 sec), and re-trying shortly after will work.

That's true, but as the application you don't know that. You just call
connect(), and sit there and hang. There's a lot of applications that want
to reconnect as soon as they see that wifi has come up, and they all
get stuck.

> You can also use routing rules, like anyone that does dual-homed
> with IPv4 does  - only use 1.2.3.4 on eth0, and only use 4.3.2.1 on
> eth1.

That doesn't work, because when the kernel creates a new TCP
connection, its source address is empty, so no source-based rules
match. More specifically, if I do (assume the carrier IP address is
2001:db8::1/64):

ip -6 rule add prio 100 from 2001:db8::/64 lookup 100
ip -6 route add table 100 unreachable default
ip -6 route add default via fe80::1 dev wlan0

the system will still use the address from the carrier interface. I think
this is by design though.

> And there's also gai.conf, although I haven't played with that in a
> while.

gai.conf only handles destination address selection; the kernel
handles source address selection.

> The other trick/hack is to change the preferred lifetime of an address
> to zero, which should mark it deprecated, moving it down in the
> selection list.

That will work, but I think it's more of a hack than setting scope to
site. The carrier address is not deprecated, it's perfectly
functional. It just can't reach the Internet, but only a part of it.
So I think that calling it scoped is the solution that makes the most
sense.

> What else will break though?  If I configure fe80::1/64 and set the
> scope to global, do applications know to look at ifa_scope and not
> just the address itself to determine the scope?  Should they?

If the application treats scoped addresses specially, it will treat
fe80::1 as link-local scope, and will do things like check
sin6_scope_id to see if it's unique. On balance, I think that's the
right thing to do for an application in this situation, since, as
above, it's the network that truly decides what scope an address is
unique in, not the application or the kernel, and I would expect the
rest of the network to treat fe80::/64 as link-local (for example,
routers won't forward it, and so on).

The kernel, on the other hand, will try to use fe80::1 as if it were a
global address for the purposes of source address selection, but after
all, that's what you asked it to do, so you can't really blame it.

Make sense?

^ permalink raw reply

* Re: [PATCH net-next] tcp: reduce memory needs of out of order queue
From: Rick Jones @ 2011-10-14 22:12 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, netdev
In-Reply-To: <4E985A3F.5080103@hp.com>


> I believe that may be the case - at least during something like:
>
> netperf -t TCP_RR -H <host> -l 30 -- -b 256 -D
>
> which on an otherwise quiet test setup will report a non-trivial number
> of retransmissions - either via looking at netstat -s output, or by
> adding local_transport_retrans,remote_transport_retrans to an output
> selector for netperf (eg -o
> throughput,burst_size,local_transport_retrans,remote_transport_retrans,lss_size_end,rsr_size_end)
>
>
> (I plan on providing more data after a laptop has gone through some
> upgrades)

So, a test as above from a system running 2.6.38-11-generic to a system 
running 3.0.0-12-generic.  On the sender we have:

raj@tardy:~/netperf2_trunk$ netstat -s > before; src/netperf -H 
raj-8510w.americas.hpqcorp.net -t tcp_rr -- -b 256 -D -o 
throughput,local_transport_retrans,remote_transport_retrans,lss_size_end,rsr_size_end 
; netstat -s > after
MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET 
to internal-host.americas.hpqcorp.net (16.89.245.115) port 0 AF_INET : 
nodelay : first burst 256
Throughput,Local Transport Retransmissions,Remote Transport 
Retransmissions,Local Send Socket Size Final,Remote Recv Socket Size Final
76752.43,274,0,16384,98304

274 retransmissions at the sender.  The "beforeafter" of that on the sender:

raj@tardy:~/netperf2_trunk$ cat delta.send
Ip:
     766747 total packets received
     12 with invalid addresses
     0 forwarded
     0 incoming packets discarded
     766735 incoming packets delivered
     734689 requests sent out
     0 dropped because of missing route
Icmp:
     0 ICMP messages received
     0 input ICMP message failed.
     ICMP input histogram:
         destination unreachable: 0
         echo requests: 0
         echo replies: 0
     0 ICMP messages sent
     0 ICMP messages failed
     ICMP output histogram:
         destination unreachable: 0
         echo request: 0
         echo replies: 0
IcmpMsg:
         InType0: 0
         InType3: 0
         InType8: 0
         OutType0: 0
         OutType3: 0
         OutType8: 0
Tcp:
     2 active connections openings
     0 passive connection openings
     0 failed connection attempts
     0 connection resets received
     0 connections established
     766727 segments received
     734408 segments send out
     274 segments retransmited
     0 bad segments received.
     0 resets sent
Udp:
     7 packets received
     0 packets to unknown port received.
     0 packet receive errors
     7 packets sent
UdpLite:
TcpExt:
     0 packets pruned from receive queue because of socket buffer overrun
     0 ICMP packets dropped because they were out-of-window
     0 TCP sockets finished time wait in fast timer
     2 delayed acks sent
     0 delayed acks further delayed because of locked socket
     Quick ack mode was activated 0 times
     170856 packets directly queued to recvmsg prequeue.
     1204 bytes directly in process context from backlog
     170678 bytes directly received in process context from prequeue
     592090 packet headers predicted
     170626 packets header predicted and directly queued to user
     1375 acknowledgments not containing data payload received
     174911 predicted acknowledgments
     150 times recovered from packet loss by selective acknowledgements
     0 congestion windows recovered without slow start by DSACK
     0 congestion windows recovered without slow start after partial ack
     299 TCP data loss events
     TCPLostRetransmit: 9
     0 timeouts after reno fast retransmit
     0 timeouts after SACK recovery
     253 fast retransmits
     14 forward retransmits
     6 retransmits in slow start
     0 other TCP timeouts
     1 SACK retransmits failed
     0 times receiver scheduled too late for direct processing
     0 packets collapsed in receive queue due to low socket buffer
     0 DSACKs sent for old packets
     0 DSACKs received
     0 connections reset due to unexpected data
     0 connections reset due to early user close
     0 connections aborted due to timeout
     0 times unabled to send RST due to no memory
     TCPDSACKIgnoredOld: 0
     TCPDSACKIgnoredNoUndo: 0
     TCPSackShifted: 0
     TCPSackMerged: 1031
     TCPSackShiftFallback: 240
     TCPBacklogDrop: 0
     IPReversePathFilter: 0
IpExt:
     InMcastPkts: 0
     OutMcastPkts: 0
     InBcastPkts: 1
     InOctets: -1012182764
     OutOctets: -1436530450
     InMcastOctets: 0
     OutMcastOctets: 0
     InBcastOctets: 147

and then the deltas on the receiver:

raj@raj-8510w:~/netperf2_trunk$ cat delta.recv
Ip:
     734669 total packets received
     0 with invalid addresses
     0 forwarded
     0 incoming packets discarded
     734669 incoming packets delivered
     766696 requests sent out
     0 dropped because of missing route
Icmp:
     0 ICMP messages received
     0 input ICMP message failed.
     ICMP input histogram:
         destination unreachable: 0
     0 ICMP messages sent
     0 ICMP messages failed
     ICMP output histogram:
IcmpMsg:
         InType3: 0
Tcp:
     0 active connections openings
     2 passive connection openings
     0 failed connection attempts
     0 connection resets received
     0 connections established
     734651 segments received
     766695 segments send out
     0 segments retransmited
     0 bad segments received.
     0 resets sent
Udp:
     1 packets received
     0 packets to unknown port received.
     0 packet receive errors
     1 packets sent
UdpLite:
TcpExt:
     28 packets pruned from receive queue because of socket buffer overrun
     0 delayed acks sent
     0 delayed acks further delayed because of locked socket
     19 packets directly queued to recvmsg prequeue.
     0 bytes directly in process context from backlog
     667 bytes directly received in process context from prequeue
     727842 packet headers predicted
     9 packets header predicted and directly queued to user
     161 acknowledgments not containing data payload received
     229704 predicted acknowledgments
     6774 packets collapsed in receive queue due to low socket buffer
     TCPBacklogDrop: 276
IpExt:
     InMcastPkts: 0
     OutMcastPkts: 0
     InBcastPkts: 17
     OutBcastPkts: 0
     InOctets: 38973144
     OutOctets: 40673137
     InMcastOctets: 0
     OutMcastOctets: 0
     InBcastOctets: 1816
     OutBcastOctets: 0

this is an otherwise clean network, no errors reported by ifconfig or 
ethtool -S, and the packet rate was well within the limits of 1 GbE and 
the ProCurve 2724 switch between the two systems.

 From just a very quick look it looks like tcp_v[46]_rcv is called, 
finds that the socket is owned by the user, attempts to add to the 
backlog, but the path called by sk_add_backlog does not seem to make any 
attempts to compress things, so when the quantity of data is << the 
truesize it starts tossing babies out with the bathwater.

rick jones

^ permalink raw reply

* Re: [PATCH v3 1/3] phylib: Convert MDIO and PHY Lib drivers to support 10G
From: David Daney @ 2011-10-14 21:52 UTC (permalink / raw)
  To: Andy Fleming, davem; +Cc: netdev
In-Reply-To: <86469237-B8CD-4A0E-A744-649AFB5E44C2@freescale.com>

On 10/14/2011 02:17 PM, Andy Fleming wrote:
>
> On Oct 13, 2011, at 11:00 AM, David Daney wrote:
>
>> On 10/13/2011 07:37 AM, Andy Fleming wrote:
>>> 10G MDIO is a totally different protocol (clause 45 of 802.3).
>>> Supporting this new protocol requires a couple of changes:
>>>
>>> * Add a new parameter to the mdiobus_read functions to specify the
>>>    "device address" inside the PHY.
>>> * Add a phy45_read/write function which takes advantage of that
>>>    new parameter
>>> * Convert all of the existing drivers to use the new format
>>>
>>> I created a new clause-45-specific read/write functions because:
>>> 1) phy_read and phy_write are highly overloaded functions, and
>>>     finding every instance which is actually the PHY Lib version
>>>     was quite difficult
>>> 2) Most code which invokes phy_read/phy_write inside PHY Lib is
>>>     Clause-22-specific. None of the phy_read/phy_write invocations
>>>     were useable on 10G PHYs
>>>
>>
>> I think converting all these phy_read/phy_write to take an extra
>> parameter is a mistake.  99% of the users have no need for the "device
>> address".  Also you are still passing the protocol mode as a high
>> order bit in the register address, so that part is still quite ugly.
>
>
> I didn't convert *any* of the phy_read/phy_write functions to have
> an extra parameter. I converted only the mdio bus functions.
>
> And…I'm not passing the protocol mode as a high order bit. Am I
> missing something?
>

I misspoke, I meant all the mdiobus_{read,write} functions.  But my 
feeling is the same, a lot of churn may not be good.


> Ah, right. That's what the MDIO bitbang driver was converted to
> do. Are there any clients in the tree that actually use that
> functionality (currently a grep of MII_ADDR_C45 yields only the mdio
> bitbang driver and the macro definition)? I agree that's pretty
> ugly. That's why my second patch converted MDIO bit-bang to use the
> devad argument, instead.
>

Granted, there is nothing in-tree.  Not that it is a good excuse, but I 
am actively working on converting my out-of-tree drivers to be in-tree, 
so I have a natural tendency towards the status quo.

> If we were going to use this method of setting a flag in an existing
> parameter, I'd like it if we could make our method the same as the
> mdio.c code's for improved potential for integration. My objection
> to the use of unused bits in the existing arguments is that if we
> pass a C45 argument to a C22 bus, the behavior is undefined. i.e. -
> we don't know whether the underlying drivers will accidentally write
> bits in registers that have unknown effects, or BUG(), or just pass
> the bad value through.  While I agree that my approach is
> disruptive, I also think that a) It's not that bad (I changed all of
> the affected drivers), and b) It makes the API more explicit and
> self-documenting.
>
> mdio.c's read/write functions go with separate arguments...
>

Well there is that...

Really we need a netdev maintainer to decide the best way forward.  It 
seems like we need to work towards unifying mdio.c and the PHY driver 
infrastructure if possible.

I can adapt my patches either way, but it would be good to know soon 
which way it will be.

David Daney

>>
>> The existing infrastructure where we pass the "device address" in bits
>> 16..20 of the register number is much less disruptive.
>>
>> If you don't like it, an easy and much less intrusive approach might
>> be a simple (untested) wrapper:
>>
>> static inline int phy45_read(struct phy_device *phydev,
>>                              int devad, u16 regnum)
>> {
>> 	u32 c45_reg = MII_ADDR_C45 | ((devad&  0x1f)<<  16) | regnum;
>> 	return phy_read(phydev, c45_reg)
>> }
>>
>> static inline int phy45_write(struct phy_device *phydev,
>>                               int devad, u16 regnum, u16 val)
>> {
>> 	u32 c45_reg = MII_ADDR_C45 | ((devad&  0x1f)<<  16) | regnum;
>> 	return phy_write(phydev, c45_reg, val)
>> }
>
>
> I admit this is far easier, but it feels much less clean to me. It
> sounds like Grant's ok with it, so if that's the approach we want,
> I'd be fine with converting David's approach to use the mdio45_probe
> equivalent, so we get the more robust device probing.
>
> Andy

^ permalink raw reply

* Re: [PATCH v3 1/3] phylib: Convert MDIO and PHY Lib drivers to support 10G
From: Andy Fleming @ 2011-10-14 21:17 UTC (permalink / raw)
  To: David Daney; +Cc: davem, netdev
In-Reply-To: <4E970B03.9060200@cavium.com>

On Oct 13, 2011, at 11:00 AM, David Daney wrote:

> On 10/13/2011 07:37 AM, Andy Fleming wrote:
>> 10G MDIO is a totally different protocol (clause 45 of 802.3).
>> Supporting this new protocol requires a couple of changes:
>> 
>> * Add a new parameter to the mdiobus_read functions to specify the
>>   "device address" inside the PHY.
>> * Add a phy45_read/write function which takes advantage of that
>>   new parameter
>> * Convert all of the existing drivers to use the new format
>> 
>> I created a new clause-45-specific read/write functions because:
>> 1) phy_read and phy_write are highly overloaded functions, and
>>    finding every instance which is actually the PHY Lib version
>>    was quite difficult
>> 2) Most code which invokes phy_read/phy_write inside PHY Lib is
>>    Clause-22-specific. None of the phy_read/phy_write invocations
>>    were useable on 10G PHYs
>> 
> 
> I think converting all these phy_read/phy_write to take an extra
> parameter is a mistake.  99% of the users have no need for the "device
> address".  Also you are still passing the protocol mode as a high
> order bit in the register address, so that part is still quite ugly.

I didn't convert *any* of the phy_read/phy_write functions to have an extra parameter. I converted only the mdio bus functions.

And…I'm not passing the protocol mode as a high order bit. Am I missing something?

Ah, right. That's what the MDIO bitbang driver was converted to do. Are there any clients in the tree that actually use that functionality (currently a grep of MII_ADDR_C45 yields only the mdio bitbang driver and the macro definition)? I agree that's pretty ugly. That's why my second patch converted MDIO bit-bang to use the devad argument, instead.

If we were going to use this method of setting a flag in an existing parameter, I'd like it if we could make our method the same as the mdio.c code's for improved potential for integration. My objection to the use of unused bits in the existing arguments is that if we pass a C45 argument to a C22 bus, the behavior is undefined. i.e. - we don't know whether the underlying drivers will accidentally write bits in registers that have unknown effects, or BUG(), or just pass the bad value through.

While I agree that my approach is disruptive, I also think that a) It's not that bad (I changed all of the affected drivers), and b) It makes the API more explicit and self-documenting.

mdio.c's read/write functions go with separate arguments...

> 
> The existing infrastructure where we pass the "device address" in bits
> 16..20 of the register number is much less disruptive.
> 
> If you don't like it, an easy and much less intrusive approach might
> be a simple (untested) wrapper:
> 
> static inline int phy45_read(struct phy_device *phydev,
>                             int devad, u16 regnum)
> {
> 	u32 c45_reg = MII_ADDR_C45 | ((devad & 0x1f) << 16) | regnum;
> 	return phy_read(phydev, c45_reg)
> }
> 
> static inline int phy45_write(struct phy_device *phydev,
>                              int devad, u16 regnum, u16 val)
> {
> 	u32 c45_reg = MII_ADDR_C45 | ((devad & 0x1f) << 16) | regnum;
> 	return phy_write(phydev, c45_reg, val)
> }

I admit this is far easier, but it feels much less clean to me. It sounds like Grant's ok with it, so if that's the approach we want, I'd be fine with converting David's approach to use the mdio45_probe equivalent, so we get the more robust device probing.

Andy

^ permalink raw reply

* Re: [PATCH net-next] bnx2x: Disable LRO on FCoE or iSCSI boot device
From: Michael Chan @ 2011-10-14 20:59 UTC (permalink / raw)
  To: John Fastabend
  Cc: 'Rick Jones', 'davem@davemloft.net',
	'netdev@vger.kernel.org', Dmitry Kravkov,
	Eilon Greenstein, eddie.wai
In-Reply-To: <4E9898F6.6000302@intel.com>


On Fri, 2011-10-14 at 13:17 -0700, John Fastabend wrote:
> On 10/14/2011 9:15 AM, Michael Chan wrote:
> > Rick Jones wrote:
> > 
> >> On 10/14/2011 08:53 AM, Michael Chan wrote:
> >>> Rick Jones wrote:
> >>>
> >>>> Is this perhaps saying that a bnx2x-driven device being used for
> >>>> FCoE or iSCSI boot must not permit *any* run-time configuration
> >>>> change which leads to a NIC reset?
> >>>>
> >>>
> >>> That is right.  Unless you have a multipath configuration with
> >> multiple
> >>> ports, then you can reset one port at a time.
> >>
> >> So, should there also be a "cnic_boot_device" check in many of the
> >> "capital letter" ethtool paths?
> >>
> > 
> > If the user is doing ethtool configuration changes or device shutdown,
> > it is more obvious what the consequence will be.  The user may also be
> > careful to do it on a multipath setup.
> > 
> > The reset caused by the auto turn-off of LRO when you enable
> > ip_forward or bridging will not be obvious to the user.  In addition,
> > all devices with LRO turned on will be reset at the same time so even
> > multipath will not survive.
> >
> 
> But after the reset the device should login and SCSI layer should
> handle retries. So I don't see why this is a problem. Why do we
> need to handle this any different from any other link events?
> 

During a link down event, the iSCSI state does not get reset.  When link
comes back up quickly enough, there should be just some retransmissions
and everything should recover.  The root file system won't tolerate a
chip reset that will reset the iSCSI state.

^ permalink raw reply

* Re: [PATCH RFC 0/2] Add extended pause query capability
From: Matt Carlson @ 2011-10-14 20:55 UTC (permalink / raw)
  To: Matt Carlson
  Cc: davem@davemloft.net, netdev@vger.kernel.org,
	bhutchings@solarflare.com
In-Reply-To: <1318625642-9668-1-git-send-email-mcarlson@broadcom.com>

On Fri, Oct 14, 2011 at 01:54:00PM -0700, Matt Carlson wrote:
> The current implementation of get_pauseparam allows userspace to query the
> flow control configuration, but not the flow control status.  This patchset
> defines a new ethtool_pauseparamext structure and adds a new
> get_pauseparamext ethtool callback to support it.  The new facilities allow
> the driver to report both config and status in the same query.
> 
> Please note that Ben Hutchings' suggestion to deduce the flow control settings
> through the 'advertising' and 'lp_advertising' from ETHTOOL_GSET was considered,
> but rejected because there was no way to know if the flow control
> advertisements reported were valid.

Please add the following to the patches.  Sorry Michael.

Signed-off-by: Michael Chan <mchan@broadcom.com>

^ permalink raw reply

* [PATCH RFC 2/2] tg3: Convert to get_pauseparamext
From: Matt Carlson @ 2011-10-14 20:54 UTC (permalink / raw)
  To: davem; +Cc: netdev, mcarlson, bhutchings

This patch converts the tg3 driver to the get_pauseparamext ethtool
command.

Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
---
 drivers/net/ethernet/broadcom/tg3.c |   28 ++++++++++++++++++++--------
 1 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index fe712f9..a7b7ddd 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -10576,24 +10576,36 @@ static int tg3_set_ringparam(struct net_device *dev, struct ethtool_ringparam *e
 	return err;
 }
 
-static void tg3_get_pauseparam(struct net_device *dev, struct ethtool_pauseparam *epause)
+static void tg3_get_pauseparamext(struct net_device *dev,
+				  struct ethtool_pauseparamext *epause)
 {
 	struct tg3 *tp = netdev_priv(dev);
 
-	epause->autoneg = !!tg3_flag(tp, PAUSE_AUTONEG);
+	epause->cfg.autoneg = !!tg3_flag(tp, PAUSE_AUTONEG);
+
+	if (tp->link_config.flowctrl & FLOW_CTRL_RX)
+		epause->cfg.rx_pause = 1;
+	else
+		epause->cfg.rx_pause = 0;
+
+	if (tp->link_config.flowctrl & FLOW_CTRL_TX)
+		epause->cfg.tx_pause = 1;
+	else
+		epause->cfg.tx_pause = 0;
 
 	if (tp->link_config.active_flowctrl & FLOW_CTRL_RX)
-		epause->rx_pause = 1;
+		epause->rx_pause_status = 1;
 	else
-		epause->rx_pause = 0;
+		epause->rx_pause_status = 0;
 
 	if (tp->link_config.active_flowctrl & FLOW_CTRL_TX)
-		epause->tx_pause = 1;
+		epause->tx_pause_status = 1;
 	else
-		epause->tx_pause = 0;
+		epause->tx_pause_status = 0;
 }
 
-static int tg3_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam *epause)
+static int tg3_set_pauseparam(struct net_device *dev,
+			      struct ethtool_pauseparam *epause)
 {
 	struct tg3 *tp = netdev_priv(dev);
 	int err = 0;
@@ -11926,7 +11938,7 @@ static const struct ethtool_ops tg3_ethtool_ops = {
 	.set_eeprom		= tg3_set_eeprom,
 	.get_ringparam		= tg3_get_ringparam,
 	.set_ringparam		= tg3_set_ringparam,
-	.get_pauseparam		= tg3_get_pauseparam,
+	.get_pauseparamext	= tg3_get_pauseparamext,
 	.set_pauseparam		= tg3_set_pauseparam,
 	.self_test		= tg3_self_test,
 	.get_strings		= tg3_get_strings,
-- 
1.7.3.4

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox