Netdev List
 help / color / mirror / Atom feed
* [PATCH 3/8] af_unix: add setsockopt on unix sockets
From: Alban Crequy @ 2011-01-21 14:39 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Lennart Poettering, netdev,
	linux-doc, linux-
  Cc: Alban Crequy
In-Reply-To: <20110121143751.57b1453d@chocolatine.cbg.collabora.co.uk>

unix_setsockopt() is called only on SOCK_DGRAM and SOCK_SEQPACKET unix sockets

Signed-off-by: Alban Crequy <alban.crequy@collabora.co.uk>
Reviewed-by: Ian Molton <ian.molton@collabora.co.uk>
---
 net/unix/af_unix.c |   13 +++++++++++--
 1 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index d8d98d5..7ea85de 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -512,6 +512,8 @@ static unsigned int unix_dgram_poll(struct file *, struct socket *,
 				    poll_table *);
 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 static int unix_shutdown(struct socket *, int);
+static int unix_setsockopt(struct socket *, int, int,
+			   char __user *, unsigned int);
 static int unix_stream_sendmsg(struct kiocb *, struct socket *,
 			       struct msghdr *, size_t);
 static int unix_stream_recvmsg(struct kiocb *, struct socket *,
@@ -559,7 +561,7 @@ static const struct proto_ops unix_dgram_ops = {
 	.ioctl =	unix_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	unix_shutdown,
-	.setsockopt =	sock_no_setsockopt,
+	.setsockopt =	unix_setsockopt,
 	.getsockopt =	sock_no_getsockopt,
 	.sendmsg =	unix_dgram_sendmsg,
 	.recvmsg =	unix_dgram_recvmsg,
@@ -580,7 +582,7 @@ static const struct proto_ops unix_seqpacket_ops = {
 	.ioctl =	unix_ioctl,
 	.listen =	unix_listen,
 	.shutdown =	unix_shutdown,
-	.setsockopt =	sock_no_setsockopt,
+	.setsockopt =	unix_setsockopt,
 	.getsockopt =	sock_no_getsockopt,
 	.sendmsg =	unix_seqpacket_sendmsg,
 	.recvmsg =	unix_dgram_recvmsg,
@@ -1561,6 +1563,13 @@ out:
 }
 
 
+static int unix_setsockopt(struct socket *sock, int level, int optname,
+			   char __user *optval, unsigned int optlen)
+{
+	return -EOPNOTSUPP;
+}
+
+
 static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 			       struct msghdr *msg, size_t len)
 {
-- 
1.7.2.3

^ permalink raw reply related

* [PATCH 1/8] af_unix: Documentation on multicast unix sockets
From: Alban Crequy @ 2011-01-21 14:39 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Lennart Poettering, netdev,
	linux-doc, linux-
  Cc: Alban Crequy
In-Reply-To: <20110121143751.57b1453d@chocolatine.cbg.collabora.co.uk>

Signed-off-by: Alban Crequy <alban.crequy@collabora.co.uk>
Reviewed-by: Ian Molton <ian.molton@collabora.co.uk>
---
 .../networking/multicast-unix-sockets.txt          |  171 ++++++++++++++++++++
 1 files changed, 171 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/networking/multicast-unix-sockets.txt

diff --git a/Documentation/networking/multicast-unix-sockets.txt b/Documentation/networking/multicast-unix-sockets.txt
new file mode 100644
index 0000000..0cc30cb
--- /dev/null
+++ b/Documentation/networking/multicast-unix-sockets.txt
@@ -0,0 +1,171 @@
+Multicast Unix sockets
+======================
+
+Multicast is implemented on SOCK_DGRAM and SOCK_SEQPACKET Unix sockets.
+
+A userspace application can create a multicast group with:
+
+  struct unix_mreq mreq = {0,};
+  mreq.address.sun_family = AF_UNIX;
+  mreq.address.sun_path[0] = '\0';
+  strcpy(mreq.address.sun_path + 1, "socket-address");
+
+  sockfd = socket(AF_UNIX, SOCK_DGRAM, 0);
+  ret = setsockopt(sockfd, SOL_UNIX, UNIX_CREATE_GROUP, &mreq, sizeof(mreq));
+
+This allocates a struct unix_mcast_group, which is reference counted and exists
+as long as the socket that created it exists or the group has at least one
+member.
+
+Then a multicast group can be joined with:
+
+  ret = setsockopt(sockfd, SOL_UNIX, UNIX_JOIN_GROUP, &mreq, sizeof(mreq));
+
+This allocates a struct unix_mcast, which holds the settings of the membership,
+mainly whether loopback is enabled. A socket can be a member of several
+multicast groups.
+
+The socket is part of the multicast group until it is released, shut down with
+RCV_SHUTDOWN or it explicitly leaves the group:
+
+  ret = setsockopt(sockfd, SOL_UNIX, UNIX_LEAVE_GROUP, &mreq, sizeof(mreq));
+
+Struct unix_mcast nodes are linked in two RCU lists:
+- (struct unix_sock)->mcast_subscriptions
+- (struct unix_mcast_group)->mcast_members
+
+              unix_mcast_group  unix_mcast_group
+                      |                 |
+                      v                 v
+unix_sock  ---->  unix_mcast  ----> unix_mcast
+                      |
+                      v
+unix_sock  ---->  unix_mcast
+                      |
+                      v
+unix_sock  ---->  unix_mcast
+
+
+SOCK_DGRAM semantics
+====================
+
+          G          The socket which created the group
+       /  |  \
+     P1  P2  P3      The member sockets
+
+Messages sent to the group are received by all members except the sender itself
+unless the sending socket has UNIX_MREQ_LOOPBACK set.
+
+Non-members can also send to the group socket G and the message will be
+broadcast to the group members; however, socket G itself does not receive
+messages sent to the group through it.
+
+
+SOCK_SEQPACKET semantics
+========================
+
+When a connection is performed on a SOCK_SEQPACKET multicast socket, a new
+socket is created and its file descriptor is received by accept().
+
+          L          The listening socket
+       /  |  \
+     A1  A2  A3      The accepted sockets
+      |   |   |
+     C1  C2  C3      The connected sockets
+
+Messages sent on the C1 socket are received by:
+- C1 itself if UNIX_MREQ_LOOPBACK is set.
+- The peer socket A1 if UNIX_MREQ_SEND_TO_PEER is set.
+- The other members of the multicast group C2 and C3.
+
+Only members can send to the group in this case.
+
+
+Atomic delivery and ordering
+============================
+
+Each message sent is delivered atomically to either none of the recipients or
+all the recipients, even with interruptions and errors.
+
+Locking is used in order to keep the ordering consistent on all recipients. We
+want to avoid the following scenario. Two emitters A and B, and 2 recipients, C
+and D:
+
+           C    D
+A -------->|    |    Step 1: A's message is delivered to C
+B -------->|    |    Step 2: B's message is delivered to C
+B ---------|--->|    Step 3: B's message is delivered to D
+A ---------|--->|    Step 4: A's message is delivered to D
+
+Result: - C received (A, B)
+        - D received (B, A)
+
+Although A and B had a list of recipients (C, D) in the same order, C and D
+received the messages in a different order. To avoid this scenario, we need a
+locking mechanism while the messages are being delivered with skb_queue_tail().
+
+Solution 1:
+The easiest implementation would be to use a global spinlock on the group, but
+it creates an avoidable contention, especially when there are two independent
+streams set up with socket filters; e.g. if A sends messages received only by
+C, and B sends messages received only by D.
+
+Solution 2:
+Fine-grained locking could be implemented with a spinlock on each recipient.
+Before delivering the message to the recipients, the sender takes a spinlock on
+each recipient at the same time.
+
+Taking several spinlocks on the same struct can be dangerous and leads to
+deadlocks. This is prevented by sorting the list of sockets by memory address
+and taking the spinlocks in that order. The ordered list of recipients is
+computed on demand when a message is sent and the list is cached for
+performance. When the group membership changes, the generation of the
+membership is incremented and the ordered recipient list is invalidated.
+
+With this solution, the number of spinlocks taken simultaneously can be
+arbitrarily large. Whilst it works, it breaks the lockdep mechanism.
+
+Solution 3:
+The current implementation is similar to solution 2 but with a limit on the
+number of spinlocks taken simultaneously (8), so lockdep works fine. A hash
+function and bit array with n=8 specifies which spinlocks to take.  Contention
+on independent streams can still happen but it is less likely.
+
+
+Flow control
+============
+
+When a socket's receiving queue is full, the default behavior is to block
+senders (or to return -EAGAIN on non-blocking sockets). The socket can also
+join a multicast group with the flag UNIX_MREQ_DROP_WHEN_FULL. In this case,
+messages sent to the group will not be delivered to that socket when its
+receiving queue is full.
+
+Messages are still delivered atomically to all members who don't have the flag
+UNIX_MREQ_DROP_WHEN_FULL. If send() returns -EAGAIN, nobody received the
+message. If send() blocks because of one member, the other members don't
+receive the message until all sockets (except those with
+UNIX_MREQ_DROP_WHEN_FULL set) can receive at the same time.
+
+poll/epoll/select on POLLOUT events have a consistent behavior; they block if
+at least one member of the multicast group without UNIX_MREQ_DROP_WHEN_FULL has
+a full receiving queue.
+
+
+Multicast socket reference counting
+===================================
+
+A poller for POLLOUT events can block for any member of the group. The poller
+can use the wait queue "peer_wait" of any member. So it is important that Unix
+sockets are not released before all pollers exit. This is achieved by:
+
+- Incrementing the reference counter of a socket when it joins a multicast
+  group.
+- Decrementing it when the group is destroyed, that is when all
+  sockets keeping a reference on the group released their reference on the
+  group.
+
+struct unix_mcast_group keeps track of both current members and previous
+members. When a socket leaves a group, it is removed from the members list and
+put in the dead members list. This is done in order to take advantage of RCU
+lists, which reduces lock contention.
-- 
1.7.2.3

^ permalink raw reply related

* Re: 2.6.38-rc1: arp triggers RTNL assertion
From: Richard Cochran @ 2011-01-21 14:02 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jamie Heilman, linux-kernel, netdev
In-Reply-To: <1295593946.2613.52.camel@edumazet-laptop>

On Fri, Jan 21, 2011 at 08:12:26AM +0100, Eric Dumazet wrote:
> Thanks for the report, I am looking at this right now.

FYI, I had this too. Happens every time I use my UMTS modem.

Kernel: private branch from e744070fd4ff9d3114277e52d77afa21579adce2


Jan 13 05:07:23 riccoc20 pppd[12961]: Serial connection established.
Jan 13 05:07:23 riccoc20 pppd[12961]: Using interface ppp0
Jan 13 05:07:23 riccoc20 pppd[12961]: Connect: ppp0 <--> /dev/ttyACM0
Jan 13 05:07:24 riccoc20 pppd[12961]: PAP authentication succeeded
Jan 13 05:07:25 riccoc20 pppd[12961]: replacing old default route to eth0 [10.0.0.1]
Jan 13 05:07:25 riccoc20 pppd[12961]: found interface eth0 for proxy arp
Jan 13 05:07:25 riccoc20 pppd[12961]: local  IP address 46.124.43.179
Jan 13 05:07:25 riccoc20 pppd[12961]: remote IP address 10.6.6.6
Jan 13 05:07:25 riccoc20 pppd[12961]: primary   DNS address 213.162.69.169
Jan 13 05:07:25 riccoc20 pppd[12961]: secondary DNS address 213.162.65.1
Jan 13 05:07:25 riccoc20 kernel: [ 3151.088740] RTNL: assertion failed at /home/cochran/work/git/net-next-2.6/net/core/neighbour.c (589)
Jan 13 05:07:25 riccoc20 kernel: [ 3151.088746] Pid: 12961, comm: pppd Tainted: P            2.6.37+ #3
Jan 13 05:07:25 riccoc20 kernel: [ 3151.088748] Call Trace:
Jan 13 05:07:25 riccoc20 kernel: [ 3151.088757]  [<c04da48f>] ? pneigh_lookup+0x1af/0x1c0
Jan 13 05:07:25 riccoc20 kernel: [ 3151.088761]  [<c05254fe>] ? arp_req_set+0x18e/0x290
Jan 13 05:07:25 riccoc20 kernel: [ 3151.088764]  [<c04dbb62>] ? __rtnl_unlock+0x12/0x20
Jan 13 05:07:25 riccoc20 kernel: [ 3151.088767]  [<c04cd972>] ? netdev_run_todo+0x42/0x230
Jan 13 05:07:25 riccoc20 kernel: [ 3151.088771]  [<c0329821>] ? apparmor_capable+0x21/0x70
Jan 13 05:07:25 riccoc20 kernel: [ 3151.088774]  [<c04cf81d>] ? dev_get_by_name_rcu+0x8d/0xb0
Jan 13 05:07:25 riccoc20 kernel: [ 3151.088777]  [<c0525844>] ? arp_ioctl+0x244/0x260
Jan 13 05:07:25 riccoc20 kernel: [ 3151.088780]  [<c052a5e5>] ? inet_ioctl+0xa5/0xb0
Jan 13 05:07:25 riccoc20 kernel: [ 3151.088784]  [<c04bc6bd>] ? sock_ioctl+0x6d/0x290
Jan 13 05:07:25 riccoc20 kernel: [ 3151.088786]  [<c04bc650>] ? sock_ioctl+0x0/0x290
Jan 13 05:07:25 riccoc20 kernel: [ 3151.088790]  [<c02222cc>] ? do_vfs_ioctl+0x8c/0x620
Jan 13 05:07:25 riccoc20 kernel: [ 3151.088794]  [<c05ab22a>] ? do_page_fault+0x1ca/0x450
Jan 13 05:07:25 riccoc20 kernel: [ 3151.088796]  [<c04be7bb>] ? sys_send+0x3b/0x40
Jan 13 05:07:25 riccoc20 kernel: [ 3151.088798]  [<c04bf3b8>] ? sys_socketcall+0x1d8/0x2a0
Jan 13 05:07:25 riccoc20 kernel: [ 3151.088801]  [<c02228c7>] ? sys_ioctl+0x67/0x80
Jan 13 05:07:25 riccoc20 kernel: [ 3151.088804]  [<c05a80a4>] ? syscall_call+0x7/0xb
Jan 13 05:07:31 riccoc20 ntpdate[13030]: no server suitable for synchronization found

^ permalink raw reply

* Re: [PATCH] bonding: added 802.3ad round-robin hashing policy for single TCP session balancing
From: Oleg V. Ukhno @ 2011-01-21 13:55 UTC (permalink / raw)
  To: Nicolas de Pesloüan
  Cc: Jay Vosburgh, John Fastabend, netdev@vger.kernel.org
In-Reply-To: <4D3745AF.5040808@gmail.com>

On 01/19/2011 11:12 PM, Nicolas de Pesloüan wrote:

> If you have time for that, then yes, please, do the same test using
> balance-rr+vlan to segregate path. With those results, we whould have
> the opportunity to enhance the documentation with some well tested cases
> of TCP load balancing on a LAN, not limited to 802.3ad automatic setup.
> Both setups make sense, and assuming the results would be similar is
> probably true, but not reliable enough to assert it into the documentation.
>
> Thanks,
>
> Nicolas.
>
Nicolas,
I've run similar tests for the VLAN tunneling scenario. Results are
identical, as I expected. The only significant difference is link failure
handling: 802.3ad mode allows almost painless load redistribution, while
balance-rr causes packet loss.
The only question for me now is whether my patch could be applied upstream;
fixing issues with adaptation to net-next code isn't a problem, if nobody
objects.
There were 2 tests:
1) unidirectional test
2) bidirectional test
Below are results:

Iperf results:
test 1:
  iperf -f m -c 192.168.111.128 -B 192.168.111.129 -p 9999 -t 300
------------------------------------------------------------
Client connecting to 192.168.111.128, TCP port 9999
Binding to local address 192.168.111.129
TCP window size: 32.0 MByte (default)
------------------------------------------------------------
[  3] local 192.168.111.129 port 9999 connected with 192.168.111.128 
port 9999
[ ID] Interval       Transfer     Bandwidth
[  3]  0.0-300.0 sec  141637 MBytes  3960 Mbits/sec

test 2:
iperf -f m -c 192.168.111.128 -B 192.168.111.129 -p 9999 -t 300 
--dualtest -P 4
------------------------------------------------------------
Server listening on TCP port 9999
Binding to local address 192.168.111.129
TCP window size: 32.0 MByte (default)
------------------------------------------------------------
...
[SUM]  0.0-300.2 sec  111334 MBytes  3111 Mbits/sec
[SUM]  0.0-300.4 sec  109582 MBytes  3060 Mbits/sec

TCP stats:
receiver side, before test 1:
[root@target1 ~]# netstat -st
IcmpMsg:
     InType0: 4
     InType3: 6
     InType8: 2
     OutType0: 2
     OutType3: 6
     OutType8: 4
Tcp:
     4 active connections openings
     2 passive connection openings
     3 failed connection attempts
     0 connection resets received
     3 connections established
     10252 segments received
     29766 segments send out
     2 segments retransmited
     0 bad segments received.
     0 resets sent
UdpLite:
TcpExt:
     3 delayed acks sent
     613 packets directly queued to recvmsg prequeue.
     16 packets directly received from backlog
     1760 packets directly received from prequeue
     428 packets header predicted
     10 packets header predicted and directly queued to user
     9295 acknowledgments not containing data received
     265 predicted acknowledgments
     0 TCP data loss events
     1 other TCP timeouts
     TCPSackMerged: 1
     TCPSackShiftFallback: 1
IpExt:
     InMcastPkts: 92
     OutMcastPkts: 64
     InBcastPkts: 2
     InOctets: 1089217
     OutOctets: 265005791
     InMcastOctets: 16294
     OutMcastOctets: 10364
     InBcastOctets: 483


receiver side , after test 1:
[root@target1 ~]netstat -st
IcmpMsg:
     InType0: 17
     InType3: 6
     InType8: 9
     OutType0: 9
     OutType3: 6
     OutType8: 19
Tcp:
     84 active connections openings
     14 passive connection openings
     6 failed connection attempts
     4 connection resets received
     4 connections established
     16684784 segments received
     16704650 segments send out
     22 segments retransmited
     0 bad segments received.
     6 resets sent
UdpLite:
TcpExt:
     39 TCP sockets finished time wait in slow timer
     23 delayed acks sent
     83 delayed acks further delayed because of locked socket
     Quick ack mode was activated 225 times
     1019 packets directly queued to recvmsg prequeue.
     3235352384 packets directly received from backlog
     483600 packets directly received from prequeue
     86065 packets header predicted
     4855 packets header predicted and directly queued to user
     10369 acknowledgments not containing data received
     928 predicted acknowledgments
     0 TCP data loss events
     2 retransmits in slow start
     6 other TCP timeouts
     225 DSACKs sent for old packets
     1 connections reset due to unexpected data
     TCPSackMerged: 1
     TCPSackShiftFallback: 3
IpExt:
     InMcastPkts: 108
     OutMcastPkts: 72
     InBcastPkts: 4
     InOctets: -936746758
     OutOctets: 1556837236
     InMcastOctets: 16774
     OutMcastOctets: 10620
     InBcastOctets: 966

receiver side, after test 2
[root@target1 ~]netstat -st
IcmpMsg:
     InType0: 17
     InType3: 6
     InType8: 12
     OutType0: 12
     OutType3: 6
     OutType8: 19
Tcp:
     144 active connections openings
     25 passive connection openings
     29 failed connection attempts
     7 connection resets received
     4 connections established
     44349148 segments received
     44401154 segments send out
     58434 segments retransmited
     0 bad segments received.
     6 resets sent
UdpLite:
TcpExt:
     58 TCP sockets finished time wait in slow timer
     735072 packets rejects in established connections because of timestamp
     34 delayed acks sent
     359 delayed acks further delayed because of locked socket
     Quick ack mode was activated 14800 times
     2112 packets directly queued to recvmsg prequeue.
     3753925448 packets directly received from backlog
     4377976 packets directly received from prequeue
     847653 packets header predicted
     105696 packets header predicted and directly queued to user
     8804473 acknowledgments not containing data received
     154775 predicted acknowledgments
     10465 times recovered from packet loss due to SACK data
     Detected reordering 1 times using FACK
     Detected reordering 11185 times using SACK
     Detected reordering 182 times using time stamp
     2116 congestion windows fully recovered
     18951 congestion windows partially recovered using Hoe heuristic
     TCPDSACKUndo: 58
     8 congestion windows recovered after partial ack
     0 TCP data loss events
     53 timeouts after SACK recovery
     1 timeouts in loss state
     57287 fast retransmits
     12 forward retransmits
     793 retransmits in slow start
     10 other TCP timeouts
     263 sack retransmits failed
     14800 DSACKs sent for old packets
     31 DSACKs sent for out of order packets
     14289 DSACKs received
     43 DSACKs for out of order packets received
     1 connections reset due to unexpected data
     TCPDSACKIgnoredOld: 8615
     TCPDSACKIgnoredNoUndo: 5683
     TCPSackMerged: 1
     TCPSackShiftFallback: 15015212
IpExt:
     InMcastPkts: 116
     OutMcastPkts: 76
     InBcastPkts: 4
     InOctets: 1012355682
     OutOctets: -1540562156
     InMcastOctets: 17014
     OutMcastOctets: 10748
     InBcastOctets: 966


sender side, before test 1:
[root@target2 ~]# netstat -st
IcmpMsg:
     InType3: 4
     InType8: 32
     OutType0: 32
     OutType3: 4
Tcp:
     1 active connections openings
     2 passive connection openings
     0 failed connection attempts
     0 connection resets received
     3 connections established
     30268 segments received
     10217 segments send out
     0 segments retransmited
     0 bad segments received.
     3 resets sent
UdpLite:
TcpExt:
     7 delayed acks sent
     6332 packets directly queued to recvmsg prequeue.
     8 packets directly received from backlog
     46104 packets directly received from prequeue
     27935 packets header predicted
     11 packets header predicted and directly queued to user
     455 acknowledgments not containing data received
     119 predicted acknowledgments
     0 TCP data loss events
     TCPSackShiftFallback: 1
IpExt:
     InMcastPkts: 87
     OutMcastPkts: 54
     InBcastPkts: 2
     InOctets: 265039007
     OutOctets: 1083024
     InMcastOctets: 16444
     OutMcastOctets: 9893
     InBcastOctets: 483

sender side , after test 1:
[root@target2 ~]# netstat -st
IcmpMsg:
     InType3: 4
     InType8: 53
     OutType0: 53
     OutType3: 4
Tcp:
     69 active connections openings
     12 passive connection openings
     2 failed connection attempts
     4 connection resets received
     4 connections established
     16704819 segments received
     16684841 segments send out
     401 segments retransmited
     0 bad segments received.
     10 resets sent
UdpLite:
TcpExt:
     31 TCP sockets finished time wait in slow timer
     25 delayed acks sent
     6515 packets directly queued to recvmsg prequeue.
     24 packets directly received from backlog
     46988 packets directly received from prequeue
     27974 packets header predicted
     115 packets header predicted and directly queued to user
     10259331 acknowledgments not containing data received
     12483 predicted acknowledgments
     166 times recovered from packet loss due to SACK data
     Detected reordering 1 times using FACK
     Detected reordering 7 times using SACK
     Detected reordering 1 times using time stamp
     1 congestion windows fully recovered
     41 congestion windows partially recovered using Hoe heuristic
     0 TCP data loss events
     386 fast retransmits
     5 forward retransmits
     3 other TCP timeouts
     1 times receiver scheduled too late for direct processing
     225 DSACKs received
     1 connections reset due to unexpected data
     TCPDSACKIgnoredOld: 167
     TCPDSACKIgnoredNoUndo: 58
     TCPSackShiftFallback: 30925668
IpExt:
     InMcastPkts: 103
     OutMcastPkts: 62
     InBcastPkts: 4
     InOctets: 1556368288
     OutOctets: -934790015
     InMcastOctets: 16924
     OutMcastOctets: 10149
     InBcastOctets: 966

sender side, after test 2:
[root@target2 ~]# netstat -st
IcmpMsg:
     InType3: 4
     InType8: 56
     OutType0: 56
     OutType3: 4
Tcp:
     117 active connections openings
     25 passive connection openings
     2 failed connection attempts
     7 connection resets received
     4 connections established
     44383169 segments received
     44367187 segments send out
     59660 segments retransmited
     0 bad segments received.
     34 resets sent
UdpLite:
TcpExt:
     2 TCP sockets finished time wait in fast timer
     57 TCP sockets finished time wait in slow timer
     717082 packets rejects in established connections because of timestamp
     46 delayed acks sent
     202 delayed acks further delayed because of locked socket
     Quick ack mode was activated 14356 times
     7432 packets directly queued to recvmsg prequeue.
     135038632 packets directly received from backlog
     3633432 packets directly received from prequeue
     783534 packets header predicted
     94671 packets header predicted and directly queued to user
     20034470 acknowledgments not containing data received
     177885 predicted acknowledgments
     10851 times recovered from packet loss due to SACK data
     Detected reordering 6 times using FACK
     Detected reordering 9217 times using SACK
     Detected reordering 111 times using time stamp
     2125 congestion windows fully recovered
     19325 congestion windows partially recovered using Hoe heuristic
     TCPDSACKUndo: 71
     7 congestion windows recovered after partial ack
     0 TCP data loss events
     52 timeouts after SACK recovery
     58562 fast retransmits
     67 forward retransmits
     736 retransmits in slow start
     8 other TCP timeouts
     226 sack retransmits failed
     1 times receiver scheduled too late for direct processing
     14356 DSACKs sent for old packets
     44 DSACKs sent for out of order packets
     14679 DSACKs received
     31 DSACKs for out of order packets received
     1 connections reset due to unexpected data
     TCPDSACKIgnoredOld: 8899
     TCPDSACKIgnoredNoUndo: 5791
     TCPSackShiftFallback: 47227517
IpExt:
     InMcastPkts: 109
     OutMcastPkts: 65
     InBcastPkts: 4
     InOctets: -1885181292
     OutOctets: 1366995261
     InMcastOctets: 17104
     OutMcastOctets: 10245
     InBcastOctets: 966

-- 
Best regards,
Oleg Ukhno,
ITO Team lead
Yandex LLC.


^ permalink raw reply

* Re: Using ethernet device as efficient small packet generator
From: Ben Greear @ 2011-01-21 13:38 UTC (permalink / raw)
  To: juice; +Cc: Eric Dumazet, Loke, Chetan, Jon Zhou, Stephen Hemminger, netdev
In-Reply-To: <37bfb9ca79c2325ec4b70033f509200a.squirrel@www.liukuma.net>

On 01/21/2011 04:12 AM, juice wrote:
>> Le vendredi 21 janvier 2011 à 13:44 +0200, juice a écrit :
>>
>>
>> You should try
>>
>> CLONE_SKB="clone_skb 10"
>> ...
>> pgset "$CLONE_SKB"
>>
>>
>> Because I suspect you hit a performance problem on skb
>> allocation/filling/use/freeing
>
> Actually, that makes the performance worse:
> (Now I tried it with kernel 2.6.37, which is currently running)

Maybe try clone-skb of 1000 or so.  It zeroes its memory when
allocating a packet, which can be quite expensive.

Also note that the Ethernet inter-frame gap isn't accounted for in
the BPS, but it is a significant amount of the total bandwidth
when using 64-byte packets.
You are pushing a bit more than half of the theoretical limit of
around 1,400,000 64-byte packets per second for 1Gbps ethernet.

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply

* Re: MultiPath TCP in the Linux Kernel
From: Christoph Paasch @ 2011-01-21 13:26 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Hagen Paul Pfeifer, netdev, bonding-devel, linux-sctp
In-Reply-To: <1295538176.2825.311.camel@edumazet-laptop>


On Thursday, January 20, 2011 wrote Eric Dumazet:
> > if you want that your work becomes part of the official network stack you
> > should align your effort on the official Linux way. This means you should
> > split your work and publish patches on this maillinglist.
> 
> Hmm, they probably know that, and prefer to wait MTCP stuff is mature
> before patch submission :)
Exactly... :)

The protocol specification of MPTCP is not stable yet 
(http://datatracker.ietf.org/doc/draft-ietf-mptcp-multiaddressed/).

Also, all the features are not yet in our implementation (e.g., IPv6-support 
and dual IPv4/IPv6-support), and it is necessary to make our implementation 
less intrusive to the regular TCP/IP-stack.

However, as developer resources are limited this will still take quite some 
time... ;)

Cheers,
Christoph


--
Christoph Paasch
PhD Student

IP Networking Lab --- http://inl.info.ucl.ac.be
MultiPath TCP in the Linux Kernel --- http://inl.info.ucl.ac.be/mptcp
Université Catholique de Louvain

www.rollerbulls.be
--

^ permalink raw reply

* Re: MultiPath TCP in the Linux Kernel
From: Christoph Paasch @ 2011-01-21 13:19 UTC (permalink / raw)
  To: Peter Chacko
  Cc: netdev, bonding-devel, linux-sctp, MS PRASAD, Lal Samuel Varghese,
	Gregory S. Tseytin
In-Reply-To: <AANLkTi=v7jJzfRoeQLGzwU29p2dVqJqd=Eap_8ntfxKP@mail.gmail.com>

Hi Peter,

On Thursday, January 20, 2011 wrote Peter Chacko:
> Is there any document that shows the difference of SCTP multi-streaming and
> MPTCP ? SCTP team has done lot of great workin in load-balanced
> multi-streaming work across multi-hopped hosts. I am curious to learn more
> on your work in the context of this.
MPTCP can be compared to SCTP-CMT. Regular SCTP just uses one path for a 
stream and thus does not increase the throughput.
At http://inl.info.ucl.ac.be/mptcp/ is a master  thesis from a student, where 
he compared plenty of different multi-path transport solutions. Have a look at 
the document "multi-path congestion control" from Arnaud Ongenae on this 
website.

> Does MP-TCP make use of CM ? (Congestion Manager, created out of MIT) for
> sharing congestion states across TCP ensembles ?
No, MPTCP uses the coupled congestion control. However, this congestion-
manager might be an interesting extension to mptcp.

> How does it work with a firewall that will allow only a session that is the
> reply of a SYN packet from inside firewall ? (Because other parallel
> streams of TCP doesn't send SYN to the same destination, and how will the
> firewall will allow the replies of this other session go through ? )
You mean the case, where a server has 2 interfaces, and a client (behind the 
firewall) establishes a MPTCP-session to the server?
The server will tell the client about its available addresses.
Additionally, the server will itself try to establish the
additional subflow. This attempt will get blocked by the firewall.
As the server sent its set of addresses to the client, the client will also
attempt to establish a new subflow. This attempt will pass the firewall as it
comes from the inside.

> Is there any commerical use  of this work ?
Our implementation is still in alpha-state. There are still missing features, 
and it's not yet 100% stable. It's a major modification of the TCP/IP-stack 
because packets from the subflows need to get pushed up to the 'meta-
connection' and reordered there.

> Will this also support message level TCP packet exchange ? (So that  main
> strengths of SCTP are "in" to the TCP stack ).
What do you mean with "message level TCP packet exchange" ?

Best regards,
Christoph


> 2011/1/20 Christoph Paasch <christoph.paasch@uclouvain.be>
> 
> > Hi,
> > 
> > MultiPath TCP is not a port of SCTP. It is based on regular TCP and
> > presents a
> > regular socket-api to the application. Thus applications do not have to
> > be modified.
> > 
> > MPTCP opens several TCP-subflows across it's different IP-addresses, and
> > lets
> > the data go over these different TCP sessions. To synchronize the data-
> > transfer MPTCP uses TCP-options. Thus, on the wire it looks like regular
> > TCP,
> > with the only difference being that there are additional TCP-options.
> > 
> > MPTCP increases the throughput, because it uses the TCP-subflows
> > simultaneously. With our implementation we got 2Gbps throughput for a
> > single
> > iperf-session on a machine having two 1Gb-interfaces (using
> > jumbo-frames), whereas regular TCP could only go up to 1Gbps, as it only
> > uses one interface.
> > 
> > To maintain bottleneck-fairness the Coupled Congestion Control controls
> > the congestion window of the individual subflows (included in the
> > implementation
> > since the latest release).
> > http://datatracker.ietf.org/doc/draft-ietf-mptcp-congestion/
> > 
> > 
> > Cheers,
> > Christoph
> > 
> > P.S.: We have a public webserver running MPTCP at
> > http://mptcp.info.ucl.ac.be
> > So you can directly try out the power of MPTCP... ;-)
> > 
> > On Thursday, January 20, 2011 wrote Peter Chacko:
> > > SCTP already provides that , and is TCP Multi-Path is going to be a
> > > port
> > 
> > of
> > 
> > > it or any other difference ?
> > > 
> > > We are looking to use SCTP for this feature, but as we found it it has
> > 
> > not
> > 
> > > kicked off , because of its firewall issues, we are trying add
> > > Multi-Pathing at application layer, sharing all the congestion
> > 
> > states(like
> > 
> > > CM idea) as we are building a WAN optimized storage replication module
> > > as part of our cloud storage gateway development.
> > > 
> > > Curious to see more info on this.
> > > 
> > > Thanks
> > > 
> > > 2011/1/20 Christoph Paasch <christoph.paasch@uclouvain.be>
> > > 
> > > > Hi all,
> > > > 
> > > > The IETF is developing a new transport layer solution, MultiPath TCP,
> > > > which allows to efficiently exploit several Internet paths between a
> > > > pair of hosts,
> > > > while presenting a single TCP connection to the application layer.
> > > > 
> > > > At the UCLouvain in Belgium we are developing the support for
> > 
> > MultiPath
> > 
> > > > TCP
> > > > in the Linux Kernel. The implementation is a major extension to the
> > 
> > Linux
> > 
> > > > TCP-
> > > > stack.
> > > > 
> > > > For general information, access:
> > > > http://inl.info.ucl.ac.be/mptcp
> > > > https://scm.info.ucl.ac.be/trac/mptcp/
> > > > 
> > > > To access the git-repository:
> > > > git://scm.info.ucl.ac.be/mtcp.git
> > > > 
> > > > branches:
> > > >        mptcp_2.6.36 - based on Linux Kernel 2.6.36
> > > >        mtcp_no_subrcvqueue - based on Linux Kernel 2.6.28
> > > > 
> > > > For questions, feedback,... feel free to subscribe to the mptcp-dev
> > > > Mailing-
> > > > List:
> > > > https://listes-2.sipr.ucl.ac.be/sympa/info/mptcp-dev
> > > > 
> > > > 
> > > > Regards,
> > > > Christoph
> > > > 
> > > > --
> > > > Christoph Paasch
> > > > PhD Student
> > > > 
> > > > IP Networking Lab --- http://inl.info.ucl.ac.be
> > > > MultiPath TCP in the Linux Kernel --- http://inl.info.ucl.ac.be/mptcp
> > > > Université Catholique de Louvain
> > > > 
> > > > www.rollerbulls.be
> > > > --
> > > > --
> > > > To unsubscribe from this list: send the line "unsubscribe netdev" in
> > > > the body of a message to majordomo@vger.kernel.org
> > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > 
> > --
> > Christoph Paasch
> > PhD Student
> > 
> > IP Networking Lab --- http://inl.info.ucl.ac.be
> > MultiPath TCP in the Linux Kernel --- http://inl.info.ucl.ac.be/mptcp
> > Université Catholique de Louvain
> > 
> > www.rollerbulls.be
> > --

--
Christoph Paasch
PhD Student

IP Networking Lab --- http://inl.info.ucl.ac.be
MultiPath TCP in the Linux Kernel --- http://inl.info.ucl.ac.be/mptcp
Université Catholique de Louvain

www.rollerbulls.be
--

^ permalink raw reply

* Re: [PATCH v4] net: add Faraday FTMAC100 10/100 Ethernet driver
From: Michał Mirosław @ 2011-01-21 12:26 UTC (permalink / raw)
  To: Po-Yu Chuang
  Cc: netdev, linux-kernel, bhutchings, eric.dumazet, joe, dilinger,
	Po-Yu Chuang
In-Reply-To: <1295596533-1748-1-git-send-email-ratbert.chuang@gmail.com>

2011/1/21 Po-Yu Chuang <ratbert.chuang@gmail.com>:
> From: Po-Yu Chuang <ratbert@faraday-tech.com>
>
> FTMAC100 Ethernet Media Access Controller supports 10/100 Mbps and
> MII.  This driver has been working on some ARM/NDS32 SoC's including
> Faraday A320 and Andes AG101.
>
> Signed-off-by: Po-Yu Chuang <ratbert@faraday-tech.com>
[...]
> +static void ftmac100_txdes_reset(struct ftmac100_txdes *txdes)
> +{
> +       /* clear all except end of ring bit */
> +       txdes->txdes0 = 0;
> +       txdes->txdes1 &= FTMAC100_TXDES1_EDOTR;
> +       txdes->txdes2 = 0;
> +       txdes->txdes3 = 0;
> +}

This also probably needs cpu_to_le32().

[...]
> +static void ftmac100_free_buffers(struct ftmac100 *priv)
> +{
> +       int i;
> +
> +       for (i = 0; i < RX_QUEUE_ENTRIES; i += 2) {
> +               struct ftmac100_rxdes *rxdes = &priv->descs->rxdes[i];
> +               dma_addr_t d = ftmac100_rxdes_get_dma_addr(rxdes);
> +               void *page = ftmac100_rxdes_get_va(rxdes);
> +
> +               if (d)
> +                       dma_unmap_single(priv->dev, d, PAGE_SIZE,
> +                                        DMA_FROM_DEVICE);
> +
> +               if (page != NULL)
> +                       free_page((unsigned long)page);
> +       }
> +
[...]

> +static int ftmac100_alloc_buffers(struct ftmac100 *priv)
> +{
> +       int i;
> +
> +       priv->descs = dma_alloc_coherent(priv->dev,
> +                                        sizeof(struct ftmac100_descs),
> +                                        &priv->descs_dma_addr,
> +                                        GFP_KERNEL | GFP_DMA);
> +       if (priv->descs == NULL)
> +               return -ENOMEM;
> +
> +       memset(priv->descs, 0, sizeof(struct ftmac100_descs));
> +
> +       /* initialize RX ring */
> +
> +       ftmac100_rxdes_set_end_of_ring(&priv->descs->rxdes[RX_QUEUE_ENTRIES - 1]);
> +
> +       for (i = 0; i < RX_QUEUE_ENTRIES; i += 2) {
> +               struct ftmac100_rxdes *rxdes = &priv->descs->rxdes[i];
> +               void *page;
> +               dma_addr_t d;
> +
> +               page = (void *)__get_free_page(GFP_KERNEL | GFP_DMA);
> +               if (page == NULL)
> +                       goto err;
> +
> +               d = dma_map_single(priv->dev, page, PAGE_SIZE, DMA_FROM_DEVICE);
> +               if (unlikely(dma_mapping_error(priv->dev, d))) {
> +                       free_page((unsigned long)page);
> +                       goto err;
> +               }
> +
> +               /*
> +                * The hardware enforces a sub-2K maximum packet size, so we
> +                * put two buffers on every hardware page.
> +                */
> +               ftmac100_rxdes_set_va(rxdes, page);
> +               ftmac100_rxdes_set_va(rxdes + 1, page + PAGE_SIZE / 2);
> +
> +               ftmac100_rxdes_set_dma_addr(rxdes, d);
> +               ftmac100_rxdes_set_dma_addr(rxdes + 1, d + PAGE_SIZE / 2);
> +
> +               ftmac100_rxdes_set_buffer_size(rxdes, RX_BUF_SIZE);
> +               ftmac100_rxdes_set_buffer_size(rxdes + 1, RX_BUF_SIZE);
> +
> +               ftmac100_rxdes_set_dma_own(rxdes);
> +               ftmac100_rxdes_set_dma_own(rxdes + 1);
> +       }
[...]

Did you test this? This looks like it will result in double free after
packet RX, as you are giving the same page (referenced once) to two
distinct RX descriptors, that may be assigned different packets.

Since you're not implementing any RX offloads, you might just allocate
fresh skb's with alloc_skb() and store skb pointer in rxdes3. Since
hardware doesn't touch it, you can skip cpu_to_le32()/le32_to_cpu()
there (leave a comment, though).

Unless this needs to work for ISA devices, you should drop GFP_DMA
allocation flag.

Best Regards,
Michał Mirosław

^ permalink raw reply

* RE: Using ethernet device as efficient small packet generator
From: juice @ 2011-01-21 12:12 UTC (permalink / raw)
  To: Eric Dumazet, Loke, Chetan, Jon Zhou, Stephen Hemminger, netdev
In-Reply-To: <1295610709.2601.35.camel@edumazet-laptop>

> Le vendredi 21 janvier 2011 à 13:44 +0200, juice a écrit :
>
>
> You should try
>
> CLONE_SKB="clone_skb 10"
> ...
> pgset "$CLONE_SKB"
>
>
> Because I suspect you hit a performance problem on skb
> allocation/filling/use/freeing

Actually, that makes the performance worse:
(Now I tried it with kernel 2.6.37, which is currently running)

root@d8labralinux:/var/home/juice/pkt_test# cat /proc/net/pktgen/eth1
Params: count 10000000  min_pkt_size: 60  max_pkt_size: 60
     frags: 0  delay: 0  clone_skb: 10  ifname: eth1
     flows: 0 flowlen: 0
     queue_map_min: 0  queue_map_max: 0
     dst_min: 10.10.11.2  dst_max:
        src_min:   src_max:
     src_mac: 00:1b:21:7c:e5:b1 dst_mac: 00:04:23:08:91:dc
     udp_src_min: 9  udp_src_max: 9  udp_dst_min: 9  udp_dst_max: 9
     src_mac_count: 0  dst_mac_count: 0
     Flags:
Current:
     pkts-sofar: 10000000  errors: 0
     started: 2555660074us  stopped: 2569239323us idle: 3484us
     seq_num: 10000001  cur_dst_mac_offset: 0  cur_src_mac_offset: 0
     cur_saddr: 0x0  cur_daddr: 0x20b0a0a
     cur_udp_dst: 9  cur_udp_src: 9
     cur_queue_map: 0
     flows: 0
Result: OK: 13579248(c13575763+d3484) nsec, 10000000 (60byte,0frags)
  736417pps 353Mb/sec (353480160bps) errors: 0


> You can use perf tool to get some performance profile while your pktgen
> session is running
>
> # cd tools/perf
> # make
> ...
> # ./perf top
>

I can try that.
Where do I get the performance profiler tool?


Yours, Jussi Ohenoja



^ permalink raw reply

* RE: Using ethernet device as efficient small packet generator
From: Eric Dumazet @ 2011-01-21 11:51 UTC (permalink / raw)
  To: juice; +Cc: Loke, Chetan, Jon Zhou, Stephen Hemminger, netdev
In-Reply-To: <13dbf221c875a931d408784495884998.squirrel@www.liukuma.net>

Le vendredi 21 janvier 2011 à 13:44 +0200, juice a écrit :

> Hi again.
> 
> It has been a while since last time I got to be able to test this
> again, as there have been some other matters at hand.
> However, now I managed to rerun my tests in several different kernels.
> 
> I am using now a PCIe Intel e1000e card, that should be able to handle
> the needed traffic amount.
> 
> The statistics that I get are as follows:
> 
> kernel 2.6.32-27 (ubuntu 10.10 default)
>     pktgen:           750064pps 360Mb/sec (360030720bps)
>     AX4000 analyser:  Total bitrate:             383.879 MBits/s
>                       Bandwidth:                 38.39% GE
>                       Average packet intereval:  1.33 us
> 
> kernel 2.6.37 (latest stable from kernel.org)
>     pktgen:           786848pps 377Mb/sec (377687040bps)
>     AX4000 analyser:  Total bitrate:             402.904 MBits/s
>                       Bandwidth:                 40.29% GE
>                       Average packet intereval:  1.27 us
> 
> kernel 2.6.38-rc1 (latest from kernel.org)
>     pktgen:           795297pps 381Mb/sec (381742560bps)
>     AX4000 analyser:  Total bitrate:             407.117 MBits/s
>                       Bandwidth:                 40.72% GE
>                       Average packet intereval:  1.26 us
> 
> 

...

> pktgen:
> 
> Params: count 10000000  min_pkt_size: 60  max_pkt_size: 60
>      frags: 0  delay: 0  clone_skb: 1  ifname: eth1
>      flows: 0 flowlen: 0
>      queue_map_min: 0  queue_map_max: 0
>      dst_min: 10.10.11.2  dst_max:
>         src_min:   src_max:
>      src_mac: 00:1b:21:7c:e5:b1 dst_mac: 00:04:23:08:91:dc
>      udp_src_min: 9  udp_src_max: 9  udp_dst_min: 9  udp_dst_max: 9
>      src_mac_count: 0  dst_mac_count: 0
>      Flags:
> Current:
>      pkts-sofar: 10000000  errors: 0
>      started: 77203892067us  stopped: 77216465982us idle: 1325us
>      seq_num: 10000001  cur_dst_mac_offset: 0  cur_src_mac_offset: 0
>      cur_saddr: 0x0  cur_daddr: 0x20b0a0a
>      cur_udp_dst: 9  cur_udp_src: 9
>      cur_queue_map: 0
>      flows: 0
> Result: OK: 12573914(c12572589+d1325) nsec, 10000000 (60byte,0frags)
>   795297pps 381Mb/sec (381742560bps) errors: 0
> 
> 
> AX4000 analyser:
> 
>    Total bitrate:             407.117 MBits/s
>    Bandwidth:                 40.72% GE
>    Average packet intereval:  1.26 us
> 
> 

You should try

CLONE_SKB="clone_skb 10"
...
pgset "$CLONE_SKB"


Because I suspect you hit a performance problem on skb
allocation/filling/use/freeing

You can use perf tool to get some performance profile while your pktgen
session is running

# cd tools/perf
# make
...
# ./perf top




^ permalink raw reply

* Re: [PATCH] Ensure that we unshare skbs prior to calling pskb_may_pull in bonding driver
From: Neil Horman @ 2011-01-21 11:51 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, andy, fubar
In-Reply-To: <20110120.164723.73670910.davem@davemloft.net>

On Thu, Jan 20, 2011 at 04:47:23PM -0800, David Miller wrote:
> From: Neil Horman <nhorman@tuxdriver.com>
> Date: Thu, 20 Jan 2011 14:02:31 -0500
> 
> > Recently reported oops:
> 
> Applied, but please compose reasonable Subject lines with your patches,
> always begin the line with a subsystem tag followed by a colon.
> 
> This way we get
> 
> 	bonding: Foo bar baz
> 
> instead of
> 
> 	Foo bar baz in the bonding driver
> 
> Thanks.
> 
Yeah, my bad, I realized I screwed up the Subject the second I sent the email,
sorry about that.

Regards
Neil


^ permalink raw reply

* RE: Using ethernet device as efficient small packet generator
From: juice @ 2011-01-21 11:44 UTC (permalink / raw)
  To: Loke, Chetan, Jon Zhou, Eric Dumazet, Stephen Hemminger, netdev
In-Reply-To: <D3F292ADF945FB49B35E96C94C2061B90ECC4FAC@nsmail.netscout.com>

>> -----Original Message-----
>> From: netdev-owner@vger.kernel.org [mailto:netdev-
>> owner@vger.kernel.org] On Behalf Of Jon Zhou
>> Sent: December 23, 2010 3:58 AM
>> To: juice@swagman.org; Eric Dumazet; Stephen Hemminger;
>> netdev@vger.kernel.org
>> Subject: RE: Using ethernet device as efficient small packet generator
>>
>>
>> At another old kernel(2.6.16) with tg3 and bnx2 1G NIC,XEON E5450, I
>> only got 490K pps(it is about 300Mbps,30% GE), I think the reason is
>> multiqueue unsupported in this kernel.
>>
>> I will do a test with 1Gb nic on the new kernel later.
>>
>
>
> I can hit close to 1M pps(first time every time) w/ a 64-byte payload on
> my VirtualMachine(running 2.6.33) via vmxnet3 vNIC -
>
>
> [root@localhost ~]# cat /proc/net/pktgen/eth2
> Params: count 0  min_pkt_size: 60  max_pkt_size: 60
>      frags: 0  delay: 0  clone_skb: 0  ifname: eth2
>      flows: 0 flowlen: 0
>      queue_map_min: 0  queue_map_max: 0
>      dst_min: 192.168.222.2  dst_max:
>         src_min:   src_max:
>      src_mac: 00:50:56:b1:00:19 dst_mac: 00:50:56:c0:00:3e
>      udp_src_min: 9  udp_src_max: 9  udp_dst_min: 9  udp_dst_max: 9
>      src_mac_count: 0  dst_mac_count: 0
>      Flags:
> Current:
>      pkts-sofar: 59241012  errors: 0
>      started: 1898437021us  stopped: 1957709510us idle: 9168us
>      seq_num: 59241013  cur_dst_mac_offset: 0  cur_src_mac_offset: 0
>      cur_saddr: 0x0  cur_daddr: 0x2dea8c0
>      cur_udp_dst: 9  cur_udp_src: 9
>      cur_queue_map: 0
>      flows: 0
> Result: OK: 59272488(c59263320+d9168) nsec, 59241012 (60byte,0frags)
>   999468pps 479Mb/sec (479744640bps) errors: 0
>
>
>
> Chetan
>


Hi again.

It has been a while since last time I got to be able to test this
again, as there have been some other matters at hand.
However, now I managed to rerun my tests in several different kernels.

I am using now a PCIe Intel e1000e card, that should be able to handle
the needed traffic amount.

The statistics that I get are as follows:

kernel 2.6.32-27 (ubuntu 10.10 default)
    pktgen:           750064pps 360Mb/sec (360030720bps)
    AX4000 analyser:  Total bitrate:             383.879 MBits/s
                      Bandwidth:                 38.39% GE
                      Average packet intereval:  1.33 us

kernel 2.6.37 (latest stable from kernel.org)
    pktgen:           786848pps 377Mb/sec (377687040bps)
    AX4000 analyser:  Total bitrate:             402.904 MBits/s
                      Bandwidth:                 40.29% GE
                      Average packet intereval:  1.27 us

kernel 2.6.38-rc1 (latest from kernel.org)
    pktgen:           795297pps 381Mb/sec (381742560bps)
    AX4000 analyser:  Total bitrate:             407.117 MBits/s
                      Bandwidth:                 40.72% GE
                      Average packet intereval:  1.26 us


In every case I have set the IRQ affinity of eth1 to CPU0 and started
the test running in kpktgend_0.

The complete data of my measurements follows in the end of this post.

It looks like the small packet sending efficiency of the ethernet driver
is improving all the time, albeit quite slowly.

Now, I would be interested in knowing whether it is indeed possible to
increase the sending rate near full 1GE capacity with the current
ethernet card I am using or do I have a hardware limitation here?

I recall hearing that there are some enhanced versions of the e1000
network card, such that have been geared towards higher performance
at the expense of some functionality or general system efficiency.
Can anybody point me how to do that?

As I stated before, quoting myself:

> Which do you suppose is the reason for poor performance on my setup,
> is it lack of multiqueue HW in the GE NIC's I am using or is it lack
> of multiqueue support in the kernel (2.6.32) that I am using?
>
> Is multiqueue really necessary to achieve the full 1GE saturation, or
> is it only needed on 10GE NIC's?
>
> As I understand multiqueue is useful only if there are lots of CPU cores
> to run, each handling one queue.
>
> The application I am thinking of, preloading a packet sequence into
> kernel from userland application and then starting to send from buffer
> probably does not benefit so much from many cores, it would be enough
> that one CPU would handle the sending and other core(s) would handle
> other tasks.

Yours, Jussi Ohenoja


*** Measurement details follows ***


root@d8labralinux:/var/home/juice# lspci -vvv -s 04:00.0
04:00.0 Ethernet controller: Intel Corporation 82572EI Gigabit Ethernet
Controller (Copper) (rev 06)
	Subsystem: Intel Corporation Device 1082
	Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr-
Stepping- SERR+ FastB2B- DisINTx-
	Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- <TAbort-
<MAbort- >SERR- <PERR- INTx-
	Latency: 0, Cache Line Size: 64 bytes
	Interrupt: pin A routed to IRQ 11
	Region 0: Memory at f3cc0000 (32-bit, non-prefetchable) [size=128K]
	Region 1: Memory at f3ce0000 (32-bit, non-prefetchable) [size=128K]
	Region 2: I/O ports at cce0 [size=32]
	Expansion ROM at f3d00000 [disabled] [size=128K]
	Capabilities: [c8] Power Management version 2
		Flags: PMEClk- DSI+ D1- D2- AuxCurrent=0mA PME(D0+,D1-,D2-,D3hot+,D3cold+)
		Status: D0 PME-Enable- DSel=0 DScale=1 PME-
	Capabilities: [d0] Message Signalled Interrupts: Mask- 64bit+ Queue=0/0
Enable-
		Address: 0000000000000000  Data: 0000
	Capabilities: [e0] Express (v1) Endpoint, MSI 00
		DevCap:	MaxPayload 256 bytes, PhantFunc 0, Latency L0s <512ns, L1 <64us
			ExtTag- AttnBtn- AttnInd- PwrInd- RBE- FLReset-
		DevCtl:	Report errors: Correctable- Non-Fatal- Fatal- Unsupported-
			RlxdOrd+ ExtTag- PhantFunc- AuxPwr- NoSnoop+
			MaxPayload 128 bytes, MaxReadReq 512 bytes
		DevSta:	CorrErr- UncorrErr- FatalErr- UnsuppReq- AuxPwr+ TransPend-
		LnkCap:	Port #0, Speed 2.5GT/s, Width x1, ASPM L0s, Latency L0 <4us, L1
<64us
			ClockPM- Suprise- LLActRep- BwNot-
		LnkCtl:	ASPM Disabled; RCB 64 bytes Disabled- Retrain- CommClk-
			ExtSynch- ClockPM- AutWidDis- BWInt- AutBWInt-
		LnkSta:	Speed 2.5GT/s, Width x1, TrErr- Train- SlotClk+ DLActive-
BWMgmt- ABWMgmt-
	Capabilities: [100] Advanced Error Reporting <?>
	Capabilities: [140] Device Serial Number b1-e5-7c-ff-ff-21-1b-00
	Kernel modules: e1000e

root@d8labralinux:/var/home/juice# ethtool eth1
Settings for eth1:
	Supported ports: [ TP ]
	Supported link modes:   10baseT/Half 10baseT/Full
	                        100baseT/Half 100baseT/Full
	                        1000baseT/Full
	Supports auto-negotiation: Yes
	Advertised link modes:  10baseT/Half 10baseT/Full
	                        100baseT/Half 100baseT/Full
	                        1000baseT/Full
	Advertised pause frame use: No
	Advertised auto-negotiation: Yes
	Link partner advertised link modes:  Not reported
	Link partner advertised pause frame use: No
	Link partner advertised auto-negotiation: No
	Speed: 1000Mb/s
	Duplex: Full
	Port: Twisted Pair
	PHYAD: 1
	Transceiver: internal
	Auto-negotiation: on
	MDI-X: on
	Supports Wake-on: pumbag
	Wake-on: d
	Current message level: 0x00000001 (1)
	Link detected: yes





2.6.38-rc1
----------

dmesg:

[  195.685655] e1000e: Intel(R) PRO/1000 Network Driver - 1.2.20-k2
[  195.685658] e1000e: Copyright(c) 1999 - 2011 Intel Corporation.
[  195.685677] e1000e 0000:04:00.0: Disabling ASPM  L1
[  195.685690] e1000e 0000:04:00.0: PCI INT A -> GSI 16 (level, low) ->
IRQ 16
[  195.685707] e1000e 0000:04:00.0: setting latency timer to 64
[  195.685852] e1000e 0000:04:00.0: irq 69 for MSI/MSI-X
[  195.869917] e1000e 0000:04:00.0: eth1: (PCI Express:2.5GB/s:Width x1)
00:1b:21:7c:e5:b1
[  195.869921] e1000e 0000:04:00.0: eth1: Intel(R) PRO/1000 Network
Connection
[  195.870006] e1000e 0000:04:00.0: eth1: MAC: 1, PHY: 4, PBA No: D50861-006
[  196.017285] e1000e 0000:04:00.0: irq 69 for MSI/MSI-X
[  196.073144] e1000e 0000:04:00.0: irq 69 for MSI/MSI-X
[  196.073630] ADDRCONF(NETDEV_UP): eth1: link is not ready
[  198.746000] e1000e: eth1 NIC Link is Up 1000 Mbps Full Duplex, Flow
Control: None
[  198.746162] ADDRCONF(NETDEV_CHANGE): eth1: link becomes ready
[  209.564433] eth1: no IPv6 routers present


pktgen:

Params: count 10000000  min_pkt_size: 60  max_pkt_size: 60
     frags: 0  delay: 0  clone_skb: 1  ifname: eth1
     flows: 0 flowlen: 0
     queue_map_min: 0  queue_map_max: 0
     dst_min: 10.10.11.2  dst_max:
        src_min:   src_max:
     src_mac: 00:1b:21:7c:e5:b1 dst_mac: 00:04:23:08:91:dc
     udp_src_min: 9  udp_src_max: 9  udp_dst_min: 9  udp_dst_max: 9
     src_mac_count: 0  dst_mac_count: 0
     Flags:
Current:
     pkts-sofar: 10000000  errors: 0
     started: 77203892067us  stopped: 77216465982us idle: 1325us
     seq_num: 10000001  cur_dst_mac_offset: 0  cur_src_mac_offset: 0
     cur_saddr: 0x0  cur_daddr: 0x20b0a0a
     cur_udp_dst: 9  cur_udp_src: 9
     cur_queue_map: 0
     flows: 0
Result: OK: 12573914(c12572589+d1325) nsec, 10000000 (60byte,0frags)
  795297pps 381Mb/sec (381742560bps) errors: 0


AX4000 analyser:

   Total bitrate:             407.117 MBits/s
   Bandwidth:                 40.72% GE
   Average packet intereval:  1.26 us






2.6.37
------


dmesg:

[ 1810.959907] e1000e: Intel(R) PRO/1000 Network Driver - 1.2.7-k2
[ 1810.959909] e1000e: Copyright (c) 1999 - 2010 Intel Corporation.
[ 1810.959928] e1000e 0000:04:00.0: Disabling ASPM  L1
[ 1810.959942] e1000e 0000:04:00.0: PCI INT A -> GSI 16 (level, low) ->
IRQ 16
[ 1810.959961] e1000e 0000:04:00.0: setting latency timer to 64
[ 1810.960103] e1000e 0000:04:00.0: irq 66 for MSI/MSI-X
[ 1811.137269] e1000e 0000:04:00.0: eth1: (PCI Express:2.5GB/s:Width x1)
00:1b:21:7c:e5:b1
[ 1811.137272] e1000e 0000:04:00.0: eth1: Intel(R) PRO/1000 Network
Connection
[ 1811.137358] e1000e 0000:04:00.0: eth1: MAC: 1, PHY: 4, PBA No: d50861-006
[ 1811.286173] e1000e 0000:04:00.0: irq 66 for MSI/MSI-X
[ 1811.342065] e1000e 0000:04:00.0: irq 66 for MSI/MSI-X
[ 1811.342575] ADDRCONF(NETDEV_UP): eth1: link is not ready
[ 1814.010736] e1000e: eth1 NIC Link is Up 1000 Mbps Full Duplex, Flow
Control: None
[ 1814.010949] ADDRCONF(NETDEV_CHANGE): eth1: link becomes ready
[ 1824.082148] eth1: no IPv6 routers present


pktgen:

Params: count 10000000  min_pkt_size: 60  max_pkt_size: 60
     frags: 0  delay: 0  clone_skb: 1  ifname: eth1
     flows: 0 flowlen: 0
     queue_map_min: 0  queue_map_max: 0
     dst_min: 10.10.11.2  dst_max:
        src_min:   src_max:
     src_mac: 00:1b:21:7c:e5:b1 dst_mac: 00:04:23:08:91:dc
     udp_src_min: 9  udp_src_max: 9  udp_dst_min: 9  udp_dst_max: 9
     src_mac_count: 0  dst_mac_count: 0
     Flags:
Current:
     pkts-sofar: 10000000  errors: 0
     started: 265936151us  stopped: 278645077us idle: 1651us
     seq_num: 10000001  cur_dst_mac_offset: 0  cur_src_mac_offset: 0
     cur_saddr: 0x0  cur_daddr: 0x20b0a0a
     cur_udp_dst: 9  cur_udp_src: 9
     cur_queue_map: 0
     flows: 0
Result: OK: 12708925(c12707274+d1651) nsec, 10000000 (60byte,0frags)
  786848pps 377Mb/sec (377687040bps) errors: 0


AX4000 analyser:

   Total bitrate:             402.904 MBits/s
   Bandwidth:                 40.29% GE
   Average packet intereval:  1.27 us






2.6.32-27
---------


dmesg:

[    2.178800] e1000e: Intel(R) PRO/1000 Network Driver - 1.0.2-k2
[    2.178802] e1000e: Copyright (c) 1999-2008 Intel Corporation.
[    2.178854] e1000e 0000:04:00.0: PCI INT A -> GSI 16 (level, low) ->
IRQ 16
[    2.178887] e1000e 0000:04:00.0: setting latency timer to 64
[    2.179039] e1000e 0000:04:00.0: irq 53 for MSI/MSI-X
[    2.360700] 0000:04:00.0: eth1: (PCI Express:2.5GB/s:Width x1)
00:1b:21:7c:e5:b1
[    2.360702] 0000:04:00.0: eth1: Intel(R) PRO/1000 Network Connection
[    2.360787] 0000:04:00.0: eth1: MAC: 1, PHY: 4, PBA No: d50861-006
[    9.551486] e1000e 0000:04:00.0: irq 53 for MSI/MSI-X
[    9.607309] e1000e 0000:04:00.0: irq 53 for MSI/MSI-X
[    9.607876] ADDRCONF(NETDEV_UP): eth1: link is not ready
[   12.448302] e1000e: eth1 NIC Link is Up 1000 Mbps Full Duplex, Flow
Control: None
[   12.448544] ADDRCONF(NETDEV_CHANGE): eth1: link becomes ready
[   23.068498] eth1: no IPv6 routers present


pktgen:

Params: count 10000000  min_pkt_size: 60  max_pkt_size: 60
     frags: 0  delay: 0  clone_skb: 1  ifname: eth1
     flows: 0 flowlen: 0
     queue_map_min: 0  queue_map_max: 0
     dst_min: 10.10.11.2  dst_max:
        src_min:   src_max:
     src_mac: 00:1b:21:7c:e5:b1 dst_mac: 00:04:23:08:91:dc
     udp_src_min: 9  udp_src_max: 9  udp_dst_min: 9  udp_dst_max: 9
     src_mac_count: 0  dst_mac_count: 0
     Flags:
Current:
     pkts-sofar: 10000000  errors: 0
     started: 799760010us  stopped: 813092189us idle: 1314us
     seq_num: 10000001  cur_dst_mac_offset: 0  cur_src_mac_offset: 0
     cur_saddr: 0x0  cur_daddr: 0x20b0a0a
     cur_udp_dst: 9  cur_udp_src: 9
     cur_queue_map: 0
     flows: 0
Result: OK: 13332178(c13330864+d1314) nsec, 10000000 (60byte,0frags)
  750064pps 360Mb/sec (360030720bps) errors: 0


AX4000 analyser:

   Total bitrate:             383.879 MBits/s
   Bandwidth:                 38.39% GE
   Average packet intereval:  1.33 us




root@d8labralinux:/var/home/juice/pkt_test# cat ./pktgen_conf
#!/bin/bash

#modprobe pktgen

function pgset() {
  local result
  echo $1 > $PGDEV
  result=`cat $PGDEV | fgrep "Result: OK:"`
  if [ "$result" = "" ]; then
    cat $PGDEV | fgrep Result:
  fi
}

function pg() {
  echo inject > $PGDEV
  cat $PGDEV
}

# Config Start Here
-----------------------------------------------------------

# thread config
# Each CPU has its own thread. Two CPU example. We add eth1, eth2 respectively.
PGDEV=/proc/net/pktgen/kpktgend_0
echo "Removing all devices"
pgset "rem_device_all"
PGDEV=/proc/net/pktgen/kpktgend_1
pgset "rem_device_all"

PGDEV=/proc/net/pktgen/kpktgend_0
echo "Adding eth1"
pgset "add_device eth1"
#echo "Setting max_before_softirq 10000"
#pgset "max_before_softirq 10000"

# device config
# ipg is inter packet gap. 0 means maximum speed.
CLONE_SKB="clone_skb 1"
# NIC adds 4 bytes CRC
PKT_SIZE="pkt_size 60"
# COUNT 0 means forever
#COUNT="count 0"
COUNT="count 10000000"
IPG="delay 0"
PGDEV=/proc/net/pktgen/eth1
echo "Configuring $PGDEV"
pgset "$COUNT"
pgset "$CLONE_SKB"
pgset "$PKT_SIZE"
pgset "$IPG"
pgset "dst 10.10.11.2"
pgset "dst_mac 00:04:23:08:91:dc"
pgset "queue_map_min 0"

# Time to run
PGDEV=/proc/net/pktgen/pgctrl
echo "Running... ctrl^C to stop"
pgset "start"
echo "Done"

# Result can be viewed in /proc/net/pktgen/eth1





^ permalink raw reply

* [PATCH net-next-2.6] net_sched:  TCQ_F_CAN_BYPASS generalization
From: Eric Dumazet @ 2011-01-21 11:04 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, Patrick McHardy, Jesper Dangaard Brouer, Jarek Poplawski,
	jamal
In-Reply-To: <1295537236.2825.286.camel@edumazet-laptop>

Now that qdisc stab is handled before TCQ_F_CAN_BYPASS test in
__dev_xmit_skb(), we can generalize TCQ_F_CAN_BYPASS to other qdiscs
than pfifo_fast : pfifo, bfifo, pfifo_head_drop and sfq

SFQ is special because it can have external classifiers, and in these
cases, we cannot bypass queue discipline (packet could be dropped by
classifier) without admin asking it, or further changes.

It's worth doing this, especially for SFQ, avoiding dirtying memory in
case no packets are already waiting in queue.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Patrick McHardy <kaber@trash.net>
CC: Jesper Dangaard Brouer <hawk@diku.dk>
CC: Jarek Poplawski <jarkao2@gmail.com>
CC: Jamal Hadi Salim <hadi@cyberus.ca>
CC: Stephen Hemminger <shemminger@vyatta.com>
---
I am not sure RED can use bypass too, feel free to comment on this ;)

 net/sched/sch_fifo.c    |   13 ++++++++++++-
 net/sched/sch_generic.c |    5 ++---
 net/sched/sch_mq.c      |    1 -
 net/sched/sch_mqprio.c  |    1 -
 net/sched/sch_sfq.c     |    6 ++++++
 5 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index b3075f8..f7290d2 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -64,11 +64,13 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 static int fifo_init(struct Qdisc *sch, struct nlattr *opt)
 {
 	struct fifo_sched_data *q = qdisc_priv(sch);
+	bool bypass;
+	bool is_bfifo = sch->ops == &bfifo_qdisc_ops;
 
 	if (opt == NULL) {
 		u32 limit = qdisc_dev(sch)->tx_queue_len ? : 1;
 
-		if (sch->ops == &bfifo_qdisc_ops)
+		if (is_bfifo)
 			limit *= psched_mtu(qdisc_dev(sch));
 
 		q->limit = limit;
@@ -81,6 +83,15 @@ static int fifo_init(struct Qdisc *sch, struct nlattr *opt)
 		q->limit = ctl->limit;
 	}
 
+	if (is_bfifo)
+		bypass = q->limit >= psched_mtu(qdisc_dev(sch));
+	else
+		bypass = q->limit >= 1;
+
+	if (bypass)
+		sch->flags |= TCQ_F_CAN_BYPASS;
+	else
+		sch->flags &= ~TCQ_F_CAN_BYPASS;
 	return 0;
 }
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index cc17e79..0da09d5 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -527,6 +527,8 @@ static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
 	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
 		skb_queue_head_init(band2list(priv, prio));
 
+	/* Can by-pass the queue discipline */
+	qdisc->flags |= TCQ_F_CAN_BYPASS;
 	return 0;
 }
 
@@ -691,9 +693,6 @@ static void attach_one_default_qdisc(struct net_device *dev,
 			netdev_info(dev, "activation failed\n");
 			return;
 		}
-
-		/* Can by-pass the queue discipline for default qdisc */
-		qdisc->flags |= TCQ_F_CAN_BYPASS;
 	}
 	dev_queue->qdisc_sleeping = qdisc;
 }
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index ecc302f..ec5cbc8 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -61,7 +61,6 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
 						    TC_H_MIN(ntx + 1)));
 		if (qdisc == NULL)
 			goto err;
-		qdisc->flags |= TCQ_F_CAN_BYPASS;
 		priv->qdiscs[ntx] = qdisc;
 	}
 
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 8620c65..fbc6f53 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -130,7 +130,6 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 			err = -ENOMEM;
 			goto err;
 		}
-		qdisc->flags |= TCQ_F_CAN_BYPASS;
 		priv->qdiscs[i] = qdisc;
 	}
 
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 156ad30..fdba52a 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -560,6 +560,10 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 		slot_queue_init(&q->slots[i]);
 		sfq_link(q, i);
 	}
+	if (q->limit >= 1)
+		sch->flags |= TCQ_F_CAN_BYPASS;
+	else
+		sch->flags &= ~TCQ_F_CAN_BYPASS;
 	return 0;
 }
 
@@ -611,6 +615,8 @@ static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
 			      u32 classid)
 {
+	/* we cannot bypass queue discipline anymore */
+	sch->flags &= ~TCQ_F_CAN_BYPASS;
 	return 0;
 }
 



^ permalink raw reply related

* Re: Flow Control and Port Mirroring Revisited
From: Michael S. Tsirkin @ 2011-01-21  9:59 UTC (permalink / raw)
  To: Simon Horman
  Cc: Rick Jones, Jesse Gross, Rusty Russell, virtualization, dev,
	virtualization, netdev, kvm
In-Reply-To: <20110120083727.GA1807@verge.net.au>

On Thu, Jan 20, 2011 at 05:38:33PM +0900, Simon Horman wrote:
> [ Trimmed Eric from CC list as vger was complaining that it is too long ]
> 
> On Tue, Jan 18, 2011 at 11:41:22AM -0800, Rick Jones wrote:
> > >So it won't be all that simple to implement well, and before we try,
> > >I'd like to know whether there are applications that are helped
> > >by it. For example, we could try to measure latency at various
> > >pps and see whether the backpressure helps. netperf has -b, -w
> > >flags which might help these measurements.
> > 
> > Those options are enabled when one adds --enable-burst to the
> > pre-compilation ./configure  of netperf (one doesn't have to
> > recompile netserver).  However, if one is also looking at latency
> > statistics via the -j option in the top-of-trunk, or simply at the
> > histogram with --enable-histogram on the ./configure and a verbosity
> > level of 2 (global -v 2) then one wants the very top of trunk
> > netperf from:
> 
> Hi,
> 
> I have constructed a test where I run an un-paced  UDP_STREAM test in
> one guest and a paced omni rr test in another guest at the same time.

Hmm, what is this supposed to measure?  Basically each time you run an
un-paced UDP_STREAM you get some random load on the network.
You can't tell what it was exactly, only that it was between
the send and receive throughput.

> Briefly I get the following results from the omni test..
> 
> 1. Omni test only:		MEAN_LATENCY=272.00
> 2. Omni and stream test:	MEAN_LATENCY=3423.00
> 3. cpu and net_cls group:	MEAN_LATENCY=493.00
>    As per 2 plus cgroups are created for each guest
>    and guest tasks added to the groups
> 4. 100Mbit/s class:		MEAN_LATENCY=273.00
>    As per 3 plus the net_cls groups each have a 100MBit/s HTB class
> 5. cpu.shares=128:		MEAN_LATENCY=652.00
>    As per 4 plus the cpu groups have cpu.shares set to 128
> 6. Busy CPUS:			MEAN_LATENCY=15126.00
>    As per 5 but the CPUs are made busy using a simple shell while loop
> 
> There is a bit of noise in the results as the two netperf invocations
> aren't started at exactly the same moment
> 
> For reference, my netperf invocations are:
> netperf -c -C -t UDP_STREAM -H 172.17.60.216 -l 12
> netperf.omni -p 12866 -D -c -C -H 172.17.60.216 -t omni -j -v 2 -- -r 1 -d rr -k foo -b 1 -w 200 -m 200
> 
> foo contains
> PROTOCOL
> THROUGHPUT,THROUGHPUT_UNITS
> LOCAL_SEND_THROUGHPUT
> LOCAL_RECV_THROUGHPUT
> REMOTE_SEND_THROUGHPUT
> REMOTE_RECV_THROUGHPUT
> RT_LATENCY,MIN_LATENCY,MEAN_LATENCY,MAX_LATENCY
> P50_LATENCY,P90_LATENCY,P99_LATENCY,STDDEV_LATENCY
> LOCAL_CPU_UTIL,REMOTE_CPU_UTIL

^ permalink raw reply

* Re: RFC: pid "ownership" of ip config information
From: Nicolas de Pesloüan @ 2011-01-21 10:17 UTC (permalink / raw)
  To: Patrick Schaaf; +Cc: netdev
In-Reply-To: <1295602091.3582.1.camel@lat1>

Le 21/01/2011 10:28, Patrick Schaaf a écrit :
> Dear netdev,
>
> I want to solicit comments on a feature enhancement that occurred
> to me recently.
>
> Feature:
>
> - For "ip addr add", "ip route add", "ip rule add", and maybe "ip link
> add",
>    implement an option 'pid XXXXX' to specify a PID
> - if that PID is not currently existing, fail the operation
> - if, at a later time, that PID dies, automatically remove the
> configuration,
>    as if a corresponding "ip ... del" would have been given
>
> The feature would be useful in any kind of "IP takeover" scenario.
>
> I'm concretely working on deployment of keepalived (VRRP address
> takeover) and memcachedb (address takeover after berkeley DB master
> selection).
>
> It would also apply to all kinds of routing daemons (zebra, quagga...).
>
> In all these cases, for as long as the process is working normally,
> it can trigger the relevant address withdrawal, but when the process
> dies unexpectedly (oom killer or whatever), addresses are left
> configured,
> while a partner on another host might take them over, resulting in
> actively duplicate IPs and the application breaking.
>
> The alternative to such a feature, would be to have an additional
> monitoring process, which would watch the PID somehow, and need to
> be configured to know what to withdraw when it dies.
>
> Before I go ahead and try to implement that, I would like to have
> some feedback regarding the idea
>
> - has it been discussed before?
> - would it be accepted by the relevant maintainers?
> - did I overlook alternative solutions to the problem?

There exists some user space clustering system that should provide the same functionalities. Did you 
have a look at http://www.linux-ha.org/ ?

> best regards
>    Patrick

^ permalink raw reply

* [PATCH] netfilter: ipvs: fix compiler warnings
From: Changli Gao @ 2011-01-21 10:02 UTC (permalink / raw)
  To: Simon Horman
  Cc: Wensong Zhang, Julian Anastasov, Patrick McHardy, David S. Miller,
	netdev, lvs-devel, netfilter-devel, Changli Gao

Fix compiler warnings when no transport protocol load balancing support
is configured.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
---
 net/netfilter/ipvs/ip_vs_core.c  |    4 +---
 net/netfilter/ipvs/ip_vs_ctl.c   |    4 ++++
 net/netfilter/ipvs/ip_vs_proto.c |    4 ++++
 3 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index f36a84f..d889f4f 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1894,9 +1894,7 @@ static int __net_init __ip_vs_init(struct net *net)
 
 static void __net_exit __ip_vs_cleanup(struct net *net)
 {
-	struct netns_ipvs *ipvs = net_ipvs(net);
-
-	IP_VS_DBG(10, "ipvs netns %d released\n", ipvs->gen);
+	IP_VS_DBG(10, "ipvs netns %d released\n", net_ipvs(net)->gen);
 }
 
 static struct pernet_operations ipvs_core_ops = {
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 09ca2ce..68b8033 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2062,7 +2062,9 @@ static const struct file_operations ip_vs_stats_percpu_fops = {
  */
 static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
 {
+#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
 	struct ip_vs_proto_data *pd;
+#endif
 
 	IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
 		  u->tcp_timeout,
@@ -2405,7 +2407,9 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
 static inline void
 __ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
 {
+#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
 	struct ip_vs_proto_data *pd;
+#endif
 
 #ifdef CONFIG_IP_VS_PROTO_TCP
 	pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index 6ac986c..17484a4 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -60,6 +60,9 @@ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
 	return 0;
 }
 
+#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) || \
+    defined(CONFIG_IP_VS_PROTO_SCTP) || defined(CONFIG_IP_VS_PROTO_AH) || \
+    defined(CONFIG_IP_VS_PROTO_ESP)
 /*
  *	register an ipvs protocols netns related data
  */
@@ -85,6 +88,7 @@ register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp)
 
 	return 0;
 }
+#endif
 
 /*
  *	unregister an ipvs protocol

^ permalink raw reply related

* [vmxnet3] possible irq lock inversion dependency detected
From: Jongman Heo @ 2011-01-21  9:44 UTC (permalink / raw)
  To: netdev


I'm using Fedora 14 on VMWare.

With latest Linus git tree(2b1caf6ed7b888), following warnings are printed.

Is this a known issue? I don't know whether this is a regression or not.
This is my first time using vmxnet3 driver.

===============================================================
[   17.593243] NET: Registered protocol family 10
[   17.640420] ip6_tables: (C) 2000-2006 Netfilter Core Team
[   18.418134] auditd (733): /proc/733/oom_adj is deprecated, please 
use /proc/733/oom_score_adj instead.
[   24.074627] eth0: intr type 3, mode 0, 5 vectors allocated
[   24.075450] eth0: NIC Link is Up 10000 Mbps
[   24.081505] 
[   24.081507] =========================================================
[   24.081693] [ INFO: possible irq lock inversion dependency detected ]
[   24.081797] 2.6.38-rc1+ #85
[   24.081914] ---------------------------------------------------------
[   24.082061] dbus-daemon/847 just changed the state of lock:
[   24.082200]  (&(&mc->mca_lock)->rlock){+.-...}, at: [<f85a034e>] 
mld_ifc_timer_expire+0x12a/0x1f2 [ipv6]
[   24.082488] but this lock took another, SOFTIRQ-unsafe lock in the past:
[   24.082690]  (&(&adapter->cmd_lock)->rlock){+.+...}
[   24.082769] 
[   24.082770] and interrupts could create inverse lock ordering between them.
[   24.082772] 
[   24.083196] 
[   24.083197] other info that might help us debug this:
[   24.083415] 3 locks held by dbus-daemon/847:
[   24.083538]  #0:  (&mm->mmap_sem){++++++}, at: [<c07d49ea>] 
do_page_fault+0x140/0x33b
[   24.083799]  #1:  (&idev->mc_ifc_timer){+.-...}, at: [<c04459c7>] 
run_timer_softirq+0x11c/0x268
[   24.084081]  #2:  (&ndev->lock){++.-..}, at: [<f85a023f>] 
mld_ifc_timer_expire+0x1b/0x1f2 [ipv6]
[   24.084364] 
[   24.084365] the shortest dependencies between 2nd lock and 1st lock:
[   24.084659]   -> (&(&adapter->cmd_lock)->rlock){+.+...} ops: 28 {
[   24.084826]      HARDIRQ-ON-W at:
[   24.084987]                                            [<c0461e11>] 
__lock_acquire+0x2d9/0xbf2
[   24.085302]                                            [<c0462b5f>] 
lock_acquire+0xb7/0xd7
[   24.085507]                                            [<c07d1835>] 
_raw_spin_lock+0x33/0x40
[   24.085708]                                            [<f855e2bf>] 
vmxnet3_alloc_intr_resources+0x18/0x1c1 [vmxnet3]
[   24.085964]                                            [<f8562b23>] 
vmxnet3_probe_device+0x503/0x712 [vmxnet3]
[   24.086180]                                            [<c05f645a>] 
local_pci_probe+0x2f/0x5a
[   24.086382]                                            [<c05f68ed>] 
pci_device_probe+0x48/0x6b
[   24.086582]                                            [<c067f87a>] 
driver_probe_device+0x115/0x1ec
[   24.086788]                                            [<c067f990>] 
__driver_attach+0x3f/0x5b
[   24.087014]                                            [<c067eb28>] 
bus_for_each_dev+0x3d/0x60
[   24.087214]                                            [<c067f50e>] 
driver_attach+0x19/0x1b
[   24.087411]                                            [<c067f1a4>] 
bus_add_driver+0xbd/0x215
[   24.087611]                                            [<c067fb61>] 
driver_register+0x7f/0xde
[   24.087811]                                            [<c05f6adb>] 
__pci_register_driver+0x4c/0xa9
[   24.088046]                                            [<f8568036>] 
0xf8568036
[   24.088238]                                            [<c0401268>] 
do_one_initcall+0x87/0x143
[   24.088439]                                            [<c046b0a6>] 
sys_init_module+0x130d/0x14aa
[   24.088643]                                            [<c040319f>] 
sysenter_do_call+0x12/0x38
[   24.088844]      SOFTIRQ-ON-W at:
[   24.115469]                                            [<c0461e30>] 
__lock_acquire+0x2f8/0xbf2
[   24.115483]                                            [<c0462b5f>] 
lock_acquire+0xb7/0xd7
[   24.115486]                                            [<c07d1835>] 
_raw_spin_lock+0x33/0x40
[   24.115493]                                            [<f855e2bf>] 
vmxnet3_alloc_intr_resources+0x18/0x1c1 [vmxnet3]
[   24.115508]                                            [<f8562b23>] 
vmxnet3_probe_device+0x503/0x712 [vmxnet3]
[   24.115513]                                            [<c05f645a>] 
local_pci_probe+0x2f/0x5a
[   24.115519]                                            [<c05f68ed>] 
pci_device_probe+0x48/0x6b
[   24.115523]                                            [<c067f87a>] 
driver_probe_device+0x115/0x1ec
[   24.115529]                                            [<c067f990>] 
__driver_attach+0x3f/0x5b
[   24.115532]                                            [<c067eb28>] 
bus_for_each_dev+0x3d/0x60
[   24.115535]                                            [<c067f50e>] 
driver_attach+0x19/0x1b
[   24.115539]                                            [<c067f1a4>] 
bus_add_driver+0xbd/0x215
[   24.115542]                                            [<c067fb61>] 
driver_register+0x7f/0xde
[   24.115545]                                            [<c05f6adb>] 
__pci_register_driver+0x4c/0xa9
[   24.115555]                                            [<f8568036>] 
0xf8568036
[   24.115562]                                            [<c0401268>] 
do_one_initcall+0x87/0x143
[   24.115567]                                            [<c046b0a6>] 
sys_init_module+0x130d/0x14aa
[   24.115590]                                            [<c040319f>] 
sysenter_do_call+0x12/0x38
[   24.115596]      INITIAL USE at:
[   24.115598]                                           [<c0461e85>] 
__lock_acquire+0x34d/0xbf2
[   24.115602]                                           [<c0462b5f>] 
lock_acquire+0xb7/0xd7
[   24.115606]                                           [<c07d1835>] 
_raw_spin_lock+0x33/0x40
[   24.115609]                                           [<f855e2bf>] 
vmxnet3_alloc_intr_resources+0x18/0x1c1 [vmxnet3]
[   24.115614]                                           [<f8562b23>] 
vmxnet3_probe_device+0x503/0x712 [vmxnet3]
[   24.115619]                                           [<c05f645a>] 
local_pci_probe+0x2f/0x5a
[   24.115622]                                           [<c05f68ed>] 
pci_device_probe+0x48/0x6b
[   24.115626]                                           [<c067f87a>] 
driver_probe_device+0x115/0x1ec
[   24.115629]                                           [<c067f990>] 
__driver_attach+0x3f/0x5b
[   24.115633]                                           [<c067eb28>] 
bus_for_each_dev+0x3d/0x60
[   24.115636]                                           [<c067f50e>] 
driver_attach+0x19/0x1b
[   24.115639]                                           [<c067f1a4>] 
bus_add_driver+0xbd/0x215
[   24.115642]                                           [<c067fb61>] 
driver_register+0x7f/0xde
[   24.115645]                                           [<c05f6adb>] 
__pci_register_driver+0x4c/0xa9
[   24.115648]                                           [<f8568036>] 
0xf8568036
[   24.115652]                                           [<c0401268>] 
do_one_initcall+0x87/0x143
[   24.115655]                                           [<c046b0a6>] 
sys_init_module+0x130d/0x14aa
[   24.115659]                                           [<c040319f>] 
sysenter_do_call+0x12/0x38
[   24.115662]    }
[   24.115663]    ... key      at: [<f8564580>] __key.40447+0x0/0xffffe7b2 
[vmxnet3]
[   24.115668]    ... acquired at:
[   24.115670]    [<c0462b5f>] lock_acquire+0xb7/0xd7
[   24.115673]    [<c07d1920>] _raw_spin_lock_irqsave+0x40/0x50
[   24.115676]    [<f855f494>] vmxnet3_set_mc+0x11a/0x165 [vmxnet3]
[   24.115684]    [<c0751f4d>] __dev_set_rx_mode+0x76/0x7a
[   24.115689]    [<c0751f6c>] dev_set_rx_mode+0x1b/0x26
[   24.115692]    [<c0752014>] __dev_open+0x9d/0xaf
[   24.115694]    [<c07521e6>] __dev_change_flags+0x98/0x10d
[   24.115697]    [<c07522c1>] dev_change_flags+0x13/0x3f
[   24.115699]    [<c075ae71>] do_setlink+0x245/0x56b
[   24.115703]    [<c075b6a6>] rtnl_setlink+0xaa/0xc6
[   24.115706]    [<c075b90f>] rtnetlink_rcv_msg+0x1a0/0x1af
[   24.115709]    [<c07694fd>] netlink_rcv_skb+0x32/0x73
[   24.115712]    [<c075b3bc>] rtnetlink_rcv+0x1b/0x22
[   24.115714]    [<c0769098>] netlink_unicast+0xc4/0x120
[   24.115716]    [<c076934e>] netlink_sendmsg+0x25a/0x271
[   24.115719]    [<c074135e>] __sock_sendmsg+0x54/0x5b
[   24.115723]    [<c07419dd>] sock_sendmsg+0x95/0xac
[   24.115726]    [<c0742fbc>] sys_sendmsg+0x181/0x1e8
[   24.115729]    [<c074348c>] sys_socketcall+0x22c/0x287
[   24.115732]    [<c040319f>] sysenter_do_call+0x12/0x38
[   24.115735] 
[   24.115736]  -> (_xmit_ETHER){+.....} ops: 6 {
[   24.115741]     HARDIRQ-ON-W at:
[   24.115742]                                          [<c0461e11>] 
__lock_acquire+0x2d9/0xbf2
[   24.115746]                                          [<c0462b5f>] 
lock_acquire+0xb7/0xd7
[   24.115750]                                          [<c07d1a20>] 
_raw_spin_lock_bh+0x38/0x45
[   24.115753]                                          [<c07553cd>] 
__dev_mc_add+0x23/0x61
[   24.115761]                                          [<c0755424>] 
dev_mc_add+0xa/0xc
[   24.115764]                                          [<f85a0bb9>] 
igmp6_group_added+0x56/0x139 [ipv6]
[   24.115784]                                          [<f85a114f>] 
ipv6_dev_mc_inc+0x1fb/0x20c [ipv6]
[   24.115799]                                          [<f858d0f9>] 
ipv6_add_dev+0x26d/0x28b [ipv6]
[   24.115834]                                          [<f8590007>] 
addrconf_notify+0x57/0x52c [ipv6]
[   24.115848]                                          [<c074ec2a>] 
register_netdevice_notifier+0x54/0x14e
[   24.115852]                                          [<f866b324>] 0xf866b324
[   24.115856]                                          [<f866b18a>] 0xf866b18a
[   24.115859]                                          [<c0401268>] 
do_one_initcall+0x87/0x143
[   24.115862]                                          [<c046b0a6>] 
sys_init_module+0x130d/0x14aa
[   24.115867]                                          [<c040319f>] 
sysenter_do_call+0x12/0x38
[   24.115870]     INITIAL USE at:
[   24.115872]                                         [<c0461e85>] 
__lock_acquire+0x34d/0xbf2
[   24.115876]                                         [<c0462b5f>] 
lock_acquire+0xb7/0xd7
[   24.115880]                                         [<c07d1a20>] 
_raw_spin_lock_bh+0x38/0x45
[   24.115884]                                         [<c07553cd>] 
__dev_mc_add+0x23/0x61
[   24.115887]                                         [<c0755424>] 
dev_mc_add+0xa/0xc
[   24.115891]                                         [<f85a0bb9>] 
igmp6_group_added+0x56/0x139 [ipv6]
[   24.115911]                                         [<f85a114f>] 
ipv6_dev_mc_inc+0x1fb/0x20c [ipv6]
[   24.115926]                                         [<f858d0f9>] 
ipv6_add_dev+0x26d/0x28b [ipv6]
[   24.115939]                                         [<f8590007>] 
addrconf_notify+0x57/0x52c [ipv6]
[   24.115951]                                         [<c074ec2a>] 
register_netdevice_notifier+0x54/0x14e
[   24.115954]                                         [<f866b324>] 0xf866b324
[   24.115957]                                         [<f866b18a>] 0xf866b18a
[   24.115960]                                         [<c0401268>] 
do_one_initcall+0x87/0x143
[   24.115963]                                         [<c046b0a6>] 
sys_init_module+0x130d/0x14aa
[   24.115966]                                         [<c040319f>] 
sysenter_do_call+0x12/0x38
[   24.115970]   }
[   24.115971]   ... key      at: [<c10308b8>] netdev_addr_lock_key+0x8/0x1d0
[   24.115985]   ... acquired at:
[   24.115986]    [<c0462b5f>] lock_acquire+0xb7/0xd7
[   24.115990]    [<c07d1a20>] _raw_spin_lock_bh+0x38/0x45
[   24.115992]    [<c07553cd>] __dev_mc_add+0x23/0x61
[   24.115995]    [<c0755424>] dev_mc_add+0xa/0xc
[   24.115997]    [<f85a0bb9>] igmp6_group_added+0x56/0x139 [ipv6]
[   24.116013]    [<f85a114f>] ipv6_dev_mc_inc+0x1fb/0x20c [ipv6]
[   24.116027]    [<f858d0f9>] ipv6_add_dev+0x26d/0x28b [ipv6]
[   24.116039]    [<f8590007>] addrconf_notify+0x57/0x52c [ipv6]
[   24.116051]    [<c074ec2a>] register_netdevice_notifier+0x54/0x14e
[   24.116054]    [<f866b324>] 0xf866b324
[   24.116056]    [<f866b18a>] 0xf866b18a
[   24.116058]    [<c0401268>] do_one_initcall+0x87/0x143
[   24.116061]    [<c046b0a6>] sys_init_module+0x130d/0x14aa
[   24.116064]    [<c040319f>] sysenter_do_call+0x12/0x38
[   24.116067] 
[   24.116068] -> (&(&mc->mca_lock)->rlock){+.-...} ops: 6 {
[   24.116071]    HARDIRQ-ON-W at:
[   24.116073]                                        [<c0461e11>] 
__lock_acquire+0x2d9/0xbf2
[   24.116077]                                        [<c0462b5f>] 
lock_acquire+0xb7/0xd7
[   24.116080]                                        [<c07d1a20>] 
_raw_spin_lock_bh+0x38/0x45
[   24.116083]                                        [<f85a0b8b>] 
igmp6_group_added+0x28/0x139 [ipv6]
[   24.116102]                                        [<f85a114f>] 
ipv6_dev_mc_inc+0x1fb/0x20c [ipv6]
[   24.116118]                                        [<f858d0f9>] 
ipv6_add_dev+0x26d/0x28b [ipv6]
[   24.116130]                                        [<f866b2f0>] 0xf866b2f0
[   24.116133]                                        [<f866b18a>] 0xf866b18a
[   24.116136]                                        [<c0401268>] 
do_one_initcall+0x87/0x143
[   24.116139]                                        [<c046b0a6>] 
sys_init_module+0x130d/0x14aa
[   24.116143]                                        [<c040319f>] 
sysenter_do_call+0x12/0x38
[   24.116146]    IN-SOFTIRQ-W at:
[   24.116148]                                        [<c0461dbc>] 
__lock_acquire+0x284/0xbf2
[   24.116151]                                        [<c0462b5f>] 
lock_acquire+0xb7/0xd7
[   24.116154]                                        [<c07d1a20>] 
_raw_spin_lock_bh+0x38/0x45
[   24.116158]                                        [<f85a034e>] 
mld_ifc_timer_expire+0x12a/0x1f2 [ipv6]
[   24.116173]                                        [<c0445a4a>] 
run_timer_softirq+0x19f/0x268
[   24.116180]                                        [<c043fd5b>] 
__do_softirq+0xa9/0x16a
[   24.116183]    INITIAL USE at:
[   24.116185]                                       [<c0461e85>] 
__lock_acquire+0x34d/0xbf2
[   24.116188]                                       [<c0462b5f>] 
lock_acquire+0xb7/0xd7
[   24.116191]                                       [<c07d1a20>] 
_raw_spin_lock_bh+0x38/0x45
[   24.116195]                                       [<f85a0b8b>] 
igmp6_group_added+0x28/0x139 [ipv6]
[   24.116210]                                       [<f85a114f>] 
ipv6_dev_mc_inc+0x1fb/0x20c [ipv6]
[   24.116226]                                       [<f858d0f9>] 
ipv6_add_dev+0x26d/0x28b [ipv6]
[   24.116238]                                       [<f866b2f0>] 0xf866b2f0
[   24.116241]                                       [<f866b18a>] 0xf866b18a
[   24.116244]                                       [<c0401268>] 
do_one_initcall+0x87/0x143
[   24.116247]                                       [<c046b0a6>] 
sys_init_module+0x130d/0x14aa
[   24.116251]                                       [<c040319f>] 
sysenter_do_call+0x12/0x38
[   24.116254]  }
[   24.116255]  ... key      at: [<f85b382c>] __key.38329+0x0/0xffff9cd8 [ipv6]
[   24.116266]  ... acquired at:
[   24.116268]    [<c046135b>] check_usage_forwards+0x6f/0x77
[   24.116271]    [<c0461a70>] mark_lock+0xf3/0x1bb
[   24.116273]    [<c0461dbc>] __lock_acquire+0x284/0xbf2
[   24.116276]    [<c0462b5f>] lock_acquire+0xb7/0xd7
[   24.116279]    [<c07d1a20>] _raw_spin_lock_bh+0x38/0x45
[   24.116282]    [<f85a034e>] mld_ifc_timer_expire+0x12a/0x1f2 [ipv6]
[   24.116296]    [<c0445a4a>] run_timer_softirq+0x19f/0x268
[   24.116299]    [<c043fd5b>] __do_softirq+0xa9/0x16a
[   24.116302] 
[   24.116303] 
[   24.116303] stack backtrace:
[   24.116307] Pid: 847, comm: dbus-daemon Not tainted 2.6.38-rc1+ #85
[   24.116309] Call Trace:
[   24.116314]  [<c04612e2>] ? print_irq_inversion_bug+0xfc/0x106
[   24.116317]  [<c046135b>] ? check_usage_forwards+0x6f/0x77
[   24.116320]  [<c0461a70>] ? mark_lock+0xf3/0x1bb
[   24.116323]  [<c04612ec>] ? check_usage_forwards+0x0/0x77
[   24.116327]  [<c0461dbc>] ? __lock_acquire+0x284/0xbf2
[   24.116330]  [<c04607f5>] ? save_trace+0x37/0x93
[   24.116333]  [<c046267c>] ? __lock_acquire+0xb44/0xbf2
[   24.116348]  [<f85a034e>] ? mld_ifc_timer_expire+0x12a/0x1f2 [ipv6]
[   24.116352]  [<c0462b5f>] ? lock_acquire+0xb7/0xd7
[   24.116366]  [<f85a034e>] ? mld_ifc_timer_expire+0x12a/0x1f2 [ipv6]
[   24.116370]  [<c07d1a20>] ? _raw_spin_lock_bh+0x38/0x45
[   24.116385]  [<f85a034e>] ? mld_ifc_timer_expire+0x12a/0x1f2 [ipv6]
[   24.116400]  [<f85a034e>] ? mld_ifc_timer_expire+0x12a/0x1f2 [ipv6]
[   24.116403]  [<c04459c7>] ? run_timer_softirq+0x11c/0x268
[   24.116410]  [<c0445a4a>] ? run_timer_softirq+0x19f/0x268
[   24.116413]  [<c04459c7>] ? run_timer_softirq+0x11c/0x268
[   24.116428]  [<f85a0224>] ? mld_ifc_timer_expire+0x0/0x1f2 [ipv6]
[   24.116432]  [<c043fd5b>] ? __do_softirq+0xa9/0x16a
[   24.116434]  [<c043fcb2>] ? __do_softirq+0x0/0x16a
[   24.116436]  <IRQ>  [<c043fead>] ? irq_exit+0x38/0x6c
[   24.116443]  [<c0419e71>] ? smp_apic_timer_interrupt+0x66/0x73
[   24.116447]  [<c05e6cc0>] ? trace_hardirqs_off_thunk+0xc/0x10
[   24.116451]  [<c07d2522>] ? apic_timer_interrupt+0x36/0x3c
[   24.116456]  [<c04bb9d7>] ? copy_user_highpage.clone.44+0x21/0x34
[   24.116459]  [<c04bc87a>] ? do_wp_page+0x397/0x514
[   24.116462]  [<c07d183c>] ? _raw_spin_lock+0x3a/0x40
[   24.116465]  [<c04be2a8>] ? handle_pte_fault+0x67f/0x6ea
[   24.116468]  [<c04be3bf>] ? handle_mm_fault+0xac/0xb8
[   24.116472]  [<c07d4bcd>] ? do_page_fault+0x323/0x33b
[   24.116475]  [<c0462b77>] ? lock_acquire+0xcf/0xd7
[   24.116478]  [<c07d205d>] ? restore_all_notrace+0x0/0x18
[   24.116481]  [<c04601a3>] ? trace_hardirqs_off_caller+0x2e/0x86
[   24.116484]  [<c07d48aa>] ? do_page_fault+0x0/0x33b
[   24.116487]  [<c07d27a4>] ? error_code+0x6c/0x74
[   24.550299] RPC: Registered udp transport module.
[   24.550405] RPC: Registered tcp transport module.
[   24.550498] RPC: Registered tcp NFSv4.1 backchannel transport module.
[   28.499064] Installing knfsd (copyright (C) 1996 okir@monad.swb.de).
[   28.725260] NFSD: Using /var/lib/nfs/v4recovery as the NFSv4 state recovery 
directory
[   28.783996] NFSD: starting 90-second grace period
[   33.488381] Bridge firewalling registered
[   34.443551] ------------[ cut here ]------------
[   34.443561] WARNING: at net/core/dev.c:1351 dev_disable_lro+0x54/0x57()
[   34.443563] Hardware name: VMware Virtual Platform
[   34.443565] Modules linked in: ipt_MASQUERADE iptable_nat nf_nat bridge stp 
llc nfsd lockd nfs_acl auth_rpcgss exportfs sunrpc xt_physdev 
nf_conntrack_tftp nf_conntrack_netbios_ns ip6t_REJECT nf_conntrack_ipv6 
nf_defrag_ipv6 ip6table_filter ip6_tables ipv6 vmhgfs uinput snd_ens1371 
gameport snd_rawmidi snd_ac97_codec ac97_bus snd_seq snd_seq_device snd_pcm 
snd_timer microcode vmxnet3 vmci snd soundcore snd_page_alloc i2c_piix4 mptspi 
mptscsih mptbase scsi_transport_spi [last unloaded: scsi_wait_scan]
[   34.443605] Pid: 1358, comm: libvirtd Not tainted 2.6.38-rc1+ #85
[   34.443607] Call Trace:
[   34.443615]  [<c043a801>] ? warn_slowpath_common+0x77/0x8c
[   34.443618]  [<c074d8cb>] ? dev_disable_lro+0x54/0x57
[   34.443620]  [<c074d8cb>] ? dev_disable_lro+0x54/0x57
[   34.443623]  [<c043a833>] ? warn_slowpath_null+0x1d/0x1f
[   34.443626]  [<c074d8cb>] ? dev_disable_lro+0x54/0x57
[   34.443630]  [<c079a574>] ? devinet_sysctl_forward+0xd5/0x139
[   34.443633]  [<c079a49f>] ? devinet_sysctl_forward+0x0/0x139
[   34.443638]  [<c051e889>] ? proc_sys_call_handler.clone.0+0x6a/0x89
[   34.443641]  [<c051e8a8>] ? proc_sys_write+0x0/0x22
[   34.443643]  [<c051e8c5>] ? proc_sys_write+0x1d/0x22
[   34.443649]  [<c04da9d8>] ? vfs_write+0x86/0xde
[   34.443651]  [<c04dba58>] ? fget_light+0x5f/0x66
[   34.443654]  [<c04daba6>] ? sys_write+0x3d/0x5e
[   34.443659]  [<c040319f>] ? sysenter_do_call+0x12/0x38
[   34.443662] ---[ end trace 06a697a570356b0c ]---


^ permalink raw reply

* RFC: pid "ownership" of ip config information
From: Patrick Schaaf @ 2011-01-21  9:28 UTC (permalink / raw)
  To: netdev

Dear netdev,

I want to solicit comments on a feature enhancement that occurred
to me recently.

Feature:

- For "ip addr add", "ip route add", "ip rule add", and maybe "ip link
add",
  implement an option 'pid XXXXX' to specify a PID
- if that PID is not currently existing, fail the operation
- if, at a later time, that PID dies, automatically remove the
configuration,
  as if a corresponding "ip ... del" would have been given

The feature would be useful in any kind of "IP takeover" scenario.

I'm concretely working on deployment of keepalived (VRRP address
takeover) and memcachedb (address takeover after berkeley DB master
selection).

It would also apply to all kinds of routing daemons (zebra, quagga...).

In all these cases, for as long as the process is working normally,
it can trigger the relevant address withdrawal, but when the process
dies unexpectedly (oom killer or whatever), addresses are left
configured,
while a partner on another host might take them over, resulting in
actively duplicate IPs and the application breaking.

The alternative to such a feature, would be to have an additional
monitoring process, which would watch the PID somehow, and need to
be configured to know what to withdraw when it dies.

Before I go ahead and try to implement that, I would like to have
some feedback regarding the idea 

- has it been discussed before?
- would it be accepted by the relevant maintainers? 
- did I overlook alternative solutions to the problem?

best regards
  Patrick



^ permalink raw reply

* [PATCH] atm: idt77105: fix fetch_stats() result
From: Vasiliy Kulikov @ 2011-01-21  9:43 UTC (permalink / raw)
  To: kernel-janitors; +Cc: Chas Williams, linux-atm-general, netdev, linux-kernel

copy_to_user() used PRIV(dev)->stats instead of local stats variable.
Zero stats were returned to user in case of (zero != 0), also memcpy()
was pointless.

Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
---
 drivers/atm/idt77105.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/atm/idt77105.c b/drivers/atm/idt77105.c
index bca9cb8..487a547 100644
--- a/drivers/atm/idt77105.c
+++ b/drivers/atm/idt77105.c
@@ -151,7 +151,7 @@ static int fetch_stats(struct atm_dev *dev,struct idt77105_stats __user *arg,int
 	spin_unlock_irqrestore(&idt77105_priv_lock, flags);
 	if (arg == NULL)
 		return 0;
-	return copy_to_user(arg, &PRIV(dev)->stats,
+	return copy_to_user(arg, &stats,
 		    sizeof(struct idt77105_stats)) ? -EFAULT : 0;
 }
 
-- 
1.7.0.4

^ permalink raw reply related

* Re: [PATCH v4] net: add Faraday FTMAC100 10/100 Ethernet driver
From: Eric Dumazet @ 2011-01-21  9:08 UTC (permalink / raw)
  To: Po-Yu Chuang
  Cc: netdev, linux-kernel, bhutchings, joe, dilinger, mirqus,
	Po-Yu Chuang
In-Reply-To: <1295596533-1748-1-git-send-email-ratbert.chuang@gmail.com>

Le vendredi 21 janvier 2011 à 15:55 +0800, Po-Yu Chuang a écrit :
> From: Po-Yu Chuang <ratbert@faraday-tech.com>
> 
> FTMAC100 Ethernet Media Access Controller supports 10/100 Mbps and
> MII.  This driver has been working on some ARM/NDS32 SoC's including
> Faraday A320 and Andes AG101.
> 
> Signed-off-by: Po-Yu Chuang <ratbert@faraday-tech.com>


> +
> +static bool ftmac100_tx_complete_packet(struct ftmac100 *priv)
> +{
...
> +
> +	dma_unmap_single(priv->dev, map, skb_headlen(skb), DMA_TO_DEVICE);
> +
> +	dev_kfree_skb_irq(skb);
> +
> +	ftmac100_txdes_reset(txdes);
> +
> +	ftmac100_tx_clean_pointer_advance(priv);
> +
> +	priv->tx_pending--;
> +	netif_wake_queue(netdev);
> +
> +	return true;
> +}
> +

Thanks to NAPI, you can free skb directly, not queuing it via
NET_TX_SOFTIRQ softirq, using dev_kfree_skb() instead of
dev_kfree_skb_irq()

^ permalink raw reply

* [PATCH 4/4] ppp: Use SKB queue abstraction interfaces in fragment processing.
From: David Miller @ 2011-01-21  7:56 UTC (permalink / raw)
  To: netdev; +Cc: paulus


No more direct references to SKB queue and list implementation
details.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ppp_generic.c |   31 ++++++++++++++++---------------
 1 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c
index 1d4fb34..9f6d670 100644
--- a/drivers/net/ppp_generic.c
+++ b/drivers/net/ppp_generic.c
@@ -1998,7 +1998,7 @@ ppp_mp_reconstruct(struct ppp *ppp)
 	u32 seq = ppp->nextseq;
 	u32 minseq = ppp->minseq;
 	struct sk_buff_head *list = &ppp->mrq;
-	struct sk_buff *p, *next;
+	struct sk_buff *p, *tmp;
 	struct sk_buff *head, *tail;
 	struct sk_buff *skb = NULL;
 	int lost = 0, len = 0;
@@ -2007,14 +2007,15 @@ ppp_mp_reconstruct(struct ppp *ppp)
 		return NULL;
 	head = list->next;
 	tail = NULL;
-	for (p = head; p != (struct sk_buff *) list; p = next) {
-		next = p->next;
+	skb_queue_walk_safe(list, p, tmp) {
+	again:
 		if (seq_before(PPP_MP_CB(p)->sequence, seq)) {
 			/* this can't happen, anyway ignore the skb */
 			netdev_err(ppp->dev, "ppp_mp_reconstruct bad "
 				   "seq %u < %u\n",
 				   PPP_MP_CB(p)->sequence, seq);
-			head = next;
+			__skb_unlink(p, list);
+			kfree_skb(p);
 			continue;
 		}
 		if (PPP_MP_CB(p)->sequence != seq) {
@@ -2026,8 +2027,7 @@ ppp_mp_reconstruct(struct ppp *ppp)
 			lost = 1;
 			seq = seq_before(minseq, PPP_MP_CB(p)->sequence)?
 				minseq + 1: PPP_MP_CB(p)->sequence;
-			next = p;
-			continue;
+			goto again;
 		}
 
 		/*
@@ -2067,9 +2067,17 @@ ppp_mp_reconstruct(struct ppp *ppp)
 		 * and we haven't found a complete valid packet yet,
 		 * we can discard up to and including this fragment.
 		 */
-		if (PPP_MP_CB(p)->BEbits & E)
-			head = next;
+		if (PPP_MP_CB(p)->BEbits & E) {
+			struct sk_buff *tmp2;
 
+			skb_queue_reverse_walk_from_safe(list, p, tmp2) {
+				__skb_unlink(p, list);
+				kfree_skb(p);
+			}
+			head = skb_peek(list);
+			if (!head)
+				break;
+		}
 		++seq;
 	}
 
@@ -2110,13 +2118,6 @@ ppp_mp_reconstruct(struct ppp *ppp)
 		}
 
 		ppp->nextseq = PPP_MP_CB(tail)->sequence + 1;
-		head = tail->next;
-	}
-
-	/* Discard all the skbuffs that we can't use. */
-	while ((p = list->next) != head) {
-		__skb_unlink(p, list);
-		kfree_skb(p);
 	}
 
 	return skb;
-- 
1.7.3.4


^ permalink raw reply related

* [PATCH v4] net: add Faraday FTMAC100 10/100 Ethernet driver
From: Po-Yu Chuang @ 2011-01-21  7:55 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, bhutchings, eric.dumazet, joe, dilinger, mirqus,
	Po-Yu Chuang
In-Reply-To: <1295537418-2057-1-git-send-email-ratbert.chuang@gmail.com>

From: Po-Yu Chuang <ratbert@faraday-tech.com>

The FTMAC100 Ethernet Media Access Controller supports 10/100 Mbps and
MII.  This driver has been working on some ARM/NDS32 SoCs, including
Faraday A320 and Andes AG101.

Signed-off-by: Po-Yu Chuang <ratbert@faraday-tech.com>
---
v2:
always use NAPI
do not use our own net_device_stats structure
don't set trans_start and last_rx
stats.rx_packets and stats.rx_bytes include dropped packets
add missed netif_napi_del()
initialize spinlocks in probe function
remove rx_lock and hw_lock
use netdev_[err/info/dbg] instead of dev_* ones
use netdev_alloc_skb_ip_align()
remove ftmac100_get_stats()
use is_valid_ether_addr() instead of is_zero_ether_addr()
add const to ftmac100_ethtool_ops and ftmac100_netdev_ops
use net_ratelimit() instead of printk_ratelimit()
no explicit inline
use %pM to print MAC address
add comment before wmb
use napi poll() to handle all interrupts

v3:
undo "stats.rx_packets and stats.rx_bytes include dropped packets"
ftmac100_mdio_read() returns 0 if error
fix comment typos
use pr_fmt and pr_info
define INT_MASK_ALL_ENABLED
define MACCR_ENABLE_ALL
do not count length error many times
use bool/true/false
use cpu_to_le32/le32_to_cpu to access descriptors
indent style fix

v4:
should not access skb after netif_receive_skb()
use resource_size()
better way to use cpu_to_le32/le32_to_cpu
use spin_lock() for tx_lock
combine all netdev_info() together in ftmac100_poll()

 drivers/net/Kconfig    |    9 +
 drivers/net/Makefile   |    1 +
 drivers/net/ftmac100.c | 1207 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/net/ftmac100.h |  180 +++++++
 4 files changed, 1397 insertions(+), 0 deletions(-)
 create mode 100644 drivers/net/ftmac100.c
 create mode 100644 drivers/net/ftmac100.h

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 4f1755b..26da0ee 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -2014,6 +2014,15 @@ config BCM63XX_ENET
 	  This driver supports the ethernet MACs in the Broadcom 63xx
 	  MIPS chipset family (BCM63XX).
 
+config FTMAC100
+	tristate "Faraday FTMAC100 10/100 Ethernet support"
+	depends on ARM
+	select MII
+	help
+	  This driver supports the FTMAC100 Ethernet controller from
+	  Faraday. It is used on Faraday A320, Andes AG101, AG101P
+	  and some other ARM/NDS32 SoCs.
+
 source "drivers/net/fs_enet/Kconfig"
 
 source "drivers/net/octeon/Kconfig"
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index b90738d..7c21711 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -147,6 +147,7 @@ obj-$(CONFIG_FORCEDETH) += forcedeth.o
 obj-$(CONFIG_NE_H8300) += ne-h8300.o 8390.o
 obj-$(CONFIG_AX88796) += ax88796.o
 obj-$(CONFIG_BCM63XX_ENET) += bcm63xx_enet.o
+obj-$(CONFIG_FTMAC100) += ftmac100.o
 
 obj-$(CONFIG_TSI108_ETH) += tsi108_eth.o
 obj-$(CONFIG_MV643XX_ETH) += mv643xx_eth.o
diff --git a/drivers/net/ftmac100.c b/drivers/net/ftmac100.c
new file mode 100644
index 0000000..58b2d5f
--- /dev/null
+++ b/drivers/net/ftmac100.c
@@ -0,0 +1,1207 @@
+/*
+ * Faraday FTMAC100 10/100 Ethernet
+ *
+ * (C) Copyright 2009-2011 Faraday Technology
+ * Po-Yu Chuang <ratbert@faraday-tech.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/dma-mapping.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/mii.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netdevice.h>
+#include <linux/platform_device.h>
+
+#include "ftmac100.h"
+
+#define DRV_NAME	"ftmac100"
+#define DRV_VERSION	"0.2"
+
+#define RX_QUEUE_ENTRIES	128	/* must be power of 2 */
+#define TX_QUEUE_ENTRIES	16	/* must be power of 2 */
+
+#define MAX_PKT_SIZE		1518
+#define RX_BUF_SIZE		2044	/* must be smaller than 0x7ff */
+
+/******************************************************************************
+ * private data
+ *****************************************************************************/
+struct ftmac100_descs {
+	struct ftmac100_rxdes	rxdes[RX_QUEUE_ENTRIES];
+	struct ftmac100_txdes	txdes[TX_QUEUE_ENTRIES];
+};
+
+struct ftmac100 {
+	struct resource		*res;
+	void			*base;
+	int			irq;
+
+	struct ftmac100_descs	*descs;
+	dma_addr_t		descs_dma_addr;
+
+	unsigned int		rx_pointer;
+	unsigned int		tx_clean_pointer;
+	unsigned int		tx_pointer;
+	unsigned int		tx_pending;
+
+	spinlock_t		tx_lock;
+
+	struct net_device	*netdev;
+	struct device		*dev;
+	struct napi_struct	napi;
+
+	struct mii_if_info	mii;
+};
+
+/******************************************************************************
+ * internal functions (hardware register access)
+ *****************************************************************************/
+#define INT_MASK_ALL_ENABLED	(FTMAC100_INT_RPKT_FINISH	| \
+				 FTMAC100_INT_NORXBUF		| \
+				 FTMAC100_INT_XPKT_OK		| \
+				 FTMAC100_INT_XPKT_LOST		| \
+				 FTMAC100_INT_RPKT_LOST		| \
+				 FTMAC100_INT_AHB_ERR		| \
+				 FTMAC100_INT_PHYSTS_CHG)
+
+static void ftmac100_enable_all_int(struct ftmac100 *priv)
+{
+	iowrite32(INT_MASK_ALL_ENABLED, priv->base + FTMAC100_OFFSET_IMR);
+}
+
+static void ftmac100_disable_all_int(struct ftmac100 *priv)
+{
+	iowrite32(0, priv->base + FTMAC100_OFFSET_IMR);
+}
+
+static void ftmac100_set_rx_ring_base(struct ftmac100 *priv, dma_addr_t addr)
+{
+	iowrite32(addr, priv->base + FTMAC100_OFFSET_RXR_BADR);
+}
+
+static void ftmac100_set_tx_ring_base(struct ftmac100 *priv, dma_addr_t addr)
+{
+	iowrite32(addr, priv->base + FTMAC100_OFFSET_TXR_BADR);
+}
+
+static void ftmac100_txdma_start_polling(struct ftmac100 *priv)
+{
+	iowrite32(1, priv->base + FTMAC100_OFFSET_TXPD);
+}
+
+static int ftmac100_reset(struct ftmac100 *priv)
+{
+	struct net_device *netdev = priv->netdev;
+	int i;
+
+	/* NOTE: reset clears all registers */
+	iowrite32(FTMAC100_MACCR_SW_RST, priv->base + FTMAC100_OFFSET_MACCR);
+
+	for (i = 0; i < 5; i++) {
+		unsigned int maccr;
+
+		maccr = ioread32(priv->base + FTMAC100_OFFSET_MACCR);
+		if (!(maccr & FTMAC100_MACCR_SW_RST)) {
+			/*
+			 * FTMAC100_MACCR_SW_RST being cleared does not
+			 * indicate that the hardware reset has completed.
+			 * We still need to wait for a while.
+			 */
+			usleep_range(500, 1000);
+			return 0;
+		}
+
+		usleep_range(1000, 10000);
+	}
+
+	netdev_err(netdev, "software reset failed\n");
+	return -EIO;
+}
+
+static void ftmac100_set_mac(struct ftmac100 *priv, const unsigned char *mac)
+{
+	unsigned int maddr = mac[0] << 8 | mac[1];
+	unsigned int laddr = mac[2] << 24 | mac[3] << 16 | mac[4] << 8 | mac[5];
+
+	iowrite32(maddr, priv->base + FTMAC100_OFFSET_MAC_MADR);
+	iowrite32(laddr, priv->base + FTMAC100_OFFSET_MAC_LADR);
+}
+
+#define MACCR_ENABLE_ALL	(FTMAC100_MACCR_XMT_EN	| \
+				 FTMAC100_MACCR_RCV_EN	| \
+				 FTMAC100_MACCR_XDMA_EN	| \
+				 FTMAC100_MACCR_RDMA_EN	| \
+				 FTMAC100_MACCR_CRC_APD	| \
+				 FTMAC100_MACCR_FULLDUP	| \
+				 FTMAC100_MACCR_RX_RUNT	| \
+				 FTMAC100_MACCR_RX_BROADPKT)
+
+static int ftmac100_start_hw(struct ftmac100 *priv)
+{
+	struct net_device *netdev = priv->netdev;
+
+	if (ftmac100_reset(priv))
+		return -EIO;
+
+	/* setup ring buffer base registers */
+
+	ftmac100_set_rx_ring_base(priv,
+				  priv->descs_dma_addr +
+				  offsetof(struct ftmac100_descs, rxdes));
+	ftmac100_set_tx_ring_base(priv,
+				  priv->descs_dma_addr +
+				  offsetof(struct ftmac100_descs, txdes));
+
+	iowrite32(FTMAC100_APTC_RXPOLL_CNT(1), priv->base + FTMAC100_OFFSET_APTC);
+
+	ftmac100_set_mac(priv, netdev->dev_addr);
+
+	iowrite32(MACCR_ENABLE_ALL, priv->base + FTMAC100_OFFSET_MACCR);
+	return 0;
+}
+
+static void ftmac100_stop_hw(struct ftmac100 *priv)
+{
+	iowrite32(0, priv->base + FTMAC100_OFFSET_MACCR);
+}
+
+/******************************************************************************
+ * internal functions (receive descriptor)
+ *****************************************************************************/
+static bool ftmac100_rxdes_first_segment(struct ftmac100_rxdes *rxdes)
+{
+	return rxdes->rxdes0 & cpu_to_le32(FTMAC100_RXDES0_FRS);
+}
+
+static bool ftmac100_rxdes_last_segment(struct ftmac100_rxdes *rxdes)
+{
+	return rxdes->rxdes0 & cpu_to_le32(FTMAC100_RXDES0_LRS);
+}
+
+static bool ftmac100_rxdes_owned_by_dma(struct ftmac100_rxdes *rxdes)
+{
+	return rxdes->rxdes0 & cpu_to_le32(FTMAC100_RXDES0_RXDMA_OWN);
+}
+
+static void ftmac100_rxdes_set_dma_own(struct ftmac100_rxdes *rxdes)
+{
+	/* clear status bits */
+	rxdes->rxdes0 = cpu_to_le32(FTMAC100_RXDES0_RXDMA_OWN);
+}
+
+static bool ftmac100_rxdes_rx_error(struct ftmac100_rxdes *rxdes)
+{
+	return rxdes->rxdes0 & cpu_to_le32(FTMAC100_RXDES0_RX_ERR);
+}
+
+static bool ftmac100_rxdes_crc_error(struct ftmac100_rxdes *rxdes)
+{
+	return rxdes->rxdes0 & cpu_to_le32(FTMAC100_RXDES0_CRC_ERR);
+}
+
+static bool ftmac100_rxdes_frame_too_long(struct ftmac100_rxdes *rxdes)
+{
+	return rxdes->rxdes0 & cpu_to_le32(FTMAC100_RXDES0_FTL);
+}
+
+static bool ftmac100_rxdes_runt(struct ftmac100_rxdes *rxdes)
+{
+	return rxdes->rxdes0 & cpu_to_le32(FTMAC100_RXDES0_RUNT);
+}
+
+static bool ftmac100_rxdes_odd_nibble(struct ftmac100_rxdes *rxdes)
+{
+	return rxdes->rxdes0 & cpu_to_le32(FTMAC100_RXDES0_RX_ODD_NB);
+}
+
+static unsigned int ftmac100_rxdes_frame_length(struct ftmac100_rxdes *rxdes)
+{
+	return rxdes->rxdes0 & cpu_to_le32(FTMAC100_RXDES0_RFL);
+}
+
+static bool ftmac100_rxdes_multicast(struct ftmac100_rxdes *rxdes)
+{
+	return rxdes->rxdes0 & cpu_to_le32(FTMAC100_RXDES0_MULTICAST);
+}
+
+static void ftmac100_rxdes_set_buffer_size(struct ftmac100_rxdes *rxdes,
+					   unsigned int size)
+{
+	rxdes->rxdes1 &= cpu_to_le32(FTMAC100_RXDES1_EDORR);
+	rxdes->rxdes1 |= cpu_to_le32(FTMAC100_RXDES1_RXBUF_SIZE(size));
+}
+
+static void ftmac100_rxdes_set_end_of_ring(struct ftmac100_rxdes *rxdes)
+{
+	rxdes->rxdes1 |= cpu_to_le32(FTMAC100_RXDES1_EDORR);
+}
+
+static void ftmac100_rxdes_set_dma_addr(struct ftmac100_rxdes *rxdes,
+					dma_addr_t addr)
+{
+	rxdes->rxdes2 = cpu_to_le32(addr);
+}
+
+static dma_addr_t ftmac100_rxdes_get_dma_addr(struct ftmac100_rxdes *rxdes)
+{
+	return le32_to_cpu(rxdes->rxdes2);
+}
+
+/* rxdes3 is not used by hardware, we use it to keep track of buffer */
+static void ftmac100_rxdes_set_va(struct ftmac100_rxdes *rxdes, void *addr)
+{
+	rxdes->rxdes3 = cpu_to_le32(addr);
+}
+
+static void *ftmac100_rxdes_get_va(struct ftmac100_rxdes *rxdes)
+{
+	return (void *)le32_to_cpu(rxdes->rxdes3);
+}
+
+/******************************************************************************
+ * internal functions (receive)
+ *****************************************************************************/
+static int ftmac100_next_rx_pointer(int pointer)
+{
+	return (pointer + 1) & (RX_QUEUE_ENTRIES - 1);
+}
+
+static void ftmac100_rx_pointer_advance(struct ftmac100 *priv)
+{
+	priv->rx_pointer = ftmac100_next_rx_pointer(priv->rx_pointer);
+}
+
+static struct ftmac100_rxdes *ftmac100_current_rxdes(struct ftmac100 *priv)
+{
+	return &priv->descs->rxdes[priv->rx_pointer];
+}
+
+static struct ftmac100_rxdes *
+ftmac100_rx_locate_first_segment(struct ftmac100 *priv)
+{
+	struct ftmac100_rxdes *rxdes = ftmac100_current_rxdes(priv);
+
+	while (!ftmac100_rxdes_owned_by_dma(rxdes)) {
+		if (ftmac100_rxdes_first_segment(rxdes))
+			return rxdes;
+
+		ftmac100_rxdes_set_dma_own(rxdes);
+		ftmac100_rx_pointer_advance(priv);
+		rxdes = ftmac100_current_rxdes(priv);
+	}
+
+	return NULL;
+}
+
+static bool ftmac100_rx_packet_error(struct ftmac100 *priv,
+				     struct ftmac100_rxdes *rxdes)
+{
+	struct net_device *netdev = priv->netdev;
+	bool error = false;
+
+	if (unlikely(ftmac100_rxdes_rx_error(rxdes))) {
+		if (net_ratelimit())
+			netdev_info(netdev, "rx err\n");
+
+		netdev->stats.rx_errors++;
+		error = true;
+	}
+
+	if (unlikely(ftmac100_rxdes_crc_error(rxdes))) {
+		if (net_ratelimit())
+			netdev_info(netdev, "rx crc err\n");
+
+		netdev->stats.rx_crc_errors++;
+		error = true;
+	}
+
+	if (unlikely(ftmac100_rxdes_frame_too_long(rxdes))) {
+		if (net_ratelimit())
+			netdev_info(netdev, "rx frame too long\n");
+
+		netdev->stats.rx_length_errors++;
+		error = true;
+	} else if (unlikely(ftmac100_rxdes_runt(rxdes))) {
+		if (net_ratelimit())
+			netdev_info(netdev, "rx runt\n");
+
+		netdev->stats.rx_length_errors++;
+		error = true;
+	} else if (unlikely(ftmac100_rxdes_odd_nibble(rxdes))) {
+		if (net_ratelimit())
+			netdev_info(netdev, "rx odd nibble\n");
+
+		netdev->stats.rx_length_errors++;
+		error = true;
+	}
+
+	return error;
+}
+
+static void ftmac100_rx_drop_packet(struct ftmac100 *priv)
+{
+	struct net_device *netdev = priv->netdev;
+	struct ftmac100_rxdes *rxdes = ftmac100_current_rxdes(priv);
+	bool done = false;
+
+	if (net_ratelimit())
+		netdev_dbg(netdev, "drop packet %p\n", rxdes);
+
+	do {
+		if (ftmac100_rxdes_last_segment(rxdes))
+			done = true;
+
+		ftmac100_rxdes_set_dma_own(rxdes);
+		ftmac100_rx_pointer_advance(priv);
+		rxdes = ftmac100_current_rxdes(priv);
+	} while (!done && !ftmac100_rxdes_owned_by_dma(rxdes));
+
+	netdev->stats.rx_dropped++;
+}
+
+static bool ftmac100_rx_packet(struct ftmac100 *priv, int *processed)
+{
+	struct net_device *netdev = priv->netdev;
+	struct ftmac100_rxdes *rxdes;
+	struct sk_buff *skb;
+	int length;
+	bool copied = false;
+	bool done = false;
+
+	rxdes = ftmac100_rx_locate_first_segment(priv);
+	if (!rxdes)
+		return false;
+
+	if (unlikely(ftmac100_rx_packet_error(priv, rxdes))) {
+		ftmac100_rx_drop_packet(priv);
+		return true;
+	}
+
+	/* start processing */
+
+	length = ftmac100_rxdes_frame_length(rxdes);
+
+	skb = netdev_alloc_skb_ip_align(netdev, length);
+	if (unlikely(!skb)) {
+		if (net_ratelimit())
+			netdev_err(netdev, "rx skb alloc failed\n");
+
+		ftmac100_rx_drop_packet(priv);
+		return true;
+	}
+
+	if (unlikely(ftmac100_rxdes_multicast(rxdes)))
+		netdev->stats.multicast++;
+
+	do {
+		dma_addr_t d = ftmac100_rxdes_get_dma_addr(rxdes);
+		void *buf = ftmac100_rxdes_get_va(rxdes);
+		int size;
+
+		size = min(length - copied, RX_BUF_SIZE);
+
+		dma_sync_single_for_cpu(priv->dev, d, RX_BUF_SIZE,
+					DMA_FROM_DEVICE);
+		memcpy(skb_put(skb, size), buf, size);
+
+		copied += size;
+
+		if (ftmac100_rxdes_last_segment(rxdes))
+			done = true;
+
+		dma_sync_single_for_device(priv->dev, d, RX_BUF_SIZE,
+					   DMA_FROM_DEVICE);
+
+		ftmac100_rxdes_set_dma_own(rxdes);
+
+		ftmac100_rx_pointer_advance(priv);
+		rxdes = ftmac100_current_rxdes(priv);
+	} while (!done && copied < length);
+
+	skb->protocol = eth_type_trans(skb, netdev);
+
+	netdev->stats.rx_packets++;
+	netdev->stats.rx_bytes += skb->len;
+
+	/* push packet to protocol stack */
+	netif_receive_skb(skb);
+
+	(*processed)++;
+	return true;
+}
+
+/******************************************************************************
+ * internal functions (transmit descriptor)
+ *****************************************************************************/
+static void ftmac100_txdes_reset(struct ftmac100_txdes *txdes)
+{
+	/* clear all except end of ring bit */
+	txdes->txdes0 = 0;
+	txdes->txdes1 &= FTMAC100_TXDES1_EDOTR;
+	txdes->txdes2 = 0;
+	txdes->txdes3 = 0;
+}
+
+static bool ftmac100_txdes_owned_by_dma(struct ftmac100_txdes *txdes)
+{
+	return txdes->txdes0 & cpu_to_le32(FTMAC100_TXDES0_TXDMA_OWN);
+}
+
+static void ftmac100_txdes_set_dma_own(struct ftmac100_txdes *txdes)
+{
+	/*
+	 * Make sure dma own bit will not be set before any other
+	 * descriptor fields.
+	 */
+	wmb();
+	txdes->txdes0 |= cpu_to_le32(FTMAC100_TXDES0_TXDMA_OWN);
+}
+
+static bool ftmac100_txdes_excessive_collision(struct ftmac100_txdes *txdes)
+{
+	return txdes->txdes0 & cpu_to_le32(FTMAC100_TXDES0_TXPKT_EXSCOL);
+}
+
+static bool ftmac100_txdes_late_collision(struct ftmac100_txdes *txdes)
+{
+	return txdes->txdes0 & cpu_to_le32(FTMAC100_TXDES0_TXPKT_LATECOL);
+}
+
+static void ftmac100_txdes_set_end_of_ring(struct ftmac100_txdes *txdes)
+{
+	txdes->txdes1 |= cpu_to_le32(FTMAC100_TXDES1_EDOTR);
+}
+
+static void ftmac100_txdes_set_first_segment(struct ftmac100_txdes *txdes)
+{
+	txdes->txdes1 |= cpu_to_le32(FTMAC100_TXDES1_FTS);
+}
+
+static void ftmac100_txdes_set_last_segment(struct ftmac100_txdes *txdes)
+{
+	txdes->txdes1 |= cpu_to_le32(FTMAC100_TXDES1_LTS);
+}
+
+static void ftmac100_txdes_set_txint(struct ftmac100_txdes *txdes)
+{
+	txdes->txdes1 |= cpu_to_le32(FTMAC100_TXDES1_TXIC);
+}
+
+static void ftmac100_txdes_set_buffer_size(struct ftmac100_txdes *txdes,
+					   unsigned int len)
+{
+	txdes->txdes1 |= cpu_to_le32(FTMAC100_TXDES1_TXBUF_SIZE(len));
+}
+
+static void ftmac100_txdes_set_dma_addr(struct ftmac100_txdes *txdes,
+					dma_addr_t addr)
+{
+	txdes->txdes2 = cpu_to_le32(addr);
+}
+
+static dma_addr_t ftmac100_txdes_get_dma_addr(struct ftmac100_txdes *txdes)
+{
+	return le32_to_cpu(txdes->txdes2);
+}
+
+/* txdes3 is not used by hardware, we use it to keep track of socket buffer */
+static void ftmac100_txdes_set_skb(struct ftmac100_txdes *txdes,
+				   struct sk_buff *skb)
+{
+	txdes->txdes3 = cpu_to_le32(skb);
+}
+
+static struct sk_buff *ftmac100_txdes_get_skb(struct ftmac100_txdes *txdes)
+{
+	return (struct sk_buff *)le32_to_cpu(txdes->txdes3);
+}
+
+/******************************************************************************
+ * internal functions (transmit)
+ *****************************************************************************/
+static int ftmac100_next_tx_pointer(int pointer)
+{
+	return (pointer + 1) & (TX_QUEUE_ENTRIES - 1);
+}
+
+static void ftmac100_tx_pointer_advance(struct ftmac100 *priv)
+{
+	priv->tx_pointer = ftmac100_next_tx_pointer(priv->tx_pointer);
+}
+
+static void ftmac100_tx_clean_pointer_advance(struct ftmac100 *priv)
+{
+	priv->tx_clean_pointer = ftmac100_next_tx_pointer(priv->tx_clean_pointer);
+}
+
+static struct ftmac100_txdes *ftmac100_current_txdes(struct ftmac100 *priv)
+{
+	return &priv->descs->txdes[priv->tx_pointer];
+}
+
+static struct ftmac100_txdes *
+ftmac100_current_clean_txdes(struct ftmac100 *priv)
+{
+	return &priv->descs->txdes[priv->tx_clean_pointer];
+}
+
+static bool ftmac100_tx_complete_packet(struct ftmac100 *priv)
+{
+	struct net_device *netdev = priv->netdev;
+	struct ftmac100_txdes *txdes;
+	struct sk_buff *skb;
+	dma_addr_t map;
+
+	if (priv->tx_pending == 0)
+		return false;
+
+	txdes = ftmac100_current_clean_txdes(priv);
+
+	if (ftmac100_txdes_owned_by_dma(txdes))
+		return false;
+
+	skb = ftmac100_txdes_get_skb(txdes);
+	map = ftmac100_txdes_get_dma_addr(txdes);
+
+	if (unlikely(ftmac100_txdes_excessive_collision(txdes) ||
+		     ftmac100_txdes_late_collision(txdes))) {
+		/*
+		 * packet transmitted to ethernet lost due to late collision
+		 * or excessive collision
+		 */
+		netdev->stats.tx_aborted_errors++;
+	} else {
+		netdev->stats.tx_packets++;
+		netdev->stats.tx_bytes += skb->len;
+	}
+
+	dma_unmap_single(priv->dev, map, skb_headlen(skb), DMA_TO_DEVICE);
+
+	dev_kfree_skb_irq(skb);
+
+	ftmac100_txdes_reset(txdes);
+
+	ftmac100_tx_clean_pointer_advance(priv);
+
+	priv->tx_pending--;
+	netif_wake_queue(netdev);
+
+	return true;
+}
+
+static void ftmac100_tx_complete(struct ftmac100 *priv)
+{
+	spin_lock(&priv->tx_lock);
+	while (ftmac100_tx_complete_packet(priv))
+		;
+	spin_unlock(&priv->tx_lock);
+}
+
+static int ftmac100_xmit(struct ftmac100 *priv, struct sk_buff *skb,
+			 dma_addr_t map)
+{
+	struct net_device *netdev = priv->netdev;
+	struct ftmac100_txdes *txdes;
+	unsigned int len = (skb->len < ETH_ZLEN) ? ETH_ZLEN : skb->len;
+
+	txdes = ftmac100_current_txdes(priv);
+	ftmac100_tx_pointer_advance(priv);
+
+	/* setup TX descriptor */
+
+	spin_lock(&priv->tx_lock);
+	ftmac100_txdes_set_skb(txdes, skb);
+	ftmac100_txdes_set_dma_addr(txdes, map);
+
+	ftmac100_txdes_set_first_segment(txdes);
+	ftmac100_txdes_set_last_segment(txdes);
+	ftmac100_txdes_set_txint(txdes);
+	ftmac100_txdes_set_buffer_size(txdes, len);
+
+	priv->tx_pending++;
+	if (priv->tx_pending == TX_QUEUE_ENTRIES) {
+		if (net_ratelimit())
+			netdev_info(netdev, "tx queue full\n");
+
+		netif_stop_queue(netdev);
+	}
+
+	/* start transmit */
+	ftmac100_txdes_set_dma_own(txdes);
+	spin_unlock(&priv->tx_lock);
+
+	ftmac100_txdma_start_polling(priv);
+
+	return NETDEV_TX_OK;
+}
+
+/******************************************************************************
+ * internal functions (buffer)
+ *****************************************************************************/
+static void ftmac100_free_buffers(struct ftmac100 *priv)
+{
+	int i;
+
+	for (i = 0; i < RX_QUEUE_ENTRIES; i += 2) {
+		struct ftmac100_rxdes *rxdes = &priv->descs->rxdes[i];
+		dma_addr_t d = ftmac100_rxdes_get_dma_addr(rxdes);
+		void *page = ftmac100_rxdes_get_va(rxdes);
+
+		if (d)
+			dma_unmap_single(priv->dev, d, PAGE_SIZE,
+					 DMA_FROM_DEVICE);
+
+		if (page != NULL)
+			free_page((unsigned long)page);
+	}
+
+	for (i = 0; i < TX_QUEUE_ENTRIES; i++) {
+		struct ftmac100_txdes *txdes = &priv->descs->txdes[i];
+		struct sk_buff *skb = ftmac100_txdes_get_skb(txdes);
+
+		if (skb) {
+			dma_addr_t map;
+
+			map = ftmac100_txdes_get_dma_addr(txdes);
+			dma_unmap_single(priv->dev, map, skb_headlen(skb),
+					 DMA_TO_DEVICE);
+			dev_kfree_skb(skb);
+		}
+	}
+
+	dma_free_coherent(priv->dev, sizeof(struct ftmac100_descs),
+			  priv->descs, priv->descs_dma_addr);
+}
+
+static int ftmac100_alloc_buffers(struct ftmac100 *priv)
+{
+	int i;
+
+	priv->descs = dma_alloc_coherent(priv->dev,
+					 sizeof(struct ftmac100_descs),
+					 &priv->descs_dma_addr,
+					 GFP_KERNEL | GFP_DMA);
+	if (priv->descs == NULL)
+		return -ENOMEM;
+
+	memset(priv->descs, 0, sizeof(struct ftmac100_descs));
+
+	/* initialize RX ring */
+
+	ftmac100_rxdes_set_end_of_ring(&priv->descs->rxdes[RX_QUEUE_ENTRIES - 1]);
+
+	for (i = 0; i < RX_QUEUE_ENTRIES; i += 2) {
+		struct ftmac100_rxdes *rxdes = &priv->descs->rxdes[i];
+		void *page;
+		dma_addr_t d;
+
+		page = (void *)__get_free_page(GFP_KERNEL | GFP_DMA);
+		if (page == NULL)
+			goto err;
+
+		d = dma_map_single(priv->dev, page, PAGE_SIZE, DMA_FROM_DEVICE);
+		if (unlikely(dma_mapping_error(priv->dev, d))) {
+			free_page((unsigned long)page);
+			goto err;
+		}
+
+		/*
+		 * The hardware enforces a sub-2K maximum packet size, so we
+		 * put two buffers on every hardware page.
+		 */
+		ftmac100_rxdes_set_va(rxdes, page);
+		ftmac100_rxdes_set_va(rxdes + 1, page + PAGE_SIZE / 2);
+
+		ftmac100_rxdes_set_dma_addr(rxdes, d);
+		ftmac100_rxdes_set_dma_addr(rxdes + 1, d + PAGE_SIZE / 2);
+
+		ftmac100_rxdes_set_buffer_size(rxdes, RX_BUF_SIZE);
+		ftmac100_rxdes_set_buffer_size(rxdes + 1, RX_BUF_SIZE);
+
+		ftmac100_rxdes_set_dma_own(rxdes);
+		ftmac100_rxdes_set_dma_own(rxdes + 1);
+	}
+
+	/* initialize TX ring */
+
+	ftmac100_txdes_set_end_of_ring(&priv->descs->txdes[TX_QUEUE_ENTRIES - 1]);
+	return 0;
+
+err:
+	ftmac100_free_buffers(priv);
+	return -ENOMEM;
+}
+
+/******************************************************************************
+ * struct mii_if_info functions
+ *****************************************************************************/
+static int ftmac100_mdio_read(struct net_device *netdev, int phy_id, int reg)
+{
+	struct ftmac100 *priv = netdev_priv(netdev);
+	unsigned int phycr;
+	int i;
+
+	phycr = FTMAC100_PHYCR_PHYAD(phy_id) |
+		FTMAC100_PHYCR_REGAD(reg) |
+		FTMAC100_PHYCR_MIIRD;
+
+	iowrite32(phycr, priv->base + FTMAC100_OFFSET_PHYCR);
+	for (i = 0; i < 10; i++) {
+		phycr = ioread32(priv->base + FTMAC100_OFFSET_PHYCR);
+
+		if ((phycr & FTMAC100_PHYCR_MIIRD) == 0)
+			return phycr & FTMAC100_PHYCR_MIIRDATA;
+
+		usleep_range(100, 1000);
+	}
+
+	netdev_err(netdev, "mdio read timed out\n");
+	return 0;
+}
+
+static void ftmac100_mdio_write(struct net_device *netdev, int phy_id, int reg,
+				int data)
+{
+	struct ftmac100 *priv = netdev_priv(netdev);
+	unsigned int phycr;
+	int i;
+
+	phycr = FTMAC100_PHYCR_PHYAD(phy_id) |
+		FTMAC100_PHYCR_REGAD(reg) |
+		FTMAC100_PHYCR_MIIWR;
+
+	data = FTMAC100_PHYWDATA_MIIWDATA(data);
+
+	iowrite32(data, priv->base + FTMAC100_OFFSET_PHYWDATA);
+	iowrite32(phycr, priv->base + FTMAC100_OFFSET_PHYCR);
+
+	for (i = 0; i < 10; i++) {
+		phycr = ioread32(priv->base + FTMAC100_OFFSET_PHYCR);
+
+		if ((phycr & FTMAC100_PHYCR_MIIWR) == 0)
+			return;
+
+		usleep_range(100, 1000);
+	}
+
+	netdev_err(netdev, "mdio write timed out\n");
+}
+
+/******************************************************************************
+ * struct ethtool_ops functions
+ *****************************************************************************/
+static void ftmac100_get_drvinfo(struct net_device *netdev,
+				 struct ethtool_drvinfo *info)
+{
+	strcpy(info->driver, DRV_NAME);
+	strcpy(info->version, DRV_VERSION);
+	strcpy(info->bus_info, dev_name(&netdev->dev));
+}
+
+static int ftmac100_get_settings(struct net_device *netdev,
+				 struct ethtool_cmd *cmd)
+{
+	struct ftmac100 *priv = netdev_priv(netdev);
+	return mii_ethtool_gset(&priv->mii, cmd);
+}
+
+static int ftmac100_set_settings(struct net_device *netdev,
+				 struct ethtool_cmd *cmd)
+{
+	struct ftmac100 *priv = netdev_priv(netdev);
+	return mii_ethtool_sset(&priv->mii, cmd);
+}
+
+static int ftmac100_nway_reset(struct net_device *netdev)
+{
+	struct ftmac100 *priv = netdev_priv(netdev);
+	return mii_nway_restart(&priv->mii);
+}
+
+static u32 ftmac100_get_link(struct net_device *netdev)
+{
+	struct ftmac100 *priv = netdev_priv(netdev);
+	return mii_link_ok(&priv->mii);
+}
+
+static const struct ethtool_ops ftmac100_ethtool_ops = {
+	.set_settings		= ftmac100_set_settings,
+	.get_settings		= ftmac100_get_settings,
+	.get_drvinfo		= ftmac100_get_drvinfo,
+	.nway_reset		= ftmac100_nway_reset,
+	.get_link		= ftmac100_get_link,
+};
+
+/******************************************************************************
+ * interrupt handler
+ *****************************************************************************/
+static irqreturn_t ftmac100_interrupt(int irq, void *dev_id)
+{
+	struct net_device *netdev = dev_id;
+	struct ftmac100 *priv = netdev_priv(netdev);
+
+	if (likely(netif_running(netdev))) {
+		/* Disable interrupts for polling */
+		ftmac100_disable_all_int(priv);
+		napi_schedule(&priv->napi);
+	}
+
+	return IRQ_HANDLED;
+}
+
+/******************************************************************************
+ * struct napi_struct functions
+ *****************************************************************************/
+static int ftmac100_poll(struct napi_struct *napi, int budget)
+{
+	struct ftmac100 *priv = container_of(napi, struct ftmac100, napi);
+	struct net_device *netdev = priv->netdev;
+	unsigned int status;
+	bool completed = true;
+	int rx = 0;
+
+	status = ioread32(priv->base + FTMAC100_OFFSET_ISR);
+
+	if (status & (FTMAC100_INT_RPKT_FINISH | FTMAC100_INT_NORXBUF)) {
+		/*
+		 * FTMAC100_INT_RPKT_FINISH:
+		 *	RX DMA has received packets into RX buffer successfully
+		 *
+		 * FTMAC100_INT_NORXBUF:
+		 *	RX buffer unavailable
+		 */
+		bool retry;
+
+		do {
+			retry = ftmac100_rx_packet(priv, &rx);
+		} while (retry && rx < budget);
+
+		if (retry && rx == budget)
+			completed = false;
+	}
+
+	if (status & (FTMAC100_INT_XPKT_OK | FTMAC100_INT_XPKT_LOST)) {
+		/*
+		 * FTMAC100_INT_XPKT_OK:
+		 *	packet transmitted to ethernet successfully
+		 *
+		 * FTMAC100_INT_XPKT_LOST:
+		 *	packet transmitted to ethernet lost due to late
+		 *	collision or excessive collision
+		 */
+		ftmac100_tx_complete(priv);
+	}
+
+	if (status & (FTMAC100_INT_NORXBUF | FTMAC100_INT_RPKT_LOST |
+		      FTMAC100_INT_AHB_ERR | FTMAC100_INT_PHYSTS_CHG)) {
+		if (net_ratelimit())
+			netdev_info(netdev, "[ISR] = 0x%x: %s%s%s%s\n", status,
+				    status & FTMAC100_INT_NORXBUF ? "NORXBUF " : "",
+				    status & FTMAC100_INT_RPKT_LOST ? "RPKT_LOST " : "",
+				    status & FTMAC100_INT_AHB_ERR ? "AHB_ERR " : "",
+				    status & FTMAC100_INT_PHYSTS_CHG ? "PHYSTS_CHG" : "");
+
+		if (status & FTMAC100_INT_NORXBUF) {
+			/* RX buffer unavailable */
+			netdev->stats.rx_over_errors++;
+		}
+
+		if (status & FTMAC100_INT_RPKT_LOST) {
+			/* received packet lost due to RX FIFO full */
+			netdev->stats.rx_fifo_errors++;
+		}
+
+		if (status & FTMAC100_INT_PHYSTS_CHG) {
+			/* PHY link status change */
+			mii_check_link(&priv->mii);
+		}
+	}
+
+	if (completed) {
+		/* stop polling */
+		napi_complete(napi);
+		ftmac100_enable_all_int(priv);
+	}
+
+	return rx;
+}
+
+/******************************************************************************
+ * struct net_device_ops functions
+ *****************************************************************************/
+static int ftmac100_open(struct net_device *netdev)
+{
+	struct ftmac100 *priv = netdev_priv(netdev);
+	int err;
+
+	err = ftmac100_alloc_buffers(priv);
+	if (err) {
+		netdev_err(netdev, "failed to allocate buffers\n");
+		goto err_alloc;
+	}
+
+	err = request_irq(priv->irq, ftmac100_interrupt, 0, netdev->name,
+		netdev);
+	if (err) {
+		netdev_err(netdev, "failed to request irq %d\n", priv->irq);
+		goto err_irq;
+	}
+
+	priv->rx_pointer = 0;
+	priv->tx_clean_pointer = 0;
+	priv->tx_pointer = 0;
+	priv->tx_pending = 0;
+
+	err = ftmac100_start_hw(priv);
+	if (err)
+		goto err_hw;
+
+	napi_enable(&priv->napi);
+	netif_start_queue(netdev);
+
+	ftmac100_enable_all_int(priv);
+	return 0;
+
+err_hw:
+	free_irq(priv->irq, netdev);
+err_irq:
+	ftmac100_free_buffers(priv);
+err_alloc:
+	return err;
+}
+
+static int ftmac100_stop(struct net_device *netdev)
+{
+	struct ftmac100 *priv = netdev_priv(netdev);
+
+	ftmac100_disable_all_int(priv);
+	netif_stop_queue(netdev);
+	napi_disable(&priv->napi);
+	ftmac100_stop_hw(priv);
+	free_irq(priv->irq, netdev);
+	ftmac100_free_buffers(priv);
+
+	return 0;
+}
+
+static int ftmac100_hard_start_xmit(struct sk_buff *skb,
+				    struct net_device *netdev)
+{
+	struct ftmac100 *priv = netdev_priv(netdev);
+	dma_addr_t map;
+
+	if (unlikely(skb->len > MAX_PKT_SIZE)) {
+		if (net_ratelimit())
+			netdev_dbg(netdev, "tx packet too big\n");
+
+		netdev->stats.tx_dropped++;
+		dev_kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
+
+	map = dma_map_single(priv->dev, skb->data, skb_headlen(skb), DMA_TO_DEVICE);
+	if (unlikely(dma_mapping_error(priv->dev, map))) {
+		/* drop packet */
+		if (net_ratelimit())
+			netdev_err(netdev, "map socket buffer failed\n");
+
+		netdev->stats.tx_dropped++;
+		dev_kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
+
+	return ftmac100_xmit(priv, skb, map);
+}
+
+/* optional */
+static int ftmac100_do_ioctl(struct net_device *netdev, struct ifreq *ifr,
+			     int cmd)
+{
+	struct ftmac100 *priv = netdev_priv(netdev);
+	struct mii_ioctl_data *data = if_mii(ifr);
+
+	return generic_mii_ioctl(&priv->mii, data, cmd, NULL);
+}
+
+static const struct net_device_ops ftmac100_netdev_ops = {
+	.ndo_open		= ftmac100_open,
+	.ndo_stop		= ftmac100_stop,
+	.ndo_start_xmit		= ftmac100_hard_start_xmit,
+	.ndo_set_mac_address	= eth_mac_addr,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_do_ioctl		= ftmac100_do_ioctl,
+};
+
+/******************************************************************************
+ * struct platform_driver functions
+ *****************************************************************************/
+static int ftmac100_remove(struct platform_device *pdev)
+{
+	struct net_device *netdev;
+	struct ftmac100 *priv;
+
+	netdev = platform_get_drvdata(pdev);
+	if (netdev == NULL)
+		return 0;
+
+	platform_set_drvdata(pdev, NULL);
+
+	priv = netdev_priv(netdev);
+
+	netif_napi_del(&priv->napi);
+	unregister_netdev(netdev);
+
+	if (priv->base != NULL)
+		iounmap(priv->base);
+
+	if (priv->res != NULL)
+		release_resource(priv->res);
+
+	free_netdev(netdev);
+	return 0;
+}
+
+static int ftmac100_probe(struct platform_device *pdev)
+{
+	struct resource *res;
+	int irq;
+	struct net_device *netdev;
+	struct ftmac100 *priv;
+	int err;
+
+	if (!pdev)
+		return -ENODEV;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENXIO;
+
+	irq = platform_get_irq(pdev, 0);
+	if (irq < 0)
+		return irq;
+
+	/* setup net_device */
+
+	netdev = alloc_etherdev(sizeof(struct ftmac100));
+	if (netdev == NULL) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+
+	SET_NETDEV_DEV(netdev, &pdev->dev);
+	SET_ETHTOOL_OPS(netdev, &ftmac100_ethtool_ops);
+	netdev->netdev_ops = &ftmac100_netdev_ops;
+
+	platform_set_drvdata(pdev, netdev);
+
+	/* setup private data */
+
+	priv = netdev_priv(netdev);
+	priv->netdev = netdev;
+	priv->dev = &pdev->dev;
+
+	spin_lock_init(&priv->tx_lock);
+
+	/* initialize NAPI */
+	netif_napi_add(netdev, &priv->napi, ftmac100_poll, 64);
+
+	/* reserve and map io memory; both calls must cover resource_size(res) */
+	priv->res = request_mem_region(res->start, resource_size(res),
+				       dev_name(&pdev->dev));
+	if (priv->res == NULL) {
+		dev_err(&pdev->dev, "Could not reserve memory region\n");
+		err = -ENOMEM;
+		goto err_out;
+	}
+
+	priv->base = ioremap(res->start, resource_size(res));
+	if (priv->base == NULL) {
+		dev_err(&pdev->dev, "Failed to ioremap ethernet registers\n");
+		err = -EIO;
+		goto err_out;
+	}
+
+	priv->irq = irq;
+
+	/* initialize struct mii_if_info */
+
+	priv->mii.phy_id	= 0;
+	priv->mii.phy_id_mask	= 0x1f;
+	priv->mii.reg_num_mask	= 0x1f;
+	priv->mii.dev		= netdev;
+	priv->mii.mdio_read	= ftmac100_mdio_read;
+	priv->mii.mdio_write	= ftmac100_mdio_write;
+
+	/* register network device */
+
+	err = register_netdev(netdev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to register netdev\n");
+		goto err_out;
+	}
+
+	netdev_info(netdev, "irq %d, mapped at %p\n", priv->irq, priv->base);
+
+	if (!is_valid_ether_addr(netdev->dev_addr)) {
+		random_ether_addr(netdev->dev_addr);
+		netdev_info(netdev, "generated random MAC address %pM\n",
+			    netdev->dev_addr);
+	}
+
+	return 0;
+
+err_out:
+	ftmac100_remove(pdev);
+	return err;
+}
+
+static struct platform_driver ftmac100_driver = {
+	.probe		= ftmac100_probe,
+	.remove		= ftmac100_remove,
+	.driver		= {
+		.name	= DRV_NAME,
+		.owner	= THIS_MODULE,
+	},
+};
+
+/******************************************************************************
+ * initialization / finalization
+ *****************************************************************************/
+static int __init ftmac100_init(void)
+{
+	pr_info("Loading version " DRV_VERSION " ...\n");
+	return platform_driver_register(&ftmac100_driver);
+}
+
+static void __exit ftmac100_exit(void)
+{
+	platform_driver_unregister(&ftmac100_driver);
+}
+
+module_init(ftmac100_init);
+module_exit(ftmac100_exit);
+
+MODULE_AUTHOR("Po-Yu Chuang <ratbert@faraday-tech.com>");
+MODULE_DESCRIPTION("FTMAC100 driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/ftmac100.h b/drivers/net/ftmac100.h
new file mode 100644
index 0000000..46a0c47
--- /dev/null
+++ b/drivers/net/ftmac100.h
@@ -0,0 +1,180 @@
+/*
+ * Faraday FTMAC100 10/100 Ethernet
+ *
+ * (C) Copyright 2009-2011 Faraday Technology
+ * Po-Yu Chuang <ratbert@faraday-tech.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __FTMAC100_H
+#define __FTMAC100_H
+
+#define	FTMAC100_OFFSET_ISR		0x00
+#define	FTMAC100_OFFSET_IMR		0x04
+#define	FTMAC100_OFFSET_MAC_MADR	0x08
+#define	FTMAC100_OFFSET_MAC_LADR	0x0c
+#define	FTMAC100_OFFSET_MAHT0		0x10
+#define	FTMAC100_OFFSET_MAHT1		0x14
+#define	FTMAC100_OFFSET_TXPD		0x18
+#define	FTMAC100_OFFSET_RXPD		0x1c
+#define	FTMAC100_OFFSET_TXR_BADR	0x20
+#define	FTMAC100_OFFSET_RXR_BADR	0x24
+#define	FTMAC100_OFFSET_ITC		0x28
+#define	FTMAC100_OFFSET_APTC		0x2c
+#define	FTMAC100_OFFSET_DBLAC		0x30
+#define	FTMAC100_OFFSET_MACCR		0x88
+#define	FTMAC100_OFFSET_MACSR		0x8c
+#define	FTMAC100_OFFSET_PHYCR		0x90
+#define	FTMAC100_OFFSET_PHYWDATA	0x94
+#define	FTMAC100_OFFSET_FCR		0x98
+#define	FTMAC100_OFFSET_BPR		0x9c
+#define	FTMAC100_OFFSET_TS		0xc4
+#define	FTMAC100_OFFSET_DMAFIFOS	0xc8
+#define	FTMAC100_OFFSET_TM		0xcc
+#define	FTMAC100_OFFSET_TX_MCOL_SCOL	0xd4
+#define	FTMAC100_OFFSET_RPF_AEP		0xd8
+#define	FTMAC100_OFFSET_XM_PG		0xdc
+#define	FTMAC100_OFFSET_RUNT_TLCC	0xe0
+#define	FTMAC100_OFFSET_CRCER_FTL	0xe4
+#define	FTMAC100_OFFSET_RLC_RCC		0xe8
+#define	FTMAC100_OFFSET_BROC		0xec
+#define	FTMAC100_OFFSET_MULCA		0xf0
+#define	FTMAC100_OFFSET_RP		0xf4
+#define	FTMAC100_OFFSET_XP		0xf8
+
+/*
+ * Interrupt status register & interrupt mask register
+ */
+#define	FTMAC100_INT_RPKT_FINISH	(1 << 0)
+#define	FTMAC100_INT_NORXBUF		(1 << 1)
+#define	FTMAC100_INT_XPKT_FINISH	(1 << 2)
+#define	FTMAC100_INT_NOTXBUF		(1 << 3)
+#define	FTMAC100_INT_XPKT_OK		(1 << 4)
+#define	FTMAC100_INT_XPKT_LOST		(1 << 5)
+#define	FTMAC100_INT_RPKT_SAV		(1 << 6)
+#define	FTMAC100_INT_RPKT_LOST		(1 << 7)
+#define	FTMAC100_INT_AHB_ERR		(1 << 8)
+#define	FTMAC100_INT_PHYSTS_CHG		(1 << 9)
+
+/*
+ * Interrupt timer control register
+ */
+#define FTMAC100_ITC_RXINT_CNT(x)	(((x) & 0xf) << 0)
+#define FTMAC100_ITC_RXINT_THR(x)	(((x) & 0x7) << 4)
+#define FTMAC100_ITC_RXINT_TIME_SEL	(1 << 7)
+#define FTMAC100_ITC_TXINT_CNT(x)	(((x) & 0xf) << 8)
+#define FTMAC100_ITC_TXINT_THR(x)	(((x) & 0x7) << 12)
+#define FTMAC100_ITC_TXINT_TIME_SEL	(1 << 15)
+
+/*
+ * Automatic polling timer control register
+ */
+#define	FTMAC100_APTC_RXPOLL_CNT(x)	(((x) & 0xf) << 0)
+#define	FTMAC100_APTC_RXPOLL_TIME_SEL	(1 << 4)
+#define	FTMAC100_APTC_TXPOLL_CNT(x)	(((x) & 0xf) << 8)
+#define	FTMAC100_APTC_TXPOLL_TIME_SEL	(1 << 12)
+
+/*
+ * DMA burst length and arbitration control register
+ */
+#define FTMAC100_DBLAC_INCR4_EN		(1 << 0)
+#define FTMAC100_DBLAC_INCR8_EN		(1 << 1)
+#define FTMAC100_DBLAC_INCR16_EN	(1 << 2)
+#define FTMAC100_DBLAC_RXFIFO_LTHR(x)	(((x) & 0x7) << 3)
+#define FTMAC100_DBLAC_RXFIFO_HTHR(x)	(((x) & 0x7) << 6)
+#define FTMAC100_DBLAC_RX_THR_EN	(1 << 9)
+
+/*
+ * MAC control register
+ */
+#define	FTMAC100_MACCR_XDMA_EN		(1 << 0)
+#define	FTMAC100_MACCR_RDMA_EN		(1 << 1)
+#define	FTMAC100_MACCR_SW_RST		(1 << 2)
+#define	FTMAC100_MACCR_LOOP_EN		(1 << 3)
+#define	FTMAC100_MACCR_CRC_DIS		(1 << 4)
+#define	FTMAC100_MACCR_XMT_EN		(1 << 5)
+#define	FTMAC100_MACCR_ENRX_IN_HALFTX	(1 << 6)
+#define	FTMAC100_MACCR_RCV_EN		(1 << 8)
+#define	FTMAC100_MACCR_HT_MULTI_EN	(1 << 9)
+#define	FTMAC100_MACCR_RX_RUNT		(1 << 10)
+#define	FTMAC100_MACCR_RX_FTL		(1 << 11)
+#define	FTMAC100_MACCR_RCV_ALL		(1 << 12)
+#define	FTMAC100_MACCR_CRC_APD		(1 << 14)
+#define	FTMAC100_MACCR_FULLDUP		(1 << 15)
+#define	FTMAC100_MACCR_RX_MULTIPKT	(1 << 16)
+#define	FTMAC100_MACCR_RX_BROADPKT	(1 << 17)
+
+/*
+ * PHY control register
+ */
+#define FTMAC100_PHYCR_MIIRDATA		0xffff
+#define FTMAC100_PHYCR_PHYAD(x)		(((x) & 0x1f) << 16)
+#define FTMAC100_PHYCR_REGAD(x)		(((x) & 0x1f) << 21)
+#define FTMAC100_PHYCR_MIIRD		(1 << 26)
+#define FTMAC100_PHYCR_MIIWR		(1 << 27)
+
+/*
+ * PHY write data register
+ */
+#define FTMAC100_PHYWDATA_MIIWDATA(x)	((x) & 0xffff)
+
+/*
+ * Transmit descriptor, aligned to 16 bytes
+ */
+struct ftmac100_txdes {
+	unsigned int	txdes0;
+	unsigned int	txdes1;
+	unsigned int	txdes2;	/* TXBUF_BADR */
+	unsigned int	txdes3;	/* not used by HW */
+} __attribute__ ((aligned(16)));
+
+#define	FTMAC100_TXDES0_TXPKT_LATECOL	(1 << 0)
+#define	FTMAC100_TXDES0_TXPKT_EXSCOL	(1 << 1)
+#define	FTMAC100_TXDES0_TXDMA_OWN	(1 << 31)
+
+#define	FTMAC100_TXDES1_TXBUF_SIZE(x)	((x) & 0x7ff)
+#define	FTMAC100_TXDES1_LTS		(1 << 27)
+#define	FTMAC100_TXDES1_FTS		(1 << 28)
+#define	FTMAC100_TXDES1_TX2FIC		(1 << 29)
+#define	FTMAC100_TXDES1_TXIC		(1 << 30)
+#define	FTMAC100_TXDES1_EDOTR		(1 << 31)
+
+/*
+ * Receive descriptor, aligned to 16 bytes
+ */
+struct ftmac100_rxdes {
+	unsigned int	rxdes0;
+	unsigned int	rxdes1;
+	unsigned int	rxdes2;	/* RXBUF_BADR */
+	unsigned int	rxdes3;	/* not used by HW */
+} __attribute__ ((aligned(16)));
+
+#define	FTMAC100_RXDES0_RFL		0x7ff
+#define	FTMAC100_RXDES0_MULTICAST	(1 << 16)
+#define	FTMAC100_RXDES0_BROADCAST	(1 << 17)
+#define	FTMAC100_RXDES0_RX_ERR		(1 << 18)
+#define	FTMAC100_RXDES0_CRC_ERR		(1 << 19)
+#define	FTMAC100_RXDES0_FTL		(1 << 20)
+#define	FTMAC100_RXDES0_RUNT		(1 << 21)
+#define	FTMAC100_RXDES0_RX_ODD_NB	(1 << 22)
+#define	FTMAC100_RXDES0_LRS		(1 << 28)
+#define	FTMAC100_RXDES0_FRS		(1 << 29)
+#define	FTMAC100_RXDES0_RXDMA_OWN	(1 << 31)
+
+#define	FTMAC100_RXDES1_RXBUF_SIZE(x)	((x) & 0x7ff)
+#define	FTMAC100_RXDES1_EDORR		(1 << 31)
+
+#endif /* __FTMAC100_H */
-- 
1.6.3.3

^ permalink raw reply related

* [PATCH 3/4] net: Add safe reverse SKB queue walkers.
From: David Miller @ 2011-01-21  7:56 UTC (permalink / raw)
  To: netdev; +Cc: paulus


Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |    9 +++++++++
 1 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index bf221d6..6e946da 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1801,6 +1801,15 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len)
 		     prefetch(skb->prev), (skb != (struct sk_buff *)(queue));	\
 		     skb = skb->prev)
 
+#define skb_queue_reverse_walk_safe(queue, skb, tmp)				\
+		for (skb = (queue)->prev, tmp = skb->prev;			\
+		     skb != (struct sk_buff *)(queue);				\
+		     skb = tmp, tmp = skb->prev)
+
+#define skb_queue_reverse_walk_from_safe(queue, skb, tmp)			\
+		for (tmp = skb->prev;						\
+		     skb != (struct sk_buff *)(queue);				\
+		     skb = tmp, tmp = skb->prev)
 
 static inline bool skb_has_frag_list(const struct sk_buff *skb)
 {
-- 
1.7.3.4


^ permalink raw reply related

* [PATCH 2/4] ppp: Reconstruct fragmented packets using frag lists instead of copying.
From: David Miller @ 2011-01-21  7:56 UTC (permalink / raw)
  To: netdev; +Cc: paulus


[paulus@samba.org: fixed a couple of bugs]

Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 drivers/net/ppp_generic.c |   39 +++++++++++++++++++++++----------------
 1 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c
index 3d7a38e..1d4fb34 100644
--- a/drivers/net/ppp_generic.c
+++ b/drivers/net/ppp_generic.c
@@ -2055,16 +2055,6 @@ ppp_mp_reconstruct(struct ppp *ppp)
 				netdev_printk(KERN_DEBUG, ppp->dev,
 					      "PPP: reconstructed packet"
 					      " is too long (%d)\n", len);
-			} else if (p == head) {
-				/* fragment is complete packet - reuse skb */
-				tail = p;
-				skb = skb_get(p);
-				break;
-			} else if ((skb = dev_alloc_skb(len)) == NULL) {
-				++ppp->dev->stats.rx_missed_errors;
-				netdev_printk(KERN_DEBUG, ppp->dev,
-					      "PPP: no memory for "
-					      "reconstructed packet");
 			} else {
 				tail = p;
 				break;
@@ -2097,16 +2087,33 @@ ppp_mp_reconstruct(struct ppp *ppp)
 			ppp_receive_error(ppp);
 		}
 
-		if (head != tail)
-			/* copy to a single skb */
-			for (p = head; p != tail->next; p = p->next)
-				skb_copy_bits(p, 0, skb_put(skb, p->len), p->len);
+		skb = head;
+		if (head != tail) {
+			struct sk_buff **fragpp = &skb_shinfo(skb)->frag_list;
+			p = skb_queue_next(list, head);
+			__skb_unlink(skb, list);
+			skb_queue_walk_from_safe(list, p, tmp) {
+				__skb_unlink(p, list);
+				*fragpp = p;
+				p->next = NULL;
+				fragpp = &p->next;
+
+				skb->len += p->len;
+				skb->data_len += p->len;
+				skb->truesize += p->len;
+
+				if (p == tail)
+					break;
+			}
+		} else {
+			__skb_unlink(skb, list);
+		}
+
 		ppp->nextseq = PPP_MP_CB(tail)->sequence + 1;
 		head = tail->next;
 	}
 
-	/* Discard all the skbuffs that we have copied the data out of
-	   or that we can't use. */
+	/* Discard all the skbuffs that we can't use. */
 	while ((p = list->next) != head) {
 		__skb_unlink(p, list);
 		kfree_skb(p);
-- 
1.7.3.4


^ permalink raw reply related

* [PATCH 1/4] ppp: Clean up kernel log messages.
From: David Miller @ 2011-01-21  7:56 UTC (permalink / raw)
  To: netdev; +Cc: paulus


Use netdev_*() and pr_*().

To preserve existing semantics in cases where KERN_DEBUG is indeed
appropriate, use netdev_printk(KERN_DEBUG, ...)

Convert PPPIOCDETACH to pr_warn() because an unexpected file count is
a serious bug and should be logged with KERN_WARNING.

Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 drivers/net/ppp_generic.c |   86 +++++++++++++++++++++++++-------------------
 1 files changed, 49 insertions(+), 37 deletions(-)

diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c
index c7a6c44..3d7a38e 100644
--- a/drivers/net/ppp_generic.c
+++ b/drivers/net/ppp_generic.c
@@ -592,8 +592,8 @@ static long ppp_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 			ppp_release(NULL, file);
 			err = 0;
 		} else
-			printk(KERN_DEBUG "PPPIOCDETACH file->f_count=%ld\n",
-			       atomic_long_read(&file->f_count));
+			pr_warn("PPPIOCDETACH file->f_count=%ld\n",
+				atomic_long_read(&file->f_count));
 		mutex_unlock(&ppp_mutex);
 		return err;
 	}
@@ -630,7 +630,7 @@ static long ppp_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 
 	if (pf->kind != INTERFACE) {
 		/* can't happen */
-		printk(KERN_ERR "PPP: not interface or channel??\n");
+		pr_err("PPP: not interface or channel??\n");
 		return -EINVAL;
 	}
 
@@ -704,7 +704,8 @@ static long ppp_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		}
 		vj = slhc_init(val2+1, val+1);
 		if (!vj) {
-			printk(KERN_ERR "PPP: no memory (VJ compressor)\n");
+			netdev_err(ppp->dev,
+				   "PPP: no memory (VJ compressor)\n");
 			err = -ENOMEM;
 			break;
 		}
@@ -898,17 +899,17 @@ static int __init ppp_init(void)
 {
 	int err;
 
-	printk(KERN_INFO "PPP generic driver version " PPP_VERSION "\n");
+	pr_info("PPP generic driver version " PPP_VERSION "\n");
 
 	err = register_pernet_device(&ppp_net_ops);
 	if (err) {
-		printk(KERN_ERR "failed to register PPP pernet device (%d)\n", err);
+		pr_err("failed to register PPP pernet device (%d)\n", err);
 		goto out;
 	}
 
 	err = register_chrdev(PPP_MAJOR, "ppp", &ppp_device_fops);
 	if (err) {
-		printk(KERN_ERR "failed to register PPP device (%d)\n", err);
+		pr_err("failed to register PPP device (%d)\n", err);
 		goto out_net;
 	}
 
@@ -1078,7 +1079,7 @@ pad_compress_skb(struct ppp *ppp, struct sk_buff *skb)
 	new_skb = alloc_skb(new_skb_size, GFP_ATOMIC);
 	if (!new_skb) {
 		if (net_ratelimit())
-			printk(KERN_ERR "PPP: no memory (comp pkt)\n");
+			netdev_err(ppp->dev, "PPP: no memory (comp pkt)\n");
 		return NULL;
 	}
 	if (ppp->dev->hard_header_len > PPP_HDRLEN)
@@ -1108,7 +1109,7 @@ pad_compress_skb(struct ppp *ppp, struct sk_buff *skb)
 		 * the same number.
 		 */
 		if (net_ratelimit())
-			printk(KERN_ERR "ppp: compressor dropped pkt\n");
+			netdev_err(ppp->dev, "ppp: compressor dropped pkt\n");
 		kfree_skb(skb);
 		kfree_skb(new_skb);
 		new_skb = NULL;
@@ -1138,7 +1139,9 @@ ppp_send_frame(struct ppp *ppp, struct sk_buff *skb)
 		if (ppp->pass_filter &&
 		    sk_run_filter(skb, ppp->pass_filter) == 0) {
 			if (ppp->debug & 1)
-				printk(KERN_DEBUG "PPP: outbound frame not passed\n");
+				netdev_printk(KERN_DEBUG, ppp->dev,
+					      "PPP: outbound frame "
+					      "not passed\n");
 			kfree_skb(skb);
 			return;
 		}
@@ -1164,7 +1167,7 @@ ppp_send_frame(struct ppp *ppp, struct sk_buff *skb)
 		new_skb = alloc_skb(skb->len + ppp->dev->hard_header_len - 2,
 				    GFP_ATOMIC);
 		if (!new_skb) {
-			printk(KERN_ERR "PPP: no memory (VJ comp pkt)\n");
+			netdev_err(ppp->dev, "PPP: no memory (VJ comp pkt)\n");
 			goto drop;
 		}
 		skb_reserve(new_skb, ppp->dev->hard_header_len - 2);
@@ -1202,7 +1205,9 @@ ppp_send_frame(struct ppp *ppp, struct sk_buff *skb)
 	    proto != PPP_LCP && proto != PPP_CCP) {
 		if (!(ppp->flags & SC_CCP_UP) && (ppp->flags & SC_MUST_COMP)) {
 			if (net_ratelimit())
-				printk(KERN_ERR "ppp: compression required but down - pkt dropped.\n");
+				netdev_err(ppp->dev,
+					   "ppp: compression required but "
+					   "down - pkt dropped.\n");
 			goto drop;
 		}
 		skb = pad_compress_skb(ppp, skb);
@@ -1505,7 +1510,7 @@ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb)
  noskb:
 	spin_unlock_bh(&pch->downl);
 	if (ppp->debug & 1)
-		printk(KERN_ERR "PPP: no memory (fragment)\n");
+		netdev_err(ppp->dev, "PPP: no memory (fragment)\n");
 	++ppp->dev->stats.tx_errors;
 	++ppp->nxseq;
 	return 1;	/* abandon the frame */
@@ -1686,7 +1691,8 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb)
 			/* copy to a new sk_buff with more tailroom */
 			ns = dev_alloc_skb(skb->len + 128);
 			if (!ns) {
-				printk(KERN_ERR"PPP: no memory (VJ decomp)\n");
+				netdev_err(ppp->dev, "PPP: no memory "
+					   "(VJ decomp)\n");
 				goto err;
 			}
 			skb_reserve(ns, 2);
@@ -1699,7 +1705,8 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb)
 
 		len = slhc_uncompress(ppp->vj, skb->data + 2, skb->len - 2);
 		if (len <= 0) {
-			printk(KERN_DEBUG "PPP: VJ decompression error\n");
+			netdev_printk(KERN_DEBUG, ppp->dev,
+				      "PPP: VJ decompression error\n");
 			goto err;
 		}
 		len += 2;
@@ -1721,7 +1728,7 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb)
 			goto err;
 
 		if (slhc_remember(ppp->vj, skb->data + 2, skb->len - 2) <= 0) {
-			printk(KERN_ERR "PPP: VJ uncompressed error\n");
+			netdev_err(ppp->dev, "PPP: VJ uncompressed error\n");
 			goto err;
 		}
 		proto = PPP_IP;
@@ -1762,8 +1769,9 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb)
 			if (ppp->pass_filter &&
 			    sk_run_filter(skb, ppp->pass_filter) == 0) {
 				if (ppp->debug & 1)
-					printk(KERN_DEBUG "PPP: inbound frame "
-					       "not passed\n");
+					netdev_printk(KERN_DEBUG, ppp->dev,
+						      "PPP: inbound frame "
+						      "not passed\n");
 				kfree_skb(skb);
 				return;
 			}
@@ -1821,7 +1829,8 @@ ppp_decompress_frame(struct ppp *ppp, struct sk_buff *skb)
 
 		ns = dev_alloc_skb(obuff_size);
 		if (!ns) {
-			printk(KERN_ERR "ppp_decompress_frame: no memory\n");
+			netdev_err(ppp->dev, "ppp_decompress_frame: "
+				   "no memory\n");
 			goto err;
 		}
 		/* the decompressor still expects the A/C bytes in the hdr */
@@ -2002,8 +2011,9 @@ ppp_mp_reconstruct(struct ppp *ppp)
 		next = p->next;
 		if (seq_before(PPP_MP_CB(p)->sequence, seq)) {
 			/* this can't happen, anyway ignore the skb */
-			printk(KERN_ERR "ppp_mp_reconstruct bad seq %u < %u\n",
-			       PPP_MP_CB(p)->sequence, seq);
+			netdev_err(ppp->dev, "ppp_mp_reconstruct bad "
+				   "seq %u < %u\n",
+				   PPP_MP_CB(p)->sequence, seq);
 			head = next;
 			continue;
 		}
@@ -2042,8 +2052,9 @@ ppp_mp_reconstruct(struct ppp *ppp)
 		    (PPP_MP_CB(head)->BEbits & B)) {
 			if (len > ppp->mrru + 2) {
 				++ppp->dev->stats.rx_length_errors;
-				printk(KERN_DEBUG "PPP: reconstructed packet"
-				       " is too long (%d)\n", len);
+				netdev_printk(KERN_DEBUG, ppp->dev,
+					      "PPP: reconstructed packet"
+					      " is too long (%d)\n", len);
 			} else if (p == head) {
 				/* fragment is complete packet - reuse skb */
 				tail = p;
@@ -2051,8 +2062,9 @@ ppp_mp_reconstruct(struct ppp *ppp)
 				break;
 			} else if ((skb = dev_alloc_skb(len)) == NULL) {
 				++ppp->dev->stats.rx_missed_errors;
-				printk(KERN_DEBUG "PPP: no memory for "
-				       "reconstructed packet");
+				netdev_printk(KERN_DEBUG, ppp->dev,
+					      "PPP: no memory for "
+					      "reconstructed packet");
 			} else {
 				tail = p;
 				break;
@@ -2077,9 +2089,10 @@ ppp_mp_reconstruct(struct ppp *ppp)
 		   signal a receive error. */
 		if (PPP_MP_CB(head)->sequence != ppp->nextseq) {
 			if (ppp->debug & 1)
-				printk(KERN_DEBUG "  missed pkts %u..%u\n",
-				       ppp->nextseq,
-				       PPP_MP_CB(head)->sequence-1);
+				netdev_printk(KERN_DEBUG, ppp->dev,
+					      "  missed pkts %u..%u\n",
+					      ppp->nextseq,
+					      PPP_MP_CB(head)->sequence-1);
 			++ppp->dev->stats.rx_dropped;
 			ppp_receive_error(ppp);
 		}
@@ -2617,8 +2630,8 @@ ppp_create_interface(struct net *net, int unit, int *retp)
 	ret = register_netdev(dev);
 	if (ret != 0) {
 		unit_put(&pn->units_idr, unit);
-		printk(KERN_ERR "PPP: couldn't register device %s (%d)\n",
-		       dev->name, ret);
+		netdev_err(ppp->dev, "PPP: couldn't register device %s (%d)\n",
+			   dev->name, ret);
 		goto out2;
 	}
 
@@ -2690,9 +2703,9 @@ static void ppp_destroy_interface(struct ppp *ppp)
 
 	if (!ppp->file.dead || ppp->n_channels) {
 		/* "can't happen" */
-		printk(KERN_ERR "ppp: destroying ppp struct %p but dead=%d "
-		       "n_channels=%d !\n", ppp, ppp->file.dead,
-		       ppp->n_channels);
+		netdev_err(ppp->dev, "ppp: destroying ppp struct %p "
+			   "but dead=%d n_channels=%d !\n",
+			   ppp, ppp->file.dead, ppp->n_channels);
 		return;
 	}
 
@@ -2834,8 +2847,7 @@ static void ppp_destroy_channel(struct channel *pch)
 
 	if (!pch->file.dead) {
 		/* "can't happen" */
-		printk(KERN_ERR "ppp: destroying undead channel %p !\n",
-		       pch);
+		pr_err("ppp: destroying undead channel %p !\n", pch);
 		return;
 	}
 	skb_queue_purge(&pch->file.xq);
@@ -2847,7 +2859,7 @@ static void __exit ppp_cleanup(void)
 {
 	/* should never happen */
 	if (atomic_read(&ppp_unit_count) || atomic_read(&channel_count))
-		printk(KERN_ERR "PPP: removing module but units remain!\n");
+		pr_err("PPP: removing module but units remain!\n");
 	unregister_chrdev(PPP_MAJOR, "ppp");
 	device_destroy(ppp_class, MKDEV(PPP_MAJOR, 0));
 	class_destroy(ppp_class);
@@ -2865,7 +2877,7 @@ static int __unit_alloc(struct idr *p, void *ptr, int n)
 
 again:
 	if (!idr_pre_get(p, GFP_KERNEL)) {
-		printk(KERN_ERR "PPP: No free memory for idr\n");
+		pr_err("PPP: No free memory for idr\n");
 		return -ENOMEM;
 	}
 
-- 
1.7.3.4


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox