* [PATCH net-next 09/14] rxrpc: Fix unexposed client conn release
From: David Howells @ 2016-09-17 23:18 UTC (permalink / raw)
To: netdev; +Cc: dhowells, linux-afs, linux-kernel
In-Reply-To: <147415426459.17897.13899206413543061473.stgit@warthog.procyon.org.uk>
If the last call on a client connection is release after the connection has
had a bunch of calls allocated but before any DATA packets are sent (so
that it's not yet marked RXRPC_CONN_EXPOSED), an assertion will happen in
rxrpc_disconnect_client_call().
af_rxrpc: Assertion failed - 1(0x1) >= 2(0x2) is false
------------[ cut here ]------------
kernel BUG at ../net/rxrpc/conn_client.c:753!
This is because it's expecting the conn to have been exposed and to have 2
or more refs - but this isn't necessarily the case.
Simply remove the assertion. This allows the conn to be moved into the
inactive state and deleted if it isn't resurrected before the final put is
called.
Signed-off-by: David Howells <dhowells@redhat.com>
---
net/rxrpc/conn_client.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 5a675c43cace..226bc910e556 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -721,7 +721,6 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
}
ASSERTCMP(rcu_access_pointer(chan->call), ==, call);
- ASSERTCMP(atomic_read(&conn->usage), >=, 2);
/* If a client call was exposed to the world, we save the result for
* retransmission.
^ permalink raw reply related
* [PATCH net-next 08/14] rxrpc: Call rxrpc_release_call() on error in rxrpc_new_client_call()
From: David Howells @ 2016-09-17 23:18 UTC (permalink / raw)
To: netdev; +Cc: dhowells, linux-afs, linux-kernel
In-Reply-To: <147415426459.17897.13899206413543061473.stgit@warthog.procyon.org.uk>
Call rxrpc_release_call() on getting an error in rxrpc_new_client_call()
rather than trying to do the cleanup ourselves. This isn't a problem,
provided we set RXRPC_CALL_HAS_USERID only if we actually add the call to
the calls tree as cleanup code fragments that would otherwise cause
problems are conditional.
Without this, we miss some of the cleanup.
Signed-off-by: David Howells <dhowells@redhat.com>
---
net/rxrpc/call_object.c | 36 ++++++++++++------------------------
1 file changed, 12 insertions(+), 24 deletions(-)
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index b0ffbd9664e6..23f5a5f58282 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -226,9 +226,6 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
(const void *)user_call_ID);
/* Publish the call, even though it is incompletely set up as yet */
- call->user_call_ID = user_call_ID;
- __set_bit(RXRPC_CALL_HAS_USERID, &call->flags);
-
write_lock(&rx->call_lock);
pp = &rx->calls.rb_node;
@@ -242,10 +239,12 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
else if (user_call_ID > xcall->user_call_ID)
pp = &(*pp)->rb_right;
else
- goto found_user_ID_now_present;
+ goto error_dup_user_ID;
}
rcu_assign_pointer(call->socket, rx);
+ call->user_call_ID = user_call_ID;
+ __set_bit(RXRPC_CALL_HAS_USERID, &call->flags);
rxrpc_get_call(call, rxrpc_call_got_userid);
rb_link_node(&call->sock_node, parent, pp);
rb_insert_color(&call->sock_node, &rx->calls);
@@ -276,33 +275,22 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
_leave(" = %p [new]", call);
return call;
-error:
- write_lock(&rx->call_lock);
- rb_erase(&call->sock_node, &rx->calls);
- write_unlock(&rx->call_lock);
- rxrpc_put_call(call, rxrpc_call_put_userid);
-
- write_lock(&rxrpc_call_lock);
- list_del_init(&call->link);
- write_unlock(&rxrpc_call_lock);
-
-error_out:
- __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR,
- RX_CALL_DEAD, ret);
- set_bit(RXRPC_CALL_RELEASED, &call->flags);
- rxrpc_put_call(call, rxrpc_call_put);
- _leave(" = %d", ret);
- return ERR_PTR(ret);
-
/* We unexpectedly found the user ID in the list after taking
* the call_lock. This shouldn't happen unless the user races
* with itself and tries to add the same user ID twice at the
* same time in different threads.
*/
-found_user_ID_now_present:
+error_dup_user_ID:
write_unlock(&rx->call_lock);
ret = -EEXIST;
- goto error_out;
+
+error:
+ __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR,
+ RX_CALL_DEAD, ret);
+ rxrpc_release_call(rx, call);
+ rxrpc_put_call(call, rxrpc_call_put);
+ _leave(" = %d", ret);
+ return ERR_PTR(ret);
}
/*
^ permalink raw reply related
* [PATCH net-next 07/14] rxrpc: Fix the putting of client connections
From: David Howells @ 2016-09-17 23:18 UTC (permalink / raw)
To: netdev; +Cc: dhowells, linux-afs, linux-kernel
In-Reply-To: <147415426459.17897.13899206413543061473.stgit@warthog.procyon.org.uk>
In rxrpc_put_one_client_conn(), if a connection has RXRPC_CONN_COUNTED set
on it, then it's accounted for in rxrpc_nr_client_conns and may be on
various lists - and this is cleaned up correctly.
However, if the connection doesn't have RXRPC_CONN_COUNTED set on it, then
the put routine returns rather than just skipping the extra bit of cleanup.
Fix this by making the extra bit of clean up conditional instead and always
killing off the connection.
This manifests itself as connections with a zero usage count hanging around
in /proc/net/rxrpc_conns because the connection allocated, but discarded,
due to a race with another process that set up a parallel connection, which
was then shared instead.
Signed-off-by: David Howells <dhowells@redhat.com>
---
net/rxrpc/conn_client.c | 28 +++++++++++++---------------
1 file changed, 13 insertions(+), 15 deletions(-)
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 9344a8416ceb..5a675c43cace 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -818,7 +818,7 @@ idle_connection:
static struct rxrpc_connection *
rxrpc_put_one_client_conn(struct rxrpc_connection *conn)
{
- struct rxrpc_connection *next;
+ struct rxrpc_connection *next = NULL;
struct rxrpc_local *local = conn->params.local;
unsigned int nr_conns;
@@ -834,24 +834,22 @@ rxrpc_put_one_client_conn(struct rxrpc_connection *conn)
ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_INACTIVE);
- if (!test_bit(RXRPC_CONN_COUNTED, &conn->flags))
- return NULL;
-
- spin_lock(&rxrpc_client_conn_cache_lock);
- nr_conns = --rxrpc_nr_client_conns;
+ if (test_bit(RXRPC_CONN_COUNTED, &conn->flags)) {
+ spin_lock(&rxrpc_client_conn_cache_lock);
+ nr_conns = --rxrpc_nr_client_conns;
+
+ if (nr_conns < rxrpc_max_client_connections &&
+ !list_empty(&rxrpc_waiting_client_conns)) {
+ next = list_entry(rxrpc_waiting_client_conns.next,
+ struct rxrpc_connection, cache_link);
+ rxrpc_get_connection(next);
+ rxrpc_activate_conn(next);
+ }
- next = NULL;
- if (nr_conns < rxrpc_max_client_connections &&
- !list_empty(&rxrpc_waiting_client_conns)) {
- next = list_entry(rxrpc_waiting_client_conns.next,
- struct rxrpc_connection, cache_link);
- rxrpc_get_connection(next);
- rxrpc_activate_conn(next);
+ spin_unlock(&rxrpc_client_conn_cache_lock);
}
- spin_unlock(&rxrpc_client_conn_cache_lock);
rxrpc_kill_connection(conn);
-
if (next)
rxrpc_activate_channels(next);
^ permalink raw reply related
* [PATCH net-next 05/14] rxrpc: Record calls that need to be accepted
From: David Howells @ 2016-09-17 23:18 UTC (permalink / raw)
To: netdev; +Cc: dhowells, linux-afs, linux-kernel
In-Reply-To: <147415426459.17897.13899206413543061473.stgit@warthog.procyon.org.uk>
Record calls that need to be accepted using sk_acceptq_added() otherwise
the backlog counter goes negative because sk_acceptq_removed() is called.
This causes the preallocator to malfunction.
Calls that are preaccepted by AFS within the kernel aren't affected by
this.
Signed-off-by: David Howells <dhowells@redhat.com>
---
net/rxrpc/call_accept.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 26c293ef98eb..323b8da50163 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -369,6 +369,8 @@ found_service:
if (rx->notify_new_call)
rx->notify_new_call(&rx->sk, call, call->user_call_ID);
+ else
+ sk_acceptq_added(&rx->sk);
spin_lock(&conn->state_lock);
switch (conn->state) {
^ permalink raw reply related
* [PATCH net-next 06/14] rxrpc: Purge the to_be_accepted queue on socket release
From: David Howells @ 2016-09-17 23:18 UTC (permalink / raw)
To: netdev; +Cc: dhowells, linux-afs, linux-kernel
In-Reply-To: <147415426459.17897.13899206413543061473.stgit@warthog.procyon.org.uk>
Purge the queue of to_be_accepted calls on socket release. Note that
purging sock_calls doesn't release the ref owned by to_be_accepted.
Probably the sock_calls list is redundant given a purges of the recvmsg_q,
the to_be_accepted queue and the calls tree.
Signed-off-by: David Howells <dhowells@redhat.com>
---
net/rxrpc/call_object.c | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 22f9b0d1a138..b0ffbd9664e6 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -476,6 +476,16 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx)
_enter("%p", rx);
+ while (!list_empty(&rx->to_be_accepted)) {
+ call = list_entry(rx->to_be_accepted.next,
+ struct rxrpc_call, accept_link);
+ list_del(&call->accept_link);
+ rxrpc_abort_call("SKR", call, 0, RX_CALL_DEAD, ECONNRESET);
+ rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT);
+ rxrpc_release_call(rx, call);
+ rxrpc_put_call(call, rxrpc_call_put);
+ }
+
while (!list_empty(&rx->sock_calls)) {
call = list_entry(rx->sock_calls.next,
struct rxrpc_call, sock_link);
^ permalink raw reply related
* Re: [PATCH net] MAINTAINERS: Gary Zambrano's email is bouncing
From: Florian Fainelli @ 2016-09-17 23:17 UTC (permalink / raw)
To: Joe Perches
Cc: netdev, David Miller, Michael Chan, bcm-kernel-feedback-list,
Rafał Miłecki, Hauke Mehrtens
In-Reply-To: <1474152709.1954.8.camel@perches.com>
2016-09-17 15:51 GMT-07:00 Joe Perches <joe@perches.com>:
> On Sat, 2016-09-17 at 15:27 -0700, Florian Fainelli wrote:
>> Gary has not been with Broadcom for some time now, replace his address
>> with the internal mailing-list used for other entries.
>>
>> > Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
>> ---
>> Michael,
>>
>> Since this is an old driver, not sure who could step up as a maintainer
>> for b44?
> []
>> diff --git a/MAINTAINERS b/MAINTAINERS
> []
>> @@ -2500,8 +2500,8 @@ S: Supported
>
>> F: kernel/bpf/
>>
>> BROADCOM B44 10/100 ETHERNET DRIVER
>> -M: Gary Zambrano <zambrano@broadcom.com>
>> L: netdev@vger.kernel.org
>> +M: bcm-kernel-feedback-list@broadcom.com
>> S: Supported
>> F: drivers/net/ethernet/broadcom/b44.*
>
> Without an actual maintainer, this should really be
> orphan and not supported.
I would like to hear from Michael before concluding that
>
> And the M: bcm-kernel-feedback-list@ should be L:
The list does not accept public subscribers, so this is the correct
entry to use.
>
> BCM4401 NICs are essentially from 2002.
>
> Does anyone really use these any longer with a
> current distribution or kernel version?
This NIC is also embedded inside BCM47xx/BCM53xx which is still
getting active support from Rafal and Hauke.
--
Florian
^ permalink raw reply
* [PATCH net-next 04/14] rxrpc: Fix handling of the last packet in rxrpc_recvmsg_data()
From: David Howells @ 2016-09-17 23:18 UTC (permalink / raw)
To: netdev; +Cc: dhowells, linux-afs, linux-kernel
In-Reply-To: <147415426459.17897.13899206413543061473.stgit@warthog.procyon.org.uk>
The code for determining the last packet in rxrpc_recvmsg_data() has been
using the RXRPC_CALL_RX_LAST flag to determine if the rx_top pointer points
to the last packet or not. This isn't a good idea, however, as the input
code may be running simultaneously on another CPU and that sets the flag
*before* updating the top pointer.
Fix this by the following means:
(1) Restrict the use of RXRPC_CALL_RX_LAST to the input routines only.
There's otherwise a synchronisation problem between detecting the flag
and checking tx_top. This could probably be dealt with by appropriate
application of memory barriers, but there's a simpler way.
(2) Set RXRPC_CALL_RX_LAST after setting rx_top.
(3) Make rxrpc_rotate_rx_window() consult the flags header field of the
DATA packet it's about to discard to see if that was the last packet.
Use this as the basis for ending the Rx phase. This shouldn't be a
problem because the recvmsg side of things is guaranteed to see the
packets in order.
(4) Make rxrpc_recvmsg_data() return 1 to indicate the end of the data if:
(a) the packet it has just processed is marked as RXRPC_LAST_PACKET
(b) the call's Rx phase has been ended.
Signed-off-by: David Howells <dhowells@redhat.com>
---
net/rxrpc/input.c | 4 +++-
net/rxrpc/recvmsg.c | 49 +++++++++++++++++++++++++++++++++----------------
2 files changed, 36 insertions(+), 17 deletions(-)
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 75af0bd316c7..f0d9115b9b7e 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -238,7 +238,7 @@ next_subpacket:
len = RXRPC_JUMBO_DATALEN;
if (flags & RXRPC_LAST_PACKET) {
- if (test_and_set_bit(RXRPC_CALL_RX_LAST, &call->flags) &&
+ if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) &&
seq != call->rx_top)
return rxrpc_proto_abort("LSN", call, seq);
} else {
@@ -282,6 +282,8 @@ next_subpacket:
call->rxtx_buffer[ix] = skb;
if (after(seq, call->rx_top))
smp_store_release(&call->rx_top, seq);
+ if (flags & RXRPC_LAST_PACKET)
+ set_bit(RXRPC_CALL_RX_LAST, &call->flags);
queued = true;
if (after_eq(seq, call->rx_expect_next)) {
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index 1edf2cf62cc5..8b8d7e14f800 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -134,6 +134,8 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call)
{
_enter("%d,%s", call->debug_id, rxrpc_call_states[call->state]);
+ ASSERTCMP(call->rx_hard_ack, ==, call->rx_top);
+
if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) {
rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, true, false);
rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK);
@@ -163,8 +165,10 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call)
*/
static void rxrpc_rotate_rx_window(struct rxrpc_call *call)
{
+ struct rxrpc_skb_priv *sp;
struct sk_buff *skb;
rxrpc_seq_t hard_ack, top;
+ u8 flags;
int ix;
_enter("%d", call->debug_id);
@@ -177,6 +181,8 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call)
ix = hard_ack & RXRPC_RXTX_BUFF_MASK;
skb = call->rxtx_buffer[ix];
rxrpc_see_skb(skb);
+ sp = rxrpc_skb(skb);
+ flags = sp->hdr.flags;
call->rxtx_buffer[ix] = NULL;
call->rxtx_annotations[ix] = 0;
/* Barrier against rxrpc_input_data(). */
@@ -184,8 +190,8 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call)
rxrpc_free_skb(skb);
- _debug("%u,%u,%lx", hard_ack, top, call->flags);
- if (hard_ack == top && test_bit(RXRPC_CALL_RX_LAST, &call->flags))
+ _debug("%u,%u,%02x", hard_ack, top, flags);
+ if (flags & RXRPC_LAST_PACKET)
rxrpc_end_rx_phase(call);
}
@@ -278,13 +284,19 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
size_t remain;
bool last;
unsigned int rx_pkt_offset, rx_pkt_len;
- int ix, copy, ret = 0;
+ int ix, copy, ret = -EAGAIN, ret2;
_enter("");
rx_pkt_offset = call->rx_pkt_offset;
rx_pkt_len = call->rx_pkt_len;
+ if (call->state >= RXRPC_CALL_SERVER_ACK_REQUEST) {
+ seq = call->rx_hard_ack;
+ ret = 1;
+ goto done;
+ }
+
/* Barriers against rxrpc_input_data(). */
hard_ack = call->rx_hard_ack;
top = smp_load_acquire(&call->rx_top);
@@ -301,11 +313,13 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
sock_recv_timestamp(msg, sock->sk, skb);
if (rx_pkt_offset == 0) {
- ret = rxrpc_locate_data(call, skb,
- &call->rxtx_annotations[ix],
- &rx_pkt_offset, &rx_pkt_len);
- if (ret < 0)
+ ret2 = rxrpc_locate_data(call, skb,
+ &call->rxtx_annotations[ix],
+ &rx_pkt_offset, &rx_pkt_len);
+ if (ret2 < 0) {
+ ret = ret2;
goto out;
+ }
}
_debug("recvmsg %x DATA #%u { %d, %d }",
sp->hdr.callNumber, seq, rx_pkt_offset, rx_pkt_len);
@@ -316,10 +330,12 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
if (copy > remain)
copy = remain;
if (copy > 0) {
- ret = skb_copy_datagram_iter(skb, rx_pkt_offset, iter,
- copy);
- if (ret < 0)
+ ret2 = skb_copy_datagram_iter(skb, rx_pkt_offset, iter,
+ copy);
+ if (ret2 < 0) {
+ ret = ret2;
goto out;
+ }
/* handle piecemeal consumption of data packets */
_debug("copied %d @%zu", copy, *_offset);
@@ -332,6 +348,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
if (rx_pkt_len > 0) {
_debug("buffer full");
ASSERTCMP(*_offset, ==, len);
+ ret = 0;
break;
}
@@ -342,19 +359,19 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
rx_pkt_offset = 0;
rx_pkt_len = 0;
- ASSERTIFCMP(last, seq, ==, top);
- }
-
- if (after(seq, top)) {
- ret = -EAGAIN;
- if (test_bit(RXRPC_CALL_RX_LAST, &call->flags))
+ if (last) {
+ ASSERTCMP(seq, ==, READ_ONCE(call->rx_top));
ret = 1;
+ goto out;
+ }
}
+
out:
if (!(flags & MSG_PEEK)) {
call->rx_pkt_offset = rx_pkt_offset;
call->rx_pkt_len = rx_pkt_len;
}
+done:
_leave(" = %d [%u/%u]", ret, seq, top);
return ret;
}
^ permalink raw reply related
* [PATCH net-next 03/14] rxrpc: Check the return value of rxrpc_locate_data()
From: David Howells @ 2016-09-17 23:18 UTC (permalink / raw)
To: netdev; +Cc: dhowells, linux-afs, linux-kernel
In-Reply-To: <147415426459.17897.13899206413543061473.stgit@warthog.procyon.org.uk>
Check the return value of rxrpc_locate_data() in rxrpc_recvmsg_data().
Signed-off-by: David Howells <dhowells@redhat.com>
---
net/rxrpc/recvmsg.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index 0d085f5cf1bf..1edf2cf62cc5 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -300,10 +300,13 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
if (msg)
sock_recv_timestamp(msg, sock->sk, skb);
- if (rx_pkt_offset == 0)
+ if (rx_pkt_offset == 0) {
ret = rxrpc_locate_data(call, skb,
&call->rxtx_annotations[ix],
&rx_pkt_offset, &rx_pkt_len);
+ if (ret < 0)
+ goto out;
+ }
_debug("recvmsg %x DATA #%u { %d, %d }",
sp->hdr.callNumber, seq, rx_pkt_offset, rx_pkt_len);
^ permalink raw reply related
* [PATCH net-next 02/14] rxrpc: Move the check of rx_pkt_offset from rxrpc_locate_data() to caller
From: David Howells @ 2016-09-17 23:17 UTC (permalink / raw)
To: netdev; +Cc: dhowells, linux-afs, linux-kernel
In-Reply-To: <147415426459.17897.13899206413543061473.stgit@warthog.procyon.org.uk>
Move the check of rx_pkt_offset from rxrpc_locate_data() to the caller,
rxrpc_recvmsg_data(), so that it's more clear what's going on there.
Signed-off-by: David Howells <dhowells@redhat.com>
---
net/rxrpc/recvmsg.c | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index a284205b8ecf..0d085f5cf1bf 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -240,9 +240,6 @@ static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb,
int ret;
u8 annotation = *_annotation;
- if (offset > 0)
- return 0;
-
/* Locate the subpacket */
offset = sp->offset;
len = skb->len - sp->offset;
@@ -303,8 +300,10 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
if (msg)
sock_recv_timestamp(msg, sock->sk, skb);
- ret = rxrpc_locate_data(call, skb, &call->rxtx_annotations[ix],
- &rx_pkt_offset, &rx_pkt_len);
+ if (rx_pkt_offset == 0)
+ ret = rxrpc_locate_data(call, skb,
+ &call->rxtx_annotations[ix],
+ &rx_pkt_offset, &rx_pkt_len);
_debug("recvmsg %x DATA #%u { %d, %d }",
sp->hdr.callNumber, seq, rx_pkt_offset, rx_pkt_len);
^ permalink raw reply related
* [PATCH net-next 01/14] rxrpc: Remove some whitespace.
From: David Howells @ 2016-09-17 23:17 UTC (permalink / raw)
To: netdev; +Cc: dhowells, linux-afs, linux-kernel
In-Reply-To: <147415426459.17897.13899206413543061473.stgit@warthog.procyon.org.uk>
Remove a tab that's on a line that should otherwise be blank.
Signed-off-by: David Howells <dhowells@redhat.com>
---
net/rxrpc/call_event.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index 61432049869b..9367c3be31eb 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -31,7 +31,7 @@ static void rxrpc_set_timer(struct rxrpc_call *call)
_enter("{%ld,%ld,%ld:%ld}",
call->ack_at - now, call->resend_at - now, call->expire_at - now,
call->timer.expires - now);
-
+
read_lock_bh(&call->state_lock);
if (call->state < RXRPC_CALL_COMPLETE) {
^ permalink raw reply related
* [PATCH net-next 00/14] rxrpc: Fixes & miscellany
From: David Howells @ 2016-09-17 23:17 UTC (permalink / raw)
To: netdev; +Cc: dhowells, linux-afs, linux-kernel
Here are some more AF_RXRPC fix patches with a couple of miscellaneous
changes also. Fixes include:
(1) Make RxRPC IPv6 support conditional on IPv6 being available.
(2) Move the condition check in rxrpc_locate_data() into the caller and
check the error return.
(3) Fix the detection of the last received packet in recvmsg.
(4) Account calls that need acceptance and clean up any unaccepted ones if
the socket gets closed.
(5) Fix the cleanup of client connections.
(6) Fix the soft-ACK parsing and the retransmission of packets based on
those ACKs.
(7) Suppress transmission of an ACK when there's no pending ACK to
transmit because another thread stole it.
And some miscellany:
(8) Whitespace removal.
(9) Switch-value consistency in rxrpc_send_call_packet().
(10) Fix the basic transmission packet size to allow for spur-of-the-moment
jumbo DATA packet production.
The patches can be found here also (non-terminally on the branch):
http://git.kernel.org/cgit/linux/kernel/git/dhowells/linux-fs.git/log/?h=rxrpc-rewrite
Tagged thusly:
git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git
rxrpc-rewrite-20160917-1
David
---
David Howells (14):
rxrpc: Remove some whitespace.
rxrpc: Move the check of rx_pkt_offset from rxrpc_locate_data() to caller
rxrpc: Check the return value of rxrpc_locate_data()
rxrpc: Fix handling of the last packet in rxrpc_recvmsg_data()
rxrpc: Record calls that need to be accepted
rxrpc: Purge the to_be_accepted queue on socket release
rxrpc: Fix the putting of client connections
rxrpc: Call rxrpc_release_call() on error in rxrpc_new_client_call()
rxrpc: Fix unexposed client conn release
rxrpc: Fix the parsing of soft-ACKs
rxrpc: Fix retransmission algorithm
rxrpc: Don't transmit an ACK if there's no reason set
rxrpc: Be consistent about switch value in rxrpc_send_call_packet()
rxrpc: Fix the basic transmit DATA packet content size at 1412 bytes
net/rxrpc/call_accept.c | 2 ++
net/rxrpc/call_event.c | 14 ++++--------
net/rxrpc/call_object.c | 46 ++++++++++++++++++++---------------------
net/rxrpc/conn_client.c | 29 ++++++++++++--------------
net/rxrpc/input.c | 6 ++++-
net/rxrpc/output.c | 7 +++++-
net/rxrpc/recvmsg.c | 53 ++++++++++++++++++++++++++++++++---------------
net/rxrpc/sendmsg.c | 2 +-
8 files changed, 89 insertions(+), 70 deletions(-)
^ permalink raw reply
* [PATCH net-next 5/5] sched: add and use qdisc_skb_head helpers
From: Florian Westphal @ 2016-09-17 22:57 UTC (permalink / raw)
To: netdev; +Cc: Florian Westphal
In-Reply-To: <1474153054-9059-1-git-send-email-fw@strlen.de>
This change replaces sk_buff_head struct in Qdiscs with new qdisc_skb_head.
Its similar to the skb_buff_head api, but does not use skb->prev pointers.
Qdiscs will commonly enqueue at the tail of a list and dequeue at head.
While skb_buff_head works fine for this, enqueue/dequeue needs to also
adjust the prev pointer of next element.
The ->prev pointer is not required for qdiscs so we can just leave
it undefined and avoid one cacheline write access for en/dequeue.
Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
include/net/sch_generic.h | 63 ++++++++++++++++++++++++++++++++++++++---------
net/sched/sch_generic.c | 21 ++++++++--------
net/sched/sch_htb.c | 24 +++++++++++++++---
net/sched/sch_netem.c | 14 +++++++++--
4 files changed, 94 insertions(+), 28 deletions(-)
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 0741ed4..e6aa0a2 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -36,6 +36,14 @@ struct qdisc_size_table {
u16 data[];
};
+/* similar to sk_buff_head, but skb->prev pointer is undefined. */
+struct qdisc_skb_head {
+ struct sk_buff *head;
+ struct sk_buff *tail;
+ __u32 qlen;
+ spinlock_t lock;
+};
+
struct Qdisc {
int (*enqueue)(struct sk_buff *skb,
struct Qdisc *sch,
@@ -76,7 +84,7 @@ struct Qdisc {
* For performance sake on SMP, we put highly modified fields at the end
*/
struct sk_buff *gso_skb ____cacheline_aligned_in_smp;
- struct sk_buff_head q;
+ struct qdisc_skb_head q;
struct gnet_stats_basic_packed bstats;
seqcount_t running;
struct gnet_stats_queue qstats;
@@ -600,10 +608,27 @@ static inline void qdisc_qstats_overlimit(struct Qdisc *sch)
sch->qstats.overlimits++;
}
+static inline void qdisc_skb_head_init(struct qdisc_skb_head *qh)
+{
+ qh->head = NULL;
+ qh->tail = NULL;
+ qh->qlen = 0;
+}
+
static inline int __qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
- struct sk_buff_head *list)
+ struct qdisc_skb_head *qh)
{
- __skb_queue_tail(list, skb);
+ struct sk_buff *last = qh->tail;
+
+ if (last) {
+ skb->next = NULL;
+ last->next = skb;
+ qh->tail = skb;
+ } else {
+ qh->tail = skb;
+ qh->head = skb;
+ }
+ qh->qlen++;
qdisc_qstats_backlog_inc(sch, skb);
return NET_XMIT_SUCCESS;
@@ -614,9 +639,17 @@ static inline int qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch)
return __qdisc_enqueue_tail(skb, sch, &sch->q);
}
-static inline struct sk_buff *__qdisc_dequeue_head(struct sk_buff_head *list)
+static inline struct sk_buff *__qdisc_dequeue_head(struct qdisc_skb_head *qh)
{
- struct sk_buff *skb = __skb_dequeue(list);
+ struct sk_buff *skb = qh->head;
+
+ if (likely(skb != NULL)) {
+ qh->head = skb->next;
+ qh->qlen--;
+ if (qh->head == NULL)
+ qh->tail = NULL;
+ skb->next = NULL;
+ }
return skb;
}
@@ -643,10 +676,10 @@ static inline void __qdisc_drop(struct sk_buff *skb, struct sk_buff **to_free)
}
static inline unsigned int __qdisc_queue_drop_head(struct Qdisc *sch,
- struct sk_buff_head *list,
+ struct qdisc_skb_head *qh,
struct sk_buff **to_free)
{
- struct sk_buff *skb = __skb_dequeue(list);
+ struct sk_buff *skb = __qdisc_dequeue_head(qh);
if (likely(skb != NULL)) {
unsigned int len = qdisc_pkt_len(skb);
@@ -667,7 +700,9 @@ static inline unsigned int qdisc_queue_drop_head(struct Qdisc *sch,
static inline struct sk_buff *qdisc_peek_head(struct Qdisc *sch)
{
- return skb_peek(&sch->q);
+ const struct qdisc_skb_head *qh = &sch->q;
+
+ return qh->head;
}
/* generic pseudo peek method for non-work-conserving qdisc */
@@ -702,15 +737,19 @@ static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch)
return skb;
}
-static inline void __qdisc_reset_queue(struct sk_buff_head *list)
+static inline void __qdisc_reset_queue(struct qdisc_skb_head *qh)
{
/*
* We do not know the backlog in bytes of this list, it
* is up to the caller to correct it
*/
- if (!skb_queue_empty(list)) {
- rtnl_kfree_skbs(list->next, list->prev);
- __skb_queue_head_init(list);
+ ASSERT_RTNL();
+ if (qh->qlen) {
+ rtnl_kfree_skbs(qh->head, qh->tail);
+
+ qh->head = NULL;
+ qh->tail = NULL;
+ qh->qlen = 0;
}
}
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 73877d9..6cfb6e9 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -466,7 +466,7 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = {
*/
struct pfifo_fast_priv {
u32 bitmap;
- struct sk_buff_head q[PFIFO_FAST_BANDS];
+ struct qdisc_skb_head q[PFIFO_FAST_BANDS];
};
/*
@@ -477,7 +477,7 @@ struct pfifo_fast_priv {
*/
static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};
-static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
+static inline struct qdisc_skb_head *band2list(struct pfifo_fast_priv *priv,
int band)
{
return priv->q + band;
@@ -489,7 +489,7 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
if (qdisc->q.qlen < qdisc_dev(qdisc)->tx_queue_len) {
int band = prio2band[skb->priority & TC_PRIO_MAX];
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
- struct sk_buff_head *list = band2list(priv, band);
+ struct qdisc_skb_head *list = band2list(priv, band);
priv->bitmap |= (1 << band);
qdisc->q.qlen++;
@@ -505,8 +505,8 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
int band = bitmap2band[priv->bitmap];
if (likely(band >= 0)) {
- struct sk_buff_head *list = band2list(priv, band);
- struct sk_buff *skb = __qdisc_dequeue_head(list);
+ struct qdisc_skb_head *qh = band2list(priv, band);
+ struct sk_buff *skb = __qdisc_dequeue_head(qh);
if (likely(skb != NULL)) {
qdisc_qstats_backlog_dec(qdisc, skb);
@@ -514,7 +514,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
}
qdisc->q.qlen--;
- if (skb_queue_empty(list))
+ if (qh->qlen == 0)
priv->bitmap &= ~(1 << band);
return skb;
@@ -529,9 +529,9 @@ static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
int band = bitmap2band[priv->bitmap];
if (band >= 0) {
- struct sk_buff_head *list = band2list(priv, band);
+ struct qdisc_skb_head *qh = band2list(priv, band);
- return skb_peek(list);
+ return qh->head;
}
return NULL;
@@ -569,7 +569,7 @@ static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
- __skb_queue_head_init(band2list(priv, prio));
+ qdisc_skb_head_init(band2list(priv, prio));
/* Can by-pass the queue discipline */
qdisc->flags |= TCQ_F_CAN_BYPASS;
@@ -617,7 +617,8 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
sch->padded = (char *) sch - (char *) p;
}
- skb_queue_head_init(&sch->q);
+ qdisc_skb_head_init(&sch->q);
+ spin_lock_init(&sch->q.lock);
spin_lock_init(&sch->busylock);
lockdep_set_class(&sch->busylock,
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 53dbfa1..c798d0d 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -162,7 +162,7 @@ struct htb_sched {
struct work_struct work;
/* non shaped skbs; let them go directly thru */
- struct sk_buff_head direct_queue;
+ struct qdisc_skb_head direct_queue;
long direct_pkts;
struct qdisc_watchdog watchdog;
@@ -570,6 +570,22 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
list_del_init(&cl->un.leaf.drop_list);
}
+static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
+ struct qdisc_skb_head *qh)
+{
+ struct sk_buff *last = qh->tail;
+
+ if (last) {
+ skb->next = NULL;
+ last->next = skb;
+ qh->tail = skb;
+ } else {
+ qh->tail = skb;
+ qh->head = skb;
+ }
+ qh->qlen++;
+}
+
static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
@@ -580,7 +596,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (cl == HTB_DIRECT) {
/* enqueue to helper queue */
if (q->direct_queue.qlen < q->direct_qlen) {
- __skb_queue_tail(&q->direct_queue, skb);
+ htb_enqueue_tail(skb, sch, &q->direct_queue);
q->direct_pkts++;
} else {
return qdisc_drop(skb, sch, to_free);
@@ -888,7 +904,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
unsigned long start_at;
/* try to dequeue direct packets as high prio (!) to minimize cpu work */
- skb = __skb_dequeue(&q->direct_queue);
+ skb = __qdisc_dequeue_head(&q->direct_queue);
if (skb != NULL) {
ok:
qdisc_bstats_update(sch, skb);
@@ -1019,7 +1035,7 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt)
qdisc_watchdog_init(&q->watchdog, sch);
INIT_WORK(&q->work, htb_work_func);
- __skb_queue_head_init(&q->direct_queue);
+ qdisc_skb_head_init(&q->direct_queue);
if (tb[TCA_HTB_DIRECT_QLEN])
q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 0a964b3..9f7b380 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -413,6 +413,16 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
return segs;
}
+static void netem_enqueue_skb_head(struct qdisc_skb_head *qh, struct sk_buff *skb)
+{
+ skb->next = qh->head;
+
+ if (!qh->head)
+ qh->tail = skb;
+ qh->head = skb;
+ qh->qlen++;
+}
+
/*
* Insert one skb into qdisc.
* Note: parent depends on return value to account for queue length.
@@ -523,7 +533,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff *last;
if (sch->q.qlen)
- last = skb_peek_tail(&sch->q);
+ last = sch->q.tail;
else
last = netem_rb_to_skb(rb_last(&q->t_root));
if (last) {
@@ -552,7 +562,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
cb->time_to_send = psched_get_time();
q->counter = 0;
- __skb_queue_head(&sch->q, skb);
+ netem_enqueue_skb_head(&sch->q, skb);
sch->qstats.requeues++;
}
--
2.7.3
^ permalink raw reply related
* [PATCH net-next 4/5] sched: replace __skb_dequeue with __qdisc_dequeue_head
From: Florian Westphal @ 2016-09-17 22:57 UTC (permalink / raw)
To: netdev; +Cc: Florian Westphal
In-Reply-To: <1474153054-9059-1-git-send-email-fw@strlen.de>
After previous patch these functions are identical.
Replace __skb_dequeue in qdiscs with __qdisc_dequeue_head.
Next patch will then make __qdisc_dequeue_head handle
single-linked list instead of strcut sk_buff_head argument.
Doesn't change generated code.
Signed-off-by: Florian Westphal <fw@strlen.de>
---
net/sched/sch_codel.c | 4 ++--
net/sched/sch_netem.c | 2 +-
net/sched/sch_pie.c | 2 +-
3 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index 4002df3..5bfa79e 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -69,7 +69,7 @@ struct codel_sched_data {
static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx)
{
struct Qdisc *sch = ctx;
- struct sk_buff *skb = __skb_dequeue(&sch->q);
+ struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
if (skb)
sch->qstats.backlog -= qdisc_pkt_len(skb);
@@ -172,7 +172,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt)
qlen = sch->q.qlen;
while (sch->q.qlen > sch->limit) {
- struct sk_buff *skb = __skb_dequeue(&sch->q);
+ struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
dropped += qdisc_pkt_len(skb);
qdisc_qstats_backlog_dec(sch, skb);
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 1832d77..0a964b3 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -587,7 +587,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
struct rb_node *p;
tfifo_dequeue:
- skb = __skb_dequeue(&sch->q);
+ skb = __qdisc_dequeue_head(&sch->q);
if (skb) {
qdisc_qstats_backlog_dec(sch, skb);
deliver:
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index d976d74..5c3a99d 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -231,7 +231,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt)
/* Drop excess packets if new limit is lower */
qlen = sch->q.qlen;
while (sch->q.qlen > sch->limit) {
- struct sk_buff *skb = __skb_dequeue(&sch->q);
+ struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
dropped += qdisc_pkt_len(skb);
qdisc_qstats_backlog_dec(sch, skb);
--
2.7.3
^ permalink raw reply related
* [PATCH net-next 3/5] sched: remove qdisc arg from __qdisc_dequeue_head
From: Florian Westphal @ 2016-09-17 22:57 UTC (permalink / raw)
To: netdev; +Cc: Florian Westphal
In-Reply-To: <1474153054-9059-1-git-send-email-fw@strlen.de>
Moves qdisc stat accouting to qdisc_dequeue_head.
The only direct caller of the __qdisc_dequeue_head version open-codes
this now.
This allows us to later use __qdisc_dequeue_head as a replacement
of __skb_dequeue() (which operates on sk_buff_head list).
Signed-off-by: Florian Westphal <fw@strlen.de>
---
include/net/sch_generic.h | 15 ++++++++-------
net/sched/sch_generic.c | 7 ++++++-
2 files changed, 14 insertions(+), 8 deletions(-)
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 52a2015..0741ed4 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -614,11 +614,17 @@ static inline int qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch)
return __qdisc_enqueue_tail(skb, sch, &sch->q);
}
-static inline struct sk_buff *__qdisc_dequeue_head(struct Qdisc *sch,
- struct sk_buff_head *list)
+static inline struct sk_buff *__qdisc_dequeue_head(struct sk_buff_head *list)
{
struct sk_buff *skb = __skb_dequeue(list);
+ return skb;
+}
+
+static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch)
+{
+ struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
+
if (likely(skb != NULL)) {
qdisc_qstats_backlog_dec(sch, skb);
qdisc_bstats_update(sch, skb);
@@ -627,11 +633,6 @@ static inline struct sk_buff *__qdisc_dequeue_head(struct Qdisc *sch,
return skb;
}
-static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch)
-{
- return __qdisc_dequeue_head(sch, &sch->q);
-}
-
/* Instead of calling kfree_skb() while root qdisc lock is held,
* queue the skb for future freeing at end of __dev_xmit_skb()
*/
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 5e63bf6..73877d9 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -506,7 +506,12 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
if (likely(band >= 0)) {
struct sk_buff_head *list = band2list(priv, band);
- struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);
+ struct sk_buff *skb = __qdisc_dequeue_head(list);
+
+ if (likely(skb != NULL)) {
+ qdisc_qstats_backlog_dec(qdisc, skb);
+ qdisc_bstats_update(qdisc, skb);
+ }
qdisc->q.qlen--;
if (skb_queue_empty(list))
--
2.7.3
^ permalink raw reply related
* [PATCH net-next 2/5] sched: don't use skb queue helpers
From: Florian Westphal @ 2016-09-17 22:57 UTC (permalink / raw)
To: netdev; +Cc: Florian Westphal
In-Reply-To: <1474153054-9059-1-git-send-email-fw@strlen.de>
A followup change will replace the sk_buff_head in the qdisc
struct with a slightly different list.
Use of the sk_buff_head helpers will thus cause compiler
warnings.
Open-code these accesses in an extra change to ease review.
Signed-off-by: Florian Westphal <fw@strlen.de>
---
net/sched/sch_fifo.c | 4 ++--
net/sched/sch_generic.c | 2 +-
net/sched/sch_netem.c | 4 ++--
3 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index baeed6a..1e37247 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -31,7 +31,7 @@ static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
- if (likely(skb_queue_len(&sch->q) < sch->limit))
+ if (likely(sch->q.qlen < sch->limit))
return qdisc_enqueue_tail(skb, sch);
return qdisc_drop(skb, sch, to_free);
@@ -42,7 +42,7 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch,
{
unsigned int prev_backlog;
- if (likely(skb_queue_len(&sch->q) < sch->limit))
+ if (likely(sch->q.qlen < sch->limit))
return qdisc_enqueue_tail(skb, sch);
prev_backlog = sch->qstats.backlog;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 0d21b56..5e63bf6 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -486,7 +486,7 @@ static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
struct sk_buff **to_free)
{
- if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
+ if (qdisc->q.qlen < qdisc_dev(qdisc)->tx_queue_len) {
int band = prio2band[skb->priority & TC_PRIO_MAX];
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
struct sk_buff_head *list = band2list(priv, band);
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index aaaf021..1832d77 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -502,7 +502,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1<<(prandom_u32() % 8);
}
- if (unlikely(skb_queue_len(&sch->q) >= sch->limit))
+ if (unlikely(sch->q.qlen >= sch->limit))
return qdisc_drop(skb, sch, to_free);
qdisc_qstats_backlog_inc(sch, skb);
@@ -522,7 +522,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (q->rate) {
struct sk_buff *last;
- if (!skb_queue_empty(&sch->q))
+ if (sch->q.qlen)
last = skb_peek_tail(&sch->q);
else
last = netem_rb_to_skb(rb_last(&q->t_root));
--
2.7.3
^ permalink raw reply related
* [PATCH net-next 1/5] pie: use qdisc_dequeue_head wrapper
From: Florian Westphal @ 2016-09-17 22:57 UTC (permalink / raw)
To: netdev; +Cc: Florian Westphal
In-Reply-To: <1474153054-9059-1-git-send-email-fw@strlen.de>
Doesn't change generated code.
Signed-off-by: Florian Westphal <fw@strlen.de>
---
net/sched/sch_pie.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index a570b0b..d976d74 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -511,7 +511,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch)
{
struct sk_buff *skb;
- skb = __qdisc_dequeue_head(sch, &sch->q);
+ skb = qdisc_dequeue_head(sch);
if (!skb)
return NULL;
--
2.7.3
^ permalink raw reply related
* [PATCH net-next 0/5] sched: convert queues to single-linked list
From: Florian Westphal @ 2016-09-17 22:57 UTC (permalink / raw)
To: netdev
During Netfilter Workshop 2016 Eric Dumazet pointed out that qdisc
schedulers use doubly-linked lists, even though single-linked list
would be enough.
The double-linked skb lists incur one extra write on enqueue/dequeue
operations (to change ->prev pointer of next list elem).
This series converts qdiscs to single-linked version, listhead
maintains pointers to first (for dequeue) and last skb (for enqueue).
Most qdiscs don't queue at all and instead use a leaf qdisc (typically
pfifo_fast) so only a few schedulers needed changes.
I briefly tested netem and htb and they seemed fine.
UDP_STREAM netperf with 64 byte packets via veth+pfifo_fast shows
a small (~2%) improvement.
Florian Westphal (5):
pie: use qdisc_dequeue_head wrapper
sched: don't use skb queue helpers
sched: remove qdisc arg from __qdisc_dequeue_head
sched: replace __skb_dequeue with __qdisc_dequeue_head
sched: add and use qdisc_skb_head helpers
include/net/sch_generic.h | 72 +++++++++++++++++++++++++++++++++++-----------
net/sched/sch_codel.c | 4 +-
net/sched/sch_fifo.c | 4 +-
net/sched/sch_generic.c | 30 +++++++++++--------
net/sched/sch_htb.c | 24 ++++++++++++---
net/sched/sch_netem.c | 20 +++++++++---
net/sched/sch_pie.c | 4 +-
7 files changed, 115 insertions(+), 43 deletions(-)
^ permalink raw reply
* Re: [PATCH net] MAINTAINERS: Gary Zambrano's email is bouncing
From: Joe Perches @ 2016-09-17 22:51 UTC (permalink / raw)
To: Florian Fainelli, netdev
Cc: davem, michael.chan, bcm-kernel-feedback-list, zajec5, hauke
In-Reply-To: <1474151235-16558-1-git-send-email-f.fainelli@gmail.com>
On Sat, 2016-09-17 at 15:27 -0700, Florian Fainelli wrote:
> Gary has not been with Broadcom for some time now, replace his address
> with the internal mailing-list used for other entries.
>
> > Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
> ---
> Michael,
>
> Since this is an old driver, not sure who could step up as a maintainer
> for b44?
[]
> diff --git a/MAINTAINERS b/MAINTAINERS
[]
> @@ -2500,8 +2500,8 @@ S: Supported
> F: kernel/bpf/
>
> BROADCOM B44 10/100 ETHERNET DRIVER
> -M: Gary Zambrano <zambrano@broadcom.com>
> L: netdev@vger.kernel.org
> +M: bcm-kernel-feedback-list@broadcom.com
> S: Supported
> F: drivers/net/ethernet/broadcom/b44.*
Without an actual maintainer, this should really be
orphan and not supported.
And the M: bcm-kernel-feedback-list@ should be L:
BCM4401 NICs are essentially from 2002.
Does anyone really use these any longer with a
current distribution or kernel version?
^ permalink raw reply
* [PATCH net] MAINTAINERS: Gary Zambrano's email is bouncing
From: Florian Fainelli @ 2016-09-17 22:27 UTC (permalink / raw)
To: netdev
Cc: davem, michael.chan, bcm-kernel-feedback-list, zajec5, hauke,
Florian Fainelli
Gary has not been with Broadcom for some time now, replace his address
with the internal mailing-list used for other entries.
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
Michael,
Since this is an old driver, not sure who could step up as a maintainer
for b44?
MAINTAINERS | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index a5e1270dfbf1..dffc3bca17ee 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2500,8 +2500,8 @@ S: Supported
F: kernel/bpf/
BROADCOM B44 10/100 ETHERNET DRIVER
-M: Gary Zambrano <zambrano@broadcom.com>
L: netdev@vger.kernel.org
+M: bcm-kernel-feedback-list@broadcom.com
S: Supported
F: drivers/net/ethernet/broadcom/b44.*
--
2.7.4
^ permalink raw reply related
* [PATCH 2/2] net: ethernet: broadcom: b44: use new api ethtool_{get|set}_link_ksettings
From: Philippe Reynes @ 2016-09-17 22:11 UTC (permalink / raw)
To: zambrano, davem, f.fainelli; +Cc: netdev, linux-kernel, Philippe Reynes
In-Reply-To: <1474150295-24622-1-git-send-email-tremyfr@gmail.com>
The ethtool api {get|set}_settings is deprecated.
We move this driver to new api {get|set}_link_ksettings.
Signed-off-by: Philippe Reynes <tremyfr@gmail.com>
---
drivers/net/ethernet/broadcom/b44.c | 98 +++++++++++++++++++----------------
1 files changed, 54 insertions(+), 44 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/b44.c b/drivers/net/ethernet/broadcom/b44.c
index 936f06f..17aa33c 100644
--- a/drivers/net/ethernet/broadcom/b44.c
+++ b/drivers/net/ethernet/broadcom/b44.c
@@ -1832,58 +1832,65 @@ static int b44_nway_reset(struct net_device *dev)
return r;
}
-static int b44_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+static int b44_get_link_ksettings(struct net_device *dev,
+ struct ethtool_link_ksettings *cmd)
{
struct b44 *bp = netdev_priv(dev);
+ u32 supported, advertising;
if (bp->flags & B44_FLAG_EXTERNAL_PHY) {
BUG_ON(!dev->phydev);
- return phy_ethtool_gset(dev->phydev, cmd);
+ return phy_ethtool_ksettings_get(dev->phydev, cmd);
}
- cmd->supported = (SUPPORTED_Autoneg);
- cmd->supported |= (SUPPORTED_100baseT_Half |
- SUPPORTED_100baseT_Full |
- SUPPORTED_10baseT_Half |
- SUPPORTED_10baseT_Full |
- SUPPORTED_MII);
+ supported = (SUPPORTED_Autoneg);
+ supported |= (SUPPORTED_100baseT_Half |
+ SUPPORTED_100baseT_Full |
+ SUPPORTED_10baseT_Half |
+ SUPPORTED_10baseT_Full |
+ SUPPORTED_MII);
- cmd->advertising = 0;
+ advertising = 0;
if (bp->flags & B44_FLAG_ADV_10HALF)
- cmd->advertising |= ADVERTISED_10baseT_Half;
+ advertising |= ADVERTISED_10baseT_Half;
if (bp->flags & B44_FLAG_ADV_10FULL)
- cmd->advertising |= ADVERTISED_10baseT_Full;
+ advertising |= ADVERTISED_10baseT_Full;
if (bp->flags & B44_FLAG_ADV_100HALF)
- cmd->advertising |= ADVERTISED_100baseT_Half;
+ advertising |= ADVERTISED_100baseT_Half;
if (bp->flags & B44_FLAG_ADV_100FULL)
- cmd->advertising |= ADVERTISED_100baseT_Full;
- cmd->advertising |= ADVERTISED_Pause | ADVERTISED_Asym_Pause;
- ethtool_cmd_speed_set(cmd, ((bp->flags & B44_FLAG_100_BASE_T) ?
- SPEED_100 : SPEED_10));
- cmd->duplex = (bp->flags & B44_FLAG_FULL_DUPLEX) ?
+ advertising |= ADVERTISED_100baseT_Full;
+ advertising |= ADVERTISED_Pause | ADVERTISED_Asym_Pause;
+ cmd->base.speed = (bp->flags & B44_FLAG_100_BASE_T) ?
+ SPEED_100 : SPEED_10;
+ cmd->base.duplex = (bp->flags & B44_FLAG_FULL_DUPLEX) ?
DUPLEX_FULL : DUPLEX_HALF;
- cmd->port = 0;
- cmd->phy_address = bp->phy_addr;
- cmd->transceiver = (bp->flags & B44_FLAG_EXTERNAL_PHY) ?
- XCVR_EXTERNAL : XCVR_INTERNAL;
- cmd->autoneg = (bp->flags & B44_FLAG_FORCE_LINK) ?
+ cmd->base.port = 0;
+ cmd->base.phy_address = bp->phy_addr;
+ cmd->base.autoneg = (bp->flags & B44_FLAG_FORCE_LINK) ?
AUTONEG_DISABLE : AUTONEG_ENABLE;
- if (cmd->autoneg == AUTONEG_ENABLE)
- cmd->advertising |= ADVERTISED_Autoneg;
+ if (cmd->base.autoneg == AUTONEG_ENABLE)
+ advertising |= ADVERTISED_Autoneg;
+
+ ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.supported,
+ supported);
+ ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.advertising,
+ advertising);
+
if (!netif_running(dev)){
- ethtool_cmd_speed_set(cmd, 0);
- cmd->duplex = 0xff;
+ cmd->base.speed = 0;
+ cmd->base.duplex = 0xff;
}
- cmd->maxtxpkt = 0;
- cmd->maxrxpkt = 0;
+
return 0;
}
-static int b44_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+static int b44_set_link_ksettings(struct net_device *dev,
+ const struct ethtool_link_ksettings *cmd)
{
struct b44 *bp = netdev_priv(dev);
u32 speed;
int ret;
+ u32 advertising;
if (bp->flags & B44_FLAG_EXTERNAL_PHY) {
BUG_ON(!dev->phydev);
@@ -1891,31 +1898,34 @@ static int b44_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
if (netif_running(dev))
b44_setup_phy(bp);
- ret = phy_ethtool_sset(dev->phydev, cmd);
+ ret = phy_ethtool_ksettings_set(dev->phydev, cmd);
spin_unlock_irq(&bp->lock);
return ret;
}
- speed = ethtool_cmd_speed(cmd);
+ speed = cmd->base.speed;
+
+ ethtool_convert_link_mode_to_legacy_u32(&advertising,
+ cmd->link_modes.advertising);
/* We do not support gigabit. */
- if (cmd->autoneg == AUTONEG_ENABLE) {
- if (cmd->advertising &
+ if (cmd->base.autoneg == AUTONEG_ENABLE) {
+ if (advertising &
(ADVERTISED_1000baseT_Half |
ADVERTISED_1000baseT_Full))
return -EINVAL;
} else if ((speed != SPEED_100 &&
speed != SPEED_10) ||
- (cmd->duplex != DUPLEX_HALF &&
- cmd->duplex != DUPLEX_FULL)) {
+ (cmd->base.duplex != DUPLEX_HALF &&
+ cmd->base.duplex != DUPLEX_FULL)) {
return -EINVAL;
}
spin_lock_irq(&bp->lock);
- if (cmd->autoneg == AUTONEG_ENABLE) {
+ if (cmd->base.autoneg == AUTONEG_ENABLE) {
bp->flags &= ~(B44_FLAG_FORCE_LINK |
B44_FLAG_100_BASE_T |
B44_FLAG_FULL_DUPLEX |
@@ -1923,19 +1933,19 @@ static int b44_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
B44_FLAG_ADV_10FULL |
B44_FLAG_ADV_100HALF |
B44_FLAG_ADV_100FULL);
- if (cmd->advertising == 0) {
+ if (advertising == 0) {
bp->flags |= (B44_FLAG_ADV_10HALF |
B44_FLAG_ADV_10FULL |
B44_FLAG_ADV_100HALF |
B44_FLAG_ADV_100FULL);
} else {
- if (cmd->advertising & ADVERTISED_10baseT_Half)
+ if (advertising & ADVERTISED_10baseT_Half)
bp->flags |= B44_FLAG_ADV_10HALF;
- if (cmd->advertising & ADVERTISED_10baseT_Full)
+ if (advertising & ADVERTISED_10baseT_Full)
bp->flags |= B44_FLAG_ADV_10FULL;
- if (cmd->advertising & ADVERTISED_100baseT_Half)
+ if (advertising & ADVERTISED_100baseT_Half)
bp->flags |= B44_FLAG_ADV_100HALF;
- if (cmd->advertising & ADVERTISED_100baseT_Full)
+ if (advertising & ADVERTISED_100baseT_Full)
bp->flags |= B44_FLAG_ADV_100FULL;
}
} else {
@@ -1943,7 +1953,7 @@ static int b44_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
bp->flags &= ~(B44_FLAG_100_BASE_T | B44_FLAG_FULL_DUPLEX);
if (speed == SPEED_100)
bp->flags |= B44_FLAG_100_BASE_T;
- if (cmd->duplex == DUPLEX_FULL)
+ if (cmd->base.duplex == DUPLEX_FULL)
bp->flags |= B44_FLAG_FULL_DUPLEX;
}
@@ -2110,8 +2120,6 @@ static int b44_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
static const struct ethtool_ops b44_ethtool_ops = {
.get_drvinfo = b44_get_drvinfo,
- .get_settings = b44_get_settings,
- .set_settings = b44_set_settings,
.nway_reset = b44_nway_reset,
.get_link = ethtool_op_get_link,
.get_wol = b44_get_wol,
@@ -2125,6 +2133,8 @@ static const struct ethtool_ops b44_ethtool_ops = {
.get_strings = b44_get_strings,
.get_sset_count = b44_get_sset_count,
.get_ethtool_stats = b44_get_ethtool_stats,
+ .get_link_ksettings = b44_get_link_ksettings,
+ .set_link_ksettings = b44_set_link_ksettings,
};
static int b44_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
--
1.7.4.4
^ permalink raw reply related
* [PATCH 1/2] net: ethernet: broadcom: b44: use phydev from struct net_device
From: Philippe Reynes @ 2016-09-17 22:11 UTC (permalink / raw)
To: zambrano, davem, f.fainelli; +Cc: netdev, linux-kernel, Philippe Reynes
The private structure contain a pointer to phydev, but the structure
net_device already contain such pointer. So we can remove the pointer
phydev in the private structure, and update the driver to use the
one contained in struct net_device.
Signed-off-by: Philippe Reynes <tremyfr@gmail.com>
---
drivers/net/ethernet/broadcom/b44.c | 22 +++++++++++-----------
drivers/net/ethernet/broadcom/b44.h | 1 -
2 files changed, 11 insertions(+), 12 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/b44.c b/drivers/net/ethernet/broadcom/b44.c
index 74f0a37..936f06f 100644
--- a/drivers/net/ethernet/broadcom/b44.c
+++ b/drivers/net/ethernet/broadcom/b44.c
@@ -1486,7 +1486,7 @@ static int b44_open(struct net_device *dev)
b44_enable_ints(bp);
if (bp->flags & B44_FLAG_EXTERNAL_PHY)
- phy_start(bp->phydev);
+ phy_start(dev->phydev);
netif_start_queue(dev);
out:
@@ -1651,7 +1651,7 @@ static int b44_close(struct net_device *dev)
netif_stop_queue(dev);
if (bp->flags & B44_FLAG_EXTERNAL_PHY)
- phy_stop(bp->phydev);
+ phy_stop(dev->phydev);
napi_disable(&bp->napi);
@@ -1837,8 +1837,8 @@ static int b44_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
struct b44 *bp = netdev_priv(dev);
if (bp->flags & B44_FLAG_EXTERNAL_PHY) {
- BUG_ON(!bp->phydev);
- return phy_ethtool_gset(bp->phydev, cmd);
+ BUG_ON(!dev->phydev);
+ return phy_ethtool_gset(dev->phydev, cmd);
}
cmd->supported = (SUPPORTED_Autoneg);
@@ -1886,12 +1886,12 @@ static int b44_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
int ret;
if (bp->flags & B44_FLAG_EXTERNAL_PHY) {
- BUG_ON(!bp->phydev);
+ BUG_ON(!dev->phydev);
spin_lock_irq(&bp->lock);
if (netif_running(dev))
b44_setup_phy(bp);
- ret = phy_ethtool_sset(bp->phydev, cmd);
+ ret = phy_ethtool_sset(dev->phydev, cmd);
spin_unlock_irq(&bp->lock);
@@ -2137,8 +2137,8 @@ static int b44_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
spin_lock_irq(&bp->lock);
if (bp->flags & B44_FLAG_EXTERNAL_PHY) {
- BUG_ON(!bp->phydev);
- err = phy_mii_ioctl(bp->phydev, ifr, cmd);
+ BUG_ON(!dev->phydev);
+ err = phy_mii_ioctl(dev->phydev, ifr, cmd);
} else {
err = generic_mii_ioctl(&bp->mii_if, if_mii(ifr), cmd, NULL);
}
@@ -2206,7 +2206,7 @@ static const struct net_device_ops b44_netdev_ops = {
static void b44_adjust_link(struct net_device *dev)
{
struct b44 *bp = netdev_priv(dev);
- struct phy_device *phydev = bp->phydev;
+ struct phy_device *phydev = dev->phydev;
bool status_changed = 0;
BUG_ON(!phydev);
@@ -2303,7 +2303,6 @@ static int b44_register_phy_one(struct b44 *bp)
SUPPORTED_MII);
phydev->advertising = phydev->supported;
- bp->phydev = phydev;
bp->old_link = 0;
bp->phy_addr = phydev->mdio.addr;
@@ -2323,9 +2322,10 @@ err_out:
static void b44_unregister_phy_one(struct b44 *bp)
{
+ struct net_device *dev = bp->dev;
struct mii_bus *mii_bus = bp->mii_bus;
- phy_disconnect(bp->phydev);
+ phy_disconnect(dev->phydev);
mdiobus_unregister(mii_bus);
mdiobus_free(mii_bus);
}
diff --git a/drivers/net/ethernet/broadcom/b44.h b/drivers/net/ethernet/broadcom/b44.h
index 65d88d7..89d2cf3 100644
--- a/drivers/net/ethernet/broadcom/b44.h
+++ b/drivers/net/ethernet/broadcom/b44.h
@@ -404,7 +404,6 @@ struct b44 {
u32 tx_pending;
u8 phy_addr;
u8 force_copybreak;
- struct phy_device *phydev;
struct mii_bus *mii_bus;
int old_link;
struct mii_if_info mii_if;
--
1.7.4.4
^ permalink raw reply related
* Re: stmmac/RTL8211F/Meson GXBB: TX throughput problems
From: André Roth @ 2016-09-17 21:23 UTC (permalink / raw)
To: Martin Blumenstingl
Cc: Alexandre Torgue, Johnson Leung, netdev, Giuseppe Cavallaro,
linux-amlogic
In-Reply-To: <CAFBinCD42A_SqQz8GM-H+A32dMELknAytHuJ5T2YgZ3BrMKA3A@mail.gmail.com>
[-- Attachment #1: Type: text/plain, Size: 3257 bytes --]
Hi all,
I have an odroid c2 board which shows this issue. No data is
transmitted or received after a moment of intense tx traffic. Copying a
1GB file per scp from the board triggers it repeatedly.
The board has a stmmac - user ID: 0x11, Synopsys ID: 0x37.
When switching the network to 100Mb/s the copying does
not seam to trigger the issue.
I've attached the ethtool statistics before and after the problem.
Thanks for your help,
André
> Hi Alexandre,
>
> On Mon, Sep 12, 2016 at 6:37 PM, Alexandre Torgue
> <alexandre.torgue@st.com> wrote:
> > Which Synopsys IP version do you use ?
> found this in a dmesg log:
> [ 1.504784] stmmac - user ID: 0x11, Synopsys ID: 0x37
> [ 1.509785] Ring mode enabled
> [ 1.512796] DMA HW capability register supported
> [ 1.517286] Normal descriptors
> [ 1.520565] RX Checksum Offload Engine supported
> [ 1.525219] COE Type 2
> [ 1.527638] TX Checksum insertion supported
> [ 1.531862] Wake-Up On Lan supported
> [ 1.535483] Enable RX Mitigation via HW Watchdog Timer
> [ 1.543851] libphy: stmmac: probed
> [ 1.544025] eth0: PHY ID 001cc916 at 0 IRQ POLL (stmmac-0:00)
> active [ 1.550321] eth0: PHY ID 001cc916 at 7 IRQ POLL
> (stmmac-0:07)
>
> >> Gbit ethernet on my device is provided by a Realtek RTL8211F RGMII
> >> PHY. Similar issues were reported in #linux-amlogic by a user with
> >> an Odroid C2 board (= similar hardware).
> >>
> >> The symptoms are:
> >> Receiving data is plenty fast (I can max out my internet connection
> >> easily, and with iperf3 I get ~900Mbit/s).
> >> Transmitting data from the device is unfortunately very slow,
> >> traffic sometimes even stalls completely.
> >>
> >> I have attached the iperf results and the output of
> >> /sys/kernel/debug/stmmaceth/eth0/descriptors_status.
> >> Below you can find the ifconfig, netstat and stmmac dma_cap info
> >> (*after* I ran all tests).
> >>
> >> The "involved parties" are:
> >> - Meson GXBB specific network configuration registers (I have have
> >> double-checked them with the reference drivers: everything seems
> >> fine here)
> >> - stmmac: it seems that nobody else has reported these kind of
> >> issues so far, however I'd still like to hear where I should
> >> enable some debugging bits to rule out any stmmac bug
> >
> >
> > On my side, I just tested on the same "kind" of system:
> > -SYNOPSYS GMAC 3.7
> > -RTL8211EG as PHY
> >
> > With I perf, I reach:
> > -RX: 932 Mbps
> > -TX: 820Mbps
> >
> > Can you check ethtool -S eth0 (most precisely "MMC"counter and
> > errors) ? Which kernel version do you use ?
> I am using a 4.8.0-rc4 kernel, based on Kevin's "integration" branch:
> [0] Unfortunately I don't have access to my device in the next few
> days, but I'll keep you updated once I have the ethtool output.
>
>
> Thanks for your time
> Regards,
> Martin
>
>
> [0]
> https://git.kernel.org/cgit/linux/kernel/git/khilman/linux-amlogic.git/log/?h=v4.8/integ
>
> _______________________________________________
> linux-amlogic mailing list
> linux-amlogic@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-amlogic
>
[-- Attachment #2: ethstats.after --]
[-- Type: application/octet-stream, Size: 5155 bytes --]
NIC statistics:
mmc_tx_octetcount_gb: 0
mmc_tx_framecount_gb: 0
mmc_tx_broadcastframe_g: 0
mmc_tx_multicastframe_g: 0
mmc_tx_64_octets_gb: 0
mmc_tx_65_to_127_octets_gb: 0
mmc_tx_128_to_255_octets_gb: 0
mmc_tx_256_to_511_octets_gb: 0
mmc_tx_512_to_1023_octets_gb: 0
mmc_tx_1024_to_max_octets_gb: 0
mmc_tx_unicast_gb: 0
mmc_tx_multicast_gb: 0
mmc_tx_broadcast_gb: 0
mmc_tx_underflow_error: 0
mmc_tx_singlecol_g: 0
mmc_tx_multicol_g: 0
mmc_tx_deferred: 0
mmc_tx_latecol: 0
mmc_tx_exesscol: 0
mmc_tx_carrier_error: 0
mmc_tx_octetcount_g: 0
mmc_tx_framecount_g: 0
mmc_tx_excessdef: 0
mmc_tx_pause_frame: 0
mmc_tx_vlan_frame_g: 0
mmc_rx_framecount_gb: 31723
mmc_rx_octetcount_gb: 2360579
mmc_rx_octetcount_g: 2360579
mmc_rx_broadcastframe_g: 0
mmc_rx_multicastframe_g: 0
mmc_rx_crc_error: 0
mmc_rx_align_error: 0
mmc_rx_run_error: 0
mmc_rx_jabber_error: 0
mmc_rx_undersize_g: 0
mmc_rx_oversize_g: 0
mmc_rx_64_octets_gb: 4
mmc_rx_65_to_127_octets_gb: 31700
mmc_rx_128_to_255_octets_gb: 11
mmc_rx_256_to_511_octets_gb: 2
mmc_rx_512_to_1023_octets_gb: 5
mmc_rx_1024_to_max_octets_gb: 1
mmc_rx_unicast_g: 31723
mmc_rx_length_error: 0
mmc_rx_autofrangetype: 0
mmc_rx_pause_frames: 0
mmc_rx_fifo_overflow: 0
mmc_rx_vlan_frames_gb: 0
mmc_rx_watchdog_error: 0
mmc_rx_ipc_intr_mask: 2147385342
mmc_rx_ipc_intr: 0
mmc_rx_ipv4_gd: 31719
mmc_rx_ipv4_hderr: 0
mmc_rx_ipv4_nopay: 0
mmc_rx_ipv4_frag: 0
mmc_rx_ipv4_udsbl: 0
mmc_rx_ipv4_gd_octets: 1789381
mmc_rx_ipv4_hderr_octets: 0
mmc_rx_ipv4_nopay_octets: 0
mmc_rx_ipv4_frag_octets: 0
mmc_rx_ipv4_udsbl_octets: 0
mmc_rx_ipv6_gd_octets: 0
mmc_rx_ipv6_hderr_octets: 0
mmc_rx_ipv6_nopay_octets: 0
mmc_rx_ipv6_gd: 0
mmc_rx_ipv6_hderr: 0
mmc_rx_ipv6_nopay: 0
mmc_rx_udp_gd: 27
mmc_rx_udp_err: 0
mmc_rx_tcp_gd: 31692
mmc_rx_tcp_err: 0
mmc_rx_icmp_gd: 0
mmc_rx_icmp_err: 0
mmc_rx_udp_gd_octets: 3856
mmc_rx_udp_err_octets: 0
mmc_rx_tcp_gd_octets: 1151145
mmc_rx_tcp_err_octets: 0
mmc_rx_icmp_gd_octets: 0
mmc_rx_icmp_err_octets: 0
tx_underflow: 0
tx_carrier: 0
tx_losscarrier: 0
vlan_tag: 0
tx_deferred: 0
tx_vlan: 0
tx_jabber: 0
tx_frame_flushed: 0
tx_payload_error: 0
tx_ip_header_error: 0
rx_desc: 0
sa_filter_fail: 0
overflow_error: 0
ipc_csum_error: 0
rx_collision: 0
rx_crc: 0
dribbling_bit: 0
rx_length: 0
rx_mii: 0
rx_multicast: 0
rx_gmac_overflow: 0
rx_watchdog: 0
da_rx_filter_fail: 0
sa_rx_filter_fail: 0
rx_missed_cntr: 0
rx_overflow_cntr: 0
rx_vlan: 0
tx_undeflow_irq: 0
tx_process_stopped_irq: 0
tx_jabber_irq: 0
rx_overflow_irq: 0
rx_buf_unav_irq: 0
rx_process_stopped_irq: 0
rx_watchdog_irq: 0
tx_early_irq: 0
fatal_bus_error_irq: 0
rx_early_irq: 2931
threshold: 1
tx_pkt_n: 307436
rx_pkt_n: 31706
normal_irq_n: 24352
rx_normal_irq_n: 15239
napi_poll: 24346
tx_normal_irq_n: 9161
tx_clean: 24463
tx_set_ic_bit: 9366
irq_receive_pmt_irq_n: 0
mmc_tx_irq_n: 0
mmc_rx_irq_n: 0
mmc_rx_csum_offload_irq_n: 0
irq_tx_path_in_lpi_mode_n: 12723
irq_tx_path_exit_lpi_mode_n: 12722
irq_rx_path_in_lpi_mode_n: 0
irq_rx_path_exit_lpi_mode_n: 0
phy_eee_wakeup_error_n: 0
ip_hdr_err: 0
ip_payload_err: 0
ip_csum_bypassed: 0
ipv4_pkt_rcvd: 0
ipv6_pkt_rcvd: 0
rx_msg_type_ext_no_ptp: 0
rx_msg_type_sync: 0
rx_msg_type_follow_up: 0
rx_msg_type_delay_req: 0
rx_msg_type_delay_resp: 0
rx_msg_type_pdelay_req: 0
rx_msg_type_pdelay_resp: 0
rx_msg_type_pdelay_follow_up: 0
ptp_frame_type: 0
ptp_ver: 0
timestamp_dropped: 0
av_pkt_rcvd: 0
av_tagged_pkt_rcvd: 0
vlan_tag_priority_val: 0
l3_filter_match: 0
l4_filter_match: 0
l3_l4_filter_no_match: 0
irq_pcs_ane_n: 0
irq_pcs_link_n: 0
irq_rgmii_n: 1
mtl_tx_status_fifo_full: 0
mtl_tx_fifo_not_empty: 0
mmtl_fifo_ctrl: 0
mtl_tx_fifo_read_ctrl_write: 0
mtl_tx_fifo_read_ctrl_wait: 0
mtl_tx_fifo_read_ctrl_read: 0
mtl_tx_fifo_read_ctrl_idle: 0
mac_tx_in_pause: 0
mac_tx_frame_ctrl_xfer: 0
mac_tx_frame_ctrl_idle: 0
mac_tx_frame_ctrl_wait: 0
mac_tx_frame_ctrl_pause: 0
mac_gmii_tx_proto_engine: 0
mtl_rx_fifo_fill_level_full: 0
mtl_rx_fifo_fill_above_thresh: 0
mtl_rx_fifo_fill_below_thresh: 0
mtl_rx_fifo_fill_level_empty: 0
mtl_rx_fifo_read_ctrl_flush: 0
mtl_rx_fifo_read_ctrl_read_data: 0
mtl_rx_fifo_read_ctrl_status: 0
mtl_rx_fifo_read_ctrl_idle: 0
mtl_rx_fifo_ctrl_active: 0
mac_rx_frame_ctrl_fifo: 0
mac_gmii_rx_proto_engine: 0
tx_tso_frames: 0
tx_tso_nfrags: 0
[-- Attachment #3: ethstats.before --]
[-- Type: application/octet-stream, Size: 5093 bytes --]
NIC statistics:
mmc_tx_octetcount_gb: 0
mmc_tx_framecount_gb: 0
mmc_tx_broadcastframe_g: 0
mmc_tx_multicastframe_g: 0
mmc_tx_64_octets_gb: 0
mmc_tx_65_to_127_octets_gb: 0
mmc_tx_128_to_255_octets_gb: 0
mmc_tx_256_to_511_octets_gb: 0
mmc_tx_512_to_1023_octets_gb: 0
mmc_tx_1024_to_max_octets_gb: 0
mmc_tx_unicast_gb: 0
mmc_tx_multicast_gb: 0
mmc_tx_broadcast_gb: 0
mmc_tx_underflow_error: 0
mmc_tx_singlecol_g: 0
mmc_tx_multicol_g: 0
mmc_tx_deferred: 0
mmc_tx_latecol: 0
mmc_tx_exesscol: 0
mmc_tx_carrier_error: 0
mmc_tx_octetcount_g: 0
mmc_tx_framecount_g: 0
mmc_tx_excessdef: 0
mmc_tx_pause_frame: 0
mmc_tx_vlan_frame_g: 0
mmc_rx_framecount_gb: 30
mmc_rx_octetcount_gb: 5049
mmc_rx_octetcount_g: 5049
mmc_rx_broadcastframe_g: 0
mmc_rx_multicastframe_g: 0
mmc_rx_crc_error: 0
mmc_rx_align_error: 0
mmc_rx_run_error: 0
mmc_rx_jabber_error: 0
mmc_rx_undersize_g: 0
mmc_rx_oversize_g: 0
mmc_rx_64_octets_gb: 4
mmc_rx_65_to_127_octets_gb: 13
mmc_rx_128_to_255_octets_gb: 8
mmc_rx_256_to_511_octets_gb: 1
mmc_rx_512_to_1023_octets_gb: 4
mmc_rx_1024_to_max_octets_gb: 0
mmc_rx_unicast_g: 30
mmc_rx_length_error: 0
mmc_rx_autofrangetype: 0
mmc_rx_pause_frames: 0
mmc_rx_fifo_overflow: 0
mmc_rx_vlan_frames_gb: 0
mmc_rx_watchdog_error: 0
mmc_rx_ipc_intr_mask: 1073692671
mmc_rx_ipc_intr: 0
mmc_rx_ipv4_gd: 26
mmc_rx_ipv4_hderr: 0
mmc_rx_ipv4_nopay: 0
mmc_rx_ipv4_frag: 0
mmc_rx_ipv4_udsbl: 0
mmc_rx_ipv4_gd_octets: 4325
mmc_rx_ipv4_hderr_octets: 0
mmc_rx_ipv4_nopay_octets: 0
mmc_rx_ipv4_frag_octets: 0
mmc_rx_ipv4_udsbl_octets: 0
mmc_rx_ipv6_gd_octets: 0
mmc_rx_ipv6_hderr_octets: 0
mmc_rx_ipv6_nopay_octets: 0
mmc_rx_ipv6_gd: 0
mmc_rx_ipv6_hderr: 0
mmc_rx_ipv6_nopay: 0
mmc_rx_udp_gd: 26
mmc_rx_udp_err: 0
mmc_rx_tcp_gd: 0
mmc_rx_tcp_err: 0
mmc_rx_icmp_gd: 0
mmc_rx_icmp_err: 0
mmc_rx_udp_gd_octets: 3805
mmc_rx_udp_err_octets: 0
mmc_rx_tcp_gd_octets: 0
mmc_rx_tcp_err_octets: 0
mmc_rx_icmp_gd_octets: 0
mmc_rx_icmp_err_octets: 0
tx_underflow: 0
tx_carrier: 0
tx_losscarrier: 0
vlan_tag: 0
tx_deferred: 0
tx_vlan: 0
tx_jabber: 0
tx_frame_flushed: 0
tx_payload_error: 0
tx_ip_header_error: 0
rx_desc: 0
sa_filter_fail: 0
overflow_error: 0
ipc_csum_error: 0
rx_collision: 0
rx_crc: 0
dribbling_bit: 0
rx_length: 0
rx_mii: 0
rx_multicast: 0
rx_gmac_overflow: 0
rx_watchdog: 0
da_rx_filter_fail: 0
sa_rx_filter_fail: 0
rx_missed_cntr: 0
rx_overflow_cntr: 0
rx_vlan: 0
tx_undeflow_irq: 0
tx_process_stopped_irq: 0
tx_jabber_irq: 0
rx_overflow_irq: 0
rx_buf_unav_irq: 0
rx_process_stopped_irq: 0
rx_watchdog_irq: 0
tx_early_irq: 0
fatal_bus_error_irq: 0
rx_early_irq: 0
threshold: 1
tx_pkt_n: 162
rx_pkt_n: 30
normal_irq_n: 31
rx_normal_irq_n: 29
napi_poll: 31
tx_normal_irq_n: 2
tx_clean: 138
tx_set_ic_bit: 2
irq_receive_pmt_irq_n: 0
mmc_tx_irq_n: 0
mmc_rx_irq_n: 0
mmc_rx_csum_offload_irq_n: 0
irq_tx_path_in_lpi_mode_n: 124
irq_tx_path_exit_lpi_mode_n: 123
irq_rx_path_in_lpi_mode_n: 0
irq_rx_path_exit_lpi_mode_n: 0
phy_eee_wakeup_error_n: 0
ip_hdr_err: 0
ip_payload_err: 0
ip_csum_bypassed: 0
ipv4_pkt_rcvd: 0
ipv6_pkt_rcvd: 0
rx_msg_type_ext_no_ptp: 0
rx_msg_type_sync: 0
rx_msg_type_follow_up: 0
rx_msg_type_delay_req: 0
rx_msg_type_delay_resp: 0
rx_msg_type_pdelay_req: 0
rx_msg_type_pdelay_resp: 0
rx_msg_type_pdelay_follow_up: 0
ptp_frame_type: 0
ptp_ver: 0
timestamp_dropped: 0
av_pkt_rcvd: 0
av_tagged_pkt_rcvd: 0
vlan_tag_priority_val: 0
l3_filter_match: 0
l4_filter_match: 0
l3_l4_filter_no_match: 0
irq_pcs_ane_n: 0
irq_pcs_link_n: 0
irq_rgmii_n: 1
mtl_tx_status_fifo_full: 0
mtl_tx_fifo_not_empty: 0
mmtl_fifo_ctrl: 0
mtl_tx_fifo_read_ctrl_write: 0
mtl_tx_fifo_read_ctrl_wait: 0
mtl_tx_fifo_read_ctrl_read: 0
mtl_tx_fifo_read_ctrl_idle: 0
mac_tx_in_pause: 0
mac_tx_frame_ctrl_xfer: 0
mac_tx_frame_ctrl_idle: 0
mac_tx_frame_ctrl_wait: 0
mac_tx_frame_ctrl_pause: 0
mac_gmii_tx_proto_engine: 0
mtl_rx_fifo_fill_level_full: 0
mtl_rx_fifo_fill_above_thresh: 0
mtl_rx_fifo_fill_below_thresh: 0
mtl_rx_fifo_fill_level_empty: 0
mtl_rx_fifo_read_ctrl_flush: 0
mtl_rx_fifo_read_ctrl_read_data: 0
mtl_rx_fifo_read_ctrl_status: 0
mtl_rx_fifo_read_ctrl_idle: 0
mtl_rx_fifo_ctrl_active: 0
mac_rx_frame_ctrl_fifo: 0
mac_gmii_rx_proto_engine: 0
tx_tso_frames: 0
tx_tso_nfrags: 0
^ permalink raw reply
* Re: [PATCH v2 net-next 07/16] tcp: track data delivery rate for a TCP connection
From: Eric Dumazet @ 2016-09-17 19:09 UTC (permalink / raw)
To: kbuild test robot
Cc: Neal Cardwell, kbuild-all, David Miller, netdev, Yuchung Cheng,
Van Jacobson, Nandita Dukkipati, Soheil Hassas Yeganeh
In-Reply-To: <201609180342.qLM70Old%fengguang.wu@intel.com>
On Sat, Sep 17, 2016 at 12:04 PM, kbuild test robot <lkp@intel.com> wrote:
> Hi Yuchung,
>
> [auto build test ERROR on net-next/master]
>
> url: https://github.com/0day-ci/linux/commits/Neal-Cardwell/tcp-BBR-congestion-control-algorithm/20160918-014058
> config: x86_64-randconfig-s2-09180225 (attached as .config)
> compiler: gcc-4.4 (Debian 4.4.7-8) 4.4.7
> reproduce:
> # save the attached .config to linux build tree
> make ARCH=x86_64
>
> All error/warnings (new ones prefixed by >>):
>
> net/ipv4/tcp_input.c: In function 'tcp_ack':
>>> net/ipv4/tcp_input.c:3559: error: unknown field 'v64' specified in initializer
>>> net/ipv4/tcp_input.c:3559: warning: missing braces around initializer
> net/ipv4/tcp_input.c:3559: warning: (near initialization for 'rs.prior_mstamp.<anonymous>')
>
> vim +/v64 +3559 net/ipv4/tcp_input.c
>
> 3553 /* This routine deals with incoming acks, but not outgoing ones. */
> 3554 static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
> 3555 {
> 3556 struct inet_connection_sock *icsk = inet_csk(sk);
> 3557 struct tcp_sock *tp = tcp_sk(sk);
> 3558 struct tcp_sacktag_state sack_state;
>> 3559 struct rate_sample rs = { .prior_mstamp.v64 = 0, .prior_delivered = 0 };
>
Arg, silly compilers out there.
We can omit prior_mstamp , as the compiler will zero fields anyway
struct rate_sample rs = { .prior_delivered = 0 };
^ permalink raw reply
* Re: [PATCH v2 net-next 07/16] tcp: track data delivery rate for a TCP connection
From: kbuild test robot @ 2016-09-17 19:04 UTC (permalink / raw)
To: Neal Cardwell
Cc: kbuild-all, David Miller, netdev, Yuchung Cheng, Van Jacobson,
Neal Cardwell, Nandita Dukkipati, Eric Dumazet,
Soheil Hassas Yeganeh
In-Reply-To: <1474133749-12895-8-git-send-email-ncardwell@google.com>
[-- Attachment #1: Type: text/plain, Size: 1432 bytes --]
Hi Yuchung,
[auto build test ERROR on net-next/master]
url: https://github.com/0day-ci/linux/commits/Neal-Cardwell/tcp-BBR-congestion-control-algorithm/20160918-014058
config: x86_64-randconfig-s2-09180225 (attached as .config)
compiler: gcc-4.4 (Debian 4.4.7-8) 4.4.7
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64
All error/warnings (new ones prefixed by >>):
net/ipv4/tcp_input.c: In function 'tcp_ack':
>> net/ipv4/tcp_input.c:3559: error: unknown field 'v64' specified in initializer
>> net/ipv4/tcp_input.c:3559: warning: missing braces around initializer
net/ipv4/tcp_input.c:3559: warning: (near initialization for 'rs.prior_mstamp.<anonymous>')
vim +/v64 +3559 net/ipv4/tcp_input.c
3553 /* This routine deals with incoming acks, but not outgoing ones. */
3554 static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3555 {
3556 struct inet_connection_sock *icsk = inet_csk(sk);
3557 struct tcp_sock *tp = tcp_sk(sk);
3558 struct tcp_sacktag_state sack_state;
> 3559 struct rate_sample rs = { .prior_mstamp.v64 = 0, .prior_delivered = 0 };
3560 u32 prior_snd_una = tp->snd_una;
3561 u32 ack_seq = TCP_SKB_CB(skb)->seq;
3562 u32 ack = TCP_SKB_CB(skb)->ack_seq;
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 30692 bytes --]
^ permalink raw reply
* [PATCH v2 net-next 16/16] tcp_bbr: add BBR congestion control
From: Neal Cardwell @ 2016-09-17 17:35 UTC (permalink / raw)
To: David Miller
Cc: netdev, Neal Cardwell, Van Jacobson, Yuchung Cheng,
Nandita Dukkipati, Eric Dumazet, Soheil Hassas Yeganeh
In-Reply-To: <1474133749-12895-1-git-send-email-ncardwell@google.com>
This commit implements a new TCP congestion control algorithm: BBR
(Bottleneck Bandwidth and RTT). A detailed description of BBR will be
published in ACM Queue, Vol. 14 No. 5, September-October 2016, as
"BBR: Congestion-Based Congestion Control".
BBR has significantly increased throughput and reduced latency for
connections on Google's internal backbone networks and google.com and
YouTube Web servers.
BBR requires only changes on the sender side, not in the network or
the receiver side. Thus it can be incrementally deployed on today's
Internet, or in datacenters.
The Internet has predominantly used loss-based congestion control
(largely Reno or CUBIC) since the 1980s, relying on packet loss as the
signal to slow down. While this worked well for many years, loss-based
congestion control is unfortunately out-dated in today's networks. On
today's Internet, loss-based congestion control causes the infamous
bufferbloat problem, often causing seconds of needless queuing delay,
since it fills the bloated buffers in many last-mile links. On today's
high-speed long-haul links using commodity switches with shallow
buffers, loss-based congestion control has abysmal throughput because
it over-reacts to losses caused by transient traffic bursts.
In 1981 Kleinrock and Gale showed that the optimal operating point for
a network maximizes delivered bandwidth while minimizing delay and
loss, not only for single connections but for the network as a
whole. Finding that optimal operating point has been elusive, since
any single network measurement is ambiguous: network measurements are
the result of both bandwidth and propagation delay, and those two
cannot be measured simultaneously.
While it is impossible to disambiguate any single bandwidth or RTT
measurement, a connection's behavior over time tells a clearer
story. BBR uses a measurement strategy designed to resolve this
ambiguity. It combines these measurements with a robust servo loop
using recent control systems advances to implement a distributed
congestion control algorithm that reacts to actual congestion, not
packet loss or transient queue delay, and is designed to converge with
high probability to a point near the optimal operating point.
In a nutshell, BBR creates an explicit model of the network pipe by
sequentially probing the bottleneck bandwidth and RTT. On the arrival
of each ACK, BBR derives the current delivery rate of the last round
trip, and feeds it through a windowed max-filter to estimate the
bottleneck bandwidth. Conversely it uses a windowed min-filter to
estimate the round trip propagation delay. The max-filtered bandwidth
and min-filtered RTT estimates form BBR's model of the network pipe.
Using its model, BBR sets control parameters to govern sending
behavior. The primary control is the pacing rate: BBR applies a gain
multiplier to transmit faster or slower than the observed bottleneck
bandwidth. The conventional congestion window (cwnd) is now the
secondary control; the cwnd is set to a small multiple of the
estimated BDP (bandwidth-delay product) in order to allow full
utilization and bandwidth probing while bounding the potential amount
of queue at the bottleneck.
When a BBR connection starts, it enters STARTUP mode and applies a
high gain to perform an exponential search to quickly probe the
bottleneck bandwidth (doubling its sending rate each round trip, like
slow start). However, instead of continuing until it fills up the
buffer (i.e. a loss), or until delay or ACK spacing reaches some
threshold (like Hystart), it uses its model of the pipe to estimate
when that pipe is full: it estimates the pipe is full when it notices
the estimated bandwidth has stopped growing. At that point it exits
STARTUP and enters DRAIN mode, where it reduces its pacing rate to
drain the queue it estimates it has created.
Then BBR enters steady state. In steady state, PROBE_BW mode cycles
between first pacing faster to probe for more bandwidth, then pacing
slower to drain any queue that created if no more bandwidth was
available, and then cruising at the estimated bandwidth to utilize the
pipe without creating excess queue. Occasionally, on an as-needed
basis, it sends significantly slower to probe for RTT (PROBE_RTT
mode).
Our long-term goal is to improve the congestion control algorithms
used on the Internet. We are hopeful that BBR can help advance the
efforts toward this goal, and motivate the community to do further
research.
Test results, performance evaluations, feedback, and BBR-related
discussions are very welcome in the public e-mail list for BBR:
https://groups.google.com/forum/#!forum/bbr-dev
Signed-off-by: Van Jacobson <vanj@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
---
include/uapi/linux/inet_diag.h | 13 +
net/ipv4/Kconfig | 18 +
net/ipv4/Makefile | 1 +
net/ipv4/tcp_bbr.c | 875 +++++++++++++++++++++++++++++++++++++++++
4 files changed, 907 insertions(+)
create mode 100644 net/ipv4/tcp_bbr.c
diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index b5c366f..509cd96 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -124,6 +124,7 @@ enum {
INET_DIAG_PEERS,
INET_DIAG_PAD,
INET_DIAG_MARK,
+ INET_DIAG_BBRINFO,
__INET_DIAG_MAX,
};
@@ -157,8 +158,20 @@ struct tcp_dctcp_info {
__u32 dctcp_ab_tot;
};
+/* INET_DIAG_BBRINFO */
+
+struct tcp_bbr_info {
+ /* u64 bw: max-filtered BW (app throughput) estimate in Byte per sec: */
+ __u32 bbr_bw_lo; /* lower 32 bits of bw */
+ __u32 bbr_bw_hi; /* upper 32 bits of bw */
+ __u32 bbr_min_rtt; /* min-filtered RTT in uSec */
+ __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */
+ __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */
+};
+
union tcp_cc_info {
struct tcpvegas_info vegas;
struct tcp_dctcp_info dctcp;
+ struct tcp_bbr_info bbr;
};
#endif /* _UAPI_INET_DIAG_H_ */
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 50d6a9b..300b068 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -640,6 +640,21 @@ config TCP_CONG_CDG
D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg
+config TCP_CONG_BBR
+ tristate "BBR TCP"
+ default n
+ ---help---
+
+ BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
+ maximize network utilization and minimize queues. It builds an explicit
+ model of the the bottleneck delivery rate and path round-trip
+ propagation delay. It tolerates packet loss and delay unrelated to
+ congestion. It can operate over LAN, WAN, cellular, wifi, or cable
+ modem links. It can coexist with flows that use loss-based congestion
+ control, and can operate with shallow buffers, deep buffers,
+ bufferbloat, policers, or AQM schemes that do not provide a delay
+ signal. It requires the fq ("Fair Queue") pacing packet scheduler.
+
choice
prompt "Default TCP congestion control"
default DEFAULT_CUBIC
@@ -674,6 +689,9 @@ choice
config DEFAULT_CDG
bool "CDG" if TCP_CONG_CDG=y
+ config DEFAULT_BBR
+ bool "BBR" if TCP_CONG_BBR=y
+
config DEFAULT_RENO
bool "Reno"
endchoice
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 9cfff1a..bc6a6c8 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -41,6 +41,7 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o
obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
+obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
new file mode 100644
index 0000000..34cde81
--- /dev/null
+++ b/net/ipv4/tcp_bbr.c
@@ -0,0 +1,875 @@
+/* Bottleneck Bandwidth and RTT (BBR) congestion control
+ *
+ * BBR congestion control computes the sending rate based on the delivery
+ * rate (throughput) estimated from ACKs. In a nutshell:
+ *
+ * On each ACK, update our model of the network path:
+ * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips)
+ * min_rtt = windowed_min(rtt, 10 seconds)
+ * pacing_rate = pacing_gain * bottleneck_bandwidth
+ * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4)
+ *
+ * The core algorithm does not react directly to packet losses or delays,
+ * although BBR may adjust the size of next send per ACK when loss is
+ * observed, or adjust the sending rate if it estimates there is a
+ * traffic policer, in order to keep the drop rate reasonable.
+ *
+ * BBR is described in detail in:
+ * "BBR: Congestion-Based Congestion Control",
+ * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh,
+ * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016.
+ *
+ * There is a public e-mail list for discussing BBR development and testing:
+ * https://groups.google.com/forum/#!forum/bbr-dev
+ *
+ * NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled,
+ * since pacing is integral to the BBR design and implementation.
+ * BBR without pacing would not function properly, and may incur unnecessary
+ * high packet loss rates.
+ */
+#include <linux/module.h>
+#include <net/tcp.h>
+#include <linux/inet_diag.h>
+#include <linux/inet.h>
+#include <linux/random.h>
+#include <linux/win_minmax.h>
+
+/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth
+ * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps.
+ * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32.
+ * Since the minimum window is >=4 packets, the lower bound isn't
+ * an issue. The upper bound isn't an issue with existing technologies.
+ */
+#define BW_SCALE 24
+#define BW_UNIT (1 << BW_SCALE)
+
+#define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. gains) */
+#define BBR_UNIT (1 << BBR_SCALE)
+
+/* BBR has the following modes for deciding how fast to send: */
+enum bbr_mode {
+ BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */
+ BBR_DRAIN, /* drain any queue created during startup */
+ BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */
+ BBR_PROBE_RTT, /* cut cwnd to min to probe min_rtt */
+};
+
+/* BBR congestion control block */
+struct bbr {
+ u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */
+ u32 min_rtt_stamp; /* timestamp of min_rtt_us */
+ u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */
+ struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */
+ u32 rtt_cnt; /* count of packet-timed rounds elapsed */
+ u32 next_rtt_delivered; /* scb->tx.delivered at end of round */
+ struct skb_mstamp cycle_mstamp; /* time of this cycle phase start */
+ u32 mode:3, /* current bbr_mode in state machine */
+ prev_ca_state:3, /* CA state on previous ACK */
+ packet_conservation:1, /* use packet conservation? */
+ restore_cwnd:1, /* decided to revert cwnd to old value */
+ round_start:1, /* start of packet-timed tx->ack round? */
+ tso_segs_goal:7, /* segments we want in each skb we send */
+ idle_restart:1, /* restarting after idle? */
+ probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */
+ unused:5,
+ lt_is_sampling:1, /* taking long-term ("LT") samples now? */
+ lt_rtt_cnt:7, /* round trips in long-term interval */
+ lt_use_bw:1; /* use lt_bw as our bw estimate? */
+ u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */
+ u32 lt_last_delivered; /* LT intvl start: tp->delivered */
+ u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */
+ u32 lt_last_lost; /* LT intvl start: tp->lost */
+ u32 pacing_gain:10, /* current gain for setting pacing rate */
+ cwnd_gain:10, /* current gain for setting cwnd */
+ full_bw_cnt:3, /* number of rounds without large bw gains */
+ cycle_idx:3, /* current index in pacing_gain cycle array */
+ unused_b:6;
+ u32 prior_cwnd; /* prior cwnd upon entering loss recovery */
+ u32 full_bw; /* recent bw, to estimate if pipe is full */
+};
+
+#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */
+
+static int bbr_bw_rtts = CYCLE_LEN + 2; /* win len of bw filter (in rounds) */
+static u32 bbr_min_rtt_win_sec = 10; /* min RTT filter window (in sec) */
+static u32 bbr_probe_rtt_mode_ms = 200; /* min ms at cwnd=4 in BBR_PROBE_RTT */
+static int bbr_min_tso_rate = 1200000; /* skip TSO below here (bits/sec) */
+
+/* We use a high_gain value chosen to allow a smoothly increasing pacing rate
+ * that will double each RTT and send the same number of packets per RTT that
+ * an un-paced, slow-starting Reno or CUBIC flow would.
+ */
+static int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; /* 2/ln(2) */
+static int bbr_drain_gain = BBR_UNIT * 1000 / 2885; /* 1/high_gain */
+static int bbr_cwnd_gain = BBR_UNIT * 2; /* gain for steady-state cwnd */
+/* The pacing_gain values for the PROBE_BW gain cycle: */
+static int bbr_pacing_gain[] = { BBR_UNIT * 5 / 4, BBR_UNIT * 3 / 4,
+ BBR_UNIT, BBR_UNIT, BBR_UNIT,
+ BBR_UNIT, BBR_UNIT, BBR_UNIT };
+static u32 bbr_cycle_rand = 7; /* randomize gain cycling phase over N phases */
+
+/* Try to keep at least this many packets in flight, if things go smoothly. For
+ * smooth functioning, a sliding window protocol ACKing every other packet
+ * needs at least 4 packets in flight.
+ */
+static u32 bbr_cwnd_min_target = 4;
+
+/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe. */
+static u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; /* bw up 1.25x per round? */
+static u32 bbr_full_bw_cnt = 3; /* N rounds w/o bw growth -> pipe full */
+
+/* "long-term" ("LT") bandwidth estimator parameters: */
+static bool bbr_lt_bw_estimator = true; /* use the long-term bw estimate? */
+static u32 bbr_lt_intvl_min_rtts = 4; /* min rounds in sampling interval */
+static u32 bbr_lt_loss_thresh = 50; /* lost/delivered > 20% -> "lossy" */
+static u32 bbr_lt_conv_thresh = BBR_UNIT / 8; /* bw diff <= 12.5% -> "close" */
+static u32 bbr_lt_bw_max_rtts = 48; /* max # of round trips using lt_bw */
+
+/* Do we estimate that STARTUP filled the pipe? */
+static bool bbr_full_bw_reached(const struct sock *sk)
+{
+ const struct bbr *bbr = inet_csk_ca(sk);
+
+ return bbr->full_bw_cnt >= bbr_full_bw_cnt;
+}
+
+/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */
+static u32 bbr_max_bw(const struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ return minmax_get(&bbr->bw);
+}
+
+/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */
+static u32 bbr_bw(const struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk);
+}
+
+/* Return rate in bytes per second, optionally with a gain.
+ * The order here is chosen carefully to avoid overflow of u64. This should
+ * work for input rates of up to 2.9Tbit/sec and gain of 2.89x.
+ */
+static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
+{
+ rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache);
+ rate *= gain;
+ rate >>= BBR_SCALE;
+ rate *= USEC_PER_SEC;
+ return rate >> BW_SCALE;
+}
+
+static u64 bbr_rate_kbps(struct sock *sk, u64 rate)
+{
+ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT) * 8 / 1000;
+}
+
+/* Pace using current bw estimate and a gain factor. */
+static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 rate = bw;
+
+ rate = bbr_rate_bytes_per_sec(sk, rate, gain);
+ rate = min_t(u64, rate, sk->sk_max_pacing_rate);
+ if (bbr->mode != BBR_STARTUP || rate > sk->sk_pacing_rate)
+ sk->sk_pacing_rate = rate;
+}
+
+/* Return count of segments we want in the skbs we send, or 0 for default. */
+static u32 bbr_tso_segs_goal(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ return bbr->tso_segs_goal;
+}
+
+static void bbr_set_tso_segs_goal(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 min_segs;
+
+ min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
+ bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs),
+ 0x7FU);
+}
+
+/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
+static void bbr_save_cwnd(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT)
+ bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */
+ else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */
+ bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd);
+}
+
+static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (event == CA_EVENT_TX_START && tp->app_limited) {
+ bbr->idle_restart = 1;
+ /* Avoid pointless buffer overflows: pace at est. bw if we don't
+ * need more speed (we're restarting from idle and app-limited).
+ */
+ if (bbr->mode == BBR_PROBE_BW)
+ bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT);
+ }
+}
+
+/* Find target cwnd. Right-size the cwnd based on min RTT and the
+ * estimated bottleneck bandwidth:
+ *
+ * cwnd = bw * min_rtt * gain = BDP * gain
+ *
+ * The key factor, gain, controls the amount of queue. While a small gain
+ * builds a smaller queue, it becomes more vulnerable to noise in RTT
+ * measurements (e.g., delayed ACKs or other ACK compression effects). This
+ * noise may cause BBR to under-estimate the rate.
+ *
+ * To achieve full performance in high-speed paths, we budget enough cwnd to
+ * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
+ * - one skb in sending host Qdisc,
+ * - one skb in sending host TSO/GSO engine
+ * - one skb being received by receiver host LRO/GRO/delayed-ACK engine
+ * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
+ * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
+ * which allows 2 outstanding 2-packet sequences, to try to keep pipe
+ * full even with ACK-every-other-packet delayed ACKs.
+ */
+static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 cwnd;
+ u64 w;
+
+ /* If we've never had a valid RTT sample, cap cwnd at the initial
+ * default. This should only happen when the connection is not using TCP
+ * timestamps and has retransmitted all of the SYN/SYNACK/data packets
+ * ACKed so far. In this case, an RTO can cut cwnd to 1, in which
+ * case we need to slow-start up toward something safe: TCP_INIT_CWND.
+ */
+ if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */
+ return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/
+
+ w = (u64)bw * bbr->min_rtt_us;
+
+ /* Apply a gain to the given value, then remove the BW_SCALE shift. */
+ cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
+
+ /* Allow enough full-sized skbs in flight to utilize end systems. */
+ cwnd += 3 * bbr->tso_segs_goal;
+
+ /* Reduce delayed ACKs by rounding up cwnd to the next even number. */
+ cwnd = (cwnd + 1) & ~1U;
+
+ return cwnd;
+}
+
+/* An optimization in BBR to reduce losses: On the first round of recovery, we
+ * follow the packet conservation principle: send P packets per P packets acked.
+ * After that, we slow-start and send at most 2*P packets per P packets acked.
+ * After recovery finishes, or upon undo, we restore the cwnd we had when
+ * recovery started (capped by the target cwnd based on estimated BDP).
+ *
+ * TODO(ycheng/ncardwell): implement a rate-based approach.
+ */
+static bool bbr_set_cwnd_to_recover_or_restore(
+ struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state;
+ u32 cwnd = tp->snd_cwnd;
+
+ /* An ACK for P pkts should release at most 2*P packets. We do this
+ * in two steps. First, here we deduct the number of lost packets.
+ * Then, in bbr_set_cwnd() we slow start up toward the target cwnd.
+ */
+ if (rs->losses > 0)
+ cwnd = max_t(s32, cwnd - rs->losses, 1);
+
+ if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) {
+ /* Starting 1st round of Recovery, so do packet conservation. */
+ bbr->packet_conservation = 1;
+ bbr->next_rtt_delivered = tp->delivered; /* start round now */
+ /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */
+ cwnd = tcp_packets_in_flight(tp) + acked;
+ } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) {
+ /* Exiting loss recovery; restore cwnd saved before recovery. */
+ bbr->restore_cwnd = 1;
+ bbr->packet_conservation = 0;
+ }
+ bbr->prev_ca_state = state;
+
+ if (bbr->restore_cwnd) {
+ /* Restore cwnd after exiting loss recovery or PROBE_RTT. */
+ cwnd = max(cwnd, bbr->prior_cwnd);
+ bbr->restore_cwnd = 0;
+ }
+
+ if (bbr->packet_conservation) {
+ *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked);
+ return true; /* yes, using packet conservation */
+ }
+ *new_cwnd = cwnd;
+ return false;
+}
+
+/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss
+ * has drawn us down below target), or snap down to target if we're above it.
+ */
+static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
+ u32 acked, u32 bw, int gain)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 cwnd = 0, target_cwnd = 0;
+
+ if (!acked)
+ return;
+
+ if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd))
+ goto done;
+
+ /* If we're below target cwnd, slow start cwnd toward target cwnd. */
+ target_cwnd = bbr_target_cwnd(sk, bw, gain);
+ if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */
+ cwnd = min(cwnd + acked, target_cwnd);
+ else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND)
+ cwnd = cwnd + acked;
+ cwnd = max(cwnd, bbr_cwnd_min_target);
+
+done:
+ tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */
+ if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */
+ tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target);
+}
+
+/* End cycle phase if it's time and/or we hit the phase's in-flight target. */
+static bool bbr_is_next_cycle_phase(struct sock *sk,
+ const struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ bool is_full_length =
+ skb_mstamp_us_delta(&tp->delivered_mstamp, &bbr->cycle_mstamp) >
+ bbr->min_rtt_us;
+ u32 inflight, bw;
+
+ /* The pacing_gain of 1.0 paces at the estimated bw to try to fully
+ * use the pipe without increasing the queue.
+ */
+ if (bbr->pacing_gain == BBR_UNIT)
+ return is_full_length; /* just use wall clock time */
+
+ inflight = rs->prior_in_flight; /* what was in-flight before ACK? */
+ bw = bbr_max_bw(sk);
+
+ /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
+ * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is
+ * small (e.g. on a LAN). We do not persist if packets are lost, since
+ * a path with small buffers may not hold that much.
+ */
+ if (bbr->pacing_gain > BBR_UNIT)
+ return is_full_length &&
+ (rs->losses || /* perhaps pacing_gain*BDP won't fit */
+ inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain));
+
+ /* A pacing_gain < 1.0 tries to drain extra queue we added if bw
+ * probing didn't find more bw. If inflight falls to match BDP then we
+ * estimate queue is drained; persisting would underutilize the pipe.
+ */
+ return is_full_length ||
+ inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT);
+}
+
+static void bbr_advance_cycle_phase(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1);
+ bbr->cycle_mstamp = tp->delivered_mstamp;
+ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx];
+}
+
+/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */
+static void bbr_update_cycle_phase(struct sock *sk,
+ const struct rate_sample *rs)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw &&
+ bbr_is_next_cycle_phase(sk, rs))
+ bbr_advance_cycle_phase(sk);
+}
+
+static void bbr_reset_startup_mode(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->mode = BBR_STARTUP;
+ bbr->pacing_gain = bbr_high_gain;
+ bbr->cwnd_gain = bbr_high_gain;
+}
+
+static void bbr_reset_probe_bw_mode(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->mode = BBR_PROBE_BW;
+ bbr->pacing_gain = BBR_UNIT;
+ bbr->cwnd_gain = bbr_cwnd_gain;
+ bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand);
+ bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */
+}
+
+static void bbr_reset_mode(struct sock *sk)
+{
+ if (!bbr_full_bw_reached(sk))
+ bbr_reset_startup_mode(sk);
+ else
+ bbr_reset_probe_bw_mode(sk);
+}
+
+/* Start a new long-term sampling interval. */
+static void bbr_reset_lt_bw_sampling_interval(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->lt_last_stamp = tp->delivered_mstamp.stamp_jiffies;
+ bbr->lt_last_delivered = tp->delivered;
+ bbr->lt_last_lost = tp->lost;
+ bbr->lt_rtt_cnt = 0;
+}
+
+/* Completely reset long-term bandwidth sampling. */
+static void bbr_reset_lt_bw_sampling(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->lt_bw = 0;
+ bbr->lt_use_bw = 0;
+ bbr->lt_is_sampling = false;
+ bbr_reset_lt_bw_sampling_interval(sk);
+}
+
+/* Long-term bw sampling interval is done. Estimate whether we're policed. */
+static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 diff;
+
+ if (bbr->lt_bw && /* do we have bw from a previous interval? */
+ bbr_lt_bw_estimator) { /* using long-term bw estimator enabled? */
+ /* Is new bw close to the lt_bw from the previous interval? */
+ diff = abs(bw - bbr->lt_bw);
+ if ((diff * BBR_UNIT <= bbr_lt_conv_thresh * bbr->lt_bw) ||
+ (bbr_rate_kbps(sk, diff) <= 4)) { /* diff <= 4 Kbit/sec? */
+ /* All criteria are met; estimate we're policed. */
+ bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */
+ bbr->lt_use_bw = 1;
+ bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */
+ bbr->lt_rtt_cnt = 0;
+ return;
+ }
+ }
+ bbr->lt_bw = bw;
+ bbr_reset_lt_bw_sampling_interval(sk);
+}
+
+/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of
+ * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and
+ * explicitly models their policed rate, to reduce unnecessary losses. We
+ * estimate that we're policed if we see 2 consecutive sampling intervals with
+ * consistent throughput and high packet loss. If we think we're being policed,
+ * set lt_bw to the "long-term" average delivery rate from those 2 intervals.
+ */
+static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 lost, delivered;
+ u64 bw;
+ s32 t;
+
+ if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */
+ if (bbr->mode == BBR_PROBE_BW && bbr->round_start &&
+ ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) {
+ bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */
+ bbr_reset_probe_bw_mode(sk); /* restart gain cycling */
+ }
+ return;
+ }
+
+ /* Wait for the first loss before sampling, to let the policer exhaust
+ * its tokens and estimate the steady-state rate allowed by the policer.
+ * Starting samples earlier includes bursts that over-estimate the bw.
+ */
+ if (!bbr->lt_is_sampling) {
+ if (!rs->losses)
+ return;
+ bbr_reset_lt_bw_sampling_interval(sk);
+ bbr->lt_is_sampling = true;
+ }
+
+ /* To avoid underestimates, reset sampling if we run out of data. */
+ if (rs->is_app_limited) {
+ bbr_reset_lt_bw_sampling(sk);
+ return;
+ }
+
+ if (bbr->round_start)
+ bbr->lt_rtt_cnt++; /* count round trips in this interval */
+ if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts)
+ return; /* sampling interval needs to be longer */
+ if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) {
+ bbr_reset_lt_bw_sampling(sk); /* interval is too long */
+ return;
+ }
+
+ /* End sampling interval when a packet is lost, so we estimate the
+ * policer tokens were exhausted. Stopping the sampling before the
+ * tokens are exhausted under-estimates the policed rate.
+ */
+ if (!rs->losses)
+ return;
+
+ /* Calculate packets lost and delivered in sampling interval. */
+ lost = tp->lost - bbr->lt_last_lost;
+ delivered = tp->delivered - bbr->lt_last_delivered;
+ /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */
+ if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered)
+ return;
+
+ /* Find average delivery rate in this sampling interval. */
+ t = (s32)(tp->delivered_mstamp.stamp_jiffies - bbr->lt_last_stamp);
+ if (t < 1)
+ return; /* interval is less than one jiffy, so wait */
+ t = jiffies_to_usecs(t);
+ /* Interval long enough for jiffies_to_usecs() to return a bogus 0? */
+ if (t < 1) {
+ bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */
+ return;
+ }
+ bw = (u64)delivered * BW_UNIT;
+ do_div(bw, t);
+ bbr_lt_bw_interval_done(sk, bw);
+}
+
+/* Estimate the bandwidth based on how fast packets are delivered */
+static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 bw;
+
+ bbr->round_start = 0;
+ if (rs->delivered < 0 || rs->interval_us <= 0)
+ return; /* Not a valid observation */
+
+ /* See if we've reached the next RTT */
+ if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) {
+ bbr->next_rtt_delivered = tp->delivered;
+ bbr->rtt_cnt++;
+ bbr->round_start = 1;
+ bbr->packet_conservation = 0;
+ }
+
+ bbr_lt_bw_sampling(sk, rs);
+
+ /* Divide delivered by the interval to find a (lower bound) bottleneck
+ * bandwidth sample. Delivered is in packets and interval_us in uS and
+ * ratio will be <<1 for most connections. So delivered is first scaled.
+ */
+ bw = (u64)rs->delivered * BW_UNIT;
+ do_div(bw, rs->interval_us);
+
+ /* If this sample is application-limited, it is likely to have a very
+ * low delivered count that represents application behavior rather than
+ * the available network rate. Such a sample could drag down estimated
+ * bw, causing needless slow-down. Thus, to continue to send at the
+ * last measured network rate, we filter out app-limited samples unless
+ * they describe the path bw at least as well as our bw model.
+ *
+ * So the goal during app-limited phase is to proceed with the best
+ * network rate no matter how long. We automatically leave this
+ * phase when app writes faster than the network can deliver :)
+ */
+ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) {
+ /* Incorporate new sample into our max bw filter. */
+ minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw);
+ }
+}
+
+/* Estimate when the pipe is full, using the change in delivery rate: BBR
+ * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by
+ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
+ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the
+ * higher rwin, 3: we get higher delivery rate samples. Or transient
+ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar
+ * design goal, but uses delay and inter-ACK spacing instead of bandwidth.
+ */
+static void bbr_check_full_bw_reached(struct sock *sk,
+ const struct rate_sample *rs)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 bw_thresh;
+
+ if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited)
+ return;
+
+ bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE;
+ if (bbr_max_bw(sk) >= bw_thresh) {
+ bbr->full_bw = bbr_max_bw(sk);
+ bbr->full_bw_cnt = 0;
+ return;
+ }
+ ++bbr->full_bw_cnt;
+}
+
+/* If pipe is probably full, drain the queue and then enter steady-state. */
+static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
+ bbr->mode = BBR_DRAIN; /* drain queue we created */
+ bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */
+ bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */
+ } /* fall through to check if in-flight is already small: */
+ if (bbr->mode == BBR_DRAIN &&
+ tcp_packets_in_flight(tcp_sk(sk)) <=
+ bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT))
+ bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */
+}
+
+/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and
+ * periodically drain the bottleneck queue, to converge to measure the true
+ * min_rtt (unloaded propagation delay). This allows the flows to keep queues
+ * small (reducing queuing delay and packet loss) and achieve fairness among
+ * BBR flows.
+ *
+ * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires,
+ * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets.
+ * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed
+ * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and
+ * re-enter the previous mode. BBR uses 200ms to approximately bound the
+ * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s).
+ *
+ * Note that flows need only pay 2% if they are busy sending over the last 10
+ * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have
+ * natural silences or low-rate periods within 10 seconds where the rate is low
+ * enough for long enough to drain its queue in the bottleneck. We pick up
+ * these min RTT measurements opportunistically with our min_rtt filter. :-)
+ */
+static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ bool filter_expired;
+
+ /* Track min RTT seen in the min_rtt_win_sec filter window: */
+ filter_expired = after(tcp_time_stamp,
+ bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
+ if (rs->rtt_us >= 0 &&
+ (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) {
+ bbr->min_rtt_us = rs->rtt_us;
+ bbr->min_rtt_stamp = tcp_time_stamp;
+ }
+
+ if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
+ !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
+ bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */
+ bbr->pacing_gain = BBR_UNIT;
+ bbr->cwnd_gain = BBR_UNIT;
+ bbr_save_cwnd(sk); /* note cwnd so we can restore it */
+ bbr->probe_rtt_done_stamp = 0;
+ }
+
+ if (bbr->mode == BBR_PROBE_RTT) {
+ /* Ignore low rate samples during this mode. */
+ tp->app_limited =
+ (tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
+ /* Maintain min packets in flight for max(200 ms, 1 round). */
+ if (!bbr->probe_rtt_done_stamp &&
+ tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) {
+ bbr->probe_rtt_done_stamp = tcp_time_stamp +
+ msecs_to_jiffies(bbr_probe_rtt_mode_ms);
+ bbr->probe_rtt_round_done = 0;
+ bbr->next_rtt_delivered = tp->delivered;
+ } else if (bbr->probe_rtt_done_stamp) {
+ if (bbr->round_start)
+ bbr->probe_rtt_round_done = 1;
+ if (bbr->probe_rtt_round_done &&
+ after(tcp_time_stamp, bbr->probe_rtt_done_stamp)) {
+ bbr->min_rtt_stamp = tcp_time_stamp;
+ bbr->restore_cwnd = 1; /* snap to prior_cwnd */
+ bbr_reset_mode(sk);
+ }
+ }
+ }
+ bbr->idle_restart = 0;
+}
+
+static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
+{
+ bbr_update_bw(sk, rs);
+ bbr_update_cycle_phase(sk, rs);
+ bbr_check_full_bw_reached(sk, rs);
+ bbr_check_drain(sk, rs);
+ bbr_update_min_rtt(sk, rs);
+}
+
+static void bbr_main(struct sock *sk, const struct rate_sample *rs)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 bw;
+
+ bbr_update_model(sk, rs);
+
+ bw = bbr_bw(sk);
+ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain);
+ bbr_set_tso_segs_goal(sk);
+ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain);
+}
+
+static void bbr_init(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 bw;
+
+ bbr->prior_cwnd = 0;
+ bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */
+ bbr->rtt_cnt = 0;
+ bbr->next_rtt_delivered = 0;
+ bbr->prev_ca_state = TCP_CA_Open;
+ bbr->packet_conservation = 0;
+
+ bbr->probe_rtt_done_stamp = 0;
+ bbr->probe_rtt_round_done = 0;
+ bbr->min_rtt_us = tcp_min_rtt(tp);
+ bbr->min_rtt_stamp = tcp_time_stamp;
+
+ minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */
+
+ /* Initialize pacing rate to: high_gain * init_cwnd / RTT. */
+ bw = (u64)tp->snd_cwnd * BW_UNIT;
+ do_div(bw, (tp->srtt_us >> 3) ? : USEC_PER_MSEC);
+ sk->sk_pacing_rate = 0; /* force an update of sk_pacing_rate */
+ bbr_set_pacing_rate(sk, bw, bbr_high_gain);
+
+ bbr->restore_cwnd = 0;
+ bbr->round_start = 0;
+ bbr->idle_restart = 0;
+ bbr->full_bw = 0;
+ bbr->full_bw_cnt = 0;
+ bbr->cycle_mstamp.v64 = 0;
+ bbr->cycle_idx = 0;
+ bbr_reset_lt_bw_sampling(sk);
+ bbr_reset_startup_mode(sk);
+}
+
+static u32 bbr_sndbuf_expand(struct sock *sk)
+{
+ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */
+ return 3;
+}
+
+/* In theory BBR does not need to undo the cwnd since it does not
+ * always reduce cwnd on losses (see bbr_main()). Keep it for now.
+ */
+static u32 bbr_undo_cwnd(struct sock *sk)
+{
+ return tcp_sk(sk)->snd_cwnd;
+}
+
+/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */
+static u32 bbr_ssthresh(struct sock *sk)
+{
+ bbr_save_cwnd(sk);
+ return TCP_INFINITE_SSTHRESH; /* BBR does not use ssthresh */
+}
+
+static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
+{
+ if (ext & (1 << (INET_DIAG_BBRINFO - 1)) ||
+ ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 bw = bbr_bw(sk);
+
+ bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE;
+ memset(&info->bbr, 0, sizeof(info->bbr));
+ info->bbr.bbr_bw_lo = (u32)bw;
+ info->bbr.bbr_bw_hi = (u32)(bw >> 32);
+ info->bbr.bbr_min_rtt = bbr->min_rtt_us;
+ info->bbr.bbr_pacing_gain = bbr->pacing_gain;
+ info->bbr.bbr_cwnd_gain = bbr->cwnd_gain;
+ *attr = INET_DIAG_BBRINFO;
+ return sizeof(info->bbr);
+ }
+ return 0;
+}
+
+static void bbr_set_state(struct sock *sk, u8 new_state)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (new_state == TCP_CA_Loss) {
+ struct rate_sample rs = { .prior_mstamp.v64 = 0, .losses = 1 };
+
+ bbr->prev_ca_state = TCP_CA_Loss;
+ bbr->full_bw = 0;
+ bbr->round_start = 1; /* treat RTO like end of a round */
+ bbr_lt_bw_sampling(sk, &rs);
+ }
+}
+
+static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
+ .flags = TCP_CONG_NON_RESTRICTED,
+ .name = "bbr",
+ .owner = THIS_MODULE,
+ .init = bbr_init,
+ .cong_control = bbr_main,
+ .sndbuf_expand = bbr_sndbuf_expand,
+ .undo_cwnd = bbr_undo_cwnd,
+ .cwnd_event = bbr_cwnd_event,
+ .ssthresh = bbr_ssthresh,
+ .tso_segs_goal = bbr_tso_segs_goal,
+ .get_info = bbr_get_info,
+ .set_state = bbr_set_state,
+};
+
+static int __init bbr_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&tcp_bbr_cong_ops);
+}
+
+static void __exit bbr_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_bbr_cong_ops);
+}
+
+module_init(bbr_register);
+module_exit(bbr_unregister);
+
+MODULE_AUTHOR("Van Jacobson <vanj@google.com>");
+MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>");
+MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>");
+MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)");
--
2.8.0.rc3.226.g39d4020
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox