* [PATCH net-next 1/9] rxrpc: Don't store the rxrpc header in the Tx queue sk_buffs
From: David Howells @ 2016-09-22 0:39 UTC (permalink / raw)
To: netdev; +Cc: dhowells, linux-afs, linux-kernel
In-Reply-To: <147450474784.14691.229861132515739820.stgit@warthog.procyon.org.uk>
Don't store the rxrpc protocol header in sk_buffs on the transmit queue,
but rather generate it on the fly and pass it to kernel_sendmsg() as a
separate iov. This reduces the amount of storage required.
Note that the security header is still stored in the sk_buff as it may get
encrypted along with the data (and doesn't change with each transmission).
Signed-off-by: David Howells <dhowells@redhat.com>
---
net/rxrpc/ar-internal.h | 5 +--
net/rxrpc/call_event.c | 11 +-----
net/rxrpc/conn_object.c | 1 -
net/rxrpc/output.c | 83 ++++++++++++++++++++++++++++++++---------------
net/rxrpc/rxkad.c | 8 ++---
net/rxrpc/sendmsg.c | 51 +++++------------------------
6 files changed, 71 insertions(+), 88 deletions(-)
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 034f525f2235..f021df4a6a22 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -385,10 +385,9 @@ struct rxrpc_connection {
int debug_id; /* debug ID for printks */
atomic_t serial; /* packet serial number counter */
unsigned int hi_serial; /* highest serial number received */
+ u32 security_nonce; /* response re-use preventer */
u8 size_align; /* data size alignment (for security) */
- u8 header_size; /* rxrpc + security header size */
u8 security_size; /* security header size */
- u32 security_nonce; /* response re-use preventer */
u8 security_ix; /* security type */
u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */
};
@@ -946,7 +945,7 @@ extern const s8 rxrpc_ack_priority[];
* output.c
*/
int rxrpc_send_call_packet(struct rxrpc_call *, u8);
-int rxrpc_send_data_packet(struct rxrpc_connection *, struct sk_buff *);
+int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *);
void rxrpc_reject_packets(struct rxrpc_local *);
/*
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index 7d1b99824ed9..6247ce25eb21 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -139,7 +139,6 @@ void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
*/
static void rxrpc_resend(struct rxrpc_call *call)
{
- struct rxrpc_wire_header *whdr;
struct rxrpc_skb_priv *sp;
struct sk_buff *skb;
rxrpc_seq_t cursor, seq, top;
@@ -201,15 +200,8 @@ static void rxrpc_resend(struct rxrpc_call *call)
skb = call->rxtx_buffer[ix];
rxrpc_get_skb(skb, rxrpc_skb_tx_got);
spin_unlock_bh(&call->lock);
- sp = rxrpc_skb(skb);
-
- /* Each Tx packet needs a new serial number */
- sp->hdr.serial = atomic_inc_return(&call->conn->serial);
- whdr = (struct rxrpc_wire_header *)skb->head;
- whdr->serial = htonl(sp->hdr.serial);
-
- if (rxrpc_send_data_packet(call->conn, skb) < 0) {
+ if (rxrpc_send_data_packet(call, skb) < 0) {
call->resend_at = now + 2;
rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
return;
@@ -217,6 +209,7 @@ static void rxrpc_resend(struct rxrpc_call *call)
if (rxrpc_is_client_call(call))
rxrpc_expose_client_call(call);
+ sp = rxrpc_skb(skb);
sp->resend_at = now + rxrpc_resend_timeout;
rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 3b55aee0c436..e1e83af47866 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -53,7 +53,6 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp)
spin_lock_init(&conn->state_lock);
conn->debug_id = atomic_inc_return(&rxrpc_debug_id);
conn->size_align = 4;
- conn->header_size = sizeof(struct rxrpc_wire_header);
conn->idle_timestamp = jiffies;
}
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index 16e18a94ffa6..817fb0e82d6a 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -208,19 +208,42 @@ out:
/*
* send a packet through the transport endpoint
*/
-int rxrpc_send_data_packet(struct rxrpc_connection *conn, struct sk_buff *skb)
+int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb)
{
- struct kvec iov[1];
+ struct rxrpc_connection *conn = call->conn;
+ struct rxrpc_wire_header whdr;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct msghdr msg;
+ struct kvec iov[2];
+ rxrpc_serial_t serial;
+ size_t len;
int ret, opt;
_enter(",{%d}", skb->len);
- iov[0].iov_base = skb->head;
- iov[0].iov_len = skb->len;
+ /* Each transmission of a Tx packet needs a new serial number */
+ serial = atomic_inc_return(&conn->serial);
+
+ whdr.epoch = htonl(conn->proto.epoch);
+ whdr.cid = htonl(call->cid);
+ whdr.callNumber = htonl(call->call_id);
+ whdr.seq = htonl(sp->hdr.seq);
+ whdr.serial = htonl(serial);
+ whdr.type = RXRPC_PACKET_TYPE_DATA;
+ whdr.flags = sp->hdr.flags;
+ whdr.userStatus = 0;
+ whdr.securityIndex = call->security_ix;
+ whdr._rsvd = htons(sp->hdr._rsvd);
+ whdr.serviceId = htons(call->service_id);
+
+ iov[0].iov_base = &whdr;
+ iov[0].iov_len = sizeof(whdr);
+ iov[1].iov_base = skb->head;
+ iov[1].iov_len = skb->len;
+ len = iov[0].iov_len + iov[1].iov_len;
- msg.msg_name = &conn->params.peer->srx.transport;
- msg.msg_namelen = conn->params.peer->srx.transport_len;
+ msg.msg_name = &call->peer->srx.transport;
+ msg.msg_namelen = call->peer->srx.transport_len;
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
@@ -234,26 +257,33 @@ int rxrpc_send_data_packet(struct rxrpc_connection *conn, struct sk_buff *skb)
}
}
+ _proto("Tx DATA %%%u { #%u }", serial, sp->hdr.seq);
+
/* send the packet with the don't fragment bit set if we currently
* think it's small enough */
- if (skb->len - sizeof(struct rxrpc_wire_header) < conn->params.peer->maxdata) {
- down_read(&conn->params.local->defrag_sem);
- /* send the packet by UDP
- * - returns -EMSGSIZE if UDP would have to fragment the packet
- * to go out of the interface
- * - in which case, we'll have processed the ICMP error
- * message and update the peer record
- */
- ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1,
- iov[0].iov_len);
-
- up_read(&conn->params.local->defrag_sem);
- if (ret == -EMSGSIZE)
- goto send_fragmentable;
-
- _leave(" = %d [%u]", ret, conn->params.peer->maxdata);
- return ret;
+ if (iov[1].iov_len >= call->peer->maxdata)
+ goto send_fragmentable;
+
+ down_read(&conn->params.local->defrag_sem);
+ /* send the packet by UDP
+ * - returns -EMSGSIZE if UDP would have to fragment the packet
+ * to go out of the interface
+ * - in which case, we'll have processed the ICMP error
+ * message and update the peer record
+ */
+ ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
+
+ up_read(&conn->params.local->defrag_sem);
+ if (ret == -EMSGSIZE)
+ goto send_fragmentable;
+
+done:
+ if (ret == 0) {
+ sp->resend_at = jiffies + rxrpc_resend_timeout;
+ sp->hdr.serial = serial;
}
+ _leave(" = %d [%u]", ret, call->peer->maxdata);
+ return ret;
send_fragmentable:
/* attempt to send this message with fragmentation enabled */
@@ -268,8 +298,8 @@ send_fragmentable:
SOL_IP, IP_MTU_DISCOVER,
(char *)&opt, sizeof(opt));
if (ret == 0) {
- ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1,
- iov[0].iov_len);
+ ret = kernel_sendmsg(conn->params.local->socket, &msg,
+ iov, 2, len);
opt = IP_PMTUDISC_DO;
kernel_setsockopt(conn->params.local->socket, SOL_IP,
@@ -298,8 +328,7 @@ send_fragmentable:
}
up_write(&conn->params.local->defrag_sem);
- _leave(" = %d [frag %u]", ret, conn->params.peer->maxdata);
- return ret;
+ goto done;
}
/*
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index ae392558829d..88d080a1a3de 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -80,12 +80,10 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn)
case RXRPC_SECURITY_AUTH:
conn->size_align = 8;
conn->security_size = sizeof(struct rxkad_level1_hdr);
- conn->header_size += sizeof(struct rxkad_level1_hdr);
break;
case RXRPC_SECURITY_ENCRYPT:
conn->size_align = 8;
conn->security_size = sizeof(struct rxkad_level2_hdr);
- conn->header_size += sizeof(struct rxkad_level2_hdr);
break;
default:
ret = -EKEYREJECTED;
@@ -161,7 +159,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
_enter("");
- check = sp->hdr.seq ^ sp->hdr.callNumber;
+ check = sp->hdr.seq ^ call->call_id;
data_size |= (u32)check << 16;
hdr.data_size = htonl(data_size);
@@ -205,7 +203,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
_enter("");
- check = sp->hdr.seq ^ sp->hdr.callNumber;
+ check = sp->hdr.seq ^ call->call_id;
rxkhdr.data_size = htonl(data_size | (u32)check << 16);
rxkhdr.checksum = 0;
@@ -277,7 +275,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call,
/* calculate the security checksum */
x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT);
x |= sp->hdr.seq & 0x3fffffff;
- call->crypto_buf[0] = htonl(sp->hdr.callNumber);
+ call->crypto_buf[0] = htonl(call->call_id);
call->crypto_buf[1] = htonl(x);
sg_init_one(&sg, call->crypto_buf, 8);
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index 6a39ee97a0b7..814b17f23971 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -134,13 +134,11 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb,
write_unlock_bh(&call->state_lock);
}
- _proto("Tx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq);
-
if (seq == 1 && rxrpc_is_client_call(call))
rxrpc_expose_client_call(call);
sp->resend_at = jiffies + rxrpc_resend_timeout;
- ret = rxrpc_send_data_packet(call->conn, skb);
+ ret = rxrpc_send_data_packet(call, skb);
if (ret < 0) {
_debug("need instant resend %d", ret);
rxrpc_instant_resend(call, ix);
@@ -151,29 +149,6 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb,
}
/*
- * Convert a host-endian header into a network-endian header.
- */
-static void rxrpc_insert_header(struct sk_buff *skb)
-{
- struct rxrpc_wire_header whdr;
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
-
- whdr.epoch = htonl(sp->hdr.epoch);
- whdr.cid = htonl(sp->hdr.cid);
- whdr.callNumber = htonl(sp->hdr.callNumber);
- whdr.seq = htonl(sp->hdr.seq);
- whdr.serial = htonl(sp->hdr.serial);
- whdr.type = sp->hdr.type;
- whdr.flags = sp->hdr.flags;
- whdr.userStatus = sp->hdr.userStatus;
- whdr.securityIndex = sp->hdr.securityIndex;
- whdr._rsvd = htons(sp->hdr._rsvd);
- whdr.serviceId = htons(sp->hdr.serviceId);
-
- memcpy(skb->head, &whdr, sizeof(whdr));
-}
-
-/*
* send data through a socket
* - must be called in process context
* - caller holds the socket locked
@@ -232,7 +207,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
space = chunk + call->conn->size_align;
space &= ~(call->conn->size_align - 1UL);
- size = space + call->conn->header_size;
+ size = space + call->conn->security_size;
_debug("SIZE: %zu/%zu/%zu", chunk, space, size);
@@ -248,9 +223,9 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
ASSERTCMP(skb->mark, ==, 0);
- _debug("HS: %u", call->conn->header_size);
- skb_reserve(skb, call->conn->header_size);
- skb->len += call->conn->header_size;
+ _debug("HS: %u", call->conn->security_size);
+ skb_reserve(skb, call->conn->security_size);
+ skb->len += call->conn->security_size;
sp = rxrpc_skb(skb);
sp->remain = chunk;
@@ -312,33 +287,23 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
seq = call->tx_top + 1;
- sp->hdr.epoch = conn->proto.epoch;
- sp->hdr.cid = call->cid;
- sp->hdr.callNumber = call->call_id;
sp->hdr.seq = seq;
- sp->hdr.serial = atomic_inc_return(&conn->serial);
- sp->hdr.type = RXRPC_PACKET_TYPE_DATA;
- sp->hdr.userStatus = 0;
- sp->hdr.securityIndex = call->security_ix;
sp->hdr._rsvd = 0;
- sp->hdr.serviceId = call->service_id;
+ sp->hdr.flags = conn->out_clientflag;
- sp->hdr.flags = conn->out_clientflag;
if (msg_data_left(msg) == 0 && !more)
sp->hdr.flags |= RXRPC_LAST_PACKET;
else if (call->tx_top - call->tx_hard_ack <
call->tx_winsize)
sp->hdr.flags |= RXRPC_MORE_PACKETS;
- if (more && seq & 1)
+ if (seq & 1)
sp->hdr.flags |= RXRPC_REQUEST_ACK;
ret = conn->security->secure_packet(
- call, skb, skb->mark,
- skb->head + sizeof(struct rxrpc_wire_header));
+ call, skb, skb->mark, skb->head);
if (ret < 0)
goto out;
- rxrpc_insert_header(skb);
rxrpc_queue_packet(call, skb, !msg_data_left(msg) && !more);
skb = NULL;
}
^ permalink raw reply related
* [PATCH net-next 0/9] rxrpc: Preparation for slow-start algorithm
From: David Howells @ 2016-09-22 0:39 UTC (permalink / raw)
To: netdev; +Cc: dhowells, linux-afs, linux-kernel
Here are some patches that prepare for improvements in ACK generation and
for the implementation of the slow-start part of the protocol:
(1) Stop storing the protocol header in the Tx socket buffers, but rather
generate it on the fly. This potentially saves a little space and
makes it easier to alter the header just before transmission (the
flags may get altered and the serial number has to be changed).
(2) Mask off the Tx buffer annotations and add a flag to record which ones
have already been resent.
(3) Track RTT on a per-peer basis for use in future changes. Tracepoints
are added to log this.
(4) Send PING ACKs in response to incoming calls to elicit a PING-RESPONSE
ACK from which RTT data can be calculated. The response also carries
other useful information.
(5) Expedite PING-RESPONSE ACK generation from sendmsg. If we're actively
using sendmsg, this allows us, under some circumstances, to avoid
having to rely on the background work item to run to generate this
ACK.
This requires ktime_sub_ms() to be added.
(6) Set the REQUEST-ACK flag on some DATA packets to elicit ACK-REQUESTED
ACKs from which RTT data can be calculated.
(7) Limit the use of pings and ACK requests for RTT determination.
The patches can be found here also:
http://git.kernel.org/cgit/linux/kernel/git/dhowells/linux-fs.git/log/?h=rxrpc-rewrite
Tagged thusly:
git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git
rxrpc-rewrite-20160922
David
---
David Howells (9):
rxrpc: Don't store the rxrpc header in the Tx queue sk_buffs
rxrpc: Add re-sent Tx annotation
rxrpc: Add per-peer RTT tracker
rxrpc: Send pings to get RTT data
rxrpc: Expedite ping response transmission
rxrpc: Add ktime_sub_ms()
rxrpc: Obtain RTT data by requesting ACKs on DATA packets
rxrpc: Reduce the number of PING ACKs sent
rxrpc: Reduce the number of ACK-Requests sent
include/linux/ktime.h | 5 ++
include/trace/events/rxrpc.h | 61 ++++++++++++++++++++++
net/rxrpc/ar-internal.h | 47 ++++++++++++-----
net/rxrpc/call_event.c | 57 +++++++++++---------
net/rxrpc/conn_object.c | 1
net/rxrpc/input.c | 100 ++++++++++++++++++++++++++++++++++--
net/rxrpc/misc.c | 25 ++++++---
net/rxrpc/output.c | 117 ++++++++++++++++++++++++++++++++----------
net/rxrpc/peer_event.c | 39 ++++++++++++++
net/rxrpc/peer_object.c | 1
net/rxrpc/rxkad.c | 8 +--
net/rxrpc/sendmsg.c | 56 ++++----------------
net/rxrpc/sysctl.c | 2 -
13 files changed, 389 insertions(+), 130 deletions(-)
^ permalink raw reply
* Re: XDP (eXpress Data Path) documentation
From: Tom Herbert @ 2016-09-22 0:03 UTC (permalink / raw)
To: Jesper Dangaard Brouer
Cc: netdev@vger.kernel.org, iovisor-dev@lists.iovisor.org,
Tariq Toukan, Rana Shahout, Saeed Mahameed, Brenden Blanco,
Alexei Starovoitov, Jonathan Corbet, Nathan Willis, linux-doc
In-Reply-To: <20160920110844.661965be@redhat.com>
On Tue, Sep 20, 2016 at 2:08 AM, Jesper Dangaard Brouer
<brouer@redhat.com> wrote:
> Hi all,
>
> As promised, I've started documenting XDP (eXpress Data Path):
>
> [1] https://prototype-kernel.readthedocs.io/en/latest/networking/XDP/index.html
>
> IMHO the documentation has reached a stage where it is useful for the
> XDP project, BUT I request collaboration on improving the documentation
> from all. (Native English speakers are encouraged to send grammar fixes ;-))
>
Hi Jesper,
Thanks for taking the initiative on this. The document reads more
like a design doc than a description right now; that's probably okay
since we could use a design doc.
Under "Important to understand" there are some disclaimers that XDP
does not implement qdiscs or BQL and fairness otherwise. This is true
for its own traffic, but it does not (or at least should not) affect
these mechanisms or normal stack traffic running simultaneously. I
think we've made assumptions about fairness between XDP and non-XDP
queues, we probably want to clarify fairness (and also validate
whatever assumptions we've made with testing).
Thanks,
Tom
> You wouldn't believe it: But this pretty looking documentation actually
> follows the new Kernel documentation format. It is actually just
> ".rst" text files stored in my github repository under kernel/Documentation [2]
>
> [2] https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/Documentation
>
> Thus, just git clone my repository and start editing and send me
> patches (or github pull requests). Like:
>
> $ git clone https://github.com/netoptimizer/prototype-kernel
> $ cd prototype-kernel/kernel/Documentation/
> $ make html
> $ firefox _build/html/index.html &
>
> This new documentation format combines the best of two worlds, pretty
> online browser documentation with almost plain text files, and changes
> being tracked via git commits [3] (and auto git hooks to generate the
> readthedocs.org page). You got to love it! :-)
>
> --
> Best regards,
> Jesper Dangaard Brouer
> MSc.CS, Principal Kernel Engineer at Red Hat
> Author of http://www.iptv-analyzer.org
> LinkedIn: http://www.linkedin.com/in/brouer
>
> [3] https://github.com/netoptimizer/prototype-kernel/commits/master
^ permalink raw reply
* Re: [PATCHv7 net-next 00/15] BPF hardware offload (cls_bpf for now)
From: David Miller @ 2016-09-21 23:56 UTC (permalink / raw)
To: jakub.kicinski; +Cc: netdev, ast, daniel
In-Reply-To: <1474454647-20137-1-git-send-email-jakub.kicinski@netronome.com>
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 21 Sep 2016 11:43:52 +0100
> In the last year a lot of progress has been made on offloading
> simpler TC classifiers. There is also growing interest in using
> BPF for generic high-speed packet processing in the kernel.
> It seems beneficial to tie those two trends together and think
> about hardware offloads of BPF programs. This patch set presents
> such offload to Netronome smart NICs. cls_bpf is extended with
> hardware offload capabilities and NFP driver gets a JIT translator
> which in presence of capable firmware can be used to offload
> the BPF program onto the card.
>
> BPF JIT implementation is not 100% complete (e.g. missing instructions)
> but it is functional. Encouragingly it should be possible to
> offload most (if not all) advanced BPF features onto the NIC -
> including packet modification, maps, tunnel encap/decap etc.
...
Series applied, thanks.
^ permalink raw reply
* Re: [PATCH iproute2 net-next] iptnl: add support for collect_md flag in IPv4 and IPv6 tunnels
From: Stephen Hemminger @ 2016-09-21 23:37 UTC (permalink / raw)
To: Alexei Starovoitov; +Cc: Daniel Borkmann, William Tu, netdev
In-Reply-To: <1474329794-639615-1-git-send-email-ast@fb.com>
On Mon, 19 Sep 2016 17:03:14 -0700
Alexei Starovoitov <ast@fb.com> wrote:
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Applied, please send update to man page as well.
^ permalink raw reply
* Re: [PATCH iproute2] ipmonitor: fix ip monitor can't work when NET_NS is not enabled
From: Stephen Hemminger @ 2016-09-21 23:33 UTC (permalink / raw)
To: Liping Zhang; +Cc: netdev, nicolas.dichtel, Liping Zhang
In-Reply-To: <1474362542-5506-1-git-send-email-zlpnobody@gmail.com>
On Tue, 20 Sep 2016 02:09:02 -0700
Liping Zhang <zlpnobody@gmail.com> wrote:
> From: Liping Zhang <liping.zhang@spreadtrum.com>
>
> In ip monitor, netns_map_init will check getnsid is supported or not.
> But when /proc/self/ns/net does not exist, we just print out error
> messages and exit. So user cannot use ip monitor anymore when
> CONFIG_NET_NS is disabled:
> # ip monitor
> open("/proc/self/ns/net"): No such file or directory
>
> If open "/proc/self/ns/net" failed, set have_rtnl_getnsid to false.
>
> Fixes: d652ccbf8195 ("netns: allow to dump and monitor nsid")
> Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com>
Makes sense. Applied.
^ permalink raw reply
* Re: [PATCH net-next 2/3] udp: implement memory accounting helpers
From: Eric Dumazet @ 2016-09-21 23:31 UTC (permalink / raw)
To: Paolo Abeni
Cc: netdev, David S. Miller, James Morris, Trond Myklebust,
Alexander Duyck, Daniel Borkmann, Eric Dumazet, Tom Herbert,
Hannes Frederic Sowa, linux-nfs
In-Reply-To: <93ccb49b7f037461ef436a50b907185744b093d8.1474477902.git.pabeni@redhat.com>
On Wed, 2016-09-21 at 19:23 +0200, Paolo Abeni wrote:
> Avoid usage of common memory accounting functions, since
> the logic is pretty much different.
>
> To account for forward allocation, a couple of new atomic_t
> members are added to udp_sock: 'mem_alloced' and 'mem_freed'.
> The current forward allocation is estimated as 'mem_alloced'
> minus 'mem_freed' minus 'sk_rmem_alloc'.
>
> When the forward allocation can't cope with the packet to be
> enqueued, 'mem_alloced' is incremented by the packet size
> rounded-up to the next SK_MEM_QUANTUM.
> After a dequeue, we try to partially reclaim the forward
> allocated memory rounded down to an SK_MEM_QUANTUM and
> 'mem_freed' is increased by that amount.
> sk->sk_forward_alloc is set after each allocated/freed memory
> update, to the currently estimated forward allocation, without
> any lock or protection.
> This value is updated/maintained only to expose some
> semi-reasonable value to the eventual reader, and is guaranteed
> to be 0 at socket destruction time.
>
> The above needs custom memory reclaiming on shutdown, provided
> by the udp_destruct_sock() helper, which completely reclaims
> the allocated forward memory.
>
> Helpers are provided for skb free, consume and purge, respecting
> the above constraints.
>
> The socket lock is still used to protect the updates to sk_peek_off,
> but is acquired only if peeking with offset is enabled.
>
> As a consequence of the above schema, enqueue to sk_error_queue
> will cause larger forward allocation on following normal data
> (due to sk_rmem_alloc grow), but this allows amortizing the cost
> of the atomic operation on SK_MEM_QUANTUM/skb->truesize packets.
> The use of separate atomics for 'mem_alloced' and 'mem_freed'
> allows the use of a single atomic operation to protect against
> concurrent dequeue.
>
> Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> include/linux/udp.h | 2 +
> include/net/udp.h | 5 ++
> net/ipv4/udp.c | 151 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 158 insertions(+)
>
> diff --git a/include/linux/udp.h b/include/linux/udp.h
> index d1fd8cd..cd72645 100644
> --- a/include/linux/udp.h
> +++ b/include/linux/udp.h
> @@ -42,6 +42,8 @@ static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask)
> struct udp_sock {
> /* inet_sock has to be the first member */
> struct inet_sock inet;
> + atomic_t mem_allocated;
> + atomic_t mem_freed;
Hi Paolo, thanks for working on this.
All this code looks quite invasive to me ?
Also does inet_diag properly give the forward_alloc to user ?
$ ss -mua
State Recv-Q Send-Q Local Address:Port Peer Address:Port
UNCONN 51584 0 *:52460 *:*
skmem:(r51584,rb327680,t0,tb327680,f1664,w0,o0,bl0,d575)
Couldn't we instead use a union of an atomic_t and int for
sk->sk_forward_alloc ?
All udp queues/dequeues would manipulate the atomic_t using regular
atomic ops and use a special skb destructor (instead of sock_rfree())
Also I would not bother 'reclaiming' forward_alloc at dequeue, unless
udp is under memory pressure.
Please share your performance numbers, thanks !
^ permalink raw reply
* Re: [PATCH iproute2 3/3] ss: output TCP BBR diag information
From: Stephen Hemminger @ 2016-09-21 23:31 UTC (permalink / raw)
To: Neal Cardwell; +Cc: netdev, Yuchung Cheng, Eric Dumazet, Soheil Hassas Yeganeh
In-Reply-To: <1474425824-22646-3-git-send-email-ncardwell@google.com>
On Tue, 20 Sep 2016 22:43:44 -0400
Neal Cardwell <ncardwell@google.com> wrote:
> Dump useful TCP BBR state information from a struct tcp_bbr_info that
> was grabbed using the inet_diag API.
>
> We tolerate info that is shorter or longer than expected, in case the
> kernel is older or newer than the ss binary. We simply print the
> minimum of what is expected from the kernel and what is provided from
> the kernel. We use the same trick as that used for struct tcp_info:
> when the info from the kernel is shorter than we hoped, we pad the end
> with zeroes, and don't print fields if they are zero.
>
> The BBR output looks like:
> bbr:(bw:1.2Mbps,mrtt:18.965,pacing_gain:2.88672,cwnd_gain:2.88672)
>
> The motivation here is to be consistent with DCTCP, which looks like:
> dctcp(ce_state:23,alpha:23,ab_ecn:23,ab_tot:23)
>
> Signed-off-by: Neal Cardwell <ncardwell@google.com>
> Signed-off-by: Yuchung Cheng <ycheng@google.com>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Applied, to net-next.
The first two patches were unnecessary. Already picked up current net-next headers.
^ permalink raw reply
* [PATCH net] tcp: fix under-accounting retransmit SNMP counters
From: Yuchung Cheng @ 2016-09-21 23:16 UTC (permalink / raw)
To: davem; +Cc: netdev, edumazet, Yuchung Cheng
This patch fixes these under-accounting SNMP rtx stats
LINUX_MIB_TCPFORWARDRETRANS
LINUX_MIB_TCPFASTRETRANS
LINUX_MIB_TCPSLOWSTARTRETRANS
when retransmitting TSO packets
Fixes: 10d3be569243 ("tcp-tso: do not split TSO packets at retransmit time")
Signed-off-by: Yuchung Cheng <ycheng@google.com>
---
net/ipv4/tcp_output.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 810be35..5725822 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2822,7 +2822,7 @@ begin_fwd:
if (tcp_retransmit_skb(sk, skb, segs))
return;
- NET_INC_STATS(sock_net(sk), mib_idx);
+ NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += tcp_skb_pcount(skb);
--
2.8.0.rc3.226.g39d4020
^ permalink raw reply related
* [PATCH net] tcp: properly account Fast Open SYN-ACK retrans
From: Yuchung Cheng @ 2016-09-21 23:16 UTC (permalink / raw)
To: davem; +Cc: netdev, edumazet, Yuchung Cheng, Neal Cardwell,
Soheil Hassas Yeganeh
In-Reply-To: <1474499775-26436-1-git-send-email-ycheng@google.com>
Since the TFO socket is accepted right off SYN-data, the socket
owner can call getsockopt(TCP_INFO) to collect ongoing SYN-ACK
retransmission or timeout stats (i.e., tcpi_total_retrans,
tcpi_retransmits). Currently those stats are only updated
when the handshake completes. This patch fixes it.
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
---
net/ipv4/tcp_input.c | 2 +-
net/ipv4/tcp_output.c | 2 ++
net/ipv4/tcp_timer.c | 1 +
3 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d6c8f4cd0..8faf97e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5852,7 +5852,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
* so release it.
*/
if (req) {
- tp->total_retrans = req->num_retrans;
+ inet_csk(sk)->icsk_retransmits = 0;
reqsk_fastopen_remove(sk, req, false);
} else {
/* Make sure socket is routed, for correct metrics. */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8bd9911..810be35 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3559,6 +3559,8 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
if (!res) {
__TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ if (unlikely(tcp_passive_fastopen(sk)))
+ tcp_sk(sk)->total_retrans++;
}
return res;
}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index debdd8b..39bc5b2 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -346,6 +346,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
*/
inet_rtx_syn_ack(sk, req);
req->num_timeout++;
+ icsk->icsk_retransmits++;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
}
--
2.8.0.rc3.226.g39d4020
^ permalink raw reply related
* [PATCH net-next v2 6/6] net/faraday: Mask out PHYSTS_CHG interrupt
From: Joel Stanley @ 2016-09-21 23:05 UTC (permalink / raw)
To: davem; +Cc: clg, gwshan, andrew, andrew, netdev, linux-kernel, benh
In-Reply-To: <20160921230503.23309-1-joel@jms.id.au>
The PHYSTS_CHG (the ftgmac100's PHY IRQ) is telling the system to go
look at the PHY registers for a link status change.
The interrupt was causing issues on Aspeed SoC where some board designs
had an active high configuration, some active low, and in some cases
repurposed for other functions. When misconfigured, Linux would chew 100%
of CPU cycles servicing interrupts:
[ 20.280000] ftgmac100 1e660000.ethernet eth0: [ISR] = 0x200: PHYSTS_CHG
[ 20.280000] ftgmac100 1e660000.ethernet eth0: [ISR] = 0x200: PHYSTS_CHG
[ 20.280000] ftgmac100 1e660000.ethernet eth0: [ISR] = 0x200: PHYSTS_CHG
[ 20.300000] ftgmac100 1e660000.ethernet eth0: [ISR] = 0x200: PHYSTS_CHG
While the ftgmac100 IP can be configured for high, low and edge
sensitivity the current driver always polls the PHY, so we chose to mask
out the interrupt.
See https://patchwork.ozlabs.org/patch/672099/ for more discussion.
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
v2:
- Reworked to mask out PHYSTS_CHG instead of trying to determine the IRQ line
level
drivers/net/ethernet/faraday/ftgmac100.c | 10 +++-------
drivers/net/ethernet/faraday/ftgmac100.h | 1 +
2 files changed, 4 insertions(+), 7 deletions(-)
diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c
index e3653b14008a..90f9c5481290 100644
--- a/drivers/net/ethernet/faraday/ftgmac100.c
+++ b/drivers/net/ethernet/faraday/ftgmac100.c
@@ -1075,14 +1075,12 @@ static int ftgmac100_poll(struct napi_struct *napi, int budget)
}
if (status & priv->int_mask_all & (FTGMAC100_INT_NO_RXBUF |
- FTGMAC100_INT_RPKT_LOST | FTGMAC100_INT_AHB_ERR |
- FTGMAC100_INT_PHYSTS_CHG)) {
+ FTGMAC100_INT_RPKT_LOST | FTGMAC100_INT_AHB_ERR)) {
if (net_ratelimit())
- netdev_info(netdev, "[ISR] = 0x%x: %s%s%s%s\n", status,
+ netdev_info(netdev, "[ISR] = 0x%x: %s%s%s\n", status,
status & FTGMAC100_INT_NO_RXBUF ? "NO_RXBUF " : "",
status & FTGMAC100_INT_RPKT_LOST ? "RPKT_LOST " : "",
- status & FTGMAC100_INT_AHB_ERR ? "AHB_ERR " : "",
- status & FTGMAC100_INT_PHYSTS_CHG ? "PHYSTS_CHG" : "");
+ status & FTGMAC100_INT_AHB_ERR ? "AHB_ERR " : "");
if (status & FTGMAC100_INT_NO_RXBUF) {
/* RX buffer unavailable */
@@ -1390,7 +1388,6 @@ static int ftgmac100_probe(struct platform_device *pdev)
FTGMAC100_INT_XPKT_ETH |
FTGMAC100_INT_XPKT_LOST |
FTGMAC100_INT_AHB_ERR |
- FTGMAC100_INT_PHYSTS_CHG |
FTGMAC100_INT_RPKT_BUF |
FTGMAC100_INT_NO_RXBUF);
@@ -1412,7 +1409,6 @@ static int ftgmac100_probe(struct platform_device *pdev)
dev_info(&pdev->dev, "Using NCSI interface\n");
priv->use_ncsi = true;
- priv->int_mask_all &= ~FTGMAC100_INT_PHYSTS_CHG;
priv->ndev = ncsi_register_dev(netdev, ftgmac100_ncsi_handler);
if (!priv->ndev)
goto err_ncsi_dev;
diff --git a/drivers/net/ethernet/faraday/ftgmac100.h b/drivers/net/ethernet/faraday/ftgmac100.h
index 8a377ab1d127..a7ce0ac8858a 100644
--- a/drivers/net/ethernet/faraday/ftgmac100.h
+++ b/drivers/net/ethernet/faraday/ftgmac100.h
@@ -157,6 +157,7 @@
#define FTGMAC100_MACCR_FULLDUP (1 << 8)
#define FTGMAC100_MACCR_GIGA_MODE (1 << 9)
#define FTGMAC100_MACCR_CRC_APD (1 << 10)
+#define FTGMAC100_MACCR_PHY_LINK_LEVEL (1 << 11)
#define FTGMAC100_MACCR_RX_RUNT (1 << 12)
#define FTGMAC100_MACCR_JUMBO_LF (1 << 13)
#define FTGMAC100_MACCR_RX_ALL (1 << 14)
--
2.9.3
^ permalink raw reply related
* [PATCH net-next v2 2/6] net/faraday: Make EDO{R,T}R bits configurable
From: Joel Stanley @ 2016-09-21 23:04 UTC (permalink / raw)
To: davem; +Cc: clg, Andrew Jeffery, gwshan, andrew, netdev, linux-kernel, benh
In-Reply-To: <20160921230503.23309-1-joel@jms.id.au>
From: Andrew Jeffery <andrew@aj.id.au>
These bits are #defined at a fixed location. In order to support future
hardware that has chosen to move these bits around, move the bits into a
member of the struct ftgmac100.
Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
drivers/net/ethernet/faraday/ftgmac100.c | 40 +++++++++++++++++++++-----------
drivers/net/ethernet/faraday/ftgmac100.h | 2 --
2 files changed, 26 insertions(+), 16 deletions(-)
diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c
index 40622567159a..62a88d1a1f99 100644
--- a/drivers/net/ethernet/faraday/ftgmac100.c
+++ b/drivers/net/ethernet/faraday/ftgmac100.c
@@ -79,6 +79,9 @@ struct ftgmac100 {
int int_mask_all;
bool use_ncsi;
bool enabled;
+
+ u32 rxdes0_edorr_mask;
+ u32 txdes0_edotr_mask;
};
static int ftgmac100_alloc_rx_page(struct ftgmac100 *priv,
@@ -259,10 +262,11 @@ static bool ftgmac100_rxdes_packet_ready(struct ftgmac100_rxdes *rxdes)
return rxdes->rxdes0 & cpu_to_le32(FTGMAC100_RXDES0_RXPKT_RDY);
}
-static void ftgmac100_rxdes_set_dma_own(struct ftgmac100_rxdes *rxdes)
+static void ftgmac100_rxdes_set_dma_own(const struct ftgmac100 *priv,
+ struct ftgmac100_rxdes *rxdes)
{
/* clear status bits */
- rxdes->rxdes0 &= cpu_to_le32(FTGMAC100_RXDES0_EDORR);
+ rxdes->rxdes0 &= cpu_to_le32(priv->rxdes0_edorr_mask);
}
static bool ftgmac100_rxdes_rx_error(struct ftgmac100_rxdes *rxdes)
@@ -300,9 +304,10 @@ static bool ftgmac100_rxdes_multicast(struct ftgmac100_rxdes *rxdes)
return rxdes->rxdes0 & cpu_to_le32(FTGMAC100_RXDES0_MULTICAST);
}
-static void ftgmac100_rxdes_set_end_of_ring(struct ftgmac100_rxdes *rxdes)
+static void ftgmac100_rxdes_set_end_of_ring(const struct ftgmac100 *priv,
+ struct ftgmac100_rxdes *rxdes)
{
- rxdes->rxdes0 |= cpu_to_le32(FTGMAC100_RXDES0_EDORR);
+ rxdes->rxdes0 |= cpu_to_le32(priv->rxdes0_edorr_mask);
}
static void ftgmac100_rxdes_set_dma_addr(struct ftgmac100_rxdes *rxdes,
@@ -393,7 +398,7 @@ ftgmac100_rx_locate_first_segment(struct ftgmac100 *priv)
if (ftgmac100_rxdes_first_segment(rxdes))
return rxdes;
- ftgmac100_rxdes_set_dma_own(rxdes);
+ ftgmac100_rxdes_set_dma_own(priv, rxdes);
ftgmac100_rx_pointer_advance(priv);
rxdes = ftgmac100_current_rxdes(priv);
}
@@ -464,7 +469,7 @@ static void ftgmac100_rx_drop_packet(struct ftgmac100 *priv)
if (ftgmac100_rxdes_last_segment(rxdes))
done = true;
- ftgmac100_rxdes_set_dma_own(rxdes);
+ ftgmac100_rxdes_set_dma_own(priv, rxdes);
ftgmac100_rx_pointer_advance(priv);
rxdes = ftgmac100_current_rxdes(priv);
} while (!done && ftgmac100_rxdes_packet_ready(rxdes));
@@ -556,10 +561,11 @@ static bool ftgmac100_rx_packet(struct ftgmac100 *priv, int *processed)
/******************************************************************************
* internal functions (transmit descriptor)
*****************************************************************************/
-static void ftgmac100_txdes_reset(struct ftgmac100_txdes *txdes)
+static void ftgmac100_txdes_reset(const struct ftgmac100 *priv,
+ struct ftgmac100_txdes *txdes)
{
/* clear all except end of ring bit */
- txdes->txdes0 &= cpu_to_le32(FTGMAC100_TXDES0_EDOTR);
+ txdes->txdes0 &= cpu_to_le32(priv->txdes0_edotr_mask);
txdes->txdes1 = 0;
txdes->txdes2 = 0;
txdes->txdes3 = 0;
@@ -580,9 +586,10 @@ static void ftgmac100_txdes_set_dma_own(struct ftgmac100_txdes *txdes)
txdes->txdes0 |= cpu_to_le32(FTGMAC100_TXDES0_TXDMA_OWN);
}
-static void ftgmac100_txdes_set_end_of_ring(struct ftgmac100_txdes *txdes)
+static void ftgmac100_txdes_set_end_of_ring(const struct ftgmac100 *priv,
+ struct ftgmac100_txdes *txdes)
{
- txdes->txdes0 |= cpu_to_le32(FTGMAC100_TXDES0_EDOTR);
+ txdes->txdes0 |= cpu_to_le32(priv->txdes0_edotr_mask);
}
static void ftgmac100_txdes_set_first_segment(struct ftgmac100_txdes *txdes)
@@ -701,7 +708,7 @@ static bool ftgmac100_tx_complete_packet(struct ftgmac100 *priv)
dev_kfree_skb(skb);
- ftgmac100_txdes_reset(txdes);
+ ftgmac100_txdes_reset(priv, txdes);
ftgmac100_tx_clean_pointer_advance(priv);
@@ -792,7 +799,7 @@ static int ftgmac100_alloc_rx_page(struct ftgmac100 *priv,
ftgmac100_rxdes_set_page(priv, rxdes, page);
ftgmac100_rxdes_set_dma_addr(rxdes, map);
- ftgmac100_rxdes_set_dma_own(rxdes);
+ ftgmac100_rxdes_set_dma_own(priv, rxdes);
return 0;
}
@@ -839,7 +846,8 @@ static int ftgmac100_alloc_buffers(struct ftgmac100 *priv)
return -ENOMEM;
/* initialize RX ring */
- ftgmac100_rxdes_set_end_of_ring(&priv->descs->rxdes[RX_QUEUE_ENTRIES - 1]);
+ ftgmac100_rxdes_set_end_of_ring(priv,
+ &priv->descs->rxdes[RX_QUEUE_ENTRIES - 1]);
for (i = 0; i < RX_QUEUE_ENTRIES; i++) {
struct ftgmac100_rxdes *rxdes = &priv->descs->rxdes[i];
@@ -849,7 +857,8 @@ static int ftgmac100_alloc_buffers(struct ftgmac100 *priv)
}
/* initialize TX ring */
- ftgmac100_txdes_set_end_of_ring(&priv->descs->txdes[TX_QUEUE_ENTRIES - 1]);
+ ftgmac100_txdes_set_end_of_ring(priv,
+ &priv->descs->txdes[TX_QUEUE_ENTRIES - 1]);
return 0;
err:
@@ -1336,6 +1345,9 @@ static int ftgmac100_probe(struct platform_device *pdev)
priv->netdev = netdev;
priv->dev = &pdev->dev;
+ priv->rxdes0_edorr_mask = BIT(15);
+ priv->txdes0_edotr_mask = BIT(15);
+
spin_lock_init(&priv->tx_lock);
/* initialize NAPI */
diff --git a/drivers/net/ethernet/faraday/ftgmac100.h b/drivers/net/ethernet/faraday/ftgmac100.h
index 13408d448b05..c258586ce4a4 100644
--- a/drivers/net/ethernet/faraday/ftgmac100.h
+++ b/drivers/net/ethernet/faraday/ftgmac100.h
@@ -189,7 +189,6 @@ struct ftgmac100_txdes {
} __attribute__ ((aligned(16)));
#define FTGMAC100_TXDES0_TXBUF_SIZE(x) ((x) & 0x3fff)
-#define FTGMAC100_TXDES0_EDOTR (1 << 15)
#define FTGMAC100_TXDES0_CRC_ERR (1 << 19)
#define FTGMAC100_TXDES0_LTS (1 << 28)
#define FTGMAC100_TXDES0_FTS (1 << 29)
@@ -215,7 +214,6 @@ struct ftgmac100_rxdes {
} __attribute__ ((aligned(16)));
#define FTGMAC100_RXDES0_VDBC 0x3fff
-#define FTGMAC100_RXDES0_EDORR (1 << 15)
#define FTGMAC100_RXDES0_MULTICAST (1 << 16)
#define FTGMAC100_RXDES0_BROADCAST (1 << 17)
#define FTGMAC100_RXDES0_RX_ERR (1 << 18)
--
2.9.3
^ permalink raw reply related
* [PATCH net-next v2 0/6] ftgmac100 support for ast2500
From: Joel Stanley @ 2016-09-21 23:04 UTC (permalink / raw)
To: davem; +Cc: clg, gwshan, andrew, andrew, netdev, linux-kernel, benh
Hello Dave,
This series adds support to the ftgmac100 driver for the Aspeed ast2400 and
ast2500 SoCs. In particular, they ensure the driver works correctly on the
ast2500 where the MAC block has seen some changes in register layout.
They have been tested on ast2400 and ast2500 systems with the NCSI stack and
with a directly attached PHY.
V2 reworks the two patches relating to PHYSTS_CHG into the one patch that
disables the interrupt instead of playing with interrupt sensitivity. I kept
patch 4 'net/faraday: Clear stale interrupts' which was first introduced to
clear the stale PHYSTS_CHG interrupt, as it helps keep us safe from unhygienic
(vendor) bootloaders.
Cheers,
Joel
Andrew Jeffery (2):
net/faraday: Separate rx page storage from rxdesc
net/faraday: Make EDO{R,T}R bits configurable
Gavin Shan (1):
net/faraday: Clear stale interrupts
Joel Stanley (3):
net/faraday: Adapt for Aspeed SoCs
net/faraday: Configure old MDIO interface on Aspeed SoCs
net/faraday: Mask out PHYSTS_CHG interrupt
drivers/net/ethernet/faraday/ftgmac100.c | 97 +++++++++++++++++++++++---------
drivers/net/ethernet/faraday/ftgmac100.h | 8 ++-
2 files changed, 75 insertions(+), 30 deletions(-)
--
2.9.3
^ permalink raw reply
* [PATCH net-next v2 5/6] net/faraday: Configure old MDIO interface on Aspeed SoCs
From: Joel Stanley @ 2016-09-21 23:05 UTC (permalink / raw)
To: davem; +Cc: clg, gwshan, andrew, andrew, netdev, linux-kernel, benh
In-Reply-To: <20160921230503.23309-1-joel@jms.id.au>
The Aspeed SoCs have a new MDIO interface as an option in the G4 and G5
SoCs. The old one is still available, so select it in order to remain
compatible with the ftgmac100 driver.
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
drivers/net/ethernet/faraday/ftgmac100.c | 9 +++++++++
drivers/net/ethernet/faraday/ftgmac100.h | 5 +++++
2 files changed, 14 insertions(+)
diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c
index 189373743ddf..e3653b14008a 100644
--- a/drivers/net/ethernet/faraday/ftgmac100.c
+++ b/drivers/net/ethernet/faraday/ftgmac100.c
@@ -1252,12 +1252,21 @@ static int ftgmac100_setup_mdio(struct net_device *netdev)
struct ftgmac100 *priv = netdev_priv(netdev);
struct platform_device *pdev = to_platform_device(priv->dev);
int i, err = 0;
+ u32 reg;
/* initialize mdio bus */
priv->mii_bus = mdiobus_alloc();
if (!priv->mii_bus)
return -EIO;
+ if (of_machine_is_compatible("aspeed,ast2400") ||
+ of_machine_is_compatible("aspeed,ast2500")) {
+ /* This driver supports the old MDIO interface */
+ reg = ioread32(priv->base + FTGMAC100_OFFSET_REVR);
+ reg &= ~FTGMAC100_REVR_NEW_MDIO_INTERFACE;
+ iowrite32(reg, priv->base + FTGMAC100_OFFSET_REVR);
+ };
+
priv->mii_bus->name = "ftgmac100_mdio";
snprintf(priv->mii_bus->id, MII_BUS_ID_SIZE, "%s-%d",
pdev->name, pdev->id);
diff --git a/drivers/net/ethernet/faraday/ftgmac100.h b/drivers/net/ethernet/faraday/ftgmac100.h
index c258586ce4a4..8a377ab1d127 100644
--- a/drivers/net/ethernet/faraday/ftgmac100.h
+++ b/drivers/net/ethernet/faraday/ftgmac100.h
@@ -134,6 +134,11 @@
#define FTGMAC100_DMAFIFOS_TXDMA_REQ (1 << 31)
/*
+ * Feature Register
+ */
+#define FTGMAC100_REVR_NEW_MDIO_INTERFACE BIT(31)
+
+/*
* Receive buffer size register
*/
#define FTGMAC100_RBSR_SIZE(x) ((x) & 0x3fff)
--
2.9.3
^ permalink raw reply related
* [PATCH net-next v2 4/6] net/faraday: Clear stale interrupts
From: Joel Stanley @ 2016-09-21 23:05 UTC (permalink / raw)
To: davem; +Cc: clg, Gavin Shan, andrew, andrew, netdev, linux-kernel, benh
In-Reply-To: <20160921230503.23309-1-joel@jms.id.au>
From: Gavin Shan <gwshan@linux.vnet.ibm.com>
There is a stale interrupt (PHYSTS_CHG in ISR, bit#6 in 0x0) from
the bootloader (uboot) when enabling the MAC. The stale interrupts
aren't part of the kernel and should be cleared.
This clears the stale interrupts in ISR (0x0) when enabling the MAC.
Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
drivers/net/ethernet/faraday/ftgmac100.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c
index 47f512224b57..189373743ddf 100644
--- a/drivers/net/ethernet/faraday/ftgmac100.c
+++ b/drivers/net/ethernet/faraday/ftgmac100.c
@@ -1112,6 +1112,7 @@ static int ftgmac100_poll(struct napi_struct *napi, int budget)
static int ftgmac100_open(struct net_device *netdev)
{
struct ftgmac100 *priv = netdev_priv(netdev);
+ unsigned int status;
int err;
err = ftgmac100_alloc_buffers(priv);
@@ -1137,6 +1138,11 @@ static int ftgmac100_open(struct net_device *netdev)
ftgmac100_init_hw(priv);
ftgmac100_start_hw(priv, priv->use_ncsi ? 100 : 10);
+
+ /* Clear stale interrupts */
+ status = ioread32(priv->base + FTGMAC100_OFFSET_ISR);
+ iowrite32(status, priv->base + FTGMAC100_OFFSET_ISR);
+
if (netdev->phydev)
phy_start(netdev->phydev);
else if (priv->use_ncsi)
--
2.9.3
^ permalink raw reply related
* [PATCH net-next v2 3/6] net/faraday: Adapt for Aspeed SoCs
From: Joel Stanley @ 2016-09-21 23:05 UTC (permalink / raw)
To: davem; +Cc: clg, gwshan, andrew, andrew, netdev, linux-kernel, benh
In-Reply-To: <20160921230503.23309-1-joel@jms.id.au>
The RXDES and TXDES register bits in the ftgmac100 indicate EDO{R,T}R
at bit position 15 for the Faraday Tech IP. However, the version of this
IP present in the Aspeed SoCs has these bits at position 30 in the
registers.
It appears that ast2400 SoCs support both positions, with the 15th bit
marked as reserved but still functional. In the ast2500 this bit is
reused for another function, so we need a work around.
Engineers from Aspeed confirmed that using bit 30 is
correct for both the ast2400 and ast2500 SoCs.
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
drivers/net/ethernet/faraday/ftgmac100.c | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c
index 62a88d1a1f99..47f512224b57 100644
--- a/drivers/net/ethernet/faraday/ftgmac100.c
+++ b/drivers/net/ethernet/faraday/ftgmac100.c
@@ -1345,9 +1345,6 @@ static int ftgmac100_probe(struct platform_device *pdev)
priv->netdev = netdev;
priv->dev = &pdev->dev;
- priv->rxdes0_edorr_mask = BIT(15);
- priv->txdes0_edotr_mask = BIT(15);
-
spin_lock_init(&priv->tx_lock);
/* initialize NAPI */
@@ -1381,6 +1378,16 @@ static int ftgmac100_probe(struct platform_device *pdev)
FTGMAC100_INT_PHYSTS_CHG |
FTGMAC100_INT_RPKT_BUF |
FTGMAC100_INT_NO_RXBUF);
+
+ if (of_machine_is_compatible("aspeed,ast2400") ||
+ of_machine_is_compatible("aspeed,ast2500")) {
+ priv->rxdes0_edorr_mask = BIT(30);
+ priv->txdes0_edotr_mask = BIT(30);
+ } else {
+ priv->rxdes0_edorr_mask = BIT(15);
+ priv->txdes0_edotr_mask = BIT(15);
+ }
+
if (pdev->dev.of_node &&
of_get_property(pdev->dev.of_node, "use-ncsi", NULL)) {
if (!IS_ENABLED(CONFIG_NET_NCSI)) {
--
2.9.3
^ permalink raw reply related
* [PATCH net-next v2 1/6] net/faraday: Separate rx page storage from rxdesc
From: Joel Stanley @ 2016-09-21 23:04 UTC (permalink / raw)
To: davem; +Cc: clg, Andrew Jeffery, gwshan, andrew, netdev, linux-kernel, benh
In-Reply-To: <20160921230503.23309-1-joel@jms.id.au>
From: Andrew Jeffery <andrew@aj.id.au>
The ftgmac100 hardware revision in e.g. the Aspeed AST2500 no longer
reserves all bits in RXDES#2 but instead uses the bottom 16 bits to
store MAC frame metadata. Avoid corruption by shifting struct page
pointers out to their own member in struct ftgmac100.
Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
drivers/net/ethernet/faraday/ftgmac100.c | 25 ++++++++++++++++++-------
1 file changed, 18 insertions(+), 7 deletions(-)
diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c
index 36361f8bf894..40622567159a 100644
--- a/drivers/net/ethernet/faraday/ftgmac100.c
+++ b/drivers/net/ethernet/faraday/ftgmac100.c
@@ -60,6 +60,8 @@ struct ftgmac100 {
struct ftgmac100_descs *descs;
dma_addr_t descs_dma_addr;
+ struct page *rx_pages[RX_QUEUE_ENTRIES];
+
unsigned int rx_pointer;
unsigned int tx_clean_pointer;
unsigned int tx_pointer;
@@ -341,18 +343,27 @@ static bool ftgmac100_rxdes_ipcs_err(struct ftgmac100_rxdes *rxdes)
return rxdes->rxdes1 & cpu_to_le32(FTGMAC100_RXDES1_IP_CHKSUM_ERR);
}
+static inline struct page **ftgmac100_rxdes_page_slot(struct ftgmac100 *priv,
+ struct ftgmac100_rxdes *rxdes)
+{
+ return &priv->rx_pages[rxdes - priv->descs->rxdes];
+}
+
/*
* rxdes2 is not used by hardware. We use it to keep track of page.
* Since hardware does not touch it, we can skip cpu_to_le32()/le32_to_cpu().
*/
-static void ftgmac100_rxdes_set_page(struct ftgmac100_rxdes *rxdes, struct page *page)
+static void ftgmac100_rxdes_set_page(struct ftgmac100 *priv,
+ struct ftgmac100_rxdes *rxdes,
+ struct page *page)
{
- rxdes->rxdes2 = (unsigned int)page;
+ *ftgmac100_rxdes_page_slot(priv, rxdes) = page;
}
-static struct page *ftgmac100_rxdes_get_page(struct ftgmac100_rxdes *rxdes)
+static struct page *ftgmac100_rxdes_get_page(struct ftgmac100 *priv,
+ struct ftgmac100_rxdes *rxdes)
{
- return (struct page *)rxdes->rxdes2;
+ return *ftgmac100_rxdes_page_slot(priv, rxdes);
}
/******************************************************************************
@@ -501,7 +512,7 @@ static bool ftgmac100_rx_packet(struct ftgmac100 *priv, int *processed)
do {
dma_addr_t map = ftgmac100_rxdes_get_dma_addr(rxdes);
- struct page *page = ftgmac100_rxdes_get_page(rxdes);
+ struct page *page = ftgmac100_rxdes_get_page(priv, rxdes);
unsigned int size;
dma_unmap_page(priv->dev, map, RX_BUF_SIZE, DMA_FROM_DEVICE);
@@ -779,7 +790,7 @@ static int ftgmac100_alloc_rx_page(struct ftgmac100 *priv,
return -ENOMEM;
}
- ftgmac100_rxdes_set_page(rxdes, page);
+ ftgmac100_rxdes_set_page(priv, rxdes, page);
ftgmac100_rxdes_set_dma_addr(rxdes, map);
ftgmac100_rxdes_set_dma_own(rxdes);
return 0;
@@ -791,7 +802,7 @@ static void ftgmac100_free_buffers(struct ftgmac100 *priv)
for (i = 0; i < RX_QUEUE_ENTRIES; i++) {
struct ftgmac100_rxdes *rxdes = &priv->descs->rxdes[i];
- struct page *page = ftgmac100_rxdes_get_page(rxdes);
+ struct page *page = ftgmac100_rxdes_get_page(priv, rxdes);
dma_addr_t map = ftgmac100_rxdes_get_dma_addr(rxdes);
if (!page)
--
2.9.3
^ permalink raw reply related
* Re: [PATCH net-next 0/7] ftgmac100 support for ast2500
From: Joel Stanley @ 2016-09-21 23:04 UTC (permalink / raw)
To: davem
Cc: Cédric Le Goater, Gavin Shan, Andrew Lunn, Andrew Jeffery,
netdev, linux-kernel
In-Reply-To: <20160921230334.23224-1-joel@jms.id.au>
Please ignore this one.
On Thu, Sep 22, 2016 at 8:33 AM, Joel Stanley <joel@jms.id.au> wrote:
> Hello Dave,
>
> This series adds support to the ftgmac100 driver for the Aspeed ast2400 and
> ast2500 SoCs. In particular, they ensure the driver works correctly on the
> ast2500 where the MAC block has seen some changes in register layout.
>
> They have been tested on ast2400 and ast2500 systems with the NCSI stack and
> with a directly attached PHY.
>
> Cheers,
>
> Joel
>
> Andrew Jeffery (2):
> net/ftgmac100: Separate rx page storage from rxdesc
> net/ftgmac100: Make EDO{R,T}R bits configurable
>
> Gavin Shan (2):
> net/faraday: Avoid PHYSTS_CHG interrupt
> net/faraday: Clear stale interrupts
>
> Joel Stanley (3):
> net/ftgmac100: Adapt for Aspeed SoCs
> net/faraday: Fix phy link irq on Aspeed G5 SoCs
> net/faraday: Configure old MDIO interface on Aspeed SoCs
>
> drivers/net/ethernet/faraday/ftgmac100.c | 92 ++++++++++++++++++++++++--------
> drivers/net/ethernet/faraday/ftgmac100.h | 8 ++-
> 2 files changed, 77 insertions(+), 23 deletions(-)
>
> --
> 2.9.3
>
^ permalink raw reply
* [PATCH net-next 0/7] ftgmac100 support for ast2500
From: Joel Stanley @ 2016-09-21 23:03 UTC (permalink / raw)
To: davem; +Cc: clg, gwshan, andrew, andrew, netdev, linux-kernel
Hello Dave,
This series adds support to the ftgmac100 driver for the Aspeed ast2400 and
ast2500 SoCs. In particular, they ensure the driver works correctly on the
ast2500 where the MAC block has seen some changes in register layout.
They have been tested on ast2400 and ast2500 systems with the NCSI stack and
with a directly attached PHY.
Cheers,
Joel
Andrew Jeffery (2):
net/ftgmac100: Separate rx page storage from rxdesc
net/ftgmac100: Make EDO{R,T}R bits configurable
Gavin Shan (2):
net/faraday: Avoid PHYSTS_CHG interrupt
net/faraday: Clear stale interrupts
Joel Stanley (3):
net/ftgmac100: Adapt for Aspeed SoCs
net/faraday: Fix phy link irq on Aspeed G5 SoCs
net/faraday: Configure old MDIO interface on Aspeed SoCs
drivers/net/ethernet/faraday/ftgmac100.c | 92 ++++++++++++++++++++++++--------
drivers/net/ethernet/faraday/ftgmac100.h | 8 ++-
2 files changed, 77 insertions(+), 23 deletions(-)
--
2.9.3
^ permalink raw reply
* Re: [net-next 01/15] i40e: Introduce VF port representor/control netdevs
From: Jeff Kirsher @ 2016-09-21 21:23 UTC (permalink / raw)
To: Or Gerlitz, Samudrala, Sridhar
Cc: David Miller, Linux Netdev List, nhorman@redhat.com,
sassmann@redhat.com, jogreene@redhat.com, guru.anbalagane,
Ilya Lesokhin, Andy Gospodarek, John Fastabend, Jiri Pirko,
Rony Efraim
In-Reply-To: <CAJ3xEMj-teLmjg+Nzn0pHsnz8P=EY6uF4VzDJsxVEOTihStrLw@mail.gmail.com>
[-- Attachment #1: Type: text/plain, Size: 2173 bytes --]
On Wed, 2016-09-21 at 22:21 +0300, Or Gerlitz wrote:
> On Wed, Sep 21, 2016 at 7:59 PM, Samudrala, Sridhar
> <sridhar.samudrala@intel.com> wrote:
> > On 9/21/2016 12:04 AM, Or Gerlitz wrote:
>
>
> >> so what happens after this patchset is applied and before the future
> work is
> >> submitted? RX/TX slow path through the VFPRs isn't supported and what
> >> about fast path? in other words what happens when someone
> >> loads the driver, sets SRIOV (--> the driver set itself to switchdev
> mode
> >> and VFPRs are created) and then a VF sends a packet? do you still put
> >> into the HW the legacy DMAC based switching rules? I am not
> following...
>
> > The VF driver requests adding the dmac based filter rules via mailbox
> > messages to PF and that is not changed in this patchset.
> > Once we have VFPR TX/RX support, we will not allow the VF driver to add
> > these rules, Instead a host based
> > program will be able to add these rules to enable the fast path.
>
> I see, this means that when this patch set is applied your driver
> reports through devlink that they are in switchdev mode, but the
> operational state of the VFs and VFPRs isn't such - as the VFs dictate
> the steering and the VFPRs don't support slow path TX/RX --- in an
> earlier comment you made on this thread you said that you will be
> submitting RX/TX support in the next patchset. Maybe it would be best
> if you can take the VFPRs patches out of this series and roll a follow
> up series with all what's needed? unless you need more time and gonna
> miss 4.9 as of that... if the patches are ready, I say lets have them
> all in one series, if not, I wonder what other people think on the
> matter. I am basically half+ good to have also the half baked code
> base merged
>
> Anyway, there's no point to report through ethtool something (VF vport
> HW stats) you can report in the standard and convenient manner, so
> this one please do address regardless of the prev comment.
I will drop Sridhar's changes from this series for now, so that he can do
the re-work AND provide the additional patches he referred to earlier at a
later date.
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 819 bytes --]
^ permalink raw reply
* [PATCH] L2TP:Adjust intf MTU, add underlay L3, overlay L2
From: R. Parameswaran @ 2016-09-21 21:11 UTC (permalink / raw)
To: kleptog, jchapman, mostrows, acme, netdev
Cc: davem, linux-kernel, nprachan, rshearma, dfawcus, stephen
Take into account all of the tunnel encapsulation headers when setting
up the MTU on the L2TP logical interface device. Otherwise, packets
created by the applications on top of the L2TP layer are larger
than they ought to be, relative to the underlay MTU, leading to
needless fragmentation once the outer IP encap is added.
Specifically, take into account the (outer, underlay) IP header
imposed on the encapsulated L2TP packet, and the Layer 2 header
imposed on the inner IP packet prior to L2TP encapsulation.
Do not assume an Ethernet (non-jumbo) underlay. Use the PMTU mechanism
and the dst entry in the L2TP tunnel socket to directly pull up
the underlay MTU (as the baseline number on top of which the
encapsulation headers are factored in). Fall back to Ethernet MTU
if this fails.
Signed-off-by: Ramkumar Parameswaran <rparames@brocade.com>
Reviewed-by: N. Prachanda <nprachan@brocade.com>,
R. Shearman <rshearma@brocade.com>,
D. Fawcus <dfawcus@brocade.com>
---
net/l2tp/l2tp_eth.c | 48 ++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 44 insertions(+), 4 deletions(-)
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index 57fc5a4..dbcd6bd 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -30,6 +30,9 @@
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/udp.h>
#include "l2tp_core.h"
@@ -206,6 +209,46 @@ static void l2tp_eth_show(struct seq_file *m, void
*arg)
}
#endif
+static void l2tp_eth_adjust_mtu(struct l2tp_tunnel *tunnel,
+ struct l2tp_session *session,
+ struct net_device *dev)
+{
+ unsigned int overhead = 0;
+ struct dst_entry *dst;
+
+ if (session->mtu != 0) {
+ dev->mtu = session->mtu;
+ dev->needed_headroom += session->hdr_len;
+ if (tunnel->encap == L2TP_ENCAPTYPE_UDP)
+ dev->needed_headroom += sizeof(struct udphdr);
+ return;
+ }
+ overhead = session->hdr_len;
+ /* Adjust MTU, factor overhead - underlay L3 hdr, overlay L2 hdr*/
+ if (tunnel->sock->sk_family == AF_INET)
+ overhead += (ETH_HLEN + sizeof(struct iphdr));
+ else if (tunnel->sock->sk_family == AF_INET6)
+ overhead += (ETH_HLEN + sizeof(struct ipv6hdr));
+ /* Additionally, if the encap is UDP, account for UDP header size */
+ if (tunnel->encap == L2TP_ENCAPTYPE_UDP)
+ overhead += sizeof(struct udphdr);
+ /* If PMTU discovery was enabled, use discovered MTU on L2TP device */
+ dst = sk_dst_get(tunnel->sock);
+ if (dst) {
+ u32 pmtu = dst_mtu(dst);
+
+ if (pmtu != 0)
+ dev->mtu = pmtu;
+ dst_release(dst);
+ }
+ /* else (no PMTUD) L2TP dev MTU defaulted to Ethernet MTU in caller */
+ session->mtu = dev->mtu - overhead;
+ dev->mtu = session->mtu;
+ dev->needed_headroom += session->hdr_len;
+ if (tunnel->encap == L2TP_ENCAPTYPE_UDP)
+ dev->needed_headroom += sizeof(struct udphdr);
+}
+
static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32
session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg)
{
struct net_device *dev;
@@ -255,11 +298,8 @@ static int l2tp_eth_create(struct net *net, u32
tunnel_id, u32 session_id, u32 p
}
dev_net_set(dev, net);
- if (session->mtu == 0)
- session->mtu = dev->mtu - session->hdr_len;
- dev->mtu = session->mtu;
- dev->needed_headroom += session->hdr_len;
+ l2tp_eth_adjust_mtu(tunnel, session, dev);
priv = netdev_priv(dev);
priv->dev = dev;
priv->session = session;
--
2.1.4
^ permalink raw reply related
* Re: [PATCH RFC 1/3] xdp: Infrastructure to generalize XDP
From: Jesper Dangaard Brouer @ 2016-09-21 19:56 UTC (permalink / raw)
To: Tom Herbert
Cc: Thomas Graf, David S. Miller, Linux Kernel Network Developers,
Kernel Team, Tariq Toukan, Brenden Blanco, Alexei Starovoitov,
Eric Dumazet, brouer
In-Reply-To: <CALx6S36ZCdh5iOp6CPfztVKOEkDFgUo1RyqWMJVxhakOYNx8iw@mail.gmail.com>
On Wed, 21 Sep 2016 08:08:34 -0700 Tom Herbert <tom@herbertland.com> wrote:
> On Wed, Sep 21, 2016 at 7:48 AM, Thomas Graf <tgraf@suug.ch> wrote:
> > On 09/21/16 at 07:19am, Tom Herbert wrote:
> >> certain design that because of constraints on one kernel interface. As
> >> a kernel developer I want flexibility on how we design and implement
> >> things!
> >
> > Perfectly valid argument. I reviewed your ILA changes and did not
> > object to them.
> >
> >
> >> I think there are two questions that this patch set poses for the
> >> community wrt XDP:
> >>
> >> #1: Should we allow alternate code to run in XDP other than BPF?
> >> #2: If #1 is true what is the best way to implement that?
> >>
> >> If the answer to #1 is "no" then the answer to #2 is irrelevant. So
> >> with this RFC I'm hoping we can come the agreement on questions #1.
I vote yes to #1.
> > I'm not opposed to running non-BPF code at XDP. I'm against adding
> > a linked list of hook consumers.
I also worry about the performance impact of a linked list. We should
simply benchmark it instead of discussing it! ;-)
> > Would anyone require to run XDP-BPF in combination ILA? Or XDP-BPF
> > in combination with a potential XDP-nftables? We don't know yet I
> > guess.
> >
> Right. Admittedly, I feel like we owe a bit of reciprocity to
> nftables. For ILA we are using the NF_INET_PRE_ROUTING hook with our
> own code (looks like ipvlan set nfhooks as well). This works really
> well and saves the value of early demux in ILA. Had we not had the
> ability to use nfhooks in this fashion it's likely we would have had
> to create another hook (we did try putting translation in nftables
> rules but that was too inefficient for ILA).
Thinking about it, I actually think Tom is proposing a very valid user
of the XDP hook, which is the kernel itself. And Tom even has a real
first user: ILA. The way I read the ILA-RFC-draft[1], the XDP hook
would benefit the NVE (Network Virtualization Edge) component, which
can run separately or run on the Tenant System, where the latter case
could use XDP_PASS.
[1] https://www.ietf.org/archive/id/draft-herbert-nvo3-ila-02.txt
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
Author of http://www.iptv-analyzer.org
LinkedIn: http://www.linkedin.com/in/brouer
^ permalink raw reply
* Re: [RFC v2 07/12] qedr: Add support for memory registeration verbs
From: Sagi Grimberg @ 2016-09-21 19:53 UTC (permalink / raw)
To: Ram Amrani, davem, dledford
Cc: Ariel.Elior, Michal.Kalderon, Yuval.Mintz, rajesh.borundia,
linux-rdma, netdev
In-Reply-To: <1474367764-9555-8-git-send-email-Ram.Amrani@cavium.com>
> +static int qedr_set_page(struct ib_mr *ibmr, u64 addr)
> +{
> + struct qedr_mr *mr = get_qedr_mr(ibmr);
> + struct qedr_pbl *pbl_table;
> + struct regpair *pbe;
> + u32 pbes_in_page;
> +
> + if (unlikely(mr->npages == mr->info.pbl_info.num_pbes)) {
> + DP_ERR(mr->dev, "qedr_set_page failes when %d\n", mr->npages);
> + return -ENOMEM;
> + }
> +
> + DP_VERBOSE(mr->dev, QEDR_MSG_MR, "qedr_set_page pages[%d] = 0x%llx\n",
> + mr->npages, addr);
> +
> + pbes_in_page = mr->info.pbl_info.pbl_size / sizeof(u64);
> + pbl_table = mr->info.pbl_table + (mr->npages / pbes_in_page);
> + pbe = (struct regpair *)pbl_table->va;
> + pbe += mr->npages % pbes_in_page;
> + pbe->lo = cpu_to_le32((u32)addr);
> + pbe->hi = cpu_to_le32((u32)upper_32_bits(addr));
> +
> + mr->npages++;
> +
> + return 0;
> +}
Looks better.
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
^ permalink raw reply
* Re: [net-next 01/15] i40e: Introduce VF port representor/control netdevs
From: Or Gerlitz @ 2016-09-21 19:21 UTC (permalink / raw)
To: Samudrala, Sridhar
Cc: Jeff Kirsher, David Miller, Linux Netdev List, nhorman@redhat.com,
sassmann@redhat.com, jogreene@redhat.com, guru.anbalagane,
Ilya Lesokhin, Andy Gospodarek, John Fastabend, Jiri Pirko,
Rony Efraim
In-Reply-To: <57E2BC74.5040905@intel.com>
On Wed, Sep 21, 2016 at 7:59 PM, Samudrala, Sridhar
<sridhar.samudrala@intel.com> wrote:
> On 9/21/2016 12:04 AM, Or Gerlitz wrote:
>> so what happens after this patchset is applied and before the future work is
>> submitted? RX/TX slow path through the VFPRs isn't supported and what
>> about fast path? in other words what happens when someone
>> loads the driver, sets SRIOV (--> the driver set itself to switchdev mode
>> and VFPRs are created) and then a VF sends a packet? do you still put
>> into the HW the legacy DMAC based switching rules? I am not following...
> The VF driver requests adding the dmac based filter rules via mailbox
> messages to PF and that is not changed in this patchset.
> Once we have VFPR TX/RX support, we will not allow the VF driver to add
> these rules, Instead a host based
> program will be able to add these rules to enable the fast path.
I see, this means that when this patch set is applied your driver
reports through devlink that they are in switchdev mode, but the
operational state of the VFs and VFPRs isn't such - as the VFs dictate
the steering and the VFPRs don't support slow path TX/RX --- in an
earlier comment you made on this thread you said that you will be
submitting RX/TX support in the next patchset. Maybe it would be best
if you can take the VFPRs patches out of this series and roll a follow
up series with all that's needed? Unless you need more time and are going
to miss 4.9 because of that... if the patches are ready, I say let's have
them all in one series; if not, I wonder what other people think on the
matter. I am basically more than half OK with also having the half-baked
code base merged.
Anyway, there's no point in reporting through ethtool something (VF vport
HW stats) that you can report in the standard and convenient manner, so
please do address this one regardless of the previous comment.
Or.
^ permalink raw reply
* Re: [PATCH RFC 1/3] xdp: Infrastructure to generalize XDP
From: Thomas Graf @ 2016-09-21 18:58 UTC (permalink / raw)
To: Tom Herbert
Cc: Jakub Kicinski, Alexei Starovoitov, David S. Miller,
Linux Kernel Network Developers, Kernel Team, Tariq Toukan,
Brenden Blanco, Alexei Starovoitov, Eric Dumazet,
Jesper Dangaard Brouer
In-Reply-To: <CALx6S34c6Mz=YpbiPcVNe38wTkxfjB9SWxaqtz1LivLeGzssRQ@mail.gmail.com>
On 09/21/16 at 11:50am, Tom Herbert wrote:
> 50 lines in one driver is not a big deal, 50 lines in a hundred
> drivers is! I learned this lesson in BQL which was well abstracted out
> to be minimally invasive but we still saw many issues because of the
> peculiarities of different drivers.
You want to enable XDP in a hundred drivers? Are you planning to
deploy ISA NIC based ILA routers? ;-)
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox