* [PATCH net-next 1/1] net/smc: add SMC rendezvous protocol
From: Ursula Braun @ 2017-10-10 14:14 UTC (permalink / raw)
To: davem
Cc: netdev, linux-s390, jwi, schwidefsky, heiko.carstens, raspl,
hwippel, ubraun
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
The SMC protocol [1] uses a rendezvous protocol to negotiate SMC
capability between peers. The current Linux implementation does not use
this rendezvous protocol and, thus, is not compliant to RFC7609 and
incompatible with other SMC implementations like in zOS. This patch adds
support for the SMC rendezvous protocol.
Details:
The SMC rendezvous protocol relies on the use of a new TCP experimental
option. With this option, SMC capabilities are exchanged between the
peers during the TCP three way handshake.
The goal of this patch is to leave common TCP code unmodified. Thus,
it uses netfilter hooks to intercept TCP SYN and SYN/ACK packets. For
outgoing packets originating from SMC sockets, the experimental option
is added. For inbound packets destined for SMC sockets, the experimental
option is checked.
Another goal was to minimize the performance impact on non-SMC traffic
(when SMC is enabled). The netfilter hooks used for SMC client
connections are active only during TCP connection establishment.
The netfilter hooks used for SMC servers are active as long as there are
listening SMC sockets.
When the hooks are active, the following additional operations are
performed on incoming and outgoing packets:
(1) call SMC netfilter hook (all IPv4 packets)
(2) check if TCP SYN or SYN/ACK packet (all IPv4 packets)
(3) check if packet goes to/comes from SMC socket (SYN & SYN/ACK
packets only)
(4) check/add SMC experimental option (SMC sockets' SYN & SYN/ACK
packets only)
References:
[1] SMC-R Informational RFC: http://www.rfc-editor.org/info/rfc7609
Signed-off-by: Hans Wippel <hwippel@linux.vnet.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
---
net/smc/Kconfig | 2 +-
net/smc/Makefile | 2 +-
net/smc/af_smc.c | 66 ++++++-
net/smc/smc.h | 10 +-
net/smc/smc_rv.c | 543 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
net/smc/smc_rv.h | 31 ++++
6 files changed, 646 insertions(+), 8 deletions(-)
create mode 100644 net/smc/smc_rv.c
create mode 100644 net/smc/smc_rv.h
diff --git a/net/smc/Kconfig b/net/smc/Kconfig
index c717ef0896aa..ad49086e8ed7 100644
--- a/net/smc/Kconfig
+++ b/net/smc/Kconfig
@@ -1,6 +1,6 @@
config SMC
tristate "SMC socket protocol family"
- depends on INET && INFINIBAND
+ depends on INET && INFINIBAND && NETFILTER
---help---
SMC-R provides a "sockets over RDMA" solution making use of
RDMA over Converged Ethernet (RoCE) technology to upgrade
diff --git a/net/smc/Makefile b/net/smc/Makefile
index 188104654b54..2155a7eff41d 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -1,4 +1,4 @@
obj-$(CONFIG_SMC) += smc.o
obj-$(CONFIG_SMC_DIAG) += smc_diag.o
smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
-smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o
+smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_rv.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 745f145d4c4d..290b9ff06e01 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -34,6 +34,7 @@
#include <net/smc.h>
#include "smc.h"
+#include "smc_rv.h"
#include "smc_clc.h"
#include "smc_llc.h"
#include "smc_cdc.h"
@@ -109,6 +110,7 @@ static int smc_release(struct socket *sock)
{
struct sock *sk = sock->sk;
struct smc_sock *smc;
+ int old_state;
int rc = 0;
if (!sk)
@@ -123,6 +125,7 @@ static int smc_release(struct socket *sock)
lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
else
lock_sock(sk);
+ old_state = sk->sk_state;
if (smc->use_fallback) {
sk->sk_state = SMC_CLOSED;
@@ -132,6 +135,10 @@ static int smc_release(struct socket *sock)
sock_set_flag(sk, SOCK_DEAD);
sk->sk_shutdown |= SHUTDOWN_MASK;
}
+ if (old_state == SMC_LISTEN) {
+ smc_rv_nf_unregister_hook(sock_net(sk), &smc_nfho_serv);
+ kfree(smc->listen_pends);
+ }
if (smc->clcsock) {
sock_release(smc->clcsock);
smc->clcsock = NULL;
@@ -178,6 +185,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
sk->sk_destruct = smc_destruct;
sk->sk_protocol = SMCPROTO_SMC;
smc = smc_sk(sk);
+ smc->use_fallback = true; /* default: not SMC-capable */
INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
INIT_LIST_HEAD(&smc->accept_q);
spin_lock_init(&smc->accept_q_lock);
@@ -390,6 +398,10 @@ static int smc_connect_rdma(struct smc_sock *smc)
int rc = 0;
u8 ibport;
+ if (smc->use_fallback)
+ /* peer has not signalled SMC-capability */
+ goto out_connected;
+
/* IPSec connections opt out of SMC-R optimizations */
if (using_ipsec(smc)) {
reason_code = SMC_CLC_DECL_IPSEC;
@@ -500,7 +512,6 @@ static int smc_connect_rdma(struct smc_sock *smc)
smc_tx_init(smc);
out_connected:
- smc_copy_sock_settings_to_clc(smc);
if (smc->sk.sk_state == SMC_INIT)
smc->sk.sk_state = SMC_ACTIVE;
@@ -555,7 +566,11 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
}
smc_copy_sock_settings_to_clc(smc);
+ smc_rv_nf_register_hook(sock_net(sk), &smc_nfho_clnt);
+
rc = kernel_connect(smc->clcsock, addr, alen, flags);
+ if (rc != -EINPROGRESS)
+ smc_rv_nf_unregister_hook(sock_net(sk), &smc_nfho_clnt);
if (rc)
goto out;
@@ -574,10 +589,12 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
+ struct smc_listen_pending *pnd;
struct sock *sk = &lsmc->sk;
struct socket *new_clcsock;
struct sock *new_sk;
- int rc;
+ unsigned long flags;
+ int i, rc;
release_sock(&lsmc->sk);
new_sk = smc_sock_alloc(sock_net(sk), NULL);
@@ -613,6 +630,25 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
}
(*new_smc)->clcsock = new_clcsock;
+
+ /* enable SMC-capability if an SMC-capable connecting socket is
+ * contained in listen_pends; invalidate this entry
+ */
+ spin_lock_irqsave(&lsmc->listen_pends_lock, flags);
+ for (i = 0; i < 2 * lsmc->sk.sk_max_ack_backlog; i++) {
+ pnd = lsmc->listen_pends + i;
+ if (pnd->used &&
+ pnd->addr == new_clcsock->sk->sk_daddr &&
+ pnd->port == new_clcsock->sk->sk_dport &&
+ jiffies_to_msecs(get_jiffies_64() - pnd->time) <=
+ SMC_LISTEN_PEND_VALID_TIME) {
+ (*new_smc)->use_fallback = false;
+ pnd->used = false;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&lsmc->listen_pends_lock, flags);
+
out:
return rc;
}
@@ -759,6 +795,10 @@ static void smc_listen_work(struct work_struct *work)
u8 prefix_len;
u8 ibport;
+ if (new_smc->use_fallback)
+ /* peer has not signalled SMC-capability */
+ goto out_connected;
+
/* do inband token exchange -
*wait for and receive SMC Proposal CLC message
*/
@@ -929,7 +969,6 @@ static void smc_tcp_listen_work(struct work_struct *work)
continue;
new_smc->listen_smc = lsmc;
- new_smc->use_fallback = false; /* assume rdma capability first*/
sock_hold(&lsmc->sk); /* sock_put in smc_listen_work */
INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
smc_copy_sock_settings_to_smc(new_smc);
@@ -954,16 +993,32 @@ static int smc_listen(struct socket *sock, int backlog)
if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
goto out;
+ rc = -ENOMEM;
+ /* Addresses and ports of incoming SYN packets with experimental option
+ * SMC are saved, but TCP might decide to drop them. Thus more slots
+ * than the backlog value are allocated for pending connecting sockets
+ */
+ smc->listen_pends = kzalloc(
+ 2 * backlog * sizeof(struct smc_listen_pending),
+ GFP_KERNEL);
+ if (!smc->listen_pends)
+ goto out;
+ spin_lock_init(&smc->listen_pends_lock);
+
rc = 0;
if (sk->sk_state == SMC_LISTEN) {
sk->sk_max_ack_backlog = backlog;
goto out;
}
+
+ smc->use_fallback = false; /* listen sockets are SMC-capable */
/* some socket options are handled in core, so we could not apply
* them to the clc socket -- copy smc socket options to clc socket
*/
smc_copy_sock_settings_to_clc(smc);
+ smc_rv_nf_register_hook(sock_net(sk), &smc_nfho_serv);
+
rc = kernel_listen(smc->clcsock, backlog);
if (rc)
goto out;
@@ -1114,7 +1169,7 @@ static unsigned int smc_poll(struct file *file, struct socket *sock,
struct sock *sk = sock->sk;
unsigned int mask = 0;
struct smc_sock *smc;
- int rc;
+ int rc = 0;
smc = smc_sk(sock->sk);
if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
@@ -1123,6 +1178,7 @@ static unsigned int smc_poll(struct file *file, struct socket *sock,
/* if non-blocking connect finished ... */
lock_sock(sk);
if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) {
+ smc_rv_nf_unregister_hook(sock_net(sk), &smc_nfho_clnt);
sk->sk_err = smc->clcsock->sk->sk_err;
if (sk->sk_err) {
mask |= POLLERR;
@@ -1348,7 +1404,6 @@ static int smc_create(struct net *net, struct socket *sock, int protocol,
/* create internal TCP socket for CLC handshake and fallback */
smc = smc_sk(sk);
- smc->use_fallback = false; /* assume rdma capability first */
rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
IPPROTO_TCP, &smc->clcsock);
if (rc)
@@ -1370,6 +1425,7 @@ static int __init smc_init(void)
{
int rc;
+ smc_rv_init();
rc = smc_pnet_init();
if (rc)
return rc;
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 0ccd6fa387ad..96d7a20ba7db 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -167,6 +167,13 @@ struct smc_connection {
struct work_struct close_work; /* peer sent some closing */
};
+struct smc_listen_pending {
+ u64 time; /* time when entry was created*/
+ bool used; /* true if entry is in use */
+ __be32 addr; /* address of a listen socket */
+ __be16 port; /* port of a listen socket */
+};
+
struct smc_sock { /* smc sock container */
struct sock sk;
struct socket *clcsock; /* internal tcp socket */
@@ -175,6 +182,8 @@ struct smc_sock { /* smc sock container */
struct smc_sock *listen_smc; /* listen parent */
struct work_struct tcp_listen_work;/* handle tcp socket accepts */
struct work_struct smc_listen_work;/* prepare new accept socket */
+ struct smc_listen_pending *listen_pends;/* listen pending SYNs */
+ spinlock_t listen_pends_lock; /* protects listen_pends */
struct list_head accept_q; /* sockets to be accepted */
spinlock_t accept_q_lock; /* protects accept_q */
struct delayed_work sock_put_work; /* final socket freeing */
@@ -271,5 +280,4 @@ int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr,
struct smc_clc_msg_local *lcl, int srv_first_contact);
struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock);
void smc_close_non_accepted(struct sock *sk);
-
#endif /* __SMC_H */
diff --git a/net/smc/smc_rv.c b/net/smc/smc_rv.c
new file mode 100644
index 000000000000..4ce01dec808f
--- /dev/null
+++ b/net/smc/smc_rv.c
@@ -0,0 +1,543 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * SMC Rendezvous to determine SMC-capability of the peer
+ *
+ * Copyright IBM Corp. 2017
+ *
+ * Author(s): Hans Wippel <hwippel@linux.vnet.ibm.com>
+ * Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <net/tcp.h>
+
+#include "smc.h"
+#include "smc_rv.h"
+
+#define TCPOLEN_SMC 8
+#define TCPOLEN_SMC_BASE 6
+#define TCPOLEN_SMC_ALIGNED 2
+
+static const char TCPOPT_SMC_MAGIC[4] = {'\xe2', '\xd4', '\xc3', '\xd9'};
+
+/* in TCP header, replace EOL option and remaining header bytes with NOPs */
+static bool smc_rv_replace_eol_option(struct sk_buff *skb)
+{
+ struct tcphdr *tcph = tcp_hdr(skb);
+ int opt_bytes = tcp_optlen(skb);
+ unsigned char *buf;
+ int i = 0;
+
+ buf = (unsigned char *)(tcph + 1);
+ /* Parse TCP options. Based on tcp_parse_options in tcp_input.c */
+ while (i < opt_bytes) {
+ switch (buf[i]) {
+ /* one byte options */
+ case TCPOPT_EOL:
+ /* replace remaining bytes with NOPs */
+ while (i < opt_bytes) {
+ buf[i] = TCPOPT_NOP;
+ i++;
+ }
+ return true;
+ case TCPOPT_NOP:
+ i++;
+ continue;
+ default:
+ /* multi-byte options */
+ if (buf[i + 1] < 2 || i + buf[i + 1] > opt_bytes)
+ return false; /* bad option */
+ i += buf[i + 1];
+ continue;
+ }
+ }
+ return true;
+}
+
+/* check if TCP header contains SMC option */
+static bool smc_rv_has_smc_option(struct sk_buff *skb)
+{
+ struct tcphdr *tcph = tcp_hdr(skb);
+ int opt_bytes = tcp_optlen(skb);
+ unsigned char *buf;
+ int i = 0;
+
+ buf = (unsigned char *)(tcph + 1);
+ /* Parse TCP options. Based on tcp_parse_options in tcp_input.c */
+ while (i < opt_bytes) {
+ switch (buf[i]) {
+ /* one byte options */
+ case TCPOPT_EOL:
+ return false;
+ case TCPOPT_NOP:
+ i++;
+ continue;
+ default:
+ /* multi-byte options */
+ if (buf[i + 1] < 2)
+ return false; /* bad option */
+ /* check for SMC rendezvous option */
+ if (buf[i] == TCPOPT_EXP &&
+ buf[i + 1] == TCPOLEN_SMC_BASE &&
+ (opt_bytes - i >= TCPOLEN_SMC_BASE) &&
+ !memcmp(&buf[i + 2], TCPOPT_SMC_MAGIC,
+ sizeof(TCPOPT_SMC_MAGIC)))
+ return true;
+ i += buf[i + 1];
+ continue;
+ }
+ }
+
+ return false;
+}
+
+/* Add SMC option to TCP header */
+static int smc_rv_add_smc_option(struct sk_buff *skb)
+{
+ unsigned char smc_opt[] = {TCPOPT_NOP, TCPOPT_NOP,
+ TCPOPT_EXP, TCPOLEN_SMC_BASE,
+ TCPOPT_SMC_MAGIC[0], TCPOPT_SMC_MAGIC[1],
+ TCPOPT_SMC_MAGIC[2], TCPOPT_SMC_MAGIC[3]};
+ struct tcphdr *tcph = tcp_hdr(skb);
+ struct iphdr *iph = ip_hdr(skb);
+ int tcplen = 0;
+
+ if (skb_availroom(skb) < TCPOLEN_SMC)
+ return -EFAULT;
+
+ if (tcp_optlen(skb) + TCPOLEN_SMC > MAX_TCP_OPTION_SPACE)
+ return -EFAULT;
+
+ /* give up if there is data after the TCP header */
+ if (skb_headlen(skb) > ip_hdrlen(skb) + tcp_hdrlen(skb))
+ return -EFAULT;
+
+ if (smc_rv_has_smc_option(skb))
+ return -EFAULT;
+
+ if (!smc_rv_replace_eol_option(skb))
+ return -EFAULT;
+
+ iph->tot_len = cpu_to_be16(be16_to_cpu(iph->tot_len) + TCPOLEN_SMC);
+ iph->check = 0;
+ iph->check = ip_fast_csum(iph, iph->ihl);
+ skb_put_data(skb, smc_opt, TCPOLEN_SMC);
+ tcph->doff += TCPOLEN_SMC_ALIGNED;
+ tcplen = (skb->len - ip_hdrlen(skb));
+ tcph->check = 0;
+ tcph->check = tcp_v4_check(tcplen, iph->saddr, iph->daddr,
+ csum_partial(tcph, tcplen, 0));
+ skb->ip_summed = CHECKSUM_NONE;
+ return 0;
+}
+
+/* return an smc socket with certain source and destination */
+static struct smc_sock *smc_rv_lookup_connecting_smc(struct net *net,
+ __be32 dest_addr,
+ __be16 dest_port,
+ __be32 source_addr,
+ __be16 source_port)
+{
+ struct smc_sock *smc = NULL;
+ struct hlist_head *head;
+ struct socket *clcsock;
+ struct sock *sk;
+
+ read_lock(&smc_proto.h.smc_hash->lock);
+ head = &smc_proto.h.smc_hash->ht;
+
+ if (hlist_empty(head))
+ goto out;
+
+ sk_for_each(sk, head) {
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (sk->sk_state != SMC_INIT)
+ continue;
+ clcsock = smc_sk(sk)->clcsock;
+ if (!clcsock)
+ continue;
+ if (source_port != htons(clcsock->sk->sk_num))
+ continue;
+ if (source_addr != clcsock->sk->sk_rcv_saddr)
+ continue;
+ if (dest_port != clcsock->sk->sk_dport)
+ continue;
+ if (dest_addr == clcsock->sk->sk_daddr) {
+ smc = smc_sk(sk);
+ break;
+ }
+ }
+
+out:
+ read_unlock(&smc_proto.h.smc_hash->lock);
+ return smc;
+}
+
+/* for netfilter smc_rv_hook_out_clnt (outgoing SYN):
+ * check if there exists a connecting smc socket with certain source and
+ * destination
+ */
+static bool smc_rv_exists_connecting_smc(struct net *net,
+ __be32 dest_addr,
+ __be16 dest_port,
+ __be32 source_addr,
+ __be16 source_port)
+{
+ return (smc_rv_lookup_connecting_smc(net, dest_addr, dest_port,
+ source_addr, source_port) ?
+ true : false);
+}
+
+/* for netfilter smc_rv_hook_in_clnt (incoming SYN ACK):
+ * enable SMC-capability for the corresponding socket
+ */
+static void smc_rv_accepting_smc_peer(struct net *net,
+ __be32 dest_addr,
+ __be16 dest_port,
+ __be32 source_addr,
+ __be16 source_port)
+{
+ struct smc_sock *smc;
+
+ smc = smc_rv_lookup_connecting_smc(net, dest_addr, dest_port,
+ source_addr, source_port);
+ if (smc)
+ /* connection is SMC-capable */
+ smc->use_fallback = false;
+}
+
+/* return an smc socket listening on a certain port */
+static struct smc_sock *smc_rv_lookup_listen_socket(struct net *net,
+ __be32 listen_addr,
+ __be16 listen_port)
+{
+ struct smc_sock *smc = NULL;
+ struct hlist_head *head;
+ struct socket *clcsock;
+ struct sock *sk;
+
+ read_lock(&smc_proto.h.smc_hash->lock);
+ head = &smc_proto.h.smc_hash->ht;
+
+ if (hlist_empty(head))
+ goto out;
+
+ sk_for_each(sk, head) {
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (sk->sk_state != SMC_LISTEN)
+ continue;
+ clcsock = smc_sk(sk)->clcsock;
+ if (listen_port != htons(clcsock->sk->sk_num))
+ continue;
+ if (!listen_addr || !clcsock->sk->sk_rcv_saddr ||
+ listen_addr == clcsock->sk->sk_rcv_saddr) {
+ smc = smc_sk(sk);
+ break;
+ }
+ }
+
+out:
+ read_unlock(&smc_proto.h.smc_hash->lock);
+ return smc;
+}
+
+/* for netfilter smc_rv_hook_in_serv (incoming SYN):
+ * save addr and port of connecting smc peer
+ */
+static void smc_rv_connecting_smc_peer(struct net *net,
+ __be32 listen_addr,
+ __be16 listen_port,
+ __be32 peer_addr,
+ __be16 peer_port)
+{
+ struct smc_listen_pending *pnd;
+ struct smc_sock *lsmc;
+ unsigned long flags;
+ int i;
+
+ lsmc = smc_rv_lookup_listen_socket(net, listen_addr, listen_port);
+ if (!lsmc)
+ return;
+
+ spin_lock_irqsave(&lsmc->listen_pends_lock, flags);
+ for (i = 0; i < 2 * lsmc->sk.sk_max_ack_backlog; i++) {
+ pnd = lsmc->listen_pends + i;
+ /* either use an unused entry or reuse an outdated entry */
+ if (!pnd->used ||
+ jiffies_to_msecs(get_jiffies_64() - pnd->time) >
+ SMC_LISTEN_PEND_VALID_TIME) {
+ pnd->used = true;
+ pnd->addr = peer_addr;
+ pnd->port = peer_port;
+ pnd->time = get_jiffies_64();
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&lsmc->listen_pends_lock, flags);
+}
+
+/* for netfilter smc_rv_hook_out_serv (outgoing SYN/ACK):
+ * remove listen_pends entry of connecting smc peer in case of a problem
+ */
+static void smc_rv_remove_smc_peer(struct net *net,
+ __be32 listen_addr,
+ __be16 listen_port,
+ __be32 peer_addr,
+ __be16 peer_port)
+{
+ struct smc_listen_pending *pnd;
+ struct smc_sock *lsmc;
+ unsigned long flags;
+ int i;
+
+ lsmc = smc_rv_lookup_listen_socket(net, listen_addr, listen_port);
+ if (!lsmc)
+ return;
+
+ spin_lock_irqsave(&lsmc->listen_pends_lock, flags);
+ for (i = 0; i < 2 * lsmc->sk.sk_max_ack_backlog; i++) {
+ pnd = lsmc->listen_pends + i;
+ if (pnd->used &&
+ pnd->addr == peer_addr &&
+ pnd->port == peer_port &&
+ jiffies_to_msecs(get_jiffies_64() - pnd->time) <=
+ SMC_LISTEN_PEND_VALID_TIME) {
+ pnd->used = false;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&lsmc->listen_pends_lock, flags);
+}
+
+/* for netfilter smc_rv_hook_out_serv (outgoing SYN ACK):
+ * check if there has been a connecting smc peer
+ */
+static bool smc_rv_exists_connecting_smc_peer(struct net *net,
+ __be32 listen_addr,
+ __be16 listen_port,
+ __be32 peer_addr,
+ __be16 peer_port)
+{
+ struct smc_listen_pending *pnd;
+ struct smc_sock *lsmc;
+ unsigned long flags;
+ int i;
+
+ lsmc = smc_rv_lookup_listen_socket(net, listen_addr, listen_port);
+ if (!lsmc)
+ return false;
+
+ spin_lock_irqsave(&lsmc->listen_pends_lock, flags);
+ for (i = 0; i < 2 * lsmc->sk.sk_max_ack_backlog; i++) {
+ pnd = lsmc->listen_pends + i;
+ if (pnd->used &&
+ pnd->addr == peer_addr &&
+ pnd->port == peer_port &&
+ jiffies_to_msecs(get_jiffies_64() - pnd->time) <=
+ SMC_LISTEN_PEND_VALID_TIME) {
+ spin_unlock_irqrestore(&lsmc->listen_pends_lock, flags);
+ return true;
+ }
+ }
+ spin_unlock_irqrestore(&lsmc->listen_pends_lock, flags);
+ return false;
+}
+
+/* Netfilter hooks */
+
+/* netfilter hook for incoming packets (client) */
+static unsigned int smc_rv_hook_in_clnt(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct tcphdr *tcph = tcp_hdr(skb);
+ struct iphdr *iph;
+
+ if (skb_headlen(skb) - sizeof(*iph) < sizeof(*tcph))
+ return NF_ACCEPT;
+
+ iph = ip_hdr(skb);
+ if (iph->protocol != IPPROTO_TCP)
+ return NF_ACCEPT;
+
+ /* Local SMC client, incoming SYN,ACK from server
+ * check if there really is a local SMC client
+ * and tell the client connection if the server is SMC capable
+ */
+ if (tcph->syn == 1 && tcph->ack == 1) {
+ /* check for experimental option */
+ if (!smc_rv_has_smc_option(skb))
+ return NF_ACCEPT;
+ /* add info about server SMC capability */
+ smc_rv_accepting_smc_peer(state->net, iph->saddr, tcph->source,
+ iph->daddr, tcph->dest);
+ }
+ return NF_ACCEPT;
+}
+
+/* netfilter hook for incoming packets (server) */
+static unsigned int smc_rv_hook_in_serv(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct tcphdr *tcph = tcp_hdr(skb);
+ struct iphdr *iph;
+
+ if (skb_headlen(skb) - sizeof(*iph) < sizeof(*tcph))
+ return NF_ACCEPT;
+
+ iph = ip_hdr(skb);
+ if (iph->protocol != IPPROTO_TCP)
+ return NF_ACCEPT;
+
+ /* Local SMC Server, incoming SYN request from client
+ * check if there is a local SMC server
+ * and tell the server if there is a new SMC capable client
+ */
+ if (tcph->syn == 1 && tcph->ack == 0) {
+ /* check for experimental option */
+ if (!smc_rv_has_smc_option(skb))
+ return NF_ACCEPT;
+ /* add info about new client SMC capability */
+ smc_rv_connecting_smc_peer(state->net, iph->daddr, tcph->dest,
+ iph->saddr, tcph->source);
+ }
+ return NF_ACCEPT;
+}
+
+/* netfilter hook for outgoing packets (client) */
+static unsigned int smc_rv_hook_out_clnt(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct tcphdr *tcph = tcp_hdr(skb);
+ struct iphdr *iph;
+
+ if (skb_headlen(skb) - sizeof(*iph) < sizeof(*tcph))
+ return NF_ACCEPT;
+
+ iph = ip_hdr(skb);
+ if (iph->protocol != IPPROTO_TCP)
+ return NF_ACCEPT;
+
+ /* Local SMC client, outgoing SYN request to server
+ * add TCP experimental option if there really is a local SMC client
+ */
+ if (tcph->syn == 1 && tcph->ack == 0) {
+ /* check for local SMC client */
+ if (!smc_rv_exists_connecting_smc(state->net,
+ iph->daddr, tcph->dest,
+ iph->saddr, tcph->source))
+ return NF_ACCEPT;
+ /* add experimental option */
+ smc_rv_add_smc_option(skb);
+ }
+ return NF_ACCEPT;
+}
+
+/* netfilter hook for outgoing packets (server) */
+static unsigned int smc_rv_hook_out_serv(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct tcphdr *tcph = tcp_hdr(skb);
+ struct iphdr *iph;
+
+ if (skb_headlen(skb) - sizeof(*iph) < sizeof(*tcph))
+ return NF_ACCEPT;
+
+ iph = ip_hdr(skb);
+ if (iph->protocol != IPPROTO_TCP)
+ return NF_ACCEPT;
+
+ /* Local SMC server, outgoing SYN,ACK to client
+ * add TCP experimental option if there really is a local SMC server
+ */
+ if (tcph->syn == 1 && tcph->ack == 1) {
+ /* check if client's SYN contained the experimental option */
+ if (!smc_rv_exists_connecting_smc_peer(state->net,
+ iph->saddr, tcph->source,
+ iph->daddr, tcph->dest))
+ return NF_ACCEPT;
+ /* add experimental option */
+ if (smc_rv_add_smc_option(skb) < 0)
+ smc_rv_remove_smc_peer(state->net,
+ iph->saddr, tcph->source,
+ iph->daddr, tcph->dest);
+ }
+ return NF_ACCEPT;
+}
+
+static struct nf_hook_ops smc_nfho_ops_clnt[] = {
+ {
+ .hook = smc_rv_hook_in_clnt,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .pf = PF_INET,
+ .priority = NF_IP_PRI_FIRST,
+ },
+ {
+ .hook = smc_rv_hook_out_clnt,
+ .hooknum = NF_INET_POST_ROUTING,
+ .pf = PF_INET,
+ .priority = NF_IP_PRI_FIRST,
+ },
+};
+
+static struct nf_hook_ops smc_nfho_ops_serv[] = {
+ {
+ .hook = smc_rv_hook_in_serv,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .pf = PF_INET,
+ .priority = NF_IP_PRI_FIRST,
+ },
+ {
+ .hook = smc_rv_hook_out_serv,
+ .hooknum = NF_INET_POST_ROUTING,
+ .pf = PF_INET,
+ .priority = NF_IP_PRI_FIRST,
+ },
+};
+
+struct smc_nf_hook smc_nfho_clnt = {
+ .refcount = 0,
+ .hook = &smc_nfho_ops_clnt[0],
+};
+
+struct smc_nf_hook smc_nfho_serv = {
+ .refcount = 0,
+ .hook = &smc_nfho_ops_serv[0],
+};
+
+int smc_rv_nf_register_hook(struct net *net, struct smc_nf_hook *nfho)
+{
+ int rc = 0;
+
+ mutex_lock(&nfho->nf_hook_mutex);
+ if (!(nfho->refcount++)) {
+ rc = nf_register_net_hooks(net, nfho->hook, 2);
+ if (rc)
+ nfho->refcount--;
+ }
+ mutex_unlock(&nfho->nf_hook_mutex);
+ return rc;
+}
+
+void smc_rv_nf_unregister_hook(struct net *net, struct smc_nf_hook *nfho)
+{
+ mutex_lock(&nfho->nf_hook_mutex);
+ if (!(--nfho->refcount))
+ nf_unregister_net_hooks(net, nfho->hook, 2);
+ mutex_unlock(&nfho->nf_hook_mutex);
+}
+
+void __init smc_rv_init(void)
+{
+ mutex_init(&smc_nfho_clnt.nf_hook_mutex);
+ mutex_init(&smc_nfho_serv.nf_hook_mutex);
+}
diff --git a/net/smc/smc_rv.h b/net/smc/smc_rv.h
new file mode 100644
index 000000000000..c3bdf4c0a5cb
--- /dev/null
+++ b/net/smc/smc_rv.h
@@ -0,0 +1,31 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Definitions for SMC Rendezvous - SMC capability checking
+ *
+ * Copyright IBM Corp. 2017
+ *
+ * Author(s): Hans Wippel <hwippel@linux.vnet.ibm.com>
+ * Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_RV_H
+#define _SMC_RV_H
+
+#include <linux/netfilter.h>
+
+#define SMC_LISTEN_PEND_VALID_TIME (600 * HZ)
+
+struct smc_nf_hook {
+ struct mutex nf_hook_mutex; /* serialize nf register ops */
+ int refcount;
+ struct nf_hook_ops *hook;
+};
+
+extern struct smc_nf_hook smc_nfho_clnt;
+extern struct smc_nf_hook smc_nfho_serv;
+
+int smc_rv_nf_register_hook(struct net *net, struct smc_nf_hook *nfho);
+void smc_rv_nf_unregister_hook(struct net *net, struct smc_nf_hook *nfho);
+void smc_rv_init(void) __init;
+#endif
--
2.13.5
^ permalink raw reply related
* [PATCH net 2/2] net/smc: dev_put for netdev after usage of ib_query_gid()
From: Ursula Braun @ 2017-10-10 14:13 UTC (permalink / raw)
To: davem
Cc: netdev, linux-rdma, linux-s390, jwi, schwidefsky, heiko.carstens,
raspl, ubraun, parav
In-Reply-To: <20171010141351.87700-1-ubraun@linux.vnet.ibm.com>
For ROCEs ib_query_gid() takes a reference count on the net_device.
This reference count must be decreased by the caller.
Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
---
net/smc/smc_core.c | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 20b66e79c5d6..e93a31ec3cc2 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -380,10 +380,13 @@ static int smc_link_determine_gid(struct smc_link_group *lgr)
if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid,
&gattr))
continue;
- if (gattr.ndev &&
- (vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id)) {
- lnk->gid = gid;
- return 0;
+ if (gattr.ndev) {
+ if (vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id) {
+ lnk->gid = gid;
+ dev_put(gattr.ndev);
+ return 0;
+ }
+ dev_put(gattr.ndev);
}
}
return -ENODEV;
--
2.13.5
^ permalink raw reply related
* [PATCH net 1/2] net/smc: replace function pointer get_netdev()
From: Ursula Braun @ 2017-10-10 14:13 UTC (permalink / raw)
To: davem-fT/PcQaiUtIeIZ0/mPfg9Q
Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
linux-s390-u79uwXL29TY76Z2rM5mHXA,
jwi-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
schwidefsky-tA70FqPdS9bQT0dZR+AlfA,
heiko.carstens-tA70FqPdS9bQT0dZR+AlfA,
raspl-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
ubraun-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
parav-VPRAkNaXOzVWk0Htik3J/w
In-Reply-To: <20171010141351.87700-1-ubraun-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
SMC should not open code the function pointer get_netdev of the
IB device. Replacing ib_query_gid(..., NULL) with
ib_query_gid(..., gid_attr) allows access to the netdev.
Signed-off-by: Ursula Braun <ubraun-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Suggested-by: Parav Pandit <parav-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
net/smc/smc_ib.c | 26 +++++++++-----------------
1 file changed, 9 insertions(+), 17 deletions(-)
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 0b5852299158..b428c0f6c782 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -369,26 +369,17 @@ void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev,
static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport)
{
- struct net_device *ndev;
+ struct ib_gid_attr gattr;
int rc;
rc = ib_query_gid(smcibdev->ibdev, ibport, 0,
- &smcibdev->gid[ibport - 1], NULL);
- /* the SMC protocol requires specification of the roce MAC address;
- * if net_device cannot be determined, it can be derived from gid 0
- */
- ndev = smcibdev->ibdev->get_netdev(smcibdev->ibdev, ibport);
- if (ndev) {
- memcpy(&smcibdev->mac, ndev->dev_addr, ETH_ALEN);
- dev_put(ndev);
- } else if (!rc) {
- memcpy(&smcibdev->mac[ibport - 1][0],
- &smcibdev->gid[ibport - 1].raw[8], 3);
- memcpy(&smcibdev->mac[ibport - 1][3],
- &smcibdev->gid[ibport - 1].raw[13], 3);
- smcibdev->mac[ibport - 1][0] &= ~0x02;
- }
- return rc;
+ &smcibdev->gid[ibport - 1], &gattr);
+ if (rc || !gattr.ndev)
+ return -ENODEV;
+
+ memcpy(smcibdev->mac[ibport - 1], gattr.ndev->dev_addr, ETH_ALEN);
+ dev_put(gattr.ndev);
+ return 0;
}
/* Create an identifier unique for this instance of SMC-R.
@@ -419,6 +410,7 @@ int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
&smcibdev->pattr[ibport - 1]);
if (rc)
goto out;
+ /* the SMC protocol requires specification of the roce MAC address */
rc = smc_ib_fill_gid_and_mac(smcibdev, ibport);
if (rc)
goto out;
--
2.13.5
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH net 0/2] net/smc: ib_query_gid() patches
From: Ursula Braun @ 2017-10-10 14:13 UTC (permalink / raw)
To: davem-fT/PcQaiUtIeIZ0/mPfg9Q
Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
linux-s390-u79uwXL29TY76Z2rM5mHXA,
jwi-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
schwidefsky-tA70FqPdS9bQT0dZR+AlfA,
heiko.carstens-tA70FqPdS9bQT0dZR+AlfA,
raspl-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
ubraun-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
parav-VPRAkNaXOzVWk0Htik3J/w
Dave,
triggered by Parav Pandit here are 2 cleanup patches for usage of
ib_query_gid() in the smc-code.
Thanks, Ursula
Ursula Braun (2):
net/smc: replace function pointer get_netdev()
net/smc: dev_put for netdev after usage of ib_query_gid()
net/smc/smc_core.c | 11 +++++++----
net/smc/smc_ib.c | 26 +++++++++-----------------
2 files changed, 16 insertions(+), 21 deletions(-)
--
2.13.5
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [RFC net 1/1] net: sched: act: fix rcu race in dump
From: Eric Dumazet @ 2017-10-10 14:12 UTC (permalink / raw)
To: Alexander Aring; +Cc: jhs, xiyou.wangcong, jiri, netdev, kurup.manish, bjb
In-Reply-To: <20171010123218.5251-2-aring@mojatatu.com>
On Tue, 2017-10-10 at 08:32 -0400, Alexander Aring wrote:
> This patch fixes an issue with kfree_rcu which is not protected by RTNL
> lock. It could be that the current assigned rcu pointer will be freed by
> kfree_rcu while dump callback is running.
>
> To prevent this, we call rcu_synchronize at first. Then we are sure all
> latest rcu functions e.g. rcu_assign_pointer and kfree_rcu in init are
> done. After rcu_synchronize we dereference under RTNL lock which is also
> held in init function, which means no other rcu_assign_pointer or
> kfree_rcu will occur.
>
> To call rcu_synchronize will also prevent weird behaviours by doing over
> netlink:
>
> - set params A
> - set params B
> - dump params
> \--> will dump params A
>
> This could be a unlikely case that the last rcu_assign_pointer was not
> happened before dump callback.
>
> Signed-off-by: Alexander Aring <aring@mojatatu.com>
> ---
> net/sched/act_skbmod.c | 7 ++++++-
> 1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
> index b642ad3d39dd..231e07bca384 100644
> --- a/net/sched/act_skbmod.c
> +++ b/net/sched/act_skbmod.c
> @@ -198,7 +198,7 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a,
> {
> struct tcf_skbmod *d = to_skbmod(a);
> unsigned char *b = skb_tail_pointer(skb);
> - struct tcf_skbmod_params *p = rtnl_dereference(d->skbmod_p);
> + struct tcf_skbmod_params *p;
> struct tc_skbmod opt = {
> .index = d->tcf_index,
> .refcnt = d->tcf_refcnt - ref,
> @@ -207,6 +207,11 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a,
> };
> struct tcf_t t;
>
> + /* wait until last rcu_assign_pointer/kfree_rcu is done */
> + rcu_synchronize();
> + /* RTNL lock prevents another rcu_assign_pointer/kfree_rcu call */
> + p = rtnl_dereference(d->skbmod_p);
> +
> opt.flags = p->flags;
> if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt))
> goto nla_put_failure;
Sorry but no. This is plainly wrong.
We need to fix this without adding a _very_ expensive rcu_synchronize()
on a path which does not need such thing.
I am confused by this patch, please tell us more what the problem is.
I suspect rcu_read_lock() is what you need, but isn't a writer supposed
to hold RTNL in net/sched/* ???
^ permalink raw reply
* [PATCH 4/4] fsl/fman: add dpaa in module names
From: Madalin Bucur @ 2017-10-10 14:10 UTC (permalink / raw)
To: netdev, davem; +Cc: f.fainelli, andrew, vivien.didelot, junote, linux-kernel
In-Reply-To: <1507644618-32006-1-git-send-email-madalin.bucur@nxp.com>
Signed-off-by: Madalin Bucur <madalin.bucur@nxp.com>
---
drivers/net/ethernet/freescale/fman/Makefile | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/drivers/net/ethernet/freescale/fman/Makefile b/drivers/net/ethernet/freescale/fman/Makefile
index 2c38119..4ae524a 100644
--- a/drivers/net/ethernet/freescale/fman/Makefile
+++ b/drivers/net/ethernet/freescale/fman/Makefile
@@ -1,9 +1,9 @@
subdir-ccflags-y += -I$(srctree)/drivers/net/ethernet/freescale/fman
-obj-$(CONFIG_FSL_FMAN) += fsl_fman.o
-obj-$(CONFIG_FSL_FMAN) += fsl_fman_port.o
-obj-$(CONFIG_FSL_FMAN) += fsl_mac.o
+obj-$(CONFIG_FSL_FMAN) += fsl_dpaa_fman.o
+obj-$(CONFIG_FSL_FMAN) += fsl_dpaa_fman_port.o
+obj-$(CONFIG_FSL_FMAN) += fsl_dpaa_mac.o
-fsl_fman-objs := fman_muram.o fman.o fman_sp.o fman_keygen.o
-fsl_fman_port-objs := fman_port.o
-fsl_mac-objs:= mac.o fman_dtsec.o fman_memac.o fman_tgec.o
+fsl_dpaa_fman-objs := fman_muram.o fman.o fman_sp.o fman_keygen.o
+fsl_dpaa_fman_port-objs := fman_port.o
+fsl_dpaa_mac-objs:= mac.o fman_dtsec.o fman_memac.o fman_tgec.o
--
2.1.0
^ permalink raw reply related
* [PATCH 3/4] dpaa_eth: change device used
From: Madalin Bucur @ 2017-10-10 14:10 UTC (permalink / raw)
To: netdev, davem; +Cc: f.fainelli, andrew, vivien.didelot, junote, linux-kernel
In-Reply-To: <1507644618-32006-1-git-send-email-madalin.bucur@nxp.com>
Change device used for DMA mapping to the MAC device that is an
of_device, with proper DMA ops. Using this device for the netdevice
should also address the issue with DSA scenarios that need the
netdevice to be backed by an of_device.
Signed-off-by: Madalin Bucur <madalin.bucur@nxp.com>
---
drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 41 ++++++++------------------
drivers/net/ethernet/freescale/fman/mac.c | 37 ++++++++++-------------
drivers/net/ethernet/freescale/fman/mac.h | 1 -
3 files changed, 27 insertions(+), 52 deletions(-)
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
index 7cf61d6..428ef2b 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
@@ -385,34 +385,19 @@ static int dpaa_setup_tc(struct net_device *net_dev, enum tc_setup_type type,
static struct mac_device *dpaa_mac_dev_get(struct platform_device *pdev)
{
- struct platform_device *of_dev;
struct dpaa_eth_data *eth_data;
- struct device *dpaa_dev, *dev;
- struct device_node *mac_node;
+ struct device *dpaa_dev;
struct mac_device *mac_dev;
dpaa_dev = &pdev->dev;
eth_data = dpaa_dev->platform_data;
- if (!eth_data)
+ if (!eth_data) {
+ dev_err(dpaa_dev, "eth_data missing\n");
return ERR_PTR(-ENODEV);
-
- mac_node = eth_data->mac_node;
-
- of_dev = of_find_device_by_node(mac_node);
- if (!of_dev) {
- dev_err(dpaa_dev, "of_find_device_by_node(%pOF) failed\n",
- mac_node);
- of_node_put(mac_node);
- return ERR_PTR(-EINVAL);
}
- of_node_put(mac_node);
-
- dev = &of_dev->dev;
-
- mac_dev = dev_get_drvdata(dev);
+ mac_dev = eth_data->mac_dev;
if (!mac_dev) {
- dev_err(dpaa_dev, "dev_get_drvdata(%s) failed\n",
- dev_name(dev));
+ dev_err(dpaa_dev, "mac_dev missing\n");
return ERR_PTR(-EINVAL);
}
@@ -2696,7 +2681,13 @@ static int dpaa_eth_probe(struct platform_device *pdev)
int err = 0, i, channel;
struct device *dev;
- dev = &pdev->dev;
+ /* device used for DMA mapping */
+ dev = pdev->dev.parent;
+ err = dma_coerce_mask_and_coherent(dev, DMA_BIT_MASK(40));
+ if (err) {
+ dev_err(dev, "dma_coerce_mask_and_coherent() failed\n");
+ goto dev_mask_failed;
+ }
/* Allocate this early, so we can store relevant information in
* the private area
@@ -2738,14 +2729,6 @@ static int dpaa_eth_probe(struct platform_device *pdev)
priv->buf_layout[RX].priv_data_size = DPAA_RX_PRIV_DATA_SIZE; /* Rx */
priv->buf_layout[TX].priv_data_size = DPAA_TX_PRIV_DATA_SIZE; /* Tx */
- /* device used for DMA mapping */
- set_dma_ops(dev, get_dma_ops(&pdev->dev));
- err = dma_coerce_mask_and_coherent(dev, DMA_BIT_MASK(40));
- if (err) {
- dev_err(dev, "dma_coerce_mask_and_coherent() failed\n");
- goto dev_mask_failed;
- }
-
/* bp init */
for (i = 0; i < DPAA_BPS_NUM; i++) {
int err;
diff --git a/drivers/net/ethernet/freescale/fman/mac.c b/drivers/net/ethernet/freescale/fman/mac.c
index a0a3107..1d6da1e 100644
--- a/drivers/net/ethernet/freescale/fman/mac.c
+++ b/drivers/net/ethernet/freescale/fman/mac.c
@@ -542,8 +542,7 @@ static const u16 phy2speed[] = {
};
static struct platform_device *dpaa_eth_add_device(int fman_id,
- struct mac_device *mac_dev,
- struct device_node *node)
+ struct mac_device *mac_dev)
{
struct platform_device *pdev;
struct dpaa_eth_data data;
@@ -556,10 +555,8 @@ static struct platform_device *dpaa_eth_add_device(int fman_id,
data.mac_dev = mac_dev;
data.mac_hw_id = priv->cell_index;
data.fman_hw_id = fman_id;
- data.mac_node = node;
mutex_lock(ð_lock);
-
pdev = platform_device_alloc("dpaa-ethernet", dpaa_eth_dev_cnt);
if (!pdev) {
ret = -ENOMEM;
@@ -648,9 +645,6 @@ static int mac_probe(struct platform_device *_of_dev)
goto _return;
}
- /* Register mac_dev */
- dev_set_drvdata(dev, mac_dev);
-
INIT_LIST_HEAD(&priv->mc_addr_list);
/* Get the FM node */
@@ -659,7 +653,7 @@ static int mac_probe(struct platform_device *_of_dev)
dev_err(dev, "of_get_parent(%pOF) failed\n",
mac_node);
err = -EINVAL;
- goto _return_dev_set_drvdata;
+ goto _return_of_get_parent;
}
of_dev = of_find_device_by_node(dev_node);
@@ -693,7 +687,7 @@ static int mac_probe(struct platform_device *_of_dev)
if (err < 0) {
dev_err(dev, "of_address_to_resource(%pOF) = %d\n",
mac_node, err);
- goto _return_dev_set_drvdata;
+ goto _return_of_get_parent;
}
mac_dev->res = __devm_request_region(dev,
@@ -703,7 +697,7 @@ static int mac_probe(struct platform_device *_of_dev)
if (!mac_dev->res) {
dev_err(dev, "__devm_request_mem_region(mac) failed\n");
err = -EBUSY;
- goto _return_dev_set_drvdata;
+ goto _return_of_get_parent;
}
priv->vaddr = devm_ioremap(dev, mac_dev->res->start,
@@ -711,7 +705,7 @@ static int mac_probe(struct platform_device *_of_dev)
if (!priv->vaddr) {
dev_err(dev, "devm_ioremap() failed\n");
err = -EIO;
- goto _return_dev_set_drvdata;
+ goto _return_of_get_parent;
}
if (!of_device_is_available(mac_node)) {
@@ -728,7 +722,7 @@ static int mac_probe(struct platform_device *_of_dev)
if (err) {
dev_err(dev, "failed to read cell-index for %pOF\n", mac_node);
err = -EINVAL;
- goto _return_dev_set_drvdata;
+ goto _return_of_get_parent;
}
priv->cell_index = (u8)val;
@@ -737,7 +731,7 @@ static int mac_probe(struct platform_device *_of_dev)
if (!mac_addr) {
dev_err(dev, "of_get_mac_address(%pOF) failed\n", mac_node);
err = -EINVAL;
- goto _return_dev_set_drvdata;
+ goto _return_of_get_parent;
}
memcpy(mac_dev->addr, mac_addr, sizeof(mac_dev->addr));
@@ -747,14 +741,14 @@ static int mac_probe(struct platform_device *_of_dev)
dev_err(dev, "of_count_phandle_with_args(%pOF, fsl,fman-ports) failed\n",
mac_node);
err = nph;
- goto _return_dev_set_drvdata;
+ goto _return_of_get_parent;
}
if (nph != ARRAY_SIZE(mac_dev->port)) {
dev_err(dev, "Not supported number of fman-ports handles of mac node %pOF from device tree\n",
mac_node);
err = -EINVAL;
- goto _return_dev_set_drvdata;
+ goto _return_of_get_parent;
}
for (i = 0; i < ARRAY_SIZE(mac_dev->port); i++) {
@@ -818,20 +812,20 @@ static int mac_probe(struct platform_device *_of_dev)
err = of_phy_register_fixed_link(mac_node);
if (err)
- goto _return_dev_set_drvdata;
+ goto _return_of_get_parent;
priv->fixed_link = kzalloc(sizeof(*priv->fixed_link),
GFP_KERNEL);
if (!priv->fixed_link) {
err = -ENOMEM;
- goto _return_dev_set_drvdata;
+ goto _return_of_get_parent;
}
mac_dev->phy_node = of_node_get(mac_node);
phy = of_phy_find_device(mac_dev->phy_node);
if (!phy) {
err = -EINVAL;
- goto _return_dev_set_drvdata;
+ goto _return_of_get_parent;
}
priv->fixed_link->link = phy->link;
@@ -847,7 +841,7 @@ static int mac_probe(struct platform_device *_of_dev)
if (err < 0) {
dev_err(dev, "mac_dev->init() = %d\n", err);
of_node_put(mac_dev->phy_node);
- goto _return_dev_set_drvdata;
+ goto _return_of_get_parent;
}
/* pause frame autonegotiation enabled */
@@ -868,7 +862,7 @@ static int mac_probe(struct platform_device *_of_dev)
mac_dev->addr[0], mac_dev->addr[1], mac_dev->addr[2],
mac_dev->addr[3], mac_dev->addr[4], mac_dev->addr[5]);
- priv->eth_dev = dpaa_eth_add_device(fman_id, mac_dev, mac_node);
+ priv->eth_dev = dpaa_eth_add_device(fman_id, mac_dev);
if (IS_ERR(priv->eth_dev)) {
dev_err(dev, "failed to add Ethernet platform device for MAC %d\n",
priv->cell_index);
@@ -879,9 +873,8 @@ static int mac_probe(struct platform_device *_of_dev)
_return_of_node_put:
of_node_put(dev_node);
-_return_dev_set_drvdata:
+_return_of_get_parent:
kfree(priv->fixed_link);
- dev_set_drvdata(dev, NULL);
_return:
return err;
}
diff --git a/drivers/net/ethernet/freescale/fman/mac.h b/drivers/net/ethernet/freescale/fman/mac.h
index 1ca85a1..eefb335 100644
--- a/drivers/net/ethernet/freescale/fman/mac.h
+++ b/drivers/net/ethernet/freescale/fman/mac.h
@@ -83,7 +83,6 @@ struct mac_device {
};
struct dpaa_eth_data {
- struct device_node *mac_node;
struct mac_device *mac_dev;
int mac_hw_id;
int fman_hw_id;
--
2.1.0
^ permalink raw reply related
* [PATCH 2/4] dpaa_eth: move of_phy_connect() to the eth driver
From: Madalin Bucur @ 2017-10-10 14:10 UTC (permalink / raw)
To: netdev, davem; +Cc: f.fainelli, andrew, vivien.didelot, junote, linux-kernel
In-Reply-To: <1507644618-32006-1-git-send-email-madalin.bucur@nxp.com>
Signed-off-by: Madalin Bucur <madalin.bucur@nxp.com>
---
drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 48 +++++++++++--
drivers/net/ethernet/freescale/fman/mac.c | 97 ++++++--------------------
drivers/net/ethernet/freescale/fman/mac.h | 5 +-
3 files changed, 66 insertions(+), 84 deletions(-)
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
index 4225806..7cf61d6 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
@@ -2435,6 +2435,48 @@ static void dpaa_eth_napi_disable(struct dpaa_priv *priv)
}
}
+static void dpaa_adjust_link(struct net_device *net_dev)
+{
+ struct mac_device *mac_dev;
+ struct dpaa_priv *priv;
+
+ priv = netdev_priv(net_dev);
+ mac_dev = priv->mac_dev;
+ mac_dev->adjust_link(mac_dev);
+}
+
+static int dpaa_phy_init(struct net_device *net_dev)
+{
+ struct mac_device *mac_dev;
+ struct phy_device *phy_dev;
+ struct dpaa_priv *priv;
+
+ priv = netdev_priv(net_dev);
+ mac_dev = priv->mac_dev;
+
+ phy_dev = of_phy_connect(net_dev, mac_dev->phy_node,
+ &dpaa_adjust_link, 0,
+ mac_dev->phy_if);
+ if (!phy_dev) {
+ netif_err(priv, ifup, net_dev, "init_phy() failed\n");
+ return -ENODEV;
+ }
+
+ /* Remove any features not supported by the controller */
+ phy_dev->supported &= mac_dev->if_support;
+
+ /* Enable the symmetric and asymmetric PAUSE frame advertisements,
+ * as most of the PHY drivers do not enable them by default.
+ */
+ phy_dev->supported |= (SUPPORTED_Pause | SUPPORTED_Asym_Pause);
+ phy_dev->advertising = phy_dev->supported;
+
+ mac_dev->phy_dev = phy_dev;
+ net_dev->phydev = phy_dev;
+
+ return 0;
+}
+
static int dpaa_open(struct net_device *net_dev)
{
struct mac_device *mac_dev;
@@ -2445,12 +2487,8 @@ static int dpaa_open(struct net_device *net_dev)
mac_dev = priv->mac_dev;
dpaa_eth_napi_enable(priv);
- net_dev->phydev = mac_dev->init_phy(net_dev, priv->mac_dev);
- if (!net_dev->phydev) {
- netif_err(priv, ifup, net_dev, "init_phy() failed\n");
- err = -ENODEV;
+ if (dpaa_phy_init(net_dev))
goto phy_init_failed;
- }
for (i = 0; i < ARRAY_SIZE(mac_dev->port); i++) {
err = fman_port_enable(mac_dev->port[i]);
diff --git a/drivers/net/ethernet/freescale/fman/mac.c b/drivers/net/ethernet/freescale/fman/mac.c
index 9a265f8..a0a3107 100644
--- a/drivers/net/ethernet/freescale/fman/mac.c
+++ b/drivers/net/ethernet/freescale/fman/mac.c
@@ -57,9 +57,7 @@ struct mac_priv_s {
struct device *dev;
void __iomem *vaddr;
u8 cell_index;
- phy_interface_t phy_if;
struct fman *fman;
- struct device_node *phy_node;
struct device_node *internal_phy_node;
/* List of multicast addresses */
struct list_head mc_addr_list;
@@ -106,7 +104,7 @@ static void set_fman_mac_params(struct mac_device *mac_dev,
resource_size(mac_dev->res));
memcpy(¶ms->addr, mac_dev->addr, sizeof(mac_dev->addr));
params->max_speed = priv->max_speed;
- params->phy_if = priv->phy_if;
+ params->phy_if = mac_dev->phy_if;
params->basex_if = false;
params->mac_id = priv->cell_index;
params->fm = (void *)priv->fman;
@@ -419,15 +417,12 @@ void fman_get_pause_cfg(struct mac_device *mac_dev, bool *rx_pause,
}
EXPORT_SYMBOL(fman_get_pause_cfg);
-static void adjust_link_void(struct net_device *net_dev)
+static void adjust_link_void(struct mac_device *mac_dev)
{
}
-static void adjust_link_dtsec(struct net_device *net_dev)
+static void adjust_link_dtsec(struct mac_device *mac_dev)
{
- struct device *dev = net_dev->dev.parent;
- struct dpaa_eth_data *eth_data = dev->platform_data;
- struct mac_device *mac_dev = eth_data->mac_dev;
struct phy_device *phy_dev = mac_dev->phy_dev;
struct fman_mac *fman_mac;
bool rx_pause, tx_pause;
@@ -444,14 +439,12 @@ static void adjust_link_dtsec(struct net_device *net_dev)
fman_get_pause_cfg(mac_dev, &rx_pause, &tx_pause);
err = fman_set_mac_active_pause(mac_dev, rx_pause, tx_pause);
if (err < 0)
- netdev_err(net_dev, "fman_set_mac_active_pause() = %d\n", err);
+ dev_err(mac_dev->priv->dev, "fman_set_mac_active_pause() = %d\n",
+ err);
}
-static void adjust_link_memac(struct net_device *net_dev)
+static void adjust_link_memac(struct mac_device *mac_dev)
{
- struct device *dev = net_dev->dev.parent;
- struct dpaa_eth_data *eth_data = dev->platform_data;
- struct mac_device *mac_dev = eth_data->mac_dev;
struct phy_device *phy_dev = mac_dev->phy_dev;
struct fman_mac *fman_mac;
bool rx_pause, tx_pause;
@@ -463,60 +456,12 @@ static void adjust_link_memac(struct net_device *net_dev)
fman_get_pause_cfg(mac_dev, &rx_pause, &tx_pause);
err = fman_set_mac_active_pause(mac_dev, rx_pause, tx_pause);
if (err < 0)
- netdev_err(net_dev, "fman_set_mac_active_pause() = %d\n", err);
-}
-
-/* Initializes driver's PHY state, and attaches to the PHY.
- * Returns 0 on success.
- */
-static struct phy_device *init_phy(struct net_device *net_dev,
- struct mac_device *mac_dev,
- void (*adj_lnk)(struct net_device *))
-{
- struct phy_device *phy_dev;
- struct mac_priv_s *priv = mac_dev->priv;
-
- phy_dev = of_phy_connect(net_dev, priv->phy_node, adj_lnk, 0,
- priv->phy_if);
- if (!phy_dev) {
- netdev_err(net_dev, "Could not connect to PHY\n");
- return NULL;
- }
-
- /* Remove any features not supported by the controller */
- phy_dev->supported &= mac_dev->if_support;
- /* Enable the symmetric and asymmetric PAUSE frame advertisements,
- * as most of the PHY drivers do not enable them by default.
- */
- phy_dev->supported |= (SUPPORTED_Pause | SUPPORTED_Asym_Pause);
- phy_dev->advertising = phy_dev->supported;
-
- mac_dev->phy_dev = phy_dev;
-
- return phy_dev;
-}
-
-static struct phy_device *dtsec_init_phy(struct net_device *net_dev,
- struct mac_device *mac_dev)
-{
- return init_phy(net_dev, mac_dev, &adjust_link_dtsec);
-}
-
-static struct phy_device *tgec_init_phy(struct net_device *net_dev,
- struct mac_device *mac_dev)
-{
- return init_phy(net_dev, mac_dev, adjust_link_void);
-}
-
-static struct phy_device *memac_init_phy(struct net_device *net_dev,
- struct mac_device *mac_dev)
-{
- return init_phy(net_dev, mac_dev, &adjust_link_memac);
+ dev_err(mac_dev->priv->dev, "fman_set_mac_active_pause() = %d\n",
+ err);
}
static void setup_dtsec(struct mac_device *mac_dev)
{
- mac_dev->init_phy = dtsec_init_phy;
mac_dev->init = dtsec_initialization;
mac_dev->set_promisc = dtsec_set_promiscuous;
mac_dev->change_addr = dtsec_modify_mac_address;
@@ -528,14 +473,13 @@ static void setup_dtsec(struct mac_device *mac_dev)
mac_dev->set_multi = set_multi;
mac_dev->start = start;
mac_dev->stop = stop;
-
+ mac_dev->adjust_link = adjust_link_dtsec;
mac_dev->priv->enable = dtsec_enable;
mac_dev->priv->disable = dtsec_disable;
}
static void setup_tgec(struct mac_device *mac_dev)
{
- mac_dev->init_phy = tgec_init_phy;
mac_dev->init = tgec_initialization;
mac_dev->set_promisc = tgec_set_promiscuous;
mac_dev->change_addr = tgec_modify_mac_address;
@@ -547,14 +491,13 @@ static void setup_tgec(struct mac_device *mac_dev)
mac_dev->set_multi = set_multi;
mac_dev->start = start;
mac_dev->stop = stop;
-
+ mac_dev->adjust_link = adjust_link_void;
mac_dev->priv->enable = tgec_enable;
mac_dev->priv->disable = tgec_disable;
}
static void setup_memac(struct mac_device *mac_dev)
{
- mac_dev->init_phy = memac_init_phy;
mac_dev->init = memac_initialization;
mac_dev->set_promisc = memac_set_promiscuous;
mac_dev->change_addr = memac_modify_mac_address;
@@ -566,7 +509,7 @@ static void setup_memac(struct mac_device *mac_dev)
mac_dev->set_multi = set_multi;
mac_dev->start = start;
mac_dev->stop = stop;
-
+ mac_dev->adjust_link = adjust_link_memac;
mac_dev->priv->enable = memac_enable;
mac_dev->priv->disable = memac_disable;
}
@@ -850,13 +793,13 @@ static int mac_probe(struct platform_device *_of_dev)
mac_node);
phy_if = PHY_INTERFACE_MODE_SGMII;
}
- priv->phy_if = phy_if;
+ mac_dev->phy_if = phy_if;
- priv->speed = phy2speed[priv->phy_if];
+ priv->speed = phy2speed[mac_dev->phy_if];
priv->max_speed = priv->speed;
mac_dev->if_support = DTSEC_SUPPORTED;
/* We don't support half-duplex in SGMII mode */
- if (priv->phy_if == PHY_INTERFACE_MODE_SGMII)
+ if (mac_dev->phy_if == PHY_INTERFACE_MODE_SGMII)
mac_dev->if_support &= ~(SUPPORTED_10baseT_Half |
SUPPORTED_100baseT_Half);
@@ -865,12 +808,12 @@ static int mac_probe(struct platform_device *_of_dev)
mac_dev->if_support |= SUPPORTED_1000baseT_Full;
/* The 10G interface only supports one mode */
- if (priv->phy_if == PHY_INTERFACE_MODE_XGMII)
+ if (mac_dev->phy_if == PHY_INTERFACE_MODE_XGMII)
mac_dev->if_support = SUPPORTED_10000baseT_Full;
/* Get the rest of the PHY information */
- priv->phy_node = of_parse_phandle(mac_node, "phy-handle", 0);
- if (!priv->phy_node && of_phy_is_fixed_link(mac_node)) {
+ mac_dev->phy_node = of_parse_phandle(mac_node, "phy-handle", 0);
+ if (!mac_dev->phy_node && of_phy_is_fixed_link(mac_node)) {
struct phy_device *phy;
err = of_phy_register_fixed_link(mac_node);
@@ -884,8 +827,8 @@ static int mac_probe(struct platform_device *_of_dev)
goto _return_dev_set_drvdata;
}
- priv->phy_node = of_node_get(mac_node);
- phy = of_phy_find_device(priv->phy_node);
+ mac_dev->phy_node = of_node_get(mac_node);
+ phy = of_phy_find_device(mac_dev->phy_node);
if (!phy) {
err = -EINVAL;
goto _return_dev_set_drvdata;
@@ -903,7 +846,7 @@ static int mac_probe(struct platform_device *_of_dev)
err = mac_dev->init(mac_dev);
if (err < 0) {
dev_err(dev, "mac_dev->init() = %d\n", err);
- of_node_put(priv->phy_node);
+ of_node_put(mac_dev->phy_node);
goto _return_dev_set_drvdata;
}
diff --git a/drivers/net/ethernet/freescale/fman/mac.h b/drivers/net/ethernet/freescale/fman/mac.h
index d7313f0..1ca85a1 100644
--- a/drivers/net/ethernet/freescale/fman/mac.h
+++ b/drivers/net/ethernet/freescale/fman/mac.h
@@ -50,6 +50,8 @@ struct mac_device {
struct fman_port *port[2];
u32 if_support;
struct phy_device *phy_dev;
+ phy_interface_t phy_if;
+ struct device_node *phy_node;
bool autoneg_pause;
bool rx_pause_req;
@@ -58,11 +60,10 @@ struct mac_device {
bool tx_pause_active;
bool promisc;
- struct phy_device *(*init_phy)(struct net_device *net_dev,
- struct mac_device *mac_dev);
int (*init)(struct mac_device *mac_dev);
int (*start)(struct mac_device *mac_dev);
int (*stop)(struct mac_device *mac_dev);
+ void (*adjust_link)(struct mac_device *mac_dev);
int (*set_promisc)(struct fman_mac *mac_dev, bool enable);
int (*change_addr)(struct fman_mac *mac_dev, enet_addr_t *enet_addr);
int (*set_multi)(struct net_device *net_dev,
--
2.1.0
^ permalink raw reply related
* [PATCH 1/4] fsl/fman: remove of_node
From: Madalin Bucur @ 2017-10-10 14:10 UTC (permalink / raw)
To: netdev, davem; +Cc: f.fainelli, andrew, vivien.didelot, junote, linux-kernel
In-Reply-To: <1507644618-32006-1-git-send-email-madalin.bucur@nxp.com>
The FMan MAC driver allocates a platform device for the Ethernet
driver to probe on. Setting pdev->dev.of_node with the MAC node
triggers the MAC driver probing of the new platform device. While
this fails quickly and does not affect the functionality of the
drivers, it is incorrect and must be removed. This was added to
address a report that DSA code using of_find_net_device_by_node()
is unable to use the DPAA interfaces. Error message seen before
this fix:
fsl_mac dpaa-ethernet.0: __devm_request_mem_region(mac) failed
fsl_mac: probe of dpaa-ethernet.0 failed with error -16
Signed-off-by: Madalin Bucur <madalin.bucur@nxp.com>
---
drivers/net/ethernet/freescale/fman/mac.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/drivers/net/ethernet/freescale/fman/mac.c b/drivers/net/ethernet/freescale/fman/mac.c
index 387eb4a..9a265f8 100644
--- a/drivers/net/ethernet/freescale/fman/mac.c
+++ b/drivers/net/ethernet/freescale/fman/mac.c
@@ -623,7 +623,6 @@ static struct platform_device *dpaa_eth_add_device(int fman_id,
goto no_mem;
}
- pdev->dev.of_node = node;
pdev->dev.parent = priv->dev;
set_dma_ops(&pdev->dev, get_dma_ops(priv->dev));
--
2.1.0
^ permalink raw reply related
* [PATCH 0/4] adapt DPAA drivers for DSA
From: Madalin Bucur @ 2017-10-10 14:10 UTC (permalink / raw)
To: netdev, davem; +Cc: f.fainelli, andrew, vivien.didelot, junote, linux-kernel
Junote Cai reported that he was not able to get a DSA setup involving the
DPAA/FMAN driver to work and narrowed it down to of_find_net_device_by_node()
call in DSA setup. The initial attempt to fix this by adding of_node to the
platform device results in a second, failed, probing of the FMan MAC driver
against the new platform device created for the DPAA Ethernet driver.
Solve these issues by removing the of_node pointer from the platform device
and changing the net_dev dev to the of_device dev to ensure the DSA init
will be able to find the DPAA net_dev using of_find_net_device_by_node().
Several changes were required to enable this solution: refactoring the
adjust_link (also resulted in lesser, cleaner code) and renaming the fman
kernel modules to keep the legacy udev rules happy.
Madalin Bucur (4):
fsl/fman: remove of_node
dpaa_eth: move of_phy_connect() to the eth driver
dpaa_eth: change device used
fsl/fman: add dpaa in module names
drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 89 +++++++++-------
drivers/net/ethernet/freescale/fman/Makefile | 12 +--
drivers/net/ethernet/freescale/fman/mac.c | 135 +++++++------------------
drivers/net/ethernet/freescale/fman/mac.h | 6 +-
4 files changed, 99 insertions(+), 143 deletions(-)
--
2.1.0
^ permalink raw reply
* Re: High CPU load by native_queued_spin_lock_slowpath
From: Eric Dumazet @ 2017-10-10 14:07 UTC (permalink / raw)
To: Sergey K.; +Cc: netdev
In-Reply-To: <CAGv8E+xZOweXEedymRuLt4idQmCM8Mk1TZOM=0p3YmnXbh-n+g@mail.gmail.com>
On Tue, 2017-10-10 at 18:00 +0600, Sergey K. wrote:
> I'm using Debian 9(stretch edition) kernel 4.9., hp dl385 g7 server
> with 32 cpu cores. NIC queues are tied to processor cores. Server is
> shaping traffic (iproute2 and htb discipline + skbinfo + ipset + ifb)
> and filtering some rules by iptables.
>
> At that moment, when traffic goes up about 1gbit/s cpu is very high
> loaded. Perf tool tells me that kernel module
> native_queued_spin_lock_slowpath loading cpu about 40%.
>
> After several hours of searching, I found that if I remove the htb
> discipline from ifb0, the high load goes down.
> Well, I think that problem with classify and shaping by htb.
>
> Who knows how to solve?
You use a single ifb0 on the whole (multiqueue) device for ingress ?
What about multiple ifb instead, one per RX queue ?
Alternative is to reduce contention and use a single RX queue.
^ permalink raw reply
* RE: [PATCH] fsl/fman: remove of_node
From: Madalin-cristian Bucur @ 2017-10-10 14:01 UTC (permalink / raw)
To: Madalin-cristian Bucur, David Miller
Cc: netdev@vger.kernel.org, andrew@lunn.ch, f.fainelli@gmail.com,
linux-kernel@vger.kernel.org, junote@outlook.com
In-Reply-To: <AM5PR0402MB2691AA6167F9C9A68C058797EC730@AM5PR0402MB2691.eurprd04.prod.outlook.com>
> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org]
> On Behalf Of Madalin-cristian Bucur
> Sent: Wednesday, October 04, 2017 12:54 PM
> To: David Miller <davem@davemloft.net>
> Subject: RE: [PATCH] fsl/fman: remove of_node
>
> > -----Original Message-----
> > From: David Miller [mailto:davem@davemloft.net]
> > Sent: Wednesday, October 04, 2017 7:44 AM
> > To: Madalin-cristian Bucur <madalin.bucur@nxp.com>
> > Cc: netdev@vger.kernel.org; andrew@lunn.ch; f.fainelli@gmail.com; linux-
> > kernel@vger.kernel.org
> > Subject: Re: [PATCH] fsl/fman: remove of_node
> >
> > From: Madalin-cristian Bucur <madalin.bucur@nxp.com>
> > Date: Tue, 3 Oct 2017 08:49:31 +0000
> >
> > > My patch removes the of_node that was set to a device that was not an
> > > of_device, preventing duplicated probing of both the real of_device
> > > and the "fake" one created through this assignment.
> > >
> > > I understand that the DSA issue that triggered the initial change
> > > was related to DSA finding the network devices using
> > > of_find_net_device_by_node(), something that will not work for the
> > > DPAA case where the netdevice does not have an of_node. I do not know
> > > enough about DSA to come up with a solution for this problem now.
> > > Andrew, Florian, can you please comment on this?
> >
> > It sounds like you're knowingly breaking DSA.
>
> It never worked, even with the change I'm reverting.
I'll resend this change as part of a patch set that changes the device
used as net_dev dev to ensure DSA will find a of_device there. To make
that work some changes to adjust link (that also make it cleaner) were
needed. Also, to keep the old udev rules happy, I've changed the names
of the FMan kernel modules from fsl_fman_* to fsl_dpaa_fman*.
I do not have a DSA setup to test so I just tested the part related to
of_find_net_device_by_node() being able to determine the net_device
based on a device tree handle using an artificial device tree and code
construct. I hope that will help the initial reporter of the DSA issue
on DPAA (Junote Cai).
Madalin
^ permalink raw reply
* Re: [PATCH v2] XDP Program for Ip forward
From: Jesper Dangaard Brouer @ 2017-10-10 14:00 UTC (permalink / raw)
To: Christina Jacob
Cc: netdev, linux-kernel, linux-arm-kernel, Sunil.Goutham, daniel,
dsahern, Christina Jacob, brouer
In-Reply-To: <20171010151231.69fde82f@redhat.com>
On Tue, 10 Oct 2017 15:12:31 +0200
Jesper Dangaard Brouer <brouer@redhat.com> wrote:
> I'll try to test/benchmark your program...
In my initial testing, I cannot get this to work...
You do seem to XDP_REDIRECT out the right interface, but you have an
error with setting the correct MAC address.
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
LinkedIn: http://www.linkedin.com/in/brouer
^ permalink raw reply
* Re: [PATCH net-next RFC 1/9] net: dsa: mv88e6xxx: add accessors for PTP/TAI registers
From: Vivien Didelot @ 2017-10-10 13:59 UTC (permalink / raw)
To: Brandon Streiff, netdev
In-Reply-To: <87o9puaisb.fsf@weeman.i-did-not-set--mail-host-address--so-tickle-me>
Hi Brandon,
Vivien Didelot <vivien.didelot@savoirfairelinux.com> writes:
> Brandon Streiff <brandon.streiff@ni.com> writes:
>
>> + .port_ptp_read = mv88e6352_port_ptp_read,
>> + .port_ptp_write = mv88e6352_port_ptp_write,
>> + .ptp_read = mv88e6352_ptp_read,
>> + .ptp_write = mv88e6352_ptp_write,
>> + .tai_read = mv88e6352_tai_read,
>> + .tai_write = mv88e6352_tai_write,
>
>> + .port_ptp_read = mv88e6390_port_ptp_read,
>> + .port_ptp_write = mv88e6390_port_ptp_write,
>> + .ptp_read = mv88e6390_ptp_read,
>> + .ptp_write = mv88e6390_ptp_write,
>> + .tai_read = mv88e6390_tai_read,
>> + .tai_write = mv88e6390_tai_write,
>
> Only nitpick: please keep the mv88e63{52,90}_g2_avb_ prefix here.
>
> Otherwise thanks for respecting the code organization, very clear patch:
>
> Reviewed-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Also feel free to move the mv88e6*_g2_avb_ functions into a
global2_avb.c file.
Thank you,
Vivien
^ permalink raw reply
* usb/net/asix: null-ptr-deref in asix_suspend
From: Andrey Konovalov @ 2017-10-10 13:36 UTC (permalink / raw)
To: David S. Miller, Colin Ian King, Philippe Reynes, allan,
Dean Jenkins, Greg Ungerer, Jarod Wilson, Peter Chen, USB list,
netdev, LKML
Cc: Dmitry Vyukov, Kostya Serebryany, syzkaller
Hi!
I've got the following report while fuzzing the kernel with syzkaller.
On commit 8a5776a5f49812d29fe4b2d0a2d71675c3facf3f (4.14-rc4).
It seems that priv ends up being NULL.
usb 1-1: New USB device found, idVendor=0557, idProduct=2009
usb 1-1: New USB device strings: Mfr=204, Product=1, SerialNumber=0
usb 1-1: Product: a
usb 1-1: Manufacturer: a
gadgetfs: configuration #4
hub 1-1:4.2: bad descriptor, ignoring hub
hub: probe of 1-1:4.2 failed with error -5
asix 1-1:4.2 (unnamed net_device) (uninitialized): Failed to read reg
index 0x0000: -75
asix 1-1:4.2 eth1: register 'asix' at usb-dummy_hcd.0-1, ASIX AX8817x
USB 2.0 Ethernet, 08:d1:8e:63:00:88
asix 1-1:4.185 eth2: register 'asix' at usb-dummy_hcd.0-1, ASIX
AX8817x USB 2.0 Ethernet, 08:8f:0a:63:00:88
kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] PREEMPT SMP KASAN
Modules linked in:
CPU: 0 PID: 24 Comm: kworker/0:1 Not tainted 4.14.0-rc4-43422-geccacdd69a8c #400
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
Workqueue: usb_hub_wq hub_event
task: ffff88006bb36300 task.stack: ffff88006bba8000
RIP: 0010:asix_suspend+0x76/0xc0 drivers/net/usb/asix_devices.c:629
RSP: 0018:ffff88006bbae718 EFLAGS: 00010202
RAX: dffffc0000000000 RBX: ffff880061ba3b80 RCX: 1ffff1000c34d644
RDX: 0000000000000001 RSI: 0000000000000402 RDI: 0000000000000008
RBP: ffff88006bbae738 R08: 1ffff1000d775cad R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800630a8b40
R13: 0000000000000000 R14: 0000000000000402 R15: ffff880061ba3b80
FS: 0000000000000000(0000) GS:ffff88006c600000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007ff33cf89000 CR3: 0000000061c0a000 CR4: 00000000000006f0
Call Trace:
usb_suspend_interface drivers/usb/core/driver.c:1209
usb_suspend_both+0x27f/0x7e0 drivers/usb/core/driver.c:1314
usb_runtime_suspend+0x41/0x120 drivers/usb/core/driver.c:1852
__rpm_callback+0x339/0xb60 drivers/base/power/runtime.c:334
rpm_callback+0x106/0x220 drivers/base/power/runtime.c:461
rpm_suspend+0x465/0x1980 drivers/base/power/runtime.c:596
__pm_runtime_suspend+0x11e/0x230 drivers/base/power/runtime.c:1009
pm_runtime_put_sync_autosuspend ./include/linux/pm_runtime.h:251
usb_new_device+0xa37/0x1020 drivers/usb/core/hub.c:2487
hub_port_connect drivers/usb/core/hub.c:4903
hub_port_connect_change drivers/usb/core/hub.c:5009
port_event drivers/usb/core/hub.c:5115
hub_event+0x194d/0x3740 drivers/usb/core/hub.c:5195
process_one_work+0xc7f/0x1db0 kernel/workqueue.c:2119
worker_thread+0x221/0x1850 kernel/workqueue.c:2253
kthread+0x3a1/0x470 kernel/kthread.c:231
ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:431
Code: 8d 7c 24 20 48 89 fa 48 c1 ea 03 80 3c 02 00 75 5b 48 b8 00 00
00 00 00 fc ff df 4d 8b 6c 24 20 49 8d 7d 08 48 89 fa 48 c1 ea 03 <80>
3c 02 00 75 34 4d 8b 6d 08 4d 85 ed 74 0b e8 26 2b 51 fd 4c
RIP: asix_suspend+0x76/0xc0 RSP: ffff88006bbae718
---[ end trace dfc4f5649284342c ]---
^ permalink raw reply
* RE: [patch net-next 2/4] net: sched: introduce per-egress action device callbacks
From: David Laight @ 2017-10-10 13:31 UTC (permalink / raw)
To: 'Jiri Pirko', netdev@vger.kernel.org
Cc: davem@davemloft.net, jhs@mojatatu.com, xiyou.wangcong@gmail.com,
saeedm@mellanox.com, matanb@mellanox.com, leonro@mellanox.com,
mlxsw@mellanox.com
In-Reply-To: <20171010073016.3682-3-jiri@resnulli.us>
From: Jiri Pirko
> Sent: 10 October 2017 08:30
> Introduce infrastructure that allows drivers to register callbacks that
> are called whenever tc would offload inserted rule and specified device
> acts as tc action egress device.
How does a driver safely unregister a callback?
(to avoid a race with the callback being called.)
Usually this requires a callback in the context that makes the
notification callbacks indicating that no more such callbacks
will be made.
David
^ permalink raw reply
* Re: [PATCH 00/12] radix-tree: split out struct radix_tree_root out to <linux/radix-tree-root.h>
From: Matthew Wilcox @ 2017-10-10 13:20 UTC (permalink / raw)
To: Masahiro Yamada
Cc: Linux Kernel Mailing List, Thomas Gleixner, Andrew Morton,
Ian Abbott, Ingo Molnar, Linus Torvalds, linux-cachefs, linux-sh,
Rodrigo Vivi, dri-devel, David Airlie, linux-rdma, Yoshinori Sato,
Tariq Toukan, Rich Felker, Leon Romanovsky, Jani Nikula,
J. Bruce Fields, David Howells, intel-gfx, Yishai
In-Reply-To: <CAK7LNAQ_SVA1aYYgjMFqJG5OzjYa1EbNDGdrCAOOen_dABX30g@mail.gmail.com>
On Tue, Oct 10, 2017 at 09:56:22PM +0900, Masahiro Yamada wrote:
> One refactoring alone does not produce much benefits,
> but making continuous efforts will disentangle the knotted threads.
> Of course, this might be a pipe dream...
A lot of people have had that dream, and some of those refactoring
efforts have proven worthwhile. But it's not a dream without costs;
your refactoring will conflict with other changes. I don't think the
benefit here is high enough to pursue this edition of the dream.
^ permalink raw reply
* Re: [PATCH v2] XDP Program for Ip forward
From: Jesper Dangaard Brouer @ 2017-10-10 13:12 UTC (permalink / raw)
To: Christina Jacob
Cc: netdev, linux-kernel, linux-arm-kernel, Sunil.Goutham, daniel,
dsahern, Christina Jacob, brouer
In-Reply-To: <1507620532-25804-1-git-send-email-Christina.Jacob@cavium.com>
On Tue, 10 Oct 2017 12:58:51 +0530 Christina Jacob <christina.jacob.koikara@gmail.com> wrote:
> The patch below implements port to port forwarding through route table and arp
> table lookup for ipv4 packets using bpf_redirect helper function and lpm_trie
> map. This has an improved performance over the normal kernel stack ip forward.
>
> Implementation details.
> -----------------------
[...]
>
> In the xdp3_user.c,
>
[...]
>
> In the xdp3_kern.c,
You forgot to update the program name in the cover letter.
> The array map for the 32 bit mask entries checked to see if there is a key that
> exactly matches with the destination ip. If it has a non zero destination mac
> entry then the xdp data is updated accordingly Otherwise a proper route and
> arp table lookup is done using the lpm_trie and the arp table array map.
>
> Usage: as ./xdp3 -S <ifindex1...ifindexn> (-S for
^^^^^
The executable name also changed.
> generic xdp implementation ifindex- the index of the interface to which
> the xdp program has to be attached.) in 4.14-rc3 kernel.
>
> Changes from v1 to v2
> ---------------------
>
> * As suggested by Jesper Dangaard Brouer
> 1. Changed the program name to list xdp_router_ipv4
Thanks
> 2. Changed the commandline arguments from ifindex list to interface name
> Usage : ./xdp_router_ipv4 [-S] <interface name list>
> -S for generic xdp implementation
> -interface name list is the list of interfaces to which
> the xdp program should attach to
Okay, you choose a slightly different way of implementing this, but it
shouldn't matter.
I'll try to test/benchmark your program...
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
LinkedIn: http://www.linkedin.com/in/brouer
^ permalink raw reply
* RE: [net-next 05/14] i40e/i40evf: always set the CLEARPBA flag when re-enabling interrupts
From: David Laight @ 2017-10-10 13:07 UTC (permalink / raw)
To: 'Jeff Kirsher', davem@davemloft.net
Cc: Jacob Keller, netdev@vger.kernel.org, nhorman@redhat.com,
sassmann@redhat.com, jogreene@redhat.com
In-Reply-To: <20171009223841.2557-6-jeffrey.t.kirsher@intel.com>
From: Jeff Kirsher
> Sent: 09 October 2017 23:39
> In the past we changed driver behavior to not clear the PBA when
> re-enabling interrupts. This change was motivated by the flawed belief
> that clearing the PBA would cause a lost interrupt if a receive
> interrupt occurred while interrupts were disabled.
>
> According to empirical testing this isn't the case. Additionally, the
> data sheet specifically says that we should set the CLEARPBA bit when
> re-enabling interrupts in a polling setup.
I presume this if the MSI-X Pending Bit Array?
Normally this bit is cleared when the interrupt is actioned.
If request the device clear the PBA then it (probably) won't
raise an interrupt when it is unmasked (by clearing the 'masked' bit).
If you've just checked all the rings (with the interrupt masked)
and you clear the PBA bit when you unmask interrupts then you will
need to do another scan of the rings to pick up any packets that
arrived (or tried to signal an interrupt) in that small gap.
'Empirical testing' probably won't hit the timing window.
David
^ permalink raw reply
* Re: [net-next 2/3] ip_gre: fix erspan tunnel mtu calculation
From: William Tu @ 2017-10-10 12:59 UTC (permalink / raw)
To: Xin Long; +Cc: therbert, davem, Linux Kernel Network Developers
In-Reply-To: <CADvbK_eu=FJ5H3cf399UNBqAAq8bpRaDODwHHrNPkq47RXkzsA@mail.gmail.com>
>> @@ -1242,14 +1241,14 @@ static int erspan_tunnel_init(struct net_device *dev)
>> struct ip_tunnel *tunnel = netdev_priv(dev);
>> int t_hlen;
>>
>> - tunnel->tun_hlen = 8;
>> + tunnel->tun_hlen = ERSPAN_GREHDR_LEN;
>> tunnel->parms.iph.protocol = IPPROTO_GRE;
>> tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
>> sizeof(struct erspanhdr);
>> t_hlen = tunnel->hlen + sizeof(struct iphdr);
>>
>> - dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4;
>> - dev->mtu = ETH_DATA_LEN - t_hlen - 4;
>> + dev->needed_headroom = LL_MAX_HEADER + t_hlen;
>> + dev->mtu = ETH_DATA_LEN - t_hlen;
> 1. I guess '+4-4' stuff was copied from __gre_tunnel_init(), I'm thinking
> it may be there for some reason.
>
I traced back to
4565e9919cda ("gre: Setup and TX path for gre/UDP foo-over-udp encapsulation")
and I think '+4-4' is there for GRE base header length.
Since now we do
dev->mtu = ETH_DATA_LEN - t_hlen;
and t_hlen already counts the the gre base header + optional header
len, I think it's not needed.
> 2. 'dev->needed_headroom =' and 'dev->mtu =' are really needed ?
> As I've seen both will be updated in .newlink:
> ipgre_newlink() -> ip_tunnel_newlink() -> ip_tunnel_bind_dev()
>
right, I also find both values gets overwritten by
ip_tunnel_bind_dev() using my test cases. Maybe we can remove them?
Thanks
William
^ permalink raw reply
* Re: [PATCH 00/12] radix-tree: split out struct radix_tree_root out to <linux/radix-tree-root.h>
From: Masahiro Yamada @ 2017-10-10 12:56 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Linux Kernel Mailing List, Thomas Gleixner, Andrew Morton,
Ian Abbott, Ingo Molnar, Linus Torvalds, linux-cachefs, linux-sh,
Rodrigo Vivi, dri-devel, David Airlie, linux-rdma, Yoshinori Sato,
Tariq Toukan, Rich Felker, Leon Romanovsky, Jani Nikula,
J. Bruce Fields, David Howells, intel-gfx, Yishai
In-Reply-To: <20171010121829.GA26493@bombadil.infradead.org>
2017-10-10 21:18 GMT+09:00 Matthew Wilcox <willy@infradead.org>:
> On Mon, Oct 09, 2017 at 01:10:01AM +0900, Masahiro Yamada wrote:
>> Reducing the header dependency will help for speeding the kernel
>> build, suppressing unnecessary recompile of objects during
>> git-bisect'ing, etc.
>
> Well, does it? You could provide measurements showing before/after
> time to compile, or time to recompile after touching a header file that
> is included by radix-tree.h and not by radix-tree-root.h.
>
> Look at the files included (never mind the transitively included files):
>
> #include <linux/bitops.h>
> #include <linux/bug.h>
> #include <linux/kernel.h>
> #include <linux/list.h>
> #include <linux/preempt.h>
> #include <linux/rcupdate.h>
> #include <linux/spinlock.h>
> #include <linux/types.h>
>
> These are not exactly rare files to be included. My guess is that most
> of the files in the kernel end up depending on these files *anyway*, either
> directly or through some path that isn't the radix tree.
Good question.
I tested this series, and I confirmed
the total number of included headers decreased,
but did not decrease as much as I had expected.
The statement "most of the files in the kernel end
up depending on these files" is true.
But, with that excuse,
I do not want to conclude this kind of refactoring is pointless.
For example, how can we explain
commit bc6245e5efd70c41eaf9334b1b5e646745cb0fb3 ?
<linux/bug.h> includes the following three.
#include <asm/bug.h>
#include <linux/compiler.h>
#include <linux/build_bug.h>
Your statement applies to them too.
Actually, I did not see any impact
by replacing <linux/bug.h> in my files with <linux/build_bug.h>
Generally, people do not pay much attention
in decreasing header dependency.
One refactoring alone does not produce much benefits,
but making continuous efforts will disentangle the knotted threads.
Of course, this might be a pipe dream...
--
Best Regards
Masahiro Yamada
^ permalink raw reply
* [PATCH v2 net-next 2/2] net: dsa: lan9303: Add basic offloading of unicast traffic
From: Egil Hjelmeland @ 2017-10-10 12:49 UTC (permalink / raw)
To: andrew, vivien.didelot, f.fainelli, netdev, linux-kernel; +Cc: Egil Hjelmeland
In-Reply-To: <20171010124953.386-1-privat@egil-hjelmeland.no>
When both user ports are joined to the same bridge, the normal
HW MAC learning is enabled. This means that unicast traffic is forwarded
in HW.
If one of the user ports leave the bridge,
the ports goes back to the initial separated operation.
Port separation relies on disabled HW MAC learning. Hence the condition
that both ports must join same bridge.
Add brigde methods port_bridge_join, port_bridge_leave and
port_stp_state_set.
Signed-off-by: Egil Hjelmeland <privat@egil-hjelmeland.no>
---
drivers/net/dsa/lan9303-core.c | 82 ++++++++++++++++++++++++++++++++++++++++++
drivers/net/dsa/lan9303.h | 2 ++
2 files changed, 84 insertions(+)
diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c
index 2215ec1fbe1e..fecfe1fe67ea 100644
--- a/drivers/net/dsa/lan9303-core.c
+++ b/drivers/net/dsa/lan9303-core.c
@@ -18,6 +18,7 @@
#include <linux/mutex.h>
#include <linux/mii.h>
#include <linux/phy.h>
+#include <linux/if_bridge.h>
#include "lan9303.h"
@@ -146,6 +147,7 @@
# define LAN9303_SWE_PORT_STATE_FORWARDING_PORT0 (0)
# define LAN9303_SWE_PORT_STATE_LEARNING_PORT0 BIT(1)
# define LAN9303_SWE_PORT_STATE_BLOCKING_PORT0 BIT(0)
+# define LAN9303_SWE_PORT_STATE_DISABLED_PORT0 (3)
#define LAN9303_SWE_PORT_MIRROR 0x1846
# define LAN9303_SWE_PORT_MIRROR_SNIFF_ALL BIT(8)
# define LAN9303_SWE_PORT_MIRROR_SNIFFER_PORT2 BIT(7)
@@ -156,6 +158,7 @@
# define LAN9303_SWE_PORT_MIRROR_MIRRORED_PORT0 BIT(2)
# define LAN9303_SWE_PORT_MIRROR_ENABLE_RX_MIRRORING BIT(1)
# define LAN9303_SWE_PORT_MIRROR_ENABLE_TX_MIRRORING BIT(0)
+# define LAN9303_SWE_PORT_MIRROR_DISABLED 0
#define LAN9303_SWE_INGRESS_PORT_TYPE 0x1847
#define LAN9303_SWE_INGRESS_PORT_TYPE_VLAN 3
#define LAN9303_BM_CFG 0x1c00
@@ -556,6 +559,16 @@ static int lan9303_separate_ports(struct lan9303 *chip)
LAN9303_SWE_PORT_STATE_BLOCKING_PORT2);
}
+static void lan9303_bridge_ports(struct lan9303 *chip)
+{
+ /* ports bridged: remove mirroring */
+ lan9303_write_switch_reg(chip, LAN9303_SWE_PORT_MIRROR,
+ LAN9303_SWE_PORT_MIRROR_DISABLED);
+
+ lan9303_write_switch_reg(chip, LAN9303_SWE_PORT_STATE,
+ chip->swe_port_state);
+}
+
static int lan9303_handle_reset(struct lan9303 *chip)
{
if (!chip->reset_gpio)
@@ -844,6 +857,72 @@ static void lan9303_port_disable(struct dsa_switch *ds, int port,
}
}
+static int lan9303_port_bridge_join(struct dsa_switch *ds, int port,
+ struct net_device *br)
+{
+ struct lan9303 *chip = ds->priv;
+
+ dev_dbg(chip->dev, "%s(port %d)\n", __func__, port);
+ if (ds->ports[1].bridge_dev == ds->ports[2].bridge_dev) {
+ lan9303_bridge_ports(chip);
+ chip->is_bridged = true; /* unleash stp_state_set() */
+ }
+
+ return 0;
+}
+
+static void lan9303_port_bridge_leave(struct dsa_switch *ds, int port,
+ struct net_device *br)
+{
+ struct lan9303 *chip = ds->priv;
+
+ dev_dbg(chip->dev, "%s(port %d)\n", __func__, port);
+ if (chip->is_bridged) {
+ lan9303_separate_ports(chip);
+ chip->is_bridged = false;
+ }
+}
+
+static void lan9303_port_stp_state_set(struct dsa_switch *ds, int port,
+ u8 state)
+{
+ int portmask, portstate;
+ struct lan9303 *chip = ds->priv;
+
+ dev_dbg(chip->dev, "%s(port %d, state %d)\n",
+ __func__, port, state);
+
+ switch (state) {
+ case BR_STATE_DISABLED:
+ portstate = LAN9303_SWE_PORT_STATE_DISABLED_PORT0;
+ break;
+ case BR_STATE_BLOCKING:
+ case BR_STATE_LISTENING:
+ portstate = LAN9303_SWE_PORT_STATE_BLOCKING_PORT0;
+ break;
+ case BR_STATE_LEARNING:
+ portstate = LAN9303_SWE_PORT_STATE_LEARNING_PORT0;
+ break;
+ case BR_STATE_FORWARDING:
+ portstate = LAN9303_SWE_PORT_STATE_FORWARDING_PORT0;
+ break;
+ default:
+ portstate = LAN9303_SWE_PORT_STATE_DISABLED_PORT0;
+ dev_err(chip->dev, "unknown stp state: port %d, state %d\n",
+ port, state);
+ }
+
+ portmask = 0x3 << (port * 2);
+ portstate <<= (port * 2);
+
+ chip->swe_port_state = (chip->swe_port_state & ~portmask) | portstate;
+
+ if (chip->is_bridged)
+ lan9303_write_switch_reg(chip, LAN9303_SWE_PORT_STATE,
+ chip->swe_port_state);
+ /* else: touching SWE_PORT_STATE would break port separation */
+}
+
static const struct dsa_switch_ops lan9303_switch_ops = {
.get_tag_protocol = lan9303_get_tag_protocol,
.setup = lan9303_setup,
@@ -855,6 +934,9 @@ static const struct dsa_switch_ops lan9303_switch_ops = {
.get_sset_count = lan9303_get_sset_count,
.port_enable = lan9303_port_enable,
.port_disable = lan9303_port_disable,
+ .port_bridge_join = lan9303_port_bridge_join,
+ .port_bridge_leave = lan9303_port_bridge_leave,
+ .port_stp_state_set = lan9303_port_stp_state_set,
};
static int lan9303_register_switch(struct lan9303 *chip)
diff --git a/drivers/net/dsa/lan9303.h b/drivers/net/dsa/lan9303.h
index 4d8be555ff4d..68ecd544b658 100644
--- a/drivers/net/dsa/lan9303.h
+++ b/drivers/net/dsa/lan9303.h
@@ -21,6 +21,8 @@ struct lan9303 {
struct dsa_switch *ds;
struct mutex indirect_mutex; /* protect indexed register access */
const struct lan9303_phy_ops *ops;
+ bool is_bridged; /* true if port 1 and 2 are bridged */
+ u32 swe_port_state; /* remember SWE_PORT_STATE while not bridged */
};
extern const struct regmap_access_table lan9303_register_set;
--
2.11.0
^ permalink raw reply related
* [PATCH v2 net-next 1/2] net: dsa: lan9303: Move tag setup to new lan9303_setup_tagging
From: Egil Hjelmeland @ 2017-10-10 12:49 UTC (permalink / raw)
To: andrew, vivien.didelot, f.fainelli, netdev, linux-kernel; +Cc: Egil Hjelmeland
In-Reply-To: <20171010124953.386-1-privat@egil-hjelmeland.no>
Prepare for next patch:
Move tag setup from lan9303_separate_ports() to new function
lan9303_setup_tagging()
Signed-off-by: Egil Hjelmeland <privat@egil-hjelmeland.no>
---
drivers/net/dsa/lan9303-core.c | 42 +++++++++++++++++++++++++-----------------
1 file changed, 25 insertions(+), 17 deletions(-)
diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c
index 07355db2ad81..2215ec1fbe1e 100644
--- a/drivers/net/dsa/lan9303-core.c
+++ b/drivers/net/dsa/lan9303-core.c
@@ -157,6 +157,7 @@
# define LAN9303_SWE_PORT_MIRROR_ENABLE_RX_MIRRORING BIT(1)
# define LAN9303_SWE_PORT_MIRROR_ENABLE_TX_MIRRORING BIT(0)
#define LAN9303_SWE_INGRESS_PORT_TYPE 0x1847
+#define LAN9303_SWE_INGRESS_PORT_TYPE_VLAN 3
#define LAN9303_BM_CFG 0x1c00
#define LAN9303_BM_EGRSS_PORT_TYPE 0x1c0c
# define LAN9303_BM_EGRSS_PORT_TYPE_SPECIAL_TAG_PORT2 (BIT(17) | BIT(16))
@@ -510,11 +511,30 @@ static int lan9303_enable_processing_port(struct lan9303 *chip,
LAN9303_MAC_TX_CFG_X_TX_ENABLE);
}
+/* forward special tagged packets from port 0 to port 1 *or* port 2 */
+static int lan9303_setup_tagging(struct lan9303 *chip)
+{
+ int ret;
+ u32 val;
+ /* enable defining the destination port via special VLAN tagging
+ * for port 0
+ */
+ ret = lan9303_write_switch_reg(chip, LAN9303_SWE_INGRESS_PORT_TYPE,
+ LAN9303_SWE_INGRESS_PORT_TYPE_VLAN);
+ if (ret)
+ return ret;
+
+ /* tag incoming packets at port 1 and 2 on their way to port 0 to be
+ * able to discover their source port
+ */
+ val = LAN9303_BM_EGRSS_PORT_TYPE_SPECIAL_TAG_PORT0;
+ return lan9303_write_switch_reg(chip, LAN9303_BM_EGRSS_PORT_TYPE, val);
+}
+
/* We want a special working switch:
* - do not forward packets between port 1 and 2
* - forward everything from port 1 to port 0
* - forward everything from port 2 to port 0
- * - forward special tagged packets from port 0 to port 1 *or* port 2
*/
static int lan9303_separate_ports(struct lan9303 *chip)
{
@@ -529,22 +549,6 @@ static int lan9303_separate_ports(struct lan9303 *chip)
if (ret)
return ret;
- /* enable defining the destination port via special VLAN tagging
- * for port 0
- */
- ret = lan9303_write_switch_reg(chip, LAN9303_SWE_INGRESS_PORT_TYPE,
- 0x03);
- if (ret)
- return ret;
-
- /* tag incoming packets at port 1 and 2 on their way to port 0 to be
- * able to discover their source port
- */
- ret = lan9303_write_switch_reg(chip, LAN9303_BM_EGRSS_PORT_TYPE,
- LAN9303_BM_EGRSS_PORT_TYPE_SPECIAL_TAG_PORT0);
- if (ret)
- return ret;
-
/* prevent port 1 and 2 from forwarding packets by their own */
return lan9303_write_switch_reg(chip, LAN9303_SWE_PORT_STATE,
LAN9303_SWE_PORT_STATE_FORWARDING_PORT0 |
@@ -644,6 +648,10 @@ static int lan9303_setup(struct dsa_switch *ds)
return -EINVAL;
}
+ ret = lan9303_setup_tagging(chip);
+ if (ret)
+ dev_err(chip->dev, "failed to setup port tagging %d\n", ret);
+
ret = lan9303_separate_ports(chip);
if (ret)
dev_err(chip->dev, "failed to separate ports %d\n", ret);
--
2.11.0
^ permalink raw reply related
* [PATCH v2 net-next 0/2] lan9303: Add basic offloading of unicast traffic
From: Egil Hjelmeland @ 2017-10-10 12:49 UTC (permalink / raw)
To: andrew, vivien.didelot, f.fainelli, netdev, linux-kernel; +Cc: Egil Hjelmeland
This series add basic offloading of unicast traffic to the lan9303
DSA driver.
Review welcome!
Changes v1 -> v2:
- Patch 1: Codestyle linting.
- Patch 2: Remember SWE_PORT_STATE while not bridged.
Added constant LAN9303_SWE_PORT_MIRROR_DISABLED.
Egil Hjelmeland (2):
net: dsa: lan9303: Move tag setup to new lan9303_setup_tagging
net: dsa: lan9303: Add basic offloading of unicast traffic
drivers/net/dsa/lan9303-core.c | 124 +++++++++++++++++++++++++++++++++++------
drivers/net/dsa/lan9303.h | 2 +
2 files changed, 109 insertions(+), 17 deletions(-)
--
2.11.0
^ permalink raw reply
* Re: [PATCH net-next] openvswitch: add ct_clear action
From: Eric Garver @ 2017-10-10 12:48 UTC (permalink / raw)
To: Pravin Shelar; +Cc: ovs dev, Linux Kernel Network Developers
In-Reply-To: <CAOrHB_BUVwWqYQcbWiQ69EhnQpY8epqu55pRBGA6g7xG21zSqQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
On Mon, Oct 09, 2017 at 09:41:53PM -0700, Pravin Shelar wrote:
> On Fri, Oct 6, 2017 at 9:44 AM, Eric Garver <e@erig.me> wrote:
> > This adds a ct_clear action for clearing conntrack state. ct_clear is
> > currently implemented in OVS userspace, but is not backed by an action
> > in the kernel datapath. This is useful for flows that may modify a
> > packet tuple after a ct lookup has already occurred.
> >
> > Signed-off-by: Eric Garver <e@erig.me>
> Patch mostly looks good. I have following comments.
Thanks for the review Pravin!
> > ---
> > include/uapi/linux/openvswitch.h | 2 ++
> > net/openvswitch/actions.c | 5 +++++
> > net/openvswitch/conntrack.c | 12 ++++++++++++
> > net/openvswitch/conntrack.h | 7 +++++++
> > net/openvswitch/flow_netlink.c | 5 +++++
> > 5 files changed, 31 insertions(+)
> >
> > diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
> > index 156ee4cab82e..1b6e510e2cc6 100644
> > --- a/include/uapi/linux/openvswitch.h
> > +++ b/include/uapi/linux/openvswitch.h
> > @@ -806,6 +806,7 @@ struct ovs_action_push_eth {
> > * packet.
> > * @OVS_ACTION_ATTR_POP_ETH: Pop the outermost Ethernet header off the
> > * packet.
> > + * @OVS_ACTION_ATTR_CT_CLEAR: Clear conntrack state from the packet.
> > *
> > * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all
> > * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
> > @@ -835,6 +836,7 @@ enum ovs_action_attr {
> > OVS_ACTION_ATTR_TRUNC, /* u32 struct ovs_action_trunc. */
> > OVS_ACTION_ATTR_PUSH_ETH, /* struct ovs_action_push_eth. */
> > OVS_ACTION_ATTR_POP_ETH, /* No argument. */
> > + OVS_ACTION_ATTR_CT_CLEAR, /* No argument. */
> >
> > __OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted
> > * from userspace. */
> > diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
> > index a54a556fcdb5..db9c7f2e662b 100644
> > --- a/net/openvswitch/actions.c
> > +++ b/net/openvswitch/actions.c
> > @@ -1203,6 +1203,10 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
> > return err == -EINPROGRESS ? 0 : err;
> > break;
> >
> > + case OVS_ACTION_ATTR_CT_CLEAR:
> > + err = ovs_ct_clear(skb, key);
> > + break;
> > +
> > case OVS_ACTION_ATTR_PUSH_ETH:
> > err = push_eth(skb, key, nla_data(a));
> > break;
> > @@ -1210,6 +1214,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
> > case OVS_ACTION_ATTR_POP_ETH:
> > err = pop_eth(skb, key);
> > break;
> > +
> > }
> Unrelated change.
>
Right. Not sure how that got there. :/
> >
> > if (unlikely(err)) {
> > diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
> > index d558e882ca0c..f9b73c726ad7 100644
> > --- a/net/openvswitch/conntrack.c
> > +++ b/net/openvswitch/conntrack.c
> > @@ -1129,6 +1129,18 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
> > return err;
> > }
> >
> > +int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key)
> > +{
> > + if (skb_nfct(skb)) {
> > + nf_conntrack_put(skb_nfct(skb));
> > + nf_ct_set(skb, NULL, 0);
> Can the new conntract state be appropriate? may be IP_CT_UNTRACKED?
>
I think that will be fine. I'll run my tests again to verify.
> > + }
> > +
> > + ovs_ct_fill_key(skb, key);
> > +
> I do not see need to refill the key if there is no skb-nf-ct.
>
Good point.
> > + return 0;
> > +}
> > +
> > static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
> > const struct sw_flow_key *key, bool log)
> > {
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox