Netdev List
 help / color / mirror / Atom feed
* [PATCH V4 net-next 13/15] smc: receive data from RMBE
From: Ursula Braun @ 2017-01-09 15:55 UTC (permalink / raw)
  To: davem; +Cc: netdev, linux-s390, schwidefsky, heiko.carstens, utz.bacher,
	ubraun
In-Reply-To: <20170109155526.10961-1-ubraun@linux.vnet.ibm.com>

Move RMBE data into the user space buffer and update the managing cursors.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
---
 net/smc/Makefile   |   2 +-
 net/smc/af_smc.c   |   7 +-
 net/smc/smc.h      |   4 +
 net/smc/smc_cdc.c  |   6 +-
 net/smc/smc_core.c |  10 +++
 net/smc/smc_rx.c   | 217 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/smc/smc_rx.h   |  23 ++++++
 net/smc/smc_tx.c   |  37 +++++++++
 net/smc/smc_tx.h   |   1 +
 9 files changed, 304 insertions(+), 3 deletions(-)
 create mode 100644 net/smc/smc_rx.c
 create mode 100644 net/smc/smc_rx.h

diff --git a/net/smc/Makefile b/net/smc/Makefile
index fc28d79..6255e29 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -1,3 +1,3 @@
 obj-$(CONFIG_SMC)	+= smc.o
 smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
-smc-y += smc_cdc.o smc_tx.o
+smc-y += smc_cdc.o smc_tx.o smc_rx.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index b62b69c..fc9c51c 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -38,6 +38,7 @@
 #include "smc_ib.h"
 #include "smc_pnet.h"
 #include "smc_tx.h"
+#include "smc_rx.h"
 
 static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
 						 * creation
@@ -412,6 +413,7 @@ static int smc_connect_rdma(struct smc_sock *smc)
 
 	mutex_unlock(&smc_create_lgr_pending);
 	smc_tx_init(smc);
+	smc_rx_init(smc);
 
 out_connected:
 	smc_copy_sock_settings_to_clc(smc);
@@ -755,6 +757,7 @@ static void smc_listen_work(struct work_struct *work)
 	}
 
 	smc_tx_init(new_smc);
+	smc_rx_init(new_smc);
 
 out_connected:
 	sk_refcnt_debug_inc(newsmcsk);
@@ -950,7 +953,7 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 	if (smc->use_fallback)
 		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
 	else
-		rc = sock_no_recvmsg(sock, msg, len, flags);
+		rc = smc_rx_recvmsg(smc, msg, len, flags);
 out:
 	release_sock(sk);
 	return rc;
@@ -1016,6 +1019,8 @@ static unsigned int smc_poll(struct file *file, struct socket *sock,
 			sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 		}
+		if (atomic_read(&smc->conn.bytes_to_rcv))
+			mask |= POLLIN | POLLRDNORM;
 		/* for now - to be enhanced in follow-on patch */
 	}
 
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 0c47d84..2bb1540 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -115,6 +115,10 @@ struct smc_connection {
 	struct smc_buf_desc	*rmb_desc;	/* RMBE descriptor */
 	int			rmbe_size;	/* RMBE size <== sock rmem */
 	int			rmbe_size_short;/* compressed notation */
+	int			rmbe_update_limit;
+						/* lower limit for consumer
+						 * cursor update
+						 */
 
 	struct smc_host_cdc_msg	local_tx_ctrl;	/* host byte order staging
 						 * buffer for CDC msg send
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index 77fe169..c0a6930 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -15,6 +15,7 @@
 #include "smc_wr.h"
 #include "smc_cdc.h"
 #include "smc_tx.h"
+#include "smc_rx.h"
 
 /********************************** send *************************************/
 
@@ -197,6 +198,7 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 		atomic_add(diff_prod, &conn->bytes_to_rcv);
 		/* guarantee 0 <= bytes_to_rcv <= rmbe_size */
 		smp_mb__after_atomic();
+		smc->sk.sk_data_ready(&smc->sk);
 	}
 
 	if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort)
@@ -216,7 +218,9 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 		return;
 
 	/* data available */
-	/* subsequent patch: send delayed ack, wake receivers */
+	if ((conn->local_rx_ctrl.prod_flags.write_blocked) ||
+	    (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req))
+		smc_tx_consumer_update(conn);
 }
 
 /* called under tasklet context */
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 537e387..e5c6395 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -489,6 +489,15 @@ struct smc_buf_desc *smc_rmb_get_slot(struct smc_link_group *lgr,
 	return NULL;
 }
 
+/* one of the conditions for announcing a receiver's current window size is
+ * that it "results in a minimum increase in the window size of 10% of the
+ * receive buffer space" [RFC7609]
+ */
+static inline int smc_rmb_wnd_update_limit(int rmbe_size)
+{
+	return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
+}
+
 /* create the tx buffer for an SMC socket */
 int smc_sndbuf_create(struct smc_sock *smc)
 {
@@ -620,6 +629,7 @@ int smc_rmb_create(struct smc_sock *smc)
 		conn->rmbe_size_short = tmp_bufsize_short;
 		smc->sk.sk_rcvbuf = tmp_bufsize * 2;
 		atomic_set(&conn->bytes_to_rcv, 0);
+		conn->rmbe_update_limit = smc_rmb_wnd_update_limit(tmp_bufsize);
 		return 0;
 	} else {
 		return -ENOMEM;
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
new file mode 100644
index 0000000..5d18787
--- /dev/null
+++ b/net/smc/smc_rx.c
@@ -0,0 +1,217 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage RMBE
+ * copy new RMBE data into user space
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <net/sock.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_cdc.h"
+#include "smc_tx.h" /* smc_tx_consumer_update() */
+#include "smc_rx.h"
+
+/* callback implementation for sk.sk_data_ready()
+ * to wakeup rcvbuf consumers that blocked with smc_rx_wait_data().
+ * indirectly called by smc_cdc_msg_recv_action().
+ */
+static void smc_rx_data_ready(struct sock *sk)
+{
+	struct socket_wq *wq;
+
+	/* derived from sock_def_readable() */
+	/* called already in smc_listen_work() */
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (skwq_has_sleeper(wq))
+		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
+						POLLRDNORM | POLLRDBAND);
+	if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
+	    (sk->sk_state == SMC_CLOSED))
+		sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
+	else
+		sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+	rcu_read_unlock();
+}
+
+/* blocks rcvbuf consumer until data is available or timeout or interrupted
+ *   @smc    smc socket
+ *   @timeo  pointer to wait time in jiffies; value 0 means do not block
+ * Returns:
+ * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown.
+ * 0 otherwise (nothing in rcvbuf, e.g. timed out or interrupted).
+ */
+static int smc_rx_wait_data(struct smc_sock *smc, long *timeo)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	struct smc_connection *conn = &smc->conn;
+	struct sock *sk = &smc->sk;
+	int rc;
+
+	if (atomic_read(&conn->bytes_to_rcv))
+		return 1;
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	add_wait_queue(sk_sleep(sk), &wait);
+	rc = sk_wait_event(sk, timeo,
+			   sk->sk_err ||
+			   sk->sk_shutdown & RCV_SHUTDOWN ||
+			   sock_flag(sk, SOCK_DONE) ||
+			   atomic_read(&conn->bytes_to_rcv) ||
+			   smc_cdc_rxed_any_close_or_senddone(conn),
+			   &wait);
+	remove_wait_queue(sk_sleep(sk), &wait);
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	return rc;
+}
+
+/* rcvbuf consumer: main API called by socket layer.
+ * called under sk lock.
+ */
+int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
+		   int flags)
+{
+	size_t copylen, read_done = 0, read_remaining = len;
+	size_t chunk_len, chunk_off, chunk_len_sum;
+	struct smc_connection *conn = &smc->conn;
+	union smc_host_cursor cons;
+	int readable, chunk;
+	char *rcvbuf_base;
+	struct sock *sk;
+	long timeo;
+	int target;		/* Read at least these many bytes */
+	int rc;
+
+	if (unlikely(flags & MSG_ERRQUEUE))
+		return -EINVAL; /* future work for sk.sk_family == AF_SMC */
+	if (flags & MSG_OOB)
+		return -EINVAL; /* future work */
+
+	sk = &smc->sk;
+	if (sk->sk_state == SMC_LISTEN)
+		return -ENOTCONN;
+	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+
+	msg->msg_namelen = 0;
+	/* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */
+	rcvbuf_base = conn->rmb_desc->cpu_addr;
+
+	do { /* while (read_remaining) */
+		if (read_done >= target)
+			break;
+
+		if (atomic_read(&conn->bytes_to_rcv))
+			goto copy;
+
+		if (read_done) {
+			if (sk->sk_err ||
+			    sk->sk_state == SMC_CLOSED ||
+			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
+			    !timeo ||
+			    signal_pending(current) ||
+			    smc_cdc_rxed_any_close_or_senddone(conn) ||
+			    conn->local_tx_ctrl.conn_state_flags.
+			    peer_conn_abort)
+				break;
+		} else {
+			if (sock_flag(sk, SOCK_DONE))
+				break;
+			if (sk->sk_err) {
+				read_done = sock_error(sk);
+				break;
+			}
+			if (sk->sk_shutdown & RCV_SHUTDOWN ||
+			    smc_cdc_rxed_any_close_or_senddone(conn) ||
+			    conn->local_tx_ctrl.conn_state_flags.
+			    peer_conn_abort)
+				break;
+			if (sk->sk_state == SMC_CLOSED) {
+				if (!sock_flag(sk, SOCK_DONE)) {
+					/* This occurs when user tries to read
+					 * from never connected socket.
+					 */
+					read_done = -ENOTCONN;
+					break;
+				}
+				break;
+			}
+			if (signal_pending(current)) {
+				read_done = sock_intr_errno(timeo);
+				break;
+			}
+		}
+
+		if (!atomic_read(&conn->bytes_to_rcv)) {
+			smc_rx_wait_data(smc, &timeo);
+			continue;
+		}
+
+copy:
+		/* initialize variables for 1st iteration of subsequent loop */
+		/* could be just 1 byte, even after smc_rx_wait_data above */
+		readable = atomic_read(&conn->bytes_to_rcv);
+		/* not more than what user space asked for */
+		copylen = min_t(size_t, read_remaining, readable);
+		smc_curs_write(&cons,
+			       smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+			       conn);
+		/* determine chunks where to read from rcvbuf */
+		/* either unwrapped case, or 1st chunk of wrapped case */
+		chunk_len = min_t(size_t,
+				  copylen, conn->rmbe_size - cons.count);
+		chunk_len_sum = chunk_len;
+		chunk_off = cons.count;
+		for (chunk = 0; chunk < 2; chunk++) {
+			if (!(flags & MSG_TRUNC)) {
+				rc = memcpy_to_msg(msg, rcvbuf_base + chunk_off,
+						   chunk_len);
+				if (rc) {
+					if (!read_done)
+						read_done = -EFAULT;
+					goto out;
+				}
+			}
+			read_remaining -= chunk_len;
+			read_done += chunk_len;
+
+			if (chunk_len_sum == copylen)
+				break; /* either on 1st or 2nd iteration */
+			/* prepare next (== 2nd) iteration */
+			chunk_len = copylen - chunk_len; /* remainder */
+			chunk_len_sum += chunk_len;
+			chunk_off = 0; /* modulo offset in recv ring buffer */
+		}
+
+		/* update cursors */
+		if (!(flags & MSG_PEEK)) {
+			smc_curs_add(conn->rmbe_size, &cons, copylen);
+			/* increased in tasklet smc_cdc_msg_recv_action() */
+			smp_mb__before_atomic();
+			atomic_sub(copylen, &conn->bytes_to_rcv);
+			/* guarantee 0 <= bytes_to_rcv <= rmbe_size */
+			smp_mb__after_atomic();
+			smc_curs_write(&conn->local_tx_ctrl.cons,
+				       smc_curs_read(&cons, conn),
+				       conn);
+			/* send consumer cursor update if required */
+			/* similar to advertising new TCP rcv_wnd if required */
+			smc_tx_consumer_update(conn);
+		}
+	} while (read_remaining);
+out:
+	return read_done;
+}
+
+/* Initialize receive properties on connection establishment. NB: not __init! */
+void smc_rx_init(struct smc_sock *smc)
+{
+	smc->sk.sk_data_ready = smc_rx_data_ready;
+}
diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h
new file mode 100644
index 0000000..b5b80e1
--- /dev/null
+++ b/net/smc/smc_rx.h
@@ -0,0 +1,23 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage RMBE
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_RX_H
+#define SMC_RX_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+
+#include "smc.h"
+
+void smc_rx_init(struct smc_sock *smc);
+int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
+		   int flags);
+
+#endif /* SMC_RX_H */
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index d86bef6..7e8799f 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -427,6 +427,43 @@ static void smc_tx_work(struct work_struct *work)
 	release_sock(&smc->sk);
 }
 
+void smc_tx_consumer_update(struct smc_connection *conn)
+{
+	union smc_host_cursor cfed, cons;
+	struct smc_cdc_tx_pend *pend;
+	struct smc_wr_buf *wr_buf;
+	int to_confirm, rc;
+
+	smc_curs_write(&cons,
+		       smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+		       conn);
+	smc_curs_write(&cfed,
+		       smc_curs_read(&conn->rx_curs_confirmed, conn),
+		       conn);
+	to_confirm = smc_curs_diff(conn->rmbe_size, &cfed, &cons);
+
+	if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
+	    ((to_confirm > conn->rmbe_update_limit) &&
+	     ((to_confirm > (conn->rmbe_size / 2)) ||
+	      conn->local_rx_ctrl.prod_flags.write_blocked))) {
+		rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
+					   &wr_buf, &pend);
+		if (!rc)
+			rc = smc_cdc_msg_send(conn, wr_buf, pend);
+		if (rc < 0) {
+			schedule_work(&conn->tx_work);
+			return;
+		}
+		smc_curs_write(&conn->rx_curs_confirmed,
+			       smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+			       conn);
+		conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;
+	}
+	if (conn->local_rx_ctrl.prod_flags.write_blocked &&
+	    !atomic_read(&conn->bytes_to_rcv))
+		conn->local_rx_ctrl.prod_flags.write_blocked = 0;
+}
+
 /***************************** send initialize *******************************/
 
 /* Initialize send properties on connection establishment. NB: not __init! */
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
index 931c666..1d6a0dc 100644
--- a/net/smc/smc_tx.h
+++ b/net/smc/smc_tx.h
@@ -30,5 +30,6 @@ void smc_tx_init(struct smc_sock *smc);
 int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
 int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
 void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
+void smc_tx_consumer_update(struct smc_connection *conn);
 
 #endif /* SMC_TX_H */
-- 
2.8.4

^ permalink raw reply related

* [PATCH V4 net-next 14/15] smc: socket closing and linkgroup cleanup
From: Ursula Braun @ 2017-01-09 15:55 UTC (permalink / raw)
  To: davem; +Cc: netdev, linux-s390, schwidefsky, heiko.carstens, utz.bacher,
	ubraun
In-Reply-To: <20170109155526.10961-1-ubraun@linux.vnet.ibm.com>

Implement smc_shutdown() and smc_release() handling, plus
delayed linkgroup cleanup for linkgroups without connections.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
---
 net/smc/Makefile    |   2 +-
 net/smc/af_smc.c    | 127 ++++++++++++---
 net/smc/smc.h       |  18 +++
 net/smc/smc_cdc.c   |  23 ++-
 net/smc/smc_cdc.h   |   1 +
 net/smc/smc_close.c | 441 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/smc/smc_close.h |  28 ++++
 net/smc/smc_core.c  |   7 +-
 net/smc/smc_tx.c    |   8 +
 net/smc/smc_wr.c    |  47 ++++--
 net/smc/smc_wr.h    |   2 +
 11 files changed, 668 insertions(+), 36 deletions(-)
 create mode 100644 net/smc/smc_close.c
 create mode 100644 net/smc/smc_close.h

diff --git a/net/smc/Makefile b/net/smc/Makefile
index 6255e29..5cf0caf 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -1,3 +1,3 @@
 obj-$(CONFIG_SMC)	+= smc.o
 smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
-smc-y += smc_cdc.o smc_tx.o smc_rx.o
+smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index fc9c51c..3f543d5 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -39,6 +39,7 @@
 #include "smc_pnet.h"
 #include "smc_tx.h"
 #include "smc_rx.h"
+#include "smc_close.h"
 
 static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
 						 * creation
@@ -70,14 +71,29 @@ static int smc_release(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
 	struct smc_sock *smc;
+	int rc = 0;
 
 	if (!sk)
 		goto out;
 
 	smc = smc_sk(sk);
-	lock_sock(sk);
+	sock_hold(sk);
+	if (sk->sk_state == SMC_LISTEN)
+		/* smc_close_non_accepted() is called and acquires
+		 * sock lock for child sockets again
+		 */
+		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+	else
+		lock_sock(sk);
 
-	sk->sk_state = SMC_CLOSED;
+	if (smc->use_fallback) {
+		sk->sk_state = SMC_CLOSED;
+		sk->sk_state_change(sk);
+	} else {
+		rc = smc_close_active(smc);
+		sock_set_flag(sk, SOCK_DEAD);
+		sk->sk_shutdown |= SHUTDOWN_MASK;
+	}
 	if (smc->clcsock) {
 		sock_release(smc->clcsock);
 		smc->clcsock = NULL;
@@ -86,11 +102,18 @@ static int smc_release(struct socket *sock)
 	/* detach socket */
 	sock_orphan(sk);
 	sock->sk = NULL;
+	if (smc->use_fallback) {
+		schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
+	} else if (sk->sk_state == SMC_CLOSED) {
+		smc_conn_free(&smc->conn);
+		schedule_delayed_work(&smc->sock_put_work,
+				      SMC_CLOSE_SOCK_PUT_DELAY);
+	}
 	release_sock(sk);
 
 	sock_put(sk);
 out:
-	return 0;
+	return rc;
 }
 
 static void smc_destruct(struct sock *sk)
@@ -120,6 +143,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
 	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
 	INIT_LIST_HEAD(&smc->accept_q);
 	spin_lock_init(&smc->accept_q_lock);
+	INIT_DELAYED_WORK(&smc->sock_put_work, smc_close_sock_put_work);
 	sk_refcnt_debug_inc(sk);
 
 	return sk;
@@ -417,7 +441,8 @@ static int smc_connect_rdma(struct smc_sock *smc)
 
 out_connected:
 	smc_copy_sock_settings_to_clc(smc);
-	smc->sk.sk_state = SMC_ACTIVE;
+	if (smc->sk.sk_state == SMC_INIT)
+		smc->sk.sk_state = SMC_ACTIVE;
 
 	return rc ? rc : local_contact;
 
@@ -559,8 +584,8 @@ static void smc_accept_unlink(struct sock *sk)
 /* remove a sock from the accept queue to bind it to a new socket created
  * for a socket accept call from user space
  */
-static struct sock *smc_accept_dequeue(struct sock *parent,
-				       struct socket *new_sock)
+struct sock *smc_accept_dequeue(struct sock *parent,
+				struct socket *new_sock)
 {
 	struct smc_sock *isk, *n;
 	struct sock *new_sk;
@@ -581,11 +606,17 @@ static struct sock *smc_accept_dequeue(struct sock *parent,
 }
 
 /* clean up for a created but never accepted sock */
-static void smc_close_non_accepted(struct sock *sk)
+void smc_close_non_accepted(struct sock *sk)
 {
 	struct smc_sock *smc = smc_sk(sk);
 
 	sock_hold(sk);
+	lock_sock(sk);
+	if (!sk->sk_lingertime)
+		/* wait for peer closing */
+		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
+	if (!smc->use_fallback)
+		smc_close_active(smc);
 	if (smc->clcsock) {
 		struct socket *tcp;
 
@@ -593,7 +624,16 @@ static void smc_close_non_accepted(struct sock *sk)
 		smc->clcsock = NULL;
 		sock_release(tcp);
 	}
-	/* more closing stuff to be added with socket closing patch */
+	sock_set_flag(sk, SOCK_DEAD);
+	sk->sk_shutdown |= SHUTDOWN_MASK;
+	if (smc->use_fallback) {
+		schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
+	} else {
+		smc_conn_free(&smc->conn);
+		schedule_delayed_work(&smc->sock_put_work,
+				      SMC_CLOSE_SOCK_PUT_DELAY);
+	}
+	release_sock(sk);
 	sock_put(sk);
 }
 
@@ -761,11 +801,12 @@ static void smc_listen_work(struct work_struct *work)
 
 out_connected:
 	sk_refcnt_debug_inc(newsmcsk);
-	newsmcsk->sk_state = SMC_ACTIVE;
+	if (newsmcsk->sk_state == SMC_INIT)
+		newsmcsk->sk_state = SMC_ACTIVE;
 enqueue:
 	if (local_contact == SMC_FIRST_CONTACT)
 		mutex_unlock(&smc_create_lgr_pending);
-	lock_sock(&lsmc->sk);
+	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
 	if (lsmc->sk.sk_state == SMC_LISTEN) {
 		smc_accept_enqueue(&lsmc->sk, newsmcsk);
 	} else { /* no longer listening */
@@ -791,6 +832,7 @@ static void smc_listen_work(struct work_struct *work)
 
 out_err:
 	newsmcsk->sk_state = SMC_CLOSED;
+	smc_conn_free(&new_smc->conn);
 	goto enqueue; /* queue new sock with sk_err set */
 }
 
@@ -911,7 +953,8 @@ static int smc_getname(struct socket *sock, struct sockaddr *addr,
 {
 	struct smc_sock *smc;
 
-	if (peer && (sock->sk->sk_state != SMC_ACTIVE))
+	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
+	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
 		return -ENOTCONN;
 
 	smc = smc_sk(sock->sk);
@@ -927,7 +970,9 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 
 	smc = smc_sk(sk);
 	lock_sock(sk);
-	if (sk->sk_state != SMC_ACTIVE)
+	if ((sk->sk_state != SMC_ACTIVE) &&
+	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
+	    (sk->sk_state != SMC_INIT))
 		goto out;
 	if (smc->use_fallback)
 		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
@@ -947,13 +992,21 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 
 	smc = smc_sk(sk);
 	lock_sock(sk);
-	if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
+	if ((sk->sk_state == SMC_INIT) ||
+	    (sk->sk_state == SMC_LISTEN) ||
+	    (sk->sk_state == SMC_CLOSED))
+		goto out;
+
+	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
+		rc = 0;
 		goto out;
+	}
 
 	if (smc->use_fallback)
 		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
 	else
 		rc = smc_rx_recvmsg(smc, msg, len, flags);
+
 out:
 	release_sock(sk);
 	return rc;
@@ -1013,7 +1066,8 @@ static unsigned int smc_poll(struct file *file, struct socket *sock,
 			mask |= smc_accept_poll(sk);
 		if (sk->sk_err)
 			mask |= POLLERR;
-		if (atomic_read(&smc->conn.sndbuf_space)) {
+		if (atomic_read(&smc->conn.sndbuf_space) ||
+		    (sk->sk_shutdown & SEND_SHUTDOWN)) {
 			mask |= POLLOUT | POLLWRNORM;
 		} else {
 			sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
@@ -1021,7 +1075,14 @@ static unsigned int smc_poll(struct file *file, struct socket *sock,
 		}
 		if (atomic_read(&smc->conn.bytes_to_rcv))
 			mask |= POLLIN | POLLRDNORM;
-		/* for now - to be enhanced in follow-on patch */
+		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
+		    (sk->sk_state == SMC_CLOSED))
+			mask |= POLLHUP;
+		if (sk->sk_shutdown & RCV_SHUTDOWN)
+			mask |= POLLIN | POLLRDNORM | POLLRDHUP;
+		if (sk->sk_state == SMC_APPCLOSEWAIT1)
+			mask |= POLLIN;
+
 	}
 
 	return mask;
@@ -1032,31 +1093,53 @@ static int smc_shutdown(struct socket *sock, int how)
 	struct sock *sk = sock->sk;
 	struct smc_sock *smc;
 	int rc = -EINVAL;
+	int rc1 = 0;
 
 	smc = smc_sk(sk);
 
 	if ((how < SHUT_RD) || (how > SHUT_RDWR))
-		goto out_err;
+		return rc;
 
 	lock_sock(sk);
 
 	rc = -ENOTCONN;
-	if (sk->sk_state == SMC_CLOSED)
+	if ((sk->sk_state != SMC_LISTEN) &&
+	    (sk->sk_state != SMC_ACTIVE) &&
+	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
+	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
+	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
+	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
+	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
 		goto out;
 	if (smc->use_fallback) {
 		rc = kernel_sock_shutdown(smc->clcsock, how);
 		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
 		if (sk->sk_shutdown == SHUTDOWN_MASK)
 			sk->sk_state = SMC_CLOSED;
-	} else {
-		rc = sock_no_shutdown(sock, how);
+		goto out;
+	}
+	switch (how) {
+	case SHUT_RDWR:		/* shutdown in both directions */
+		rc = smc_close_active(smc);
+		break;
+	case SHUT_WR:
+		rc = smc_close_shutdown_write(smc);
+		break;
+	case SHUT_RD:
+		if (sk->sk_state == SMC_LISTEN)
+			rc = smc_close_active(smc);
+		else
+			rc = 0;
+			/* nothing more to do because peer is not involved */
+		break;
 	}
+	rc1 = kernel_sock_shutdown(smc->clcsock, how);
+	/* map sock_shutdown_cmd constants to sk_shutdown value range */
+	sk->sk_shutdown |= how + 1;
 
 out:
 	release_sock(sk);
-
-out_err:
-	return rc;
+	return rc ? rc : rc1;
 }
 
 static int smc_setsockopt(struct socket *sock, int level, int optname,
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 2bb1540..959a5d2 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -30,6 +30,16 @@ enum smc_state {		/* possible states of an SMC socket */
 	SMC_INIT	= 2,
 	SMC_CLOSED	= 7,
 	SMC_LISTEN	= 10,
+	/* normal close */
+	SMC_PEERCLOSEWAIT1	= 20,
+	SMC_PEERCLOSEWAIT2	= 21,
+	SMC_APPFINCLOSEWAIT	= 24,
+	SMC_APPCLOSEWAIT1	= 22,
+	SMC_APPCLOSEWAIT2	= 23,
+	SMC_PEERFINCLOSEWAIT	= 25,
+	/* abnormal close */
+	SMC_PEERABORTWAIT	= 26,
+	SMC_PROCESSABORT	= 27,
 };
 
 struct smc_link_group;
@@ -164,7 +174,13 @@ struct smc_sock {				/* smc sock container */
 	struct work_struct	smc_listen_work;/* prepare new accept socket */
 	struct list_head	accept_q;	/* sockets to be accepted */
 	spinlock_t		accept_q_lock;	/* protects accept_q */
+	struct delayed_work	sock_put_work;	/* final socket freeing */
 	bool			use_fallback;	/* fallback to tcp */
+	u8			wait_close_tx_prepared : 1;
+						/* shutdown wr or close
+						 * started, waiting for unsent
+						 * data to be sent
+						 */
 };
 
 static inline struct smc_sock *smc_sk(const struct sock *sk)
@@ -250,5 +266,7 @@ void smc_conn_free(struct smc_connection *conn);
 int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr,
 		    struct smc_ib_device *smcibdev, u8 ibport,
 		    struct smc_clc_msg_local *lcl, int srv_first_contact);
+struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock);
+void smc_close_non_accepted(struct sock *sk);
 
 #endif	/* __SMC_H */
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index c0a6930..5a33949 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -16,6 +16,7 @@
 #include "smc_cdc.h"
 #include "smc_tx.h"
 #include "smc_rx.h"
+#include "smc_close.h"
 
 /********************************** send *************************************/
 
@@ -55,6 +56,9 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
 			       cdcpend->conn);
 	}
 	smc_tx_sndbuf_nonfull(smc);
+	if (smc->sk.sk_state != SMC_ACTIVE)
+		/* wake up smc_close_wait_tx_pends() */
+		smc->sk.sk_state_change(&smc->sk);
 	bh_unlock_sock(&smc->sk);
 }
 
@@ -149,6 +153,14 @@ void smc_cdc_tx_dismiss_slots(struct smc_connection *conn)
 				(unsigned long)conn);
 }
 
+bool smc_cdc_tx_has_pending(struct smc_connection *conn)
+{
+	struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+
+	return smc_wr_tx_has_pending(link, SMC_CDC_MSG_TYPE,
+				     smc_cdc_tx_filter, (unsigned long)conn);
+}
+
 /********************************* receive ***********************************/
 
 static inline bool smc_cdc_before(u16 seq1, u16 seq2)
@@ -201,15 +213,20 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 		smc->sk.sk_data_ready(&smc->sk);
 	}
 
-	if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort)
+	if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
 		smc->sk.sk_err = ECONNRESET;
+		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+	}
 	if (smc_cdc_rxed_any_close_or_senddone(conn))
-		/* subsequent patch: terminate connection */
+		smc_close_passive_received(smc);
 
 	/* piggy backed tx info */
 	/* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */
-	if (diff_cons && smc_tx_prepared_sends(conn))
+	if (diff_cons && smc_tx_prepared_sends(conn)) {
 		smc_tx_sndbuf_nonempty(conn);
+		/* trigger socket release if connection closed */
+		smc_close_wake_tx_prepared(smc);
+	}
 
 	/* subsequent patch: trigger socket release if connection closed */
 
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
index 135f613..8e1d76f 100644
--- a/net/smc/smc_cdc.h
+++ b/net/smc/smc_cdc.h
@@ -212,6 +212,7 @@ void smc_cdc_tx_dismiss_slots(struct smc_connection *conn);
 int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf,
 		     struct smc_cdc_tx_pend *pend);
 int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn);
+bool smc_cdc_tx_has_pending(struct smc_connection *conn);
 int smc_cdc_init(void) __init;
 
 #endif /* SMC_CDC_H */
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
new file mode 100644
index 0000000..d70c05b
--- /dev/null
+++ b/net/smc/smc_close.c
@@ -0,0 +1,441 @@
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  Socket Closing - normal and abnormal
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/workqueue.h>
+#include <net/sock.h>
+
+#include "smc.h"
+#include "smc_tx.h"
+#include "smc_cdc.h"
+#include "smc_close.h"
+
+#define SMC_CLOSE_WAIT_TX_PENDS_TIME		(5 * HZ)
+
+static void smc_close_cleanup_listen(struct sock *parent)
+{
+	struct sock *sk;
+
+	/* Close non-accepted connections */
+	while ((sk = smc_accept_dequeue(parent, NULL)))
+		smc_close_non_accepted(sk);
+}
+
+static void smc_close_wait_tx_pends(struct smc_sock *smc)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	struct sock *sk = &smc->sk;
+	signed long timeout;
+
+	timeout = SMC_CLOSE_WAIT_TX_PENDS_TIME;
+	add_wait_queue(sk_sleep(sk), &wait);
+	while (!signal_pending(current) && timeout) {
+		int rc;
+
+		rc = sk_wait_event(sk, &timeout,
+				   !smc_cdc_tx_has_pending(&smc->conn),
+				   &wait);
+		if (rc)
+			break;
+	}
+	remove_wait_queue(sk_sleep(sk), &wait);
+}
+
+/* wait for sndbuf data being transmitted */
+static void smc_close_stream_wait(struct smc_sock *smc, long timeout)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	struct sock *sk = &smc->sk;
+
+	if (!timeout)
+		return;
+
+	if (!smc_tx_prepared_sends(&smc->conn))
+		return;
+
+	smc->wait_close_tx_prepared = 1;
+	add_wait_queue(sk_sleep(sk), &wait);
+	while (!signal_pending(current) && timeout) {
+		int rc;
+
+		rc = sk_wait_event(sk, &timeout,
+				   !smc_tx_prepared_sends(&smc->conn) ||
+				   (sk->sk_err == ECONNABORTED) ||
+				   (sk->sk_err == ECONNRESET),
+				   &wait);
+		if (rc)
+			break;
+	}
+	remove_wait_queue(sk_sleep(sk), &wait);
+	smc->wait_close_tx_prepared = 0;
+}
+
+void smc_close_wake_tx_prepared(struct smc_sock *smc)
+{
+	if (smc->wait_close_tx_prepared)
+		/* wake up socket closing */
+		smc->sk.sk_state_change(&smc->sk);
+}
+
+static int smc_close_wr(struct smc_connection *conn)
+{
+	conn->local_tx_ctrl.conn_state_flags.peer_done_writing = 1;
+
+	return smc_cdc_get_slot_and_msg_send(conn);
+}
+
+static int smc_close_final(struct smc_connection *conn)
+{
+	if (atomic_read(&conn->bytes_to_rcv))
+		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+	else
+		conn->local_tx_ctrl.conn_state_flags.peer_conn_closed = 1;
+
+	return smc_cdc_get_slot_and_msg_send(conn);
+}
+
+static int smc_close_abort(struct smc_connection *conn)
+{
+	conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+
+	return smc_cdc_get_slot_and_msg_send(conn);
+}
+
+/* terminate smc socket abnormally - active abort
+ * RDMA communication no longer possible
+ */
+void smc_close_active_abort(struct smc_sock *smc)
+{
+	struct smc_cdc_conn_state_flags *txflags =
+		&smc->conn.local_tx_ctrl.conn_state_flags;
+
+	bh_lock_sock(&smc->sk);
+	smc->sk.sk_err = ECONNABORTED;
+	if (smc->clcsock && smc->clcsock->sk) {
+		smc->clcsock->sk->sk_err = ECONNABORTED;
+		smc->clcsock->sk->sk_state_change(smc->clcsock->sk);
+	}
+	switch (smc->sk.sk_state) {
+	case SMC_INIT:
+		smc->sk.sk_state = SMC_PEERABORTWAIT;
+		break;
+	case SMC_APPCLOSEWAIT1:
+	case SMC_APPCLOSEWAIT2:
+		txflags->peer_conn_abort = 1;
+		sock_release(smc->clcsock);
+		if (!smc_cdc_rxed_any_close(&smc->conn))
+			smc->sk.sk_state = SMC_PEERABORTWAIT;
+		else
+			smc->sk.sk_state = SMC_CLOSED;
+		break;
+	case SMC_PEERCLOSEWAIT1:
+	case SMC_PEERCLOSEWAIT2:
+		if (!txflags->peer_conn_closed) {
+			smc->sk.sk_state = SMC_PEERABORTWAIT;
+			txflags->peer_conn_abort = 1;
+			sock_release(smc->clcsock);
+		} else {
+			smc->sk.sk_state = SMC_CLOSED;
+		}
+		break;
+	case SMC_PROCESSABORT:
+	case SMC_APPFINCLOSEWAIT:
+		if (!txflags->peer_conn_closed) {
+			txflags->peer_conn_abort = 1;
+			sock_release(smc->clcsock);
+		}
+		smc->sk.sk_state = SMC_CLOSED;
+		break;
+	case SMC_PEERFINCLOSEWAIT:
+	case SMC_PEERABORTWAIT:
+	case SMC_CLOSED:
+		break;
+	}
+
+	sock_set_flag(&smc->sk, SOCK_DEAD);
+	bh_unlock_sock(&smc->sk);
+	smc->sk.sk_state_change(&smc->sk);
+}
+
+int smc_close_active(struct smc_sock *smc)
+{
+	struct smc_cdc_conn_state_flags *txflags =
+		&smc->conn.local_tx_ctrl.conn_state_flags;
+	long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT;
+	struct smc_connection *conn = &smc->conn;
+	struct sock *sk = &smc->sk;
+	int old_state;
+	int rc = 0;
+
+	if (sock_flag(sk, SOCK_LINGER) &&
+	    !(current->flags & PF_EXITING))
+		timeout = sk->sk_lingertime;
+
+again:
+	old_state = sk->sk_state;
+	switch (old_state) {
+	case SMC_INIT:
+		sk->sk_state = SMC_CLOSED;
+		if (smc->smc_listen_work.func)
+			flush_work(&smc->smc_listen_work);
+		sock_put(sk);
+		break;
+	case SMC_LISTEN:
+		sk->sk_state = SMC_CLOSED;
+		sk->sk_state_change(sk); /* wake up accept */
+		if (smc->clcsock && smc->clcsock->sk) {
+			rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
+			/* wake up kernel_accept of smc_tcp_listen_worker */
+			smc->clcsock->sk->sk_data_ready(smc->clcsock->sk);
+		}
+		release_sock(sk);
+		smc_close_cleanup_listen(sk);
+		flush_work(&smc->tcp_listen_work);
+		lock_sock(sk);
+		break;
+	case SMC_ACTIVE:
+		smc_close_stream_wait(smc, timeout);
+		release_sock(sk);
+		cancel_work_sync(&conn->tx_work);
+		lock_sock(sk);
+		if (sk->sk_state == SMC_ACTIVE) {
+			/* send close request */
+			rc = smc_close_final(conn);
+			sk->sk_state = SMC_PEERCLOSEWAIT1;
+		} else {
+			/* peer event has changed the state */
+			goto again;
+		}
+		break;
+	case SMC_APPFINCLOSEWAIT:
+		/* socket already shutdown wr or both (active close) */
+		if (txflags->peer_done_writing &&
+		    !txflags->peer_conn_closed) {
+			/* just shutdown wr done, send close request */
+			rc = smc_close_final(conn);
+		}
+		sk->sk_state = SMC_CLOSED;
+		smc_close_wait_tx_pends(smc);
+		break;
+	case SMC_APPCLOSEWAIT1:
+	case SMC_APPCLOSEWAIT2:
+		if (!smc_cdc_rxed_any_close(conn))
+			smc_close_stream_wait(smc, timeout);
+		release_sock(sk);
+		cancel_work_sync(&conn->tx_work);
+		lock_sock(sk);
+		if (sk->sk_err != ECONNABORTED) {
+			/* confirm close from peer */
+			rc = smc_close_final(conn);
+			if (rc)
+				break;
+		}
+		if (smc_cdc_rxed_any_close(conn))
+			/* peer has closed the socket already */
+			sk->sk_state = SMC_CLOSED;
+		else
+			/* peer has just issued a shutdown write */
+			sk->sk_state = SMC_PEERFINCLOSEWAIT;
+		smc_close_wait_tx_pends(smc);
+		break;
+	case SMC_PEERCLOSEWAIT1:
+	case SMC_PEERCLOSEWAIT2:
+	case SMC_PEERFINCLOSEWAIT:
+		/* peer sending PeerConnectionClosed will cause transition */
+		break;
+	case SMC_PROCESSABORT:
+		cancel_work_sync(&conn->tx_work);
+		smc_close_abort(conn);
+		sk->sk_state = SMC_CLOSED;
+		smc_close_wait_tx_pends(smc);
+		break;
+	case SMC_PEERABORTWAIT:
+	case SMC_CLOSED:
+		/* nothing to do, add tracing in future patch */
+		break;
+	}
+
+	if (old_state != sk->sk_state)
+		sk->sk_state_change(&smc->sk);
+	return rc;
+}
+
+static void smc_close_passive_abort_received(struct smc_sock *smc)
+{
+	struct smc_cdc_conn_state_flags *txflags =
+		&smc->conn.local_tx_ctrl.conn_state_flags;
+	struct sock *sk = &smc->sk;
+
+	switch (sk->sk_state) {
+	case SMC_ACTIVE:
+	case SMC_APPFINCLOSEWAIT:
+	case SMC_APPCLOSEWAIT1:
+	case SMC_APPCLOSEWAIT2:
+		smc_close_abort(&smc->conn);
+		sk->sk_state = SMC_PROCESSABORT;
+		break;
+	case SMC_PEERCLOSEWAIT1:
+	case SMC_PEERCLOSEWAIT2:
+		if (txflags->peer_done_writing &&
+		    !txflags->peer_conn_closed) {
+			/* just shutdown, but not yet closed locally */
+			smc_close_abort(&smc->conn);
+			sk->sk_state = SMC_PROCESSABORT;
+		} else {
+			sk->sk_state = SMC_CLOSED;
+		}
+		break;
+	case SMC_PEERFINCLOSEWAIT:
+	case SMC_PEERABORTWAIT:
+		sk->sk_state = SMC_CLOSED;
+		break;
+	case SMC_INIT:
+	case SMC_PROCESSABORT:
+	/* nothing to do, add tracing in future patch */
+		break;
+	}
+}
+
+/* Some kind of closing has been received: peer_conn_closed, peer_conn_abort,
+ * or peer_done_writing.
+ * Called under tasklet context.
+ */
+void smc_close_passive_received(struct smc_sock *smc)
+{
+	struct smc_cdc_conn_state_flags *rxflags =
+		&smc->conn.local_rx_ctrl.conn_state_flags;
+	struct sock *sk = &smc->sk;
+	int old_state;
+
+	sk->sk_shutdown |= RCV_SHUTDOWN;
+	if (smc->clcsock && smc->clcsock->sk)
+		smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN;
+	sock_set_flag(&smc->sk, SOCK_DONE);
+
+	old_state = sk->sk_state;
+
+	if (rxflags->peer_conn_abort) {
+		smc_close_passive_abort_received(smc);
+		goto wakeup;
+	}
+
+	switch (sk->sk_state) {
+	case SMC_INIT:
+		if (atomic_read(&smc->conn.bytes_to_rcv) ||
+		    (rxflags->peer_done_writing &&
+		     !rxflags->peer_conn_closed))
+			sk->sk_state = SMC_APPCLOSEWAIT1;
+		else
+			sk->sk_state = SMC_CLOSED;
+		break;
+	case SMC_ACTIVE:
+		sk->sk_state = SMC_APPCLOSEWAIT1;
+		break;
+	case SMC_PEERCLOSEWAIT1:
+		if (rxflags->peer_done_writing)
+			sk->sk_state = SMC_PEERCLOSEWAIT2;
+		/* fall through to check for closing */
+	case SMC_PEERCLOSEWAIT2:
+	case SMC_PEERFINCLOSEWAIT:
+		if (!smc_cdc_rxed_any_close(&smc->conn))
+			break;
+		if (sock_flag(sk, SOCK_DEAD) &&
+		    (sk->sk_shutdown == SHUTDOWN_MASK)) {
+			/* smc_release has already been called locally */
+			sk->sk_state = SMC_CLOSED;
+		} else {
+			/* just shutdown, but not yet closed locally */
+			sk->sk_state = SMC_APPFINCLOSEWAIT;
+		}
+		break;
+	case SMC_APPCLOSEWAIT1:
+	case SMC_APPCLOSEWAIT2:
+	case SMC_APPFINCLOSEWAIT:
+	case SMC_PEERABORTWAIT:
+	case SMC_PROCESSABORT:
+	case SMC_CLOSED:
+		/* nothing to do, add tracing in future patch */
+		break;
+	}
+
+wakeup:
+	if (old_state != sk->sk_state)
+		sk->sk_state_change(sk);
+	sk->sk_data_ready(sk); /* wakeup blocked rcvbuf consumers */
+	sk->sk_write_space(sk); /* wakeup blocked sndbuf producers */
+
+	if ((sk->sk_state == SMC_CLOSED) &&
+	    (sock_flag(sk, SOCK_DEAD) || (old_state == SMC_INIT))) {
+		smc_conn_free(&smc->conn);
+		schedule_delayed_work(&smc->sock_put_work,
+				      SMC_CLOSE_SOCK_PUT_DELAY);
+	}
+}
+
+void smc_close_sock_put_work(struct work_struct *work)
+{
+	struct smc_sock *smc = container_of(to_delayed_work(work),
+					    struct smc_sock,
+					    sock_put_work);
+
+	sock_put(&smc->sk);
+}
+
+int smc_close_shutdown_write(struct smc_sock *smc)
+{
+	struct smc_connection *conn = &smc->conn;
+	long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT;
+	struct sock *sk = &smc->sk;
+	int old_state;
+	int rc = 0;
+
+	if (sock_flag(sk, SOCK_LINGER))
+		timeout = sk->sk_lingertime;
+
+again:
+	old_state = sk->sk_state;
+	switch (old_state) {
+	case SMC_ACTIVE:
+		smc_close_stream_wait(smc, timeout);
+		release_sock(sk);
+		cancel_work_sync(&conn->tx_work);
+		lock_sock(sk);
+		/* send close wr request */
+		rc = smc_close_wr(conn);
+		if (sk->sk_state == SMC_ACTIVE)
+			sk->sk_state = SMC_PEERCLOSEWAIT1;
+		else
+			goto again;
+		break;
+	case SMC_APPCLOSEWAIT1:
+		/* passive close */
+		if (!smc_cdc_rxed_any_close(conn))
+			smc_close_stream_wait(smc, timeout);
+		release_sock(sk);
+		cancel_work_sync(&conn->tx_work);
+		lock_sock(sk);
+		/* confirm close from peer */
+		rc = smc_close_wr(conn);
+		sk->sk_state = SMC_APPCLOSEWAIT2;
+		break;
+	case SMC_APPCLOSEWAIT2:
+	case SMC_PEERFINCLOSEWAIT:
+	case SMC_PEERCLOSEWAIT1:
+	case SMC_PEERCLOSEWAIT2:
+	case SMC_APPFINCLOSEWAIT:
+	case SMC_PROCESSABORT:
+	case SMC_PEERABORTWAIT:
+		/* nothing to do, add tracing in future patch */
+		break;
+	}
+
+	if (old_state != sk->sk_state)
+		sk->sk_state_change(&smc->sk);
+	return rc;
+}
diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h
new file mode 100644
index 0000000..bc9a2df
--- /dev/null
+++ b/net/smc/smc_close.h
@@ -0,0 +1,28 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Socket Closing
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_CLOSE_H
+#define SMC_CLOSE_H
+
+#include <linux/workqueue.h>
+
+#include "smc.h"
+
+#define SMC_MAX_STREAM_WAIT_TIMEOUT		(2 * HZ)
+#define SMC_CLOSE_SOCK_PUT_DELAY		HZ
+
+void smc_close_wake_tx_prepared(struct smc_sock *smc);
+void smc_close_active_abort(struct smc_sock *smc);
+int smc_close_active(struct smc_sock *smc);
+void smc_close_passive_received(struct smc_sock *smc);
+void smc_close_sock_put_work(struct work_struct *work);
+int smc_close_shutdown_write(struct smc_sock *smc);
+
+#endif /* SMC_CLOSE_H */
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index e5c6395..8b1d343 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -23,6 +23,7 @@
 #include "smc_wr.h"
 #include "smc_llc.h"
 #include "smc_cdc.h"
+#include "smc_close.h"
 
 #define SMC_LGR_NUM_INCR	256
 #define SMC_LGR_FREE_DELAY	(600 * HZ)
@@ -295,6 +296,7 @@ void smc_lgr_free(struct smc_link_group *lgr)
 void smc_lgr_terminate(struct smc_link_group *lgr)
 {
 	struct smc_connection *conn;
+	struct smc_sock *smc;
 	struct rb_node *node;
 
 	spin_lock_bh(&smc_lgr_list.lock);
@@ -311,11 +313,14 @@ void smc_lgr_terminate(struct smc_link_group *lgr)
 	node = rb_first(&lgr->conns_all);
 	while (node) {
 		conn = rb_entry(node, struct smc_connection, alert_node);
+		smc = container_of(conn, struct smc_sock, conn);
+		sock_hold(&smc->sk);
 		__smc_lgr_unregister_conn(conn);
+		smc_close_active_abort(smc);
+		sock_put(&smc->sk);
 		node = rb_first(&lgr->conns_all);
 	}
 	write_unlock_bh(&lgr->conns_lock);
-	schedule_delayed_work(&lgr->free_work, SMC_LGR_FREE_DELAY);
 }
 
 /* Determine vlan of internal TCP socket.
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 7e8799f..6e73b28 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -139,6 +139,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
 		if (sk->sk_state == SMC_INIT)
 			return -ENOTCONN;
 		if (smc->sk.sk_shutdown & SEND_SHUTDOWN ||
+		    (smc->sk.sk_err == ECONNABORTED) ||
 		    conn->local_tx_ctrl.conn_state_flags.peer_conn_abort)
 			return -EPIPE;
 		if (smc_cdc_rxed_any_close(conn))
@@ -392,6 +393,13 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
 				   &pend);
 	if (rc < 0) {
 		if (rc == -EBUSY) {
+			struct smc_sock *smc =
+				container_of(conn, struct smc_sock, conn);
+
+			if (smc->sk.sk_err == ECONNABORTED) {
+				rc = sock_error(&smc->sk);
+				goto out_unlock;
+			}
 			rc = 0;
 			schedule_work(&conn->tx_work);
 		}
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 14f3f3f..eadf157 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -81,6 +81,8 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
 	if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
 		return;
 	if (wc->status) {
+		struct smc_link_group *lgr;
+
 		for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
 			/* clear full struct smc_wr_tx_pend including .priv */
 			memset(&link->wr_tx_pends[i], 0,
@@ -89,9 +91,10 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
 			       sizeof(link->wr_tx_bufs[i]));
 			clear_bit(i, link->wr_tx_mask);
 		}
-		/* tbd in future patch: terminate connections of this link
-		 * group abnormally
-		 */
+		/* terminate connections of this link group abnormally */
+		lgr = container_of(link, struct smc_link_group,
+				   lnk[SMC_SINGLE_LINK]);
+		smc_lgr_terminate(lgr);
 	}
 	if (pnd_snd.handler)
 		pnd_snd.handler(&pnd_snd.priv, link, wc->status);
@@ -176,9 +179,12 @@ int smc_wr_tx_get_free_slot(struct smc_link *link,
 			(smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
 			SMC_WR_TX_WAIT_FREE_SLOT_TIME);
 		if (!rc) {
-			/* tbd in future patch: timeout - terminate connections
-			 * of this link group abnormally
-			 */
+			/* timeout - terminate connections */
+			struct smc_link_group *lgr;
+
+			lgr = container_of(link, struct smc_link_group,
+					   lnk[SMC_SINGLE_LINK]);
+			smc_lgr_terminate(lgr);
 			return -EPIPE;
 		}
 		if (rc == -ERESTARTSYS)
@@ -256,6 +262,24 @@ void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_rx_hdr_type,
 	}
 }
 
+bool smc_wr_tx_has_pending(struct smc_link *link, u8 wr_rx_hdr_type,
+			   smc_wr_tx_filter filter, unsigned long data)
+{
+	struct smc_wr_rx_hdr *wr_tx;
+	int i;
+
+	/* scan the tx slots whose bit is set in wr_tx_mask */
+	for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
+		/* i is a tx slot index: the header is in the tx buffer */
+		wr_tx = (struct smc_wr_rx_hdr *)&link->wr_tx_bufs[i];
+		if (wr_tx->type != wr_rx_hdr_type)
+			continue;
+		if (filter(&link->wr_tx_pends[i].priv, data))
+			return true;
+	}
+	return false;
+}
+
 /****************************** receive queue ********************************/
 
 int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
@@ -310,14 +334,19 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
 			smc_wr_rx_demultiplex(&wc[i]);
 			smc_wr_rx_post(link); /* refill WR RX */
 		} else {
+			struct smc_link_group *lgr;
+
 			/* handle status errors */
 			switch (wc[i].status) {
 			case IB_WC_RETRY_EXC_ERR:
 			case IB_WC_RNR_RETRY_EXC_ERR:
 			case IB_WC_WR_FLUSH_ERR:
-			/* tbd in future patch: terminate connections of this
-			 * link group abnormally
-			 */
+				/* terminate connections of this link group
+				 * abnormally
+				 */
+				lgr = container_of(link, struct smc_link_group,
+						   lnk[SMC_SINGLE_LINK]);
+				smc_lgr_terminate(lgr);
 				break;
 			default:
 				smc_wr_rx_post(link); /* refill WR RX */
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
index 124f857..0b9beed 100644
--- a/net/smc/smc_wr.h
+++ b/net/smc/smc_wr.h
@@ -92,6 +92,8 @@ int smc_wr_tx_put_slot(struct smc_link *link,
 int smc_wr_tx_send(struct smc_link *link,
 		   struct smc_wr_tx_pend_priv *wr_pend_priv);
 void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
+bool smc_wr_tx_has_pending(struct smc_link *link, u8 wr_rx_hdr_type,
+			   smc_wr_tx_filter filter, unsigned long data);
 void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type,
 			     smc_wr_tx_filter filter,
 			     smc_wr_tx_dismisser dismisser,
-- 
2.8.4

^ permalink raw reply related

* [PATCH V4 net-next 15/15] smc: netlink interface for SMC sockets
From: Ursula Braun @ 2017-01-09 15:55 UTC (permalink / raw)
  To: davem; +Cc: netdev, linux-s390, schwidefsky, heiko.carstens, utz.bacher,
	ubraun
In-Reply-To: <20170109155526.10961-1-ubraun@linux.vnet.ibm.com>

Support for SMC socket monitoring via netlink sockets of protocol
NETLINK_SOCK_DIAG.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
---
 include/net/smc.h             |  20 ++++
 include/net/sock.h            |   3 +
 include/uapi/linux/netlink.h  |   1 +
 include/uapi/linux/smc_diag.h |  85 +++++++++++++++++
 net/smc/Kconfig               |   9 ++
 net/smc/Makefile              |   1 +
 net/smc/af_smc.c              |  43 ++++++++-
 net/smc/smc.h                 |   2 +
 net/smc/smc_close.c           |   1 +
 net/smc/smc_diag.c            | 215 ++++++++++++++++++++++++++++++++++++++++++
 10 files changed, 379 insertions(+), 1 deletion(-)
 create mode 100644 include/net/smc.h
 create mode 100644 include/uapi/linux/smc_diag.h
 create mode 100644 net/smc/smc_diag.c

diff --git a/include/net/smc.h b/include/net/smc.h
new file mode 100644
index 0000000..12d2635
--- /dev/null
+++ b/include/net/smc.h
@@ -0,0 +1,20 @@
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  Definitions for the SMC module (socket related)
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+#ifndef _SMC_H
+#define _SMC_H
+
+struct smc_hashinfo {
+	rwlock_t lock;
+	struct hlist_head ht;
+};
+
+int smc_hash_sk(struct sock *sk);
+void smc_unhash_sk(struct sock *sk);
+#endif	/* _SMC_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index 99deda6..389a0a6 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -70,6 +70,7 @@
 #include <net/checksum.h>
 #include <net/tcp_states.h>
 #include <linux/net_tstamp.h>
+#include <net/smc.h>
 
 /*
  * This structure really needs to be cleaned up.
@@ -986,6 +987,7 @@ struct request_sock_ops;
 struct timewait_sock_ops;
 struct inet_hashinfo;
 struct raw_hashinfo;
+struct smc_hashinfo;
 struct module;
 
 /*
@@ -1094,6 +1096,7 @@ struct proto {
 		struct inet_hashinfo	*hashinfo;
 		struct udp_table	*udp_table;
 		struct raw_hashinfo	*raw_hash;
+		struct smc_hashinfo	*smc_hash;
 	} h;
 
 	struct module		*owner;
diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h
index 0dba4e4..f3946a2 100644
--- a/include/uapi/linux/netlink.h
+++ b/include/uapi/linux/netlink.h
@@ -27,6 +27,7 @@
 #define NETLINK_ECRYPTFS	19
 #define NETLINK_RDMA		20
 #define NETLINK_CRYPTO		21	/* Crypto layer */
+#define NETLINK_SMC		22	/* SMC monitoring */
 
 #define NETLINK_INET_DIAG	NETLINK_SOCK_DIAG
 
diff --git a/include/uapi/linux/smc_diag.h b/include/uapi/linux/smc_diag.h
new file mode 100644
index 0000000..0063919
--- /dev/null
+++ b/include/uapi/linux/smc_diag.h
@@ -0,0 +1,85 @@
+#ifndef _UAPI_SMC_DIAG_H_
+#define _UAPI_SMC_DIAG_H_
+
+#include <linux/types.h>
+#include <linux/inet_diag.h>
+#include <rdma/ib_verbs.h>
+
+/* Request structure */
+struct smc_diag_req {
+	__u8	diag_family;
+	__u8	pad[2];
+	__u8	diag_ext;		/* Query extended information */
+	struct inet_diag_sockid	id;
+};
+
+/* Base info structure. It contains socket identity (addrs/ports/cookie) based
+ * on the internal clcsock, and more SMC-related socket data
+ */
+struct smc_diag_msg {
+	__u8	diag_family;
+	__u8	diag_state;
+	__u8	diag_fallback;
+	__u8	diag_shutdown;
+	struct inet_diag_sockid id;
+
+	__u32	diag_uid;
+	__u64	diag_inode;
+};
+
+/* Extensions */
+
+enum {
+	SMC_DIAG_NONE,
+	SMC_DIAG_CONNINFO,
+	SMC_DIAG_LGRINFO,
+	SMC_DIAG_SHUTDOWN,
+	__SMC_DIAG_MAX,
+};
+
+#define SMC_DIAG_MAX (__SMC_DIAG_MAX - 1)
+
+/* SMC_DIAG_CONNINFO */
+
+struct smc_diag_cursor {
+	__u16	reserved;
+	__u16	wrap;
+	__u32	count;
+};
+
+struct smc_diag_conninfo {
+	__u32			token;		/* unique connection id */
+	__u32			sndbuf_size;	/* size of send buffer */
+	__u32			rmbe_size;	/* size of RMB element */
+	__u32			peer_rmbe_size;	/* size of peer RMB element */
+	/* local RMB element cursors */
+	struct smc_diag_cursor	rx_prod;	/* received producer cursor */
+	struct smc_diag_cursor	rx_cons;	/* received consumer cursor */
+	/* peer RMB element cursors */
+	struct smc_diag_cursor	tx_prod;	/* sent producer cursor */
+	struct smc_diag_cursor	tx_cons;	/* sent consumer cursor */
+	__u8			rx_prod_flags;	/* received producer flags */
+	__u8			rx_conn_state_flags; /* recvd connection flags*/
+	__u8			tx_prod_flags;	/* sent producer flags */
+	__u8			tx_conn_state_flags; /* sent connection flags*/
+	/* send buffer cursors */
+	struct smc_diag_cursor	tx_prep;	/* prepared to be sent cursor */
+	struct smc_diag_cursor	tx_sent;	/* sent cursor */
+	struct smc_diag_cursor	tx_fin;		/* confirmed sent cursor */
+};
+
+/* SMC_DIAG_LGRINFO */
+
+struct smc_diag_linkinfo {
+	__u8 link_id;			/* link identifier */
+	__u8 ibname[IB_DEVICE_NAME_MAX]; /* name of the RDMA device */
+	__u8 ibport;			/* RDMA device port number */
+	__u8 gid[40];			/* local GID */
+	__u8 peer_gid[40];		/* peer GID */
+};
+
+struct smc_diag_lgrinfo {
+	struct smc_diag_linkinfo	lnk[1];
+	__u8				role;
+};
+#endif /* _UAPI_SMC_DIAG_H_ */
diff --git a/net/smc/Kconfig b/net/smc/Kconfig
index bc02980..c717ef0 100644
--- a/net/smc/Kconfig
+++ b/net/smc/Kconfig
@@ -9,3 +9,12 @@ config SMC
 	  a separate socket family SMC.
 
 	  Select this option if you want to run SMC socket applications
+
+config SMC_DIAG
+	tristate "SMC: socket monitoring interface"
+	depends on SMC
+	---help---
+	  Support for SMC socket monitoring interface used by tools such as
+	  smcss.
+
+	  If unsure, say Y.
diff --git a/net/smc/Makefile b/net/smc/Makefile
index 5cf0caf..1881046 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_SMC)	+= smc.o
+obj-$(CONFIG_SMC_DIAG)	+= smc_diag.o
 smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
 smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 3f543d5..5d4208a 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -29,6 +29,7 @@
 #include <linux/in.h>
 #include <net/sock.h>
 #include <net/tcp.h>
+#include <net/smc.h>
 
 #include "smc.h"
 #include "smc_clc.h"
@@ -59,13 +60,48 @@ static void smc_set_keepalive(struct sock *sk, int val)
 	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
 }
 
-static struct proto smc_proto = {
+static struct smc_hashinfo smc_v4_hashinfo = {
+	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
+};
+
+int smc_hash_sk(struct sock *sk)
+{
+	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
+	struct hlist_head *head;
+
+	head = &h->ht;
+
+	write_lock_bh(&h->lock);
+	sk_add_node(sk, head);
+	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+	write_unlock_bh(&h->lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(smc_hash_sk);
+
+void smc_unhash_sk(struct sock *sk)
+{
+	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
+
+	write_lock_bh(&h->lock);
+	if (sk_del_node_init(sk))
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+	write_unlock_bh(&h->lock);
+}
+EXPORT_SYMBOL_GPL(smc_unhash_sk);
+
+struct proto smc_proto = {
 	.name		= "SMC",
 	.owner		= THIS_MODULE,
 	.keepalive	= smc_set_keepalive,
+	.hash		= smc_hash_sk,
+	.unhash		= smc_unhash_sk,
 	.obj_size	= sizeof(struct smc_sock),
+	.h.smc_hash	= &smc_v4_hashinfo,
 	.slab_flags	= SLAB_DESTROY_BY_RCU,
 };
+EXPORT_SYMBOL_GPL(smc_proto);
 
 static int smc_release(struct socket *sock)
 {
@@ -109,6 +145,7 @@ static int smc_release(struct socket *sock)
 		schedule_delayed_work(&smc->sock_put_work,
 				      SMC_CLOSE_SOCK_PUT_DELAY);
 	}
+	sk->sk_prot->unhash(sk);
 	release_sock(sk);
 
 	sock_put(sk);
@@ -144,6 +181,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
 	INIT_LIST_HEAD(&smc->accept_q);
 	spin_lock_init(&smc->accept_q_lock);
 	INIT_DELAYED_WORK(&smc->sock_put_work, smc_close_sock_put_work);
+	sk->sk_prot->hash(sk);
 	sk_refcnt_debug_inc(sk);
 
 	return sk;
@@ -536,6 +574,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
 		lsmc->sk.sk_err = -rc;
 		new_sk->sk_state = SMC_CLOSED;
 		sock_set_flag(new_sk, SOCK_DEAD);
+		sk->sk_prot->unhash(new_sk);
 		sock_put(new_sk);
 		*new_smc = NULL;
 		goto out;
@@ -545,6 +584,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
 			sock_release(new_clcsock);
 		new_sk->sk_state = SMC_CLOSED;
 		sock_set_flag(new_sk, SOCK_DEAD);
+		sk->sk_prot->unhash(new_sk);
 		sock_put(new_sk);
 		*new_smc = NULL;
 		goto out;
@@ -1320,6 +1360,7 @@ static int __init smc_init(void)
 		pr_err("%s: sock_register fails with %d\n", __func__, rc);
 		goto out_proto;
 	}
+	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
 
 	rc = smc_ib_register_client();
 	if (rc) {
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 959a5d2..ee5fbea 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -21,6 +21,8 @@
 
 #define SMC_MAX_PORTS		2	/* Max # of ports */
 
+extern struct proto smc_proto;
+
 #ifdef ATOMIC64_INIT
 #define KERNEL_HAS_ATOMIC64
 #endif
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index d70c05b..03dfcc6 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -384,6 +384,7 @@ void smc_close_sock_put_work(struct work_struct *work)
 					    struct smc_sock,
 					    sock_put_work);
 
+	smc->sk.sk_prot->unhash(&smc->sk);
 	sock_put(&smc->sk);
 }
 
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
new file mode 100644
index 0000000..d2d01cf
--- /dev/null
+++ b/net/smc/smc_diag.c
@@ -0,0 +1,215 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Monitoring SMC transport protocol sockets
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/sock_diag.h>
+#include <linux/inet_diag.h>
+#include <linux/smc_diag.h>
+#include <net/netlink.h>
+#include <net/smc.h>
+
+#include "smc.h"
+#include "smc_core.h"
+
+static void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw)
+{
+	sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x",
+		be16_to_cpu(((__be16 *)gid_raw)[0]),
+		be16_to_cpu(((__be16 *)gid_raw)[1]),
+		be16_to_cpu(((__be16 *)gid_raw)[2]),
+		be16_to_cpu(((__be16 *)gid_raw)[3]),
+		be16_to_cpu(((__be16 *)gid_raw)[4]),
+		be16_to_cpu(((__be16 *)gid_raw)[5]),
+		be16_to_cpu(((__be16 *)gid_raw)[6]),
+		be16_to_cpu(((__be16 *)gid_raw)[7]));
+}
+
+static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk)
+{
+	struct smc_sock *smc = smc_sk(sk);
+
+	r->diag_family = sk->sk_family;
+	if (!smc->clcsock)
+		return;
+	r->id.idiag_sport = htons(smc->clcsock->sk->sk_num);
+	r->id.idiag_dport = smc->clcsock->sk->sk_dport;
+	r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if;
+	sock_diag_save_cookie(sk, r->id.idiag_cookie);
+	memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+	memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+	r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr;
+	r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr;
+}
+
+static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
+				   struct smc_diag_msg *r,
+				   struct user_namespace *user_ns)
+{
+	if (nla_put_u8(skb, SMC_DIAG_SHUTDOWN, sk->sk_shutdown))
+		return 1;
+
+	r->diag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
+	r->diag_inode = sock_i_ino(sk);
+	return 0;
+}
+
+static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
+			   struct netlink_callback *cb,
+			   const struct smc_diag_req *req,
+			   struct nlattr *bc)
+{
+	struct smc_sock *smc = smc_sk(sk);
+	struct user_namespace *user_ns;
+	struct smc_diag_msg *r;
+	struct nlmsghdr *nlh;
+
+	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+			cb->nlh->nlmsg_type, sizeof(*r), NLM_F_MULTI);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	r = nlmsg_data(nlh);
+	smc_diag_msg_common_fill(r, sk);
+	r->diag_state = sk->sk_state;
+	r->diag_fallback = smc->use_fallback;
+	user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk);
+	if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns))
+		goto errout;
+
+	if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) && smc->conn.lgr) {
+		struct smc_connection *conn = &smc->conn;
+		struct smc_diag_conninfo cinfo = {
+			.token = conn->alert_token_local,
+			.sndbuf_size = conn->sndbuf_size,
+			.rmbe_size = conn->rmbe_size,
+			.peer_rmbe_size = conn->peer_rmbe_size,
+
+			.rx_prod.wrap = conn->local_rx_ctrl.prod.wrap,
+			.rx_prod.count = conn->local_rx_ctrl.prod.count,
+			.rx_cons.wrap = conn->local_rx_ctrl.cons.wrap,
+			.rx_cons.count = conn->local_rx_ctrl.cons.count,
+
+			.tx_prod.wrap = conn->local_tx_ctrl.prod.wrap,
+			.tx_prod.count = conn->local_tx_ctrl.prod.count,
+			.tx_cons.wrap = conn->local_tx_ctrl.cons.wrap,
+			.tx_cons.count = conn->local_tx_ctrl.cons.count,
+
+			.tx_prod_flags =
+				*(u8 *)&conn->local_tx_ctrl.prod_flags,
+			.tx_conn_state_flags =
+				*(u8 *)&conn->local_tx_ctrl.conn_state_flags,
+			.rx_prod_flags = *(u8 *)&conn->local_rx_ctrl.prod_flags,
+			.rx_conn_state_flags =
+				*(u8 *)&conn->local_rx_ctrl.conn_state_flags,
+
+			.tx_prep.wrap = conn->tx_curs_prep.wrap,
+			.tx_prep.count = conn->tx_curs_prep.count,
+			.tx_sent.wrap = conn->tx_curs_sent.wrap,
+			.tx_sent.count = conn->tx_curs_sent.count,
+			.tx_fin.wrap = conn->tx_curs_fin.wrap,
+			.tx_fin.count = conn->tx_curs_fin.count,
+		};
+
+		if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0)
+			goto errout;
+	}
+
+	if ((req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && smc->conn.lgr) {
+		struct smc_diag_lgrinfo linfo = {
+			.role = smc->conn.lgr->role,
+			.lnk[0].ibport = smc->conn.lgr->lnk[0].ibport,
+			.lnk[0].link_id = smc->conn.lgr->lnk[0].link_id,
+		};
+
+		memcpy(linfo.lnk[0].ibname,
+		       smc->conn.lgr->lnk[0].smcibdev->ibdev->name,
+		       sizeof(smc->conn.lgr->lnk[0].smcibdev->ibdev->name));
+		smc_gid_be16_convert(linfo.lnk[0].gid,
+				     smc->conn.lgr->lnk[0].gid.raw);
+		smc_gid_be16_convert(linfo.lnk[0].peer_gid,
+				     smc->conn.lgr->lnk[0].peer_gid);
+
+		if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0)
+			goto errout;
+	}
+
+	nlmsg_end(skb, nlh);
+	return 0;
+
+errout:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *bc = NULL;
+	struct hlist_head *head;
+	struct sock *sk;
+	int rc = 0;
+
+	read_lock(&smc_proto.h.smc_hash->lock);
+	head = &smc_proto.h.smc_hash->ht;
+	if (hlist_empty(head))
+		goto out;
+
+	sk_for_each(sk, head) {
+		if (!net_eq(sock_net(sk), net))
+			continue;
+		rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc);
+		if (rc)
+			break;
+	}
+
+out:
+	read_unlock(&smc_proto.h.smc_hash->lock);
+	return rc;
+}
+
+static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
+{
+	struct net *net = sock_net(skb->sk);
+
+	if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY &&
+	    h->nlmsg_flags & NLM_F_DUMP) {
+		{
+			struct netlink_dump_control c = {
+				.dump = smc_diag_dump,
+				.min_dump_alloc = SKB_WITH_OVERHEAD(32768),
+			};
+			return netlink_dump_start(net->diag_nlsk, skb, h, &c);
+		}
+	}
+	return 0;
+}
+
+static const struct sock_diag_handler smc_diag_handler = {
+	.family = AF_SMC,
+	.dump = smc_diag_handler_dump,
+};
+
+static int __init smc_diag_init(void)
+{
+	return sock_diag_register(&smc_diag_handler);
+}
+
+static void __exit smc_diag_exit(void)
+{
+	sock_diag_unregister(&smc_diag_handler);
+}
+
+module_init(smc_diag_init);
+module_exit(smc_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */);
-- 
2.8.4

^ permalink raw reply related

* Re: [PATCH net-next v2] net: dsa: make "label" property optional for dsa2
From: Andrew Lunn @ 2017-01-09 16:00 UTC (permalink / raw)
  To: Vivien Didelot
  Cc: Jiri Pirko, netdev, linux-kernel, kernel, David S. Miller,
	Florian Fainelli, Uwe Kleine-König, Andrey Smirnov
In-Reply-To: <87y3yk2q5e.fsf@weeman.i-did-not-set--mail-host-address--so-tickle-me>

> > No. That should be unique within one switch. In mlxsw we name it "p1",
> > "p2", ...
> >
> > The final netdev names are:
> > enp3s0np1, enp3s0np2, ...
> 

mlxsw are pci devices, so it follows this convention, i think:

 *   [P<domain>]p<bus>s<slot>[f<function>][n<phys_port_name>|d<dev_port>]
 *                                          PCI geographical location

Our devices are not on PCI. So they won't follow this. I've no idea
what they actually follow, since some are MDIO devices, some are SPI
devices, some are memory mapped.

I'm not against making the label optional, but I do want to better
understand what we get as a result, just to make sure it is sensible.

Vivien, could you try a recent udev and see what happens?

	Thanks
		Andrew

^ permalink raw reply

* Re: [for-next V2 06/10] net/mlx5: Add interface to get reference to a UAR
From: Saeed Mahameed @ 2017-01-09 16:05 UTC (permalink / raw)
  To: David Miller
  Cc: Yuval Shaia, Saeed Mahameed, Doug Ledford, Linux Netdev List,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, Leon Romanovsky, Eli Cohen,
	Matan Barak, Leon Romanovsky
In-Reply-To: <20170109.104722.2246652236667186272.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>

On Mon, Jan 9, 2017 at 5:47 PM, David Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org> wrote:
> From: Saeed Mahameed <saeedm-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
> Date: Mon, 9 Jan 2017 10:31:36 +0200
>
>> We will submit an incremental patch for this, as checkpatch doesn't
>> complain about such minor things.
>
> Please fix this and resubmit the series.
>

Sure, will do this.

> Checkpatch not complaining is not an argument for fixing up coding
> style issues reported to you in feedback.

I just thought that this pull request is sitting in the mailing list
for too long and we need to move on.
Such minor issues can be fixed in incremental patches that don't
block net-next and rdma submissions.

Anyway will fix and submit v3.

thanks,
Saeed.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH net-next v2] net: dsa: make "label" property optional for dsa2
From: Jiri Pirko @ 2017-01-09 16:06 UTC (permalink / raw)
  To: Vivien Didelot
  Cc: netdev, linux-kernel, kernel, David S. Miller, Florian Fainelli,
	Andrew Lunn, Uwe Kleine-König, Andrey Smirnov
In-Reply-To: <87y3yk2q5e.fsf@weeman.i-did-not-set--mail-host-address--so-tickle-me>

Mon, Jan 09, 2017 at 04:45:33PM CET, vivien.didelot@savoirfairelinux.com wrote:
>Hi Jiri,
>
>Jiri Pirko <jiri@resnulli.us> writes:
>
>>>Extra question: shouldn't phys_port_{id,name} be switchdev attributes in
>>
>> Again, phys_port_id has nothing to do with switches. Should be removed
>> from dsa because its use there is incorrect.
>
>Florian, since 3a543ef just got in, can it be reverted?

Yes, please revert it. It is only in net-next.


>
>>>> I guess that it should be enough for you to implement
>>>> ndo_get_phys_port_name.
>>>
>>>Well, if this name must be unique on a system, it's not likely to happen
>>>until we agree that we use an ugly tXsYpZ template where X is a tree ID,
>>>or we assign system-wide unique IDs to switches, which requires a bit of
>>>changes.
>>
>> No. That should be unique within one switch. In mlxsw we name it "p1",
>> "p2", ...
>>
>> The final netdev names are:
>> enp3s0np1, enp3s0np2, ...
>
>OK perfect then, "p%d" sounds good. You seems to avoid "p0" in mlxsw, is
>there a reason for that?

We name these according to the front panel name. There's no "port 0"
on the front panel :)


>
>>>But again, this is not related to this patch ;-)
>>
>> It is! You are using phys_port_id, which is completely wrong. You should
>> not use it.
>
>I can resend this patch without the udev examples in the commit message
>if that can be less confusing.

Yes please.


>
>Thanks,
>
>        Vivien

^ permalink raw reply

* Re: [for-next V2 06/10] net/mlx5: Add interface to get reference to a UAR
From: David Miller @ 2017-01-09 16:06 UTC (permalink / raw)
  To: saeedm
  Cc: yuval.shaia, saeedm, dledford, netdev, linux-rdma, leonro, eli,
	matanb, leon
In-Reply-To: <CALzJLG92QPFG8BmQU3DAvqZ7zrqurkBEDPy4O-jZSMeH0jNPFQ@mail.gmail.com>

From: Saeed Mahameed <saeedm@dev.mellanox.co.il>
Date: Mon, 9 Jan 2017 18:05:16 +0200

> I just thought that this pull request is sitting in the mailing list
> for too long and we need to move on.

Nobody reviewed it for any element of its substance, and we had an
intervening weekend.

It did not sit for too long by any measure whatsoever.

^ permalink raw reply

* Re: [PATCH net-next v2] net: dsa: make "label" property optional for dsa2
From: Jiri Pirko @ 2017-01-09 16:07 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: Vivien Didelot, netdev, linux-kernel, kernel, David S. Miller,
	Florian Fainelli, Uwe Kleine-König, Andrey Smirnov
In-Reply-To: <20170109160019.GF25588@lunn.ch>

Mon, Jan 09, 2017 at 05:00:19PM CET, andrew@lunn.ch wrote:
>> > No. That should be unique within one switch. In mlxsw we name it "p1",
>> > "p2", ...
>> >
>> > The final netdev names are:
>> > enp3s0np1, enp3s0np2, ...
>> 
>
>mlxsw are pci devices, so it follows this convention, i think:
>
> *   [P<domain>]p<bus>s<slot>[f<function>][n<phys_port_name>|d<dev_port>]
> *                                          PCI geographical location
>
>Our devices are not on PCI. So they won't follow this. I've no idea
>what they actually follow, since some are MDIO devices, some are SPI
>devices, some are memory mapped.

Got it. We just have to make sure udev names them appropriately. 


>
>I'm not against making the label option, but i do want to better
>understand what we get as a result, just to make sure it is sensible.
>
>Vivien, could you try a recent udev and see what happens?
>
>	Thanks
>		Andrew

^ permalink raw reply

* Re: [PATCH nf-next 0/7] xtables: use dedicated copy_to_user helpers
From: Pablo Neira Ayuso @ 2017-01-09 16:30 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: netfilter-devel, netdev, davem, fw, dborkman, Willem de Bruijn
In-Reply-To: <1483395586-105774-1-git-send-email-willemdebruijn.kernel@gmail.com>

On Mon, Jan 02, 2017 at 05:19:39PM -0500, Willem de Bruijn wrote:
> From: Willem de Bruijn <willemb@google.com>
> 
> xtables list and save interfaces share xt_match and xt_target state
> with userspace. The kernel and userspace definitions of these structs
> differ. Currently, the structs are copied wholesale, then patched up.
> The match and target structs contain a kernel pointer. Type-specific
> data may contain additional kernel-only state.
> 
> Introduce xt_match_to_user and xt_target_to_user helper functions to
> copy only fields intended to be shared with userspace.
> 
> Introduce xt_data_to_user to do the same for type-specific state. Add
> a field .usersize to xt_match and xt_target to define the range of
> bytes in .matchsize that should be shared with userspace. All matches
> and targets that define kernel-only data store this at the tail of
> their struct.

Series applied, thanks a lot Willem!

^ permalink raw reply

* Re: [PATCH net-next] net: dsa: select NET_SWITCHDEV
From: Vivien Didelot @ 2017-01-09 16:32 UTC (permalink / raw)
  To: Randy Dunlap, Florian Fainelli, netdev
  Cc: linux-kernel, kernel, David S. Miller, Andrew Lunn, Jiri Pirko
In-Reply-To: <336a46c0-705a-9cf0-1a0d-447af28893bb@infradead.org>

Hi Randy,

Randy Dunlap <rdunlap@infradead.org> writes:

> On 01/08/17 17:18, Florian Fainelli wrote:
>> On 01/08/2017 03:17 PM, Vivien Didelot wrote:
>>> DSA wraps SWITCHDEV, thus select it instead of depending on it.
>>>
>>> Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
>> 
>> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
>> 
>
> but when CONFIG_INET is not enabled, the patch causes this warning:
>
> warning: (NET_DSA) selects NET_SWITCHDEV which has unmet direct dependencies (NET && INET)

Thanks for spotting that! Would that be enough to change this first?

    diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
    index 675acbf1502d..c7263b70e72b 100644
    --- a/net/dsa/Kconfig
    +++ b/net/dsa/Kconfig
    @@ -1,6 +1,6 @@
    config HAVE_NET_DSA
            def_bool y
    -       depends on NETDEVICES && !S390
    +       depends on INET && NETDEVICES && !S390

    # Drivers must select NET_DSA and the appropriate tagging format

Thanks,

        Vivien

^ permalink raw reply

* Re: [Open-FCoE] [PATCH RFC 2/5] qedf: Add QLogic FastLinQ offload FCoE driver framework.
From: Chad Dupuis @ 2017-01-09 16:45 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: martin.petersen, fcoe-devel, netdev, QLogic-Storage-Upstream,
	linux-scsi, yuval.mintz
In-Reply-To: <7df11d77-df7b-af58-2646-e92cdbba2146@suse.de>


On Wed, 28 Dec 2016, 9:00am -0000, Hannes Reinecke wrote:

> On 12/23/2016 08:17 PM, Dupuis, Chad wrote:
> > From: "Dupuis, Chad" <chad.dupuis@cavium.com>
> > 
> > The QLogic FastLinQ Driver for FCoE (qedf) is the FCoE specific module
> > for 41000 Series Converged Network Adapters by QLogic.
> > 
> > This patch consists of following changes:
> >   - MAINTAINERS Makefile and Kconfig changes for qedf
> >   - PCI driver registration
> >   - libfc/fcoe host level initialization
> >   - SCSI host template initialization and callbacks
> >   - Debugfs and log level infrastructure
> >   - Link handling
> >   - Firmware interface structures
> >   - QED core module initialization
> >   - Light L2 interface callbacks
> > 
> > Signed-off-by: Nilesh Javali <nilesh.javali@cavium.com>
> > Signed-off-by: Manish Rangankar <manish.rangankar@cavium.com>
> > Signed-off-by: Saurav Kashyap <saurav.kashyap@cavium.com>
> > Signed-off-by: Chad Dupuis <chad.dupuis@cavium.com>
> > ---
> >  MAINTAINERS                      |    6 +
> >  drivers/scsi/Kconfig             |    1 +
> >  drivers/scsi/qedf/Kconfig        |   11 +
> >  drivers/scsi/qedf/Makefile       |    5 +
> >  drivers/scsi/qedf/qedf.h         |  555 ++++++
> >  drivers/scsi/qedf/qedf_attr.c    |  165 ++
> >  drivers/scsi/qedf/qedf_dbg.c     |  192 +++
> >  drivers/scsi/qedf/qedf_dbg.h     |  153 ++
> >  drivers/scsi/qedf/qedf_debugfs.c |  472 +++++
> >  drivers/scsi/qedf/qedf_main.c    | 3519 ++++++++++++++++++++++++++++++++++++++
> >  drivers/scsi/qedf/qedf_version.h |   15 +
> >  11 files changed, 5094 insertions(+)
> >  create mode 100644 drivers/scsi/qedf/Kconfig
> >  create mode 100644 drivers/scsi/qedf/Makefile
> >  create mode 100644 drivers/scsi/qedf/qedf.h
> >  create mode 100644 drivers/scsi/qedf/qedf_attr.c
> >  create mode 100644 drivers/scsi/qedf/qedf_dbg.c
> >  create mode 100644 drivers/scsi/qedf/qedf_dbg.h
> >  create mode 100644 drivers/scsi/qedf/qedf_debugfs.c
> >  create mode 100644 drivers/scsi/qedf/qedf_main.c
> >  create mode 100644 drivers/scsi/qedf/qedf_version.h
> > 
> [ .. ]
> > +/* Returns true if we have a valid vlan, false otherwise */
> > +static bool qedf_initiate_fipvlan_req(struct qedf_ctx *qedf)
> > +{
> > +	int rc;
> > +
> > +	if (atomic_read(&qedf->link_state) != QEDF_LINK_UP) {
> > +		QEDF_ERR(&(qedf->dbg_ctx), "Link not up.\n");
> > +		return  false;
> > +	}
> > +
> > +	while (qedf->fipvlan_retries--) {
> > +		if (qedf->vlan_id > 0)
> > +			return true;
> Some weird FCoE bridges (most notably HP VirtualConnect) return a VLAN
> ID of '0'. Shouldn't you rather test for '>= 0' here?

Will look into this, but isn't a VLAN ID of 0 invalid?

> 
> [ .. ]
> > +
> > +static void qedf_flogi_resp(struct fc_seq *seq, struct fc_frame *fp,
> > +	void *arg)
> > +{
> > +	struct fc_exch *exch = fc_seq_exch(seq);
> > +	struct fc_lport *lport = exch->lp;
> > +	struct qedf_ctx *qedf = lport_priv(lport);
> > +
> > +	if (!qedf) {
> > +		QEDF_ERR(NULL, "qedf is NULL.\n");
> > +		return;
> > +	}
> > +
> > +	/*
> > +	 * If ERR_PTR is set then don't try to stat anything as it will cause
> > +	 * a crash when we access fp.
> > +	 */
> > +	if (fp == ERR_PTR(-FC_EX_TIMEOUT) ||
> > +	    fp == ERR_PTR(-FC_EX_CLOSED)) {
> > +		QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS,
> > +		    "fp has ERR_PTR() set.\n");
> > +		goto skip_stat;
> > +	}
> 
> Please use
> 
> if (IS_ERR(fp)) {
> 
> here instead of checking for individual error codes; if 'fp' has a
> different error value you'll continue with an invalid fp from here on.
>

Will fix up.
 
> [ .. ]
> 
> > +/**
> > + * qedf_xmit - qedf FCoE frame transmit function
> > + *
> > + */
> > +static int qedf_xmit(struct fc_lport *lport, struct fc_frame *fp)
> > +{
> > +	struct fc_lport		*base_lport;
> > +	struct qedf_ctx		*qedf;
> > +	struct ethhdr		*eh;
> > +	struct fcoe_crc_eof	*cp;
> > +	struct sk_buff		*skb;
> > +	struct fc_frame_header	*fh;
> > +	struct fcoe_hdr		*hp;
> > +	u8			sof, eof;
> > +	u32			crc;
> > +	unsigned int		hlen, tlen, elen;
> > +	int			wlen;
> > +	struct fc_stats		*stats;
> > +	struct fc_lport *tmp_lport;
> > +	struct fc_lport *vn_port = NULL;
> > +	struct qedf_rport *fcport;
> > +	int rc;
> > +	u16 vlan_tci = 0;
> > +	unsigned long flags;
> > +
> > +	qedf = (struct qedf_ctx *)lport_priv(lport);
> > +
> > +	fh = fc_frame_header_get(fp);
> > +	skb = fp_skb(fp);
> > +
> > +	/* Filter out traffic to other NPIV ports on the same host */
> > +	if (lport->vport)
> > +		base_lport = shost_priv(vport_to_shost(lport->vport));
> > +	else
> > +		base_lport = lport;
> > +
> > +	/* Flag if the destination is the base port */
> > +	if (base_lport->port_id == ntoh24(fh->fh_d_id)) {
> > +		vn_port = base_lport;
> > +	} else {
> > +		/* Got through the list of vports attached to the base_lport
> > +		 * and see if we have a match with the destination address.
> > +		 */
> > +		list_for_each_entry(tmp_lport, &base_lport->vports, list) {
> > +			if (tmp_lport->port_id == ntoh24(fh->fh_d_id)) {
> > +				vn_port = tmp_lport;
> > +				break;
> > +			}
> > +		}
> > +	}
> > +	if (vn_port && ntoh24(fh->fh_d_id) != FC_FID_FLOGI) {
> > +		struct fc_rport_priv *rdata = NULL;
> > +
> > +		QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_LL2,
> > +		    "Dropping FCoE frame to %06x.\n", ntoh24(fh->fh_d_id));
> > +		kfree_skb(skb);
> > +		rdata = fc_rport_lookup(lport, ntoh24(fh->fh_d_id));
> > +		if (rdata)
> > +			rdata->retries = lport->max_rport_retry_count;
> > +		return -EINVAL;
> > +	}
> > +	/* End NPIV filtering */
> > +
> > +	if (!qedf->ctlr.sel_fcf) {
> > +		kfree_skb(skb);
> > +		return 0;
> > +	}
> > +
> > +	if (!test_bit(QEDF_LL2_STARTED, &qedf->flags)) {
> > +		QEDF_WARN(&(qedf->dbg_ctx), "LL2 not started\n");
> > +		kfree_skb(skb);
> > +		return 0;
> > +	}
> > +
> > +	if (atomic_read(&qedf->link_state) != QEDF_LINK_UP) {
> > +		QEDF_WARN(&(qedf->dbg_ctx), "qedf link down\n");
> > +		kfree_skb(skb);
> > +		return 0;
> > +	}
> > +
> > +	if (unlikely(fh->fh_r_ctl == FC_RCTL_ELS_REQ)) {
> > +		if (fcoe_ctlr_els_send(&qedf->ctlr, lport, skb))
> > +			return 0;
> > +	}
> > +
> > +	/* Check to see if this needs to be sent on an offloaded session */
> > +	spin_lock_irqsave(&qedf->hba_lock, flags);
> > +	fcport = qedf_fcport_lookup(qedf, ntoh24(fh->fh_d_id));
> > +	spin_unlock_irqrestore(&qedf->hba_lock, flags);
> > +
> Really sad, having to take a spinlock here to get to the session.
> Can't you use RCU for rport lookup?
> That would save you the spinlock here ...

Yes, will look into converting this to use RCU

> 
> > +	if (fcport && test_bit(QEDF_RPORT_SESSION_READY, &fcport->flags)) {
> > +		rc = qedf_xmit_l2_frame(fcport, fp);
> > +		/*
> > +		 * If the frame was successfully sent over the middle path
> > +		 * then do not try to also send it over the LL2 path
> > +		 */
> > +		if (rc)
> > +			return 0;
> > +	}
> > +
> > +	sof = fr_sof(fp);
> > +	eof = fr_eof(fp);
> > +
> > +	elen = sizeof(struct ethhdr);
> > +	hlen = sizeof(struct fcoe_hdr);
> > +	tlen = sizeof(struct fcoe_crc_eof);
> > +	wlen = (skb->len - tlen + sizeof(crc)) / FCOE_WORD_TO_BYTE;
> > +
> > +	skb->ip_summed = CHECKSUM_NONE;
> > +	crc = fcoe_fc_crc(fp);
> > +
> > +	/* copy port crc and eof to the skb buff */
> > +	if (skb_is_nonlinear(skb)) {
> > +		skb_frag_t *frag;
> > +
> > +		if (qedf_get_paged_crc_eof(skb, tlen)) {
> > +			kfree_skb(skb);
> > +			return -ENOMEM;
> > +		}
> > +		frag = &skb_shinfo(skb)->frags[skb_shinfo(skb)->nr_frags - 1];
> > +		cp = kmap_atomic(skb_frag_page(frag)) + frag->page_offset;
> > +	} else {
> > +		cp = (struct fcoe_crc_eof *)skb_put(skb, tlen);
> > +	}
> > +
> > +	memset(cp, 0, sizeof(*cp));
> > +	cp->fcoe_eof = eof;
> > +	cp->fcoe_crc32 = cpu_to_le32(~crc);
> > +	if (skb_is_nonlinear(skb)) {
> > +		kunmap_atomic(cp);
> > +		cp = NULL;
> > +	}
> > +
> > +
> > +	/* adjust skb network/transport offsets to match mac/fcoe/port */
> > +	skb_push(skb, elen + hlen);
> > +	skb_reset_mac_header(skb);
> > +	skb_reset_network_header(skb);
> > +	skb->mac_len = elen;
> > +	skb->protocol = htons(ETH_P_FCOE);
> > +
> > +	__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), qedf->vlan_id);
> > +
> > +	/* fill up mac and fcoe headers */
> > +	eh = eth_hdr(skb);
> > +	eh->h_proto = htons(ETH_P_FCOE);
> > +	if (qedf->ctlr.map_dest)
> > +		fc_fcoe_set_mac(eh->h_dest, fh->fh_d_id);
> > +	else
> > +		/* insert GW address */
> > +		ether_addr_copy(eh->h_dest, qedf->ctlr.dest_addr);
> > +
> > +	/* Set the source MAC address */
> > +	fc_fcoe_set_mac(eh->h_source, fh->fh_s_id);
> > +
> > +	hp = (struct fcoe_hdr *)(eh + 1);
> > +	memset(hp, 0, sizeof(*hp));
> > +	if (FC_FCOE_VER)
> > +		FC_FCOE_ENCAPS_VER(hp, FC_FCOE_VER);
> > +	hp->fcoe_sof = sof;
> > +
> > +	/*update tx stats */
> > +	stats = per_cpu_ptr(lport->stats, get_cpu());
> > +	stats->TxFrames++;
> > +	stats->TxWords += wlen;
> > +	put_cpu();
> > +
> > +	/* Get VLAN ID from skb for printing purposes */
> > +	__vlan_hwaccel_get_tag(skb, &vlan_tci);
> > +
> > +	/* send down to lld */
> > +	fr_dev(fp) = lport;
> > +	QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_LL2, "FCoE frame send: "
> > +	    "src=%06x dest=%06x r_ctl=%x type=%x vlan=%04x.\n",
> > +	    ntoh24(fh->fh_s_id), ntoh24(fh->fh_d_id), fh->fh_r_ctl, fh->fh_type,
> > +	    vlan_tci);
> > +	if (qedf_dump_frames)
> > +		print_hex_dump(KERN_WARNING, "fcoe: ", DUMP_PREFIX_OFFSET, 16,
> > +		    1, skb->data, skb->len, false);
> > +	qed_ops->ll2->start_xmit(qedf->cdev, skb);
> > +
> > +	return 0;
> > +}
> > +
> > +static int qedf_alloc_sq(struct qedf_ctx *qedf, struct qedf_rport *fcport)
> > +{
> > +	int rval = 0;
> > +	u32 *pbl;
> > +	dma_addr_t page;
> > +	int num_pages;
> > +
> > +	/* Calculate appropriate queue and PBL sizes */
> > +	fcport->sq_mem_size = SQ_NUM_ENTRIES * sizeof(struct fcoe_wqe);
> > +	fcport->sq_mem_size = ALIGN(fcport->sq_mem_size, QEDF_PAGE_SIZE);
> > +	fcport->sq_pbl_size = (fcport->sq_mem_size / QEDF_PAGE_SIZE) *
> > +	    sizeof(void *);
> > +	fcport->sq_pbl_size = fcport->sq_pbl_size + QEDF_PAGE_SIZE;
> > +
> > +	fcport->sq = dma_alloc_coherent(&qedf->pdev->dev, fcport->sq_mem_size,
> > +	    &fcport->sq_dma, GFP_KERNEL);
> > +	if (!fcport->sq) {
> > +		QEDF_WARN(&(qedf->dbg_ctx), "Could not allocate send "
> > +			   "queue.\n");
> > +		rval = 1;
> > +		goto out;
> > +	}
> > +	memset(fcport->sq, 0, fcport->sq_mem_size);
> > +
> > +	fcport->sq_pbl = dma_alloc_coherent(&qedf->pdev->dev,
> > +	    fcport->sq_pbl_size, &fcport->sq_pbl_dma, GFP_KERNEL);
> > +	if (!fcport->sq_pbl) {
> > +		QEDF_WARN(&(qedf->dbg_ctx), "Could not allocate send "
> > +			   "queue PBL.\n");
> > +		rval = 1;
> > +		goto out_free_sq;
> > +	}
> > +	memset(fcport->sq_pbl, 0, fcport->sq_pbl_size);
> > +
> > +	/* Create PBL */
> > +	num_pages = fcport->sq_mem_size / QEDF_PAGE_SIZE;
> > +	page = fcport->sq_dma;
> > +	pbl = (u32 *)fcport->sq_pbl;
> > +
> > +	while (num_pages--) {
> > +		*pbl = U64_LO(page);
> > +		pbl++;
> > +		*pbl = U64_HI(page);
> > +		pbl++;
> > +		page += QEDF_PAGE_SIZE;
> > +	}
> > +
> > +	return rval;
> > +
> > +out_free_sq:
> > +	dma_free_coherent(&qedf->pdev->dev, fcport->sq_mem_size, fcport->sq,
> > +	    fcport->sq_dma);
> > +out:
> > +	return rval;
> > +}
> > +
> > +static void qedf_free_sq(struct qedf_ctx *qedf, struct qedf_rport *fcport)
> > +{
> > +	if (fcport->sq_pbl)
> > +		dma_free_coherent(&qedf->pdev->dev, fcport->sq_pbl_size,
> > +		    fcport->sq_pbl, fcport->sq_pbl_dma);
> > +	if (fcport->sq)
> > +		dma_free_coherent(&qedf->pdev->dev, fcport->sq_mem_size,
> > +		    fcport->sq, fcport->sq_dma);
> > +}
> > +
> > +/*
> > + * Allocate a cookie into the qedf_ctx rport list.  Assumes the hba lock
> > + * is held on entry.
> > + */
> > +static int qedf_alloc_conn_id(struct qedf_ctx *qedf, struct qedf_rport *fcport)
> > +{
> > +	int i;
> > +
> > +	for (i = 0; i < QEDF_MAX_SESSIONS; i++) {
> > +		qedf->curr_conn_id++;
> > +		if (qedf->curr_conn_id == QEDF_MAX_SESSIONS)
> > +			qedf->curr_conn_id = 0;
> > +		if (qedf->fcports[qedf->curr_conn_id] == NULL) {
> > +			qedf->fcports[qedf->curr_conn_id] = fcport;
> > +			fcport->conn_id = qedf->curr_conn_id;
> > +			break;
> > +		}
> > +	}
> > +	if (i == QEDF_MAX_SESSIONS)
> > +		return -1;
> > +	else
> > +		return 0;
> > +}
> > +
> Have you looked at the 'sbitmap' code for conn_id allocation?
> Should be giving you the same results, but you won't need to use a
> spinlock ...

I've not.  I'll look into sbitmap for V2.

> 
> 
> > +static int qedf_offload_connection(struct qedf_ctx *qedf,
> > +	struct qedf_rport *fcport)
> > +{
> > +	struct qed_fcoe_params_offload conn_info;
> > +	u32 port_id;
> > +	u8 lport_src_id[3];
> > +	int rval;
> > +	uint16_t total_sqe = (fcport->sq_mem_size / sizeof(struct fcoe_wqe));
> > +
> > +	QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_CONN, "Offloading connection "
> > +		   "portid=%06x.\n", fcport->rdata->ids.port_id);
> > +	rval = qed_ops->acquire_conn(qedf->cdev, &fcport->handle,
> > +	    &fcport->fw_cid, &fcport->p_doorbell);
> > +	if (rval) {
> > +		QEDF_WARN(&(qedf->dbg_ctx), "Could not acquire connection "
> > +			   "for portid=%06x.\n", fcport->rdata->ids.port_id);
> > +		rval = 1; /* For some reason qed returns 0 on failure here */
> > +		goto out;
> > +	}
> > +
> > +	QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_CONN, "portid=%06x "
> > +		   "fw_cid=%08x handle=%d.\n", fcport->rdata->ids.port_id,
> > +		   fcport->fw_cid, fcport->handle);
> > +
> > +	memset(&conn_info, 0, sizeof(struct qed_fcoe_params_offload));
> > +
> > +	/* Fill in the offload connection info */
> > +	conn_info.sq_pbl_addr = fcport->sq_pbl_dma;
> > +
> > +	conn_info.sq_curr_page_addr = (dma_addr_t)(*(u64 *)fcport->sq_pbl);
> > +	conn_info.sq_next_page_addr =
> > +	    (dma_addr_t)(*(u64 *)(fcport->sq_pbl + 8));
> > +
> > +	/* Need to use our FCoE MAC for the offload session */
> > +	port_id = fc_host_port_id(qedf->lport->host);
> > +	lport_src_id[2] = (port_id & 0x000000FF);
> > +	lport_src_id[1] = (port_id & 0x0000FF00) >> 8;
> > +	lport_src_id[0] = (port_id & 0x00FF0000) >> 16;
> > +	fc_fcoe_set_mac(conn_info.src_mac, lport_src_id);
> > +
> > +	ether_addr_copy(conn_info.dst_mac, qedf->ctlr.dest_addr);
> > +
> > +	conn_info.tx_max_fc_pay_len = fcport->rdata->maxframe_size;
> > +	conn_info.e_d_tov_timer_val = qedf->lport->e_d_tov / 20;
> > +	conn_info.rec_tov_timer_val = 3; /* I think this is what E3 was */
> > +	conn_info.rx_max_fc_pay_len = fcport->rdata->maxframe_size;
> > +
> > +	/* Set VLAN data */
> > +	conn_info.vlan_tag = qedf->vlan_id <<
> > +	    FCOE_CONN_OFFLOAD_RAMROD_DATA_VLAN_ID_SHIFT;
> > +	conn_info.vlan_tag |=
> > +	    qedf_default_prio << FCOE_CONN_OFFLOAD_RAMROD_DATA_PRIORITY_SHIFT;
> > +	conn_info.flags |= (FCOE_CONN_OFFLOAD_RAMROD_DATA_B_VLAN_FLAG_MASK <<
> > +	    FCOE_CONN_OFFLOAD_RAMROD_DATA_B_VLAN_FLAG_SHIFT);
> > +
> > +	/* Set host port source id */
> > +	port_id = fc_host_port_id(qedf->lport->host);
> > +	fcport->sid = port_id;
> > +	conn_info.s_id.addr_hi = (port_id & 0x000000FF);
> > +	conn_info.s_id.addr_mid = (port_id & 0x0000FF00) >> 8;
> > +	conn_info.s_id.addr_lo = (port_id & 0x00FF0000) >> 16;
> > +
> > +	conn_info.max_conc_seqs_c3 = fcport->rdata->max_seq;
> > +
> > +	/* Set remote port destination id */
> > +	port_id = fcport->rdata->rport->port_id;
> > +	conn_info.d_id.addr_hi = (port_id & 0x000000FF);
> > +	conn_info.d_id.addr_mid = (port_id & 0x0000FF00) >> 8;
> > +	conn_info.d_id.addr_lo = (port_id & 0x00FF0000) >> 16;
> > +
> > +	conn_info.def_q_idx = 0; /* Default index for send queue? */
> > +
> > +	/* Set FC-TAPE specific flags if needed */
> > +	if (fcport->dev_type == QEDF_RPORT_TYPE_TAPE) {
> > +		QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_CONN,
> > +		    "Enable CONF, REC for portid=%06x.\n",
> > +		    fcport->rdata->ids.port_id);
> > +		conn_info.flags |= 1 <<
> > +		    FCOE_CONN_OFFLOAD_RAMROD_DATA_B_CONF_REQ_SHIFT;
> > +		conn_info.flags |=
> > +		    ((fcport->rdata->sp_features & FC_SP_FT_SEQC) ? 1 : 0) <<
> > +		    FCOE_CONN_OFFLOAD_RAMROD_DATA_B_REC_VALID_SHIFT;
> > +	}
> > +
> > +	rval = qed_ops->offload_conn(qedf->cdev, fcport->handle, &conn_info);
> > +	if (rval) {
> > +		QEDF_WARN(&(qedf->dbg_ctx), "Could not offload connection "
> > +			   "for portid=%06x.\n", fcport->rdata->ids.port_id);
> > +		goto out_free_conn;
> > +	} else
> > +		QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_CONN, "Offload "
> > +			   "succeeded portid=%06x total_sqe=%d.\n",
> > +			   fcport->rdata->ids.port_id, total_sqe);
> > +
> > +	spin_lock_init(&fcport->rport_lock);
> > +	atomic_set(&fcport->free_sqes, total_sqe);
> > +	return 0;
> > +out_free_conn:
> > +	qed_ops->release_conn(qedf->cdev, fcport->handle);
> > +out:
> > +	return rval;
> > +}
> > +
> > +#define QEDF_TERM_BUFF_SIZE		10
> > +static void qedf_upload_connection(struct qedf_ctx *qedf,
> > +	struct qedf_rport *fcport)
> > +{
> > +	void *term_params;
> > +	dma_addr_t term_params_dma;
> > +
> > +	/* Term params needs to be a DMA coherent buffer as qed shared the
> > +	 * physical DMA address with the firmware. The buffer may be used in
> > +	 * the receive path so we may eventually have to move this.
> > +	 */
> > +	term_params = dma_alloc_coherent(&qedf->pdev->dev, QEDF_TERM_BUFF_SIZE,
> > +		&term_params_dma, GFP_KERNEL);
> > +
> > +	QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_CONN, "Uploading connection "
> > +		   "port_id=%06x.\n", fcport->rdata->ids.port_id);
> > +
> > +	qed_ops->destroy_conn(qedf->cdev, fcport->handle, term_params_dma);
> > +	qed_ops->release_conn(qedf->cdev, fcport->handle);
> > +
> > +	dma_free_coherent(&qedf->pdev->dev, QEDF_TERM_BUFF_SIZE, term_params,
> > +	    term_params_dma);
> > +}
> > +
> > +static void qedf_cleanup_fcport(struct qedf_ctx *qedf,
> > +	struct qedf_rport *fcport)
> > +{
> > +	QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_CONN, "Clearing conn_id=%u "
> > +		   "for portid=%06x.\n", fcport->conn_id,
> > +		   fcport->rdata->ids.port_id);
> > +
> > +	/* Flush any remaining i/o's before we upload the connection */
> > +	qedf_flush_active_ios(fcport, -1);
> > +
> > +	spin_lock(&qedf->hba_lock);
> > +	qedf->fcports[fcport->conn_id] = NULL;
> > +	fcport->conn_id = -1;
> > +	spin_unlock(&qedf->hba_lock);
> > +
> > +	if (test_and_clear_bit(QEDF_RPORT_SESSION_READY, &fcport->flags))
> > +		qedf_upload_connection(qedf, fcport);
> > +	qedf_free_sq(qedf, fcport);
> > +	fcport->rdata = NULL;
> > +	fcport->qedf = NULL;
> > +}
> > +
> > +/**
> > + * This event_callback is called after successful completion of libfc
> > + * initiated target login. qedf can proceed with initiating the session
> > + * establishment.
> > + */
> > +static void qedf_rport_event_handler(struct fc_lport *lport,
> > +				struct fc_rport_priv *rdata,
> > +				enum fc_rport_event event)
> > +{
> > +	struct qedf_ctx *qedf = lport_priv(lport);
> > +	struct fc_rport *rport = rdata->rport;
> > +	struct fc_rport_libfc_priv *rp;
> > +	struct qedf_rport *fcport;
> > +	u32 port_id;
> > +	int rval;
> > +
> > +	QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, "event = %d, "
> > +		   "port_id = 0x%x\n", event, rdata->ids.port_id);
> > +
> > +	switch (event) {
> > +	case RPORT_EV_READY:
> > +		if (!rport) {
> > +			QEDF_WARN(&(qedf->dbg_ctx), "rport is NULL.\n");
> > +			break;
> > +		}
> > +
> > +		rp = rport->dd_data;
> > +		fcport = (struct qedf_rport *)&rp[1];
> > +		fcport->qedf = qedf;
> > +
> > +		/*
> > +		 * Don't try to offload the session again. Can happen when we
> > +		 * get an ADISC
> > +		 */
> > +		if (test_bit(QEDF_RPORT_SESSION_READY, &fcport->flags)) {
> > +			QEDF_WARN(&(qedf->dbg_ctx), "Session already "
> > +				   "offloaded, portid=0x%x.\n",
> > +				   rdata->ids.port_id);
> > +			return;
> > +		}
> > +
> > +		/*
> > +		 * Set the connection id to -1 so we know if we ever assigned
> > +		 * one to the fcport.
> > +		 */
> > +		fcport->conn_id = -1;
> > +
> > +		if (rport->port_id == FC_FID_DIR_SERV) {
> > +			/*
> > +			 * qedf_rport structure doesn't exist for
> > +			 * directory server.
> > +			 * We should not come here, as lport will
> > +			 * take care of fabric login
> > +			 */
> > +			QEDF_WARN(&(qedf->dbg_ctx), "rport struct does not "
> > +			    "exist for dir server port_id=%x\n",
> > +			    rdata->ids.port_id);
> > +			break;
> > +		}
> > +
> > +		if (rdata->spp_type != FC_TYPE_FCP) {
> > +			QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC,
> > +			    "Not offlading since since spp type isn't FCP\n");
> > +			break;
> > +		}
> > +		if (!(rdata->ids.roles & FC_RPORT_ROLE_FCP_TARGET)) {
> > +			QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC,
> > +			    "Not FCP target so not offloading\n");
> > +			break;
> > +		}
> > +
> > +		spin_lock(&qedf->hba_lock);
> > +		rval = qedf_alloc_conn_id(qedf, fcport);
> > +		spin_unlock(&qedf->hba_lock);
> > +
> > +		if (rval) {
> > +			QEDF_WARN(&(qedf->dbg_ctx), "Could not allocate "
> > +				   "conn_id for port %06x.\n",
> > +				   rdata->ids.port_id);
> > +			break;
> > +		}
> > +
> > +		QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC,
> > +			   "Assigned conn_id=%u to port_id=%06x.\n",
> > +			    fcport->conn_id, rdata->ids.port_id);
> > +
> > +		fcport->rdata = rdata;
> > +		fcport->rport = rport;
> > +
> > +		rval = qedf_alloc_sq(qedf, fcport);
> > +		if (rval) {
> > +			qedf_cleanup_fcport(qedf, fcport);
> > +			break;
> > +		}
> > +
> > +		/* Set device type */
> > +		if (rdata->flags & FC_RP_FLAGS_RETRY &&
> > +		    rdata->ids.roles & FC_RPORT_ROLE_FCP_TARGET &&
> > +		    !(rdata->ids.roles & FC_RPORT_ROLE_FCP_INITIATOR)) {
> > +			fcport->dev_type = QEDF_RPORT_TYPE_TAPE;
> > +			QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC,
> > +			    "portid=%06x is a TAPE device.\n",
> > +			    rdata->ids.port_id);
> > +		} else {
> > +			fcport->dev_type = QEDF_RPORT_TYPE_DISK;
> > +		}
> > +
> > +		rval = qedf_offload_connection(qedf, fcport);
> > +		if (rval) {
> > +			qedf_cleanup_fcport(qedf, fcport);
> > +			break;
> > +		}
> > +
> > +		/*
> > +		 * Set the session ready bit to let everyone know that this
> > +		 * connection is ready for I/O
> > +		 */
> > +		set_bit(QEDF_RPORT_SESSION_READY, &fcport->flags);
> > +		atomic_inc(&qedf->num_offloads);
> > +
> > +		break;
> > +	case RPORT_EV_LOGO:
> > +	case RPORT_EV_FAILED:
> > +	case RPORT_EV_STOP:
> > +		port_id = rdata->ids.port_id;
> > +		if (port_id == FC_FID_DIR_SERV)
> > +			break;
> > +
> > +		if (!rport) {
> > +			QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC,
> > +			    "port_id=%x - rport notcreated Yet!!\n", port_id);
> > +			break;
> > +		}
> > +		rp = rport->dd_data;
> > +		/*
> > +		 * Perform session upload. Note that rdata->peers is already
> > +		 * removed from disc->rports list before we get this event.
> > +		 */
> > +		fcport = (struct qedf_rport *)&rp[1];
> > +
> > +		/*
> > +		 * Only free the conn_id if this fcport was initialized with
> > +		 * one.
> > +		 */
> > +		if (fcport->conn_id > -1) {
> > +			set_bit(QEDF_RPORT_UPLOADING_CONNECTION, &fcport->flags);
> > +			qedf_cleanup_fcport(qedf, fcport);
> > +			clear_bit(QEDF_RPORT_UPLOADING_CONNECTION,
> > +			    &fcport->flags);
> > +			atomic_dec(&qedf->num_offloads);
> > +		}
> > +
> > +		break;
> > +
> > +	case RPORT_EV_NONE:
> > +		break;
> > +	}
> > +}
> > +
> > +static void qedf_abort_io(struct fc_lport *lport)
> > +{
> > +	/* NO-OP but need to fill in the template */
> > +}
> > +
> > +static void qedf_fcp_cleanup(struct fc_lport *lport)
> > +{
> > +	/*
> > +	 * NO-OP but need to fill in template to prevent a NULL
> > +	 * function pointer dereference during link down. I/Os
> > +	 * will be flushed when port is uploaded.
> > +	 */
> > +}
> > +
> > +static struct libfc_function_template qedf_lport_template = {
> > +	.frame_send		= qedf_xmit,
> > +	.fcp_abort_io		= qedf_abort_io,
> > +	.fcp_cleanup		= qedf_fcp_cleanup,
> > +	.rport_event_callback	= qedf_rport_event_handler,
> > +	.elsct_send		= qedf_elsct_send,
> > +};
> > +
> > +static void qedf_fcoe_ctlr_setup(struct qedf_ctx *qedf)
> > +{
> > +	fcoe_ctlr_init(&qedf->ctlr, FIP_ST_AUTO);
> > +
> > +	qedf->ctlr.send = qedf_fip_send;
> > +	qedf->ctlr.update_mac = qedf_update_src_mac;
> > +	qedf->ctlr.get_src_addr = qedf_get_src_mac;
> > +	ether_addr_copy(qedf->ctlr.ctl_src_addr, qedf->mac);
> > +}
> > +
> > +static int qedf_lport_setup(struct qedf_ctx *qedf)
> > +{
> > +	struct fc_lport *lport = qedf->lport;
> > +
> > +	lport->link_up = 0;
> > +	lport->max_retry_count = QEDF_FLOGI_RETRY_CNT;
> > +	lport->max_rport_retry_count = QEDF_RPORT_RETRY_CNT;
> > +	lport->service_params = (FCP_SPPF_INIT_FCN | FCP_SPPF_RD_XRDY_DIS |
> > +	    FCP_SPPF_RETRY | FCP_SPPF_CONF_COMPL);
> > +	lport->boot_time = jiffies;
> > +	lport->e_d_tov = 2 * 1000;
> > +	lport->r_a_tov = 10 * 1000;
> > +
> > +	/* Set NPIV support */
> > +	lport->does_npiv = 1;
> > +	fc_host_max_npiv_vports(lport->host) = QEDF_MAX_NPIV;
> > +
> > +	fc_set_wwnn(lport, qedf->wwnn);
> > +	fc_set_wwpn(lport, qedf->wwpn);
> > +
> > +	fcoe_libfc_config(lport, &qedf->ctlr, &qedf_lport_template, 0);
> > +
> > +	/* Allocate the exchange manager */
> > +	fc_exch_mgr_alloc(lport, FC_CLASS_3, qedf->max_scsi_xid + 1,
> > +	    qedf->max_els_xid, NULL);
> > +
> > +	if (fc_lport_init_stats(lport))
> > +		return -ENOMEM;
> > +
> > +	/* Finish lport config */
> > +	fc_lport_config(lport);
> > +
> > +	/* Set max frame size */
> > +	fc_set_mfs(lport, QEDF_MFS);
> > +	fc_host_maxframe_size(lport->host) = lport->mfs;
> > +
> > +	/* Set default dev_loss_tmo based on module parameter */
> > +	fc_host_dev_loss_tmo(lport->host) = qedf_dev_loss_tmo;
> > +
> > +	/* Set symbolic node name */
> > +	snprintf(fc_host_symbolic_name(lport->host), 256,
> > +	    "QLogic %s v%s", QEDF_MODULE_NAME, QEDF_VERSION);
> > +
> > +	return 0;
> > +}
> > +
> > +/*
> > + * NPIV functions
> > + */
> > +
> > +static int qedf_vport_libfc_config(struct fc_vport *vport,
> > +	struct fc_lport *lport)
> > +{
> > +	lport->link_up = 0;
> > +	lport->qfull = 0;
> > +	lport->max_retry_count = QEDF_FLOGI_RETRY_CNT;
> > +	lport->max_rport_retry_count = QEDF_RPORT_RETRY_CNT;
> > +	lport->service_params = (FCP_SPPF_INIT_FCN | FCP_SPPF_RD_XRDY_DIS |
> > +	    FCP_SPPF_RETRY | FCP_SPPF_CONF_COMPL);
> > +	lport->boot_time = jiffies;
> > +	lport->e_d_tov = 2 * 1000;
> > +	lport->r_a_tov = 10 * 1000;
> > +	lport->does_npiv = 1; /* Temporary until we add NPIV support */
> > +
> > +	/* Allocate stats for vport */
> > +	if (fc_lport_init_stats(lport))
> > +		return -ENOMEM;
> > +
> > +	/* Finish lport config */
> > +	fc_lport_config(lport);
> > +
> > +	/* offload related configuration */
> > +	lport->crc_offload = 0;
> > +	lport->seq_offload = 0;
> > +	lport->lro_enabled = 0;
> > +	lport->lro_xid = 0;
> > +	lport->lso_max = 0;
> > +
> > +	return 0;
> > +}
> > +
> > +static int qedf_vport_create(struct fc_vport *vport, bool disabled)
> > +{
> > +	struct Scsi_Host *shost = vport_to_shost(vport);
> > +	struct fc_lport *n_port = shost_priv(shost);
> > +	struct fc_lport *vn_port;
> > +	struct qedf_ctx *base_qedf = lport_priv(n_port);
> > +	struct qedf_ctx *vport_qedf;
> > +	int i;
> > +
> > +	char buf[32];
> > +	int rc = 0;
> > +
> > +	rc = fcoe_validate_vport_create(vport);
> > +	if (rc) {
> > +		fcoe_wwn_to_str(vport->port_name, buf, sizeof(buf));
> > +		QEDF_WARN(&(base_qedf->dbg_ctx), "Failed to create vport, "
> > +			   "WWPN (0x%s) already exists.\n", buf);
> > +		goto err1;
> > +	}
> > +
> > +	if (atomic_read(&base_qedf->link_state) != QEDF_LINK_UP) {
> > +		QEDF_WARN(&(base_qedf->dbg_ctx), "Cannot create vport "
> > +			   "because link is not up.\n");
> > +		rc = -EIO;
> > +		goto err1;
> > +	}
> > +
> > +	vn_port = libfc_vport_create(vport, sizeof(struct qedf_ctx));
> > +	if (!vn_port) {
> > +		QEDF_WARN(&(base_qedf->dbg_ctx), "Could not create lport "
> > +			   "for vport.\n");
> > +		rc = -ENOMEM;
> > +		goto err1;
> > +	}
> > +
> > +	fcoe_wwn_to_str(vport->port_name, buf, sizeof(buf));
> > +	QEDF_ERR(&(base_qedf->dbg_ctx), "Creating NPIV port, WWPN=%s.\n",
> > +	    buf);
> > +
> > +	/* Copy some fields from base_qedf */
> > +	vport_qedf = lport_priv(vn_port);
> > +	memcpy(vport_qedf, base_qedf, sizeof(struct qedf_ctx));
> > +
> > +	/* Set qedf data specific to this vport */
> > +	vport_qedf->lport = vn_port;
> > +	/* Use same hba_lock as base_qedf */
> > +	vport_qedf->hba_lock = base_qedf->hba_lock;
> > +	/* Purge any fcport info from base_qedf */
> > +	for (i = 0; i < QEDF_MAX_SESSIONS; i++)
> > +		vport_qedf->fcports[i] = NULL;
> > +	vport_qedf->pdev = base_qedf->pdev;
> > +	vport_qedf->cmd_mgr = base_qedf->cmd_mgr;
> > +	init_completion(&vport_qedf->flogi_compl);
> > +
> > +	rc = qedf_vport_libfc_config(vport, vn_port);
> > +	if (rc) {
> > +		QEDF_ERR(&(base_qedf->dbg_ctx), "Could not allocate memory "
> > +		    "for lport stats.\n");
> > +		goto err2;
> > +	}
> > +
> > +	fc_set_wwnn(vn_port, vport->node_name);
> > +	fc_set_wwpn(vn_port, vport->port_name);
> > +	vport_qedf->wwnn = vn_port->wwnn;
> > +	vport_qedf->wwpn = vn_port->wwpn;
> > +
> > +	vn_port->host->transportt = qedf_fc_vport_transport_template;
> > +	vn_port->host->can_queue = QEDF_MAX_ELS_XID;
> > +	vn_port->host->max_lun = qedf_max_lun;
> > +	vn_port->host->sg_tablesize = QEDF_MAX_BDS_PER_CMD;
> > +	vn_port->host->max_cmd_len = QEDF_MAX_CDB_LEN;
> > +
> > +	rc = scsi_add_host(vn_port->host, &vport->dev);
> > +	if (rc) {
> > +		QEDF_WARN(&(base_qedf->dbg_ctx), "Error adding Scsi_Host.\n");
> > +		goto err2;
> > +	}
> > +
> > +	/* Set default dev_loss_tmo based on module parameter */
> > +	fc_host_dev_loss_tmo(vn_port->host) = qedf_dev_loss_tmo;
> > +
> > +	/* Init libfc stuffs */
> > +	memcpy(&vn_port->tt, &qedf_lport_template,
> > +		sizeof(qedf_lport_template));
> > +	fc_exch_init(vn_port);
> > +	fc_elsct_init(vn_port);
> > +	fc_lport_init(vn_port);
> > +	fc_disc_init(vn_port);
> > +	fc_disc_config(vn_port, vn_port);
> > +
> > +
> > +	/* Allocate the exchange manager */
> > +	shost = vport_to_shost(vport);
> > +	n_port = shost_priv(shost);
> > +	fc_exch_mgr_list_clone(n_port, vn_port);
> > +
> > +	/* Set max frame size */
> > +	fc_set_mfs(vn_port, QEDF_MFS);
> > +
> > +	fc_host_port_type(vn_port->host) = FC_PORTTYPE_UNKNOWN;
> > +
> > +	if (disabled) {
> > +		fc_vport_set_state(vport, FC_VPORT_DISABLED);
> > +	} else {
> > +		vn_port->boot_time = jiffies;
> > +		fc_fabric_login(vn_port);
> > +		fc_vport_setlink(vn_port);
> > +	}
> > +
> > +	QEDF_INFO(&(base_qedf->dbg_ctx), QEDF_LOG_NPIV, "vn_port=%p.\n",
> > +		   vn_port);
> > +
> > +	/* Set up debug context for vport */
> > +	vport_qedf->dbg_ctx.host_no = vn_port->host->host_no;
> > +	vport_qedf->dbg_ctx.pdev = base_qedf->pdev;
> > +
> > +err2:
> > +	scsi_host_put(vn_port->host);
> > +err1:
> > +	return rc;
> > +}
> > +
> > +static int qedf_vport_destroy(struct fc_vport *vport)
> > +{
> > +	struct Scsi_Host *shost = vport_to_shost(vport);
> > +	struct fc_lport *n_port = shost_priv(shost);
> > +	struct fc_lport *vn_port = vport->dd_data;
> > +
> > +	mutex_lock(&n_port->lp_mutex);
> > +	list_del(&vn_port->list);
> > +	mutex_unlock(&n_port->lp_mutex);
> > +
> > +	fc_fabric_logoff(vn_port);
> > +	fc_lport_destroy(vn_port);
> > +
> > +	/* Detach from scsi-ml */
> > +	fc_remove_host(vn_port->host);
> > +	scsi_remove_host(vn_port->host);
> > +
> > +	/*
> > +	 * Only try to release the exchange manager if the vn_port
> > +	 * configuration is complete.
> > +	 */
> > +	if (vn_port->state == LPORT_ST_READY)
> > +		fc_exch_mgr_free(vn_port);
> > +
> > +	/* Free memory used by statistical counters */
> > +	fc_lport_free_stats(vn_port);
> > +
> > +	/* Release Scsi_Host */
> > +	if (vn_port->host)
> > +		scsi_host_put(vn_port->host);
> > +
> > +	return 0;
> > +}
> > +
> > +static int qedf_vport_disable(struct fc_vport *vport, bool disable)
> > +{
> > +	struct fc_lport *lport = vport->dd_data;
> > +
> > +	if (disable) {
> > +		fc_vport_set_state(vport, FC_VPORT_DISABLED);
> > +		fc_fabric_logoff(lport);
> > +	} else {
> > +		lport->boot_time = jiffies;
> > +		fc_fabric_login(lport);
> > +		fc_vport_setlink(lport);
> > +	}
> > +	return 0;
> > +}
> > +
> > +/*
> > + * During removal we need to wait for all the vports associated with a port
> > + * to be destroyed so we avoid a race condition where libfc is still trying
> > + * to reap vports while the driver remove function has already reaped the
> > + * driver contexts associated with the physical port.
> > + */
> > +static void qedf_wait_for_vport_destroy(struct qedf_ctx *qedf)
> > +{
> > +	struct fc_host_attrs *fc_host = shost_to_fc_host(qedf->lport->host);
> > +
> > +	QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_NPIV,
> > +	    "Entered.\n");
> > +	while (fc_host->npiv_vports_inuse > 0) {
> > +		QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_NPIV,
> > +		    "Waiting for all vports to be reaped.\n");
> > +		msleep(1000);
> > +	}
> > +}
> > +
> > +/**
> > + * qedf_fcoe_reset - Resets the fcoe
> > + *
> > + * @shost: shost the reset is from
> > + *
> > + * Returns: always 0
> > + */
> > +static int qedf_fcoe_reset(struct Scsi_Host *shost)
> > +{
> > +	struct fc_lport *lport = shost_priv(shost);
> > +
> > +	fc_fabric_logoff(lport);
> > +	fc_fabric_login(lport);
> > +	return 0;
> > +}
> > +
> > +static struct fc_host_statistics *qedf_fc_get_host_stats(struct Scsi_Host
> > +	*shost)
> > +{
> > +	struct fc_host_statistics *qedf_stats;
> > +	struct fc_lport *lport = shost_priv(shost);
> > +	struct qedf_ctx *qedf = lport_priv(lport);
> > +	struct qed_fcoe_stats *fw_fcoe_stats;
> > +
> > +	qedf_stats = fc_get_host_stats(shost);
> > +
> > +	/* We don't collect offload stats for specific NPIV ports */
> > +	if (lport->vport)
> > +		goto out;
> > +
> > +	fw_fcoe_stats = kmalloc(sizeof(struct qed_fcoe_stats), GFP_KERNEL);
> > +	if (!fw_fcoe_stats) {
> > +		QEDF_ERR(&(qedf->dbg_ctx), "Could not allocate memory for "
> > +		    "fw_fcoe_stats.\n");
> > +		goto out;
> > +	}
> > +
> > +	/* Query firmware for offload stats */
> > +	qed_ops->get_stats(qedf->cdev, fw_fcoe_stats);
> > +
> > +	/*
> > +	 * The expectation is that we add our offload stats to the stats
> > +	 * being maintained by libfc each time the fc_get_host_status callback
> > +	 * is invoked. The additions are not carried over for each call to
> > +	 * the fc_get_host_stats callback.
> > +	 */
> > +	qedf_stats->tx_frames += fw_fcoe_stats->fcoe_tx_data_pkt_cnt +
> > +	    fw_fcoe_stats->fcoe_tx_xfer_pkt_cnt +
> > +	    fw_fcoe_stats->fcoe_tx_other_pkt_cnt;
> > +	qedf_stats->rx_frames += fw_fcoe_stats->fcoe_rx_data_pkt_cnt +
> > +	    fw_fcoe_stats->fcoe_rx_xfer_pkt_cnt +
> > +	    fw_fcoe_stats->fcoe_rx_other_pkt_cnt;
> > +	qedf_stats->fcp_input_megabytes += fw_fcoe_stats->fcoe_rx_byte_cnt /
> > +	    1000000;
> > +	qedf_stats->fcp_output_megabytes += fw_fcoe_stats->fcoe_tx_byte_cnt /
> > +	    1000000;
> > +	qedf_stats->rx_words += fw_fcoe_stats->fcoe_rx_byte_cnt / 4;
> > +	qedf_stats->tx_words += fw_fcoe_stats->fcoe_tx_byte_cnt / 4;
> > +	qedf_stats->invalid_crc_count +=
> > +	    fw_fcoe_stats->fcoe_silent_drop_pkt_crc_error_cnt;
> > +	qedf_stats->dumped_frames =
> > +	    fw_fcoe_stats->fcoe_silent_drop_total_pkt_cnt;
> > +	qedf_stats->error_frames +=
> > +	    fw_fcoe_stats->fcoe_silent_drop_total_pkt_cnt;
> > +	qedf_stats->fcp_input_requests += qedf->input_requests;
> > +	qedf_stats->fcp_output_requests += qedf->output_requests;
> > +	qedf_stats->fcp_control_requests += qedf->control_requests;
> > +	qedf_stats->fcp_packet_aborts += qedf->packet_aborts;
> > +	qedf_stats->fcp_frame_alloc_failures += qedf->alloc_failures;
> > +
> > +	kfree(fw_fcoe_stats);
> > +out:
> > +	return qedf_stats;
> > +}
> > +
> > +static struct fc_function_template qedf_fc_transport_fn = {
> > +	.show_host_node_name = 1,
> > +	.show_host_port_name = 1,
> > +	.show_host_supported_classes = 1,
> > +	.show_host_supported_fc4s = 1,
> > +	.show_host_active_fc4s = 1,
> > +	.show_host_maxframe_size = 1,
> > +
> > +	.show_host_port_id = 1,
> > +	.show_host_supported_speeds = 1,
> > +	.get_host_speed = fc_get_host_speed,
> > +	.show_host_speed = 1,
> > +	.show_host_port_type = 1,
> > +	.get_host_port_state = fc_get_host_port_state,
> > +	.show_host_port_state = 1,
> > +	.show_host_symbolic_name = 1,
> > +
> > +	/*
> > +	 * Tell FC transport to allocate enough space to store the backpointer
> > +	 * for the associate qedf_rport struct.
> > +	 */
> > +	.dd_fcrport_size = (sizeof(struct fc_rport_libfc_priv) +
> > +				sizeof(struct qedf_rport)),
> > +	.show_rport_maxframe_size = 1,
> > +	.show_rport_supported_classes = 1,
> > +	.show_host_fabric_name = 1,
> > +	.show_starget_node_name = 1,
> > +	.show_starget_port_name = 1,
> > +	.show_starget_port_id = 1,
> > +	.set_rport_dev_loss_tmo = fc_set_rport_loss_tmo,
> > +	.show_rport_dev_loss_tmo = 1,
> > +	.get_fc_host_stats = qedf_fc_get_host_stats,
> > +	.issue_fc_host_lip = qedf_fcoe_reset,
> > +	.vport_create = qedf_vport_create,
> > +	.vport_delete = qedf_vport_destroy,
> > +	.vport_disable = qedf_vport_disable,
> > +	.bsg_request = fc_lport_bsg_request,
> > +};
> > +
> > +static struct fc_function_template qedf_fc_vport_transport_fn = {
> > +	.show_host_node_name = 1,
> > +	.show_host_port_name = 1,
> > +	.show_host_supported_classes = 1,
> > +	.show_host_supported_fc4s = 1,
> > +	.show_host_active_fc4s = 1,
> > +	.show_host_maxframe_size = 1,
> > +	.show_host_port_id = 1,
> > +	.show_host_supported_speeds = 1,
> > +	.get_host_speed = fc_get_host_speed,
> > +	.show_host_speed = 1,
> > +	.show_host_port_type = 1,
> > +	.get_host_port_state = fc_get_host_port_state,
> > +	.show_host_port_state = 1,
> > +	.show_host_symbolic_name = 1,
> > +	.dd_fcrport_size = (sizeof(struct fc_rport_libfc_priv) +
> > +				sizeof(struct qedf_rport)),
> > +	.show_rport_maxframe_size = 1,
> > +	.show_rport_supported_classes = 1,
> > +	.show_host_fabric_name = 1,
> > +	.show_starget_node_name = 1,
> > +	.show_starget_port_name = 1,
> > +	.show_starget_port_id = 1,
> > +	.set_rport_dev_loss_tmo = fc_set_rport_loss_tmo,
> > +	.show_rport_dev_loss_tmo = 1,
> > +	.get_fc_host_stats = fc_get_host_stats,
> > +	.issue_fc_host_lip = qedf_fcoe_reset,
> > +	.bsg_request = fc_lport_bsg_request,
> > +};
> > +
> > +static bool qedf_fp_has_work(struct qedf_fastpath *fp)
> > +{
> > +	struct qedf_ctx *qedf = fp->qedf;
> > +	struct global_queue *que;
> > +	struct qed_sb_info *sb_info = fp->sb_info;
> > +	struct status_block *sb = sb_info->sb_virt;
> > +	u16 prod_idx;
> > +
> > +	/* Get the pointer to the global CQ this completion is on */
> > +	que = qedf->global_queues[fp->sb_id];
> > +
> > +	rmb();
> > +
> > +	/* Get the current firmware producer index */
> > +	prod_idx = sb->pi_array[QEDF_FCOE_PARAMS_GL_RQ_PI];
> > +
> > +	return (que->cq_prod_idx != prod_idx);
> > +}
> > +
> > +/*
> > + * Interrupt handler code.
> > + */
> > +
> > +/* Process completion queue and copy CQE contents for deferred processing
> > + *
> > + * Return true if we should wake the I/O thread, false if not.
> > + */
> > +static bool qedf_process_completions(struct qedf_fastpath *fp)
> > +{
> > +	struct qedf_ctx *qedf = fp->qedf;
> > +	struct qed_sb_info *sb_info = fp->sb_info;
> > +	struct status_block *sb = sb_info->sb_virt;
> > +	struct global_queue *que;
> > +	u16 prod_idx;
> > +	struct fcoe_cqe *cqe;
> > +	struct qedf_io_work *work;
> > +	unsigned long flags;
> > +	int num_handled = 0;
> > +	unsigned int cpu;
> > +	struct qedf_ioreq *io_req = NULL;
> > +	struct qedf_percpu_iothread_s *iothread;
> > +	u16 xid;
> > +	u16 new_cqes;
> > +	u32 comp_type;
> > +
> > +	/* Get the current firmware producer index */
> > +	prod_idx = sb->pi_array[QEDF_FCOE_PARAMS_GL_RQ_PI];
> > +
> > +	/* Get the pointer to the global CQ this completion is on */
> > +	que = qedf->global_queues[fp->sb_id];
> > +
> > +	/* Calculate the amount of new elements since last processing */
> > +	new_cqes = (prod_idx >= que->cq_prod_idx) ?
> > +	    (prod_idx - que->cq_prod_idx) :
> > +	    0x10000 - que->cq_prod_idx + prod_idx;
> > +
> > +	/* Save producer index */
> > +	que->cq_prod_idx = prod_idx;
> > +
> > +	while (new_cqes) {
> > +		fp->completions++;
> > +		num_handled++;
> > +		cqe = &que->cq[que->cq_cons_idx];
> > +
> > +		comp_type = (cqe->cqe_data >> FCOE_CQE_CQE_TYPE_SHIFT) &
> > +		    FCOE_CQE_CQE_TYPE_MASK;
> > +
> > +		/*
> > +		 * Process unsolicited CQEs directly in the interrupt handler
> > +		 * since we need the fastpath ID
> > +		 */
> > +		if (comp_type == FCOE_UNSOLIC_CQE_TYPE) {
> > +			QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_UNSOL,
> > +			   "Unsolicated CQE.\n");
> > +			qedf_process_unsol_compl(qedf, fp->sb_id, cqe);
> > +			/*
> > +			 * Don't add a work list item.  Increment the consumer
> > +			 * index and move on.
> > +			 */
> > +			goto inc_idx;
> > +		}
> > +
> > +		xid = cqe->cqe_data & FCOE_CQE_TASK_ID_MASK;
> > +		io_req = &qedf->cmd_mgr->cmds[xid];
> > +
> > +		/*
> > +		 * Figure out which percpu thread we should queue this I/O
> > +		 * on.
> > +		 */
> > +		if (!io_req)
> > +			/* If there is no io_req associated with this CQE
> > +			 * just queue it on CPU 0
> > +			 */
> > +			cpu = 0;
> > +		else {
> > +			cpu = io_req->cpu;
> > +			io_req->int_cpu = smp_processor_id();
> > +		}
> > +
> > +		work = mempool_alloc(qedf->io_mempool, GFP_ATOMIC);
> > +		if (!work) {
> > +			QEDF_WARN(&(qedf->dbg_ctx), "Could not allocate "
> > +				   "work for I/O completion.\n");
> > +			continue;
> > +		}
> > +		memset(work, 0, sizeof(struct qedf_io_work));
> > +
> > +		INIT_LIST_HEAD(&work->list);
> > +
> > +		/* Copy contents of CQE for deferred processing */
> > +		memcpy(&work->cqe, cqe, sizeof(struct fcoe_cqe));
> > +
> > +		work->qedf = fp->qedf;
> > +		work->fp = NULL; /* Only used for unsolicited frames */
> > +
> > +		iothread = &per_cpu(qedf_percpu_iothreads, cpu);
> > +		spin_lock_irqsave(&iothread->work_lock, flags);
> > +		list_add_tail(&work->list, &iothread->work_list);
> > +		spin_unlock_irqrestore(&iothread->work_lock, flags);
> > +		wake_up_process(iothread->iothread);
> > +
> > +inc_idx:
> > +		que->cq_cons_idx++;
> > +		if (que->cq_cons_idx == fp->cq_num_entries)
> > +			que->cq_cons_idx = 0;
> > +		new_cqes--;
> > +	}
> > +
> > +	return true;
> > +}
> > +
> > +
> > +/* MSI-X fastpath handler code */
> > +static irqreturn_t qedf_msix_handler(int irq, void *dev_id)
> > +{
> > +	struct qedf_fastpath *fp = dev_id;
> > +
> > +	if (!fp) {
> > +		QEDF_ERR(NULL, "fp is null.\n");
> > +		return IRQ_HANDLED;
> > +	}
> > +	if (!fp->sb_info) {
> > +		QEDF_ERR(NULL, "fp->sb_info in null.");
> > +		return IRQ_HANDLED;
> > +	}
> > +
> > +	/*
> > +	 * Disable interrupts for this status block while we process new
> > +	 * completions
> > +	 */
> > +	qed_sb_ack(fp->sb_info, IGU_INT_DISABLE, 0 /*do not update*/);
> > +
> > +	while (1) {
> > +		qedf_process_completions(fp);
> > +
> > +		if (qedf_fp_has_work(fp) == 0) {
> > +			/* Update the sb information */
> > +			qed_sb_update_sb_idx(fp->sb_info);
> > +			rmb();
> > +
> > +			if (qedf_fp_has_work(fp) == 0) {
> > +				/* Re-enable interrupts */
> > +				qed_sb_ack(fp->sb_info, IGU_INT_ENABLE, 1);
> > +				return IRQ_HANDLED;
> > +			}
> > +		}
> > +	}
> > +
> > +	/* Do we ever want to break out of above loop? */
> > +	return IRQ_HANDLED;
> > +}
> > +
> > +/* simd handler for MSI/INTa */
> > +static void qedf_simd_int_handler(void *cookie)
> > +{
> > +	/* Cookie is qedf_ctx struct */
> > +	struct qedf_ctx *qedf = (struct qedf_ctx *)cookie;
> > +
> > +	QEDF_WARN(&(qedf->dbg_ctx), "qedf=%p.\n", qedf);
> > +}
> > +
> > +#define QEDF_SIMD_HANDLER_NUM		0
> > +static void qedf_sync_free_irqs(struct qedf_ctx *qedf)
> > +{
> > +	int i;
> > +
> > +	if (qedf->int_info.msix_cnt) {
> > +		for (i = 0; i < qedf->int_info.used_cnt; i++) {
> > +			synchronize_irq(qedf->int_info.msix[i].vector);
> > +			irq_set_affinity_hint(qedf->int_info.msix[i].vector,
> > +			    NULL);
> > +			irq_set_affinity_notifier(qedf->int_info.msix[i].vector,
> > +			    NULL);
> > +			free_irq(qedf->int_info.msix[i].vector,
> > +			    &qedf->fp_array[i]);
> > +		}
> > +	} else
> > +		qed_ops->common->simd_handler_clean(qedf->cdev,
> > +		    QEDF_SIMD_HANDLER_NUM);
> > +
> > +	qedf->int_info.used_cnt = 0;
> > +	qed_ops->common->set_fp_int(qedf->cdev, 0);
> > +}
> > +
> > +static int qedf_request_msix_irq(struct qedf_ctx *qedf)
> > +{
> > +	int i, rc, cpu;
> > +
> > +	cpu = cpumask_first(cpu_online_mask);
> > +	for (i = 0; i < qedf->num_queues; i++) {
> > +		rc = request_irq(qedf->int_info.msix[i].vector,
> > +		    qedf_msix_handler, 0, "qedf", &qedf->fp_array[i]);
> > +
> > +		if (rc) {
> > +			QEDF_WARN(&(qedf->dbg_ctx), "request_irq failed.\n");
> > +			qedf_sync_free_irqs(qedf);
> > +			return rc;
> > +		}
> > +
> > +		qedf->int_info.used_cnt++;
> > +		rc = irq_set_affinity_hint(qedf->int_info.msix[i].vector,
> > +		    get_cpu_mask(cpu));
> > +		cpu = cpumask_next(cpu, cpu_online_mask);
> > +	 }
> > +
> > +	return 0;
> > +}
> > +
> Please use the irq allocation routines from hch here.

Will do.

> 
> 
> Cheers,
> 
> Hannes
> 

^ permalink raw reply

* Re: [Open-FCoE] [PATCH RFC 3/5] qedf: Add offloaded I/O request functions.
From: Chad Dupuis @ 2017-01-09 16:46 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: martin.petersen, fcoe-devel, netdev, QLogic-Storage-Upstream,
	linux-scsi, yuval.mintz
In-Reply-To: <7cfe465a-ad6c-7604-1262-d4dbf4aee525@suse.de>


On Wed, 28 Dec 2016, 9:08am -0000, Hannes Reinecke wrote:

> On 12/23/2016 08:17 PM, Dupuis, Chad wrote:
> > From: "Dupuis, Chad" <chad.dupuis@cavium.com>
> > 
> > This patch adds various I/O requests types that are handled in firmware:
> > 
> > - Normal I/O requests
> > - ABTS requests
> > - Cleanup requests
> > - Task management requests
> > 
> > It also contains:
> > 
> > - I/O request initialization
> > - Firmware completion handling
> > 
> > Signed-off-by: Nilesh Javali <nilesh.javali@cavium.com>
> > Signed-off-by: Manish Rangankar <manish.rangankar@cavium.com>
> > Signed-off-by: Saurav Kashyap <saurav.kashyap@cavium.com>
> > Signed-off-by: Chad Dupuis <chad.dupuis@cavium.com>
> > ---
> >  drivers/scsi/qedf/qedf_hsi.h |  427 ++++++++
> >  drivers/scsi/qedf/qedf_io.c  | 2303 ++++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 2730 insertions(+)
> >  create mode 100644 drivers/scsi/qedf/qedf_hsi.h
> >  create mode 100644 drivers/scsi/qedf/qedf_io.c
> > 
> [ .. ]
> 
> > +static int qedf_execute_tmf(struct qedf_rport *fcport, struct scsi_cmnd *sc_cmd,
> > +	uint8_t tm_flags)
> > +{
> > +	struct qedf_ioreq *io_req;
> > +	struct qedf_mp_req *tm_req;
> > +	struct fcoe_task_context *task;
> > +	struct fc_frame_header *fc_hdr;
> > +	struct fcp_cmnd *fcp_cmnd;
> > +	struct qedf_ctx *qedf = fcport->qedf;
> > +	int rc = 0;
> > +	uint16_t xid;
> > +	uint32_t sid, did;
> > +	int tmo = 0;
> > +	unsigned long flags;
> > +
> > +	if (!sc_cmd) {
> > +		QEDF_ERR(&(qedf->dbg_ctx), "invalid arg\n");
> > +		return FAILED;
> > +	}
> > +
> > +	if (!(test_bit(QEDF_RPORT_SESSION_READY, &fcport->flags))) {
> > +		QEDF_ERR(&(qedf->dbg_ctx), "fcport not offloaded\n");
> > +		rc = FAILED;
> > +		return FAILED;
> > +	}
> > +
> > +	scsi_block_requests(qedf->lport->host);
> > +
> Typically, EH commands will be executed after the scsi host is stopped
> and no commands are outstanding.
> So there's no point in issuing 'scsi_block_requests()' here.
> 

Will remove.

> Cheers,
> 
> Hannes
> 

^ permalink raw reply

* Re: [Open-FCoE] [PATCH RFC 5/5] qedf: Add FIP request handling
From: Chad Dupuis @ 2017-01-09 16:47 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: martin.petersen, fcoe-devel, netdev, QLogic-Storage-Upstream,
	linux-scsi, yuval.mintz
In-Reply-To: <a076b54b-966b-d652-7a0f-6b84aacb0ded@suse.de>


On Wed, 28 Dec 2016, 9:11am -0000, Hannes Reinecke wrote:

> On 12/23/2016 08:17 PM, Dupuis, Chad wrote:
> > From: "Dupuis, Chad" <chad.dupuis@cavium.com>
> > 
> > This patch adds handling for FIP requests and responses that are handled by
> > the driver itself and not by libfcoe.
> > 
> > Signed-off-by: Nilesh Javali <nilesh.javali@cavium.com>
> > Signed-off-by: Manish Rangankar <manish.rangankar@cavium.com>
> > Signed-off-by: Saurav Kashyap <saurav.kashyap@cavium.com>
> > Signed-off-by: Chad Dupuis <chad.dupuis@cavium.com>
> > ---
> >  drivers/scsi/qedf/qedf_fip.c | 267 +++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 267 insertions(+)
> >  create mode 100644 drivers/scsi/qedf/qedf_fip.c
> > 
> > diff --git a/drivers/scsi/qedf/qedf_fip.c b/drivers/scsi/qedf/qedf_fip.c
> > new file mode 100644
> > index 0000000..4f185c6
> > --- /dev/null
> > +++ b/drivers/scsi/qedf/qedf_fip.c
> > @@ -0,0 +1,267 @@
> > +/*
> > + *  QLogic FCoE Offload Driver
> > + *  Copyright (c) 2016 Cavium Inc.
> > + *
> > + *  This software is available under the terms of the GNU General Public License
> > + *  (GPL) Version 2, available from the file COPYING in the main directory of
> > + *  this source tree.
> > + */
> > +#include <linux/if_ether.h>
> > +#include <linux/if_vlan.h>
> > +#include "qedf.h"
> > +
> > +extern const struct qed_fcoe_ops *qed_ops;
> > +/*
> > + * FIP VLAN functions that will eventually move to libfcoe.
> > + */
> > +
> > +void qedf_fcoe_send_vlan_req(struct qedf_ctx *qedf)
> > +{
> > +	struct sk_buff *skb;
> > +	char *eth_fr;
> > +	int fr_len;
> > +	struct fip_vlan *vlan;
> > +#define MY_FIP_ALL_FCF_MACS        ((__u8[6]) { 1, 0x10, 0x18, 1, 0, 2 })
> > +	static u8 my_fcoe_all_fcfs[ETH_ALEN] = MY_FIP_ALL_FCF_MACS;
> 
> Do you support VN2VN, too?

Not currently, no.

> 
> Cheers,
> 
> Hannes
> 

^ permalink raw reply

* Re: [PATCH net-next] net: dsa: select NET_SWITCHDEV
From: Randy Dunlap @ 2017-01-09 16:47 UTC (permalink / raw)
  To: Vivien Didelot, Florian Fainelli, netdev
  Cc: linux-kernel, kernel, David S. Miller, Andrew Lunn, Jiri Pirko
In-Reply-To: <87d1fwurbw.fsf@weeman.i-did-not-set--mail-host-address--so-tickle-me>

On 01/09/17 08:32, Vivien Didelot wrote:
> Hi Randy,
> 
> Randy Dunlap <rdunlap@infradead.org> writes:
> 
>> On 01/08/17 17:18, Florian Fainelli wrote:
>>> On 01/08/2017 03:17 PM, Vivien Didelot wrote:
>>>> DSA wraps SWITCHDEV, thus select it instead of depending on it.
>>>>
>>>> Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
>>>
>>> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
>>>
>>
>> but when CONFIG_INET is not enabled, the patch causes this warning:
>>
>> warning: (NET_DSA) selects NET_SWITCHDEV which has unmet direct dependencies (NET && INET)
> 
> Thanks for spotting that! Would that be enough to change this first?
> 
>     diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
>     index 675acbf1502d..c7263b70e72b 100644
>     --- a/net/dsa/Kconfig
>     +++ b/net/dsa/Kconfig
>     @@ -1,6 +1,6 @@
>     config HAVE_NET_DSA
>             def_bool y
>     -       depends on NETDEVICES && !S390
>     +       depends on INET && NETDEVICES && !S390
> 
>     # Drivers must select NET_DSA and the appropriate tagging format

Yes, thanks.

Tested-by: Randy Dunlap <rdunlap@infradead.org>


-- 
~Randy

^ permalink raw reply

* [PATCH net] tcp: do not export tcp_peer_is_proven()
From: Eric Dumazet @ 2017-01-09 16:51 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

From: Eric Dumazet <edumazet@google.com>

After commit 1fb6f159fd21 ("tcp: add tcp_conn_request"),
tcp_peer_is_proven() no longer needs to be exported.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp_metrics.c |    1 -
 1 file changed, 1 deletion(-)

diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index d46f4d5b1c62edf95791e9d47d966c3bc61e1888..ba8f02d0f283c6eaaf14ed89103adea135093353 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -606,7 +606,6 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(tcp_peer_is_proven);
 
 void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst)
 {

^ permalink raw reply related

* Re: [PATCH] net: change init_inodecache() return void
From: David Miller @ 2017-01-09 17:05 UTC (permalink / raw)
  To: cugyly; +Cc: netdev, Linyu.Yuan
In-Reply-To: <1483780711-5759-1-git-send-email-cugyly@163.com>

From: yuan linyu <cugyly@163.com>
Date: Sat,  7 Jan 2017 17:18:31 +0800

> From: yuan linyu <Linyu.Yuan@alcatel-sbell.com.cn>
> 
> sock_init() calls it but does not check its return value,
> so change it to return void and add an internal BUG_ON() check.
> 
> Signed-off-by: yuan linyu <Linyu.Yuan@alcatel-sbell.com.cn>

Applied.

^ permalink raw reply

* Re: [PATCH] [v2] net: qcom/emac: add ethtool support
From: David Miller @ 2017-01-09 17:08 UTC (permalink / raw)
  To: timur; +Cc: f.fainelli, netdev, alokc
In-Reply-To: <1483738981-31019-1-git-send-email-timur@codeaurora.org>

From: Timur Tabi <timur@codeaurora.org>
Date: Fri,  6 Jan 2017 15:43:01 -0600

> Add support for some ethtool methods: get/set link settings, get/set
> message level, get statistics, get link status, get ring params, get
> pause params, and restart autonegotiation.
> 
> The code to collect the hardware statistics is moved into its own
> function so that it can be used by "get statistics" method.
> 
> Signed-off-by: Timur Tabi <timur@codeaurora.org>
> ---
> 
> Notes:
>     I don't trust my implementation of emac_get_pauseparam.  I feel like
>     I'm missing something.
>     
>     v2: added emac_get_pauseparam and emac_get_ringparam

This doesn't apply cleanly to net-next, please respin.

^ permalink raw reply

* net/ipv6: use-after-free in sock_wfree
From: Andrey Konovalov @ 2017-01-09 17:08 UTC (permalink / raw)
  To: David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev, LKML
  Cc: Dmitry Vyukov, Kostya Serebryany, Eric Dumazet, syzkaller

[-- Attachment #1: Type: text/plain, Size: 5712 bytes --]

Hi!

I've got the following error report while running the syzkaller fuzzer.

On commit a121103c922847ba5010819a3f250f1f7fc84ab8 (4.10-rc3).

A reproducer is attached.

==================================================================
BUG: KASAN: use-after-free in sock_wfree+0x118/0x120
Read of size 8 at addr ffff880062da0060 by task a.out/4140

page:ffffea00018b6800 count:1 mapcount:0 mapping:          (null)
index:0x0 compound_mapcount: 0
flags: 0x100000000008100(slab|head)
raw: 0100000000008100 0000000000000000 0000000000000000 0000000180130013
raw: dead000000000100 dead000000000200 ffff88006741f140 0000000000000000
page dumped because: kasan: bad access detected

CPU: 0 PID: 4140 Comm: a.out Not tainted 4.10.0-rc3+ #59
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:15
 dump_stack+0x292/0x398 lib/dump_stack.c:51
 describe_address mm/kasan/report.c:262
 kasan_report_error+0x121/0x560 mm/kasan/report.c:370
 kasan_report mm/kasan/report.c:392
 __asan_report_load8_noabort+0x3e/0x40 mm/kasan/report.c:413
 sock_flag ./arch/x86/include/asm/bitops.h:324
 sock_wfree+0x118/0x120 net/core/sock.c:1631
 skb_release_head_state+0xfc/0x250 net/core/skbuff.c:655
 skb_release_all+0x15/0x60 net/core/skbuff.c:668
 __kfree_skb+0x15/0x20 net/core/skbuff.c:684
 kfree_skb+0x16e/0x4e0 net/core/skbuff.c:705
 inet_frag_destroy+0x121/0x290 net/ipv4/inet_fragment.c:304
 inet_frag_put ./include/net/inet_frag.h:133
 nf_ct_frag6_gather+0x1125/0x38b0 net/ipv6/netfilter/nf_conntrack_reasm.c:617
 ipv6_defrag+0x21b/0x350 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c:68
 nf_hook_entry_hookfn ./include/linux/netfilter.h:102
 nf_hook_slow+0xc3/0x290 net/netfilter/core.c:310
 nf_hook ./include/linux/netfilter.h:212
 __ip6_local_out+0x52c/0xaf0 net/ipv6/output_core.c:160
 ip6_local_out+0x2d/0x170 net/ipv6/output_core.c:170
 ip6_send_skb+0xa1/0x340 net/ipv6/ip6_output.c:1722
 ip6_push_pending_frames+0xb3/0xe0 net/ipv6/ip6_output.c:1742
 rawv6_push_pending_frames net/ipv6/raw.c:613
 rawv6_sendmsg+0x2cff/0x4130 net/ipv6/raw.c:927
 inet_sendmsg+0x164/0x5b0 net/ipv4/af_inet.c:744
 sock_sendmsg_nosec net/socket.c:635
 sock_sendmsg+0xca/0x110 net/socket.c:645
 sock_write_iter+0x326/0x620 net/socket.c:848
 new_sync_write fs/read_write.c:499
 __vfs_write+0x483/0x760 fs/read_write.c:512
 vfs_write+0x187/0x530 fs/read_write.c:560
 SYSC_write fs/read_write.c:607
 SyS_write+0xfb/0x230 fs/read_write.c:599
 entry_SYSCALL_64_fastpath+0x1f/0xc2 arch/x86/entry/entry_64.S:203
RIP: 0033:0x7ff26e6f5b79
RSP: 002b:00007ff268e0ed98 EFLAGS: 00000206 ORIG_RAX: 0000000000000001
RAX: ffffffffffffffda RBX: 00007ff268e0f9c0 RCX: 00007ff26e6f5b79
RDX: 0000000000000010 RSI: 0000000020f50fe1 RDI: 0000000000000003
RBP: 00007ff26ebc1220 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000206 R12: 0000000000000000
R13: 00007ff268e0f9c0 R14: 00007ff26efec040 R15: 0000000000000003

The buggy address belongs to the object at ffff880062da0000
 which belongs to the cache RAWv6 of size 1504
The buggy address ffff880062da0060 is located 96 bytes inside
 of 1504-byte region [ffff880062da0000, ffff880062da05e0)

Freed by task 4113:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:57
 save_stack+0x43/0xd0 mm/kasan/kasan.c:502
 set_track mm/kasan/kasan.c:514
 kasan_slab_free+0x73/0xc0 mm/kasan/kasan.c:578
 slab_free_hook mm/slub.c:1352
 slab_free_freelist_hook mm/slub.c:1374
 slab_free mm/slub.c:2951
 kmem_cache_free+0xb2/0x2c0 mm/slub.c:2973
 sk_prot_free net/core/sock.c:1377
 __sk_destruct+0x49c/0x6e0 net/core/sock.c:1452
 sk_destruct+0x47/0x80 net/core/sock.c:1460
 __sk_free+0x57/0x230 net/core/sock.c:1468
 sk_free+0x23/0x30 net/core/sock.c:1479
 sock_put ./include/net/sock.h:1638
 sk_common_release+0x31e/0x4e0 net/core/sock.c:2782
 rawv6_close+0x54/0x80 net/ipv6/raw.c:1214
 inet_release+0xed/0x1c0 net/ipv4/af_inet.c:425
 inet6_release+0x50/0x70 net/ipv6/af_inet6.c:431
 sock_release+0x8d/0x1e0 net/socket.c:599
 sock_close+0x16/0x20 net/socket.c:1063
 __fput+0x332/0x7f0 fs/file_table.c:208
 ____fput+0x15/0x20 fs/file_table.c:244
 task_work_run+0x19b/0x270 kernel/task_work.c:116
 exit_task_work ./include/linux/task_work.h:21
 do_exit+0x186b/0x2800 kernel/exit.c:839
 do_group_exit+0x149/0x420 kernel/exit.c:943
 SYSC_exit_group kernel/exit.c:954
 SyS_exit_group+0x1d/0x20 kernel/exit.c:952
 entry_SYSCALL_64_fastpath+0x1f/0xc2 arch/x86/entry/entry_64.S:203

Allocated by task 4115:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:57
 save_stack+0x43/0xd0 mm/kasan/kasan.c:502
 set_track mm/kasan/kasan.c:514
 kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:605
 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:544
 slab_post_alloc_hook mm/slab.h:432
 slab_alloc_node mm/slub.c:2708
 slab_alloc mm/slub.c:2716
 kmem_cache_alloc+0x1af/0x250 mm/slub.c:2721
 sk_prot_alloc+0x65/0x2a0 net/core/sock.c:1334
 sk_alloc+0x105/0x1010 net/core/sock.c:1396
 inet6_create+0x44d/0x1150 net/ipv6/af_inet6.c:183
 __sock_create+0x4f6/0x880 net/socket.c:1199
 sock_create net/socket.c:1239
 SYSC_socket net/socket.c:1269
 SyS_socket+0xf9/0x230 net/socket.c:1249
 entry_SYSCALL_64_fastpath+0x1f/0xc2 arch/x86/entry/entry_64.S:203

Memory state around the buggy address:
 ffff880062d9ff00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
 ffff880062d9ff80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>ffff880062da0000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                                                       ^
 ffff880062da0080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff880062da0100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
==================================================================

[-- Attachment #2: ipv6-wfree-uaf-poc.c --]
[-- Type: text/x-csrc, Size: 9719 bytes --]

// autogenerated by syzkaller (http://github.com/google/syzkaller)

// Fallback syscall numbers for the raw syscalls issued by thr() below.
// These guards are evaluated before <sys/syscall.h> is included further
// down, so each one only skips its definition when the macro was already
// supplied from outside this file (e.g. on the compiler command line).
// NOTE(review): the values look like the x86-64 syscall table - confirm
// before building the reproducer for any other architecture.
#ifndef __NR_ioctl
#define __NR_ioctl 16
#endif
#ifndef __NR_mmap
#define __NR_mmap 9
#endif
#ifndef __NR_socket
#define __NR_socket 41
#endif
#ifndef __NR_connect
#define __NR_connect 42
#endif
#ifndef __NR_bind
#define __NR_bind 49
#endif
#ifndef __NR_sendto
#define __NR_sendto 44
#endif
#ifndef __NR_recvfrom
#define __NR_recvfrom 45
#endif
#ifndef __NR_write
#define __NR_write 1
#endif

#define _GNU_SOURCE

#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>

#include <linux/capability.h>
#include <linux/if.h>
#include <linux/if_tun.h>
#include <linux/sched.h>
#include <net/if_arp.h>

#include <assert.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <grp.h>
#include <pthread.h>
#include <setjmp.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

// Process exit codes handed to doexit() by the reporting helpers below.
// kFailStatus: used by fail() when the saved errno is anything but ENOMEM.
const int kFailStatus = 67;
// kErrorStatus: defined but not referenced anywhere in this reproducer.
const int kErrorStatus = 68;
// kRetryStatus: used by exitf(), and by fail() when the saved errno is ENOMEM.
const int kRetryStatus = 69;

// Terminate the whole process through the raw exit_group syscall, using
// `status` as the exit code. Should the syscall ever return, spin forever
// so that the noreturn attribute remains truthful.
__attribute__((noreturn)) void doexit(int status)
{
  volatile unsigned spin = 0;

  syscall(__NR_exit_group, status);
  spin = 0;
  while (1)
    spin++;
}

// Report a fatal error and exit. Flushes stdout, prints the printf-style
// message followed by " (errno N)" to stderr, then exits with kRetryStatus
// when errno was ENOMEM on entry and kFailStatus otherwise. Never returns.
__attribute__((noreturn)) void fail(const char* msg, ...)
{
  // Snapshot errno first: the stdio calls below are free to overwrite it.
  const int saved_errno = errno;
  va_list ap;
  int exit_status;

  fflush(stdout);
  va_start(ap, msg);
  vfprintf(stderr, msg, ap);
  va_end(ap);
  fprintf(stderr, " (errno %d)\n", saved_errno);

  if (saved_errno == ENOMEM)
    exit_status = kRetryStatus;
  else
    exit_status = kFailStatus;
  doexit(exit_status);
}

// Report a condition and exit with kRetryStatus. Flushes stdout, prints the
// printf-style message followed by " (errno N)" to stderr (N being errno as
// it was on entry), then terminates the process. Never returns.
__attribute__((noreturn)) void exitf(const char* msg, ...)
{
  // Snapshot errno first: the stdio calls below are free to overwrite it.
  const int saved_errno = errno;
  va_list ap;

  fflush(stdout);
  va_start(ap, msg);
  vfprintf(stderr, msg, ap);
  va_end(ap);
  fprintf(stderr, " (errno %d)\n", saved_errno);

  doexit(kRetryStatus);
}

// Non-zero enables debug() output; nothing in this file ever sets it.
static int flag_debug;

// printf-style trace helper: when flag_debug is set, print the formatted
// message to stdout and flush it immediately; otherwise do nothing.
void debug(const char* msg, ...)
{
  if (flag_debug) {
    va_list ap;

    va_start(ap, msg);
    vfprintf(stdout, msg, ap);
    va_end(ap);
    fflush(stdout);
  }
}

// Per-thread state shared with the NONFAILING() macro: skip_segv is non-zero
// while the thread is inside a guarded region, and segv_env holds the
// _setjmp() context to resume at when a fault hits such a region.
__thread int skip_segv;
__thread jmp_buf segv_env;

// Signal handler installed for SIGSEGV/SIGBUS. Inside a guarded region it
// jumps back to the saved context; anywhere else it exits the process with
// the signal number as status. `info` and `uctx` are unused.
static void segv_handler(int sig, siginfo_t* info, void* uctx)
{
  const int guarded = __atomic_load_n(&skip_segv, __ATOMIC_RELAXED);

  if (guarded != 0)
    _longjmp(segv_env, 1);
  doexit(sig);
  // doexit() does not return; this loop is only a backstop.
  while (1) {
  }
}

// Route SIGSEGV and SIGBUS to segv_handler(). SA_SIGINFO selects the
// three-argument handler form and SA_NODEFER leaves the signal unblocked
// while the handler runs. The sigaction() results are not checked.
static void install_segv_handler()
{
  static const int fault_signals[] = {SIGSEGV, SIGBUS};
  struct sigaction action;
  unsigned idx;

  memset(&action, 0, sizeof(action));
  action.sa_flags = SA_NODEFER | SA_SIGINFO;
  action.sa_sigaction = segv_handler;
  for (idx = 0; idx < sizeof(fault_signals) / sizeof(fault_signals[0]); idx++)
    sigaction(fault_signals[idx], &action, NULL);
}

// Evaluate the given statement while tolerating SIGSEGV/SIGBUS: skip_segv is
// raised for the duration, so segv_handler() longjmps back to the _setjmp()
// point here instead of exiting, and the faulting statement is abandoned.
// Wrapped in do { } while (0) so that an invocation followed by ';' is
// exactly one statement and is safe under an unbraced if/else.
// Caveat: because of that wrapper, a bare `break` or `continue` passed as
// the argument would bind to this macro's own loop.
#define NONFAILING(...)                                                \
  do {                                                                 \
    __atomic_fetch_add(&skip_segv, 1, __ATOMIC_SEQ_CST);               \
    if (_setjmp(segv_env) == 0) {                                      \
      __VA_ARGS__;                                                     \
    }                                                                  \
    __atomic_fetch_sub(&skip_segv, 1, __ATOMIC_SEQ_CST);               \
  } while (0)

// Issue the raw syscall `nr` with arguments a0..a5 and return whatever
// syscall() returns, converted to uintptr_t (so a -1 failure comes back as
// all-ones). a6..a8 are accepted for a uniform call shape but never used.
static uintptr_t execute_syscall(int nr, uintptr_t a0, uintptr_t a1,
                                 uintptr_t a2, uintptr_t a3,
                                 uintptr_t a4, uintptr_t a5,
                                 uintptr_t a6, uintptr_t a7,
                                 uintptr_t a8)
{
  (void)a6;
  (void)a7;
  (void)a8;
  return syscall(nr, a0, a1, a2, a3, a4, a5);
}

// One-time setup of the parent process: ignore signals 0x20 and 0x21 via
// the raw rt_sigaction syscall, install the SIGSEGV/SIGBUS handler, then
// create a fresh "./syzkaller.XXXXXX" directory, make it mode 0777 and
// chdir into it. Any failure of the directory steps ends the process
// through fail(). `pid` and `enable_tun` are unused.
// NOTE(review): 0x20/0x21 are presumably the two real-time signals the C
// library reserves for its threading implementation - confirm.
static void setup_main_process(uint64_t pid, bool enable_tun)
{
  struct sigaction ignore_action;
  char dir_template[] = "./syzkaller.XXXXXX";
  char* workdir;

  memset(&ignore_action, 0, sizeof(ignore_action));
  ignore_action.sa_handler = SIG_IGN;
  syscall(SYS_rt_sigaction, 0x20, &ignore_action, NULL, 8);
  syscall(SYS_rt_sigaction, 0x21, &ignore_action, NULL, 8);
  install_segv_handler();

  workdir = mkdtemp(dir_template);
  if (workdir == NULL)
    fail("failed to mkdtemp");
  if (chmod(workdir, 0777) != 0)
    fail("failed to chmod");
  if (chdir(workdir) != 0)
    fail("failed to chdir");
}

static void loop();

// Set both the soft and the hard limit of `resource` to `limit`. The result
// of setrlimit() is deliberately ignored, as with every call in this setup.
static void set_rlimit_both(int resource, rlim_t limit)
{
  struct rlimit rl;

  rl.rlim_cur = limit;
  rl.rlim_max = limit;
  setrlimit(resource, &rl);
}

// Best-effort confinement of the child before the reproducer runs: ask to be
// killed when the parent dies, start a new process group and session, cap
// address space (128 MiB), file size (1 MiB), stack (1 MiB) and core dumps
// (0), then unshare the mount and IPC namespaces and the I/O context.
// No return value is checked.
static void sandbox_common()
{
  prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
  setpgrp();
  setsid();

  set_rlimit_both(RLIMIT_AS, 128 << 20);
  set_rlimit_both(RLIMIT_FSIZE, 1 << 20);
  set_rlimit_both(RLIMIT_STACK, 1 << 20);
  set_rlimit_both(RLIMIT_CORE, 0);

  unshare(CLONE_NEWNS);
  unshare(CLONE_NEWIPC);
  unshare(CLONE_IO);
}

// Fork the process that actually runs the reproducer.
// In the parent: returns the child's pid (always > 0).
// In the child: applies sandbox_common(), runs loop() once and exits with
// status 1; it never returns to the caller.
// A failed fork() is reported through fail() instead of being returned:
// main() would otherwise pass -1 to waitpid(), which waits for "any child",
// finds none, returns -1, and the program would exit 0 without having run
// anything.
static int do_sandbox_none()
{
  int pid = fork();
  if (pid < 0)
    fail("sandbox fork failed");
  if (pid)
    return pid;
  sandbox_common();
  loop();
  doexit(1);
}

// Results of the syscalls issued by thr(), stored in the slot numbers the
// generator picked; loop() fills the whole array with 0xff bytes (-1) before
// starting any thread. Only r[1] is ever read back.
long r[56];
// Run one step of the reproducer, selected by the integer carried in `arg`
// (0..11); any other value does nothing. loop() starts each step on its own
// thread and nothing here synchronizes the steps with one another, so a step
// that reads r[1] may still see -1. Always returns 0.
void* thr(void* arg)
{
  switch ((long)arg) {
  case 0:
    // Map 0xf55000 bytes at the fixed address 0x20000000; every hard-coded
    // 0x20...... pointer used by the other cases lies inside this range.
    // NOTE(review): prot 0x3 / flags 0x32 look like PROT_READ|PROT_WRITE and
    // MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS on x86-64 - confirm.
    r[0] =
        execute_syscall(__NR_mmap, 0x20000000ul, 0xf55000ul, 0x3ul,
                        0x32ul, 0xfffffffffffffffful, 0x0ul, 0, 0, 0);
    break;
  case 1:
    // Create the socket that cases 2, 8 and 11 operate on.
    // NOTE(review): (0xa, 0x3, 0x2c) is presumably AF_INET6 / SOCK_RAW /
    // protocol 44, consistent with the RAWv6 frames in the report - confirm.
    r[1] = execute_syscall(__NR_socket, 0xaul, 0x3ul, 0x2cul, 0, 0, 0,
                           0, 0, 0);
    break;
  case 2:
    // Fill a 0x1c-byte address record at 0x20016000 (first 16-bit field 0xa,
    // second 0x204e, everything else zero) and connect r[1] to it, passing
    // 0x20 as the length.
    // NOTE(review): the field sizes match struct sockaddr_in6 - confirm.
    NONFAILING(*(uint16_t*)0x20016000 = (uint16_t)0xa);
    NONFAILING(*(uint16_t*)0x20016002 = (uint16_t)0x204e);
    NONFAILING(*(uint32_t*)0x20016004 = (uint32_t)0x0);
    NONFAILING(*(uint64_t*)0x20016008 = (uint64_t)0x0);
    NONFAILING(*(uint64_t*)0x20016010 = (uint64_t)0x0);
    NONFAILING(*(uint32_t*)0x20016018 = (uint32_t)0x0);
    r[8] = execute_syscall(__NR_connect, r[1], 0x20016000ul, 0x20ul, 0,
                           0, 0, 0, 0, 0);
    break;
  case 3:
    // Fill a 0x10-byte address record at 0x20373000 (first field 0x2, second
    // 0x204e, then bytes c0 a8 da aa, rest zero) and call bind with it.
    // The descriptor argument is all-ones (-1), not r[1].
    NONFAILING(*(uint16_t*)0x20373000 = (uint16_t)0x2);
    NONFAILING(*(uint16_t*)0x20373002 = (uint16_t)0x204e);
    NONFAILING(*(uint8_t*)0x20373004 = (uint8_t)0xc0);
    NONFAILING(*(uint8_t*)0x20373005 = (uint8_t)0xa8);
    NONFAILING(*(uint8_t*)0x20373006 = (uint8_t)0xda);
    NONFAILING(*(uint8_t*)0x20373007 = (uint8_t)0xaa);
    NONFAILING(*(uint8_t*)0x20373008 = (uint8_t)0x0);
    NONFAILING(*(uint8_t*)0x20373009 = (uint8_t)0x0);
    NONFAILING(*(uint8_t*)0x2037300a = (uint8_t)0x0);
    NONFAILING(*(uint8_t*)0x2037300b = (uint8_t)0x0);
    NONFAILING(*(uint8_t*)0x2037300c = (uint8_t)0x0);
    NONFAILING(*(uint8_t*)0x2037300d = (uint8_t)0x0);
    NONFAILING(*(uint8_t*)0x2037300e = (uint8_t)0x0);
    NONFAILING(*(uint8_t*)0x2037300f = (uint8_t)0x0);
    r[23] = execute_syscall(__NR_bind, 0xfffffffffffffffful,
                            0x20373000ul, 0x10ul, 0, 0, 0, 0, 0, 0);
    break;
  case 4:
    // Fill a 0x10-byte address record at 0x20f4a000 (first field 0x0, second
    // 0x204e, rest zero) and call connect with it, again on descriptor -1.
    NONFAILING(*(uint16_t*)0x20f4a000 = (uint16_t)0x0);
    NONFAILING(*(uint16_t*)0x20f4a002 = (uint16_t)0x204e);
    NONFAILING(*(uint32_t*)0x20f4a004 = (uint32_t)0x0);
    NONFAILING(*(uint8_t*)0x20f4a008 = (uint8_t)0x0);
    NONFAILING(*(uint8_t*)0x20f4a009 = (uint8_t)0x0);
    NONFAILING(*(uint8_t*)0x20f4a00a = (uint8_t)0x0);
    NONFAILING(*(uint8_t*)0x20f4a00b = (uint8_t)0x0);
    NONFAILING(*(uint8_t*)0x20f4a00c = (uint8_t)0x0);
    NONFAILING(*(uint8_t*)0x20f4a00d = (uint8_t)0x0);
    NONFAILING(*(uint8_t*)0x20f4a00e = (uint8_t)0x0);
    NONFAILING(*(uint8_t*)0x20f4a00f = (uint8_t)0x0);
    r[35] = execute_syscall(__NR_connect, 0xfffffffffffffffful,
                            0x20f4a000ul, 0x10ul, 0, 0, 0, 0, 0, 0);
    break;
  case 5:
    // Fill a 0x1c-byte address record at 0x20f4cfe0 and call sendto on
    // descriptor -1 with a zero-length buffer at 0x20006000, flags 0 and that
    // record (length 0x20) as the destination.
    NONFAILING(*(uint16_t*)0x20f4cfe0 = (uint16_t)0xa);
    NONFAILING(*(uint16_t*)0x20f4cfe2 = (uint16_t)0x204e);
    NONFAILING(*(uint32_t*)0x20f4cfe4 = (uint32_t)0x0);
    NONFAILING(*(uint64_t*)0x20f4cfe8 = (uint64_t)0x0);
    NONFAILING(*(uint64_t*)0x20f4cff0 = (uint64_t)0x100000000000000);
    NONFAILING(*(uint32_t*)0x20f4cff8 = (uint32_t)0x5);
    r[42] =
        execute_syscall(__NR_sendto, 0xfffffffffffffffful, 0x20006000ul,
                        0x0ul, 0x0ul, 0x20f4cfe0ul, 0x20ul, 0, 0, 0);
    break;
  case 6:
    // recvfrom on descriptor -1: zero-length buffer, flags 0x10000, address
    // buffer 0x20f4e000 and a null address-length pointer.
    r[43] = execute_syscall(__NR_recvfrom, 0xfffffffffffffffful,
                            0x20144f28ul, 0x0ul, 0x10000ul,
                            0x20f4e000ul, 0x0ul, 0, 0, 0);
    break;
  case 7:
    // Create a second socket (0x1f, 0x5, 0x2); its result in r[44] is never
    // used by any other step.
    r[44] = execute_syscall(__NR_socket, 0x1ful, 0x5ul, 0x2ul, 0, 0, 0,
                            0, 0, 0);
    break;
  case 8:
    // Write 0xffff bytes from 0x20aa4fda to r[1].
    r[45] = execute_syscall(__NR_write, r[1], 0x20aa4fdaul, 0xfffful, 0,
                            0, 0, 0, 0, 0);
    break;
  case 9:
    // ioctl request 0xc010640b on descriptor -1 with an all-zero 16-byte
    // argument at 0x20f54000.
    NONFAILING(*(uint32_t*)0x20f54000 = (uint32_t)0x0);
    NONFAILING(*(uint32_t*)0x20f54004 = (uint32_t)0x0);
    NONFAILING(*(uint64_t*)0x20f54008 = (uint64_t)0x0);
    r[49] =
        execute_syscall(__NR_ioctl, 0xfffffffffffffffful, 0xc010640bul,
                        0x20f54000ul, 0, 0, 0, 0, 0, 0);
    break;
  case 10:
    // Same ioctl as case 9, but with 0xfc51 in the trailing 64-bit field.
    NONFAILING(*(uint32_t*)0x20f54000 = (uint32_t)0x0);
    NONFAILING(*(uint32_t*)0x20f54004 = (uint32_t)0x0);
    NONFAILING(*(uint64_t*)0x20f54008 = (uint64_t)0xfc51);
    r[53] =
        execute_syscall(__NR_ioctl, 0xfffffffffffffffful, 0xc010640bul,
                        0x20f54000ul, 0, 0, 0, 0, 0, 0);
    break;
  case 11:
    // Place a fixed 16-byte payload at 0x20f50fe1 and write it to r[1]; the
    // address and length match the RSI/RDX values of the write() shown in the
    // KASAN report.
    NONFAILING(memcpy((void*)0x20f50fe1, "\x1f\x00\x00\x80\x01\x00\x00"
                                         "\x16\x00\x00\x00\x9a\xc7\x00"
                                         "\x00\x06",
                      16));
    r[55] = execute_syscall(__NR_write, r[1], 0x20f50fe1ul, 0x10ul, 0,
                            0, 0, 0, 0, 0);
    break;
  }
  return 0;
}

void loop()
{
  long i;
  pthread_t th[24];

  memset(r, -1, sizeof(r));
  srand(getpid());
  for (i = 0; i < 12; i++) {
    pthread_create(&th[i], 0, thr, (void*)i);
    usleep(10000);
  }
  for (i = 0; i < 12; i++) {
    pthread_create(&th[12 + i], 0, thr, (void*)i);
    if (rand() % 2)
      usleep(rand() % 10000);
  }
  usleep(100000);
}

// Entry point: prepare the parent process, fork the sandboxed child that
// runs the reproducer, and wait until waitpid() reports that child. The
// child's exit status is collected but not inspected; always returns 0.
int main()
{
  int status = 0;
  int child;

  setup_main_process(0, false);
  child = do_sandbox_none();
  for (;;) {
    if (waitpid(child, &status, __WALL) == child)
      break;
  }
  return 0;
}

^ permalink raw reply

* Re: [PATCH] net: ibm: ehea: use new api ethtool_{get|set}_link_ksettings
From: David Miller @ 2017-01-09 17:10 UTC (permalink / raw)
  To: tremyfr; +Cc: dougmill, netdev, linux-kernel
In-Reply-To: <1483807667-18264-1-git-send-email-tremyfr@gmail.com>

From: Philippe Reynes <tremyfr@gmail.com>
Date: Sat,  7 Jan 2017 17:47:47 +0100

> The ethtool api {get|set}_settings is deprecated.
> We move this driver to new api {get|set}_link_ksettings.
> 
> Signed-off-by: Philippe Reynes <tremyfr@gmail.com>

Applied.

^ permalink raw reply

* Re: [PATCH] net: ibm: emac: use new api ethtool_{get|set}_link_ksettings
From: David Miller @ 2017-01-09 17:10 UTC (permalink / raw)
  To: tremyfr
  Cc: ivan, jarod, mugunthanvnm, felipe.balbi, fw, mpe, netdev,
	linux-kernel
In-Reply-To: <1483824747-6405-1-git-send-email-tremyfr@gmail.com>

From: Philippe Reynes <tremyfr@gmail.com>
Date: Sat,  7 Jan 2017 22:32:27 +0100

> The ethtool api {get|set}_settings is deprecated.
> We move this driver to new api {get|set}_link_ksettings.
> 
> Signed-off-by: Philippe Reynes <tremyfr@gmail.com>

Applied.

^ permalink raw reply

* Re: [PATCH] net: ibm: ibmveth: use new api ethtool_{get|set}_link_ksettings
From: David Miller @ 2017-01-09 17:10 UTC (permalink / raw)
  To: tremyfr; +Cc: tlfalcon, linux-kernel, paulus, netdev, linuxppc-dev
In-Reply-To: <1483824913-6989-1-git-send-email-tremyfr@gmail.com>

From: Philippe Reynes <tremyfr@gmail.com>
Date: Sat,  7 Jan 2017 22:35:13 +0100

> The ethtool api {get|set}_settings is deprecated.
> We move this driver to new api {get|set}_link_ksettings.
> 
> Signed-off-by: Philippe Reynes <tremyfr@gmail.com>

Applied.

^ permalink raw reply

* Re: [PATCH] net: ibm: ibmvnic: use new api ethtool_{get|set}_link_ksettings
From: David Miller @ 2017-01-09 17:10 UTC (permalink / raw)
  To: tremyfr
  Cc: tlfalcon, jallen, benh, paulus, mpe, netdev, linuxppc-dev,
	linux-kernel
In-Reply-To: <1483825049-7501-1-git-send-email-tremyfr@gmail.com>

From: Philippe Reynes <tremyfr@gmail.com>
Date: Sat,  7 Jan 2017 22:37:29 +0100

> The ethtool api {get|set}_settings is deprecated.
> We move this driver to new api {get|set}_link_ksettings.
> 
> Signed-off-by: Philippe Reynes <tremyfr@gmail.com>

Applied.

^ permalink raw reply

* Re: [PATCH] net: intel: e100: use new api ethtool_{get|set}_link_ksettings
From: David Miller @ 2017-01-09 17:10 UTC (permalink / raw)
  To: tremyfr; +Cc: jeffrey.t.kirsher, intel-wired-lan, netdev, linux-kernel
In-Reply-To: <1483827487-20355-1-git-send-email-tremyfr@gmail.com>

From: Philippe Reynes <tremyfr@gmail.com>
Date: Sat,  7 Jan 2017 23:18:07 +0100

> The ethtool api {get|set}_settings is deprecated.
> We move this driver to new api {get|set}_link_ksettings.
> 
> Signed-off-by: Philippe Reynes <tremyfr@gmail.com>

Applied.

^ permalink raw reply

* Re: [PATCH 0/4] net: ethernet: ti: cpsw: correct common res usage
From: Grygorii Strashko @ 2017-01-09 17:11 UTC (permalink / raw)
  To: Ivan Khoronzhuk, netdev, mugunthanvnm; +Cc: linux-omap, linux-kernel
In-Reply-To: <1483893663-15673-1-git-send-email-ivan.khoronzhuk@linaro.org>

Hi Ivan,

On 01/08/2017 10:40 AM, Ivan Khoronzhuk wrote:
> This series is intended to remove unneeded redundancies connected with
> common resource usage function.
>
> Based on net-next/master
> Tested on am572x idk
>
> Ivan Khoronzhuk (4):
>   net: ethernet: ti: cpsw: remove dual check from common res usage
>     function
>   net: ethernet: ti: cpsw: don't disable interrupts in ndo_open
>   net: ethernet: ti: cpsw: don't duplicate ndev_running
>   net: ethernet: ti: cpsw: don't duplicate common res in rx handler
>

thanks for the patches - I'll need some time to review them.

>  drivers/net/ethernet/ti/cpsw.c | 57 ++++++++++++++----------------------------
>  1 file changed, 19 insertions(+), 38 deletions(-)
>

-- 
regards,
-grygorii

^ permalink raw reply

* Re: net/ipv6: use-after-free in sock_wfree
From: Andrey Konovalov @ 2017-01-09 17:11 UTC (permalink / raw)
  To: David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev, LKML
  Cc: Dmitry Vyukov, Kostya Serebryany, Eric Dumazet, syzkaller
In-Reply-To: <CAAeHK+yfNdNTkgCbUGbdRBM9bB=2DhGv1ZPCWm44CGL7zD=TLg@mail.gmail.com>

On Mon, Jan 9, 2017 at 6:08 PM, Andrey Konovalov <andreyknvl@google.com> wrote:
> Hi!
>
> I've got the following error report while running the syzkaller fuzzer.
>
> On commit a121103c922847ba5010819a3f250f1f7fc84ab8 (4.10-rc3).
>
> A reproducer is attached.
>
> ==================================================================
> BUG: KASAN: use-after-free in sock_wfree+0x118/0x120
> Read of size 8 at addr ffff880062da0060 by task a.out/4140
>
> page:ffffea00018b6800 count:1 mapcount:0 mapping:          (null)
> index:0x0 compound_mapcount: 0
> flags: 0x100000000008100(slab|head)
> raw: 0100000000008100 0000000000000000 0000000000000000 0000000180130013
> raw: dead000000000100 dead000000000200 ffff88006741f140 0000000000000000
> page dumped because: kasan: bad access detected
>
> CPU: 0 PID: 4140 Comm: a.out Not tainted 4.10.0-rc3+ #59
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
> Call Trace:
>  __dump_stack lib/dump_stack.c:15
>  dump_stack+0x292/0x398 lib/dump_stack.c:51
>  describe_address mm/kasan/report.c:262
>  kasan_report_error+0x121/0x560 mm/kasan/report.c:370
>  kasan_report mm/kasan/report.c:392
>  __asan_report_load8_noabort+0x3e/0x40 mm/kasan/report.c:413
>  sock_flag ./arch/x86/include/asm/bitops.h:324
>  sock_wfree+0x118/0x120 net/core/sock.c:1631
>  skb_release_head_state+0xfc/0x250 net/core/skbuff.c:655
>  skb_release_all+0x15/0x60 net/core/skbuff.c:668
>  __kfree_skb+0x15/0x20 net/core/skbuff.c:684
>  kfree_skb+0x16e/0x4e0 net/core/skbuff.c:705
>  inet_frag_destroy+0x121/0x290 net/ipv4/inet_fragment.c:304
>  inet_frag_put ./include/net/inet_frag.h:133
>  nf_ct_frag6_gather+0x1125/0x38b0 net/ipv6/netfilter/nf_conntrack_reasm.c:617
>  ipv6_defrag+0x21b/0x350 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c:68
>  nf_hook_entry_hookfn ./include/linux/netfilter.h:102
>  nf_hook_slow+0xc3/0x290 net/netfilter/core.c:310
>  nf_hook ./include/linux/netfilter.h:212
>  __ip6_local_out+0x52c/0xaf0 net/ipv6/output_core.c:160
>  ip6_local_out+0x2d/0x170 net/ipv6/output_core.c:170
>  ip6_send_skb+0xa1/0x340 net/ipv6/ip6_output.c:1722
>  ip6_push_pending_frames+0xb3/0xe0 net/ipv6/ip6_output.c:1742
>  rawv6_push_pending_frames net/ipv6/raw.c:613
>  rawv6_sendmsg+0x2cff/0x4130 net/ipv6/raw.c:927
>  inet_sendmsg+0x164/0x5b0 net/ipv4/af_inet.c:744
>  sock_sendmsg_nosec net/socket.c:635
>  sock_sendmsg+0xca/0x110 net/socket.c:645
>  sock_write_iter+0x326/0x620 net/socket.c:848
>  new_sync_write fs/read_write.c:499
>  __vfs_write+0x483/0x760 fs/read_write.c:512
>  vfs_write+0x187/0x530 fs/read_write.c:560
>  SYSC_write fs/read_write.c:607
>  SyS_write+0xfb/0x230 fs/read_write.c:599
>  entry_SYSCALL_64_fastpath+0x1f/0xc2 arch/x86/entry/entry_64.S:203
> RIP: 0033:0x7ff26e6f5b79
> RSP: 002b:00007ff268e0ed98 EFLAGS: 00000206 ORIG_RAX: 0000000000000001
> RAX: ffffffffffffffda RBX: 00007ff268e0f9c0 RCX: 00007ff26e6f5b79
> RDX: 0000000000000010 RSI: 0000000020f50fe1 RDI: 0000000000000003
> RBP: 00007ff26ebc1220 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000206 R12: 0000000000000000
> R13: 00007ff268e0f9c0 R14: 00007ff26efec040 R15: 0000000000000003
>
> The buggy address belongs to the object at ffff880062da0000
>  which belongs to the cache RAWv6 of size 1504
> The buggy address ffff880062da0060 is located 96 bytes inside
>  of 1504-byte region [ffff880062da0000, ffff880062da05e0)
>
> Freed by task 4113:
>  save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:57
>  save_stack+0x43/0xd0 mm/kasan/kasan.c:502
>  set_track mm/kasan/kasan.c:514
>  kasan_slab_free+0x73/0xc0 mm/kasan/kasan.c:578
>  slab_free_hook mm/slub.c:1352
>  slab_free_freelist_hook mm/slub.c:1374
>  slab_free mm/slub.c:2951
>  kmem_cache_free+0xb2/0x2c0 mm/slub.c:2973
>  sk_prot_free net/core/sock.c:1377
>  __sk_destruct+0x49c/0x6e0 net/core/sock.c:1452
>  sk_destruct+0x47/0x80 net/core/sock.c:1460
>  __sk_free+0x57/0x230 net/core/sock.c:1468
>  sk_free+0x23/0x30 net/core/sock.c:1479
>  sock_put ./include/net/sock.h:1638
>  sk_common_release+0x31e/0x4e0 net/core/sock.c:2782
>  rawv6_close+0x54/0x80 net/ipv6/raw.c:1214
>  inet_release+0xed/0x1c0 net/ipv4/af_inet.c:425
>  inet6_release+0x50/0x70 net/ipv6/af_inet6.c:431
>  sock_release+0x8d/0x1e0 net/socket.c:599
>  sock_close+0x16/0x20 net/socket.c:1063
>  __fput+0x332/0x7f0 fs/file_table.c:208
>  ____fput+0x15/0x20 fs/file_table.c:244
>  task_work_run+0x19b/0x270 kernel/task_work.c:116
>  exit_task_work ./include/linux/task_work.h:21
>  do_exit+0x186b/0x2800 kernel/exit.c:839
>  do_group_exit+0x149/0x420 kernel/exit.c:943
>  SYSC_exit_group kernel/exit.c:954
>  SyS_exit_group+0x1d/0x20 kernel/exit.c:952
>  entry_SYSCALL_64_fastpath+0x1f/0xc2 arch/x86/entry/entry_64.S:203
>
> Allocated by task 4115:
>  save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:57
>  save_stack+0x43/0xd0 mm/kasan/kasan.c:502
>  set_track mm/kasan/kasan.c:514
>  kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:605
>  kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:544
>  slab_post_alloc_hook mm/slab.h:432
>  slab_alloc_node mm/slub.c:2708
>  slab_alloc mm/slub.c:2716
>  kmem_cache_alloc+0x1af/0x250 mm/slub.c:2721
>  sk_prot_alloc+0x65/0x2a0 net/core/sock.c:1334
>  sk_alloc+0x105/0x1010 net/core/sock.c:1396
>  inet6_create+0x44d/0x1150 net/ipv6/af_inet6.c:183
>  __sock_create+0x4f6/0x880 net/socket.c:1199
>  sock_create net/socket.c:1239
>  SYSC_socket net/socket.c:1269
>  SyS_socket+0xf9/0x230 net/socket.c:1249
>  entry_SYSCALL_64_fastpath+0x1f/0xc2 arch/x86/entry/entry_64.S:203
>
> Memory state around the buggy address:
>  ffff880062d9ff00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>  ffff880062d9ff80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>>ffff880062da0000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>                                                        ^
>  ffff880062da0080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>  ffff880062da0100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> ==================================================================

Sometimes this reproducer leads to another report:

INFO: rcu_sched self-detected stall on CPU
1-...: (1 GPs behind) idle=ead/140000000000001/0 softirq=8122/8123 fqs=6497
(t=26000 jiffies g=3021 c=3020 q=345)
Task dump for CPU 1:
syz-executor    R  running task    18904  3943   3941 0x0000000c
Call Trace:
 <IRQ>
 sched_show_task+0x3fa/0x560 kernel/sched/core.c:5217
 dump_cpu_task+0x71/0x90 kernel/sched/core.c:8822
 rcu_dump_cpu_stacks+0x318/0x35e kernel/rcu/tree.c:1290
 print_cpu_stall+0x39f/0x6e0 kernel/rcu/tree.c:1434
 check_cpu_stall.isra.63+0x702/0xe80 kernel/rcu/tree.c:1502
 __rcu_pending kernel/rcu/tree.c:3469
 rcu_pending kernel/rcu/tree.c:3533
 rcu_check_callbacks+0x27f/0xda0 kernel/rcu/tree.c:2867
 update_process_times+0x30/0x60 kernel/time/timer.c:1612
 tick_sched_handle.isra.18+0xb3/0xe0 kernel/time/tick-sched.c:151
 tick_sched_timer+0x72/0x120 kernel/time/tick-sched.c:1158
 __run_hrtimer kernel/time/hrtimer.c:1238
 __hrtimer_run_queues+0x38c/0xf80 kernel/time/hrtimer.c:1302
 hrtimer_interrupt+0x1ab/0x5c0 kernel/time/hrtimer.c:1336
 local_apic_timer_interrupt+0x6f/0xe0 arch/x86/kernel/apic/apic.c:936
 smp_apic_timer_interrupt+0x71/0xa0 arch/x86/kernel/apic/apic.c:960
 apic_timer_interrupt+0x93/0xa0
RIP: 0010:__sanitizer_cov_trace_pc+0x46/0x60 kernel/kcov.c:93
RSP: 0018:ffff88006ad66a98 EFLAGS: 00000216 ORIG_RAX: ffffffffffffff10
RAX: 0000000000004000 RBX: ffff880068f4e500 RCX: ffffc90000e6c000
RDX: 0000000000004000 RSI: ffffffff83d652b7 RDI: ffff880064f09f51
RBP: ffff88006ad66a98 R08: ffffed000d633ca1 R09: ffffed000d633ca1
R10: 0000000000000001 R11: ffffed000d633ca0 R12: ffff880064f00020
R13: ffff88006ad66c28 R14: 0000000000009f38 R15: dffffc0000000000
 </IRQ>
 _decode_session6+0x8a7/0x13f0 net/ipv6/xfrm6_policy.c:147
 __xfrm_decode_session+0x63/0x100 net/xfrm/xfrm_policy.c:2475
 xfrm_decode_session_reverse ./include/net/xfrm.h:1117
 icmpv6_route_lookup+0x410/0x780 net/ipv6/icmp.c:362
 icmp6_send+0x1611/0x29b0 net/ipv6/icmp.c:515
 icmpv6_send+0x12e/0x260 net/ipv6/ip6_icmp.c:42
 ip6_fragment+0x583/0x3920 net/ipv6/ip6_output.c:864
 ip6_finish_output+0x322/0x960 net/ipv6/ip6_output.c:146
 NF_HOOK_COND ./include/linux/netfilter.h:246
 ip6_output+0x1cb/0x8d0 net/ipv6/ip6_output.c:162
 dst_output ./include/net/dst.h:501
 ip6_local_out+0x95/0x170 net/ipv6/output_core.c:172
 ip6_send_skb+0xa1/0x340 net/ipv6/ip6_output.c:1722
 ip6_push_pending_frames+0xb3/0xe0 net/ipv6/ip6_output.c:1742
 rawv6_push_pending_frames net/ipv6/raw.c:613
 rawv6_sendmsg+0x2cff/0x4130 net/ipv6/raw.c:927
 inet_sendmsg+0x164/0x5b0 net/ipv4/af_inet.c:744
 sock_sendmsg_nosec net/socket.c:635
 sock_sendmsg+0xca/0x110 net/socket.c:645
 sock_write_iter+0x326/0x620 net/socket.c:848
 new_sync_write fs/read_write.c:499
 __vfs_write+0x483/0x760 fs/read_write.c:512
 vfs_write+0x187/0x530 fs/read_write.c:560
 SYSC_write fs/read_write.c:607
 SyS_write+0xfb/0x230 fs/read_write.c:599
 entry_SYSCALL_64_fastpath+0x1f/0xc2 arch/x86/entry/entry_64.S:203
RIP: 0033:0x4421d9
RSP: 002b:00007f090e289b58 EFLAGS: 00000296 ORIG_RAX: 0000000000000001
RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 00000000004421d9
RDX: 000000000000ffff RSI: 0000000020aa4fda RDI: 0000000000000005
RBP: 00000000006de8c0 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000296 R12: 0000000000700000
R13: ffffffffffffffff R14: 0000000020f4a000 R15: 0000000000000010

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox