Netdev List

Netdev List
 help / color / mirror / Atom feed

* [net-next][PATCH 16/18] RDS: make message size limit compliant with spec
From: Santosh Shilimkar @ 2016-12-05  6:57 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel, Santosh Shilimkar
In-Reply-To: <1480921073-9140-1-git-send-email-santosh.shilimkar@oracle.com>

From: Avinash Repaka <avinash.repaka@oracle.com>

RDS supports a maximum message size of 1M, but the code doesn't check
this in all cases. This patch fixes it for RDMA and non-RDMA messages and
for the RDS MR size, and it is enforced irrespective of the underlying transport.

Signed-off-by: Avinash Repaka <avinash.repaka@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/rdma.c |  9 ++++++++-
 net/rds/rds.h  |  3 +++
 net/rds/send.c | 31 +++++++++++++++++++++++++++++++
 3 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index dd508e0..6e8db33 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -40,7 +40,6 @@
 /*
  * XXX
  *  - build with sparse
- *  - should we limit the size of a mr region?  let transport return failure?
  *  - should we detect duplicate keys on a socket?  hmm.
  *  - an rdma is an mlock, apply rlimit?
  */
@@ -200,6 +199,14 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
 		goto out;
 	}
 
+	/* Restrict the size of mr irrespective of underlying transport
+	 * To account for unaligned mr regions, subtract one from nr_pages
+	 */
+	if ((nr_pages - 1) > (RDS_MAX_MSG_SIZE >> PAGE_SHIFT)) {
+		ret = -EMSGSIZE;
+		goto out;
+	}
+
 	rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n",
 		args->vec.addr, args->vec.bytes, nr_pages);
 
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 8ccd5a9..f713194 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -50,6 +50,9 @@ void rdsdebug(char *fmt, ...)
 #define RDS_FRAG_SHIFT	12
 #define RDS_FRAG_SIZE	((unsigned int)(1 << RDS_FRAG_SHIFT))
 
+/* Used to limit both RDMA and non-RDMA RDS message to 1MB */
+#define RDS_MAX_MSG_SIZE	((unsigned int)(1 << 20))
+
 #define RDS_CONG_MAP_BYTES	(65536 / 8)
 #define RDS_CONG_MAP_PAGES	(PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
 #define RDS_CONG_MAP_PAGE_BITS	(PAGE_SIZE * 8)
diff --git a/net/rds/send.c b/net/rds/send.c
index 45e025b..5cc6403 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -994,6 +994,26 @@ static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn)
 	return hash;
 }
 
+static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes)
+{
+	struct rds_rdma_args *args;
+	struct cmsghdr *cmsg;
+
+	for_each_cmsghdr(cmsg, msg) {
+		if (!CMSG_OK(msg, cmsg))
+			return -EINVAL;
+
+		if (cmsg->cmsg_level != SOL_RDS)
+			continue;
+
+		if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) {
+			args = CMSG_DATA(cmsg);
+			*rdma_bytes += args->remote_vec.bytes;
+		}
+	}
+	return 0;
+}
+
 int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 {
 	struct sock *sk = sock->sk;
@@ -1008,6 +1028,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 	int nonblock = msg->msg_flags & MSG_DONTWAIT;
 	long timeo = sock_sndtimeo(sk, nonblock);
 	struct rds_conn_path *cpath;
+	size_t total_payload_len = payload_len, rdma_payload_len = 0;
 
 	/* Mirror Linux UDP mirror of BSD error message compatibility */
 	/* XXX: Perhaps MSG_MORE someday */
@@ -1040,6 +1061,16 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 	}
 	release_sock(sk);
 
+	ret = rds_rdma_bytes(msg, &rdma_payload_len);
+	if (ret)
+		goto out;
+
+	total_payload_len += rdma_payload_len;
+	if (max_t(size_t, payload_len, rdma_payload_len) > RDS_MAX_MSG_SIZE) {
+		ret = -EMSGSIZE;
+		goto out;
+	}
+
 	if (payload_len > rds_sk_sndbuf(rs)) {
 		ret = -EMSGSIZE;
 		goto out;
-- 
1.9.1

^ permalink raw reply related

* [net-next][PATCH 07/18] RDS: RDMA: return appropriate error on rdma map failures
From: Santosh Shilimkar @ 2016-12-05  6:57 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel, Santosh Shilimkar
In-Reply-To: <1480921073-9140-1-git-send-email-santosh.shilimkar@oracle.com>

The first message to a remote node should prompt a new
connection even if it is an RDMA operation. For an RDMA operation,
the MR mapping can fail because the connection is not yet up.

Since connection establishment is asynchronous,
we make sure that a map failure caused by an unavailable
connection reaches the user with an appropriate error code.
Before returning to the user, let's trigger the connection
so that it's ready for the next retry.

Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/send.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/net/rds/send.c b/net/rds/send.c
index bb13c56..0a6f38b 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -945,6 +945,11 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
 			ret = rds_cmsg_rdma_map(rs, rm, cmsg);
 			if (!ret)
 				*allocated_mr = 1;
+			else if (ret == -ENODEV)
+				/* Accommodate the get_mr() case which can fail
+				 * if connection isn't established yet.
+				 */
+				ret = -EAGAIN;
 			break;
 		case RDS_CMSG_ATOMIC_CSWP:
 		case RDS_CMSG_ATOMIC_FADD:
@@ -1082,8 +1087,12 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 
 	/* Parse any control messages the user may have included. */
 	ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
-	if (ret)
+	if (ret) {
+		/* Trigger connection so that its ready for the next retry */
+		if (ret ==  -EAGAIN)
+			rds_conn_connect_if_down(conn);
 		goto out;
+	}
 
 	if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) {
 		printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
-- 
1.9.1

^ permalink raw reply related

* [net-next][PATCH 14/18] RDS: IB: fix panic due to handlers running post teardown
From: Santosh Shilimkar @ 2016-12-05  6:57 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel, Santosh Shilimkar
In-Reply-To: <1480921073-9140-1-git-send-email-santosh.shilimkar@oracle.com>

The shutdown code's reaping loop takes care of emptying the
CQs before they are destroyed. And once the tasklets are
killed, the handlers are not expected to run.

But because of core tasklet code issues, a tasklet handler could
still run even after tasklet_kill().
The RDS IB shutdown code already reaps the CQs before freeing
cq/qp resources, so the handlers have nothing left
to do post shutdown.

On the other hand, any handler running after teardown and trying
to access already freed qp/cq resources causes issues.
This patch fixes this race by making sure that the handlers return
without any action post teardown.

Reviewed-by: Wengang <wen.gang.wang@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/ib.h    |  1 +
 net/rds/ib_cm.c | 12 ++++++++++++
 2 files changed, 13 insertions(+)

diff --git a/net/rds/ib.h b/net/rds/ib.h
index 4b133b8..8efd1eb 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -185,6 +185,7 @@ struct rds_ib_connection {
 
 	/* Endpoint role in connection */
 	int			i_active_side;
+	atomic_t		i_cq_quiesce;
 
 	/* Send/Recv vectors */
 	int			i_scq_vector;
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 33c8584..ce3775a 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -128,6 +128,8 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 			  ic->i_flowctl ? ", flow control" : "");
 	}
 
+	atomic_set(&ic->i_cq_quiesce, 0);
+
 	/* Init rings and fill recv. this needs to wait until protocol
 	 * negotiation is complete, since ring layout is different
 	 * from 3.1 to 4.1.
@@ -267,6 +269,10 @@ static void rds_ib_tasklet_fn_send(unsigned long data)
 
 	rds_ib_stats_inc(s_ib_tasklet_call);
 
+	/* if cq has been already reaped, ignore incoming cq event */
+	if (atomic_read(&ic->i_cq_quiesce))
+		return;
+
 	poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
 	ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
 	poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
@@ -308,6 +314,10 @@ static void rds_ib_tasklet_fn_recv(unsigned long data)
 
 	rds_ib_stats_inc(s_ib_tasklet_call);
 
+	/* if cq has been already reaped, ignore incoming cq event */
+	if (atomic_read(&ic->i_cq_quiesce))
+		return;
+
 	memset(&state, 0, sizeof(state));
 	poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
 	ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
@@ -804,6 +814,8 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
 		tasklet_kill(&ic->i_send_tasklet);
 		tasklet_kill(&ic->i_recv_tasklet);
 
+		atomic_set(&ic->i_cq_quiesce, 1);
+
 		/* first destroy the ib state that generates callbacks */
 		if (ic->i_cm_id->qp)
 			rdma_destroy_qp(ic->i_cm_id);
-- 
1.9.1

^ permalink raw reply related

* [net-next][PATCH 18/18] RDS: IB: add missing connection cache usage info
From: Santosh Shilimkar @ 2016-12-05  6:57 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel, Santosh Shilimkar
In-Reply-To: <1480921073-9140-1-git-send-email-santosh.shilimkar@oracle.com>

rds-tools already supports it.

Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 include/uapi/linux/rds.h | 1 +
 net/rds/ib.c             | 1 +
 2 files changed, 2 insertions(+)

diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h
index 3833113..410ae3c 100644
--- a/include/uapi/linux/rds.h
+++ b/include/uapi/linux/rds.h
@@ -183,6 +183,7 @@ struct rds_info_rdma_connection {
 	uint32_t	max_send_sge;
 	uint32_t	rdma_mr_max;
 	uint32_t	rdma_mr_size;
+	uint32_t	cache_allocs;
 };
 
 /* RDS message Receive Path Latency points */
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 8d70884..b5e2699 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -313,6 +313,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
 		iinfo->max_send_wr = ic->i_send_ring.w_nr;
 		iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
 		iinfo->max_send_sge = rds_ibdev->max_sge;
+		iinfo->cache_allocs = atomic_read(&ic->i_cache_allocs);
 		rds_ib_get_mr_info(rds_ibdev, iinfo);
 	}
 	return 1;
-- 
1.9.1

^ permalink raw reply related

* [net-next][PATCH 12/18] RDS: IB: Add vector spreading for cqs
From: Santosh Shilimkar @ 2016-12-05  6:57 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel, Santosh Shilimkar
In-Reply-To: <1480921073-9140-1-git-send-email-santosh.shilimkar@oracle.com>

Based on the available device vectors, allocate CQs accordingly to
get a better spread of completion vectors, which helps performance
a great deal.

Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/ib.c    | 11 +++++++++++
 net/rds/ib.h    |  5 +++++
 net/rds/ib_cm.c | 40 +++++++++++++++++++++++++++++++++++++---
 3 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/net/rds/ib.c b/net/rds/ib.c
index 5680d90..8d70884 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -111,6 +111,9 @@ static void rds_ib_dev_free(struct work_struct *work)
 		kfree(i_ipaddr);
 	}
 
+	if (rds_ibdev->vector_load)
+		kfree(rds_ibdev->vector_load);
+
 	kfree(rds_ibdev);
 }
 
@@ -159,6 +162,14 @@ static void rds_ib_add_one(struct ib_device *device)
 	rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom;
 	rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom;
 
+	rds_ibdev->vector_load = kzalloc(sizeof(int) * device->num_comp_vectors,
+					 GFP_KERNEL);
+	if (!rds_ibdev->vector_load) {
+		pr_err("RDS/IB: %s failed to allocate vector memory\n",
+			__func__);
+		goto put_dev;
+	}
+
 	rds_ibdev->dev = device;
 	rds_ibdev->pd = ib_alloc_pd(device, 0);
 	if (IS_ERR(rds_ibdev->pd)) {
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 4987387..4b133b8 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -185,6 +185,10 @@ struct rds_ib_connection {
 
 	/* Endpoint role in connection */
 	int			i_active_side;
+
+	/* Send/Recv vectors */
+	int			i_scq_vector;
+	int			i_rcq_vector;
 };
 
 /* This assumes that atomic_t is at least 32 bits */
@@ -227,6 +231,7 @@ struct rds_ib_device {
 	spinlock_t		spinlock;	/* protect the above */
 	atomic_t		refcount;
 	struct work_struct	free_work;
+	int			*vector_load;
 };
 
 #define ibdev_to_node(ibdev) dev_to_node(ibdev->dma_device)
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 4d1bf04..33c8584 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -358,6 +358,28 @@ static void rds_ib_cq_comp_handler_send(struct ib_cq *cq, void *context)
 	tasklet_schedule(&ic->i_send_tasklet);
 }
 
+static inline int ibdev_get_unused_vector(struct rds_ib_device *rds_ibdev)
+{
+	int min = rds_ibdev->vector_load[rds_ibdev->dev->num_comp_vectors - 1];
+	int index = rds_ibdev->dev->num_comp_vectors - 1;
+	int i;
+
+	for (i = rds_ibdev->dev->num_comp_vectors - 1; i >= 0; i--) {
+		if (rds_ibdev->vector_load[i] < min) {
+			index = i;
+			min = rds_ibdev->vector_load[i];
+		}
+	}
+
+	rds_ibdev->vector_load[index]++;
+	return index;
+}
+
+static inline void ibdev_put_vector(struct rds_ib_device *rds_ibdev, int index)
+{
+	rds_ibdev->vector_load[index]--;
+}
+
 /*
  * This needs to be very careful to not leave IS_ERR pointers around for
  * cleanup to trip over.
@@ -399,25 +421,30 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	/* Protection domain and memory range */
 	ic->i_pd = rds_ibdev->pd;
 
+	ic->i_scq_vector = ibdev_get_unused_vector(rds_ibdev);
 	cq_attr.cqe = ic->i_send_ring.w_nr + fr_queue_space + 1;
-
+	cq_attr.comp_vector = ic->i_scq_vector;
 	ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send,
 				     rds_ib_cq_event_handler, conn,
 				     &cq_attr);
 	if (IS_ERR(ic->i_send_cq)) {
 		ret = PTR_ERR(ic->i_send_cq);
 		ic->i_send_cq = NULL;
+		ibdev_put_vector(rds_ibdev, ic->i_scq_vector);
 		rdsdebug("ib_create_cq send failed: %d\n", ret);
 		goto out;
 	}
 
+	ic->i_rcq_vector = ibdev_get_unused_vector(rds_ibdev);
 	cq_attr.cqe = ic->i_recv_ring.w_nr;
+	cq_attr.comp_vector = ic->i_rcq_vector;
 	ic->i_recv_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_recv,
 				     rds_ib_cq_event_handler, conn,
 				     &cq_attr);
 	if (IS_ERR(ic->i_recv_cq)) {
 		ret = PTR_ERR(ic->i_recv_cq);
 		ic->i_recv_cq = NULL;
+		ibdev_put_vector(rds_ibdev, ic->i_rcq_vector);
 		rdsdebug("ib_create_cq recv failed: %d\n", ret);
 		goto out;
 	}
@@ -780,10 +807,17 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
 		/* first destroy the ib state that generates callbacks */
 		if (ic->i_cm_id->qp)
 			rdma_destroy_qp(ic->i_cm_id);
-		if (ic->i_send_cq)
+		if (ic->i_send_cq) {
+			if (ic->rds_ibdev)
+				ibdev_put_vector(ic->rds_ibdev, ic->i_scq_vector);
 			ib_destroy_cq(ic->i_send_cq);
-		if (ic->i_recv_cq)
+		}
+
+		if (ic->i_recv_cq) {
+			if (ic->rds_ibdev)
+				ibdev_put_vector(ic->rds_ibdev, ic->i_rcq_vector);
 			ib_destroy_cq(ic->i_recv_cq);
+		}
 
 		/* then free the resources that ib callbacks use */
 		if (ic->i_send_hdrs)
-- 
1.9.1

^ permalink raw reply related

* [net-next][PATCH 17/18] RDS: add receive message trace used by application
From: Santosh Shilimkar @ 2016-12-05  6:57 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel, Santosh Shilimkar
In-Reply-To: <1480921073-9140-1-git-send-email-santosh.shilimkar@oracle.com>

Socket option to tap receive path latency in various stages,
in nanoseconds. It can be enabled on selected sockets using
the SO_RDS_MSG_RXPATH_LATENCY socket option. RDS will return
the data to the application with RDS_CMSG_RXPATH_LATENCY in the defined
format. Scope is left to add more trace points in the future
without needing to change the interface.

Reviewed-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 include/uapi/linux/rds.h | 33 +++++++++++++++++++++++++++++++++
 net/rds/af_rds.c         | 28 ++++++++++++++++++++++++++++
 net/rds/ib_recv.c        |  4 ++++
 net/rds/rds.h            | 10 ++++++++++
 net/rds/recv.c           | 32 +++++++++++++++++++++++++++++---
 net/rds/tcp_recv.c       |  5 +++++
 6 files changed, 109 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h
index 0f9265c..3833113 100644
--- a/include/uapi/linux/rds.h
+++ b/include/uapi/linux/rds.h
@@ -52,6 +52,13 @@
 #define RDS_GET_MR_FOR_DEST		7
 #define SO_RDS_TRANSPORT		8
 
+/* Socket option to tap receive path latency
+ *	SO_RDS: SO_RDS_MSG_RXPATH_LATENCY
+ *	Format used struct rds_rx_trace_so
+ */
+#define SO_RDS_MSG_RXPATH_LATENCY	10
+
+
 /* supported values for SO_RDS_TRANSPORT */
 #define	RDS_TRANS_IB	0
 #define	RDS_TRANS_IWARP	1
@@ -77,6 +84,12 @@
  *	the same as for the GET_MR setsockopt.
  * RDS_CMSG_RDMA_STATUS (recvmsg)
  *	Returns the status of a completed RDMA operation.
+ * RDS_CMSG_RXPATH_LATENCY(recvmsg)
+ *	Returns rds message latencies in various stages of receive
+ *	path in ns. It's set per socket using SO_RDS_MSG_RXPATH_LATENCY
+ *	socket option. Legitimate points are defined in
+ *	enum rds_message_rxpath_latency. More points can be added in
+ *	future. CMSG format is struct rds_cmsg_rx_trace.
  */
 #define RDS_CMSG_RDMA_ARGS		1
 #define RDS_CMSG_RDMA_DEST		2
@@ -87,6 +100,7 @@
 #define RDS_CMSG_ATOMIC_CSWP		7
 #define RDS_CMSG_MASKED_ATOMIC_FADD	8
 #define RDS_CMSG_MASKED_ATOMIC_CSWP	9
+#define RDS_CMSG_RXPATH_LATENCY		11
 
 #define RDS_INFO_FIRST			10000
 #define RDS_INFO_COUNTERS		10000
@@ -171,6 +185,25 @@ struct rds_info_rdma_connection {
 	uint32_t	rdma_mr_size;
 };
 
+/* RDS message Receive Path Latency points */
+enum rds_message_rxpath_latency {
+	RDS_MSG_RX_HDR_TO_DGRAM_START = 0,
+	RDS_MSG_RX_DGRAM_REASSEMBLE,
+	RDS_MSG_RX_DGRAM_DELIVERED,
+	RDS_MSG_RX_DGRAM_TRACE_MAX
+};
+
+struct rds_rx_trace_so {
+	u8 rx_traces;
+	u8 rx_trace_pos[RDS_MSG_RX_DGRAM_TRACE_MAX];
+};
+
+struct rds_cmsg_rx_trace {
+	u8 rx_traces;
+	u8 rx_trace_pos[RDS_MSG_RX_DGRAM_TRACE_MAX];
+	u64 rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX];
+};
+
 /*
  * Congestion monitoring.
  * Congestion control in RDS happens at the host connection
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 2ac1e61..fd821740 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -298,6 +298,30 @@ static int rds_enable_recvtstamp(struct sock *sk, char __user *optval,
 	return 0;
 }
 
+static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval,
+				  int optlen)
+{
+	struct rds_rx_trace_so trace;
+	int i;
+
+	if (optlen != sizeof(struct rds_rx_trace_so))
+		return -EFAULT;
+
+	if (copy_from_user(&trace, optval, sizeof(trace)))
+		return -EFAULT;
+
+	rs->rs_rx_traces = trace.rx_traces;
+	for (i = 0; i < rs->rs_rx_traces; i++) {
+		if (trace.rx_trace_pos[i] > RDS_MSG_RX_DGRAM_TRACE_MAX) {
+			rs->rs_rx_traces = 0;
+			return -EFAULT;
+		}
+		rs->rs_rx_trace[i] = trace.rx_trace_pos[i];
+	}
+
+	return 0;
+}
+
 static int rds_setsockopt(struct socket *sock, int level, int optname,
 			  char __user *optval, unsigned int optlen)
 {
@@ -338,6 +362,9 @@ static int rds_setsockopt(struct socket *sock, int level, int optname,
 		ret = rds_enable_recvtstamp(sock->sk, optval, optlen);
 		release_sock(sock->sk);
 		break;
+	case SO_RDS_MSG_RXPATH_LATENCY:
+		ret = rds_recv_track_latency(rs, optval, optlen);
+		break;
 	default:
 		ret = -ENOPROTOOPT;
 	}
@@ -484,6 +511,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
 	INIT_LIST_HEAD(&rs->rs_cong_list);
 	spin_lock_init(&rs->rs_rdma_lock);
 	rs->rs_rdma_keys = RB_ROOT;
+	rs->rs_rx_traces = 0;
 
 	spin_lock_bh(&rds_sock_lock);
 	list_add_tail(&rs->rs_item, &rds_sock_list);
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 4b0f126..e10624a 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -911,8 +911,12 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 		ic->i_ibinc = ibinc;
 
 		hdr = &ibinc->ii_inc.i_hdr;
+		ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
+				local_clock();
 		memcpy(hdr, ihdr, sizeof(*hdr));
 		ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
+		ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_START] =
+				local_clock();
 
 		rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc,
 			 ic->i_recv_data_rem, hdr->h_flags);
diff --git a/net/rds/rds.h b/net/rds/rds.h
index f713194..07fff73 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -253,6 +253,11 @@ struct rds_ext_header_rdma_dest {
 #define RDS_EXTHDR_GEN_NUM	6
 
 #define __RDS_EXTHDR_MAX	16 /* for now */
+#define RDS_RX_MAX_TRACES	(RDS_MSG_RX_DGRAM_TRACE_MAX + 1)
+#define	RDS_MSG_RX_HDR		0
+#define	RDS_MSG_RX_START	1
+#define	RDS_MSG_RX_END		2
+#define	RDS_MSG_RX_CMSG		3
 
 struct rds_incoming {
 	atomic_t		i_refcount;
@@ -265,6 +270,7 @@ struct rds_incoming {
 
 	rds_rdma_cookie_t	i_rdma_cookie;
 	struct timeval		i_rx_tstamp;
+	u64			i_rx_lat_trace[RDS_RX_MAX_TRACES];
 };
 
 struct rds_mr {
@@ -575,6 +581,10 @@ struct rds_sock {
 	unsigned char		rs_recverr,
 				rs_cong_monitor;
 	u32			rs_hash_initval;
+
+	/* Socket receive path trace points*/
+	u8			rs_rx_traces;
+	u8			rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX];
 };
 
 static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
diff --git a/net/rds/recv.c b/net/rds/recv.c
index ba19eee..8b7e7b7 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -43,6 +43,8 @@
 void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
 		  __be32 saddr)
 {
+	int i;
+
 	atomic_set(&inc->i_refcount, 1);
 	INIT_LIST_HEAD(&inc->i_item);
 	inc->i_conn = conn;
@@ -50,6 +52,9 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
 	inc->i_rdma_cookie = 0;
 	inc->i_rx_tstamp.tv_sec = 0;
 	inc->i_rx_tstamp.tv_usec = 0;
+
+	for (i = 0; i < RDS_RX_MAX_TRACES; i++)
+		inc->i_rx_lat_trace[i] = 0;
 }
 EXPORT_SYMBOL_GPL(rds_inc_init);
 
@@ -373,6 +378,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
 		if (sock_flag(sk, SOCK_RCVTSTAMP))
 			do_gettimeofday(&inc->i_rx_tstamp);
 		rds_inc_addref(inc);
+		inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock();
 		list_add_tail(&inc->i_item, &rs->rs_recv_queue);
 		__rds_wake_sk_sleep(sk);
 	} else {
@@ -534,7 +540,7 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
 		ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
 				sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie);
 		if (ret)
-			return ret;
+			goto out;
 	}
 
 	if ((inc->i_rx_tstamp.tv_sec != 0) &&
@@ -543,10 +549,30 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
 			       sizeof(struct timeval),
 			       &inc->i_rx_tstamp);
 		if (ret)
-			return ret;
+			goto out;
 	}
 
-	return 0;
+	if (rs->rs_rx_traces) {
+		struct rds_cmsg_rx_trace t;
+		int i, j;
+
+		inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock();
+		t.rx_traces =  rs->rs_rx_traces;
+		for (i = 0; i < rs->rs_rx_traces; i++) {
+			j = rs->rs_rx_trace[i];
+			t.rx_trace_pos[i] = j;
+			t.rx_trace[i] = inc->i_rx_lat_trace[j + 1] -
+					  inc->i_rx_lat_trace[j];
+		}
+
+		ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RXPATH_LATENCY,
+			       sizeof(t), &t);
+		if (ret)
+			goto out;
+	}
+
+out:
+	return ret;
 }
 
 int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index ad4892e..e006ef8 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -180,6 +180,9 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
 			rdsdebug("alloced tinc %p\n", tinc);
 			rds_inc_path_init(&tinc->ti_inc, cp,
 					  cp->cp_conn->c_faddr);
+			tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
+					local_clock();
+
 			/*
 			 * XXX * we might be able to use the __ variants when
 			 * we've already serialized at a higher level.
@@ -204,6 +207,8 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
 				/* could be 0 for a 0 len message */
 				tc->t_tinc_data_rem =
 					be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
+				tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_START] =
+					local_clock();
 			}
 		}
 
-- 
1.9.1

^ permalink raw reply related

* Re: net: use-after-free in worker_thread
From: Herbert Xu @ 2016-12-05  7:19 UTC (permalink / raw)
  To: Cong Wang
  Cc: Andrey Konovalov, David S. Miller, Johannes Berg,
	Florian Westphal, Eric Dumazet, Bob Copeland, Tom Herbert,
	David Decotigny, netdev, LKML
In-Reply-To: <CAM_iQpWo42AnAT-LMJNSOL=3NQswRwHxPNC_4FAk3teAPEQDJg@mail.gmail.com>

On Sat, Dec 03, 2016 at 10:14:48AM -0800, Cong Wang wrote:
> On Sat, Dec 3, 2016 at 9:41 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > On Sat, Dec 3, 2016 at 4:56 AM, Andrey Konovalov <andreyknvl@google.com> wrote:
> >> Hi!
> >>
> >> I'm seeing lots of the following error reports while running the
> >> syzkaller fuzzer.
> >>
> >> Reports appeared when I updated to 3c49de52 (Dec 2) from 2caceb32 (Dec 1).
> >>
> >> ==================================================================
> >> BUG: KASAN: use-after-free in worker_thread+0x17d8/0x18a0
> >> Read of size 8 at addr ffff880067f3ecd8 by task kworker/3:1/774
> >>
> >> page:ffffea00019fce00 count:1 mapcount:0 mapping:          (null)
> >> index:0xffff880067f39c10 compound_mapcount: 0
> >> flags: 0x500000000004080(slab|head)
> >> page dumped because: kasan: bad access detected
> >>
> >> CPU: 3 PID: 774 Comm: kworker/3:1 Not tainted 4.9.0-rc7+ #66
> >> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
> >>  ffff88006c267838 ffffffff81f882da ffffffff6c25e338 1ffff1000d84ce9a
> >>  ffffed000d84ce92 ffff88006c25e340 0000000041b58ab3 ffffffff8541e198
> >>  ffffffff81f88048 0000000100000000 0000000041b58ab3 ffffffff853d3ee8
> >> Call Trace:
> >>  [<     inline     >] __dump_stack lib/dump_stack.c:15
> >>  [<ffffffff81f882da>] dump_stack+0x292/0x398 lib/dump_stack.c:51
> >>  [<     inline     >] describe_address mm/kasan/report.c:262
> >>  [<ffffffff817e50d1>] kasan_report_error+0x121/0x560 mm/kasan/report.c:368
> >>  [<     inline     >] kasan_report mm/kasan/report.c:390
> >>  [<ffffffff817e560e>] __asan_report_load8_noabort+0x3e/0x40
> >> mm/kasan/report.c:411
> >>  [<ffffffff81329b88>] worker_thread+0x17d8/0x18a0 kernel/workqueue.c:2228
> >>  [<ffffffff8133ebf3>] kthread+0x323/0x3e0 kernel/kthread.c:209
> >>  [<ffffffff84a2a22a>] ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:433
> >
> > Heck... this is the pending work vs. sk_destruct() race. :-/
> > We can't wait for the work in RCU callback, let me think about it...

Sorry, my patch was obviously crap as it was trying to delay the
freeing of a socket from sk_destruct which can't be done.

> Please try the attached patch, I only did compile test, I can't access
> my desktop now, so can't do further tests.

Thanks for the patch.  It'll obviously work but I wanted to avoid that
because it penalises the common path for the rare case.

Andrey, please try this patch and let me know if it's any better.

---8<---
Subject: netlink: Do not schedule work from sk_destruct

It is wrong to schedule a work from sk_destruct using the socket
as the memory reserve because the socket will be freed immediately
after the return from sk_destruct.

Instead we should do the deferral prior to sk_free.

This patch does just that.

Fixes: 707693c8a498 ("netlink: Call cb->done from a worker thread")
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 602e5eb..8a642c5 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -322,11 +322,13 @@ static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
 	sk_mem_charge(sk, skb->truesize);
 }
 
-static void __netlink_sock_destruct(struct sock *sk)
+static void netlink_sock_destruct(struct sock *sk)
 {
 	struct netlink_sock *nlk = nlk_sk(sk);
 
 	if (nlk->cb_running) {
+		if (nlk->cb.done)
+			nlk->cb.done(&nlk->cb);
 		module_put(nlk->cb.module);
 		kfree_skb(nlk->cb.skb);
 	}
@@ -348,21 +350,7 @@ static void netlink_sock_destruct_work(struct work_struct *work)
 	struct netlink_sock *nlk = container_of(work, struct netlink_sock,
 						work);
 
-	nlk->cb.done(&nlk->cb);
-	__netlink_sock_destruct(&nlk->sk);
-}
-
-static void netlink_sock_destruct(struct sock *sk)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-
-	if (nlk->cb_running && nlk->cb.done) {
-		INIT_WORK(&nlk->work, netlink_sock_destruct_work);
-		schedule_work(&nlk->work);
-		return;
-	}
-
-	__netlink_sock_destruct(sk);
+	sk_free(&nlk->sk);
 }
 
 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
@@ -664,11 +652,17 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol,
 	goto out;
 }
 
-static void deferred_put_nlk_sk(struct rcu_head *head)
+static void deferred_free_nlk_sk(struct rcu_head *head)
 {
 	struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu);
 
-	sock_put(&nlk->sk);
+	if (nlk->cb_running && nlk->cb.done) {
+		INIT_WORK(&nlk->work, netlink_sock_destruct_work);
+		schedule_work(&nlk->work);
+		return;
+	}
+
+	sk_free(&nlk->sk);
 }
 
 static int netlink_release(struct socket *sock)
@@ -743,7 +737,9 @@ static int netlink_release(struct socket *sock)
 	local_bh_disable();
 	sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);
 	local_bh_enable();
-	call_rcu(&nlk->rcu, deferred_put_nlk_sk);
+
+	if (atomic_dec_and_test(&sk->sk_refcnt))
+		call_rcu(&nlk->rcu, deferred_free_nlk_sk);
 	return 0;
 }
 
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply related

* Re: [PATCH net-next 1/3] net: ethoc: Account for duplex changes
From: Tobias Klauser @ 2016-12-05  7:20 UTC (permalink / raw)
  To: Florian Fainelli; +Cc: netdev, tremyfr, davem, thierry.reding, andrew
In-Reply-To: <20161204204030.9853-2-f.fainelli@gmail.com>

On 2016-12-04 at 21:40:28 +0100, Florian Fainelli <f.fainelli@gmail.com> wrote:
> ethoc_mdio_poll() which is our PHYLIB adjust_link callback does nothing,
> we should at least react to duplex changes and change MODER accordingly.
> Speed changes is not a problem, since the OpenCores Ethernet core seems
> to be reacting okay without us telling it.
> 
> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>

Reviewed-by: Tobias Klauser <tklauser@distanz.ch>

^ permalink raw reply

* Re: [PATCH net-next 2/3] net: ethoc: Utilize of_get_mac_address()
From: Tobias Klauser @ 2016-12-05  7:20 UTC (permalink / raw)
  To: Florian Fainelli; +Cc: netdev, tremyfr, davem, thierry.reding, andrew
In-Reply-To: <20161204204030.9853-3-f.fainelli@gmail.com>

On 2016-12-04 at 21:40:29 +0100, Florian Fainelli <f.fainelli@gmail.com> wrote:
> Do not open code getting the MAC address exclusively from the
> "local-mac-address" property, but instead use of_get_mac_address() which
> looks up the MAC address using the 3 typical property names.
> 
> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>

Reviewed-by: Tobias Klauser <tklauser@distanz.ch>

^ permalink raw reply

* Re: net: use-after-free in worker_thread
From: Herbert Xu @ 2016-12-05  7:21 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Andrey Konovalov, David S. Miller, Cong Wang, Johannes Berg,
	Florian Westphal, Eric Dumazet, Bob Copeland, Tom Herbert,
	David Decotigny, netdev, LKML, Kostya Serebryany, Dmitry Vyukov,
	syzkaller
In-Reply-To: <1480772947.18162.402.camel@edumazet-glaptop3.roam.corp.google.com>

On Sat, Dec 03, 2016 at 05:49:07AM -0800, Eric Dumazet wrote:
>
> @@ -600,6 +600,7 @@ static int __netlink_create(struct net *net, struct socket *sock,
>  	}
>  	init_waitqueue_head(&nlk->wait);
>  
> +	sock_set_flag(sk, SOCK_RCU_FREE);
>  	sk->sk_destruct = netlink_sock_destruct;
>  	sk->sk_protocol = protocol;
>  	return 0;

It's not necessarily a big deal but I just wanted to point out
that SOCK_RCU_FREE is not equivalent to the call_rcu thing that
netlink does.  The latter only does the RCU deferral for the socket
release call which is the only place where it's needed while
SOCK_RCU_FREE will force every path to do an RCU deferral.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH net-next 3/3] net: ethoc: Demote packet dropped error message to debug
From: Tobias Klauser @ 2016-12-05  7:21 UTC (permalink / raw)
  To: Florian Fainelli; +Cc: netdev, tremyfr, davem, thierry.reding, andrew
In-Reply-To: <20161204204030.9853-4-f.fainelli@gmail.com>

On 2016-12-04 at 21:40:30 +0100, Florian Fainelli <f.fainelli@gmail.com> wrote:
> Spamming the console with: net eth1: packet dropped can happen
> fairly frequently if the adapter is busy transmitting, demote the
> message to a debug print.
> 
> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>

Reviewed-by: Tobias Klauser <tklauser@distanz.ch>

^ permalink raw reply

* [v2 PATCH] netlink: Do not schedule work from sk_destruct
From: Herbert Xu @ 2016-12-05  7:26 UTC (permalink / raw)
  To: Cong Wang
  Cc: Andrey Konovalov, David S. Miller, Johannes Berg,
	Florian Westphal, Eric Dumazet, Bob Copeland, Tom Herbert,
	David Decotigny, netdev, LKML
In-Reply-To: <20161205071946.GB9496@gondor.apana.org.au>

On Mon, Dec 05, 2016 at 03:19:46PM +0800, Herbert Xu wrote:
>
> Thanks for the patch.  It'll obviously work but I wanted avoid that
> because it penalises the common path for the rare case.
> 
> Andrey, please try this patch and let me know if it's any better.
> 
> ---8<---
> Subject: netlink: Do not schedule work from sk_destruct

Crap, I screwed it up again.  Here is a v2 which moves the atomic
call into the RCU callback as otherwise the socket can be freed from
another path while we await the RCU callback.

---8<---
It is wrong to schedule a work from sk_destruct using the socket
as the memory reserve because the socket will be freed immediately
after the return from sk_destruct.

Instead we should do the deferral prior to sk_free.

This patch does just that.

Fixes: 707693c8a498 ("netlink: Call cb->done from a worker thread")
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 602e5eb..463f5cf 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -322,11 +322,13 @@ static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
 	sk_mem_charge(sk, skb->truesize);
 }
 
-static void __netlink_sock_destruct(struct sock *sk)
+static void netlink_sock_destruct(struct sock *sk)
 {
 	struct netlink_sock *nlk = nlk_sk(sk);
 
 	if (nlk->cb_running) {
+		if (nlk->cb.done)
+			nlk->cb.done(&nlk->cb);
 		module_put(nlk->cb.module);
 		kfree_skb(nlk->cb.skb);
 	}
@@ -348,21 +350,7 @@ static void netlink_sock_destruct_work(struct work_struct *work)
 	struct netlink_sock *nlk = container_of(work, struct netlink_sock,
 						work);
 
-	nlk->cb.done(&nlk->cb);
-	__netlink_sock_destruct(&nlk->sk);
-}
-
-static void netlink_sock_destruct(struct sock *sk)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-
-	if (nlk->cb_running && nlk->cb.done) {
-		INIT_WORK(&nlk->work, netlink_sock_destruct_work);
-		schedule_work(&nlk->work);
-		return;
-	}
-
-	__netlink_sock_destruct(sk);
+	sk_free(&nlk->sk);
 }
 
 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
@@ -664,11 +652,21 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol,
 	goto out;
 }
 
-static void deferred_put_nlk_sk(struct rcu_head *head)
+static void deferred_free_nlk_sk(struct rcu_head *head)
 {
 	struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu);
+	struct sock *sk = &nlk->sk;
+
+	if (!atomic_dec_and_test(&sk->sk_refcnt))
+		return;
+
+	if (nlk->cb_running && nlk->cb.done) {
+		INIT_WORK(&nlk->work, netlink_sock_destruct_work);
+		schedule_work(&nlk->work);
+		return;
+	}
 
-	sock_put(&nlk->sk);
+	sk_free(sk);
 }
 
 static int netlink_release(struct socket *sock)
@@ -743,7 +741,7 @@ static int netlink_release(struct socket *sock)
 	local_bh_disable();
 	sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);
 	local_bh_enable();
-	call_rcu(&nlk->rcu, deferred_put_nlk_sk);
+	call_rcu(&nlk->rcu, deferred_free_nlk_sk);
 	return 0;
 }
 
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply related

* Re: [PATCH v2 net-next 1/4] bpf: xdp: Allow head adjustment in XDP prog
From: Jesper Dangaard Brouer @ 2016-12-05  7:35 UTC (permalink / raw)
  To: Martin KaFai Lau
  Cc: netdev, Alexei Starovoitov, Brenden Blanco, Daniel Borkmann,
	David Miller, Saeed Mahameed, Tariq Toukan, Kernel Team, brouer
In-Reply-To: <1480821446-4122277-2-git-send-email-kafai@fb.com>


On Sat, 3 Dec 2016 19:17:23 -0800 Martin KaFai Lau <kafai@fb.com> wrote:

> This patch allows XDP prog to extend/remove the packet
> data at the head (like adding or removing header).  It is
> done by adding a new XDP helper bpf_xdp_adjust_head().
> 
> It also renames bpf_helper_changes_skb_data() to
> bpf_helper_changes_pkt_data() to better reflect
> that XDP prog does not work on skb.
> 
> Acked-by: Alexei Starovoitov <ast@kernel.org>
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> ---
[...]
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 56b43587d200..ccef948cf58a 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -2234,7 +2234,34 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = {
>  	.arg3_type	= ARG_ANYTHING,
>  };
>  
> -bool bpf_helper_changes_skb_data(void *func)
> +BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
> +{
> +	/* Both mlx4 and mlx5 driver align each packet to PAGE_SIZE when
> +	 * XDP prog is set.
> +	 * If the above is not true for the other drivers to support
> +	 * bpf_xdp_adjust_head, struct xdp_buff can be extended.
> +	 */
> +	unsigned long addr = (unsigned long)xdp->data & PAGE_MASK;
> +	void *data_hard_start = (void *)addr;
> +	void *data = xdp->data + offset;
> +
> +	if (unlikely(data < data_hard_start || data > xdp->data_end - ETH_HLEN))
> +		return -EINVAL;
> +
> +	xdp->data = data;
> +
> +	return 0;
> +}

Thanks for adjusting this, I like Daniel's suggestion.

For this patch:

Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>

As it still looks like 3/4 need some adjustments based on Saeed's
comments.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* RE: [patch net] net: fec: fix compile with CONFIG_M5272
From: Andy Duan @ 2016-12-05  7:31 UTC (permalink / raw)
  To: Nikita Yushchenko, David S. Miller, Troy Kisky, Andrew Lunn,
	Eric Nelson, Philippe Reynes, Johannes Berg,
	netdev@vger.kernel.org
  Cc: Chris Healy, Fabio Estevam, linux-kernel@vger.kernel.org
In-Reply-To: <1480864677-15310-1-git-send-email-nikita.yoush@cogentembedded.com>

From: Nikita Yushchenko <nikita.yoush@cogentembedded.com> Sent: Sunday, December 04, 2016 11:18 PM
 >To: David S. Miller <davem@davemloft.net>; Andy Duan
 ><fugang.duan@nxp.com>; Troy Kisky <troy.kisky@boundarydevices.com>;
 >Andrew Lunn <andrew@lunn.ch>; Eric Nelson <eric@nelint.com>; Philippe
 >Reynes <tremyfr@gmail.com>; Johannes Berg <johannes@sipsolutions.net>;
 >netdev@vger.kernel.org
 >Cc: Chris Healy <cphealy@gmail.com>; Fabio Estevam
 ><fabio.estevam@nxp.com>; linux-kernel@vger.kernel.org; Nikita
 >Yushchenko <nikita.yoush@cogentembedded.com>
 >Subject: [patch net] net: fec: fix compile with CONFIG_M5272
 >
 >Commit 4dfb80d18d05 ("net: fec: cache statistics while device is down")
 >introduced unconditional statistics-related actions.
 >
 >However, when driver is compiled with CONFIG_M5272, statistics-related
 >definitions do not exist, which results in build errors.
 >
 >Fix that by adding needed #if !defined(CONFIG_M5272).
 >
 >Fixes: 4dfb80d18d05 ("net: fec: cache statistics while device is down")
 >Signed-off-by: Nikita Yushchenko <nikita.yoush@cogentembedded.com>
 >---
 > drivers/net/ethernet/freescale/fec_main.c | 12 +++++++++---
 > 1 file changed, 9 insertions(+), 3 deletions(-)
 >
 >diff --git a/drivers/net/ethernet/freescale/fec_main.c
 >b/drivers/net/ethernet/freescale/fec_main.c
 >index 6a20c24a2003..89e902767abb 100644
 >--- a/drivers/net/ethernet/freescale/fec_main.c
 >+++ b/drivers/net/ethernet/freescale/fec_main.c
 >@@ -2884,7 +2884,9 @@ fec_enet_close(struct net_device *ndev)
 > 	if (fep->quirks & FEC_QUIRK_ERR006687)
 > 		imx6q_cpuidle_fec_irqs_unused();
 >
 >+#if !defined(CONFIG_M5272)
 > 	fec_enet_update_ethtool_stats(ndev);
 >+#endif
It is better to define fec_enet_update_ethtool_stats() for CONFIG_M5272 like static void fec_enet_update_ethtool_stats(struct net_device *dev){}, or directly return in fec_enet_update_ethtool_stats():
@@ -2315,6 +2315,10 @@ static void fec_enet_update_ethtool_stats(struct net_device *dev)
        struct fec_enet_private *fep = netdev_priv(dev);
        int i;

+#if defined(CONFIG_M5272)
+       return;
+#endif
+
>
 > 	fec_enet_clk_enable(ndev, false);
 > 	pinctrl_pm_select_sleep_state(&fep->pdev->dev);
 >@@ -3192,7 +3194,9 @@ static int fec_enet_init(struct net_device *ndev)
 >
 > 	fec_restart(ndev);
 >
 >+#if !defined(CONFIG_M5272)
 > 	fec_enet_update_ethtool_stats(ndev);
 >+#endif
 >
ditto

 > 	return 0;
 > }
 >@@ -3292,9 +3296,11 @@ fec_probe(struct platform_device *pdev)
 > 	fec_enet_get_queue_num(pdev, &num_tx_qs, &num_rx_qs);
 >
 > 	/* Init network device */
 >-	ndev = alloc_etherdev_mqs(sizeof(struct fec_enet_private) +
 >-				  ARRAY_SIZE(fec_stats) * sizeof(u64),
 >-				  num_tx_qs, num_rx_qs);
 >+	ndev = alloc_etherdev_mqs(sizeof(struct fec_enet_private)
 >+#if !defined(CONFIG_M5272)
 >+				  + ARRAY_SIZE(fec_stats) * sizeof(u64)
 >+#endif
 >+				  , num_tx_qs, num_rx_qs);
 > 	if (!ndev)
 > 		return -ENOMEM;
 >
 >--
 >2.1.4

^ permalink raw reply

* Re: [PATCH net-next 1/3] net: ethoc: Account for duplex changes
From: Thierry Reding @ 2016-12-05  8:00 UTC (permalink / raw)
  To: Florian Fainelli; +Cc: netdev, tremyfr, tklauser, davem, andrew
In-Reply-To: <20161204204030.9853-2-f.fainelli@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 575 bytes --]

On Sun, Dec 04, 2016 at 12:40:28PM -0800, Florian Fainelli wrote:
> ethoc_mdio_poll() which is our PHYLIB adjust_link callback does nothing,
> we should at least react to duplex changes and change MODER accordingly.
> Speed changes is not a problem, since the OpenCores Ethernet core seems
> to be reacting okay without us telling it.
> 
> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
> ---
>  drivers/net/ethernet/ethoc.c | 35 +++++++++++++++++++++++++++++++++++
>  1 file changed, 35 insertions(+)

Acked-by: Thierry Reding <thierry.reding@gmail.com>

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* Re: [PATCH net-next 2/3] net: ethoc: Utilize of_get_mac_address()
From: Thierry Reding @ 2016-12-05  8:00 UTC (permalink / raw)
  To: Florian Fainelli; +Cc: netdev, tremyfr, tklauser, davem, andrew
In-Reply-To: <20161204204030.9853-3-f.fainelli@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 493 bytes --]

On Sun, Dec 04, 2016 at 12:40:29PM -0800, Florian Fainelli wrote:
> Do not open code getting the MAC address exclusively from the
> "local-mac-address" property, but instead use of_get_mac_address() which
> looks up the MAC address using the 3 typical property names.
> 
> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
> ---
>  drivers/net/ethernet/ethoc.c | 7 +++----
>  1 file changed, 3 insertions(+), 4 deletions(-)

Acked-by: Thierry Reding <thierry.reding@gmail.com>

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* Re: [PATCH net-next 3/3] net: ethoc: Demote packet dropped error message to debug
From: Thierry Reding @ 2016-12-05  8:00 UTC (permalink / raw)
  To: Florian Fainelli; +Cc: netdev, tremyfr, tklauser, davem, andrew
In-Reply-To: <20161204204030.9853-4-f.fainelli@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 445 bytes --]

On Sun, Dec 04, 2016 at 12:40:30PM -0800, Florian Fainelli wrote:
> Spamming the console with: net eth1: packet dropped can happen
> fairly frequently if the adapter is busy transmitting, demote the
> message to a debug print.
> 
> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
> ---
>  drivers/net/ethernet/ethoc.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

Acked-by: Thierry Reding <thierry.reding@gmail.com>

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* [v3 PATCH] netlink: Do not schedule work from sk_destruct
From: Herbert Xu @ 2016-12-05  7:28 UTC (permalink / raw)
  To: Cong Wang
  Cc: Andrey Konovalov, David S. Miller, Johannes Berg,
	Florian Westphal, Eric Dumazet, Bob Copeland, Tom Herbert,
	David Decotigny, netdev, LKML
In-Reply-To: <20161205072600.GA10204@gondor.apana.org.au>

On Mon, Dec 05, 2016 at 03:26:00PM +0800, Herbert Xu wrote:
> On Mon, Dec 05, 2016 at 03:19:46PM +0800, Herbert Xu wrote:
> >
> > Thanks for the patch.  It'll obviously work but I wanted to avoid that
> > because it penalises the common path for the rare case.
> > 
> > Andrey, please try this patch and let me know if it's any better.
> > 
> > ---8<---
> > Subject: netlink: Do not schedule work from sk_destruct
> 
> Crap, I screwed it up again.  Here is a v2 which moves the atomic
> call into the RCU callback as otherwise the socket can be freed from
> another path while we await the RCU callback.

With the move it no longer makes sense to rename deferred_put_nlk_sk
so here is v3 which restores the original name.

---8<---
It is wrong to schedule a work from sk_destruct using the socket
as the memory reserve because the socket will be freed immediately
after the return from sk_destruct.

Instead we should do the deferral prior to sk_free.

This patch does just that.

Fixes: 707693c8a498 ("netlink: Call cb->done from a worker thread")
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 602e5eb..246f29d 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -322,11 +322,13 @@ static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
 	sk_mem_charge(sk, skb->truesize);
 }
 
-static void __netlink_sock_destruct(struct sock *sk)
+static void netlink_sock_destruct(struct sock *sk)
 {
 	struct netlink_sock *nlk = nlk_sk(sk);
 
 	if (nlk->cb_running) {
+		if (nlk->cb.done)
+			nlk->cb.done(&nlk->cb);
 		module_put(nlk->cb.module);
 		kfree_skb(nlk->cb.skb);
 	}
@@ -348,21 +350,7 @@ static void netlink_sock_destruct_work(struct work_struct *work)
 	struct netlink_sock *nlk = container_of(work, struct netlink_sock,
 						work);
 
-	nlk->cb.done(&nlk->cb);
-	__netlink_sock_destruct(&nlk->sk);
-}
-
-static void netlink_sock_destruct(struct sock *sk)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-
-	if (nlk->cb_running && nlk->cb.done) {
-		INIT_WORK(&nlk->work, netlink_sock_destruct_work);
-		schedule_work(&nlk->work);
-		return;
-	}
-
-	__netlink_sock_destruct(sk);
+	sk_free(&nlk->sk);
 }
 
 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
@@ -667,8 +655,18 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol,
 static void deferred_put_nlk_sk(struct rcu_head *head)
 {
 	struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu);
+	struct sock *sk = &nlk->sk;
+
+	if (!atomic_dec_and_test(&sk->sk_refcnt))
+		return;
+
+	if (nlk->cb_running && nlk->cb.done) {
+		INIT_WORK(&nlk->work, netlink_sock_destruct_work);
+		schedule_work(&nlk->work);
+		return;
+	}
 
-	sock_put(&nlk->sk);
+	sk_free(sk);
 }
 
 static int netlink_release(struct socket *sock)
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply related

* [patch net v2] net: fec: fix compile with CONFIG_M5272
From: Nikita Yushchenko @ 2016-12-05  8:16 UTC (permalink / raw)
  To: David S. Miller, Fugang Duan, Troy Kisky, Andrew Lunn,
	Eric Nelson, Philippe Reynes, Johannes Berg, netdev
  Cc: Chris Healy, Fabio Estevam, linux-kernel, Nikita Yushchenko

Commit 4dfb80d18d05 ("net: fec: cache statistics while device is down")
introduced unconditional statistics-related actions.

However, when driver is compiled with CONFIG_M5272, statistics-related
definitions do not exist, which results in build errors.

Fix that by adding explicit handling of !defined(CONFIG_M5272) case.

Fixes: 4dfb80d18d05 ("net: fec: cache statistics while device is down")
Signed-off-by: Nikita Yushchenko <nikita.yoush@cogentembedded.com>
---
Changes since v1:
- instead of #ifdef'ing calls to fec_enet_update_ethtool_stats(), add
  definition of empty fec_enet_update_ethtool_stats() for CONFIG_M5272
  case,
- add FEC_STATS_SIZE macro to avoid #ifdef in the middle of
  alloc_etherdev_mqs() args.

 drivers/net/ethernet/freescale/fec_main.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 5f77caa59534..741cf4a57bfc 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -2313,6 +2313,8 @@ static const struct fec_stat {
 	{ "IEEE_rx_octets_ok", IEEE_R_OCTETS_OK },
 };
 
+#define FEC_STATS_SIZE		(ARRAY_SIZE(fec_stats) * sizeof(u64))
+
 static void fec_enet_update_ethtool_stats(struct net_device *dev)
 {
 	struct fec_enet_private *fep = netdev_priv(dev);
@@ -2330,7 +2332,7 @@ static void fec_enet_get_ethtool_stats(struct net_device *dev,
 	if (netif_running(dev))
 		fec_enet_update_ethtool_stats(dev);
 
-	memcpy(data, fep->ethtool_stats, ARRAY_SIZE(fec_stats) * sizeof(u64));
+	memcpy(data, fep->ethtool_stats, FEC_STATS_SIZE);
 }
 
 static void fec_enet_get_strings(struct net_device *netdev,
@@ -2355,6 +2357,12 @@ static int fec_enet_get_sset_count(struct net_device *dev, int sset)
 		return -EOPNOTSUPP;
 	}
 }
+
+#else	/* !defined(CONFIG_M5272) */
+#define FEC_STATS_SIZE	0
+static inline void fec_enet_update_ethtool_stats(struct net_device *dev)
+{
+}
 #endif /* !defined(CONFIG_M5272) */
 
 static int fec_enet_nway_reset(struct net_device *dev)
@@ -3293,8 +3301,7 @@ fec_probe(struct platform_device *pdev)
 
 	/* Init network device */
 	ndev = alloc_etherdev_mqs(sizeof(struct fec_enet_private) +
-				  ARRAY_SIZE(fec_stats) * sizeof(u64),
-				  num_tx_qs, num_rx_qs);
+				  FEC_STATS_SIZE, num_tx_qs, num_rx_qs);
 	if (!ndev)
 		return -ENOMEM;
 
-- 
2.1.4

^ permalink raw reply related

* Re: [PATCH v1 net-next 1/5] net: dsa: mv88e6xxx: Reserved Management frames to CPU
From: Richard Cochran @ 2016-12-05  8:26 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: Vivien Didelot, David Miller, netdev
In-Reply-To: <20161204202234.GA20743@lunn.ch>

On Sun, Dec 04, 2016 at 09:22:34PM +0100, Andrew Lunn wrote:
> 3) We have a prefix for us humans to help us find the code. Now we
> have ops, i cannot simply do M-. and emacs will take me to the
> implementation. I have to search for it a bit. Having the hint g1_
> tells me to go look in global1.c. Having the hint g2_ tells me to go
> look in global2.c. Having the port_ tells me to go look in port.c.
> Having no prefix tells me the code is scattered around and grep is my
> friend.
> 
> The prefix is just a hint where the function is in the source
> code. Nothing more.

Just chiming in here:  Having a function interface with callback
functions is a widely used pattern in the kernel, but adding little
prefixes is not.  Sure, you have to look to find a particular instance
of a callback, but it isn't _that_ hard.

Thanks,
Richard

^ permalink raw reply

* Re: [PATCH 1/1] NET: usb: cdc_ncm: adding MBIM RESET_FUNCTION request and modifying ncm bind common code
From: Daniele Palmas @ 2016-12-05  8:31 UTC (permalink / raw)
  To: Bjørn Mork; +Cc: Oliver Neukum, linux-usb, netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <CAGRyCJFfNj4C_pNXWhMT7tVvdV4ZFzJ5d+LFAatvQmTuWU7OUg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

Hi,

2016-11-28 12:23 GMT+01:00 Daniele Palmas <dnlplm-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>:
> 2016-11-26 22:17 GMT+01:00 Bjørn Mork <bjorn-yOkvZcmFvRU@public.gmane.org>:
>> Bjørn Mork <bjorn-yOkvZcmFvRU@public.gmane.org> writes:
>>
>>> Finally, I found my modems (or at least a number of them) again today.
>>> But I'm sorry to say, that the troublesome Huawei E3372h-153 is still
>>> giving us a hard time.  It does not work with your patch. The symptom is
>>> the same as earlier:  The modem returns MBIM frames with 32bit headers.
>>>
>>> So for now, I have to NAK this patch.
>>>
>>> I am sure we can find a good solution that makes all of these modems
>>> work, but I cannot support a patch that breaks previously working
>>> configurations. Sorry.  I'll do a few experiments and see if there is a
>>> simple fix for this.  Otherwise we'll probably have to do the quirk
>>> game.
>>
>>
>> This is a proof-of-concept only, but it appears to be working.  Please
>> test with your device(s) too.  It's still mostly your code, as you can
>> see.
>
> Sorry, this does not work :-(
>
> The problem is always in the altsetting toggle: if I comment that
> part, your patch is working fine.
>
> I'm now wondering if the safest thing is to add a very simple quirk in
> cdc_mbim that makes this toggle not to be applied with the buggy
> modems and then unconditionally add the RESET_FUNCTION request in
> cdc_mbim_bind after cdc_ncm_bind_common has been executed.
>

I went back to this and further checking revealed that MBIM function
reset is not needed and the only problem is related to commit
48906f62c96cc2cd35753e59310cb70eb08cc6a5 cdc_ncm: toggle altsetting to
force reset before setup, that, however, is mandatory for some Huawei
modems.

My proposal is to add something like
CDC_MBIM_FLAG_AVOID_ALTSETTING_TOGGLE quirk to avoid the toggle.

To have a conservative approach, I would add only Telit LE922 to this
quirk and keep also the usleep_range delay, since I do not have a
clear view on all the VIDs/PIDs affected by this quirk. Then other
modems, once tested, can be added to the quirk if the delay is not
enough.

If this approach, though ugly, has chances to be accepted I can write
a new patch.

Thanks,
Daniele

> Daniele
>
>>
>> If this turns out to work, then I believe we should refactor
>> cdc_ncm_init() and cdc_ncm_bind_common() to make the whole
>> initialisation sequence a bit cleaner.  And maybe also include
>> cdc_mbim_bind().  Ideally, the MBIM specific RESET should happen there
>> instead of "polluting" the NCM driver with MBIM specific code.
>>
>> But anyway:  The sequence that seems to work for both the  E3372h-153
>> and the EM7455 is
>>
>>  USB_CDC_GET_NTB_PARAMETERS
>>  USB_CDC_RESET_FUNCTION
>>  usb_set_interface(dev->udev, 'data interface no', 0);
>>  remaining parts of cdc_ncm_init(), excluding USB_CDC_GET_NTB_PARAMETERS
>>  usb_set_interface(dev->udev, 'data interface no', 'data alt setting');
>>
>> without any additional delay between the two usb_set_interface() calls.
>> So the major difference from your patch is that I moved the two control
>> requests out of cdc_ncm_init() to allow running them _before_ setting
>> the data interface to altsetting 0.
>>
>> But maybe I was just lucky.  This was barely proof tested.  Needs a lot
>> more testing and cleanups as suggested.  I'd appreciate it if you
>> continued that, as I don't really have any time for it...
>>
>> FWIW, I also ran a quick test with a D-Link DWM-156A7 (Mediatek MBIM
>> firmware) and a Huawei E367 (Qualcomm device with early Huawei MBIM
>> firmware, distinctly different from the E3372h-153 and most other
>> MBIM devices I've seen)
>>
>>
>>
>> Bjørn
>>
>> ---
>>  drivers/net/usb/cdc_ncm.c    | 48 ++++++++++++++++++++++++++++----------------
>>  include/uapi/linux/usb/cdc.h |  1 +
>>  2 files changed, 32 insertions(+), 17 deletions(-)
>>
>> diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c
>> index 877c9516e781..be019cbf1719 100644
>> --- a/drivers/net/usb/cdc_ncm.c
>> +++ b/drivers/net/usb/cdc_ncm.c
>> @@ -488,16 +488,6 @@ static int cdc_ncm_init(struct usbnet *dev)
>>         u8 iface_no = ctx->control->cur_altsetting->desc.bInterfaceNumber;
>>         int err;
>>
>> -       err = usbnet_read_cmd(dev, USB_CDC_GET_NTB_PARAMETERS,
>> -                             USB_TYPE_CLASS | USB_DIR_IN
>> -                             |USB_RECIP_INTERFACE,
>> -                             0, iface_no, &ctx->ncm_parm,
>> -                             sizeof(ctx->ncm_parm));
>> -       if (err < 0) {
>> -               dev_err(&dev->intf->dev, "failed GET_NTB_PARAMETERS\n");
>> -               return err; /* GET_NTB_PARAMETERS is required */
>> -       }
>> -
>>         /* set CRC Mode */
>>         if (cdc_ncm_flags(dev) & USB_CDC_NCM_NCAP_CRC_MODE) {
>>                 dev_dbg(&dev->intf->dev, "Setting CRC mode off\n");
>> @@ -837,12 +827,43 @@ int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_
>>                 }
>>         }
>>
>> +       iface_no = ctx->control->cur_altsetting->desc.bInterfaceNumber;
>> +       temp = usbnet_read_cmd(dev, USB_CDC_GET_NTB_PARAMETERS,
>> +                              USB_TYPE_CLASS | USB_DIR_IN
>> +                              | USB_RECIP_INTERFACE,
>> +                              0, iface_no, &ctx->ncm_parm,
>> +                              sizeof(ctx->ncm_parm));
>> +       if (temp < 0) {
>> +               dev_err(&dev->intf->dev, "failed GET_NTB_PARAMETERS\n");
>> +               goto error; /* GET_NTB_PARAMETERS is required */
>> +       }
>> +
>> +       /* Some modems (e.g. Telit LE922A6) need to reset the MBIM function
>> +        * or they will fail to work properly.
>> +        * For details on RESET_FUNCTION request see document
>> +        * "USB Communication Class Subclass Specification for MBIM"
>> +        * RESET_FUNCTION should be harmless for all the other MBIM modems
>> +        */
>> +       if (cdc_ncm_comm_intf_is_mbim(ctx->control->cur_altsetting)) {
>> +               temp = usbnet_write_cmd(dev, USB_CDC_RESET_FUNCTION,
>> +                                       USB_TYPE_CLASS | USB_DIR_OUT
>> +                                       | USB_RECIP_INTERFACE,
>> +                                       0, iface_no, NULL, 0);
>> +               if (temp < 0)
>> +                       dev_err(&dev->intf->dev, "failed RESET_FUNCTION\n");
>> +       }
>> +
>>         iface_no = ctx->data->cur_altsetting->desc.bInterfaceNumber;
>>
>>         /* Reset data interface. Some devices will not reset properly
>>          * unless they are configured first.  Toggle the altsetting to
>>          * force a reset
>> +        * This is applied only to ncm devices, since it has been verified
>> +        * to cause issues with some MBIM modems (e.g. Telit LE922A6).
>> +        * MBIM devices reset is achieved using MBIM request RESET_FUNCTION
>> +        * in cdc_ncm_init
>>          */
>> +
>>         usb_set_interface(dev->udev, iface_no, data_altsetting);
>>         temp = usb_set_interface(dev->udev, iface_no, 0);
>>         if (temp) {
>> @@ -854,13 +875,6 @@ int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_
>>         if (cdc_ncm_init(dev))
>>                 goto error2;
>>
>> -       /* Some firmwares need a pause here or they will silently fail
>> -        * to set up the interface properly.  This value was decided
>> -        * empirically on a Sierra Wireless MC7455 running 02.08.02.00
>> -        * firmware.
>> -        */
>> -       usleep_range(10000, 20000);
>> -
>>         /* configure data interface */
>>         temp = usb_set_interface(dev->udev, iface_no, data_altsetting);
>>         if (temp) {
>> diff --git a/include/uapi/linux/usb/cdc.h b/include/uapi/linux/usb/cdc.h
>> index e2bc417b243b..30258fb229e6 100644
>> --- a/include/uapi/linux/usb/cdc.h
>> +++ b/include/uapi/linux/usb/cdc.h
>> @@ -231,6 +231,7 @@ struct usb_cdc_mbim_extended_desc {
>>
>>  #define USB_CDC_SEND_ENCAPSULATED_COMMAND      0x00
>>  #define USB_CDC_GET_ENCAPSULATED_RESPONSE      0x01
>> +#define USB_CDC_RESET_FUNCTION                 0x05
>>  #define USB_CDC_REQ_SET_LINE_CODING            0x20
>>  #define USB_CDC_REQ_GET_LINE_CODING            0x21
>>  #define USB_CDC_REQ_SET_CONTROL_LINE_STATE     0x22
>> --
>> 2.10.2
>>
>>
>>
--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH]PTP:PTP_CHARDEV:add input parameters check to avoid the NULL pointer reference
From: Richard Cochran @ 2016-12-05  8:32 UTC (permalink / raw)
  To: Deng Feng
  Cc: netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	cxdx2006@gmail.com
In-Reply-To: <PS1PR01MB0844B3E001ABF0DF39D1B11BDC830@PS1PR01MB0844.apcprd01.prod.exchangelabs.com>

On Mon, Dec 05, 2016 at 08:02:02AM +0000, Deng Feng wrote:
> When we call one function which contained pointers as input parameters,
> we must check the pointers are NULL or not before we make use of them,
> which avoid the NULL pointer reference .

NAK.

These functions belong to an internal kernel API, and the callers
always provide non-NULL arguments.  This isn't some user space
library.

Thanks,
Richard

^ permalink raw reply

* Re: [PATCH] net:phy fix driver reference count error when attach and detach phy device
From: maowenan @ 2016-12-05  8:47 UTC (permalink / raw)
  To: David Laight, netdev@vger.kernel.org, f.fainelli@gmail.com,
	dingtianhong@huawei.com, weiyongjun (A)
In-Reply-To: <063D6719AE5E284EB5DD2968C1650D6DB0232613@AcuExch.aculab.com>



On 2016/12/2 17:45, David Laight wrote:
> From: Mao Wenan
>> Sent: 30 November 2016 10:23
>> The nic in my board use the phy dev from marvell, and the system will
>> load the marvell phy driver automatically, but when I remove the phy
>> drivers, the system immediately panic:
>> Call trace:
>> [ 2582.834493] [<ffff800000715384>] phy_state_machine+0x3c/0x438 [
>> 2582.851754] [<ffff8000000db3b8>] process_one_work+0x150/0x428 [
>> 2582.868188] [<ffff8000000db7d4>] worker_thread+0x144/0x4b0 [
>> 2582.883882] [<ffff8000000e1d0c>] kthread+0xfc/0x110
>>
>> there should be proper reference counting in place to avoid that.
>> I found that phy_attach_direct() forgets to add phy device driver
>> reference count, and phy_detach() forgets to subtract reference count.
>> This patch is to fix this bug, after that panic is disappeared when remove
>> marvell.ko
>>
>> Signed-off-by: Mao Wenan <maowenan@huawei.com>
>> ---
>>  drivers/net/phy/phy_device.c | 7 +++++++
>>  1 file changed, 7 insertions(+)
>>
>> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
>> index 1a4bf8a..a7ec7c2 100644
>> --- a/drivers/net/phy/phy_device.c
>> +++ b/drivers/net/phy/phy_device.c
>> @@ -866,6 +866,11 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
>>  		return -EIO;
>>  	}
>>
>> +	if (!try_module_get(d->driver->owner)) {
>> +		dev_err(&dev->dev, "failed to get the device driver module\n");
>> +		return -EIO;
>> +	}
> 
> If this is the phy code, what stops the phy driver being unloaded
> before the try_module_get() obtains a reference.
> If it isn't the phy driver then there ought to be a reference count obtained
> when the phy driver is located (by whatever decides which phy driver to use).
> Even if that code later releases its reference (it probably shouldn't on success)
> then you can't fail to get an extra reference here.

[Mao Wenan] Yes, this is phy code, in function phy_attach_direct(), drivers/net/phy/phy_device.c.
When a NIC driver probes, it attaches one matched phy driver. phy_attach_direct()
obtains a reference to the phy driver and binds it: if try_module_get() succeeds, the reference
count is incremented; if it fails, the driver can't be attached to this NIC and the phy driver
reference count is not incremented. So before try_module_get() obtains a reference, the phy driver can't be bound to this NIC.
Once the phy driver is attached to the NIC, the reference count has been incremented, so if someone tries to remove the phy driver directly,
it will fail because the reference count is not equal to 0.

An example of the call trace when a NIC driver attaches a phy driver:
hns_nic_dev_probe->hns_nic_try_get_ae->hns_nic_init_phy->of_phy_connect->phy_connect_direct->phy_attach_direct

Consider the steps of phy driver(marvell.ko) added and removed, and NIC driver(hns_enet_drv.ko) added and removed:
1)insmod marvell       ref=0
2)insmod hns_enet_drv  ref=1
3)rmmod marvell        (should not on success, ref=1)
4)rmmod hns_enet_drv   ref=0
5)rmmod marvell        (should on success, because ref=0)

If we don't add the reference count in phy_attach_direct() (so ref=0 after the second step), the third step, rmmod marvell, will
panic, because there is still one user of the marvell driver and phy_state_machine uses the NULL drv pointer.

> 
>> +
>>  	get_device(d);
>>
>>  	/* Assume that if there is no driver, that it doesn't
>> @@ -921,6 +926,7 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
>>
>>  error:
>>  	put_device(d);
>> +	module_put(d->driver->owner);
> 
> Are those two in the wrong order ?
> 
>>  	module_put(bus->owner);
>>  	return err;
>>  }
>> @@ -998,6 +1004,7 @@ void phy_detach(struct phy_device *phydev)
>>  	bus = phydev->mdio.bus;
>>
>>  	put_device(&phydev->mdio.dev);
>> +	module_put(phydev->mdio.dev.driver->owner);
>>  	module_put(bus->owner);
> 
> Where is this code called from?
> You can't call it from the phy driver because the driver can be unloaded
> as soon as the last reference is removed.
> At that point the code memory is freed.

[Mao Wenan] It is called by the NIC driver when it is removed, in order to disconnect the bound phy driver. Once this phy driver
is no longer used by this NIC, the reference count should be decremented, and the phy driver can be removed if there is no other user.
hns_nic_dev_remove->phy_disconnect->phy_detach



> 
>>  }
>>  EXPORT_SYMBOL(phy_detach);
>> --
>> 2.7.0
>>
> 
> 
> .
> 

^ permalink raw reply

* [PATCH v2 0/6] net: stmmac: make DMA programmable burst length more configurable
From: Niklas Cassel @ 2016-12-05  9:10 UTC (permalink / raw)
  To: netdev; +Cc: Niklas Cassel, devicetree, linux-kernel, linux-doc

From: Niklas Cassel <niklas.cassel@axis.com>

Make DMA programmable burst length more configurable in the stmmac driver.

This is done by adding support for independent pbl for tx/rx through DT.
More fine grained tuning of pbl is possible thanks to a DT property saying
that we should NOT multiply pbl values by x8/x4 in hardware.

All new DT properties are optional, and created in a way that it will not
affect any existing DT configurations.

Changes since V1:
Created cover-letter.
Rebased patch set against next-20161205, since conflicting patches to
stmmac_platform.c has been merged since V1.


Niklas Cassel (6):
  net: stmmac: return error if no DMA configuration is found
  net: stmmac: simplify the common DMA init API
  net: stmmac: stmmac_platform: fix parsing of DT binding
  net: stmmac: dwmac1000: fix define DMA_BUS_MODE_RPBL_MASK
  net: stmmac: add support for independent DMA pbl for tx/rx
  net: stmmac: allow configuring lower pbl values

 Documentation/devicetree/bindings/net/stmmac.txt   |  8 +++++-
 Documentation/networking/stmmac.txt                | 24 ++++++++++++-----
 drivers/net/ethernet/stmicro/stmmac/common.h       |  4 +--
 drivers/net/ethernet/stmicro/stmmac/dwmac1000.h    |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac1000_dma.c    | 26 ++++++++++---------
 drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c |  7 ++---
 drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c   | 25 ++++++++++--------
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 17 ++++++------
 drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c   |  2 ++
 .../net/ethernet/stmicro/stmmac/stmmac_platform.c  | 30 ++++++++++++----------
 include/linux/stmmac.h                             |  3 +++
 11 files changed, 89 insertions(+), 59 deletions(-)

-- 
2.1.4

^ permalink raw reply

* [PATCH v2 1/6] net: stmmac: return error if no DMA configuration is found
From: Niklas Cassel @ 2016-12-05  9:10 UTC (permalink / raw)
  To: Giuseppe Cavallaro, Alexandre Torgue; +Cc: Niklas Cassel, netdev, linux-kernel
In-Reply-To: <1480929039-20257-1-git-send-email-niklass@axis.com>

From: Niklas Cassel <niklas.cassel@axis.com>

All drivers except the pci glue layer call stmmac_probe_config_dt.
stmmac_probe_config_dt does a kzalloc of dma_cfg.

The pci glue layer does a kzalloc of dma_cfg explicitly, so all current
drivers do a kzalloc of dma_cfg.

Return an error if no DMA configuration is found, that way
we can assume that the DMA configuration always exists.

Signed-off-by: Niklas Cassel <niklas.cassel@axis.com>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 982c95213da4..14366800e5e6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1578,16 +1578,12 @@ static void stmmac_check_ether_addr(struct stmmac_priv *priv)
  */
 static int stmmac_init_dma_engine(struct stmmac_priv *priv)
 {
-	int pbl = DEFAULT_DMA_PBL, fixed_burst = 0, aal = 0;
-	int mixed_burst = 0;
 	int atds = 0;
 	int ret = 0;
 
-	if (priv->plat->dma_cfg) {
-		pbl = priv->plat->dma_cfg->pbl;
-		fixed_burst = priv->plat->dma_cfg->fixed_burst;
-		mixed_burst = priv->plat->dma_cfg->mixed_burst;
-		aal = priv->plat->dma_cfg->aal;
+	if (!priv->plat->dma_cfg) {
+		dev_err(priv->device, "DMA configuration not found\n");
+		return -EINVAL;
 	}
 
 	if (priv->extend_desc && (priv->mode == STMMAC_RING_MODE))
@@ -1599,8 +1595,12 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv)
 		return ret;
 	}
 
-	priv->hw->dma->init(priv->ioaddr, pbl, fixed_burst, mixed_burst,
-			    aal, priv->dma_tx_phy, priv->dma_rx_phy, atds);
+	priv->hw->dma->init(priv->ioaddr,
+			    priv->plat->dma_cfg->pbl,
+			    priv->plat->dma_cfg->fixed_burst,
+			    priv->plat->dma_cfg->mixed_burst,
+			    priv->plat->dma_cfg->aal,
+			    priv->dma_tx_phy, priv->dma_rx_phy, atds);
 
 	if (priv->synopsys_id >= DWMAC_CORE_4_00) {
 		priv->rx_tail_addr = priv->dma_rx_phy +
-- 
2.1.4

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox