Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next v3 5/8] sctp: move flushing of data chunks out of sctp_outq_flush
From: Marcelo Ricardo Leitner @ 2018-05-14 17:34 UTC (permalink / raw)
  To: netdev; +Cc: linux-sctp, Neil Horman, Xin Long, Vlad Yasevich
In-Reply-To: <cover.1526318522.git.marcelo.leitner@gmail.com>

To the new sctp_outq_flush_data. Again, smaller functions and with well
defined objectives.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
---
 net/sctp/outqueue.c | 149 ++++++++++++++++++++++++++--------------------------
 1 file changed, 75 insertions(+), 74 deletions(-)

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 49e80bf2ade73914f3be275016df0a6e7f06f606..bfa2e43dfd31d115862b65d1650858fcaa6f203b 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -1038,46 +1038,17 @@ static bool sctp_outq_flush_rtx(struct sctp_outq *q,
 
 	return true;
 }
-/*
- * Try to flush an outqueue.
- *
- * Description: Send everything in q which we legally can, subject to
- * congestion limitations.
- * * Note: This function can be called from multiple contexts so appropriate
- * locking concerns must be made.  Today we use the sock lock to protect
- * this function.
- */
-static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
+
+static void sctp_outq_flush_data(struct sctp_outq *q,
+				 struct sctp_transport **_transport,
+				 struct list_head *transport_list,
+				 int rtx_timeout, gfp_t gfp)
 {
-	struct sctp_packet *packet;
+	struct sctp_transport *transport = *_transport;
+	struct sctp_packet *packet = transport ? &transport->packet : NULL;
 	struct sctp_association *asoc = q->asoc;
-	struct sctp_transport *transport = NULL;
 	struct sctp_chunk *chunk;
 	enum sctp_xmit status;
-	int error = 0;
-
-	/* These transports have chunks to send. */
-	struct list_head transport_list;
-	struct list_head *ltransport;
-
-	INIT_LIST_HEAD(&transport_list);
-	packet = NULL;
-
-	/*
-	 * 6.10 Bundling
-	 *   ...
-	 *   When bundling control chunks with DATA chunks, an
-	 *   endpoint MUST place control chunks first in the outbound
-	 *   SCTP packet.  The transmitter MUST transmit DATA chunks
-	 *   within a SCTP packet in increasing order of TSN.
-	 *   ...
-	 */
-
-	sctp_outq_flush_ctrl(q, &transport, &transport_list, gfp);
-	packet = &transport->packet;
-
-	if (q->asoc->src_out_of_asoc_ok)
-		goto sctp_flush_out;
 
 	/* Is it OK to send data chunks?  */
 	switch (asoc->state) {
@@ -1102,10 +1073,11 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 		 * current cwnd).
 		 */
 		if (!list_empty(&q->retransmit)) {
-			if (!sctp_outq_flush_rtx(q, &transport, &transport_list,
+			if (!sctp_outq_flush_rtx(q, _transport, transport_list,
 						 rtx_timeout))
 				break;
 			/* We may have switched current transport */
+			transport = *_transport;
 			packet = &transport->packet;
 		}
 
@@ -1131,12 +1103,14 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 
 			if (asoc->stream.out[sid].state == SCTP_STREAM_CLOSED) {
 				sctp_outq_head_data(q, chunk);
-				goto sctp_flush_out;
+				break;
 			}
 
-			if (sctp_outq_select_transport(chunk, asoc, &transport,
-						       &transport_list))
+			if (sctp_outq_select_transport(chunk, asoc, _transport,
+						       transport_list)) {
+				transport = *_transport;
 				packet = &transport->packet;
+			}
 
 			pr_debug("%s: outq:%p, chunk:%p[%s], tx-tsn:0x%x skb->head:%p "
 				 "skb->users:%d\n",
@@ -1148,8 +1122,10 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 
 			/* Add the chunk to the packet.  */
 			status = sctp_packet_transmit_chunk(packet, chunk, 0, gfp);
-
 			switch (status) {
+			case SCTP_XMIT_OK:
+				break;
+
 			case SCTP_XMIT_PMTU_FULL:
 			case SCTP_XMIT_RWND_FULL:
 			case SCTP_XMIT_DELAY:
@@ -1161,41 +1137,25 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 					 status);
 
 				sctp_outq_head_data(q, chunk);
-				goto sctp_flush_out;
-
-			case SCTP_XMIT_OK:
-				/* The sender is in the SHUTDOWN-PENDING state,
-				 * The sender MAY set the I-bit in the DATA
-				 * chunk header.
-				 */
-				if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING)
-					chunk->chunk_hdr->flags |= SCTP_DATA_SACK_IMM;
-				if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
-					asoc->stats.ouodchunks++;
-				else
-					asoc->stats.oodchunks++;
-
-				/* Only now it's safe to consider this
-				 * chunk as sent, sched-wise.
-				 */
-				sctp_sched_dequeue_done(q, chunk);
-
-				break;
-
-			default:
-				BUG();
+				return;
 			}
 
-			/* BUG: We assume that the sctp_packet_transmit()
-			 * call below will succeed all the time and add the
-			 * chunk to the transmitted list and restart the
-			 * timers.
-			 * It is possible that the call can fail under OOM
-			 * conditions.
-			 *
-			 * Is this really a problem?  Won't this behave
-			 * like a lost TSN?
+			/* The sender is in the SHUTDOWN-PENDING state,
+			 * The sender MAY set the I-bit in the DATA
+			 * chunk header.
 			 */
+			if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING)
+				chunk->chunk_hdr->flags |= SCTP_DATA_SACK_IMM;
+			if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
+				asoc->stats.ouodchunks++;
+			else
+				asoc->stats.oodchunks++;
+
+			/* Only now it's safe to consider this
+			 * chunk as sent, sched-wise.
+			 */
+			sctp_sched_dequeue_done(q, chunk);
+
 			list_add_tail(&chunk->transmitted_list,
 				      &transport->transmitted);
 
@@ -1206,7 +1166,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 			 * COOKIE-ECHO chunk.
 			 */
 			if (packet->has_cookie_echo)
-				goto sctp_flush_out;
+				break;
 		}
 		break;
 
@@ -1214,6 +1174,47 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 		/* Do nothing.  */
 		break;
 	}
+}
+
+/*
+ * Try to flush an outqueue.
+ *
+ * Description: Send everything in q which we legally can, subject to
+ * congestion limitations.
+ * * Note: This function can be called from multiple contexts so appropriate
+ * locking concerns must be made.  Today we use the sock lock to protect
+ * this function.
+ */
+static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
+{
+	struct sctp_packet *packet;
+	struct sctp_association *asoc = q->asoc;
+	struct sctp_transport *transport = NULL;
+	int error = 0;
+
+	/* These transports have chunks to send. */
+	struct list_head transport_list;
+	struct list_head *ltransport;
+
+	INIT_LIST_HEAD(&transport_list);
+	packet = NULL;
+
+	/*
+	 * 6.10 Bundling
+	 *   ...
+	 *   When bundling control chunks with DATA chunks, an
+	 *   endpoint MUST place control chunks first in the outbound
+	 *   SCTP packet.  The transmitter MUST transmit DATA chunks
+	 *   within a SCTP packet in increasing order of TSN.
+	 *   ...
+	 */
+
+	sctp_outq_flush_ctrl(q, &transport, &transport_list, gfp);
+
+	if (q->asoc->src_out_of_asoc_ok)
+		goto sctp_flush_out;
+
+	sctp_outq_flush_data(q, &transport, &transport_list, rtx_timeout, gfp);
 
 sctp_flush_out:
 
-- 
2.14.3

^ permalink raw reply related

* [PATCH net-next v3 8/8] sctp: rework switch cases in sctp_outq_flush_data
From: Marcelo Ricardo Leitner @ 2018-05-14 17:34 UTC (permalink / raw)
  To: netdev; +Cc: linux-sctp, Neil Horman, Xin Long, Vlad Yasevich
In-Reply-To: <cover.1526318522.git.marcelo.leitner@gmail.com>

Remove an inner one, which tended to be error prone due to the cascading
and it can be replaced by a simple if ().

Rework the outer one so that the actual flush code is not inside it. Now
we first validate if we can or cannot send data, return if not, and then
the flush code.

Suggested-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
---
 net/sctp/outqueue.c | 191 +++++++++++++++++++++++++---------------------------
 1 file changed, 93 insertions(+), 98 deletions(-)

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index e1632b8e2900f88070aebc82bbe4eca61f424e87..e9c22b3db11cd702afc52f8233fb7f39d917698a 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -1058,122 +1058,117 @@ static void sctp_outq_flush_data(struct sctp_outq *q,
 		 * chunk.
 		 */
 		if (!packet || !packet->has_cookie_echo)
-			break;
+			return;
 
 		/* fallthru */
 	case SCTP_STATE_ESTABLISHED:
 	case SCTP_STATE_SHUTDOWN_PENDING:
 	case SCTP_STATE_SHUTDOWN_RECEIVED:
-		/*
-		 * RFC 2960 6.1  Transmission of DATA Chunks
-		 *
-		 * C) When the time comes for the sender to transmit,
-		 * before sending new DATA chunks, the sender MUST
-		 * first transmit any outstanding DATA chunks which
-		 * are marked for retransmission (limited by the
-		 * current cwnd).
-		 */
-		if (!list_empty(&q->retransmit)) {
-			if (!sctp_outq_flush_rtx(q, _transport, transport_list,
-						 rtx_timeout, gfp))
-				break;
-			/* We may have switched current transport */
-			transport = *_transport;
-			packet = &transport->packet;
-		}
+		break;
 
-		/* Apply Max.Burst limitation to the current transport in
-		 * case it will be used for new data.  We are going to
-		 * rest it before we return, but we want to apply the limit
-		 * to the currently queued data.
-		 */
-		if (transport)
-			sctp_transport_burst_limited(transport);
-
-		/* Finally, transmit new packets.  */
-		while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
-			__u32 sid = ntohs(chunk->subh.data_hdr->stream);
-
-			/* Has this chunk expired? */
-			if (sctp_chunk_abandoned(chunk)) {
-				sctp_sched_dequeue_done(q, chunk);
-				sctp_chunk_fail(chunk, 0);
-				sctp_chunk_free(chunk);
-				continue;
-			}
+	default:
+		/* Do nothing. */
+		return;
+	}
 
-			if (asoc->stream.out[sid].state == SCTP_STREAM_CLOSED) {
-				sctp_outq_head_data(q, chunk);
-				break;
-			}
+	/*
+	 * RFC 2960 6.1  Transmission of DATA Chunks
+	 *
+	 * C) When the time comes for the sender to transmit,
+	 * before sending new DATA chunks, the sender MUST
+	 * first transmit any outstanding DATA chunks which
+	 * are marked for retransmission (limited by the
+	 * current cwnd).
+	 */
+	if (!list_empty(&q->retransmit)) {
+		if (!sctp_outq_flush_rtx(q, _transport, transport_list,
+					 rtx_timeout, gfp))
+			return;
+		/* We may have switched current transport */
+		transport = *_transport;
+		packet = &transport->packet;
+	}
 
-			if (sctp_outq_select_transport(chunk, asoc, _transport,
-						       transport_list)) {
-				transport = *_transport;
-				packet = &transport->packet;
-			}
+	/* Apply Max.Burst limitation to the current transport in
+	 * case it will be used for new data.  We are going to
+	 * rest it before we return, but we want to apply the limit
+	 * to the currently queued data.
+	 */
+	if (transport)
+		sctp_transport_burst_limited(transport);
 
-			pr_debug("%s: outq:%p, chunk:%p[%s], tx-tsn:0x%x skb->head:%p "
-				 "skb->users:%d\n",
-				 __func__, q, chunk, chunk && chunk->chunk_hdr ?
-				 sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
-				 "illegal chunk", ntohl(chunk->subh.data_hdr->tsn),
-				 chunk->skb ? chunk->skb->head : NULL, chunk->skb ?
-				 refcount_read(&chunk->skb->users) : -1);
-
-			/* Add the chunk to the packet.  */
-			status = sctp_packet_transmit_chunk(packet, chunk, 0, gfp);
-			switch (status) {
-			case SCTP_XMIT_OK:
-				break;
+	/* Finally, transmit new packets.  */
+	while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
+		__u32 sid = ntohs(chunk->subh.data_hdr->stream);
 
-			case SCTP_XMIT_PMTU_FULL:
-			case SCTP_XMIT_RWND_FULL:
-			case SCTP_XMIT_DELAY:
-				/* We could not append this chunk, so put
-				 * the chunk back on the output queue.
-				 */
-				pr_debug("%s: could not transmit tsn:0x%x, status:%d\n",
-					 __func__, ntohl(chunk->subh.data_hdr->tsn),
-					 status);
+		/* Has this chunk expired? */
+		if (sctp_chunk_abandoned(chunk)) {
+			sctp_sched_dequeue_done(q, chunk);
+			sctp_chunk_fail(chunk, 0);
+			sctp_chunk_free(chunk);
+			continue;
+		}
 
-				sctp_outq_head_data(q, chunk);
-				return;
-			}
+		if (asoc->stream.out[sid].state == SCTP_STREAM_CLOSED) {
+			sctp_outq_head_data(q, chunk);
+			break;
+		}
 
-			/* The sender is in the SHUTDOWN-PENDING state,
-			 * The sender MAY set the I-bit in the DATA
-			 * chunk header.
-			 */
-			if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING)
-				chunk->chunk_hdr->flags |= SCTP_DATA_SACK_IMM;
-			if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
-				asoc->stats.ouodchunks++;
-			else
-				asoc->stats.oodchunks++;
+		if (sctp_outq_select_transport(chunk, asoc, _transport,
+					       transport_list)) {
+			transport = *_transport;
+			packet = &transport->packet;
+		}
 
-			/* Only now it's safe to consider this
-			 * chunk as sent, sched-wise.
+		pr_debug("%s: outq:%p, chunk:%p[%s], tx-tsn:0x%x skb->head:%p "
+			 "skb->users:%d\n",
+			 __func__, q, chunk, chunk && chunk->chunk_hdr ?
+			 sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
+			 "illegal chunk", ntohl(chunk->subh.data_hdr->tsn),
+			 chunk->skb ? chunk->skb->head : NULL, chunk->skb ?
+			 refcount_read(&chunk->skb->users) : -1);
+
+		/* Add the chunk to the packet.  */
+		status = sctp_packet_transmit_chunk(packet, chunk, 0, gfp);
+		if (status != SCTP_XMIT_OK) {
+			/* We could not append this chunk, so put
+			 * the chunk back on the output queue.
 			 */
-			sctp_sched_dequeue_done(q, chunk);
+			pr_debug("%s: could not transmit tsn:0x%x, status:%d\n",
+				 __func__, ntohl(chunk->subh.data_hdr->tsn),
+				 status);
 
-			list_add_tail(&chunk->transmitted_list,
-				      &transport->transmitted);
+			sctp_outq_head_data(q, chunk);
+			break;
+		}
 
-			sctp_transport_reset_t3_rtx(transport);
-			transport->last_time_sent = jiffies;
+		/* The sender is in the SHUTDOWN-PENDING state,
+		 * The sender MAY set the I-bit in the DATA
+		 * chunk header.
+		 */
+		if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING)
+			chunk->chunk_hdr->flags |= SCTP_DATA_SACK_IMM;
+		if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
+			asoc->stats.ouodchunks++;
+		else
+			asoc->stats.oodchunks++;
 
-			/* Only let one DATA chunk get bundled with a
-			 * COOKIE-ECHO chunk.
-			 */
-			if (packet->has_cookie_echo)
-				break;
-		}
-		break;
+		/* Only now it's safe to consider this
+		 * chunk as sent, sched-wise.
+		 */
+		sctp_sched_dequeue_done(q, chunk);
 
-	default:
-		/* Do nothing.  */
-		break;
+		list_add_tail(&chunk->transmitted_list,
+			      &transport->transmitted);
+
+		sctp_transport_reset_t3_rtx(transport);
+		transport->last_time_sent = jiffies;
+
+		/* Only let one DATA chunk get bundled with a
+		 * COOKIE-ECHO chunk.
+		 */
+		if (packet->has_cookie_echo)
+			break;
 	}
 }
 
-- 
2.14.3

^ permalink raw reply related

* [PATCH net-next v3 6/8] sctp: move transport flush code out of sctp_outq_flush
From: Marcelo Ricardo Leitner @ 2018-05-14 17:34 UTC (permalink / raw)
  To: netdev; +Cc: linux-sctp, Neil Horman, Xin Long, Vlad Yasevich
In-Reply-To: <cover.1526318522.git.marcelo.leitner@gmail.com>

To the new sctp_outq_flush_transports.

Comment on Nagle is outdated and removed. Nagle is performed earlier, while
checking if the chunk fits the packet: if the outq length is not enough to
fill the packet, it returns SCTP_XMIT_DELAY.

So by when it gets to sctp_outq_flush_transports, it has to go through all
enlisted transports.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
---
 net/sctp/outqueue.c | 56 +++++++++++++++++++++++++----------------------------
 1 file changed, 26 insertions(+), 30 deletions(-)

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index bfa2e43dfd31d115862b65d1650858fcaa6f203b..44465e64857b79636a78917a12776402ccb8f990 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -1176,6 +1176,29 @@ static void sctp_outq_flush_data(struct sctp_outq *q,
 	}
 }
 
+static void sctp_outq_flush_transports(struct sctp_outq *q,
+				       struct list_head *transport_list,
+				       gfp_t gfp)
+{
+	struct list_head *ltransport;
+	struct sctp_packet *packet;
+	struct sctp_transport *t;
+	int error = 0;
+
+	while ((ltransport = sctp_list_dequeue(transport_list)) != NULL) {
+		t = list_entry(ltransport, struct sctp_transport, send_ready);
+		packet = &t->packet;
+		if (!sctp_packet_empty(packet)) {
+			error = sctp_packet_transmit(packet, gfp);
+			if (error < 0)
+				q->asoc->base.sk->sk_err = -error;
+		}
+
+		/* Clear the burst limited state, if any */
+		sctp_transport_burst_reset(t);
+	}
+}
+
 /*
  * Try to flush an outqueue.
  *
@@ -1187,17 +1210,10 @@ static void sctp_outq_flush_data(struct sctp_outq *q,
  */
 static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 {
-	struct sctp_packet *packet;
-	struct sctp_association *asoc = q->asoc;
+	/* Current transport being used. It's NOT the same as curr active one */
 	struct sctp_transport *transport = NULL;
-	int error = 0;
-
 	/* These transports have chunks to send. */
-	struct list_head transport_list;
-	struct list_head *ltransport;
-
-	INIT_LIST_HEAD(&transport_list);
-	packet = NULL;
+	LIST_HEAD(transport_list);
 
 	/*
 	 * 6.10 Bundling
@@ -1218,27 +1234,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 
 sctp_flush_out:
 
-	/* Before returning, examine all the transports touched in
-	 * this call.  Right now, we bluntly force clear all the
-	 * transports.  Things might change after we implement Nagle.
-	 * But such an examination is still required.
-	 *
-	 * --xguo
-	 */
-	while ((ltransport = sctp_list_dequeue(&transport_list)) != NULL) {
-		struct sctp_transport *t = list_entry(ltransport,
-						      struct sctp_transport,
-						      send_ready);
-		packet = &t->packet;
-		if (!sctp_packet_empty(packet)) {
-			error = sctp_packet_transmit(packet, gfp);
-			if (error < 0)
-				asoc->base.sk->sk_err = -error;
-		}
-
-		/* Clear the burst limited state, if any */
-		sctp_transport_burst_reset(t);
-	}
+	sctp_outq_flush_transports(q, &transport_list, gfp);
 }
 
 /* Update unack_data based on the incoming SACK chunk */
-- 
2.14.3

^ permalink raw reply related

* [PATCH net-next v3 7/8] sctp: make use of gfp on retransmissions
From: Marcelo Ricardo Leitner @ 2018-05-14 17:34 UTC (permalink / raw)
  To: netdev; +Cc: linux-sctp, Neil Horman, Xin Long, Vlad Yasevich
In-Reply-To: <cover.1526318522.git.marcelo.leitner@gmail.com>

Retransmissions may be triggered when in user context, so lets make use
of gfp.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
---
 net/sctp/outqueue.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 44465e64857b79636a78917a12776402ccb8f990..e1632b8e2900f88070aebc82bbe4eca61f424e87 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -608,7 +608,7 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
  * The return value is a normal kernel error return value.
  */
 static int __sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
-				 int rtx_timeout, int *start_timer)
+				 int rtx_timeout, int *start_timer, gfp_t gfp)
 {
 	struct sctp_transport *transport = pkt->transport;
 	struct sctp_chunk *chunk, *chunk1;
@@ -684,12 +684,12 @@ static int __sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
 				 * control chunks are already freed so there
 				 * is nothing we can do.
 				 */
-				sctp_packet_transmit(pkt, GFP_ATOMIC);
+				sctp_packet_transmit(pkt, gfp);
 				goto redo;
 			}
 
 			/* Send this packet.  */
-			error = sctp_packet_transmit(pkt, GFP_ATOMIC);
+			error = sctp_packet_transmit(pkt, gfp);
 
 			/* If we are retransmitting, we should only
 			 * send a single packet.
@@ -705,7 +705,7 @@ static int __sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
 
 		case SCTP_XMIT_RWND_FULL:
 			/* Send this packet. */
-			error = sctp_packet_transmit(pkt, GFP_ATOMIC);
+			error = sctp_packet_transmit(pkt, gfp);
 
 			/* Stop sending DATA as there is no more room
 			 * at the receiver.
@@ -715,7 +715,7 @@ static int __sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
 
 		case SCTP_XMIT_DELAY:
 			/* Send this packet. */
-			error = sctp_packet_transmit(pkt, GFP_ATOMIC);
+			error = sctp_packet_transmit(pkt, gfp);
 
 			/* Stop sending DATA because of nagle delay. */
 			done = 1;
@@ -991,7 +991,7 @@ static void sctp_outq_flush_ctrl(struct sctp_outq *q,
 static bool sctp_outq_flush_rtx(struct sctp_outq *q,
 				struct sctp_transport **_transport,
 				struct list_head *transport_list,
-				int rtx_timeout)
+				int rtx_timeout, gfp_t gfp)
 {
 	struct sctp_transport *transport = *_transport;
 	struct sctp_packet *packet = transport ? &transport->packet : NULL;
@@ -1015,7 +1015,8 @@ static bool sctp_outq_flush_rtx(struct sctp_outq *q,
 				   asoc->peer.ecn_capable);
 	}
 
-	error = __sctp_outq_flush_rtx(q, packet, rtx_timeout, &start_timer);
+	error = __sctp_outq_flush_rtx(q, packet, rtx_timeout, &start_timer,
+				      gfp);
 	if (error < 0)
 		asoc->base.sk->sk_err = -error;
 
@@ -1074,7 +1075,7 @@ static void sctp_outq_flush_data(struct sctp_outq *q,
 		 */
 		if (!list_empty(&q->retransmit)) {
 			if (!sctp_outq_flush_rtx(q, _transport, transport_list,
-						 rtx_timeout))
+						 rtx_timeout, gfp))
 				break;
 			/* We may have switched current transport */
 			transport = *_transport;
-- 
2.14.3

^ permalink raw reply related

* [PATCH net-next v3 4/8] sctp: move outq data rtx code out of sctp_outq_flush
From: Marcelo Ricardo Leitner @ 2018-05-14 17:34 UTC (permalink / raw)
  To: netdev; +Cc: linux-sctp, Neil Horman, Xin Long, Vlad Yasevich
In-Reply-To: <cover.1526318522.git.marcelo.leitner@gmail.com>

This patch renames current sctp_outq_flush_rtx to __sctp_outq_flush_rtx
and create a new sctp_outq_flush_rtx, with the code that was on
sctp_outq_flush. Again, the idea is to have functions with small and
defined objectives.

Yes, there is an open-coded path selection in the now sctp_outq_flush_rtx.
That is kept as is for now because it may be very different when we
implement retransmission path selection algorithms for CMT-SCTP.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
---
 net/sctp/outqueue.c | 101 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 58 insertions(+), 43 deletions(-)

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 92f14f51edf28cf596c76b98854a10c0fd28bb22..49e80bf2ade73914f3be275016df0a6e7f06f606 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -601,14 +601,14 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
 
 /*
  * Transmit DATA chunks on the retransmit queue.  Upon return from
- * sctp_outq_flush_rtx() the packet 'pkt' may contain chunks which
+ * __sctp_outq_flush_rtx() the packet 'pkt' may contain chunks which
  * need to be transmitted by the caller.
  * We assume that pkt->transport has already been set.
  *
  * The return value is a normal kernel error return value.
  */
-static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
-			       int rtx_timeout, int *start_timer)
+static int __sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
+				 int rtx_timeout, int *start_timer)
 {
 	struct sctp_transport *transport = pkt->transport;
 	struct sctp_chunk *chunk, *chunk1;
@@ -987,6 +987,57 @@ static void sctp_outq_flush_ctrl(struct sctp_outq *q,
 	}
 }
 
+/* Returns false if new data shouldn't be sent */
+static bool sctp_outq_flush_rtx(struct sctp_outq *q,
+				struct sctp_transport **_transport,
+				struct list_head *transport_list,
+				int rtx_timeout)
+{
+	struct sctp_transport *transport = *_transport;
+	struct sctp_packet *packet = transport ? &transport->packet : NULL;
+	struct sctp_association *asoc = q->asoc;
+	int error, start_timer = 0;
+
+	if (asoc->peer.retran_path->state == SCTP_UNCONFIRMED)
+		return false;
+
+	if (transport != asoc->peer.retran_path) {
+		/* Switch transports & prepare the packet.  */
+		transport = asoc->peer.retran_path;
+		*_transport = transport;
+
+		if (list_empty(&transport->send_ready))
+			list_add_tail(&transport->send_ready,
+				      transport_list);
+
+		packet = &transport->packet;
+		sctp_packet_config(packet, asoc->peer.i.init_tag,
+				   asoc->peer.ecn_capable);
+	}
+
+	error = __sctp_outq_flush_rtx(q, packet, rtx_timeout, &start_timer);
+	if (error < 0)
+		asoc->base.sk->sk_err = -error;
+
+	if (start_timer) {
+		sctp_transport_reset_t3_rtx(transport);
+		transport->last_time_sent = jiffies;
+	}
+
+	/* This can happen on COOKIE-ECHO resend.  Only
+	 * one chunk can get bundled with a COOKIE-ECHO.
+	 */
+	if (packet->has_cookie_echo)
+		return false;
+
+	/* Don't send new data if there is still data
+	 * waiting to retransmit.
+	 */
+	if (!list_empty(&q->retransmit))
+		return false;
+
+	return true;
+}
 /*
  * Try to flush an outqueue.
  *
@@ -1000,12 +1051,10 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 {
 	struct sctp_packet *packet;
 	struct sctp_association *asoc = q->asoc;
-	__u32 vtag = asoc->peer.i.init_tag;
 	struct sctp_transport *transport = NULL;
 	struct sctp_chunk *chunk;
 	enum sctp_xmit status;
 	int error = 0;
-	int start_timer = 0;
 
 	/* These transports have chunks to send. */
 	struct list_head transport_list;
@@ -1053,45 +1102,11 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 		 * current cwnd).
 		 */
 		if (!list_empty(&q->retransmit)) {
-			if (asoc->peer.retran_path->state == SCTP_UNCONFIRMED)
-				goto sctp_flush_out;
-			if (transport == asoc->peer.retran_path)
-				goto retran;
-
-			/* Switch transports & prepare the packet.  */
-
-			transport = asoc->peer.retran_path;
-
-			if (list_empty(&transport->send_ready)) {
-				list_add_tail(&transport->send_ready,
-					      &transport_list);
-			}
-
+			if (!sctp_outq_flush_rtx(q, &transport, &transport_list,
+						 rtx_timeout))
+				break;
+			/* We may have switched current transport */
 			packet = &transport->packet;
-			sctp_packet_config(packet, vtag,
-					   asoc->peer.ecn_capable);
-		retran:
-			error = sctp_outq_flush_rtx(q, packet,
-						    rtx_timeout, &start_timer);
-			if (error < 0)
-				asoc->base.sk->sk_err = -error;
-
-			if (start_timer) {
-				sctp_transport_reset_t3_rtx(transport);
-				transport->last_time_sent = jiffies;
-			}
-
-			/* This can happen on COOKIE-ECHO resend.  Only
-			 * one chunk can get bundled with a COOKIE-ECHO.
-			 */
-			if (packet->has_cookie_echo)
-				goto sctp_flush_out;
-
-			/* Don't send new data if there is still data
-			 * waiting to retransmit.
-			 */
-			if (!list_empty(&q->retransmit))
-				goto sctp_flush_out;
 		}
 
 		/* Apply Max.Burst limitation to the current transport in
-- 
2.14.3

^ permalink raw reply related

* [PATCH net-next v3 2/8] sctp: factor out sctp_outq_select_transport
From: Marcelo Ricardo Leitner @ 2018-05-14 17:34 UTC (permalink / raw)
  To: netdev; +Cc: linux-sctp, Neil Horman, Xin Long, Vlad Yasevich
In-Reply-To: <cover.1526318522.git.marcelo.leitner@gmail.com>

We had two spots doing such complex operation and they were very close to
each other, a bit more tailored to here or there.

This patch unifies these under the same function,
sctp_outq_select_transport, which knows how to handle control chunks and
original transmissions (but not retransmissions).

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
---
 net/sctp/outqueue.c | 187 +++++++++++++++++++++++++---------------------------
 1 file changed, 90 insertions(+), 97 deletions(-)

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 300bd0dfc7c14c9df579dbe2f9e78dd8356ae1a3..bda50596d4bfebeac03966c5a161473df1c1986a 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -791,6 +791,90 @@ static int sctp_packet_singleton(struct sctp_transport *transport,
 	return sctp_packet_transmit(&singleton, gfp);
 }
 
+static bool sctp_outq_select_transport(struct sctp_chunk *chunk,
+				       struct sctp_association *asoc,
+				       struct sctp_transport **transport,
+				       struct list_head *transport_list)
+{
+	struct sctp_transport *new_transport = chunk->transport;
+	struct sctp_transport *curr = *transport;
+	bool changed = false;
+
+	if (!new_transport) {
+		if (!sctp_chunk_is_data(chunk)) {
+			/*
+			 * If we have a prior transport pointer, see if
+			 * the destination address of the chunk
+			 * matches the destination address of the
+			 * current transport.  If not a match, then
+			 * try to look up the transport with a given
+			 * destination address.  We do this because
+			 * after processing ASCONFs, we may have new
+			 * transports created.
+			 */
+			if (curr && sctp_cmp_addr_exact(&chunk->dest,
+							&curr->ipaddr))
+				new_transport = curr;
+			else
+				new_transport = sctp_assoc_lookup_paddr(asoc,
+								  &chunk->dest);
+		}
+
+		/* if we still don't have a new transport, then
+		 * use the current active path.
+		 */
+		if (!new_transport)
+			new_transport = asoc->peer.active_path;
+	} else {
+		__u8 type;
+
+		switch (new_transport->state) {
+		case SCTP_INACTIVE:
+		case SCTP_UNCONFIRMED:
+		case SCTP_PF:
+			/* If the chunk is Heartbeat or Heartbeat Ack,
+			 * send it to chunk->transport, even if it's
+			 * inactive.
+			 *
+			 * 3.3.6 Heartbeat Acknowledgement:
+			 * ...
+			 * A HEARTBEAT ACK is always sent to the source IP
+			 * address of the IP datagram containing the
+			 * HEARTBEAT chunk to which this ack is responding.
+			 * ...
+			 *
+			 * ASCONF_ACKs also must be sent to the source.
+			 */
+			type = chunk->chunk_hdr->type;
+			if (type != SCTP_CID_HEARTBEAT &&
+			    type != SCTP_CID_HEARTBEAT_ACK &&
+			    type != SCTP_CID_ASCONF_ACK)
+				new_transport = asoc->peer.active_path;
+			break;
+		default:
+			break;
+		}
+	}
+
+	/* Are we switching transports? Take care of transport locks. */
+	if (new_transport != curr) {
+		changed = true;
+		curr = new_transport;
+		*transport = curr;
+		if (list_empty(&curr->send_ready))
+			list_add_tail(&curr->send_ready, transport_list);
+
+		sctp_packet_config(&curr->packet, asoc->peer.i.init_tag,
+				   asoc->peer.ecn_capable);
+		/* We've switched transports, so apply the
+		 * Burst limit to the new transport.
+		 */
+		sctp_transport_burst_limited(curr);
+	}
+
+	return changed;
+}
+
 /*
  * Try to flush an outqueue.
  *
@@ -806,7 +890,6 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 	struct sctp_association *asoc = q->asoc;
 	__u32 vtag = asoc->peer.i.init_tag;
 	struct sctp_transport *transport = NULL;
-	struct sctp_transport *new_transport;
 	struct sctp_chunk *chunk, *tmp;
 	enum sctp_xmit status;
 	int error = 0;
@@ -843,68 +926,12 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 
 		list_del_init(&chunk->list);
 
-		/* Pick the right transport to use. */
-		new_transport = chunk->transport;
-
-		if (!new_transport) {
-			/*
-			 * If we have a prior transport pointer, see if
-			 * the destination address of the chunk
-			 * matches the destination address of the
-			 * current transport.  If not a match, then
-			 * try to look up the transport with a given
-			 * destination address.  We do this because
-			 * after processing ASCONFs, we may have new
-			 * transports created.
-			 */
-			if (transport &&
-			    sctp_cmp_addr_exact(&chunk->dest,
-						&transport->ipaddr))
-					new_transport = transport;
-			else
-				new_transport = sctp_assoc_lookup_paddr(asoc,
-								&chunk->dest);
-
-			/* if we still don't have a new transport, then
-			 * use the current active path.
-			 */
-			if (!new_transport)
-				new_transport = asoc->peer.active_path;
-		} else if ((new_transport->state == SCTP_INACTIVE) ||
-			   (new_transport->state == SCTP_UNCONFIRMED) ||
-			   (new_transport->state == SCTP_PF)) {
-			/* If the chunk is Heartbeat or Heartbeat Ack,
-			 * send it to chunk->transport, even if it's
-			 * inactive.
-			 *
-			 * 3.3.6 Heartbeat Acknowledgement:
-			 * ...
-			 * A HEARTBEAT ACK is always sent to the source IP
-			 * address of the IP datagram containing the
-			 * HEARTBEAT chunk to which this ack is responding.
-			 * ...
-			 *
-			 * ASCONF_ACKs also must be sent to the source.
-			 */
-			if (chunk->chunk_hdr->type != SCTP_CID_HEARTBEAT &&
-			    chunk->chunk_hdr->type != SCTP_CID_HEARTBEAT_ACK &&
-			    chunk->chunk_hdr->type != SCTP_CID_ASCONF_ACK)
-				new_transport = asoc->peer.active_path;
-		}
-
-		/* Are we switching transports?
-		 * Take care of transport locks.
+		/* Pick the right transport to use. Should always be true for
+		 * the first chunk as we don't have a transport by then.
 		 */
-		if (new_transport != transport) {
-			transport = new_transport;
-			if (list_empty(&transport->send_ready)) {
-				list_add_tail(&transport->send_ready,
-					      &transport_list);
-			}
+		if (sctp_outq_select_transport(chunk, asoc, &transport,
+					       &transport_list))
 			packet = &transport->packet;
-			sctp_packet_config(packet, vtag,
-					   asoc->peer.ecn_capable);
-		}
 
 		switch (chunk->chunk_hdr->type) {
 		/*
@@ -1072,43 +1099,9 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 				goto sctp_flush_out;
 			}
 
-			/* If there is a specified transport, use it.
-			 * Otherwise, we want to use the active path.
-			 */
-			new_transport = chunk->transport;
-			if (!new_transport ||
-			    ((new_transport->state == SCTP_INACTIVE) ||
-			     (new_transport->state == SCTP_UNCONFIRMED) ||
-			     (new_transport->state == SCTP_PF)))
-				new_transport = asoc->peer.active_path;
-			if (new_transport->state == SCTP_UNCONFIRMED) {
-				WARN_ONCE(1, "Attempt to send packet on unconfirmed path.");
-				sctp_sched_dequeue_done(q, chunk);
-				sctp_chunk_fail(chunk, 0);
-				sctp_chunk_free(chunk);
-				continue;
-			}
-
-			/* Change packets if necessary.  */
-			if (new_transport != transport) {
-				transport = new_transport;
-
-				/* Schedule to have this transport's
-				 * packet flushed.
-				 */
-				if (list_empty(&transport->send_ready)) {
-					list_add_tail(&transport->send_ready,
-						      &transport_list);
-				}
-
+			if (sctp_outq_select_transport(chunk, asoc, &transport,
+						       &transport_list))
 				packet = &transport->packet;
-				sctp_packet_config(packet, vtag,
-						   asoc->peer.ecn_capable);
-				/* We've switched transports, so apply the
-				 * Burst limit to the new transport.
-				 */
-				sctp_transport_burst_limited(transport);
-			}
 
 			pr_debug("%s: outq:%p, chunk:%p[%s], tx-tsn:0x%x skb->head:%p "
 				 "skb->users:%d\n",
-- 
2.14.3

^ permalink raw reply related

* [PATCH net-next v3 3/8] sctp: move the flush of ctrl chunks into its own function
From: Marcelo Ricardo Leitner @ 2018-05-14 17:34 UTC (permalink / raw)
  To: netdev; +Cc: linux-sctp, Neil Horman, Xin Long, Vlad Yasevich
In-Reply-To: <cover.1526318522.git.marcelo.leitner@gmail.com>

Named sctp_outq_flush_ctrl and, with that, keep the contexts contained.

One small fix embedded is the reset of one_packet at every iteration.
This allows bundling of some control chunks in case they were preceeded by
another control chunk that cannot be bundled.

Other than this, it has the same behavior.

Changes since v2:
- Fixed panic reported by kbuild test robot if building with
  only up to this patch applied, due to bad parameter to
  sctp_outq_select_transport and by not initializing packet after
  calling sctp_outq_flush_ctrl.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
---
 net/sctp/outqueue.c | 92 ++++++++++++++++++++++++++++++++---------------------
 1 file changed, 56 insertions(+), 36 deletions(-)

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index bda50596d4bfebeac03966c5a161473df1c1986a..92f14f51edf28cf596c76b98854a10c0fd28bb22 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -875,45 +875,21 @@ static bool sctp_outq_select_transport(struct sctp_chunk *chunk,
 	return changed;
 }

-/*
- * Try to flush an outqueue.
- *
- * Description: Send everything in q which we legally can, subject to
- * congestion limitations.
- * * Note: This function can be called from multiple contexts so appropriate
- * locking concerns must be made.  Today we use the sock lock to protect
- * this function.
- */
-static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
+static void sctp_outq_flush_ctrl(struct sctp_outq *q,
+				 struct sctp_transport **_transport,
+				 struct list_head *transport_list,
+				 gfp_t gfp)
 {
-	struct sctp_packet *packet;
+	struct sctp_transport *transport = *_transport;
 	struct sctp_association *asoc = q->asoc;
-	__u32 vtag = asoc->peer.i.init_tag;
-	struct sctp_transport *transport = NULL;
+	struct sctp_packet *packet = NULL;
 	struct sctp_chunk *chunk, *tmp;
 	enum sctp_xmit status;
-	int error = 0;
-	int start_timer = 0;
-	int one_packet = 0;
-
-	/* These transports have chunks to send. */
-	struct list_head transport_list;
-	struct list_head *ltransport;
-
-	INIT_LIST_HEAD(&transport_list);
-	packet = NULL;
-
-	/*
-	 * 6.10 Bundling
-	 *   ...
-	 *   When bundling control chunks with DATA chunks, an
-	 *   endpoint MUST place control chunks first in the outbound
-	 *   SCTP packet.  The transmitter MUST transmit DATA chunks
-	 *   within a SCTP packet in increasing order of TSN.
-	 *   ...
-	 */
+	int one_packet, error;

 	list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) {
+		one_packet = 0;
+
 		/* RFC 5061, 5.3
 		 * F1) This means that until such time as the ASCONF
 		 * containing the add is acknowledged, the sender MUST
@@ -929,9 +905,11 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 		/* Pick the right transport to use. Should always be true for
 		 * the first chunk as we don't have a transport by then.
 		 */
-		if (sctp_outq_select_transport(chunk, asoc, &transport,
-					       &transport_list))
+		if (sctp_outq_select_transport(chunk, asoc, _transport,
+					       transport_list)) {
+			transport = *_transport;
 			packet = &transport->packet;
+		}

 		switch (chunk->chunk_hdr->type) {
 		/*
@@ -954,6 +932,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 			if (sctp_test_T_bit(chunk))
 				packet->vtag = asoc->c.my_vtag;
 			/* fallthru */
+
 		/* The following chunks are "response" chunks, i.e.
 		 * they are generated in response to something we
 		 * received.  If we are sending these, then we can
@@ -979,7 +958,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 		case SCTP_CID_RECONF:
 			status = sctp_packet_transmit_chunk(packet, chunk,
 							    one_packet, gfp);
-			if (status  != SCTP_XMIT_OK) {
+			if (status != SCTP_XMIT_OK) {
 				/* put the chunk back */
 				list_add(&chunk->list, &q->control_chunk_list);
 				break;
@@ -1006,6 +985,47 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 			BUG();
 		}
 	}
+}
+
+/*
+ * Try to flush an outqueue.
+ *
+ * Description: Send everything in q which we legally can, subject to
+ * congestion limitations.
+ * * Note: This function can be called from multiple contexts so appropriate
+ * locking concerns must be made.  Today we use the sock lock to protect
+ * this function.
+ */
+static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
+{
+	struct sctp_packet *packet;
+	struct sctp_association *asoc = q->asoc;
+	__u32 vtag = asoc->peer.i.init_tag;
+	struct sctp_transport *transport = NULL;
+	struct sctp_chunk *chunk;
+	enum sctp_xmit status;
+	int error = 0;
+	int start_timer = 0;
+
+	/* These transports have chunks to send. */
+	struct list_head transport_list;
+	struct list_head *ltransport;
+
+	INIT_LIST_HEAD(&transport_list);
+	packet = NULL;
+
+	/*
+	 * 6.10 Bundling
+	 *   ...
+	 *   When bundling control chunks with DATA chunks, an
+	 *   endpoint MUST place control chunks first in the outbound
+	 *   SCTP packet.  The transmitter MUST transmit DATA chunks
+	 *   within a SCTP packet in increasing order of TSN.
+	 *   ...
+	 */
+
+	sctp_outq_flush_ctrl(q, &transport, &transport_list, gfp);
+	packet = &transport->packet;

 	if (q->asoc->src_out_of_asoc_ok)
 		goto sctp_flush_out;

^ permalink raw reply related

* [PATCH net-next v3 0/8] sctp: refactor sctp_outq_flush
From: Marcelo Ricardo Leitner @ 2018-05-14 17:34 UTC (permalink / raw)
  To: netdev; +Cc: linux-sctp, Neil Horman, Xin Long, Vlad Yasevich

Currently sctp_outq_flush does many different things and arguably
unrelated, such as doing transport selection and outq dequeueing.

This patchset refactors it into smaller and more dedicated functions.
The end behavior should be the same.

The next patchset will rework the function parameters.

Changes since v1:
- fix build issues on patches 3 and 4, and updated 5 and 8 because of
  it.

Changes since v2:
- fixed panic if building with just up to patch 3 applied

Marcelo Ricardo Leitner (8):
  sctp: add sctp_packet_singleton
  sctp: factor out sctp_outq_select_transport
  sctp: move the flush of ctrl chunks into its own function
  sctp: move outq data rtx code out of sctp_outq_flush
  sctp: move flushing of data chunks out of sctp_outq_flush
  sctp: move transport flush code out of sctp_outq_flush
  sctp: make use of gfp on retransmissions
  sctp: rework switch cases in sctp_outq_flush_data

 net/sctp/outqueue.c | 593 +++++++++++++++++++++++++++-------------------------
 1 file changed, 311 insertions(+), 282 deletions(-)

^ permalink raw reply

* [PATCH net-next v3 1/8] sctp: add sctp_packet_singleton
From: Marcelo Ricardo Leitner @ 2018-05-14 17:34 UTC (permalink / raw)
  To: netdev; +Cc: linux-sctp, Neil Horman, Xin Long, Vlad Yasevich
In-Reply-To: <cover.1526318522.git.marcelo.leitner@gmail.com>

Factor out the code for generating singletons. It's used only once, but
helps to keep the context contained.

The const variables are to ease the reading of subsequent calls in there.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
---
 net/sctp/outqueue.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index dee7cbd5483149024f2f3195db2fe4d473b1a00a..300bd0dfc7c14c9df579dbe2f9e78dd8356ae1a3 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -776,6 +776,20 @@ void sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp)
 	sctp_outq_flush(q, 0, gfp);
 }
 
+static int sctp_packet_singleton(struct sctp_transport *transport,
+				 struct sctp_chunk *chunk, gfp_t gfp)
+{
+	const struct sctp_association *asoc = transport->asoc;
+	const __u16 sport = asoc->base.bind_addr.port;
+	const __u16 dport = asoc->peer.port;
+	const __u32 vtag = asoc->peer.i.init_tag;
+	struct sctp_packet singleton;
+
+	sctp_packet_init(&singleton, transport, sport, dport);
+	sctp_packet_config(&singleton, vtag, 0);
+	sctp_packet_append_chunk(&singleton, chunk);
+	return sctp_packet_transmit(&singleton, gfp);
+}
 
 /*
  * Try to flush an outqueue.
@@ -789,10 +803,7 @@ void sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp)
 static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 {
 	struct sctp_packet *packet;
-	struct sctp_packet singleton;
 	struct sctp_association *asoc = q->asoc;
-	__u16 sport = asoc->base.bind_addr.port;
-	__u16 dport = asoc->peer.port;
 	__u32 vtag = asoc->peer.i.init_tag;
 	struct sctp_transport *transport = NULL;
 	struct sctp_transport *new_transport;
@@ -905,10 +916,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 		case SCTP_CID_INIT:
 		case SCTP_CID_INIT_ACK:
 		case SCTP_CID_SHUTDOWN_COMPLETE:
-			sctp_packet_init(&singleton, transport, sport, dport);
-			sctp_packet_config(&singleton, vtag, 0);
-			sctp_packet_append_chunk(&singleton, chunk);
-			error = sctp_packet_transmit(&singleton, gfp);
+			error = sctp_packet_singleton(transport, chunk, gfp);
 			if (error < 0) {
 				asoc->base.sk->sk_err = -error;
 				return;
-- 
2.14.3

^ permalink raw reply related

* (unknown), 
From: Jessica @ 2018-05-14 17:30 UTC (permalink / raw)


Hallo groeten, kunt u me alsjeblieft dringend terugschrijven.

^ permalink raw reply

* Re: RFQ 5142018 OCEAN
From: Ocean-trade co.Ltd @ 2018-05-14  6:46 UTC (permalink / raw)


[-- Attachment #1: Type: text/plain, Size: 316 bytes --]

Good morning,

Just wondering if this quotation you gave my colleague is still valid?

Please kindly re-quote for the attached asap,

Awaiting your reply.


Thanks.

Sincerely,
Yusuf Ibrahim

(OCEAN TRADE LTD)
D/89, SherShah Road, S.I.T.E., Karachi-75730,
Pakistan.
TEL: +(92-21) 111-267-354
FAX: +(92-21) 325-872-40

[-- Attachment #2: OCEAN TRADE_PDF.jar --]
[-- Type: application/java-archive, Size: 678901 bytes --]

^ permalink raw reply

* Re: [net-next PATCH 1/3] net: Refactor XPS for CPUs and Rx queues
From: Nambiar, Amritha @ 2018-05-14 17:18 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Linux Kernel Network Developers, David S. Miller, Alexander Duyck,
	Sridhar Samudrala, Eric Dumazet, Hannes Frederic Sowa
In-Reply-To: <CALx6S34sedBi5VMhaCf2QZLQ57s6OUSeKoGctwK9BkgkNvXcog@mail.gmail.com>

On 5/14/2018 9:53 AM, Tom Herbert wrote:
> On Wed, May 9, 2018 at 1:54 PM, Nambiar, Amritha
> <amritha.nambiar@intel.com> wrote:
>> On 5/9/2018 1:31 PM, Tom Herbert wrote:
>>> On Thu, Apr 19, 2018 at 6:04 PM, Amritha Nambiar
>>> <amritha.nambiar@intel.com> wrote:
>>>> Refactor XPS code to support Tx queue selection based on
>>>> CPU map or Rx queue map.
>>>>
>>>> Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
>>>> ---
>>>>  include/linux/netdevice.h |   82 +++++++++++++++++-
>>>>  net/core/dev.c            |  206 +++++++++++++++++++++++++++++----------------
>>>>  net/core/net-sysfs.c      |    4 -
>>>>  3 files changed, 216 insertions(+), 76 deletions(-)
>>>>
>>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>>> index 14e0777..40a9171 100644
>>>> --- a/include/linux/netdevice.h
>>>> +++ b/include/linux/netdevice.h
>>>> @@ -730,10 +730,21 @@ struct xps_map {
>>>>   */
>>>>  struct xps_dev_maps {
>>>>         struct rcu_head rcu;
>>>> -       struct xps_map __rcu *cpu_map[0];
>>>> +       struct xps_map __rcu *attr_map[0];
>>>
>>> This seems unnecessarily complicated to me. Why not just add another
>>> map called something like "rxq2txq_map". Then when selecting TXQ just
>>> check the new map first and then the normal cpu_map if there's not a
>>> hit.
>>>
>>
>> This is just a change in the name to something more generic ('attr')
>> since the maps can either be cpu based or rxq based. I have added two
>> map types, XPS_MAP_RXQS, XPS_MAP_CPUS and the TXQ selection (in patch
> 
> I think adding map types is overkill and we really don't want to turn
> this in to a generic but complex interface with a bunch of map types.
> Just have two pointers to the two different maps.
> 

Sorry if I wasn't clear enough, instead of having two different pointers:
struct xps_dev_maps __rcu *xps_cpu_maps;
struct xps_dev_maps __rcu *xps_rxq_maps;
in the struct netdevice below, I have a pointer array:
struct xps_dev_maps __rcu *xps_maps[__XPS_MAP_MAX];
and use the map type enum as an index to select which one we are updating.

The idea was this might look cleaner while selecting the queue where the
processing order would be XPS_MAP_RXQS first and then XPS_MAP_CPUS.

>> 2/3) works how you described,  first based on the RXQ map and if there
>> is no hit, falls to the normal CPU map.
>>
>>>>  };
>>>> -#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) +         \
>>>> +
>>>> +#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) +     \
>>>>         (nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))
>>>> +
>>>> +#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\
>>>> +       (_rxqs * (_tcs) * sizeof(struct xps_map *)))
>>>> +
>>>> +enum xps_map_type {
>>>> +       XPS_MAP_RXQS,
>>>> +       XPS_MAP_CPUS,
>>>> +       __XPS_MAP_MAX
>>>> +};
>>>> +
>>>>  #endif /* CONFIG_XPS */
>>>>
>>>>  #define TC_MAX_QUEUE   16
>>>> @@ -1867,7 +1878,7 @@ struct net_device {
>>>>         int                     watchdog_timeo;
>>>>
>>>>  #ifdef CONFIG_XPS
>>>> -       struct xps_dev_maps __rcu *xps_maps;
>>>> +       struct xps_dev_maps __rcu *xps_maps[__XPS_MAP_MAX];
>>>>  #endif
>>>>  #ifdef CONFIG_NET_CLS_ACT
>>>>         struct mini_Qdisc __rcu *miniq_egress;
>>>> @@ -3204,6 +3215,71 @@ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
>>>>  #ifdef CONFIG_XPS
>>>>  int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>>>                         u16 index);
>>>> +int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
>>>> +                         u16 index, enum xps_map_type type);
>>>> +
>>>> +static inline bool attr_test_mask(unsigned long j, const unsigned long *mask,
>>>> +                                 unsigned int nr_bits)
>>>> +{
>>>> +#ifdef CONFIG_DEBUG_PER_CPU_MAPS
>>>> +       WARN_ON_ONCE(j >= nr_bits);
>>>> +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
>>>
>>> This #ifdef block appears 3 times in the patch. Seems like it should
>>> be replace by simple macro.
>>
>> Sure, will do in the next version.
>>
>>>
>>>> +       return test_bit(j, mask);
>>>> +}
>>>> +
>>>> +static inline bool attr_test_online(unsigned long j,
>>>> +                                   const unsigned long *online_mask,
>>>> +                                   unsigned int nr_bits)
>>>> +{
>>>> +#ifdef CONFIG_DEBUG_PER_CPU_MAPS
>>>> +       WARN_ON_ONCE(j >= nr_bits);
>>>> +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
>>>> +
>>>> +       if (online_mask)
>>>> +               return test_bit(j, online_mask);
>>>> +
>>>> +       if (j >= 0 && j < nr_bits)
>>>> +               return true;
>>>> +
>>>> +       return false;
>>>> +}
>>>> +
>>>> +static inline unsigned int attrmask_next(int n, const unsigned long *srcp,
>>>> +                                        unsigned int nr_bits)
>>>> +{
>>>> +       /* -1 is a legal arg here. */
>>>> +       if (n != -1) {
>>>> +#ifdef CONFIG_DEBUG_PER_CPU_MAPS
>>>> +               WARN_ON_ONCE(n >= nr_bits);
>>>> +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
>>>> +       }
>>>> +
>>>> +       if (srcp)
>>>> +               return find_next_bit(srcp, nr_bits, n + 1);
>>>> +
>>>> +       return n + 1;
>>>> +}
>>>> +
>>>> +static inline int attrmask_next_and(int n, const unsigned long *src1p,
>>>> +                                   const unsigned long *src2p,
>>>> +                                   unsigned int nr_bits)
>>>> +{
>>>> +       /* -1 is a legal arg here. */
>>>> +       if (n != -1) {
>>>> +#ifdef CONFIG_DEBUG_PER_CPU_MAPS
>>>> +               WARN_ON_ONCE(n >= nr_bits);
>>>> +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
>>>> +       }
>>>> +
>>>> +       if (src1p && src2p)
>>>> +               return find_next_and_bit(src1p, src2p, nr_bits, n + 1);
>>>> +       else if (src1p)
>>>> +               return find_next_bit(src1p, nr_bits, n + 1);
>>>> +       else if (src2p)
>>>> +               return find_next_bit(src2p, nr_bits, n + 1);
>>>> +
>>>> +       return n + 1;
>>>> +}
>>>>  #else
>>>>  static inline int netif_set_xps_queue(struct net_device *dev,
>>>>                                       const struct cpumask *mask,
>>>> diff --git a/net/core/dev.c b/net/core/dev.c
>>>> index a490ef6..17c4883 100644
>>>> --- a/net/core/dev.c
>>>> +++ b/net/core/dev.c
>>>> @@ -2092,7 +2092,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
>>>>         int pos;
>>>>
>>>>         if (dev_maps)
>>>> -               map = xmap_dereference(dev_maps->cpu_map[tci]);
>>>> +               map = xmap_dereference(dev_maps->attr_map[tci]);
>>>>         if (!map)
>>>>                 return false;
>>>>
>>>> @@ -2105,7 +2105,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
>>>>                         break;
>>>>                 }
>>>>
>>>> -               RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
>>>> +               RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
>>>>                 kfree_rcu(map, rcu);
>>>>                 return false;
>>>>         }
>>>> @@ -2138,30 +2138,47 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
>>>>  static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
>>>>                                    u16 count)
>>>>  {
>>>> +       const unsigned long *possible_mask = NULL;
>>>> +       enum xps_map_type type = XPS_MAP_RXQS;
>>>>         struct xps_dev_maps *dev_maps;
>>>> -       int cpu, i;
>>>>         bool active = false;
>>>> +       unsigned int nr_ids;
>>>> +       int i, j;
>>>>
>>>>         mutex_lock(&xps_map_mutex);
>>>> -       dev_maps = xmap_dereference(dev->xps_maps);
>>>>
>>>> -       if (!dev_maps)
>>>> -               goto out_no_maps;
>>>> +       while (type < __XPS_MAP_MAX) {
>>>> +               dev_maps = xmap_dereference(dev->xps_maps[type]);
>>>> +               if (!dev_maps)
>>>> +                       goto out_no_maps;
>>>> +
>>>> +               if (type == XPS_MAP_CPUS) {
>>>> +                       if (num_possible_cpus() > 1)
>>>> +                               possible_mask = cpumask_bits(cpu_possible_mask);
>>>> +                       nr_ids = nr_cpu_ids;
>>>> +               } else if (type == XPS_MAP_RXQS) {
>>>> +                       nr_ids = dev->num_rx_queues;
>>>> +               }
>>>>
>>>> -       for_each_possible_cpu(cpu)
>>>> -               active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
>>>> -                                              offset, count);
>>>> +               for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
>>>> +                    j < nr_ids;)
>>>> +                       active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
>>>> +                                                      count);
>>>> +               if (!active) {
>>>> +                       RCU_INIT_POINTER(dev->xps_maps[type], NULL);
>>>> +                       kfree_rcu(dev_maps, rcu);
>>>> +               }
>>>>
>>>> -       if (!active) {
>>>> -               RCU_INIT_POINTER(dev->xps_maps, NULL);
>>>> -               kfree_rcu(dev_maps, rcu);
>>>> +               if (type == XPS_MAP_CPUS) {
>>>> +                       for (i = offset + (count - 1); count--; i--)
>>>> +                               netdev_queue_numa_node_write(
>>>> +                                       netdev_get_tx_queue(dev, i),
>>>> +                                                           NUMA_NO_NODE);
>>>> +               }
>>>> +out_no_maps:
>>>> +               type++;
>>>>         }
>>>>
>>>> -       for (i = offset + (count - 1); count--; i--)
>>>> -               netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
>>>> -                                            NUMA_NO_NODE);
>>>> -
>>>> -out_no_maps:
>>>>         mutex_unlock(&xps_map_mutex);
>>>>  }
>>>>
>>>> @@ -2170,11 +2187,11 @@ static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
>>>>         netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
>>>>  }
>>>>
>>>> -static struct xps_map *expand_xps_map(struct xps_map *map,
>>>> -                                     int cpu, u16 index)
>>>> +static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
>>>> +                                     u16 index, enum xps_map_type type)
>>>>  {
>>>> -       struct xps_map *new_map;
>>>>         int alloc_len = XPS_MIN_MAP_ALLOC;
>>>> +       struct xps_map *new_map = NULL;
>>>>         int i, pos;
>>>>
>>>>         for (pos = 0; map && pos < map->len; pos++) {
>>>> @@ -2183,7 +2200,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
>>>>                 return map;
>>>>         }
>>>>
>>>> -       /* Need to add queue to this CPU's existing map */
>>>> +       /* Need to add tx-queue to this CPU's/rx-queue's existing map */
>>>>         if (map) {
>>>>                 if (pos < map->alloc_len)
>>>>                         return map;
>>>> @@ -2191,9 +2208,14 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
>>>>                 alloc_len = map->alloc_len * 2;
>>>>         }
>>>>
>>>> -       /* Need to allocate new map to store queue on this CPU's map */
>>>> -       new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
>>>> -                              cpu_to_node(cpu));
>>>> +       /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
>>>> +        *  map
>>>> +        */
>>>> +       if (type == XPS_MAP_RXQS)
>>>> +               new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
>>>> +       else if (type == XPS_MAP_CPUS)
>>>> +               new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
>>>> +                                      cpu_to_node(attr_index));
>>>>         if (!new_map)
>>>>                 return NULL;
>>>>
>>>> @@ -2205,14 +2227,16 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
>>>>         return new_map;
>>>>  }
>>>>
>>>> -int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>>> -                       u16 index)
>>>> +int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
>>>> +                         u16 index, enum xps_map_type type)
>>>>  {
>>>> +       const unsigned long *online_mask = NULL, *possible_mask = NULL;
>>>>         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
>>>> -       int i, cpu, tci, numa_node_id = -2;
>>>> +       int i, j, tci, numa_node_id = -2;
>>>>         int maps_sz, num_tc = 1, tc = 0;
>>>>         struct xps_map *map, *new_map;
>>>>         bool active = false;
>>>> +       unsigned int nr_ids;
>>>>
>>>>         if (dev->num_tc) {
>>>>                 num_tc = dev->num_tc;
>>>> @@ -2221,16 +2245,33 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>>>                         return -EINVAL;
>>>>         }
>>>>
>>>> -       maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
>>>> +       switch (type) {
>>>> +       case XPS_MAP_RXQS:
>>>> +               maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
>>>> +               dev_maps = xmap_dereference(dev->xps_maps[XPS_MAP_RXQS]);
>>>> +               nr_ids = dev->num_rx_queues;
>>>> +               break;
>>>> +       case XPS_MAP_CPUS:
>>>> +               maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
>>>> +               if (num_possible_cpus() > 1) {
>>>> +                       online_mask = cpumask_bits(cpu_online_mask);
>>>> +                       possible_mask = cpumask_bits(cpu_possible_mask);
>>>> +               }
>>>> +               dev_maps = xmap_dereference(dev->xps_maps[XPS_MAP_CPUS]);
>>>> +               nr_ids = nr_cpu_ids;
>>>> +               break;
>>>> +       default:
>>>> +               return -EINVAL;
>>>> +       }
>>>> +
>>>>         if (maps_sz < L1_CACHE_BYTES)
>>>>                 maps_sz = L1_CACHE_BYTES;
>>>>
>>>>         mutex_lock(&xps_map_mutex);
>>>>
>>>> -       dev_maps = xmap_dereference(dev->xps_maps);
>>>> -
>>>>         /* allocate memory for queue storage */
>>>> -       for_each_cpu_and(cpu, cpu_online_mask, mask) {
>>>> +       for (j = -1; j = attrmask_next_and(j, online_mask, mask, nr_ids),
>>>> +            j < nr_ids;) {
>>>>                 if (!new_dev_maps)
>>>>                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
>>>>                 if (!new_dev_maps) {
>>>> @@ -2238,73 +2279,81 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>>>                         return -ENOMEM;
>>>>                 }
>>>>
>>>> -               tci = cpu * num_tc + tc;
>>>> -               map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
>>>> +               tci = j * num_tc + tc;
>>>> +               map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
>>>>                                  NULL;
>>>>
>>>> -               map = expand_xps_map(map, cpu, index);
>>>> +               map = expand_xps_map(map, j, index, type);
>>>>                 if (!map)
>>>>                         goto error;
>>>>
>>>> -               RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
>>>> +               RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
>>>>         }
>>>>
>>>>         if (!new_dev_maps)
>>>>                 goto out_no_new_maps;
>>>>
>>>> -       for_each_possible_cpu(cpu) {
>>>> +       for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
>>>> +            j < nr_ids;) {
>>>>                 /* copy maps belonging to foreign traffic classes */
>>>> -               for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
>>>> +               for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
>>>>                         /* fill in the new device map from the old device map */
>>>> -                       map = xmap_dereference(dev_maps->cpu_map[tci]);
>>>> -                       RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
>>>> +                       map = xmap_dereference(dev_maps->attr_map[tci]);
>>>> +                       RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
>>>>                 }
>>>>
>>>>                 /* We need to explicitly update tci as prevous loop
>>>>                  * could break out early if dev_maps is NULL.
>>>>                  */
>>>> -               tci = cpu * num_tc + tc;
>>>> +               tci = j * num_tc + tc;
>>>>
>>>> -               if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
>>>> -                       /* add queue to CPU maps */
>>>> +               if (attr_test_mask(j, mask, nr_ids) &&
>>>> +                   attr_test_online(j, online_mask, nr_ids)) {
>>>> +                       /* add tx-queue to CPU/rx-queue maps */
>>>>                         int pos = 0;
>>>>
>>>> -                       map = xmap_dereference(new_dev_maps->cpu_map[tci]);
>>>> +                       map = xmap_dereference(new_dev_maps->attr_map[tci]);
>>>>                         while ((pos < map->len) && (map->queues[pos] != index))
>>>>                                 pos++;
>>>>
>>>>                         if (pos == map->len)
>>>>                                 map->queues[map->len++] = index;
>>>>  #ifdef CONFIG_NUMA
>>>> -                       if (numa_node_id == -2)
>>>> -                               numa_node_id = cpu_to_node(cpu);
>>>> -                       else if (numa_node_id != cpu_to_node(cpu))
>>>> -                               numa_node_id = -1;
>>>> +                       if (type == XPS_MAP_CPUS) {
>>>> +                               if (numa_node_id == -2)
>>>> +                                       numa_node_id = cpu_to_node(j);
>>>> +                               else if (numa_node_id != cpu_to_node(j))
>>>> +                                       numa_node_id = -1;
>>>> +                       }
>>>>  #endif
>>>>                 } else if (dev_maps) {
>>>>                         /* fill in the new device map from the old device map */
>>>> -                       map = xmap_dereference(dev_maps->cpu_map[tci]);
>>>> -                       RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
>>>> +                       map = xmap_dereference(dev_maps->attr_map[tci]);
>>>> +                       RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
>>>>                 }
>>>>
>>>>                 /* copy maps belonging to foreign traffic classes */
>>>>                 for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
>>>>                         /* fill in the new device map from the old device map */
>>>> -                       map = xmap_dereference(dev_maps->cpu_map[tci]);
>>>> -                       RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
>>>> +                       map = xmap_dereference(dev_maps->attr_map[tci]);
>>>> +                       RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
>>>>                 }
>>>>         }
>>>>
>>>> -       rcu_assign_pointer(dev->xps_maps, new_dev_maps);
>>>> +       if (type == XPS_MAP_RXQS)
>>>> +               rcu_assign_pointer(dev->xps_maps[XPS_MAP_RXQS], new_dev_maps);
>>>> +       else if (type == XPS_MAP_CPUS)
>>>> +               rcu_assign_pointer(dev->xps_maps[XPS_MAP_CPUS], new_dev_maps);
>>>>
>>>>         /* Cleanup old maps */
>>>>         if (!dev_maps)
>>>>                 goto out_no_old_maps;
>>>>
>>>> -       for_each_possible_cpu(cpu) {
>>>> -               for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
>>>> -                       new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
>>>> -                       map = xmap_dereference(dev_maps->cpu_map[tci]);
>>>> +       for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
>>>> +            j < nr_ids;) {
>>>> +               for (i = num_tc, tci = j * num_tc; i--; tci++) {
>>>> +                       new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
>>>> +                       map = xmap_dereference(dev_maps->attr_map[tci]);
>>>>                         if (map && map != new_map)
>>>>                                 kfree_rcu(map, rcu);
>>>>                 }
>>>> @@ -2317,19 +2366,23 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>>>         active = true;
>>>>
>>>>  out_no_new_maps:
>>>> -       /* update Tx queue numa node */
>>>> -       netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
>>>> -                                    (numa_node_id >= 0) ? numa_node_id :
>>>> -                                    NUMA_NO_NODE);
>>>> +       if (type == XPS_MAP_CPUS) {
>>>> +               /* update Tx queue numa node */
>>>> +               netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
>>>> +                                            (numa_node_id >= 0) ?
>>>> +                                            numa_node_id : NUMA_NO_NODE);
>>>> +       }
>>>>
>>>>         if (!dev_maps)
>>>>                 goto out_no_maps;
>>>>
>>>> -       /* removes queue from unused CPUs */
>>>> -       for_each_possible_cpu(cpu) {
>>>> -               for (i = tc, tci = cpu * num_tc; i--; tci++)
>>>> +       /* removes tx-queue from unused CPUs/rx-queues */
>>>> +       for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
>>>> +            j < nr_ids;) {
>>>> +               for (i = tc, tci = j * num_tc; i--; tci++)
>>>>                         active |= remove_xps_queue(dev_maps, tci, index);
>>>> -               if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
>>>> +               if (!attr_test_mask(j, mask, nr_ids) ||
>>>> +                   !attr_test_online(j, online_mask, nr_ids))
>>>>                         active |= remove_xps_queue(dev_maps, tci, index);
>>>>                 for (i = num_tc - tc, tci++; --i; tci++)
>>>>                         active |= remove_xps_queue(dev_maps, tci, index);
>>>> @@ -2337,7 +2390,10 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>>>
>>>>         /* free map if not active */
>>>>         if (!active) {
>>>> -               RCU_INIT_POINTER(dev->xps_maps, NULL);
>>>> +               if (type == XPS_MAP_RXQS)
>>>> +                       RCU_INIT_POINTER(dev->xps_maps[XPS_MAP_RXQS], NULL);
>>>> +               else if (type == XPS_MAP_CPUS)
>>>> +                       RCU_INIT_POINTER(dev->xps_maps[XPS_MAP_CPUS], NULL);
>>>>                 kfree_rcu(dev_maps, rcu);
>>>>         }
>>>>
>>>> @@ -2347,11 +2403,12 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>>>         return 0;
>>>>  error:
>>>>         /* remove any maps that we added */
>>>> -       for_each_possible_cpu(cpu) {
>>>> -               for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
>>>> -                       new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
>>>> +       for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
>>>> +            j < nr_ids;) {
>>>> +               for (i = num_tc, tci = j * num_tc; i--; tci++) {
>>>> +                       new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
>>>>                         map = dev_maps ?
>>>> -                             xmap_dereference(dev_maps->cpu_map[tci]) :
>>>> +                             xmap_dereference(dev_maps->attr_map[tci]) :
>>>>                               NULL;
>>>>                         if (new_map && new_map != map)
>>>>                                 kfree(new_map);
>>>> @@ -2363,6 +2420,13 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>>>         kfree(new_dev_maps);
>>>>         return -ENOMEM;
>>>>  }
>>>> +
>>>> +int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>>> +                       u16 index)
>>>> +{
>>>> +       return __netif_set_xps_queue(dev, cpumask_bits(mask), index,
>>>> +                                    XPS_MAP_CPUS);
>>>> +}
>>>>  EXPORT_SYMBOL(netif_set_xps_queue);
>>>>
>>>>  #endif
>>>> @@ -3400,7 +3464,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>>>>         int queue_index = -1;
>>>>
>>>>         rcu_read_lock();
>>>> -       dev_maps = rcu_dereference(dev->xps_maps);
>>>> +       dev_maps = rcu_dereference(dev->xps_maps[XPS_MAP_CPUS]);
>>>>         if (dev_maps) {
>>>>                 unsigned int tci = skb->sender_cpu - 1;
>>>>
>>>> @@ -3409,7 +3473,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>>>>                         tci += netdev_get_prio_tc_map(dev, skb->priority);
>>>>                 }
>>>>
>>>> -               map = rcu_dereference(dev_maps->cpu_map[tci]);
>>>> +               map = rcu_dereference(dev_maps->attr_map[tci]);
>>>>                 if (map) {
>>>>                         if (map->len == 1)
>>>>                                 queue_index = map->queues[0];
>>>> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
>>>> index c476f07..d7abd33 100644
>>>> --- a/net/core/net-sysfs.c
>>>> +++ b/net/core/net-sysfs.c
>>>> @@ -1227,13 +1227,13 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
>>>>         }
>>>>
>>>>         rcu_read_lock();
>>>> -       dev_maps = rcu_dereference(dev->xps_maps);
>>>> +       dev_maps = rcu_dereference(dev->xps_maps[XPS_MAP_CPUS]);
>>>>         if (dev_maps) {
>>>>                 for_each_possible_cpu(cpu) {
>>>>                         int i, tci = cpu * num_tc + tc;
>>>>                         struct xps_map *map;
>>>>
>>>> -                       map = rcu_dereference(dev_maps->cpu_map[tci]);
>>>> +                       map = rcu_dereference(dev_maps->attr_map[tci]);
>>>>                         if (!map)
>>>>                                 continue;
>>>>
>>>>

^ permalink raw reply

* Re: [PATCH v3] selftests: add headers_install to lib.mk
From: Paolo Bonzini @ 2018-05-14 17:16 UTC (permalink / raw)
  To: Anders Roxell, yamada.masahiro, michal.lkml, shuah, bamv2005,
	brgl, akpm, rppt, aarcange
  Cc: linux-kbuild, linux-kernel, linux-kselftest, netdev
In-Reply-To: <20180514115809.9803-1-anders.roxell@linaro.org>

On 14/05/2018 13:58, Anders Roxell wrote:
> If the kernel headers aren't installed we can't build all the tests.
> Add a new make target rule 'khdr' in the file lib.mk to generate the
> kernel headers and that gets include for every test-dir Makefile that
> includes lib.mk If the testdir in turn have its own sub-dirs the
> top_srcdir needs to be set to the linux-rootdir to be able to generate
> the kernel headers.
> 
> Signed-off-by: Anders Roxell <anders.roxell@linaro.org>
> Reviewed-by: Fathi Boudra <fathi.boudra@linaro.org>

Ack for KVM parts.

Paolo

^ permalink raw reply

* [PATCH bpf-next v6 4/4] bpf: bpftool, support for sockhash
From: John Fastabend @ 2018-05-14 17:00 UTC (permalink / raw)
  To: daniel, ast; +Cc: netdev, davem, John Fastabend
In-Reply-To: <1526317219-7752-1-git-send-email-john.fastabend@gmail.com>

This adds the SOCKHASH map type to bpftools so that we get correct
pretty printing.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 tools/bpf/bpftool/map.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index af6766e..097b1a5 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -66,6 +66,7 @@
 	[BPF_MAP_TYPE_DEVMAP]		= "devmap",
 	[BPF_MAP_TYPE_SOCKMAP]		= "sockmap",
 	[BPF_MAP_TYPE_CPUMAP]		= "cpumap",
+	[BPF_MAP_TYPE_SOCKHASH]		= "sockhash",
 };
 
 static bool map_is_per_cpu(__u32 type)
-- 
1.9.1

^ permalink raw reply related

* [PATCH bpf-next v6 3/4] bpf: selftest additions for SOCKHASH
From: John Fastabend @ 2018-05-14 17:00 UTC (permalink / raw)
  To: daniel, ast; +Cc: netdev, davem, John Fastabend
In-Reply-To: <1526317219-7752-1-git-send-email-john.fastabend@gmail.com>

This runs existing SOCKMAP tests with SOCKHASH map type. To do this
we push programs into include file and build two BPF programs. One
for SOCKHASH and one for SOCKMAP.

We then run the entire test suite with each type.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 tools/include/uapi/linux/bpf.h                   |  52 +++-
 tools/testing/selftests/bpf/Makefile             |   2 +-
 tools/testing/selftests/bpf/bpf_helpers.h        |   8 +
 tools/testing/selftests/bpf/test_sockhash_kern.c |   5 +
 tools/testing/selftests/bpf/test_sockmap.c       |  27 +-
 tools/testing/selftests/bpf/test_sockmap_kern.c  | 343 +--------------------
 tools/testing/selftests/bpf/test_sockmap_kern.h  | 363 +++++++++++++++++++++++
 7 files changed, 452 insertions(+), 348 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_sockhash_kern.c
 create mode 100644 tools/testing/selftests/bpf/test_sockmap_kern.h

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 02e4112..1205d86 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -118,6 +118,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_SOCKMAP,
 	BPF_MAP_TYPE_CPUMAP,
 	BPF_MAP_TYPE_XSKMAP,
+	BPF_MAP_TYPE_SOCKHASH,
 };
 
 enum bpf_prog_type {
@@ -1855,6 +1856,52 @@ struct bpf_stack_build_id {
  *             Egress device index on success, 0 if packet needs to continue
  *             up the stack for further processing or a negative error in case
  *             of failure.
+ * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
+ *	Description
+ *		Add an entry to, or update a sockhash *map* referencing sockets.
+ *		The *skops* is used as a new value for the entry associated to
+ *		*key*. *flags* is one of:
+ *
+ *		**BPF_NOEXIST**
+ *			The entry for *key* must not exist in the map.
+ *		**BPF_EXIST**
+ *			The entry for *key* must already exist in the map.
+ *		**BPF_ANY**
+ *			No condition on the existence of the entry for *key*.
+ *
+ *		If the *map* has eBPF programs (parser and verdict), those will
+ *		be inherited by the socket being added. If the socket is
+ *		already attached to eBPF programs, this results in an error.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags)
+ *	Description
+ *		This helper is used in programs implementing policies at the
+ *		socket level. If the message *msg* is allowed to pass (i.e. if
+ *		the verdict eBPF program returns **SK_PASS**), redirect it to
+ *		the socket referenced by *map* (of type
+ *		**BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
+ *		egress interfaces can be used for redirection. The
+ *		**BPF_F_INGRESS** value in *flags* is used to make the
+ *		distinction (ingress path is selected if the flag is present,
+ *		egress path otherwise). This is the only flag supported for now.
+ *	Return
+ *		**SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags)
+ *	Description
+ *		This helper is used in programs implementing policies at the
+ *		skb socket level. If the sk_buff *skb* is allowed to pass (i.e.
+ *		if the verdeict eBPF program returns **SK_PASS**), redirect it
+ *		to the socket referenced by *map* (of type
+ *		**BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
+ *		egress interfaces can be used for redirection. The
+ *		**BPF_F_INGRESS** value in *flags* is used to make the
+ *		distinction (ingress path is selected if the flag is present,
+ *		egress otherwise). This is the only flag supported for now.
+ *	Return
+ *		**SK_PASS** on success, or **SK_DROP** on error.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1926,7 +1973,10 @@ struct bpf_stack_build_id {
 	FN(skb_get_xfrm_state),		\
 	FN(get_stack),			\
 	FN(skb_load_bytes_relative),	\
-	FN(fib_lookup),
+	FN(fib_lookup),			\
+	FN(sock_hash_update),		\
+	FN(msg_redirect_hash),		\
+	FN(sk_redirect_hash),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 438d4f9..bc402c9 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -33,7 +33,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \
 	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \
 	test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \
-	test_get_stack_rawtp.o
+	test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 2375d06..8f143df 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -75,9 +75,14 @@ static int (*bpf_sock_ops_cb_flags_set)(void *ctx, int flags) =
 	(void *) BPF_FUNC_sock_ops_cb_flags_set;
 static int (*bpf_sk_redirect_map)(void *ctx, void *map, int key, int flags) =
 	(void *) BPF_FUNC_sk_redirect_map;
+static int (*bpf_sk_redirect_hash)(void *ctx, void *map, void *key, int flags) =
+	(void *) BPF_FUNC_sk_redirect_hash;
 static int (*bpf_sock_map_update)(void *map, void *key, void *value,
 				  unsigned long long flags) =
 	(void *) BPF_FUNC_sock_map_update;
+static int (*bpf_sock_hash_update)(void *map, void *key, void *value,
+				   unsigned long long flags) =
+	(void *) BPF_FUNC_sock_hash_update;
 static int (*bpf_perf_event_read_value)(void *map, unsigned long long flags,
 					void *buf, unsigned int buf_size) =
 	(void *) BPF_FUNC_perf_event_read_value;
@@ -88,6 +93,9 @@ static int (*bpf_override_return)(void *ctx, unsigned long rc) =
 	(void *) BPF_FUNC_override_return;
 static int (*bpf_msg_redirect_map)(void *ctx, void *map, int key, int flags) =
 	(void *) BPF_FUNC_msg_redirect_map;
+static int (*bpf_msg_redirect_hash)(void *ctx,
+				    void *map, void *key, int flags) =
+	(void *) BPF_FUNC_msg_redirect_hash;
 static int (*bpf_msg_apply_bytes)(void *ctx, int len) =
 	(void *) BPF_FUNC_msg_apply_bytes;
 static int (*bpf_msg_cork_bytes)(void *ctx, int len) =
diff --git a/tools/testing/selftests/bpf/test_sockhash_kern.c b/tools/testing/selftests/bpf/test_sockhash_kern.c
new file mode 100644
index 0000000..e675591
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_sockhash_kern.c
@@ -0,0 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
+#undef SOCKMAP
+#define TEST_MAP_TYPE BPF_MAP_TYPE_SOCKHASH
+#include "./test_sockmap_kern.h"
diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 29c022d..eb17fae 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -47,7 +47,8 @@
 #define S1_PORT 10000
 #define S2_PORT 10001
 
-#define BPF_FILENAME "test_sockmap_kern.o"
+#define BPF_SOCKMAP_FILENAME "test_sockmap_kern.o"
+#define BPF_SOCKHASH_FILENAME "test_sockhash_kern.o"
 #define CG_PATH "/sockmap"
 
 /* global sockets */
@@ -1260,9 +1261,8 @@ static int test_start_end(int cgrp)
 	BPF_PROG_TYPE_SK_MSG,
 };
 
-static int populate_progs(void)
+static int populate_progs(char *bpf_file)
 {
-	char *bpf_file = BPF_FILENAME;
 	struct bpf_program *prog;
 	struct bpf_object *obj;
 	int i = 0;
@@ -1306,11 +1306,11 @@ static int populate_progs(void)
 	return 0;
 }
 
-static int test_suite(void)
+static int __test_suite(char *bpf_file)
 {
 	int cg_fd, err;
 
-	err = populate_progs();
+	err = populate_progs(bpf_file);
 	if (err < 0) {
 		fprintf(stderr, "ERROR: (%i) load bpf failed\n", err);
 		return err;
@@ -1347,17 +1347,30 @@ static int test_suite(void)
 
 out:
 	printf("Summary: %i PASSED %i FAILED\n", passed, failed);
+	cleanup_cgroup_environment();
 	close(cg_fd);
 	return err;
 }
 
+static int test_suite(void)
+{
+	int err;
+
+	err = __test_suite(BPF_SOCKMAP_FILENAME);
+	if (err)
+		goto out;
+	err = __test_suite(BPF_SOCKHASH_FILENAME);
+out:
+	return err;
+}
+
 int main(int argc, char **argv)
 {
 	struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
 	int iov_count = 1, length = 1024, rate = 1;
 	struct sockmap_options options = {0};
 	int opt, longindex, err, cg_fd = 0;
-	char *bpf_file = BPF_FILENAME;
+	char *bpf_file = BPF_SOCKMAP_FILENAME;
 	int test = PING_PONG;
 
 	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
@@ -1438,7 +1451,7 @@ int main(int argc, char **argv)
 		return -1;
 	}
 
-	err = populate_progs();
+	err = populate_progs(bpf_file);
 	if (err) {
 		fprintf(stderr, "populate program: (%s) %s\n",
 			bpf_file, strerror(errno));
diff --git a/tools/testing/selftests/bpf/test_sockmap_kern.c b/tools/testing/selftests/bpf/test_sockmap_kern.c
index 33de97e..677b2ed 100644
--- a/tools/testing/selftests/bpf/test_sockmap_kern.c
+++ b/tools/testing/selftests/bpf/test_sockmap_kern.c
@@ -1,340 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-// Copyright (c) 2017-2018 Covalent IO, Inc. http://covalent.io
-#include <stddef.h>
-#include <string.h>
-#include <linux/bpf.h>
-#include <linux/if_ether.h>
-#include <linux/if_packet.h>
-#include <linux/ip.h>
-#include <linux/ipv6.h>
-#include <linux/in.h>
-#include <linux/udp.h>
-#include <linux/tcp.h>
-#include <linux/pkt_cls.h>
-#include <sys/socket.h>
-#include "bpf_helpers.h"
-#include "bpf_endian.h"
-
-/* Sockmap sample program connects a client and a backend together
- * using cgroups.
- *
- *    client:X <---> frontend:80 client:X <---> backend:80
- *
- * For simplicity we hard code values here and bind 1:1. The hard
- * coded values are part of the setup in sockmap.sh script that
- * is associated with this BPF program.
- *
- * The bpf_printk is verbose and prints information as connections
- * are established and verdicts are decided.
- */
-
-#define bpf_printk(fmt, ...)					\
-({								\
-	       char ____fmt[] = fmt;				\
-	       bpf_trace_printk(____fmt, sizeof(____fmt),	\
-				##__VA_ARGS__);			\
-})
-
-struct bpf_map_def SEC("maps") sock_map = {
-	.type = BPF_MAP_TYPE_SOCKMAP,
-	.key_size = sizeof(int),
-	.value_size = sizeof(int),
-	.max_entries = 20,
-};
-
-struct bpf_map_def SEC("maps") sock_map_txmsg = {
-	.type = BPF_MAP_TYPE_SOCKMAP,
-	.key_size = sizeof(int),
-	.value_size = sizeof(int),
-	.max_entries = 20,
-};
-
-struct bpf_map_def SEC("maps") sock_map_redir = {
-	.type = BPF_MAP_TYPE_SOCKMAP,
-	.key_size = sizeof(int),
-	.value_size = sizeof(int),
-	.max_entries = 20,
-};
-
-struct bpf_map_def SEC("maps") sock_apply_bytes = {
-	.type = BPF_MAP_TYPE_ARRAY,
-	.key_size = sizeof(int),
-	.value_size = sizeof(int),
-	.max_entries = 1
-};
-
-struct bpf_map_def SEC("maps") sock_cork_bytes = {
-	.type = BPF_MAP_TYPE_ARRAY,
-	.key_size = sizeof(int),
-	.value_size = sizeof(int),
-	.max_entries = 1
-};
-
-struct bpf_map_def SEC("maps") sock_pull_bytes = {
-	.type = BPF_MAP_TYPE_ARRAY,
-	.key_size = sizeof(int),
-	.value_size = sizeof(int),
-	.max_entries = 2
-};
-
-struct bpf_map_def SEC("maps") sock_redir_flags = {
-	.type = BPF_MAP_TYPE_ARRAY,
-	.key_size = sizeof(int),
-	.value_size = sizeof(int),
-	.max_entries = 1
-};
-
-struct bpf_map_def SEC("maps") sock_skb_opts = {
-	.type = BPF_MAP_TYPE_ARRAY,
-	.key_size = sizeof(int),
-	.value_size = sizeof(int),
-	.max_entries = 1
-};
-
-SEC("sk_skb1")
-int bpf_prog1(struct __sk_buff *skb)
-{
-	return skb->len;
-}
-
-SEC("sk_skb2")
-int bpf_prog2(struct __sk_buff *skb)
-{
-	__u32 lport = skb->local_port;
-	__u32 rport = skb->remote_port;
-	int len, *f, ret, zero = 0;
-	__u64 flags = 0;
-
-	if (lport == 10000)
-		ret = 10;
-	else
-		ret = 1;
-
-	len = (__u32)skb->data_end - (__u32)skb->data;
-	f = bpf_map_lookup_elem(&sock_skb_opts, &zero);
-	if (f && *f) {
-		ret = 3;
-		flags = *f;
-	}
-
-	bpf_printk("sk_skb2: redirect(%iB) flags=%i\n",
-		   len, flags);
-	return bpf_sk_redirect_map(skb, &sock_map, ret, flags);
-}
-
-SEC("sockops")
-int bpf_sockmap(struct bpf_sock_ops *skops)
-{
-	__u32 lport, rport;
-	int op, err = 0, index, key, ret;
-
-
-	op = (int) skops->op;
-
-	switch (op) {
-	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
-		lport = skops->local_port;
-		rport = skops->remote_port;
-
-		if (lport == 10000) {
-			ret = 1;
-			err = bpf_sock_map_update(skops, &sock_map, &ret,
-						  BPF_NOEXIST);
-			bpf_printk("passive(%i -> %i) map ctx update err: %d\n",
-				   lport, bpf_ntohl(rport), err);
-		}
-		break;
-	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
-		lport = skops->local_port;
-		rport = skops->remote_port;
-
-		if (bpf_ntohl(rport) == 10001) {
-			ret = 10;
-			err = bpf_sock_map_update(skops, &sock_map, &ret,
-						  BPF_NOEXIST);
-			bpf_printk("active(%i -> %i) map ctx update err: %d\n",
-				   lport, bpf_ntohl(rport), err);
-		}
-		break;
-	default:
-		break;
-	}
-
-	return 0;
-}
-
-SEC("sk_msg1")
-int bpf_prog4(struct sk_msg_md *msg)
-{
-	int *bytes, zero = 0, one = 1;
-	int *start, *end;
-
-	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
-	if (bytes)
-		bpf_msg_apply_bytes(msg, *bytes);
-	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
-	if (bytes)
-		bpf_msg_cork_bytes(msg, *bytes);
-	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
-	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
-	if (start && end)
-		bpf_msg_pull_data(msg, *start, *end, 0);
-	return SK_PASS;
-}
-
-SEC("sk_msg2")
-int bpf_prog5(struct sk_msg_md *msg)
-{
-	int err1 = -1, err2 = -1, zero = 0, one = 1;
-	int *bytes, *start, *end, len1, len2;
-
-	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
-	if (bytes)
-		err1 = bpf_msg_apply_bytes(msg, *bytes);
-	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
-	if (bytes)
-		err2 = bpf_msg_cork_bytes(msg, *bytes);
-	len1 = (__u64)msg->data_end - (__u64)msg->data;
-	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
-	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
-	if (start && end) {
-		int err;
-
-		bpf_printk("sk_msg2: pull(%i:%i)\n",
-			   start ? *start : 0, end ? *end : 0);
-		err = bpf_msg_pull_data(msg, *start, *end, 0);
-		if (err)
-			bpf_printk("sk_msg2: pull_data err %i\n",
-				   err);
-		len2 = (__u64)msg->data_end - (__u64)msg->data;
-		bpf_printk("sk_msg2: length update %i->%i\n",
-			   len1, len2);
-	}
-	bpf_printk("sk_msg2: data length %i err1 %i err2 %i\n",
-		   len1, err1, err2);
-	return SK_PASS;
-}
-
-SEC("sk_msg3")
-int bpf_prog6(struct sk_msg_md *msg)
-{
-	int *bytes, zero = 0, one = 1, key = 0;
-	int *start, *end, *f;
-	__u64 flags = 0;
-
-	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
-	if (bytes)
-		bpf_msg_apply_bytes(msg, *bytes);
-	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
-	if (bytes)
-		bpf_msg_cork_bytes(msg, *bytes);
-	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
-	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
-	if (start && end)
-		bpf_msg_pull_data(msg, *start, *end, 0);
-	f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
-	if (f && *f) {
-		key = 2;
-		flags = *f;
-	}
-	return bpf_msg_redirect_map(msg, &sock_map_redir, key, flags);
-}
-
-SEC("sk_msg4")
-int bpf_prog7(struct sk_msg_md *msg)
-{
-	int err1 = 0, err2 = 0, zero = 0, one = 1, key = 0;
-	int *f, *bytes, *start, *end, len1, len2;
-	__u64 flags = 0;
-
-		int err;
-	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
-	if (bytes)
-		err1 = bpf_msg_apply_bytes(msg, *bytes);
-	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
-	if (bytes)
-		err2 = bpf_msg_cork_bytes(msg, *bytes);
-	len1 = (__u64)msg->data_end - (__u64)msg->data;
-	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
-	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
-	if (start && end) {
-
-		bpf_printk("sk_msg2: pull(%i:%i)\n",
-			   start ? *start : 0, end ? *end : 0);
-		err = bpf_msg_pull_data(msg, *start, *end, 0);
-		if (err)
-			bpf_printk("sk_msg2: pull_data err %i\n",
-				   err);
-		len2 = (__u64)msg->data_end - (__u64)msg->data;
-		bpf_printk("sk_msg2: length update %i->%i\n",
-			   len1, len2);
-	}
-	f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
-	if (f && *f) {
-		key = 2;
-		flags = *f;
-	}
-	bpf_printk("sk_msg3: redirect(%iB) flags=%i err=%i\n",
-		   len1, flags, err1 ? err1 : err2);
-	err = bpf_msg_redirect_map(msg, &sock_map_redir, key, flags);
-	bpf_printk("sk_msg3: err %i\n", err);
-	return err;
-}
-
-SEC("sk_msg5")
-int bpf_prog8(struct sk_msg_md *msg)
-{
-	void *data_end = (void *)(long) msg->data_end;
-	void *data = (void *)(long) msg->data;
-	int ret = 0, *bytes, zero = 0;
-
-	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
-	if (bytes) {
-		ret = bpf_msg_apply_bytes(msg, *bytes);
-		if (ret)
-			return SK_DROP;
-	} else {
-		return SK_DROP;
-	}
-	return SK_PASS;
-}
-SEC("sk_msg6")
-int bpf_prog9(struct sk_msg_md *msg)
-{
-	void *data_end = (void *)(long) msg->data_end;
-	void *data = (void *)(long) msg->data;
-	int ret = 0, *bytes, zero = 0;
-
-	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
-	if (bytes) {
-		if (((__u64)data_end - (__u64)data) >= *bytes)
-			return SK_PASS;
-		ret = bpf_msg_cork_bytes(msg, *bytes);
-		if (ret)
-			return SK_DROP;
-	}
-	return SK_PASS;
-}
-
-SEC("sk_msg7")
-int bpf_prog10(struct sk_msg_md *msg)
-{
-	int *bytes, zero = 0, one = 1;
-	int *start, *end;
-
-	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
-	if (bytes)
-		bpf_msg_apply_bytes(msg, *bytes);
-	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
-	if (bytes)
-		bpf_msg_cork_bytes(msg, *bytes);
-	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
-	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
-	if (start && end)
-		bpf_msg_pull_data(msg, *start, *end, 0);
-
-	return SK_DROP;
-}
-
-int _version SEC("version") = 1;
-char _license[] SEC("license") = "GPL";
+// Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
+#define SOCKMAP
+#define TEST_MAP_TYPE BPF_MAP_TYPE_SOCKMAP
+#include "./test_sockmap_kern.h"
diff --git a/tools/testing/selftests/bpf/test_sockmap_kern.h b/tools/testing/selftests/bpf/test_sockmap_kern.h
new file mode 100644
index 0000000..8e8e417
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_sockmap_kern.h
@@ -0,0 +1,363 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2017-2018 Covalent IO, Inc. http://covalent.io */
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/pkt_cls.h>
+#include <sys/socket.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+/* Sockmap sample program connects a client and a backend together
+ * using cgroups.
+ *
+ *    client:X <---> frontend:80 client:X <---> backend:80
+ *
+ * For simplicity we hard code values here and bind 1:1. The hard
+ * coded values are part of the setup in sockmap.sh script that
+ * is associated with this BPF program.
+ *
+ * The bpf_printk is verbose and prints information as connections
+ * are established and verdicts are decided.
+ */
+
+#define bpf_printk(fmt, ...)					\
+({								\
+	       char ____fmt[] = fmt;				\
+	       bpf_trace_printk(____fmt, sizeof(____fmt),	\
+				##__VA_ARGS__);			\
+})
+
+struct bpf_map_def SEC("maps") sock_map = {
+	.type = TEST_MAP_TYPE,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 20,
+};
+
+struct bpf_map_def SEC("maps") sock_map_txmsg = {
+	.type = TEST_MAP_TYPE,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 20,
+};
+
+struct bpf_map_def SEC("maps") sock_map_redir = {
+	.type = TEST_MAP_TYPE,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 20,
+};
+
+struct bpf_map_def SEC("maps") sock_apply_bytes = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 1
+};
+
+struct bpf_map_def SEC("maps") sock_cork_bytes = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 1
+};
+
+struct bpf_map_def SEC("maps") sock_pull_bytes = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 2
+};
+
+struct bpf_map_def SEC("maps") sock_redir_flags = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 1
+};
+
+struct bpf_map_def SEC("maps") sock_skb_opts = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 1
+};
+
+SEC("sk_skb1")
+int bpf_prog1(struct __sk_buff *skb)
+{
+	return skb->len;
+}
+
+SEC("sk_skb2")
+int bpf_prog2(struct __sk_buff *skb)
+{
+	__u32 lport = skb->local_port;
+	__u32 rport = skb->remote_port;
+	int len, *f, ret, zero = 0;
+	__u64 flags = 0;
+
+	if (lport == 10000)
+		ret = 10;
+	else
+		ret = 1;
+
+	len = (__u32)skb->data_end - (__u32)skb->data;
+	f = bpf_map_lookup_elem(&sock_skb_opts, &zero);
+	if (f && *f) {
+		ret = 3;
+		flags = *f;
+	}
+
+	bpf_printk("sk_skb2: redirect(%iB) flags=%i\n",
+		   len, flags);
+#ifdef SOCKMAP
+	return bpf_sk_redirect_map(skb, &sock_map, ret, flags);
+#else
+	return bpf_sk_redirect_hash(skb, &sock_map, &ret, flags);
+#endif
+
+}
+
+SEC("sockops")
+int bpf_sockmap(struct bpf_sock_ops *skops)
+{
+	__u32 lport, rport;
+	int op, err = 0, index, key, ret;
+
+
+	op = (int) skops->op;
+
+	switch (op) {
+	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+		lport = skops->local_port;
+		rport = skops->remote_port;
+
+		if (lport == 10000) {
+			ret = 1;
+#ifdef SOCKMAP
+			err = bpf_sock_map_update(skops, &sock_map, &ret,
+						  BPF_NOEXIST);
+#else
+			err = bpf_sock_hash_update(skops, &sock_map, &ret,
+						   BPF_NOEXIST);
+#endif
+			bpf_printk("passive(%i -> %i) map ctx update err: %d\n",
+				   lport, bpf_ntohl(rport), err);
+		}
+		break;
+	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+		lport = skops->local_port;
+		rport = skops->remote_port;
+
+		if (bpf_ntohl(rport) == 10001) {
+			ret = 10;
+#ifdef SOCKMAP
+			err = bpf_sock_map_update(skops, &sock_map, &ret,
+						  BPF_NOEXIST);
+#else
+			err = bpf_sock_hash_update(skops, &sock_map, &ret,
+						   BPF_NOEXIST);
+#endif
+			bpf_printk("active(%i -> %i) map ctx update err: %d\n",
+				   lport, bpf_ntohl(rport), err);
+		}
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+SEC("sk_msg1")
+int bpf_prog4(struct sk_msg_md *msg)
+{
+	int *bytes, zero = 0, one = 1;
+	int *start, *end;
+
+	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+	if (bytes)
+		bpf_msg_apply_bytes(msg, *bytes);
+	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+	if (bytes)
+		bpf_msg_cork_bytes(msg, *bytes);
+	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
+	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
+	if (start && end)
+		bpf_msg_pull_data(msg, *start, *end, 0);
+	return SK_PASS;
+}
+
+SEC("sk_msg2")
+int bpf_prog5(struct sk_msg_md *msg)
+{
+	int err1 = -1, err2 = -1, zero = 0, one = 1;
+	int *bytes, *start, *end, len1, len2;
+
+	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+	if (bytes)
+		err1 = bpf_msg_apply_bytes(msg, *bytes);
+	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+	if (bytes)
+		err2 = bpf_msg_cork_bytes(msg, *bytes);
+	len1 = (__u64)msg->data_end - (__u64)msg->data;
+	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
+	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
+	if (start && end) {
+		int err;
+
+		bpf_printk("sk_msg2: pull(%i:%i)\n",
+			   start ? *start : 0, end ? *end : 0);
+		err = bpf_msg_pull_data(msg, *start, *end, 0);
+		if (err)
+			bpf_printk("sk_msg2: pull_data err %i\n",
+				   err);
+		len2 = (__u64)msg->data_end - (__u64)msg->data;
+		bpf_printk("sk_msg2: length update %i->%i\n",
+			   len1, len2);
+	}
+	bpf_printk("sk_msg2: data length %i err1 %i err2 %i\n",
+		   len1, err1, err2);
+	return SK_PASS;
+}
+
+SEC("sk_msg3")
+int bpf_prog6(struct sk_msg_md *msg)
+{
+	int *bytes, zero = 0, one = 1, key = 0;
+	int *start, *end, *f;
+	__u64 flags = 0;
+
+	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+	if (bytes)
+		bpf_msg_apply_bytes(msg, *bytes);
+	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+	if (bytes)
+		bpf_msg_cork_bytes(msg, *bytes);
+	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
+	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
+	if (start && end)
+		bpf_msg_pull_data(msg, *start, *end, 0);
+	f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
+	if (f && *f) {
+		key = 2;
+		flags = *f;
+	}
+#ifdef SOCKMAP
+	return bpf_msg_redirect_map(msg, &sock_map_redir, key, flags);
+#else
+	return bpf_msg_redirect_hash(msg, &sock_map_redir, &key, flags);
+#endif
+}
+
+SEC("sk_msg4")
+int bpf_prog7(struct sk_msg_md *msg)
+{
+	int err1 = 0, err2 = 0, zero = 0, one = 1, key = 0;
+	int *f, *bytes, *start, *end, len1, len2;
+	__u64 flags = 0;
+
+		int err;
+	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+	if (bytes)
+		err1 = bpf_msg_apply_bytes(msg, *bytes);
+	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+	if (bytes)
+		err2 = bpf_msg_cork_bytes(msg, *bytes);
+	len1 = (__u64)msg->data_end - (__u64)msg->data;
+	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
+	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
+	if (start && end) {
+
+		bpf_printk("sk_msg2: pull(%i:%i)\n",
+			   start ? *start : 0, end ? *end : 0);
+		err = bpf_msg_pull_data(msg, *start, *end, 0);
+		if (err)
+			bpf_printk("sk_msg2: pull_data err %i\n",
+				   err);
+		len2 = (__u64)msg->data_end - (__u64)msg->data;
+		bpf_printk("sk_msg2: length update %i->%i\n",
+			   len1, len2);
+	}
+	f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
+	if (f && *f) {
+		key = 2;
+		flags = *f;
+	}
+	bpf_printk("sk_msg3: redirect(%iB) flags=%i err=%i\n",
+		   len1, flags, err1 ? err1 : err2);
+#ifdef SOCKMAP
+	err = bpf_msg_redirect_map(msg, &sock_map_redir, key, flags);
+#else
+	err = bpf_msg_redirect_hash(msg, &sock_map_redir, &key, flags);
+#endif
+	bpf_printk("sk_msg3: err %i\n", err);
+	return err;
+}
+
+SEC("sk_msg5")
+int bpf_prog8(struct sk_msg_md *msg)
+{
+	void *data_end = (void *)(long) msg->data_end;
+	void *data = (void *)(long) msg->data;
+	int ret = 0, *bytes, zero = 0;
+
+	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+	if (bytes) {
+		ret = bpf_msg_apply_bytes(msg, *bytes);
+		if (ret)
+			return SK_DROP;
+	} else {
+		return SK_DROP;
+	}
+	return SK_PASS;
+}
+SEC("sk_msg6")
+int bpf_prog9(struct sk_msg_md *msg)
+{
+	void *data_end = (void *)(long) msg->data_end;
+	void *data = (void *)(long) msg->data;
+	int ret = 0, *bytes, zero = 0;
+
+	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+	if (bytes) {
+		if (((__u64)data_end - (__u64)data) >= *bytes)
+			return SK_PASS;
+		ret = bpf_msg_cork_bytes(msg, *bytes);
+		if (ret)
+			return SK_DROP;
+	}
+	return SK_PASS;
+}
+
+SEC("sk_msg7")
+int bpf_prog10(struct sk_msg_md *msg)
+{
+	int *bytes, zero = 0, one = 1;
+	int *start, *end;
+
+	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+	if (bytes)
+		bpf_msg_apply_bytes(msg, *bytes);
+	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+	if (bytes)
+		bpf_msg_cork_bytes(msg, *bytes);
+	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
+	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
+	if (start && end)
+		bpf_msg_pull_data(msg, *start, *end, 0);
+
+	return SK_DROP;
+}
+
+int _version SEC("version") = 1;
+char _license[] SEC("license") = "GPL";
-- 
1.9.1

^ permalink raw reply related

* [PATCH bpf-next v6 2/4] bpf: sockmap, add hash map support
From: John Fastabend @ 2018-05-14 17:00 UTC (permalink / raw)
  To: daniel, ast; +Cc: netdev, davem, John Fastabend
In-Reply-To: <1526317219-7752-1-git-send-email-john.fastabend@gmail.com>

Sockmap is currently backed by an array and enforces keys to be
four bytes. This works well for many use cases and was originally
modeled after devmap which also uses four bytes keys. However,
this has become limiting in larger use cases where a hash would
be more appropriate. For example users may want to use the 5-tuple
of the socket as the lookup key.

To support this add hash support.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h       |   8 +
 include/linux/bpf_types.h |   1 +
 include/uapi/linux/bpf.h  |  52 ++++-
 kernel/bpf/core.c         |   1 +
 kernel/bpf/sockmap.c      | 494 ++++++++++++++++++++++++++++++++++++++++++++--
 kernel/bpf/verifier.c     |  14 +-
 net/core/filter.c         |  58 ++++++
 7 files changed, 610 insertions(+), 18 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a38e474..ed0122b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -668,6 +668,7 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map)
 
 #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET)
 struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key);
+struct sock  *__sock_hash_lookup_elem(struct bpf_map *map, void *key);
 int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type);
 #else
 static inline struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
@@ -675,6 +676,12 @@ static inline struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
 	return NULL;
 }
 
+static inline struct sock  *__sock_hash_lookup_elem(struct bpf_map *map,
+						    void *key)
+{
+	return NULL;
+}
+
 static inline int sock_map_prog(struct bpf_map *map,
 				struct bpf_prog *prog,
 				u32 type)
@@ -724,6 +731,7 @@ static inline void __xsk_map_flush(struct bpf_map *map)
 extern const struct bpf_func_proto bpf_get_stackid_proto;
 extern const struct bpf_func_proto bpf_get_stack_proto;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
+extern const struct bpf_func_proto bpf_sock_hash_update_proto;
 
 /* Shared helpers among cBPF and eBPF. */
 void bpf_user_rnd_init_once(void);
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index d7df1b32..b67f879 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -47,6 +47,7 @@
 BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
 #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_INET)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
 #if defined(CONFIG_XDP_SOCKETS)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 02e4112..1205d86 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -118,6 +118,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_SOCKMAP,
 	BPF_MAP_TYPE_CPUMAP,
 	BPF_MAP_TYPE_XSKMAP,
+	BPF_MAP_TYPE_SOCKHASH,
 };
 
 enum bpf_prog_type {
@@ -1855,6 +1856,52 @@ struct bpf_stack_build_id {
  *             Egress device index on success, 0 if packet needs to continue
  *             up the stack for further processing or a negative error in case
  *             of failure.
+ * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
+ *	Description
+ *		Add an entry to, or update a sockhash *map* referencing sockets.
+ *		The *skops* is used as a new value for the entry associated to
+ *		*key*. *flags* is one of:
+ *
+ *		**BPF_NOEXIST**
+ *			The entry for *key* must not exist in the map.
+ *		**BPF_EXIST**
+ *			The entry for *key* must already exist in the map.
+ *		**BPF_ANY**
+ *			No condition on the existence of the entry for *key*.
+ *
+ *		If the *map* has eBPF programs (parser and verdict), those will
+ *		be inherited by the socket being added. If the socket is
+ *		already attached to eBPF programs, this results in an error.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags)
+ *	Description
+ *		This helper is used in programs implementing policies at the
+ *		socket level. If the message *msg* is allowed to pass (i.e. if
+ *		the verdict eBPF program returns **SK_PASS**), redirect it to
+ *		the socket referenced by *map* (of type
+ *		**BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
+ *		egress interfaces can be used for redirection. The
+ *		**BPF_F_INGRESS** value in *flags* is used to make the
+ *		distinction (ingress path is selected if the flag is present,
+ *		egress path otherwise). This is the only flag supported for now.
+ *	Return
+ *		**SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags)
+ *	Description
+ *		This helper is used in programs implementing policies at the
+ *		skb socket level. If the sk_buff *skb* is allowed to pass (i.e.
+ *		if the verdeict eBPF program returns **SK_PASS**), redirect it
+ *		to the socket referenced by *map* (of type
+ *		**BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
+ *		egress interfaces can be used for redirection. The
+ *		**BPF_F_INGRESS** value in *flags* is used to make the
+ *		distinction (ingress path is selected if the flag is present,
+ *		egress otherwise). This is the only flag supported for now.
+ *	Return
+ *		**SK_PASS** on success, or **SK_DROP** on error.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1926,7 +1973,10 @@ struct bpf_stack_build_id {
 	FN(skb_get_xfrm_state),		\
 	FN(get_stack),			\
 	FN(skb_load_bytes_relative),	\
-	FN(fib_lookup),
+	FN(fib_lookup),			\
+	FN(sock_hash_update),		\
+	FN(msg_redirect_hash),		\
+	FN(sk_redirect_hash),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d0d7d94..2194c6a 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1707,6 +1707,7 @@ void bpf_user_rnd_init_once(void)
 const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
 const struct bpf_func_proto bpf_get_current_comm_proto __weak;
 const struct bpf_func_proto bpf_sock_map_update_proto __weak;
+const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
 
 const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
 {
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index beab9ec..b846329 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -60,6 +60,28 @@ struct bpf_stab {
 	struct bpf_sock_progs progs;
 };
 
+struct bucket {
+	struct hlist_head head;
+	raw_spinlock_t lock;
+};
+
+struct bpf_htab {
+	struct bpf_map map;
+	struct bucket *buckets;
+	atomic_t count;
+	u32 n_buckets;
+	u32 elem_size;
+	struct bpf_sock_progs progs;
+};
+
+struct htab_elem {
+	struct rcu_head rcu;
+	struct hlist_node hash_node;
+	u32 hash;
+	struct sock *sk;
+	char key[0];
+};
+
 enum smap_psock_state {
 	SMAP_TX_RUNNING,
 };
@@ -67,6 +89,8 @@ enum smap_psock_state {
 struct smap_psock_map_entry {
 	struct list_head list;
 	struct sock **entry;
+	struct htab_elem *hash_link;
+	struct bpf_htab *htab;
 };
 
 struct smap_psock {
@@ -195,6 +219,12 @@ static void bpf_tcp_release(struct sock *sk)
 	rcu_read_unlock();
 }
 
+static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
+{
+	atomic_dec(&htab->count);
+	kfree_rcu(l, rcu);
+}
+
 static void bpf_tcp_close(struct sock *sk, long timeout)
 {
 	void (*close_fun)(struct sock *sk, long timeout);
@@ -231,10 +261,16 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
 	}
 
 	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-		osk = cmpxchg(e->entry, sk, NULL);
-		if (osk == sk) {
-			list_del(&e->list);
-			smap_release_sock(psock, sk);
+		if (e->entry) {
+			osk = cmpxchg(e->entry, sk, NULL);
+			if (osk == sk) {
+				list_del(&e->list);
+				smap_release_sock(psock, sk);
+			}
+		} else {
+			hlist_del_rcu(&e->hash_link->hash_node);
+			smap_release_sock(psock, e->hash_link->sk);
+			free_htab_elem(e->htab, e->hash_link);
 		}
 	}
 	write_unlock_bh(&sk->sk_callback_lock);
@@ -1527,12 +1563,14 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 	return ERR_PTR(err);
 }
 
-static void smap_list_remove(struct smap_psock *psock, struct sock **entry)
+static void smap_list_remove(struct smap_psock *psock,
+			     struct sock **entry,
+			     struct htab_elem *hash_link)
 {
 	struct smap_psock_map_entry *e, *tmp;
 
 	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-		if (e->entry == entry) {
+		if (e->entry == entry || e->hash_link == hash_link) {
 			list_del(&e->list);
 			break;
 		}
@@ -1570,7 +1608,7 @@ static void sock_map_free(struct bpf_map *map)
 		 * to be null and queued for garbage collection.
 		 */
 		if (likely(psock)) {
-			smap_list_remove(psock, &stab->sock_map[i]);
+			smap_list_remove(psock, &stab->sock_map[i], NULL);
 			smap_release_sock(psock, sock);
 		}
 		write_unlock_bh(&sock->sk_callback_lock);
@@ -1629,7 +1667,7 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
 
 	if (psock->bpf_parse)
 		smap_stop_sock(psock, sock);
-	smap_list_remove(psock, &stab->sock_map[k]);
+	smap_list_remove(psock, &stab->sock_map[k], NULL);
 	smap_release_sock(psock, sock);
 out:
 	write_unlock_bh(&sock->sk_callback_lock);
@@ -1746,10 +1784,12 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
 		new = true;
 	}
 
-	e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
-	if (!e) {
-		err = -ENOMEM;
-		goto out_progs;
+	if (map_link) {
+		e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
+		if (!e) {
+			err = -ENOMEM;
+			goto out_progs;
+		}
 	}
 
 	/* 3. At this point we have a reference to a valid psock that is
@@ -1783,6 +1823,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
 	write_unlock_bh(&sock->sk_callback_lock);
 	return err;
 out_free:
+	kfree(e);
 	smap_release_sock(psock, sock);
 out_progs:
 	if (verdict)
@@ -1829,7 +1870,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 		struct smap_psock *opsock = smap_psock_sk(osock);
 
 		write_lock_bh(&osock->sk_callback_lock);
-		smap_list_remove(opsock, &stab->sock_map[i]);
+		smap_list_remove(opsock, &stab->sock_map[i], NULL);
 		smap_release_sock(opsock, osock);
 		write_unlock_bh(&osock->sk_callback_lock);
 	}
@@ -1846,6 +1887,10 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
 		struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
 
 		progs = &stab->progs;
+	} else if (map->map_type == BPF_MAP_TYPE_SOCKHASH) {
+		struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+		progs = &htab->progs;
 	} else {
 		return -EINVAL;
 	}
@@ -1906,11 +1951,19 @@ static int sock_map_update_elem(struct bpf_map *map,
 
 static void sock_map_release(struct bpf_map *map)
 {
-	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
 	struct bpf_sock_progs *progs;
 	struct bpf_prog *orig;
 
-	progs = &stab->progs;
+	if (map->map_type == BPF_MAP_TYPE_SOCKMAP) {
+		struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+		progs = &stab->progs;
+	} else {
+		struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+		progs = &htab->progs;
+	}
+
 	orig = xchg(&progs->bpf_parse, NULL);
 	if (orig)
 		bpf_prog_put(orig);
@@ -1923,6 +1976,390 @@ static void sock_map_release(struct bpf_map *map)
 		bpf_prog_put(orig);
 }
 
+static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
+{
+	struct bpf_htab *htab;
+	int i, err;
+	u64 cost;
+
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->value_size != 4 ||
+	    attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
+		return ERR_PTR(-EINVAL);
+
+	err = bpf_tcp_ulp_register();
+	if (err && err != -EEXIST)
+		return ERR_PTR(err);
+
+	htab = kzalloc(sizeof(*htab), GFP_USER);
+	if (!htab)
+		return ERR_PTR(-ENOMEM);
+
+	bpf_map_init_from_attr(&htab->map, attr);
+
+	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
+	htab->elem_size = sizeof(struct htab_elem) +
+			  round_up(htab->map.key_size, 8);
+
+	if (htab->n_buckets == 0 ||
+	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
+		goto free_htab;
+
+	cost = (u64) htab->n_buckets * sizeof(struct bucket) +
+	       (u64) htab->elem_size * htab->map.max_entries;
+
+	if (cost >= U32_MAX - PAGE_SIZE)
+		goto free_htab;
+
+	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+	err = bpf_map_precharge_memlock(htab->map.pages);
+	if (err)
+		goto free_htab;
+
+	err = -ENOMEM;
+	htab->buckets = bpf_map_area_alloc(
+				htab->n_buckets * sizeof(struct bucket),
+				htab->map.numa_node);
+	if (!htab->buckets)
+		goto free_htab;
+
+	for (i = 0; i < htab->n_buckets; i++) {
+		INIT_HLIST_HEAD(&htab->buckets[i].head);
+		raw_spin_lock_init(&htab->buckets[i].lock);
+	}
+
+	return &htab->map;
+free_htab:
+	kfree(htab);
+	return ERR_PTR(err);
+}
+
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
+{
+	return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+	return &__select_bucket(htab, hash)->head;
+}
+
+static void sock_hash_free(struct bpf_map *map)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	int i;
+
+	synchronize_rcu();
+
+	/* At this point no update, lookup or delete operations can happen.
+	 * However, be aware we can still get a socket state event updates,
+	 * and data ready callabacks that reference the psock from sk_user_data
+	 * Also psock worker threads are still in-flight. So smap_release_sock
+	 * will only free the psock after cancel_sync on the worker threads
+	 * and a grace period expire to ensure psock is really safe to remove.
+	 */
+	rcu_read_lock();
+	for (i = 0; i < htab->n_buckets; i++) {
+		struct hlist_head *head = select_bucket(htab, i);
+		struct hlist_node *n;
+		struct htab_elem *l;
+
+		hlist_for_each_entry_safe(l, n, head, hash_node) {
+			struct sock *sock = l->sk;
+			struct smap_psock *psock;
+
+			hlist_del_rcu(&l->hash_node);
+			write_lock_bh(&sock->sk_callback_lock);
+			psock = smap_psock_sk(sock);
+			/* This check handles a racing sock event that can get
+			 * the sk_callback_lock before this case but after xchg
+			 * causing the refcnt to hit zero and sock user data
+			 * (psock) to be null and queued for garbage collection.
+			 */
+			if (likely(psock)) {
+				smap_list_remove(psock, NULL, l);
+				smap_release_sock(psock, sock);
+			}
+			write_unlock_bh(&sock->sk_callback_lock);
+			kfree(l);
+		}
+	}
+	rcu_read_unlock();
+	bpf_map_area_free(htab->buckets);
+	kfree(htab);
+}
+
+static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
+					      void *key, u32 key_size, u32 hash,
+					      struct sock *sk,
+					      struct htab_elem *old_elem)
+{
+	struct htab_elem *l_new;
+
+	if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
+		if (!old_elem) {
+			atomic_dec(&htab->count);
+			return ERR_PTR(-E2BIG);
+		}
+	}
+	l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
+			     htab->map.numa_node);
+	if (!l_new)
+		return ERR_PTR(-ENOMEM);
+
+	memcpy(l_new->key, key, key_size);
+	l_new->sk = sk;
+	l_new->hash = hash;
+	return l_new;
+}
+
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
+					 u32 hash, void *key, u32 key_size)
+{
+	struct htab_elem *l;
+
+	hlist_for_each_entry_rcu(l, head, hash_node) {
+		if (l->hash == hash && !memcmp(&l->key, key, key_size))
+			return l;
+	}
+
+	return NULL;
+}
+
+static inline u32 htab_map_hash(const void *key, u32 key_len)
+{
+	return jhash(key, key_len, 0);
+}
+
+static int sock_hash_get_next_key(struct bpf_map *map,
+				  void *key, void *next_key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct htab_elem *l, *next_l;
+	struct hlist_head *h;
+	u32 hash, key_size;
+	int i = 0;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+	if (!key)
+		goto find_first_elem;
+	hash = htab_map_hash(key, key_size);
+	h = select_bucket(htab, hash);
+
+	l = lookup_elem_raw(h, hash, key, key_size);
+	if (!l)
+		goto find_first_elem;
+	next_l = hlist_entry_safe(
+		     rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+		     struct htab_elem, hash_node);
+	if (next_l) {
+		memcpy(next_key, next_l->key, key_size);
+		return 0;
+	}
+
+	/* no more elements in this hash list, go to the next bucket */
+	i = hash & (htab->n_buckets - 1);
+	i++;
+
+find_first_elem:
+	/* iterate over buckets */
+	for (; i < htab->n_buckets; i++) {
+		h = select_bucket(htab, i);
+
+		/* pick first element in the bucket */
+		next_l = hlist_entry_safe(
+				rcu_dereference_raw(hlist_first_rcu(h)),
+				struct htab_elem, hash_node);
+		if (next_l) {
+			/* if it's not empty, just return it */
+			memcpy(next_key, next_l->key, key_size);
+			return 0;
+		}
+	}
+
+	/* iterated over all buckets and all elements */
+	return -ENOENT;
+}
+
+static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
+				     struct bpf_map *map,
+				     void *key, u64 map_flags)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct bpf_sock_progs *progs = &htab->progs;
+	struct htab_elem *l_new = NULL, *l_old;
+	struct smap_psock_map_entry *e = NULL;
+	struct hlist_head *head;
+	struct smap_psock *psock;
+	u32 key_size, hash;
+	struct sock *sock;
+	struct bucket *b;
+	int err;
+
+	sock = skops->sk;
+
+	if (sock->sk_type != SOCK_STREAM ||
+	    sock->sk_protocol != IPPROTO_TCP)
+		return -EOPNOTSUPP;
+
+	if (unlikely(map_flags > BPF_EXIST))
+		return -EINVAL;
+
+	e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
+	if (!e)
+		return -ENOMEM;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	key_size = map->key_size;
+	hash = htab_map_hash(key, key_size);
+	b = __select_bucket(htab, hash);
+	head = &b->head;
+
+	err = __sock_map_ctx_update_elem(map, progs, sock, NULL, key);
+	if (err)
+		goto err;
+
+	/* bpf_map_update_elem() can be called in_irq() */
+	raw_spin_lock_bh(&b->lock);
+	l_old = lookup_elem_raw(head, hash, key, key_size);
+	if (l_old && map_flags == BPF_NOEXIST) {
+		err = -EEXIST;
+		goto bucket_err;
+	}
+	if (!l_old && map_flags == BPF_EXIST) {
+		err = -ENOENT;
+		goto bucket_err;
+	}
+
+	l_new = alloc_sock_hash_elem(htab, key, key_size, hash, sock, l_old);
+	if (IS_ERR(l_new)) {
+		err = PTR_ERR(l_new);
+		goto bucket_err;
+	}
+
+	psock = smap_psock_sk(sock);
+	if (unlikely(!psock)) {
+		err = -EINVAL;
+		goto bucket_err;
+	}
+
+	e->hash_link = l_new;
+	e->htab = container_of(map, struct bpf_htab, map);
+	list_add_tail(&e->list, &psock->maps);
+
+	/* add new element to the head of the list, so that
+	 * concurrent search will find it before old elem
+	 */
+	hlist_add_head_rcu(&l_new->hash_node, head);
+	if (l_old) {
+		psock = smap_psock_sk(l_old->sk);
+
+		hlist_del_rcu(&l_old->hash_node);
+		smap_list_remove(psock, NULL, l_old);
+		smap_release_sock(psock, l_old->sk);
+		free_htab_elem(htab, l_old);
+	}
+	raw_spin_unlock_bh(&b->lock);
+	return 0;
+bucket_err:
+	raw_spin_unlock_bh(&b->lock);
+err:
+	kfree(e);
+	psock = smap_psock_sk(sock);
+	if (psock)
+		smap_release_sock(psock, sock);
+	return err;
+}
+
+static int sock_hash_update_elem(struct bpf_map *map,
+				void *key, void *value, u64 flags)
+{
+	struct bpf_sock_ops_kern skops;
+	u32 fd = *(u32 *)value;
+	struct socket *socket;
+	int err;
+
+	socket = sockfd_lookup(fd, &err);
+	if (!socket)
+		return err;
+
+	skops.sk = socket->sk;
+	if (!skops.sk) {
+		fput(socket->file);
+		return -EINVAL;
+	}
+
+	err = sock_hash_ctx_update_elem(&skops, map, key, flags);
+	fput(socket->file);
+	return err;
+}
+
+static int sock_hash_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_head *head;
+	struct bucket *b;
+	struct htab_elem *l;
+	u32 hash, key_size;
+	int ret = -ENOENT;
+
+	key_size = map->key_size;
+	hash = htab_map_hash(key, key_size);
+	b = __select_bucket(htab, hash);
+	head = &b->head;
+
+	raw_spin_lock_bh(&b->lock);
+	l = lookup_elem_raw(head, hash, key, key_size);
+	if (l) {
+		struct sock *sock = l->sk;
+		struct smap_psock *psock;
+
+		hlist_del_rcu(&l->hash_node);
+		write_lock_bh(&sock->sk_callback_lock);
+		psock = smap_psock_sk(sock);
+		/* This check handles a racing sock event that can get the
+		 * sk_callback_lock before this case but after xchg happens
+		 * causing the refcnt to hit zero and sock user data (psock)
+		 * to be null and queued for garbage collection.
+		 */
+		if (likely(psock)) {
+			smap_list_remove(psock, NULL, l);
+			smap_release_sock(psock, sock);
+		}
+		write_unlock_bh(&sock->sk_callback_lock);
+		free_htab_elem(htab, l);
+		ret = 0;
+	}
+	raw_spin_unlock_bh(&b->lock);
+	return ret;
+}
+
+struct sock  *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_head *head;
+	struct htab_elem *l;
+	u32 key_size, hash;
+	struct bucket *b;
+	struct sock *sk;
+
+	key_size = map->key_size;
+	hash = htab_map_hash(key, key_size);
+	b = __select_bucket(htab, hash);
+	head = &b->head;
+
+	raw_spin_lock_bh(&b->lock);
+	l = lookup_elem_raw(head, hash, key, key_size);
+	sk = l ? l->sk : NULL;
+	raw_spin_unlock_bh(&b->lock);
+	return sk;
+}
+
 const struct bpf_map_ops sock_map_ops = {
 	.map_alloc = sock_map_alloc,
 	.map_free = sock_map_free,
@@ -1933,6 +2370,15 @@ static void sock_map_release(struct bpf_map *map)
 	.map_release_uref = sock_map_release,
 };
 
+const struct bpf_map_ops sock_hash_ops = {
+	.map_alloc = sock_hash_alloc,
+	.map_free = sock_hash_free,
+	.map_lookup_elem = sock_map_lookup,
+	.map_get_next_key = sock_hash_get_next_key,
+	.map_update_elem = sock_hash_update_elem,
+	.map_delete_elem = sock_hash_delete_elem,
+};
+
 BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
 	   struct bpf_map *, map, void *, key, u64, flags)
 {
@@ -1950,3 +2396,21 @@ static void sock_map_release(struct bpf_map *map)
 	.arg3_type	= ARG_PTR_TO_MAP_KEY,
 	.arg4_type	= ARG_ANYTHING,
 };
+
+BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock,
+	   struct bpf_map *, map, void *, key, u64, flags)
+{
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	return sock_hash_ctx_update_elem(bpf_sock, map, key, flags);
+}
+
+const struct bpf_func_proto bpf_sock_hash_update_proto = {
+	.func		= bpf_sock_hash_update,
+	.gpl_only	= false,
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_PTR_TO_MAP_KEY,
+	.arg4_type	= ARG_ANYTHING,
+};
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d92d9c37..a9e4b13 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2093,6 +2093,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		    func_id != BPF_FUNC_msg_redirect_map)
 			goto error;
 		break;
+	case BPF_MAP_TYPE_SOCKHASH:
+		if (func_id != BPF_FUNC_sk_redirect_hash &&
+		    func_id != BPF_FUNC_sock_hash_update &&
+		    func_id != BPF_FUNC_map_delete_elem &&
+		    func_id != BPF_FUNC_msg_redirect_hash)
+			goto error;
+		break;
 	default:
 		break;
 	}
@@ -2130,11 +2137,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		break;
 	case BPF_FUNC_sk_redirect_map:
 	case BPF_FUNC_msg_redirect_map:
+	case BPF_FUNC_sock_map_update:
 		if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
 			goto error;
 		break;
-	case BPF_FUNC_sock_map_update:
-		if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
+	case BPF_FUNC_sk_redirect_hash:
+	case BPF_FUNC_msg_redirect_hash:
+	case BPF_FUNC_sock_hash_update:
+		if (map->map_type != BPF_MAP_TYPE_SOCKHASH)
 			goto error;
 		break;
 	default:
diff --git a/net/core/filter.c b/net/core/filter.c
index 61a3ed6..6d0d156 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2074,6 +2074,33 @@ int skb_do_redirect(struct sk_buff *skb)
 	.arg2_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
+	   struct bpf_map *, map, void *, key, u64, flags)
+{
+	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+	/* If user passes invalid input drop the packet. */
+	if (unlikely(flags & ~(BPF_F_INGRESS)))
+		return SK_DROP;
+
+	tcb->bpf.flags = flags;
+	tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
+	if (!tcb->bpf.sk_redir)
+		return SK_DROP;
+
+	return SK_PASS;
+}
+
+static const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
+	.func           = bpf_sk_redirect_hash,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_CONST_MAP_PTR,
+	.arg3_type      = ARG_PTR_TO_MAP_KEY,
+	.arg4_type      = ARG_ANYTHING,
+};
+
 BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
 	   struct bpf_map *, map, u32, key, u64, flags)
 {
@@ -2108,6 +2135,31 @@ struct sock *do_sk_redirect_map(struct sk_buff *skb)
 	.arg4_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg,
+	   struct bpf_map *, map, void *, key, u64, flags)
+{
+	/* If user passes invalid input drop the packet. */
+	if (unlikely(flags & ~(BPF_F_INGRESS)))
+		return SK_DROP;
+
+	msg->flags = flags;
+	msg->sk_redir = __sock_hash_lookup_elem(map, key);
+	if (!msg->sk_redir)
+		return SK_DROP;
+
+	return SK_PASS;
+}
+
+static const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
+	.func           = bpf_msg_redirect_hash,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_CONST_MAP_PTR,
+	.arg3_type      = ARG_PTR_TO_MAP_KEY,
+	.arg4_type      = ARG_ANYTHING,
+};
+
 BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
 	   struct bpf_map *, map, u32, key, u64, flags)
 {
@@ -4502,6 +4554,8 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 		return &bpf_sock_ops_cb_flags_set_proto;
 	case BPF_FUNC_sock_map_update:
 		return &bpf_sock_map_update_proto;
+	case BPF_FUNC_sock_hash_update:
+		return &bpf_sock_hash_update_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -4513,6 +4567,8 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	switch (func_id) {
 	case BPF_FUNC_msg_redirect_map:
 		return &bpf_msg_redirect_map_proto;
+	case BPF_FUNC_msg_redirect_hash:
+		return &bpf_msg_redirect_hash_proto;
 	case BPF_FUNC_msg_apply_bytes:
 		return &bpf_msg_apply_bytes_proto;
 	case BPF_FUNC_msg_cork_bytes:
@@ -4544,6 +4600,8 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 		return &bpf_get_socket_uid_proto;
 	case BPF_FUNC_sk_redirect_map:
 		return &bpf_sk_redirect_map_proto;
+	case BPF_FUNC_sk_redirect_hash:
+		return &bpf_sk_redirect_hash_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
1.9.1

^ permalink raw reply related

* [PATCH bpf-next v6 1/4] bpf: sockmap, refactor sockmap routines to work with hashmap
From: John Fastabend @ 2018-05-14 17:00 UTC (permalink / raw)
  To: daniel, ast; +Cc: netdev, davem, John Fastabend
In-Reply-To: <1526317219-7752-1-git-send-email-john.fastabend@gmail.com>

This patch only refactors the existing sockmap code. This will allow
much of the psock initialization code path and bpf helper codes to
work for both sockmap bpf map types that are backed by an array, the
currently supported type, and the new hash backed bpf map type
sockhash.

Most the fallout comes from three changes,

  - Pushing bpf programs into an independent structure so we
    can use it from the htab struct in the next patch.
  - Generalizing helpers to use void *key instead of the hardcoded
    u32.
  - Instead of passing map/key through the metadata we now do
    the lookup inline. This avoids storing the key in the metadata
    which will be useful when keys can be longer than 4 bytes. We
    rename the sk pointers to sk_redir at this point as well to
    avoid any confusion between the current sk pointer and the
    redirect pointer sk_redir.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h |   3 +-
 include/net/tcp.h      |   3 +-
 kernel/bpf/sockmap.c   | 148 +++++++++++++++++++++++++++++--------------------
 net/core/filter.c      |  31 +++--------
 4 files changed, 98 insertions(+), 87 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index da7e165..9dbcb9d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -515,9 +515,8 @@ struct sk_msg_buff {
 	int sg_end;
 	struct scatterlist sg_data[MAX_SKB_FRAGS];
 	bool sg_copy[MAX_SKB_FRAGS];
-	__u32 key;
 	__u32 flags;
-	struct bpf_map *map;
+	struct sock *sk_redir;
 	struct sk_buff *skb;
 	struct list_head list;
 };
diff --git a/include/net/tcp.h b/include/net/tcp.h
index cf803fe..0592873 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -814,9 +814,8 @@ struct tcp_skb_cb {
 #endif
 		} header;	/* For incoming skbs */
 		struct {
-			__u32 key;
 			__u32 flags;
-			struct bpf_map *map;
+			struct sock *sk_redir;
 			void *data_end;
 		} bpf;
 	};
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 098eca5..beab9ec 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -48,14 +48,18 @@
 #define SOCK_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
 
-struct bpf_stab {
-	struct bpf_map map;
-	struct sock **sock_map;
+struct bpf_sock_progs {
 	struct bpf_prog *bpf_tx_msg;
 	struct bpf_prog *bpf_parse;
 	struct bpf_prog *bpf_verdict;
 };
 
+struct bpf_stab {
+	struct bpf_map map;
+	struct sock **sock_map;
+	struct bpf_sock_progs progs;
+};
+
 enum smap_psock_state {
 	SMAP_TX_RUNNING,
 };
@@ -461,7 +465,7 @@ static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md)
 static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md)
 {
 	return ((_rc == SK_PASS) ?
-	       (md->map ? __SK_REDIRECT : __SK_PASS) :
+	       (md->sk_redir ? __SK_REDIRECT : __SK_PASS) :
 	       __SK_DROP);
 }
 
@@ -1092,7 +1096,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 	 * when we orphan the skb so that we don't have the possibility
 	 * to reference a stale map.
 	 */
-	TCP_SKB_CB(skb)->bpf.map = NULL;
+	TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
 	skb->sk = psock->sock;
 	bpf_compute_data_pointers(skb);
 	preempt_disable();
@@ -1102,7 +1106,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 
 	/* Moving return codes from UAPI namespace into internal namespace */
 	return rc == SK_PASS ?
-		(TCP_SKB_CB(skb)->bpf.map ? __SK_REDIRECT : __SK_PASS) :
+		(TCP_SKB_CB(skb)->bpf.sk_redir ? __SK_REDIRECT : __SK_PASS) :
 		__SK_DROP;
 }
 
@@ -1372,7 +1376,6 @@ static int smap_init_sock(struct smap_psock *psock,
 }
 
 static void smap_init_progs(struct smap_psock *psock,
-			    struct bpf_stab *stab,
 			    struct bpf_prog *verdict,
 			    struct bpf_prog *parse)
 {
@@ -1450,14 +1453,13 @@ static void smap_gc_work(struct work_struct *w)
 	kfree(psock);
 }
 
-static struct smap_psock *smap_init_psock(struct sock *sock,
-					  struct bpf_stab *stab)
+static struct smap_psock *smap_init_psock(struct sock *sock, int node)
 {
 	struct smap_psock *psock;
 
 	psock = kzalloc_node(sizeof(struct smap_psock),
 			     GFP_ATOMIC | __GFP_NOWARN,
-			     stab->map.numa_node);
+			     node);
 	if (!psock)
 		return ERR_PTR(-ENOMEM);
 
@@ -1662,40 +1664,26 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
  *  - sock_map must use READ_ONCE and (cmp)xchg operations
  *  - BPF verdict/parse programs must use READ_ONCE and xchg operations
  */
-static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
-				    struct bpf_map *map,
-				    void *key, u64 flags)
+
+static int __sock_map_ctx_update_elem(struct bpf_map *map,
+				      struct bpf_sock_progs *progs,
+				      struct sock *sock,
+				      struct sock **map_link,
+				      void *key)
 {
-	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-	struct smap_psock_map_entry *e = NULL;
 	struct bpf_prog *verdict, *parse, *tx_msg;
-	struct sock *osock, *sock;
+	struct smap_psock_map_entry *e = NULL;
 	struct smap_psock *psock;
-	u32 i = *(u32 *)key;
 	bool new = false;
 	int err;
 
-	if (unlikely(flags > BPF_EXIST))
-		return -EINVAL;
-
-	if (unlikely(i >= stab->map.max_entries))
-		return -E2BIG;
-
-	sock = READ_ONCE(stab->sock_map[i]);
-	if (flags == BPF_EXIST && !sock)
-		return -ENOENT;
-	else if (flags == BPF_NOEXIST && sock)
-		return -EEXIST;
-
-	sock = skops->sk;
-
 	/* 1. If sock map has BPF programs those will be inherited by the
 	 * sock being added. If the sock is already attached to BPF programs
 	 * this results in an error.
 	 */
-	verdict = READ_ONCE(stab->bpf_verdict);
-	parse = READ_ONCE(stab->bpf_parse);
-	tx_msg = READ_ONCE(stab->bpf_tx_msg);
+	verdict = READ_ONCE(progs->bpf_verdict);
+	parse = READ_ONCE(progs->bpf_parse);
+	tx_msg = READ_ONCE(progs->bpf_tx_msg);
 
 	if (parse && verdict) {
 		/* bpf prog refcnt may be zero if a concurrent attach operation
@@ -1703,11 +1691,11 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 		 * we increment the refcnt. If this is the case abort with an
 		 * error.
 		 */
-		verdict = bpf_prog_inc_not_zero(stab->bpf_verdict);
+		verdict = bpf_prog_inc_not_zero(progs->bpf_verdict);
 		if (IS_ERR(verdict))
 			return PTR_ERR(verdict);
 
-		parse = bpf_prog_inc_not_zero(stab->bpf_parse);
+		parse = bpf_prog_inc_not_zero(progs->bpf_parse);
 		if (IS_ERR(parse)) {
 			bpf_prog_put(verdict);
 			return PTR_ERR(parse);
@@ -1715,7 +1703,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 	}
 
 	if (tx_msg) {
-		tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg);
+		tx_msg = bpf_prog_inc_not_zero(progs->bpf_tx_msg);
 		if (IS_ERR(tx_msg)) {
 			if (verdict)
 				bpf_prog_put(verdict);
@@ -1748,7 +1736,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 			goto out_progs;
 		}
 	} else {
-		psock = smap_init_psock(sock, stab);
+		psock = smap_init_psock(sock, map->numa_node);
 		if (IS_ERR(psock)) {
 			err = PTR_ERR(psock);
 			goto out_progs;
@@ -1763,7 +1751,6 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 		err = -ENOMEM;
 		goto out_progs;
 	}
-	e->entry = &stab->sock_map[i];
 
 	/* 3. At this point we have a reference to a valid psock that is
 	 * running. Attach any BPF programs needed.
@@ -1780,7 +1767,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 		err = smap_init_sock(psock, sock);
 		if (err)
 			goto out_free;
-		smap_init_progs(psock, stab, verdict, parse);
+		smap_init_progs(psock, verdict, parse);
 		smap_start_sock(psock, sock);
 	}
 
@@ -1789,19 +1776,12 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 	 * it with. Because we can only have a single set of programs if
 	 * old_sock has a strp we can stop it.
 	 */
-	list_add_tail(&e->list, &psock->maps);
-	write_unlock_bh(&sock->sk_callback_lock);
-
-	osock = xchg(&stab->sock_map[i], sock);
-	if (osock) {
-		struct smap_psock *opsock = smap_psock_sk(osock);
-
-		write_lock_bh(&osock->sk_callback_lock);
-		smap_list_remove(opsock, &stab->sock_map[i]);
-		smap_release_sock(opsock, osock);
-		write_unlock_bh(&osock->sk_callback_lock);
+	if (map_link) {
+		e->entry = map_link;
+		list_add_tail(&e->list, &psock->maps);
 	}
-	return 0;
+	write_unlock_bh(&sock->sk_callback_lock);
+	return err;
 out_free:
 	smap_release_sock(psock, sock);
 out_progs:
@@ -1816,23 +1796,69 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 	return err;
 }
 
-int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
+static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
+				    struct bpf_map *map,
+				    void *key, u64 flags)
 {
 	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	struct bpf_sock_progs *progs = &stab->progs;
+	struct sock *osock, *sock;
+	u32 i = *(u32 *)key;
+	int err;
+
+	if (unlikely(flags > BPF_EXIST))
+		return -EINVAL;
+
+	if (unlikely(i >= stab->map.max_entries))
+		return -E2BIG;
+
+	sock = READ_ONCE(stab->sock_map[i]);
+	if (flags == BPF_EXIST && !sock)
+		return -ENOENT;
+	else if (flags == BPF_NOEXIST && sock)
+		return -EEXIST;
+
+	sock = skops->sk;
+	err = __sock_map_ctx_update_elem(map, progs, sock, &stab->sock_map[i],
+					 key);
+	if (err)
+		goto out;
+
+	osock = xchg(&stab->sock_map[i], sock);
+	if (osock) {
+		struct smap_psock *opsock = smap_psock_sk(osock);
+
+		write_lock_bh(&osock->sk_callback_lock);
+		smap_list_remove(opsock, &stab->sock_map[i]);
+		smap_release_sock(opsock, osock);
+		write_unlock_bh(&osock->sk_callback_lock);
+	}
+out:
+	return 0;
+}
+
+int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
+{
+	struct bpf_sock_progs *progs;
 	struct bpf_prog *orig;
 
-	if (unlikely(map->map_type != BPF_MAP_TYPE_SOCKMAP))
+	if (map->map_type == BPF_MAP_TYPE_SOCKMAP) {
+		struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+		progs = &stab->progs;
+	} else {
 		return -EINVAL;
+	}
 
 	switch (type) {
 	case BPF_SK_MSG_VERDICT:
-		orig = xchg(&stab->bpf_tx_msg, prog);
+		orig = xchg(&progs->bpf_tx_msg, prog);
 		break;
 	case BPF_SK_SKB_STREAM_PARSER:
-		orig = xchg(&stab->bpf_parse, prog);
+		orig = xchg(&progs->bpf_parse, prog);
 		break;
 	case BPF_SK_SKB_STREAM_VERDICT:
-		orig = xchg(&stab->bpf_verdict, prog);
+		orig = xchg(&progs->bpf_verdict, prog);
 		break;
 	default:
 		return -EOPNOTSUPP;
@@ -1881,16 +1907,18 @@ static int sock_map_update_elem(struct bpf_map *map,
 static void sock_map_release(struct bpf_map *map)
 {
 	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	struct bpf_sock_progs *progs;
 	struct bpf_prog *orig;
 
-	orig = xchg(&stab->bpf_parse, NULL);
+	progs = &stab->progs;
+	orig = xchg(&progs->bpf_parse, NULL);
 	if (orig)
 		bpf_prog_put(orig);
-	orig = xchg(&stab->bpf_verdict, NULL);
+	orig = xchg(&progs->bpf_verdict, NULL);
 	if (orig)
 		bpf_prog_put(orig);
 
-	orig = xchg(&stab->bpf_tx_msg, NULL);
+	orig = xchg(&progs->bpf_tx_msg, NULL);
 	if (orig)
 		bpf_prog_put(orig);
 }
diff --git a/net/core/filter.c b/net/core/filter.c
index ca60d28..61a3ed6 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2083,9 +2083,10 @@ int skb_do_redirect(struct sk_buff *skb)
 	if (unlikely(flags & ~(BPF_F_INGRESS)))
 		return SK_DROP;
 
-	tcb->bpf.key = key;
 	tcb->bpf.flags = flags;
-	tcb->bpf.map = map;
+	tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key);
+	if (!tcb->bpf.sk_redir)
+		return SK_DROP;
 
 	return SK_PASS;
 }
@@ -2093,16 +2094,8 @@ int skb_do_redirect(struct sk_buff *skb)
 struct sock *do_sk_redirect_map(struct sk_buff *skb)
 {
 	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
-	struct sock *sk = NULL;
-
-	if (tcb->bpf.map) {
-		sk = __sock_map_lookup_elem(tcb->bpf.map, tcb->bpf.key);
 
-		tcb->bpf.key = 0;
-		tcb->bpf.map = NULL;
-	}
-
-	return sk;
+	return tcb->bpf.sk_redir;
 }
 
 static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
@@ -2122,25 +2115,17 @@ struct sock *do_sk_redirect_map(struct sk_buff *skb)
 	if (unlikely(flags & ~(BPF_F_INGRESS)))
 		return SK_DROP;
 
-	msg->key = key;
 	msg->flags = flags;
-	msg->map = map;
+	msg->sk_redir = __sock_map_lookup_elem(map, key);
+	if (!msg->sk_redir)
+		return SK_DROP;
 
 	return SK_PASS;
 }
 
 struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
 {
-	struct sock *sk = NULL;
-
-	if (msg->map) {
-		sk = __sock_map_lookup_elem(msg->map, msg->key);
-
-		msg->key = 0;
-		msg->map = NULL;
-	}
-
-	return sk;
+	return msg->sk_redir;
 }
 
 static const struct bpf_func_proto bpf_msg_redirect_map_proto = {
-- 
1.9.1

^ permalink raw reply related

* [PATCH bpf-next v6 0/4] Hash support for sock
From: John Fastabend @ 2018-05-14 17:00 UTC (permalink / raw)
  To: daniel, ast; +Cc: netdev, davem, John Fastabend

In the original sockmap implementation we got away with using an
array similar to devmap. However, unlike devmap where an ifindex
has a nice 1:1 function into the map we have found some use cases
with sockets that need to be referenced using longer keys.

This series adds support for a sockhash map reusing as much of
the sockmap code as possible. I made the decision to add sockhash
specific helpers vs trying to generalize the existing helpers
because (a) they have sockmap in the name and (b) the keys are
different types. I prefer to be explicit here rather than play
type games or do something else tricky.

To test this we duplicate all the sockmap testing except swap out
the sockmap with a sockhash.

v2: fix file stats and add v2 tag
v3: move tool updates into test patch, move bpftool updates into
    its own patch, and fixup the test patch stats to catch the
    renamed file and provide only diffs +/- on that.
v4: Add documentation to UAPI bpf.h
v5: Add documentation to tools UAPI bpf.h
v6: 'git add' test_sockhash_kern.c which was previously missing
    but was not causing issues because of typo in test script,
    noticed by Daniel. After this the git format-patch -M option
    no longer tracks the rename of the test_sockmap_kern files for
    some reason. I guess the diff has exceeded some threshold.

Just a note I pushed Dave's Acks through v4 into v5 due to small
size of changes.

John Fastabend (4):
  bpf: sockmap, refactor sockmap routines to work with hashmap
  bpf: sockmap, add hash map support
  bpf: selftest additions for SOCKHASH
  bpf: bpftool, support for sockhash

 include/linux/bpf.h                              |   8 +
 include/linux/bpf_types.h                        |   1 +
 include/linux/filter.h                           |   3 +-
 include/net/tcp.h                                |   3 +-
 include/uapi/linux/bpf.h                         |  52 +-
 kernel/bpf/core.c                                |   1 +
 kernel/bpf/sockmap.c                             | 638 ++++++++++++++++++++---
 kernel/bpf/verifier.c                            |  14 +-
 net/core/filter.c                                |  89 +++-
 tools/bpf/bpftool/map.c                          |   1 +
 tools/include/uapi/linux/bpf.h                   |  52 +-
 tools/testing/selftests/bpf/Makefile             |   2 +-
 tools/testing/selftests/bpf/bpf_helpers.h        |   8 +
 tools/testing/selftests/bpf/test_sockhash_kern.c |   5 +
 tools/testing/selftests/bpf/test_sockmap.c       |  27 +-
 tools/testing/selftests/bpf/test_sockmap_kern.c  | 343 +-----------
 tools/testing/selftests/bpf/test_sockmap_kern.h  | 363 +++++++++++++
 17 files changed, 1159 insertions(+), 451 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_sockhash_kern.c
 create mode 100644 tools/testing/selftests/bpf/test_sockmap_kern.h

-- 
1.9.1

^ permalink raw reply

* Re: [PATCH v3 0/1] multi-threading device shutdown
From: Pavel Tatashin @ 2018-05-14 16:54 UTC (permalink / raw)
  To: Greg KH
  Cc: steven.sistare, daniel.m.jordan, linux-kernel, jeffrey.t.kirsher,
	intel-wired-lan, netdev, alexander.duyck, tobin
In-Reply-To: <20180514150336.GA18769@kroah.com>

On 05/14/2018 11:03 AM, Greg KH wrote:
> On Mon, May 07, 2018 at 11:54:01AM -0400, Pavel Tatashin wrote:
>> Changelog
>> v2 - v3
>> 	- Fixed warning from kbuild test.
>> 	- Moved device_lock/device_unlock inside device_shutdown_tree().
>>
>> v1 - v2
>> 	- It turns out we cannot lock more than MAX_LOCK_DEPTH by a single
>> 	  thread. (By default this value is 48), and is used to detect
>> 	  deadlocks. So, I re-wrote the code to only lock one devices per
>> 	  thread instead of pre-locking all devices by the main thread.
>> 	- Addressed comments from Tobin C. Harding.
>> 	- As suggested by Alexander Duyck removed ixgbe changes. It can be
>> 	  done as a separate work scaling RTNL mutex.
>>
>> Do a faster shutdown by calling dev->*->shutdown(dev) in parallel.
>> device_shutdown() calls these functions for every single device but
>> only using one thread.
>>
>> Since, nothing else is running on the machine by the device_shutdown()
>> s called, there is no reason not to utilize all the available CPU
>> resources.
> 
> Ah, we can hope so.  I bet this is going to break something, so can we
> have some way of turning it on/off dynamically for when it does?

Hi Greg,

Sure, I will add a kernel parameter to optionally disable this feature in the next patch revision.

Thank you,
Pavel

> 
> thanks,
> 
> greg k-h
> 

^ permalink raw reply

* Re: [net-next PATCH 1/3] net: Refactor XPS for CPUs and Rx queues
From: Tom Herbert @ 2018-05-14 16:53 UTC (permalink / raw)
  To: Nambiar, Amritha
  Cc: Linux Kernel Network Developers, David S. Miller, Alexander Duyck,
	Sridhar Samudrala, Eric Dumazet, Hannes Frederic Sowa
In-Reply-To: <951a1587-e51a-2927-902d-1f3d2f3b2d7f@intel.com>

On Wed, May 9, 2018 at 1:54 PM, Nambiar, Amritha
<amritha.nambiar@intel.com> wrote:
> On 5/9/2018 1:31 PM, Tom Herbert wrote:
>> On Thu, Apr 19, 2018 at 6:04 PM, Amritha Nambiar
>> <amritha.nambiar@intel.com> wrote:
>>> Refactor XPS code to support Tx queue selection based on
>>> CPU map or Rx queue map.
>>>
>>> Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
>>> ---
>>>  include/linux/netdevice.h |   82 +++++++++++++++++-
>>>  net/core/dev.c            |  206 +++++++++++++++++++++++++++++----------------
>>>  net/core/net-sysfs.c      |    4 -
>>>  3 files changed, 216 insertions(+), 76 deletions(-)
>>>
>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>> index 14e0777..40a9171 100644
>>> --- a/include/linux/netdevice.h
>>> +++ b/include/linux/netdevice.h
>>> @@ -730,10 +730,21 @@ struct xps_map {
>>>   */
>>>  struct xps_dev_maps {
>>>         struct rcu_head rcu;
>>> -       struct xps_map __rcu *cpu_map[0];
>>> +       struct xps_map __rcu *attr_map[0];
>>
>> This seems unnecessarily complicated to me. Why not just add another
>> map called something like "rxq2txq_map". Then when selecting TXQ just
>> check the new map first and then the normal cpu_map if there's not a
>> hit.
>>
>
> This is just a change in the name to something more generic ('attr')
> since the maps can either be cpu based or rxq based. I have added two
> map types, XPS_MAP_RXQS, XPS_MAP_CPUS and the TXQ selection (in patch

I think adding map types is overkill and we really don't want to turn
this in to a generic but complex interface with a bunch of map types.
Just have two pointers to the two different maps.

> 2/3) works how you described,  first based on the RXQ map and if there
> is no hit, falls to the normal CPU map.
>
>>>  };
>>> -#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) +         \
>>> +
>>> +#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) +     \
>>>         (nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))
>>> +
>>> +#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\
>>> +       (_rxqs * (_tcs) * sizeof(struct xps_map *)))
>>> +
>>> +enum xps_map_type {
>>> +       XPS_MAP_RXQS,
>>> +       XPS_MAP_CPUS,
>>> +       __XPS_MAP_MAX
>>> +};
>>> +
>>>  #endif /* CONFIG_XPS */
>>>
>>>  #define TC_MAX_QUEUE   16
>>> @@ -1867,7 +1878,7 @@ struct net_device {
>>>         int                     watchdog_timeo;
>>>
>>>  #ifdef CONFIG_XPS
>>> -       struct xps_dev_maps __rcu *xps_maps;
>>> +       struct xps_dev_maps __rcu *xps_maps[__XPS_MAP_MAX];
>>>  #endif
>>>  #ifdef CONFIG_NET_CLS_ACT
>>>         struct mini_Qdisc __rcu *miniq_egress;
>>> @@ -3204,6 +3215,71 @@ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
>>>  #ifdef CONFIG_XPS
>>>  int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>>                         u16 index);
>>> +int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
>>> +                         u16 index, enum xps_map_type type);
>>> +
>>> +static inline bool attr_test_mask(unsigned long j, const unsigned long *mask,
>>> +                                 unsigned int nr_bits)
>>> +{
>>> +#ifdef CONFIG_DEBUG_PER_CPU_MAPS
>>> +       WARN_ON_ONCE(j >= nr_bits);
>>> +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
>>
>> This #ifdef block appears 3 times in the patch. Seems like it should
>> be replace by simple macro.
>
> Sure, will do in the next version.
>
>>
>>> +       return test_bit(j, mask);
>>> +}
>>> +
>>> +static inline bool attr_test_online(unsigned long j,
>>> +                                   const unsigned long *online_mask,
>>> +                                   unsigned int nr_bits)
>>> +{
>>> +#ifdef CONFIG_DEBUG_PER_CPU_MAPS
>>> +       WARN_ON_ONCE(j >= nr_bits);
>>> +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
>>> +
>>> +       if (online_mask)
>>> +               return test_bit(j, online_mask);
>>> +
>>> +       if (j >= 0 && j < nr_bits)
>>> +               return true;
>>> +
>>> +       return false;
>>> +}
>>> +
>>> +static inline unsigned int attrmask_next(int n, const unsigned long *srcp,
>>> +                                        unsigned int nr_bits)
>>> +{
>>> +       /* -1 is a legal arg here. */
>>> +       if (n != -1) {
>>> +#ifdef CONFIG_DEBUG_PER_CPU_MAPS
>>> +               WARN_ON_ONCE(n >= nr_bits);
>>> +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
>>> +       }
>>> +
>>> +       if (srcp)
>>> +               return find_next_bit(srcp, nr_bits, n + 1);
>>> +
>>> +       return n + 1;
>>> +}
>>> +
>>> +static inline int attrmask_next_and(int n, const unsigned long *src1p,
>>> +                                   const unsigned long *src2p,
>>> +                                   unsigned int nr_bits)
>>> +{
>>> +       /* -1 is a legal arg here. */
>>> +       if (n != -1) {
>>> +#ifdef CONFIG_DEBUG_PER_CPU_MAPS
>>> +               WARN_ON_ONCE(n >= nr_bits);
>>> +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
>>> +       }
>>> +
>>> +       if (src1p && src2p)
>>> +               return find_next_and_bit(src1p, src2p, nr_bits, n + 1);
>>> +       else if (src1p)
>>> +               return find_next_bit(src1p, nr_bits, n + 1);
>>> +       else if (src2p)
>>> +               return find_next_bit(src2p, nr_bits, n + 1);
>>> +
>>> +       return n + 1;
>>> +}
>>>  #else
>>>  static inline int netif_set_xps_queue(struct net_device *dev,
>>>                                       const struct cpumask *mask,
>>> diff --git a/net/core/dev.c b/net/core/dev.c
>>> index a490ef6..17c4883 100644
>>> --- a/net/core/dev.c
>>> +++ b/net/core/dev.c
>>> @@ -2092,7 +2092,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
>>>         int pos;
>>>
>>>         if (dev_maps)
>>> -               map = xmap_dereference(dev_maps->cpu_map[tci]);
>>> +               map = xmap_dereference(dev_maps->attr_map[tci]);
>>>         if (!map)
>>>                 return false;
>>>
>>> @@ -2105,7 +2105,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
>>>                         break;
>>>                 }
>>>
>>> -               RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
>>> +               RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
>>>                 kfree_rcu(map, rcu);
>>>                 return false;
>>>         }
>>> @@ -2138,30 +2138,47 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
>>>  static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
>>>                                    u16 count)
>>>  {
>>> +       const unsigned long *possible_mask = NULL;
>>> +       enum xps_map_type type = XPS_MAP_RXQS;
>>>         struct xps_dev_maps *dev_maps;
>>> -       int cpu, i;
>>>         bool active = false;
>>> +       unsigned int nr_ids;
>>> +       int i, j;
>>>
>>>         mutex_lock(&xps_map_mutex);
>>> -       dev_maps = xmap_dereference(dev->xps_maps);
>>>
>>> -       if (!dev_maps)
>>> -               goto out_no_maps;
>>> +       while (type < __XPS_MAP_MAX) {
>>> +               dev_maps = xmap_dereference(dev->xps_maps[type]);
>>> +               if (!dev_maps)
>>> +                       goto out_no_maps;
>>> +
>>> +               if (type == XPS_MAP_CPUS) {
>>> +                       if (num_possible_cpus() > 1)
>>> +                               possible_mask = cpumask_bits(cpu_possible_mask);
>>> +                       nr_ids = nr_cpu_ids;
>>> +               } else if (type == XPS_MAP_RXQS) {
>>> +                       nr_ids = dev->num_rx_queues;
>>> +               }
>>>
>>> -       for_each_possible_cpu(cpu)
>>> -               active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
>>> -                                              offset, count);
>>> +               for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
>>> +                    j < nr_ids;)
>>> +                       active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
>>> +                                                      count);
>>> +               if (!active) {
>>> +                       RCU_INIT_POINTER(dev->xps_maps[type], NULL);
>>> +                       kfree_rcu(dev_maps, rcu);
>>> +               }
>>>
>>> -       if (!active) {
>>> -               RCU_INIT_POINTER(dev->xps_maps, NULL);
>>> -               kfree_rcu(dev_maps, rcu);
>>> +               if (type == XPS_MAP_CPUS) {
>>> +                       for (i = offset + (count - 1); count--; i--)
>>> +                               netdev_queue_numa_node_write(
>>> +                                       netdev_get_tx_queue(dev, i),
>>> +                                                           NUMA_NO_NODE);
>>> +               }
>>> +out_no_maps:
>>> +               type++;
>>>         }
>>>
>>> -       for (i = offset + (count - 1); count--; i--)
>>> -               netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
>>> -                                            NUMA_NO_NODE);
>>> -
>>> -out_no_maps:
>>>         mutex_unlock(&xps_map_mutex);
>>>  }
>>>
>>> @@ -2170,11 +2187,11 @@ static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
>>>         netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
>>>  }
>>>
>>> -static struct xps_map *expand_xps_map(struct xps_map *map,
>>> -                                     int cpu, u16 index)
>>> +static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
>>> +                                     u16 index, enum xps_map_type type)
>>>  {
>>> -       struct xps_map *new_map;
>>>         int alloc_len = XPS_MIN_MAP_ALLOC;
>>> +       struct xps_map *new_map = NULL;
>>>         int i, pos;
>>>
>>>         for (pos = 0; map && pos < map->len; pos++) {
>>> @@ -2183,7 +2200,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
>>>                 return map;
>>>         }
>>>
>>> -       /* Need to add queue to this CPU's existing map */
>>> +       /* Need to add tx-queue to this CPU's/rx-queue's existing map */
>>>         if (map) {
>>>                 if (pos < map->alloc_len)
>>>                         return map;
>>> @@ -2191,9 +2208,14 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
>>>                 alloc_len = map->alloc_len * 2;
>>>         }
>>>
>>> -       /* Need to allocate new map to store queue on this CPU's map */
>>> -       new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
>>> -                              cpu_to_node(cpu));
>>> +       /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
>>> +        *  map
>>> +        */
>>> +       if (type == XPS_MAP_RXQS)
>>> +               new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
>>> +       else if (type == XPS_MAP_CPUS)
>>> +               new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
>>> +                                      cpu_to_node(attr_index));
>>>         if (!new_map)
>>>                 return NULL;
>>>
>>> @@ -2205,14 +2227,16 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
>>>         return new_map;
>>>  }
>>>
>>> -int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>> -                       u16 index)
>>> +int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
>>> +                         u16 index, enum xps_map_type type)
>>>  {
>>> +       const unsigned long *online_mask = NULL, *possible_mask = NULL;
>>>         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
>>> -       int i, cpu, tci, numa_node_id = -2;
>>> +       int i, j, tci, numa_node_id = -2;
>>>         int maps_sz, num_tc = 1, tc = 0;
>>>         struct xps_map *map, *new_map;
>>>         bool active = false;
>>> +       unsigned int nr_ids;
>>>
>>>         if (dev->num_tc) {
>>>                 num_tc = dev->num_tc;
>>> @@ -2221,16 +2245,33 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>>                         return -EINVAL;
>>>         }
>>>
>>> -       maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
>>> +       switch (type) {
>>> +       case XPS_MAP_RXQS:
>>> +               maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
>>> +               dev_maps = xmap_dereference(dev->xps_maps[XPS_MAP_RXQS]);
>>> +               nr_ids = dev->num_rx_queues;
>>> +               break;
>>> +       case XPS_MAP_CPUS:
>>> +               maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
>>> +               if (num_possible_cpus() > 1) {
>>> +                       online_mask = cpumask_bits(cpu_online_mask);
>>> +                       possible_mask = cpumask_bits(cpu_possible_mask);
>>> +               }
>>> +               dev_maps = xmap_dereference(dev->xps_maps[XPS_MAP_CPUS]);
>>> +               nr_ids = nr_cpu_ids;
>>> +               break;
>>> +       default:
>>> +               return -EINVAL;
>>> +       }
>>> +
>>>         if (maps_sz < L1_CACHE_BYTES)
>>>                 maps_sz = L1_CACHE_BYTES;
>>>
>>>         mutex_lock(&xps_map_mutex);
>>>
>>> -       dev_maps = xmap_dereference(dev->xps_maps);
>>> -
>>>         /* allocate memory for queue storage */
>>> -       for_each_cpu_and(cpu, cpu_online_mask, mask) {
>>> +       for (j = -1; j = attrmask_next_and(j, online_mask, mask, nr_ids),
>>> +            j < nr_ids;) {
>>>                 if (!new_dev_maps)
>>>                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
>>>                 if (!new_dev_maps) {
>>> @@ -2238,73 +2279,81 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>>                         return -ENOMEM;
>>>                 }
>>>
>>> -               tci = cpu * num_tc + tc;
>>> -               map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
>>> +               tci = j * num_tc + tc;
>>> +               map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
>>>                                  NULL;
>>>
>>> -               map = expand_xps_map(map, cpu, index);
>>> +               map = expand_xps_map(map, j, index, type);
>>>                 if (!map)
>>>                         goto error;
>>>
>>> -               RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
>>> +               RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
>>>         }
>>>
>>>         if (!new_dev_maps)
>>>                 goto out_no_new_maps;
>>>
>>> -       for_each_possible_cpu(cpu) {
>>> +       for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
>>> +            j < nr_ids;) {
>>>                 /* copy maps belonging to foreign traffic classes */
>>> -               for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
>>> +               for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
>>>                         /* fill in the new device map from the old device map */
>>> -                       map = xmap_dereference(dev_maps->cpu_map[tci]);
>>> -                       RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
>>> +                       map = xmap_dereference(dev_maps->attr_map[tci]);
>>> +                       RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
>>>                 }
>>>
>>>                 /* We need to explicitly update tci as prevous loop
>>>                  * could break out early if dev_maps is NULL.
>>>                  */
>>> -               tci = cpu * num_tc + tc;
>>> +               tci = j * num_tc + tc;
>>>
>>> -               if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
>>> -                       /* add queue to CPU maps */
>>> +               if (attr_test_mask(j, mask, nr_ids) &&
>>> +                   attr_test_online(j, online_mask, nr_ids)) {
>>> +                       /* add tx-queue to CPU/rx-queue maps */
>>>                         int pos = 0;
>>>
>>> -                       map = xmap_dereference(new_dev_maps->cpu_map[tci]);
>>> +                       map = xmap_dereference(new_dev_maps->attr_map[tci]);
>>>                         while ((pos < map->len) && (map->queues[pos] != index))
>>>                                 pos++;
>>>
>>>                         if (pos == map->len)
>>>                                 map->queues[map->len++] = index;
>>>  #ifdef CONFIG_NUMA
>>> -                       if (numa_node_id == -2)
>>> -                               numa_node_id = cpu_to_node(cpu);
>>> -                       else if (numa_node_id != cpu_to_node(cpu))
>>> -                               numa_node_id = -1;
>>> +                       if (type == XPS_MAP_CPUS) {
>>> +                               if (numa_node_id == -2)
>>> +                                       numa_node_id = cpu_to_node(j);
>>> +                               else if (numa_node_id != cpu_to_node(j))
>>> +                                       numa_node_id = -1;
>>> +                       }
>>>  #endif
>>>                 } else if (dev_maps) {
>>>                         /* fill in the new device map from the old device map */
>>> -                       map = xmap_dereference(dev_maps->cpu_map[tci]);
>>> -                       RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
>>> +                       map = xmap_dereference(dev_maps->attr_map[tci]);
>>> +                       RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
>>>                 }
>>>
>>>                 /* copy maps belonging to foreign traffic classes */
>>>                 for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
>>>                         /* fill in the new device map from the old device map */
>>> -                       map = xmap_dereference(dev_maps->cpu_map[tci]);
>>> -                       RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
>>> +                       map = xmap_dereference(dev_maps->attr_map[tci]);
>>> +                       RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
>>>                 }
>>>         }
>>>
>>> -       rcu_assign_pointer(dev->xps_maps, new_dev_maps);
>>> +       if (type == XPS_MAP_RXQS)
>>> +               rcu_assign_pointer(dev->xps_maps[XPS_MAP_RXQS], new_dev_maps);
>>> +       else if (type == XPS_MAP_CPUS)
>>> +               rcu_assign_pointer(dev->xps_maps[XPS_MAP_CPUS], new_dev_maps);
>>>
>>>         /* Cleanup old maps */
>>>         if (!dev_maps)
>>>                 goto out_no_old_maps;
>>>
>>> -       for_each_possible_cpu(cpu) {
>>> -               for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
>>> -                       new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
>>> -                       map = xmap_dereference(dev_maps->cpu_map[tci]);
>>> +       for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
>>> +            j < nr_ids;) {
>>> +               for (i = num_tc, tci = j * num_tc; i--; tci++) {
>>> +                       new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
>>> +                       map = xmap_dereference(dev_maps->attr_map[tci]);
>>>                         if (map && map != new_map)
>>>                                 kfree_rcu(map, rcu);
>>>                 }
>>> @@ -2317,19 +2366,23 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>>         active = true;
>>>
>>>  out_no_new_maps:
>>> -       /* update Tx queue numa node */
>>> -       netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
>>> -                                    (numa_node_id >= 0) ? numa_node_id :
>>> -                                    NUMA_NO_NODE);
>>> +       if (type == XPS_MAP_CPUS) {
>>> +               /* update Tx queue numa node */
>>> +               netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
>>> +                                            (numa_node_id >= 0) ?
>>> +                                            numa_node_id : NUMA_NO_NODE);
>>> +       }
>>>
>>>         if (!dev_maps)
>>>                 goto out_no_maps;
>>>
>>> -       /* removes queue from unused CPUs */
>>> -       for_each_possible_cpu(cpu) {
>>> -               for (i = tc, tci = cpu * num_tc; i--; tci++)
>>> +       /* removes tx-queue from unused CPUs/rx-queues */
>>> +       for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
>>> +            j < nr_ids;) {
>>> +               for (i = tc, tci = j * num_tc; i--; tci++)
>>>                         active |= remove_xps_queue(dev_maps, tci, index);
>>> -               if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
>>> +               if (!attr_test_mask(j, mask, nr_ids) ||
>>> +                   !attr_test_online(j, online_mask, nr_ids))
>>>                         active |= remove_xps_queue(dev_maps, tci, index);
>>>                 for (i = num_tc - tc, tci++; --i; tci++)
>>>                         active |= remove_xps_queue(dev_maps, tci, index);
>>> @@ -2337,7 +2390,10 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>>
>>>         /* free map if not active */
>>>         if (!active) {
>>> -               RCU_INIT_POINTER(dev->xps_maps, NULL);
>>> +               if (type == XPS_MAP_RXQS)
>>> +                       RCU_INIT_POINTER(dev->xps_maps[XPS_MAP_RXQS], NULL);
>>> +               else if (type == XPS_MAP_CPUS)
>>> +                       RCU_INIT_POINTER(dev->xps_maps[XPS_MAP_CPUS], NULL);
>>>                 kfree_rcu(dev_maps, rcu);
>>>         }
>>>
>>> @@ -2347,11 +2403,12 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>>         return 0;
>>>  error:
>>>         /* remove any maps that we added */
>>> -       for_each_possible_cpu(cpu) {
>>> -               for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
>>> -                       new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
>>> +       for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
>>> +            j < nr_ids;) {
>>> +               for (i = num_tc, tci = j * num_tc; i--; tci++) {
>>> +                       new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
>>>                         map = dev_maps ?
>>> -                             xmap_dereference(dev_maps->cpu_map[tci]) :
>>> +                             xmap_dereference(dev_maps->attr_map[tci]) :
>>>                               NULL;
>>>                         if (new_map && new_map != map)
>>>                                 kfree(new_map);
>>> @@ -2363,6 +2420,13 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>>         kfree(new_dev_maps);
>>>         return -ENOMEM;
>>>  }
>>> +
>>> +int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>> +                       u16 index)
>>> +{
>>> +       return __netif_set_xps_queue(dev, cpumask_bits(mask), index,
>>> +                                    XPS_MAP_CPUS);
>>> +}
>>>  EXPORT_SYMBOL(netif_set_xps_queue);
>>>
>>>  #endif
>>> @@ -3400,7 +3464,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>>>         int queue_index = -1;
>>>
>>>         rcu_read_lock();
>>> -       dev_maps = rcu_dereference(dev->xps_maps);
>>> +       dev_maps = rcu_dereference(dev->xps_maps[XPS_MAP_CPUS]);
>>>         if (dev_maps) {
>>>                 unsigned int tci = skb->sender_cpu - 1;
>>>
>>> @@ -3409,7 +3473,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>>>                         tci += netdev_get_prio_tc_map(dev, skb->priority);
>>>                 }
>>>
>>> -               map = rcu_dereference(dev_maps->cpu_map[tci]);
>>> +               map = rcu_dereference(dev_maps->attr_map[tci]);
>>>                 if (map) {
>>>                         if (map->len == 1)
>>>                                 queue_index = map->queues[0];
>>> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
>>> index c476f07..d7abd33 100644
>>> --- a/net/core/net-sysfs.c
>>> +++ b/net/core/net-sysfs.c
>>> @@ -1227,13 +1227,13 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
>>>         }
>>>
>>>         rcu_read_lock();
>>> -       dev_maps = rcu_dereference(dev->xps_maps);
>>> +       dev_maps = rcu_dereference(dev->xps_maps[XPS_MAP_CPUS]);
>>>         if (dev_maps) {
>>>                 for_each_possible_cpu(cpu) {
>>>                         int i, tci = cpu * num_tc + tc;
>>>                         struct xps_map *map;
>>>
>>> -                       map = rcu_dereference(dev_maps->cpu_map[tci]);
>>> +                       map = rcu_dereference(dev_maps->attr_map[tci]);
>>>                         if (!map)
>>>                                 continue;
>>>
>>>

^ permalink raw reply

* Re: [PATCH 06/14] net: sched: implement reference counted action release
From: Jiri Pirko @ 2018-05-14 16:47 UTC (permalink / raw)
  To: Vlad Buslov
  Cc: netdev, davem, jhs, xiyou.wangcong, pablo, kadlec, fw, ast,
	daniel, edumazet, keescook, linux-kernel, netfilter-devel,
	coreteam, kliteyn
In-Reply-To: <1526308035-12484-7-git-send-email-vladbu@mellanox.com>

Mon, May 14, 2018 at 04:27:07PM CEST, vladbu@mellanox.com wrote:

[...]


>+static int tcf_action_del_1(struct net *net, char *kind, u32 index,
>+			    struct netlink_ext_ack *extack)
>+{
>+	const struct tc_action_ops *ops;
>+	int err = -EINVAL;
>+
>+	ops = tc_lookup_action_n(kind);
>+	if (!ops) {

How this can happen? Apparently you hold reference to the module since
you put it couple lines below. In that case, I don't really understand
why you need to lookup and just don't use "ops" pointer you have in
tcf_action_delete().


>+		NL_SET_ERR_MSG(extack, "Specified TC action not found");
>+		goto err_out;
>+	}
>+
>+	if (ops->delete)
>+		err = ops->delete(net, index);
>+
>+	module_put(ops->owner);
>+err_out:
>+	return err;
>+}
>+
> static int tca_action_flush(struct net *net, struct nlattr *nla,
> 			    struct nlmsghdr *n, u32 portid,
> 			    struct netlink_ext_ack *extack)
>@@ -1052,6 +1088,36 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
> 	return err;
> }
> 
>+static int tcf_action_delete(struct net *net, struct list_head *actions,
>+			     struct netlink_ext_ack *extack)
>+{
>+	int ret;
>+	struct tc_action *a, *tmp;
>+	char kind[IFNAMSIZ];
>+	u32 act_index;
>+
>+	list_for_each_entry_safe(a, tmp, actions, list) {
>+		const struct tc_action_ops *ops = a->ops;
>+
>+		/* Actions can be deleted concurrently
>+		 * so we must save their type and id to search again
>+		 * after reference is released.
>+		 */
>+		strncpy(kind, a->ops->kind, sizeof(kind) - 1);
>+		act_index = a->tcfa_index;
>+
>+		list_del(&a->list);
>+		if (tcf_action_put(a))
>+			module_put(ops->owner);
>+
>+		/* now do the delete */
>+		ret = tcf_action_del_1(net, kind, act_index, extack);
>+		if (ret < 0)
>+			return ret;
>+	}
>+	return 0;
>+}
>+

[...]

^ permalink raw reply

* Re: [PATCH V2] mlx4_core: allocate ICM memory in page size chunks
From: Qing Huang @ 2018-05-14 16:41 UTC (permalink / raw)
  To: Tariq Toukan, tariqt, davem, haakon.bugge, yanjun.zhu
  Cc: netdev, linux-rdma, linux-kernel
In-Reply-To: <2797ac27-022c-0818-388c-e4a6131ad1ca@gmail.com>



On 5/13/2018 2:00 AM, Tariq Toukan wrote:
>
>
> On 11/05/2018 10:23 PM, Qing Huang wrote:
>> When a system is under memory presure (high usage with fragments),
>> the original 256KB ICM chunk allocations will likely trigger kernel
>> memory management to enter slow path doing memory compact/migration
>> ops in order to complete high order memory allocations.
>>
>> When that happens, user processes calling uverb APIs may get stuck
>> for more than 120s easily even though there are a lot of free pages
>> in smaller chunks available in the system.
>>
>> Syslog:
>> ...
>> Dec 10 09:04:51 slcc03db02 kernel: [397078.572732] INFO: task
>> oracle_205573_e:205573 blocked for more than 120 seconds.
>> ...
>>
>> With 4KB ICM chunk size on x86_64 arch, the above issue is fixed.
>>
>> However in order to support smaller ICM chunk size, we need to fix
>> another issue in large size kcalloc allocations.
>>
>> E.g.
>> Setting log_num_mtt=30 requires 1G mtt entries. With the 4KB ICM chunk
>> size, each ICM chunk can only hold 512 mtt entries (8 bytes for each mtt
>> entry). So we need a 16MB allocation for a table->icm pointer array to
>> hold 2M pointers which can easily cause kcalloc to fail.
>>
>> The solution is to use vzalloc to replace kcalloc. There is no need
>> for contiguous memory pages for a driver meta data structure (no need
>> of DMA ops).
>>
>> Signed-off-by: Qing Huang <qing.huang@oracle.com>
>> Acked-by: Daniel Jurgens <danielj@mellanox.com>
>> Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
>> ---
>> v2 -> v1: adjusted chunk size to reflect different architectures.
>>
>>   drivers/net/ethernet/mellanox/mlx4/icm.c | 14 +++++++-------
>>   1 file changed, 7 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c 
>> b/drivers/net/ethernet/mellanox/mlx4/icm.c
>> index a822f7a..ccb62b8 100644
>> --- a/drivers/net/ethernet/mellanox/mlx4/icm.c
>> +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
>> @@ -43,12 +43,12 @@
>>   #include "fw.h"
>>     /*
>> - * We allocate in as big chunks as we can, up to a maximum of 256 KB
>> - * per chunk.
>> + * We allocate in page size (default 4KB on many archs) chunks to 
>> avoid high
>> + * order memory allocations in fragmented/high usage memory situation.
>>    */
>>   enum {
>> -    MLX4_ICM_ALLOC_SIZE    = 1 << 18,
>> -    MLX4_TABLE_CHUNK_SIZE    = 1 << 18
>> +    MLX4_ICM_ALLOC_SIZE    = 1 << PAGE_SHIFT,
>> +    MLX4_TABLE_CHUNK_SIZE    = 1 << PAGE_SHIFT
>
> Which is actually PAGE_SIZE.

Yes, we wanted to avoid high order memory allocations.

> Also, please add a comma at the end of the last entry.

Hmm..., followed the existing code style and checkpatch.pl didn't 
complain about the comma.

>
>>   };
>>     static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct 
>> mlx4_icm_chunk *chunk)
>> @@ -400,7 +400,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, 
>> struct mlx4_icm_table *table,
>>       obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
>>       num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
>>   -    table->icm      = kcalloc(num_icm, sizeof(*table->icm), 
>> GFP_KERNEL);
>> +    table->icm      = vzalloc(num_icm * sizeof(*table->icm));
>
> Why not kvzalloc ?

I think table->icm pointer array doesn't really need physically 
contiguous memory. Sometimes high order
memory allocation by kmalloc variants may trigger slow path and cause 
tasks to be blocked.

Thanks,
Qing

>
>>       if (!table->icm)
>>           return -ENOMEM;
>>       table->virt     = virt;
>> @@ -446,7 +446,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, 
>> struct mlx4_icm_table *table,
>>               mlx4_free_icm(dev, table->icm[i], use_coherent);
>>           }
>>   -    kfree(table->icm);
>> +    vfree(table->icm);
>>         return -ENOMEM;
>>   }
>> @@ -462,5 +462,5 @@ void mlx4_cleanup_icm_table(struct mlx4_dev *dev, 
>> struct mlx4_icm_table *table)
>>               mlx4_free_icm(dev, table->icm[i], table->coherent);
>>           }
>>   -    kfree(table->icm);
>> +    vfree(table->icm);
>>   }
>>
>
> Thanks for your patch.
>
> I need to verify there is no dramatic performance degradation here.
> You can prepare and send a v3 in the meanwhile.
>
> Thanks,
> Tariq
> -- 
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 03/14] net: sched: add 'delete' function to action ops
From: Jiri Pirko @ 2018-05-14 16:30 UTC (permalink / raw)
  To: Vlad Buslov
  Cc: netdev, davem, jhs, xiyou.wangcong, pablo, kadlec, fw, ast,
	daniel, edumazet, keescook, linux-kernel, netfilter-devel,
	coreteam, kliteyn
In-Reply-To: <20180514151222.GC1848@nanopsycho>

Mon, May 14, 2018 at 05:12:22PM CEST, jiri@resnulli.us wrote:
>Mon, May 14, 2018 at 04:27:04PM CEST, vladbu@mellanox.com wrote:
>>Extend action ops with 'delete' function. Each action type to implement its
>>own delete function that doesn't depend on rtnl lock.
>>
>>Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
>>---
>> include/net/act_api.h | 1 +
>> 1 file changed, 1 insertion(+)
>>
>>diff --git a/include/net/act_api.h b/include/net/act_api.h
>>index e634014..73175a3 100644
>>--- a/include/net/act_api.h
>>+++ b/include/net/act_api.h
>>@@ -100,6 +100,7 @@ struct tc_action_ops {
>> 	void	(*stats_update)(struct tc_action *, u64, u32, u64);
>> 	size_t  (*get_fill_size)(const struct tc_action *act);
>> 	struct net_device *(*get_dev)(const struct tc_action *a);
>>+	int     (*delete)(struct net *net, u32 index);
>
>Probably better to squash this to patch 14.

Oh, I see you call it in patch 6. Fine.

^ permalink raw reply

* Re: [PATCH 06/14] net: sched: implement reference counted action release
From: Jiri Pirko @ 2018-05-14 16:28 UTC (permalink / raw)
  To: Vlad Buslov
  Cc: netdev, davem, jhs, xiyou.wangcong, pablo, kadlec, fw, ast,
	daniel, edumazet, keescook, linux-kernel, netfilter-devel,
	coreteam, kliteyn
In-Reply-To: <1526308035-12484-7-git-send-email-vladbu@mellanox.com>

Mon, May 14, 2018 at 04:27:07PM CEST, vladbu@mellanox.com wrote:
>Implement helper function to delete action using new action ops delete
>function implemented by each action for lockless execution.

Reading this sentense for 4 times. I still don't understand what you say :(

>
>Implement action put function that releases reference to action and frees

When you write "Implement action put function" it is cryptic. Just
say "Implement function __tcf_action_put()".


>it if necessary. Refactor action deletion code to use new put function and
>not to rely on rtnl lock. Remove rtnl lock assertions that are no longer
>needed.

[...]

^ permalink raw reply

* Re: [PATCH 05/14] net: sched: always take reference to action
From: Jiri Pirko @ 2018-05-14 16:23 UTC (permalink / raw)
  To: Vlad Buslov
  Cc: netdev, davem, jhs, xiyou.wangcong, pablo, kadlec, fw, ast,
	daniel, edumazet, keescook, linux-kernel, netfilter-devel,
	coreteam, kliteyn
In-Reply-To: <1526308035-12484-6-git-send-email-vladbu@mellanox.com>

Mon, May 14, 2018 at 04:27:06PM CEST, vladbu@mellanox.com wrote:
>Without rtnl lock protection it is no longer safe to use pointer to tc
>action without holding reference to it. (it can be destroyed concurrently)
>
>Remove unsafe action idr lookup function. Instead of it, implement safe tcf
>idr check function that atomically looks up action in idr and increments
>its reference and bind counters.
>
>Implement both action search and check using new safe function.
>
>Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
>---
> net/sched/act_api.c | 38 ++++++++++++++++----------------------
> 1 file changed, 16 insertions(+), 22 deletions(-)
>
>diff --git a/net/sched/act_api.c b/net/sched/act_api.c
>index 1331beb..9459cce 100644
>--- a/net/sched/act_api.c
>+++ b/net/sched/act_api.c
>@@ -284,44 +284,38 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
> }
> EXPORT_SYMBOL(tcf_generic_walker);
> 
>-static struct tc_action *tcf_idr_lookup(u32 index, struct tcf_idrinfo *idrinfo)
>+bool __tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a,
>+		     int bind)
> {
>-	struct tc_action *p = NULL;
>+	struct tcf_idrinfo *idrinfo = tn->idrinfo;
>+	struct tc_action *p;
> 
> 	spin_lock_bh(&idrinfo->lock);

Why "_bh" variant is necessary here?

> 	p = idr_find(&idrinfo->action_idr, index);
>+	if (p) {
>+		refcount_inc(&p->tcfa_refcnt);
>+		if (bind)
>+			atomic_inc(&p->tcfa_bindcnt);
>+	}
> 	spin_unlock_bh(&idrinfo->lock);

[...]

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox