Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next v2 10/10] sctp: introduce round robin stream scheduler
From: Marcelo Ricardo Leitner @ 2017-10-03 22:20 UTC (permalink / raw)
  To: netdev; +Cc: linux-sctp, Neil Horman, Vlad Yasevich, Xin Long, David Laight
In-Reply-To: <cover.1507069005.git.marcelo.leitner@gmail.com>

This patch introduces RFC Draft ndata section 3.2 Round Robin
Scheduler (SCTP_SS_RR).

Works by maintaining a list of enqueued streams and tracking the last
one used to send data. When the datamsg is done, it switches to the next
stream.

See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13
Tested-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
---
 include/net/sctp/structs.h |  11 +++
 include/uapi/linux/sctp.h  |   3 +-
 net/sctp/Makefile          |   3 +-
 net/sctp/stream_sched.c    |   2 +
 net/sctp/stream_sched_rr.c | 201 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 218 insertions(+), 2 deletions(-)
 create mode 100644 net/sctp/stream_sched_rr.c

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 40eb8d66a37c3ecee39141dc111663f7aac7326a..16f949eef52fdfd7c90fa15b44093334d1355aaf 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1348,6 +1348,10 @@ struct sctp_stream_out_ext {
 			struct list_head prio_list;
 			struct sctp_stream_priorities *prio_head;
 		};
+		/* Fields used by RR scheduler */
+		struct {
+			struct list_head rr_list;
+		};
 	};
 };
 
@@ -1374,6 +1378,13 @@ struct sctp_stream {
 			/* List of priorities scheduled */
 			struct list_head prio_list;
 		};
+		/* Fields used by RR scheduler */
+		struct {
+			/* List of streams scheduled */
+			struct list_head rr_list;
+			/* The next stream in line */
+			struct sctp_stream_out_ext *rr_next;
+		};
 	};
 };
 
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 850fa8b29d7e8163dc4ee88af192309bb2535ae9..6cd7d416ca406e59d3214976fc425bb805f5c6cc 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -1100,7 +1100,8 @@ struct sctp_add_streams {
 enum sctp_sched_type {
 	SCTP_SS_FCFS,
 	SCTP_SS_PRIO,
-	SCTP_SS_MAX = SCTP_SS_PRIO
+	SCTP_SS_RR,
+	SCTP_SS_MAX = SCTP_SS_RR
 };
 
 #endif /* _UAPI_SCTP_H */
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
index 647c9cfd4e95be4429d25792e5832d7be2efc5c8..bf90c53977190ff563c2b43af31afb7c431d4534 100644
--- a/net/sctp/Makefile
+++ b/net/sctp/Makefile
@@ -12,7 +12,8 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
 	  inqueue.o outqueue.o ulpqueue.o \
 	  tsnmap.o bind_addr.o socket.o primitive.o \
 	  output.o input.o debug.o stream.o auth.o \
-	  offload.o stream_sched.o stream_sched_prio.o
+	  offload.o stream_sched.o stream_sched_prio.o \
+	  stream_sched_rr.o
 
 sctp_probe-y := probe.o
 
diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c
index 115ddb7651695cca7417cb63004a1a59c93523b8..03513a9fa110b5317af4502f98ab37702c1eddb9 100644
--- a/net/sctp/stream_sched.c
+++ b/net/sctp/stream_sched.c
@@ -122,10 +122,12 @@ static struct sctp_sched_ops sctp_sched_fcfs = {
 /* API to other parts of the stack */
 
 extern struct sctp_sched_ops sctp_sched_prio;
+extern struct sctp_sched_ops sctp_sched_rr;
 
 struct sctp_sched_ops *sctp_sched_ops[] = {
 	&sctp_sched_fcfs,
 	&sctp_sched_prio,
+	&sctp_sched_rr,
 };
 
 int sctp_sched_set_sched(struct sctp_association *asoc,
diff --git a/net/sctp/stream_sched_rr.c b/net/sctp/stream_sched_rr.c
new file mode 100644
index 0000000000000000000000000000000000000000..7612a438c5b939ae1c26c4acc06902749b601524
--- /dev/null
+++ b/net/sctp/stream_sched_rr.c
@@ -0,0 +1,201 @@
+/* SCTP kernel implementation
+ * (C) Copyright Red Hat Inc. 2017
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions manipulate sctp stream queue/scheduling.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ *    Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+ */
+
+#include <linux/list.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+#include <net/sctp/stream_sched.h>
+
+/* Round robin handling
+ * RFC DRAFT ndata section 3.2
+ */
+static void sctp_sched_rr_unsched_all(struct sctp_stream *stream);
+
+static void sctp_sched_rr_next_stream(struct sctp_stream *stream)
+{
+	struct list_head *pos;
+
+	pos = stream->rr_next->rr_list.next;
+	if (pos == &stream->rr_list)
+		pos = pos->next;
+	stream->rr_next = list_entry(pos, struct sctp_stream_out_ext, rr_list);
+}
+
+static void sctp_sched_rr_unsched(struct sctp_stream *stream,
+				  struct sctp_stream_out_ext *soute)
+{
+	if (stream->rr_next == soute)
+		/* Try to move to the next stream */
+		sctp_sched_rr_next_stream(stream);
+
+	list_del_init(&soute->rr_list);
+
+	/* If we have no other stream queued, clear next */
+	if (list_empty(&stream->rr_list))
+		stream->rr_next = NULL;
+}
+
+static void sctp_sched_rr_sched(struct sctp_stream *stream,
+				struct sctp_stream_out_ext *soute)
+{
+	if (!list_empty(&soute->rr_list))
+		/* Already scheduled. */
+		return;
+
+	/* Schedule the stream */
+	list_add_tail(&soute->rr_list, &stream->rr_list);
+
+	if (!stream->rr_next)
+		stream->rr_next = soute;
+}
+
+static int sctp_sched_rr_set(struct sctp_stream *stream, __u16 sid,
+			     __u16 prio, gfp_t gfp)
+{
+	return 0;
+}
+
+static int sctp_sched_rr_get(struct sctp_stream *stream, __u16 sid,
+			     __u16 *value)
+{
+	return 0;
+}
+
+static int sctp_sched_rr_init(struct sctp_stream *stream)
+{
+	INIT_LIST_HEAD(&stream->rr_list);
+	stream->rr_next = NULL;
+
+	return 0;
+}
+
+static int sctp_sched_rr_init_sid(struct sctp_stream *stream, __u16 sid,
+				  gfp_t gfp)
+{
+	INIT_LIST_HEAD(&stream->out[sid].ext->rr_list);
+
+	return 0;
+}
+
+static void sctp_sched_rr_free(struct sctp_stream *stream)
+{
+	sctp_sched_rr_unsched_all(stream);
+}
+
+static void sctp_sched_rr_enqueue(struct sctp_outq *q,
+				  struct sctp_datamsg *msg)
+{
+	struct sctp_stream *stream;
+	struct sctp_chunk *ch;
+	__u16 sid;
+
+	ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list);
+	sid = sctp_chunk_stream_no(ch);
+	stream = &q->asoc->stream;
+	sctp_sched_rr_sched(stream, stream->out[sid].ext);
+}
+
+static struct sctp_chunk *sctp_sched_rr_dequeue(struct sctp_outq *q)
+{
+	struct sctp_stream *stream = &q->asoc->stream;
+	struct sctp_stream_out_ext *soute;
+	struct sctp_chunk *ch = NULL;
+
+	/* Bail out quickly if queue is empty */
+	if (list_empty(&q->out_chunk_list))
+		goto out;
+
+	/* Find which chunk is next */
+	if (stream->out_curr)
+		soute = stream->out_curr->ext;
+	else
+		soute = stream->rr_next;
+	ch = list_entry(soute->outq.next, struct sctp_chunk, stream_list);
+
+	sctp_sched_dequeue_common(q, ch);
+
+out:
+	return ch;
+}
+
+static void sctp_sched_rr_dequeue_done(struct sctp_outq *q,
+				       struct sctp_chunk *ch)
+{
+	struct sctp_stream_out_ext *soute;
+	__u16 sid;
+
+	/* Last chunk on that msg, move to the next stream */
+	sid = sctp_chunk_stream_no(ch);
+	soute = q->asoc->stream.out[sid].ext;
+
+	sctp_sched_rr_next_stream(&q->asoc->stream);
+
+	if (list_empty(&soute->outq))
+		sctp_sched_rr_unsched(&q->asoc->stream, soute);
+}
+
+static void sctp_sched_rr_sched_all(struct sctp_stream *stream)
+{
+	struct sctp_association *asoc;
+	struct sctp_stream_out_ext *soute;
+	struct sctp_chunk *ch;
+
+	asoc = container_of(stream, struct sctp_association, stream);
+	list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) {
+		__u16 sid;
+
+		sid = sctp_chunk_stream_no(ch);
+		soute = stream->out[sid].ext;
+		if (soute)
+			sctp_sched_rr_sched(stream, soute);
+	}
+}
+
+static void sctp_sched_rr_unsched_all(struct sctp_stream *stream)
+{
+	struct sctp_stream_out_ext *soute, *tmp;
+
+	list_for_each_entry_safe(soute, tmp, &stream->rr_list, rr_list)
+		sctp_sched_rr_unsched(stream, soute);
+}
+
+struct sctp_sched_ops sctp_sched_rr = {
+	.set = sctp_sched_rr_set,
+	.get = sctp_sched_rr_get,
+	.init = sctp_sched_rr_init,
+	.init_sid = sctp_sched_rr_init_sid,
+	.free = sctp_sched_rr_free,
+	.enqueue = sctp_sched_rr_enqueue,
+	.dequeue = sctp_sched_rr_dequeue,
+	.dequeue_done = sctp_sched_rr_dequeue_done,
+	.sched_all = sctp_sched_rr_sched_all,
+	.unsched_all = sctp_sched_rr_unsched_all,
+};
-- 
2.13.5

^ permalink raw reply related

* [PATCH net-next 0/3] A own subdirectory for shared TCP code
From: Richard Sailer @ 2017-10-03 22:22 UTC (permalink / raw)
  To: netdev, davem; +Cc: kuznet, yoshfuji

net/ipv4 currently contains around 100 source files containing the
IP implementation and lots of other functionality (UDP, TCP, xfrm, etc.)

# 1/3
To make the networking source tree more self-documenting and well
structured, 1/3 moves the 30 shared TCP source files to their own
subdirectory of net/ipv4/.

# 2/3 
A huge part of the TCP code consists of congestion control algorithms.
With the same goal as above, this moves the cc algos to their own
subdirectory of net/ipv4/tcp/.

# 3/3
Implements a suggestion by valdis: have a new directory
net/ipv4v6_shared that contains all the code shared between
IPv4 and IPv6. This creates net/ipv4v6_shared and moves the
net/ipv4/tcp directory of shared TCP code there.

Although each commit depends on the ones before it, it is
also possible and sensible to apply only (1) or (1, 2), depending
on how conservative one wants to be about directory
structure changes.

[PATCH net-next 1/3] net/ipv4: Move shared tcp code to own subdirectory
[PATCH net-next 2/3] tcp: Move cc algorithms to own subdirectory
[PATCH net-next 3/3] Move shared tcp code to net/ipv4v6shared

^ permalink raw reply

* [PATCH net-next 1/3] net/ipv4: Move shared tcp code to own subdirectory
From: Richard Sailer @ 2017-10-03 22:22 UTC (permalink / raw)
  To: netdev, davem; +Cc: kuznet, yoshfuji
In-Reply-To: <20171003222213.7996-1-richard_siegfried@systemli.org>

net/ipv4 contains around 100 source files containing the IP implementation
and various other functionality (UDP, TCP, xfrm, etc.). 30 of them make up
the TCP implementation.

I think moving the TCP implementation to its own subdirectory
makes the source tree easier to explore and better structured.

Note: This only affects the TCP Code shared by IPv4 and IPv6,
the IPv4 specific part (tcp_ipv4.c and tcp_offload.c) is left in
net/ipv4/.

This creates the directory, moves the files and updates the Makefiles
accordingly.

Signed-off-by: Richard Sailer <richard_siegfried@systemli.org>
---
 net/ipv4/Makefile                  | 25 ++-----------------------
 net/ipv4/tcp/Makefile              | 26 ++++++++++++++++++++++++++
 net/ipv4/{ => tcp}/tcp.c           |  0
 net/ipv4/{ => tcp}/tcp_bbr.c       |  0
 net/ipv4/{ => tcp}/tcp_bic.c       |  0
 net/ipv4/{ => tcp}/tcp_cdg.c       |  0
 net/ipv4/{ => tcp}/tcp_cong.c      |  0
 net/ipv4/{ => tcp}/tcp_cubic.c     |  0
 net/ipv4/{ => tcp}/tcp_dctcp.c     |  0
 net/ipv4/{ => tcp}/tcp_diag.c      |  0
 net/ipv4/{ => tcp}/tcp_fastopen.c  |  0
 net/ipv4/{ => tcp}/tcp_highspeed.c |  0
 net/ipv4/{ => tcp}/tcp_htcp.c      |  0
 net/ipv4/{ => tcp}/tcp_hybla.c     |  0
 net/ipv4/{ => tcp}/tcp_illinois.c  |  0
 net/ipv4/{ => tcp}/tcp_input.c     |  0
 net/ipv4/{ => tcp}/tcp_lp.c        |  0
 net/ipv4/{ => tcp}/tcp_metrics.c   |  0
 net/ipv4/{ => tcp}/tcp_minisocks.c |  0
 net/ipv4/{ => tcp}/tcp_nv.c        |  0
 net/ipv4/{ => tcp}/tcp_output.c    |  0
 net/ipv4/{ => tcp}/tcp_probe.c     |  0
 net/ipv4/{ => tcp}/tcp_rate.c      |  0
 net/ipv4/{ => tcp}/tcp_recovery.c  |  0
 net/ipv4/{ => tcp}/tcp_scalable.c  |  0
 net/ipv4/{ => tcp}/tcp_timer.c     |  0
 net/ipv4/{ => tcp}/tcp_ulp.c       |  0
 net/ipv4/{ => tcp}/tcp_vegas.c     |  0
 net/ipv4/{ => tcp}/tcp_vegas.h     |  0
 net/ipv4/{ => tcp}/tcp_veno.c      |  0
 net/ipv4/{ => tcp}/tcp_westwood.c  |  0
 net/ipv4/{ => tcp}/tcp_yeah.c      |  0
 32 files changed, 28 insertions(+), 23 deletions(-)
 create mode 100644 net/ipv4/tcp/Makefile
 rename net/ipv4/{ => tcp}/tcp.c (100%)
 rename net/ipv4/{ => tcp}/tcp_bbr.c (100%)
 rename net/ipv4/{ => tcp}/tcp_bic.c (100%)
 rename net/ipv4/{ => tcp}/tcp_cdg.c (100%)
 rename net/ipv4/{ => tcp}/tcp_cong.c (100%)
 rename net/ipv4/{ => tcp}/tcp_cubic.c (100%)
 rename net/ipv4/{ => tcp}/tcp_dctcp.c (100%)
 rename net/ipv4/{ => tcp}/tcp_diag.c (100%)
 rename net/ipv4/{ => tcp}/tcp_fastopen.c (100%)
 rename net/ipv4/{ => tcp}/tcp_highspeed.c (100%)
 rename net/ipv4/{ => tcp}/tcp_htcp.c (100%)
 rename net/ipv4/{ => tcp}/tcp_hybla.c (100%)
 rename net/ipv4/{ => tcp}/tcp_illinois.c (100%)
 rename net/ipv4/{ => tcp}/tcp_input.c (100%)
 rename net/ipv4/{ => tcp}/tcp_lp.c (100%)
 rename net/ipv4/{ => tcp}/tcp_metrics.c (100%)
 rename net/ipv4/{ => tcp}/tcp_minisocks.c (100%)
 rename net/ipv4/{ => tcp}/tcp_nv.c (100%)
 rename net/ipv4/{ => tcp}/tcp_output.c (100%)
 rename net/ipv4/{ => tcp}/tcp_probe.c (100%)
 rename net/ipv4/{ => tcp}/tcp_rate.c (100%)
 rename net/ipv4/{ => tcp}/tcp_recovery.c (100%)
 rename net/ipv4/{ => tcp}/tcp_scalable.c (100%)
 rename net/ipv4/{ => tcp}/tcp_timer.c (100%)
 rename net/ipv4/{ => tcp}/tcp_ulp.c (100%)
 rename net/ipv4/{ => tcp}/tcp_vegas.c (100%)
 rename net/ipv4/{ => tcp}/tcp_vegas.h (100%)
 rename net/ipv4/{ => tcp}/tcp_veno.c (100%)
 rename net/ipv4/{ => tcp}/tcp_westwood.c (100%)
 rename net/ipv4/{ => tcp}/tcp_yeah.c (100%)

diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index afcb435adfbe..a106a1cf0e11 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -2,14 +2,11 @@
 # Makefile for the Linux TCP/IP (INET) layer.
 #
 
-obj-y     := route.o inetpeer.o protocol.o \
+obj-y     := tcp/ route.o inetpeer.o protocol.o \
 	     ip_input.o ip_fragment.o ip_forward.o ip_options.o \
 	     ip_output.o ip_sockglue.o inet_hashtables.o \
 	     inet_timewait_sock.o inet_connection_sock.o \
-	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
-	     tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
-	     tcp_rate.o tcp_recovery.o tcp_ulp.o \
-	     tcp_offload.o datagram.o raw.o udp.o udplite.o \
+	     datagram.o raw.o udp.o udplite.o tcp_offload.o tcp_ipv4.o \
 	     udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
 	     fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
 	     inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o
@@ -39,26 +36,8 @@ obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
 obj-$(CONFIG_IP_PNP) += ipconfig.o
 obj-$(CONFIG_NETFILTER)	+= netfilter.o netfilter/
 obj-$(CONFIG_INET_DIAG) += inet_diag.o 
-obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
 obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
 obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
-obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
-obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
-obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
-obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
-obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
-obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o
-obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
-obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
-obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
-obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
-obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
-obj-$(CONFIG_TCP_CONG_NV) += tcp_nv.o
-obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
-obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
-obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
-obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
-obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/tcp/Makefile b/net/ipv4/tcp/Makefile
new file mode 100644
index 000000000000..91d2c991a243
--- /dev/null
+++ b/net/ipv4/tcp/Makefile
@@ -0,0 +1,26 @@
+#
+# Makefile for Linux TCP
+#
+
+obj-y     := tcp.o tcp_input.o tcp_output.o tcp_timer.o \
+	     tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
+	     tcp_rate.o tcp_recovery.o tcp_ulp.o
+
+obj-$(CONFIG_INET_TCP_DIAG)     +=  tcp_diag.o
+obj-$(CONFIG_NET_TCPPROBE)      +=  tcp_probe.o
+obj-$(CONFIG_TCP_CONG_BBR)      +=  tcp_bbr.o
+obj-$(CONFIG_TCP_CONG_BIC)      +=  tcp_bic.o
+obj-$(CONFIG_TCP_CONG_CDG)      +=  tcp_cdg.o
+obj-$(CONFIG_TCP_CONG_CUBIC)    +=  tcp_cubic.o
+obj-$(CONFIG_TCP_CONG_DCTCP)    +=  tcp_dctcp.o
+obj-$(CONFIG_TCP_CONG_WESTWOOD) +=  tcp_westwood.o
+obj-$(CONFIG_TCP_CONG_HSTCP)    +=  tcp_highspeed.o
+obj-$(CONFIG_TCP_CONG_HYBLA)    +=  tcp_hybla.o
+obj-$(CONFIG_TCP_CONG_HTCP)     +=  tcp_htcp.o
+obj-$(CONFIG_TCP_CONG_VEGAS)    +=  tcp_vegas.o
+obj-$(CONFIG_TCP_CONG_NV)       +=  tcp_nv.o
+obj-$(CONFIG_TCP_CONG_VENO)     +=  tcp_veno.o
+obj-$(CONFIG_TCP_CONG_SCALABLE) +=  tcp_scalable.o
+obj-$(CONFIG_TCP_CONG_LP)       +=  tcp_lp.o
+obj-$(CONFIG_TCP_CONG_YEAH)     +=  tcp_yeah.o
+obj-$(CONFIG_TCP_CONG_ILLINOIS) +=  tcp_illinois.o
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp/tcp.c
similarity index 100%
rename from net/ipv4/tcp.c
rename to net/ipv4/tcp/tcp.c
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp/tcp_bbr.c
similarity index 100%
rename from net/ipv4/tcp_bbr.c
rename to net/ipv4/tcp/tcp_bbr.c
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp/tcp_bic.c
similarity index 100%
rename from net/ipv4/tcp_bic.c
rename to net/ipv4/tcp/tcp_bic.c
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp/tcp_cdg.c
similarity index 100%
rename from net/ipv4/tcp_cdg.c
rename to net/ipv4/tcp/tcp_cdg.c
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp/tcp_cong.c
similarity index 100%
rename from net/ipv4/tcp_cong.c
rename to net/ipv4/tcp/tcp_cong.c
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp/tcp_cubic.c
similarity index 100%
rename from net/ipv4/tcp_cubic.c
rename to net/ipv4/tcp/tcp_cubic.c
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp/tcp_dctcp.c
similarity index 100%
rename from net/ipv4/tcp_dctcp.c
rename to net/ipv4/tcp/tcp_dctcp.c
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp/tcp_diag.c
similarity index 100%
rename from net/ipv4/tcp_diag.c
rename to net/ipv4/tcp/tcp_diag.c
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp/tcp_fastopen.c
similarity index 100%
rename from net/ipv4/tcp_fastopen.c
rename to net/ipv4/tcp/tcp_fastopen.c
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp/tcp_highspeed.c
similarity index 100%
rename from net/ipv4/tcp_highspeed.c
rename to net/ipv4/tcp/tcp_highspeed.c
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp/tcp_htcp.c
similarity index 100%
rename from net/ipv4/tcp_htcp.c
rename to net/ipv4/tcp/tcp_htcp.c
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp/tcp_hybla.c
similarity index 100%
rename from net/ipv4/tcp_hybla.c
rename to net/ipv4/tcp/tcp_hybla.c
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp/tcp_illinois.c
similarity index 100%
rename from net/ipv4/tcp_illinois.c
rename to net/ipv4/tcp/tcp_illinois.c
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp/tcp_input.c
similarity index 100%
rename from net/ipv4/tcp_input.c
rename to net/ipv4/tcp/tcp_input.c
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp/tcp_lp.c
similarity index 100%
rename from net/ipv4/tcp_lp.c
rename to net/ipv4/tcp/tcp_lp.c
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp/tcp_metrics.c
similarity index 100%
rename from net/ipv4/tcp_metrics.c
rename to net/ipv4/tcp/tcp_metrics.c
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp/tcp_minisocks.c
similarity index 100%
rename from net/ipv4/tcp_minisocks.c
rename to net/ipv4/tcp/tcp_minisocks.c
diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp/tcp_nv.c
similarity index 100%
rename from net/ipv4/tcp_nv.c
rename to net/ipv4/tcp/tcp_nv.c
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp/tcp_output.c
similarity index 100%
rename from net/ipv4/tcp_output.c
rename to net/ipv4/tcp/tcp_output.c
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp/tcp_probe.c
similarity index 100%
rename from net/ipv4/tcp_probe.c
rename to net/ipv4/tcp/tcp_probe.c
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp/tcp_rate.c
similarity index 100%
rename from net/ipv4/tcp_rate.c
rename to net/ipv4/tcp/tcp_rate.c
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp/tcp_recovery.c
similarity index 100%
rename from net/ipv4/tcp_recovery.c
rename to net/ipv4/tcp/tcp_recovery.c
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp/tcp_scalable.c
similarity index 100%
rename from net/ipv4/tcp_scalable.c
rename to net/ipv4/tcp/tcp_scalable.c
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp/tcp_timer.c
similarity index 100%
rename from net/ipv4/tcp_timer.c
rename to net/ipv4/tcp/tcp_timer.c
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp/tcp_ulp.c
similarity index 100%
rename from net/ipv4/tcp_ulp.c
rename to net/ipv4/tcp/tcp_ulp.c
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp/tcp_vegas.c
similarity index 100%
rename from net/ipv4/tcp_vegas.c
rename to net/ipv4/tcp/tcp_vegas.c
diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp/tcp_vegas.h
similarity index 100%
rename from net/ipv4/tcp_vegas.h
rename to net/ipv4/tcp/tcp_vegas.h
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp/tcp_veno.c
similarity index 100%
rename from net/ipv4/tcp_veno.c
rename to net/ipv4/tcp/tcp_veno.c
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp/tcp_westwood.c
similarity index 100%
rename from net/ipv4/tcp_westwood.c
rename to net/ipv4/tcp/tcp_westwood.c
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp/tcp_yeah.c
similarity index 100%
rename from net/ipv4/tcp_yeah.c
rename to net/ipv4/tcp/tcp_yeah.c
-- 
2.14.1

^ permalink raw reply related

* [PATCH net-next 3/3] Move shared tcp code to net/ipv4v6shared
From: Richard Sailer @ 2017-10-03 22:22 UTC (permalink / raw)
  To: netdev, davem; +Cc: kuznet, yoshfuji
In-Reply-To: <20171003222213.7996-1-richard_siegfried@systemli.org>

Currently a lot of code shared between IPv4 and IPv6 resides in net/ipv4.

In an attempt to make things more modular and encapsulated, and the source
tree more self-documenting, this commit:

  * introduces net/ipv4v6shared
  * moves the shared tcp code there
  * updates the makefiles accordingly
  * creates a new Kconfig file for the TCP code
  * updates all other Kconfig files accordingly

Of course I left the ipv4 specific parts of tcp (tcp_ipv4.c and tcp_offload.c)
in net/ipv4.

Thanks to valdis from #kernelnewbies for suggesting this while I was already
isolating the TCP code.

Signed-off-by: Richard Sailer <richard_siegfried@systemli.org>
---
 net/Kconfig                                        |   1 +
 net/Makefile                                       |   6 +-
 net/ipv4/Kconfig                                   | 321 --------------------
 net/ipv4/Makefile                                  |   4 +-
 net/ipv4v6_shared/Makefile                         |   7 +
 net/ipv4v6_shared/tcp/Kconfig                      | 324 +++++++++++++++++++++
 net/{ipv4 => ipv4v6_shared}/tcp/Makefile           |   0
 net/{ipv4 => ipv4v6_shared}/tcp/cc_algos/Makefile  |   0
 net/{ipv4 => ipv4v6_shared}/tcp/cc_algos/tcp_bbr.c |   0
 net/{ipv4 => ipv4v6_shared}/tcp/cc_algos/tcp_bic.c |   0
 net/{ipv4 => ipv4v6_shared}/tcp/cc_algos/tcp_cdg.c |   0
 .../tcp/cc_algos/tcp_cubic.c                       |   0
 .../tcp/cc_algos/tcp_dctcp.c                       |   0
 .../tcp/cc_algos/tcp_highspeed.c                   |   0
 .../tcp/cc_algos/tcp_htcp.c                        |   0
 .../tcp/cc_algos/tcp_hybla.c                       |   0
 .../tcp/cc_algos/tcp_illinois.c                    |   0
 net/{ipv4 => ipv4v6_shared}/tcp/cc_algos/tcp_lp.c  |   0
 net/{ipv4 => ipv4v6_shared}/tcp/cc_algos/tcp_nv.c  |   0
 .../tcp/cc_algos/tcp_scalable.c                    |   0
 .../tcp/cc_algos/tcp_vegas.c                       |   0
 .../tcp/cc_algos/tcp_vegas.h                       |   0
 .../tcp/cc_algos/tcp_veno.c                        |   0
 .../tcp/cc_algos/tcp_westwood.c                    |   0
 .../tcp/cc_algos/tcp_yeah.c                        |   0
 net/{ipv4 => ipv4v6_shared}/tcp/tcp.c              |   0
 net/{ipv4 => ipv4v6_shared}/tcp/tcp_cong.c         |   0
 net/{ipv4 => ipv4v6_shared}/tcp/tcp_diag.c         |   0
 net/{ipv4 => ipv4v6_shared}/tcp/tcp_fastopen.c     |   0
 net/{ipv4 => ipv4v6_shared}/tcp/tcp_input.c        |   0
 net/{ipv4 => ipv4v6_shared}/tcp/tcp_metrics.c      |   0
 net/{ipv4 => ipv4v6_shared}/tcp/tcp_minisocks.c    |   0
 net/{ipv4 => ipv4v6_shared}/tcp/tcp_output.c       |   0
 net/{ipv4 => ipv4v6_shared}/tcp/tcp_probe.c        |   0
 net/{ipv4 => ipv4v6_shared}/tcp/tcp_rate.c         |   0
 net/{ipv4 => ipv4v6_shared}/tcp/tcp_recovery.c     |   0
 net/{ipv4 => ipv4v6_shared}/tcp/tcp_timer.c        |   0
 net/{ipv4 => ipv4v6_shared}/tcp/tcp_ulp.c          |   0
 38 files changed, 339 insertions(+), 324 deletions(-)
 create mode 100644 net/ipv4v6_shared/Makefile
 create mode 100644 net/ipv4v6_shared/tcp/Kconfig
 rename net/{ipv4 => ipv4v6_shared}/tcp/Makefile (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/cc_algos/Makefile (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/cc_algos/tcp_bbr.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/cc_algos/tcp_bic.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/cc_algos/tcp_cdg.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/cc_algos/tcp_cubic.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/cc_algos/tcp_dctcp.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/cc_algos/tcp_highspeed.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/cc_algos/tcp_htcp.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/cc_algos/tcp_hybla.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/cc_algos/tcp_illinois.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/cc_algos/tcp_lp.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/cc_algos/tcp_nv.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/cc_algos/tcp_scalable.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/cc_algos/tcp_vegas.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/cc_algos/tcp_vegas.h (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/cc_algos/tcp_veno.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/cc_algos/tcp_westwood.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/cc_algos/tcp_yeah.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/tcp.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/tcp_cong.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/tcp_diag.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/tcp_fastopen.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/tcp_input.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/tcp_metrics.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/tcp_minisocks.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/tcp_output.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/tcp_probe.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/tcp_rate.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/tcp_recovery.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/tcp_timer.c (100%)
 rename net/{ipv4 => ipv4v6_shared}/tcp/tcp_ulp.c (100%)

diff --git a/net/Kconfig b/net/Kconfig
index 9dba2715919d..caa1c51c577d 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -86,6 +86,7 @@ config INET
 
 if INET
 source "net/ipv4/Kconfig"
+source "net/ipv4v6_shared/tcp/Kconfig"
 source "net/ipv6/Kconfig"
 source "net/netlabel/Kconfig"
 
diff --git a/net/Makefile b/net/Makefile
index ae2fe2283d2f..b089a88d3d5d 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -14,7 +14,11 @@ obj-$(CONFIG_NET)		+= $(tmp-y)
 obj-$(CONFIG_LLC)		+= llc/
 obj-$(CONFIG_NET)		+= ethernet/ 802/ sched/ netlink/ bpf/
 obj-$(CONFIG_NETFILTER)		+= netfilter/
-obj-$(CONFIG_INET)		+= ipv4/
+
+# IPv6 depends on CONFIG_INET anyways, so this also includes the shared code
+# for IPv6 when needed.
+obj-$(CONFIG_INET)		+= ipv4/ ipv4v6_shared/
+
 obj-$(CONFIG_TLS)		+= tls/
 obj-$(CONFIG_XFRM)		+= xfrm/
 obj-$(CONFIG_UNIX)		+= unix/
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 91a2557942fa..aa952b0c6a8c 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -261,42 +261,6 @@ config IP_PIMSM_V2
 	  gated-5). This routing protocol is not used widely, so say N unless
 	  you want to play with it.
 
-config SYN_COOKIES
-	bool "IP: TCP syncookie support"
-	---help---
-	  Normal TCP/IP networking is open to an attack known as "SYN
-	  flooding". This denial-of-service attack prevents legitimate remote
-	  users from being able to connect to your computer during an ongoing
-	  attack and requires very little work from the attacker, who can
-	  operate from anywhere on the Internet.
-
-	  SYN cookies provide protection against this type of attack. If you
-	  say Y here, the TCP/IP stack will use a cryptographic challenge
-	  protocol known as "SYN cookies" to enable legitimate users to
-	  continue to connect, even when your machine is under attack. There
-	  is no need for the legitimate users to change their TCP/IP software;
-	  SYN cookies work transparently to them. For technical information
-	  about SYN cookies, check out <http://cr.yp.to/syncookies.html>.
-
-	  If you are SYN flooded, the source address reported by the kernel is
-	  likely to have been forged by the attacker; it is only reported as
-	  an aid in tracing the packets to their actual source and should not
-	  be taken as absolute truth.
-
-	  SYN cookies may prevent correct error reporting on clients when the
-	  server is really overloaded. If this happens frequently better turn
-	  them off.
-
-	  If you say Y here, you can disable SYN cookies at run time by
-	  saying Y to "/proc file system support" and
-	  "Sysctl support" below and executing the command
-
-	  echo 0 > /proc/sys/net/ipv4/tcp_syncookies
-
-	  after the /proc file system has been mounted.
-
-	  If unsure, say N.
-
 config NET_IPVTI
 	tristate "Virtual (secure) IP: tunneling"
 	select INET_TUNNEL
@@ -465,288 +429,3 @@ config INET_DIAG_DESTROY
 	  had been disconnected.
 	  If unsure, say N.
 
-menuconfig TCP_CONG_ADVANCED
-	bool "TCP: advanced congestion control"
-	---help---
-	  Support for selection of various TCP congestion control
-	  modules.
-
-	  Nearly all users can safely say no here, and a safe default
-	  selection will be made (CUBIC with new Reno as a fallback).
-
-	  If unsure, say N.
-
-if TCP_CONG_ADVANCED
-
-config TCP_CONG_BIC
-	tristate "Binary Increase Congestion (BIC) control"
-	default m
-	---help---
-	BIC-TCP is a sender-side only change that ensures a linear RTT
-	fairness under large windows while offering both scalability and
-	bounded TCP-friendliness. The protocol combines two schemes
-	called additive increase and binary search increase. When the
-	congestion window is large, additive increase with a large
-	increment ensures linear RTT fairness as well as good
-	scalability. Under small congestion windows, binary search
-	increase provides TCP friendliness.
-	See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
-
-config TCP_CONG_CUBIC
-	tristate "CUBIC TCP"
-	default y
-	---help---
-	This is version 2.0 of BIC-TCP which uses a cubic growth function
-	among other techniques.
-	See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
-
-config TCP_CONG_WESTWOOD
-	tristate "TCP Westwood+"
-	default m
-	---help---
-	TCP Westwood+ is a sender-side only modification of the TCP Reno
-	protocol stack that optimizes the performance of TCP congestion
-	control. It is based on end-to-end bandwidth estimation to set
-	congestion window and slow start threshold after a congestion
-	episode. Using this estimation, TCP Westwood+ adaptively sets a
-	slow start threshold and a congestion window which takes into
-	account the bandwidth used  at the time congestion is experienced.
-	TCP Westwood+ significantly increases fairness wrt TCP Reno in
-	wired networks and throughput over wireless links.
-
-config TCP_CONG_HTCP
-        tristate "H-TCP"
-        default m
-	---help---
-	H-TCP is a send-side only modifications of the TCP Reno
-	protocol stack that optimizes the performance of TCP
-	congestion control for high speed network links. It uses a
-	modeswitch to change the alpha and beta parameters of TCP Reno
-	based on network conditions and in a way so as to be fair with
-	other Reno and H-TCP flows.
-
-config TCP_CONG_HSTCP
-	tristate "High Speed TCP"
-	default n
-	---help---
-	Sally Floyd's High Speed TCP (RFC 3649) congestion control.
-	A modification to TCP's congestion control mechanism for use
-	with large congestion windows. A table indicates how much to
-	increase the congestion window by when an ACK is received.
- 	For more detail	see http://www.icir.org/floyd/hstcp.html
-
-config TCP_CONG_HYBLA
-	tristate "TCP-Hybla congestion control algorithm"
-	default n
-	---help---
-	TCP-Hybla is a sender-side only change that eliminates penalization of
-	long-RTT, large-bandwidth connections, like when satellite legs are
-	involved, especially when sharing a common bottleneck with normal
-	terrestrial connections.
-
-config TCP_CONG_VEGAS
-	tristate "TCP Vegas"
-	default n
-	---help---
-	TCP Vegas is a sender-side only change to TCP that anticipates
-	the onset of congestion by estimating the bandwidth. TCP Vegas
-	adjusts the sending rate by modifying the congestion
-	window. TCP Vegas should provide less packet loss, but it is
-	not as aggressive as TCP Reno.
-
-config TCP_CONG_NV
-       tristate "TCP NV"
-       default n
-       ---help---
-       TCP NV is a follow up to TCP Vegas. It has been modified to deal with
-       10G networks, measurement noise introduced by LRO, GRO and interrupt
-       coalescence. In addition, it will decrease its cwnd multiplicatively
-       instead of linearly.
-
-       Note that in general congestion avoidance (cwnd decreased when # packets
-       queued grows) cannot coexist with congestion control (cwnd decreased only
-       when there is packet loss) due to fairness issues. One scenario when they
-       can coexist safely is when the CA flows have RTTs << CC flows RTTs.
-
-       For further details see http://www.brakmo.org/networking/tcp-nv/
-
-config TCP_CONG_SCALABLE
-	tristate "Scalable TCP"
-	default n
-	---help---
-	Scalable TCP is a sender-side only change to TCP which uses a
-	MIMD congestion control algorithm which has some nice scaling
-	properties, though is known to have fairness issues.
-	See http://www.deneholme.net/tom/scalable/
-
-config TCP_CONG_LP
-	tristate "TCP Low Priority"
-	default n
-	---help---
-	TCP Low Priority (TCP-LP), a distributed algorithm whose goal is
-	to utilize only the excess network bandwidth as compared to the
-	``fair share`` of bandwidth as targeted by TCP.
-	See http://www-ece.rice.edu/networks/TCP-LP/
-
-config TCP_CONG_VENO
-	tristate "TCP Veno"
-	default n
-	---help---
-	TCP Veno is a sender-side only enhancement of TCP to obtain better
-	throughput over wireless networks. TCP Veno makes use of state
-	distinguishing to circumvent the difficult judgment of the packet loss
-	type. TCP Veno cuts down less congestion window in response to random
-	loss packets.
-	See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186> 
-
-config TCP_CONG_YEAH
-	tristate "YeAH TCP"
-	select TCP_CONG_VEGAS
-	default n
-	---help---
-	YeAH-TCP is a sender-side high-speed enabled TCP congestion control
-	algorithm, which uses a mixed loss/delay approach to compute the
-	congestion window. It's design goals target high efficiency,
-	internal, RTT and Reno fairness, resilience to link loss while
-	keeping network elements load as low as possible.
-
-	For further details look here:
-	  http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
-
-config TCP_CONG_ILLINOIS
-	tristate "TCP Illinois"
-	default n
-	---help---
-	TCP-Illinois is a sender-side modification of TCP Reno for
-	high speed long delay links. It uses round-trip-time to
-	adjust the alpha and beta parameters to achieve a higher average
-	throughput and maintain fairness.
-
-	For further details see:
-	  http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
-
-config TCP_CONG_DCTCP
-	tristate "DataCenter TCP (DCTCP)"
-	default n
-	---help---
-	DCTCP leverages Explicit Congestion Notification (ECN) in the network to
-	provide multi-bit feedback to the end hosts. It is designed to provide:
-
-	- High burst tolerance (incast due to partition/aggregate),
-	- Low latency (short flows, queries),
-	- High throughput (continuous data updates, large file transfers) with
-	  commodity, shallow-buffered switches.
-
-	All switches in the data center network running DCTCP must support
-	ECN marking and be configured for marking when reaching defined switch
-	buffer thresholds. The default ECN marking threshold heuristic for
-	DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets
-	(~100KB) at 10Gbps, but might need further careful tweaking.
-
-	For further details see:
-	  http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
-
-config TCP_CONG_CDG
-	tristate "CAIA Delay-Gradient (CDG)"
-	default n
-	---help---
-	CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies
-	the TCP sender in order to:
-
-	  o Use the delay gradient as a congestion signal.
-	  o Back off with an average probability that is independent of the RTT.
-	  o Coexist with flows that use loss-based congestion control.
-	  o Tolerate packet loss unrelated to congestion.
-
-	For further details see:
-	  D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
-	  delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg
-
-config TCP_CONG_BBR
-	tristate "BBR TCP"
-	default n
-	---help---
-
-	BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
-	maximize network utilization and minimize queues. It builds an explicit
-	model of the the bottleneck delivery rate and path round-trip
-	propagation delay. It tolerates packet loss and delay unrelated to
-	congestion. It can operate over LAN, WAN, cellular, wifi, or cable
-	modem links. It can coexist with flows that use loss-based congestion
-	control, and can operate with shallow buffers, deep buffers,
-	bufferbloat, policers, or AQM schemes that do not provide a delay
-	signal. It requires the fq ("Fair Queue") pacing packet scheduler.
-
-choice
-	prompt "Default TCP congestion control"
-	default DEFAULT_CUBIC
-	help
-	  Select the TCP congestion control that will be used by default
-	  for all connections.
-
-	config DEFAULT_BIC
-		bool "Bic" if TCP_CONG_BIC=y
-
-	config DEFAULT_CUBIC
-		bool "Cubic" if TCP_CONG_CUBIC=y
-
-	config DEFAULT_HTCP
-		bool "Htcp" if TCP_CONG_HTCP=y
-
-	config DEFAULT_HYBLA
-		bool "Hybla" if TCP_CONG_HYBLA=y
-
-	config DEFAULT_VEGAS
-		bool "Vegas" if TCP_CONG_VEGAS=y
-
-	config DEFAULT_VENO
-		bool "Veno" if TCP_CONG_VENO=y
-
-	config DEFAULT_WESTWOOD
-		bool "Westwood" if TCP_CONG_WESTWOOD=y
-
-	config DEFAULT_DCTCP
-		bool "DCTCP" if TCP_CONG_DCTCP=y
-
-	config DEFAULT_CDG
-		bool "CDG" if TCP_CONG_CDG=y
-
-	config DEFAULT_BBR
-		bool "BBR" if TCP_CONG_BBR=y
-
-	config DEFAULT_RENO
-		bool "Reno"
-endchoice
-
-endif
-
-config TCP_CONG_CUBIC
-	tristate
-	depends on !TCP_CONG_ADVANCED
-	default y
-
-config DEFAULT_TCP_CONG
-	string
-	default "bic" if DEFAULT_BIC
-	default "cubic" if DEFAULT_CUBIC
-	default "htcp" if DEFAULT_HTCP
-	default "hybla" if DEFAULT_HYBLA
-	default "vegas" if DEFAULT_VEGAS
-	default "westwood" if DEFAULT_WESTWOOD
-	default "veno" if DEFAULT_VENO
-	default "reno" if DEFAULT_RENO
-	default "dctcp" if DEFAULT_DCTCP
-	default "cdg" if DEFAULT_CDG
-	default "bbr" if DEFAULT_BBR
-	default "cubic"
-
-config TCP_MD5SIG
-	bool "TCP: MD5 Signature Option support (RFC2385)"
-	select CRYPTO
-	select CRYPTO_MD5
-	---help---
-	  RFC2385 specifies a method of giving MD5 protection to TCP sessions.
-	  Its main (only?) use is to protect BGP sessions between core routers
-	  on the Internet.
-
-	  If unsure, say N.
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index a106a1cf0e11..d170d6e1417c 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -1,8 +1,8 @@
 #
-# Makefile for the Linux TCP/IP (INET) layer.
+# Makefile for the Linux IP layer.
 #
 
-obj-y     := tcp/ route.o inetpeer.o protocol.o \
+obj-y     := route.o inetpeer.o protocol.o \
 	     ip_input.o ip_fragment.o ip_forward.o ip_options.o \
 	     ip_output.o ip_sockglue.o inet_hashtables.o \
 	     inet_timewait_sock.o inet_connection_sock.o \
diff --git a/net/ipv4v6_shared/Makefile b/net/ipv4v6_shared/Makefile
new file mode 100644
index 000000000000..41a67a0cf210
--- /dev/null
+++ b/net/ipv4v6_shared/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for shared networking code called/used by IPv4 and IPv6
+#
+# Note: Currently this only contains the shared part of the TCP implementation;
+#   a lot of the shared code still resides in net/ipv4/.
+
+obj-y     := tcp/
diff --git a/net/ipv4v6_shared/tcp/Kconfig b/net/ipv4v6_shared/tcp/Kconfig
new file mode 100644
index 000000000000..dff4d046a42b
--- /dev/null
+++ b/net/ipv4v6_shared/tcp/Kconfig
@@ -0,0 +1,324 @@
+#
+# TCP configuration
+#
+config SYN_COOKIES
+	bool "IP: TCP syncookie support"
+	---help---
+	  Normal TCP/IP networking is open to an attack known as "SYN
+	  flooding". This denial-of-service attack prevents legitimate remote
+	  users from being able to connect to your computer during an ongoing
+	  attack and requires very little work from the attacker, who can
+	  operate from anywhere on the Internet.
+
+	  SYN cookies provide protection against this type of attack. If you
+	  say Y here, the TCP/IP stack will use a cryptographic challenge
+	  protocol known as "SYN cookies" to enable legitimate users to
+	  continue to connect, even when your machine is under attack. There
+	  is no need for the legitimate users to change their TCP/IP software;
+	  SYN cookies work transparently to them. For technical information
+	  about SYN cookies, check out <http://cr.yp.to/syncookies.html>.
+
+	  If you are SYN flooded, the source address reported by the kernel is
+	  likely to have been forged by the attacker; it is only reported as
+	  an aid in tracing the packets to their actual source and should not
+	  be taken as absolute truth.
+
+	  SYN cookies may prevent correct error reporting on clients when the
+	  server is really overloaded. If this happens frequently better turn
+	  them off.
+
+	  If you say Y here, you can disable SYN cookies at run time by
+	  saying Y to "/proc file system support" and
+	  "Sysctl support" below and executing the command
+
+	  echo 0 > /proc/sys/net/ipv4/tcp_syncookies
+
+	  after the /proc file system has been mounted.
+
+	  If unsure, say N.
+
+menuconfig TCP_CONG_ADVANCED
+	bool "TCP: advanced congestion control"
+	---help---
+	  Support for selection of various TCP congestion control
+	  modules.
+
+	  Nearly all users can safely say no here, and a safe default
+	  selection will be made (CUBIC with new Reno as a fallback).
+
+	  If unsure, say N.
+
+if TCP_CONG_ADVANCED
+
+config TCP_CONG_BIC
+	tristate "Binary Increase Congestion (BIC) control"
+	default m
+	---help---
+	BIC-TCP is a sender-side only change that ensures a linear RTT
+	fairness under large windows while offering both scalability and
+	bounded TCP-friendliness. The protocol combines two schemes
+	called additive increase and binary search increase. When the
+	congestion window is large, additive increase with a large
+	increment ensures linear RTT fairness as well as good
+	scalability. Under small congestion windows, binary search
+	increase provides TCP friendliness.
+	See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
+
+config TCP_CONG_CUBIC
+	tristate "CUBIC TCP"
+	default y
+	---help---
+	This is version 2.0 of BIC-TCP which uses a cubic growth function
+	among other techniques.
+	See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
+
+config TCP_CONG_WESTWOOD
+	tristate "TCP Westwood+"
+	default m
+	---help---
+	TCP Westwood+ is a sender-side only modification of the TCP Reno
+	protocol stack that optimizes the performance of TCP congestion
+	control. It is based on end-to-end bandwidth estimation to set
+	congestion window and slow start threshold after a congestion
+	episode. Using this estimation, TCP Westwood+ adaptively sets a
+	slow start threshold and a congestion window which takes into
+	account the bandwidth used  at the time congestion is experienced.
+	TCP Westwood+ significantly increases fairness wrt TCP Reno in
+	wired networks and throughput over wireless links.
+
+config TCP_CONG_HTCP
+        tristate "H-TCP"
+        default m
+	---help---
+	H-TCP is a send-side only modifications of the TCP Reno
+	protocol stack that optimizes the performance of TCP
+	congestion control for high speed network links. It uses a
+	modeswitch to change the alpha and beta parameters of TCP Reno
+	based on network conditions and in a way so as to be fair with
+	other Reno and H-TCP flows.
+
+config TCP_CONG_HSTCP
+	tristate "High Speed TCP"
+	default n
+	---help---
+	Sally Floyd's High Speed TCP (RFC 3649) congestion control.
+	A modification to TCP's congestion control mechanism for use
+	with large congestion windows. A table indicates how much to
+	increase the congestion window by when an ACK is received.
+ 	For more detail	see http://www.icir.org/floyd/hstcp.html
+
+config TCP_CONG_HYBLA
+	tristate "TCP-Hybla congestion control algorithm"
+	default n
+	---help---
+	TCP-Hybla is a sender-side only change that eliminates penalization of
+	long-RTT, large-bandwidth connections, like when satellite legs are
+	involved, especially when sharing a common bottleneck with normal
+	terrestrial connections.
+
+config TCP_CONG_VEGAS
+	tristate "TCP Vegas"
+	default n
+	---help---
+	TCP Vegas is a sender-side only change to TCP that anticipates
+	the onset of congestion by estimating the bandwidth. TCP Vegas
+	adjusts the sending rate by modifying the congestion
+	window. TCP Vegas should provide less packet loss, but it is
+	not as aggressive as TCP Reno.
+
+config TCP_CONG_NV
+       tristate "TCP NV"
+       default n
+       ---help---
+       TCP NV is a follow up to TCP Vegas. It has been modified to deal with
+       10G networks, measurement noise introduced by LRO, GRO and interrupt
+       coalescence. In addition, it will decrease its cwnd multiplicatively
+       instead of linearly.
+
+       Note that in general congestion avoidance (cwnd decreased when # packets
+       queued grows) cannot coexist with congestion control (cwnd decreased only
+       when there is packet loss) due to fairness issues. One scenario when they
+       can coexist safely is when the CA flows have RTTs << CC flows RTTs.
+
+       For further details see http://www.brakmo.org/networking/tcp-nv/
+
+config TCP_CONG_SCALABLE
+	tristate "Scalable TCP"
+	default n
+	---help---
+	Scalable TCP is a sender-side only change to TCP which uses a
+	MIMD congestion control algorithm which has some nice scaling
+	properties, though is known to have fairness issues.
+	See http://www.deneholme.net/tom/scalable/
+
+config TCP_CONG_LP
+	tristate "TCP Low Priority"
+	default n
+	---help---
+	TCP Low Priority (TCP-LP), a distributed algorithm whose goal is
+	to utilize only the excess network bandwidth as compared to the
+	``fair share`` of bandwidth as targeted by TCP.
+	See http://www-ece.rice.edu/networks/TCP-LP/
+
+config TCP_CONG_VENO
+	tristate "TCP Veno"
+	default n
+	---help---
+	TCP Veno is a sender-side only enhancement of TCP to obtain better
+	throughput over wireless networks. TCP Veno makes use of state
+	distinguishing to circumvent the difficult judgment of the packet loss
+	type. TCP Veno cuts down less congestion window in response to random
+	loss packets.
+	See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186> 
+
+config TCP_CONG_YEAH
+	tristate "YeAH TCP"
+	select TCP_CONG_VEGAS
+	default n
+	---help---
+	YeAH-TCP is a sender-side high-speed enabled TCP congestion control
+	algorithm, which uses a mixed loss/delay approach to compute the
+	congestion window. It's design goals target high efficiency,
+	internal, RTT and Reno fairness, resilience to link loss while
+	keeping network elements load as low as possible.
+
+	For further details look here:
+	  http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
+
+config TCP_CONG_ILLINOIS
+	tristate "TCP Illinois"
+	default n
+	---help---
+	TCP-Illinois is a sender-side modification of TCP Reno for
+	high speed long delay links. It uses round-trip-time to
+	adjust the alpha and beta parameters to achieve a higher average
+	throughput and maintain fairness.
+
+	For further details see:
+	  http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
+
+config TCP_CONG_DCTCP
+	tristate "DataCenter TCP (DCTCP)"
+	default n
+	---help---
+	DCTCP leverages Explicit Congestion Notification (ECN) in the network to
+	provide multi-bit feedback to the end hosts. It is designed to provide:
+
+	- High burst tolerance (incast due to partition/aggregate),
+	- Low latency (short flows, queries),
+	- High throughput (continuous data updates, large file transfers) with
+	  commodity, shallow-buffered switches.
+
+	All switches in the data center network running DCTCP must support
+	ECN marking and be configured for marking when reaching defined switch
+	buffer thresholds. The default ECN marking threshold heuristic for
+	DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets
+	(~100KB) at 10Gbps, but might need further careful tweaking.
+
+	For further details see:
+	  http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
+
+config TCP_CONG_CDG
+	tristate "CAIA Delay-Gradient (CDG)"
+	default n
+	---help---
+	CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies
+	the TCP sender in order to:
+
+	  o Use the delay gradient as a congestion signal.
+	  o Back off with an average probability that is independent of the RTT.
+	  o Coexist with flows that use loss-based congestion control.
+	  o Tolerate packet loss unrelated to congestion.
+
+	For further details see:
+	  D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
+	  delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg
+
+config TCP_CONG_BBR
+	tristate "BBR TCP"
+	default n
+	---help---
+
+	BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
+	maximize network utilization and minimize queues. It builds an explicit
+	model of the the bottleneck delivery rate and path round-trip
+	propagation delay. It tolerates packet loss and delay unrelated to
+	congestion. It can operate over LAN, WAN, cellular, wifi, or cable
+	modem links. It can coexist with flows that use loss-based congestion
+	control, and can operate with shallow buffers, deep buffers,
+	bufferbloat, policers, or AQM schemes that do not provide a delay
+	signal. It requires the fq ("Fair Queue") pacing packet scheduler.
+
+choice
+	prompt "Default TCP congestion control"
+	default DEFAULT_CUBIC
+	help
+	  Select the TCP congestion control that will be used by default
+	  for all connections.
+
+	config DEFAULT_BIC
+		bool "Bic" if TCP_CONG_BIC=y
+
+	config DEFAULT_CUBIC
+		bool "Cubic" if TCP_CONG_CUBIC=y
+
+	config DEFAULT_HTCP
+		bool "Htcp" if TCP_CONG_HTCP=y
+
+	config DEFAULT_HYBLA
+		bool "Hybla" if TCP_CONG_HYBLA=y
+
+	config DEFAULT_VEGAS
+		bool "Vegas" if TCP_CONG_VEGAS=y
+
+	config DEFAULT_VENO
+		bool "Veno" if TCP_CONG_VENO=y
+
+	config DEFAULT_WESTWOOD
+		bool "Westwood" if TCP_CONG_WESTWOOD=y
+
+	config DEFAULT_DCTCP
+		bool "DCTCP" if TCP_CONG_DCTCP=y
+
+	config DEFAULT_CDG
+		bool "CDG" if TCP_CONG_CDG=y
+
+	config DEFAULT_BBR
+		bool "BBR" if TCP_CONG_BBR=y
+
+	config DEFAULT_RENO
+		bool "Reno"
+endchoice
+
+endif
+
+config TCP_CONG_CUBIC
+	tristate
+	depends on !TCP_CONG_ADVANCED
+	default y
+
+config DEFAULT_TCP_CONG
+	string
+	default "bic" if DEFAULT_BIC
+	default "cubic" if DEFAULT_CUBIC
+	default "htcp" if DEFAULT_HTCP
+	default "hybla" if DEFAULT_HYBLA
+	default "vegas" if DEFAULT_VEGAS
+	default "westwood" if DEFAULT_WESTWOOD
+	default "veno" if DEFAULT_VENO
+	default "reno" if DEFAULT_RENO
+	default "dctcp" if DEFAULT_DCTCP
+	default "cdg" if DEFAULT_CDG
+	default "bbr" if DEFAULT_BBR
+	default "cubic"
+
+config TCP_MD5SIG
+	bool "TCP: MD5 Signature Option support (RFC2385)"
+	select CRYPTO
+	select CRYPTO_MD5
+	---help---
+	  RFC2385 specifies a method of giving MD5 protection to TCP sessions.
+	  Its main (only?) use is to protect BGP sessions between core routers
+	  on the Internet.
+
+	  If unsure, say N.
diff --git a/net/ipv4/tcp/Makefile b/net/ipv4v6_shared/tcp/Makefile
similarity index 100%
rename from net/ipv4/tcp/Makefile
rename to net/ipv4v6_shared/tcp/Makefile
diff --git a/net/ipv4/tcp/cc_algos/Makefile b/net/ipv4v6_shared/tcp/cc_algos/Makefile
similarity index 100%
rename from net/ipv4/tcp/cc_algos/Makefile
rename to net/ipv4v6_shared/tcp/cc_algos/Makefile
diff --git a/net/ipv4/tcp/cc_algos/tcp_bbr.c b/net/ipv4v6_shared/tcp/cc_algos/tcp_bbr.c
similarity index 100%
rename from net/ipv4/tcp/cc_algos/tcp_bbr.c
rename to net/ipv4v6_shared/tcp/cc_algos/tcp_bbr.c
diff --git a/net/ipv4/tcp/cc_algos/tcp_bic.c b/net/ipv4v6_shared/tcp/cc_algos/tcp_bic.c
similarity index 100%
rename from net/ipv4/tcp/cc_algos/tcp_bic.c
rename to net/ipv4v6_shared/tcp/cc_algos/tcp_bic.c
diff --git a/net/ipv4/tcp/cc_algos/tcp_cdg.c b/net/ipv4v6_shared/tcp/cc_algos/tcp_cdg.c
similarity index 100%
rename from net/ipv4/tcp/cc_algos/tcp_cdg.c
rename to net/ipv4v6_shared/tcp/cc_algos/tcp_cdg.c
diff --git a/net/ipv4/tcp/cc_algos/tcp_cubic.c b/net/ipv4v6_shared/tcp/cc_algos/tcp_cubic.c
similarity index 100%
rename from net/ipv4/tcp/cc_algos/tcp_cubic.c
rename to net/ipv4v6_shared/tcp/cc_algos/tcp_cubic.c
diff --git a/net/ipv4/tcp/cc_algos/tcp_dctcp.c b/net/ipv4v6_shared/tcp/cc_algos/tcp_dctcp.c
similarity index 100%
rename from net/ipv4/tcp/cc_algos/tcp_dctcp.c
rename to net/ipv4v6_shared/tcp/cc_algos/tcp_dctcp.c
diff --git a/net/ipv4/tcp/cc_algos/tcp_highspeed.c b/net/ipv4v6_shared/tcp/cc_algos/tcp_highspeed.c
similarity index 100%
rename from net/ipv4/tcp/cc_algos/tcp_highspeed.c
rename to net/ipv4v6_shared/tcp/cc_algos/tcp_highspeed.c
diff --git a/net/ipv4/tcp/cc_algos/tcp_htcp.c b/net/ipv4v6_shared/tcp/cc_algos/tcp_htcp.c
similarity index 100%
rename from net/ipv4/tcp/cc_algos/tcp_htcp.c
rename to net/ipv4v6_shared/tcp/cc_algos/tcp_htcp.c
diff --git a/net/ipv4/tcp/cc_algos/tcp_hybla.c b/net/ipv4v6_shared/tcp/cc_algos/tcp_hybla.c
similarity index 100%
rename from net/ipv4/tcp/cc_algos/tcp_hybla.c
rename to net/ipv4v6_shared/tcp/cc_algos/tcp_hybla.c
diff --git a/net/ipv4/tcp/cc_algos/tcp_illinois.c b/net/ipv4v6_shared/tcp/cc_algos/tcp_illinois.c
similarity index 100%
rename from net/ipv4/tcp/cc_algos/tcp_illinois.c
rename to net/ipv4v6_shared/tcp/cc_algos/tcp_illinois.c
diff --git a/net/ipv4/tcp/cc_algos/tcp_lp.c b/net/ipv4v6_shared/tcp/cc_algos/tcp_lp.c
similarity index 100%
rename from net/ipv4/tcp/cc_algos/tcp_lp.c
rename to net/ipv4v6_shared/tcp/cc_algos/tcp_lp.c
diff --git a/net/ipv4/tcp/cc_algos/tcp_nv.c b/net/ipv4v6_shared/tcp/cc_algos/tcp_nv.c
similarity index 100%
rename from net/ipv4/tcp/cc_algos/tcp_nv.c
rename to net/ipv4v6_shared/tcp/cc_algos/tcp_nv.c
diff --git a/net/ipv4/tcp/cc_algos/tcp_scalable.c b/net/ipv4v6_shared/tcp/cc_algos/tcp_scalable.c
similarity index 100%
rename from net/ipv4/tcp/cc_algos/tcp_scalable.c
rename to net/ipv4v6_shared/tcp/cc_algos/tcp_scalable.c
diff --git a/net/ipv4/tcp/cc_algos/tcp_vegas.c b/net/ipv4v6_shared/tcp/cc_algos/tcp_vegas.c
similarity index 100%
rename from net/ipv4/tcp/cc_algos/tcp_vegas.c
rename to net/ipv4v6_shared/tcp/cc_algos/tcp_vegas.c
diff --git a/net/ipv4/tcp/cc_algos/tcp_vegas.h b/net/ipv4v6_shared/tcp/cc_algos/tcp_vegas.h
similarity index 100%
rename from net/ipv4/tcp/cc_algos/tcp_vegas.h
rename to net/ipv4v6_shared/tcp/cc_algos/tcp_vegas.h
diff --git a/net/ipv4/tcp/cc_algos/tcp_veno.c b/net/ipv4v6_shared/tcp/cc_algos/tcp_veno.c
similarity index 100%
rename from net/ipv4/tcp/cc_algos/tcp_veno.c
rename to net/ipv4v6_shared/tcp/cc_algos/tcp_veno.c
diff --git a/net/ipv4/tcp/cc_algos/tcp_westwood.c b/net/ipv4v6_shared/tcp/cc_algos/tcp_westwood.c
similarity index 100%
rename from net/ipv4/tcp/cc_algos/tcp_westwood.c
rename to net/ipv4v6_shared/tcp/cc_algos/tcp_westwood.c
diff --git a/net/ipv4/tcp/cc_algos/tcp_yeah.c b/net/ipv4v6_shared/tcp/cc_algos/tcp_yeah.c
similarity index 100%
rename from net/ipv4/tcp/cc_algos/tcp_yeah.c
rename to net/ipv4v6_shared/tcp/cc_algos/tcp_yeah.c
diff --git a/net/ipv4/tcp/tcp.c b/net/ipv4v6_shared/tcp/tcp.c
similarity index 100%
rename from net/ipv4/tcp/tcp.c
rename to net/ipv4v6_shared/tcp/tcp.c
diff --git a/net/ipv4/tcp/tcp_cong.c b/net/ipv4v6_shared/tcp/tcp_cong.c
similarity index 100%
rename from net/ipv4/tcp/tcp_cong.c
rename to net/ipv4v6_shared/tcp/tcp_cong.c
diff --git a/net/ipv4/tcp/tcp_diag.c b/net/ipv4v6_shared/tcp/tcp_diag.c
similarity index 100%
rename from net/ipv4/tcp/tcp_diag.c
rename to net/ipv4v6_shared/tcp/tcp_diag.c
diff --git a/net/ipv4/tcp/tcp_fastopen.c b/net/ipv4v6_shared/tcp/tcp_fastopen.c
similarity index 100%
rename from net/ipv4/tcp/tcp_fastopen.c
rename to net/ipv4v6_shared/tcp/tcp_fastopen.c
diff --git a/net/ipv4/tcp/tcp_input.c b/net/ipv4v6_shared/tcp/tcp_input.c
similarity index 100%
rename from net/ipv4/tcp/tcp_input.c
rename to net/ipv4v6_shared/tcp/tcp_input.c
diff --git a/net/ipv4/tcp/tcp_metrics.c b/net/ipv4v6_shared/tcp/tcp_metrics.c
similarity index 100%
rename from net/ipv4/tcp/tcp_metrics.c
rename to net/ipv4v6_shared/tcp/tcp_metrics.c
diff --git a/net/ipv4/tcp/tcp_minisocks.c b/net/ipv4v6_shared/tcp/tcp_minisocks.c
similarity index 100%
rename from net/ipv4/tcp/tcp_minisocks.c
rename to net/ipv4v6_shared/tcp/tcp_minisocks.c
diff --git a/net/ipv4/tcp/tcp_output.c b/net/ipv4v6_shared/tcp/tcp_output.c
similarity index 100%
rename from net/ipv4/tcp/tcp_output.c
rename to net/ipv4v6_shared/tcp/tcp_output.c
diff --git a/net/ipv4/tcp/tcp_probe.c b/net/ipv4v6_shared/tcp/tcp_probe.c
similarity index 100%
rename from net/ipv4/tcp/tcp_probe.c
rename to net/ipv4v6_shared/tcp/tcp_probe.c
diff --git a/net/ipv4/tcp/tcp_rate.c b/net/ipv4v6_shared/tcp/tcp_rate.c
similarity index 100%
rename from net/ipv4/tcp/tcp_rate.c
rename to net/ipv4v6_shared/tcp/tcp_rate.c
diff --git a/net/ipv4/tcp/tcp_recovery.c b/net/ipv4v6_shared/tcp/tcp_recovery.c
similarity index 100%
rename from net/ipv4/tcp/tcp_recovery.c
rename to net/ipv4v6_shared/tcp/tcp_recovery.c
diff --git a/net/ipv4/tcp/tcp_timer.c b/net/ipv4v6_shared/tcp/tcp_timer.c
similarity index 100%
rename from net/ipv4/tcp/tcp_timer.c
rename to net/ipv4v6_shared/tcp/tcp_timer.c
diff --git a/net/ipv4/tcp/tcp_ulp.c b/net/ipv4v6_shared/tcp/tcp_ulp.c
similarity index 100%
rename from net/ipv4/tcp/tcp_ulp.c
rename to net/ipv4v6_shared/tcp/tcp_ulp.c
-- 
2.14.1

^ permalink raw reply related

* [PATCH net-next 2/3] tcp: Move cc algorithms to own subdirectory
From: Richard Sailer @ 2017-10-03 22:22 UTC (permalink / raw)
  To: netdev, davem; +Cc: kuznet, yoshfuji
In-Reply-To: <20171003222213.7996-1-richard_siegfried@systemli.org>

Essentially the TCP code files consist of two groups:

  * main code and some utilities (13 files)
  * pluggable Congestion Control Algorithms (17 files)

Similar to the previous commit, this moves the congestion control algorithms to
their own subdirectory and updates the Makefiles accordingly, to make the source
tree better structured and more self-explanatory.

Signed-off-by: Richard Sailer <richard_siegfried@systemli.org>
---
 net/ipv4/tcp/Makefile                       | 18 ++----------------
 net/ipv4/tcp/cc_algos/Makefile              | 21 +++++++++++++++++++++
 net/ipv4/tcp/{ => cc_algos}/tcp_bbr.c       |  0
 net/ipv4/tcp/{ => cc_algos}/tcp_bic.c       |  0
 net/ipv4/tcp/{ => cc_algos}/tcp_cdg.c       |  0
 net/ipv4/tcp/{ => cc_algos}/tcp_cubic.c     |  0
 net/ipv4/tcp/{ => cc_algos}/tcp_dctcp.c     |  0
 net/ipv4/tcp/{ => cc_algos}/tcp_highspeed.c |  0
 net/ipv4/tcp/{ => cc_algos}/tcp_htcp.c      |  0
 net/ipv4/tcp/{ => cc_algos}/tcp_hybla.c     |  0
 net/ipv4/tcp/{ => cc_algos}/tcp_illinois.c  |  0
 net/ipv4/tcp/{ => cc_algos}/tcp_lp.c        |  0
 net/ipv4/tcp/{ => cc_algos}/tcp_nv.c        |  0
 net/ipv4/tcp/{ => cc_algos}/tcp_scalable.c  |  0
 net/ipv4/tcp/{ => cc_algos}/tcp_vegas.c     |  0
 net/ipv4/tcp/{ => cc_algos}/tcp_vegas.h     |  0
 net/ipv4/tcp/{ => cc_algos}/tcp_veno.c      |  0
 net/ipv4/tcp/{ => cc_algos}/tcp_westwood.c  |  0
 net/ipv4/tcp/{ => cc_algos}/tcp_yeah.c      |  0
 19 files changed, 23 insertions(+), 16 deletions(-)
 create mode 100644 net/ipv4/tcp/cc_algos/Makefile
 rename net/ipv4/tcp/{ => cc_algos}/tcp_bbr.c (100%)
 rename net/ipv4/tcp/{ => cc_algos}/tcp_bic.c (100%)
 rename net/ipv4/tcp/{ => cc_algos}/tcp_cdg.c (100%)
 rename net/ipv4/tcp/{ => cc_algos}/tcp_cubic.c (100%)
 rename net/ipv4/tcp/{ => cc_algos}/tcp_dctcp.c (100%)
 rename net/ipv4/tcp/{ => cc_algos}/tcp_highspeed.c (100%)
 rename net/ipv4/tcp/{ => cc_algos}/tcp_htcp.c (100%)
 rename net/ipv4/tcp/{ => cc_algos}/tcp_hybla.c (100%)
 rename net/ipv4/tcp/{ => cc_algos}/tcp_illinois.c (100%)
 rename net/ipv4/tcp/{ => cc_algos}/tcp_lp.c (100%)
 rename net/ipv4/tcp/{ => cc_algos}/tcp_nv.c (100%)
 rename net/ipv4/tcp/{ => cc_algos}/tcp_scalable.c (100%)
 rename net/ipv4/tcp/{ => cc_algos}/tcp_vegas.c (100%)
 rename net/ipv4/tcp/{ => cc_algos}/tcp_vegas.h (100%)
 rename net/ipv4/tcp/{ => cc_algos}/tcp_veno.c (100%)
 rename net/ipv4/tcp/{ => cc_algos}/tcp_westwood.c (100%)
 rename net/ipv4/tcp/{ => cc_algos}/tcp_yeah.c (100%)

diff --git a/net/ipv4/tcp/Makefile b/net/ipv4/tcp/Makefile
index 91d2c991a243..60e96525a489 100644
--- a/net/ipv4/tcp/Makefile
+++ b/net/ipv4/tcp/Makefile
@@ -8,19 +8,5 @@ obj-y     := tcp.o tcp_input.o tcp_output.o tcp_timer.o \
 
 obj-$(CONFIG_INET_TCP_DIAG)     +=  tcp_diag.o
 obj-$(CONFIG_NET_TCPPROBE)      +=  tcp_probe.o
-obj-$(CONFIG_TCP_CONG_BBR)      +=  tcp_bbr.o
-obj-$(CONFIG_TCP_CONG_BIC)      +=  tcp_bic.o
-obj-$(CONFIG_TCP_CONG_CDG)      +=  tcp_cdg.o
-obj-$(CONFIG_TCP_CONG_CUBIC)    +=  tcp_cubic.o
-obj-$(CONFIG_TCP_CONG_DCTCP)    +=  tcp_dctcp.o
-obj-$(CONFIG_TCP_CONG_WESTWOOD) +=  tcp_westwood.o
-obj-$(CONFIG_TCP_CONG_HSTCP)    +=  tcp_highspeed.o
-obj-$(CONFIG_TCP_CONG_HYBLA)    +=  tcp_hybla.o
-obj-$(CONFIG_TCP_CONG_HTCP)     +=  tcp_htcp.o
-obj-$(CONFIG_TCP_CONG_VEGAS)    +=  tcp_vegas.o
-obj-$(CONFIG_TCP_CONG_NV)       +=  tcp_nv.o
-obj-$(CONFIG_TCP_CONG_VENO)     +=  tcp_veno.o
-obj-$(CONFIG_TCP_CONG_SCALABLE) +=  tcp_scalable.o
-obj-$(CONFIG_TCP_CONG_LP)       +=  tcp_lp.o
-obj-$(CONFIG_TCP_CONG_YEAH)     +=  tcp_yeah.o
-obj-$(CONFIG_TCP_CONG_ILLINOIS) +=  tcp_illinois.o
+
+obj-$(CONFIG_TCP_CONG_ADVANCED) +=  cc_algos/
diff --git a/net/ipv4/tcp/cc_algos/Makefile b/net/ipv4/tcp/cc_algos/Makefile
new file mode 100644
index 000000000000..81999dd2a045
--- /dev/null
+++ b/net/ipv4/tcp/cc_algos/Makefile
@@ -0,0 +1,21 @@
+#
+# Makefile for all pluggable TCP congestion control algorithms
+#
+
+
+obj-$(CONFIG_TCP_CONG_BBR)      +=  tcp_bbr.o
+obj-$(CONFIG_TCP_CONG_BIC)      +=  tcp_bic.o
+obj-$(CONFIG_TCP_CONG_CDG)      +=  tcp_cdg.o
+obj-$(CONFIG_TCP_CONG_CUBIC)    +=  tcp_cubic.o
+obj-$(CONFIG_TCP_CONG_DCTCP)    +=  tcp_dctcp.o
+obj-$(CONFIG_TCP_CONG_WESTWOOD) +=  tcp_westwood.o
+obj-$(CONFIG_TCP_CONG_HSTCP)    +=  tcp_highspeed.o
+obj-$(CONFIG_TCP_CONG_HYBLA)    +=  tcp_hybla.o
+obj-$(CONFIG_TCP_CONG_HTCP)     +=  tcp_htcp.o
+obj-$(CONFIG_TCP_CONG_VEGAS)    +=  tcp_vegas.o
+obj-$(CONFIG_TCP_CONG_NV)       +=  tcp_nv.o
+obj-$(CONFIG_TCP_CONG_VENO)     +=  tcp_veno.o
+obj-$(CONFIG_TCP_CONG_SCALABLE) +=  tcp_scalable.o
+obj-$(CONFIG_TCP_CONG_LP)       +=  tcp_lp.o
+obj-$(CONFIG_TCP_CONG_YEAH)     +=  tcp_yeah.o
+obj-$(CONFIG_TCP_CONG_ILLINOIS) +=  tcp_illinois.o
diff --git a/net/ipv4/tcp/tcp_bbr.c b/net/ipv4/tcp/cc_algos/tcp_bbr.c
similarity index 100%
rename from net/ipv4/tcp/tcp_bbr.c
rename to net/ipv4/tcp/cc_algos/tcp_bbr.c
diff --git a/net/ipv4/tcp/tcp_bic.c b/net/ipv4/tcp/cc_algos/tcp_bic.c
similarity index 100%
rename from net/ipv4/tcp/tcp_bic.c
rename to net/ipv4/tcp/cc_algos/tcp_bic.c
diff --git a/net/ipv4/tcp/tcp_cdg.c b/net/ipv4/tcp/cc_algos/tcp_cdg.c
similarity index 100%
rename from net/ipv4/tcp/tcp_cdg.c
rename to net/ipv4/tcp/cc_algos/tcp_cdg.c
diff --git a/net/ipv4/tcp/tcp_cubic.c b/net/ipv4/tcp/cc_algos/tcp_cubic.c
similarity index 100%
rename from net/ipv4/tcp/tcp_cubic.c
rename to net/ipv4/tcp/cc_algos/tcp_cubic.c
diff --git a/net/ipv4/tcp/tcp_dctcp.c b/net/ipv4/tcp/cc_algos/tcp_dctcp.c
similarity index 100%
rename from net/ipv4/tcp/tcp_dctcp.c
rename to net/ipv4/tcp/cc_algos/tcp_dctcp.c
diff --git a/net/ipv4/tcp/tcp_highspeed.c b/net/ipv4/tcp/cc_algos/tcp_highspeed.c
similarity index 100%
rename from net/ipv4/tcp/tcp_highspeed.c
rename to net/ipv4/tcp/cc_algos/tcp_highspeed.c
diff --git a/net/ipv4/tcp/tcp_htcp.c b/net/ipv4/tcp/cc_algos/tcp_htcp.c
similarity index 100%
rename from net/ipv4/tcp/tcp_htcp.c
rename to net/ipv4/tcp/cc_algos/tcp_htcp.c
diff --git a/net/ipv4/tcp/tcp_hybla.c b/net/ipv4/tcp/cc_algos/tcp_hybla.c
similarity index 100%
rename from net/ipv4/tcp/tcp_hybla.c
rename to net/ipv4/tcp/cc_algos/tcp_hybla.c
diff --git a/net/ipv4/tcp/tcp_illinois.c b/net/ipv4/tcp/cc_algos/tcp_illinois.c
similarity index 100%
rename from net/ipv4/tcp/tcp_illinois.c
rename to net/ipv4/tcp/cc_algos/tcp_illinois.c
diff --git a/net/ipv4/tcp/tcp_lp.c b/net/ipv4/tcp/cc_algos/tcp_lp.c
similarity index 100%
rename from net/ipv4/tcp/tcp_lp.c
rename to net/ipv4/tcp/cc_algos/tcp_lp.c
diff --git a/net/ipv4/tcp/tcp_nv.c b/net/ipv4/tcp/cc_algos/tcp_nv.c
similarity index 100%
rename from net/ipv4/tcp/tcp_nv.c
rename to net/ipv4/tcp/cc_algos/tcp_nv.c
diff --git a/net/ipv4/tcp/tcp_scalable.c b/net/ipv4/tcp/cc_algos/tcp_scalable.c
similarity index 100%
rename from net/ipv4/tcp/tcp_scalable.c
rename to net/ipv4/tcp/cc_algos/tcp_scalable.c
diff --git a/net/ipv4/tcp/tcp_vegas.c b/net/ipv4/tcp/cc_algos/tcp_vegas.c
similarity index 100%
rename from net/ipv4/tcp/tcp_vegas.c
rename to net/ipv4/tcp/cc_algos/tcp_vegas.c
diff --git a/net/ipv4/tcp/tcp_vegas.h b/net/ipv4/tcp/cc_algos/tcp_vegas.h
similarity index 100%
rename from net/ipv4/tcp/tcp_vegas.h
rename to net/ipv4/tcp/cc_algos/tcp_vegas.h
diff --git a/net/ipv4/tcp/tcp_veno.c b/net/ipv4/tcp/cc_algos/tcp_veno.c
similarity index 100%
rename from net/ipv4/tcp/tcp_veno.c
rename to net/ipv4/tcp/cc_algos/tcp_veno.c
diff --git a/net/ipv4/tcp/tcp_westwood.c b/net/ipv4/tcp/cc_algos/tcp_westwood.c
similarity index 100%
rename from net/ipv4/tcp/tcp_westwood.c
rename to net/ipv4/tcp/cc_algos/tcp_westwood.c
diff --git a/net/ipv4/tcp/tcp_yeah.c b/net/ipv4/tcp/cc_algos/tcp_yeah.c
similarity index 100%
rename from net/ipv4/tcp/tcp_yeah.c
rename to net/ipv4/tcp/cc_algos/tcp_yeah.c
-- 
2.14.1

^ permalink raw reply related

* Re: [PATCH V2] Fix a sleep-in-atomic bug in shash_setkey_unaligned
From: Marcelo Ricardo Leitner @ 2017-10-03 22:33 UTC (permalink / raw)
  To: Jia-Ju Bai
  Cc: davem, herbert, nhorman, vyasevich, luto, kvalo, linux-crypto,
	netdev, linux-sctp, linux-wireless
In-Reply-To: <1506997522-26684-1-git-send-email-baijiaju1990@163.com>

On Tue, Oct 03, 2017 at 10:25:22AM +0800, Jia-Ju Bai wrote:
> The SCTP program may sleep under a spinlock, and the function call path is:
> sctp_generate_t3_rtx_event (acquire the spinlock)
>   sctp_do_sm
>     sctp_side_effects
>       sctp_cmd_interpreter
>         sctp_make_init_ack
>           sctp_pack_cookie
>             crypto_shash_setkey
>               shash_setkey_unaligned
>                 kmalloc(GFP_KERNEL)

Are you sure this can happen?
The host is not supposed to store any information when replying to an
INIT packet (which generated the INIT_ACK listed above). That said,
it's weird to see the timer function triggering so.

Checking now, that code is dead actually:
$ git grep -A 2 SCTP_CMD_GEN_INIT_ACK
sm_sideeffect.c:                case SCTP_CMD_GEN_INIT_ACK:
sm_sideeffect.c-                        /* Generate an INIT ACK chunk.
*/
sm_sideeffect.c-                        new_obj =
sctp_make_init_ack(asoc, chunk, GFP_ATOMIC,

Nobody is triggering a call to sctp_cmd_interpreter with
SCTP_CMD_GEN_INIT_ACK command, which would generate the callstack
above.

  Marcelo

^ permalink raw reply

* [PATCH net] bpf: fix bpf_tail_call() x64 JIT
From: Alexei Starovoitov @ 2017-10-03 22:37 UTC (permalink / raw)
  To: David S . Miller; +Cc: Daniel Borkmann, Martin KaFai Lau, netdev, kernel-team

- bpf prog_array just like all other types of bpf array accepts 32-bit index.
  Clarify that in the comment.
- fix x64 JIT of bpf_tail_call which was incorrectly loading 8 instead of 4 bytes
- tighten corresponding check in the interpreter to stay consistent

The JIT bug can be triggered after introduction of BPF_F_NUMA_NODE flag
in commit 96eabe7a40aa in 4.14. Before that the map_flags would stay zero and
though JIT code is wrong it will check bounds correctly.
Hence two fixes tags. All other JITs don't have this problem.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Fixes: 96eabe7a40aa ("bpf: Allow selecting numa node during map creation")
Fixes: b52f00e6a715 ("x86: bpf_jit: implement bpf_tail_call() helper")
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
---
Backport to stable would be nice, but not strictly necessary.

 arch/x86/net/bpf_jit_comp.c | 4 ++--
 include/uapi/linux/bpf.h    | 2 +-
 kernel/bpf/core.c           | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 8c9573660d51..0554e8aef4d5 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -284,9 +284,9 @@ static void emit_bpf_tail_call(u8 **pprog)
 	/* if (index >= array->map.max_entries)
 	 *   goto out;
 	 */
-	EMIT4(0x48, 0x8B, 0x46,                   /* mov rax, qword ptr [rsi + 16] */
+	EMIT2(0x89, 0xD2);                        /* mov edx, edx */
+	EMIT3(0x39, 0x56,                         /* cmp dword ptr [rsi + 16], edx */
 	      offsetof(struct bpf_array, map.max_entries));
-	EMIT3(0x48, 0x39, 0xD0);                  /* cmp rax, rdx */
 #define OFFSET1 43 /* number of bytes to jump */
 	EMIT2(X86_JBE, OFFSET1);                  /* jbe out */
 	label1 = cnt;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 43ab5c402f98..f90860d1f897 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -312,7 +312,7 @@ union bpf_attr {
  *     jump into another BPF program
  *     @ctx: context pointer passed to next program
  *     @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY
- *     @index: index inside array that selects specific program to run
+ *     @index: 32-bit index inside array that selects specific program to run
  *     Return: 0 on success or negative error
  *
  * int bpf_clone_redirect(skb, ifindex, flags)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 917cc04a0a94..7b62df86be1d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1022,7 +1022,7 @@ static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn,
 		struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
 		struct bpf_array *array = container_of(map, struct bpf_array, map);
 		struct bpf_prog *prog;
-		u64 index = BPF_R3;
+		u32 index = BPF_R3;
 
 		if (unlikely(index >= array->map.max_entries))
 			goto out;
-- 
2.9.5

^ permalink raw reply related

* Re: [PATCH] net: stmmac: dwmac-rk: Add RK3128 GMAC support
From: David Miller @ 2017-10-03 22:40 UTC (permalink / raw)
  To: david.wu
  Cc: heiko, peppe.cavallaro, alexandre.torgue, huangtao, netdev,
	linux-rockchip, linux-kernel
In-Reply-To: <1506764843-24681-1-git-send-email-david.wu@rock-chips.com>

From: David Wu <david.wu@rock-chips.com>
Date: Sat, 30 Sep 2017 17:47:23 +0800

> Add constants and callback functions for the dwmac on rk3128 soc.
> As can be seen, the base structure is the same, only registers
> and the bits in them moved slightly.
> 
> Signed-off-by: David Wu <david.wu@rock-chips.com>

Applied, thank you.

^ permalink raw reply

* Re: [PATCH net-next 0/3] A own subdirectory for shared TCP code
From: Eric Dumazet @ 2017-10-03 22:42 UTC (permalink / raw)
  To: Richard Sailer; +Cc: netdev, davem, kuznet, yoshfuji
In-Reply-To: <20171003222213.7996-1-richard_siegfried@systemli.org>

On Wed, 2017-10-04 at 00:22 +0200, Richard Sailer wrote:
> net/ipv4 currently contains around 100 source files containing the
> IP implementation and lots of other functionality (UDP, TCP, xfrm, etc.)
> 
> # 1/3
> To make the networking source tree more self documenting and well 
> structured 1/3 moves the 30 shared TCP source files to a own 
> subdirectory of net/ipv4/.
> 
> # 2/3 
> A huge part of the TCP code is congestion control algorithms.
> With the same goal as above, this moves the cc algos to a own
> subdirectory of net/ipv4/tcp/
> 
> # 3/3
> Implementation of a suggestion of valdis, to have a new directory
> net/ipv4v6_shared that contains all the code shared between
> IPv4 and IPv6. This creates net/ipv4v6_shared and moves the 
> net/ipv4/tcp directory of shared TCP code there.
> 
> While every commit depends on its previous commits, it is
> also possible and sensible to only apply (1) or (1,2) , depending
> on how conservative one wants to decide regarding directory
> structure changes.
> 
> [PATCH net-next 1/3] net/ipv4: Move shared tcp code to own subdirectory
> [PATCH net-next 2/3] tcp: Move cc algorithms to own subdirectory
> [PATCH net-next 3/3] Move shared tcp code to net/ipv4v6shared


NACK for the whole series.

Simple reason :

This is going to be a maintenance nightmare, for people dealing with TCP
stack every day.

^ permalink raw reply

* Re: [PATCH V2] Fix a sleep-in-atomic bug in shash_setkey_unaligned
From: Marcelo Ricardo Leitner @ 2017-10-03 22:45 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Andy Lutomirski, Jia-Ju Bai, David S. Miller, Neil Horman,
	vyasevich, Kalle Valo, Linux Crypto Mailing List,
	Network Development, linux-sctp, Linux Wireless List
In-Reply-To: <20171003052643.GB22750@gondor.apana.org.au>

On Tue, Oct 03, 2017 at 01:26:43PM +0800, Herbert Xu wrote:
> On Mon, Oct 02, 2017 at 09:18:24PM -0700, Andy Lutomirski wrote:
> > > On Oct 2, 2017, at 7:25 PM, Jia-Ju Bai <baijiaju1990@163.com> wrote:
> > >
> > > The SCTP program may sleep under a spinlock, and the function call path is:
> > > sctp_generate_t3_rtx_event (acquire the spinlock)
> > >  sctp_do_sm
> > >    sctp_side_effects
> > >      sctp_cmd_interpreter
> > >        sctp_make_init_ack
> > >          sctp_pack_cookie
> > >            crypto_shash_setkey
> > >              shash_setkey_unaligned
> > >                kmalloc(GFP_KERNEL)
> > 
> > I'm going to go out on a limb here: why on Earth is out crypto API so
> > full of indirection that we allocate memory at all here?
> 
> The crypto API operates on a one key per-tfm basis.  So normally
> tfm allocation and key setting is done once only and not done on
> the data path.
> 
> I have looked at the SCTP code and it appears to fit this paradigm.
> That is, we should be able to allocate the tfm and set the key when
> the key is actually generated via get_random_bytes, rather than every
> time the key is used which is not only a waste but as you see runs
> into API issues.

Fair point, but

> 
> Usually if you're invoking setkey from a non-sleeping code-path
> you're probably doing something wrong.

Usually but not always. There are 3 calls to that function on SCTP
code:
- pack a cookie, which is sent on an INIT_ACK packet to the client
- unpack the cookie above, after it is sent back by the client on a
  COOKIE_ECHO packet
- send a chunk authenticated by a hash

the first two happen during softirq processing, while processing a
packet that was received.

As I explained on the other email, SCTP code is not supposed to store
any information about the peer between the 1st and the 2nd moments
above, to be less vulnerable to DoS attacks (it's planned so by the
RFC), thus why it uses the cookie.

The 3rd one we probably can improve, but I don't think we can do much
about the 2 first ones from the SCTP side.

Note on sctp_sf_do_5_1B_init() how sctp_make_init_ack() is explicitly
called with GFP_ATOMIC, and also on sctp_sf_do_unexpected_init().
Though we can't propagate that to crypto_shash_setkey.

Ideas?

Thanks,
Marcelo

> 
> As someone else noted recently, there is no single forum for
> reviewing code that uses the crypto API so buggy code like this
> is not surprising.
> 
> > We're synchronously computing a hash of a small amount of data using
> > either HMAC-SHA1 or HMAC-SHA256 (determined at runtime) if I read it
> > right.  There's a sane way to do this that doesn't need kmalloc,
> > alloca, or fancy indirection.  And then there's crypto_shash_xyz().
> 
> There are some legitimate cases where you want to use a different
> key for every hashing operation.  But so far these are uses have
> been very few so there has been no need to provide an API for them.
> 
> Cheers,
> -- 
> Email: Herbert Xu <herbert@gondor.apana.org.au>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
> --
> To unsubscribe from this list: send the line "unsubscribe linux-sctp" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply

* Re: [PATCH V2] Fix a sleep-in-atomic bug in shash_setkey_unaligned
From: Marcelo Ricardo Leitner @ 2017-10-03 22:46 UTC (permalink / raw)
  To: Jia-Ju Bai
  Cc: davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	herbert-lOAM2aK0SrRLBo1qDEOMRrpzq4S04n8Q,
	nhorman-2XuSBdqkA4R54TAoqtyWWQ, vyasevich-Re5JQEeQqe8AvxtiuMwx3w,
	luto-DgEjT+Ai2ygdnm+yROfE0A, kvalo-sgV2jX0FEOL9JmXXK+q4OQ,
	linux-crypto-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, linux-sctp-u79uwXL29TY76Z2rM5mHXA,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20171003223308.GD19750-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>

On Tue, Oct 03, 2017 at 07:33:08PM -0300, Marcelo Ricardo Leitner wrote:
> On Tue, Oct 03, 2017 at 10:25:22AM +0800, Jia-Ju Bai wrote:
> > The SCTP program may sleep under a spinlock, and the function call path is:
> > sctp_generate_t3_rtx_event (acquire the spinlock)
> >   sctp_do_sm
> >     sctp_side_effects
> >       sctp_cmd_interpreter
> >         sctp_make_init_ack
> >           sctp_pack_cookie
> >             crypto_shash_setkey
> >               shash_setkey_unaligned
> >                 kmalloc(GFP_KERNEL)
> 
> Are you sure this can happen?
> The host is not supposed to store any information when replying to an
> INIT packet (which generated the INIT_ACK listed above). That said,
> it's weird to see the timer function triggering so.
> 
> Checking now, that code is dead actually:
> $ git grep -A 2 SCTP_CMD_GEN_INIT_ACK
> sm_sideeffect.c:                case SCTP_CMD_GEN_INIT_ACK:
> sm_sideeffect.c-                        /* Generate an INIT ACK chunk.
> */
> sm_sideeffect.c-                        new_obj =
> sctp_make_init_ack(asoc, chunk, GFP_ATOMIC,
> 
> Nobody is triggering a call to sctp_cmd_interpreter with
> SCTP_CMD_GEN_INIT_ACK command, which would generate the callstack
> above.

Nevertheless, the issue is real through other call paths.

Thanks,
Marcelo

^ permalink raw reply

* Re: [PATCH net] bpf: fix bpf_tail_call() x64 JIT
From: Eric Dumazet @ 2017-10-03 22:48 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David S . Miller, Daniel Borkmann, Martin KaFai Lau, netdev,
	kernel-team
In-Reply-To: <20171003223720.1517246-1-ast@fb.com>

On Tue, 2017-10-03 at 15:37 -0700, Alexei Starovoitov wrote:
> - bpf prog_array just like all other types of bpf array accepts 32-bit index.
>   Clarify that in the comment.
> - fix x64 JIT of bpf_tail_call which was incorrectly loading 8 instead of 4 bytes
> - tighten corresponding check in the interpreter to stay consistent
> 
> The JIT bug can be triggered after introduction of BPF_F_NUMA_NODE flag
> in commit 96eabe7a40aa in 4.14. Before that the map_flags would stay zero and
> though JIT code is wrong it will check bounds correctly.
> Hence two fixes tags. All other JITs don't have this problem.
> 
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> Fixes: 96eabe7a40aa ("bpf: Allow selecting numa node during map creation")
> Fixes: b52f00e6a715 ("x86: bpf_jit: implement bpf_tail_call() helper")
> Acked-by: Daniel Borkmann <daniel@iogearbox.net>
> Acked-by: Martin KaFai Lau <kafai@fb.com>
> ---
> Backport to stable would be nice, but not strictly necessary.

Reviewed-by: Eric Dumazet <edumazet@google.com>

Thanks !

^ permalink raw reply

* Re: [PATCH v3 net-next] net: core: decouple ifalias get/set from rtnl lock
From: David Miller @ 2017-10-03 22:55 UTC (permalink / raw)
  To: fw; +Cc: netdev, edumazet
In-Reply-To: <20171002215005.10417-1-fw@strlen.de>

From: Florian Westphal <fw@strlen.de>
Date: Mon,  2 Oct 2017 23:50:05 +0200

> Device alias can be set by either rtnetlink (rtnl is held) or sysfs.
> 
> rtnetlink hold the rtnl mutex, sysfs acquires it for this purpose.
> Add an extra mutex for it and use rcu to protect concurrent accesses.
> 
> This allows the sysfs path to not take rtnl and would later allow
> to not hold it when dumping ifalias.
> 
> Based on suggestion from Eric Dumazet.
> 
> Signed-off-by: Florian Westphal <fw@strlen.de>

Applied, thanks Florian.

^ permalink raw reply

* Re: [PATCH net-next 0/3] A own subdirectory for shared TCP code
From: David Miller @ 2017-10-03 23:03 UTC (permalink / raw)
  To: richard_siegfried; +Cc: netdev, kuznet, yoshfuji
In-Reply-To: <20171003222213.7996-1-richard_siegfried@systemli.org>

From: Richard Sailer <richard_siegfried@systemli.org>
Date: Wed,  4 Oct 2017 00:22:10 +0200

> net/ipv4 currently contains around 100 source files containing the
> IP implementation and lots of other functionality (UDP, TCP, xfrm, etc.)

As someone who has to do backports regularly to -stable, there is no way
I am applying this.

Sorry.

^ permalink raw reply

* Re: [PATCH net] bpf: fix bpf_tail_call() x64 JIT
From: David Miller @ 2017-10-03 23:05 UTC (permalink / raw)
  To: ast; +Cc: daniel, kafai, netdev, kernel-team
In-Reply-To: <20171003223720.1517246-1-ast@fb.com>

From: Alexei Starovoitov <ast@fb.com>
Date: Tue, 3 Oct 2017 15:37:20 -0700

> - bpf prog_array just like all other types of bpf array accepts 32-bit index.
>   Clarify that in the comment.
> - fix x64 JIT of bpf_tail_call which was incorrectly loading 8 instead of 4 bytes
> - tighten corresponding check in the interpreter to stay consistent
> 
> The JIT bug can be triggered after introduction of BPF_F_NUMA_NODE flag
> in commit 96eabe7a40aa in 4.14. Before that the map_flags would stay zero and
> though JIT code is wrong it will check bounds correctly.
> Hence two fixes tags. All other JITs don't have this problem.
> 
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> Fixes: 96eabe7a40aa ("bpf: Allow selecting numa node during map creation")
> Fixes: b52f00e6a715 ("x86: bpf_jit: implement bpf_tail_call() helper")
> Acked-by: Daniel Borkmann <daniel@iogearbox.net>
> Acked-by: Martin KaFai Lau <kafai@fb.com>
> ---
> Backport to stable would be nice, but not strictly necessary.

Applied and queued up for -stable, thanks.

^ permalink raw reply

* Re: [net-next 0/9][pull request] 100GbE Intel Wired LAN Driver Updates 2017-10-03
From: David Miller @ 2017-10-03 23:25 UTC (permalink / raw)
  To: jeffrey.t.kirsher; +Cc: netdev, nhorman, sassmann, jogreene
In-Reply-To: <20171003163138.47569-1-jeffrey.t.kirsher@intel.com>

From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Tue,  3 Oct 2017 09:31:29 -0700

> This series contains updates to fm10k only.

Pulled, thanks Jeff.

^ permalink raw reply

* Re: [PATCH net-next v2 00/10] Introduce SCTP Stream Schedulers
From: David Miller @ 2017-10-03 23:27 UTC (permalink / raw)
  To: marcelo.leitner
  Cc: netdev, linux-sctp, nhorman, vyasevich, lucien.xin, David.Laight
In-Reply-To: <cover.1507069005.git.marcelo.leitner@gmail.com>

From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Tue,  3 Oct 2017 19:20:07 -0300

> This patchset introduces the SCTP Stream Schedulers are defined by
> https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13
> 
> It provides 3 schedulers at the moment: FCFS, Priority and Round Robin.
> The other 3, Round Robin per packet, Fair Capacity and Weighted Fair
> Capacity will be added later. More specifically, WFQ is required by
> WebRTC Datachannels.
> 
> The draft also defines the idata chunk, allowing a usermsg to be
> interrupted by another piece of idata from another stream. This patchset
> *doesn't* include it. It will be posted later by Xin Long.  Its
> integration with this patchset is very simple and it basically only
> requires a tweak in sctp_sched_dequeue_done(), to ignore datamsg
> boundaries.
> 
> The first 5 patches are a preparation for the next ones. The most
> relevant patches are the 4th and 6th ones. More details are available on
> each patch.
> 
> v2: changelog update on patch 3

This doesn't look too scary :-)

Series applied, thanks!

^ permalink raw reply

* [next-queue PATCH v3 0/4] TSN: Add qdisc based config interface for CBS
From: Vinicius Costa Gomes @ 2017-10-03 23:44 UTC (permalink / raw)
  To: netdev, intel-wired-lan
  Cc: Vinicius Costa Gomes, jhs, xiyou.wangcong, jiri, andre.guedes,
	ivan.briano, jesus.sanchez-palencia, boon.leong.ong,
	richardcochran, henrik, levipearson, rodney.cummings

Hi,

Changes since v2:
 - squashed the patch introducing the userspace API into the patch
   implementing CBS;

Changes since v1:
 - Solved the mqprio dependency;
 - Fixed a mqprio bug, that caused the inner qdisc to have a wrong
   dev_queue associated with it;

Changes from the RFC:
 - Fixed comments from Henrik Austad;
 - Simplified the Qdisc, using the generic implementation of callbacks
   where possible;
 - Small refactor on the driver (igb) code;

This patchset is a proposal of how the Traffic Control subsystem can
be used to offload the configuration of the Credit Based Shaper
(defined in the IEEE 802.1Q-2014 Section 8.6.8.2) into supported
network devices.

As part of this work, we've assessed previous public discussions
related to TSN enabling: patches from Henrik Austad (Cisco), the
presentation from Eric Mann at Linux Plumbers 2012, patches from
Gangfeng Huang (National Instruments) and the current state of the
OpenAVNU project (https://github.com/AVnu/OpenAvnu/).

Overview
========

Time-sensitive Networking (TSN) is a set of standards that aim to
address resources availability for providing bandwidth reservation and
bounded latency on Ethernet based LANs. The proposal described here
aims to cover mainly what is needed to enable the following standards:
802.1Qat and 802.1Qav.

The initial target of this work is the Intel i210 NIC, but other
controllers' datasheets were also taken into account, like the Renesas
RZ/A1H RZ/A1M group and the Synopsys DesignWare Ethernet QoS
controller.


Proposal
========

Feature-wise, what is covered here is the configuration interfaces for
HW implementations of the Credit-Based shaper (CBS, 802.1Qav). CBS is
a per-queue shaper. Given that this feature is related to traffic
shaping, and that the traffic control subsystem already provides a
queueing discipline that offloads config into the device driver (i.e.
mqprio), designing a new qdisc for the specific purpose of offloading
the config for the CBS shaper seemed like a good fit.

For steering traffic into the correct queues, we use the socket option
SO_PRIORITY and then a mechanism to map priority to traffic classes /
Tx queues. The qdisc mqprio is currently used in our tests.

As for the CBS config interface, this patchset is proposing a new
qdisc called 'cbs'. Its 'tc' cmd line is:

$ tc qdisc add dev IFACE parent ID cbs locredit N hicredit M sendslope S \
     idleslope I

   Note that the parameters for this qdisc are the ones defined by the
   802.1Q-2014 spec, so no hardware specific functionality is exposed here.

Per-stream shaping, as defined by IEEE 802.1Q-2014 Section 34.6.1, is
not yet covered by this proposal.


Testing this RFC
================

Attached to this cover letter are:
 - calc_cbs_params.py: A Python script to calculate the
   parameters to the CBS queueing discipline;
 - tsn-talker.c: A sample C implementation of the talker side of a stream;
 - tsn-listener.c: A sample C implementation of the listener side of a
   stream;

For testing the patches of this series, you may want to use the
attached samples to this cover letter and use the 'mqprio' qdisc to
setup the priorities to Tx queues mapping, together with the 'cbs'
qdisc to configure the HW shaper of the i210 controller:

1) Setup priorities to traffic classes to hardware queues mapping
$ tc qdisc replace dev ens4 handle 100: parent root mqprio num_tc 3 \
     map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 queues 1@0 1@1 2@2 hw 0

For a more detailed explanation, see mqprio(8). In short, this command
will map traffic with priority 3 to the hardware queue 0, traffic with
priority 2 to hardware queue 1, and the rest will be mapped to
hardware queues 2 and 3.

2) Check scheme. You want to get the inner qdiscs ID from the bottom up
$ tc -g class show dev ens4

Ex.:
+---(100:3) mqprio
|    +---(100:6) mqprio
|    +---(100:7) mqprio
|
+---(100:2) mqprio
|    +---(100:5) mqprio
|
+---(100:1) mqprio
     +---(100:4) mqprio

* Here '100:4' is Tx Queue #0 and '100:5' is Tx Queue #1.

3) Calculate CBS parameters for classes A and B. i.e. BW for A is 20Mbps and
   for B is 10Mbps:
$ calc_cbs_params.py -A 20000 -a 1500 -B 10000 -b 1500

4) Configure CBS for traffic class A (priority 3) as provided by the script:
$ tc qdisc replace dev ens4 parent 100:4 cbs locredit -1470 \
     hicredit 30 sendslope -980000 idleslope 20000

5) Configure CBS for traffic class B (priority 2):
$ tc qdisc replace dev ens4 parent 100:5 cbs \
     locredit -1485 hicredit 31 sendslope -990000 idleslope 10000

6) Run Listener:
$ ./tsn-listener -d 01:AA:AA:AA:AA:AA -i ens4 -s 1500

7) Run Talker for class A (prio 3 here), compiled from samples/tsn/talker.c
$ ./tsn-talker -d 01:AA:AA:AA:AA:AA -i ens4 -p 3 -s 1500

 * The bandwidth displayed on the listener output at this stage should be very
   close to the one configured for class A.

8) You can also run a Talker for class B (prio 2 here and using a
different address):
$ ./tsn-talker -d 01:BB:BB:BB:BB:BB -i ens4 -s 1500


Authors
=======
 - Andre Guedes <andre.guedes@intel.com>
 - Ivan Briano <ivan.briano@intel.com>
 - Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
 - Vinicius Gomes <vinicius.gomes@intel.com>

Andre Guedes (1):
  igb: Add support for CBS offload

Jesus Sanchez-Palencia (2):
  mqprio: Implement select_queue class_ops
  net/sched: Fix accessing invalid dev_queue

Vinicius Costa Gomes (1):
  net/sched: Introduce Credit Based Shaper (CBS) qdisc

 drivers/net/ethernet/intel/igb/e1000_defines.h |  23 ++
 drivers/net/ethernet/intel/igb/e1000_regs.h    |   8 +
 drivers/net/ethernet/intel/igb/igb.h           |   6 +
 drivers/net/ethernet/intel/igb/igb_main.c      | 347 +++++++++++++++++++++++++
 include/linux/netdevice.h                      |   1 +
 include/net/pkt_sched.h                        |   9 +
 include/uapi/linux/pkt_sched.h                 |  17 ++
 net/sched/Kconfig                              |  11 +
 net/sched/Makefile                             |   1 +
 net/sched/sch_cbs.c                            | 225 ++++++++++++++++
 net/sched/sch_generic.c                        |   8 +-
 net/sched/sch_mqprio.c                         |   7 +
 12 files changed, 662 insertions(+), 1 deletion(-)
 create mode 100644 net/sched/sch_cbs.c

Annex: Sample files
===================

calc_cbs_params.py
--8<---------------cut here---------------start------------->8---
#!/usr/bin/env python
#
# Copyright (c) 2017, Intel Corporation
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#     * Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of Intel Corporation nor the names of its contributors
#       may be used to endorse or promote products derived from this software
#       without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import argparse
import math

def print_cbs_params_for_class_a(args):
    """Print the 'tc' command that configures CBS for SR class A.

    'args' must provide link_speed, idleslope_a, frame_non_sr and
    frame_a. A single command line is written to stdout.
    """
    port_rate = args.link_speed
    idle = args.idleslope_a
    send = idle - port_rate

    # hiCredit and loCredit for SR class A follow equations L-10 and
    # L-12 of the 802.1Q-2014 spec, Annex L, respectively.
    hi = math.ceil(idle * args.frame_non_sr / port_rate)
    lo = math.ceil(send * args.frame_a / port_rate)

    values = (idle, send, hi, lo)
    print("tc qdisc add dev <IFNAME> parent <QDISC-ID> cbs idleslope %d sendslope %d hicredit %d locredit %d" % values)

def print_cbs_params_for_class_b(args):
    """Print the 'tc' command that configures CBS for SR class B.

    'args' must provide link_speed, idleslope_a, idleslope_b,
    frame_non_sr, frame_a and frame_b. A single command line is written
    to stdout.
    """
    idle = args.idleslope_b
    send = idle - args.link_speed

    # Annex L has no ready-made equation for the class B hiCredit, so it
    # is derived from the generic ones.
    #
    # L-3 is the primary hiCredit equation. Section L.2 states that the
    # 'maxInterferenceSize' for SR class B is the maximum burst size for
    # SR class A plus the maxInterferenceSize from SR class A (which is
    # equal to the maximum frame from non-SR traffic).
    #
    # The maximum burst size for SR class A is given by L-16. Merging
    # L-16 into L-3 yields (see section L.3 for the legend):
    #
    # hiCredit B = Rb * (     Mo         Ma   )
    #                     ---------- + ------
    #                      Ro - Ra       Ro
    #
    non_sr_term = args.frame_non_sr / (args.link_speed - args.idleslope_a)
    class_a_term = args.frame_a / args.link_speed
    hi = math.ceil(idle * (non_sr_term + class_a_term))

    # loCredit B comes straight from equation L-2.
    lo = math.ceil(send * args.frame_b / args.link_speed)

    values = (idle, send, hi, lo)
    print("tc qdisc add dev <IFNAME> parent <QDISC-ID> cbs idleslope %d sendslope %d hicredit %d locredit %d" % values)

def main():
    """Parse the command line and print CBS parameters.

    One 'tc' command line is printed for each SR class whose idleslope
    was given a value greater than zero. Invalid link speed / idleslope
    combinations are rejected with a usage error instead of failing
    later with a division by zero.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('-S', dest='link_speed', default=1000000.0, type=float,
                        help='Link speed in kbps')
    # Adjacent string literals are concatenated verbatim, so the space
    # between the two pieces has to be part of one of them.
    parser.add_argument('-s', dest='frame_non_sr', default=1500.0, type=float,
                        help='Maximum frame size from non-SR traffic (MTU '
                        'size usually)')
    parser.add_argument('-A', dest='idleslope_a', default=0, type=float,
                        help='Idleslope for SR class A in kbps')
    parser.add_argument('-a', dest='frame_a', default=0, type=float,
                        help='Maximum frame size for SR class A traffic')
    parser.add_argument('-B', dest='idleslope_b', default=0, type=float,
                        help='Idleslope for SR class B in kbps')
    parser.add_argument('-b', dest='frame_b', default=0, type=float,
                        help='Maximum frame size for SR class B traffic')

    args = parser.parse_args()

    # Every credit equation divides by the link speed, and the class B
    # hiCredit additionally divides by (link_speed - idleslope_a).
    if args.link_speed <= 0:
        parser.error('link speed must be greater than zero')
    if args.idleslope_a >= args.link_speed:
        parser.error('idleslope for SR class A must be lower than the '
                     'link speed')

    if args.idleslope_a > 0:
        print_cbs_params_for_class_a(args)

    if args.idleslope_b > 0:
        print_cbs_params_for_class_b(args)

if __name__ == "__main__":
    main()
--8<---------------cut here---------------end--------------->8---

tsn-talker.c
--8<---------------cut here---------------start------------->8---
/*
 * Copyright (c) 2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *     * Redistributions of source code must retain the above copyright notice,
 *       this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of Intel Corporation nor the names of its contributors
 *       may be used to endorse or promote products derived from this software
 *       without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <alloca.h>
#include <argp.h>
#include <arpa/inet.h>
#include <inttypes.h>
#include <linux/if.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

/* Byte value used to fill the payload of every transmitted packet. */
#define MAGIC 0xCC

/* Name of the network interface to send on (option -i). */
static uint8_t ifname[IFNAMSIZ];
/* Stream destination MAC address (option -d). */
static uint8_t macaddr[ETH_ALEN];
/* SO_PRIORITY value for the socket (option -p); -1 leaves it unset. */
static int priority = -1;
/* Size in bytes of each transmitted packet (option -s). */
static size_t size = 1500;
/* Sequence number written at the start of each packet before sending. */
static uint64_t seq;
/* Pause in microseconds between transmissions (option -D); the sender
 * only sleeps when the value is greater than zero.
 */
static int delay = -1;

/* Command-line options handed to argp. */
static struct argp_option options[] = {
	{"dst-addr", 'd', "MACADDR", 0, "Stream Destination MAC address" },
	{"delay", 'D', "NUM", 0, "Delay (in us) between packet transmission" },
	{"ifname", 'i', "IFNAME", 0, "Network Interface" },
	{"prio", 'p', "NUM", 0, "SO_PRIORITY to be set in socket" },
	{"packet-size", 's', "NUM", 0, "Size of packets to be transmitted" },
	{ 0 }
};

static error_t parser(int key, char *arg, struct argp_state *state)
{
	int res;

	switch (key) {
	case 'd':
		res = sscanf(arg, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
					&macaddr[0], &macaddr[1], &macaddr[2],
					&macaddr[3], &macaddr[4], &macaddr[5]);
		if (res != 6) {
			printf("Invalid address\n");
			exit(EXIT_FAILURE);
		}

		break;
	case 'D':
		delay = atoi(arg);
		break;
	case 'i':
		strncpy(ifname, arg, sizeof(ifname) - 1);
		break;
	case 'p':
		priority = atoi(arg);
		break;
	case 's':
		size = atoi(arg);
		break;
	}

	return 0;
}

static struct argp argp = { options, parser };

/* Sends fixed-size TSN packets to the configured destination forever.
 *
 * Each packet is filled with MAGIC and starts with a 64-bit sequence
 * number that is incremented on every transmission. Returns 1 when the
 * setup fails; the send loop itself never terminates.
 */
int main(int argc, char *argv[])
{
	int fd, res;
	struct ifreq req;
	uint8_t *data = NULL;
	struct sockaddr_ll sk_addr = {
		.sll_family = AF_PACKET,
		.sll_protocol = htons(ETH_P_TSN),
		.sll_halen = ETH_ALEN,
	};

	argp_parse(&argp, argc, argv, 0, NULL, NULL);

	/* The sequence number is stored at the start of the packet, so a
	 * smaller buffer would be overrun.
	 */
	if (size < sizeof(seq)) {
		fprintf(stderr, "Packet size must be at least %zu bytes\n",
							sizeof(seq));
		return 1;
	}

	fd = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_TSN));
	if (fd < 0) {
		perror("Couldn't open socket");
		return 1;
	}

	memset(&req, 0, sizeof(req));
	strncpy(req.ifr_name, (const char *) ifname, sizeof(req.ifr_name) - 1);
	res = ioctl(fd, SIOCGIFINDEX, &req);
	if (res < 0) {
		perror("Couldn't get interface index");
		goto err;
	}

	sk_addr.sll_ifindex = req.ifr_ifindex;
	memcpy(&sk_addr.sll_addr, macaddr, ETH_ALEN);

	if (priority != -1) {
		res = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &priority,
							sizeof(priority));
		if (res < 0) {
			perror("Couldn't set priority");
			goto err;
		}

	}

	/* Heap allocation: the size comes from the command line and a
	 * failure can be reported, unlike with alloca().
	 */
	data = malloc(size);
	if (!data) {
		perror("Couldn't allocate packet buffer");
		goto err;
	}
	memset(data, MAGIC, size);

	printf("Sending packets...\n");

	while (1) {
		ssize_t n;

		/* memcpy avoids type-punning the byte buffer. */
		memcpy(data, &seq, sizeof(seq));
		seq++;

		n = sendto(fd, data, size, 0, (struct sockaddr *) &sk_addr,
							sizeof(sk_addr));
		if (n < 0)
			perror("Failed to send data");

		if (delay > 0)
			usleep(delay);
	}

	/* Not reached: the loop above has no exit. */
	free(data);
	close(fd);
	return 0;

err:
	free(data);
	close(fd);
	return 1;
}
--8<---------------cut here---------------end--------------->8---

tsn-listener.c
--8<---------------cut here---------------start------------->8---
/*
 * Copyright (c) 2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *     * Redistributions of source code must retain the above copyright notice,
 *       this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of Intel Corporation nor the names of its contributors
 *       may be used to endorse or promote products derived from this software
 *       without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <alloca.h>
#include <argp.h>
#include <arpa/inet.h>
#include <inttypes.h>
#include <linux/if.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <poll.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/timerfd.h>
#include <unistd.h>

/* Name of the network interface to bind to (option -i); empty when unset. */
static uint8_t ifname[IFNAMSIZ];
/* Stream destination MAC address to join as multicast (option -d). */
static uint8_t macaddr[ETH_ALEN];
/* Bytes received since the last bandwidth report. */
static uint64_t data_count;
/* Expected packet size in bytes (option -s). */
static int size = 1500;
/* Seconds between bandwidth reports (option -I). */
static time_t interval = 1;
/* Whether the sequence number in each packet is verified (option -c). */
static bool check_seq = false;
/* Next sequence number expected; zero until the first packet is seen. */
static uint64_t expected_seq;

/* Command-line options handed to argp. */
static struct argp_option options[] = {
	{"check-seq", 'c', NULL, 0, "Check sequence number within packet" },
	{"dst-addr", 'd', "MACADDR", 0, "Stream Destination MAC address" },
	{"ifname", 'i', "IFNAME", 0, "Network Interface" },
	{"interval", 'I', "SEC", 0, "Interval between bandwidth reports" },
	{"packet-size", 's', "NUM", 0, "Expected packet size" },
	{ 0 }
};

/* Sanity limits for numeric options.
 * NOTE(review): both values are limits chosen for this sample, not
 * protocol constants. The packet size is bounded because the receive
 * buffer is sized straight from it.
 */
enum {
	MAX_PACKET_SIZE = 65536,
	MAX_INTERVAL_SEC = 86400,
};

/* Parses 'arg' as a decimal number within [min, max].
 *
 * Prints "Invalid <what>" and exits with EXIT_FAILURE when the text is
 * not a plain number or falls outside the range. Out-of-range input
 * makes strtol() saturate, which the range check rejects as well.
 */
static long parse_num(const char *arg, const char *what, long min, long max)
{
	char *end;
	long val = strtol(arg, &end, 10);

	if (end == arg || *end != '\0' || val < min || val > max) {
		printf("Invalid %s\n", what);
		exit(EXIT_FAILURE);
	}

	return val;
}

/* argp callback: stores each recognised option in the matching global.
 *
 * Exits with EXIT_FAILURE on a malformed MAC address, interval or packet
 * size. Returns 0 for handled options and ARGP_ERR_UNKNOWN for every
 * other key, so argp applies its default handling (including rejecting
 * stray positional arguments).
 */
static error_t parser(int key, char *arg, struct argp_state *state)
{
	int res;

	switch (key) {
	case 'c':
		check_seq = true;
		break;
	case 'd':
		res = sscanf(arg, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
					&macaddr[0], &macaddr[1], &macaddr[2],
					&macaddr[3], &macaddr[4], &macaddr[5]);
		if (res != 6) {
			printf("Invalid address\n");
			exit(EXIT_FAILURE);
		}

		break;
	case 'i':
		/* 'ifname' is zero-initialised and its last byte is never
		 * written, so the result is always NUL-terminated.
		 */
		strncpy((char *) ifname, arg, sizeof(ifname) - 1);
		break;
	case 'I':
		/* The interval arms the report timer and is the divisor of
		 * the reported rate, so it has to be positive.
		 */
		interval = parse_num(arg, "interval", 1, MAX_INTERVAL_SEC);
		break;
	case 's':
		/* The size is used as the receive buffer length. */
		size = (int) parse_num(arg, "packet size", 1, MAX_PACKET_SIZE);
		break;
	default:
		return ARGP_ERR_UNKNOWN;
	}

	return 0;
}

static struct argp argp = { options, parser };

/* Creates a periodic CLOCK_MONOTONIC timerfd that first expires after
 * 'interval' seconds and then every 'interval' seconds.
 *
 * Returns the timer file descriptor, or -1 after reporting the error.
 */
static int setup_timer(void)
{
	struct itimerspec spec = {
		.it_interval = { .tv_sec = interval },
		.it_value = { .tv_sec = interval },
	};
	int tfd;

	tfd = timerfd_create(CLOCK_MONOTONIC, 0);
	if (tfd < 0) {
		perror("Couldn't create timer");
		return -1;
	}

	if (timerfd_settime(tfd, 0, &spec, NULL) < 0) {
		perror("Couldn't set timer");
		close(tfd);
		return -1;
	}

	return tfd;
}

static int setup_socket(void)
{
	int fd, res;
	struct sockaddr_ll sk_addr = {
		.sll_family = AF_PACKET,
		.sll_protocol = htons(ETH_P_TSN),
	};

	fd = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_TSN));
	if (fd < 0) {
		perror("Couldn't open socket");
		return -1;
	}

	/* If user provided a network interface, bind() to it. */
	if (ifname[0] != '\0') {
		struct ifreq req;

		strncpy(req.ifr_name, ifname, sizeof(req.ifr_name));
		res = ioctl(fd, SIOCGIFINDEX, &req);
		if (res < 0) {
			perror("Couldn't get interface index");
			goto err;
		}

		sk_addr.sll_ifindex = req.ifr_ifindex;

		res = bind(fd, (struct sockaddr *) &sk_addr, sizeof(sk_addr));
		if (res < 0) {
			perror("Couldn't bind() to interface");
			goto err;
		}
	}

	/* If user provided the stream destination address, set it as multicast
	 * address.
	 */
	if (macaddr[0] != '\0') {
		struct packet_mreq mreq;

		mreq.mr_ifindex = sk_addr.sll_ifindex;
		mreq.mr_type = PACKET_MR_MULTICAST;
		mreq.mr_alen = ETH_ALEN;
		memcpy(&mreq.mr_address, macaddr, ETH_ALEN);

		res = setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
					&mreq, sizeof(struct packet_mreq));
		if (res < 0) {
			perror("Couldn't set PACKET_ADD_MEMBERSHIP");
			goto err;
		}
	}

	return fd;

err:
	close(fd);
	return -1;
}

/* Receives one packet from 'fd' and accounts for it.
 *
 * Reports a size mismatch when the packet length differs from 'size'
 * and, when sequence checking is enabled, a sequence mismatch when the
 * 64-bit number at the start of the packet is not the expected one. The
 * received byte count is added to 'data_count'.
 */
static void recv_packet(int fd)
{
	uint8_t *data = alloca(size);
	ssize_t n = recv(fd, data, size, 0);

	if (n < 0) {
		perror("Failed to receive data");
		return;
	}

	if (n != size)
		printf("Size mismatch: expected %d, got %zd\n", size, n);

	if (check_seq) {
		uint64_t pkt_seq;

		/* Only bytes actually received are initialised, so a short
		 * packet carries no usable sequence number.
		 */
		if ((size_t) n < sizeof(pkt_seq)) {
			printf("Packet too short to carry a sequence number\n");
			data_count += n;
			return;
		}

		/* memcpy avoids type-punning the byte buffer. */
		memcpy(&pkt_seq, data, sizeof(pkt_seq));

		/* If 'expected_seq' is equal to zero, it means this is the
		 * first packet we received so we don't know what sequence
		 * number to expect.
		 */
		if (expected_seq == 0)
			expected_seq = pkt_seq;

		if (pkt_seq != expected_seq) {
			printf("Sequence mismatch: expected %" PRIu64
					", got %" PRIu64 "\n",
					expected_seq, pkt_seq);

			expected_seq = pkt_seq;
		}

		expected_seq++;
	}

	data_count += n;
}

/* Handles one expiration of the report timer 'fd'.
 *
 * Reads the expiration counter from the timerfd, prints the data rate
 * received since the previous report in kbps and resets 'data_count'.
 */
static void report_bw(int fd)
{
	uint64_t expirations;
	uint64_t kbps;
	ssize_t n = read(fd, &expirations, sizeof(expirations));

	if (n < 0) {
		perror("Couldn't read timerfd");
		return;
	}

	/* Anything but a full counter leaves 'expirations' unusable. */
	if ((size_t) n != sizeof(expirations)) {
		printf("Short read from timerfd\n");
		return;
	}

	if (expirations != 1)
		printf("Something went wrong with timerfd\n");

	/* Nothing sensible can be reported without a positive interval. */
	if (interval <= 0)
		return;

	kbps = (data_count * 8) / (1000 * (uint64_t) interval);
	printf("Receiving data rate: %" PRIu64 " kbps\n", kbps);

	data_count = 0;
}

/* Listens for TSN packets and prints the receive rate periodically.
 *
 * Polls the packet socket and the report timer forever. Returns 1 when
 * the setup or poll() fails; there is no successful exit path.
 */
int main(int argc, char *argv[])
{
	struct pollfd fds[2];
	int sk_fd, timer_fd;

	argp_parse(&argp, argc, argv, 0, NULL, NULL);

	sk_fd = setup_socket();
	if (sk_fd < 0)
		return 1;

	timer_fd = setup_timer();
	if (timer_fd < 0) {
		close(sk_fd);
		return 1;
	}

	/* Slot 0 watches the packet socket, slot 1 the report timer. */
	fds[0] = (struct pollfd) { .fd = sk_fd, .events = POLLIN };
	fds[1] = (struct pollfd) { .fd = timer_fd, .events = POLLIN };

	printf("Waiting for packets...\n");

	for (;;) {
		if (poll(fds, 2, -1) < 0) {
			perror("Error on poll()");
			break;
		}

		if (fds[0].revents & POLLIN)
			recv_packet(fds[0].fd);

		if (fds[1].revents & POLLIN)
			report_bw(fds[1].fd);
	}

	/* The loop is only left when poll() fails. */
	close(timer_fd);
	close(sk_fd);
	return 1;
}
--8<---------------cut here---------------end--------------->8---

^ permalink raw reply

* [next-queue PATCH v3 1/4] mqprio: Implement select_queue class_ops
From: Vinicius Costa Gomes @ 2017-10-03 23:44 UTC (permalink / raw)
  To: netdev, intel-wired-lan
  Cc: Jesus Sanchez-Palencia, jhs, xiyou.wangcong, jiri, andre.guedes,
	ivan.briano, boon.leong.ong, richardcochran, henrik, levipearson,
	rodney.cummings
In-Reply-To: <20171003234435.8979-1-vinicius.gomes@intel.com>

From: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>

When replacing a child qdisc from mqprio, tc_modify_qdisc() must fetch
the netdev_queue pointer that the current child qdisc is associated
with before creating the new qdisc.

Currently, when using mqprio as root qdisc, the kernel will end up
getting the queue #0 pointer from the mqprio (root qdisc), which leaves
any new child qdisc with a possibly wrong netdev_queue pointer.

Implementing the Qdisc_class_ops select_queue() on mqprio fixes this
issue and avoids an inconsistent state when child qdiscs are replaced.

Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
---
 net/sched/sch_mqprio.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 6bcdfe6e7b63..9d762ca4a76c 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -396,6 +396,12 @@ static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 	}
 }
 
+static struct netdev_queue *mqprio_select_queue(struct Qdisc *sch,
+					    struct tcmsg *tcm)
+{
+	return mqprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent));
+}
+
 static const struct Qdisc_class_ops mqprio_class_ops = {
 	.graft		= mqprio_graft,
 	.leaf		= mqprio_leaf,
@@ -403,6 +409,7 @@ static const struct Qdisc_class_ops mqprio_class_ops = {
 	.walk		= mqprio_walk,
 	.dump		= mqprio_dump_class,
 	.dump_stats	= mqprio_dump_class_stats,
+	.select_queue	= mqprio_select_queue,
 };
 
 static struct Qdisc_ops mqprio_qdisc_ops __read_mostly = {
-- 
2.14.2

^ permalink raw reply related

* [next-queue PATCH v3 2/4] net/sched: Fix accessing invalid dev_queue
From: Vinicius Costa Gomes @ 2017-10-03 23:44 UTC (permalink / raw)
  To: netdev, intel-wired-lan
  Cc: Jesus Sanchez-Palencia, jhs, xiyou.wangcong, jiri, andre.guedes,
	ivan.briano, boon.leong.ong, richardcochran, henrik, levipearson,
	rodney.cummings
In-Reply-To: <20171003234435.8979-1-vinicius.gomes@intel.com>

From: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>

In qdisc_alloc() the dev_queue pointer was used without any checks being
performed. If qdisc_create() gets a null dev_queue pointer, it just
passes it along to qdisc_alloc(), leading to a crash. That happens if a
root qdisc implements select_queue() and returns a null dev_queue
pointer for an "invalid handle", for example.

One way to reproduce that is:

1) Setup mqprio
$ tc qdisc replace dev enp3s0 parent root mqprio num_tc 3 \
     	   map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 queues 1@0 1@1 2@2 hw 0

2) Replace the first inner qdisc
$ tc qdisc replace dev enp3s0 parent 8001:1 pfifo_fast

This will lead to the following crash:

[15274.874506] BUG: unable to handle kernel NULL pointer dereference at 0000000000000058
[15274.875044] IP: qdisc_alloc+0xd/0xf0
[15274.875253] PGD 1a37d067 P4D 1a37d067 PUD 1a0cb067 PMD 0
[15274.875566] Oops: 0000 [#1] SMP
[15274.875813] Modules linked in:
[15274.875993] CPU: 0 PID: 2006 Comm: tc Not tainted 4.14.0-rc1+ #42
[15274.876415] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-20170228_101828-anatol 04/01/2014
[15274.877019] task: ffff88001eb91d00 task.stack: ffffc90000734000
[15274.877359] RIP: 0010:qdisc_alloc+0xd/0xf0
[15274.877606] RSP: 0018:ffffc90000737a88 EFLAGS: 00010286
[15274.877904] RAX: ffffffff81ef0da0 RBX: 0000000000000000 RCX: 0000000000000000
[15274.878331] RDX: ffff88001a046830 RSI: ffffffff81ef0da0 RDI: 0000000000000000
[15274.878757] RBP: 0000000001000001 R08: 000000000000006c R09: ffffc90000737b40
[15274.879165] R10: fffffffffffffc20 R11: 0000000000000000 R12: ffff88001e308000
[15274.879571] R13: 0000000000000088 R14: ffffc90000737b40 R15: 0000000000000000
[15274.879978] FS:  00007f790a90a700(0000) GS:ffff88001fc00000(0000) knlGS:0000000000000000
[15274.880457] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[15274.880784] CR2: 0000000000000058 CR3: 000000001a003004 CR4: 00000000003606f0
[15274.881203] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[15274.881620] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[15274.882026] Call Trace:
[15274.882179]  qdisc_create+0xb0/0x380
[15274.882388]  ? security_capable+0x2e/0x50
[15274.882616]  ? nla_parse+0x32/0x100
[15274.882818]  tc_modify_qdisc+0x47f/0x560
[15274.883046]  rtnetlink_rcv_msg+0x1db/0x200
[15274.883284]  ? __skb_try_recv_datagram+0xd6/0x150
[15274.883555]  ? rtnl_calcit.isra.21+0xe0/0xe0
[15274.883814]  netlink_rcv_skb+0x92/0xf0
[15274.884031]  netlink_unicast+0x12c/0x1d0
[15274.884258]  netlink_sendmsg+0x2ee/0x330
[15274.884486]  sock_sendmsg+0x28/0x40
[15274.884689]  ___sys_sendmsg+0x25d/0x280
[15274.884913]  ? __alloc_pages_nodemask+0x120/0x180
[15274.885185]  ? page_add_new_anon_rmap+0x90/0xb0
[15274.885448]  ? __handle_mm_fault+0x74e/0xc40
[15274.885695]  ? __sys_sendmsg+0x3e/0x70
[15274.885912]  __sys_sendmsg+0x3e/0x70
[15274.886123]  entry_SYSCALL_64_fastpath+0x13/0x94
[15274.886387] RIP: 0033:0x7f7909b1a1b7
[15274.886594] RSP: 002b:00007ffe14315028 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
[15274.887042] RAX: ffffffffffffffda RBX: 0000000000661860 RCX: 00007f7909b1a1b7
[15274.887450] RDX: 0000000000000000 RSI: 00007ffe143150a0 RDI: 0000000000000003
[15274.887854] RBP: 00007ffe143150a0 R08: 0000000000000001 R09: 0000000000000000
[15274.888256] R10: 00000000000005f1 R11: 0000000000000246 R12: 00007ffe143150e0
[15274.888660] R13: 000000000066e620 R14: 00007ffe1431d180 R15: 0000000000000000
[15274.889070] Code: a8 01 74 09 48 89 df 5b e9 71 ff ff ff 5b c3 f3 c3 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 41 55 41 54 55 53 48 89 fb 44 8b 6e 20 <8b> 57 58 48 89 f5 4c 8b 27 be c0 80 40 01 41 8d bd 40 01 00 00
[15274.890154] RIP: qdisc_alloc+0xd/0xf0 RSP: ffffc90000737a88
[15274.890479] CR2: 0000000000000058
[15274.890677] ---[ end trace 6d0e946c11e46095 ]---

Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
---
 net/sched/sch_generic.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index a0a198768aad..de2408f1ccd3 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -603,8 +603,14 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 	struct Qdisc *sch;
 	unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
 	int err = -ENOBUFS;
-	struct net_device *dev = dev_queue->dev;
+	struct net_device *dev;
+
+	if (!dev_queue) {
+		err = -EINVAL;
+		goto errout;
+	}
 
+	dev = dev_queue->dev;
 	p = kzalloc_node(size, GFP_KERNEL,
 			 netdev_queue_numa_node_read(dev_queue));
 
-- 
2.14.2

^ permalink raw reply related

* [next-queue PATCH v2 4/5] net/sched: Introduce Credit Based Shaper (CBS) qdisc
From: Vinicius Costa Gomes @ 2017-10-03 23:44 UTC (permalink / raw)
  To: netdev, intel-wired-lan
  Cc: Vinicius Costa Gomes, jhs, xiyou.wangcong, jiri, andre.guedes,
	ivan.briano, jesus.sanchez-palencia, boon.leong.ong,
	richardcochran, henrik, levipearson, rodney.cummings
In-Reply-To: <20171003234435.8979-1-vinicius.gomes@intel.com>

This queueing discipline implements the shaper algorithm defined by
the 802.1Q-2014 Section 8.6.8.2 and detailed in Annex L.

Its primary usage is to apply some bandwidth reservation to user
defined traffic classes, which are mapped to different queues via the
mqprio qdisc.

Initially, it only supports offloading the traffic shaping work to
supporting controllers.

Later, when a software implementation is added, the current dependency
on being installed "under" mqprio can be lifted.

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
---
 include/linux/netdevice.h |   1 +
 include/net/pkt_sched.h   |   9 ++
 net/sched/Kconfig         |  11 +++
 net/sched/Makefile        |   1 +
 net/sched/sch_cbs.c       | 225 ++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 247 insertions(+)
 create mode 100644 net/sched/sch_cbs.c

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f535779d9dc1..5d6fb06fd80f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -775,6 +775,7 @@ enum tc_setup_type {
 	TC_SETUP_CLSFLOWER,
 	TC_SETUP_CLSMATCHALL,
 	TC_SETUP_CLSBPF,
+	TC_SETUP_CBS,
 };
 
 /* These structures hold the attributes of xdp state that are being passed
diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index 259bc191ba59..7c597b050b36 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -146,4 +146,13 @@ static inline bool is_classid_clsact_egress(u32 classid)
 	       TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_EGRESS);
 }
 
+struct tc_cbs_qopt_offload {
+	u8 enable;
+	s32 queue;
+	s32 hicredit;
+	s32 locredit;
+	s32 idleslope;
+	s32 sendslope;
+};
+
 #endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index e70ed26485a2..c03d86a7775e 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -172,6 +172,17 @@ config NET_SCH_TBF
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_tbf.
 
+config NET_SCH_CBS
+	tristate "Credit Based Shaper (CBS)"
+	---help---
+	  Say Y here if you want to use the Credit Based Shaper (CBS) packet
+	  scheduling algorithm.
+
+	  See the top of <file:net/sched/sch_cbs.c> for more details.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_cbs.
+
 config NET_SCH_GRED
 	tristate "Generic Random Early Detection (GRED)"
 	---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 7b915d226de7..80c8f92d162d 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -52,6 +52,7 @@ obj-$(CONFIG_NET_SCH_FQ_CODEL)	+= sch_fq_codel.o
 obj-$(CONFIG_NET_SCH_FQ)	+= sch_fq.o
 obj-$(CONFIG_NET_SCH_HHF)	+= sch_hhf.o
 obj-$(CONFIG_NET_SCH_PIE)	+= sch_pie.o
+obj-$(CONFIG_NET_SCH_CBS)	+= sch_cbs.o
 
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
new file mode 100644
index 000000000000..3e0fb0b92160
--- /dev/null
+++ b/net/sched/sch_cbs.c
@@ -0,0 +1,225 @@
+/*
+ * net/sched/sch_cbs.c	Credit Based Shaper
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Vinicius Costa Gomes <vinicius.gomes@intel.com>
+ *
+ */
+
+/* Credit Based Shaper (CBS)
+   =========================
+
+   This is a simple rate-limiting shaper aimed at TSN applications on
+   systems with known traffic workloads.
+
+   Its algorithm is defined by the IEEE 802.1Q-2014 Specification,
+   Section 8.6.8.2, and explained in more detail in the Annex L of the
+   same specification.
+
+   There are four tunables to be considered:
+
+	'idleslope': Idleslope is the rate of credits that is
+	accumulated (in kilobits per second) when there is at least
+	one packet waiting for transmission. Packets are transmitted
+	when the current value of credits is equal or greater than
+	zero. When there is no packet to be transmitted the amount of
+	credits is set to zero. This is the main tunable of the CBS
+	algorithm.
+
+	'sendslope':
+	Sendslope is the rate of credits that is depleted (it should be a
+	negative number of kilobits per second) when a transmission is
+	occurring. It can be calculated as follows, (IEEE 802.1Q-2014 Section
+	8.6.8.2 item g):
+
+	sendslope = idleslope - port_transmit_rate
+
+	'hicredit': Hicredit defines the maximum amount of credits (in
+	bytes) that can be accumulated. Hicredit depends on the
+	characteristics of interfering traffic,
+	'max_interference_size' is the maximum size of any burst of
+	traffic that can delay the transmission of a frame that is
+	available for transmission for this traffic class, (IEEE
+	802.1Q-2014 Annex L, Equation L-3):
+
+	hicredit = max_interference_size * (idleslope / port_transmit_rate)
+
+	'locredit': Locredit is the minimum amount of credits that can
+	be reached. It is a function of the traffic flowing through
+	this qdisc (IEEE 802.1Q-2014 Annex L, Equation L-2):
+
+	locredit = max_frame_size * (sendslope / port_transmit_rate)
+*/
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+
+struct cbs_sched_data {
+	s32 queue;
+	s32 locredit;
+	s32 hicredit;
+	s32 sendslope;
+	s32 idleslope;
+};
+
+static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+		       struct sk_buff **to_free)
+{
+	return qdisc_enqueue_tail(skb, sch);
+}
+
+static const struct nla_policy cbs_policy[TCA_CBS_MAX + 1] = {
+	[TCA_CBS_PARMS]	= { .len = sizeof(struct tc_cbs_qopt) },
+};
+
+static int cbs_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct cbs_sched_data *q = qdisc_priv(sch);
+	struct tc_cbs_qopt_offload cbs = { };
+	struct nlattr *tb[TCA_CBS_MAX + 1];
+	const struct net_device_ops *ops;
+	struct tc_cbs_qopt *qopt;
+	struct net_device *dev;
+	int err;
+
+	err = nla_parse_nested(tb, TCA_CBS_MAX, opt, cbs_policy, NULL);
+	if (err < 0)
+		return err;
+
+	err = -EINVAL;
+	if (!tb[TCA_CBS_PARMS])
+		goto done;
+
+	qopt = nla_data(tb[TCA_CBS_PARMS]);
+
+	dev = qdisc_dev(sch);
+	ops = dev->netdev_ops;
+
+	cbs.queue = q->queue;
+	cbs.enable = 1;
+	cbs.hicredit = qopt->hicredit;
+	cbs.locredit = qopt->locredit;
+	cbs.idleslope = qopt->idleslope;
+	cbs.sendslope = qopt->sendslope;
+
+	err = -EOPNOTSUPP;
+	if (!ops->ndo_setup_tc)
+		goto done;
+
+	err = ops->ndo_setup_tc(dev, TC_SETUP_CBS, &cbs);
+	if (err < 0)
+		goto done;
+
+	q->hicredit = cbs.hicredit;
+	q->locredit = cbs.locredit;
+	q->idleslope = cbs.idleslope;
+	q->sendslope = cbs.sendslope;
+
+done:
+	return err;
+}
+
+static int cbs_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct cbs_sched_data *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+
+	if (!opt)
+		return -EINVAL;
+
+	q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);
+
+	return cbs_change(sch, opt);
+}
+
+static void cbs_destroy(struct Qdisc *sch)
+{
+	struct cbs_sched_data *q = qdisc_priv(sch);
+	struct tc_cbs_qopt_offload cbs = { };
+	const struct net_device_ops *ops;
+	struct net_device *dev;
+	int err;
+
+	q->hicredit = 0;
+	q->locredit = 0;
+	q->idleslope = 0;
+	q->sendslope = 0;
+
+	dev = qdisc_dev(sch);
+	ops = dev->netdev_ops;
+
+	if (!ops->ndo_setup_tc)
+		return;
+
+	cbs.queue = q->queue;
+	cbs.enable = 0;
+
+	err = ops->ndo_setup_tc(dev, TC_SETUP_CBS, &cbs);
+	if (err < 0)
+		pr_warn("Couldn't reset queue %d to default values\n",
+			cbs.queue);
+}
+
+static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct cbs_sched_data *q = qdisc_priv(sch);
+	struct nlattr *nest;
+	struct tc_cbs_qopt opt;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (!nest)
+		goto nla_put_failure;
+
+	opt.hicredit = q->hicredit;
+	opt.locredit = q->locredit;
+	opt.sendslope = q->sendslope;
+	opt.idleslope = q->idleslope;
+
+	if (nla_put(skb, TCA_CBS_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	return nla_nest_end(skb, nest);
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
+static struct Qdisc_ops cbs_qdisc_ops __read_mostly = {
+	.next		=	NULL,
+	.id		=	"cbs",
+	.priv_size	=	sizeof(struct cbs_sched_data),
+	.enqueue	=	cbs_enqueue,
+	.dequeue	=	qdisc_dequeue_head,
+	.peek		=	qdisc_peek_dequeued,
+	.init		=	cbs_init,
+	.reset		=	qdisc_reset_queue,
+	.destroy	=	cbs_destroy,
+	.change		=	cbs_change,
+	.dump		=	cbs_dump,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init cbs_module_init(void)
+{
+	return register_qdisc(&cbs_qdisc_ops);
+}
+
+static void __exit cbs_module_exit(void)
+{
+	unregister_qdisc(&cbs_qdisc_ops);
+}
+module_init(cbs_module_init)
+module_exit(cbs_module_exit)
+MODULE_LICENSE("GPL");
-- 
2.14.2

^ permalink raw reply related

* [next-queue PATCH v3 4/4] igb: Add support for CBS offload
From: Vinicius Costa Gomes @ 2017-10-03 23:44 UTC (permalink / raw)
  To: netdev, intel-wired-lan
  Cc: Andre Guedes, jhs, xiyou.wangcong, jiri, ivan.briano,
	jesus.sanchez-palencia, boon.leong.ong, richardcochran, henrik,
	levipearson, rodney.cummings
In-Reply-To: <20171003234435.8979-1-vinicius.gomes@intel.com>

From: Andre Guedes <andre.guedes@intel.com>

This patch adds support for Credit-Based Shaper (CBS) qdisc offload
from Traffic Control system. This support enables us to leverage the
Forwarding and Queuing for Time-Sensitive Streams (FQTSS) features
from Intel i210 Ethernet Controller. FQTSS is the former 802.1Qav
standard which was merged into 802.1Q in 2014. It enables traffic
prioritization and bandwidth reservation via the Credit-Based Shaper
which is implemented in hardware by i210 controller.

The patch introduces the igb_setup_tc() function which implements the
support for CBS qdisc hardware offload in the IGB driver. CBS offload
is the only traffic control offload supported by the driver at the
moment.

FQTSS transmission mode from i210 controller is automatically enabled
by the IGB driver when the CBS is enabled for the first hardware
queue. Likewise, FQTSS mode is automatically disabled when CBS is
disabled for the last hardware queue. Changing FQTSS mode requires NIC
reset.

FQTSS feature is supported by i210 controller only.

Signed-off-by: Andre Guedes <andre.guedes@intel.com>
---
 drivers/net/ethernet/intel/igb/e1000_defines.h |  23 ++
 drivers/net/ethernet/intel/igb/e1000_regs.h    |   8 +
 drivers/net/ethernet/intel/igb/igb.h           |   6 +
 drivers/net/ethernet/intel/igb/igb_main.c      | 347 +++++++++++++++++++++++++
 4 files changed, 384 insertions(+)

diff --git a/drivers/net/ethernet/intel/igb/e1000_defines.h b/drivers/net/ethernet/intel/igb/e1000_defines.h
index 1de82f247312..83cabff1e0ab 100644
--- a/drivers/net/ethernet/intel/igb/e1000_defines.h
+++ b/drivers/net/ethernet/intel/igb/e1000_defines.h
@@ -353,7 +353,18 @@
 #define E1000_RXPBS_CFG_TS_EN           0x80000000
 
 #define I210_RXPBSIZE_DEFAULT		0x000000A2 /* RXPBSIZE default */
+#define I210_RXPBSIZE_MASK		0x0000003F
+#define I210_RXPBSIZE_PB_32KB		0x00000020
 #define I210_TXPBSIZE_DEFAULT		0x04000014 /* TXPBSIZE default */
+#define I210_TXPBSIZE_MASK		0xC0FFFFFF
+#define I210_TXPBSIZE_PB0_8KB		(8 << 0)
+#define I210_TXPBSIZE_PB1_8KB		(8 << 6)
+#define I210_TXPBSIZE_PB2_4KB		(4 << 12)
+#define I210_TXPBSIZE_PB3_4KB		(4 << 18)
+
+#define I210_DTXMXPKTSZ_DEFAULT		0x00000098
+
+#define I210_SR_QUEUES_NUM		2
 
 /* SerDes Control */
 #define E1000_SCTL_DISABLE_SERDES_LOOPBACK 0x0400
@@ -1051,4 +1062,16 @@
 #define E1000_VLAPQF_P_VALID(_n)	(0x1 << (3 + (_n) * 4))
 #define E1000_VLAPQF_QUEUE_MASK	0x03
 
+/* TX Qav Control fields */
+#define E1000_TQAVCTRL_XMIT_MODE	BIT(0)
+#define E1000_TQAVCTRL_DATAFETCHARB	BIT(4)
+#define E1000_TQAVCTRL_DATATRANARB	BIT(8)
+
+/* TX Qav Credit Control fields */
+#define E1000_TQAVCC_IDLESLOPE_MASK	0xFFFF
+#define E1000_TQAVCC_QUEUEMODE		BIT(31)
+
+/* Transmit Descriptor Control fields */
+#define E1000_TXDCTL_PRIORITY		BIT(27)
+
 #endif
diff --git a/drivers/net/ethernet/intel/igb/e1000_regs.h b/drivers/net/ethernet/intel/igb/e1000_regs.h
index 58adbf234e07..8eee081d395f 100644
--- a/drivers/net/ethernet/intel/igb/e1000_regs.h
+++ b/drivers/net/ethernet/intel/igb/e1000_regs.h
@@ -421,6 +421,14 @@ do { \
 
 #define E1000_I210_FLA		0x1201C
 
+#define E1000_I210_DTXMXPKTSZ	0x355C
+
+#define E1000_I210_TXDCTL(_n)	(0x0E028 + ((_n) * 0x40))
+
+#define E1000_I210_TQAVCTRL	0x3570
+#define E1000_I210_TQAVCC(_n)	(0x3004 + ((_n) * 0x40))
+#define E1000_I210_TQAVHC(_n)	(0x300C + ((_n) * 0x40))
+
 #define E1000_INVM_DATA_REG(_n)	(0x12120 + 4*(_n))
 #define E1000_INVM_SIZE		64 /* Number of INVM Data Registers */
 
diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index 06ffb2bc713e..92845692087a 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -281,6 +281,11 @@ struct igb_ring {
 	u16 count;			/* number of desc. in the ring */
 	u8 queue_index;			/* logical index of the ring*/
 	u8 reg_idx;			/* physical index of the ring */
+	bool cbs_enable;		/* indicates if CBS is enabled */
+	s32 idleslope;			/* idleSlope in kbps */
+	s32 sendslope;			/* sendSlope in kbps */
+	s32 hicredit;			/* hiCredit in bytes */
+	s32 locredit;			/* loCredit in bytes */
 
 	/* everything past this point are written often */
 	u16 next_to_clean;
@@ -621,6 +626,7 @@ struct igb_adapter {
 #define IGB_FLAG_EEE			BIT(14)
 #define IGB_FLAG_VLAN_PROMISC		BIT(15)
 #define IGB_FLAG_RX_LEGACY		BIT(16)
+#define IGB_FLAG_FQTSS			BIT(17)
 
 /* Media Auto Sense */
 #define IGB_MAS_ENABLE_0		0X0001
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index fd4a46b03cc8..03b8d0f4acfd 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -34,6 +34,7 @@
 #include <linux/slab.h>
 #include <net/checksum.h>
 #include <net/ip6_checksum.h>
+#include <net/pkt_sched.h>
 #include <linux/net_tstamp.h>
 #include <linux/mii.h>
 #include <linux/ethtool.h>
@@ -62,6 +63,17 @@
 #define BUILD 0
 #define DRV_VERSION __stringify(MAJ) "." __stringify(MIN) "." \
 __stringify(BUILD) "-k"
+
+enum queue_mode {
+	QUEUE_MODE_STRICT_PRIORITY,
+	QUEUE_MODE_STREAM_RESERVATION,
+};
+
+enum tx_queue_prio {
+	TX_QUEUE_PRIO_HIGH,
+	TX_QUEUE_PRIO_LOW,
+};
+
 char igb_driver_name[] = "igb";
 char igb_driver_version[] = DRV_VERSION;
 static const char igb_driver_string[] =
@@ -1271,6 +1283,12 @@ static int igb_alloc_q_vector(struct igb_adapter *adapter,
 		ring->count = adapter->tx_ring_count;
 		ring->queue_index = txr_idx;
 
+		ring->cbs_enable = false;
+		ring->idleslope = 0;
+		ring->sendslope = 0;
+		ring->hicredit = 0;
+		ring->locredit = 0;
+
 		u64_stats_init(&ring->tx_syncp);
 		u64_stats_init(&ring->tx_syncp2);
 
@@ -1598,6 +1616,284 @@ static void igb_get_hw_control(struct igb_adapter *adapter)
 			ctrl_ext | E1000_CTRL_EXT_DRV_LOAD);
 }
 
+static void enable_fqtss(struct igb_adapter *adapter, bool enable)
+{
+	struct net_device *netdev = adapter->netdev;
+	struct e1000_hw *hw = &adapter->hw;
+
+	WARN_ON(hw->mac.type != e1000_i210);
+
+	if (enable)
+		adapter->flags |= IGB_FLAG_FQTSS;
+	else
+		adapter->flags &= ~IGB_FLAG_FQTSS;
+
+	if (netif_running(netdev))
+		schedule_work(&adapter->reset_task);
+}
+
+static bool is_fqtss_enabled(struct igb_adapter *adapter)
+{
+	return (adapter->flags & IGB_FLAG_FQTSS) ? true : false;
+}
+
+static void set_tx_desc_fetch_prio(struct e1000_hw *hw, int queue,
+				   enum tx_queue_prio prio)
+{
+	u32 val;
+
+	WARN_ON(hw->mac.type != e1000_i210);
+	WARN_ON(queue < 0 || queue > 3);	/* i210 has Tx queues 0..3 */
+
+	val = rd32(E1000_I210_TXDCTL(queue));
+	/* Read-modify-write: only the PRIORITY bit of TXDCTL changes. */
+	if (prio == TX_QUEUE_PRIO_HIGH)
+		val |= E1000_TXDCTL_PRIORITY;
+	else
+		val &= ~E1000_TXDCTL_PRIORITY;
+
+	wr32(E1000_I210_TXDCTL(queue), val);
+}
+
+static void set_queue_mode(struct e1000_hw *hw, int queue, enum queue_mode mode)
+{
+	u32 val;
+
+	WARN_ON(hw->mac.type != e1000_i210);
+	WARN_ON(queue < 0 || queue > 1);
+
+	val = rd32(E1000_I210_TQAVCC(queue));
+
+	if (mode == QUEUE_MODE_STREAM_RESERVATION)
+		val |= E1000_TQAVCC_QUEUEMODE;
+	else
+		val &= ~E1000_TQAVCC_QUEUEMODE;
+
+	wr32(E1000_I210_TQAVCC(queue), val);
+}
+
+/**
+ *  igb_configure_cbs - Configure Credit-Based Shaper (CBS)
+ *  @adapter: pointer to adapter struct
+ *  @queue: queue number
+ *  @enable: true = enable CBS, false = disable CBS
+ *  @idleslope: idleSlope in kbps
+ *  @sendslope: sendSlope in kbps
+ *  @hicredit: hiCredit in bytes
+ *  @locredit: loCredit in bytes
+ *
+ *  Configure CBS for a given hardware queue. When disabling, idleslope,
+ *  sendslope, hicredit, locredit arguments are ignored. This function
+ *  does not return a value.
+ **/
+static void igb_configure_cbs(struct igb_adapter *adapter, int queue,
+			      bool enable, int idleslope, int sendslope,
+			      int hicredit, int locredit)
+{
+	struct net_device *netdev = adapter->netdev;
+	struct e1000_hw *hw = &adapter->hw;
+	u32 tqavcc;
+	u16 value;
+
+	WARN_ON(hw->mac.type != e1000_i210);
+	WARN_ON(queue < 0 || queue > 1);
+
+	if (enable) {
+		set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_HIGH);
+		set_queue_mode(hw, queue, QUEUE_MODE_STREAM_RESERVATION);
+
+		/* According to i210 datasheet section 7.2.7.7, we should set
+		 * the 'idleSlope' field from TQAVCC register following the
+		 * equation:
+		 *
+		 * For 100 Mbps link speed:
+		 *
+		 *     value = BW * 0x7735 * 0.2                          (E1)
+		 *
+		 * For 1000Mbps link speed:
+		 *
+		 *     value = BW * 0x7735 * 2                            (E2)
+		 *
+		 * E1 and E2 can be merged into one equation as shown below.
+		 * Note that 'link-speed' is in Mbps.
+		 *
+		 *     value = BW * 0x7735 * 2 * link-speed
+		 *                           --------------               (E3)
+		 *                                1000
+		 *
+		 * 'BW' is the percentage bandwidth out of full link speed
+		 * which can be found with the following equation. Note that
+		 * idleSlope here is the parameter from this function which
+		 * is in kbps.
+		 *
+		 *     BW =     idleSlope
+		 *          -----------------                             (E4)
+		 *          link-speed * 1000
+		 *
+		 * That said, we can come up with a generic equation to
+		 * calculate the value we should set in the TQAVCC register by
+		 * replacing 'BW' in E3 by E4. The resulting equation is:
+		 *
+		 * value =     idleSlope     * 0x7735 * 2 * link-speed
+		 *         -----------------            --------------    (E5)
+		 *         link-speed * 1000                 1000
+		 *
+		 * 'link-speed' is present in both sides of the fraction so
+		 * it is canceled out. The final equation is the following:
+		 *
+		 *     value = idleSlope * 61034
+		 *             -----------------                          (E6)
+		 *                  1000000
+		 */
+		value = DIV_ROUND_UP_ULL(idleslope * 61034ULL, 1000000);
+
+		tqavcc = rd32(E1000_I210_TQAVCC(queue));
+		tqavcc &= ~E1000_TQAVCC_IDLESLOPE_MASK;
+		tqavcc |= value;
+		wr32(E1000_I210_TQAVCC(queue), tqavcc);
+
+		wr32(E1000_I210_TQAVHC(queue), 0x80000000 + hicredit * 0x7735);
+	} else {
+		set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_LOW);
+		set_queue_mode(hw, queue, QUEUE_MODE_STRICT_PRIORITY);
+
+		/* Set idleSlope to zero. */
+		tqavcc = rd32(E1000_I210_TQAVCC(queue));
+		tqavcc &= ~E1000_TQAVCC_IDLESLOPE_MASK;
+		wr32(E1000_I210_TQAVCC(queue), tqavcc);
+
+		/* Set hiCredit to zero. */
+		wr32(E1000_I210_TQAVHC(queue), 0);
+	}
+
+	/* XXX: In i210 controller the sendSlope and loCredit parameters from
+	 * CBS are not configurable by software so we don't do any 'controller
+	 * configuration' in respect to these parameters.
+	 */
+
+	netdev_dbg(netdev, "CBS %s: queue %d idleslope %d sendslope %d hiCredit %d locredit %d\n",
+		   (enable) ? "enabled" : "disabled", queue,
+		   idleslope, sendslope, hicredit, locredit);
+}
+
+static int igb_save_cbs_params(struct igb_adapter *adapter, int queue,
+			       bool enable, int idleslope, int sendslope,
+			       int hicredit, int locredit)
+{
+	struct igb_ring *ring;
+
+	if (queue < 0 || queue >= adapter->num_tx_queues)
+		return -EINVAL;
+	/* tx_ring[] is indexed 0..num_tx_queues-1, so queue is valid here. */
+	ring = adapter->tx_ring[queue];
+	/* Cache the settings; igb_setup_tx_mode() re-applies them to hardware. */
+	ring->cbs_enable = enable;
+	ring->idleslope = idleslope;
+	ring->sendslope = sendslope;
+	ring->hicredit = hicredit;
+	ring->locredit = locredit;
+
+	return 0;
+}
+
+static bool is_any_cbs_enabled(struct igb_adapter *adapter)
+{
+	struct igb_ring *ring;
+	int i;
+
+	for (i = 0; i < adapter->num_tx_queues; i++) {
+		ring = adapter->tx_ring[i];
+
+		if (ring->cbs_enable)
+			return true;
+	}
+
+	return false;
+}
+
+static void igb_setup_tx_mode(struct igb_adapter *adapter)
+{
+	struct net_device *netdev = adapter->netdev;
+	struct e1000_hw *hw = &adapter->hw;
+	u32 val;
+
+	/* Only i210 controller supports changing the transmission mode. */
+	if (hw->mac.type != e1000_i210)
+		return;
+
+	if (is_fqtss_enabled(adapter)) {
+		int i, max_queue;
+
+		/* Configure TQAVCTRL register: set transmit mode to 'Qav',
+		 * set data fetch arbitration to 'round robin' and set data
+		 * transfer arbitration to 'credit shaper algorithm'.
+		 */
+		val = rd32(E1000_I210_TQAVCTRL);
+		val |= E1000_TQAVCTRL_XMIT_MODE | E1000_TQAVCTRL_DATATRANARB;
+		val &= ~E1000_TQAVCTRL_DATAFETCHARB;
+		wr32(E1000_I210_TQAVCTRL, val);
+
+		/* Configure Tx and Rx packet buffers sizes as described in
+		 * i210 datasheet section 7.2.7.7.
+		 */
+		val = rd32(E1000_TXPBS);
+		val &= ~I210_TXPBSIZE_MASK;
+		val |= I210_TXPBSIZE_PB0_8KB | I210_TXPBSIZE_PB1_8KB |
+			I210_TXPBSIZE_PB2_4KB | I210_TXPBSIZE_PB3_4KB;
+		wr32(E1000_TXPBS, val);
+
+		val = rd32(E1000_RXPBS);
+		val &= ~I210_RXPBSIZE_MASK;
+		val |= I210_RXPBSIZE_PB_32KB;
+		wr32(E1000_RXPBS, val);
+
+		/* Section 8.12.9 states that MAX_TPKT_SIZE from DTXMXPKTSZ
+		 * register should not exceed the buffer size programmed in
+		 * TXPBS. The smallest buffer size programmed in TXPBS is 4kB
+		 * so according to the datasheet we should set MAX_TPKT_SIZE to
+		 * 4kB / 64.
+		 *
+		 * However, when we do so, no frames from queues 2 and 3 are
+		 * transmitted.  It seems MAX_TPKT_SIZE should not be greater
+		 * than or _equal_ to the buffer size programmed in TXPBS. For
+		 * this reason, we set MAX_TPKT_SIZE to (4kB - 1) / 64.
+		 */
+		val = (4096 - 1) / 64;
+		wr32(E1000_I210_DTXMXPKTSZ, val);
+
+		/* Since FQTSS mode is enabled, apply any CBS configuration
+		 * previously set. If no previous CBS configuration has been
+		 * done, then the initial configuration is applied, which means
+		 * CBS is disabled.
+		 */
+		max_queue = (adapter->num_tx_queues < I210_SR_QUEUES_NUM) ?
+			    adapter->num_tx_queues : I210_SR_QUEUES_NUM;
+
+		for (i = 0; i < max_queue; i++) {
+			struct igb_ring *ring = adapter->tx_ring[i];
+
+			igb_configure_cbs(adapter, i, ring->cbs_enable,
+					  ring->idleslope, ring->sendslope,
+					  ring->hicredit, ring->locredit);
+		}
+	} else {
+		wr32(E1000_RXPBS, I210_RXPBSIZE_DEFAULT);
+		wr32(E1000_TXPBS, I210_TXPBSIZE_DEFAULT);
+		wr32(E1000_I210_DTXMXPKTSZ, I210_DTXMXPKTSZ_DEFAULT);
+
+		val = rd32(E1000_I210_TQAVCTRL);
+		/* According to Section 8.12.21, the other flags we've set when
+		 * enabling FQTSS are not relevant when disabling FQTSS so we
+		 * don't set them here.
+		 */
+		val &= ~E1000_TQAVCTRL_XMIT_MODE;
+		wr32(E1000_I210_TQAVCTRL, val);
+	}
+
+	netdev_dbg(netdev, "FQTSS %s\n", (is_fqtss_enabled(adapter)) ?
+		   "enabled" : "disabled");
+}
+
 /**
  *  igb_configure - configure the hardware for RX and TX
  *  @adapter: private board structure
@@ -1609,6 +1905,7 @@ static void igb_configure(struct igb_adapter *adapter)
 
 	igb_get_hw_control(adapter);
 	igb_set_rx_mode(netdev);
+	igb_setup_tx_mode(adapter);
 
 	igb_restore_vlan(adapter);
 
@@ -2150,6 +2447,55 @@ igb_features_check(struct sk_buff *skb, struct net_device *dev,
 	return features;
 }
 
+static int igb_offload_cbs(struct igb_adapter *adapter,
+			   struct tc_cbs_qopt_offload *qopt)
+{
+	struct e1000_hw *hw = &adapter->hw;
+	int err;
+
+	/* CBS offloading is only supported by i210 controller. */
+	if (hw->mac.type != e1000_i210)
+		return -EOPNOTSUPP;
+
+	/* CBS offloading is only supported by queue 0 and queue 1. */
+	if (qopt->queue < 0 || qopt->queue > 1)
+		return -EINVAL;
+
+	err = igb_save_cbs_params(adapter, qopt->queue, qopt->enable,
+				  qopt->idleslope, qopt->sendslope,
+				  qopt->hicredit, qopt->locredit);
+	if (err)
+		return err;
+
+	if (is_fqtss_enabled(adapter)) {
+		igb_configure_cbs(adapter, qopt->queue, qopt->enable,
+				  qopt->idleslope, qopt->sendslope,
+				  qopt->hicredit, qopt->locredit);
+
+		if (!is_any_cbs_enabled(adapter))
+			enable_fqtss(adapter, false);
+
+	} else {
+		enable_fqtss(adapter, true);
+	}
+
+	return 0;
+}
+
+static int igb_setup_tc(struct net_device *dev, enum tc_setup_type type,
+			void *type_data)
+{
+	struct igb_adapter *adapter = netdev_priv(dev);
+
+	switch (type) {
+	case TC_SETUP_CBS:
+		return igb_offload_cbs(adapter, type_data);
+
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
 static const struct net_device_ops igb_netdev_ops = {
 	.ndo_open		= igb_open,
 	.ndo_stop		= igb_close,
@@ -2175,6 +2521,7 @@ static const struct net_device_ops igb_netdev_ops = {
 	.ndo_set_features	= igb_set_features,
 	.ndo_fdb_add		= igb_ndo_fdb_add,
 	.ndo_features_check	= igb_features_check,
+	.ndo_setup_tc		= igb_setup_tc,
 };
 
 /**
-- 
2.14.2

^ permalink raw reply related

* [next-queue PATCH v3 3/4] net/sched: Introduce Credit Based Shaper (CBS) qdisc
From: Vinicius Costa Gomes @ 2017-10-03 23:44 UTC (permalink / raw)
  To: netdev, intel-wired-lan
  Cc: Vinicius Costa Gomes, jhs, xiyou.wangcong, jiri, andre.guedes,
	ivan.briano, jesus.sanchez-palencia, boon.leong.ong,
	richardcochran, henrik, levipearson, rodney.cummings
In-Reply-To: <20171003234435.8979-1-vinicius.gomes@intel.com>

This queueing discipline implements the shaper algorithm defined by
the 802.1Q-2014 Section 8.6.8.2 and detailed in Annex L.

Its primary usage is to apply some bandwidth reservation to user
defined traffic classes, which are mapped to different queues via the
mqprio qdisc.

Initially, it only supports offloading the traffic shaping work to
supporting controllers.

Later, when a software implementation is added, the current dependency
on being installed "under" mqprio can be lifted.

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
---
 include/linux/netdevice.h      |   1 +
 include/net/pkt_sched.h        |   9 ++
 include/uapi/linux/pkt_sched.h |  17 ++++
 net/sched/Kconfig              |  11 ++
 net/sched/Makefile             |   1 +
 net/sched/sch_cbs.c            | 225 +++++++++++++++++++++++++++++++++++++++++
 6 files changed, 264 insertions(+)
 create mode 100644 net/sched/sch_cbs.c

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e1d6ef130611..b8798adc214f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -775,6 +775,7 @@ enum tc_setup_type {
 	TC_SETUP_CLSFLOWER,
 	TC_SETUP_CLSMATCHALL,
 	TC_SETUP_CLSBPF,
+	TC_SETUP_CBS,
 };
 
 /* These structures hold the attributes of xdp state that are being passed
diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index 259bc191ba59..7c597b050b36 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -146,4 +146,13 @@ static inline bool is_classid_clsact_egress(u32 classid)
 	       TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_EGRESS);
 }
 
+struct tc_cbs_qopt_offload {
+	u8 enable;
+	s32 queue;
+	s32 hicredit;
+	s32 locredit;
+	s32 idleslope;
+	s32 sendslope;
+};
+
 #endif
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 099bf5528fed..27c849c053cf 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -871,4 +871,21 @@ struct tc_pie_xstats {
 	__u32 maxq;             /* maximum queue size */
 	__u32 ecn_mark;         /* packets marked with ecn*/
 };
+
+/* CBS */
+struct tc_cbs_qopt {
+	__s32 hicredit;
+	__s32 locredit;
+	__s32 idleslope;
+	__s32 sendslope;
+};
+
+enum {
+	TCA_CBS_UNSPEC,
+	TCA_CBS_PARMS,
+	__TCA_CBS_MAX,
+};
+
+#define TCA_CBS_MAX (__TCA_CBS_MAX - 1)
+
 #endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index e70ed26485a2..c03d86a7775e 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -172,6 +172,17 @@ config NET_SCH_TBF
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_tbf.
 
+config NET_SCH_CBS
+	tristate "Credit Based Shaper (CBS)"
+	---help---
+	  Say Y here if you want to use the Credit Based Shaper (CBS) packet
+	  scheduling algorithm.
+
+	  See the top of <file:net/sched/sch_cbs.c> for more details.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_cbs.
+
 config NET_SCH_GRED
 	tristate "Generic Random Early Detection (GRED)"
 	---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 7b915d226de7..80c8f92d162d 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -52,6 +52,7 @@ obj-$(CONFIG_NET_SCH_FQ_CODEL)	+= sch_fq_codel.o
 obj-$(CONFIG_NET_SCH_FQ)	+= sch_fq.o
 obj-$(CONFIG_NET_SCH_HHF)	+= sch_hhf.o
 obj-$(CONFIG_NET_SCH_PIE)	+= sch_pie.o
+obj-$(CONFIG_NET_SCH_CBS)	+= sch_cbs.o
 
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
new file mode 100644
index 000000000000..3e0fb0b92160
--- /dev/null
+++ b/net/sched/sch_cbs.c
@@ -0,0 +1,225 @@
+/*
+ * net/sched/sch_cbs.c	Credit Based Shaper
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Vinicius Costa Gomes <vinicius.gomes@intel.com>
+ *
+ */
+
+/* Credit Based Shaper (CBS)
+   =========================
+
+   This is a simple rate-limiting shaper aimed at TSN applications on
+   systems with known traffic workloads.
+
+   Its algorithm is defined by the IEEE 802.1Q-2014 Specification,
+   Section 8.6.8.2, and explained in more detail in the Annex L of the
+   same specification.
+
+   There are four tunables to be considered:
+
+	'idleslope': Idleslope is the rate of credits that is
+	accumulated (in kilobits per second) when there is at least
+	one packet waiting for transmission. Packets are transmitted
+	when the current value of credits is equal or greater than
+	zero. When there is no packet to be transmitted the amount of
+	credits is set to zero. This is the main tunable of the CBS
+	algorithm.
+
+	'sendslope':
+	Sendslope is the rate of credits that is depleted (it should be a
+	negative number of kilobits per second) when a transmission is
+	occurring. It can be calculated as follows, (IEEE 802.1Q-2014 Section
+	8.6.8.2 item g):
+
+	sendslope = idleslope - port_transmit_rate
+
+	'hicredit': Hicredit defines the maximum amount of credits (in
+	bytes) that can be accumulated. Hicredit depends on the
+	characteristics of interfering traffic,
+	'max_interference_size' is the maximum size of any burst of
+	traffic that can delay the transmission of a frame that is
+	available for transmission for this traffic class, (IEEE
+	802.1Q-2014 Annex L, Equation L-3):
+
+	hicredit = max_interference_size * (idleslope / port_transmit_rate)
+
+	'locredit': Locredit is the minimum amount of credits that can
+	be reached. It is a function of the traffic flowing through
+	this qdisc (IEEE 802.1Q-2014 Annex L, Equation L-2):
+
+	locredit = max_frame_size * (sendslope / port_transmit_rate)
+*/
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+
+struct cbs_sched_data {
+	s32 queue;
+	s32 locredit;
+	s32 hicredit;
+	s32 sendslope;
+	s32 idleslope;
+};
+
+static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+		       struct sk_buff **to_free)
+{
+	return qdisc_enqueue_tail(skb, sch);
+}
+
+static const struct nla_policy cbs_policy[TCA_CBS_MAX + 1] = {
+	[TCA_CBS_PARMS]	= { .len = sizeof(struct tc_cbs_qopt) },
+};
+
+static int cbs_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct cbs_sched_data *q = qdisc_priv(sch);
+	struct tc_cbs_qopt_offload cbs = { };
+	struct nlattr *tb[TCA_CBS_MAX + 1];
+	const struct net_device_ops *ops;
+	struct tc_cbs_qopt *qopt;
+	struct net_device *dev;
+	int err;
+
+	err = nla_parse_nested(tb, TCA_CBS_MAX, opt, cbs_policy, NULL);
+	if (err < 0)
+		return err;
+
+	err = -EINVAL;
+	if (!tb[TCA_CBS_PARMS])
+		goto done;
+
+	qopt = nla_data(tb[TCA_CBS_PARMS]);
+
+	dev = qdisc_dev(sch);
+	ops = dev->netdev_ops;
+
+	cbs.queue = q->queue;
+	cbs.enable = 1;
+	cbs.hicredit = qopt->hicredit;
+	cbs.locredit = qopt->locredit;
+	cbs.idleslope = qopt->idleslope;
+	cbs.sendslope = qopt->sendslope;
+
+	err = -EOPNOTSUPP;
+	if (!ops->ndo_setup_tc)
+		goto done;
+
+	err = ops->ndo_setup_tc(dev, TC_SETUP_CBS, &cbs);
+	if (err < 0)
+		goto done;
+
+	q->hicredit = cbs.hicredit;
+	q->locredit = cbs.locredit;
+	q->idleslope = cbs.idleslope;
+	q->sendslope = cbs.sendslope;
+
+done:
+	return err;
+}
+
+static int cbs_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct cbs_sched_data *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+
+	if (!opt)
+		return -EINVAL;
+
+	q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);
+
+	return cbs_change(sch, opt);
+}
+
+static void cbs_destroy(struct Qdisc *sch)
+{
+	struct cbs_sched_data *q = qdisc_priv(sch);
+	struct tc_cbs_qopt_offload cbs = { };
+	const struct net_device_ops *ops;
+	struct net_device *dev;
+	int err;
+
+	q->hicredit = 0;
+	q->locredit = 0;
+	q->idleslope = 0;
+	q->sendslope = 0;
+
+	dev = qdisc_dev(sch);
+	ops = dev->netdev_ops;
+
+	if (!ops->ndo_setup_tc)
+		return;
+
+	cbs.queue = q->queue;
+	cbs.enable = 0;
+
+	err = ops->ndo_setup_tc(dev, TC_SETUP_CBS, &cbs);
+	if (err < 0)
+		pr_warn("Couldn't reset queue %d to default values\n",
+			cbs.queue);
+}
+
+static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct cbs_sched_data *q = qdisc_priv(sch);
+	struct nlattr *nest;
+	struct tc_cbs_qopt opt;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (!nest)
+		goto nla_put_failure;
+
+	opt.hicredit = q->hicredit;
+	opt.locredit = q->locredit;
+	opt.sendslope = q->sendslope;
+	opt.idleslope = q->idleslope;
+
+	if (nla_put(skb, TCA_CBS_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	return nla_nest_end(skb, nest);
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
+static struct Qdisc_ops cbs_qdisc_ops __read_mostly = {
+	.next		=	NULL,
+	.id		=	"cbs",
+	.priv_size	=	sizeof(struct cbs_sched_data),
+	.enqueue	=	cbs_enqueue,
+	.dequeue	=	qdisc_dequeue_head,
+	.peek		=	qdisc_peek_dequeued,
+	.init		=	cbs_init,
+	.reset		=	qdisc_reset_queue,
+	.destroy	=	cbs_destroy,
+	.change		=	cbs_change,
+	.dump		=	cbs_dump,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init cbs_module_init(void)
+{
+	return register_qdisc(&cbs_qdisc_ops);
+}
+
+static void __exit cbs_module_exit(void)
+{
+	unregister_qdisc(&cbs_qdisc_ops);
+}
+module_init(cbs_module_init)
+module_exit(cbs_module_exit)
+MODULE_LICENSE("GPL");
-- 
2.14.2

^ permalink raw reply related

* [next-queue PATCH v2 3/5] net/sched: Introduce the user API for the CBS shaper
From: Vinicius Costa Gomes @ 2017-10-03 23:44 UTC (permalink / raw)
  To: netdev, intel-wired-lan
  Cc: Vinicius Costa Gomes, jhs, xiyou.wangcong, jiri, andre.guedes,
	ivan.briano, jesus.sanchez-palencia, boon.leong.ong,
	richardcochran, henrik, levipearson, rodney.cummings
In-Reply-To: <20171003234435.8979-1-vinicius.gomes@intel.com>

Export the API necessary for configuring the CBS shaper (implemented
in the next patch) via the tc tool.

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
---
 include/uapi/linux/pkt_sched.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 099bf5528fed..27c849c053cf 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -871,4 +871,21 @@ struct tc_pie_xstats {
 	__u32 maxq;             /* maximum queue size */
 	__u32 ecn_mark;         /* packets marked with ecn*/
 };
+
+/* CBS */
+struct tc_cbs_qopt {
+	__s32 hicredit;
+	__s32 locredit;
+	__s32 idleslope;
+	__s32 sendslope;
+};
+
+enum {
+	TCA_CBS_UNSPEC,
+	TCA_CBS_PARMS,
+	__TCA_CBS_MAX,
+};
+
+#define TCA_CBS_MAX (__TCA_CBS_MAX - 1)
+
 #endif
-- 
2.14.2

^ permalink raw reply related

* [next-queue PATCH v2 5/5] igb: Add support for CBS offload
From: Vinicius Costa Gomes @ 2017-10-03 23:44 UTC (permalink / raw)
  To: netdev, intel-wired-lan
  Cc: Andre Guedes, jhs, xiyou.wangcong, jiri, ivan.briano,
	jesus.sanchez-palencia, boon.leong.ong, richardcochran, henrik,
	levipearson, rodney.cummings
In-Reply-To: <20171003234435.8979-1-vinicius.gomes@intel.com>

From: Andre Guedes <andre.guedes@intel.com>

This patch adds support for Credit-Based Shaper (CBS) qdisc offload
from the Traffic Control system. This support enables us to leverage the
Forwarding and Queuing for Time-Sensitive Streams (FQTSS) features
from Intel i210 Ethernet Controller. FQTSS is the former 802.1Qav
standard which was merged into 802.1Q in 2014. It enables traffic
prioritization and bandwidth reservation via the Credit-Based Shaper
which is implemented in hardware by i210 controller.

The patch introduces the igb_setup_tc() function which implements the
support for CBS qdisc hardware offload in the IGB driver. CBS offload
is the only traffic control offload supported by the driver at the
moment.

FQTSS transmission mode from i210 controller is automatically enabled
by the IGB driver when the CBS is enabled for the first hardware
queue. Likewise, FQTSS mode is automatically disabled when CBS is
disabled for the last hardware queue. Changing FQTSS mode requires NIC
reset.

FQTSS feature is supported by i210 controller only.

Signed-off-by: Andre Guedes <andre.guedes@intel.com>
---
 drivers/net/ethernet/intel/igb/e1000_defines.h |  23 ++
 drivers/net/ethernet/intel/igb/e1000_regs.h    |   8 +
 drivers/net/ethernet/intel/igb/igb.h           |   6 +
 drivers/net/ethernet/intel/igb/igb_main.c      | 347 +++++++++++++++++++++++++
 4 files changed, 384 insertions(+)

diff --git a/drivers/net/ethernet/intel/igb/e1000_defines.h b/drivers/net/ethernet/intel/igb/e1000_defines.h
index 1de82f247312..83cabff1e0ab 100644
--- a/drivers/net/ethernet/intel/igb/e1000_defines.h
+++ b/drivers/net/ethernet/intel/igb/e1000_defines.h
@@ -353,7 +353,18 @@
 #define E1000_RXPBS_CFG_TS_EN           0x80000000
 
 #define I210_RXPBSIZE_DEFAULT		0x000000A2 /* RXPBSIZE default */
+#define I210_RXPBSIZE_MASK		0x0000003F
+#define I210_RXPBSIZE_PB_32KB		0x00000020
 #define I210_TXPBSIZE_DEFAULT		0x04000014 /* TXPBSIZE default */
+#define I210_TXPBSIZE_MASK		0xC0FFFFFF
+#define I210_TXPBSIZE_PB0_8KB		(8 << 0)
+#define I210_TXPBSIZE_PB1_8KB		(8 << 6)
+#define I210_TXPBSIZE_PB2_4KB		(4 << 12)
+#define I210_TXPBSIZE_PB3_4KB		(4 << 18)
+
+#define I210_DTXMXPKTSZ_DEFAULT		0x00000098
+
+#define I210_SR_QUEUES_NUM		2
 
 /* SerDes Control */
 #define E1000_SCTL_DISABLE_SERDES_LOOPBACK 0x0400
@@ -1051,4 +1062,16 @@
 #define E1000_VLAPQF_P_VALID(_n)	(0x1 << (3 + (_n) * 4))
 #define E1000_VLAPQF_QUEUE_MASK	0x03
 
+/* TX Qav Control fields */
+#define E1000_TQAVCTRL_XMIT_MODE	BIT(0)
+#define E1000_TQAVCTRL_DATAFETCHARB	BIT(4)
+#define E1000_TQAVCTRL_DATATRANARB	BIT(8)
+
+/* TX Qav Credit Control fields */
+#define E1000_TQAVCC_IDLESLOPE_MASK	0xFFFF
+#define E1000_TQAVCC_QUEUEMODE		BIT(31)
+
+/* Transmit Descriptor Control fields */
+#define E1000_TXDCTL_PRIORITY		BIT(27)
+
 #endif
diff --git a/drivers/net/ethernet/intel/igb/e1000_regs.h b/drivers/net/ethernet/intel/igb/e1000_regs.h
index 58adbf234e07..8eee081d395f 100644
--- a/drivers/net/ethernet/intel/igb/e1000_regs.h
+++ b/drivers/net/ethernet/intel/igb/e1000_regs.h
@@ -421,6 +421,14 @@ do { \
 
 #define E1000_I210_FLA		0x1201C
 
+#define E1000_I210_DTXMXPKTSZ	0x355C
+
+#define E1000_I210_TXDCTL(_n)	(0x0E028 + ((_n) * 0x40))
+
+#define E1000_I210_TQAVCTRL	0x3570
+#define E1000_I210_TQAVCC(_n)	(0x3004 + ((_n) * 0x40))
+#define E1000_I210_TQAVHC(_n)	(0x300C + ((_n) * 0x40))
+
 #define E1000_INVM_DATA_REG(_n)	(0x12120 + 4*(_n))
 #define E1000_INVM_SIZE		64 /* Number of INVM Data Registers */
 
diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index 06ffb2bc713e..92845692087a 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -281,6 +281,11 @@ struct igb_ring {
 	u16 count;			/* number of desc. in the ring */
 	u8 queue_index;			/* logical index of the ring*/
 	u8 reg_idx;			/* physical index of the ring */
+	bool cbs_enable;		/* indicates if CBS is enabled */
+	s32 idleslope;			/* idleSlope in kbps */
+	s32 sendslope;			/* sendSlope in kbps */
+	s32 hicredit;			/* hiCredit in bytes */
+	s32 locredit;			/* loCredit in bytes */
 
 	/* everything past this point are written often */
 	u16 next_to_clean;
@@ -621,6 +626,7 @@ struct igb_adapter {
 #define IGB_FLAG_EEE			BIT(14)
 #define IGB_FLAG_VLAN_PROMISC		BIT(15)
 #define IGB_FLAG_RX_LEGACY		BIT(16)
+#define IGB_FLAG_FQTSS			BIT(17)
 
 /* Media Auto Sense */
 #define IGB_MAS_ENABLE_0		0X0001
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index fd4a46b03cc8..03b8d0f4acfd 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -34,6 +34,7 @@
 #include <linux/slab.h>
 #include <net/checksum.h>
 #include <net/ip6_checksum.h>
+#include <net/pkt_sched.h>
 #include <linux/net_tstamp.h>
 #include <linux/mii.h>
 #include <linux/ethtool.h>
@@ -62,6 +63,17 @@
 #define BUILD 0
 #define DRV_VERSION __stringify(MAJ) "." __stringify(MIN) "." \
 __stringify(BUILD) "-k"
+
+enum queue_mode {
+	QUEUE_MODE_STRICT_PRIORITY,	/* E1000_TQAVCC_QUEUEMODE cleared */
+	QUEUE_MODE_STREAM_RESERVATION,	/* E1000_TQAVCC_QUEUEMODE set */
+};
+
+enum tx_queue_prio {
+	TX_QUEUE_PRIO_HIGH,	/* E1000_TXDCTL_PRIORITY set */
+	TX_QUEUE_PRIO_LOW,	/* E1000_TXDCTL_PRIORITY cleared */
+};
+
 char igb_driver_name[] = "igb";
 char igb_driver_version[] = DRV_VERSION;
 static const char igb_driver_string[] =
@@ -1271,6 +1283,12 @@ static int igb_alloc_q_vector(struct igb_adapter *adapter,
 		ring->count = adapter->tx_ring_count;
 		ring->queue_index = txr_idx;
 
+		ring->cbs_enable = false;
+		ring->idleslope = 0;
+		ring->sendslope = 0;
+		ring->hicredit = 0;
+		ring->locredit = 0;
+
 		u64_stats_init(&ring->tx_syncp);
 		u64_stats_init(&ring->tx_syncp2);
 
@@ -1598,6 +1616,284 @@ static void igb_get_hw_control(struct igb_adapter *adapter)
 			ctrl_ext | E1000_CTRL_EXT_DRV_LOAD);
 }
 
+static void enable_fqtss(struct igb_adapter *adapter, bool enable)
+{
+	WARN_ON(adapter->hw.mac.type != e1000_i210);
+
+	/* Track the requested FQTSS state in the adapter flags. */
+	if (!enable)
+		adapter->flags &= ~IGB_FLAG_FQTSS;
+	else
+		adapter->flags |= IGB_FLAG_FQTSS;
+
+	/* The reset task is scheduled only while the netdev is running. */
+	if (!netif_running(adapter->netdev))
+		return;
+	schedule_work(&adapter->reset_task);
+}
+
+static bool is_fqtss_enabled(struct igb_adapter *adapter)
+{
+	return !!(adapter->flags & IGB_FLAG_FQTSS);
+}
+
+static void set_tx_desc_fetch_prio(struct e1000_hw *hw, int queue,
+				   enum tx_queue_prio prio)
+{
+	u32 val;
+
+	WARN_ON(hw->mac.type != e1000_i210);
+	/* Valid Tx queue indexes are 0 to 3. */
+	WARN_ON(queue < 0 || queue > 3);
+
+	val = rd32(E1000_I210_TXDCTL(queue));
+	if (prio == TX_QUEUE_PRIO_HIGH)
+		val |= E1000_TXDCTL_PRIORITY;
+	else
+		val &= ~E1000_TXDCTL_PRIORITY;
+
+	wr32(E1000_I210_TXDCTL(queue), val);
+}
+
+static void set_queue_mode(struct e1000_hw *hw, int queue, enum queue_mode mode)
+{
+	u32 tqavcc;
+
+	WARN_ON(hw->mac.type != e1000_i210);
+	WARN_ON(queue < 0 || queue > 1);
+
+	/* Read-modify-write: only the queue mode bit of TQAVCC changes. */
+	tqavcc = rd32(E1000_I210_TQAVCC(queue));
+
+	tqavcc &= ~E1000_TQAVCC_QUEUEMODE;
+	if (mode == QUEUE_MODE_STREAM_RESERVATION)
+		tqavcc |= E1000_TQAVCC_QUEUEMODE;
+
+	wr32(E1000_I210_TQAVCC(queue), tqavcc);
+}
+
+/**
+ *  igb_configure_cbs - Configure Credit-Based Shaper (CBS)
+ *  @adapter: pointer to adapter struct
+ *  @queue: queue number (0 or 1)
+ *  @enable: true = enable CBS, false = disable CBS
+ *  @idleslope: idleSlope in kbps
+ *  @sendslope: sendSlope in kbps
+ *  @hicredit: hiCredit in bytes
+ *  @locredit: loCredit in bytes
+ *
+ *  Configure CBS for a given hardware queue. When disabling, idleslope,
+ *  sendslope, hicredit, locredit arguments are not programmed, only
+ *  logged. The function returns no value.
+ **/
+static void igb_configure_cbs(struct igb_adapter *adapter, int queue,
+			      bool enable, int idleslope, int sendslope,
+			      int hicredit, int locredit)
+{
+	struct net_device *netdev = adapter->netdev;
+	struct e1000_hw *hw = &adapter->hw;
+	u32 tqavcc;
+	u16 value;
+
+	WARN_ON(hw->mac.type != e1000_i210);
+	WARN_ON(queue < 0 || queue > 1);
+
+	if (enable) {
+		set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_HIGH);
+		set_queue_mode(hw, queue, QUEUE_MODE_STREAM_RESERVATION);
+
+		/* According to i210 datasheet section 7.2.7.7, we should set
+		 * the 'idleSlope' field from TQAVCC register following the
+		 * equation:
+		 *
+		 * For 100 Mbps link speed:
+		 *
+		 *     value = BW * 0x7735 * 0.2                          (E1)
+		 *
+		 * For 1000Mbps link speed:
+		 *
+		 *     value = BW * 0x7735 * 2                            (E2)
+		 *
+		 * E1 and E2 can be merged into one equation as shown below.
+		 * Note that 'link-speed' is in Mbps.
+		 *
+		 *     value = BW * 0x7735 * 2 * link-speed
+		 *                           --------------               (E3)
+		 *                                1000
+		 *
+		 * 'BW' is the percentage bandwidth out of full link speed
+		 * which can be found with the following equation. Note that
+		 * idleSlope here is the parameter from this function which
+		 * is in kbps.
+		 *
+		 *     BW =     idleSlope
+		 *          -----------------                             (E4)
+		 *          link-speed * 1000
+		 *
+		 * That said, we can come up with a generic equation to
+		 * calculate the value we should set in TQAVCC register by
+		 * replacing 'BW' in E3 by E4. The resulting equation is:
+		 *
+		 * value =     idleSlope     * 0x7735 * 2 * link-speed
+		 *         -----------------            --------------    (E5)
+		 *         link-speed * 1000                 1000
+		 *
+		 * 'link-speed' is present in both sides of the fraction so
+		 * it is canceled out. The final equation is the following:
+		 *
+		 *     value = idleSlope * 61034
+		 *             -----------------                          (E6)
+		 *                  1000000
+		 */
+		value = DIV_ROUND_UP_ULL(idleslope * 61034ULL, 1000000);
+
+		tqavcc = rd32(E1000_I210_TQAVCC(queue));
+		tqavcc &= ~E1000_TQAVCC_IDLESLOPE_MASK;
+		tqavcc |= value;
+		wr32(E1000_I210_TQAVCC(queue), tqavcc);
+
+		wr32(E1000_I210_TQAVHC(queue), 0x80000000 + hicredit * 0x7735);
+	} else {
+		set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_LOW);
+		set_queue_mode(hw, queue, QUEUE_MODE_STRICT_PRIORITY);
+
+		/* Set idleSlope to zero. */
+		tqavcc = rd32(E1000_I210_TQAVCC(queue));
+		tqavcc &= ~E1000_TQAVCC_IDLESLOPE_MASK;
+		wr32(E1000_I210_TQAVCC(queue), tqavcc);
+
+		/* Set hiCredit to zero. */
+		wr32(E1000_I210_TQAVHC(queue), 0);
+	}
+
+	/* XXX: In i210 controller the sendSlope and loCredit parameters from
+	 * CBS are not configurable by software so we don't do any 'controller
+	 * configuration' in respect to these parameters.
+	 */
+
+	netdev_dbg(netdev, "CBS %s: queue %d idleslope %d sendslope %d hiCredit %d locredit %d\n",
+		   (enable) ? "enabled" : "disabled", queue,
+		   idleslope, sendslope, hicredit, locredit);
+}
+
+static int igb_save_cbs_params(struct igb_adapter *adapter, int queue,
+			       bool enable, int idleslope, int sendslope,
+			       int hicredit, int locredit)
+{
+	struct igb_ring *ring;
+
+	/* Valid indexes into tx_ring[] are 0 .. num_tx_queues - 1. */
+	if (queue < 0 || queue >= adapter->num_tx_queues)
+		return -EINVAL;
+
+	ring = adapter->tx_ring[queue];
+	ring->cbs_enable = enable;
+	ring->idleslope = idleslope;
+	ring->sendslope = sendslope;
+	ring->hicredit = hicredit;
+	ring->locredit = locredit;
+
+	return 0;
+}
+
+static bool is_any_cbs_enabled(struct igb_adapter *adapter)
+{
+	bool found = false;
+	int q;
+
+	/* Stop scanning at the first Tx ring that has CBS enabled. */
+	for (q = 0; q < adapter->num_tx_queues && !found; q++) {
+		const struct igb_ring *ring = adapter->tx_ring[q];
+
+		found = ring->cbs_enable;
+	}
+
+	return found;
+}
+
+static void igb_setup_tx_mode(struct igb_adapter *adapter)
+{
+	struct net_device *netdev = adapter->netdev;
+	struct e1000_hw *hw = &adapter->hw;
+	u32 val;
+
+	/* Only i210 controller supports changing the transmission mode. */
+	if (hw->mac.type != e1000_i210)
+		return;
+
+	if (is_fqtss_enabled(adapter)) {
+		int i, max_queue;
+
+		/* Configure TQAVCTRL register: set transmit mode to 'Qav',
+		 * set data fetch arbitration to 'round robin' and set data
+		 * transfer arbitration to 'credit shaper algorithm'.
+		 */
+		val = rd32(E1000_I210_TQAVCTRL);
+		val |= E1000_TQAVCTRL_XMIT_MODE | E1000_TQAVCTRL_DATATRANARB;
+		val &= ~E1000_TQAVCTRL_DATAFETCHARB;
+		wr32(E1000_I210_TQAVCTRL, val);
+
+		/* Configure Tx and Rx packet buffer sizes as described in
+		 * i210 datasheet section 7.2.7.7.
+		 */
+		val = rd32(E1000_TXPBS);
+		val &= ~I210_TXPBSIZE_MASK;
+		val |= I210_TXPBSIZE_PB0_8KB | I210_TXPBSIZE_PB1_8KB |
+			I210_TXPBSIZE_PB2_4KB | I210_TXPBSIZE_PB3_4KB;
+		wr32(E1000_TXPBS, val);
+
+		val = rd32(E1000_RXPBS);
+		val &= ~I210_RXPBSIZE_MASK;
+		val |= I210_RXPBSIZE_PB_32KB;
+		wr32(E1000_RXPBS, val);
+
+		/* Section 8.12.9 states that MAX_TPKT_SIZE from DTXMXPKTSZ
+		 * register should not exceed the buffer size programmed in
+		 * TXPBS. The smallest buffer size programmed in TXPBS is 4kB
+		 * so according to the datasheet we should set MAX_TPKT_SIZE to
+		 * 4kB / 64.
+		 *
+		 * However, when we do so, no frames from queues 2 and 3 are
+		 * transmitted. It seems the MAX_TPKT_SIZE should not be greater
+		 * than or _equal_ to the buffer size programmed in TXPBS. For
+		 * this reason, we set MAX_TPKT_SIZE to (4kB - 1) / 64.
+		 */
+		val = (4096 - 1) / 64;
+		wr32(E1000_I210_DTXMXPKTSZ, val);
+
+		/* Since FQTSS mode is enabled, apply any CBS configuration
+		 * previously set. If no previous CBS configuration has been
+		 * done, then the initial configuration is applied, which means
+		 * CBS is disabled.
+		 */
+		max_queue = (adapter->num_tx_queues < I210_SR_QUEUES_NUM) ?
+			    adapter->num_tx_queues : I210_SR_QUEUES_NUM;
+
+		for (i = 0; i < max_queue; i++) {
+			struct igb_ring *ring = adapter->tx_ring[i];
+
+			igb_configure_cbs(adapter, i, ring->cbs_enable,
+					  ring->idleslope, ring->sendslope,
+					  ring->hicredit, ring->locredit);
+		}
+	} else {
+		wr32(E1000_RXPBS, I210_RXPBSIZE_DEFAULT);
+		wr32(E1000_TXPBS, I210_TXPBSIZE_DEFAULT);
+		wr32(E1000_I210_DTXMXPKTSZ, I210_DTXMXPKTSZ_DEFAULT);
+
+		val = rd32(E1000_I210_TQAVCTRL);
+		/* According to Section 8.12.21, the other flags we've set when
+		 * enabling FQTSS are not relevant when disabling FQTSS so we
+		 * don't set them here.
+		 */
+		val &= ~E1000_TQAVCTRL_XMIT_MODE;
+		wr32(E1000_I210_TQAVCTRL, val);
+	}
+
+	netdev_dbg(netdev, "FQTSS %s\n", (is_fqtss_enabled(adapter)) ?
+		   "enabled" : "disabled");
+}
+
 /**
  *  igb_configure - configure the hardware for RX and TX
  *  @adapter: private board structure
@@ -1609,6 +1905,7 @@ static void igb_configure(struct igb_adapter *adapter)
 
 	igb_get_hw_control(adapter);
 	igb_set_rx_mode(netdev);
+	igb_setup_tx_mode(adapter);
 
 	igb_restore_vlan(adapter);
 
@@ -2150,6 +2447,55 @@ igb_features_check(struct sk_buff *skb, struct net_device *dev,
 	return features;
 }
 
+static int igb_offload_cbs(struct igb_adapter *adapter,
+			   struct tc_cbs_qopt_offload *qopt)
+{
+	int ret;
+
+	/* Only the i210 controller can offload CBS. */
+	if (adapter->hw.mac.type != e1000_i210)
+		return -EOPNOTSUPP;
+
+	/* Only queue 0 and queue 1 can offload CBS. */
+	if (qopt->queue < 0 || qopt->queue > 1)
+		return -EINVAL;
+
+	ret = igb_save_cbs_params(adapter, qopt->queue, qopt->enable,
+				  qopt->idleslope, qopt->sendslope,
+				  qopt->hicredit, qopt->locredit);
+	if (ret)
+		return ret;
+
+	if (!is_fqtss_enabled(adapter)) {
+		enable_fqtss(adapter, true);
+		return 0;
+	}
+
+	igb_configure_cbs(adapter, qopt->queue, qopt->enable,
+			  qopt->idleslope, qopt->sendslope,
+			  qopt->hicredit, qopt->locredit);
+
+	/* No Tx ring has CBS enabled any more, so turn FQTSS off. */
+	if (!is_any_cbs_enabled(adapter))
+		enable_fqtss(adapter, false);
+
+	return 0;
+}
+
+static int igb_setup_tc(struct net_device *dev, enum tc_setup_type type,
+			void *type_data)
+{
+	struct igb_adapter *adapter;
+
+	/* TC_SETUP_CBS is the only offload type handled here. */
+	if (type != TC_SETUP_CBS)
+		return -EOPNOTSUPP;
+
+	adapter = netdev_priv(dev);
+
+	return igb_offload_cbs(adapter, type_data);
+}
+
 static const struct net_device_ops igb_netdev_ops = {
 	.ndo_open		= igb_open,
 	.ndo_stop		= igb_close,
@@ -2175,6 +2521,7 @@ static const struct net_device_ops igb_netdev_ops = {
 	.ndo_set_features	= igb_set_features,
 	.ndo_fdb_add		= igb_ndo_fdb_add,
 	.ndo_features_check	= igb_features_check,
+	.ndo_setup_tc		= igb_setup_tc,
 };
 
 /**
-- 
2.14.2

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox