Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next v15 5/7] sch_cake: Add DiffServ handling
From: Toke Høiland-Jørgensen @ 2018-05-22 13:57 UTC (permalink / raw)
  To: netdev, cake
In-Reply-To: <152699741881.21931.11656377745581563912.stgit@alrua-kau>

This adds support for DiffServ-based priority queueing to CAKE. If the
shaper is in use, each priority tier gets its own virtual clock, which
limits that tier's rate to a fraction of the overall shaped rate, to
discourage trying to game the priority mechanism.

CAKE defaults to a simple, three-tier mode that interprets most code points
as "best effort", but places CS1 traffic into a low-priority "bulk" tier
which is assigned 1/16 of the total rate, and a few code points indicating
latency-sensitive or control traffic (specifically TOS4, VA, EF, CS6, CS7)
into a "latency sensitive" high-priority tier, which is assigned 1/4 rate.
The other supported DiffServ modes are a 4-tier mode matching the 802.11e
precedence rules, as well as two 8-tier modes, one of which implements
strict precedence of the eight priority levels.

This commit also adds an optional DiffServ 'wash' mode, which will zero out
the DSCP fields of any packet passing through CAKE. While this can
technically be done with other mechanisms in the kernel, having the feature
available in CAKE significantly decreases configuration complexity; and the
implementation cost is low on top of the other DiffServ-handling code.

Filters and applications can set the skb->priority field to override the
DSCP-based classification into tiers. If TC_H_MAJ(skb->priority) matches
CAKE's qdisc handle, the minor number will be interpreted as a priority
tier if it is less than or equal to the number of configured priority
tiers.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
 net/sched/sch_cake.c |  412 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 404 insertions(+), 8 deletions(-)

diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 6f7cae705c84..6384765e97b0 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -298,6 +298,68 @@ static void cobalt_set_enqueue_time(struct sk_buff *skb,
 
 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
 
+/* Diffserv lookup tables */
+
+static const u8 precedence[] = {
+	0, 0, 0, 0, 0, 0, 0, 0,
+	1, 1, 1, 1, 1, 1, 1, 1,
+	2, 2, 2, 2, 2, 2, 2, 2,
+	3, 3, 3, 3, 3, 3, 3, 3,
+	4, 4, 4, 4, 4, 4, 4, 4,
+	5, 5, 5, 5, 5, 5, 5, 5,
+	6, 6, 6, 6, 6, 6, 6, 6,
+	7, 7, 7, 7, 7, 7, 7, 7,
+};
+
+static const u8 diffserv8[] = {
+	2, 5, 1, 2, 4, 2, 2, 2,
+	0, 2, 1, 2, 1, 2, 1, 2,
+	5, 2, 4, 2, 4, 2, 4, 2,
+	3, 2, 3, 2, 3, 2, 3, 2,
+	6, 2, 3, 2, 3, 2, 3, 2,
+	6, 2, 2, 2, 6, 2, 6, 2,
+	7, 2, 2, 2, 2, 2, 2, 2,
+	7, 2, 2, 2, 2, 2, 2, 2,
+};
+
+static const u8 diffserv4[] = {
+	0, 2, 0, 0, 2, 0, 0, 0,
+	1, 0, 0, 0, 0, 0, 0, 0,
+	2, 0, 2, 0, 2, 0, 2, 0,
+	2, 0, 2, 0, 2, 0, 2, 0,
+	3, 0, 2, 0, 2, 0, 2, 0,
+	3, 0, 0, 0, 3, 0, 3, 0,
+	3, 0, 0, 0, 0, 0, 0, 0,
+	3, 0, 0, 0, 0, 0, 0, 0,
+};
+
+static const u8 diffserv3[] = {
+	0, 0, 0, 0, 2, 0, 0, 0,
+	1, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 2, 0, 2, 0,
+	2, 0, 0, 0, 0, 0, 0, 0,
+	2, 0, 0, 0, 0, 0, 0, 0,
+};
+
+static const u8 besteffort[] = {
+	0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+/* tin priority order for stats dumping */
+
+static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
+static const u8 bulk_order[] = {1, 0, 2, 3};
+
 #define REC_INV_SQRT_CACHE (16)
 static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
 
@@ -1415,6 +1477,46 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
 	return idx + (tin << 16);
 }
 
+static void cake_wash_diffserv(struct sk_buff *skb)
+{
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
+		break;
+	case htons(ETH_P_IPV6):
+		ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
+		break;
+	default:
+		break;
+	}
+}
+
+static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash)
+{
+	u8 dscp;
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
+		if (wash && dscp)
+			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
+		return dscp;
+
+	case htons(ETH_P_IPV6):
+		dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
+		if (wash && dscp)
+			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
+		return dscp;
+
+	case htons(ETH_P_ARP):
+		return 0x38;  /* CS7 - Net Control */
+
+	default:
+		/* If there is no Diffserv field, treat as best-effort */
+		return 0;
+	}
+}
+
 static void cake_reconfigure(struct Qdisc *sch);
 
 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
@@ -1429,7 +1531,26 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	struct cake_flow *flow;
 	u32 idx, tin;
 
-	tin = 0;
+	if (TC_H_MAJ(skb->priority) == sch->handle &&
+	    TC_H_MIN(skb->priority) > 0 &&
+	    TC_H_MIN(skb->priority) <= q->tin_cnt) {
+		tin = TC_H_MIN(skb->priority) - 1;
+
+		if (q->rate_flags & CAKE_FLAG_WASH)
+			cake_wash_diffserv(skb);
+	} else if (q->tin_mode != CAKE_DIFFSERV_BESTEFFORT) {
+		/* extract the Diffserv Precedence field, if it exists */
+		/* and clear DSCP bits if washing */
+		tin = q->tin_index[cake_handle_diffserv(skb,
+				q->rate_flags & CAKE_FLAG_WASH)];
+		if (unlikely(tin >= q->tin_cnt))
+			tin = 0;
+	} else {
+		tin = 0;
+		if (q->rate_flags & CAKE_FLAG_WASH)
+			cake_wash_diffserv(skb);
+	}
+
 	b = &q->tins[tin];
 
 	/* choose flow to insert into */
@@ -1930,18 +2051,275 @@ static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
 	b->cparams.p_dec = 1 << 20; /* 1/4096 */
 }
 
-static void cake_reconfigure(struct Qdisc *sch)
+static int cake_config_besteffort(struct Qdisc *sch)
 {
 	struct cake_sched_data *q = qdisc_priv(sch);
 	struct cake_tin_data *b = &q->tins[0];
-	int c, ft = 0;
+	u32 mtu = psched_mtu(qdisc_dev(sch));
+	u64 rate = q->rate_bps;
 
 	q->tin_cnt = 1;
-	cake_set_rate(b, q->rate_bps, psched_mtu(qdisc_dev(sch)),
+
+	q->tin_index = besteffort;
+	q->tin_order = normal_order;
+
+	cake_set_rate(b, rate, mtu,
 		      us_to_ns(q->target), us_to_ns(q->interval));
 	b->tin_quantum_band = 65535;
 	b->tin_quantum_prio = 65535;
 
+	return 0;
+}
+
+static int cake_config_precedence(struct Qdisc *sch)
+{
+	/* convert high-level (user visible) parameters into internal format */
+	struct cake_sched_data *q = qdisc_priv(sch);
+	u32 mtu = psched_mtu(qdisc_dev(sch));
+	u64 rate = q->rate_bps;
+	u32 quantum1 = 256;
+	u32 quantum2 = 256;
+	u32 i;
+
+	q->tin_cnt = 8;
+	q->tin_index = precedence;
+	q->tin_order = normal_order;
+
+	for (i = 0; i < q->tin_cnt; i++) {
+		struct cake_tin_data *b = &q->tins[i];
+
+		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
+			      us_to_ns(q->interval));
+
+		b->tin_quantum_prio = max_t(u16, 1U, quantum1);
+		b->tin_quantum_band = max_t(u16, 1U, quantum2);
+
+		/* calculate next class's parameters */
+		rate  *= 7;
+		rate >>= 3;
+
+		quantum1  *= 3;
+		quantum1 >>= 1;
+
+		quantum2  *= 7;
+		quantum2 >>= 3;
+	}
+
+	return 0;
+}
+
+/*	List of known Diffserv codepoints:
+ *
+ *	Least Effort (CS1)
+ *	Best Effort (CS0)
+ *	Max Reliability & LLT "Lo" (TOS1)
+ *	Max Throughput (TOS2)
+ *	Min Delay (TOS4)
+ *	LLT "La" (TOS5)
+ *	Assured Forwarding 1 (AF1x) - x3
+ *	Assured Forwarding 2 (AF2x) - x3
+ *	Assured Forwarding 3 (AF3x) - x3
+ *	Assured Forwarding 4 (AF4x) - x3
+ *	Precedence Class 2 (CS2)
+ *	Precedence Class 3 (CS3)
+ *	Precedence Class 4 (CS4)
+ *	Precedence Class 5 (CS5)
+ *	Precedence Class 6 (CS6)
+ *	Precedence Class 7 (CS7)
+ *	Voice Admit (VA)
+ *	Expedited Forwarding (EF)
+
+ *	Total 25 codepoints.
+ */
+
+/*	List of traffic classes in RFC 4594:
+ *		(roughly descending order of contended priority)
+ *		(roughly ascending order of uncontended throughput)
+ *
+ *	Network Control (CS6,CS7)      - routing traffic
+ *	Telephony (EF,VA)         - aka. VoIP streams
+ *	Signalling (CS5)               - VoIP setup
+ *	Multimedia Conferencing (AF4x) - aka. video calls
+ *	Realtime Interactive (CS4)     - eg. games
+ *	Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
+ *	Broadcast Video (CS3)
+ *	Low Latency Data (AF2x,TOS4)      - eg. database
+ *	Ops, Admin, Management (CS2,TOS1) - eg. ssh
+ *	Standard Service (CS0 & unrecognised codepoints)
+ *	High Throughput Data (AF1x,TOS2)  - eg. web traffic
+ *	Low Priority Data (CS1)           - eg. BitTorrent
+
+ *	Total 12 traffic classes.
+ */
+
+static int cake_config_diffserv8(struct Qdisc *sch)
+{
+/*	Pruned list of traffic classes for typical applications:
+ *
+ *		Network Control          (CS6, CS7)
+ *		Minimum Latency          (EF, VA, CS5, CS4)
+ *		Interactive Shell        (CS2, TOS1)
+ *		Low Latency Transactions (AF2x, TOS4)
+ *		Video Streaming          (AF4x, AF3x, CS3)
+ *		Bog Standard             (CS0 etc.)
+ *		High Throughput          (AF1x, TOS2)
+ *		Background Traffic       (CS1)
+ *
+ *		Total 8 traffic classes.
+ */
+
+	struct cake_sched_data *q = qdisc_priv(sch);
+	u32 mtu = psched_mtu(qdisc_dev(sch));
+	u64 rate = q->rate_bps;
+	u32 quantum1 = 256;
+	u32 quantum2 = 256;
+	u32 i;
+
+	q->tin_cnt = 8;
+
+	/* codepoint to class mapping */
+	q->tin_index = diffserv8;
+	q->tin_order = normal_order;
+
+	/* class characteristics */
+	for (i = 0; i < q->tin_cnt; i++) {
+		struct cake_tin_data *b = &q->tins[i];
+
+		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
+			      us_to_ns(q->interval));
+
+		b->tin_quantum_prio = max_t(u16, 1U, quantum1);
+		b->tin_quantum_band = max_t(u16, 1U, quantum2);
+
+		/* calculate next class's parameters */
+		rate  *= 7;
+		rate >>= 3;
+
+		quantum1  *= 3;
+		quantum1 >>= 1;
+
+		quantum2  *= 7;
+		quantum2 >>= 3;
+	}
+
+	return 0;
+}
+
+static int cake_config_diffserv4(struct Qdisc *sch)
+{
+/*  Further pruned list of traffic classes for four-class system:
+ *
+ *	    Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
+ *	    Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1)
+ *	    Best Effort        (CS0, AF1x, TOS2, and those not specified)
+ *	    Background Traffic (CS1)
+ *
+ *		Total 4 traffic classes.
+ */
+
+	struct cake_sched_data *q = qdisc_priv(sch);
+	u32 mtu = psched_mtu(qdisc_dev(sch));
+	u64 rate = q->rate_bps;
+	u32 quantum = 1024;
+
+	q->tin_cnt = 4;
+
+	/* codepoint to class mapping */
+	q->tin_index = diffserv4;
+	q->tin_order = bulk_order;
+
+	/* class characteristics */
+	cake_set_rate(&q->tins[0], rate, mtu,
+		      us_to_ns(q->target), us_to_ns(q->interval));
+	cake_set_rate(&q->tins[1], rate >> 4, mtu,
+		      us_to_ns(q->target), us_to_ns(q->interval));
+	cake_set_rate(&q->tins[2], rate >> 1, mtu,
+		      us_to_ns(q->target), us_to_ns(q->interval));
+	cake_set_rate(&q->tins[3], rate >> 2, mtu,
+		      us_to_ns(q->target), us_to_ns(q->interval));
+
+	/* priority weights */
+	q->tins[0].tin_quantum_prio = quantum;
+	q->tins[1].tin_quantum_prio = quantum >> 4;
+	q->tins[2].tin_quantum_prio = quantum << 2;
+	q->tins[3].tin_quantum_prio = quantum << 4;
+
+	/* bandwidth-sharing weights */
+	q->tins[0].tin_quantum_band = quantum;
+	q->tins[1].tin_quantum_band = quantum >> 4;
+	q->tins[2].tin_quantum_band = quantum >> 1;
+	q->tins[3].tin_quantum_band = quantum >> 2;
+
+	return 0;
+}
+
+static int cake_config_diffserv3(struct Qdisc *sch)
+{
+/*  Simplified Diffserv structure with 3 tins.
+ *		Low Priority		(CS1)
+ *		Best Effort
+ *		Latency Sensitive	(TOS4, VA, EF, CS6, CS7)
+ */
+	struct cake_sched_data *q = qdisc_priv(sch);
+	u32 mtu = psched_mtu(qdisc_dev(sch));
+	u64 rate = q->rate_bps;
+	u32 quantum = 1024;
+
+	q->tin_cnt = 3;
+
+	/* codepoint to class mapping */
+	q->tin_index = diffserv3;
+	q->tin_order = bulk_order;
+
+	/* class characteristics */
+	cake_set_rate(&q->tins[0], rate, mtu,
+		      us_to_ns(q->target), us_to_ns(q->interval));
+	cake_set_rate(&q->tins[1], rate >> 4, mtu,
+		      us_to_ns(q->target), us_to_ns(q->interval));
+	cake_set_rate(&q->tins[2], rate >> 2, mtu,
+		      us_to_ns(q->target), us_to_ns(q->interval));
+
+	/* priority weights */
+	q->tins[0].tin_quantum_prio = quantum;
+	q->tins[1].tin_quantum_prio = quantum >> 4;
+	q->tins[2].tin_quantum_prio = quantum << 4;
+
+	/* bandwidth-sharing weights */
+	q->tins[0].tin_quantum_band = quantum;
+	q->tins[1].tin_quantum_band = quantum >> 4;
+	q->tins[2].tin_quantum_band = quantum >> 2;
+
+	return 0;
+}
+
+static void cake_reconfigure(struct Qdisc *sch)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	int c, ft;
+
+	switch (q->tin_mode) {
+	case CAKE_DIFFSERV_BESTEFFORT:
+		ft = cake_config_besteffort(sch);
+		break;
+
+	case CAKE_DIFFSERV_PRECEDENCE:
+		ft = cake_config_precedence(sch);
+		break;
+
+	case CAKE_DIFFSERV_DIFFSERV8:
+		ft = cake_config_diffserv8(sch);
+		break;
+
+	case CAKE_DIFFSERV_DIFFSERV4:
+		ft = cake_config_diffserv4(sch);
+		break;
+
+	case CAKE_DIFFSERV_DIFFSERV3:
+	default:
+		ft = cake_config_diffserv3(sch);
+		break;
+	}
+
 	for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
 		cake_clear_tin(sch, c);
 		q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
@@ -1997,6 +2375,16 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt,
 	if (tb[TCA_CAKE_BASE_RATE64])
 		q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
 
+	if (tb[TCA_CAKE_DIFFSERV_MODE])
+		q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
+
+	if (tb[TCA_CAKE_WASH]) {
+		if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
+			q->rate_flags |= CAKE_FLAG_WASH;
+		else
+			q->rate_flags &= ~CAKE_FLAG_WASH;
+	}
+
 	if (tb[TCA_CAKE_FLOW_MODE])
 		q->flow_mode = (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
 				CAKE_FLOW_MASK);
@@ -2060,7 +2448,7 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt,
 	int i, j, err;
 
 	sch->limit = 10240;
-	q->tin_mode = CAKE_DIFFSERV_BESTEFFORT;
+	q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
 	q->flow_mode  = CAKE_FLOW_TRIPLE;
 
 	q->rate_bps = 0; /* unlimited by default */
@@ -2170,6 +2558,13 @@ static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
 			!!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
 		goto nla_put_failure;
 
+	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_CAKE_WASH,
+			!!(q->rate_flags & CAKE_FLAG_WASH)))
+		goto nla_put_failure;
+
 	return nla_nest_end(skb, opts);
 
 nla_put_failure:
@@ -2223,7 +2618,7 @@ static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 	} while (0)
 
 	for (i = 0; i < q->tin_cnt; i++) {
-		struct cake_tin_data *b = &q->tins[i];
+		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
 
 		ts = nla_nest_start(d->skb, i + 1);
 		if (!ts)
@@ -2322,7 +2717,8 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 	u32 idx = cl - 1;
 
 	if (idx < CAKE_QUEUES * q->tin_cnt) {
-		const struct cake_tin_data *b = &q->tins[idx / CAKE_QUEUES];
+		const struct cake_tin_data *b = \
+			&q->tins[q->tin_order[idx / CAKE_QUEUES]];
 		const struct sk_buff *skb;
 
 		flow = &b->flows[idx % CAKE_QUEUES];
@@ -2394,7 +2790,7 @@ static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 		return;
 
 	for (i = 0; i < q->tin_cnt; i++) {
-		struct cake_tin_data *b = &q->tins[i];
+		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
 
 		for (j = 0; j < CAKE_QUEUES; j++) {
 			if (list_empty(&b->flows[j].flowchain) ||

^ permalink raw reply related

* Re: [PATCH net-next 1/2] tcp: add max_quickacks param to tcp_incr_quickack and tcp_enter_quickack_mode
From: Soheil Hassas Yeganeh @ 2018-05-22 14:03 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, netdev, Van Jacobson, Neal Cardwell,
	Yuchung Cheng, Eric Dumazet
In-Reply-To: <20180521220857.229273-2-edumazet@google.com>

On Mon, May 21, 2018 at 6:08 PM, Eric Dumazet <edumazet@google.com> wrote:
> We want to add finer control of the number of ACK packets sent after
> ECN events.
>
> This patch is not changing current behavior, it only enables following
> change.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Acked-by: Soheil Hassas Yeganeh <soheil@google.com>

> ---
>  net/ipv4/tcp_input.c | 24 +++++++++++++-----------
>  1 file changed, 13 insertions(+), 11 deletions(-)
>
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index aebb29ab2fdf2ceaa182cd11928f145a886149ff..2e970e9f4e09d966b703af2d14d521a4328eba7e 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -203,21 +203,23 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
>         }
>  }
>
> -static void tcp_incr_quickack(struct sock *sk)
> +static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
>  {
>         struct inet_connection_sock *icsk = inet_csk(sk);
>         unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
>
>         if (quickacks == 0)
>                 quickacks = 2;
> +       quickacks = min(quickacks, max_quickacks);
>         if (quickacks > icsk->icsk_ack.quick)
> -               icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
> +               icsk->icsk_ack.quick = quickacks;
>  }
>
> -static void tcp_enter_quickack_mode(struct sock *sk)
> +static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
>  {
>         struct inet_connection_sock *icsk = inet_csk(sk);
> -       tcp_incr_quickack(sk);
> +
> +       tcp_incr_quickack(sk, max_quickacks);
>         icsk->icsk_ack.pingpong = 0;
>         icsk->icsk_ack.ato = TCP_ATO_MIN;
>  }
> @@ -261,7 +263,7 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
>                  * it is probably a retransmit.
>                  */
>                 if (tp->ecn_flags & TCP_ECN_SEEN)
> -                       tcp_enter_quickack_mode((struct sock *)tp);
> +                       tcp_enter_quickack_mode((struct sock *)tp, TCP_MAX_QUICKACKS);
>                 break;
>         case INET_ECN_CE:
>                 if (tcp_ca_needs_ecn((struct sock *)tp))
> @@ -269,7 +271,7 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
>
>                 if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
>                         /* Better not delay acks, sender can have a very low cwnd */
> -                       tcp_enter_quickack_mode((struct sock *)tp);
> +                       tcp_enter_quickack_mode((struct sock *)tp, TCP_MAX_QUICKACKS);
>                         tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
>                 }
>                 tp->ecn_flags |= TCP_ECN_SEEN;
> @@ -686,7 +688,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
>                 /* The _first_ data packet received, initialize
>                  * delayed ACK engine.
>                  */
> -               tcp_incr_quickack(sk);
> +               tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
>                 icsk->icsk_ack.ato = TCP_ATO_MIN;
>         } else {
>                 int m = now - icsk->icsk_ack.lrcvtime;
> @@ -702,7 +704,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
>                         /* Too long gap. Apparently sender failed to
>                          * restart window, so that we send ACKs quickly.
>                          */
> -                       tcp_incr_quickack(sk);
> +                       tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
>                         sk_mem_reclaim(sk);
>                 }
>         }
> @@ -4179,7 +4181,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
>         if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
>             before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
>                 NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
> -               tcp_enter_quickack_mode(sk);
> +               tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
>
>                 if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
>                         u32 end_seq = TCP_SKB_CB(skb)->end_seq;
> @@ -4706,7 +4708,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
>                 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
>
>  out_of_window:
> -               tcp_enter_quickack_mode(sk);
> +               tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
>                 inet_csk_schedule_ack(sk);
>  drop:
>                 tcp_drop(sk, skb);
> @@ -5790,7 +5792,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
>                          * to stand against the temptation 8)     --ANK
>                          */
>                         inet_csk_schedule_ack(sk);
> -                       tcp_enter_quickack_mode(sk);
> +                       tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
>                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
>                                                   TCP_DELACK_MAX, TCP_RTO_MAX);
>
> --
> 2.17.0.441.gb46fe60e1d-goog
>

^ permalink raw reply

* Re: [PATCH net-next 2/2] tcp: do not aggressively quick ack after ECN events
From: Soheil Hassas Yeganeh @ 2018-05-22 14:04 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, netdev, Van Jacobson, Neal Cardwell,
	Yuchung Cheng, Eric Dumazet
In-Reply-To: <20180521220857.229273-3-edumazet@google.com>

On Mon, May 21, 2018 at 6:08 PM, Eric Dumazet <edumazet@google.com> wrote:
> ECN signals currently force TCP to enter quickack mode for
> up to 16 (TCP_MAX_QUICKACKS) following incoming packets.
>
> We believe this is not needed, and only sending one immediate ack
> for the current packet should be enough.
>
> This should reduce the extra load noticed in DCTCP environments,
> after congestion events.
>
> This is part 2 of our effort to reduce pure ACK packets.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Acked-by: Soheil Hassas Yeganeh <soheil@google.com>

Thanks for the patch!

^ permalink raw reply

* Re: [PATCH net-next v15 4/7] sch_cake: Add NAT awareness to packet classifier
From: Pablo Neira Ayuso @ 2018-05-22 14:07 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: netdev, cake, netfilter-devel
In-Reply-To: <152699745846.21931.4558451708304709296.stgit@alrua-kau>

Hi Toke,

On Tue, May 22, 2018 at 03:57:38PM +0200, Toke Høiland-Jørgensen wrote:
> When CAKE is deployed on a gateway that also performs NAT (which is a
> common deployment mode), the host fairness mechanism cannot distinguish
> internal hosts from each other, and so fails to work correctly.
> 
> To fix this, we add an optional NAT awareness mode, which will query the
> kernel conntrack mechanism to obtain the pre-NAT addresses for each packet
> and use that in the flow and host hashing.
> 
> When the shaper is enabled and the host is already performing NAT, the cost
> of this lookup is negligible. However, in unlimited mode with no NAT being
> performed, there is a significant CPU cost at higher bandwidths. For this
> reason, the feature is turned off by default.
> 
> Cc: netfilter-devel@vger.kernel.org
> Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
> ---
>  net/sched/sch_cake.c |   79 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 79 insertions(+)
> 
> diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
> index 68ac908470f1..6f7cae705c84 100644
> --- a/net/sched/sch_cake.c
> +++ b/net/sched/sch_cake.c
> @@ -71,6 +71,12 @@
>  #include <net/tcp.h>
>  #include <net/flow_dissector.h>
>  
> +#if IS_REACHABLE(CONFIG_NF_CONNTRACK)
> +#include <net/netfilter/nf_conntrack_core.h>
> +#include <net/netfilter/nf_conntrack_zones.h>
> +#include <net/netfilter/nf_conntrack.h>
> +#endif
> +
>  #define CAKE_SET_WAYS (8)
>  #define CAKE_MAX_TINS (8)
>  #define CAKE_QUEUES (1024)
> @@ -516,6 +522,60 @@ static bool cobalt_should_drop(struct cobalt_vars *vars,
>  	return drop;
>  }
>  
> +#if IS_REACHABLE(CONFIG_NF_CONNTRACK)
> +
> +static void cake_update_flowkeys(struct flow_keys *keys,
> +				 const struct sk_buff *skb)
> +{
> +	const struct nf_conntrack_tuple *tuple;
> +	enum ip_conntrack_info ctinfo;
> +	struct nf_conn *ct;
> +	bool rev = false;
> +
> +	if (tc_skb_protocol(skb) != htons(ETH_P_IP))
> +		return;
> +
> +	ct = nf_ct_get(skb, &ctinfo);
> +	if (ct) {
> +		tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo));
> +	} else {
> +		const struct nf_conntrack_tuple_hash *hash;
> +		struct nf_conntrack_tuple srctuple;
> +
> +		if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
> +				       NFPROTO_IPV4, dev_net(skb->dev),
> +				       &srctuple))
> +			return;
> +
> +		hash = nf_conntrack_find_get(dev_net(skb->dev),
> +					     &nf_ct_zone_dflt,
> +					     &srctuple);
> +		if (!hash)
> +			return;
> +
> +		rev = true;
> +		ct = nf_ct_tuplehash_to_ctrack(hash);
> +		tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir);
> +	}
> +
> +	keys->addrs.v4addrs.src = rev ? tuple->dst.u3.ip : tuple->src.u3.ip;
> +	keys->addrs.v4addrs.dst = rev ? tuple->src.u3.ip : tuple->dst.u3.ip;
> +
> +	if (keys->ports.ports) {
> +		keys->ports.src = rev ? tuple->dst.u.all : tuple->src.u.all;
> +		keys->ports.dst = rev ? tuple->src.u.all : tuple->dst.u.all;
> +	}
> +	if (rev)
> +		nf_ct_put(ct);
> +}

This is going to pull in the nf_conntrack module, even if you may not
want it, as soon as cake is in place.

^ permalink raw reply

* Re: [net-next PATCH v2 2/4] net: Enable Tx queue selection based on Rx queues
From: Tom Herbert @ 2018-05-22 14:09 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Amritha Nambiar, Linux Kernel Network Developers, David S. Miller,
	Alexander Duyck, Sridhar Samudrala, Eric Dumazet,
	Hannes Frederic Sowa
In-Reply-To: <CAF=yD-+Um_ZjcPJyL6G8h0isBDG35q1d=mB4X7NB=unu813KOg@mail.gmail.com>

On Mon, May 21, 2018 at 8:12 AM, Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
> On Mon, May 21, 2018 at 10:51 AM, Tom Herbert <tom@herbertland.com> wrote:
>> On Sat, May 19, 2018 at 1:27 PM, Willem de Bruijn
>> <willemdebruijn.kernel@gmail.com> wrote:
>>> On Sat, May 19, 2018 at 4:13 PM, Willem de Bruijn
>>> <willemdebruijn.kernel@gmail.com> wrote:
>>>> On Fri, May 18, 2018 at 12:03 AM, Tom Herbert <tom@herbertland.com> wrote:
>>>>> On Tue, May 15, 2018 at 6:26 PM, Amritha Nambiar
>>>>> <amritha.nambiar@intel.com> wrote:
>>>>>> This patch adds support to pick Tx queue based on the Rx queue map
>>>>>> configuration set by the admin through the sysfs attribute
>>>>>> for each Tx queue. If the user configuration for receive
>>>>>> queue map does not apply, then the Tx queue selection falls back
>>>>>> to CPU map based selection and finally to hashing.
>>>>>>
>>>>>> Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
>>>>>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>>>>>> ---
>>>
>>>>>> +static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>>>>>> +{
>>>>>> +#ifdef CONFIG_XPS
>>>>>> +       enum xps_map_type i = XPS_MAP_RXQS;
>>>>>> +       struct xps_dev_maps *dev_maps;
>>>>>> +       struct sock *sk = skb->sk;
>>>>>> +       int queue_index = -1;
>>>>>> +       unsigned int tci = 0;
>>>>>> +
>>>>>> +       if (sk && sk->sk_rx_queue_mapping <= dev->real_num_rx_queues &&
>>>>>> +           dev->ifindex == sk->sk_rx_ifindex)
>>>>>> +               tci = sk->sk_rx_queue_mapping;
>>>>>> +
>>>>>> +       rcu_read_lock();
>>>>>> +       while (queue_index < 0 && i < __XPS_MAP_MAX) {
>>>>>> +               if (i == XPS_MAP_CPUS)
>>>>>
>>>>> This while loop typifies exactly why I don't think the XPS maps should
>>>>> be an array.
>>>>
>>>> +1
>>>
>>> as a matter of fact, as enabling both cpu and rxqueue map at the same
>>> time makes no sense, only one map is needed at any one time. The
>>> only difference is in how it is indexed. It should probably not be possible
>>> to configure both at the same time. Keeping a single map probably also
>>> significantly simplifies patch 1/4.
>>
>> Willem,
>>
>> I think it might make sense to have them both. Maybe one application
>> is spin polling that needs this, where others might be happy with
>> normal CPU mappings as default.
>
> Some entries in the rx_queue table have queue_pair affinity
> configured, the others return -1 to fall through to the cpu
> affinity table?
>
Right, that's the intent of the while loop.

> I guess that implies flow steering to those special purpose
> queues. I wonder whether this would be used in practice.
> It does make the code more complex by having to duplicate
> the map lookup logic (mostly, patch 1/4).

That's a good point. I think we need more information on how the
feature is going to be used in practice. My assumption is that there
are some number of "special" queues for which spin polling is being
done.

Tom

^ permalink raw reply

* Re: [PATCH net-next v15 4/7] sch_cake: Add NAT awareness to packet classifier
From: Toke Høiland-Jørgensen @ 2018-05-22 14:11 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netdev, cake, netfilter-devel
In-Reply-To: <20180522140759.2rl25eggaoaecw4m@salvia>

Pablo Neira Ayuso <pablo@netfilter.org> writes:

> Hi Toke,
>
> On Tue, May 22, 2018 at 03:57:38PM +0200, Toke Høiland-Jørgensen wrote:
>> When CAKE is deployed on a gateway that also performs NAT (which is a
>> common deployment mode), the host fairness mechanism cannot distinguish
>> internal hosts from each other, and so fails to work correctly.
>> 
>> To fix this, we add an optional NAT awareness mode, which will query the
>> kernel conntrack mechanism to obtain the pre-NAT addresses for each packet
>> and use that in the flow and host hashing.
>> 
>> When the shaper is enabled and the host is already performing NAT, the cost
>> of this lookup is negligible. However, in unlimited mode with no NAT being
>> performed, there is a significant CPU cost at higher bandwidths. For this
>> reason, the feature is turned off by default.
>> 
>> Cc: netfilter-devel@vger.kernel.org
>> Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
>> ---
>>  net/sched/sch_cake.c |   79 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 79 insertions(+)
>> 
>> diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
>> index 68ac908470f1..6f7cae705c84 100644
>> --- a/net/sched/sch_cake.c
>> +++ b/net/sched/sch_cake.c
>> @@ -71,6 +71,12 @@
>>  #include <net/tcp.h>
>>  #include <net/flow_dissector.h>
>>  
>> +#if IS_REACHABLE(CONFIG_NF_CONNTRACK)
>> +#include <net/netfilter/nf_conntrack_core.h>
>> +#include <net/netfilter/nf_conntrack_zones.h>
>> +#include <net/netfilter/nf_conntrack.h>
>> +#endif
>> +
>>  #define CAKE_SET_WAYS (8)
>>  #define CAKE_MAX_TINS (8)
>>  #define CAKE_QUEUES (1024)
>> @@ -516,6 +522,60 @@ static bool cobalt_should_drop(struct cobalt_vars *vars,
>>  	return drop;
>>  }
>>  
>> +#if IS_REACHABLE(CONFIG_NF_CONNTRACK)
>> +
>> +static void cake_update_flowkeys(struct flow_keys *keys,
>> +				 const struct sk_buff *skb)
>> +{
>> +	const struct nf_conntrack_tuple *tuple;
>> +	enum ip_conntrack_info ctinfo;
>> +	struct nf_conn *ct;
>> +	bool rev = false;
>> +
>> +	if (tc_skb_protocol(skb) != htons(ETH_P_IP))
>> +		return;
>> +
>> +	ct = nf_ct_get(skb, &ctinfo);
>> +	if (ct) {
>> +		tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo));
>> +	} else {
>> +		const struct nf_conntrack_tuple_hash *hash;
>> +		struct nf_conntrack_tuple srctuple;
>> +
>> +		if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
>> +				       NFPROTO_IPV4, dev_net(skb->dev),
>> +				       &srctuple))
>> +			return;
>> +
>> +		hash = nf_conntrack_find_get(dev_net(skb->dev),
>> +					     &nf_ct_zone_dflt,
>> +					     &srctuple);
>> +		if (!hash)
>> +			return;
>> +
>> +		rev = true;
>> +		ct = nf_ct_tuplehash_to_ctrack(hash);
>> +		tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir);
>> +	}
>> +
>> +	keys->addrs.v4addrs.src = rev ? tuple->dst.u3.ip : tuple->src.u3.ip;
>> +	keys->addrs.v4addrs.dst = rev ? tuple->src.u3.ip : tuple->dst.u3.ip;
>> +
>> +	if (keys->ports.ports) {
>> +		keys->ports.src = rev ? tuple->dst.u.all : tuple->src.u.all;
>> +		keys->ports.dst = rev ? tuple->src.u.all : tuple->dst.u.all;
>> +	}
>> +	if (rev)
>> +		nf_ct_put(ct);
>> +}
>
> This is going to pull in the nf_conntrack module, even if you may not
> want it, as soon as cake is in place.

Yeah, we are aware of that; we get a moddep on nf_conntrack. Our main
deployment scenario has been home routers where conntrack is used
anyway, so this has not been much of an issue. However, if there is a
way to avoid this, and instead detect at runtime if conntrack is
available, that would certainly be useful. Is there? :)

-Toke

^ permalink raw reply

* RE: [PATCH v2] packet: track ring entry use using a shadow ring to prevent RX ring overrun
From: Jon Rosen (jrosen) @ 2018-05-22 14:12 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: David S. Miller, Willem de Bruijn, Eric Dumazet, Kees Cook,
	David Windsor, Rosen, Rami, Reshetova, Elena, Mike Maloney,
	Benjamin Poirier, Thomas Gleixner, Greg Kroah-Hartman,
	open list:NETWORKING [GENERAL], open list
In-Reply-To: <CAF=yD-Kfto4jJYMFzn=PV8OYdhEmdNfW+aakDhRMzRBWhWY0UQ@mail.gmail.com>

On Monday, May 21, 2018 2:17 PM, Jon Rosen (jrosen) <jrosen@cisco.com> wrote:
> On Monday, May 21, 2018 1:07 PM, Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
>> On Mon, May 21, 2018 at 8:57 AM, Jon Rosen (jrosen) <jrosen@cisco.com> wrote:

...snip...

>>
>> A setsockopt for userspace to signal a stricter interpretation of
>> tp_status to elide the shadow hack could then be considered.
>> It's not pretty. Either way, no full new version is required.
>>
>>> As much as I would like to find a solution that doesn't require
>>> the spin lock I have yet to do so. Maybe the answer is that
>>> existing applications will need to suffer the performance impact
>>> but a new version or option for TPACKET_V1/V2 could be added to
>>> indicate strict adherence of the TP_STATUS_USER bit and then the
>>> original diffs could be used.

It looks like adding new socket options is pretty rare so I
wonder if a better option might be to define a new TP_STATUS_XXX
bit which would signal from a userspace application to the kernel
that it strictly interprets the TP_STATUS_USER bit to determine
ownership.

Today's applications set tp_status = TP_STATUS_KERNEL(0) for the
kernel to pick up the entry.  We could define a new value to pass
ownership as well as one to indicate to other kernel threads that
an entry is in use:

        #define TP_STATUS_USER_TO_KERNEL        (1 << 8)
        #define TP_STATUS_INUSE                 (1 << 9)

If the kernel sees tp_status == TP_STATUS_KERNEL then it should
use the shadow method for tracking ownership. If it sees tp_status
== TP_STATUS_USER_TO_KERNEL then it can use the TP_STATUS_INUSE
method.

>>>
>>> There is another option I was considering but have yet to try
>>> which would avoid needing a shadow ring by using counter(s) to
>>> track maximum sequence number queued to userspace vs. the next
>>> sequence number to be allocated in the ring.  If the difference
>>> is greater than the size of the ring then the ring can be
>>> considered full and the allocation would fail. Of course this may
>>> create an additional hotspot between cores, not sure if that
>>> would be significant or not.
>>
>> Please do have a look, but I don't think that this will work in this
>> case in practice. It requires tracking the producer tail. Updating
>> the slowest writer requires probing each subsequent slot's status
>> byte to find the new tail, which is a lot of (by then cold) cacheline
>> reads.
>
> I've thought about it a little more and am not convinced it's
> workable but I'll spend a little more time on it before giving
> up.

I've given up on this method.  Just don't see how to make it work.


^ permalink raw reply

* pull-request: wireless-drivers 2018-05-22
From: Kalle Valo @ 2018-05-22 14:28 UTC (permalink / raw)
  To: David Miller; +Cc: linux-wireless, netdev, linux-kernel

Hi Dave,

here's a pull request to net tree for 4.17. Please let me know if you
have any problems.

Kalle

The following changes since commit a8d7aa17bbc970971ccdf71988ea19230ab368b1:

  dccp: fix tasklet usage (2018-05-03 15:14:57 -0400)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/wireless-drivers.git tags/wireless-drivers-for-davem-2018-05-22

for you to fetch changes up to 813477aa49aac5deba04eb4956360dde58a0e807:

  MAINTAINERS: change Kalle as wcn36xx maintainer (2018-05-22 15:36:41 +0300)

----------------------------------------------------------------
wireless-drivers fixes for 4.17

Hopefully the last fixes for 4.17. ssb is again causing problems so we
had to revert a commit and fix it better. Also a small fix to bcma and
some MAINTAINERS file updates.

ssb

* fix regression with all module PCI cards, for example using b43 and
  b44 drivers

* try again fixing a MIPS linker error

bcma

* fix truncated info log messages

----------------------------------------------------------------
Kalle Valo (3):
      MAINTAINERS: update Kalle's email address
      MAINTAINERS: change Kalle as ath.ko maintainer
      MAINTAINERS: change Kalle as wcn36xx maintainer

Rafał Miłecki (3):
      bcma: fix buffer size caused crash in bcma_core_mips_print_irq()
      Revert "ssb: Prevent build of PCI host features in module"
      ssb: make SSB_PCICORE_HOSTMODE depend on SSB = y

 MAINTAINERS                | 8 ++++----
 drivers/bcma/driver_mips.c | 2 +-
 drivers/ssb/Kconfig        | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

^ permalink raw reply

* Re: [PATCH 1/4] arcnet: com20020: Add com20020 io mapped version
From: Andrea Greco @ 2018-05-22 14:44 UTC (permalink / raw)
  To: David Miller; +Cc: tobin, a.greco, m.grzeschik, linux-kernel, netdev
In-Reply-To: <20180518.135152.51730771671749217.davem@davemloft.net>

On 05/18/2018 07:51 PM, David Miller wrote:
> From: Andrea Greco <andrea.greco.gapmilano@gmail.com>
> Date: Fri, 18 May 2018 14:18:41 +0200
> 
>> In com20020.c found this:
>> /* FIXME: do this some other way! */
>> if (!dev->dev_addr[0])
>> dev->dev_addr[0] = arcnet_inb(ioaddr, 8);
>>
>> NODE-ID, must be univoque, for all arcnet network.
>> My previews idea was take random value but, this could create a
>> collision over network.
>>
>> A possible solution is:
>> In case of collision com20020 set a bit in status register.
>> Then peak a new NODE-ID and repeat this while correct NODE-ID is found.
>>
>> Other ideas is pass it via DTS.
>> But suppose have 2 same product in same network, same address same problem.
>> For this reason i prefer left standard driver behavior.
>>
>> Other ideas for solve this ?
> 
> Is there no way to obtain a unique value from the device?
> 
> If having a unique ID to talk on the ARCNET is so critical, there must
> be some way to properly allocation and use a unique ID.

Device can rise interrupt in case of Duplicate ID over the network.

> I guess this must be a general problem with this driver already.
I think the arcnet network and its NODE-IDs are designed during the project 
phase, and the address is always fixed.

In fact:
Other versions of this driver (PCI, ISA and friends)
simply work as modules, and the Node-ID is a module parameter.

My opinion is that:
Before run `ifconfig arc0 up`, user has to setup hardware address with 
`ip link set dev arc0 address D2`.

All this is like a IP network with static address. If your IP address is 
duplicated, is not IP problem.

> You still need to address the issue of 'dev' being leaked on probe
> error paths.
To solve this, I think that, all things considered, a random address could be a good 
solution.

Regards, Andrea

^ permalink raw reply

* Re: [PATCH net-next 00/13] nfp: abm: add basic support for advanced buffering NIC
From: Or Gerlitz @ 2018-05-22 14:50 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: David Miller, Linux Netdev List, oss-drivers, Andy Gospodarek,
	linux-internal
In-Reply-To: <CAJpBn1ywEO2MayK=EvzzLYLXfZ2xuWCfyiaAYLtL-gnrj2gKwQ@mail.gmail.com>

On Tue, May 22, 2018 at 10:56 AM, Jakub Kicinski
<jakub.kicinski@netronome.com> wrote:
> On Mon, May 21, 2018 at 11:32 PM, Or Gerlitz wrote:
>> On Tue, May 22, 2018 at 8:12 AM, Jakub Kicinski wrote:
>>> Hi!
>>>
>>> This series lays groundwork for advanced buffer management NIC feature.
>>> It makes necessary NFP core changes, spawns representors and adds devlink
>>> glue.  Following series will add the actual buffering configuration (patch
>>> series size limit).
>>>
>>> First three patches add support for configuring NFP buffer pools via a
>>> mailbox.  The existing devlink APIs are used for the purpose.
>>>
>>> Third patch allows us to perform small reads from the NFP memory.
>>>
>>> The rest of the patch set adds eswitch mode change support and makes
>>> the driver spawn appropriate representors.
>>
>> Hi Jakub,
>>
>> Could you provide more higher level description on the abm use-case
>> and nature of these representors? I understand that under abm you are
>> modeling the nic as switch with vNIC ports, does vNIC port and vNIC
>> port rep have the same characteristics as VF and VF rep (xmit on one side
>> <--> send on 2nd side),  does traffic is to be offloaded using TC, etc.
>> What one would be doing with vNIC instance, hand it to container ala the Intel
>> VMDQ concept?
>> can this be seen as veth HW offload? etc

> Yes, the reprs can be used like VF reprs but that's not the main use
> case. We are targeting container world with ABM, so no VFs and no
> SR-IOV.  There is only one vNIC per port and no veth offload etc. In

one vNIC for multiple containers? or you have a (v?) port per container?

> In the most basic scenario with 1 PF corresponding to 1 port there is no
> real use for switching.

multiple containers? please clarify it a little better

> The main purpose here is that we want to setup the buffering and QoS
> inside the NIC (both for TX and RX) and then use eBPF to perform
> filtering, queue assignment and per-application RSS. That's pretty
> much it at this point.

> Switching if any will be a basic bridge offload.  QoS configuration
> will all be done using TC qdisc offload, RED etc. exactly like mlxsw :)

I guess I'll understand it better once you clarify the multiple
containers thing,
thanks for the details and openness

^ permalink raw reply

* [PATCH net-next v4 0/3] fib rule selftest
From: Roopa Prabhu @ 2018-05-22 14:51 UTC (permalink / raw)
  To: davem; +Cc: netdev, nikolay, dsa, idosch

From: Roopa Prabhu <roopa@cumulusnetworks.com>

This series adds a new test to test fib rules.
ip route get is used to test fib rule matches.
This series also extends ip route get to match on
sport and dport to test recent support of sport
and dport fib rule match.

v2 - address Ido's comment to make sport dport
ip route get to work correctly for input route
get. I don't support ip route get on ip-proto match yet.
ip route get creates a udp packet and i have left
it at that. We could extend ip route get to support
a few ip proto matches in followup patches.

v3 - Support ip_proto (only tcp and udp) match in getroute.
dropped printing of new match attrs in ip route get, 
because ipv6 does not print it. And ipv6 currently shares
the dump api with ipv6 notify and its better to not add them
to the notify api. dropped it to keep the api consistent between
ipv4 and ipv6 (though uid is already printed in the ipv4 case).
If we need it, both ipv4 and ipv6 can be enhanced to provide
a separate get api. Moved skb creation for ipv4 to a separate func.

v4 - drop separate skb for netlink and fix concerns around rcu and netlink
     reply (as pointed out by DaveM). I now try to reset the skb after the route
     lookup and before the netlink send (testing shows this is ok. More eyes and
     any feedback here will be helpful)

Roopa Prabhu (3):
  ipv4: support sport, dport and ip_proto in RTM_GETROUTE
  ipv6: support sport, dport and ip_proto in RTM_GETROUTE
  selftests: net: initial fib rule tests

 include/uapi/linux/rtnetlink.h                |   2 +
 net/ipv4/route.c                              | 152 ++++++++++++-----
 net/ipv6/route.c                              |  25 +++
 tools/testing/selftests/net/Makefile          |   2 +-
 tools/testing/selftests/net/fib_rule_tests.sh | 224 ++++++++++++++++++++++++++
 5 files changed, 366 insertions(+), 39 deletions(-)
 create mode 100644 tools/testing/selftests/net/fib_rule_tests.sh

-- 
2.1.4

^ permalink raw reply

* [PATCH net-next v4 1/3] ipv4: support sport, dport and ip_proto in RTM_GETROUTE
From: Roopa Prabhu @ 2018-05-22 14:51 UTC (permalink / raw)
  To: davem; +Cc: netdev, nikolay, dsa, idosch
In-Reply-To: <1527000668-25253-1-git-send-email-roopa@cumulusnetworks.com>

From: Roopa Prabhu <roopa@cumulusnetworks.com>

This is a followup to fib rules sport, dport and ipproto
match support. Only supports tcp, udp and icmp for ipproto.
Used by fib rule self tests.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
---
 include/net/ip.h               |   3 +
 include/uapi/linux/rtnetlink.h |   3 +
 net/ipv4/Makefile              |   2 +-
 net/ipv4/fib_frontend.c        |   4 ++
 net/ipv4/netlink.c             |  23 +++++++
 net/ipv4/route.c               | 146 ++++++++++++++++++++++++++++++-----------
 6 files changed, 141 insertions(+), 40 deletions(-)
 create mode 100644 net/ipv4/netlink.c

diff --git a/include/net/ip.h b/include/net/ip.h
index bada1f1..0d2281b 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -664,4 +664,7 @@ extern int sysctl_icmp_msgs_burst;
 int ip_misc_proc_init(void);
 #endif
 
+int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto,
+				struct netlink_ext_ack *extack);
+
 #endif	/* _IP_H */
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 9b15005..cabb210 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -327,6 +327,9 @@ enum rtattr_type_t {
 	RTA_PAD,
 	RTA_UID,
 	RTA_TTL_PROPAGATE,
+	RTA_IP_PROTO,
+	RTA_SPORT,
+	RTA_DPORT,
 	__RTA_MAX
 };
 
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index b379520..13f2ba9 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -14,7 +14,7 @@ obj-y     := route.o inetpeer.o protocol.o \
 	     udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
 	     fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
 	     inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
-	     metrics.o
+	     metrics.o netlink.o
 
 obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
 obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 4d622112..cf5cfc5 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -649,6 +649,10 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
 	[RTA_ENCAP]		= { .type = NLA_NESTED },
 	[RTA_UID]		= { .type = NLA_U32 },
 	[RTA_MARK]		= { .type = NLA_U32 },
+	[RTA_TABLE]		= { .type = NLA_U32 },
+	[RTA_IP_PROTO]		= { .type = NLA_U8 },
+	[RTA_SPORT]		= { .type = NLA_U16 },
+	[RTA_DPORT]		= { .type = NLA_U16 },
 };
 
 static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
diff --git a/net/ipv4/netlink.c b/net/ipv4/netlink.c
new file mode 100644
index 0000000..f86bb4f
--- /dev/null
+++ b/net/ipv4/netlink.c
@@ -0,0 +1,23 @@
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/types.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/ip.h>
+
+int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto,
+				struct netlink_ext_ack *extack)
+{
+	*ip_proto = nla_get_u8(attr);
+
+	switch (*ip_proto) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_ICMP:
+		return 0;
+	default:
+		NL_SET_ERR_MSG(extack, "Unsupported ip proto");
+		return -EOPNOTSUPP;
+	}
+}
+EXPORT_SYMBOL_GPL(rtm_getroute_parse_ip_proto);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2cfa1b5..0e401dc 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2574,11 +2574,10 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
 EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
 /* called with rcu_read_lock held */
-static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
-			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
-			u32 seq)
+static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
+			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
+			struct sk_buff *skb, u32 portid, u32 seq)
 {
-	struct rtable *rt = skb_rtable(skb);
 	struct rtmsg *r;
 	struct nlmsghdr *nlh;
 	unsigned long expires = 0;
@@ -2674,7 +2673,7 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
 			}
 		} else
 #endif
-			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
+			if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
 				goto nla_put_failure;
 	}
 
@@ -2689,43 +2688,93 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
 	return -EMSGSIZE;
 }
 
+static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
+						   u8 ip_proto, __be16 sport,
+						   __be16 dport)
+{
+	struct sk_buff *skb;
+	struct iphdr *iph;
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb)
+		return NULL;
+
+	/* Reserve room for dummy headers, this skb can pass
+	 * through good chunk of routing engine.
+	 */
+	skb_reset_mac_header(skb);
+	skb_reset_network_header(skb);
+	skb->protocol = htons(ETH_P_IP);
+	iph = skb_put(skb, sizeof(struct iphdr));
+	iph->protocol = ip_proto;
+	iph->saddr = src;
+	iph->daddr = dst;
+	iph->version = 0x4;
+	iph->frag_off = 0;
+	iph->ihl = 0x5;
+	skb_set_transport_header(skb, skb->len);
+
+	switch (iph->protocol) {
+	case IPPROTO_UDP: {
+		struct udphdr *udph;
+
+		udph = skb_put_zero(skb, sizeof(struct udphdr));
+		udph->source = sport;
+		udph->dest = dport;
+		udph->len = sizeof(struct udphdr);
+		udph->check = 0;
+		break;
+	}
+	case IPPROTO_TCP: {
+		struct tcphdr *tcph;
+
+		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
+		tcph->source	= sport;
+		tcph->dest	= dport;
+		tcph->doff	= sizeof(struct tcphdr) / 4;
+		tcph->rst = 1;
+		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
+					    src, dst, 0);
+		break;
+	}
+	case IPPROTO_ICMP: {
+		struct icmphdr *icmph;
+
+		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
+		icmph->type = ICMP_ECHO;
+		icmph->code = 0;
+	}
+	}
+
+	return skb;
+}
+
 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 			     struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(in_skb->sk);
-	struct rtmsg *rtm;
 	struct nlattr *tb[RTA_MAX+1];
+	u32 table_id = RT_TABLE_MAIN;
+	__be16 sport = 0, dport = 0;
 	struct fib_result res = {};
+	u8 ip_proto = IPPROTO_UDP;
 	struct rtable *rt = NULL;
+	struct sk_buff *skb;
+	struct rtmsg *rtm;
 	struct flowi4 fl4;
 	__be32 dst = 0;
 	__be32 src = 0;
+	kuid_t uid;
 	u32 iif;
 	int err;
 	int mark;
-	struct sk_buff *skb;
-	u32 table_id = RT_TABLE_MAIN;
-	kuid_t uid;
 
 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
 			  extack);
 	if (err < 0)
-		goto errout;
+		return err;
 
 	rtm = nlmsg_data(nlh);
-
-	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
-	if (!skb) {
-		err = -ENOBUFS;
-		goto errout;
-	}
-
-	/* Reserve room for dummy headers, this skb can pass
-	   through good chunk of routing engine.
-	 */
-	skb_reset_mac_header(skb);
-	skb_reset_network_header(skb);
-
 	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
 	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
@@ -2735,14 +2784,22 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	else
 		uid = (iif ? INVALID_UID : current_uid());
 
-	/* Bugfix: need to give ip_route_input enough of an IP header to
-	 * not gag.
-	 */
-	ip_hdr(skb)->protocol = IPPROTO_UDP;
-	ip_hdr(skb)->saddr = src;
-	ip_hdr(skb)->daddr = dst;
+	if (tb[RTA_IP_PROTO]) {
+		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
+						  &ip_proto, extack);
+		if (err)
+			return err;
+	}
+
+	if (tb[RTA_SPORT])
+		sport = nla_get_be16(tb[RTA_SPORT]);
 
-	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
+	if (tb[RTA_DPORT])
+		dport = nla_get_be16(tb[RTA_DPORT]);
+
+	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
+	if (!skb)
+		return -ENOBUFS;
 
 	memset(&fl4, 0, sizeof(fl4));
 	fl4.daddr = dst;
@@ -2751,6 +2808,11 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
 	fl4.flowi4_mark = mark;
 	fl4.flowi4_uid = uid;
+	if (sport)
+		fl4.fl4_sport = sport;
+	if (dport)
+		fl4.fl4_dport = dport;
+	fl4.flowi4_proto = ip_proto;
 
 	rcu_read_lock();
 
@@ -2760,10 +2822,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 		dev = dev_get_by_index_rcu(net, iif);
 		if (!dev) {
 			err = -ENODEV;
-			goto errout_free;
+			goto errout_rcu;
 		}
 
-		skb->protocol	= htons(ETH_P_IP);
+		fl4.flowi4_iif = iif; /* for rt_fill_info */
 		skb->dev	= dev;
 		skb->mark	= mark;
 		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
@@ -2783,7 +2845,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	}
 
 	if (err)
-		goto errout_free;
+		goto errout_rcu;
 
 	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		rt->rt_flags |= RTCF_NOTIFY;
@@ -2791,34 +2853,40 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
 		table_id = res.table ? res.table->tb_id : 0;
 
+	/* reset skb for netlink reply msg */
+	skb_trim(skb, 0);
+	skb_reset_network_header(skb);
+	skb_reset_transport_header(skb);
+	skb_reset_mac_header(skb);
+
 	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
 		if (!res.fi) {
 			err = fib_props[res.type].error;
 			if (!err)
 				err = -EHOSTUNREACH;
-			goto errout_free;
+			goto errout_rcu;
 		}
 		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
 				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
 				    rt->rt_type, res.prefix, res.prefixlen,
 				    fl4.flowi4_tos, res.fi, 0);
 	} else {
-		err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
+		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
 				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
 	}
 	if (err < 0)
-		goto errout_free;
+		goto errout_rcu;
 
 	rcu_read_unlock();
 
 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
-errout:
-	return err;
 
 errout_free:
+	return err;
+errout_rcu:
 	rcu_read_unlock();
 	kfree_skb(skb);
-	goto errout;
+	goto errout_free;
 }
 
 void ip_rt_multicast_event(struct in_device *in_dev)
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next v4 2/3] ipv6: support sport, dport and ip_proto in RTM_GETROUTE
From: Roopa Prabhu @ 2018-05-22 14:51 UTC (permalink / raw)
  To: davem; +Cc: netdev, nikolay, dsa, idosch
In-Reply-To: <1527000668-25253-1-git-send-email-roopa@cumulusnetworks.com>

From: Roopa Prabhu <roopa@cumulusnetworks.com>

This is a followup to fib6 rules sport, dport and ipproto
match support. Only supports tcp, udp and icmp for ipproto.
Used by fib rule self tests.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
---
 net/ipv6/route.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index cc24ed3..7f1babb 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -63,6 +63,7 @@
 #include <net/lwtunnel.h>
 #include <net/ip_tunnels.h>
 #include <net/l3mdev.h>
+#include <net/ip.h>
 #include <trace/events/fib6.h>
 
 #include <linux/uaccess.h>
@@ -4083,6 +4084,9 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
 	[RTA_UID]		= { .type = NLA_U32 },
 	[RTA_MARK]		= { .type = NLA_U32 },
 	[RTA_TABLE]		= { .type = NLA_U32 },
+	[RTA_IP_PROTO]		= { .type = NLA_U8 },
+	[RTA_SPORT]		= { .type = NLA_U16 },
+	[RTA_DPORT]		= { .type = NLA_U16 },
 };
 
 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -4794,6 +4798,19 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	else
 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
 
+	if (tb[RTA_SPORT])
+		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
+
+	if (tb[RTA_DPORT])
+		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
+
+	if (tb[RTA_IP_PROTO]) {
+		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
+						  &fl6.flowi6_proto, extack);
+		if (err)
+			goto errout;
+	}
+
 	if (iif) {
 		struct net_device *dev;
 		int flags = 0;
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next v4 3/3] selftests: net: initial fib rule tests
From: Roopa Prabhu @ 2018-05-22 14:51 UTC (permalink / raw)
  To: davem; +Cc: netdev, nikolay, dsa, idosch
In-Reply-To: <1527000668-25253-1-git-send-email-roopa@cumulusnetworks.com>

From: Roopa Prabhu <roopa@cumulusnetworks.com>

This adds a first set of tests for fib rule match/action for
ipv4 and ipv6. Initial tests only cover action lookup table.
can be extended to cover other actions in the future.
Uses ip route get to validate the rule lookup.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
---
 tools/testing/selftests/net/Makefile          |   2 +-
 tools/testing/selftests/net/fib_rule_tests.sh | 248 ++++++++++++++++++++++++++
 2 files changed, 249 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/net/fib_rule_tests.sh

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index e60dddb..7cb0f49 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -6,7 +6,7 @@ CFLAGS += -I../../../../usr/include/
 
 TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh
 TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh
-TEST_PROGS += udpgso_bench.sh
+TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh
 TEST_PROGS_EXTENDED := in_netns.sh
 TEST_GEN_FILES =  socket
 TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh
new file mode 100644
index 0000000..d4cfb6a
--- /dev/null
+++ b/tools/testing/selftests/net/fib_rule_tests.sh
@@ -0,0 +1,248 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test is for checking IPv4 and IPv6 FIB rules API
+
+ret=0
+
+PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no}
+IP="ip -netns testns"
+
+RTABLE=100
+GW_IP4=192.51.100.2
+SRC_IP=192.51.100.3
+GW_IP6=2001:db8:1::2
+SRC_IP6=2001:db8:1::3
+
+DEV_ADDR=192.51.100.1
+DEV=dummy0
+
+log_test()
+{
+	local rc=$1
+	local expected=$2
+	local msg="$3"
+
+	if [ ${rc} -eq ${expected} ]; then
+		nsuccess=$((nsuccess+1))
+		printf "\n    TEST: %-50s  [ OK ]\n" "${msg}"
+	else
+		nfail=$((nfail+1))
+		printf "\n    TEST: %-50s  [FAIL]\n" "${msg}"
+		if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
+			echo
+			echo "hit enter to continue, 'q' to quit"
+			read a
+			[ "$a" = "q" ] && exit 1
+		fi
+	fi
+}
+
+log_section()
+{
+	echo
+	echo "######################################################################"
+	echo "TEST SECTION: $*"
+	echo "######################################################################"
+}
+
+setup()
+{
+	set -e
+	ip netns add testns
+	$IP link set dev lo up
+
+	$IP link add dummy0 type dummy
+	$IP link set dev dummy0 up
+	$IP address add 198.51.100.1/24 dev dummy0
+	$IP -6 address add 2001:db8:1::1/64 dev dummy0
+
+	set +e
+}
+
+cleanup()
+{
+	$IP link del dev dummy0 &> /dev/null
+	ip netns del testns
+}
+
+fib_check_iproute_support()
+{
+	ip rule help 2>&1 | grep -q $1
+	if [ $? -ne 0 ]; then
+		echo "SKIP: iproute2 iprule too old, missing $1 match"
+		return 1
+	fi
+
+	ip route get help 2>&1 | grep -q $2
+	if [ $? -ne 0 ]; then
+		echo "SKIP: iproute2 get route too old, missing $2 match"
+		return 1
+	fi
+
+	return 0
+}
+
+fib_rule6_del()
+{
+	$IP -6 rule del $1
+	log_test $? 0 "rule6 del $1"
+}
+
+fib_rule6_del_by_pref()
+{
+	pref=$($IP -6 rule show | grep "$1 lookup $TABLE" | cut -d ":" -f 1)
+	$IP -6 rule del pref $pref
+}
+
+fib_rule6_test_match_n_redirect()
+{
+	local match="$1"
+	local getmatch="$2"
+
+	$IP -6 rule add $match table $RTABLE
+	$IP -6 route get $GW_IP6 $getmatch | grep -q "table $RTABLE"
+	log_test $? 0 "rule6 check: $1"
+
+	fib_rule6_del_by_pref "$match"
+	log_test $? 0 "rule6 del by pref: $match"
+}
+
+fib_rule6_test()
+{
+	# setup the fib rule redirect route
+	$IP -6 route add table $RTABLE default via $GW_IP6 dev $DEV onlink
+
+	match="oif $DEV"
+	fib_rule6_test_match_n_redirect "$match" "$match" "oif redirect to table"
+
+	match="from $SRC_IP6 iif $DEV"
+	fib_rule6_test_match_n_redirect "$match" "$match" "iif redirect to table"
+
+	match="tos 0x10"
+	fib_rule6_test_match_n_redirect "$match" "$match" "tos redirect to table"
+
+	match="fwmark 0x64"
+	getmatch="mark 0x64"
+	fib_rule6_test_match_n_redirect "$match" "$getmatch" "fwmark redirect to table"
+
+	fib_check_iproute_support "uidrange" "uid"
+	if [ $? -eq 0 ]; then
+		match="uidrange 100-100"
+		getmatch="uid 100"
+		fib_rule6_test_match_n_redirect "$match" "$getmatch" "uid redirect to table"
+	fi
+
+	fib_check_iproute_support "sport" "sport"
+	if [ $? -eq 0 ]; then
+		match="sport 666 dport 777"
+		fib_rule6_test_match_n_redirect "$match" "$match" "sport and dport redirect to table"
+	fi
+
+	fib_check_iproute_support "ipproto" "ipproto"
+	if [ $? -eq 0 ]; then
+		match="ipproto tcp"
+		fib_rule6_test_match_n_redirect "$match" "$match" "ipproto match"
+	fi
+
+	fib_check_iproute_support "ipproto" "ipproto"
+	if [ $? -eq 0 ]; then
+		match="ipproto icmp"
+		fib_rule6_test_match_n_redirect "$match" "$match" "ipproto icmp match"
+	fi
+}
+
+fib_rule4_del()
+{
+	$IP rule del $1
+	log_test $? 0 "del $1"
+}
+
+fib_rule4_del_by_pref()
+{
+	pref=$($IP rule show | grep "$1 lookup $TABLE" | cut -d ":" -f 1)
+	$IP rule del pref $pref
+}
+
+fib_rule4_test_match_n_redirect()
+{
+	local match="$1"
+	local getmatch="$2"
+
+	$IP rule add $match table $RTABLE
+	$IP route get $GW_IP4 $getmatch | grep -q "table $RTABLE"
+	log_test $? 0 "rule4 check: $1"
+
+	fib_rule4_del_by_pref "$match"
+	log_test $? 0 "rule4 del by pref: $match"
+}
+
+fib_rule4_test()
+{
+	# setup the fib rule redirect route
+	$IP route add table $RTABLE default via $GW_IP4 dev $DEV onlink
+
+	match="oif $DEV"
+	fib_rule4_test_match_n_redirect "$match" "$match" "oif redirect to table"
+
+	match="from $SRC_IP iif $DEV"
+	fib_rule4_test_match_n_redirect "$match" "$match" "iif redirect to table"
+
+	match="tos 0x10"
+	fib_rule4_test_match_n_redirect "$match" "$match" "tos redirect to table"
+
+	match="fwmark 0x64"
+	getmatch="mark 0x64"
+	fib_rule4_test_match_n_redirect "$match" "$getmatch" "fwmark redirect to table"
+
+	fib_check_iproute_support "uidrange" "uid"
+	if [ $? -eq 0 ]; then
+		match="uidrange 100-100"
+		getmatch="uid 100"
+		fib_rule4_test_match_n_redirect "$match" "$getmatch" "uid redirect to table"
+	fi
+
+	fib_check_iproute_support "sport" "sport"
+	if [ $? -eq 0 ]; then
+		match="sport 666 dport 777"
+		fib_rule4_test_match_n_redirect "$match" "$match" "sport and dport redirect to table"
+	fi
+
+	fib_check_iproute_support "ipproto" "ipproto"
+	if [ $? -eq 0 ]; then
+		match="ipproto tcp"
+		fib_rule4_test_match_n_redirect "$match" "$match" "ipproto tcp match"
+	fi
+
+	fib_check_iproute_support "ipproto" "ipproto"
+	if [ $? -eq 0 ]; then
+		match="ipproto icmp"
+		fib_rule4_test_match_n_redirect "$match" "$match" "ipproto icmp match"
+	fi
+}
+
+run_fibrule_tests()
+{
+	log_section "IPv4 fib rule"
+	fib_rule4_test
+	log_section "IPv6 fib rule"
+	fib_rule6_test
+}
+
+if [ "$(id -u)" -ne 0 ];then
+	echo "SKIP: Need root privileges"
+	exit 0
+fi
+
+if [ ! -x "$(command -v ip)" ]; then
+	echo "SKIP: Could not run test without ip tool"
+	exit 0
+fi
+
+# start clean
+cleanup &> /dev/null
+setup
+run_fibrule_tests
+cleanup
+
+exit $ret
-- 
2.1.4

^ permalink raw reply related

* Re: [PATCH net-next v4 1/3] ipv4: support sport, dport and ip_proto in RTM_GETROUTE
From: Eric Dumazet @ 2018-05-22 15:04 UTC (permalink / raw)
  To: Roopa Prabhu, davem; +Cc: netdev, nikolay, dsa, idosch
In-Reply-To: <1527000668-25253-2-git-send-email-roopa@cumulusnetworks.com>



On 05/22/2018 07:51 AM, Roopa Prabhu wrote:
> From: Roopa Prabhu <roopa@cumulusnetworks.com>
> 
> 

...

> diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
> index 4d622112..cf5cfc5 100644
> --- a/net/ipv4/fib_frontend.c
> +++ b/net/ipv4/fib_frontend.c
> @@ -649,6 +649,10 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
>  	[RTA_ENCAP]		= { .type = NLA_NESTED },
>  	[RTA_UID]		= { .type = NLA_U32 },
>  	[RTA_MARK]		= { .type = NLA_U32 },
> +	[RTA_TABLE]		= { .type = NLA_U32 },
> +	[RTA_IP_PROTO]		= { .type = NLA_U8 },
> +	[RTA_SPORT]		= { .type = NLA_U16 },
> +	[RTA_DPORT]		= { .type = NLA_U16 },
>  };

Hi Roopa

RTA_TABLE addition looks like a bug fix for net tree ?

This should be sent as an independent patch IMO.

Thanks.

^ permalink raw reply

* Re: [PATCH net-next v11 2/5] netvsc: refactor notifier/event handling code to use the failover framework
From: Jiri Pirko @ 2018-05-22 15:13 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Sridhar Samudrala, stephen, davem, netdev, virtualization,
	virtio-dev, jesse.brandeburg, alexander.h.duyck, kubakici,
	jasowang, loseweigh, aaron.f.brown, anjali.singhai
In-Reply-To: <20180522163502-mutt-send-email-mst@kernel.org>

Tue, May 22, 2018 at 03:39:33PM CEST, mst@redhat.com wrote:
>On Tue, May 22, 2018 at 03:26:26PM +0200, Jiri Pirko wrote:
>> Tue, May 22, 2018 at 03:17:37PM CEST, mst@redhat.com wrote:
>> >On Tue, May 22, 2018 at 03:14:22PM +0200, Jiri Pirko wrote:
>> >> Tue, May 22, 2018 at 03:12:40PM CEST, mst@redhat.com wrote:
>> >> >On Tue, May 22, 2018 at 11:08:53AM +0200, Jiri Pirko wrote:
>> >> >> Tue, May 22, 2018 at 11:06:37AM CEST, jiri@resnulli.us wrote:
>> >> >> >Tue, May 22, 2018 at 04:06:18AM CEST, sridhar.samudrala@intel.com wrote:
>> >> >> >>Use the registration/notification framework supported by the generic
>> >> >> >>failover infrastructure.
>> >> >> >>
>> >> >> >>Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>> >> >> >
>> >> >> >In previous patchset versions, the common code did
>> >> >> >netdev_rx_handler_register() and netdev_upper_dev_link() etc
>> >> >> >(netvsc_vf_join()). Now, this is still done in netvsc. Why?
>> >> >> >
>> >> >> >This should be part of the common "failover" code.
>> >> >> >
>> >> >> 
>> >> >> Also note that in the current patchset you use IFF_FAILOVER flag for
>> >> >> master, yet for the slave you use IFF_SLAVE. That is wrong.
>> >> >> IFF_FAILOVER_SLAVE should be used.
>> >> >
>> >> >Or drop IFF_FAILOVER_SLAVE and set both IFF_FAILOVER and IFF_SLAVE?
>> >> 
>> >> No. IFF_SLAVE is for bonding.
>> >
>> >What breaks if we reuse it for failover?
>> 
>> This is exposed to userspace. IFF_SLAVE is expected for bonding slaves.
>> And failover slave is not a bonding slave.
>
>That does not really answer the question.  I'd claim it's sufficiently
>like a bond slave for IFF_SLAVE to make sense.
>
>In fact you will find that netvsc already sets IFF_SLAVE, and so

netvsc does the whole failover thing in a wrong way. This patchset is
trying to fix it.

>does e.g. the eql driver.
>
>The advantage of using IFF_SLAVE is that userspace knows to skip it.  If

The userspace should know how to skip other types of slaves - team,
bridge, ovs, etc. The "master link" should be the one to look at.


>we don't set IFF_SLAVE existing userspace tries to use the lowerdev.

Each master type has an IFF_ master flag and an IFF_ slave flag, in the
private flags. I don't see any reason to break this pattern here.

^ permalink raw reply

* Re: [PATCH net-next] net: stmmac: Add PPS and Flexible PPS support
From: Andrew Lunn @ 2018-05-22 15:14 UTC (permalink / raw)
  To: Jose Abreu, Richard Cochran
  Cc: netdev, David S. Miller, Joao Pinto, Vitor Soares,
	Giuseppe Cavallaro, Alexandre Torgue
In-Reply-To: <072478625b1cb3d4af9e3b42f83ece7303fd554e.1526993857.git.joabreu@synopsys.com>

On Tue, May 22, 2018 at 01:58:40PM +0100, Jose Abreu wrote:
> This adds support for PPS output and Flexible PPS (which is equivalent
> to per_out output of PTP subsystem).

You forgot to Cc: the PTP maintainer, Richard Cochran <richardcochran@gmail.com>

    Andrew

^ permalink raw reply

* Re: [PATCH] net/mlx4: fix spelling mistake: "Inrerface" -> "Interface"
From: Tariq Toukan @ 2018-05-22 15:21 UTC (permalink / raw)
  To: Colin King, Tariq Toukan, David S . Miller, netdev, linux-rdma
  Cc: kernel-janitors, linux-kernel
In-Reply-To: <20180522083728.5874-1-colin.king@canonical.com>



On 22/05/2018 11:37 AM, Colin King wrote:
> From: Colin Ian King <colin.king@canonical.com>
> 
> Trivial fix to spelling mistake in mlx4_dbg debug message
> 
> Signed-off-by: Colin Ian King <colin.king@canonical.com>
> ---
>   drivers/net/ethernet/mellanox/mlx4/intf.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx4/intf.c b/drivers/net/ethernet/mellanox/mlx4/intf.c
> index 2edcce98ab2d..6bd4103265d2 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/intf.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/intf.c
> @@ -172,7 +172,7 @@ int mlx4_do_bond(struct mlx4_dev *dev, bool enable)
>   		list_add_tail(&dev_ctx->list, &priv->ctx_list);
>   		spin_unlock_irqrestore(&priv->ctx_lock, flags);
>   
> -		mlx4_dbg(dev, "Inrerface for protocol %d restarted with when bonded mode is %s\n",
> +		mlx4_dbg(dev, "Interface for protocol %d restarted with when bonded mode is %s\n",

Thanks Colin.
I think there's one more thing to fix here.
It is redundant to say "with when", it was probably done by mistake. 
Let's rephrase, maybe this way?

restarted with bonded mode %s

>   			 dev_ctx->intf->protocol, enable ?
>   			 "enabled" : "disabled");
>   	}
> 

^ permalink raw reply

* Re: [PATCH] net/mlx4: fix spelling mistake: "Inrerface" -> "Interface"
From: Colin Ian King @ 2018-05-22 15:23 UTC (permalink / raw)
  To: Tariq Toukan, David S . Miller, netdev, linux-rdma
  Cc: kernel-janitors, linux-kernel
In-Reply-To: <77704a43-2b78-e88b-04b9-9a2623e8b5fb@mellanox.com>

On 22/05/18 16:21, Tariq Toukan wrote:
> 
> 
> On 22/05/2018 11:37 AM, Colin King wrote:
>> From: Colin Ian King <colin.king@canonical.com>
>>
>> Trivial fix to spelling mistake in mlx4_dbg debug message
>>
>> Signed-off-by: Colin Ian King <colin.king@canonical.com>
>> ---
>>   drivers/net/ethernet/mellanox/mlx4/intf.c | 2 +-
>>   1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/ethernet/mellanox/mlx4/intf.c
>> b/drivers/net/ethernet/mellanox/mlx4/intf.c
>> index 2edcce98ab2d..6bd4103265d2 100644
>> --- a/drivers/net/ethernet/mellanox/mlx4/intf.c
>> +++ b/drivers/net/ethernet/mellanox/mlx4/intf.c
>> @@ -172,7 +172,7 @@ int mlx4_do_bond(struct mlx4_dev *dev, bool enable)
>>           list_add_tail(&dev_ctx->list, &priv->ctx_list);
>>           spin_unlock_irqrestore(&priv->ctx_lock, flags);
>>   -        mlx4_dbg(dev, "Inrerface for protocol %d restarted with
>> when bonded mode is %s\n",
>> +        mlx4_dbg(dev, "Interface for protocol %d restarted with when
>> bonded mode is %s\n",
> 
> Thanks Colin.
> I think there's one more thing to fix here.
> It is redundant to say "with when", it was probably done by mistake.
> Let's rephrase, maybe this way?
> 
> restarted with bonded mode %s

Sounds like a good idea, do you want me to send V2 of the patch with
this fix?

> 
>>                dev_ctx->intf->protocol, enable ?
>>                "enabled" : "disabled");
>>       }
>>
> 
> -- 
> To unsubscribe from this list: send the line "unsubscribe
> kernel-janitors" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] net/mlx4: fix spelling mistake: "Inrerface" -> "Interface"
From: Tariq Toukan @ 2018-05-22 15:25 UTC (permalink / raw)
  To: Colin Ian King, Tariq Toukan, David S . Miller, netdev,
	linux-rdma
  Cc: kernel-janitors, linux-kernel
In-Reply-To: <297abbee-1f5c-6159-e559-44d83ddfb7ff@canonical.com>



On 22/05/2018 6:23 PM, Colin Ian King wrote:
> On 22/05/18 16:21, Tariq Toukan wrote:
>>
>>
>> On 22/05/2018 11:37 AM, Colin King wrote:
>>> From: Colin Ian King <colin.king@canonical.com>
>>>
>>> Trivial fix to spelling mistake in mlx4_dbg debug message
>>>
>>> Signed-off-by: Colin Ian King <colin.king@canonical.com>
>>> ---
>>>    drivers/net/ethernet/mellanox/mlx4/intf.c | 2 +-
>>>    1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/net/ethernet/mellanox/mlx4/intf.c
>>> b/drivers/net/ethernet/mellanox/mlx4/intf.c
>>> index 2edcce98ab2d..6bd4103265d2 100644
>>> --- a/drivers/net/ethernet/mellanox/mlx4/intf.c
>>> +++ b/drivers/net/ethernet/mellanox/mlx4/intf.c
>>> @@ -172,7 +172,7 @@ int mlx4_do_bond(struct mlx4_dev *dev, bool enable)
>>>            list_add_tail(&dev_ctx->list, &priv->ctx_list);
>>>            spin_unlock_irqrestore(&priv->ctx_lock, flags);
>>>    -        mlx4_dbg(dev, "Inrerface for protocol %d restarted with
>>> when bonded mode is %s\n",
>>> +        mlx4_dbg(dev, "Interface for protocol %d restarted with when
>>> bonded mode is %s\n",
>>
>> Thanks Colin.
>> I think there's one more thing to fix here.
>> It is redundant to say "with when", it was probably done by mistake.
>> Let's rephrase, maybe this way?
>>
>> restarted with bonded mode %s
> 
> Sounds like a good idea, do you want me to send V2 of the patch with
> this fix?
> 

Yes please.

>>
>>>                 dev_ctx->intf->protocol, enable ?
>>>                 "enabled" : "disabled");
>>>        }
>>>
>>
>> -- 
>> To unsubscribe from this list: send the line "unsubscribe
>> kernel-janitors" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply

* Re: [PATCH net-next v11 2/5] netvsc: refactor notifier/event handling code to use the failover framework
From: Samudrala, Sridhar @ 2018-05-22 15:28 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: mst, stephen, davem, netdev, virtualization, virtio-dev,
	jesse.brandeburg, alexander.h.duyck, kubakici, jasowang,
	loseweigh, aaron.f.brown, anjali.singhai
In-Reply-To: <20180522090853.GF2149@nanopsycho>


On 5/22/2018 2:08 AM, Jiri Pirko wrote:
> Tue, May 22, 2018 at 11:06:37AM CEST, jiri@resnulli.us wrote:
>> Tue, May 22, 2018 at 04:06:18AM CEST, sridhar.samudrala@intel.com wrote:
>>> Use the registration/notification framework supported by the generic
>>> failover infrastructure.
>>>
>>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>> In previous patchset versions, the common code did
>> netdev_rx_handler_register() and netdev_upper_dev_link() etc
>> (netvsc_vf_join()). Now, this is still done in netvsc. Why?
>>
>> This should be part of the common "failover" code.

Based on Stephen's feedback on earlier patches, I tried to minimize the changes to
netvsc and only commonize the notifier and the main event handler routine.
Another complication is that netvsc does part of registration in a delayed workqueue.

It should be possible to move some of the code from net_failover.c to generic
failover.c in future if Stephen is ok with it.


>>
> Also note that in the current patchset you use IFF_FAILOVER flag for
> master, yet for the slave you use IFF_SLAVE. That is wrong.
> IFF_FAILOVER_SLAVE should be used.

Not sure which code you are referring to.  I only set IFF_FAILOVER_SLAVE
in patch 3.

^ permalink raw reply

* Re: [PATCH net-next v11 2/5] netvsc: refactor notifier/event handling code to use the failover framework
From: Michael S. Tsirkin @ 2018-05-22 15:32 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Sridhar Samudrala, stephen, davem, netdev, virtualization,
	virtio-dev, jesse.brandeburg, alexander.h.duyck, kubakici,
	jasowang, loseweigh, aaron.f.brown, anjali.singhai
In-Reply-To: <20180522151343.GJ2149@nanopsycho>

On Tue, May 22, 2018 at 05:13:43PM +0200, Jiri Pirko wrote:
> Tue, May 22, 2018 at 03:39:33PM CEST, mst@redhat.com wrote:
> >On Tue, May 22, 2018 at 03:26:26PM +0200, Jiri Pirko wrote:
> >> Tue, May 22, 2018 at 03:17:37PM CEST, mst@redhat.com wrote:
> >> >On Tue, May 22, 2018 at 03:14:22PM +0200, Jiri Pirko wrote:
> >> >> Tue, May 22, 2018 at 03:12:40PM CEST, mst@redhat.com wrote:
> >> >> >On Tue, May 22, 2018 at 11:08:53AM +0200, Jiri Pirko wrote:
> >> >> >> Tue, May 22, 2018 at 11:06:37AM CEST, jiri@resnulli.us wrote:
> >> >> >> >Tue, May 22, 2018 at 04:06:18AM CEST, sridhar.samudrala@intel.com wrote:
> >> >> >> >>Use the registration/notification framework supported by the generic
> >> >> >> >>failover infrastructure.
> >> >> >> >>
> >> >> >> >>Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
> >> >> >> >
> >> >> >> >In previous patchset versions, the common code did
> >> >> >> >netdev_rx_handler_register() and netdev_upper_dev_link() etc
> >> >> >> >(netvsc_vf_join()). Now, this is still done in netvsc. Why?
> >> >> >> >
> >> >> >> >This should be part of the common "failover" code.
> >> >> >> >
> >> >> >> 
> >> >> >> Also note that in the current patchset you use IFF_FAILOVER flag for
> >> >> >> master, yet for the slave you use IFF_SLAVE. That is wrong.
> >> >> >> IFF_FAILOVER_SLAVE should be used.
> >> >> >
> >> >> >Or drop IFF_FAILOVER_SLAVE and set both IFF_FAILOVER and IFF_SLAVE?
> >> >> 
> >> >> No. IFF_SLAVE is for bonding.
> >> >
> >> >What breaks if we reuse it for failover?
> >> 
> >> This is exposed to userspace. IFF_SLAVE is expected for bonding slaves.
> >> And failover slave is not a bonding slave.
> >
> >That does not really answer the question.  I'd claim it's sufficiently
> >like a bond slave for IFF_SLAVE to make sense.
> >
> >In fact you will find that netvsc already sets IFF_SLAVE, and so
> 
> netvsc does the whole failover thing in a wrong way. This patchset is
> trying to fix it.

Maybe, but we don't need gratuitous changes either, especially if they
break userspace.

> >does e.g. the eql driver.
> >
> >The advantage of using IFF_SLAVE is that userspace knows to skip it.  If
> 
> The userspace should know how to skip other types of slaves - team,
> bridge, ovs, etc.
> The "master link" should be the one to look at.
> 

How should existing userspace know which ones to skip and which one is
the master?  Right now userspace seems to assume whatever does not have
IFF_SLAVE should be looked at. Are you saying that's not the right thing
to do and userspace should be fixed? What should userspace do in
your opinion that will be forward compatible with future kernels?

> 
> >we don't set IFF_SLAVE existing userspace tries to use the lowerdev.
> 
> Each master type has a IFF_ master flag and IFF_ slave flag.

Could you give some examples please?

> In private
> flag. I don't see any reason to break this pattern here.

Other masters are set up from userspace; this one is set up automatically
by kernel. So the bar is higher, we need an interface that existing
userspace knows about.  We can't just say "oh if userspace set this up
it should know to skip lowerdevs".

Otherwise multiple interfaces with same mac tend to confuse userspace.

-- 
MST

^ permalink raw reply

* Re: [PATCH v3] mlx4_core: allocate ICM memory in page size chunks
From: Tariq Toukan @ 2018-05-22 15:33 UTC (permalink / raw)
  To: Qing Huang, Eric Dumazet, tariqt, davem, haakon.bugge, yanjun.zhu
  Cc: netdev, linux-rdma, linux-kernel, gi-oh.kim
In-Reply-To: <19b7818e-16f6-2349-dc34-245c2f215f6f@oracle.com>



On 18/05/2018 12:45 AM, Qing Huang wrote:
> 
> 
> On 5/17/2018 2:14 PM, Eric Dumazet wrote:
>> On 05/17/2018 01:53 PM, Qing Huang wrote:
>>> When a system is under memory pressure (high usage with fragments),
>>> the original 256KB ICM chunk allocations will likely trigger kernel
>>> memory management to enter slow path doing memory compact/migration
>>> ops in order to complete high order memory allocations.
>>>
>>> When that happens, user processes calling uverb APIs may get stuck
>>> for more than 120s easily even though there are a lot of free pages
>>> in smaller chunks available in the system.
>>>
>>> Syslog:
>>> ...
>>> Dec 10 09:04:51 slcc03db02 kernel: [397078.572732] INFO: task
>>> oracle_205573_e:205573 blocked for more than 120 seconds.
>>> ...
>>>
>> NACK on this patch.
>>
>> You have been asked repeatedly to use kvmalloc()
>>
>> This is not a minor suggestion.
>>
>> Take a look at
>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d8c13f2271ec5178c52fbde072ec7b562651ed9d
>>
> 
> Would you please take a look at how table->icm is being used in the mlx4 
> driver? It's a meta data used for individual pointer variable referencing,
> not as data frag or in/out buffer. It has no need for contiguous phy. 
> memory.
> 
> Thanks.
> 

NACK.

This would cause a degradation when iterating the entries of table->icm.
For example, in mlx4_table_get_range.

Thanks,
Tariq

>> And you'll understand some people care about this.
>>
>> Strongly.
>>
>> Thanks.
>>
> 

^ permalink raw reply

* [PATCH net-next v2 0/2] udp gso fixes
From: Willem de Bruijn @ 2018-05-22 15:34 UTC (permalink / raw)
  To: netdev; +Cc: davem, Willem de Bruijn

From: Willem de Bruijn <willemb@google.com>

A few small fixes:
- disallow segmentation with XFRM
- do not leak gso packets into the ingress path

Changes
  v1 -> v2
  - fix build failure in team.c
  - drop scatter-gather fix:
      this is now fixed by commit 113f99c33585 ("net: test tailroom
      before appending to linear skb"). After this patch gso skbs are
      built non-linear regardless of NETIF_F_SG and skb_segment builds
      linear segs.

Willem de Bruijn (2):
  udp: exclude gso from xfrm paths
  gso: limit udp gso to egress-only virtual devices

 drivers/net/bonding/bond_main.c | 5 +++--
 drivers/net/team/team.c         | 5 +++--
 include/linux/netdev_features.h | 1 -
 net/ipv4/udp.c                  | 3 ++-
 net/ipv6/udp.c                  | 3 ++-
 5 files changed, 10 insertions(+), 7 deletions(-)

-- 
2.17.0.441.gb46fe60e1d-goog

^ permalink raw reply

* [PATCH net-next v2 1/2] udp: exclude gso from xfrm paths
From: Willem de Bruijn @ 2018-05-22 15:34 UTC (permalink / raw)
  To: netdev; +Cc: davem, Willem de Bruijn, Michal Kubecek
In-Reply-To: <20180522153440.204128-1-willemdebruijn.kernel@gmail.com>

From: Willem de Bruijn <willemb@google.com>

UDP GSO delays final datagram construction to the GSO layer. This
conflicts with protocol transformations.

Fixes: bec1f6f69736 ("udp: generate gso with UDP_SEGMENT")
CC: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 net/ipv4/udp.c | 3 ++-
 net/ipv6/udp.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ff4d4ba67735..d71f1f3e1155 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -788,7 +788,8 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
 			return -EINVAL;
 		if (sk->sk_no_check_tx)
 			return -EINVAL;
-		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite)
+		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite ||
+		    dst_xfrm(skb_dst(skb)))
 			return -EIO;
 
 		skb_shinfo(skb)->gso_size = cork->gso_size;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 2839c1bd1e58..426c9d2b418d 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1053,7 +1053,8 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
 			return -EINVAL;
 		if (udp_sk(sk)->no_check6_tx)
 			return -EINVAL;
-		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite)
+		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite ||
+		    dst_xfrm(skb_dst(skb)))
 			return -EIO;
 
 		skb_shinfo(skb)->gso_size = cork->gso_size;
-- 
2.17.0.441.gb46fe60e1d-goog

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox