* [PATCH bpf-next 4/7] nfp: bpf: copy range info for all operands of all ALU operations
From: Jakub Kicinski @ 2018-06-25 3:54 UTC (permalink / raw)
To: alexei.starovoitov, daniel; +Cc: oss-drivers, netdev, Jiong Wang
In-Reply-To: <20180625035421.2991-1-jakub.kicinski@netronome.com>
From: Jiong Wang <jiong.wang@netronome.com>
NFP verifier hook is copying range information of the shift amount for
indirect shift operation so optimized shift sequences could be generated.
We want to use range info to do more things. For example, to decide whether
multiplication and divide are supported on the given range.
This patch simply lets the NFP verifier hook copy range info for all operands
of all ALU operations.
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
drivers/net/ethernet/netronome/nfp/bpf/main.h | 33 +++++++------------
.../net/ethernet/netronome/nfp/bpf/offload.c | 4 ++-
.../net/ethernet/netronome/nfp/bpf/verifier.c | 6 +++-
3 files changed, 20 insertions(+), 23 deletions(-)
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 5975a19c28cb..c985d0ac61a3 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -265,6 +265,8 @@ struct nfp_bpf_reg_state {
* @arg2: arg2 for call instructions
* @umin_src: copy of core verifier umin_value for src opearnd.
* @umax_src: copy of core verifier umax_value for src operand.
+ * @umin_dst: copy of core verifier umin_value for dst operand.
+ * @umax_dst: copy of core verifier umax_value for dst operand.
* @off: index of first generated machine instruction (in nfp_prog.prog)
* @n: eBPF instruction number
* @flags: eBPF instruction extra optimization flags
@@ -300,12 +302,15 @@ struct nfp_insn_meta {
struct bpf_reg_state arg1;
struct nfp_bpf_reg_state arg2;
};
- /* We are interested in range info for some operands,
- * for example, the shift amount which is kept in src operand.
+ /* We are interested in range info for operands of ALU
+ * operations. For example, shift amount, multiplicand and
+ * multiplier etc.
*/
struct {
u64 umin_src;
u64 umax_src;
+ u64 umin_dst;
+ u64 umax_dst;
};
};
unsigned int off;
@@ -339,6 +344,11 @@ static inline u8 mbpf_mode(const struct nfp_insn_meta *meta)
return BPF_MODE(meta->insn.code);
}
+static inline bool is_mbpf_alu(const struct nfp_insn_meta *meta)
+{
+ return mbpf_class(meta) == BPF_ALU64 || mbpf_class(meta) == BPF_ALU;
+}
+
static inline bool is_mbpf_load(const struct nfp_insn_meta *meta)
{
return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_LDX | BPF_MEM);
@@ -384,25 +394,6 @@ static inline bool is_mbpf_xadd(const struct nfp_insn_meta *meta)
return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_XADD);
}
-static inline bool is_mbpf_indir_shift(const struct nfp_insn_meta *meta)
-{
- u8 code = meta->insn.code;
- bool is_alu, is_shift;
- u8 opclass, opcode;
-
- opclass = BPF_CLASS(code);
- is_alu = opclass == BPF_ALU64 || opclass == BPF_ALU;
- if (!is_alu)
- return false;
-
- opcode = BPF_OP(code);
- is_shift = opcode == BPF_LSH || opcode == BPF_RSH || opcode == BPF_ARSH;
- if (!is_shift)
- return false;
-
- return BPF_SRC(code) == BPF_X;
-}
-
/**
* struct nfp_prog - nfp BPF program
* @bpf: backpointer to the bpf app priv structure
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index 856a0003bb75..78f44c4d95b4 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -190,8 +190,10 @@ nfp_prog_prepare(struct nfp_prog *nfp_prog, const struct bpf_insn *prog,
meta->insn = prog[i];
meta->n = i;
- if (is_mbpf_indir_shift(meta))
+ if (is_mbpf_alu(meta)) {
meta->umin_src = U64_MAX;
+ meta->umin_dst = U64_MAX;
+ }
list_add_tail(&meta->l, &nfp_prog->insns);
}
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index e862b739441f..7bd9666bd8ff 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -551,12 +551,16 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx)
if (is_mbpf_xadd(meta))
return nfp_bpf_check_xadd(nfp_prog, meta, env);
- if (is_mbpf_indir_shift(meta)) {
+ if (is_mbpf_alu(meta)) {
const struct bpf_reg_state *sreg =
cur_regs(env) + meta->insn.src_reg;
+ const struct bpf_reg_state *dreg =
+ cur_regs(env) + meta->insn.dst_reg;
meta->umin_src = min(meta->umin_src, sreg->umin_value);
meta->umax_src = max(meta->umax_src, sreg->umax_value);
+ meta->umin_dst = min(meta->umin_dst, dreg->umin_value);
+ meta->umax_dst = max(meta->umax_dst, dreg->umax_value);
}
return 0;
--
2.17.1
^ permalink raw reply related
* [PATCH bpf-next 3/7] nfp: bpf: rename umin/umax to umin_src/umax_src
From: Jakub Kicinski @ 2018-06-25 3:54 UTC (permalink / raw)
To: alexei.starovoitov, daniel; +Cc: oss-drivers, netdev, Jiong Wang
In-Reply-To: <20180625035421.2991-1-jakub.kicinski@netronome.com>
From: Jiong Wang <jiong.wang@netronome.com>
The two fields are a copy of umin and umax info of bpf_insn->src_reg
generated by verifier.
Rename to make their meaning clear.
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
drivers/net/ethernet/netronome/nfp/bpf/jit.c | 12 ++++++------
drivers/net/ethernet/netronome/nfp/bpf/main.h | 10 +++++-----
drivers/net/ethernet/netronome/nfp/bpf/offload.c | 2 +-
drivers/net/ethernet/netronome/nfp/bpf/verifier.c | 4 ++--
4 files changed, 14 insertions(+), 14 deletions(-)
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 33111739b210..4a629e9b5c0f 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -1772,8 +1772,8 @@ static int shl_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
u8 dst, src;
dst = insn->dst_reg * 2;
- umin = meta->umin;
- umax = meta->umax;
+ umin = meta->umin_src;
+ umax = meta->umax_src;
if (umin == umax)
return __shl_imm64(nfp_prog, dst, umin);
@@ -1881,8 +1881,8 @@ static int shr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
u8 dst, src;
dst = insn->dst_reg * 2;
- umin = meta->umin;
- umax = meta->umax;
+ umin = meta->umin_src;
+ umax = meta->umax_src;
if (umin == umax)
return __shr_imm64(nfp_prog, dst, umin);
@@ -1995,8 +1995,8 @@ static int ashr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
u8 dst, src;
dst = insn->dst_reg * 2;
- umin = meta->umin;
- umax = meta->umax;
+ umin = meta->umin_src;
+ umax = meta->umax_src;
if (umin == umax)
return __ashr_imm64(nfp_prog, dst, umin);
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 654fe7823e5e..5975a19c28cb 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -263,8 +263,8 @@ struct nfp_bpf_reg_state {
* @func_id: function id for call instructions
* @arg1: arg1 for call instructions
* @arg2: arg2 for call instructions
- * @umin: copy of core verifier umin_value.
- * @umax: copy of core verifier umax_value.
+ * @umin_src: copy of core verifier umin_value for src opearnd.
+ * @umax_src: copy of core verifier umax_value for src operand.
* @off: index of first generated machine instruction (in nfp_prog.prog)
* @n: eBPF instruction number
* @flags: eBPF instruction extra optimization flags
@@ -301,11 +301,11 @@ struct nfp_insn_meta {
struct nfp_bpf_reg_state arg2;
};
/* We are interested in range info for some operands,
- * for example, the shift amount.
+ * for example, the shift amount which is kept in src operand.
*/
struct {
- u64 umin;
- u64 umax;
+ u64 umin_src;
+ u64 umax_src;
};
};
unsigned int off;
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index 7eae4c0266f8..856a0003bb75 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -191,7 +191,7 @@ nfp_prog_prepare(struct nfp_prog *nfp_prog, const struct bpf_insn *prog,
meta->insn = prog[i];
meta->n = i;
if (is_mbpf_indir_shift(meta))
- meta->umin = U64_MAX;
+ meta->umin_src = U64_MAX;
list_add_tail(&meta->l, &nfp_prog->insns);
}
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index 4bfeba7b21b2..e862b739441f 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -555,8 +555,8 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx)
const struct bpf_reg_state *sreg =
cur_regs(env) + meta->insn.src_reg;
- meta->umin = min(meta->umin, sreg->umin_value);
- meta->umax = max(meta->umax, sreg->umax_value);
+ meta->umin_src = min(meta->umin_src, sreg->umin_value);
+ meta->umax_src = max(meta->umax_src, sreg->umax_value);
}
return 0;
--
2.17.1
^ permalink raw reply related
* [PATCH bpf-next 2/7] lib: reciprocal_div: implement the improved algorithm on the paper mentioned
From: Jakub Kicinski @ 2018-06-25 3:54 UTC (permalink / raw)
To: alexei.starovoitov, daniel; +Cc: oss-drivers, netdev, Jiong Wang
In-Reply-To: <20180625035421.2991-1-jakub.kicinski@netronome.com>
From: Jiong Wang <jiong.wang@netronome.com>
The newly added "reciprocal_value_adv" implements the advanced version of the
algorithm described in Figure 4.2 of the paper, except when the divisor has its
MSB set, which would require a u128 divide on the host and can easily be
handled before calling the new "reciprocal_value_adv".
The advanced version requires more complex calculation to get the
reciprocal multiplier and other control variables, but then could reduce
the required emulation operations.
It makes no sense to use this advanced version for host divide emulation:
the extra complexity of calculating the multiplier etc. could completely
cancel out our savings on emulation operations.
However, it makes sense to use it for JIT divide code generation (for
example eBPF JIT backends) for which we are willing to trade performance of
JITed code with that of host. As shown by the following pseudo code, the
required emulation operations could go down from 6 (the basic version) to 3
or 4.
To use the result of "reciprocal_value_adv", suppose we want to calculate
n/d, the C-style pseudo code will be the following, it could be easily
changed to real code generation for other JIT targets.
struct reciprocal_value_adv rvalue;
u8 pre_shift, exp;
if (d >= (1u << 31)) {
result = n >= d;
return;
}
rvalue = reciprocal_value_adv(d, 32)
exp = rvalue.exp;
if (rvalue.is_wide_m && !(d & 1)) {
pre_shift = fls(d & -d) - 1;
rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
} else {
pre_shift = 0;
}
// code generation starts.
if (imm == 1 << exp) {
result = n >> exp;
} else if (rvalue.is_wide_m) {
// pre_shift must be zero when reached here.
t = (n * rvalue.m) >> 32;
result = n - t;
result >>= 1;
result += t;
result >>= rvalue.sh - 1;
} else {
if (pre_shift)
result = n >> pre_shift;
result = ((u64)result * rvalue.m) >> 32;
result >>= rvalue.sh;
}
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
include/linux/reciprocal_div.h | 65 ++++++++++++++++++++++++++++++++++
lib/reciprocal_div.c | 37 +++++++++++++++++++
2 files changed, 102 insertions(+)
diff --git a/include/linux/reciprocal_div.h b/include/linux/reciprocal_div.h
index e031e9f2f9d8..5a695e4697d3 100644
--- a/include/linux/reciprocal_div.h
+++ b/include/linux/reciprocal_div.h
@@ -25,6 +25,9 @@ struct reciprocal_value {
u8 sh1, sh2;
};
+/* "reciprocal_value" and "reciprocal_divide" together implement the basic
+ * version of the algorithm described in Figure 4.1 of the paper.
+ */
struct reciprocal_value reciprocal_value(u32 d);
static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
@@ -33,4 +36,66 @@ static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
return (t + ((a - t) >> R.sh1)) >> R.sh2;
}
+struct reciprocal_value_adv {
+ u32 m;
+ u8 sh, exp;
+ bool is_wide_m;
+};
+
+/* "reciprocal_value_adv" implements the advanced version of the algorithm
+ * described in Figure 4.2 of the paper except when divisor has MSB set which
+ * would require u128 divide on host and actually could be easily handled before
+ * calling "reciprocal_value_adv".
+ *
+ * The advanced version requires more complex calculation to get the reciprocal
+ * multiplier and other control variables, but then could reduce the required
+ * emulation operations.
+ *
+ * It makes no sense to use this advanced version for host divide emulation,
+ * those extra complexities for calculating multiplier etc could completely
+ * waive our saving on emulation operations.
+ *
+ * However, it makes sense to use it for JIT divide code generation for which
+ * we are willing to trade performance of JITed code with that of host. As shown
+ * by the following pseudo code, the required emulation operations could go down
+ * from 6 (the basic version) to 3 or 4.
+ *
+ * To use the result of "reciprocal_value_adv", suppose we want to calculate
+ * n/d:
+ *
+ * struct reciprocal_value_adv rvalue;
+ * u8 pre_shift, exp;
+ *
+ * if (d >= (1u << 31)) {
+ * result = n >= d;
+ * return;
+ * }
+ * rvalue = reciprocal_value_adv(d, 32)
+ * exp = rvalue.exp;
+ * if (rvalue.is_wide_m && !(d & 1)) {
+ * pre_shift = fls(d & -d) - 1;
+ * rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
+ * } else {
+ * pre_shift = 0;
+ * }
+ *
+ * // code generation starts.
+ * if (imm == 1 << exp) {
+ * result = n >> exp;
+ * } else if (rvalue.is_wide_m) {
+ * // pre_shift must be zero when reached here.
+ * t = (n * rvalue.m) >> 32;
+ * result = n - t;
+ * result >>= 1;
+ * result += t;
+ * result >>= rvalue.sh - 1;
+ * } else {
+ * if (pre_shift)
+ * result = n >> pre_shift;
+ * result = ((u64)result * rvalue.m) >> 32;
+ * result >>= rvalue.sh;
+ * }
+ */
+struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec);
+
#endif /* _LINUX_RECIPROCAL_DIV_H */
diff --git a/lib/reciprocal_div.c b/lib/reciprocal_div.c
index fcb4ce682c6f..a41501ebad7c 100644
--- a/lib/reciprocal_div.c
+++ b/lib/reciprocal_div.c
@@ -26,3 +26,40 @@ struct reciprocal_value reciprocal_value(u32 d)
return R;
}
EXPORT_SYMBOL(reciprocal_value);
+
+struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec)
+{
+ struct reciprocal_value_adv R;
+ u32 l, post_shift;
+ u64 mhigh, mlow;
+
+ l = fls(d - 1);
+ post_shift = l;
+ /* NOTE: mlow/mhigh could overflow u64 when l == 32 which means d has
+ * MSB set. This case needs to be handled before calling
+ * "reciprocal_value_adv", please see the comment at
+ * include/linux/reciprocal_div.h.
+ */
+ mlow = 1ULL << (32 + l);
+ do_div(mlow, d);
+ mhigh = (1ULL << (32 + l)) + (1ULL << (32 + l - prec));
+ do_div(mhigh, d);
+
+ for (; post_shift > 0; post_shift--) {
+ u64 lo = mlow >> 1, hi = mhigh >> 1;
+
+ if (lo >= hi)
+ break;
+
+ mlow = lo;
+ mhigh = hi;
+ }
+
+ R.m = (u32)mhigh;
+ R.sh = post_shift;
+ R.exp = l;
+ R.is_wide_m = mhigh > U32_MAX;
+
+ return R;
+}
+EXPORT_SYMBOL(reciprocal_value_adv);
--
2.17.1
^ permalink raw reply related
* [PATCH bpf-next 1/7] nfp: bpf: allow source ptr type be map ptr in memcpy optimization
From: Jakub Kicinski @ 2018-06-25 3:54 UTC (permalink / raw)
To: alexei.starovoitov, daniel; +Cc: oss-drivers, netdev, Jiong Wang
In-Reply-To: <20180625035421.2991-1-jakub.kicinski@netronome.com>
From: Jiong Wang <jiong.wang@netronome.com>
Map read has been supported on NFP, this patch enables optimization for
memcpy from map to packet.
This patch also fixes a latent bug which would cause copying from an
unexpected address once memcpy for map pointers is enabled.
Reported-by: Mary Pham <mary.pham@netronome.com>
Reported-by: David Beckett <david.beckett@netronome.com>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
drivers/net/ethernet/netronome/nfp/bpf/jit.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 8a92088df0d7..33111739b210 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -670,7 +670,7 @@ static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
xfer_num = round_up(len, 4) / 4;
if (src_40bit_addr)
- addr40_offset(nfp_prog, meta->insn.src_reg, off, &src_base,
+ addr40_offset(nfp_prog, meta->insn.src_reg * 2, off, &src_base,
&off);
/* Setup PREV_ALU fields to override memory read length. */
@@ -3299,7 +3299,8 @@ curr_pair_is_memcpy(struct nfp_insn_meta *ld_meta,
if (!is_mbpf_load(ld_meta) || !is_mbpf_store(st_meta))
return false;
- if (ld_meta->ptr.type != PTR_TO_PACKET)
+ if (ld_meta->ptr.type != PTR_TO_PACKET &&
+ ld_meta->ptr.type != PTR_TO_MAP_VALUE)
return false;
if (st_meta->ptr.type != PTR_TO_PACKET)
--
2.17.1
^ permalink raw reply related
* [PATCH bpf-next 0/7] nfp: bpf: add multiplication, divide and memcpy from maps
From: Jakub Kicinski @ 2018-06-25 3:54 UTC (permalink / raw)
To: alexei.starovoitov, daniel; +Cc: oss-drivers, netdev, Jakub Kicinski
Hi!
This set enables memcpy optimization when the source is a map pointer.
The rest adds multiplication and divide support, which Jiong describes
as follows:
NFP supports u16 and u32 multiplication. Multiplication is done 8-bits per
step, therefore we need 2 steps for u16 and 4 steps for u32.
We also need one start instruction to initialize the sequence and one or
two instructions to fetch the result, depending on whether you need the high
half of the u32 multiplication.
For ALU64, if either operand is beyond u32's value range, we reject it. One
thing to note, if the source operand is BPF_K, then we need to check "imm"
field directly, and we'd reject it if it is negative. Because for ALU64,
"imm" (with s32 type) is expected to be sign extended to s64 which NFP mul
doesn't support. For ALU32, it is fine for "imm" to be negative though,
because the result is 32 bits and there is no difference in the low half
of the result for signed/unsigned mul, so we will get the correct result.
NFP doesn't have integer divide instruction, this patch set uses reciprocal
algorithm (the basic one, reciprocal_div) to emulate it.
For each u32 divide, we would need 11 instructions to finish the operation.
7 (for multiplication) + 4 (various ALUs) = 11
Given NFP only supports multiplication no bigger than u32, we'd require
divisor and dividend no bigger than that as well.
Also eBPF doesn't support signed divide and has enforced this on C language
level by failing compilation. However LLVM assembler hasn't enforced this,
so it is possible for negative constant to leak in as a BPF_K operand
through assembly code, we reject such cases as well.
Meanwhile reciprocal_div.h only implemented the basic version of:
"Division by Invariant Integers Using Multiplication"
- Torbjörn Granlund and Peter L. Montgomery
This patch set further implements the optimized version (Figure 4.2 in the
paper) inside the existing reciprocal_div.h. When the divisor is even and the
calculated reciprocal magic number doesn't fit in u32, we can reduce the
required ALU instructions from 4 to 2 or 1 in some cases.
The advanced version requires more complex calculation to get the
reciprocal multiplier and other control variables, but then could reduce
the required emulation operations. It makes sense to use it for JIT divide
code generation (for example eBPF JIT backends) for which we are willing to
trade performance of JITed code with that of host.
Jiong Wang (7):
nfp: bpf: allow source ptr type be map ptr in memcpy optimization
lib: reciprocal_div: implement the improved algorithm on the paper
mentioned
nfp: bpf: rename umin/umax to umin_src/umax_src
nfp: bpf: copy range info for all operands of all ALU operations
nfp: bpf: support u16 and u32 multiplications
nfp: bpf: support u32 divide using reciprocal_div.h
nfp: bpf: migrate to advanced reciprocal divide in reciprocal_div.h
drivers/net/ethernet/netronome/nfp/bpf/jit.c | 232 +++++++++++++++++-
drivers/net/ethernet/netronome/nfp/bpf/main.h | 43 ++--
.../net/ethernet/netronome/nfp/bpf/offload.c | 6 +-
.../net/ethernet/netronome/nfp/bpf/verifier.c | 95 ++++++-
drivers/net/ethernet/netronome/nfp/nfp_asm.h | 28 +++
include/linux/reciprocal_div.h | 65 +++++
lib/reciprocal_div.c | 37 +++
7 files changed, 467 insertions(+), 39 deletions(-)
--
2.17.1
^ permalink raw reply
* Re: [patch net-next 2/3] nfp: handle cls_flower command default case
From: Jakub Kicinski @ 2018-06-25 3:11 UTC (permalink / raw)
To: Jiri Pirko
Cc: netdev, davem, simon.horman, john.hurley, pieter.jansenvanvuuren,
oss-drivers, michael.chan, intel-wired-lan, mlxsw
In-Reply-To: <20180624083839.1692-3-jiri@resnulli.us>
On Sun, 24 Jun 2018 10:38:38 +0200, Jiri Pirko wrote:
> From: Jiri Pirko <jiri@mellanox.com>
>
> Currently the default case is not handled, which with future command
> introductions would introduce a warning. So handle it.
>
> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
^ permalink raw reply
* Re: [PATCH net-next] route: add support for directed broadcast forwarding
From: Xin Long @ 2018-06-25 2:47 UTC (permalink / raw)
To: network dev; +Cc: davem, David Ahern
In-Reply-To: <671e900d14a124f1de7785ee44c437d0826b9e2a.1529894708.git.lucien.xin@gmail.com>
[-- Attachment #1: Type: text/plain, Size: 5443 bytes --]
On Mon, Jun 25, 2018 at 10:45 AM, Xin Long <lucien.xin@gmail.com> wrote:
> This patch implements the feature described in rfc1812#section-5.3.5.2
> and rfc2644. It allows the router to forward directed broadcast when
> sysctl bc_forwarding is enabled.
>
> Note that this feature could be done by iptables -j TEE, but it would
> cause some problems:
> - target TEE's gateway param has to be set with a specific address,
> and it's not flexible especially when the route wants forward all
> directed broadcasts.
> - this duplicates the directed broadcasts so this may cause side
> effects to applications.
>
> Besides, to keep consistent with other os router like BSD, it's also
> necessary to implement it in the route rx path.
>
> Signed-off-by: Xin Long <lucien.xin@gmail.com>
> ---
> include/linux/inetdevice.h | 1 +
> include/uapi/linux/ip.h | 1 +
> include/uapi/linux/netconf.h | 1 +
> net/ipv4/devinet.c | 7 +++++++
> net/ipv4/route.c | 6 +++++-
> 5 files changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
> index 27650f1..c759d1c 100644
> --- a/include/linux/inetdevice.h
> +++ b/include/linux/inetdevice.h
> @@ -93,6 +93,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
>
> #define IN_DEV_FORWARD(in_dev) IN_DEV_CONF_GET((in_dev), FORWARDING)
> #define IN_DEV_MFORWARD(in_dev) IN_DEV_ANDCONF((in_dev), MC_FORWARDING)
> +#define IN_DEV_BFORWARD(in_dev) IN_DEV_ANDCONF((in_dev), BC_FORWARDING)
> #define IN_DEV_RPFILTER(in_dev) IN_DEV_MAXCONF((in_dev), RP_FILTER)
> #define IN_DEV_SRC_VMARK(in_dev) IN_DEV_ORCONF((in_dev), SRC_VMARK)
> #define IN_DEV_SOURCE_ROUTE(in_dev) IN_DEV_ANDCONF((in_dev), \
> diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h
> index b24a742..2b756b5 100644
> --- a/include/uapi/linux/ip.h
> +++ b/include/uapi/linux/ip.h
> @@ -139,6 +139,7 @@ enum
> {
> IPV4_DEVCONF_FORWARDING=1,
> IPV4_DEVCONF_MC_FORWARDING,
> + IPV4_DEVCONF_BC_FORWARDING,
> IPV4_DEVCONF_PROXY_ARP,
> IPV4_DEVCONF_ACCEPT_REDIRECTS,
> IPV4_DEVCONF_SECURE_REDIRECTS,
> diff --git a/include/uapi/linux/netconf.h b/include/uapi/linux/netconf.h
> index c84fcdf..a5cd70e 100644
> --- a/include/uapi/linux/netconf.h
> +++ b/include/uapi/linux/netconf.h
> @@ -15,6 +15,7 @@ enum {
> NETCONFA_FORWARDING,
> NETCONFA_RP_FILTER,
> NETCONFA_MC_FORWARDING,
> + NETCONFA_BC_FORWARDING,
> NETCONFA_PROXY_NEIGH,
> NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
> NETCONFA_INPUT,
> diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
> index d7585ab..ea30ab6 100644
> --- a/net/ipv4/devinet.c
> +++ b/net/ipv4/devinet.c
> @@ -1827,6 +1827,8 @@ static int inet_netconf_msgsize_devconf(int type)
> size += nla_total_size(4);
> if (all || type == NETCONFA_MC_FORWARDING)
> size += nla_total_size(4);
> + if (all || type == NETCONFA_BC_FORWARDING)
> + size += nla_total_size(4);
> if (all || type == NETCONFA_PROXY_NEIGH)
> size += nla_total_size(4);
> if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
> @@ -1873,6 +1875,10 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
> nla_put_s32(skb, NETCONFA_MC_FORWARDING,
> IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
> goto nla_put_failure;
> + if ((all || type == NETCONFA_BC_FORWARDING) &&
> + nla_put_s32(skb, NETCONFA_BC_FORWARDING,
> + IPV4_DEVCONF(*devconf, BC_FORWARDING)) < 0)
> + goto nla_put_failure;
> if ((all || type == NETCONFA_PROXY_NEIGH) &&
> nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
> IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
> @@ -2259,6 +2265,7 @@ static struct devinet_sysctl_table {
> DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
> devinet_sysctl_forward),
> DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
> + DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"),
>
> DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
> DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 1df6e97..b678466 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -1996,8 +1996,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
> goto no_route;
> }
>
> - if (res->type == RTN_BROADCAST)
> + if (res->type == RTN_BROADCAST) {
> + if (IN_DEV_BFORWARD(in_dev))
> + goto make_route;
> goto brd_input;
> + }
>
> if (res->type == RTN_LOCAL) {
> err = fib_validate_source(skb, saddr, daddr, tos,
> @@ -2014,6 +2017,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
> if (res->type != RTN_UNICAST)
> goto martian_destination;
>
> +make_route:
> err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
> out: return err;
>
> --
> 2.1.0
>
Attached are some testing scripts.
[-- Attachment #2: bc_fwd.sh --]
[-- Type: text/x-sh, Size: 2555 bytes --]
#!/bin/bash
# TOPO:
# host1 172.16.1.1/24 <-> .254/24 RTR .254/24 <-> 192.168.1.1/24 host2
# <-> .253/24 RTR2 .254/24 <-> 192.168.2.1/24 host3
netns_list="host1 host2 host3 RTR RTR2"
for i in $netns_list; do
ip netns del $i > /dev/null 2>&1
done
for i in $netns_list; do
ip netns add $i
done
ip link add host1_eth1 type veth peer name RTR_eth1
ip link add host1_eth2 type veth peer name RTR2_eth1
ip link add host2_eth1 type veth peer name RTR_eth2
ip link add host3_eth1 type veth peer name RTR2_eth2
ip link set RTR_eth1 netns RTR
ip link set RTR2_eth1 netns RTR2
ip link set RTR_eth2 netns RTR
ip link set RTR2_eth2 netns RTR2
ip link set host1_eth1 netns host1
ip link set host1_eth2 netns host1
ip link set host2_eth1 netns host2
ip link set host3_eth1 netns host3
ip netns exec host1 brctl addbr host1_br0
ip netns exec host1 brctl addif host1_br0 host1_eth1
ip netns exec host1 brctl addif host1_br0 host1_eth2
ip netns exec host1 ifconfig host1_br0 172.16.1.1/24 up
ip netns exec host1 ip link set host1_eth1 up
ip netns exec host1 ip link set host1_eth2 up
ip netns exec RTR ifconfig RTR_eth1 172.16.1.254/24 up
ip netns exec RTR2 ifconfig RTR2_eth1 172.16.1.253/24 up
ip netns exec RTR ifconfig RTR_eth2 192.168.1.254/24 up
ip netns exec RTR2 ifconfig RTR2_eth2 192.168.2.254/24 up
ip netns exec RTR sysctl -w net.ipv4.conf.all.forwarding=1
ip netns exec RTR2 sysctl -w net.ipv4.conf.all.forwarding=1
ip netns exec RTR sysctl -w net.ipv4.conf.RTR_eth1.bc_forwarding=1
ip netns exec RTR2 sysctl -w net.ipv4.conf.RTR2_eth1.bc_forwarding=1
ip netns exec RTR sysctl -w net.ipv4.conf.all.bc_forwarding=1
ip netns exec RTR2 sysctl -w net.ipv4.conf.all.bc_forwarding=1
ip netns exec host2 ifconfig host2_eth1 192.168.1.1/24 up
ip netns exec host3 ifconfig host3_eth1 192.168.2.1/24 up
ip netns exec host1 ip route add default nexthop via 172.16.1.254 nexthop via 172.16.1.253
ip netns exec host2 ip route add default via 192.168.1.254
ip netns exec host3 ip route add default via 192.168.2.254
ip netns exec host2 sysctl -w net.ipv4.icmp_echo_ignore_broadcasts=0
ip netns exec host3 sysctl -w net.ipv4.icmp_echo_ignore_broadcasts=0
ip netns exec RTR sysctl -w net.ipv4.icmp_echo_ignore_broadcasts=0
ip netns exec RTR2 sysctl -w net.ipv4.icmp_echo_ignore_broadcasts=0
ip netns exec host1 ping 192.168.2.255 -c 1
# ip netns exec host3 tcpdump -i host3_eth1 -p icmp -nn
# ip netns exec host1 ping 192.168.1.255 -c 1
# ip netns exec host2 tcpdump -i host2_eth1 -p icmp -nn
# ip netns exec host1 ping 255.255.255.255 -c 1
^ permalink raw reply
* [PATCH net-next] route: add support for directed broadcast forwarding
From: Xin Long @ 2018-06-25 2:45 UTC (permalink / raw)
To: network dev; +Cc: davem, David Ahern
This patch implements the feature described in rfc1812#section-5.3.5.2
and rfc2644. It allows the router to forward directed broadcast when
sysctl bc_forwarding is enabled.
Note that this feature could be done by iptables -j TEE, but it would
cause some problems:
- target TEE's gateway param has to be set with a specific address,
and it's not flexible, especially when the router wants to forward all
directed broadcasts.
- this duplicates the directed broadcasts so this may cause side
effects to applications.
Besides, to keep consistent with other os router like BSD, it's also
necessary to implement it in the route rx path.
Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
include/linux/inetdevice.h | 1 +
include/uapi/linux/ip.h | 1 +
include/uapi/linux/netconf.h | 1 +
net/ipv4/devinet.c | 7 +++++++
net/ipv4/route.c | 6 +++++-
5 files changed, 15 insertions(+), 1 deletion(-)
diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 27650f1..c759d1c 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -93,6 +93,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
#define IN_DEV_FORWARD(in_dev) IN_DEV_CONF_GET((in_dev), FORWARDING)
#define IN_DEV_MFORWARD(in_dev) IN_DEV_ANDCONF((in_dev), MC_FORWARDING)
+#define IN_DEV_BFORWARD(in_dev) IN_DEV_ANDCONF((in_dev), BC_FORWARDING)
#define IN_DEV_RPFILTER(in_dev) IN_DEV_MAXCONF((in_dev), RP_FILTER)
#define IN_DEV_SRC_VMARK(in_dev) IN_DEV_ORCONF((in_dev), SRC_VMARK)
#define IN_DEV_SOURCE_ROUTE(in_dev) IN_DEV_ANDCONF((in_dev), \
diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h
index b24a742..2b756b5 100644
--- a/include/uapi/linux/ip.h
+++ b/include/uapi/linux/ip.h
@@ -139,6 +139,7 @@ enum
{
IPV4_DEVCONF_FORWARDING=1,
IPV4_DEVCONF_MC_FORWARDING,
+ IPV4_DEVCONF_BC_FORWARDING,
IPV4_DEVCONF_PROXY_ARP,
IPV4_DEVCONF_ACCEPT_REDIRECTS,
IPV4_DEVCONF_SECURE_REDIRECTS,
diff --git a/include/uapi/linux/netconf.h b/include/uapi/linux/netconf.h
index c84fcdf..a5cd70e 100644
--- a/include/uapi/linux/netconf.h
+++ b/include/uapi/linux/netconf.h
@@ -15,6 +15,7 @@ enum {
NETCONFA_FORWARDING,
NETCONFA_RP_FILTER,
NETCONFA_MC_FORWARDING,
+ NETCONFA_BC_FORWARDING,
NETCONFA_PROXY_NEIGH,
NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
NETCONFA_INPUT,
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index d7585ab..ea30ab6 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1827,6 +1827,8 @@ static int inet_netconf_msgsize_devconf(int type)
size += nla_total_size(4);
if (all || type == NETCONFA_MC_FORWARDING)
size += nla_total_size(4);
+ if (all || type == NETCONFA_BC_FORWARDING)
+ size += nla_total_size(4);
if (all || type == NETCONFA_PROXY_NEIGH)
size += nla_total_size(4);
if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
@@ -1873,6 +1875,10 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
nla_put_s32(skb, NETCONFA_MC_FORWARDING,
IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
goto nla_put_failure;
+ if ((all || type == NETCONFA_BC_FORWARDING) &&
+ nla_put_s32(skb, NETCONFA_BC_FORWARDING,
+ IPV4_DEVCONF(*devconf, BC_FORWARDING)) < 0)
+ goto nla_put_failure;
if ((all || type == NETCONFA_PROXY_NEIGH) &&
nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
@@ -2259,6 +2265,7 @@ static struct devinet_sysctl_table {
DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
devinet_sysctl_forward),
DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
+ DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"),
DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1df6e97..b678466 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1996,8 +1996,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
goto no_route;
}
- if (res->type == RTN_BROADCAST)
+ if (res->type == RTN_BROADCAST) {
+ if (IN_DEV_BFORWARD(in_dev))
+ goto make_route;
goto brd_input;
+ }
if (res->type == RTN_LOCAL) {
err = fib_validate_source(skb, saddr, daddr, tos,
@@ -2014,6 +2017,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (res->type != RTN_UNICAST)
goto martian_destination;
+make_route:
err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
out: return err;
--
2.1.0
^ permalink raw reply related
* [PATCH net-next 5/5] sctp: check for ipv6_pinfo legal sndflow with flowlabel in sctp_v6_get_dst
From: Xin Long @ 2018-06-25 2:14 UTC (permalink / raw)
To: network dev, linux-sctp; +Cc: Marcelo Ricardo Leitner, Neil Horman, davem
In-Reply-To: <cover.1529892764.git.lucien.xin@gmail.com>
A transport with an illegal flowlabel should not be allowed to send
packets. Other transport protocols already deny this.
Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
net/sctp/ipv6.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 772513d..d83ddc4 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -262,6 +262,15 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
if (t->flowlabel & SCTP_FLOWLABEL_SET_MASK)
fl6->flowlabel = htonl(t->flowlabel & SCTP_FLOWLABEL_VAL_MASK);
+ if (np->sndflow && (fl6->flowlabel & IPV6_FLOWLABEL_MASK)) {
+ struct ip6_flowlabel *flowlabel;
+
+ flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
+ if (!flowlabel)
+ goto out;
+ fl6_sock_release(flowlabel);
+ }
+
pr_debug("%s: dst=%pI6 ", __func__, &fl6->daddr);
if (asoc)
--
2.1.0
^ permalink raw reply related
* [PATCH net-next 4/5] sctp: add support for setting flowlabel when adding a transport
From: Xin Long @ 2018-06-25 2:14 UTC (permalink / raw)
To: network dev, linux-sctp; +Cc: Marcelo Ricardo Leitner, Neil Horman, davem
In-Reply-To: <cover.1529892764.git.lucien.xin@gmail.com>
Struct sockaddr_in6 has the member sin6_flowinfo that includes the
ipv6 flowlabel, so sctp should also support setting the flowlabel when
adding a transport whose ipaddr comes from userspace.
Note that addrinfo in sctp_sendmsg is using struct in6_addr for
the secondary addrs, which doesn't contain sin6_flowinfo, and
it needs to copy sin6_flowinfo from the primary addr.
Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
net/sctp/associola.c | 12 ++++++++++--
net/sctp/socket.c | 5 +++++
2 files changed, 15 insertions(+), 2 deletions(-)
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 16ecfbc..297d9cf 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -650,8 +650,16 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
peer->sackdelay = asoc->sackdelay;
peer->sackfreq = asoc->sackfreq;
- if (addr->sa.sa_family == AF_INET6)
- peer->flowlabel = asoc->flowlabel;
+ if (addr->sa.sa_family == AF_INET6) {
+ __be32 info = addr->v6.sin6_flowinfo;
+
+ if (info) {
+ peer->flowlabel = ntohl(info & IPV6_FLOWLABEL_MASK);
+ peer->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+ } else {
+ peer->flowlabel = asoc->flowlabel;
+ }
+ }
peer->dscp = asoc->dscp;
/* Enable/disable heartbeat, SACK delay, and path MTU discovery
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 857de62..1df5d07 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1697,6 +1697,7 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
struct sctp_association *asoc;
enum sctp_scope scope;
struct cmsghdr *cmsg;
+ __be32 flowinfo = 0;
struct sctp_af *af;
int err;
@@ -1781,6 +1782,9 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
if (!cmsgs->addrs_msg)
return 0;
+ if (daddr->sa.sa_family == AF_INET6)
+ flowinfo = daddr->v6.sin6_flowinfo;
+
/* sendv addr list parse */
for_each_cmsghdr(cmsg, cmsgs->addrs_msg) {
struct sctp_transport *transport;
@@ -1813,6 +1817,7 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
}
dlen = sizeof(struct in6_addr);
+ daddr->v6.sin6_flowinfo = flowinfo;
daddr->v6.sin6_family = AF_INET6;
daddr->v6.sin6_port = htons(asoc->peer.port);
memcpy(&daddr->v6.sin6_addr, CMSG_DATA(cmsg), dlen);
--
2.1.0
^ permalink raw reply related
* [PATCH net-next 3/5] sctp: add spp_ipv6_flowlabel and spp_dscp for sctp_paddrparams
From: Xin Long @ 2018-06-25 2:14 UTC (permalink / raw)
To: network dev, linux-sctp; +Cc: Marcelo Ricardo Leitner, Neil Horman, davem
In-Reply-To: <cover.1529892764.git.lucien.xin@gmail.com>
spp_ipv6_flowlabel and spp_dscp are added in sctp_paddrparams in
this patch so that users could set sctp_sock/asoc/transport dscp
and flowlabel with spp_flags SPP_IPV6_FLOWLABEL or SPP_DSCP by
SCTP_PEER_ADDR_PARAMS, as described in section 8.1.12 of RFC6458.
As said in the last patch, it uses '| 0x100000' or '| 0x1' to mark
that flowlabel or dscp is set, so that their values could be set
to 0.
Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
include/uapi/linux/sctp.h | 4 ++
net/sctp/socket.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 156 insertions(+)
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index c02986a..b479db5 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -763,6 +763,8 @@ enum sctp_spp_flags {
SPP_SACKDELAY_DISABLE = 1<<6, /*Disable SACK*/
SPP_SACKDELAY = SPP_SACKDELAY_ENABLE | SPP_SACKDELAY_DISABLE,
SPP_HB_TIME_IS_ZERO = 1<<7, /* Set HB delay to 0 */
+ SPP_IPV6_FLOWLABEL = 1<<8,
+ SPP_DSCP = 1<<9,
};
struct sctp_paddrparams {
@@ -773,6 +775,8 @@ struct sctp_paddrparams {
__u32 spp_pathmtu;
__u32 spp_sackdelay;
__u32 spp_flags;
+ __u32 spp_ipv6_flowlabel;
+ __u8 spp_dscp;
} __attribute__((packed, aligned(4)));
/*
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index bf11f9c..857de62 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2393,6 +2393,8 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval,
* uint32_t spp_pathmtu;
* uint32_t spp_sackdelay;
* uint32_t spp_flags;
+ * uint32_t spp_ipv6_flowlabel;
+ * uint8_t spp_dscp;
* };
*
* spp_assoc_id - (one-to-many style socket) This is filled in the
@@ -2472,6 +2474,45 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval,
* also that this field is mutually exclusive to
* SPP_SACKDELAY_ENABLE, setting both will have undefined
* results.
+ *
+ * SPP_IPV6_FLOWLABEL: Setting this flag enables the
+ * setting of the IPV6 flow label value. The value is
+ * contained in the spp_ipv6_flowlabel field.
+ * Upon retrieval, this flag will be set to indicate that
+ * the spp_ipv6_flowlabel field has a valid value returned.
+ * If a specific destination address is set (in the
+ * spp_address field), then the value returned is that of
+ * the address. If just an association is specified (and
+ * no address), then the association's default flow label
+ * is returned. If neither an association nor a destination
+ * is specified, then the socket's default flow label is
+ * returned. For non-IPv6 sockets, this flag will be left
+ * cleared.
+ *
+ * SPP_DSCP: Setting this flag enables the setting of the
+ * Differentiated Services Code Point (DSCP) value
+ * associated with either the association or a specific
+ * address. The value is obtained in the spp_dscp field.
+ * Upon retrieval, this flag will be set to indicate that
+ * the spp_dscp field has a valid value returned. If a
+ * specific destination address is set when called (in the
+ * spp_address field), then that specific destination
+ * address's DSCP value is returned. If just an association
+ * is specified, then the association's default DSCP is
+ * returned. If neither an association nor a destination is
+ * specified, then the socket's default DSCP is returned.
+ *
+ * spp_ipv6_flowlabel
+ * - This field is used in conjunction with the
+ * SPP_IPV6_FLOWLABEL flag and contains the IPv6 flow label.
+ * The 20 least significant bits are used for the flow
+ * label. This setting has precedence over any IPv6-layer
+ * setting.
+ *
+ * spp_dscp - This field is used in conjunction with the SPP_DSCP flag
+ * and contains the DSCP. The 6 most significant bits are
+ * used for the DSCP. This setting has precedence over any
+ * IPv4- or IPv6- layer setting.
*/
static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
struct sctp_transport *trans,
@@ -2611,6 +2652,51 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
}
}
+ if (params->spp_flags & SPP_IPV6_FLOWLABEL) {
+ if (trans && trans->ipaddr.sa.sa_family == AF_INET6) {
+ trans->flowlabel = params->spp_ipv6_flowlabel &
+ SCTP_FLOWLABEL_VAL_MASK;
+ trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+ } else if (asoc) {
+ list_for_each_entry(trans,
+ &asoc->peer.transport_addr_list,
+ transports) {
+ if (trans->ipaddr.sa.sa_family != AF_INET6)
+ continue;
+ trans->flowlabel = params->spp_ipv6_flowlabel &
+ SCTP_FLOWLABEL_VAL_MASK;
+ trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+ }
+ asoc->flowlabel = params->spp_ipv6_flowlabel &
+ SCTP_FLOWLABEL_VAL_MASK;
+ asoc->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+ } else if (sctp_opt2sk(sp)->sk_family == AF_INET6) {
+ sp->flowlabel = params->spp_ipv6_flowlabel &
+ SCTP_FLOWLABEL_VAL_MASK;
+ sp->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+ }
+ }
+
+ if (params->spp_flags & SPP_DSCP) {
+ if (trans) {
+ trans->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK;
+ trans->dscp |= SCTP_DSCP_SET_MASK;
+ } else if (asoc) {
+ list_for_each_entry(trans,
+ &asoc->peer.transport_addr_list,
+ transports) {
+ trans->dscp = params->spp_dscp &
+ SCTP_DSCP_VAL_MASK;
+ trans->dscp |= SCTP_DSCP_SET_MASK;
+ }
+ asoc->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK;
+ asoc->dscp |= SCTP_DSCP_SET_MASK;
+ } else {
+ sp->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK;
+ sp->dscp |= SCTP_DSCP_SET_MASK;
+ }
+ }
+
return 0;
}
@@ -5453,6 +5539,45 @@ static int sctp_getsockopt_peeloff_flags(struct sock *sk, int len,
* also that this field is mutually exclusive to
* SPP_SACKDELAY_ENABLE, setting both will have undefined
* results.
+ *
+ * SPP_IPV6_FLOWLABEL: Setting this flag enables the
+ * setting of the IPV6 flow label value. The value is
+ * contained in the spp_ipv6_flowlabel field.
+ * Upon retrieval, this flag will be set to indicate that
+ * the spp_ipv6_flowlabel field has a valid value returned.
+ * If a specific destination address is set (in the
+ * spp_address field), then the value returned is that of
+ * the address. If just an association is specified (and
+ * no address), then the association's default flow label
+ * is returned. If neither an association nor a destination
+ * is specified, then the socket's default flow label is
+ * returned. For non-IPv6 sockets, this flag will be left
+ * cleared.
+ *
+ * SPP_DSCP: Setting this flag enables the setting of the
+ * Differentiated Services Code Point (DSCP) value
+ * associated with either the association or a specific
+ * address. The value is obtained in the spp_dscp field.
+ * Upon retrieval, this flag will be set to indicate that
+ * the spp_dscp field has a valid value returned. If a
+ * specific destination address is set when called (in the
+ * spp_address field), then that specific destination
+ * address's DSCP value is returned. If just an association
+ * is specified, then the association's default DSCP is
+ * returned. If neither an association nor a destination is
+ * specified, then the socket's default DSCP is returned.
+ *
+ * spp_ipv6_flowlabel
+ * - This field is used in conjunction with the
+ * SPP_IPV6_FLOWLABEL flag and contains the IPv6 flow label.
+ * The 20 least significant bits are used for the flow
+ * label. This setting has precedence over any IPv6-layer
+ * setting.
+ *
+ * spp_dscp - This field is used in conjunction with the SPP_DSCP flag
+ * and contains the DSCP. The 6 most significant bits are
+ * used for the DSCP. This setting has precedence over any
+ * IPv4- or IPv6- layer setting.
*/
static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
char __user *optval, int __user *optlen)
@@ -5499,6 +5624,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
/*draft-11 doesn't say what to return in spp_flags*/
params.spp_flags = trans->param_flags;
+ if (trans->flowlabel & SCTP_FLOWLABEL_SET_MASK) {
+ params.spp_ipv6_flowlabel = trans->flowlabel &
+ SCTP_FLOWLABEL_VAL_MASK;
+ params.spp_flags |= SPP_IPV6_FLOWLABEL;
+ }
+ if (trans->dscp & SCTP_DSCP_SET_MASK) {
+ params.spp_dscp = trans->dscp & SCTP_DSCP_VAL_MASK;
+ params.spp_flags |= SPP_DSCP;
+ }
} else if (asoc) {
/* Fetch association values. */
params.spp_hbinterval = jiffies_to_msecs(asoc->hbinterval);
@@ -5508,6 +5642,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
/*draft-11 doesn't say what to return in spp_flags*/
params.spp_flags = asoc->param_flags;
+ if (asoc->flowlabel & SCTP_FLOWLABEL_SET_MASK) {
+ params.spp_ipv6_flowlabel = asoc->flowlabel &
+ SCTP_FLOWLABEL_VAL_MASK;
+ params.spp_flags |= SPP_IPV6_FLOWLABEL;
+ }
+ if (asoc->dscp & SCTP_DSCP_SET_MASK) {
+ params.spp_dscp = asoc->dscp & SCTP_DSCP_VAL_MASK;
+ params.spp_flags |= SPP_DSCP;
+ }
} else {
/* Fetch socket values. */
params.spp_hbinterval = sp->hbinterval;
@@ -5517,6 +5660,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
/*draft-11 doesn't say what to return in spp_flags*/
params.spp_flags = sp->param_flags;
+ if (sp->flowlabel & SCTP_FLOWLABEL_SET_MASK) {
+ params.spp_ipv6_flowlabel = sp->flowlabel &
+ SCTP_FLOWLABEL_VAL_MASK;
+ params.spp_flags |= SPP_IPV6_FLOWLABEL;
+ }
+ if (sp->dscp & SCTP_DSCP_SET_MASK) {
+ params.spp_dscp = sp->dscp & SCTP_DSCP_VAL_MASK;
+ params.spp_flags |= SPP_DSCP;
+ }
}
if (copy_to_user(optval, ¶ms, len))
--
2.1.0
^ permalink raw reply related
* [PATCH net-next 2/5] sctp: add support for dscp and flowlabel per transport
From: Xin Long @ 2018-06-25 2:14 UTC (permalink / raw)
To: network dev, linux-sctp; +Cc: Marcelo Ricardo Leitner, Neil Horman, davem
In-Reply-To: <cover.1529892764.git.lucien.xin@gmail.com>
Like some other per transport params, flowlabel and dscp are added
in transport, asoc and sctp_sock. By default, transport sets its
value from asoc's, and asoc does it from sctp_sock. flowlabel
only works for ipv6 transport.
Besides being passed down in sctp_xmit, they also need to be set in
flow4/6 before looking up the route in get_dst.
Note that it uses '& 0x100000' to check if flowlabel is set and
'& 0x1' (tos 1st bit is unused) to check if dscp is set by users,
so that they could be set to 0 by sockopt in next patch.
Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
include/linux/sctp.h | 7 +++++++
include/net/sctp/structs.h | 9 +++++++++
net/sctp/associola.c | 7 +++++++
net/sctp/ipv6.c | 11 +++++++++--
net/sctp/protocol.c | 16 ++++++++++++----
5 files changed, 44 insertions(+), 6 deletions(-)
diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index b36c766..83d9434 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -801,4 +801,11 @@ struct sctp_strreset_resptsn {
__be32 receivers_next_tsn;
};
+enum {
+ SCTP_DSCP_SET_MASK = 0x1,
+ SCTP_DSCP_VAL_MASK = 0xfc,
+ SCTP_FLOWLABEL_SET_MASK = 0x100000,
+ SCTP_FLOWLABEL_VAL_MASK = 0xfffff
+};
+
#endif /* __LINUX_SCTP_H__ */
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 701a517..ab869e0 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -193,6 +193,9 @@ struct sctp_sock {
/* This is the max_retrans value for new associations. */
__u16 pathmaxrxt;
+ __u32 flowlabel;
+ __u8 dscp;
+
/* The initial Path MTU to use for new associations. */
__u32 pathmtu;
@@ -895,6 +898,9 @@ struct sctp_transport {
*/
__u16 pathmaxrxt;
+ __u32 flowlabel;
+ __u8 dscp;
+
/* This is the partially failed retrans value for the transport
* and will be initialized from the assocs value. This can be changed
* using the SCTP_PEER_ADDR_THLDS socket option
@@ -1772,6 +1778,9 @@ struct sctp_association {
*/
__u16 pathmaxrxt;
+ __u32 flowlabel;
+ __u8 dscp;
+
/* Flag that path mtu update is pending */
__u8 pmtu_pending;
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 5d5a162..16ecfbc 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -115,6 +115,9 @@ static struct sctp_association *sctp_association_init(
/* Initialize path max retrans value. */
asoc->pathmaxrxt = sp->pathmaxrxt;
+ asoc->flowlabel = sp->flowlabel;
+ asoc->dscp = sp->dscp;
+
/* Initialize default path MTU. */
asoc->pathmtu = sp->pathmtu;
@@ -647,6 +650,10 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
peer->sackdelay = asoc->sackdelay;
peer->sackfreq = asoc->sackfreq;
+ if (addr->sa.sa_family == AF_INET6)
+ peer->flowlabel = asoc->flowlabel;
+ peer->dscp = asoc->dscp;
+
/* Enable/disable heartbeat, SACK delay, and path MTU discovery
* based on association setting.
*/
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 7339918..772513d 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -209,12 +209,17 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
struct sock *sk = skb->sk;
struct ipv6_pinfo *np = inet6_sk(sk);
struct flowi6 *fl6 = &transport->fl.u.ip6;
+ __u8 tclass = np->tclass;
int res;
pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb,
skb->len, &fl6->saddr, &fl6->daddr);
- IP6_ECN_flow_xmit(sk, fl6->flowlabel);
+ if (transport->dscp & SCTP_DSCP_SET_MASK)
+ tclass = transport->dscp & SCTP_DSCP_VAL_MASK;
+
+ if (INET_ECN_is_capable(tclass))
+ IP6_ECN_flow_xmit(sk, fl6->flowlabel);
if (!(transport->param_flags & SPP_PMTUD_ENABLE))
skb->ignore_df = 1;
@@ -223,7 +228,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
rcu_read_lock();
res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt),
- np->tclass);
+ tclass);
rcu_read_unlock();
return res;
}
@@ -254,6 +259,8 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
fl6->flowi6_oif = daddr->v6.sin6_scope_id;
else if (asoc)
fl6->flowi6_oif = asoc->base.sk->sk_bound_dev_if;
+ if (t->flowlabel & SCTP_FLOWLABEL_SET_MASK)
+ fl6->flowlabel = htonl(t->flowlabel & SCTP_FLOWLABEL_VAL_MASK);
pr_debug("%s: dst=%pI6 ", __func__, &fl6->daddr);
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 5dffbc4..d57fd30 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -426,13 +426,16 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
struct dst_entry *dst = NULL;
union sctp_addr *daddr = &t->ipaddr;
union sctp_addr dst_saddr;
+ __u8 tos = inet_sk(sk)->tos;
+ if (t->dscp & SCTP_DSCP_SET_MASK)
+ tos = t->dscp & SCTP_DSCP_VAL_MASK;
memset(fl4, 0x0, sizeof(struct flowi4));
fl4->daddr = daddr->v4.sin_addr.s_addr;
fl4->fl4_dport = daddr->v4.sin_port;
fl4->flowi4_proto = IPPROTO_SCTP;
if (asoc) {
- fl4->flowi4_tos = RT_CONN_FLAGS(asoc->base.sk);
+ fl4->flowi4_tos = RT_CONN_FLAGS_TOS(asoc->base.sk, tos);
fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if;
fl4->fl4_sport = htons(asoc->base.bind_addr.port);
}
@@ -495,7 +498,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
fl4->fl4_sport = laddr->a.v4.sin_port;
flowi4_update_output(fl4,
asoc->base.sk->sk_bound_dev_if,
- RT_CONN_FLAGS(asoc->base.sk),
+ RT_CONN_FLAGS_TOS(asoc->base.sk, tos),
daddr->v4.sin_addr.s_addr,
laddr->a.v4.sin_addr.s_addr);
@@ -971,16 +974,21 @@ static inline int sctp_v4_xmit(struct sk_buff *skb,
struct sctp_transport *transport)
{
struct inet_sock *inet = inet_sk(skb->sk);
+ __u8 dscp = inet->tos;
pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb,
- skb->len, &transport->fl.u.ip4.saddr, &transport->fl.u.ip4.daddr);
+ skb->len, &transport->fl.u.ip4.saddr,
+ &transport->fl.u.ip4.daddr);
+
+ if (transport->dscp & SCTP_DSCP_SET_MASK)
+ dscp = transport->dscp & SCTP_DSCP_VAL_MASK;
inet->pmtudisc = transport->param_flags & SPP_PMTUD_ENABLE ?
IP_PMTUDISC_DO : IP_PMTUDISC_DONT;
SCTP_INC_STATS(sock_net(&inet->sk), SCTP_MIB_OUTSCTPPACKS);
- return ip_queue_xmit(&inet->sk, skb, &transport->fl);
+ return __ip_queue_xmit(&inet->sk, skb, &transport->fl, dscp);
}
static struct sctp_af sctp_af_inet;
--
2.1.0
^ permalink raw reply related
* [PATCH net-next 1/5] ipv4: add __ip_queue_xmit() that supports tos param
From: Xin Long @ 2018-06-25 2:14 UTC (permalink / raw)
To: network dev, linux-sctp; +Cc: Marcelo Ricardo Leitner, Neil Horman, davem
In-Reply-To: <cover.1529892764.git.lucien.xin@gmail.com>
This patch introduces __ip_queue_xmit(), through which the callers
can pass tos param into it without having to set inet->tos. For
ipv6, ip6_xmit() already allows passing tclass parameter.
It's needed when some transport protocol doesn't use inet->tos,
like sctp's per transport dscp, which will be added in next patch.
Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
include/net/ip.h | 2 ++
net/ipv4/ip_output.c | 13 ++++++++++---
2 files changed, 12 insertions(+), 3 deletions(-)
diff --git a/include/net/ip.h b/include/net/ip.h
index 0d2281b..ca05b77 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -148,6 +148,8 @@ void ip_send_check(struct iphdr *ip);
int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);
int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);
+int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
+ __u8 tos);
int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl);
void ip_init(void);
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index b3308e9..107d37f 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -423,7 +423,8 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
}
/* Note: skb->sk can be different from sk, in case of tunnels */
-int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
+int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
+ __u8 tos)
{
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
@@ -462,7 +463,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
inet->inet_dport,
inet->inet_sport,
sk->sk_protocol,
- RT_CONN_FLAGS(sk),
+ RT_CONN_FLAGS_TOS(sk, tos),
sk->sk_bound_dev_if);
if (IS_ERR(rt))
goto no_route;
@@ -478,7 +479,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
skb_reset_network_header(skb);
iph = ip_hdr(skb);
- *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
+ *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff));
if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
iph->frag_off = htons(IP_DF);
else
@@ -511,6 +512,12 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
kfree_skb(skb);
return -EHOSTUNREACH;
}
+EXPORT_SYMBOL(__ip_queue_xmit);
+
+int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
+{
+ return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
+}
EXPORT_SYMBOL(ip_queue_xmit);
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
--
2.1.0
^ permalink raw reply related
* [PATCH net-next 0/5] sctp: fully support for dscp and flowlabel per transport
From: Xin Long @ 2018-06-25 2:14 UTC (permalink / raw)
To: network dev, linux-sctp; +Cc: Marcelo Ricardo Leitner, Neil Horman, davem
Now dscp and flowlabel are set from sock when sending the packets,
but, being multi-homed, sctp should also support dscp and flowlabel
per transport, which is described in section 8.1.12 of RFC6458.
Xin Long (5):
ipv4: add __ip_queue_xmit() that supports tos param
sctp: add support for dscp and flowlabel per transport
sctp: add spp_ipv6_flowlabel and spp_dscp for sctp_paddrparams
sctp: add support for setting flowlabel when adding a transport
sctp: check for ipv6_pinfo legal sndflow with flowlabel in
sctp_v6_get_dst
include/linux/sctp.h | 7 ++
include/net/ip.h | 2 +
include/net/sctp/structs.h | 9 +++
include/uapi/linux/sctp.h | 4 ++
net/ipv4/ip_output.c | 13 +++-
net/sctp/associola.c | 15 +++++
net/sctp/ipv6.c | 20 +++++-
net/sctp/protocol.c | 16 +++--
net/sctp/socket.c | 157 +++++++++++++++++++++++++++++++++++++++++++++
9 files changed, 234 insertions(+), 9 deletions(-)
--
2.1.0
^ permalink raw reply
* [PATCHv2 net-next] sctp: add support for SCTP_REUSE_PORT sockopt
From: Xin Long @ 2018-06-25 2:06 UTC (permalink / raw)
To: network dev, linux-sctp
Cc: Marcelo Ricardo Leitner, Neil Horman, Michael Tuexen, davem
This feature is actually already supported by sk->sk_reuse which can be
set by socket level opt SO_REUSEADDR. But it's not working exactly as
RFC6458 demands in section 8.1.27, like:
- This option only supports one-to-one style SCTP sockets
- This socket option must not be used after calling bind()
or sctp_bindx().
Besides, SCTP_REUSE_PORT sockopt should be provided for user's programs.
Otherwise, the programs with SCTP_REUSE_PORT from other systems will not
work in linux.
To separate it from the socket level version, this patch adds 'reuse' in
sctp_sock and it works pretty much as sk->sk_reuse, but with some extra
setup limitations that are needed when it is being enabled.
"It should be noted that the behavior of the socket-level socket option
to reuse ports and/or addresses for SCTP sockets is unspecified", so it
leaves SO_REUSEADDR as is for the compatibility.
Note that the name SCTP_REUSE_PORT is kind of confusing, it is identical
to SO_REUSEADDR with some extra restriction, so here it uses 'reuse' in
sctp_sock instead of 'reuseport'. As for sk->sk_reuseport support for
SCTP, it will be added in another patch.
Thanks to Neil for making this clear.
v1->v2:
- add sctp_sk->reuse to separate it from the socket level version.
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
include/net/sctp/structs.h | 1 +
include/uapi/linux/sctp.h | 1 +
net/sctp/socket.c | 62 ++++++++++++++++++++++++++++++++++++++++------
3 files changed, 57 insertions(+), 7 deletions(-)
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index e0f962d..701a517 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -220,6 +220,7 @@ struct sctp_sock {
__u32 adaptation_ind;
__u32 pd_point;
__u16 nodelay:1,
+ reuse:1,
disable_fragments:1,
v4mapped:1,
frag_interleave:1,
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index b64d583..c02986a 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -100,6 +100,7 @@ typedef __s32 sctp_assoc_t;
#define SCTP_RECVNXTINFO 33
#define SCTP_DEFAULT_SNDINFO 34
#define SCTP_AUTH_DEACTIVATE_KEY 35
+#define SCTP_REUSE_PORT 36
/* Internal Socket Options. Some of the sctp library functions are
* implemented using these socket options.
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 0e91e83..bf11f9c 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4170,6 +4170,28 @@ static int sctp_setsockopt_interleaving_supported(struct sock *sk,
return retval;
}
+static int sctp_setsockopt_reuse_port(struct sock *sk, char __user *optval,
+ unsigned int optlen)
+{
+ int val;
+
+ if (!sctp_style(sk, TCP))
+ return -EOPNOTSUPP;
+
+ if (sctp_sk(sk)->ep->base.bind_addr.port)
+ return -EFAULT;
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+
+ if (get_user(val, (int __user *)optval))
+ return -EFAULT;
+
+ sctp_sk(sk)->reuse = !!val;
+
+ return 0;
+}
+
/* API 6.2 setsockopt(), getsockopt()
*
* Applications use setsockopt() and getsockopt() to set or retrieve
@@ -4364,6 +4386,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
retval = sctp_setsockopt_interleaving_supported(sk, optval,
optlen);
break;
+ case SCTP_REUSE_PORT:
+ retval = sctp_setsockopt_reuse_port(sk, optval, optlen);
+ break;
default:
retval = -ENOPROTOOPT;
break;
@@ -7197,6 +7222,26 @@ static int sctp_getsockopt_interleaving_supported(struct sock *sk, int len,
return retval;
}
+static int sctp_getsockopt_reuse_port(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
+{
+ int val;
+
+ if (len < sizeof(int))
+ return -EINVAL;
+
+ len = sizeof(int);
+ val = sctp_sk(sk)->reuse;
+ if (put_user(len, optlen))
+ return -EFAULT;
+
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+
+ return 0;
+}
+
static int sctp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
{
@@ -7392,6 +7437,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
retval = sctp_getsockopt_interleaving_supported(sk, len, optval,
optlen);
break;
+ case SCTP_REUSE_PORT:
+ retval = sctp_getsockopt_reuse_port(sk, len, optval, optlen);
+ break;
default:
retval = -ENOPROTOOPT;
break;
@@ -7429,6 +7477,7 @@ static struct sctp_bind_bucket *sctp_bucket_create(
static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
{
+ bool reuse = (sk->sk_reuse || sctp_sk(sk)->reuse);
struct sctp_bind_hashbucket *head; /* hash list */
struct sctp_bind_bucket *pp;
unsigned short snum;
@@ -7501,13 +7550,11 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
* used by other socket (pp->owner not empty); that other
* socket is going to be sk2.
*/
- int reuse = sk->sk_reuse;
struct sock *sk2;
pr_debug("%s: found a possible match\n", __func__);
- if (pp->fastreuse && sk->sk_reuse &&
- sk->sk_state != SCTP_SS_LISTENING)
+ if (pp->fastreuse && reuse && sk->sk_state != SCTP_SS_LISTENING)
goto success;
/* Run through the list of sockets bound to the port
@@ -7525,7 +7572,7 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
ep2 = sctp_sk(sk2)->ep;
if (sk == sk2 ||
- (reuse && sk2->sk_reuse &&
+ (reuse && (sk2->sk_reuse || sctp_sk(sk2)->reuse) &&
sk2->sk_state != SCTP_SS_LISTENING))
continue;
@@ -7549,12 +7596,12 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
* SO_REUSEADDR on this socket -sk-).
*/
if (hlist_empty(&pp->owner)) {
- if (sk->sk_reuse && sk->sk_state != SCTP_SS_LISTENING)
+ if (reuse && sk->sk_state != SCTP_SS_LISTENING)
pp->fastreuse = 1;
else
pp->fastreuse = 0;
} else if (pp->fastreuse &&
- (!sk->sk_reuse || sk->sk_state == SCTP_SS_LISTENING))
+ (!reuse || sk->sk_state == SCTP_SS_LISTENING))
pp->fastreuse = 0;
/* We are set, so fill up all the data in the hash table
@@ -7685,7 +7732,7 @@ int sctp_inet_listen(struct socket *sock, int backlog)
err = 0;
sctp_unhash_endpoint(ep);
sk->sk_state = SCTP_SS_CLOSED;
- if (sk->sk_reuse)
+ if (sk->sk_reuse || sctp_sk(sk)->reuse)
sctp_sk(sk)->bind_hash->fastreuse = 1;
goto out;
}
@@ -8550,6 +8597,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
newsk->sk_no_check_tx = sk->sk_no_check_tx;
newsk->sk_no_check_rx = sk->sk_no_check_rx;
newsk->sk_reuse = sk->sk_reuse;
+ sctp_sk(newsk)->reuse = sp->reuse;
newsk->sk_shutdown = sk->sk_shutdown;
newsk->sk_destruct = sctp_destruct_sock;
--
2.1.0
^ permalink raw reply related
* Re: [PATCH 0/4] docs: e100[0] fix build errors
From: Tobin C. Harding @ 2018-06-25 2:06 UTC (permalink / raw)
To: Kirsher, Jeffrey T
Cc: Randy Dunlap, Jonathan Corbet, David S. Miller,
linux-doc@vger.kernel.org, netdev@vger.kernel.org,
linux-kernel@vger.kernel.org
In-Reply-To: <91A1572C-63A6-4420-B01B-A0DD5F84200A@intel.com>
On Mon, Jun 25, 2018 at 01:11:33AM +0000, Kirsher, Jeffrey T wrote:
>
>
> > On Jun 24, 2018, at 16:41, Tobin C. Harding <me@tobin.cc> wrote:
> >
> >> On Fri, Jun 22, 2018 at 01:22:37PM -0700, Randy Dunlap wrote:
> >> Hi Tobin,
> >>
> >>> On 06/21/2018 05:37 PM, Tobin C. Harding wrote:
> >>> Hi Jonathan,
> >>>
> >>> This patch set fixes current docs build failure on Linus' mainline
> >>>
> >>> commit: (ba4dbdedd3ed Merge tag 'jfs-4.18' of git://github.com/kleikamp/linux-shaggy)
> >>>
> >>> (FYI this is 8 commits after Linux 4.18-rc1).
> >>>
> >>> And also same build errors on today's linux-next
> >>>
> >>> 8439c34f07a3 (tag: next-20180621, linux-next/master, linux-next) Add linux-next specific files for 20180621
> >>>
> >>>
> >>> I split the patches in between the two drivers to enable use of the
> >>> 'Fixes:' tag.
> >>>
> >>> Tobin C. Harding (4):
> >>> Documentation: e100: Use correct heading adornment
> >>> Documentation: e1000: Use correct heading adornment
> >>> Documentation: e100: Fix docs build error
> >>> Documentation: e1000: Fix docs build error
> >>>
> >>> Documentation/networking/e100.rst | 112 +++++++++++++++--------------
> >>> Documentation/networking/e1000.rst | 76 ++++++++++----------
> >>> 2 files changed, 96 insertions(+), 92 deletions(-)
> >>
> >> I am still seeing a few warnings (your 4 patches applied to
> >> linux-next-20180622):
> >>
> >> linux-next-20180622/Documentation/networking/e100.rst:57: WARNING: Literal block expected; none found.
> >> linux-next-20180622/Documentation/networking/e100.rst:68: WARNING: Literal block expected; none found.
> >> linux-next-20180622/Documentation/networking/e100.rst:75: WARNING: Literal block expected; none found.
> >> linux-next-20180622/Documentation/networking/e100.rst:84: WARNING: Literal block expected; none found.
> >> linux-next-20180622/Documentation/networking/e100.rst:93: WARNING: Inline emphasis start-string without end-string.
> >> linux-next-20180622/Documentation/networking/e1000.rst:83: ERROR: Unexpected indentation.
> >> linux-next-20180622/Documentation/networking/e1000.rst:84: WARNING: Block quote ends without a blank line; unexpected unindent.
> >> linux-next-20180622/Documentation/networking/e1000.rst:173: WARNING: Definition list ends without a blank line; unexpected unindent.
> >> linux-next-20180622/Documentation/networking/e1000.rst:236: WARNING: Definition list ends without a blank line; unexpected unindent.
> >>
> >>
> >> You didn't get these warnings? or they weren't important?
> >> or "perfect" was not your primary goal? :) [which is fine]
> >>
> >> [I see around 50 similar doc formatting warnings/errors in the entire
> >> Documentation build.]
> >
> > Thanks for testing this Randy. You are right there are a bunch of
> > warnings, I did not think to grep for warnings on these files - sloppy
> > work by me. The series has been applied already but I'll add clearing
> > these warnings to my TODO list.
>
> Thanks, already on it. I have a set of patches that I will push once Dave opens up his net-next tree.
Oh cool, even better.
thanks,
Tobin.
^ permalink raw reply
* Re: [PATCH 0/4] docs: e100[0] fix build errors
From: Kirsher, Jeffrey T @ 2018-06-25 1:11 UTC (permalink / raw)
To: Tobin C. Harding
Cc: Randy Dunlap, Jonathan Corbet, David S. Miller,
linux-doc@vger.kernel.org, netdev@vger.kernel.org,
linux-kernel@vger.kernel.org
In-Reply-To: <20180624234135.GE3006@eros>
> On Jun 24, 2018, at 16:41, Tobin C. Harding <me@tobin.cc> wrote:
>
>> On Fri, Jun 22, 2018 at 01:22:37PM -0700, Randy Dunlap wrote:
>> Hi Tobin,
>>
>>> On 06/21/2018 05:37 PM, Tobin C. Harding wrote:
>>> Hi Jonathan,
>>>
>>> This patch set fixes current docs build failure on Linus' mainline
>>>
>>> commit: (ba4dbdedd3ed Merge tag 'jfs-4.18' of git://github.com/kleikamp/linux-shaggy)
>>>
>>> (FYI this is 8 commits after Linux 4.18-rc1).
>>>
>>> And also same build errors on today's linux-next
>>>
>>> 8439c34f07a3 (tag: next-20180621, linux-next/master, linux-next) Add linux-next specific files for 20180621
>>>
>>>
>>> I split the patches in between the two drivers to enable use of the
>>> 'Fixes:' tag.
>>>
>>> Tobin C. Harding (4):
>>> Documentation: e100: Use correct heading adornment
>>> Documentation: e1000: Use correct heading adornment
>>> Documentation: e100: Fix docs build error
>>> Documentation: e1000: Fix docs build error
>>>
>>> Documentation/networking/e100.rst | 112 +++++++++++++++--------------
>>> Documentation/networking/e1000.rst | 76 ++++++++++----------
>>> 2 files changed, 96 insertions(+), 92 deletions(-)
>>
>> I am still seeing a few warnings (your 4 patches applied to
>> linux-next-20180622):
>>
>> linux-next-20180622/Documentation/networking/e100.rst:57: WARNING: Literal block expected; none found.
>> linux-next-20180622/Documentation/networking/e100.rst:68: WARNING: Literal block expected; none found.
>> linux-next-20180622/Documentation/networking/e100.rst:75: WARNING: Literal block expected; none found.
>> linux-next-20180622/Documentation/networking/e100.rst:84: WARNING: Literal block expected; none found.
>> linux-next-20180622/Documentation/networking/e100.rst:93: WARNING: Inline emphasis start-string without end-string.
>> linux-next-20180622/Documentation/networking/e1000.rst:83: ERROR: Unexpected indentation.
>> linux-next-20180622/Documentation/networking/e1000.rst:84: WARNING: Block quote ends without a blank line; unexpected unindent.
>> linux-next-20180622/Documentation/networking/e1000.rst:173: WARNING: Definition list ends without a blank line; unexpected unindent.
>> linux-next-20180622/Documentation/networking/e1000.rst:236: WARNING: Definition list ends without a blank line; unexpected unindent.
>>
>>
>> You didn't get these warnings? or they weren't important?
>> or "perfect" was not your primary goal? :) [which is fine]
>>
>> [I see around 50 similar doc formatting warnings/errors in the entire
>> Documentation build.]
>
> Thanks for testing this Randy. You are right there are a bunch of
> warnings, I did not think to grep for warnings on these files - sloppy
> work by me. The series has been applied already but I'll add clearing
> these warnings to my TODO list.
Thanks, already on it. I have a set of patches that I will push once Dave opens up his net-next tree.
>
>> Anyway, much better than it was. Thanks.
>>
>> Tested-by: Randy Dunlap <rdunlap@infradead.org>
>
> Thanks again,
> Tobin.
^ permalink raw reply
* Re: [PATCH 0/4] docs: e100[0] fix build errors
From: Tobin C. Harding @ 2018-06-24 23:41 UTC (permalink / raw)
To: Randy Dunlap
Cc: Jonathan Corbet, Jeff Kirsher, David S. Miller, linux-doc, netdev,
linux-kernel
In-Reply-To: <ba768d6c-dd21-3600-5cba-151b5d6185f1@infradead.org>
On Fri, Jun 22, 2018 at 01:22:37PM -0700, Randy Dunlap wrote:
> Hi Tobin,
>
> On 06/21/2018 05:37 PM, Tobin C. Harding wrote:
> > Hi Jonathan,
> >
> > This patch set fixes current docs build failure on Linus' mainline
> >
> > commit: (ba4dbdedd3ed Merge tag 'jfs-4.18' of git://github.com/kleikamp/linux-shaggy)
> >
> > (FYI this is 8 commits after Linux 4.18-rc1).
> >
> > And also same build errors on today's linux-next
> >
> > 8439c34f07a3 (tag: next-20180621, linux-next/master, linux-next) Add linux-next specific files for 20180621
> >
> >
> > I split the patches in between the two drivers to enable use of the
> > 'Fixes:' tag.
> >
> > Tobin C. Harding (4):
> > Documentation: e100: Use correct heading adornment
> > Documentation: e1000: Use correct heading adornment
> > Documentation: e100: Fix docs build error
> > Documentation: e1000: Fix docs build error
> >
> > Documentation/networking/e100.rst | 112 +++++++++++++++--------------
> > Documentation/networking/e1000.rst | 76 ++++++++++----------
> > 2 files changed, 96 insertions(+), 92 deletions(-)
>
> I am still seeing a few warnings (your 4 patches applied to
> linux-next-20180622):
>
> linux-next-20180622/Documentation/networking/e100.rst:57: WARNING: Literal block expected; none found.
> linux-next-20180622/Documentation/networking/e100.rst:68: WARNING: Literal block expected; none found.
> linux-next-20180622/Documentation/networking/e100.rst:75: WARNING: Literal block expected; none found.
> linux-next-20180622/Documentation/networking/e100.rst:84: WARNING: Literal block expected; none found.
> linux-next-20180622/Documentation/networking/e100.rst:93: WARNING: Inline emphasis start-string without end-string.
> linux-next-20180622/Documentation/networking/e1000.rst:83: ERROR: Unexpected indentation.
> linux-next-20180622/Documentation/networking/e1000.rst:84: WARNING: Block quote ends without a blank line; unexpected unindent.
> linux-next-20180622/Documentation/networking/e1000.rst:173: WARNING: Definition list ends without a blank line; unexpected unindent.
> linux-next-20180622/Documentation/networking/e1000.rst:236: WARNING: Definition list ends without a blank line; unexpected unindent.
>
>
> You didn't get these warnings? or they weren't important?
> or "perfect" was not your primary goal? :) [which is fine]
>
> [I see around 50 similar doc formatting warnings/errors in the entire
> Documentation build.]
Thanks for testing this Randy. You are right there are a bunch of
warnings, I did not think to grep for warnings on these files - sloppy
work by me. The series has been applied already but I'll add clearing
these warnings to my TODO list.
> Anyway, much better than it was. Thanks.
>
> Tested-by: Randy Dunlap <rdunlap@infradead.org>
Thanks again,
Tobin.
^ permalink raw reply
* Suspend of SDIO function devices
From: Daniel Mack @ 2018-06-24 20:46 UTC (permalink / raw)
To: Ulf Hansson, Chris Ball
Cc: linux-mmc-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
libertas-dev-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r@public.gmane.org,
linux-wireless-u79uwXL29TY76Z2rM5mHXA,
netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Hi,
I'm currently looking into the suspend callbacks of drivers of hardware
that use an SDIO interface, specifically the libertas_sdio driver:
drivers/net/wireless/marvell/libertas/if_sdio.c
The comments in if_sdio_suspend() suggest that by returning -ENOSYS due
to runtime-dependent circumstances, the MMC core will remove the card
entirely at suspend time. I then searched for the bits that do that and
failed, until I came across this old commit, which first appeared in 3.16:
573185cc7e6 mmc: core: Invoke sdio func driver's PM callbacks from
the sdio bus
Before that commit, the mmc core did in fact invoke the card's
.suspend() callback manually and if it returned a non-zero result, it
would remove the card. Now that the generic pm functions are in place,
this no longer happens because the host and its clients are
independent entities. Consequently, systems fail to suspend when the
libertas_sdio module is loaded.
The pm notifier code in drivers/mmc/core/core.c does still handle cases
where no pm functions are provided at all (in which case it removes the
card), but it doesn't handle -ENOSYS return values at runtime.
Now I'm wondering how this is supposed to work, and which end needs
fixing. The mmc/sdio core by restoring the old logic from before
573185cc7e6, or the libertas driver.
The platform I'm working on does not retain power for the SDIO slaves,
so a complete re-init is necessary after resume.
Please advise, I'm happy to test approaches and send patches.
Thanks,
Daniel
^ permalink raw reply
* [PATCH net-next] tls: Removed unused variable
From: Vakul Garg @ 2018-06-24 20:07 UTC (permalink / raw)
To: davem; +Cc: netdev, linux-kernel, borisp, aviadye, davejwatson, Vakul Garg
Removed unused variable 'rxm' from tls_queue().
Signed-off-by: Vakul Garg <vakul.garg@nxp.com>
---
net/tls/tls_sw.c | 3 ---
1 file changed, 3 deletions(-)
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index f127fac88acf..727433b37bb5 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -990,9 +990,6 @@ static void tls_queue(struct strparser *strp, struct sk_buff *skb)
{
struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
- struct strp_msg *rxm;
-
- rxm = strp_msg(skb);
ctx->decrypted = false;
--
2.13.6
^ permalink raw reply related
* Re: [PATCH rdma-next 06/12] RDMA/uverbs: Don't overwrite NULL pointer with ZERO_SIZE_PTR
From: Jason Gunthorpe @ 2018-06-24 19:57 UTC (permalink / raw)
To: Leon Romanovsky
Cc: Doug Ledford, Leon Romanovsky, RDMA mailing list, Hadar Hen Zion,
Matan Barak, Michael J Ruhl, Noa Osherovich, Raed Salem,
Yishai Hadas, Saeed Mahameed, linux-netdev
In-Reply-To: <20180624082353.16138-7-leon@kernel.org>
On Sun, Jun 24, 2018 at 11:23:47AM +0300, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@mellanox.com>
>
> The number of specs is provided by the user and in a valid case can be equal
> to zero. Such an argument causes a call to kcalloc() with a zero-length
> request, which returns ZERO_SIZE_PTR. This pointer is different from NULL
> and makes various if (..) checks succeed.
This one seems really weird. There is nothing wrong with ZERO_SIZE_PTR,
but this description and fix suggest that something did
ptr = kalloc(0);
ptr[0] = ...;
Which is not allowed of course. Doesn't this mean there is also a
missing range check someplace?
Jason
^ permalink raw reply
* Re: [PATCH rdma-next 09/12] RDMA/mlx5: Fix shift overflow in mlx5_ib_create_wq
From: Jason Gunthorpe @ 2018-06-24 19:56 UTC (permalink / raw)
To: Leon Romanovsky
Cc: Doug Ledford, Leon Romanovsky, RDMA mailing list, Hadar Hen Zion,
Matan Barak, Michael J Ruhl, Noa Osherovich, Raed Salem,
Yishai Hadas, Saeed Mahameed, linux-netdev
In-Reply-To: <20180624082353.16138-10-leon@kernel.org>
On Sun, Jun 24, 2018 at 11:23:50AM +0300, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@mellanox.com>
>
> [ 61.182439] UBSAN: Undefined behaviour in drivers/infiniband/hw/mlx5/qp.c:5366:34
> [ 61.183673] shift exponent 4294967288 is too large for 32-bit type 'unsigned int'
> [ 61.185530] CPU: 0 PID: 639 Comm: qp Not tainted 4.18.0-rc1-00037-g4aa1d69a9c60-dirty #96
> [ 61.186981] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-2.fc27 04/01/2014
> [ 61.188315] Call Trace:
> [ 61.188661] dump_stack+0xc7/0x13b
> [ 61.190427] ubsan_epilogue+0x9/0x49
> [ 61.190899] __ubsan_handle_shift_out_of_bounds+0x1ea/0x22f
> [ 61.197040] mlx5_ib_create_wq+0x1c99/0x1d50
> [ 61.206632] ib_uverbs_ex_create_wq+0x499/0x820
> [ 61.213892] ib_uverbs_write+0x77e/0xae0
> [ 61.248018] vfs_write+0x121/0x3b0
> [ 61.249831] ksys_write+0xa1/0x120
> [ 61.254024] do_syscall_64+0x7c/0x2a0
> [ 61.256178] entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [ 61.259211] RIP: 0033:0x7f54bab70e99
> [ 61.262125] Code: 00 f3 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89
> [ 61.268678] RSP: 002b:00007ffe1541c318 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
> [ 61.271076] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f54bab70e99
> [ 61.273795] RDX: 0000000000000070 RSI: 0000000020000240 RDI: 0000000000000003
> [ 61.276982] RBP: 00007ffe1541c330 R08: 00000000200078e0 R09: 0000000000000002
> [ 61.280035] R10: 0000000000000000 R11: 0000000000000246 R12: 00000000004005c0
> [ 61.283279] R13: 00007ffe1541c420 R14: 0000000000000000 R15: 0000000000000000
>
> Cc: <stable@vger.kernel.org> # 4.7
> Fixes: 79b20a6c3014 ("IB/mlx5: Add receive Work Queue verbs")
> Cc: syzkaller <syzkaller@googlegroups.com>
> Reported-by: Noa Osherovich <noaos@mellanox.com>
> Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
> drivers/infiniband/hw/mlx5/qp.c | 6 +++++-
> 1 file changed, 5 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
> index 6034a670859f..8e40263fd40e 100644
> +++ b/drivers/infiniband/hw/mlx5/qp.c
> @@ -5377,7 +5377,11 @@ static int set_user_rq_size(struct mlx5_ib_dev *dev,
>
> rwq->wqe_count = ucmd->rq_wqe_count;
> rwq->wqe_shift = ucmd->rq_wqe_shift;
> - rwq->buf_size = (rwq->wqe_count << rwq->wqe_shift);
> + rwq->buf_size =
> + shift_overflow((size_t)rwq->wqe_count, (size_t)rwq->wqe_shift);
The casts are redundant, the function argument is already size_t so
implicit promotion is guaranteed.
Jason
^ permalink raw reply
* Re: [V2] brcmfmac: stop watchdog before detach and free everything
From: Kalle Valo @ 2018-06-24 16:58 UTC (permalink / raw)
To: Michael Trimarchi
Cc: Arend van Spriel, Franky Lin, Hante Meuleman, Chi-Hsien Lin,
Wright Feng, David S. Miller, Pieter-Paul Giesberts, Ian Molton,
linux-wireless, brcm80211-dev-list.pdl, brcm80211-dev-list,
netdev, linux-kernel
In-Reply-To: <20180530090633.GA15390@panicking>
Michael Trimarchi <michael@amarulasolutions.com> wrote:
> Using the driver built into the kernel image without the firmware in the
> filesystem or in the kernel image can lead to a kernel NULL pointer dereference.
> The watchdog needs to be stopped in brcmf_sdio_remove().
>
> The system is going down NOW!
> [ 1348.110759] Unable to handle kernel NULL pointer dereference at virtual address 000002f8
> Sent SIGTERM to all processes
> [ 1348.121412] Mem abort info:
> [ 1348.126962] ESR = 0x96000004
> [ 1348.130023] Exception class = DABT (current EL), IL = 32 bits
> [ 1348.135948] SET = 0, FnV = 0
> [ 1348.138997] EA = 0, S1PTW = 0
> [ 1348.142154] Data abort info:
> [ 1348.145045] ISV = 0, ISS = 0x00000004
> [ 1348.148884] CM = 0, WnR = 0
> [ 1348.151861] user pgtable: 4k pages, 48-bit VAs, pgdp = (____ptrval____)
> [ 1348.158475] [00000000000002f8] pgd=0000000000000000
> [ 1348.163364] Internal error: Oops: 96000004 [#1] PREEMPT SMP
> [ 1348.168927] Modules linked in: ipv6
> [ 1348.172421] CPU: 3 PID: 1421 Comm: brcmf_wdog/mmc0 Not tainted 4.17.0-rc5-next-20180517 #18
> [ 1348.180757] Hardware name: Amarula A64-Relic (DT)
> [ 1348.185455] pstate: 60000005 (nZCv daif -PAN -UAO)
> [ 1348.190251] pc : brcmf_sdiod_freezer_count+0x0/0x20
> [ 1348.195124] lr : brcmf_sdio_watchdog_thread+0x64/0x290
> [ 1348.200253] sp : ffff00000b85be30
> [ 1348.203561] x29: ffff00000b85be30 x28: 0000000000000000
> [ 1348.208868] x27: ffff00000b6cb918 x26: ffff80003b990638
> [ 1348.214176] x25: ffff0000087b1a20 x24: ffff80003b94f800
> [ 1348.219483] x23: ffff000008e620c8 x22: ffff000008f0b660
> [ 1348.224790] x21: ffff000008c6a858 x20: 00000000fffffe00
> [ 1348.230097] x19: ffff80003b94f800 x18: 0000000000000001
> [ 1348.235404] x17: 0000ffffab2e8a74 x16: ffff0000080d7de8
> [ 1348.240711] x15: 0000000000000000 x14: 0000000000000400
> [ 1348.246018] x13: 0000000000000400 x12: 0000000000000001
> [ 1348.251324] x11: 00000000000002c4 x10: 0000000000000a10
> [ 1348.256631] x9 : ffff00000b85bc40 x8 : ffff80003be11870
> [ 1348.261937] x7 : ffff80003dfc7308 x6 : 000000078ff08b55
> [ 1348.267243] x5 : 00000139e1058400 x4 : 0000000000000000
> [ 1348.272550] x3 : dead000000000100 x2 : 958f2788d6618100
> [ 1348.277856] x1 : 00000000fffffe00 x0 : 0000000000000000
>
> Signed-off-by: Michael Trimarchi <michael@amarulasolutions.com>
> Acked-by: Arend van Spriel <arend.vanspriel@broadcom.com>
> Tested-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Patch applied to wireless-drivers.git, thanks.
373c83a801f1 brcmfmac: stop watchdog before detach and free everything
--
https://patchwork.kernel.org/patch/10437931/
https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches
^ permalink raw reply
* [PATCH net-next 3/3] r8169: don't check WoL when powering down PHY and interface is down
From: Heiner Kallweit @ 2018-06-24 16:40 UTC (permalink / raw)
To: Realtek linux nic maintainers, David Miller; +Cc: netdev@vger.kernel.org
In-Reply-To: <f8a65bb4-aed5-7264-1e1c-b8c76ee79717@gmail.com>
We can power down the PHY regardless of WOL settings if the interface
is down. So far we would have left the PHY enabled if WOL options
are set and the interface is brought down.
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
drivers/net/ethernet/realtek/r8169.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index f8a1309a..1d33672c 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -4635,7 +4635,7 @@ static void rtl_wol_suspend_quirk(struct rtl8169_private *tp)
static bool rtl_wol_pll_power_down(struct rtl8169_private *tp)
{
- if (!tp->saved_wolopts)
+ if (!netif_running(tp->dev) || !tp->saved_wolopts)
return false;
rtl_speed_down(tp);
--
2.18.0
^ permalink raw reply related
* [PATCH net-next 2/3] r8169: improve saved_wolopts handling
From: Heiner Kallweit @ 2018-06-24 16:39 UTC (permalink / raw)
To: Realtek linux nic maintainers, David Miller; +Cc: netdev@vger.kernel.org
In-Reply-To: <f8a65bb4-aed5-7264-1e1c-b8c76ee79717@gmail.com>
Let's make saved_wolopts a shadow copy of the WoL options. This allows
us to simplify the code and get rid of calls to the now unneeded function
__rtl8169_get_wol(). However don't remove __rtl8169_get_wol()
completely to be prepared for the case that we can respect BIOS WOL
settings again.
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
drivers/net/ethernet/realtek/r8169.c | 34 ++++++++++++----------------
1 file changed, 14 insertions(+), 20 deletions(-)
diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 480fb141..f8a1309a 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -1587,6 +1587,12 @@ static void rtl8169_check_link_status(struct net_device *dev,
#define WAKE_ANY (WAKE_PHY | WAKE_MAGIC | WAKE_UCAST | WAKE_BCAST | WAKE_MCAST)
+/* Currently we only enable WoL if explicitly told by userspace to circumvent
+ * issues on certain platforms, see commit bde135a672bf ("r8169: only enable
+ * PCI wakeups when WOL is active"). Let's keep __rtl8169_get_wol() for the
+ * case that we want to respect BIOS settings again.
+ */
+#if 0
static u32 __rtl8169_get_wol(struct rtl8169_private *tp)
{
u8 options;
@@ -1621,25 +1627,16 @@ static u32 __rtl8169_get_wol(struct rtl8169_private *tp)
return wolopts;
}
+#endif
static void rtl8169_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
{
struct rtl8169_private *tp = netdev_priv(dev);
- struct device *d = tp_to_dev(tp);
-
- pm_runtime_get_noresume(d);
rtl_lock_work(tp);
-
wol->supported = WAKE_ANY;
- if (pm_runtime_active(d))
- wol->wolopts = __rtl8169_get_wol(tp);
- else
- wol->wolopts = tp->saved_wolopts;
-
+ wol->wolopts = tp->saved_wolopts;
rtl_unlock_work(tp);
-
- pm_runtime_put_noidle(d);
}
static void __rtl8169_set_wol(struct rtl8169_private *tp, u32 wolopts)
@@ -1719,14 +1716,14 @@ static int rtl8169_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
rtl_lock_work(tp);
+ tp->saved_wolopts = wol->wolopts & WAKE_ANY;
+
if (pm_runtime_active(d))
- __rtl8169_set_wol(tp, wol->wolopts);
- else
- tp->saved_wolopts = wol->wolopts;
+ __rtl8169_set_wol(tp, tp->saved_wolopts);
rtl_unlock_work(tp);
- device_set_wakeup_enable(d, wol->wolopts);
+ device_set_wakeup_enable(d, tp->saved_wolopts);
pm_runtime_put_noidle(d);
@@ -4638,7 +4635,7 @@ static void rtl_wol_suspend_quirk(struct rtl8169_private *tp)
static bool rtl_wol_pll_power_down(struct rtl8169_private *tp)
{
- if (!(__rtl8169_get_wol(tp) & WAKE_ANY))
+ if (!tp->saved_wolopts)
return false;
rtl_speed_down(tp);
@@ -7219,7 +7216,6 @@ static int rtl_open(struct net_device *dev)
rtl_unlock_work(tp);
- tp->saved_wolopts = 0;
pm_runtime_put_sync(&pdev->dev);
rtl8169_check_link_status(dev, tp);
@@ -7367,7 +7363,6 @@ static int rtl8169_runtime_suspend(struct device *device)
}
rtl_lock_work(tp);
- tp->saved_wolopts = __rtl8169_get_wol(tp);
__rtl8169_set_wol(tp, WAKE_ANY);
rtl_unlock_work(tp);
@@ -7392,7 +7387,6 @@ static int rtl8169_runtime_resume(struct device *device)
rtl_lock_work(tp);
__rtl8169_set_wol(tp, tp->saved_wolopts);
- tp->saved_wolopts = 0;
rtl_unlock_work(tp);
__rtl8169_resume(dev);
@@ -7462,7 +7456,7 @@ static void rtl_shutdown(struct pci_dev *pdev)
rtl8169_hw_reset(tp);
if (system_state == SYSTEM_POWER_OFF) {
- if (__rtl8169_get_wol(tp) & WAKE_ANY) {
+ if (tp->saved_wolopts) {
rtl_wol_suspend_quirk(tp);
rtl_wol_shutdown_quirk(tp);
}
--
2.18.0
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox