Netdev List
 help / color / mirror / Atom feed
* [net-next:master 42/70] drivers/net/usb/lan78xx.c:1651:37-38: WARNING: Use ARRAY_SIZE
From: kbuild test robot @ 2018-04-25  4:16 UTC (permalink / raw)
  To: Raghuram Chary J; +Cc: kbuild-all, netdev

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git master
head:   c749fa181bd5848be78691d23168ec61ce691b95
commit: 496218656f9857d801512efdec1d609ebbe8a83b [42/70] lan78xx: Add support to dump lan78xx registers


coccinelle warnings: (new ones prefixed by >>)

>> drivers/net/usb/lan78xx.c:1651:37-38: WARNING: Use ARRAY_SIZE

vim +1651 drivers/net/usb/lan78xx.c

  1641	
  1642	static void
  1643	lan78xx_get_regs(struct net_device *netdev, struct ethtool_regs *regs,
  1644			 void *buf)
  1645	{
  1646		u32 *data = buf;
  1647		int i, j;
  1648		struct lan78xx_net *dev = netdev_priv(netdev);
  1649	
  1650		/* Read Device/MAC registers */
> 1651		for (i = 0; i < (sizeof(lan78xx_regs) / sizeof(u32)); i++)
  1652			lan78xx_read_reg(dev, lan78xx_regs[i], &data[i]);
  1653	
  1654		if (!netdev->phydev)
  1655			return;
  1656	
  1657		/* Read PHY registers */
  1658		for (j = 0; j < 32; i++, j++)
  1659			data[i] = phy_read(netdev->phydev, j);
  1660	}
  1661	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

^ permalink raw reply

* [PATCH bpf-next 0/4] nfp: bpf: optimize negative sums
From: Jakub Kicinski @ 2018-04-25  4:22 UTC (permalink / raw)
  To: alexei.starovoitov, daniel; +Cc: oss-drivers, netdev, Jakub Kicinski

Hi!

This set adds an optimization run to the NFP jit to turn ADD and SUB
instructions with negative immediate into the opposite operation with
a positive immediate.  NFP can fit small immediates into the instructions
but it can't ever fit negative immediates.  Addition of small negative
immediates is quite common in BPF programs for stack address calculations,
therefore this optimization gives us non-negligible savings in instruction
count (up to 4%).


Jakub Kicinski (4):
  nfp: bpf: remove double space
  nfp: bpf: optimize add/sub of a negative constant
  nfp: bpf: tabularize generations of compare operations
  nfp: bpf: optimize comparisons to negative constants

 drivers/net/ethernet/netronome/nfp/bpf/jit.c  | 231 +++++++++++++-------------
 drivers/net/ethernet/netronome/nfp/bpf/main.h |   6 +-
 2 files changed, 124 insertions(+), 113 deletions(-)

-- 
2.16.2

^ permalink raw reply

* [PATCH bpf-next 1/4] nfp: bpf: remove double space
From: Jakub Kicinski @ 2018-04-25  4:22 UTC (permalink / raw)
  To: alexei.starovoitov, daniel; +Cc: oss-drivers, netdev, Jakub Kicinski
In-Reply-To: <20180425042239.27869-1-jakub.kicinski@netronome.com>

Whitespace cleanup - remove double space.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 29b4e5f8c102..9cc638718272 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -1400,7 +1400,7 @@ map_call_stack_common(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	if (!load_lm_ptr)
 		return 0;
 
-	emit_csr_wr(nfp_prog, stack_reg(nfp_prog),  NFP_CSR_ACT_LM_ADDR0);
+	emit_csr_wr(nfp_prog, stack_reg(nfp_prog), NFP_CSR_ACT_LM_ADDR0);
 	wrp_nops(nfp_prog, 3);
 
 	return 0;
-- 
2.16.2

^ permalink raw reply related

* [PATCH bpf-next 2/4] nfp: bpf: optimize add/sub of a negative constant
From: Jakub Kicinski @ 2018-04-25  4:22 UTC (permalink / raw)
  To: alexei.starovoitov, daniel; +Cc: oss-drivers, netdev, Jakub Kicinski
In-Reply-To: <20180425042239.27869-1-jakub.kicinski@netronome.com>

NFP instruction set can fit small immediates into the instruction.
Negative integers, however, will never fit because they will have
highest bit set.  If we swap the ALU op between ADD and SUB and
negate the constant we have a better chance of fitting small negative
integers into the instruction itself and saving one or two cycles.

immed[gprB_21, 0xfffffffc]
alu[gprA_4, gprA_4, +, gprB_21], gpr_wrboth
immed[gprB_21, 0xffffffff]
alu[gprA_5, gprA_5, +carry, gprB_21], gpr_wrboth

now becomes:

alu[gprA_4, gprA_4, -, 4], gpr_wrboth
alu[gprA_5, gprA_5, -carry, 0], gpr_wrboth

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 35 ++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 9cc638718272..a5590988fc69 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -2777,6 +2777,40 @@ static void nfp_bpf_opt_reg_init(struct nfp_prog *nfp_prog)
 	}
 }
 
+/* abs(insn.imm) will fit better into unrestricted reg immediate -
+ * convert add/sub of a negative number into a sub/add of a positive one.
+ */
+static void nfp_bpf_opt_neg_add_sub(struct nfp_prog *nfp_prog)
+{
+	struct nfp_insn_meta *meta;
+
+	list_for_each_entry(meta, &nfp_prog->insns, l) {
+		struct bpf_insn insn = meta->insn;
+
+		if (meta->skip)
+			continue;
+
+		if (BPF_CLASS(insn.code) != BPF_ALU &&
+		    BPF_CLASS(insn.code) != BPF_ALU64)
+			continue;
+		if (BPF_SRC(insn.code) != BPF_K)
+			continue;
+		if (insn.imm >= 0)
+			continue;
+
+		if (BPF_OP(insn.code) == BPF_ADD)
+			insn.code = BPF_CLASS(insn.code) | BPF_SUB;
+		else if (BPF_OP(insn.code) == BPF_SUB)
+			insn.code = BPF_CLASS(insn.code) | BPF_ADD;
+		else
+			continue;
+
+		meta->insn.code = insn.code | BPF_K;
+
+		meta->insn.imm = -insn.imm;
+	}
+}
+
 /* Remove masking after load since our load guarantees this is not needed */
 static void nfp_bpf_opt_ld_mask(struct nfp_prog *nfp_prog)
 {
@@ -3212,6 +3246,7 @@ static int nfp_bpf_optimize(struct nfp_prog *nfp_prog)
 {
 	nfp_bpf_opt_reg_init(nfp_prog);
 
+	nfp_bpf_opt_neg_add_sub(nfp_prog);
 	nfp_bpf_opt_ld_mask(nfp_prog);
 	nfp_bpf_opt_ld_shift(nfp_prog);
 	nfp_bpf_opt_ldst_gather(nfp_prog);
-- 
2.16.2

^ permalink raw reply related

* [PATCH bpf-next 3/4] nfp: bpf: tabularize generations of compare operations
From: Jakub Kicinski @ 2018-04-25  4:22 UTC (permalink / raw)
  To: alexei.starovoitov, daniel; +Cc: oss-drivers, netdev, Jakub Kicinski
In-Reply-To: <20180425042239.27869-1-jakub.kicinski@netronome.com>

There are quite a few compare instructions now, use a table
to translate BPF instruction code to NFP instruction parameters
instead of parameterizing helpers.  This saves LOC and makes
future extensions easier.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 168 ++++++++++-----------------
 1 file changed, 61 insertions(+), 107 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index a5590988fc69..5b8da7a67df4 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -1214,45 +1214,79 @@ wrp_test_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 	return 0;
 }
 
-static int
-wrp_cmp_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
-	    enum br_mask br_mask, bool swap)
+static const struct jmp_code_map {
+	enum br_mask br_mask;
+	bool swap;
+} jmp_code_map[] = {
+	[BPF_JGT >> 4]	= { BR_BLO, true },
+	[BPF_JGE >> 4]	= { BR_BHS, false },
+	[BPF_JLT >> 4]	= { BR_BLO, false },
+	[BPF_JLE >> 4]	= { BR_BHS, true },
+	[BPF_JSGT >> 4]	= { BR_BLT, true },
+	[BPF_JSGE >> 4]	= { BR_BGE, false },
+	[BPF_JSLT >> 4]	= { BR_BLT, false },
+	[BPF_JSLE >> 4]	= { BR_BGE, true },
+};
+
+static const struct jmp_code_map *nfp_jmp_code_get(struct nfp_insn_meta *meta)
+{
+	unsigned int op;
+
+	op = BPF_OP(meta->insn.code) >> 4;
+	/* br_mask of 0 is BR_BEQ which we don't use in jump code table */
+	if (WARN_ONCE(op >= ARRAY_SIZE(jmp_code_map) ||
+		      !jmp_code_map[op].br_mask,
+		      "no code found for jump instruction"))
+		return NULL;
+
+	return &jmp_code_map[op];
+}
+
+static int cmp_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	const struct bpf_insn *insn = &meta->insn;
 	u64 imm = insn->imm; /* sign extend */
+	const struct jmp_code_map *code;
 	u8 reg = insn->dst_reg * 2;
 	swreg tmp_reg;
 
+	code = nfp_jmp_code_get(meta);
+	if (!code)
+		return -EINVAL;
+
 	tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
-	if (!swap)
+	if (!code->swap)
 		emit_alu(nfp_prog, reg_none(), reg_a(reg), ALU_OP_SUB, tmp_reg);
 	else
 		emit_alu(nfp_prog, reg_none(), tmp_reg, ALU_OP_SUB, reg_a(reg));
 
 	tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
-	if (!swap)
+	if (!code->swap)
 		emit_alu(nfp_prog, reg_none(),
 			 reg_a(reg + 1), ALU_OP_SUB_C, tmp_reg);
 	else
 		emit_alu(nfp_prog, reg_none(),
 			 tmp_reg, ALU_OP_SUB_C, reg_a(reg + 1));
 
-	emit_br(nfp_prog, br_mask, insn->off, 0);
+	emit_br(nfp_prog, code->br_mask, insn->off, 0);
 
 	return 0;
 }
 
-static int
-wrp_cmp_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
-	    enum br_mask br_mask, bool swap)
+static int cmp_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	const struct bpf_insn *insn = &meta->insn;
+	const struct jmp_code_map *code;
 	u8 areg, breg;
 
+	code = nfp_jmp_code_get(meta);
+	if (!code)
+		return -EINVAL;
+
 	areg = insn->dst_reg * 2;
 	breg = insn->src_reg * 2;
 
-	if (swap) {
+	if (code->swap) {
 		areg ^= breg;
 		breg ^= areg;
 		areg ^= breg;
@@ -1261,7 +1295,7 @@ wrp_cmp_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 	emit_alu(nfp_prog, reg_none(), reg_a(areg), ALU_OP_SUB, reg_b(breg));
 	emit_alu(nfp_prog, reg_none(),
 		 reg_a(areg + 1), ALU_OP_SUB_C, reg_b(breg + 1));
-	emit_br(nfp_prog, br_mask, insn->off, 0);
+	emit_br(nfp_prog, code->br_mask, insn->off, 0);
 
 	return 0;
 }
@@ -2283,46 +2317,6 @@ static int jeq_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	return 0;
 }
 
-static int jgt_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_imm(nfp_prog, meta, BR_BLO, true);
-}
-
-static int jge_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_imm(nfp_prog, meta, BR_BHS, false);
-}
-
-static int jlt_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_imm(nfp_prog, meta, BR_BLO, false);
-}
-
-static int jle_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_imm(nfp_prog, meta, BR_BHS, true);
-}
-
-static int jsgt_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_imm(nfp_prog, meta, BR_BLT, true);
-}
-
-static int jsge_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_imm(nfp_prog, meta, BR_BGE, false);
-}
-
-static int jslt_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_imm(nfp_prog, meta, BR_BLT, false);
-}
-
-static int jsle_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_imm(nfp_prog, meta, BR_BGE, true);
-}
-
 static int jset_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	const struct bpf_insn *insn = &meta->insn;
@@ -2392,46 +2386,6 @@ static int jeq_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	return 0;
 }
 
-static int jgt_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_reg(nfp_prog, meta, BR_BLO, true);
-}
-
-static int jge_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_reg(nfp_prog, meta, BR_BHS, false);
-}
-
-static int jlt_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_reg(nfp_prog, meta, BR_BLO, false);
-}
-
-static int jle_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_reg(nfp_prog, meta, BR_BHS, true);
-}
-
-static int jsgt_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_reg(nfp_prog, meta, BR_BLT, true);
-}
-
-static int jsge_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_reg(nfp_prog, meta, BR_BGE, false);
-}
-
-static int jslt_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_reg(nfp_prog, meta, BR_BLT, false);
-}
-
-static int jsle_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_reg(nfp_prog, meta, BR_BGE, true);
-}
-
 static int jset_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	return wrp_test_reg(nfp_prog, meta, ALU_OP_AND, BR_BNE);
@@ -2520,25 +2474,25 @@ static const instr_cb_t instr_cb[256] = {
 	[BPF_ST | BPF_MEM | BPF_DW] =	mem_st8,
 	[BPF_JMP | BPF_JA | BPF_K] =	jump,
 	[BPF_JMP | BPF_JEQ | BPF_K] =	jeq_imm,
-	[BPF_JMP | BPF_JGT | BPF_K] =	jgt_imm,
-	[BPF_JMP | BPF_JGE | BPF_K] =	jge_imm,
-	[BPF_JMP | BPF_JLT | BPF_K] =	jlt_imm,
-	[BPF_JMP | BPF_JLE | BPF_K] =	jle_imm,
-	[BPF_JMP | BPF_JSGT | BPF_K] =  jsgt_imm,
-	[BPF_JMP | BPF_JSGE | BPF_K] =  jsge_imm,
-	[BPF_JMP | BPF_JSLT | BPF_K] =  jslt_imm,
-	[BPF_JMP | BPF_JSLE | BPF_K] =  jsle_imm,
+	[BPF_JMP | BPF_JGT | BPF_K] =	cmp_imm,
+	[BPF_JMP | BPF_JGE | BPF_K] =	cmp_imm,
+	[BPF_JMP | BPF_JLT | BPF_K] =	cmp_imm,
+	[BPF_JMP | BPF_JLE | BPF_K] =	cmp_imm,
+	[BPF_JMP | BPF_JSGT | BPF_K] =  cmp_imm,
+	[BPF_JMP | BPF_JSGE | BPF_K] =  cmp_imm,
+	[BPF_JMP | BPF_JSLT | BPF_K] =  cmp_imm,
+	[BPF_JMP | BPF_JSLE | BPF_K] =  cmp_imm,
 	[BPF_JMP | BPF_JSET | BPF_K] =	jset_imm,
 	[BPF_JMP | BPF_JNE | BPF_K] =	jne_imm,
 	[BPF_JMP | BPF_JEQ | BPF_X] =	jeq_reg,
-	[BPF_JMP | BPF_JGT | BPF_X] =	jgt_reg,
-	[BPF_JMP | BPF_JGE | BPF_X] =	jge_reg,
-	[BPF_JMP | BPF_JLT | BPF_X] =	jlt_reg,
-	[BPF_JMP | BPF_JLE | BPF_X] =	jle_reg,
-	[BPF_JMP | BPF_JSGT | BPF_X] =  jsgt_reg,
-	[BPF_JMP | BPF_JSGE | BPF_X] =  jsge_reg,
-	[BPF_JMP | BPF_JSLT | BPF_X] =  jslt_reg,
-	[BPF_JMP | BPF_JSLE | BPF_X] =  jsle_reg,
+	[BPF_JMP | BPF_JGT | BPF_X] =	cmp_reg,
+	[BPF_JMP | BPF_JGE | BPF_X] =	cmp_reg,
+	[BPF_JMP | BPF_JLT | BPF_X] =	cmp_reg,
+	[BPF_JMP | BPF_JLE | BPF_X] =	cmp_reg,
+	[BPF_JMP | BPF_JSGT | BPF_X] =  cmp_reg,
+	[BPF_JMP | BPF_JSGE | BPF_X] =  cmp_reg,
+	[BPF_JMP | BPF_JSLT | BPF_X] =  cmp_reg,
+	[BPF_JMP | BPF_JSLE | BPF_X] =  cmp_reg,
 	[BPF_JMP | BPF_JSET | BPF_X] =	jset_reg,
 	[BPF_JMP | BPF_JNE | BPF_X] =	jne_reg,
 	[BPF_JMP | BPF_CALL] =		call,
-- 
2.16.2

^ permalink raw reply related

* [PATCH bpf-next 4/4] nfp: bpf: optimize comparisons to negative constants
From: Jakub Kicinski @ 2018-04-25  4:22 UTC (permalink / raw)
  To: alexei.starovoitov, daniel; +Cc: oss-drivers, netdev, Jakub Kicinski
In-Reply-To: <20180425042239.27869-1-jakub.kicinski@netronome.com>

Comparison instruction requires a subtraction.  If the constant
is negative we are more likely to fit it into a NFP instruction
directly if we change the sign and use addition.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c  | 42 +++++++++++++++++++--------
 drivers/net/ethernet/netronome/nfp/bpf/main.h |  6 +++-
 2 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 5b8da7a67df4..65f0791cae0c 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -1247,6 +1247,7 @@ static int cmp_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	const struct bpf_insn *insn = &meta->insn;
 	u64 imm = insn->imm; /* sign extend */
 	const struct jmp_code_map *code;
+	enum alu_op alu_op, carry_op;
 	u8 reg = insn->dst_reg * 2;
 	swreg tmp_reg;
 
@@ -1254,19 +1255,22 @@ static int cmp_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	if (!code)
 		return -EINVAL;
 
+	alu_op = meta->jump_neg_op ? ALU_OP_ADD : ALU_OP_SUB;
+	carry_op = meta->jump_neg_op ? ALU_OP_ADD_C : ALU_OP_SUB_C;
+
 	tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
 	if (!code->swap)
-		emit_alu(nfp_prog, reg_none(), reg_a(reg), ALU_OP_SUB, tmp_reg);
+		emit_alu(nfp_prog, reg_none(), reg_a(reg), alu_op, tmp_reg);
 	else
-		emit_alu(nfp_prog, reg_none(), tmp_reg, ALU_OP_SUB, reg_a(reg));
+		emit_alu(nfp_prog, reg_none(), tmp_reg, alu_op, reg_a(reg));
 
 	tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
 	if (!code->swap)
 		emit_alu(nfp_prog, reg_none(),
-			 reg_a(reg + 1), ALU_OP_SUB_C, tmp_reg);
+			 reg_a(reg + 1), carry_op, tmp_reg);
 	else
 		emit_alu(nfp_prog, reg_none(),
-			 tmp_reg, ALU_OP_SUB_C, reg_a(reg + 1));
+			 tmp_reg, carry_op, reg_a(reg + 1));
 
 	emit_br(nfp_prog, code->br_mask, insn->off, 0);
 
@@ -2745,21 +2749,35 @@ static void nfp_bpf_opt_neg_add_sub(struct nfp_prog *nfp_prog)
 			continue;
 
 		if (BPF_CLASS(insn.code) != BPF_ALU &&
-		    BPF_CLASS(insn.code) != BPF_ALU64)
+		    BPF_CLASS(insn.code) != BPF_ALU64 &&
+		    BPF_CLASS(insn.code) != BPF_JMP)
 			continue;
 		if (BPF_SRC(insn.code) != BPF_K)
 			continue;
 		if (insn.imm >= 0)
 			continue;
 
-		if (BPF_OP(insn.code) == BPF_ADD)
-			insn.code = BPF_CLASS(insn.code) | BPF_SUB;
-		else if (BPF_OP(insn.code) == BPF_SUB)
-			insn.code = BPF_CLASS(insn.code) | BPF_ADD;
-		else
-			continue;
+		if (BPF_CLASS(insn.code) == BPF_JMP) {
+			switch (BPF_OP(insn.code)) {
+			case BPF_JGE:
+			case BPF_JSGE:
+			case BPF_JLT:
+			case BPF_JSLT:
+				meta->jump_neg_op = true;
+				break;
+			default:
+				continue;
+			}
+		} else {
+			if (BPF_OP(insn.code) == BPF_ADD)
+				insn.code = BPF_CLASS(insn.code) | BPF_SUB;
+			else if (BPF_OP(insn.code) == BPF_SUB)
+				insn.code = BPF_CLASS(insn.code) | BPF_ADD;
+			else
+				continue;
 
-		meta->insn.code = insn.code | BPF_K;
+			meta->insn.code = insn.code | BPF_K;
+		}
 
 		meta->insn.imm = -insn.imm;
 	}
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 4981c8944ca3..68b5d326483d 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -236,6 +236,7 @@ struct nfp_bpf_reg_state {
  * @xadd_over_16bit: 16bit immediate is not guaranteed
  * @xadd_maybe_16bit: 16bit immediate is possible
  * @jmp_dst: destination info for jump instructions
+ * @jump_neg_op: jump instruction has inverted immediate, use ADD instead of SUB
  * @func_id: function id for call instructions
  * @arg1: arg1 for call instructions
  * @arg2: arg2 for call instructions
@@ -264,7 +265,10 @@ struct nfp_insn_meta {
 			bool xadd_maybe_16bit;
 		};
 		/* jump */
-		struct nfp_insn_meta *jmp_dst;
+		struct {
+			struct nfp_insn_meta *jmp_dst;
+			bool jump_neg_op;
+		};
 		/* function calls */
 		struct {
 			u32 func_id;
-- 
2.16.2

^ permalink raw reply related

* [RFC v3 3/5] virtio_ring: add packed ring support
From: Tiwei Bie @ 2018-04-25  5:15 UTC (permalink / raw)
  To: mst, jasowang, virtualization, linux-kernel, netdev
  Cc: wexu, jfreimann, tiwei.bie
In-Reply-To: <20180425051550.24342-1-tiwei.bie@intel.com>

This commit introduces the basic support (without EVENT_IDX)
for packed ring.

Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
---
 drivers/virtio/virtio_ring.c | 444 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 434 insertions(+), 10 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index e164822ca66e..0181e93897be 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -58,7 +58,8 @@
 
 struct vring_desc_state {
 	void *data;			/* Data for callback. */
-	struct vring_desc *indir_desc;	/* Indirect descriptor, if any. */
+	void *indir_desc;		/* Indirect descriptor, if any. */
+	int num;			/* Descriptor list length. */
 };
 
 struct vring_virtqueue {
@@ -142,6 +143,16 @@ struct vring_virtqueue {
 
 #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
 
+static inline bool virtqueue_use_indirect(struct virtqueue *_vq,
+					  unsigned int total_sg)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	/* If the host supports indirect descriptor tables, and we have multiple
+	 * buffers, then go indirect. FIXME: tune this threshold */
+	return (vq->indirect && total_sg > 1 && vq->vq.num_free);
+}
+
 /*
  * Modern virtio devices have feature bits to specify whether they need a
  * quirk and bypass the IOMMU. If not there, just use the DMA API.
@@ -327,9 +338,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
 
 	head = vq->free_head;
 
-	/* If the host supports indirect descriptor tables, and we have multiple
-	 * buffers, then go indirect. FIXME: tune this threshold */
-	if (vq->indirect && total_sg > 1 && vq->vq.num_free)
+	if (virtqueue_use_indirect(_vq, total_sg))
 		desc = alloc_indirect_split(_vq, total_sg, gfp);
 	else {
 		desc = NULL;
@@ -741,6 +750,49 @@ static inline unsigned vring_size_packed(unsigned int num, unsigned long align)
 		& ~(align - 1)) + sizeof(struct vring_packed_desc_event) * 2;
 }
 
+static void vring_unmap_one_packed(const struct vring_virtqueue *vq,
+				   struct vring_packed_desc *desc)
+{
+	u16 flags;
+
+	if (!vring_use_dma_api(vq->vq.vdev))
+		return;
+
+	flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
+
+	if (flags & VRING_DESC_F_INDIRECT) {
+		dma_unmap_single(vring_dma_dev(vq),
+				 virtio64_to_cpu(vq->vq.vdev, desc->addr),
+				 virtio32_to_cpu(vq->vq.vdev, desc->len),
+				 (flags & VRING_DESC_F_WRITE) ?
+				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
+	} else {
+		dma_unmap_page(vring_dma_dev(vq),
+			       virtio64_to_cpu(vq->vq.vdev, desc->addr),
+			       virtio32_to_cpu(vq->vq.vdev, desc->len),
+			       (flags & VRING_DESC_F_WRITE) ?
+			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
+	}
+}
+
+static struct vring_packed_desc *alloc_indirect_packed(struct virtqueue *_vq,
+						       unsigned int total_sg,
+						       gfp_t gfp)
+{
+	struct vring_packed_desc *desc;
+
+	/*
+	 * We require lowmem mappings for the descriptors because
+	 * otherwise virt_to_phys will give us bogus addresses in the
+	 * virtqueue.
+	 */
+	gfp &= ~__GFP_HIGHMEM;
+
+	desc = kmalloc(total_sg * sizeof(struct vring_packed_desc), gfp);
+
+	return desc;
+}
+
 static inline int virtqueue_add_packed(struct virtqueue *_vq,
 				       struct scatterlist *sgs[],
 				       unsigned int total_sg,
@@ -750,47 +802,419 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
 				       void *ctx,
 				       gfp_t gfp)
 {
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	struct vring_packed_desc *desc;
+	struct scatterlist *sg;
+	unsigned int i, n, descs_used, uninitialized_var(prev), err_idx;
+	__virtio16 uninitialized_var(head_flags), flags;
+	int head, wrap_counter;
+	bool indirect;
+
+	START_USE(vq);
+
+	BUG_ON(data == NULL);
+	BUG_ON(ctx && vq->indirect);
+
+	if (unlikely(vq->broken)) {
+		END_USE(vq);
+		return -EIO;
+	}
+
+#ifdef DEBUG
+	{
+		ktime_t now = ktime_get();
+
+		/* No kick or get, with .1 second between?  Warn. */
+		if (vq->last_add_time_valid)
+			WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
+					    > 100);
+		vq->last_add_time = now;
+		vq->last_add_time_valid = true;
+	}
+#endif
+
+	BUG_ON(total_sg == 0);
+
+	head = vq->next_avail_idx;
+	wrap_counter = vq->wrap_counter;
+
+	if (virtqueue_use_indirect(_vq, total_sg))
+		desc = alloc_indirect_packed(_vq, total_sg, gfp);
+	else {
+		desc = NULL;
+		WARN_ON_ONCE(total_sg > vq->vring_packed.num && !vq->indirect);
+	}
+
+	if (desc) {
+		/* Use a single buffer which doesn't continue */
+		indirect = true;
+		/* Set up rest to use this indirect table. */
+		i = 0;
+		descs_used = 1;
+	} else {
+		indirect = false;
+		desc = vq->vring_packed.desc;
+		i = head;
+		descs_used = total_sg;
+	}
+
+	if (vq->vq.num_free < descs_used) {
+		pr_debug("Can't add buf len %i - avail = %i\n",
+			 descs_used, vq->vq.num_free);
+		/* FIXME: for historical reasons, we force a notify here if
+		 * there are outgoing parts to the buffer.  Presumably the
+		 * host should service the ring ASAP. */
+		if (out_sgs)
+			vq->notify(&vq->vq);
+		if (indirect)
+			kfree(desc);
+		END_USE(vq);
+		return -ENOSPC;
+	}
+
+	for (n = 0; n < out_sgs + in_sgs; n++) {
+		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
+			dma_addr_t addr = vring_map_one_sg(vq, sg, n < out_sgs ?
+					       DMA_TO_DEVICE : DMA_FROM_DEVICE);
+			if (vring_mapping_error(vq, addr))
+				goto unmap_release;
+
+			flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
+					(n < out_sgs ? 0 : VRING_DESC_F_WRITE) |
+					VRING_DESC_F_AVAIL(vq->wrap_counter) |
+					VRING_DESC_F_USED(!vq->wrap_counter));
+			if (!indirect && i == head)
+				head_flags = flags;
+			else
+				desc[i].flags = flags;
+
+			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
+			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
+			i++;
+			if (!indirect && i >= vq->vring_packed.num) {
+				i = 0;
+				vq->wrap_counter ^= 1;
+			}
+		}
+	}
+
+	prev = (i > 0 ? i : vq->vring_packed.num) - 1;
+	desc[prev].id = cpu_to_virtio32(_vq->vdev, head);
+
+	/* Last one doesn't continue. */
+	if (total_sg == 1)
+		head_flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
+	else
+		desc[prev].flags &= cpu_to_virtio16(_vq->vdev,
+						~VRING_DESC_F_NEXT);
+
+	if (indirect) {
+		/* Now that the indirect table is filled in, map it. */
+		dma_addr_t addr = vring_map_single(
+			vq, desc, total_sg * sizeof(struct vring_packed_desc),
+			DMA_TO_DEVICE);
+		if (vring_mapping_error(vq, addr))
+			goto unmap_release;
+
+		head_flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT |
+					     VRING_DESC_F_AVAIL(wrap_counter) |
+					     VRING_DESC_F_USED(!wrap_counter));
+		vq->vring_packed.desc[head].addr = cpu_to_virtio64(_vq->vdev,
+								   addr);
+		vq->vring_packed.desc[head].len = cpu_to_virtio32(_vq->vdev,
+				total_sg * sizeof(struct vring_packed_desc));
+		vq->vring_packed.desc[head].id = cpu_to_virtio32(_vq->vdev,
+								 head);
+	}
+
+	/* We're using some buffers from the free list. */
+	vq->vq.num_free -= descs_used;
+
+	/* Update free pointer */
+	if (indirect) {
+		n = head + 1;
+		if (n >= vq->vring_packed.num) {
+			n = 0;
+			vq->wrap_counter ^= 1;
+		}
+		vq->next_avail_idx = n;
+	} else
+		vq->next_avail_idx = i;
+
+	/* Store token and indirect buffer state. */
+	vq->desc_state[head].num = descs_used;
+	vq->desc_state[head].data = data;
+	if (indirect)
+		vq->desc_state[head].indir_desc = desc;
+	else
+		vq->desc_state[head].indir_desc = ctx;
+
+	/* A driver MUST NOT make the first descriptor in the list
+	 * available before all subsequent descriptors comprising
+	 * the list are made available. */
+	virtio_wmb(vq->weak_barriers);
+	vq->vring_packed.desc[head].flags = head_flags;
+	vq->num_added += descs_used;
+
+	pr_debug("Added buffer head %i to %p\n", head, vq);
+	END_USE(vq);
+
+	return 0;
+
+unmap_release:
+	err_idx = i;
+	i = head;
+
+	for (n = 0; n < total_sg; n++) {
+		if (i == err_idx)
+			break;
+		vring_unmap_one_packed(vq, &desc[i]);
+		i++;
+		if (!indirect && i >= vq->vring_packed.num)
+			i = 0;
+	}
+
+	vq->wrap_counter = wrap_counter;
+
+	if (indirect)
+		kfree(desc);
+
+	END_USE(vq);
 	return -EIO;
 }
 
 static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
 {
-	return false;
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 flags;
+	bool needs_kick;
+	u32 snapshot;
+
+	START_USE(vq);
+	/* We need to expose the new flags value before checking notification
+	 * suppressions. */
+	virtio_mb(vq->weak_barriers);
+
+	snapshot = *(u32 *)vq->vring_packed.device;
+	flags = cpu_to_virtio16(_vq->vdev, snapshot >> 16) & 0x3;
+
+#ifdef DEBUG
+	if (vq->last_add_time_valid) {
+		WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
+					      vq->last_add_time)) > 100);
+	}
+	vq->last_add_time_valid = false;
+#endif
+
+	needs_kick = (flags != VRING_EVENT_F_DISABLE);
+	END_USE(vq);
+	return needs_kick;
+}
+
+static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
+			      void **ctx)
+{
+	struct vring_packed_desc *desc;
+	unsigned int i, j;
+
+	/* Clear data ptr. */
+	vq->desc_state[head].data = NULL;
+
+	i = head;
+
+	for (j = 0; j < vq->desc_state[head].num; j++) {
+		desc = &vq->vring_packed.desc[i];
+		vring_unmap_one_packed(vq, desc);
+		i++;
+		if (i >= vq->vring_packed.num)
+			i = 0;
+	}
+
+	vq->vq.num_free += vq->desc_state[head].num;
+
+	if (vq->indirect) {
+		u32 len;
+
+		/* Free the indirect table, if any, now that it's unmapped. */
+		desc = vq->desc_state[head].indir_desc;
+		if (!desc)
+			return;
+
+		len = virtio32_to_cpu(vq->vq.vdev,
+				      vq->vring_packed.desc[head].len);
+
+		for (j = 0; j < len / sizeof(struct vring_packed_desc); j++)
+			vring_unmap_one_packed(vq, &desc[j]);
+
+		kfree(desc);
+		vq->desc_state[head].indir_desc = NULL;
+	} else if (ctx) {
+		*ctx = vq->desc_state[head].indir_desc;
+	}
 }
 
 static inline bool more_used_packed(const struct vring_virtqueue *vq)
 {
-	return false;
+	u16 last_used, flags;
+	bool avail, used;
+
+	if (vq->vq.num_free == vq->vring_packed.num)
+		return false;
+
+	last_used = vq->last_used_idx;
+	flags = virtio16_to_cpu(vq->vq.vdev,
+				vq->vring_packed.desc[last_used].flags);
+	avail = flags & VRING_DESC_F_AVAIL(1);
+	used = flags & VRING_DESC_F_USED(1);
+
+	return avail == used;
 }
 
 static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
 					  unsigned int *len,
 					  void **ctx)
 {
-	return NULL;
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	void *ret;
+	unsigned int i;
+	u16 last_used;
+
+	START_USE(vq);
+
+	if (unlikely(vq->broken)) {
+		END_USE(vq);
+		return NULL;
+	}
+
+	if (!more_used_packed(vq)) {
+		pr_debug("No more buffers in queue\n");
+		END_USE(vq);
+		return NULL;
+	}
+
+	/* Only get used elements after they have been exposed by host. */
+	virtio_rmb(vq->weak_barriers);
+
+	last_used = vq->last_used_idx;
+	i = virtio32_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].id);
+	*len = virtio32_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].len);
+
+	if (unlikely(i >= vq->vring_packed.num)) {
+		BAD_RING(vq, "id %u out of range\n", i);
+		return NULL;
+	}
+	if (unlikely(!vq->desc_state[i].data)) {
+		BAD_RING(vq, "id %u is not a head!\n", i);
+		return NULL;
+	}
+
+	/* detach_buf_packed clears data, so grab it now. */
+	ret = vq->desc_state[i].data;
+	detach_buf_packed(vq, i, ctx);
+
+	vq->last_used_idx += vq->desc_state[i].num;
+	if (vq->last_used_idx >= vq->vring_packed.num)
+		vq->last_used_idx -= vq->vring_packed.num;
+
+#ifdef DEBUG
+	vq->last_add_time_valid = false;
+#endif
+
+	END_USE(vq);
+	return ret;
 }
 
 static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
 {
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	if (vq->event_flags_shadow != VRING_EVENT_F_DISABLE) {
+		vq->event_flags_shadow = VRING_EVENT_F_DISABLE;
+		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
+							vq->event_flags_shadow);
+	}
 }
 
 static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
 {
-	return 0;
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	START_USE(vq);
+
+	/* We optimistically turn back on interrupts, then check if there was
+	 * more to do. */
+
+	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
+		virtio_wmb(vq->weak_barriers);
+		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
+		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
+							vq->event_flags_shadow);
+	}
+
+	END_USE(vq);
+	return vq->last_used_idx;
 }
 
 static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
 {
-	return false;
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	bool avail, used;
+	u16 flags;
+
+	virtio_mb(vq->weak_barriers);
+	flags = virtio16_to_cpu(vq->vq.vdev,
+			vq->vring_packed.desc[last_used_idx].flags);
+	avail = flags & VRING_DESC_F_AVAIL(1);
+	used = flags & VRING_DESC_F_USED(1);
+	return avail == used;
 }
 
 static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
 {
-	return false;
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	START_USE(vq);
+
+	/* We optimistically turn back on interrupts, then check if there was
+	 * more to do. */
+
+	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
+		virtio_wmb(vq->weak_barriers);
+		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
+		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
+							vq->event_flags_shadow);
+	}
+
+	if (more_used_packed(vq)) {
+		END_USE(vq);
+		return false;
+	}
+
+	END_USE(vq);
+	return true;
 }
 
 static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq)
 {
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	unsigned int i;
+	void *buf;
+
+	START_USE(vq);
+
+	for (i = 0; i < vq->vring_packed.num; i++) {
+		if (!vq->desc_state[i].data)
+			continue;
+		/* detach_buf clears data, so grab it now. */
+		buf = vq->desc_state[i].data;
+		detach_buf_packed(vq, i, NULL);
+		END_USE(vq);
+		return buf;
+	}
+	/* That should have freed everything. */
+	BUG_ON(vq->vq.num_free != vq->vring_packed.num);
+
+	END_USE(vq);
 	return NULL;
 }
 
-- 
2.11.0

^ permalink raw reply related

* [RFC v3 0/5] virtio: support packed ring
From: Tiwei Bie @ 2018-04-25  5:15 UTC (permalink / raw)
  To: mst, jasowang, virtualization, linux-kernel, netdev
  Cc: wexu, jfreimann, tiwei.bie

Hello everyone,

This RFC implements packed ring support in virtio driver.

Some simple functional tests have been done with Jason's
packed ring implementation in vhost:

https://lkml.org/lkml/2018/4/23/12

Both of ping and netperf worked as expected (with EVENT_IDX
disabled). But there are below known issues:

1. Reloading the guest driver will break the Tx/Rx;
2. Zeroing the flags when detaching a used desc will
   break the guest -> host path.

Some simple functional tests have also been done with
Wei's packed ring implementation in QEMU:

http://lists.nongnu.org/archive/html/qemu-devel/2018-04/msg00342.html

Both of ping and netperf worked as expected (with EVENT_IDX
disabled). Reloading the guest driver also worked as expected.

TODO:
- Refinements (for code and commit log) and bug fixes;
- Discuss/fix/test EVENT_IDX support;
- Test devices other than net;

RFC v2 -> RFC v3:
- Split into small patches (Jason);
- Add helper virtqueue_use_indirect() (Jason);
- Just set id for the last descriptor of a list (Jason);
- Calculate the prev in virtqueue_add_packed() (Jason);
- Fix/improve desc suppression code (Jason/MST);
- Refine the code layout for XXX_split/packed and wrappers (MST);
- Fix the comments and API in uapi (MST);
- Remove the BUG_ON() for indirect (Jason);
- Some other refinements and bug fixes;

RFC v1 -> RFC v2:
- Add indirect descriptor support - compile test only;
- Add event suppression supprt - compile test only;
- Move vring_packed_init() out of uapi (Jason, MST);
- Merge two loops into one in virtqueue_add_packed() (Jason);
- Split vring_unmap_one() for packed ring and split ring (Jason);
- Avoid using '%' operator (Jason);
- Rename free_head -> next_avail_idx (Jason);
- Add comments for virtio_wmb() in virtqueue_add_packed() (Jason);
- Some other refinements and bug fixes;

Thanks!

Tiwei Bie (5):
  virtio: add packed ring definitions
  virtio_ring: support creating packed ring
  virtio_ring: add packed ring support
  virtio_ring: add event idx support in packed ring
  virtio_ring: enable packed ring

 drivers/virtio/virtio_ring.c       | 1271 ++++++++++++++++++++++++++++--------
 include/linux/virtio_ring.h        |    8 +-
 include/uapi/linux/virtio_config.h |   12 +-
 include/uapi/linux/virtio_ring.h   |   36 +
 4 files changed, 1049 insertions(+), 278 deletions(-)

-- 
2.11.0

^ permalink raw reply

* [RFC v3 1/5] virtio: add packed ring definitions
From: Tiwei Bie @ 2018-04-25  5:15 UTC (permalink / raw)
  To: mst, jasowang, virtualization, linux-kernel, netdev; +Cc: wexu
In-Reply-To: <20180425051550.24342-1-tiwei.bie@intel.com>

Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
---
 include/uapi/linux/virtio_config.h | 12 +++++++++++-
 include/uapi/linux/virtio_ring.h   | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
index 308e2096291f..a6e392325e3a 100644
--- a/include/uapi/linux/virtio_config.h
+++ b/include/uapi/linux/virtio_config.h
@@ -49,7 +49,7 @@
  * transport being used (eg. virtio_ring), the rest are per-device feature
  * bits. */
 #define VIRTIO_TRANSPORT_F_START	28
-#define VIRTIO_TRANSPORT_F_END		34
+#define VIRTIO_TRANSPORT_F_END		36
 
 #ifndef VIRTIO_CONFIG_NO_LEGACY
 /* Do we get callbacks when the ring is completely used, even if we've
@@ -71,4 +71,14 @@
  * this is for compatibility with legacy systems.
  */
 #define VIRTIO_F_IOMMU_PLATFORM		33
+
+/* This feature indicates support for the packed virtqueue layout. */
+#define VIRTIO_F_RING_PACKED		34
+
+/*
+ * This feature indicates that all buffers are used by the device
+ * in the same order in which they have been made available.
+ */
+#define VIRTIO_F_IN_ORDER		35
+
 #endif /* _UAPI_LINUX_VIRTIO_CONFIG_H */
diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
index 6d5d5faa989b..3932cb80c347 100644
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -44,6 +44,9 @@
 /* This means the buffer contains a list of buffer descriptors. */
 #define VRING_DESC_F_INDIRECT	4
 
+#define VRING_DESC_F_AVAIL(b)	((b) << 7)
+#define VRING_DESC_F_USED(b)	((b) << 15)
+
 /* The Host uses this in used->flags to advise the Guest: don't kick me when
  * you add a buffer.  It's unreliable, so it's simply an optimization.  Guest
  * will still kick if it's out of buffers. */
@@ -53,6 +56,10 @@
  * optimization.  */
 #define VRING_AVAIL_F_NO_INTERRUPT	1
 
+#define VRING_EVENT_F_ENABLE	0x0
+#define VRING_EVENT_F_DISABLE	0x1
+#define VRING_EVENT_F_DESC	0x2
+
 /* We support indirect buffer descriptors */
 #define VIRTIO_RING_F_INDIRECT_DESC	28
 
@@ -171,4 +178,33 @@ static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
 	return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old);
 }
 
+struct vring_packed_desc_event {
+	/* __virtio16 off  : 15; // Descriptor Event Offset
+	 * __virtio16 wrap : 1;  // Descriptor Event Wrap Counter */
+	__virtio16 off_wrap;
+	/* __virtio16 flags : 2; // Descriptor Event Flags */
+	__virtio16 flags;
+};
+
+struct vring_packed_desc {
+	/* Buffer Address. */
+	__virtio64 addr;
+	/* Buffer Length. */
+	__virtio32 len;
+	/* Buffer ID. */
+	__virtio16 id;
+	/* The flags depending on descriptor type. */
+	__virtio16 flags;
+};
+
+struct vring_packed {
+	unsigned int num;
+
+	struct vring_packed_desc *desc;
+
+	struct vring_packed_desc_event *driver;
+
+	struct vring_packed_desc_event *device;
+};
+
 #endif /* _UAPI_LINUX_VIRTIO_RING_H */
-- 
2.11.0

^ permalink raw reply related

* [RFC v3 2/5] virtio_ring: support creating packed ring
From: Tiwei Bie @ 2018-04-25  5:15 UTC (permalink / raw)
  To: mst, jasowang, virtualization, linux-kernel, netdev; +Cc: wexu
In-Reply-To: <20180425051550.24342-1-tiwei.bie@intel.com>

This commit introduces the support for creating packed ring.
All split ring specific functions are added _split suffix.
Some necessary stubs for packed ring are also added.

Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
---
 drivers/virtio/virtio_ring.c | 764 ++++++++++++++++++++++++++++---------------
 include/linux/virtio_ring.h  |   8 +-
 2 files changed, 513 insertions(+), 259 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 71458f493cf8..e164822ca66e 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -64,8 +64,8 @@ struct vring_desc_state {
 struct vring_virtqueue {
 	struct virtqueue vq;
 
-	/* Actual memory layout for this queue */
-	struct vring vring;
+	/* Is this a packed ring? */
+	bool packed;
 
 	/* Can we use weak barriers? */
 	bool weak_barriers;
@@ -79,19 +79,45 @@ struct vring_virtqueue {
 	/* Host publishes avail event idx */
 	bool event;
 
-	/* Head of free buffer list. */
-	unsigned int free_head;
 	/* Number we've added since last sync. */
 	unsigned int num_added;
 
 	/* Last used index we've seen. */
 	u16 last_used_idx;
 
-	/* Last written value to avail->flags */
-	u16 avail_flags_shadow;
+	union {
+		/* Available for split ring */
+		struct {
+			/* Actual memory layout for this queue. */
+			struct vring vring;
 
-	/* Last written value to avail->idx in guest byte order */
-	u16 avail_idx_shadow;
+			/* Head of free buffer list. */
+			unsigned int free_head;
+
+			/* Last written value to avail->flags */
+			u16 avail_flags_shadow;
+
+			/* Last written value to avail->idx in
+			 * guest byte order. */
+			u16 avail_idx_shadow;
+		};
+
+		/* Available for packed ring */
+		struct {
+			/* Actual memory layout for this queue. */
+			struct vring_packed vring_packed;
+
+			/* Driver ring wrap counter. */
+			u8 wrap_counter;
+
+			/* Index of the next avail descriptor. */
+			unsigned int next_avail_idx;
+
+			/* Last written value to driver->flags in
+			 * guest byte order. */
+			u16 event_flags_shadow;
+		};
+	};
 
 	/* How to notify other side. FIXME: commonalize hcalls! */
 	bool (*notify)(struct virtqueue *vq);
@@ -201,8 +227,17 @@ static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
 			      cpu_addr, size, direction);
 }
 
-static void vring_unmap_one(const struct vring_virtqueue *vq,
-			    struct vring_desc *desc)
+static int vring_mapping_error(const struct vring_virtqueue *vq,
+			       dma_addr_t addr)
+{
+	if (!vring_use_dma_api(vq->vq.vdev))
+		return 0;
+
+	return dma_mapping_error(vring_dma_dev(vq), addr);
+}
+
+static void vring_unmap_one_split(const struct vring_virtqueue *vq,
+				  struct vring_desc *desc)
 {
 	u16 flags;
 
@@ -226,17 +261,9 @@ static void vring_unmap_one(const struct vring_virtqueue *vq,
 	}
 }
 
-static int vring_mapping_error(const struct vring_virtqueue *vq,
-			       dma_addr_t addr)
-{
-	if (!vring_use_dma_api(vq->vq.vdev))
-		return 0;
-
-	return dma_mapping_error(vring_dma_dev(vq), addr);
-}
-
-static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
-					 unsigned int total_sg, gfp_t gfp)
+static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
+					       unsigned int total_sg,
+					       gfp_t gfp)
 {
 	struct vring_desc *desc;
 	unsigned int i;
@@ -257,14 +284,14 @@ static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
 	return desc;
 }
 
-static inline int virtqueue_add(struct virtqueue *_vq,
-				struct scatterlist *sgs[],
-				unsigned int total_sg,
-				unsigned int out_sgs,
-				unsigned int in_sgs,
-				void *data,
-				void *ctx,
-				gfp_t gfp)
+static inline int virtqueue_add_split(struct virtqueue *_vq,
+				      struct scatterlist *sgs[],
+				      unsigned int total_sg,
+				      unsigned int out_sgs,
+				      unsigned int in_sgs,
+				      void *data,
+				      void *ctx,
+				      gfp_t gfp)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
 	struct scatterlist *sg;
@@ -303,7 +330,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 	/* If the host supports indirect descriptor tables, and we have multiple
 	 * buffers, then go indirect. FIXME: tune this threshold */
 	if (vq->indirect && total_sg > 1 && vq->vq.num_free)
-		desc = alloc_indirect(_vq, total_sg, gfp);
+		desc = alloc_indirect_split(_vq, total_sg, gfp);
 	else {
 		desc = NULL;
 		WARN_ON_ONCE(total_sg > vq->vring.num && !vq->indirect);
@@ -424,7 +451,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 	for (n = 0; n < total_sg; n++) {
 		if (i == err_idx)
 			break;
-		vring_unmap_one(vq, &desc[i]);
+		vring_unmap_one_split(vq, &desc[i]);
 		i = virtio16_to_cpu(_vq->vdev, vq->vring.desc[i].next);
 	}
 
@@ -435,6 +462,355 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 	return -EIO;
 }
 
+static bool virtqueue_kick_prepare_split(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 new, old;
+	bool needs_kick;
+
+	START_USE(vq);
+	/* We need to expose available array entries before checking avail
+	 * event. */
+	virtio_mb(vq->weak_barriers);
+
+	old = vq->avail_idx_shadow - vq->num_added;
+	new = vq->avail_idx_shadow;
+	vq->num_added = 0;
+
+#ifdef DEBUG
+	if (vq->last_add_time_valid) {
+		WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
+					      vq->last_add_time)) > 100);
+	}
+	vq->last_add_time_valid = false;
+#endif
+
+	if (vq->event) {
+		needs_kick = vring_need_event(virtio16_to_cpu(_vq->vdev, vring_avail_event(&vq->vring)),
+					      new, old);
+	} else {
+		needs_kick = !(vq->vring.used->flags & cpu_to_virtio16(_vq->vdev, VRING_USED_F_NO_NOTIFY));
+	}
+	END_USE(vq);
+	return needs_kick;
+}
+
+static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
+			     void **ctx)
+{
+	unsigned int i, j;
+	__virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
+
+	/* Clear data ptr. */
+	vq->desc_state[head].data = NULL;
+
+	/* Put back on free list: unmap first-level descriptors and find end */
+	i = head;
+
+	while (vq->vring.desc[i].flags & nextflag) {
+		vring_unmap_one_split(vq, &vq->vring.desc[i]);
+		i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next);
+		vq->vq.num_free++;
+	}
+
+	vring_unmap_one_split(vq, &vq->vring.desc[i]);
+	vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head);
+	vq->free_head = head;
+
+	/* Plus final descriptor */
+	vq->vq.num_free++;
+
+	if (vq->indirect) {
+		struct vring_desc *indir_desc = vq->desc_state[head].indir_desc;
+		u32 len;
+
+		/* Free the indirect table, if any, now that it's unmapped. */
+		if (!indir_desc)
+			return;
+
+		len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len);
+
+		BUG_ON(!(vq->vring.desc[head].flags &
+			 cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
+		BUG_ON(len == 0 || len % sizeof(struct vring_desc));
+
+		for (j = 0; j < len / sizeof(struct vring_desc); j++)
+			vring_unmap_one_split(vq, &indir_desc[j]);
+
+		kfree(indir_desc);
+		vq->desc_state[head].indir_desc = NULL;
+	} else if (ctx) {
+		*ctx = vq->desc_state[head].indir_desc;
+	}
+}
+
+static inline bool more_used_split(const struct vring_virtqueue *vq)
+{
+	return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->vring.used->idx);
+}
+
+static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq,
+					 unsigned int *len,
+					 void **ctx)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	void *ret;
+	unsigned int i;
+	u16 last_used;
+
+	START_USE(vq);
+
+	if (unlikely(vq->broken)) {
+		END_USE(vq);
+		return NULL;
+	}
+
+	if (!more_used_split(vq)) {
+		pr_debug("No more buffers in queue\n");
+		END_USE(vq);
+		return NULL;
+	}
+
+	/* Only get used array entries after they have been exposed by host. */
+	virtio_rmb(vq->weak_barriers);
+
+	last_used = (vq->last_used_idx & (vq->vring.num - 1));
+	i = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].id);
+	*len = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].len);
+
+	if (unlikely(i >= vq->vring.num)) {
+		BAD_RING(vq, "id %u out of range\n", i);
+		return NULL;
+	}
+	if (unlikely(!vq->desc_state[i].data)) {
+		BAD_RING(vq, "id %u is not a head!\n", i);
+		return NULL;
+	}
+
+	/* detach_buf_split clears data, so grab it now. */
+	ret = vq->desc_state[i].data;
+	detach_buf_split(vq, i, ctx);
+	vq->last_used_idx++;
+	/* If we expect an interrupt for the next entry, tell host
+	 * by writing event index and flush out the write before
+	 * the read in the next get_buf call. */
+	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
+		virtio_store_mb(vq->weak_barriers,
+				&vring_used_event(&vq->vring),
+				cpu_to_virtio16(_vq->vdev, vq->last_used_idx));
+
+#ifdef DEBUG
+	vq->last_add_time_valid = false;
+#endif
+
+	END_USE(vq);
+	return ret;
+}
+
+static void virtqueue_disable_cb_split(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
+		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
+		if (!vq->event)
+			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
+	}
+}
+
+static unsigned virtqueue_enable_cb_prepare_split(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 last_used_idx;
+
+	START_USE(vq);
+
+	/* We optimistically turn back on interrupts, then check if there was
+	 * more to do. */
+	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
+	 * either clear the flags bit or point the event index at the next
+	 * entry. Always do both to keep code simple. */
+	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
+		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
+		if (!vq->event)
+			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
+	}
+	vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx);
+	END_USE(vq);
+	return last_used_idx;
+}
+
+static bool virtqueue_poll_split(struct virtqueue *_vq, unsigned last_used_idx)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	virtio_mb(vq->weak_barriers);
+	return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->vring.used->idx);
+}
+
+static bool virtqueue_enable_cb_delayed_split(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 bufs;
+
+	START_USE(vq);
+
+	/* We optimistically turn back on interrupts, then check if there was
+	 * more to do. */
+	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
+	 * either clear the flags bit or point the event index at the next
+	 * entry. Always update the event index to keep code simple. */
+	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
+		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
+		if (!vq->event)
+			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
+	}
+	/* TODO: tune this threshold */
+	bufs = (u16)(vq->avail_idx_shadow - vq->last_used_idx) * 3 / 4;
+
+	virtio_store_mb(vq->weak_barriers,
+			&vring_used_event(&vq->vring),
+			cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs));
+
+	if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->vring.used->idx) - vq->last_used_idx) > bufs)) {
+		END_USE(vq);
+		return false;
+	}
+
+	END_USE(vq);
+	return true;
+}
+
+static void *virtqueue_detach_unused_buf_split(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	unsigned int i;
+	void *buf;
+
+	START_USE(vq);
+
+	for (i = 0; i < vq->vring.num; i++) {
+		if (!vq->desc_state[i].data)
+			continue;
+		/* detach_buf clears data, so grab it now. */
+		buf = vq->desc_state[i].data;
+		detach_buf_split(vq, i, NULL);
+		vq->avail_idx_shadow--;
+		vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
+		END_USE(vq);
+		return buf;
+	}
+	/* That should have freed everything. */
+	BUG_ON(vq->vq.num_free != vq->vring.num);
+
+	END_USE(vq);
+	return NULL;
+}
+
+/*
+ * The layout for the packed ring is a continuous chunk of memory
+ * which looks like this.
+ *
+ * struct vring_packed {
+ *	// The actual descriptors (16 bytes each)
+ *	struct vring_packed_desc desc[num];
+ *
+ *	// Padding to the next align boundary.
+ *	char pad[];
+ *
+ *	// Driver Event Suppression
+ *	struct vring_packed_desc_event driver;
+ *
+ *	// Device Event Suppression
+ *	struct vring_packed_desc_event device;
+ * };
+ */
+static inline void vring_init_packed(struct vring_packed *vr, unsigned int num,
+				     void *p, unsigned long align)
+{
+	vr->num = num;
+	vr->desc = p;
+	vr->driver = (void *)(((uintptr_t)p + sizeof(struct vring_packed_desc)
+		* num + align - 1) & ~(align - 1));
+	vr->device = vr->driver + 1;
+}
+
+static inline unsigned vring_size_packed(unsigned int num, unsigned long align)
+{
+	return ((sizeof(struct vring_packed_desc) * num + align - 1)
+		& ~(align - 1)) + sizeof(struct vring_packed_desc_event) * 2;
+}
+
+static inline int virtqueue_add_packed(struct virtqueue *_vq,
+				       struct scatterlist *sgs[],
+				       unsigned int total_sg,
+				       unsigned int out_sgs,
+				       unsigned int in_sgs,
+				       void *data,
+				       void *ctx,
+				       gfp_t gfp)
+{
+	return -EIO;
+}
+
+static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
+{
+	return false;
+}
+
+static inline bool more_used_packed(const struct vring_virtqueue *vq)
+{
+	return false;
+}
+
+static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
+					  unsigned int *len,
+					  void **ctx)
+{
+	return NULL;
+}
+
+static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
+{
+}
+
+static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
+{
+	return 0;
+}
+
+static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
+{
+	return false;
+}
+
+static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
+{
+	return false;
+}
+
+static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq)
+{
+	return NULL;
+}
+
+static inline int virtqueue_add(struct virtqueue *_vq,
+				struct scatterlist *sgs[],
+				unsigned int total_sg,
+				unsigned int out_sgs,
+				unsigned int in_sgs,
+				void *data,
+				void *ctx,
+				gfp_t gfp)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	return vq->packed ? virtqueue_add_packed(_vq, sgs, total_sg, out_sgs,
+						 in_sgs, data, ctx, gfp) :
+			    virtqueue_add_split(_vq, sgs, total_sg, out_sgs,
+						in_sgs, data, ctx, gfp);
+}
+
 /**
  * virtqueue_add_sgs - expose buffers to other end
  * @vq: the struct virtqueue we're talking about.
@@ -551,34 +927,9 @@ EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_ctx);
 bool virtqueue_kick_prepare(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
-	u16 new, old;
-	bool needs_kick;
 
-	START_USE(vq);
-	/* We need to expose available array entries before checking avail
-	 * event. */
-	virtio_mb(vq->weak_barriers);
-
-	old = vq->avail_idx_shadow - vq->num_added;
-	new = vq->avail_idx_shadow;
-	vq->num_added = 0;
-
-#ifdef DEBUG
-	if (vq->last_add_time_valid) {
-		WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
-					      vq->last_add_time)) > 100);
-	}
-	vq->last_add_time_valid = false;
-#endif
-
-	if (vq->event) {
-		needs_kick = vring_need_event(virtio16_to_cpu(_vq->vdev, vring_avail_event(&vq->vring)),
-					      new, old);
-	} else {
-		needs_kick = !(vq->vring.used->flags & cpu_to_virtio16(_vq->vdev, VRING_USED_F_NO_NOTIFY));
-	}
-	END_USE(vq);
-	return needs_kick;
+	return vq->packed ? virtqueue_kick_prepare_packed(_vq) :
+			    virtqueue_kick_prepare_split(_vq);
 }
 EXPORT_SYMBOL_GPL(virtqueue_kick_prepare);
 
@@ -626,58 +977,9 @@ bool virtqueue_kick(struct virtqueue *vq)
 }
 EXPORT_SYMBOL_GPL(virtqueue_kick);
 
-static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
-		       void **ctx)
-{
-	unsigned int i, j;
-	__virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
-
-	/* Clear data ptr. */
-	vq->desc_state[head].data = NULL;
-
-	/* Put back on free list: unmap first-level descriptors and find end */
-	i = head;
-
-	while (vq->vring.desc[i].flags & nextflag) {
-		vring_unmap_one(vq, &vq->vring.desc[i]);
-		i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next);
-		vq->vq.num_free++;
-	}
-
-	vring_unmap_one(vq, &vq->vring.desc[i]);
-	vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head);
-	vq->free_head = head;
-
-	/* Plus final descriptor */
-	vq->vq.num_free++;
-
-	if (vq->indirect) {
-		struct vring_desc *indir_desc = vq->desc_state[head].indir_desc;
-		u32 len;
-
-		/* Free the indirect table, if any, now that it's unmapped. */
-		if (!indir_desc)
-			return;
-
-		len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len);
-
-		BUG_ON(!(vq->vring.desc[head].flags &
-			 cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
-		BUG_ON(len == 0 || len % sizeof(struct vring_desc));
-
-		for (j = 0; j < len / sizeof(struct vring_desc); j++)
-			vring_unmap_one(vq, &indir_desc[j]);
-
-		kfree(indir_desc);
-		vq->desc_state[head].indir_desc = NULL;
-	} else if (ctx) {
-		*ctx = vq->desc_state[head].indir_desc;
-	}
-}
-
 static inline bool more_used(const struct vring_virtqueue *vq)
 {
-	return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->vring.used->idx);
+	return vq->packed ? more_used_packed(vq) : more_used_split(vq);
 }
 
 /**
@@ -700,57 +1002,9 @@ void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len,
 			    void **ctx)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
-	void *ret;
-	unsigned int i;
-	u16 last_used;
 
-	START_USE(vq);
-
-	if (unlikely(vq->broken)) {
-		END_USE(vq);
-		return NULL;
-	}
-
-	if (!more_used(vq)) {
-		pr_debug("No more buffers in queue\n");
-		END_USE(vq);
-		return NULL;
-	}
-
-	/* Only get used array entries after they have been exposed by host. */
-	virtio_rmb(vq->weak_barriers);
-
-	last_used = (vq->last_used_idx & (vq->vring.num - 1));
-	i = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].id);
-	*len = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].len);
-
-	if (unlikely(i >= vq->vring.num)) {
-		BAD_RING(vq, "id %u out of range\n", i);
-		return NULL;
-	}
-	if (unlikely(!vq->desc_state[i].data)) {
-		BAD_RING(vq, "id %u is not a head!\n", i);
-		return NULL;
-	}
-
-	/* detach_buf clears data, so grab it now. */
-	ret = vq->desc_state[i].data;
-	detach_buf(vq, i, ctx);
-	vq->last_used_idx++;
-	/* If we expect an interrupt for the next entry, tell host
-	 * by writing event index and flush out the write before
-	 * the read in the next get_buf call. */
-	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
-		virtio_store_mb(vq->weak_barriers,
-				&vring_used_event(&vq->vring),
-				cpu_to_virtio16(_vq->vdev, vq->last_used_idx));
-
-#ifdef DEBUG
-	vq->last_add_time_valid = false;
-#endif
-
-	END_USE(vq);
-	return ret;
+	return vq->packed ? virtqueue_get_buf_ctx_packed(_vq, len, ctx) :
+			    virtqueue_get_buf_ctx_split(_vq, len, ctx);
 }
 EXPORT_SYMBOL_GPL(virtqueue_get_buf_ctx);
 
@@ -772,12 +1026,10 @@ void virtqueue_disable_cb(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
 
-	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
-		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
-		if (!vq->event)
-			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
-	}
-
+	if (vq->packed)
+		virtqueue_disable_cb_packed(_vq);
+	else
+		virtqueue_disable_cb_split(_vq);
 }
 EXPORT_SYMBOL_GPL(virtqueue_disable_cb);
 
@@ -796,23 +1048,9 @@ EXPORT_SYMBOL_GPL(virtqueue_disable_cb);
 unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
-	u16 last_used_idx;
 
-	START_USE(vq);
-
-	/* We optimistically turn back on interrupts, then check if there was
-	 * more to do. */
-	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
-	 * either clear the flags bit or point the event index at the next
-	 * entry. Always do both to keep code simple. */
-	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
-		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
-		if (!vq->event)
-			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
-	}
-	vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx);
-	END_USE(vq);
-	return last_used_idx;
+	return vq->packed ? virtqueue_enable_cb_prepare_packed(_vq) :
+			    virtqueue_enable_cb_prepare_split(_vq);
 }
 EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare);
 
@@ -829,8 +1067,8 @@ bool virtqueue_poll(struct virtqueue *_vq, unsigned last_used_idx)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
 
-	virtio_mb(vq->weak_barriers);
-	return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->vring.used->idx);
+	return vq->packed ? virtqueue_poll_packed(_vq, last_used_idx) :
+			    virtqueue_poll_split(_vq, last_used_idx);
 }
 EXPORT_SYMBOL_GPL(virtqueue_poll);
 
@@ -868,34 +1106,9 @@ EXPORT_SYMBOL_GPL(virtqueue_enable_cb);
 bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
-	u16 bufs;
 
-	START_USE(vq);
-
-	/* We optimistically turn back on interrupts, then check if there was
-	 * more to do. */
-	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
-	 * either clear the flags bit or point the event index at the next
-	 * entry. Always update the event index to keep code simple. */
-	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
-		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
-		if (!vq->event)
-			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
-	}
-	/* TODO: tune this threshold */
-	bufs = (u16)(vq->avail_idx_shadow - vq->last_used_idx) * 3 / 4;
-
-	virtio_store_mb(vq->weak_barriers,
-			&vring_used_event(&vq->vring),
-			cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs));
-
-	if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->vring.used->idx) - vq->last_used_idx) > bufs)) {
-		END_USE(vq);
-		return false;
-	}
-
-	END_USE(vq);
-	return true;
+	return vq->packed ? virtqueue_enable_cb_delayed_packed(_vq) :
+			    virtqueue_enable_cb_delayed_split(_vq);
 }
 EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed);
 
@@ -910,27 +1123,9 @@ EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed);
 void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
-	unsigned int i;
-	void *buf;
 
-	START_USE(vq);
-
-	for (i = 0; i < vq->vring.num; i++) {
-		if (!vq->desc_state[i].data)
-			continue;
-		/* detach_buf clears data, so grab it now. */
-		buf = vq->desc_state[i].data;
-		detach_buf(vq, i, NULL);
-		vq->avail_idx_shadow--;
-		vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
-		END_USE(vq);
-		return buf;
-	}
-	/* That should have freed everything. */
-	BUG_ON(vq->vq.num_free != vq->vring.num);
-
-	END_USE(vq);
-	return NULL;
+	return vq->packed ? virtqueue_detach_unused_buf_packed(_vq) :
+			    virtqueue_detach_unused_buf_split(_vq);
 }
 EXPORT_SYMBOL_GPL(virtqueue_detach_unused_buf);
 
@@ -955,7 +1150,8 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
 EXPORT_SYMBOL_GPL(vring_interrupt);
 
 struct virtqueue *__vring_new_virtqueue(unsigned int index,
-					struct vring vring,
+					union vring_union vring,
+					bool packed,
 					struct virtio_device *vdev,
 					bool weak_barriers,
 					bool context,
@@ -963,19 +1159,20 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
 					void (*callback)(struct virtqueue *),
 					const char *name)
 {
-	unsigned int i;
+	unsigned int num, i;
 	struct vring_virtqueue *vq;
 
-	vq = kmalloc(sizeof(*vq) + vring.num * sizeof(struct vring_desc_state),
+	num = packed ? vring.vring_packed.num : vring.vring_split.num;
+
+	vq = kmalloc(sizeof(*vq) + num * sizeof(struct vring_desc_state),
 		     GFP_KERNEL);
 	if (!vq)
 		return NULL;
 
-	vq->vring = vring;
 	vq->vq.callback = callback;
 	vq->vq.vdev = vdev;
 	vq->vq.name = name;
-	vq->vq.num_free = vring.num;
+	vq->vq.num_free = num;
 	vq->vq.index = index;
 	vq->we_own_ring = false;
 	vq->queue_dma_addr = 0;
@@ -984,9 +1181,8 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
 	vq->weak_barriers = weak_barriers;
 	vq->broken = false;
 	vq->last_used_idx = 0;
-	vq->avail_flags_shadow = 0;
-	vq->avail_idx_shadow = 0;
 	vq->num_added = 0;
+	vq->packed = packed;
 	list_add_tail(&vq->vq.list, &vdev->vqs);
 #ifdef DEBUG
 	vq->in_use = false;
@@ -997,18 +1193,37 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
 		!context;
 	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
 
+	if (vq->packed) {
+		vq->vring_packed = vring.vring_packed;
+		vq->next_avail_idx = 0;
+		vq->wrap_counter = 1;
+		vq->event_flags_shadow = 0;
+	} else {
+		vq->vring = vring.vring_split;
+		vq->avail_flags_shadow = 0;
+		vq->avail_idx_shadow = 0;
+
+		/* Put everything in free lists. */
+		vq->free_head = 0;
+		for (i = 0; i < num-1; i++)
+			vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
+	}
+
 	/* No callback?  Tell other side not to bother us. */
 	if (!callback) {
-		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
-		if (!vq->event)
-			vq->vring.avail->flags = cpu_to_virtio16(vdev, vq->avail_flags_shadow);
+		if (packed) {
+			vq->event_flags_shadow = VRING_EVENT_F_DISABLE;
+			vq->vring_packed.driver->flags = cpu_to_virtio16(vdev,
+						vq->event_flags_shadow);
+		} else {
+			vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
+			if (!vq->event)
+				vq->vring.avail->flags = cpu_to_virtio16(vdev,
+						vq->avail_flags_shadow);
+		}
 	}
 
-	/* Put everything in free lists. */
-	vq->free_head = 0;
-	for (i = 0; i < vring.num-1; i++)
-		vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
-	memset(vq->desc_state, 0, vring.num * sizeof(struct vring_desc_state));
+	memset(vq->desc_state, 0, num * sizeof(struct vring_desc_state));
 
 	return &vq->vq;
 }
@@ -1056,6 +1271,12 @@ static void vring_free_queue(struct virtio_device *vdev, size_t size,
 	}
 }
 
+static inline int
+__vring_size(unsigned int num, unsigned long align, bool packed)
+{
+	return packed ? vring_size_packed(num, align) : vring_size(num, align);
+}
+
 struct virtqueue *vring_create_virtqueue(
 	unsigned int index,
 	unsigned int num,
@@ -1072,7 +1293,8 @@ struct virtqueue *vring_create_virtqueue(
 	void *queue = NULL;
 	dma_addr_t dma_addr;
 	size_t queue_size_in_bytes;
-	struct vring vring;
+	union vring_union vring;
+	bool packed;
 
 	/* We assume num is a power of 2. */
 	if (num & (num - 1)) {
@@ -1080,9 +1302,13 @@ struct virtqueue *vring_create_virtqueue(
 		return NULL;
 	}
 
+	packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
+
 	/* TODO: allocate each queue chunk individually */
-	for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) {
-		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
+	for (; num && __vring_size(num, vring_align, packed) > PAGE_SIZE;
+			num /= 2) {
+		queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
+							     packed),
 					  &dma_addr,
 					  GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO);
 		if (queue)
@@ -1094,17 +1320,21 @@ struct virtqueue *vring_create_virtqueue(
 
 	if (!queue) {
 		/* Try to get a single page. You are my only hope! */
-		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
+		queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
+							     packed),
 					  &dma_addr, GFP_KERNEL|__GFP_ZERO);
 	}
 	if (!queue)
 		return NULL;
 
-	queue_size_in_bytes = vring_size(num, vring_align);
-	vring_init(&vring, num, queue, vring_align);
+	queue_size_in_bytes = __vring_size(num, vring_align, packed);
+	if (packed)
+		vring_init_packed(&vring.vring_packed, num, queue, vring_align);
+	else
+		vring_init(&vring.vring_split, num, queue, vring_align);
 
-	vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
-				   notify, callback, name);
+	vq = __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
+				   context, notify, callback, name);
 	if (!vq) {
 		vring_free_queue(vdev, queue_size_in_bytes, queue,
 				 dma_addr);
@@ -1130,10 +1360,17 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
 				      void (*callback)(struct virtqueue *vq),
 				      const char *name)
 {
-	struct vring vring;
-	vring_init(&vring, num, pages, vring_align);
-	return __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
-				     notify, callback, name);
+	union vring_union vring;
+	bool packed;
+
+	packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
+	if (packed)
+		vring_init_packed(&vring.vring_packed, num, pages, vring_align);
+	else
+		vring_init(&vring.vring_split, num, pages, vring_align);
+
+	return __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
+				     context, notify, callback, name);
 }
 EXPORT_SYMBOL_GPL(vring_new_virtqueue);
 
@@ -1143,7 +1380,9 @@ void vring_del_virtqueue(struct virtqueue *_vq)
 
 	if (vq->we_own_ring) {
 		vring_free_queue(vq->vq.vdev, vq->queue_size_in_bytes,
-				 vq->vring.desc, vq->queue_dma_addr);
+				 vq->packed ? (void *)vq->vring_packed.desc :
+					      (void *)vq->vring.desc,
+				 vq->queue_dma_addr);
 	}
 	list_del(&_vq->list);
 	kfree(vq);
@@ -1185,7 +1424,7 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
 
 	struct vring_virtqueue *vq = to_vvq(_vq);
 
-	return vq->vring.num;
+	return vq->packed ? vq->vring_packed.num : vq->vring.num;
 }
 EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);
 
@@ -1228,6 +1467,10 @@ dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
 
 	BUG_ON(!vq->we_own_ring);
 
+	if (vq->packed)
+		return vq->queue_dma_addr + ((char *)vq->vring_packed.driver -
+				(char *)vq->vring_packed.desc);
+
 	return vq->queue_dma_addr +
 		((char *)vq->vring.avail - (char *)vq->vring.desc);
 }
@@ -1239,11 +1482,16 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
 
 	BUG_ON(!vq->we_own_ring);
 
+	if (vq->packed)
+		return vq->queue_dma_addr + ((char *)vq->vring_packed.device -
+				(char *)vq->vring_packed.desc);
+
 	return vq->queue_dma_addr +
 		((char *)vq->vring.used - (char *)vq->vring.desc);
 }
 EXPORT_SYMBOL_GPL(virtqueue_get_used_addr);
 
+/* Only available for split ring */
 const struct vring *virtqueue_get_vring(struct virtqueue *vq)
 {
 	return &to_vvq(vq)->vring;
diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
index bbf32524ab27..a0075894ad16 100644
--- a/include/linux/virtio_ring.h
+++ b/include/linux/virtio_ring.h
@@ -60,6 +60,11 @@ static inline void virtio_store_mb(bool weak_barriers,
 struct virtio_device;
 struct virtqueue;
 
+union vring_union {
+	struct vring vring_split;
+	struct vring_packed vring_packed;
+};
+
 /*
  * Creates a virtqueue and allocates the descriptor ring.  If
  * may_reduce_num is set, then this may allocate a smaller ring than
@@ -79,7 +84,8 @@ struct virtqueue *vring_create_virtqueue(unsigned int index,
 
 /* Creates a virtqueue with a custom layout. */
 struct virtqueue *__vring_new_virtqueue(unsigned int index,
-					struct vring vring,
+					union vring_union vring,
+					bool packed,
 					struct virtio_device *vdev,
 					bool weak_barriers,
 					bool ctx,
-- 
2.11.0

^ permalink raw reply related

* [RFC v3 4/5] virtio_ring: add event idx support in packed ring
From: Tiwei Bie @ 2018-04-25  5:15 UTC (permalink / raw)
  To: mst, jasowang, virtualization, linux-kernel, netdev
  Cc: wexu, jfreimann, tiwei.bie
In-Reply-To: <20180425051550.24342-1-tiwei.bie@intel.com>

This commit introduces the event idx support in packed
ring. This feature is temporarily disabled, because the
implementation in this patch may not work as expected,
and some further discussions on the implementation are
needed, e.g. do we have to check the wrap counter when
checking whether a kick is needed?

Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
---
 drivers/virtio/virtio_ring.c | 53 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 49 insertions(+), 4 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 0181e93897be..b1039c2985b9 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -986,7 +986,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
 static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
-	u16 flags;
+	u16 new, old, off_wrap, flags;
 	bool needs_kick;
 	u32 snapshot;
 
@@ -995,7 +995,12 @@ static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
 	 * suppressions. */
 	virtio_mb(vq->weak_barriers);
 
+	old = vq->next_avail_idx - vq->num_added;
+	new = vq->next_avail_idx;
+	vq->num_added = 0;
+
 	snapshot = *(u32 *)vq->vring_packed.device;
+	off_wrap = virtio16_to_cpu(_vq->vdev, snapshot & 0xffff);
 	flags = cpu_to_virtio16(_vq->vdev, snapshot >> 16) & 0x3;
 
 #ifdef DEBUG
@@ -1006,7 +1011,10 @@ static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
 	vq->last_add_time_valid = false;
 #endif
 
-	needs_kick = (flags != VRING_EVENT_F_DISABLE);
+	if (flags == VRING_EVENT_F_DESC)
+		needs_kick = vring_need_event(off_wrap & ~(1<<15), new, old);
+	else
+		needs_kick = (flags != VRING_EVENT_F_DISABLE);
 	END_USE(vq);
 	return needs_kick;
 }
@@ -1116,6 +1124,15 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
 	if (vq->last_used_idx >= vq->vring_packed.num)
 		vq->last_used_idx -= vq->vring_packed.num;
 
+	/* If we expect an interrupt for the next entry, tell host
+	 * by writing event index and flush out the write before
+	 * the read in the next get_buf call. */
+	if (vq->event_flags_shadow == VRING_EVENT_F_DESC)
+		virtio_store_mb(vq->weak_barriers,
+				&vq->vring_packed.driver->off_wrap,
+				cpu_to_virtio16(_vq->vdev, vq->last_used_idx |
+						(vq->wrap_counter << 15)));
+
 #ifdef DEBUG
 	vq->last_add_time_valid = false;
 #endif
@@ -1143,10 +1160,17 @@ static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
 
 	/* We optimistically turn back on interrupts, then check if there was
 	 * more to do. */
+	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
+	 * either clear the flags bit or point the event index at the next
+	 * entry. Always update the event index to keep code simple. */
+
+	vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
+			vq->last_used_idx | (vq->wrap_counter << 15));
 
 	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
 		virtio_wmb(vq->weak_barriers);
-		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
+		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
+						     VRING_EVENT_F_ENABLE;
 		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
 							vq->event_flags_shadow);
 	}
@@ -1172,15 +1196,34 @@ static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
 static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 bufs, used_idx, wrap_counter;
 
 	START_USE(vq);
 
 	/* We optimistically turn back on interrupts, then check if there was
 	 * more to do. */
+	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
+	 * either clear the flags bit or point the event index at the next
+	 * entry. Always update the event index to keep code simple. */
+
+	/* TODO: tune this threshold */
+	bufs = (u16)(vq->next_avail_idx - vq->last_used_idx) * 3 / 4;
+
+	used_idx = vq->last_used_idx + bufs;
+	wrap_counter = vq->wrap_counter;
+
+	if (used_idx >= vq->vring_packed.num) {
+		used_idx -= vq->vring_packed.num;
+		wrap_counter ^= 1;
+	}
+
+	vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
+			used_idx | (wrap_counter << 15));
 
 	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
 		virtio_wmb(vq->weak_barriers);
-		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
+		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
+						     VRING_EVENT_F_ENABLE;
 		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
 							vq->event_flags_shadow);
 	}
@@ -1822,8 +1865,10 @@ void vring_transport_features(struct virtio_device *vdev)
 		switch (i) {
 		case VIRTIO_RING_F_INDIRECT_DESC:
 			break;
+#if 0
 		case VIRTIO_RING_F_EVENT_IDX:
 			break;
+#endif
 		case VIRTIO_F_VERSION_1:
 			break;
 		case VIRTIO_F_IOMMU_PLATFORM:
-- 
2.11.0

^ permalink raw reply related

* [RFC v3 5/5] virtio_ring: enable packed ring
From: Tiwei Bie @ 2018-04-25  5:15 UTC (permalink / raw)
  To: mst, jasowang, virtualization, linux-kernel, netdev; +Cc: wexu
In-Reply-To: <20180425051550.24342-1-tiwei.bie@intel.com>

Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
---
 drivers/virtio/virtio_ring.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index b1039c2985b9..9a3d13e1e2ba 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -1873,6 +1873,8 @@ void vring_transport_features(struct virtio_device *vdev)
 			break;
 		case VIRTIO_F_IOMMU_PLATFORM:
 			break;
+		case VIRTIO_F_RING_PACKED:
+			break;
 		default:
 			/* We don't understand this bit. */
 			__virtio_clear_bit(vdev, i);
-- 
2.11.0

^ permalink raw reply related

* KASAN: stack-out-of-bounds Write in compat_copy_entries
From: syzbot @ 2018-04-25  5:19 UTC (permalink / raw)
  To: bridge, coreteam, davem, fw, kadlec, linux-kernel, netdev,
	netfilter-devel, pablo, stephen, syzkaller-bugs

Hello,

syzbot hit the following crash on upstream commit
24cac7009cb1b211f1c793ecb6a462c03dc35818 (Tue Apr 24 21:16:40 2018 +0000)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
syzbot dashboard link:  
https://syzkaller.appspot.com/bug?extid=4e42a04e0bc33cb6c087

So far this crash happened 3 times on upstream.
syzkaller reproducer:  
https://syzkaller.appspot.com/x/repro.syz?id=4827027970457600
Raw console output:  
https://syzkaller.appspot.com/x/log.txt?id=6212733133389824
Kernel config:  
https://syzkaller.appspot.com/x/.config?id=7043958930931867332
compiler: gcc (GCC) 8.0.1 20180413 (experimental)
user-space arch: i386

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+4e42a04e0bc33cb6c087@syzkaller.appspotmail.com
It will help syzbot understand when the bug is fixed. See footer for  
details.
If you forward the report, please keep this part and the footer.

random: sshd: uninitialized urandom read (32 bytes read)
random: sshd: uninitialized urandom read (32 bytes read)
random: sshd: uninitialized urandom read (32 bytes read)
IPVS: ftp: loaded support on port[0] = 21
==================================================================
BUG: KASAN: stack-out-of-bounds in strlcpy include/linux/string.h:300  
[inline]
BUG: KASAN: stack-out-of-bounds in compat_mtw_from_user  
net/bridge/netfilter/ebtables.c:1957 [inline]
BUG: KASAN: stack-out-of-bounds in ebt_size_mwt  
net/bridge/netfilter/ebtables.c:2059 [inline]
BUG: KASAN: stack-out-of-bounds in size_entry_mwt  
net/bridge/netfilter/ebtables.c:2155 [inline]
BUG: KASAN: stack-out-of-bounds in compat_copy_entries+0x96c/0x14a0  
net/bridge/netfilter/ebtables.c:2194
Write of size 33 at addr ffff8801b0abf888 by task syz-executor0/4504

CPU: 0 PID: 4504 Comm: syz-executor0 Not tainted 4.17.0-rc2+ #40
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Call Trace:
  __dump_stack lib/dump_stack.c:77 [inline]
  dump_stack+0x1b9/0x294 lib/dump_stack.c:113
  print_address_description+0x6c/0x20b mm/kasan/report.c:256
  kasan_report_error mm/kasan/report.c:354 [inline]
  kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412
  check_memory_region_inline mm/kasan/kasan.c:260 [inline]
  check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267
  memcpy+0x37/0x50 mm/kasan/kasan.c:303
  strlcpy include/linux/string.h:300 [inline]
  compat_mtw_from_user net/bridge/netfilter/ebtables.c:1957 [inline]
  ebt_size_mwt net/bridge/netfilter/ebtables.c:2059 [inline]
  size_entry_mwt net/bridge/netfilter/ebtables.c:2155 [inline]
  compat_copy_entries+0x96c/0x14a0 net/bridge/netfilter/ebtables.c:2194
  compat_do_replace+0x483/0x900 net/bridge/netfilter/ebtables.c:2285
  compat_do_ebt_set_ctl+0x2ac/0x324 net/bridge/netfilter/ebtables.c:2367
  compat_nf_sockopt net/netfilter/nf_sockopt.c:144 [inline]
  compat_nf_setsockopt+0x9b/0x140 net/netfilter/nf_sockopt.c:156
  compat_ip_setsockopt+0xff/0x140 net/ipv4/ip_sockglue.c:1279
  inet_csk_compat_setsockopt+0x97/0x120 net/ipv4/inet_connection_sock.c:1041
  compat_tcp_setsockopt+0x49/0x80 net/ipv4/tcp.c:2901
  compat_sock_common_setsockopt+0xb4/0x150 net/core/sock.c:3050
  __compat_sys_setsockopt+0x1ab/0x7c0 net/compat.c:403
  __do_compat_sys_setsockopt net/compat.c:416 [inline]
  __se_compat_sys_setsockopt net/compat.c:413 [inline]
  __ia32_compat_sys_setsockopt+0xbd/0x150 net/compat.c:413
  do_syscall_32_irqs_on arch/x86/entry/common.c:323 [inline]
  do_fast_syscall_32+0x345/0xf9b arch/x86/entry/common.c:394
  entry_SYSENTER_compat+0x70/0x7f arch/x86/entry/entry_64_compat.S:139
RIP: 0023:0xf7fb3cb9
RSP: 002b:00000000fff0c26c EFLAGS: 00000282 ORIG_RAX: 000000000000016e
RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000000000
RDX: 0000000000000080 RSI: 0000000020000300 RDI: 00000000000005f4
RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000

The buggy address belongs to the page:
page:ffffea0006c2afc0 count:0 mapcount:0 mapping:0000000000000000 index:0x0
flags: 0x2fffc0000000000()
raw: 02fffc0000000000 0000000000000000 0000000000000000 00000000ffffffff
raw: 0000000000000000 ffffea0006c20101 0000000000000000 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
  ffff8801b0abf780: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
  ffff8801b0abf800: 00 00 00 00 00 f1 f1 f1 f1 00 00 f2 f2 f2 f2 f2
> ffff8801b0abf880: f2 00 00 00 07 f3 f3 f3 f3 00 00 00 00 00 00 00
                                ^
  ffff8801b0abf900: 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1 00 00
  ffff8801b0abf980: 00 f2 f2 f2 f2 f2 00 00 00 00 00 00 00 00 00 00
==================================================================


---
This bug is generated by a dumb bot. It may contain errors.
See https://goo.gl/tpsmEJ for details.
Direct all questions to syzkaller@googlegroups.com.

syzbot will keep track of this bug report.
If you forgot to add the Reported-by tag, once the fix for this bug is  
merged
into any tree, please reply to this email with:
#syz fix: exact-commit-title
If you want to test a patch for this bug, please reply with:
#syz test: git://repo/address.git branch
and provide the patch inline or as an attachment.
To mark this as a duplicate of another syzbot report, please reply with:
#syz dup: exact-subject-of-another-report
If it's a one-off invalid bug report, please reply with:
#syz invalid
Note: if the crash happens again, it will cause creation of a new bug  
report.
Note: all commands must start from beginning of the line in the email body.

^ permalink raw reply

* RE: [PATCH net-next] lan78xx: Lan7801 Support for Fixed PHY
From: RaghuramChary.Jallipalli @ 2018-04-25  5:21 UTC (permalink / raw)
  To: andrew; +Cc: davem, netdev, UNGLinuxDriver, Woojung.Huh
In-Reply-To: <20180423124203.GC25919@lunn.ch>

Hi Andrew,

> > +#define DRIVER_VERSION	"1.0.7"
> 
> Hi Raghuram
> 
> Driver version strings a pretty pointless. You might want to remove it.
>
OK, will remove it.
 
> > +			netdev_info(dev->net, "Registered FIXED PHY\n");
> 
> There are too many detdev_info() messages here. Maybe make them both
> netdev_dbg().
>

OK. Will modify to netdev_dbg()
 
> > +			dev->interface = PHY_INTERFACE_MODE_RGMII;
> > +			dev->fixedphy = phydev;
> 
> You can use
> 
> if (!phy_is_pseudo_fixed_link(phydev))
> 
> to determine is a PHY is a fixed phy. I think you can then do without
> dev->fixedphy.
> 
dev->fixedphy stores the fixed phydev, which will be passed to the 
fixed_phy_unregister routine , so I think phy_is_pseudo_fixed_link check is not necessary.

> > +			ret = lan78xx_write_reg(dev, HW_CFG, buf);
> > +			goto phyinit;
> 
> Please don't use a goto like this. Maybe turn this into a switch statement?
>
OK. Will modify.
 
> > static int lan78xx_phy_init(struct lan78xx_net *dev)
> >  		goto error;
> 
> Please take a look at what happens at error: It does not look correct.
> Probably now is a good time to refactor the whole of lan78xx_phy_init()
> 

OK. Will take care.

Thanks,
-Raghu

^ permalink raw reply

* [PATCH net-next 0/2] tcp: mmap: rework zerocopy receive
From: Eric Dumazet @ 2018-04-25  5:27 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, Andy Lutomirski, linux-kernel, linux-mm,
	Soheil Hassas Yeganeh, Eric Dumazet, Eric Dumazet

syzbot reported a lockdep issue caused by tcp mmap() support.

I implemented Andy Lutomirski nice suggestions to resolve the
issue and increase scalability as well.

First patch is adding a new setsockopt() operation and changes mmap()
behavior.

Second patch changes tcp_mmap reference program.

Eric Dumazet (2):
  tcp: add TCP_ZEROCOPY_RECEIVE support for zerocopy receive
  selftests: net: tcp_mmap must use TCP_ZEROCOPY_RECEIVE

 include/uapi/linux/tcp.h               |   8 ++
 net/ipv4/tcp.c                         | 186 +++++++++++++------------
 tools/testing/selftests/net/tcp_mmap.c |  63 +++++----
 3 files changed, 139 insertions(+), 118 deletions(-)

-- 
2.17.0.484.g0c8726318c-goog

^ permalink raw reply

* [PATCH net-next 1/2] tcp: add TCP_ZEROCOPY_RECEIVE support for zerocopy receive
From: Eric Dumazet @ 2018-04-25  5:27 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, Andy Lutomirski, linux-kernel, linux-mm,
	Soheil Hassas Yeganeh, Eric Dumazet, Eric Dumazet
In-Reply-To: <20180425052722.73022-1-edumazet@google.com>

When adding tcp mmap() implementation, I forgot that socket lock
had to be taken before current->mm->mmap_sem. syzbot eventually caught
the bug.

Since we can not lock the socket in tcp mmap() handler we have to
split the operation in two phases.

1) mmap() on a tcp socket simply reserves VMA space, and nothing else.
  This operation does not involve any TCP locking.

2) setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) implements
 the transfert of pages from skbs to one VMA.
  This operation only uses down_read(&current->mm->mmap_sem) after
  holding TCP lock, thus solving the lockdep issue.

This new implementation was suggested by Andy Lutomirski with great details.

Benefits are :

- Better scalability, in case multiple threads reuse VMAS
   (without mmap()/munmap() calls) since mmap_sem wont be write locked.

- Better error recovery.
   The previous mmap() model had to provide the expected size of the
   mapping. If for some reason one part could not be mapped (partial MSS),
   the whole operation had to be aborted.
   With the tcp_zerocopy_receive struct, kernel can report how
   many bytes were successfuly mapped, and how many bytes should
   be read to skip the problematic sequence.

- No more memory allocation to hold an array of page pointers.
  16 MB mappings needed 32 KB for this array, potentially using vmalloc() :/

- skbs are freed while mmap_sem has been released

Following patch makes the change in tcp_mmap tool to demonstrate
one possible use of mmap() and setsockopt(... TCP_ZEROCOPY_RECEIVE ...)

Note that memcg might require additional changes.

Fixes: 93ab6cc69162 ("tcp: implement mmap() for zero copy receive")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Suggested-by: Andy Lutomirski <luto@kernel.org>
Cc: linux-mm@kvack.org
Cc: Soheil Hassas Yeganeh <soheil@google.com>
---
 include/uapi/linux/tcp.h |   8 ++
 net/ipv4/tcp.c           | 186 ++++++++++++++++++++-------------------
 2 files changed, 103 insertions(+), 91 deletions(-)

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 379b08700a542d49bbce9b4b49b17879d00b69bb..e9e8373b34b9ddc735329341b91f455bf5c0b17c 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -122,6 +122,7 @@ enum {
 #define TCP_MD5SIG_EXT		32	/* TCP MD5 Signature with extensions */
 #define TCP_FASTOPEN_KEY	33	/* Set the key for Fast Open (cookie) */
 #define TCP_FASTOPEN_NO_COOKIE	34	/* Enable TFO without a TFO cookie */
+#define TCP_ZEROCOPY_RECEIVE	35
 
 struct tcp_repair_opt {
 	__u32	opt_code;
@@ -276,4 +277,11 @@ struct tcp_diag_md5sig {
 	__u8	tcpm_key[TCP_MD5SIG_MAXKEYLEN];
 };
 
+/* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */
+
+struct tcp_zerocopy_receive {
+	__u64 address;		/* in: address of mapping */
+	__u32 length;		/* in/out: number of bytes to map/mapped */
+	__u32 recv_skip_hint;	/* out: amount of bytes to skip */
+};
 #endif /* _UAPI_LINUX_TCP_H */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index dfd090ea54ad47112fc23c61180b5bf8edd2c736..a28eca97df9465a0aa522a1833a171b53c237b1f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1726,118 +1726,108 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
 }
 EXPORT_SYMBOL(tcp_set_rcvlowat);
 
-/* When user wants to mmap X pages, we first need to perform the mapping
- * before freeing any skbs in receive queue, otherwise user would be unable
- * to fallback to standard recvmsg(). This happens if some data in the
- * requested block is not exactly fitting in a page.
- *
- * We only support order-0 pages for the moment.
- * mmap() on TCP is very strict, there is no point
- * trying to accommodate with pathological layouts.
- */
+static const struct vm_operations_struct tcp_vm_ops = {
+};
+
 int tcp_mmap(struct file *file, struct socket *sock,
 	     struct vm_area_struct *vma)
 {
-	unsigned long size = vma->vm_end - vma->vm_start;
-	unsigned int nr_pages = size >> PAGE_SHIFT;
-	struct page **pages_array = NULL;
-	u32 seq, len, offset, nr = 0;
-	struct sock *sk = sock->sk;
-	const skb_frag_t *frags;
+	if (vma->vm_flags & (VM_WRITE | VM_EXEC))
+		return -EPERM;
+	vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
+
+	/* Instruct vm_insert_page() to not down_read(mmap_sem) */
+	vma->vm_flags |= VM_MIXEDMAP;
+
+	vma->vm_ops = &tcp_vm_ops;
+	return 0;
+}
+EXPORT_SYMBOL(tcp_mmap);
+
+static int tcp_zerocopy_receive(struct sock *sk,
+				struct tcp_zerocopy_receive *zc)
+{
+	unsigned long address = (unsigned long)zc->address;
+	const skb_frag_t *frags = NULL;
+	u32 length = 0, seq, offset;
+	struct vm_area_struct *vma;
+	struct sk_buff *skb = NULL;
 	struct tcp_sock *tp;
-	struct sk_buff *skb;
 	int ret;
 
-	if (vma->vm_pgoff || !nr_pages)
+	if (address & (PAGE_SIZE - 1) || address != zc->address)
 		return -EINVAL;
 
-	if (vma->vm_flags & VM_WRITE)
-		return -EPERM;
-	/* TODO: Maybe the following is not needed if pages are COW */
-	vma->vm_flags &= ~VM_MAYWRITE;
-
-	lock_sock(sk);
-
-	ret = -ENOTCONN;
 	if (sk->sk_state == TCP_LISTEN)
-		goto out;
+		return -ENOTCONN;
 
 	sock_rps_record_flow(sk);
 
-	if (tcp_inq(sk) < size) {
-		ret = sock_flag(sk, SOCK_DONE) ? -EIO : -EAGAIN;
+	down_read(&current->mm->mmap_sem);
+
+	ret = -EINVAL;
+	vma = find_vma(current->mm, address);
+	if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops)
 		goto out;
-	}
+	zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
+
 	tp = tcp_sk(sk);
 	seq = tp->copied_seq;
-	/* Abort if urgent data is in the area */
-	if (unlikely(tp->urg_data)) {
-		u32 urg_offset = tp->urg_seq - seq;
+	zc->length = min_t(u32, zc->length, tcp_inq(sk));
 
-		ret = -EINVAL;
-		if (urg_offset < size)
-			goto out;
-	}
-	ret = -ENOMEM;
-	pages_array = kvmalloc_array(nr_pages, sizeof(struct page *),
-				     GFP_KERNEL);
-	if (!pages_array)
-		goto out;
-	skb = tcp_recv_skb(sk, seq, &offset);
-	ret = -EINVAL;
-skb_start:
-	/* We do not support anything not in page frags */
-	offset -= skb_headlen(skb);
-	if ((int)offset < 0)
-		goto out;
-	if (skb_has_frag_list(skb))
-		goto out;
-	len = skb->data_len - offset;
-	frags = skb_shinfo(skb)->frags;
-	while (offset) {
-		if (frags->size > offset)
-			goto out;
-		offset -= frags->size;
-		frags++;
-	}
-	while (nr < nr_pages) {
-		if (len) {
-			if (len < PAGE_SIZE)
-				goto out;
-			if (frags->size != PAGE_SIZE || frags->page_offset)
-				goto out;
-			pages_array[nr++] = skb_frag_page(frags);
-			frags++;
-			len -= PAGE_SIZE;
-			seq += PAGE_SIZE;
-			continue;
-		}
-		skb = skb->next;
-		offset = seq - TCP_SKB_CB(skb)->seq;
-		goto skb_start;
-	}
-	/* OK, we have a full set of pages ready to be inserted into vma */
-	for (nr = 0; nr < nr_pages; nr++) {
-		ret = vm_insert_page(vma, vma->vm_start + (nr << PAGE_SHIFT),
-				     pages_array[nr]);
-		if (ret)
-			goto out;
-	}
-	/* operation is complete, we can 'consume' all skbs */
-	tp->copied_seq = seq;
-	tcp_rcv_space_adjust(sk);
-
-	/* Clean up data we have read: This will do ACK frames. */
-	tcp_recv_skb(sk, seq, &offset);
-	tcp_cleanup_rbuf(sk, size);
+	zap_page_range(vma, address, zc->length);
 
+	zc->recv_skip_hint = 0;
 	ret = 0;
+	while (length + PAGE_SIZE <= zc->length) {
+		if (zc->recv_skip_hint < PAGE_SIZE) {
+			if (skb) {
+				skb = skb->next;
+				offset = seq - TCP_SKB_CB(skb)->seq;
+			} else {
+				skb = tcp_recv_skb(sk, seq, &offset);
+			}
+
+			zc->recv_skip_hint = skb->len - offset;
+			offset -= skb_headlen(skb);
+			if ((int)offset < 0 || skb_has_frag_list(skb))
+				break;
+			frags = skb_shinfo(skb)->frags;
+			while (offset) {
+				if (frags->size > offset)
+					goto out;
+				offset -= frags->size;
+				frags++;
+			}
+		}
+		if (frags->size != PAGE_SIZE || frags->page_offset)
+			break;
+		ret = vm_insert_page(vma, address + length,
+				     skb_frag_page(frags));
+		if (ret)
+			break;
+		length += PAGE_SIZE;
+		seq += PAGE_SIZE;
+		zc->recv_skip_hint -= PAGE_SIZE;
+		frags++;
+	}
 out:
-	release_sock(sk);
-	kvfree(pages_array);
+	up_read(&current->mm->mmap_sem);
+	if (length) {
+		tp->copied_seq = seq;
+		tcp_rcv_space_adjust(sk);
+
+		/* Clean up data we have read: This will do ACK frames. */
+		tcp_recv_skb(sk, seq, &offset);
+		tcp_cleanup_rbuf(sk, length);
+		ret = 0;
+	} else {
+		if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
+			ret = -EIO;
+	}
+	zc->length = length;
 	return ret;
 }
-EXPORT_SYMBOL(tcp_mmap);
 
 static void tcp_update_recv_tstamps(struct sk_buff *skb,
 				    struct scm_timestamping *tss)
@@ -2738,6 +2728,20 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 
 		return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key));
 	}
+	case TCP_ZEROCOPY_RECEIVE: {
+		struct tcp_zerocopy_receive zc;
+
+		if (optlen != sizeof(zc))
+			return -EINVAL;
+		if (copy_from_user(&zc, optval, optlen))
+			return -EFAULT;
+		lock_sock(sk);
+		err = tcp_zerocopy_receive(sk, &zc);
+		release_sock(sk);
+		if (!err && copy_to_user(optval, &zc, optlen))
+			err = -EFAULT;
+		return err;
+	}
 	default:
 		/* fallthru */
 		break;
-- 
2.17.0.484.g0c8726318c-goog

^ permalink raw reply related

* [PATCH net-next 2/2] selftests: net: tcp_mmap must use TCP_ZEROCOPY_RECEIVE
From: Eric Dumazet @ 2018-04-25  5:27 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, Andy Lutomirski, linux-kernel, linux-mm,
	Soheil Hassas Yeganeh, Eric Dumazet, Eric Dumazet
In-Reply-To: <20180425052722.73022-1-edumazet@google.com>

After prior kernel change, mmap() on TCP socket only reserves VMA.

We have to use setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...)
to perform the transfert of pages from skbs in TCP receive queue into such VMA.

struct tcp_zerocopy_receive {
	__u64 address;		/* in: address of mapping */
	__u32 length;		/* in/out: number of bytes to map/mapped */
	__u32 recv_skip_hint;	/* out: amount of bytes to skip */
};

After a successful setsockopt(...TCP_ZEROCOPY_RECEIVE...), @length contains
number of bytes that were mapped, and @recv_skip_hint contains number of bytes
that should be read using conventional read()/recv()/recvmsg() system calls,
to skip a sequence of bytes that can not be mapped, because not properly page
aligned.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
---
 tools/testing/selftests/net/tcp_mmap.c | 63 +++++++++++++++-----------
 1 file changed, 36 insertions(+), 27 deletions(-)

diff --git a/tools/testing/selftests/net/tcp_mmap.c b/tools/testing/selftests/net/tcp_mmap.c
index dea342fe6f4e88b5709d2ac37b2fc9a2a320bf44..5b381cdbdd6319556ba4e3dad530fae8f13f5a9b 100644
--- a/tools/testing/selftests/net/tcp_mmap.c
+++ b/tools/testing/selftests/net/tcp_mmap.c
@@ -76,9 +76,10 @@
 #include <time.h>
 #include <sys/time.h>
 #include <netinet/in.h>
-#include <netinet/tcp.h>
 #include <arpa/inet.h>
 #include <poll.h>
+#include <linux/tcp.h>
+#include <assert.h>
 
 #ifndef MSG_ZEROCOPY
 #define MSG_ZEROCOPY    0x4000000
@@ -134,11 +135,12 @@ void hash_zone(void *zone, unsigned int length)
 void *child_thread(void *arg)
 {
 	unsigned long total_mmap = 0, total = 0;
+	struct tcp_zerocopy_receive zc;
 	unsigned long delta_usec;
 	int flags = MAP_SHARED;
 	struct timeval t0, t1;
 	char *buffer = NULL;
-	void *oaddr = NULL;
+	void *addr = NULL;
 	double throughput;
 	struct rusage ru;
 	int lu, fd;
@@ -153,41 +155,45 @@ void *child_thread(void *arg)
 		perror("malloc");
 		goto error;
 	}
+	if (zflg) {
+		addr = mmap(NULL, chunk_size, PROT_READ, flags, fd, 0);
+		if (addr == (void *)-1)
+			zflg = 0;
+	}
 	while (1) {
 		struct pollfd pfd = { .fd = fd, .events = POLLIN, };
 		int sub;
 
 		poll(&pfd, 1, 10000);
 		if (zflg) {
-			void *naddr;
+			int res;
 
-			naddr = mmap(oaddr, chunk_size, PROT_READ, flags, fd, 0);
-			if (naddr == (void *)-1) {
-				if (errno == EAGAIN) {
-					/* That is if SO_RCVLOWAT is buggy */
-					usleep(1000);
-					continue;
-				}
-				if (errno == EINVAL) {
-					flags = MAP_SHARED;
-					oaddr = NULL;
-					goto fallback;
-				}
-				if (errno != EIO)
-					perror("mmap()");
+			zc.address = (__u64)addr;
+			zc.length = chunk_size;
+			zc.recv_skip_hint = 0;
+			res = setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE,
+					 &zc, sizeof(zc));
+			if (res == -1)
 				break;
+
+			if (zc.length) {
+				assert(zc.length <= chunk_size);
+				total_mmap += zc.length;
+				if (xflg)
+					hash_zone(addr, zc.length);
+				total += zc.length;
 			}
-			total_mmap += chunk_size;
-			if (xflg)
-				hash_zone(naddr, chunk_size);
-			total += chunk_size;
-			if (!keepflag) {
-				flags |= MAP_FIXED;
-				oaddr = naddr;
+			if (zc.recv_skip_hint) {
+				assert(zc.recv_skip_hint <= chunk_size);
+				lu = read(fd, buffer, zc.recv_skip_hint);
+				if (lu > 0) {
+					if (xflg)
+						hash_zone(buffer, lu);
+					total += lu;
+				}
 			}
 			continue;
 		}
-fallback:
 		sub = 0;
 		while (sub < chunk_size) {
 			lu = read(fd, buffer + sub, chunk_size - sub);
@@ -228,6 +234,8 @@ void *child_thread(void *arg)
 error:
 	free(buffer);
 	close(fd);
+	if (zflg)
+		munmap(addr, chunk_size);
 	pthread_exit(0);
 }
 
@@ -371,7 +379,8 @@ int main(int argc, char *argv[])
 		setup_sockaddr(cfg_family, host, &listenaddr);
 
 		if (mss &&
-		    setsockopt(fdlisten, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) {
+		    setsockopt(fdlisten, IPPROTO_TCP, TCP_MAXSEG,
+			       &mss, sizeof(mss)) == -1) {
 			perror("setsockopt TCP_MAXSEG");
 			exit(1);
 		}
@@ -402,7 +411,7 @@ int main(int argc, char *argv[])
 	setup_sockaddr(cfg_family, host, &addr);
 
 	if (mss &&
-	    setsockopt(fd, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) {
+	    setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) {
 		perror("setsockopt TCP_MAXSEG");
 		exit(1);
 	}
-- 
2.17.0.484.g0c8726318c-goog

^ permalink raw reply related

* [RFC bpf] bpf, x64: fix JIT emission for dead code
From: Gianluca Borello @ 2018-04-25  5:42 UTC (permalink / raw)
  To: netdev; +Cc: daniel, ast, Gianluca Borello

Commit 2a5418a13fcf ("bpf: improve dead code sanitizing") replaced dead
code with a series of ja-1 instructions, for safety. That made JIT
compilation much more complex for some BPF programs. One instance of such
programs is, for example:

bool flag = false
...
/* A bunch of other code */
...
if (flag)
        do_something()

In some cases llvm is not able to remove at compile time the code for
do_something(), so the generated BPF program ends up with a large amount
of dead instructions. In one specific real life example, there are two
series of ~500 and ~1000 dead instructions in the program. When the
verifier replaces them with a series of ja-1 instructions, it causes an
interesting behavior at JIT time.

During the first pass, since all the instructions are estimated at 64
bytes, the ja-1 instructions end up being translated as 5 bytes JMP
instructions (0xE9), since the jump offsets become increasingly large (>
127) as each instruction gets discovered to be 5 bytes instead of the
estimated 64.

Starting from the second pass, the first N instructions of the ja-1
sequence get translated into 2 bytes JMPs (0xEB) because the jump offsets
become <= 127 this time. In particular, N is defined as roughly 127 / (5
- 2) ~= 42. So, each further pass will make the subsequent N JMP
instructions shrink from 5 to 2 bytes, making the image shrink every time.
This means that in order to have the entire program converge, there need
to be, in the real example above, at least ~1000 / 42 ~= 24 passes just
for translating the dead code. If we add this number to the passes needed
to translate the other non dead code, it brings such program to 40+
passes, and JIT doesn't complete. Ultimately the userspace loader fails
because such BPF program was supposed to be part of a prog array owner
being JITed.

While it is certainly possible to try to refactor such programs to help
the compiler remove dead code, the behavior is not really intuitive and it
puts further burden on the BPF developer who is not expecting such
behavior. To make things worse, such programs are working just fine in all
the kernel releases prior to the ja-1 fix.

A possible approach to mitigate this behavior consists into noticing that
for ja-1 instructions we don't really need to rely on the estimated size
of the previous and current instructions, we know that a -1 BPF jump
offset can be safely translated into a 0xEB instruction with a jump offset
of -2.

Such fix brings the BPF program in the previous example to complete again
in ~9 passes.

Fixes: 2a5418a13fcf ("bpf: improve dead code sanitizing")
Signed-off-by: Gianluca Borello <g.borello@gmail.com>
---
Hi

Posting this as RFC since I just wanted to report this potential bug and
propose a possible fix, although I'm not sure if:

1) There might be a better fix on the JIT side
2) We might want to replace the ja-1 instructions with something else
3) We might want to ignore this issue

If we choose option 3, I'd just like to point out that this caused a real
regression on a couple BPF programs that are part of a larger collection
of programs that used to work fine on 4.15 and I recently found broken 
in 4.16, so there would be some value in somehow addressing this.

Thanks

 arch/x86/net/bpf_jit_comp.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index b725154182cc..abce27ceb411 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1027,7 +1027,17 @@ xadd:			if (is_imm8(insn->off))
 			break;
 
 		case BPF_JMP | BPF_JA:
-			jmp_offset = addrs[i + insn->off] - addrs[i];
+			if (insn->off == -1)
+				/* -1 jmp instructions will always jump
+				 * backwards two bytes. Explicitly handling
+				 * this case avoids wasting too many passes
+				 * when there are long sequences of replaced
+				 * dead code.
+				 */
+				jmp_offset = -2;
+			else
+				jmp_offset = addrs[i + insn->off] - addrs[i];
+
 			if (!jmp_offset)
 				/* optimize out nop jumps */
 				break;
-- 
2.17.0

^ permalink raw reply related

* Re: [PATCH net-next 1/2] tcp: add TCP_ZEROCOPY_RECEIVE support for zerocopy receive
From: Christoph Hellwig @ 2018-04-25  6:28 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, netdev, Andy Lutomirski, linux-kernel, linux-mm,
	Soheil Hassas Yeganeh, Eric Dumazet
In-Reply-To: <20180425052722.73022-2-edumazet@google.com>

On Tue, Apr 24, 2018 at 10:27:21PM -0700, Eric Dumazet wrote:
> When adding tcp mmap() implementation, I forgot that socket lock
> had to be taken before current->mm->mmap_sem. syzbot eventually caught
> the bug.
> 
> Since we can not lock the socket in tcp mmap() handler we have to
> split the operation in two phases.
> 
> 1) mmap() on a tcp socket simply reserves VMA space, and nothing else.
>   This operation does not involve any TCP locking.
> 
> 2) setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) implements
>  the transfert of pages from skbs to one VMA.
>   This operation only uses down_read(&current->mm->mmap_sem) after
>   holding TCP lock, thus solving the lockdep issue.
> 
> This new implementation was suggested by Andy Lutomirski with great details.

Thanks, this looks much more sensible to me.

^ permalink raw reply

* Re: [PATCH net-next 3/4] nfp: flower: support offloading multiple rules with same cookie
From: Or Gerlitz @ 2018-04-25  6:31 UTC (permalink / raw)
  To: Jakub Kicinski, John Hurley
  Cc: David Miller, Linux Netdev List, oss-drivers, ASAP_Direct_Dev
In-Reply-To: <20180425041704.26882-4-jakub.kicinski@netronome.com>

On Wed, Apr 25, 2018 at 7:17 AM, Jakub Kicinski
<jakub.kicinski@netronome.com> wrote:
> From: John Hurley <john.hurley@netronome.com>
>
> When multiple netdevs are attached to a tc offload block and register for
> callbacks, a rule added to the block will be propogated to all netdevs.
> Previously these were detected as duplicates (based on cookie) and
> rejected. Modify the rule nfp lookup function to optionally include an
> ingress netdev and a host context along with the cookie value when
> searching for a rule. When a new rule is passed to the driver, the netdev
> the rule is to be attached to is considered when searching for dublicates.

so if the same rule (cookie) is provided to the driver through multiple ingress
devices you will not reject it -- what is the use case for that, is it
block sharing?

^ permalink raw reply

* [PATCH v1 net-next] lan78xx: Lan7801 Support for Fixed PHY
From: Raghuram Chary J @ 2018-04-25  6:43 UTC (permalink / raw)
  To: davem; +Cc: netdev, unglinuxdriver, woojung.huh, raghuramchary.jallipalli

Adding Fixed PHY support to the lan78xx driver.

Signed-off-by: Raghuram Chary J <raghuramchary.jallipalli@microchip.com>
---
v0->v1:
   * Remove driver version #define
   * Modify netdev_info to netdev_dbg
   * Move lan7801 specific to new routine and add switch case
   * Minor cleanup
---
 drivers/net/usb/Kconfig   |   1 +
 drivers/net/usb/lan78xx.c | 113 ++++++++++++++++++++++++++++++++--------------
 2 files changed, 79 insertions(+), 35 deletions(-)

diff --git a/drivers/net/usb/Kconfig b/drivers/net/usb/Kconfig
index f28bd74ac275..418b0904cecb 100644
--- a/drivers/net/usb/Kconfig
+++ b/drivers/net/usb/Kconfig
@@ -111,6 +111,7 @@ config USB_LAN78XX
 	select MII
 	select PHYLIB
 	select MICROCHIP_PHY
+	select FIXED_PHY
 	help
 	  This option adds support for Microchip LAN78XX based USB 2
 	  & USB 3 10/100/1000 Ethernet adapters.
diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index 0867f7275852..47fa34a2d09f 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -36,13 +36,12 @@
 #include <linux/irq.h>
 #include <linux/irqchip/chained_irq.h>
 #include <linux/microchipphy.h>
-#include <linux/phy.h>
+#include <linux/phy_fixed.h>
 #include "lan78xx.h"
 
 #define DRIVER_AUTHOR	"WOOJUNG HUH <woojung.huh@microchip.com>"
 #define DRIVER_DESC	"LAN78XX USB 3.0 Gigabit Ethernet Devices"
 #define DRIVER_NAME	"lan78xx"
-#define DRIVER_VERSION	"1.0.6"
 
 #define TX_TIMEOUT_JIFFIES		(5 * HZ)
 #define THROTTLE_JIFFIES		(HZ / 8)
@@ -402,6 +401,7 @@ struct lan78xx_net {
 	struct statstage	stats;
 
 	struct irq_domain_data	domain_data;
+	struct phy_device	*fixedphy;
 };
 
 /* define external phy id */
@@ -1477,7 +1477,6 @@ static void lan78xx_get_drvinfo(struct net_device *net,
 	struct lan78xx_net *dev = netdev_priv(net);
 
 	strncpy(info->driver, DRIVER_NAME, sizeof(info->driver));
-	strncpy(info->version, DRIVER_VERSION, sizeof(info->version));
 	usb_make_path(dev->udev, info->bus_info, sizeof(info->bus_info));
 }
 
@@ -2003,52 +2002,91 @@ static int ksz9031rnx_fixup(struct phy_device *phydev)
 	return 1;
 }
 
-static int lan78xx_phy_init(struct lan78xx_net *dev)
+static int lan7801_phy_init(struct phy_device **phydev,
+			    struct lan78xx_net *dev)
 {
+	u32 buf;
 	int ret;
-	u32 mii_adv;
-	struct phy_device *phydev;
-
-	phydev = phy_find_first(dev->mdiobus);
-	if (!phydev) {
-		netdev_err(dev->net, "no PHY found\n");
-		return -EIO;
-	}
-
-	if ((dev->chipid == ID_REV_CHIP_ID_7800_) ||
-	    (dev->chipid == ID_REV_CHIP_ID_7850_)) {
-		phydev->is_internal = true;
-		dev->interface = PHY_INTERFACE_MODE_GMII;
-
-	} else if (dev->chipid == ID_REV_CHIP_ID_7801_) {
-		if (!phydev->drv) {
+	struct fixed_phy_status fphy_status = {
+		.link = 1,
+		.speed = SPEED_1000,
+		.duplex = DUPLEX_FULL,
+	};
+
+	if (!*phydev) {
+		netdev_dbg(dev->net, "PHY Not Found!! Registering Fixed PHY\n");
+		*phydev = fixed_phy_register(PHY_POLL, &fphy_status, -1,
+					     NULL);
+		if (IS_ERR(*phydev)) {
+			netdev_err(dev->net, "No PHY/fixed_PHY found\n");
+			return -ENODEV;
+		}
+		netdev_dbg(dev->net, "Registered FIXED PHY\n");
+		dev->interface = PHY_INTERFACE_MODE_RGMII;
+		dev->fixedphy = *phydev;
+		ret = lan78xx_write_reg(dev, MAC_RGMII_ID,
+					MAC_RGMII_ID_TXC_DELAY_EN_);
+		ret = lan78xx_write_reg(dev, RGMII_TX_BYP_DLL, 0x3D00);
+		ret = lan78xx_read_reg(dev, HW_CFG, &buf);
+		buf |= HW_CFG_CLK125_EN_;
+		buf |= HW_CFG_REFCLK25_EN_;
+		ret = lan78xx_write_reg(dev, HW_CFG, buf);
+	} else {
+		if (!(*phydev)->drv) {
 			netdev_err(dev->net, "no PHY driver found\n");
 			return -EIO;
 		}
-
 		dev->interface = PHY_INTERFACE_MODE_RGMII;
-
 		/* external PHY fixup for KSZ9031RNX */
 		ret = phy_register_fixup_for_uid(PHY_KSZ9031RNX, 0xfffffff0,
 						 ksz9031rnx_fixup);
 		if (ret < 0) {
-			netdev_err(dev->net, "fail to register fixup\n");
+			netdev_err(dev->net, "fail to register fixup PHY_KSZ9031RNX\n");
 			return ret;
 		}
 		/* external PHY fixup for LAN8835 */
 		ret = phy_register_fixup_for_uid(PHY_LAN8835, 0xfffffff0,
 						 lan8835_fixup);
 		if (ret < 0) {
-			netdev_err(dev->net, "fail to register fixup\n");
+			netdev_err(dev->net, "fail to register fixup for PHY_LAN8835\n");
 			return ret;
 		}
 		/* add more external PHY fixup here if needed */
 
-		phydev->is_internal = false;
-	} else {
-		netdev_err(dev->net, "unknown ID found\n");
-		ret = -EIO;
-		goto error;
+		(*phydev)->is_internal = false;
+	}
+	return 0;
+}
+
+static int lan78xx_phy_init(struct lan78xx_net *dev)
+{
+	int ret;
+	u32 mii_adv;
+	struct phy_device *phydev;
+
+	phydev = phy_find_first(dev->mdiobus);
+	switch (dev->chipid) {
+	case ID_REV_CHIP_ID_7801_:
+		ret = lan7801_phy_init(&phydev, dev);
+		if (ret < 0) {
+			netdev_err(dev->net, "lan7801: PHY Init Failed");
+			return ret;
+		}
+		break;
+
+	case ID_REV_CHIP_ID_7800_:
+	case ID_REV_CHIP_ID_7850_:
+		if (!phydev) {
+			netdev_err(dev->net, "no PHY found\n");
+			return -EIO;
+		}
+		phydev->is_internal = true;
+		dev->interface = PHY_INTERFACE_MODE_GMII;
+		break;
+
+	default:
+		netdev_err(dev->net, "Unknown CHIP ID found\n");
+		return -EIO;
 	}
 
 	/* if phyirq is not set, use polling mode in phylib */
@@ -2067,6 +2105,12 @@ static int lan78xx_phy_init(struct lan78xx_net *dev)
 	if (ret) {
 		netdev_err(dev->net, "can't attach PHY to %s\n",
 			   dev->mdiobus->id);
+		if (dev->chipid == ID_REV_CHIP_ID_7801_) {
+			phy_unregister_fixup_for_uid(PHY_KSZ9031RNX,
+						     0xfffffff0);
+			phy_unregister_fixup_for_uid(PHY_LAN8835,
+						     0xfffffff0);
+		}
 		return -EIO;
 	}
 
@@ -2084,12 +2128,6 @@ static int lan78xx_phy_init(struct lan78xx_net *dev)
 	dev->fc_autoneg = phydev->autoneg;
 
 	return 0;
-
-error:
-	phy_unregister_fixup_for_uid(PHY_KSZ9031RNX, 0xfffffff0);
-	phy_unregister_fixup_for_uid(PHY_LAN8835, 0xfffffff0);
-
-	return ret;
 }
 
 static int lan78xx_set_rx_max_frame_length(struct lan78xx_net *dev, int size)
@@ -3501,6 +3539,11 @@ static void lan78xx_disconnect(struct usb_interface *intf)
 
 	phy_disconnect(net->phydev);
 
+	if (dev->fixedphy) {
+		fixed_phy_unregister(dev->fixedphy);
+		dev->fixedphy = NULL;
+	}
+
 	unregister_netdev(net);
 
 	cancel_delayed_work_sync(&dev->wq);
-- 
2.16.2

^ permalink raw reply related

* Re: [PATCH] can: janz-ican3: fix ican3_xmit()'s return type
From: Marc Kleine-Budde @ 2018-04-25  6:47 UTC (permalink / raw)
  To: Luc Van Oostenryck, linux-kernel; +Cc: Wolfgang Grandegger, linux-can, netdev
In-Reply-To: <20180424131609.3258-1-luc.vanoostenryck@gmail.com>


[-- Attachment #1.1: Type: text/plain, Size: 717 bytes --]

On 04/24/2018 03:16 PM, Luc Van Oostenryck wrote:
> The method ndo_start_xmit() is defined as returning an 'netdev_tx_t',
> which is a typedef for an enum type, but the implementation in this
> driver returns an 'int'.
> 
> Fix this by returning 'netdev_tx_t' in this driver too.
> 
> Signed-off-by: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
> ---

Can you send a series fixing the return type in all CAN drivers?

Marc

-- 
Pengutronix e.K.                  | Marc Kleine-Budde           |
Industrial Linux Solutions        | Phone: +49-231-2826-924     |
Vertretung West/Dortmund          | Fax:   +49-5121-206917-5555 |
Amtsgericht Hildesheim, HRA 2686  | http://www.pengutronix.de   |


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* [PATCH bpf-next] bpf: clear the ip_tunnel_info.
From: William Tu @ 2018-04-25  6:46 UTC (permalink / raw)
  To: netdev

The percpu metadata_dst might carry the stale ip_tunnel_info
and cause incorrect behavior.  When mixing tests using ipv4/ipv6
bpf vxlan and geneve tunnel, the ipv6 tunnel info incorrectly uses
ipv4's src ip addr as its ipv6 src address, because the previous
tunnel info does not clean up.  The patch zeros the fields in
ip_tunnel_info.

Signed-off-by: William Tu <u9012063@gmail.com>
Reported-by: Yifeng Sun <pkusunyifeng@gmail.com>
---
 net/core/filter.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/core/filter.c b/net/core/filter.c
index 8e45c6c7ab08..d3781daa26ab 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3281,6 +3281,7 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
 	skb_dst_set(skb, (struct dst_entry *) md);
 
 	info = &md->u.tun_info;
+	memset(info, 0, sizeof(*info));
 	info->mode = IP_TUNNEL_INFO_TX;
 
 	info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
-- 
2.7.4

^ permalink raw reply related

* e1000e I219 timestamping oops related to TSYNCRXCTL read
From: Benjamin Poirier @ 2018-04-25  6:52 UTC (permalink / raw)
  To: Bruce Allan, Yanir Lubetkin, Jacob Keller, Neftin, Sasha
  Cc: Alexander Duyck, Jeff Kirsher, Achim Mildenberger, olouvignes,
	jayanth, ehabkost, postmodern.mod3, Bart.VanAssche,
	intel-wired-lan, netdev

In the following openSUSE bug report
https://bugzilla.suse.com/show_bug.cgi?id=1075876
Achim reported an oops related to e1000e timestamping:
kernel: RIP: 0010:[<ffffffff8110303f>] timecounter_read+0xf/0x50
[...]
kernel: Call Trace:
kernel:  [<ffffffffa0806b0f>] e1000e_phc_gettime+0x2f/0x60 [e1000e]
kernel:  [<ffffffffa0806c5d>] e1000e_systim_overflow_work+0x1d/0x80 [e1000e]
kernel:  [<ffffffff810992c5>] process_one_work+0x155/0x440
kernel:  [<ffffffff81099e16>] worker_thread+0x116/0x4b0
kernel:  [<ffffffff8109f422>] kthread+0xd2/0xf0
kernel:  [<ffffffff8163184f>] ret_from_fork+0x3f/0x70

It always occurs 4 hours after boot but not on every boot. It is most
likely the same problem reported here:
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1668356
http://lkml.iu.edu/hypermail/linux/kernel/1506.2/index.html#02530
https://bugzilla.redhat.com/show_bug.cgi?id=1463882
https://bugzilla.redhat.com/show_bug.cgi?id=1431863

This occurs with MAC: 12, e1000_pch_spt/I219. The reporter has
reproduced it on a v4.16 derivative.

We've traced it to the fact that e1000e_systim_reset() skips the
timecounter_init() call if e1000e_get_base_timinca() returns -EINVAL,
which leads to a null deref in timecounter_read() (see comment 8 of the
suse bugzilla for more details.)

In commit 83129b37ef35 ("e1000e: fix systim issues", v4.2-rc1) Yanir
reworked e1000e_get_base_timinca() in such a way that it can return
-EINVAL for e1000_pch_spt if the SYSCFI bit is not set in TSYNCRXCTL.
This is also the commit that was identified by bisection in the second
link above (lkml).

What we've observed (in comment 14) is that TSYNCRXCTL reads sometimes
don't have the SYSCFI bit set. Retrying the read shortly after finds the
bit to be set. This was observed at boot (probe) but also link up and
link down.

I have a few questions:

What's the purpose of the SYSCFI bit in TSYNCRXCTL ("Reserved" in the
datasheet)?

Why does it look like subsequent reads of TSYNCRXCTL sometimes have the
SYSCFI bit set/not set on I219?

Is it right to check the SYSCFI bit in e1000e_get_base_timinca() for
_spt and return -EINVAL if it's not set? Could we just remove that
check?

The patch in comment 13 of the suse bugzilla works around the problem by
retrying TSYNCRXCTL reads, maybe we could instead remove that read
altogether or move the timecounter_init() call to at least avoid the
oops. The best approach to take seems to depend on the behavior expected
of TSYNCRXCTL reads on I219 so I'm hoping that you could provide more
info on that.

Thanks,
-Benjamin

^ permalink raw reply

* Re: [PATCH 1/1] IB/rxe: avoid double kfree_skb
From: Yanjun Zhu @ 2018-04-25  6:56 UTC (permalink / raw)
  To: Doug Ledford, monis, jgg, linux-rdma; +Cc: netdev
In-Reply-To: <0b87f416-eee1-fd6c-c386-27469d6db143@oracle.com>

Hi, all

rxe_send [rdma_rxe]
     ip_local_out
         __ip_local_out
         ip_output
             ip_finish_output
                 ip_finish_output2
                     dev_queue_xmit
                         __dev_queue_xmit
                             dev_hard_start_xmit
                                 e1000_xmit_frame [e1000]

When skb is sent, it will pass the above functions. I checked all the 
above functions. If error occurs in the above functions after 
ip_local_out, kfree_skb will be called.
So when ip_local_out returns an error, skb should be freed. It is not 
necessary to call kfree_skb in soft roce module again.

If I am wrong, please correct me.

Zhu Yanjun
On 2018/4/24 16:34, Yanjun Zhu wrote:
> Hi, all
>
> rxe_send
>     ip_local_out
>         __ip_local_out
>             nf_hook_slow
>
> In the above call process, nf_hook_slow drops and frees skb, then 
> -EPERM is returned when iptables rules(iptables -I OUTPUT -p udp 
> --dport 4791 -j DROP) is set.
>
> If skb->users is not changed in softroce, kfree_skb should not be 
> called in this module.
>
> I will make further investigations about other error handler after 
> ip_local_out.
> If I am wrong, please correct me.
>
> Any reply is appreciated.
>
> Zhu Yanjun
> On 2018/4/20 13:46, Yanjun Zhu wrote:
>>
>>
>> On 2018/4/20 10:19, Doug Ledford wrote:
>>> On Thu, 2018-04-19 at 10:01 -0400, Zhu Yanjun wrote:
>>>> When skb is dropped by iptables rules, the skb is freed at the same 
>>>> time
>>>> -EPERM is returned. So in softroce, it is not necessary to free skb 
>>>> again.
>>>> Or else, crash will occur.
>>>>
>>>> The steps to reproduce:
>>>>
>>>>       server                       client
>>>>      ---------                    ---------
>>>>      |1.1.1.1|<----rxe-channel--->|1.1.1.2|
>>>>      ---------                    ---------
>>>>
>>>> On server: rping -s -a 1.1.1.1 -v -C 10000 -S 512
>>>> On client: rping -c -a 1.1.1.1 -v -C 10000 -S 512
>>>>
>>>> The kernel configs CONFIG_DEBUG_KMEMLEAK and
>>>> CONFIG_DEBUG_OBJECTS are enabled on both server and client.
>>>>
>>>> When rping runs, run the following command in server:
>>>>
>>>> iptables -I OUTPUT -p udp  --dport 4791 -j DROP
>>>>
>>>> Without this patch, crash will occur.
>>>>
>>>> CC: Srinivas Eeda <srinivas.eeda@oracle.com>
>>>> CC: Junxiao Bi <junxiao.bi@oracle.com>
>>>> Signed-off-by: Zhu Yanjun <yanjun.zhu@oracle.com>
>>>> Reviewed-by: Yuval Shaia <yuval.shaia@oracle.com>
>>> I have no reason to doubt your analysis, but if there are a bunch of
>>> error paths for net_xmit and they all return with your skb still being
>>> valid and holding a reference, and then one oddball that returns with
>>> your skb already gone, that just sounds like a mistake waiting to 
>>> happen
>>> (not to mention a bajillion special cases sprinkled everywhere to deal
>>> with this apparent inconsistency).
>>>
>>> Can we get a netdev@ confirmation on this being the right solution?
>> Yes. I agree with you.
>> After iptables rule "iptables -I OUTPUT -p udp  --dport 4791 -j 
>> DROP", the skb is freed in this function
>>
>> /* Returns 1 if okfn() needs to be executed by the caller,
>>  * -EPERM for NF_DROP, 0 otherwise.  Caller must hold rcu_read_lock. */
>> int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
>>                  const struct nf_hook_entries *e, unsigned int s)
>> {
>>         unsigned int verdict;
>>         int ret;
>>
>>         for (; s < e->num_hook_entries; s++) {
>>                 verdict = nf_hook_entry_hookfn(&e->hooks[s], skb, 
>> state);
>>                 switch (verdict & NF_VERDICT_MASK) {
>>                 case NF_ACCEPT:
>>                         break;
>>                 case NF_DROP:
>> kfree_skb(skb);                               <----here, skb is freed
>>                         ret = NF_DROP_GETERR(verdict);
>>                         if (ret == 0)
>>                                 ret = -EPERM;
>>                         return ret;
>>                 case NF_QUEUE:
>>                         ret = nf_queue(skb, state, e, s, verdict);
>>                         if (ret == 1)
>>                                 continue;
>>                         return ret;
>>                 default:
>>                         /* Implicit handling for NF_STOLEN, as well 
>> as any other
>>                          * non conventional verdicts.
>>                          */
>>                         return 0;
>>                 }
>>         }
>>
>>         return 1;
>> }
>> EXPORT_SYMBOL(nf_hook_slow);
>>
>> If I am wrong, please correct me.
>>
>> And my test environment is still there, any solution can be verified 
>> in it.
>>
>> Zhu Yanjun
>>>
>>>> ---
>>>>   drivers/infiniband/sw/rxe/rxe_net.c  | 3 +++
>>>>   drivers/infiniband/sw/rxe/rxe_req.c  | 5 +++--
>>>>   drivers/infiniband/sw/rxe/rxe_resp.c | 9 ++++++---
>>>>   3 files changed, 12 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c 
>>>> b/drivers/infiniband/sw/rxe/rxe_net.c
>>>> index 9da6e37..2094434 100644
>>>> --- a/drivers/infiniband/sw/rxe/rxe_net.c
>>>> +++ b/drivers/infiniband/sw/rxe/rxe_net.c
>>>> @@ -511,6 +511,9 @@ int rxe_send(struct rxe_pkt_info *pkt, struct 
>>>> sk_buff *skb)
>>>>            if (unlikely(net_xmit_eval(err))) {
>>>>                  pr_debug("error sending packet: %d\n", err);
>>>> +               /* -EPERM means the skb is dropped and freed. */
>>>> +               if (err == -EPERM)
>>>> +                       return -EPERM;
>>>>                  return -EAGAIN;
>>>>          }
>>>>   diff --git a/drivers/infiniband/sw/rxe/rxe_req.c 
>>>> b/drivers/infiniband/sw/rxe/rxe_req.c
>>>> index 7bdaf71..9d2efec 100644
>>>> --- a/drivers/infiniband/sw/rxe/rxe_req.c
>>>> +++ b/drivers/infiniband/sw/rxe/rxe_req.c
>>>> @@ -727,8 +727,9 @@ int rxe_requester(void *arg)
>>>>                    rollback_state(wqe, qp, &rollback_wqe, 
>>>> rollback_psn);
>>>>   -               if (ret == -EAGAIN) {
>>>> -                       kfree_skb(skb);
>>>> +               if ((ret == -EAGAIN) || (ret == -EPERM)) {
>>>> +                       if (ret == -EAGAIN)
>>>> +                               kfree_skb(skb);
>>>>                          rxe_run_task(&qp->req.task, 1);
>>>>                          goto exit;
>>>>                  }
>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c 
>>>> b/drivers/infiniband/sw/rxe/rxe_resp.c
>>>> index a65c996..6bdf9b2 100644
>>>> --- a/drivers/infiniband/sw/rxe/rxe_resp.c
>>>> +++ b/drivers/infiniband/sw/rxe/rxe_resp.c
>>>> @@ -742,7 +742,8 @@ static enum resp_states read_reply(struct 
>>>> rxe_qp *qp,
>>>>          err = rxe_xmit_packet(rxe, qp, &ack_pkt, skb);
>>>>          if (err) {
>>>>                  pr_err("Failed sending RDMA reply.\n");
>>>> -               kfree_skb(skb);
>>>> +               if (err != -EPERM)
>>>> +                       kfree_skb(skb);
>>>>                  return RESPST_ERR_RNR;
>>>>          }
>>>>   @@ -956,7 +957,8 @@ static int send_ack(struct rxe_qp *qp, struct 
>>>> rxe_pkt_info *pkt,
>>>>          err = rxe_xmit_packet(rxe, qp, &ack_pkt, skb);
>>>>          if (err) {
>>>>                  pr_err_ratelimited("Failed sending ack\n");
>>>> -               kfree_skb(skb);
>>>> +               if (err != -EPERM)
>>>> +                       kfree_skb(skb);
>>>>          }
>>>>     err1:
>>>> @@ -1141,7 +1143,8 @@ static enum resp_states 
>>>> duplicate_request(struct rxe_qp *qp,
>>>>                          if (rc) {
>>>>                                  pr_err("Failed resending result. 
>>>> This flow is not handled - skb ignored\n");
>>>>                                  rxe_drop_ref(qp);
>>>> -                               kfree_skb(skb_copy);
>>>> +                               if (rc != -EPERM)
>>>> +                                       kfree_skb(skb_copy);
>>>>                                  rc = RESPST_CLEANUP;
>>>>                                  goto out;
>>>>                          }
>>>> -- 
>>>> 2.7.4
>>>>
>>
>> -- 
>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox