Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v2] netfilter: SYNPROXY: fix process non tcp packet bug in {ipv4,ipv6}_synproxy_hook
From: Lin Zhang @ 2017-09-30 10:25 UTC (permalink / raw)
  To: pablo, kadlec, fw, davem, kuznet, yoshfuji
  Cc: netfilter-devel, coreteam, netdev, Lin Zhang

In function {ipv4,ipv6}_synproxy_hook we expect a normal TCP packet,
but the real server may reply with an ICMP error packet related to an
existing TCP conntrack, so we would access wrong TCP data.

To fix it, check the protocol field and only process TCP traffic.

Signed-off-by: Lin Zhang <xiaolou4617@gmail.com>
---
changelog:

v1 -> v2:
	* we only deal with TCP traffic, per Pablo Neira.
---
 net/ipv4/netfilter/ipt_SYNPROXY.c  | 3 ++-
 net/ipv6/netfilter/ip6t_SYNPROXY.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 811689e..13bec90 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -330,7 +330,8 @@ static unsigned int ipv4_synproxy_hook(void *priv,
 	if (synproxy == NULL)
 		return NF_ACCEPT;
 
-	if (nf_is_loopback_packet(skb))
+	if (nf_is_loopback_packet(skb) ||
+	    ip_hdr(skb)->protocol != IPPROTO_TCP)
 		return NF_ACCEPT;
 
 	thoff = ip_hdrlen(skb);
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index a5cd43d..af19c16 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -347,7 +347,8 @@ static unsigned int ipv6_synproxy_hook(void *priv,
 	if (synproxy == NULL)
 		return NF_ACCEPT;
 
-	if (nf_is_loopback_packet(skb))
+	if (nf_is_loopback_packet(skb) ||
+	    ipv6_hdr(skb)->protocol != IPPROTO_TCP)
 		return NF_ACCEPT;
 
 	nexthdr = ipv6_hdr(skb)->nexthdr;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH] net: stmmac: dwmac-rk: Add RK3128 GMAC support
From: David Wu @ 2017-09-30  9:47 UTC (permalink / raw)
  To: davem, heiko
  Cc: peppe.cavallaro, alexandre.torgue, huangtao, netdev,
	linux-rockchip, linux-kernel, David Wu

Add constants and callback functions for the dwmac on rk3128 soc.
As can be seen, the base structure is the same, only registers
and the bits in them moved slightly.

Signed-off-by: David Wu <david.wu@rock-chips.com>
---
 .../devicetree/bindings/net/rockchip-dwmac.txt     |   1 +
 drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c     | 112 +++++++++++++++++++++
 2 files changed, 113 insertions(+)

diff --git a/Documentation/devicetree/bindings/net/rockchip-dwmac.txt b/Documentation/devicetree/bindings/net/rockchip-dwmac.txt
index 6af8eed..9c16ee2 100644
--- a/Documentation/devicetree/bindings/net/rockchip-dwmac.txt
+++ b/Documentation/devicetree/bindings/net/rockchip-dwmac.txt
@@ -4,6 +4,7 @@ The device node has following properties.
 
 Required properties:
  - compatible: should be "rockchip,<name>-gamc"
+   "rockchip,rk3128-gmac": found on RK312x SoCs
    "rockchip,rk3228-gmac": found on RK322x SoCs
    "rockchip,rk3288-gmac": found on RK3288 SoCs
    "rockchip,rk3328-gmac": found on RK3328 SoCs
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
index 99823f5..13133b3 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
@@ -83,6 +83,117 @@ struct rk_priv_data {
 	(((tx) ? soc##_GMAC_TXCLK_DLY_ENABLE : soc##_GMAC_TXCLK_DLY_DISABLE) | \
 	 ((rx) ? soc##_GMAC_RXCLK_DLY_ENABLE : soc##_GMAC_RXCLK_DLY_DISABLE))
 
+#define RK3128_GRF_MAC_CON0	0x0168
+#define RK3128_GRF_MAC_CON1	0x016c
+
+/* RK3128_GRF_MAC_CON0 */
+#define RK3128_GMAC_TXCLK_DLY_ENABLE   GRF_BIT(14)
+#define RK3128_GMAC_TXCLK_DLY_DISABLE  GRF_CLR_BIT(14)
+#define RK3128_GMAC_RXCLK_DLY_ENABLE   GRF_BIT(15)
+#define RK3128_GMAC_RXCLK_DLY_DISABLE  GRF_CLR_BIT(15)
+#define RK3128_GMAC_CLK_RX_DL_CFG(val) HIWORD_UPDATE(val, 0x7F, 7)
+#define RK3128_GMAC_CLK_TX_DL_CFG(val) HIWORD_UPDATE(val, 0x7F, 0)
+
+/* RK3128_GRF_MAC_CON1 */
+#define RK3128_GMAC_PHY_INTF_SEL_RGMII	\
+		(GRF_BIT(6) | GRF_CLR_BIT(7) | GRF_CLR_BIT(8))
+#define RK3128_GMAC_PHY_INTF_SEL_RMII	\
+		(GRF_CLR_BIT(6) | GRF_CLR_BIT(7) | GRF_BIT(8))
+#define RK3128_GMAC_FLOW_CTRL          GRF_BIT(9)
+#define RK3128_GMAC_FLOW_CTRL_CLR      GRF_CLR_BIT(9)
+#define RK3128_GMAC_SPEED_10M          GRF_CLR_BIT(10)
+#define RK3128_GMAC_SPEED_100M         GRF_BIT(10)
+#define RK3128_GMAC_RMII_CLK_25M       GRF_BIT(11)
+#define RK3128_GMAC_RMII_CLK_2_5M      GRF_CLR_BIT(11)
+#define RK3128_GMAC_CLK_125M           (GRF_CLR_BIT(12) | GRF_CLR_BIT(13))
+#define RK3128_GMAC_CLK_25M            (GRF_BIT(12) | GRF_BIT(13))
+#define RK3128_GMAC_CLK_2_5M           (GRF_CLR_BIT(12) | GRF_BIT(13))
+#define RK3128_GMAC_RMII_MODE          GRF_BIT(14)
+#define RK3128_GMAC_RMII_MODE_CLR      GRF_CLR_BIT(14)
+
+static void rk3128_set_to_rgmii(struct rk_priv_data *bsp_priv,
+				int tx_delay, int rx_delay)
+{
+	struct device *dev = &bsp_priv->pdev->dev;
+
+	if (IS_ERR(bsp_priv->grf)) {
+		dev_err(dev, "Missing rockchip,grf property\n");
+		return;
+	}
+
+	regmap_write(bsp_priv->grf, RK3128_GRF_MAC_CON1,
+		     RK3128_GMAC_PHY_INTF_SEL_RGMII |
+		     RK3128_GMAC_RMII_MODE_CLR);
+	regmap_write(bsp_priv->grf, RK3128_GRF_MAC_CON0,
+		     DELAY_ENABLE(RK3128, tx_delay, rx_delay) |
+		     RK3128_GMAC_CLK_RX_DL_CFG(rx_delay) |
+		     RK3128_GMAC_CLK_TX_DL_CFG(tx_delay));
+}
+
+static void rk3128_set_to_rmii(struct rk_priv_data *bsp_priv)
+{
+	struct device *dev = &bsp_priv->pdev->dev;
+
+	if (IS_ERR(bsp_priv->grf)) {
+		dev_err(dev, "Missing rockchip,grf property\n");
+		return;
+	}
+
+	regmap_write(bsp_priv->grf, RK3128_GRF_MAC_CON1,
+		     RK3128_GMAC_PHY_INTF_SEL_RMII | RK3128_GMAC_RMII_MODE);
+}
+
+static void rk3128_set_rgmii_speed(struct rk_priv_data *bsp_priv, int speed)
+{
+	struct device *dev = &bsp_priv->pdev->dev;
+
+	if (IS_ERR(bsp_priv->grf)) {
+		dev_err(dev, "Missing rockchip,grf property\n");
+		return;
+	}
+
+	if (speed == 10)
+		regmap_write(bsp_priv->grf, RK3128_GRF_MAC_CON1,
+			     RK3128_GMAC_CLK_2_5M);
+	else if (speed == 100)
+		regmap_write(bsp_priv->grf, RK3128_GRF_MAC_CON1,
+			     RK3128_GMAC_CLK_25M);
+	else if (speed == 1000)
+		regmap_write(bsp_priv->grf, RK3128_GRF_MAC_CON1,
+			     RK3128_GMAC_CLK_125M);
+	else
+		dev_err(dev, "unknown speed value for RGMII! speed=%d", speed);
+}
+
+static void rk3128_set_rmii_speed(struct rk_priv_data *bsp_priv, int speed)
+{
+	struct device *dev = &bsp_priv->pdev->dev;
+
+	if (IS_ERR(bsp_priv->grf)) {
+		dev_err(dev, "Missing rockchip,grf property\n");
+		return;
+	}
+
+	if (speed == 10) {
+		regmap_write(bsp_priv->grf, RK3128_GRF_MAC_CON1,
+			     RK3128_GMAC_RMII_CLK_2_5M |
+			     RK3128_GMAC_SPEED_10M);
+	} else if (speed == 100) {
+		regmap_write(bsp_priv->grf, RK3128_GRF_MAC_CON1,
+			     RK3128_GMAC_RMII_CLK_25M |
+			     RK3128_GMAC_SPEED_100M);
+	} else {
+		dev_err(dev, "unknown speed value for RMII! speed=%d", speed);
+	}
+}
+
+static const struct rk_gmac_ops rk3128_ops = {
+	.set_to_rgmii = rk3128_set_to_rgmii,
+	.set_to_rmii = rk3128_set_to_rmii,
+	.set_rgmii_speed = rk3128_set_rgmii_speed,
+	.set_rmii_speed = rk3128_set_rmii_speed,
+};
+
 #define RK3228_GRF_MAC_CON0	0x0900
 #define RK3228_GRF_MAC_CON1	0x0904
 
@@ -1313,6 +1424,7 @@ static int rk_gmac_resume(struct device *dev)
 static SIMPLE_DEV_PM_OPS(rk_gmac_pm_ops, rk_gmac_suspend, rk_gmac_resume);
 
 static const struct of_device_id rk_gmac_dwmac_match[] = {
+	{ .compatible = "rockchip,rk3128-gmac", .data = &rk3128_ops },
 	{ .compatible = "rockchip,rk3228-gmac", .data = &rk3228_ops },
 	{ .compatible = "rockchip,rk3288-gmac", .data = &rk3288_ops },
 	{ .compatible = "rockchip,rk3328-gmac", .data = &rk3328_ops },
-- 
1.9.1

^ permalink raw reply related

* [PATCH v2 2/2] net: phonet: mark phonet_protocol as const
From: Lin Zhang @ 2017-09-30  9:40 UTC (permalink / raw)
  To: davem; +Cc: netdev, courmisch, Lin Zhang

The phonet_protocol structs don't need to be written by anyone and
so can be marked as const.

Signed-off-by: Lin Zhang <xiaolou4617@gmail.com>
---
changelog:

v1 -> v2:
	* remove __read_mostly tag
---
 include/net/phonet/phonet.h |  6 ++++--
 net/phonet/af_phonet.c      | 15 ++++++++-------
 net/phonet/datagram.c       |  2 +-
 net/phonet/pep.c            |  2 +-
 4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/include/net/phonet/phonet.h b/include/net/phonet/phonet.h
index 039cc29..51e1a2a 100644
--- a/include/net/phonet/phonet.h
+++ b/include/net/phonet/phonet.h
@@ -108,8 +108,10 @@ struct phonet_protocol {
 	int			sock_type;
 };
 
-int phonet_proto_register(unsigned int protocol, struct phonet_protocol *pp);
-void phonet_proto_unregister(unsigned int protocol, struct phonet_protocol *pp);
+int phonet_proto_register(unsigned int protocol,
+		const struct phonet_protocol *pp);
+void phonet_proto_unregister(unsigned int protocol,
+		const struct phonet_protocol *pp);
 
 int phonet_sysctl_init(void);
 void phonet_sysctl_exit(void);
diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index b12142e..63b9870 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -35,11 +35,11 @@
 #include <net/phonet/pn_dev.h>
 
 /* Transport protocol registration */
-static struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly;
+static const struct phonet_protocol *proto_tab[PHONET_NPROTO];
 
-static struct phonet_protocol *phonet_proto_get(unsigned int protocol)
+static const struct phonet_protocol *phonet_proto_get(unsigned int protocol)
 {
-	struct phonet_protocol *pp;
+	const struct phonet_protocol *pp;
 
 	if (protocol >= PHONET_NPROTO)
 		return NULL;
@@ -53,7 +53,7 @@ static struct phonet_protocol *phonet_proto_get(unsigned int protocol)
 	return pp;
 }
 
-static inline void phonet_proto_put(struct phonet_protocol *pp)
+static inline void phonet_proto_put(const struct phonet_protocol *pp)
 {
 	module_put(pp->prot->owner);
 }
@@ -65,7 +65,7 @@ static int pn_socket_create(struct net *net, struct socket *sock, int protocol,
 {
 	struct sock *sk;
 	struct pn_sock *pn;
-	struct phonet_protocol *pnp;
+	const struct phonet_protocol *pnp;
 	int err;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -470,7 +470,7 @@ static int phonet_rcv(struct sk_buff *skb, struct net_device *dev,
 static DEFINE_MUTEX(proto_tab_lock);
 
 int __init_or_module phonet_proto_register(unsigned int protocol,
-						struct phonet_protocol *pp)
+				const struct phonet_protocol *pp)
 {
 	int err = 0;
 
@@ -492,7 +492,8 @@ int __init_or_module phonet_proto_register(unsigned int protocol,
 }
 EXPORT_SYMBOL(phonet_proto_register);
 
-void phonet_proto_unregister(unsigned int protocol, struct phonet_protocol *pp)
+void phonet_proto_unregister(unsigned int protocol,
+			const struct phonet_protocol *pp)
 {
 	mutex_lock(&proto_tab_lock);
 	BUG_ON(proto_tab[protocol] != pp);
diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c
index 5e71043..b44fb90 100644
--- a/net/phonet/datagram.c
+++ b/net/phonet/datagram.c
@@ -195,7 +195,7 @@ static int pn_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 	.name		= "PHONET",
 };
 
-static struct phonet_protocol pn_dgram_proto = {
+static const struct phonet_protocol pn_dgram_proto = {
 	.ops		= &phonet_dgram_ops,
 	.prot		= &pn_proto,
 	.sock_type	= SOCK_DGRAM,
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index e815379..9fc76b1 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -1351,7 +1351,7 @@ static void pep_sock_unhash(struct sock *sk)
 	.name		= "PNPIPE",
 };
 
-static struct phonet_protocol pep_pn_proto = {
+static const struct phonet_protocol pep_pn_proto = {
 	.ops		= &phonet_stream_ops,
 	.prot		= &pep_proto,
 	.sock_type	= SOCK_SEQPACKET,
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v2 1/2] net: phonet: mark header_ops as const
From: Lin Zhang @ 2017-09-30  9:39 UTC (permalink / raw)
  To: davem; +Cc: netdev, courmisch, Lin Zhang

Signed-off-by: Lin Zhang <xiaolou4617@gmail.com>
---
 net/phonet/af_phonet.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index f925753..b12142e 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -149,7 +149,7 @@ static int pn_header_parse(const struct sk_buff *skb, unsigned char *haddr)
 	return 1;
 }
 
-struct header_ops phonet_header_ops = {
+const struct header_ops phonet_header_ops = {
 	.create = pn_header_create,
 	.parse = pn_header_parse,
 };
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH 2/2] net: phonet: mark phonet_protocol as const
From: Lin Zhang @ 2017-09-30  9:11 UTC (permalink / raw)
  To: davem; +Cc: netdev, courmisch, Lin Zhang

The phonet_protocol structs don't need to be written by anyone and
so can be marked as const.

Signed-off-by: Lin Zhang <xiaolou4617@gmail.com>
---
 include/net/phonet/phonet.h |  6 ++++--
 net/phonet/af_phonet.c      | 15 ++++++++-------
 net/phonet/datagram.c       |  2 +-
 net/phonet/pep.c            |  2 +-
 4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/include/net/phonet/phonet.h b/include/net/phonet/phonet.h
index 039cc29..51e1a2a 100644
--- a/include/net/phonet/phonet.h
+++ b/include/net/phonet/phonet.h
@@ -108,8 +108,10 @@ struct phonet_protocol {
 	int			sock_type;
 };
 
-int phonet_proto_register(unsigned int protocol, struct phonet_protocol *pp);
-void phonet_proto_unregister(unsigned int protocol, struct phonet_protocol *pp);
+int phonet_proto_register(unsigned int protocol,
+		const struct phonet_protocol *pp);
+void phonet_proto_unregister(unsigned int protocol,
+		const struct phonet_protocol *pp);
 
 int phonet_sysctl_init(void);
 void phonet_sysctl_exit(void);
diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index b12142e..3b0ef69 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -35,11 +35,11 @@
 #include <net/phonet/pn_dev.h>
 
 /* Transport protocol registration */
-static struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly;
+static const struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly;
 
-static struct phonet_protocol *phonet_proto_get(unsigned int protocol)
+static const struct phonet_protocol *phonet_proto_get(unsigned int protocol)
 {
-	struct phonet_protocol *pp;
+	const struct phonet_protocol *pp;
 
 	if (protocol >= PHONET_NPROTO)
 		return NULL;
@@ -53,7 +53,7 @@ static struct phonet_protocol *phonet_proto_get(unsigned int protocol)
 	return pp;
 }
 
-static inline void phonet_proto_put(struct phonet_protocol *pp)
+static inline void phonet_proto_put(const struct phonet_protocol *pp)
 {
 	module_put(pp->prot->owner);
 }
@@ -65,7 +65,7 @@ static int pn_socket_create(struct net *net, struct socket *sock, int protocol,
 {
 	struct sock *sk;
 	struct pn_sock *pn;
-	struct phonet_protocol *pnp;
+	const struct phonet_protocol *pnp;
 	int err;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -470,7 +470,7 @@ static int phonet_rcv(struct sk_buff *skb, struct net_device *dev,
 static DEFINE_MUTEX(proto_tab_lock);
 
 int __init_or_module phonet_proto_register(unsigned int protocol,
-						struct phonet_protocol *pp)
+				const struct phonet_protocol *pp)
 {
 	int err = 0;
 
@@ -492,7 +492,8 @@ int __init_or_module phonet_proto_register(unsigned int protocol,
 }
 EXPORT_SYMBOL(phonet_proto_register);
 
-void phonet_proto_unregister(unsigned int protocol, struct phonet_protocol *pp)
+void phonet_proto_unregister(unsigned int protocol,
+			const struct phonet_protocol *pp)
 {
 	mutex_lock(&proto_tab_lock);
 	BUG_ON(proto_tab[protocol] != pp);
diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c
index 5e71043..b44fb90 100644
--- a/net/phonet/datagram.c
+++ b/net/phonet/datagram.c
@@ -195,7 +195,7 @@ static int pn_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 	.name		= "PHONET",
 };
 
-static struct phonet_protocol pn_dgram_proto = {
+static const struct phonet_protocol pn_dgram_proto = {
 	.ops		= &phonet_dgram_ops,
 	.prot		= &pn_proto,
 	.sock_type	= SOCK_DGRAM,
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index e815379..9fc76b1 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -1351,7 +1351,7 @@ static void pep_sock_unhash(struct sock *sk)
 	.name		= "PNPIPE",
 };
 
-static struct phonet_protocol pep_pn_proto = {
+static const struct phonet_protocol pep_pn_proto = {
 	.ops		= &phonet_stream_ops,
 	.prot		= &pep_proto,
 	.sock_type	= SOCK_SEQPACKET,
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH 1/2] net: phonet: mark header_ops as const
From: Lin Zhang @ 2017-09-30  9:08 UTC (permalink / raw)
  To: davem; +Cc: netdev, courmisch, Lin Zhang

Signed-off-by: Lin Zhang <xiaolou4617@gmail.com>
---
 net/phonet/af_phonet.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index f925753..b12142e 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -149,7 +149,7 @@ static int pn_header_parse(const struct sk_buff *skb, unsigned char *haddr)
 	return 1;
 }
 
-struct header_ops phonet_header_ops = {
+const struct header_ops phonet_header_ops = {
 	.create = pn_header_create,
 	.parse = pn_header_parse,
 };
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net] net: fib_rules: Fix fib_rules_ops->compare implementations to support exact match
From: Shmulik Ladkani @ 2017-09-30  8:59 UTC (permalink / raw)
  To: David S . Miller, netdev
  Cc: Mateusz Bajorski, David Ahern, Thomas Graf, Shmulik Ladkani

From: Shmulik Ladkani <shmulik.ladkani@gmail.com>

Commit 153380ec4b9b ("fib_rules: Added NLM_F_EXCL support to fib_nl_newrule")
added a check to 'fib_nl_newrule' that tests whether the suggested rule
already exists (i.e. has same properties). The check uses
fib_rules_ops->compare() method to compare the protocol specific
properties.

However, the 'compare()' implementations have wildcard semantics:
They compared the src_len of the given 'frh' against existing rules
ONLY if frh->src_len is specified (i.e. non-zero).
Same applies to dst_len and tos fields.

The wildcard behavior exists, since formerly (prior to 153380ec4b9b),
the only use of 'compare' was in fib_nl_delrule, and it was expected
that one can delete a rule using a partial match.

However, the wildcard behavior is incorrect for fib_nl_newrule:
It means that if one specifies 0.0.0.0/0 as the FRA_SRC (or equivalently
doesn't specify a FRA_SRC), frh->src_len will NOT be compared against
other rules, resulting in compare returning true.

This leads to inconsistencies, depending on order of operations, e.g.:

A working sequence:
  # ip ru add from 0.0.0.0/0 iif eth2 pref 22 table 22
  # ip ru add from 10.0.0.0/8 iif eth2 pref 22 table 22
  # ip ru | grep 22                   # both added successfully
  22:  from all iif eth2 lookup 22
  22:  from 10.0.0.0/8 iif eth2 lookup 22

A non-working sequence:
  # ip ru add from 10.0.0.0/8 iif eth2 pref 33 table 33
  # ip ru add from 0.0.0.0/0 iif eth2 pref 33 table 33
  RTNETLINK answers: File exists      # wildcard compare returned true

Fix, by adding an 'exact' mode for comparison:
In exact mode, frh->src_len is compared to existing rules' src_len
regardless whether frh->src_len is non-zero.
(same applies for 'dst_len' and 'tos').

Fixes: 153380ec4b9b ("fib_rules: Added NLM_F_EXCL support to fib_nl_newrule")
Reported-by: Eyal Birger <eyal.birger@gmail.com>
Signed-off-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>
---
 An alternative I considered was changing all 'compare' implementations
 to have "exact" semantics.
 This has the advantage of less user confusion, as both NEWRULE and
 DELRULE have an "exact match" behavior.
 Alas, I assume there are existing users which might rely on the
 wildcard behavior of DELRULE.
 Therefore, this patch results in NEWRULE+NLM_F_EXCL having "exact"
 semantics, but keeps the "wildcard" semantics for DELRULE.
---
 include/net/fib_rules.h | 2 +-
 net/core/fib_rules.c    | 4 ++--
 net/decnet/dn_rules.c   | 6 +++---
 net/ipv4/fib_rules.c    | 8 ++++----
 net/ipv4/ipmr.c         | 2 +-
 net/ipv6/fib6_rules.c   | 8 ++++----
 net/ipv6/ip6mr.c        | 2 +-
 7 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index 3d7f1cefc6f5..23c406282af8 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -74,7 +74,7 @@ struct fib_rules_ops {
 	int			(*delete)(struct fib_rule *);
 	int			(*compare)(struct fib_rule *,
 					   struct fib_rule_hdr *,
-					   struct nlattr **);
+					   struct nlattr **, bool);
 	int			(*fill)(struct fib_rule *, struct sk_buff *,
 					struct fib_rule_hdr *);
 	size_t			(*nlmsg_payload)(struct fib_rule *);
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 9a6d97c1d810..7e5cdee5263a 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -422,7 +422,7 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
 		    !uid_eq(r->uid_range.end, rule->uid_range.end))
 			continue;
 
-		if (!ops->compare(r, frh, tb))
+		if (!ops->compare(r, frh, tb, true))
 			continue;
 		return 1;
 	}
@@ -702,7 +702,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 		     !uid_eq(rule->uid_range.end, range.end)))
 			continue;
 
-		if (!ops->compare(rule, frh, tb))
+		if (!ops->compare(rule, frh, tb, false))
 			continue;
 
 		if (rule->flags & FIB_RULE_PERMANENT) {
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index 295bbd6a56f2..f8117248564c 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -158,14 +158,14 @@ static int dn_fib_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 }
 
 static int dn_fib_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
-			       struct nlattr **tb)
+			       struct nlattr **tb, bool exact)
 {
 	struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
 
-	if (frh->src_len && (r->src_len != frh->src_len))
+	if ((exact || frh->src_len) && r->src_len != frh->src_len)
 		return 0;
 
-	if (frh->dst_len && (r->dst_len != frh->dst_len))
+	if ((exact || frh->dst_len) && r->dst_len != frh->dst_len)
 		return 0;
 
 	if (frh->src_len && (r->src != nla_get_le16(tb[FRA_SRC])))
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 35d646a62ad4..13369b7e247f 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -277,17 +277,17 @@ static int fib4_rule_delete(struct fib_rule *rule)
 }
 
 static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
-			     struct nlattr **tb)
+			     struct nlattr **tb, bool exact)
 {
 	struct fib4_rule *rule4 = (struct fib4_rule *) rule;
 
-	if (frh->src_len && (rule4->src_len != frh->src_len))
+	if ((exact || frh->src_len) && rule4->src_len != frh->src_len)
 		return 0;
 
-	if (frh->dst_len && (rule4->dst_len != frh->dst_len))
+	if ((exact || frh->dst_len) && rule4->dst_len != frh->dst_len)
 		return 0;
 
-	if (frh->tos && (rule4->tos != frh->tos))
+	if ((exact || frh->tos) && rule4->tos != frh->tos)
 		return 0;
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 292a8e80bdfa..13d6702d5b8f 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -192,7 +192,7 @@ static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 }
 
 static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
-			     struct nlattr **tb)
+			     struct nlattr **tb, bool exact)
 {
 	return 1;
 }
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index b240f24a6e52..7ffbff3777ce 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -265,17 +265,17 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 }
 
 static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
-			     struct nlattr **tb)
+			     struct nlattr **tb, bool exact)
 {
 	struct fib6_rule *rule6 = (struct fib6_rule *) rule;
 
-	if (frh->src_len && (rule6->src.plen != frh->src_len))
+	if ((exact || frh->src_len) && rule6->src.plen != frh->src_len)
 		return 0;
 
-	if (frh->dst_len && (rule6->dst.plen != frh->dst_len))
+	if ((exact || frh->dst_len) && rule6->dst.plen != frh->dst_len)
 		return 0;
 
-	if (frh->tos && (rule6->tclass != frh->tos))
+	if ((exact || frh->tos) && rule6->tclass != frh->tos)
 		return 0;
 
 	if (frh->src_len &&
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index f5500f5444e9..95a0f15c4810 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -196,7 +196,7 @@ static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 }
 
 static int ip6mr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
-			      struct nlattr **tb)
+			      struct nlattr **tb, bool exact)
 {
 	return 1;
 }
-- 
2.14.1

^ permalink raw reply related

* Re: tc-ipt v0.2: Extension does not know id 1504083504
From: Corey Hickey @ 2017-09-30  6:52 UTC (permalink / raw)
  To: Sergey K., netdev
In-Reply-To: <CAGv8E+wkpsZL8SMzx+oXei6245RU+SacrBcUN7p2RtXcaBCafw@mail.gmail.com>

On 2017-09-29 23:34, Sergey K. wrote:
> Hello.
> 
> I have to apply this patch
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/shemminger/iproute2.git/commit/?h=v4.10.0&id=97a02cabefb2e2dcfe27f89943709afa84be5525
> 
> to my version of iproute2.
> But now I have a new information message, when I'm using construction like this:
> 
> tc filter add dev eth0 parent ffff: u32 match u32 0 0 action xt -j
> MARK --set-mark 0
> 
> message text:
> tc-ipt v0.2: Extension does not know id 1504083504
> 
> 
> I'm using Debian Stretch, kernel 4.9.0-3-amd64, iptables 1.6.0 and
> patched iproute 4.9.0
> 
> How to solve?

Funny, I just ran into this too and subscribed here to report it. The 
error occurs during parsing of any options to the jump target; if the 
target has no options, there is no error.

The problem seems to be an outdated version of struct xtables_target in 
include/xtables.h. The version in iptables has an additional member 
"udata" that makes the offsets in the struct different for anything 
following.

A quick fix for this particular problem is to copy include/xtables.h from:
git://git.netfilter.org/iptables
...into include/ in the iproute2 source, then recompile after a 'make 
clean'.

As for a comprehensive fix, I don't know--presumably other headers in 
include/ may be out of date, but I don't want to just blindly send a 
patch unless someone who knows the ramifications says it's ok. This 
seems like it would need maintainer oversight. If there's something I 
can do, though, let me know.

-Corey

^ permalink raw reply

* Re: [PATCH v2] netlink: do not proceed if dump's start() errs
From: Johannes Berg @ 2017-09-30  6:56 UTC (permalink / raw)
  To: Jason A. Donenfeld, davem, netdev, linux-kernel
In-Reply-To: <20170927224144.1749-1-Jason@zx2c4.com>

On Thu, 2017-09-28 at 00:41 +0200, Jason A. Donenfeld wrote:
> Drivers that use the start method for netlink dumping rely on dumpit
> not
> being called if start fails. For example, ila_xlat.c allocates memory
> and assigns it to cb->args[0] in its start() function. It might fail
> to
> do that and return -ENOMEM instead. However, even when returning an
> error, dumpit will be called, which, in the example above, quickly
> dereferences the memory in cb->args[0], which will OOPS the kernel.
> This
> is but one example of how this goes wrong.
> 
> Since start() has always been a function with an int return type, it
> therefore makes sense to use it properly, rather than ignoring it.
> This
> patch thus returns early and does not call dumpit() when start()
> fails.
> 
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>

Reviewed-by: Johannes Berg <johannes@sipsolutions.net>

FWIW, I found another (indirect, via genetlink, like ila_xlat.c) in-
tree user that cares and expects the correct failure behaviour:

net/ipv6/seg6.c
	.start  = seg6_genl_dumphmac_start,

which can also have memory allocation failures. No others appear to
exist, afaict.

Either way, perhaps it's worth sending this to stable for that reason.

johannes

^ permalink raw reply

* tc-ipt v0.2: Extension does not know id 1504083504
From: Sergey K. @ 2017-09-30  6:34 UTC (permalink / raw)
  To: netdev

Hello.

I have to apply this patch

https://git.kernel.org/pub/scm/linux/kernel/git/shemminger/iproute2.git/commit/?h=v4.10.0&id=97a02cabefb2e2dcfe27f89943709afa84be5525

to my version of iproute2.
But now I have a new information message, when I'm using construction like this:

tc filter add dev eth0 parent ffff: u32 match u32 0 0 action xt -j
MARK --set-mark 0

message text:
tc-ipt v0.2: Extension does not know id 1504083504


I'm using Debian Stretch, kernel 4.9.0-3-amd64, iptables 1.6.0 and
patched iproute 4.9.0

How to solve?

^ permalink raw reply

* Re: [PATCH v2] netlink: do not proceed if dump's start() errs
From: David Miller @ 2017-09-30  6:27 UTC (permalink / raw)
  To: Jason; +Cc: johannes.berg, netdev, linux-kernel, johannes
In-Reply-To: <20170927224144.1749-1-Jason@zx2c4.com>

From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Thu, 28 Sep 2017 00:41:44 +0200

> Drivers that use the start method for netlink dumping rely on dumpit not
> being called if start fails. For example, ila_xlat.c allocates memory
> and assigns it to cb->args[0] in its start() function. It might fail to
> do that and return -ENOMEM instead. However, even when returning an
> error, dumpit will be called, which, in the example above, quickly
> dereferences the memory in cb->args[0], which will OOPS the kernel. This
> is but one example of how this goes wrong.
> 
> Since start() has always been a function with an int return type, it
> therefore makes sense to use it properly, rather than ignoring it. This
> patch thus returns early and does not call dumpit() when start() fails.
> 
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
> Cc: Johannes Berg <johannes@sipsolutions.net>

Johannes, this looks straightforward to me, but please would you give
it a quick review?

Thank you.

^ permalink raw reply

* Re: [PATCH] net: hns3: Fix an error handling path in 'hclge_rss_init_hw()'
From: Yunsheng Lin @ 2017-09-30  6:10 UTC (permalink / raw)
  To: Christophe JAILLET, yisen.zhuang, salil.mehta, davem, lipeng321,
	colin.king, arnd
  Cc: netdev, linux-kernel, kernel-janitors
In-Reply-To: <20170930053434.4558-1-christophe.jaillet@wanadoo.fr>

Hi, Christophe

On 2017/9/30 13:34, Christophe JAILLET wrote:
> If this sanity check fails, we must free 'rss_indir'. Otherwise there is a
> memory leak.
> 'goto err' as done in the other error handling paths to fix it.

Thanks for fixing.

> 
> Fixes: 46a3df9f9718 ("net: hns3: Fix for setting rss_size incorrectly")
> Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
> ---
>  drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
> index e0685e630afe..c1cdbfd83bdb 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
> @@ -2652,7 +2652,8 @@ static int hclge_rss_init_hw(struct hclge_dev *hdev)
>  		dev_err(&hdev->pdev->dev,
>  			"Configure rss tc size failed, invalid TC_SIZE = %d\n",
>  			rss_size);
> -		return -EINVAL;
> +		ret = -EINVAL;
> +		goto err;
>  	}
>  
>  	roundup_size = roundup_pow_of_two(rss_size);
> 

^ permalink raw reply

* [PATCH] net: hns3: Fix an error handling path in 'hclge_rss_init_hw()'
From: Christophe JAILLET @ 2017-09-30  5:34 UTC (permalink / raw)
  To: yisen.zhuang, salil.mehta, davem, linyunsheng, lipeng321,
	colin.king, arnd
  Cc: netdev, linux-kernel, kernel-janitors, Christophe JAILLET

If this sanity check fails, we must free 'rss_indir'. Otherwise there is a
memory leak.
'goto err' as done in the other error handling paths to fix it.

Fixes: 46a3df9f9718 ("net: hns3: Fix for setting rss_size incorrectly")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index e0685e630afe..c1cdbfd83bdb 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -2652,7 +2652,8 @@ static int hclge_rss_init_hw(struct hclge_dev *hdev)
 		dev_err(&hdev->pdev->dev,
 			"Configure rss tc size failed, invalid TC_SIZE = %d\n",
 			rss_size);
-		return -EINVAL;
+		ret = -EINVAL;
+		goto err;
 	}
 
 	roundup_size = roundup_pow_of_two(rss_size);
-- 
2.11.0

^ permalink raw reply related

* Re: [PATCH] mkiss: remove redundant check on len being zero
From: David Miller @ 2017-09-30  4:44 UTC (permalink / raw)
  To: colin.king
  Cc: stephen, johannes.berg, ralf, netdev, kernel-janitors,
	linux-kernel
In-Reply-To: <20170927214513.22562-1-colin.king@canonical.com>

From: Colin King <colin.king@canonical.com>
Date: Wed, 27 Sep 2017 22:45:13 +0100

> From: Colin Ian King <colin.king@canonical.com>
> 
> The check on len is redundant as it is always greater than 1,
> so just remove it and make the printk less complex.
> 
> Detected by CoverityScan, CID#1226729 ("Logically dead code")
> 
> Signed-off-by: Colin Ian King <colin.king@canonical.com>

Applied, thanks.

^ permalink raw reply

* Re: [PATCH] net-ipv6: add support for sockopt(SOL_IPV6, IPV6_FREEBIND)
From: David Miller @ 2017-09-30  4:31 UTC (permalink / raw)
  To: zenczykowski; +Cc: maze, netdev
In-Reply-To: <20170927043242.151591-1-zenczykowski@gmail.com>

From: Maciej Żenczykowski <zenczykowski@gmail.com>
Date: Tue, 26 Sep 2017 21:32:42 -0700

> From: Maciej Żenczykowski <maze@google.com>
> 
> So far we've been relying on sockopt(SOL_IP, IP_FREEBIND) being usable
> even on IPv6 sockets.
> 
> However, it turns out it is perfectly reasonable to want to set freebind
> on an AF_INET6 SOCK_RAW socket - but there is no way to set any SOL_IP
> socket option on such a socket (they're all blindly errored out).
> 
> One use case for this is to allow spoofing src ip on a raw socket
> via sendmsg cmsg.
> 
> Tested:
>   built, and booted
>   # python
>   >>> import socket
>   >>> SOL_IP = socket.SOL_IP
>   >>> SOL_IPV6 = socket.IPPROTO_IPV6
>   >>> IP_FREEBIND = 15
>   >>> IPV6_FREEBIND = 78
>   >>> s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM, 0)
>   >>> s.getsockopt(SOL_IP, IP_FREEBIND)
>   0
>   >>> s.getsockopt(SOL_IPV6, IPV6_FREEBIND)
>   0
>   >>> s.setsockopt(SOL_IPV6, IPV6_FREEBIND, 1)
>   >>> s.getsockopt(SOL_IP, IP_FREEBIND)
>   1
>   >>> s.getsockopt(SOL_IPV6, IPV6_FREEBIND)
>   1
> 
> Signed-off-by: Maciej Żenczykowski <maze@google.com>

Applied to net-next, thanks.

^ permalink raw reply

* Re: [PATCH net-next] liquidio: fix format truncation warning reported by gcc 7.1.1
From: David Miller @ 2017-09-30  4:29 UTC (permalink / raw)
  To: felix.manlunas; +Cc: netdev, raghu.vatsavayi, derek.chickles, satananda.burla
In-Reply-To: <20170926184827.GA3512@felix-thinkpad.cavium.com>

From: Felix Manlunas <felix.manlunas@cavium.com>
Date: Tue, 26 Sep 2017 11:48:27 -0700

> From: Satanand Burla <satananda.burla@cavium.com>
> 
> gcc 7.1.1 with -Wformat-truncation reports these warnings:
> 
> drivers/net/ethernet/cavium/liquidio/lio_core.c: In function `octeon_setup_interrupt':
> drivers/net/ethernet/cavium/liquidio/lio_core.c:1003:41: warning: `%u' directive output may be truncated writing between 1 and 10 bytes into a region of size between 0 and 13 [-Wformat-truncation=]
>        INTRNAMSIZ, "LiquidIO%u-pf%u-rxtx-%u",
>                                          ^~

Please just increase INTRNAMSIZ as needed.

Thanks.

^ permalink raw reply

* Re: [PATCH net-next] net: ipv6: send NS for DAD when link operationally up
From: David Miller @ 2017-09-30  4:26 UTC (permalink / raw)
  To: mmanning; +Cc: netdev
In-Reply-To: <1506373296-24977-1-git-send-email-mmanning@brocade.com>

From: Mike Manning <mmanning@brocade.com>
Date: Mon, 25 Sep 2017 22:01:36 +0100

> The NS for DAD are sent on admin up as long as a valid qdisc is found.
> A race condition exists by which these packets will not egress the
> interface if the operational state of the lower device is not yet up.
> The solution is to delay DAD until the link is operationally up
> according to RFC2863. Rather than only doing this, follow the existing
> code checks by deferring IPv6 device initialization altogether. The fix
> allows DAD on devices like tunnels that are controlled by userspace
> control plane. The fix has no impact on regular deployments, but means
> that there is no IPv6 connectivity until the port has been opened in
> the case of port-based network access control, which should be
> desirable.
> 
> Signed-off-by: Mike Manning <mmanning@brocade.com>

Applied, thank you.

^ permalink raw reply

* Re: [net-next V2 PATCH 5/5] samples/bpf: add cpumap sample program xdp_redirect_cpu
From: Alexei Starovoitov @ 2017-09-30  3:06 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: netdev, jakub.kicinski, Michael S. Tsirkin, Jason Wang, mchan,
	John Fastabend, peter.waskiewicz.jr, Daniel Borkmann,
	Andy Gospodarek
In-Reply-To: <150670287254.23765.608477053861949304.stgit@firesoul>

On Fri, Sep 29, 2017 at 06:34:32PM +0200, Jesper Dangaard Brouer wrote:
> +
> +static __always_inline
> +u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off)
> +{
> +	void *data_end = (void *)(long)ctx->data_end;
> +	void *data     = (void *)(long)ctx->data;
> +        struct iphdr *iph = data + nh_off;
> +	struct udphdr *udph;
> +	u16 dport;

formatting is off in the above.

> +
> +        if (iph + 1 > data_end)
> +                return 0;
> +	if (!(iph->protocol == IPPROTO_UDP))
> +		return 0;
> +
> +	udph = (void *)(iph + 1);
> +	if (udph + 1 > data_end)
> +		return 0;
> +
> +	dport = ntohs(udph->dest);
> +	return dport;
> +}
> +
> +static __always_inline
> +int get_proto_ipv4(struct xdp_md *ctx, u64 nh_off)
> +{
> +	void *data_end = (void *)(long)ctx->data_end;
> +	void *data     = (void *)(long)ctx->data;
> +        struct iphdr *iph = data + nh_off;
> +
> +        if (iph + 1 > data_end)
> +                return 0;
> +        return iph->protocol;

and here

> +}
> +
> +static __always_inline
> +int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off)
> +{
> +	void *data_end = (void *)(long)ctx->data_end;
> +	void *data     = (void *)(long)ctx->data;
> +        struct ipv6hdr *ip6h = data + nh_off;
> +
> +        if (ip6h + 1 > data_end)
> +                return 0;

and here

> +/*** Trace point code ***/
> +
> +/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
> + * Code in:                kernel/include/trace/events/xdp.h
> + */
> +struct xdp_redirect_ctx {
> +	unsigned short common_type;	//	offset:0;  size:2; signed:0;
> +	unsigned char common_flags;	//	offset:2;  size:1; signed:0;
> +	unsigned char common_preempt_count;//	offset:3;  size:1; signed:0;
> +	int common_pid;			//	offset:4;  size:4; signed:1;

this part is not right. First 8 bytes are not accessible by bpf code.
Please use __u64 pad; or similar here.
Just noticed that samples/bpf/xdp_monitor_kern.c has the same problem.

> +
> +	int prog_id;			//	offset:8;  size:4; signed:1;
> +	u32 act;			//	offset:12  size:4; signed:0;
> +	int ifindex;			//	offset:16  size:4; signed:1;
> +	int err;			//	offset:20  size:4; signed:1;
> +	int to_ifindex;			//	offset:24  size:4; signed:1;
> +	u32 map_id;			//	offset:28  size:4; signed:0;
> +	int map_index;			//	offset:32  size:4; signed:1;
> +};					//	offset:36

the second part of fields is correct.

> +/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
> + * Code in:                kernel/include/trace/events/xdp.h
> + */
> +struct xdp_exception_ctx {
> +	unsigned short common_type;	//	offset:0;  size:2; signed:0;
> +	unsigned char common_flags;	//	offset:2;  size:1; signed:0;
> +	unsigned char common_preempt_count;//	offset:3;  size:1; signed:0;
> +	int common_pid;			//	offset:4;  size:4; signed:1;

same problem here.

> +	int prog_id;			//	offset:8;  size:4; signed:1;
> +	u32 act;			//	offset:12; size:4; signed:0;
> +	int ifindex;			//	offset:16; size:4; signed:1;
> +};
> +
> +SEC("tracepoint/xdp/xdp_exception")
> +int trace_xdp_exception(struct xdp_exception_ctx *ctx)
> +{
> +	struct datarec *rec;
> +	u32 key = 0;
> +
> +	rec = bpf_map_lookup_elem(&exception_cnt, &key);
> +	if (!rec)
> +		return 1;
> +	rec->dropped += 1;
> +
> +	return 0;
> +}
> +
> +/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
> + * Code in:         kernel/include/trace/events/xdp.h
> + */
> +struct cpumap_enqueue_ctx {
> +	unsigned short common_type;	//	offset:0;  size:2; signed:0;
> +	unsigned char common_flags;	//	offset:2;  size:1; signed:0;
> +	unsigned char common_preempt_count;//	offset:3;  size:1; signed:0;
> +	int common_pid;			//	offset:4;  size:4; signed:1;

and here

> +/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
> + * Code in:         kernel/include/trace/events/xdp.h
> + */
> +struct cpumap_kthread_ctx {
> +	unsigned short common_type;	//	offset:0;  size:2; signed:0;
> +	unsigned char common_flags;	//	offset:2;  size:1; signed:0;
> +	unsigned char common_preempt_count;//	offset:3;  size:1; signed:0;
> +	int common_pid;			//	offset:4;  size:4; signed:1;

and here.

> +int main(int argc, char **argv)
> +{
> +	struct rlimit r = {10 * 1024*1024, RLIM_INFINITY};

extra spaces around * wouldn't hurt

> +	/* Parse commands line args */
> +	while ((opt = getopt_long(argc, argv, "hSd:",
> +				  long_options, &longindex)) != -1) {
> +		switch (opt) {
> +		case 'd':
> +			if (strlen(optarg) >= IF_NAMESIZE) {
> +				fprintf(stderr, "ERR: --dev name too long\n");
> +				goto error;
> +			}
> +			ifname = (char *)&ifname_buf;
> +			strncpy(ifname, optarg, IF_NAMESIZE);
> +			ifindex = if_nametoindex(ifname);
> +			if (ifindex == 0) {
> +				fprintf(stderr,
> +					"ERR: --dev name unknown err(%d):%s\n",
> +					errno, strerror(errno));
> +				goto error;
> +			}
> +			break;
> +		case 's':
> +			interval = atoi(optarg);
> +			break;
> +		case 'S':
> +			xdp_flags |= XDP_FLAGS_SKB_MODE;
> +			break;
> +		case 'D':
> +			debug = true;
> +			break;
> +		case 'z':
> +			use_separators = false;
> +			break;
> +		case 'p':
> +			/* Selecting eBPF prog to load */
> +			prog_num = atoi(optarg);
> +			if (prog_num < 0 || prog_num >= MAX_PROG) {
> +				fprintf(stderr,
> +					"--prognum too large err(%d):%s\n",
> +					errno, strerror(errno));
> +				goto error;
> +			}
> +			break;
> +		case 'c':
> +			/* Add multiple CPUs */
> +			add_cpu = strtoul(optarg, NULL, 0);
> +			if (add_cpu > MAX_CPUS) {
> +				fprintf(stderr,
> +				"--cpu nr too large for cpumap err(%d):%s\n",
> +					errno, strerror(errno));
> +				goto error;
> +			}
> +			create_cpu_entry(add_cpu, qsize, added_cpus, true);
> +			added_cpus++;
> +			break;
> +		case 'q':
> +			qsize = atoi(optarg);
> +			break;
> +		case 'h':
> +		error:
> +		default:
> +			usage(argv);
> +			return EXIT_FAIL_OPTION;

the rest looks good. Nice that it parses cmdline properly
and even has -h flag :)

^ permalink raw reply

* Re: [PATCH net-next 2/5] bpf: Add map_name to bpf_map_info
From: Jakub Kicinski @ 2017-09-30  2:52 UTC (permalink / raw)
  To: Martin KaFai Lau; +Cc: netdev, Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <20170930023518.bmc6dbpnz27wunod@kafai-mbp>

On Fri, 29 Sep 2017 19:35:18 -0700, Martin KaFai Lau wrote:
> On Sat, Sep 30, 2017 at 02:07:46AM +0000, Jakub Kicinski wrote:
> > Hi Martin!
> > 
> > On Wed, 27 Sep 2017 14:37:53 -0700, Martin KaFai Lau wrote:  
> > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > index 33ccc474fb04..252f4bc9eb25 100644
> > > --- a/include/linux/bpf.h
> > > +++ b/include/linux/bpf.h
> > > @@ -56,6 +56,7 @@ struct bpf_map {
> > >  	struct work_struct work;
> > >  	atomic_t usercnt;
> > >  	struct bpf_map *inner_map_meta;
> > > +	u8 name[BPF_OBJ_NAME_LEN];  
> > 
> > Any reason not to use plain char?  I was looking at adding names to
> > bpftool and:  
> Happy to have an early user :)

The newly exposed info is so useful I couldn't resist :)

> It was mostly due to my early idea of treating the name as
> a blob without much char checking. That idea was later dropped,
> but the type here was not updated to reflect that.
> 
> I will make a followup patch later.

Awesome, thanks!

^ permalink raw reply

* Re: [PATCH net-next 2/5] bpf: Add map_name to bpf_map_info
From: Martin KaFai Lau @ 2017-09-30  2:35 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: netdev, Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <20170929190746.322df52d@cakuba>

On Sat, Sep 30, 2017 at 02:07:46AM +0000, Jakub Kicinski wrote:
> Hi Martin!
> 
> On Wed, 27 Sep 2017 14:37:53 -0700, Martin KaFai Lau wrote:
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index 33ccc474fb04..252f4bc9eb25 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -56,6 +56,7 @@ struct bpf_map {
> >  	struct work_struct work;
> >  	atomic_t usercnt;
> >  	struct bpf_map *inner_map_meta;
> > +	u8 name[BPF_OBJ_NAME_LEN];
> 
> Any reason not to use plain char?  I was looking at adding names to
> bpftool and:
Happy to have an early user :)

It was mostly due to my early idea of treating the name as
a blob without much char checking. That idea was later dropped,
but the type here was not updated to reflect that.

I will make a followup patch later.
> 
> map.c: In function ‘show_map_close’:
> map.c:386:13: warning: pointer targets in passing argument 1 of ‘strlen’ differ in signedness [-Wpointer-sign]
>   if (strlen(info->name))
>              ^~~~
> In file included from map.c:43:0:
> /usr/include/string.h:399:15: note: expected ‘const char *’ but argument is of type ‘__u8 * {aka unsigned char *}’
>  extern size_t strlen (const char *__s)
>                ^~~~~~
> >  };
> >  
> >  /* function argument constraints */
> 

^ permalink raw reply

* Re: [PATCH net-next 2/5] bpf: Add map_name to bpf_map_info
From: Jakub Kicinski @ 2017-09-30  2:07 UTC (permalink / raw)
  To: Martin KaFai Lau; +Cc: netdev, Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <20170927213756.1254938-3-kafai@fb.com>

Hi Martin!

On Wed, 27 Sep 2017 14:37:53 -0700, Martin KaFai Lau wrote:
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 33ccc474fb04..252f4bc9eb25 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -56,6 +56,7 @@ struct bpf_map {
>  	struct work_struct work;
>  	atomic_t usercnt;
>  	struct bpf_map *inner_map_meta;
> +	u8 name[BPF_OBJ_NAME_LEN];

Any reason not to use plain char?  I was looking at adding names to
bpftool and:

map.c: In function ‘show_map_close’:
map.c:386:13: warning: pointer targets in passing argument 1 of ‘strlen’ differ in signedness [-Wpointer-sign]
  if (strlen(info->name))
             ^~~~
In file included from map.c:43:0:
/usr/include/string.h:399:15: note: expected ‘const char *’ but argument is of type ‘__u8 * {aka unsigned char *}’
 extern size_t strlen (const char *__s)
               ^~~~~~
>  };
>  
>  /* function argument constraints */

^ permalink raw reply

* Re: [PATCH net] RDS: IB: Limit the scope of has_fr/has_fmr variables
From: Santosh Shilimkar @ 2017-09-30  1:38 UTC (permalink / raw)
  To: Avinash Repaka, David S. Miller, netdev, linux-rdma, rds-devel,
	linux-kernel
In-Reply-To: <1506734030-15205-1-git-send-email-avinash.repaka@oracle.com>

On 9/29/2017 6:13 PM, Avinash Repaka wrote:
> This patch fixes the scope of has_fr and has_fmr variables as they are
> needed only in rds_ib_add_one().
> 
> Signed-off-by: Avinash Repaka <avinash.repaka@oracle.com>
> ---
Indeed, the final merged version didn't actually need
those across files. The change looks good to me. Thanks!

Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>

^ permalink raw reply

* Re: [PATCH net-next] vhost_net: do not stall on zerocopy depletion
From: Willem de Bruijn @ 2017-09-30  1:25 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Network Development, David Miller, Jason Wang, Koichiro Den,
	virtualization, Willem de Bruijn
In-Reply-To: <20170929223710-mutt-send-email-mst@kernel.org>

On Fri, Sep 29, 2017 at 3:38 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Wed, Sep 27, 2017 at 08:25:56PM -0400, Willem de Bruijn wrote:
>> From: Willem de Bruijn <willemb@google.com>
>>
>> Vhost-net has a hard limit on the number of zerocopy skbs in flight.
>> When reached, transmission stalls. Stalls cause latency, as well as
>> head-of-line blocking of other flows that do not use zerocopy.
>>
>> Instead of stalling, revert to copy-based transmission.
>>
>> Tested by sending two udp flows from guest to host, one with payload
>> of VHOST_GOODCOPY_LEN, the other too small for zerocopy (1B). The
>> large flow is redirected to a netem instance with 1MBps rate limit
>> and deep 1000 entry queue.
>>
>>   modprobe ifb
>>   ip link set dev ifb0 up
>>   tc qdisc add dev ifb0 root netem limit 1000 rate 1MBit
>>
>>   tc qdisc add dev tap0 ingress
>>   tc filter add dev tap0 parent ffff: protocol ip \
>>       u32 match ip dport 8000 0xffff \
>>       action mirred egress redirect dev ifb0
>>
>> Before the delay, both flows process around 80K pps. With the delay,
>> before this patch, both process around 400. After this patch, the
>> large flow is still rate limited, while the small reverts to its
>> original rate. See also discussion in the first link, below.
>>
>> The limit in vhost_exceeds_maxpend must be carefully chosen. When
>> vq->num >> 1, the flows remain correlated. This value happens to
>> correspond to VHOST_MAX_PENDING for vq->num == 256. Allow smaller
>> fractions and ensure correctness also for much smaller values of
>> vq->num, by testing the min() of both explicitly. See also the
>> discussion in the second link below.
>>
>> Link:http://lkml.kernel.org/r/CAF=yD-+Wk9sc9dXMUq1+x_hh=3ThTXa6BnZkygP3tgVpjbp93g@mail.gmail.com
>> Link:http://lkml.kernel.org/r/20170819064129.27272-1-den@klaipeden.com
>> Signed-off-by: Willem de Bruijn <willemb@google.com>
>
> I'd like to see the effect on the non rate limited case though.
> If guest is quick won't we have lots of copies then?

Yes, but not significantly more than without this patch.

I ran 1, 10 and 100 flow tcp_stream throughput tests from a sender
in the guest to a receiver in the host.

To answer the other benchmark question first, I did not see anything
noteworthy when increasing vq->num from 256 to 1024.

With 1 and 10 flows without this patch all packets use zerocopy.
With the patch, less than 1% eschews zerocopy.

With 100 flows, even without this patch, 90+% of packets are copied.
Some zerocopy packets from vhost_net fail this test in tun.c

    if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS)

Generating packets with up to 21 frags. I'm not sure yet why or
what the fraction of these packets is. But this in turn can
disable zcopy_used in vhost_net_tx_select_zcopy for a
larger share of packets:

        return !net->tx_flush &&
                net->tx_packets / 64 >= net->tx_zcopy_err;

Because the numbers of copied and zerocopy packets are the
same before and after the patch, so are the overall throughput
numbers.

^ permalink raw reply

* Re: [PATCH net-next v2 1/7] net: dsa: add master helper to look up slaves
From: Florian Fainelli @ 2017-09-30  1:14 UTC (permalink / raw)
  To: Vivien Didelot, netdev; +Cc: linux-kernel, kernel, David S. Miller, Andrew Lunn
In-Reply-To: <20170929211921.5571-2-vivien.didelot@savoirfairelinux.com>



On 09/29/2017 02:19 PM, Vivien Didelot wrote:
> The DSA tagging code does not need to know about the DSA architecture,
> it only needs to return the slave device corresponding to the source
> port index (and eventually the source device index for cascade-capable
> switches) parsed from the frame received on the master device.
> 
> For this purpose, provide an inline dsa_master_get_slave helper which
> validates the device and port indexes and looks up the slave device.
> 
> This makes the tagging rcv functions more concise and robust, and also
> makes dsa_get_cpu_port obsolete.
> 
> Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
-- 
Florian

^ permalink raw reply

* [PATCH net] RDS: IB: Limit the scope of has_fr/has_fmr variables
From: Avinash Repaka @ 2017-09-30  1:13 UTC (permalink / raw)
  To: Santosh Shilimkar, David S. Miller, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	rds-devel-N0ozoZBvEnrZJqsBc5GL+g,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
  Cc: Avinash Repaka

This patch fixes the scope of has_fr and has_fmr variables as they are
needed only in rds_ib_add_one().

Signed-off-by: Avinash Repaka <avinash.repaka-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
---
 net/rds/ib.c | 11 ++++++-----
 net/rds/ib.h |  2 --
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/net/rds/ib.c b/net/rds/ib.c
index a0954ac..36dd209 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -126,6 +126,7 @@ void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
 static void rds_ib_add_one(struct ib_device *device)
 {
 	struct rds_ib_device *rds_ibdev;
+	bool has_fr, has_fmr;
 
 	/* Only handle IB (no iWARP) devices */
 	if (device->node_type != RDMA_NODE_IB_CA)
@@ -143,11 +144,11 @@ static void rds_ib_add_one(struct ib_device *device)
 	rds_ibdev->max_wrs = device->attrs.max_qp_wr;
 	rds_ibdev->max_sge = min(device->attrs.max_sge, RDS_IB_MAX_SGE);
 
-	rds_ibdev->has_fr = (device->attrs.device_cap_flags &
-				  IB_DEVICE_MEM_MGT_EXTENSIONS);
-	rds_ibdev->has_fmr = (device->alloc_fmr && device->dealloc_fmr &&
-			    device->map_phys_fmr && device->unmap_fmr);
-	rds_ibdev->use_fastreg = (rds_ibdev->has_fr && !rds_ibdev->has_fmr);
+	has_fr = (device->attrs.device_cap_flags &
+		  IB_DEVICE_MEM_MGT_EXTENSIONS);
+	has_fmr = (device->alloc_fmr && device->dealloc_fmr &&
+		   device->map_phys_fmr && device->unmap_fmr);
+	rds_ibdev->use_fastreg = (has_fr && !has_fmr);
 
 	rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32;
 	rds_ibdev->max_1m_mrs = device->attrs.max_mr ?
diff --git a/net/rds/ib.h b/net/rds/ib.h
index bf48224..6ea6a27 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -215,8 +215,6 @@ struct rds_ib_device {
 	struct list_head	conn_list;
 	struct ib_device	*dev;
 	struct ib_pd		*pd;
-	bool                    has_fmr;
-	bool                    has_fr;
 	bool                    use_fastreg;
 
 	unsigned int		max_mrs;
-- 
2.4.11

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox