Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 10/12] netfilter: cttimeout: ctnl_timeout_find_get() returns incorrect pointer to type
From: Pablo Neira Ayuso @ 2018-09-11  0:20 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180911002044.9100-1-pablo@netfilter.org>

Compiler did not catch incorrect typing in the rcu hook assignment.

 % nfct add timeout test-tcp inet tcp established 100 close 10 close_wait 10
 % iptables -I OUTPUT -t raw -p tcp -j CT --timeout test-tcp
 dmesg - xt_CT: Timeout policy `test-tcp' can only be used by L3 protocol number 25000

The CT target bails out with incorrect layer 3 protocol number.

Fixes: 6c1fd7dc489d ("netfilter: cttimeout: decouple timeout policy from nfnetlink_cttimeout object")
Reported-by: Harsha Sharma <harshasharmaiitr@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink_cttimeout.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index d46a236cdf31..a30f8ba4b89a 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -489,8 +489,8 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl,
 	return err;
 }
 
-static struct ctnl_timeout *
-ctnl_timeout_find_get(struct net *net, const char *name)
+static struct nf_ct_timeout *ctnl_timeout_find_get(struct net *net,
+						   const char *name)
 {
 	struct ctnl_timeout *timeout, *matching = NULL;
 
@@ -509,7 +509,7 @@ ctnl_timeout_find_get(struct net *net, const char *name)
 		break;
 	}
 err:
-	return matching;
+	return matching ? &matching->timeout : NULL;
 }
 
 static void ctnl_timeout_put(struct nf_ct_timeout *t)
-- 
2.11.0

^ permalink raw reply related

* [PATCH 09/12] netfilter: conntrack: timeout interface depend on CONFIG_NF_CONNTRACK_TIMEOUT
From: Pablo Neira Ayuso @ 2018-09-11  0:20 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180911002044.9100-1-pablo@netfilter.org>

Now that cttimeout support for nft_ct is in place, these should depend
on CONFIG_NF_CONNTRACK_TIMEOUT; otherwise we can crash when dumping the
policy if this option is not enabled.

[   71.600121] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
[...]
[   71.600141] CPU: 3 PID: 7612 Comm: nft Not tainted 4.18.0+ #246
[...]
[   71.600188] Call Trace:
[   71.600201]  ? nft_ct_timeout_obj_dump+0xc6/0xf0 [nft_ct]

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_proto_dccp.c    | 12 ++++++------
 net/netfilter/nf_conntrack_proto_generic.c |  8 ++++----
 net/netfilter/nf_conntrack_proto_gre.c     |  8 ++++----
 net/netfilter/nf_conntrack_proto_icmp.c    |  8 ++++----
 net/netfilter/nf_conntrack_proto_icmpv6.c  |  8 ++++----
 net/netfilter/nf_conntrack_proto_sctp.c    | 14 +++++++-------
 net/netfilter/nf_conntrack_proto_tcp.c     | 12 ++++++------
 net/netfilter/nf_conntrack_proto_udp.c     | 20 ++++++++++----------
 8 files changed, 45 insertions(+), 45 deletions(-)

diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index b81f70039828..f3f91ed2c21a 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -675,7 +675,7 @@ static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct)
 }
 #endif
 
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_cttimeout.h>
@@ -728,7 +728,7 @@ dccp_timeout_nla_policy[CTA_TIMEOUT_DCCP_MAX+1] = {
 	[CTA_TIMEOUT_DCCP_CLOSING]	= { .type = NLA_U32 },
 	[CTA_TIMEOUT_DCCP_TIMEWAIT]	= { .type = NLA_U32 },
 };
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 
 #ifdef CONFIG_SYSCTL
 /* template, data assigned later */
@@ -863,7 +863,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 = {
 	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
 	.nla_policy		= nf_ct_port_nla_policy,
 #endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 	.ctnl_timeout		= {
 		.nlattr_to_obj	= dccp_timeout_nlattr_to_obj,
 		.obj_to_nlattr	= dccp_timeout_obj_to_nlattr,
@@ -871,7 +871,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 = {
 		.obj_size	= sizeof(unsigned int) * CT_DCCP_MAX,
 		.nla_policy	= dccp_timeout_nla_policy,
 	},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 	.init_net		= dccp_init_net,
 	.get_net_proto		= dccp_get_net_proto,
 };
@@ -896,7 +896,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = {
 	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
 	.nla_policy		= nf_ct_port_nla_policy,
 #endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 	.ctnl_timeout		= {
 		.nlattr_to_obj	= dccp_timeout_nlattr_to_obj,
 		.obj_to_nlattr	= dccp_timeout_obj_to_nlattr,
@@ -904,7 +904,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = {
 		.obj_size	= sizeof(unsigned int) * CT_DCCP_MAX,
 		.nla_policy	= dccp_timeout_nla_policy,
 	},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 	.init_net		= dccp_init_net,
 	.get_net_proto		= dccp_get_net_proto,
 };
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index ac4a0b296dcd..1df3244ecd07 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -70,7 +70,7 @@ static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb,
 	return ret;
 }
 
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_cttimeout.h>
@@ -113,7 +113,7 @@ static const struct nla_policy
 generic_timeout_nla_policy[CTA_TIMEOUT_GENERIC_MAX+1] = {
 	[CTA_TIMEOUT_GENERIC_TIMEOUT]	= { .type = NLA_U32 },
 };
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 
 #ifdef CONFIG_SYSCTL
 static struct ctl_table generic_sysctl_table[] = {
@@ -164,7 +164,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic =
 	.pkt_to_tuple		= generic_pkt_to_tuple,
 	.packet			= generic_packet,
 	.new			= generic_new,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 	.ctnl_timeout		= {
 		.nlattr_to_obj	= generic_timeout_nlattr_to_obj,
 		.obj_to_nlattr	= generic_timeout_obj_to_nlattr,
@@ -172,7 +172,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic =
 		.obj_size	= sizeof(unsigned int),
 		.nla_policy	= generic_timeout_nla_policy,
 	},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 	.init_net		= generic_init_net,
 	.get_net_proto		= generic_get_net_proto,
 };
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index d1632252bf5b..650eb4fba2c5 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -285,7 +285,7 @@ static void gre_destroy(struct nf_conn *ct)
 		nf_ct_gre_keymap_destroy(master);
 }
 
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_cttimeout.h>
@@ -334,7 +334,7 @@ gre_timeout_nla_policy[CTA_TIMEOUT_GRE_MAX+1] = {
 	[CTA_TIMEOUT_GRE_UNREPLIED]	= { .type = NLA_U32 },
 	[CTA_TIMEOUT_GRE_REPLIED]	= { .type = NLA_U32 },
 };
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 
 static int gre_init_net(struct net *net, u_int16_t proto)
 {
@@ -367,7 +367,7 @@ static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = {
 	.nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
 	.nla_policy	 = nf_ct_port_nla_policy,
 #endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 	.ctnl_timeout    = {
 		.nlattr_to_obj	= gre_timeout_nlattr_to_obj,
 		.obj_to_nlattr	= gre_timeout_obj_to_nlattr,
@@ -375,7 +375,7 @@ static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = {
 		.obj_size	= sizeof(unsigned int) * GRE_CT_MAX,
 		.nla_policy	= gre_timeout_nla_policy,
 	},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 	.net_id		= &proto_gre_net_id,
 	.init_net	= gre_init_net,
 };
diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c
index 036670b38282..43c7e1a217b9 100644
--- a/net/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/netfilter/nf_conntrack_proto_icmp.c
@@ -273,7 +273,7 @@ static unsigned int icmp_nlattr_tuple_size(void)
 }
 #endif
 
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_cttimeout.h>
@@ -313,7 +313,7 @@ static const struct nla_policy
 icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = {
 	[CTA_TIMEOUT_ICMP_TIMEOUT]	= { .type = NLA_U32 },
 };
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 
 #ifdef CONFIG_SYSCTL
 static struct ctl_table icmp_sysctl_table[] = {
@@ -374,7 +374,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp =
 	.nlattr_to_tuple	= icmp_nlattr_to_tuple,
 	.nla_policy		= icmp_nla_policy,
 #endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 	.ctnl_timeout		= {
 		.nlattr_to_obj	= icmp_timeout_nlattr_to_obj,
 		.obj_to_nlattr	= icmp_timeout_obj_to_nlattr,
@@ -382,7 +382,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp =
 		.obj_size	= sizeof(unsigned int),
 		.nla_policy	= icmp_timeout_nla_policy,
 	},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 	.init_net		= icmp_init_net,
 	.get_net_proto		= icmp_get_net_proto,
 };
diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c
index bed07b998a10..97e40f77d678 100644
--- a/net/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/netfilter/nf_conntrack_proto_icmpv6.c
@@ -274,7 +274,7 @@ static unsigned int icmpv6_nlattr_tuple_size(void)
 }
 #endif
 
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_cttimeout.h>
@@ -314,7 +314,7 @@ static const struct nla_policy
 icmpv6_timeout_nla_policy[CTA_TIMEOUT_ICMPV6_MAX+1] = {
 	[CTA_TIMEOUT_ICMPV6_TIMEOUT]	= { .type = NLA_U32 },
 };
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 
 #ifdef CONFIG_SYSCTL
 static struct ctl_table icmpv6_sysctl_table[] = {
@@ -373,7 +373,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 =
 	.nlattr_to_tuple	= icmpv6_nlattr_to_tuple,
 	.nla_policy		= icmpv6_nla_policy,
 #endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 	.ctnl_timeout		= {
 		.nlattr_to_obj	= icmpv6_timeout_nlattr_to_obj,
 		.obj_to_nlattr	= icmpv6_timeout_obj_to_nlattr,
@@ -381,7 +381,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 =
 		.obj_size	= sizeof(unsigned int),
 		.nla_policy	= icmpv6_timeout_nla_policy,
 	},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 	.init_net		= icmpv6_init_net,
 	.get_net_proto		= icmpv6_get_net_proto,
 };
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 5eddfd32b852..e4d738d34cd0 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -591,7 +591,7 @@ static int nlattr_to_sctp(struct nlattr *cda[], struct nf_conn *ct)
 }
 #endif
 
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_cttimeout.h>
@@ -646,7 +646,7 @@ sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = {
 	[CTA_TIMEOUT_SCTP_HEARTBEAT_SENT]	= { .type = NLA_U32 },
 	[CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED]	= { .type = NLA_U32 },
 };
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 
 
 #ifdef CONFIG_SYSCTL
@@ -780,7 +780,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = {
 	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
 	.nla_policy		= nf_ct_port_nla_policy,
 #endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 	.ctnl_timeout		= {
 		.nlattr_to_obj	= sctp_timeout_nlattr_to_obj,
 		.obj_to_nlattr	= sctp_timeout_obj_to_nlattr,
@@ -788,7 +788,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = {
 		.obj_size	= sizeof(unsigned int) * SCTP_CONNTRACK_MAX,
 		.nla_policy	= sctp_timeout_nla_policy,
 	},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 	.init_net		= sctp_init_net,
 	.get_net_proto		= sctp_get_net_proto,
 };
@@ -813,7 +813,8 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = {
 	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
 	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
 	.nla_policy		= nf_ct_port_nla_policy,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 	.ctnl_timeout		= {
 		.nlattr_to_obj	= sctp_timeout_nlattr_to_obj,
 		.obj_to_nlattr	= sctp_timeout_obj_to_nlattr,
@@ -821,8 +822,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = {
 		.obj_size	= sizeof(unsigned int) * SCTP_CONNTRACK_MAX,
 		.nla_policy	= sctp_timeout_nla_policy,
 	},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-#endif
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 	.init_net		= sctp_init_net,
 	.get_net_proto		= sctp_get_net_proto,
 };
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 3e2dc56a96c3..b4bdf9eda7b7 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1279,7 +1279,7 @@ static unsigned int tcp_nlattr_tuple_size(void)
 }
 #endif
 
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_cttimeout.h>
@@ -1394,7 +1394,7 @@ static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = {
 	[CTA_TIMEOUT_TCP_RETRANS]	= { .type = NLA_U32 },
 	[CTA_TIMEOUT_TCP_UNACK]		= { .type = NLA_U32 },
 };
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 
 #ifdef CONFIG_SYSCTL
 static struct ctl_table tcp_sysctl_table[] = {
@@ -1558,7 +1558,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 =
 	.nlattr_size		= TCP_NLATTR_SIZE,
 	.nla_policy		= nf_ct_port_nla_policy,
 #endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 	.ctnl_timeout		= {
 		.nlattr_to_obj	= tcp_timeout_nlattr_to_obj,
 		.obj_to_nlattr	= tcp_timeout_obj_to_nlattr,
@@ -1567,7 +1567,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 =
 					TCP_CONNTRACK_TIMEOUT_MAX,
 		.nla_policy	= tcp_timeout_nla_policy,
 	},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 	.init_net		= tcp_init_net,
 	.get_net_proto		= tcp_get_net_proto,
 };
@@ -1593,7 +1593,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 =
 	.nlattr_tuple_size	= tcp_nlattr_tuple_size,
 	.nla_policy		= nf_ct_port_nla_policy,
 #endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 	.ctnl_timeout		= {
 		.nlattr_to_obj	= tcp_timeout_nlattr_to_obj,
 		.obj_to_nlattr	= tcp_timeout_obj_to_nlattr,
@@ -1602,7 +1602,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 =
 					TCP_CONNTRACK_TIMEOUT_MAX,
 		.nla_policy	= tcp_timeout_nla_policy,
 	},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 	.init_net		= tcp_init_net,
 	.get_net_proto		= tcp_get_net_proto,
 };
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 9272a2c525a8..3065fb8ef91b 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -171,7 +171,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
 	return NF_ACCEPT;
 }
 
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_cttimeout.h>
@@ -221,7 +221,7 @@ udp_timeout_nla_policy[CTA_TIMEOUT_UDP_MAX+1] = {
        [CTA_TIMEOUT_UDP_UNREPLIED]	= { .type = NLA_U32 },
        [CTA_TIMEOUT_UDP_REPLIED]	= { .type = NLA_U32 },
 };
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 
 #ifdef CONFIG_SYSCTL
 static struct ctl_table udp_sysctl_table[] = {
@@ -292,7 +292,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 =
 	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
 	.nla_policy		= nf_ct_port_nla_policy,
 #endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 	.ctnl_timeout		= {
 		.nlattr_to_obj	= udp_timeout_nlattr_to_obj,
 		.obj_to_nlattr	= udp_timeout_obj_to_nlattr,
@@ -300,7 +300,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 =
 		.obj_size	= sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
 		.nla_policy	= udp_timeout_nla_policy,
 	},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 	.init_net		= udp_init_net,
 	.get_net_proto		= udp_get_net_proto,
 };
@@ -321,7 +321,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 =
 	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
 	.nla_policy		= nf_ct_port_nla_policy,
 #endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 	.ctnl_timeout		= {
 		.nlattr_to_obj	= udp_timeout_nlattr_to_obj,
 		.obj_to_nlattr	= udp_timeout_obj_to_nlattr,
@@ -329,7 +329,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 =
 		.obj_size	= sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
 		.nla_policy	= udp_timeout_nla_policy,
 	},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 	.init_net		= udp_init_net,
 	.get_net_proto		= udp_get_net_proto,
 };
@@ -350,7 +350,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 =
 	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
 	.nla_policy		= nf_ct_port_nla_policy,
 #endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 	.ctnl_timeout		= {
 		.nlattr_to_obj	= udp_timeout_nlattr_to_obj,
 		.obj_to_nlattr	= udp_timeout_obj_to_nlattr,
@@ -358,7 +358,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 =
 		.obj_size	= sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
 		.nla_policy	= udp_timeout_nla_policy,
 	},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 	.init_net		= udp_init_net,
 	.get_net_proto		= udp_get_net_proto,
 };
@@ -379,7 +379,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 =
 	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
 	.nla_policy		= nf_ct_port_nla_policy,
 #endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 	.ctnl_timeout		= {
 		.nlattr_to_obj	= udp_timeout_nlattr_to_obj,
 		.obj_to_nlattr	= udp_timeout_obj_to_nlattr,
@@ -387,7 +387,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 =
 		.obj_size	= sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
 		.nla_policy	= udp_timeout_nla_policy,
 	},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 	.init_net		= udp_init_net,
 	.get_net_proto		= udp_get_net_proto,
 };
-- 
2.11.0

^ permalink raw reply related

* [PATCH 11/12] netfilter: nfnetlink_queue: Solve the NFQUEUE/conntrack clash for NF_REPEAT
From: Pablo Neira Ayuso @ 2018-09-11  0:20 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180911002044.9100-1-pablo@netfilter.org>

From: Michal 'vorner' Vaner <michal.vaner@avast.com>

NF_REPEAT places the packet at the beginning of the iptables chain
instead of accepting or rejecting it right away. The packet however will
reach the end of the chain and continue to the end of iptables
eventually, so it needs the same handling as NF_ACCEPT and NF_DROP.

Fixes: 368982cd7d1b ("netfilter: nfnetlink_queue: resolve clash for unconfirmed conntracks")
Signed-off-by: Michal 'vorner' Vaner <michal.vaner@avast.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink_queue.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index ea4ba551abb2..d33094f4ec41 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -233,6 +233,7 @@ static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
 	int err;
 
 	if (verdict == NF_ACCEPT ||
+	    verdict == NF_REPEAT ||
 	    verdict == NF_STOP) {
 		rcu_read_lock();
 		ct_hook = rcu_dereference(nf_ct_hook);
-- 
2.11.0

^ permalink raw reply related

* [PATCH 12/12] netfilter: xt_hashlimit: use s->file instead of s->private
From: Pablo Neira Ayuso @ 2018-09-11  0:20 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180911002044.9100-1-pablo@netfilter.org>

From: Cong Wang <xiyou.wangcong@gmail.com>

After switching to the new procfs API, the private pointer is supposed
to be retrieved from PDE_DATA(file_inode(s->file)); s->private is no
longer used.

Fixes: 1cd671827290 ("netfilter/x_tables: switch to proc_create_seq_private")
Reported-by: Sami Farin <hvtaifwkbgefbaei@gmail.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Tested-by: Sami Farin <hvtaifwkbgefbaei@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_hashlimit.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 9b16402f29af..3e7d259e5d8d 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -1057,7 +1057,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 static void *dl_seq_start(struct seq_file *s, loff_t *pos)
 	__acquires(htable->lock)
 {
-	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
+	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
 	unsigned int *bucket;
 
 	spin_lock_bh(&htable->lock);
@@ -1074,7 +1074,7 @@ static void *dl_seq_start(struct seq_file *s, loff_t *pos)
 
 static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos)
 {
-	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
+	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
 	unsigned int *bucket = v;
 
 	*pos = ++(*bucket);
@@ -1088,7 +1088,7 @@ static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos)
 static void dl_seq_stop(struct seq_file *s, void *v)
 	__releases(htable->lock)
 {
-	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
+	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
 	unsigned int *bucket = v;
 
 	if (!IS_ERR(bucket))
@@ -1130,7 +1130,7 @@ static void dl_seq_print(struct dsthash_ent *ent, u_int8_t family,
 static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family,
 			       struct seq_file *s)
 {
-	struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->private));
+	struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file));
 
 	spin_lock(&ent->lock);
 	/* recalculate to show accurate numbers */
@@ -1145,7 +1145,7 @@ static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family,
 static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
 			       struct seq_file *s)
 {
-	struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->private));
+	struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file));
 
 	spin_lock(&ent->lock);
 	/* recalculate to show accurate numbers */
@@ -1160,7 +1160,7 @@ static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
 static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
 			    struct seq_file *s)
 {
-	struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->private));
+	struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file));
 
 	spin_lock(&ent->lock);
 	/* recalculate to show accurate numbers */
@@ -1174,7 +1174,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
 
 static int dl_seq_show_v2(struct seq_file *s, void *v)
 {
-	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
+	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
 	unsigned int *bucket = (unsigned int *)v;
 	struct dsthash_ent *ent;
 
@@ -1188,7 +1188,7 @@ static int dl_seq_show_v2(struct seq_file *s, void *v)
 
 static int dl_seq_show_v1(struct seq_file *s, void *v)
 {
-	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
+	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
 	unsigned int *bucket = v;
 	struct dsthash_ent *ent;
 
@@ -1202,7 +1202,7 @@ static int dl_seq_show_v1(struct seq_file *s, void *v)
 
 static int dl_seq_show(struct seq_file *s, void *v)
 {
-	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
+	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
 	unsigned int *bucket = v;
 	struct dsthash_ent *ent;
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next] scsi: libcxgbi: fib6_info reference in rt6_info is rcu protected
From: dsahern @ 2018-09-11  0:21 UTC (permalink / raw)
  To: netdev; +Cc: David Ahern

From: David Ahern <dsahern@gmail.com>

The fib6_info reference in rt6_info is rcu protected. Add a helper
to extract prefsrc from it and update cxgbi_check_route6 to use it.

Fixes: 0153167aebd0 ("net/ipv6: Remove rt6i_prefsrc")
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: David Ahern <dsahern@gmail.com>
---
 drivers/scsi/cxgbi/libcxgbi.c |  5 ++---
 include/net/ip6_fib.h         | 19 +++++++++++++++++++
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c
index 6b3ea50c594e..75f876409fb9 100644
--- a/drivers/scsi/cxgbi/libcxgbi.c
+++ b/drivers/scsi/cxgbi/libcxgbi.c
@@ -784,7 +784,8 @@ cxgbi_check_route6(struct sockaddr *dst_addr, int ifindex)
 	csk->mtu = mtu;
 	csk->dst = dst;
 
-	if (!rt->from || ipv6_addr_any(&rt->from->fib6_prefsrc.addr)) {
+	rt6_get_prefsrc(rt, &pref_saddr);
+	if (ipv6_addr_any(&pref_saddr)) {
 		struct inet6_dev *idev = ip6_dst_idev((struct dst_entry *)rt);
 
 		err = ipv6_dev_get_saddr(&init_net, idev ? idev->dev : NULL,
@@ -794,8 +795,6 @@ cxgbi_check_route6(struct sockaddr *dst_addr, int ifindex)
 				&daddr6->sin6_addr);
 			goto rel_rt;
 		}
-	} else {
-		pref_saddr = rt->from->fib6_prefsrc.addr;
 	}
 
 	csk->csk_family = AF_INET6;
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index c7496663f99a..f06e968f1992 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -412,6 +412,25 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
 	     struct nl_info *info, struct netlink_ext_ack *extack);
 int fib6_del(struct fib6_info *rt, struct nl_info *info);
 
+static inline
+void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr)
+{
+	const struct fib6_info *from;
+
+	rcu_read_lock();
+
+	from = rcu_dereference(rt->from);
+	if (from) {
+		*addr = from->fib6_prefsrc.addr;
+	} else {
+		struct in6_addr in6_zero = {};
+
+		*addr = in6_zero;
+	}
+
+	rcu_read_unlock();
+}
+
 static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i)
 {
 	return f6i->fib6_nh.nh_dev;
-- 
2.11.0

^ permalink raw reply related

* Re: [Patch net] rds: mark bound socket with SOCK_RCU_FREE
From: Sowmini Varadhan @ 2018-09-11  0:24 UTC (permalink / raw)
  To: Cong Wang; +Cc: Santosh Shilimkar, Linux Kernel Network Developers, rds-devel
In-Reply-To: <CAM_iQpXcydKdTyT25F5Bbk7y9BMJLTKMT42o3J0bbuodjXjetA@mail.gmail.com>

On (09/10/18 17:16), Cong Wang wrote:
> >
> > On (09/10/18 16:51), Cong Wang wrote:
> > >
> > >         __rds_create_bind_key(key, addr, port, scope_id);
> > > -       rs = rhashtable_lookup_fast(&bind_hash_table, key, ht_parms);
> > > +       rcu_read_lock();
> > > +       rs = rhashtable_lookup(&bind_hash_table, key, ht_parms);
> > >         if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
> > >                 rds_sock_addref(rs);
> > >         else
> > >                 rs = NULL;
> > > +       rcu_read_unlock();
> >
> > aiui, the rcu_read lock/unlock is only useful if the write
> > side doing destructive operations  does something to make sure readers
> > are done before doing the destructive operation. AFAIK, that does
> > not exist for rds socket management today
> 
> That is exactly why we need it here, right?

Maybe I am confused, what exactly is the patch you are proposing?

Does it have the SOCK_RCU_FREE change? 
Does it have the rcu_read_lock you have above? 
Where is the call_rcu?

> Hmm, so you are saying synchronize_rcu() is kinda more correct
> than call_rcu()??  


I'm not saying that, I'm asking "what exactly is the patch
you are proposing?" The only one on record is 
   http://patchwork.ozlabs.org/patch/968282/
which does not have either synchronize_rcu or call_rcu.

> I never hear this before, would like to know why.

Please post precise patches first.

Thanks.

^ permalink raw reply

* Re: [Patch net] rds: mark bound socket with SOCK_RCU_FREE
From: Santosh Shilimkar @ 2018-09-11  0:26 UTC (permalink / raw)
  To: Cong Wang, Sowmini Varadhan; +Cc: Linux Kernel Network Developers, rds-devel
In-Reply-To: <CAM_iQpXcydKdTyT25F5Bbk7y9BMJLTKMT42o3J0bbuodjXjetA@mail.gmail.com>

On 9/10/2018 5:16 PM, Cong Wang wrote:
> On Mon, Sep 10, 2018 at 5:04 PM Sowmini Varadhan
> <sowmini.varadhan@oracle.com> wrote:
>>
>> On (09/10/18 16:51), Cong Wang wrote:
>>>
>>>          __rds_create_bind_key(key, addr, port, scope_id);
>>> -       rs = rhashtable_lookup_fast(&bind_hash_table, key, ht_parms);
>>> +       rcu_read_lock();
>>> +       rs = rhashtable_lookup(&bind_hash_table, key, ht_parms);
>>>          if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
>>>                  rds_sock_addref(rs);
>>>          else
>>>                  rs = NULL;
>>> +       rcu_read_unlock();
>>
>> aiui, the rcu_read lock/unlock is only useful if the write
>> side doing destructive operations  does something to make sure readers
>> are done before doing the destructive operation. AFAIK, that does
>> not exist for rds socket management today
> 
> That is exactly why we need it here, right?
> 
> As you mentioned previously, the sock could be freed after
> rhashtable_lookup_fast() but before rds_sock_addref(), extending
> the RCU read section after rds_sock_addref() exactly solves
> the problem here.
>
That's the case really.

> Or is there any other destructive problem you didn't mention?
> Mind to be specific?
> 
> 
>>
>>
>>> Although sock release path is not a very hot path, but blocking
>>> it isn't a good idea either, especially when you can use call_rcu(),
>>> which has the same effect.
>>>
>>> I don't see any reason we should prefer synchronize_rcu() here.
>>
>> Usually correctness (making sure all readers are done, before nuking a
>> data structure) is a little bit more important than performance, aka
>> "safety before speed" is what I've always been taught.
>>
> 
> Hmm, so you are saying synchronize_rcu() is kinda more correct
> than call_rcu()?? I never hear this before, would like to know why.
> 
> To my best knowledge, the only difference between them is the context,
> one is blocking, the other is non-blocking. Their correctness must be
> equivalent.
> 
We have burned our hands with blocking synchronize_rcu() and that was
actually the main reason we moved to rw locks from rcu to localise
the cost. call_rcu() should be just fine as long as it plugs the hole.
I don't want to add blocking in this path since this slows
down the connection shutdown path and at times leads to soft dumps.

Would you mind posting an updated patch, please, with call_rcu and the
above extended RCU grace period with rcu_read_lock? Thanks !!

Regards,
Santosh

Regards,
Santosh

^ permalink raw reply

* Re: INFO: rcu detected stall in vprintk_func
From: Dmitry Vyukov @ 2018-09-11  5:28 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: syzbot, LKML, Petr Mladek, Steven Rostedt, Sergey Senozhatsky,
	syzkaller-bugs, Samuel Ortiz, David Miller, linux-wireless,
	netdev
In-Reply-To: <20180402020413.GD3795@jagdpanzerIV>

On Mon, Apr 2, 2018 at 4:04 AM, Sergey Senozhatsky
<sergey.senozhatsky.work@gmail.com> wrote:
> On (04/02/18 10:54), Sergey Senozhatsky wrote:
>> > > If you forward the report, please keep this part and the footer.
>> > >
>> > > llcp: nfc_llcp_send_ui_frame: Could not allocate PDU
>> > > llcp: nfc_llcp_send_ui_frame: Could not allocate PDU
>> > > llcp: nfc_llcp_send_ui_frame: Could not allocate PDU
>> > > llcp: nfc_llcp_send_ui_frame: Could not allocate PDU
>
> [..]
>
>> diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c
>> index ef4026a23e80..a309a27581da 100644
>> --- a/net/nfc/llcp_core.c
>> +++ b/net/nfc/llcp_core.c
>> @@ -1386,7 +1386,7 @@ static void nfc_llcp_recv_agf(struct nfc_llcp_local *local, struct sk_buff *skb)
>>
>>               new_skb = nfc_alloc_recv_skb(pdu_len, GFP_KERNEL);
>>               if (new_skb == NULL) {
>> -                     pr_err("Could not allocate PDU\n");
>> +                     pr_err_ratelimited("Could not allocate PDU\n");
>>                       return;
>>               }
>
> And of course I ended up patching the wrong function...
> What I actually meant was:
>
> ---
>
> diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c
> index 2ceefa183cee..2f3becb709b8 100644
> --- a/net/nfc/llcp_commands.c
> +++ b/net/nfc/llcp_commands.c
> @@ -755,7 +755,7 @@ int nfc_llcp_send_ui_frame(struct nfc_llcp_sock *sock, u8 ssap, u8 dsap,
>                 pdu = nfc_alloc_send_skb(sock->dev, &sock->sk, MSG_DONTWAIT,
>                                          frag_len + LLCP_HEADER_SIZE, &err);
>                 if (pdu == NULL) {
> -                       pr_err("Could not allocate PDU\n");
> +                       pr_err_ratelimited("Could not allocate PDU\n");
>                         continue;
>                 }
>


This is a duplicate of:

https://syzkaller.appspot.com/bug?id=ff33bd26f9147a02c60039098b063219df1ac131

#syz dup: INFO: rcu detected stall in llcp_sock_sendmsg

^ permalink raw reply

* Re: unexpected GRO/veth behavior
From: Toshiaki Makita @ 2018-09-11  0:33 UTC (permalink / raw)
  To: Eric Dumazet, Paolo Abeni, netdev
In-Reply-To: <f84e9363-bdc0-5e41-ca1a-99f612fcc708@gmail.com>

On 2018/09/10 23:56, Eric Dumazet wrote:
> On 09/10/2018 07:44 AM, Paolo Abeni wrote:
>> hi all,
>>
>> while testing some local patches I observed that the TCP tput in the
>> following scenario:
>>
>> # the following enable napi on veth0, so that we can trigger the
>> # GRO path with namespaces
>> ip netns add test
>> ip link add type veth
>> ip link set dev veth0 netns test
>> ip -n test link set lo up
>> ip -n test link set veth0 up
>> ip -n test addr add dev veth0 172.16.1.2/24
>> ip link set dev veth1 up
>> ip addr add dev veth1 172.16.1.1/24
>> IDX=`ip netns exec test cat /sys/class/net/veth0/ifindex`
>>
>> # 'xdp_pass' is a NO-OP XDP program that simply return XDP_PASS
>> ip netns exec test ./xdp_pass $IDX &
>> taskset 0x2 ip netns exec test iperf3 -s -i 60 &
>> taskset 0x1 iperf3 -c 172.16.1.2 -t 60 -i 60
>>
>> is quite lower than expected (~800Mbps). 'perf' shows a weird topmost 
>> offender:
>>
> 
> 
> But... why GRO would even be needed in this scenario ?
> 
> GRO is really meant for physical devices, having to mess with skb->sk adds extra cost
> in this already heavy cost engine.
> 
> Virtual devices should already be fed with TSO packets.

Because XDP does not have SG feature (GRO path in veth is used only when
XDP is enabled).

I have tested configuration like this:

NIC ---(XDP_REDIRECT)---> veth===veth (XDP_PASS)

GRO seems to work and improves TCP throughput in this case.


Now I noticed I did not test:

netperf -> veth===veth (XDP_PASS) -> netserver

which I think is the case where Paolo faces a problem.

I think this is not a case where XDP can improve performance. I think I can
disable GRO for packets with skb->sk != NULL in veth.

-- 
Toshiaki Makita

^ permalink raw reply

* Re: [Patch net] rds: mark bound socket with SOCK_RCU_FREE
From: Cong Wang @ 2018-09-11  0:39 UTC (permalink / raw)
  To: Sowmini Varadhan
  Cc: Santosh Shilimkar, Linux Kernel Network Developers, rds-devel
In-Reply-To: <20180911002423.GL4668@oracle.com>

On Mon, Sep 10, 2018 at 5:24 PM Sowmini Varadhan
<sowmini.varadhan@oracle.com> wrote:
>
> On (09/10/18 17:16), Cong Wang wrote:
> > >
> > > On (09/10/18 16:51), Cong Wang wrote:
> > > >
> > > >         __rds_create_bind_key(key, addr, port, scope_id);
> > > > -       rs = rhashtable_lookup_fast(&bind_hash_table, key, ht_parms);
> > > > +       rcu_read_lock();
> > > > +       rs = rhashtable_lookup(&bind_hash_table, key, ht_parms);
> > > >         if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
> > > >                 rds_sock_addref(rs);
> > > >         else
> > > >                 rs = NULL;
> > > > +       rcu_read_unlock();
> > >
> > > aiui, the rcu_read lock/unlock is only useful if the write
> > > side doing destructive operations  does something to make sure readers
> > > are done before doing the destructive opertion. AFAIK, that does
> > > not exist for rds socket management today
> >
> > That is exactly why we need it here, right?
>
> Maybe I am confused, what exactly is the patch you are proposing?
>
> Does it have the SOCK_RCU_FREE change?

Yes, that patch is obviously on top of this patch.


> Does it have the rcu_read_lock you have above?
> Where is the call_rcu?

Sure, as it is on top of this patch, the call_rcu() is
here:

void sk_destruct(struct sock *sk)
{
        if (sock_flag(sk, SOCK_RCU_FREE))
                call_rcu(&sk->sk_rcu, __sk_destruct);
        else
                __sk_destruct(&sk->sk_rcu);
}


>
> > Hmm, so you are saying synchronize_rcu() is kinda more correct
> > than call_rcu()??
>
>
> I'm not saying that, I'm asking "what exactly is the patch
> you are proposing?" The only one on record is
>    http://patchwork.ozlabs.org/patch/968282/
> which does not have either synchronize_rcu or call_rcu.

It is very obviously on top of this patch ($subject).


>
> > I never hear this before, would like to know why.
>
> Please post precise patches first.

Already showed you precisely what it is, just on top
of this one.

^ permalink raw reply

* Re: Re: [PATCH net-next v2 0/5] virtio: support packed ring
From: Tiwei Bie @ 2018-09-11  5:37 UTC (permalink / raw)
  To: Jason Wang
  Cc: Michael S. Tsirkin, virtualization, linux-kernel, netdev,
	virtio-dev, wexu, jfreimann
In-Reply-To: <f5ef0393-cf02-0795-a51a-8783ee300037@redhat.com>

On Mon, Sep 10, 2018 at 11:33:17AM +0800, Jason Wang wrote:
> On 2018年09月10日 11:00, Tiwei Bie wrote:
> > On Fri, Sep 07, 2018 at 09:00:49AM -0400, Michael S. Tsirkin wrote:
> > > On Fri, Sep 07, 2018 at 09:22:25AM +0800, Tiwei Bie wrote:
> > > > On Mon, Aug 27, 2018 at 05:00:40PM +0300, Michael S. Tsirkin wrote:
> > > > > Are there still plans to test the performance with vost pmd?
> > > > > vhost doesn't seem to show a performance gain ...
> > > > > 
> > > > I tried some performance tests with vhost PMD. In guest, the
> > > > XDP program will return XDP_DROP directly. And in host, testpmd
> > > > will do txonly fwd.
> > > > 
> > > > When burst size is 1 and packet size is 64 in testpmd and
> > > > testpmd needs to iterate 5 Tx queues (but only the first two
> > > > queues are enabled) to prepare and inject packets, I got ~12%
> > > > performance boost (5.7Mpps -> 6.4Mpps). And if the vhost PMD
> > > > is faster (e.g. just need to iterate the first two queues to
> > > > prepare and inject packets), then I got similar performance
> > > > for both rings (~9.9Mpps) (packed ring's performance can be
> > > > lower, because it's more complicated in driver.)
> > > > 
> > > > I think packed ring makes vhost PMD faster, but it doesn't make
> > > > the driver faster. In packed ring, the ring is simplified, and
> > > > the handling of the ring in vhost (device) is also simplified,
> > > > but things are not simplified in driver, e.g. although there is
> > > > no desc table in the virtqueue anymore, driver still needs to
> > > > maintain a private desc state table (which is still managed as
> > > > a list in this patch set) to support the out-of-order desc
> > > > processing in vhost (device).
> > > > 
> > > > I think this patch set is mainly to make the driver have a full
> > > > functional support for the packed ring, which makes it possible
> > > > to leverage the packed ring feature in vhost (device). But I'm
> > > > not sure whether there is any other better idea, I'd like to
> > > > hear your thoughts. Thanks!
> > > Just this: Jens seems to report a nice gain with virtio and
> > > vhost pmd across the board. Try to compare virtio and
> > > virtio pmd to see what does pmd do better?
> > The virtio PMD (drivers/net/virtio) in DPDK doesn't need to share
> > the virtio ring operation code with other drivers and is highly
> > optimized for network. E.g. in Rx, the Rx burst function won't
> > chain descs. So the ID management for the Rx ring can be quite
> > simple and straightforward, we just need to initialize these IDs
> > when initializing the ring and don't need to change these IDs
> > in data path anymore (the mergable Rx code in that patch set
> > assumes the descs will be written back in order, which should be
> > fixed. I.e., the ID in the desc should be used to index vq->descx[]).
> > The Tx code in that patch set also assumes the descs will be
> > written back by device in order, which should be fixed.
> 
> Yes it is. I think I've pointed it out in some early version of pmd patch.
> So I suspect part (or all) of the boost may come from in order feature.
> 
> > 
> > But in kernel virtio driver, the virtio_ring.c is very generic.
> > The enqueue (virtqueue_add()) and dequeue (virtqueue_get_buf_ctx())
> > functions need to support all the virtio devices and should be
> > able to handle all the possible cases that may happen. So although
> > the packed ring can be very efficient in some cases, currently
> > the room to optimize the performance in kernel's virtio_ring.c
> > isn't that much. If we want to take the fully advantage of the
> > packed ring's efficiency, we need some further e.g. API changes
> > in virtio_ring.c, which shouldn't be part of this patch set.
> 
> Could you please share more thoughts on this e.g how to improve the API?
> Notice since the API is shared by both split ring and packed ring, it may
> improve the performance of split ring as well. One can easily imagine a
> batching API, but it does not have many real users now, the only case is the
> XDP transmission which can accept an array of XDP frames.

I don't have detailed thoughts on this yet. But kernel's
virtio_ring.c is quite generic compared with what we did
in virtio PMD.

> 
> > So
> > I still think this patch set is mainly to make the kernel virtio
> > driver to have a full functional support of the packed ring, and
> > we can't expect impressive performance boost with it.
> 
> We can only gain when virtio ring layout is the bottleneck. If there're
> bottlenecks elsewhere, we probably won't see any increasing in the numbers.
> Vhost-net is an example, and lots of optimizations have proved that virtio
> ring is not the main bottleneck for the current codes. I suspect it also the
> case of virtio driver. Did perf tell us any interesting things in virtio
> driver?
> 
> Thanks
> 
> > 
> > > 
> > > > > On Wed, Jul 11, 2018 at 10:27:06AM +0800, Tiwei Bie wrote:
> > > > > > Hello everyone,
> > > > > > 
> > > > > > This patch set implements packed ring support in virtio driver.
> > > > > > 
> > > > > > Some functional tests have been done with Jason's
> > > > > > packed ring implementation in vhost:
> > > > > > 
> > > > > > https://lkml.org/lkml/2018/7/3/33
> > > > > > 
> > > > > > Both of ping and netperf worked as expected.
> > > > > > 
> > > > > > v1 -> v2:
> > > > > > - Use READ_ONCE() to read event off_wrap and flags together (Jason);
> > > > > > - Add comments related to ccw (Jason);
> > > > > > 
> > > > > > RFC (v6) -> v1:
> > > > > > - Avoid extra virtio_wmb() in virtqueue_enable_cb_delayed_packed()
> > > > > >    when event idx is off (Jason);
> > > > > > - Fix bufs calculation in virtqueue_enable_cb_delayed_packed() (Jason);
> > > > > > - Test the state of the desc at used_idx instead of last_used_idx
> > > > > >    in virtqueue_enable_cb_delayed_packed() (Jason);
> > > > > > - Save wrap counter (as part of queue state) in the return value
> > > > > >    of virtqueue_enable_cb_prepare_packed();
> > > > > > - Refine the packed ring definitions in uapi;
> > > > > > - Rebase on the net-next tree;
> > > > > > 
> > > > > > RFC v5 -> RFC v6:
> > > > > > - Avoid tracking addr/len/flags when DMA API isn't used (MST/Jason);
> > > > > > - Define wrap counter as bool (Jason);
> > > > > > - Use ALIGN() in vring_init_packed() (Jason);
> > > > > > - Avoid using pointer to track `next` in detach_buf_packed() (Jason);
> > > > > > - Add comments for barriers (Jason);
> > > > > > - Don't enable RING_PACKED on ccw for now (noticed by Jason);
> > > > > > - Refine the memory barrier in virtqueue_poll();
> > > > > > - Add a missing memory barrier in virtqueue_enable_cb_delayed_packed();
> > > > > > - Remove the hacks in virtqueue_enable_cb_prepare_packed();
> > > > > > 
> > > > > > RFC v4 -> RFC v5:
> > > > > > - Save DMA addr, etc in desc state (Jason);
> > > > > > - Track used wrap counter;
> > > > > > 
> > > > > > RFC v3 -> RFC v4:
> > > > > > - Make ID allocation support out-of-order (Jason);
> > > > > > - Various fixes for EVENT_IDX support;
> > > > > > 
> > > > > > RFC v2 -> RFC v3:
> > > > > > - Split into small patches (Jason);
> > > > > > - Add helper virtqueue_use_indirect() (Jason);
> > > > > > - Just set id for the last descriptor of a list (Jason);
> > > > > > - Calculate the prev in virtqueue_add_packed() (Jason);
> > > > > > - Fix/improve desc suppression code (Jason/MST);
> > > > > > - Refine the code layout for XXX_split/packed and wrappers (MST);
> > > > > > - Fix the comments and API in uapi (MST);
> > > > > > - Remove the BUG_ON() for indirect (Jason);
> > > > > > - Some other refinements and bug fixes;
> > > > > > 
> > > > > > RFC v1 -> RFC v2:
> > > > > > - Add indirect descriptor support - compile test only;
> > > > > > - Add event suppression supprt - compile test only;
> > > > > > - Move vring_packed_init() out of uapi (Jason, MST);
> > > > > > - Merge two loops into one in virtqueue_add_packed() (Jason);
> > > > > > - Split vring_unmap_one() for packed ring and split ring (Jason);
> > > > > > - Avoid using '%' operator (Jason);
> > > > > > - Rename free_head -> next_avail_idx (Jason);
> > > > > > - Add comments for virtio_wmb() in virtqueue_add_packed() (Jason);
> > > > > > - Some other refinements and bug fixes;
> > > > > > 
> > > > > > Thanks!
> > > > > > 
> > > > > > Tiwei Bie (5):
> > > > > >    virtio: add packed ring definitions
> > > > > >    virtio_ring: support creating packed ring
> > > > > >    virtio_ring: add packed ring support
> > > > > >    virtio_ring: add event idx support in packed ring
> > > > > >    virtio_ring: enable packed ring
> > > > > > 
> > > > > >   drivers/s390/virtio/virtio_ccw.c   |   14 +
> > > > > >   drivers/virtio/virtio_ring.c       | 1365 ++++++++++++++++++++++------
> > > > > >   include/linux/virtio_ring.h        |    8 +-
> > > > > >   include/uapi/linux/virtio_config.h |    3 +
> > > > > >   include/uapi/linux/virtio_ring.h   |   43 +
> > > > > >   5 files changed, 1157 insertions(+), 276 deletions(-)
> > > > > > 
> > > > > > -- 
> > > > > > 2.18.0
> > > > > ---------------------------------------------------------------------
> > > > > To unsubscribe, e-mail: virtio-dev-unsubscribe@lists.oasis-open.org
> > > > > For additional commands, e-mail: virtio-dev-help@lists.oasis-open.org
> > > > > 
> 

^ permalink raw reply

* Re: [Patch net] rds: mark bound socket with SOCK_RCU_FREE
From: Cong Wang @ 2018-09-11  0:45 UTC (permalink / raw)
  To: Santosh Shilimkar
  Cc: Sowmini Varadhan, Linux Kernel Network Developers, rds-devel
In-Reply-To: <17b2b2b8-12ce-076e-a961-2a8bee1d021f@oracle.com>

On Mon, Sep 10, 2018 at 5:26 PM Santosh Shilimkar
<santosh.shilimkar@oracle.com> wrote:
> Would you mind posting an updated patch please with call_rcu and
> above extended RCU grace period with rcu_read_lock. Thanks !!

If you prefer to fix _two_ problems in one patch, sure.

For the record, the bug this patch fixes is NOT same with the one
in rds_find_bound(), because there is no rds_find_bound() in
the backtrace. If you want to see the full backtrace, here it is:

https://marc.info/?l=linux-netdev&m=153644444808962&w=2

This is why I believe they are two problems.

Whether fixing two problems in one patch or two patches is
merely a preference, I leave it up to you.

^ permalink raw reply

* Re: [PATCH net-next] virtio_net: ethtool tx napi configuration
From: Jason Wang @ 2018-09-11  0:45 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Network Development, David Miller, caleb.raitto,
	Michael S. Tsirkin, Jon Olson (Google Drive), Willem de Bruijn
In-Reply-To: <CAF=yD-+LnOzzUReqoSFh86X2nSoWcSEZHbvyYc9ZMxshvB3AZA@mail.gmail.com>



On 2018年09月10日 21:35, Willem de Bruijn wrote:
> On Mon, Sep 10, 2018 at 2:01 AM Jason Wang <jasowang@redhat.com> wrote:
>>
>>
>> On 2018年09月10日 06:44, Willem de Bruijn wrote:
>>> From: Willem de Bruijn <willemb@google.com>
>>>
>>> Implement ethtool .set_coalesce (-C) and .get_coalesce (-c) handlers.
>>> Interrupt moderation is currently not supported, so these accept and
>>> display the default settings of 0 usec and 1 frame.
>>>
>>> Toggle tx napi through a bit in tx-frames. So as to not interfere
>>> with possible future interrupt moderation, use bit 10, well outside
>>> the reasonable range of real interrupt moderation values.
>>>
>>> Changes are not atomic. The tx IRQ, napi BH and transmit path must
>>> be quiesced when switching modes. Only allow changing this setting
>>> when the device is down.
>> I cook a fixup, and it looks works in my setup:
>>
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index b320b6b14749..9181c3f2f832 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -2204,10 +2204,17 @@ static int virtnet_set_coalesce(struct
>> net_device *dev,
>>                   return -EINVAL;
>>
>>           if (napi_weight ^ vi->sq[0].napi.weight) {
>> -               if (dev->flags & IFF_UP)
>> -                       return -EBUSY;
>> -               for (i = 0; i < vi->max_queue_pairs; i++)
>> +               for (i = 0; i < vi->max_queue_pairs; i++) {
>> +                       struct netdev_queue *txq =
>> +                              netdev_get_tx_queue(vi->dev, i);
>> +
>> + virtnet_napi_tx_disable(&vi->sq[i].napi);
>> +                       __netif_tx_lock_bh(txq);
>>                           vi->sq[i].napi.weight = napi_weight;
>> +                       __netif_tx_unlock_bh(txq);
>> +                       virtnet_napi_tx_enable(vi, vi->sq[i].vq,
>> + &vi->sq[i].napi);
>> +               }
>>           }
>>
>>           return 0;
> Thanks! It passes my simple stress test, too. Which consists of two
> concurrent loops, one toggling the ethtool option, another running
> TCP_RR.
>
>> The only left case is the speculative tx polling in RX NAPI. I think we
>> don't need to care in this case since it was not a must for correctness.
> As long as the txq lock is held that will be a noop, anyway. The other
> concurrent action is skb_xmit_done. It looks correct to me, but need
> to think about it a bit. The tricky transition is coming out of napi without
> having >= 2 + MAX_SKB_FRAGS clean descriptors. If the queue is
> stopped it may deadlock transmission in no-napi mode.

Yes, maybe we can enable tx queue when napi weight is zero in 
virtnet_poll_tx(). Let me do more stress test on this.

>
>>> Link: https://patchwork.ozlabs.org/patch/948149/
>>> Suggested-by: Jason Wang <jasowang@redhat.com>
>>> Signed-off-by: Willem de Bruijn <willemb@google.com>
>>> ---
>>>    drivers/net/virtio_net.c | 52 ++++++++++++++++++++++++++++++++++++++++
>>>    1 file changed, 52 insertions(+)
>>>
>>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>>> index 765920905226..b320b6b14749 100644
>>> --- a/drivers/net/virtio_net.c
>>> +++ b/drivers/net/virtio_net.c
>>> @@ -66,6 +66,8 @@ DECLARE_EWMA(pkt_len, 0, 64)
>>>
>>>    #define VIRTNET_DRIVER_VERSION "1.0.0"
>>>
>>> +static const u32 ethtool_coalesce_napi_mask = (1UL << 10);
>>> +
>>>    static const unsigned long guest_offloads[] = {
>>>        VIRTIO_NET_F_GUEST_TSO4,
>>>        VIRTIO_NET_F_GUEST_TSO6,
>>> @@ -2181,6 +2183,54 @@ static int virtnet_get_link_ksettings(struct net_device *dev,
>>>        return 0;
>>>    }
>>>
>>> +static int virtnet_set_coalesce(struct net_device *dev,
>>> +                             struct ethtool_coalesce *ec)
>>> +{
>>> +     const struct ethtool_coalesce ec_default = {
>>> +             .cmd = ETHTOOL_SCOALESCE,
>>> +             .rx_max_coalesced_frames = 1,
>> I think rx part is no necessary.
> The definition of ethtool_coalesce has:
>
> "* It is illegal to set both usecs and max_frames to zero as this
>   * would cause interrupts to never be generated.  To disable
>   * coalescing, set usecs = 0 and max_frames = 1."
>
> I'd rather not diverge from this prescribed behavior unless there's
> a strong reason.

I get this.

>
> On the related point in the other thread:
>
>> Rethink about this, how about something like:
>>
>> - UINT_MAX: no tx interrupt
>> - other value: tx interrupt with possible interrupt moderation
> Okay, that will be simpler to configure.

I wonder maybe we can use ethtool_coalesce definition. E.g usecs = 0 && 
max_frame = 0 means no interrupt? Just a thought, I'm ok with either.

Thanks

^ permalink raw reply

* Re: [net-next, PATCH 2/2, v1] net: socionext: add AF_XDP support
From: Toshiaki Makita @ 2018-09-11  0:45 UTC (permalink / raw)
  To: Ilias Apalodimas
  Cc: netdev, jaswinder.singh, ard.biesheuvel, masami.hiramatsu, arnd,
	mykyta.iziumtsev, bjorn.topel, magnus.karlsson,
	Jesper Dangaard Brouer, Alexei Starovoitov, Daniel Borkmann,
	Björn Töpel
In-Reply-To: <20180910162132.GA16240@apalos>

On 2018/09/11 1:21, Ilias Apalodimas wrote:
>>> @@ -707,6 +731,26 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget)
>>>  		if (unlikely(!buf_addr))
>>>  			break;
>>>  
>>> +		if (xdp_prog) {
>>> +			xdp_result = netsec_run_xdp(desc, priv, xdp_prog,
>>> +						    pkt_len);
>>> +			if (xdp_result != NETSEC_XDP_PASS) {
>>> +				xdp_flush |= xdp_result & NETSEC_XDP_REDIR;
>>> +
>>> +				dma_unmap_single_attrs(priv->dev,
>>> +						       desc->dma_addr,
>>> +						       desc->len, DMA_TO_DEVICE,
>>> +						       DMA_ATTR_SKIP_CPU_SYNC);
>>> +
>>> +				desc->len = desc_len;
>>> +				desc->dma_addr = dma_handle;
>>> +				desc->addr = buf_addr;
>>> +				netsec_rx_fill(priv, idx, 1);
>>> +				nsetsec_adv_desc(&dring->tail);
>>> +			}
>>> +			continue;
>>
>> Continue even on XDP_PASS? Is this really correct?
>>
>> Also seems there is no handling of adjust_head/tail for XDP_PASS case.
>>
> A question on this. Should XDP related frames be allocated using 1 page
> per packet?

AFAIK there is no such constraint, e.g. i40e allocates 1 page per 2 packets.

-- 
Toshiaki Makita

^ permalink raw reply

* Re: [Patch net] rds: mark bound socket with SOCK_RCU_FREE
From: Santosh Shilimkar @ 2018-09-11  0:56 UTC (permalink / raw)
  To: Cong Wang; +Cc: Sowmini Varadhan, Linux Kernel Network Developers, rds-devel
In-Reply-To: <CAM_iQpVn4pKCLM6O2251t-PFeUGNW45yY0Bn4XC4zXqL2C=Hgw@mail.gmail.com>

On 9/10/2018 5:45 PM, Cong Wang wrote:
> On Mon, Sep 10, 2018 at 5:26 PM Santosh Shilimkar
> <santosh.shilimkar@oracle.com> wrote:
>> Would you mind posting an updated patch please with call_rcu and
>> above extended RCU grace period with rcu_read_lock. Thanks !!
> 
> If you prefer to fix _two_ problems in one patch, sure.
> 
> For the record, the bug this patch fixes is NOT same with the one
> in rds_find_bound(), because there is no rds_find_bound() in
> the backtrace. If you want to see the full backtrace, here it is:
> 
> https://marc.info/?l=linux-netdev&m=153644444808962&w=2
> 
> This is why I believe they are two problems.
> 
> Whether fixing two problems in one patch or two patches is
> merely a preference, I leave it up to you.
> 
I had a partial fix as well in past but since it wasn't covering
complete window, it was abandoned.

Since we know the other race, it would be best to address it
together and hence requested a combine patch. Thanks for help !!

Regards,
Santosh

^ permalink raw reply

* Re: [Patch net] rds: mark bound socket with SOCK_RCU_FREE
From: Cong Wang @ 2018-09-11  0:59 UTC (permalink / raw)
  To: Santosh Shilimkar
  Cc: Sowmini Varadhan, Linux Kernel Network Developers, rds-devel
In-Reply-To: <d97c6391-4ca7-e36a-09da-483b93218507@oracle.com>

On Mon, Sep 10, 2018 at 5:56 PM Santosh Shilimkar
<santosh.shilimkar@oracle.com> wrote:
>
> On 9/10/2018 5:45 PM, Cong Wang wrote:
> > On Mon, Sep 10, 2018 at 5:26 PM Santosh Shilimkar
> > <santosh.shilimkar@oracle.com> wrote:
> >> Would you mind posting an updated patch please with call_rcu and
> >> above extended RCU grace period with rcu_read_lock. Thanks !!
> >
> > If you prefer to fix _two_ problems in one patch, sure.
> >
> > For the record, the bug this patch fixes is NOT same with the one
> > in rds_find_bound(), because there is no rds_find_bound() in
> > the backtrace. If you want to see the full backtrace, here it is:
> >
> > https://marc.info/?l=linux-netdev&m=153644444808962&w=2
> >
> > This is why I believe they are two problems.
> >
> > Whether fixing two problems in one patch or two patches is
> > merely a preference, I leave it up to you.
> >
> I had a partial fix as well in past but since it wasn't covering
> complete window, it was abandoned.
>
> Since we know the other race, it would be best to address it
> together and hence requested a combine patch. Thanks for help !!

No problem! v2 is coming shortly after passing syzbot test. :)

^ permalink raw reply

* Re: [RFC PATCH bpf-next v2 0/4] Implement bpf queue/stack maps
From: Alexei Starovoitov @ 2018-09-11  1:04 UTC (permalink / raw)
  To: Mauricio Vasquez
  Cc: Alexei Starovoitov, Daniel Borkmann, Network Development,
	Joe Stringer

On Fri, Sep 7, 2018 at 1:40 PM, Mauricio Vasquez
<mauricio.vasquez@polito.it> wrote:
>
> I read the Joe's proposal and using that for this problem looks like a nice
> solution.
>
> I think a good trade-off for now would be to go ahead with a queue/stack map
> without preallocating support (or maybe include it having always in mind
> that this issue has to be solved in the near future) and then, as a
> separated work, try to use Joe's proposal in the map helpers.
>
> What do you think?

the problem with such approach is that it would mean that
non-prealloc stack/queue api will be different from future one
after verifier will get smarter.
The alternative would be to support by-value api only.
Meaning let stack/queue support value_size = 1,2,4,8 byte only.
Then bpf_push|pop_elem() helper will be by-value
instead of returning a pointer.
No rcu callback issues and implementation on the kernel
side can be much simpler.
I think simple array of value_size elems with head/tail indices
will be enough.
Once we have Joe's verifier improvements
we can add alloc/free bpf object management facility
and use 8-byte stack/queue map as a stack of pointers.
I think decoupling memory operations alloc/free from
stack push/pop would be additional benefit of such design.

^ permalink raw reply

* [PATCH][net-next][v2] netlink: remove hash::nelems check in netlink_insert
From: Li RongQing @ 2018-09-11  1:05 UTC (permalink / raw)
  To: netdev

The type of hash::nelems has been changed from size_t to atomic_t,
which in fact is int, so there is no need to check whether BITS_PER_LONG,
that is the bit width of size_t, is bigger than 32.

rht_grow_above_max() will also be called to check whether the hashtable
is too big, ensuring it cannot grow bigger than 1<<31.

Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
Signed-off-by: Li RongQing <lirongqing@baidu.com>
---
 net/netlink/af_netlink.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index b4a29bcc33b9..e3a0538ec0be 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -574,11 +574,6 @@ static int netlink_insert(struct sock *sk, u32 portid)
 	if (nlk_sk(sk)->bound)
 		goto err;
 
-	err = -ENOMEM;
-	if (BITS_PER_LONG > 32 &&
-	    unlikely(atomic_read(&table->hash.nelems) >= UINT_MAX))
-		goto err;
-
 	nlk_sk(sk)->portid = portid;
 	sock_hold(sk);
 
-- 
2.16.2

^ permalink raw reply related

* Re: [PATCH net] ipv6: use rt6_info members when dst is set in rt6_fill_node
From: Hangbin Liu @ 2018-09-11  1:04 UTC (permalink / raw)
  To: David Ahern; +Cc: Xin Long, network dev, davem, Roopa Prabhu
In-Reply-To: <c519ac67-a32e-4b68-76d4-49378e1440df@cumulusnetworks.com>

On Mon, Sep 10, 2018 at 01:07:11PM -0600, David Ahern wrote:
> On 9/10/18 11:55 AM, Xin Long wrote:
> > On Tue, Sep 11, 2018 at 12:13 AM David Ahern <dsa@cumulusnetworks.com> wrote:
> >>
> >> On 9/9/18 12:29 AM, Xin Long wrote:
> >>>>> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> >>>>> index 18e00ce..e554922 100644
> >>>>> --- a/net/ipv6/route.c
> >>>>> +++ b/net/ipv6/route.c
> >>>>> @@ -4670,20 +4670,33 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
> >>>>>                        int iif, int type, u32 portid, u32 seq,
> >>>>>                        unsigned int flags)
> >>>>>  {
> >>>>> -     struct rtmsg *rtm;
> >>>>> +     struct rt6key *fib6_prefsrc, *fib6_dst, *fib6_src;
> >>>>> +     struct rt6_info *rt6 = (struct rt6_info *)dst;
> >>>>> +     u32 *pmetrics, table, fib6_flags;
> >>>>>       struct nlmsghdr *nlh;
> >>>>> +     struct rtmsg *rtm;
> >>>>>       long expires = 0;
> >>>>> -     u32 *pmetrics;
> >>>>> -     u32 table;
> >>>>>
> >>>>>       nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
> >>>>>       if (!nlh)
> >>>>>               return -EMSGSIZE;
> >>>>>
> >>>>> +     if (rt6) {
> >>>>> +             fib6_dst = &rt6->rt6i_dst;
> >>>>> +             fib6_src = &rt6->rt6i_src;
> >>>>> +             fib6_flags = rt6->rt6i_flags;
> >>>>> +             fib6_prefsrc = &rt6->rt6i_prefsrc;
> >>>>> +     } else {
> >>>>> +             fib6_dst = &rt->fib6_dst;
> >>>>> +             fib6_src = &rt->fib6_src;
> >>>>> +             fib6_flags = rt->fib6_flags;
> >>>>> +             fib6_prefsrc = &rt->fib6_prefsrc;
> >>>>> +     }
> >>>>
> >>>> Unless I am missing something at the moment, an rt6_info can only have
> >>>> the same dst, src and prefsrc as the fib6_info on which it is based.
> >>>> Thus, only the flags is needed above. That simplifies this patch a lot.
> >>> If dst, src and prefsrc in rt6_info are always the same as these in fib6_info,
> >>> why do we need them in rt6_info? we could just get it by 'from'.
> >>>
> >>
> >> I just sent a patch removing rt6i_prefsrc. It is set with only 1 reader
> >> that can be converted.
> >>
> >> rt6i_src is checked against the fib6_info to invalidate a dst if the src
> >> has changed, so a valid rt will always have the same rt6i_src as the
> >> rt->from.
> >>
> >> rt6i_dst is set to the dest address / 128 in cases, so it should be used
> >> for rt6_info cases above.
> > So that means, I will use rt6i_dst and rt6i_flags when dst is set?
> > how about I use rt6i_src there as well? just to make it look clear.
> > and plus the gw/nh dump fix in rt6_fill_node():
> > -        if (rt->fib6_nsiblings) {
> > +        if (rt6) {
> > +                if (fib6_flags & RTF_GATEWAY)
> > +                        if (nla_put_in6_addr(skb, RTA_GATEWAY,
> > +                                             &rt6->rt6i_gateway) < 0)
> > +                                goto nla_put_failure;
> > +
> > +                if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
> > +                        goto nla_put_failure;
> > +        } else if (rt->fib6_nsiblings) {
> >                  struct fib6_info *sibling, *next_sibling;
> >                  struct nlattr *mp;
> > 
> > looks good to you?
> > 
> 
> sure

Hmm... Then how do we deal with the other nh info filled by rt6_nexthop_info()?
I was planning to fix the gw issue[1] by syncing the gw and flags info, like:

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 18e00ce..62621b4 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -998,6 +998,21 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
 	rt->rt6i_prefsrc = ort->fib6_prefsrc;
 }
 
+static void rt6_update_info(struct rt6_info *rt)
+{
+	struct fib6_info *from;
+
+	rcu_read_lock();
+	from = rcu_dereference(rt->from);
+	fib6_info_hold(from);
+	rcu_read_unlock();
+
+	from->fib6_flags = rt->rt6i_flags;
+	from->fib6_nh.nh_gw = rt->rt6i_gateway;
+
+	fib6_info_release(from);
+}
+
 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
 					struct in6_addr *saddr)
 {
@@ -3446,6 +3463,8 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 		goto out;
 	}
 
+	rt6_update_info(nrt);
+
 	netevent.old = &rt->dst;
 	netevent.new = &nrt->dst;
 	netevent.daddr = &msg->dest;
-- 
2.5.5


[1] https://patchwork.ozlabs.org/patch/966972/

Thanks
Hangbin

^ permalink raw reply related

* [PATCH net-next v3 03/17] zinc: ChaCha20 generic C implementation
From: Jason A. Donenfeld @ 2018-09-11  1:08 UTC (permalink / raw)
  To: linux-kernel, netdev, davem, gregkh
  Cc: Jason A. Donenfeld, Andy Lutomirski, Samuel Neves,
	Jean-Philippe Aumasson, linux-crypto
In-Reply-To: <20180911010838.8818-1-Jason@zx2c4.com>

This implements the ChaCha20 permutation as a single C statement, by way
of the comma operator, which the compiler is able to simplify
terrifically.

Information: https://cr.yp.to/chacha.html

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: linux-crypto@vger.kernel.org
---
 include/zinc/chacha20.h      |  53 +++++++++++
 lib/zinc/Kconfig             |   4 +
 lib/zinc/Makefile            |   4 +
 lib/zinc/chacha20/chacha20.c | 168 +++++++++++++++++++++++++++++++++++
 lib/zinc/main.c              |   5 ++
 5 files changed, 234 insertions(+)
 create mode 100644 include/zinc/chacha20.h
 create mode 100644 lib/zinc/chacha20/chacha20.c

diff --git a/include/zinc/chacha20.h b/include/zinc/chacha20.h
new file mode 100644
index 000000000000..d09afbccb601
--- /dev/null
+++ b/include/zinc/chacha20.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#ifndef _ZINC_CHACHA20_H
+#define _ZINC_CHACHA20_H
+
+#include <asm/unaligned.h>
+#include <linux/simd.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+enum {
+	CHACHA20_IV_SIZE = 16,
+	CHACHA20_KEY_SIZE = 32,
+	CHACHA20_BLOCK_SIZE = 64,
+	HCHACHA20_KEY_SIZE = 32,
+	HCHACHA20_NONCE_SIZE = 16
+};
+
+struct chacha20_ctx {
+	u32 key[8];
+	u32 counter[4];
+} __aligned(32);
+
+void chacha20_fpu_init(void);
+
+static inline void chacha20_init(struct chacha20_ctx *state,
+				 const u8 key[CHACHA20_KEY_SIZE],
+				 const u64 nonce)
+{
+	state->key[0] = get_unaligned_le32(key + 0);
+	state->key[1] = get_unaligned_le32(key + 4);
+	state->key[2] = get_unaligned_le32(key + 8);
+	state->key[3] = get_unaligned_le32(key + 12);
+	state->key[4] = get_unaligned_le32(key + 16);
+	state->key[5] = get_unaligned_le32(key + 20);
+	state->key[6] = get_unaligned_le32(key + 24);
+	state->key[7] = get_unaligned_le32(key + 28);
+	state->counter[0] = state->counter[1] = 0;
+	state->counter[2] = nonce & U32_MAX;
+	state->counter[3] = nonce >> 32;
+}
+void chacha20(struct chacha20_ctx *state, u8 *dst, const u8 *src, u32 len,
+	      simd_context_t simd_context);
+
+/* Derived key should be 32-bit aligned */
+void hchacha20(u8 derived_key[CHACHA20_KEY_SIZE],
+	       const u8 nonce[HCHACHA20_NONCE_SIZE],
+	       const u8 key[HCHACHA20_KEY_SIZE], simd_context_t simd_context);
+
+#endif /* _ZINC_CHACHA20_H */
diff --git a/lib/zinc/Kconfig b/lib/zinc/Kconfig
index aa4f8d449d6b..5311a0d6ba8b 100644
--- a/lib/zinc/Kconfig
+++ b/lib/zinc/Kconfig
@@ -6,6 +6,10 @@ config ZINC
 	select NEON
 	select KERNEL_MODE_NEON
 
+config ZINC_CHACHA20
+	bool
+	select ZINC
+
 config ZINC_DEBUG
 	bool "Zinc cryptography library debugging and self-tests"
 	depends on ZINC
diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index dad47573de42..0b5a964bfba6 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -3,6 +3,10 @@ ccflags-y += -Wframe-larger-than=8192
 ccflags-y += -D'pr_fmt(fmt)=KBUILD_MODNAME ": " fmt'
 ccflags-$(CONFIG_ZINC_DEBUG) += -DDEBUG
 
+ifeq ($(CONFIG_ZINC_CHACHA20),y)
+zinc-y += chacha20/chacha20.o
+endif
+
 zinc-y += main.o
 
 obj-$(CONFIG_ZINC) := zinc.o
diff --git a/lib/zinc/chacha20/chacha20.c b/lib/zinc/chacha20/chacha20.c
new file mode 100644
index 000000000000..ab5ef07dd4b5
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20.c
@@ -0,0 +1,168 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * Implementation of the ChaCha20 stream cipher.
+ *
+ * Information: https://cr.yp.to/chacha.html
+ */
+
+#include <zinc/chacha20.h>
+
+#include <linux/kernel.h>
+#include <crypto/algapi.h>
+
+#ifndef HAVE_CHACHA20_ARCH_IMPLEMENTATION
+void __init chacha20_fpu_init(void)
+{
+}
+static inline bool chacha20_arch(u8 *out, const u8 *in, const size_t len,
+				 const u32 key[8], const u32 counter[4],
+				 simd_context_t simd_context)
+{
+	return false;
+}
+static inline bool hchacha20_arch(u8 *derived_key, const u8 *nonce,
+				  const u8 *key, simd_context_t simd_context)
+{
+	return false;
+}
+#endif
+
+#define EXPAND_32_BYTE_K 0x61707865U, 0x3320646eU, 0x79622d32U, 0x6b206574U
+
+#define QUARTER_ROUND(x, a, b, c, d) ( \
+	x[a] += x[b], \
+	x[d] = rol32((x[d] ^ x[a]), 16), \
+	x[c] += x[d], \
+	x[b] = rol32((x[b] ^ x[c]), 12), \
+	x[a] += x[b], \
+	x[d] = rol32((x[d] ^ x[a]), 8), \
+	x[c] += x[d], \
+	x[b] = rol32((x[b] ^ x[c]), 7) \
+)
+
+#define C(i, j) (i * 4 + j)
+
+#define DOUBLE_ROUND(x) ( \
+	/* Column Round */ \
+	QUARTER_ROUND(x, C(0, 0), C(1, 0), C(2, 0), C(3, 0)), \
+	QUARTER_ROUND(x, C(0, 1), C(1, 1), C(2, 1), C(3, 1)), \
+	QUARTER_ROUND(x, C(0, 2), C(1, 2), C(2, 2), C(3, 2)), \
+	QUARTER_ROUND(x, C(0, 3), C(1, 3), C(2, 3), C(3, 3)), \
+	/* Diagonal Round */ \
+	QUARTER_ROUND(x, C(0, 0), C(1, 1), C(2, 2), C(3, 3)), \
+	QUARTER_ROUND(x, C(0, 1), C(1, 2), C(2, 3), C(3, 0)), \
+	QUARTER_ROUND(x, C(0, 2), C(1, 3), C(2, 0), C(3, 1)), \
+	QUARTER_ROUND(x, C(0, 3), C(1, 0), C(2, 1), C(3, 2)) \
+)
+
+#define TWENTY_ROUNDS(x) ( \
+	DOUBLE_ROUND(x), \
+	DOUBLE_ROUND(x), \
+	DOUBLE_ROUND(x), \
+	DOUBLE_ROUND(x), \
+	DOUBLE_ROUND(x), \
+	DOUBLE_ROUND(x), \
+	DOUBLE_ROUND(x), \
+	DOUBLE_ROUND(x), \
+	DOUBLE_ROUND(x), \
+	DOUBLE_ROUND(x) \
+)
+
+static void chacha20_block_generic(__le32 *stream, u32 *state)
+{
+	u32 x[CHACHA20_BLOCK_SIZE / sizeof(u32)];
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(x); ++i)
+		x[i] = state[i];
+
+	TWENTY_ROUNDS(x);
+
+	for (i = 0; i < ARRAY_SIZE(x); ++i)
+		stream[i] = cpu_to_le32(x[i] + state[i]);
+
+	++state[12];
+}
+
+static void chacha20_generic(u8 *out, const u8 *in, u32 len, const u32 key[8],
+			     const u32 counter[4])
+{
+	__le32 buf[CHACHA20_BLOCK_SIZE / sizeof(__le32)];
+	u32 x[] = {
+		EXPAND_32_BYTE_K,
+		key[0], key[1], key[2], key[3],
+		key[4], key[5], key[6], key[7],
+		counter[0], counter[1], counter[2], counter[3]
+	};
+
+	if (out != in)
+		memmove(out, in, len);
+
+	while (len >= CHACHA20_BLOCK_SIZE) {
+		chacha20_block_generic(buf, x);
+		crypto_xor(out, (u8 *)buf, CHACHA20_BLOCK_SIZE);
+		len -= CHACHA20_BLOCK_SIZE;
+		out += CHACHA20_BLOCK_SIZE;
+	}
+	if (len) {
+		chacha20_block_generic(buf, x);
+		crypto_xor(out, (u8 *)buf, len);
+	}
+}
+
+void chacha20(struct chacha20_ctx *state, u8 *dst, const u8 *src, u32 len,
+	      simd_context_t simd_context)
+{
+	if (!chacha20_arch(dst, src, len, state->key, state->counter,
+			   simd_context))
+		chacha20_generic(dst, src, len, state->key, state->counter);
+	state->counter[0] += (len + 63) / 64;
+}
+EXPORT_SYMBOL(chacha20);
+
+static void hchacha20_generic(u8 derived_key[CHACHA20_KEY_SIZE],
+			      const u8 nonce[HCHACHA20_NONCE_SIZE],
+			      const u8 key[HCHACHA20_KEY_SIZE])
+{
+	__le32 *out = (__force __le32 *)derived_key;
+	u32 x[] = { EXPAND_32_BYTE_K,
+		    get_unaligned_le32(key + 0),
+		    get_unaligned_le32(key + 4),
+		    get_unaligned_le32(key + 8),
+		    get_unaligned_le32(key + 12),
+		    get_unaligned_le32(key + 16),
+		    get_unaligned_le32(key + 20),
+		    get_unaligned_le32(key + 24),
+		    get_unaligned_le32(key + 28),
+		    get_unaligned_le32(nonce + 0),
+		    get_unaligned_le32(nonce + 4),
+		    get_unaligned_le32(nonce + 8),
+		    get_unaligned_le32(nonce + 12)
+	};
+
+	TWENTY_ROUNDS(x);
+
+	out[0] = cpu_to_le32(x[0]);
+	out[1] = cpu_to_le32(x[1]);
+	out[2] = cpu_to_le32(x[2]);
+	out[3] = cpu_to_le32(x[3]);
+	out[4] = cpu_to_le32(x[12]);
+	out[5] = cpu_to_le32(x[13]);
+	out[6] = cpu_to_le32(x[14]);
+	out[7] = cpu_to_le32(x[15]);
+}
+
+/* Derived key should be 32-bit aligned */
+void hchacha20(u8 derived_key[CHACHA20_KEY_SIZE],
+	       const u8 nonce[HCHACHA20_NONCE_SIZE],
+	       const u8 key[HCHACHA20_KEY_SIZE], simd_context_t simd_context)
+{
+	if (!hchacha20_arch(derived_key, nonce, key, simd_context))
+		hchacha20_generic(derived_key, nonce, key);
+}
+/* Deliberately not EXPORT_SYMBOL'd, since there are few reasons why somebody
+ * should be using this directly, rather than via xchacha20. Revisit only in
+ * the unlikely event that somebody has a good reason to export this.
+ */
diff --git a/lib/zinc/main.c b/lib/zinc/main.c
index ceece33ff5a7..7e8e84b706b7 100644
--- a/lib/zinc/main.c
+++ b/lib/zinc/main.c
@@ -3,6 +3,8 @@
  * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
  */
 
+#include <zinc/chacha20.h>
+
 #include <linux/init.h>
 #include <linux/module.h>
 
@@ -17,6 +19,9 @@
 
 static int __init mod_init(void)
 {
+#ifdef CONFIG_ZINC_CHACHA20
+	chacha20_fpu_init();
+#endif
 	return 0;
 }
 
-- 
2.18.0

^ permalink raw reply related

* Re: [PATCH] net: ipv4: Use BUG_ON directly instead of a if condition followed by BUG
From: zhong jiang @ 2018-09-11  1:12 UTC (permalink / raw)
  To: Jiri Benc; +Cc: davem, edumazet, kuznet, yoshfuji, netdev, linux-kernel
In-Reply-To: <20180910173957.5f3728e6@redhat.com>

On 2018/9/10 23:39, Jiri Benc wrote:
> On Mon, 10 Sep 2018 22:38:02 +0800, zhong jiang wrote:
>> +			BUG(skb_copy_bits(skb, ptr, &tmp, 1));
> I doubt this builds.
 This is my fault. It should be BUG_ON.

 Thanks,
 zhong jiang
>  Jiri
>
> .
>

^ permalink raw reply

* Re: [PATCH net-next] virtio_net: ethtool tx napi configuration
From: Willem de Bruijn @ 2018-09-11  1:14 UTC (permalink / raw)
  To: Jason Wang
  Cc: Network Development, David Miller, caleb.raitto,
	Michael S. Tsirkin, Jon Olson (Google Drive), Willem de Bruijn
In-Reply-To: <78a3f2f8-7c86-f2a1-4ffe-b9bb270ed186@redhat.com>

> >> I cooked up a fixup, and it looks like it works in my setup:
> >>
> >> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> >> index b320b6b14749..9181c3f2f832 100644
> >> --- a/drivers/net/virtio_net.c
> >> +++ b/drivers/net/virtio_net.c
> >> @@ -2204,10 +2204,17 @@ static int virtnet_set_coalesce(struct
> >> net_device *dev,
> >>                   return -EINVAL;
> >>
> >>           if (napi_weight ^ vi->sq[0].napi.weight) {
> >> -               if (dev->flags & IFF_UP)
> >> -                       return -EBUSY;
> >> -               for (i = 0; i < vi->max_queue_pairs; i++)
> >> +               for (i = 0; i < vi->max_queue_pairs; i++) {
> >> +                       struct netdev_queue *txq =
> >> +                              netdev_get_tx_queue(vi->dev, i);
> >> +
> >> + virtnet_napi_tx_disable(&vi->sq[i].napi);
> >> +                       __netif_tx_lock_bh(txq);
> >>                           vi->sq[i].napi.weight = napi_weight;
> >> +                       __netif_tx_unlock_bh(txq);
> >> +                       virtnet_napi_tx_enable(vi, vi->sq[i].vq,
> >> + &vi->sq[i].napi);
> >> +               }
> >>           }
> >>
> >>           return 0;
> > Thanks! It passes my simple stress test, too. Which consists of two
> > concurrent loops, one toggling the ethtool option, another running
> > TCP_RR.
> >
> >> The only remaining case is the speculative tx polling in RX NAPI. I think we
> >> don't need to care about this case since it is not required for correctness.
> > As long as the txq lock is held that will be a noop, anyway. The other
> > concurrent action is skb_xmit_done. It looks correct to me, but need
> > to think about it a bit. The tricky transition is coming out of napi without
> > having >= 2 + MAX_SKB_FRAGS clean descriptors. If the queue is
> > stopped it may deadlock transmission in no-napi mode.
>
> Yes, maybe we can enable tx queue when napi weight is zero in
> virtnet_poll_tx().

Yes, that precaution should resolve that edge case.

> Let me do more stress test on this.
>
> >
> >>> Link: https://patchwork.ozlabs.org/patch/948149/
> >>> Suggested-by: Jason Wang <jasowang@redhat.com>
> >>> Signed-off-by: Willem de Bruijn <willemb@google.com>
> >>> ---
> >>>    drivers/net/virtio_net.c | 52 ++++++++++++++++++++++++++++++++++++++++
> >>>    1 file changed, 52 insertions(+)
> >>>
> >>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> >>> index 765920905226..b320b6b14749 100644
> >>> --- a/drivers/net/virtio_net.c
> >>> +++ b/drivers/net/virtio_net.c
> >>> @@ -66,6 +66,8 @@ DECLARE_EWMA(pkt_len, 0, 64)
> >>>
> >>>    #define VIRTNET_DRIVER_VERSION "1.0.0"
> >>>
> >>> +static const u32 ethtool_coalesce_napi_mask = (1UL << 10);
> >>> +
> >>>    static const unsigned long guest_offloads[] = {
> >>>        VIRTIO_NET_F_GUEST_TSO4,
> >>>        VIRTIO_NET_F_GUEST_TSO6,
> >>> @@ -2181,6 +2183,54 @@ static int virtnet_get_link_ksettings(struct net_device *dev,
> >>>        return 0;
> >>>    }
> >>>
> >>> +static int virtnet_set_coalesce(struct net_device *dev,
> >>> +                             struct ethtool_coalesce *ec)
> >>> +{
> >>> +     const struct ethtool_coalesce ec_default = {
> >>> +             .cmd = ETHTOOL_SCOALESCE,
> >>> +             .rx_max_coalesced_frames = 1,
> >> I think the rx part is not necessary.
> > The definition of ethtool_coalesce has:
> >
> > "* It is illegal to set both usecs and max_frames to zero as this
> >   * would cause interrupts to never be generated.  To disable
> >   * coalescing, set usecs = 0 and max_frames = 1."
> >
> > I'd rather not diverge from this prescribed behavior unless there's
> > a strong reason.
>
> I get this.
>
> >
> > On the related point in the other thread:
> >
> >> Rethink about this, how about something like:
> >>
> >> - UINT_MAX: no tx interrupt
> >> - other value: tx interrupt with possible interrupt moderation
> > Okay, that will be simpler to configure.
>
> I wonder whether we can use the ethtool_coalesce definition. E.g. usecs = 0 &&
> max_frames = 0 means no interrupt? Just a thought, I'm ok with either.

Come to think of it, max_frames 0 is no worse than max_frames
UINT_MAX. Sure, let's do that.

^ permalink raw reply

* [PATCHv2 iproute2] bridge/mdb: fix missing new line when show bridge mdb
From: Hangbin Liu @ 2018-09-11  1:26 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger, David Ahern, Phil Sutter, Hangbin Liu
In-Reply-To: <1536118423-20604-1-git-send-email-liuhangbin@gmail.com>

The output of "bridge mdb show" is missing the newline between entries on current iproute2, e.g.
]# bridge mdb show
34: br0  veth0_br  224.1.1.2  temp 34: br0  veth0_br  224.1.1.1  temp

After fix:
]# bridge mdb show
34: br0  veth0_br  224.1.1.2  temp
34: br0  veth0_br  224.1.1.1  temp

v2: use json print lib as Stephen suggested.

Reported-by: Ying Xu <yinxu@redhat.com>
Fixes: c7c1a1ef51aea ("bridge: colorize output and use JSON print library")
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
---
 bridge/mdb.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/bridge/mdb.c b/bridge/mdb.c
index f38dc67..408117d 100644
--- a/bridge/mdb.c
+++ b/bridge/mdb.c
@@ -107,6 +107,10 @@ static void br_print_router_ports(FILE *f, struct rtattr *attr,
 			fprintf(f, "%s ", port_ifname);
 		}
 	}
+
+	if (!is_json_context() && !show_stats)
+		print_string(PRINT_FP, NULL, "\n", NULL);
+
 	close_json_array(PRINT_JSON, NULL);
 }
 
@@ -164,6 +168,10 @@ static void print_mdb_entry(FILE *f, int ifindex, const struct br_mdb_entry *e,
 		print_string(PRINT_ANY, "timer", " %s",
 			     format_timer(timer));
 	}
+
+	if (!is_json_context())
+		print_string(PRINT_FP, NULL, "\n", NULL);
+
 	close_json_object();
 }
 
-- 
2.5.5

^ permalink raw reply related

* [Patch net v2] rds: fix two RCU related problems
From: Cong Wang @ 2018-09-11  1:27 UTC (permalink / raw)
  To: netdev; +Cc: Cong Wang, Sowmini Varadhan, Santosh Shilimkar, rds-devel

When a rds sock is bound, it is inserted into the bind_hash_table
which is protected by RCU. But when releasing rds sock, after it
is removed from this hash table, it is freed immediately without
respecting RCU grace period. This could cause some use-after-free
as reported by syzbot.

Mark the rds sock with SOCK_RCU_FREE before inserting it into the
bind_hash_table, so that it is always freed after an RCU grace
period.

The other problem is in rds_find_bound(), the rds sock could be
freed in between rhashtable_lookup_fast() and rds_sock_addref(),
so we need to extend RCU read lock protection in rds_find_bound()
to close this race condition.

Reported-and-tested-by: syzbot+8967084bcac563795dc6@syzkaller.appspotmail.com
Reported-by: syzbot+93a5839deb355537440f@syzkaller.appspotmail.com
Cc: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Cc: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Cc: rds-devel@oss.oracle.com
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
 net/rds/bind.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/rds/bind.c b/net/rds/bind.c
index 3ab55784b637..762d2c6788a3 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -76,11 +76,13 @@ struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port,
 	struct rds_sock *rs;

 	__rds_create_bind_key(key, addr, port, scope_id);
-	rs = rhashtable_lookup_fast(&bind_hash_table, key, ht_parms);
+	rcu_read_lock();
+	rs = rhashtable_lookup(&bind_hash_table, key, ht_parms);
 	if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
 		rds_sock_addref(rs);
 	else
 		rs = NULL;
+	rcu_read_unlock();

 	rdsdebug("returning rs %p for %pI6c:%u\n", rs, addr,
 		 ntohs(port));
@@ -235,6 +237,7 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		goto out;
 	}

+	sock_set_flag(sk, SOCK_RCU_FREE);
 	ret = rds_add_bound(rs, binding_addr, &port, scope_id);
 	if (ret)
 		goto out;
-- 
2.14.4

^ permalink raw reply related

* Re: [Patch net v2] rds: fix two RCU related problems
From: Santosh Shilimkar @ 2018-09-11  1:33 UTC (permalink / raw)
  To: Cong Wang, netdev; +Cc: Sowmini Varadhan, rds-devel
In-Reply-To: <20180911012726.5353-1-xiyou.wangcong@gmail.com>

On 9/10/2018 6:27 PM, Cong Wang wrote:
> When a rds sock is bound, it is inserted into the bind_hash_table
> which is protected by RCU. But when releasing rds sock, after it
> is removed from this hash table, it is freed immediately without
> respecting RCU grace period. This could cause some use-after-free
> as reported by syzbot.
> 
> Mark the rds sock with SOCK_RCU_FREE before inserting it into the
> bind_hash_table, so that it would be always freed after a RCU grace
> period.
> 
> The other problem is in rds_find_bound(), the rds sock could be
> freed in between rhashtable_lookup_fast() and rds_sock_addref(),
> so we need to extend RCU read lock protection in rds_find_bound()
> to close this race condition.
> 
> Reported-and-tested-by: syzbot+8967084bcac563795dc6@syzkaller.appspotmail.com
> Reported-by: syzbot+93a5839deb355537440f@syzkaller.appspotmail.com
> Cc: Sowmini Varadhan <sowmini.varadhan@oracle.com>
> Cc: Santosh Shilimkar <santosh.shilimkar@oracle.com>
> Cc: rds-devel@oss.oracle.com
> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
> ---
Thank you !!
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox