Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 1/5] ipvs: add backup_only flag to avoid loops
From: Simon Horman @ 2013-03-18 13:15 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov, Simon Horman
In-Reply-To: <1363612543-9787-1-git-send-email-horms@verge.net.au>

From: Julian Anastasov <ja@ssi.bg>

	Dmitry Akindinov is reporting for a problem where
SYNs are looping between the master and backup server
when the backup server is used as real server in DR mode
and has IPVS rules to function as director.

	Even when the backup function is enabled we
continue to forward traffic and schedule new connections
when the current master is using the backup server as
real server. While this is not a problem for NAT, for
DR and TUN method the backup server can not determine
if a request comes from client or from director.

	To avoid such loops add new sysctl flag
backup_only. It can be needed for DR/TUN setups that
do not need backup and director function at the
same time. When the backup function is enabled we
stop any forwarding and pass the traffic to the local
stack (real server mode). The flag disables the
director function when the backup function is enabled.

	For setups that enable backup function for
some virtual services and director function for
other virtual services there should be another more
complex solution to support DR/TUN mode, may be to
assign per-virtual service syncid value, so that
we can differentiate the requests.

Reported-by: Dmitry Akindinov <dimak@stalker.com>
Tested-by: German Myzovsky <lawyer@sipnet.ru>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 Documentation/networking/ipvs-sysctl.txt |    7 +++++++
 include/net/ip_vs.h                      |   12 ++++++++++++
 net/netfilter/ipvs/ip_vs_core.c          |   12 ++++++++----
 net/netfilter/ipvs/ip_vs_ctl.c           |    7 +++++++
 4 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/Documentation/networking/ipvs-sysctl.txt b/Documentation/networking/ipvs-sysctl.txt
index f2a2488..9573d0c 100644
--- a/Documentation/networking/ipvs-sysctl.txt
+++ b/Documentation/networking/ipvs-sysctl.txt
@@ -15,6 +15,13 @@ amemthresh - INTEGER
         enabled and the variable is automatically set to 2, otherwise
         the strategy is disabled and the variable is  set  to 1.
 
+backup_only - BOOLEAN
+	0 - disabled (default)
+	not 0 - enabled
+
+	If set, disable the director function while the server is
+	in backup mode to avoid packet loops for DR/TUN methods.
+
 conntrack - BOOLEAN
 	0 - disabled (default)
 	not 0 - enabled
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 68c69d5..fce8e6b 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -976,6 +976,7 @@ struct netns_ipvs {
 	int			sysctl_sync_retries;
 	int			sysctl_nat_icmp_send;
 	int			sysctl_pmtu_disc;
+	int			sysctl_backup_only;
 
 	/* ip_vs_lblc */
 	int			sysctl_lblc_expiration;
@@ -1067,6 +1068,12 @@ static inline int sysctl_pmtu_disc(struct netns_ipvs *ipvs)
 	return ipvs->sysctl_pmtu_disc;
 }
 
+static inline int sysctl_backup_only(struct netns_ipvs *ipvs)
+{
+	return ipvs->sync_state & IP_VS_STATE_BACKUP &&
+	       ipvs->sysctl_backup_only;
+}
+
 #else
 
 static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs)
@@ -1114,6 +1121,11 @@ static inline int sysctl_pmtu_disc(struct netns_ipvs *ipvs)
 	return 1;
 }
 
+static inline int sysctl_backup_only(struct netns_ipvs *ipvs)
+{
+	return 0;
+}
+
 #endif
 
 /*
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 47edf5a..18b4bc5 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1577,7 +1577,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	}
 	/* ipvs enabled in this netns ? */
 	net = skb_net(skb);
-	if (!net_ipvs(net)->enable)
+	ipvs = net_ipvs(net);
+	if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
 		return NF_ACCEPT;
 
 	ip_vs_fill_iph_skb(af, skb, &iph);
@@ -1654,7 +1655,6 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	}
 
 	IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
-	ipvs = net_ipvs(net);
 	/* Check the server status */
 	if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
 		/* the destination server is not available */
@@ -1815,13 +1815,15 @@ ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
 {
 	int r;
 	struct net *net;
+	struct netns_ipvs *ipvs;
 
 	if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
 		return NF_ACCEPT;
 
 	/* ipvs enabled in this netns ? */
 	net = skb_net(skb);
-	if (!net_ipvs(net)->enable)
+	ipvs = net_ipvs(net);
+	if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
 		return NF_ACCEPT;
 
 	return ip_vs_in_icmp(skb, &r, hooknum);
@@ -1835,6 +1837,7 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
 {
 	int r;
 	struct net *net;
+	struct netns_ipvs *ipvs;
 	struct ip_vs_iphdr iphdr;
 
 	ip_vs_fill_iph_skb(AF_INET6, skb, &iphdr);
@@ -1843,7 +1846,8 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
 
 	/* ipvs enabled in this netns ? */
 	net = skb_net(skb);
-	if (!net_ipvs(net)->enable)
+	ipvs = net_ipvs(net);
+	if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
 		return NF_ACCEPT;
 
 	return ip_vs_in_icmp_v6(skb, &r, hooknum, &iphdr);
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index c68198b..9e2d1cc 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1808,6 +1808,12 @@ static struct ctl_table vs_vars[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "backup_only",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 #ifdef CONFIG_IP_VS_DEBUG
 	{
 		.procname	= "debug_level",
@@ -3741,6 +3747,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
 	tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
 	ipvs->sysctl_pmtu_disc = 1;
 	tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
+	tbl[idx++].data = &ipvs->sysctl_backup_only;
 
 
 	ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 2/5] ipvs: remove extra rcu lock
From: Simon Horman @ 2013-03-18 13:15 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov, Simon Horman
In-Reply-To: <1363612543-9787-1-git-send-email-horms@verge.net.au>

From: Julian Anastasov <ja@ssi.bg>

	In 3.7 we added code that uses ipv4_update_pmtu
but after commit c5ae7d4192 (ipv4: must use rcu protection
while calling fib_lookup) the RCU lock is not needed.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_core.c |    2 --
 1 file changed, 2 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 18b4bc5..61f49d2 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1394,10 +1394,8 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
 			skb_reset_network_header(skb);
 			IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n",
 				&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu);
-			rcu_read_lock();
 			ipv4_update_pmtu(skb, dev_net(skb->dev),
 					 mtu, 0, 0, 0, 0);
-			rcu_read_unlock();
 			/* Client uses PMTUD? */
 			if (!(cih->frag_off & htons(IP_DF)))
 				goto ignore_ipip;
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 3/5] ipvs: fix sctp chunk length order
From: Simon Horman @ 2013-03-18 13:15 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov, Simon Horman
In-Reply-To: <1363612543-9787-1-git-send-email-horms@verge.net.au>

From: Julian Anastasov <ja@ssi.bg>

	Fix wrong but non-fatal access to chunk length.
sch->length should be in network order, next chunk should
be aligned to 4 bytes. Problem noticed in sparse output.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_proto_sctp.c |   16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index ae8ec6f..cd1d729 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -906,7 +906,7 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
 	sctp_chunkhdr_t _sctpch, *sch;
 	unsigned char chunk_type;
 	int event, next_state;
-	int ihl;
+	int ihl, cofs;
 
 #ifdef CONFIG_IP_VS_IPV6
 	ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
@@ -914,8 +914,8 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
 	ihl = ip_hdrlen(skb);
 #endif
 
-	sch = skb_header_pointer(skb, ihl + sizeof(sctp_sctphdr_t),
-				sizeof(_sctpch), &_sctpch);
+	cofs = ihl + sizeof(sctp_sctphdr_t);
+	sch = skb_header_pointer(skb, cofs, sizeof(_sctpch), &_sctpch);
 	if (sch == NULL)
 		return;
 
@@ -933,10 +933,12 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
 	 */
 	if ((sch->type == SCTP_CID_COOKIE_ECHO) ||
 	    (sch->type == SCTP_CID_COOKIE_ACK)) {
-		sch = skb_header_pointer(skb, (ihl + sizeof(sctp_sctphdr_t) +
-				sch->length), sizeof(_sctpch), &_sctpch);
-		if (sch) {
-			if (sch->type == SCTP_CID_ABORT)
+		int clen = ntohs(sch->length);
+
+		if (clen >= sizeof(sctp_chunkhdr_t)) {
+			sch = skb_header_pointer(skb, cofs + ALIGN(clen, 4),
+						 sizeof(_sctpch), &_sctpch);
+			if (sch && sch->type == SCTP_CID_ABORT)
 				chunk_type = sch->type;
 		}
 	}
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 4/5] ipvs: fix hashing in ip_vs_svc_hashkey
From: Simon Horman @ 2013-03-18 13:15 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov, Simon Horman
In-Reply-To: <1363612543-9787-1-git-send-email-horms@verge.net.au>

From: Julian Anastasov <ja@ssi.bg>

	net is a pointer in host order, mix it properly
with other keys in network order. Fixes sparse warning.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_ctl.c |    8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 9e2d1cc..8104120 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -271,16 +271,18 @@ ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto,
 {
 	register unsigned int porth = ntohs(port);
 	__be32 addr_fold = addr->ip;
+	__u32 ahash;
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6)
 		addr_fold = addr->ip6[0]^addr->ip6[1]^
 			    addr->ip6[2]^addr->ip6[3];
 #endif
-	addr_fold ^= ((size_t)net>>8);
+	ahash = ntohl(addr_fold);
+	ahash ^= ((size_t) net >> 8);
 
-	return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
-		& IP_VS_SVC_TAB_MASK;
+	return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) &
+	       IP_VS_SVC_TAB_MASK;
 }
 
 /*
-- 
1.7.10.4

^ permalink raw reply related

* [GIT PULL nf-next] IPVS
From: Simon Horman @ 2013-03-18 13:15 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov

Hi Pablo,

please consider the following IPVS enhancements from Julian for 3.10.

----------------------------------------------------------------
The following changes since commit 1cdb09056b27b2a06b06dc7187d2c33d57082d20:

  netfilter: nfnetlink_queue: use xor hash function to distribute instances (2013-03-15 12:38:40 +0100)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs-next.git tags/ipvs-for-v3.10

for you to fetch changes up to e095c65caffc923a0d1fb27763c3f9ec1dcb57fc:

  ipvs: fix some sparse warnings (2013-03-18 19:25:14 +0900)

----------------------------------------------------------------
IPVS enhancements for v3.10 from Julian Anastasov

----------------------------------------------------------------
Julian Anastasov (5):
      ipvs: add backup_only flag to avoid loops
      ipvs: remove extra rcu lock
      ipvs: fix sctp chunk length order
      ipvs: fix hashing in ip_vs_svc_hashkey
      ipvs: fix some sparse warnings

 Documentation/networking/ipvs-sysctl.txt |    7 +++++++
 include/net/ip_vs.h                      |   14 +++++++++++++-
 net/netfilter/ipvs/ip_vs_core.c          |   22 +++++++++-------------
 net/netfilter/ipvs/ip_vs_ctl.c           |   15 ++++++++++++---
 net/netfilter/ipvs/ip_vs_est.c           |    2 +-
 net/netfilter/ipvs/ip_vs_proto_sctp.c    |   16 +++++++++-------
 6 files changed, 51 insertions(+), 25 deletions(-)

^ permalink raw reply

* Re: [PATCH 4/4] xen-netback: coalesce slots before copying
From: James Harper @ 2013-03-18 13:09 UTC (permalink / raw)
  To: Wei Liu, netdev@vger.kernel.org, xen-devel@lists.xen.org
  Cc: annie.li@oracle.com, ian.campbell@citrix.com,
	konrad.wilk@oracle.com
In-Reply-To: <1363602955-24790-5-git-send-email-wei.liu2@citrix.com>

> 
> This patch tries to coalesce tx requests when constructing grant copy
> structures. It enables netback to deal with situation when frontend's
> MAX_SKB_FRAGS is larger than backend's MAX_SKB_FRAGS.
> 
> It defines max_skb_slots, which is a estimation of the maximum number of
> slots
> a guest can send, anything bigger than that is considered malicious. Now it is
> set to 20, which should be enough to accommodate Linux (16 to 19) and
> possibly
> Windows (19?).
> 
> +/*
> + * This is an estimation of the maximum possible frags a SKB might
> + * have, anything larger than this is considered malicious. Typically
> + * Linux has 16 to 19, Windows has 19(?).
> + */

Could you remove the "Windows has 19(?)" comment? I don't think it's helpful, even with the "(?)"... I just checked and windows 2008R2 gives GPLPV a maximum of 20 buffers in all the testing I've done, and that's after the header is coalesced so it's probably more than that. I'm pretty sure I tested windows 2003 quite a while back and I could coax it into giving ridiculous numbers of buffers when using iperf with tiny buffers.

Maybe "Windows has >19" if you need to put a number on it?

James

^ permalink raw reply

* [PATCH] macvlan: Remove an unnecessary goto
From: Kusanagi Kouichi @ 2013-03-18 13:03 UTC (permalink / raw)
  To: netdev; +Cc: Patrick McHardy, linux-kernel

Use else instead.

Signed-off-by: Kusanagi Kouichi <slash@ac.auone-net.jp>
---
 drivers/net/macvlan.c |   27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 73abbc1..92d16ad 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -251,31 +251,30 @@ out:
 static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	const struct macvlan_dev *vlan = netdev_priv(dev);
-	const struct macvlan_port *port = vlan->port;
-	const struct macvlan_dev *dest;
-	__u8 ip_summed = skb->ip_summed;
 
 	if (vlan->mode == MACVLAN_MODE_BRIDGE) {
 		const struct ethhdr *eth = (void *)skb->data;
+		const struct macvlan_port * const port = vlan->port;
+		const __u8 ip_summed = skb->ip_summed;
+
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
 
-		/* send to other bridge ports directly */
 		if (is_multicast_ether_addr(eth->h_dest)) {
+			/* send to other bridge ports directly */
 			macvlan_broadcast(skb, port, dev, MACVLAN_MODE_BRIDGE);
-			goto xmit_world;
+		} else {
+			const struct macvlan_dev * const dest =
+				macvlan_hash_lookup(port, eth->h_dest);
+			if (dest && dest->mode == MACVLAN_MODE_BRIDGE) {
+				/* send to lowerdev first for its network taps */
+				dev_forward_skb(vlan->lowerdev, skb);
+				return NET_XMIT_SUCCESS;
+			}
 		}
 
-		dest = macvlan_hash_lookup(port, eth->h_dest);
-		if (dest && dest->mode == MACVLAN_MODE_BRIDGE) {
-			/* send to lowerdev first for its network taps */
-			dev_forward_skb(vlan->lowerdev, skb);
-
-			return NET_XMIT_SUCCESS;
-		}
+		skb->ip_summed = ip_summed;
 	}
 
-xmit_world:
-	skb->ip_summed = ip_summed;
 	skb->dev = vlan->lowerdev;
 	return dev_queue_xmit(skb);
 }
-- 
1.7.10.4

^ permalink raw reply related

* IPv6 testing - TAHI project on Linux
From: Alexandru Copot @ 2013-03-18 13:00 UTC (permalink / raw)
  To: netdev; +Cc: Daniel Baluta

Hello,

The TAHI project[1] is a testing framework for the IPv6 network
stack, originally designed for FreeBSD. Using a patch from  Mark Atkinson[2]
we can build and use v6eval on 32bit Linux. We were able to make some changes
and compile it on 64bit with multilib support.[3]

We also wrote some scripts[4] to create a virtual testing environment. The
scripts are able to download, configure and build a kernel then automatically
run the IPv6 core tests with v6eval inside 2 KVM virtual machines.

There are some tests known to fail:
* Spec tests 32, 33, 34, 35, 39, 40, 47, 48
  These fail because the kernel is currently more strict when receiving
  headers with invalid TLV options. So ICMP Parameter Problem is not sent
  for the following bad options.

* Addr tests 3, 5, 14, 15, 23, 24
  These fail when the Node Under Test has global IPv6 addresses configured
  for routing testing. However, the kernel does DAD properly.

If you are going to use the test automation scripts[4], you might also want
to use our modified v6eval-remotes[5]. These are using a SLIP connection for
remote control with SSH, instead of the plain serial connection.

Any feedback is welcome.

[1] - http://www.tahi.org
[2] - http://wundermark.blogspot.ro/2012/02/linux-patches-for-tahi-v6eval-332.html
[3] - https://github.com/IxLabs/tahi-linux/tree/master/v6eval
[4] - https://github.com/IxLabs/tahi-linux/tree/master/scripts
[5] - https://github.com/IxLabs/tahi-linux/tree/master/v6eval-remotes

^ permalink raw reply

* [PATCH] net: Fix a comment typo
From: Kusanagi Kouichi @ 2013-03-18 12:59 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel, Jiri Kosina

Signed-off-by: Kusanagi Kouichi <slash@ac.auone-net.jp>
---
 net/core/dev.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index dffbef7..69f696a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3326,7 +3326,7 @@ EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
  *	netdev_rx_handler_unregister - unregister receive handler
  *	@dev: device to unregister a handler from
  *
- *	Unregister a receive hander from a device.
+ *	Unregister a receive handler from a device.
  *
  *	The caller must hold the rtnl_mutex.
  */
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH v2 4/4] inet_diag: fix FIN_WAIT2 timer expression in inet_diag
From: Toshiaki Makita @ 2013-03-18 12:50 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, netdev; +Cc: Toshiaki Makita
In-Reply-To: <1363610780.7791.5.camel@ubuntu-vm-makita>

This is a change similar to the patch for /proc/net/tcp.
FIN_WAIT2 timers for orphaned sockets are also shown in inet_diag.
There will be no keepalive timer for orphand FIN_WAIT2 sockets by
this change.
Besides, timer calculation of an inet_connection_sock is splitted
into two pieces because few common codes remain between timer
calculation of a tcp_sock and a dccp_sock: Special handling with
FIN_WAIT2 is needed for a tcp_sock, and consideration of
ICSK_TIME_PROBE0 is unnecessary for a dccp_sock.

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
---
 include/linux/inet_diag.h |    2 ++
 net/dccp/diag.c           |   15 +++++++++++++++
 net/ipv4/inet_diag.c      |   20 --------------------
 net/ipv4/tcp_diag.c       |   27 +++++++++++++++++++++++++++
 4 files changed, 44 insertions(+), 20 deletions(-)

diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h
index 46da024..f7f359e 100644
--- a/include/linux/inet_diag.h
+++ b/include/linux/inet_diag.h
@@ -1,6 +1,7 @@
 #ifndef _INET_DIAG_H_
 #define _INET_DIAG_H_ 1
 
+#include <linux/kernel.h>
 #include <uapi/linux/inet_diag.h>
 
 struct sock;
@@ -27,6 +28,7 @@ struct inet_diag_handler {
 };
 
 struct inet_connection_sock;
+#define EXPIRES_IN_MS(tmo)  DIV_ROUND_UP((tmo - jiffies) * 1000, HZ)
 int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 			      struct sk_buff *skb, struct inet_diag_req_v2 *req,
 			      struct user_namespace *user_ns,
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
index 028fc43..b979a5b 100644
--- a/net/dccp/diag.c
+++ b/net/dccp/diag.c
@@ -42,6 +42,21 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info)
 static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
 			       void *_info)
 {
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+		r->idiag_timer = 1;
+		r->idiag_retrans = icsk->icsk_retransmits;
+		r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
+	} else if (timer_pending(&sk->sk_timer)) {
+		r->idiag_timer = 2;
+		r->idiag_retrans = icsk->icsk_probes_out;
+		r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
+	} else {
+		r->idiag_timer = 0;
+		r->idiag_expires = 0;
+	}
+
 	r->idiag_rqueue = r->idiag_wqueue = 0;
 
 	if (_info != NULL)
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 7afa2c3..32c22a7 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -156,26 +156,6 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 		goto out;
 	}
 
-#define EXPIRES_IN_MS(tmo)  DIV_ROUND_UP((tmo - jiffies) * 1000, HZ)
-
-	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
-		r->idiag_timer = 1;
-		r->idiag_retrans = icsk->icsk_retransmits;
-		r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
-	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
-		r->idiag_timer = 4;
-		r->idiag_retrans = icsk->icsk_probes_out;
-		r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
-	} else if (timer_pending(&sk->sk_timer)) {
-		r->idiag_timer = 2;
-		r->idiag_retrans = icsk->icsk_probes_out;
-		r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
-	} else {
-		r->idiag_timer = 0;
-		r->idiag_expires = 0;
-	}
-#undef EXPIRES_IN_MS
-
 	if (ext & (1 << (INET_DIAG_INFO - 1))) {
 		attr = nla_reserve(skb, INET_DIAG_INFO,
 				   sizeof(struct tcp_info));
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index ed3f2ad..120a4250 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -20,9 +20,36 @@
 static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
 			      void *_info)
 {
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_info *info = _info;
 
+	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+		r->idiag_timer = 1;
+		r->idiag_retrans = icsk->icsk_retransmits;
+		r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
+	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
+		r->idiag_timer = 4;
+		r->idiag_retrans = icsk->icsk_probes_out;
+		r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
+	} else if (timer_pending(&sk->sk_timer)) {
+		if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
+			const unsigned long timer_expires =
+				sk->sk_timer.expires +
+				(tp->linger2 >= 0 ? TCP_TIMEWAIT_LEN : 0);
+
+			r->idiag_timer = 3;
+			r->idiag_expires = EXPIRES_IN_MS(timer_expires);
+		} else {
+			r->idiag_timer = 2;
+			r->idiag_retrans = icsk->icsk_probes_out;
+			r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
+		}
+	} else {
+		r->idiag_timer = 0;
+		r->idiag_expires = 0;
+	}
+
 	if (sk->sk_state == TCP_LISTEN) {
 		r->idiag_rqueue = sk->sk_ack_backlog;
 		r->idiag_wqueue = sk->sk_max_ack_backlog;
-- 
1.7.10.4

^ permalink raw reply related

* Fwd: IW channels list question
From: Francisco Cuesta @ 2013-03-18 12:48 UTC (permalink / raw)
  To: netdev
In-Reply-To: <CANMvdA-oX331PYALksgwWRPycHs+rL6Kmk4CVXG3UsBpF=qKnQ@mail.gmail.com>

Hello iw developers,

I would like to ask a question related to the command iw list. The
channels shown with this command where are they taken from? I mean,
does iw call the drivers' API to see which frequencies it can provide?

I ask this in that if I change the regulatory domain to Japan for
instance, I cannot see the 4.9GHz channels with iw list, does it mean
that the driver doesn't support such a channels or that I maybe have
to modify iw list source in order to make them show up?

Thanks in advance,

REgards,

Francisco

^ permalink raw reply

* [PATCH v2 3/4] tcp: fix the indentation in timer calculation of /proc/net/tcp
From: Toshiaki Makita @ 2013-03-18 12:46 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, netdev; +Cc: Toshiaki Makita
In-Reply-To: <1363610615.7791.2.camel@ubuntu-vm-makita>

Fix the indentation in get_tcp4_sock and get_tcp6_sock.

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
---
 net/ipv4/tcp_ipv4.c |   12 ++++++------
 net/ipv6/tcp_ipv6.c |   12 ++++++------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 86a0685..6f0ae53 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2660,11 +2660,11 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
 	int rx_queue;
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
-		timer_active	= 1;
-		timer_expires	= icsk->icsk_timeout;
+		timer_active		= 1;
+		timer_expires		= icsk->icsk_timeout;
 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
-		timer_active	= 4;
-		timer_expires	= icsk->icsk_timeout;
+		timer_active		= 4;
+		timer_expires		= icsk->icsk_timeout;
 	} else if (timer_pending(&sk->sk_timer)) {
 		if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
 			timer_active	= 3;
@@ -2676,8 +2676,8 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
 			timer_expires	= sk->sk_timer.expires;
 		}
 	} else {
-		timer_active	= 0;
-		timer_expires = jiffies;
+		timer_active		= 0;
+		timer_expires		= jiffies;
 	}
 
 	if (sk->sk_state == TCP_LISTEN)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index b74b133..78cdb81 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1805,11 +1805,11 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
 	srcp  = ntohs(inet->inet_sport);
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
-		timer_active	= 1;
-		timer_expires	= icsk->icsk_timeout;
+		timer_active		= 1;
+		timer_expires		= icsk->icsk_timeout;
 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
-		timer_active	= 4;
-		timer_expires	= icsk->icsk_timeout;
+		timer_active		= 4;
+		timer_expires		= icsk->icsk_timeout;
 	} else if (timer_pending(&sp->sk_timer)) {
 		if (sp->sk_state == TCP_FIN_WAIT2 && sock_flag(sp, SOCK_DEAD)) {
 			timer_active	= 3;
@@ -1821,8 +1821,8 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
 			timer_expires	= sp->sk_timer.expires;
 		}
 	} else {
-		timer_active	= 0;
-		timer_expires = jiffies;
+		timer_active		= 0;
+		timer_expires		= jiffies;
 	}
 
 	seq_printf(seq,
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH v2 2/4] tcp: fix FIN_WAIT2 timer expression in /proc/net/tcp
From: Toshiaki Makita @ 2013-03-18 12:43 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, netdev; +Cc: Toshiaki Makita
In-Reply-To: <1363610344.7121.5.camel@ubuntu-vm-makita>

When tcp_fin_timeout is greater than 60 and a socket is half-closed,
/proc/net/tcp shows a FIN_WAIT2 keepalive timer. This is confusing
because keepalive probes will never be sent in this case.
Keepalive probes will really be sent with a FIN_WAIT2 keepalive timer
shown when a socket is not closed but shutdown and SO_KEEPALIVE is set.

Even without this change, a keepalive timer for an orphaned FIN_WAIT2
socket will eventually turn to a timewait timer, and by default,
or when tcp_fin_timeout is 60, there will be no keepalive timer for
orphaned FIN_WAIT2 sockets. Therefore, this change will hardly affect
a bad influence to userspace.

This patch changes the expression to show only timewait timer for
orphaned FIN_WAIT2 sockets.

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
---
 net/ipv4/tcp_ipv4.c |   11 +++++++++--
 net/ipv6/tcp_ipv6.c |   11 +++++++++--
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4a8ec45..86a0685 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2666,8 +2666,15 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
 		timer_active	= 4;
 		timer_expires	= icsk->icsk_timeout;
 	} else if (timer_pending(&sk->sk_timer)) {
-		timer_active	= 2;
-		timer_expires	= sk->sk_timer.expires;
+		if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
+			timer_active	= 3;
+			timer_expires	= sk->sk_timer.expires +
+					  (tp->linger2 >= 0 ?
+					   TCP_TIMEWAIT_LEN : 0);
+		} else {
+			timer_active	= 2;
+			timer_expires	= sk->sk_timer.expires;
+		}
 	} else {
 		timer_active	= 0;
 		timer_expires = jiffies;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 9b64600..b74b133 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1811,8 +1811,15 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
 		timer_active	= 4;
 		timer_expires	= icsk->icsk_timeout;
 	} else if (timer_pending(&sp->sk_timer)) {
-		timer_active	= 2;
-		timer_expires	= sp->sk_timer.expires;
+		if (sp->sk_state == TCP_FIN_WAIT2 && sock_flag(sp, SOCK_DEAD)) {
+			timer_active	= 3;
+			timer_expires	= sp->sk_timer.expires +
+					  (tp->linger2 >= 0 ?
+					   TCP_TIMEWAIT_LEN : 0);
+		} else {
+			timer_active	= 2;
+			timer_expires	= sp->sk_timer.expires;
+		}
 	} else {
 		timer_active	= 0;
 		timer_expires = jiffies;
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH v2 1/4] tcp: fix too short FIN_WAIT2 time out
From: Toshiaki Makita @ 2013-03-18 12:39 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, netdev; +Cc: Toshiaki Makita
In-Reply-To: <1363610163.7121.2.camel@ubuntu-vm-makita>

When tcp_fin_timeout is between 60 and 120, a FIN_WAIT2 socket
disappears in (tcp_fin_timeout - 60) * 2 sec, which is shorter
than expected.

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
---
 net/ipv4/tcp_timer.c |    8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b78aac3..c20e474 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -576,12 +576,8 @@ static void tcp_keepalive_timer (unsigned long data)
 
 	if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
 		if (tp->linger2 >= 0) {
-			const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
-
-			if (tmo > 0) {
-				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
-				goto out;
-			}
+			tcp_time_wait(sk, TCP_FIN_WAIT2, TCP_TIMEWAIT_LEN);
+			goto out;
 		}
 		tcp_send_active_reset(sk, GFP_ATOMIC);
 		goto death;
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH v2 0/4] tcp: fix problems when tcp_fin_timeout is greater than 60
From: Toshiaki Makita @ 2013-03-18 12:36 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, netdev; +Cc: Toshiaki Makita

This is resubmission of the patch set that fixes problems when
tcp_fin_timeout is greater than 60.

- fix too short FIN_WAIT2 timer when tcp_fin_timeout is between 60
  and 120.
- fix FIN_WAIT2 timer expression in /proc/net/tcp and inet_diag when
  tcp_fin_timeout is greater than 60.

v2:
 - Separated changes of indentation to another patch.
 - Revised the comment of the patch modifying the expression of
   /proc/net/tcp to explain the reason for change.
 - Modified inet_diag.

Toshiaki Makita (4):
  tcp: fix too short FIN_WAIT2 time out
  tcp: fix FIN_WAIT2 timer expression in /proc/net/tcp
  tcp: fix the indentation in timer calculation of /proc/net/tcp
  inet_diag: fix FIN_WAIT2 timer expression in inet_diag

 include/linux/inet_diag.h |    2 ++
 net/dccp/diag.c           |   15 +++++++++++++++
 net/ipv4/inet_diag.c      |   20 --------------------
 net/ipv4/tcp_diag.c       |   27 +++++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c       |   23 +++++++++++++++--------
 net/ipv4/tcp_timer.c      |    8 ++------
 net/ipv6/tcp_ipv6.c       |   23 +++++++++++++++--------
 7 files changed, 76 insertions(+), 42 deletions(-)

-- 
1.7.10.4

^ permalink raw reply

* Re: [PATCH 1/4] xen-netfront: remove unused variable `extra'
From: Ian Campbell @ 2013-03-18 12:14 UTC (permalink / raw)
  To: Wei Liu
  Cc: netdev@vger.kernel.org, xen-devel@lists.xen.org,
	konrad.wilk@oracle.com, annie.li@oracle.com
In-Reply-To: <1363608271.29093.196.camel@zion.uk.xensource.com>

On Mon, 2013-03-18 at 12:04 +0000, Wei Liu wrote:
> On Mon, 2013-03-18 at 11:42 +0000, Ian Campbell wrote:
> > On Mon, 2013-03-18 at 10:35 +0000, Wei Liu wrote:
> > 
> > I think a few more words are needed here since from the code you are
> > removing it seems very much like gso is used for something. If you have
> > a proof that the "extra = gso" case is never hit then please explain it.
> > Perhaps a reference to the removal of the last user?
> > 
> > Or maybe it is the case that it should be used and the bug is that it
> > isn't?
> > 
> 
> Looks like the latter one. 'extra' field should  be used to get hold of
> the last extra info in the ring. ;-)
> 
> But, the only extra info in upstream kernel is XEN_NETIF_EXTRA_TYPE_GSO,
> so there's really no other extra info in the ring at that point. Could
> it be possible that it is something from classic Xen kernel?

The classic kernel netfront has exactly the same code it seems and
netif_extra_type_gso is the only one I've ever heard of.

Maybe this extra thing is just redundant unless/until a second extra
comes along.

Ian.

^ permalink raw reply

* Re: [PATCH 1/4] xen-netfront: remove unused variable `extra'
From: Wei Liu @ 2013-03-18 12:04 UTC (permalink / raw)
  To: Ian Campbell
  Cc: wei.liu2, netdev@vger.kernel.org, xen-devel@lists.xen.org,
	konrad.wilk@oracle.com, annie.li@oracle.com
In-Reply-To: <1363606922.30193.21.camel@zakaz.uk.xensource.com>

On Mon, 2013-03-18 at 11:42 +0000, Ian Campbell wrote:
> On Mon, 2013-03-18 at 10:35 +0000, Wei Liu wrote:
> 
> I think a few more words are needed here since from the code you are
> removing it seems very much like gso is used for something. If you have
> a proof that the "extra = gso" case is never hit then please explain it.
> Perhaps a reference to the removal of the last user?
> 
> Or maybe it is the case that it should be used and the bug is that it
> isn't?
> 

Looks like the latter one. 'extra' field should  be used to get hold of
the last extra info in the ring. ;-)

But, the only extra info in upstream kernel is XEN_NETIF_EXTRA_TYPE_GSO,
so there's really no other extra info in the ring at that point. Could
it be possible that it is something from classic Xen kernel?


Wei.

> > Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> > ---
> >  drivers/net/xen-netfront.c |    8 +-------
> >  1 file changed, 1 insertion(+), 7 deletions(-)
> > 
> > diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
> > index 7ffa43b..5527663 100644
> > --- a/drivers/net/xen-netfront.c
> > +++ b/drivers/net/xen-netfront.c
> > @@ -537,7 +537,6 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
> >  	struct netfront_info *np = netdev_priv(dev);
> >  	struct netfront_stats *stats = this_cpu_ptr(np->stats);
> >  	struct xen_netif_tx_request *tx;
> > -	struct xen_netif_extra_info *extra;
> >  	char *data = skb->data;
> >  	RING_IDX i;
> >  	grant_ref_t ref;
> > @@ -581,7 +580,6 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
> >  	tx->gref = np->grant_tx_ref[id] = ref;
> >  	tx->offset = offset;
> >  	tx->size = len;
> > -	extra = NULL;
> >  
> >  	tx->flags = 0;
> >  	if (skb->ip_summed == CHECKSUM_PARTIAL)
> > @@ -597,10 +595,7 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
> >  		gso = (struct xen_netif_extra_info *)
> >  			RING_GET_REQUEST(&np->tx, ++i);
> >  
> > -		if (extra)
> > -			extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE;
> > -		else
> > -			tx->flags |= XEN_NETTXF_extra_info;
> > +		tx->flags |= XEN_NETTXF_extra_info;
> >  
> >  		gso->u.gso.size = skb_shinfo(skb)->gso_size;
> >  		gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
> > @@ -609,7 +604,6 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
> >  
> >  		gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
> >  		gso->flags = 0;
> > -		extra = gso;
> >  	}
> >  
> >  	np->tx.req_prod_pvt = i + 1;
> 
> 

^ permalink raw reply

* Re: [PATCH] ath: changed kmalloc to kmemdup
From: Kalle Valo @ 2013-03-18 12:09 UTC (permalink / raw)
  To: Andrei Epure; +Cc: linville, linux-wireless, netdev, linux-kernel
In-Reply-To: <1362919199-16292-1-git-send-email-epure.andrei@gmail.com>

Andrei Epure <epure.andrei@gmail.com> writes:

> Signed-off-by: Andrei Epure <epure.andrei@gmail.com>
> ---
>  drivers/net/wireless/ath/ath6kl/usb.c |    4 +---
>  1 file changed, 1 insertion(+), 3 deletions(-)
>
> diff --git a/drivers/net/wireless/ath/ath6kl/usb.c b/drivers/net/wireless/ath/ath6kl/usb.c
> index 5fcd342..ffa1daa 100644
> --- a/drivers/net/wireless/ath/ath6kl/usb.c
> +++ b/drivers/net/wireless/ath/ath6kl/usb.c
> @@ -856,11 +856,9 @@ static int ath6kl_usb_submit_ctrl_out(struct ath6kl_usb *ar_usb,
>  	int ret;
>  
>  	if (size > 0) {
> -		buf = kmalloc(size, GFP_KERNEL);
> +		buf = kmemdup(data, size, GFP_KERNEL);
>  		if (buf == NULL)
>  			return -ENOMEM;
> -
> -		memcpy(buf, data, size);
>  	}

Thanks, applied to ath6kl.git.

-- 
Kalle Valo

^ permalink raw reply

* Re: [PATCH 4/4] xen-netback: coalesce slots before copying
From: Ian Campbell @ 2013-03-18 12:07 UTC (permalink / raw)
  To: Wei Liu
  Cc: netdev@vger.kernel.org, xen-devel@lists.xen.org,
	konrad.wilk@oracle.com, annie.li@oracle.com
In-Reply-To: <1363602955-24790-5-git-send-email-wei.liu2@citrix.com>

On Mon, 2013-03-18 at 10:35 +0000, Wei Liu wrote:
> This patch tries to coalesce tx requests when constructing grant copy
> structures. It enables netback to deal with situation when frontend's
> MAX_SKB_FRAGS is larger than backend's MAX_SKB_FRAGS.
> 
> It defines max_skb_slots, which is a estimation of the maximum number of slots
> a guest can send, anything bigger than that is considered malicious. Now it is
> set to 20, which should be enough to accommodate Linux (16 to 19) and possibly
> Windows (19?).
> 
> Also change variable name from "frags" to "slots" in netbk_count_requests.
> 
> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> ---
>  drivers/net/xen-netback/netback.c |  204 ++++++++++++++++++++++++++++---------
>  1 file changed, 157 insertions(+), 47 deletions(-)
> 
> diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
> index 6e8e51a..d7bbce9 100644
> --- a/drivers/net/xen-netback/netback.c
> +++ b/drivers/net/xen-netback/netback.c
> @@ -47,9 +47,20 @@
>  #include <asm/xen/hypercall.h>
>  #include <asm/xen/page.h>
>  
> +/*
> + * This is an estimation of the maximum possible frags a SKB might
> + * have, anything larger than this is considered malicious. Typically
> + * Linux has 16 to 19, Windows has 19(?).
> + */
> +#define MAX_SKB_SLOTS_DEFAULT 20
> +static unsigned int max_skb_slots = MAX_SKB_SLOTS_DEFAULT;
> +module_param(max_skb_slots, uint, 0444);
> +
>  struct pending_tx_info {
> -	struct xen_netif_tx_request req;
> +	struct xen_netif_tx_request req; /* coalesced tx request  */
>  	struct xenvif *vif;
> +	unsigned int nr_tx_req; /* how many tx req we have in a chain (>=1) */
> +	unsigned int start_idx; /* starting index of pending ring index */

This one should be a RING_IDX I think, not an unsigned int.

>  };
>  typedef unsigned int pending_ring_idx_t;
>  
> @@ -251,7 +262,7 @@ static int max_required_rx_slots(struct xenvif *vif)
>  	int max = DIV_ROUND_UP(vif->dev->mtu, PAGE_SIZE);
>  
>  	if (vif->can_sg || vif->gso || vif->gso_prefix)
> -		max += MAX_SKB_FRAGS + 1; /* extra_info + frags */
> +		max += max_skb_slots + 1; /* extra_info + frags */
>  
>  	return max;
>  }
> @@ -657,7 +668,7 @@ static void xen_netbk_rx_action(struct xen_netbk *netbk)
>  		__skb_queue_tail(&rxq, skb);
>  
>  		/* Filled the batch queue? */
> -		if (count + MAX_SKB_FRAGS >= XEN_NETIF_RX_RING_SIZE)
> +		if (count + max_skb_slots >= XEN_NETIF_RX_RING_SIZE)
>  			break;
>  	}
>  
> @@ -908,34 +919,34 @@ static int netbk_count_requests(struct xenvif *vif,
>  				int work_to_do)
>  {
>  	RING_IDX cons = vif->tx.req_cons;
> -	int frags = 0;
> +	int slots = 0;
>  
>  	if (!(first->flags & XEN_NETTXF_more_data))
>  		return 0;
>  
>  	do {
> -		if (frags >= work_to_do) {
> -			netdev_err(vif->dev, "Need more frags\n");
> +		if (slots >= work_to_do) {
> +			netdev_err(vif->dev, "Need more slots\n");
>  			netbk_fatal_tx_err(vif);
>  			return -ENODATA;
>  		}
>  
> -		if (unlikely(frags >= MAX_SKB_FRAGS)) {
> -			netdev_err(vif->dev, "Too many frags\n");
> +		if (unlikely(slots >= max_skb_slots)) {
> +			netdev_err(vif->dev, "Too many slots\n");
>  			netbk_fatal_tx_err(vif);
>  			return -E2BIG;
>  		}
>  
> -		memcpy(txp, RING_GET_REQUEST(&vif->tx, cons + frags),
> +		memcpy(txp, RING_GET_REQUEST(&vif->tx, cons + slots),
>  		       sizeof(*txp));
>  		if (txp->size > first->size) {
> -			netdev_err(vif->dev, "Frag is bigger than frame.\n");
> +			netdev_err(vif->dev, "Packet is bigger than frame.\n");
>  			netbk_fatal_tx_err(vif);
>  			return -EIO;
>  		}
>  
>  		first->size -= txp->size;
> -		frags++;
> +		slots++;
>  
>  		if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
>  			netdev_err(vif->dev, "txp->offset: %x, size: %u\n",
> @@ -944,7 +955,7 @@ static int netbk_count_requests(struct xenvif *vif,
>  			return -EINVAL;
>  		}
>  	} while ((txp++)->flags & XEN_NETTXF_more_data);
> -	return frags;
> +	return slots;
>  }
>  
>  static struct page *xen_netbk_alloc_page(struct xen_netbk *netbk,
> @@ -968,48 +979,120 @@ static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk,
>  	struct skb_shared_info *shinfo = skb_shinfo(skb);
>  	skb_frag_t *frags = shinfo->frags;
>  	u16 pending_idx = *((u16 *)skb->data);
> -	int i, start;
> +	u16 head_idx = 0;
> +	int slot, start;
> +	struct page *page;
> +	pending_ring_idx_t index;
> +	uint16_t dst_offset;
> +	unsigned int nr_slots;
> +	struct pending_tx_info *first = NULL;
> +	int nr_txp;
> +	unsigned int start_idx = 0;
> +
> +	/* At this point shinfo->nr_frags is in fact the number of
> +	 * slots, which can be as large as max_skb_slots.
> +	 */
> +	nr_slots = shinfo->nr_frags;
>  
>  	/* Skip first skb fragment if it is on same page as header fragment. */
>  	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
>  
> -	for (i = start; i < shinfo->nr_frags; i++, txp++) {
> -		struct page *page;
> -		pending_ring_idx_t index;
> +	/* Coalesce tx requests, at this point the packet passed in
> +	 * should be <= 64K. Any packets larger than 64K has been
> +	 * dropped / caused fatal error early on.

Whereabouts is this? Since the size field is u16 how do we even detect
this case. Since (at least prior to your other fix in this series) it
would have overflowed when the guest constructed the request.


> @@ -1025,6 +1108,7 @@ static int xen_netbk_tx_check_gop(struct xen_netbk *netbk,
>  	struct gnttab_copy *gop = *gopp;
>  	u16 pending_idx = *((u16 *)skb->data);
>  	struct skb_shared_info *shinfo = skb_shinfo(skb);
> +	struct pending_tx_info *tx_info;
>  	int nr_frags = shinfo->nr_frags;
>  	int i, err, start;
>  
> @@ -1037,12 +1121,17 @@ static int xen_netbk_tx_check_gop(struct xen_netbk *netbk,
>  	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
>  
>  	for (i = start; i < nr_frags; i++) {
> -		int j, newerr;
> +		int j, newerr = 0, n;
>  
>  		pending_idx = frag_get_pending_idx(&shinfo->frags[i]);
> +		tx_info = &netbk->pending_tx_info[pending_idx];
>  
>  		/* Check error status: if okay then remember grant handle. */
> -		newerr = (++gop)->status;
> +		for (n = 0; n < tx_info->nr_tx_req; n++) {
struct pending_tx_info is used in some arrays which can have a fair few
elements so if there are ways to reduce the size that is worth
considering I think.

So rather than storing both nr_tx_req and start_idx can we just store
start_idx and loop while start_idx != 0 (where the first one has
start_idx == zero)?

This might fall out more naturally if you were to instead store next_idx
in each pending tx with a suitable terminator at the end? Or could be
last_idx if it is convenient to count that way round, you don't need to
respond in-order.

Ian.

^ permalink raw reply

* [PATCH 3/3] net: ftgmac100: Use module_platform_driver()
From: Sachin Kamat @ 2013-03-18 11:50 UTC (permalink / raw)
  To: netdev; +Cc: davem, sachin.kamat, Po-Yu Chuang
In-Reply-To: <1363607448-17369-1-git-send-email-sachin.kamat@linaro.org>

module_platform_driver macro removes some boilerplate and makes
the code simpler.

Signed-off-by: Sachin Kamat <sachin.kamat@linaro.org>
Cc: Po-Yu Chuang <ratbert@faraday-tech.com>
---
 drivers/net/ethernet/faraday/ftgmac100.c |   17 +----------------
 1 files changed, 1 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c
index 0e817e6..21b85fb 100644
--- a/drivers/net/ethernet/faraday/ftgmac100.c
+++ b/drivers/net/ethernet/faraday/ftgmac100.c
@@ -1349,22 +1349,7 @@ static struct platform_driver ftgmac100_driver = {
 	},
 };
 
-/******************************************************************************
- * initialization / finalization
- *****************************************************************************/
-static int __init ftgmac100_init(void)
-{
-	pr_info("Loading version " DRV_VERSION " ...\n");
-	return platform_driver_register(&ftgmac100_driver);
-}
-
-static void __exit ftgmac100_exit(void)
-{
-	platform_driver_unregister(&ftgmac100_driver);
-}
-
-module_init(ftgmac100_init);
-module_exit(ftgmac100_exit);
+module_platform_driver(ftgmac100_driver);
 
 MODULE_AUTHOR("Po-Yu Chuang <ratbert@faraday-tech.com>");
 MODULE_DESCRIPTION("FTGMAC100 driver");
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 2/3] net: ep93xx_eth: Use module_platform_driver()
From: Sachin Kamat @ 2013-03-18 11:50 UTC (permalink / raw)
  To: netdev; +Cc: davem, sachin.kamat
In-Reply-To: <1363607448-17369-1-git-send-email-sachin.kamat@linaro.org>

module_platform_driver macro removes some boilerplate and makes
the code simpler.

Signed-off-by: Sachin Kamat <sachin.kamat@linaro.org>
---
 drivers/net/ethernet/cirrus/ep93xx_eth.c |   13 +------------
 1 files changed, 1 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/cirrus/ep93xx_eth.c b/drivers/net/ethernet/cirrus/ep93xx_eth.c
index 354cbb7..67b0388 100644
--- a/drivers/net/ethernet/cirrus/ep93xx_eth.c
+++ b/drivers/net/ethernet/cirrus/ep93xx_eth.c
@@ -887,18 +887,7 @@ static struct platform_driver ep93xx_eth_driver = {
 	},
 };
 
-static int __init ep93xx_eth_init_module(void)
-{
-	printk(KERN_INFO DRV_MODULE_NAME " version " DRV_MODULE_VERSION " loading\n");
-	return platform_driver_register(&ep93xx_eth_driver);
-}
-
-static void __exit ep93xx_eth_cleanup_module(void)
-{
-	platform_driver_unregister(&ep93xx_eth_driver);
-}
+module_platform_driver(ep93xx_eth_driver);
 
-module_init(ep93xx_eth_init_module);
-module_exit(ep93xx_eth_cleanup_module);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("platform:ep93xx-eth");
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 1/3] net: dm9000: Use module_platform_driver()
From: Sachin Kamat @ 2013-03-18 11:50 UTC (permalink / raw)
  To: netdev; +Cc: davem, sachin.kamat

module_platform_driver macro removes some boilerplate and makes
the code simpler.

Signed-off-by: Sachin Kamat <sachin.kamat@linaro.org>
---
This series compile tested against linux-next tree (20130318).
---
 drivers/net/ethernet/davicom/dm9000.c |   17 +----------------
 1 files changed, 1 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/davicom/dm9000.c b/drivers/net/ethernet/davicom/dm9000.c
index 8cdf025..f38f677 100644
--- a/drivers/net/ethernet/davicom/dm9000.c
+++ b/drivers/net/ethernet/davicom/dm9000.c
@@ -1687,22 +1687,7 @@ static struct platform_driver dm9000_driver = {
 	.remove  = dm9000_drv_remove,
 };
 
-static int __init
-dm9000_init(void)
-{
-	printk(KERN_INFO "%s Ethernet Driver, V%s\n", CARDNAME, DRV_VERSION);
-
-	return platform_driver_register(&dm9000_driver);
-}
-
-static void __exit
-dm9000_cleanup(void)
-{
-	platform_driver_unregister(&dm9000_driver);
-}
-
-module_init(dm9000_init);
-module_exit(dm9000_cleanup);
+module_platform_driver(dm9000_driver);
 
 MODULE_AUTHOR("Sascha Hauer, Ben Dooks");
 MODULE_DESCRIPTION("Davicom DM9000 network driver");
-- 
1.7.4.1

^ permalink raw reply related

* Re: [PATCH 2/4] xen-netfront: drop skb when skb->len > 65535
From: Ian Campbell @ 2013-03-18 11:42 UTC (permalink / raw)
  To: Wei Liu
  Cc: netdev@vger.kernel.org, xen-devel@lists.xen.org,
	konrad.wilk@oracle.com, annie.li@oracle.com
In-Reply-To: <1363602955-24790-3-git-send-email-wei.liu2@citrix.com>

On Mon, 2013-03-18 at 10:35 +0000, Wei Liu wrote:
> The `size' field of Xen network wire format is uint16_t, anything bigger than
> 65535 will cause overflow.
> 
> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> ---
>  drivers/net/xen-netfront.c |   12 ++++++++++++
>  1 file changed, 12 insertions(+)
> 
> diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
> index 5527663..8c3d065 100644
> --- a/drivers/net/xen-netfront.c
> +++ b/drivers/net/xen-netfront.c
> @@ -547,6 +547,18 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
>  	unsigned int len = skb_headlen(skb);
>  	unsigned long flags;
>  
> +	/*
> +	 * wire format of xen_netif_tx_request only supports skb->len
> +	 * < 64K, because size field in xen_netif_tx_request is
> +	 * uint16_t.

Is there some field we can set e.g. in struct ethernet_device which
would stop this from happening?


> +	 */
> +	if (unlikely(skb->len > (uint16_t)(~0))) {
> +		net_alert_ratelimited(
> +			"xennet: skb->len = %d, too big for wire format\n",
> +			skb->len);
> +		goto drop;
> +	}
> +
>  	slots = DIV_ROUND_UP(offset + len, PAGE_SIZE) +
>  		xennet_count_skb_frag_slots(skb);
>  	if (unlikely(slots > MAX_SKB_FRAGS + 1)) {

^ permalink raw reply

* Re: [PATCH 1/4] xen-netfront: remove unused variable `extra'
From: Ian Campbell @ 2013-03-18 11:42 UTC (permalink / raw)
  To: Wei Liu
  Cc: netdev@vger.kernel.org, xen-devel@lists.xen.org,
	konrad.wilk@oracle.com, annie.li@oracle.com
In-Reply-To: <1363602955-24790-2-git-send-email-wei.liu2@citrix.com>

On Mon, 2013-03-18 at 10:35 +0000, Wei Liu wrote:

I think a few more words are needed here since from the code you are
removing it seems very much like gso is used for something. If you have
a proof that the "extra = gso" case is never hit then please explain it.
Perhaps a reference to the removal of the last user?

Or maybe it is the case that it should be used and the bug is that it
isn't?

> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> ---
>  drivers/net/xen-netfront.c |    8 +-------
>  1 file changed, 1 insertion(+), 7 deletions(-)
> 
> diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
> index 7ffa43b..5527663 100644
> --- a/drivers/net/xen-netfront.c
> +++ b/drivers/net/xen-netfront.c
> @@ -537,7 +537,6 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
>  	struct netfront_info *np = netdev_priv(dev);
>  	struct netfront_stats *stats = this_cpu_ptr(np->stats);
>  	struct xen_netif_tx_request *tx;
> -	struct xen_netif_extra_info *extra;
>  	char *data = skb->data;
>  	RING_IDX i;
>  	grant_ref_t ref;
> @@ -581,7 +580,6 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
>  	tx->gref = np->grant_tx_ref[id] = ref;
>  	tx->offset = offset;
>  	tx->size = len;
> -	extra = NULL;
>  
>  	tx->flags = 0;
>  	if (skb->ip_summed == CHECKSUM_PARTIAL)
> @@ -597,10 +595,7 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
>  		gso = (struct xen_netif_extra_info *)
>  			RING_GET_REQUEST(&np->tx, ++i);
>  
> -		if (extra)
> -			extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE;
> -		else
> -			tx->flags |= XEN_NETTXF_extra_info;
> +		tx->flags |= XEN_NETTXF_extra_info;
>  
>  		gso->u.gso.size = skb_shinfo(skb)->gso_size;
>  		gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
> @@ -609,7 +604,6 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
>  
>  		gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
>  		gso->flags = 0;
> -		extra = gso;
>  	}
>  
>  	np->tx.req_prod_pvt = i + 1;

^ permalink raw reply

* Re: [PATCH 3/4] xen-netback: remove skb in xen_netbk_alloc_page
From: Ian Campbell @ 2013-03-18 11:37 UTC (permalink / raw)
  To: Wei Liu
  Cc: netdev@vger.kernel.org, xen-devel@lists.xen.org,
	konrad.wilk@oracle.com, annie.li@oracle.com
In-Reply-To: <1363602955-24790-4-git-send-email-wei.liu2@citrix.com>

On Mon, 2013-03-18 at 10:35 +0000, Wei Liu wrote:
> This variable is never used.
> 
> Signed-off-by: Wei Liu <wei.liu2@citrix.com>

Acked-by: Ian Campbell <ian.campbell@citrix.com>

> ---
>  drivers/net/xen-netback/netback.c |    5 ++---
>  1 file changed, 2 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
> index da726a3..6e8e51a 100644
> --- a/drivers/net/xen-netback/netback.c
> +++ b/drivers/net/xen-netback/netback.c
> @@ -948,7 +948,6 @@ static int netbk_count_requests(struct xenvif *vif,
>  }
>  
>  static struct page *xen_netbk_alloc_page(struct xen_netbk *netbk,
> -					 struct sk_buff *skb,
>  					 u16 pending_idx)
>  {
>  	struct page *page;
> @@ -982,7 +981,7 @@ static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk,
>  
>  		index = pending_index(netbk->pending_cons++);
>  		pending_idx = netbk->pending_ring[index];
> -		page = xen_netbk_alloc_page(netbk, skb, pending_idx);
> +		page = xen_netbk_alloc_page(netbk, pending_idx);
>  		if (!page)
>  			goto err;
>  
> @@ -1387,7 +1386,7 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
>  		}
>  
>  		/* XXX could copy straight to head */
> -		page = xen_netbk_alloc_page(netbk, skb, pending_idx);
> +		page = xen_netbk_alloc_page(netbk, pending_idx);
>  		if (!page) {
>  			kfree_skb(skb);
>  			netbk_tx_err(vif, &txreq, idx);

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox