Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 18/18] export sysctl symbols needed by ip_vs_sync
From: Simon Horman @ 2012-05-02  6:22 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov, Hans Schillstrom, Jesper Dangaard Brouer,
	Simon Horman
In-Reply-To: <1335939762-1912-1-git-send-email-horms@verge.net.au>

From: Hans Schillstrom <hans.schillstrom@ericsson.com>

To build ip_vs as a module sysctl_rmem_max and sysctl_wmem_max
needs to be exported.
The dependency was added by "ipvs: wakeup master thread" patch

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/core/sock.c |    2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/core/sock.c b/net/core/sock.c
index c7e60ea..ac3131a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -258,7 +258,9 @@ static struct lock_class_key af_callback_keys[AF_MAX];
 
 /* Run time adjustable parameters. */
 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
+EXPORT_SYMBOL(sysctl_wmem_max);
 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
+EXPORT_SYMBOL(sysctl_rmem_max);
 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 
-- 
1.7.10


^ permalink raw reply related

* [PATCH 13/18] ipvs: reduce sync rate with time thresholds
From: Simon Horman @ 2012-05-02  6:22 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov, Hans Schillstrom, Jesper Dangaard Brouer,
	Simon Horman
In-Reply-To: <1335939762-1912-1-git-send-email-horms@verge.net.au>

From: Julian Anastasov <ja@ssi.bg>

	Add two new sysctl vars to control the sync rate with the
main idea to reduce the rate for connection templates because
currently it depends on the packet rate for controlled connections.
This mechanism should be useful also for normal connections
with high traffic.

sync_refresh_period: in seconds, difference in reported connection
	timer that triggers new sync message. It can be used to
	avoid sync messages for the specified period (or half of
	the connection timeout if it is lower) if connection state
	is not changed from last sync.

sync_retries: integer, 0..3, defines sync retries with period of
	sync_refresh_period/8. Useful to protect against loss of
	sync messages.

	Allow sysctl_sync_threshold to be used with
sysctl_sync_period=0, so that only single sync message is sent
if sync_refresh_period is also 0.

	Add new field "sync_endtime" in connection structure to
hold the reported time when connection expires. The 2 lowest
bits will represent the retry count.

	As the sysctl_sync_period now can be 0 use ACCESS_ONCE to
avoid division by zero.

	Special thanks to Aleksey Chudov for being patient with me,
for his extensive reports and helping in all tests.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Tested-by: Aleksey Chudov <aleksey.chudov@gmail.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h             |   30 +++++++++-
 net/netfilter/ipvs/ip_vs_conn.c |    7 ++-
 net/netfilter/ipvs/ip_vs_core.c |   30 +---------
 net/netfilter/ipvs/ip_vs_ctl.c  |   25 +++++++-
 net/netfilter/ipvs/ip_vs_sync.c |  121 +++++++++++++++++++++++++++++++++------
 5 files changed, 165 insertions(+), 48 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 8721a78..941df45 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -505,6 +505,7 @@ struct ip_vs_conn {
 						 * state transition triggerd
 						 * synchronization
 						 */
+	unsigned long		sync_endtime;	/* jiffies + sent_retries */
 
 	/* Control members */
 	struct ip_vs_conn       *control;       /* Master control connection */
@@ -876,6 +877,8 @@ struct netns_ipvs {
 	int			sysctl_expire_nodest_conn;
 	int			sysctl_expire_quiescent_template;
 	int			sysctl_sync_threshold[2];
+	unsigned int		sysctl_sync_refresh_period;
+	int			sysctl_sync_retries;
 	int			sysctl_nat_icmp_send;
 
 	/* ip_vs_lblc */
@@ -917,10 +920,13 @@ struct netns_ipvs {
 #define DEFAULT_SYNC_THRESHOLD	3
 #define DEFAULT_SYNC_PERIOD	50
 #define DEFAULT_SYNC_VER	1
+#define DEFAULT_SYNC_REFRESH_PERIOD	(0U * HZ)
+#define DEFAULT_SYNC_RETRIES		0
 #define IPVS_SYNC_WAKEUP_RATE	8
 #define IPVS_SYNC_QLEN_MAX	(IPVS_SYNC_WAKEUP_RATE * 4)
 #define IPVS_SYNC_SEND_DELAY	(HZ / 50)
 #define IPVS_SYNC_CHECK_PERIOD	HZ
+#define IPVS_SYNC_FLUSH_TIME	(HZ * 2)
 
 #ifdef CONFIG_SYSCTL
 
@@ -931,7 +937,17 @@ static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs)
 
 static inline int sysctl_sync_period(struct netns_ipvs *ipvs)
 {
-	return ipvs->sysctl_sync_threshold[1];
+	return ACCESS_ONCE(ipvs->sysctl_sync_threshold[1]);
+}
+
+static inline unsigned int sysctl_sync_refresh_period(struct netns_ipvs *ipvs)
+{
+	return ACCESS_ONCE(ipvs->sysctl_sync_refresh_period);
+}
+
+static inline int sysctl_sync_retries(struct netns_ipvs *ipvs)
+{
+	return ipvs->sysctl_sync_retries;
 }
 
 static inline int sysctl_sync_ver(struct netns_ipvs *ipvs)
@@ -961,6 +977,16 @@ static inline int sysctl_sync_period(struct netns_ipvs *ipvs)
 	return DEFAULT_SYNC_PERIOD;
 }
 
+static inline unsigned int sysctl_sync_refresh_period(struct netns_ipvs *ipvs)
+{
+	return DEFAULT_SYNC_REFRESH_PERIOD;
+}
+
+static inline int sysctl_sync_retries(struct netns_ipvs *ipvs)
+{
+	return DEFAULT_SYNC_RETRIES & 3;
+}
+
 static inline int sysctl_sync_ver(struct netns_ipvs *ipvs)
 {
 	return DEFAULT_SYNC_VER;
@@ -1248,7 +1274,7 @@ extern struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp);
 extern int start_sync_thread(struct net *net, int state, char *mcast_ifn,
 			     __u8 syncid);
 extern int stop_sync_thread(struct net *net, int state);
-extern void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp);
+extern void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts);
 
 
 /*
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 9d237d7..46260fb 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -761,7 +761,8 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
 static void ip_vs_conn_expire(unsigned long data)
 {
 	struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
-	struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
+	struct net *net = ip_vs_conn_net(cp);
+	struct netns_ipvs *ipvs = net_ipvs(net);
 
 	cp->timeout = 60*HZ;
 
@@ -826,6 +827,9 @@ static void ip_vs_conn_expire(unsigned long data)
 		  atomic_read(&cp->refcnt)-1,
 		  atomic_read(&cp->n_control));
 
+	if (ipvs->sync_state & IP_VS_STATE_MASTER)
+		ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs));
+
 	ip_vs_conn_put(cp);
 }
 
@@ -899,6 +903,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
 	/* Set its state and timeout */
 	cp->state = 0;
 	cp->timeout = 3*HZ;
+	cp->sync_endtime = jiffies & ~3UL;
 
 	/* Bind its packet transmitter */
 #ifdef CONFIG_IP_VS_IPV6
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index b5a5c73..7ce5819 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1613,34 +1613,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	else
 		pkts = atomic_add_return(1, &cp->in_pkts);
 
-	if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
-	    cp->protocol == IPPROTO_SCTP) {
-		if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
-			(pkts % sysctl_sync_period(ipvs)
-			 == sysctl_sync_threshold(ipvs))) ||
-				(cp->old_state != cp->state &&
-				 ((cp->state == IP_VS_SCTP_S_CLOSED) ||
-				  (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) ||
-				  (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) {
-			ip_vs_sync_conn(net, cp);
-			goto out;
-		}
-	}
-
-	/* Keep this block last: TCP and others with pp->num_states <= 1 */
-	else if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
-	    (((cp->protocol != IPPROTO_TCP ||
-	       cp->state == IP_VS_TCP_S_ESTABLISHED) &&
-	      (pkts % sysctl_sync_period(ipvs)
-	       == sysctl_sync_threshold(ipvs))) ||
-	     ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
-	      ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
-	       (cp->state == IP_VS_TCP_S_CLOSE) ||
-	       (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
-	       (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
-		ip_vs_sync_conn(net, cp);
-out:
-	cp->old_state = cp->state;
+	if (ipvs->sync_state & IP_VS_STATE_MASTER)
+		ip_vs_sync_conn(net, cp, pkts);
 
 	ip_vs_conn_put(cp);
 	return ret;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 854e9a6..83bdbbc 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1599,6 +1599,10 @@ static int ip_vs_zero_all(struct net *net)
 }
 
 #ifdef CONFIG_SYSCTL
+
+static int zero;
+static int three = 3;
+
 static int
 proc_do_defense_mode(ctl_table *table, int write,
 		     void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -1632,7 +1636,8 @@ proc_do_sync_threshold(ctl_table *table, int write,
 	memcpy(val, valp, sizeof(val));
 
 	rc = proc_dointvec(table, write, buffer, lenp, ppos);
-	if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
+	if (write && (valp[0] < 0 || valp[1] < 0 ||
+	    (valp[0] >= valp[1] && valp[1]))) {
 		/* Restore the correct value */
 		memcpy(valp, val, sizeof(val));
 	}
@@ -1755,6 +1760,20 @@ static struct ctl_table vs_vars[] = {
 		.proc_handler	= proc_do_sync_threshold,
 	},
 	{
+		.procname	= "sync_refresh_period",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "sync_retries",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &three,
+	},
+	{
 		.procname	= "nat_icmp_send",
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
@@ -3685,6 +3704,10 @@ int __net_init ip_vs_control_net_init_sysctl(struct net *net)
 	ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
 	tbl[idx].data = &ipvs->sysctl_sync_threshold;
 	tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
+	ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
+	tbl[idx++].data = &ipvs->sysctl_sync_refresh_period;
+	ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3);
+	tbl[idx++].data = &ipvs->sysctl_sync_retries;
 	tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
 
 
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index eafc1d2..4aa9290 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -451,11 +451,94 @@ ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs)
 	return sb;
 }
 
+/* Check if conn should be synced.
+ * pkts: conn packets, use sysctl_sync_threshold to avoid packet check
+ * - (1) sync_refresh_period: reduce sync rate. Additionally, retry
+ *	sync_retries times with period of sync_refresh_period/8
+ * - (2) if both sync_refresh_period and sync_period are 0 send sync only
+ *	for state changes or only once when pkts matches sync_threshold
+ * - (3) templates: rate can be reduced only with sync_refresh_period or
+ *	with (2)
+ */
+static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs,
+				  struct ip_vs_conn *cp, int pkts)
+{
+	unsigned long orig = ACCESS_ONCE(cp->sync_endtime);
+	unsigned long now = jiffies;
+	unsigned long n = (now + cp->timeout) & ~3UL;
+	unsigned int sync_refresh_period;
+	int sync_period;
+	int force;
+
+	/* Check if we sync in current state */
+	if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE))
+		force = 0;
+	else if (likely(cp->protocol == IPPROTO_TCP)) {
+		if (!((1 << cp->state) &
+		      ((1 << IP_VS_TCP_S_ESTABLISHED) |
+		       (1 << IP_VS_TCP_S_FIN_WAIT) |
+		       (1 << IP_VS_TCP_S_CLOSE) |
+		       (1 << IP_VS_TCP_S_CLOSE_WAIT) |
+		       (1 << IP_VS_TCP_S_TIME_WAIT))))
+			return 0;
+		force = cp->state != cp->old_state;
+		if (force && cp->state != IP_VS_TCP_S_ESTABLISHED)
+			goto set;
+	} else if (unlikely(cp->protocol == IPPROTO_SCTP)) {
+		if (!((1 << cp->state) &
+		      ((1 << IP_VS_SCTP_S_ESTABLISHED) |
+		       (1 << IP_VS_SCTP_S_CLOSED) |
+		       (1 << IP_VS_SCTP_S_SHUT_ACK_CLI) |
+		       (1 << IP_VS_SCTP_S_SHUT_ACK_SER))))
+			return 0;
+		force = cp->state != cp->old_state;
+		if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED)
+			goto set;
+	} else {
+		/* UDP or another protocol with single state */
+		force = 0;
+	}
+
+	sync_refresh_period = sysctl_sync_refresh_period(ipvs);
+	if (sync_refresh_period > 0) {
+		long diff = n - orig;
+		long min_diff = max(cp->timeout >> 1, 10UL * HZ);
+
+		/* Avoid sync if difference is below sync_refresh_period
+		 * and below the half timeout.
+		 */
+		if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) {
+			int retries = orig & 3;
+
+			if (retries >= sysctl_sync_retries(ipvs))
+				return 0;
+			if (time_before(now, orig - cp->timeout +
+					(sync_refresh_period >> 3)))
+				return 0;
+			n |= retries + 1;
+		}
+	}
+	sync_period = sysctl_sync_period(ipvs);
+	if (sync_period > 0) {
+		if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) &&
+		    pkts % sync_period != sysctl_sync_threshold(ipvs))
+			return 0;
+	} else if (sync_refresh_period <= 0 &&
+		   pkts != sysctl_sync_threshold(ipvs))
+		return 0;
+
+set:
+	cp->old_state = cp->state;
+	n = cmpxchg(&cp->sync_endtime, orig, n);
+	return n == orig || force;
+}
+
 /*
  *      Version 0 , could be switched in by sys_ctl.
  *      Add an ip_vs_conn information into the current sync_buff.
  */
-void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp)
+static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
+			       int pkts)
 {
 	struct netns_ipvs *ipvs = net_ipvs(net);
 	struct ip_vs_sync_mesg_v0 *m;
@@ -468,6 +551,9 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp)
 	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
 		return;
 
+	if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
+		return;
+
 	spin_lock(&ipvs->sync_buff_lock);
 	if (!ipvs->sync_buff) {
 		ipvs->sync_buff =
@@ -513,8 +599,14 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp)
 	spin_unlock(&ipvs->sync_buff_lock);
 
 	/* synchronize its controller if it has */
-	if (cp->control)
-		ip_vs_sync_conn(net, cp->control);
+	cp = cp->control;
+	if (cp) {
+		if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+			pkts = atomic_add_return(1, &cp->in_pkts);
+		else
+			pkts = sysctl_sync_threshold(ipvs);
+		ip_vs_sync_conn(net, cp->control, pkts);
+	}
 }
 
 /*
@@ -522,7 +614,7 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp)
  *      Called by ip_vs_in.
  *      Sending Version 1 messages
  */
-void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp)
+void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts)
 {
 	struct netns_ipvs *ipvs = net_ipvs(net);
 	struct ip_vs_sync_mesg *m;
@@ -532,13 +624,16 @@ void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp)
 
 	/* Handle old version of the protocol */
 	if (sysctl_sync_ver(ipvs) == 0) {
-		ip_vs_sync_conn_v0(net, cp);
+		ip_vs_sync_conn_v0(net, cp, pkts);
 		return;
 	}
 	/* Do not sync ONE PACKET */
 	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
 		goto control;
 sloop:
+	if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
+		goto control;
+
 	/* Sanity checks */
 	pe_name_len = 0;
 	if (cp->pe_data_len) {
@@ -653,16 +748,10 @@ control:
 	cp = cp->control;
 	if (!cp)
 		return;
-	/*
-	 * Reduce sync rate for templates
-	 * i.e only increment in_pkts for Templates.
-	 */
-	if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
-		int pkts = atomic_add_return(1, &cp->in_pkts);
-
-		if (pkts % sysctl_sync_period(ipvs) != 1)
-			return;
-	}
+	if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+		pkts = atomic_add_return(1, &cp->in_pkts);
+	else
+		pkts = sysctl_sync_threshold(ipvs);
 	goto sloop;
 }
 
@@ -1494,7 +1583,7 @@ next_sync_buff(struct netns_ipvs *ipvs)
 	if (sb)
 		return sb;
 	/* Do not delay entries in buffer for more than 2 seconds */
-	return get_curr_sync_buff(ipvs, 2 * HZ);
+	return get_curr_sync_buff(ipvs, IPVS_SYNC_FLUSH_TIME);
 }
 
 static int sync_thread_master(void *data)
-- 
1.7.10


^ permalink raw reply related

* [PATCH 11/18] ipvs: always update some of the flags bits in backup
From: Simon Horman @ 2012-05-02  6:22 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov, Hans Schillstrom, Jesper Dangaard Brouer,
	Simon Horman
In-Reply-To: <1335939762-1912-1-git-send-email-horms@verge.net.au>

From: Julian Anastasov <ja@ssi.bg>

	As the goal is to mirror the inactconns/activeconns
counters in the backup server, make sure the cp->flags are
updated even if cp is still not bound to dest. If cp->flags
are not updated ip_vs_bind_dest will rely only on the initial
flags when updating the counters. To avoid mistakes and
complicated checks for protocol state rely only on the
IP_VS_CONN_F_INACTIVE bit when updating the counters.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Tested-by: Aleksey Chudov <aleksey.chudov@gmail.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/linux/ip_vs.h           |    5 +++
 net/netfilter/ipvs/ip_vs_sync.c |   65 ++++++++++++++-------------------------
 2 files changed, 28 insertions(+), 42 deletions(-)

diff --git a/include/linux/ip_vs.h b/include/linux/ip_vs.h
index be0ef3d..8a2d438 100644
--- a/include/linux/ip_vs.h
+++ b/include/linux/ip_vs.h
@@ -89,6 +89,7 @@
 #define IP_VS_CONN_F_TEMPLATE	0x1000		/* template, not connection */
 #define IP_VS_CONN_F_ONE_PACKET	0x2000		/* forward only one packet */
 
+/* Initial bits allowed in backup server */
 #define IP_VS_CONN_F_BACKUP_MASK (IP_VS_CONN_F_FWD_MASK | \
 				  IP_VS_CONN_F_NOOUTPUT | \
 				  IP_VS_CONN_F_INACTIVE | \
@@ -97,6 +98,10 @@
 				  IP_VS_CONN_F_TEMPLATE \
 				 )
 
+/* Bits allowed to update in backup server */
+#define IP_VS_CONN_F_BACKUP_UPD_MASK (IP_VS_CONN_F_INACTIVE | \
+				      IP_VS_CONN_F_SEQ_MASK)
+
 /* Flags that are not sent to backup server start from bit 16 */
 #define IP_VS_CONN_F_NFCT	(1 << 16)	/* use netfilter conntrack */
 
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index f4e0b6c..eeed767 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -731,9 +731,30 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
 	else
 		cp = ip_vs_ct_in_get(param);
 
-	if (cp && param->pe_data) 	/* Free pe_data */
+	if (cp) {
+		/* Free pe_data */
 		kfree(param->pe_data);
-	if (!cp) {
+
+		dest = cp->dest;
+		if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE &&
+		    !(flags & IP_VS_CONN_F_TEMPLATE) && dest) {
+			if (flags & IP_VS_CONN_F_INACTIVE) {
+				atomic_dec(&dest->activeconns);
+				atomic_inc(&dest->inactconns);
+			} else {
+				atomic_inc(&dest->activeconns);
+				atomic_dec(&dest->inactconns);
+			}
+		}
+		flags &= IP_VS_CONN_F_BACKUP_UPD_MASK;
+		flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK;
+		cp->flags = flags;
+		if (!dest) {
+			dest = ip_vs_try_bind_dest(cp);
+			if (dest)
+				atomic_dec(&dest->refcnt);
+		}
+	} else {
 		/*
 		 * Find the appropriate destination for the connection.
 		 * If it is not found the connection will remain unbound
@@ -742,18 +763,6 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
 		dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr,
 				       param->vport, protocol, fwmark, flags);
 
-		/*  Set the approprite ativity flag */
-		if (protocol == IPPROTO_TCP) {
-			if (state != IP_VS_TCP_S_ESTABLISHED)
-				flags |= IP_VS_CONN_F_INACTIVE;
-			else
-				flags &= ~IP_VS_CONN_F_INACTIVE;
-		} else if (protocol == IPPROTO_SCTP) {
-			if (state != IP_VS_SCTP_S_ESTABLISHED)
-				flags |= IP_VS_CONN_F_INACTIVE;
-			else
-				flags &= ~IP_VS_CONN_F_INACTIVE;
-		}
 		cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark);
 		if (dest)
 			atomic_dec(&dest->refcnt);
@@ -763,34 +772,6 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
 			IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
 			return;
 		}
-	} else if (!cp->dest) {
-		dest = ip_vs_try_bind_dest(cp);
-		if (dest)
-			atomic_dec(&dest->refcnt);
-	} else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
-		(cp->state != state)) {
-		/* update active/inactive flag for the connection */
-		dest = cp->dest;
-		if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
-			(state != IP_VS_TCP_S_ESTABLISHED)) {
-			atomic_dec(&dest->activeconns);
-			atomic_inc(&dest->inactconns);
-			cp->flags |= IP_VS_CONN_F_INACTIVE;
-		} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
-			(state == IP_VS_TCP_S_ESTABLISHED)) {
-			atomic_inc(&dest->activeconns);
-			atomic_dec(&dest->inactconns);
-			cp->flags &= ~IP_VS_CONN_F_INACTIVE;
-		}
-	} else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) &&
-		(cp->state != state)) {
-		dest = cp->dest;
-		if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
-		(state != IP_VS_SCTP_S_ESTABLISHED)) {
-			atomic_dec(&dest->activeconns);
-			atomic_inc(&dest->inactconns);
-			cp->flags &= ~IP_VS_CONN_F_INACTIVE;
-		}
 	}
 
 	if (opt)
-- 
1.7.10


^ permalink raw reply related

* [PATCH 09/18] ipvs: remove check for IP_VS_CONN_F_SYNC from ip_vs_bind_dest
From: Simon Horman @ 2012-05-02  6:22 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov, Hans Schillstrom, Jesper Dangaard Brouer,
	Simon Horman
In-Reply-To: <1335939762-1912-1-git-send-email-horms@verge.net.au>

From: Julian Anastasov <ja@ssi.bg>

	As the IP_VS_CONN_F_INACTIVE bit is properly set
in cp->flags for all kind of connections we do not need to
add special checks for synced connections when updating
the activeconns/inactconns counters for first time. Now
logic will look just like in ip_vs_unbind_dest.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_conn.c |    9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index f562e63..7647f3b 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -585,11 +585,10 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
 
 	/* Update the connection counters */
 	if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
-		/* It is a normal connection, so increase the inactive
-		   connection counter because it is in TCP SYNRECV
-		   state (inactive) or other protocol inacive state */
-		if ((cp->flags & IP_VS_CONN_F_SYNC) &&
-		    (!(cp->flags & IP_VS_CONN_F_INACTIVE)))
+		/* It is a normal connection, so modify the counters
+		 * according to the flags, later the protocol can
+		 * update them on state change */
+		if (!(cp->flags & IP_VS_CONN_F_INACTIVE))
 			atomic_inc(&dest->activeconns);
 		else
 			atomic_inc(&dest->inactconns);
-- 
1.7.10


^ permalink raw reply related

* [PATCH 06/18] ipvs: SH scheduler does not need GFP_ATOMIC allocation
From: Simon Horman @ 2012-05-02  6:22 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov, Hans Schillstrom, Jesper Dangaard Brouer,
	Hans Schillstrom, Simon Horman
In-Reply-To: <1335939762-1912-1-git-send-email-horms@verge.net.au>

From: Julian Anastasov <ja@ssi.bg>

        Schedulers are initialized and bound to services only
on commands.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Hans Schillstrom <hans@schillstrom.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_sh.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index 91e97ee..0512652 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -162,7 +162,7 @@ static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
 
 	/* allocate the SH table for this service */
 	tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
-		      GFP_ATOMIC);
+		      GFP_KERNEL);
 	if (tbl == NULL)
 		return -ENOMEM;
 
-- 
1.7.10


^ permalink raw reply related

* [PATCH 04/18] ipvs: WRR scheduler does not need GFP_ATOMIC allocation
From: Simon Horman @ 2012-05-02  6:22 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov, Hans Schillstrom, Jesper Dangaard Brouer,
	Hans Schillstrom, Simon Horman
In-Reply-To: <1335939762-1912-1-git-send-email-horms@verge.net.au>

From: Julian Anastasov <ja@ssi.bg>

	Schedulers are initialized and bound to services only
on commands.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Hans Schillstrom <hans@schillstrom.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_wrr.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c
index fd0d4e0..231be7d 100644
--- a/net/netfilter/ipvs/ip_vs_wrr.c
+++ b/net/netfilter/ipvs/ip_vs_wrr.c
@@ -84,7 +84,7 @@ static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
 	/*
 	 *    Allocate the mark variable for WRR scheduling
 	 */
-	mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
+	mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_KERNEL);
 	if (mark == NULL)
 		return -ENOMEM;
 
-- 
1.7.10


^ permalink raw reply related

* [GIT PULL net-next v2] IPVS
From: Simon Horman @ 2012-05-02  6:22 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov, Hans Schillstrom, Jesper Dangaard Brouer

Hi Pablo,

please consider the following 18 changes for 3.5.

This replaces a 17 patch pull request that I sent earlier today by adding a
patch from Hans patch to the head of the request.  You should be able to
just pull again if you have already pulled the original series and not
applied anything else on top.

Please note that there will be a conflict with the following when merged
with net-next due to the following change which is already present in
David's tree. It should be a trivial merge but please let me know if you
want to handle this another way.

	4a17fd5 sock: Introduce named constants for sk_reuse

The following changes since commit c3dc836d807a9b9855eefe535fdcbcf7cbb7a574:

  netfilter: bridge: optionally set indev to vlan (2012-04-24 01:22:44 +0200)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs-next.git master

for you to fetch changes up to 3de4b8ca4ffa3dafaeef985fabe152a1717a8ea0:

  export sysctl symbols needed by ip_vs_sync (2012-05-02 14:48:55 +0900)

----------------------------------------------------------------
H Hartley Sweeten (2):
      IPVS: ip_vs_ftp.c: local functions should not be exposed globally
      IPVS: ip_vs_proto.c: local functions should not be exposed globally

Hans Schillstrom (1):
      export sysctl symbols needed by ip_vs_sync

Julian Anastasov (14):
      ipvs: timeout tables do not need GFP_ATOMIC allocation
      ipvs: LBLC scheduler does not need GFP_ATOMIC allocation on init
      ipvs: DH scheduler does not need GFP_ATOMIC allocation
      ipvs: WRR scheduler does not need GFP_ATOMIC allocation
      ipvs: LBLCR scheduler does not need GFP_ATOMIC allocation on init
      ipvs: SH scheduler does not need GFP_ATOMIC allocation
      ipvs: ignore IP_VS_CONN_F_NOOUTPUT in backup server
      ipvs: remove check for IP_VS_CONN_F_SYNC from ip_vs_bind_dest
      ipvs: fix ip_vs_try_bind_dest to rebind app and transmitter
      ipvs: always update some of the flags bits in backup
      ipvs: wakeup master thread
      ipvs: reduce sync rate with time thresholds
      ipvs: add support for sync threads
      ipvs: optimize the use of flags in ip_vs_bind_dest

Sasha Levin (1):
      netfilter: ipvs: use GFP_KERNEL allocation where possible

 include/linux/ip_vs.h            |    5 +
 include/net/ip_vs.h              |   87 ++++-
 net/core/sock.c                  |    2 +
 net/netfilter/ipvs/ip_vs_conn.c  |   69 ++--
 net/netfilter/ipvs/ip_vs_core.c  |   30 +-
 net/netfilter/ipvs/ip_vs_ctl.c   |   70 +++-
 net/netfilter/ipvs/ip_vs_dh.c    |    2 +-
 net/netfilter/ipvs/ip_vs_ftp.c   |    2 +-
 net/netfilter/ipvs/ip_vs_lblc.c  |    2 +-
 net/netfilter/ipvs/ip_vs_lblcr.c |    2 +-
 net/netfilter/ipvs/ip_vs_proto.c |    6 +-
 net/netfilter/ipvs/ip_vs_sh.c    |    2 +-
 net/netfilter/ipvs/ip_vs_sync.c  |  662 ++++++++++++++++++++++++++------------
 net/netfilter/ipvs/ip_vs_wrr.c   |    2 +-
 14 files changed, 669 insertions(+), 274 deletions(-)

^ permalink raw reply

* Re: [GIT PULL next-net] IPVS
From: Simon Horman @ 2012-05-02  6:18 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov, Hans Schillstrom, Jesper Dangaard Brouer
In-Reply-To: <20120502055345.GA28625@verge.net.au>

On Wed, May 02, 2012 at 02:53:49PM +0900, Simon Horman wrote:
> On Wed, May 02, 2012 at 10:24:44AM +0900, Simon Horman wrote:
> > Hi Pablo,
> > 
> > please consider the following 5 changes for 3.4, they are all bug fixes.
> > I would also like these changes considered for stable.
> 
> Hi Pablo,
> 
> Hans has pointed out to me that the series should include one more patch.
> I will send an updated series ASAP.

Also, the pull request will be for 3.5, not 3.4.
Cut and paste silliness my me, sorry.

^ permalink raw reply

* [PATCH] netfilter: Fix error in ipq_enqueue_packet
From: Krishna Kumar @ 2012-05-02  6:10 UTC (permalink / raw)
  To: davem; +Cc: netdev, kaber, Krishna Kumar

ipq_enqueue_packet sets status=-EINVAL and calls
ipq_build_packet_message(entry, &status). This can
set status=0 while returning an skb. The next line:
	if (!peer_pid)
		goto err_out_free_nskb;
which wrongly returns success.

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
---
 net/ipv4/netfilter/ip_queue.c  |    6 ++++--
 net/ipv6/netfilter/ip6_queue.c |    6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff -ruNp org/net/ipv4/netfilter/ip_queue.c new/net/ipv4/netfilter/ip_queue.c
--- org/net/ipv4/netfilter/ip_queue.c	2012-04-23 08:28:23.000000000 +0530
+++ new/net/ipv4/netfilter/ip_queue.c	2012-05-02 11:28:33.899790397 +0530
@@ -227,7 +227,7 @@ nlmsg_failure:
 static int
 ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
 {
-	int status = -EINVAL;
+	int status;
 	struct sk_buff *nskb;
 
 	if (copy_mode == IPQ_COPY_NONE)
@@ -239,8 +239,10 @@ ipq_enqueue_packet(struct nf_queue_entry
 
 	spin_lock_bh(&queue_lock);
 
-	if (!peer_pid)
+	if (!peer_pid) {
+		status = -EINVAL;
 		goto err_out_free_nskb;
+	}
 
 	if (queue_total >= queue_maxlen) {
 		queue_dropped++;
diff -ruNp org/net/ipv6/netfilter/ip6_queue.c new/net/ipv6/netfilter/ip6_queue.c
--- org/net/ipv6/netfilter/ip6_queue.c	2012-04-23 08:28:23.000000000 +0530
+++ new/net/ipv6/netfilter/ip6_queue.c	2012-05-02 11:30:21.199578311 +0530
@@ -227,7 +227,7 @@ nlmsg_failure:
 static int
 ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
 {
-	int status = -EINVAL;
+	int status;
 	struct sk_buff *nskb;
 
 	if (copy_mode == IPQ_COPY_NONE)
@@ -239,8 +239,10 @@ ipq_enqueue_packet(struct nf_queue_entry
 
 	spin_lock_bh(&queue_lock);
 
-	if (!peer_pid)
+	if (!peer_pid) {
+		status = -EINVAL;
 		goto err_out_free_nskb;
+	}
 
 	if (queue_total >= queue_maxlen) {
 		queue_dropped++;

^ permalink raw reply

* [RFC] [PATCH] ipv6: Fix potential hang in mif6_add
From: Krishna Kumar @ 2012-05-02  6:09 UTC (permalink / raw)
  To: davem; +Cc: netdev, Krishna Kumar

Fix a potential "waiting for %s to become free" hang.
mif6_add() called ip6mr_reg_vif to allocate/register a
netdevice, which also took an extra reference on the
device. On error, mif6_add should drop this reference
before unregistering the device.

(untested, noticed from code walkthrough)

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
---
 net/ipv6/ip6mr.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff -ruNp org/net/ipv6/ip6mr.c new/net/ipv6/ip6mr.c
--- org/net/ipv6/ip6mr.c	2012-04-19 09:03:34.000000000 +0530
+++ new/net/ipv6/ip6mr.c	2012-05-02 10:51:01.953110994 +0530
@@ -937,8 +937,8 @@ static int mif6_add(struct net *net, str
 			return -ENOBUFS;
 		err = dev_set_allmulti(dev, 1);
 		if (err) {
-			unregister_netdevice(dev);
 			dev_put(dev);
+			unregister_netdevice(dev);
 			return err;
 		}
 		break;

^ permalink raw reply

* Re: [GIT PULL next-net] IPVS
From: Simon Horman @ 2012-05-02  5:53 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov, Hans Schillstrom, Jesper Dangaard Brouer
In-Reply-To: <1335921901-21284-1-git-send-email-horms@verge.net.au>

On Wed, May 02, 2012 at 10:24:44AM +0900, Simon Horman wrote:
> Hi Pablo,
> 
> please consider the following 5 changes for 3.4, they are all bug fixes.
> I would also like these changes considered for stable.

Hi Pablo,

Hans has pointed out to me that the series should include one more patch.
I will send an updated series ASAP.

^ permalink raw reply

* Re: [PATCH] net: skb_set_dev do not unconditionally drop ref to dst
From: Frank Blaschka @ 2012-05-02  5:50 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: davem, netdev, arnd, linux-s390
In-Reply-To: <1335765093.2296.4.camel@edumazet-glaptop>

On Mon, Apr 30, 2012 at 07:51:33AM +0200, Eric Dumazet wrote:
> On Mon, 2012-04-30 at 07:38 +0200, Frank Blaschka wrote:
> > From: Frank Blaschka <frank.blaschka@de.ibm.com>
> > 
> > commit 8a83a00b0735190384a348156837918271034144 unconditionally
> > drops dst reference when skb->dev is set. This causes a regression
> > with VLAN and the qeth_l3 network driver. qeth_l3 can not get gw
> > information from the skb coming from the vlan driver. It is only
> > valid to drop the dst in case of different name spaces.
> > 
> > Signed-off-by: Frank Blaschka <frank.blaschka@de.ibm.com>
> > ---
> >  net/core/dev.c |    2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -1881,8 +1881,8 @@ EXPORT_SYMBOL(netif_device_attach);
> >  #ifdef CONFIG_NET_NS
> >  void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
> >  {
> > -	skb_dst_drop(skb);
> >  	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
> > +		skb_dst_drop(skb);
> >  		secpath_reset(skb);
> >  		nf_reset(skb);
> >  		skb_init_secmark(skb);
> > 
> 
> You forgot CC Arnd Bergmann <arnd@arndb.de> ?
> 
> But we do want to do the skb_dst_drop() in dev_forward_skb()
> 
> Your patch breaks dev_forward_skb() then.
If NET_NS is not defined is this broken too?
> 
> But apparently this forward path was alredy broken in Arnd patch...
>
Ok, until nobody comes up with an other idea I will post a patch to
change back the vlan driver (use skb->dev = dev instead of skb_set_dev)
next week. 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply

* Re: [V2 PATCH 0/9] vhost/macvtap zeropcopy fixes
From: Michael S. Tsirkin @ 2012-05-02  5:50 UTC (permalink / raw)
  To: Jason Wang; +Cc: eric.dumazet, netdev, linux-kernel, ebiederm, davem
In-Reply-To: <20120502033901.11782.13157.stgit@amd-6168-8-1.englab.nay.redhat.com>

On Wed, May 02, 2012 at 11:41:21AM +0800, Jason Wang wrote:
> This is an updated since the last series of vhost/macvtap zerocopy fixes which
> fixes the the possible transmission stall, host kernel stack overflow and other
> misc fixes.
> 
> Changes from V1:
> - Addressing comments from Eric and Michael.
> - Adding more fixes into the seires.

Thanks for fixing this.
Acked-by: Michael S. Tsirkin <mst@redhat.com>

Dave, can you merge this for 3.4 please?
Thanks!

> ---
> 
> Jason Wang (9):
>       macvtap: zerocopy: fix offset calculation when building skb
>       macvtap: zerocopy: fix truesize underestimation
>       macvtap: zerocopy: put page when fail to get all requested user pages
>       macvtap: zerocopy: set SKBTX_DEV_ZEROCOPY only when skb is built successfully
>       macvtap: zerocopy: validate vectors before building skb
>       vhost_net: zerocopy: fix possible NULL pointer dereference of vq->bufs
>       vhost_net: re-poll only on EAGAIN or ENOBUFS
>       vhost_net: zerocopy: adding and signalling immediately when fully copied
>       vhost: zerocopy: poll vq in zerocopy callback
> 
> 
>  drivers/net/macvtap.c |   57 ++++++++++++++++++++++++++++++++++---------------
>  drivers/vhost/net.c   |    7 ++++--
>  drivers/vhost/vhost.c |    1 +
>  3 files changed, 46 insertions(+), 19 deletions(-)
> 
> -- 
> Jason Wang

^ permalink raw reply

* Re: [PATCH] RPS: Sparse connection optimizations
From: Deng-Cheng Zhu @ 2012-05-02  4:16 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: davem, therbert, netdev
In-Reply-To: <1335931968.22133.47.camel@edumazet-glaptop>

On 05/02/2012 12:12 PM, Eric Dumazet wrote:
> On Wed, 2012-05-02 at 12:08 +0800, Deng-Cheng Zhu wrote:
>
>> Unfortunately, on the Malta platform, NIC irqs are not suitable for SMP
>> IRQ affinity -- they are based on XT-PIC. However, I did do some RPS
>> tests where different CPU masks were assigned to the 2 NICs. And the
>> throughput *WAS* better than that of assigning the same mask to NICs.
>> But the problem addressed in this patch *STILL* exists -- hash indexing
>> causes imbalance across CPUs in the case of sparse connections.
>
> You mean your two NIC irqs are handled by CPU0 and this cant be
> changed ?
>
> That really is bad.
>
>
>

Yes, that's right. It's not IO-APIC enabled. When trying to write values
to /proc/irq/$irqnum/smp_affinity:

echo: write error: Input/output error


Deng-Cheng

^ permalink raw reply

* Re: [PATCH] RPS: Sparse connection optimizations
From: Eric Dumazet @ 2012-05-02  4:12 UTC (permalink / raw)
  To: Deng-Cheng Zhu; +Cc: davem, therbert, netdev
In-Reply-To: <4FA0B337.1070401@mips.com>

On Wed, 2012-05-02 at 12:08 +0800, Deng-Cheng Zhu wrote:

> Unfortunately, on the Malta platform, NIC irqs are not suitable for SMP
> IRQ affinity -- they are based on XT-PIC. However, I did do some RPS
> tests where different CPU masks were assigned to the 2 NICs. And the
> throughput *WAS* better than that of assigning the same mask to NICs.
> But the problem addressed in this patch *STILL* exists -- hash indexing
> causes imbalance across CPUs in the case of sparse connections.

You mean your two NIC irqs are handled by CPU0 and this cant be
changed ?

That really is bad.

^ permalink raw reply

* Re: [PATCH] RPS: Sparse connection optimizations
From: Deng-Cheng Zhu @ 2012-05-02  4:08 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: davem, therbert, netdev
In-Reply-To: <1335614156.2900.66.camel@edumazet-glaptop>

On 04/28/2012 07:55 PM, Eric Dumazet wrote:
> On Sat, 2012-04-28 at 18:10 +0800, Deng-Cheng Zhu wrote:
>> From: Deng-Cheng Zhu<dczhu@mips.com>
>>
>> Currently, choosing target CPU to process the incoming packet is based on
>> skb->rxhash. In the case of sparse connections, this could lead to
>> relatively low and inconsistent bandwidth while doing network throughput
>> tests -- CPU selection in the RPS map is imbalanced. Even with the same
>> hash value, 2 packets could come from different devices. Besides, on
>> architectures like MIPS with multi-threaded cores, siblings of CPU0 should
>> not be selected when others are not saturated.
>
> What CPU0 is doing so special you have to mention it in this changelog ?

Serve the NIC hardware irq and do the 1st part of softirq in RPS.
Changes relating to this will be made in v2. Thanks.

>>
>> This patch introduces a feature that allows some flows to select their CPUs
>> by looping the RPS CPU maps. Some tests were performed on the MIPS Malta
>> 1004K platform (2 cores, each with 2 VPEs) at 25Mhz with 2 Intel Pro/1000
>> NICs. The Malta board works as a router between 2 PCs. Using iperf, here
>> are results:
>
>
> RPS on a router ? Thats not very good, unless you perform a crazy amount
> of work in iptables rules maybe ?
>
> One packet comes, its better to handle it right now and send it right
> now on the same cpu. No IPI cost, no cache line misses...

Theoretically, you are right. But RPS works early -- when the hot CPU
gets really hot, RPS takes effect even with forwarding workload. Earlier
experiments on the mentioned Malta platform proved it as well
(typically 45% ~ more than doubled, amazingly).

>
> RPS is something more suitable to TCP handling in local host because
> stack has big memory footprint and latencies, not for forwarding
> workload.

See above.

> I suspect you can reach more throughput using appropriate tunings
> (correct interrupt affinities). This sounds like a bad config from the
> very beginning.

Unfortunately, on the Malta platform, NIC irqs are not suitable for SMP
IRQ affinity -- they are based on XT-PIC. However, I did do some RPS
tests where different CPU masks were assigned to the 2 NICs. And the
throughput *WAS* better than that of assigning the same mask to NICs.
But the problem addressed in this patch *STILL* exists -- hash indexing
causes imbalance across CPUs in the case of sparse connections.

>
>> +};
>> +
>> +static struct cpu_flow flow[CONFIG_NR_RPS_MAP_LOOPS][NR_CPUS];
>
> Thats absolutely not allowed to add a [NR_CPUS] array anywhere in linux
> kernel in 2012.

Good advice. Will be different in V2.

>
>> +/*
>> + * We've got CONFIG_SMP to do RPS, so only arch define is needed here to access
>> + * sibling specific information.
>> + */
>> +#if defined(CONFIG_MIPS)
>
> Thats not allowed to add a CONFIG_somearch in net/core/dev.c

Good advice too.


Thanks!

Deng-Cheng

^ permalink raw reply

* Re: pull request: wireless 2012-05-01
From: David Miller @ 2012-05-02  4:05 UTC (permalink / raw)
  To: linville-2XuSBdqkA4R54TAoqtyWWQ
  Cc: linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20120501183157.GA2987-2XuSBdqkA4R54TAoqtyWWQ@public.gmane.org>

From: "John W. Linville" <linville-2XuSBdqkA4R54TAoqtyWWQ@public.gmane.org>
Date: Tue, 1 May 2012 14:31:58 -0400

> Part of these are Bluetooth -- Gustavo says:
> 
> "A few more fixes to 3.4, there is three new device id supported, a missing
> break and a fix to retain key during the connection."
> 
> Along with those, we have some 802.11 fixes.  Felix gives us a fix
> for sending EAP frames for AP VLAN interfaces.  Franky gives us
> two brcmfmac fixes, one for missed event completions and another
> for an improperly initialized function pointer.  Grazvydas Ignotas
> gives us a couple of wl1251 fixes for some improperly freed memory.
> Jonathan Bither gives us an ath5k fix for an AHB resource leak.
> Seth Forshee gives us a b43 fix to avoid a crash when b43 fails to
> properly initialize.  Finally, Wey-yi gives us a fixup for a previously
> merged fix to use the correct firmware for certain iwlwifi devices.
> 
> The brcmfmac fixes are bigger than I would like to see.  But since
> they are isolated to that driver and fix real problems, I felt they
> were still acceptable.
> 
> Please let me know if there are problems!

Pulled, thanks John.
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 3/4 v2 net-next] net: make GRO aware of skb->head_frag
From: Eric Dumazet @ 2012-05-02  3:54 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Alexander Duyck, David Miller, netdev, Neal Cardwell, Tom Herbert,
	Jeff Kirsher, Michael Chan, Matt Carlson, Herbert Xu,
	Ben Hutchings, Ilpo Järvinen, Maciej Żenczykowski
In-Reply-To: <1335926862.22133.42.camel@edumazet-glaptop>

On Wed, 2012-05-02 at 04:47 +0200, Eric Dumazet wrote:

> Thanks Alex, I'll take a look.
> 
> It seems my tcpdump is different than yours.
> 
> 

I'll test following patch in a couple of hours :

Thanks !

 net/ipv4/tcp_input.c |   16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 96a631d..910a794 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4467,7 +4467,8 @@ static bool tcp_try_coalesce(struct sock *sk,
 			     struct sk_buff *from,
 			     bool *fragstolen)
 {
-	int delta, len = from->len;
+	int i, delta, len = from->len;
+	bool fastpath;
 
 	*fragstolen = false;
 	if (tcp_hdr(from)->fin)
@@ -4484,6 +4485,7 @@ merge:
 	if (skb_has_frag_list(to) || skb_has_frag_list(from))
 		return false;
 
+	fastpath = !from->cloned || atomic_read(&skb_shinfo(from)->dataref) == 1;
 	if (skb_headlen(from) == 0 &&
 	    (skb_shinfo(to)->nr_frags +
 	     skb_shinfo(from)->nr_frags <= MAX_SKB_FRAGS)) {
@@ -4497,7 +4499,13 @@ copyfrags:
 		       skb_shinfo(from)->frags,
 		       skb_shinfo(from)->nr_frags * sizeof(skb_frag_t));
 		skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags;
-		skb_shinfo(from)->nr_frags = 0;
+
+		if (fastpath)
+			skb_shinfo(from)->nr_frags = 0;
+		else
+			for (i = 0; i < skb_shinfo(from)->nr_frags; i++)
+				skb_frag_ref(from, i);
+
 		to->truesize += delta;
 		atomic_add(delta, &sk->sk_rmem_alloc);
 		sk_mem_charge(sk, delta);
@@ -4515,7 +4523,9 @@ copyfrags:
 		offset = from->data - (unsigned char *)page_address(page);
 		skb_fill_page_desc(to, skb_shinfo(to)->nr_frags,
 				   page, offset, skb_headlen(from));
-		*fragstolen = true;
+		*fragstolen = fastpath;
+		if (!fastpath)
+			get_page(page);
 		delta = len; /* we dont know real truesize... */
 		goto copyfrags;
 	}

^ permalink raw reply related

* [V2 PATCH 7/9] vhost_net: re-poll only on EAGAIN or ENOBUFS
From: Jason Wang @ 2012-05-02  3:42 UTC (permalink / raw)
  To: eric.dumazet, mst, netdev, linux-kernel, ebiederm, davem
In-Reply-To: <20120502033901.11782.13157.stgit@amd-6168-8-1.englab.nay.redhat.com>

Currently, we restart tx polling unconditionally when sendmsg()
fails. This would cause unnecessary wakeups of vhost wokers and waste
cpu utlization when evil userspace(guest driver) is able to hit EFAULT or
EINVAL.

The polling is only needed when the socket send buffer were exceeded or not
enough memory. So fix this by restarting polling only when sendmsg() returns
EAGAIN/ENOBUFS.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index ffdc0d8..62828aa 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -257,7 +257,8 @@ static void handle_tx(struct vhost_net *net)
 					UIO_MAXIOV;
 			}
 			vhost_discard_vq_desc(vq, 1);
-			tx_poll_start(net, sock);
+			if (err == -EAGAIN || err == -ENOBUFS)
+				tx_poll_start(net, sock);
 			break;
 		}
 		if (err != len)

^ permalink raw reply related

* [V2 PATCH 5/9] macvtap: zerocopy: validate vectors before building skb
From: Jason Wang @ 2012-05-02  3:42 UTC (permalink / raw)
  To: eric.dumazet, mst, netdev, linux-kernel, ebiederm, davem
In-Reply-To: <20120502033901.11782.13157.stgit@amd-6168-8-1.englab.nay.redhat.com>

There're several reasons that the vectors need to be validated:

- Return error when caller provides vectors whose num is greater than UIO_MAXIOV.
- Linearize part of skb when userspace provides vectors grater than MAX_SKB_FRAGS.
- Return error when userspace provides vectors whose total length may exceed
- MAX_SKB_FRAGS * PAGE_SIZE.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/macvtap.c |   25 +++++++++++++++++++++----
 1 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index a4ff694..163559c 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -529,9 +529,10 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
 		}
 		base = (unsigned long)from->iov_base + offset;
 		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+		if (i + size > MAX_SKB_FRAGS)
+			return -EMSGSIZE;
 		num_pages = get_user_pages_fast(base, size, 0, &page[i]);
-		if ((num_pages != size) ||
-		    (num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags)) {
+		if (num_pages != size) {
 			for (i = 0; i < num_pages; i++)
 				put_page(page[i]);
 			return -EFAULT;
@@ -651,7 +652,7 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
 	int err;
 	struct virtio_net_hdr vnet_hdr = { 0 };
 	int vnet_hdr_len = 0;
-	int copylen;
+	int copylen = 0;
 	bool zerocopy = false;
 
 	if (q->flags & IFF_VNET_HDR) {
@@ -680,15 +681,31 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
 	if (unlikely(len < ETH_HLEN))
 		goto err;
 
+	err = -EMSGSIZE;
+	if (unlikely(count > UIO_MAXIOV))
+		goto err;
+
 	if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY))
 		zerocopy = true;
 
 	if (zerocopy) {
+		/* Userspace may produce vectors with count greater than
+		 * MAX_SKB_FRAGS, so we need to linearize parts of the skb
+		 * to let the rest of data to be fit in the frags.
+		 */
+		if (count > MAX_SKB_FRAGS) {
+			copylen = iov_length(iv, count - MAX_SKB_FRAGS);
+			if (copylen < vnet_hdr_len)
+				copylen = 0;
+			else
+				copylen -= vnet_hdr_len;
+		}
 		/* There are 256 bytes to be copied in skb, so there is enough
 		 * room for skb expand head in case it is used.
 		 * The rest buffer is mapped from userspace.
 		 */
-		copylen = vnet_hdr.hdr_len;
+		if (copylen < vnet_hdr.hdr_len)
+			copylen = vnet_hdr.hdr_len;
 		if (!copylen)
 			copylen = GOODCOPY_LEN;
 	} else

^ permalink raw reply related

* [V2 PATCH 0/9] vhost/macvtap zeropcopy fixes
From: Jason Wang @ 2012-05-02  3:41 UTC (permalink / raw)
  To: eric.dumazet, mst, netdev, linux-kernel, ebiederm, davem

This is an updated since the last series of vhost/macvtap zerocopy fixes which
fixes the the possible transmission stall, host kernel stack overflow and other
misc fixes.

Changes from V1:
- Addressing comments from Eric and Michael.
- Adding more fixes into the seires.

---

Jason Wang (9):
      macvtap: zerocopy: fix offset calculation when building skb
      macvtap: zerocopy: fix truesize underestimation
      macvtap: zerocopy: put page when fail to get all requested user pages
      macvtap: zerocopy: set SKBTX_DEV_ZEROCOPY only when skb is built successfully
      macvtap: zerocopy: validate vectors before building skb
      vhost_net: zerocopy: fix possible NULL pointer dereference of vq->bufs
      vhost_net: re-poll only on EAGAIN or ENOBUFS
      vhost_net: zerocopy: adding and signalling immediately when fully copied
      vhost: zerocopy: poll vq in zerocopy callback


 drivers/net/macvtap.c |   57 ++++++++++++++++++++++++++++++++++---------------
 drivers/vhost/net.c   |    7 ++++--
 drivers/vhost/vhost.c |    1 +
 3 files changed, 46 insertions(+), 19 deletions(-)

-- 
Jason Wang

^ permalink raw reply

* [V2 PATCH 9/9] vhost: zerocopy: poll vq in zerocopy callback
From: Jason Wang @ 2012-05-02  3:42 UTC (permalink / raw)
  To: eric.dumazet, mst, netdev, linux-kernel, ebiederm, davem
In-Reply-To: <20120502033901.11782.13157.stgit@amd-6168-8-1.englab.nay.redhat.com>

We add used and signal guest in worker thread but did not poll the virtqueue
during the zero copy callback. This may lead the missing of adding and
signalling during zerocopy. Solve this by polling the virtqueue and let it
wakeup the worker during callback.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 947f00d..7b75fdf 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1604,6 +1604,7 @@ void vhost_zerocopy_callback(void *arg)
 	struct vhost_ubuf_ref *ubufs = ubuf->arg;
 	struct vhost_virtqueue *vq = ubufs->vq;
 
+	vhost_poll_queue(&vq->poll);
 	/* set len = 1 to mark this desc buffers done DMA */
 	vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
 	kref_put(&ubufs->kref, vhost_zerocopy_done_signal);

^ permalink raw reply related

* [V2 PATCH 8/9] vhost_net: zerocopy: adding and signalling immediately when fully copied
From: Jason Wang @ 2012-05-02  3:42 UTC (permalink / raw)
  To: eric.dumazet, mst, netdev, linux-kernel, ebiederm, davem
In-Reply-To: <20120502033901.11782.13157.stgit@amd-6168-8-1.englab.nay.redhat.com>

When a packet were fully copied in zerocopy, we don't wait for the DMA done to
mark the done flag, so after the packet were passed to lower device, we need to
add used and signal guest immediately.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 62828aa..1dc2aeb 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -266,6 +266,8 @@ static void handle_tx(struct vhost_net *net)
 				 " len %d != %zd\n", err, len);
 		if (!zcopy)
 			vhost_add_used_and_signal(&net->dev, vq, head, 0);
+		else
+			vhost_zerocopy_signal_used(vq);
 		total_len += len;
 		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
 			vhost_poll_queue(&vq->poll);

^ permalink raw reply related

* [V2 PATCH 6/9] vhost_net: zerocopy: fix possible NULL pointer dereference of vq->bufs
From: Jason Wang @ 2012-05-02  3:42 UTC (permalink / raw)
  To: eric.dumazet, mst, netdev, linux-kernel, ebiederm, davem
In-Reply-To: <20120502033901.11782.13157.stgit@amd-6168-8-1.englab.nay.redhat.com>

When we want to disable vhost_net backend while there's a tx work, a possible
NULL pointer defernece may happen we we try to deference the vq->bufs after
vhost_net_set_backend() assign a NULL to it.

As suggested by Michael, fix this by checking the vq->bufs instead of
vhost_sock_zcopy().
---
 drivers/vhost/net.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f0da2c3..ffdc0d8 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -166,7 +166,7 @@ static void handle_tx(struct vhost_net *net)
 	if (wmem < sock->sk->sk_sndbuf / 2)
 		tx_poll_stop(net);
 	hdr_size = vq->vhost_hlen;
-	zcopy = vhost_sock_zcopy(sock);
+	zcopy = vq->ubufs;
 
 	for (;;) {
 		/* Release DMAs done buffers first */

^ permalink raw reply related

* [V2 PATCH 4/9] macvtap: zerocopy: set SKBTX_DEV_ZEROCOPY only when skb is built successfully
From: Jason Wang @ 2012-05-02  3:42 UTC (permalink / raw)
  To: eric.dumazet, mst, netdev, linux-kernel, ebiederm, davem
In-Reply-To: <20120502033901.11782.13157.stgit@amd-6168-8-1.englab.nay.redhat.com>

Current the SKBTX_DEV_ZEROCOPY is set unconditionally after
zerocopy_sg_from_iovec(), this would lead NULL pointer when macvtap
fails to build zerocopy skb because destructor_arg was not
initialized. Solve this by set this flag after the skb were built
successfully.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/macvtap.c |    9 +++++----
 1 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 9ab182a..a4ff694 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -699,10 +699,9 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
 	if (!skb)
 		goto err;
 
-	if (zerocopy) {
+	if (zerocopy)
 		err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count);
-		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
-	} else
+	else
 		err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len,
 						   len);
 	if (err)
@@ -721,8 +720,10 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
 	rcu_read_lock_bh();
 	vlan = rcu_dereference_bh(q->vlan);
 	/* copy skb_ubuf_info for callback when skb has no error */
-	if (zerocopy)
+	if (zerocopy) {
 		skb_shinfo(skb)->destructor_arg = m->msg_control;
+		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+	}
 	if (vlan)
 		macvlan_start_xmit(skb, vlan->dev);
 	else

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox