Netdev List
 help / color / mirror / Atom feed
* [patch v2 03/12] [PATCH 03/12] IPVS: compact ip_vs_sched_persist()
From: Simon Horman @ 2010-10-01 14:35 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter, netfilter-devel
  Cc: Jan Engelhardt, Stephen Hemminger, Wensong Zhang,
	Julian Anastasov, Patrick McHardy
In-Reply-To: <20101001143517.645421976@akiko.akashicho.tokyo.vergenet.net>

[-- Attachment #1: 0003-IPVS-compact-ip_vs_sched_persist.patch --]
[-- Type: text/plain, Size: 5731 bytes --]

Compact ip_vs_sched_persist() by setting up parameters
and calling functions once.

Signed-off-by: Simon Horman <horms@verge.net.au>
---

v2
* Make "union nf_inet_addr fwmark" const
* Don't remove the comment next to the declaration of dport
* Add a comment to the declaration of vport

Index: lvs-test-2.6/net/netfilter/ipvs/ip_vs_core.c
===================================================================
--- lvs-test-2.6.orig/net/netfilter/ipvs/ip_vs_core.c	2010-10-01 21:56:39.000000000 +0900
+++ lvs-test-2.6/net/netfilter/ipvs/ip_vs_core.c	2010-10-01 22:02:41.000000000 +0900
@@ -193,10 +193,14 @@ ip_vs_sched_persist(struct ip_vs_service
 	struct ip_vs_iphdr iph;
 	struct ip_vs_dest *dest;
 	struct ip_vs_conn *ct;
-	__be16  dport;			/* destination port to forward */
+	int protocol = iph.protocol;
+	__be16 dport = 0;		/* destination port to forward */
+	__be16 vport = 0;		/* virtual service port */
 	unsigned int flags;
 	union nf_inet_addr snet;	/* source network of the client,
 					   after masking */
+	const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
+	const union nf_inet_addr *vaddr = &iph.daddr;
 
 	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
 
@@ -227,119 +231,58 @@ ip_vs_sched_persist(struct ip_vs_service
 	 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
 	 * is created for other persistent services.
 	 */
-	if (ports[1] == svc->port) {
-		/* Check if a template already exists */
-		if (svc->port != FTPPORT)
-			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
-					     &iph.daddr, ports[1]);
-		else
-			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
-					     &iph.daddr, 0);
-
-		if (!ct || !ip_vs_check_template(ct)) {
-			/*
-			 * No template found or the dest of the connection
-			 * template is not available.
-			 */
-			dest = svc->scheduler->schedule(svc, skb);
-			if (dest == NULL) {
-				IP_VS_DBG(1, "p-schedule: no dest found.\n");
-				return NULL;
-			}
-
-			/*
-			 * Create a template like <protocol,caddr,0,
-			 * vaddr,vport,daddr,dport> for non-ftp service,
-			 * and <protocol,caddr,0,vaddr,0,daddr,0>
-			 * for ftp service.
+	{
+		if (ports[1] == svc->port) {
+			/* non-FTP template:
+			 * <protocol, caddr, 0, vaddr, vport, daddr, dport>
+			 * FTP template:
+			 * <protocol, caddr, 0, vaddr, 0, daddr, 0>
 			 */
 			if (svc->port != FTPPORT)
-				ct = ip_vs_conn_new(svc->af, iph.protocol,
-						    &snet, 0,
-						    &iph.daddr,
-						    ports[1],
-						    &dest->addr, dest->port,
-						    IP_VS_CONN_F_TEMPLATE,
-						    dest);
-			else
-				ct = ip_vs_conn_new(svc->af, iph.protocol,
-						    &snet, 0,
-						    &iph.daddr, 0,
-						    &dest->addr, 0,
-						    IP_VS_CONN_F_TEMPLATE,
-						    dest);
-			if (ct == NULL)
-				return NULL;
-
-			ct->timeout = svc->timeout;
+				vport = ports[1];
 		} else {
-			/* set destination with the found template */
-			dest = ct->dest;
-		}
-		dport = dest->port;
-	} else {
-		/*
-		 * Note: persistent fwmark-based services and persistent
-		 * port zero service are handled here.
-		 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
-		 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
-		 */
-		if (svc->fwmark) {
-			union nf_inet_addr fwmark = {
-				.ip = htonl(svc->fwmark)
-			};
-
-			ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
-					     &fwmark, 0);
-		} else
-			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
-					     &iph.daddr, 0);
-
-		if (!ct || !ip_vs_check_template(ct)) {
-			/*
-			 * If it is not persistent port zero, return NULL,
-			 * otherwise create a connection template.
+			/* Note: persistent fwmark-based services and
+			 * persistent port zero service are handled here.
+			 * fwmark template:
+			 * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
+			 * port zero template:
+			 * <protocol,caddr,0,vaddr,0,daddr,0>
 			 */
-			if (svc->port)
-				return NULL;
-
-			dest = svc->scheduler->schedule(svc, skb);
-			if (dest == NULL) {
-				IP_VS_DBG(1, "p-schedule: no dest found.\n");
-				return NULL;
+			if (svc->fwmark) {
+				protocol = IPPROTO_IP;
+				vaddr = &fwmark;
 			}
+		}
+	}
 
-			/*
-			 * Create a template according to the service
-			 */
-			if (svc->fwmark) {
-				union nf_inet_addr fwmark = {
-					.ip = htonl(svc->fwmark)
-				};
-
-				ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
-						    &snet, 0,
-						    &fwmark, 0,
-						    &dest->addr, 0,
-						    IP_VS_CONN_F_TEMPLATE,
-						    dest);
-			} else
-				ct = ip_vs_conn_new(svc->af, iph.protocol,
-						    &snet, 0,
-						    &iph.daddr, 0,
-						    &dest->addr, 0,
-						    IP_VS_CONN_F_TEMPLATE,
-						    dest);
-			if (ct == NULL)
-				return NULL;
+	/* Check if a template already exists */
+	ct = ip_vs_ct_in_get(svc->af, protocol, &snet, 0, vaddr, vport);
 
-			ct->timeout = svc->timeout;
-		} else {
-			/* set destination with the found template */
-			dest = ct->dest;
+	if (!ct || !ip_vs_check_template(ct)) {
+		/* No template found or the dest of the connection
+		 * template is not available.
+		 */
+		dest = svc->scheduler->schedule(svc, skb);
+		if (!dest) {
+			IP_VS_DBG(1, "p-schedule: no dest found.\n");
+			return NULL;
 		}
-		dport = ports[1];
-	}
+
+		if (ports[1] == svc->port && svc->port != FTPPORT)
+			dport = dest->port;
+
+		/* Create a template */
+		ct = ip_vs_conn_new(svc->af, protocol, &snet, 0,vaddr, vport,
+				    &dest->addr, dport,
+				    IP_VS_CONN_F_TEMPLATE, dest);
+		if (ct == NULL)
+			return NULL;
+
+		ct->timeout = svc->timeout;
+	} else
+		/* set destination with the found template */
+		dest = ct->dest;
+	dport = dest->port;
 
 	flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
 		 && iph.protocol == IPPROTO_UDP)?


^ permalink raw reply

* [patch v2 04/12] [PATCH 04/12] IPVS: Add struct ip_vs_conn_param
From: Simon Horman @ 2010-10-01 14:35 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter, netfilter-devel
  Cc: Jan Engelhardt, Stephen Hemminger, Wensong Zhang,
	Julian Anastasov, Patrick McHardy
In-Reply-To: <20101001143517.645421976@akiko.akashicho.tokyo.vergenet.net>

[-- Attachment #1: 0004-IPVS-Add-struct-ip_vs_conn_param.patch --]
[-- Type: text/plain, Size: 25185 bytes --]

Signed-off-by: Simon Horman <horms@verge.net.au>
---

The motivation for this is to allow persistence engine modules to
fill in the parameters.

v0.3
* Add missing changes to ip_vs_ftp.c

v2
* make "union nf_inet_addr fwmark" const
* Update for the recent addition of ip_vs_nfct.c

Index: lvs-test-2.6/include/net/ip_vs.h
===================================================================
--- lvs-test-2.6.orig/include/net/ip_vs.h	2010-10-01 21:56:39.000000000 +0900
+++ lvs-test-2.6/include/net/ip_vs.h	2010-10-01 22:07:22.000000000 +0900
@@ -357,6 +357,15 @@ struct ip_vs_protocol {
 
 extern struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto);
 
+struct ip_vs_conn_param {
+	const union nf_inet_addr	*caddr;
+	const union nf_inet_addr	*vaddr;
+	__be16				cport;
+	__be16				vport;
+	__u16				protocol;
+	u16				af;
+};
+
 /*
  *	IP_VS structure allocated for each dynamically scheduled connection
  */
@@ -626,13 +635,23 @@ enum {
 	IP_VS_DIR_LAST,
 };
 
-extern struct ip_vs_conn *ip_vs_conn_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port);
-
-extern struct ip_vs_conn *ip_vs_ct_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port);
+static inline void ip_vs_conn_fill_param(int af, int protocol,
+					 const union nf_inet_addr *caddr,
+					 __be16 cport,
+					 const union nf_inet_addr *vaddr,
+					 __be16 vport,
+					 struct ip_vs_conn_param *p)
+{
+	p->af = af;
+	p->protocol = protocol;
+	p->caddr = caddr;
+	p->cport = cport;
+	p->vaddr = vaddr;
+	p->vport = vport;
+}
+
+struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p);
+struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p);
 
 struct ip_vs_conn * ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
 					    struct ip_vs_protocol *pp,
@@ -640,9 +659,7 @@ struct ip_vs_conn * ip_vs_conn_in_get_pr
 					    unsigned int proto_off,
 					    int inverse);
 
-extern struct ip_vs_conn *ip_vs_conn_out_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port);
+struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p);
 
 struct ip_vs_conn * ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
 					     struct ip_vs_protocol *pp,
@@ -658,11 +675,10 @@ static inline void __ip_vs_conn_put(stru
 extern void ip_vs_conn_put(struct ip_vs_conn *cp);
 extern void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport);
 
-extern struct ip_vs_conn *
-ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
-	       const union nf_inet_addr *vaddr, __be16 vport,
-	       const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
-	       struct ip_vs_dest *dest);
+struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p,
+				  const union nf_inet_addr *daddr,
+				  __be16 dport, unsigned flags,
+				  struct ip_vs_dest *dest);
 extern void ip_vs_conn_expire_now(struct ip_vs_conn *cp);
 
 extern const char * ip_vs_state_name(__u16 proto, int state);
Index: lvs-test-2.6/net/netfilter/ipvs/ip_vs_conn.c
===================================================================
--- lvs-test-2.6.orig/net/netfilter/ipvs/ip_vs_conn.c	2010-10-01 21:56:39.000000000 +0900
+++ lvs-test-2.6/net/netfilter/ipvs/ip_vs_conn.c	2010-10-01 22:07:22.000000000 +0900
@@ -218,27 +218,26 @@ static inline int ip_vs_conn_unhash(stru
 /*
  *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
  *  Called for pkts coming from OUTside-to-INside.
- *	s_addr, s_port: pkt source address (foreign host)
- *	d_addr, d_port: pkt dest address (load balancer)
+ *	p->caddr, p->cport: pkt source address (foreign host)
+ *	p->vaddr, p->vport: pkt dest address (load balancer)
  */
-static inline struct ip_vs_conn *__ip_vs_conn_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+static inline struct ip_vs_conn *
+__ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
 {
 	unsigned hash;
 	struct ip_vs_conn *cp;
 
-	hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
+	hash = ip_vs_conn_hashkey(p->af, p->protocol, p->caddr, p->cport);
 
 	ct_read_lock(hash);
 
 	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (cp->af == af &&
-		    ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
-		    ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
-		    s_port == cp->cport && d_port == cp->vport &&
-		    ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
-		    protocol == cp->protocol) {
+		if (cp->af == p->af &&
+		    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
+		    ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
+		    p->cport == cp->cport && p->vport == cp->vport &&
+		    ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
+		    p->protocol == cp->protocol) {
 			/* HIT */
 			atomic_inc(&cp->refcnt);
 			ct_read_unlock(hash);
@@ -251,71 +250,82 @@ static inline struct ip_vs_conn *__ip_vs
 	return NULL;
 }
 
-struct ip_vs_conn *ip_vs_conn_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
 {
 	struct ip_vs_conn *cp;
 
-	cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port);
-	if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
-		cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr,
-					 d_port);
+	cp = __ip_vs_conn_in_get(p);
+	if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) {
+		struct ip_vs_conn_param cport_zero_p = *p;
+		cport_zero_p.cport = 0;
+		cp = __ip_vs_conn_in_get(&cport_zero_p);
+	}
 
 	IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
-		      ip_vs_proto_name(protocol),
-		      IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
-		      IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+		      ip_vs_proto_name(p->protocol),
+		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
 		      cp ? "hit" : "not hit");
 
 	return cp;
 }
 
+static int
+ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
+			    const struct ip_vs_iphdr *iph,
+			    unsigned int proto_off, int inverse,
+			    struct ip_vs_conn_param *p)
+{
+	__be16 _ports[2], *pptr;
+
+	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
+	if (pptr == NULL)
+		return 1;
+
+	if (likely(!inverse))
+		ip_vs_conn_fill_param(af, iph->protocol, &iph->saddr, pptr[0],
+				      &iph->daddr, pptr[1], p);
+	else
+		ip_vs_conn_fill_param(af, iph->protocol, &iph->saddr, pptr[0],
+				      &iph->daddr, pptr[1], p);
+	return 0;
+}
+
 struct ip_vs_conn *
 ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
 			struct ip_vs_protocol *pp,
 			const struct ip_vs_iphdr *iph,
 			unsigned int proto_off, int inverse)
 {
-	__be16 _ports[2], *pptr;
+	struct ip_vs_conn_param p;
 
-	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
-	if (pptr == NULL)
+	if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
 		return NULL;
 
-	if (likely(!inverse))
-		return ip_vs_conn_in_get(af, iph->protocol,
-					 &iph->saddr, pptr[0],
-					 &iph->daddr, pptr[1]);
-	else
-		return ip_vs_conn_in_get(af, iph->protocol,
-					 &iph->daddr, pptr[1],
-					 &iph->saddr, pptr[0]);
+	return ip_vs_conn_in_get(&p);
 }
 EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto);
 
 /* Get reference to connection template */
-struct ip_vs_conn *ip_vs_ct_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
 {
 	unsigned hash;
 	struct ip_vs_conn *cp;
 
-	hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
+	hash = ip_vs_conn_hashkey(p->af, p->protocol, p->caddr, p->cport);
 
 	ct_read_lock(hash);
 
 	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (cp->af == af &&
-		    ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
+		if (cp->af == p->af &&
+		    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
 		    /* protocol should only be IPPROTO_IP if
-		     * d_addr is a fwmark */
-		    ip_vs_addr_equal(protocol == IPPROTO_IP ? AF_UNSPEC : af,
-		                     d_addr, &cp->vaddr) &&
-		    s_port == cp->cport && d_port == cp->vport &&
+		     * p->vaddr is a fwmark */
+		    ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC :
+				     p->af, p->vaddr, &cp->vaddr) &&
+		    p->cport == cp->cport && p->vport == cp->vport &&
 		    cp->flags & IP_VS_CONN_F_TEMPLATE &&
-		    protocol == cp->protocol) {
+		    p->protocol == cp->protocol) {
 			/* HIT */
 			atomic_inc(&cp->refcnt);
 			goto out;
@@ -327,9 +337,9 @@ struct ip_vs_conn *ip_vs_ct_in_get
 	ct_read_unlock(hash);
 
 	IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
-		      ip_vs_proto_name(protocol),
-		      IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
-		      IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+		      ip_vs_proto_name(p->protocol),
+		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
 		      cp ? "hit" : "not hit");
 
 	return cp;
@@ -341,9 +351,7 @@ struct ip_vs_conn *ip_vs_ct_in_get
  *	s_addr, s_port: pkt source address (inside host)
  *	d_addr, d_port: pkt dest address (foreign host)
  */
-struct ip_vs_conn *ip_vs_conn_out_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
 {
 	unsigned hash;
 	struct ip_vs_conn *cp, *ret=NULL;
@@ -351,16 +359,16 @@ struct ip_vs_conn *ip_vs_conn_out_get
 	/*
 	 *	Check for "full" addressed entries
 	 */
-	hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port);
+	hash = ip_vs_conn_hashkey(p->af, p->protocol, p->vaddr, p->vport);
 
 	ct_read_lock(hash);
 
 	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (cp->af == af &&
-		    ip_vs_addr_equal(af, d_addr, &cp->caddr) &&
-		    ip_vs_addr_equal(af, s_addr, &cp->daddr) &&
-		    d_port == cp->cport && s_port == cp->dport &&
-		    protocol == cp->protocol) {
+		if (cp->af == p->af &&
+		    ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
+		    ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
+		    p->vport == cp->cport && p->cport == cp->dport &&
+		    p->protocol == cp->protocol) {
 			/* HIT */
 			atomic_inc(&cp->refcnt);
 			ret = cp;
@@ -371,9 +379,9 @@ struct ip_vs_conn *ip_vs_conn_out_get
 	ct_read_unlock(hash);
 
 	IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
-		      ip_vs_proto_name(protocol),
-		      IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
-		      IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+		      ip_vs_proto_name(p->protocol),
+		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
 		      ret ? "hit" : "not hit");
 
 	return ret;
@@ -385,20 +393,12 @@ ip_vs_conn_out_get_proto(int af, const s
 			 const struct ip_vs_iphdr *iph,
 			 unsigned int proto_off, int inverse)
 {
-	__be16 _ports[2], *pptr;
+	struct ip_vs_conn_param p;
 
-	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
-	if (pptr == NULL)
+	if (!ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
 		return NULL;
 
-	if (likely(!inverse))
-		return ip_vs_conn_out_get(af, iph->protocol,
-					  &iph->saddr, pptr[0],
-					  &iph->daddr, pptr[1]);
-	else
-		return ip_vs_conn_out_get(af, iph->protocol,
-					  &iph->daddr, pptr[1],
-					  &iph->saddr, pptr[0]);
+	return ip_vs_conn_out_get(&p);
 }
 EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);
 
@@ -758,13 +758,12 @@ void ip_vs_conn_expire_now(struct ip_vs_
  *	Create a new connection entry and hash it into the ip_vs_conn_tab
  */
 struct ip_vs_conn *
-ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
-	       const union nf_inet_addr *vaddr, __be16 vport,
+ip_vs_conn_new(const struct ip_vs_conn_param *p,
 	       const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
 	       struct ip_vs_dest *dest)
 {
 	struct ip_vs_conn *cp;
-	struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
+	struct ip_vs_protocol *pp = ip_vs_proto_get(p->protocol);
 
 	cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
 	if (cp == NULL) {
@@ -774,14 +773,14 @@ ip_vs_conn_new(int af, int proto, const
 
 	INIT_LIST_HEAD(&cp->c_list);
 	setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
-	cp->af		   = af;
-	cp->protocol	   = proto;
-	ip_vs_addr_copy(af, &cp->caddr, caddr);
-	cp->cport	   = cport;
-	ip_vs_addr_copy(af, &cp->vaddr, vaddr);
-	cp->vport	   = vport;
+	cp->af		   = p->af;
+	cp->protocol	   = p->protocol;
+	ip_vs_addr_copy(p->af, &cp->caddr, p->caddr);
+	cp->cport	   = p->cport;
+	ip_vs_addr_copy(p->af, &cp->vaddr, p->vaddr);
+	cp->vport	   = p->vport;
 	/* proto should only be IPPROTO_IP if d_addr is a fwmark */
-	ip_vs_addr_copy(proto == IPPROTO_IP ? AF_UNSPEC : af,
+	ip_vs_addr_copy(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
 			&cp->daddr, daddr);
 	cp->dport          = dport;
 	cp->flags	   = flags;
@@ -810,7 +809,7 @@ ip_vs_conn_new(int af, int proto, const
 
 	/* Bind its packet transmitter */
 #ifdef CONFIG_IP_VS_IPV6
-	if (af == AF_INET6)
+	if (p->af == AF_INET6)
 		ip_vs_bind_xmit_v6(cp);
 	else
 #endif
Index: lvs-test-2.6/net/netfilter/ipvs/ip_vs_core.c
===================================================================
--- lvs-test-2.6.orig/net/netfilter/ipvs/ip_vs_core.c	2010-10-01 22:06:23.000000000 +0900
+++ lvs-test-2.6/net/netfilter/ipvs/ip_vs_core.c	2010-10-01 22:10:46.000000000 +0900
@@ -193,14 +193,11 @@ ip_vs_sched_persist(struct ip_vs_service
 	struct ip_vs_iphdr iph;
 	struct ip_vs_dest *dest;
 	struct ip_vs_conn *ct;
-	int protocol = iph.protocol;
 	__be16 dport = 0;		/* destination port to forward */
-	__be16 vport = 0;		/* virtual service port */
 	unsigned int flags;
 	union nf_inet_addr snet;	/* source network of the client,
 					   after masking */
-	const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
-	const union nf_inet_addr *vaddr = &iph.daddr;
+	struct ip_vs_conn_param param;
 
 	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
 
@@ -232,6 +229,11 @@ ip_vs_sched_persist(struct ip_vs_service
 	 * is created for other persistent services.
 	 */
 	{
+		int protocol = iph.protocol;
+		const union nf_inet_addr *vaddr = &iph.daddr;
+		const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
+		__be16 vport = 0;
+
 		if (ports[1] == svc->port) {
 			/* non-FTP template:
 			 * <protocol, caddr, 0, vaddr, vport, daddr, dport>
@@ -253,11 +255,12 @@ ip_vs_sched_persist(struct ip_vs_service
 				vaddr = &fwmark;
 			}
 		}
+		ip_vs_conn_fill_param(svc->af, protocol, &snet, 0,
+				      vaddr, vport, &param);
 	}
 
 	/* Check if a template already exists */
-	ct = ip_vs_ct_in_get(svc->af, protocol, &snet, 0, vaddr, vport);
-
+	ct = ip_vs_ct_in_get(&param);
 	if (!ct || !ip_vs_check_template(ct)) {
 		/* No template found or the dest of the connection
 		 * template is not available.
@@ -272,8 +275,7 @@ ip_vs_sched_persist(struct ip_vs_service
 			dport = dest->port;
 
 		/* Create a template */
-		ct = ip_vs_conn_new(svc->af, protocol, &snet, 0,vaddr, vport,
-				    &dest->addr, dport,
+		ct = ip_vs_conn_new(&param, &dest->addr, dport,
 				    IP_VS_CONN_F_TEMPLATE, dest);
 		if (ct == NULL)
 			return NULL;
@@ -291,12 +293,7 @@ ip_vs_sched_persist(struct ip_vs_service
 	/*
 	 *    Create a new connection according to the template
 	 */
-	cp = ip_vs_conn_new(svc->af, iph.protocol,
-			    &iph.saddr, ports[0],
-			    &iph.daddr, ports[1],
-			    &dest->addr, dport,
-			    flags,
-			    dest);
+	cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest);
 	if (cp == NULL) {
 		ip_vs_conn_put(ct);
 		return NULL;
@@ -363,14 +360,16 @@ ip_vs_schedule(struct ip_vs_service *svc
 	/*
 	 *    Create a connection entry.
 	 */
-	cp = ip_vs_conn_new(svc->af, iph.protocol,
-			    &iph.saddr, pptr[0],
-			    &iph.daddr, pptr[1],
-			    &dest->addr, dest->port ? dest->port : pptr[1],
-			    flags,
-			    dest);
-	if (cp == NULL)
-		return NULL;
+	{
+		struct ip_vs_conn_param p;
+		ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr,
+				      pptr[0], &iph.daddr, pptr[1], &p);
+		cp = ip_vs_conn_new(&p, &dest->addr,
+				    dest->port ? dest->port : pptr[1],
+				    flags, dest);
+		if (!cp)
+			return NULL;
+	}
 
 	IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
 		      "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
@@ -426,14 +425,17 @@ int ip_vs_leave(struct ip_vs_service *sv
 
 		/* create a new connection entry */
 		IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
-		cp = ip_vs_conn_new(svc->af, iph.protocol,
-				    &iph.saddr, pptr[0],
-				    &iph.daddr, pptr[1],
-				    &daddr, 0,
-				    IP_VS_CONN_F_BYPASS | flags,
-				    NULL);
-		if (cp == NULL)
-			return NF_DROP;
+		{
+			struct ip_vs_conn_param p;
+			ip_vs_conn_fill_param(svc->af, iph.protocol,
+					      &iph.saddr, pptr[0],
+					      &iph.daddr, pptr[1], &p);
+			cp = ip_vs_conn_new(&p, &daddr, 0,
+					    IP_VS_CONN_F_BYPASS | flags,
+					    NULL);
+			if (!cp)
+				return NF_DROP;
+		}
 
 		/* statistics */
 		ip_vs_in_stats(cp, skb);
Index: lvs-test-2.6/net/netfilter/ipvs/ip_vs_ftp.c
===================================================================
--- lvs-test-2.6.orig/net/netfilter/ipvs/ip_vs_ftp.c	2010-10-01 22:14:10.000000000 +0900
+++ lvs-test-2.6/net/netfilter/ipvs/ip_vs_ftp.c	2010-10-01 22:21:09.000000000 +0900
@@ -195,13 +195,17 @@ static int ip_vs_ftp_out(struct ip_vs_ap
 		/*
 		 * Now update or create an connection entry for it
 		 */
-		n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port,
-					  &cp->caddr, 0);
+		{
+			struct ip_vs_conn_param p;
+			ip_vs_conn_fill_param(AF_INET, iph->protocol,
+					      &from, port, &cp->caddr, 0, &p);
+			n_cp = ip_vs_conn_out_get(&p);
+		}
 		if (!n_cp) {
-			n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
-					      &cp->caddr, 0,
-					      &cp->vaddr, port,
-					      &from, port,
+			struct ip_vs_conn_param p;
+			ip_vs_conn_fill_param(AF_INET, IPPROTO_TCP, &cp->caddr,
+					      0, &cp->vaddr, port, &p);
+			n_cp = ip_vs_conn_new(&p, &from, port,
 					      IP_VS_CONN_F_NO_CPORT |
 					      IP_VS_CONN_F_NFCT,
 					      cp->dest);
@@ -347,21 +351,22 @@ static int ip_vs_ftp_in(struct ip_vs_app
 		  ip_vs_proto_name(iph->protocol),
 		  &to.ip, ntohs(port), &cp->vaddr.ip, 0);
 
-	n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol,
-				 &to, port,
-				 &cp->vaddr, htons(ntohs(cp->vport)-1));
-	if (!n_cp) {
-		n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
-				      &to, port,
+	{
+		struct ip_vs_conn_param p;
+		ip_vs_conn_fill_param(AF_INET, iph->protocol, &to, port,
 				      &cp->vaddr, htons(ntohs(cp->vport)-1),
-				      &cp->daddr, htons(ntohs(cp->dport)-1),
-				      IP_VS_CONN_F_NFCT,
-				      cp->dest);
-		if (!n_cp)
-			return 0;
+				      &p);
+		n_cp = ip_vs_conn_in_get(&p);
+		if (!n_cp) {
+			n_cp = ip_vs_conn_new(&p, &cp->daddr,
+					      htons(ntohs(cp->dport)-1),
+					      IP_VS_CONN_F_NFCT, cp->dest);
+			if (!n_cp)
+				return 0;
 
-		/* add its controller */
-		ip_vs_control_add(n_cp, cp);
+			/* add its controller */
+			ip_vs_control_add(n_cp, cp);
+		}
 	}
 
 	/*
Index: lvs-test-2.6/net/netfilter/ipvs/ip_vs_nfct.c
===================================================================
--- lvs-test-2.6.orig/net/netfilter/ipvs/ip_vs_nfct.c	2010-10-01 22:11:53.000000000 +0900
+++ lvs-test-2.6/net/netfilter/ipvs/ip_vs_nfct.c	2010-10-01 22:24:29.000000000 +0900
@@ -140,6 +140,7 @@ static void ip_vs_nfct_expect_callback(s
 {
 	struct nf_conntrack_tuple *orig, new_reply;
 	struct ip_vs_conn *cp;
+	struct ip_vs_conn_param p;
 
 	if (exp->tuple.src.l3num != PF_INET)
 		return;
@@ -154,9 +155,10 @@ static void ip_vs_nfct_expect_callback(s
 
 	/* RS->CLIENT */
 	orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
-	cp = ip_vs_conn_out_get(exp->tuple.src.l3num, orig->dst.protonum,
-				&orig->src.u3, orig->src.u.tcp.port,
-				&orig->dst.u3, orig->dst.u.tcp.port);
+	ip_vs_conn_fill_param(exp->tuple.src.l3num, orig->dst.protonum,
+			      &orig->src.u3, orig->src.u.tcp.port,
+			      &orig->dst.u3, orig->dst.u.tcp.port, &p);
+	cp = ip_vs_conn_out_get(&p);
 	if (cp) {
 		/* Change reply CLIENT->RS to CLIENT->VS */
 		new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
@@ -176,9 +178,7 @@ static void ip_vs_nfct_expect_callback(s
 	}
 
 	/* CLIENT->VS */
-	cp = ip_vs_conn_in_get(exp->tuple.src.l3num, orig->dst.protonum,
-			       &orig->src.u3, orig->src.u.tcp.port,
-			       &orig->dst.u3, orig->dst.u.tcp.port);
+	cp = ip_vs_conn_in_get(&p);
 	if (cp) {
 		/* Change reply VS->CLIENT to RS->CLIENT */
 		new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
Index: lvs-test-2.6/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
===================================================================
--- lvs-test-2.6.orig/net/netfilter/ipvs/ip_vs_proto_ah_esp.c	2010-10-01 21:55:19.000000000 +0900
+++ lvs-test-2.6/net/netfilter/ipvs/ip_vs_proto_ah_esp.c	2010-10-01 22:23:33.000000000 +0900
@@ -40,6 +40,19 @@ struct isakmp_hdr {
 
 #define PORT_ISAKMP	500
 
+static void
+ah_esp_conn_fill_param_proto(int af, const struct ip_vs_iphdr *iph,
+			     int inverse, struct ip_vs_conn_param *p)
+{
+	if (likely(!inverse))
+		ip_vs_conn_fill_param(af, IPPROTO_UDP,
+				      &iph->saddr, htons(PORT_ISAKMP),
+				      &iph->daddr, htons(PORT_ISAKMP), p);
+	else
+		ip_vs_conn_fill_param(af, iph->protocol,
+				      &iph->saddr, htons(PORT_ISAKMP),
+				      &iph->daddr, htons(PORT_ISAKMP), p);
+}
 
 static struct ip_vs_conn *
 ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
@@ -47,21 +60,10 @@ ah_esp_conn_in_get(int af, const struct
 		   int inverse)
 {
 	struct ip_vs_conn *cp;
+	struct ip_vs_conn_param p;
 
-	if (likely(!inverse)) {
-		cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
-				       &iph->saddr,
-				       htons(PORT_ISAKMP),
-				       &iph->daddr,
-				       htons(PORT_ISAKMP));
-	} else {
-		cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
-				       &iph->daddr,
-				       htons(PORT_ISAKMP),
-				       &iph->saddr,
-				       htons(PORT_ISAKMP));
-	}
-
+	ah_esp_conn_fill_param_proto(af, iph, inverse, &p);
+	cp = ip_vs_conn_in_get(&p);
 	if (!cp) {
 		/*
 		 * We are not sure if the packet is from our
@@ -87,21 +89,10 @@ ah_esp_conn_out_get(int af, const struct
 		    int inverse)
 {
 	struct ip_vs_conn *cp;
+	struct ip_vs_conn_param p;
 
-	if (likely(!inverse)) {
-		cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
-					&iph->saddr,
-					htons(PORT_ISAKMP),
-					&iph->daddr,
-					htons(PORT_ISAKMP));
-	} else {
-		cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
-					&iph->daddr,
-					htons(PORT_ISAKMP),
-					&iph->saddr,
-					htons(PORT_ISAKMP));
-	}
-
+	ah_esp_conn_fill_param_proto(af, iph, inverse, &p);
+	cp = ip_vs_conn_out_get(&p);
 	if (!cp) {
 		IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
 			      "%s%s %s->%s\n",
Index: lvs-test-2.6/net/netfilter/ipvs/ip_vs_sync.c
===================================================================
--- lvs-test-2.6.orig/net/netfilter/ipvs/ip_vs_sync.c	2010-10-01 21:55:19.000000000 +0900
+++ lvs-test-2.6/net/netfilter/ipvs/ip_vs_sync.c	2010-10-01 22:23:33.000000000 +0900
@@ -301,6 +301,7 @@ static void ip_vs_process_message(const
 	struct ip_vs_conn *cp;
 	struct ip_vs_protocol *pp;
 	struct ip_vs_dest *dest;
+	struct ip_vs_conn_param param;
 	char *p;
 	int i;
 
@@ -370,18 +371,17 @@ static void ip_vs_process_message(const
 			}
 		}
 
-		if (!(flags & IP_VS_CONN_F_TEMPLATE))
-			cp = ip_vs_conn_in_get(AF_INET, s->protocol,
-					       (union nf_inet_addr *)&s->caddr,
-					       s->cport,
-					       (union nf_inet_addr *)&s->vaddr,
-					       s->vport);
-		else
-			cp = ip_vs_ct_in_get(AF_INET, s->protocol,
-					     (union nf_inet_addr *)&s->caddr,
-					     s->cport,
-					     (union nf_inet_addr *)&s->vaddr,
-					     s->vport);
+		{
+			ip_vs_conn_fill_param(AF_INET, s->protocol,
+					      (union nf_inet_addr *)&s->caddr,
+					      s->cport,
+					      (union nf_inet_addr *)&s->vaddr,
+					      s->vport, &param);
+			if (!(flags & IP_VS_CONN_F_TEMPLATE))
+				cp = ip_vs_conn_in_get(&param);
+			else
+				cp = ip_vs_ct_in_get(&param);
+		}
 		if (!cp) {
 			/*
 			 * Find the appropriate destination for the connection.
@@ -406,14 +406,9 @@ static void ip_vs_process_message(const
 				else
 					flags &= ~IP_VS_CONN_F_INACTIVE;
 			}
-			cp = ip_vs_conn_new(AF_INET, s->protocol,
-					    (union nf_inet_addr *)&s->caddr,
-					    s->cport,
-					    (union nf_inet_addr *)&s->vaddr,
-					    s->vport,
+			cp = ip_vs_conn_new(&param,
 					    (union nf_inet_addr *)&s->daddr,
-					    s->dport,
-					    flags, dest);
+					    s->dport, flags, dest);
 			if (dest)
 				atomic_dec(&dest->refcnt);
 			if (!cp) {


^ permalink raw reply

* [patch v2 05/12] [PATCH 05/12] IPVS: Allow null argument to ip_vs_scheduler_put()
From: Simon Horman @ 2010-10-01 14:35 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter, netfilter-devel
  Cc: Jan Engelhardt, Stephen Hemminger, Wensong Zhang,
	Julian Anastasov, Patrick McHardy
In-Reply-To: <20101001143517.645421976@akiko.akashicho.tokyo.vergenet.net>

[-- Attachment #1: 0005-IPVS-Allow-null-argument-to-ip_vs_scheduler_put.patch --]
[-- Type: text/plain, Size: 2041 bytes --]

This simplifies caller logic sightly.

Signed-off-by: Simon Horman <horms@verge.net.au>
---

v2
* Trivial rediff

Index: lvs-test-2.6/net/netfilter/ipvs/ip_vs_ctl.c
===================================================================
--- lvs-test-2.6.orig/net/netfilter/ipvs/ip_vs_ctl.c	2010-10-01 22:25:53.000000000 +0900
+++ lvs-test-2.6/net/netfilter/ipvs/ip_vs_ctl.c	2010-10-01 22:26:10.000000000 +0900
@@ -1144,7 +1144,7 @@ ip_vs_add_service(struct ip_vs_service_u
 	if (sched == NULL) {
 		pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
 		ret = -ENOENT;
-		goto out_mod_dec;
+		goto out_err;
 	}
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -1204,7 +1204,7 @@ ip_vs_add_service(struct ip_vs_service_u
 	*svc_p = svc;
 	return 0;
 
-  out_err:
+ out_err:
 	if (svc != NULL) {
 		if (svc->scheduler)
 			ip_vs_unbind_scheduler(svc);
@@ -1217,7 +1217,6 @@ ip_vs_add_service(struct ip_vs_service_u
 	}
 	ip_vs_scheduler_put(sched);
 
-  out_mod_dec:
 	/* decrease the module use count */
 	ip_vs_use_count_dec();
 
@@ -1300,10 +1299,7 @@ ip_vs_edit_service(struct ip_vs_service
 #ifdef CONFIG_IP_VS_IPV6
   out:
 #endif
-
-	if (old_sched)
-		ip_vs_scheduler_put(old_sched);
-
+	ip_vs_scheduler_put(old_sched);
 	return ret;
 }
 
@@ -1327,8 +1323,7 @@ static void __ip_vs_del_service(struct i
 	/* Unbind scheduler */
 	old_sched = svc->scheduler;
 	ip_vs_unbind_scheduler(svc);
-	if (old_sched)
-		ip_vs_scheduler_put(old_sched);
+	ip_vs_scheduler_put(old_sched);
 
 	/* Unbind app inc */
 	if (svc->inc) {
Index: lvs-test-2.6/net/netfilter/ipvs/ip_vs_sched.c
===================================================================
--- lvs-test-2.6.orig/net/netfilter/ipvs/ip_vs_sched.c	2010-10-01 22:25:53.000000000 +0900
+++ lvs-test-2.6/net/netfilter/ipvs/ip_vs_sched.c	2010-10-01 22:26:10.000000000 +0900
@@ -159,7 +159,7 @@ struct ip_vs_scheduler *ip_vs_scheduler_
 
 void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
 {
-	if (scheduler->module)
+	if (scheduler && scheduler->module)
 		module_put(scheduler->module);
 }
 


^ permalink raw reply

* [patch v2 07/12] [PATCH 07/12] IPVS: Add struct ip_vs_pe
From: Simon Horman @ 2010-10-01 14:35 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter, netfilter-devel
  Cc: Jan Engelhardt, Stephen Hemminger, Wensong Zhang,
	Julian Anastasov, Patrick McHardy
In-Reply-To: <20101001143517.645421976@akiko.akashicho.tokyo.vergenet.net>

[-- Attachment #1: 0007-IPVS-Add-struct-ip_vs_pe.patch --]
[-- Type: text/plain, Size: 11305 bytes --]

Signed-off-by: Simon Horman <horms@verge.net.au>
--- 

This the first of several patches that add persistence engines.

v2
* Don't leak pe_data
  - It wasn't being freed anywhere, ever
* Trivial rediff

Index: lvs-test-2.6/include/linux/ip_vs.h
===================================================================
--- lvs-test-2.6.orig/include/linux/ip_vs.h	2010-10-01 22:47:39.000000000 +0900
+++ lvs-test-2.6/include/linux/ip_vs.h	2010-10-01 22:48:51.000000000 +0900
@@ -99,8 +99,10 @@
 				0)
 
 #define IP_VS_SCHEDNAME_MAXLEN	16
+#define IP_VS_PENAME_MAXLEN	16
 #define IP_VS_IFNAME_MAXLEN	16
 
+#define IP_VS_PEDATA_MAXLEN     255
 
 /*
  *	The struct ip_vs_service_user and struct ip_vs_dest_user are
Index: lvs-test-2.6/include/net/ip_vs.h
===================================================================
--- lvs-test-2.6.orig/include/net/ip_vs.h	2010-10-01 22:48:42.000000000 +0900
+++ lvs-test-2.6/include/net/ip_vs.h	2010-10-01 22:48:51.000000000 +0900
@@ -364,6 +364,10 @@ struct ip_vs_conn_param {
 	__be16				vport;
 	__u16				protocol;
 	u16				af;
+
+	const struct ip_vs_pe		*pe;
+	char				*pe_data;
+	__u8				pe_data_len;
 };
 
 /*
@@ -416,6 +420,9 @@ struct ip_vs_conn {
 	void                    *app_data;      /* Application private data */
 	struct ip_vs_seq        in_seq;         /* incoming seq. struct */
 	struct ip_vs_seq        out_seq;        /* outgoing seq. struct */
+
+	char			*pe_data;
+	__u8			pe_data_len;
 };
 
 
@@ -486,6 +493,9 @@ struct ip_vs_service {
 	struct ip_vs_scheduler	*scheduler;    /* bound scheduler object */
 	rwlock_t		sched_lock;    /* lock sched_data */
 	void			*sched_data;   /* scheduler application data */
+
+	/* alternate persistence engine */
+	struct ip_vs_pe		*pe;
 };
 
 
@@ -549,6 +559,19 @@ struct ip_vs_scheduler {
 				       const struct sk_buff *skb);
 };
 
+/* The persistence engine object */
+struct ip_vs_pe {
+	struct list_head	n_list;		/* d-linked list head */
+	char			*name;		/* scheduler name */
+	atomic_t		refcnt;		/* reference counter */
+	struct module		*module;	/* THIS_MODULE/NULL */
+
+	/* get the connection template, if any */
+	int (*fill_param)(struct ip_vs_conn_param *p, struct sk_buff *skb);
+	bool (*ct_match)(const struct ip_vs_conn_param *p,
+			 struct ip_vs_conn *ct);
+	u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, u32 initval);
+};
 
 /*
  *	The application module object (a.k.a. app incarnation)
@@ -648,6 +671,8 @@ static inline void ip_vs_conn_fill_param
 	p->cport = cport;
 	p->vaddr = vaddr;
 	p->vport = vport;
+	p->pe = NULL;
+	p->pe_data = NULL;
 }
 
 struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p);
@@ -803,7 +828,7 @@ extern int ip_vs_unbind_scheduler(struct
 extern struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name);
 extern void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler);
 extern struct ip_vs_conn *
-ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb);
+ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb);
 extern int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 			struct ip_vs_protocol *pp);
 
Index: lvs-test-2.6/net/netfilter/ipvs/ip_vs_conn.c
===================================================================
--- lvs-test-2.6.orig/net/netfilter/ipvs/ip_vs_conn.c	2010-10-01 22:48:42.000000000 +0900
+++ lvs-test-2.6/net/netfilter/ipvs/ip_vs_conn.c	2010-10-01 22:49:15.000000000 +0900
@@ -148,6 +148,29 @@ static unsigned int ip_vs_conn_hashkey(i
 		& ip_vs_conn_tab_mask;
 }
 
+static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p)
+{
+	if (p->pe && p->pe->hashkey_raw)
+		return p->pe->hashkey_raw(p, ip_vs_conn_rnd) &
+			ip_vs_conn_tab_mask;
+	return ip_vs_conn_hashkey(p->af, p->protocol, p->caddr, p->cport);
+}
+
+static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
+{
+	struct ip_vs_conn_param p;
+
+	ip_vs_conn_fill_param(cp->af, cp->protocol, &cp->caddr, cp->cport,
+			      NULL, 0, &p);
+
+	if (cp->dest->svc->pe) {
+		p.pe = cp->dest->svc->pe;
+		p.pe_data = cp->pe_data;
+		p.pe_data_len = cp->pe_data_len;
+	}
+
+	return ip_vs_conn_hashkey_param(&p);
+}
 
 /*
  *	Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
@@ -162,7 +185,7 @@ static inline int ip_vs_conn_hash(struct
 		return 0;
 
 	/* Hash by protocol, client address and port */
-	hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
+	hash = ip_vs_conn_hashkey_conn(cp);
 
 	ct_write_lock(hash);
 	spin_lock(&cp->lock);
@@ -195,7 +218,7 @@ static inline int ip_vs_conn_unhash(stru
 	int ret;
 
 	/* unhash it and decrease its reference counter */
-	hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
+	hash = ip_vs_conn_hashkey_conn(cp);
 
 	ct_write_lock(hash);
 	spin_lock(&cp->lock);
@@ -227,7 +250,7 @@ __ip_vs_conn_in_get(const struct ip_vs_c
 	unsigned hash;
 	struct ip_vs_conn *cp;
 
-	hash = ip_vs_conn_hashkey(p->af, p->protocol, p->caddr, p->cport);
+	hash = ip_vs_conn_hashkey_param(p);
 
 	ct_read_lock(hash);
 
@@ -312,11 +335,17 @@ struct ip_vs_conn *ip_vs_ct_in_get(const
 	unsigned hash;
 	struct ip_vs_conn *cp;
 
-	hash = ip_vs_conn_hashkey(p->af, p->protocol, p->caddr, p->cport);
+	hash = ip_vs_conn_hashkey_param(p);
 
 	ct_read_lock(hash);
 
 	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+		if (p->pe && p->pe->ct_match) {
+			if (p->pe->ct_match(p, cp))
+				goto out;
+			continue;
+		}
+
 		if (cp->af == p->af &&
 		    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
 		    /* protocol should only be IPPROTO_IP if
@@ -325,15 +354,14 @@ struct ip_vs_conn *ip_vs_ct_in_get(const
 				     p->af, p->vaddr, &cp->vaddr) &&
 		    p->cport == cp->cport && p->vport == cp->vport &&
 		    cp->flags & IP_VS_CONN_F_TEMPLATE &&
-		    p->protocol == cp->protocol) {
-			/* HIT */
-			atomic_inc(&cp->refcnt);
+		    p->protocol == cp->protocol)
 			goto out;
-		}
 	}
 	cp = NULL;
 
   out:
+	if (cp)
+		atomic_inc(&cp->refcnt);
 	ct_read_unlock(hash);
 
 	IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
@@ -359,7 +387,7 @@ struct ip_vs_conn *ip_vs_conn_out_get(co
 	/*
 	 *	Check for "full" addressed entries
 	 */
-	hash = ip_vs_conn_hashkey(p->af, p->protocol, p->vaddr, p->vport);
+	hash = ip_vs_conn_hashkey_param(p);
 
 	ct_read_lock(hash);
 
@@ -724,6 +752,7 @@ static void ip_vs_conn_expire(unsigned l
 		if (cp->flags & IP_VS_CONN_F_NFCT)
 			ip_vs_conn_drop_conntrack(cp);
 
+		kfree(cp->pe_data);
 		if (unlikely(cp->app != NULL))
 			ip_vs_unbind_app(cp);
 		ip_vs_unbind_dest(cp);
@@ -784,6 +813,10 @@ ip_vs_conn_new(const struct ip_vs_conn_p
 			&cp->daddr, daddr);
 	cp->dport          = dport;
 	cp->flags	   = flags;
+	if (flags & IP_VS_CONN_F_TEMPLATE && p->pe_data) {
+		cp->pe_data = p->pe_data;
+		cp->pe_data_len = p->pe_data_len;
+	}
 	spin_lock_init(&cp->lock);
 
 	/*
@@ -834,7 +867,6 @@ ip_vs_conn_new(const struct ip_vs_conn_p
 	return cp;
 }
 
-
 /*
  *	/proc/net/ip_vs_conn entries
  */
@@ -850,7 +882,7 @@ static void *ip_vs_conn_array(struct seq
 		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
 			if (pos-- == 0) {
 				seq->private = &ip_vs_conn_tab[idx];
-				return cp;
+			return cp;
 			}
 		}
 		ct_read_unlock_bh(idx);
Index: lvs-test-2.6/net/netfilter/ipvs/ip_vs_core.c
===================================================================
--- lvs-test-2.6.orig/net/netfilter/ipvs/ip_vs_core.c	2010-10-01 22:48:42.000000000 +0900
+++ lvs-test-2.6/net/netfilter/ipvs/ip_vs_core.c	2010-10-01 22:49:15.000000000 +0900
@@ -176,6 +176,19 @@ ip_vs_set_state(struct ip_vs_conn *cp, i
 	return pp->state_transition(cp, direction, skb, pp);
 }
 
+static inline int
+ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
+			      struct sk_buff *skb, int protocol,
+			      const union nf_inet_addr *caddr, __be16 cport,
+			      const union nf_inet_addr *vaddr, __be16 vport,
+			      struct ip_vs_conn_param *p)
+{
+	ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p);
+	p->pe = svc->pe;
+	if (p->pe && p->pe->fill_param)
+		return p->pe->fill_param(p, skb);
+	return 0;
+}
 
 /*
  *  IPVS persistent scheduling function
@@ -186,7 +199,7 @@ ip_vs_set_state(struct ip_vs_conn *cp, i
  */
 static struct ip_vs_conn *
 ip_vs_sched_persist(struct ip_vs_service *svc,
-		    const struct sk_buff *skb,
+		    struct sk_buff *skb,
 		    __be16 ports[2])
 {
 	struct ip_vs_conn *cp = NULL;
@@ -255,8 +268,9 @@ ip_vs_sched_persist(struct ip_vs_service
 				vaddr = &fwmark;
 			}
 		}
-		ip_vs_conn_fill_param(svc->af, protocol, &snet, 0,
-				      vaddr, vport, &param);
+		if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
+						  vaddr, vport, &param))
+			return NULL;
 	}
 
 	/* Check if a template already exists */
@@ -268,22 +282,31 @@ ip_vs_sched_persist(struct ip_vs_service
 		dest = svc->scheduler->schedule(svc, skb);
 		if (!dest) {
 			IP_VS_DBG(1, "p-schedule: no dest found.\n");
+			kfree(param.pe_data);
 			return NULL;
 		}
 
 		if (ports[1] == svc->port && svc->port != FTPPORT)
 			dport = dest->port;
 
-		/* Create a template */
+		/* Create a template
+		 * This adds param.pe_data to the template,
+		 * and thus param.pe_data will be destroyed
+		 * when the template expires */
 		ct = ip_vs_conn_new(&param, &dest->addr, dport,
 				    IP_VS_CONN_F_TEMPLATE, dest);
-		if (ct == NULL)
+		if (ct == NULL) {
+			kfree(param.pe_data);
 			return NULL;
+		}
 
 		ct->timeout = svc->timeout;
-	} else
+	} else {
 		/* set destination with the found template */
 		dest = ct->dest;
+		kfree(param.pe_data);
+	}
+
 	dport = dest->port;
 
 	flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
@@ -317,7 +340,7 @@ ip_vs_sched_persist(struct ip_vs_service
  *  Protocols supported: TCP, UDP
  */
 struct ip_vs_conn *
-ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb)
 {
 	struct ip_vs_conn *cp = NULL;
 	struct ip_vs_iphdr iph;
Index: lvs-test-2.6/net/netfilter/ipvs/ip_vs_sync.c
===================================================================
--- lvs-test-2.6.orig/net/netfilter/ipvs/ip_vs_sync.c	2010-10-01 22:48:42.000000000 +0900
+++ lvs-test-2.6/net/netfilter/ipvs/ip_vs_sync.c	2010-10-01 22:48:51.000000000 +0900
@@ -288,6 +288,16 @@ void ip_vs_sync_conn(struct ip_vs_conn *
 		ip_vs_sync_conn(cp->control);
 }
 
+static inline int
+ip_vs_conn_fill_param_sync(int af, int protocol,
+			   const union nf_inet_addr *caddr, __be16 cport,
+			   const union nf_inet_addr *vaddr, __be16 vport,
+			   struct ip_vs_conn_param *p)
+{
+	/* XXX: Need to take into account persistence engine */
+	ip_vs_conn_fill_param(af, protocol, caddr, cport, vaddr, vport, p);
+	return 0;
+}
 
 /*
  *      Process received multicast message and create the corresponding
@@ -372,11 +382,14 @@ static void ip_vs_process_message(const
 		}
 
 		{
-			ip_vs_conn_fill_param(AF_INET, s->protocol,
+			if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol,
 					      (union nf_inet_addr *)&s->caddr,
 					      s->cport,
 					      (union nf_inet_addr *)&s->vaddr,
-					      s->vport, &param);
+					      s->vport, &param)) {
+				pr_err("ip_vs_conn_fill_param_sync failed");
+				return;
+			}
 			if (!(flags & IP_VS_CONN_F_TEMPLATE))
 				cp = ip_vs_conn_in_get(&param);
 			else


^ permalink raw reply

* [patch v2 08/12] [PATCH 08/12] IPVS: Add persistence engine data to /proc/net/ip_vs_conn
From: Simon Horman @ 2010-10-01 14:35 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter, netfilter-devel
  Cc: Jan Engelhardt, Stephen Hemminger, Wensong Zhang,
	Julian Anastasov, Patrick McHardy
In-Reply-To: <20101001143517.645421976@akiko.akashicho.tokyo.vergenet.net>

[-- Attachment #1: 0008-IPVS-Add-persistence-engine-data-to-proc-net-ip_vs_c.patch --]
[-- Type: text/plain, Size: 2901 bytes --]

This shouldn't break compatibility with userspace as the new data
is at the end of the line.

I have confirmed that this doesn't break ipvsadm, the main (only?)
user-space user of this data.

Signed-off-by: Simon Horman <horms@verge.net.au>

---

* Jan Engelhardt suggested using netlink to do this, but it seems like
  overkill to me. I'm willing to be convinced otherwise.

v2
* Trivial rediff

Index: lvs-test-2.6/include/net/ip_vs.h
===================================================================
--- lvs-test-2.6.orig/include/net/ip_vs.h	2010-10-01 22:27:17.000000000 +0900
+++ lvs-test-2.6/include/net/ip_vs.h	2010-10-01 22:27:32.000000000 +0900
@@ -571,6 +571,7 @@ struct ip_vs_pe {
 	bool (*ct_match)(const struct ip_vs_conn_param *p,
 			 struct ip_vs_conn *ct);
 	u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, u32 initval);
+	int (*show_pe_data)(const struct ip_vs_conn *cp, char *buf);
 };
 
 /*
Index: lvs-test-2.6/net/netfilter/ipvs/ip_vs_conn.c
===================================================================
--- lvs-test-2.6.orig/net/netfilter/ipvs/ip_vs_conn.c	2010-10-01 22:27:17.000000000 +0900
+++ lvs-test-2.6/net/netfilter/ipvs/ip_vs_conn.c	2010-10-01 22:27:32.000000000 +0900
@@ -938,30 +938,44 @@ static int ip_vs_conn_seq_show(struct se
 
 	if (v == SEQ_START_TOKEN)
 		seq_puts(seq,
-   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires\n");
+   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires PEName PEData\n");
 	else {
 		const struct ip_vs_conn *cp = v;
+		char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
+		size_t len = 0;
+
+		if (cp->dest->svc->pe && cp->dest->svc->pe->show_pe_data) {
+			pe_data[0] = ' ';
+			len = strlen(cp->dest->svc->pe->name);
+			memcpy(pe_data + 1, cp->dest->svc->pe->name, len);
+			pe_data[len + 1] = ' ';
+			len += 2;
+			len += cp->dest->svc->pe->show_pe_data(cp,
+							       pe_data + len);
+		}
+		pe_data[len] = '\0';
 
 #ifdef CONFIG_IP_VS_IPV6
 		if (cp->af == AF_INET6)
-			seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %7lu\n",
+			seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
+				"%pI6 %04X %-11s %7lu%s\n",
 				ip_vs_proto_name(cp->protocol),
 				&cp->caddr.in6, ntohs(cp->cport),
 				&cp->vaddr.in6, ntohs(cp->vport),
 				&cp->daddr.in6, ntohs(cp->dport),
 				ip_vs_state_name(cp->protocol, cp->state),
-				(cp->timer.expires-jiffies)/HZ);
+				(cp->timer.expires-jiffies)/HZ, pe_data);
 		else
 #endif
 			seq_printf(seq,
 				"%-3s %08X %04X %08X %04X"
-				" %08X %04X %-11s %7lu\n",
+				" %08X %04X %-11s %7lu%s\n",
 				ip_vs_proto_name(cp->protocol),
 				ntohl(cp->caddr.ip), ntohs(cp->cport),
 				ntohl(cp->vaddr.ip), ntohs(cp->vport),
 				ntohl(cp->daddr.ip), ntohs(cp->dport),
 				ip_vs_state_name(cp->protocol, cp->state),
-				(cp->timer.expires-jiffies)/HZ);
+				(cp->timer.expires-jiffies)/HZ, pe_data);
 	}
 	return 0;
 }


^ permalink raw reply

* [patch v2 11/12] [PATCH 11/12] IPVS: Fallback if persistence engine fails
From: Simon Horman @ 2010-10-01 14:35 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter, netfilter-devel
  Cc: Jan Engelhardt, Stephen Hemminger, Wensong Zhang,
	Julian Anastasov, Patrick McHardy
In-Reply-To: <20101001143517.645421976@akiko.akashicho.tokyo.vergenet.net>

[-- Attachment #1: 0011-IPVS-Fallback-if-persistence-engine-fails.patch --]
[-- Type: text/plain, Size: 2961 bytes --]

Fall back to normal persistence handling if the persistence
engine fails to recognise a packet.

This way, at least the packet will go somewhere.

It is envisaged that iptables could be used to block packets
such if this is not desired although nf_conntrack_sip would
likely need to be enhanced first.

Signed-off-by: Simon Horman <horms@verge.net.au>

---

v2
* Trivial rediff

Index: lvs-test-2.6/net/netfilter/ipvs/ip_vs_conn.c
===================================================================
--- lvs-test-2.6.orig/net/netfilter/ipvs/ip_vs_conn.c	2010-10-01 22:27:32.000000000 +0900
+++ lvs-test-2.6/net/netfilter/ipvs/ip_vs_conn.c	2010-10-01 22:37:32.000000000 +0900
@@ -150,7 +150,7 @@ static unsigned int ip_vs_conn_hashkey(i
 
 static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p)
 {
-	if (p->pe && p->pe->hashkey_raw)
+	if (p->pe_data && p->pe->hashkey_raw)
 		return p->pe->hashkey_raw(p, ip_vs_conn_rnd) &
 			ip_vs_conn_tab_mask;
 	return ip_vs_conn_hashkey(p->af, p->protocol, p->caddr, p->cport);
@@ -340,7 +340,7 @@ struct ip_vs_conn *ip_vs_ct_in_get(const
 	ct_read_lock(hash);
 
 	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (p->pe && p->pe->ct_match) {
+		if (p->pe_data && p->pe->ct_match) {
 			if (p->pe->ct_match(p, cp))
 				goto out;
 			continue;
@@ -944,7 +944,7 @@ static int ip_vs_conn_seq_show(struct se
 		char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
 		size_t len = 0;
 
-		if (cp->dest->svc->pe && cp->dest->svc->pe->show_pe_data) {
+		if (cp->pe_data && cp->dest->svc->pe->show_pe_data) {
 			pe_data[0] = ' ';
 			len = strlen(cp->dest->svc->pe->name);
 			memcpy(pe_data + 1, cp->dest->svc->pe->name, len);
Index: lvs-test-2.6/net/netfilter/ipvs/ip_vs_core.c
===================================================================
--- lvs-test-2.6.orig/net/netfilter/ipvs/ip_vs_core.c	2010-10-01 22:27:17.000000000 +0900
+++ lvs-test-2.6/net/netfilter/ipvs/ip_vs_core.c	2010-10-01 22:37:32.000000000 +0900
@@ -176,7 +176,7 @@ ip_vs_set_state(struct ip_vs_conn *cp, i
 	return pp->state_transition(cp, direction, skb, pp);
 }
 
-static inline int
+static inline void
 ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
 			      struct sk_buff *skb, int protocol,
 			      const union nf_inet_addr *caddr, __be16 cport,
@@ -186,8 +186,7 @@ ip_vs_conn_fill_param_persist(const stru
 	ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p);
 	p->pe = svc->pe;
 	if (p->pe && p->pe->fill_param)
-		return p->pe->fill_param(p, skb);
-	return 0;
+		p->pe->fill_param(p, skb);
 }
 
 /*
@@ -268,9 +267,8 @@ ip_vs_sched_persist(struct ip_vs_service
 				vaddr = &fwmark;
 			}
 		}
-		if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
-						  vaddr, vport, &param))
-			return NULL;
+		ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
+					      vaddr, vport, &param);
 	}
 
 	/* Check if a template already exists */


^ permalink raw reply

* [patch v2 12/12] [PATCH 12/12] IPVS: sip persistence engine
From: Simon Horman @ 2010-10-01 14:35 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter, netfilter-devel
  Cc: Jan Engelhardt, Stephen Hemminger, Wensong Zhang,
	Julian Anastasov, Patrick McHardy
In-Reply-To: <20101001143517.645421976@akiko.akashicho.tokyo.vergenet.net>

[-- Attachment #1: 0012-IPVS-sip-persistence-engine.patch --]
[-- Type: text/plain, Size: 7063 bytes --]

Add the SIP callid as a key for persistence.

This allows multiple connections from the same IP address to be
differentiated on the basis of the callid.

When used in conjunction with the persistence mask, it allows connections
from different  IP addresses to be aggregated on the basis of the callid.

It is envisaged that a persistence mask of 0.0.0.0 will be a useful
setting.  That is, ignore the source IP address when checking for
persistence.

It is envisaged that this option will be used in conjunction with
one-packet scheduling.

This only works with UDP and cannot be made to work with TCP
within the current framework.

Signed-off-by: Simon Horman <horms@verge.net.au>

---

v1
* Use buf[] instead of poiter arithmetic in ip_vs_dbg_callid()
  As suggested by Jan Engelhardt

v2
* Use GFP_ATOMIC for allocations inside of ip_vs_sip_fill_param()
  which is called in an atomic context. This resolves the
  "scheduling while atomic" problem.
* As noted by Julian Anastasov RFC 3261 section 8.1.1.4 says
  "Call-IDs are case-sensitive and are simply compared byte-by-byte",
  so may be memcmp should be used instead of strnicmp() in
  ip_vs_sip_ct_match().
* Spelling fix in comment: persistance -> persistence
* Trivial rediff

Index: lvs-test-2.6/net/netfilter/ipvs/Kconfig
===================================================================
--- lvs-test-2.6.orig/net/netfilter/ipvs/Kconfig	2010-10-01 22:46:59.000000000 +0900
+++ lvs-test-2.6/net/netfilter/ipvs/Kconfig	2010-10-01 22:50:17.000000000 +0900
@@ -256,4 +256,11 @@ config	IP_VS_NFCT
 	  connection state to be exported to the Netfilter framework
 	  for filtering purposes.
 
+config	IP_VS_PE_SIP
+	tristate "SIP persistence engine"
+        depends on IP_VS_PROTO_UDP
+	depends on NF_CONNTRACK_SIP
+	---help---
+	  Allow persistence based on the SIP Call-ID
+
 endif # IP_VS
Index: lvs-test-2.6/net/netfilter/ipvs/Makefile
===================================================================
--- lvs-test-2.6.orig/net/netfilter/ipvs/Makefile	2010-10-01 22:50:17.000000000 +0900
+++ lvs-test-2.6/net/netfilter/ipvs/Makefile	2010-10-01 22:50:17.000000000 +0900
@@ -35,3 +35,6 @@ obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
 
 # IPVS application helpers
 obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
+
+# IPVS connection template retrievers
+obj-$(CONFIG_IP_VS_PE_SIP) += ip_vs_pe_sip.o
Index: lvs-test-2.6/net/netfilter/ipvs/ip_vs_pe_sip.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ lvs-test-2.6/net/netfilter/ipvs/ip_vs_pe_sip.c	2010-10-01 22:50:23.000000000 +0900
@@ -0,0 +1,167 @@
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <linux/netfilter/nf_conntrack_sip.h>
+
+static const char *ip_vs_dbg_callid(char *buf, size_t buf_len,
+				    const char *callid, size_t callid_len,
+				    int *idx)
+{
+	size_t len = min(min(callid_len, (size_t)64), buf_len - *idx - 1);
+	memcpy(buf + *idx, callid, len);
+	buf[*idx+len] = '\0';
+	*idx += len + 1;
+	return buf + *idx - len;
+}
+
+#define IP_VS_DEBUG_CALLID(callid, len)					\
+	ip_vs_dbg_callid(ip_vs_dbg_buf, sizeof(ip_vs_dbg_buf),		\
+			 callid, len, &ip_vs_dbg_idx)
+
+static int get_callid(const char *dptr, unsigned int dataoff,
+		      unsigned int datalen,
+		      unsigned int *matchoff, unsigned int *matchlen)
+{
+	/* Find callid */
+	while (1) {
+		int ret = ct_sip_get_header(NULL, dptr, dataoff, datalen,
+					    SIP_HDR_CALL_ID, matchoff,
+					    matchlen);
+		if (ret > 0)
+			break;
+		if (!ret)
+			return 0;
+		dataoff += *matchoff;
+	}
+
+	/* Empty callid is useless */
+	if (!*matchlen)
+		return -EINVAL;
+
+	/* Too large is useless */
+	if (*matchlen > IP_VS_PEDATA_MAXLEN)
+		return -EINVAL;
+
+	/* SIP headers are always followed by a line terminator */
+	if (*matchoff + *matchlen == datalen)
+		return -EINVAL;
+
+	/* RFC 2543 allows lines to be terminated with CR, LF or CRLF,
+	 * RFC 3261 allows only CRLF, we support both. */
+	if (*(dptr + *matchoff + *matchlen) != '\r' &&
+	    *(dptr + *matchoff + *matchlen) != '\n')
+		return -EINVAL;
+
+	IP_VS_DBG_BUF(9, "SIP callid %s (%d bytes)\n",
+		      IP_VS_DEBUG_CALLID(dptr + *matchoff, *matchlen),
+		      *matchlen);
+	return 0;
+}
+
+static int
+ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
+{
+	struct ip_vs_iphdr iph;
+	unsigned int dataoff, datalen, matchoff, matchlen;
+	const char *dptr;
+
+	ip_vs_fill_iphdr(p->af, skb_network_header(skb), &iph);
+
+	/* Only useful with UDP */
+	if (iph.protocol != IPPROTO_UDP)
+		return -EINVAL;
+
+	/* No Data ? */
+	dataoff = iph.len + sizeof(struct udphdr);
+	if (dataoff >= skb->len)
+		return -EINVAL;
+
+	dptr = skb->data + dataoff;
+	datalen = skb->len - dataoff;
+
+	if (get_callid(dptr, dataoff, datalen, &matchoff, &matchlen))
+		return -EINVAL;
+
+	p->pe_data = kmalloc(matchlen, GFP_ATOMIC);
+	if (!p->pe_data)
+		return -ENOMEM;
+
+	/* N.B: pe_data is only set on success,
+	 * this allows fallback to the default persistence logic on failure
+	 */
+	memcpy(p->pe_data, dptr + matchoff, matchlen);
+	p->pe_data_len = matchlen;
+
+	return 0;
+}
+
+static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p,
+				  struct ip_vs_conn *ct)
+
+{
+	bool ret = 0;
+
+	if (ct->af == p->af &&
+	    ip_vs_addr_equal(p->af, p->caddr, &ct->caddr) &&
+	    /* protocol should only be IPPROTO_IP if
+	     * d_addr is a fwmark */
+	    ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
+			     p->vaddr, &ct->vaddr) &&
+	    ct->vport == p->vport &&
+	    ct->flags & IP_VS_CONN_F_TEMPLATE &&
+	    ct->protocol == p->protocol &&
+	    ct->pe_data && ct->pe_data_len == p->pe_data_len &&
+	    !memcmp(ct->pe_data, p->pe_data, p->pe_data_len))
+		ret = 1;
+
+	IP_VS_DBG_BUF(9, "SIP template match %s %s->%s:%d %s\n",
+		      ip_vs_proto_name(p->protocol),
+		      IP_VS_DEBUG_CALLID(p->pe_data, p->pe_data_len),
+		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
+		      ret ? "hit" : "not hit");
+
+	return ret;
+}
+
+static u32 ip_vs_sip_hashkey_raw(const struct ip_vs_conn_param *p,
+				 u32 initval)
+{
+	return jhash(p->pe_data, p->pe_data_len, initval);
+}
+
+static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf)
+{
+	memcpy(buf, cp->pe_data, cp->pe_data_len);
+	return cp->pe_data_len;
+}
+
+static struct ip_vs_pe ip_vs_sip_pe =
+{
+	.name =			"sip",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_sip_pe.n_list),
+	.fill_param =		ip_vs_sip_fill_param,
+	.ct_match =		ip_vs_sip_ct_match,
+	.hashkey_raw =		ip_vs_sip_hashkey_raw,
+	.show_pe_data =	ip_vs_sip_show_pe_data,
+};
+
+static int __init ip_vs_sip_init(void)
+{
+	return register_ip_vs_pe(&ip_vs_sip_pe);
+}
+
+static void __exit ip_vs_sip_cleanup(void)
+{
+	unregister_ip_vs_pe(&ip_vs_sip_pe);
+}
+
+module_init(ip_vs_sip_init);
+module_exit(ip_vs_sip_cleanup);
+MODULE_LICENSE("GPL");


^ permalink raw reply

* [patch v2 0/2] [patch v1 0/2] ipvsadm: SIP Persistence Engine
From: Simon Horman @ 2010-10-01 14:40 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter, netfilter-devel
  Cc: Jan Engelhardt, Stephen Hemminger, Wensong Zhang,
	Julian Anastasov, Patrick McHardy

This series is the ipvsadm companion to the kernel
patch series "IPVS: SIP Persistence Engine" v2.


^ permalink raw reply

* [patch v2 2/2] [PATCH 2/2] Add support for persistence engines.
From: Simon Horman @ 2010-10-01 14:40 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter, netfilter-devel
  Cc: Jan Engelhardt, Stephen Hemminger, Wensong Zhang,
	Julian Anastasov, Patrick McHardy
In-Reply-To: <20101001144041.414393254@akiko.akashicho.tokyo.vergenet.net>

[-- Attachment #1: 0002-Add-support-for-persistence-engines.patch --]
[-- Type: text/plain, Size: 13923 bytes --]

This adds the --pe [engine] option to the -A and -E commands
which allows a persistence engine to be associated with a virtual service.
The absence of --pe sets no persistence engine.

The --pe option only works when ipvsadm is compiled to use netlink
for user-space/kernel communication.

This patch also allows the --persistent-conn option to be given to the -L
command, which will list persistence engine data, if any is present, when
listing connections (and persistence templates).

At this time the only (proposed) persistence engine is sip.

Signed-off-by: Simon Horman <horms@verge.net.au>

---

v0.4
* Fix indentation of --pe help text

v2
* Only display pe_data if it is present

Index: github.com/Makefile
===================================================================
--- github.com.orig/Makefile	2010-10-01 22:58:26.000000000 +0900
+++ github.com/Makefile	2010-10-01 23:00:10.000000000 +0900
@@ -29,6 +29,7 @@ NAME		= ipvsadm
 VERSION		= $(shell cat VERSION)
 RELEASE		= 1
 SCHEDULERS	= "$(shell cat SCHEDULERS)"
+PE_LIST		= "$(shell cat PERSISTENCE_ENGINES)"
 PROGROOT	= $(shell basename `pwd`)
 ARCH		= $(shell uname -m)
 RPMSOURCEDIR	= $(shell rpm --eval '%_sourcedir')
@@ -83,7 +84,7 @@ ifneq (0,$(HAVE_NL))
 LIBS		+= -lnl
 endif
 DEFINES		= -DVERSION=\"$(VERSION)\" -DSCHEDULERS=\"$(SCHEDULERS)\" \
-		  $(POPT_DEFINE)
+		  -DPE_LIST=\"$(PE_LIST)\" $(POPT_DEFINE)
 DEFINES		+= $(shell if [ ! -f ../ip_vs.h ]; then	\
 		     echo "-DHAVE_NET_IP_VS_H"; fi;)
 
Index: github.com/PERSISTENCE_ENGINES
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ github.com/PERSISTENCE_ENGINES	2010-10-01 23:00:10.000000000 +0900
@@ -0,0 +1 @@
+sip
Index: github.com/ipvsadm.8
===================================================================
--- github.com.orig/ipvsadm.8	2010-09-26 22:07:58.000000000 +0900
+++ github.com/ipvsadm.8	2010-10-01 23:00:10.000000000 +0900
@@ -391,6 +391,10 @@ with this option will display the persis
 information of each server in service listing. The persistent
 connection is used to forward the actual connections from the same
 client/network to the same server.
+.sp
+The \fIlist\fP command with the -c, --connection option and this option
+will include persistence engine data, if any is present, when listing
+connections.
 .TP
 .B --sort
 Sort the list of virtual services and real servers. The virtual
Index: github.com/ipvsadm.c
===================================================================
--- github.com.orig/ipvsadm.c	2010-10-01 23:00:09.000000000 +0900
+++ github.com/ipvsadm.c	2010-10-01 23:00:20.000000000 +0900
@@ -181,13 +181,15 @@ static const char* cmdnames[] = {
 #define OPT_SYNCID		0x080000
 #define OPT_EXACT		0x100000
 #define OPT_ONEPACKET		0x200000
-#define NUMBER_OF_OPT		22
+#define OPT_PERSISTENCE_ENGINE  0x400000
+#define NUMBER_OF_OPT		23
 
 static const char* optnames[] = {
 	"numeric",
 	"connection",
 	"service-address",
 	"scheduler",
+	"pe",
 	"persistent",
 	"netmask",
 	"real-server",
@@ -282,6 +284,7 @@ enum {
 	TAG_PERSISTENTCONN,
 	TAG_SORT,
 	TAG_NO_SORT,
+	TAG_PERSISTENCE_ENGINE,
 };
 
 /* various parsing helpers & parsing functions */
@@ -421,6 +424,8 @@ parse_options(int argc, char **argv, str
 		{ "exact", 'X', POPT_ARG_NONE, NULL, 'X', NULL, NULL },
 		{ "ipv6", '6', POPT_ARG_NONE, NULL, '6', NULL, NULL },
 		{ "ops", 'o', POPT_ARG_NONE, NULL, 'o', NULL, NULL },
+		{ "pe", '\0', POPT_ARG_STRING, &optarg, TAG_PERSISTENCE_ENGINE,
+		  NULL, NULL },
 		{ NULL, 0, 0, NULL, 0, NULL, NULL }
 	};
 
@@ -647,6 +652,10 @@ parse_options(int argc, char **argv, str
 			set_option(options, OPT_ONEPACKET);
 			ce->svc.flags |= IP_VS_SVC_F_ONEPACKET;
 			break;
+		case TAG_PERSISTENCE_ENGINE:
+			set_option(options, OPT_PERSISTENCE_ENGINE);
+			strncpy(ce->svc.pe_name, optarg, IP_VS_PENAME_MAXLEN);
+			break;
 		default:
 			fail(2, "invalid option `%s'",
 			     poptBadOption(context, POPT_BADOPTION_NOALIAS));
@@ -763,9 +772,10 @@ static int process_options(int argc, cha
 
 	switch (ce.cmd) {
 	case CMD_LIST:
-		if (options & (OPT_CONNECTION|OPT_TIMEOUT|OPT_DAEMON) &&
-		    options & (OPT_STATS|OPT_PERSISTENTCONN|
-			       OPT_RATE|OPT_THRESHOLDS))
+		if ((options & (OPT_CONNECTION|OPT_TIMEOUT|OPT_DAEMON) &&
+		     options & (OPT_STATS|OPT_RATE|OPT_THRESHOLDS)) ||
+		    (options & (OPT_TIMEOUT|OPT_DAEMON) &&
+		     options & OPT_PERSISTENTCONN))
 			fail(2, "options conflicts in the list command");
 
 		if (options & OPT_CONNECTION)
@@ -1060,7 +1070,7 @@ static void usage_exit(const char *progr
 	version(stream);
 	fprintf(stream,
 		"Usage:\n"
-		"  %s -A|E -t|u|f service-address [-s scheduler] [-p [timeout]] [-M netmask]\n"
+		"  %s -A|E -t|u|f service-address [-s scheduler] [-p [timeout]] [-M netmask] [--pe persistence_engine]\n"
 		"  %s -D -t|u|f service-address\n"
 		"  %s -C\n"
 		"  %s -R\n"
@@ -1105,6 +1115,8 @@ static void usage_exit(const char *progr
 		"  --ipv6         -6                   fwmark entry uses IPv6\n"
 		"  --scheduler    -s scheduler         one of " SCHEDULERS ",\n"
 		"                                      the default scheduler is %s.\n"
+		"  --pe            engine              alternate persistence engine may be " PE_LIST ",\n"
+		"                                      not set by default.\n"
 		"  --persistent   -p [timeout]         persistent service\n"
 		"  --netmask      -M netmask           persistent granularity mask\n"
 		"  --real-server  -r server-address    server-address is host (and port)\n"
@@ -1225,6 +1237,8 @@ static void print_conn(char *buf, unsign
 	char            state[16];
 	unsigned int    expires;
 	unsigned short  af = AF_INET;
+	char		pe_name[IP_VS_PENAME_MAXLEN];
+	char		pe_data[IP_VS_PEDATA_MAXLEN];
 
 	int n;
 	char temp1[INET6_ADDRSTRLEN], temp2[INET6_ADDRSTRLEN], temp3[INET6_ADDRSTRLEN];
@@ -1232,9 +1246,10 @@ static void print_conn(char *buf, unsign
 	unsigned int	minutes, seconds;
 	char		expire_str[12];
 
-	if ((n = sscanf(buf, "%s %s %hX %s %hX %s %hX %s %d",
+	if ((n = sscanf(buf, "%s %s %hX %s %hX %s %hX %s %d %s %s",
 			protocol, temp1, &cport, temp2, &vport,
-			temp3, &dport, state, &expires)) == -1)
+			temp3, &dport, state, &expires,
+			pe_name, pe_data)) == -1)
 		exit(1);
 
 	if (strcmp(protocol, "TCP") == 0)
@@ -1268,8 +1283,13 @@ static void print_conn(char *buf, unsign
 	minutes = expires / 60;
 	sprintf(expire_str, "%02d:%02d", minutes, seconds);
 
-	printf("%-3s %-6s %-11s %-18s %-18s %s\n",
-	       protocol, expire_str, state, cname, vname, dname);
+	if (format & FMT_PERSISTENTCONN && n == 11)
+		printf("%-3s %-6s %-11s %-18s %-18s %-16s %-18s %s\n",
+		       protocol, expire_str, state, cname, vname, dname,
+		       pe_name, pe_data);
+	else
+		printf("%-3s %-6s %-11s %-18s %-18s %s\n",
+		       protocol, expire_str, state, cname, vname, dname);
 
 	free(cname);
 	free(vname);
@@ -1295,8 +1315,13 @@ void list_conn(unsigned int format)
 		exit(1);
 	}
 	printf("IPVS connection entries\n");
-	printf("pro expire %-11s %-18s %-18s %s\n",
-	       "state", "source", "virtual", "destination");
+	if (format & FMT_PERSISTENTCONN)
+		printf("pro expire %-11s %-18s %-18s %-18s %-16s %s\n",
+		       "state", "source", "virtual", "destination",
+		       "pe name", "pe_data");
+	else
+		printf("pro expire %-11s %-18s %-18s %s\n",
+		       "state", "source", "virtual", "destination");
 
 	/*
 	 * Print the VS information according to the format
@@ -1459,6 +1484,8 @@ print_service_entry(ipvs_service_entry_t
 					printf(" -M %i", se->netmask);
 				}
 		}
+		if (se->pe_name[0])
+			printf(" pe %s", se->pe_name);
 		if (se->flags & IP_VS_SVC_F_ONEPACKET)
 			printf(" ops");
 	} else if (format & FMT_STATS) {
@@ -1488,6 +1515,8 @@ print_service_entry(ipvs_service_entry_t
 			if (se->af == AF_INET6)
 				if (se->netmask != 128)
 					printf(" mask %i", se->netmask);
+			if (se->pe_name[0])
+				printf(" pe %s", se->pe_name);
 			if (se->flags & IP_VS_SVC_F_ONEPACKET)
 				printf(" ops");
 		}
Index: github.com/libipvs/ip_vs.h
===================================================================
--- github.com.orig/libipvs/ip_vs.h	2010-09-26 22:07:58.000000000 +0900
+++ github.com/libipvs/ip_vs.h	2010-10-01 23:00:10.000000000 +0900
@@ -92,8 +92,11 @@
 #define IP_VS_CONN_F_ONE_PACKET	0x2000		/* forward only one packet */
 
 #define IP_VS_SCHEDNAME_MAXLEN	16
+#define IP_VS_PENAME_MAXLEN	16
 #define IP_VS_IFNAME_MAXLEN	16
 
+#define IP_VS_PEDATA_MAXLEN	255
+
 union nf_inet_addr {
         __u32           all[4];
         __be32          ip;
@@ -134,6 +137,7 @@ struct ip_vs_service_user {
 	__be32			netmask;	/* persistent netmask */
 	u_int16_t		af;
 	union nf_inet_addr	addr;
+	char			pe_name[IP_VS_PENAME_MAXLEN];
 };
 
 struct ip_vs_dest_kern {
@@ -240,6 +244,7 @@ struct ip_vs_service_entry {
 
 	u_int16_t		af;
 	union nf_inet_addr	addr;
+	char			pe_name[IP_VS_PENAME_MAXLEN];
 
 };
 
@@ -429,6 +434,9 @@ enum {
 	IPVS_SVC_ATTR_NETMASK,		/* persistent netmask */
 
 	IPVS_SVC_ATTR_STATS,		/* nested attribute for service stats */
+
+	IPVS_SVC_ATTR_PE_NAME,		/* name of scheduler */
+
 	__IPVS_SVC_ATTR_MAX,
 };
 
Index: github.com/libipvs/libipvs.c
===================================================================
--- github.com.orig/libipvs/libipvs.c	2010-09-26 22:07:58.000000000 +0900
+++ github.com/libipvs/libipvs.c	2010-10-01 23:00:10.000000000 +0900
@@ -40,6 +40,15 @@ static int family, try_nl = 1;
 	{ errno = EAFNOSUPPORT; return ret; }			\
 	s->__addr_v4 = s->addr.ip;				\
 
+#define CHECK_PE(s, ret) if (s->pe_name)			\
+	{ errno = EAFNOSUPPORT; return ret; }
+
+#define CHECK_COMPAT_DEST(s, ret) CHECK_IPV4(s, ret)
+
+#define CHECK_COMPAT_SVC(s, ret)				\
+	CHECK_IPV4(s, ret);					\
+	CHECK_PE(s, ret);
+
 #ifdef LIBIPVS_USE_NL
 struct nl_msg *ipvs_nl_message(int cmd, int flags)
 {
@@ -218,6 +227,8 @@ static int ipvs_nl_fill_service_attr(str
 	}
 
 	NLA_PUT_STRING(msg, IPVS_SVC_ATTR_SCHED_NAME, svc->sched_name);
+	if (svc->pe_name)
+		NLA_PUT_STRING(msg, IPVS_SVC_ATTR_PE_NAME, svc->pe_name);
 	NLA_PUT(msg, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
 	NLA_PUT_U32(msg, IPVS_SVC_ATTR_TIMEOUT, svc->timeout);
 	NLA_PUT_U32(msg, IPVS_SVC_ATTR_NETMASK, svc->netmask);
@@ -245,7 +256,7 @@ int ipvs_add_service(ipvs_service_t *svc
 	}
 #endif
 
-	CHECK_IPV4(svc, -1);
+	CHECK_COMPAT_SVC(svc, -1);
 	return setsockopt(sockfd, IPPROTO_IP, IP_VS_SO_SET_ADD, (char *)svc,
 			  sizeof(struct ip_vs_service_kern));
 }
@@ -265,7 +276,7 @@ int ipvs_update_service(ipvs_service_t *
 		return ipvs_nl_send_message(msg, ipvs_nl_noop_cb, NULL);
 	}
 #endif
-	CHECK_IPV4(svc, -1);
+	CHECK_COMPAT_SVC(svc, -1);
 	return setsockopt(sockfd, IPPROTO_IP, IP_VS_SO_SET_EDIT, (char *)svc,
 			  sizeof(struct ip_vs_service_kern));
 }
@@ -285,7 +296,7 @@ int ipvs_del_service(ipvs_service_t *svc
 		return ipvs_nl_send_message(msg, ipvs_nl_noop_cb, NULL);
 	}
 #endif
-	CHECK_IPV4(svc, -1);
+	CHECK_COMPAT_SVC(svc, -1);
 	return setsockopt(sockfd, IPPROTO_IP, IP_VS_SO_SET_DEL, (char *)svc,
 			  sizeof(struct ip_vs_service_kern));
 }
@@ -310,7 +321,7 @@ int ipvs_zero_service(ipvs_service_t *sv
 		return ipvs_nl_send_message(msg, ipvs_nl_noop_cb, NULL);
 	}
 #endif
-	CHECK_IPV4(svc, -1);
+	CHECK_COMPAT_SVC(svc, -1);
 	return setsockopt(sockfd, IPPROTO_IP, IP_VS_SO_SET_ZERO, (char *)svc,
 			  sizeof(struct ip_vs_service_kern));
 }
@@ -360,8 +371,8 @@ nla_put_failure:
 	}
 #endif
 
-	CHECK_IPV4(svc, -1);
-	CHECK_IPV4(dest, -1);
+	CHECK_COMPAT_SVC(svc, -1);
+	CHECK_COMPAT_DEST(dest, -1);
 	memcpy(&svcdest.svc, svc, sizeof(svcdest.svc));
 	memcpy(&svcdest.dest, dest, sizeof(svcdest.dest));
 	return setsockopt(sockfd, IPPROTO_IP, IP_VS_SO_SET_ADDDEST,
@@ -389,8 +400,8 @@ nla_put_failure:
 		return -1;
 	}
 #endif
-	CHECK_IPV4(svc, -1);
-	CHECK_IPV4(dest, -1);
+	CHECK_COMPAT_SVC(svc, -1);
+	CHECK_COMPAT_DEST(dest, -1);
 	memcpy(&svcdest.svc, svc, sizeof(svcdest.svc));
 	memcpy(&svcdest.dest, dest, sizeof(svcdest.dest));
 	return setsockopt(sockfd, IPPROTO_IP, IP_VS_SO_SET_EDITDEST,
@@ -419,8 +430,8 @@ nla_put_failure:
 	}
 #endif
 
-	CHECK_IPV4(svc, -1);
-	CHECK_IPV4(dest, -1);
+	CHECK_COMPAT_SVC(svc, -1);
+	CHECK_COMPAT_DEST(dest, -1);
 	memcpy(&svcdest.svc, svc, sizeof(svcdest.svc));
 	memcpy(&svcdest.dest, dest, sizeof(svcdest.dest));
 	return setsockopt(sockfd, IPPROTO_IP, IP_VS_SO_SET_DELDEST,
@@ -593,6 +604,11 @@ static int ipvs_services_parse_cb(struct
 		nla_get_string(svc_attrs[IPVS_SVC_ATTR_SCHED_NAME]),
 		IP_VS_SCHEDNAME_MAXLEN);
 
+	if (svc_attrs[IPVS_SVC_ATTR_PE_NAME])
+		strncpy(get->entrytable[i].pe_name,
+			nla_get_string(svc_attrs[IPVS_SVC_ATTR_PE_NAME]),
+			IP_VS_PENAME_MAXLEN);
+
 	get->entrytable[i].netmask = nla_get_u32(svc_attrs[IPVS_SVC_ATTR_NETMASK]);
 	get->entrytable[i].timeout = nla_get_u32(svc_attrs[IPVS_SVC_ATTR_TIMEOUT]);
 	nla_memcpy(&flags, svc_attrs[IPVS_SVC_ATTR_FLAGS], sizeof(flags));
@@ -937,7 +953,8 @@ ipvs_get_service_err2:
 	}
 #endif
 
-	CHECK_IPV4(svc, NULL);
+	CHECK_COMPAT_SVC(svc, NULL);
+	CHECK_PE(svc, NULL);
 	if (getsockopt(sockfd, IPPROTO_IP, IP_VS_SO_GET_SERVICE,
 		       (char *)svc, &len)) {
 		free(svc);
@@ -945,6 +962,7 @@ ipvs_get_service_err2:
 	}
 	svc->af = AF_INET;
 	svc->addr.ip = svc->__addr_v4;
+	svc->pe_name[0] = '\0';
 	return svc;
 }
 
@@ -1086,9 +1104,9 @@ const char *ipvs_strerror(int err)
 		const char *message;
 	} table [] = {
 		{ ipvs_add_service, EEXIST, "Service already exists" },
-		{ ipvs_add_service, ENOENT, "Scheduler not found" },
+		{ ipvs_add_service, ENOENT, "Scheduler or persistence engine not found" },
 		{ ipvs_update_service, ESRCH, "No such service" },
-		{ ipvs_update_service, ENOENT, "Scheduler not found" },
+		{ ipvs_update_service, ENOENT, "Scheduler or persistence engine not found" },
 		{ ipvs_del_service, ESRCH, "No such service" },
 		{ ipvs_zero_service, ESRCH, "No such service" },
 		{ ipvs_add_dest, ESRCH, "Service not defined" },


^ permalink raw reply

* Re: [PATCH 0/2] qcusbnet: Cleanups
From: Elly Jones @ 2010-10-01 14:42 UTC (permalink / raw)
  To: Joe Perches; +Cc: David Miller, netdev, dbrownell, mjg59, jglasgow, msb, olofj
In-Reply-To: <1285893935.10780.11.camel@Joe-Laptop>

On Thu, Sep 30, 2010 at 05:45:35PM -0700, Joe Perches wrote:
> On Thu, 2010-09-30 at 17:33 -0700, David Miller wrote:
> > From: Joe Perches <joe@perches.com>
> > Date: Tue, 28 Sep 2010 19:39:56 -0700
> > > Perhaps some of these cleanups are in order?
> > I don't see this driver in any of my trees, so someone else
> > should be taking this in it seems.
> 
> These cleanups are meant for Elly Jones on top
> of the Qualcomm Gobi 2000 driver she submitted.
> 
> http://patchwork.ozlabs.org/patch/66006/
> 

Wow, thank you! I'll incorporate your fixes, make sure I have clean
checkpatch output, and send a v2.

-- Elly

^ permalink raw reply

* [patch v2 1/2] [PATCH 1/2] Slightly simplify options conflicts logic
From: Simon Horman @ 2010-10-01 14:40 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter, netfilter-devel
  Cc: Jan Engelhardt, Stephen Hemminger, Wensong Zhang,
	Julian Anastasov, Patrick McHardy
In-Reply-To: <20101001144041.414393254@akiko.akashicho.tokyo.vergenet.net>

[-- Attachment #1: 0001-Slightly-simplify-options-conflicts-logic.patch --]
[-- Type: text/plain, Size: 744 bytes --]

Signed-off-by: Simon Horman <horms@verge.net.au>

diff --git a/ipvsadm.c b/ipvsadm.c
index 76ec7c4..1ac6c7f 100644
--- a/ipvsadm.c
+++ b/ipvsadm.c
@@ -763,11 +763,9 @@ static int process_options(int argc, char **argv, int reading_stdin)
 
 	switch (ce.cmd) {
 	case CMD_LIST:
-		if ((options & OPT_CONNECTION ||
-		     options & OPT_TIMEOUT || options & OPT_DAEMON) &&
-		    (options & OPT_STATS ||
-		     options & OPT_PERSISTENTCONN ||
-		     options & OPT_RATE || options & OPT_THRESHOLDS))
+		if (options & (OPT_CONNECTION|OPT_TIMEOUT|OPT_DAEMON) &&
+		    options & (OPT_STATS|OPT_PERSISTENTCONN|
+			       OPT_RATE|OPT_THRESHOLDS))
 			fail(2, "options conflicts in the list command");
 
 		if (options & OPT_CONNECTION)
-- 
1.7.1



^ permalink raw reply related

* [patch v2 02/12] [PATCH 02/12] netfilter: nf_conntrack_sip: Add callid parser
From: Simon Horman @ 2010-10-01 14:35 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter, netfilter-devel
  Cc: Jan Engelhardt, Stephen Hemminger, Wensong Zhang,
	Julian Anastasov, Patrick McHardy
In-Reply-To: <20101001143517.645421976@akiko.akashicho.tokyo.vergenet.net>

[-- Attachment #1: 0002-netfilter-nf_conntrack_sip-Add-callid-parser.patch --]
[-- Type: text/plain, Size: 2444 bytes --]

Signed-off-by: Simon Horman <horms@verge.net.au>

--- 

The motivation for this is for it to be used by LVS as per
subsequent patches.

* Patrick McHardy suggested changing word_len to check for the
  next newline or whitespace. But I believe this is incorrect.
  For example '#' would be permitted but it is invalid
  in a word according to RFC3261.


diff --git a/include/linux/netfilter/nf_conntrack_sip.h b/include/linux/netfilter/nf_conntrack_sip.h
index ff8cfbc..0ce91d5 100644
--- a/include/linux/netfilter/nf_conntrack_sip.h
+++ b/include/linux/netfilter/nf_conntrack_sip.h
@@ -89,6 +89,7 @@ enum sip_header_types {
 	SIP_HDR_VIA_TCP,
 	SIP_HDR_EXPIRES,
 	SIP_HDR_CONTENT_LENGTH,
+	SIP_HDR_CALL_ID,
 };
 
 enum sdp_header_types {
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 2fd1ea2..715ce54 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -130,6 +130,44 @@ static int digits_len(const struct nf_conn *ct, const char *dptr,
 	return len;
 }
 
+static int iswordc(const char c)
+{
+	if (isalnum(c) || c == '!' || c == '"' || c == '%' ||
+	    (c >= '(' && c <= '/') || c == ':' || c == '<' || c == '>' ||
+	    c == '?' || (c >= '[' && c <= ']') || c == '_' || c == '`' ||
+	    c == '{' || c == '}' || c == '~')
+		return 1;
+	return 0;
+}
+
+static int word_len(const char *dptr, const char *limit)
+{
+	int len = 0;
+	while (dptr < limit && iswordc(*dptr)) {
+		dptr++;
+		len++;
+	}
+	return len;
+}
+
+static int callid_len(const struct nf_conn *ct, const char *dptr,
+		      const char *limit, int *shift)
+{
+	int len, domain_len;
+
+	len = word_len(dptr, limit);
+	dptr += len;
+	if (!len || dptr == limit || *dptr != '@')
+		return len;
+	dptr++;
+	len++;
+
+	domain_len = word_len(dptr, limit);
+	if (!domain_len)
+		return 0;
+	return len + domain_len;
+}
+
 /* get media type + port length */
 static int media_len(const struct nf_conn *ct, const char *dptr,
 		     const char *limit, int *shift)
@@ -299,6 +337,7 @@ static const struct sip_header ct_sip_hdrs[] = {
 	[SIP_HDR_VIA_TCP]		= SIP_HDR("Via", "v", "TCP ", epaddr_len),
 	[SIP_HDR_EXPIRES]		= SIP_HDR("Expires", NULL, NULL, digits_len),
 	[SIP_HDR_CONTENT_LENGTH]	= SIP_HDR("Content-Length", "l", NULL, digits_len),
+	[SIP_HDR_CALL_ID]		= SIP_HDR("Call-Id", "i", NULL, callid_len),
 };
 
 static const char *sip_follow_continuation(const char *dptr, const char *limit)
-- 
1.7.1



^ permalink raw reply related

* [patch v2 06/12] [PATCH 06/12] IPVS: ip_vs_{un,}bind_scheduler NULL arguments
From: Simon Horman @ 2010-10-01 14:35 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter, netfilter-devel
  Cc: Jan Engelhardt, Stephen Hemminger, Wensong Zhang,
	Julian Anastasov, Patrick McHardy
In-Reply-To: <20101001143517.645421976@akiko.akashicho.tokyo.vergenet.net>

[-- Attachment #1: 0006-IPVS-ip_vs_-un-bind_scheduler-NULL-arguments.patch --]
[-- Type: text/plain, Size: 1770 bytes --]

In general NULL arguments aren't passed by the few callers that exist,
so don't test for them.

The exception is to make passing NULL to ip_vs_unbind_scheduler() a noop.

Signed-off-by: Simon Horman <horms@verge.net.au>

v2
* Trivial rediff

diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 84dae47..d57cc4a 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1229,8 +1229,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 
  out_err:
 	if (svc != NULL) {
-		if (svc->scheduler)
-			ip_vs_unbind_scheduler(svc);
+		ip_vs_unbind_scheduler(svc);
 		if (svc->inc) {
 			local_bh_disable();
 			ip_vs_app_inc_put(svc->inc);
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
index cd77902..be0780a 100644
--- a/net/netfilter/ipvs/ip_vs_sched.c
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -46,15 +46,6 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
 {
 	int ret;
 
-	if (svc == NULL) {
-		pr_err("%s(): svc arg NULL\n", __func__);
-		return -EINVAL;
-	}
-	if (scheduler == NULL) {
-		pr_err("%s(): scheduler arg NULL\n", __func__);
-		return -EINVAL;
-	}
-
 	svc->scheduler = scheduler;
 
 	if (scheduler->init_service) {
@@ -74,18 +65,10 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
  */
 int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
 {
-	struct ip_vs_scheduler *sched;
+	struct ip_vs_scheduler *sched = svc->scheduler;
 
-	if (svc == NULL) {
-		pr_err("%s(): svc arg NULL\n", __func__);
-		return -EINVAL;
-	}
-
-	sched = svc->scheduler;
-	if (sched == NULL) {
-		pr_err("%s(): svc isn't bound\n", __func__);
-		return -EINVAL;
-	}
+	if (!sched)
+		return 0;
 
 	if (sched->done_service) {
 		if (sched->done_service(svc) != 0) {
-- 
1.7.1



^ permalink raw reply related

* [patch v2 09/12] [PATCH 09/12] IPVS: management of persistence engine modules
From: Simon Horman @ 2010-10-01 14:35 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter, netfilter-devel
  Cc: Jan Engelhardt, Stephen Hemminger, Wensong Zhang,
	Julian Anastasov, Patrick McHardy
In-Reply-To: <20101001143517.645421976@akiko.akashicho.tokyo.vergenet.net>

[-- Attachment #1: 0009-IPVS-management-of-persistence-engine-modules.patch --]
[-- Type: text/plain, Size: 5704 bytes --]

This is based heavily on the scheduler management code

Signed-off-by: Simon Horman <horms@verge.net.au>

---

v0.4
* Export register_ip_vs_pe and unregister_ip_vs_pe
* Use one line comment format for one line comments.
* Only use at most one blank line consecutively

v1
* As suggested by Stephen Hemminger
  - Convert __ip_vs_pe_lock from a rwlock to a spinlock.
    This code isn't performance-critical, so there is no need for RCU.
  - Rename __ip_vs_pe_lock as ip_vs_pe_lock
* Stephen also suggested open-coding ip_vs_{un,}bind_pe()
  as they are very short. But I would prefer to keep them as they are used
  along side ip_vs_{un,}bind_scheduler which are too long to be open-coded.

v2
* Update for recent addition of ip_vs_nfct.c
* Trivial rediff

Index: lvs-test-2.6/include/net/ip_vs.h
===================================================================
--- lvs-test-2.6.orig/include/net/ip_vs.h	2010-10-01 22:27:32.000000000 +0900
+++ lvs-test-2.6/include/net/ip_vs.h	2010-10-01 22:30:16.000000000 +0900
@@ -795,6 +795,12 @@ extern int ip_vs_app_pkt_in(struct ip_vs
 extern int ip_vs_app_init(void);
 extern void ip_vs_app_cleanup(void);
 
+void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe);
+void ip_vs_unbind_pe(struct ip_vs_service *svc);
+int register_ip_vs_pe(struct ip_vs_pe *pe);
+int unregister_ip_vs_pe(struct ip_vs_pe *pe);
+extern struct ip_vs_pe *ip_vs_pe_get(const char *name);
+extern void ip_vs_pe_put(struct ip_vs_pe *pe);
 
 /*
  *	IPVS protocol functions (from ip_vs_proto.c)
Index: lvs-test-2.6/net/netfilter/ipvs/ip_vs_pe.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ lvs-test-2.6/net/netfilter/ipvs/ip_vs_pe.c	2010-10-01 22:30:16.000000000 +0900
@@ -0,0 +1,147 @@
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <asm/string.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+
+#include <net/ip_vs.h>
+
+/* IPVS pe list */
+static LIST_HEAD(ip_vs_pe);
+
+/* lock for service table */
+static DEFINE_SPINLOCK(ip_vs_pe_lock);
+
+/* Bind a service with a pe */
+void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe)
+{
+	svc->pe = pe;
+}
+
+/* Unbind a service from its pe */
+void ip_vs_unbind_pe(struct ip_vs_service *svc)
+{
+	svc->pe = NULL;
+}
+
+/* Get pe in the pe list by name */
+static struct ip_vs_pe *
+ip_vs_pe_getbyname(const char *pe_name)
+{
+	struct ip_vs_pe *pe;
+
+	IP_VS_DBG(2, "%s(): pe_name \"%s\"\n", __func__,
+		  pe_name);
+
+	spin_lock_bh(&ip_vs_pe_lock);
+
+	list_for_each_entry(pe, &ip_vs_pe, n_list) {
+		/* Test and get the modules atomically */
+		if (pe->module &&
+		    !try_module_get(pe->module)) {
+			/* This pe is just deleted */
+			continue;
+		}
+		if (strcmp(pe_name, pe->name)==0) {
+			/* HIT */
+			spin_unlock_bh(&ip_vs_pe_lock);
+			return pe;
+		}
+		if (pe->module)
+			module_put(pe->module);
+	}
+
+	spin_unlock_bh(&ip_vs_pe_lock);
+	return NULL;
+}
+
+/* Lookup pe and try to load it if it doesn't exist */
+struct ip_vs_pe *ip_vs_pe_get(const char *name)
+{
+	struct ip_vs_pe *pe;
+
+	/* Search for the pe by name */
+	pe = ip_vs_pe_getbyname(name);
+
+	/* If pe not found, load the module and search again */
+	if (!pe) {
+		request_module("ip_vs_pe_%s", name);
+		pe = ip_vs_pe_getbyname(name);
+	}
+
+	return pe;
+}
+
+void ip_vs_pe_put(struct ip_vs_pe *pe)
+{
+	if (pe && pe->module)
+		module_put(pe->module);
+}
+
+/* Register a pe in the pe list */
+int register_ip_vs_pe(struct ip_vs_pe *pe)
+{
+	struct ip_vs_pe *tmp;
+
+	/* increase the module use count */
+	ip_vs_use_count_inc();
+
+	spin_lock_bh(&ip_vs_pe_lock);
+
+	if (!list_empty(&pe->n_list)) {
+		spin_unlock_bh(&ip_vs_pe_lock);
+		ip_vs_use_count_dec();
+		pr_err("%s(): [%s] pe already linked\n",
+		       __func__, pe->name);
+		return -EINVAL;
+	}
+
+	/* Make sure that the pe with this name doesn't exist
+	 * in the pe list.
+	 */
+	list_for_each_entry(tmp, &ip_vs_pe, n_list) {
+		if (strcmp(tmp->name, pe->name) == 0) {
+			spin_unlock_bh(&ip_vs_pe_lock);
+			ip_vs_use_count_dec();
+			pr_err("%s(): [%s] pe already existed "
+			       "in the system\n", __func__, pe->name);
+			return -EINVAL;
+		}
+	}
+	/* Add it into the d-linked pe list */
+	list_add(&pe->n_list, &ip_vs_pe);
+	spin_unlock_bh(&ip_vs_pe_lock);
+
+	pr_info("[%s] pe registered.\n", pe->name);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(register_ip_vs_pe);
+
+/* Unregister a pe from the pe list */
+int unregister_ip_vs_pe(struct ip_vs_pe *pe)
+{
+	spin_lock_bh(&ip_vs_pe_lock);
+	if (list_empty(&pe->n_list)) {
+		spin_unlock_bh(&ip_vs_pe_lock);
+		pr_err("%s(): [%s] pe is not in the list. failed\n",
+		       __func__, pe->name);
+		return -EINVAL;
+	}
+
+	/* Remove it from the d-linked pe list */
+	list_del(&pe->n_list);
+	spin_unlock_bh(&ip_vs_pe_lock);
+
+	/* decrease the module use count */
+	ip_vs_use_count_dec();
+
+	pr_info("[%s] pe unregistered.\n", pe->name);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(unregister_ip_vs_pe);
Index: lvs-test-2.6/net/netfilter/ipvs/Makefile
===================================================================
--- lvs-test-2.6.orig/net/netfilter/ipvs/Makefile	2010-10-01 22:30:26.000000000 +0900
+++ lvs-test-2.6/net/netfilter/ipvs/Makefile	2010-10-01 22:30:59.000000000 +0900
@@ -14,7 +14,7 @@ ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) +=
 
 ip_vs-objs :=	ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o	   \
 		ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o	   		   \
-		ip_vs_est.o ip_vs_proto.o 				   \
+		ip_vs_est.o ip_vs_proto.o ip_vs_pe.o			   \
 		$(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y)
 
 


^ permalink raw reply

* [patch v2 10/12] [PATCH 10/12] IPVS: Allow configuration of persistence engines
From: Simon Horman @ 2010-10-01 14:35 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter, netfilter-devel
  Cc: Jan Engelhardt, Stephen Hemminger, Wensong Zhang,
	Julian Anastasov, Patrick McHardy
In-Reply-To: <20101001143517.645421976@akiko.akashicho.tokyo.vergenet.net>

[-- Attachment #1: 0010-IPVS-Allow-configuration-of-persistence-engines.patch --]
[-- Type: text/plain, Size: 6258 bytes --]

Allow the persistence engine of a virtual service to be set, edited
and unset.

This feature only works with the netlink user-space interface.

Signed-off-by: Simon Horman <horms@verge.net.au>

---

v2
* Dereference persistence engine when a service is deleted.
  This allows persistence modules to be removed from the kernel
  once they are no longer used.
* Update for the recent "ipvs: changes related to service usecnt" change
* Trivial rediff

Index: lvs-test-2.6/include/linux/ip_vs.h
===================================================================
--- lvs-test-2.6.orig/include/linux/ip_vs.h	2010-10-01 22:41:07.000000000 +0900
+++ lvs-test-2.6/include/linux/ip_vs.h	2010-10-01 22:42:08.000000000 +0900
@@ -336,6 +336,9 @@ enum {
 	IPVS_SVC_ATTR_NETMASK,		/* persistent netmask */
 
 	IPVS_SVC_ATTR_STATS,		/* nested attribute for service stats */
+
+	IPVS_SVC_ATTR_PE_NAME,		/* name of ct retriever */
+
 	__IPVS_SVC_ATTR_MAX,
 };
 
Index: lvs-test-2.6/include/net/ip_vs.h
===================================================================
--- lvs-test-2.6.orig/include/net/ip_vs.h	2010-10-01 22:41:54.000000000 +0900
+++ lvs-test-2.6/include/net/ip_vs.h	2010-10-01 22:42:08.000000000 +0900
@@ -444,6 +444,7 @@ struct ip_vs_service_user_kern {
 
 	/* virtual service options */
 	char			*sched_name;
+	char			*pe_name;
 	unsigned		flags;		/* virtual service flags */
 	unsigned		timeout;	/* persistent timeout in sec */
 	u32			netmask;	/* persistent netmask */
Index: lvs-test-2.6/net/netfilter/ipvs/ip_vs_ctl.c
===================================================================
--- lvs-test-2.6.orig/net/netfilter/ipvs/ip_vs_ctl.c	2010-10-01 22:41:06.000000000 +0900
+++ lvs-test-2.6/net/netfilter/ipvs/ip_vs_ctl.c	2010-10-01 22:44:51.000000000 +0900
@@ -1134,6 +1134,7 @@ ip_vs_add_service(struct ip_vs_service_u
 {
 	int ret = 0;
 	struct ip_vs_scheduler *sched = NULL;
+	struct ip_vs_pe *pe = NULL;
 	struct ip_vs_service *svc = NULL;
 
 	/* increase the module use count */
@@ -1147,6 +1148,16 @@ ip_vs_add_service(struct ip_vs_service_u
 		goto out_err;
 	}
 
+	if (u->pe_name && *u->pe_name) {
+		pe = ip_vs_pe_get(u->pe_name);
+		if (pe == NULL) {
+			pr_info("persistence engine module ip_vs_pe_%s "
+				"not found\n", u->pe_name);
+			ret = -ENOENT;
+			goto out_err;
+		}
+	}
+
 #ifdef CONFIG_IP_VS_IPV6
 	if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
 		ret = -EINVAL;
@@ -1184,6 +1195,10 @@ ip_vs_add_service(struct ip_vs_service_u
 		goto out_err;
 	sched = NULL;
 
+	/* Bind the ct retriever */
+	ip_vs_bind_pe(svc, pe);
+	pe = NULL;
+
 	/* Update the virtual service counters */
 	if (svc->port == FTPPORT)
 		atomic_inc(&ip_vs_ftpsvc_counter);
@@ -1215,6 +1230,7 @@ ip_vs_add_service(struct ip_vs_service_u
 		kfree(svc);
 	}
 	ip_vs_scheduler_put(sched);
+	ip_vs_pe_put(pe);
 
 	/* decrease the module use count */
 	ip_vs_use_count_dec();
@@ -1230,6 +1246,7 @@ static int
 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 {
 	struct ip_vs_scheduler *sched, *old_sched;
+	struct ip_vs_pe *pe = NULL, *old_pe = NULL;
 	int ret = 0;
 
 	/*
@@ -1242,6 +1259,17 @@ ip_vs_edit_service(struct ip_vs_service
 	}
 	old_sched = sched;
 
+	if (u->pe_name && *u->pe_name) {
+		pe = ip_vs_pe_get(u->pe_name);
+		if (pe == NULL) {
+			pr_info("persistence engine module ip_vs_pe_%s "
+				"not found\n", u->pe_name);
+			ret = -ENOENT;
+			goto out;
+		}
+		old_pe = pe;
+	}
+
 #ifdef CONFIG_IP_VS_IPV6
 	if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
 		ret = -EINVAL;
@@ -1293,12 +1321,17 @@ ip_vs_edit_service(struct ip_vs_service
 		}
 	}
 
+	old_pe = svc->pe;
+	if (pe != old_pe) {
+		ip_vs_unbind_pe(svc);
+		ip_vs_bind_pe(svc, pe);
+	}
+
   out_unlock:
 	write_unlock_bh(&__ip_vs_svc_lock);
-#ifdef CONFIG_IP_VS_IPV6
   out:
-#endif
 	ip_vs_scheduler_put(old_sched);
+	ip_vs_pe_put(old_pe);
 	return ret;
 }
 
@@ -1312,6 +1345,9 @@ static void __ip_vs_del_service(struct i
 {
 	struct ip_vs_dest *dest, *nxt;
 	struct ip_vs_scheduler *old_sched;
+	struct ip_vs_pe *old_pe;
+
+	pr_info("%s: enter\n", __func__);
 
 	/* Count only IPv4 services for old get/setsockopt interface */
 	if (svc->af == AF_INET)
@@ -1324,6 +1360,11 @@ static void __ip_vs_del_service(struct i
 	ip_vs_unbind_scheduler(svc);
 	ip_vs_scheduler_put(old_sched);
 
+	/* Unbind persistence engine */
+	old_pe = svc->pe;
+	ip_vs_unbind_pe(svc);
+	ip_vs_pe_put(old_pe);
+
 	/* Unbind app inc */
 	if (svc->inc) {
 		ip_vs_app_inc_put(svc->inc);
@@ -2539,6 +2580,8 @@ static const struct nla_policy ip_vs_svc
 	[IPVS_SVC_ATTR_FWMARK]		= { .type = NLA_U32 },
 	[IPVS_SVC_ATTR_SCHED_NAME]	= { .type = NLA_NUL_STRING,
 					    .len = IP_VS_SCHEDNAME_MAXLEN },
+	[IPVS_SVC_ATTR_PE_NAME]		= { .type = NLA_NUL_STRING,
+					    .len = IP_VS_PENAME_MAXLEN },
 	[IPVS_SVC_ATTR_FLAGS]		= { .type = NLA_BINARY,
 					    .len = sizeof(struct ip_vs_flags) },
 	[IPVS_SVC_ATTR_TIMEOUT]		= { .type = NLA_U32 },
@@ -2615,6 +2658,8 @@ static int ip_vs_genl_fill_service(struc
 	}
 
 	NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name);
+	if (svc->pe)
+		NLA_PUT_STRING(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name);
 	NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
 	NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ);
 	NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask);
@@ -2741,11 +2786,12 @@ static int ip_vs_genl_parse_service(stru
 
 	/* If a full entry was requested, check for the additional fields */
 	if (full_entry) {
-		struct nlattr *nla_sched, *nla_flags, *nla_timeout,
+		struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
 			      *nla_netmask;
 		struct ip_vs_flags flags;
 
 		nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
+		nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
 		nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
 		nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
 		nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
@@ -2763,6 +2809,7 @@ static int ip_vs_genl_parse_service(stru
 		usvc->flags = (usvc->flags & ~flags.mask) |
 			      (flags.flags & flags.mask);
 		usvc->sched_name = nla_data(nla_sched);
+		usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
 		usvc->timeout = nla_get_u32(nla_timeout);
 		usvc->netmask = nla_get_u32(nla_netmask);
 	}


^ permalink raw reply

* Re: [patch v2 00/12] IPVS: SIP Persistence Engine
From: Simon Horman @ 2010-10-01 15:09 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter, netfilter-devel
  Cc: Jan Engelhardt, Stephen Hemminger, Wensong Zhang,
	Julian Anastasov, Patrick McHardy
In-Reply-To: <20101001143517.645421976@akiko.akashicho.tokyo.vergenet.net>

On Fri, Oct 01, 2010 at 11:35:17PM +0900, Simon Horman wrote:
> This patch series adds load-balancing of UDP SIP based on Call-ID to
> IPVS as well as a frame-work for extending IPVS to handle alternate
> persistence requirements.

Patrick, I would like these patches considered for inclusion
in nf-next-2.6. There is some minor diffing noise between
this series and "ipvs: Keep track of backlog connections",
which I posted for inclusion in nf-next-2.6 recently. Let
me know if you want me to do something about the noise.


^ permalink raw reply

* Re: Packet time delays on multi-core systems
From: Eric Dumazet @ 2010-10-01 15:27 UTC (permalink / raw)
  To: Alexey Vlasov
  Cc: Linux Kernel Mailing List, netdev, Jeff Kirsher, Emil Tantilov
In-Reply-To: <20101001141858.GE4094@beaver.vrungel.ru>

Le vendredi 01 octobre 2010 à 18:18 +0400, Alexey Vlasov a écrit :

> NIC statistics:
>      rx_packets: 2973717440
>      tx_packets: 3032670910
>      rx_bytes: 1892633650741
>      tx_bytes: 2536130682695
>      rx_broadcast: 118773199
>      tx_broadcast: 68013
>      rx_multicast: 95257
>      tx_multicast: 0
>      rx_errors: 0
>      tx_errors: 0
>      tx_dropped: 0
>      multicast: 95257
>      collisions: 0
>      rx_length_errors: 0
>      rx_over_errors: 0
>      rx_crc_errors: 0
>      rx_frame_errors: 0
>      rx_no_buffer_count: 7939
>      rx_queue_drop_packet_count: 1324025520
>      rx_missed_errors: 146631
>      tx_aborted_errors: 0
>      tx_carrier_errors: 0
>      tx_fifo_errors: 0
>      tx_heartbeat_errors: 0
>      tx_window_errors: 0
>      tx_abort_late_coll: 0
>      tx_deferred_ok: 0
>      tx_single_coll_ok: 0
>      tx_multi_coll_ok: 0
>      tx_timeout_count: 0
>      tx_restart_queue: 50715
>      rx_long_length_errors: 0
>      rx_short_length_errors: 0
>      rx_align_errors: 0
>      tx_tcp_seg_good: 344724062
>      tx_tcp_seg_failed: 0
>      rx_flow_control_xon: 0
>      rx_flow_control_xoff: 0
>      tx_flow_control_xon: 0
>      tx_flow_control_xoff: 0
>      rx_long_byte_count: 1892633650741
>      rx_csum_offload_good: 2973697420
>      rx_csum_offload_errors: 6235
>      tx_dma_out_of_sync: 0
>      alloc_rx_buff_failed: 0
>      tx_smbus: 9327
>      rx_smbus: 118531661
>      dropped_smbus: 0
>      tx_queue_0_packets: 797617475
>      tx_queue_0_bytes: 630191908685
>      tx_queue_1_packets: 719681297
>      tx_queue_1_bytes: 625907304846
>      tx_queue_2_packets: 718841556
>      tx_queue_2_bytes: 620522418855
>      tx_queue_3_packets: 796521255
>      tx_queue_3_bytes: 646196024585
>      rx_queue_0_packets: 788885797
>      rx_queue_0_bytes: 458936338699
>      rx_queue_0_drops: 0
>      rx_queue_1_packets: 701354604
>      rx_queue_1_bytes: 457490536453
>      rx_queue_1_drops: 0
>      rx_queue_2_packets: 791887663
>      rx_queue_2_bytes: 534425333616
>      rx_queue_2_drops: 0
>      rx_queue_3_packets: 691579028
>      rx_queue_3_bytes: 429887244557
>      rx_queue_3_drops: 0
> 11.111.80: R 1983626201:1983626201(0) win 0
> > > 

OK

IGB stats are wrong... for rx_queue_drop_packet_count field at least

Here is a patch against 2.6.32.23, to get the idea...

Dont trust it unless you patch your kernel ;)

Thanks

Note: current linux-2.6 tree doesnt have this bug.

[PATCH] igb: rx_fifo_errors counter fix

Alexey Vlasov reported insane rx_queue_drop_packet_count
(rx_fifo_errors) values. 

IGB drivers is doing an accumulation for 82575, instead using a zero
value for rqdpc_total.

Reported-by: Alexey Vlasov <renton@renton.name>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 linux-2.6.32.23/net/igb/igb_main.c |    5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

--- linux-2.6.32.23/drivers/net/igb/igb_main.c.orig
+++ linux-2.6.32.23/drivers/net/igb/igb_main.c
@@ -3552,6 +3552,7 @@
 	struct e1000_hw *hw = &adapter->hw;
 	struct pci_dev *pdev = adapter->pdev;
 	u16 phy_tmp;
+	unsigned long rqdpc_total = 0;
 
 #define PHY_IDLE_ERROR_COUNT_MASK 0x00FF
 
@@ -3645,7 +3646,6 @@
 
 	if (hw->mac.type != e1000_82575) {
 		u32 rqdpc_tmp;
-		u64 rqdpc_total = 0;
 		int i;
 		/* Read out drops stats per RX queue.  Notice RQDPC (Receive
 		 * Queue Drop Packet Count) stats only gets incremented, if
@@ -3660,7 +3660,6 @@
 			adapter->rx_ring[i].rx_stats.drops += rqdpc_tmp;
 			rqdpc_total += adapter->rx_ring[i].rx_stats.drops;
 		}
-		adapter->net_stats.rx_fifo_errors = rqdpc_total;
 	}
 
 	/* Note RNBC (Receive No Buffers Count) is an not an exact
@@ -3668,7 +3667,7 @@
 	 * one of the reason for saving it in rx_fifo_errors, as its
 	 * potentially not a true drop.
 	 */
-	adapter->net_stats.rx_fifo_errors += adapter->stats.rnbc;
+	adapter->net_stats.rx_fifo_errors = rqdpc_total + adapter->stats.rnbc;
 
 	/* RLEC on some newer hardware can be incorrect so build
 	 * our own version based on RUC and ROC */

^ permalink raw reply

* Re: [PATCH 0/2] qcusbnet: Cleanups
From: Joe Perches @ 2010-10-01 15:56 UTC (permalink / raw)
  To: Elly Jones; +Cc: David Miller, netdev, dbrownell, mjg59, jglasgow, msb, olofj
In-Reply-To: <20101001144231.GC8632@google.com>

On Fri, 2010-10-01 at 10:42 -0400, Elly Jones wrote:
> On Thu, Sep 30, 2010 at 05:45:35PM -0700, Joe Perches wrote:
> > These cleanups are meant for Elly Jones on top
> > of the Qualcomm Gobi 2000 driver she submitted.
> Wow, thank you! I'll incorporate your fixes, make sure I have clean
> checkpatch output, and send a v2.

Hi Elly,

I wouldn't go overboard on checkpatch compliance.
I think you can ignore several messages.  I did.

Braces around single line statements are ok by me.
80+ column lines don't bother me much.

Probably the extern qcusbnet_debug should go into
some .h file though.

cheers, Joe


^ permalink raw reply

* pull request: wireless-next-2.6 2010-10-01
From: John W. Linville @ 2010-10-01 16:18 UTC (permalink / raw)
  To: davem; +Cc: linux-wireless, netdev

Dave,

Here is the latest round of wireless LAN updates intended for 2.6.37.
Included are some ath5k bits from Bruno Randolf, some carl9170 updates
from Christian Lamparter, some mac80211 updates from Johannes Berg,
some work for supporting multiple VIFs on one device from Ben Greear,
and a smattering of other bits.

Please let me know if there are problems!

Thanks,

John

---

The following changes since commit 94d57c4cfaa43e29ca5fa5ff874048cfc67276f5:

  enic: Update MAINTAINERS (2010-10-01 00:36:53 -0700)

are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-next-2.6.git for-davem

Ben Greear (4):
      ath5k: Add bssid mask and rxfilter to debugfs.
      mac80211: Support multiple VIFS per AP in debugfs.
      mac80211/ath9k: Support AMPDU with multiple VIFs.
      mac80211: Support receiving data frames on multiple vifs.

Bruno Randolf (5):
      ath5k: Remove unused variable for atim window
      ath5k: Check and fix ATIM window
      ath5k: Increase "fudge" for beacon timers
      ath5k: Disable interrupts in ath5k_hw_get_tsf64
      ath5k: Fix bitmasks and typos for PCU Diagnostic register

Christian Lamparter (8):
      mac80211: clear txflags for ps-filtered frames
      carl9170: remove non-standard ba session teardown
      carl9170: angle brackets for wiki link
      carl9170: remove lost-frame workaround
      carl9170: fix tx_ampdu_upload counter
      carl9170: fix WARN_ON triggered by Broadcom HT STAs
      carl9170: revamp carl9170_tx_prepare
      carl9170: interrupt urbs must not set URB_ZERO_PACKET

Dan Carpenter (1):
      airo: remove "basic_rate" module option

Felix Fietkau (3):
      ath9k_hw: simplify revision checks for AR9280
      ath9k_hw: simplify revision checks for AR9285
      ath9k_hw: simplify revision checks for AR9287

Florian Mickler (1):
      iwl3945: queue the right work if the scan needs to be aborted

Ido Yariv (1):
      wl1271: Fix overflow in wl1271_boot_upload_nvs

Johannes Berg (7):
      mac80211: fix use-after-free
      mac80211: remove prepare_for_handlers sdata argument
      mac80211: consolidate packet processing
      mac80211: clean up rx handling wrt. found_sta
      mac80211: fix release_reorder_timeout in scan
      mac80211: move packet flags into packet
      cfg80211: always set IBSS basic rates

John W. Linville (5):
      Merge branch 'master' of git://git.kernel.org/.../linville/wireless-2.6
      Revert "mac80211: fix use-after-free"
      wl12xx: fix separate-object-folder builds
      Merge branch 'master' of git://git.kernel.org/.../linville/wireless-2.6
      Merge branch 'master' of git://git.kernel.org/.../linville/wireless-next-2.6 into for-davem

Jouni Malinen (2):
      mac80211: Filter ProbeReq SuppRates based on TX rate mask
      mac80211: Add DS Parameter Set into Probe Request on 2.4 GHz

Juuso Oikarinen (1):
      mac80211: Fix WMM driver queue configuration

Luis R. Rodriguez (1):
      mac80211: fix offchannel assumption upon association

Rajkumar Manoharan (2):
      ath9k_htc: Fix beacon distribution in IBSS mode
      ath9k_htc: Fix TKIP disconnect failure with HTC drivers

Teemu Paasikivi (1):
      nl80211: Fix exit from nl80211_set_power_save

Vasanthakumar Thiagarajan (1):
      ath9k: Fix tx struck state with paprd

 drivers/net/wireless/Makefile                   |    3 +-
 drivers/net/wireless/airo.c                     |   11 -
 drivers/net/wireless/ath/ath5k/ath5k.h          |    2 +-
 drivers/net/wireless/ath/ath5k/attach.c         |    1 -
 drivers/net/wireless/ath/ath5k/base.c           |   16 +-
 drivers/net/wireless/ath/ath5k/debug.c          |   58 +++++
 drivers/net/wireless/ath/ath5k/debug.h          |    1 +
 drivers/net/wireless/ath/ath5k/dma.c            |    4 +-
 drivers/net/wireless/ath/ath5k/pcu.c            |   99 ++++++++-
 drivers/net/wireless/ath/ath5k/phy.c            |    4 +-
 drivers/net/wireless/ath/ath5k/reg.h            |   29 +--
 drivers/net/wireless/ath/ath9k/ar5008_phy.c     |   22 +-
 drivers/net/wireless/ath/ath9k/ar9002_calib.c   |   21 +--
 drivers/net/wireless/ath/ath9k/ar9002_hw.c      |    2 +-
 drivers/net/wireless/ath/ath9k/eeprom.h         |    2 +-
 drivers/net/wireless/ath/ath9k/eeprom_4k.c      |   10 +-
 drivers/net/wireless/ath/ath9k/eeprom_9287.c    |    6 +-
 drivers/net/wireless/ath/ath9k/eeprom_def.c     |   18 +-
 drivers/net/wireless/ath/ath9k/htc_drv_beacon.c |    9 +-
 drivers/net/wireless/ath/ath9k/htc_drv_init.c   |    3 +
 drivers/net/wireless/ath/ath9k/hw.c             |   27 +-
 drivers/net/wireless/ath/ath9k/init.c           |    2 +-
 drivers/net/wireless/ath/ath9k/main.c           |    6 +-
 drivers/net/wireless/ath/ath9k/recv.c           |   10 +-
 drivers/net/wireless/ath/ath9k/reg.h            |   34 +---
 drivers/net/wireless/ath/ath9k/xmit.c           |    3 +-
 drivers/net/wireless/ath/carl9170/Kconfig       |    2 +-
 drivers/net/wireless/ath/carl9170/carl9170.h    |    1 -
 drivers/net/wireless/ath/carl9170/debug.c       |    4 -
 drivers/net/wireless/ath/carl9170/main.c        |   20 ++-
 drivers/net/wireless/ath/carl9170/tx.c          |  192 ++++++---------
 drivers/net/wireless/ath/carl9170/usb.c         |    2 -
 drivers/net/wireless/wl12xx/Makefile            |    3 +
 drivers/net/wireless/wl12xx/wl1271_boot.c       |   17 +-
 include/net/mac80211.h                          |   47 +++--
 net/mac80211/debugfs.c                          |    1 -
 net/mac80211/debugfs_netdev.c                   |    3 +
 net/mac80211/debugfs_sta.c                      |    2 +-
 net/mac80211/ieee80211_i.h                      |   43 +++-
 net/mac80211/main.c                             |    3 +-
 net/mac80211/mlme.c                             |   16 +-
 net/mac80211/rx.c                               |  291 ++++++++++++-----------
 net/mac80211/scan.c                             |    3 +-
 net/mac80211/sta_info.c                         |   15 +-
 net/mac80211/status.c                           |    1 +
 net/mac80211/util.c                             |   47 +++-
 net/mac80211/wpa.c                              |    2 +-
 net/wireless/ibss.c                             |   19 ++
 net/wireless/nl80211.c                          |   20 +--
 49 files changed, 671 insertions(+), 486 deletions(-)

Omnibus patch available here:

	http://www.kernel.org/pub/linux/kernel/people/linville/wireless-next-2.6-2010-10-01.patch.bz2

-- 
John W. Linville		Someday the world will need a hero, and you
linville@tuxdriver.com			might be all we have.  Be ready.

^ permalink raw reply

* [PATCH] selinux: add dcbnl netlink messages to selinux
From: John Fastabend @ 2010-10-01 17:07 UTC (permalink / raw)
  To: jmorris; +Cc: john.r.fastabend, netdev, linux-kernel

Add DCBNL netlink msg support to selinux. Without this
'unrecognized netlink message' warnings may be seen.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---

 security/selinux/nlmsgtab.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c
index 75ec0c6..8b02b21 100644
--- a/security/selinux/nlmsgtab.c
+++ b/security/selinux/nlmsgtab.c
@@ -65,6 +65,8 @@ static struct nlmsg_perm nlmsg_route_perms[] =
 	{ RTM_NEWADDRLABEL,	NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
 	{ RTM_DELADDRLABEL,	NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
 	{ RTM_GETADDRLABEL,	NETLINK_ROUTE_SOCKET__NLMSG_READ  },
+	{ RTM_GETDCB,		NETLINK_ROUTE_SOCKET__NLMSG_READ  },
+	{ RTM_SETDCB,		NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
 };
 
 static struct nlmsg_perm nlmsg_firewall_perms[] =

^ permalink raw reply related

* Re: [PATCH net-next 0/8] tg3: Bugfixes and updates
From: Matt Carlson @ 2010-10-01 17:25 UTC (permalink / raw)
  To: David Miller; +Cc: Matthew Carlson, netdev@vger.kernel.org, andy@greyhouse.net
In-Reply-To: <20101001.002628.267960834.davem@davemloft.net>

On Fri, Oct 01, 2010 at 12:26:28AM -0700, David Miller wrote:
> From: "Matt Carlson" <mcarlson@broadcom.com>
> Date: Thu, 30 Sep 2010 13:34:29 -0700
> 
> > This patchset implements some bugfixes, removes the 5724 device
> > ID and introduces extended rx buffer rings.
> 
> All applied....
> 
> But really, I want to hear some real justification for a 2048 entry RX
> ring at gigabit speeds.  I even think 512 is way too large for gigabit
> parts.

I don't have any personal experience where a larger ring size could
benefit.  However, I have heard of situations in the past where people
have said increasing the amount of rx buffers available has smoothed
over some bursty traffic / cpu usage patterns.  These people really did
want more than 512 rx buffers.

> Any machine that gets one of these newer 5717 parts does not need that
> much queueing, and too deep queues tend to hurt locality and thus
> performance.

Good point.  I'll see if we can scale the BD ring size based on the
number of rx buffers the administrator has configured.


^ permalink raw reply

* Re: [RFC] bonding: fix workqueue re-arming races
From: Jiri Bohac @ 2010-10-01 18:22 UTC (permalink / raw)
  To: Narendra K
  Cc: Jay Vosburgh, Jiri Bohac, bonding-devel, markine, jarkao2, chavey,
	netdev
In-Reply-To: <20100924112352.GA32716@auslistsprd01.us.dell.com>

On Fri, Sep 24, 2010 at 06:23:53AM -0500, Narendra K wrote:
> On Fri, Sep 17, 2010 at 04:14:33AM +0530, Jay Vosburgh wrote:
> > Jay Vosburgh <fubar@us.ibm.com> wrote:
> The follwing call trace was seen -
> 
> 2.6.35.with.upstream.patch-next-20100811-0.7-default+
> [14602.945876] ------------[ cut here ]------------
> [14602.950474] kernel BUG at kernel/workqueue.c:2844!
> [14602.955242] invalid opcode: 0000 [#1] SMP 
> [14602.959341] last sysfs file: /sys/class/net/bonding_masters
> [14602.964888] CPU 1 
> [14602.966714] Modules linked in: af_packet bonding ipv6 cpufreq_conservative cpufreq_userspace cpufreq_powersave acpi_cpufreq mperf microcode fuse loop dm_mod joydev usbhid hid bnx2 tpm_tis tpm tpm_bios rtc_cmos iTCO_wdt iTCO_vendor_support sr_mod power_meter cdrom sg serio_raw mptctl pcspkr rtc_core usb_storage dcdbas rtc_lib button uhci_hcd ehci_hcd usbcore sd_mod crc_t10dif edd ext3 mbcache jbd fan processor ide_pci_generic ide_core ata_generic ata_piix libata mptsas mptscsih mptbase scsi_transport_sas scsi_mod thermal thermal_sys hwmon
> [14603.015002] 
> [14603.016524] Pid: 4006, comm: ifdown-bonding Not tainted 2.6.35.with.upstream.patch-next-20100811-0.7-default+ #2 0M233H/PowerEdge R710
> [14603.028554] RIP: 0010:[<ffffffff81067b50>]  [<ffffffff81067b50>] destroy_workqueue+0x1d0/0x1e0
> [14603.037144] RSP: 0018:ffff88022a379d88  EFLAGS: 00010286
> [14603.042432] RAX: 000000000000003c RBX: ffff880228674240 RCX: ffff880228f0e800
> [14603.049534] RDX: 0000000000001000 RSI: 0000000000000002 RDI: 000000000000001a
> [14603.056638] RBP: ffff88022a379da8 R08: ffff88022a379cf8 R09: 0000000000000000
> [14603.063741] R10: 00000000ffffffff R11: 0000000000000000 R12: 0000000000000002
> [14603.070842] R13: ffffffff817b8560 R14: ffff8802299d1480 R15: ffff8802299d1488
> [14603.077944] FS:  00007f8e6a28f700(0000) GS:ffff880001c00000(0000) knlGS:0000000000000000
> [14603.085999] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> [14603.091719] CR2: 00007f8e6a2c2000 CR3: 0000000127d1c000 CR4: 00000000000006e0
> [14603.098822] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [14603.105924] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
> [14603.113026] Process ifdown-bonding (pid: 4006, threadinfo ffff88022a378000, task ffff8802299b0080)
> [14603.121944] Stack:
> [14603.123944]  ffff88022a379da8 ffff8802299d1000 ffff8802299d1000 000000010036b6a4
> [14603.131182] <0> ffff88022a379dc8 ffffffffa030a91d ffff8802299d1000 000000010036b6a4
> [14603.138857] <0> ffff88022a379e28 ffffffff812e0a08 ffff88022a379e38 ffff88022a379de8
> [14603.146718] Call Trace:
> [14603.149158]  [<ffffffffa030a91d>] bond_destructor+0x1d/0x30 [bonding]
> [14603.155572]  [<ffffffff812e0a08>] netdev_run_todo+0x1a8/0x270
> [14603.161293]  [<ffffffff812ee859>] rtnl_unlock+0x9/0x10
> [14603.166411]  [<ffffffffa0317824>] bonding_store_bonds+0x1c4/0x1f0 [bonding]
> [14603.173342]  [<ffffffff810f26be>] ? alloc_pages_current+0x9e/0x110
> [14603.179497]  [<ffffffff81285c9e>] class_attr_store+0x1e/0x20
> [14603.185132]  [<ffffffff8116e365>] sysfs_write_file+0xc5/0x140
> [14603.190853]  [<ffffffff8110a68f>] vfs_write+0xcf/0x190
> [14603.195967]  [<ffffffff8110a840>] sys_write+0x50/0x90
> [14603.200996]  [<ffffffff81002ec2>] system_call_fastpath+0x16/0x1b
> [14603.206974] Code: 00 7f 14 8b 3b eb 91 3d 00 10 00 00 89 c2 77 10 8b 3b e9 07 ff ff ff 3d 00 10 00 00 89 c2 76 f0 8b 3b e9 a9 fe ff ff 0f 0b eb fe <0f> 0b eb fe 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 8b 3d 00 
> [14603.226419] RIP  [<ffffffff81067b50>] destroy_workqueue+0x1d0/0x1e0
> [14603.232669]  RSP <ffff88022a379d88>
> [    0.000000] Initializing cgroup subsys cpuset
> [    0.000000] Initializing cgroup subsys cpu

This should be the BUG_ON(cwq->nr_active) in
destroy_workqueue()

This is really strange. bondng_store_bonds() can do two things:
create or delete a bonding device.

I checked the delete path, where I would normally expect such a
problem, but I can't find a way it could fail in this way.
bondng_store_bonds() calls unregister_netdevice(), which 
- calls rollback_registered() -> bond_close()
- puts the device on the net_todo_list.
On rtnl_unlock() netdev_run_todo() gets called and that calls 
bond_destructor().

bond_close() now makes sure the rearming work items are not
pending, thus, the only work items that may still be pending on
the workqueue are the non-rearming "commit" work items.
flush_workqueue(), called at the beginning of destroy_workqueue()
should have waited for these to finish.
If all of the above is correct, this BUG_ON should never trigger.

Maybe I am overlooking something, or it may be some kind of
failure/race condition in the create path, resulting in
bond_destructor() being called as well.

Narendra, any chance to capture the dmesg lines preceeding the
BUG message? This should show which of the above cases it is.

I will try to come up with a debug patch that will tell us which
work remains active on the work queue.

-- 
Jiri Bohac <jbohac@suse.cz>
SUSE Labs, SUSE CZ


^ permalink raw reply

* Re: [PATCH 0/2] qcusbnet: Cleanups
From: David Brownell @ 2010-10-01 18:29 UTC (permalink / raw)
  To: Joe Perches, Elly Jones
  Cc: David Miller, netdev, dbrownell, mjg59, jglasgow, msb, olofj
In-Reply-To: <20101001144231.GC8632@google.com>

I've not noticed this driver before .. can its
name be changed soon?  As it is, it's easily
confused with "usbnet", the driver framework.

Please give it a name that just reflects the
hardware it drives, eliminating that confusion.



^ permalink raw reply

* Re: Packet time delays on multi-core systems
From: Jeff Kirsher @ 2010-10-01 18:54 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Alexey Vlasov, Linux Kernel Mailing List, netdev, Emil Tantilov
In-Reply-To: <1285946861.21547.24.camel@edumazet-laptop>

On Fri, Oct 1, 2010 at 08:27, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le vendredi 01 octobre 2010 à 18:18 +0400, Alexey Vlasov a écrit :
>
>> NIC statistics:
>>      rx_packets: 2973717440
>>      tx_packets: 3032670910
>>      rx_bytes: 1892633650741
>>      tx_bytes: 2536130682695
>>      rx_broadcast: 118773199
>>      tx_broadcast: 68013
>>      rx_multicast: 95257
>>      tx_multicast: 0
>>      rx_errors: 0
>>      tx_errors: 0
>>      tx_dropped: 0
>>      multicast: 95257
>>      collisions: 0
>>      rx_length_errors: 0
>>      rx_over_errors: 0
>>      rx_crc_errors: 0
>>      rx_frame_errors: 0
>>      rx_no_buffer_count: 7939
>>      rx_queue_drop_packet_count: 1324025520
>>      rx_missed_errors: 146631
>>      tx_aborted_errors: 0
>>      tx_carrier_errors: 0
>>      tx_fifo_errors: 0
>>      tx_heartbeat_errors: 0
>>      tx_window_errors: 0
>>      tx_abort_late_coll: 0
>>      tx_deferred_ok: 0
>>      tx_single_coll_ok: 0
>>      tx_multi_coll_ok: 0
>>      tx_timeout_count: 0
>>      tx_restart_queue: 50715
>>      rx_long_length_errors: 0
>>      rx_short_length_errors: 0
>>      rx_align_errors: 0
>>      tx_tcp_seg_good: 344724062
>>      tx_tcp_seg_failed: 0
>>      rx_flow_control_xon: 0
>>      rx_flow_control_xoff: 0
>>      tx_flow_control_xon: 0
>>      tx_flow_control_xoff: 0
>>      rx_long_byte_count: 1892633650741
>>      rx_csum_offload_good: 2973697420
>>      rx_csum_offload_errors: 6235
>>      tx_dma_out_of_sync: 0
>>      alloc_rx_buff_failed: 0
>>      tx_smbus: 9327
>>      rx_smbus: 118531661
>>      dropped_smbus: 0
>>      tx_queue_0_packets: 797617475
>>      tx_queue_0_bytes: 630191908685
>>      tx_queue_1_packets: 719681297
>>      tx_queue_1_bytes: 625907304846
>>      tx_queue_2_packets: 718841556
>>      tx_queue_2_bytes: 620522418855
>>      tx_queue_3_packets: 796521255
>>      tx_queue_3_bytes: 646196024585
>>      rx_queue_0_packets: 788885797
>>      rx_queue_0_bytes: 458936338699
>>      rx_queue_0_drops: 0
>>      rx_queue_1_packets: 701354604
>>      rx_queue_1_bytes: 457490536453
>>      rx_queue_1_drops: 0
>>      rx_queue_2_packets: 791887663
>>      rx_queue_2_bytes: 534425333616
>>      rx_queue_2_drops: 0
>>      rx_queue_3_packets: 691579028
>>      rx_queue_3_bytes: 429887244557
>>      rx_queue_3_drops: 0
>> 11.111.80: R 1983626201:1983626201(0) win 0
>> > >
>
> OK
>
> IGB stats are wrong... for rx_queue_drop_packet_count field at least
>
> Here is a patch against 2.6.32.23, to get the idea...
>
> Dont trust it unless you patch your kernel ;)
>
> Thanks
>
> Note: current linux-2.6 tree doesnt have this bug.
>
> [PATCH] igb: rx_fifo_errors counter fix
>
> Alexey Vlasov reported insane rx_queue_drop_packet_count
> (rx_fifo_errors) values.
>
> IGB drivers is doing an accumulation for 82575, instead using a zero
> value for rqdpc_total.
>
> Reported-by: Alexey Vlasov <renton@renton.name>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---
>  linux-2.6.32.23/net/igb/igb_main.c |    5 ++---
>  1 file changed, 2 insertions(+), 3 deletions(-)
>

Thanks Eric!  I have added the patch to my queue.

-- 
Cheers,
Jeff

^ permalink raw reply

* sysctl_{tcp,udp,sctp}_mem overflow on 16TB system.
From: Robin Holt @ 2010-10-01 19:39 UTC (permalink / raw)
  To: David S. Miller, Alexey Kuznetsov, Pekka Savola (ipv6),
	James Morris, Hideaki YOSHIFUJI <yosh
  Cc: linux-kernel, netdev, linux-decnet-user, linux-sctp


On a 16TB system, we noticed that sysctl_tcp_mem[2] and sysctl_udp_mem[2]
were negative.  Code review indicates that the same should occur with
sysctl_sctp_mem[2].

There are a couple ways we could address this.  The one which appears most
reasonable would be to change the struct proto defintion for sysctl_mem
from an int to a long and handle all the associated fallout.

An alternative is to limit the calculation to 1/2 INT_MAX.  The downside
being that the administrator could not tune the system to use more than
INT_MAX memory when much more is available.

Is there a compelling reason to not change the structure's definition
over to longs instead of ints and deal with the fallout from that change?

Thanks,
Robin Holt

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox