Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next-2.6 v2 0/2] sctp: Add Auto-ASCONF support
From: Michio Honda @ 2011-04-07  9:08 UTC (permalink / raw)
  To: netdev; +Cc: lksctp-developers

SCTP can reconfigure the IP addresses of an association by using ASCONF chunks, as described in RFC 5061.
For example, a newly configured IP address can be put to use in an existing association.
An ASCONF operation can be invoked in two ways:
The first is by the application calling the sctp_bindx() system call.
The second is automatically, by the SCTP stack, in response to address events on the host (called auto_asconf).
The former is already implemented, but the latter is not yet. This patch enables it with one sysctl parameter and a setsockopt() system call.
(This patch is part of a larger patch that supports complete auto_asconf.)

Signed-off-by: Michio Honda <micchie@sfc.wide.ad.jp>
---
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 11684d9..11c3060 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -767,6 +767,7 @@ enum {
 	NET_SCTP_SNDBUF_POLICY		 = 15,
 	NET_SCTP_SACK_TIMEOUT		 = 16,
 	NET_SCTP_RCVBUF_POLICY		 = 17,
+	NET_SCTP_AUTO_ASCONF_ENABLE	 = 18,
 };
 
 /* /proc/sys/net/bridge */
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 505845d..75ba6a4 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -121,6 +121,7 @@ extern int sctp_copy_local_addr_list(struct sctp_bind_addr *,
 				     int flags);
 extern struct sctp_pf *sctp_get_pf_specific(sa_family_t family);
 extern int sctp_register_pf(struct sctp_pf *, sa_family_t);
+void sctp_addr_wq_mgmt(union sctp_addr *, int);
 
 /*
  * sctp/socket.c
@@ -135,6 +136,7 @@ void sctp_sock_rfree(struct sk_buff *skb);
 void sctp_copy_sock(struct sock *newsk, struct sock *sk,
 		    struct sctp_association *asoc);
 extern struct percpu_counter sctp_sockets_allocated;
+int sctp_asconf_mgmt(struct sctp_endpoint *);
 
 /*
  * sctp/primitive.c
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index cc9185c..3e0351a 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -205,6 +205,11 @@ extern struct sctp_globals {
 	 * It is a list of sctp_sockaddr_entry.
 	 */
 	struct list_head local_addr_list;
+	int auto_asconf_enable;
+	struct list_head addr_waitq;
+	struct timer_list addr_wq_timer;
+	struct list_head auto_asconf_eplist;
+	spinlock_t addr_wq_lock;
 
 	/* Lock that protects the local_addr_list writers */
 	spinlock_t addr_list_lock;
@@ -264,6 +269,11 @@ extern struct sctp_globals {
 #define sctp_port_hashtable		(sctp_globals.port_hashtable)
 #define sctp_local_addr_list		(sctp_globals.local_addr_list)
 #define sctp_local_addr_lock		(sctp_globals.addr_list_lock)
+#define sctp_auto_asconf_eplist		(sctp_globals.auto_asconf_eplist)
+#define sctp_addr_waitq			(sctp_globals.addr_waitq)
+#define sctp_addr_wq_timer		(sctp_globals.addr_wq_timer)
+#define sctp_addr_wq_lock		(sctp_globals.addr_wq_lock)
+#define sctp_auto_asconf_enable		(sctp_globals.auto_asconf_enable)
 #define sctp_scope_policy		(sctp_globals.ipv4_scope_policy)
 #define sctp_addip_enable		(sctp_globals.addip_enable)
 #define sctp_addip_noauth		(sctp_globals.addip_noauth_enable)
@@ -796,6 +806,15 @@ struct sctp_sockaddr_entry {
 	__u8 valid;
 };
 
+#define SCTP_NEWADDR	1
+#define SCTP_DELADDR	2
+#define SCTP_ADDRESS_TICK_DELAY	500
+struct sctp_addr_wait {
+	struct list_head list;
+	union sctp_addr a;
+	int	cmd;
+};
+
 typedef struct sctp_chunk *(sctp_packet_phandler_t)(struct sctp_association *);
 
 /* This structure holds lists of chunks as we are assembling for
@@ -1239,6 +1258,7 @@ sctp_scope_t sctp_scope(const union sctp_addr *);
 int sctp_in_scope(const union sctp_addr *addr, const sctp_scope_t scope);
 int sctp_is_any(struct sock *sk, const union sctp_addr *addr);
 int sctp_addr_is_valid(const union sctp_addr *addr);
+int sctp_is_ep_boundall(struct sock *sk);
 
 
 /* What type of endpoint?  */
@@ -1267,6 +1287,7 @@ struct sctp_ep_common {
 	/* Fields to help us manage our entries in the hash tables. */
 	struct hlist_node node;
 	int hashent;
+	struct list_head auto_asconf_list;
 
 	/* Runtime type information.  What kind of endpoint is this? */
 	sctp_endpoint_type_t type;
@@ -1369,6 +1390,7 @@ struct sctp_endpoint {
 	/* SCTP-AUTH: endpoint shared keys */
 	struct list_head endpoint_shared_keys;
 	__u16 active_key_id;
+	int do_auto_asconf;
 };
 
 /* Recover the outter endpoint structure. */
diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
index e73ebda..75c96b1 100644
--- a/include/net/sctp/user.h
+++ b/include/net/sctp/user.h
@@ -91,6 +91,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_PEER_AUTH_CHUNKS	26	/* Read only */
 #define SCTP_LOCAL_AUTH_CHUNKS	27	/* Read only */
 #define SCTP_GET_ASSOC_NUMBER	28	/* Read only */
+#define SCTP_AUTO_ASCONF	29
 
 /* Internal Socket Options. Some of the sctp library functions are
  * implemented using these socket options.
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index faf71d1..426715f 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -536,6 +536,23 @@ int sctp_in_scope(const union sctp_addr *addr, sctp_scope_t scope)
 	return 0;
 }
 
+int sctp_is_ep_boundall(struct sock *sk)
+{
+	struct sctp_bind_addr *bp;
+	struct sctp_sockaddr_entry *addr;
+
+	bp = &sctp_sk(sk)->ep->base.bind_addr;
+	if (sctp_list_single_entry(&bp->address_list)) {
+		addr = list_entry(bp->address_list.next,
+				  struct sctp_sockaddr_entry, list);
+		if (sctp_is_any(sk, &addr->a))
+			return 1;
+		else
+			return 0;
+	}
+	return 1;
+}
+
 /********************************************************************
  * 3rd Level Abstractions
  ********************************************************************/
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 865ce7b..1b31b4d 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -105,6 +105,7 @@ static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev,
 			addr->valid = 1;
 			spin_lock_bh(&sctp_local_addr_lock);
 			list_add_tail_rcu(&addr->list, &sctp_local_addr_list);
+			sctp_addr_wq_mgmt(&addr->a, SCTP_NEWADDR);
 			spin_unlock_bh(&sctp_local_addr_lock);
 		}
 		break;
@@ -115,6 +116,7 @@ static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev,
 			if (addr->a.sa.sa_family == AF_INET6 &&
 					ipv6_addr_equal(&addr->a.v6.sin6_addr,
 						&ifa->addr)) {
+				sctp_addr_wq_mgmt(&addr->a, SCTP_DELADDR);
 				found = 1;
 				addr->valid = 0;
 				list_del_rcu(&addr->list);
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 152976e..f9e0bd6 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -636,6 +636,194 @@ static void sctp_v4_ecn_capable(struct sock *sk)
 	INET_ECN_xmit(sk);
 }
 
+void sctp_addr_wq_timeout_handler(unsigned long arg)
+{
+	struct sctp_addr_wait *addrw = NULL;
+	union sctp_addr *addr = NULL;
+	struct sctp_ep_common *epb = NULL;
+	struct sctp_endpoint *ep = NULL;
+
+	spin_lock_bh(&sctp_addr_wq_lock);
+retry_wq:
+	if (list_empty(&sctp_addr_waitq)) {
+		SCTP_DEBUG_PRINTK("sctp_addrwq_timo_handler: nothing in addr waitq\n");
+		spin_unlock_bh(&sctp_addr_wq_lock);
+		return;
+	}
+	addrw = list_first_entry(&sctp_addr_waitq, struct sctp_addr_wait, list);
+	if (addrw->cmd != SCTP_NEWADDR && addrw->cmd != SCTP_DELADDR) {
+		SCTP_DEBUG_PRINTK("sctp_addrwq_timo_handler: cmd is neither NEWADDR nor DELADDR\n");
+		list_del(&addrw->list);
+		kfree(addrw);
+		goto retry_wq;
+	}
+
+	addr = &addrw->a;
+	SCTP_DEBUG_PRINTK_IPADDR("sctp_addrwq_timo_handler: the first ent in wq %p is ",
+	    " for cmd %d at entry %p\n", &sctp_addr_waitq, addr, addrw->cmd,
+	    addrw);
+
+	/* Now we send an ASCONF for each association */
+	/* Note. we currently don't handle link local IPv6 addressees */
+	if (addr->sa.sa_family == AF_INET6) {
+		struct in6_addr *in6 = (struct in6_addr *)&addr->v6.sin6_addr;
+
+		if (ipv6_addr_type(&addr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) {
+			SCTP_DEBUG_PRINTK("sctp_timo_handler: link local, hence don't tell eps\n");
+			list_del(&addrw->list);
+			kfree(addrw);
+			goto retry_wq;
+		}
+		if (ipv6_chk_addr(&init_net, in6, NULL, 0) == 0 &&
+		    addrw->cmd == SCTP_NEWADDR) {
+			unsigned long timeo_val;
+
+			SCTP_DEBUG_PRINTK("sctp_timo_handler: this is on DAD, trying %d sec later\n",
+			    SCTP_ADDRESS_TICK_DELAY);
+			timeo_val = jiffies;
+			timeo_val += msecs_to_jiffies(SCTP_ADDRESS_TICK_DELAY);
+			(void)mod_timer(&sctp_addr_wq_timer, timeo_val);
+			spin_unlock_bh(&sctp_addr_wq_lock);
+			return;
+		}
+	}
+	list_for_each_entry(epb, &sctp_auto_asconf_eplist, auto_asconf_list) {
+		if (epb == NULL) {
+			SCTP_DEBUG_PRINTK("addrwq_timo_handler: no epb\n");
+			continue;
+		}
+		if (!sctp_is_ep_boundall(epb->sk))
+			/* ignore bound-specific endpoints */
+			continue;
+		ep = sctp_ep(epb);
+		sctp_bh_lock_sock(epb->sk);
+		if (sctp_asconf_mgmt(ep) < 0) {
+			SCTP_DEBUG_PRINTK("sctp_addrwq_timo_handler: sctp_asconf_mgmt failed\n");
+			sctp_bh_unlock_sock(epb->sk);
+			continue;
+		}
+		sctp_bh_unlock_sock(epb->sk);
+	}
+
+	list_del(&addrw->list);
+	kfree(addrw);
+
+	if (list_empty(&sctp_addr_waitq)) {
+		spin_unlock_bh(&sctp_addr_wq_lock);
+		return;
+	} else
+		goto retry_wq;
+
+	spin_unlock_bh(&sctp_addr_wq_lock);
+}
+
+static void sctp_free_addr_wq()
+{
+	struct sctp_addr_wait *addrw = NULL;
+	struct sctp_addr_wait *temp = NULL;
+
+	spin_lock_bh(&sctp_addr_wq_lock);
+	(void)del_timer(&sctp_addr_wq_timer);
+	list_for_each_entry_safe(addrw, temp, &sctp_addr_waitq, list) {
+		list_del(&addrw->list);
+		kfree(addrw);
+	}
+	spin_unlock_bh(&sctp_addr_wq_lock);
+}
+
+void sctp_addr_wq_mgmt(union sctp_addr *reqaddr, int cmd)
+{
+	struct sctp_addr_wait *addrw = NULL;
+	struct sctp_addr_wait *addrw_new = NULL;
+	unsigned long timeo_val;
+	union sctp_addr *tmpaddr;
+
+	/* first, we check if an opposite message already exist in the queue.
+	 * If we found such message, it is removed.
+	 * This operation is a bit stupid, but the DHCP client attaches the
+	 * new address after a couple of addition and deletion of that address
+	 */
+
+	if (reqaddr == NULL) {
+		SCTP_DEBUG_PRINTK("sctp_addr_wq_mgmt: no address message?\n");
+		return;
+	}
+
+	spin_lock_bh(&sctp_addr_wq_lock);
+	/* Offsets existing events in addr_wq */
+	list_for_each_entry(addrw, &sctp_addr_waitq, list) {
+		if (addrw->a.sa.sa_family != reqaddr->sa.sa_family)
+			continue;
+		if (reqaddr->sa.sa_family == AF_INET) {
+			if (reqaddr->v4.sin_addr.s_addr ==
+			    addrw->a.v4.sin_addr.s_addr) {
+				if (cmd != addrw->cmd) {
+					tmpaddr = &addrw->a;
+					SCTP_DEBUG_PRINTK_IPADDR("sctp_addr_wq_mgmt offsets existing entry for %d ",
+					    " in waitq %p\n", addrw->cmd,
+					    tmpaddr, &sctp_addr_waitq);
+					list_del(&addrw->list);
+					kfree(addrw);
+					/* nothing to do anymore */
+					spin_unlock_bh(&sctp_addr_wq_lock);
+					return;
+				}
+			}
+		} else if (reqaddr->sa.sa_family == AF_INET6) {
+			if (ipv6_addr_equal(&reqaddr->v6.sin6_addr,
+			    &addrw->a.v6.sin6_addr)) {
+				if (cmd != addrw->cmd) {
+					tmpaddr = &addrw->a;
+					SCTP_DEBUG_PRINTK_IPADDR("sctp_addr_wq_mgmt: offsets existing entry for %d ",
+					    " in waitq %p\n", addrw->cmd,
+					    tmpaddr, &sctp_addr_waitq);
+					list_del(&addrw->list);
+					kfree(addrw);
+					spin_unlock_bh(&sctp_addr_wq_lock);
+					return;
+				}
+			}
+		}
+	}
+
+	/* OK, we have to add the new address to the wait queue */
+	addrw_new = kzalloc(sizeof(struct sctp_addr_wait), GFP_ATOMIC);
+	if (addrw_new == NULL) {
+		SCTP_DEBUG_PRINTK("sctp_addr_weitq_mgmt no memory? return\n");
+		spin_unlock_bh(&sctp_addr_wq_lock);
+		return;
+	}
+	if (reqaddr->sa.sa_family == AF_INET) {
+		addrw_new->a.v4.sin_family = AF_INET;
+		addrw_new->a.v4.sin_addr.s_addr = reqaddr->v4.sin_addr.s_addr;
+	} else if (reqaddr->sa.sa_family == AF_INET6) {
+		addrw_new->a.v6.sin6_family = AF_INET6;
+		ipv6_addr_copy(&addrw_new->a.v6.sin6_addr,
+		    &reqaddr->v6.sin6_addr);
+	} else {
+		SCTP_DEBUG_PRINTK("sctp_addr_waitq_mgmt: Unknown family of request addr, return\n");
+		kfree(addrw_new);
+		spin_unlock_bh(&sctp_addr_wq_lock);
+		return;
+	}
+	addrw_new->cmd = cmd;
+	list_add_tail(&addrw_new->list, &sctp_addr_waitq);
+	tmpaddr = &addrw_new->a;
+	SCTP_DEBUG_PRINTK_IPADDR("sctp_addr_wq_mgmt add new entry for cmd:%d ",
+	    " in waitq %p, start a timer\n",
+	    addrw_new->cmd, tmpaddr, &sctp_addr_waitq);
+
+	if (timer_pending(&sctp_addr_wq_timer)) {
+		SCTP_DEBUG_PRINTK("sctp_addr_wq_mgmt: addr_wq timer is already running\n");
+		spin_unlock_bh(&sctp_addr_wq_lock);
+		return;
+	}
+	timeo_val = jiffies;
+	timeo_val += msecs_to_jiffies(SCTP_ADDRESS_TICK_DELAY);
+	(void)mod_timer(&sctp_addr_wq_timer, timeo_val);
+	spin_unlock_bh(&sctp_addr_wq_lock);
+}
+
 /* Event handler for inet address addition/deletion events.
  * The sctp_local_addr_list needs to be protocted by a spin lock since
  * multiple notifiers (say IPv4 and IPv6) may be running at the same
@@ -663,6 +851,7 @@ static int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev,
 			addr->valid = 1;
 			spin_lock_bh(&sctp_local_addr_lock);
 			list_add_tail_rcu(&addr->list, &sctp_local_addr_list);
+			sctp_addr_wq_mgmt(&addr->a, SCTP_NEWADDR);
 			spin_unlock_bh(&sctp_local_addr_lock);
 		}
 		break;
@@ -673,6 +862,7 @@ static int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev,
 			if (addr->a.sa.sa_family == AF_INET &&
 					addr->a.v4.sin_addr.s_addr ==
 					ifa->ifa_local) {
+				sctp_addr_wq_mgmt(&addr->a, SCTP_DELADDR);
 				found = 1;
 				addr->valid = 0;
 				list_del_rcu(&addr->list);
@@ -1280,6 +1470,14 @@ SCTP_STATIC __init int sctp_init(void)
 	spin_lock_init(&sctp_local_addr_lock);
 	sctp_get_local_addr_list();
 
+	/* Initialize the address event list */
+	INIT_LIST_HEAD(&sctp_addr_waitq);
+	INIT_LIST_HEAD(&sctp_auto_asconf_eplist);
+	spin_lock_init(&sctp_addr_wq_lock);
+	sctp_addr_wq_timer.expires = 0;
+	setup_timer(&sctp_addr_wq_timer, sctp_addr_wq_timeout_handler,
+	    (unsigned long)NULL);
+
 	status = sctp_v4_protosw_init();
 
 	if (status)
@@ -1344,6 +1542,7 @@ err_chunk_cachep:
 /* Exit handler for the SCTP protocol.  */
 SCTP_STATIC __exit void sctp_exit(void)
 {
+	sctp_free_addr_wq();
 	/* BUG.  This should probably do something useful like clean
 	 * up all the remaining associations and all that memory.
 	 */
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 3951a10..27dffa3 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -807,6 +807,47 @@ out:
 	return retval;
 }
 
+/* set addr events to assocs in the endpoint.  ep and addr_wq must be locked */
+int
+sctp_asconf_mgmt(struct sctp_endpoint *ep)
+{
+	struct sock *sk = ep->base.sk;
+	struct sctp_addr_wait *addrw = NULL;
+	union sctp_addr *addr = NULL;
+	int cmd;
+	int error = 0;
+
+	if (ep == NULL || sk == NULL)
+		return -EINVAL;
+	if (list_empty(&sctp_addr_waitq)) {
+		SCTP_DEBUG_PRINTK("asconf_mgmt: nothing in the wq\n");
+		return -EINVAL;
+	}
+	addrw = list_first_entry(&sctp_addr_waitq, struct sctp_addr_wait, list);
+	if (addrw->cmd != SCTP_NEWADDR && addrw->cmd != SCTP_DELADDR)
+		return -EINVAL;
+	addr = &addrw->a;
+	addr->v4.sin_port = htons(ep->base.bind_addr.port);
+	cmd = addrw->cmd;
+
+	SCTP_DEBUG_PRINTK("sctp_asconf_mgmt sk:%p ep:%p\n", sk, ep);
+	if (cmd == SCTP_NEWADDR) {
+		error = sctp_send_asconf_add_ip(sk, (struct sockaddr *)addr, 1);
+		if (error) {
+			SCTP_DEBUG_PRINTK("asconf_mgmt: send_asconf_add_ip returns %d\n", error);
+			return error;
+		}
+	} else if (cmd == SCTP_DELADDR) {
+		error = sctp_send_asconf_del_ip(sk, (struct sockaddr *)addr, 1);
+		if (error) {
+			SCTP_DEBUG_PRINTK("asconf_mgmt: send_asconf_del_ip returns %d\n", error);
+			return error;
+		}
+	}
+
+	return 0;
+}
+
 /* Helper for tunneling sctp_bindx() requests through sctp_setsockopt()
  *
  * API 8.1
@@ -3341,6 +3382,45 @@ static int sctp_setsockopt_del_key(struct sock *sk,
 
 }
 
+/*
+ * 8.1.23 SCTP_AUTO_ASCONF
+ *
+ * This option will enable or disable the use of the automatic generation of
+ * ASCONF chunks to add and delete addresses to an existing association.  Note
+ * that this option has two caveats namely: a) it only affects sockets that
+ * are bound to all addresses available to the SCTP stack, and b) the system
+ * administrator may have an overriding control that turns the ASCONF feature
+ * off no matter what setting the socket option may have.
+ * This option expects an integer boolean flag, where a non-zero value turns on
+ * the option, and a zero value turns off the option.
+ * Note. In this implementation, socket operation overrides default parameter
+ * being set by sysctl as well as FreeBSD implementation
+ */
+static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
+					unsigned int optlen)
+{
+	int val;
+	struct sctp_endpoint *ep = sctp_sk(sk)->ep;
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+	if (get_user(val, (int __user *)optval))
+		return -EFAULT;
+	if (!sctp_is_ep_boundall(sk) && val)
+		return -EINVAL;
+	if ((val && ep->do_auto_asconf) || (!val && !ep->do_auto_asconf))
+		return 0;
+
+	if (val == 0 && ep->do_auto_asconf) {
+		list_del(&ep->base.auto_asconf_list);
+		ep->do_auto_asconf = 0;
+	} else if (val && !ep->do_auto_asconf) {
+		list_add_tail(&ep->base.auto_asconf_list,
+		    &sctp_auto_asconf_eplist);
+		ep->do_auto_asconf = 1;
+	}
+	return 0;
+}
 
 /* API 6.2 setsockopt(), getsockopt()
  *
@@ -3488,6 +3568,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_AUTH_DELETE_KEY:
 		retval = sctp_setsockopt_del_key(sk, optval, optlen);
 		break;
+	case SCTP_AUTO_ASCONF:
+		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -3770,6 +3853,13 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk)
 	local_bh_disable();
 	percpu_counter_inc(&sctp_sockets_allocated);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+	if (sctp_auto_asconf_enable) {
+		list_add_tail(&ep->base.auto_asconf_list,
+		    &sctp_auto_asconf_eplist);
+		ep->do_auto_asconf = 1;
+	} else
+		ep->do_auto_asconf = 0;
+	SCTP_DEBUG_PRINTK("sctp_init_sk sk:%p ep:%p\n", sk, ep);
 	local_bh_enable();
 
 	return 0;
@@ -3784,6 +3874,10 @@ SCTP_STATIC void sctp_destroy_sock(struct sock *sk)
 
 	/* Release our hold on the endpoint. */
 	ep = sctp_sk(sk)->ep;
+	if (ep->do_auto_asconf) {
+		ep->do_auto_asconf = 0;
+		list_del(&ep->base.auto_asconf_list);
+	}
 	sctp_endpoint_free(ep);
 	local_bh_disable();
 	percpu_counter_dec(&sctp_sockets_allocated);
@@ -5283,6 +5377,28 @@ static int sctp_getsockopt_assoc_number(struct sock *sk, int len,
 	return 0;
 }
 
+/*
+ * 8.1.23 SCTP_AUTO_ASCONF
+ * See the corresponding setsockopt entry as description
+ */
+static int sctp_getsockopt_auto_asconf(struct sock *sk, int len,
+				   char __user *optval, int __user *optlen)
+{
+	int val = 0;
+
+	if (len < sizeof(int))
+		return -EINVAL;
+
+	len = sizeof(int);
+	if (sctp_sk(sk)->ep->do_auto_asconf && sctp_is_ep_boundall(sk))
+		val = 1;
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &val, len))
+		return -EFAULT;
+	return 0;
+}
+
 SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
 				char __user *optval, int __user *optlen)
 {
@@ -5415,6 +5531,9 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
 	case SCTP_GET_ASSOC_NUMBER:
 		retval = sctp_getsockopt_assoc_number(sk, len, optval, optlen);
 		break;
+	case SCTP_AUTO_ASCONF:
+		retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 50cb57f..df39789 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -183,6 +183,13 @@ static ctl_table sctp_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 	{
+		.procname	= "auto_asconf_enable",
+		.data		= &sctp_auto_asconf_enable,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
 		.procname	= "prsctp_enable",
 		.data		= &sctp_prsctp_enable,
 		.maxlen		= sizeof(int),


^ permalink raw reply related

* RE: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel
From: Eric Dumazet @ 2011-04-07  9:06 UTC (permalink / raw)
  To: Wei Gu; +Cc: netdev, Alexander Duyck, Jeff Kirsher
In-Reply-To: <D12839161ADD3A4B8DA63D1A134D084026E48B9F23@ESGSCCMS0001.eapac.ericsson.se>

Le jeudi 07 avril 2011 à 16:39 +0800, Wei Gu a écrit :
> I'm only insert a prerouting hook to make a copy of the incomming
> packet and swap the L2/L3 header, send it back on the same interface.
> 

Small packets or big ones ?

You don't need to copy the packet; it's expensive.


> BTW, some times I notices that the perf tool was not mapping the
> symbol correclly, I don't why?
> 

You might try to put ixgbe in static kernel, not in a module.

> I will try a fresh install of kernel 2.6.30 and do the test with the
> shipped ixgbe driver again.
> 

OK thanks.






^ permalink raw reply

* [PATCH net-next-2.6 v2 2/2] sctp: Add a valid address list in association local
From: Michio Honda @ 2011-04-07  9:02 UTC (permalink / raw)
  To: netdev; +Cc: lksctp-developers

When an SCTP association transmits an ASCONF with ADD_IP_ADDRESS, that association cannot use the address being added until it receives the ASCONF-ACK.
This patch prevents associations that have not yet received the ASCONF-ACK from using the address being added.
(This patch is part of a larger patch that supports complete auto_asconf.)

Signed-off-by: Michio Honda <micchie@sfc.wide.ad.jp>
---
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 505845d..f1f439c 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -135,6 +135,7 @@ void sctp_sock_rfree(struct sk_buff *skb);
 void sctp_copy_sock(struct sock *newsk, struct sock *sk,
 		    struct sctp_association *asoc);
 extern struct percpu_counter sctp_sockets_allocated;
+void sctp_add_addr_to_laddr(struct sockaddr *, struct sctp_association *);
 
 /*
  * sctp/primitive.c
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index cc9185c..f6ca775 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1901,6 +1901,14 @@ struct sctp_association {
 	 * after reaching 4294967295.
 	 */
 	__u32 addip_serial;
+	/* list of valid addresses in association local
+	 * This list is needed to ensure base.bind_addr being a valid address
+	 * list of the endpoint-wide.  When one of associations receives
+	 * ASCONF-ACK, that address is added to this list.  When all
+	 * associations belonging to the same endpoint receive ASCONF-ACKs,
+	 * that address is added to base.bind_addr
+	 */
+	struct list_head asoc_laddr_list;
 
 	/* SCTP AUTH: list of the endpoint shared keys.  These
 	 * keys are provided out of band by the user applicaton
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 6b04287..614834f 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -279,6 +279,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	asoc->peer.asconf_capable = 0;
 	if (sctp_addip_noauth)
 		asoc->peer.asconf_capable = 1;
+	INIT_LIST_HEAD(&asoc->asoc_laddr_list);
 
 	/* Create an input queue.  */
 	sctp_inq_init(&asoc->base.inqueue);
@@ -446,6 +447,15 @@ void sctp_association_free(struct sctp_association *asoc)
 	/* Free any cached ASCONF_ACK chunk. */
 	sctp_assoc_free_asconf_acks(asoc);
 
+	if (!list_empty(&asoc->asoc_laddr_list)) {
+		struct sctp_sockaddr_entry *laddr, *tmp;
+		list_for_each_entry_safe(laddr, tmp, &asoc->asoc_laddr_list, \
+		    list) {
+			list_del(&laddr->list);
+			kfree(laddr);
+		}
+	}
+
 	/* Free any cached ASCONF chunk. */
 	if (asoc->addip_last_asconf)
 		sctp_chunk_free(asoc->addip_last_asconf);
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 865ce7b..bbf3152 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -333,6 +333,18 @@ static void sctp_v6_get_saddr(struct sctp_sock *sk,
 			}
 		}
 	}
+	if (baddr == NULL) {
+		/* We don't have a valid src addr in "endpoint-wide".
+		 * Looking up in assoc-locally valid address list.
+		 */
+		list_for_each_entry(laddr, &asoc->asoc_laddr_list, list) {
+			bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a);
+			if (!baddr || (matchlen < bmatchlen)) {
+				baddr = &laddr->a;
+				matchlen = bmatchlen;
+			}
+		}
+	}
 
 	if (baddr) {
 		memcpy(saddr, baddr, sizeof(union sctp_addr));
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 152976e..918a62e 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -516,6 +516,13 @@ static struct dst_entry *sctp_v4_get_dst(struct sctp_association *asoc,
 				goto out_unlock;
 		}
 		rcu_read_unlock();
+		/* We don't have a valid src addr in "endpoint-wide".
+		 * Looking up in assoc-locally valid address list.
+		 */
+		list_for_each_entry(laddr, &asoc->asoc_laddr_list, list) {
+			if (sctp_v4_cmp_addr(&dst_saddr, &laddr->a))
+				goto out_unlock;
+		}
 
 		/* None of the bound addresses match the source address of the
 		 * dst. So release it.
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index de98665..1a44f88 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -3171,7 +3171,6 @@ static void sctp_asconf_param_success(struct sctp_association *asoc,
 	struct sctp_bind_addr *bp = &asoc->base.bind_addr;
 	union sctp_addr_param *addr_param;
 	struct sctp_transport *transport;
-	struct sctp_sockaddr_entry *saddr;
 
 	addr_param = (union sctp_addr_param *)
 			((void *)asconf_param + sizeof(sctp_addip_param_t));
@@ -3186,10 +3185,10 @@ static void sctp_asconf_param_success(struct sctp_association *asoc,
 		 * held, so the list can not change.
 		 */
 		local_bh_disable();
-		list_for_each_entry(saddr, &bp->address_list, list) {
-			if (sctp_cmp_addr_exact(&saddr->a, &addr))
-				saddr->state = SCTP_ADDR_SRC;
-		}
+		/* Until this ASCONF is acked on all associations, we cannot
+		 * consider this address as ADDR_SRC
+		 */
+		sctp_add_addr_to_laddr(&addr.sa, asoc);
 		local_bh_enable();
 		list_for_each_entry(transport, &asoc->peer.transport_addr_list,
 				transports) {
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 3951a10..1a06469 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -807,6 +807,145 @@ out:
 	return retval;
 }
 
+/* Add a new address to the list contains available addresses only in the
+ * association.  If the new address is also available on the other associations
+ * on the endpoint, it is marked as SCTP_ADDR_SRC in the bind address list on
+ * the endpoint.  This situation is possible when some of associations receive
+ * ASCONF-ACK for ADD_IP at the endpoint
+ */
+void
+sctp_add_addr_to_laddr(struct sockaddr *sa, struct sctp_association *asoc)
+{
+	struct sctp_endpoint *ep = asoc->ep;
+	struct sctp_association *tmp = NULL;
+	struct sctp_bind_addr *bp;
+	struct sctp_sockaddr_entry *addr;
+	struct sockaddr_in *sin = NULL;
+	struct sockaddr_in6 *sin6 = NULL;
+	int local;
+	int found;
+
+	union sctp_addr *tmpaddr = NULL;
+	tmpaddr = (union sctp_addr *)sa;
+	SCTP_DEBUG_PRINTK_IPADDR("add_addr_to_laddr: asoc: %p ", " ep: %p",
+	    asoc, tmpaddr, ep);
+	if (sa->sa_family == AF_INET)
+		sin = (struct sockaddr_in *)sa;
+	else if (sa->sa_family == AF_INET6)
+		sin6 = (struct sockaddr_in6 *)sa;
+
+	/* Check if this address is locally available in the other asocs */
+	local = 0;
+	list_for_each_entry(tmp, &ep->asocs, asocs) {
+		if (tmp == asoc)
+			continue;
+		found = 0;
+		list_for_each_entry(addr, &tmp->asoc_laddr_list, list) {
+			tmpaddr = &addr->a;
+			if (sa->sa_family != addr->a.sa.sa_family)
+				continue;
+			if (sa->sa_family == AF_INET) {
+				if (sin->sin_addr.s_addr ==
+				    addr->a.v4.sin_addr.s_addr)
+					found = 1;
+			} else if (sa->sa_family == AF_INET6) {
+				if (ipv6_addr_equal(&sin6->sin6_addr,
+				    &addr->a.v6.sin6_addr))
+					found = 1;
+
+			}
+		}
+		if (!found) {
+			SCTP_DEBUG_PRINTK("add_addr_to_laddr: not found in asoc %p\n", tmp);
+			local = 1;
+			break;
+		}
+	}
+	addr = NULL;
+
+	if (local) {
+		/* this address is not available in some of the other
+		 * associations.  So add as locally-available in this
+		 * asocciation
+		 */
+		addr = kmalloc(sizeof(struct sctp_sockaddr_entry), GFP_ATOMIC);
+		if  (addr == NULL) {
+			SCTP_DEBUG_PRINTK("add_addr_to_laddr: failed to allocate memory for this address\n");
+			return;
+		}
+		memset(addr, 0, sizeof(struct sctp_sockaddr_entry));
+		if (sa->sa_family == AF_INET) {
+			addr->a.sa.sa_family = AF_INET;
+			addr->a.v4.sin_port = sin->sin_port;
+			addr->a.v4.sin_addr.s_addr = sin->sin_addr.s_addr;
+		} else if (sa->sa_family == AF_INET6) {
+			addr->a.sa.sa_family = AF_INET6;
+			addr->a.v6.sin6_port = sin6->sin6_port;
+			memcpy(&addr->a.v6.sin6_addr, &sin6->sin6_addr,
+			    sizeof(struct in6_addr));
+		}
+		list_add_tail(&addr->list, &asoc->asoc_laddr_list);
+		SCTP_DEBUG_PRINTK("add_addr_to_laddr: now we added this address to the local list on asoc %p\n", asoc);
+	} else {
+		/* this address is also available in all other asocs.  So set
+		 * it as ADDR_SRC in the bind-addr list in the endpoint, then
+		 * remove from the asoc_laddr_list on the associations.
+		 */
+		SCTP_DEBUG_PRINTK("add_addr_to_laddr: this address is available in all other asocs\n");
+		bp = &asoc->base.bind_addr;
+
+		/* change state of the new address in the bind list */
+		list_for_each_entry(addr, &bp->address_list, list) {
+			if (addr->state != SCTP_ADDR_NEW)
+				continue;
+			if (addr->a.sa.sa_family != sa->sa_family)
+				continue;
+			if (addr->a.sa.sa_family == AF_INET) {
+				if (sin->sin_port != addr->a.v4.sin_port)
+					continue;
+				if (sin->sin_addr.s_addr !=
+				    addr->a.v4.sin_addr.s_addr)
+					continue;
+			} else if (addr->a.sa.sa_family == AF_INET6) {
+				if (sin6->sin6_port != addr->a.v6.sin6_port)
+					continue;
+				if (!ipv6_addr_equal(&sin6->sin6_addr,
+				    &addr->a.v6.sin6_addr))
+					continue;
+			}
+			SCTP_DEBUG_PRINTK("add_addr_to_laddr: found the entry for this address with ADDR_NEW flag, set to ADDR_SRC\n");
+			addr->state = SCTP_ADDR_SRC;
+		}
+
+		/* remove the entry of this address from the asoc-local list */
+		list_for_each_entry(tmp, &ep->asocs, asocs) {
+			if (tmp == asoc)
+				continue;
+			addr = NULL;
+			list_for_each_entry(addr, &tmp->asoc_laddr_list, list) {
+				if (sa->sa_family != addr->a.sa.sa_family)
+					continue;
+				if (sa->sa_family == AF_INET) {
+					if (sin->sin_addr.s_addr !=
+					    addr->a.v4.sin_addr.s_addr)
+						continue;
+				} else if (sa->sa_family == AF_INET6) {
+					if (!ipv6_addr_equal(&sin6->sin6_addr,
+					    &addr->a.v6.sin6_addr))
+						continue;
+				}
+				break;
+			}
+			if (addr == NULL) {
+				SCTP_DEBUG_PRINTK("add_addr_to_laddr: Huh, asoc %p doesn't have the entry for this address?\n", asoc);
+				continue;
+			}
+			list_del(&addr->list);
+			kfree(addr);
+		}
+	}
+}
+
 /* Helper for tunneling sctp_bindx() requests through sctp_setsockopt()
  *
  * API 8.1


^ permalink raw reply related

* [PATCH net-next-2.6 v2 1/2] sctp: Add ASCONF operation on the single-homed host
From: Michio Honda @ 2011-04-07  9:02 UTC (permalink / raw)
  To: netdev; +Cc: lksctp-developers

SCTP can change the IP address on a single-homed host.
In this case, the SCTP association transmits an ASCONF packet that includes both the addition of the new IP address and the deletion of the old address.
This patch implements this functionality.
(This patch is part of a larger patch that supports complete auto_asconf.)

Signed-off-by: Michio Honda <micchie@sfc.wide.ad.jp>
---
diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index c70d8cc..d7a4ee3 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -441,4 +441,8 @@ enum {
  */
 #define SCTP_AUTH_RANDOM_LENGTH 32
 
+/* ASCONF PARAMETERS */
+#define SCTP_ASCONF_V4_PARAM_LEN 16
+#define SCTP_ASCONF_V6_PARAM_LEN 28
+
 #endif /* __sctp_constants_h__ */
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 9352d12..498a3cf 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -259,6 +259,7 @@ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc,
 				       struct sctp_chunk *asconf);
 int sctp_process_asconf_ack(struct sctp_association *asoc,
 			    struct sctp_chunk *asconf_ack);
+void sctp_path_check_and_react(struct sctp_association *, struct sockaddr *);
 struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc,
 				    __u32 new_cum_tsn, size_t nstreams,
 				    struct sctp_fwdtsn_skip *skiplist);
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index cc9185c..400ee8a 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1901,6 +1901,9 @@ struct sctp_association {
 	 * after reaching 4294967295.
 	 */
 	__u32 addip_serial;
+	union sctp_addr *asconf_addr_del_pending;
+	__u32 asconf_del_pending_cid;
+	int src_out_of_asoc_ok;
 
 	/* SCTP AUTH: list of the endpoint shared keys.  These
 	 * keys are provided out of band by the user applicaton
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 6b04287..2dfd0e8 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -279,6 +279,9 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	asoc->peer.asconf_capable = 0;
 	if (sctp_addip_noauth)
 		asoc->peer.asconf_capable = 1;
+	asoc->asconf_addr_del_pending = NULL;
+	asoc->asconf_del_pending_cid = 0;
+	asoc->src_out_of_asoc_ok = 0;
 
 	/* Create an input queue.  */
 	sctp_inq_init(&asoc->base.inqueue);
@@ -443,6 +446,10 @@ void sctp_association_free(struct sctp_association *asoc)
 
 	asoc->peer.transport_count = 0;
 
+	/* Free pending address space being deleted */
+	if (asoc->asconf_addr_del_pending != NULL)
+		kfree(asoc->asconf_addr_del_pending);
+
 	/* Free any cached ASCONF_ACK chunk. */
 	sctp_assoc_free_asconf_acks(asoc);
 
@@ -1277,7 +1284,7 @@ void sctp_assoc_update(struct sctp_association *asoc,
  */
 void sctp_assoc_update_retran_path(struct sctp_association *asoc)
 {
-	struct sctp_transport *t, *next;
+	struct sctp_transport *t, *next, *unconfirmed;
 	struct list_head *head = &asoc->peer.transport_addr_list;
 	struct list_head *pos;
 
@@ -1287,7 +1294,7 @@ void sctp_assoc_update_retran_path(struct sctp_association *asoc)
 	/* Find the next transport in a round-robin fashion. */
 	t = asoc->peer.retran_path;
 	pos = &t->transports;
-	next = NULL;
+	next = unconfirmed = NULL;
 
 	while (1) {
 		/* Skip the head. */
@@ -1318,11 +1325,15 @@ void sctp_assoc_update_retran_path(struct sctp_association *asoc)
 			 */
 			if (t->state != SCTP_UNCONFIRMED && !next)
 				next = t;
+			else if (t->state == SCTP_UNCONFIRMED)
+				unconfirmed = t;
 		}
 	}
 
 	if (t)
 		asoc->peer.retran_path = t;
+	else if (unconfirmed)
+		asoc->peer.retran_path = t = unconfirmed;
 
 	SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_update_retran_path:association"
 				 " %p addr: ",
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 865ce7b..56c97ce 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -332,6 +332,13 @@ static void sctp_v6_get_saddr(struct sctp_sock *sk,
 				matchlen = bmatchlen;
 			}
 		}
+		if (laddr->state == SCTP_ADDR_NEW && asoc->src_out_of_asoc_ok) {
+			bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a);
+			if (!baddr || (matchlen < bmatchlen)) {
+				baddr = &laddr->a;
+				matchlen = bmatchlen;
+			}
+		}
 	}
 
 	if (baddr) {
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 26dc005..033ea20 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -344,7 +344,14 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk)
 			break;
 		}
 	} else {
-		list_add_tail(&chunk->list, &q->control_chunk_list);
+		/* We add the ASCONF for the only one newly added address at
+		 * the front of the queue
+		 */
+		if (q->asoc->src_out_of_asoc_ok && \
+		    chunk->chunk_hdr->type == SCTP_CID_ASCONF)
+			list_add(&chunk->list, &q->control_chunk_list);
+		else
+			list_add_tail(&chunk->list, &q->control_chunk_list);
 		SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
 	}
 
@@ -850,6 +857,24 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 		case SCTP_CID_SHUTDOWN:
 		case SCTP_CID_ECN_ECNE:
 		case SCTP_CID_ASCONF:
+			/* RFC 5061, 5.3
+			 * F1) This means that until such time as the ASCONF
+			 * containing the add is acknowledged, the sender MUST
+			 * NOT use the new IP address as a source for ANY SCTP
+			 * packet except on carrying an ASCONF Chunk.
+			 */
+			if (asoc->src_out_of_asoc_ok) {
+				SCTP_DEBUG_PRINTK("outq_flush: out_of_asoc_ok, transmit chunk type %d\n",
+				    chunk->chunk_hdr->type);
+				packet = &transport->packet;
+				sctp_packet_config(packet, vtag,
+						asoc->peer.ecn_capable);
+				sctp_packet_append_chunk(packet, chunk);
+				error = sctp_packet_transmit(packet);
+				if (error < 0)
+					return error;
+				goto sctp_flush_out;
+			}
 		case SCTP_CID_FWD_TSN:
 			status = sctp_packet_transmit_chunk(packet, chunk,
 							    one_packet);
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 152976e..b8ec3cc 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -510,7 +510,8 @@ static struct dst_entry *sctp_v4_get_dst(struct sctp_association *asoc,
 		sctp_v4_dst_saddr(&dst_saddr, dst, htons(bp->port));
 		rcu_read_lock();
 		list_for_each_entry_rcu(laddr, &bp->address_list, list) {
-			if (!laddr->valid || (laddr->state != SCTP_ADDR_SRC))
+			if (!laddr->valid || (laddr->state != SCTP_ADDR_SRC &&
+			    asoc->src_out_of_asoc_ok == 0))
 				continue;
 			if (sctp_v4_cmp_addr(&dst_saddr, &laddr->a))
 				goto out_unlock;
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index de98665..5a085b3 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2651,6 +2651,61 @@ __u32 sctp_generate_tsn(const struct sctp_endpoint *ep)
 	return retval;
 }
 
+void
+sctp_path_check_and_react(struct sctp_association *asoc, struct sockaddr *sa)
+{
+	struct sctp_transport *trans;
+	int addrnum, family;
+	struct sctp_sockaddr_entry *saddr;
+	struct sctp_bind_addr *bp;
+	union sctp_addr *tmpaddr;
+
+	family = sa->sa_family;
+	bp = &asoc->base.bind_addr;
+	addrnum = 0;
+	/* count up the number of local addresses in the same family */
+	list_for_each_entry(saddr, &bp->address_list, list) {
+		if (saddr->a.sa.sa_family == family) {
+			tmpaddr = &saddr->a;
+			if (family == AF_INET6 &&
+			    ipv6_addr_type(&tmpaddr->v6.sin6_addr) &
+			    IPV6_ADDR_LINKLOCAL) {
+				continue;
+			}
+			addrnum++;
+		}
+	}
+	if (addrnum == 1) {
+		union sctp_addr *tmpaddr;
+		tmpaddr = (union sctp_addr *)sa;
+		SCTP_DEBUG_PRINTK_IPADDR("pcheck_react: only 1 local addr in asoc %p ",
+		    " family %d\n", asoc, tmpaddr, family);
+		list_for_each_entry(trans, &asoc->peer.transport_addr_list,
+		    transports) {
+			/* reset path information and release refcount to the
+			 * dst_entry  based on the src change */
+			sctp_transport_hold(trans);
+			trans->cwnd = min(4*asoc->pathmtu,
+			    max_t(__u32, 2*asoc->pathmtu, 4380));
+			trans->ssthresh = asoc->peer.i.a_rwnd;
+			trans->rtt = 0;
+			trans->srtt = 0;
+			trans->rttvar = 0;
+			trans->rto = asoc->rto_initial;
+			dst_release(trans->dst);
+			trans->dst = NULL;
+			memset(&trans->saddr, 0, sizeof(union sctp_addr));
+			sctp_transport_route(trans, NULL,
+			    sctp_sk(asoc->base.sk));
+			SCTP_DEBUG_PRINTK_IPADDR("we freed dst_entry (asoc: %p dst: ",
+			    " trans: %p)\n", asoc, (&trans->ipaddr), trans);
+			trans->rto_pending = 1;
+			sctp_transport_put(trans);
+		}
+	}
+	return;
+}
+
 /*
  * ADDIP 3.1.1 Address Configuration Change Chunk (ASCONF)
  *      0                   1                   2                   3
@@ -2744,11 +2799,29 @@ struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc,
 	int			addr_param_len = 0;
 	int 			totallen = 0;
 	int 			i;
+	sctp_addip_param_t del_param; /* 8 Bytes (Type 0xC002, Len and CrrID) */
+	sctp_addip_param_t spr_param;
+	struct sctp_af *del_af;
+	struct sctp_af *spr_af;
+	int del_addr_param_len = 0;
+	int spr_addr_param_len = 0;
+	int del_paramlen = sizeof(sctp_addip_param_t);
+	int spr_paramlen = sizeof(sctp_addip_param_t);
+	union sctp_addr_param del_addr_param; /* (v4) 8 Bytes, (v6) 20 Bytes */
+	union sctp_addr_param spr_addr_param;
+	int			v4 = 0;
+	int			v6 = 0;
 
 	/* Get total length of all the address parameters. */
 	addr_buf = addrs;
 	for (i = 0; i < addrcnt; i++) {
 		addr = (union sctp_addr *)addr_buf;
+		if (addr != NULL) {
+			if (addr->sa.sa_family == AF_INET)
+				v4 = 1;
+			else if (addr->sa.sa_family == AF_INET6)
+				v6 = 1;
+		}
 		af = sctp_get_af_specific(addr->v4.sin_family);
 		addr_param_len = af->to_addr_param(addr, &addr_param);
 
@@ -2757,6 +2830,40 @@ struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc,
 
 		addr_buf += af->sockaddr_len;
 	}
+	/* Add the length of a pending address being deleted */
+	if (flags == SCTP_PARAM_ADD_IP && asoc->asconf_addr_del_pending) {
+		if ((asoc->asconf_addr_del_pending->sa.sa_family == AF_INET
+		    && v4) ||
+		    (asoc->asconf_addr_del_pending->sa.sa_family == AF_INET6
+		    && v6)) {
+			del_af = sctp_get_af_specific(
+			    asoc->asconf_addr_del_pending->sa.sa_family);
+			del_addr_param_len = del_af->to_addr_param(
+			    asoc->asconf_addr_del_pending, &del_addr_param);
+			totallen += del_paramlen;
+			totallen += del_addr_param_len;
+			SCTP_DEBUG_PRINTK("mkasconf_update_ip: now we picked del_pending addr, totallen for all addresses is %d\n",
+			    totallen);
+			/* for Set Primary (equal size as del parameters */
+			totallen += del_paramlen;
+			totallen += del_addr_param_len;
+		}
+		if (v4) {
+			if (totallen != SCTP_ASCONF_V4_PARAM_LEN * 2 &&
+			    totallen != SCTP_ASCONF_V4_PARAM_LEN * 3) {
+				SCTP_DEBUG_PRINTK("mkasconf_update_ip: incorrect total length of ASCONF parameters, del + add MUST be 32 bytes, but %d bytes\n", totallen);
+			return NULL;
+			}
+		} else if (v6) {
+			if (totallen != SCTP_ASCONF_V6_PARAM_LEN * 2 &&
+			    totallen != SCTP_ASCONF_V6_PARAM_LEN * 3) {
+				SCTP_DEBUG_PRINTK("mkasconf_update_ip: incorrect total length of ASCONF parameters, del + add MUST be 56 bytes, but %d bytes\n", totallen);
+			return NULL;
+			}
+		}
+	}
+	SCTP_DEBUG_PRINTK("mkasconf_update_ip: call mkasconf() for %d bytes\n",
+	    totallen);
 
 	/* Create an asconf chunk with the required length. */
 	retval = sctp_make_asconf(asoc, laddr, totallen);
@@ -2778,6 +2885,32 @@ struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc,
 
 		addr_buf += af->sockaddr_len;
 	}
+	if (flags == SCTP_PARAM_ADD_IP && asoc->asconf_addr_del_pending) {
+		addr = asoc->asconf_addr_del_pending;
+		del_af = sctp_get_af_specific(addr->v4.sin_family);
+		del_addr_param_len = del_af->to_addr_param(addr,
+		    &del_addr_param);
+		del_param.param_hdr.type = SCTP_PARAM_DEL_IP;
+		del_param.param_hdr.length = htons(del_paramlen +
+		    del_addr_param_len);
+		del_param.crr_id = i;
+		asoc->asconf_del_pending_cid = i;
+
+		sctp_addto_chunk(retval, del_paramlen, &del_param);
+		sctp_addto_chunk(retval, del_addr_param_len, &del_addr_param);
+		/* For SET_PRIMARY */
+		addr_buf = addrs;
+		addr = (union sctp_addr *)addr_buf;
+		spr_af = sctp_get_af_specific(addr->v4.sin_family);
+		spr_addr_param_len = spr_af->to_addr_param(addr,
+		    &spr_addr_param);
+		spr_param.param_hdr.type = SCTP_PARAM_SET_PRIMARY;
+		spr_param.param_hdr.length = htons(spr_paramlen +
+		    spr_addr_param_len);
+		spr_param.crr_id = (i+1);
+		sctp_addto_chunk(retval, spr_paramlen, &spr_param);
+		sctp_addto_chunk(retval, spr_addr_param_len, &spr_addr_param);
+	}
 	return retval;
 }
 
@@ -2990,7 +3123,7 @@ static __be16 sctp_process_asconf_param(struct sctp_association *asoc,
 		 * an Error Cause TLV set to the new error code 'Request to
 		 * Delete Source IP Address'
 		 */
-		if (sctp_cmp_addr_exact(sctp_source(asconf), &addr))
+		if (sctp_cmp_addr_exact(&asconf->source, &addr))
 			return SCTP_ERROR_DEL_SRC_IP;
 
 		/* Section 4.2.2
@@ -3193,16 +3326,37 @@ static void sctp_asconf_param_success(struct sctp_association *asoc,
 		local_bh_enable();
 		list_for_each_entry(transport, &asoc->peer.transport_addr_list,
 				transports) {
-			if (transport->state == SCTP_ACTIVE)
+			if (transport->state == SCTP_ACTIVE &&
+			    !asoc->src_out_of_asoc_ok)
 				continue;
 			dst_release(transport->dst);
 			sctp_transport_route(transport, NULL,
 					     sctp_sk(asoc->base.sk));
 		}
+		asoc->src_out_of_asoc_ok = 0;
 		break;
 	case SCTP_PARAM_DEL_IP:
 		local_bh_disable();
 		sctp_del_bind_addr(bp, &addr);
+		if (asoc->asconf_addr_del_pending != NULL) {
+			if ((addr.sa.sa_family == AF_INET) &&
+			    (asoc->asconf_addr_del_pending->sa.sa_family ==
+			     AF_INET)) {
+				if (asoc->asconf_addr_del_pending->v4.sin_addr.s_addr == addr.v4.sin_addr.s_addr) {
+					kfree(asoc->asconf_addr_del_pending);
+					asoc->asconf_del_pending_cid = 0;
+					asoc->asconf_addr_del_pending = NULL;
+				}
+			} else if ((addr.sa.sa_family == AF_INET6) &&
+				(asoc->asconf_addr_del_pending->sa.sa_family ==
+				 AF_INET6)) {
+				if (ipv6_addr_equal(&asoc->asconf_addr_del_pending->v6.sin6_addr, &addr.v6.sin6_addr)) {
+					kfree(asoc->asconf_addr_del_pending);
+					asoc->asconf_del_pending_cid = 0;
+					asoc->asconf_addr_del_pending = NULL;
+				}
+			}
+		}
 		local_bh_enable();
 		list_for_each_entry(transport, &asoc->peer.transport_addr_list,
 				transports) {
@@ -3293,6 +3447,8 @@ int sctp_process_asconf_ack(struct sctp_association *asoc,
 	int	no_err = 1;
 	int	retval = 0;
 	__be16	err_code = SCTP_ERROR_NO_ERROR;
+	sctp_addip_param_t *first_asconf_param = NULL;
+	int first_asconf_paramlen;
 
 	/* Skip the chunkhdr and addiphdr from the last asconf sent and store
 	 * a pointer to address parameter.
@@ -3307,6 +3463,8 @@ int sctp_process_asconf_ack(struct sctp_association *asoc,
 	length = ntohs(addr_param->v4.param_hdr.length);
 	asconf_param = (sctp_addip_param_t *)((void *)addr_param + length);
 	asconf_len -= length;
+	first_asconf_param = asconf_param;
+	first_asconf_paramlen = ntohs(first_asconf_param->param_hdr.length);
 
 	/* ADDIP 4.1
 	 * A8) If there is no response(s) to specific TLV parameter(s), and no
@@ -3361,6 +3519,35 @@ int sctp_process_asconf_ack(struct sctp_association *asoc,
 		asconf_len -= length;
 	}
 
+	/* When the source address obviously changes to newly added one, we
+	   reset the cwnd to re-probe the path condition
+	*/
+	if (no_err && first_asconf_param->param_hdr.type == SCTP_PARAM_ADD_IP) {
+		if (first_asconf_paramlen == SCTP_ASCONF_V4_PARAM_LEN) {
+			struct sockaddr_in sin;
+
+			memset(&sin, 0, sizeof(struct sockaddr_in));
+			sin.sin_family = AF_INET;
+			memcpy(&sin.sin_addr.s_addr, first_asconf_param + 1,
+					sizeof(struct in_addr));
+			sctp_path_check_and_react(asoc,
+					(struct sockaddr *)&sin);
+
+		} else if (first_asconf_paramlen == SCTP_ASCONF_V6_PARAM_LEN) {
+			struct sockaddr_in6 sin6;
+
+			memset(&sin6, 0, sizeof(struct sockaddr_in6));
+			sin6.sin6_family = AF_INET6;
+			memcpy(&sin6.sin6_addr, first_asconf_param + 1,
+					sizeof(struct in6_addr));
+			sctp_path_check_and_react(asoc,
+					(struct sockaddr *)&sin6);
+		} else {
+			SCTP_DEBUG_PRINTK("funny asconf_paramlen? (%d)\n",
+			    first_asconf_paramlen);
+		}
+	}
+
 	/* Free the cached last sent asconf chunk. */
 	list_del_init(&asconf->transmitted_list);
 	sctp_chunk_free(asconf);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 3951a10..2bfe2a9 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -527,6 +527,7 @@ static int sctp_send_asconf_add_ip(struct sock		*sk,
 	struct list_head		*p;
 	int 				i;
 	int 				retval = 0;
+	struct sctp_transport		*trans = NULL;
 
 	if (!sctp_addip_enable)
 		return retval;
@@ -583,13 +584,10 @@ static int sctp_send_asconf_add_ip(struct sock		*sk,
 			goto out;
 		}
 
-		retval = sctp_send_asconf(asoc, chunk);
-		if (retval)
-			goto out;
-
 		/* Add the new addresses to the bind address list with
 		 * use_as_src set to 0.
 		 */
+		SCTP_DEBUG_PRINTK("snd_asconf_addip: next, add_bind_addr with ADDR_NEW flag\n");
 		addr_buf = addrs;
 		for (i = 0; i < addrcnt; i++) {
 			addr = (union sctp_addr *)addr_buf;
@@ -599,6 +597,28 @@ static int sctp_send_asconf_add_ip(struct sock		*sk,
 						    SCTP_ADDR_NEW, GFP_ATOMIC);
 			addr_buf += af->sockaddr_len;
 		}
+		list_for_each_entry(trans, &asoc->peer.transport_addr_list,
+		    transports) {
+			if (asoc->asconf_addr_del_pending != NULL)
+				/* This ADDIP ASCONF piggybacks DELIP for the
+				 * last address, so need to select src addr
+				 * from the out_of_asoc addrs
+				 */
+				asoc->src_out_of_asoc_ok = 1;
+			/* Clear the source and route cache in the path */
+			memset(&trans->saddr, 0, sizeof(union sctp_addr));
+			dst_release(trans->dst);
+			trans->cwnd = min(4*asoc->pathmtu, max_t(__u32,
+			    2*asoc->pathmtu, 4380));
+			trans->ssthresh = asoc->peer.i.a_rwnd;
+			trans->rto = asoc->rto_initial;
+			trans->rtt = 0;
+			trans->srtt = 0;
+			trans->rttvar = 0;
+			sctp_transport_route(trans, NULL,
+			    sctp_sk(asoc->base.sk));
+		}
+		retval = sctp_send_asconf(asoc, chunk);
 	}
 
 out:
@@ -711,7 +731,9 @@ static int sctp_send_asconf_del_ip(struct sock		*sk,
 	struct sctp_sockaddr_entry *saddr;
 	int 			i;
 	int 			retval = 0;
+	int			stored = 0;
 
+	chunk = NULL;
 	if (!sctp_addip_enable)
 		return retval;
 
@@ -762,8 +784,42 @@ static int sctp_send_asconf_del_ip(struct sock		*sk,
 		bp = &asoc->base.bind_addr;
 		laddr = sctp_find_unmatch_addr(bp, (union sctp_addr *)addrs,
 					       addrcnt, sp);
-		if (!laddr)
-			continue;
+		if ((laddr == NULL) && (addrcnt == 1)) {
+			union sctp_addr *sa_addr = NULL;
+
+			if (asoc->asconf_addr_del_pending == NULL) {
+				asoc->asconf_addr_del_pending =
+				    kmalloc(sizeof(union sctp_addr),
+				    GFP_ATOMIC);
+				memset(asoc->asconf_addr_del_pending, 0,
+						sizeof(union sctp_addr));
+				if (addrs->sa_family == AF_INET) {
+					struct sockaddr_in *sin;
+
+					sin = (struct sockaddr_in *)addrs;
+					asoc->asconf_addr_del_pending->v4.sin_family = AF_INET;
+					memcpy(&asoc->asconf_addr_del_pending->v4.sin_addr, &sin->sin_addr, sizeof(struct in_addr));
+				} else if (addrs->sa_family == AF_INET6) {
+					struct sockaddr_in6 *sin6;
+
+					sin6 = (struct sockaddr_in6 *)addrs;
+					asoc->asconf_addr_del_pending->v6.sin6_family = AF_INET6;
+					memcpy(&asoc->asconf_addr_del_pending->v6.sin6_addr, &sin6->sin6_addr, sizeof(struct in6_addr));
+				}
+				sa_addr = (union sctp_addr *)addrs;
+				SCTP_DEBUG_PRINTK_IPADDR("send_asconf_del_ip: keep the last address asoc: %p ",
+				    " at %p\n", asoc, sa_addr,
+				    asoc->asconf_addr_del_pending);
+				stored = 1;
+				goto skip_mkasconf;
+			} else {
+				SCTP_DEBUG_PRINTK_IPADDR("send_asconf_del_ip: asoc %p, deleting last address ",
+				    " is already stored at %p\n", asoc,
+				    asoc->asconf_addr_del_pending,
+				    asoc->asconf_addr_del_pending);
+				continue;
+			}
+		}
 
 		/* We do not need RCU protection throughout this loop
 		 * because this is done under a socket lock from the
@@ -776,6 +832,7 @@ static int sctp_send_asconf_del_ip(struct sock		*sk,
 			goto out;
 		}
 
+skip_mkasconf:
 		/* Reset use_as_src flag for the addresses in the bind address
 		 * list that are to be deleted.
 		 */
@@ -797,10 +854,16 @@ static int sctp_send_asconf_del_ip(struct sock		*sk,
 		list_for_each_entry(transport, &asoc->peer.transport_addr_list,
 					transports) {
 			dst_release(transport->dst);
+			/* Clear source address cache */
+			memset(&transport->saddr, 0, sizeof(union sctp_addr));
 			sctp_transport_route(transport, NULL,
 					     sctp_sk(asoc->base.sk));
 		}
 
+		if (stored) {
+			/* We don't need to transmit ASCONF */
+			continue;
+		}
 		retval = sctp_send_asconf(asoc, chunk);
 	}
 out:


^ permalink raw reply related

* RE: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel
From: Wei Gu @ 2011-04-07  8:39 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, Alexander Duyck, Jeff Kirsher
In-Reply-To: <1302163650.3357.8.camel@edumazet-laptop>

I'm only inserting a prerouting hook that makes a copy of the incoming packet, swaps the L2/L3 headers, and sends it back on the same interface.

BTW, sometimes I notice that the perf tool does not map the symbols correctly; I don't know why.

I will try a fresh install of kernel 2.6.30 and do the test with the shipped ixgbe driver again.


-----Original Message-----
From: Eric Dumazet [mailto:eric.dumazet@gmail.com]
Sent: Thursday, April 07, 2011 4:08 PM
To: Wei Gu
Cc: netdev; Alexander Duyck; Jeff Kirsher
Subject: Re: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel

Le jeudi 07 avril 2011 à 15:22 +0800, Wei Gu a écrit :
> Hi guys,
> As I discussed with Eric, I get very low performance on the Linux 2.6.38 kernel with the Intel ixgbe-3.2.10 driver.
> I tested different rx buffer sizes on the Intel 10G NIC by setting ethtool -G rx 4096.
> I get the lowest performance (~50Kpps Rx&Tx) when setting rx==4096.
> Once I decrease rx to 512 (default), I can get at most 250Kpps Rx&Tx on 1 NIC.
>
> I was running this test on an HP DL580 with 4 CPU sockets and a full memory configuration.
> modprobe ixgbe RSS=8,8,8,8,8,8,8,8 FdirMode=0,0,0,0,0,0,0,0
> Node=0,0,1,1,2,2,3,3 Numactrl --hardware
> available: 4 nodes (0-3)
> node 0 cpus: 0 1 2 3 4 5 6 7 32 33 34 35 36 37 38 39 node 0 size:
> 65525 MB node 0 free: 63053 MB node 1 cpus: 8 9 10 11 12 13 14 15 40
> 41 42 43 44 45 46 47 node 1 size: 65536 MB node 1 free: 63388 MB node
> 2 cpus: 16 17 18 19 20 21 22 23 48 49 50 51 52 53 54 55 node 2 size:
> 65536 MB node 2 free: 63344 MB node 3 cpus: 24 25 26 27 28 29 30 31 56
> 57 58 59 60 61 62 63 node 3 size: 65535 MB node 3 free: 63376 MB
>
> Then I bound eth10's rx and tx IRQs to cores "24 25 26 27 28 29 30 31", one by one, which means 1 rx and 1 tx queue share 1 core.
>
>
> I did the same test on 2.6.32 kernel, I can get >2.5M tx&rx with the
> same setup on RHEL6(2.6.32) Linux. But never reach 10.000.000 rx&tx on
> a single NIC:)
>
> I also tested the Intel ixgbe driver shipped with 2.6.38. It has the same problem.
>
> This is a perf record with linux shipped ixgbe driver, looks it has a
> very high irq/s rate. And the softirq was busy on alloc_iova
>
>
> PerfTop:  512417 irqs/sec  kernel:91.3%  exact:  0.0% [1000Hz
> cpu-clock-msecs],  (all, 64 CPUs)
> ------------------------------------------------------------------------------------------------------------------------------------------------------
> -      0.82%     ksoftirqd/24  [kernel.kallsyms]          [k] _raw_spin_unlock_irqrestore
> \u2592   - _raw_spin_unlock_irqrestore
> \u2592      - 44.27% alloc_iova
> \u2592           intel_alloc_iova
> \u2592           __intel_map_single
> \u2592           intel_map_page
> \u2592         - ixgbe_init_interrupt_scheme
> \u2592            - 59.97% ixgbe_alloc_rx_buffers
> \u2592                 ixgbe_clean_rx_irq
> \u2592                 0xffffffffa033a5
> \u2592                 net_rx_action
> \u2592                 __do_softirq
> \u2592               + call_softirq
> \u2592            - 40.03% ixgbe_change_mtu
> \u2592                 ixgbe_change_mtu
> \u2592                 dev_hard_start_xmit
> \u2592                 sch_direct_xmit
> \u2592                 dev_queue_xmit
> \u2592                 vlan_dev_hard_start_xmit
> \u2592                 hook_func
> \u2592                 nf_iterate
> \u2592                nf_hook_slow
> \u2592                 NF_HOOK.clone.1
> \u2592                 ip_rcv
> \u2592                 __netif_receive_skb
> \u2592                 __netif_receive_skb
> \u2592                 netif_receive_skb
> \u2592                 napi_skb_finish
> \u2592                 napi_gro_receive
> \u2592                 ixgbe_clean_rx_irq
> \u2592                 0xffffffffa033a5
> \u2592                 net_rx_action
> \u2592                 __do_softirq
> \u2592               + call_softirq
> \u2592      + 35.85% find_iova
> \u2592      + 19.44% add_unmap
>
>
> Thanks
> WeiGu

What about using the driver as provided in 2.6.38 ?

No custom module parameter, only play with irq affinities

Say you have 64 queues but want only 8 cpus (24 -> 31) receiving traffic

for i in `seq 0 7`
do
 echo 01000000 >/proc/irq/*/eth1-fp-$i/../smp_affinity
done

for i in `seq 8 15`
do
 echo 02000000 >/proc/irq/*/eth1-fp-$i/../smp_affinity
done

...

for i in `seq 56 63`
do
 echo 80000000 >/proc/irq/*/eth1-fp-$i/../smp_affinity
done


Why is ixgbe_change_mtu() seen in your profile?
It's damn expensive, since it must call ixgbe_reinit_locked().

Are you using custom code in the kernel?




^ permalink raw reply

* Re: problem of "ipv4: revert Set rt->rt_iif more sanely on output routes."
From: OGAWA Hirofumi @ 2011-04-07  8:32 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <8739lusbvz.fsf@devron.myhome.or.jp>

OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> writes:

> OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> writes:
>
>> David Miller <davem@davemloft.net> writes:
>>
>>> So fix is something like:
>>>
>>> 1) Add "int rt_route_iif;" to struct rtable
>>>
>>> 2) For input routes, always set rt_route_iif to same value as rt_iif
>>>
>>> 3) For output routes, always set rt_route_iif to zero.  Set rt_iif
>>>    as it is done currently.
>>>
>>> 4) Change rt_is_{output,input}_route() to test rt_route_iif
>>>
>>> This should fix the bug and not introduce new regressions.
>>>
>>> Can you write and test such a patch with your test case?
>>
>> Ok. I'll try, but I'm not sure I understand the above correctly. Well,
>> I'll send the patch after testing.
>
> This patch seems to work for avahi-daemon without any warning.
>
> BTW, did the above mean changing (what used to be) "fl.iif" to
> "rt_route_iif"? If so, this patch is not enough. I'm not sure whether
>
> +	rth->rt_route_iif = 0;
> +	rth->rt_iif	= oldflp4->flowi4_oif ? : dev_out->ifindex;
>
> is the correct one or not. Please review.

Forgot the patch.
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>


[PATCH] Fix  "Set rt->rt_iif more sanely on output routes."


1018b5c01636c7c6bda31a719bda34fc631db29a breaks rt_is_{output,input}_route.

As a result, IP_PKTINFO's ->ipi_ifindex was returned as 0.

To fix it, this patch does the following:

1) Add "int rt_route_iif;" to struct rtable

2) For input routes, always set rt_route_iif to same value as rt_iif

3) For output routes, always set rt_route_iif to zero.  Set rt_iif
   as it is done currently.

4) Change rt_is_{output,input}_route() to test rt_route_iif

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
---

 include/net/route.h     |    5 +++--
 net/ipv4/route.c        |    8 ++++++--
 net/ipv4/xfrm4_policy.c |    1 +
 3 files changed, 10 insertions(+), 4 deletions(-)

diff -puN include/net/route.h~revert-avahi-breaker include/net/route.h
--- linux-2.6/include/net/route.h~revert-avahi-breaker	2011-04-07 17:12:05.000000000 +0900
+++ linux-2.6-hirofumi/include/net/route.h	2011-04-07 17:12:05.000000000 +0900
@@ -64,6 +64,7 @@ struct rtable {
 
 	__be32			rt_dst;	/* Path destination	*/
 	__be32			rt_src;	/* Path source		*/
+	int			rt_route_iif;
 	int			rt_iif;
 	int			rt_oif;
 	__u32			rt_mark;
@@ -80,12 +81,12 @@ struct rtable {
 
 static inline bool rt_is_input_route(struct rtable *rt)
 {
-	return rt->rt_iif != 0;
+	return rt->rt_route_iif != 0;
 }
 
 static inline bool rt_is_output_route(struct rtable *rt)
 {
-	return rt->rt_iif == 0;
+	return rt->rt_route_iif == 0;
 }
 
 struct ip_rt_acct {
diff -puN net/ipv4/route.c~revert-avahi-breaker net/ipv4/route.c
--- linux-2.6/net/ipv4/route.c~revert-avahi-breaker	2011-04-07 17:12:05.000000000 +0900
+++ linux-2.6-hirofumi/net/ipv4/route.c	2011-04-07 17:12:05.000000000 +0900
@@ -1891,6 +1891,7 @@ static int ip_route_input_mc(struct sk_b
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	rth->dst.tclassid = itag;
 #endif
+	rth->rt_route_iif = dev->ifindex;
 	rth->rt_iif	= dev->ifindex;
 	rth->dst.dev	= init_net.loopback_dev;
 	dev_hold(rth->dst.dev);
@@ -2026,6 +2027,7 @@ static int __mkroute_input(struct sk_buf
 	rth->rt_key_src	= saddr;
 	rth->rt_src	= saddr;
 	rth->rt_gateway	= daddr;
+	rth->rt_route_iif = in_dev->dev->ifindex;
 	rth->rt_iif 	= in_dev->dev->ifindex;
 	rth->dst.dev	= (out_dev)->dev;
 	dev_hold(rth->dst.dev);
@@ -2202,6 +2204,7 @@ local_input:
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	rth->dst.tclassid = itag;
 #endif
+	rth->rt_route_iif = dev->ifindex;
 	rth->rt_iif	= dev->ifindex;
 	rth->dst.dev	= net->loopback_dev;
 	dev_hold(rth->dst.dev);
@@ -2401,7 +2404,8 @@ static struct rtable *__mkroute_output(c
 	rth->rt_mark    = oldflp4->flowi4_mark;
 	rth->rt_dst	= fl4->daddr;
 	rth->rt_src	= fl4->saddr;
-	rth->rt_iif	= 0;
+	rth->rt_route_iif = 0;
+	rth->rt_iif	= oldflp4->flowi4_oif ? : dev_out->ifindex;
 	/* get references to the devices that are to be hold by the routing
 	   cache entry */
 	rth->dst.dev	= dev_out;
@@ -2716,6 +2720,7 @@ struct dst_entry *ipv4_blackhole_route(s
 		rt->rt_key_dst = ort->rt_key_dst;
 		rt->rt_key_src = ort->rt_key_src;
 		rt->rt_tos = ort->rt_tos;
+		rt->rt_route_iif = ort->rt_route_iif;
 		rt->rt_iif = ort->rt_iif;
 		rt->rt_oif = ort->rt_oif;
 		rt->rt_mark = ort->rt_mark;
@@ -2725,7 +2730,6 @@ struct dst_entry *ipv4_blackhole_route(s
 		rt->rt_type = ort->rt_type;
 		rt->rt_dst = ort->rt_dst;
 		rt->rt_src = ort->rt_src;
-		rt->rt_iif = ort->rt_iif;
 		rt->rt_gateway = ort->rt_gateway;
 		rt->rt_spec_dst = ort->rt_spec_dst;
 		rt->peer = ort->peer;
diff -puN net/ipv4/xfrm4_policy.c~revert-avahi-breaker net/ipv4/xfrm4_policy.c
--- linux-2.6/net/ipv4/xfrm4_policy.c~revert-avahi-breaker	2011-04-07 17:12:05.000000000 +0900
+++ linux-2.6-hirofumi/net/ipv4/xfrm4_policy.c	2011-04-07 17:12:05.000000000 +0900
@@ -74,6 +74,7 @@ static int xfrm4_fill_dst(struct xfrm_ds
 	rt->rt_key_dst = fl4->daddr;
 	rt->rt_key_src = fl4->saddr;
 	rt->rt_tos = fl4->flowi4_tos;
+	rt->rt_route_iif = fl4->flowi4_iif;
 	rt->rt_iif = fl4->flowi4_iif;
 	rt->rt_oif = fl4->flowi4_oif;
 	rt->rt_mark = fl4->flowi4_mark;
_

^ permalink raw reply

* Re: problem of "ipv4: revert Set rt->rt_iif more sanely on output routes."
From: OGAWA Hirofumi @ 2011-04-07  8:29 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <877hb6sf43.fsf@devron.myhome.or.jp>

OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> writes:

> David Miller <davem@davemloft.net> writes:
>
>> So fix is something like:
>>
>> 1) Add "int rt_route_iif;" to struct rtable
>>
>> 2) For input routes, always set rt_route_iif to same value as rt_iif
>>
>> 3) For output routes, always set rt_route_iif to zero.  Set rt_iif
>>    as it is done currently.
>>
>> 4) Change rt_is_{output,input}_route() to test rt_route_iif
>>
>> This should fix the bug and not introduce new regressions.
>>
>> Can you write and test such a patch with your test case?
>
> Ok. I'll try, but I'm not sure I understand the above correctly. Well,
> I'll send the patch after testing.

This patch seems to work for avahi-daemon without any warning.

BTW, the above meant change from (there was before) "fl.iif" to
"rt_route_iif"? If so, this patch is not enough. I'm not sure

+	rth->rt_route_iif = 0;
+	rth->rt_iif	= oldflp4->flowi4_oif ? : dev_out->ifindex;

is correct one or not. Please review.

Thanks.
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply

* Agent Needed !!!
From: Mr. Chia-Juch Chang @ 2011-04-07  8:21 UTC (permalink / raw)


China Steel Corporation (CSC).
HEAD OFFICE 1 Chung-Kang Road,
Siaogang District, Kaohsiung
81233, Taiwan, R.O.C.
REF:CSC/REP/887

i. Introduction.
My Name is Chia-Juch Chang. I am the Chief Executive Officer of China Steel
Corporation (CSC). We need a reputable company/firm to serve as our payment
collection agent in North America, Europe, Asia. You shall earn 10% of every
payment issued to you on behalf of China Steel Corporation.

ii. Requirement (Contact Information):
1. Full Names:
2. Company Name:
3. Full Contact Address:
4. Tel and Fax Numbers:

If interested, please email us immediately at ChinaSteelCorporation@email.com

Contact Person: Ethan Downing
Regional Manager
Tel: +886-7-802-1111
Email: ChinaSteelCorporation@email.com
Website: www.csc.com.tw

----------------------------------------------------------------
This message was sent using IMP, the Internet Messaging Program.



^ permalink raw reply

* Re: [PATCH net-next] cxgb4: don't hold RTNL during ethtool phys_id
From: Dimitris Michailidis @ 2011-04-07  8:18 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Casey Leedom, Ben Hutchings, David Miller, netdev
In-Reply-To: <20110406173308.4737e9d4@nehalam>

Stephen Hemminger wrote:
> On Wed, 6 Apr 2011 17:20:29 -0700
> Casey Leedom <leedom@chelsio.com> wrote:
> 
>> | From: Stephen Hemminger <shemminger@linux-foundation.org>
>> | Date: Wednesday, April 06, 2011 05:09 pm
>> | 
>> | The Chelsio cxgb4 drivers implement blinking in a unique way by
>> | waiting on the mailbox. This patch cleans it up slightly by no longer
>> | holding the system wide network configuration lock during the process.
>> | 
>> | The patch also uses correct semantics for the time argument
>> | which is supposed to be in seconds; and zero is supposed
>> | to signify infinite blinking.
>> | 
>> | This is still a bad firmware interface design for this
>> | since it means the board is basically hung while doing the blink.
>> | But fixing it correctly would require hardware and firmware
>> | documentation. With that information the device could be converted
>> | to the new set_phys_id.
>> | 
>> | Compile tested only.
>> | 
>> | Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>>
>>   Are you assuming that the firmware won't respond with a command completion 
>> until the LED blinking is complete?  If so, that's a bad assumption.  The 
>> firmware runs as an asynchronous real-time OS.  The LED blinking simply becomes 
>> a thread of activity within the OS and the command completes immediately.
>>
>> Casey
> 
> Then how is LED blinking stopped?

You can pass 0 as blinks to cancel your request, which may or may not cancel 
the LED blinking depending on what other drivers have concurrent blinking 
requests in progress.  But you can't pass UINT_MAX as the patch does.  I'll 
fix it up to use the new ethtool interface this week.

^ permalink raw reply

* Re: [PATCH net-next 2/5] be2net: use common method to check for sriov function type
From: Ben Hutchings @ 2011-04-07  8:14 UTC (permalink / raw)
  To: Ajit Khaparde; +Cc: netdev
In-Reply-To: <20110407040801.GA4199@akhaparde-VBox>

On Wed, 2011-04-06 at 23:08 -0500, Ajit Khaparde wrote:
> Lancer and BE can both use SLI_INTF_REG to check a VF or a PF.
[...]

This seems pretty unreliable (both in the previous and the current
version).  You cannot rely on the whole of PCI config space being mapped
to a VM guest.  KVM certainly didn't do this when I used PCI pass-
through.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: [PATCH 07/19] timberdale: mfd_cell is now implicitly available to drivers
From: Felipe Balbi @ 2011-04-07  8:09 UTC (permalink / raw)
  To: Greg KH
  Cc: Felipe Balbi, Samuel Ortiz, Grant Likely, Andres Salomon,
	linux-kernel, Mark Brown, khali, ben-linux, Peter Korsgaard,
	Mauro Carvalho Chehab, David Brownell, linux-i2c, linux-media,
	netdev, spi-devel-general, Mocean Laboratories
In-Reply-To: <20110406220900.GA16117@suse.de>

Hi,

On Wed, Apr 06, 2011 at 03:09:00PM -0700, Greg KH wrote:
> On Wed, Apr 06, 2011 at 09:59:02PM +0300, Felipe Balbi wrote:
> > Hi,
> > 
> > On Wed, Apr 06, 2011 at 08:47:34PM +0200, Samuel Ortiz wrote:
> > > > > > What is a "MFD cell pointer" and why is it needed in struct device?
> > > > > An MFD cell is an MFD instantiated device.
> > > > > MFD (Multi Function Device) drivers instantiate platform devices. Those
> > > > > devices drivers sometimes need a platform data pointer, sometimes an MFD
> > > > > specific pointer, and sometimes both. Also, some of those drivers have been
> > > > > implemented as MFD sub drivers, while others know nothing about MFD and just
> > > > > expect a plain platform_data pointer.
> > > > 
> > > > That sounds like a bug in those drivers, why not fix them to properly
> > > > pass in the correct pointer?
> > > Because they're drivers for generic IPs, not MFD ones. By forcing them to use
> > > MFD specific structure and APIs, we make it more difficult for platform code
> > > to instantiate them.
> > 
> > I agree. What I do on those cases is to have a simple platform_device
> > for the core IP driver and use platform_device_id tables to do runtime
> > checks of the small differences. If one platform X doesn't use a
> > platform_bus, it uses e.g. PCI, then you make a PCI "bridge" which
> > allocates a platform_device with the correct name and adds that to the
> > driver model.
> > 
> > See [1] (for the core driver) and [2] (for a PCI bridge driver) for an
> > example of what I'm talking about.
> 
> Yes, thanks for providing a real example, this is the best way to handle
> this.

no problem.

ps: that's the driver for the USB3 controller which will come on OMAP5.
Driver being validated on a pre-silicon platform right now :-D In a few
weeks I'll send the driver for integration.

-- 
balbi

^ permalink raw reply

* Re: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel
From: Eric Dumazet @ 2011-04-07  8:07 UTC (permalink / raw)
  To: Wei Gu; +Cc: netdev, Alexander Duyck, Jeff Kirsher
In-Reply-To: <D12839161ADD3A4B8DA63D1A134D084026E48B9E82@ESGSCCMS0001.eapac.ericsson.se>

Le jeudi 07 avril 2011 à 15:22 +0800, Wei Gu a écrit :
> Hi guys,
> As I talked with Eric, that I get a very low performance on Linux 2.6.38 kernel with intel ixgbe-3.2.10 driver.
> I test different rx buff size on the Intel 10G NIC, by setting ethtool -G rx 4096.
> I get the lowest performance(~50Kpps Rx&Tx) by setting the rx==4096.
> Once I decrease the Rx to 512 (default) then I can get Max 250Kpps Rx&Tx on 1 NIC.
> 
> I was running this test with HP DL580 4 Sock CPUs, and full memory configuration.
> modprobe ixgbe RSS=8,8,8,8,8,8,8,8 FdirMode=0,0,0,0,0,0,0,0 Node=0,0,1,1,2,2,3,3
> Numactrl --hardware
> available: 4 nodes (0-3)
> node 0 cpus: 0 1 2 3 4 5 6 7 32 33 34 35 36 37 38 39
> node 0 size: 65525 MB
> node 0 free: 63053 MB
> node 1 cpus: 8 9 10 11 12 13 14 15 40 41 42 43 44 45 46 47
> node 1 size: 65536 MB
> node 1 free: 63388 MB
> node 2 cpus: 16 17 18 19 20 21 22 23 48 49 50 51 52 53 54 55
> node 2 size: 65536 MB
> node 2 free: 63344 MB
> node 3 cpus: 24 25 26 27 28 29 30 31 56 57 58 59 60 61 62 63
> node 3 size: 65535 MB
> node 3 free: 63376 MB
> 
> Then I binding the eth10's rx and tx's IRQs to core "24 25 26 27 28 29 30 31", one by one, which means 1 rx and 1 tx was share 1 core.
> 
> 
> I did the same test on 2.6.32 kernel, I can get >2.5M tx&rx with the same setup on RHEL6(2.6.32) Linux. But never reach 10.000.000 rx&tx on a single NIC:)
> 
> I also test the 2.6.38 shipped intel ixgbe driver It has the same problem.
> 
> This is a perf record with linux shipped ixgbe driver, looks it has a very high irq/s rate. And the softirq was busy on alloc_iova
> 
> 
> PerfTop:  512417 irqs/sec  kernel:91.3%  exact:  0.0% [1000Hz cpu-clock-msecs],  (all, 64 CPUs)
> ------------------------------------------------------------------------------------------------------------------------------------------------------
> -      0.82%     ksoftirqd/24  [kernel.kallsyms]          [k] _raw_spin_unlock_irqrestore
> \u2592   - _raw_spin_unlock_irqrestore
> \u2592      - 44.27% alloc_iova
> \u2592           intel_alloc_iova
> \u2592           __intel_map_single
> \u2592           intel_map_page
> \u2592         - ixgbe_init_interrupt_scheme
> \u2592            - 59.97% ixgbe_alloc_rx_buffers
> \u2592                 ixgbe_clean_rx_irq
> \u2592                 0xffffffffa033a5
> \u2592                 net_rx_action
> \u2592                 __do_softirq
> \u2592               + call_softirq
> \u2592            - 40.03% ixgbe_change_mtu
> \u2592                 ixgbe_change_mtu
> \u2592                 dev_hard_start_xmit
> \u2592                 sch_direct_xmit
> \u2592                 dev_queue_xmit
> \u2592                 vlan_dev_hard_start_xmit
> \u2592                 hook_func
> \u2592                 nf_iterate
> \u2592                nf_hook_slow
> \u2592                 NF_HOOK.clone.1
> \u2592                 ip_rcv
> \u2592                 __netif_receive_skb
> \u2592                 __netif_receive_skb
> \u2592                 netif_receive_skb
> \u2592                 napi_skb_finish
> \u2592                 napi_gro_receive
> \u2592                 ixgbe_clean_rx_irq
> \u2592                 0xffffffffa033a5
> \u2592                 net_rx_action
> \u2592                 __do_softirq
> \u2592               + call_softirq
> \u2592      + 35.85% find_iova
> \u2592      + 19.44% add_unmap
> 
> 
> Thanks
> WeiGu

What about using the driver as provided in 2.6.38 ?

No custom module parameter, only play with irq affinities

Say you have 64 queues but want only 8 cpus (24 -> 31) receiving traffic

for i in `seq 0 7`
do
 echo 01000000 >/proc/irq/*/eth1-fp-$i/../smp_affinity
done

for i in `seq 8 15`
do
 echo 02000000 >/proc/irq/*/eth1-fp-$i/../smp_affinity
done

...

for i in `seq 56 63`
do
 echo 80000000 >/proc/irq/*/eth1-fp-$i/../smp_affinity
done


Why is ixgbe_change_mtu() seen on your profile ?
It's damn expensive, since it must call ixgbe_reinit_locked()

Are you using a custom code in kernel ?




^ permalink raw reply

* Re: [PATCH 07/19] timberdale: mfd_cell is now implicitly available to drivers
From: Grant Likely @ 2011-04-07  8:04 UTC (permalink / raw)
  To: Greg KH
  Cc: Andres Salomon, Samuel Ortiz, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	Mark Brown, khali-PUYAD+kWke1g9hUCZPvPmw,
	ben-linux-elnMNo+KYs3YtjvyW6yDsg, Peter Korsgaard,
	Mauro Carvalho Chehab, David Brownell,
	linux-i2c-u79uwXL29TY76Z2rM5mHXA,
	linux-media-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	spi-devel-general-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	Mocean Laboratories
In-Reply-To: <20110406183854.GA10058-l3A5Bk7waGM@public.gmane.org>

On Wed, Apr 06, 2011 at 11:38:54AM -0700, Greg KH wrote:
> On Wed, Apr 06, 2011 at 11:25:57AM -0700, Andres Salomon wrote:
> > > > We've been faced with the problem of being able to pass both MFD
> > > > related data and a platform_data pointer to some of those drivers.
> > > > Squeezing the MFD bits in the sub driver platform_data pointer
> > > > doesn't work for drivers that know nothing about MFDs. It also adds
> > > > an additional dependency on the MFD API to all MFD sub drivers.
> > > > That prevents any of those drivers to eventually be used as plain
> > > > platform device drivers.
> > > 
> > > Then they shouldn't be "plain" platform drivers, that should only be
> > > reserved for drivers that are the "lowest" type.  Just make them MFD
> > > devices and go from there.
> > 
> > 
> > The problem is of mixing "plain" platform devices and MFD devices.
> 
> Then don't do that.

From my perspective, MFD devices are little more than a bag of
platform_devices, with the MFD layer providing infrastructure for
managing it.  It isn't that there are 'plain' platform device and
'mfd' devices.  There are only platform_devices, but some of the
drivers use additional data stored in a struct mfd.

Personally, I'm not thrilled with the approach of using struct mfd, or
more specifically making it available to drivers, but on the ugly
scale it isn't very high.

However, the changes on how struct mfd is passed that were merged in
2.6.39 were actively dangerous and are going to be reverted.  Yet
a method is still needed to pass the struct mfd in a safe way.  I
don't have a problem with adding the mfd pointer to struct
platform_device, even if it should just be a stop gap to something
better.

Independently, I have been experimenting with typesafe methods for
attaching data to devices which may very well be the long term
approach, but for the short term I see no problem with adding the mfd
pointer, particularly because it is by far safer than any of the other
immediately available options.

g.

^ permalink raw reply

* [PATCH] iproute2: parse flag XFRM_POLICY_ICMP
From: Ulrich Weber @ 2011-04-07  7:37 UTC (permalink / raw)
  To: shemminger; +Cc: netdev

parse flag XFRM_POLICY_ICMP

Signed-off-by: Ulrich Weber <uweber@astaro.com>
---
 ip/ipxfrm.c      |    1 +
 ip/xfrm_policy.c |    4 +++-
 2 files changed, 4 insertions(+), 1 deletions(-)

diff --git a/ip/ipxfrm.c b/ip/ipxfrm.c
index a276c0b..7a9a681 100644
--- a/ip/ipxfrm.c
+++ b/ip/ipxfrm.c
@@ -980,6 +980,7 @@ void xfrm_policy_info_print(struct xfrm_userpolicy_info *xpinfo,
 
 		fprintf(fp, "flag ");
 		XFRM_FLAG_PRINT(fp, flags, XFRM_POLICY_LOCALOK, "localok");
+		XFRM_FLAG_PRINT(fp, flags, XFRM_POLICY_ICMP, "icmp");
 		if (flags)
 			fprintf(fp, "%x", flags);
 	}
diff --git a/ip/xfrm_policy.c b/ip/xfrm_policy.c
index 9ef5c09..7827f91 100644
--- a/ip/xfrm_policy.c
+++ b/ip/xfrm_policy.c
@@ -77,7 +77,7 @@ static void usage(void)
 	//fprintf(stderr, "PRIORITY - priority value(default=0)\n");
 
 	fprintf(stderr, "FLAG-LIST := [ FLAG-LIST ] FLAG\n");
-	fprintf(stderr, "FLAG := [ localok ]\n");
+	fprintf(stderr, "FLAG := [ localok | icmp ]\n");
 
 	fprintf(stderr, "LIMIT-LIST := [ LIMIT-LIST ] | [ limit LIMIT ]\n");
 	fprintf(stderr, "LIMIT := [ [time-soft|time-hard|time-use-soft|time-use-hard] SECONDS ] |\n");
@@ -156,6 +156,8 @@ static int xfrm_policy_flag_parse(__u8 *flags, int *argcp, char ***argvp)
 		while (1) {
 			if (strcmp(*argv, "localok") == 0)
 				*flags |= XFRM_POLICY_LOCALOK;
+			else if (strcmp(*argv, "icmp") == 0)
+				*flags |= XFRM_POLICY_ICMP;
 			else {
 				PREV_ARG(); /* back track */
 				break;
-- 
1.7.1


^ permalink raw reply related

* [PatchV3 3/3] usb: plusb: Add debug to reset function
From: Simon Wood @ 2011-04-07  7:40 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: Sergei Shtylyov, davem, linux-usb, netdev, linux-kernel,
	Simon Wood
In-Reply-To: <1302162015-22504-1-git-send-email-simon@mungewell.org>

From: simon <simon@ubuntu.(none)>

This patch adds some debug to the reset function to print out the
reason why it fails.

Signed-off-by: Simon Wood <simon@mungewell.org>
---
 drivers/net/usb/plusb.c |    6 +++++-
 1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/drivers/net/usb/plusb.c b/drivers/net/usb/plusb.c
index f46aa07..217aec8 100644
--- a/drivers/net/usb/plusb.c
+++ b/drivers/net/usb/plusb.c
@@ -94,11 +94,15 @@ pl_set_QuickLink_features(struct usbnet *dev, int val)
 
 static int pl_reset(struct usbnet *dev)
 {
+	int status;
+
 	/* some units seem to need this reset, others reject it utterly.
 	 * FIXME be more like "naplink" or windows drivers.
 	 */
-	(void) pl_set_QuickLink_features(dev,
+	status = pl_set_QuickLink_features(dev,
 		PL_S_EN|PL_RESET_OUT|PL_RESET_IN|PL_PEER_E);
+	if (status != 0 && netif_msg_probe(dev))
+		netif_dbg(dev, link, dev->net, "pl_reset --> %d\n", status);
 	return 0;
 }
 
-- 
1.7.4.1

^ permalink raw reply related

* [PatchV3 2/3] usb: plusb: Add support for PL-25A1
From: Simon Wood @ 2011-04-07  7:40 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: Sergei Shtylyov, davem, linux-usb, netdev, linux-kernel,
	Simon Wood
In-Reply-To: <1302162015-22504-1-git-send-email-simon@mungewell.org>

From: simon <simon@ubuntu.(none)>

This patch adds support for the PL-25A1 by adding the appropriate
USB ID's. This chip is used in the Belkin 'Windows Easy Transfer'
Cables.

Signed-off-by: Simon Wood <simon@mungewell.org>
---
 drivers/net/usb/Kconfig |    2 +-
 drivers/net/usb/plusb.c |   22 ++++++++++++++++++++--
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/drivers/net/usb/Kconfig b/drivers/net/usb/Kconfig
index 3ec22c3..9d4f911 100644
--- a/drivers/net/usb/Kconfig
+++ b/drivers/net/usb/Kconfig
@@ -258,7 +258,7 @@ config USB_NET_NET1080
 	  optionally with LEDs that indicate traffic
 
 config USB_NET_PLUSB
-	tristate "Prolific PL-2301/2302 based cables"
+	tristate "Prolific PL-2301/2302/25A1 based cables"
 	# if the handshake/init/reset problems, from original 'plusb',
 	# are ever resolved ... then remove "experimental"
 	depends on USB_USBNET && EXPERIMENTAL
diff --git a/drivers/net/usb/plusb.c b/drivers/net/usb/plusb.c
index 2fe1bb5..f46aa07 100644
--- a/drivers/net/usb/plusb.c
+++ b/drivers/net/usb/plusb.c
@@ -45,6 +45,14 @@
  * seems to get wedged under load.  Prolific docs are weak, and
  * don't identify differences between PL2301 and PL2302, much less
  * anything to explain the different PL2302 versions observed.
+ *
+ * NOTE:  pl2501 has several modes, including pl2301 and pl2302
+ * compatibility.   Some docs suggest the difference between 2301
+ * and 2302 is only to make MS-Windows use a different driver...
+ *
+ * pl25a1 glue based on patch from Tony Gibbs.  Prolific "docs" on
+ * this chip are as usual incomplete about what control messages
+ * are supported.
  */
 
 /*
@@ -95,7 +103,7 @@ static int pl_reset(struct usbnet *dev)
 }
 
 static const struct driver_info	prolific_info = {
-	.description =	"Prolific PL-2301/PL-2302",
+	.description =	"Prolific PL-2301/PL-2302/PL-25A1",
 	.flags =	FLAG_POINTTOPOINT | FLAG_NO_SETINT,
 		/* some PL-2302 versions seem to fail usb_set_interface() */
 	.reset =	pl_reset,
@@ -111,6 +119,7 @@ static const struct driver_info	prolific_info = {
 
 static const struct usb_device_id	products [] = {
 
+/* full speed cables */
 {
 	USB_DEVICE(0x067b, 0x0000),	// PL-2301
 	.driver_info =	(unsigned long) &prolific_info,
@@ -119,6 +128,15 @@ static const struct usb_device_id	products [] = {
 	.driver_info =	(unsigned long) &prolific_info,
 },
 
+/* high speed cables */
+{
+	USB_DEVICE(0x067b, 0x25a1),     /* PL-25A1, no eeprom */
+	.driver_info =  (unsigned long) &prolific_info,
+}, {
+	USB_DEVICE(0x050d, 0x258a),     /* Belkin F5U258/F5U279 (PL-25A1) */
+	.driver_info =  (unsigned long) &prolific_info,
+},
+
 	{ },		// END
 };
 MODULE_DEVICE_TABLE(usb, products);
@@ -145,5 +163,5 @@ static void __exit plusb_exit(void)
 module_exit(plusb_exit);
 
 MODULE_AUTHOR("David Brownell");
-MODULE_DESCRIPTION("Prolific PL-2301/2302 USB Host to Host Link Driver");
+MODULE_DESCRIPTION("Prolific PL-2301/2302/25A1 USB Host to Host Link Driver");
 MODULE_LICENSE("GPL");
-- 
1.7.4.1

^ permalink raw reply related

* [PatchV3 1/3] usb: plusb: Whitespace
From: Simon Wood @ 2011-04-07  7:40 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: Sergei Shtylyov, davem, linux-usb, netdev, linux-kernel,
	Simon Wood
In-Reply-To: <1301456667-1648-1-git-send-email-simon@mungewell.org>

From: simon <simon@ubuntu.(none)>

This patch cleans up a couple of instances of incorrect whitespace

Signed-off-by: Simon Wood <simon@mungewell.org>
---
 drivers/net/usb/plusb.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/usb/plusb.c b/drivers/net/usb/plusb.c
index 823c537..2fe1bb5 100644
--- a/drivers/net/usb/plusb.c
+++ b/drivers/net/usb/plusb.c
@@ -134,13 +134,13 @@ static struct usb_driver plusb_driver = {
 
 static int __init plusb_init(void)
 {
- 	return usb_register(&plusb_driver);
+	return usb_register(&plusb_driver);
 }
 module_init(plusb_init);
 
 static void __exit plusb_exit(void)
 {
- 	usb_deregister(&plusb_driver);
+	usb_deregister(&plusb_driver);
 }
 module_exit(plusb_exit);
 
-- 
1.7.4.1

^ permalink raw reply related

* Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel
From: Wei Gu @ 2011-04-07  7:22 UTC (permalink / raw)
  To: netdev, Alexander Duyck, Jeff Kirsher
In-Reply-To: <1302157012.2701.73.camel@edumazet-laptop>

Hi guys,
As I talked with Eric, that I get a very low performance on Linux 2.6.38 kernel with intel ixgbe-3.2.10 driver.
I test different rx buff size on the Intel 10G NIC, by setting ethtool -G rx 4096.
I get the lowest performance(~50Kpps Rx&Tx) by setting the rx==4096.
Once I decrease the Rx to 512 (default) then I can get Max 250Kpps Rx&Tx on 1 NIC.

I was running this test with HP DL580 4 Sock CPUs, and full memory configuration.
modprobe ixgbe RSS=8,8,8,8,8,8,8,8 FdirMode=0,0,0,0,0,0,0,0 Node=0,0,1,1,2,2,3,3
Numactrl --hardware
available: 4 nodes (0-3)
node 0 cpus: 0 1 2 3 4 5 6 7 32 33 34 35 36 37 38 39
node 0 size: 65525 MB
node 0 free: 63053 MB
node 1 cpus: 8 9 10 11 12 13 14 15 40 41 42 43 44 45 46 47
node 1 size: 65536 MB
node 1 free: 63388 MB
node 2 cpus: 16 17 18 19 20 21 22 23 48 49 50 51 52 53 54 55
node 2 size: 65536 MB
node 2 free: 63344 MB
node 3 cpus: 24 25 26 27 28 29 30 31 56 57 58 59 60 61 62 63
node 3 size: 65535 MB
node 3 free: 63376 MB

Then I binding the eth10's rx and tx's IRQs to core "24 25 26 27 28 29 30 31", one by one, which means 1 rx and 1 tx was share 1 core.


I did the same test on 2.6.32 kernel, I can get >2.5M tx&rx with the same setup on RHEL6(2.6.32) Linux. But never reach 10.000.000 rx&tx on a single NIC:)

I also test the 2.6.38 shipped intel ixgbe driver It has the same problem.

This is a perf record with linux shipped ixgbe driver, looks it has a very high irq/s rate. And the softirq was busy on alloc_iova


PerfTop:  512417 irqs/sec  kernel:91.3%  exact:  0.0% [1000Hz cpu-clock-msecs],  (all, 64 CPUs)
------------------------------------------------------------------------------------------------------------------------------------------------------
-      0.82%     ksoftirqd/24  [kernel.kallsyms]          [k] _raw_spin_unlock_irqrestore
\u2592   - _raw_spin_unlock_irqrestore
\u2592      - 44.27% alloc_iova
\u2592           intel_alloc_iova
\u2592           __intel_map_single
\u2592           intel_map_page
\u2592         - ixgbe_init_interrupt_scheme
\u2592            - 59.97% ixgbe_alloc_rx_buffers
\u2592                 ixgbe_clean_rx_irq
\u2592                 0xffffffffa033a5
\u2592                 net_rx_action
\u2592                 __do_softirq
\u2592               + call_softirq
\u2592            - 40.03% ixgbe_change_mtu
\u2592                 ixgbe_change_mtu
\u2592                 dev_hard_start_xmit
\u2592                 sch_direct_xmit
\u2592                 dev_queue_xmit
\u2592                 vlan_dev_hard_start_xmit
\u2592                 hook_func
\u2592                 nf_iterate
\u2592                nf_hook_slow
\u2592                 NF_HOOK.clone.1
\u2592                 ip_rcv
\u2592                 __netif_receive_skb
\u2592                 __netif_receive_skb
\u2592                 netif_receive_skb
\u2592                 napi_skb_finish
\u2592                 napi_gro_receive
\u2592                 ixgbe_clean_rx_irq
\u2592                 0xffffffffa033a5
\u2592                 net_rx_action
\u2592                 __do_softirq
\u2592               + call_softirq
\u2592      + 35.85% find_iova
\u2592      + 19.44% add_unmap


Thanks
WeiGu


-----Original Message-----
From: Eric Dumazet [mailto:eric.dumazet@gmail.com]
Sent: Thursday, April 07, 2011 2:17 PM
To: Wei Gu
Cc: netdev; Alexander Duyck; Jeff Kirsher
Subject: RE: Question on "net: allocate skbs on local node"

Le jeudi 07 avril 2011 à 07:16 +0200, Eric Dumazet a écrit :
> Le jeudi 07 avril 2011 à 06:58 +0200, Eric Dumazet a écrit :
> > Le jeudi 07 avril 2011 à 10:16 +0800, Wei Gu a écrit :
> > > Hi Eric,
> > > Testing with ixgbe Linux 2.6.38 driver:
> > > We have a little better thruput figure with this driver, but it
> > > looks not scalling at all, I always stressed one CPU core/24.
> > > And when look the perf report for ksoftirqd/24, the most cost
> > > function is still "_raw_spin_unlock_irqrestore" and the IRQ/s is
> > > huge, it's somehow conflicts with design of NAPI. On linux 2.6.32
> > > while the CPU was stressed the IRQ will decreased while the NAPI
> > > will running much on the polling mode. I don't know why on 2.6.38
> > > the IRQ was keep increasing.
> >
> >
> > CC netdev and Intel guys, since they said it should not happen (TM)
> >
> > IF you dont use DCA (make sure ioatdma module is not loaded), how
> > comes
> > alloc_iova() is called at all ?
> >
> > IF you use DCA, how comes its called, since the same CPU serves a
> > given interrupt ?
> >
> >
>
> But then, maybe you forgot to cpu affine IRQS ?
>
> High performance routing setup is tricky, since you probably want to
> disable many features that are ON by default : Most machines act as a
> end host.
>
>

Please dont send me anymore private mails, I do think the issue you have is on a setup, not a particular optimization done in network stack.


Copy of your private mail :

> On 2.6.38, I got a lot of "rx_missed_errors" on NIC, which means the
> rx loop was really busy to get packet from the receiving ring. Usually
> in this case it shouldn't exit the softirqs and keep polling in order
> to decrease the initrs.
>
> On 2.6.32, I can Rx and Tx 2.3Mpps with no packet lost(error on NIC),
> but on 2.6.38 I can only reach 50kpps with a lot of
> "rx_missed_errors", and all the binding cpu core was 100% in SI. I
> don't think there was any optimizations on it.

I hope you understand there is something wrong with your setup ?

50.000 pps on a 64 cpu machine is a bad joke.

We can reach +10.000.000 on a 16 cpus one.




^ permalink raw reply

* Re: problem of "ipv4: revert Set rt->rt_iif more sanely on output routes."
From: OGAWA Hirofumi @ 2011-04-07  7:19 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20110406.224244.104071339.davem@davemloft.net>

David Miller <davem@davemloft.net> writes:

> So fix is something like:
>
> 1) Add "int rt_route_iif;" to struct rtable
>
> 2) For input routes, always set rt_route_iif to same value as rt_iif
>
> 3) For output routes, always set rt_route_iif to zero.  Set rt_iif
>    as it is done currently.
>
> 4) Change rt_is_{output,input}_route() to test rt_route_iif
>
> This should fix the bug and not introduce new regressions.
>
> Can you write and test such a patch with your test case?

Ok. I'll try, but I'm not sure I understand the above correctly. Well,
I'll send the patch after testing.

Thanks.
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply

* Re: [Patch] iwlwifi: remove obsoleted module alias and parameters
From: Cong Wang @ 2011-04-07  7:17 UTC (permalink / raw)
  To: Guy, Wey-Yi
  Cc: linux-wireless@vger.kernel.org, netdev@vger.kernel.org,
	Intel Linux Wireless, Berg, Johannes, John W. Linville,
	Stanislaw Gruszka, Venkataraman, Meenakshi, Larry Finger
In-Reply-To: <1302098755.14995.112.camel@wwguy-huron>

于 2011年04月06日 22:05, Guy, Wey-Yi 写道:
> On Wed, 2011-04-06 at 02:49 -0700, Amerigo Wang wrote:
>> As scheduled in Documentation/feature-removal-schedule.txt,
>> remove "*50", "disable_hw_scan" module parameters and MODULE_ALIAS("iwl4965").
>>
>> Cc: Intel Linux Wireless<ilw@linux.intel.com>
>> Cc: Johannes Berg<johannes.berg@intel.com>
>> Cc: "John W. Linville"<linville@tuxdriver.com>
>> Cc: Wey-Yi Guy<wey-yi.w.guy@intel.com>
>> Cc: Stanislaw Gruszka<sgruszka@redhat.com>
>> Cc: Meenakshi Venkataraman<meenakshi.venkataraman@intel.com>
>> Cc: Larry Finger<Larry.Finger@lwfinger.net>
>> Signed-off-by: WANG Cong<amwang@redhat.com>
>>
>> ---
> what tree you are base on?
> please check commit#7eaa6a5e964f1ab02d849bda36950c0d30be8ce2 in
> wireless-next-2.6

The latest Linus tree, sorry that I didn't know wireless has its own tree,
I just checked wireless-next-2.6 but don't find any commit matched
that commit ID, but I assume you meant you already sent a same patch?
If yes, feel free to discard mine.

Thanks.

^ permalink raw reply

* Re: [Patch] isdn: remove deprecated ISDN_CAPI_CAPIFS
From: Cong Wang @ 2011-04-07  7:06 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, isdn, jan.kiszka
In-Reply-To: <20110406.131216.15250040.davem@davemloft.net>

于 2011年04月07日 04:12, David Miller 写道:
> From: Amerigo Wang<amwang@redhat.com>
> Date: Wed,  6 Apr 2011 17:05:39 +0800
>
>> Cc: Jan Kiszka<jan.kiszka@web.de>
>> Cc: Karsten Keil<isdn@linux-pingi.de>
>> Signed-off-by: WANG Cong<amwang@redhat.com>
>
> capi.c still includes capifs.h, which you are deleting here.
>
> How did you build test this?

Oops! I definitely used a wrong .config.. :-/

Thanks for fixing it, Jan!

^ permalink raw reply

* Re: [PATCH] xen: drop anti-dependency on X86_VISWS
From: Ian Campbell @ 2011-04-07  6:58 UTC (permalink / raw)
  To: David Miller
  Cc: eric.dumazet@gmail.com, mirq-linux@rere.qmqm.pl,
	netdev@vger.kernel.org, Jeremy Fitzhardinge,
	konrad.wilk@oracle.com, xen-devel@lists.xensource.com,
	virtualization@lists.linux-foundation.org,
	randy.dunlap@oracle.com, pazke@donpac.ru,
	linux-visws-devel@lists.sf.net, tglx@linutronix.de,
	mingo@redhat.com, hpa@zytor.com
In-Reply-To: <20110406.144515.235693855.davem@davemloft.net>

On Wed, 2011-04-06 at 22:45 +0100, David Miller wrote:
> From: Ian Campbell <Ian.Campbell@eu.citrix.com>
> Date: Mon, 4 Apr 2011 10:55:55 +0100
> 
> > You mean the "!X86_VISWS" I presume? It doesn't make sense to me either.
> 
> No, I think 32-bit x86 allmodconfig elides XEN because of its X86_TSC dependency.

TSC is a real dependency of the Xen interfaces.

> And, well, you could type "make allmodconfig" on your tree and see for
> yourself instead of asking me :-)

True.

X86_TSC not being enabled appears to be due to CONFIG_ELAN being enabled
which causes the processor selection option (which defaults to M686,
which is a sane choice and enables TSC etc) to be gated at the top level
in arch/x86/Kconfig.cpu. Disabling the ELAN option then leaves X86_TSC
gated on !CONFIG_NUMAQ but removing that results in a generally useful
looking config.

It's a shame that these sorts of minority options cause allmodconfig to
omit support for more interesting configurations, such as modern
processors. Other than negating the semantics of such options I'm not
really sure what can be done about it though. On the other hand
compiling all the unusual stuff in an allmodconfig is probably a
positive thing.

I'm not sure why ELAN belongs in the EXTENDED_PLATFORM option space
rather than in the CPU choice option, since its only impact seems to be
on -march, MODULE_PROC_FAMILY and some cpufreq drivers which doesn't
sound like an extended platform to me, but it does appear to be
deliberate (see 9e111f3e167a "x86: move ELAN to the
NON_STANDARD_PLATFORM section", that was the old name for
EXTENDED_PLATFORM).

Hrm, what about the following? (doesn't actually make a difference to
Xen since allmodconfig chooses HIGHMEM4G instead of HIGHMEM64G in the
!NUMAQ case but I stopped worrying about that several paragraphs ago)

8<--------

x86: invert X86_EXTENDED_PLATFORM to X86_STANDARD_PLATFORM

Having the =y choice be the more "standard" configuration causes
all*config to provide greater coverage of usual configurations.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cc6c53a..6d8a404 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -299,15 +299,15 @@ config X86_BIGSMP
 	  This option is needed for the systems that have more than 8 CPUs
 
 if X86_32
-config X86_EXTENDED_PLATFORM
-	bool "Support for extended (non-PC) x86 platforms"
+config X86_STANDARD_PLATFORM
+	bool "Restrict support to standard (PC) x86 platforms"
 	default y
 	---help---
-	  If you disable this option then the kernel will only support
+	  If you enable this option then the kernel will only support
 	  standard PC platforms. (which covers the vast majority of
 	  systems out there.)
 
-	  If you enable this option then you'll be able to select support
+	  If you disable this option then you'll be able to select support
 	  for the following (non-PC) 32 bit x86 platforms:
 		AMD Elan
 		NUMAQ (IBM/Sequent)
@@ -318,25 +318,25 @@ config X86_EXTENDED_PLATFORM
 		Moorestown MID devices
 
 	  If you have one of these systems, or if you want to build a
-	  generic distribution kernel, say Y here - otherwise say N.
+	  generic distribution kernel, say N here - otherwise say Y.
 endif
 
 if X86_64
-config X86_EXTENDED_PLATFORM
-	bool "Support for extended (non-PC) x86 platforms"
+config X86_STANDARD_PLATFORM
+	bool "Restrict support to standard (PC) x86 platforms"
 	default y
 	---help---
-	  If you disable this option then the kernel will only support
+	  If you enable this option then the kernel will only support
 	  standard PC platforms. (which covers the vast majority of
 	  systems out there.)
 
-	  If you enable this option then you'll be able to select support
+	  If you disable this option then you'll be able to select support
 	  for the following (non-PC) 64 bit x86 platforms:
 		ScaleMP vSMP
 		SGI Ultraviolet
 
 	  If you have one of these systems, or if you want to build a
-	  generic distribution kernel, say Y here - otherwise say N.
+	  generic distribution kernel, say N here - otherwise say Y.
 endif
 # This is an alphabetically sorted list of 64 bit extended platforms
 # Please maintain the alphabetic order if and when there are additions
@@ -346,7 +346,7 @@ config X86_VSMP
 	select PARAVIRT_GUEST
 	select PARAVIRT
 	depends on X86_64 && PCI
-	depends on X86_EXTENDED_PLATFORM
+	depends on !X86_STANDARD_PLATFORM
 	---help---
 	  Support for ScaleMP vSMP systems.  Say 'Y' here if this kernel is
 	  supposed to run on these EM64T-based machines.  Only choose this option
@@ -355,7 +355,7 @@ config X86_VSMP
 config X86_UV
 	bool "SGI Ultraviolet"
 	depends on X86_64
-	depends on X86_EXTENDED_PLATFORM
+	depends on !X86_STANDARD_PLATFORM
 	depends on NUMA
 	depends on X86_X2APIC
 	---help---
@@ -368,7 +368,7 @@ config X86_UV
 config X86_ELAN
 	bool "AMD Elan"
 	depends on X86_32
-	depends on X86_EXTENDED_PLATFORM
+	depends on !X86_STANDARD_PLATFORM
 	---help---
 	  Select this for an AMD Elan processor.
 
@@ -381,7 +381,7 @@ config X86_INTEL_CE
 	depends on PCI
 	depends on PCI_GODIRECT
 	depends on X86_32
-	depends on X86_EXTENDED_PLATFORM
+	depends on !X86_STANDARD_PLATFORM
 	select X86_REBOOTFIXUPS
 	select OF
 	select OF_EARLY_FLATTREE
@@ -395,7 +395,7 @@ config X86_MRST
 	depends on PCI
 	depends on PCI_GOANY
 	depends on X86_32
-	depends on X86_EXTENDED_PLATFORM
+	depends on !X86_STANDARD_PLATFORM
 	depends on X86_IO_APIC
 	select APB_TIMER
 	select I2C
@@ -413,7 +413,7 @@ config X86_MRST
 config X86_RDC321X
 	bool "RDC R-321x SoC"
 	depends on X86_32
-	depends on X86_EXTENDED_PLATFORM
+	depends on !X86_STANDARD_PLATFORM
 	select M486
 	select X86_REBOOTFIXUPS
 	---help---
@@ -424,7 +424,7 @@ config X86_RDC321X
 config X86_32_NON_STANDARD
 	bool "Support non-standard 32-bit SMP architectures"
 	depends on X86_32 && SMP
-	depends on X86_EXTENDED_PLATFORM
+	depends on !X86_STANDARD_PLATFORM
 	---help---
 	  This option compiles in the NUMAQ, Summit, bigsmp, ES7000, default
 	  subarchitectures.  It is intended for a generic binary kernel.



^ permalink raw reply related

* Re: nfs client doesn't work [was: mmotm 2011-03-31-14-48 uploaded]
From: Jiri Slaby @ 2011-04-07  6:42 UTC (permalink / raw)
  To: Myklebust, Trond
  Cc: Jiri Slaby, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	mm-commits-u79uwXL29TY76Z2rM5mHXA, ML netdev,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1302122693.16786.0.camel-SyLVLa/KEI9HwK5hSS5vWB2eb7JE58TQ@public.gmane.org>

On 04/06/2011 10:44 PM, Myklebust, Trond wrote:
> On Sat, 2011-04-02 at 10:56 +0200, Jiri Slaby wrote:
>> On 03/31/2011 11:48 PM, akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org wrote:
>> > The mm-of-the-moment snapshot 2011-03-31-14-48 has been uploaded to
>>
>> Hi, nfs client is defunct in this kernel. Tcpdump says:
>> 10:51:55.489717 IP 10.20.11.33.759945860 > 10.20.3.2.2049: 132 getattr
>> fh 0,0/24
>> 10:51:55.515927 IP 10.20.3.2.2049 > 10.20.11.33.759945860: reply ok 44
>> getattr ERROR: Operation not permitted
>> 10:51:55.515949 IP 10.20.11.33.921 > 10.20.3.2.2049: Flags [.], ack
>> 3569361440, win 115, options [nop,nop,TS val 599750 ecr 255058541],
> length 0
>> 10:52:04.130310 IP 10.20.11.33.793500292 > 10.20.3.2.2049: 76 getattr fh
>> 0,0/24
>> 10:52:04.152178 IP 10.20.3.2.2049 > 10.20.11.33.793500292: reply ok 44
>> getattr ERROR: Operation not permitted
>>
>> If I run the same mount command (mount -oro,intr host:dir mountpoint)
>> from within a virtual machine with 2.6.38.2 there, everything mounts OK.
> 
> Does the attached patch help?

No, still the operation not permitted in the tcpdump output and no mount.

thanks,
-- 
js
suse labs
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* RE: Question on "net: allocate skbs on local node"
From: Eric Dumazet @ 2011-04-07  6:16 UTC (permalink / raw)
  To: Wei Gu; +Cc: netdev, Alexander Duyck, Jeff Kirsher
In-Reply-To: <1302153412.2701.64.camel@edumazet-laptop>

Le jeudi 07 avril 2011 à 07:16 +0200, Eric Dumazet a écrit :
> Le jeudi 07 avril 2011 à 06:58 +0200, Eric Dumazet a écrit :
> > Le jeudi 07 avril 2011 à 10:16 +0800, Wei Gu a écrit :
> > > Hi Eric,
> > > Testing with ixgbe Linux 2.6.38 driver:
> > > We have a slightly better throughput figure with this driver, but it
> > > looks like it is not scaling at all; I always stressed one CPU core/24.
> > > And when I look at the perf report for ksoftirqd/24, the most costly
> > > function is still "_raw_spin_unlock_irqrestore" and the IRQ/s is huge,
> > > which somehow conflicts with the design of NAPI. On Linux 2.6.32, while
> > > the CPU was stressed, the IRQ rate decreased and NAPI ran mostly
> > > in polling mode. I don't know why on 2.6.38 the IRQ rate keeps
> > > increasing.
> > 
> > 
> > CC netdev and Intel guys, since they said it should not happen (TM)
> > 
> > IF you dont use DCA (make sure ioatdma module is not loaded), how comes
> > alloc_iova() is called at all ?
> > 
> > IF you use DCA, how comes its called, since the same CPU serves a given
> > interrupt ?
> > 
> > 
> 
> But then, maybe you forgot to set the CPU affinity of the IRQs?
> 
> A high-performance routing setup is tricky, since you probably want to
> disable many features that are ON by default: most machines act as an
> end host.
> 
> 

Please don't send me any more private mails. I do think the issue you have
is with your setup, not a particular optimization done in the network stack.


Copy of your private mail :

> On 2.6.38, I got a lot of "rx_missed_errors" on the NIC, which means the
> rx loop was really busy getting packets from the receive ring. Usually
> in this case it shouldn't exit the softirqs and should keep polling in
> order to decrease the interrupts.
> 
> On 2.6.32, I can Rx and Tx 2.3Mpps with no packet loss (errors on the NIC),
> but on 2.6.38 I can only reach 50kpps with a lot of
> "rx_missed_errors", and all the bound CPU cores were 100% in SI. I
> don't think there was any optimization on it.

I hope you understand there is something wrong with your setup ?

50.000 pps on a 64 cpu machine is a bad joke.

We can reach +10.000.000 on a 16 cpus one.




^ permalink raw reply

* Re: [PATCH net-next] net: fix skb_add_data_nocache() to calc csum correctly
From: David Miller @ 2011-04-07  6:05 UTC (permalink / raw)
  To: therbert; +Cc: yjwei, netdev
In-Reply-To: <BANLkTikspQHnjagFme0S3GdPUo-zw48zBw@mail.gmail.com>

From: Tom Herbert <therbert@google.com>
Date: Wed, 6 Apr 2011 21:50:55 -0700

> Nice catch.
> 
> Acked-by: Tom Herbert <therbert@google.com>
> 
> On Wed, Apr 6, 2011 at 9:40 PM, Wei Yongjun <yjwei@cn.fujitsu.com> wrote:
>> commit c6e1a0d12ca7b4f22c58e55a16beacfb7d3d8462
>>  (net: Allow no-cache copy from user on transmit)
>> broke the checksum calculation, which may cause some TCP packets to be
>> dropped because of an incorrect checksum. ssh does not work under
>> today's net-next-2.6 tree.
>>
>> Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>

Applied, thanks everyone.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox