Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next-2.6 v7 0/5] sctp: Patch series
From: Michio Honda @ 2011-04-26  8:35 UTC (permalink / raw)
  To: netdev; +Cc: lksctp-developers

Series of 5 patches to support auto_asconf and the other related functionalities that auto_asconf relies on. 

Cheers,
- Michio

[1/5] Add Auto-ASCONF support
[2/5] Add sysctl support for Auto-ASCONF
[3/5] Add socket option operation for Auto-ASCONF
[4/5] Add ADD/DEL ASCONF handling at the receiver
[5/5] Add ASCONF operation on the single-homed host

^ permalink raw reply

* [PATCH net-next-2.6 v7 1/5] sctp: Add Auto-ASCONF support
From: Michio Honda @ 2011-04-26  8:35 UTC (permalink / raw)
  To: netdev; +Cc: lksctp-developers

SCTP reconfigure the IP addresses in the association by using ASCONF chunks as mentioned in RFC5061.  
For example, we can start to use the newly configured IP address in the existing association.  
ASCONF operation is invoked in two ways: 
First is done by the application to call sctp_bindx() system call.  
Second is automatic operation in the SCTP stack with address events in the host computer (called auto_asconf) .  
The former is already implemented, and this patch implement the latter.  

Signed-off-by: Michio Honda <micchie@sfc.wide.ad.jp>
---
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 7e8e34c..872ffa0 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -121,6 +121,8 @@ extern int sctp_copy_local_addr_list(struct sctp_bind_addr *,
 				     int flags);
 extern struct sctp_pf *sctp_get_pf_specific(sa_family_t family);
 extern int sctp_register_pf(struct sctp_pf *, sa_family_t);
+void sctp_addr_wq_mgmt(struct sctp_sockaddr_entry *, int);
+void sctp_free_addr_wq(void);
 
 /*
  * sctp/socket.c
@@ -135,6 +137,7 @@ void sctp_sock_rfree(struct sk_buff *skb);
 void sctp_copy_sock(struct sock *newsk, struct sock *sk,
 		    struct sctp_association *asoc);
 extern struct percpu_counter sctp_sockets_allocated;
+int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *);
 
 /*
  * sctp/primitive.c
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 5c9bada..f7f6add 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -205,6 +205,11 @@ extern struct sctp_globals {
 	 * It is a list of sctp_sockaddr_entry.
 	 */
 	struct list_head local_addr_list;
+	int default_auto_asconf;
+	struct list_head addr_waitq;
+	struct timer_list addr_wq_timer;
+	struct list_head auto_asconf_splist;
+	spinlock_t addr_wq_lock;
 
 	/* Lock that protects the local_addr_list writers */
 	spinlock_t addr_list_lock;
@@ -264,6 +269,11 @@ extern struct sctp_globals {
 #define sctp_port_hashtable		(sctp_globals.port_hashtable)
 #define sctp_local_addr_list		(sctp_globals.local_addr_list)
 #define sctp_local_addr_lock		(sctp_globals.addr_list_lock)
+#define sctp_auto_asconf_splist		(sctp_globals.auto_asconf_splist)
+#define sctp_addr_waitq			(sctp_globals.addr_waitq)
+#define sctp_addr_wq_timer		(sctp_globals.addr_wq_timer)
+#define sctp_addr_wq_lock		(sctp_globals.addr_wq_lock)
+#define sctp_default_auto_asconf	(sctp_globals.default_auto_asconf)
 #define sctp_scope_policy		(sctp_globals.ipv4_scope_policy)
 #define sctp_addip_enable		(sctp_globals.addip_enable)
 #define sctp_addip_noauth		(sctp_globals.addip_noauth_enable)
@@ -341,6 +351,8 @@ struct sctp_sock {
 	atomic_t pd_mode;
 	/* Receive to here while partial delivery is in effect. */
 	struct sk_buff_head pd_lobby;
+	struct list_head auto_asconf_list;
+	int do_auto_asconf;
 };
 
 static inline struct sctp_sock *sctp_sk(const struct sock *sk)
@@ -796,6 +808,8 @@ struct sctp_sockaddr_entry {
 	__u8 valid;
 };
 
+#define SCTP_ADDRESS_TICK_DELAY	500
+
 typedef struct sctp_chunk *(sctp_packet_phandler_t)(struct sctp_association *);
 
 /* This structure holds lists of chunks as we are assembling for
@@ -1239,6 +1253,7 @@ sctp_scope_t sctp_scope(const union sctp_addr *);
 int sctp_in_scope(const union sctp_addr *addr, const sctp_scope_t scope);
 int sctp_is_any(struct sock *sk, const union sctp_addr *addr);
 int sctp_addr_is_valid(const union sctp_addr *addr);
+int sctp_is_ep_boundall(struct sock *sk);
 
 
 /* What type of endpoint?  */
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index faf71d1..869267b 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -536,6 +536,21 @@ int sctp_in_scope(const union sctp_addr *addr, sctp_scope_t scope)
 	return 0;
 }
 
+int sctp_is_ep_boundall(struct sock *sk)
+{
+	struct sctp_bind_addr *bp;
+	struct sctp_sockaddr_entry *addr;
+
+	bp = &sctp_sk(sk)->ep->base.bind_addr;
+	if (sctp_list_single_entry(&bp->address_list)) {
+		addr = list_entry(bp->address_list.next,
+				  struct sctp_sockaddr_entry, list);
+		if (sctp_is_any(sk, &addr->a))
+			return 1;
+	}
+	return 0;
+}
+
 /********************************************************************
  * 3rd Level Abstractions
  ********************************************************************/
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 321f175..233eb3a 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -105,6 +105,7 @@ static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev,
 			addr->valid = 1;
 			spin_lock_bh(&sctp_local_addr_lock);
 			list_add_tail_rcu(&addr->list, &sctp_local_addr_list);
+			sctp_addr_wq_mgmt(addr, SCTP_ADDR_NEW);
 			spin_unlock_bh(&sctp_local_addr_lock);
 		}
 		break;
@@ -115,6 +116,7 @@ static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev,
 			if (addr->a.sa.sa_family == AF_INET6 &&
 					ipv6_addr_equal(&addr->a.v6.sin6_addr,
 						&ifa->addr)) {
+				sctp_addr_wq_mgmt(addr, SCTP_ADDR_DEL);
 				found = 1;
 				addr->valid = 0;
 				list_del_rcu(&addr->list);
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index d5bf91d..638bafc 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -636,6 +636,160 @@ static void sctp_v4_ecn_capable(struct sock *sk)
 	INET_ECN_xmit(sk);
 }
 
+void sctp_addr_wq_timeout_handler(unsigned long arg)
+{
+	struct sctp_sockaddr_entry *addrw = NULL;
+	union sctp_addr *addr = NULL;
+	struct sctp_sock *sp = NULL;
+
+	spin_lock_bh(&sctp_addr_wq_lock);
+retry_wq:
+	if (list_empty(&sctp_addr_waitq)) {
+		SCTP_DEBUG_PRINTK("sctp_addrwq_timo_handler: nothing in addr waitq\n");
+		spin_unlock_bh(&sctp_addr_wq_lock);
+		return;
+	}
+	addrw = list_first_entry(&sctp_addr_waitq, struct sctp_sockaddr_entry,
+			list);
+	addr = &addrw->a;
+	SCTP_DEBUG_PRINTK_IPADDR("sctp_addrwq_timo_handler: the first ent in wq %p is ",
+	    " for cmd %d at entry %p\n", &sctp_addr_waitq, addr, addrw->state,
+	    addrw);
+
+	/* Now we send an ASCONF for each association */
+	/* Note. we currently don't handle link local IPv6 addressees */
+	if (addr->sa.sa_family == AF_INET6) {
+		struct in6_addr *in6 = (struct in6_addr *)&addr->v6.sin6_addr;
+
+		if (ipv6_addr_type(&addr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) {
+			SCTP_DEBUG_PRINTK("sctp_timo_handler: link local, hence don't tell sockets\n");
+			list_del(&addrw->list);
+			kfree(addrw);
+			goto retry_wq;
+		}
+		if (ipv6_chk_addr(&init_net, in6, NULL, 0) == 0 &&
+		    addrw->state == SCTP_ADDR_NEW) {
+			unsigned long timeo_val;
+
+			SCTP_DEBUG_PRINTK("sctp_timo_handler: this is on DAD, trying %d sec later\n",
+			    SCTP_ADDRESS_TICK_DELAY);
+			timeo_val = jiffies;
+			timeo_val += msecs_to_jiffies(SCTP_ADDRESS_TICK_DELAY);
+			mod_timer(&sctp_addr_wq_timer, timeo_val);
+			spin_unlock_bh(&sctp_addr_wq_lock);
+			return;
+		}
+	}
+	list_for_each_entry(sp, &sctp_auto_asconf_splist, auto_asconf_list) {
+		struct sock *sk;
+
+		sk = sctp_opt2sk(sp);
+		/* ignore bound-specific endpoints */
+		if (!sctp_is_ep_boundall(sk))
+			continue;
+		sctp_bh_lock_sock(sk);
+		if (sctp_asconf_mgmt(sp, addrw) < 0) {
+			SCTP_DEBUG_PRINTK("sctp_addrwq_timo_handler: sctp_asconf_mgmt failed\n");
+			sctp_bh_unlock_sock(sk);
+			continue;
+		}
+		sctp_bh_unlock_sock(sk);
+	}
+
+	list_del(&addrw->list);
+	kfree(addrw);
+
+	if (!list_empty(&sctp_addr_waitq))
+		goto retry_wq;
+
+	spin_unlock_bh(&sctp_addr_wq_lock);
+}
+
+void sctp_free_addr_wq()
+{
+	struct sctp_sockaddr_entry *addrw = NULL;
+	struct sctp_sockaddr_entry *temp = NULL;
+
+	spin_lock_bh(&sctp_addr_wq_lock);
+	del_timer(&sctp_addr_wq_timer);
+	list_for_each_entry_safe(addrw, temp, &sctp_addr_waitq, list) {
+		list_del(&addrw->list);
+		kfree(addrw);
+	}
+	spin_unlock_bh(&sctp_addr_wq_lock);
+}
+
+/* lookup the entry for the same address in the addr_waitq
+ * sctp_addr_wq MUST be locked
+ */
+static struct sctp_sockaddr_entry *sctp_addr_wq_lookup(struct sctp_sockaddr_entry *addr)
+{
+	struct sctp_sockaddr_entry *addrw;
+
+	list_for_each_entry(addrw, &sctp_addr_waitq, list) {
+		if (addrw->a.sa.sa_family != addr->a.sa.sa_family)
+			continue;
+		if (addrw->a.sa.sa_family == AF_INET) {
+			if (addrw->a.v4.sin_addr.s_addr ==
+			    addr->a.v4.sin_addr.s_addr)
+				return addrw;
+		} else if (addrw->a.sa.sa_family == AF_INET6) {
+			if (ipv6_addr_equal(&addrw->a.v6.sin6_addr,
+			    &addr->a.v6.sin6_addr))
+				return addrw;
+		}
+	}
+	return NULL;
+}
+
+void sctp_addr_wq_mgmt(struct sctp_sockaddr_entry *addr, int cmd)
+{
+	struct sctp_sockaddr_entry *addrw = NULL;
+	unsigned long timeo_val;
+	union sctp_addr *tmpaddr;
+
+	/* first, we check if an opposite message already exist in the queue.
+	 * If we found such message, it is removed.
+	 * This operation is a bit stupid, but the DHCP client attaches the
+	 * new address after a couple of addition and deletion of that address
+	 */
+
+	spin_lock_bh(&sctp_addr_wq_lock);
+	/* Offsets existing events in addr_wq */
+	addrw = sctp_addr_wq_lookup(addr);
+	tmpaddr = &addrw->a;
+	if (addrw) {
+		if (addrw->state != cmd) {
+			SCTP_DEBUG_PRINTK_IPADDR("sctp_addr_wq_mgmt offsets existing entry for %d ",
+			    " in wq %p\n", addrw->state, tmpaddr,
+			    &sctp_addr_waitq);
+			list_del(&addrw->list);
+			kfree(addrw);
+		}
+		spin_unlock_bh(&sctp_addr_wq_lock);
+		return;
+	}
+
+	/* OK, we have to add the new address to the wait queue */
+	addrw = kmemdup(addr, sizeof(struct sctp_sockaddr_entry), GFP_ATOMIC);
+	if (addrw == NULL) {
+		spin_unlock_bh(&sctp_addr_wq_lock);
+		return;
+	}
+	addrw->state = cmd;
+	list_add_tail(&addrw->list, &sctp_addr_waitq);
+	tmpaddr = &addrw->a;
+	SCTP_DEBUG_PRINTK_IPADDR("sctp_addr_wq_mgmt add new entry for cmd:%d ",
+	    " in wq %p\n", addrw->state, tmpaddr, &sctp_addr_waitq);
+
+	if (!timer_pending(&sctp_addr_wq_timer)) {
+		timeo_val = jiffies;
+		timeo_val += msecs_to_jiffies(SCTP_ADDRESS_TICK_DELAY);
+		mod_timer(&sctp_addr_wq_timer, timeo_val);
+	}
+	spin_unlock_bh(&sctp_addr_wq_lock);
+}
+
 /* Event handler for inet address addition/deletion events.
  * The sctp_local_addr_list needs to be protocted by a spin lock since
  * multiple notifiers (say IPv4 and IPv6) may be running at the same
@@ -663,6 +817,7 @@ static int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev,
 			addr->valid = 1;
 			spin_lock_bh(&sctp_local_addr_lock);
 			list_add_tail_rcu(&addr->list, &sctp_local_addr_list);
+			sctp_addr_wq_mgmt(addr, SCTP_ADDR_NEW);
 			spin_unlock_bh(&sctp_local_addr_lock);
 		}
 		break;
@@ -673,6 +828,7 @@ static int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev,
 			if (addr->a.sa.sa_family == AF_INET &&
 					addr->a.v4.sin_addr.s_addr ==
 					ifa->ifa_local) {
+				sctp_addr_wq_mgmt(addr, SCTP_ADDR_DEL);
 				found = 1;
 				addr->valid = 0;
 				list_del_rcu(&addr->list);
@@ -1256,6 +1412,7 @@ SCTP_STATIC __init int sctp_init(void)
 	/* Disable ADDIP by default. */
 	sctp_addip_enable = 0;
 	sctp_addip_noauth = 0;
+	sctp_default_auto_asconf = 0;
 
 	/* Enable PR-SCTP by default. */
 	sctp_prsctp_enable = 1;
@@ -1280,6 +1437,13 @@ SCTP_STATIC __init int sctp_init(void)
 	spin_lock_init(&sctp_local_addr_lock);
 	sctp_get_local_addr_list();
 
+	/* Initialize the address event list */
+	INIT_LIST_HEAD(&sctp_addr_waitq);
+	INIT_LIST_HEAD(&sctp_auto_asconf_splist);
+	spin_lock_init(&sctp_addr_wq_lock);
+	sctp_addr_wq_timer.expires = 0;
+	setup_timer(&sctp_addr_wq_timer, sctp_addr_wq_timeout_handler, 0);
+
 	status = sctp_v4_protosw_init();
 
 	if (status)
@@ -1351,6 +1515,7 @@ SCTP_STATIC __exit void sctp_exit(void)
 	/* Unregister with inet6/inet layers. */
 	sctp_v6_del_protocol();
 	sctp_v4_del_protocol();
+	sctp_free_addr_wq();
 
 	/* Free the control endpoint.  */
 	inet_ctl_sock_destroy(sctp_ctl_sock);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f694ee1..fb183c5 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -811,6 +811,37 @@ out:
 	return retval;
 }
 
+/* set addr events to assocs in the endpoint.  ep and addr_wq must be locked */
+int
+sctp_asconf_mgmt(struct sctp_sock *sp, struct sctp_sockaddr_entry *addrw)
+{
+	struct sock *sk = sctp_opt2sk(sp);
+	union sctp_addr *addr = NULL;
+	int cmd;
+	int error = 0;
+
+	addr = &addrw->a;
+	addr->v4.sin_port = htons(sp->ep->base.bind_addr.port);
+	cmd = addrw->state;
+
+	SCTP_DEBUG_PRINTK("sctp_asconf_mgmt sp:%p\n", sp);
+	if (cmd == SCTP_ADDR_NEW) {
+		error = sctp_send_asconf_add_ip(sk, (struct sockaddr *)addr, 1);
+		if (error) {
+			SCTP_DEBUG_PRINTK("asconf_mgmt: send_asconf_add_ip returns %d\n", error);
+			return error;
+		}
+	} else if (cmd == SCTP_ADDR_DEL) {
+		error = sctp_send_asconf_del_ip(sk, (struct sockaddr *)addr, 1);
+		if (error) {
+			SCTP_DEBUG_PRINTK("asconf_mgmt: send_asconf_del_ip returns %d\n", error);
+			return error;
+		}
+	}
+
+	return 0;
+}
+
 /* Helper for tunneling sctp_bindx() requests through sctp_setsockopt()
  *
  * API 8.1
@@ -3764,6 +3795,13 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk)
 	local_bh_disable();
 	percpu_counter_inc(&sctp_sockets_allocated);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+	if (sctp_default_auto_asconf) {
+		list_add_tail(&sp->auto_asconf_list,
+		    &sctp_auto_asconf_splist);
+		sp->do_auto_asconf = 1;
+	} else
+		sp->do_auto_asconf = 0;
+	SCTP_DEBUG_PRINTK("sctp_init_sk sk:%p ep:%p\n", sk, ep);
 	local_bh_enable();
 
 	return 0;
@@ -3773,11 +3811,17 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk)
 SCTP_STATIC void sctp_destroy_sock(struct sock *sk)
 {
 	struct sctp_endpoint *ep;
+	struct sctp_sock *sp;
 
 	SCTP_DEBUG_PRINTK("sctp_destroy_sock(sk: %p)\n", sk);
 
 	/* Release our hold on the endpoint. */
-	ep = sctp_sk(sk)->ep;
+	sp = sctp_sk(sk);
+	ep = sp->ep;
+	if (sp->do_auto_asconf) {
+		sp->do_auto_asconf = 0;
+		list_del(&sp->auto_asconf_list);
+	}
 	sctp_endpoint_free(ep);
 	local_bh_disable();
 	percpu_counter_dec(&sctp_sockets_allocated);


^ permalink raw reply related

* [PATCH net-next-2.6 v7 2/5] sctp: Add sysctl support for Auto-ASCONF
From: Michio Honda @ 2011-04-26  8:36 UTC (permalink / raw)
  To: netdev; +Cc: lksctp-developers

This patch allows the system administrator to change default Auto-ASCONF on/off behavior via an sysctl value.  

Signed-off-by: Michio Honda <micchie@sfc.wide.ad.jp>
---
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 50cb57f..6b39529 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -183,6 +183,13 @@ static ctl_table sctp_table[] = {
		.proc_handler	= proc_dointvec,
	},
	{
+		.procname	= "default_auto_asconf",
+		.data		= &sctp_default_auto_asconf,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
		.procname	= "prsctp_enable",
		.data		= &sctp_prsctp_enable,
		.maxlen		= sizeof(int),

^ permalink raw reply related

* [PATCH net-next-2.6 v7 3/5] sctp: Add socket option operation for Auto-ASCONF
From: Michio Honda @ 2011-04-26  8:36 UTC (permalink / raw)
  To: netdev; +Cc: lksctp-developers

This patch allows the application to operate Auto-ASCONF on/off behavior via setsockopt() and getsockopt().  

Signed-off-by: Michio Honda <micchie@sfc.wide.ad.jp>
---
diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
index 32fd512..0842ef0 100644
--- a/include/net/sctp/user.h
+++ b/include/net/sctp/user.h
@@ -92,6 +92,7 @@ typedef __s32 sctp_assoc_t;
#define SCTP_LOCAL_AUTH_CHUNKS	27	/* Read only */
#define SCTP_GET_ASSOC_NUMBER	28	/* Read only */
#define SCTP_GET_ASSOC_ID_LIST	29	/* Read only */
+#define SCTP_AUTO_ASCONF       30

/* Internal Socket Options. Some of the sctp library functions are
 * implemented using these socket options.
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f694ee1..bf3501c 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3335,6 +3335,46 @@ static int sctp_setsockopt_del_key(struct sock *sk,

}

+/*
+ * 8.1.23 SCTP_AUTO_ASCONF
+ *
+ * This option will enable or disable the use of the automatic generation of
+ * ASCONF chunks to add and delete addresses to an existing association.  Note
+ * that this option has two caveats namely: a) it only affects sockets that
+ * are bound to all addresses available to the SCTP stack, and b) the system
+ * administrator may have an overriding control that turns the ASCONF feature
+ * off no matter what setting the socket option may have.
+ * This option expects an integer boolean flag, where a non-zero value turns on
+ * the option, and a zero value turns off the option.
+ * Note. In this implementation, socket operation overrides default parameter
+ * being set by sysctl as well as FreeBSD implementation
+ */
+static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
+					unsigned int optlen)
+{
+	int val;
+	struct sctp_sock *sp = sctp_sk(sk);
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+	if (get_user(val, (int __user *)optval))
+		return -EFAULT;
+	if (!sctp_is_ep_boundall(sk) && val)
+		return -EINVAL;
+	if ((val && sp->do_auto_asconf) || (!val && !sp->do_auto_asconf))
+		return 0;
+
+	if (val == 0 && sp->do_auto_asconf) {
+		list_del(&sp->auto_asconf_list);
+		sp->do_auto_asconf = 0;
+	} else if (val && !sp->do_auto_asconf) {
+		list_add_tail(&sp->auto_asconf_list,
+		    &sctp_auto_asconf_splist);
+		sp->do_auto_asconf = 1;
+	}
+	return 0;
+}
+

/* API 6.2 setsockopt(), getsockopt()
 *
@@ -3482,6 +3522,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
	case SCTP_AUTH_DELETE_KEY:
		retval = sctp_setsockopt_del_key(sk, optval, optlen);
		break;
+	case SCTP_AUTO_ASCONF:
+		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
+		break;
	default:
		retval = -ENOPROTOOPT;
		break;
@@ -5277,6 +5320,28 @@ static int sctp_getsockopt_assoc_number(struct sock *sk, int len,
	return 0;
}

+/*
+ * 8.1.23 SCTP_AUTO_ASCONF
+ * See the corresponding setsockopt entry as description
+ */
+static int sctp_getsockopt_auto_asconf(struct sock *sk, int len,
+				   char __user *optval, int __user *optlen)
+{
+	int val = 0;
+
+	if (len < sizeof(int))
+		return -EINVAL;
+
+	len = sizeof(int);
+	if (sctp_sk(sk)->do_auto_asconf && sctp_is_ep_boundall(sk))
+		val = 1;
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &val, len))
+		return -EFAULT;
+	return 0;
+}
+
/*
 * 8.2.6. Get the Current Identifiers of Associations
 *        (SCTP_GET_ASSOC_ID_LIST)
@@ -5461,6 +5526,9 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
	case SCTP_GET_ASSOC_ID_LIST:
		retval = sctp_getsockopt_assoc_ids(sk, len, optval, optlen);
		break;
+	case SCTP_AUTO_ASCONF:
+		retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
+		break;
	default:
		retval = -ENOPROTOOPT;
		break;

^ permalink raw reply related

* [PATCH net-next-2.6 v7 4/5] sctp: Add ADD/DEL ASCONF handling at the receiver
From: Michio Honda @ 2011-04-26  8:37 UTC (permalink / raw)
  To: netdev; +Cc: lksctp-developers

This patch fixes the problem that the original code cannot delete the remote address where the corresponding transport is currently directed, even when the ASCONF is sent from the other address (this situation happens when the single-homed sender transmits  ASCONF with ADD and DEL.)  

Signed-off-by: Michio Honda <micchie@sfc.wide.ad.jp>
---
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 58eb27f..3740603 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -3014,7 +3014,7 @@ static __be16 sctp_process_asconf_param(struct sctp_association *asoc,
		 * an Error Cause TLV set to the new error code 'Request to
		 * Delete Source IP Address'
		 */
-		if (sctp_cmp_addr_exact(sctp_source(asconf), &addr))
+		if (sctp_cmp_addr_exact(&asconf->source, &addr))
			return SCTP_ERROR_DEL_SRC_IP;

		/* Section 4.2.2

^ permalink raw reply related

* [PATCH net-next-2.6 v7 5/5] sctp: Add ASCONF operation on the single-homed host
From: Michio Honda @ 2011-04-26  8:37 UTC (permalink / raw)
  To: netdev; +Cc: lksctp-developers

SCTP can change the IP address on the single-homed host.  
In this case, the SCTP association transmits an ASCONF packet including addition of the new IP address and deletion of the old address.  This patch implements this functionality.  
In this case, the ASCONF chunk is added to the beginning of the queue, because the other chunks cannot be transmitted in this state.  

Signed-off-by: Michio Honda <micchie@sfc.wide.ad.jp>
---
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 5c9bada..c0f5616 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1901,6 +1901,8 @@ struct sctp_association {
	 * after reaching 4294967295.
	 */
	__u32 addip_serial;
+	union sctp_addr *asconf_addr_del_pending;
+	int src_out_of_asoc_ok;

	/* SCTP AUTH: list of the endpoint shared keys.  These
	 * keys are provided out of band by the user applicaton
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 1a21c57..ea56b0d 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -279,6 +279,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
	asoc->peer.asconf_capable = 0;
	if (sctp_addip_noauth)
		asoc->peer.asconf_capable = 1;
+	asoc->asconf_addr_del_pending = NULL;
+	asoc->src_out_of_asoc_ok = 0;

	/* Create an input queue.  */
	sctp_inq_init(&asoc->base.inqueue);
@@ -443,6 +445,10 @@ void sctp_association_free(struct sctp_association *asoc)

	asoc->peer.transport_count = 0;

+	/* Free pending address space being deleted */
+	if (asoc->asconf_addr_del_pending != NULL)
+		kfree(asoc->asconf_addr_del_pending);
+
	/* Free any cached ASCONF_ACK chunk. */
	sctp_assoc_free_asconf_acks(asoc);

diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 321f175..e67cc31 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -332,6 +332,13 @@ static void sctp_v6_get_saddr(struct sctp_sock *sk,
				matchlen = bmatchlen;
			}
		}
+		if (laddr->state == SCTP_ADDR_NEW && asoc->src_out_of_asoc_ok) {
+			bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a);
+			if (!baddr || (matchlen < bmatchlen)) {
+				baddr = &laddr->a;
+				matchlen = bmatchlen;
+			}
+		}
	}

	if (baddr) {
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 1c88c89..7981854 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -754,6 +754,16 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
	 */

	list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) {
+		/* RFC 5061, 5.3
+		 * F1) This means that until such time as the ASCONF
+		 * containing the add is acknowledged, the sender MUST
+		 * NOT use the new IP address as a source for ANY SCTP
+		 * packet except on carrying an ASCONF Chunk.
+		 */
+		if (asoc->src_out_of_asoc_ok &&
+		    chunk->chunk_hdr->type != SCTP_CID_ASCONF)
+			continue;
+
		list_del_init(&chunk->list);

		/* Pick the right transport to use. */
@@ -881,6 +891,9 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
		}
	}

+	if (q->asoc->src_out_of_asoc_ok)
+		goto sctp_flush_out;
+
	/* Is it OK to send data chunks?  */
	switch (asoc->state) {
	case SCTP_STATE_COOKIE_ECHOED:
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index d5bf91d..40aef4d 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -510,7 +510,9 @@ static struct dst_entry *sctp_v4_get_dst(struct sctp_association *asoc,
		sctp_v4_dst_saddr(&dst_saddr, dst, htons(bp->port));
		rcu_read_lock();
		list_for_each_entry_rcu(laddr, &bp->address_list, list) {
-			if (!laddr->valid || (laddr->state != SCTP_ADDR_SRC))
+			if (!laddr->valid || (laddr->state == SCTP_ADDR_DEL) ||
+			    (laddr->state != SCTP_ADDR_SRC &&
+			    !asoc->src_out_of_asoc_ok))
				continue;
			if (sctp_v4_cmp_addr(&dst_saddr, &laddr->a))
				goto out_unlock;
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 58eb27f..53e5ea2 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2768,6 +2768,12 @@ struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc,
	int			addr_param_len = 0;
	int 			totallen = 0;
	int 			i;
+	sctp_addip_param_t del_param; /* 8 Bytes (Type 0xC002, Len and CrrID) */
+	struct sctp_af *del_af;
+	int del_addr_param_len = 0;
+	int del_paramlen = sizeof(sctp_addip_param_t);
+	union sctp_addr_param del_addr_param; /* (v4) 8 Bytes, (v6) 20 Bytes */
+	int			del_pickup = 0;

	/* Get total length of all the address parameters. */
	addr_buf = addrs;
@@ -2780,6 +2786,17 @@ struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc,
		totallen += addr_param_len;

		addr_buf += af->sockaddr_len;
+		if (asoc->asconf_addr_del_pending && !del_pickup) {
+			if (!sctp_in_scope(asoc->asconf_addr_del_pending,
+			    sctp_scope(addr)))
+				continue;
+			/* reuse the parameter length from the same scope one */
+			totallen += paramlen;
+			totallen += addr_param_len;
+			del_pickup = 1;
+			asoc->src_out_of_asoc_ok = 1;
+			SCTP_DEBUG_PRINTK("mkasconf_update_ip: picked same-scope del_pending addr, totallen for all addresses is %d\n", totallen);
+		}
	}

	/* Create an asconf chunk with the required length. */
@@ -2802,6 +2819,19 @@ struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc,

		addr_buf += af->sockaddr_len;
	}
+	if (flags == SCTP_PARAM_ADD_IP && del_pickup) {
+		addr = asoc->asconf_addr_del_pending;
+		del_af = sctp_get_af_specific(addr->v4.sin_family);
+		del_addr_param_len = del_af->to_addr_param(addr,
+		    &del_addr_param);
+		del_param.param_hdr.type = SCTP_PARAM_DEL_IP;
+		del_param.param_hdr.length = htons(del_paramlen +
+		    del_addr_param_len);
+		del_param.crr_id = i;
+
+		sctp_addto_chunk(retval, del_paramlen, &del_param);
+		sctp_addto_chunk(retval, del_addr_param_len, &del_addr_param);
+	}
	return retval;
}

@@ -3224,6 +3254,11 @@ static void sctp_asconf_param_success(struct sctp_association *asoc,
	case SCTP_PARAM_DEL_IP:
		local_bh_disable();
		sctp_del_bind_addr(bp, &addr);
+		if (asoc->asconf_addr_del_pending != NULL &&
+		    sctp_cmp_addr_exact(asoc->asconf_addr_del_pending, &addr)) {
+			kfree(asoc->asconf_addr_del_pending);
+			asoc->asconf_addr_del_pending = NULL;
+		}
		local_bh_enable();
		list_for_each_entry(transport, &asoc->peer.transport_addr_list,
				transports) {
@@ -3381,6 +3416,9 @@ int sctp_process_asconf_ack(struct sctp_association *asoc,
		asconf_len -= length;
	}

+	if (no_err && asoc->src_out_of_asoc_ok)
+		asoc->src_out_of_asoc_ok = 0;
+
	/* Free the cached last sent asconf chunk. */
	list_del_init(&asconf->transmitted_list);
	sctp_chunk_free(asconf);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f694ee1..7d2fd48 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -583,10 +583,6 @@ static int sctp_send_asconf_add_ip(struct sock		*sk,
			goto out;
		}

-		retval = sctp_send_asconf(asoc, chunk);
-		if (retval)
-			goto out;
-
		/* Add the new addresses to the bind address list with
		 * use_as_src set to 0.
		 */
@@ -599,6 +595,23 @@ static int sctp_send_asconf_add_ip(struct sock		*sk,
						    SCTP_ADDR_NEW, GFP_ATOMIC);
			addr_buf += af->sockaddr_len;
		}
+		if (asoc->src_out_of_asoc_ok) {
+			struct sctp_transport *trans;
+
+			list_for_each_entry(trans,
+			    &asoc->peer.transport_addr_list, transports) {
+				/* Clear the source and route cache */
+				dst_release(trans->dst);
+				trans->cwnd = min(4*asoc->pathmtu, max_t(__u32,
+				    2*asoc->pathmtu, 4380));
+				trans->ssthresh = asoc->peer.i.a_rwnd;
+				trans->rto = asoc->rto_initial;
+				trans->rtt = trans->srtt = trans->rttvar = 0;
+				sctp_transport_route(trans, NULL,
+				    sctp_sk(asoc->base.sk));
+			}
+		}
+		retval = sctp_send_asconf(asoc, chunk);
	}

out:
@@ -715,7 +728,9 @@ static int sctp_send_asconf_del_ip(struct sock		*sk,
	struct sctp_sockaddr_entry *saddr;
	int 			i;
	int 			retval = 0;
+	int			stored = 0;

+	chunk = NULL;
	if (!sctp_addip_enable)
		return retval;

@@ -766,8 +781,32 @@ static int sctp_send_asconf_del_ip(struct sock		*sk,
		bp = &asoc->base.bind_addr;
		laddr = sctp_find_unmatch_addr(bp, (union sctp_addr *)addrs,
					       addrcnt, sp);
-		if (!laddr)
-			continue;
+		if ((laddr == NULL) && (addrcnt == 1)) {
+			if (asoc->asconf_addr_del_pending)
+				continue;
+			asoc->asconf_addr_del_pending =
+			    kzalloc(sizeof(union sctp_addr), GFP_ATOMIC);
+			asoc->asconf_addr_del_pending->sa.sa_family =
+				    addrs->sa_family;
+			asoc->asconf_addr_del_pending->v4.sin_port =
+				    htons(bp->port);
+			if (addrs->sa_family == AF_INET) {
+				struct sockaddr_in *sin;
+
+				sin = (struct sockaddr_in *)addrs;
+				asoc->asconf_addr_del_pending->v4.sin_addr.s_addr = sin->sin_addr.s_addr;
+			} else if (addrs->sa_family == AF_INET6) {
+				struct sockaddr_in6 *sin6;
+
+				sin6 = (struct sockaddr_in6 *)addrs;
+				ipv6_addr_copy(&asoc->asconf_addr_del_pending->v6.sin6_addr, &sin6->sin6_addr);
+			}
+			SCTP_DEBUG_PRINTK_IPADDR("send_asconf_del_ip: keep the last address asoc: %p ",
+			    " at %p\n", asoc, asoc->asconf_addr_del_pending,
+			    asoc->asconf_addr_del_pending);
+			stored = 1;
+			goto skip_mkasconf;
+		}

		/* We do not need RCU protection throughout this loop
		 * because this is done under a socket lock from the
@@ -780,6 +819,7 @@ static int sctp_send_asconf_del_ip(struct sock		*sk,
			goto out;
		}

+skip_mkasconf:
		/* Reset use_as_src flag for the addresses in the bind address
		 * list that are to be deleted.
		 */
@@ -805,6 +845,9 @@ static int sctp_send_asconf_del_ip(struct sock		*sk,
					     sctp_sk(asoc->base.sk));
		}

+		if (stored)
+			/* We don't need to transmit ASCONF */
+			continue;
		retval = sctp_send_asconf(asoc, chunk);
	}
out:

^ permalink raw reply related

* Re: [PATCH net-next-2.6 v7 1/5] sctp: Add Auto-ASCONF support
From: Wei Yongjun @ 2011-04-26  8:43 UTC (permalink / raw)
  To: Michio Honda; +Cc: netdev, lksctp-developers
In-Reply-To: <2418F0AB-37A2-4B85-8723-32D92A48B7DA@sfc.wide.ad.jp>


> SCTP reconfigure the IP addresses in the association by using ASCONF chunks as mentioned in RFC5061.  
> For example, we can start to use the newly configured IP address in the existing association.  
> ASCONF operation is invoked in two ways: 
> First is done by the application to call sctp_bindx() system call.  
> Second is automatic operation in the SCTP stack with address events in the host computer (called auto_asconf) .  
> The former is already implemented, and this patch implement the latter.  
>
> Signed-off-by: Michio Honda <micchie@sfc.wide.ad.jp>
> ---
> diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
> index 7e8e34c..872ffa0 100644
> --- a/include/net/sctp/sctp.h
> +++ b/include/net/sctp/sctp.h
> @@ -121,6 +121,8 @@ extern int sctp_copy_local_addr_list(struct sctp_bind_addr *,
>  				     int flags);
>  extern struct sctp_pf *sctp_get_pf_specific(sa_family_t family);
>  extern int sctp_register_pf(struct sctp_pf *, sa_family_t);
> +void sctp_addr_wq_mgmt(struct sctp_sockaddr_entry *, int);
> +void sctp_free_addr_wq(void);
>  
>

...

> +void sctp_free_addr_wq()

static void sctp_free_addr_wq(void)

> +{
> +	struct sctp_sockaddr_entry *addrw = NULL;
> +	struct sctp_sockaddr_entry *temp = NULL;
> +
> +	spin_lock_bh(&sctp_addr_wq_lock);
> +	del_timer(&sctp_addr_wq_timer);
> +	list_for_each_entry_safe(addrw, temp, &sctp_addr_waitq, list) {
> +		list_del(&addrw->list);
> +		kfree(addrw);
> +	}
> +	spin_unlock_bh(&sctp_addr_wq_lock);
> +}
> +
...

^ permalink raw reply

* Re: [PATCH 03/13] mm: Introduce __GFP_MEMALLOC to allow access to emergency reserves
From: NeilBrown @ 2011-04-26  9:49 UTC (permalink / raw)
  To: Mel Gorman; +Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Peter Zijlstra
In-Reply-To: <1303803414-5937-4-git-send-email-mgorman@suse.de>

On Tue, 26 Apr 2011 08:36:44 +0100 Mel Gorman <mgorman@suse.de> wrote:

> __GFP_MEMALLOC will allow the allocation to disregard the watermarks,
> much like PF_MEMALLOC. It allows one to pass along the memalloc state in
> object related allocation flags as opposed to task related flags, such
> as sk->sk_allocation. This removes the need for ALLOC_PFMEMALLOC as
> callers using __GFP_MEMALLOC can get the ALLOC_NO_WATERMARK flag which
> is now enough to identify allocations related to page reclaim.
> 
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> Signed-off-by: Mel Gorman <mgorman@suse.de>
> ---
>  include/linux/gfp.h      |    4 +++-
>  include/linux/mm_types.h |    2 +-
>  mm/page_alloc.c          |   14 ++++++--------
>  mm/slab.c                |    2 +-
>  4 files changed, 11 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index bfb8f93..4e011e7 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -23,6 +23,7 @@ struct vm_area_struct;
>  #define ___GFP_REPEAT		0x400u
>  #define ___GFP_NOFAIL		0x800u
>  #define ___GFP_NORETRY		0x1000u
> +#define ___GFP_MEMALLOC		0x2000u
>  #define ___GFP_COMP		0x4000u
>  #define ___GFP_ZERO		0x8000u
>  #define ___GFP_NOMEMALLOC	0x10000u
> @@ -75,6 +76,7 @@ struct vm_area_struct;
>  #define __GFP_REPEAT	((__force gfp_t)___GFP_REPEAT)	/* See above */
>  #define __GFP_NOFAIL	((__force gfp_t)___GFP_NOFAIL)	/* See above */
>  #define __GFP_NORETRY	((__force gfp_t)___GFP_NORETRY) /* See above */
> +#define __GFP_MEMALLOC	((__force gfp_t)___GFP_MEMALLOC)/* Allow access to emergency reserves */
>  #define __GFP_COMP	((__force gfp_t)___GFP_COMP)	/* Add compound page metadata */
>  #define __GFP_ZERO	((__force gfp_t)___GFP_ZERO)	/* Return zeroed page on success */
>  #define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /* Don't use emergency reserves */

Having both "MEMALLOC" and  "NOMEMALLOC" seems ... unfortunate.

It appears that NOMEMALLOC over-rides MEMALLOC.  It might be good to document
this 

> +#define __GFP_MEMALLOC	((__force gfp_t)___GFP_MEMALLOC)/* Allow access to emergency reserves
                                                                   unless __GFP_NOMEMALLOC is set*/

>  #define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /* Don't use emergency reserves
                                                                  Overrides __GFP_MEMALLOC */

I suspect that it is never valid to set both.  So NOMEMALLOC is really
NO_PF_MEMALLOC, but making that change is probably just noise.

Maybe a
   WARN_ON((gfp_mask & __GFP_MEMALLOC) && (gfp_mask & __GFP_NOMEMALLOC));
might be wise?

NeilBrown.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH 03/13] mm: Introduce __GFP_MEMALLOC to allow access to emergency reserves
From: Mel Gorman @ 2011-04-26 10:36 UTC (permalink / raw)
  To: NeilBrown; +Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Peter Zijlstra
In-Reply-To: <20110426194947.764e048a@notabene.brown>

On Tue, Apr 26, 2011 at 07:49:47PM +1000, NeilBrown wrote:
> On Tue, 26 Apr 2011 08:36:44 +0100 Mel Gorman <mgorman@suse.de> wrote:
> 
> > __GFP_MEMALLOC will allow the allocation to disregard the watermarks,
> > much like PF_MEMALLOC. It allows one to pass along the memalloc state in
> > object related allocation flags as opposed to task related flags, such
> > as sk->sk_allocation. This removes the need for ALLOC_PFMEMALLOC as
> > callers using __GFP_MEMALLOC can get the ALLOC_NO_WATERMARK flag which
> > is now enough to identify allocations related to page reclaim.
> > 
> > Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > Signed-off-by: Mel Gorman <mgorman@suse.de>
> > ---
> >  include/linux/gfp.h      |    4 +++-
> >  include/linux/mm_types.h |    2 +-
> >  mm/page_alloc.c          |   14 ++++++--------
> >  mm/slab.c                |    2 +-
> >  4 files changed, 11 insertions(+), 11 deletions(-)
> > 
> > diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> > index bfb8f93..4e011e7 100644
> > --- a/include/linux/gfp.h
> > +++ b/include/linux/gfp.h
> > @@ -23,6 +23,7 @@ struct vm_area_struct;
> >  #define ___GFP_REPEAT		0x400u
> >  #define ___GFP_NOFAIL		0x800u
> >  #define ___GFP_NORETRY		0x1000u
> > +#define ___GFP_MEMALLOC		0x2000u
> >  #define ___GFP_COMP		0x4000u
> >  #define ___GFP_ZERO		0x8000u
> >  #define ___GFP_NOMEMALLOC	0x10000u
> > @@ -75,6 +76,7 @@ struct vm_area_struct;
> >  #define __GFP_REPEAT	((__force gfp_t)___GFP_REPEAT)	/* See above */
> >  #define __GFP_NOFAIL	((__force gfp_t)___GFP_NOFAIL)	/* See above */
> >  #define __GFP_NORETRY	((__force gfp_t)___GFP_NORETRY) /* See above */
> > +#define __GFP_MEMALLOC	((__force gfp_t)___GFP_MEMALLOC)/* Allow access to emergency reserves */
> >  #define __GFP_COMP	((__force gfp_t)___GFP_COMP)	/* Add compound page metadata */
> >  #define __GFP_ZERO	((__force gfp_t)___GFP_ZERO)	/* Return zeroed page on success */
> >  #define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /* Don't use emergency reserves */
> 
> Having both "MEMALLOC" and  "NOMEMALLOC" seems ... unfortunate.
> 
> It appears that NOMEMALLOC over-rides MEMALLOC.  It might be good to document
> this 
> 

I can document it. Right now, a better name than NOMEMALLOC does not
spring to mind.

> > +#define __GFP_MEMALLOC	((__force gfp_t)___GFP_MEMALLOC)/* Allow access to emergency reserves
>                                                                    unless __GFP_NOMEMALLOC is set*/
> 
> >  #define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /* Don't use emergency reserves
>                                                                   Overrides __GFP_MEMALLOC */
> 
> I suspect that it is never valid to set both.  So NOMEMALLOC is really
> NO_PF_MEMALLOC, but making that change is probably just noise.
> 
> Maybe a
>    WARN_ON((gfp_mask & __GFP_MEMALLOC) && (gfp_mask & __GFP_NOMEMALLOC));
> might be wise?
> 

Both MEMALLOC and NOMEMALLOC are related to PFMEMALLOC reserves so
it's reasonable for them to have similar names. This warning will
also trigger because it's a combination of flags that does happen.

Consider for example

any interrupt
  -> __netdev_alloc_skb		(mask == GFP_ATOMIC)
    -> __alloc_skb		(mask == GFP_ATOMIC)
       if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
               gfp_mask |= __GFP_MEMALLOC;
				(mask == GFP_ATOMIC|__GFP_NOMEMALLOC)
      -> __kmalloc_reserve
		First attempt tries to avoid reserves so adds __GFP_MEMALLOC
				(mask == GFP_ATOMIC|__GFP_NOMEMALLOC|__GFP_MEMALLOC)

You're right in that __GFP_NOMEMALLOC overrides __GFP_MEMALLOC so that
could do with a note.

-- 
Mel Gorman
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH 03/13] mm: Introduce __GFP_MEMALLOC to allow access to emergency reserves
From: NeilBrown @ 2011-04-26 10:53 UTC (permalink / raw)
  To: Mel Gorman; +Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Peter Zijlstra
In-Reply-To: <20110426103646.GD4658@suse.de>

On Tue, 26 Apr 2011 11:36:46 +0100 Mel Gorman <mgorman@suse.de> wrote:

> On Tue, Apr 26, 2011 at 07:49:47PM +1000, NeilBrown wrote:

> > Maybe a
> >    WARN_ON((gfp_mask & __GFP_MEMALLOC) && (gfp_mask & __GFP_NOMEMALLOC));
> > might be wise?
> > 
> 
> Both MEMALLOC and NOMEMALLOC are related to PFMEMALLOC reserves so
> it's reasonable for them to have similar names. This warning will
> also trigger because it's a combination of flags that does happen.
> 
> Consider for example
> 
> any interrupt
>   -> __netdev_alloc_skb		(mask == GFP_ATOMIC)
>     -> __alloc_skb		(mask == GFP_ATOMIC)
>        if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
>                gfp_mask |= __GFP_MEMALLOC;
> 				(mask == GFP_ATOMIC|__GFP_NOMEMALLOC)
>       -> __kmalloc_reserve
> 		First attempt tries to avoid reserves so adds __GFP_MEMALLOC
> 				(mask == GFP_ATOMIC|__GFP_NOMEMALLOC|__GFP_MEMALLOC)
> 

You have the "NO"s mixed up a bit which confused me for a while :-)
But I see your point - I guess the WARN_ON isn't really needed.


> You're right in that __GFP_NOMEMALLOC overrides __GFP_MEMALLOC so that
> could do with a note.
> 

Thanks,

NeilBrown

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH net-2.6 4/4] xfrm: Fix integer underrun on zero sized replay windows
From: Steffen Klassert @ 2011-04-26 10:58 UTC (permalink / raw)
  To: David Miller, Herbert Xu; +Cc: netdev
In-Reply-To: <20110426054232.GI5495@secunet.com>

On Tue, Apr 26, 2011 at 07:42:32AM +0200, Steffen Klassert wrote:
> The check if the replay window is contained within one subspace or
> spans over two subspaces causes an unwanted integer underrun on
> zero sized replay windows when we subtract minus one. We fix this by
> changeing this check to avoid the subtraction.
> 

Don't apply this one, it does not fix the issue completely.
I'll send a better one, sorry.

^ permalink raw reply

* Re: [PATCH 02/13] mm: sl[au]b: Add knowledge of PFMEMALLOC reserve pages
From: NeilBrown @ 2011-04-26 11:15 UTC (permalink / raw)
  To: Mel Gorman; +Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Peter Zijlstra
In-Reply-To: <1303803414-5937-3-git-send-email-mgorman@suse.de>

On Tue, 26 Apr 2011 08:36:43 +0100 Mel Gorman <mgorman@suse.de> wrote:

> +bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
> +{
> +	return gfp_to_alloc_flags(gfp_mask) & ALLOC_PFMEMALLOC;
> +}
> +
>  static inline struct page *
>  __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
>  	struct zonelist *zonelist, enum zone_type high_zoneidx,
> @@ -2202,8 +2211,16 @@ nopage:
>  got_pg:
>  	if (kmemcheck_enabled)
>  		kmemcheck_pagealloc_alloc(page, order, gfp_mask);
> -	return page;
>  
> +	/*
> +	 * page->pfmemalloc is set when the caller had PFMEMALLOC set or is
> +	 * been OOM killed. The expectation is that the caller is taking
> +	 * steps that will free more memory. The caller should avoid the
> +	 * page being used for !PFMEMALLOC purposes.
> +	 */
> +	page->pfmemalloc = (alloc_flags & ALLOC_PFMEMALLOC);
> +
> +	return page;

Linus doesn't seem to be a fan of this construct:
   https://lkml.org/lkml/2011/4/1/255

pfmemalloc is a bool, and the value on the right is either 0 or 0x1000.

If bool happens to be typedefed to 'char' or even 'short', pfmemalloc would
always be set to 0.
Ditto for the gfp_pfmemalloc_allowed function.

Prefixing with '!!' would make it safe.

NeilBrown

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH 02/13] mm: sl[au]b: Add knowledge of PFMEMALLOC reserve pages
From: Mel Gorman @ 2011-04-26 11:33 UTC (permalink / raw)
  To: NeilBrown; +Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Peter Zijlstra
In-Reply-To: <20110426211500.02d6a5a6@notabene.brown>

On Tue, 2011-04-26 at 21:15 +1000, NeilBrown wrote:
> On Tue, 26 Apr 2011 08:36:43 +0100 Mel Gorman <mgorman@suse.de> wrote:
> 
> > +bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
> > +{
> > +	return gfp_to_alloc_flags(gfp_mask) & ALLOC_PFMEMALLOC;
> > +}
> > +
> >  static inline struct page *
> >  __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
> >  	struct zonelist *zonelist, enum zone_type high_zoneidx,
> > @@ -2202,8 +2211,16 @@ nopage:
> >  got_pg:
> >  	if (kmemcheck_enabled)
> >  		kmemcheck_pagealloc_alloc(page, order, gfp_mask);
> > -	return page;
> >  
> > +	/*
> > +	 * page->pfmemalloc is set when the caller had PFMEMALLOC set or is
> > +	 * been OOM killed. The expectation is that the caller is taking
> > +	 * steps that will free more memory. The caller should avoid the
> > +	 * page being used for !PFMEMALLOC purposes.
> > +	 */
> > +	page->pfmemalloc = (alloc_flags & ALLOC_PFMEMALLOC);
> > +
> > +	return page;
> 
> Linus doesn't seem to be a fan of this construct:
>    https://lkml.org/lkml/2011/4/1/255
> 

There is confusion around this topic. Andrew prefers bool for true/false
values and it's self-documenting. I tend to prefer it myself for
readability and there is a slow conversion in the VM from
ints-used-as-bools to bools and my understanding of bool is that any
non-zero value will be treated as true (just as it is for int).

> pfmemalloc is a bool, and the value on the right is either 0 or 0x1000.
> 
> If bool happens to be typedefed to 'char' or even 'short', pfmemalloc would
> always be set to 0.

It is typedeffed as _Bool though which I thought was able to handle the
cast appropriately or is that wrong?

> Ditto for the gfp_pfmemalloc_allowed function.
> 
> Prefixing with '!!' would make it safe.
> 

Will do that to avoid any oddities. Thanks

-- 
Mel Gorman
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH 02/13] mm: sl[au]b: Add knowledge of PFMEMALLOC reserve pages
From: NeilBrown @ 2011-04-26 11:37 UTC (permalink / raw)
  To: Mel Gorman; +Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Peter Zijlstra
In-Reply-To: <1303803414-5937-3-git-send-email-mgorman@suse.de>

On Tue, 26 Apr 2011 08:36:43 +0100 Mel Gorman <mgorman@suse.de> wrote:

> +		/*
> +		 * If there are full empty slabs and we were not forced to
> +		 * allocate a slab, mark this one !pfmemalloc
> +		 */
> +		l3 = cachep->nodelists[numa_mem_id()];
> +		if (!list_empty(&l3->slabs_free) && force_refill) {
> +			struct slab *slabp = virt_to_slab(objp);
> +			slabp->pfmemalloc = false;
> +			clear_obj_pfmemalloc(&objp);
> +			check_ac_pfmemalloc(cachep, ac);
> +			return objp;
> +		}

The comment doesn't match the code.  I think you need to remove the words
"full" and "not" assuming the code is correct which it probably is...

But the code seems to be much more complex than Peter's original, and I don't
see the gain.

Peter's code had only one 'reserved' flag for each kmem_cache.  You seem to
have one for every slab.  I don't see the point.
It is true that yours is in some sense more fair - but I'm not sure the
complexity is worth it.

Was there some particular reason you made the change?

NeilBrown

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH] dsa/mv88e6131: fix unknown multicast/broadcast forwarding on mv88e6085
From: Peter Korsgaard @ 2011-04-26 11:45 UTC (permalink / raw)
  To: davem, buytenh, netdev; +Cc: Peter Korsgaard

The 88e6085 has a few differences from the other devices in the port
control registers, causing unknown multicast/broadcast packets to get
dropped when using the standard port setup.

At the same time update kconfig to clarify that the mv88e6085 is now
supported.

Signed-off-by: Peter Korsgaard <jacmet@sunsite.dk>
---
 net/dsa/Kconfig     |    4 ++--
 net/dsa/mv88e6131.c |   26 +++++++++++++++++++++-----
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 87bb5f4..c53ded2 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -41,12 +41,12 @@ config NET_DSA_MV88E6XXX_NEED_PPU
 	default n
 
 config NET_DSA_MV88E6131
-	bool "Marvell 88E6095/6095F/6131 ethernet switch chip support"
+	bool "Marvell 88E6085/6095/6095F/6131 ethernet switch chip support"
 	select NET_DSA_MV88E6XXX
 	select NET_DSA_MV88E6XXX_NEED_PPU
 	select NET_DSA_TAG_DSA
 	---help---
-	  This enables support for the Marvell 88E6095/6095F/6131
+	  This enables support for the Marvell 88E6085/6095/6095F/6131
 	  ethernet switch chips.
 
 config NET_DSA_MV88E6123_61_65
diff --git a/net/dsa/mv88e6131.c b/net/dsa/mv88e6131.c
index 3da4188..45f7411 100644
--- a/net/dsa/mv88e6131.c
+++ b/net/dsa/mv88e6131.c
@@ -207,8 +207,15 @@ static int mv88e6131_setup_port(struct dsa_switch *ds, int p)
 	 * mode, but do not enable forwarding of unknown unicasts.
 	 */
 	val = 0x0433;
-	if (p == dsa_upstream_port(ds))
+	if (p == dsa_upstream_port(ds)) {
 		val |= 0x0104;
+		/*
+		 * On 6085, unknown multicast forward is controlled
+		 * here rather than in Port Control 2 register.
+		 */
+		if (ps->id == ID_6085)
+			val |= 0x0008;
+	}
 	if (ds->dsa_port_mask & (1 << p))
 		val |= 0x0100;
 	REG_WRITE(addr, 0x04, val);
@@ -251,10 +258,19 @@ static int mv88e6131_setup_port(struct dsa_switch *ds, int p)
 	 * If this is the upstream port for this switch, enable
 	 * forwarding of unknown multicast addresses.
 	 */
-	val = 0x0080 | dsa_upstream_port(ds);
-	if (p == dsa_upstream_port(ds))
-		val |= 0x0040;
-	REG_WRITE(addr, 0x08, val);
+	if (ps->id == ID_6085)
+		/*
+		 * on 6085, bits 3:0 are reserved, bit 6 control ARP
+		 * mirroring, and multicast forward is handled in
+		 * Port Control register.
+		 */
+		REG_WRITE(addr, 0x08, 0x0080);
+	else {
+		val = 0x0080 | dsa_upstream_port(ds);
+		if (p == dsa_upstream_port(ds))
+			val |= 0x0040;
+		REG_WRITE(addr, 0x08, val);
+	}
 
 	/*
 	 * Rate Control: disable ingress rate limiting.
-- 
1.7.2.3


^ permalink raw reply related

* Re: [PATCH 02/13] mm: sl[au]b: Add knowledge of PFMEMALLOC reserve pages
From: NeilBrown @ 2011-04-26 12:05 UTC (permalink / raw)
  To: Mel Gorman; +Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Peter Zijlstra
In-Reply-To: <1303817628.5593.6.camel@machina.109elm.lan>

On Tue, 26 Apr 2011 12:33:48 +0100 Mel Gorman <mgorman@suse.de> wrote:

> On Tue, 2011-04-26 at 21:15 +1000, NeilBrown wrote:
> > On Tue, 26 Apr 2011 08:36:43 +0100 Mel Gorman <mgorman@suse.de> wrote:
> > 
> > > +bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
> > > +{
> > > +	return gfp_to_alloc_flags(gfp_mask) & ALLOC_PFMEMALLOC;
> > > +}
> > > +
> > >  static inline struct page *
> > >  __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
> > >  	struct zonelist *zonelist, enum zone_type high_zoneidx,
> > > @@ -2202,8 +2211,16 @@ nopage:
> > >  got_pg:
> > >  	if (kmemcheck_enabled)
> > >  		kmemcheck_pagealloc_alloc(page, order, gfp_mask);
> > > -	return page;
> > >  
> > > +	/*
> > > +	 * page->pfmemalloc is set when the caller had PFMEMALLOC set or is
> > > +	 * been OOM killed. The expectation is that the caller is taking
> > > +	 * steps that will free more memory. The caller should avoid the
> > > +	 * page being used for !PFMEMALLOC purposes.
> > > +	 */
> > > +	page->pfmemalloc = (alloc_flags & ALLOC_PFMEMALLOC);
> > > +
> > > +	return page;
> > 
> > Linus doesn't seem to be a fan of this construct:
> >    https://lkml.org/lkml/2011/4/1/255
> > 
> 
> There is confusion around this topic. Andrew prefers bool for true/false
> values and it's self-documenting. I tend to prefer it myself for
> readability and there is a slow conversion in the VM from
> ints-used-as-bools to bools and my understanding of bool is that any
> non-zero value will be treated as true (just as it is for int).
> 
> > pfmemalloc is a bool, and the value on the right is either 0 or 0x1000.
> > 
> > If bool happens to be typedefed to 'char' or even 'short', pfmemalloc would
> > always be set to 0.
> 
> It is typedeffed as _Bool though which I thought was able to handle the
> cast appropriately or is that wrong?

Yes, I too believe that _Bool does the right thing, so this particular usage
does happen to be safe in the kernel.  But there is a long tradition of
'bool' being typedefed to an int type so the usage can look wrong.  So maybe
it is best avoided.

I have no strong feeling either way - I just thought I would highlight it.
In general, I like bool, but I don't like automatic casts (I often use
  if (pointer != NULL) ...
because I think it reads better).

Thanks,
NeilBrown



> 
> > Ditto for the gfp_pfmemalloc_allowed function.
> > 
> > Prefixing with '!!' would make it safe.
> > 
> 
> Will do that to avoid any oddities. Thanks
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH 09/13] netvm: Set PF_MEMALLOC as appropriate during SKB processing
From: NeilBrown @ 2011-04-26 12:21 UTC (permalink / raw)
  To: Mel Gorman; +Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Peter Zijlstra
In-Reply-To: <1303803414-5937-10-git-send-email-mgorman@suse.de>

On Tue, 26 Apr 2011 08:36:50 +0100 Mel Gorman <mgorman@suse.de> wrote:

> diff --git a/net/core/dev.c b/net/core/dev.c
> index 3871bf6..2d79a20 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3095,6 +3095,27 @@ static void vlan_on_bond_hook(struct sk_buff *skb)
>  	}
>  }
>  
> +/*
> + * Limit which protocols can use the PFMEMALLOC reserves to those that are
> + * expected to be used for communication with swap.
> + */
> +static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
> +{
> +	if (skb_pfmemalloc(skb))
> +		switch (skb->protocol) {
> +		case __constant_htons(ETH_P_ARP):
> +		case __constant_htons(ETH_P_IP):
> +		case __constant_htons(ETH_P_IPV6):
> +		case __constant_htons(ETH_P_8021Q):
> +			break;
> +
> +		default:
> +			return false;
> +		}
> +
> +	return true;
> +}

This sort of thing really bugs me :-)
Neither the comment nor the function name actually describe what the function
is doing.  The function is checking *2* things.
   is_pfmemalloc_skb_or_pfmemalloc_protocol()
might be a more correct name, but is too verbose.

I would prefer the skb_pfmemalloc test were removed from here and ....

> +	if (!skb_pfmemalloc_protocol(skb))
> +		goto drop;
> +

...added here so this becomes:

      if (!skb_pfmemalloc(skb) && !skb_pfmemalloc_protocol(skb))
                goto drop;

which actually makes sense.

Thanks,
NeilBrown

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH 12/13] mm: Throttle direct reclaimers if PF_MEMALLOC reserves are low and swap is backed by network storage
From: NeilBrown @ 2011-04-26 12:30 UTC (permalink / raw)
  To: Mel Gorman; +Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Peter Zijlstra
In-Reply-To: <1303803414-5937-13-git-send-email-mgorman@suse.de>

On Tue, 26 Apr 2011 08:36:53 +0100 Mel Gorman <mgorman@suse.de> wrote:


> +/*
> + * Throttle direct reclaimers if backing storage is backed by the network
> + * and the PFMEMALLOC reserve for the preferred node is getting dangerously
> + * depleted. kswapd will continue to make progress and wake the processes
> + * when the low watermark is reached
> + */
> +static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
> +					nodemask_t *nodemask)
> +{
> +	struct zone *zone;
> +	int high_zoneidx = gfp_zone(gfp_mask);
> +	DEFINE_WAIT(wait);
> +
> +	/* Check if the pfmemalloc reserves are ok */
> +	first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
> +	prepare_to_wait(&zone->zone_pgdat->pfmemalloc_wait, &wait,
> +							TASK_INTERRUPTIBLE);
> +	if (pfmemalloc_watermark_ok(zone->zone_pgdat, high_zoneidx))
> +		goto out;
> +
> +	/* Throttle */
> +	do {
> +		schedule();
> +		finish_wait(&zone->zone_pgdat->pfmemalloc_wait, &wait);
> +		prepare_to_wait(&zone->zone_pgdat->pfmemalloc_wait, &wait,
> +							TASK_INTERRUPTIBLE);
> +	} while (!pfmemalloc_watermark_ok(zone->zone_pgdat, high_zoneidx) &&
> +			!fatal_signal_pending(current));
> +
> +out:
> +	finish_wait(&zone->zone_pgdat->pfmemalloc_wait, &wait);
> +}

You are doing an interruptible wait, but only checking for fatal signals.
So if a non-fatal signal arrives, you will busy-wait.

So I suspect you want TASK_KILLABLE, so just use:

    wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
                        pgmemalloc_watermark_ok(zone->zone_pgdata,
                                                high_zoneidx));

(You also have an extraneous call to finish_wait)

NeilBrown


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH 13/13] mm: Account for the number of times direct reclaimers get throttled
From: NeilBrown @ 2011-04-26 12:35 UTC (permalink / raw)
  To: Mel Gorman; +Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Peter Zijlstra
In-Reply-To: <1303803414-5937-14-git-send-email-mgorman@suse.de>

On Tue, 26 Apr 2011 08:36:54 +0100 Mel Gorman <mgorman@suse.de> wrote:

> Under significant pressure when writing back to network-backed storage,
> direct reclaimers may get throttled. This is expected to be a
> short-lived event and the processes get woken up again but processes do
> get stalled. This patch counts how many times such stalling occurs. It's
> up to the administrator whether to reduce these stalls by increasing
> min_free_kbytes.
> 
> Signed-off-by: Mel Gorman <mgorman@suse.de>
> ---
>  include/linux/vm_event_item.h |    1 +
>  mm/vmscan.c                   |    1 +
>  mm/vmstat.c                   |    1 +
>  3 files changed, 3 insertions(+), 0 deletions(-)
> 
> diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
> index 03b90cdc..652e5f3 100644
> --- a/include/linux/vm_event_item.h
> +++ b/include/linux/vm_event_item.h
> @@ -29,6 +29,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
>  		FOR_ALL_ZONES(PGSTEAL),
>  		FOR_ALL_ZONES(PGSCAN_KSWAPD),
>  		FOR_ALL_ZONES(PGSCAN_DIRECT),
> +		PGSCAN_DIRECT_THROTTLE,
>  #ifdef CONFIG_NUMA
>  		PGSCAN_ZONE_RECLAIM_FAILED,
>  #endif
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 8b6da2b..e88138b 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2154,6 +2154,7 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
>  		goto out;
>  
>  	/* Throttle */
> +	count_vm_event(PGSCAN_DIRECT_THROTTLE);
>  	do {
>  		schedule();
>  		finish_wait(&zone->zone_pgdat->pfmemalloc_wait, &wait);
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index a2b7344..5725387 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -911,6 +911,7 @@ const char * const vmstat_text[] = {
>  	TEXTS_FOR_ZONES("pgsteal")
>  	TEXTS_FOR_ZONES("pgscan_kswapd")
>  	TEXTS_FOR_ZONES("pgscan_direct")
> +	"pgscan_direct_throttle",
>  
>  #ifdef CONFIG_NUMA
>  	"zone_reclaim_failed",

I like this approach.  Make the information available, but don't make a fuss
about it.

Actually, I like the whole series - I'm really having to dig deep to find
anything to complain about :-)

Feel free to put
   Reviewed-by: NeilBrown <neilb@suse.de>
against anything that I haven't commented on.

Thanks,
NeilBrown

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: how to set vlan filter for intel 82599
From: Ben Hutchings @ 2011-04-26 13:31 UTC (permalink / raw)
  To: zhou rui; +Cc: netdev, e1000-devel
In-Reply-To: <BANLkTi=eADSYV90EC9uJhtLZfBBJa5O1Bw@mail.gmail.com>

On Tue, 2011-04-26 at 12:39 +0800, zhou rui wrote:
[...]
> i set the filter like below:
> 
> for a vlanid=50, it always match the last rule (action 7)
> 
> ./ethtool -K eth5 ntuple off
> ./ethtool -K eth5 ntuple on
> ./ethtool -U eth5 flow-type tcp4 vlan 32 vlan-mask 0xF01F action 1
> ./ethtool -U eth5 flow-type udp4 vlan 32 vlan-mask 0xF01F action 1
> ./ethtool -U eth5 flow-type udp4 vlan 64 vlan-mask 0xF01F action 7
> ./ethtool -U eth5 flow-type tcp4 vlan 64 vlan-mask 0xF01F action 7
> 
> I tried the latest ixgbe driver 3.3.9, it reports:
> 
> Cannot add new RX n-tuple filter: Operation not permitted
> 
> ./ethtool -V
> ethtool version 2.6.36

Check dmesg; there should be an error message there.  Of course the
error code should be EINVAL and not EPERM.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: [PATCH 02/13] mm: sl[au]b: Add knowledge of PFMEMALLOC reserve pages
From: Mel Gorman @ 2011-04-26 13:59 UTC (permalink / raw)
  To: NeilBrown; +Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Peter Zijlstra
In-Reply-To: <20110426213758.450f6f49@notabene.brown>

On Tue, Apr 26, 2011 at 09:37:58PM +1000, NeilBrown wrote:
> On Tue, 26 Apr 2011 08:36:43 +0100 Mel Gorman <mgorman@suse.de> wrote:
> 
> > +		/*
> > +		 * If there are full empty slabs and we were not forced to
> > +		 * allocate a slab, mark this one !pfmemalloc
> > +		 */
> > +		l3 = cachep->nodelists[numa_mem_id()];
> > +		if (!list_empty(&l3->slabs_free) && force_refill) {
> > +			struct slab *slabp = virt_to_slab(objp);
> > +			slabp->pfmemalloc = false;
> > +			clear_obj_pfmemalloc(&objp);
> > +			check_ac_pfmemalloc(cachep, ac);
> > +			return objp;
> > +		}
> 
> The comment doesn't match the code.  I think you need to remove the words
> "full" and "not" assuming the code is correct which it probably is...
> 

I'll fix up the comment, you're right, it's confusing.

> But the code seems to be much more complex than Peter's original, and I don't
> see the gain.
> 

You're right, it is more complex.

> Peter's code had only one 'reserved' flag for each kmem_cache. 

The reserve was set in a per-cpu structure so there was a "lag" time
before that information was available to other CPUs. Fine on smaller
machines but a bit more of a problem today. 

> You seem to
> have one for every slab.  I don't see the point.
> It is true that yours is in some sense more fair - but I'm not sure the
> complexity is worth it.
> 

More fairness was one of the objects.

> Was there some particular reason you made the change?
> 

This version survives under considerably more stress than Peter's
original version did without requiring the additional complexity of
memory reserves.

-- 
Mel Gorman
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH 03/13] mm: Introduce __GFP_MEMALLOC to allow access to emergency reserves
From: Mel Gorman @ 2011-04-26 14:00 UTC (permalink / raw)
  To: NeilBrown; +Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Peter Zijlstra
In-Reply-To: <20110426205330.539a2766@notabene.brown>

On Tue, Apr 26, 2011 at 08:53:30PM +1000, NeilBrown wrote:
> On Tue, 26 Apr 2011 11:36:46 +0100 Mel Gorman <mgorman@suse.de> wrote:
> 
> > On Tue, Apr 26, 2011 at 07:49:47PM +1000, NeilBrown wrote:
> 
> > > Maybe a
> > >    WARN_ON((gfp_mask & __GFP_MEMALLOC) && (gfp_mask & __GFP_NOMEMALLOC));
> > > might be wise?
> > > 
> > 
> > Both MEMALLOC and NOMEMALLOC are related to PFMEMALLOC reserves so
> > it's reasonable for them to have similar names. This warning will
> > also trigger because it's a combination of flags that does happen.
> > 
> > Consider for example
> > 
> > any interrupt
> >   -> __netdev_alloc_skb		(mask == GFP_ATOMIC)
> >     -> __alloc_skb		(mask == GFP_ATOMIC)
> >        if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
> >                gfp_mask |= __GFP_MEMALLOC;
> > 				(mask == GFP_ATOMIC|__GFP_NOMEMALLOC)
> >       -> __kmalloc_reserve
> > 		First attempt tries to avoid reserves so adds __GFP_MEMALLOC
> > 				(mask == GFP_ATOMIC|__GFP_NOMEMALLOC|__GFP_MEMALLOC)
> > 
> 
> You have the "NO"s mixed up a bit which confused me for a while :-)
> But I see your point - I guess the WARN_ON isn't really needed.
> 

Bah, that was unhelpful on my part. I'm glad you saw the point anyway.
Sorry about that.

-- 
Mel Gorman
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH 09/13] netvm: Set PF_MEMALLOC as appropriate during SKB processing
From: Mel Gorman @ 2011-04-26 14:10 UTC (permalink / raw)
  To: NeilBrown; +Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Peter Zijlstra
In-Reply-To: <20110426222157.33a461f8@notabene.brown>

On Tue, Apr 26, 2011 at 10:21:57PM +1000, NeilBrown wrote:
> On Tue, 26 Apr 2011 08:36:50 +0100 Mel Gorman <mgorman@suse.de> wrote:
> 
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index 3871bf6..2d79a20 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -3095,6 +3095,27 @@ static void vlan_on_bond_hook(struct sk_buff *skb)
> >  	}
> >  }
> >  
> > +/*
> > + * Limit which protocols can use the PFMEMALLOC reserves to those that are
> > + * expected to be used for communication with swap.
> > + */
> > +static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
> > +{
> > +	if (skb_pfmemalloc(skb))
> > +		switch (skb->protocol) {
> > +		case __constant_htons(ETH_P_ARP):
> > +		case __constant_htons(ETH_P_IP):
> > +		case __constant_htons(ETH_P_IPV6):
> > +		case __constant_htons(ETH_P_8021Q):
> > +			break;
> > +
> > +		default:
> > +			return false;
> > +		}
> > +
> > +	return true;
> > +}
> 
> This sort of thing really bugs me :-)
> Neither the comment nor the function name actually describe what the function
> is doing.  The function is checking *2* things.
>    is_pfmemalloc_skb_or_pfmemalloc_protocol()
> might be a more correct name, but is too verbose.
> 
> I would prefer the skb_pfmemalloc test were removed from here and ....
> 
> > +	if (!skb_pfmemalloc_protocol(skb))
> > +		goto drop;
> > +
> 
> ...added here so this becomes:
> 
>       if (!skb_pfmemalloc(skb) && !skb_pfmemalloc_protocol(skb))
>                 goto drop;
> 
> which actually makes sense.
> 

Moving the check is neater but that check should be

	if (skb_pfmemalloc(skb) && !skb_pfmemalloc_protocol(skb))

? It's only if the skb was allocated from emergency reserves that we
need to consider dropping it to make way for other packets to be
received.

-- 
Mel Gorman
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH 2/3] bql: Byte queue limits
From: Tom Herbert @ 2011-04-26 14:13 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: davem, netdev
In-Reply-To: <1303796313.2747.190.camel@edumazet-laptop>

> Wow... magical values and very limited advices how to tune them.
>
The intent is that the algorithm is sufficiently adaptive so that no
tuning should be required.

> Tom, this reminds me you were supposed to provide Documentation/files to
> describe RPS, RFS, XPS ...
>
We'll look into that.

> We receive many questions about these features...
>
>> Signed-off-by: Tom Herbert <therbert@google.com>
>> ---
>>  include/linux/netdevice.h |   46 +++++++++++++++-
>>  net/core/net-sysfs.c      |  137 +++++++++++++++++++++++++++++++++++++++++++--
>>  2 files changed, 177 insertions(+), 6 deletions(-)
>>
>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> index cb8178a..0a76b88 100644
>> --- a/include/linux/netdevice.h
>> +++ b/include/linux/netdevice.h
>> @@ -44,6 +44,7 @@
>>  #include <linux/rculist.h>
>>  #include <linux/dmaengine.h>
>>  #include <linux/workqueue.h>
>> +#include <linux/dynamic_queue_limits.h>
>>
>>  #include <linux/ethtool.h>
>>  #include <net/net_namespace.h>
>> @@ -556,8 +557,10 @@ struct netdev_queue {
>>       struct Qdisc            *qdisc;
>>       unsigned long           state;
>>       struct Qdisc            *qdisc_sleeping;
>> -#ifdef CONFIG_RPS
>> +#ifdef CONFIG_XPS
>>       struct kobject          kobj;
>> +     bool                    do_bql;
>> +     struct dql              dql;
>>  #endif
>
> I have no idea why you use CONFIG_XPS for BQL (how BQL is it related to

I'll add CONFIG_BQL

> SMP ???), and why kobj is now guarded by CONFIG_XPS instead of
> CONFIG_RPS.
>
Because that kobj is for transmit side (currently only for xps_cpus)

>
>
>

^ permalink raw reply

* Re: [PATCH 00/13] Swap-over-NBD without deadlocking
From: Peter Zijlstra @ 2011-04-26 14:23 UTC (permalink / raw)
  To: Mel Gorman; +Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Neil Brown
In-Reply-To: <1303803414-5937-1-git-send-email-mgorman@suse.de>

On Tue, 2011-04-26 at 08:36 +0100, Mel Gorman wrote:
> Comments?

Last time I brought up the whole swap over network bits I was pointed
towards the generic skb recycling work:

  http://lwn.net/Articles/332037/

as a means to pre-allocate memory, and it was suggested to simply pin
the few route-cache entries required to route these packets and
dis-allow swap packets to be fragmented (these last two avoid lots of
funny allocation cases in the network stack).


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox