[Patch 0/6] delay accounting & taskstats fixes

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [Patch 0/6] delay accounting & taskstats fixes
@ 2006-07-11  4:23 Shailabh Nagar
  2006-07-11  4:27 ` [Patch 1/6] per task delay accounting taskstats interface: code cleanup Shailabh Nagar
                   ` (7 more replies)
  0 siblings, 8 replies; 16+ messages in thread
From: Shailabh Nagar @ 2006-07-11  4:23 UTC (permalink / raw)
  To: Andrew Morton, linux-kernel
  Cc: Jay Lan, Chris Sturtivant, Paul Jackson, Balbir Singh,
	Chandra Seetharaman

Andrew,

Chandra, Balbir & I have been putting taskstats and delay accounting
patches through some extensive testing on multiple platforms.

Following are a set of patches that fix some bugs found as well as
some cleanups of the code. Some results showing the cpumask feature 
works as expected will follow separately.

--Shailabh


Patches against 2.6.18-rc1, apply over the per-task delay accounting
patches already in 2.6.18-rc1-mm1

Series

per-task-delay-accounting-taskstats-interface-exit-data-through-cpumasks-fix2.patch
per-task-delay-accounting-documentation-fix.patch
per-task-delay-accounting-taskstats-fix-early-sem-init.patch
per-task-delay-accounting-taskstats-fix-drop-listener-only-on-socket-close.patch
list_islast.patch
per-task-delay-accounting-taskstats-fix-clone-skbs-for-each-listener.patch



^ permalink raw reply	[flat|nested] 16+ messages in thread

* [Patch 1/6] per task delay accounting taskstats interface: code cleanup
  2006-07-11  4:23 [Patch 0/6] delay accounting & taskstats fixes Shailabh Nagar
@ 2006-07-11  4:27 ` Shailabh Nagar
  2006-07-11  4:29 ` [Patch 2/6] per task delay accounting taskstats interface: documentation fix Shailabh Nagar
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 16+ messages in thread
From: Shailabh Nagar @ 2006-07-11  4:27 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, Jay Lan, Chris Sturtivant, Paul Jackson,
	Balbir Singh, Chandra Seetharaman

Cleanup cpumask related code and rearrange functions for clarity

Signed-Off-By: Shailabh Nagar <nagar@watson.ibm.com>

 include/linux/taskstats_kern.h |    5 ---
 kernel/taskstats.c             |   53 ++++++++++++++++++++++-------------------
 2 files changed, 29 insertions(+), 29 deletions(-)

Index: linux-2.6.18-rc1/include/linux/taskstats_kern.h
===================================================================
--- linux-2.6.18-rc1.orig/include/linux/taskstats_kern.h	2006-07-10 23:44:14.000000000 -0400
+++ linux-2.6.18-rc1/include/linux/taskstats_kern.h	2006-07-10 23:44:16.000000000 -0400
@@ -11,11 +11,6 @@
 #include <linux/sched.h>
 #include <net/genetlink.h>
 
-enum {
-	TASKSTATS_MSG_UNICAST,		/* send data only to requester */
-	TASKSTATS_MSG_MULTICAST,	/* send data to a group */
-};
-
 #ifdef CONFIG_TASKSTATS
 extern kmem_cache_t *taskstats_cache;
 extern struct mutex taskstats_exit_mutex;
Index: linux-2.6.18-rc1/kernel/taskstats.c
===================================================================
--- linux-2.6.18-rc1.orig/kernel/taskstats.c	2006-07-10 23:44:14.000000000 -0400
+++ linux-2.6.18-rc1/kernel/taskstats.c	2006-07-10 23:44:16.000000000 -0400
@@ -99,35 +99,45 @@ static int prepare_reply(struct genl_inf
 	return 0;
 }
 
-static int send_reply(struct sk_buff *skb, pid_t pid, int event, unsigned int cpu)
+/*
+ * Send taskstats data in @skb to listener with nl_pid @pid
+ */
+static int send_reply(struct sk_buff *skb, pid_t pid)
 {
 	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
-	struct listener_list *listeners;
-	struct list_head *p, *tmp;
-	void *reply;
+	void *reply = genlmsg_data(genlhdr);
 	int rc;
 
-	reply = genlmsg_data(genlhdr);
-
 	rc = genlmsg_end(skb, reply);
 	if (rc < 0) {
 		nlmsg_free(skb);
 		return rc;
 	}
 
-	if (event == TASKSTATS_MSG_UNICAST)
-		return genlmsg_unicast(skb, pid);
+	return genlmsg_unicast(skb, pid);
+}
 
-	/*
-	 * Taskstats multicast is unicasts to listeners who have registered
-	 * interest in cpu
-	 */
+/*
+ * Send taskstats data in @skb to listeners registered for @cpu's exit data
+ */
+static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
+{
+	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
+	struct listener_list *listeners;
+	struct listener *s, *tmp;
+	void *reply = genlmsg_data(genlhdr);
+	int rc, ret;
+
+	rc = genlmsg_end(skb, reply);
+	if (rc < 0) {
+		nlmsg_free(skb);
+		return rc;
+	}
 
+	rc = 0;
 	listeners = &per_cpu(listener_array, cpu);
 	down_write(&listeners->sem);
-	list_for_each_safe(p, tmp, &listeners->list) {
-		int ret;
-		struct listener *s = list_entry(p, struct listener, list);
+	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
 		ret = genlmsg_unicast(skb, s->pid);
 		if (ret) {
 			list_del(&s->list);
@@ -253,11 +263,10 @@ ret:
 
 static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
 {
-	struct listener *s;
 	struct listener_list *listeners;
+	struct listener *s, *tmp;
 	unsigned int cpu;
 	cpumask_t mask = *maskp;
-	struct list_head *p;
 
 	if (!cpus_subset(mask, cpu_possible_map))
 		return -EINVAL;
@@ -282,12 +291,9 @@ static int add_del_listener(pid_t pid, c
 	/* Deregister or cleanup */
 cleanup:
 	for_each_cpu_mask(cpu, mask) {
-		struct list_head *tmp;
-
 		listeners = &per_cpu(listener_array, cpu);
 		down_write(&listeners->sem);
-		list_for_each_safe(p, tmp, &listeners->list) {
-			s = list_entry(p, struct listener, list);
+		list_for_each_entry_safe(s, tmp, &listeners->list, list) {
 			if (s->pid == pid) {
 				list_del(&s->list);
 				kfree(s);
@@ -376,8 +382,7 @@ static int taskstats_user_cmd(struct sk_
 
 	nla_nest_end(rep_skb, na);
 
-	return send_reply(rep_skb, info->snd_pid, TASKSTATS_MSG_UNICAST,
-			  CPU_DONT_CARE);
+	return send_reply(rep_skb, info->snd_pid);
 
 nla_put_failure:
 	return genlmsg_cancel(rep_skb, reply);
@@ -475,7 +480,7 @@ void taskstats_exit_send(struct task_str
 	nla_nest_end(rep_skb, na);
 
 send:
-	send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST, mycpu);
+	send_cpu_listeners(rep_skb, mycpu);
 	return;
 
 nla_put_failure:



^ permalink raw reply	[flat|nested] 16+ messages in thread

* [Patch 2/6] per task delay accounting taskstats interface: documentation fix
  2006-07-11  4:23 [Patch 0/6] delay accounting & taskstats fixes Shailabh Nagar
  2006-07-11  4:27 ` [Patch 1/6] per task delay accounting taskstats interface: code cleanup Shailabh Nagar
@ 2006-07-11  4:29 ` Shailabh Nagar
  2006-07-11  4:30 ` [Patch 3/6] per task delay accounting taskstats interface: fix early sem init Shailabh Nagar
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 16+ messages in thread
From: Shailabh Nagar @ 2006-07-11  4:29 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, Jay Lan, Chris Sturtivant, Paul Jackson,
	Balbir Singh, Chandra Seetharaman

Change documentation and example program to reflect the flow control
issues being addressed by the cpumask changes.

Signed-Off-By: Shailabh Nagar <nagar@watson.ibm.com>

 Documentation/accounting/getdelays.c   |  612 +++++++++++++++++----------------
 Documentation/accounting/taskstats.txt |   64 ++-
 2 files changed, 368 insertions(+), 308 deletions(-)

Index: linux-2.6.18-rc1/Documentation/accounting/taskstats.txt
===================================================================
--- linux-2.6.18-rc1.orig/Documentation/accounting/taskstats.txt	2006-07-10 23:44:10.000000000 -0400
+++ linux-2.6.18-rc1/Documentation/accounting/taskstats.txt	2006-07-10 23:44:18.000000000 -0400
@@ -26,20 +26,28 @@ leader - a process is deemed alive as lo
 Usage
 -----
 
-To get statistics during task's lifetime, userspace opens a unicast netlink
+To get statistics during a task's lifetime, userspace opens a unicast netlink
 socket (NETLINK_GENERIC family) and sends commands specifying a pid or a tgid.
 The response contains statistics for a task (if pid is specified) or the sum of
 statistics for all tasks of the process (if tgid is specified).
 
-To obtain statistics for tasks which are exiting, userspace opens a multicast
-netlink socket. Each time a task exits, its per-pid statistics is always sent
-by the kernel to each listener on the multicast socket. In addition, if it is
-the last thread exiting its thread group, an additional record containing the
-per-tgid stats are also sent. The latter contains the sum of per-pid stats for
-all threads in the thread group, both past and present.
+To obtain statistics for tasks which are exiting, the userspace listener
+sends a register command and specifies a cpumask. Whenever a task exits on
+one of the cpus in the cpumask, its per-pid statistics are sent to the
+registered listener. Using cpumasks allows the data received by one listener
+to be limited and assists in flow control over the netlink interface and is
+explained in more detail below.
+
+If the exiting task is the last thread exiting its thread group,
+an additional record containing the per-tgid stats is also sent to userspace.
+The latter contains the sum of per-pid stats for all threads in the thread
+group, both past and present.
 
 getdelays.c is a simple utility demonstrating usage of the taskstats interface
-for reporting delay accounting statistics.
+for reporting delay accounting statistics. Users can register cpumasks,
+send commands and process responses, listen for per-tid/tgid exit data,
+write the data received to a file and do basic flow control by increasing
+receive buffer sizes.
 
 Interface
 ---------
@@ -66,10 +74,20 @@ The messages are in the format
 
 The taskstats payload is one of the following three kinds:
 
-1. Commands: Sent from user to kernel. The payload is one attribute, of type
-TASKSTATS_CMD_ATTR_PID/TGID, containing a u32 pid or tgid in the attribute
-payload. The pid/tgid denotes the task/process for which userspace wants
-statistics.
+1. Commands: Sent from user to kernel. Commands to get data on
+a pid/tgid consist of one attribute, of type TASKSTATS_CMD_ATTR_PID/TGID,
+containing a u32 pid or tgid in the attribute payload. The pid/tgid denotes
+the task/process for which userspace wants statistics.
+
+Commands to register/deregister interest in exit data from a set of cpus
+consist of one attribute, of type
+TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK and contain a cpumask in the
+attribute payload. The cpumask is specified as an ascii string of
+comma-separated cpu ranges e.g. to listen to exit data from cpus 1,2,3,5,7,8
+the cpumask would be "1-3,5,7-8". If userspace forgets to deregister interest
+in cpus before closing the listening socket, the kernel cleans up its interest
+set over time. However, for the sake of efficiency, an explicit deregistration
+is advisable.
 
 2. Response for a command: sent from the kernel in response to a userspace
 command. The payload is a series of three attributes of type:
@@ -138,4 +156,26 @@ struct too much, requiring disparate use
 unnecessarily receive large structures whose fields are of no interest, then
 extending the attributes structure would be worthwhile.
 
+Flow control for taskstats
+--------------------------
+
+When the rate of task exits becomes large, a listener may not be able to keep
+up with the kernel's rate of sending per-tid/tgid exit data leading to data
+loss. This possibility gets compounded when the taskstats structure gets
+extended and the number of cpus grows large.
+
+To avoid losing statistics, userspace should do one or more of the following:
+
+- increase the receive buffer sizes for the netlink sockets opened by
+listeners to receive exit data.
+
+- create more listeners and reduce the number of cpus being listened to by
+each listener. In the extreme case, there could be one listener for each cpu.
+Users may also consider setting the cpu affinity of the listener to the subset
+of cpus to which it listens, especially if they are listening to just one cpu.
+
+Despite these measures, if the userspace receives ENOBUFS error messages
+indicated overflow of receive buffers, it should take measures to handle the
+loss of data.
+
 ----
Index: linux-2.6.18-rc1/Documentation/accounting/getdelays.c
===================================================================
--- linux-2.6.18-rc1.orig/Documentation/accounting/getdelays.c	2006-07-10 23:44:10.000000000 -0400
+++ linux-2.6.18-rc1/Documentation/accounting/getdelays.c	2006-07-10 23:44:18.000000000 -0400
@@ -5,6 +5,7 @@
  *
  * Copyright (C) Shailabh Nagar, IBM Corp. 2005
  * Copyright (C) Balbir Singh, IBM Corp. 2006
+ * Copyright (c) Jay Lan, SGI. 2006
  *
  */
 
@@ -36,341 +37,360 @@
 
 #define err(code, fmt, arg...) do { printf(fmt, ##arg); exit(code); } while (0)
 int done = 0;
+int rcvbufsz=0;
+
+    char name[100];
+int dbg=0, print_delays=0;
+__u64 stime, utime;
+#define PRINTF(fmt, arg...) {			\
+	    if (dbg) {				\
+		printf(fmt, ##arg);		\
+	    }					\
+	}
+
+/* Maximum size of response requested or message sent */
+#define MAX_MSG_SIZE	256
+/* Maximum number of cpus expected to be specified in a cpumask */
+#define MAX_CPUS	32
+/* Maximum length of pathname to log file */
+#define MAX_FILENAME	256
+
+struct msgtemplate {
+	struct nlmsghdr n;
+	struct genlmsghdr g;
+	char buf[MAX_MSG_SIZE];
+};
+
+char cpumask[100+6*MAX_CPUS];
 
 /*
  * Create a raw netlink socket and bind
  */
-static int create_nl_socket(int protocol, int groups)
+static int create_nl_socket(int protocol)
 {
-    socklen_t addr_len;
-    int fd;
-    struct sockaddr_nl local;
+	int fd;
+	struct sockaddr_nl local;
 
-    fd = socket(AF_NETLINK, SOCK_RAW, protocol);
-    if (fd < 0)
-	return -1;
+	fd = socket(AF_NETLINK, SOCK_RAW, protocol);
+	if (fd < 0)
+		return -1;
+
+	if (rcvbufsz)
+		if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF,
+				&rcvbufsz, sizeof(rcvbufsz)) < 0) {
+			printf("Unable to set socket rcv buf size to %d\n",
+			       rcvbufsz);
+			return -1;
+		}
 
-    memset(&local, 0, sizeof(local));
-    local.nl_family = AF_NETLINK;
-    local.nl_groups = groups;
-
-    if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0)
-	goto error;
-
-    return fd;
-  error:
-    close(fd);
-    return -1;
-}
+	memset(&local, 0, sizeof(local));
+	local.nl_family = AF_NETLINK;
 
-int sendto_fd(int s, const char *buf, int bufLen)
-{
-    struct sockaddr_nl nladdr;
-    int r;
+	if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0)
+		goto error;
+
+	return fd;
+error:
+	close(fd);
+	return -1;
+}
 
-    memset(&nladdr, 0, sizeof(nladdr));
-    nladdr.nl_family = AF_NETLINK;
 
-    while ((r = sendto(s, buf, bufLen, 0, (struct sockaddr *) &nladdr,
-		       sizeof(nladdr))) < bufLen) {
-	if (r > 0) {
-	    buf += r;
-	    bufLen -= r;
-	} else if (errno != EAGAIN)
-	    return -1;
-    }
-    return 0;
+int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid,
+	     __u8 genl_cmd, __u16 nla_type,
+	     void *nla_data, int nla_len)
+{
+	struct nlattr *na;
+	struct sockaddr_nl nladdr;
+	int r, buflen;
+	char *buf;
+
+	struct msgtemplate msg;
+
+	msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
+	msg.n.nlmsg_type = nlmsg_type;
+	msg.n.nlmsg_flags = NLM_F_REQUEST;
+	msg.n.nlmsg_seq = 0;
+	msg.n.nlmsg_pid = nlmsg_pid;
+	msg.g.cmd = genl_cmd;
+	msg.g.version = 0x1;
+	na = (struct nlattr *) GENLMSG_DATA(&msg);
+	na->nla_type = nla_type;
+	na->nla_len = nla_len + 1 + NLA_HDRLEN;
+	memcpy(NLA_DATA(na), nla_data, nla_len);
+	msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
+
+	buf = (char *) &msg;
+	buflen = msg.n.nlmsg_len ;
+	memset(&nladdr, 0, sizeof(nladdr));
+	nladdr.nl_family = AF_NETLINK;
+	while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr,
+			   sizeof(nladdr))) < buflen) {
+		if (r > 0) {
+			buf += r;
+			buflen -= r;
+		} else if (errno != EAGAIN)
+			return -1;
+	}
+	return 0;
 }
 
+
 /*
  * Probe the controller in genetlink to find the family id
  * for the TASKSTATS family
  */
 int get_family_id(int sd)
 {
-    struct {
-	struct nlmsghdr n;
-	struct genlmsghdr g;
-	char buf[256];
-    } family_req;
-    struct {
-	struct nlmsghdr n;
-	struct genlmsghdr g;
-	char buf[256];
-    } ans;
-
-    int id;
-    struct nlattr *na;
-    int rep_len;
-
-    /* Get family name */
-    family_req.n.nlmsg_type = GENL_ID_CTRL;
-    family_req.n.nlmsg_flags = NLM_F_REQUEST;
-    family_req.n.nlmsg_seq = 0;
-    family_req.n.nlmsg_pid = getpid();
-    family_req.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
-    family_req.g.cmd = CTRL_CMD_GETFAMILY;
-    family_req.g.version = 0x1;
-    na = (struct nlattr *) GENLMSG_DATA(&family_req);
-    na->nla_type = CTRL_ATTR_FAMILY_NAME;
-    na->nla_len = strlen(TASKSTATS_GENL_NAME) + 1 + NLA_HDRLEN;
-    strcpy(NLA_DATA(na), TASKSTATS_GENL_NAME);
-    family_req.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
-
-    if (sendto_fd(sd, (char *) &family_req, family_req.n.nlmsg_len) < 0)
-	err(1, "error sending message via Netlink\n");
-
-    rep_len = recv(sd, &ans, sizeof(ans), 0);
-
-    if (rep_len < 0)
-	err(1, "error receiving reply message via Netlink\n");
-
-
-    /* Validate response message */
-    if (!NLMSG_OK((&ans.n), rep_len))
-	err(1, "invalid reply message received via Netlink\n");
-
-    if (ans.n.nlmsg_type == NLMSG_ERROR) {	/* error */
-	printf("error received NACK - leaving\n");
-	exit(1);
-    }
-
-
-    na = (struct nlattr *) GENLMSG_DATA(&ans);
-    na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
-    if (na->nla_type == CTRL_ATTR_FAMILY_ID) {
-	id = *(__u16 *) NLA_DATA(na);
-    }
-    return id;
-}
+	struct {
+		struct nlmsghdr n;
+		struct genlmsghdr g;
+		char buf[256];
+	} ans;
+
+	int id, rc;
+	struct nlattr *na;
+	int rep_len;
+
+	strcpy(name, TASKSTATS_GENL_NAME);
+	rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY,
+			CTRL_ATTR_FAMILY_NAME, (void *)name,
+			strlen(TASKSTATS_GENL_NAME)+1);
+
+	rep_len = recv(sd, &ans, sizeof(ans), 0);
+	if (ans.n.nlmsg_type == NLMSG_ERROR ||
+	    (rep_len < 0) || !NLMSG_OK((&ans.n), rep_len))
+		return 0;
 
-void print_taskstats(struct taskstats *t)
-{
-    printf("\n\nCPU   %15s%15s%15s%15s\n"
-	   "      %15llu%15llu%15llu%15llu\n"
-	   "IO    %15s%15s\n"
-	   "      %15llu%15llu\n"
-	   "MEM   %15s%15s\n"
-	   "      %15llu%15llu\n\n",
-	   "count", "real total", "virtual total", "delay total",
-	   t->cpu_count, t->cpu_run_real_total, t->cpu_run_virtual_total,
-	   t->cpu_delay_total,
-	   "count", "delay total",
-	   t->blkio_count, t->blkio_delay_total,
-	   "count", "delay total", t->swapin_count, t->swapin_delay_total);
+	na = (struct nlattr *) GENLMSG_DATA(&ans);
+	na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
+	if (na->nla_type == CTRL_ATTR_FAMILY_ID) {
+		id = *(__u16 *) NLA_DATA(na);
+	}
+	return id;
 }
 
-void sigchld(int sig)
+void print_delayacct(struct taskstats *t)
 {
-    done = 1;
+	printf("\n\nCPU   %15s%15s%15s%15s\n"
+	       "      %15llu%15llu%15llu%15llu\n"
+	       "IO    %15s%15s\n"
+	       "      %15llu%15llu\n"
+	       "MEM   %15s%15s\n"
+	       "      %15llu%15llu\n\n",
+	       "count", "real total", "virtual total", "delay total",
+	       t->cpu_count, t->cpu_run_real_total, t->cpu_run_virtual_total,
+	       t->cpu_delay_total,
+	       "count", "delay total",
+	       t->blkio_count, t->blkio_delay_total,
+	       "count", "delay total", t->swapin_count, t->swapin_delay_total);
 }
 
 int main(int argc, char *argv[])
 {
-    int rc;
-    int sk_nl;
-    struct nlmsghdr *nlh;
-    struct genlmsghdr *genlhdr;
-    char *buf;
-    struct taskstats_cmd_param *param;
-    __u16 id;
-    struct nlattr *na;
-
-    /* For receiving */
-    struct sockaddr_nl kern_nla, from_nla;
-    socklen_t from_nla_len;
-    int recv_len;
-    struct taskstats_reply *reply;
-
-    struct {
-	struct nlmsghdr n;
-	struct genlmsghdr g;
-	char buf[256];
-    } req;
-
-    struct {
-	struct nlmsghdr n;
-	struct genlmsghdr g;
-	char buf[256];
-    } ans;
+	int c, rc, rep_len, aggr_len, len2, cmd_type;
+	__u16 id;
+	__u32 mypid;
+
+	struct nlattr *na;
+	int nl_sd = -1;
+	int len = 0;
+	pid_t tid = 0;
+	pid_t rtid = 0;
+
+	int fd = 0;
+	int count = 0;
+	int write_file = 0;
+	int maskset = 0;
+	char logfile[128];
+	int loop = 0;
+
+	struct msgtemplate msg;
+
+	while (1) {
+		c = getopt(argc, argv, "dw:r:m:t:p:v:l");
+		if (c < 0)
+			break;
 
-    int nl_sd = -1;
-    int rep_len;
-    int len = 0;
-    int aggr_len, len2;
-    struct sockaddr_nl nladdr;
-    pid_t tid = 0;
-    pid_t rtid = 0;
-    int cmd_type = TASKSTATS_TYPE_TGID;
-    int c, status;
-    int forking = 0;
-    struct sigaction act = {
-	.sa_handler = SIG_IGN,
-	.sa_mask = SA_NOMASK,
-    };
-    struct sigaction tact ;
-
-    if (argc < 3) {
-	printf("usage %s [-t tgid][-p pid][-c cmd]\n", argv[0]);
-	exit(-1);
-    }
-
-    tact.sa_handler = sigchld;
-    sigemptyset(&tact.sa_mask);
-    if (sigaction(SIGCHLD, &tact, NULL) < 0)
-	err(1, "sigaction failed for SIGCHLD\n");
-
-    while (1) {
-
-	c = getopt(argc, argv, "t:p:c:");
-	if (c < 0)
-	    break;
-
-	switch (c) {
-	case 't':
-	    tid = atoi(optarg);
-	    if (!tid)
-		err(1, "Invalid tgid\n");
-	    cmd_type = TASKSTATS_CMD_ATTR_TGID;
-	    break;
-	case 'p':
-	    tid = atoi(optarg);
-	    if (!tid)
-		err(1, "Invalid pid\n");
-	    cmd_type = TASKSTATS_CMD_ATTR_TGID;
-	    break;
-	case 'c':
-	    opterr = 0;
-	    tid = fork();
-	    if (tid < 0)
-		err(1, "fork failed\n");
-
-	    if (tid == 0) {	/* child process */
-		if (execvp(argv[optind - 1], &argv[optind - 1]) < 0) {
-		    exit(-1);
+		switch (c) {
+		case 'd':
+			printf("print delayacct stats ON\n");
+			print_delays = 1;
+			break;
+		case 'w':
+			strncpy(logfile, optarg, MAX_FILENAME);
+			printf("write to file %s\n", logfile);
+			write_file = 1;
+			break;
+		case 'r':
+			rcvbufsz = atoi(optarg);
+			printf("receive buf size %d\n", rcvbufsz);
+			if (rcvbufsz < 0)
+				err(1, "Invalid rcv buf size\n");
+			break;
+		case 'm':
+			strncpy(cpumask, optarg, sizeof(cpumask));
+			maskset = 1;
+			printf("cpumask %s maskset %d\n", cpumask, maskset);
+			break;
+		case 't':
+			tid = atoi(optarg);
+			if (!tid)
+				err(1, "Invalid tgid\n");
+			cmd_type = TASKSTATS_CMD_ATTR_TGID;
+			print_delays = 1;
+			break;
+		case 'p':
+			tid = atoi(optarg);
+			if (!tid)
+				err(1, "Invalid pid\n");
+			cmd_type = TASKSTATS_CMD_ATTR_PID;
+			print_delays = 1;
+			break;
+		case 'v':
+			printf("debug on\n");
+			dbg = 1;
+			break;
+		case 'l':
+			printf("listen forever\n");
+			loop = 1;
+			break;
+		default:
+			printf("Unknown option %d\n", c);
+			exit(-1);
 		}
-	    }
-	    forking = 1;
-	    break;
-	default:
-	    printf("usage %s [-t tgid][-p pid][-c cmd]\n", argv[0]);
-	    exit(-1);
-	    break;
 	}
-	if (c == 'c')
-	    break;
-    }
-
-    /* Construct Netlink request message */
-
-    /* Send Netlink request message & get reply */
-
-    if ((nl_sd =
-	 create_nl_socket(NETLINK_GENERIC, TASKSTATS_LISTEN_GROUP)) < 0)
-	err(1, "error creating Netlink socket\n");
-
-
-    id = get_family_id(nl_sd);
-
-    /* Send command needed */
-    req.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
-    req.n.nlmsg_type = id;
-    req.n.nlmsg_flags = NLM_F_REQUEST;
-    req.n.nlmsg_seq = 0;
-    req.n.nlmsg_pid = tid;
-    req.g.cmd = TASKSTATS_CMD_GET;
-    na = (struct nlattr *) GENLMSG_DATA(&req);
-    na->nla_type = cmd_type;
-    na->nla_len = sizeof(unsigned int) + NLA_HDRLEN;
-    *(__u32 *) NLA_DATA(na) = tid;
-    req.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
-
-
-    if (!forking && sendto_fd(nl_sd, (char *) &req, req.n.nlmsg_len) < 0)
-	err(1, "error sending message via Netlink\n");
-
-    act.sa_handler = SIG_IGN;
-    sigemptyset(&act.sa_mask);
-    if (sigaction(SIGINT, &act, NULL) < 0)
-	err(1, "sigaction failed for SIGINT\n");
-
-    do {
-	int i;
-	struct pollfd pfd;
-	int pollres;
-
-	pfd.events = 0xffff & ~POLLOUT;
-	pfd.fd = nl_sd;
-	pollres = poll(&pfd, 1, 5000);
-	if (pollres < 0 || done) {
-	    break;
+
+	if (write_file) {
+		fd = open(logfile, O_WRONLY | O_CREAT | O_TRUNC,
+			  S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+		if (fd == -1) {
+			perror("Cannot open output file\n");
+			exit(1);
+		}
 	}
 
-	rep_len = recv(nl_sd, &ans, sizeof(ans), 0);
-	nladdr.nl_family = AF_NETLINK;
-	nladdr.nl_groups = TASKSTATS_LISTEN_GROUP;
+	if ((nl_sd = create_nl_socket(NETLINK_GENERIC)) < 0)
+		err(1, "error creating Netlink socket\n");
 
-	if (ans.n.nlmsg_type == NLMSG_ERROR) {	/* error */
-	    printf("error received NACK - leaving\n");
-	    exit(1);
+
+	mypid = getpid();
+	id = get_family_id(nl_sd);
+	if (!id) {
+		printf("Error getting family id, errno %d", errno);
+		goto err;
 	}
+	PRINTF("family id %d\n", id);
 
-	if (rep_len < 0) {
-	    err(1, "error receiving reply message via Netlink\n");
-	    break;
+	if (maskset) {
+		rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET,
+			      TASKSTATS_CMD_ATTR_REGISTER_CPUMASK,
+			      &cpumask, sizeof(cpumask));
+		PRINTF("Sent register cpumask, retval %d\n", rc);
+		if (rc < 0) {
+			printf("error sending register cpumask\n");
+			goto err;
+		}
 	}
 
-	/* Validate response message */
-	if (!NLMSG_OK((&ans.n), rep_len))
-	    err(1, "invalid reply message received via Netlink\n");
+	if (tid) {
+		rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET,
+			      cmd_type, &tid, sizeof(__u32));
+		PRINTF("Sent pid/tgid, retval %d\n", rc);
+		if (rc < 0) {
+			printf("error sending tid/tgid cmd\n");
+			goto done;
+		}
+	}
 
-	rep_len = GENLMSG_PAYLOAD(&ans.n);
+	do {
+		int i;
 
-	na = (struct nlattr *) GENLMSG_DATA(&ans);
-	len = 0;
-	i = 0;
-	while (len < rep_len) {
-	    len += NLA_ALIGN(na->nla_len);
-	    switch (na->nla_type) {
-	    case TASKSTATS_TYPE_AGGR_PID:
-		/* Fall through */
-	    case TASKSTATS_TYPE_AGGR_TGID:
-		aggr_len = NLA_PAYLOAD(na->nla_len);
-		len2 = 0;
-		/* For nested attributes, na follows */
-		na = (struct nlattr *) NLA_DATA(na);
-		done = 0;
-		while (len2 < aggr_len) {
-		    switch (na->nla_type) {
-		    case TASKSTATS_TYPE_PID:
-			rtid = *(int *) NLA_DATA(na);
-			break;
-		    case TASKSTATS_TYPE_TGID:
-			rtid = *(int *) NLA_DATA(na);
-			break;
-		    case TASKSTATS_TYPE_STATS:
-			if (rtid == tid) {
-			    print_taskstats((struct taskstats *)
-					    NLA_DATA(na));
-			    done = 1;
+		rep_len = recv(nl_sd, &msg, sizeof(msg), 0);
+		PRINTF("received %d bytes\n", rep_len);
+
+		if (rep_len < 0) {
+			printf("nonfatal reply error: errno %d\n", errno);
+			continue;
+		}
+		if (msg.n.nlmsg_type == NLMSG_ERROR ||
+		    !NLMSG_OK((&msg.n), rep_len)) {
+			printf("fatal reply error,  errno %d\n", errno);
+			goto done;
+		}
+
+		PRINTF("nlmsghdr size=%d, nlmsg_len=%d, rep_len=%d\n",
+		       sizeof(struct nlmsghdr), msg.n.nlmsg_len, rep_len);
+
+
+		rep_len = GENLMSG_PAYLOAD(&msg.n);
+
+		na = (struct nlattr *) GENLMSG_DATA(&msg);
+		len = 0;
+		i = 0;
+		while (len < rep_len) {
+			len += NLA_ALIGN(na->nla_len);
+			switch (na->nla_type) {
+			case TASKSTATS_TYPE_AGGR_TGID:
+				/* Fall through */
+			case TASKSTATS_TYPE_AGGR_PID:
+				aggr_len = NLA_PAYLOAD(na->nla_len);
+				len2 = 0;
+				/* For nested attributes, na follows */
+				na = (struct nlattr *) NLA_DATA(na);
+				done = 0;
+				while (len2 < aggr_len) {
+					switch (na->nla_type) {
+					case TASKSTATS_TYPE_PID:
+						rtid = *(int *) NLA_DATA(na);
+						if (print_delays)
+							printf("PID\t%d\n", rtid);
+						break;
+					case TASKSTATS_TYPE_TGID:
+						rtid = *(int *) NLA_DATA(na);
+						if (print_delays)
+							printf("TGID\t%d\n", rtid);
+						break;
+					case TASKSTATS_TYPE_STATS:
+						count++;
+						if (print_delays)
+							print_delayacct((struct taskstats *) NLA_DATA(na));
+						if (fd) {
+							if (write(fd, NLA_DATA(na), na->nla_len) < 0) {
+								err(1,"write error\n");
+							}
+						}
+						if (!loop)
+							goto done;
+						break;
+					default:
+						printf("Unknown nested nla_type %d\n", na->nla_type);
+						break;
+					}
+					len2 += NLA_ALIGN(na->nla_len);
+					na = (struct nlattr *) ((char *) na + len2);
+				}
+				break;
+
+			default:
+				printf("Unknown nla_type %d\n", na->nla_type);
+				break;
 			}
-			break;
-		    }
-		    len2 += NLA_ALIGN(na->nla_len);
-		    na = (struct nlattr *) ((char *) na + len2);
-		    if (done)
-			break;
+			na = (struct nlattr *) (GENLMSG_DATA(&msg) + len);
 		}
-	    }
-	    na = (struct nlattr *) (GENLMSG_DATA(&ans) + len);
-	    if (done)
-		break;
+	} while (loop);
+done:
+	if (maskset) {
+		rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET,
+			      TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK,
+			      &cpumask, sizeof(cpumask));
+		printf("Sent deregister mask, retval %d\n", rc);
+		if (rc < 0)
+			err(rc, "error sending deregister cpumask\n");
 	}
-	if (done)
-	    break;
-    }
-    while (1);
-
-    close(nl_sd);
-    return 0;
+err:
+	close(nl_sd);
+	if (fd)
+		close(fd);
+	return 0;
 }



^ permalink raw reply	[flat|nested] 16+ messages in thread

* [Patch 3/6] per task delay accounting taskstats interface: fix early sem init
  2006-07-11  4:23 [Patch 0/6] delay accounting & taskstats fixes Shailabh Nagar
  2006-07-11  4:27 ` [Patch 1/6] per task delay accounting taskstats interface: code cleanup Shailabh Nagar
  2006-07-11  4:29 ` [Patch 2/6] per task delay accounting taskstats interface: documentation fix Shailabh Nagar
@ 2006-07-11  4:30 ` Shailabh Nagar
  2006-07-11  4:33 ` [Patch 4/6] per task delay accounting taskstats interface: fix drop listener only on socket close Shailabh Nagar
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 16+ messages in thread
From: Shailabh Nagar @ 2006-07-11  4:30 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, Jay Lan, Chris Sturtivant, Paul Jackson,
	Balbir Singh, Chandra Seetharaman

Shift initialization of semaphores taken on exit() path
to earlier in the bootup sequence. Without this fix,
booting on large cpu machines hangs at down_read() called 
on one of the per-cpu semaphores declared in taskstats.

Signed-Off-By: Shailabh Nagar <nagar@watson.ibm.com>
 kernel/taskstats.c |   12 ++++++------
 1 files changed, 6 insertions(+), 6 deletions(-)

Index: linux-2.6.18-rc1/kernel/taskstats.c
===================================================================
--- linux-2.6.18-rc1.orig/kernel/taskstats.c	2006-07-10 23:44:16.000000000 -0400
+++ linux-2.6.18-rc1/kernel/taskstats.c	2006-07-10 23:44:20.000000000 -0400
@@ -501,15 +501,20 @@ static struct genl_ops taskstats_ops = {
 /* Needed early in initialization */
 void __init taskstats_init_early(void)
 {
+	unsigned int i;
+
 	taskstats_cache = kmem_cache_create("taskstats_cache",
 						sizeof(struct taskstats),
 						0, SLAB_PANIC, NULL, NULL);
+	for_each_possible_cpu(i) {
+		INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
+		init_rwsem(&(per_cpu(listener_array, i).sem));
+	}
 }
 
 static int __init taskstats_init(void)
 {
 	int rc;
-	unsigned int i;
 
 	rc = genl_register_family(&family);
 	if (rc)
@@ -519,11 +524,6 @@ static int __init taskstats_init(void)
 	if (rc < 0)
 		goto err;
 
-	for_each_possible_cpu(i) {
-		INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
-		init_rwsem(&(per_cpu(listener_array, i).sem));
-	}
-
 	family_registered = 1;
 	return 0;
 err:



^ permalink raw reply	[flat|nested] 16+ messages in thread

* [Patch 4/6] per task delay accounting taskstats interface: fix drop listener only on socket close
  2006-07-11  4:23 [Patch 0/6] delay accounting & taskstats fixes Shailabh Nagar
                   ` (2 preceding siblings ...)
  2006-07-11  4:30 ` [Patch 3/6] per task delay accounting taskstats interface: fix early sem init Shailabh Nagar
@ 2006-07-11  4:33 ` Shailabh Nagar
  2006-07-11  4:34 ` [Patch 5/6] list_islast utility Shailabh Nagar
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 16+ messages in thread
From: Shailabh Nagar @ 2006-07-11  4:33 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, Jay Lan, Chris Sturtivant, Paul Jackson,
	Balbir Singh, Chandra Seetharaman

Remove listeners from per-cpu listener lists only if they've closed the
socket (resulting in an ECONNREFUSED failure of genetlink unicast). 
For other errors returned by genlmsg_unicast like -EAGAIN which results 
from the receiver's buffer having insufficient space, userspace gets an 
ENOBUF warning and the kernel can continue.

Signed-Off-By: Shailabh Nagar <nagar@watson.ibm.com>
Signed-Off-By: Balbir Singh <balbir@in.ibm.com>
Signed-Off-By: Chandra Seetharaman <sekharan@us.ibm.com>

 kernel/taskstats.c |    2 +-
 1 files changed, 1 insertion(+), 1 deletion(-)

Index: linux-2.6.18-rc1/kernel/taskstats.c
===================================================================
--- linux-2.6.18-rc1.orig/kernel/taskstats.c	2006-07-10 23:44:54.000000000 -0400
+++ linux-2.6.18-rc1/kernel/taskstats.c	2006-07-10 23:44:56.000000000 -0400
@@ -139,7 +139,7 @@ static int send_cpu_listeners(struct sk_
 	down_write(&listeners->sem);
 	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
 		ret = genlmsg_unicast(skb, s->pid);
-		if (ret) {
+		if (ret == -ECONNREFUSED) {
 			list_del(&s->list);
 			kfree(s);
 			rc = ret;



^ permalink raw reply	[flat|nested] 16+ messages in thread

* [Patch 5/6] list_islast utility
  2006-07-11  4:23 [Patch 0/6] delay accounting & taskstats fixes Shailabh Nagar
                   ` (3 preceding siblings ...)
  2006-07-11  4:33 ` [Patch 4/6] per task delay accounting taskstats interface: fix drop listener only on socket close Shailabh Nagar
@ 2006-07-11  4:34 ` Shailabh Nagar
  2006-07-11  4:36 ` [Patch 6/6] per task delay accounting taskstats interface: fix clone skbs for each listener Shailabh Nagar
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 16+ messages in thread
From: Shailabh Nagar @ 2006-07-11  4:34 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, Jay Lan, Chris Sturtivant, Paul Jackson,
	Balbir Singh, Chandra Seetharaman

Add another list utility function to check for last element in a list.

Signed-Off-By: Shailabh Nagar <nagar@watson.ibm.com>

 include/linux/list.h |   11 +++++++++++
 1 files changed, 11 insertions(+)

Index: linux-2.6.18-rc1/include/linux/list.h
===================================================================
--- linux-2.6.18-rc1.orig/include/linux/list.h	2006-07-10 23:04:28.000000000 -0400
+++ linux-2.6.18-rc1/include/linux/list.h	2006-07-10 23:44:59.000000000 -0400
@@ -265,6 +265,17 @@ static inline void list_move_tail(struct
 }
 
 /**
+ * list_islast - tests whether @list is the last entry in list @head
+ * @list: the entry to test
+ * @head: the head of the list
+ */
+static inline int list_islast(const struct list_head *list,
+				const struct list_head *head)
+{
+	return list->next == head;
+}
+
+/**
  * list_empty - tests whether a list is empty
  * @head: the list to test.
  */



^ permalink raw reply	[flat|nested] 16+ messages in thread

* [Patch 6/6] per task delay accounting taskstats interface: fix clone skbs for each listener
  2006-07-11  4:23 [Patch 0/6] delay accounting & taskstats fixes Shailabh Nagar
                   ` (4 preceding siblings ...)
  2006-07-11  4:34 ` [Patch 5/6] list_islast utility Shailabh Nagar
@ 2006-07-11  4:36 ` Shailabh Nagar
  2006-07-11 10:05   ` Andrew Morton
  2006-07-11  4:50 ` [Patch 0/6] delay accounting & taskstats fixes Shailabh Nagar
  2006-07-11 11:28 ` Christoph Hellwig
  7 siblings, 1 reply; 16+ messages in thread
From: Shailabh Nagar @ 2006-07-11  4:36 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, Jay Lan, Chris Sturtivant, Paul Jackson,
	Balbir Singh, Chandra Seetharaman, Jamal, netdev

Use a cloned sk_buff for each netlink message sent to multiple listeners.

Earlier, the same skb, representing a netlink message, was being erroneously 
reused for doing genetlink_unicast()'s (effectively netlink_unicast()) to 
each listener on the per-cpu list of listeners. Since netlink_unicast() frees
up the skb passed to it, regardless of status of the send, reuse is bad.

Thanks to Chandra Seetharaman for discovering this bug.

Signed-Off-By: Shailabh Nagar <nagar@watson.ibm.com>
Signed-Off-By: Chandra Seetharaman <sekharan@us.ibm.com>

 kernel/taskstats.c |   13 ++++++++++++-
 1 files changed, 12 insertions(+), 1 deletion(-)

Index: linux-2.6.18-rc1/kernel/taskstats.c
===================================================================
--- linux-2.6.18-rc1.orig/kernel/taskstats.c	2006-07-10 23:44:56.000000000 -0400
+++ linux-2.6.18-rc1/kernel/taskstats.c	2006-07-10 23:45:15.000000000 -0400
@@ -125,6 +125,7 @@ static int send_cpu_listeners(struct sk_
 	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
 	struct listener_list *listeners;
 	struct listener *s, *tmp;
+	struct sk_buff *skb_next, *skb_cur = skb;
 	void *reply = genlmsg_data(genlhdr);
 	int rc, ret;
 
@@ -138,12 +139,22 @@ static int send_cpu_listeners(struct sk_
 	listeners = &per_cpu(listener_array, cpu);
 	down_write(&listeners->sem);
 	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
-		ret = genlmsg_unicast(skb, s->pid);
+		skb_next = NULL;
+		if (!list_islast(&s->list, &listeners->list)) {
+			skb_next = skb_clone(skb_cur, GFP_KERNEL);
+			if (!skb_next) {
+				nlmsg_free(skb_cur);
+				rc = -ENOMEM;
+				break;
+			}
+		}
+		ret = genlmsg_unicast(skb_cur, s->pid);
 		if (ret == -ECONNREFUSED) {
 			list_del(&s->list);
 			kfree(s);
 			rc = ret;
 		}
+		skb_cur = skb_next;
 	}
 	up_write(&listeners->sem);
 



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch 0/6] delay accounting & taskstats fixes
  2006-07-11  4:23 [Patch 0/6] delay accounting & taskstats fixes Shailabh Nagar
                   ` (5 preceding siblings ...)
  2006-07-11  4:36 ` [Patch 6/6] per task delay accounting taskstats interface: fix clone skbs for each listener Shailabh Nagar
@ 2006-07-11  4:50 ` Shailabh Nagar
  2006-07-11 11:28 ` Christoph Hellwig
  7 siblings, 0 replies; 16+ messages in thread
From: Shailabh Nagar @ 2006-07-11  4:50 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, Jay Lan, Chris Sturtivant, Paul Jackson,
	Balbir Singh, Chandra Seetharaman

On Tue, 2006-07-11 at 00:23, Shailabh Nagar wrote:
> Some results showing the cpumask feature 
> works as expected will follow separately.

To illustrate the basic premise behind the cpumask changes
namely, reducing the number of cpus results in fewer overflow
conditions, Chandra ran the following experiment on an
8-way PIII box:

- "mkthreads 1000 1000" (the program which only forks/exits) with
minimum time spent inside the thread function. This is basically an exit
rate that is higher than the 1000/cpu/sec discussed earlier.

- A slightly modified version of the getdelays.c example that is 
included in patch 2/6 in this series of postings. Modification was
to print only the number of receives and number of ENOBUFS seen. 

The -m parameter specifies the cpumask, 
-r parameter is the size of the socket receive buffer 
-l results in listening for exit data


The following sets of runs are self-explanatory.
Initial run emulates the earlier behaviour where one listener
receives data for all cpus. As the number of cpus to which it
listens is reduced, the ENOBUFS reduce.

Similarly, increasing the receive buffer size also cuts down 
on the ENOBUFS, further helped by reducing number of cpus listened
to. The final data point, not shown here, is when the receive 
buffer size is set to some high number like 8K when the ENOBUFS aren't
seen at all.


===================================

Each output shows 
stats <number of msgs rcvd> enobufs <num of enobufs seen>
cumulatively

elm3b94:/tmp # ./getdelays -m 0-7 -r 512 -l
cpumask 0-7 maskset 1
receive buf size 512
listen forever
 stats 100000, enobufs 2540
 stats 200000, enobufs 5701
 stats 300000, enobufs 8482
 stats 400000, enobufs 11332
 stats 500000, enobufs 14256
 stats 600000, enobufs 17150
 stats 700000, enobufs 19981
 stats 800000, enobufs 22971
 stats 900000, enobufs 26324
 stats 1000000, enobufs 29291

elm3b94:/tmp # ./getdelays -m 0-3 -r 512 -l
cpumask 0-3 maskset 1
receive buf size 512
listen forever
 stats 100000, enobufs 831
 stats 200000, enobufs 1772
 stats 300000, enobufs 2728
 stats 400000, enobufs 3820
 stats 500000, enobufs 5030
 stats 600000, enobufs 6501
 stats 700000, enobufs 7885
 stats 800000, enobufs 8971
 stats 900000, enobufs 10262
 stats 1000000, enobufs 11786

elm3b94:/tmp # ./getdelays -m 2-3 -r 512 -l
cpumask 2-3 maskset 1
receive buf size 512
listen forever
 stats 100000, enobufs 106
 stats 200000, enobufs 206
 stats 300000, enobufs 313
 stats 400000, enobufs 389
 stats 500000, enobufs 477
 stats 600000, enobufs 676
 stats 700000, enobufs 860
 stats 800000, enobufs 1048
 stats 900000, enobufs 1189
 stats 1000000, enobufs 1303

elm3b94:/tmp # ./getdelays -m 0 -r 512 -l
cpumask 0 maskset 1
receive buf size 512
listen forever
 stats 100000, enobufs 100
 stats 200000, enobufs 162
 stats 300000, enobufs 302
 stats 400000, enobufs 399
 stats 500000, enobufs 507
 stats 600000, enobufs 571
 stats 700000, enobufs 666
 stats 800000, enobufs 725
 stats 900000, enobufs 821
 stats 1000000, enobufs 909

================= with recv buf size 2048

elm3b94:/tmp # ./getdelays -m 0-7 -r 2048 -l
cpumask 0-7 maskset 1
receive buf size 2048
listen forever
 stats 100000, enobufs 35
 stats 200000, enobufs 88
 stats 300000, enobufs 123
 stats 400000, enobufs 138
 stats 500000, enobufs 155
 stats 600000, enobufs 191
 stats 700000, enobufs 223
 stats 800000, enobufs 238
 stats 900000, enobufs 280
 stats 1000000, enobufs 319


elm3b94:/tmp # ./getdelays -m 4-7 -r 2048 -l
cpumask 4-7 maskset 1
receive buf size 2048
listen forever
 stats 100000, enobufs 10
 stats 200000, enobufs 17
 stats 300000, enobufs 28
 stats 400000, enobufs 35
 stats 500000, enobufs 49
 stats 600000, enobufs 63
 stats 700000, enobufs 80
 stats 800000, enobufs 94
 stats 900000, enobufs 114
 stats 1000000, enobufs 138

elm3b94:/tmp # ./getdelays -m 2-3 -r 2048 -l
cpumask 2-3 maskset 1
receive buf size 2048
listen forever
 stats 100000, enobufs 5
 stats 200000, enobufs 6
 stats 300000, enobufs 10
 stats 400000, enobufs 14
 stats 500000, enobufs 19
 stats 600000, enobufs 33
 stats 700000, enobufs 38
 stats 800000, enobufs 43
 stats 900000, enobufs 54
 stats 1000000, enobufs 59


elm3b94:/tmp # ./getdelays -m 1 -r 2048 -l
cpumask 1 maskset 1
receive buf size 2048
listen forever
 stats 100000, enobufs 3
 stats 200000, enobufs 6
 stats 300000, enobufs 19
 stats 400000, enobufs 25
 stats 500000, enobufs 30
 stats 600000, enobufs 31
 stats 700000, enobufs 33
 stats 800000, enobufs 45
 stats 900000, enobufs 48
 stats 1000000, enobufs 52



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch 6/6] per task delay accounting taskstats interface: fix clone skbs for each listener
  2006-07-11  4:36 ` [Patch 6/6] per task delay accounting taskstats interface: fix clone skbs for each listener Shailabh Nagar
@ 2006-07-11 10:05   ` Andrew Morton
  2006-07-11 10:28     ` Herbert Xu
  0 siblings, 1 reply; 16+ messages in thread
From: Andrew Morton @ 2006-07-11 10:05 UTC (permalink / raw)
  To: nagar; +Cc: linux-kernel, jlan, csturtiv, pj, balbir, sekharan, hadi, netdev

On Tue, 11 Jul 2006 00:36:39 -0400
Shailabh Nagar <nagar@watson.ibm.com> wrote:

>  	down_write(&listeners->sem);
>  	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
> -		ret = genlmsg_unicast(skb, s->pid);
> +		skb_next = NULL;
> +		if (!list_islast(&s->list, &listeners->list)) {
> +			skb_next = skb_clone(skb_cur, GFP_KERNEL);

If we do a GFP_KERNEL allocation with this semaphore held, and the
oom-killer tries to kill something to satisfy the allocation, and the
killed task gets stuck on that semaphore, I wonder of the box locks up.

Probably it'll work out OK if the semaphore is taken after that task has
had some resources torn down.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch 6/6] per task delay accounting taskstats interface: fix clone skbs for each listener
  2006-07-11 10:05   ` Andrew Morton
@ 2006-07-11 10:28     ` Herbert Xu
  2006-07-11 10:57       ` Andrew Morton
  0 siblings, 1 reply; 16+ messages in thread
From: Herbert Xu @ 2006-07-11 10:28 UTC (permalink / raw)
  To: Andrew Morton
  Cc: nagar, linux-kernel, jlan, csturtiv, pj, balbir, sekharan, hadi,
	netdev

Andrew Morton <akpm@osdl.org> wrote:
> On Tue, 11 Jul 2006 00:36:39 -0400
> Shailabh Nagar <nagar@watson.ibm.com> wrote:
> 
>>       down_write(&listeners->sem);
>>       list_for_each_entry_safe(s, tmp, &listeners->list, list) {
>> -             ret = genlmsg_unicast(skb, s->pid);
>> +             skb_next = NULL;
>> +             if (!list_islast(&s->list, &listeners->list)) {
>> +                     skb_next = skb_clone(skb_cur, GFP_KERNEL);
> 
> If we do a GFP_KERNEL allocation with this semaphore held, and the
> oom-killer tries to kill something to satisfy the allocation, and the
> killed task gets stuck on that semaphore, I wonder of the box locks up.

We do GFP_KERNEL inside semaphores/mutexes in lots of places.  So if this
can deadlock with the oom-killer we probably should fix that, preferably
by having GFP_KERNEL fail in that case.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch 6/6] per task delay accounting taskstats interface: fix clone skbs for each listener
  2006-07-11 10:28     ` Herbert Xu
@ 2006-07-11 10:57       ` Andrew Morton
  2006-07-11 11:15         ` Herbert Xu
  0 siblings, 1 reply; 16+ messages in thread
From: Andrew Morton @ 2006-07-11 10:57 UTC (permalink / raw)
  To: Herbert Xu
  Cc: nagar, linux-kernel, jlan, csturtiv, pj, balbir, sekharan, hadi,
	netdev

On Tue, 11 Jul 2006 20:28:50 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> Andrew Morton <akpm@osdl.org> wrote:
> > On Tue, 11 Jul 2006 00:36:39 -0400
> > Shailabh Nagar <nagar@watson.ibm.com> wrote:
> > 
> >>       down_write(&listeners->sem);
> >>       list_for_each_entry_safe(s, tmp, &listeners->list, list) {
> >> -             ret = genlmsg_unicast(skb, s->pid);
> >> +             skb_next = NULL;
> >> +             if (!list_islast(&s->list, &listeners->list)) {
> >> +                     skb_next = skb_clone(skb_cur, GFP_KERNEL);
> > 
> > If we do a GFP_KERNEL allocation with this semaphore held, and the
> > oom-killer tries to kill something to satisfy the allocation, and the
> > killed task gets stuck on that semaphore, I wonder of the box locks up.
> 
> We do GFP_KERNEL inside semaphores/mutexes in lots of places.  So if this
> can deadlock with the oom-killer we probably should fix that, preferably
> by having GFP_KERNEL fail in that case.

This lock is special, in that it's taken on the exit() path (I think).  So
it can block tasks which are trying to exit.

But yes.  Reliable, deadlock-free oom-killing is, err, a matter of ongoing
research.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch 6/6] per task delay accounting taskstats interface: fix clone skbs for each listener
  2006-07-11 10:57       ` Andrew Morton
@ 2006-07-11 11:15         ` Herbert Xu
  2006-07-11 17:44           ` Shailabh Nagar
  2006-07-12  5:41           ` Shailabh Nagar
  0 siblings, 2 replies; 16+ messages in thread
From: Herbert Xu @ 2006-07-11 11:15 UTC (permalink / raw)
  To: Andrew Morton
  Cc: nagar, linux-kernel, jlan, csturtiv, pj, balbir, sekharan, hadi,
	netdev

On Tue, Jul 11, 2006 at 03:57:31AM -0700, Andrew Morton wrote:
>
> > >>       down_write(&listeners->sem);
> > >>       list_for_each_entry_safe(s, tmp, &listeners->list, list) {
> > >> -             ret = genlmsg_unicast(skb, s->pid);
> > >> +             skb_next = NULL;
> > >> +             if (!list_islast(&s->list, &listeners->list)) {
> > >> +                     skb_next = skb_clone(skb_cur, GFP_KERNEL);
> > > 
> > > If we do a GFP_KERNEL allocation with this semaphore held, and the
> > > oom-killer tries to kill something to satisfy the allocation, and the
> > > killed task gets stuck on that semaphore, I wonder of the box locks up.
> > 
> > We do GFP_KERNEL inside semaphores/mutexes in lots of places.  So if this
> > can deadlock with the oom-killer we probably should fix that, preferably
> > by having GFP_KERNEL fail in that case.
> 
> This lock is special, in that it's taken on the exit() path (I think).  So
> it can block tasks which are trying to exit.

Sorry, missed the context.

If there is a deadlock then it's not just this allocation that you
need worry about.  There is also an allocation within genlmsg_uniast
that would be GFP_KERNEL.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch 0/6] delay accounting & taskstats fixes
  2006-07-11  4:23 [Patch 0/6] delay accounting & taskstats fixes Shailabh Nagar
                   ` (6 preceding siblings ...)
  2006-07-11  4:50 ` [Patch 0/6] delay accounting & taskstats fixes Shailabh Nagar
@ 2006-07-11 11:28 ` Christoph Hellwig
  2006-07-11 15:33   ` Shailabh Nagar
  7 siblings, 1 reply; 16+ messages in thread
From: Christoph Hellwig @ 2006-07-11 11:28 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: Andrew Morton, linux-kernel, Jay Lan, Chris Sturtivant,
	Paul Jackson, Balbir Singh, Chandra Seetharaman

On Tue, Jul 11, 2006 at 12:23:58AM -0400, Shailabh Nagar wrote:
> Andrew,
> 
> Chandra, Balbir & I have been putting taskstats and delay accounting
> patches through some extensive testing on multiple platforms.
> 
> Following are a set of patches that fix some bugs found as well as
> some cleanups of the code. Some results showing the cpumask feature 
> works as expected will follow separately.

Btw, did you ever explain what this delay accounting code is useful for
exactly?  It's an awfull lot of code that doesn't seem to have a huge
use.  While we're at it do you have a pointer to the associated userspace code?


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch 0/6] delay accounting & taskstats fixes
  2006-07-11 11:28 ` Christoph Hellwig
@ 2006-07-11 15:33   ` Shailabh Nagar
  0 siblings, 0 replies; 16+ messages in thread
From: Shailabh Nagar @ 2006-07-11 15:33 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Andrew Morton, linux-kernel, Jay Lan, Chris Sturtivant,
	Paul Jackson, Balbir Singh, Chandra Seetharaman

Christoph Hellwig wrote:
> On Tue, Jul 11, 2006 at 12:23:58AM -0400, Shailabh Nagar wrote:
> 
>>Andrew,
>>
>>Chandra, Balbir & I have been putting taskstats and delay accounting
>>patches through some extensive testing on multiple platforms.
>>
>>Following are a set of patches that fix some bugs found as well as
>>some cleanups of the code. Some results showing the cpumask feature 
>>works as expected will follow separately.
> 
> 
> Btw, did you ever explain what this delay accounting code is useful for
> exactly?  

Yes.
Previous postings have explained the rationale,

http://www.uwsg.indiana.edu/hypermail/linux/kernel/0511.1/2275.html
http://www.uwsg.indiana.edu/hypermail/linux/kernel/0512.0/2152.html

lwn.net had a good summary as always,
http://lwn.net/Articles/173209/




> It's an awfull lot of code that doesn't seem to have a huge
> use.  

The delay accounting part of the code is quite small (just a few
lines of timestamping code place in core kernel code) and much of the
remaining size comes from the interface. Since the interface is designed
to be of generic use towards ANY per-task accounting needs, discussions and
code have been fairly involved. We went through a long discussion of
potential exploiters:

(see thread under [Patch 0/8] per-task delay accounting at
http://www.uwsg.indiana.edu/hypermail/linux/kernel/0603.3/index.html

> While we're at it do you have a pointer to the associated userspace code?
> 

Its part of the patches in Documentation/accounting/getdelays.c

and the last-but-one version of it can be viewed at its location in the -mm
tree:
http://www.linux-m32r.org/lxr/http/source/Documentation/accounting/?a=generic

(The just accepted patches updates the documentation especially the parts
relating to cpumasks but the above should also suffice to give a flavor of
what userspace could do).


Thanks,
Shailabh


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch 6/6] per task delay accounting taskstats interface: fix clone skbs for each listener
  2006-07-11 11:15         ` Herbert Xu
@ 2006-07-11 17:44           ` Shailabh Nagar
  2006-07-12  5:41           ` Shailabh Nagar
  1 sibling, 0 replies; 16+ messages in thread
From: Shailabh Nagar @ 2006-07-11 17:44 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Andrew Morton, linux-kernel, jlan, csturtiv, pj, balbir, sekharan,
	hadi, netdev

Herbert Xu wrote:
> On Tue, Jul 11, 2006 at 03:57:31AM -0700, Andrew Morton wrote:
> 
>>>>>      down_write(&listeners->sem);
>>>>>      list_for_each_entry_safe(s, tmp, &listeners->list, list) {
>>>>>-             ret = genlmsg_unicast(skb, s->pid);
>>>>>+             skb_next = NULL;
>>>>>+             if (!list_islast(&s->list, &listeners->list)) {
>>>>>+                     skb_next = skb_clone(skb_cur, GFP_KERNEL);
>>>>
>>>>If we do a GFP_KERNEL allocation with this semaphore held, and the
>>>>oom-killer tries to kill something to satisfy the allocation, and the
>>>>killed task gets stuck on that semaphore, I wonder of the box locks up.

Hmm...doesn't look very safe does it.
There's no real need for us to skb_clone within the sem. Keeping a count of
listeners and doing the clone outside should let us avoid this problem.

I was trying to avoid doing the above because of the potential for
listeners getting added continuously to the list
(and having to repeat the allocation loop outside the down_write).

But on second thoughts, we're under no obligation to send the data to all the
listeners who add themselves in the short time between our taking a snapshot
of the listener count and when the send is done (within the down_write). So
it should be ok.

>>>
>>>We do GFP_KERNEL inside semaphores/mutexes in lots of places.  So if this
>>>can deadlock with the oom-killer we probably should fix that, preferably
>>>by having GFP_KERNEL fail in that case.
>>
>>This lock is special, in that it's taken on the exit() path (I think).  So
>>it can block tasks which are trying to exit.
> 
> 
> Sorry, missed the context.
> 
> If there is a deadlock then it's not just this allocation that you
> need worry about.  There is also an allocation within genlmsg_uniast
> that would be GFP_KERNEL.
> 

Thats true. The GFP_KERNEL allocation potentially called in netlink_trim() as part
of the genlmsg/netlink_unicast() is again a problem.

So perhaps we should switch to using RCU for protecting the listener list.

--Shailabh

> Cheers,

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch 6/6] per task delay accounting taskstats interface: fix clone skbs for each listener
  2006-07-11 11:15         ` Herbert Xu
  2006-07-11 17:44           ` Shailabh Nagar
@ 2006-07-12  5:41           ` Shailabh Nagar
  1 sibling, 0 replies; 16+ messages in thread
From: Shailabh Nagar @ 2006-07-12  5:41 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Andrew Morton, linux-kernel, Jay Lan, Chris Sturtivant,
	Paul Jackson, Balbir Singh, Chandra Seetharaman, Jamal, netdev

On Tue, 2006-07-11 at 07:15, Herbert Xu wrote:
> On Tue, Jul 11, 2006 at 03:57:31AM -0700, Andrew Morton wrote:
> >
> > > >>       down_write(&listeners->sem);
> > > >>       list_for_each_entry_safe(s, tmp, &listeners->list, list) {
> > > >> -             ret = genlmsg_unicast(skb, s->pid);
> > > >> +             skb_next = NULL;
> > > >> +             if (!list_islast(&s->list, &listeners->list)) {
> > > >> +                     skb_next = skb_clone(skb_cur, GFP_KERNEL);
> > > > 
> > > > If we do a GFP_KERNEL allocation with this semaphore held, and the
> > > > oom-killer tries to kill something to satisfy the allocation, and the
> > > > killed task gets stuck on that semaphore, I wonder of the box locks up.
> > > 
> > > We do GFP_KERNEL inside semaphores/mutexes in lots of places.  So if this
> > > can deadlock with the oom-killer we probably should fix that, preferably
> > > by having GFP_KERNEL fail in that case.
> > 
> > This lock is special, in that it's taken on the exit() path (I think).  So
> > it can block tasks which are trying to exit.
> 
> Sorry, missed the context.
> 
> If there is a deadlock then it's not just this allocation that you
> need worry about.  There is also an allocation within genlmsg_uniast
> that would be GFP_KERNEL.


Remove down_write() from taskstats code invoked on the exit() path.

In send_cpu_listeners(), which is called on the exit path, 
a down_write() was protecting operations like skb_clone() and 
genlmsg_unicast() that do GFP_KERNEL allocations. If the oom-killer 
decides to kill tasks to satisfy the allocations,the exit of those 
tasks could block on the same semphore.

The down_write() was only needed to allow removal of invalid 
listeners from the listener list. The patch converts the down_write 
to a down_read and defers the removal to a separate critical region. 
This ensures that even if the oom-killer is called, no other task's 
exit is blocked as it can still acquire another down_read.

Thanks to Andrew Morton & Herbert Xu for pointing out the oom 
related pitfalls, and to Chandra Seetharaman for suggesting this 
fix instead of using something more complex like RCU.

Signed-Off-By: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-Off-By: Shailabh Nagar <nagar@watson.ibm.com>

---

 kernel/taskstats.c |   24 +++++++++++++++++++-----
 1 files changed, 19 insertions(+), 5 deletions(-)

Index: linux-2.6.18-rc1/kernel/taskstats.c
===================================================================
--- linux-2.6.18-rc1.orig/kernel/taskstats.c	2006-07-11 00:13:00.000000000 -0400
+++ linux-2.6.18-rc1/kernel/taskstats.c	2006-07-12 00:07:53.000000000 -0400
@@ -51,6 +51,7 @@ __read_mostly = {
 struct listener {
 	struct list_head list;
 	pid_t pid;
+	char valid;
 };
 
 struct listener_list {
@@ -127,7 +128,7 @@ static int send_cpu_listeners(struct sk_
 	struct listener *s, *tmp;
 	struct sk_buff *skb_next, *skb_cur = skb;
 	void *reply = genlmsg_data(genlhdr);
-	int rc, ret;
+	int rc, ret, delcount = 0;
 
 	rc = genlmsg_end(skb, reply);
 	if (rc < 0) {
@@ -137,7 +138,7 @@ static int send_cpu_listeners(struct sk_
 
 	rc = 0;
 	listeners = &per_cpu(listener_array, cpu);
-	down_write(&listeners->sem);
+	down_read(&listeners->sem);
 	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
 		skb_next = NULL;
 		if (!list_islast(&s->list, &listeners->list)) {
@@ -150,14 +151,26 @@ static int send_cpu_listeners(struct sk_
 		}
 		ret = genlmsg_unicast(skb_cur, s->pid);
 		if (ret == -ECONNREFUSED) {
-			list_del(&s->list);
-			kfree(s);
+			s->valid = 0;
+			delcount++;
 			rc = ret;
 		}
 		skb_cur = skb_next;
 	}
-	up_write(&listeners->sem);
+	up_read(&listeners->sem);
+
+	if (!delcount)
+		return rc;
 
+	/* Delete invalidated entries */
+	down_write(&listeners->sem);
+	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
+		if (!s->valid) {
+			list_del(&s->list);
+			kfree(s);
+		}
+	}
+	up_write(&listeners->sem);
 	return rc;
 }
 
@@ -290,6 +303,7 @@ static int add_del_listener(pid_t pid, c
 				goto cleanup;
 			s->pid = pid;
 			INIT_LIST_HEAD(&s->list);
+			s->valid = 1;
 
 			listeners = &per_cpu(listener_array, cpu);
 			down_write(&listeners->sem);



^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2006-07-12  5:41 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-07-11  4:23 [Patch 0/6] delay accounting & taskstats fixes Shailabh Nagar
2006-07-11  4:27 ` [Patch 1/6] per task delay accounting taskstats interface: code cleanup Shailabh Nagar
2006-07-11  4:29 ` [Patch 2/6] per task delay accounting taskstats interface: documentation fix Shailabh Nagar
2006-07-11  4:30 ` [Patch 3/6] per task delay accounting taskstats interface: fix early sem init Shailabh Nagar
2006-07-11  4:33 ` [Patch 4/6] per task delay accounting taskstats interface: fix drop listener only on socket close Shailabh Nagar
2006-07-11  4:34 ` [Patch 5/6] list_islast utility Shailabh Nagar
2006-07-11  4:36 ` [Patch 6/6] per task delay accounting taskstats interface: fix clone skbs for each listener Shailabh Nagar
2006-07-11 10:05   ` Andrew Morton
2006-07-11 10:28     ` Herbert Xu
2006-07-11 10:57       ` Andrew Morton
2006-07-11 11:15         ` Herbert Xu
2006-07-11 17:44           ` Shailabh Nagar
2006-07-12  5:41           ` Shailabh Nagar
2006-07-11  4:50 ` [Patch 0/6] delay accounting & taskstats fixes Shailabh Nagar
2006-07-11 11:28 ` Christoph Hellwig
2006-07-11 15:33   ` Shailabh Nagar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox