Netdev List
 help / color / mirror / Atom feed
* [Patch 1/3] sysctl: refactor integer handling proc code
From: Amerigo Wang @ 2010-04-12 10:04 UTC (permalink / raw)
  To: linux-kernel
  Cc: Octavian Purdila, Eric Dumazet, penguin-kernel, netdev,
	Neil Horman, ebiederm, David Miller, Amerigo Wang
In-Reply-To: <20100412100744.5302.92442.sendpatchset@localhost.localdomain>


From: Octavian Purdila <opurdila@ixiacom.com>

As we are about to add another integer handling proc function a little
bit of cleanup is in order: add a few helper functions to improve code
readability and decrease code duplication.

In the process a bug is also fixed: if the user specifies a number
with more then 20 digits it will be interpreted as two integers
(e.g. 10000...13 will be interpreted as 100.... and 13).

Behavior for EFAULT handling was changed as well. Previous to this
patch, when an EFAULT error occurred in the middle of a write
operation, although some of the elements were set, that was not
acknowledged to the user (by shorting the write and returning the
number of bytes accepted). EFAULT is now treated just like any other
errors by acknowledging the amount of bytes accepted.

Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
Signed-off-by: WANG Cong <amwang@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---

Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -2040,8 +2040,148 @@ int proc_dostring(struct ctl_table *tabl
 			       buffer, lenp, ppos);
 }
 
+static int proc_skip_wspace(char __user **buf, size_t *size)
+{
+	char c;
+
+	while (*size) {
+		if (get_user(c, *buf))
+			return -EFAULT;
+		if (!isspace(c))
+			break;
+		(*size)--;
+		(*buf)++;
+	}
+
+	return 0;
+}
+
+static bool isanyof(char c, const char *v, unsigned len)
+{
+	int i;
+
+	if (!len)
+		return false;
+
+	for (i = 0; i < len; i++)
+		if (c == v[i])
+			break;
+	if (i == len)
+		return false;
+
+	return true;
+}
+
+#define TMPBUFLEN 22
+/**
+ * proc_get_long - reads an ASCII formated integer from a user buffer
+ *
+ * @buf - user buffer
+ * @size - size of the user buffer
+ * @val - this is where the number will be stored
+ * @neg - set to %TRUE if number is negative
+ * @perm_tr - a vector which contains the allowed trailers
+ * @perm_tr_len - size of the perm_tr vector
+ * @tr - pointer to store the trailer character
+ *
+ * In case of success 0 is returned and buf and size are updated with
+ * the amount of bytes read. If tr is non NULL and a trailing
+ * character exist (size is non zero after returning from this
+ * function) tr is updated with the trailing character.
+ */
+static int proc_get_long(char __user **buf, size_t *size,
+			  unsigned long *val, bool *neg,
+			  const char *perm_tr, unsigned perm_tr_len, char *tr)
+{
+	int len;
+	char *p, tmp[TMPBUFLEN];
+
+	if (!*size)
+		return -EINVAL;
+
+	len = *size;
+	if (len > TMPBUFLEN-1)
+		len = TMPBUFLEN-1;
+
+	if (copy_from_user(tmp, *buf, len))
+		return -EFAULT;
+
+	tmp[len] = 0;
+	p = tmp;
+	if (*p == '-' && *size > 1) {
+		*neg = 1;
+		p++;
+	} else
+		*neg = 0;
+	if (!isdigit(*p))
+		return -EINVAL;
+
+	*val = simple_strtoul(p, &p, 0);
+
+	len = p - tmp;
+
+	/* We don't know if the next char is whitespace thus we may accept
+	 * invalid integers (e.g. 1234...a) or two integers instead of one
+	 * (e.g. 123...1). So lets not allow such large numbers. */
+	if (len == TMPBUFLEN - 1)
+		return -EINVAL;
+
+	if (len < *size && perm_tr_len && !isanyof(*p, perm_tr, perm_tr_len))
+		return -EINVAL;
+
+	if (tr && (len < *size))
+		*tr = *p;
+
+	*buf += len;
+	*size -= len;
+
+	return 0;
+}
+
+/**
+ * proc_put_long - coverts an integer to a decimal ASCII formated string
+ *
+ * @buf - the user buffer
+ * @size - the size of the user buffer
+ * @val - the integer to be converted
+ * @neg - sign of the number, %TRUE for negative
+ * @first - if %FALSE will insert a separator character before the number
+ * @separator - the separator character
+ *
+ * In case of success 0 is returned and buf and size are updated with
+ * the amount of bytes read.
+ */
+static int proc_put_long(char __user **buf, size_t *size, unsigned long val,
+			  bool neg, bool first, char separator)
+{
+	int len;
+	char tmp[TMPBUFLEN], *p = tmp;
+
+	if (!first)
+		*p++ = separator;
+	sprintf(p, "%s%lu", neg ? "-" : "", val);
+	len = strlen(tmp);
+	if (len > *size)
+		len = *size;
+	if (copy_to_user(*buf, tmp, len))
+		return -EFAULT;
+	*size -= len;
+	*buf += len;
+	return 0;
+}
+#undef TMPBUFLEN
+
+static int proc_put_char(char __user **buf, size_t *size, char c)
+{
+	if (*size) {
+		if (put_user(c, *buf))
+			return -EFAULT;
+		(*size)--, (*buf)++;
+	}
+	return 0;
+}
 
-static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
+static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
 				 int *valp,
 				 int write, void *data)
 {
@@ -2050,7 +2190,7 @@ static int do_proc_dointvec_conv(int *ne
 	} else {
 		int val = *valp;
 		if (val < 0) {
-			*negp = -1;
+			*negp = 1;
 			*lvalp = (unsigned long)-val;
 		} else {
 			*negp = 0;
@@ -2060,20 +2200,18 @@ static int do_proc_dointvec_conv(int *ne
 	return 0;
 }
 
+static const char proc_wspace_sep[] = { ' ', '\t', '\n', 0 };
+
 static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
-		  int write, void __user *buffer,
+		  int write, void __user *_buffer,
 		  size_t *lenp, loff_t *ppos,
-		  int (*conv)(int *negp, unsigned long *lvalp, int *valp,
+		  int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
 			      int write, void *data),
 		  void *data)
 {
-#define TMPBUFLEN 21
-	int *i, vleft, first = 1, neg;
-	unsigned long lval;
-	size_t left, len;
-	
-	char buf[TMPBUFLEN], *p;
-	char __user *s = buffer;
+	int *i, vleft, first = 1, err = 0;
+	size_t left;
+	char __user *buffer = (char __user *) _buffer;
 	
 	if (!tbl_data || !table->maxlen || !*lenp ||
 	    (*ppos && !write)) {
@@ -2089,88 +2227,48 @@ static int __do_proc_dointvec(void *tbl_
 		conv = do_proc_dointvec_conv;
 
 	for (; left && vleft--; i++, first=0) {
-		if (write) {
-			while (left) {
-				char c;
-				if (get_user(c, s))
-					return -EFAULT;
-				if (!isspace(c))
-					break;
-				left--;
-				s++;
-			}
-			if (!left)
-				break;
-			neg = 0;
-			len = left;
-			if (len > sizeof(buf) - 1)
-				len = sizeof(buf) - 1;
-			if (copy_from_user(buf, s, len))
-				return -EFAULT;
-			buf[len] = 0;
-			p = buf;
-			if (*p == '-' && left > 1) {
-				neg = 1;
-				p++;
-			}
-			if (*p < '0' || *p > '9')
-				break;
-
-			lval = simple_strtoul(p, &p, 0);
+		unsigned long lval;
+		bool neg;
 
-			len = p-buf;
-			if ((len < left) && *p && !isspace(*p))
+		if (write) {
+			err = proc_skip_wspace(&buffer, &left);
+			if (err)
+				return err;
+			err = proc_get_long(&buffer, &left, &lval, &neg,
+					     proc_wspace_sep,
+					     sizeof(proc_wspace_sep), NULL);
+			if (err)
 				break;
-			s += len;
-			left -= len;
-
-			if (conv(&neg, &lval, i, 1, data))
+			if (conv(&neg, &lval, i, 1, data)) {
+				err = -EINVAL;
 				break;
+			}
 		} else {
-			p = buf;
-			if (!first)
-				*p++ = '\t';
-	
-			if (conv(&neg, &lval, i, 0, data))
+			if (conv(&neg, &lval, i, 0, data)) {
+				err = -EINVAL;
 				break;
-
-			sprintf(p, "%s%lu", neg ? "-" : "", lval);
-			len = strlen(buf);
-			if (len > left)
-				len = left;
-			if(copy_to_user(s, buf, len))
-				return -EFAULT;
-			left -= len;
-			s += len;
-		}
-	}
-
-	if (!write && !first && left) {
-		if(put_user('\n', s))
-			return -EFAULT;
-		left--, s++;
-	}
-	if (write) {
-		while (left) {
-			char c;
-			if (get_user(c, s++))
-				return -EFAULT;
-			if (!isspace(c))
+			}
+			err = proc_put_long(&buffer, &left, lval, neg, first,
+					     '\t');
+			if (err)
 				break;
-			left--;
 		}
 	}
+
+	if (!write && !first && left && !err)
+		err = proc_put_char(&buffer, &left, '\n');
+	if (write && !err)
+		err = proc_skip_wspace(&buffer, &left);
 	if (write && first)
-		return -EINVAL;
+		return err ? : -EINVAL;
 	*lenp -= left;
 	*ppos += *lenp;
 	return 0;
-#undef TMPBUFLEN
 }
 
 static int do_proc_dointvec(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos,
-		  int (*conv)(int *negp, unsigned long *lvalp, int *valp,
+		  int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
 			      int write, void *data),
 		  void *data)
 {
@@ -2238,8 +2336,8 @@ struct do_proc_dointvec_minmax_conv_para
 	int *max;
 };
 
-static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, 
-					int *valp, 
+static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
+					int *valp,
 					int write, void *data)
 {
 	struct do_proc_dointvec_minmax_conv_param *param = data;
@@ -2252,7 +2350,7 @@ static int do_proc_dointvec_minmax_conv(
 	} else {
 		int val = *valp;
 		if (val < 0) {
-			*negp = -1;
+			*negp = 1;
 			*lvalp = (unsigned long)-val;
 		} else {
 			*negp = 0;
@@ -2290,17 +2388,15 @@ int proc_dointvec_minmax(struct ctl_tabl
 }
 
 static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
-				     void __user *buffer,
+				     void __user *_buffer,
 				     size_t *lenp, loff_t *ppos,
 				     unsigned long convmul,
 				     unsigned long convdiv)
 {
-#define TMPBUFLEN 21
-	unsigned long *i, *min, *max, val;
-	int vleft, first=1, neg;
-	size_t len, left;
-	char buf[TMPBUFLEN], *p;
-	char __user *s = buffer;
+	unsigned long *i, *min, *max;
+	int vleft, first = 1, err = 0;
+	size_t left;
+	char __user *buffer = (char __user *) _buffer;
 	
 	if (!data || !table->maxlen || !*lenp ||
 	    (*ppos && !write)) {
@@ -2315,82 +2411,42 @@ static int __do_proc_doulongvec_minmax(v
 	left = *lenp;
 	
 	for (; left && vleft--; i++, min++, max++, first=0) {
+		unsigned long val;
+
 		if (write) {
-			while (left) {
-				char c;
-				if (get_user(c, s))
-					return -EFAULT;
-				if (!isspace(c))
-					break;
-				left--;
-				s++;
-			}
-			if (!left)
-				break;
-			neg = 0;
-			len = left;
-			if (len > TMPBUFLEN-1)
-				len = TMPBUFLEN-1;
-			if (copy_from_user(buf, s, len))
-				return -EFAULT;
-			buf[len] = 0;
-			p = buf;
-			if (*p == '-' && left > 1) {
-				neg = 1;
-				p++;
-			}
-			if (*p < '0' || *p > '9')
-				break;
-			val = simple_strtoul(p, &p, 0) * convmul / convdiv ;
-			len = p-buf;
-			if ((len < left) && *p && !isspace(*p))
+			bool neg;
+
+			err = proc_skip_wspace(&buffer, &left);
+			if (err)
+				return err;
+			err = proc_get_long(&buffer, &left, &val, &neg,
+					     proc_wspace_sep,
+					     sizeof(proc_wspace_sep), NULL);
+			if (err)
 				break;
 			if (neg)
-				val = -val;
-			s += len;
-			left -= len;
-
-			if(neg)
 				continue;
 			if ((min && val < *min) || (max && val > *max))
 				continue;
 			*i = val;
 		} else {
-			p = buf;
-			if (!first)
-				*p++ = '\t';
-			sprintf(p, "%lu", convdiv * (*i) / convmul);
-			len = strlen(buf);
-			if (len > left)
-				len = left;
-			if(copy_to_user(s, buf, len))
-				return -EFAULT;
-			left -= len;
-			s += len;
-		}
-	}
-
-	if (!write && !first && left) {
-		if(put_user('\n', s))
-			return -EFAULT;
-		left--, s++;
-	}
-	if (write) {
-		while (left) {
-			char c;
-			if (get_user(c, s++))
-				return -EFAULT;
-			if (!isspace(c))
+			val = convdiv * (*i) / convmul;
+			err = proc_put_long(&buffer, &left, val, 0, first,
+					     '\t');
+			if (err)
 				break;
-			left--;
 		}
 	}
+
+	if (!write && !first && left && !err)
+		err = proc_put_char(&buffer, &left, '\n');
+	if (write && !err)
+		err = proc_skip_wspace(&buffer, &left);
 	if (write && first)
-		return -EINVAL;
+		return err ? : -EINVAL;
 	*lenp -= left;
 	*ppos += *lenp;
 	return 0;
-#undef TMPBUFLEN
 }
 
 static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
@@ -2451,7 +2507,7 @@ int proc_doulongvec_ms_jiffies_minmax(st
 }
 
 
-static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
+static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
 					 int *valp,
 					 int write, void *data)
 {
@@ -2463,7 +2519,7 @@ static int do_proc_dointvec_jiffies_conv
 		int val = *valp;
 		unsigned long lval;
 		if (val < 0) {
-			*negp = -1;
+			*negp = 1;
 			lval = (unsigned long)-val;
 		} else {
 			*negp = 0;
@@ -2474,7 +2530,7 @@ static int do_proc_dointvec_jiffies_conv
 	return 0;
 }
 
-static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
+static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp,
 						int *valp,
 						int write, void *data)
 {
@@ -2486,7 +2542,7 @@ static int do_proc_dointvec_userhz_jiffi
 		int val = *valp;
 		unsigned long lval;
 		if (val < 0) {
-			*negp = -1;
+			*negp = 1;
 			lval = (unsigned long)-val;
 		} else {
 			*negp = 0;
@@ -2497,7 +2553,7 @@ static int do_proc_dointvec_userhz_jiffi
 	return 0;
 }
 
-static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
+static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
 					    int *valp,
 					    int write, void *data)
 {
@@ -2507,7 +2563,7 @@ static int do_proc_dointvec_ms_jiffies_c
 		int val = *valp;
 		unsigned long lval;
 		if (val < 0) {
-			*negp = -1;
+			*negp = 1;
 			lval = (unsigned long)-val;
 		} else {
 			*negp = 0;

^ permalink raw reply

* [Patch 2/3] sysctl: add proc_do_large_bitmap
From: Amerigo Wang @ 2010-04-12 10:04 UTC (permalink / raw)
  To: linux-kernel
  Cc: Octavian Purdila, Eric Dumazet, penguin-kernel, netdev,
	Neil Horman, Amerigo Wang, ebiederm, David Miller
In-Reply-To: <20100412100744.5302.92442.sendpatchset@localhost.localdomain>

From: Octavian Purdila <opurdila@ixiacom.com>

The new function can be used to read/write large bitmaps via /proc. A
comma separated range format is used for compact output and input
(e.g. 1,3-4,10-10).

Writing into the file will first reset the bitmap then update it
based on the given input.

Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
Signed-off-by: WANG Cong <amwang@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---

Index: linux-2.6/include/linux/sysctl.h
===================================================================
--- linux-2.6.orig/include/linux/sysctl.h
+++ linux-2.6/include/linux/sysctl.h
@@ -980,6 +980,8 @@ extern int proc_doulongvec_minmax(struct
 				  void __user *, size_t *, loff_t *);
 extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int,
 				      void __user *, size_t *, loff_t *);
+extern int proc_do_large_bitmap(struct ctl_table *, int,
+				void __user *, size_t *, loff_t *);
 
 /*
  * Register a set of sysctl names by calling register_sysctl_table
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -2072,6 +2072,23 @@ static bool isanyof(char c, const char *
 	return true;
 }
 
+static int proc_skip_anyof(char __user **buf, size_t *size,
+			   const char *v, unsigned len)
+{
+	char c;
+
+	while (*size) {
+		if (get_user(c, *buf))
+			return -EFAULT;
+		if (!isanyof(c, v, len))
+			break;
+		(*size)--;
+		(*buf)++;
+	}
+
+	return 0;
+}
+
 #define TMPBUFLEN 22
 /**
  * proc_get_long - reads an ASCII formated integer from a user buffer
@@ -2663,6 +2680,135 @@ static int proc_do_cad_pid(struct ctl_ta
 	return 0;
 }
 
+/**
+ * proc_do_large_bitmap - read/write from/to a large bitmap
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * The bitmap is stored at table->data and the bitmap length (in bits)
+ * in table->maxlen.
+ *
+ * We use a range comma separated format (e.g. 1,3-4,10-10) so that
+ * large bitmaps may be represented in a compact manner. Writing into
+ * the file will clear the bitmap then update it with the given input.
+ *
+ * Returns 0 on success.
+ */
+int proc_do_large_bitmap(struct ctl_table *table, int write,
+			 void __user *_buffer, size_t *lenp, loff_t *ppos)
+{
+	int err = 0;
+	bool first = 1;
+	size_t left = *lenp;
+	unsigned long bitmap_len = table->maxlen;
+	char __user *buffer = (char __user *) _buffer;
+	unsigned long *bitmap = (unsigned long *) table->data;
+	unsigned long *tmp_bitmap = NULL;
+	char tr_a[] = { '-', ',', '\n', 0 }, tr_b[] = { ',', '\n', 0 }, c;
+	char tr_end[] = { '\n', 0 };
+
+
+	if (!bitmap_len || !left || (*ppos && !write)) {
+		*lenp = 0;
+		return 0;
+	}
+
+	if (write) {
+		tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long),
+				     GFP_KERNEL);
+		if (!tmp_bitmap)
+			return -ENOMEM;
+		err = proc_skip_anyof(&buffer, &left, tr_end, sizeof(tr_end));
+		while (!err && left) {
+			unsigned long val_a, val_b;
+			bool neg;
+
+			err = proc_get_long(&buffer, &left, &val_a, &neg, tr_a,
+					     sizeof(tr_a), &c);
+			if (err)
+				break;
+			if (val_a >= bitmap_len || neg) {
+				err = -EINVAL;
+				break;
+			}
+
+			val_b = val_a;
+			if (left) {
+				buffer++;
+				left--;
+			}
+
+			if (c == '-') {
+				err = proc_get_long(&buffer, &left, &val_b,
+						     &neg, tr_b, sizeof(tr_b),
+						     &c);
+				if (err)
+					break;
+				if (val_b >= bitmap_len || neg ||
+				    val_a > val_b) {
+					err = -EINVAL;
+					break;
+				}
+				if (left) {
+					buffer++;
+					left--;
+				}
+			}
+
+			while (val_a <= val_b)
+				set_bit(val_a++, tmp_bitmap);
+
+			first = 0;
+			err = proc_skip_anyof(&buffer, &left, tr_end,
+					      sizeof(tr_end));
+		}
+	} else {
+		unsigned long bit_a, bit_b = 0;
+
+		while (left) {
+			bit_a = find_next_bit(bitmap, bitmap_len, bit_b);
+			if (bit_a >= bitmap_len)
+				break;
+			bit_b = find_next_zero_bit(bitmap, bitmap_len,
+						   bit_a + 1) - 1;
+
+			err = proc_put_long(&buffer, &left, bit_a, 0, first,
+					     ',');
+			if (err)
+				break;
+			if (bit_a != bit_b) {
+				err = proc_put_char(&buffer, &left, '-');
+				if (err)
+					break;
+				err = proc_put_long(&buffer, &left, bit_b, 0,
+						     1, 0);
+				if (err)
+					break;
+			}
+
+			first = 0; bit_b++;
+		}
+		if (!err)
+			err = proc_put_char(&buffer, &left, '\n');
+	}
+
+	if (!err) {
+		if (write)
+			memcpy(bitmap, tmp_bitmap,
+			       BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long));
+		kfree(tmp_bitmap);
+		*lenp -= left;
+		*ppos += *lenp;
+		return 0;
+	} else {
+		kfree(tmp_bitmap);
+		return err;
+	}
+}
+
 #else /* CONFIG_PROC_FS */
 
 int proc_dostring(struct ctl_table *table, int write,

^ permalink raw reply

* [Patch 3/3] net: reserve ports for applications using fixed port numbers
From: Amerigo Wang @ 2010-04-12 10:04 UTC (permalink / raw)
  To: linux-kernel
  Cc: Octavian Purdila, Eric Dumazet, penguin-kernel, netdev,
	Neil Horman, Amerigo Wang, David Miller, ebiederm
In-Reply-To: <20100412100744.5302.92442.sendpatchset@localhost.localdomain>

From: Octavian Purdila <opurdila@ixiacom.com>

This patch introduces /proc/sys/net/ipv4/ip_local_reserved_ports which
allows users to reserve ports for third-party applications.

The reserved ports will not be used by automatic port assignments
(e.g. when calling connect() or bind() with port number 0). Explicit
port allocation behavior is unchanged.

Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
Signed-off-by: WANG Cong <amwang@redhat.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---

Index: linux-2.6/Documentation/networking/ip-sysctl.txt
===================================================================
--- linux-2.6.orig/Documentation/networking/ip-sysctl.txt
+++ linux-2.6/Documentation/networking/ip-sysctl.txt
@@ -588,6 +588,37 @@ ip_local_port_range - 2 INTEGERS
 	(i.e. by default) range 1024-4999 is enough to issue up to
 	2000 connections per second to systems supporting timestamps.
 
+ip_local_reserved_ports - list of comma separated ranges
+	Specify the ports which are reserved for known third-party
+	applications. These ports will not be used by automatic port
+	assignments (e.g. when calling connect() or bind() with port
+	number 0). Explicit port allocation behavior is unchanged.
+
+	The format used for both input and output is a comma separated
+	list of ranges (e.g. "1,2-4,10-10" for ports 1, 2, 3, 4 and
+	10). Writing to the file will clear all previously reserved
+	ports and update the current list with the one given in the
+	input.
+
+	Note that ip_local_port_range and ip_local_reserved_ports
+	settings are independent and both are considered by the kernel
+	when determining which ports are available for automatic port
+	assignments.
+
+	You can reserve ports which are not in the current
+	ip_local_port_range, e.g.:
+
+	$ cat /proc/sys/net/ipv4/ip_local_port_range
+	32000	61000
+	$ cat /proc/sys/net/ipv4/ip_local_reserved_ports
+	8080,9148
+
+	although this is redundant. However such a setting is useful
+	if later the port range is changed to a value that will
+	include the reserved ports.
+
+	Default: Empty
+
 ip_nonlocal_bind - BOOLEAN
 	If set, allows processes to bind() to non-local IP addresses,
 	which can be quite useful - but may break some applications.
Index: linux-2.6/drivers/infiniband/core/cma.c
===================================================================
--- linux-2.6.orig/drivers/infiniband/core/cma.c
+++ linux-2.6/drivers/infiniband/core/cma.c
@@ -1980,6 +1980,8 @@ retry:
 	/* FIXME: add proper port randomization per like inet_csk_get_port */
 	do {
 		ret = idr_get_new_above(ps, bind_list, next_port, &port);
+		if (!ret && inet_is_reserved_local_port(port))
+			ret = -EAGAIN;
 	} while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL));
 
 	if (ret)
@@ -2995,11 +2997,19 @@ static void cma_remove_one(struct ib_dev
 static int __init cma_init(void)
 {
 	int ret, low, high, remaining;
+	int tries = 10;
 
-	get_random_bytes(&next_port, sizeof next_port);
 	inet_get_local_port_range(&low, &high);
+again:
+	get_random_bytes(&next_port, sizeof next_port);
 	remaining = (high - low) + 1;
 	next_port = ((unsigned int) next_port % remaining) + low;
+	if (inet_is_reserved_local_port(next_port)) {
+		if (tries--)
+			goto again;
+		else
+			return -EBUSY;
+	}
 
 	cma_wq = create_singlethread_workqueue("rdma_cm");
 	if (!cma_wq)
Index: linux-2.6/include/net/ip.h
===================================================================
--- linux-2.6.orig/include/net/ip.h
+++ linux-2.6/include/net/ip.h
@@ -184,6 +184,12 @@ extern struct local_ports {
 } sysctl_local_ports;
 extern void inet_get_local_port_range(int *low, int *high);
 
+extern unsigned long *sysctl_local_reserved_ports;
+static inline int inet_is_reserved_local_port(int port)
+{
+	return test_bit(port, sysctl_local_reserved_ports);
+}
+
 extern int sysctl_ip_default_ttl;
 extern int sysctl_ip_nonlocal_bind;
 
Index: linux-2.6/net/ipv4/af_inet.c
===================================================================
--- linux-2.6.orig/net/ipv4/af_inet.c
+++ linux-2.6/net/ipv4/af_inet.c
@@ -1552,9 +1552,13 @@ static int __init inet_init(void)
 
 	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
 
+	sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
+	if (!sysctl_local_reserved_ports)
+		goto out;
+
 	rc = proto_register(&tcp_prot, 1);
 	if (rc)
-		goto out;
+		goto out_free_reserved_ports;
 
 	rc = proto_register(&udp_prot, 1);
 	if (rc)
@@ -1653,6 +1657,8 @@ out_unregister_udp_proto:
 	proto_unregister(&udp_prot);
 out_unregister_tcp_proto:
 	proto_unregister(&tcp_prot);
+out_free_reserved_ports:
+	kfree(sysctl_local_reserved_ports);
 	goto out;
 }
 
Index: linux-2.6/net/ipv4/inet_connection_sock.c
===================================================================
--- linux-2.6.orig/net/ipv4/inet_connection_sock.c
+++ linux-2.6/net/ipv4/inet_connection_sock.c
@@ -37,6 +37,9 @@ struct local_ports sysctl_local_ports __
 	.range = { 32768, 61000 },
 };
 
+unsigned long *sysctl_local_reserved_ports;
+EXPORT_SYMBOL(sysctl_local_reserved_ports);
+
 void inet_get_local_port_range(int *low, int *high)
 {
 	unsigned seq;
@@ -108,6 +111,8 @@ again:
 
 		smallest_size = -1;
 		do {
+			if (inet_is_reserved_local_port(rover))
+				goto next_nolock;
 			head = &hashinfo->bhash[inet_bhashfn(net, rover,
 					hashinfo->bhash_size)];
 			spin_lock(&head->lock);
@@ -130,6 +135,7 @@ again:
 			break;
 		next:
 			spin_unlock(&head->lock);
+		next_nolock:
 			if (++rover > high)
 				rover = low;
 		} while (--remaining > 0);
Index: linux-2.6/net/ipv4/inet_hashtables.c
===================================================================
--- linux-2.6.orig/net/ipv4/inet_hashtables.c
+++ linux-2.6/net/ipv4/inet_hashtables.c
@@ -456,6 +456,8 @@ int __inet_hash_connect(struct inet_time
 		local_bh_disable();
 		for (i = 1; i <= remaining; i++) {
 			port = low + (i + offset) % remaining;
+			if (inet_is_reserved_local_port(port))
+				continue;
 			head = &hinfo->bhash[inet_bhashfn(net, port,
 					hinfo->bhash_size)];
 			spin_lock(&head->lock);
Index: linux-2.6/net/ipv4/sysctl_net_ipv4.c
===================================================================
--- linux-2.6.orig/net/ipv4/sysctl_net_ipv4.c
+++ linux-2.6/net/ipv4/sysctl_net_ipv4.c
@@ -299,6 +299,13 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= ipv4_local_port_range,
 	},
+	{
+		.procname	= "ip_local_reserved_ports",
+		.data		= NULL, /* initialized in sysctl_ipv4_init */
+		.maxlen		= 65536,
+		.mode		= 0644,
+		.proc_handler	= proc_do_large_bitmap,
+	},
 #ifdef CONFIG_IP_MULTICAST
 	{
 		.procname	= "igmp_max_memberships",
@@ -736,6 +743,16 @@ static __net_initdata struct pernet_oper
 static __init int sysctl_ipv4_init(void)
 {
 	struct ctl_table_header *hdr;
+	struct ctl_table *i;
+
+	for (i = ipv4_table; i->procname; i++) {
+		if (strcmp(i->procname, "ip_local_reserved_ports") == 0) {
+			i->data = sysctl_local_reserved_ports;
+			break;
+		}
+	}
+	if (!i->procname)
+		return -EINVAL;
 
 	hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table);
 	if (hdr == NULL)
Index: linux-2.6/net/ipv4/udp.c
===================================================================
--- linux-2.6.orig/net/ipv4/udp.c
+++ linux-2.6/net/ipv4/udp.c
@@ -233,7 +233,8 @@ int udp_lib_get_port(struct sock *sk, un
 			 */
 			do {
 				if (low <= snum && snum <= high &&
-				    !test_bit(snum >> udptable->log, bitmap))
+				    !test_bit(snum >> udptable->log, bitmap) &&
+				    !inet_is_reserved_local_port(snum))
 					goto found;
 				snum += rand;
 			} while (snum != first);
Index: linux-2.6/net/sctp/socket.c
===================================================================
--- linux-2.6.orig/net/sctp/socket.c
+++ linux-2.6/net/sctp/socket.c
@@ -5436,6 +5436,8 @@ static long sctp_get_port_local(struct s
 			rover++;
 			if ((rover < low) || (rover > high))
 				rover = low;
+			if (inet_is_reserved_local_port(rover))
+				continue;
 			index = sctp_phashfn(rover);
 			head = &sctp_port_hashtable[index];
 			sctp_spin_lock(&head->lock);

^ permalink raw reply

* Re: [Patch 1/3] sysctl: refactor integer handling proc code
From: Alexey Dobriyan @ 2010-04-12 10:18 UTC (permalink / raw)
  To: Amerigo Wang
  Cc: linux-kernel, Octavian Purdila, Eric Dumazet, penguin-kernel,
	netdev, Neil Horman, ebiederm, David Miller
In-Reply-To: <20100412100754.5302.99552.sendpatchset@localhost.localdomain>

On Mon, Apr 12, 2010 at 06:04:04AM -0400, Amerigo Wang wrote:
> As we are about to add another integer handling proc function a little
> bit of cleanup is in order: add a few helper functions to improve code
> readability and decrease code duplication.
> 
> In the process a bug is also fixed: if the user specifies a number
> with more then 20 digits it will be interpreted as two integers
> (e.g. 10000...13 will be interpreted as 100.... and 13).

ULONG_MAX is not 22 digits always.

The fix is to not rely on simple_strtoul()

I guess it's time to finally remove it. :-(

Also, it's better to copy_from user stuff once.
Without looking at non-trivial users, one page should be enough.

> Behavior for EFAULT handling was changed as well. Previous to this
> patch, when an EFAULT error occurred in the middle of a write
> operation, although some of the elements were set, that was not
> acknowledged to the user (by shorting the write and returning the
> number of bytes accepted). EFAULT is now treated just like any other
> errors by acknowledging the amount of bytes accepted.

> +static int proc_skip_wspace(char __user **buf, size_t *size)
> +{
> +	char c;
> +
> +	while (*size) {
> +		if (get_user(c, *buf))
> +			return -EFAULT;
> +		if (!isspace(c))
> +			break;
> +		(*size)--;
> +		(*buf)++;
> +	}
> +
> +	return 0;
> +}

yeah, copy_from_user once, so we won't have this.

> +static bool isanyof(char c, const char *v, unsigned len)

A what?
this is memchr()

> +{
> +	int i;
> +
> +	if (!len)
> +		return false;
> +
> +	for (i = 0; i < len; i++)
> +		if (c == v[i])
> +			break;
> +	if (i == len)
> +		return false;
> +
> +	return true;
> +}
> +
> +#define TMPBUFLEN 22
> +/**
> + * proc_get_long - reads an ASCII formated integer from a user buffer
> + *
> + * @buf - user buffer
> + * @size - size of the user buffer
> + * @val - this is where the number will be stored
> + * @neg - set to %TRUE if number is negative
> + * @perm_tr - a vector which contains the allowed trailers
> + * @perm_tr_len - size of the perm_tr vector
> + * @tr - pointer to store the trailer character
> + *
> + * In case of success 0 is returned and buf and size are updated with
> + * the amount of bytes read. If tr is non NULL and a trailing
> + * character exist (size is non zero after returning from this
> + * function) tr is updated with the trailing character.
> + */
> +static int proc_get_long(char __user **buf, size_t *size,
> +			  unsigned long *val, bool *neg,
> +			  const char *perm_tr, unsigned perm_tr_len, char *tr)
> +{
> +	int len;
> +	char *p, tmp[TMPBUFLEN];
> +
> +	if (!*size)
> +		return -EINVAL;
> +
> +	len = *size;
> +	if (len > TMPBUFLEN-1)
> +		len = TMPBUFLEN-1;
> +
> +	if (copy_from_user(tmp, *buf, len))
> +		return -EFAULT;
> +
> +	tmp[len] = 0;
> +	p = tmp;
> +	if (*p == '-' && *size > 1) {
> +		*neg = 1;
> +		p++;
> +	} else
> +		*neg = 0;
> +	if (!isdigit(*p))
> +		return -EINVAL;
> +
> +	*val = simple_strtoul(p, &p, 0);
> +
> +	len = p - tmp;
> +
> +	/* We don't know if the next char is whitespace thus we may accept
> +	 * invalid integers (e.g. 1234...a) or two integers instead of one
> +	 * (e.g. 123...1). So lets not allow such large numbers. */
> +	if (len == TMPBUFLEN - 1)
> +		return -EINVAL;
> +
> +	if (len < *size && perm_tr_len && !isanyof(*p, perm_tr, perm_tr_len))
> +		return -EINVAL;
> +
> +	if (tr && (len < *size))
> +		*tr = *p;
> +
> +	*buf += len;
> +	*size -= len;
> +
> +	return 0;
> +}

^ permalink raw reply

* Re: [Patch 1/3] sysctl: refactor integer handling proc code
From: Alexey Dobriyan @ 2010-04-12 10:18 UTC (permalink / raw)
  To: Amerigo Wang
  Cc: linux-kernel, Octavian Purdila, Eric Dumazet, penguin-kernel,
	netdev, Neil Horman, ebiederm, David Miller
In-Reply-To: <20100412100754.5302.99552.sendpatchset@localhost.localdomain>

On Mon, Apr 12, 2010 at 06:04:04AM -0400, Amerigo Wang wrote:
> As we are about to add another integer handling proc function a little
> bit of cleanup is in order: add a few helper functions to improve code
> readability and decrease code duplication.
> 
> In the process a bug is also fixed: if the user specifies a number
> with more then 20 digits it will be interpreted as two integers
> (e.g. 10000...13 will be interpreted as 100.... and 13).

ULONG_MAX is not 22 digits always.

The fix is to not rely on simple_strtoul()

I guess it's time to finally remove it. :-(

Also, it's better to copy_from user stuff once.
Without looking at non-trivial users, one page should be enough.

> Behavior for EFAULT handling was changed as well. Previous to this
> patch, when an EFAULT error occurred in the middle of a write
> operation, although some of the elements were set, that was not
> acknowledged to the user (by shorting the write and returning the
> number of bytes accepted). EFAULT is now treated just like any other
> errors by acknowledging the amount of bytes accepted.

> +static int proc_skip_wspace(char __user **buf, size_t *size)
> +{
> +	char c;
> +
> +	while (*size) {
> +		if (get_user(c, *buf))
> +			return -EFAULT;
> +		if (!isspace(c))
> +			break;
> +		(*size)--;
> +		(*buf)++;
> +	}
> +
> +	return 0;
> +}

yeah, copy_from_user once, so we won't have this.

> +static bool isanyof(char c, const char *v, unsigned len)

A what?
this is memchr()

> +{
> +	int i;
> +
> +	if (!len)
> +		return false;
> +
> +	for (i = 0; i < len; i++)
> +		if (c == v[i])
> +			break;
> +	if (i == len)
> +		return false;
> +
> +	return true;
> +}
> +
> +#define TMPBUFLEN 22
> +/**
> + * proc_get_long - reads an ASCII formated integer from a user buffer
> + *
> + * @buf - user buffer
> + * @size - size of the user buffer
> + * @val - this is where the number will be stored
> + * @neg - set to %TRUE if number is negative
> + * @perm_tr - a vector which contains the allowed trailers
> + * @perm_tr_len - size of the perm_tr vector
> + * @tr - pointer to store the trailer character
> + *
> + * In case of success 0 is returned and buf and size are updated with
> + * the amount of bytes read. If tr is non NULL and a trailing
> + * character exist (size is non zero after returning from this
> + * function) tr is updated with the trailing character.
> + */
> +static int proc_get_long(char __user **buf, size_t *size,
> +			  unsigned long *val, bool *neg,
> +			  const char *perm_tr, unsigned perm_tr_len, char *tr)
> +{
> +	int len;
> +	char *p, tmp[TMPBUFLEN];
> +
> +	if (!*size)
> +		return -EINVAL;
> +
> +	len = *size;
> +	if (len > TMPBUFLEN-1)
> +		len = TMPBUFLEN-1;
> +
> +	if (copy_from_user(tmp, *buf, len))
> +		return -EFAULT;
> +
> +	tmp[len] = 0;
> +	p = tmp;
> +	if (*p == '-' && *size > 1) {
> +		*neg = 1;
> +		p++;
> +	} else
> +		*neg = 0;
> +	if (!isdigit(*p))
> +		return -EINVAL;
> +
> +	*val = simple_strtoul(p, &p, 0);
> +
> +	len = p - tmp;
> +
> +	/* We don't know if the next char is whitespace thus we may accept
> +	 * invalid integers (e.g. 1234...a) or two integers instead of one
> +	 * (e.g. 123...1). So lets not allow such large numbers. */
> +	if (len == TMPBUFLEN - 1)
> +		return -EINVAL;
> +
> +	if (len < *size && perm_tr_len && !isanyof(*p, perm_tr, perm_tr_len))
> +		return -EINVAL;
> +
> +	if (tr && (len < *size))
> +		*tr = *p;
> +
> +	*buf += len;
> +	*size -= len;
> +
> +	return 0;
> +}

^ permalink raw reply

* Re: [v3 Patch 2/3] bridge: make bridge support netpoll
From: Cong Wang @ 2010-04-12 10:37 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: linux-kernel, netdev, bridge, Andy Gospodarek, Neil Horman,
	Jeff Moyer, Matt Mackall, bonding-devel, Jay Vosburgh,
	David Miller
In-Reply-To: <20100408083710.2b61ee44@nehalam>

Stephen Hemminger wrote:
>> Index: linux-2.6/net/bridge/br_forward.c
>> ===================================================================
>> --- linux-2.6.orig/net/bridge/br_forward.c
>> +++ linux-2.6/net/bridge/br_forward.c
>> @@ -15,6 +15,7 @@
>>  #include <linux/slab.h>
>>  #include <linux/kernel.h>
>>  #include <linux/netdevice.h>
>> +#include <linux/netpoll.h>
>>  #include <linux/skbuff.h>
>>  #include <linux/if_vlan.h>
>>  #include <linux/netfilter_bridge.h>
>> @@ -50,7 +51,13 @@ int br_dev_queue_push_xmit(struct sk_buf
>>  		else {
>>  			skb_push(skb, ETH_HLEN);
>>  
>> -			dev_queue_xmit(skb);
>> +#ifdef CONFIG_NET_POLL_CONTROLLER
>> +			if (skb->dev->priv_flags & IFF_IN_NETPOLL) {
>> +				netpoll_send_skb(skb->dev->npinfo->netpoll, skb);
>> +				skb->dev->priv_flags &= ~IFF_IN_NETPOLL;
>> +			} else
>> +#endif
> 
> There is no protection on dev->priv_flags for SMP access.
> It would better bit value in dev->state if you are using it as control flag.
> 
> Then you could use 
> 			if (unlikely(test_and_clear_bit(__IN_NETPOLL, &skb->dev->state)))
> 				netpoll_send_skb(...)
> 
> 

Hmm, I think we can't use ->state here, it is not for this kind of purpose,
according to its comments.

Also, I find other usages of IFF_XXX flags of ->priv_flags are also using
&, | to set or clear the flags. So there must be some other things preventing
the race...


Thanks.

^ permalink raw reply

* Re: [v3 Patch 2/3] bridge: make bridge support netpoll
From: Eric Dumazet @ 2010-04-12 10:38 UTC (permalink / raw)
  To: Cong Wang
  Cc: Stephen Hemminger, linux-kernel, netdev, bridge, Andy Gospodarek,
	Neil Horman, Jeff Moyer, Matt Mackall, bonding-devel,
	Jay Vosburgh, David Miller
In-Reply-To: <4BC2F7E2.7020309@redhat.com>

Le lundi 12 avril 2010 à 18:37 +0800, Cong Wang a écrit :
> Stephen Hemminger wrote:
> > There is no protection on dev->priv_flags for SMP access.
> > It would better bit value in dev->state if you are using it as control flag.
> > 
> > Then you could use 
> > 			if (unlikely(test_and_clear_bit(__IN_NETPOLL, &skb->dev->state)))
> > 				netpoll_send_skb(...)
> > 
> > 
> 
> Hmm, I think we can't use ->state here, it is not for this kind of purpose,
> according to its comments.
> 
> Also, I find other usages of IFF_XXX flags of ->priv_flags are also using
> &, | to set or clear the flags. So there must be some other things preventing
> the race...

Yes, its RTNL that protects priv_flags changes, hopefully...

^ permalink raw reply

* [PATCH] iproute2: add option to build m_xt as a tc module.
From: Andreas Henriksson @ 2010-04-12 11:55 UTC (permalink / raw)
  To: shemminger; +Cc: netdev

Add TC_CONFIG_XT_MODULE option that can be added
either to Config (after ./configure) or as an argument to "make".

This will build the xt module (action ipt) of tc as a
shared object that is linked at runtime by tc if used,
rather then built into tc.

This is similar to how the atm qdisc support
is handled (q_atm.so).

Signed-off-by: Andreas Henriksson <andreas@fatal.se>

---

The reason for this is simply to be able to avoid
the tc binary from being linked to libxtables. This way
distributions who ship binary packages can
avoid a dependency on the iptables package by
ignoring m_xt.so in the dependency analysis
and let actual users of the tc arguments "action ipt"
make sure they have iptables installed.
(See http://bugs.debian.org/576953 )

This was not a problem with the old/deprecated
m_ipt module which did runtime linking of
the iptables library.

Having the split inside tc, rather then between tc and the required
library, is preferred. This way we'll notice at build-time
when the required library breaks API/ABI rather
then having to rely on people that uses the functionality
to report back when the ABI is broken.
(We've learned this the hard way in debian after many
angry bugreports.)

I've had jamal pre-review this and he didn't see any
problems with this.


diff --git a/tc/Makefile b/tc/Makefile
index 805c108..3af33cf 100644
--- a/tc/Makefile
+++ b/tc/Makefile
@@ -43,10 +43,18 @@ TCMODULES += em_cmp.o
 TCMODULES += em_u32.o
 TCMODULES += em_meta.o
 
+TCSO :=
+ifeq ($(TC_CONFIG_ATM),y)
+  TCSO += q_atm.so
+endif
 
 ifeq ($(TC_CONFIG_XT),y)
-  TCMODULES += m_xt.o
-  LDLIBS += -lxtables
+  ifeq ($(TC_CONFIG_XT_MODULE),y)
+    TCSO += m_xt.so
+  else
+    TCMODULES += m_xt.o
+    LDLIBS += -lxtables
+  endif
 else
   ifeq ($(TC_CONFIG_XT_OLD),y)
     TCMODULES += m_xt_old.o
@@ -81,11 +89,6 @@ ifneq ($(IPT_LIB_DIR),)
 	CFLAGS += -DIPT_LIB_DIR=\"$(IPT_LIB_DIR)\"
 endif
 
-TCSO :=
-ifeq ($(TC_CONFIG_ATM),y)
-  TCSO += q_atm.so
-endif
-
 YACC := bison
 LEX := flex
 
@@ -114,6 +117,9 @@ clean:
 q_atm.so: q_atm.c
 	$(CC) $(CFLAGS) $(LDFLAGS) -shared -fpic -o q_atm.so q_atm.c -latm
 
+m_xt.so: m_xt.c
+	$(CC) $(CFLAGS) $(LDFLAGS) -shared -fpic -o m_xt.so m_xt.c -lxtables
+
 %.yacc.c: %.y
 	$(YACC) $(YACCFLAGS) -o $@ $<
 

^ permalink raw reply related

* Bug#572201: forcedeth driver hangs under heavy load
From: stephen mulcahy @ 2010-04-12 12:39 UTC (permalink / raw)
  To: netdev; +Cc: Ben Hutchings, Eric Dumazet, Ayaz Abdulla, 572201
In-Reply-To: <4BC2EF88.3060203@atlanticlinux.ie>

stephen mulcahy wrote:
> It doesn't - further testing over the weekend saw 6 of 45 machines drop 
> off the network with this problem. Nothing in dmesg or system logs. 
> Happy to run more tests if someone can advise on what should be run.

I also just tried using the 2.6.30-2-amd64 (Debian) forcedeth kernel 
module while running the 2.6.32-3-amd64 (Debian) kernel and experienced 
the same symptoms.

Not sure if thats any help.

-stephen

^ permalink raw reply

* Bug#572201: forcedeth driver hangs under heavy load
From: Eric Dumazet @ 2010-04-12 12:47 UTC (permalink / raw)
  To: stephen mulcahy; +Cc: netdev, Ben Hutchings, Ayaz Abdulla, 572201
In-Reply-To: <4BC31486.1090603@gmail.com>

Le lundi 12 avril 2010 à 13:39 +0100, stephen mulcahy a écrit :
> stephen mulcahy wrote:
> > It doesn't - further testing over the weekend saw 6 of 45 machines drop 
> > off the network with this problem. Nothing in dmesg or system logs. 
> > Happy to run more tests if someone can advise on what should be run.
> 
> I also just tried using the 2.6.30-2-amd64 (Debian) forcedeth kernel 
> module while running the 2.6.32-3-amd64 (Debian) kernel and experienced 
> the same symptoms.
> 
> Not sure if thats any help.
> 

I am not sure I understand. Are you saying that using 2.6.30-2-amd64
kernel also makes your forcedeth adapter being not functional ?

Are both way non functional (RX and TX), or only one side ?






-- 
To UNSUBSCRIBE, email to debian-bugs-dist-REQUEST@lists.debian.org
with a subject of "unsubscribe". Trouble? Contact listmaster@lists.debian.org

^ permalink raw reply

* Re: forcedeth driver hangs under heavy load
From: stephen mulcahy @ 2010-04-12 13:05 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, Ben Hutchings, Ayaz Abdulla, 572201
In-Reply-To: <1271076426.16881.21.camel@edumazet-laptop>

Eric Dumazet wrote:
> Le lundi 12 avril 2010 à 13:39 +0100, stephen mulcahy a écrit :
> I am not sure I understand. Are you saying that using 2.6.30-2-amd64
> kernel also makes your forcedeth adapter being not functional ?

Hi Eric,

If I run my tests with the 2.6.30-2-amd64 kernel the network doesn't 
malfunction.

If I run my tests with the 2.6.32-3-amd64 kernel the network does 
malfunction.

If I take the forcedeth.ko module from the 2.6.30-2-amd64 kernel and 
drop that into /lib/modules/2.6.32-3-amd64/kernel/drivers/net/ and then 
reboot to 2.6.32-3-amd64 and rerun my tests - the network does malfunction.

> Are both way non functional (RX and TX), or only one side ?

Whats the best way of testing this? (tcpdump listening on both hosts and 
then running pings between the systems?)

-stephen

^ permalink raw reply

* Re: forcedeth driver hangs under heavy load
From: stephen mulcahy @ 2010-04-12 13:19 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, Ben Hutchings, Ayaz Abdulla, 572201
In-Reply-To: <4BC31AA0.5070006@gmail.com>

stephen mulcahy wrote:
>> Are both way non functional (RX and TX), or only one side ?
> 
> Whats the best way of testing this? (tcpdump listening on both hosts and 
> then running pings between the systems?)


stephen mulcahy wrote:
 >> Are both way non functional (RX and TX), or only one side ?
 >
 > Whats the best way of testing this? (tcpdump listening on both hosts and
 > then running pings between the systems?)

On one of the nodes that is in the malfunctioning state (node05), I ran

ssh node20

and grabbed the following output from running tcpdump on node20

root@node20:~# tcpdump host node20 and node05
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 96 bytes
14:12:59.612626 IP node05.webstar.cnet.36295 > node20.ssh: Flags [S], 
seq 3677858646, win 5840, options [mss 1460,sackOK,TS val 1599534 ecr 
0,nop,wscale 7], length 0
14:12:59.612656 IP node20.ssh > node05.webstar.cnet.36295: Flags [S.], 
seq 3610575850, ack 3677858647, win 5792, options [mss 1460,sackOK,TS 
val 1598775 ecr 1599534,nop,wscale 7], length 0
14:12:59.612718 IP node05.webstar.cnet.36295 > node20.ssh: Flags [.], 
ack 1, win 46, options [nop,nop,TS val 1599534 ecr 1598775], length 0
14:12:59.617434 IP node20.ssh > node05.webstar.cnet.36295: Flags [P.], 
seq 1:33, ack 1, win 46, options [nop,nop,TS val 1598776 ecr 1599534], 
length 32
14:12:59.617522 IP node05.webstar.cnet.36295 > node20.ssh: Flags [.], 
ack 33, win 46, options [nop,nop,TS val 1599535 ecr 1598776], length 0
14:12:59.617609 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 1:33, ack 33, win 46, options [nop,nop,TS val 1599535 ecr 1598776], 
length 32
14:12:59.820434 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 4294936586:4294936618, ack 2620194849, win 46, options [nop,nop,TS 
val 1599586 ecr 1598776], length 32
14:13:00.229069 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 4294961734:4294961766, ack 3928358945, win 46, options [nop,nop,TS 
val 1599688 ecr 1598776], length 32
14:13:01.044396 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 4294964167:4294964199, ack 410320929, win 46, options [nop,nop,TS 
val 1599892 ecr 1598776], length 32
14:13:02.676308 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 1:33, ack 33, win 46, options [nop,nop,TS val 1600300 ecr 1598776], 
length 32
14:13:05.940804 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 17294:17326, ack 3045851169, win 46, options [nop,nop,TS val 1601116 
ecr 1598776], length 32
14:13:12.468484 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 17294:17326, ack 3045851169, win 46, options [nop,nop,TS val 1602748 
ecr 1598776], length 32
14:13:23.846891 IP node20.ssh > node05.webstar.cnet.36084: Flags [F.], 
seq 2093054475, ack 2175389538, win 46, options [nop,nop,TS val 1604834 
ecr 1575591], length 0
14:13:23.847278 IP node05.webstar.cnet.36084 > node20.ssh: Flags [R], 
seq 2175389538, win 0, length 0
14:13:25.523850 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 1:33, ack 33, win 46, options [nop,nop,TS val 1606012 ecr 1598776], 
length 32
14:13:50.127509 IP node20.ssh > node05.webstar.cnet.36143: Flags [F.], 
seq 2526196657, ack 2590340885, win 46, options [nop,nop,TS val 1611404 
ecr 1582161], length 0
14:13:50.127879 IP node05.webstar.cnet.36143 > node20.ssh: Flags [R], 
seq 2590340885, win 0, length 0
14:13:51.633934 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 4294963190:4294963222, ack 9830433, win 46, options [nop,nop,TS val 
1612540 ecr 1598776], length 32
14:13:55.125525 ARP, Request who-has node05.webstar.cnet tell node20, 
length 28
14:13:55.125886 ARP, Reply node05.webstar.cnet is-at 00:30:48:ce:dc:02 
(oui Unknown), length 46
14:14:43.855380 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 1:33, ack 33, win 46, options [nop,nop,TS val 1625596 ecr 1598776], 
length 32
14:14:48.855143 ARP, Request who-has node20 tell node05.webstar.cnet, 
length 46
14:14:48.855469 ARP, Reply node20 is-at 00:30:48:ce:de:34 (oui Unknown), 
length 28
14:14:59.617675 IP node20.ssh > node05.webstar.cnet.36295: Flags [F.], 
seq 33, ack 1, win 46, options [nop,nop,TS val 1628777 ecr 1599535], 
length 0
14:14:59.618202 IP node05.webstar.cnet.36295 > node20.ssh: Flags [FP.], 
seq 4294959654:4294960446, ack 3930456098, win 46, options [nop,nop,TS 
val 1629536 ecr 1628777], length 792
14:14:59.821527 IP node20.ssh > node05.webstar.cnet.36295: Flags [F.], 
seq 33, ack 1, win 46, options [nop,nop,TS val 1628828 ecr 1599535], 
length 0
14:14:59.821598 IP node05.webstar.cnet.36295 > node20.ssh: Flags [.], 
ack 34, win 46, options [nop,nop,TS val 1629587 ecr 1628828,nop,nop,sack 
1 {33:34}], length 0
^C^
27 packets captured
31 packets received by filter
0 packets dropped by kernel


I then did ifdown and ifup on node05 and again ran

ssh node20

and grabbed the following output from running tcpdump on node20

root@node20:~# tcpdump host node20 and node05
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 96 bytes
14:15:50.626410 IP node05.webstar.cnet.36690 > node20.ssh: Flags [S], 
seq 2044900531, win 5840, options [mss 1460,sackOK,TS val 1642289 ecr 
0,nop,wscale 7], length 0
14:15:50.626441 IP node20.ssh > node05.webstar.cnet.36690: Flags [S.], 
seq 1976694445, ack 2044900532, win 5792, options [mss 1460,sackOK,TS 
val 1641529 ecr 1642289,nop,wscale 7], length 0
14:15:50.626482 IP node05.webstar.cnet.36690 > node20.ssh: Flags [.], 
ack 1, win 46, options [nop,nop,TS val 1642289 ecr 1641529], length 0
14:15:50.631138 IP node20.ssh > node05.webstar.cnet.36690: Flags [P.], 
seq 1:33, ack 1, win 46, options [nop,nop,TS val 1641530 ecr 1642289], 
length 32
14:15:50.631218 IP node05.webstar.cnet.36690 > node20.ssh: Flags [.], 
ack 33, win 46, options [nop,nop,TS val 1642290 ecr 1641530], length 0
14:15:50.631267 IP node05.webstar.cnet.36690 > node20.ssh: Flags [P.], 
seq 1:33, ack 33, win 46, options [nop,nop,TS val 1642290 ecr 1641530], 
length 32
14:15:50.631281 IP node20.ssh > node05.webstar.cnet.36690: Flags [.], 
ack 33, win 46, options [nop,nop,TS val 1641530 ecr 1642290], length 0
14:15:50.631367 IP node05.webstar.cnet.36690 > node20.ssh: Flags [P.], 
seq 33:825, ack 33, win 46, options [nop,nop,TS val 1642290 ecr 
1641530], length 792
14:15:50.631376 IP node20.ssh > node05.webstar.cnet.36690: Flags [.], 
ack 825, win 58, options [nop,nop,TS val 1641530 ecr 1642290], length 0
14:15:50.631808 IP node20.ssh > node05.webstar.cnet.36690: Flags [P.], 
seq 33:817, ack 825, win 58, options [nop,nop,TS val 1641530 ecr 
1642290], length 784
14:15:50.631950 IP node05.webstar.cnet.36690 > node20.ssh: Flags [P.], 
seq 825:849, ack 817, win 58, options [nop,nop,TS val 1642290 ecr 
1641530], length 24
14:15:50.633353 IP node20.ssh > node05.webstar.cnet.36690: Flags [P.], 
seq 817:969, ack 849, win 58, options [nop,nop,TS val 1641530 ecr 
1642290], length 152
14:15:50.633932 IP node05.webstar.cnet.36690 > node20.ssh: Flags [P.], 
seq 849:993, ack 969, win 71, options [nop,nop,TS val 1642291 ecr 
1641530], length 144
14:15:50.637998 IP node20.ssh > node05.webstar.cnet.36690: Flags [P.], 
seq 969:1689, ack 993, win 70, options [nop,nop,TS val 1641532 ecr 
1642291], length 720
14:15:50.676465 IP node05.webstar.cnet.36690 > node20.ssh: Flags [.], 
ack 1689, win 83, options [nop,nop,TS val 1642302 ecr 1641532], length 0
14:16:09.776134 IP node05.webstar.cnet.49671 > node20.50060: Flags [S], 
seq 2348078217, win 5840, options [mss 1460,sackOK,TS val 1647077 ecr 
0,nop,wscale 7], length 0
14:16:09.776498 IP node20.50060 > node05.webstar.cnet.49671: Flags [R.], 
seq 0, ack 2348078218, win 0, length 0
^C
17 packets captured
21 packets received by filter
0 packets dropped by kernel


Does that help?

^ permalink raw reply

* Re: [PATCH] rps: add flow director support
From: Tom Herbert @ 2010-04-12 13:34 UTC (permalink / raw)
  To: Changli Gao; +Cc: David S. Miller, netdev
In-Reply-To: <1271022140-3917-1-git-send-email-xiaosuo@gmail.com>

On Sun, Apr 11, 2010 at 2:42 PM, Changli Gao <xiaosuo@gmail.com> wrote:
> add rps flow director support
>
> with rps flow director, users can do weighted packet dispatching among CPUs.
> For example, CPU0:CPU1 is 1:3 for eth0's rx-0:
>
"Flow director" is a misnomer here in that it has no per flow
awareness, that is what RFS provides.  Please use a different name.

>  localhost linux # echo 4 > /sys/class/net/eth0/queues/rx-0/rps_flows
>  localhost linux # echo 0 > /sys/class/net/eth0/queues/rx-0/rps_flow_0
>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_1
>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_2
>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_3
>
It might be better to put this in its own directory and also do it per
CPU instead of hash entry.  This should result in a lot fewer entries
and I'm not sure how you would deal with holes in the hash table for
unspecified entries.  Also, it would be nice not to have to specify a
number of entries.  Maybe something like:

localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_cpu_map/0
localhost linux # echo 3 > /sys/class/net/eth0/queues/rx-0/rps_cpu_map/1

To specify CPU 0 with weight 1, CPU 1 with weight 3.

> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> ----
>  net/core/net-sysfs.c |  176 +++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 172 insertions(+), 4 deletions(-)
> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
> index 1e7fdd6..d904610 100644
> --- a/net/core/net-sysfs.c
> +++ b/net/core/net-sysfs.c
> @@ -511,6 +511,109 @@ static struct sysfs_ops rx_queue_sysfs_ops = {
>        .store = rx_queue_attr_store,
>  };
>
> +static DEFINE_MUTEX(rps_map_lock);
> +
> +static ssize_t show_rps_flow(struct netdev_rx_queue *queue,
> +                            struct rx_queue_attribute *attribute, char *buf)
> +{
> +       unsigned long flowid;
> +       struct rps_map *map;
> +       u16 cpu;
> +
> +       strict_strtoul(attribute->attr.name + strlen("rps_flow_"), 10, &flowid);
> +       rcu_read_lock();
> +       map = rcu_dereference(queue->rps_map);
> +       if (map && flowid < map->len)
> +               cpu = map->cpus[flowid];
> +       else
> +               cpu = 0;
> +       rcu_read_unlock();
> +       return sprintf(buf, "%hu\n", cpu);
> +}
> +
> +static ssize_t store_rps_flow(struct netdev_rx_queue *queue,
> +                             struct rx_queue_attribute *attribute,
> +                             const char *buf, size_t len)
> +{
> +       unsigned long flowid, cpu;
> +       struct rps_map *map;
> +
> +       if (!capable(CAP_NET_ADMIN))
> +               return -EPERM;
> +
> +       if (strict_strtoul(buf, 0, &cpu))
> +               return -EINVAL;
> +       strict_strtoul(attribute->attr.name + strlen("rps_flow_"), 10, &flowid);
> +
> +       mutex_lock(&rps_map_lock);
> +       map = queue->rps_map;
> +       if (map && flowid < map->len)
> +               map->cpus[flowid] = cpu;
> +       mutex_unlock(&rps_map_lock);
> +
> +       return len;
> +}
> +
> +static struct rx_queue_attribute **rps_flow_attribute;
> +static int rps_flow_attribute_size;
> +
> +/* must be called with rps_map_lock locked */
> +static int update_rps_flow_files(struct kobject *kobj,
> +                                struct rps_map *old_map, struct rps_map *map)
> +{
> +       int i;
> +       int old_map_len = old_map ? old_map->len : 0;
> +       int map_len = map ? map->len : 0;
> +
> +       if (old_map_len >= map_len) {
> +               for (i = map_len; i < old_map_len; i++)
> +                       sysfs_remove_file(kobj, &rps_flow_attribute[i]->attr);
> +               return 0;
> +       }
> +
> +       if (map_len > rps_flow_attribute_size) {
> +               struct rx_queue_attribute **attrs;
> +               char name[sizeof("rps_flow_4294967295")];
> +               char *pname;
> +
> +               attrs = krealloc(rps_flow_attribute, map_len * sizeof(void *),
> +                                GFP_KERNEL);
> +               if (attrs == NULL)
> +                       return -ENOMEM;
> +               rps_flow_attribute = attrs;
> +               for (i = rps_flow_attribute_size; i < map_len; i++) {
> +                       rps_flow_attribute[i] = kmalloc(sizeof(**attrs),
> +                                                       GFP_KERNEL);
> +                       if (rps_flow_attribute[i] == NULL)
> +                               break;
> +                       sprintf(name, "rps_flow_%d", i);
> +                       pname = kstrdup(name, GFP_KERNEL);
> +                       if (pname == NULL) {
> +                               kfree(rps_flow_attribute[i]);
> +                               break;
> +                       }
> +                       rps_flow_attribute[i]->attr.name = pname;
> +                       rps_flow_attribute[i]->attr.mode = S_IRUGO | S_IWUSR;
> +                       rps_flow_attribute[i]->show = show_rps_flow;
> +                       rps_flow_attribute[i]->store = store_rps_flow;
> +               }
> +               rps_flow_attribute_size = i;
> +               if (i != map_len)
> +                       return -ENOMEM;
> +       }
> +
> +       for (i = old_map_len; i < map_len; i++) {
> +               if (sysfs_create_file(kobj, &rps_flow_attribute[i]->attr)) {
> +                       while (--i >= old_map_len)
> +                               sysfs_remove_file(kobj,
> +                                                 &rps_flow_attribute[i]->attr);
> +                       return -ENOMEM;
> +               }
> +       }
> +
> +       return 0;
> +}
> +
>  static ssize_t show_rps_map(struct netdev_rx_queue *queue,
>                            struct rx_queue_attribute *attribute, char *buf)
>  {
> @@ -555,7 +658,6 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
>        struct rps_map *old_map, *map;
>        cpumask_var_t mask;
>        int err, cpu, i;
> -       static DEFINE_SPINLOCK(rps_map_lock);
>
>        if (!capable(CAP_NET_ADMIN))
>                return -EPERM;
> @@ -588,10 +690,15 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
>                map = NULL;
>        }
>
> -       spin_lock(&rps_map_lock);
> +       mutex_lock(&rps_map_lock);
>        old_map = queue->rps_map;
> -       rcu_assign_pointer(queue->rps_map, map);
> -       spin_unlock(&rps_map_lock);
> +       err = update_rps_flow_files(&queue->kobj, old_map, map);
> +       if (!err)
> +               rcu_assign_pointer(queue->rps_map, map);
> +       mutex_unlock(&rps_map_lock);
> +
> +       if (err)
> +               return err;
>
>        if (old_map)
>                call_rcu(&old_map->rcu, rps_map_release);
> @@ -603,8 +710,69 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
>  static struct rx_queue_attribute rps_cpus_attribute =
>        __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
>
> +static ssize_t show_rps_flows(struct netdev_rx_queue *queue,
> +               struct rx_queue_attribute *attribute, char *buf)
> +{
> +       struct rps_map *map;
> +       unsigned int len;
> +
> +       rcu_read_lock();
> +       map = rcu_dereference(queue->rps_map);
> +       len = map ? map->len : 0;
> +       rcu_read_unlock();
> +       return sprintf(buf, "%u\n", len);
> +}
> +
> +static ssize_t store_rps_flows(struct netdev_rx_queue *queue,
> +                              struct rx_queue_attribute *attribute,
> +                              const char *buf, size_t len)
> +{
> +       struct rps_map *old_map, *map;
> +       unsigned long flows;
> +       int err;
> +
> +       if (!capable(CAP_NET_ADMIN))
> +               return -EPERM;
> +
> +       if (strict_strtoul(buf, 0, &flows))
> +               return -EINVAL;
> +       if (flows != 0) {
> +               map = kzalloc(max_t(unsigned, RPS_MAP_SIZE(flows),
> +                                   L1_CACHE_BYTES), GFP_KERNEL);
> +               if (map == NULL)
> +                       return -ENOMEM;
> +               map->len = flows;
> +       } else {
> +               map = NULL;
> +       }
> +
> +       mutex_lock(&rps_map_lock);
> +       old_map = queue->rps_map;
> +       err = update_rps_flow_files(&queue->kobj, old_map, map);
> +       if (!err) {
> +               if (old_map && map)
> +                       memcpy(map->cpus, old_map->cpus,
> +                              sizeof(map->cpus[0]) *
> +                              min_t(unsigned int, flows, old_map->len));
> +               rcu_assign_pointer(queue->rps_map, map);
> +       }
> +       mutex_unlock(&rps_map_lock);
> +
> +       if (err)
> +               return err;
> +
> +       if (old_map)
> +               call_rcu(&old_map->rcu, rps_map_release);
> +
> +       return len;
> +}
> +
> +static struct rx_queue_attribute rps_flows_attribute =
> +       __ATTR(rps_flows, S_IRUGO | S_IWUSR, show_rps_flows, store_rps_flows);
> +
>  static struct attribute *rx_queue_default_attrs[] = {
>        &rps_cpus_attribute.attr,
> +       &rps_flows_attribute.attr,
>        NULL
>  };
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply

* hdlc_ppp: why no detach()?
From: Michael Barkowski @ 2010-04-12 14:15 UTC (permalink / raw)
  To: Krzysztof Halasa
  Cc: David S. Miller, Julia Lawall, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org

Hello Krzyztof,

I am looking at your hdlc_ppp code and I don't understand: why is there
not the equivalent of fr_detach() in there?

pc8300_drv:cpc_remove_one() frees netdevs quite confidently but I wonder
how it can be so sure that there are not skbs in hdlc_ppp's tx_queue
associated with those devices before freeing them....

Even if you wanted to switch a device from PPP to Frame Relay, I don't
see the method right now.  If I may ask, please, what am I missing?

If you agree there is a need for detach(), I would be happy to work on
it and make a submission.

thanks for your time,

-- 
Michael Barkowski
RuggedCom, Inc.


^ permalink raw reply

* Re: [PATCH] rps: add flow director support
From: Changli Gao @ 2010-04-12 14:27 UTC (permalink / raw)
  To: Tom Herbert; +Cc: David S. Miller, netdev
In-Reply-To: <z2o65634d661004120634h8336409er33af1fb75c2a9d1b@mail.gmail.com>

On Mon, Apr 12, 2010 at 9:34 PM, Tom Herbert <therbert@google.com> wrote:
> On Sun, Apr 11, 2010 at 2:42 PM, Changli Gao <xiaosuo@gmail.com> wrote:
>> add rps flow director support
>>
>> with rps flow director, users can do weighted packet dispatching among CPUs.
>> For example, CPU0:CPU1 is 1:3 for eth0's rx-0:
>>
> "Flow director" is a misnomer here in that it has no per flow
> awareness, that is what RFS provides.  Please use a different name.

Flow here is a bundle of flow, not the original meaning. How about
"rps_buckets" and "rps_bucket_x"?

>
>>  localhost linux # echo 4 > /sys/class/net/eth0/queues/rx-0/rps_flows
>>  localhost linux # echo 0 > /sys/class/net/eth0/queues/rx-0/rps_flow_0
>>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_1
>>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_2
>>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_3
>>
> It might be better to put this in its own directory

I have thought that before, but since they control the same data in
kernel as rps_cpus does, I put them in the same directory.

> and also do it per
> CPU instead of hash entry.  This should result in a lot fewer entries
> and I'm not sure how you would deal with holes in the hash table for
> unspecified entries.  Also, it would be nice not to have to specify a
> number of entries.  Maybe something like:
>
> localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_cpu_map/0
> localhost linux # echo 3 > /sys/class/net/eth0/queues/rx-0/rps_cpu_map/1
>
> To specify CPU 0 with weight 1, CPU 1 with weight 3.
>

Your way is more simple and straightforward. My idea has it own advantage:
1. control the rate precision through rps_flows.
2. do dynamic weighted packet dispatching by migrating some flows from
some CPUs to other CPUs. During this operations, only the flows
migrated are affected, and OOO only occurs in these flows.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: hdlc_ppp: why no detach()?
From: Michael Barkowski @ 2010-04-12 14:34 UTC (permalink / raw)
  To: Krzysztof Halasa
  Cc: David S. Miller, Julia Lawall, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <4BC32B00.1030600@ruggedcom.com>

Michael Barkowski wrote:
> Hello Krzyztof,
> 
> I am looking at your hdlc_ppp code and I don't understand: why is there
> not the equivalent of fr_detach() in there?
> 
> pc8300_drv:cpc_remove_one() frees netdevs quite confidently but I wonder
> how it can be so sure that there are not skbs in hdlc_ppp's tx_queue
> associated with those devices before freeing them....
> 

the above is the real danger I see - free the netdev, then ppp's timer
comes along and dequeues from tx_queue an skb with invalid device.

> Even if you wanted to switch a device from PPP to Frame Relay, I don't
> see the method right now.  If I may ask, please, what am I missing?
> 

Ok - this part was a momentary lapse on my part - please strike from
the record :)

> If you agree there is a need for detach(), I would be happy to work on
> it and make a submission.
> 
> thanks for your time,
> 


-- 
Michael Barkowski
905-482-4577

^ permalink raw reply

* Re: Strange packet drops with heavy firewalling
From: Benny Lyne Amorsen @ 2010-04-12 14:44 UTC (permalink / raw)
  To: zhigang gong; +Cc: netdev
In-Reply-To: <q2v40c9f5b21004120116p766df82dj88c6af4e4cad55f@mail.gmail.com>

man, 12 04 2010 kl. 16:16 +0800, skrev zhigang gong:

> How do you know the per CPU usage data, by oprofile? I'm just a little
> surprised with the result, as it shows your new core is running 10x
> faster than your old core :). 

Well the old server had only two CPU's plus hyperthreading, and the
CPU's were Pentium-4-based. Add a slow memory bus to that and you have a
fairly slow system. It's almost 5 years old, so Moore's law says 2**3
increase in number of transistors...

In about the same time frame Linux has gone from being able to fill
1Gbps ethernet to being able to fill 10Gbps ethernet 

> What's the average packet size?

I asked the switch (I can't find a handy equivalent to ifstat which
counts packets instead of bytes). The 5 minute average packet sizes seem
to vary in the range 450 to 550 bytes.

> If your packet size is 64 bytes, then the pps(packet per second) rate
> should be about 585Kpps. As I know, this value is almost the best
> result when the standard linux kernel is processing the networking
> traffic with a normal 1Gb ethernet card (without multi-queue support)
> on a intel box. If it is the case, to buy a better ethernet card with
> multi-queue support should be a good choice. Otherwise, it may not
> help. 

I am far from that, perhaps 1/10th of that. I do a lot more processing
on at least some of the packets though (the ones starting new flows).


/Benny



^ permalink raw reply

* Very Important
From: Jiang Jianmin @ 2010-04-12 14:56 UTC (permalink / raw)


Good Day,
 
I have a secured business proposal of $28,272,000.00.Contact me via my private email(cncn1_jiang_jianmin2011@yahoo.com.cn)if interested.
 
Mr Jiang Jianmin.

^ permalink raw reply

* Re: forcedeth driver hangs under heavy load
From: Eric Dumazet @ 2010-04-12 15:24 UTC (permalink / raw)
  To: stephen mulcahy; +Cc: netdev, Ben Hutchings, Ayaz Abdulla, 572201
In-Reply-To: <4BC31DDE.7010005@gmail.com>

Le lundi 12 avril 2010 à 14:19 +0100, stephen mulcahy a écrit :

> Does that help?

Well, yes, because it seems a TCP problem.

root@node20:~# tcpdump host node20 and node05
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 96 bytes
14:12:59.612626 IP node05.webstar.cnet.36295 > node20.ssh: Flags [S], seq 3677858646, win 5840, options [mss 1460,sackOK,TS val 1599534 ecr 0,nop,wscale 7], length 0
14:12:59.612656 IP node20.ssh > node05.webstar.cnet.36295: Flags [S.], seq 3610575850, ack 3677858647, win 5792, options [mss 1460,sackOK,TS val 1598775 ecr 1599534,nop,wscale 7], length 0
14:12:59.612718 IP node05.webstar.cnet.36295 > node20.ssh: Flags [.], ack 1, win 46, options [nop,nop,TS val 1599534 ecr 1598775], length 0
14:12:59.617434 IP node20.ssh > node05.webstar.cnet.36295: Flags [P.], seq 1:33, ack 1, win 46, options [nop,nop,TS val 1598776 ecr 1599534], length 32
14:12:59.617522 IP node05.webstar.cnet.36295 > node20.ssh: Flags [.], ack 33, win 46, options [nop,nop,TS val 1599535 ecr 1598776], length 0
14:12:59.617609 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], seq 1:33, ack 33, win 46, options [nop,nop,TS val 1599535 ecr 1598776], length 32

All following xmitted frames are completely out of sync, this makes no sense.

Sequence number went backward.

14:12:59.820434 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], seq 4294936586:4294936618, ack 2620194849, win 46, options [nop,nop,TS val 1599586 ecr 1598776], length 32
14:13:00.229069 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], seq 4294961734:4294961766, ack 3928358945, win 46, options [nop,nop,TS val 1599688 ecr 1598776], length 32
14:13:01.044396 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], seq 4294964167:4294964199, ack 410320929, win 46, options [nop,nop,TS val 1599892 ecr 1598776], length 32


14:13:02.676308 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], seq 1:33, ack 33, win 46, options [nop,nop,TS val 1600300 ecr 1598776], length 32
14:13:05.940804 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], seq 17294:17326, ack 3045851169, win 46, options [nop,nop,TS val 1601116 ecr 1598776], length 32
14:13:12.468484 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], seq 17294:17326, ack 3045851169, win 46, options [nop,nop,TS val 1602748 ecr 1598776], length 32
14:13:25.523850 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], seq 1:33, ack 33, win 46, options [nop,nop,TS val 1606012 ecr 1598776], length 32
14:13:51.633934 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], seq 4294963190:4294963222, ack 9830433, win 46, options [nop,nop,TS val 1612540 ecr 1598776], length 32
14:14:43.855380 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], seq 1:33, ack 33, win 46, options [nop,nop,TS val 1625596 ecr 1598776], length 32
14:14:59.617675 IP node20.ssh > node05.webstar.cnet.36295: Flags [F.], seq 33, ack 1, win 46, options [nop,nop,TS val 1628777 ecr 1599535], length 0
14:14:59.618202 IP node05.webstar.cnet.36295 > node20.ssh: Flags [FP.], seq 4294959654:4294960446, ack 3930456098, win 46, options [nop,nop,TS val 1629536 ecr 1628777], length 792
14:14:59.821527 IP node20.ssh > node05.webstar.cnet.36295: Flags [F.], seq 33, ack 1, win 46, options [nop,nop,TS val 1628828 ecr 1599535], length 0
14:14:59.821598 IP node05.webstar.cnet.36295 > node20.ssh: Flags [.], ack 34, win 46, options [nop,nop,TS val 1629587 ecr 1628828,nop,nop,sack 1 {33:34}], length 0

Do you have some netfilters rules ?



^ permalink raw reply

* Re: [Bonding-devel] [v3 Patch 2/3] bridge: make bridge support netpoll
From: Stephen Hemminger @ 2010-04-12 15:38 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Cong Wang, Jay Vosburgh, Neil Horman, netdev, Matt Mackall,
	bridge, linux-kernel, David Miller, Jeff Moyer, Andy Gospodarek,
	bonding-devel
In-Reply-To: <1271068737.16881.18.camel@edumazet-laptop>

On Mon, 12 Apr 2010 12:38:57 +0200
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> Le lundi 12 avril 2010 à 18:37 +0800, Cong Wang a écrit :
> > Stephen Hemminger wrote:
> > > There is no protection on dev->priv_flags for SMP access.
> > > It would better bit value in dev->state if you are using it as control flag.
> > > 
> > > Then you could use 
> > > 			if (unlikely(test_and_clear_bit(__IN_NETPOLL, &skb->dev->state)))
> > > 				netpoll_send_skb(...)
> > > 
> > > 
> > 
> > Hmm, I think we can't use ->state here, it is not for this kind of purpose,
> > according to its comments.
> > 
> > Also, I find other usages of IFF_XXX flags of ->priv_flags are also using
> > &, | to set or clear the flags. So there must be some other things preventing
> > the race...
> 
> Yes, its RTNL that protects priv_flags changes, hopefully...
> 

The patch was not protecting priv_flags with RTNL.
For example..


@@ -308,7 +312,9 @@ static void netpoll_send_skb(struct netp
 		     tries > 0; --tries) {
 			if (__netif_tx_trylock(txq)) {
 				if (!netif_tx_queue_stopped(txq)) {
+					dev->priv_flags |= IFF_IN_NETPOLL;
 					status = ops->ndo_start_xmit(skb, dev);
+					dev->priv_flags &= ~IFF_IN_NETPOLL;
 					if (status == NETDEV_TX_OK)
 						txq_trans_update(txq);

^ permalink raw reply

* [PATCH 0/4] IPv6 addrconf related fixes
From: Stephen Hemminger @ 2010-04-12 15:41 UTC (permalink / raw)
  To: davem; +Cc: netdev

These apply to net-next, the problems do not exist in earlier kernels.
The problems started when I added changes to retain IPv6 addresses
when link goes down.

-- 


^ permalink raw reply

* [PATCH 1/4] IPv6: keep route for tentative address
From: Stephen Hemminger @ 2010-04-12 15:41 UTC (permalink / raw)
  To: David S. Miller, Tantilov, Emil S; +Cc: netdev
In-Reply-To: <20100412154130.397252857@vyatta.com>

[-- Attachment #1: ipv6-addrconf1.patch --]
[-- Type: text/plain, Size: 689 bytes --]

Recent changes preserve IPv6 address when link goes down (good).
But would cause address to point to dead dst entry (bad).
The simplest fix is to just not delete route if address is
being held for later use.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>


--- a/net/ipv6/addrconf.c	2010-04-11 12:19:37.938082190 -0700
+++ b/net/ipv6/addrconf.c	2010-04-11 12:25:05.349309074 -0700
@@ -4046,7 +4046,8 @@ static void __ipv6_ifa_notify(int event,
 			addrconf_leave_anycast(ifp);
 		addrconf_leave_solict(ifp->idev, &ifp->addr);
 		dst_hold(&ifp->rt->u.dst);
-		if (ip6_del_rt(ifp->rt))
+
+		if (ifp->dead && ip6_del_rt(ifp->rt))
 			dst_free(&ifp->rt->u.dst);
 		break;
 	}

-- 


^ permalink raw reply

* [PATCH 2/4] IPv6: keep tentative addresses in hash table
From: Stephen Hemminger @ 2010-04-12 15:41 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <20100412154130.397252857@vyatta.com>

[-- Attachment #1: ipv6-addrconf2.patch --]
[-- Type: text/plain, Size: 1105 bytes --]

When link goes down, want address to be preserved but in a tentative
state, therefore it has to stay in hash list.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

--- a/net/ipv6/addrconf.c	2010-04-11 12:25:05.349309074 -0700
+++ b/net/ipv6/addrconf.c	2010-04-11 12:25:10.408996382 -0700
@@ -2703,17 +2703,18 @@ static int addrconf_ifdown(struct net_de
 			/* Flag it for later restoration when link comes up */
 			ifa->flags |= IFA_F_TENTATIVE;
 			in6_ifa_hold(ifa);
+			write_unlock_bh(&idev->lock);
 		} else {
 			list_del(&ifa->if_list);
 			ifa->dead = 1;
-		}
-		write_unlock_bh(&idev->lock);
+			write_unlock_bh(&idev->lock);
 
-		/* clear hash table */
-		spin_lock_bh(&addrconf_hash_lock);
-		hlist_del_init_rcu(&ifa->addr_lst);
-		__in6_ifa_put(ifa);
-		spin_unlock_bh(&addrconf_hash_lock);
+			/* clear hash table */
+			spin_lock_bh(&addrconf_hash_lock);
+			hlist_del_init_rcu(&ifa->addr_lst);
+			__in6_ifa_put(ifa);
+			spin_unlock_bh(&addrconf_hash_lock);
+		}
 
 		__ipv6_ifa_notify(RTM_DELADDR, ifa);
 		atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifa);

-- 


^ permalink raw reply

* [PATCH 3/4] ipv6: additional ref count for hash list unnecessary
From: Stephen Hemminger @ 2010-04-12 15:41 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <20100412154130.397252857@vyatta.com>

[-- Attachment #1: ipv6-addrconf3.patch --]
[-- Type: text/plain, Size: 1002 bytes --]

Since an address in hash list has to already have a ref count,
no additional ref count is needed. 

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>


--- a/net/ipv6/addrconf.c	2010-04-11 12:25:32.609002374 -0700
+++ b/net/ipv6/addrconf.c	2010-04-11 12:26:52.715246164 -0700
@@ -675,7 +675,6 @@ ipv6_add_addr(struct inet6_dev *idev, co
 	hash = ipv6_addr_hash(addr);
 
 	hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]);
-	in6_ifa_hold(ifa);
 	spin_unlock(&addrconf_hash_lock);
 
 	write_lock(&idev->lock);
@@ -723,7 +722,6 @@ static void ipv6_del_addr(struct inet6_i
 
 	spin_lock_bh(&addrconf_hash_lock);
 	hlist_del_init_rcu(&ifp->addr_lst);
-	__in6_ifa_put(ifp);
 	spin_unlock_bh(&addrconf_hash_lock);
 
 	write_lock_bh(&idev->lock);
@@ -2712,7 +2710,6 @@ static int addrconf_ifdown(struct net_de
 			/* clear hash table */
 			spin_lock_bh(&addrconf_hash_lock);
 			hlist_del_init_rcu(&ifa->addr_lst);
-			__in6_ifa_put(ifa);
 			spin_unlock_bh(&addrconf_hash_lock);
 		}
 

-- 


^ permalink raw reply

* [PATCH 4/4] IPv6: only notify protocols if address is compeletely gone
From: Stephen Hemminger @ 2010-04-12 15:41 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <20100412154130.397252857@vyatta.com>

[-- Attachment #1: ipv6-addrconf4.patch --]
[-- Type: text/plain, Size: 793 bytes --]

The notifier for address down should only be called if address is completely
gone, not just being marked as tentative on link transistion. The code
in net-next would case bonding/sctp/s390 to see address disappear on link
down, but they would never see it reappear on link up.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

--- a/net/ipv6/addrconf.c	2010-04-11 14:34:36.919767724 -0700
+++ b/net/ipv6/addrconf.c	2010-04-11 14:35:00.533967946 -0700
@@ -2714,7 +2714,9 @@ static int addrconf_ifdown(struct net_de
 		}
 
 		__ipv6_ifa_notify(RTM_DELADDR, ifa);
-		atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifa);
+		if (ifa->dead)
+			atomic_notifier_call_chain(&inet6addr_chain,
+						   NETDEV_DOWN, ifa);
 		in6_ifa_put(ifa);
 
 		write_lock_bh(&idev->lock);

-- 


^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox