Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH V10 04/15] posix clocks: introduce a syscall for clock tuning.
From: Richard Cochran @ 2011-01-27 10:55 UTC (permalink / raw)
  To: linux-kernel-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-api-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	Alan Cox, Arnd Bergmann, Christoph Lameter, David Miller,
	John Stultz, Krzysztof Halasa, Peter Zijlstra, Rodolfo Giometti,
	Thomas Gleixner, Benjamin Herrenschmidt, H. Peter Anvin,
	Ingo Molnar, Mike Frysinger, Paul Mackerras, Russell King
In-Reply-To: <cover.1296124770.git.richard.cochran-3mrvs1K0uXizZXS1Dc/lvw@public.gmane.org>

A new syscall is introduced that allows tuning of a POSIX clock. The
new call, clock_adjtime, takes two parameters, the clock ID and a
pointer to a struct timex. Any ADJTIMEX(2) operation may be requested
via this system call, but various POSIX clocks may or may not support
tuning.

Signed-off-by: Richard Cochran <richard.cochran-3mrvs1K0uXizZXS1Dc/lvw@public.gmane.org>
---
 drivers/char/mmtimer.c       |    1 +
 include/linux/posix-timers.h |    4 +
 include/linux/syscalls.h     |    2 +
 kernel/compat.c              |  136 +++++++++++++++++++++++++++---------------
 kernel/posix-cpu-timers.c    |    6 ++
 kernel/posix-timers.c        |   35 +++++++++++
 6 files changed, 136 insertions(+), 48 deletions(-)

diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c
index ecd0082..f365ae8 100644
--- a/drivers/char/mmtimer.c
+++ b/drivers/char/mmtimer.c
@@ -767,6 +767,7 @@ static struct k_clock sgi_clock = {
 	.res = 0,
 	.clock_set = sgi_clock_set,
 	.clock_get = sgi_clock_get,
+	.clock_adj = do_posix_clock_noadjtime,
 	.timer_create = sgi_timer_create,
 	.nsleep = do_posix_clock_nonanosleep,
 	.timer_set = sgi_timer_set,
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index b2c14cb..24f01de 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -4,6 +4,7 @@
 #include <linux/spinlock.h>
 #include <linux/list.h>
 #include <linux/sched.h>
+#include <linux/timex.h>
 
 union cpu_time_count {
 	cputime_t cpu;
@@ -72,6 +73,7 @@ struct k_clock {
 	int (*clock_set) (const clockid_t which_clock,
 			  const struct timespec *tp);
 	int (*clock_get) (const clockid_t which_clock, struct timespec * tp);
+	int (*clock_adj) (const clockid_t which_clock, struct timex *tx);
 	int (*timer_create) (struct k_itimer *timer);
 	int (*nsleep) (const clockid_t which_clock, int flags,
 		       struct timespec *, struct timespec __user *);
@@ -91,6 +93,7 @@ void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock);
 int do_posix_clock_nonanosleep(const clockid_t, int flags, struct timespec *,
 			       struct timespec __user *);
 int do_posix_clock_nosettime(const clockid_t, const struct timespec *tp);
+int do_posix_clock_noadjtime(const clockid_t, struct timex *tx);
 
 /* function to call to trigger timer event */
 int posix_timer_event(struct k_itimer *timr, int si_private);
@@ -98,6 +101,7 @@ int posix_timer_event(struct k_itimer *timr, int si_private);
 int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *ts);
 int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *ts);
 int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *ts);
+int posix_cpu_clock_adj(const clockid_t which_clock, struct timex *tx);
 int posix_cpu_timer_create(struct k_itimer *timer);
 int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 		     struct timespec *rqtp, struct timespec __user *rmtp);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 18cd068..bfacab9 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -311,6 +311,8 @@ asmlinkage long sys_clock_settime(clockid_t which_clock,
 				const struct timespec __user *tp);
 asmlinkage long sys_clock_gettime(clockid_t which_clock,
 				struct timespec __user *tp);
+asmlinkage long sys_clock_adjtime(clockid_t which_clock,
+				struct timex __user *tx);
 asmlinkage long sys_clock_getres(clockid_t which_clock,
 				struct timespec __user *tp);
 asmlinkage long sys_clock_nanosleep(clockid_t which_clock, int flags,
diff --git a/kernel/compat.c b/kernel/compat.c
index c9e2ec0..38b1d2c 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -52,6 +52,64 @@ static int compat_put_timeval(struct compat_timeval __user *o,
 		put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
 }
 
+static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp)
+{
+	memset(txc, 0, sizeof(struct timex));
+
+	if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
+			__get_user(txc->modes, &utp->modes) ||
+			__get_user(txc->offset, &utp->offset) ||
+			__get_user(txc->freq, &utp->freq) ||
+			__get_user(txc->maxerror, &utp->maxerror) ||
+			__get_user(txc->esterror, &utp->esterror) ||
+			__get_user(txc->status, &utp->status) ||
+			__get_user(txc->constant, &utp->constant) ||
+			__get_user(txc->precision, &utp->precision) ||
+			__get_user(txc->tolerance, &utp->tolerance) ||
+			__get_user(txc->time.tv_sec, &utp->time.tv_sec) ||
+			__get_user(txc->time.tv_usec, &utp->time.tv_usec) ||
+			__get_user(txc->tick, &utp->tick) ||
+			__get_user(txc->ppsfreq, &utp->ppsfreq) ||
+			__get_user(txc->jitter, &utp->jitter) ||
+			__get_user(txc->shift, &utp->shift) ||
+			__get_user(txc->stabil, &utp->stabil) ||
+			__get_user(txc->jitcnt, &utp->jitcnt) ||
+			__get_user(txc->calcnt, &utp->calcnt) ||
+			__get_user(txc->errcnt, &utp->errcnt) ||
+			__get_user(txc->stbcnt, &utp->stbcnt))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc)
+{
+	if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
+			__put_user(txc->modes, &utp->modes) ||
+			__put_user(txc->offset, &utp->offset) ||
+			__put_user(txc->freq, &utp->freq) ||
+			__put_user(txc->maxerror, &utp->maxerror) ||
+			__put_user(txc->esterror, &utp->esterror) ||
+			__put_user(txc->status, &utp->status) ||
+			__put_user(txc->constant, &utp->constant) ||
+			__put_user(txc->precision, &utp->precision) ||
+			__put_user(txc->tolerance, &utp->tolerance) ||
+			__put_user(txc->time.tv_sec, &utp->time.tv_sec) ||
+			__put_user(txc->time.tv_usec, &utp->time.tv_usec) ||
+			__put_user(txc->tick, &utp->tick) ||
+			__put_user(txc->ppsfreq, &utp->ppsfreq) ||
+			__put_user(txc->jitter, &utp->jitter) ||
+			__put_user(txc->shift, &utp->shift) ||
+			__put_user(txc->stabil, &utp->stabil) ||
+			__put_user(txc->jitcnt, &utp->jitcnt) ||
+			__put_user(txc->calcnt, &utp->calcnt) ||
+			__put_user(txc->errcnt, &utp->errcnt) ||
+			__put_user(txc->stbcnt, &utp->stbcnt) ||
+			__put_user(txc->tai, &utp->tai))
+		return -EFAULT;
+	return 0;
+}
+
 asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
 		struct timezone __user *tz)
 {
@@ -617,6 +675,29 @@ long compat_sys_clock_gettime(clockid_t which_clock,
 	return err;
 }
 
+long compat_sys_clock_adjtime(clockid_t which_clock,
+		struct compat_timex __user *utp)
+{
+	struct timex txc;
+	mm_segment_t oldfs;
+	int err, ret;
+
+	err = compat_get_timex(&txc, utp);
+	if (err)
+		return err;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc);
+	set_fs(oldfs);
+
+	err = compat_put_timex(utp, &txc);
+	if (err)
+		return err;
+
+	return ret;
+}
+
 long compat_sys_clock_getres(clockid_t which_clock,
 		struct compat_timespec __user *tp)
 {
@@ -951,58 +1032,17 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
 asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
 {
 	struct timex txc;
-	int ret;
-
-	memset(&txc, 0, sizeof(struct timex));
+	int err, ret;
 
-	if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
-			__get_user(txc.modes, &utp->modes) ||
-			__get_user(txc.offset, &utp->offset) ||
-			__get_user(txc.freq, &utp->freq) ||
-			__get_user(txc.maxerror, &utp->maxerror) ||
-			__get_user(txc.esterror, &utp->esterror) ||
-			__get_user(txc.status, &utp->status) ||
-			__get_user(txc.constant, &utp->constant) ||
-			__get_user(txc.precision, &utp->precision) ||
-			__get_user(txc.tolerance, &utp->tolerance) ||
-			__get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-			__get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-			__get_user(txc.tick, &utp->tick) ||
-			__get_user(txc.ppsfreq, &utp->ppsfreq) ||
-			__get_user(txc.jitter, &utp->jitter) ||
-			__get_user(txc.shift, &utp->shift) ||
-			__get_user(txc.stabil, &utp->stabil) ||
-			__get_user(txc.jitcnt, &utp->jitcnt) ||
-			__get_user(txc.calcnt, &utp->calcnt) ||
-			__get_user(txc.errcnt, &utp->errcnt) ||
-			__get_user(txc.stbcnt, &utp->stbcnt))
-		return -EFAULT;
+	err = compat_get_timex(&txc, utp);
+	if (err)
+		return err;
 
 	ret = do_adjtimex(&txc);
 
-	if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
-			__put_user(txc.modes, &utp->modes) ||
-			__put_user(txc.offset, &utp->offset) ||
-			__put_user(txc.freq, &utp->freq) ||
-			__put_user(txc.maxerror, &utp->maxerror) ||
-			__put_user(txc.esterror, &utp->esterror) ||
-			__put_user(txc.status, &utp->status) ||
-			__put_user(txc.constant, &utp->constant) ||
-			__put_user(txc.precision, &utp->precision) ||
-			__put_user(txc.tolerance, &utp->tolerance) ||
-			__put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-			__put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-			__put_user(txc.tick, &utp->tick) ||
-			__put_user(txc.ppsfreq, &utp->ppsfreq) ||
-			__put_user(txc.jitter, &utp->jitter) ||
-			__put_user(txc.shift, &utp->shift) ||
-			__put_user(txc.stabil, &utp->stabil) ||
-			__put_user(txc.jitcnt, &utp->jitcnt) ||
-			__put_user(txc.calcnt, &utp->calcnt) ||
-			__put_user(txc.errcnt, &utp->errcnt) ||
-			__put_user(txc.stbcnt, &utp->stbcnt) ||
-			__put_user(txc.tai, &utp->tai))
-		ret = -EFAULT;
+	err = compat_put_timex(utp, &txc);
+	if (err)
+		return err;
 
 	return ret;
 }
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 05bb717..0206116 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -207,6 +207,10 @@ int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
 	return error;
 }
 
+int posix_cpu_clock_adj(const clockid_t which_clock, struct timex *tx)
+{
+	return -EOPNOTSUPP;
+}
 
 /*
  * Sample a per-thread clock for the given task.
@@ -1610,6 +1614,7 @@ static __init int init_posix_cpu_timers(void)
 		.clock_getres = process_cpu_clock_getres,
 		.clock_get = process_cpu_clock_get,
 		.clock_set = do_posix_clock_nosettime,
+		.clock_adj = do_posix_clock_noadjtime,
 		.timer_create = process_cpu_timer_create,
 		.nsleep = process_cpu_nsleep,
 		.nsleep_restart = process_cpu_nsleep_restart,
@@ -1618,6 +1623,7 @@ static __init int init_posix_cpu_timers(void)
 		.clock_getres = thread_cpu_clock_getres,
 		.clock_get = thread_cpu_clock_get,
 		.clock_set = do_posix_clock_nosettime,
+		.clock_adj = do_posix_clock_noadjtime,
 		.timer_create = thread_cpu_timer_create,
 		.nsleep = thread_cpu_nsleep,
 		.nsleep_restart = thread_cpu_nsleep_restart,
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 21b7ca2..7279cd1 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -203,6 +203,11 @@ static int common_timer_create(struct k_itimer *new_timer)
 	return 0;
 }
 
+static inline int common_clock_adj(const clockid_t which_clock, struct timex *t)
+{
+	return do_adjtimex(t);
+}
+
 static int no_timer_create(struct k_itimer *new_timer)
 {
 	return -EOPNOTSUPP;
@@ -279,11 +284,13 @@ static __init int init_posix_timers(void)
 		.clock_getres = hrtimer_get_res,
 		.clock_get = posix_ktime_get_ts,
 		.clock_set = do_posix_clock_nosettime,
+		.clock_adj = do_posix_clock_noadjtime,
 	};
 	struct k_clock clock_monotonic_raw = {
 		.clock_getres = hrtimer_get_res,
 		.clock_get = posix_get_monotonic_raw,
 		.clock_set = do_posix_clock_nosettime,
+		.clock_adj = do_posix_clock_noadjtime,
 		.timer_create = no_timer_create,
 		.nsleep = no_nsleep,
 	};
@@ -291,6 +298,7 @@ static __init int init_posix_timers(void)
 		.clock_getres = posix_get_coarse_res,
 		.clock_get = posix_get_realtime_coarse,
 		.clock_set = do_posix_clock_nosettime,
+		.clock_adj = do_posix_clock_noadjtime,
 		.timer_create = no_timer_create,
 		.nsleep = no_nsleep,
 	};
@@ -298,6 +306,7 @@ static __init int init_posix_timers(void)
 		.clock_getres = posix_get_coarse_res,
 		.clock_get = posix_get_monotonic_coarse,
 		.clock_set = do_posix_clock_nosettime,
+		.clock_adj = do_posix_clock_noadjtime,
 		.timer_create = no_timer_create,
 		.nsleep = no_nsleep,
 	};
@@ -934,6 +943,12 @@ int do_posix_clock_nosettime(const clockid_t clockid, const struct timespec *tp)
 }
 EXPORT_SYMBOL_GPL(do_posix_clock_nosettime);
 
+int do_posix_clock_noadjtime(const clockid_t which_clock, struct timex *t)
+{
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(do_posix_clock_noadjtime);
+
 int do_posix_clock_nonanosleep(const clockid_t clock, int flags,
 			       struct timespec *t, struct timespec __user *r)
 {
@@ -975,6 +990,26 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
 
 }
 
+SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
+		struct timex __user *, utx)
+{
+	struct timex ktx;
+	int err;
+
+	if (copy_from_user(&ktx, utx, sizeof(ktx)))
+		return -EFAULT;
+
+	if (invalid_clockid(which_clock))
+		return -EINVAL;
+
+	err = CLOCK_DISPATCH(which_clock, clock_adj, (which_clock, &ktx));
+
+	if (copy_to_user(utx, &ktx, sizeof(ktx)))
+		return -EFAULT;
+
+	return err;
+}
+
 SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
 		struct timespec __user *, tp)
 {
-- 
1.7.0.4

^ permalink raw reply related

* [PATCH V10 03/15] ntp: add ADJ_SETOFFSET mode bit
From: Richard Cochran @ 2011-01-27 10:55 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-api, netdev, Alan Cox, Arnd Bergmann, Christoph Lameter,
	David Miller, John Stultz, Krzysztof Halasa, Peter Zijlstra,
	Rodolfo Giometti, Thomas Gleixner, Benjamin Herrenschmidt,
	H. Peter Anvin, Ingo Molnar, Mike Frysinger, Paul Mackerras,
	Russell King
In-Reply-To: <cover.1296124770.git.richard.cochran@omicron.at>

This patch adds a new mode bit into the timex structure. When set, the bit
instructs the kernel to add the given time value to the current time.

Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
---
 include/linux/timex.h |    3 ++-
 kernel/time/ntp.c     |   11 +++++++++++
 2 files changed, 13 insertions(+), 1 deletions(-)

diff --git a/include/linux/timex.h b/include/linux/timex.h
index d23999f..aa60fe7 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -73,7 +73,7 @@ struct timex {
 	long tolerance;		/* clock frequency tolerance (ppm)
 				 * (read only)
 				 */
-	struct timeval time;	/* (read only) */
+	struct timeval time;	/* (read only, except for ADJ_SETOFFSET) */
 	long tick;		/* (modified) usecs between clock ticks */
 
 	long ppsfreq;           /* pps frequency (scaled ppm) (ro) */
@@ -102,6 +102,7 @@ struct timex {
 #define ADJ_STATUS		0x0010	/* clock status */
 #define ADJ_TIMECONST		0x0020	/* pll time constant */
 #define ADJ_TAI			0x0080	/* set TAI offset */
+#define ADJ_SETOFFSET		0x0100  /* add 'time' to current time */
 #define ADJ_MICRO		0x1000	/* select microsecond resolution */
 #define ADJ_NANO		0x2000	/* select nanosecond resolution */
 #define ADJ_TICK		0x4000	/* tick value */
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5c00242..34d1b64 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -646,6 +646,17 @@ int do_adjtimex(struct timex *txc)
 			hrtimer_cancel(&leap_timer);
 	}
 
+	if (txc->modes & ADJ_SETOFFSET) {
+		struct timespec delta;
+		if ((unsigned long)txc->time.tv_usec >= NSEC_PER_SEC)
+			return -EINVAL;
+		delta.tv_sec  = txc->time.tv_sec;
+		delta.tv_nsec = txc->time.tv_usec;
+		if (!(txc->modes & ADJ_NANO))
+			delta.tv_nsec *= 1000;
+		timekeeping_inject_offset(&delta);
+	}
+
 	getnstimeofday(&ts);
 
 	write_seqlock_irq(&xtime_lock);
-- 
1.7.0.4

^ permalink raw reply related

* [PATCH V10 02/15] time: Correct the *settime* parameters
From: Richard Cochran @ 2011-01-27 10:54 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-api, netdev, Alan Cox, Arnd Bergmann, Christoph Lameter,
	David Miller, John Stultz, Krzysztof Halasa, Peter Zijlstra,
	Rodolfo Giometti, Thomas Gleixner, Benjamin Herrenschmidt,
	H. Peter Anvin, Ingo Molnar, Mike Frysinger, Paul Mackerras,
	Russell King
In-Reply-To: <cover.1296124770.git.richard.cochran@omicron.at>

Both settimeofday() and clock_settime() promise with a 'const'
attribute not to alter the arguments passed in. This patch adds the
missing 'const' attribute into the various kernel functions
implementing these calls.

Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
---
 drivers/char/mmtimer.c       |    2 +-
 include/linux/posix-timers.h |    5 +++--
 include/linux/security.h     |    9 +++++----
 include/linux/time.h         |    5 +++--
 kernel/posix-timers.c        |    4 ++--
 kernel/time.c                |    2 +-
 kernel/time/timekeeping.c    |    2 +-
 security/commoncap.c         |    2 +-
 security/security.c          |    2 +-
 9 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c
index e6d7562..ecd0082 100644
--- a/drivers/char/mmtimer.c
+++ b/drivers/char/mmtimer.c
@@ -487,7 +487,7 @@ static int sgi_clock_get(clockid_t clockid, struct timespec *tp)
 	return 0;
 };
 
-static int sgi_clock_set(clockid_t clockid, struct timespec *tp)
+static int sgi_clock_set(const clockid_t clockid, const struct timespec *tp)
 {
 
 	u64 nsec;
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 3e23844..b2c14cb 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -69,7 +69,8 @@ struct k_itimer {
 struct k_clock {
 	int res;		/* in nanoseconds */
 	int (*clock_getres) (const clockid_t which_clock, struct timespec *tp);
-	int (*clock_set) (const clockid_t which_clock, struct timespec * tp);
+	int (*clock_set) (const clockid_t which_clock,
+			  const struct timespec *tp);
 	int (*clock_get) (const clockid_t which_clock, struct timespec * tp);
 	int (*timer_create) (struct k_itimer *timer);
 	int (*nsleep) (const clockid_t which_clock, int flags,
@@ -89,7 +90,7 @@ void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock);
 /* error handlers for timer_create, nanosleep and settime */
 int do_posix_clock_nonanosleep(const clockid_t, int flags, struct timespec *,
 			       struct timespec __user *);
-int do_posix_clock_nosettime(const clockid_t, struct timespec *tp);
+int do_posix_clock_nosettime(const clockid_t, const struct timespec *tp);
 
 /* function to call to trigger timer event */
 int posix_timer_event(struct k_itimer *timr, int si_private);
diff --git a/include/linux/security.h b/include/linux/security.h
index c642bb8..c096aa6 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -53,7 +53,7 @@ struct audit_krule;
  */
 extern int cap_capable(struct task_struct *tsk, const struct cred *cred,
 		       int cap, int audit);
-extern int cap_settime(struct timespec *ts, struct timezone *tz);
+extern int cap_settime(const struct timespec *ts, const struct timezone *tz);
 extern int cap_ptrace_access_check(struct task_struct *child, unsigned int mode);
 extern int cap_ptrace_traceme(struct task_struct *parent);
 extern int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted);
@@ -1387,7 +1387,7 @@ struct security_operations {
 	int (*quotactl) (int cmds, int type, int id, struct super_block *sb);
 	int (*quota_on) (struct dentry *dentry);
 	int (*syslog) (int type);
-	int (*settime) (struct timespec *ts, struct timezone *tz);
+	int (*settime) (const struct timespec *ts, const struct timezone *tz);
 	int (*vm_enough_memory) (struct mm_struct *mm, long pages);
 
 	int (*bprm_set_creds) (struct linux_binprm *bprm);
@@ -1669,7 +1669,7 @@ int security_sysctl(struct ctl_table *table, int op);
 int security_quotactl(int cmds, int type, int id, struct super_block *sb);
 int security_quota_on(struct dentry *dentry);
 int security_syslog(int type);
-int security_settime(struct timespec *ts, struct timezone *tz);
+int security_settime(const struct timespec *ts, const struct timezone *tz);
 int security_vm_enough_memory(long pages);
 int security_vm_enough_memory_mm(struct mm_struct *mm, long pages);
 int security_vm_enough_memory_kern(long pages);
@@ -1904,7 +1904,8 @@ static inline int security_syslog(int type)
 	return 0;
 }
 
-static inline int security_settime(struct timespec *ts, struct timezone *tz)
+static inline int security_settime(const struct timespec *ts,
+				   const struct timezone *tz)
 {
 	return cap_settime(ts, tz);
 }
diff --git a/include/linux/time.h b/include/linux/time.h
index 8863688..81f955f 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -147,8 +147,9 @@ static inline u32 arch_gettimeoffset(void) { return 0; }
 #endif
 
 extern void do_gettimeofday(struct timeval *tv);
-extern int do_settimeofday(struct timespec *tv);
-extern int do_sys_settimeofday(struct timespec *tv, struct timezone *tz);
+extern int do_settimeofday(const struct timespec *tv);
+extern int do_sys_settimeofday(const struct timespec *tv,
+			       const struct timezone *tz);
 #define do_posix_clock_monotonic_gettime(ts) ktime_get_ts(ts)
 extern long do_utimes(int dfd, const char __user *filename, struct timespec *times, int flags);
 struct itimerval;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 93bd2eb..21b7ca2 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -192,7 +192,7 @@ static int common_clock_get(clockid_t which_clock, struct timespec *tp)
 }
 
 static inline int common_clock_set(const clockid_t which_clock,
-				   struct timespec *tp)
+				   const struct timespec *tp)
 {
 	return do_sys_settimeofday(tp, NULL);
 }
@@ -928,7 +928,7 @@ void exit_itimers(struct signal_struct *sig)
 }
 
 /* Not available / possible... functions */
-int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp)
+int do_posix_clock_nosettime(const clockid_t clockid, const struct timespec *tp)
 {
 	return -EINVAL;
 }
diff --git a/kernel/time.c b/kernel/time.c
index 3217435..70ca1ee 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -150,7 +150,7 @@ static inline void warp_clock(void)
  * various programs will get confused when the clock gets warped.
  */
 
-int do_sys_settimeofday(struct timespec *tv, struct timezone *tz)
+int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
 {
 	static int firsttime = 1;
 	int error = 0;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 1e284a5..3879bf3 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -353,7 +353,7 @@ EXPORT_SYMBOL(do_gettimeofday);
  *
  * Sets the time of day to the new time and update NTP and notify hrtimers
  */
-int do_settimeofday(struct timespec *tv)
+int do_settimeofday(const struct timespec *tv)
 {
 	struct timespec ts_delta;
 	unsigned long flags;
diff --git a/security/commoncap.c b/security/commoncap.c
index 64c2ed9..dbfdaed 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -93,7 +93,7 @@ int cap_capable(struct task_struct *tsk, const struct cred *cred, int cap,
  * Determine whether the current process may set the system clock and timezone
  * information, returning 0 if permission granted, -ve if denied.
  */
-int cap_settime(struct timespec *ts, struct timezone *tz)
+int cap_settime(const struct timespec *ts, const struct timezone *tz)
 {
 	if (!capable(CAP_SYS_TIME))
 		return -EPERM;
diff --git a/security/security.c b/security/security.c
index 739e403..b995428 100644
--- a/security/security.c
+++ b/security/security.c
@@ -202,7 +202,7 @@ int security_syslog(int type)
 	return security_ops->syslog(type);
 }
 
-int security_settime(struct timespec *ts, struct timezone *tz)
+int security_settime(const struct timespec *ts, const struct timezone *tz)
 {
 	return security_ops->settime(ts, tz);
 }
-- 
1.7.0.4

^ permalink raw reply related

* [PATCH V10 01/15] time: Introduce timekeeping_inject_offset
From: John Stultz @ 2011-01-27 10:54 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-api, netdev, Alan Cox, Arnd Bergmann, Christoph Lameter,
	David Miller, John Stultz, Krzysztof Halasa, Peter Zijlstra,
	Rodolfo Giometti, Thomas Gleixner, Benjamin Herrenschmidt,
	H. Peter Anvin, Ingo Molnar, Mike Frysinger, Paul Mackerras,
	Russell King
In-Reply-To: <cover.1296124770.git.richard.cochran@omicron.at>

This adds a kernel-internal timekeeping interface to add or subtract
a fixed amount from CLOCK_REALTIME. This makes it so kernel users or
interfaces trying to do so do not have to read the time, then add an
offset and then call settimeofday(), which adds some extra error in
comparison to just simply adding the offset in the kernel timekeeping
core.

CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
---
 include/linux/time.h      |    1 +
 kernel/time/timekeeping.c |   36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 0 deletions(-)

diff --git a/include/linux/time.h b/include/linux/time.h
index 1e6d3b5..8863688 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -168,6 +168,7 @@ extern int timekeeping_valid_for_hres(void);
 extern u64 timekeeping_max_deferment(void);
 extern void update_wall_time(void);
 extern void timekeeping_leap_insert(int leapsecond);
+extern int timekeeping_inject_offset(struct timespec *ts);
 
 struct tms;
 extern void do_sys_times(struct tms *);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d27c756..1e284a5 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -387,6 +387,42 @@ int do_settimeofday(struct timespec *tv)
 
 EXPORT_SYMBOL(do_settimeofday);
 
+
+/**
+ * timekeeping_inject_offset - Adds or subtracts from the current time.
+ * @ts:		pointer to the timespec variable containing the offset
+ *
+ * Adds or subtracts an offset value from the current time.
+ */
+int timekeeping_inject_offset(struct timespec *ts)
+{
+	unsigned long flags;
+
+	if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
+		return -EINVAL;
+
+	write_seqlock_irqsave(&xtime_lock, flags);
+
+	timekeeping_forward_now();
+
+	xtime = timespec_add(xtime, *ts);
+	wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts);
+
+	timekeeper.ntp_error = 0;
+	ntp_clear();
+
+	update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
+				timekeeper.mult);
+
+	write_sequnlock_irqrestore(&xtime_lock, flags);
+
+	/* signal hrtimers about time change */
+	clock_was_set();
+
+	return 0;
+}
+EXPORT_SYMBOL(timekeeping_inject_offset);
+
 /**
  * change_clocksource - Swaps clocksources if a new one is available
  *
-- 
1.7.0.4

^ permalink raw reply related

* [PATCH V10 00/15] ptp: IEEE 1588 hardware clock support
From: Richard Cochran @ 2011-01-27 10:53 UTC (permalink / raw)
  To: linux-kernel-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-api-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	Alan Cox, Arnd Bergmann, Christoph Lameter, David Miller,
	John Stultz, Krzysztof Halasa, Peter Zijlstra, Rodolfo Giometti,
	Thomas Gleixner, Benjamin Herrenschmidt, H. Peter Anvin,
	Ingo Molnar, Mike Frysinger, Paul Mackerras, Russell King

Here is version 10 of the PHC patch series. We missed the boat for
2.6.38, but I hope this can be merged for the next one.

Reviewers: I think it's done. How can I get this merged?

Arch maintainers (arm, blackfin, powerpc, x86): I have added you on
CC in the hope of getting your acks for the new syscall.

Thanks,
Richard

* Patch ChangeLog
** v10
*** posix clocks
    - removed the (previously implied) default functions from the k_clocks
    - made the ts parameter 'const' in clock_set functions
*** ptp hardware clocks
    - updated to changed PPS kernel API

* Introduction

  The aim of this patch set is to add support for PTP Hardware Clocks
  (PHCs) into the Linux kernel.

  The first eleven patches provide infrastructure supporting dynamic
  POSIX clock devices. This new code will be useful for other kinds of
  new clocks, not just PHCs.

  The last four patches implement the PHC code.

  Support for obtaining timestamps from a PHC already exists via the
  SO_TIMESTAMPING socket option, integrated in kernel version 2.6.30.
  This patch set completes the picture by allowing user space programs to
  adjust the PHC and to control its ancillary features.

* Previous Discussions

  This patch set previously appeared on the netdev list. Since V5 of
  the character device patch set, the discussion has moved to the
  lkml.

  - IEEE 1588 hardware clock support [V5]
    http://lkml.org/lkml/2010/8/16/90

  - POSIX clock tuning syscall with static clock ids
    http://lkml.org/lkml/2010/8/23/49

  - POSIX clock tuning syscall with dynamic clock ids
    http://lkml.org/lkml/2010/9/3/119

  - IEEE 1588 hardware clock support [V6]
    http://lkml.org/lkml/2010/9/23/310

  - Dynamic clock devices [RFC]
    http://lkml.org/lkml/2010/11/4/290

  - IEEE 1588 hardware clock support [V7]
    http://lkml.org/lkml/2010/12/16/195

  - IEEE 1588 hardware clock support [V8]
    http://lkml.org/lkml/2010/12/31/128

  - IEEE 1588 hardware clock support [V9]
    http://lkml.org/lkml/2011/1/13/65


John Stultz (1):
  time: Introduce timekeeping_inject_offset

Richard Cochran (14):
  time: Correct the *settime* parameters
  ntp: add ADJ_SETOFFSET mode bit
  posix clocks: introduce a syscall for clock tuning.
  posix_clocks: add clock_adjtime for arm
  posix_clocks: add clock_adjtime for blackfin
  posix_clocks: add clock_adjtime for powerpc
  posix_clocks: add clock_adjtime for x86
  posix clocks: cleanup the CLOCK_DISPTACH macro
  posix clocks: remove useless default methods
  posix clocks: introduce dynamic clocks
  ptp: Added a brand new class driver for ptp clocks.
  ptp: Added a clock that uses the eTSEC found on the MPC85xx.
  ptp: Added a clock driver for the IXP46x.
  ptp: Added a clock driver for the National Semiconductor PHYTER.

 Documentation/ABI/testing/sysfs-ptp             |   98 +++
 Documentation/powerpc/dts-bindings/fsl/tsec.txt |   57 ++
 Documentation/ptp/ptp.txt                       |   89 +++
 Documentation/ptp/testptp.c                     |  352 +++++++++
 Documentation/ptp/testptp.mk                    |   33 +
 arch/arm/include/asm/unistd.h                   |    1 +
 arch/arm/kernel/calls.S                         |    1 +
 arch/arm/mach-ixp4xx/include/mach/ixp46x_ts.h   |   78 ++
 arch/blackfin/include/asm/unistd.h              |    3 +-
 arch/blackfin/mach-common/entry.S               |    1 +
 arch/powerpc/boot/dts/mpc8313erdb.dts           |   14 +
 arch/powerpc/boot/dts/mpc8572ds.dts             |   14 +
 arch/powerpc/boot/dts/p2020ds.dts               |   14 +
 arch/powerpc/boot/dts/p2020rdb.dts              |   14 +
 arch/powerpc/include/asm/systbl.h               |    1 +
 arch/powerpc/include/asm/unistd.h               |    3 +-
 arch/x86/ia32/ia32entry.S                       |    1 +
 arch/x86/include/asm/unistd_32.h                |    3 +-
 arch/x86/include/asm/unistd_64.h                |    2 +
 arch/x86/kernel/syscall_table_32.S              |    1 +
 drivers/Kconfig                                 |    2 +
 drivers/Makefile                                |    1 +
 drivers/char/mmtimer.c                          |    3 +-
 drivers/net/Makefile                            |    1 +
 drivers/net/arm/ixp4xx_eth.c                    |  192 +++++-
 drivers/net/gianfar_ptp.c                       |  448 +++++++++++
 drivers/net/gianfar_ptp_reg.h                   |  113 +++
 drivers/net/phy/Kconfig                         |   29 +
 drivers/net/phy/Makefile                        |    1 +
 drivers/net/phy/dp83640.c                       |  896 +++++++++++++++++++++++
 drivers/net/phy/dp83640_reg.h                   |  261 +++++++
 drivers/ptp/Kconfig                             |   53 ++
 drivers/ptp/Makefile                            |    7 +
 drivers/ptp/ptp_chardev.c                       |  144 ++++
 drivers/ptp/ptp_clock.c                         |  315 ++++++++
 drivers/ptp/ptp_ixp46x.c                        |  332 +++++++++
 drivers/ptp/ptp_private.h                       |   85 +++
 drivers/ptp/ptp_sysfs.c                         |  226 ++++++
 include/linux/Kbuild                            |    1 +
 include/linux/posix-clock.h                     |  150 ++++
 include/linux/posix-timers.h                    |   28 +-
 include/linux/ptp_clock.h                       |   79 ++
 include/linux/ptp_clock_kernel.h                |  141 ++++
 include/linux/security.h                        |    9 +-
 include/linux/syscalls.h                        |    2 +
 include/linux/time.h                            |    8 +-
 include/linux/timex.h                           |    3 +-
 kernel/compat.c                                 |  136 +++--
 kernel/posix-cpu-timers.c                       |    6 +
 kernel/posix-timers.c                           |  245 ++++---
 kernel/time.c                                   |    2 +-
 kernel/time/Makefile                            |    3 +-
 kernel/time/ntp.c                               |   11 +
 kernel/time/posix-clock-syscalls.h              |   39 +
 kernel/time/posix-clock.c                       |  432 +++++++++++
 kernel/time/timekeeping.c                       |   38 +-
 security/commoncap.c                            |    2 +-
 security/security.c                             |    2 +-
 58 files changed, 5062 insertions(+), 164 deletions(-)
 create mode 100644 Documentation/ABI/testing/sysfs-ptp
 create mode 100644 Documentation/ptp/ptp.txt
 create mode 100644 Documentation/ptp/testptp.c
 create mode 100644 Documentation/ptp/testptp.mk
 create mode 100644 arch/arm/mach-ixp4xx/include/mach/ixp46x_ts.h
 create mode 100644 drivers/net/gianfar_ptp.c
 create mode 100644 drivers/net/gianfar_ptp_reg.h
 create mode 100644 drivers/net/phy/dp83640.c
 create mode 100644 drivers/net/phy/dp83640_reg.h
 create mode 100644 drivers/ptp/Kconfig
 create mode 100644 drivers/ptp/Makefile
 create mode 100644 drivers/ptp/ptp_chardev.c
 create mode 100644 drivers/ptp/ptp_clock.c
 create mode 100644 drivers/ptp/ptp_ixp46x.c
 create mode 100644 drivers/ptp/ptp_private.h
 create mode 100644 drivers/ptp/ptp_sysfs.c
 create mode 100644 include/linux/posix-clock.h
 create mode 100644 include/linux/ptp_clock.h
 create mode 100644 include/linux/ptp_clock_kernel.h
 create mode 100644 kernel/time/posix-clock-syscalls.h
 create mode 100644 kernel/time/posix-clock.c

^ permalink raw reply

* Re: Realtek r8168C / r8169 driver VLAN TAG stripping
From: Anand Raj Manickam @ 2011-01-27 10:35 UTC (permalink / raw)
  To: Francois Romieu; +Cc: netdev, Hayes
In-Reply-To: <20110118012146.GA3704@electric-eye.fr.zoreil.com>

Hi ,
We upgraded to 2.6.36 kernel . The result is SAME.
The VLAN tag gets stripped ;-)
Do let me know if you need more info .
Thanks,
Anand


On Tue, Jan 18, 2011 at 6:51 AM, Francois Romieu <romieu@fr.zoreil.com> wrote:
> Anand Raj Manickam <anandrm@gmail.com> :
>> On Mon, Jan 17, 2011 at 11:52 AM, Anand Raj Manickam <anandrm@gmail.com> wrote:
> [...]
>> > This is the dmesg  for XID
>> >
>> > eth0: RTL8168c/8111c at 0xf9628000, 00:17:54:00:f6:62, XID 1c4000c0 IRQ 31
>> > r8169: mac_version = 0x16
>
> I do not have one of those (RTL_GIGA_MAC_VER_22) to check if it handles vlan
> correctly yet.
>
>> > r8169 Gigabit Ethernet driver 2.3LK-NAPI loaded
>> >
>> > Unfortunately , i m not able to upgrade my kernel now . If there is a
>> > Fix for it , that would be great !!
>
> I doubt there is a lot of glamour/fortune/fame in backporting the 89 r8169
> patches between v2.6.30 and v2.6.37 but you may help yourself and give it
> a try.
>
> --
> Ueimor
>

^ permalink raw reply

* Re: [RFC PATCH] net: Implement read-only protection and COW'ing of metrics.
From: Eric Dumazet @ 2011-01-27 10:20 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <1296122511.3027.11.camel@edumazet-laptop>

Le jeudi 27 janvier 2011 à 11:01 +0100, Eric Dumazet a écrit :
> Le mercredi 26 janvier 2011 à 15:25 -0800, David Miller a écrit :
> > Eric, thanks again for your feedback.  I've taken a stab at fixing the
> > various races, in particular the one you discovered about metrics
> > sharing and how this interacts with fib_info releases.
> > 
> > What I've chosen to do is two-fold:
> > 
> > 1) Update ->_metrics atomically with cmpxchg once a route becomes publicly
> >    visible.
> > 
> > 2) Remember and grab a reference to the fib_info for shared read-only
> >    metrics in rt->fi, then release it once the metrics reference goes
> >    away.
> > 
> > It sounds expensive but hear me out :-)
> > 
> > First of all, at rt_set_nexthop() time, the atomic we use to grab a
> > ref to the fib_info is replacing a 60-byte memcpy() into the dst
> > metrics.
> > 
> > Next, the ->_metrics atomic to un-COW the metrics at destroy time
> > might in fact be overkill.  Especially once writable metrics live in
> > the inetpeer cache (that's the next set of patches after this one).
> > 
> > Finally, once this change is stabilized we can be a lot smarter about
> > what we do at the time an entry is created.  For example, when a route
> > is looked up for a TCP socket, we essentially know we are going to COW
> > the route %99.99999 of the time.  So we can pass a hint into TCP's
> > route lookups in the flow flags field telling it to pre-COW the route.
> > 
> > TCP pre-COW'ing of metrics will thus save several atomics.
> > 
> > Anyways, here is the patch, it is only build tested at this point, but
> > I wanted to get feedback from you about the basic gist of things
> > as soon as possible.
> > 
> > Thanks!
> > 
> 
> Thanks David, I read this (I am a bit busy preparing my travel to
> Reunion/Maurice islands). This looks pretty nice. I have one comment :
> 
> >  
> > +u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
> > +{
> > +	u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
> > +
> > +	if (p) {
> > +		u32 *old_p = __DST_METRICS_PTR(old);
> > +		unsigned long prev, new;
> > +
> > +		memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
> > +
> > +		new = (unsigned long) p;
> > +		prev = cmpxchg(&dst->_metrics, old, new);
> > +
> > +		if (prev != old) {
> > +			kfree(p);
> > +			p = __DST_METRICS_PTR(prev);
> > +			if (prev & DST_METRICS_READ_ONLY)
> > +				p = NULL;
> > +		}
> > +	}
> > +	return p;
> > +}
> > +EXPORT_SYMBOL(dst_cow_metrics_generic);
> > +
> ...
> 
> > diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> > index 3e5b7cc..7fc6301 100644
> > --- a/net/ipv4/route.c
> > +++ b/net/ipv4/route.c
> > @@ -152,6 +152,36 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
> >  {
> >  }
> >  
> > +static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
> > +{
> > +	u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
> > +
> > +	if (p) {
> > +		u32 *old_p = __DST_METRICS_PTR(old);
> > +		unsigned long prev, new;
> > +
> > +		memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
> > +
> > +		new = (unsigned long) p;
> > +		prev = cmpxchg(&dst->_metrics, old, new);
> > +
> > +		if (prev != old) {
> > +			kfree(p);
> > +			p = __DST_METRICS_PTR(prev);
> > +			if (prev & DST_METRICS_READ_ONLY)
> > +				p = NULL;
> > +		} else {
> 
> Hmm, I first asked myself why you dont use dst_cow_metrics_generic() to
> perform the generic allocation, but saw following :
> 
> > +			struct rtable *rt = (struct rtable *) dst;
> > +
> 
> Since you use cmpxchg() to permut the dst->_metrics, I feel this rt->fi
> needs some protection as well. Maybe store fi pointer inside the metrics
> instead of dst, or else you need a spinlock to perform the whole
> transaction (change dst->_metrics & rt->fi) ?
> 
> > +			if (rt->fi) {
> > +				fib_info_put(rt->fi);
> > +				rt->fi = NULL;
> > +			}
> > +		}
> > +	}
> > +	return p;
> > +}
> > +
> 


Hmm, reading again, I realize the rt->fi was set only when installing
the readonly metrics, so ignore my previous mail.

Thanks !



^ permalink raw reply

* SO_REUSEPORT - can it be done in kernel?
From: Daniel Baluta @ 2011-01-27 10:07 UTC (permalink / raw)
  To: therbert; +Cc: netdev

Hi Tom,

How did you solve the issue regarding scaling TCP listeners?
I think SO_REUSEPORT proposed by patch [1] can be a good
start. Were there any follow-ups?

Also, solving the problem in user space can be an option. I want
to run multiple instances of a DNS server on a multi core system.
Any suggestions would be welcomed.

SO_REUSEPORT option seems to be already there [2]. Were
there any plans to have a kernel implementation?

thanks,
Daniel.

[1] http://amailbox.org/mailarchive/linux-netdev/2010/4/19/6274993
[2] http://lxr.linux.no/linux+v2.6.37/include/asm-generic/socket.h#L25

^ permalink raw reply

* Re: [PATCH 12/20] x25: remove the BKL
From: Andrew Hendry @ 2011-01-27 10:07 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: linux-kernel, linux-x25, netdev
In-Reply-To: <1295993854-4971-13-git-send-email-arnd@arndb.de>

Left it running and put about 3.0G through x.25, it was running fine
until after about 20 hours.
I was stopping the test programs and hit this.

Jan 27 20:18:34 jaunty kernel: [80403.945790] PGD 1d8b00067 PUD 1ddec3067 PMD 0
Jan 27 20:18:34 jaunty kernel: [80403.945836] CPU 3
Jan 27 20:18:34 jaunty kernel: [80403.945842] Modules linked in: x25
nls_cp437 cifs binfmt_misc kvm_intel kvm snd_hda_codec_via
snd_usb_audio snd_hda_intel snd_hda_codec nouveau snd_pcm_oss
snd_mixer_oss snd_pcm snd_seq_dummy snd_hwdep snd_usbmidi_lib
snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq
psmouse snd_timer snd_seq_device serio_raw fbcon ttm tileblit font
bitblit softcursor xhci_hcd drm_kms_helper snd drm asus_atk0110
soundcore snd_page_alloc i2c_algo_bit video usbhid hid usb_storage
r8169 pata_jmicron ahci mii libahci
Jan 27 20:18:34 jaunty kernel: [80403.946026]
Jan 27 20:18:34 jaunty kernel: [80403.946034] Pid: 28187, comm:
x25echotest Not tainted 2.6.38-rc2+ #41 P7P55D-E PRO/System Product
Name
Jan 27 20:18:34 jaunty kernel: [80403.946050] RIP:
0010:[<ffffffffa026f197>]  [<ffffffffa026f197>]
x25_sendmsg+0x1a7/0x530 [x25]
Jan 27 20:18:34 jaunty kernel: [80403.946072] RSP:
0018:ffff880228dbfcb8  EFLAGS: 00010246
Jan 27 20:18:34 jaunty kernel: [80403.946083] RAX: 0000000000000080
RBX: ffff880228dbfd70 RCX: ffff880228dbfce4
Jan 27 20:18:34 jaunty kernel: [80403.946096] RDX: 00000000fffffe00
RSI: 0000000000000000 RDI: ffff8801ba89f050
Jan 27 20:18:34 jaunty kernel: [80403.946109] RBP: ffff880228dbfd18
R08: ffff88022aa91000 R09: 0000000000000000
Jan 27 20:18:34 jaunty kernel: [80403.946482] R10: 0000000000000000
R11: 0000000000000000 R12: ffff8801ba89f000
Jan 27 20:18:34 jaunty kernel: [80403.946495] R13: 0000000000000000
R14: 0000000000000000 R15: 0000000000000000
Jan 27 20:18:34 jaunty kernel: [80403.946509] FS:
00007f09b3013700(0000) GS:ffff8800bf460000(0000)
knlGS:0000000000000000
Jan 27 20:18:34 jaunty kernel: [80403.946523] CS:  0010 DS: 0000 ES:
0000 CR0: 000000008005003b
Jan 27 20:18:34 jaunty kernel: [80403.946534] CR2: 00000000000000b4
CR3: 00000001df992000 CR4: 00000000000006e0
Jan 27 20:18:34 jaunty kernel: [80403.946547] DR0: 0000000000000000
DR1: 0000000000000000 DR2: 0000000000000000
Jan 27 20:18:34 jaunty kernel: [80403.946560] DR3: 0000000000000000
DR6: 00000000ffff0ff0 DR7: 0000000000000400
Jan 27 20:18:34 jaunty kernel: [80403.946574] Process x25echotest
(pid: 28187, threadinfo ffff880228dbe000, task ffff8801d89bc320)
Jan 27 20:18:34 jaunty kernel: [80403.946594]  ffff880200000008
0000000000000016 0000303030390009 0000000000000000
Jan 27 20:18:34 jaunty kernel: [80403.946616]  ffff880228db0000
fffffe00d8832450 0000000000000000 ffff880228dbfd38
Jan 27 20:18:34 jaunty kernel: [80403.946638]  ffff880228dbfec8
ffff880228dbfdf8 ffff8801de73b980 ffff880228dbfd70
Jan 27 20:18:34 jaunty kernel: [80403.946671]  [<ffffffff8140cdd3>]
sock_aio_write+0x183/0x1a0
Jan 27 20:18:34 jaunty kernel: [80403.946686]  [<ffffffff8110304c>] ?
__pte_alloc+0xdc/0x100
Jan 27 20:18:34 jaunty kernel: [80403.946700]  [<ffffffff81138a5a>]
do_sync_write+0xda/0x120
Jan 27 20:18:34 jaunty kernel: [80403.946713]  [<ffffffff8140d026>] ?
move_addr_to_user+0x86/0xa0
Jan 27 20:18:34 jaunty kernel: [80403.946729]  [<ffffffff812431a3>] ?
security_file_permission+0x23/0x90
Jan 27 20:18:34 jaunty kernel: [80403.946743]  [<ffffffff8113903e>]
vfs_write+0x15e/0x180
Jan 27 20:18:34 jaunty kernel: [80403.946757]  [<ffffffff81139151>]
sys_write+0x51/0x90
Jan 27 20:18:34 jaunty kernel: [80403.946771]  [<ffffffff8100bf42>]
system_call_fastpath+0x16/0x1b
Jan 27 20:18:34 jaunty kernel: [80403.946973]  RSP <ffff880228dbfcb8>
Jan 27 20:18:34 jaunty kernel: [80403.950010] ---[ end trace
36cd53b6ce0d6f4b ]---

If i have done it right, x25_sendmsg+0x1a7/0x530 is the skb_reserve
which gets inlined here.
(af_x25.c)
        /* Build a packet */
        SOCK_DEBUG(sk, "x25_sendmsg: sendto: building packet.\n");

        if ((msg->msg_flags & MSG_OOB) && len > 32)
                len = 32;

        size = len + X25_MAX_L2_LEN + X25_EXT_MIN_LEN;

        release_sock(sk);
        skb = sock_alloc_send_skb(sk, size, noblock, &rc);
        lock_sock(sk);

        X25_SKB_CB(skb)->flags = msg->msg_flags;

        skb_reserve(skb, X25_MAX_L2_LEN + X25_EXT_MIN_LEN);

        /*
         *      Put the data on the end
         */
        SOCK_DEBUG(sk, "x25_sendmsg: Copying user data\n");



objdump -dS show it at 2197 here.

static inline void skb_reserve(struct sk_buff *skb, int len)
{
        skb->data += len;
        skb->tail += len;
    2197:       41 83 87 b4 00 00 00    addl   $0x16,0xb4(%r15) <---
    219e:       16
    219f:       41 89 47 28             mov    %eax,0x28(%r15)
    21a3:       49 8b 87 c8 00 00 00    mov    0xc8(%r15),%rax
    21aa:       48 83 c0 16             add    $0x16,%rax
        skb_reserve(skb, X25_MAX_L2_LEN + X25_EXT_MIN_LEN);

But im not sure where to go from there...



On Wed, Jan 26, 2011 at 9:17 AM, Arnd Bergmann <arnd@arndb.de> wrote:
>
> This replaces all instances of lock_kernel in x25
> with lock_sock, taking care to release the socket
> lock around sleeping functions (sock_alloc_send_skb
> and skb_recv_datagram). It is not clear whether
> this is a correct solution, but it seems to be what
> other protocols do in the same situation.
>
> Compile-tested only.
>
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>
> Cc: Andrew Hendry <andrew.hendry@gmail.com>
> Cc: linux-x25@vger.kernel.org
> Cc: netdev@vger.kernel.org
> ---
>  net/x25/Kconfig   |    1 -
>  net/x25/af_x25.c  |   61 ++++++++++++++++------------------------------------
>  net/x25/x25_out.c |    7 ++++-
>  3 files changed, 24 insertions(+), 45 deletions(-)
>
> diff --git a/net/x25/Kconfig b/net/x25/Kconfig
> index 2196e55..e6759c9 100644
> --- a/net/x25/Kconfig
> +++ b/net/x25/Kconfig
> @@ -5,7 +5,6 @@
>  config X25
>        tristate "CCITT X.25 Packet Layer (EXPERIMENTAL)"
>        depends on EXPERIMENTAL
> -       depends on BKL # should be fixable
>        ---help---
>          X.25 is a set of standardized network protocols, similar in scope to
>          frame relay; the one physical line from your box to the X.25 network
> diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
> index ad96ee9..8f5d1bb 100644
> --- a/net/x25/af_x25.c
> +++ b/net/x25/af_x25.c
> @@ -40,7 +40,6 @@
>  #include <linux/errno.h>
>  #include <linux/kernel.h>
>  #include <linux/sched.h>
> -#include <linux/smp_lock.h>
>  #include <linux/timer.h>
>  #include <linux/string.h>
>  #include <linux/net.h>
> @@ -432,15 +431,6 @@ void x25_destroy_socket_from_timer(struct sock *sk)
>        sock_put(sk);
>  }
>
> -static void x25_destroy_socket(struct sock *sk)
> -{
> -       sock_hold(sk);
> -       lock_sock(sk);
> -       __x25_destroy_socket(sk);
> -       release_sock(sk);
> -       sock_put(sk);
> -}
> -
>  /*
>  *     Handling for system calls applied via the various interfaces to a
>  *     X.25 socket object.
> @@ -647,18 +637,19 @@ static int x25_release(struct socket *sock)
>        struct sock *sk = sock->sk;
>        struct x25_sock *x25;
>
> -       lock_kernel();
>        if (!sk)
> -               goto out;
> +               return 0;
>
>        x25 = x25_sk(sk);
>
> +       sock_hold(sk);
> +       lock_sock(sk);
>        switch (x25->state) {
>
>                case X25_STATE_0:
>                case X25_STATE_2:
>                        x25_disconnect(sk, 0, 0, 0);
> -                       x25_destroy_socket(sk);
> +                       __x25_destroy_socket(sk);
>                        goto out;
>
>                case X25_STATE_1:
> @@ -678,7 +669,8 @@ static int x25_release(struct socket *sock)
>
>        sock_orphan(sk);
>  out:
> -       unlock_kernel();
> +       release_sock(sk);
> +       sock_put(sk);
>        return 0;
>  }
>
> @@ -1085,7 +1077,7 @@ static int x25_sendmsg(struct kiocb *iocb, struct socket *sock,
>        size_t size;
>        int qbit = 0, rc = -EINVAL;
>
> -       lock_kernel();
> +       lock_sock(sk);
>        if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_OOB|MSG_EOR|MSG_CMSG_COMPAT))
>                goto out;
>
> @@ -1148,9 +1140,10 @@ static int x25_sendmsg(struct kiocb *iocb, struct socket *sock,
>
>        size = len + X25_MAX_L2_LEN + X25_EXT_MIN_LEN;
>
> +       release_sock(sk);
>        skb = sock_alloc_send_skb(sk, size, noblock, &rc);
> -       if (!skb)
> -               goto out;
> +       lock_sock(sk);
> +
>        X25_SKB_CB(skb)->flags = msg->msg_flags;
>
>        skb_reserve(skb, X25_MAX_L2_LEN + X25_EXT_MIN_LEN);
> @@ -1231,26 +1224,10 @@ static int x25_sendmsg(struct kiocb *iocb, struct socket *sock,
>                        len++;
>        }
>
> -       /*
> -        * lock_sock() is currently only used to serialize this x25_kick()
> -        * against input-driven x25_kick() calls. It currently only blocks
> -        * incoming packets for this socket and does not protect against
> -        * any other socket state changes and is not called from anywhere
> -        * else. As x25_kick() cannot block and as long as all socket
> -        * operations are BKL-wrapped, we don't need take to care about
> -        * purging the backlog queue in x25_release().
> -        *
> -        * Using lock_sock() to protect all socket operations entirely
> -        * (and making the whole x25 stack SMP aware) unfortunately would
> -        * require major changes to {send,recv}msg and skb allocation methods.
> -        * -> 2.5 ;)
> -        */
> -       lock_sock(sk);
>        x25_kick(sk);
> -       release_sock(sk);
>        rc = len;
>  out:
> -       unlock_kernel();
> +       release_sock(sk);
>        return rc;
>  out_kfree_skb:
>        kfree_skb(skb);
> @@ -1271,7 +1248,7 @@ static int x25_recvmsg(struct kiocb *iocb, struct socket *sock,
>        unsigned char *asmptr;
>        int rc = -ENOTCONN;
>
> -       lock_kernel();
> +       lock_sock(sk);
>        /*
>         * This works for seqpacket too. The receiver has ordered the queue for
>         * us! We do one quick check first though
> @@ -1300,8 +1277,10 @@ static int x25_recvmsg(struct kiocb *iocb, struct socket *sock,
>                msg->msg_flags |= MSG_OOB;
>        } else {
>                /* Now we can treat all alike */
> +               release_sock(sk);
>                skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
>                                        flags & MSG_DONTWAIT, &rc);
> +               lock_sock(sk);
>                if (!skb)
>                        goto out;
>
> @@ -1338,14 +1317,12 @@ static int x25_recvmsg(struct kiocb *iocb, struct socket *sock,
>
>        msg->msg_namelen = sizeof(struct sockaddr_x25);
>
> -       lock_sock(sk);
>        x25_check_rbuf(sk);
> -       release_sock(sk);
>        rc = copied;
>  out_free_dgram:
>        skb_free_datagram(sk, skb);
>  out:
> -       unlock_kernel();
> +       release_sock(sk);
>        return rc;
>  }
>
> @@ -1581,18 +1558,18 @@ out_cud_release:
>
>                case SIOCX25CALLACCPTAPPRV: {
>                        rc = -EINVAL;
> -                       lock_kernel();
> +                       lock_sock(sk);
>                        if (sk->sk_state != TCP_CLOSE)
>                                break;
>                        clear_bit(X25_ACCPT_APPRV_FLAG, &x25->flags);
> -                       unlock_kernel();
> +                       release_sock(sk);
>                        rc = 0;
>                        break;
>                }
>
>                case SIOCX25SENDCALLACCPT:  {
>                        rc = -EINVAL;
> -                       lock_kernel();
> +                       lock_sock(sk);
>                        if (sk->sk_state != TCP_ESTABLISHED)
>                                break;
>                        /* must call accptapprv above */
> @@ -1600,7 +1577,7 @@ out_cud_release:
>                                break;
>                        x25_write_internal(sk, X25_CALL_ACCEPTED);
>                        x25->state = X25_STATE_3;
> -                       unlock_kernel();
> +                       release_sock(sk);
>                        rc = 0;
>                        break;
>                }
> diff --git a/net/x25/x25_out.c b/net/x25/x25_out.c
> index d00649f..f1a6ff1 100644
> --- a/net/x25/x25_out.c
> +++ b/net/x25/x25_out.c
> @@ -68,8 +68,11 @@ int x25_output(struct sock *sk, struct sk_buff *skb)
>                frontlen = skb_headroom(skb);
>
>                while (skb->len > 0) {
> -                       if ((skbn = sock_alloc_send_skb(sk, frontlen + max_len,
> -                                                       noblock, &err)) == NULL){
> +                       release_sock(sk);
> +                       skbn = sock_alloc_send_skb(sk, frontlen + max_len,
> +                                                  1, &err);
> +                       lock_sock(sk);
> +                       if (!skbn) {
>                                if (err == -EWOULDBLOCK && noblock){
>                                        kfree_skb(skb);
>                                        return sent;
> --
> 1.7.1
>

^ permalink raw reply

* Re: [RFC PATCH] net: Implement read-only protection and COW'ing of metrics.
From: Eric Dumazet @ 2011-01-27 10:01 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20110126.152538.260074157.davem@davemloft.net>

Le mercredi 26 janvier 2011 à 15:25 -0800, David Miller a écrit :
> Eric, thanks again for your feedback.  I've taken a stab at fixing the
> various races, in particular the one you discovered about metrics
> sharing and how this interacts with fib_info releases.
> 
> What I've chosen to do is two-fold:
> 
> 1) Update ->_metrics atomically with cmpxchg once a route becomes publicly
>    visible.
> 
> 2) Remember and grab a reference to the fib_info for shared read-only
>    metrics in rt->fi, then release it once the metrics reference goes
>    away.
> 
> It sounds expensive but hear me out :-)
> 
> First of all, at rt_set_nexthop() time, the atomic we use to grab a
> ref to the fib_info is replacing a 60-byte memcpy() into the dst
> metrics.
> 
> Next, the ->_metrics atomic to un-COW the metrics at destroy time
> might in fact be overkill.  Especially once writable metrics live in
> the inetpeer cache (that's the next set of patches after this one).
> 
> Finally, once this change is stabilized we can be a lot smarter about
> what we do at the time an entry is created.  For example, when a route
> is looked up for a TCP socket, we essentially know we are going to COW
> the route %99.99999 of the time.  So we can pass a hint into TCP's
> route lookups in the flow flags field telling it to pre-COW the route.
> 
> TCP pre-COW'ing of metrics will thus save several atomics.
> 
> Anyways, here is the patch, it is only build tested at this point, but
> I wanted to get feedback from you about the basic gist of things
> as soon as possible.
> 
> Thanks!
> 

Thanks David, I read this (I am a bit busy preparing my travel to
Reunion/Maurice islands). This looks pretty nice. I have one comment :

>  
> +u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
> +{
> +	u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
> +
> +	if (p) {
> +		u32 *old_p = __DST_METRICS_PTR(old);
> +		unsigned long prev, new;
> +
> +		memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
> +
> +		new = (unsigned long) p;
> +		prev = cmpxchg(&dst->_metrics, old, new);
> +
> +		if (prev != old) {
> +			kfree(p);
> +			p = __DST_METRICS_PTR(prev);
> +			if (prev & DST_METRICS_READ_ONLY)
> +				p = NULL;
> +		}
> +	}
> +	return p;
> +}
> +EXPORT_SYMBOL(dst_cow_metrics_generic);
> +
...

> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 3e5b7cc..7fc6301 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -152,6 +152,36 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
>  {
>  }
>  
> +static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
> +{
> +	u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
> +
> +	if (p) {
> +		u32 *old_p = __DST_METRICS_PTR(old);
> +		unsigned long prev, new;
> +
> +		memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
> +
> +		new = (unsigned long) p;
> +		prev = cmpxchg(&dst->_metrics, old, new);
> +
> +		if (prev != old) {
> +			kfree(p);
> +			p = __DST_METRICS_PTR(prev);
> +			if (prev & DST_METRICS_READ_ONLY)
> +				p = NULL;
> +		} else {

Hmm, I first asked myself why you dont use dst_cow_metrics_generic() to
perform the generic allocation, but saw following :

> +			struct rtable *rt = (struct rtable *) dst;
> +

Since you use cmpxchg() to permut the dst->_metrics, I feel this rt->fi
needs some protection as well. Maybe store fi pointer inside the metrics
instead of dst, or else you need a spinlock to perform the whole
transaction (change dst->_metrics & rt->fi) ?

> +			if (rt->fi) {
> +				fib_info_put(rt->fi);
> +				rt->fi = NULL;
> +			}
> +		}
> +	}
> +	return p;
> +}
> +



^ permalink raw reply

* Re: [PATCH] netfilter: CONNMARK: support save the mark of the master connection
From: Changli Gao @ 2011-01-27 10:01 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: David S. Miller, netfilter-devel, netdev
In-Reply-To: <4D413F96.2070309@trash.net>

On Thu, Jan 27, 2011 at 5:49 PM, Patrick McHardy <kaber@trash.net> wrote:
> Am 27.01.2011 10:38, schrieb Changli Gao:
>> In some cases(Policy routing), it is expected that all the sub-connections
>> share the same mark with their master.
>
> We already automatically mark expected connections with the mark of the
> master. What purpose does your patch serve on top of that?
>
>

It is done in init_conntrack(), indeed. Sorry for the noise. Thanks.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: [PATCH] netfilter: CONNMARK: support save the mark of the master connection
From: Patrick McHardy @ 2011-01-27  9:49 UTC (permalink / raw)
  To: Changli Gao; +Cc: David S. Miller, netfilter-devel, netdev
In-Reply-To: <1296121124-7016-1-git-send-email-xiaosuo@gmail.com>

Am 27.01.2011 10:38, schrieb Changli Gao:
> In some cases(Policy routing), it is expected that all the sub-connections
> share the same mark with their master.

We already automatically mark expected connections with the mark of the
master. What purpose does your patch serve on top of that?

^ permalink raw reply

* [PATCH] netfilter: CONNMARK: support save the mark of the master connection
From: Changli Gao @ 2011-01-27  9:38 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: David S. Miller, netfilter-devel, netdev, Changli Gao

In some cases(Policy routing), it is expected that all the sub-connections
share the same mark with their master.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
---
 include/linux/netfilter/xt_connmark.h |    3 ++-
 net/netfilter/xt_connmark.c           |   15 +++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)
diff --git a/include/linux/netfilter/xt_connmark.h b/include/linux/netfilter/xt_connmark.h
index efc17a8..4b513f8 100644
--- a/include/linux/netfilter/xt_connmark.h
+++ b/include/linux/netfilter/xt_connmark.h
@@ -15,7 +15,8 @@
 enum {
 	XT_CONNMARK_SET = 0,
 	XT_CONNMARK_SAVE,
-	XT_CONNMARK_RESTORE
+	XT_CONNMARK_RESTORE,
+	XT_CONNMARK_SAVE_MASTER,
 };
 
 struct xt_connmark_tginfo1 {
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index 7278145..4207bb6 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -69,6 +69,21 @@ connmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
 		          (ct->mark & info->ctmask);
 		skb->mark = newmark;
 		break;
+	case XT_CONNMARK_SAVE_MASTER:
+		if (ct->master) {
+			struct nf_conn *master;
+
+			master = ct->master;
+			while (master->master)
+				master = master->master;
+			newmark = (ct->mark & ~info->ctmask) ^
+				  (master->mark & info->nfmask);
+			if (ct->mark != newmark) {
+				ct->mark = newmark;
+				nf_conntrack_event_cache(IPCT_MARK, ct);
+			}
+		}
+		break;
 	}
 
 	return XT_CONTINUE;

^ permalink raw reply related

* Re: [PATCH net-next-2.6] drivers/net: remove some rcu sparse warnings
From: Arnd Bergmann @ 2011-01-27  9:22 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev, Michael Chan, Eilon Greenstein
In-Reply-To: <1296106103.1783.114.camel@edumazet-laptop>

On Thursday 27 January 2011, Eric Dumazet wrote:
> Add missing __rcu annotations and helpers.
> minor : Fix some rcu_dereference() calls in macvtap
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> CC: Arnd Bergmann <arnd@arndb.de>
> CC: Michael Chan <mchan@broadcom.com>
> CC: Eilon Greenstein <eilong@broadcom.com>

Macvtap bits look good, thanks!

Acked-by: Arnd Bergmann <arnd@arndb.de>


^ permalink raw reply

* Re: [PATCH] pch_gbe: Fix the issue that the receiving data is not normal.
From: Toshiharu Okada @ 2011-01-27  8:45 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, linux-kernel, qi.wang, yong.y.wang, andrew.chih.howe.khor,
	joel.clark, kok.howg.ewe
In-Reply-To: <20110125.133240.59688304.davem@davemloft.net>

Hi David

Thank you for your comment.
I will confirm them and will submit the modified patch.

Best regards
Toshiharu Okada(OKI semiconductor)
----- Original Message ----- 
From: "David Miller" <davem@davemloft.net>
To: <toshiharu-linux@dsn.okisemi.com>
Cc: <netdev@vger.kernel.org>; <linux-kernel@vger.kernel.org>; 
<qi.wang@intel.com>; <yong.y.wang@intel.com>; 
<andrew.chih.howe.khor@intel.com>; <joel.clark@intel.com>; 
<kok.howg.ewe@intel.com>
Sent: Wednesday, January 26, 2011 6:32 AM
Subject: Re: [PATCH] pch_gbe: Fix the issue that the receiving data is not 
normal.

From: Toshiharu Okada <toshiharu-linux@dsn.okisemi.com>
Date: Mon, 24 Jan 2011 13:43:31 +0900

> This PCH_GBE driver had an issue that the receiving data is not normal.
> This driver had not removed correctly the padding data
> which the DMA include in receiving data.
>
> This patch fixed this issue.
>
> Signed-off-by: Toshiharu Okada <toshiharu-linux@dsn.okisemi.com>

There are bugs in these changes:

>  if (skb_copy_flag) { /* recycle  skb */
>  struct sk_buff *new_skb;
>  new_skb =
> -     netdev_alloc_skb(netdev,
> -      length + NET_IP_ALIGN);
> +     netdev_alloc_skb(netdev, length);
>  if (new_skb) {
>  if (!skb_padding_flag) {
>  skb_reserve(new_skb,
> - NET_IP_ALIGN);
> + PCH_GBE_DMA_PADDING);
>  }
>  memcpy(new_skb->data, skb->data,
>  length);

If "!skb_padding_flag" then you will write past the end of the SKB
data in that memcpy.

You cannot allocate only "length" then proceed to reserve PCH_GBE_DMA_PADDING
and then add "length" worth of data on top of that.  In such a case you
must allocate at least "length + PCH_GBE_DMA_PADDING".

Furthermore you _MUST_ respect NET_IP_ALIGN.  Some platforms set this value
to "0", because otherwise performance suffers greatly.

There are two separate issues, removing the padding bytes provided by
the device, and aligning the IP headers as wanted by the cpu
architecture.  Therefore they should be handled separately, and we
therefore should still see references to NET_IP_ALIGN in your patch.

^ permalink raw reply

* Re: TSO/GRO/LRO/somethingO breaks LVS on 2.6.36
From: Eric Dumazet @ 2011-01-27  7:42 UTC (permalink / raw)
  To: Simon Kirby; +Cc: Simon Horman, netdev
In-Reply-To: <20110127004805.GB11578@hostway.ca>

Le mercredi 26 janvier 2011 à 16:48 -0800, Simon Kirby a écrit :
> On Thu, Jan 13, 2011 at 03:34:22PM +0900, Simon Horman wrote:
> 
> > Hi Simon,
> > 
> > thanks for prodding me to respond to this post offline and sorry for not
> > responding earlier.
> > 
> > Firstly, I think that this is a receive-side problem so I don't believe
> > that GSO (generic segmentation offload) or other transmit-side options are
> > likely to have any effect.
> > 
> > My understanding is that on the receive-side there are two options which
> > when enabled can result in the behaviour that you describe.
> > 
> > * LRO (large receive offload)
> > 
> >   You have this disabled, and assuming it really is disabled it
> >   shouldn't be causing a problem.
> > 
> > * GRO (generic receive offload)
> > 
> >   This does not seem to be in the output of your ethtool commands at all.
> >   So I wonder if your ethtool is too old to support this option?
> 
> So, this was the case.  Our ethtool (lenny) was too old to see the GRO
> option, only GSO.  Disabling GRO on eth1.39 has no effect, but disabling
> it on eth1 caused it to stop receiving the merged frames, fixing the LVS
> packet loss (due to no sending GSO support from LVS/IPVS).
> 
> Speaking of this, did your patch for LVS/IPVS GSO support go anywhere? 
> 
> >   In any case, I was able to reproduce the problem that you describe (or at
> >   least something very similar) using 2.6.36 with GRO enabled on eth1.1 and
> >   the problem did not manifest when I disabled GRO on eth1.1.
> 
> It worked for you to do ethtool -K eth1.1 gro off, then?  For me on
> 2.6.37, it seemed to be that "ethtool -K eth1 gro off" was needed, even
> though packets arrive on eth1.39.
> 
> Also, strangely, 2.6.35.4's default state (with no received merged frames)
> has GRO on for eth1 but off for eth1.39:
> 
> # ethtool -k eth1
> Offload parameters for eth1:
> rx-checksumming: on
> tx-checksumming: on
> scatter-gather: on
> tcp-segmentation-offload: on
> udp-fragmentation-offload: off
> generic-segmentation-offload: on
> generic-receive-offload: on
> large-receive-offload: off
> ntuple-filters: off
> receive-hashing: off
> 
> # ethtool -k eth1.39
> Offload parameters for eth1.39:
> rx-checksumming: on
> tx-checksumming: off
> scatter-gather: off
> tcp-segmentation-offload: off
> udp-fragmentation-offload: off
> generic-segmentation-offload: off
> generic-receive-offload: off
> large-receive-offload: off
> ntuple-filters: off
> receive-hashing: off
> 
> If I set 2.6.37 to have all of the same options, I still see GRO frames
> on 2.6.37 (tg3), which is weird.
> 

Weird maybe, but GRO check/handling is done in dev_gro_receive(), on
eth1 receive path.

Frames are assembled by GRO layer using tg3 NAPI structure (holding GRO
machine state) before being delivered to eth1.39

It would be useless/expensive to add another GRO layer on eth1.39

We might not report GRO state on vlan/bonding (or reflect real device
GRO state)




^ permalink raw reply

* Re: [PATCH net-next-2.6] net_sched: sch_mqprio: dont leak kernel memory
From: Eric Dumazet @ 2011-01-27  7:04 UTC (permalink / raw)
  To: Joe Perches; +Cc: David Miller, netdev, john.r.fastabend
In-Reply-To: <1296108251.2448.183.camel@Joe-Laptop>

Le mercredi 26 janvier 2011 à 22:04 -0800, Joe Perches a écrit :

> /* MQPRIO */
> #define TC_QOPT_BITMASK 15
> #define TC_QOPT_MAX_QUEUE 16
> 
> struct tc_mqprio_qopt {
> __u8 num_tc;
> __u8 prio_tc_map[TC_QOPT_BITMASK + 1];
> __u8 hw;
> __u16 count[TC_QOPT_MAX_QUEUE];
> __u16 offset[TC_QOPT_MAX_QUEUE];
> };
> 
> I believe this struct needs to be declared __packed.
> 

Oh my god. Yet another ugly thing.

> It could otherwise be 24 bytes not 22.

22 ? You are kidding probably. It's 82 exactly.

Listen, I double-checked my patch, it's good, while your rants are lazy.

> Or if char array declarations have a different
> alignment requirement, could be any size.
> 

If if if... could could could...


> memset is better than {0}.
> 

You never stop do you ?

The bigger object is u16, therefore alignof() is 2, not 4

No ABI requires a short (u16) is aligned on 4 byte boundary.

If you find a compiler not respecting this, you can bet Linux won't run
at all if compiled with it. mqprio 'potential 2 bytes leak' will hardly
be a problem.




^ permalink raw reply

* Re: [PATCH net-next-2.6] net_sched: sch_mqprio: dont leak kernel memory
From: Changli Gao @ 2011-01-27  6:54 UTC (permalink / raw)
  To: Joe Perches; +Cc: David Miller, eric.dumazet, netdev, john.r.fastabend
In-Reply-To: <1296108251.2448.183.camel@Joe-Laptop>

On Thu, Jan 27, 2011 at 2:04 PM, Joe Perches <joe@perches.com> wrote:
> On Wed, 2011-01-26 at 11:55 -0800, David Miller wrote:
>> From: Joe Perches <joe@perches.com>
>> Date: Wed, 26 Jan 2011 09:43:43 -0800
>> > On Wed, 2011-01-26 at 18:21 +0100, Eric Dumazet wrote:
>> >> mqprio_dump() should make sure all fields of struct tc_mqprio_qopt are
>> >> initialized.
>> >> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
>> >> CC: John Fastabend <john.r.fastabend@intel.com>
>> >> ---
>> >>  net/sched/sch_mqprio.c |    2 +-
>> >>  1 file changed, 1 insertion(+), 1 deletion(-)
>> >>
>> >> diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
>> >> index fbc6f53..effd4ee 100644
>> >> --- a/net/sched/sch_mqprio.c
>> >> +++ b/net/sched/sch_mqprio.c
>> >> @@ -215,7 +215,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
>> >>    struct net_device *dev = qdisc_dev(sch);
>> >>    struct mqprio_sched *priv = qdisc_priv(sch);
>> >>    unsigned char *b = skb_tail_pointer(skb);
>> >> -  struct tc_mqprio_qopt opt;
>> >> +  struct tc_mqprio_qopt opt = { 0 };
>> > I think the best style to use memset so that any
>> > possible struct padding is guaranteed to be zeroed.
>> Such padding does not exist, and we won't add such padding since this is
>> a user visible data structure and thus whose layout is cast in stone.
>
> /* MQPRIO */
> #define TC_QOPT_BITMASK 15
> #define TC_QOPT_MAX_QUEUE 16
>
> struct tc_mqprio_qopt {
> __u8 num_tc; // 1
> __u8 prio_tc_map[TC_QOPT_BITMASK + 1]; // 16
> __u8 hw; // 1
> __u16 count[TC_QOPT_MAX_QUEUE]; // 32
> __u16 offset[TC_QOPT_MAX_QUEUE]; //32
> };
>
> I believe this struct needs to be declared __packed.
>
> It could otherwise be 24 bytes not 22.
> Or if char array declarations have a different
> alignment requirement, could be any size.
>

The total size is 1 + 16 + 1 + 32 + 32 = 82.

How do you get 24 or 22?

> memset is better than {0}.
>


-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: [PATCH net-next-2.6] net_sched: sch_mqprio: dont leak kernel memory
From: Joe Perches @ 2011-01-27  6:04 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, netdev, john.r.fastabend
In-Reply-To: <20110126.115530.226756606.davem@davemloft.net>

On Wed, 2011-01-26 at 11:55 -0800, David Miller wrote: 
> From: Joe Perches <joe@perches.com>
> Date: Wed, 26 Jan 2011 09:43:43 -0800
> > On Wed, 2011-01-26 at 18:21 +0100, Eric Dumazet wrote:
> >> mqprio_dump() should make sure all fields of struct tc_mqprio_qopt are
> >> initialized.
> >> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> >> CC: John Fastabend <john.r.fastabend@intel.com>
> >> ---
> >>  net/sched/sch_mqprio.c |    2 +-
> >>  1 file changed, 1 insertion(+), 1 deletion(-)
> >> 
> >> diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
> >> index fbc6f53..effd4ee 100644
> >> --- a/net/sched/sch_mqprio.c
> >> +++ b/net/sched/sch_mqprio.c
> >> @@ -215,7 +215,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
> >>  	struct net_device *dev = qdisc_dev(sch);
> >>  	struct mqprio_sched *priv = qdisc_priv(sch);
> >>  	unsigned char *b = skb_tail_pointer(skb);
> >> -	struct tc_mqprio_qopt opt;
> >> +	struct tc_mqprio_qopt opt = { 0 };
> > I think the best style to use memset so that any
> > possible struct padding is guaranteed to be zeroed.
> Such padding does not exist, and we won't add such padding since this is
> a user visible data structure and thus whose layout is cast in stone.

/* MQPRIO */
#define TC_QOPT_BITMASK 15
#define TC_QOPT_MAX_QUEUE 16

struct tc_mqprio_qopt {
__u8 num_tc;
__u8 prio_tc_map[TC_QOPT_BITMASK + 1];
__u8 hw;
__u16 count[TC_QOPT_MAX_QUEUE];
__u16 offset[TC_QOPT_MAX_QUEUE];
};

I believe this struct needs to be declared __packed.

It could otherwise be 24 bytes not 22.
Or if char array declarations have a different
alignment requirement, could be any size.

memset is better than {0}.

cheers, Joe


^ permalink raw reply

* [PATCH net-next-2.6] drivers/net: remove some rcu sparse warnings
From: Eric Dumazet @ 2011-01-27  5:28 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Michael Chan, Arnd Bergmann, Eilon Greenstein

Add missing __rcu annotations and helpers.
minor : Fix some rcu_dereference() calls in macvtap

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Arnd Bergmann <arnd@arndb.de>
CC: Michael Chan <mchan@broadcom.com>
CC: Eilon Greenstein <eilong@broadcom.com>
---
 drivers/net/bnx2.c              |    6 ++++--
 drivers/net/bnx2.h              |    2 +-
 drivers/net/bnx2x/bnx2x.h       |    2 +-
 drivers/net/bnx2x/bnx2x_main.c  |    3 ++-
 drivers/net/cnic.c              |   27 ++++++++++++++++++---------
 drivers/net/cnic.h              |    2 +-
 drivers/net/hamradio/bpqether.c |    5 +++--
 drivers/net/macvtap.c           |   18 ++++++++++--------
 8 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index 231aa97..2c3d747 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -435,7 +435,8 @@ bnx2_cnic_stop(struct bnx2 *bp)
 	struct cnic_ctl_info info;
 
 	mutex_lock(&bp->cnic_lock);
-	c_ops = bp->cnic_ops;
+	c_ops = rcu_dereference_protected(bp->cnic_ops,
+					  lockdep_is_held(&bp->cnic_lock));
 	if (c_ops) {
 		info.cmd = CNIC_CTL_STOP_CMD;
 		c_ops->cnic_ctl(bp->cnic_data, &info);
@@ -450,7 +451,8 @@ bnx2_cnic_start(struct bnx2 *bp)
 	struct cnic_ctl_info info;
 
 	mutex_lock(&bp->cnic_lock);
-	c_ops = bp->cnic_ops;
+	c_ops = rcu_dereference_protected(bp->cnic_ops,
+					  lockdep_is_held(&bp->cnic_lock));
 	if (c_ops) {
 		if (!(bp->flags & BNX2_FLAG_USING_MSIX)) {
 			struct bnx2_napi *bnapi = &bp->bnx2_napi[0];
diff --git a/drivers/net/bnx2.h b/drivers/net/bnx2.h
index 5488a2e..6824eba 100644
--- a/drivers/net/bnx2.h
+++ b/drivers/net/bnx2.h
@@ -6758,7 +6758,7 @@ struct bnx2 {
 	u32		tx_wake_thresh;
 
 #ifdef BCM_CNIC
-	struct cnic_ops		*cnic_ops;
+	struct cnic_ops	__rcu	*cnic_ops;
 	void			*cnic_data;
 #endif
 
diff --git a/drivers/net/bnx2x/bnx2x.h b/drivers/net/bnx2x/bnx2x.h
index 8e41837..dfdb9b5 100644
--- a/drivers/net/bnx2x/bnx2x.h
+++ b/drivers/net/bnx2x/bnx2x.h
@@ -1110,7 +1110,7 @@ struct bnx2x {
 #define BNX2X_CNIC_FLAG_MAC_SET		1
 	void			*t2;
 	dma_addr_t		t2_mapping;
-	struct cnic_ops		*cnic_ops;
+	struct cnic_ops	__rcu	*cnic_ops;
 	void			*cnic_data;
 	u32			cnic_tag;
 	struct cnic_eth_dev	cnic_eth_dev;
diff --git a/drivers/net/bnx2x/bnx2x_main.c b/drivers/net/bnx2x/bnx2x_main.c
index 8cdcf5b..a2a1bc4 100644
--- a/drivers/net/bnx2x/bnx2x_main.c
+++ b/drivers/net/bnx2x/bnx2x_main.c
@@ -9862,7 +9862,8 @@ static int bnx2x_cnic_ctl_send(struct bnx2x *bp, struct cnic_ctl_info *ctl)
 	int rc = 0;
 
 	mutex_lock(&bp->cnic_mutex);
-	c_ops = bp->cnic_ops;
+	c_ops = rcu_dereference_protected(bp->cnic_ops,
+					  lockdep_is_held(&bp->cnic_mutex));
 	if (c_ops)
 		rc = c_ops->cnic_ctl(bp->cnic_data, ctl);
 	mutex_unlock(&bp->cnic_mutex);
diff --git a/drivers/net/cnic.c b/drivers/net/cnic.c
index 263a294..e12049e 100644
--- a/drivers/net/cnic.c
+++ b/drivers/net/cnic.c
@@ -65,7 +65,14 @@ static LIST_HEAD(cnic_udev_list);
 static DEFINE_RWLOCK(cnic_dev_lock);
 static DEFINE_MUTEX(cnic_lock);
 
-static struct cnic_ulp_ops *cnic_ulp_tbl[MAX_CNIC_ULP_TYPE];
+static struct cnic_ulp_ops __rcu *cnic_ulp_tbl[MAX_CNIC_ULP_TYPE];
+
+/* helper function, assuming cnic_lock is held */
+static inline struct cnic_ulp_ops *cnic_ulp_tbl_prot(int type)
+{
+	return rcu_dereference_protected(cnic_ulp_tbl[type],
+					 lockdep_is_held(&cnic_lock));
+}
 
 static int cnic_service_bnx2(void *, void *);
 static int cnic_service_bnx2x(void *, void *);
@@ -435,7 +442,7 @@ int cnic_register_driver(int ulp_type, struct cnic_ulp_ops *ulp_ops)
 		return -EINVAL;
 	}
 	mutex_lock(&cnic_lock);
-	if (cnic_ulp_tbl[ulp_type]) {
+	if (cnic_ulp_tbl_prot(ulp_type)) {
 		pr_err("%s: Type %d has already been registered\n",
 		       __func__, ulp_type);
 		mutex_unlock(&cnic_lock);
@@ -478,7 +485,7 @@ int cnic_unregister_driver(int ulp_type)
 		return -EINVAL;
 	}
 	mutex_lock(&cnic_lock);
-	ulp_ops = cnic_ulp_tbl[ulp_type];
+	ulp_ops = cnic_ulp_tbl_prot(ulp_type);
 	if (!ulp_ops) {
 		pr_err("%s: Type %d has not been registered\n",
 		       __func__, ulp_type);
@@ -529,7 +536,7 @@ static int cnic_register_device(struct cnic_dev *dev, int ulp_type,
 		return -EINVAL;
 	}
 	mutex_lock(&cnic_lock);
-	if (cnic_ulp_tbl[ulp_type] == NULL) {
+	if (cnic_ulp_tbl_prot(ulp_type) == NULL) {
 		pr_err("%s: Driver with type %d has not been registered\n",
 		       __func__, ulp_type);
 		mutex_unlock(&cnic_lock);
@@ -544,7 +551,7 @@ static int cnic_register_device(struct cnic_dev *dev, int ulp_type,
 
 	clear_bit(ULP_F_START, &cp->ulp_flags[ulp_type]);
 	cp->ulp_handle[ulp_type] = ulp_ctx;
-	ulp_ops = cnic_ulp_tbl[ulp_type];
+	ulp_ops = cnic_ulp_tbl_prot(ulp_type);
 	rcu_assign_pointer(cp->ulp_ops[ulp_type], ulp_ops);
 	cnic_hold(dev);
 
@@ -2953,7 +2960,8 @@ static void cnic_ulp_stop(struct cnic_dev *dev)
 		struct cnic_ulp_ops *ulp_ops;
 
 		mutex_lock(&cnic_lock);
-		ulp_ops = cp->ulp_ops[if_type];
+		ulp_ops = rcu_dereference_protected(cp->ulp_ops[if_type],
+						    lockdep_is_held(&cnic_lock));
 		if (!ulp_ops) {
 			mutex_unlock(&cnic_lock);
 			continue;
@@ -2977,7 +2985,8 @@ static void cnic_ulp_start(struct cnic_dev *dev)
 		struct cnic_ulp_ops *ulp_ops;
 
 		mutex_lock(&cnic_lock);
-		ulp_ops = cp->ulp_ops[if_type];
+		ulp_ops = rcu_dereference_protected(cp->ulp_ops[if_type],
+						    lockdep_is_held(&cnic_lock));
 		if (!ulp_ops || !ulp_ops->cnic_start) {
 			mutex_unlock(&cnic_lock);
 			continue;
@@ -3041,7 +3050,7 @@ static void cnic_ulp_init(struct cnic_dev *dev)
 		struct cnic_ulp_ops *ulp_ops;
 
 		mutex_lock(&cnic_lock);
-		ulp_ops = cnic_ulp_tbl[i];
+		ulp_ops = cnic_ulp_tbl_prot(i);
 		if (!ulp_ops || !ulp_ops->cnic_init) {
 			mutex_unlock(&cnic_lock);
 			continue;
@@ -3065,7 +3074,7 @@ static void cnic_ulp_exit(struct cnic_dev *dev)
 		struct cnic_ulp_ops *ulp_ops;
 
 		mutex_lock(&cnic_lock);
-		ulp_ops = cnic_ulp_tbl[i];
+		ulp_ops = cnic_ulp_tbl_prot(i);
 		if (!ulp_ops || !ulp_ops->cnic_exit) {
 			mutex_unlock(&cnic_lock);
 			continue;
diff --git a/drivers/net/cnic.h b/drivers/net/cnic.h
index b328f6c..4456260 100644
--- a/drivers/net/cnic.h
+++ b/drivers/net/cnic.h
@@ -220,7 +220,7 @@ struct cnic_local {
 #define ULP_F_INIT	0
 #define ULP_F_START	1
 #define ULP_F_CALL_PENDING	2
-	struct cnic_ulp_ops *ulp_ops[MAX_CNIC_ULP_TYPE];
+	struct cnic_ulp_ops __rcu *ulp_ops[MAX_CNIC_ULP_TYPE];
 
 	unsigned long cnic_local_flags;
 #define	CNIC_LCL_FL_KWQ_INIT		0x0
diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c
index ac1d323..8931168 100644
--- a/drivers/net/hamradio/bpqether.c
+++ b/drivers/net/hamradio/bpqether.c
@@ -400,13 +400,14 @@ static void *bpq_seq_start(struct seq_file *seq, loff_t *pos)
 static void *bpq_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct list_head *p;
+	struct bpqdev *bpqdev = v;
 
 	++*pos;
 
 	if (v == SEQ_START_TOKEN)
-		p = rcu_dereference(bpq_devices.next);
+		p = rcu_dereference(list_next_rcu(&bpq_devices));
 	else
-		p = rcu_dereference(((struct bpqdev *)v)->bpq_list.next);
+		p = rcu_dereference(list_next_rcu(&bpqdev->bpq_list));
 
 	return (p == &bpq_devices) ? NULL 
 		: list_entry(p, struct bpqdev, bpq_list);
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 5933621..2300e45 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -39,7 +39,7 @@ struct macvtap_queue {
 	struct socket sock;
 	struct socket_wq wq;
 	int vnet_hdr_sz;
-	struct macvlan_dev *vlan;
+	struct macvlan_dev __rcu *vlan;
 	struct file *file;
 	unsigned int flags;
 };
@@ -141,7 +141,8 @@ static void macvtap_put_queue(struct macvtap_queue *q)
 	struct macvlan_dev *vlan;
 
 	spin_lock(&macvtap_lock);
-	vlan = rcu_dereference(q->vlan);
+	vlan = rcu_dereference_protected(q->vlan,
+					 lockdep_is_held(&macvtap_lock));
 	if (vlan) {
 		int index = get_slot(vlan, q);
 
@@ -219,7 +220,8 @@ static void macvtap_del_queues(struct net_device *dev)
 	/* macvtap_put_queue can free some slots, so go through all slots */
 	spin_lock(&macvtap_lock);
 	for (i = 0; i < MAX_MACVTAP_QUEUES && vlan->numvtaps; i++) {
-		q = rcu_dereference(vlan->taps[i]);
+		q = rcu_dereference_protected(vlan->taps[i],
+					      lockdep_is_held(&macvtap_lock));
 		if (q) {
 			qlist[j++] = q;
 			rcu_assign_pointer(vlan->taps[i], NULL);
@@ -569,7 +571,7 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q,
 	}
 
 	rcu_read_lock_bh();
-	vlan = rcu_dereference(q->vlan);
+	vlan = rcu_dereference_bh(q->vlan);
 	if (vlan)
 		macvlan_start_xmit(skb, vlan->dev);
 	else
@@ -583,7 +585,7 @@ err_kfree:
 
 err:
 	rcu_read_lock_bh();
-	vlan = rcu_dereference(q->vlan);
+	vlan = rcu_dereference_bh(q->vlan);
 	if (vlan)
 		vlan->dev->stats.tx_dropped++;
 	rcu_read_unlock_bh();
@@ -631,7 +633,7 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
 	ret = skb_copy_datagram_const_iovec(skb, 0, iv, vnet_hdr_len, len);
 
 	rcu_read_lock_bh();
-	vlan = rcu_dereference(q->vlan);
+	vlan = rcu_dereference_bh(q->vlan);
 	if (vlan)
 		macvlan_count_rx(vlan, len, ret == 0, 0);
 	rcu_read_unlock_bh();
@@ -727,7 +729,7 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
 
 	case TUNGETIFF:
 		rcu_read_lock_bh();
-		vlan = rcu_dereference(q->vlan);
+		vlan = rcu_dereference_bh(q->vlan);
 		if (vlan)
 			dev_hold(vlan->dev);
 		rcu_read_unlock_bh();
@@ -736,7 +738,7 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
 			return -ENOLINK;
 
 		ret = 0;
-		if (copy_to_user(&ifr->ifr_name, q->vlan->dev->name, IFNAMSIZ) ||
+		if (copy_to_user(&ifr->ifr_name, vlan->dev->name, IFNAMSIZ) ||
 		    put_user(q->flags, &ifr->ifr_flags))
 			ret = -EFAULT;
 		dev_put(vlan->dev);



^ permalink raw reply related

* Re: [net-next 08/12] ixgb: convert to new VLAN model
From: Ben Hutchings @ 2011-01-27  4:18 UTC (permalink / raw)
  To: Jesse Gross
  Cc: Tantilov, Emil S, Kirsher, Jeffrey T, davem@davemloft.net,
	netdev@vger.kernel.org, bphilips@novell.com, Pieper, Jeffrey E
In-Reply-To: <AANLkTi=RU11ibzd3c9sqCLL0pNowvx1_ow7C=qWVoPMt@mail.gmail.com>

On Wed, 2011-01-26 at 19:53 -0800, Jesse Gross wrote:
> On Tue, Jan 25, 2011 at 10:20 AM, Tantilov, Emil S
> <emil.s.tantilov@intel.com> wrote:
[...]
> > Sure, but I think a savvy user would always check the result of an
> > ethtool command (ie. `ethtool -K` followed with `ethtool -k`, -A/-a,
> > etc).
> 
> Probably, but it seems the less savviness required from the user the
> better.  Regardless, it doesn't affect anything here, it would just be
> a change to the userspace tool.

I am intending to modify ethtool so that it will report any other
offload settings that were changed automatically.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* [PATCH net-next-2.6] net: fix dev_seq_next()
From: Eric Dumazet @ 2011-01-27  4:08 UTC (permalink / raw)
  To: David Miller, Paul E. McKenney; +Cc: netdev

Paul, the following comment in include/linux/rculist.h is misleading :

"Why is there no list_empty_rcu()?  Because list_empty() serves this
purpose..."

This is probably why I made the error ;)

list_empty() has a meaning only if state cannot change right after its
use.

In an rcu_read_lock() section, state _can_ change, so there is no way
list_empty() can be used at all.

Thanks

[PATCH net-next-2.6] net: fix dev_seq_next()

Commit c6d14c84566d (net: Introduce for_each_netdev_rcu() iterator)
added a race in dev_seq_next().

The rcu_dereference() call should be done _before_ testing the end of
list, or we might return a wrong net_device if a concurrent thread
changes net_device list under us.

Note : discovered thanks to a sparse warning :

net/core/dev.c:3919:9: error: incompatible types in comparison expression
(different address spaces)

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
Given this was discovered by code analysis rather than a bug report, I
prepared a patch for net-next-2.6. Once fully tested, this could be
backported to 2.6.33

 include/linux/netdevice.h |    9 ++++++++-
 net/core/dev.c            |   11 +++++++----
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8858422..c7d7074 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1447,7 +1447,7 @@ static inline struct net_device *next_net_device_rcu(struct net_device *dev)
 	struct net *net;
 
 	net = dev_net(dev);
-	lh = rcu_dereference(dev->dev_list.next);
+	lh = rcu_dereference(list_next_rcu(&dev->dev_list));
 	return lh == &net->dev_base_head ? NULL : net_device_entry(lh);
 }
 
@@ -1457,6 +1457,13 @@ static inline struct net_device *first_net_device(struct net *net)
 		net_device_entry(net->dev_base_head.next);
 }
 
+static inline struct net_device *first_net_device_rcu(struct net *net)
+{
+	struct list_head *lh = rcu_dereference(list_next_rcu(&net->dev_base_head));
+
+	return lh == &net->dev_base_head ? NULL : net_device_entry(lh);
+}
+
 extern int 			netdev_boot_setup_check(struct net_device *dev);
 extern unsigned long		netdev_boot_base(const char *prefix, int unit);
 extern struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
diff --git a/net/core/dev.c b/net/core/dev.c
index 1b4c07f..ddd5df2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4051,12 +4051,15 @@ void *dev_seq_start(struct seq_file *seq, loff_t *pos)
 
 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct net_device *dev = (v == SEQ_START_TOKEN) ?
-				  first_net_device(seq_file_net(seq)) :
-				  next_net_device((struct net_device *)v);
+	struct net_device *dev = v;
+
+	if (v == SEQ_START_TOKEN)
+		dev = first_net_device_rcu(seq_file_net(seq));
+	else
+		dev = next_net_device_rcu(dev);
 
 	++*pos;
-	return rcu_dereference(dev);
+	return dev;
 }
 
 void dev_seq_stop(struct seq_file *seq, void *v)



^ permalink raw reply related

* Re: [net-next 08/12] ixgb: convert to new VLAN model
From: Jesse Gross @ 2011-01-27  3:53 UTC (permalink / raw)
  To: Tantilov, Emil S
  Cc: Kirsher, Jeffrey T, davem@davemloft.net, netdev@vger.kernel.org,
	bphilips@novell.com, Pieper, Jeffrey E, Ben Hutchings
In-Reply-To: <EA929A9653AAE14F841771FB1DE5A136602DF59AA3@rrsmsx501.amr.corp.intel.com>

On Tue, Jan 25, 2011 at 10:20 AM, Tantilov, Emil S
<emil.s.tantilov@intel.com> wrote:
>>-----Original Message-----
>>From: Jesse Gross [mailto:jesse@nicira.com]
>>Sent: Tuesday, January 25, 2011 9:23 AM
>>To: Tantilov, Emil S
>>Cc: Kirsher, Jeffrey T; davem@davemloft.net; netdev@vger.kernel.org;
>>bphilips@novell.com; Pieper, Jeffrey E
>>Subject: Re: [net-next 08/12] ixgb: convert to new VLAN model
>>
>>On Sun, Jan 23, 2011 at 4:25 PM, Tantilov, Emil S
>><emil.s.tantilov@intel.com> wrote:
>>> Jesse Gross wrote:
>>>> On Thu, Jan 6, 2011 at 7:29 PM,  <jeffrey.t.kirsher@intel.com> wrote:
>>>>> +static int ixgb_set_flags(struct net_device *netdev, u32 data) +{
>>>>> +       struct ixgb_adapter *adapter = netdev_priv(netdev); +
>>>>> bool need_reset; +       int rc;
>>>>> +
>>>>> +       /*
>>>>> +        * TX vlan insertion does not work per HW design when Rx
>>>>> stripping is +        * disabled.  Disable txvlan when rxvlan is
>>>>> off. +        */ +       if ((data & ETH_FLAG_RXVLAN) !=
>>>>> (netdev->features & NETIF_F_HW_VLAN_RX)) +               data ^=
>>>>> ETH_FLAG_TXVLAN;
>>>>
>>>> Does this really do the right thing?  If the RX vlan setting is
>>>> changed, it will do the opposite of what the user requested for TX
>>>> vlan?
>>>>
>>>> So if I start with both on (the default) and turn them both off in one
>>>> command (a valid setting), I will get RX off and TX on (an invalid
>>>> setting).
>>>>
>>>> Why not:
>>>>
>>>> if (!(data & ETH_FLAG_RXVLAN))
>>>>         data &= ~ETH_FLAG_TXVLAN;
>>>
>>> Yeah that works for disabling rxvlan, but what if rxvlan is disabled, and
>>the user attempts to enable txvlan? At least our validation argued that we
>>should make it work both ways. Perhaps something like the following?
>>>
>>>        if (!(data & ETH_FLAG_RXVLAN) &&
>>>           (netdev->features & NETIF_F_HW_VLAN_TX))
>>>                data &= ~ETH_FLAG_TXVLAN;
>>>        else if (data & ETH_FLAG_TXVLAN)
>>>                data |= ETH_FLAG_RXVLAN;
>>
>>I think the logic above does what you describe and will always result
>>in a consistent state.  Turning dependent features on when needed is a
>>little bit inconsistent with the rest of Ethtool (for example, turning
>>on TSO when checksum offloading is off will not enable checksum
>>offloading, it will produce an error).  However, I know that drivers
>
> That is the reason I asked, as I don't want to keep bouncing the patch back and forth. Personally I like the idea of helping the user and adjusting the flags to something that works rather than a generic error message.

I think it is fine to adjust things, especially where the restrictions
are hardware specific and the user is less likely to know what
settings are related.  As long as it works, it doesn't matter too much
to me either way, so please do what you think is the most appropriate.

>
>>aren't completely consistent here and the most important part is that
>>it enforces valid states, so I don't have a strong opinion.  Ben's
>>previous suggestion of Ethtool querying again after the operation and
>>reporting any flags that were automatically changed would help a lot
>>here.
>
> Sure, but I think a savvy user would always check the result of an ethtool command (ie. `ethtool -K` followed with `ethtool -k`, -A/-a, etc).

Probably, but it seems the less savviness required from the user the
better.  Regardless, it doesn't affect anything here, it would just be
a change to the userspace tool.

^ permalink raw reply

* i have business for you get back to us
From: MR CHEN GUAN @ 2011-01-26  1:30 UTC (permalink / raw)


I am Mr. Chen Guan, Foreign Operations Manager of the Bank of China (Hong Kong). I have a business suggestion for you of $17,300,000 Million USD. Further details Contact me via email (chenguan000000@yahoo.com.hk)

^ permalink raw reply

* Re: TSO/GRO/LRO/somethingO breaks LVS on 2.6.36
From: Simon Horman @ 2011-01-27  1:36 UTC (permalink / raw)
  To: Simon Kirby; +Cc: Eric Dumazet, netdev
In-Reply-To: <20110127004805.GB11578@hostway.ca>

On Wed, Jan 26, 2011 at 04:48:05PM -0800, Simon Kirby wrote:
> On Thu, Jan 13, 2011 at 03:34:22PM +0900, Simon Horman wrote:
> 
> > Hi Simon,
> > 
> > thanks for prodding me to respond to this post offline and sorry for not
> > responding earlier.
> > 
> > Firstly, I think that this is a receive-side problem so I don't believe
> > that GSO (generic segmentation offload) or other transmit-side options are
> > likely to have any effect.
> > 
> > My understanding is that on the receive-side there are two options which
> > when enabled can result in the behaviour that you describe.
> > 
> > * LRO (large receive offload)
> > 
> >   You have this disabled, and assuming it really is disabled it
> >   shouldn't be causing a problem.
> > 
> > * GRO (generic receive offload)
> > 
> >   This does not seem to be in the output of your ethtool commands at all.
> >   So I wonder if your ethtool is too old to support this option?
> 
> So, this was the case.  Our ethtool (lenny) was too old to see the GRO
> option, only GSO.  Disabling GRO on eth1.39 has no effect, but disabling
> it on eth1 caused it to stop receiving the merged frames, fixing the LVS
> packet loss (due to no sending GSO support from LVS/IPVS).
> 
> Speaking of this, did your patch for LVS/IPVS GSO support go anywhere? 

The patch for IPVS GRO support has been merged and should appear in 2.6.39.
This is somewhat later than I previously anticipated due to a merge mix-up
on my part.

> >   In any case, I was able to reproduce the problem that you describe (or at
> >   least something very similar) using 2.6.36 with GRO enabled on eth1.1 and
> >   the problem did not manifest when I disabled GRO on eth1.1.
> 
> It worked for you to do ethtool -K eth1.1 gro off, then?  For me on
> 2.6.37, it seemed to be that "ethtool -K eth1 gro off" was needed, even
> though packets arrive on eth1.39.

I will recheck my results, but in general I think it is a bit
of an open question as to how ethtool settings should be propagated
between related devices.

> Also, strangely, 2.6.35.4's default state (with no received merged frames)
> has GRO on for eth1 but off for eth1.39:
> 
> # ethtool -k eth1
> Offload parameters for eth1:
> rx-checksumming: on
> tx-checksumming: on
> scatter-gather: on
> tcp-segmentation-offload: on
> udp-fragmentation-offload: off
> generic-segmentation-offload: on
> generic-receive-offload: on
> large-receive-offload: off
> ntuple-filters: off
> receive-hashing: off
> 
> # ethtool -k eth1.39
> Offload parameters for eth1.39:
> rx-checksumming: on
> tx-checksumming: off
> scatter-gather: off
> tcp-segmentation-offload: off
> udp-fragmentation-offload: off
> generic-segmentation-offload: off
> generic-receive-offload: off
> large-receive-offload: off
> ntuple-filters: off
> receive-hashing: off
> 
> If I set 2.6.37 to have all of the same options, I still see GRO frames
> on 2.6.37 (tg3), which is weird.

Yes, that is weird.

There has been quite a lot of work on VLANs recently and
I suspect that the behaviour that you are observing with 2.6.37
is a regression that occurred during that work. It would
be good to fix things to restore the 2.6.35 behaviour.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox