* [PATCH 1/5] code_domain: New code domain tracking subsystem
2012-08-03 15:02 [RFC PATCH 0/5] cputime: Generic virtual based cputime accounting v2 Frederic Weisbecker
@ 2012-08-03 15:02 ` Frederic Weisbecker
2012-08-03 18:51 ` Paul E. McKenney
2012-08-03 19:45 ` Ingo Molnar
2012-08-03 15:02 ` [PATCH 2/5] cputime: Don't allow virtual and irq finegrained cputime accounting simultaneously Frederic Weisbecker
` (3 subsequent siblings)
4 siblings, 2 replies; 11+ messages in thread
From: Frederic Weisbecker @ 2012-08-03 15:02 UTC (permalink / raw)
To: LKML
Cc: Frederic Weisbecker, Alessio Igor Bogani, Andrew Morton,
Avi Kivity, Chris Metcalf, Christoph Lameter, Geoff Levand,
Gilad Ben Yossef, Hakan Akkan, H. Peter Anvin, Ingo Molnar,
Kevin Hilman, Max Krasnyansky, Paul E. McKenney, Peter Zijlstra,
Stephen Hemminger, Steven Rostedt, Sven-Thorsten Dietrich,
Thomas Gleixner
Create a new subsystem that handles the probing on kernel
boundaries to keep track of the transitions between code domains
with two basic initial domains: user or kernel.
This is an abstraction of some RCU code that uses it to implement
its userspace extended quiescent state.
We need to pull this up from RCU into this new level of indirection
because this probing is also going to be used to implement an "on
demand" generic virtual cputime accounting. A necessary step to
shutdown the tick while still accounting the cputime.
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Alessio Igor Bogani <abogani@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Avi Kivity <avi@redhat.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Geoff Levand <geoff@infradead.org>
Cc: Gilad Ben Yossef <gilad@benyossef.com>
Cc: Hakan Akkan <hakanakkan@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Kevin Hilman <khilman@ti.com>
Cc: Max Krasnyansky <maxk@qualcomm.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Sven-Thorsten Dietrich <thebigcorporation@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
arch/Kconfig | 12 +++---
arch/x86/Kconfig | 2 +-
arch/x86/include/asm/{rcu.h => code_domain.h} | 12 +++---
arch/x86/kernel/ptrace.c | 6 +-
arch/x86/kernel/signal.c | 5 +-
arch/x86/kernel/traps.c | 2 +-
arch/x86/mm/fault.c | 2 +-
include/linux/code_domain.h | 18 ++++++++
include/linux/rcupdate.h | 2 -
include/linux/sched.h | 8 ---
init/Kconfig | 24 ++++++----
kernel/Makefile | 1 +
kernel/code_domain_tracking.c | 59 +++++++++++++++++++++++++
kernel/rcutree.c | 42 +-----------------
kernel/sched/core.c | 9 ++--
15 files changed, 121 insertions(+), 83 deletions(-)
rename arch/x86/include/asm/{rcu.h => code_domain.h} (53%)
create mode 100644 include/linux/code_domain.h
create mode 100644 kernel/code_domain_tracking.c
diff --git a/arch/Kconfig b/arch/Kconfig
index d891c62..2ce2a2f 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -277,14 +277,14 @@ config SECCOMP_FILTER
config HAVE_VIRT_CPU_ACCOUNTING
bool
-config HAVE_RCU_USER_QS
+config HAVE_CODE_DOMAIN_TRACKING
bool
help
- Provide kernel entry/exit hooks necessary for userspace
+ Provide kernel boundaries probing necessary for userspace
RCU extended quiescent state. Syscalls need to be wrapped inside
- rcu_user_exit()-rcu_user_enter() through the slow path using
- TIF_NOHZ flag. Exceptions handlers must be wrapped as well. Irqs
- are already protected inside rcu_irq_enter/rcu_irq_exit() but
- preemption or signal handling on irq exit still need to be protected.
+ user_exit()-user_enter() through the slow path using TIF_NOHZ flag.
+ Exceptions handlers must be wrapped as well. Irqs are already
+ protected inside rcu_irq_enter/rcu_irq_exit() but preemption or
+ signal handling on irq exit still need to be protected.
source "kernel/gcov/Kconfig"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 38dfcc2..cc9bf3e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -95,7 +95,7 @@ config X86
select KTIME_SCALAR if X86_32
select GENERIC_STRNCPY_FROM_USER
select GENERIC_STRNLEN_USER
- select HAVE_RCU_USER_QS if X86_64
+ select HAVE_CODE_DOMAIN_TRACKING if X86_64
config INSTRUCTION_DECODER
def_bool (KPROBES || PERF_EVENTS || UPROBES)
diff --git a/arch/x86/include/asm/rcu.h b/arch/x86/include/asm/code_domain.h
similarity index 53%
rename from arch/x86/include/asm/rcu.h
rename to arch/x86/include/asm/code_domain.h
index 439815b..e245152 100644
--- a/arch/x86/include/asm/rcu.h
+++ b/arch/x86/include/asm/code_domain.h
@@ -1,19 +1,19 @@
-#ifndef _ASM_X86_RCU_H
-#define _ASM_X86_RCU_H
+#ifndef _ASM_X86_CODE_DOMAIN_H
+#define _ASM_X86_CODE_DOMAIN_H
-#include <linux/rcupdate.h>
+#include <linux/code_domain.h>
#include <asm/ptrace.h>
static inline void exception_enter(struct pt_regs *regs)
{
- rcu_user_exit();
+ user_exit();
}
static inline void exception_exit(struct pt_regs *regs)
{
-#ifdef CONFIG_RCU_USER_QS
+#ifdef CONFIG_CODE_DOMAIN_TRACKING
if (user_mode(regs))
- rcu_user_enter();
+ user_enter();
#endif
}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 9f94f8e..5bc2e50 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -21,7 +21,7 @@
#include <linux/signal.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
-#include <linux/rcupdate.h>
+#include <linux/code_domain.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -1464,7 +1464,7 @@ long syscall_trace_enter(struct pt_regs *regs)
{
long ret = 0;
- rcu_user_exit();
+ user_exit();
/*
* If we stepped into a sysenter/syscall insn, it trapped in
@@ -1530,5 +1530,5 @@ void syscall_trace_leave(struct pt_regs *regs)
if (step || test_thread_flag(TIF_SYSCALL_TRACE))
tracehook_report_syscall_exit(regs, step);
- rcu_user_enter();
+ user_enter();
}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 5cc2579..fc3e12c 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -19,6 +19,7 @@
#include <linux/uaccess.h>
#include <linux/user-return-notifier.h>
#include <linux/uprobes.h>
+#include <linux/code_domain.h>
#include <asm/processor.h>
#include <asm/ucontext.h>
@@ -776,7 +777,7 @@ static void do_signal(struct pt_regs *regs)
void
do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
{
- rcu_user_exit();
+ user_exit();
#ifdef CONFIG_X86_MCE
/* notify userspace of pending MCEs */
@@ -804,7 +805,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
clear_thread_flag(TIF_IRET);
#endif /* CONFIG_X86_32 */
- rcu_user_enter();
+ user_enter();
}
void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 9b8195b..2d1fe02 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -52,7 +52,7 @@
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/mce.h>
-#include <asm/rcu.h>
+#include <asm/code_domain.h>
#include <asm/mach_traps.h>
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 7dde46d..be026ea 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -18,7 +18,7 @@
#include <asm/pgalloc.h> /* pgd_*(), ... */
#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
#include <asm/fixmap.h> /* VSYSCALL_START */
-#include <asm/rcu.h> /* exception_enter(), ... */
+#include <asm/code_domain.h> /* exception_enter(), ... */
/*
* Page fault error code bits:
diff --git a/include/linux/code_domain.h b/include/linux/code_domain.h
new file mode 100644
index 0000000..5d4513d
--- /dev/null
+++ b/include/linux/code_domain.h
@@ -0,0 +1,18 @@
+#ifndef _LINUX_CODE_DOMAIN_TRACKING_H
+#define _LINUX_CODE_DOMAIN_TRACKING_H
+
+#ifdef CONFIG_CODE_DOMAIN_TRACKING
+#include <linux/sched.h>
+
+extern void user_enter(void);
+extern void user_exit(void);
+extern void code_domain_task_switch(struct task_struct *prev,
+ struct task_struct *next);
+#else
+static inline void user_enter(void) { }
+static inline void user_exit(void) { }
+static inline void code_domain_task_switch(struct task_struct *prev,
+ struct task_struct *next) { }
+#endif /* !CONFIG_CODE_DOMAIN_TRACKING */
+
+#endif
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 1fc0a0e..e411117 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -197,8 +197,6 @@ extern void rcu_user_enter(void);
extern void rcu_user_exit(void);
extern void rcu_user_enter_irq(void);
extern void rcu_user_exit_irq(void);
-extern void rcu_user_hooks_switch(struct task_struct *prev,
- struct task_struct *next);
#else
static inline void rcu_user_enter(void) { }
static inline void rcu_user_exit(void) { }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 30105f4..7b7a438 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1899,14 +1899,6 @@ static inline void rcu_copy_process(struct task_struct *p)
#endif
-static inline void rcu_switch(struct task_struct *prev,
- struct task_struct *next)
-{
-#ifdef CONFIG_RCU_USER_QS
- rcu_user_hooks_switch(prev, next);
-#endif
-}
-
#ifdef CONFIG_SMP
extern void do_set_cpus_allowed(struct task_struct *p,
const struct cpumask *new_mask);
diff --git a/init/Kconfig b/init/Kconfig
index cc1d581..e2854a0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -404,6 +404,19 @@ config AUDIT_LOGINUID_IMMUTABLE
source "kernel/irq/Kconfig"
source "kernel/time/Kconfig"
+config CODE_DOMAIN_TRACKING
+ bool
+
+config CODE_DOMAIN_TRACKING_FORCE
+ bool "Force kernel boundaries probing"
+ depends on CODE_DOMAIN_TRACKING
+ help
+ Set the probes in user/kernel boundaries by default in order to
+ test the features that rely on it such as userspace RCU extended
+ quiescent states.
+ This test is there for debugging until we have a real user like a
+ full adaptive nohz option.
+
menu "RCU Subsystem"
choice
@@ -456,7 +469,8 @@ config PREEMPT_RCU
config RCU_USER_QS
bool "Consider userspace as in RCU extended quiescent state"
- depends on HAVE_RCU_USER_QS && SMP
+ depends on HAVE_CODE_DOMAIN_TRACKING && SMP
+ select CODE_DOMAIN_TRACKING
help
This option sets hooks on kernel / userspace boundaries and
puts RCU in extended quiescent state when the CPU runs in
@@ -464,14 +478,6 @@ config RCU_USER_QS
excluded from the global RCU state machine and thus doesn't
to keep the timer tick on for RCU.
-config RCU_USER_QS_FORCE
- bool "Force userspace extended QS by default"
- depends on RCU_USER_QS
- help
- Set the hooks in user/kernel boundaries by default in order to
- test this feature that treats userspace as an extended quiescent
- state until we have a real user like a full adaptive nohz option.
-
config RCU_FANOUT
int "Tree-based hierarchical RCU fanout value"
range 2 64 if 64BIT
diff --git a/kernel/Makefile b/kernel/Makefile
index c0cc67a..86bc293 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -110,6 +110,7 @@ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
obj-$(CONFIG_PADATA) += padata.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
+obj-$(CONFIG_CODE_DOMAIN_TRACKING) += code_domain_tracking.o
$(obj)/configs.o: $(obj)/config_data.h
diff --git a/kernel/code_domain_tracking.c b/kernel/code_domain_tracking.c
new file mode 100644
index 0000000..8332c76
--- /dev/null
+++ b/kernel/code_domain_tracking.c
@@ -0,0 +1,59 @@
+#include <linux/code_domain.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/percpu.h>
+
+struct code_domain_tracking {
+ /*
+ * When tracking_active is false, hooks are not
+ * set to minimize overhead: TIF flags are cleared
+ * and calls to user_enter/exit are ignored. This
+ * may be further optimized using static keys.
+ */
+ bool tracking_active;
+ enum {
+ IN_KERNEL = 0,
+ IN_USER,
+ } state;
+};
+
+DEFINE_PER_CPU(struct code_domain_tracking, code_domain) = {
+#ifdef CONFIG_CODE_DOMAIN_TRACKING_FORCE
+ .tracking_active = true,
+#endif
+};
+
+void user_enter(void)
+{
+ unsigned long flags;
+
+ WARN_ON_ONCE(!current->mm);
+ local_irq_save(flags);
+ if (__this_cpu_read(code_domain.tracking_active) &&
+ __this_cpu_read(code_domain.state) != IN_USER) {
+ __this_cpu_write(code_domain.state, IN_USER);
+ rcu_user_enter();
+ }
+ local_irq_restore(flags);
+}
+
+void user_exit(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ if (__this_cpu_read(code_domain.state) == IN_USER) {
+ __this_cpu_write(code_domain.state, IN_KERNEL);
+ rcu_user_exit();
+ }
+ local_irq_restore(flags);
+}
+
+void code_domain_task_switch(struct task_struct *prev,
+ struct task_struct *next)
+{
+ if (__this_cpu_read(code_domain.tracking_active)) {
+ clear_tsk_thread_flag(prev, TIF_NOHZ);
+ set_tsk_thread_flag(next, TIF_NOHZ);
+ }
+}
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 318d00e..f6a24cb 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -212,9 +212,6 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
.dynticks = ATOMIC_INIT(1),
-#if defined(CONFIG_RCU_USER_QS) && !defined(CONFIG_RCU_USER_QS_FORCE)
- .ignore_user_qs = true,
-#endif
};
static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */
@@ -448,18 +445,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
*/
void rcu_user_enter(void)
{
- unsigned long flags;
- struct rcu_dynticks *rdtp;
-
- WARN_ON_ONCE(!current->mm);
-
- local_irq_save(flags);
- rdtp = &__get_cpu_var(rcu_dynticks);
- if (!rdtp->ignore_user_qs && !rdtp->in_user) {
- rdtp->in_user = true;
- rcu_eqs_enter(1);
- }
- local_irq_restore(flags);
+ rcu_eqs_enter(1);
}
EXPORT_SYMBOL_GPL(rcu_user_enter);
@@ -597,16 +583,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit);
*/
void rcu_user_exit(void)
{
- unsigned long flags;
- struct rcu_dynticks *rdtp;
-
- local_irq_save(flags);
- rdtp = &__get_cpu_var(rcu_dynticks);
- if (rdtp->in_user) {
- rdtp->in_user = false;
- rcu_eqs_exit(1);
- }
- local_irq_restore(flags);
+ rcu_eqs_exit(1);
}
EXPORT_SYMBOL_GPL(rcu_user_exit);
@@ -730,21 +707,6 @@ int rcu_is_cpu_idle(void)
}
EXPORT_SYMBOL(rcu_is_cpu_idle);
-#ifdef CONFIG_RCU_USER_QS
-void rcu_user_hooks_switch(struct task_struct *prev,
- struct task_struct *next)
-{
- struct rcu_dynticks *rdtp;
-
- /* Interrupts are disabled in context switch */
- rdtp = &__get_cpu_var(rcu_dynticks);
- if (!rdtp->ignore_user_qs) {
- clear_tsk_thread_flag(prev, TIF_NOHZ);
- set_tsk_thread_flag(next, TIF_NOHZ);
- }
-}
-#endif /* #ifdef CONFIG_RCU_USER_QS */
-
#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 94a4894..64bb370 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -72,6 +72,7 @@
#include <linux/slab.h>
#include <linux/init_task.h>
#include <linux/binfmts.h>
+#include <linux/code_domain.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -1925,8 +1926,8 @@ context_switch(struct rq *rq, struct task_struct *prev,
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif
+ code_domain_task_switch(prev, next);
/* Here we just switch the register state and the stack. */
- rcu_switch(prev, next);
switch_to(prev, next, prev);
barrier();
@@ -2920,9 +2921,9 @@ EXPORT_SYMBOL(schedule);
asmlinkage void __sched schedule_user(void)
{
- rcu_user_exit();
+ user_exit();
schedule();
- rcu_user_enter();
+ user_enter();
}
/**
@@ -3026,7 +3027,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
/* Catch callers which need to be fixed */
BUG_ON(ti->preempt_count || !irqs_disabled());
- rcu_user_exit();
+ user_exit();
do {
add_preempt_count(PREEMPT_ACTIVE);
local_irq_enable();
--
1.7.5.4
^ permalink raw reply related [flat|nested] 11+ messages in thread* Re: [PATCH 1/5] code_domain: New code domain tracking subsystem
2012-08-03 15:02 ` [PATCH 1/5] code_domain: New code domain tracking subsystem Frederic Weisbecker
@ 2012-08-03 18:51 ` Paul E. McKenney
2012-08-03 19:45 ` Ingo Molnar
1 sibling, 0 replies; 11+ messages in thread
From: Paul E. McKenney @ 2012-08-03 18:51 UTC (permalink / raw)
To: Frederic Weisbecker
Cc: LKML, Alessio Igor Bogani, Andrew Morton, Avi Kivity,
Chris Metcalf, Christoph Lameter, Geoff Levand, Gilad Ben Yossef,
Hakan Akkan, H. Peter Anvin, Ingo Molnar, Kevin Hilman,
Max Krasnyansky, Peter Zijlstra, Stephen Hemminger,
Steven Rostedt, Sven-Thorsten Dietrich, Thomas Gleixner
On Fri, Aug 03, 2012 at 05:02:21PM +0200, Frederic Weisbecker wrote:
> Create a new subsystem that handles the probing on kernel
> boundaries to keep track of the transitions between code domains
> with two basic initial domains: user or kernel.
>
> This is an abstraction of some RCU code that uses it to implement
> its userspace extended quiescent state.
>
> We need to pull this up from RCU into this new level of indirection
> because this probing is also going to be used to implement an "on
> demand" generic virtual cputime accounting. A necessary step to
> shutdown the tick while still accounting the cputime.
From an RCU viewpoint:
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
> Cc: Alessio Igor Bogani <abogani@kernel.org>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Avi Kivity <avi@redhat.com>
> Cc: Chris Metcalf <cmetcalf@tilera.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Geoff Levand <geoff@infradead.org>
> Cc: Gilad Ben Yossef <gilad@benyossef.com>
> Cc: Hakan Akkan <hakanakkan@gmail.com>
> Cc: H. Peter Anvin <hpa@zytor.com>
> Cc: Ingo Molnar <mingo@kernel.org>
> Cc: Kevin Hilman <khilman@ti.com>
> Cc: Max Krasnyansky <maxk@qualcomm.com>
> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Stephen Hemminger <shemminger@vyatta.com>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: Sven-Thorsten Dietrich <thebigcorporation@gmail.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> ---
> arch/Kconfig | 12 +++---
> arch/x86/Kconfig | 2 +-
> arch/x86/include/asm/{rcu.h => code_domain.h} | 12 +++---
> arch/x86/kernel/ptrace.c | 6 +-
> arch/x86/kernel/signal.c | 5 +-
> arch/x86/kernel/traps.c | 2 +-
> arch/x86/mm/fault.c | 2 +-
> include/linux/code_domain.h | 18 ++++++++
> include/linux/rcupdate.h | 2 -
> include/linux/sched.h | 8 ---
> init/Kconfig | 24 ++++++----
> kernel/Makefile | 1 +
> kernel/code_domain_tracking.c | 59 +++++++++++++++++++++++++
> kernel/rcutree.c | 42 +-----------------
> kernel/sched/core.c | 9 ++--
> 15 files changed, 121 insertions(+), 83 deletions(-)
> rename arch/x86/include/asm/{rcu.h => code_domain.h} (53%)
> create mode 100644 include/linux/code_domain.h
> create mode 100644 kernel/code_domain_tracking.c
>
> diff --git a/arch/Kconfig b/arch/Kconfig
> index d891c62..2ce2a2f 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -277,14 +277,14 @@ config SECCOMP_FILTER
> config HAVE_VIRT_CPU_ACCOUNTING
> bool
>
> -config HAVE_RCU_USER_QS
> +config HAVE_CODE_DOMAIN_TRACKING
> bool
> help
> - Provide kernel entry/exit hooks necessary for userspace
> + Provide kernel boundaries probing necessary for userspace
> RCU extended quiescent state. Syscalls need to be wrapped inside
> - rcu_user_exit()-rcu_user_enter() through the slow path using
> - TIF_NOHZ flag. Exceptions handlers must be wrapped as well. Irqs
> - are already protected inside rcu_irq_enter/rcu_irq_exit() but
> - preemption or signal handling on irq exit still need to be protected.
> + user_exit()-user_enter() through the slow path using TIF_NOHZ flag.
> + Exceptions handlers must be wrapped as well. Irqs are already
> + protected inside rcu_irq_enter/rcu_irq_exit() but preemption or
> + signal handling on irq exit still need to be protected.
>
> source "kernel/gcov/Kconfig"
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 38dfcc2..cc9bf3e 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -95,7 +95,7 @@ config X86
> select KTIME_SCALAR if X86_32
> select GENERIC_STRNCPY_FROM_USER
> select GENERIC_STRNLEN_USER
> - select HAVE_RCU_USER_QS if X86_64
> + select HAVE_CODE_DOMAIN_TRACKING if X86_64
>
> config INSTRUCTION_DECODER
> def_bool (KPROBES || PERF_EVENTS || UPROBES)
> diff --git a/arch/x86/include/asm/rcu.h b/arch/x86/include/asm/code_domain.h
> similarity index 53%
> rename from arch/x86/include/asm/rcu.h
> rename to arch/x86/include/asm/code_domain.h
> index 439815b..e245152 100644
> --- a/arch/x86/include/asm/rcu.h
> +++ b/arch/x86/include/asm/code_domain.h
> @@ -1,19 +1,19 @@
> -#ifndef _ASM_X86_RCU_H
> -#define _ASM_X86_RCU_H
> +#ifndef _ASM_X86_CODE_DOMAIN_H
> +#define _ASM_X86_CODE_DOMAIN_H
>
> -#include <linux/rcupdate.h>
> +#include <linux/code_domain.h>
> #include <asm/ptrace.h>
>
> static inline void exception_enter(struct pt_regs *regs)
> {
> - rcu_user_exit();
> + user_exit();
> }
>
> static inline void exception_exit(struct pt_regs *regs)
> {
> -#ifdef CONFIG_RCU_USER_QS
> +#ifdef CONFIG_CODE_DOMAIN_TRACKING
> if (user_mode(regs))
> - rcu_user_enter();
> + user_enter();
> #endif
> }
>
> diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
> index 9f94f8e..5bc2e50 100644
> --- a/arch/x86/kernel/ptrace.c
> +++ b/arch/x86/kernel/ptrace.c
> @@ -21,7 +21,7 @@
> #include <linux/signal.h>
> #include <linux/perf_event.h>
> #include <linux/hw_breakpoint.h>
> -#include <linux/rcupdate.h>
> +#include <linux/code_domain.h>
>
> #include <asm/uaccess.h>
> #include <asm/pgtable.h>
> @@ -1464,7 +1464,7 @@ long syscall_trace_enter(struct pt_regs *regs)
> {
> long ret = 0;
>
> - rcu_user_exit();
> + user_exit();
>
> /*
> * If we stepped into a sysenter/syscall insn, it trapped in
> @@ -1530,5 +1530,5 @@ void syscall_trace_leave(struct pt_regs *regs)
> if (step || test_thread_flag(TIF_SYSCALL_TRACE))
> tracehook_report_syscall_exit(regs, step);
>
> - rcu_user_enter();
> + user_enter();
> }
> diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
> index 5cc2579..fc3e12c 100644
> --- a/arch/x86/kernel/signal.c
> +++ b/arch/x86/kernel/signal.c
> @@ -19,6 +19,7 @@
> #include <linux/uaccess.h>
> #include <linux/user-return-notifier.h>
> #include <linux/uprobes.h>
> +#include <linux/code_domain.h>
>
> #include <asm/processor.h>
> #include <asm/ucontext.h>
> @@ -776,7 +777,7 @@ static void do_signal(struct pt_regs *regs)
> void
> do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
> {
> - rcu_user_exit();
> + user_exit();
>
> #ifdef CONFIG_X86_MCE
> /* notify userspace of pending MCEs */
> @@ -804,7 +805,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
> clear_thread_flag(TIF_IRET);
> #endif /* CONFIG_X86_32 */
>
> - rcu_user_enter();
> + user_enter();
> }
>
> void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index 9b8195b..2d1fe02 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -52,7 +52,7 @@
> #include <asm/i387.h>
> #include <asm/fpu-internal.h>
> #include <asm/mce.h>
> -#include <asm/rcu.h>
> +#include <asm/code_domain.h>
>
> #include <asm/mach_traps.h>
>
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index 7dde46d..be026ea 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -18,7 +18,7 @@
> #include <asm/pgalloc.h> /* pgd_*(), ... */
> #include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
> #include <asm/fixmap.h> /* VSYSCALL_START */
> -#include <asm/rcu.h> /* exception_enter(), ... */
> +#include <asm/code_domain.h> /* exception_enter(), ... */
>
> /*
> * Page fault error code bits:
> diff --git a/include/linux/code_domain.h b/include/linux/code_domain.h
> new file mode 100644
> index 0000000..5d4513d
> --- /dev/null
> +++ b/include/linux/code_domain.h
> @@ -0,0 +1,18 @@
> +#ifndef _LINUX_CODE_DOMAIN_TRACKING_H
> +#define _LINUX_CODE_DOMAIN_TRACKING_H
> +
> +#ifdef CONFIG_CODE_DOMAIN_TRACKING
> +#include <linux/sched.h>
> +
> +extern void user_enter(void);
> +extern void user_exit(void);
> +extern void code_domain_task_switch(struct task_struct *prev,
> + struct task_struct *next);
> +#else
> +static inline void user_enter(void) { }
> +static inline void user_exit(void) { }
> +static inline void code_domain_task_switch(struct task_struct *prev,
> + struct task_struct *next) { }
> +#endif /* !CONFIG_CODE_DOMAIN_TRACKING */
> +
> +#endif
> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> index 1fc0a0e..e411117 100644
> --- a/include/linux/rcupdate.h
> +++ b/include/linux/rcupdate.h
> @@ -197,8 +197,6 @@ extern void rcu_user_enter(void);
> extern void rcu_user_exit(void);
> extern void rcu_user_enter_irq(void);
> extern void rcu_user_exit_irq(void);
> -extern void rcu_user_hooks_switch(struct task_struct *prev,
> - struct task_struct *next);
> #else
> static inline void rcu_user_enter(void) { }
> static inline void rcu_user_exit(void) { }
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 30105f4..7b7a438 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1899,14 +1899,6 @@ static inline void rcu_copy_process(struct task_struct *p)
>
> #endif
>
> -static inline void rcu_switch(struct task_struct *prev,
> - struct task_struct *next)
> -{
> -#ifdef CONFIG_RCU_USER_QS
> - rcu_user_hooks_switch(prev, next);
> -#endif
> -}
> -
> #ifdef CONFIG_SMP
> extern void do_set_cpus_allowed(struct task_struct *p,
> const struct cpumask *new_mask);
> diff --git a/init/Kconfig b/init/Kconfig
> index cc1d581..e2854a0 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -404,6 +404,19 @@ config AUDIT_LOGINUID_IMMUTABLE
> source "kernel/irq/Kconfig"
> source "kernel/time/Kconfig"
>
> +config CODE_DOMAIN_TRACKING
> + bool
> +
> +config CODE_DOMAIN_TRACKING_FORCE
> + bool "Force kernel boundaries probing"
> + depends on CODE_DOMAIN_TRACKING
> + help
> + Set the probes in user/kernel boundaries by default in order to
> + test the features that rely on it such as userspace RCU extended
> + quiescent states.
> + This test is there for debugging until we have a real user like a
> + full adaptive nohz option.
> +
> menu "RCU Subsystem"
>
> choice
> @@ -456,7 +469,8 @@ config PREEMPT_RCU
>
> config RCU_USER_QS
> bool "Consider userspace as in RCU extended quiescent state"
> - depends on HAVE_RCU_USER_QS && SMP
> + depends on HAVE_CODE_DOMAIN_TRACKING && SMP
> + select CODE_DOMAIN_TRACKING
> help
> This option sets hooks on kernel / userspace boundaries and
> puts RCU in extended quiescent state when the CPU runs in
> @@ -464,14 +478,6 @@ config RCU_USER_QS
> excluded from the global RCU state machine and thus doesn't
> to keep the timer tick on for RCU.
>
> -config RCU_USER_QS_FORCE
> - bool "Force userspace extended QS by default"
> - depends on RCU_USER_QS
> - help
> - Set the hooks in user/kernel boundaries by default in order to
> - test this feature that treats userspace as an extended quiescent
> - state until we have a real user like a full adaptive nohz option.
> -
> config RCU_FANOUT
> int "Tree-based hierarchical RCU fanout value"
> range 2 64 if 64BIT
> diff --git a/kernel/Makefile b/kernel/Makefile
> index c0cc67a..86bc293 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -110,6 +110,7 @@ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
> obj-$(CONFIG_PADATA) += padata.o
> obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
> obj-$(CONFIG_JUMP_LABEL) += jump_label.o
> +obj-$(CONFIG_CODE_DOMAIN_TRACKING) += code_domain_tracking.o
>
> $(obj)/configs.o: $(obj)/config_data.h
>
> diff --git a/kernel/code_domain_tracking.c b/kernel/code_domain_tracking.c
> new file mode 100644
> index 0000000..8332c76
> --- /dev/null
> +++ b/kernel/code_domain_tracking.c
> @@ -0,0 +1,59 @@
> +#include <linux/code_domain.h>
> +#include <linux/rcupdate.h>
> +#include <linux/sched.h>
> +#include <linux/percpu.h>
> +
> +struct code_domain_tracking {
> + /*
> + * When tracking_active is false, hooks are not
> + * set to minimize overhead: TIF flags are cleared
> + * and calls to user_enter/exit are ignored. This
> + * may be further optimized using static keys.
> + */
> + bool tracking_active;
> + enum {
> + IN_KERNEL = 0,
> + IN_USER,
> + } state;
> +};
> +
> +DEFINE_PER_CPU(struct code_domain_tracking, code_domain) = {
> +#ifdef CONFIG_CODE_DOMAIN_TRACKING_FORCE
> + .tracking_active = true,
> +#endif
> +};
> +
> +void user_enter(void)
> +{
> + unsigned long flags;
> +
> + WARN_ON_ONCE(!current->mm);
> + local_irq_save(flags);
> + if (__this_cpu_read(code_domain.tracking_active) &&
> + __this_cpu_read(code_domain.state) != IN_USER) {
> + __this_cpu_write(code_domain.state, IN_USER);
> + rcu_user_enter();
> + }
> + local_irq_restore(flags);
> +}
> +
> +void user_exit(void)
> +{
> + unsigned long flags;
> +
> + local_irq_save(flags);
> + if (__this_cpu_read(code_domain.state) == IN_USER) {
> + __this_cpu_write(code_domain.state, IN_KERNEL);
> + rcu_user_exit();
> + }
> + local_irq_restore(flags);
> +}
> +
> +void code_domain_task_switch(struct task_struct *prev,
> + struct task_struct *next)
> +{
> + if (__this_cpu_read(code_domain.tracking_active)) {
> + clear_tsk_thread_flag(prev, TIF_NOHZ);
> + set_tsk_thread_flag(next, TIF_NOHZ);
> + }
> +}
> diff --git a/kernel/rcutree.c b/kernel/rcutree.c
> index 318d00e..f6a24cb 100644
> --- a/kernel/rcutree.c
> +++ b/kernel/rcutree.c
> @@ -212,9 +212,6 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
> DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
> .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
> .dynticks = ATOMIC_INIT(1),
> -#if defined(CONFIG_RCU_USER_QS) && !defined(CONFIG_RCU_USER_QS_FORCE)
> - .ignore_user_qs = true,
> -#endif
> };
>
> static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */
> @@ -448,18 +445,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
> */
> void rcu_user_enter(void)
> {
> - unsigned long flags;
> - struct rcu_dynticks *rdtp;
> -
> - WARN_ON_ONCE(!current->mm);
> -
> - local_irq_save(flags);
> - rdtp = &__get_cpu_var(rcu_dynticks);
> - if (!rdtp->ignore_user_qs && !rdtp->in_user) {
> - rdtp->in_user = true;
> - rcu_eqs_enter(1);
> - }
> - local_irq_restore(flags);
> + rcu_eqs_enter(1);
> }
> EXPORT_SYMBOL_GPL(rcu_user_enter);
>
> @@ -597,16 +583,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit);
> */
> void rcu_user_exit(void)
> {
> - unsigned long flags;
> - struct rcu_dynticks *rdtp;
> -
> - local_irq_save(flags);
> - rdtp = &__get_cpu_var(rcu_dynticks);
> - if (rdtp->in_user) {
> - rdtp->in_user = false;
> - rcu_eqs_exit(1);
> - }
> - local_irq_restore(flags);
> + rcu_eqs_exit(1);
> }
> EXPORT_SYMBOL_GPL(rcu_user_exit);
>
> @@ -730,21 +707,6 @@ int rcu_is_cpu_idle(void)
> }
> EXPORT_SYMBOL(rcu_is_cpu_idle);
>
> -#ifdef CONFIG_RCU_USER_QS
> -void rcu_user_hooks_switch(struct task_struct *prev,
> - struct task_struct *next)
> -{
> - struct rcu_dynticks *rdtp;
> -
> - /* Interrupts are disabled in context switch */
> - rdtp = &__get_cpu_var(rcu_dynticks);
> - if (!rdtp->ignore_user_qs) {
> - clear_tsk_thread_flag(prev, TIF_NOHZ);
> - set_tsk_thread_flag(next, TIF_NOHZ);
> - }
> -}
> -#endif /* #ifdef CONFIG_RCU_USER_QS */
> -
> #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
>
> /*
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 94a4894..64bb370 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -72,6 +72,7 @@
> #include <linux/slab.h>
> #include <linux/init_task.h>
> #include <linux/binfmts.h>
> +#include <linux/code_domain.h>
>
> #include <asm/switch_to.h>
> #include <asm/tlb.h>
> @@ -1925,8 +1926,8 @@ context_switch(struct rq *rq, struct task_struct *prev,
> spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
> #endif
>
> + code_domain_task_switch(prev, next);
> /* Here we just switch the register state and the stack. */
> - rcu_switch(prev, next);
> switch_to(prev, next, prev);
>
> barrier();
> @@ -2920,9 +2921,9 @@ EXPORT_SYMBOL(schedule);
>
> asmlinkage void __sched schedule_user(void)
> {
> - rcu_user_exit();
> + user_exit();
> schedule();
> - rcu_user_enter();
> + user_enter();
> }
>
> /**
> @@ -3026,7 +3027,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
> /* Catch callers which need to be fixed */
> BUG_ON(ti->preempt_count || !irqs_disabled());
>
> - rcu_user_exit();
> + user_exit();
> do {
> add_preempt_count(PREEMPT_ACTIVE);
> local_irq_enable();
> --
> 1.7.5.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
^ permalink raw reply [flat|nested] 11+ messages in thread* Re: [PATCH 1/5] code_domain: New code domain tracking subsystem
2012-08-03 15:02 ` [PATCH 1/5] code_domain: New code domain tracking subsystem Frederic Weisbecker
2012-08-03 18:51 ` Paul E. McKenney
@ 2012-08-03 19:45 ` Ingo Molnar
2012-08-03 20:09 ` Steven Rostedt
1 sibling, 1 reply; 11+ messages in thread
From: Ingo Molnar @ 2012-08-03 19:45 UTC (permalink / raw)
To: Frederic Weisbecker
Cc: LKML, Alessio Igor Bogani, Andrew Morton, Avi Kivity,
Chris Metcalf, Christoph Lameter, Geoff Levand, Gilad Ben Yossef,
Hakan Akkan, H. Peter Anvin, Kevin Hilman, Max Krasnyansky,
Paul E. McKenney, Peter Zijlstra, Stephen Hemminger,
Steven Rostedt, Sven-Thorsten Dietrich, Thomas Gleixner
* Frederic Weisbecker <fweisbec@gmail.com> wrote:
> Create a new subsystem that handles the probing on kernel
> boundaries to keep track of the transitions between code
> domains with two basic initial domains: user or kernel.
To do a bit more bike shed painting, I'd call it "context
tracking" - user mode, kernel mode (guest mode, etc.).
The term 'code domain' would bring up blank stares from most
kernel developers, me thinks.
Thanks,
Ingo
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 1/5] code_domain: New code domain tracking subsystem
2012-08-03 19:45 ` Ingo Molnar
@ 2012-08-03 20:09 ` Steven Rostedt
2012-08-03 20:31 ` Paul E. McKenney
0 siblings, 1 reply; 11+ messages in thread
From: Steven Rostedt @ 2012-08-03 20:09 UTC (permalink / raw)
To: Ingo Molnar
Cc: Frederic Weisbecker, LKML, Alessio Igor Bogani, Andrew Morton,
Avi Kivity, Chris Metcalf, Christoph Lameter, Geoff Levand,
Gilad Ben Yossef, Hakan Akkan, H. Peter Anvin, Kevin Hilman,
Max Krasnyansky, Paul E. McKenney, Peter Zijlstra,
Stephen Hemminger, Sven-Thorsten Dietrich, Thomas Gleixner
On Fri, 2012-08-03 at 21:45 +0200, Ingo Molnar wrote:
> * Frederic Weisbecker <fweisbec@gmail.com> wrote:
>
> > Create a new subsystem that handles the probing on kernel
> > boundaries to keep track of the transitions between code
> > domains with two basic initial domains: user or kernel.
>
> To do a bit more bike shed painting, I'd call it "context
> tracking" - user mode, kernel mode (guest mode, etc.).
>
> The term 'code domain' would bring up blank stares from most
> kernel developers, me thinks.
Heh, that would be a second new term I heard this week for context.
Earlier, I noticed that Paul McKenney called it 'levels'. So now there's
four names:
user/kernel context
user/kernel state
user/kernel level
user/kernel domain
And we could probably add a fifth:
user/kernel mode
;-)
-- Steve
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 1/5] code_domain: New code domain tracking subsystem
2012-08-03 20:09 ` Steven Rostedt
@ 2012-08-03 20:31 ` Paul E. McKenney
2012-08-03 21:04 ` Frederic Weisbecker
0 siblings, 1 reply; 11+ messages in thread
From: Paul E. McKenney @ 2012-08-03 20:31 UTC (permalink / raw)
To: Steven Rostedt
Cc: Ingo Molnar, Frederic Weisbecker, LKML, Alessio Igor Bogani,
Andrew Morton, Avi Kivity, Chris Metcalf, Christoph Lameter,
Geoff Levand, Gilad Ben Yossef, Hakan Akkan, H. Peter Anvin,
Kevin Hilman, Max Krasnyansky, Peter Zijlstra, Stephen Hemminger,
Sven-Thorsten Dietrich, Thomas Gleixner
On Fri, Aug 03, 2012 at 04:09:39PM -0400, Steven Rostedt wrote:
> On Fri, 2012-08-03 at 21:45 +0200, Ingo Molnar wrote:
> > * Frederic Weisbecker <fweisbec@gmail.com> wrote:
> >
> > > Create a new subsystem that handles the probing on kernel
> > > boundaries to keep track of the transitions between code
> > > domains with two basic initial domains: user or kernel.
> >
> > To do a bit more bike shed painting, I'd call it "context
> > tracking" - user mode, kernel mode (guest mode, etc.).
> >
> > The term 'code domain' would bring up blank stares from most
> > kernel developers, me thinks.
>
> Heh, that would be a second new term I heard this week for context.
> Earlier, I noticed that Paul McKenney called it 'levels'. So now there's
> four names:
>
> user/kernel context
> user/kernel state
> user/kernel level
> user/kernel domain
>
> And we could probably add a fifth:
>
> user/kernel mode
Plus:
user/kernel space
> ;-)
Then there is "supervisor", "system", "privileged", and who knows what
all else for "kernel". And "application" and "problem" and probably
others for "user".
Thanx, Paul
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 1/5] code_domain: New code domain tracking subsystem
2012-08-03 20:31 ` Paul E. McKenney
@ 2012-08-03 21:04 ` Frederic Weisbecker
0 siblings, 0 replies; 11+ messages in thread
From: Frederic Weisbecker @ 2012-08-03 21:04 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Steven Rostedt, Ingo Molnar, LKML, Alessio Igor Bogani,
Andrew Morton, Avi Kivity, Chris Metcalf, Christoph Lameter,
Geoff Levand, Gilad Ben Yossef, Hakan Akkan, H. Peter Anvin,
Kevin Hilman, Max Krasnyansky, Peter Zijlstra, Stephen Hemminger,
Sven-Thorsten Dietrich, Thomas Gleixner
On Fri, Aug 03, 2012 at 01:31:44PM -0700, Paul E. McKenney wrote:
> On Fri, Aug 03, 2012 at 04:09:39PM -0400, Steven Rostedt wrote:
> > On Fri, 2012-08-03 at 21:45 +0200, Ingo Molnar wrote:
> > > * Frederic Weisbecker <fweisbec@gmail.com> wrote:
> > >
> > > > Create a new subsystem that handles the probing on kernel
> > > > boundaries to keep track of the transitions between code
> > > > domains with two basic initial domains: user or kernel.
> > >
> > > To do a bit more bike shed painting, I'd call it "context
> > > tracking" - user mode, kernel mode (guest mode, etc.).
> > >
> > > The term 'code domain' would bring up blank stares from most
> > > kernel developers, me thinks.
> >
> > Heh, that would be a second new term I heard this week for context.
> > Earlier, I noticed that Paul McKenney called it 'levels'. So now there's
> > four names:
> >
> > user/kernel context
> > user/kernel state
> > user/kernel level
> > user/kernel domain
> >
> > And we could probably add a fifth:
> >
> > user/kernel mode
>
> Plus:
>
> user/kernel space
>
> > ;-)
>
> Then there is "supervisor", "system", "privileged", and who knows what
> all else for "kernel". And "application" and "problem" and probably
> others for "user".
Hehe.
Ok I agree that domain already has a biased meaning in the kernel.
So I'm going to respin with code_context_tracking.
If anybody opposes, please raise your hand.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 2/5] cputime: Don't allow virtual and irq finegrained cputime accounting simultaneously
2012-08-03 15:02 [RFC PATCH 0/5] cputime: Generic virtual based cputime accounting v2 Frederic Weisbecker
2012-08-03 15:02 ` [PATCH 1/5] code_domain: New code domain tracking subsystem Frederic Weisbecker
@ 2012-08-03 15:02 ` Frederic Weisbecker
2012-08-03 15:02 ` [PATCH 3/5] cputime: Allow dynamic switch between tick/virtual based cputime accounting Frederic Weisbecker
` (2 subsequent siblings)
4 siblings, 0 replies; 11+ messages in thread
From: Frederic Weisbecker @ 2012-08-03 15:02 UTC (permalink / raw)
To: LKML
Cc: Frederic Weisbecker, Alessio Igor Bogani, Andrew Morton,
Avi Kivity, Chris Metcalf, Christoph Lameter, Geoff Levand,
Gilad Ben Yossef, Hakan Akkan, H. Peter Anvin, Ingo Molnar,
Kevin Hilman, Max Krasnyansky, Paul E. McKenney, Peter Zijlstra,
Stephen Hemminger, Steven Rostedt, Sven-Thorsten Dietrich,
Thomas Gleixner
We may soon be able to run both at the same time. But we need
to rework a bit the irq finegrained accounting before that.
Just do a mutual exclusion for now.
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Alessio Igor Bogani <abogani@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Avi Kivity <avi@redhat.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Geoff Levand <geoff@infradead.org>
Cc: Gilad Ben Yossef <gilad@benyossef.com>
Cc: Hakan Akkan <hakanakkan@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Kevin Hilman <khilman@ti.com>
Cc: Max Krasnyansky <maxk@qualcomm.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Sven-Thorsten Dietrich <thebigcorporation@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
init/Kconfig | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/init/Kconfig b/init/Kconfig
index e2854a0..cf5e86e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -269,7 +269,7 @@ config POSIX_MQUEUE_SYSCTL
config VIRT_CPU_ACCOUNTING
bool "Deterministic task and CPU time accounting"
- depends on HAVE_VIRT_CPU_ACCOUNTING
+ depends on HAVE_VIRT_CPU_ACCOUNTING && !IRQ_TIME_ACCOUNTING
default y if PPC64
help
Select this option to enable more accurate task and CPU time
--
1.7.5.4
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH 3/5] cputime: Allow dynamic switch between tick/virtual based cputime accounting
2012-08-03 15:02 [RFC PATCH 0/5] cputime: Generic virtual based cputime accounting v2 Frederic Weisbecker
2012-08-03 15:02 ` [PATCH 1/5] code_domain: New code domain tracking subsystem Frederic Weisbecker
2012-08-03 15:02 ` [PATCH 2/5] cputime: Don't allow virtual and irq finegrained cputime accounting simultaneously Frederic Weisbecker
@ 2012-08-03 15:02 ` Frederic Weisbecker
2012-08-03 15:02 ` [PATCH 4/5] cputime: Rename account_system_vtime to account_vtime Frederic Weisbecker
2012-08-03 15:02 ` [PATCH 5/5] cputime: Generic on-demand virtual cputime accounting Frederic Weisbecker
4 siblings, 0 replies; 11+ messages in thread
From: Frederic Weisbecker @ 2012-08-03 15:02 UTC (permalink / raw)
To: LKML
Cc: Frederic Weisbecker, Alessio Igor Bogani, Andrew Morton,
Avi Kivity, Chris Metcalf, Christoph Lameter, Geoff Levand,
Gilad Ben Yossef, Hakan Akkan, H. Peter Anvin, Ingo Molnar,
Kevin Hilman, Max Krasnyansky, Paul E. McKenney, Peter Zijlstra,
Stephen Hemminger, Steven Rostedt, Sven-Thorsten Dietrich,
Thomas Gleixner
Allow to dynamically switch between tick and virtual based cputime accounting.
This way we can provide a kind of "on-demand" virtual based cputime
accounting. In this mode, the kernel will rely on the user hooks
subsystem to dynamically hook on kernel boundaries.
This is in preparation for being able to stop the timer tick beyond
idle. Doing so will depend on CONFIG_VIRT_CPU_ACCOUNTING which makes
it possible to account the cputime without the tick by hooking on
kernel/user boundaries.
Depending whether the tick is stopped or not, we can switch between
tick and vtime based accounting anytime in order to minimize the
overhead associated with user hooks.
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Alessio Igor Bogani <abogani@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Avi Kivity <avi@redhat.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Geoff Levand <geoff@infradead.org>
Cc: Gilad Ben Yossef <gilad@benyossef.com>
Cc: Hakan Akkan <hakanakkan@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Kevin Hilman <khilman@ti.com>
Cc: Max Krasnyansky <maxk@qualcomm.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Sven-Thorsten Dietrich <thebigcorporation@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
arch/ia64/include/asm/cputime.h | 5 ++++
arch/ia64/kernel/time.c | 2 +-
arch/powerpc/include/asm/cputime.h | 5 ++++
arch/powerpc/kernel/time.c | 2 +-
arch/s390/include/asm/cputime.h | 5 ++++
arch/s390/kernel/vtime.c | 2 +-
include/asm-generic/cputime.h | 5 ++++
include/linux/kernel_stat.h | 3 ++
include/linux/sched.h | 5 +---
kernel/fork.c | 3 +-
kernel/sched/cputime.c | 39 +++++++++++++++--------------------
kernel/time/tick-sched.c | 28 ++++++++++++-------------
12 files changed, 58 insertions(+), 46 deletions(-)
diff --git a/arch/ia64/include/asm/cputime.h b/arch/ia64/include/asm/cputime.h
index 3deac95..9532b9c 100644
--- a/arch/ia64/include/asm/cputime.h
+++ b/arch/ia64/include/asm/cputime.h
@@ -103,5 +103,10 @@ static inline void cputime_to_timeval(const cputime_t ct, struct timeval *val)
#define cputime64_to_clock_t(__ct) \
cputime_to_clock_t((__force cputime_t)__ct)
+static inline bool accounting_vtime(void)
+{
+ return true;
+}
+
#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
#endif /* __IA64_CPUTIME_H */
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 6247197..7afcf93 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -144,7 +144,7 @@ EXPORT_SYMBOL_GPL(account_system_vtime);
* Called from the timer interrupt handler to charge accumulated user time
* to the current process. Must be called with interrupts disabled.
*/
-void account_process_tick(struct task_struct *p, int user_tick)
+void account_process_tick_vtime(struct task_struct *p, int user_tick)
{
struct thread_info *ti = task_thread_info(p);
cputime_t delta_utime;
diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h
index 487d46f..901e0ac 100644
--- a/arch/powerpc/include/asm/cputime.h
+++ b/arch/powerpc/include/asm/cputime.h
@@ -228,6 +228,11 @@ static inline cputime_t clock_t_to_cputime(const unsigned long clk)
#define cputime64_to_clock_t(ct) cputime_to_clock_t((cputime_t)(ct))
+static inline bool accounting_vtime(void)
+{
+ return true;
+}
+
#endif /* __KERNEL__ */
#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
#endif /* __POWERPC_CPUTIME_H */
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 49da7f0..3f1918a 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -354,7 +354,7 @@ EXPORT_SYMBOL_GPL(account_system_vtime);
* (i.e. since the last entry from usermode) so that
* get_paca()->user_time_scaled is up to date.
*/
-void account_process_tick(struct task_struct *tsk, int user_tick)
+void account_process_tick_vtime(struct task_struct *tsk, int user_tick)
{
cputime_t utime, utimescaled;
diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h
index 718374d..05ffda7 100644
--- a/arch/s390/include/asm/cputime.h
+++ b/arch/s390/include/asm/cputime.h
@@ -188,4 +188,9 @@ static inline int s390_nohz_delay(int cpu)
#define arch_needs_cpu(cpu) s390_nohz_delay(cpu)
+static inline bool accounting_vtime(void)
+{
+ return true;
+}
+
#endif /* _S390_CPUTIME_H */
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index 506e9bd..29f20fc 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -99,7 +99,7 @@ void account_switch_vtime(struct task_struct *prev)
S390_lowcore.system_timer = ti->system_timer;
}
-void account_process_tick(struct task_struct *tsk, int user_tick)
+void account_process_tick_vtime(struct task_struct *tsk, int user_tick)
{
do_account_vtime(tsk, HARDIRQ_OFFSET);
}
diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h
index 9a62937..212c8bb 100644
--- a/include/asm-generic/cputime.h
+++ b/include/asm-generic/cputime.h
@@ -66,4 +66,9 @@ typedef u64 __nocast cputime64_t;
#define cputime64_to_clock_t(__ct) \
jiffies_64_to_clock_t(cputime64_to_jiffies64(__ct))
+static inline bool accounting_vtime(void)
+{
+ return false;
+}
+
#endif
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index bbe5d15..1270b86 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -132,8 +132,11 @@ extern void account_idle_ticks(unsigned long ticks);
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
extern void account_switch_vtime(struct task_struct *prev);
+extern bool account_process_tick_vtime(struct task_struct *p, int user_tick);
#else
static inline void account_switch_vtime(struct task_struct *prev) { }
+static inline void account_process_tick_vtime(struct task_struct *p,
+ int user_tick) { }
#endif
#endif /* _LINUX_KERNEL_STAT_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7b7a438..6209682 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -612,9 +612,7 @@ struct signal_struct {
cputime_t utime, stime, cutime, cstime;
cputime_t gtime;
cputime_t cgtime;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
cputime_t prev_utime, prev_stime;
-#endif
unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
unsigned long inblock, oublock, cinblock, coublock;
@@ -1363,9 +1361,8 @@ struct task_struct {
cputime_t utime, stime, utimescaled, stimescaled;
cputime_t gtime;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
cputime_t prev_utime, prev_stime;
-#endif
+
unsigned long nvcsw, nivcsw; /* context switch counts */
struct timespec start_time; /* monotonic time */
struct timespec real_start_time; /* boot based time */
diff --git a/kernel/fork.c b/kernel/fork.c
index 3e2ed43..b19dd02 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1245,9 +1245,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->utime = p->stime = p->gtime = 0;
p->utimescaled = p->stimescaled = 0;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
p->prev_utime = p->prev_stime = 0;
-#endif
+
#if defined(SPLIT_RSS_COUNTING)
memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index fd5bd01..ff525ca 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -287,8 +287,6 @@ static __always_inline bool steal_account_process_tick(void)
return false;
}
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
-
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
* Account a tick to a process and cpustat
@@ -368,6 +366,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
struct rq *rq = this_rq();
+ if (accounting_vtime()) {
+ account_process_tick_vtime(p, user_tick);
+ return;
+ }
+
if (sched_clock_irqtime) {
irqtime_account_process_tick(p, user_tick, rq);
return;
@@ -410,29 +413,10 @@ void account_idle_ticks(unsigned long ticks)
account_idle_time(jiffies_to_cputime(ticks));
}
-#endif
/*
* Use precise platform statistics if available:
*/
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- *ut = p->utime;
- *st = p->stime;
-}
-
-void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- struct task_cputime cputime;
-
- thread_group_cputime(p, &cputime);
-
- *ut = cputime.utime;
- *st = cputime.stime;
-}
-#else
-
#ifndef nsecs_to_cputime
# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
#endif
@@ -441,6 +425,12 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
cputime_t rtime, utime = p->utime, total = utime + p->stime;
+ if (accounting_vtime()) {
+ *ut = p->utime;
+ *st = p->stime;
+ return;
+ }
+
/*
* Use CFS's precise accounting:
*/
@@ -476,6 +466,12 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
thread_group_cputime(p, &cputime);
+ if (accounting_vtime()) {
+ *ut = cputime.utime;
+ *st = cputime.stime;
+ return;
+ }
+
total = cputime.utime + cputime.stime;
rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
@@ -494,4 +490,3 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
*ut = sig->prev_utime;
*st = sig->prev_stime;
}
-#endif
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 45b17ae..76dc22b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -557,9 +557,7 @@ void tick_nohz_idle_exit(void)
{
int cpu = smp_processor_id();
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
unsigned long ticks;
-#endif
ktime_t now;
local_irq_disable();
@@ -584,19 +582,19 @@ void tick_nohz_idle_exit(void)
tick_do_update_jiffies64(now);
update_cpu_load_nohz();
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
- /*
- * We stopped the tick in idle. Update process times would miss the
- * time we slept as update_process_times does only a 1 tick
- * accounting. Enforce that this is accounted to idle !
- */
- ticks = jiffies - ts->idle_jiffies;
- /*
- * We might be one off. Do not randomly account a huge number of ticks!
- */
- if (ticks && ticks < LONG_MAX)
- account_idle_ticks(ticks);
-#endif
+ if (!accounting_vtime()) {
+ /*
+ * We stopped the tick in idle. Update process times would miss the
+ * time we slept as update_process_times does only a 1 tick
+ * accounting. Enforce that this is accounted to idle !
+ */
+ ticks = jiffies - ts->idle_jiffies;
+ /*
+ * We might be one off. Do not randomly account a huge number of ticks!
+ */
+ if (ticks && ticks < LONG_MAX)
+ account_idle_ticks(ticks);
+ }
calc_load_exit_idle();
touch_softlockup_watchdog();
--
1.7.5.4
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH 4/5] cputime: Rename account_system_vtime to account_vtime
2012-08-03 15:02 [RFC PATCH 0/5] cputime: Generic virtual based cputime accounting v2 Frederic Weisbecker
` (2 preceding siblings ...)
2012-08-03 15:02 ` [PATCH 3/5] cputime: Allow dynamic switch between tick/virtual based cputime accounting Frederic Weisbecker
@ 2012-08-03 15:02 ` Frederic Weisbecker
2012-08-03 15:02 ` [PATCH 5/5] cputime: Generic on-demand virtual cputime accounting Frederic Weisbecker
4 siblings, 0 replies; 11+ messages in thread
From: Frederic Weisbecker @ 2012-08-03 15:02 UTC (permalink / raw)
To: LKML
Cc: Frederic Weisbecker, Alessio Igor Bogani, Andrew Morton,
Avi Kivity, Chris Metcalf, Christoph Lameter, Geoff Levand,
Gilad Ben Yossef, Hakan Akkan, H. Peter Anvin, Ingo Molnar,
Kevin Hilman, Max Krasnyansky, Paul E. McKenney, Peter Zijlstra,
Stephen Hemminger, Steven Rostedt, Sven-Thorsten Dietrich,
Thomas Gleixner
account_system_vtime() can be called from random places:
hard/softirq entry/exit, kvm guest entry/exit, and even
context switches on powerpc.
Rename it to the even more generic account_vtime() name,
this reflects well that we are in a random place in the
kernel where we have either system, idle or user time
to account.
This way we can reuse the "system" version and expand it
with other domains such as "user" or "idle" to prepare
for the user hooks based virtual cputime accounting that
knows better when to flush the time for which domain of
execution.
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Alessio Igor Bogani <abogani@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Avi Kivity <avi@redhat.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Geoff Levand <geoff@infradead.org>
Cc: Gilad Ben Yossef <gilad@benyossef.com>
Cc: Hakan Akkan <hakanakkan@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Kevin Hilman <khilman@ti.com>
Cc: Max Krasnyansky <maxk@qualcomm.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Sven-Thorsten Dietrich <thebigcorporation@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
arch/ia64/kernel/time.c | 4 ++--
arch/powerpc/kernel/time.c | 8 ++++----
arch/s390/kernel/vtime.c | 4 ++--
include/linux/hardirq.h | 8 ++++----
include/linux/kvm_host.h | 4 ++--
kernel/sched/cputime.c | 8 ++++----
kernel/softirq.c | 6 +++---
7 files changed, 21 insertions(+), 21 deletions(-)
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 7afcf93..03de550 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -116,7 +116,7 @@ void account_switch_vtime(struct task_struct *prev)
* Account time for a transition between system, hard irq or soft irq state.
* Note that this function is called with interrupts enabled.
*/
-void account_system_vtime(struct task_struct *tsk)
+void account_vtime(struct task_struct *tsk)
{
struct thread_info *ti = task_thread_info(tsk);
unsigned long flags;
@@ -138,7 +138,7 @@ void account_system_vtime(struct task_struct *tsk)
local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(account_system_vtime);
+EXPORT_SYMBOL_GPL(account_vtime);
/*
* Called from the timer interrupt handler to charge accumulated user time
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 3f1918a..fba763d 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -291,7 +291,7 @@ static inline u64 calculate_stolen_time(u64 stop_tb)
* Account time for a transition between system, hard irq
* or soft irq state.
*/
-void account_system_vtime(struct task_struct *tsk)
+void account_vtime(struct task_struct *tsk)
{
u64 now, nowscaled, delta, deltascaled;
unsigned long flags;
@@ -343,14 +343,14 @@ void account_system_vtime(struct task_struct *tsk)
}
local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(account_system_vtime);
+EXPORT_SYMBOL_GPL(account_vtime);
/*
* Transfer the user and system times accumulated in the paca
* by the exception entry and exit code to the generic process
* user and system time records.
* Must be called with interrupts disabled.
- * Assumes that account_system_vtime() has been called recently
+ * Assumes that account_vtime() has been called recently
* (i.e. since the last entry from usermode) so that
* get_paca()->user_time_scaled is up to date.
*/
@@ -368,7 +368,7 @@ void account_process_tick_vtime(struct task_struct *tsk, int user_tick)
void account_switch_vtime(struct task_struct *prev)
{
- account_system_vtime(prev);
+ account_vtime(prev);
account_process_tick(prev, 0);
}
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index 29f20fc..95f8105 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -108,7 +108,7 @@ void account_process_tick_vtime(struct task_struct *tsk, int user_tick)
* Update process times based on virtual cpu times stored by entry.S
* to the lowcore fields user_timer, system_timer & steal_clock.
*/
-void account_system_vtime(struct task_struct *tsk)
+void account_vtime(struct task_struct *tsk)
{
struct thread_info *ti = task_thread_info(tsk);
__u64 timer, system;
@@ -122,7 +122,7 @@ void account_system_vtime(struct task_struct *tsk)
ti->system_timer = S390_lowcore.system_timer;
account_system_time(tsk, 0, system, system);
}
-EXPORT_SYMBOL_GPL(account_system_vtime);
+EXPORT_SYMBOL_GPL(account_vtime);
void __kprobes vtime_stop_cpu(void)
{
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index bb7f309..6432e33 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -132,11 +132,11 @@ extern void synchronize_irq(unsigned int irq);
struct task_struct;
#if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING)
-static inline void account_system_vtime(struct task_struct *tsk)
+static inline void account_vtime(struct task_struct *tsk)
{
}
#else
-extern void account_system_vtime(struct task_struct *tsk);
+extern void account_vtime(struct task_struct *tsk);
#endif
#if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU)
@@ -162,7 +162,7 @@ extern void rcu_nmi_exit(void);
*/
#define __irq_enter() \
do { \
- account_system_vtime(current); \
+ account_vtime(current); \
add_preempt_count(HARDIRQ_OFFSET); \
trace_hardirq_enter(); \
} while (0)
@@ -178,7 +178,7 @@ extern void irq_enter(void);
#define __irq_exit() \
do { \
trace_hardirq_exit(); \
- account_system_vtime(current); \
+ account_vtime(current); \
sub_preempt_count(HARDIRQ_OFFSET); \
} while (0)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c446435..54b5859 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -676,7 +676,7 @@ static inline int kvm_deassign_device(struct kvm *kvm,
static inline void kvm_guest_enter(void)
{
BUG_ON(preemptible());
- account_system_vtime(current);
+ account_vtime(current);
current->flags |= PF_VCPU;
/* KVM does not hold any references to rcu protected data when it
* switches CPU into a guest mode. In fact switching to a guest mode
@@ -690,7 +690,7 @@ static inline void kvm_guest_enter(void)
static inline void kvm_guest_exit(void)
{
- account_system_vtime(current);
+ account_vtime(current);
current->flags &= ~PF_VCPU;
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ff525ca..53b03cc 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -9,11 +9,11 @@
/*
* There are no locks covering percpu hardirq/softirq time.
- * They are only modified in account_system_vtime, on corresponding CPU
+ * They are only modified in account_vtime, on corresponding CPU
* with interrupts disabled. So, writes are safe.
* They are read and saved off onto struct rq in update_rq_clock().
* This may result in other CPU reading this CPU's irq time and can
- * race with irq/account_system_vtime on this CPU. We would either get old
+ * race with irq/account_vtime on this CPU. We would either get old
* or new value with a side effect of accounting a slice of irq time to wrong
* task when irq is in progress while we read rq->clock. That is a worthy
* compromise in place of having locks on each irq in account_system_time.
@@ -42,7 +42,7 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq);
* Called before incrementing preempt_count on {soft,}irq_enter
* and before decrementing preempt_count on {soft,}irq_exit.
*/
-void account_system_vtime(struct task_struct *curr)
+void account_vtime(struct task_struct *curr)
{
unsigned long flags;
s64 delta;
@@ -72,7 +72,7 @@ void account_system_vtime(struct task_struct *curr)
irq_time_write_end();
local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(account_system_vtime);
+EXPORT_SYMBOL_GPL(account_vtime);
static int irqtime_account_hi_update(void)
{
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 671f959..8e20a7d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -212,7 +212,7 @@ asmlinkage void __do_softirq(void)
int cpu;
pending = local_softirq_pending();
- account_system_vtime(current);
+ account_vtime(current);
__local_bh_disable((unsigned long)__builtin_return_address(0),
SOFTIRQ_OFFSET);
@@ -263,7 +263,7 @@ restart:
lockdep_softirq_exit();
- account_system_vtime(current);
+ account_vtime(current);
__local_bh_enable(SOFTIRQ_OFFSET);
}
@@ -331,7 +331,7 @@ static inline void invoke_softirq(void)
*/
void irq_exit(void)
{
- account_system_vtime(current);
+ account_vtime(current);
trace_hardirq_exit();
sub_preempt_count(IRQ_EXIT_OFFSET);
if (!in_interrupt() && local_softirq_pending())
--
1.7.5.4
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH 5/5] cputime: Generic on-demand virtual cputime accounting
2012-08-03 15:02 [RFC PATCH 0/5] cputime: Generic virtual based cputime accounting v2 Frederic Weisbecker
` (3 preceding siblings ...)
2012-08-03 15:02 ` [PATCH 4/5] cputime: Rename account_system_vtime to account_vtime Frederic Weisbecker
@ 2012-08-03 15:02 ` Frederic Weisbecker
4 siblings, 0 replies; 11+ messages in thread
From: Frederic Weisbecker @ 2012-08-03 15:02 UTC (permalink / raw)
To: LKML
Cc: Frederic Weisbecker, Alessio Igor Bogani, Andrew Morton,
Avi Kivity, Chris Metcalf, Christoph Lameter, Geoff Levand,
Gilad Ben Yossef, Hakan Akkan, H. Peter Anvin, Ingo Molnar,
Kevin Hilman, Max Krasnyansky, Paul E. McKenney, Peter Zijlstra,
Stephen Hemminger, Steven Rostedt, Sven-Thorsten Dietrich,
Thomas Gleixner
If we want to stop the tick beyond the idle case, we need to be
able to account the cputime without using the tick.
Virtual based cputime accounting solves that problem by
hooking into kernel/user boundaries.
However implementing CONFIG_VIRT_CPU_ACCOUNTING natively requires
setting low level hooks and involves more overhead. But
we already have a generic code domain tracking subsystem
that is required for RCU by archs that want to
shut down the tick outside idle.
This patch implements a generic virtual based cputime
accounting that relies on these generic kernel/user hooks.
There are some upsides of doing this:
- This requires no arch code to implement CONFIG_VIRT_CPU_ACCOUNTING
if code domain tracking is already built (already necessary for RCU in full
tickless mode).
- We can rely on the generic code domain tracking subsystem to dynamically
(de)activate the hooks, so that we can switch anytime between virtual
and tick based accounting. This way we don't have the overhead
of the virtual accounting when the tick is running periodically.
And a few downsides:
- It relies on jiffies and the hooks are set in high level code. This
results in less precise cputime accounting than with a true native
virtual based cputime accounting, which hooks on low level code and uses
a CPU hardware clock. Precision is not the goal of this though.
- There is probably more overhead than a native virtual based cputime
accounting. But this relies on hooks that are already set anyway.
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Alessio Igor Bogani <abogani@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Avi Kivity <avi@redhat.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Geoff Levand <geoff@infradead.org>
Cc: Gilad Ben Yossef <gilad@benyossef.com>
Cc: Hakan Akkan <hakanakkan@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Kevin Hilman <khilman@ti.com>
Cc: Max Krasnyansky <maxk@qualcomm.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Sven-Thorsten Dietrich <thebigcorporation@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
include/asm-generic/cputime.h | 2 +
include/linux/code_domain.h | 26 ++++++++++
include/linux/kernel_stat.h | 12 ++++-
init/Kconfig | 11 ++++-
kernel/code_domain_tracking.c | 20 ++------
kernel/sched/cputime.c | 112 +++++++++++++++++++++++++++++++++++++++++
6 files changed, 166 insertions(+), 17 deletions(-)
diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h
index 212c8bb..2a78aa7 100644
--- a/include/asm-generic/cputime.h
+++ b/include/asm-generic/cputime.h
@@ -66,9 +66,11 @@ typedef u64 __nocast cputime64_t;
#define cputime64_to_clock_t(__ct) \
jiffies_64_to_clock_t(cputime64_to_jiffies64(__ct))
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_GEN
static inline bool accounting_vtime(void)
{
return false;
}
+#endif
#endif
diff --git a/include/linux/code_domain.h b/include/linux/code_domain.h
index 5d4513d..4d47a7b 100644
--- a/include/linux/code_domain.h
+++ b/include/linux/code_domain.h
@@ -4,6 +4,32 @@
#ifdef CONFIG_CODE_DOMAIN_TRACKING
#include <linux/sched.h>
+struct code_domain_tracking {
+ /*
+ * When tracking_active is false, hooks are not
+ * set to minimize overhead: TIF flags are cleared
+ * and calls to user_enter/exit are ignored. This
+ * may be further optimized using static keys.
+ */
+ bool tracking_active;
+ enum {
+ IN_KERNEL = 0,
+ IN_USER,
+ } state;
+};
+
+DECLARE_PER_CPU(struct code_domain_tracking, code_domain);
+
+static inline bool code_domain_in_user(void)
+{
+ return __this_cpu_read(code_domain.state) == IN_USER;
+}
+
+static inline bool code_domain_tracking_active(void)
+{
+ return __this_cpu_read(code_domain.tracking_active);
+}
+
extern void user_enter(void);
extern void user_exit(void);
extern void code_domain_task_switch(struct task_struct *prev,
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 1270b86..6e509a9 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -130,13 +130,23 @@ extern void account_process_tick(struct task_struct *, int user);
extern void account_steal_ticks(unsigned long ticks);
extern void account_idle_ticks(unsigned long ticks);
+
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
extern void account_switch_vtime(struct task_struct *prev);
-extern bool account_process_tick_vtime(struct task_struct *p, int user_tick);
+extern void account_process_tick_vtime(struct task_struct *p, int user_tick);
#else
static inline void account_switch_vtime(struct task_struct *prev) { }
static inline void account_process_tick_vtime(struct task_struct *p,
int user_tick) { }
#endif
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+void account_system_vtime(struct task_struct *tsk);
+void account_user_vtime(struct task_struct *tsk);
+bool accounting_vtime(void);
+#else
+static inline void account_system_vtime(struct task_struct *tsk) { }
+static inline void account_user_vtime(struct task_struct *tsk) { }
+#endif
+
#endif /* _LINUX_KERNEL_STAT_H */
diff --git a/init/Kconfig b/init/Kconfig
index cf5e86e..1bf2e43 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -269,7 +269,9 @@ config POSIX_MQUEUE_SYSCTL
config VIRT_CPU_ACCOUNTING
bool "Deterministic task and CPU time accounting"
- depends on HAVE_VIRT_CPU_ACCOUNTING && !IRQ_TIME_ACCOUNTING
+ depends on HAVE_VIRT_CPU_ACCOUNTING || HAVE_CODE_DOMAIN_TRACKING
+ depends on !IRQ_TIME_ACCOUNTING
+ select VIRT_CPU_ACCOUNTING_GEN if !HAVE_VIRT_CPU_ACCOUNTING
default y if PPC64
help
Select this option to enable more accurate task and CPU time
@@ -280,6 +282,13 @@ config VIRT_CPU_ACCOUNTING
stolen time on logically-partitioned systems running on
IBM POWER5-based machines.
+config VIRT_CPU_ACCOUNTING_GEN
+ select CODE_DOMAIN_TRACKING
+ bool
+ help
+ Implement a generic virtual based cputime accounting by using
+ the code domain tracking subsystem.
+
config BSD_PROCESS_ACCT
bool "BSD Process Accounting"
help
diff --git a/kernel/code_domain_tracking.c b/kernel/code_domain_tracking.c
index 8332c76..19c2bc0 100644
--- a/kernel/code_domain_tracking.c
+++ b/kernel/code_domain_tracking.c
@@ -1,21 +1,7 @@
#include <linux/code_domain.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
-#include <linux/percpu.h>
-
-struct code_domain_tracking {
- /*
- * When tracking_active is false, hooks are not
- * set to minimize overhead: TIF flags are cleared
- * and calls to user_enter/exit are ignored. This
- * may be further optimized using static keys.
- */
- bool tracking_active;
- enum {
- IN_KERNEL = 0,
- IN_USER,
- } state;
-};
+#include <linux/kernel_stat.h>
DEFINE_PER_CPU(struct code_domain_tracking, code_domain) = {
#ifdef CONFIG_CODE_DOMAIN_TRACKING_FORCE
@@ -32,6 +18,8 @@ void user_enter(void)
if (__this_cpu_read(code_domain.tracking_active) &&
__this_cpu_read(code_domain.state) != IN_USER) {
__this_cpu_write(code_domain.state, IN_USER);
+ if (accounting_vtime())
+ account_system_vtime(current);
rcu_user_enter();
}
local_irq_restore(flags);
@@ -44,6 +32,8 @@ void user_exit(void)
local_irq_save(flags);
if (__this_cpu_read(code_domain.state) == IN_USER) {
__this_cpu_write(code_domain.state, IN_KERNEL);
+ if (accounting_vtime())
+ account_user_vtime(current);
rcu_user_exit();
}
local_irq_restore(flags);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 53b03cc..d55d9cf 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -2,6 +2,7 @@
#include <linux/sched.h>
#include <linux/tsacct_kern.h>
#include <linux/kernel_stat.h>
+#include <linux/code_domain.h>
#include "sched.h"
@@ -490,3 +491,114 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
*ut = sig->prev_utime;
*st = sig->prev_stime;
}
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+static DEFINE_PER_CPU(long, last_jiffies) = INITIAL_JIFFIES;
+
+static cputime_t get_vtime_delta(void)
+{
+ long delta;
+
+ delta = jiffies - __this_cpu_read(last_jiffies);
+ __this_cpu_add(last_jiffies, delta);
+
+ return jiffies_to_cputime(delta);
+}
+
+void account_system_vtime(struct task_struct *tsk)
+{
+ cputime_t delta_cpu = get_vtime_delta();
+
+ account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu));
+}
+
+void account_user_vtime(struct task_struct *tsk)
+{
+ cputime_t delta_cpu = get_vtime_delta();
+
+ account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
+}
+
+static void account_idle_vtime(void)
+{
+ cputime_t delta_cpu = get_vtime_delta();
+
+ account_idle_time(delta_cpu);
+}
+
+void account_vtime(struct task_struct *tsk)
+{
+ unsigned long count = irq_count();
+
+ if (!count) {
+ /*
+ * If we interrupted user, code_domain_in_user()
+ * is 1 because the code domain tracking doesn't hook
+ * on irq entry/exit. This way we know if
+ * we need to flush user time on kernel entry.
+ */
+ if (code_domain_in_user())
+ account_user_vtime(tsk);
+ } else {
+ if (count == HARDIRQ_OFFSET ||
+ count == SOFTIRQ_OFFSET) {
+ if (is_idle_task(tsk))
+ account_idle_vtime();
+ else
+ account_system_vtime(tsk);
+ }
+ }
+}
+
+void account_switch_vtime(struct task_struct *prev)
+{
+ if (is_idle_task(prev))
+ account_idle_vtime();
+ else
+ account_system_vtime(prev);
+}
+
+/*
+ * This is a kind of hack: if we flush user time only on
+ * irq entry, we miss the jiffies update and the time is spuriously
+ * accounted to system time.
+ */
+void account_process_tick_vtime(struct task_struct *p, int user_tick)
+{
+ if (code_domain_in_user())
+ account_user_vtime(p);
+}
+
+bool accounting_vtime(void)
+{
+ return code_domain_tracking_active();
+}
+
+static int __cpuinit vtime_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+ long *last_jiffies_cpu = per_cpu_ptr(&last_jiffies, cpu);
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ /*
+ * CHECKME: ensure that's visible by the CPU
+ * once it wakes up
+ */
+ *last_jiffies_cpu = jiffies;
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static int __init init_vtime(void)
+{
+ cpu_notifier(vtime_cpu_notify, 0);
+ return 0;
+}
+early_initcall(init_vtime);
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
--
1.7.5.4
^ permalink raw reply related [flat|nested] 11+ messages in thread