* [RFC][PATCH v3 1/5] sparc64: Fix asm/percpu.h build error
From: Peter Zijlstra @ 2020-05-29 21:35 UTC (permalink / raw)
To: mingo, will, tglx
Cc: linux-s390, peterz, bigeasy, x86, heiko.carstens, linux-kernel,
rostedt, a.darwish, sparclinux, linuxppc-dev, davem
In-Reply-To: <20200529213550.683440625@infradead.org>
In order to break a header dependency between lockdep and task_struct,
I need per-cpu stuff from lockdep.
Including <asm/percpu.h> from lockdep.h gives a build error, this
patch cures that, but results in the following warning:
../arch/sparc/include/asm/percpu_64.h:7:24: warning: call-clobbered register used for global register variable
register unsigned long __local_per_cpu_offset asm("g5");
But i've no idea how to fix that :/ but it does build.
Not-Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
arch/sparc/include/asm/trap_block.h | 2 ++
1 file changed, 2 insertions(+)
--- a/arch/sparc/include/asm/trap_block.h
+++ b/arch/sparc/include/asm/trap_block.h
@@ -2,6 +2,8 @@
#ifndef _SPARC_TRAP_BLOCK_H
#define _SPARC_TRAP_BLOCK_H
+#include <linux/threads.h>
+
#include <asm/hypervisor.h>
#include <asm/asi.h>
^ permalink raw reply
* [PATCH v3 5/5] lockdep: Remove lockdep_hardirq{s_enabled, _context}() argument
From: Peter Zijlstra @ 2020-05-29 21:35 UTC (permalink / raw)
To: mingo, will, tglx
Cc: linux-s390, peterz, bigeasy, x86, heiko.carstens, linux-kernel,
rostedt, a.darwish, sparclinux, linuxppc-dev, davem
In-Reply-To: <20200529213550.683440625@infradead.org>
Now that the macros use per-cpu data, we no longer need the argument.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
arch/x86/entry/common.c | 2 +-
include/linux/irqflags.h | 8 ++++----
include/linux/lockdep.h | 2 +-
kernel/locking/lockdep.c | 30 +++++++++++++++---------------
kernel/softirq.c | 2 +-
tools/include/linux/irqflags.h | 4 ++--
6 files changed, 24 insertions(+), 24 deletions(-)
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -689,7 +689,7 @@ noinstr void idtentry_exit_user(struct p
noinstr bool idtentry_enter_nmi(struct pt_regs *regs)
{
- bool irq_state = lockdep_hardirqs_enabled(current);
+ bool irq_state = lockdep_hardirqs_enabled();
__nmi_enter();
lockdep_hardirqs_off(CALLER_ADDR0);
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -40,9 +40,9 @@ DECLARE_PER_CPU(int, hardirq_context);
extern void trace_hardirqs_off_finish(void);
extern void trace_hardirqs_on(void);
extern void trace_hardirqs_off(void);
-# define lockdep_hardirq_context(p) (this_cpu_read(hardirq_context))
+# define lockdep_hardirq_context() (this_cpu_read(hardirq_context))
# define lockdep_softirq_context(p) ((p)->softirq_context)
-# define lockdep_hardirqs_enabled(p) (this_cpu_read(hardirqs_enabled))
+# define lockdep_hardirqs_enabled() (this_cpu_read(hardirqs_enabled))
# define lockdep_softirqs_enabled(p) ((p)->softirqs_enabled)
# define lockdep_hardirq_enter() \
do { \
@@ -109,9 +109,9 @@ do { \
# define trace_hardirqs_off_finish() do { } while (0)
# define trace_hardirqs_on() do { } while (0)
# define trace_hardirqs_off() do { } while (0)
-# define lockdep_hardirq_context(p) 0
+# define lockdep_hardirq_context() 0
# define lockdep_softirq_context(p) 0
-# define lockdep_hardirqs_enabled(p) 0
+# define lockdep_hardirqs_enabled() 0
# define lockdep_softirqs_enabled(p) 0
# define lockdep_hardirq_enter() do { } while (0)
# define lockdep_hardirq_threaded() do { } while (0)
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -736,7 +736,7 @@ do { \
# define lockdep_assert_RT_in_threaded_ctx() do { \
WARN_ONCE(debug_locks && !current->lockdep_recursion && \
- lockdep_hardirq_context(current) && \
+ lockdep_hardirq_context() && \
!(current->hardirq_threaded || current->irq_config), \
"Not in threaded context on PREEMPT_RT as expected\n"); \
} while (0)
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2062,9 +2062,9 @@ print_bad_irq_dependency(struct task_str
pr_warn("-----------------------------------------------------\n");
pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
curr->comm, task_pid_nr(curr),
- lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
+ lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT,
curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT,
- lockdep_hardirqs_enabled(curr),
+ lockdep_hardirqs_enabled(),
curr->softirqs_enabled);
print_lock(next);
@@ -3331,9 +3331,9 @@ print_usage_bug(struct task_struct *curr
pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
curr->comm, task_pid_nr(curr),
- lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
+ lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT,
lockdep_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
- lockdep_hardirqs_enabled(curr),
+ lockdep_hardirqs_enabled(),
lockdep_softirqs_enabled(curr));
print_lock(this);
@@ -3655,7 +3655,7 @@ void lockdep_hardirqs_on_prepare(unsigne
if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion & LOCKDEP_RECURSION_MASK))
return;
- if (unlikely(lockdep_hardirqs_enabled(current))) {
+ if (unlikely(lockdep_hardirqs_enabled())) {
/*
* Neither irq nor preemption are disabled here
* so this is racy by nature but losing one hit
@@ -3683,7 +3683,7 @@ void lockdep_hardirqs_on_prepare(unsigne
* Can't allow enabling interrupts while in an interrupt handler,
* that's general bad form and such. Recursion, limited stack etc..
*/
- if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context(current)))
+ if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context()))
return;
current->hardirq_chain_key = current->curr_chain_key;
@@ -3718,7 +3718,7 @@ void noinstr lockdep_hardirqs_on(unsigne
if (DEBUG_LOCKS_WARN_ON(curr->lockdep_recursion & LOCKDEP_RECURSION_MASK))
return;
- if (lockdep_hardirqs_enabled(curr)) {
+ if (lockdep_hardirqs_enabled()) {
/*
* Neither irq nor preemption are disabled here
* so this is racy by nature but losing one hit
@@ -3772,7 +3772,7 @@ void noinstr lockdep_hardirqs_off(unsign
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return;
- if (lockdep_hardirqs_enabled(curr)) {
+ if (lockdep_hardirqs_enabled()) {
/*
* We have done an ON -> OFF transition:
*/
@@ -3821,7 +3821,7 @@ void lockdep_softirqs_on(unsigned long i
* usage bit for all held locks, if hardirqs are
* enabled too:
*/
- if (lockdep_hardirqs_enabled(curr))
+ if (lockdep_hardirqs_enabled())
mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ);
lockdep_recursion_finish();
}
@@ -3870,7 +3870,7 @@ mark_usage(struct task_struct *curr, str
*/
if (!hlock->trylock) {
if (hlock->read) {
- if (lockdep_hardirq_context(curr))
+ if (lockdep_hardirq_context())
if (!mark_lock(curr, hlock,
LOCK_USED_IN_HARDIRQ_READ))
return 0;
@@ -3879,7 +3879,7 @@ mark_usage(struct task_struct *curr, str
LOCK_USED_IN_SOFTIRQ_READ))
return 0;
} else {
- if (lockdep_hardirq_context(curr))
+ if (lockdep_hardirq_context())
if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ))
return 0;
if (curr->softirq_context)
@@ -3917,7 +3917,7 @@ mark_usage(struct task_struct *curr, str
static inline unsigned int task_irq_context(struct task_struct *task)
{
- return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context(task) +
+ return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context() +
LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context;
}
@@ -4010,7 +4010,7 @@ static inline short task_wait_context(st
* Set appropriate wait type for the context; for IRQs we have to take
* into account force_irqthread as that is implied by PREEMPT_RT.
*/
- if (lockdep_hardirq_context(curr)) {
+ if (lockdep_hardirq_context()) {
/*
* Check if force_irqthreads will run us threaded.
*/
@@ -4853,11 +4853,11 @@ static void check_flags(unsigned long fl
return;
if (irqs_disabled_flags(flags)) {
- if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled(current))) {
+ if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled())) {
printk("possible reason: unannotated irqs-off.\n");
}
} else {
- if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled(current))) {
+ if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled())) {
printk("possible reason: unannotated irqs-on.\n");
}
}
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -230,7 +230,7 @@ static inline bool lockdep_softirq_start
{
bool in_hardirq = false;
- if (lockdep_hardirq_context(current)) {
+ if (lockdep_hardirq_context()) {
in_hardirq = true;
lockdep_hardirq_exit();
}
--- a/tools/include/linux/irqflags.h
+++ b/tools/include/linux/irqflags.h
@@ -2,9 +2,9 @@
#ifndef _LIBLOCKDEP_LINUX_TRACE_IRQFLAGS_H_
#define _LIBLOCKDEP_LINUX_TRACE_IRQFLAGS_H_
-# define lockdep_hardirq_context(p) 0
+# define lockdep_hardirq_context() 0
# define lockdep_softirq_context(p) 0
-# define lockdep_hardirqs_enabled(p) 0
+# define lockdep_hardirqs_enabled() 0
# define lockdep_softirqs_enabled(p) 0
# define lockdep_hardirq_enter() do { } while (0)
# define lockdep_hardirq_exit() do { } while (0)
^ permalink raw reply
* [PATCH v3 4/5] lockdep: Change hardirq{s_enabled, _context} to per-cpu variables
From: Peter Zijlstra @ 2020-05-29 21:35 UTC (permalink / raw)
To: mingo, will, tglx
Cc: linux-s390, peterz, bigeasy, x86, heiko.carstens, linux-kernel,
rostedt, a.darwish, sparclinux, linuxppc-dev, davem
In-Reply-To: <20200529213550.683440625@infradead.org>
Currently all IRQ-tracking state is in task_struct, this means that
task_struct needs to be defined before we use it.
Especially for lockdep_assert_irq*() this can lead to header-hell.
Move the hardirq state into per-cpu variables to avoid the task_struct
dependency.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/irqflags.h | 19 ++++++++++++-------
include/linux/lockdep.h | 34 ++++++++++++++++++----------------
include/linux/sched.h | 2 --
kernel/fork.c | 4 +---
kernel/locking/lockdep.c | 30 +++++++++++++++---------------
kernel/softirq.c | 6 ++++++
6 files changed, 52 insertions(+), 43 deletions(-)
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -14,6 +14,7 @@
#include <linux/typecheck.h>
#include <asm/irqflags.h>
+#include <asm/percpu.h>
/* Currently lockdep_softirqs_on/off is used only by lockdep */
#ifdef CONFIG_PROVE_LOCKING
@@ -31,18 +32,22 @@
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
+
+DECLARE_PER_CPU(int, hardirqs_enabled);
+DECLARE_PER_CPU(int, hardirq_context);
+
extern void trace_hardirqs_on_prepare(void);
extern void trace_hardirqs_off_finish(void);
extern void trace_hardirqs_on(void);
extern void trace_hardirqs_off(void);
-# define lockdep_hardirq_context(p) ((p)->hardirq_context)
+# define lockdep_hardirq_context(p) (this_cpu_read(hardirq_context))
# define lockdep_softirq_context(p) ((p)->softirq_context)
-# define lockdep_hardirqs_enabled(p) ((p)->hardirqs_enabled)
+# define lockdep_hardirqs_enabled(p) (this_cpu_read(hardirqs_enabled))
# define lockdep_softirqs_enabled(p) ((p)->softirqs_enabled)
-# define lockdep_hardirq_enter() \
-do { \
- if (!current->hardirq_context++) \
- current->hardirq_threaded = 0; \
+# define lockdep_hardirq_enter() \
+do { \
+ if (this_cpu_inc_return(hardirq_context) == 1) \
+ current->hardirq_threaded = 0; \
} while (0)
# define lockdep_hardirq_threaded() \
do { \
@@ -50,7 +55,7 @@ do { \
} while (0)
# define lockdep_hardirq_exit() \
do { \
- current->hardirq_context--; \
+ this_cpu_dec(hardirq_context); \
} while (0)
# define lockdep_softirq_enter() \
do { \
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -19,6 +19,7 @@ extern int lock_stat;
#define MAX_LOCKDEP_SUBCLASSES 8UL
+#include <asm/percpu.h>
#include <linux/types.h>
enum lockdep_wait_type {
@@ -703,28 +704,29 @@ do { \
lock_release(&(lock)->dep_map, _THIS_IP_); \
} while (0)
-#define lockdep_assert_irqs_enabled() do { \
- WARN_ONCE(debug_locks && !current->lockdep_recursion && \
- !current->hardirqs_enabled, \
- "IRQs not enabled as expected\n"); \
- } while (0)
+DECLARE_PER_CPU(int, hardirqs_enabled);
+DECLARE_PER_CPU(int, hardirq_context);
-#define lockdep_assert_irqs_disabled() do { \
- WARN_ONCE(debug_locks && !current->lockdep_recursion && \
- current->hardirqs_enabled, \
- "IRQs not disabled as expected\n"); \
- } while (0)
+#define lockdep_assert_irqs_enabled() \
+do { \
+ WARN_ON_ONCE(debug_locks && !this_cpu_read(hardirqs_enabled)); \
+} while (0)
-#define lockdep_assert_in_irq() do { \
- WARN_ONCE(debug_locks && !current->lockdep_recursion && \
- !current->hardirq_context, \
- "Not in hardirq as expected\n"); \
- } while (0)
+#define lockdep_assert_irqs_disabled() \
+do { \
+ WARN_ON_ONCE(debug_locks && this_cpu_read(hardirqs_enabled)); \
+} while (0)
+
+#define lockdep_assert_in_irq() \
+do { \
+ WARN_ON_ONCE(debug_locks && !this_cpu_read(hardirq_context)); \
+} while (0)
#else
# define might_lock(lock) do { } while (0)
# define might_lock_read(lock) do { } while (0)
# define might_lock_nested(lock, subclass) do { } while (0)
+
# define lockdep_assert_irqs_enabled() do { } while (0)
# define lockdep_assert_irqs_disabled() do { } while (0)
# define lockdep_assert_in_irq() do { } while (0)
@@ -734,7 +736,7 @@ do { \
# define lockdep_assert_RT_in_threaded_ctx() do { \
WARN_ONCE(debug_locks && !current->lockdep_recursion && \
- current->hardirq_context && \
+ lockdep_hardirq_context(current) && \
!(current->hardirq_threaded || current->irq_config), \
"Not in threaded context on PREEMPT_RT as expected\n"); \
} while (0)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -991,8 +991,6 @@ struct task_struct {
unsigned long hardirq_disable_ip;
unsigned int hardirq_enable_event;
unsigned int hardirq_disable_event;
- int hardirqs_enabled;
- int hardirq_context;
u64 hardirq_chain_key;
unsigned long softirq_disable_ip;
unsigned long softirq_enable_ip;
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1946,8 +1946,8 @@ static __latent_entropy struct task_stru
rt_mutex_init_task(p);
+ lockdep_assert_irqs_enabled();
#ifdef CONFIG_PROVE_LOCKING
- DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
retval = -EAGAIN;
@@ -2028,7 +2028,6 @@ static __latent_entropy struct task_stru
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
p->irq_events = 0;
- p->hardirqs_enabled = 0;
p->hardirq_enable_ip = 0;
p->hardirq_enable_event = 0;
p->hardirq_disable_ip = _THIS_IP_;
@@ -2038,7 +2037,6 @@ static __latent_entropy struct task_stru
p->softirq_enable_event = 0;
p->softirq_disable_ip = 0;
p->softirq_disable_event = 0;
- p->hardirq_context = 0;
p->softirq_context = 0;
#endif
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2062,9 +2062,9 @@ print_bad_irq_dependency(struct task_str
pr_warn("-----------------------------------------------------\n");
pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
curr->comm, task_pid_nr(curr),
- curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
+ lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT,
- curr->hardirqs_enabled,
+ lockdep_hardirqs_enabled(curr),
curr->softirqs_enabled);
print_lock(next);
@@ -3655,7 +3655,7 @@ void lockdep_hardirqs_on_prepare(unsigne
if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion & LOCKDEP_RECURSION_MASK))
return;
- if (unlikely(current->hardirqs_enabled)) {
+ if (unlikely(lockdep_hardirqs_enabled(current))) {
/*
* Neither irq nor preemption are disabled here
* so this is racy by nature but losing one hit
@@ -3683,7 +3683,7 @@ void lockdep_hardirqs_on_prepare(unsigne
* Can't allow enabling interrupts while in an interrupt handler,
* that's general bad form and such. Recursion, limited stack etc..
*/
- if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
+ if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context(current)))
return;
current->hardirq_chain_key = current->curr_chain_key;
@@ -3718,7 +3718,7 @@ void noinstr lockdep_hardirqs_on(unsigne
if (DEBUG_LOCKS_WARN_ON(curr->lockdep_recursion & LOCKDEP_RECURSION_MASK))
return;
- if (curr->hardirqs_enabled) {
+ if (lockdep_hardirqs_enabled(curr)) {
/*
* Neither irq nor preemption are disabled here
* so this is racy by nature but losing one hit
@@ -3745,7 +3745,7 @@ void noinstr lockdep_hardirqs_on(unsigne
skip_checks:
/* we'll do an OFF -> ON transition: */
- curr->hardirqs_enabled = 1;
+ this_cpu_write(hardirqs_enabled, 1);
curr->hardirq_enable_ip = ip;
curr->hardirq_enable_event = ++curr->irq_events;
debug_atomic_inc(hardirqs_on_events);
@@ -3772,11 +3772,11 @@ void noinstr lockdep_hardirqs_off(unsign
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return;
- if (curr->hardirqs_enabled) {
+ if (lockdep_hardirqs_enabled(curr)) {
/*
* We have done an ON -> OFF transition:
*/
- curr->hardirqs_enabled = 0;
+ this_cpu_write(hardirqs_enabled, 0);
curr->hardirq_disable_ip = ip;
curr->hardirq_disable_event = ++curr->irq_events;
debug_atomic_inc(hardirqs_off_events);
@@ -3821,7 +3821,7 @@ void lockdep_softirqs_on(unsigned long i
* usage bit for all held locks, if hardirqs are
* enabled too:
*/
- if (curr->hardirqs_enabled)
+ if (lockdep_hardirqs_enabled(curr))
mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ);
lockdep_recursion_finish();
}
@@ -3870,7 +3870,7 @@ mark_usage(struct task_struct *curr, str
*/
if (!hlock->trylock) {
if (hlock->read) {
- if (curr->hardirq_context)
+ if (lockdep_hardirq_context(curr))
if (!mark_lock(curr, hlock,
LOCK_USED_IN_HARDIRQ_READ))
return 0;
@@ -3879,7 +3879,7 @@ mark_usage(struct task_struct *curr, str
LOCK_USED_IN_SOFTIRQ_READ))
return 0;
} else {
- if (curr->hardirq_context)
+ if (lockdep_hardirq_context(curr))
if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ))
return 0;
if (curr->softirq_context)
@@ -3917,7 +3917,7 @@ mark_usage(struct task_struct *curr, str
static inline unsigned int task_irq_context(struct task_struct *task)
{
- return LOCK_CHAIN_HARDIRQ_CONTEXT * !!task->hardirq_context +
+ return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context(task) +
LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context;
}
@@ -4010,7 +4010,7 @@ static inline short task_wait_context(st
* Set appropriate wait type for the context; for IRQs we have to take
* into account force_irqthread as that is implied by PREEMPT_RT.
*/
- if (curr->hardirq_context) {
+ if (lockdep_hardirq_context(curr)) {
/*
* Check if force_irqthreads will run us threaded.
*/
@@ -4853,11 +4853,11 @@ static void check_flags(unsigned long fl
return;
if (irqs_disabled_flags(flags)) {
- if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) {
+ if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled(current))) {
printk("possible reason: unannotated irqs-off.\n");
}
} else {
- if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) {
+ if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled(current))) {
printk("possible reason: unannotated irqs-on.\n");
}
}
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -107,6 +107,12 @@ static bool ksoftirqd_running(unsigned l
* where hardirqs are disabled legitimately:
*/
#ifdef CONFIG_TRACE_IRQFLAGS
+
+DEFINE_PER_CPU(int, hardirqs_enabled);
+DEFINE_PER_CPU(int, hardirq_context);
+EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled);
+EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context);
+
void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
unsigned long flags;
^ permalink raw reply
* [PATCH v3 3/5] s390: Break cyclic percpu include
From: Peter Zijlstra @ 2020-05-29 21:35 UTC (permalink / raw)
To: mingo, will, tglx
Cc: linux-s390, peterz, bigeasy, x86, heiko.carstens, linux-kernel,
rostedt, a.darwish, sparclinux, linuxppc-dev, davem
In-Reply-To: <20200529213550.683440625@infradead.org>
In order to use <asm/percpu.h> in irqflags.h, we need to make sure
asm/percpu.h does not itself depend on irqflags.h
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
arch/s390/include/asm/smp.h | 1 +
arch/s390/include/asm/thread_info.h | 1 -
2 files changed, 1 insertion(+), 1 deletion(-)
--- a/arch/s390/include/asm/smp.h
+++ b/arch/s390/include/asm/smp.h
@@ -10,6 +10,7 @@
#include <asm/sigp.h>
#include <asm/lowcore.h>
+#include <asm/processor.h>
#define raw_smp_processor_id() (S390_lowcore.cpu_nr)
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -24,7 +24,6 @@
#ifndef __ASSEMBLY__
#include <asm/lowcore.h>
#include <asm/page.h>
-#include <asm/processor.h>
#define STACK_INIT_OFFSET \
(THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs))
^ permalink raw reply
* Re: [musl] ppc64le and 32-bit LE userland compatibility
From: Rich Felker @ 2020-05-29 19:24 UTC (permalink / raw)
To: Will Springer
Cc: libc-alpha, eery, daniel, musl, binutils, libc-dev, linuxppc-dev
In-Reply-To: <2047231.C4sosBPzcN@sheen>
On Fri, May 29, 2020 at 07:03:48PM +0000, Will Springer wrote:
> The next problem concerns the ABI more directly. The failure mode was `file`
> surfacing EINVAL from pread64 when invoked on an ELF; pread64 was passed a
> garbage value for `pos`, which didn't appear to be caused by anything in
> `file`. Initially it seemed as though the 32-bit components of the arg were
> getting swapped, and we made hacky fixes to glibc and musl to put them in the
> "right order"; however, we weren't sure if that was the correct approach, or
> if there were knock-on effects we didn't know about. So we found the relevant
> compat code path in the kernel, at arch/powerpc/kernel/sys_ppc32.c, where
> there exists this comment:
>
> > /*
> > * long long munging:
> > * The 32 bit ABI passes long longs in an odd even register pair.
> > */
>
> It seems that the opposite is true in LE mode, and something is expecting long
> longs to start on an even register. I realized this after I tried swapping hi/
> lo `u32`s here and didn't see an improvement. I whipped up a patch [6] that
> switches which syscalls use padding arguments depending on endianness, while
> hopefully remaining tidy enough to be unobtrusive. (I took some liberties with
> variable names/types so that the macro could be consistent.)
The argument passing for pread/pwrite is historically a mess and
differs between archs. musl has a dedicated macro that archs can
define to override it. But it looks like it should match regardless of
BE vs LE, and musl already defines it for powerpc with the default
definition, adding a zero arg to start on an even arg-slot index,
which is an odd register (since ppc32 args start with an odd one, r3).
> [6]: https://gist.github.com/Skirmisher/02891c1a8cafa0ff18b2460933ef4f3c
I don't think this is correct, but I'm confused about where it's
getting messed up because it looks like it should already be right.
> This was enough to fix up the `file` bug. I'm no seasoned kernel hacker,
> though, and there is still concern over the right way to approach this,
> whether it should live in the kernel or libc, etc. Frankly, I don't know the
> ABI structure enough to understand why the register padding has to be
> different in this case, or what lower-level component is responsible for it..
> For comparison, I had a look at the mips tree, since it's bi-endian and has a
> similar 32/64 situation. There is a macro conditional upon endianness that is
> responsible for munging long longs; it uses __MIPSEB__ and __MIPSEL__ instead
> of an if/else on the generic __LITTLE_ENDIAN__. Not sure what to make of that.
> (It also simply swaps registers for LE, unlike what I did for ppc.)
Indeed the problem is probably that you need to swap registers for LE,
not remove the padding slot. Did you check what happens if you pass a
value larger than 32 bits?
If so, the right way to fix this on the kernel side would be to
construct the value as a union rather than by bitwise ops so it's
endian-agnostic:
(union { u32 parts[2]; u64 val; }){{ arg1, arg2 }}.val
But the kernel folks might prefer endian ifdefs for some odd reason...
> Also worth noting is the one other outstanding bug, where the time-related
> syscalls in the 32-bit vDSO seem to return garbage. It doesn't look like an
> endian bug to me, and it doesn't affect standard syscalls (which is why if you
> run `date` on musl it prints the correct time, unlike on glibc). The vDSO time
> functions are implemented in ppc asm (arch/powerpc/kernel/vdso32/
> gettimeofday.S), and I've never touched the stuff, so if anyone has a clue I'm
> all ears.
Not sure about this. Worst-case, just leave it disabled until someone
finds a fix.
Rich
^ permalink raw reply
* Re: [RFC PATCH 1/2] libnvdimm: Add prctl control for disabling synchronous fault support.
From: Dan Williams @ 2020-05-29 19:22 UTC (permalink / raw)
To: Aneesh Kumar K.V
Cc: Jan Kara, linux-nvdimm, jack, Jeff Moyer, Oliver O'Halloran,
Michal Suchánek, linuxppc-dev
In-Reply-To: <7e8ee9e3-4d4d-e4b9-913b-1c2448adc62a@linux.ibm.com>
On Fri, May 29, 2020 at 3:55 AM Aneesh Kumar K.V
<aneesh.kumar@linux.ibm.com> wrote:
>
> On 5/29/20 3:22 PM, Jan Kara wrote:
> > Hi!
> >
> > On Fri 29-05-20 15:07:31, Aneesh Kumar K.V wrote:
> >> Thanks Michal. I also missed Jeff in this email thread.
> >
> > And I think you'll also need some of the sched maintainers for the prctl
> > bits...
> >
> >> On 5/29/20 3:03 PM, Michal Suchánek wrote:
> >>> Adding Jan
> >>>
> >>> On Fri, May 29, 2020 at 11:11:39AM +0530, Aneesh Kumar K.V wrote:
> >>>> With POWER10, architecture is adding new pmem flush and sync instructions.
> >>>> The kernel should prevent the usage of MAP_SYNC if applications are not using
> >>>> the new instructions on newer hardware.
> >>>>
> >>>> This patch adds a prctl option MAP_SYNC_ENABLE that can be used to enable
> >>>> the usage of MAP_SYNC. The kernel config option is added to allow the user
> >>>> to control whether MAP_SYNC should be enabled by default or not.
> >>>>
> >>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> > ...
> >>>> diff --git a/kernel/fork.c b/kernel/fork.c
> >>>> index 8c700f881d92..d5a9a363e81e 100644
> >>>> --- a/kernel/fork.c
> >>>> +++ b/kernel/fork.c
> >>>> @@ -963,6 +963,12 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
> >>>> static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
> >>>> +#ifdef CONFIG_ARCH_MAP_SYNC_DISABLE
> >>>> +unsigned long default_map_sync_mask = MMF_DISABLE_MAP_SYNC_MASK;
> >>>> +#else
> >>>> +unsigned long default_map_sync_mask = 0;
> >>>> +#endif
> >>>> +
> >
> > I'm not sure CONFIG is really the right approach here. For a distro that would
> > basically mean to disable MAP_SYNC for all PPC kernels unless application
> > explicitly uses the right prctl. Shouldn't we rather initialize
> > default_map_sync_mask on boot based on whether the CPU we run on requires
> > new flush instructions or not? Otherwise the patch looks sensible.
> >
>
> yes that is correct. We ideally want to deny MAP_SYNC only w.r.t
> POWER10. But on a virtualized platform there is no easy way to detect
> that. We could ideally hook this into the nvdimm driver where we look at
> the new compat string ibm,persistent-memory-v2 and then disable MAP_SYNC
> if we find a device with the specific value.
>
> BTW with the recent changes I posted for the nvdimm driver, older kernel
> won't initialize persistent memory device on newer hardware. Newer
> hardware will present the device to OS with a different device tree
> compat string.
>
> My expectation w.r.t this patch was, Distro would want to mark
> CONFIG_ARCH_MAP_SYNC_DISABLE=n based on the different application
> certification. Otherwise application will have to end up calling the
> prctl(MMF_DISABLE_MAP_SYNC, 0) any way. If that is the case, should this
> be dependent on P10?
>
> With that I am wondering should we even have this patch? Can we expect
> userspace get updated to use new instruction?.
>
> With ppc64 we never had a real persistent memory device available for
> end user to try. The available persistent memory stack was using vPMEM
> which was presented as a volatile memory region for which there is no
> need to use any of the flush instructions. We could safely assume that
> as we get applications certified/verified for working with pmem device
> on ppc64, they would all be using the new instructions?
I think prctl is the wrong interface for this. I was thinking a sysfs
interface along the same lines as /sys/block/pmemX/dax/write_cache.
That attribute is toggling DAXDEV_WRITE_CACHE for the determination of
whether the platform or the kernel needs to handle cache flushing
relative to power loss. A similar attribute can be established for
DAXDEV_SYNC, it would simply default to off based on a configuration
time policy, but be dynamically changeable at runtime via sysfs.
These flags are device properties that affect the kernel and
userspace's handling of persistence.
^ permalink raw reply
* Re: [PATCH v8 0/8] powerpc: switch VDSO to C implementation
From: Christophe Leroy @ 2020-05-29 18:56 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, nathanl
Cc: linux-arch, arnd, linux-kernel, luto, tglx, vincenzo.frascino,
linuxppc-dev
In-Reply-To: <cover.1588079622.git.christophe.leroy@c-s.fr>
Hi Michael,
Le 28/04/2020 à 15:16, Christophe Leroy a écrit :
> This is the seventh version of a series to switch powerpc VDSO to
> generic C implementation.
>
> Main changes since v7 are:
> - Added gettime64 on PPC32
>
> This series applies on today's powerpc/merge branch.
>
> See the last patches for details on changes and performance.
Do you have any plans for this series ?
Even if you don't feel like merging it this cycle, I think patches 1 to
3 are worth it.
Christophe
>
> Christophe Leroy (8):
> powerpc/vdso64: Switch from __get_datapage() to get_datapage inline
> macro
> powerpc/vdso: Remove __kernel_datapage_offset and simplify
> __get_datapage()
> powerpc/vdso: Remove unused \tmp param in __get_datapage()
> powerpc/processor: Move cpu_relax() into asm/vdso/processor.h
> powerpc/vdso: Prepare for switching VDSO to generic C implementation.
> powerpc/vdso: Switch VDSO to generic C implementation.
> lib/vdso: force inlining of __cvdso_clock_gettime_common()
> powerpc/vdso: Provide __kernel_clock_gettime64() on vdso32
>
> arch/powerpc/Kconfig | 2 +
> arch/powerpc/include/asm/clocksource.h | 7 +
> arch/powerpc/include/asm/processor.h | 10 +-
> arch/powerpc/include/asm/vdso/clocksource.h | 7 +
> arch/powerpc/include/asm/vdso/gettimeofday.h | 175 +++++++++++
> arch/powerpc/include/asm/vdso/processor.h | 23 ++
> arch/powerpc/include/asm/vdso/vsyscall.h | 25 ++
> arch/powerpc/include/asm/vdso_datapage.h | 50 ++--
> arch/powerpc/kernel/asm-offsets.c | 49 +--
> arch/powerpc/kernel/time.c | 91 +-----
> arch/powerpc/kernel/vdso.c | 58 +---
> arch/powerpc/kernel/vdso32/Makefile | 32 +-
> arch/powerpc/kernel/vdso32/cacheflush.S | 2 +-
> arch/powerpc/kernel/vdso32/config-fake32.h | 34 +++
> arch/powerpc/kernel/vdso32/datapage.S | 7 +-
> arch/powerpc/kernel/vdso32/gettimeofday.S | 300 +------------------
> arch/powerpc/kernel/vdso32/vdso32.lds.S | 8 +-
> arch/powerpc/kernel/vdso32/vgettimeofday.c | 35 +++
> arch/powerpc/kernel/vdso64/Makefile | 23 +-
> arch/powerpc/kernel/vdso64/cacheflush.S | 9 +-
> arch/powerpc/kernel/vdso64/datapage.S | 31 +-
> arch/powerpc/kernel/vdso64/gettimeofday.S | 243 +--------------
> arch/powerpc/kernel/vdso64/vdso64.lds.S | 7 +-
> arch/powerpc/kernel/vdso64/vgettimeofday.c | 29 ++
> lib/vdso/gettimeofday.c | 2 +-
> 25 files changed, 460 insertions(+), 799 deletions(-)
> create mode 100644 arch/powerpc/include/asm/clocksource.h
> create mode 100644 arch/powerpc/include/asm/vdso/clocksource.h
> create mode 100644 arch/powerpc/include/asm/vdso/gettimeofday.h
> create mode 100644 arch/powerpc/include/asm/vdso/processor.h
> create mode 100644 arch/powerpc/include/asm/vdso/vsyscall.h
> create mode 100644 arch/powerpc/kernel/vdso32/config-fake32.h
> create mode 100644 arch/powerpc/kernel/vdso32/vgettimeofday.c
> create mode 100644 arch/powerpc/kernel/vdso64/vgettimeofday.c
>
^ permalink raw reply
* Re: [PATCH] powerpc/32s: Fix another build failure with CONFIG_PPC_KUAP_DEBUG
From: Christophe Leroy @ 2020-05-29 18:53 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <5ce7982c72424eb3e7abf78063d454c38c42b343.1590778219.git.christophe.leroy@csgroup.eu>
Le 29/05/2020 à 20:50, Christophe Leroy a écrit :
> From: Christophe Leroy <christophe.leroy@c-s.fr>
>
> 'thread' doesn't exist in kuap_check() macro.
>
> Use 'current' instead.
>
> Fixes: a68c31fc01ef ("powerpc/32s: Implement Kernel Userspace Access Protection")
> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Argh, can you drop this line ?
> Cc: stable@vger.kernel.org
> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Reported-by: kbuild test robot <lkp@intel.com>
> ---
> arch/powerpc/include/asm/book3s/32/kup.h | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h
> index db0a1c281587..668508c8a1b5 100644
> --- a/arch/powerpc/include/asm/book3s/32/kup.h
> +++ b/arch/powerpc/include/asm/book3s/32/kup.h
> @@ -75,7 +75,7 @@
>
> .macro kuap_check current, gpr
> #ifdef CONFIG_PPC_KUAP_DEBUG
> - lwz \gpr, KUAP(thread)
> + lwz \gpr, THREAD + KUAP(\current)
> 999: twnei \gpr, 0
> EMIT_BUG_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE)
> #endif
>
^ permalink raw reply
* Re: [PATCH] powerpc/nvram: Replace kmalloc with kzalloc in the error message
From: Markus Elfring @ 2020-05-29 18:50 UTC (permalink / raw)
To: Michael Ellerman, Liao Pingfang, linuxppc-dev
Cc: Yi Wang, Tony Luck, Kees Cook, Wang Liang, Anton Vorontsov,
kernel-janitors, LKML, Colin Cross, Paul Mackerras, Xue Zhihong,
Greg Kroah-Hartman, Thomas Gleixner, Allison Randal
> Please just remove the message instead, it's a tiny allocation that's
> unlikely to ever fail, and the caller will print an error anyway.
How do you think about to take another look at a previous update suggestion
like the following?
powerpc/nvram: Delete three error messages for a failed memory allocation
https://patchwork.ozlabs.org/project/linuxppc-dev/patch/00845261-8528-d011-d3b8-e9355a231d3a@users.sourceforge.net/
https://lore.kernel.org/linuxppc-dev/00845261-8528-d011-d3b8-e9355a231d3a@users.sourceforge.net/
https://lore.kernel.org/patchwork/patch/752720/
https://lkml.org/lkml/2017/1/19/537
Regards,
Markus
^ permalink raw reply
* [PATCH] powerpc/32s: Fix another build failure with CONFIG_PPC_KUAP_DEBUG
From: Christophe Leroy @ 2020-05-29 18:50 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
From: Christophe Leroy <christophe.leroy@c-s.fr>
'thread' doesn't exist in kuap_check() macro.
Use 'current' instead.
Fixes: a68c31fc01ef ("powerpc/32s: Implement Kernel Userspace Access Protection")
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: stable@vger.kernel.org
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/include/asm/book3s/32/kup.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h
index db0a1c281587..668508c8a1b5 100644
--- a/arch/powerpc/include/asm/book3s/32/kup.h
+++ b/arch/powerpc/include/asm/book3s/32/kup.h
@@ -75,7 +75,7 @@
.macro kuap_check current, gpr
#ifdef CONFIG_PPC_KUAP_DEBUG
- lwz \gpr, KUAP(thread)
+ lwz \gpr, THREAD + KUAP(\current)
999: twnei \gpr, 0
EMIT_BUG_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE)
#endif
--
2.25.0
^ permalink raw reply related
* Re: [PATCH] ASoC: fsl_asrc_dma: Fix dma_chan leak when config DMA channel failed
From: Mark Brown @ 2020-05-29 16:51 UTC (permalink / raw)
To: Xiyu Yang, Liam Girdwood, Takashi Iwai, Timur Tabi, linux-kernel,
Jaroslav Kysela, alsa-devel, linuxppc-dev, Xiubo Li,
Fabio Estevam, Nicolin Chen
Cc: Xin Tan, yuanxzhang, kjlu
In-Reply-To: <1590415966-52416-1-git-send-email-xiyuyang19@fudan.edu.cn>
On Mon, 25 May 2020 22:12:46 +0800, Xiyu Yang wrote:
> fsl_asrc_dma_hw_params() invokes dma_request_channel() or
> fsl_asrc_get_dma_channel(), which returns a reference of the specified
> dma_chan object to "pair->dma_chan[dir]" with increased refcnt.
>
> The reference counting issue happens in one exception handling path of
> fsl_asrc_dma_hw_params(). When config DMA channel failed for Back-End,
> the function forgets to decrease the refcnt increased by
> dma_request_channel() or fsl_asrc_get_dma_channel(), causing a refcnt
> leak.
>
> [...]
Applied to
https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git for-next
Thanks!
[1/1] ASoC: fsl_asrc_dma: Fix dma_chan leak when config DMA channel failed
commit: 36124fb19f1ae68a500cd76a76d40c6e81bee346
All being well this means that it will be integrated into the linux-next
tree (usually sometime in the next 24 hours) and sent to Linus during
the next merge window (or sooner if it is a bug fix), however if
problems are discovered then the patch may be dropped or reverted.
You may get further e-mails resulting from automated or manual testing
and review of the tree, please engage with people reporting problems and
send followup patches addressing any issues that are reported if needed.
If any updates are required or you are submitting further changes they
should be sent as incremental updates against current git, existing
patches will not be replaced.
Please add any relevant lists and maintainers to the CCs when replying
to this mail.
Thanks,
Mark
^ permalink raw reply
* Re: [Intel-gfx] [PATCH 06/13] ocfs2: use new sysctl subdir helper register_sysctl_subdir()
From: Kees Cook @ 2020-05-29 14:49 UTC (permalink / raw)
To: Luis Chamberlain
Cc: jack, rafael, airlied, amir73il, clemens, dri-devel, joseph.qi,
sfr, mark, rdna, yzaikin, arnd, intel-gfx, julia.lawall, jlbec,
nixiaoming, vbabka, axboe, tytso, gregkh, linux-kernel, ebiederm,
akpm, linuxppc-dev, ocfs2-devel, viro
In-Reply-To: <20200529114912.GC11244@42.do-not-panic.com>
On Fri, May 29, 2020 at 11:49:12AM +0000, Luis Chamberlain wrote:
> Yikes, sense, you're right. Nope, I left the random config tests to
> 0day. Will fix, thanks!
Yeah, I do the same for randconfig, but I always do an "allmodconfig"
build before sending stuff. It's a good smoke test.
--
Kees Cook
^ permalink raw reply
* Re: [PATCH 12/13] sysctl: add helper to register empty subdir
From: Eric W. Biederman @ 2020-05-29 13:03 UTC (permalink / raw)
To: Luis Chamberlain
Cc: jack, rafael, airlied, amir73il, clemens, dri-devel, joseph.qi,
sfr, mark, rdna, yzaikin, joonas.lahtinen, keescook, arnd,
intel-gfx, jani.nikula, julia.lawall, viro, rodrigo.vivi,
nixiaoming, vbabka, axboe, tytso, gregkh, linux-kernel, daniel,
akpm, linuxppc-dev, ocfs2-devel, jlbec
In-Reply-To: <20200529074108.16928-13-mcgrof@kernel.org>
Luis Chamberlain <mcgrof@kernel.org> writes:
> The way to create a subdirectory from the base set of directories
> is a bit obscure, so provide a helper which makes this clear, and
> also helps remove boiler plate code required to do this work.
I agreee calling:
register_sysctl("fs/binfmt_misc", sysctl_mount_point)
is a bit obscure but if you are going to make a wrapper
please make it the trivial one liner above.
Say something that looks like:
struct sysctl_header *register_sysctl_mount_point(const char *path)
{
return register_sysctl(path, sysctl_mount_point);
}
And yes please talk about a mount point and not an empty dir, as these
are permanently empty directories to serve as mount points. There are
some subtle but important permission checks this allows in the case of
unprivileged mounts.
Further code like this belong in proc_sysctl.c next to all of the code
it is related to so that it is easier to see how to refactor the code if
necessary.
Eric
>
> Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
> ---
> include/linux/sysctl.h | 7 +++++++
> kernel/sysctl.c | 16 +++++++++++++---
> 2 files changed, 20 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
> index 33a471b56345..89c92390e6de 100644
> --- a/include/linux/sysctl.h
> +++ b/include/linux/sysctl.h
> @@ -208,6 +208,8 @@ extern void register_sysctl_init(const char *path, struct ctl_table *table,
> extern struct ctl_table_header *register_sysctl_subdir(const char *base,
> const char *subdir,
> struct ctl_table *table);
> +extern void register_sysctl_empty_subdir(const char *base, const char *subdir);
> +
> void do_sysctl_args(void);
>
> extern int pwrsw_enabled;
> @@ -231,6 +233,11 @@ inline struct ctl_table_header *register_sysctl_subdir(const char *base,
> return NULL;
> }
>
> +static inline void register_sysctl_empty_subdir(const char *base,
> + const char *subdir)
> +{
> +}
> +
> static inline struct ctl_table_header *register_sysctl_paths(
> const struct ctl_path *path, struct ctl_table *table)
> {
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index f9a35325d5d5..460532cd5ac8 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -3188,13 +3188,17 @@ struct ctl_table_header *register_sysctl_subdir(const char *base,
> { }
> };
>
> - if (!table->procname)
> + if (table != sysctl_mount_point && !table->procname)
> goto out;
>
> hdr = register_sysctl_table(base_table);
> if (unlikely(!hdr)) {
> - pr_err("failed when creating subdirectory sysctl %s/%s/%s\n",
> - base, subdir, table->procname);
> + if (table != sysctl_mount_point)
> + pr_err("failed when creating subdirectory sysctl %s/%s/%s\n",
> + base, subdir, table->procname);
> + else
> + pr_err("failed when creating empty subddirectory %s/%s\n",
> + base, subdir);
> goto out;
> }
> kmemleak_not_leak(hdr);
> @@ -3202,6 +3206,12 @@ struct ctl_table_header *register_sysctl_subdir(const char *base,
> return hdr;
> }
> EXPORT_SYMBOL_GPL(register_sysctl_subdir);
> +
> +void register_sysctl_empty_subdir(const char *base,
> + const char *subdir)
> +{
> + register_sysctl_subdir(base, subdir, sysctl_mount_point);
> +}
> #endif /* CONFIG_SYSCTL */
> /*
> * No sense putting this after each symbol definition, twice,
^ permalink raw reply
* Re: [PATCH 02/13] cdrom: use new sysctl subdir helper register_sysctl_subdir()
From: Eric W. Biederman @ 2020-05-29 12:46 UTC (permalink / raw)
To: Luis Chamberlain
Cc: jack, rafael, airlied, amir73il, clemens, dri-devel, joseph.qi,
sfr, mark, rdna, yzaikin, joonas.lahtinen, keescook, arnd,
intel-gfx, jani.nikula, julia.lawall, viro, rodrigo.vivi,
nixiaoming, vbabka, axboe, tytso, gregkh, linux-kernel, daniel,
akpm, linuxppc-dev, ocfs2-devel, jlbec
In-Reply-To: <20200529074108.16928-3-mcgrof@kernel.org>
Luis Chamberlain <mcgrof@kernel.org> writes:
> This simplifies the code considerably. The following coccinelle
With register_sysctl the code would read:
cdrom_sysctl_header = register_sysctl("dev/cdrom", cdrom_table);
Please go that direction. Thank you.
Eric
^ permalink raw reply
* Re: [PATCH 01/13] sysctl: add new register_sysctl_subdir() helper
From: Eric W. Biederman @ 2020-05-29 12:42 UTC (permalink / raw)
To: Luis Chamberlain
Cc: jack, rafael, airlied, amir73il, clemens, dri-devel, joseph.qi,
sfr, mark, rdna, yzaikin, joonas.lahtinen, keescook, arnd,
intel-gfx, jani.nikula, julia.lawall, viro, rodrigo.vivi,
nixiaoming, vbabka, axboe, tytso, gregkh, linux-kernel, daniel,
akpm, linuxppc-dev, ocfs2-devel, jlbec
In-Reply-To: <87ftbiud6s.fsf@x220.int.ebiederm.org>
ebiederm@xmission.com (Eric W. Biederman) writes:
> Luis Chamberlain <mcgrof@kernel.org> writes:
>
>> Often enough all we need to do is create a subdirectory so that
>> we can stuff sysctls underneath it. However, *if* that directory
>> was already created early on the boot sequence we really have no
>> need to use the full boiler plate code for it, we can just use
>> local variables to help us guide sysctl to place the new leaf files.
>>
>> So use a helper to do precisely this.
>
> Reset restart. This is patch is total nonsense.
>
> - You are using register_sysctl_table which as I believe I have
> mentioned is a deprecated compatibility wrapper. The point of
> spring house cleaning is to get off of the deprecated functions
> isn't it?
>
> - You are using the old nasty form for creating directories instead
> of just passing in a path.
>
> - None of this is even remotely necessary. The directories
> are created automatically if you just register their entries.
Oh. *blink* The poor naming threw me off.
This is a clumsy and poorly named version of register_sysctl();
Yes. This change is totally unnecessary.
Eric
^ permalink raw reply
* Re: [PATCH 01/13] sysctl: add new register_sysctl_subdir() helper
From: Eric W. Biederman @ 2020-05-29 12:40 UTC (permalink / raw)
To: Luis Chamberlain
Cc: jack, rafael, airlied, amir73il, clemens, dri-devel, joseph.qi,
sfr, mark, rdna, yzaikin, joonas.lahtinen, keescook, arnd,
intel-gfx, jani.nikula, julia.lawall, viro, rodrigo.vivi,
nixiaoming, vbabka, axboe, tytso, gregkh, linux-kernel, daniel,
akpm, linuxppc-dev, ocfs2-devel, jlbec
In-Reply-To: <20200529074108.16928-2-mcgrof@kernel.org>
Luis Chamberlain <mcgrof@kernel.org> writes:
> Often enough all we need to do is create a subdirectory so that
> we can stuff sysctls underneath it. However, *if* that directory
> was already created early on the boot sequence we really have no
> need to use the full boiler plate code for it, we can just use
> local variables to help us guide sysctl to place the new leaf files.
>
> So use a helper to do precisely this.
Reset restart. This is patch is total nonsense.
- You are using register_sysctl_table which as I believe I have
mentioned is a deprecated compatibility wrapper. The point of
spring house cleaning is to get off of the deprecated functions
isn't it?
- You are using the old nasty form for creating directories instead
of just passing in a path.
- None of this is even remotely necessary. The directories
are created automatically if you just register their entries.
Eric
^ permalink raw reply
* Re: [PATCH v3 3/3] arch, scripts: Add script to check relocations at compile time
From: Anup Patel @ 2020-05-29 12:08 UTC (permalink / raw)
To: Alexandre Ghiti
Cc: Albert Ou, Anup Patel, linux-kernel@vger.kernel.org List,
Atish Patra, Paul Mackerras, Zong Li, Paul Walmsley,
Palmer Dabbelt, linux-riscv, linuxppc-dev
In-Reply-To: <20200524085259.24784-4-alex@ghiti.fr>
On Sun, May 24, 2020 at 2:26 PM Alexandre Ghiti <alex@ghiti.fr> wrote:
>
> Relocating kernel at runtime is done very early in the boot process, so
> it is not convenient to check for relocations there and react in case a
> relocation was not expected.
>
> Powerpc architecture has a script that allows to check at compile time
> for such unexpected relocations: extract the common logic to scripts/
> and add arch specific scripts triggered at postlink.
>
> At the moment, powerpc and riscv architectures take advantage of this
> compile-time check.
>
> Signed-off-by: Alexandre Ghiti <alex@ghiti.fr>
> ---
> arch/powerpc/tools/relocs_check.sh | 18 ++-------------
> arch/riscv/Makefile.postlink | 36 ++++++++++++++++++++++++++++++
> arch/riscv/tools/relocs_check.sh | 26 +++++++++++++++++++++
> scripts/relocs_check.sh | 20 +++++++++++++++++
> 4 files changed, 84 insertions(+), 16 deletions(-)
> create mode 100644 arch/riscv/Makefile.postlink
> create mode 100755 arch/riscv/tools/relocs_check.sh
> create mode 100755 scripts/relocs_check.sh
Maybe you should send the change arch/powerpc/tools/relocs_check.sh
as a separate patch so that it can be picked up by arch/powerpc maintainers.
>
> diff --git a/arch/powerpc/tools/relocs_check.sh b/arch/powerpc/tools/relocs_check.sh
> index 014e00e74d2b..e367895941ae 100755
> --- a/arch/powerpc/tools/relocs_check.sh
> +++ b/arch/powerpc/tools/relocs_check.sh
> @@ -15,21 +15,8 @@ if [ $# -lt 3 ]; then
> exit 1
> fi
>
> -# Have Kbuild supply the path to objdump and nm so we handle cross compilation.
> -objdump="$1"
> -nm="$2"
> -vmlinux="$3"
> -
> -# Remove from the bad relocations those that match an undefined weak symbol
> -# which will result in an absolute relocation to 0.
> -# Weak unresolved symbols are of that form in nm output:
> -# " w _binary__btf_vmlinux_bin_end"
> -undef_weak_symbols=$($nm "$vmlinux" | awk '$1 ~ /w/ { print $2 }')
> -
> bad_relocs=$(
> -$objdump -R "$vmlinux" |
> - # Only look at relocation lines.
> - grep -E '\<R_' |
> +${srctree}/scripts/relocs_check.sh "$@" |
> # These relocations are okay
> # On PPC64:
> # R_PPC64_RELATIVE, R_PPC64_NONE
> @@ -43,8 +30,7 @@ R_PPC_ADDR16_LO
> R_PPC_ADDR16_HI
> R_PPC_ADDR16_HA
> R_PPC_RELATIVE
> -R_PPC_NONE' |
> - ([ "$undef_weak_symbols" ] && grep -F -w -v "$undef_weak_symbols" || cat)
> +R_PPC_NONE'
> )
>
> if [ -z "$bad_relocs" ]; then
> diff --git a/arch/riscv/Makefile.postlink b/arch/riscv/Makefile.postlink
> new file mode 100644
> index 000000000000..bf2b2bca1845
> --- /dev/null
> +++ b/arch/riscv/Makefile.postlink
> @@ -0,0 +1,36 @@
> +# SPDX-License-Identifier: GPL-2.0
> +# ===========================================================================
> +# Post-link riscv pass
> +# ===========================================================================
> +#
> +# Check that vmlinux relocations look sane
> +
> +PHONY := __archpost
> +__archpost:
> +
> +-include include/config/auto.conf
> +include scripts/Kbuild.include
> +
> +quiet_cmd_relocs_check = CHKREL $@
> +cmd_relocs_check = \
> + $(CONFIG_SHELL) $(srctree)/arch/riscv/tools/relocs_check.sh "$(OBJDUMP)" "$(NM)" "$@"
> +
> +# `@true` prevents complaint when there is nothing to be done
> +
> +vmlinux: FORCE
> + @true
> +ifdef CONFIG_RELOCATABLE
> + $(call if_changed,relocs_check)
> +endif
> +
> +%.ko: FORCE
> + @true
> +
> +clean:
> + @true
> +
> +PHONY += FORCE clean
> +
> +FORCE:
> +
> +.PHONY: $(PHONY)
> diff --git a/arch/riscv/tools/relocs_check.sh b/arch/riscv/tools/relocs_check.sh
> new file mode 100755
> index 000000000000..baeb2e7b2290
> --- /dev/null
> +++ b/arch/riscv/tools/relocs_check.sh
> @@ -0,0 +1,26 @@
> +#!/bin/sh
> +# SPDX-License-Identifier: GPL-2.0-or-later
> +# Based on powerpc relocs_check.sh
> +
> +# This script checks the relocations of a vmlinux for "suspicious"
> +# relocations.
> +
> +if [ $# -lt 3 ]; then
> + echo "$0 [path to objdump] [path to nm] [path to vmlinux]" 1>&2
> + exit 1
> +fi
> +
> +bad_relocs=$(
> +${srctree}/scripts/relocs_check.sh "$@" |
> + # These relocations are okay
> + # R_RISCV_RELATIVE
> + grep -F -w -v 'R_RISCV_RELATIVE'
> +)
> +
> +if [ -z "$bad_relocs" ]; then
> + exit 0
> +fi
> +
> +num_bad=$(echo "$bad_relocs" | wc -l)
> +echo "WARNING: $num_bad bad relocations"
> +echo "$bad_relocs"
> diff --git a/scripts/relocs_check.sh b/scripts/relocs_check.sh
> new file mode 100755
> index 000000000000..137c660499f3
> --- /dev/null
> +++ b/scripts/relocs_check.sh
> @@ -0,0 +1,20 @@
> +#!/bin/sh
> +# SPDX-License-Identifier: GPL-2.0-or-later
> +
> +# Get a list of all the relocations, remove from it the relocations
> +# that are known to be legitimate and return this list to arch specific
> +# script that will look for suspicious relocations.
> +
> +objdump="$1"
> +nm="$2"
> +vmlinux="$3"
> +
> +# Remove from the possible bad relocations those that match an undefined
> +# weak symbol which will result in an absolute relocation to 0.
> +# Weak unresolved symbols are of that form in nm output:
> +# " w _binary__btf_vmlinux_bin_end"
> +undef_weak_symbols=$($nm "$vmlinux" | awk '$1 ~ /w/ { print $2 }')
> +
> +$objdump -R "$vmlinux" |
> + grep -E '\<R_' |
> + ([ "$undef_weak_symbols" ] && grep -F -w -v "$undef_weak_symbols" || cat)
> --
> 2.20.1
>
Otherwise, looks good to me.
Reviewed-by: Anup Patel <anup@brainfault.org>
Regards,
Anup
^ permalink raw reply
* Re: [PATCH v3 2/3] riscv: Introduce CONFIG_RELOCATABLE
From: Anup Patel @ 2020-05-29 12:04 UTC (permalink / raw)
To: Alexandre Ghiti
Cc: Albert Ou, Anup Patel, linux-kernel@vger.kernel.org List,
Atish Patra, Paul Mackerras, Zong Li, Paul Walmsley,
Palmer Dabbelt, linux-riscv, linuxppc-dev
In-Reply-To: <20200524085259.24784-3-alex@ghiti.fr>
On Sun, May 24, 2020 at 2:25 PM Alexandre Ghiti <alex@ghiti.fr> wrote:
>
> This config allows to compile the kernel as PIE and to relocate it at
> any virtual address at runtime: this paves the way to KASLR and to 4-level
> page table folding at runtime. Runtime relocation is possible since
> relocation metadata are embedded into the kernel.
>
> Note that relocating at runtime introduces an overhead even if the
> kernel is loaded at the same address it was linked at and that the compiler
> options are those used in arm64 which uses the same RELA relocation
> format.
>
> Signed-off-by: Alexandre Ghiti <alex@ghiti.fr>
> ---
> arch/riscv/Kconfig | 12 +++++++
> arch/riscv/Makefile | 5 ++-
> arch/riscv/kernel/vmlinux.lds.S | 6 ++--
> arch/riscv/mm/Makefile | 4 +++
> arch/riscv/mm/init.c | 63 +++++++++++++++++++++++++++++++++
> 5 files changed, 87 insertions(+), 3 deletions(-)
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index a31e1a41913a..93127d5913fe 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -170,6 +170,18 @@ config PGTABLE_LEVELS
> default 3 if 64BIT
> default 2
>
> +config RELOCATABLE
> + bool
> + depends on MMU
> + help
> + This builds a kernel as a Position Independent Executable (PIE),
> + which retains all relocation metadata required to relocate the
> + kernel binary at runtime to a different virtual address than the
> + address it was linked at.
> + Since RISCV uses the RELA relocation format, this requires a
> + relocation pass at runtime even if the kernel is loaded at the
> + same address it was linked at.
> +
> source "arch/riscv/Kconfig.socs"
>
> menu "Platform type"
> diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
> index fb6e37db836d..1406416ea743 100644
> --- a/arch/riscv/Makefile
> +++ b/arch/riscv/Makefile
> @@ -9,7 +9,10 @@
> #
>
> OBJCOPYFLAGS := -O binary
> -LDFLAGS_vmlinux :=
> +ifeq ($(CONFIG_RELOCATABLE),y)
> +LDFLAGS_vmlinux := -shared -Bsymbolic -z notext -z norelro
> +KBUILD_CFLAGS += -fPIE
> +endif
> ifeq ($(CONFIG_DYNAMIC_FTRACE),y)
> LDFLAGS_vmlinux := --no-relax
> endif
> diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S
> index a9abde62909f..e8ffba8c2044 100644
> --- a/arch/riscv/kernel/vmlinux.lds.S
> +++ b/arch/riscv/kernel/vmlinux.lds.S
> @@ -85,8 +85,10 @@ SECTIONS
>
> BSS_SECTION(PAGE_SIZE, PAGE_SIZE, 0)
>
> - .rel.dyn : {
> - *(.rel.dyn*)
> + .rela.dyn : ALIGN(8) {
> + __rela_dyn_start = .;
> + *(.rela .rela*)
> + __rela_dyn_end = .;
> }
>
> _end = .;
> diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile
> index 363ef01c30b1..dc5cdaa80bc1 100644
> --- a/arch/riscv/mm/Makefile
> +++ b/arch/riscv/mm/Makefile
> @@ -1,6 +1,10 @@
> # SPDX-License-Identifier: GPL-2.0-only
>
> CFLAGS_init.o := -mcmodel=medany
> +ifdef CONFIG_RELOCATABLE
> +CFLAGS_init.o += -fno-pie
> +endif
> +
> ifdef CONFIG_FTRACE
> CFLAGS_REMOVE_init.o = -pg
> endif
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index 17f108baec4f..7074522d40c6 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -13,6 +13,9 @@
> #include <linux/of_fdt.h>
> #include <linux/libfdt.h>
> #include <linux/set_memory.h>
> +#ifdef CONFIG_RELOCATABLE
> +#include <linux/elf.h>
> +#endif
>
> #include <asm/fixmap.h>
> #include <asm/tlbflush.h>
> @@ -379,6 +382,53 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size)
> #error "setup_vm() is called from head.S before relocate so it should not use absolute addressing."
> #endif
>
> +#ifdef CONFIG_RELOCATABLE
> +extern unsigned long __rela_dyn_start, __rela_dyn_end;
> +
> +#ifdef CONFIG_64BIT
> +#define Elf_Rela Elf64_Rela
> +#define Elf_Addr Elf64_Addr
> +#else
> +#define Elf_Rela Elf32_Rela
> +#define Elf_Addr Elf32_Addr
> +#endif
> +
> +void __init relocate_kernel(uintptr_t load_pa)
> +{
> + Elf_Rela *rela = (Elf_Rela *)&__rela_dyn_start;
> + /*
> + * This holds the offset between the linked virtual address and the
> + * relocated virtual address.
> + */
> + uintptr_t reloc_offset = kernel_virt_addr - KERNEL_LINK_ADDR;
> + /*
> + * This holds the offset between kernel linked virtual address and
> + * physical address.
> + */
> + uintptr_t va_kernel_link_pa_offset = KERNEL_LINK_ADDR - load_pa;
> +
> + for ( ; rela < (Elf_Rela *)&__rela_dyn_end; rela++) {
> + Elf_Addr addr = (rela->r_offset - va_kernel_link_pa_offset);
> + Elf_Addr relocated_addr = rela->r_addend;
> +
> + if (rela->r_info != R_RISCV_RELATIVE)
> + continue;
> +
> + /*
> + * Make sure to not relocate vdso symbols like rt_sigreturn
> + * which are linked from the address 0 in vmlinux since
> + * vdso symbol addresses are actually used as an offset from
> + * mm->context.vdso in VDSO_OFFSET macro.
> + */
> + if (relocated_addr >= KERNEL_LINK_ADDR)
> + relocated_addr += reloc_offset;
> +
> + *(Elf_Addr *)addr = relocated_addr;
> + }
> +}
> +
> +#endif
> +
> static uintptr_t load_pa, load_sz;
>
> void create_kernel_page_table(pgd_t *pgdir, uintptr_t map_size)
> @@ -405,6 +455,19 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
>
> pfn_base = PFN_DOWN(load_pa);
>
> +#ifdef CONFIG_RELOCATABLE
> +#ifdef CONFIG_64BIT
> + /*
> + * Early page table uses only one PGDIR, which makes it possible
> + * to map PGDIR_SIZE aligned on PGDIR_SIZE: if the relocation offset
> + * makes the kernel cross over a PGDIR_SIZE boundary, raise a bug
> + * since a part of the kernel would not get mapped.
> + * This cannot happen on rv32 as we use the entire page directory level.
> + */
> + BUG_ON(PGDIR_SIZE - (kernel_virt_addr & (PGDIR_SIZE - 1)) < load_sz);
> +#endif
> + relocate_kernel(load_pa);
> +#endif
> /*
> * Enforce boot alignment requirements of RV32 and
> * RV64 by only allowing PMD or PGD mappings.
> --
> 2.20.1
>
>
Looks good to me as well.
Reviewed-by: Anup Patel <anup@brainfault.org>
Regards,
Anup
^ permalink raw reply
* Re: [PATCH 11/13] random: simplify sysctl declaration with register_sysctl_subdir()
From: Xiaoming Ni @ 2020-05-29 12:09 UTC (permalink / raw)
To: Greg KH, Luis Chamberlain
Cc: jack, rafael, airlied, amir73il, clemens, dri-devel, joseph.qi,
sfr, mark, rdna, yzaikin, joonas.lahtinen, keescook, arnd,
intel-gfx, jani.nikula, julia.lawall, jlbec, rodrigo.vivi, vbabka,
axboe, tytso, linux-kernel, ebiederm, daniel, akpm, linuxppc-dev,
ocfs2-devel, viro
In-Reply-To: <20200529102644.GB1345939@kroah.com>
On 2020/5/29 18:26, Greg KH wrote:
> On Fri, May 29, 2020 at 07:41:06AM +0000, Luis Chamberlain wrote:
>> From: Xiaoming Ni <nixiaoming@huawei.com>
>>
>> Move random_table sysctl from kernel/sysctl.c to drivers/char/random.c
>> and use register_sysctl_subdir() to help remove the clutter out of
>> kernel/sysctl.c.
>>
>> Signed-off-by: Xiaoming Ni <nixiaoming@huawei.com>
>> Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
>> ---
>> drivers/char/random.c | 14 ++++++++++++--
>> include/linux/sysctl.h | 1 -
>> kernel/sysctl.c | 5 -----
>> 3 files changed, 12 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/char/random.c b/drivers/char/random.c
>> index a7cf6aa65908..73fd4b6e9c18 100644
>> --- a/drivers/char/random.c
>> +++ b/drivers/char/random.c
>> @@ -2101,8 +2101,7 @@ static int proc_do_entropy(struct ctl_table *table, int write,
>> }
>>
>> static int sysctl_poolsize = INPUT_POOL_WORDS * 32;
>> -extern struct ctl_table random_table[];
>> -struct ctl_table random_table[] = {
>> +static struct ctl_table random_table[] = {
>> {
>> .procname = "poolsize",
>> .data = &sysctl_poolsize,
>> @@ -2164,6 +2163,17 @@ struct ctl_table random_table[] = {
>> #endif
>> { }
>> };
>> +
>> +/*
>> + * rand_initialize() is called before sysctl_init(),
>> + * so we cannot call register_sysctl_init() in rand_initialize()
>> + */
>> +static int __init random_sysctls_init(void)
>> +{
>> + register_sysctl_subdir("kernel", "random", random_table);
>
> No error checking?
>
> :(
It was my mistake, I forgot to add a comment here.
Same as the comment of register_sysctl_init(),
There is almost no failure here during the system initialization phase,
and failure in time does not affect the main function.
Thanks
Xiaoming Ni
^ permalink raw reply
* Re: [PATCH 01/13] sysctl: add new register_sysctl_subdir() helper
From: Luis Chamberlain @ 2020-05-29 12:16 UTC (permalink / raw)
To: Jani Nikula
Cc: jack, rafael, airlied, amir73il, clemens, dri-devel, joseph.qi,
sfr, mark, rdna, yzaikin, joonas.lahtinen, keescook, arnd,
intel-gfx, julia.lawall, jlbec, rodrigo.vivi, nixiaoming, vbabka,
axboe, tytso, gregkh, linux-kernel, ebiederm, daniel, akpm,
linuxppc-dev, ocfs2-devel, viro
In-Reply-To: <87d06n17mm.fsf@intel.com>
On Fri, May 29, 2020 at 11:13:21AM +0300, Jani Nikula wrote:
> On Fri, 29 May 2020, Luis Chamberlain <mcgrof@kernel.org> wrote:
> > Often enough all we need to do is create a subdirectory so that
> > we can stuff sysctls underneath it. However, *if* that directory
> > was already created early on the boot sequence we really have no
> > need to use the full boiler plate code for it, we can just use
> > local variables to help us guide sysctl to place the new leaf files.
>
> I find it hard to figure out the lifetime requirements for the tables
> passed in; when it's okay to use local variables and when you need
> longer lifetimes. It's not documented, everyone appears to be using
> static tables for this. It's far from obvious.
I agree 2000% that it is not obvious. What made me consider it was that
I *knew* that the base directory would already exist, so it wouldn't
make sense for the code to rely on earlier parts of a table if part
of the hierarchy already existed.
In fact, a *huge* part of the due dilligence on this and futre series
on this cleanup will be to be 100% sure that the base path is already
created. And so this use is obviously dangerous, you just *need* to
know that the base path is created before.
Non-posted changes also deal with link order to help address this
in other places, given that link order controls how *initcalls()
(early_initcall(), late_initcall(), etc) are ordered if you have
multiple of these.
I had a link order series long ago which augmented our ability to make
things clearer at a link order. Eventually I believe this will become
more important, specially as we end up wanting to async more code.
For now, we can only rely on manual code inspection for ensuring
proper ordering. Part of the implicit aspects of this cleanup is
to slowly make these things clearer for each base path.
So... the "fs" base path will actually end up being created in
fs/sysctl.c after we are *fully* done with the fs sysctl cleanups.
Luis
^ permalink raw reply
* Re: [PATCH 09/13] firmware_loader: simplify sysctl declaration with register_sysctl_subdir()
From: Xiaoming Ni @ 2020-05-29 11:59 UTC (permalink / raw)
To: Greg KH, Luis Chamberlain
Cc: jack, rafael, airlied, amir73il, clemens, dri-devel, joseph.qi,
sfr, mark, rdna, yzaikin, joonas.lahtinen, keescook, arnd,
intel-gfx, jani.nikula, julia.lawall, jlbec, rodrigo.vivi, vbabka,
axboe, tytso, linux-kernel, ebiederm, daniel, akpm, linuxppc-dev,
ocfs2-devel, viro
In-Reply-To: <20200529102613.GA1345939@kroah.com>
On 2020/5/29 18:26, Greg KH wrote:
> On Fri, May 29, 2020 at 07:41:04AM +0000, Luis Chamberlain wrote:
>> From: Xiaoming Ni <nixiaoming@huawei.com>
>>
>> Move the firmware config sysctl table to fallback_table.c and use the
>> new register_sysctl_subdir() helper. This removes the clutter from
>> kernel/sysctl.c.
>>
>> Signed-off-by: Xiaoming Ni <nixiaoming@huawei.com>
>> Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
>> ---
>> drivers/base/firmware_loader/fallback.c | 4 ++++
>> drivers/base/firmware_loader/fallback.h | 11 ++++++++++
>> drivers/base/firmware_loader/fallback_table.c | 22 +++++++++++++++++--
>> include/linux/sysctl.h | 1 -
>> kernel/sysctl.c | 7 ------
>> 5 files changed, 35 insertions(+), 10 deletions(-)
>
> So it now takes more lines than the old stuff? :(
>
CONFIG_FW_LOADER = m
Before cleaning, no matter whether ko is loaded or not, the sysctl
interface will be created, but now we need to add register and
unregister interfaces, so the number of lines of code has increased
>>
>> diff --git a/drivers/base/firmware_loader/fallback.c b/drivers/base/firmware_loader/fallback.c
>> index d9ac7296205e..8190653ae9a3 100644
>> --- a/drivers/base/firmware_loader/fallback.c
>> +++ b/drivers/base/firmware_loader/fallback.c
>> @@ -200,12 +200,16 @@ static struct class firmware_class = {
>>
>> int register_sysfs_loader(void)
>> {
>> + int ret = register_firmware_config_sysctl();
>> + if (ret != 0)
>> + return ret;
>
> checkpatch :(
This is my fault, thanks for your guidance
>
>> return class_register(&firmware_class);
>
> And if that fails?
>
Yes, it is better to call register_firmware_config_sysctl() after
class_register().
thanks for your guidance.
>> }
>>
>> void unregister_sysfs_loader(void)
>> {
>> class_unregister(&firmware_class);
>> + unregister_firmware_config_sysctl();
>> }
>>
>> static ssize_t firmware_loading_show(struct device *dev,
>> diff --git a/drivers/base/firmware_loader/fallback.h b/drivers/base/firmware_loader/fallback.h
>> index 06f4577733a8..7d2cb5f6ceb8 100644
>> --- a/drivers/base/firmware_loader/fallback.h
>> +++ b/drivers/base/firmware_loader/fallback.h
>> @@ -42,6 +42,17 @@ void fw_fallback_set_default_timeout(void);
>>
>> int register_sysfs_loader(void);
>> void unregister_sysfs_loader(void);
>> +#ifdef CONFIG_SYSCTL
>> +extern int register_firmware_config_sysctl(void);
>> +extern void unregister_firmware_config_sysctl(void);
>> +#else
>> +static inline int register_firmware_config_sysctl(void)
>> +{
>> + return 0;
>> +}
>> +static inline void unregister_firmware_config_sysctl(void) { }
>> +#endif /* CONFIG_SYSCTL */
>> +
>> #else /* CONFIG_FW_LOADER_USER_HELPER */
>> static inline int firmware_fallback_sysfs(struct firmware *fw, const char *name,
>> struct device *device,
>> diff --git a/drivers/base/firmware_loader/fallback_table.c b/drivers/base/firmware_loader/fallback_table.c
>> index 46a731dede6f..4234aa5ee5df 100644
>> --- a/drivers/base/firmware_loader/fallback_table.c
>> +++ b/drivers/base/firmware_loader/fallback_table.c
>> @@ -24,7 +24,7 @@ struct firmware_fallback_config fw_fallback_config = {
>> EXPORT_SYMBOL_NS_GPL(fw_fallback_config, FIRMWARE_LOADER_PRIVATE);
>>
>> #ifdef CONFIG_SYSCTL
>> -struct ctl_table firmware_config_table[] = {
>> +static struct ctl_table firmware_config_table[] = {
>> {
>> .procname = "force_sysfs_fallback",
>> .data = &fw_fallback_config.force_sysfs_fallback,
>> @@ -45,4 +45,22 @@ struct ctl_table firmware_config_table[] = {
>> },
>> { }
>> };
>> -#endif
>> +
>> +static struct ctl_table_header *hdr;
>> +int register_firmware_config_sysctl(void)
>> +{
>> + if (hdr)
>> + return -EEXIST;
>
> How can hdr be set?
>
It's my mistake, register_firmware_config_sysctl() is not exported,
there will be no repeated calls.
thanks for your guidance.
>> + hdr = register_sysctl_subdir("kernel", "firmware_config",
>> + firmware_config_table);
>> + if (!hdr)
>> + return -ENOMEM;
>> + return 0;
>> +}
>> +
>> +void unregister_firmware_config_sysctl(void)
>> +{
>> + if (hdr)
>> + unregister_sysctl_table(hdr);
>
> Why can't unregister_sysctl_table() take a null pointer value?
Sorry, I didn't notice that the unregister_sysctl_table() already checks
the input parameters.
thanks for your guidance.
> And what sets 'hdr' (worst name for a static variable) to NULL so that
> it knows not to be unregistered again as it looks like
> register_firmware_config_sysctl() could be called multiple times.
How about renaming hdr to firmware_config_sysct_table_header?
+ if (hdr)
+ return -EEXIST;
After deleting this code in register_firmware_config_sysctl(), and
considering register_firmware_config_sysctl() and
unregister_firmware_config_sysctl() are not exported, whether there is
no need to add "hdr = NULL;" ?
Thanks
Xiaoming Ni
^ permalink raw reply
* Re: [PATCH 09/13] firmware_loader: simplify sysctl declaration with register_sysctl_subdir()
From: Luis Chamberlain @ 2020-05-29 12:09 UTC (permalink / raw)
To: Greg KH
Cc: jack, rafael, airlied, amir73il, clemens, dri-devel, joseph.qi,
sfr, mark, rdna, yzaikin, joonas.lahtinen, keescook, arnd,
intel-gfx, jani.nikula, julia.lawall, jlbec, rodrigo.vivi,
nixiaoming, vbabka, axboe, tytso, linux-kernel, ebiederm, daniel,
akpm, linuxppc-dev, ocfs2-devel, viro
In-Reply-To: <20200529102613.GA1345939@kroah.com>
On Fri, May 29, 2020 at 12:26:13PM +0200, Greg KH wrote:
> On Fri, May 29, 2020 at 07:41:04AM +0000, Luis Chamberlain wrote:
> > From: Xiaoming Ni <nixiaoming@huawei.com>
> >
> > Move the firmware config sysctl table to fallback_table.c and use the
> > new register_sysctl_subdir() helper. This removes the clutter from
> > kernel/sysctl.c.
> >
> > Signed-off-by: Xiaoming Ni <nixiaoming@huawei.com>
> > Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
> > ---
> > drivers/base/firmware_loader/fallback.c | 4 ++++
> > drivers/base/firmware_loader/fallback.h | 11 ++++++++++
> > drivers/base/firmware_loader/fallback_table.c | 22 +++++++++++++++++--
> > include/linux/sysctl.h | 1 -
> > kernel/sysctl.c | 7 ------
> > 5 files changed, 35 insertions(+), 10 deletions(-)
>
> So it now takes more lines than the old stuff? :(
Pretty much agreed with the other changes, thanks for the review!
But this diff-stat change, indeed, it is unfortunate that we end up
with more code here than before. We'll try to reduce it instead
somehow, however in some cases during this spring-cleaning, since
the goal is to move code from one file to another, it *may* require
more code. So it won't always be negative. But we'll try!
Luis
^ permalink raw reply
* Re: [Intel-gfx] [PATCH 06/13] ocfs2: use new sysctl subdir helper register_sysctl_subdir()
From: Luis Chamberlain @ 2020-05-29 11:49 UTC (permalink / raw)
To: Kees Cook
Cc: jack, rafael, airlied, amir73il, clemens, dri-devel, joseph.qi,
sfr, mark, rdna, yzaikin, arnd, intel-gfx, julia.lawall, jlbec,
nixiaoming, vbabka, axboe, tytso, gregkh, linux-kernel, ebiederm,
akpm, linuxppc-dev, ocfs2-devel, viro
In-Reply-To: <202005290121.C78B4AC@keescook>
On Fri, May 29, 2020 at 01:23:19AM -0700, Kees Cook wrote:
> On Fri, May 29, 2020 at 07:41:01AM +0000, Luis Chamberlain wrote:
> > This simplifies the code considerably. The following coccinelle
> > SmPL grammar rule was used to transform this code.
> >
> > // pycocci sysctl-subdir.cocci fs/ocfs2/stackglue.c
> >
> > @c1@
> > expression E1;
> > identifier subdir, sysctls;
> > @@
> >
> > static struct ctl_table subdir[] = {
> > {
> > .procname = E1,
> > .maxlen = 0,
> > .mode = 0555,
> > .child = sysctls,
> > },
> > { }
> > };
> >
> > @c2@
> > identifier c1.subdir;
> >
> > expression E2;
> > identifier base;
> > @@
> >
> > static struct ctl_table base[] = {
> > {
> > .procname = E2,
> > .maxlen = 0,
> > .mode = 0555,
> > .child = subdir,
> > },
> > { }
> > };
> >
> > @c3@
> > identifier c2.base;
> > identifier header;
> > @@
> >
> > header = register_sysctl_table(base);
> >
> > @r1 depends on c1 && c2 && c3@
> > expression c1.E1;
> > identifier c1.subdir, c1.sysctls;
> > @@
> >
> > -static struct ctl_table subdir[] = {
> > - {
> > - .procname = E1,
> > - .maxlen = 0,
> > - .mode = 0555,
> > - .child = sysctls,
> > - },
> > - { }
> > -};
> >
> > @r2 depends on c1 && c2 && c3@
> > identifier c1.subdir;
> >
> > expression c2.E2;
> > identifier c2.base;
> > @@
> > -static struct ctl_table base[] = {
> > - {
> > - .procname = E2,
> > - .maxlen = 0,
> > - .mode = 0555,
> > - .child = subdir,
> > - },
> > - { }
> > -};
> >
> > @r3 depends on c1 && c2 && c3@
> > expression c1.E1;
> > identifier c1.sysctls;
> > expression c2.E2;
> > identifier c2.base;
> > identifier c3.header;
> > @@
> >
> > header =
> > -register_sysctl_table(base);
> > +register_sysctl_subdir(E2, E1, sysctls);
> >
> > Generated-by: Coccinelle SmPL
> >
> > Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
> > ---
> > fs/ocfs2/stackglue.c | 27 ++++-----------------------
> > 1 file changed, 4 insertions(+), 23 deletions(-)
> >
> > diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
> > index a191094694c6..addafced7f59 100644
> > --- a/fs/ocfs2/stackglue.c
> > +++ b/fs/ocfs2/stackglue.c
> > @@ -677,28 +677,8 @@ static struct ctl_table ocfs2_mod_table[] = {
> > },
> > { }
> > };
> > -
> > -static struct ctl_table ocfs2_kern_table[] = {
> > - {
> > - .procname = "ocfs2",
> > - .data = NULL,
> > - .maxlen = 0,
> > - .mode = 0555,
> > - .child = ocfs2_mod_table
> > - },
> > - { }
> > -};
> > -
> > -static struct ctl_table ocfs2_root_table[] = {
> > - {
> > - .procname = "fs",
> > - .data = NULL,
> > - .maxlen = 0,
> > - .mode = 0555,
> > - .child = ocfs2_kern_table
> > - },
> > - { }
> > -};
> > + .data = NULL,
> > + .data = NULL,
>
> The conversion script doesn't like the .data field assignments. ;)
>
> Was this series built with allmodconfig? I would have expected this to
> blow up very badly. :)
Yikes, sense, you're right. Nope, I left the random config tests to
0day. Will fix, thanks!
Luis
^ permalink raw reply
* Re: [PATCH] powerpc/64/syscall: Disable sanitisers for C syscall entry/exit code
From: Michael Ellerman @ 2020-05-29 11:14 UTC (permalink / raw)
To: Andrew Donnellan, Daniel Axtens, linuxppc-dev, npiggin
In-Reply-To: <2fc6270d-9c63-32ab-8d32-66875f8e5d4f@linux.ibm.com>
Andrew Donnellan <ajd@linux.ibm.com> writes:
> On 29/5/20 4:14 pm, Daniel Axtens wrote:
>> syzkaller is picking up a bunch of crashes that look like this:
>>
>> Unrecoverable exception 380 at c00000000037ed60 (msr=8000000000001031)
>> Oops: Unrecoverable exception, sig: 6 [#1]
>> LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
>> Modules linked in:
>> CPU: 0 PID: 874 Comm: syz-executor.0 Not tainted 5.7.0-rc7-syzkaller-00016-gb0c3ba31be3e #0
>> NIP: c00000000037ed60 LR: c00000000004bac8 CTR: c000000000030990
>> REGS: c0000000555a7230 TRAP: 0380 Not tainted (5.7.0-rc7-syzkaller-00016-gb0c3ba31be3e)
>> MSR: 8000000000001031 <SF,ME,IR,DR,LE> CR: 48222882 XER: 20000000
>> CFAR: c00000000004bac4 IRQMASK: 0
>> GPR00: c00000000004bb68 c0000000555a74c0 c0000000024b3500 0000000000000005
>> GPR04: 0000000000000000 0000000000000000 c00000000004bb88 c008000000910000
>> GPR08: 00000000000b0000 c00000000004bac8 0000000000016000 c000000002503500
>> GPR12: c000000000030990 c000000003190000 00000000106a5898 00000000106a0000
>> GPR16: 00000000106a5890 c000000007a92000 c000000008180e00 c000000007a8f700
>> GPR20: c000000007a904b0 0000000010110000 c00000000259d318 5deadbeef0000100
>> GPR24: 5deadbeef0000122 c000000078422700 c000000009ee88b8 c000000078422778
>> GPR28: 0000000000000001 800000000280b033 0000000000000000 c0000000555a75a0
>> NIP [c00000000037ed60] __sanitizer_cov_trace_pc+0x40/0x50
>> LR [c00000000004bac8] interrupt_exit_kernel_prepare+0x118/0x310
>> Call Trace:
>> [c0000000555a74c0] [c00000000004bb68] interrupt_exit_kernel_prepare+0x1b8/0x310 (unreliable)
>> [c0000000555a7530] [c00000000000f9a8] interrupt_return+0x118/0x1c0
>> --- interrupt: 900 at __sanitizer_cov_trace_pc+0x0/0x50
>> ...<random previous call chain>...
>>
>> That looks like the KCOV helper accessing memory that's not safe to
>> access in the interrupt handling context.
>>
>> Do not instrument the new syscall entry/exit code with KCOV, GCOV or
>> UBSAN.
>>
>> Cc: Nicholas Piggin <npiggin@gmail.com>
>> Fixes: 68b34588e202 ("powerpc/64/sycall: Implement syscall entry/exit logic in C")
>> Signed-off-by: Daniel Axtens <dja@axtens.net>
>
> This seems reasonable - I've verified that this does indeed suppress the
> kcov trace calls.
Thanks.
> Acked-by: Andrew Donnellan <ajd@linux.ibm.com>
>
> (does this need to be tagged for stable? the Fixes: commit is in 5.6 but
> we're at 5.7-rc7 at this point...)
No. The Fixed commit is based on v5.6-rc2, but it didn't go in until v5.7-rc1:
$ git describe --contains --match 'v*' 68b34588e202
v5.7-rc1~35^2~46
I plan to send it to Linus before the v5.7 release.
cheers
^ permalink raw reply
* Re: [RFC PATCH 1/2] libnvdimm: Add prctl control for disabling synchronous fault support.
From: Aneesh Kumar K.V @ 2020-05-29 10:55 UTC (permalink / raw)
To: Jan Kara
Cc: linux-nvdimm, jack, Jeff Moyer, oohall, dan.j.williams,
Michal Suchánek, linuxppc-dev
In-Reply-To: <20200529095250.GP14550@quack2.suse.cz>
On 5/29/20 3:22 PM, Jan Kara wrote:
> Hi!
>
> On Fri 29-05-20 15:07:31, Aneesh Kumar K.V wrote:
>> Thanks Michal. I also missed Jeff in this email thread.
>
> And I think you'll also need some of the sched maintainers for the prctl
> bits...
>
>> On 5/29/20 3:03 PM, Michal Suchánek wrote:
>>> Adding Jan
>>>
>>> On Fri, May 29, 2020 at 11:11:39AM +0530, Aneesh Kumar K.V wrote:
>>>> With POWER10, architecture is adding new pmem flush and sync instructions.
>>>> The kernel should prevent the usage of MAP_SYNC if applications are not using
>>>> the new instructions on newer hardware.
>>>>
>>>> This patch adds a prctl option MAP_SYNC_ENABLE that can be used to enable
>>>> the usage of MAP_SYNC. The kernel config option is added to allow the user
>>>> to control whether MAP_SYNC should be enabled by default or not.
>>>>
>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ...
>>>> diff --git a/kernel/fork.c b/kernel/fork.c
>>>> index 8c700f881d92..d5a9a363e81e 100644
>>>> --- a/kernel/fork.c
>>>> +++ b/kernel/fork.c
>>>> @@ -963,6 +963,12 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
>>>> static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
>>>> +#ifdef CONFIG_ARCH_MAP_SYNC_DISABLE
>>>> +unsigned long default_map_sync_mask = MMF_DISABLE_MAP_SYNC_MASK;
>>>> +#else
>>>> +unsigned long default_map_sync_mask = 0;
>>>> +#endif
>>>> +
>
> I'm not sure CONFIG is really the right approach here. For a distro that would
> basically mean to disable MAP_SYNC for all PPC kernels unless application
> explicitly uses the right prctl. Shouldn't we rather initialize
> default_map_sync_mask on boot based on whether the CPU we run on requires
> new flush instructions or not? Otherwise the patch looks sensible.
>
yes that is correct. We ideally want to deny MAP_SYNC only w.r.t
POWER10. But on a virtualized platform there is no easy way to detect
that. We could ideally hook this into the nvdimm driver where we look at
the new compat string ibm,persistent-memory-v2 and then disable MAP_SYNC
if we find a device with the specific value.
BTW with the recent changes I posted for the nvdimm driver, older kernel
won't initialize persistent memory device on newer hardware. Newer
hardware will present the device to OS with a different device tree
compat string.
My expectation w.r.t this patch was, Distro would want to mark
CONFIG_ARCH_MAP_SYNC_DISABLE=n based on the different application
certification. Otherwise application will have to end up calling the
prctl(MMF_DISABLE_MAP_SYNC, 0) any way. If that is the case, should this
be dependent on P10?
With that I am wondering should we even have this patch? Can we expect
userspace get updated to use new instruction?.
With ppc64 we never had a real persistent memory device available for
end user to try. The available persistent memory stack was using vPMEM
which was presented as a volatile memory region for which there is no
need to use any of the flush instructions. We could safely assume that
as we get applications certified/verified for working with pmem device
on ppc64, they would all be using the new instructions?
-aneesh
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox