* [GIT pull] printk updates for 4.15
@ 2017-11-13 9:36 Thomas Gleixner
2017-11-14 1:18 ` Linus Torvalds
0 siblings, 1 reply; 36+ messages in thread
From: Thomas Gleixner @ 2017-11-13 9:36 UTC (permalink / raw)
To: Linus Torvalds
Cc: LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton
Linus,
please pull the latest core-printk-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git core-printk-for-linus
This update adds the mechanisms to emit printk timestamps based on
different clocks:
- scheduler clock (default)
- monotonic time
- boot time
- wall clock time
This helps to correlate dmesg with user space log information, tracing,
etc. This can be local correlation or in case of wall clock time correlated
across machines, assumed that all machines are synchronized via
NTP/PTP.
Thanks,
tglx
Note: The two timekeeping commits
4c3711d7fb47 ("timekeeping: Provide NMI safe access to clock realtime")
5df32107f609 ("timekeeping: Make fast accessors return 0 before timekeeping is initialized")
are also part of the timers/core pull request (same SHAs). They have been
pulled into this branch to keep the printk commits separate.
------------------>
Prarit Bhargava (3):
timekeeping: Make fast accessors return 0 before timekeeping is initialized
printk: Add monotonic, boottime, and realtime timestamps
printk: Fix unused printk_time compile warning when !CONFIG_PRINTK
Thomas Gleixner (1):
timekeeping: Provide NMI safe access to clock realtime
Documentation/admin-guide/kernel-parameters.txt | 6 +-
include/linux/timekeeper_internal.h | 6 +-
include/linux/timekeeping.h | 1 +
kernel/printk/printk.c | 130 ++++++++++++++++++++++--
kernel/time/timekeeping.c | 60 +++++++++--
lib/Kconfig.debug | 48 ++++++++-
6 files changed, 232 insertions(+), 19 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 05496622b4ef..9a84483db403 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3211,8 +3211,10 @@
ratelimit - ratelimit the logging
Default: ratelimit
- printk.time= Show timing data prefixed to each printk message line
- Format: <bool> (1/Y/y=enable, 0/N/n=disable)
+ printk.time= Show timestamp prefixed to each printk message line
+ Format: <string>
+ (0/N/n/disable, 1/Y/y/local,
+ b/boot, m/monotonic, r/realtime (in UTC))
processor.max_cstate= [HW,ACPI]
Limit processor to maximum C-state
diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 0a0a53daf2a2..98f645ee8409 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -13,19 +13,22 @@
/**
* struct tk_read_base - base structure for timekeeping readout
* @clock: Current clocksource used for timekeeping.
- * @read: Read function of @clock
* @mask: Bitmask for two's complement subtraction of non 64bit clocks
* @cycle_last: @clock cycle value at last update
* @mult: (NTP adjusted) multiplier for scaled math conversion
* @shift: Shift value for scaled math conversion
* @xtime_nsec: Shifted (fractional) nano seconds offset for readout
* @base: ktime_t (nanoseconds) base time for readout
+ * @base_real: Nanoseconds base value for clock REALTIME readout
*
* This struct has size 56 byte on 64 bit. Together with a seqcount it
* occupies a single 64byte cache line.
*
* The struct is separate from struct timekeeper as it is also used
* for a fast NMI safe accessors.
+ *
+ * @base_real is for the fast NMI safe accessor to allow reading clock
+ * realtime from any context.
*/
struct tk_read_base {
struct clocksource *clock;
@@ -35,6 +38,7 @@ struct tk_read_base {
u32 shift;
u64 xtime_nsec;
ktime_t base;
+ u64 base_real;
};
/**
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index ddc229ff6d1e..eb98cbdbb323 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -239,6 +239,7 @@ static inline u64 ktime_get_raw_ns(void)
extern u64 ktime_get_mono_fast_ns(void);
extern u64 ktime_get_raw_fast_ns(void);
extern u64 ktime_get_boot_fast_ns(void);
+extern u64 ktime_get_real_fast_ns(void);
/*
* Timespec interfaces utilizing the ktime based ones
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 512f7c2baedd..fe94246fa073 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -576,6 +576,9 @@ static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len,
return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len);
}
+static u64 printk_get_first_ts(void);
+static u64 (*printk_get_ts)(void) = printk_get_first_ts;
+
/* insert record into the buffer, discard old ones, update heads */
static int log_store(int facility, int level,
enum log_flags flags, u64 ts_nsec,
@@ -624,7 +627,7 @@ static int log_store(int facility, int level,
if (ts_nsec > 0)
msg->ts_nsec = ts_nsec;
else
- msg->ts_nsec = local_clock();
+ msg->ts_nsec = printk_get_ts();
memset(log_dict(msg) + dict_len, 0, pad_len);
msg->len = size;
@@ -1201,14 +1204,130 @@ static inline void boot_delay_msec(int level)
}
#endif
-static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME);
-module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
+/**
+ * enum timestamp_sources - Timestamp sources for printk() messages.
+ * @PRINTK_TIME_DISABLED: No time stamp.
+ * @PRINTK_TIME_LOCAL: Local hardware clock timestamp.
+ * @PRINTK_TIME_BOOT: Boottime clock timestamp.
+ * @PRINTK_TIME_MONO: Monotonic clock timestamp.
+ * @PRINTK_TIME_REAL: Realtime clock timestamp.
+ */
+enum timestamp_sources {
+ PRINTK_TIME_DISABLED = 0,
+ PRINTK_TIME_LOCAL = 1,
+ PRINTK_TIME_BOOT = 2,
+ PRINTK_TIME_MONO = 3,
+ PRINTK_TIME_REAL = 4,
+};
+
+static const char * const timestamp_sources_str[5] = {
+ "disabled",
+ "local",
+ "boottime",
+ "monotonic",
+ "realtime",
+};
+
+static int printk_time = CONFIG_PRINTK_TIME_TYPE;
+
+static int printk_set_ts_source(enum timestamp_sources ts_source)
+{
+ int err = 0;
+
+ switch (ts_source) {
+ case PRINTK_TIME_LOCAL:
+ printk_get_ts = local_clock;
+ break;
+ case PRINTK_TIME_BOOT:
+ printk_get_ts = ktime_get_boot_fast_ns;
+ break;
+ case PRINTK_TIME_MONO:
+ printk_get_ts = ktime_get_mono_fast_ns;
+ break;
+ case PRINTK_TIME_REAL:
+ printk_get_ts = ktime_get_real_fast_ns;
+ break;
+ case PRINTK_TIME_DISABLED:
+ /*
+ * The timestamp is always stored into the log buffer.
+ * Keep the current one.
+ */
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+ if (!err)
+ printk_time = ts_source;
+ return err;
+}
+
+static u64 printk_get_first_ts(void)
+{
+ printk_set_ts_source(printk_time);
+
+ /* Fallback for invalid or disabled timestamp source */
+ if (printk_get_ts == printk_get_first_ts)
+ printk_get_ts = local_clock;
+
+ return printk_get_ts();
+}
+
+static int param_set_time(const char *val, const struct kernel_param *kp)
+{
+ char *param = strstrip((char *)val);
+ int time_source = -1;
+ int ts;
+ int err;
+
+ if (strlen(param) == 1) {
+ /* Preserve legacy boolean settings */
+ if ((param[0] == '0') || (param[0] == 'n') ||
+ (param[0] == 'N'))
+ time_source = PRINTK_TIME_DISABLED;
+ if ((param[0] == '1') || (param[0] == 'y') ||
+ (param[0] == 'Y'))
+ time_source = PRINTK_TIME_LOCAL;
+ }
+ if (time_source == -1) {
+ for (ts = 0; ts < ARRAY_SIZE(timestamp_sources_str); ts++) {
+ if (!strncmp(timestamp_sources_str[ts], param,
+ strlen(param))) {
+ time_source = ts;
+ break;
+ }
+ }
+ }
+
+ err = printk_set_ts_source(time_source);
+ if (err) {
+ pr_warn("printk: invalid timestamp option %s\n", param);
+ return err;
+ }
+
+ pr_info("printk: timestamp set to %s\n",
+ timestamp_sources_str[printk_time]);
+ return 0;
+}
+
+static int param_get_time(char *buffer, const struct kernel_param *kp)
+{
+ return scnprintf(buffer, PAGE_SIZE, "%s",
+ timestamp_sources_str[printk_time]);
+}
+
+static struct kernel_param_ops printk_time_ops = {
+ .set = param_set_time,
+ .get = param_get_time,
+};
+module_param_cb(time, &printk_time_ops, NULL, 0644);
static size_t print_time(u64 ts, char *buf)
{
unsigned long rem_nsec;
- if (!printk_time)
+ if (printk_time == PRINTK_TIME_DISABLED)
return 0;
rem_nsec = do_div(ts, 1000000000);
@@ -1631,7 +1750,7 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char *
cont.facility = facility;
cont.level = level;
cont.owner = current;
- cont.ts_nsec = local_clock();
+ cont.ts_nsec = printk_get_ts();
cont.flags = flags;
}
@@ -1860,7 +1979,6 @@ static void call_console_drivers(const char *ext_text, size_t ext_len,
static size_t msg_print_text(const struct printk_log *msg,
bool syslog, char *buf, size_t size) { return 0; }
static bool suppress_message_printing(int level) { return false; }
-
#endif /* CONFIG_PRINTK */
#ifdef CONFIG_EARLY_PRINTK
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 2cafb49aa65e..8af77006e937 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -60,8 +60,27 @@ struct tk_fast {
struct tk_read_base base[2];
};
-static struct tk_fast tk_fast_mono ____cacheline_aligned;
-static struct tk_fast tk_fast_raw ____cacheline_aligned;
+/* Suspend-time cycles value for halted fast timekeeper. */
+static u64 cycles_at_suspend;
+
+static u64 dummy_clock_read(struct clocksource *cs)
+{
+ return cycles_at_suspend;
+}
+
+static struct clocksource dummy_clock = {
+ .read = dummy_clock_read,
+};
+
+static struct tk_fast tk_fast_mono ____cacheline_aligned = {
+ .base[0] = { .clock = &dummy_clock, },
+ .base[1] = { .clock = &dummy_clock, },
+};
+
+static struct tk_fast tk_fast_raw ____cacheline_aligned = {
+ .base[0] = { .clock = &dummy_clock, },
+ .base[1] = { .clock = &dummy_clock, },
+};
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
@@ -477,17 +496,38 @@ u64 notrace ktime_get_boot_fast_ns(void)
}
EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);
-/* Suspend-time cycles value for halted fast timekeeper. */
-static u64 cycles_at_suspend;
-static u64 dummy_clock_read(struct clocksource *cs)
+/*
+ * See comment for __ktime_get_fast_ns() vs. timestamp ordering
+ */
+static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf)
{
- return cycles_at_suspend;
+ struct tk_read_base *tkr;
+ unsigned int seq;
+ u64 now;
+
+ do {
+ seq = raw_read_seqcount_latch(&tkf->seq);
+ tkr = tkf->base + (seq & 0x01);
+ now = ktime_to_ns(tkr->base_real);
+
+ now += timekeeping_delta_to_ns(tkr,
+ clocksource_delta(
+ tk_clock_read(tkr),
+ tkr->cycle_last,
+ tkr->mask));
+ } while (read_seqcount_retry(&tkf->seq, seq));
+
+ return now;
}
-static struct clocksource dummy_clock = {
- .read = dummy_clock_read,
-};
+/**
+ * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime.
+ */
+u64 ktime_get_real_fast_ns(void)
+{
+ return __ktime_get_real_fast_ns(&tk_fast_mono);
+}
/**
* halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
@@ -507,6 +547,7 @@ static void halt_fast_timekeeper(struct timekeeper *tk)
memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
cycles_at_suspend = tk_clock_read(tkr);
tkr_dummy.clock = &dummy_clock;
+ tkr_dummy.base_real = tkr->base + tk->offs_real;
update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);
tkr = &tk->tkr_raw;
@@ -654,6 +695,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
update_vsyscall(tk);
update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
+ tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real;
update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 2689b7c50c52..c6401d325b0e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -8,12 +8,58 @@ config PRINTK_TIME
messages to be added to the output of the syslog() system
call and at the console.
+choice
+ prompt "printk default clock timestamp" if PRINTK_TIME
+ default PRINTK_TIME_LOCAL if PRINTK_TIME
+ help
+ This option is selected by setting one of
+ PRINTK_TIME_[DISABLE|LOCAL|BOOT|MONO|REAL] and causes time stamps of
+ the printk() messages to be added to the output of the syslog()
+ system call and at the console.
+
The timestamp is always recorded internally, and exported
to /dev/kmsg. This flag just specifies if the timestamp should
be included, not that the timestamp is recorded.
The behavior is also controlled by the kernel command line
- parameter printk.time=1. See Documentation/admin-guide/kernel-parameters.rst
+ parameter printk.time. See
+ Documentation/admin-guide/kernel-parameters.rst
+
+config PRINTK_TIME_LOCAL
+ bool "Local Clock"
+ help
+ Selecting this option causes the time stamps of printk() to be
+ stamped with the unadjusted hardware clock.
+
+config PRINTK_TIME_BOOT
+ bool "Boot Time Clock"
+ help
+ Selecting this option causes the time stamps of printk() to be
+ stamped with the adjusted boottime clock.
+
+config PRINTK_TIME_MONO
+ bool "Monotonic Clock"
+ help
+ Selecting this option causes the time stamps of printk() to be
+ stamped with the adjusted monotonic clock.
+
+config PRINTK_TIME_REAL
+ bool "Real Time Clock"
+ help
+ Selecting this option causes the time stamps of printk() to be
+ stamped with the adjusted realtime clock (UTC).
+endchoice
+
+config PRINTK_TIME_TYPE
+ int
+ depends on PRINTK
+ range 0 4
+ default 0 if !PRINTK_TIME
+ default 1 if PRINTK_TIME_LOCAL
+ default 2 if PRINTK_TIME_BOOT
+ default 3 if PRINTK_TIME_MONO
+ default 4 if PRINTK_TIME_REAL
+
config CONSOLE_LOGLEVEL_DEFAULT
int "Default console loglevel (1-15)"
^ permalink raw reply related [flat|nested] 36+ messages in thread* Re: [GIT pull] printk updates for 4.15 2017-11-13 9:36 [GIT pull] printk updates for 4.15 Thomas Gleixner @ 2017-11-14 1:18 ` Linus Torvalds 2017-11-14 2:48 ` Linus Torvalds 2017-11-14 10:03 ` Petr Mladek 0 siblings, 2 replies; 36+ messages in thread From: Linus Torvalds @ 2017-11-14 1:18 UTC (permalink / raw) To: Thomas Gleixner Cc: LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton On Mon, Nov 13, 2017 at 1:36 AM, Thomas Gleixner <tglx@linutronix.de> wrote: > Linus, > > please pull the latest core-printk-for-linus git tree from: > > git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git core-printk-for-linus > > This update adds the mechanisms to emit printk timestamps based on > different clocks: > > - scheduler clock (default) > - monotonic time > - boot time > - wall clock time > > This helps to correlate dmesg with user space log information, tracing, > etc. This can be local correlation or in case of wall clock time correlated > across machines, assumed that all machines are synchronized via > NTP/PTP. Honestly, this just seems bogus to me, particularly since it's a single choice. The *sane* model would be to (a) continue to use the existing time that we always have (local_clock()) in the printk timestamps, and don't confuse people with the semantics of that field changing. (b) just emit a "synchronization printk" every once in a while, which is obviously also using the same standard time source, but the line actually _says_ what the other time sources are. Then it's easy to see what the printk time source is, in relation to any _number_ of other timesources. And if that synchronization printk is nicely formatted, it will even be something that people appreciate seeing in dmesg _irrespective_ of any actual synchronization issues. And something that reads the journal could trivially pick up on the synchronization printk line, and then correct the local timesource to whatever internal journal timesource it wants to. And the important thing is that because you just give *all* timesources in the synchronization line, that choice isn't fixed by some random kernel configuration or setting. Instead, this seems to have a completely broken "pick one time source model at random" approach, so now different machines will have different models, and it will likely _break_ existing code that picks the timesource from the kernel dmesg, unless you just pick the local one. That seems like bad design, and really stupid. Am I missing something? Because as-is, this just seems like a horribly bad feature to me. I'm not pulling it without some very good arguments for this all. Linus ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-14 1:18 ` Linus Torvalds @ 2017-11-14 2:48 ` Linus Torvalds 2017-11-14 10:03 ` Petr Mladek 1 sibling, 0 replies; 36+ messages in thread From: Linus Torvalds @ 2017-11-14 2:48 UTC (permalink / raw) To: Thomas Gleixner Cc: LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton On Mon, Nov 13, 2017 at 5:18 PM, Linus Torvalds <torvalds@linux-foundation.org> wrote: > > (b) just emit a "synchronization printk" every once in a while, which > is obviously also using the same standard time source, but the line > actually _says_ what the other time sources are. Side note: there's a few good obvious times to do this. After a NTP synchronization, after a resume, and maybe "every X hours if nothing else is happening". That "if nothing else is happening" would actually be a nice heartbeat thing for people who care about that. I've had machines crash overnight, and later wondered when it happened. Of course, these days other system journal sources tend to be so chatty that it doesn't much happen, but maybe it would still be appreciated in embedded places where that isn't yet the case.. And that "how often you do the time sync printk" really could be a kernel configuration thing then, but it wouldn't actually affect any existing machinery unlike the "let's just change what the printk header timestamp means". Linus ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-14 1:18 ` Linus Torvalds 2017-11-14 2:48 ` Linus Torvalds @ 2017-11-14 10:03 ` Petr Mladek 2017-11-14 13:28 ` Prarit Bhargava ` (2 more replies) 1 sibling, 3 replies; 36+ messages in thread From: Petr Mladek @ 2017-11-14 10:03 UTC (permalink / raw) To: Linus Torvalds Cc: Thomas Gleixner, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Mark Salyzyn, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt On Mon 2017-11-13 17:18:33, Linus Torvalds wrote: > On Mon, Nov 13, 2017 at 1:36 AM, Thomas Gleixner <tglx@linutronix.de> wrote: > > Linus, > > > > please pull the latest core-printk-for-linus git tree from: > > > > git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git core-printk-for-linus > > > > This update adds the mechanisms to emit printk timestamps based on > > different clocks: > > > > - scheduler clock (default) > > - monotonic time > > - boot time > > - wall clock time > > > > This helps to correlate dmesg with user space log information, tracing, > > etc. This can be local correlation or in case of wall clock time correlated > > across machines, assumed that all machines are synchronized via > > NTP/PTP. > > Honestly, this just seems bogus to me, particularly since it's a single choice. > > The *sane* model would be to > > (a) continue to use the existing time that we always have > (local_clock()) in the printk timestamps, and don't confuse people > with the semantics of that field changing. > > (b) just emit a "synchronization printk" every once in a while, which > is obviously also using the same standard time source, but the line > actually _says_ what the other time sources are. This was actually the original approach by Mark Salyzyn, see https://lkml.kernel.org/r/20170720182505.9357-1-salyzyn@android.com > Then it's easy to see what the printk time source is, in relation to > any _number_ of other timesources. And if that synchronization printk > is nicely formatted, it will even be something that people appreciate > seeing in dmesg _irrespective_ of any actual synchronization issues. > > And something that reads the journal could trivially pick up on the > synchronization printk line, and then correct the local timesource to > whatever internal journal timesource it wants to. And the important > thing is that because you just give *all* timesources in the > synchronization line, that choice isn't fixed by some random kernel > configuration or setting. One risk is that the messages might get lost. For example, they might be filtered by loglevel or during a flood of messages on slow consoles. > Instead, this seems to have a completely broken "pick one time source > model at random" approach, so now different machines will have > different models, and it will likely _break_ existing code that picks > the timesource from the kernel dmesg, unless you just pick the local > one. AFAIK, local clock is not synchronous between different machines and even CPUs on the same machine. It was used in printk() because it was lockless. Therefore it is kind of random itself. You could make some post-synchronization using that printed time stamps from other clocks. But it is not reliable (lost messages) and somehow inconvenient. I am not super happy that userspace might need update with the approach in this pull request. But it seems to be rather trivial. The timestamp (number) in the log can be converted into the date+time as following: + realtime: timestamp ~= number of micro sec. since 1.1.1970 + other clocks: timestamp ~= number of micro sec. since boot > That seems like bad design, and really stupid. > > Am I missing something? Because as-is, this just seems like a horribly > bad feature to me. I'm not pulling it without some very good arguments > for this all. I wonder if the current approach might be acceptable if we print some suffix after real-time or any non-local_clock timestamps. This would allow userspace to always handle this correctly. IMHO, it would be more reliable and convenient than the "synchronization printks". Best Regards, Petr ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-14 10:03 ` Petr Mladek @ 2017-11-14 13:28 ` Prarit Bhargava 2017-11-14 15:56 ` Mark Salyzyn 2017-11-14 17:20 ` Linus Torvalds 2 siblings, 0 replies; 36+ messages in thread From: Prarit Bhargava @ 2017-11-14 13:28 UTC (permalink / raw) To: Petr Mladek Cc: Linus Torvalds, Thomas Gleixner, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Mark Salyzyn, Sergey Senozhatsky, Steven Rostedt On 11/14/2017 05:03 AM, Petr Mladek wrote: > On Mon 2017-11-13 17:18:33, Linus Torvalds wrote: >> On Mon, Nov 13, 2017 at 1:36 AM, Thomas Gleixner <tglx@linutronix.de> wrote: >>> Linus, >>> >>> please pull the latest core-printk-for-linus git tree from: >>> >>> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git core-printk-for-linus >>> >>> This update adds the mechanisms to emit printk timestamps based on >>> different clocks: >>> >>> - scheduler clock (default) >>> - monotonic time >>> - boot time >>> - wall clock time >>> >>> This helps to correlate dmesg with user space log information, tracing, >>> etc. This can be local correlation or in case of wall clock time correlated >>> across machines, assumed that all machines are synchronized via >>> NTP/PTP. >> >> Honestly, this just seems bogus to me, particularly since it's a single choice. >> >> The *sane* model would be to >> >> (a) continue to use the existing time that we always have >> (local_clock()) in the printk timestamps, and don't confuse people >> with the semantics of that field changing. The only time anyone would notice a change is if they used real time. Otherwise the behaviour is pretty much the same -- the kernel starts booting with offset 0.00000 and starts incrementing shortly thereafter. >> >> (b) just emit a "synchronization printk" every once in a while, which >> is obviously also using the same standard time source, but the line >> actually _says_ what the other time sources are. > > This was actually the original approach by Mark Salyzyn, see > https://lkml.kernel.org/r/20170720182505.9357-1-salyzyn@android.com One message every 5 mins over 2 weeks is ~4000 messages. On systems with long up times this would fill the kmsg log. The boot log is important to determine the system topology & device configuration, etc. > > >> Then it's easy to see what the printk time source is, in relation to >> any _number_ of other timesources. And if that synchronization printk >> is nicely formatted, it will even be something that people appreciate >> seeing in dmesg _irrespective_ of any actual synchronization issues. >> >> And something that reads the journal could trivially pick up on the >> synchronization printk line, and then correct the local timesource to >> whatever internal journal timesource it wants to. And the important >> thing is that because you just give *all* timesources in the >> synchronization line, that choice isn't fixed by some random kernel >> configuration or setting. > > One risk is that the messages might get lost. For example, they might be > filtered by loglevel or during a flood of messages on slow consoles. > > >> Instead, this seems to have a completely broken "pick one time source >> model at random" approach, so now different machines will have >> different models, and it will likely _break_ existing code that picks >> the timesource from the kernel dmesg, unless you just pick the local >> one. > > AFAIK, local clock is not synchronous between different machines > and even CPUs on the same machine. It was used in printk() because > it was lockless. Therefore it is kind of random itself. > > You could make some post-synchronization using that printed > time stamps from other clocks. But it is not reliable (lost > messages) and somehow inconvenient. > > I am not super happy that userspace might need update with > the approach in this pull request. But it seems to be rather > trivial. The timestamp (number) in the log can be converted into > the date+time as following: > > + realtime: timestamp ~= number of micro sec. since 1.1.1970 > + other clocks: timestamp ~= number of micro sec. since boot Note that the current behaviour of dmesg & crash are maintained. This patch does not break existing userspace. If the kernel was built with 'CONFIG_PRINTK_TIME=y' both dmesg & crash function exactly as before. For the new timestamp (boot, real time, or wall clock) the patch to util-linux's dmesg is trivial: Do a read of sysfs to determine what type of clock is used and set the offset based on that value. crash's log mirrors what was on the console on different architectures. I've been using this patchset for debugging purposes across different architectures for a while now and haven't seen any issues with dmesg or crash. > > >> That seems like bad design, and really stupid. >> >> Am I missing something? Because as-is, this just seems like a horribly >> bad feature to me. I'm not pulling it without some very good arguments >> for this all. > Mark has an Android issue that he should speak to that effects large numbers of systems. My issue is that many systems with bad FW & HW have timeouts due to bugs in system configuration. For example, a system may stop functioning at exactly 200 hours (some events are *much* longer, some events are shorter). A time that specific isn't a random event and has something to do with buggy firmware/BIOS/BMC/HW/power. Identifying & debugging these continues to be very difficult using the local_clock() because it is not synchronized to to NTP or PTP. > I wonder if the current approach might be acceptable if we print > some suffix after real-time or any non-local_clock timestamps. > This would allow userspace to always handle this correctly. > IMHO, it would be more reliable and convenient than the > "synchronization printks". That was another suggestion Mark had made too. Another idea was to encode the timestamp type in the message structure itself ('b' for boot, for example). These approaches are significantly more difficult to code in userspace than the trivial dmesg patch previously mentioned. P. ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-14 10:03 ` Petr Mladek 2017-11-14 13:28 ` Prarit Bhargava @ 2017-11-14 15:56 ` Mark Salyzyn 2017-11-15 0:48 ` Sergey Senozhatsky 2017-11-14 17:20 ` Linus Torvalds 2 siblings, 1 reply; 36+ messages in thread From: Mark Salyzyn @ 2017-11-14 15:56 UTC (permalink / raw) To: Petr Mladek, Linus Torvalds Cc: Thomas Gleixner, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt On 11/14/2017 02:03 AM, Petr Mladek wrote: > On Mon 2017-11-13 17:18:33, Linus Torvalds wrote: >> On Mon, Nov 13, 2017 at 1:36 AM, Thomas Gleixner <tglx@linutronix.de> wrote: >> Honestly, this just seems bogus to me, particularly since it's a single choice. >> >> The *sane* model would be to >> >> (a) continue to use the existing time that we always have >> (local_clock()) in the printk timestamps, and don't confuse people >> with the semantics of that field changing. >> >> (b) just emit a "synchronization printk" every once in a while, which >> is obviously also using the same standard time source, but the line >> actually _says_ what the other time sources are. > This was actually the original approach by Mark Salyzyn, see > https://lkml.kernel.org/r/20170720182505.9357-1-salyzyn@android.com Thanks for the vote of confidence :-) but we were holding back re spinning that set until the ability to switch the primary timestamp got accepted. (we were to use U suffix for realtime, B for boottime etc to designate them). Printing the other timebases during major disruptions (suspend/resume/etc) was needed for our battery monitoring analysis tools, and we only used them as synchronization markers as a secondary effect. But we _also_ had Android partners asking if it was possible to switch the primary (dmesg) timestamp to one of boottime or realtime to aid in triaging temporal issues. Doing so in the tool was sometimes too inaccurate (~+/-20ms) even with help sniffing the periodic synchronization/disruption prints. Switching the timebase for dmesg gave us roughly 1000 times more comparative time precision which helps immensely when trying to correlate and order the sequence of events in user space and kernel activities. -- Mark ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-14 15:56 ` Mark Salyzyn @ 2017-11-15 0:48 ` Sergey Senozhatsky 0 siblings, 0 replies; 36+ messages in thread From: Sergey Senozhatsky @ 2017-11-15 0:48 UTC (permalink / raw) To: Mark Salyzyn Cc: Petr Mladek, Linus Torvalds, Thomas Gleixner, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt Hello Mark, On (11/14/17 07:56), Mark Salyzyn wrote: [..] > Thanks for the vote of confidence :-) but we were holding back re spinning > that set until the ability to switch the primary timestamp got accepted. (we > were to use U suffix for realtime, B for boottime etc to designate them). > > Printing the other timebases during major disruptions (suspend/resume/etc) > was needed for our battery monitoring analysis tools, and we only used them > as synchronization markers as a secondary effect. > > But we _also_ had Android partners asking if it was possible to switch the > primary (dmesg) timestamp to one of boottime or realtime to aid in triaging > temporal issues. Doing so in the tool was sometimes too inaccurate > (~+/-20ms) even with help sniffing the periodic synchronization/disruption > prints. Switching the timebase for dmesg gave us roughly 1000 times more > comparative time precision which helps immensely when trying to correlate > and order the sequence of events in user space and kernel activities. have you considered trace points? -ss ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-14 10:03 ` Petr Mladek 2017-11-14 13:28 ` Prarit Bhargava 2017-11-14 15:56 ` Mark Salyzyn @ 2017-11-14 17:20 ` Linus Torvalds 2017-11-14 20:21 ` Thomas Gleixner 2 siblings, 1 reply; 36+ messages in thread From: Linus Torvalds @ 2017-11-14 17:20 UTC (permalink / raw) To: Petr Mladek Cc: Thomas Gleixner, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Mark Salyzyn, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt On Tue, Nov 14, 2017 at 2:03 AM, Petr Mladek <pmladek@suse.com> wrote: > > AFAIK, local clock is not synchronous between different machines > and even CPUs on the same machine. It was used in printk() because > it was lockless. Therefore it is kind of random itself. It's not that it would be synchronous between machines, but that it would behave the same way. Lookie here, the issue is this: static ssize_t msg_print_ext_header(.. { u64 ts_usec = msg->ts_nsec; do_div(ts_usec, 1000); return scnprintf(buf, size, "%u,%llu,%llu,%c;", (msg->facility << 3) | msg->level, seq, ts_usec, msg->flags & LOG_CONT ? 'c' : '-'); } this is an _exported_ interface to user space. Is it an odd interface? Yes it is. It was "designed" by the systemd people. See commit e11fea92e13f ("kmsg: export printk records to the /dev/kmsg interface"). But that really means that you can't just change what "ts_nsec" means - it's exported as "microseconds since boot". And yes, at least systemd actually uses it. I cloned the git repository and checked. It does something like unsigned long long usec; ... r = safe_atollu(p, &usec); ... if (asprintf(&source_time, "_SOURCE_MONOTONIC_TIMESTAMP=%llu", usec) >= 0) iovec[n++] = IOVEC_MAKE_STRING(source_time); when reading /dev/kmsg, and then elsewhere it does PARSE_FIELD_VEC_ENTRY("_SOURCE_MONOTONIC_TIMESTAMP=", &monotonic, &monotonic_len), if (mode == OUTPUT_SHORT_MONOTONIC) r = output_timestamp_monotonic(f, j, monotonic); Now, the important thing to keep in mind is that the kernel is not the _only_ source of these "_SOURCE_MONOTONIC_TIMESTAMP" things. In other places, systemd generates them itself, with ts->monotonic = now(CLOCK_MONOTONIC); where "now()" ends up using clock_gettime(). End result? The kernel /dev/kmsg interface has to be at least "similar" to returning usec's comparable to CLOCK_MONOTONIC. You can't make it just do realtime. That's a whole different clock. Now, admittedly "local_clock()" (aka sched_clock()) that printk uses now is not great either, but at least it has the same basic time base as CLOCK_MONOTONIC, and isn't off by 47 years. So *that* is my worry. I don't expect clocks across machines to have the same value. But I expect them to have the same *behavior*, and not see really odd system journal behavior just because one machine ends up using wall clock time, and one ends up using a monotonic clock, and one ends up using the traditional jiffies or whatever. And honestly, if we decide to change the time format, the absolute *LAST* thing I want to see is to make this a config option. Either we do it or we don't. Not this insane "you can choose between five different choices, I leave it up to you to screw up your machine". Seriously. This whole "we can't even make a choice, so we let the user screw with it" is pure and utter garbage. If you as a kernel developer cannot make a choice., and argue strongly for _why_ that choice is the right one to export to user space, then we do not change existing behavior, since clearly you don't really have a good reason for the change. This is why I would suggest just keeping the existing clock, and then adding clock events at _significant_ points. Somebody brought up the complete red herring of "every five minutes will fiill up the system log". That's complete BS. You do it when people change the system clock (and perhaps when NTP synchronizes), or after a suspend event, or maybe regularly every few _hours_ (and make that last part configurable). Filling up the system log is simply not an issue. Linus ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-14 17:20 ` Linus Torvalds @ 2017-11-14 20:21 ` Thomas Gleixner 2017-11-14 21:07 ` Linus Torvalds 0 siblings, 1 reply; 36+ messages in thread From: Thomas Gleixner @ 2017-11-14 20:21 UTC (permalink / raw) To: Linus Torvalds Cc: Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Mark Salyzyn, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt On Tue, 14 Nov 2017, Linus Torvalds wrote: > > But that really means that you can't just change what "ts_nsec" means > - it's exported as "microseconds since boot". That's true if sched clock is stable and high resolution and does not stop during suspend. So in reality this clock has random meanings depending on the hardware properties. Really well "designed". > Now, the important thing to keep in mind is that the kernel is not the > _only_ source of these "_SOURCE_MONOTONIC_TIMESTAMP" things. > > In other places, systemd generates them itself, with > > ts->monotonic = now(CLOCK_MONOTONIC); > > where "now()" ends up using clock_gettime(). > > End result? The kernel /dev/kmsg interface has to be at least > "similar" to returning usec's comparable to CLOCK_MONOTONIC. First of all sched_clock() and CLOCK_MONOTONIC can drift apart frequency wise pretty bad. Secondly, on a machine with non stop TSC sched_clock() advances by the time the system spent in suspend, while CLOCK_MONOTONIC does not for historical reasons. CLOCK_BOOTTIME does because that accounts for the time spent in suspend, but then again using BOOTTIME for the user space time stamps does not work on systems where sched_clock() stops in suspend. So in terms of local machine correlation the only reasonable choice is CLOCK_MONOTONIC especially when existing user space already tries to correlate time via clock_gettime(CLOCK_MONOTONIC) based time stamps. This works on all systems no matter which underlying hardware is used to retrieve the timestamp. dmesg is not really a performance critical path, so the few extra cycles which this might take more than using local/sched_clock() are not an issue. > This is why I would suggest just keeping the existing clock, and then > adding clock events at _significant_ points. Somebody brought up the > complete red herring of "every five minutes will fiill up the system > log". That's complete BS. You do it when people change the system > clock (and perhaps when NTP synchronizes), or after a suspend event, > or maybe regularly every few _hours_ (and make that last part > configurable). I'm not worried about the system log spam, I'm more worried that we lose exactly that last synchronization line due to buffer wrap around. One possible solution which was discussed before is to have that extra information in every dmesg timestamp. That means storing CLOCK_MONOTONIC as the default timestamp and auxillary offsets for CLOCK_BOOTTIME and CLOCK_REALTIME. Thats simple enough to achieve as we can access that information via the NMI safe time accessors. Needs a new function, but thats trivial. The interesting question is how to print out that information. We can do it unconditionally, but that might confuse existing tools. A command line option and a sysfs knob to enable the printout of these extra bits might make this work. Thanks, tglx ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-14 20:21 ` Thomas Gleixner @ 2017-11-14 21:07 ` Linus Torvalds 2017-11-14 21:09 ` Thomas Gleixner 2017-11-14 21:16 ` Mark Salyzyn 0 siblings, 2 replies; 36+ messages in thread From: Linus Torvalds @ 2017-11-14 21:07 UTC (permalink / raw) To: Thomas Gleixner Cc: Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Mark Salyzyn, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt On Tue, Nov 14, 2017 at 12:21 PM, Thomas Gleixner <tglx@linutronix.de> wrote: > > First of all sched_clock() and CLOCK_MONOTONIC can drift apart frequency > wise pretty bad. Yes. And I would actuall ynot object to just making the printk buffers use CLOCK_MONOTONIC. The reason we avoided that was we didn't want to take the locks, though. sched_clock() is used not because it's the right clock, or particularly good, but because it is fast and doesn't have locking issues. But all of this is entirely separate from the "changing from one clock that has small issues for good reasons to FOUR DIFFERENT CLOCKS THAT THE USER CHOOSES RANDOMLY". See my objection? One is "let's improve the clock". The other is "let's do something crazy and idiotic". Linus ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-14 21:07 ` Linus Torvalds @ 2017-11-14 21:09 ` Thomas Gleixner 2017-11-14 21:16 ` Mark Salyzyn 1 sibling, 0 replies; 36+ messages in thread From: Thomas Gleixner @ 2017-11-14 21:09 UTC (permalink / raw) To: Linus Torvalds Cc: Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Mark Salyzyn, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt On Tue, 14 Nov 2017, Linus Torvalds wrote: > On Tue, Nov 14, 2017 at 12:21 PM, Thomas Gleixner <tglx@linutronix.de> wrote: > > > > First of all sched_clock() and CLOCK_MONOTONIC can drift apart frequency > > wise pretty bad. > > Yes. And I would actuall ynot object to just making the printk buffers > use CLOCK_MONOTONIC. > > The reason we avoided that was we didn't want to take the locks, though. I know, but today we can use the NMI safe accessor which avoids that. > sched_clock() is used not because it's the right clock, or > particularly good, but because it is fast and doesn't have locking > issues. > > But all of this is entirely separate from the "changing from one clock > that has small issues for good reasons to FOUR DIFFERENT CLOCKS THAT > THE USER CHOOSES RANDOMLY". > > See my objection? I understand that and I certainly was not trying to argue about that. Thanks, tglx ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-14 21:07 ` Linus Torvalds 2017-11-14 21:09 ` Thomas Gleixner @ 2017-11-14 21:16 ` Mark Salyzyn 2017-11-14 21:29 ` Linus Torvalds 1 sibling, 1 reply; 36+ messages in thread From: Mark Salyzyn @ 2017-11-14 21:16 UTC (permalink / raw) To: Linus Torvalds, Thomas Gleixner Cc: Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt On 11/14/2017 01:07 PM, Linus Torvalds wrote: > But all of this is entirely separate from the "changing from one clock > that has small issues for good reasons to FOUR DIFFERENT CLOCKS THAT > THE USER CHOOSES RANDOMLY". User is _root_ (assuming default perms for nodes), not some random user. At least on an embedded Android platform, the only root user is init(8), so it can becomes a system boot time decision only. > See my objection? > > One is "let's improve the clock". > > The other is "let's do something crazy and idiotic". If configured and enabled, let's allow an embedded device the rights to chose the clock timebase for the ultimate in synchronization to other activities on the system (correlate with user space log timebase). -- Mark ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-14 21:16 ` Mark Salyzyn @ 2017-11-14 21:29 ` Linus Torvalds 2017-11-14 22:10 ` Mark Salyzyn 0 siblings, 1 reply; 36+ messages in thread From: Linus Torvalds @ 2017-11-14 21:29 UTC (permalink / raw) To: Mark Salyzyn Cc: Thomas Gleixner, Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt On Tue, Nov 14, 2017 at 1:16 PM, Mark Salyzyn <salyzyn@android.com> wrote: > > User is _root_ (assuming default perms for nodes), not some random user. Totally irrelevant. It was clear from this discussion that the developers involved in the discussion APPARENTLY DID NOT EVEN UNDERSTAND the linkage to systemd, for example. Just look at this part of the patch: +config PRINTK_TIME_TYPE + int + depends on PRINTK which already shows that trhe developers _thought_ that this is purely about CONFIG_PRINTK_TIME. But that is the "print_time()" prefix that is printed as part of the buffer itself as [%5lu.%06lu]. That's the "freeform" string part of dmesg. People clearly never actually connected the dots, and noticed that msg_print_ext_header() thing, which is part of the logging format, and which has a different format and that is actually parsed by tools. So you guys screwed that up. Now, if the developers don't know or understand that, how do you expect some poor random MIS person who builds the kernel or sets things up to know? Basically, the whole choice is something that shouldn't exist. If Andoid wants real-time oin the system logs as part of "print_time()", then that is one thing. But doing it this way is BROKEN. Seriously. If you want wall-clock time in the print_time() header, do *THAT*. Don't screw up ts_nsec without understanding how it is used. Really. Do you really want to argue about this, when the patch clearly shows that the developers didn't understand the code? Comprende? So just admit that the patch series was done entirely the wrong way around, instead of making excuses for broken crap. And then maybe you can send a single patch that changes *ONLY* print_time(), with an explanation of how Android uses CONFIG_PRINTK_TIME and would like that to be realtime instead of the monotonic clock that we traditionally use. See? Fix the part you actually want to fix, instead of breaking the parts THAT YOU DIDN'T EVEN UNDERSTAND OR KNOW OF! Linus ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-14 21:29 ` Linus Torvalds @ 2017-11-14 22:10 ` Mark Salyzyn 2017-11-14 22:37 ` Linus Torvalds 0 siblings, 1 reply; 36+ messages in thread From: Mark Salyzyn @ 2017-11-14 22:10 UTC (permalink / raw) To: Linus Torvalds Cc: Thomas Gleixner, Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt On 11/14/2017 01:29 PM, Linus Torvalds wrote: > And then maybe you can send a single patch that changes *ONLY* > print_time(), with an explanation of how Android uses > CONFIG_PRINTK_TIME and would like that to be realtime instead of the > monotonic clock that we traditionally use. I would be pleased to be educated. print_time() uses msg->ts_nsec; and AFAIK that needs to be in reference to the timebase. You can not after the fact acquire the correct synchronization and corrections between boottime and localtime/monotonic for example, it has to be landed at point of logging, not reconstructed at time of printing? [TL;DR] We would need to land the at-time-of-logging correction factor(s) for the timebase for each logged entry for print_time() to provide a correct fixed value when dumped. eg I would also need these values handy in the msg structure: tk->xtime_sec; tk->wall_to_monotonic->tv_sec tk->wall_to_monotonic->tv_nsec tk->raw_time.tv_sec; tk->raw_time.tv_nsec tk->tkr_raw.shift tk->tkr_raw.xtime_nsec; tk->tkr_raw.mult; rk->tkr_raw.shift tk->tkr_mono.xtime_nsec tk->tkr_mono.xtime_nsec; tk->tkr_mono.mult; tk->tkr_mono.shift; tk->tkr_mono.mask; tk->tkr_mono.cycle_last; tk->offs_boot; (or live with it your way, klogd grabbing the 'relatively correct' entries ASAP, but then dmesg output will differ) Having a periodic message with a hint for correction factors helps (which we have used in a prototype dmesg tool), but drift puts us in the ms region (which I was satisfied with, but some of our partners were not) when they would have liked us (or ns) time precision. For Android, periodic correction prints at time disruptions is good enough, I am merely advocating for some of the partners needs. -- Mark ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-14 22:10 ` Mark Salyzyn @ 2017-11-14 22:37 ` Linus Torvalds 2017-11-14 22:50 ` Thomas Gleixner 0 siblings, 1 reply; 36+ messages in thread From: Linus Torvalds @ 2017-11-14 22:37 UTC (permalink / raw) To: Mark Salyzyn Cc: Thomas Gleixner, Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt On Tue, Nov 14, 2017 at 2:10 PM, Mark Salyzyn <salyzyn@android.com> wrote: > On 11/14/2017 01:29 PM, Linus Torvalds wrote: >> >> And then maybe you can send a single patch that changes *ONLY* >> print_time(), with an explanation of how Android uses >> CONFIG_PRINTK_TIME and would like that to be realtime instead of the >> monotonic clock that we traditionally use. > > print_time() uses msg->ts_nsec; and AFAIK that needs to be in reference to > the timebase. I'd suggest simply adding another time to the internal 'struct printk_log' (and probably just set it in "log_store()"). Doesn't that seem much simpler and more targeted? And exactly because it's targeted and only changes that one thing, now people that don't run Android, and have systemd that actually looks at that timestamp, can _also_ enable that wall clock output if they want to, without it impacting timestamps at all. In fact, I would expect that at that point you actually would want to have multiple different formats that don't necessarily have anything to do with the clock source, but simply with the formatting. Do you want wallclock to be shown in UTC? Do you want to show it in the system timezeone? Do you want to always show the full resolution, or do you want to show something that takes the previous log entry into account, so it only shows the date if it has actually changed, and maybe shows the time as just a delta if it is small? Because now it's clearly about the freeflowing _text_, not about the timestamp that is processed by tools. The text field done by print_time() really is _that_ different from the timestamp field that is exposed to tools. Yes, somebody might still be parsing it, and maybe then you need to set that text field to a legacy format anyway, but at least it's a whole lot easier to explain what's going on. Linus ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-14 22:37 ` Linus Torvalds @ 2017-11-14 22:50 ` Thomas Gleixner 2017-11-14 23:00 ` Joe Perches ` (3 more replies) 0 siblings, 4 replies; 36+ messages in thread From: Thomas Gleixner @ 2017-11-14 22:50 UTC (permalink / raw) To: Linus Torvalds Cc: Mark Salyzyn, Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt n Tue, 14 Nov 2017, Linus Torvalds wrote: > On Tue, Nov 14, 2017 at 2:10 PM, Mark Salyzyn <salyzyn@android.com> wrote: > > On 11/14/2017 01:29 PM, Linus Torvalds wrote: > >> > >> And then maybe you can send a single patch that changes *ONLY* > >> print_time(), with an explanation of how Android uses > >> CONFIG_PRINTK_TIME and would like that to be realtime instead of the > >> monotonic clock that we traditionally use. > > > > print_time() uses msg->ts_nsec; and AFAIK that needs to be in reference to > > the timebase. > > I'd suggest simply adding another time to the internal 'struct > printk_log' (and probably just set it in "log_store()"). > > Doesn't that seem much simpler and more targeted? > > And exactly because it's targeted and only changes that one thing, now > people that don't run Android, and have systemd that actually looks at > that timestamp, can _also_ enable that wall clock output if they want > to, without it impacting timestamps at all. > > In fact, I would expect that at that point you actually would want to > have multiple different formats that don't necessarily have anything > to do with the clock source, but simply with the formatting. > > Do you want wallclock to be shown in UTC? Do you want to show it in > the system timezeone? Do you want to always show the full resolution, > or do you want to show something that takes the previous log entry > into account, so it only shows the date if it has actually changed, > and maybe shows the time as just a delta if it is small? > > Because now it's clearly about the freeflowing _text_, not about the > timestamp that is processed by tools. > > The text field done by print_time() really is _that_ different from > the timestamp field that is exposed to tools. > > Yes, somebody might still be parsing it, and maybe then you need to > set that text field to a legacy format anyway, but at least it's a > whole lot easier to explain what's going on. Something like the untested patch below should do the storage and converts the primary timestamp from local_clock() to clock monotonic. The print out and /dev/kmsg is unchanged. That's a separate story. Thanks, tglx 8<------------------- --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -352,7 +352,11 @@ enum log_flags { }; struct printk_log { - u64 ts_nsec; /* timestamp in nanoseconds */ + /* Timestamps in nanoseconds */ + union { + u64 ts_nsec; + struct timestamps ts; + }; u16 len; /* length of entire record */ u16 text_len; /* length of text buffer */ u16 dict_len; /* length of dictionary buffer */ @@ -578,7 +582,7 @@ static u32 truncate_msg(u16 *text_len, u /* insert record into the buffer, discard old ones, update heads */ static int log_store(int facility, int level, - enum log_flags flags, u64 ts_nsec, + enum log_flags flags, struct timestamps *ts, const char *dict, u16 dict_len, const char *text, u16 text_len) { @@ -621,10 +625,10 @@ static int log_store(int facility, int l msg->facility = facility; msg->level = level & 7; msg->flags = flags & 0x1f; - if (ts_nsec > 0) - msg->ts_nsec = ts_nsec; + if (ts && ts->mono > 0) + msg->ts = *ts; else - msg->ts_nsec = local_clock(); + ktime_get_fast_timestamps(&msg->ts); memset(log_dict(msg) + dict_len, 0, pad_len); msg->len = size; @@ -687,7 +691,7 @@ static void append_char(char **pp, char static ssize_t msg_print_ext_header(char *buf, size_t size, struct printk_log *msg, u64 seq) { - u64 ts_usec = msg->ts_nsec; + u64 ts_usec = msg->ts.mono; do_div(ts_usec, 1000); @@ -1022,6 +1026,7 @@ void log_buf_vmcoreinfo_setup(void) */ VMCOREINFO_STRUCT_SIZE(printk_log); VMCOREINFO_OFFSET(printk_log, ts_nsec); + VMCOREINFO_OFFSET(printk_log, ts); VMCOREINFO_OFFSET(printk_log, len); VMCOREINFO_OFFSET(printk_log, text_len); VMCOREINFO_OFFSET(printk_log, dict_len); @@ -1204,17 +1209,18 @@ static inline void boot_delay_msec(int l static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME); module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); -static size_t print_time(u64 ts, char *buf) +static size_t print_time(const struct timestamps *ts, char *buf) { unsigned long rem_nsec; + u64 mono = ts->mono; if (!printk_time) return 0; - rem_nsec = do_div(ts, 1000000000); + rem_nsec = do_div(mono, 1000000000); if (!buf) - return snprintf(NULL, 0, "[%5lu.000000] ", (unsigned long)ts); + return snprintf(NULL, 0, "[%5lu.000000] ", (unsigned long)mono); return sprintf(buf, "[%5lu.%06lu] ", (unsigned long)ts, rem_nsec / 1000); @@ -1239,7 +1245,7 @@ static size_t print_prefix(const struct } } - len += print_time(msg->ts_nsec, buf ? buf + len : NULL); + len += print_time(&msg->ts, buf ? buf + len : NULL); return len; } @@ -1599,7 +1605,7 @@ static struct cont { char buf[LOG_LINE_MAX]; size_t len; /* length == 0 means unused buffer */ struct task_struct *owner; /* task of first print*/ - u64 ts_nsec; /* time of first print */ + struct timestamps ts; /* time of first print */ u8 level; /* log level of first message */ u8 facility; /* log facility of first message */ enum log_flags flags; /* prefix, newline flags */ @@ -1610,7 +1616,7 @@ static void cont_flush(void) if (cont.len == 0) return; - log_store(cont.facility, cont.level, cont.flags, cont.ts_nsec, + log_store(cont.facility, cont.level, cont.flags, &cont.ts, NULL, 0, cont.buf, cont.len); cont.len = 0; } @@ -1631,7 +1637,7 @@ static bool cont_add(int facility, int l cont.facility = facility; cont.level = level; cont.owner = current; - cont.ts_nsec = local_clock(); + ktime_get_fast_timestamps(&cont.ts); cont.flags = flags; } @@ -1677,7 +1683,8 @@ static size_t log_output(int facility, i } /* Store it in the record log */ - return log_store(facility, level, lflags, 0, dict, dictlen, text, text_len); + return log_store(facility, level, lflags, NULL, dict, dictlen, text, + text_len); } asmlinkage int vprintk_emit(int facility, int level, --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -146,7 +146,19 @@ extern void timekeeping_inject_sleeptime * PPS accessor */ extern void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, - struct timespec64 *ts_real); + struct timespec64 *ts_real); + +/* + * struct timestanps - Simultaneous mono/boot/real timestamps + * @mono: Monotonic timestamp + * @boot: Boottime timestamp + * @real: Realtime timestamp + */ +struct timestamps { + u64 mono; + u64 boot; + u64 real; +}; /* * struct system_time_snapshot - simultaneous raw/real time capture with @@ -206,6 +218,9 @@ extern int get_device_system_crosststamp */ extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); +/* NMI safe mono/boot/realtime timestamps */ +extern void ktime_get_fast_timestamps(struct timestamps *snap); + /* * Persistent clock related interfaces */ --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -496,29 +496,29 @@ u64 notrace ktime_get_boot_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); - /* * See comment for __ktime_get_fast_ns() vs. timestamp ordering */ -static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf) +static notrace u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) { struct tk_read_base *tkr; + u64 basem, baser, delta; unsigned int seq; - u64 now; do { seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); - now = ktime_to_ns(tkr->base_real); + basem = ktime_to_ns(tkr->base); + baser = ktime_to_ns(tkr->base_real); - now += timekeeping_delta_to_ns(tkr, - clocksource_delta( - tk_clock_read(tkr), - tkr->cycle_last, - tkr->mask)); + delta = timekeeping_delta_to_ns(tkr, + clocksource_delta(tk_clock_read(tkr), + tkr->cycle_last, tkr->mask)); } while (read_seqcount_retry(&tkf->seq, seq)); - return now; + if (mono) + *mono = basem + delta; + return baser + delta; } /** @@ -526,11 +526,25 @@ static __always_inline u64 __ktime_get_r */ u64 ktime_get_real_fast_ns(void) { - return __ktime_get_real_fast_ns(&tk_fast_mono); + return __ktime_get_real_fast(&tk_fast_mono, NULL); } EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns); /** + * ktime_get_fast_timestamps: - NMI safe timestamps + * @snap: Pointer to timestamp storage + * + * Stores clock monotonic, boottime and realtime time stamps + */ +void ktime_get_fast_timestamps(struct timestamps *snap) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + snap->real = __ktime_get_real_fast(&tk_fast_mono, &snap->mono); + snap->boot = snap->mono + ktime_to_ns(tk->offs_boot); +} + +/** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. * ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-14 22:50 ` Thomas Gleixner @ 2017-11-14 23:00 ` Joe Perches 2017-11-14 23:00 ` Linus Torvalds ` (2 subsequent siblings) 3 siblings, 0 replies; 36+ messages in thread From: Joe Perches @ 2017-11-14 23:00 UTC (permalink / raw) To: Thomas Gleixner, Linus Torvalds Cc: Mark Salyzyn, Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt On Tue, 2017-11-14 at 23:50 +0100, Thomas Gleixner wrote: > Something like the untested patch below should do the storage and converts > the primary timestamp from local_clock() to clock monotonic. trivia: > --- a/kernel/printk/printk.c > +++ b/kernel/printk/printk.c > @@ -352,7 +352,11 @@ enum log_flags { > }; > > struct printk_log { > - u64 ts_nsec; /* timestamp in nanoseconds */ > + /* Timestamps in nanoseconds */ > + union { > + u64 ts_nsec; > + struct timestamps ts; struct timestamps could probably use a better name > --- a/include/linux/timekeeping.h > +++ b/include/linux/timekeeping.h > @@ -146,7 +146,19 @@ extern void timekeeping_inject_sleeptime > * PPS accessor > */ > extern void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, > - struct timespec64 *ts_real); > + struct timespec64 *ts_real); > + > +/* > + * struct timestanps - Simultaneous mono/boot/real timestamps typo timestamps > + * @mono: Monotonic timestamp > + * @boot: Boottime timestamp > + * @real: Realtime timestamp > + */ > +struct timestamps { > + u64 mono; > + u64 boot; > + u64 real; > +}; Maybe something like struct system_event_timestamps ? ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-14 22:50 ` Thomas Gleixner 2017-11-14 23:00 ` Joe Perches @ 2017-11-14 23:00 ` Linus Torvalds 2017-11-14 23:04 ` Thomas Gleixner 2017-11-15 0:00 ` Linus Torvalds 2018-01-29 20:34 ` Mark Salyzyn 3 siblings, 1 reply; 36+ messages in thread From: Linus Torvalds @ 2017-11-14 23:00 UTC (permalink / raw) To: Thomas Gleixner Cc: Mark Salyzyn, Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt On Tue, Nov 14, 2017 at 2:50 PM, Thomas Gleixner <tglx@linutronix.de> wrote: > > Something like the untested patch below should do the storage and converts > the primary timestamp from local_clock() to clock monotonic. This may work. > - u64 ts_nsec; /* timestamp in nanoseconds */ > + /* Timestamps in nanoseconds */ > + union { > + u64 ts_nsec; > + struct timestamps ts; > + }; But I'd skip the union, and just search-and-replace the ts_nsec -> ts.mono. You did that in at least one place anyway. Because as it is, that's just too subtle and hard to read. Don't make it worse by having a magical "ts_nsec is also ts.mono, but you have to look in two different files to see that. That file is messy enough as-is. Linus ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-14 23:00 ` Linus Torvalds @ 2017-11-14 23:04 ` Thomas Gleixner 2017-11-14 23:18 ` Linus Torvalds 0 siblings, 1 reply; 36+ messages in thread From: Thomas Gleixner @ 2017-11-14 23:04 UTC (permalink / raw) To: Linus Torvalds Cc: Mark Salyzyn, Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt On Tue, 14 Nov 2017, Linus Torvalds wrote: > On Tue, Nov 14, 2017 at 2:50 PM, Thomas Gleixner <tglx@linutronix.de> wrote: > > > > Something like the untested patch below should do the storage and converts > > the primary timestamp from local_clock() to clock monotonic. > > This may work. > > > - u64 ts_nsec; /* timestamp in nanoseconds */ > > + /* Timestamps in nanoseconds */ > > + union { > > + u64 ts_nsec; > > + struct timestamps ts; > > + }; > > But I'd skip the union, and just search-and-replace the ts_nsec -> > ts.mono. You did that in at least one place anyway. > > Because as it is, that's just too subtle and hard to read. Don't make > it worse by having a magical "ts_nsec is also ts.mono, but you have to > look in two different files to see that. > > That file is messy enough as-is. True. I already replaced all of them except this one: @@ -1022,6 +1026,7 @@ void log_buf_vmcoreinfo_setup(void) */ VMCOREINFO_STRUCT_SIZE(printk_log); VMCOREINFO_OFFSET(printk_log, ts_nsec); + VMCOREINFO_OFFSET(printk_log, ts); VMCOREINFO_OFFSET(printk_log, len); VMCOREINFO_OFFSET(printk_log, text_len); VMCOREINFO_OFFSET(printk_log, dict_len); If I remove ts_nsec completely, then the VMCORE info changes and that would break existing tools because they search for ts_nsec and fail.... Thanks, tglx ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-14 23:04 ` Thomas Gleixner @ 2017-11-14 23:18 ` Linus Torvalds 2017-11-14 23:22 ` Thomas Gleixner 0 siblings, 1 reply; 36+ messages in thread From: Linus Torvalds @ 2017-11-14 23:18 UTC (permalink / raw) To: Thomas Gleixner Cc: Mark Salyzyn, Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt On Tue, Nov 14, 2017 at 3:04 PM, Thomas Gleixner <tglx@linutronix.de> wrote: > True. I already replaced all of them except this one: > > @@ -1022,6 +1026,7 @@ void log_buf_vmcoreinfo_setup(void) > */ > VMCOREINFO_STRUCT_SIZE(printk_log); > VMCOREINFO_OFFSET(printk_log, ts_nsec); > + VMCOREINFO_OFFSET(printk_log, ts); > VMCOREINFO_OFFSET(printk_log, len); > VMCOREINFO_OFFSET(printk_log, text_len); > VMCOREINFO_OFFSET(printk_log, dict_len); > > If I remove ts_nsec completely, then the VMCORE info changes and that would > break existing tools because they search for ts_nsec and fail.... Dear god, that's nasty. I agree that the vmcore info name probably needs to be retained, but maybe we could just do that with #define VMCOREINFO_FIELD_AND_OFFSET(name, field, offset) \ vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", name, field, offset) and then do #define VMCOREINFO_OFFSET(name, field) \ VMCOREINFO_FIELD_AND_OFFSET(#name, #field, \ (unsigned long)offsetof(struct name, offset)) and for that ts_nsec you can then do /* ts.mono used to be called "ts_nsec" */ VMCOREINFO_FIELD_AND_OFFSET("printk_log", "ts_nsec", (unsigned long)offsetof(struct printk_log, ts.mono)) or something. And I may obviously have messed that all up, I wrote it all in the email client, so it's more pseudo-code than anything else. But it looks pretty straight-forward conceptually, and would make it pretty clear what is going on, no? Linus ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-14 23:18 ` Linus Torvalds @ 2017-11-14 23:22 ` Thomas Gleixner 0 siblings, 0 replies; 36+ messages in thread From: Thomas Gleixner @ 2017-11-14 23:22 UTC (permalink / raw) To: Linus Torvalds Cc: Mark Salyzyn, Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt On Tue, 14 Nov 2017, Linus Torvalds wrote: > On Tue, Nov 14, 2017 at 3:04 PM, Thomas Gleixner <tglx@linutronix.de> wrote: > > True. I already replaced all of them except this one: > > > > @@ -1022,6 +1026,7 @@ void log_buf_vmcoreinfo_setup(void) > > */ > > VMCOREINFO_STRUCT_SIZE(printk_log); > > VMCOREINFO_OFFSET(printk_log, ts_nsec); > > + VMCOREINFO_OFFSET(printk_log, ts); > > VMCOREINFO_OFFSET(printk_log, len); > > VMCOREINFO_OFFSET(printk_log, text_len); > > VMCOREINFO_OFFSET(printk_log, dict_len); > > > > If I remove ts_nsec completely, then the VMCORE info changes and that would > > break existing tools because they search for ts_nsec and fail.... > > Dear god, that's nasty. > > I agree that the vmcore info name probably needs to be retained, but > maybe we could just do that with > > #define VMCOREINFO_FIELD_AND_OFFSET(name, field, offset) \ > vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", name, field, offset) > > and then do > > #define VMCOREINFO_OFFSET(name, field) \ > VMCOREINFO_FIELD_AND_OFFSET(#name, #field, \ > (unsigned long)offsetof(struct name, offset)) > > and for that ts_nsec you can then do > > /* ts.mono used to be called "ts_nsec" */ > VMCOREINFO_FIELD_AND_OFFSET("printk_log", "ts_nsec", > (unsigned long)offsetof(struct printk_log, ts.mono)) > > or something. > > And I may obviously have messed that all up, I wrote it all in the > email client, so it's more pseudo-code than anything else. > > But it looks pretty straight-forward conceptually, and would make it > pretty clear what is going on, no? That should work. I'll try that tomorrow with brain awake. Thanks, tglx ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-14 22:50 ` Thomas Gleixner 2017-11-14 23:00 ` Joe Perches 2017-11-14 23:00 ` Linus Torvalds @ 2017-11-15 0:00 ` Linus Torvalds 2017-11-15 8:04 ` Ingo Molnar 2017-11-15 16:26 ` Mark Salyzyn 2018-01-29 20:34 ` Mark Salyzyn 3 siblings, 2 replies; 36+ messages in thread From: Linus Torvalds @ 2017-11-15 0:00 UTC (permalink / raw) To: Thomas Gleixner Cc: Mark Salyzyn, Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt On Tue, Nov 14, 2017 at 2:50 PM, Thomas Gleixner <tglx@linutronix.de> wrote: > +/* > + * struct timestanps - Simultaneous mono/boot/real timestamps > + * @mono: Monotonic timestamp > + * @boot: Boottime timestamp > + * @real: Realtime timestamp > + */ Side note: does anybody really wanr/need the boottime thing? I can definitely understand why people want a monotonic clock (since ordering is meaningful). And at the same time, it's pretty obvious that wall clock is meaningful. Who really wants that boot time thing when you have those two? I get the feeling hat nobody really wanted it, and it was just added for completeness. I don't think 'struct printk_log' is _that_ size sensitive, but it does seem to be a bad idea to add 8 bytes without having a good reason for it. The other times seem to have good reasons, not so much the boot one. Linus ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-15 0:00 ` Linus Torvalds @ 2017-11-15 8:04 ` Ingo Molnar 2017-11-15 16:26 ` Mark Salyzyn 1 sibling, 0 replies; 36+ messages in thread From: Ingo Molnar @ 2017-11-15 8:04 UTC (permalink / raw) To: Linus Torvalds Cc: Thomas Gleixner, Mark Salyzyn, Petr Mladek, LKML, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt * Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Tue, Nov 14, 2017 at 2:50 PM, Thomas Gleixner <tglx@linutronix.de> wrote: > > +/* > > + * struct timestanps - Simultaneous mono/boot/real timestamps > > + * @mono: Monotonic timestamp > > + * @boot: Boottime timestamp > > + * @real: Realtime timestamp > > + */ > > Side note: does anybody really wanr/need the boottime thing? > > I can definitely understand why people want a monotonic clock (since > ordering is meaningful). And at the same time, it's pretty obvious > that wall clock is meaningful. > > Who really wants that boot time thing when you have those two? I get > the feeling hat nobody really wanted it, and it was just added for > completeness. IIRC there were some boot time optimization tools that used the boot-time relative timestamps to visualize and compare things - but I don't know whether they used the text-form timestamps or the binary log timestamp. (I suspect the former.) > I don't think 'struct printk_log' is _that_ size sensitive, but it > does seem to be a bad idea to add 8 bytes without having a good reason > for it. The other times seem to have good reasons, not so much the > boot one. So I think when it comes to 'timing' and post-mortem log analysis, then redundancy of time sources is usually good. "When did this occur after bootup" is inherent in boot timestamps - and is usable even if the 'when did the system boot up' info is lost (got log-rotated away, or cannot be trusted, etc.). It's admittedly a bit of a handwaving argument, because the timestamps _are_ redundant and can be completely eliminated on a properly run system with proper timekeeping and all log entries available, but in general this kind of redundancy does not feel actively bad to me. (In fact I'd argue that having such redundancy is one of the (few) advantages of binary log formats, because tools don't mind the visual clutter, so we might as well make use of it.) Thanks, Ingo ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-15 0:00 ` Linus Torvalds 2017-11-15 8:04 ` Ingo Molnar @ 2017-11-15 16:26 ` Mark Salyzyn 2017-11-15 17:42 ` Linus Torvalds 1 sibling, 1 reply; 36+ messages in thread From: Mark Salyzyn @ 2017-11-15 16:26 UTC (permalink / raw) To: Linus Torvalds, Thomas Gleixner Cc: Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt On 11/14/2017 04:00 PM, Linus Torvalds wrote: > On Tue, Nov 14, 2017 at 2:50 PM, Thomas Gleixner <tglx@linutronix.de> wrote: >> +/* >> + * struct timestanps - Simultaneous mono/boot/real timestamps >> + * @mono: Monotonic timestamp >> + * @boot: Boottime timestamp >> + * @real: Realtime timestamp >> + */ > Side note: does anybody really wanr/need the boottime thing? "Android partners have asked for it". I have said no... Maybe for completeness, but having it may straighten out a synchronization mess. [TL;DR] "If you have three watches, you will never know what time it is"(tm) Android peripherals with embedded firmware (camera, sensor hub, etc) all are run synchronized to boottime. If the firmware logs any data and it is propagated, we run into issues trying to triage firmware, kernel and user space temporal activities as each is on a different timebase. Android's logger cluster (logd) today can only be switched between CLOCK_REALTIME or CLOCK_MONOTONIC for its timebase. It tries to track the pair to merge content from all the sources. use is mixed: CLOCK_REALTIME is selected on phones/tablets, CLOCK_MONOTONIC is selected on watches. CLOCK_BOOTIME to make a trio of synchronized time sources is being requested. We are blocked on introducing CLOCK_BOOTTIME as an option because it is difficult to discern between logging utilizing CLOCK_BOOTTIME or CLOCK_MONOTONIC based on incoming value alone (heuristics) so the infrastructure will change. Logging (in user space) the trio of timestamps is too costly (2.5us, currently CLOCK_BOOTTIME is not supplied by vdso, so is a syscall) and some sources will never report the trio. (I have patches that fix CLOCK_BOOTTIME in vdso being considered in the vdso-unification effort) CLOCK_BOOTTIME and CLOCK_MONOTONIC represent an issue for continuity through a reboot or kernel crash. Stitching them together sometimes generates maniacal laughter. CLOCK_REALTIME is not set until a LTE connection is made, so we are effectively seeing CLOCK_BOOTTIME (or worse, some random offset) logging until later in the boot cycle. Currently kernel logs and atrace (kernel ftrace + a user merged component) are CLOCK_MONOTONIC. I do not know if we will ever unify the timebases. Or select one (CLOCK_BOOTTIME has been suggested as the best choice). kernel supporting it, or the trio, in logd.auditd and logd.klogd threads would be a step towards a better and synchronized logging world though. -- Mark ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-15 16:26 ` Mark Salyzyn @ 2017-11-15 17:42 ` Linus Torvalds 2017-11-16 0:37 ` Thomas Gleixner 0 siblings, 1 reply; 36+ messages in thread From: Linus Torvalds @ 2017-11-15 17:42 UTC (permalink / raw) To: Mark Salyzyn Cc: Thomas Gleixner, Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt On Wed, Nov 15, 2017 at 8:26 AM, Mark Salyzyn <salyzyn@android.com> wrote: > > Maybe for completeness, but having it may straighten out a synchronization > mess. > > [TL;DR] > > "If you have three watches, you will never know what time it is"(tm) > > Android peripherals with embedded firmware (camera, sensor hub, etc) all are > run synchronized to boottime. If the firmware logs any data and it is > propagated, we run into issues trying to triage firmware, kernel and user > space temporal activities as each is on a different timebase. So I agree with all of this, but I wonder what actuall yuses that BOOTTIME to make it worth even synchronizing with. I'm assuming it's some evdev user. But I'm wondering if perhaps we could just simplify our own lives and make CLOCK_BOOTTIME and CLOCK_MONOTONIC just be the same. And we'd make them be the same by making CLOCK_MONOTONIC act like CLOCK_BOOTTIME. I doubt anybody would notice. At least we could _try_ that kind of system clock simplification. Maybe people would scream bloody murder and we'd have to revert, but wouldn't it be lovely to simplify the synchronization problem by just making it go away (well, at least for the BOOTTIME/MONOTONIC case). So I'd prefer that kind of "could we at least first try to improve on the situation" over adding more complexity due to a historical accident that people at the time didn't think would be so confusing.. I suspect a lot of people use CLOCK_MONOTONIC because it's (a) been around forever (b) is quicker than realtime (c) is monotonic (duh!) so you can sort things but these users wouldn't _mind_ getting the BOOTTIME behavior. So making CLOCK_MONOTONIC just do the BOOTTIME thing likely wouldn't hurt. They don't care about suspend. In contrast, the people who now use BOOTTIME, probably use it exactly _because_ they care about suspend, and may still want the monotonicity, but they'd like something that still approaches wall clock time at least in the _delta_. So I'm pushing back on complexity here a bit. We *might* be able to really just simplify things. Linus ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-15 17:42 ` Linus Torvalds @ 2017-11-16 0:37 ` Thomas Gleixner 2017-11-16 1:23 ` John Stultz 2017-11-16 1:32 ` Linus Torvalds 0 siblings, 2 replies; 36+ messages in thread From: Thomas Gleixner @ 2017-11-16 0:37 UTC (permalink / raw) To: Linus Torvalds Cc: Mark Salyzyn, Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt, John Stultz On Wed, 15 Nov 2017, Linus Torvalds wrote: > > So I agree with all of this, but I wonder what actuall yuses that > BOOTTIME to make it worth even synchronizing with. > > I'm assuming it's some evdev user. > > But I'm wondering if perhaps we could just simplify our own lives and > make CLOCK_BOOTTIME and CLOCK_MONOTONIC just be the same. > > And we'd make them be the same by making CLOCK_MONOTONIC act like > CLOCK_BOOTTIME. > > I doubt anybody would notice. > > At least we could _try_ that kind of system clock simplification. > Maybe people would scream bloody murder and we'd have to revert, but > wouldn't it be lovely to simplify the synchronization problem by just > making it go away (well, at least for the BOOTTIME/MONOTONIC case). Yes it would be lovely. I have some vague memories about having MONOTONIC behave the same way as BOOTTIME in the early days of the generic timekeeping infrastructure, high resolution timers and idle NOHZ work, 10+ years ago. This broke stuff because the historic behaviour was to not advance on resume and the change caused massive timer expiries right after resume which confused the hell out of things, because timers fired immediately which were not expected to fire as they were implicitely relying on suspend not affecting clock monotonic. So we reverted back to the old behaviour. Soon after that, people complained about not being able to arm timers which should expire after a resume or get access to the time spent there, but they could not use REALTIME due to time jumps caused by DST and whatever. So we introduced BOOTTIME. In hindsight it might have been better not to do that, but now we have to deal with it. I'm a bit worried to change that because the behaviour difference of MONOTONIC and BOOTTIME vs. suspend/resume is well documented all over the place and there are explicit choices made in applications and libraries which one of them to use for a particular problem. So I expect that some of the surprises we've seen 10+ years ago still persist. I'm also quite sure that there is kernel code which relies implicitely on that behaviour. We surely can fix that, but it might be tedious to debug. John? Thanks, tglx ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-16 0:37 ` Thomas Gleixner @ 2017-11-16 1:23 ` John Stultz 2017-11-16 1:32 ` Linus Torvalds 1 sibling, 0 replies; 36+ messages in thread From: John Stultz @ 2017-11-16 1:23 UTC (permalink / raw) To: Thomas Gleixner Cc: Linus Torvalds, Mark Salyzyn, Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt On Wed, Nov 15, 2017 at 4:37 PM, Thomas Gleixner <tglx@linutronix.de> wrote: > On Wed, 15 Nov 2017, Linus Torvalds wrote: >> >> So I agree with all of this, but I wonder what actuall yuses that >> BOOTTIME to make it worth even synchronizing with. >> >> I'm assuming it's some evdev user. >> >> But I'm wondering if perhaps we could just simplify our own lives and >> make CLOCK_BOOTTIME and CLOCK_MONOTONIC just be the same. >> >> And we'd make them be the same by making CLOCK_MONOTONIC act like >> CLOCK_BOOTTIME. >> >> I doubt anybody would notice. >> >> At least we could _try_ that kind of system clock simplification. >> Maybe people would scream bloody murder and we'd have to revert, but >> wouldn't it be lovely to simplify the synchronization problem by just >> making it go away (well, at least for the BOOTTIME/MONOTONIC case). > > Yes it would be lovely. I have some vague memories about having MONOTONIC > behave the same way as BOOTTIME in the early days of the generic > timekeeping infrastructure, high resolution timers and idle NOHZ work, 10+ > years ago. > > This broke stuff because the historic behaviour was to not advance on > resume and the change caused massive timer expiries right after resume > which confused the hell out of things, because timers fired immediately > which were not expected to fire as they were implicitely relying on suspend > not affecting clock monotonic. > > So we reverted back to the old behaviour. > > Soon after that, people complained about not being able to arm timers which > should expire after a resume or get access to the time spent there, but > they could not use REALTIME due to time jumps caused by DST and > whatever. So we introduced BOOTTIME. > > In hindsight it might have been better not to do that, but now we have to > deal with it. > > I'm a bit worried to change that because the behaviour difference of > MONOTONIC and BOOTTIME vs. suspend/resume is well documented all over the > place and there are explicit choices made in applications and libraries > which one of them to use for a particular problem. So I expect that some of > the surprises we've seen 10+ years ago still persist. > > I'm also quite sure that there is kernel code which relies implicitely on > that behaviour. We surely can fix that, but it might be tedious to debug. > > John? Yea, I don't think we could get away with replacing CLOCK_MONOTONIC with CLOCK_BOOTTIME at this point. I think in retrospect, for userspace it probably would have been the right decision when we were initially sorting how CLOCK_MONOTONIC hrtimers and suspend would work together, but even then, we would still need something like the current CLOCK_MONOTONIC internally for the kernel to avoid spinning firing a million recurring timers on resume after a long suspend. Then having a non 1:1 mapping from the kernel's internal sense of MONOTONIC and userland's sense would have add more complexity. Even if years ago we had defined CLOCK_MONOTONIC to work like CLOCK_BOOTTIME for userspace, I suspect we'd end up having apps wanting something like CLOCK_RUNTIME to get similar non-suspend accounting behavior. :) thanks -john ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-16 0:37 ` Thomas Gleixner 2017-11-16 1:23 ` John Stultz @ 2017-11-16 1:32 ` Linus Torvalds 2017-11-16 7:12 ` Thomas Gleixner 1 sibling, 1 reply; 36+ messages in thread From: Linus Torvalds @ 2017-11-16 1:32 UTC (permalink / raw) To: Thomas Gleixner Cc: Mark Salyzyn, Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt, John Stultz On Wed, Nov 15, 2017 at 4:37 PM, Thomas Gleixner <tglx@linutronix.de> wrote: > > This broke stuff because the historic behaviour was to not advance on > resume and the change caused massive timer expiries right after resume > which confused the hell out of things, because timers fired immediately > which were not expected to fire as they were implicitely relying on suspend > not affecting clock monotonic. Ahh, now that you mention it, I remember that. Things coming to a complete halt because every single X event timer and network timer all firing at resume. If the timer wheel thing doesn't happen any more, I'd still be willing to try it out, but you're right, it might be much worse than I thought. Linus ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-16 1:32 ` Linus Torvalds @ 2017-11-16 7:12 ` Thomas Gleixner 2017-11-18 0:26 ` Thomas Gleixner 0 siblings, 1 reply; 36+ messages in thread From: Thomas Gleixner @ 2017-11-16 7:12 UTC (permalink / raw) To: Linus Torvalds Cc: Mark Salyzyn, Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt, John Stultz On Wed, 15 Nov 2017, Linus Torvalds wrote: > On Wed, Nov 15, 2017 at 4:37 PM, Thomas Gleixner <tglx@linutronix.de> wrote: > > > > This broke stuff because the historic behaviour was to not advance on > > resume and the change caused massive timer expiries right after resume > > which confused the hell out of things, because timers fired immediately > > which were not expected to fire as they were implicitely relying on suspend > > not affecting clock monotonic. > > Ahh, now that you mention it, I remember that. Things coming to a > complete halt because every single X event timer and network timer all > firing at resume. > > If the timer wheel thing doesn't happen any more, I'd still be willing > to try it out, but you're right, it might be much worse than I > thought. The timer wheel part is easy to cure, we actually tried that back then by advancing clock monotonic, but not jiffies. That made at least most of the kernel side issues go away, but the user space wreckage persisted and we saw now way to fix that w/o violating the no user space visible regression rule. It might be better now, so yes might can try and see what happens. The android folks surely an run a quick coverage test on their side, but we'd need to get some serious feedback from desktop and enterprise people as well. The first thing we need to look at before even going there is stuff like NTP which needs the coupling of MONOTONIC and MONOTONIC_RAW preserved. I need to make sure that this does not break in the first place. I hope that I can find a few spare cycles to whip up a POC patch which does not make stuff fall apart immediately. Thanks, tglx ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-16 7:12 ` Thomas Gleixner @ 2017-11-18 0:26 ` Thomas Gleixner 2017-11-18 0:44 ` Linus Torvalds 2017-11-20 6:20 ` Kevin Easton 0 siblings, 2 replies; 36+ messages in thread From: Thomas Gleixner @ 2017-11-18 0:26 UTC (permalink / raw) To: Linus Torvalds Cc: Mark Salyzyn, Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt, John Stultz On Thu, 16 Nov 2017, Thomas Gleixner wrote: > I hope that I can find a few spare cycles to whip up a POC patch which does > not make stuff fall apart immediately. Here you go. It survived suspend resume in a VM. Thanks, tglx 8<------------------------ Subject: timekeeping: Make monotonic behave like boottime From: Thomas Gleixner <tglx@linutronix.de> Date: Fri, 17 Nov 2017 11:46:48 +0100 Clock MONOTONIC is not fast forwarded by the time spent in suspend on resume. This is only done for BOOTTIME. It would be desired to get rid of that difference, but the difference between clock MONOTONIC and clock BOOTTIME is well documented so there might be applications which depend on that behaviour. Implement it for testing purposes. This requires to prevent that jiffies advance, which are coupled at clock MONOTONIC as well in the current implementation. This is achieved by forwarding the variables which are used for the jiffies update after resume before the tick is restarted, In timekeeping resume, the change is rather simple. Instead of updating the offset between clock MONOTONIC and clock REALTIME/BOOTTIME, advance the time keeper base for the MONOTONIC and the MONOTONIC_RAW clock by the time spent in suspend. So MONOTONIC is now the same as BOOTTIME and the offset between clock REALTIME and clock MONOTONIC is the same as before suspend. WARNING: Use this with caution. Following prerequisites are required before booting into a kernel with this patch applied: - Backup of important data like the nude pictures you did not send to facebook yet - Functional fire estinguisher - Vaccum cleaner to deal with the fallout You've been warned and you're responsible for any damage which might result. NOT-Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- kernel/time/tick-common.c | 15 +++++++++++++++ kernel/time/tick-internal.h | 6 ++++++ kernel/time/tick-sched.c | 9 +++++++++ kernel/time/timekeeping.c | 9 +++++++-- 4 files changed, 37 insertions(+), 2 deletions(-) --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -419,6 +419,19 @@ void tick_suspend_local(void) clockevents_shutdown(td->evtdev); } +static void tick_forward_next_period(void) +{ + ktime_t delta, now = ktime_get(); + u64 n; + + delta = ktime_sub(now, tick_next_period); + n = ktime_divns(delta, tick_period); + tick_next_period += n * tick_period; + if (tick_next_period < now) + tick_next_period += tick_period; + tick_sched_forward_next_period(); +} + /** * tick_resume_local - Resume the local tick device * @@ -431,6 +444,8 @@ void tick_resume_local(void) struct tick_device *td = this_cpu_ptr(&tick_cpu_device); bool broadcast = tick_resume_check_broadcast(); + tick_forward_next_period(); + clockevents_tick_resume(td->evtdev); if (!broadcast) { if (td->mode == TICKDEV_MODE_PERIODIC) --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -141,6 +141,12 @@ static inline void tick_check_oneshot_br static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } #endif /* !(BROADCAST && ONESHOT) */ +#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) +extern void tick_sched_forward_next_period(void); +#else +static inline void tick_sched_forward_next_period(void) { } +#endif + /* NO_HZ_FULL internal */ #ifdef CONFIG_NO_HZ_FULL extern void tick_nohz_init(void); --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -52,6 +52,15 @@ struct tick_sched *tick_get_tick_sched(i static ktime_t last_jiffies_update; /* + * Called after resume. Make sure that jiffies are not fast forwarded due to + * clock monotonic being forwarded by the suspended time. + */ +void tick_sched_forward_next_period(void) +{ + last_jiffies_update = tick_next_period; +} + +/* * Must be called with interrupts disabled ! */ static void tick_do_update_jiffies64(ktime_t now) --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -138,7 +138,13 @@ static void tk_set_wall_to_mono(struct t static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { - tk->offs_boot = ktime_add(tk->offs_boot, delta); + /* + * Update both bases so mono and raw stay coupled. Not strictly the + * same as the old code which did not adjust clock monotonic after + * resume, so dragons might be lurking somewhere. + */ + tk->tkr_mono.base += delta; + tk->tkr_raw.base += delta; } /* @@ -1624,7 +1630,6 @@ static void __timekeeping_inject_sleepti return; } tk_xtime_add(tk, delta); - tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta)); tk_update_sleep_time(tk, timespec64_to_ktime(*delta)); tk_debug_account_sleep_time(delta); } ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-18 0:26 ` Thomas Gleixner @ 2017-11-18 0:44 ` Linus Torvalds 2017-11-18 1:00 ` Thomas Gleixner 2017-11-20 6:20 ` Kevin Easton 1 sibling, 1 reply; 36+ messages in thread From: Linus Torvalds @ 2017-11-18 0:44 UTC (permalink / raw) To: Thomas Gleixner Cc: Mark Salyzyn, Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt, John Stultz On Fri, Nov 17, 2017 at 4:26 PM, Thomas Gleixner <tglx@linutronix.de> wrote: > > Here you go. It survived suspend resume in a VM. That looks almost too small. If this were to allow us to basically unify BOOTTIME/MONOTONIC, that would be lovely. Should we just try it, with the understanding that we'll quite possibly have to just revert? Assuming this works, what would the "phase two" patch that actually unifies the two (as opposed to make them have the same value) look like? Is that just basically just removing the dozen or so "offst_boot" uses, or are there other possible simplifications? Linus ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-18 0:44 ` Linus Torvalds @ 2017-11-18 1:00 ` Thomas Gleixner 0 siblings, 0 replies; 36+ messages in thread From: Thomas Gleixner @ 2017-11-18 1:00 UTC (permalink / raw) To: Linus Torvalds Cc: Mark Salyzyn, Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt, John Stultz On Fri, 17 Nov 2017, Linus Torvalds wrote: > On Fri, Nov 17, 2017 at 4:26 PM, Thomas Gleixner <tglx@linutronix.de> wrote: > > > > Here you go. It survived suspend resume in a VM. > > That looks almost too small. > > If this were to allow us to basically unify BOOTTIME/MONOTONIC, that > would be lovely. I surely missed something. John should figure that out faster than me. > Should we just try it, with the understanding that we'll quite > possibly have to just revert? You've seen the warning in the changelog :) > Assuming this works, what would the "phase two" patch that actually > unifies the two (as opposed to make them have the same value) look > like? > > Is that just basically just removing the dozen or so "offst_boot" > uses, or are there other possible simplifications? Pretty much. We can't get rid of CLOCK_BOOTTIME and the handling of it, though we can remove one hrtimer clock base by mapping boot time to monotonic. That's about it. Thanks, tglx ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-18 0:26 ` Thomas Gleixner 2017-11-18 0:44 ` Linus Torvalds @ 2017-11-20 6:20 ` Kevin Easton 2017-11-20 6:36 ` Linus Torvalds 1 sibling, 1 reply; 36+ messages in thread From: Kevin Easton @ 2017-11-20 6:20 UTC (permalink / raw) To: Thomas Gleixner Cc: Linus Torvalds, Mark Salyzyn, Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt, John Stultz On Sat, Nov 18, 2017 at 01:26:07AM +0100, Thomas Gleixner wrote: > On Thu, 16 Nov 2017, Thomas Gleixner wrote: > > I hope that I can find a few spare cycles to whip up a POC patch which does > > not make stuff fall apart immediately. > > Here you go. It survived suspend resume in a VM. > > Thanks, > > tglx > > 8<------------------------ > Subject: timekeeping: Make monotonic behave like boottime > From: Thomas Gleixner <tglx@linutronix.de> > Date: Fri, 17 Nov 2017 11:46:48 +0100 > > Clock MONOTONIC is not fast forwarded by the time spent in suspend on > resume. This is only done for BOOTTIME. > > It would be desired to get rid of that difference, but the difference > between clock MONOTONIC and clock BOOTTIME is well documented so there > might be applications which depend on that behaviour. As a comment from the userspace peanut gallery, I personally hope this does pass muster. The existing POSIX wording implies that MONOTONIC doesn't stop counting over suspend, and that's what you need when you want to know the time elapsed between two external events. - Kevin > ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-20 6:20 ` Kevin Easton @ 2017-11-20 6:36 ` Linus Torvalds 0 siblings, 0 replies; 36+ messages in thread From: Linus Torvalds @ 2017-11-20 6:36 UTC (permalink / raw) To: Kevin Easton Cc: Thomas Gleixner, Mark Salyzyn, Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt, John Stultz On Sun, Nov 19, 2017 at 8:20 PM, Kevin Easton <kevin@guarana.org> wrote: > > As a comment from the userspace peanut gallery, I personally hope this > does pass muster. The existing POSIX wording implies that MONOTONIC > doesn't stop counting over suspend, and that's what you need when you > want to know the time elapsed between two external events. Well, people testing the patch (being aware of Thomas' warning) would be good. It would be particularly good to test suspending when a lot of timers are active, particularly on a machine that isn't so beefy as to hide any big load spikes etc. Traditionally that tends to be networking but also lots of graphical programs with animations.. Linus ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2017-11-14 22:50 ` Thomas Gleixner ` (2 preceding siblings ...) 2017-11-15 0:00 ` Linus Torvalds @ 2018-01-29 20:34 ` Mark Salyzyn 2018-01-29 21:49 ` Thomas Gleixner 3 siblings, 1 reply; 36+ messages in thread From: Mark Salyzyn @ 2018-01-29 20:34 UTC (permalink / raw) To: Thomas Gleixner, Linus Torvalds Cc: Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt Any movement on the following proposal? tglx@ do you have an update? -- Mark On 11/14/2017 02:50 PM, Thomas Gleixner wrote: > On Tue, 14 Nov 2017, Linus Torvalds wrote: >> I'd suggest simply adding another time to the internal 'struct >> printk_log' (and probably just set it in "log_store()"). >> >> Doesn't that seem much simpler and more targeted? >> >> And exactly because it's targeted and only changes that one thing, now >> people that don't run Android, and have systemd that actually looks at >> that timestamp, can _also_ enable that wall clock output if they want >> to, without it impacting timestamps at all. >> >> In fact, I would expect that at that point you actually would want to >> have multiple different formats that don't necessarily have anything >> to do with the clock source, but simply with the formatting. >> >> Do you want wallclock to be shown in UTC? Do you want to show it in >> the system timezeone? Do you want to always show the full resolution, >> or do you want to show something that takes the previous log entry >> into account, so it only shows the date if it has actually changed, >> and maybe shows the time as just a delta if it is small? >> >> Because now it's clearly about the freeflowing _text_, not about the >> timestamp that is processed by tools. >> >> The text field done by print_time() really is _that_ different from >> the timestamp field that is exposed to tools. >> >> Yes, somebody might still be parsing it, and maybe then you need to >> set that text field to a legacy format anyway, but at least it's a >> whole lot easier to explain what's going on. > Something like the untested patch below should do the storage and converts > the primary timestamp from local_clock() to clock monotonic. > > The print out and /dev/kmsg is unchanged. That's a separate story. > > Thanks, > > tglx > > 8<------------------- > --- a/kernel/printk/printk.c > +++ b/kernel/printk/printk.c > @@ -352,7 +352,11 @@ enum log_flags { > }; > > struct printk_log { > - u64 ts_nsec; /* timestamp in nanoseconds */ > + /* Timestamps in nanoseconds */ > + union { > + u64 ts_nsec; > + struct timestamps ts; > + }; > u16 len; /* length of entire record */ > u16 text_len; /* length of text buffer */ > u16 dict_len; /* length of dictionary buffer */ > @@ -578,7 +582,7 @@ static u32 truncate_msg(u16 *text_len, u > > /* insert record into the buffer, discard old ones, update heads */ > static int log_store(int facility, int level, > - enum log_flags flags, u64 ts_nsec, > + enum log_flags flags, struct timestamps *ts, > const char *dict, u16 dict_len, > const char *text, u16 text_len) > { > @@ -621,10 +625,10 @@ static int log_store(int facility, int l > msg->facility = facility; > msg->level = level & 7; > msg->flags = flags & 0x1f; > - if (ts_nsec > 0) > - msg->ts_nsec = ts_nsec; > + if (ts && ts->mono > 0) > + msg->ts = *ts; > else > - msg->ts_nsec = local_clock(); > + ktime_get_fast_timestamps(&msg->ts); > memset(log_dict(msg) + dict_len, 0, pad_len); > msg->len = size; > > @@ -687,7 +691,7 @@ static void append_char(char **pp, char > static ssize_t msg_print_ext_header(char *buf, size_t size, > struct printk_log *msg, u64 seq) > { > - u64 ts_usec = msg->ts_nsec; > + u64 ts_usec = msg->ts.mono; > > do_div(ts_usec, 1000); > > @@ -1022,6 +1026,7 @@ void log_buf_vmcoreinfo_setup(void) > */ > VMCOREINFO_STRUCT_SIZE(printk_log); > VMCOREINFO_OFFSET(printk_log, ts_nsec); > + VMCOREINFO_OFFSET(printk_log, ts); > VMCOREINFO_OFFSET(printk_log, len); > VMCOREINFO_OFFSET(printk_log, text_len); > VMCOREINFO_OFFSET(printk_log, dict_len); > @@ -1204,17 +1209,18 @@ static inline void boot_delay_msec(int l > static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME); > module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); > > -static size_t print_time(u64 ts, char *buf) > +static size_t print_time(const struct timestamps *ts, char *buf) > { > unsigned long rem_nsec; > + u64 mono = ts->mono; > > if (!printk_time) > return 0; > > - rem_nsec = do_div(ts, 1000000000); > + rem_nsec = do_div(mono, 1000000000); > > if (!buf) > - return snprintf(NULL, 0, "[%5lu.000000] ", (unsigned long)ts); > + return snprintf(NULL, 0, "[%5lu.000000] ", (unsigned long)mono); > > return sprintf(buf, "[%5lu.%06lu] ", > (unsigned long)ts, rem_nsec / 1000); > @@ -1239,7 +1245,7 @@ static size_t print_prefix(const struct > } > } > > - len += print_time(msg->ts_nsec, buf ? buf + len : NULL); > + len += print_time(&msg->ts, buf ? buf + len : NULL); > return len; > } > > @@ -1599,7 +1605,7 @@ static struct cont { > char buf[LOG_LINE_MAX]; > size_t len; /* length == 0 means unused buffer */ > struct task_struct *owner; /* task of first print*/ > - u64 ts_nsec; /* time of first print */ > + struct timestamps ts; /* time of first print */ > u8 level; /* log level of first message */ > u8 facility; /* log facility of first message */ > enum log_flags flags; /* prefix, newline flags */ > @@ -1610,7 +1616,7 @@ static void cont_flush(void) > if (cont.len == 0) > return; > > - log_store(cont.facility, cont.level, cont.flags, cont.ts_nsec, > + log_store(cont.facility, cont.level, cont.flags, &cont.ts, > NULL, 0, cont.buf, cont.len); > cont.len = 0; > } > @@ -1631,7 +1637,7 @@ static bool cont_add(int facility, int l > cont.facility = facility; > cont.level = level; > cont.owner = current; > - cont.ts_nsec = local_clock(); > + ktime_get_fast_timestamps(&cont.ts); > cont.flags = flags; > } > > @@ -1677,7 +1683,8 @@ static size_t log_output(int facility, i > } > > /* Store it in the record log */ > - return log_store(facility, level, lflags, 0, dict, dictlen, text, text_len); > + return log_store(facility, level, lflags, NULL, dict, dictlen, text, > + text_len); > } > > asmlinkage int vprintk_emit(int facility, int level, > --- a/include/linux/timekeeping.h > +++ b/include/linux/timekeeping.h > @@ -146,7 +146,19 @@ extern void timekeeping_inject_sleeptime > * PPS accessor > */ > extern void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, > - struct timespec64 *ts_real); > + struct timespec64 *ts_real); > + > +/* > + * struct timestanps - Simultaneous mono/boot/real timestamps > + * @mono: Monotonic timestamp > + * @boot: Boottime timestamp > + * @real: Realtime timestamp > + */ > +struct timestamps { > + u64 mono; > + u64 boot; > + u64 real; > +}; > > /* > * struct system_time_snapshot - simultaneous raw/real time capture with > @@ -206,6 +218,9 @@ extern int get_device_system_crosststamp > */ > extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); > > +/* NMI safe mono/boot/realtime timestamps */ > +extern void ktime_get_fast_timestamps(struct timestamps *snap); > + > /* > * Persistent clock related interfaces > */ > --- a/kernel/time/timekeeping.c > +++ b/kernel/time/timekeeping.c > @@ -496,29 +496,29 @@ u64 notrace ktime_get_boot_fast_ns(void) > } > EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); > > - > /* > * See comment for __ktime_get_fast_ns() vs. timestamp ordering > */ > -static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf) > +static notrace u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) > { > struct tk_read_base *tkr; > + u64 basem, baser, delta; > unsigned int seq; > - u64 now; > > do { > seq = raw_read_seqcount_latch(&tkf->seq); > tkr = tkf->base + (seq & 0x01); > - now = ktime_to_ns(tkr->base_real); > + basem = ktime_to_ns(tkr->base); > + baser = ktime_to_ns(tkr->base_real); > > - now += timekeeping_delta_to_ns(tkr, > - clocksource_delta( > - tk_clock_read(tkr), > - tkr->cycle_last, > - tkr->mask)); > + delta = timekeeping_delta_to_ns(tkr, > + clocksource_delta(tk_clock_read(tkr), > + tkr->cycle_last, tkr->mask)); > } while (read_seqcount_retry(&tkf->seq, seq)); > > - return now; > + if (mono) > + *mono = basem + delta; > + return baser + delta; > } > > /** > @@ -526,11 +526,25 @@ static __always_inline u64 __ktime_get_r > */ > u64 ktime_get_real_fast_ns(void) > { > - return __ktime_get_real_fast_ns(&tk_fast_mono); > + return __ktime_get_real_fast(&tk_fast_mono, NULL); > } > EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns); > > /** > + * ktime_get_fast_timestamps: - NMI safe timestamps > + * @snap: Pointer to timestamp storage > + * > + * Stores clock monotonic, boottime and realtime time stamps > + */ > +void ktime_get_fast_timestamps(struct timestamps *snap) > +{ > + struct timekeeper *tk = &tk_core.timekeeper; > + > + snap->real = __ktime_get_real_fast(&tk_fast_mono, &snap->mono); > + snap->boot = snap->mono + ktime_to_ns(tk->offs_boot); > +} > + > +/** > * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. > * @tk: Timekeeper to snapshot. > * ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [GIT pull] printk updates for 4.15 2018-01-29 20:34 ` Mark Salyzyn @ 2018-01-29 21:49 ` Thomas Gleixner 0 siblings, 0 replies; 36+ messages in thread From: Thomas Gleixner @ 2018-01-29 21:49 UTC (permalink / raw) To: Mark Salyzyn Cc: Linus Torvalds, Petr Mladek, LKML, Ingo Molnar, H. Peter Anvin, Peter Zijlstra, Andrew Morton, Prarit Bhargava, Sergey Senozhatsky, Steven Rostedt On Mon, 29 Jan 2018, Mark Salyzyn wrote: > Any movement on the following proposal? tglx@ do you have an update? This sits in my inbox since then due to the melted spectrum mess.... ^ permalink raw reply [flat|nested] 36+ messages in thread
end of thread, other threads:[~2018-01-29 21:49 UTC | newest] Thread overview: 36+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2017-11-13 9:36 [GIT pull] printk updates for 4.15 Thomas Gleixner 2017-11-14 1:18 ` Linus Torvalds 2017-11-14 2:48 ` Linus Torvalds 2017-11-14 10:03 ` Petr Mladek 2017-11-14 13:28 ` Prarit Bhargava 2017-11-14 15:56 ` Mark Salyzyn 2017-11-15 0:48 ` Sergey Senozhatsky 2017-11-14 17:20 ` Linus Torvalds 2017-11-14 20:21 ` Thomas Gleixner 2017-11-14 21:07 ` Linus Torvalds 2017-11-14 21:09 ` Thomas Gleixner 2017-11-14 21:16 ` Mark Salyzyn 2017-11-14 21:29 ` Linus Torvalds 2017-11-14 22:10 ` Mark Salyzyn 2017-11-14 22:37 ` Linus Torvalds 2017-11-14 22:50 ` Thomas Gleixner 2017-11-14 23:00 ` Joe Perches 2017-11-14 23:00 ` Linus Torvalds 2017-11-14 23:04 ` Thomas Gleixner 2017-11-14 23:18 ` Linus Torvalds 2017-11-14 23:22 ` Thomas Gleixner 2017-11-15 0:00 ` Linus Torvalds 2017-11-15 8:04 ` Ingo Molnar 2017-11-15 16:26 ` Mark Salyzyn 2017-11-15 17:42 ` Linus Torvalds 2017-11-16 0:37 ` Thomas Gleixner 2017-11-16 1:23 ` John Stultz 2017-11-16 1:32 ` Linus Torvalds 2017-11-16 7:12 ` Thomas Gleixner 2017-11-18 0:26 ` Thomas Gleixner 2017-11-18 0:44 ` Linus Torvalds 2017-11-18 1:00 ` Thomas Gleixner 2017-11-20 6:20 ` Kevin Easton 2017-11-20 6:36 ` Linus Torvalds 2018-01-29 20:34 ` Mark Salyzyn 2018-01-29 21:49 ` Thomas Gleixner
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox