public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/3] ring-buffer: less locking and only disable preemption
@ 2008-10-04  6:00 Steven Rostedt
  2008-10-04  6:00 ` [PATCH 1/3] ring-buffer: move page indexes into page headers Steven Rostedt
                   ` (3 more replies)
  0 siblings, 4 replies; 16+ messages in thread
From: Steven Rostedt @ 2008-10-04  6:00 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Thomas Gleixner, Peter Zijlstra, Andrew Morton,
	Linus Torvalds, Mathieu Desnoyers

Ingo,

These patches need to be put through the wringer. Could you add them
to your ring-buffer branch, so we can test them out before putting
them into your master branch.

The following patches bring the ring buffer closer to a lockless
solution. They restrict the locking to only the actual moving of the
tail/write pointer from one page to the next. Interrupts are now
enabled during most of the writes.

A lot of the locking protection is still within the ftrace infrastructure.
The last patch takes some of that away.

The function tracer cannot be reentrant simply by its nature: it
traces everything, and so can cause recursion issues.

-- Steve


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 1/3] ring-buffer: move page indexes into page headers
  2008-10-04  6:00 [PATCH 0/3] ring-buffer: less locking and only disable preemption Steven Rostedt
@ 2008-10-04  6:00 ` Steven Rostedt
  2008-10-04  6:00 ` [PATCH 2/3] ring-buffer: make reentrant Steven Rostedt
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 16+ messages in thread
From: Steven Rostedt @ 2008-10-04  6:00 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Thomas Gleixner, Peter Zijlstra, Andrew Morton,
	Linus Torvalds, Mathieu Desnoyers, Steven Rostedt

[-- Attachment #1: ring-buffer-move-index-in-page-header.patch --]
[-- Type: text/plain, Size: 8939 bytes --]

Remove the global head and tail indexes and move them into the
page header. Each page will now keep track of where the last
write and read was made. We also rename the head and tail to read
and write for better clarification.

This patch is needed for future enhancements to move the ring buffer
to a lockless solution.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c |   75 ++++++++++++++++++++++++---------------------
 1 file changed, 41 insertions(+), 34 deletions(-)

Index: linux-tip.git/kernel/trace/ring_buffer.c
===================================================================
--- linux-tip.git.orig/kernel/trace/ring_buffer.c	2008-10-04 01:14:30.000000000 -0400
+++ linux-tip.git/kernel/trace/ring_buffer.c	2008-10-04 01:18:17.000000000 -0400
@@ -117,6 +117,8 @@ void *ring_buffer_event_data(struct ring
 struct buffer_page {
 	u64		 time_stamp;	/* page time stamp */
 	unsigned	 size;		/* size of page data */
+	unsigned	 write;		/* index for next write */
+	unsigned	 read;		/* index for next read */
 	struct list_head list;		/* list of free pages */
 	void *page;			/* Actual data page */
 };
@@ -153,11 +155,8 @@ struct ring_buffer_per_cpu {
 	spinlock_t			lock;
 	struct lock_class_key		lock_key;
 	struct list_head		pages;
-	unsigned long			head;	/* read from head */
-	unsigned long			tail;	/* write to tail */
-	unsigned long			reader;
-	struct buffer_page		*head_page;
-	struct buffer_page		*tail_page;
+	struct buffer_page		*head_page;	/* read from head */
+	struct buffer_page		*tail_page;	/* write to tail */
 	struct buffer_page		*reader_page;
 	unsigned long			overrun;
 	unsigned long			entries;
@@ -566,10 +565,11 @@ int ring_buffer_resize(struct ring_buffe
 
 static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	return (cpu_buffer->reader == cpu_buffer->reader_page->size &&
+	return cpu_buffer->reader_page->read == cpu_buffer->reader_page->size &&
 		(cpu_buffer->tail_page == cpu_buffer->reader_page ||
 		 (cpu_buffer->tail_page == cpu_buffer->head_page &&
-		  cpu_buffer->head == cpu_buffer->tail)));
+		  cpu_buffer->head_page->read ==
+		  cpu_buffer->tail_page->write));
 }
 
 static inline int rb_null_event(struct ring_buffer_event *event)
@@ -577,7 +577,7 @@ static inline int rb_null_event(struct r
 	return event->type == RINGBUF_TYPE_PADDING;
 }
 
-static inline void *rb_page_index(struct buffer_page *page, unsigned index)
+static inline void *__rb_page_index(struct buffer_page *page, unsigned index)
 {
 	return page->page + index;
 }
@@ -585,15 +585,21 @@ static inline void *rb_page_index(struct
 static inline struct ring_buffer_event *
 rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	return rb_page_index(cpu_buffer->reader_page,
-			     cpu_buffer->reader);
+	return __rb_page_index(cpu_buffer->reader_page,
+			       cpu_buffer->reader_page->read);
+}
+
+static inline struct ring_buffer_event *
+rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	return __rb_page_index(cpu_buffer->head_page,
+			       cpu_buffer->head_page->read);
 }
 
 static inline struct ring_buffer_event *
 rb_iter_head_event(struct ring_buffer_iter *iter)
 {
-	return rb_page_index(iter->head_page,
-			     iter->head);
+	return __rb_page_index(iter->head_page, iter->head);
 }
 
 /*
@@ -610,7 +616,7 @@ static void rb_update_overflow(struct ri
 	for (head = 0; head < rb_head_size(cpu_buffer);
 	     head += rb_event_length(event)) {
 
-		event = rb_page_index(cpu_buffer->head_page, head);
+		event = __rb_page_index(cpu_buffer->head_page, head);
 		BUG_ON(rb_null_event(event));
 		/* Only count data entries */
 		if (event->type != RINGBUF_TYPE_DATA)
@@ -640,13 +646,13 @@ rb_add_stamp(struct ring_buffer_per_cpu 
 
 static void rb_reset_head_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	cpu_buffer->head = 0;
+	cpu_buffer->head_page->read = 0;
 }
 
 static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	cpu_buffer->read_stamp = cpu_buffer->reader_page->time_stamp;
-	cpu_buffer->reader = 0;
+	cpu_buffer->reader_page->read = 0;
 }
 
 static inline void rb_inc_iter(struct ring_buffer_iter *iter)
@@ -743,9 +749,8 @@ __rb_reserve_next(struct ring_buffer_per
 	struct ring_buffer *buffer = cpu_buffer->buffer;
 	struct ring_buffer_event *event;
 
-	/* No locking needed for tail page */
 	tail_page = cpu_buffer->tail_page;
-	tail = cpu_buffer->tail;
+	tail = cpu_buffer->tail_page->write;
 
 	if (tail + length > BUF_PAGE_SIZE) {
 		struct buffer_page *next_page = tail_page;
@@ -774,7 +779,7 @@ __rb_reserve_next(struct ring_buffer_per
 		}
 
 		if (tail != BUF_PAGE_SIZE) {
-			event = rb_page_index(tail_page, tail);
+			event = __rb_page_index(tail_page, tail);
 			/* page padding */
 			event->type = RINGBUF_TYPE_PADDING;
 		}
@@ -784,14 +789,14 @@ __rb_reserve_next(struct ring_buffer_per
 		tail_page->size = 0;
 		tail = 0;
 		cpu_buffer->tail_page = tail_page;
-		cpu_buffer->tail = tail;
+		cpu_buffer->tail_page->write = tail;
 		rb_add_stamp(cpu_buffer, ts);
 		spin_unlock(&cpu_buffer->lock);
 	}
 
 	BUG_ON(tail + length > BUF_PAGE_SIZE);
 
-	event = rb_page_index(tail_page, tail);
+	event = __rb_page_index(tail_page, tail);
 	rb_update_event(event, type, length);
 
 	return event;
@@ -823,12 +828,12 @@ rb_add_time_stamp(struct ring_buffer_per
 		return -1;
 
 	/* check to see if we went to the next page */
-	if (cpu_buffer->tail) {
+	if (cpu_buffer->tail_page->write) {
 		/* Still on same page, update timestamp */
 		event->time_delta = *delta & TS_MASK;
 		event->array[0] = *delta >> TS_SHIFT;
 		/* commit the time event */
-		cpu_buffer->tail +=
+		cpu_buffer->tail_page->write +=
 			rb_event_length(event);
 		cpu_buffer->write_stamp = *ts;
 		*delta = 0;
@@ -846,7 +851,7 @@ rb_reserve_next_event(struct ring_buffer
 
 	ts = ring_buffer_time_stamp(cpu_buffer->cpu);
 
-	if (cpu_buffer->tail) {
+	if (cpu_buffer->tail_page->write) {
 		delta = ts - cpu_buffer->write_stamp;
 
 		if (test_time_stamp(delta)) {
@@ -868,7 +873,7 @@ rb_reserve_next_event(struct ring_buffer
 		return NULL;
 
 	/* If the reserve went to the next page, our delta is zero */
-	if (!cpu_buffer->tail)
+	if (!cpu_buffer->tail_page->write)
 		delta = 0;
 
 	event->time_delta = delta;
@@ -933,8 +938,8 @@ ring_buffer_lock_reserve(struct ring_buf
 static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
 		      struct ring_buffer_event *event)
 {
-	cpu_buffer->tail += rb_event_length(event);
-	cpu_buffer->tail_page->size = cpu_buffer->tail;
+	cpu_buffer->tail_page->write += rb_event_length(event);
+	cpu_buffer->tail_page->size = cpu_buffer->tail_page->write;
 	cpu_buffer->write_stamp += event->time_delta;
 	cpu_buffer->entries++;
 }
@@ -1178,10 +1183,10 @@ void ring_buffer_iter_reset(struct ring_
 	/* Iterator usage is expected to have record disabled */
 	if (list_empty(&cpu_buffer->reader_page->list)) {
 		iter->head_page = cpu_buffer->head_page;
-		iter->head = cpu_buffer->head;
+		iter->head = cpu_buffer->head_page->read;
 	} else {
 		iter->head_page = cpu_buffer->reader_page;
-		iter->head = cpu_buffer->reader;
+		iter->head = cpu_buffer->reader_page->read;
 	}
 	if (iter->head)
 		iter->read_stamp = cpu_buffer->read_stamp;
@@ -1200,7 +1205,7 @@ int ring_buffer_iter_empty(struct ring_b
 	cpu_buffer = iter->cpu_buffer;
 
 	return iter->head_page == cpu_buffer->tail_page &&
-		iter->head == cpu_buffer->tail;
+		iter->head == cpu_buffer->tail_page->write;
 }
 
 static void
@@ -1277,11 +1282,11 @@ rb_get_reader_page(struct ring_buffer_pe
 	reader = cpu_buffer->reader_page;
 
 	/* If there's more to read, return this page */
-	if (cpu_buffer->reader < reader->size)
+	if (cpu_buffer->reader_page->read < reader->size)
 		goto out;
 
 	/* Never should we have an index greater than the size */
-	WARN_ON(cpu_buffer->reader > reader->size);
+	WARN_ON(cpu_buffer->reader_page->read > reader->size);
 
 	/* check if we caught up to the tail */
 	reader = NULL;
@@ -1342,7 +1347,7 @@ static void rb_advance_reader(struct rin
 	rb_update_read_stamp(cpu_buffer, event);
 
 	length = rb_event_length(event);
-	cpu_buffer->reader += length;
+	cpu_buffer->reader_page->read += length;
 }
 
 static void rb_advance_iter(struct ring_buffer_iter *iter)
@@ -1373,7 +1378,7 @@ static void rb_advance_iter(struct ring_
 	 * at the tail of the buffer.
 	 */
 	BUG_ON((iter->head_page == cpu_buffer->tail_page) &&
-	       (iter->head + length > cpu_buffer->tail));
+	       (iter->head + length > cpu_buffer->tail_page->write));
 
 	rb_update_iter_read_stamp(iter, event);
 
@@ -1623,7 +1628,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu 
 	INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
 	cpu_buffer->reader_page->size = 0;
 
-	cpu_buffer->head = cpu_buffer->tail = cpu_buffer->reader = 0;
+	cpu_buffer->head_page->read = 0;
+	cpu_buffer->tail_page->write = 0;
+	cpu_buffer->reader_page->read = 0;
 
 	cpu_buffer->overrun = 0;
 	cpu_buffer->entries = 0;

-- 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 2/3] ring-buffer: make reentrant
  2008-10-04  6:00 [PATCH 0/3] ring-buffer: less locking and only disable preemption Steven Rostedt
  2008-10-04  6:00 ` [PATCH 1/3] ring-buffer: move page indexes into page headers Steven Rostedt
@ 2008-10-04  6:00 ` Steven Rostedt
  2008-10-04  6:01 ` [PATCH 3/3] ftrace: make some tracers reentrant Steven Rostedt
  2008-10-04  8:40 ` [PATCH 0/3] ring-buffer: less locking and only disable preemption Ingo Molnar
  3 siblings, 0 replies; 16+ messages in thread
From: Steven Rostedt @ 2008-10-04  6:00 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Thomas Gleixner, Peter Zijlstra, Andrew Morton,
	Linus Torvalds, Mathieu Desnoyers, Steven Rostedt

[-- Attachment #1: ring-buffer-reentrant.patch --]
[-- Type: text/plain, Size: 23960 bytes --]

This patch replaces the local_irq_save/restore with preempt_disable/
enable. This allows for interrupts to enter while recording.
To write to the ring buffer, you must reserve data, and then
commit it. During this time, an interrupt may call a trace function
that will also record into the buffer before the commit is made.

The interrupt will reserve its entry after the first entry, even
though the first entry did not finish yet.

The time stamp delta of the interrupt entry will be zero, since
in the view of the trace, the interrupt happened during the
first field anyway.

Locking still takes place when the tail/write moves from one page
to the next. The reader always takes the locks.

A new page pointer is added, called the commit. The write/tail will
always point to the end of all entries. The commit field will
point to the last committed entry. Only this commit entry may
update the write time stamp.

The reader can only go up to the commit. It cannot go past it.

If a lot of interrupts come in during a commit that fills up the
buffer, and it happens to make it all the way around the buffer
back to the commit, then a warning is printed and new events will
be dropped.


Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c |  485 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 373 insertions(+), 112 deletions(-)

Index: linux-tip.git/kernel/trace/ring_buffer.c
===================================================================
--- linux-tip.git.orig/kernel/trace/ring_buffer.c	2008-10-04 01:18:17.000000000 -0400
+++ linux-tip.git/kernel/trace/ring_buffer.c	2008-10-04 01:33:27.000000000 -0400
@@ -116,8 +116,8 @@ void *ring_buffer_event_data(struct ring
  */
 struct buffer_page {
 	u64		 time_stamp;	/* page time stamp */
-	unsigned	 size;		/* size of page data */
-	unsigned	 write;		/* index for next write */
+	local_t		 write;		/* index for next write */
+	local_t		 commit;	/* write commited index */
 	unsigned	 read;		/* index for next read */
 	struct list_head list;		/* list of free pages */
 	void *page;			/* Actual data page */
@@ -157,6 +157,7 @@ struct ring_buffer_per_cpu {
 	struct list_head		pages;
 	struct buffer_page		*head_page;	/* read from head */
 	struct buffer_page		*tail_page;	/* write to tail */
+	struct buffer_page		*commit_page;	/* commited pages */
 	struct buffer_page		*reader_page;
 	unsigned long			overrun;
 	unsigned long			entries;
@@ -185,12 +186,32 @@ struct ring_buffer_iter {
 	u64				read_stamp;
 };
 
-#define RB_WARN_ON(buffer, cond)			\
-	if (unlikely(cond)) {				\
-		atomic_inc(&buffer->record_disabled);	\
-		WARN_ON(1);				\
-		return -1;				\
-	}
+#define RB_WARN_ON(buffer, cond)				\
+	do {							\
+		if (unlikely(cond)) {				\
+			atomic_inc(&buffer->record_disabled);	\
+			WARN_ON(1);				\
+		}						\
+	} while (0)
+
+#define RB_WARN_ON_RET(buffer, cond)				\
+	do {							\
+		if (unlikely(cond)) {				\
+			atomic_inc(&buffer->record_disabled);	\
+			WARN_ON(1);				\
+			return -1;				\
+		}						\
+	} while (0)
+
+#define RB_WARN_ON_ONCE(buffer, cond)				\
+	do {							\
+		static int once;				\
+		if (unlikely(cond) && !once) {			\
+			once++;					\
+			atomic_inc(&buffer->record_disabled);	\
+			WARN_ON(1);				\
+		}						\
+	} while (0)
 
 /**
  * check_pages - integrity check of buffer pages
@@ -204,22 +225,19 @@ static int rb_check_pages(struct ring_bu
 	struct list_head *head = &cpu_buffer->pages;
 	struct buffer_page *page, *tmp;
 
-	RB_WARN_ON(cpu_buffer, head->next->prev != head);
-	RB_WARN_ON(cpu_buffer, head->prev->next != head);
+	RB_WARN_ON_RET(cpu_buffer, head->next->prev != head);
+	RB_WARN_ON_RET(cpu_buffer, head->prev->next != head);
 
 	list_for_each_entry_safe(page, tmp, head, list) {
-		RB_WARN_ON(cpu_buffer, page->list.next->prev != &page->list);
-		RB_WARN_ON(cpu_buffer, page->list.prev->next != &page->list);
+		RB_WARN_ON_RET(cpu_buffer,
+			       page->list.next->prev != &page->list);
+		RB_WARN_ON_RET(cpu_buffer,
+			       page->list.prev->next != &page->list);
 	}
 
 	return 0;
 }
 
-static unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
-{
-	return cpu_buffer->head_page->size;
-}
-
 static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
 			     unsigned nr_pages)
 {
@@ -286,7 +304,6 @@ rb_allocate_cpu_buffer(struct ring_buffe
 	page->page = (void *)addr;
 
 	INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
-	cpu_buffer->reader_page->size = 0;
 
 	ret = rb_allocate_pages(cpu_buffer, buffer->pages);
 	if (ret < 0)
@@ -294,8 +311,7 @@ rb_allocate_cpu_buffer(struct ring_buffe
 
 	cpu_buffer->head_page
 		= list_entry(cpu_buffer->pages.next, struct buffer_page, list);
-	cpu_buffer->tail_page
-		= list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+	cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
 
 	return cpu_buffer;
 
@@ -563,15 +579,6 @@ int ring_buffer_resize(struct ring_buffe
 	return -ENOMEM;
 }
 
-static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
-{
-	return cpu_buffer->reader_page->read == cpu_buffer->reader_page->size &&
-		(cpu_buffer->tail_page == cpu_buffer->reader_page ||
-		 (cpu_buffer->tail_page == cpu_buffer->head_page &&
-		  cpu_buffer->head_page->read ==
-		  cpu_buffer->tail_page->write));
-}
-
 static inline int rb_null_event(struct ring_buffer_event *event)
 {
 	return event->type == RINGBUF_TYPE_PADDING;
@@ -602,6 +609,33 @@ rb_iter_head_event(struct ring_buffer_it
 	return __rb_page_index(iter->head_page, iter->head);
 }
 
+static inline unsigned rb_page_write(struct buffer_page *bpage)
+{
+	return local_read(&bpage->write);
+}
+
+static inline unsigned rb_page_commit(struct buffer_page *bpage)
+{
+	return local_read(&bpage->commit);
+}
+
+/* Size is determined by what has been commited */
+static inline unsigned rb_page_size(struct buffer_page *bpage)
+{
+	return rb_page_commit(bpage);
+}
+
+static inline unsigned
+rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	return rb_page_commit(cpu_buffer->commit_page);
+}
+
+static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	return rb_page_commit(cpu_buffer->head_page);
+}
+
 /*
  * When the tail hits the head and the buffer is in overwrite mode,
  * the head jumps to the next page and all content on the previous
@@ -637,16 +671,76 @@ static inline void rb_inc_page(struct ri
 	*page = list_entry(p, struct buffer_page, list);
 }
 
+static inline unsigned
+rb_event_index(struct ring_buffer_event *event)
+{
+	unsigned long addr = (unsigned long)event;
+
+	return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE);
+}
+
+static inline int
+rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
+	     struct ring_buffer_event *event)
+{
+	unsigned long addr = (unsigned long)event;
+	unsigned long index;
+
+	index = rb_event_index(event);
+	addr &= PAGE_MASK;
+
+	return cpu_buffer->commit_page->page == (void *)addr &&
+		rb_commit_index(cpu_buffer) == index;
+}
+
 static inline void
-rb_add_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
+rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
+		    struct ring_buffer_event *event)
 {
-	cpu_buffer->tail_page->time_stamp = *ts;
-	cpu_buffer->write_stamp = *ts;
+	unsigned long addr = (unsigned long)event;
+	unsigned long index;
+
+	index = rb_event_index(event);
+	addr &= PAGE_MASK;
+
+	while (cpu_buffer->commit_page->page != (void *)addr) {
+		RB_WARN_ON(cpu_buffer,
+			   cpu_buffer->commit_page == cpu_buffer->tail_page);
+		cpu_buffer->commit_page->commit =
+			cpu_buffer->commit_page->write;
+		rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
+		cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp;
+	}
+
+	/* Now set the commit to the event's index */
+	local_set(&cpu_buffer->commit_page->commit, index);
 }
 
-static void rb_reset_head_page(struct ring_buffer_per_cpu *cpu_buffer)
+static inline void
+rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	cpu_buffer->head_page->read = 0;
+	/*
+	 * We only race with interrupts and NMIs on this CPU.
+	 * If we own the commit event, then we can commit
+	 * all others that interrupted us, since the interruptions
+	 * are in stack format (they finish before they come
+	 * back to us). This allows us to do a simple loop to
+	 * assign the commit to the tail.
+	 */
+	while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
+		cpu_buffer->commit_page->commit =
+			cpu_buffer->commit_page->write;
+		rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
+		cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp;
+		/* add barrier to keep gcc from optimizing too much */
+		barrier();
+	}
+	while (rb_commit_index(cpu_buffer) !=
+	       rb_page_write(cpu_buffer->commit_page)) {
+		cpu_buffer->commit_page->commit =
+			cpu_buffer->commit_page->write;
+		barrier();
+	}
 }
 
 static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
@@ -745,61 +839,120 @@ __rb_reserve_next(struct ring_buffer_per
 		  unsigned type, unsigned long length, u64 *ts)
 {
 	struct buffer_page *tail_page, *head_page, *reader_page;
-	unsigned long tail;
+	unsigned long tail, write;
 	struct ring_buffer *buffer = cpu_buffer->buffer;
 	struct ring_buffer_event *event;
+	unsigned long flags;
 
 	tail_page = cpu_buffer->tail_page;
-	tail = cpu_buffer->tail_page->write;
+	write = local_add_return(length, &tail_page->write);
+	tail = write - length;
 
-	if (tail + length > BUF_PAGE_SIZE) {
+	/* See if we shot pass the end of this buffer page */
+	if (write > BUF_PAGE_SIZE) {
 		struct buffer_page *next_page = tail_page;
 
-		spin_lock(&cpu_buffer->lock);
+		spin_lock_irqsave(&cpu_buffer->lock, flags);
+
 		rb_inc_page(cpu_buffer, &next_page);
 
 		head_page = cpu_buffer->head_page;
 		reader_page = cpu_buffer->reader_page;
 
 		/* we grabbed the lock before incrementing */
-		WARN_ON(next_page == reader_page);
+		RB_WARN_ON(cpu_buffer, next_page == reader_page);
+
+		/*
+		 * If for some reason, we had an interrupt storm that made
+		 * it all the way around the buffer, bail, and warn
+		 * about it.
+		 */
+		if (unlikely(next_page == cpu_buffer->commit_page)) {
+			WARN_ON_ONCE(1);
+			goto out_unlock;
+		}
 
 		if (next_page == head_page) {
 			if (!(buffer->flags & RB_FL_OVERWRITE)) {
-				spin_unlock(&cpu_buffer->lock);
-				return NULL;
+				/* reset write */
+				if (tail <= BUF_PAGE_SIZE)
+					local_set(&tail_page->write, tail);
+				goto out_unlock;
 			}
 
-			/* count overflows */
-			rb_update_overflow(cpu_buffer);
+			/* tail_page has not moved yet? */
+			if (tail_page == cpu_buffer->tail_page) {
+				/* count overflows */
+				rb_update_overflow(cpu_buffer);
+
+				rb_inc_page(cpu_buffer, &head_page);
+				cpu_buffer->head_page = head_page;
+				cpu_buffer->head_page->read = 0;
+			}
+		}
 
-			rb_inc_page(cpu_buffer, &head_page);
-			cpu_buffer->head_page = head_page;
-			rb_reset_head_page(cpu_buffer);
+		/*
+		 * If the tail page is still the same as what we think
+		 * it is, then it is up to us to update the tail
+		 * pointer.
+		 */
+		if (tail_page == cpu_buffer->tail_page) {
+			local_set(&next_page->write, 0);
+			local_set(&next_page->commit, 0);
+			cpu_buffer->tail_page = next_page;
+
+			/* reread the time stamp */
+			*ts = ring_buffer_time_stamp(cpu_buffer->cpu);
+			cpu_buffer->tail_page->time_stamp = *ts;
 		}
 
-		if (tail != BUF_PAGE_SIZE) {
+		/*
+		 * The actual tail page has moved forward.
+		 */
+		if (tail < BUF_PAGE_SIZE) {
+			/* Mark the rest of the page with padding */
 			event = __rb_page_index(tail_page, tail);
-			/* page padding */
 			event->type = RINGBUF_TYPE_PADDING;
 		}
 
-		tail_page->size = tail;
-		tail_page = next_page;
-		tail_page->size = 0;
-		tail = 0;
-		cpu_buffer->tail_page = tail_page;
-		cpu_buffer->tail_page->write = tail;
-		rb_add_stamp(cpu_buffer, ts);
-		spin_unlock(&cpu_buffer->lock);
+		if (tail <= BUF_PAGE_SIZE)
+			/* Set the write back to the previous setting */
+			local_set(&tail_page->write, tail);
+
+		/*
+		 * If this was a commit entry that failed,
+		 * increment that too
+		 */
+		if (tail_page == cpu_buffer->commit_page &&
+		    tail == rb_commit_index(cpu_buffer)) {
+			rb_set_commit_to_write(cpu_buffer);
+		}
+
+		spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+
+		/* fail and let the caller try again */
+		return ERR_PTR(-EAGAIN);
 	}
 
-	BUG_ON(tail + length > BUF_PAGE_SIZE);
+	/* We reserved something on the buffer */
+
+	BUG_ON(write > BUF_PAGE_SIZE);
 
 	event = __rb_page_index(tail_page, tail);
 	rb_update_event(event, type, length);
 
+	/*
+	 * If this is a commit and the tail is zero, then update
+	 * this page's time stamp.
+	 */
+	if (!tail && rb_is_commit(cpu_buffer, event))
+		cpu_buffer->commit_page->time_stamp = *ts;
+
 	return event;
+
+ out_unlock:
+	spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+	return NULL;
 }
 
 static int
@@ -808,6 +961,7 @@ rb_add_time_stamp(struct ring_buffer_per
 {
 	struct ring_buffer_event *event;
 	static int once;
+	int ret;
 
 	if (unlikely(*delta > (1ULL << 59) && !once++)) {
 		printk(KERN_WARNING "Delta way too big! %llu"
@@ -825,21 +979,38 @@ rb_add_time_stamp(struct ring_buffer_per
 				  RB_LEN_TIME_EXTEND,
 				  ts);
 	if (!event)
-		return -1;
+		return -EBUSY;
 
-	/* check to see if we went to the next page */
-	if (cpu_buffer->tail_page->write) {
-		/* Still on same page, update timestamp */
-		event->time_delta = *delta & TS_MASK;
-		event->array[0] = *delta >> TS_SHIFT;
-		/* commit the time event */
-		cpu_buffer->tail_page->write +=
-			rb_event_length(event);
+	if (PTR_ERR(event) == -EAGAIN)
+		return -EAGAIN;
+
+	/* Only a commited time event can update the write stamp */
+	if (rb_is_commit(cpu_buffer, event)) {
+		/*
+		 * If this is the first on the page, then we need to
+		 * update the page itself, and just put in a zero.
+		 */
+		if (rb_event_index(event)) {
+			event->time_delta = *delta & TS_MASK;
+			event->array[0] = *delta >> TS_SHIFT;
+		} else {
+			cpu_buffer->commit_page->time_stamp = *ts;
+			event->time_delta = 0;
+			event->array[0] = 0;
+		}
 		cpu_buffer->write_stamp = *ts;
-		*delta = 0;
+		/* let the caller know this was the commit */
+		ret = 1;
+	} else {
+		/* Darn, this is just wasted space */
+		event->time_delta = 0;
+		event->array[0] = 0;
+		ret = 0;
 	}
 
-	return 0;
+	*delta = 0;
+
+	return ret;
 }
 
 static struct ring_buffer_event *
@@ -848,32 +1019,69 @@ rb_reserve_next_event(struct ring_buffer
 {
 	struct ring_buffer_event *event;
 	u64 ts, delta;
+	int commit = 0;
 
+ again:
 	ts = ring_buffer_time_stamp(cpu_buffer->cpu);
 
-	if (cpu_buffer->tail_page->write) {
+	/*
+	 * Only the first commit can update the timestamp.
+	 * Yes there is a race here. If an interrupt comes in
+	 * just after the conditional and it traces too, then it
+	 * will also check the deltas. More than one timestamp may
+	 * also be made. But only the entry that did the actual
+	 * commit will be something other than zero.
+	 */
+	if (cpu_buffer->tail_page == cpu_buffer->commit_page &&
+	    rb_page_write(cpu_buffer->tail_page) ==
+	    rb_commit_index(cpu_buffer)) {
+
 		delta = ts - cpu_buffer->write_stamp;
 
+		/* make sure this delta is calculated here */
+		barrier();
+
+		/* Did the write stamp get updated already? */
+		if (unlikely(ts < cpu_buffer->write_stamp))
+			goto again;
+
 		if (test_time_stamp(delta)) {
-			int ret;
 
-			ret = rb_add_time_stamp(cpu_buffer, &ts, &delta);
-			if (ret < 0)
+			commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
+
+			if (commit == -EBUSY)
 				return NULL;
+
+			if (commit == -EAGAIN)
+				goto again;
+
+			RB_WARN_ON(cpu_buffer, commit < 0);
 		}
-	} else {
-		spin_lock(&cpu_buffer->lock);
-		rb_add_stamp(cpu_buffer, &ts);
-		spin_unlock(&cpu_buffer->lock);
+	} else
+		/* Non commits have zero deltas */
 		delta = 0;
-	}
 
 	event = __rb_reserve_next(cpu_buffer, type, length, &ts);
-	if (!event)
+	if (PTR_ERR(event) == -EAGAIN)
+		goto again;
+
+	if (!event) {
+		if (unlikely(commit))
+			/*
+			 * Ouch! We needed a timestamp and it was commited. But
+			 * we didn't get our event reserved.
+			 */
+			rb_set_commit_to_write(cpu_buffer);
 		return NULL;
+	}
 
-	/* If the reserve went to the next page, our delta is zero */
-	if (!cpu_buffer->tail_page->write)
+	/*
+	 * If the timestamp was commited, make the commit our entry
+	 * now so that we will update it when needed.
+	 */
+	if (commit)
+		rb_set_commit_event(cpu_buffer, event);
+	else if (!rb_is_commit(cpu_buffer, event))
 		delta = 0;
 
 	event->time_delta = delta;
@@ -881,6 +1089,8 @@ rb_reserve_next_event(struct ring_buffer
 	return event;
 }
 
+static DEFINE_PER_CPU(int, rb_need_resched);
+
 /**
  * ring_buffer_lock_reserve - reserve a part of the buffer
  * @buffer: the ring buffer to reserve from
@@ -904,12 +1114,15 @@ ring_buffer_lock_reserve(struct ring_buf
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_event *event;
-	int cpu;
+	int cpu, resched;
 
 	if (atomic_read(&buffer->record_disabled))
 		return NULL;
 
-	local_irq_save(*flags);
+	/* If we are tracing schedule, we don't want to recurse */
+	resched = need_resched();
+	preempt_disable_notrace();
+
 	cpu = raw_smp_processor_id();
 
 	if (!cpu_isset(cpu, buffer->cpumask))
@@ -922,26 +1135,42 @@ ring_buffer_lock_reserve(struct ring_buf
 
 	length = rb_calculate_event_length(length);
 	if (length > BUF_PAGE_SIZE)
-		return NULL;
+		goto out;
 
 	event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length);
 	if (!event)
 		goto out;
 
+	/*
+	 * Need to store resched state on this cpu.
+	 * Only the first needs to.
+	 */
+
+	if (preempt_count() == 1)
+		per_cpu(rb_need_resched, cpu) = resched;
+
 	return event;
 
  out:
-	local_irq_restore(*flags);
+	if (resched)
+		preempt_enable_notrace();
+	else
+		preempt_enable_notrace();
 	return NULL;
 }
 
 static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
 		      struct ring_buffer_event *event)
 {
-	cpu_buffer->tail_page->write += rb_event_length(event);
-	cpu_buffer->tail_page->size = cpu_buffer->tail_page->write;
-	cpu_buffer->write_stamp += event->time_delta;
 	cpu_buffer->entries++;
+
+	/* Only process further if we own the commit */
+	if (!rb_is_commit(cpu_buffer, event))
+		return;
+
+	cpu_buffer->write_stamp += event->time_delta;
+
+	rb_set_commit_to_write(cpu_buffer);
 }
 
 /**
@@ -965,7 +1194,16 @@ int ring_buffer_unlock_commit(struct rin
 
 	rb_commit(cpu_buffer, event);
 
-	local_irq_restore(flags);
+	/*
+	 * Only the last preempt count needs to restore preemption.
+	 */
+	if (preempt_count() == 1) {
+		if (per_cpu(rb_need_resched, cpu))
+			preempt_enable_no_resched_notrace();
+		else
+			preempt_enable_notrace();
+	} else
+		preempt_enable_no_resched_notrace();
 
 	return 0;
 }
@@ -989,15 +1227,17 @@ int ring_buffer_write(struct ring_buffer
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_event *event;
-	unsigned long event_length, flags;
+	unsigned long event_length;
 	void *body;
 	int ret = -EBUSY;
-	int cpu;
+	int cpu, resched;
 
 	if (atomic_read(&buffer->record_disabled))
 		return -EBUSY;
 
-	local_irq_save(flags);
+	resched = need_resched();
+	preempt_disable_notrace();
+
 	cpu = raw_smp_processor_id();
 
 	if (!cpu_isset(cpu, buffer->cpumask))
@@ -1022,11 +1262,26 @@ int ring_buffer_write(struct ring_buffer
 
 	ret = 0;
  out:
-	local_irq_restore(flags);
+	if (resched)
+		preempt_enable_no_resched_notrace();
+	else
+		preempt_enable_notrace();
 
 	return ret;
 }
 
+static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	struct buffer_page *reader = cpu_buffer->reader_page;
+	struct buffer_page *head = cpu_buffer->head_page;
+	struct buffer_page *commit = cpu_buffer->commit_page;
+
+	return reader->read == rb_page_commit(reader) &&
+		(commit == reader ||
+		 (commit == head &&
+		  head->read == rb_page_commit(commit)));
+}
+
 /**
  * ring_buffer_record_disable - stop all writes into the buffer
  * @buffer: The ring buffer to stop writes to.
@@ -1204,8 +1459,8 @@ int ring_buffer_iter_empty(struct ring_b
 
 	cpu_buffer = iter->cpu_buffer;
 
-	return iter->head_page == cpu_buffer->tail_page &&
-		iter->head == cpu_buffer->tail_page->write;
+	return iter->head_page == cpu_buffer->commit_page &&
+		iter->head == rb_commit_index(cpu_buffer);
 }
 
 static void
@@ -1282,15 +1537,16 @@ rb_get_reader_page(struct ring_buffer_pe
 	reader = cpu_buffer->reader_page;
 
 	/* If there's more to read, return this page */
-	if (cpu_buffer->reader_page->read < reader->size)
+	if (cpu_buffer->reader_page->read < rb_page_size(reader))
 		goto out;
 
 	/* Never should we have an index greater than the size */
-	WARN_ON(cpu_buffer->reader_page->read > reader->size);
+	RB_WARN_ON(cpu_buffer,
+		   cpu_buffer->reader_page->read > rb_page_size(reader));
 
 	/* check if we caught up to the tail */
 	reader = NULL;
-	if (cpu_buffer->tail_page == cpu_buffer->reader_page)
+	if (cpu_buffer->commit_page == cpu_buffer->reader_page)
 		goto out;
 
 	/*
@@ -1301,7 +1557,9 @@ rb_get_reader_page(struct ring_buffer_pe
 	reader = cpu_buffer->head_page;
 	cpu_buffer->reader_page->list.next = reader->list.next;
 	cpu_buffer->reader_page->list.prev = reader->list.prev;
-	cpu_buffer->reader_page->size = 0;
+
+	local_set(&cpu_buffer->reader_page->write, 0);
+	local_set(&cpu_buffer->reader_page->commit, 0);
 
 	/* Make the reader page now replace the head */
 	reader->list.prev->next = &cpu_buffer->reader_page->list;
@@ -1313,7 +1571,7 @@ rb_get_reader_page(struct ring_buffer_pe
 	 */
 	cpu_buffer->head_page = cpu_buffer->reader_page;
 
-	if (cpu_buffer->tail_page != reader)
+	if (cpu_buffer->commit_page != reader)
 		rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
 
 	/* Finally update the reader page to the new head */
@@ -1363,8 +1621,8 @@ static void rb_advance_iter(struct ring_
 	/*
 	 * Check if we are at the end of the buffer.
 	 */
-	if (iter->head >= iter->head_page->size) {
-		BUG_ON(iter->head_page == cpu_buffer->tail_page);
+	if (iter->head >= rb_page_size(iter->head_page)) {
+		BUG_ON(iter->head_page == cpu_buffer->commit_page);
 		rb_inc_iter(iter);
 		return;
 	}
@@ -1377,16 +1635,16 @@ static void rb_advance_iter(struct ring_
 	 * This should not be called to advance the header if we are
 	 * at the tail of the buffer.
 	 */
-	BUG_ON((iter->head_page == cpu_buffer->tail_page) &&
-	       (iter->head + length > cpu_buffer->tail_page->write));
+	BUG_ON((iter->head_page == cpu_buffer->commit_page) &&
+	       (iter->head + length > rb_commit_index(cpu_buffer)));
 
 	rb_update_iter_read_stamp(iter, event);
 
 	iter->head += length;
 
 	/* check for end of page padding */
-	if ((iter->head >= iter->head_page->size) &&
-	    (iter->head_page != cpu_buffer->tail_page))
+	if ((iter->head >= rb_page_size(iter->head_page)) &&
+	    (iter->head_page != cpu_buffer->commit_page))
 		rb_advance_iter(iter);
 }
 
@@ -1420,7 +1678,7 @@ ring_buffer_peek(struct ring_buffer *buf
 
 	switch (event->type) {
 	case RINGBUF_TYPE_PADDING:
-		WARN_ON(1);
+		RB_WARN_ON(cpu_buffer, 1);
 		rb_advance_reader(cpu_buffer);
 		return NULL;
 
@@ -1622,14 +1880,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu 
 {
 	cpu_buffer->head_page
 		= list_entry(cpu_buffer->pages.next, struct buffer_page, list);
-	cpu_buffer->head_page->size = 0;
-	cpu_buffer->tail_page = cpu_buffer->head_page;
-	cpu_buffer->tail_page->size = 0;
-	INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
-	cpu_buffer->reader_page->size = 0;
+	local_set(&cpu_buffer->head_page->write, 0);
+	local_set(&cpu_buffer->head_page->commit, 0);
 
 	cpu_buffer->head_page->read = 0;
-	cpu_buffer->tail_page->write = 0;
+
+	cpu_buffer->tail_page = cpu_buffer->head_page;
+	cpu_buffer->commit_page = cpu_buffer->head_page;
+
+	INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
+	local_set(&cpu_buffer->reader_page->write, 0);
+	local_set(&cpu_buffer->reader_page->commit, 0);
 	cpu_buffer->reader_page->read = 0;
 
 	cpu_buffer->overrun = 0;

-- 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 3/3] ftrace: make some tracers reentrant
  2008-10-04  6:00 [PATCH 0/3] ring-buffer: less locking and only disable preemption Steven Rostedt
  2008-10-04  6:00 ` [PATCH 1/3] ring-buffer: move page indexes into page headers Steven Rostedt
  2008-10-04  6:00 ` [PATCH 2/3] ring-buffer: make reentrant Steven Rostedt
@ 2008-10-04  6:01 ` Steven Rostedt
  2008-10-04  8:40 ` [PATCH 0/3] ring-buffer: less locking and only disable preemption Ingo Molnar
  3 siblings, 0 replies; 16+ messages in thread
From: Steven Rostedt @ 2008-10-04  6:01 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Thomas Gleixner, Peter Zijlstra, Andrew Morton,
	Linus Torvalds, Mathieu Desnoyers, Steven Rostedt

[-- Attachment #1: ftrace-disable-less.patch --]
[-- Type: text/plain, Size: 3588 bytes --]

Now that the ring buffer is reentrant, some of the ftrace tracers
(sched_switch, debugging traces) can also be reentrant.

Note: Never make the function tracer reentrant, that can cause
  recursion problems all over the kernel. The function tracer
  must disable reentrancy.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c              |   10 ++--------
 kernel/trace/trace_sched_switch.c |   10 ++--------
 2 files changed, 4 insertions(+), 16 deletions(-)

Index: linux-tip.git/kernel/trace/trace.c
===================================================================
--- linux-tip.git.orig/kernel/trace/trace.c	2008-10-04 01:14:30.000000000 -0400
+++ linux-tip.git/kernel/trace/trace.c	2008-10-04 01:49:35.000000000 -0400
@@ -839,7 +839,6 @@ ftrace_special(unsigned long arg1, unsig
 {
 	struct trace_array *tr = &global_trace;
 	struct trace_array_cpu *data;
-	long disabled;
 	int cpu;
 	int pc;
 
@@ -850,12 +849,10 @@ ftrace_special(unsigned long arg1, unsig
 	preempt_disable_notrace();
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
-	disabled = atomic_inc_return(&data->disabled);
 
-	if (likely(disabled == 1))
+	if (likely(!atomic_read(&data->disabled)))
 		ftrace_trace_special(tr, data, arg1, arg2, arg3, pc);
 
-	atomic_dec(&data->disabled);
 	preempt_enable_notrace();
 }
 
@@ -2961,7 +2958,6 @@ int trace_vprintk(unsigned long ip, cons
 	struct trace_array_cpu *data;
 	struct print_entry *entry;
 	unsigned long flags, irq_flags;
-	long disabled;
 	int cpu, len = 0, size, pc;
 
 	if (!tr->ctrl || tracing_disabled)
@@ -2971,9 +2967,8 @@ int trace_vprintk(unsigned long ip, cons
 	preempt_disable_notrace();
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
-	disabled = atomic_inc_return(&data->disabled);
 
-	if (unlikely(disabled != 1))
+	if (unlikely(atomic_read(&data->disabled)))
 		goto out;
 
 	spin_lock_irqsave(&trace_buf_lock, flags);
@@ -2999,7 +2994,6 @@ int trace_vprintk(unsigned long ip, cons
 	spin_unlock_irqrestore(&trace_buf_lock, flags);
 
  out:
-	atomic_dec(&data->disabled);
 	preempt_enable_notrace();
 
 	return len;
Index: linux-tip.git/kernel/trace/trace_sched_switch.c
===================================================================
--- linux-tip.git.orig/kernel/trace/trace_sched_switch.c	2008-10-04 01:14:30.000000000 -0400
+++ linux-tip.git/kernel/trace/trace_sched_switch.c	2008-10-04 01:49:35.000000000 -0400
@@ -24,7 +24,6 @@ probe_sched_switch(struct rq *__rq, stru
 {
 	struct trace_array_cpu *data;
 	unsigned long flags;
-	long disabled;
 	int cpu;
 	int pc;
 
@@ -41,12 +40,10 @@ probe_sched_switch(struct rq *__rq, stru
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	data = ctx_trace->data[cpu];
-	disabled = atomic_inc_return(&data->disabled);
 
-	if (likely(disabled == 1))
+	if (likely(!atomic_read(&data->disabled)))
 		tracing_sched_switch_trace(ctx_trace, data, prev, next, flags, pc);
 
-	atomic_dec(&data->disabled);
 	local_irq_restore(flags);
 }
 
@@ -55,7 +52,6 @@ probe_sched_wakeup(struct rq *__rq, stru
 {
 	struct trace_array_cpu *data;
 	unsigned long flags;
-	long disabled;
 	int cpu, pc;
 
 	if (!likely(tracer_enabled))
@@ -67,13 +63,11 @@ probe_sched_wakeup(struct rq *__rq, stru
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	data = ctx_trace->data[cpu];
-	disabled = atomic_inc_return(&data->disabled);
 
-	if (likely(disabled == 1))
+	if (likely(!atomic_read(&data->disabled)))
 		tracing_sched_wakeup_trace(ctx_trace, data, wakee, current,
 					   flags, pc);
 
-	atomic_dec(&data->disabled);
 	local_irq_restore(flags);
 }
 

-- 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/3] ring-buffer: less locking and only disable preemption
  2008-10-04  6:00 [PATCH 0/3] ring-buffer: less locking and only disable preemption Steven Rostedt
                   ` (2 preceding siblings ...)
  2008-10-04  6:01 ` [PATCH 3/3] ftrace: make some tracers reentrant Steven Rostedt
@ 2008-10-04  8:40 ` Ingo Molnar
  2008-10-04 14:34   ` Steven Rostedt
  3 siblings, 1 reply; 16+ messages in thread
From: Ingo Molnar @ 2008-10-04  8:40 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: linux-kernel, Thomas Gleixner, Peter Zijlstra, Andrew Morton,
	Linus Torvalds, Mathieu Desnoyers


* Steven Rostedt <rostedt@goodmis.org> wrote:

> Ingo,
> 
> These patches need to be put through the ringer. Could you add them to 
> your ring-buffer branch, so we can test them out before putting them 
> into your master branch.

hey, in fact your latest iteration already tested out so well on a wide 
range of boxes that i've merged it all into tip/tracing/core already.

I'll reuse tip/tracing/ring-buffer for these latest 3 patches (merge it 
up to tip/tracing/core and add these three patches) but it's a delta, 
i.e. the whole ring-buffer approach is ready for prime time i think.

Hm, do we do deallocation of the buffers already when we switch tracers?

> The following patches bring the ring buffer closer to a lockless 
> solution. They move the locking only to the actual moving the 
> tail/write pointer from one page to the next. Interrupts are now 
> enabled during most of the writes.

very nice direction!

> A lot of the locking protection is still within the ftrace 
> infrastructure. The last patch takes some of that away.
> 
> The function tracer cannot be reentrant just due to the nature that it 
> traces everything, and can cause recursion issues.

Correct, and that's by far the yuckiest aspect of it. And there's 
another aspect: NMIs. We've still got the tip/tracing/nmisafe angle with 
these commits:

 d979781: ftrace: mark lapic_wd_event() notrace
 c2c27ba: ftrace: ignore functions that cannot be kprobe-ed
 431e946: ftrace: do not trace NMI contexts
 1eda930: x86, tracing, nmisafe: fix threadinfo_ -> TI_ rename fallout
 84c2ca9: sched: sched_clock() improvement: use in_nmi()
 0d84b78: x86 NMI-safe INT3 and Page Fault
 a04464b: x86_64 page fault NMI-safe
 b335389: Change avr32 active count bit
 a581cbd: Change Alpha active count bit
 eca0999: Stringify support commas

but i'm not yet fully convinced about the NMI angle, the practical cross 
section to random lowlevel x86 code is wider than any sched_clock() 
impact for example. We might be best off avoiding it: force-disable the 
NMI watchdog when we trace?

	Ingo

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/3] ring-buffer: less locking and only disable preemption
  2008-10-04  8:40 ` [PATCH 0/3] ring-buffer: less locking and only disable preemption Ingo Molnar
@ 2008-10-04 14:34   ` Steven Rostedt
  2008-10-04 14:44     ` Ingo Molnar
  2008-10-04 16:33     ` Mathieu Desnoyers
  0 siblings, 2 replies; 16+ messages in thread
From: Steven Rostedt @ 2008-10-04 14:34 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: LKML, Thomas Gleixner, Peter Zijlstra, Andrew Morton,
	Linus Torvalds, Mathieu Desnoyers, Arjan van de Ven


[ Added Arjan to CC regarding the last statements ]

On Sat, 4 Oct 2008, Ingo Molnar wrote:
> 
> * Steven Rostedt <rostedt@goodmis.org> wrote:
> > These patches need to be put through the ringer. Could you add them to 
> > your ring-buffer branch, so we can test them out before putting them 
> > into your master branch.
> 
> hey, in fact your latest iteration already tested out so well on a wide 
> range of boxes that I've merged it all into tip/tracing/core already.

Great to hear!

> 
> I'll reuse tip/tracing/ring-buffer for these latest 3 patches (merge it 
> up to tip/tracing/core and add these three patches) but it's a delta, 
> i.e. the whole ring-buffer approach is ready for prime time i think.
> 
> Hm, do we do deallocation of the buffers already when we switch tracers?

Not yet, but that is one of the trivial changes. I spent too much time
on these more complex changes to get around to it.

> 
> > The following patches bring the ring buffer closer to a lockless 
> > solution. They move the locking only to the actual moving the 
> > tail/write pointer from one page to the next. Interrupts are now 
> > enabled during most of the writes.
> 
> very nice direction!

Thanks!

> 
> > A lot of the locking protection is still within the ftrace 
> > infrastructure. The last patch takes some of that away.
> > 
> > The function tracer cannot be reentrant just due to the nature that it 
> > traces everything, and can cause recursion issues.
> 
> Correct, and that's by far the yuckiest aspect of it. And there's 
> another aspect: NMIs. We've still got the tip/tracing/nmisafe angle with 
> these commits:
> 
>  d979781: ftrace: mark lapic_wd_event() notrace
>  c2c27ba: ftrace: ignore functions that cannot be kprobe-ed
>  431e946: ftrace: do not trace NMI contexts
>  1eda930: x86, tracing, nmisafe: fix threadinfo_ -> TI_ rename fallout
>  84c2ca9: sched: sched_clock() improvement: use in_nmi()
>  0d84b78: x86 NMI-safe INT3 and Page Fault
>  a04464b: x86_64 page fault NMI-safe
>  b335389: Change avr32 active count bit
>  a581cbd: Change Alpha active count bit
>  eca0999: Stringify support commas
> 
> but I'm not yet fully convinced about the NMI angle, the practical cross 
> section to random low level x86 code is wider than any sched_clock() 
> impact for example. We might be best off avoiding it: force-disable the 
> NMI watchdog when we trace?

Since we still have the locking in the ring buffer, it is still not NMI 
safe. But once we remove all locking, then the tracer is fine.

BUT!

The dynamic function tracer is another issue. The problem with NMIs has 
nothing to do with locking, or corrupting the buffers. It has to do with 
the dynamic code modification.  Whenever we modify code, we must guarantee 
that it will not be executed on another CPU.

Kstop_machine serves this purpose rather well. We can modify code without 
worrying it will be executed on another CPU, except for NMIs. The problem 
now comes where an NMI can come in and execute the code being modified. 
That's why I put in all the notrace lines. But it gets difficult because 
the nmi_notifier can call all over the kernel.  Perhaps, we can simply 
disable the nmi-notifier when we are doing the kstop_machine call?

-- Steve


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/3] ring-buffer: less locking and only disable preemption
  2008-10-04 14:34   ` Steven Rostedt
@ 2008-10-04 14:44     ` Ingo Molnar
  2008-10-04 17:41       ` Ingo Molnar
  2008-10-04 16:33     ` Mathieu Desnoyers
  1 sibling, 1 reply; 16+ messages in thread
From: Ingo Molnar @ 2008-10-04 14:44 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: LKML, Thomas Gleixner, Peter Zijlstra, Andrew Morton,
	Linus Torvalds, Mathieu Desnoyers, Arjan van de Ven


* Steven Rostedt <rostedt@goodmis.org> wrote:

> The dynamic function tracer is another issue. The problem with NMIs 
> has nothing to do with locking, or corrupting the buffers. It has to 
> do with the dynamic code modification.  Whenever we modify code, we 
> must guarantee that it will not be executed on another CPU.
> 
> Kstop_machine serves this purpose rather well. We can modify code 
> without worrying it will be executed on another CPU, except for NMIs. 
> The problem now comes where an NMI can come in and execute the code 
> being modified. That's why I put in all the notrace, lines. But it 
> gets difficult because of nmi_notifier can call all over the kernel.  
> Perhaps, we can simply disable the nmi-notifier when we are doing the 
> kstop_machine call?

that would definitely be one way to reduce the cross section, but not 
enough i'm afraid. For example in the nmi_watchdog=2 case we call into 
various lapic functions and paravirt lapic handlers which makes it all 
spread to 3-4 paravirtualization flavors ...

sched_clock()'s notrace aspects were pretty manageable, but this in its 
current form is not.

	Ingo

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/3] ring-buffer: less locking and only disable preemption
  2008-10-04 14:34   ` Steven Rostedt
  2008-10-04 14:44     ` Ingo Molnar
@ 2008-10-04 16:33     ` Mathieu Desnoyers
  2008-10-04 17:18       ` Steven Rostedt
  1 sibling, 1 reply; 16+ messages in thread
From: Mathieu Desnoyers @ 2008-10-04 16:33 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Ingo Molnar, LKML, Thomas Gleixner, Peter Zijlstra, Andrew Morton,
	Linus Torvalds, Arjan van de Ven

* Steven Rostedt (rostedt@goodmis.org) wrote:
> 
> [ Added Arjan to CC regarding the last statements ]
> 
> On Sat, 4 Oct 2008, Ingo Molnar wrote:
> > 
> > * Steven Rostedt <rostedt@goodmis.org> wrote:
> > > These patches need to be put through the ringer. Could you add them to 
> > > your ring-buffer branch, so we can test them out before putting them 
> > > into your master branch.
> > 
> > hey, in fact your latest iteration already tested out so well on a wide 
> > range of boxes that I've merged it all into tip/tracing/core already.
> 
> Great to hear!
> 
> > 
> > I'll reuse tip/tracing/ring-buffer for these latest 3 patches (merge it 
> > up to tip/tracing/core and add these three patches) but it's a delta, 
> > i.e. the whole ring-buffer approach is ready for prime time i think.
> > 
> > Hm, do we do deallocation of the buffers already when we switch tracers?
> 
> Not yet, but that is one of the trivial changes. I spent too much time
> on these more complex changes to get around to it.
> 
> > 
> > > The following patches bring the ring buffer closer to a lockless 
> > > solution. They move the locking only to the actual moving the 
> > > tail/write pointer from one page to the next. Interrupts are now 
> > > enabled during most of the writes.
> > 
> > very nice direction!
> 
> Thanks!
> 
> > 
> > > A lot of the locking protection is still within the ftrace 
> > > infrastructure. The last patch takes some of that away.
> > > 
> > > The function tracer cannot be reentrant just due to the nature that it 
> > > traces everything, and can cause recursion issues.
> > 
> > Correct, and that's by far the yuckiest aspect of it. And there's 
> > another aspect: NMIs. We've still got the tip/tracing/nmisafe angle with 
> > these commits:
> > 
> >  d979781: ftrace: mark lapic_wd_event() notrace
> >  c2c27ba: ftrace: ignore functions that cannot be kprobe-ed
> >  431e946: ftrace: do not trace NMI contexts
> >  1eda930: x86, tracing, nmisafe: fix threadinfo_ -> TI_ rename fallout
> >  84c2ca9: sched: sched_clock() improvement: use in_nmi()
> >  0d84b78: x86 NMI-safe INT3 and Page Fault
> >  a04464b: x86_64 page fault NMI-safe
> >  b335389: Change avr32 active count bit
> >  a581cbd: Change Alpha active count bit
> >  eca0999: Stringify support commas
> > 
> > but I'm not yet fully convinced about the NMI angle, the practical cross 
> > section to random low level x86 code is wider than any sched_clock() 
> > impact for example. We might be best off avoiding it: force-disable the 
> > NMI watchdog when we trace?
> 
> Since we still have the locking in the ring buffer, it is still not NMI 
> safe. But once we remove all locking, then the tracer is fine.
> 
> BUT!
> 
> The dynamic function tracer is another issue. The problem with NMIs has 
> nothing to do with locking, or corrupting the buffers. It has to do with 
> the dynamic code modification.  Whenever we modify code, we must guarantee 
> that it will not be executed on another CPU.
> 
> Kstop_machine serves this purpose rather well. We can modify code without 
> worrying it will be executed on another CPU, except for NMIs. The problem 
> now comes where an NMI can come in and execute the code being modified. 
> That's why I put in all the notrace lines. But it gets difficult because 
> the nmi_notifier can call all over the kernel.  Perhaps, we can simply 
> disable the nmi-notifier when we are doing the kstop_machine call?
> 

Or use this code, based on a temporary breakpoint, to do the code
patching (part of the -lttng tree). It does not require stop_machine at
all and is nmi safe.

Mathieu


Immediate Values - x86 Optimization NMI and MCE support

x86 optimization of the immediate values which uses a movl with code patching
to set/unset the value used to populate the register used as variable source.
It uses a breakpoint to bypass the instruction being changed, which lessens the
interrupt latency of the operation and protects against NMIs and MCE.

- More reentrant immediate value : uses a breakpoint. Needs to know the
  instruction's first byte. This is why we keep the "instruction size"
  variable, so we can support the REX prefixed instructions too.

Changelog:
- Change the immediate.c update code to support variable length opcodes.
- Use text_poke_early with cr0 WP save/restore to patch the bypass. We are doing
  non atomic writes to a code region only touched by us (nobody can execute it
  since we are protected by the imv_mutex).
- Add x86_64 support, ready for i386+x86_64 -> x86 merge.
- Use asm-x86/asm.h.
- Change the immediate.c update code to support variable length opcodes.
- Use imv_* instead of immediate_*.
- Use kernel_wp_disable/enable instead of save/restore.
- Fix 1 byte immediate value so it declares its instruction size.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
CC: Andi Kleen <ak@muc.de>
CC: "H. Peter Anvin" <hpa@zytor.com>
CC: Chuck Ebbert <cebbert@redhat.com>
CC: Christoph Hellwig <hch@infradead.org>
CC: Jeremy Fitzhardinge <jeremy@goop.org>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Ingo Molnar <mingo@redhat.com>
---
 arch/x86/kernel/Makefile    |    1 
 arch/x86/kernel/immediate.c |  291 ++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/traps_32.c  |    8 -
 include/asm-x86/immediate.h |   48 ++++++-
 4 files changed, 338 insertions(+), 10 deletions(-)

Index: linux-2.6-lttng/include/asm-x86/immediate.h
===================================================================
--- linux-2.6-lttng.orig/include/asm-x86/immediate.h	2008-08-06 01:32:23.000000000 -0400
+++ linux-2.6-lttng/include/asm-x86/immediate.h	2008-08-06 02:35:56.000000000 -0400
@@ -12,6 +12,18 @@
 
 #include <asm/asm.h>
 
+struct __imv {
+	unsigned long var;	/* Pointer to the identifier variable of the
+				 * immediate value
+				 */
+	unsigned long imv;	/*
+				 * Pointer to the memory location of the
+				 * immediate value within the instruction.
+				 */
+	unsigned char size;	/* Type size. */
+	unsigned char insn_size;/* Instruction size. */
+} __attribute__ ((packed));
+
 /**
  * imv_read - read immediate variable
  * @name: immediate value name
@@ -26,6 +38,11 @@
  * what will generate an instruction with 8 bytes immediate value (not the REX.W
  * prefixed one that loads a sign extended 32 bits immediate value in a r64
  * register).
+ *
+ * Create the instruction in a discarded section to calculate its size. This is
+ * how we can align the beginning of the instruction on an address that will
+ * permit atomic modification of the immediate value without knowing the size of
+ * the opcode used by the compiler. The operand size is known in advance.
  */
 #define imv_read(name)							\
 	({								\
@@ -33,9 +50,14 @@
 		BUILD_BUG_ON(sizeof(value) > 8);			\
 		switch (sizeof(value)) {				\
 		case 1:							\
-			asm(".section __imv,\"aw\",@progbits\n\t"	\
+			asm(".section __discard,\"\",@progbits\n\t"	\
+				"1:\n\t"				\
+				"mov $0,%0\n\t"				\
+				"2:\n\t"				\
+				".previous\n\t"				\
+				".section __imv,\"aw\",@progbits\n\t"	\
 				_ASM_PTR "%c1, (3f)-%c2\n\t"		\
-				".byte %c2\n\t"				\
+				".byte %c2, (2b-1b)\n\t"		\
 				".previous\n\t"				\
 				"mov $0,%0\n\t"				\
 				"3:\n\t"				\
@@ -45,10 +67,16 @@
 			break;						\
 		case 2:							\
 		case 4:							\
-			asm(".section __imv,\"aw\",@progbits\n\t"	\
+			asm(".section __discard,\"\",@progbits\n\t"	\
+				"1:\n\t"				\
+				"mov $0,%0\n\t"				\
+				"2:\n\t"				\
+				".previous\n\t"				\
+				".section __imv,\"aw\",@progbits\n\t"	\
 				_ASM_PTR "%c1, (3f)-%c2\n\t"		\
-				".byte %c2\n\t"				\
+				".byte %c2, (2b-1b)\n\t"		\
 				".previous\n\t"				\
+				".org . + ((-.-(2b-1b)) & (%c2-1)), 0x90\n\t" \
 				"mov $0,%0\n\t"				\
 				"3:\n\t"				\
 				: "=r" (value)				\
@@ -60,10 +88,16 @@
 				value = name##__imv;			\
 				break;					\
 			}						\
-			asm(".section __imv,\"aw\",@progbits\n\t"	\
+			asm(".section __discard,\"\",@progbits\n\t"	\
+				"1:\n\t"				\
+				"mov $0xFEFEFEFE01010101,%0\n\t"	\
+				"2:\n\t"				\
+				".previous\n\t"				\
+				".section __imv,\"aw\",@progbits\n\t"	\
 				_ASM_PTR "%c1, (3f)-%c2\n\t"		\
-				".byte %c2\n\t"				\
+				".byte %c2, (2b-1b)\n\t"		\
 				".previous\n\t"				\
+				".org . + ((-.-(2b-1b)) & (%c2-1)), 0x90\n\t" \
 				"mov $0xFEFEFEFE01010101,%0\n\t" 	\
 				"3:\n\t"				\
 				: "=r" (value)				\
@@ -74,4 +108,6 @@
 		value;							\
 	})
 
+extern int arch_imv_update(const struct __imv *imv, int early);
+
 #endif /* _ASM_X86_IMMEDIATE_H */
Index: linux-2.6-lttng/arch/x86/kernel/traps_32.c
===================================================================
--- linux-2.6-lttng.orig/arch/x86/kernel/traps_32.c	2008-08-06 00:44:22.000000000 -0400
+++ linux-2.6-lttng/arch/x86/kernel/traps_32.c	2008-08-06 02:35:57.000000000 -0400
@@ -576,7 +576,7 @@ void do_##name(struct pt_regs *regs, lon
 }
 
 DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
-#ifndef CONFIG_KPROBES
+#if !defined(CONFIG_KPROBES) && !defined(CONFIG_IMMEDIATE)
 DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
 #endif
 DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
@@ -850,7 +850,7 @@ void restart_nmi(void)
 	acpi_nmi_enable();
 }
 
-#ifdef CONFIG_KPROBES
+#if defined(CONFIG_KPROBES) || defined(CONFIG_IMMEDIATE)
 void __kprobes do_int3(struct pt_regs *regs, long error_code)
 {
 	trace_hardirqs_fixup();
@@ -859,8 +859,8 @@ void __kprobes do_int3(struct pt_regs *r
 			== NOTIFY_STOP)
 		return;
 	/*
-	 * This is an interrupt gate, because kprobes wants interrupts
-	 * disabled. Normal trap handlers don't.
+	 * This is an interrupt gate, because kprobes and immediate values wants
+	 * interrupts disabled. Normal trap handlers don't.
 	 */
 	restore_interrupts(regs);
 
Index: linux-2.6-lttng/arch/x86/kernel/Makefile
===================================================================
--- linux-2.6-lttng.orig/arch/x86/kernel/Makefile	2008-08-06 00:41:34.000000000 -0400
+++ linux-2.6-lttng/arch/x86/kernel/Makefile	2008-08-06 02:35:57.000000000 -0400
@@ -73,6 +73,7 @@ obj-$(CONFIG_X86_SUMMIT_NUMA)	+= summit_
 obj-y				+= vsmp_64.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
 obj-$(CONFIG_MODULES)		+= module_$(BITS).o
+obj-$(CONFIG_IMMEDIATE)		+= immediate.o
 obj-$(CONFIG_EFI) 		+= efi.o efi_$(BITS).o efi_stub_$(BITS).o
 obj-$(CONFIG_DOUBLEFAULT) 	+= doublefault_32.o
 obj-$(CONFIG_KGDB)		+= kgdb.o
Index: linux-2.6-lttng/arch/x86/kernel/immediate.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6-lttng/arch/x86/kernel/immediate.c	2008-08-06 02:36:33.000000000 -0400
@@ -0,0 +1,291 @@
+/*
+ * Immediate Value - x86 architecture specific code.
+ *
+ * Rationale
+ *
+ * Required because of :
+ * - Erratum 49 fix for Intel PIII.
+ * - Still present on newer processors : Intel Core 2 Duo Processor for Intel
+ *   Centrino Duo Processor Technology Specification Update, AH33.
+ *   Unsynchronized Cross-Modifying Code Operations Can Cause Unexpected
+ *   Instruction Execution Results.
+ *
+ * Permits immediate value modification by XMC with correct serialization.
+ *
+ * Reentrant for NMI and trap handler instrumentation. Permits XMC to a
+ * location that has preemption enabled because it involves no temporary or
+ * reused data structure.
+ *
+ * Quoting Richard J Moore, source of the information motivating this
+ * implementation which differs from the one proposed by Intel which is not
+ * suitable for kernel context (does not support NMI and would require disabling
+ * interrupts on every CPU for a long period) :
+ *
+ * "There is another issue to consider when looking into using probes other
+ * than int3:
+ *
+ * Intel erratum 54 - Unsynchronized Cross-modifying code - refers to the
+ * practice of modifying code on one processor where another has prefetched
+ * the unmodified version of the code. Intel states that unpredictable general
+ * protection faults may result if a synchronizing instruction (iret, int,
+ * int3, cpuid, etc ) is not executed on the second processor before it
+ * executes the pre-fetched out-of-date copy of the instruction.
+ *
+ * When we became aware of this I had a long discussion with Intel's
+ * microarchitecture guys. It turns out that the reason for this erratum
+ * (which incidentally Intel does not intend to fix) is because the trace
+ * cache - the stream of micro-ops resulting from instruction interpretation -
+ * cannot be guaranteed to be valid. Reading between the lines I assume this
+ * issue arises because of optimization done in the trace cache, where it is
+ * no longer possible to identify the original instruction boundaries. If the
+ * CPU discovers that the trace cache has been invalidated because of
+ * unsynchronized cross-modification then instruction execution will be
+ * aborted with a GPF. Further discussion with Intel revealed that replacing
+ * the first opcode byte with an int3 would not be subject to this erratum.
+ *
+ * So, is cmpxchg reliable? One has to guarantee more than mere atomicity."
+ *
+ * Overall design
+ *
+ * The algorithm proposed by Intel applies not so well in kernel context: it
+ * would imply disabling interrupts and looping on every CPU while modifying
+ * the code and would not support instrumentation of code called from interrupt
+ * sources that cannot be disabled.
+ *
+ * Therefore, we use a different algorithm to respect Intel's erratum (see the
+ * quoted discussion above). We make sure that no CPU sees an out-of-date copy
+ * of a pre-fetched instruction by 1 - using a breakpoint, which skips the
+ * instruction that is going to be modified, 2 - issuing an IPI to every CPU to
+ * execute a sync_core(), to make sure that even when the breakpoint is removed,
+ * no cpu could possibly still have the out-of-date copy of the instruction,
+ * modify the now unused 2nd byte of the instruction, and then put back the
+ * original 1st byte of the instruction.
+ *
+ * It has exactly the same intent as the algorithm proposed by Intel, but
+ * it has less side-effects, scales better and supports NMI, SMI and MCE.
+ *
+ * Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ */
+
+#include <linux/preempt.h>
+#include <linux/smp.h>
+#include <linux/notifier.h>
+#include <linux/module.h>
+#include <linux/immediate.h>
+#include <linux/kdebug.h>
+#include <linux/rcupdate.h>
+#include <linux/kprobes.h>
+#include <linux/io.h>
+
+#include <asm/cacheflush.h>
+
+#define BREAKPOINT_INSTRUCTION  0xcc
+#define BREAKPOINT_INS_LEN	1
+#define NR_NOPS			10
+
+static unsigned long target_after_int3;	/* EIP of the target after the int3 */
+static unsigned long bypass_eip;	/* EIP of the bypass. */
+static unsigned long bypass_after_int3;	/* EIP after the end-of-bypass int3 */
+static unsigned long after_imv;	/*
+					 * EIP where to resume after the
+					 * single-stepping.
+					 */
+
+/*
+ * Internal bypass used during value update. The bypass is skipped by the
+ * function in which it is inserted.
+ * No need to be aligned because we exclude readers from the site during
+ * update.
+ * Layout is:
+ * (10x nop) int3
+ * (maximum size is 2 bytes opcode + 8 bytes immediate value for long on x86_64)
+ * The nops are the target replaced by the instruction to single-step.
+ * Align on 16 bytes to make sure the nops fit within a single page so remapping
+ * it can be done easily.
+ */
+static inline void _imv_bypass(unsigned long *bypassaddr,
+	unsigned long *breaknextaddr)
+{
+		asm volatile("jmp 2f;\n\t"
+				".align 16;\n\t"
+				"0:\n\t"
+				".space 10, 0x90;\n\t"
+				"1:\n\t"
+				"int3;\n\t"
+				"2:\n\t"
+				"mov $(0b),%0;\n\t"
+				"mov $((1b)+1),%1;\n\t"
+				: "=r" (*bypassaddr),
+				  "=r" (*breaknextaddr));
+}
+
+static void imv_synchronize_core(void *info)
+{
+	sync_core();	/* use cpuid to stop speculative execution */
+}
+
+/*
+ * The eip value points right after the breakpoint instruction, in the second
+ * byte of the movl.
+ * Disable preemption in the bypass to make sure no thread will be preempted in
+ * it. We can then use synchronize_sched() to make sure every bypass user has
+ * ended.
+ */
+static int imv_notifier(struct notifier_block *nb,
+	unsigned long val, void *data)
+{
+	enum die_val die_val = (enum die_val) val;
+	struct die_args *args = data;
+
+	if (!args->regs || user_mode_vm(args->regs))
+		return NOTIFY_DONE;
+
+	if (die_val == DIE_INT3) {
+		if (args->regs->ip == target_after_int3) {
+			preempt_disable();
+			args->regs->ip = bypass_eip;
+			return NOTIFY_STOP;
+		} else if (args->regs->ip == bypass_after_int3) {
+			args->regs->ip = after_imv;
+			preempt_enable();
+			return NOTIFY_STOP;
+		}
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block imv_notify = {
+	.notifier_call = imv_notifier,
+	.priority = 0x7fffffff,	/* we need to be notified first */
+};
+
+/**
+ * arch_imv_update - update one immediate value
+ * @imv: pointer of type const struct __imv to update
+ * @early: early boot (1) or normal (0)
+ *
+ * Update one immediate value. Must be called with imv_mutex held.
+ */
+__kprobes int arch_imv_update(const struct __imv *imv, int early)
+{
+	int ret;
+	unsigned char opcode_size = imv->insn_size - imv->size;
+	unsigned long insn = imv->imv - opcode_size;
+	unsigned long len;
+	char *vaddr;
+	struct page *pages[1];
+
+#ifdef CONFIG_KPROBES
+	/*
+	 * Fail if a kprobe has been set on this instruction.
+	 * (TODO: we could eventually do better and modify all the (possibly
+	 * nested) kprobes for this site if kprobes had an API for this.
+	 */
+	if (unlikely(!early
+			&& *(unsigned char *)insn == BREAKPOINT_INSTRUCTION)) {
+		printk(KERN_WARNING "Immediate value in conflict with kprobe. "
+				    "Variable at %p, "
+				    "instruction at %p, size %hu\n",
+				    (void *)imv->imv,
+				    (void *)imv->var, imv->size);
+		return -EBUSY;
+	}
+#endif
+
+	/*
+	 * If the variable and the instruction have the same value, there is
+	 * nothing to do.
+	 */
+	switch (imv->size) {
+	case 1:	if (*(uint8_t *)imv->imv
+				== *(uint8_t *)imv->var)
+			return 0;
+		break;
+	case 2:	if (*(uint16_t *)imv->imv
+				== *(uint16_t *)imv->var)
+			return 0;
+		break;
+	case 4:	if (*(uint32_t *)imv->imv
+				== *(uint32_t *)imv->var)
+			return 0;
+		break;
+#ifdef CONFIG_X86_64
+	case 8:	if (*(uint64_t *)imv->imv
+				== *(uint64_t *)imv->var)
+			return 0;
+		break;
+#endif
+	default:return -EINVAL;
+	}
+
+	if (!early) {
+		/* bypass is 10 bytes long for x86_64 long */
+		WARN_ON(imv->insn_size > 10);
+		_imv_bypass(&bypass_eip, &bypass_after_int3);
+
+		after_imv = imv->imv + imv->size;
+
+		/*
+		 * Using the _early variants because nobody is executing the
+		 * bypass code while we patch it. It is protected by the
+		 * imv_mutex. Since we modify the instructions non atomically
+		 * (for nops), we have to use the _early variant.
+		 * We must however deal with RO pages.
+		 * Use a single page : 10 bytes are aligned on 16 bytes
+		 * boundaries.
+		 */
+		pages[0] = virt_to_page((void *)bypass_eip);
+		vaddr = vmap(pages, 1, VM_MAP, PAGE_KERNEL);
+		BUG_ON(!vaddr);
+		text_poke_early(&vaddr[bypass_eip & ~PAGE_MASK],
+			(void *)insn, imv->insn_size);
+		/*
+		 * Fill the rest with nops.
+		 */
+		len = NR_NOPS - imv->insn_size;
+		add_nops((void *)
+			&vaddr[(bypass_eip & ~PAGE_MASK) + imv->insn_size],
+			len);
+		vunmap(vaddr);
+
+		target_after_int3 = insn + BREAKPOINT_INS_LEN;
+		/* register_die_notifier has memory barriers */
+		register_die_notifier(&imv_notify);
+		/* The breakpoint will single-step the bypass */
+		text_poke((void *)insn,
+			((unsigned char[]){BREAKPOINT_INSTRUCTION}), 1);
+		/*
+		 * Make sure the breakpoint is set before we continue (visible
+		 * to other CPUs and interrupts).
+		 */
+		wmb();
+		/*
+		 * Execute serializing instruction on each CPU.
+		 */
+		ret = on_each_cpu(imv_synchronize_core, NULL, 1);
+		BUG_ON(ret != 0);
+
+		text_poke((void *)(insn + opcode_size), (void *)imv->var,
+				imv->size);
+		/*
+		 * Make sure the value can be seen from other CPUs and
+		 * interrupts.
+		 */
+		wmb();
+		text_poke((void *)insn, (unsigned char *)bypass_eip, 1);
+		/*
+		 * Wait for all int3 handlers to end (interrupts are disabled in
+		 * int3). This CPU is clearly not in a int3 handler, because
+		 * int3 handler is not preemptible and there cannot be any more
+		 * int3 handler called for this site, because we placed the
+		 * original instruction back.  synchronize_sched has memory
+		 * barriers.
+		 */
+		synchronize_sched();
+		unregister_die_notifier(&imv_notify);
+		/* unregister_die_notifier has memory barriers */
+	} else
+		text_poke_early((void *)imv->imv, (void *)imv->var,
+			imv->size);
+	return 0;
+}


-- 
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/3] ring-buffer: less locking and only disable preemption
  2008-10-04 16:33     ` Mathieu Desnoyers
@ 2008-10-04 17:18       ` Steven Rostedt
  2008-10-06 17:13         ` Mathieu Desnoyers
  0 siblings, 1 reply; 16+ messages in thread
From: Steven Rostedt @ 2008-10-04 17:18 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Ingo Molnar, LKML, Thomas Gleixner, Peter Zijlstra, Andrew Morton,
	Linus Torvalds, Arjan van de Ven


On Sat, 4 Oct 2008, Mathieu Desnoyers wrote:
> 
> Or use this code, based on a temporary breakpoint, to do the code
> patching (part of the -lttng tree). It does not require stop_machine at
> all and is nmi safe.
> 

When this is supported for all archs, and can be done at all functions 
then I could use it.

I may just have the arch specific code use it, but we'll see.

Also, how good is it at patching 20,000 call sites?

-- Steve


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/3] ring-buffer: less locking and only disable preemption
  2008-10-04 14:44     ` Ingo Molnar
@ 2008-10-04 17:41       ` Ingo Molnar
  2008-10-04 22:27         ` Mathieu Desnoyers
  0 siblings, 1 reply; 16+ messages in thread
From: Ingo Molnar @ 2008-10-04 17:41 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: LKML, Thomas Gleixner, Peter Zijlstra, Andrew Morton,
	Linus Torvalds, Mathieu Desnoyers, Arjan van de Ven


* Ingo Molnar <mingo@elte.hu> wrote:

> * Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > The dynamic function tracer is another issue. The problem with NMIs 
> > has nothing to do with locking, or corrupting the buffers. It has to 
> > do with the dynamic code modification.  Whenever we modify code, we 
> > must guarantee that it will not be executed on another CPU.
> > 
> > Kstop_machine serves this purpose rather well. We can modify code 
> > without worrying it will be executed on another CPU, except for NMIs. 
> > The problem now comes where an NMI can come in and execute the code 
> > being modified. That's why I put in all the notrace, lines. But it 
> > gets difficult because of nmi_notifier can call all over the kernel.  
> > Perhaps, we can simply disable the nmi-notifier when we are doing the 
> > kstop_machine call?
> 
> that would definitely be one way to reduce the cross section, but not 
> enough i'm afraid. For example in the nmi_watchdog=2 case we call into 
> various lapic functions and paravirt lapic handlers which makes it all 
> spread to 3-4 paravirtualization flavors ...
> 
> sched_clock()'s notrace aspects were pretty manageable, but this in 
> its current form is not.

there's a relatively simple method that would solve all these 
impact-size problems.

We cannot stop NMIs (and MCEs, etc.), but we can make kernel code 
modifications atomic, by adding the following thin layer ontop of it:

   #define MAX_CODE_SIZE 10

   int redo_len;
   u8 *redo_vaddr;

   u8 redo_buffer[MAX_CODE_SIZE];

   atomic_t __read_mostly redo_pending;

and use it in do_nmi():

   if (unlikely(atomic_read(&redo_pending)))
	modify_code_redo();

i.e. when we modify code, we first fill in the redo_buffer[], redo_vaddr 
and redo_len[], then we set redo_pending flag. Then we modify the kernel 
code, and clear the redo_pending flag.

If an NMI (or MCE) handler intervenes, it will notice the pending 
'transaction' and will copy redo_buffer[] to the (redo_vaddr,len) 
location and will continue.

So as far as non-maskable contexts are concerned, kernel code patching 
becomes an atomic operation. do_nmi() has to be marked notrace but 
that's all and easy to maintain.

Hm?

	Ingo

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/3] ring-buffer: less locking and only disable preemption
  2008-10-04 17:41       ` Ingo Molnar
@ 2008-10-04 22:27         ` Mathieu Desnoyers
  2008-10-04 23:21           ` Steven Rostedt
  2008-10-05 10:13           ` Ingo Molnar
  0 siblings, 2 replies; 16+ messages in thread
From: Mathieu Desnoyers @ 2008-10-04 22:27 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, LKML, Thomas Gleixner, Peter Zijlstra,
	Andrew Morton, Linus Torvalds, Arjan van de Ven

* Ingo Molnar (mingo@elte.hu) wrote:
> 
> * Ingo Molnar <mingo@elte.hu> wrote:
> 
> > * Steven Rostedt <rostedt@goodmis.org> wrote:
> > 
> > > The dynamic function tracer is another issue. The problem with NMIs 
> > > has nothing to do with locking, or corrupting the buffers. It has to 
> > > do with the dynamic code modification.  Whenever we modify code, we 
> > > must guarantee that it will not be executed on another CPU.
> > > 
> > > Kstop_machine serves this purpose rather well. We can modify code 
> > > without worrying it will be executed on another CPU, except for NMIs. 
> > > The problem now comes where an NMI can come in and execute the code 
> > > being modified. That's why I put in all the notrace, lines. But it 
> > > gets difficult because of nmi_notifier can call all over the kernel.  
> > > Perhaps, we can simply disable the nmi-notifier when we are doing the 
> > > kstop_machine call?
> > 
> > that would definitely be one way to reduce the cross section, but not 
> > enough i'm afraid. For example in the nmi_watchdog=2 case we call into 
> > various lapic functions and paravirt lapic handlers which makes it all 
> > spread to 3-4 paravirtualization flavors ...
> > 
> > sched_clock()'s notrace aspects were pretty manageable, but this in 
> > its current form is not.
> 
> there's a relatively simple method that would solve all these 
> impact-size problems.
> 
> We cannot stop NMIs (and MCEs, etc.), but we can make kernel code 
> modifications atomic, by adding the following thin layer ontop of it:
> 
>    #define MAX_CODE_SIZE 10
> 
>    int redo_len;
>    u8 *redo_vaddr;
> 
>    u8 redo_buffer[MAX_CODE_SIZE];
> 
>    atomic_t __read_mostly redo_pending;
> 
> and use it in do_nmi():
> 
>    if (unlikely(atomic_read(&redo_pending)))
> 	modify_code_redo();
> 
> i.e. when we modify code, we first fill in the redo_buffer[], redo_vaddr 
> and redo_len[], then we set redo_pending flag. Then we modify the kernel 
> code, and clear the redo_pending flag.
> 
> If an NMI (or MCE) handler intervenes, it will notice the pending 
> 'transaction' and will copy redo_buffer[] to the (redo_vaddr,len) 
> location and will continue.
> 
> So as far as non-maskable contexts are concerned, kernel code patching 
> becomes an atomic operation. do_nmi() has to be marked notrace but 
> that's all and easy to maintain.
> 
> Hm?
> 

The comment at the beginning of 
http://git.kernel.org/?p=linux/kernel/git/compudj/linux-2.6-lttng.git;a=blob;f=arch/x86/kernel/immediate.c;h=87a25db0efbd8f73d3d575e48541f2a179915da5;hb=b6148ea934f42e730571f41aa5a1a081a93995b5

explains that code modification on x86 SMP systems is not only a matter
of atomicity, but also a matter of not changing the code underneath a
running CPU which is making assumptions that it won't change underneath
without issuing a synchronizing instruction before the new code is used
by the CPU. The scheme you propose here takes care of atomicity, but
does not take care of the synchronization problem. A sync_core() would
probably be required when such modification is detected.

Also, speaking of plain atomicity, your scheme does not seem to protect
against NMIs running on a different CPU, because the non-atomic change
could race with such NMI.

Mathieu

> 	Ingo
> 

-- 
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/3] ring-buffer: less locking and only disable preemption
  2008-10-04 22:27         ` Mathieu Desnoyers
@ 2008-10-04 23:21           ` Steven Rostedt
  2008-10-06 17:10             ` Mathieu Desnoyers
  2008-10-05 10:13           ` Ingo Molnar
  1 sibling, 1 reply; 16+ messages in thread
From: Steven Rostedt @ 2008-10-04 23:21 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Ingo Molnar, LKML, Thomas Gleixner, Peter Zijlstra, Andrew Morton,
	Linus Torvalds, Arjan van de Ven


On Sat, 4 Oct 2008, Mathieu Desnoyers wrote:
> > 
> > there's a relatively simple method that would solve all these 
> > impact-size problems.
> > 
> > We cannot stop NMIs (and MCEs, etc.), but we can make kernel code 
> > modifications atomic, by adding the following thin layer ontop of it:
> > 
> >    #define MAX_CODE_SIZE 10
> > 
> >    int redo_len;
> >    u8 *redo_vaddr;
> > 
> >    u8 redo_buffer[MAX_CODE_SIZE];
> > 
> >    atomic_t __read_mostly redo_pending;
> > 
> > and use it in do_nmi():
> > 
> >    if (unlikely(atomic_read(&redo_pending)))
> > 	modify_code_redo();
> > 
> > i.e. when we modify code, we first fill in the redo_buffer[], redo_vaddr 
> > and redo_len[], then we set redo_pending flag. Then we modify the kernel 
> > code, and clear the redo_pending flag.
> > 
> > If an NMI (or MCE) handler intervenes, it will notice the pending 
> > 'transaction' and will copy redo_buffer[] to the (redo_vaddr,len) 
> > location and will continue.
> > 
> > So as far as non-maskable contexts are concerned, kernel code patching 
> > becomes an atomic operation. do_nmi() has to be marked notrace but 
> > that's all and easy to maintain.
> > 
> > Hm?
> > 
> 
> The comment at the beginning of 
> http://git.kernel.org/?p=linux/kernel/git/compudj/linux-2.6-lttng.git;a=blob;f=arch/x86/kernel/immediate.c;h=87a25db0efbd8f73d3d575e48541f2a179915da5;hb=b6148ea934f42e730571f41aa5a1a081a93995b5

Mathieu, please stop pointing to git tree comments (especially those that
are not in mainline).  If you have an actual technical PDF link, that 
would be much better.

> 
> explains that code modification on x86 SMP systems is not only a matter
> of atomicity, but also a matter of not changing the code underneath a
> running CPU which is making assumptions that it won't change underneath
> without issuing a synchronizing instruction before the new code is used
> by the CPU. The scheme you propose here takes care of atomicity, but
> does not take care of the synchronization problem. A sync_core() would
> probably be required when such modification is detected.
> 
> Also, speaking of plain atomicity, you scheme does not seem to protect
> against NMIs running on a different CPU, because the non-atomic change
> could race with such NMI.

Ingo,

Mathieu is correct in this regard. We do not need to protect ourselves 
from NMIs on the CPU that we execute the code on. We need to protect 
ourselves from NMIs running on other CPUS.

-- Steve


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/3] ring-buffer: less locking and only disable preemption
  2008-10-04 22:27         ` Mathieu Desnoyers
  2008-10-04 23:21           ` Steven Rostedt
@ 2008-10-05 10:13           ` Ingo Molnar
  2008-10-06 13:53             ` Mathieu Desnoyers
  1 sibling, 1 reply; 16+ messages in thread
From: Ingo Molnar @ 2008-10-05 10:13 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Steven Rostedt, LKML, Thomas Gleixner, Peter Zijlstra,
	Andrew Morton, Linus Torvalds, Arjan van de Ven


* Mathieu Desnoyers <compudj@krystal.dyndns.org> wrote:

> explains that code modification on x86 SMP systems is not only a 
> matter of atomicity, but also a matter of not changing the code 
> underneath a running CPU which is making assumptions that it won't 
> change underneath without issuing a synchronizing instruction before 
> the new code is used by the CPU. The scheme you propose here takes 
> care of atomicity, but does not take care of the synchronization 
> problem. A sync_core() would probably be required when such 
> modification is detected.

that's wrong, my scheme protects against these cases: before _any_ code 
is modified we set the redo_pending atomic flag, and make sure that 
previous NMI handlers have stopped executing. (easy enough)

then the atomic update of redo_pending should be a sufficient barrier 
for another CPU to notice the pending transaction.

Note that the cross-CPU modification can still be 'half done' when the 
NMI hits, that's why we execute modify_code_redo() to 'redo' the full 
modification before executing further NMI code. That is executed _on the 
CPU_ that triggers an NMI, and the CPU itself is self-consistent.

( The modify_code_redo() will have to do a sync_cores() of course, like 
  all self-modifying code, to flush speculative execution. )

> Also, speaking of plain atomicity, you scheme does not seem to protect 
> against NMIs running on a different CPU, because the non-atomic change 
> could race with such NMI.

That's wrong too. Another CPU will notice that redo_pending is set and 
will execute modify_code_redo() from its NMI handler _before_ calling 
all the notifiers and other 'wide' code paths.

the only item that needs to be marked 'notrace' is only the highlevel 
do_nmi() handler itself. (as that executes before we have a chance to 
execute modify_code_redo())

So we trade a large, fragile, and unmapped set of NMI-implicated 
codepaths for a very tight and well controlled and easy to maintain 
codepath.

	Ingo

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/3] ring-buffer: less locking and only disable preemption
  2008-10-05 10:13           ` Ingo Molnar
@ 2008-10-06 13:53             ` Mathieu Desnoyers
  0 siblings, 0 replies; 16+ messages in thread
From: Mathieu Desnoyers @ 2008-10-06 13:53 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, LKML, Thomas Gleixner, Peter Zijlstra,
	Andrew Morton, Linus Torvalds, Arjan van de Ven

* Ingo Molnar (mingo@elte.hu) wrote:
> 
> * Mathieu Desnoyers <compudj@krystal.dyndns.org> wrote:
> 
> > explains that code modification on x86 SMP systems is not only a 
> > matter of atomicity, but also a matter of not changing the code 
> > underneath a running CPU which is making assumptions that it won't 
> > change underneath without issuing a synchronizing instruction before 
> > the new code is used by the CPU. The scheme you propose here takes 
> > care of atomicity, but does not take care of the synchronization 
> > problem. A sync_core() would probably be required when such 
> > modification is detected.
> 
> that's wrong, my scheme protects against these cases: before _any_ code 
> is modified we set the redo_pending atomic flag, and make sure that 
> previous NMI handlers have stopped executing. (easy enough)
> 

Hi Ingo,

Hrm, how will this take care of the following race ?

CPU A                                CPU B
- NMI fires
- NMI handler checks for
    redo_pending flag, == 0
- NMI handler runs code              - set redo_pending
    about to be modified
                                     - NMI fires
                                     - NMI handler checks redo_pending,
                                       == 1, executes modify_code_redo()
           -- race : NMI on A executes code modified by B --
- NMI handler finished running
    code about to be modified

Mathieu

> then the atomic update of redo_pending should be a sufficient barrier 
> for another CPU to notice the pending transaction.
> 
> Note that the cross-CPU modification can still be 'half done' when the 
> NMI hits, that's why we execute modify_code_redo() to 'redo' the full 
> modification before executing further NMI code. That is executed _on the 
> CPU_ that triggers an NMI, and the CPU itself is self-consistent.
> 
> ( The modify_code_redo() will have to do a sync_cores() of course, like 
>   all self-modifying code, to flush speculative execution. )
> 
> > Also, speaking of plain atomicity, you scheme does not seem to protect 
> > against NMIs running on a different CPU, because the non-atomic change 
> > could race with such NMI.
> 
> That's wrong too. Another CPU will notice that redo_pending is set and 
> will execute modify_code_redo() from its NMI handler _before_ calling 
> all the notifiers and other 'wide' code paths.
> 
> the only item that needs to be marked 'notrace' is only the highlevel 
> do_nmi() handler itself. (as that executes before we have a chance to 
> execute modify_code_redo())
> 
> So we trade a large, fragile, and unmapped set of NMI-implicated 
> codepaths for a very tight and well controlled an easy to maintain 
> codepath.
> 
> 	Ingo
> 

-- 
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/3] ring-buffer: less locking and only disable preemption
  2008-10-04 23:21           ` Steven Rostedt
@ 2008-10-06 17:10             ` Mathieu Desnoyers
  0 siblings, 0 replies; 16+ messages in thread
From: Mathieu Desnoyers @ 2008-10-06 17:10 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Ingo Molnar, LKML, Thomas Gleixner, Peter Zijlstra, Andrew Morton,
	Linus Torvalds, Arjan van de Ven

* Steven Rostedt (rostedt@goodmis.org) wrote:
> 
> On Sat, 4 Oct 2008, Mathieu Desnoyers wrote:
> > > 
> > > there's a relatively simple method that would solve all these 
> > > impact-size problems.
> > > 
> > > We cannot stop NMIs (and MCEs, etc.), but we can make kernel code 
> > > modifications atomic, by adding the following thin layer ontop of it:
> > > 
> > >    #define MAX_CODE_SIZE 10
> > > 
> > >    int redo_len;
> > >    u8 *redo_vaddr;
> > > 
> > >    u8 redo_buffer[MAX_CODE_SIZE];
> > > 
> > >    atomic_t __read_mostly redo_pending;
> > > 
> > > and use it in do_nmi():
> > > 
> > >    if (unlikely(atomic_read(&redo_pending)))
> > > 	modify_code_redo();
> > > 
> > > i.e. when we modify code, we first fill in the redo_buffer[], redo_vaddr 
> > > and redo_len[], then we set redo_pending flag. Then we modify the kernel 
> > > code, and clear the redo_pending flag.
> > > 
> > > If an NMI (or MCE) handler intervenes, it will notice the pending 
> > > 'transaction' and will copy redo_buffer[] to the (redo_vaddr,len) 
> > > location and will continue.
> > > 
> > > So as far as non-maskable contexts are concerned, kernel code patching 
> > > becomes an atomic operation. do_nmi() has to be marked notrace but 
> > > that's all and easy to maintain.
> > > 
> > > Hm?
> > > 
> > 
> > The comment at the beginning of 
> > http://git.kernel.org/?p=linux/kernel/git/compudj/linux-2.6-lttng.git;a=blob;f=arch/x86/kernel/immediate.c;h=87a25db0efbd8f73d3d575e48541f2a179915da5;hb=b6148ea934f42e730571f41aa5a1a081a93995b5
> 
> Mathieu, please stop pointing to git tree comments (especially those that
> are not in mainline).  If you have an actual technical PDF link, that 
> would be much better.
> 

Hi Steven,

The top 10 lines of the comment the URL points to :

Intel Core 2 Duo Processor for Intel Centrino Duo Processor Technology
Specification Update, AH33

(direct link :
ftp://download.intel.com/design/mobile/SPECUPDT/31407918.pdf)

AH33 -> Page 48

<Quote>
Problem :

The act of one processor, or system bus master, writing data into a
currently executing code segment of a second processor with the intent
of having the second processor execute that data as code is called
cross-modifying code (XMC). XMC that does not force the second processor
to execute a synchronizing instruction, prior to execution of the new
code, is called unsynchronized XMC. Software using unsynchronized XMC to
modify the instruction byte stream of a processor can see unexpected or
unpredictable execution behavior from the processor that is executing
the modified code.
</Quote>

What my patch does is exactly this : it forces the second CPU to issue a
synchronizing instruction (either iret from the breakpoint or cpuid)
before the new instruction is reachable by any CPU. It therefore turns
what would otherwise be an unsynchronized XMC into a synchronized XMC.

And yes patching 20000 sites can be made incredibly fast for the
5-bytes call/nop code-patching case because all the breakpoint handlers
have to do is to increment the return IP of 4 bytes (1 byte for
breakpoint, 4 bytes must be skipped). However, we would have to keep a
hash table of the modified instruction pointers around so the breakpoint
handler can know why the breakpoint happened. After the moment the
breakpoint is removed, given interrupts are disabled in the int3 gate,
this hash table has to be kept around until all the currently running
IRQ handlers have finished their execution.

Mathieu

> > 
> > explains that code modification on x86 SMP systems is not only a matter
> > of atomicity, but also a matter of not changing the code underneath a
> > running CPU which is making assumptions that it won't change underneath
> > without issuing a synchronizing instruction before the new code is used
> > by the CPU. The scheme you propose here takes care of atomicity, but
> > does not take care of the synchronization problem. A sync_core() would
> > probably be required when such modification is detected.
> > 
> > Also, speaking of plain atomicity, you scheme does not seem to protect
> > against NMIs running on a different CPU, because the non-atomic change
> > could race with such NMI.
> 
> Ingo,
> 
> Mathieu is correct in this regard. We do not neet to protect ourselves 
> from NMIs on the CPU that we execute the code on. We need to protect 
> ourselves from NMIs running on other CPUS.
> 
> -- Steve
> 

-- 
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/3] ring-buffer: less locking and only disable preemption
  2008-10-04 17:18       ` Steven Rostedt
@ 2008-10-06 17:13         ` Mathieu Desnoyers
  0 siblings, 0 replies; 16+ messages in thread
From: Mathieu Desnoyers @ 2008-10-06 17:13 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Ingo Molnar, LKML, Thomas Gleixner, Peter Zijlstra, Andrew Morton,
	Linus Torvalds, Arjan van de Ven

* Steven Rostedt (rostedt@goodmis.org) wrote:
> 
> On Sat, 4 Oct 2008, Mathieu Desnoyers wrote:
> > 
> > Or use this code, based on a temporary breakpoint, to do the code
> > patching (part of the -lttng tree). It does not require stop_machine at
> > all and is nmi safe.
> > 
> 
> When this is supported for all archs, and can be done at all functions 
> then I could use it.
> 

How about incrementally using this piece of infrastructure when
available on a given architecture? This way we keep a sub-optimal
fall-back for archs which do not support NMI-safe code patching and
incrementally get the optimal behavior. Otherwise, we would require any
new architecture to implement that up-front, which I doubt is a good
idea.

> I may just have the arch specific code use it, but we'll see.
> 
> Also, how good is it at patching 20,000 call sites?
> 

Can be done really fast using a hash table, see my previous mail.

Mathieu

> -- Steve
> 

-- 
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2008-10-06 17:14 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-10-04  6:00 [PATCH 0/3] ring-buffer: less locking and only disable preemption Steven Rostedt
2008-10-04  6:00 ` [PATCH 1/3] ring-buffer: move page indexes into page headers Steven Rostedt
2008-10-04  6:00 ` [PATCH 2/3] ring-buffer: make reentrant Steven Rostedt
2008-10-04  6:01 ` [PATCH 3/3] ftrace: make some tracers reentrant Steven Rostedt
2008-10-04  8:40 ` [PATCH 0/3] ring-buffer: less locking and only disable preemption Ingo Molnar
2008-10-04 14:34   ` Steven Rostedt
2008-10-04 14:44     ` Ingo Molnar
2008-10-04 17:41       ` Ingo Molnar
2008-10-04 22:27         ` Mathieu Desnoyers
2008-10-04 23:21           ` Steven Rostedt
2008-10-06 17:10             ` Mathieu Desnoyers
2008-10-05 10:13           ` Ingo Molnar
2008-10-06 13:53             ` Mathieu Desnoyers
2008-10-04 16:33     ` Mathieu Desnoyers
2008-10-04 17:18       ` Steven Rostedt
2008-10-06 17:13         ` Mathieu Desnoyers

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox