From: Harsh Bora <harsh@linux.vnet.ibm.com>
To: Stefan Hajnoczi <stefanha@gmail.com>, stefanha@linux.vnet.ibm.com
Cc: vilanova@ac.upc.edu, qemu-devel@nongnu.org,
aneesh.kumar@linux.vnet.ibm.com
Subject: Re: [Qemu-devel] [RFC PATCH v3 2/3] simpletrace-v2: Handle variable number/size of elements per trace record.
Date: Wed, 18 Jan 2012 14:44:40 +0530 [thread overview]
Message-ID: <4F168D80.8000707@linux.vnet.ibm.com> (raw)
In-Reply-To: <CAJSP0QWVSENstH8HtBhcAf43Wr2PuTrpnf7P7ZWodE8t12uUFA@mail.gmail.com>
On 01/10/2012 10:11 PM, Stefan Hajnoczi wrote:
> On Tue, Jan 10, 2012 at 10:59 AM, Harsh Prateek Bora
> <harsh@linux.vnet.ibm.com> wrote:
>> Advantages over existing simpletrace backend:
>> - More than 6 elements (vitually unlimited) arguments can be traced.
>> - This allows to trace strings (variable size element) as well.
>>
>> Signed-off-by: Harsh Prateek Bora<harsh@linux.vnet.ibm.com>
>> ---
>> monitor.c | 2 +-
>> trace/simple.c | 178 ++++++++++++++++++++------------------------------------
>> trace/simple.h | 31 ++++++++--
>> 3 files changed, 88 insertions(+), 123 deletions(-)
>>
>> diff --git a/monitor.c b/monitor.c
>> index ffda0fe..b6f85d1 100644
>> --- a/monitor.c
>> +++ b/monitor.c
>> @@ -945,7 +945,7 @@ static void do_info_cpu_stats(Monitor *mon)
>> #if defined(CONFIG_TRACE_SIMPLE)
>> static void do_info_trace(Monitor *mon)
>> {
>> - st_print_trace((FILE *)mon,&monitor_fprintf);
>> + /* FIXME: st_print_trace((FILE *)mon,&monitor_fprintf); */
>
> This command is only available from the human monitor. It's not very
> useful because it historically hasn't been able to pretty-print events
> or show them in the right order (we use a ringbuffer but it prints
> them out from index 0).
>
> Therefore, I don't think we're under any obligation to keep this
> command around. No one has complained about it's limitations - I
> think this is a sign that no one has used it. I'd be okay with a
> patch that removes it.
>
>> }
>> #endif
>>
>> diff --git a/trace/simple.c b/trace/simple.c
>> index b639dda..65cafe2 100644
>> --- a/trace/simple.c
>> +++ b/trace/simple.c
>> @@ -27,7 +27,7 @@
>> #define HEADER_MAGIC 0xf2b177cb0aa429b4ULL
>>
>> /** Trace file version number, bump if format changes */
>> -#define HEADER_VERSION 0
>> +#define HEADER_VERSION 2
>>
>> /** Records were dropped event ID */
>> #define DROPPED_EVENT_ID (~(uint64_t)0 - 1)
>> @@ -35,23 +35,6 @@
>> /** Trace record is valid */
>> #define TRACE_RECORD_VALID ((uint64_t)1<< 63)
>>
>> -/** Trace buffer entry */
>> -typedef struct {
>> - uint64_t event;
>> - uint64_t timestamp_ns;
>> - uint64_t x1;
>> - uint64_t x2;
>> - uint64_t x3;
>> - uint64_t x4;
>> - uint64_t x5;
>> - uint64_t x6;
>> -} TraceRecord;
>> -
>> -enum {
>> - TRACE_BUF_LEN = 4096,
>> - TRACE_BUF_FLUSH_THRESHOLD = TRACE_BUF_LEN / 4,
>> -};
>> -
>> /*
>> * Trace records are written out by a dedicated thread. The thread waits for
>> * records to become available, writes them out, and then waits again.
>> @@ -62,11 +45,12 @@ static GCond *trace_empty_cond;
>> static bool trace_available;
>> static bool trace_writeout_enabled;
>>
>> -static TraceRecord trace_buf[TRACE_BUF_LEN];
>> +uint8_t trace_buf[TRACE_BUF_LEN];
>> static unsigned int trace_idx;
>> static FILE *trace_fp;
>> static char *trace_file_name = NULL;
>>
>> +
>
> Spurious newline.
>
>> /**
>> * Read a trace record from the trace buffer
>> *
>> @@ -75,16 +59,19 @@ static char *trace_file_name = NULL;
>> *
>> * Returns false if the record is not valid.
>> */
>> -static bool get_trace_record(unsigned int idx, TraceRecord *record)
>> +static bool get_trace_record(unsigned int idx, TraceRecord **recordptr)
>> {
>> - if (!(trace_buf[idx].event& TRACE_RECORD_VALID)) {
>> + TraceRecord *record = (TraceRecord *)&trace_buf[idx];
>> + if (!(record->event& TRACE_RECORD_VALID)) {
>> return false;
>> }
>>
>> __sync_synchronize(); /* read memory barrier before accessing record */
>>
>> - *record = trace_buf[idx];
>> - record->event&= ~TRACE_RECORD_VALID;
>> + *recordptr = g_malloc(record->length);
>> + /* make a copy of record to avoid being overwritten */
>> + memcpy(*recordptr, record, record->length);
>> + (*recordptr)->event&= ~TRACE_RECORD_VALID;
>> return true;
>> }
>>
>> @@ -106,6 +93,47 @@ static void flush_trace_file(bool wait)
>> g_static_mutex_unlock(&trace_lock);
>> }
>>
>> +unsigned int trace_alloc_record(TraceEventID event, uint32_t datasize)
>> +{
>> + unsigned int idx, rec_off;
>> + uint32_t rec_len = ST_V2_REC_HDR_LEN + datasize;
>> + uint64_t timestamp_ns = get_clock();
>> +
>> + idx = g_atomic_int_exchange_and_add((gint *)&trace_idx, rec_len) % TRACE_BUF_LEN;
>> + rec_off = idx;
>> + write_to_buffer(rec_off, (uint8_t*)&event, sizeof(event));
>> + rec_off += sizeof(event);
>> + write_to_buffer(rec_off, (uint8_t*)×tamp_ns, sizeof(timestamp_ns));
>> + rec_off += sizeof(timestamp_ns);
>> + write_to_buffer(rec_off, (uint8_t*)&rec_len, sizeof(rec_len));
>> + rec_off += sizeof(rec_len);
>> + return idx;
>> +}
>> +
>> +void write_to_buffer(unsigned int idx, uint8_t *dataptr, uint32_t size)
>> +{
>> + uint32_t x = 0;
>> + while (x< size)
>> + {
>> + if (idx>= TRACE_BUF_LEN) {
>> + idx = idx % TRACE_BUF_LEN;
>> + }
>> + trace_buf[idx++] = dataptr[x++];
>> + }
>> +}
>> +
>> +void trace_mark_record_complete(unsigned int idx)
>> +{
>> + TraceRecord *record = (TraceRecord*)&trace_buf[idx];
>> +
>> + __sync_synchronize(); /* write barrier before marking as valid */
>> + record->event |= TRACE_RECORD_VALID;
>> +
>> + if (idx> TRACE_BUF_FLUSH_THRESHOLD) {
>> + flush_trace_file(false);
>> + }
>
> This looks suspicious. We want to kick the writeout thread when the
> buffer reaches the threshold but not repeatedly after that. Also
> because it's a ring buffer idx could "start" above the threshold.
>
> Perhaps something like:
> if (idx> TRACE_BUF_FLUSH_THRESHOLD&& old_idx<= TRACE_BUF_FLUSH_THRESHOLD) {
> flush_trace_file(false);
> }
>
>> +}
>> +
>> static void wait_for_trace_records_available(void)
>> {
>> g_static_mutex_lock(&trace_lock);
>> @@ -120,7 +148,7 @@ static void wait_for_trace_records_available(void)
>>
>> static gpointer writeout_thread(gpointer opaque)
>> {
>> - TraceRecord record;
>> + TraceRecord record, *recordptr;
>> unsigned int writeout_idx = 0;
>> unsigned int num_available, idx;
>> size_t unused __attribute__ ((unused));
>> @@ -130,19 +158,20 @@ static gpointer writeout_thread(gpointer opaque)
>>
>> num_available = trace_idx - writeout_idx;
>> if (num_available> TRACE_BUF_LEN) {
>> - record = (TraceRecord){
>> - .event = DROPPED_EVENT_ID,
>> - .x1 = num_available,
>> - };
>> - unused = fwrite(&record, sizeof(record), 1, trace_fp);
>> + record.event = DROPPED_EVENT_ID,
>> + record.length = num_available,
>
> Please don't use .length for the number of bytes dropped because this
> makes life harder for parsing scripts like simpletrace.py. They would
> need a special case to check for DROPPED_EVENT_ID and then treat
> .length as an argument. Instead the number of bytes should be an
> argument.
>
> Also, by removing the struct literal assignment you lose the automatic
> zeroing of fields. We'll write out junk data - it would be nicer to
> clear it.
>
>> + unused = fwrite(&record, ST_V2_REC_HDR_LEN, 1, trace_fp);
>> writeout_idx += num_available;
>> }
>>
>> idx = writeout_idx % TRACE_BUF_LEN;
>> - while (get_trace_record(idx,&record)) {
>> - trace_buf[idx].event = 0; /* clear valid bit */
>> - unused = fwrite(&record, sizeof(record), 1, trace_fp);
>> - idx = ++writeout_idx % TRACE_BUF_LEN;
>> + while (get_trace_record(idx,&recordptr)) {
>> + unused = fwrite(recordptr, recordptr->length, 1, trace_fp);
>> + writeout_idx += recordptr->length;
>> + g_free(recordptr);
>> + recordptr = (TraceRecord *)&trace_buf[idx];
>> + recordptr->event = 0;
>> + idx = writeout_idx % TRACE_BUF_LEN;
>> }
>
> I'm wondering if it's worth using a different approach here. Writing
> out individual records has bothered me.
>
> If we have a second buffer, as big as trace_buf[], then a function can
> copy out all records and make them available in trace_buf again:
>
> /**
> * Copy completed trace records out of the ring buffer
> *
> * @idx offset into trace_buf[]
> * @buf destination buffer
> * @len size of destination buffer
> * @ret the number of bytes consumed
> */
> size_t consume_trace_records(unsigned int idx, void *buf, size_t len);
>
> That means consume gobbles up as many records as it can:
> * Until it reaches an invalid record which has not been written yet
> * Until it reaches the end of trace_buf[], the caller can call again
> with idx wrapped to 0
>
> After copying into buf[] it clears the valid bit in trace_buf[].
>
> Then the loop which calls consume_trace_records() does a single
> fwrite(3) and increments idx/writeout_idx.
>
> The advantage to this is that we write many records in one go and I
> think it splits up the writeout steps a little nicer than what we've
> previously done.
>
I think this optimization can be introduced as a separate patch later.
Let me know if you think otherwise.
>>
>> fflush(trace_fp);
>> @@ -231,7 +196,7 @@ void st_set_trace_file_enabled(bool enable)
>> static const TraceRecord header = {
>> .event = HEADER_EVENT_ID,
>> .timestamp_ns = HEADER_MAGIC,
>> - .x1 = HEADER_VERSION,
>> + .length = HEADER_VERSION,
>
> Hmm...this is kind of ugly (see comment about using .length above) but
> in this case most parsers will have a special-case anyway to check the
> magic number. We need to use the .length field because historically
> that's where the version is located.
>
So, lets keep the version here only, right ?
- Harsh
>> +/* Interfaces for simpletrace v2 */
>
> Please drop this comment. All trace/simple.[ch] code is for v2, this
> makes it seems like we support both versions at the same time.
>
>> +
>> +/** Trace buffer entry */
>> +typedef struct {
>> + uint64_t event; /* TraceEventID */
>> + uint64_t timestamp_ns;
>> + uint32_t length; /* in bytes */
>> + uint32_t reserved; /* unused */
>> + uint8_t arguments[]; /* arguments position affects ST_V2_REC_HDR_LEN */
>> +} TraceRecord;
>
> Does this need to be visible? I think it can live in trace/simple.c
>
>> +
>> +#define ST_V2_REC_HDR_LEN 24
>
> Does not need to be visible. Also why calculate manually, instead of
> a macro I would use sizeof(TraceRecord) (the array without elements is
> not counted by sizeof).
>
>> +
>> +enum {
>> + TRACE_BUF_LEN = 4096 * 64,
>> + TRACE_BUF_FLUSH_THRESHOLD = TRACE_BUF_LEN / 4,
>> +};
>
> Does not need to be visible.
>
>> +
>> +extern uint8_t trace_buf[TRACE_BUF_LEN];
>
> Does not need to be visible.
>
next prev parent reply other threads:[~2012-01-18 9:15 UTC|newest]
Thread overview: 23+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-01-10 10:59 [Qemu-devel] [RFC PATCH v3 0/3] simpletrace : support var num of args and strings Harsh Prateek Bora
2012-01-10 10:59 ` [Qemu-devel] [RFC PATCH v3 1/3] Converting tracetool.sh to tracetool.py Harsh Prateek Bora
2012-01-10 14:50 ` Stefan Hajnoczi
2012-01-11 6:25 ` Harsh Bora
2012-01-11 10:03 ` Stefan Hajnoczi
2012-01-10 21:45 ` Lluís Vilanova
2012-01-11 17:14 ` Stefan Hajnoczi
2012-01-10 22:51 ` Lluís Vilanova
2012-01-11 6:38 ` Harsh Bora
2012-01-11 8:46 ` Harsh Bora
2012-01-11 10:50 ` Stefan Hajnoczi
2012-01-12 9:33 ` Stefan Hajnoczi
2012-01-11 10:07 ` Stefan Hajnoczi
2012-01-10 10:59 ` [Qemu-devel] [RFC PATCH v3 2/3] simpletrace-v2: Handle variable number/size of elements per trace record Harsh Prateek Bora
2012-01-10 16:41 ` Stefan Hajnoczi
2012-01-18 9:14 ` Harsh Bora [this message]
2012-01-18 10:31 ` Stefan Hajnoczi
2012-01-18 10:41 ` Harsh Bora
2012-01-18 10:52 ` Harsh Bora
2012-01-18 10:59 ` Stefan Hajnoczi
2012-01-18 11:09 ` Harsh Bora
2012-01-10 10:59 ` [Qemu-devel] [RFC PATCH v3 3/3] simpletrace.py: updated log reader script to handle new log format Harsh Prateek Bora
2012-01-11 12:30 ` Stefan Hajnoczi
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4F168D80.8000707@linux.vnet.ibm.com \
--to=harsh@linux.vnet.ibm.com \
--cc=aneesh.kumar@linux.vnet.ibm.com \
--cc=qemu-devel@nongnu.org \
--cc=stefanha@gmail.com \
--cc=stefanha@linux.vnet.ibm.com \
--cc=vilanova@ac.upc.edu \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).