[PATCH] 2/3: MCA/MCE correctable error handling

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] 2/3: MCA/MCE correctable error handling
@ 2007-08-21 13:31 Christoph Egger
  2007-08-21 15:53 ` Jan Beulich
  0 siblings, 1 reply; 9+ messages in thread
From: Christoph Egger @ 2007-08-21 13:31 UTC (permalink / raw)
  To: xen-devel; +Cc: Gavin.Maltby, Keir Fraser

[-- Attachment #1: Type: text/plain, Size: 509 bytes --]


This is patch 2/3.

Signed-off-by: Christoph Egger <Christoph.Egger@amd.com>


-- 
AMD Saxony, Dresden, Germany
Operating System Research Center

Legal Information:
AMD Saxony Limited Liability Company & Co. KG
Sitz (Geschäftsanschrift):
   Wilschdorfer Landstr. 101, 01109 Dresden, Deutschland
Registergericht Dresden: HRA 4896
vertretungsberechtigter Komplementär:
   AMD Saxony LLC (Sitz Wilmington, Delaware, USA)
Geschäftsführer der AMD Saxony LLC:
   Dr. Hans-R. Deppe, Thomas McCoy

[-- Attachment #2: mca_diff2_headers.diff --]
[-- Type: text/x-diff, Size: 5196 bytes --]

diff -r b5fc70fadacc -r a5209d79d241 xen/include/public/arch-x86/xen.h
--- a/xen/include/public/arch-x86/xen.h	Fri Aug 17 09:06:44 2007 +0200
+++ b/xen/include/public/arch-x86/xen.h	Fri Aug 17 13:21:40 2007 +0200
@@ -81,6 +81,104 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
 #define MAX_VIRT_CPUS 32
 
 #ifndef __ASSEMBLY__
+
+/*
+ * Machine Check Architecure:
+ * structs are read-only and used to report all kinds of
+ * correctable and uncorrectable errors detected by the HW.
+ * Dom0 and DomU: register a handler to get notified.
+ * Dom0 only: Correctable errors are reported via VIRQ_MCA
+ * Dom0 and DomU: Uncorrectable errors are reported via nmi handlers
+ */
+#define MC_TYPE_GLOBAL 0
+#define MC_TYPE_BANK   1
+
+struct mcinfo_common {
+    uint16_t type;     /* structure type */
+    uint16_t size;     /* size of this struct in bytes */
+} __attribute__((packed));
+
+
+#define MC_FLAG_CORRECTABLE    (1 << 0)
+#define MC_FLAG_UNCORRECTABLE  (1 << 1)
+/* contains global x86 mc information */
+struct mcinfo_global {
+    struct mcinfo_common common;
+
+    uint16_t mc_domid; 
+    uint16_t mc_socketid;
+    uint16_t mc_coreid;
+    uint16_t mc_vcpu_id;
+    uint64_t mc_gstatus; /* global status */
+    uint32_t mc_flags;
+} __attribute__((packed));
+
+/* contains bank local x86 mc information */
+struct mcinfo_bank {
+    struct mcinfo_common common;
+
+    uint16_t mc_bank; /* bank nr */
+    uint64_t mc_status; /* bank status */
+    uint64_t mc_addr;
+    uint64_t mc_misc;
+} __attribute__((packed));
+
+
+/* sizeof(struct mcinfo_global) + 6 * sizeof(struct mcinfo_bank) == 200.
+ * This is enough space to store mc information of up to six banks.
+ */
+#define MCINFO_MAXSIZE (204 - sizeof(uint32_t))
+
+struct arch_mc_info {
+    /* Number of mcainfo_* entries in mi_data */
+    uint32_t mi_nentries;
+
+    uint8_t mi_data[MCINFO_MAXSIZE];
+};
+typedef struct arch_mc_info arch_mc_info_t;
+
+
+
+/* 
+ * OS's should use these instead of writing their own helper functions
+ * each with its own bugs and drawbacks.
+ */
+/* Prototype:
+ *    uint32_t x86_mcinfo_nentries(struct shared_info *si);
+ */
+#define x86_mcinfo_nentries(_si)    \
+    (_si)->arch.mc_info.mi_nentries
+/* Prototype:
+ *    struct mcinfo_common *x86_mcinfo_first(struct shared_info *si);
+ */
+#define x86_mcinfo_first(_si)       \
+    (struct mcinfo_common *)((_si)->arch.mc_info.mi_data)
+/* Prototype:
+ *    struct mcinfo_common *x86_mcinfo_next(struct mcinfo_common *mic);
+ */
+#define x86_mcinfo_next(_mic)       \
+   (struct mcinfo_common *)((uint8_t *)(_mic) + (_mic)->size)
+
+/* Prototype:
+ *    void x86_mcinfo_lookup(void *ret, struct shared_info *si, uint16_t type);
+ */
+#define x86_mcinfo_lookup(_ret, _si, _type)    \
+    do {                                                        \
+        uint32_t found, i;                                      \
+        struct mcinfo_common *_mic;                             \
+                                                                \
+        _mic = x86_mcinfo_first(_si);                           \
+        found = 0;                                              \
+        for (i = 0; i < x86_mcinfo_nentries(_si); i++) {        \
+            if (_mic->type == (_type))                          \
+                found = 1;                                      \
+            else                                                \
+                _mic = x86_mcinfo_next(_mic);                   \
+        }                                                       \
+        (_ret) = found ? _mic : NULL;                           \
+    } while (0)
+
+
 
 typedef unsigned long xen_ulong_t;
 
@@ -173,6 +271,7 @@ struct arch_shared_info {
     /* Frame containing list of mfns containing list of mfns containing p2m. */
     xen_pfn_t     pfn_to_mfn_frame_list_list;
     unsigned long nmi_reason;
+    struct arch_mc_info mc_info; /* machine check information */
     uint64_t pad[32];
 };
 typedef struct arch_shared_info arch_shared_info_t;
diff -r b5fc70fadacc -r a5209d79d241 xen/include/public/foreign/reference.size
--- a/xen/include/public/foreign/reference.size	Fri Aug 17 09:06:44 2007 +0200
+++ b/xen/include/public/foreign/reference.size	Fri Aug 17 13:21:40 2007 +0200
@@ -13,6 +13,7 @@ arch_vcpu_info            |      24     
 arch_vcpu_info            |      24      16       0
 vcpu_time_info            |      32      32      32
 vcpu_info                 |      64      64      48
-arch_shared_info          |     268     280     272
-shared_info               |    2584    3368    4384
+arch_mc_info              |     204     204       -
+arch_shared_info          |     472     488     272
+shared_info               |    2788    3576    4384
 
diff -r b5fc70fadacc -r a5209d79d241 xen/include/public/foreign/structs.py
--- a/xen/include/public/foreign/structs.py	Fri Aug 17 09:06:44 2007 +0200
+++ b/xen/include/public/foreign/structs.py	Fri Aug 17 13:21:40 2007 +0200
@@ -15,6 +15,7 @@ structs = [ "start_info",
             "arch_vcpu_info",
             "vcpu_time_info",
             "vcpu_info",
+            "arch_mc_info",
             "arch_shared_info",
             "shared_info" ];
 

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] 2/3: MCA/MCE correctable error handling
  2007-08-21 13:31 [PATCH] 2/3: MCA/MCE correctable error handling Christoph Egger
@ 2007-08-21 15:53 ` Jan Beulich
  2007-08-22  8:47   ` Christoph Egger
  0 siblings, 1 reply; 9+ messages in thread
From: Jan Beulich @ 2007-08-21 15:53 UTC (permalink / raw)
  To: Christoph Egger; +Cc: Gavin.Maltby, xen-devel, Keir Fraser

>+} __attribute__((packed));

I think it was a general agreement that it is not a good idea (non-portable to
non-GNU compilers) to put things like this in public headers. I think by properly
ordering things you can get away without this (and almost without padding).

>+struct mcinfo_global {
>...
>+    uint16_t mc_socketid;
>+    uint16_t mc_coreid;
>+    uint16_t mc_vcpu_id;

Unless mc_vcpu_id is intended for that purpose, this neglects hyperthreading (I
know, AMD doesn't use this, but the interface should be vendor neutral).

If mc_vcpu_id is meant for that purpose, its naming is ambiguous.

If mc_vcpu_id is meant as a vcpuid, then ordering things os that mc_domid and
mc_vcpu_id are contiguous would seem more natural (making eventual
examination in raw memory less confusing).

>+/* sizeof(struct mcinfo_global) + 6 * sizeof(struct mcinfo_bank) == 200.
>+ * This is enough space to store mc information of up to six banks.
>+ */
>+#define MCINFO_MAXSIZE (204 - sizeof(uint32_t))

Why don't you use the sizeof()-s from the description? If this is really needed
for some reason, then having 200 in the description and 204 in the macro is
at least confusing...

>     /* Frame containing list of mfns containing list of mfns containing p2m. */
>     xen_pfn_t     pfn_to_mfn_frame_list_list;
>     unsigned long nmi_reason;
>+    struct arch_mc_info mc_info; /* machine check information */
>     uint64_t pad[32];
> };

Are you sure it is appropriate to add a member here without reducing the
padding member?

Jan

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] 2/3: MCA/MCE correctable error handling
  2007-08-21 15:53 ` Jan Beulich
@ 2007-08-22  8:47   ` Christoph Egger
  2007-08-22 10:01     ` Jan Beulich
  0 siblings, 1 reply; 9+ messages in thread
From: Christoph Egger @ 2007-08-22  8:47 UTC (permalink / raw)
  To: xen-devel; +Cc: Gavin.Maltby, Keir Fraser, Jan Beulich

On Tuesday 21 August 2007 17:53:45 Jan Beulich wrote:
> >+} __attribute__((packed));
>
> I think it was a general agreement that it is not a good idea (non-portable
> to non-GNU compilers) to put things like this in public headers. I think by
> properly ordering things you can get away without this (and almost without
> padding).

Oh, you're right. I should use #pragma pack(1)  instead.
Is this fine with you?

>
> >+struct mcinfo_global {
> >...
> >+    uint16_t mc_socketid;
> >+    uint16_t mc_coreid;
> >+    uint16_t mc_vcpu_id;
>
> Unless mc_vcpu_id is intended for that purpose, this neglects
> hyperthreading (I know, AMD doesn't use this, but the interface should be
> vendor neutral).
>
> If mc_vcpu_id is meant for that purpose, its naming is ambiguous.
>
> If mc_vcpu_id is meant as a vcpuid, then ordering things os that mc_domid
> and mc_vcpu_id are contiguous would seem more natural (making eventual
> examination in raw memory less confusing).

mc_coreid is the physical core that reported the machine check information.
mc_socketid is the physical socket the physical core is in. This is useful
to find all other affected physical cores, when an error in the L3-Cache is 
reported that is shared over all cores in the chip.

mc_vcpu_id is the id of the active vcpu for the domain, that reported the
machine check information. Inside Xen, this is current->vcpu_id.
mc_vcpu_id is needed to deal properly with the upcoming NUMA support
my collegue is working on.

> >+/* sizeof(struct mcinfo_global) + 6 * sizeof(struct mcinfo_bank) == 200.
> >+ * This is enough space to store mc information of up to six banks.
> >+ */
> >+#define MCINFO_MAXSIZE (204 - sizeof(uint32_t))
>
> Why don't you use the sizeof()-s from the description? If this is really
> needed for some reason, then having 200 in the description and 204 in the
> macro is at least confusing...

MCINFO_MAXSIZE is the size for the content of the mi_data array.
MCINFO_MAXSIZE + sizeof(mi_nentries) == 204. That is where is number comes
from.

> >     /* Frame containing list of mfns containing list of mfns containing
> > p2m. */ xen_pfn_t     pfn_to_mfn_frame_list_list;
> >     unsigned long nmi_reason;
> >+    struct arch_mc_info mc_info; /* machine check information */
> >     uint64_t pad[32];
> > };
>
> Are you sure it is appropriate to add a member here without reducing the
> padding member?

You want me to replace "uint64_t pad[32];" with "uint32_t pad[13];" ?

-- 
AMD Saxony, Dresden, Germany
Operating System Research Center

Legal Information:
AMD Saxony Limited Liability Company & Co. KG
Sitz (Geschäftsanschrift):
   Wilschdorfer Landstr. 101, 01109 Dresden, Deutschland
Registergericht Dresden: HRA 4896
vertretungsberechtigter Komplementär:
   AMD Saxony LLC (Sitz Wilmington, Delaware, USA)
Geschäftsführer der AMD Saxony LLC:
   Dr. Hans-R. Deppe, Thomas McCoy

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] 2/3: MCA/MCE correctable error handling
  2007-08-22  8:47   ` Christoph Egger
@ 2007-08-22 10:01     ` Jan Beulich
  2007-08-22 15:47       ` Christoph Egger
  0 siblings, 1 reply; 9+ messages in thread
From: Jan Beulich @ 2007-08-22 10:01 UTC (permalink / raw)
  To: Christoph Egger; +Cc: Gavin.Maltby, xen-devel, Keir Fraser

>>> "Christoph Egger" <Christoph.Egger@amd.com> 22.08.07 10:47 >>>
>On Tuesday 21 August 2007 17:53:45 Jan Beulich wrote:
>> >+} __attribute__((packed));
>>
>> I think it was a general agreement that it is not a good idea (non-portable
>> to non-GNU compilers) to put things like this in public headers. I think by
>> properly ordering things you can get away without this (and almost without
>> padding).
>
>Oh, you're right. I should use #pragma pack(1)  instead.
>Is this fine with you?

I'm afraid that wouldn't work with the compat mode header generation, as the
original use of #pragma pack(push, ...) was banned for (Sun) compatibility
reasons. Consequently, I believe the only way of doing this properly is to avoid
depending on compiler behavior by arranging things appropriately (including
padding members if needed) and avoiding #pragma pack() altogether.

>>
>> >+struct mcinfo_global {
>> >...
>> >+    uint16_t mc_socketid;
>> >+    uint16_t mc_coreid;
>> >+    uint16_t mc_vcpu_id;
>>
>> Unless mc_vcpu_id is intended for that purpose, this neglects
>> hyperthreading (I know, AMD doesn't use this, but the interface should be
>> vendor neutral).
>>
>> If mc_vcpu_id is meant for that purpose, its naming is ambiguous.
>>
>> If mc_vcpu_id is meant as a vcpuid, then ordering things os that mc_domid
>> and mc_vcpu_id are contiguous would seem more natural (making eventual
>> examination in raw memory less confusing).
>
>mc_coreid is the physical core that reported the machine check information.
>mc_socketid is the physical socket the physical core is in. This is useful
>to find all other affected physical cores, when an error in the L3-Cache is 
>reported that is shared over all cores in the chip.
>
>mc_vcpu_id is the id of the active vcpu for the domain, that reported the
>machine check information. Inside Xen, this is current->vcpu_id.
>mc_vcpu_id is needed to deal properly with the upcoming NUMA support
>my collegue is working on.

Okay, but then you're really lacking a thread id here, for being filled by
respective Intel code (AMD code would set this to zero).

>> >+/* sizeof(struct mcinfo_global) + 6 * sizeof(struct mcinfo_bank) == 200.
>> >+ * This is enough space to store mc information of up to six banks.
>> >+ */
>> >+#define MCINFO_MAXSIZE (204 - sizeof(uint32_t))
>>
>> Why don't you use the sizeof()-s from the description? If this is really
>> needed for some reason, then having 200 in the description and 204 in the
>> macro is at least confusing...
>
>MCINFO_MAXSIZE is the size for the content of the mi_data array.
>MCINFO_MAXSIZE + sizeof(mi_nentries) == 204. That is where is number comes
>from.

I concluded that, but pointed out that seeing two different numbers made
me think of why this is so, whereas identical numbers would have avoided
that (and will likely keep others from asking later, too).

Also, you don't say what was the reason for you to use constants instead
of sizeof() here.

>> >     /* Frame containing list of mfns containing list of mfns containing
>> > p2m. */ xen_pfn_t     pfn_to_mfn_frame_list_list;
>> >     unsigned long nmi_reason;
>> >+    struct arch_mc_info mc_info; /* machine check information */
>> >     uint64_t pad[32];
>> > };
>>
>> Are you sure it is appropriate to add a member here without reducing the
>> padding member?
>
>You want me to replace "uint64_t pad[32];" with "uint32_t pad[13];" ?

It would be my understanding that that's the right thing to do (assuming you
calculated the numbers correctly), but I'd rely on Keir to give a final word on
this.

Jan

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] 2/3: MCA/MCE correctable error handling
  2007-08-22 10:01     ` Jan Beulich
@ 2007-08-22 15:47       ` Christoph Egger
  2007-08-22 16:13         ` Keir Fraser
  0 siblings, 1 reply; 9+ messages in thread
From: Christoph Egger @ 2007-08-22 15:47 UTC (permalink / raw)
  To: xen-devel; +Cc: Gavin.Maltby, Keir Fraser, Jan Beulich


I fixed all this in my code and will re-submit the new patch.

Christoph



On Wednesday 22 August 2007 12:01:26 Jan Beulich wrote:
> >>> "Christoph Egger" <Christoph.Egger@amd.com> 22.08.07 10:47 >>>
> >
> >On Tuesday 21 August 2007 17:53:45 Jan Beulich wrote:
> >> >+} __attribute__((packed));
> >>
> >> I think it was a general agreement that it is not a good idea
> >> (non-portable to non-GNU compilers) to put things like this in public
> >> headers. I think by properly ordering things you can get away without
> >> this (and almost without padding).
> >
> >Oh, you're right. I should use #pragma pack(1)  instead.
> >Is this fine with you?
>
> I'm afraid that wouldn't work with the compat mode header generation, as
> the original use of #pragma pack(push, ...) was banned for (Sun)
> compatibility reasons. Consequently, I believe the only way of doing this
> properly is to avoid depending on compiler behavior by arranging things
> appropriately (including padding members if needed) and avoiding #pragma
> pack() altogether.
>
> >> >+struct mcinfo_global {
> >> >...
> >> >+    uint16_t mc_socketid;
> >> >+    uint16_t mc_coreid;
> >> >+    uint16_t mc_vcpu_id;
> >>
> >> Unless mc_vcpu_id is intended for that purpose, this neglects
> >> hyperthreading (I know, AMD doesn't use this, but the interface should
> >> be vendor neutral).
> >>
> >> If mc_vcpu_id is meant for that purpose, its naming is ambiguous.
> >>
> >> If mc_vcpu_id is meant as a vcpuid, then ordering things os that
> >> mc_domid and mc_vcpu_id are contiguous would seem more natural (making
> >> eventual examination in raw memory less confusing).
> >
> >mc_coreid is the physical core that reported the machine check
> > information. mc_socketid is the physical socket the physical core is in.
> > This is useful to find all other affected physical cores, when an error
> > in the L3-Cache is reported that is shared over all cores in the chip.
> >
> >mc_vcpu_id is the id of the active vcpu for the domain, that reported the
> >machine check information. Inside Xen, this is current->vcpu_id.
> >mc_vcpu_id is needed to deal properly with the upcoming NUMA support
> >my collegue is working on.
>
> Okay, but then you're really lacking a thread id here, for being filled by
> respective Intel code (AMD code would set this to zero).
>
> >> >+/* sizeof(struct mcinfo_global) + 6 * sizeof(struct mcinfo_bank) ==
> >> > 200. + * This is enough space to store mc information of up to six
> >> > banks. + */
> >> >+#define MCINFO_MAXSIZE (204 - sizeof(uint32_t))
> >>
> >> Why don't you use the sizeof()-s from the description? If this is really
> >> needed for some reason, then having 200 in the description and 204 in
> >> the macro is at least confusing...
> >
> >MCINFO_MAXSIZE is the size for the content of the mi_data array.
> >MCINFO_MAXSIZE + sizeof(mi_nentries) == 204. That is where is number comes
> >from.
>
> I concluded that, but pointed out that seeing two different numbers made
> me think of why this is so, whereas identical numbers would have avoided
> that (and will likely keep others from asking later, too).
>
> Also, you don't say what was the reason for you to use constants instead
> of sizeof() here.
>
> >> >     /* Frame containing list of mfns containing list of mfns
> >> > containing p2m. */ xen_pfn_t     pfn_to_mfn_frame_list_list;
> >> >     unsigned long nmi_reason;
> >> >+    struct arch_mc_info mc_info; /* machine check information */
> >> >     uint64_t pad[32];
> >> > };
> >>
> >> Are you sure it is appropriate to add a member here without reducing the
> >> padding member?
> >
> >You want me to replace "uint64_t pad[32];" with "uint32_t pad[13];" ?
>
> It would be my understanding that that's the right thing to do (assuming
> you calculated the numbers correctly), but I'd rely on Keir to give a final
> word on this.
>
> Jan



-- 
AMD Saxony, Dresden, Germany
Operating System Research Center

Legal Information:
AMD Saxony Limited Liability Company & Co. KG
Sitz (Geschäftsanschrift):
   Wilschdorfer Landstr. 101, 01109 Dresden, Deutschland
Registergericht Dresden: HRA 4896
vertretungsberechtigter Komplementär:
   AMD Saxony LLC (Sitz Wilmington, Delaware, USA)
Geschäftsführer der AMD Saxony LLC:
   Dr. Hans-R. Deppe, Thomas McCoy

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] 2/3: MCA/MCE correctable error handling
  2007-08-22 15:47       ` Christoph Egger
@ 2007-08-22 16:13         ` Keir Fraser
  2007-08-23  7:07           ` Christoph Egger
  0 siblings, 1 reply; 9+ messages in thread
From: Keir Fraser @ 2007-08-22 16:13 UTC (permalink / raw)
  To: Christoph Egger, xen-devel; +Cc: Gavin.Maltby, Jan Beulich

I'm not sure you should use shared_info at all. This interface should be
low-rate enough that you can add a sysctl (assuming this info is applicable
for dom0 only, which it looks to be). Actually, probably a platform_op
rather than a sysctl...

 -- Keir

On 22/8/07 16:47, "Christoph Egger" <Christoph.Egger@amd.com> wrote:

> 
> I fixed all this in my code and will re-submit the new patch.
> 
> Christoph
> 
> 
> 
> On Wednesday 22 August 2007 12:01:26 Jan Beulich wrote:
>>>>> "Christoph Egger" <Christoph.Egger@amd.com> 22.08.07 10:47 >>>
>>> 
>>> On Tuesday 21 August 2007 17:53:45 Jan Beulich wrote:
>>>>> +} __attribute__((packed));
>>>> 
>>>> I think it was a general agreement that it is not a good idea
>>>> (non-portable to non-GNU compilers) to put things like this in public
>>>> headers. I think by properly ordering things you can get away without
>>>> this (and almost without padding).
>>> 
>>> Oh, you're right. I should use #pragma pack(1)  instead.
>>> Is this fine with you?
>> 
>> I'm afraid that wouldn't work with the compat mode header generation, as
>> the original use of #pragma pack(push, ...) was banned for (Sun)
>> compatibility reasons. Consequently, I believe the only way of doing this
>> properly is to avoid depending on compiler behavior by arranging things
>> appropriately (including padding members if needed) and avoiding #pragma
>> pack() altogether.
>> 
>>>>> +struct mcinfo_global {
>>>>> ...
>>>>> +    uint16_t mc_socketid;
>>>>> +    uint16_t mc_coreid;
>>>>> +    uint16_t mc_vcpu_id;
>>>> 
>>>> Unless mc_vcpu_id is intended for that purpose, this neglects
>>>> hyperthreading (I know, AMD doesn't use this, but the interface should
>>>> be vendor neutral).
>>>> 
>>>> If mc_vcpu_id is meant for that purpose, its naming is ambiguous.
>>>> 
>>>> If mc_vcpu_id is meant as a vcpuid, then ordering things os that
>>>> mc_domid and mc_vcpu_id are contiguous would seem more natural (making
>>>> eventual examination in raw memory less confusing).
>>> 
>>> mc_coreid is the physical core that reported the machine check
>>> information. mc_socketid is the physical socket the physical core is in.
>>> This is useful to find all other affected physical cores, when an error
>>> in the L3-Cache is reported that is shared over all cores in the chip.
>>> 
>>> mc_vcpu_id is the id of the active vcpu for the domain, that reported the
>>> machine check information. Inside Xen, this is current->vcpu_id.
>>> mc_vcpu_id is needed to deal properly with the upcoming NUMA support
>>> my collegue is working on.
>> 
>> Okay, but then you're really lacking a thread id here, for being filled by
>> respective Intel code (AMD code would set this to zero).
>> 
>>>>> +/* sizeof(struct mcinfo_global) + 6 * sizeof(struct mcinfo_bank) ==
>>>>> 200. + * This is enough space to store mc information of up to six
>>>>> banks. + */
>>>>> +#define MCINFO_MAXSIZE (204 - sizeof(uint32_t))
>>>> 
>>>> Why don't you use the sizeof()-s from the description? If this is really
>>>> needed for some reason, then having 200 in the description and 204 in
>>>> the macro is at least confusing...
>>> 
>>> MCINFO_MAXSIZE is the size for the content of the mi_data array.
>>> MCINFO_MAXSIZE + sizeof(mi_nentries) == 204. That is where is number comes
>>> from.
>> 
>> I concluded that, but pointed out that seeing two different numbers made
>> me think of why this is so, whereas identical numbers would have avoided
>> that (and will likely keep others from asking later, too).
>> 
>> Also, you don't say what was the reason for you to use constants instead
>> of sizeof() here.
>> 
>>>>>     /* Frame containing list of mfns containing list of mfns
>>>>> containing p2m. */ xen_pfn_t     pfn_to_mfn_frame_list_list;
>>>>>     unsigned long nmi_reason;
>>>>> +    struct arch_mc_info mc_info; /* machine check information */
>>>>>     uint64_t pad[32];
>>>>> };
>>>> 
>>>> Are you sure it is appropriate to add a member here without reducing the
>>>> padding member?
>>> 
>>> You want me to replace "uint64_t pad[32];" with "uint32_t pad[13];" ?
>> 
>> It would be my understanding that that's the right thing to do (assuming
>> you calculated the numbers correctly), but I'd rely on Keir to give a final
>> word on this.
>> 
>> Jan
> 
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] 2/3: MCA/MCE correctable error handling
  2007-08-22 16:13         ` Keir Fraser
@ 2007-08-23  7:07           ` Christoph Egger
  2007-08-23  9:23             ` [PATCH] resend " Christoph Egger
  2007-08-23 14:12             ` [PATCH] " Keir Fraser
  0 siblings, 2 replies; 9+ messages in thread
From: Christoph Egger @ 2007-08-23  7:07 UTC (permalink / raw)
  To: xen-devel; +Cc: Gavin.Maltby, Keir Fraser, Jan Beulich

On Wednesday 22 August 2007 18:13:30 Keir Fraser wrote:
> I'm not sure you should use shared_info at all. This interface should be
> low-rate enough that you can add a sysctl (assuming this info is applicable
> for dom0 only, which it looks to be).

The polling service is dom0 only. Uncorrectable errors may also be reported
to a DomU.

> Actually, probably a platform_op  rather than a sysctl...

What you propose is to copy the MCA info from Xen to guest?
The MCA info structure will be used for both correctable and uncorrectable 
error notification (as I said in my first patch 0/3 mail).

I assumed you agreed on all this as is in the patches, since you did not 
object/comment when this was discussed on this list in the "MCA/MCE concept" 
topic (and also your questions in your patch 3/3 mail were discussed there).
The purpose of this discussion was to find an agreement on how to proceed.

Therefore, all what I expected from you were some questions/comments about 
implementation details and no design/concept questions.


Christoph


>  -- Keir
>
> On 22/8/07 16:47, "Christoph Egger" <Christoph.Egger@amd.com> wrote:
> > I fixed all this in my code and will re-submit the new patch.
> >
> > Christoph
> >
> > On Wednesday 22 August 2007 12:01:26 Jan Beulich wrote:
> >>>>> "Christoph Egger" <Christoph.Egger@amd.com> 22.08.07 10:47 >>>
> >>>
> >>> On Tuesday 21 August 2007 17:53:45 Jan Beulich wrote:
> >>>>> +} __attribute__((packed));
> >>>>
> >>>> I think it was a general agreement that it is not a good idea
> >>>> (non-portable to non-GNU compilers) to put things like this in public
> >>>> headers. I think by properly ordering things you can get away without
> >>>> this (and almost without padding).
> >>>
> >>> Oh, you're right. I should use #pragma pack(1)  instead.
> >>> Is this fine with you?
> >>
> >> I'm afraid that wouldn't work with the compat mode header generation, as
> >> the original use of #pragma pack(push, ...) was banned for (Sun)
> >> compatibility reasons. Consequently, I believe the only way of doing
> >> this properly is to avoid depending on compiler behavior by arranging
> >> things appropriately (including padding members if needed) and avoiding
> >> #pragma pack() altogether.
> >>
> >>>>> +struct mcinfo_global {
> >>>>> ...
> >>>>> +    uint16_t mc_socketid;
> >>>>> +    uint16_t mc_coreid;
> >>>>> +    uint16_t mc_vcpu_id;
> >>>>
> >>>> Unless mc_vcpu_id is intended for that purpose, this neglects
> >>>> hyperthreading (I know, AMD doesn't use this, but the interface should
> >>>> be vendor neutral).
> >>>>
> >>>> If mc_vcpu_id is meant for that purpose, its naming is ambiguous.
> >>>>
> >>>> If mc_vcpu_id is meant as a vcpuid, then ordering things os that
> >>>> mc_domid and mc_vcpu_id are contiguous would seem more natural (making
> >>>> eventual examination in raw memory less confusing).
> >>>
> >>> mc_coreid is the physical core that reported the machine check
> >>> information. mc_socketid is the physical socket the physical core is
> >>> in. This is useful to find all other affected physical cores, when an
> >>> error in the L3-Cache is reported that is shared over all cores in the
> >>> chip.
> >>>
> >>> mc_vcpu_id is the id of the active vcpu for the domain, that reported
> >>> the machine check information. Inside Xen, this is current->vcpu_id.
> >>> mc_vcpu_id is needed to deal properly with the upcoming NUMA support my
> >>> collegue is working on.
> >>
> >> Okay, but then you're really lacking a thread id here, for being filled
> >> by respective Intel code (AMD code would set this to zero).
> >>
> >>>>> +/* sizeof(struct mcinfo_global) + 6 * sizeof(struct mcinfo_bank) ==
> >>>>> 200. + * This is enough space to store mc information of up to six
> >>>>> banks. + */
> >>>>> +#define MCINFO_MAXSIZE (204 - sizeof(uint32_t))
> >>>>
> >>>> Why don't you use the sizeof()-s from the description? If this is
> >>>> really needed for some reason, then having 200 in the description and
> >>>> 204 in the macro is at least confusing...
> >>>
> >>> MCINFO_MAXSIZE is the size for the content of the mi_data array.
> >>> MCINFO_MAXSIZE + sizeof(mi_nentries) == 204. That is where is number
> >>> comes from.
> >>
> >> I concluded that, but pointed out that seeing two different numbers made
> >> me think of why this is so, whereas identical numbers would have avoided
> >> that (and will likely keep others from asking later, too).
> >>
> >> Also, you don't say what was the reason for you to use constants instead
> >> of sizeof() here.
> >>
> >>>>>     /* Frame containing list of mfns containing list of mfns
> >>>>> containing p2m. */ xen_pfn_t     pfn_to_mfn_frame_list_list;
> >>>>>     unsigned long nmi_reason;
> >>>>> +    struct arch_mc_info mc_info; /* machine check information */
> >>>>>     uint64_t pad[32];
> >>>>> };
> >>>>
> >>>> Are you sure it is appropriate to add a member here without reducing
> >>>> the padding member?
> >>>
> >>> You want me to replace "uint64_t pad[32];" with "uint32_t pad[13];" ?
> >>
> >> It would be my understanding that that's the right thing to do (assuming
> >> you calculated the numbers correctly), but I'd rely on Keir to give a
> >> final word on this.
> >>
> >> Jan



-- 
AMD Saxony, Dresden, Germany
Operating System Research Center

Legal Information:
AMD Saxony Limited Liability Company & Co. KG
Sitz (Geschäftsanschrift):
   Wilschdorfer Landstr. 101, 01109 Dresden, Deutschland
Registergericht Dresden: HRA 4896
vertretungsberechtigter Komplementär:
   AMD Saxony LLC (Sitz Wilmington, Delaware, USA)
Geschäftsführer der AMD Saxony LLC:
   Dr. Hans-R. Deppe, Thomas McCoy

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH] resend 2/3: MCA/MCE correctable error handling
  2007-08-23  7:07           ` Christoph Egger
@ 2007-08-23  9:23             ` Christoph Egger
  2007-08-23 14:12             ` [PATCH] " Keir Fraser
  1 sibling, 0 replies; 9+ messages in thread
From: Christoph Egger @ 2007-08-23  9:23 UTC (permalink / raw)
  To: xen-devel; +Cc: Gavin.Maltby, Keir Fraser, Jan Beulich

[-- Attachment #1: Type: text/plain, Size: 591 bytes --]


Yesterday I said, I will re-send this patch. Here is it.
It incorporates feedback from Jan Beulich.

Signed-off-by: Christoph Egger <Christoph.Egger@amd.com>


-- 
AMD Saxony, Dresden, Germany
Operating System Research Center

Legal Information:
AMD Saxony Limited Liability Company & Co. KG
Sitz (Geschäftsanschrift):
   Wilschdorfer Landstr. 101, 01109 Dresden, Deutschland
Registergericht Dresden: HRA 4896
vertretungsberechtigter Komplementär:
   AMD Saxony LLC (Sitz Wilmington, Delaware, USA)
Geschäftsführer der AMD Saxony LLC:
   Dr. Hans-R. Deppe, Thomas McCoy

[-- Attachment #2: mca_diff2_headers.diff --]
[-- Type: text/x-diff, Size: 5406 bytes --]

diff -r 73832dfa9bcd -r 0fd5402a3730 xen/include/public/arch-x86/xen.h
--- a/xen/include/public/arch-x86/xen.h	Thu Aug 23 09:51:13 2007 +0200
+++ b/xen/include/public/arch-x86/xen.h	Thu Aug 23 10:20:43 2007 +0200
@@ -81,6 +81,108 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
 #define MAX_VIRT_CPUS 32
 
 #ifndef __ASSEMBLY__
+
+/*
+ * Machine Check Architecure:
+ * structs are read-only and used to report all kinds of
+ * correctable and uncorrectable errors detected by the HW.
+ * Dom0 and DomU: register a handler to get notified.
+ * Dom0 only: Correctable errors are reported via VIRQ_MCA
+ * Dom0 and DomU: Uncorrectable errors are reported via nmi handlers
+ */
+#define MC_TYPE_GLOBAL 0
+#define MC_TYPE_BANK   1
+
+struct mcinfo_common {
+    uint16_t type;     /* structure type */
+    uint16_t size;     /* size of this struct in bytes */
+};
+
+
+#define MC_FLAG_CORRECTABLE    (1 << 0)
+#define MC_FLAG_UNCORRECTABLE  (1 << 1)
+/* contains global x86 mc information */
+struct mcinfo_global {
+    struct mcinfo_common common;
+
+    uint16_t mc_domid; /* impacted domain */
+    uint32_t mc_socketid; /* physical socket of the physical core */
+    uint16_t mc_coreid; /* physical impacted core */
+    uint16_t mc_core_threadid; /* core thread of physical core */
+    uint16_t mc_vcpuid; /* virtual cpu scheduled for impacted domain */
+    uint64_t mc_gstatus; /* global status */
+    uint32_t mc_flags;
+};
+
+/* contains bank local x86 mc information */
+struct mcinfo_bank {
+    struct mcinfo_common common;
+
+    uint32_t mc_bank; /* bank nr */
+    uint64_t mc_status; /* bank status */
+    uint64_t mc_addr;
+    uint64_t mc_misc;
+};
+
+
+/* sizeof(struct mcinfo_global) + 6 * sizeof(struct mcinfo_bank) == 220.
+ * This is enough space to store mc information of up to six banks.
+ * 220 + sizeof(mi_nentries) = 224.
+ */
+#define MCINFO_MAXSIZE (224 - sizeof(uint32_t))
+
+struct arch_mc_info {
+    /* Number of mcinfo_* entries in mi_data */
+    uint32_t mi_nentries;
+
+    uint8_t mi_data[MCINFO_MAXSIZE];
+};
+typedef struct arch_mc_info arch_mc_info_t;
+
+
+
+/* 
+ * OS's should use these instead of writing their own helper functions
+ * each with its own bugs and drawbacks.
+ * We use macros instead of static inline functions to allow guests
+ * to include this header in assembly files (*.S).
+ */
+/* Prototype:
+ *    uint32_t x86_mcinfo_nentries(struct shared_info *si);
+ */
+#define x86_mcinfo_nentries(_si)    \
+    (_si)->arch.mc_info.mi_nentries
+/* Prototype:
+ *    struct mcinfo_common *x86_mcinfo_first(struct shared_info *si);
+ */
+#define x86_mcinfo_first(_si)       \
+    (struct mcinfo_common *)((_si)->arch.mc_info.mi_data)
+/* Prototype:
+ *    struct mcinfo_common *x86_mcinfo_next(struct mcinfo_common *mic);
+ */
+#define x86_mcinfo_next(_mic)       \
+   (struct mcinfo_common *)((uint8_t *)(_mic) + (_mic)->size)
+
+/* Prototype:
+ *    void x86_mcinfo_lookup(void *ret, struct shared_info *si, uint16_t type);
+ */
+#define x86_mcinfo_lookup(_ret, _si, _type)    \
+    do {                                                        \
+        uint32_t found, i;                                      \
+        struct mcinfo_common *_mic;                             \
+                                                                \
+        _mic = x86_mcinfo_first(_si);                           \
+        found = 0;                                              \
+        for (i = 0; i < x86_mcinfo_nentries(_si); i++) {        \
+            if (_mic->type == (_type))                          \
+                found = 1;                                      \
+            else                                                \
+                _mic = x86_mcinfo_next(_mic);                   \
+        }                                                       \
+        (_ret) = found ? _mic : NULL;                           \
+    } while (0)
+
+
 
 typedef unsigned long xen_ulong_t;
 
@@ -173,7 +275,8 @@ struct arch_shared_info {
     /* Frame containing list of mfns containing list of mfns containing p2m. */
     xen_pfn_t     pfn_to_mfn_frame_list_list;
     unsigned long nmi_reason;
-    uint64_t pad[32];
+    struct arch_mc_info mc_info; /* machine check information */
+    uint32_t pad[8];
 };
 typedef struct arch_shared_info arch_shared_info_t;
 
diff -r 73832dfa9bcd -r 0fd5402a3730 xen/include/public/foreign/reference.size
--- a/xen/include/public/foreign/reference.size	Thu Aug 23 09:51:13 2007 +0200
+++ b/xen/include/public/foreign/reference.size	Thu Aug 23 10:20:43 2007 +0200
@@ -13,6 +13,7 @@ arch_vcpu_info            |      24     
 arch_vcpu_info            |      24      16       0
 vcpu_time_info            |      32      32      32
 vcpu_info                 |      64      64      48
+arch_mc_info              |     224     224       -
 arch_shared_info          |     268     280     272
 shared_info               |    2584    3368    4384
 
diff -r 73832dfa9bcd -r 0fd5402a3730 xen/include/public/foreign/structs.py
--- a/xen/include/public/foreign/structs.py	Thu Aug 23 09:51:13 2007 +0200
+++ b/xen/include/public/foreign/structs.py	Thu Aug 23 10:20:43 2007 +0200
@@ -15,6 +15,7 @@ structs = [ "start_info",
             "arch_vcpu_info",
             "vcpu_time_info",
             "vcpu_info",
+            "arch_mc_info",
             "arch_shared_info",
             "shared_info" ];
 

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] 2/3: MCA/MCE correctable error handling
  2007-08-23  7:07           ` Christoph Egger
  2007-08-23  9:23             ` [PATCH] resend " Christoph Egger
@ 2007-08-23 14:12             ` Keir Fraser
  1 sibling, 0 replies; 9+ messages in thread
From: Keir Fraser @ 2007-08-23 14:12 UTC (permalink / raw)
  To: Christoph Egger, xen-devel; +Cc: Gavin.Maltby, Jan Beulich

On 23/8/07 08:07, "Christoph Egger" <Christoph.Egger@amd.com> wrote:

> Therefore, all what I expected from you were some questions/comments about
> implementation details and no design/concept questions.

Whether the info is propagated via a hypercall or via shared_info is an
implementation detail, as far as I'm concerned. If you want to add stuff to
shared_info you need a strong argument why it can't be done by other means.

 -- Keir

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2007-08-23 14:12 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-08-21 13:31 [PATCH] 2/3: MCA/MCE correctable error handling Christoph Egger
2007-08-21 15:53 ` Jan Beulich
2007-08-22  8:47   ` Christoph Egger
2007-08-22 10:01     ` Jan Beulich
2007-08-22 15:47       ` Christoph Egger
2007-08-22 16:13         ` Keir Fraser
2007-08-23  7:07           ` Christoph Egger
2007-08-23  9:23             ` [PATCH] resend " Christoph Egger
2007-08-23 14:12             ` [PATCH] " Keir Fraser

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.