xen-devel.lists.xenproject.org archive mirror
 help / color / mirror / Atom feed
From: David Scott <dave.scott@eu.citrix.com>
To: Ian Campbell <Ian.Campbell@citrix.com>
Cc: "Liuqiming (John)" <john.liuqiming@huawei.com>,
	Yanqiangjun <yanqiangjun@huawei.com>,
	"xen-devel@lists.xen.org" <xen-devel@lists.xen.org>,
	"andrew.cooper3@citrix.com" <andrew.cooper3@citrix.com>
Subject: Re: oxenstored memory leak? seems related with XSA-38
Date: Tue, 16 Jul 2013 10:56:53 +0100	[thread overview]
Message-ID: <51E518E5.7000503@eu.citrix.com> (raw)
In-Reply-To: <1373967989.4663.31.camel@kazak.uk.xensource.com>

On 16/07/13 10:46, Ian Campbell wrote:
> On Mon, 2013-07-15 at 13:13 +0100, David Scott wrote:
>> Hi,
>>
>> On 05/07/13 10:07, Liuqiming (John) wrote:
>>>
>>> Here is my patch that try to fix this issue.
>>>
>>> The whole idea is: add check logic when read from IO ring, and if error happens mark the reading connection as "bad",
>>> Unless vm reboot, oxenstored will not handle message from this connection any more.
>>
>> I think detecting a bad client and avoiding wasting CPU time on it is a
>> good idea. Is this patch working well for you in your testing?
>>
>> In future I wonder whether we should add some form of request
>> rate-limiting in addition to the per-domain quotas, to prevent one
>> domain from taking more than it's fair share of xenstored time. I
>> imagine that a non-malicious domain can still keep xenstored busy with
>> 'legitimate' traffic (although hopefully it still provides services to
>> other guests, albeit more slowly)
>
> Is this an Ack for this patch, or (leaving aside future work) would you
> like to see changes before it gets applied?

I'm happy with this current patch for now, so

Acked-by: David Scott <dave.scott@eu.citrix.com>

Cheers,
Dave

>
> Ian.
>
>>
>> Cheers,
>> Dave
>>
>>>
>>> diff --git a/tools/ocaml/libs/xb/xs_ring_stubs.c b/tools/ocaml/libs/xb/xs_ring_stubs.c
>>> index fdd9983..a9ca456 100644
>>> --- a/tools/ocaml/libs/xb/xs_ring_stubs.c
>>> +++ b/tools/ocaml/libs/xb/xs_ring_stubs.c
>>> @@ -45,6 +45,10 @@
>>>    	cons = *(volatile uint32*)&intf->req_cons;
>>>    	prod = *(volatile uint32*)&intf->req_prod;
>>>    	xen_mb();
>>> +
>>> +	if ((prod - cons) > XENSTORE_RING_SIZE)
>>> +	    return -1;
>>> +
>>>    	if (prod == cons)
>>>    		return 0;
>>>    	cons = MASK_XENSTORE_IDX(cons);
>>> @@ -94,7 +98,7 @@
>>>    	res = xs_ring_read(GET_C_STRUCT(interface),
>>>    	                   String_val(buffer), Int_val(len));
>>>    	if (res == -1)
>>> -		caml_failwith("huh");
>>> +		caml_failwith("bad client");
>>>    	result = Val_int(res);
>>>    	CAMLreturn(result);
>>>    }
>>> diff --git a/tools/ocaml/xenstored/connection.ml b/tools/ocaml/xenstored/connection.ml
>>> index 32e2f2e..e862c79 100644
>>> --- a/tools/ocaml/xenstored/connection.ml
>>> +++ b/tools/ocaml/xenstored/connection.ml
>>> @@ -38,6 +38,11 @@
>>>    	mutable perm: Perms.Connection.t;
>>>    }
>>>
>>> +let mark_as_bad con =
>>> +	match con.dom with
>>> +	| None -> ()
>>> +	| Some domain -> Domain.mark_as_bad domain
>>> +
>>>    let get_path con =
>>>    Printf.sprintf "/local/domain/%i/" (match con.dom with None -> 0 | Some d -> Domain.get_id d)
>>>
>>> diff --git a/tools/ocaml/xenstored/domain.ml b/tools/ocaml/xenstored/domain.ml
>>> index 85ab282..444069d 100644
>>> --- a/tools/ocaml/xenstored/domain.ml
>>> +++ b/tools/ocaml/xenstored/domain.ml
>>> @@ -27,6 +27,7 @@
>>>    	interface: Xenmmap.mmap_interface;
>>>    	eventchn: Event.t;
>>>    	mutable port: Xeneventchn.t option;
>>> +	mutable bad_client: bool;
>>>    }
>>>
>>>    let get_path dom = "/local/domain/" ^ (sprintf "%u" dom.id)
>>> @@ -34,6 +35,9 @@
>>>    let get_interface d = d.interface
>>>    let get_mfn d = d.mfn
>>>    let get_remote_port d = d.remote_port
>>> +
>>> +let is_bad_domain domain = domain.bad_client
>>> +let mark_as_bad domain = domain.bad_client <- true
>>>
>>>    let string_of_port = function
>>>    | None -> "None"
>>> @@ -68,7 +72,8 @@
>>>    	remote_port = remote_port;
>>>    	interface = interface;
>>>    	eventchn = eventchn;
>>> -	port = None
>>> +	port = None;
>>> +	bad_client = false
>>>    }
>>>
>>>    let is_dom0 d = d.id = 0
>>> diff --git a/tools/ocaml/xenstored/process.ml b/tools/ocaml/xenstored/process.ml
>>> index a4ff741..2267ddc 100644
>>> --- a/tools/ocaml/xenstored/process.ml
>>> +++ b/tools/ocaml/xenstored/process.ml
>>> @@ -374,7 +374,17 @@
>>>    	Logging.xb_answer ~ty ~tid ~con:(Connection.get_domstr con) data
>>>
>>>    let do_input store cons doms con =
>>> -	if Connection.do_input con then (
>>> +	let newpacket =
>>> +		try
>>> +			Connection.do_input con
>>> +		with Failure exp ->
>>> +			error "caught exception %s" exp;
>>> +			error "got a bad client %s" (sprintf "%-8s" (Connection.get_domstr con));
>>> +			Connection.mark_as_bad con;
>>> +			false
>>> +	in
>>> +	
>>> +	if newpacket then (
>>>    		let packet = Connection.pop_in con in
>>>    		let tid, rid, ty, data = Xenbus.Xb.Packet.unpack packet in
>>>    		(* As we don't log IO, do not call an unnecessary sanitize_data
>>> diff --git a/tools/ocaml/xenstored/xenstored.ml b/tools/ocaml/xenstored/xenstored.ml
>>> index 4045aed..cfc2a4b 100644
>>> --- a/tools/ocaml/xenstored/xenstored.ml
>>> +++ b/tools/ocaml/xenstored/xenstored.ml
>>> @@ -50,9 +50,10 @@
>>>
>>>    let process_domains store cons domains =
>>>    	let do_io_domain domain =
>>> -		let con = Connections.find_domain cons (Domain.get_id domain) in
>>> -		Process.do_input store cons domains con;
>>> -		Process.do_output store cons domains con in
>>> +		if not (Domain.is_bad_domain domain) then
>>> +			let con = Connections.find_domain cons (Domain.get_id domain) in
>>> +			Process.do_input store cons domains con;
>>> +			Process.do_output store cons domains con in
>>>    	Domains.iter domains do_io_domain
>>>
>>>    let sigusr1_handler store =
>>>
>>>
>>>> -----Original Message-----
>>>> From: Liuqiming (John)
>>>> Sent: Friday, July 05, 2013 11:14 AM
>>>> To: 'andrew.cooper3@citrix.com'
>>>> Cc: Yanqiangjun; 'xen-devel@lists.xen.org'
>>>> Subject: RE: [Xen-devel] oxenstored memory leak? seems related with
>>>> XSA-38
>>>>
>>>> Hi Andrew,
>>>>
>>>> Yes, I know the second patch commit by Ian Campbell.
>>>>
>>>> Actually I have applied all the patch from
>>>> http://wiki.xen.org/wiki/Security_Announcements#XSA_38_oxenstored_inc
>>>> orrect_handling_of_certain_Xenbus_ring_states
>>>>
>>>> But this issue still exists.
>>>>
>>>>> On 04/07/13 03:48, Liuqiming (John) wrote:
>>>>>> Hi all,
>>>>>>
>>>>>> Continue my test about oxenstored:
>>>>>>
>>>>>> I switch to original C xenstored and test my "broken" vm. The cxenstored
>>>>> do not have the "memory leak" issue.
>>>>>> So I compared the IO ring handler logic between cxenstored and
>>>>> oxenstored and find out the difference:
>>>>>>
>>>>>> In Cxenstord, after got the cons and prod value, a index check will be
>>>>> performed
>>>>>>
>>>>>> 	if (!check_indexes(cons, prod)) {
>>>>>> 		errno = EIO;
>>>>>> 		return -1;
>>>>>> 	}
>>>>>>
>>>>>> static bool check_indexes(XENSTORE_RING_IDX cons,
>>>>> XENSTORE_RING_IDX prod)
>>>>>> {
>>>>>> 	return ((prod - cons) <= XENSTORE_RING_SIZE);
>>>>>> }
>>>>>>
>>>>>> So any connection has prod - cons > XENSTORE_RING_SIZE will be
>>>> treated
>>>>> as "bad client", and cxenstored will not handle its IO ring msg any more.
>>>>>>
>>>>>> But in oxenstored, there is just a simple comparison between prod and
>>>>> cons
>>>>>>       if (prod == cons)
>>>>>> 		return 0;
>>>>>>
>>>>>> so there leaves a security hole to a guest vm user who can manipulate
>>>> prod
>>>>> and cons to make oxenstored increasing memory usage and out of service.
>>>>>>
>>>>>> I managed to create a patch to fix this and I'm testing it on xen4.2.2. Will
>>>>> send out soon.
>>>>>
>>>>> Are you aware of
>>>>>
>>>> http://xenbits.xen.org/gitweb/?p=xen.git;a=commitdiff;h=eeddfad1b339dca
>>>>> a787230f519a19de1cbc22ad8
>>>>> which is a second patch for XSA-38 ?
>>>>>
>>>>> ~Andrew
>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Liuqiming (John)
>>>>>>> Sent: Monday, July 01, 2013 9:47 PM
>>>>>>> To: 'xen-devel@lists.xen.org'; 'ian.jackson@eu.citrix.com';
>>>>>>> 'ian.campbell@citrix.com'
>>>>>>> Cc: Yanqiangjun
>>>>>>> Subject: oxenstored memory leak? seems related with XSA-38
>>>>>>>
>>>>>>> Hi all,
>>>>>>>
>>>>>>> I test starting vm using xen-4.2.2 release with oxenstored, and got a
>>>>> problem
>>>>>>> may be related with XSA-38
>>>>>>>
>>>>>
>>>> (http://lists.xen.org/archives/html/xen-announce/2013-02/msg00005.html).
>>>>>>>
>>>>>>> When vm started, oxenstored memory usage keep increasing, and it
>>>> took
>>>>>>> 1.5G memory at last. Vm hanged at loading OS screen.
>>>>>>>
>>>>>>> Here is the output of top:
>>>>>>>
>>>>>>> top - 20:18:32 up 1 day,  3:09,  5 users,  load average: 0.99, 0.63,
>>>> 0.32
>>>>>>> Tasks:   1 total,   1 running,   0 sleeping,   0 stopped,   0 zombie
>>>>>>> %Cpu(s):  4.5 us,  1.8 sy,  0.0 ni, 93.7 id,  0.0 wa,  0.0 hi,  0.0 si,
>>>>> 0.0
>>>>>>> st
>>>>>>> KiB Mem:  46919428 total, 46699012 used,   220416 free,    36916
>>>>>>> buffers
>>>>>>> KiB Swap:  2103292 total,        0 used,  2103292 free, 44260932
>>>>>>> cached
>>>>>>>
>>>>>>>     PID USER      PR  NI  VIRT  RES  SHR S  %CPU %MEM
>>>>> TIME+
>>>>>>> COMMAND
>>>>>>>     806 root      20   0  955m 926m 1068 R  99.9  2.0   4:54.14
>>>>>>> oxenstored
>>>>>>>
>>>>>>>
>>>>>>> top - 20:19:05 up 1 day,  3:09,  5 users,  load average: 0.99, 0.67,
>>>> 0.34
>>>>>>> Tasks:   1 total,   1 running,   0 sleeping,   0 stopped,   0 zombie
>>>>>>> %Cpu(s):  4.6 us,  1.6 sy,  0.0 ni, 93.8 id,  0.0 wa,  0.0 hi,  0.0 si,
>>>>> 0.0
>>>>>>> st
>>>>>>> KiB Mem:  46919428 total, 46708564 used,   210864 free,    36964
>>>>>>> buffers
>>>>>>> KiB Swap:  2103292 total,        0 used,  2103292 free, 44168380
>>>>>>> cached
>>>>>>>
>>>>>>>     PID USER      PR  NI  VIRT  RES  SHR S  %CPU %MEM
>>>>> TIME+
>>>>>>> COMMAND
>>>>>>>     806 root      20   0 1048m 1.0g 1068 R 100.2  2.2   5:27.03
>>>>>>> oxenstored
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> top - 20:21:35 up 1 day,  3:12,  5 users,  load average: 1.00, 0.80,
>>>> 0.44
>>>>>>> Tasks:   1 total,   1 running,   0 sleeping,   0 stopped,   0 zombie
>>>>>>> %Cpu(s):  4.7 us,  1.6 sy,  0.0 ni, 93.7 id,  0.0 wa,  0.0 hi,  0.0 si,
>>>>> 0.0
>>>>>>> st
>>>>>>> KiB Mem:  46919428 total, 46703052 used,   216376 free,    37208
>>>>>>> buffers
>>>>>>> KiB Swap:  2103292 total,        0 used,  2103292 free, 43682968
>>>>>>> cached
>>>>>>>
>>>>>>>     PID USER      PR  NI  VIRT  RES  SHR S  %CPU %MEM
>>>>> TIME+
>>>>>>> COMMAND
>>>>>>>     806 root      20   0 1551m 1.5g 1068 R 100.2  3.3   7:56.10
>>>>>>> oxenstored
>>>>>>>
>>>>>>> And oxenstored log got these over and over again:
>>>>>>>
>>>>>>> [20130701T12:27:14.290Z] D8           invalid
>>>>>>> device/suspend/event-channel
>>>>>>>
>>>>>>> ..
>>>>>>> [20130701T12:27:14.290Z] D8.1937077039 invalid
>>>>>>> /event-channel
>>>>>>>
>>>>>>>     ..
>>>>>>> [20130701T12:27:14.290Z] D8.1852727656
>>>>>>> invalid
>>>>>>>
>>>>>>>               ..
>>>>>>> [20130701T12:27:14.290Z] D8           debug
>>>>>>> [20130701T12:27:14.290Z] D8           debug
>>>>>>> [20130701T12:27:14.290Z] D8           debug
>>>>>>> [20130701T12:27:14.290Z] D8           debug
>>>>>>> [20130701T12:27:14.290Z] D8           debug
>>>>>>> [20130701T12:27:14.290Z] D8           debug
>>>>>>>
>>>>>>> My vm is a windows guest and has GPL PVDriver installed. This problem
>>>> is
>>>>>>> hard to reproduce, and after a hard reboot, everything looks normal.
>>>>>>>
>>>>>>> I guess it's something wrong with the xenbus IO Ring, so I investigated
>>>> the
>>>>>>> code:
>>>>>>>
>>>>>>> 1) oxenstored and xenbus in vm using a shared page to communicate
>>>> with
>>>>>>> each other
>>>>>>>     struct xenstore_domain_interface {
>>>>>>>       char req[XENSTORE_RING_SIZE]; /* Requests to xenstore
>>>> daemon.
>>>>> */
>>>>>>>       char rsp[XENSTORE_RING_SIZE]; /* Replies and async watch
>>>> events.
>>>>> */
>>>>>>>       XENSTORE_RING_IDX req_cons, req_prod;
>>>>>>>       XENSTORE_RING_IDX rsp_cons, rsp_prod;
>>>>>>> };
>>>>>>>
>>>>>>> 2) xenbus in vm put request in req and increase req_prod, then send a
>>>>> event
>>>>>>> to oxenstored
>>>>>>> 3) oxenstored calculates how many to read using req_cons and
>>>> req_prod,
>>>>>>> and after read oxenstored increase req_cons to make it equals req_prod
>>>>>>> which means no request pending.
>>>>>>> 4) oxenstored put responds in rsp and increase rsp_prod, then send a
>>>>> event
>>>>>>> to vm, xenbus in vm using similar logic to handle the rsp ring.
>>>>>>>
>>>>>>>    Am I correct?
>>>>>>>
>>>>>>> So, I'm curious about what happened when req_cons larger than
>>>>> req_prod
>>>>>>> (this can be caused by buggy PV Driver or malicious guest user), it seems
>>>>>>> oxenstored will fell in a endless loop.
>>>>>>>
>>>>>>> Is this what XSA-38 talk about?
>>>>>>>
>>>>>>> I built a pvdriver which will set req_prod to 0 after several xenstore
>>>>>>> operation, and test it on xen-unstable.hg make sure all XSA-38 patches
>>>>>>> applied.
>>>>>>> It seems that the problem I first met reproduced. Oxenstored will took a
>>>>> lot
>>>>>>> of memory eventually.
>>>>>>>
>>>>>>> Could anyone help me about this issue?
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>> _______________________________________________
>>>>>> Xen-devel mailing list
>>>>>> Xen-devel@lists.xen.org
>>>>>> http://lists.xen.org/xen-devel
>>>
>>> _______________________________________________
>>> Xen-devel mailing list
>>> Xen-devel@lists.xen.org
>>> http://lists.xen.org/xen-devel
>>>
>>
>>
>> _______________________________________________
>> Xen-devel mailing list
>> Xen-devel@lists.xen.org
>> http://lists.xen.org/xen-devel
>
>

  reply	other threads:[~2013-07-16  9:56 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <C462DF05CCBDDC42BC667291C037376D39205981@SZXEML506-MBS.china.huawei.com>
2013-07-05  3:14 ` oxenstored memory leak? seems related with XSA-38 Liuqiming (John)
2013-07-05  9:07 ` Liuqiming (John)
2013-07-15 12:13   ` David Scott
2013-07-16  5:19     ` Liuqiming (John)
2013-07-16  9:46     ` Ian Campbell
2013-07-16  9:56       ` David Scott [this message]
2013-07-19 11:56         ` Ian Campbell
2013-07-22 12:08           ` Liuqiming (John)
2013-07-22 21:37             ` Ian Campbell
2013-07-04  2:48 Liuqiming (John)
2013-07-04  8:52 ` Andrew Cooper
  -- strict thread matches above, loose matches on Subject: below --
2013-07-01 13:47 Liuqiming (John)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=51E518E5.7000503@eu.citrix.com \
    --to=dave.scott@eu.citrix.com \
    --cc=Ian.Campbell@citrix.com \
    --cc=andrew.cooper3@citrix.com \
    --cc=john.liuqiming@huawei.com \
    --cc=xen-devel@lists.xen.org \
    --cc=yanqiangjun@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).