All of lore.kernel.org
 help / color / mirror / Atom feed
From: Yinghai Lu <yinghai@kernel.org>
To: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>,
	Jeremy Fitzhardinge <jeremy@goop.org>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"tglx@linutronix.de" <tglx@linutronix.de>,
	"x86@kernel.org" <x86@kernel.org>,
	Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>,
	Jan Beulich <JBeulich@novell.com>
Subject: Re: [PATCH] x86/mm/init: respect memblock reserved regions when destroying mappings
Date: Tue, 08 Feb 2011 08:04:08 -0800	[thread overview]
Message-ID: <4D516978.9050309@kernel.org> (raw)
In-Reply-To: <alpine.DEB.2.00.1102081401390.7277@kaball-desktop>

On 02/08/2011 06:03 AM, Stefano Stabellini wrote:
> On Tue, 8 Feb 2011, Yinghai Lu wrote:
>> On Mon, Feb 7, 2011 at 11:00 AM, Stefano Stabellini
>> <stefano.stabellini@eu.citrix.com> wrote:
>>> On Mon, 7 Feb 2011, Stefano Stabellini wrote:
>>>> On Sun, 6 Feb 2011, Yinghai Lu wrote:
>>>>> On 02/05/2011 11:30 PM, H. Peter Anvin wrote:
>>>>>> On 02/05/2011 11:02 PM, Yinghai Lu wrote:
>>>>>>> why not just move calling cleanup_highmap down?
>>>>>>>
>>>>>>> something like attached patch.
>>>>>>
>>>>>> This patch looks very clean and looks on the surface of it like it is
>>>>>> removing some ugly ad hoc code, but (as always) it needs a description
>>>>>> about the problem it solves and why it is correct.
>>>>>
>>>>> Sure.
>>>>>
>>>>>
>>>>> Jeremy and xen guys, can you please check if it works well with xen ?
>>>>>
>>>>
>>>> Actually this patch makes things worse on xen, because before
>>>> cleanup_highmap() wasn't called at all on xen (on purpose) and now it
>>>> is, fully destroying all the mappings we have at _end.
>>>>
>>>> Can we add a check on memblock reserved regions in cleanup_highmap()?
>>>
>>> In case you are wondering how Yinghai Lu's patch would look like with
>>> the added check, here it is:
>>>
>>>
>>> diff --git a/arch/x86/include/asm/memblock.h b/arch/x86/include/asm/memblock.h
>>> index 19ae14b..184f778 100644
>>> --- a/arch/x86/include/asm/memblock.h
>>> +++ b/arch/x86/include/asm/memblock.h
>>> @@ -3,6 +3,7 @@
>>>
>>>  #define ARCH_DISCARD_MEMBLOCK
>>>
>>> +bool memblock_check_reserved_size(u64 *addrp, u64 *sizep, u64 align);
>>>  u64 memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align);
>>>  void memblock_x86_to_bootmem(u64 start, u64 end);
>>>
>>> diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
>>> index 975f709..28686b6 100644
>>> --- a/arch/x86/include/asm/pgtable_64.h
>>> +++ b/arch/x86/include/asm/pgtable_64.h
>>> @@ -165,7 +165,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
>>>  #define __swp_entry_to_pte(x)          ((pte_t) { .pte = (x).val })
>>>
>>>  extern int kern_addr_valid(unsigned long addr);
>>> -extern void cleanup_highmap(void);
>>> +extern void cleanup_highmap(unsigned long end);
>>>
>>>  #define HAVE_ARCH_UNMAPPED_AREA
>>>  #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
>>> diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
>>> index 2d2673c..5655c22 100644
>>> --- a/arch/x86/kernel/head64.c
>>> +++ b/arch/x86/kernel/head64.c
>>> @@ -77,9 +77,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
>>>        /* Make NULL pointers segfault */
>>>        zap_identity_mappings();
>>>
>>> -       /* Cleanup the over mapped high alias */
>>> -       cleanup_highmap();
>>> -
>>>        max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
>>>
>>>        for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
>>> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
>>> index d3cfe26..91afde6 100644
>>> --- a/arch/x86/kernel/setup.c
>>> +++ b/arch/x86/kernel/setup.c
>>> @@ -297,6 +297,9 @@ static void __init init_gbpages(void)
>>>  static inline void init_gbpages(void)
>>>  {
>>>  }
>>> +static void __init cleanup_highmap(unsigned long end)
>>> +{
>>> +}
>>>  #endif
>>>
>>>  static void __init reserve_brk(void)
>>> @@ -922,6 +925,9 @@ void __init setup_arch(char **cmdline_p)
>>>         */
>>>        reserve_brk();
>>>
>>> +       /* Cleanup the over mapped high alias after _brk_end*/
>>> +       cleanup_highmap(_brk_end);
>>> +
>>>        memblock.current_limit = get_max_mapped();
>>>        memblock_x86_fill();
>>>
>>> diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
>>> index 947f42a..f13ff3a 100644
>>> --- a/arch/x86/mm/init.c
>>> +++ b/arch/x86/mm/init.c
>>> @@ -279,25 +279,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
>>>        load_cr3(swapper_pg_dir);
>>>  #endif
>>>
>>> -#ifdef CONFIG_X86_64
>>> -       if (!after_bootmem && !start) {
>>> -               pud_t *pud;
>>> -               pmd_t *pmd;
>>> -
>>> -               mmu_cr4_features = read_cr4();
>>> -
>>> -               /*
>>> -                * _brk_end cannot change anymore, but it and _end may be
>>> -                * located on different 2M pages. cleanup_highmap(), however,
>>> -                * can only consider _end when it runs, so destroy any
>>> -                * mappings beyond _brk_end here.
>>> -                */
>>> -               pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
>>> -               pmd = pmd_offset(pud, _brk_end - 1);
>>> -               while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
>>> -                       pmd_clear(pmd);
>>> -       }
>>> -#endif
>>>        __flush_tlb_all();
>>>
>>>        if (!after_bootmem && e820_table_end > e820_table_start)
>>> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
>>> index 71a5929..028c49e 100644
>>> --- a/arch/x86/mm/init_64.c
>>> +++ b/arch/x86/mm/init_64.c
>>> @@ -297,18 +297,26 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
>>>  * rounded up to the 2MB boundary. This catches the invalid pmds as
>>>  * well, as they are located before _text:
>>>  */
>>> -void __init cleanup_highmap(void)
>>> +void __init cleanup_highmap(unsigned long end)
>>>  {
>>>        unsigned long vaddr = __START_KERNEL_map;
>>> -       unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
>>>        pmd_t *pmd = level2_kernel_pgt;
>>>        pmd_t *last_pmd = pmd + PTRS_PER_PMD;
>>> +       u64 size, addrp;
>>> +       bool changed;
>>> +
>>> +       end = roundup(end, PMD_SIZE) - 1;
>>>
>>>        for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
>>>                if (pmd_none(*pmd))
>>>                        continue;
>>> -               if (vaddr < (unsigned long) _text || vaddr > end)
>>> -                       set_pmd(pmd, __pmd(0));
>>> +               if (vaddr < (unsigned long) _text || vaddr > end) {
>>> +                       addrp = __pa(vaddr);
>>> +                       size = PMD_SIZE;
>>> +                       changed = memblock_check_reserved_size(&addrp, &size, PMD_SIZE);
>>> +                       if (!changed && size)
>>> +                               set_pmd(pmd, __pmd(0));
>>> +               }
>>
>> for native path, memblock_check_reserved_size() are called 256 times
>> without obvious reasons.
> 
> 
> what about this patch, does it look like a reasonable solution?
> 
> 
> 
> diff --git a/arch/x86/include/asm/memblock.h b/arch/x86/include/asm/memblock.h
> index 19ae14b..184f778 100644
> --- a/arch/x86/include/asm/memblock.h
> +++ b/arch/x86/include/asm/memblock.h
> @@ -3,6 +3,7 @@
>  
>  #define ARCH_DISCARD_MEMBLOCK
>  
> +bool memblock_check_reserved_size(u64 *addrp, u64 *sizep, u64 align);
>  u64 memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align);
>  void memblock_x86_to_bootmem(u64 start, u64 end);
>  
> diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
> index 975f709..28686b6 100644
> --- a/arch/x86/include/asm/pgtable_64.h
> +++ b/arch/x86/include/asm/pgtable_64.h
> @@ -165,7 +165,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
>  #define __swp_entry_to_pte(x)		((pte_t) { .pte = (x).val })
>  
>  extern int kern_addr_valid(unsigned long addr);
> -extern void cleanup_highmap(void);
> +extern void cleanup_highmap(unsigned long end);
>  
>  #define HAVE_ARCH_UNMAPPED_AREA
>  #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
> diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
> index 2d2673c..5655c22 100644
> --- a/arch/x86/kernel/head64.c
> +++ b/arch/x86/kernel/head64.c
> @@ -77,9 +77,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
>  	/* Make NULL pointers segfault */
>  	zap_identity_mappings();
>  
> -	/* Cleanup the over mapped high alias */
> -	cleanup_highmap();
> -
>  	max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
>  
>  	for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index d3cfe26..91afde6 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -297,6 +297,9 @@ static void __init init_gbpages(void)
>  static inline void init_gbpages(void)
>  {
>  }
> +static void __init cleanup_highmap(unsigned long end)
> +{
> +}
>  #endif
>  
>  static void __init reserve_brk(void)
> @@ -922,6 +925,9 @@ void __init setup_arch(char **cmdline_p)
>  	 */
>  	reserve_brk();
>  
> +	/* Cleanup the over mapped high alias after _brk_end*/
> +	cleanup_highmap(_brk_end);
> +
>  	memblock.current_limit = get_max_mapped();
>  	memblock_x86_fill();
>  
> diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
> index 947f42a..f13ff3a 100644
> --- a/arch/x86/mm/init.c
> +++ b/arch/x86/mm/init.c
> @@ -279,25 +279,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
>  	load_cr3(swapper_pg_dir);
>  #endif
>  
> -#ifdef CONFIG_X86_64
> -	if (!after_bootmem && !start) {
> -		pud_t *pud;
> -		pmd_t *pmd;
> -
> -		mmu_cr4_features = read_cr4();
> -
> -		/*
> -		 * _brk_end cannot change anymore, but it and _end may be
> -		 * located on different 2M pages. cleanup_highmap(), however,
> -		 * can only consider _end when it runs, so destroy any
> -		 * mappings beyond _brk_end here.
> -		 */
> -		pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
> -		pmd = pmd_offset(pud, _brk_end - 1);
> -		while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
> -			pmd_clear(pmd);
> -	}
> -#endif
>  	__flush_tlb_all();
>  
>  	if (!after_bootmem && e820_table_end > e820_table_start)
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index 71a5929..90a64de 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -297,12 +297,25 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
>   * rounded up to the 2MB boundary. This catches the invalid pmds as
>   * well, as they are located before _text:
>   */
> -void __init cleanup_highmap(void)
> +void __init cleanup_highmap(unsigned long end)
>  {
>  	unsigned long vaddr = __START_KERNEL_map;
> -	unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
>  	pmd_t *pmd = level2_kernel_pgt;
>  	pmd_t *last_pmd = pmd + PTRS_PER_PMD;
> +	u64 size, addrp;
> +	bool changed;
> +
> +	end = roundup(end, PMD_SIZE) - 1;
> +
> +	/* check for reserved regions after end */
> +	addrp = __pa(end);
> +	size = (PTRS_PER_PMD * PMD_SIZE + vaddr) - end;
> +	changed = memblock_check_reserved_size(&addrp, &size, PMD_SIZE);
> +	if (changed || !size) {
> +		/* reserved regions found, avoid removing mappings after end */
> +		pud_t *pud = pud_offset(pgd_offset_k(end), end);
> +		last_pmd = pmd_offset(pud, end);
> +	}
>  
>  	for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
>  		if (pmd_none(*pmd))

test case:
native path, bootloader have initrd overlap with [0,512M)...

We will not get highmap cleared.

Maybe we have to keep two steps method.

Yinghai

  reply	other threads:[~2011-02-08 16:05 UTC|newest]

Thread overview: 39+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-01-31 15:18 [PATCH] x86/mm/init: respect memblock reserved regions when destroying mappings Stefano Stabellini
2011-02-02 20:15 ` Jeremy Fitzhardinge
2011-02-03  5:05 ` H. Peter Anvin
2011-02-03 11:25   ` Stefano Stabellini
2011-02-03 17:02     ` H. Peter Anvin
2011-02-04 11:35       ` Stefano Stabellini
2011-02-05  1:18         ` Jeremy Fitzhardinge
2011-02-06  7:02           ` Yinghai Lu
2011-02-06  7:30             ` H. Peter Anvin
2011-02-06 17:49               ` Yinghai Lu
2011-02-06 19:24               ` Yinghai Lu
2011-02-07 16:50                 ` Stefano Stabellini
2011-02-07 18:04                   ` Yinghai Lu
2011-02-07 18:58                     ` Stefano Stabellini
2011-02-07 19:00                       ` Yinghai Lu
2011-02-07 19:18                         ` Yinghai Lu
2011-02-07 21:56                         ` Jeremy Fitzhardinge
2011-02-08  3:12                           ` Yinghai Lu
2011-02-08  4:56                             ` Jeremy Fitzhardinge
2011-02-08  5:09                               ` Yinghai Lu
2011-02-08 14:55                                 ` Konrad Rzeszutek Wilk
2011-02-08 19:24                                   ` Jeremy Fitzhardinge
2011-02-08 20:26                                     ` Stefano Stabellini
2011-02-08 19:34                             ` H. Peter Anvin
2011-02-10 23:48                               ` Jeremy Fitzhardinge
2011-02-10 23:57                                 ` Yinghai Lu
2011-02-11  0:35                                   ` H. Peter Anvin
2011-02-11  0:54                                     ` Yinghai Lu
2011-02-14 16:26                                       ` Konrad Rzeszutek Wilk
2011-02-14 17:55                                         ` Yinghai Lu
2011-02-14 17:58                                           ` Stefano Stabellini
2011-02-14 18:09                                             ` Yinghai Lu
2011-02-14 20:02                                           ` H. Peter Anvin
2011-02-16 17:36                                             ` Stefano Stabellini
2011-02-07 19:00                   ` Stefano Stabellini
2011-02-08  5:16                     ` Yinghai Lu
2011-02-08 14:03                       ` Stefano Stabellini
2011-02-08 16:04                         ` Yinghai Lu [this message]
2011-02-07 16:02           ` Stefano Stabellini

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4D516978.9050309@kernel.org \
    --to=yinghai@kernel.org \
    --cc=JBeulich@novell.com \
    --cc=hpa@zytor.com \
    --cc=jeremy@goop.org \
    --cc=konrad.wilk@oracle.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=stefano.stabellini@eu.citrix.com \
    --cc=tglx@linutronix.de \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.