All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Kirill A. Shutemov" <kirill@shutemov.name>
To: Mike Rapoport <rppt@kernel.org>
Cc: linux-kernel@vger.kernel.org, Alan Cox <alan@linux.intel.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Andy Lutomirski <luto@kernel.org>,
	Christopher Lameter <cl@linux.com>,
	Dave Hansen <dave.hansen@linux.intel.com>,
	Idan Yaniv <idan.yaniv@ibm.com>,
	James Bottomley <jejb@linux.ibm.com>,
	Matthew Wilcox <willy@infradead.org>,
	Peter Zijlstra <peterz@infradead.org>,
	"Reshetova, Elena" <elena.reshetova@intel.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	Tycho Andersen <tycho@tycho.ws>,
	linux-api@vger.kernel.org, linux-mm@kvack.org,
	Mike Rapoport <rppt@linux.ibm.com>
Subject: Re: [RFC PATCH v2 4/5] mm: secretmem: use PMD-size pages to amortize direct map fragmentation
Date: Mon, 13 Jul 2020 14:05:05 +0300	[thread overview]
Message-ID: <20200713110505.mesvinqjbj7imsdz@box> (raw)
In-Reply-To: <20200706172051.19465-5-rppt@kernel.org>

On Mon, Jul 06, 2020 at 08:20:50PM +0300, Mike Rapoport wrote:
> From: Mike Rapoport <rppt@linux.ibm.com>
> 
> Removing a PAGE_SIZE page from the direct map every time such page is
> allocated for a secret memory mapping will cause severe fragmentation of
> the direct map. This fragmentation can be reduced by using PMD-size pages
> as a pool for small pages for secret memory mappings.
> 
> Add a gen_pool per secretmem inode and lazily populate this pool with
> PMD-size pages.
> 
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> ---
>  mm/secretmem.c | 107 ++++++++++++++++++++++++++++++++++++++++---------
>  1 file changed, 88 insertions(+), 19 deletions(-)
> 
> diff --git a/mm/secretmem.c b/mm/secretmem.c
> index df8f8c958cc2..c6fcf6d76951 100644
> --- a/mm/secretmem.c
> +++ b/mm/secretmem.c
> @@ -5,6 +5,7 @@
>  #include <linux/memfd.h>
>  #include <linux/printk.h>
>  #include <linux/pagemap.h>
> +#include <linux/genalloc.h>
>  #include <linux/pseudo_fs.h>
>  #include <linux/set_memory.h>
>  #include <linux/sched/signal.h>
> @@ -23,24 +24,66 @@
>  #define SECRETMEM_UNCACHED	0x2
>  
>  struct secretmem_ctx {
> +	struct gen_pool *pool;
>  	unsigned int mode;
>  };
>  
> -static struct page *secretmem_alloc_page(gfp_t gfp)
> +static int secretmem_pool_increase(struct secretmem_ctx *ctx, gfp_t gfp)
>  {
> -	/*
> -	 * FIXME: use a cache of large pages to reduce the direct map
> -	 * fragmentation
> -	 */
> -	return alloc_page(gfp);
> +	unsigned long nr_pages = (1 << HPAGE_PMD_ORDER);
> +	struct gen_pool *pool = ctx->pool;
> +	unsigned long addr;
> +	struct page *page;
> +	int err;
> +
> +	page = alloc_pages(gfp, HPAGE_PMD_ORDER);
> +	if (!page)
> +		return -ENOMEM;
> +
> +	addr = (unsigned long)page_address(page);
> +	split_page(page, HPAGE_PMD_ORDER);
> +
> +	err = gen_pool_add(pool, addr, HPAGE_PMD_SIZE, NUMA_NO_NODE);
> +	if (err) {
> +		__free_pages(page, HPAGE_PMD_ORDER);
> +		return err;
> +	}
> +
> +	__kernel_map_pages(page, nr_pages, 0);

It's worth nothing that unlike flush_tlb_kernel_range(),
__kernel_map_pages() only flushed local TLB, so other CPU may still have
access to the page. It's shouldn't be a blocker, but deserve a comment.


> +
> +	return 0;
> +}
> +
> +static struct page *secretmem_alloc_page(struct secretmem_ctx *ctx,
> +					 gfp_t gfp)
> +{
> +	struct gen_pool *pool = ctx->pool;
> +	unsigned long addr;
> +	struct page *page;
> +	int err;
> +
> +	if (gen_pool_avail(pool) < PAGE_SIZE) {
> +		err = secretmem_pool_increase(ctx, gfp);
> +		if (err)
> +			return NULL;
> +	}
> +
> +	addr = gen_pool_alloc(pool, PAGE_SIZE);
> +	if (!addr)
> +		return NULL;
> +
> +	page = virt_to_page(addr);
> +	get_page(page);
> +
> +	return page;
>  }
>  
>  static vm_fault_t secretmem_fault(struct vm_fault *vmf)
>  {
> +	struct secretmem_ctx *ctx = vmf->vma->vm_file->private_data;
>  	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
>  	struct inode *inode = file_inode(vmf->vma->vm_file);
>  	pgoff_t offset = vmf->pgoff;
> -	unsigned long addr;
>  	struct page *page;
>  	int ret = 0;
>  
> @@ -49,7 +92,7 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf)
>  
>  	page = find_get_entry(mapping, offset);
>  	if (!page) {
> -		page = secretmem_alloc_page(vmf->gfp_mask);
> +		page = secretmem_alloc_page(ctx, vmf->gfp_mask);
>  		if (!page)
>  			return vmf_error(-ENOMEM);
>  
> @@ -57,14 +100,8 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf)
>  		if (unlikely(ret))
>  			goto err_put_page;
>  
> -		ret = set_direct_map_invalid_noflush(page);
> -		if (ret)
> -			goto err_del_page_cache;
> -
> -		addr = (unsigned long)page_address(page);
> -		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
> -
>  		__SetPageUptodate(page);
> +		set_page_private(page, (unsigned long)ctx);
>  
>  		ret = VM_FAULT_LOCKED;
>  	}
> @@ -72,8 +109,6 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf)
>  	vmf->page = page;
>  	return ret;
>  
> -err_del_page_cache:
> -	delete_from_page_cache(page);
>  err_put_page:
>  	put_page(page);
>  	return vmf_error(ret);
> @@ -155,7 +190,11 @@ static int secretmem_migratepage(struct address_space *mapping,
>  
>  static void secretmem_freepage(struct page *page)
>  {
> -	set_direct_map_default_noflush(page);
> +	unsigned long addr = (unsigned long)page_address(page);
> +	struct secretmem_ctx *ctx = (struct secretmem_ctx *)page_private(page);
> +	struct gen_pool *pool = ctx->pool;
> +
> +	gen_pool_free(pool, addr, PAGE_SIZE);
>  }
>  
>  static const struct address_space_operations secretmem_aops = {
> @@ -179,13 +218,18 @@ struct file *secretmem_file_create(const char *name, unsigned int flags)
>  	if (!ctx)
>  		goto err_free_inode;
>  
> +	ctx->pool = gen_pool_create(PAGE_SHIFT, NUMA_NO_NODE);
> +	if (!ctx->pool)
> +		goto err_free_ctx;
> +
>  	file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem",
>  				 O_RDWR, &secretmem_fops);
>  	if (IS_ERR(file))
> -		goto err_free_ctx;
> +		goto err_free_pool;
>  
>  	mapping_set_unevictable(inode->i_mapping);
>  
> +	inode->i_private = ctx;
>  	inode->i_mapping->private_data = ctx;
>  	inode->i_mapping->a_ops = &secretmem_aops;
>  
> @@ -197,6 +241,8 @@ struct file *secretmem_file_create(const char *name, unsigned int flags)
>  
>  	return file;
>  
> +err_free_pool:
> +	gen_pool_destroy(ctx->pool);
>  err_free_ctx:
>  	kfree(ctx);
>  err_free_inode:
> @@ -204,11 +250,34 @@ struct file *secretmem_file_create(const char *name, unsigned int flags)
>  	return file;
>  }
>  
> +static void secretmem_cleanup_chunk(struct gen_pool *pool,
> +				    struct gen_pool_chunk *chunk, void *data)
> +{
> +	unsigned long start = chunk->start_addr;
> +	unsigned long end = chunk->end_addr;
> +	unsigned long nr_pages, addr;
> +
> +	nr_pages = (end - start + 1) / PAGE_SIZE;
> +	__kernel_map_pages(virt_to_page(start), nr_pages, 1);
> +
> +	for (addr = start; addr < end; addr += PAGE_SIZE)
> +		put_page(virt_to_page(addr));
> +}
> +
> +static void secretmem_cleanup_pool(struct secretmem_ctx *ctx)
> +{
> +	struct gen_pool *pool = ctx->pool;
> +
> +	gen_pool_for_each_chunk(pool, secretmem_cleanup_chunk, ctx);
> +	gen_pool_destroy(pool);
> +}
> +
>  static void secretmem_evict_inode(struct inode *inode)
>  {
>  	struct secretmem_ctx *ctx = inode->i_private;
>  
>  	truncate_inode_pages_final(&inode->i_data);
> +	secretmem_cleanup_pool(ctx);
>  	clear_inode(inode);
>  	kfree(ctx);
>  }
> -- 
> 2.26.2
> 

-- 
 Kirill A. Shutemov

  reply	other threads:[~2020-07-13 11:05 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-07-06 17:20 [RFC PATCH v2 0/5] mm: extend memfd with ability to create "secret" memory areas Mike Rapoport
2020-07-06 17:20 ` [RFC PATCH v2 1/5] mm: make HPAGE_PxD_{SHIFT,MASK,SIZE} always available Mike Rapoport
2020-07-07  5:07   ` Hugh Dickins
2020-07-07  6:47     ` Mike Rapoport
2020-07-10 16:40     ` Andrea Arcangeli
2020-07-10 16:57       ` Matthew Wilcox
2020-07-10 17:08         ` Andrea Arcangeli
2020-07-10 17:12         ` Mike Rapoport
2020-07-06 17:20 ` [RFC PATCH v2 2/5] mmap: make mlock_future_check() global Mike Rapoport
2020-07-06 17:20 ` [RFC PATCH v2 3/5] mm: extend memfd with ability to create "secret" memory areas Mike Rapoport
2020-07-13 10:58   ` Kirill A. Shutemov
2020-07-13 15:31     ` Mike Rapoport
2020-07-06 17:20 ` [RFC PATCH v2 4/5] mm: secretmem: use PMD-size pages to amortize direct map fragmentation Mike Rapoport
2020-07-13 11:05   ` Kirill A. Shutemov [this message]
2020-07-13 15:32     ` Mike Rapoport
2020-07-06 17:20 ` [RFC PATCH v2 5/5] mm: secretmem: add ability to reserve memory at boot Mike Rapoport
2020-07-17  8:36 ` [RFC PATCH v2 0/5] mm: extend memfd with ability to create "secret" memory areas Pavel Machek
2020-07-17 14:43   ` James Bottomley

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200713110505.mesvinqjbj7imsdz@box \
    --to=kirill@shutemov.name \
    --cc=akpm@linux-foundation.org \
    --cc=alan@linux.intel.com \
    --cc=cl@linux.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=elena.reshetova@intel.com \
    --cc=idan.yaniv@ibm.com \
    --cc=jejb@linux.ibm.com \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=luto@kernel.org \
    --cc=peterz@infradead.org \
    --cc=rppt@kernel.org \
    --cc=rppt@linux.ibm.com \
    --cc=tglx@linutronix.de \
    --cc=tycho@tycho.ws \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.