Re: [PATCH v2] x86, uaccess: introduce copy_from_iter_wt for pmem / writethrough operations

linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: Ross Zwisler <ross.zwisler@linux.intel.com>
To: Dan Williams <dan.j.williams@intel.com>
Cc: viro@zeniv.linux.org.uk, Jan Kara <jack@suse.cz>,
	Matthew Wilcox <mawilcox@microsoft.com>,
	x86@kernel.org, linux-kernel@vger.kernel.org, hch@lst.de,
	linux-block@vger.kernel.org, linux-nvdimm@lists.01.org,
	jmoyer@redhat.com, Ingo Molnar <mingo@redhat.com>,
	"H. Peter Anvin" <hpa@zytor.com>,
	linux-fsdevel@vger.kernel.org,
	Thomas Gleixner <tglx@linutronix.de>,
	ross.zwisler@linux.intel.com
Subject: Re: [PATCH v2] x86, uaccess: introduce copy_from_iter_wt for pmem / writethrough operations
Date: Mon, 8 May 2017 14:32:20 -0600	[thread overview]
Message-ID: <20170508203220.GA29073@linux.intel.com> (raw)
In-Reply-To: <149340820800.28724.16189291963486607562.stgit@dwillia2-desk3.amr.corp.intel.com>

On Fri, Apr 28, 2017 at 12:39:12PM -0700, Dan Williams wrote:
> The pmem driver has a need to transfer data with a persistent memory
> destination and be able to rely on the fact that the destination writes
> are not cached. It is sufficient for the writes to be flushed to a
> cpu-store-buffer (non-temporal / "movnt" in x86 terms), as we expect
> userspace to call fsync() to ensure data-writes have reached a
> power-fail-safe zone in the platform. The fsync() triggers a REQ_FUA or
> REQ_FLUSH to the pmem driver which will turn around and fence previous
> writes with an "sfence".
> 
> Implement __copy_from_user_inatomic_wt, memcpy_page_wt, and memcpy_wt,
> which guarantee that the destination buffer is not dirty in the cpu cache
> on completion. The new copy_from_iter_wt and sub-routines will be used
> to replace the "pmem api" (include/linux/pmem.h +
> arch/x86/include/asm/pmem.h). The availability of copy_from_iter_wt()
> and memcpy_wt() is gated by the CONFIG_ARCH_HAS_UACCESS_WT config
> symbol; they fall back to copy_from_iter_nocache() and plain memcpy()
> otherwise.
> 
> This is meant to satisfy the concern from Linus that if a driver wants
> to do something beyond the normal nocache semantics it should be
> something private to that driver [1], and Al's concern that anything
> uaccess related belongs with the rest of the uaccess code [2].
> 
> [1]: https://lists.01.org/pipermail/linux-nvdimm/2017-January/008364.html
> [2]: https://lists.01.org/pipermail/linux-nvdimm/2017-April/009942.html
> 
> Cc: <x86@kernel.org>
> Cc: Jan Kara <jack@suse.cz>
> Cc: Jeff Moyer <jmoyer@redhat.com>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: Al Viro <viro@zeniv.linux.org.uk>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Matthew Wilcox <mawilcox@microsoft.com>
> Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
<>
> diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
> index c5504b9a472e..07ded30c7e89 100644
> --- a/arch/x86/include/asm/uaccess_64.h
> +++ b/arch/x86/include/asm/uaccess_64.h
> @@ -171,6 +171,10 @@ unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigne
>  extern long __copy_user_nocache(void *dst, const void __user *src,
>  				unsigned size, int zerorest);
>  
> +extern long __copy_user_wt(void *dst, const void __user *src, unsigned size);
> +extern void memcpy_page_wt(char *to, struct page *page, size_t offset,
> +			   size_t len);
> +
>  static inline int
>  __copy_from_user_inatomic_nocache(void *dst, const void __user *src,
>  				  unsigned size)
> @@ -179,6 +183,13 @@ __copy_from_user_inatomic_nocache(void *dst, const void __user *src,
>  	return __copy_user_nocache(dst, src, size, 0);
>  }
>  
> +static inline int
> +__copy_from_user_inatomic_wt(void *dst, const void __user *src, unsigned size)
> +{
> +	kasan_check_write(dst, size);
> +	return __copy_user_wt(dst, src, size);
> +}
> +
>  unsigned long
>  copy_user_handle_tail(char *to, char *from, unsigned len);
>  
> diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
> index 3b7c40a2e3e1..0aeff66a022f 100644
> --- a/arch/x86/lib/usercopy_64.c
> +++ b/arch/x86/lib/usercopy_64.c
> @@ -7,6 +7,7 @@
>   */
>  #include <linux/export.h>
>  #include <linux/uaccess.h>
> +#include <linux/highmem.h>
>  
>  /*
>   * Zero Userspace
> @@ -73,3 +74,130 @@ copy_user_handle_tail(char *to, char *from, unsigned len)
>  	clac();
>  	return len;
>  }
> +
> +#ifdef CONFIG_ARCH_HAS_UACCESS_WT
> +/**
> + * clean_cache_range - write back a cache range with CLWB
> + * @addr:	virtual start address
> + * @size:	number of bytes to write back
> + *
> + * Write back a cache range using the CLWB (cache line write back)
> + * instruction. Note that @size is internally rounded up to be cache
> + * line size aligned.
> + */
> +static void clean_cache_range(void *addr, size_t size)
> +{
> +	u16 x86_clflush_size = boot_cpu_data.x86_clflush_size;
> +	unsigned long clflush_mask = x86_clflush_size - 1;
> +	void *vend = addr + size;
> +	void *p;
> +
> +	for (p = (void *)((unsigned long)addr & ~clflush_mask);
> +	     p < vend; p += x86_clflush_size)
> +		clwb(p);
> +}
> +
> +long __copy_user_wt(void *dst, const void __user *src, unsigned size)
> +{
> +	unsigned long flushed, dest = (unsigned long) dst;
> +	long rc = __copy_user_nocache(dst, src, size, 0);
> +
> +	/*
> +	 * __copy_user_nocache() uses non-temporal stores for the bulk
> +	 * of the transfer, but we need to manually flush if the
> +	 * transfer is unaligned. A cached memory copy is used when
> +	 * destination or size is not naturally aligned. That is:
> +	 *   - Require 8-byte alignment when size is 8 bytes or larger.
> +	 *   - Require 4-byte alignment when size is 4 bytes.
> +	 */
> +	if (size < 8) {
> +		if (!IS_ALIGNED(dest, 4) || size != 4)
> +			clean_cache_range(dst, 1);
> +	} else {
> +		if (!IS_ALIGNED(dest, 8)) {
> +			dest = ALIGN(dest, boot_cpu_data.x86_clflush_size);
> +			clean_cache_range(dst, 1);
> +		}
> +
> +		flushed = dest - (unsigned long) dst;
> +		if (size > flushed && !IS_ALIGNED(size - flushed, 8))
> +			clean_cache_range(dst + size - 1, 1);
> +	}
> +
> +	return rc;
> +}
> +
> +void memcpy_wt(void *_dst, const void *_src, size_t size)
> +{
> +	unsigned long dest = (unsigned long) _dst;
> +	unsigned long source = (unsigned long) _src;
> +
> +	/* cache copy and flush to align dest */
> +	if (!IS_ALIGNED(dest, 8)) {
> +		unsigned len = min_t(unsigned, size, ALIGN(dest, 8) - dest);
> +
> +		memcpy((void *) dest, (void *) source, len);
> +		clean_cache_range((void *) dest, len);
> +		dest += len;
> +		source += len;
> +		size -= len;
> +		if (!size)
> +			return;
> +	}
> +
> +	/* 4x8 movnti loop */
> +	while (size >= 32) {
> +		asm("movq    (%0), %%r8\n"
> +		    "movq   8(%0), %%r9\n"
> +		    "movq  16(%0), %%r10\n"
> +		    "movq  24(%0), %%r11\n"
> +		    "movnti  %%r8,   (%1)\n"
> +		    "movnti  %%r9,  8(%1)\n"
> +		    "movnti %%r10, 16(%1)\n"
> +		    "movnti %%r11, 24(%1)\n"
> +		    :: "r" (source), "r" (dest)
> +		    : "memory", "r8", "r9", "r10", "r11");
> +		dest += 32;
> +		source += 32;
> +		size -= 32;
> +	}
> +
> +	/* 1x8 movnti loop */
> +	while (size >= 8) {
> +		asm("movq    (%0), %%r8\n"
> +		    "movnti  %%r8,   (%1)\n"
> +		    :: "r" (source), "r" (dest)
> +		    : "memory", "r8");
> +		dest += 8;
> +		source += 8;
> +		size -= 8;
> +	}
> +
> +	/* 1x4 movnti loop */
> +	while (size >= 4) {
> +		asm("movl    (%0), %%r8d\n"
> +		    "movnti  %%r8d,   (%1)\n"
> +		    :: "r" (source), "r" (dest)
> +		    : "memory", "r8");
> +		dest += 4;
> +		source += 4;
> +		size -= 4;
> +	}
> +
> +	/* cache copy for remaining bytes */
> +	if (size) {
> +		memcpy((void *) dest, (void *) source, size);
> +		clean_cache_range((void *) dest, size);
> +	}
> +}
> +EXPORT_SYMBOL_GPL(memcpy_wt);

I took a pretty hard look at the changes in arch/x86/lib/usercopy_64.c, and
they look correct to me.  The inline assembly for non-temporal copies mixed
with C for loop control is IMHO much easier to follow than the pure assembly
of __copy_user_nocache().

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>

next prev parent reply	other threads:[~2017-05-08 20:32 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <20170425012230.GX29622@ZenIV.linux.org.uk>
2017-04-26 21:56 ` [RFC PATCH] x86, uaccess, pmem: introduce copy_from_iter_writethru for dax + pmem Dan Williams
2017-04-27  6:30   ` Ingo Molnar
2017-04-28 19:39     ` [PATCH v2] x86, uaccess: introduce copy_from_iter_wt for pmem / writethrough operations Dan Williams
2017-05-05  6:54       ` Ingo Molnar
2017-05-05 14:12         ` Dan Williams
2017-05-05 20:39       ` Kani, Toshimitsu
2017-05-05 22:25         ` Dan Williams
2017-05-05 22:44           ` Kani, Toshimitsu
2017-05-06  2:15             ` Dan Williams
2017-05-06  3:17               ` Kani, Toshimitsu
2017-05-06  9:46               ` Ingo Molnar
2017-05-06 13:57                 ` Dan Williams
2017-05-07  8:57                   ` Ingo Molnar
2017-05-08  3:01                     ` Kani, Toshimitsu
2017-05-08 20:32       ` Ross Zwisler [this message]
2017-05-08 20:40         ` Dan Williams

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170508203220.GA29073@linux.intel.com \
    --to=ross.zwisler@linux.intel.com \
    --cc=dan.j.williams@intel.com \
    --cc=hch@lst.de \
    --cc=hpa@zytor.com \
    --cc=jack@suse.cz \
    --cc=jmoyer@redhat.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-nvdimm@lists.01.org \
    --cc=mawilcox@microsoft.com \
    --cc=mingo@redhat.com \
    --cc=tglx@linutronix.de \
    --cc=viro@zeniv.linux.org.uk \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).