Re: [PATCH v8 8/8] x86/tlb: do flush_tlb_kernel_range by 'invlpg'

linux-tegra.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: Alex Shi <alex.shi-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
To: Andi Kleen <ak-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>,
	linux-tegra-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-omap-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: tglx-hfZtesqFncYOwBW4kG4KsQ@public.gmane.org,
	mingo-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org,
	hpa-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org,
	arnd-r2nGTMty4D4@public.gmane.org,
	rostedt-nx8X9YLhiw1AfugRpC6u6w@public.gmane.org,
	fweisbec-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org,
	jeremy-TSDbQ3PG+2Y@public.gmane.org,
	seto.hidetoshi-+CUm20s59erQFUHtdCDX3A@public.gmane.org,
	borislav.petkov-5C7GfCeVMHo@public.gmane.org,
	tony.luck-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org,
	luto-3s7WtUTddSA@public.gmane.org,
	riel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org,
	avi-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org,
	len.brown-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org,
	tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org,
	akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org,
	cl-gkYfJU5Cukgdnm+yROfE0A@public.gmane.org,
	jbeulich-IBi9RG/b67k@public.gmane.org,
	eric.dumazet-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org,
	akinobu.mita-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org,
	cpw-sJ/iWh9BUns@public.gmane.org,
	penberg-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org,
	steiner-sJ/iWh9BUns@public.gmane.org,
	viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn@public.gmane.org,
	kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A@public.gmane.org,
	aarcange-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org,
	rientjes-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Subject: Re: [PATCH v8 8/8] x86/tlb: do flush_tlb_kernel_range by 'invlpg'
Date: Thu, 21 Jun 2012 13:25:27 +0800	[thread overview]
Message-ID: <4FE2B047.503@intel.com> (raw)
In-Reply-To: <4FD93DC7.8020501-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>

On 06/14/2012 09:26 AM, Alex Shi wrote:

> On 06/14/2012 09:10 AM, Alex Shi wrote:
> 
>> On 06/13/2012 10:56 PM, Andi Kleen wrote:
>>
>>> On Tue, Jun 12, 2012 at 05:06:45PM +0800, Alex Shi wrote:
>>>> This patch do flush_tlb_kernel_range by 'invlpg'. The performance pay
>>>> and gain was analysed in my patch (x86/flush_tlb: try flush_tlb_single
>>>> one by one in flush_tlb_range). Now we move this logical into kernel
>>>> part. The pay is multiple 'invlpg' execution cost, that is same. but
>>>>  the gain(cost reducing of TLB entries refilling) is absolutely
>>>> increased.
>>>
>>> The subtle point is whether INVLPG flushes global pages or not.
>>> After some digging I found a sentence in the SDM that says it does.
>>> So it may be safe.
>>
>>
>> Many thanks for your time!
>>
>>>
>>> What does it improve?
>>
>>




I just wrote a rough kernel module that allocates some page arrays in the kernel and then maps them to virtual addresses with 'vmap'.

Then my macro benchmark injects an 'unmap_kernel_range' request through a sysfs interface, while doing random memory accesses at user level during that time.

On my NHM EP 2P * 4 Cores * HT.

Without this patch, the memory access with 4 threads is ~12ns/time.
With this patch, the memory access with 4 threads is ~9ns/time.

As the number of threads increases, the benefit becomes smaller, and it nearly disappears once the thread count reaches 256.

But there is no regression in any case.


The rough user-level macro-benchmark and the kernel module are here:

--- kernel module--

#include <linux/init.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/kernel.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/gfp.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/uaccess.h>
#include <linux/sysfs.h>
#include <linux/hrtimer.h>
#include <linux/device.h>
#include <linux/cpu.h>

MODULE_LICENSE("Dual BSD/GPL");

/* 
 * $cat Makefile 
 * obj-m := modvmalloc.o
 *
 * compile command:
 *  #cd linux; make /home/alexs/exec/modules/modvmalloc.ko 
 */
/* Number of pages allocated for, and vmap()ed into, each block. */
#define NR_PAGES	(4)
/* Default value of the `blocks` module parameter. */
#define NR_BLOCKS	(1024)

/*
 * One test block: a small set of pages plus the kernel virtual address
 * they were mapped at.
 */
struct block {
	struct page **page_array;	/* pages backing this block */
	void *vaddr;			/* address returned by vmap() for page_array */
	int page_count;			/* number of valid entries in page_array */
};

/* Array of `blocks` test blocks, allocated at module init. */
struct block *block;

/*
 * Number of blocks to allocate, map and later unmap.
 *
 * The variable type must match the type token given to module_param():
 * the macro type-checks a pointer to the variable against the declared
 * parameter type, so an `int` variable registered as `uint` is rejected
 * (or at least warned about) by the compiler.  A block count is never
 * negative, so the variable is made unsigned.
 */
static unsigned int blocks = NR_BLOCKS;
module_param(blocks, uint, 0400);
MODULE_PARM_DESC(blocks, "map unmap blocks number ");

/*
 * Allocate a zeroed array able to hold @nr_pages page pointers.
 *
 * Arrays larger than one page come from vzalloc(), smaller ones from
 * kzalloc(); relay_free_page_array() picks the matching free routine.
 *
 * Returns the array, or NULL if the allocation failed.
 */
static struct page **relay_alloc_page_array(unsigned int nr_pages)
{
	/* Size by the caller's request, not by the NR_PAGES constant. */
	const size_t pa_size = nr_pages * sizeof(struct page *);

	if (pa_size > PAGE_SIZE)
		return vzalloc(pa_size);
	return kzalloc(pa_size, GFP_KERNEL);
}

/*
 * Release a page-pointer array, using kfree() unless the address lies in
 * the vmalloc range, in which case vfree() is used.
 */
static void relay_free_page_array(struct page **array)
{
	if (!is_vmalloc_addr(array)) {
		kfree(array);
		return;
	}

	vfree(array);
}

/*
 * Issue one unmap_kernel_range() call per block, each covering the
 * NR_PAGES pages mapped at that block's vaddr.
 */
static void vmap_unmap(void)
{
	int i;

	/* Index the array: `block->vaddr` would hit block 0 every time. */
	for (i = 0; i < blocks; i++)
		unmap_kernel_range((unsigned long)block[i].vaddr,
				   NR_PAGES * PAGE_SIZE);
}

// ---------------
/* Last value written to the vmap_num sysfs attribute. */
long vmap_num = 0;

/*
 * Handle a write to the vmap_num attribute.
 *
 * Parses a decimal number from @buf, records it in vmap_num, then runs
 * vmap_unmap() once and logs the average time per block.
 *
 * @buf:   text written by the user
 * @count: number of bytes in @buf
 * @smt:   unused
 *
 * Returns @count on success, -EINVAL if @buf does not start with a number.
 */
static ssize_t __vmap_num_store(const char *buf,
		size_t count, int smt)
{
	long factor = 0;
	unsigned long nr = blocks;
	unsigned long start, stop;
	unsigned long per_call = 0;

	if (sscanf(buf, "%ld", &factor) != 1)
		return -EINVAL;

	vmap_num = factor;

	start = ktime_to_ns(ktime_get());
	vmap_unmap();
	stop = ktime_to_ns(ktime_get());

	/* Avoid dividing by zero when the module was loaded with blocks=0. */
	if (nr)
		per_call = (stop - start) / nr;

	printk(KERN_ERR "vunmap %lu times cost %lu ns/time\n",
			nr, per_call);
	return count;
}

/* sysfs read handler: print the current vmap_num value followed by a newline. */
static ssize_t vmap_num_show(struct device *dev,
		struct device_attribute *attr,
		char *buf)
{
	ssize_t len;

	len = sprintf(buf, "%ld\n", vmap_num);
	return len;
}
/* sysfs write handler: forward the buffer to __vmap_num_store() with smt = 0. */
static ssize_t vmap_num_store(struct device *dev,
		struct device_attribute *attr,
		const char *buf, size_t count)
{
	ssize_t consumed;

	consumed = __vmap_num_store(buf, count, 0);
	return consumed;
}

/* Defines dev_attr_vmap_num (mode 0644) wired to the show/store handlers. */
DEVICE_ATTR(vmap_num, 0644, vmap_num_show, vmap_num_store);

/* Attach the vmap_num attribute file to @dev; returns device_create_file()'s result. */
int create_sysfs_vmap_num(struct device *dev)
{
	int err = device_create_file(dev, &dev_attr_vmap_num);

	return err;
}

static int mapunmap_init(void){
	long i,j,k;

	create_sysfs_vmap_num(cpu_subsys.dev_root);
	block = kmalloc(sizeof(struct block)*blocks, GFP_KERNEL);

	for (k=0; k< blocks; k++) {
		block[k].page_count = 0;
		block[k].page_array = relay_alloc_page_array(NR_PAGES);
		if (!block[k].page_array)
			return -1;

		for (i = 0; i < NR_PAGES; i++) {
			block[k].page_array[i] = alloc_page(GFP_KERNEL);
			if (unlikely(!block[k].page_array[i])) {
				printk(KERN_ERR "\talloc page error \n");
				goto depopulate;
			}
		}

		if (i!=NR_PAGES)	goto depopulate;

		block[k].page_count = i;
		block[k].vaddr = vmap(block[k].page_array, NR_PAGES, VM_MAP, PAGE_KERNEL);
		if (!(block[k].vaddr)) {
			printk(KERN_ERR "\t\t vmap error !\n");
			goto depopulate;
		}
	}
	printk(KERN_INFO "vmalloc module init OK \n");
	return 0;

depopulate:
	for (i=0; i< k; i++)
		if (block[i].page_count !=0) {
			for (j = 0; j < block[i].page_count; j++)
				__free_page((block[j].page_array[j]));
			relay_free_page_array(block[j].page_array);
		}
	printk(KERN_INFO "vmalloc module init fail\n");
	return -1;
}


static void mapunmap_exit(void){
	long i, j;

	printk(KERN_INFO "bye! this is test module\n");
	device_remove_file(cpu_subsys.dev_root, &dev_attr_vmap_num);

	for (i=0; i< blocks; i++)
		if (block[i].page_count !=0) {
			for (j = 0; j < block[i].page_count; j++)
				__free_page((block[j].page_array[j]));
			relay_free_page_array(block[j].page_array);
		}
}


/* Register mapunmap_init/mapunmap_exit as the module's init and exit hooks. */
module_init(mapunmap_init);
module_exit(mapunmap_exit);

--- benchmark ---

/*
   maccess.c
   This is a macrobenchmark for TLB flush range testing.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

   Copyright (C) Intel 2012
   Copyright Alex Shi alex.shi-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org 

   gcc -o maccess maccess.c -lrt -lpthread -O2

    #perf stat -e r881,r882,r884 -e r801,r802,r810,r820,r840,r880,r807 -e rc01 -e r4901,r4902,r4910,r4920,r4940,r4980 -e r5f01  -e rbd01,rdb20  -e r4f02 -e r8004,r8201,r8501,r8502,r8504,r8510,r8520,r8540,r8580  -e rae01,rc820,rc102,rc900 -e r8600  -e rcb10  ./maccess 
*/

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/mman.h>
#include <time.h>
#include <sys/types.h>
#include <pthread.h>

/* Size in bytes of the anonymous mapping the worker threads access. */
#define FILE_SIZE	(1024*1024*1024)

/* Touch stride for regular pages; also the length of the randn[] tables. */
#define PAGE_SIZE 	(4096)
/* Touch stride used when the mapping is created with MAP_HUGETLB (-h). */
#define HPAGE_SIZE 	(4096*512)

/* Fallback definition for headers that do not provide MAP_HUGETLB. */
#ifndef MAP_HUGETLB
#define MAP_HUGETLB	0x40000
#endif


/*
 * Read clock @clockid and return its value in nanoseconds.
 *
 * If clock_gettime() fails, the error is reported with perror() and 0 is
 * returned, rather than a value computed from an uninitialised timespec.
 */
long getnsec(clockid_t clockid)
{
	struct timespec ts = { 0 };

	if (clock_gettime(clockid, &ts) == -1) {
		perror("clock_gettime failed");
		return 0;
	}
	return (long)ts.tv_sec * 1000000000L + (long)ts.tv_nsec;
}

/* Parameters shared by all accessmm() worker threads. */
struct data {
	int pagenum;		/* accesses performed per counted iteration */
	void *startaddr;	/* base address of the region to access */
	int rw;			/* 0: read the region, otherwise write to it */
	int loop;
};

/* Start/stop flag polled by the workers: they wait while 0 and run while 1. */
volatile int *threadstart;
/*
 * Worker thread: access random bytes of the shared region while
 * *threadstart is 1.
 *
 * @data points to a struct data.  The thread first builds a private table
 * of PAGE_SIZE random offsets, waits until *threadstart becomes non-zero,
 * and then repeats passes of d->pagenum accesses (reads when d->rw is 0,
 * writes of the value 1 otherwise) for as long as *threadstart stays 1.
 *
 * Returns a malloc()ed long holding the number of completed passes (the
 * caller owns it), or NULL if that counter could not be allocated.
 */
void *accessmm(void *data)
{
	struct data *d = data;
	char *base = d->startaddr;
	long *actimes;
	int randn[PAGE_SIZE];
	int i, k;

	for (i = 0; i < PAGE_SIZE; i++)
		randn[i] = rand();

	actimes = malloc(sizeof(*actimes));
	if (!actimes) {
		perror("malloc");
		return NULL;
	}
	*actimes = 0;

	while (*threadstart == 0)
		usleep(1);

	/*
	 * randn[] only has PAGE_SIZE entries while pagenum comes from the
	 * command line, so wrap the index instead of reading past the table.
	 */
	if (d->rw == 0) {
		for (; *threadstart == 1; (*actimes)++)
			for (k = 0; k < d->pagenum; k++)
				(void)*(volatile char *)(base + randn[k % PAGE_SIZE] % FILE_SIZE);
	} else {
		for (; *threadstart == 1; (*actimes)++)
			for (k = 0; k < d->pagenum; k++)
				*(base + randn[k % PAGE_SIZE] % FILE_SIZE) = 1;
	}
	return actimes;
}

int main(int argc, char *argv[])
{
        static  char            optstr[] = "p:w:ht:s:";
	int s = 1;	/* */
	int p = 512;	/* default accessed page number, after maccess */
	int er = 0, rw = 0, h = 0, t = 2; /* d: debug; h: use huge page; t thread number */
	int pagesize = PAGE_SIZE; /*default for regular page */
	volatile char x;
	long protindex = 0;

	int i, j, k, c;
	void *m1, *startaddr;
	unsigned long *startaddr2[1024*512];
	volatile void *tempaddr;
	clockid_t clockid = CLOCK_MONOTONIC;
	unsigned long start, stop, mptime, actime;
	int randn[PAGE_SIZE];

	pthread_t	pid[1024];
	void * res;
	struct data data;

	char command[1024];

	for (i=0;i<PAGE_SIZE; i++)
		randn[i] = rand();

        while ((c = getopt(argc, argv, optstr)) != EOF)
                switch (c) {
                case 's':
                        s = atoi(optarg);
                        break;
                case 'p':
                        p = atoi(optarg);
                        break;
                case 'h':
                        h = 1;
                        break;
                case 'w':
                        rw = atoi(optarg);
                        break;
                case 't':
                        t = atoi(optarg);
                        break;
                case '?':
                        er = 1;
                        break;
                }
        if (er) {
                printf("usage: %s %s\n", argv[0], optstr);
                exit(1);
	}

	printf("pid is %d, thread number %d active %d seconds, access page num %d\n", getpid(), t, s, p);
	if (h == 0){
		startaddr = mmap(0, FILE_SIZE, PROT_READ|PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
		pagesize = PAGE_SIZE;
	} else {
		startaddr = mmap(0, FILE_SIZE, PROT_READ|PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED | MAP_HUGETLB, -1, 0);
		pagesize = HPAGE_SIZE;
	}

	start = getnsec(clockid);
	//access whole memory, will generate many page faults 
	for (tempaddr = startaddr; tempaddr < startaddr + FILE_SIZE; tempaddr += pagesize)
		memset((char *)tempaddr, 0, 1);
        stop = getnsec(clockid);

	threadstart = malloc(sizeof(int));
	*threadstart = 0;
	data.pagenum = p; data.startaddr = startaddr; data.rw = rw;
	for (i=0; i< t; i++)
		if(pthread_create(&pid[i], NULL, accessmm, &data))
			perror("pthread create");
	//wait for randn[] filling.
	sleep(1);

	mptime = actime = 0;
	sprintf(command, "sudo sh -c 'echo %d > /sys/devices/system/cpu/vmap_num'", s);
	printf("%s\n", command);

	start = getnsec(clockid);
	//kick threads, let them running.
	*threadstart = 1;

	system(command);
	*threadstart = 0;

	stop = getnsec(clockid);
	mptime += stop - start;

	//get threads' result.
	for (i=0; i< t; i++) {
		if (pthread_join(pid[i], &res))
			perror("pthread_join");
		actime += *(long*)res;
	}
end:
	printf("maccess %ld ms, memory access %ld times/thread/ms, cost %ldns/time\n",
		 mptime/1000000, actime*p*1000000/t/mptime, mptime*t/(actime*p));
	exit(0);
}

> 
>>
>>> -Andi
>>
>>
> 
>

     prev parent reply	other threads:[~2012-06-21  5:25 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <1339492005-20241-1-git-send-email-alex.shi@intel.com>
     [not found] ` <1339492005-20241-9-git-send-email-alex.shi@intel.com>
     [not found]   ` <20120613145656.GB32604@tassilo.jf.intel.com>
     [not found]     ` <4FD939F2.3090605@intel.com>
2012-06-14  1:26       ` [PATCH v8 8/8] x86/tlb: do flush_tlb_kernel_range by 'invlpg' Alex Shi
2012-06-14  2:10         ` Alex Shi
     [not found]         ` <4FD93DC7.8020501-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
2012-06-21  5:25           ` Alex Shi [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4FE2B047.503@intel.com \
    --to=alex.shi-ral2jqcrhueavxtiumwx3w@public.gmane.org \
    --cc=aarcange-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org \
    --cc=ak-VuQAYsv1563Yd54FQh9/CA@public.gmane.org \
    --cc=akinobu.mita-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org \
    --cc=akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org \
    --cc=arnd-r2nGTMty4D4@public.gmane.org \
    --cc=avi-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org \
    --cc=borislav.petkov-5C7GfCeVMHo@public.gmane.org \
    --cc=cl-gkYfJU5Cukgdnm+yROfE0A@public.gmane.org \
    --cc=cpw-sJ/iWh9BUns@public.gmane.org \
    --cc=eric.dumazet-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org \
    --cc=fweisbec-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org \
    --cc=hpa-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org \
    --cc=jbeulich-IBi9RG/b67k@public.gmane.org \
    --cc=jeremy-TSDbQ3PG+2Y@public.gmane.org \
    --cc=kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A@public.gmane.org \
    --cc=len.brown-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org \
    --cc=linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
    --cc=linux-omap-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
    --cc=linux-tegra-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
    --cc=luto-3s7WtUTddSA@public.gmane.org \
    --cc=mingo-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org \
    --cc=penberg-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org \
    --cc=riel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org \
    --cc=rientjes-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org \
    --cc=rostedt-nx8X9YLhiw1AfugRpC6u6w@public.gmane.org \
    --cc=seto.hidetoshi-+CUm20s59erQFUHtdCDX3A@public.gmane.org \
    --cc=steiner-sJ/iWh9BUns@public.gmane.org \
    --cc=tglx-hfZtesqFncYOwBW4kG4KsQ@public.gmane.org \
    --cc=tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org \
    --cc=tony.luck-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org \
    --cc=viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).