- * [PATCH 1/5] Core kernel backend to capture the memory reference pattern
  2011-07-05  8:22 [PATCH 0/5] mm,debug: VM framework to capture memory reference pattern Ankita Garg
@ 2011-07-05  8:22 ` Ankita Garg
  2011-07-05  8:22 ` [PATCH 2/5] memref module to walk the process page table Ankita Garg
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 11+ messages in thread
From: Ankita Garg @ 2011-07-05  8:22 UTC (permalink / raw)
  To: linux-mm; +Cc: ankita, svaidy, linux-kernel
Hi,
This patch adds the data structures and memory for capturing the
reference pattern.  At system boot time, an array memtrace_block_accessed
is created with information about memory blocks of size 64MB. Memory
references are captured at the granularity of these memory blocks. Even
when only a single page within a memory block is referenced in a sampling
interval, the complete block of memory is marked as being referenced by the
kernel. Whether to mark the block as being referenced or not is indicated
by the kernel module, introduced in patch 2/5.
TODO:
- The access_flag field of the memtrace_block_accessed array can be used as
  a count of the number of times the pages in that memory block were accessed,
  instead of a simple 1 or 0 value
Signed-off-by: Ankita Garg <ankita@in.ibm.com>
---
 include/linux/memtrace.h |   29 ++++++++++++
 include/linux/sched.h    |    4 ++
 kernel/fork.c            |    6 +++
 lib/Kconfig.debug        |    4 ++
 lib/Makefile             |    1 +
 lib/memtrace.c           |  108 ++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 152 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/memtrace.h
 create mode 100644 lib/memtrace.c
diff --git a/include/linux/memtrace.h b/include/linux/memtrace.h
new file mode 100644
index 0000000..0fa15e0
--- /dev/null
+++ b/include/linux/memtrace.h
@@ -0,0 +1,29 @@
+#ifndef _LINUX_MEMTRACE_H
+#define _LINUX_MEMTRACE_H
+
+#include <linux/types.h>
+#include <linux/sched.h>
+
+extern pid_t pg_trace_pid;
+
+struct memtrace_block {
+	unsigned int    seq;
+	unsigned long	access_flag;
+};
+
+#define MAX_MEMTRACE_BLOCK 512
+
+pid_t get_pg_trace_pid(void);
+void set_pg_trace_pid(pid_t pid);
+void set_mem_trace(struct task_struct *tsk, int flag);
+void set_task_seq(struct task_struct *tsk, unsigned int seq);
+unsigned int get_task_seq(struct task_struct *tsk);
+void init_seq_number(void);
+unsigned int get_seq_number(void);
+unsigned int inc_seq_number(void);
+void set_memtrace_block_sz(int sz);
+void mark_memtrace_block_accessed(unsigned long paddr);
+void init_memtrace_blocks(void);
+void update_and_log_data(void);
+
+#endif /* _LINUX_MEMTRACE_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a837b20..bbf6973 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1473,6 +1473,10 @@ struct task_struct {
 	u64 acct_vm_mem1;	/* accumulated virtual memory usage */
 	cputime_t acct_timexpd;	/* stime + utime since last update */
 #endif
+#if defined(CONFIG_MEMTRACE)
+	unsigned int mem_trace;
+	unsigned int seq;
+#endif
 #ifdef CONFIG_CPUSETS
 	nodemask_t mems_allowed;	/* Protected by alloc_lock */
 	int mems_allowed_change_disable;
diff --git a/kernel/fork.c b/kernel/fork.c
index 0276c30..361413d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1153,6 +1153,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	p->default_timer_slack_ns = current->timer_slack_ns;
 
+#ifdef CONFIG_MEMTRACE
+	if(current->mem_trace) {
+		p->mem_trace = 1;
+		p->seq = 0;
+	}
+#endif
 	task_io_accounting_init(&p->ioac);
 	acct_clear_integrals(p);
 
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index dd373c8..9955a40 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -748,6 +748,10 @@ config DEBUG_VIRTUAL
 
 	  If unsure, say N.
 
+config MEMTRACE
+	bool "Memory Reference Tracing"
+	default n
+
 config DEBUG_NOMMU_REGIONS
 	bool "Debug the global anon/private NOMMU mapping region tree"
 	depends on DEBUG_KERNEL && !MMU
diff --git a/lib/Makefile b/lib/Makefile
index 6b597fd..652c5fa 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -89,6 +89,7 @@ obj-$(CONFIG_SWIOTLB) += swiotlb.o
 obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o
 obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o
 obj-$(CONFIG_CPU_NOTIFIER_ERROR_INJECT) += cpu-notifier-error-inject.o
+obj-$(CONFIG_MEMTRACE) += memtrace.o
 
 lib-$(CONFIG_GENERIC_BUG) += bug.o
 
diff --git a/lib/memtrace.c b/lib/memtrace.c
new file mode 100644
index 0000000..5ebd7c8
--- /dev/null
+++ b/lib/memtrace.c
@@ -0,0 +1,108 @@
+#include <asm/atomic.h>
+#include <linux/memtrace.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+
+/* Trace Unique identifier */
+atomic_t trace_sequence_number;
+pid_t pg_trace_pid;
+int memtrace_block_sz;
+int total_block_count;
+
+#define MB_SHIFT	20
+
+/* TODO: Dynamically allocate this array depending on the amount of memory
+ * present on the system
+ */
+struct memtrace_block memtrace_block_accessed[MAX_MEMTRACE_BLOCK+1];
+
+/* App being traced */
+pid_t get_pg_trace_pid(void)
+{
+	return pg_trace_pid;
+}
+EXPORT_SYMBOL_GPL(get_pg_trace_pid);
+
+void set_pg_trace_pid(pid_t pid)
+{
+	pg_trace_pid = pid;
+}
+EXPORT_SYMBOL_GPL(set_pg_trace_pid);
+
+void set_mem_trace(struct task_struct *tsk, int flag)
+{
+	tsk->mem_trace = flag;
+}
+EXPORT_SYMBOL_GPL(set_mem_trace);
+
+void set_task_seq(struct task_struct *tsk, unsigned int seq)
+{
+	tsk->seq = seq;
+}
+EXPORT_SYMBOL_GPL(set_task_seq);
+
+unsigned int get_task_seq(struct task_struct *tsk)
+{
+	return (tsk->seq);
+}
+EXPORT_SYMBOL_GPL(get_task_seq);
+
+void init_seq_number(void)
+{
+	atomic_set(&trace_sequence_number, 0);	/* reset the global trace sequence counter */
+}
+EXPORT_SYMBOL_GPL(init_seq_number);
+
+unsigned int get_seq_number(void)
+{
+	return atomic_read(&trace_sequence_number);
+}
+EXPORT_SYMBOL_GPL(get_seq_number);
+
+unsigned int inc_seq_number(void)
+{
+	return (atomic_inc_return(&trace_sequence_number));
+}
+EXPORT_SYMBOL_GPL(inc_seq_number);
+
+void set_memtrace_block_sz(int sz)
+{
+	memtrace_block_sz = sz > 0 ? sz : 1;	/* sz is in MB; 0 would divide by zero */
+	total_block_count = (totalram_pages >> (MB_SHIFT - PAGE_SHIFT)) / memtrace_block_sz;
+}
+EXPORT_SYMBOL_GPL(set_memtrace_block_sz);
+
+void mark_memtrace_block_accessed(unsigned long paddr)
+ {
+	int memtrace_block;
+	unsigned long paddr_mb = paddr >> MB_SHIFT;
+
+	if (memtrace_block_sz <= 0 || paddr_mb / memtrace_block_sz >= MAX_MEMTRACE_BLOCK)
+		return;	/* block size unset, or no slot for paddr in the static array */
+	memtrace_block = paddr_mb / memtrace_block_sz + 1;	/* slots are 1-based */
+	memtrace_block_accessed[memtrace_block].seq = get_seq_number();
+	memtrace_block_accessed[memtrace_block].access_flag = 1;
+}
+EXPORT_SYMBOL_GPL(mark_memtrace_block_accessed);
+
+void update_and_log_data(void)
+{
+	int i;
+	unsigned int seq;
+	unsigned long base_addr, access_flag;
+
+	for (i = 1; i <= total_block_count && i <= MAX_MEMTRACE_BLOCK; i++) {
+		seq = memtrace_block_accessed[i].seq;
+		base_addr = (i - 1) * memtrace_block_sz; /* start of slot i, in MB */
+		access_flag = memtrace_block_accessed[i].access_flag;
+		/*
+		 *  Log trace data, then clear the flag for the next sample.
+		 *  Can modify to dump only blocks that have been marked
+		 *  accessed
+		 */
+		memtrace_block_accessed[i].access_flag = 0;
+ 	}
+
+	return;
+}
+EXPORT_SYMBOL_GPL(update_and_log_data);
-- 
1.7.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 11+ messages in thread
- * [PATCH 2/5] memref module to walk the process page table
  2011-07-05  8:22 [PATCH 0/5] mm,debug: VM framework to capture memory reference pattern Ankita Garg
  2011-07-05  8:22 ` [PATCH 1/5] Core kernel backend to capture the " Ankita Garg
@ 2011-07-05  8:22 ` Ankita Garg
  2011-07-05  8:22 ` [PATCH 3/5] Capture kernel memory references Ankita Garg
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 11+ messages in thread
From: Ankita Garg @ 2011-07-05  8:22 UTC (permalink / raw)
  To: linux-mm; +Cc: ankita, svaidy, linux-kernel
Hi,
This patch introduces the memref module that walks through the page tables
of all the required processes to capture the reference pattern information.
The module makes use of the walk_page_range routine provided by the kernel.
Further, the module walks through the page tables of all the tasks that are
its children and in the same thread group. One of the reasons why a core
kernel backend is needed is that some of the routines/data needed to walk
through all the process and kernel page tables are not exported for use by
kernel modules.
Signed-off-by: Ankita Garg <ankita@in.ibm.com>
---
 arch/x86/mm/pgtable.c |    2 +
 arch/x86/mm/tlb.c     |    1 +
 drivers/misc/Kconfig  |    5 +
 drivers/misc/Makefile |    1 +
 drivers/misc/memref.c |  194 +++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/pid.c          |    1 +
 mm/memory.c           |    1 +
 mm/pagewalk.c         |    2 +
 8 files changed, 207 insertions(+), 0 deletions(-)
 create mode 100644 drivers/misc/memref.c
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8573b83..bc17d20 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -4,6 +4,7 @@
 #include <asm/pgtable.h>
 #include <asm/tlb.h>
 #include <asm/fixmap.h>
+#include <linux/module.h>
 
 #define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
 
@@ -300,6 +301,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 	paravirt_pgd_free(mm, pgd);
 	free_page((unsigned long)pgd);
 }
+EXPORT_SYMBOL_GPL(ptep_test_and_clear_young);
 
 int ptep_set_access_flags(struct vm_area_struct *vma,
 			  unsigned long address, pte_t *ptep,
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index d6c0418..f24c9f2 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -299,6 +299,7 @@ void flush_tlb_mm(struct mm_struct *mm)
 
 	preempt_enable();
 }
+EXPORT_SYMBOL_GPL(flush_tlb_mm);
 
 void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
 {
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index 4e349cd..bca5977 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -314,6 +314,11 @@ config SGI_GRU_DEBUG
 	This option enables addition debugging code for the SGI GRU driver. If
 	you are unsure, say N.
 
+config MEMREF
+	tristate "Memory Reference Tracing module"
+	select MEMTRACE
+	default n
+
 config APDS9802ALS
 	tristate "Medfield Avago APDS9802 ALS Sensor module"
 	depends on I2C
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index 5f03172..c878486 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -46,3 +46,4 @@ obj-y				+= ti-st/
 obj-$(CONFIG_AB8500_PWM)	+= ab8500-pwm.o
 obj-y				+= lis3lv02d/
 obj-y				+= carma/
+obj-$(CONFIG_MEMREF)		+= memref.o
diff --git a/drivers/misc/memref.c b/drivers/misc/memref.c
new file mode 100644
index 0000000..4e8785f
--- /dev/null
+++ b/drivers/misc/memref.c
@@ -0,0 +1,194 @@
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+#include <asm/pgtable.h>
+#include <linux/connector.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <asm/tlbflush.h>
+#include <linux/types.h>
+#include <asm/page.h>
+#include <linux/kthread.h>
+#include <linux/memtrace.h>
+
+struct task_struct *memref_thr;
+struct task_struct *tsk;
+unsigned int seq;
+struct mm_struct *k_mm;
+
+static pid_t trace_pid = -1;
+static int interval = 10;
+static int memtrace_block_size = 64;
+
+#define LIMIT 1024
+int top = -1;
+struct task_struct *stack[LIMIT];
+
+module_param(trace_pid, int, 0664);
+MODULE_PARM_DESC(trace_pid, "Pid of app to be traced");
+module_param(interval, int, 0664);
+MODULE_PARM_DESC(interval, "Sampling interval in milliseconds");
+module_param(memtrace_block_size, int, 0664);
+MODULE_PARM_DESC(memtrace_block_size, "Memory Block Size");
+
+static int check_and_clear_task_pages(pmd_t *pmd, unsigned long addr,
+				unsigned long end, struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->private;
+	unsigned long pfn;
+	pte_t *pte, ptent;
+	spinlock_t *ptl;
+	struct page *page;
+	unsigned long paddr;
+
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	for (; addr != end; pte++, addr += PAGE_SIZE) {
+		ptent = *pte;
+		if (!pte_present(ptent) || pte_none(*pte) || pte_huge(*pte))
+			continue;
+
+		page = vm_normal_page(vma, addr, ptent);
+		if (!page)
+			continue;
+
+		/* this is where need to check if reference bit was set,
+		 * if found to be set, make a note of it and then clear it
+		 */
+		if(ptep_test_and_clear_young(vma, addr, pte)) {
+			ClearPageReferenced(page);
+			pfn = pte_pfn(ptent);
+			if(pfn_valid(pfn)) {
+				paddr = pfn << PAGE_SHIFT;
+				mark_memtrace_block_accessed(paddr);
+			}
+		}
+	}
+	pte_unmap_unlock(pte - 1, ptl);
+	return 0;
+}
+
+/* Walk every non-hugetlb VMA of @mm, flush its TLB, then mmput() the mm. */
+static void walk_task_pages(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+	struct mm_walk walk = {
+		.pmd_entry = check_and_clear_task_pages,
+		.mm = mm,
+	};
+
+	if (!mm)
+		return;
+	down_read(&mm->mmap_sem);
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		walk.private = vma;
+		if (!is_vm_hugetlb_page(vma))
+			walk_page_range(vma->vm_start, vma->vm_end, &walk);
+	}
+	flush_tlb_mm(mm);
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+}
+
+static int is_list_empty(void)
+{
+	if(top == -1)
+		return 1;
+	return 0;
+}
+
+/* Push v on the pending-task stack; v is dropped when the stack is full. */
+static void insert_into_list(struct task_struct *v)
+{
+	if(top == LIMIT - 1)
+		return;
+	stack[++top] = v;
+}
+
+static struct task_struct* del_from_list(void)
+{
+	struct task_struct *t;
+
+	if(is_list_empty())
+		return NULL;
+	t = stack[top];
+	top--;
+	return t;
+}
+
+static void walk_tasks(struct task_struct *p)
+{
+	struct task_struct *t, *c;
+	struct mm_struct *mm_task;
+
+	if(!p)
+		return;
+
+	insert_into_list(p);
+
+	while(!is_list_empty()) {
+		c = del_from_list();
+		set_mem_trace(c, 1);
+		if(!thread_group_leader(c))
+			continue;
+		set_task_seq(c, seq);
+		mm_task = get_task_mm(c);
+		if(mm_task)
+			walk_task_pages(mm_task);
+
+		list_for_each_entry(t, &c->children, sibling)
+			if(get_task_seq(t) != seq)
+				insert_into_list(t);
+	}
+	return;
+}
+
+static int memref_thread(void *data)
+{
+	struct task_struct *task = data;
+
+	while(!kthread_should_stop() && task) {
+		seq = inc_seq_number();
+
+		walk_tasks(task);
+		update_and_log_data();
+		msleep(interval);
+	}
+	return 0;
+}
+
+/* Module init: look up trace_pid, mark it traced and start the sampler thread. */
+static int memref_start(void)
+{
+	rcu_read_lock();
+	set_pg_trace_pid(trace_pid);
+	init_seq_number();
+	set_memtrace_block_sz(memtrace_block_size);
+
+	tsk = find_task_by_vpid(trace_pid);
+	if(!tsk) {
+		rcu_read_unlock();	/* do not leave the RCU section open on failure */
+		printk("No task with pid %d found \n", trace_pid);
+		return -EINVAL;
+	}
+	set_mem_trace(tsk, 1);
+	rcu_read_unlock();
+	memref_thr = kthread_run(memref_thread, tsk, "memref");
+	if (IS_ERR(memref_thr))
+		return PTR_ERR(memref_thr);
+	return 0;
+}
+
+static void memref_stop(void)
+{
+	if(memref_thr)
+		kthread_stop(memref_thr);
+	set_pg_trace_pid(-1);
+	set_mem_trace(tsk, 0);
+	return;
+}
+
+module_init(memref_start);
+module_exit(memref_stop);
+MODULE_LICENSE("GPL");
diff --git a/kernel/pid.c b/kernel/pid.c
index 57a8346..abfb4a6 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -427,6 +427,7 @@ struct task_struct *find_task_by_vpid(pid_t vnr)
 {
 	return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
 }
+EXPORT_SYMBOL_GPL(find_task_by_vpid);
 
 struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
 {
diff --git a/mm/memory.c b/mm/memory.c
index 87d9353..a1fbd62 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -843,6 +843,7 @@ check_pfn:
 out:
 	return pfn_to_page(pfn);
 }
+EXPORT_SYMBOL_GPL(vm_normal_page);
 
 /*
  * copy one vm_area from one task to the other. Assumes the page tables
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index c3450d5..f29d1cb 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -2,6 +2,7 @@
 #include <linux/highmem.h>
 #include <linux/sched.h>
 #include <linux/hugetlb.h>
+#include <linux/module.h>
 
 static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 			  struct mm_walk *walk)
@@ -210,3 +211,4 @@ int walk_page_range(unsigned long addr, unsigned long end,
 
 	return err;
 }
+EXPORT_SYMBOL_GPL(walk_page_range);
-- 
1.7.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 11+ messages in thread
- * [PATCH 3/5] Capture kernel memory references
  2011-07-05  8:22 [PATCH 0/5] mm,debug: VM framework to capture memory reference pattern Ankita Garg
  2011-07-05  8:22 ` [PATCH 1/5] Core kernel backend to capture the " Ankita Garg
  2011-07-05  8:22 ` [PATCH 2/5] memref module to walk the process page table Ankita Garg
@ 2011-07-05  8:22 ` Ankita Garg
  2011-07-05  8:22 ` [PATCH 4/5] Capture references to page cache pages Ankita Garg
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 11+ messages in thread
From: Ankita Garg @ 2011-07-05  8:22 UTC (permalink / raw)
  To: linux-mm; +Cc: ankita, svaidy, linux-kernel
Hi,
This patch introduces code to traverse the kernel page tables, starting from
the highest level pgdir table in init_level4_pgt.
Signed-off-by: Ankita Garg <ankita@in.ibm.com>
---
 arch/x86/include/asm/pgtable_64.h |    1 +
 drivers/misc/memref.c             |    1 +
 include/linux/memtrace.h          |    1 +
 lib/memtrace.c                    |   95 +++++++++++++++++++++++++++++++++++++
 4 files changed, 98 insertions(+), 0 deletions(-)
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 975f709..09c99e0 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -13,6 +13,7 @@
 #include <asm/processor.h>
 #include <linux/bitops.h>
 #include <linux/threads.h>
+#include <linux/module.h>
 
 extern pud_t level3_kernel_pgt[512];
 extern pud_t level3_ident_pgt[512];
diff --git a/drivers/misc/memref.c b/drivers/misc/memref.c
index 4e8785f..abf8b23 100644
--- a/drivers/misc/memref.c
+++ b/drivers/misc/memref.c
@@ -152,6 +152,7 @@ static int memref_thread(void *data)
 		seq = inc_seq_number();
 
 		walk_tasks(task);
+		kernel_mapping_ref();
 		update_and_log_data();
 		msleep(interval);
 	}
diff --git a/include/linux/memtrace.h b/include/linux/memtrace.h
index 0fa15e0..b1fce57 100644
--- a/include/linux/memtrace.h
+++ b/include/linux/memtrace.h
@@ -24,6 +24,7 @@ unsigned int inc_seq_number(void);
 void set_memtrace_block_sz(int sz);
 void mark_memtrace_block_accessed(unsigned long paddr);
 void init_memtrace_blocks(void);
+void kernel_mapping_ref(void);
 void update_and_log_data(void);
 
 #endif /* _LINUX_MEMTRACE_H */
diff --git a/lib/memtrace.c b/lib/memtrace.c
index 5ebd7c8..aec5b65 100644
--- a/lib/memtrace.c
+++ b/lib/memtrace.c
@@ -72,6 +72,101 @@ void set_memtrace_block_sz(int sz)
 }
 EXPORT_SYMBOL_GPL(set_memtrace_block_sz);
 
+#define PTE_LEVEL_MULT (PAGE_SIZE)
+#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
+#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
+#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
+
+/* Scan the PTE table under @pmd; for each valid entry whose accessed bit was
+ * set, clear the bit and mark its memory block accessed. @addr is unused. */
+static void walk_k_pte_level(pmd_t pmd, unsigned long addr)
+{
+	pte_t *pte = (pte_t *) pmd_page_vaddr(pmd);
+	int i, ret;
+	unsigned long pfn;
+	struct page *pg;
+
+	for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
+		/* any one of these conditions disqualifies the entry */
+		if(!pte_present(*pte) || pte_none(*pte) || pte_huge(*pte))
+			continue;
+		pfn = pte_pfn(*pte);
+		if(pfn_valid(pfn) && pte_young(*pte)) {
+			ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+						(unsigned long *) &pte->pte);
+			if (ret) {
+				pg = pfn_to_page(pfn);
+				ClearPageReferenced(pg);
+				mark_memtrace_block_accessed(pfn << PAGE_SHIFT);
+			}
+		}
+	}
+}
+
+#if PTRS_PER_PMD > 1
+/* Descend into every present, non-large PMD entry of the table under @pud. */
+static void walk_k_pmd_level(pud_t pud, unsigned long addr)
+{
+	pmd_t *pmd;
+	int i;
+
+	pmd = (pmd_t *) pud_page_vaddr(pud);
+
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		if(!pmd_none(*pmd) && pmd_present(*pmd) && !pmd_large(*pmd))
+			walk_k_pte_level(*pmd, addr + i * PMD_LEVEL_MULT);
+
+		pmd++;
+	}
+}
+
+#else
+/* PTRS_PER_PMD == 1: reinterpret the pud entry as a pmd entry. */
+#define walk_k_pmd_level(p,a) walk_k_pte_level(__pmd(pud_val(p)),a)
+#define pud_none(a)  pmd_none(__pmd(pud_val(a)))
+#define pud_large(a) pmd_large(__pmd(pud_val(a)))
+#endif
+
+#if PTRS_PER_PUD > 1
+/* Descend into every present, non-large PUD entry of the table under @pgd. */
+static void walk_k_pud_level(pgd_t pgd, unsigned long addr)
+{
+	pud_t *pud;
+	int i;
+
+	pud = (pud_t *) pgd_page_vaddr(pgd);
+
+	for (i = 0; i < PTRS_PER_PUD; i++) {
+		if(!pud_none(*pud) && pud_present(*pud) && !pud_large(*pud))
+			walk_k_pmd_level(*pud, addr + i * PUD_LEVEL_MULT);
+		pud++;
+	}
+}
+
+#else
+/* PTRS_PER_PUD == 1: reinterpret the pgd entry as a pud entry. */
+#define walk_k_pud_level(p,a) walk_k_pmd_level(__pud(pgd_val(p)),a)
+#define pgd_none(a)  pud_none(__pud(pgd_val(a)))
+#define pgd_large(a) pud_large(__pud(pgd_val(a)))
+#endif
+
+void kernel_mapping_ref(void)
+{
+ 	pgd_t *pgd;
+ 	int i;
+
+        pgd = (pgd_t *) &init_level4_pgt;
+
+ 	for (i=0; i < PTRS_PER_PGD; i++) {
+
+ 		if(!pgd_none(*pgd) && pgd_present(*pgd) && !pgd_large(*pgd)) {
+ 			walk_k_pud_level(*pgd, i * PGD_LEVEL_MULT);
+		}
+ 		pgd++;
+ 	}
+}
+EXPORT_SYMBOL_GPL(kernel_mapping_ref);
+
 void mark_memtrace_block_accessed(unsigned long paddr)
  {
 	int memtrace_block;
-- 
1.7.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 11+ messages in thread
- * [PATCH 4/5] Capture references to page cache pages
  2011-07-05  8:22 [PATCH 0/5] mm,debug: VM framework to capture memory reference pattern Ankita Garg
                   ` (2 preceding siblings ...)
  2011-07-05  8:22 ` [PATCH 3/5] Capture kernel memory references Ankita Garg
@ 2011-07-05  8:22 ` Ankita Garg
  2011-07-05  8:22 ` [PATCH 5/5] Logging the captured reference data Ankita Garg
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 11+ messages in thread
From: Ankita Garg @ 2011-07-05  8:22 UTC (permalink / raw)
  To: linux-mm; +Cc: ankita, svaidy, linux-kernel
Hi,
Page cache pages may be accessed without being mapped into a page
table, hence fake an access when a pagecache page is looked up, by
marking the corresponding memory address block as accessed.
Signed-off-by: Ankita Garg <ankita@in.ibm.com>
---
 mm/filemap.c |    8 ++++++++
 1 files changed, 8 insertions(+), 0 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index a8251a8..7ae7f36 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -36,6 +36,7 @@
 #include <linux/mm_inline.h> /* for page_is_file_cache() */
 #include <linux/cleancache.h>
 #include "internal.h"
+#include <linux/memtrace.h>
 
 /*
  * FIXME: remove all knowledge of the buffer layer from the core VM
@@ -730,6 +731,13 @@ repeat:
 			page_cache_release(page);
 			goto repeat;
 		}
+#if defined(CONFIG_MEMTRACE)
+		if(get_pg_trace_pid() != -1 && current->mem_trace) {
+			unsigned long pfn = page_to_pfn(page);
+			if(pfn_valid(pfn))
+				mark_memtrace_block_accessed(pfn << PAGE_SHIFT);
+		}
+#endif
 	}
 out:
 	rcu_read_unlock();
-- 
1.7.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 11+ messages in thread
- * [PATCH 5/5] Logging the captured reference data
  2011-07-05  8:22 [PATCH 0/5] mm,debug: VM framework to capture memory reference pattern Ankita Garg
                   ` (3 preceding siblings ...)
  2011-07-05  8:22 ` [PATCH 4/5] Capture references to page cache pages Ankita Garg
@ 2011-07-05  8:22 ` Ankita Garg
  2011-07-05 23:33 ` [PATCH 0/5] mm,debug: VM framework to capture memory reference pattern Valdis.Kletnieks
  2011-07-06  9:01 ` Andrew Morton
  6 siblings, 0 replies; 11+ messages in thread
From: Ankita Garg @ 2011-07-05  8:22 UTC (permalink / raw)
  To: linux-mm; +Cc: ankita, svaidy, linux-kernel
Hi,
This patch logs the reference data collected using the trace events
framework. To enable capturing the trace, insert the module and mount
debugfs.
# modprobe memref
# echo "memtrace:memtrace" > /debug/tracing/set_event
# echo 1 > /debug/tracing/events/memtrace/memtrace/enable
# echo 1 > /debug/tracing/tracing_on
# echo 1 > /debug/tracing/tracing_enable
# cat /debug/tracing/trace
#
#   TASK-PID    CPU#    TIMESTAMP  FUNCTION
#      | |       |          |         |
  memref-4402  [000]   250.274467: memtrace: 2115 6208 1
  memref-4402  [000]   250.274467: memtrace: 2115 6272 0
  memref-4402  [000]   250.274467: memtrace: 2115 6336 0
  memref-4402  [000]   250.274467: memtrace: 2115 6400 1
                                               |   |   |
                                               V   |   V
                                     sample number | whether referenced
                                                   | or not
                                                   V
                                        physical address of the
                                        start of the block in MB
The sample number is a monotonically increasing unique count associated
with a sample. The time stamp is that of trace printing, not of the access.
The entire access pattern for all blocks is logged at each interval (10ms default).
This data can be post-processed by scripts to generate the overall memory
reference pattern for a given amount of time. Temporal and spatial
reference pattern can be obtained.
This is a statistical sample where any number of references to a block
over the sampling interval is just marked as one.
Signed-off-by: Ankita Garg <ankita@in.ibm.com>
---
 include/trace/events/memtrace.h |   28 ++++++++++++++++++++++++++++
 lib/memtrace.c                  |    4 ++++
 2 files changed, 32 insertions(+), 0 deletions(-)
 create mode 100644 include/trace/events/memtrace.h
diff --git a/include/trace/events/memtrace.h b/include/trace/events/memtrace.h
new file mode 100644
index 0000000..8a6cdd6
--- /dev/null
+++ b/include/trace/events/memtrace.h
@@ -0,0 +1,28 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM memtrace
+
+#include <linux/tracepoint.h>
+
+#if !defined(_TRACE_MEMTRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MEMTRACE_H
+
+TRACE_EVENT(memtrace,
+	TP_PROTO(unsigned int seq, unsigned long base, unsigned long access_flag),
+	TP_ARGS(seq, base, access_flag),
+	TP_STRUCT__entry(
+		__field(	unsigned int ,	seq		)
+		__field(	unsigned long,	base		)
+		__field(	unsigned long,	access_flag	)
+	),
+	TP_fast_assign(
+		__entry->seq		= seq;
+		__entry->base		= base;
+		__entry->access_flag	= access_flag;
+	),
+	TP_printk("%u %lu %lu", __entry->seq, __entry->base, __entry->access_flag)
+);
+
+#endif /* _TRACE_MEMTRACE_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/lib/memtrace.c b/lib/memtrace.c
index aec5b65..e9cb967 100644
--- a/lib/memtrace.c
+++ b/lib/memtrace.c
@@ -3,6 +3,9 @@
 #include <linux/module.h>
 #include <linux/mm.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/memtrace.h>
+
 /* Trace Unique identifier */
 atomic_t trace_sequence_number;
 pid_t pg_trace_pid;
@@ -195,6 +198,7 @@ void update_and_log_data(void)
 		 *  Can modify to dump only blocks that have been marked
 		 *  accessed
 		 */
+		trace_memtrace(seq, base_addr, access_flag);
 		memtrace_block_accessed[i].access_flag = 0;
  	}
 
-- 
1.7.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 11+ messages in thread
- * Re: [PATCH 0/5] mm,debug: VM framework to capture memory reference pattern
  2011-07-05  8:22 [PATCH 0/5] mm,debug: VM framework to capture memory reference pattern Ankita Garg
                   ` (4 preceding siblings ...)
  2011-07-05  8:22 ` [PATCH 5/5] Logging the captured reference data Ankita Garg
@ 2011-07-05 23:33 ` Valdis.Kletnieks
  2011-07-06  4:27   ` Ankita Garg
  2011-07-06  9:01 ` Andrew Morton
  6 siblings, 1 reply; 11+ messages in thread
From: Valdis.Kletnieks @ 2011-07-05 23:33 UTC (permalink / raw)
  To: Ankita Garg; +Cc: linux-mm, svaidy, linux-kernel
[-- Attachment #1: Type: text/plain, Size: 585 bytes --]
On Tue, 05 Jul 2011 13:52:34 +0530, Ankita Garg said:
> by default) and scans through all pages of the specified tasks (including
> children/threads) running in the system. If the hardware reference bit in the
> page table is set, then the page is marked as accessed over the last sampling
> interval and the reference bit is cleared.
Does that cause any issues for other code in the mm subsystem that was
expecting to use the reference bit for something useful? (Similarly, if other
code in mm turns that bit *off* for its own reasons, does your code still
produce useful results?)
[-- Attachment #2: Type: application/pgp-signature, Size: 227 bytes --]
^ permalink raw reply	[flat|nested] 11+ messages in thread
- * Re: [PATCH 0/5] mm,debug: VM framework to capture memory reference pattern
  2011-07-05 23:33 ` [PATCH 0/5] mm,debug: VM framework to capture memory reference pattern Valdis.Kletnieks
@ 2011-07-06  4:27   ` Ankita Garg
  0 siblings, 0 replies; 11+ messages in thread
From: Ankita Garg @ 2011-07-06  4:27 UTC (permalink / raw)
  To: Valdis.Kletnieks; +Cc: linux-mm, svaidy, linux-kernel
Hi,
On Tue, Jul 05, 2011 at 07:33:24PM -0400, Valdis.Kletnieks@vt.edu wrote:
> On Tue, 05 Jul 2011 13:52:34 +0530, Ankita Garg said:
> 
> > by default) and scans through all pages of the specified tasks (including
> > children/threads) running in the system. If the hardware reference bit in the
> > page table is set, then the page is marked as accessed over the last sampling
> > interval and the reference bit is cleared.
> 
> Does that cause any issues for other code in the mm subsystem that was
> expecting to use the reference bit for something useful? (Similarly, if other
> code in mm turns that bit *off* for its own reasons, does your code still
> produce useful results?)
At this point, the VM code does not use the reference bit for any
decision making, not even in the LRU. However, if the reference bit is
used later on, then this change will interfere with that logic.
-- 
Regards,
Ankita Garg (ankita@in.ibm.com)
Linux Technology Center
IBM India Systems & Technology Labs,
Bangalore, India
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 11+ messages in thread 
 
- * Re: [PATCH 0/5] mm,debug: VM framework to capture memory reference pattern
  2011-07-05  8:22 [PATCH 0/5] mm,debug: VM framework to capture memory reference pattern Ankita Garg
                   ` (5 preceding siblings ...)
  2011-07-05 23:33 ` [PATCH 0/5] mm,debug: VM framework to capture memory reference pattern Valdis.Kletnieks
@ 2011-07-06  9:01 ` Andrew Morton
  2011-07-06  9:31   ` Ankita Garg
  6 siblings, 1 reply; 11+ messages in thread
From: Andrew Morton @ 2011-07-06  9:01 UTC (permalink / raw)
  To: Ankita Garg; +Cc: linux-mm, svaidy, linux-kernel, Matt Mackall
On Tue,  5 Jul 2011 13:52:34 +0530 Ankita Garg <ankita@in.ibm.com> wrote:
> 
> This patch series is an instrumentation/debug infrastructure that captures
> the memory reference pattern of applications (workloads). 
Can't the interfaces described in Documentation/vm/pagemap.txt be used
for this?
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 11+ messages in thread
- * Re: [PATCH 0/5] mm,debug: VM framework to capture memory reference pattern
  2011-07-06  9:01 ` Andrew Morton
@ 2011-07-06  9:31   ` Ankita Garg
  2011-07-06 13:36     ` Matt Mackall
  0 siblings, 1 reply; 11+ messages in thread
From: Ankita Garg @ 2011-07-06  9:31 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm, svaidy, linux-kernel, Matt Mackall
Hi,
On Wed, Jul 06, 2011 at 02:01:03AM -0700, Andrew Morton wrote:
> On Tue,  5 Jul 2011 13:52:34 +0530 Ankita Garg <ankita@in.ibm.com> wrote:
> 
> > 
> > This patch series is an instrumentation/debug infrastructure that captures
> > the memory reference pattern of applications (workloads). 
> 
> Can't the interfaces described in Documentation/vm/pagemap.txt be used
> for this?
The pagemap interface does not closely track the hardware reference bit
of the pages. The 'REFERENCED' flag maintained in /proc/kpageflags
only indicates whether the page has been referenced since the last LRU
list enqueue/requeue, so the rate at which a particular page of memory
is referenced cannot be estimated from it. Further, it does not provide
information on the amount of kernel memory referenced on behalf of
the process.
-- 
Regards,
Ankita Garg (ankita@in.ibm.com)
Linux Technology Center
IBM India Systems & Technology Labs,
Bangalore, India
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 11+ messages in thread 
- * Re: [PATCH 0/5] mm,debug: VM framework to capture memory reference pattern
  2011-07-06  9:31   ` Ankita Garg
@ 2011-07-06 13:36     ` Matt Mackall
  0 siblings, 0 replies; 11+ messages in thread
From: Matt Mackall @ 2011-07-06 13:36 UTC (permalink / raw)
  To: Ankita Garg; +Cc: Andrew Morton, linux-mm, svaidy, linux-kernel
On Wed, 2011-07-06 at 15:01 +0530, Ankita Garg wrote:
> Hi,
> 
> On Wed, Jul 06, 2011 at 02:01:03AM -0700, Andrew Morton wrote:
> > On Tue,  5 Jul 2011 13:52:34 +0530 Ankita Garg <ankita@in.ibm.com> wrote:
> > 
> > > 
> > > This patch series is an instrumentation/debug infrastructure that captures
> > > the memory reference pattern of applications (workloads). 
> > 
> > Can't the interfaces described in Documentation/vm/pagemap.txt be used
> > for this?
> 
> The pagemap interface does not closely track the hardware reference bit
> of the pages. The 'REFERENCED' flag maintained in /proc/kpageflags
> only indicates if the page has been referenced since last LRU list
> enqueue/requeue. So estimating the rate at which a particular page of
> memory is referenced cannot be obtained. Further, it does not provide
> information on the amount of kernel memory referenced on behalf of
> the process.
Pagemap is good for measuring state and bad for measuring activity.
Computing state from activity via integration is generally impossible
due to the constant term and the possibility of event buffer overruns:
 state = integral(activity, t1, t2) + C
Doing the reverse is also generally impossible as it means collecting
extremely large samples at an extremely high resolution to avoid missing
events:
 activity = derivative(state, t1, t2)
If you want to measure activity, you want a tracing framework. If you
want to measure state, you want an inspection framework. Trying to build
one from the other just won't work reliably.
-- 
Mathematics is the supreme nostalgia of our time.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 11+ messages in thread