diff for duplicates of <20190520035254.57579-7-minchan@kernel.org> diff --git a/a/1.txt b/N1/1.txt index 9881d58..2c65f35 100644 --- a/a/1.txt +++ b/N1/1.txt @@ -1,493 +1,73 @@ -Currently, process_madvise syscall works for only one address range -so user should call the syscall several times to give hints to -multiple address range. -This patch extends process_madvise syscall to support multiple -hints, address ranges and return vaules so user could give hints -all at once. - -struct pr_madvise_param { - int size; /* the size of this structure */ - const struct iovec __user *vec; /* address range array */ -} - -int process_madvise(int pidfd, ssize_t nr_elem, - int *behavior, - struct pr_madvise_param *results, - struct pr_madvise_param *ranges, - unsigned long flags); - -- pidfd - -target process fd - -- nr_elem - -the number of elemenent of array behavior, results, ranges - -- behavior - -hints for each address range in remote process so that user could -give different hints for each range. - -- results - -array of buffers to get results for associated remote address range -action. - -- ranges - -array to buffers to have remote process's address ranges to be -processed - -- flags - -extra argument for the future. It should be zero this moment. - -Example) - -struct pr_madvise_param { - int size; - const struct iovec *vec; -}; - -int main(int argc, char *argv[]) -{ - struct pr_madvise_param retp, rangep; - struct iovec result_vec[2], range_vec[2]; - int hints[2]; - long ret[2]; - void *addr[2]; - - pid_t pid; - char cmd[64] = {0,}; - addr[0] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE, - MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); - - if (MAP_FAILED == addr[0]) - return 1; - - addr[1] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE, - MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); - - if (MAP_FAILED == addr[1]) - return 1; - - hints[0] = MADV_COLD; - range_vec[0].iov_base = addr[0]; - range_vec[0].iov_len = ALLOC_SIZE; - result_vec[0].iov_base = &ret[0]; - result_vec[0].iov_len = sizeof(long); - retp.vec = result_vec; - retp.size = sizeof(struct pr_madvise_param); - - hints[1] = MADV_COOL; - range_vec[1].iov_base = addr[1]; - range_vec[1].iov_len = ALLOC_SIZE; - result_vec[1].iov_base = &ret[1]; - result_vec[1].iov_len = sizeof(long); - rangep.vec = range_vec; - rangep.size = sizeof(struct pr_madvise_param); - - pid = fork(); - if (!pid) { - sleep(10); - } else { - int pidfd = open(cmd, O_DIRECTORY | O_CLOEXEC); - if (pidfd < 0) - return 1; - - /* munmap to make pages private for the child */ - munmap(addr[0], ALLOC_SIZE); - munmap(addr[1], ALLOC_SIZE); - system("cat /proc/vmstat | egrep 'pswpout|deactivate'"); - if (syscall(__NR_process_madvise, pidfd, 2, behaviors, - &retp, &rangep, 0)) - perror("process_madvise fail\n"); - system("cat /proc/vmstat | egrep 'pswpout|deactivate'"); - } - - return 0; -} - -Signed-off-by: Minchan Kim <minchan@kernel.org> ---- - include/uapi/asm-generic/mman-common.h | 5 + - mm/madvise.c | 184 +++++++++++++++++++++---- - 2 files changed, 166 insertions(+), 23 deletions(-) - -diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h -index b9b51eeb8e1a..b8e230de84a6 100644 ---- a/include/uapi/asm-generic/mman-common.h -+++ b/include/uapi/asm-generic/mman-common.h -@@ -74,4 +74,9 @@ - #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ - PKEY_DISABLE_WRITE) - -+struct pr_madvise_param { -+ int size; /* the size of this structure */ -+ const struct iovec __user *vec; /* address range array */ -+}; -+ - #endif /* __ASM_GENERIC_MMAN_COMMON_H */ -diff --git a/mm/madvise.c b/mm/madvise.c -index af02aa17e5c1..f4f569dac2bd 100644 ---- a/mm/madvise.c -+++ b/mm/madvise.c -@@ -320,6 +320,7 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr, - struct page *page; - struct vm_area_struct *vma = walk->vma; - unsigned long next; -+ long nr_pages = 0; - - next = pmd_addr_end(addr, end); - if (pmd_trans_huge(*pmd)) { -@@ -380,9 +381,12 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr, - - ptep_test_and_clear_young(vma, addr, pte); - deactivate_page(page); -+ nr_pages++; -+ - } - - pte_unmap_unlock(orig_pte, ptl); -+ *(long *)walk->private += nr_pages; - cond_resched(); - - return 0; -@@ -390,11 +394,13 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr, - - static void madvise_cool_page_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, -- unsigned long addr, unsigned long end) -+ unsigned long addr, unsigned long end, -+ long *nr_pages) - { - struct mm_walk cool_walk = { - .pmd_entry = madvise_cool_pte_range, - .mm = vma->vm_mm, -+ .private = nr_pages - }; - - tlb_start_vma(tlb, vma); -@@ -403,7 +409,8 @@ static void madvise_cool_page_range(struct mmu_gather *tlb, - } - - static long madvise_cool(struct vm_area_struct *vma, -- unsigned long start_addr, unsigned long end_addr) -+ unsigned long start_addr, unsigned long end_addr, -+ long *nr_pages) - { - struct mm_struct *mm = vma->vm_mm; - struct mmu_gather tlb; -@@ -413,7 +420,7 @@ static long madvise_cool(struct vm_area_struct *vma, - - lru_add_drain(); - tlb_gather_mmu(&tlb, mm, start_addr, end_addr); -- madvise_cool_page_range(&tlb, vma, start_addr, end_addr); -+ madvise_cool_page_range(&tlb, vma, start_addr, end_addr, nr_pages); - tlb_finish_mmu(&tlb, start_addr, end_addr); - - return 0; -@@ -429,6 +436,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, - int isolated = 0; - struct vm_area_struct *vma = walk->vma; - unsigned long next; -+ long nr_pages = 0; - - next = pmd_addr_end(addr, end); - if (pmd_trans_huge(*pmd)) { -@@ -492,7 +500,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, - list_add(&page->lru, &page_list); - if (isolated >= SWAP_CLUSTER_MAX) { - pte_unmap_unlock(orig_pte, ptl); -- reclaim_pages(&page_list); -+ nr_pages += reclaim_pages(&page_list); - isolated = 0; - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - orig_pte = pte; -@@ -500,19 +508,22 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, - } - - pte_unmap_unlock(orig_pte, ptl); -- reclaim_pages(&page_list); -+ nr_pages += reclaim_pages(&page_list); - cond_resched(); - -+ *(long *)walk->private += nr_pages; - return 0; - } - - static void madvise_cold_page_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, -- unsigned long addr, unsigned long end) -+ unsigned long addr, unsigned long end, -+ long *nr_pages) - { - struct mm_walk warm_walk = { - .pmd_entry = madvise_cold_pte_range, - .mm = vma->vm_mm, -+ .private = nr_pages, - }; - - tlb_start_vma(tlb, vma); -@@ -522,7 +533,8 @@ static void madvise_cold_page_range(struct mmu_gather *tlb, - - - static long madvise_cold(struct vm_area_struct *vma, -- unsigned long start_addr, unsigned long end_addr) -+ unsigned long start_addr, unsigned long end_addr, -+ long *nr_pages) - { - struct mm_struct *mm = vma->vm_mm; - struct mmu_gather tlb; -@@ -532,7 +544,7 @@ static long madvise_cold(struct vm_area_struct *vma, - - lru_add_drain(); - tlb_gather_mmu(&tlb, mm, start_addr, end_addr); -- madvise_cold_page_range(&tlb, vma, start_addr, end_addr); -+ madvise_cold_page_range(&tlb, vma, start_addr, end_addr, nr_pages); - tlb_finish_mmu(&tlb, start_addr, end_addr); - - return 0; -@@ -922,7 +934,7 @@ static int madvise_inject_error(int behavior, - static long - madvise_vma(struct task_struct *tsk, struct vm_area_struct *vma, - struct vm_area_struct **prev, unsigned long start, -- unsigned long end, int behavior) -+ unsigned long end, int behavior, long *nr_pages) - { - switch (behavior) { - case MADV_REMOVE: -@@ -930,9 +942,9 @@ madvise_vma(struct task_struct *tsk, struct vm_area_struct *vma, - case MADV_WILLNEED: - return madvise_willneed(vma, prev, start, end); - case MADV_COOL: -- return madvise_cool(vma, start, end); -+ return madvise_cool(vma, start, end, nr_pages); - case MADV_COLD: -- return madvise_cold(vma, start, end); -+ return madvise_cold(vma, start, end, nr_pages); - case MADV_FREE: - case MADV_DONTNEED: - return madvise_dontneed_free(tsk, vma, prev, start, -@@ -981,7 +993,7 @@ madvise_behavior_valid(int behavior) - } - - static int madvise_core(struct task_struct *tsk, unsigned long start, -- size_t len_in, int behavior) -+ size_t len_in, int behavior, long *nr_pages) - { - unsigned long end, tmp; - struct vm_area_struct *vma, *prev; -@@ -996,6 +1008,7 @@ static int madvise_core(struct task_struct *tsk, unsigned long start, - - if (start & ~PAGE_MASK) - return error; -+ - len = (len_in + ~PAGE_MASK) & PAGE_MASK; - - /* Check to see whether len was rounded up from small -ve to zero */ -@@ -1035,6 +1048,8 @@ static int madvise_core(struct task_struct *tsk, unsigned long start, - blk_start_plug(&plug); - for (;;) { - /* Still start < end. */ -+ long pages = 0; -+ - error = -ENOMEM; - if (!vma) - goto out; -@@ -1053,9 +1068,11 @@ static int madvise_core(struct task_struct *tsk, unsigned long start, - tmp = end; - - /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ -- error = madvise_vma(tsk, vma, &prev, start, tmp, behavior); -+ error = madvise_vma(tsk, vma, &prev, start, tmp, -+ behavior, &pages); - if (error) - goto out; -+ *nr_pages += pages; - start = tmp; - if (prev && start < prev->vm_end) - start = prev->vm_end; -@@ -1140,26 +1157,137 @@ static int madvise_core(struct task_struct *tsk, unsigned long start, - */ - SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) - { -- return madvise_core(current, start, len_in, behavior); -+ unsigned long dummy; -+ -+ return madvise_core(current, start, len_in, behavior, &dummy); - } - --SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start, -- size_t, len_in, int, behavior) -+static int pr_madvise_copy_param(struct pr_madvise_param __user *u_param, -+ struct pr_madvise_param *param) -+{ -+ u32 size; -+ int ret; -+ -+ memset(param, 0, sizeof(*param)); -+ -+ ret = get_user(size, &u_param->size); -+ if (ret) -+ return ret; -+ -+ if (size > PAGE_SIZE) -+ return -E2BIG; -+ -+ if (!size || size > sizeof(struct pr_madvise_param)) -+ return -EINVAL; -+ -+ ret = copy_from_user(param, u_param, size); -+ if (ret) -+ return -EFAULT; -+ -+ return ret; -+} -+ -+static int process_madvise_core(struct task_struct *tsk, int *behaviors, -+ struct iov_iter *iter, -+ const struct iovec *range_vec, -+ unsigned long riovcnt, -+ unsigned long flags) -+{ -+ int i; -+ long err; -+ -+ for (err = 0, i = 0; i < riovcnt && iov_iter_count(iter); i++) { -+ long ret = 0; -+ -+ err = madvise_core(tsk, (unsigned long)range_vec[i].iov_base, -+ range_vec[i].iov_len, behaviors[i], -+ &ret); -+ if (err) -+ ret = err; -+ -+ if (copy_to_iter(&ret, sizeof(long), iter) != -+ sizeof(long)) { -+ err = -EFAULT; -+ break; -+ } -+ -+ err = 0; -+ } -+ -+ return err; -+} -+ -+SYSCALL_DEFINE6(process_madvise, int, pidfd, ssize_t, nr_elem, -+ const int __user *, hints, -+ struct pr_madvise_param __user *, results, -+ struct pr_madvise_param __user *, ranges, -+ unsigned long, flags) - { - int ret; - struct fd f; - struct pid *pid; - struct task_struct *tsk; - struct mm_struct *mm; -+ struct pr_madvise_param result_p, range_p; -+ const struct iovec __user *result_vec, __user *range_vec; -+ int *behaviors; -+ struct iovec iovstack_result[UIO_FASTIOV]; -+ struct iovec iovstack_r[UIO_FASTIOV]; -+ struct iovec *iov_l = iovstack_result; -+ struct iovec *iov_r = iovstack_r; -+ struct iov_iter iter; -+ -+ if (flags != 0) -+ return -EINVAL; -+ -+ ret = pr_madvise_copy_param(results, &result_p); -+ if (ret) -+ return ret; -+ -+ ret = pr_madvise_copy_param(ranges, &range_p); -+ if (ret) -+ return ret; -+ -+ result_vec = result_p.vec; -+ range_vec = range_p.vec; -+ -+ if (result_p.size != sizeof(struct pr_madvise_param) || -+ range_p.size != sizeof(struct pr_madvise_param)) -+ return -EINVAL; -+ -+ behaviors = kmalloc_array(nr_elem, sizeof(int), GFP_KERNEL); -+ if (!behaviors) -+ return -ENOMEM; -+ -+ ret = copy_from_user(behaviors, hints, sizeof(int) * nr_elem); -+ if (ret < 0) -+ goto free_behavior_vec; -+ -+ ret = import_iovec(READ, result_vec, nr_elem, UIO_FASTIOV, -+ &iov_l, &iter); -+ if (ret < 0) -+ goto free_behavior_vec; -+ -+ if (!iov_iter_count(&iter)) { -+ ret = -EINVAL; -+ goto free_iovecs; -+ } -+ -+ ret = rw_copy_check_uvector(CHECK_IOVEC_ONLY, range_vec, nr_elem, -+ UIO_FASTIOV, iovstack_r, &iov_r); -+ if (ret <= 0) -+ goto free_iovecs; - - f = fdget(pidfd); -- if (!f.file) -- return -EBADF; -+ if (!f.file) { -+ ret = -EBADF; -+ goto free_iovecs; -+ } - - pid = pidfd_to_pid(f.file); - if (IS_ERR(pid)) { - ret = PTR_ERR(pid); -- goto err; -+ goto put_fd; - } - - ret = -EINVAL; -@@ -1167,7 +1295,7 @@ SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start, - tsk = pid_task(pid, PIDTYPE_PID); - if (!tsk) { - rcu_read_unlock(); -- goto err; -+ goto put_fd; - } - get_task_struct(tsk); - rcu_read_unlock(); -@@ -1176,12 +1304,22 @@ SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start, - ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; - if (ret == -EACCES) - ret = -EPERM; -- goto err; -+ goto put_task; - } -- ret = madvise_core(tsk, start, len_in, behavior); -+ -+ ret = process_madvise_core(tsk, behaviors, &iter, iov_r, -+ nr_elem, flags); - mmput(mm); -+put_task: - put_task_struct(tsk); --err: -+put_fd: - fdput(f); -+free_iovecs: -+ if (iov_r != iovstack_r) -+ kfree(iov_r); -+ kfree(iov_l); -+free_behavior_vec: -+ kfree(behaviors); -+ - return ret; - } --- -2.21.0.1020.gf2820cf01a-goog +On Mon, 20 May 2019 12:52:53 +0900 Minchan Kim wrote: +> Example) +> +Better if the following stuff is stored somewhere under the +tools/testing directory. + +BR +Hillf + +> struct pr_madvise_param { +> int size; +> const struct iovec *vec; +> }; +> +> int main(int argc, char *argv[]) +> { +> struct pr_madvise_param retp, rangep; +> struct iovec result_vec[2], range_vec[2]; +> int hints[2]; +> long ret[2]; +> void *addr[2]; +> +> pid_t pid; +> char cmd[64] = {0,}; +> addr[0] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE, +> MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); +> +> if (MAP_FAILED == addr[0]) +> return 1; +> +> addr[1] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE, +> MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); +> +> if (MAP_FAILED == addr[1]) +> return 1; +> +> hints[0] = MADV_COLD; +> range_vec[0].iov_base = addr[0]; +> range_vec[0].iov_len = ALLOC_SIZE; +> result_vec[0].iov_base = &ret[0]; +> result_vec[0].iov_len = sizeof(long); +> retp.vec = result_vec; +> retp.size = sizeof(struct pr_madvise_param); +> +> hints[1] = MADV_COOL; +> range_vec[1].iov_base = addr[1]; +> range_vec[1].iov_len = ALLOC_SIZE; +> result_vec[1].iov_base = &ret[1]; +> result_vec[1].iov_len = sizeof(long); +> rangep.vec = range_vec; +> rangep.size = sizeof(struct pr_madvise_param); +> +> pid = fork(); +> if (!pid) { +> sleep(10); +> } else { +> int pidfd = open(cmd, O_DIRECTORY | O_CLOEXEC); +> if (pidfd < 0) +> return 1; +> +> /* munmap to make pages private for the child */ +> munmap(addr[0], ALLOC_SIZE); +> munmap(addr[1], ALLOC_SIZE); +> system("cat /proc/vmstat | egrep 'pswpout|deactivate'"); +> if (syscall(__NR_process_madvise, pidfd, 2, behaviors, +> &retp, &rangep, 0)) +> perror("process_madvise fail\n"); +> system("cat /proc/vmstat | egrep 'pswpout|deactivate'"); +> } +> +> return 0; +> } diff --git a/a/content_digest b/N1/content_digest index 8f29afc..6639c55 100644 --- a/a/content_digest +++ b/N1/content_digest @@ -1,9 +1,10 @@ "ref\020190520035254.57579-1-minchan@kernel.org\0" - "From\0Minchan Kim <minchan@kernel.org>\0" - "Subject\0[RFC 6/7] mm: extend process_madvise syscall to support vector arrary\0" - "Date\0Mon, 20 May 2019 12:52:53 +0900\0" - "To\0Andrew Morton <akpm@linux-foundation.org>\0" - "Cc\0LKML <linux-kernel@vger.kernel.org>" + "From\0Hillf Danton <hdanton@sina.com>\0" + "Subject\0Re: [RFC 6/7] mm: extend process_madvise syscall to support vector arrary\0" + "Date\0Wed, 29 May 2019 12:14:47 +0800\0" + "To\0Minchan Kim <minchan@kernel.org>\0" + "Cc\0Andrew Morton <akpm@linux-foundation.org>" + LKML <linux-kernel@vger.kernel.org> linux-mm <linux-mm@kvack.org> Michal Hocko <mhocko@suse.com> Johannes Weiner <hannes@cmpxchg.org> @@ -13,502 +14,81 @@ Daniel Colascione <dancol@google.com> Shakeel Butt <shakeelb@google.com> Sonny Rao <sonnyrao@google.com> - Brian Geffon <bgeffon@google.com> - " Minchan Kim <minchan@kernel.org>\0" + " Brian Geffon <bgeffon@google.com>\0" "\00:1\0" "b\0" - "Currently, process_madvise syscall works for only one address range\n" - "so user should call the syscall several times to give hints to\n" - "multiple address range.\n" "\n" - "This patch extends process_madvise syscall to support multiple\n" - "hints, address ranges and return vaules so user could give hints\n" - "all at once.\n" - "\n" - "struct pr_madvise_param {\n" - " int size; /* the size of this structure */\n" - " const struct iovec __user *vec; /* address range array */\n" - "}\n" - "\n" - "int process_madvise(int pidfd, ssize_t nr_elem,\n" - "\t\t int *behavior,\n" - "\t\t struct pr_madvise_param *results,\n" - "\t\t struct pr_madvise_param *ranges,\n" - "\t\t unsigned long flags);\n" - "\n" - "- pidfd\n" - "\n" - "target process fd\n" - "\n" - "- nr_elem\n" - "\n" - "the number of elemenent of array behavior, results, ranges\n" - "\n" - "- behavior\n" - "\n" - "hints for each address range in remote process so that user could\n" - "give different hints for each range.\n" - "\n" - "- results\n" - "\n" - "array of buffers to get results for associated remote address range\n" - "action.\n" - "\n" - "- ranges\n" - "\n" - "array to buffers to have remote process's address ranges to be\n" - "processed\n" - "\n" - "- flags\n" - "\n" - "extra argument for the future. It should be zero this moment.\n" - "\n" - "Example)\n" - "\n" - "struct pr_madvise_param {\n" - " int size;\n" - " const struct iovec *vec;\n" - "};\n" - "\n" - "int main(int argc, char *argv[])\n" - "{\n" - " struct pr_madvise_param retp, rangep;\n" - " struct iovec result_vec[2], range_vec[2];\n" - " int hints[2];\n" - " long ret[2];\n" - " void *addr[2];\n" - "\n" - " pid_t pid;\n" - " char cmd[64] = {0,};\n" - " addr[0] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE,\n" - " MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);\n" - "\n" - " if (MAP_FAILED == addr[0])\n" - " return 1;\n" - "\n" - " addr[1] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE,\n" - " MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);\n" - "\n" - " if (MAP_FAILED == addr[1])\n" - " return 1;\n" - "\n" - " hints[0] = MADV_COLD;\n" - "\trange_vec[0].iov_base = addr[0];\n" - " range_vec[0].iov_len = ALLOC_SIZE;\n" - " result_vec[0].iov_base = &ret[0];\n" - " result_vec[0].iov_len = sizeof(long);\n" - "\tretp.vec = result_vec;\n" - " retp.size = sizeof(struct pr_madvise_param);\n" - "\n" - " hints[1] = MADV_COOL;\n" - " range_vec[1].iov_base = addr[1];\n" - " range_vec[1].iov_len = ALLOC_SIZE;\n" - " result_vec[1].iov_base = &ret[1];\n" - " result_vec[1].iov_len = sizeof(long);\n" - " rangep.vec = range_vec;\n" - " rangep.size = sizeof(struct pr_madvise_param);\n" - "\n" - " pid = fork();\n" - " if (!pid) {\n" - " sleep(10);\n" - " } else {\n" - " int pidfd = open(cmd, O_DIRECTORY | O_CLOEXEC);\n" - " if (pidfd < 0)\n" - " return 1;\n" - "\n" - " /* munmap to make pages private for the child */\n" - " munmap(addr[0], ALLOC_SIZE);\n" - " munmap(addr[1], ALLOC_SIZE);\n" - " system(\"cat /proc/vmstat | egrep 'pswpout|deactivate'\");\n" - " if (syscall(__NR_process_madvise, pidfd, 2, behaviors,\n" - "\t\t\t\t\t\t&retp, &rangep, 0))\n" - " perror(\"process_madvise fail\\n\");\n" - " system(\"cat /proc/vmstat | egrep 'pswpout|deactivate'\");\n" - " }\n" - "\n" - " return 0;\n" - "}\n" - "\n" - "Signed-off-by: Minchan Kim <minchan@kernel.org>\n" - "---\n" - " include/uapi/asm-generic/mman-common.h | 5 +\n" - " mm/madvise.c | 184 +++++++++++++++++++++----\n" - " 2 files changed, 166 insertions(+), 23 deletions(-)\n" - "\n" - "diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h\n" - "index b9b51eeb8e1a..b8e230de84a6 100644\n" - "--- a/include/uapi/asm-generic/mman-common.h\n" - "+++ b/include/uapi/asm-generic/mman-common.h\n" - "@@ -74,4 +74,9 @@\n" - " #define PKEY_ACCESS_MASK\t(PKEY_DISABLE_ACCESS |\\\n" - " \t\t\t\t PKEY_DISABLE_WRITE)\n" - " \n" - "+struct pr_madvise_param {\n" - "+\tint size;\t\t\t/* the size of this structure */\n" - "+\tconst struct iovec __user *vec;\t/* address range array */\n" - "+};\n" - "+\n" - " #endif /* __ASM_GENERIC_MMAN_COMMON_H */\n" - "diff --git a/mm/madvise.c b/mm/madvise.c\n" - "index af02aa17e5c1..f4f569dac2bd 100644\n" - "--- a/mm/madvise.c\n" - "+++ b/mm/madvise.c\n" - "@@ -320,6 +320,7 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,\n" - " \tstruct page *page;\n" - " \tstruct vm_area_struct *vma = walk->vma;\n" - " \tunsigned long next;\n" - "+\tlong nr_pages = 0;\n" - " \n" - " \tnext = pmd_addr_end(addr, end);\n" - " \tif (pmd_trans_huge(*pmd)) {\n" - "@@ -380,9 +381,12 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,\n" - " \n" - " \t\tptep_test_and_clear_young(vma, addr, pte);\n" - " \t\tdeactivate_page(page);\n" - "+\t\tnr_pages++;\n" - "+\n" - " \t}\n" - " \n" - " \tpte_unmap_unlock(orig_pte, ptl);\n" - "+\t*(long *)walk->private += nr_pages;\n" - " \tcond_resched();\n" - " \n" - " \treturn 0;\n" - "@@ -390,11 +394,13 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,\n" - " \n" - " static void madvise_cool_page_range(struct mmu_gather *tlb,\n" - " \t\t\t struct vm_area_struct *vma,\n" - "-\t\t\t unsigned long addr, unsigned long end)\n" - "+\t\t\t unsigned long addr, unsigned long end,\n" - "+\t\t\t long *nr_pages)\n" - " {\n" - " \tstruct mm_walk cool_walk = {\n" - " \t\t.pmd_entry = madvise_cool_pte_range,\n" - " \t\t.mm = vma->vm_mm,\n" - "+\t\t.private = nr_pages\n" - " \t};\n" - " \n" - " \ttlb_start_vma(tlb, vma);\n" - "@@ -403,7 +409,8 @@ static void madvise_cool_page_range(struct mmu_gather *tlb,\n" - " }\n" - " \n" - " static long madvise_cool(struct vm_area_struct *vma,\n" - "-\t\t\tunsigned long start_addr, unsigned long end_addr)\n" - "+\t\t\tunsigned long start_addr, unsigned long end_addr,\n" - "+\t\t\tlong *nr_pages)\n" - " {\n" - " \tstruct mm_struct *mm = vma->vm_mm;\n" - " \tstruct mmu_gather tlb;\n" - "@@ -413,7 +420,7 @@ static long madvise_cool(struct vm_area_struct *vma,\n" - " \n" - " \tlru_add_drain();\n" - " \ttlb_gather_mmu(&tlb, mm, start_addr, end_addr);\n" - "-\tmadvise_cool_page_range(&tlb, vma, start_addr, end_addr);\n" - "+\tmadvise_cool_page_range(&tlb, vma, start_addr, end_addr, nr_pages);\n" - " \ttlb_finish_mmu(&tlb, start_addr, end_addr);\n" - " \n" - " \treturn 0;\n" - "@@ -429,6 +436,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,\n" - " \tint isolated = 0;\n" - " \tstruct vm_area_struct *vma = walk->vma;\n" - " \tunsigned long next;\n" - "+\tlong nr_pages = 0;\n" - " \n" - " \tnext = pmd_addr_end(addr, end);\n" - " \tif (pmd_trans_huge(*pmd)) {\n" - "@@ -492,7 +500,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,\n" - " \t\tlist_add(&page->lru, &page_list);\n" - " \t\tif (isolated >= SWAP_CLUSTER_MAX) {\n" - " \t\t\tpte_unmap_unlock(orig_pte, ptl);\n" - "-\t\t\treclaim_pages(&page_list);\n" - "+\t\t\tnr_pages += reclaim_pages(&page_list);\n" - " \t\t\tisolated = 0;\n" - " \t\t\tpte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);\n" - " \t\t\torig_pte = pte;\n" - "@@ -500,19 +508,22 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,\n" - " \t}\n" - " \n" - " \tpte_unmap_unlock(orig_pte, ptl);\n" - "-\treclaim_pages(&page_list);\n" - "+\tnr_pages += reclaim_pages(&page_list);\n" - " \tcond_resched();\n" - " \n" - "+\t*(long *)walk->private += nr_pages;\n" - " \treturn 0;\n" - " }\n" - " \n" - " static void madvise_cold_page_range(struct mmu_gather *tlb,\n" - " \t\t\t struct vm_area_struct *vma,\n" - "-\t\t\t unsigned long addr, unsigned long end)\n" - "+\t\t\t unsigned long addr, unsigned long end,\n" - "+\t\t\t long *nr_pages)\n" - " {\n" - " \tstruct mm_walk warm_walk = {\n" - " \t\t.pmd_entry = madvise_cold_pte_range,\n" - " \t\t.mm = vma->vm_mm,\n" - "+\t\t.private = nr_pages,\n" - " \t};\n" - " \n" - " \ttlb_start_vma(tlb, vma);\n" - "@@ -522,7 +533,8 @@ static void madvise_cold_page_range(struct mmu_gather *tlb,\n" - " \n" - " \n" - " static long madvise_cold(struct vm_area_struct *vma,\n" - "-\t\t\tunsigned long start_addr, unsigned long end_addr)\n" - "+\t\t\tunsigned long start_addr, unsigned long end_addr,\n" - "+\t\t\tlong *nr_pages)\n" - " {\n" - " \tstruct mm_struct *mm = vma->vm_mm;\n" - " \tstruct mmu_gather tlb;\n" - "@@ -532,7 +544,7 @@ static long madvise_cold(struct vm_area_struct *vma,\n" - " \n" - " \tlru_add_drain();\n" - " \ttlb_gather_mmu(&tlb, mm, start_addr, end_addr);\n" - "-\tmadvise_cold_page_range(&tlb, vma, start_addr, end_addr);\n" - "+\tmadvise_cold_page_range(&tlb, vma, start_addr, end_addr, nr_pages);\n" - " \ttlb_finish_mmu(&tlb, start_addr, end_addr);\n" - " \n" - " \treturn 0;\n" - "@@ -922,7 +934,7 @@ static int madvise_inject_error(int behavior,\n" - " static long\n" - " madvise_vma(struct task_struct *tsk, struct vm_area_struct *vma,\n" - " \t\tstruct vm_area_struct **prev, unsigned long start,\n" - "-\t\tunsigned long end, int behavior)\n" - "+\t\tunsigned long end, int behavior, long *nr_pages)\n" - " {\n" - " \tswitch (behavior) {\n" - " \tcase MADV_REMOVE:\n" - "@@ -930,9 +942,9 @@ madvise_vma(struct task_struct *tsk, struct vm_area_struct *vma,\n" - " \tcase MADV_WILLNEED:\n" - " \t\treturn madvise_willneed(vma, prev, start, end);\n" - " \tcase MADV_COOL:\n" - "-\t\treturn madvise_cool(vma, start, end);\n" - "+\t\treturn madvise_cool(vma, start, end, nr_pages);\n" - " \tcase MADV_COLD:\n" - "-\t\treturn madvise_cold(vma, start, end);\n" - "+\t\treturn madvise_cold(vma, start, end, nr_pages);\n" - " \tcase MADV_FREE:\n" - " \tcase MADV_DONTNEED:\n" - " \t\treturn madvise_dontneed_free(tsk, vma, prev, start,\n" - "@@ -981,7 +993,7 @@ madvise_behavior_valid(int behavior)\n" - " }\n" - " \n" - " static int madvise_core(struct task_struct *tsk, unsigned long start,\n" - "-\t\t\tsize_t len_in, int behavior)\n" - "+\t\t\tsize_t len_in, int behavior, long *nr_pages)\n" - " {\n" - " \tunsigned long end, tmp;\n" - " \tstruct vm_area_struct *vma, *prev;\n" - "@@ -996,6 +1008,7 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,\n" - " \n" - " \tif (start & ~PAGE_MASK)\n" - " \t\treturn error;\n" - "+\n" - " \tlen = (len_in + ~PAGE_MASK) & PAGE_MASK;\n" - " \n" - " \t/* Check to see whether len was rounded up from small -ve to zero */\n" - "@@ -1035,6 +1048,8 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,\n" - " \tblk_start_plug(&plug);\n" - " \tfor (;;) {\n" - " \t\t/* Still start < end. */\n" - "+\t\tlong pages = 0;\n" - "+\n" - " \t\terror = -ENOMEM;\n" - " \t\tif (!vma)\n" - " \t\t\tgoto out;\n" - "@@ -1053,9 +1068,11 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,\n" - " \t\t\ttmp = end;\n" - " \n" - " \t\t/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */\n" - "-\t\terror = madvise_vma(tsk, vma, &prev, start, tmp, behavior);\n" - "+\t\terror = madvise_vma(tsk, vma, &prev, start, tmp,\n" - "+\t\t\t\t\tbehavior, &pages);\n" - " \t\tif (error)\n" - " \t\t\tgoto out;\n" - "+\t\t*nr_pages += pages;\n" - " \t\tstart = tmp;\n" - " \t\tif (prev && start < prev->vm_end)\n" - " \t\t\tstart = prev->vm_end;\n" - "@@ -1140,26 +1157,137 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,\n" - " */\n" - " SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)\n" - " {\n" - "-\treturn madvise_core(current, start, len_in, behavior);\n" - "+\tunsigned long dummy;\n" - "+\n" - "+\treturn madvise_core(current, start, len_in, behavior, &dummy);\n" - " }\n" - " \n" - "-SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,\n" - "-\t\tsize_t, len_in, int, behavior)\n" - "+static int pr_madvise_copy_param(struct pr_madvise_param __user *u_param,\n" - "+\t\tstruct pr_madvise_param *param)\n" - "+{\n" - "+\tu32 size;\n" - "+\tint ret;\n" - "+\n" - "+\tmemset(param, 0, sizeof(*param));\n" - "+\n" - "+\tret = get_user(size, &u_param->size);\n" - "+\tif (ret)\n" - "+\t\treturn ret;\n" - "+\n" - "+\tif (size > PAGE_SIZE)\n" - "+\t\treturn -E2BIG;\n" - "+\n" - "+\tif (!size || size > sizeof(struct pr_madvise_param))\n" - "+\t\treturn -EINVAL;\n" - "+\n" - "+\tret = copy_from_user(param, u_param, size);\n" - "+\tif (ret)\n" - "+\t\treturn -EFAULT;\n" - "+\n" - "+\treturn ret;\n" - "+}\n" - "+\n" - "+static int process_madvise_core(struct task_struct *tsk, int *behaviors,\n" - "+\t\t\t\tstruct iov_iter *iter,\n" - "+\t\t\t\tconst struct iovec *range_vec,\n" - "+\t\t\t\tunsigned long riovcnt,\n" - "+\t\t\t\tunsigned long flags)\n" - "+{\n" - "+\tint i;\n" - "+\tlong err;\n" - "+\n" - "+\tfor (err = 0, i = 0; i < riovcnt && iov_iter_count(iter); i++) {\n" - "+\t\tlong ret = 0;\n" - "+\n" - "+\t\terr = madvise_core(tsk, (unsigned long)range_vec[i].iov_base,\n" - "+\t\t\t\trange_vec[i].iov_len, behaviors[i],\n" - "+\t\t\t\t&ret);\n" - "+\t\tif (err)\n" - "+\t\t\tret = err;\n" - "+\n" - "+\t\tif (copy_to_iter(&ret, sizeof(long), iter) !=\n" - "+\t\t\t\tsizeof(long)) {\n" - "+\t\t\terr = -EFAULT;\n" - "+\t\t\tbreak;\n" - "+\t\t}\n" - "+\n" - "+\t\terr = 0;\n" - "+\t}\n" - "+\n" - "+\treturn err;\n" - "+}\n" - "+\n" - "+SYSCALL_DEFINE6(process_madvise, int, pidfd, ssize_t, nr_elem,\n" - "+\t\t\tconst int __user *, hints,\n" - "+\t\t\tstruct pr_madvise_param __user *, results,\n" - "+\t\t\tstruct pr_madvise_param __user *, ranges,\n" - "+\t\t\tunsigned long, flags)\n" - " {\n" - " \tint ret;\n" - " \tstruct fd f;\n" - " \tstruct pid *pid;\n" - " \tstruct task_struct *tsk;\n" - " \tstruct mm_struct *mm;\n" - "+\tstruct pr_madvise_param result_p, range_p;\n" - "+\tconst struct iovec __user *result_vec, __user *range_vec;\n" - "+\tint *behaviors;\n" - "+\tstruct iovec iovstack_result[UIO_FASTIOV];\n" - "+\tstruct iovec iovstack_r[UIO_FASTIOV];\n" - "+\tstruct iovec *iov_l = iovstack_result;\n" - "+\tstruct iovec *iov_r = iovstack_r;\n" - "+\tstruct iov_iter iter;\n" - "+\n" - "+\tif (flags != 0)\n" - "+\t\treturn -EINVAL;\n" - "+\n" - "+\tret = pr_madvise_copy_param(results, &result_p);\n" - "+\tif (ret)\n" - "+\t\treturn ret;\n" - "+\n" - "+\tret = pr_madvise_copy_param(ranges, &range_p);\n" - "+\tif (ret)\n" - "+\t\treturn ret;\n" - "+\n" - "+\tresult_vec = result_p.vec;\n" - "+\trange_vec = range_p.vec;\n" - "+\n" - "+\tif (result_p.size != sizeof(struct pr_madvise_param) ||\n" - "+\t\t\trange_p.size != sizeof(struct pr_madvise_param))\n" - "+\t\treturn -EINVAL;\n" - "+\n" - "+\tbehaviors = kmalloc_array(nr_elem, sizeof(int), GFP_KERNEL);\n" - "+\tif (!behaviors)\n" - "+\t\treturn -ENOMEM;\n" - "+\n" - "+\tret = copy_from_user(behaviors, hints, sizeof(int) * nr_elem);\n" - "+\tif (ret < 0)\n" - "+\t\tgoto free_behavior_vec;\n" - "+\n" - "+\tret = import_iovec(READ, result_vec, nr_elem, UIO_FASTIOV,\n" - "+\t\t\t\t&iov_l, &iter);\n" - "+\tif (ret < 0)\n" - "+\t\tgoto free_behavior_vec;\n" - "+\n" - "+\tif (!iov_iter_count(&iter)) {\n" - "+\t\tret = -EINVAL;\n" - "+\t\tgoto free_iovecs;\n" - "+\t}\n" - "+\n" - "+\tret = rw_copy_check_uvector(CHECK_IOVEC_ONLY, range_vec, nr_elem,\n" - "+\t\t\t\tUIO_FASTIOV, iovstack_r, &iov_r);\n" - "+\tif (ret <= 0)\n" - "+\t\tgoto free_iovecs;\n" - " \n" - " \tf = fdget(pidfd);\n" - "-\tif (!f.file)\n" - "-\t\treturn -EBADF;\n" - "+\tif (!f.file) {\n" - "+\t\tret = -EBADF;\n" - "+\t\tgoto free_iovecs;\n" - "+\t}\n" - " \n" - " \tpid = pidfd_to_pid(f.file);\n" - " \tif (IS_ERR(pid)) {\n" - " \t\tret = PTR_ERR(pid);\n" - "-\t\tgoto err;\n" - "+\t\tgoto put_fd;\n" - " \t}\n" - " \n" - " \tret = -EINVAL;\n" - "@@ -1167,7 +1295,7 @@ SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,\n" - " \ttsk = pid_task(pid, PIDTYPE_PID);\n" - " \tif (!tsk) {\n" - " \t\trcu_read_unlock();\n" - "-\t\tgoto err;\n" - "+\t\tgoto put_fd;\n" - " \t}\n" - " \tget_task_struct(tsk);\n" - " \trcu_read_unlock();\n" - "@@ -1176,12 +1304,22 @@ SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,\n" - " \t\tret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;\n" - " \t\tif (ret == -EACCES)\n" - " \t\t\tret = -EPERM;\n" - "-\t\tgoto err;\n" - "+\t\tgoto put_task;\n" - " \t}\n" - "-\tret = madvise_core(tsk, start, len_in, behavior);\n" - "+\n" - "+\tret = process_madvise_core(tsk, behaviors, &iter, iov_r,\n" - "+\t\t\t\t\tnr_elem, flags);\n" - " \tmmput(mm);\n" - "+put_task:\n" - " \tput_task_struct(tsk);\n" - "-err:\n" - "+put_fd:\n" - " \tfdput(f);\n" - "+free_iovecs:\n" - "+\tif (iov_r != iovstack_r)\n" - "+\t\tkfree(iov_r);\n" - "+\tkfree(iov_l);\n" - "+free_behavior_vec:\n" - "+\tkfree(behaviors);\n" - "+\n" - " \treturn ret;\n" - " }\n" - "-- \n" - 2.21.0.1020.gf2820cf01a-goog + "On Mon, 20 May 2019 12:52:53 +0900 Minchan Kim wrote:\n" + "> Example)\n" + "> \n" + "Better if the following stuff is stored somewhere under the\n" + "tools/testing directory.\n" + "\n" + "BR\n" + "Hillf\n" + "\n" + "> struct pr_madvise_param {\n" + "> int size;\n" + "> const struct iovec *vec;\n" + "> };\n" + "> \n" + "> int main(int argc, char *argv[])\n" + "> {\n" + "> struct pr_madvise_param retp, rangep;\n" + "> struct iovec result_vec[2], range_vec[2];\n" + "> int hints[2];\n" + "> long ret[2];\n" + "> void *addr[2];\n" + "> \n" + "> pid_t pid;\n" + "> char cmd[64] = {0,};\n" + "> addr[0] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE,\n" + "> MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);\n" + "> \n" + "> if (MAP_FAILED == addr[0])\n" + "> return 1;\n" + "> \n" + "> addr[1] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE,\n" + "> MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);\n" + "> \n" + "> if (MAP_FAILED == addr[1])\n" + "> return 1;\n" + "> \n" + "> hints[0] = MADV_COLD;\n" + "> \trange_vec[0].iov_base = addr[0];\n" + "> range_vec[0].iov_len = ALLOC_SIZE;\n" + "> result_vec[0].iov_base = &ret[0];\n" + "> result_vec[0].iov_len = sizeof(long);\n" + "> \tretp.vec = result_vec;\n" + "> retp.size = sizeof(struct pr_madvise_param);\n" + "> \n" + "> hints[1] = MADV_COOL;\n" + "> range_vec[1].iov_base = addr[1];\n" + "> range_vec[1].iov_len = ALLOC_SIZE;\n" + "> result_vec[1].iov_base = &ret[1];\n" + "> result_vec[1].iov_len = sizeof(long);\n" + "> rangep.vec = range_vec;\n" + "> rangep.size = sizeof(struct pr_madvise_param);\n" + "> \n" + "> pid = fork();\n" + "> if (!pid) {\n" + "> sleep(10);\n" + "> } else {\n" + "> int pidfd = open(cmd, O_DIRECTORY | O_CLOEXEC);\n" + "> if (pidfd < 0)\n" + "> return 1;\n" + "> \n" + "> /* munmap to make pages private for the child */\n" + "> munmap(addr[0], ALLOC_SIZE);\n" + "> munmap(addr[1], ALLOC_SIZE);\n" + "> system(\"cat /proc/vmstat | egrep 'pswpout|deactivate'\");\n" + "> if (syscall(__NR_process_madvise, pidfd, 2, behaviors,\n" + "> \t\t\t\t\t\t&retp, &rangep, 0))\n" + "> perror(\"process_madvise fail\\n\");\n" + "> system(\"cat /proc/vmstat | egrep 'pswpout|deactivate'\");\n" + "> }\n" + "> \n" + "> return 0;\n" + > } -ee9405a002a07ac8f10b398c03c8e6d49fae511e9110bc1631189bcbd411b707 +5c7cc622cd2a2f689e0483310fe7aa50008f65725291bef5f5bc0d94d65f1344
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.