Linux userland API discussions
 help / color / mirror / Atom feed
* [PATCH v3 3/5] mm: account nr_isolated_xxx in [isolate|putback]_lru_page
From: Minchan Kim @ 2019-06-27 11:54 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, LKML, linux-api, Michal Hocko, Johannes Weiner,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
	Kirill A . Shutemov, Minchan Kim
In-Reply-To: <20190627115405.255259-1-minchan@kernel.org>

The isolate counting is pecpu counter so it would be not huge gain
to work them by batch. Rather than complicating to make them batch,
let's make it more stright-foward via adding the counting logic
into [isolate|putback]_lru_page API.

* v1
 * fix accounting bug - Hillf

Link: http://lkml.kernel.org/r/20190531165927.GA20067@cmpxchg.org
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 mm/compaction.c     |  2 --
 mm/gup.c            |  7 +------
 mm/khugepaged.c     |  3 ---
 mm/memory-failure.c |  3 ---
 mm/memory_hotplug.c |  4 ----
 mm/mempolicy.c      |  6 +-----
 mm/migrate.c        | 37 ++++++++-----------------------------
 mm/vmscan.c         | 22 ++++++++++++++++------
 8 files changed, 26 insertions(+), 58 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 9e1b9acb116b..c6591682deda 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -982,8 +982,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 
 		/* Successfully isolated */
 		del_page_from_lru_list(page, lruvec, page_lru(page));
-		inc_node_page_state(page,
-				NR_ISOLATED_ANON + page_is_file_cache(page));
 
 isolate_success:
 		list_add(&page->lru, &cc->migratepages);
diff --git a/mm/gup.c b/mm/gup.c
index 7dde2e3a1963..aec3a2b7e61b 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1473,13 +1473,8 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk,
 					drain_allow = false;
 				}
 
-				if (!isolate_lru_page(head)) {
+				if (!isolate_lru_page(head))
 					list_add_tail(&head->lru, &cma_page_list);
-					mod_node_page_state(page_pgdat(head),
-							    NR_ISOLATED_ANON +
-							    page_is_file_cache(head),
-							    hpage_nr_pages(head));
-				}
 			}
 		}
 	}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 0f7419938008..7da34e198ec5 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -503,7 +503,6 @@ void __khugepaged_exit(struct mm_struct *mm)
 
 static void release_pte_page(struct page *page)
 {
-	dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
 	unlock_page(page);
 	putback_lru_page(page);
 }
@@ -602,8 +601,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 			result = SCAN_DEL_PAGE_LRU;
 			goto out;
 		}
-		inc_node_page_state(page,
-				NR_ISOLATED_ANON + page_is_file_cache(page));
 		VM_BUG_ON_PAGE(!PageLocked(page), page);
 		VM_BUG_ON_PAGE(PageLRU(page), page);
 
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 7e08cbf3ba49..3586e8226e4e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1795,9 +1795,6 @@ static int __soft_offline_page(struct page *page, int flags)
 		 * so use !__PageMovable instead for LRU page's mapping
 		 * cannot have PAGE_MAPPING_MOVABLE.
 		 */
-		if (!__PageMovable(page))
-			inc_node_page_state(page, NR_ISOLATED_ANON +
-						page_is_file_cache(page));
 		list_add(&page->lru, &pagelist);
 		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
 					MIGRATE_SYNC, MR_MEMORY_FAILURE);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index dfab21dc33dc..68577c677b46 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1384,10 +1384,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 			ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
 		if (!ret) { /* Success */
 			list_add_tail(&page->lru, &source);
-			if (!__PageMovable(page))
-				inc_node_page_state(page, NR_ISOLATED_ANON +
-						    page_is_file_cache(page));
-
 		} else {
 			pr_warn("failed to isolate pfn %lx\n", pfn);
 			dump_page(page, "isolation failed");
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 64562809bf3b..03081f3404ca 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -994,12 +994,8 @@ static int migrate_page_add(struct page *page, struct list_head *pagelist,
 	 * Avoid migrating a page that is shared with others.
 	 */
 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
-		if (!isolate_lru_page(head)) {
+		if (!isolate_lru_page(head))
 			list_add_tail(&head->lru, pagelist);
-			mod_node_page_state(page_pgdat(head),
-				NR_ISOLATED_ANON + page_is_file_cache(head),
-				hpage_nr_pages(head));
-		}
 	}
 
 	return 0;
diff --git a/mm/migrate.c b/mm/migrate.c
index 572b4bc85d76..5583324c01e7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -190,8 +190,6 @@ void putback_movable_pages(struct list_head *l)
 			unlock_page(page);
 			put_page(page);
 		} else {
-			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
-					page_is_file_cache(page), -hpage_nr_pages(page));
 			putback_lru_page(page);
 		}
 	}
@@ -1181,10 +1179,17 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
 		return -ENOMEM;
 
 	if (page_count(page) == 1) {
+		bool is_lru = !__PageMovable(page);
+
 		/* page was freed from under us. So we are done. */
 		ClearPageActive(page);
 		ClearPageUnevictable(page);
-		if (unlikely(__PageMovable(page))) {
+		if (likely(is_lru))
+			mod_node_page_state(page_pgdat(page),
+						NR_ISOLATED_ANON +
+						page_is_file_cache(page),
+						-hpage_nr_pages(page));
+		else {
 			lock_page(page);
 			if (!PageMovable(page))
 				__ClearPageIsolated(page);
@@ -1210,15 +1215,6 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
 		 * restored.
 		 */
 		list_del(&page->lru);
-
-		/*
-		 * Compaction can migrate also non-LRU pages which are
-		 * not accounted to NR_ISOLATED_*. They can be recognized
-		 * as __PageMovable
-		 */
-		if (likely(!__PageMovable(page)))
-			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
-					page_is_file_cache(page), -hpage_nr_pages(page));
 	}
 
 	/*
@@ -1572,9 +1568,6 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
 
 		err = 0;
 		list_add_tail(&head->lru, pagelist);
-		mod_node_page_state(page_pgdat(head),
-			NR_ISOLATED_ANON + page_is_file_cache(head),
-			hpage_nr_pages(head));
 	}
 out_putpage:
 	/*
@@ -1890,8 +1883,6 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
 
 static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 {
-	int page_lru;
-
 	VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
 
 	/* Avoid migrating to a node that is nearly full */
@@ -1913,10 +1904,6 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 		return 0;
 	}
 
-	page_lru = page_is_file_cache(page);
-	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
-				hpage_nr_pages(page));
-
 	/*
 	 * Isolating the page has taken another reference, so the
 	 * caller's reference can be safely dropped without the page
@@ -1971,8 +1958,6 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
 	if (nr_remaining) {
 		if (!list_empty(&migratepages)) {
 			list_del(&page->lru);
-			dec_node_page_state(page, NR_ISOLATED_ANON +
-					page_is_file_cache(page));
 			putback_lru_page(page);
 		}
 		isolated = 0;
@@ -2002,7 +1987,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	pg_data_t *pgdat = NODE_DATA(node);
 	int isolated = 0;
 	struct page *new_page = NULL;
-	int page_lru = page_is_file_cache(page);
 	unsigned long start = address & HPAGE_PMD_MASK;
 
 	new_page = alloc_pages_node(node,
@@ -2048,8 +2032,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 		/* Retake the callers reference and putback on LRU */
 		get_page(page);
 		putback_lru_page(page);
-		mod_node_page_state(page_pgdat(page),
-			 NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
 
 		goto out_unlock;
 	}
@@ -2099,9 +2081,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
 	count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
 
-	mod_node_page_state(page_pgdat(page),
-			NR_ISOLATED_ANON + page_lru,
-			-HPAGE_PMD_NR);
 	return isolated;
 
 out_fail:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 49e9ee4d771d..223ce5da08f0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1014,6 +1014,9 @@ int remove_mapping(struct address_space *mapping, struct page *page)
 void putback_lru_page(struct page *page)
 {
 	lru_cache_add(page);
+	mod_node_page_state(page_pgdat(page),
+				NR_ISOLATED_ANON + page_is_file_cache(page),
+				-hpage_nr_pages(page));
 	put_page(page);		/* drop ref from isolate */
 }
 
@@ -1479,6 +1482,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 */
 		nr_reclaimed += nr_pages;
 
+		mod_node_page_state(pgdat, NR_ISOLATED_ANON +
+						page_is_file_cache(page),
+						-nr_pages);
 		/*
 		 * Is there need to periodically free_page_list? It would
 		 * appear not as the counts should be low
@@ -1554,7 +1560,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 	ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
 			TTU_IGNORE_ACCESS, &dummy_stat, true);
 	list_splice(&clean_pages, page_list);
-	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
 	return ret;
 }
 
@@ -1630,6 +1635,9 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
 		 */
 		ClearPageLRU(page);
 		ret = 0;
+		__mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
+						page_is_file_cache(page),
+						hpage_nr_pages(page));
 	}
 
 	return ret;
@@ -1761,6 +1769,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 	trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
 				    total_scan, skipped, nr_taken, mode, lru);
 	update_lru_sizes(lruvec, lru, nr_zone_taken);
+
 	return nr_taken;
 }
 
@@ -1809,6 +1818,9 @@ int isolate_lru_page(struct page *page)
 			ClearPageLRU(page);
 			del_page_from_lru_list(page, lruvec, lru);
 			ret = 0;
+			mod_node_page_state(pgdat, NR_ISOLATED_ANON +
+						page_is_file_cache(page),
+						hpage_nr_pages(page));
 		}
 		spin_unlock_irq(&pgdat->lru_lock);
 	}
@@ -1900,6 +1912,9 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
 		update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
 		list_move(&page->lru, &lruvec->lists[lru]);
 
+		__mod_node_page_state(pgdat, NR_ISOLATED_ANON +
+						page_is_file_cache(page),
+						-hpage_nr_pages(page));
 		if (put_page_testzero(page)) {
 			__ClearPageLRU(page);
 			__ClearPageActive(page);
@@ -1977,7 +1992,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
 				     &nr_scanned, sc, lru);
 
-	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
 	reclaim_stat->recent_scanned[file] += nr_taken;
 
 	item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
@@ -2003,8 +2017,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 
 	move_pages_to_lru(lruvec, &page_list);
 
-	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
-
 	spin_unlock_irq(&pgdat->lru_lock);
 
 	mem_cgroup_uncharge_list(&page_list);
@@ -2063,7 +2075,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
 				     &nr_scanned, sc, lru);
 
-	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
 	reclaim_stat->recent_scanned[file] += nr_taken;
 
 	__count_vm_events(PGREFILL, nr_scanned);
@@ -2132,7 +2143,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	__count_vm_events(PGDEACTIVATE, nr_deactivate);
 	__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
 
-	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
 	spin_unlock_irq(&pgdat->lru_lock);
 
 	mem_cgroup_uncharge_list(&l_active);
-- 
2.22.0.410.gd8fdbe21b5-goog

^ permalink raw reply related

* [PATCH v3 4/5] mm: introduce MADV_PAGEOUT
From: Minchan Kim @ 2019-06-27 11:54 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, LKML, linux-api, Michal Hocko, Johannes Weiner,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
	Kirill A . Shutemov, Minchan Kim
In-Reply-To: <20190627115405.255259-1-minchan@kernel.org>

When a process expects no accesses to a certain memory range
for a long time, it could hint kernel that the pages can be
reclaimed instantly but data should be preserved for future use.
This could reduce workingset eviction so it ends up increasing
performance.

This patch introduces the new MADV_PAGEOUT hint to madvise(2)
syscall. MADV_PAGEOUT can be used by a process to mark a memory
range as not expected to be used for a long time so that kernel
reclaims *any LRU* pages instantly. The hint can help kernel in
deciding which pages to evict proactively.

- man-page material

MADV_PAGEOUT (since Linux x.x)

Do not expect access in the near future so pages in the specified
regions could be reclaimed instantly regardless of memory pressure.
Thus, access in the range after successful operation could cause
major page fault but never lose the up-to-date contents unlike
MADV_DONTNEED. It works for only private anonymous mappings and
non-anonymous mappings that belong to files that the calling process
could successfully open for writing; otherwise, it could be used for
sidechannel attack.

MADV_PAGEOUT cannot be applied to locked pages, Huge TLB pages, or
VM_PFNMAP pages.

* v2
 * add comment about SWAP_CLUSTER_MAX - mhocko
 * add permission check to prevent sidechannel attack - mhocko
 * add man page stuff - dave

* v1
 * change pte to old and rely on the other's reference - hannes
 * remove page_mapcount to check shared page - mhocko

* RFC v2
 * make reclaim_pages simple via factoring out isolate logic - hannes

* RFCv1
 * rename from MADV_COLD to MADV_PAGEOUT - hannes
 * bail out if process is being killed - Hillf
 * fix reclaim_pages bugs - Hillf

Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 include/linux/swap.h                   |   1 +
 include/uapi/asm-generic/mman-common.h |   1 +
 mm/madvise.c                           | 212 +++++++++++++++++++++++++
 mm/vmscan.c                            |  58 +++++++
 4 files changed, 272 insertions(+)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0ce997edb8bb..063c0c1e112b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -365,6 +365,7 @@ extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
 extern unsigned long vm_total_pages;
 
+extern unsigned long reclaim_pages(struct list_head *page_list);
 #ifdef CONFIG_NUMA
 extern int node_reclaim_mode;
 extern int sysctl_min_unmapped_ratio;
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index d7b4231eea63..f545e159b472 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -48,6 +48,7 @@
 #define MADV_WILLNEED	3		/* will need these pages */
 #define MADV_DONTNEED	4		/* don't need these pages */
 #define MADV_COLD	5		/* deactivatie these pages */
+#define MADV_PAGEOUT	6		/* reclaim these pages */
 
 /* common parameters: try to keep these consistent across architectures */
 #define MADV_FREE	8		/* free pages only if memory pressure */
diff --git a/mm/madvise.c b/mm/madvise.c
index 7abb8e54bc7a..ee210473f639 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,6 +11,7 @@
 #include <linux/syscalls.h>
 #include <linux/mempolicy.h>
 #include <linux/page-isolation.h>
+#include <linux/page_idle.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/hugetlb.h>
 #include <linux/falloc.h>
@@ -41,6 +42,7 @@ static int madvise_need_mmap_write(int behavior)
 	case MADV_WILLNEED:
 	case MADV_DONTNEED:
 	case MADV_COLD:
+	case MADV_PAGEOUT:
 	case MADV_FREE:
 		return 0;
 	default:
@@ -480,6 +482,213 @@ static long madvise_cold(struct vm_area_struct *vma,
 	return 0;
 }
 
+static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
+				unsigned long end, struct mm_walk *walk)
+{
+	struct mmu_gather *tlb = walk->private;
+	struct mm_struct *mm = tlb->mm;
+	struct vm_area_struct *vma = walk->vma;
+	pte_t *orig_pte, *pte, ptent;
+	spinlock_t *ptl;
+	LIST_HEAD(page_list);
+	struct page *page;
+	int isolated = 0;
+	unsigned long next;
+
+	if (fatal_signal_pending(current))
+		return -EINTR;
+
+	next = pmd_addr_end(addr, end);
+	if (pmd_trans_huge(*pmd)) {
+		pmd_t orig_pmd;
+
+		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
+		ptl = pmd_trans_huge_lock(pmd, vma);
+		if (!ptl)
+			return 0;
+
+		orig_pmd = *pmd;
+		if (is_huge_zero_pmd(orig_pmd))
+			goto huge_unlock;
+
+		if (unlikely(!pmd_present(orig_pmd))) {
+			VM_BUG_ON(thp_migration_supported() &&
+					!is_pmd_migration_entry(orig_pmd));
+			goto huge_unlock;
+		}
+
+		page = pmd_page(orig_pmd);
+		if (next - addr != HPAGE_PMD_SIZE) {
+			int err;
+
+			if (page_mapcount(page) != 1)
+				goto huge_unlock;
+			get_page(page);
+			spin_unlock(ptl);
+			lock_page(page);
+			err = split_huge_page(page);
+			unlock_page(page);
+			put_page(page);
+			if (!err)
+				goto regular_page;
+			return 0;
+		}
+
+		if (isolate_lru_page(page))
+			goto huge_unlock;
+
+		if (pmd_young(orig_pmd)) {
+			pmdp_invalidate(vma, addr, pmd);
+			orig_pmd = pmd_mkold(orig_pmd);
+
+			set_pmd_at(mm, addr, pmd, orig_pmd);
+			tlb_remove_tlb_entry(tlb, pmd, addr);
+		}
+
+		ClearPageReferenced(page);
+		test_and_clear_page_young(page);
+		list_add(&page->lru, &page_list);
+huge_unlock:
+		spin_unlock(ptl);
+		reclaim_pages(&page_list);
+		return 0;
+	}
+
+	if (pmd_trans_unstable(pmd))
+		return 0;
+regular_page:
+	tlb_change_page_size(tlb, PAGE_SIZE);
+	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	flush_tlb_batched_pending(mm);
+	arch_enter_lazy_mmu_mode();
+	for (; addr < end; pte++, addr += PAGE_SIZE) {
+		ptent = *pte;
+		if (!pte_present(ptent))
+			continue;
+
+		page = vm_normal_page(vma, addr, ptent);
+		if (!page)
+			continue;
+
+		/*
+		 * creating a THP page is expensive so split it only if we
+		 * are sure it's worth. Split it if we are only owner.
+		 */
+		if (PageTransCompound(page)) {
+			if (page_mapcount(page) != 1)
+				break;
+			get_page(page);
+			if (!trylock_page(page)) {
+				put_page(page);
+				break;
+			}
+			pte_unmap_unlock(orig_pte, ptl);
+			if (split_huge_page(page)) {
+				unlock_page(page);
+				put_page(page);
+				pte_offset_map_lock(mm, pmd, addr, &ptl);
+				break;
+			}
+			unlock_page(page);
+			put_page(page);
+			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+			pte--;
+			addr -= PAGE_SIZE;
+			continue;
+		}
+
+		VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+		if (isolate_lru_page(page))
+			continue;
+
+		isolated++;
+		if (pte_young(ptent)) {
+			ptent = ptep_get_and_clear_full(mm, addr, pte,
+							tlb->fullmm);
+			ptent = pte_mkold(ptent);
+			set_pte_at(mm, addr, pte, ptent);
+			tlb_remove_tlb_entry(tlb, pte, addr);
+		}
+		ClearPageReferenced(page);
+		test_and_clear_page_young(page);
+		list_add(&page->lru, &page_list);
+		/*
+		 * Prevent early OOM kill since it could isolate too many LRU
+		 * pages concurrently.
+		 */
+		if (isolated >= SWAP_CLUSTER_MAX) {
+			arch_leave_lazy_mmu_mode();
+			pte_unmap_unlock(orig_pte, ptl);
+			reclaim_pages(&page_list);
+			isolated = 0;
+			pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+			arch_enter_lazy_mmu_mode();
+			orig_pte = pte;
+		}
+	}
+
+	arch_leave_lazy_mmu_mode();
+	pte_unmap_unlock(orig_pte, ptl);
+	reclaim_pages(&page_list);
+	cond_resched();
+
+	return 0;
+}
+
+static void madvise_pageout_page_range(struct mmu_gather *tlb,
+			     struct vm_area_struct *vma,
+			     unsigned long addr, unsigned long end)
+{
+	struct mm_walk pageout_walk = {
+		.pmd_entry = madvise_pageout_pte_range,
+		.mm = vma->vm_mm,
+		.private = tlb,
+	};
+
+	tlb_start_vma(tlb, vma);
+	walk_page_range(addr, end, &pageout_walk);
+	tlb_end_vma(tlb, vma);
+}
+
+static inline bool can_do_pageout(struct vm_area_struct *vma)
+{
+	if (vma_is_anonymous(vma))
+		return true;
+	if (!vma->vm_file)
+		return false;
+	/*
+	 * paging out pagecache only for non-anonymous mappings that correspond
+	 * to the files the calling process could (if tried) open for writing;
+	 * otherwise we'd be including shared non-exclusive mappings, which
+	 * opens a side channel.
+	 */
+	return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+		inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+}
+
+static long madvise_pageout(struct vm_area_struct *vma,
+			struct vm_area_struct **prev,
+			unsigned long start_addr, unsigned long end_addr)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct mmu_gather tlb;
+
+	*prev = vma;
+	if (!can_madv_lru_vma(vma))
+		return -EINVAL;
+
+	if (!can_do_pageout(vma))
+		return 0;
+
+	lru_add_drain();
+	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
+	tlb_finish_mmu(&tlb, start_addr, end_addr);
+
+	return 0;
+}
+
 static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 				unsigned long end, struct mm_walk *walk)
 
@@ -870,6 +1079,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		return madvise_willneed(vma, prev, start, end);
 	case MADV_COLD:
 		return madvise_cold(vma, prev, start, end);
+	case MADV_PAGEOUT:
+		return madvise_pageout(vma, prev, start, end);
 	case MADV_FREE:
 	case MADV_DONTNEED:
 		return madvise_dontneed_free(vma, prev, start, end, behavior);
@@ -892,6 +1103,7 @@ madvise_behavior_valid(int behavior)
 	case MADV_DONTNEED:
 	case MADV_FREE:
 	case MADV_COLD:
+	case MADV_PAGEOUT:
 #ifdef CONFIG_KSM
 	case MADV_MERGEABLE:
 	case MADV_UNMERGEABLE:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 223ce5da08f0..b521770c4314 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2151,6 +2151,64 @@ static void shrink_active_list(unsigned long nr_to_scan,
 			nr_deactivate, nr_rotated, sc->priority, file);
 }
 
+unsigned long reclaim_pages(struct list_head *page_list)
+{
+	int nid = -1;
+	unsigned long nr_reclaimed = 0;
+	LIST_HEAD(node_page_list);
+	struct reclaim_stat dummy_stat;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.priority = DEF_PRIORITY,
+		.may_writepage = 1,
+		.may_unmap = 1,
+		.may_swap = 1,
+	};
+
+	while (!list_empty(page_list)) {
+		struct page *page;
+
+		page = lru_to_page(page_list);
+		if (nid == -1) {
+			nid = page_to_nid(page);
+			INIT_LIST_HEAD(&node_page_list);
+		}
+
+		if (nid == page_to_nid(page)) {
+			list_move(&page->lru, &node_page_list);
+			continue;
+		}
+
+		nr_reclaimed += shrink_page_list(&node_page_list,
+						NODE_DATA(nid),
+						&sc, 0,
+						&dummy_stat, false);
+		while (!list_empty(&node_page_list)) {
+			struct page *page = lru_to_page(&node_page_list);
+
+			list_del(&page->lru);
+			putback_lru_page(page);
+		}
+
+		nid = -1;
+	}
+
+	if (!list_empty(&node_page_list)) {
+		nr_reclaimed += shrink_page_list(&node_page_list,
+						NODE_DATA(nid),
+						&sc, 0,
+						&dummy_stat, false);
+		while (!list_empty(&node_page_list)) {
+			struct page *page = lru_to_page(&node_page_list);
+
+			list_del(&page->lru);
+			putback_lru_page(page);
+		}
+	}
+
+	return nr_reclaimed;
+}
+
 /*
  * The inactive anon list should be small enough that the VM never has
  * to do too much work.
-- 
2.22.0.410.gd8fdbe21b5-goog

^ permalink raw reply related

* [PATCH v3 5/5] mm: factor out pmd young/dirty bit handling and THP split
From: Minchan Kim @ 2019-06-27 11:54 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, LKML, linux-api, Michal Hocko, Johannes Weiner,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
	Kirill A . Shutemov, Minchan Kim
In-Reply-To: <20190627115405.255259-1-minchan@kernel.org>

Now, there are common part among MADV_COLD|PAGEOUT|FREE to reset
access/dirty bit resetting or split the THP page to handle part
of subpages in the THP page. This patch factor out the common part.

Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 include/linux/huge_mm.h |   3 -
 mm/huge_memory.c        |  74 -------------
 mm/madvise.c            | 234 +++++++++++++++++++++++-----------------
 3 files changed, 135 insertions(+), 176 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7cd5c150c21d..2667e1aa3ce5 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -29,9 +29,6 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 					  unsigned long addr,
 					  pmd_t *pmd,
 					  unsigned int flags);
-extern bool madvise_free_huge_pmd(struct mmu_gather *tlb,
-			struct vm_area_struct *vma,
-			pmd_t *pmd, unsigned long addr, unsigned long next);
 extern int zap_huge_pmd(struct mmu_gather *tlb,
 			struct vm_area_struct *vma,
 			pmd_t *pmd, unsigned long addr);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 93f531b63a45..e4b9a06788f3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1671,80 +1671,6 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
 	return 0;
 }
 
-/*
- * Return true if we do MADV_FREE successfully on entire pmd page.
- * Otherwise, return false.
- */
-bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
-		pmd_t *pmd, unsigned long addr, unsigned long next)
-{
-	spinlock_t *ptl;
-	pmd_t orig_pmd;
-	struct page *page;
-	struct mm_struct *mm = tlb->mm;
-	bool ret = false;
-
-	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
-
-	ptl = pmd_trans_huge_lock(pmd, vma);
-	if (!ptl)
-		goto out_unlocked;
-
-	orig_pmd = *pmd;
-	if (is_huge_zero_pmd(orig_pmd))
-		goto out;
-
-	if (unlikely(!pmd_present(orig_pmd))) {
-		VM_BUG_ON(thp_migration_supported() &&
-				  !is_pmd_migration_entry(orig_pmd));
-		goto out;
-	}
-
-	page = pmd_page(orig_pmd);
-	/*
-	 * If other processes are mapping this page, we couldn't discard
-	 * the page unless they all do MADV_FREE so let's skip the page.
-	 */
-	if (page_mapcount(page) != 1)
-		goto out;
-
-	if (!trylock_page(page))
-		goto out;
-
-	/*
-	 * If user want to discard part-pages of THP, split it so MADV_FREE
-	 * will deactivate only them.
-	 */
-	if (next - addr != HPAGE_PMD_SIZE) {
-		get_page(page);
-		spin_unlock(ptl);
-		split_huge_page(page);
-		unlock_page(page);
-		put_page(page);
-		goto out_unlocked;
-	}
-
-	if (PageDirty(page))
-		ClearPageDirty(page);
-	unlock_page(page);
-
-	if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
-		pmdp_invalidate(vma, addr, pmd);
-		orig_pmd = pmd_mkold(orig_pmd);
-		orig_pmd = pmd_mkclean(orig_pmd);
-
-		set_pmd_at(mm, addr, pmd, orig_pmd);
-		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
-	}
-
-	mark_page_lazyfree(page);
-	ret = true;
-out:
-	spin_unlock(ptl);
-out_unlocked:
-	return ret;
-}
-
 static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
 {
 	pgtable_t pgtable;
diff --git a/mm/madvise.c b/mm/madvise.c
index ee210473f639..13b06dc8d402 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -310,6 +310,91 @@ static long madvise_willneed(struct vm_area_struct *vma,
 	return 0;
 }
 
+enum madv_pmdp_reset_t {
+	MADV_PMDP_RESET,	/* pmd was reset successfully */
+	MADV_PMDP_SPLIT,	/* pmd was split */
+	MADV_PMDP_ERROR,
+};
+
+static enum madv_pmdp_reset_t madvise_pmdp_reset_or_split(struct mm_walk *walk,
+				pmd_t *pmd, spinlock_t *ptl,
+				unsigned long addr, unsigned long end,
+				bool young, bool dirty)
+{
+	pmd_t orig_pmd;
+	unsigned long next;
+	struct page *page;
+	struct mmu_gather *tlb = walk->private;
+	struct mm_struct *mm = walk->mm;
+	struct vm_area_struct *vma = walk->vma;
+	bool reset_young = false;
+	bool reset_dirty = false;
+	enum madv_pmdp_reset_t ret = MADV_PMDP_ERROR;
+
+	orig_pmd = *pmd;
+	if (is_huge_zero_pmd(orig_pmd))
+		return ret;
+
+	if (unlikely(!pmd_present(orig_pmd))) {
+		VM_BUG_ON(thp_migration_supported() &&
+				!is_pmd_migration_entry(orig_pmd));
+		return ret;
+	}
+
+	next = pmd_addr_end(addr, end);
+	page = pmd_page(orig_pmd);
+	if (next - addr != HPAGE_PMD_SIZE) {
+		/*
+		 * THP collapsing is not cheap so only split the page is
+		 * private to the this process.
+		 */
+		if (page_mapcount(page) != 1)
+			return ret;
+		get_page(page);
+		spin_unlock(ptl);
+		lock_page(page);
+		if (!split_huge_page(page))
+			ret = MADV_PMDP_SPLIT;
+		unlock_page(page);
+		put_page(page);
+		return ret;
+	}
+
+	if (young && pmd_young(orig_pmd))
+		reset_young = true;
+	if (dirty && pmd_dirty(orig_pmd))
+		reset_dirty = true;
+
+	/*
+	 * Other process could rely on the PG_dirty for data consistency,
+	 * not pte_dirty so we could reset PG_dirty only when we are owner
+	 * of the page.
+	 */
+	if (reset_dirty) {
+		if (page_mapcount(page) != 1)
+			goto out;
+		if (!trylock_page(page))
+			goto out;
+		if (PageDirty(page))
+			ClearPageDirty(page);
+		unlock_page(page);
+	}
+
+	ret = MADV_PMDP_RESET;
+	if (reset_young || reset_dirty) {
+		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
+		pmdp_invalidate(vma, addr, pmd);
+		if (reset_young)
+			orig_pmd = pmd_mkold(orig_pmd);
+		if (reset_dirty)
+			orig_pmd = pmd_mkclean(orig_pmd);
+		set_pmd_at(mm, addr, pmd, orig_pmd);
+		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+	}
+out:
+	return ret;
+}
+
 static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
 				unsigned long end, struct mm_walk *walk)
 {
@@ -319,64 +404,31 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
 	pte_t *orig_pte, *pte, ptent;
 	spinlock_t *ptl;
 	struct page *page;
-	unsigned long next;
 
-	next = pmd_addr_end(addr, end);
 	if (pmd_trans_huge(*pmd)) {
-		pmd_t orig_pmd;
-
-		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
 		ptl = pmd_trans_huge_lock(pmd, vma);
 		if (!ptl)
 			return 0;
 
-		orig_pmd = *pmd;
-		if (is_huge_zero_pmd(orig_pmd))
-			goto huge_unlock;
-
-		if (unlikely(!pmd_present(orig_pmd))) {
-			VM_BUG_ON(thp_migration_supported() &&
-					!is_pmd_migration_entry(orig_pmd));
-			goto huge_unlock;
-		}
-
-		page = pmd_page(orig_pmd);
-		if (next - addr != HPAGE_PMD_SIZE) {
-			int err;
-
-			if (page_mapcount(page) != 1)
-				goto huge_unlock;
-
-			get_page(page);
+		switch (madvise_pmdp_reset_or_split(walk, pmd, ptl, addr, end,
+							true, false)) {
+		case MADV_PMDP_RESET:
 			spin_unlock(ptl);
-			lock_page(page);
-			err = split_huge_page(page);
-			unlock_page(page);
-			put_page(page);
-			if (!err)
-				goto regular_page;
-			return 0;
-		}
-
-		if (pmd_young(orig_pmd)) {
-			pmdp_invalidate(vma, addr, pmd);
-			orig_pmd = pmd_mkold(orig_pmd);
-
-			set_pmd_at(mm, addr, pmd, orig_pmd);
-			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+			page = pmd_page(*pmd);
+			test_and_clear_page_young(page);
+			deactivate_page(page);
+			goto next;
+		case MADV_PMDP_ERROR:
+			spin_unlock(ptl);
+			goto next;
+		case MADV_PMDP_SPLIT:
+			; /* go through */
 		}
-
-		test_and_clear_page_young(page);
-		deactivate_page(page);
-huge_unlock:
-		spin_unlock(ptl);
-		return 0;
 	}
 
 	if (pmd_trans_unstable(pmd))
 		return 0;
 
-regular_page:
 	tlb_change_page_size(tlb, PAGE_SIZE);
 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	flush_tlb_batched_pending(mm);
@@ -443,6 +495,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
 
 	arch_enter_lazy_mmu_mode();
 	pte_unmap_unlock(orig_pte, ptl);
+next:
 	cond_resched();
 
 	return 0;
@@ -493,70 +546,38 @@ static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
 	LIST_HEAD(page_list);
 	struct page *page;
 	int isolated = 0;
-	unsigned long next;
 
 	if (fatal_signal_pending(current))
 		return -EINTR;
 
-	next = pmd_addr_end(addr, end);
 	if (pmd_trans_huge(*pmd)) {
-		pmd_t orig_pmd;
-
-		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
 		ptl = pmd_trans_huge_lock(pmd, vma);
 		if (!ptl)
 			return 0;
 
-		orig_pmd = *pmd;
-		if (is_huge_zero_pmd(orig_pmd))
-			goto huge_unlock;
-
-		if (unlikely(!pmd_present(orig_pmd))) {
-			VM_BUG_ON(thp_migration_supported() &&
-					!is_pmd_migration_entry(orig_pmd));
-			goto huge_unlock;
-		}
-
-		page = pmd_page(orig_pmd);
-		if (next - addr != HPAGE_PMD_SIZE) {
-			int err;
-
-			if (page_mapcount(page) != 1)
-				goto huge_unlock;
-			get_page(page);
+		switch (madvise_pmdp_reset_or_split(walk, pmd, ptl, addr, end,
+							true, false)) {
+		case MADV_PMDP_RESET:
+			page = pmd_page(*pmd);
 			spin_unlock(ptl);
-			lock_page(page);
-			err = split_huge_page(page);
-			unlock_page(page);
-			put_page(page);
-			if (!err)
-				goto regular_page;
-			return 0;
-		}
-
-		if (isolate_lru_page(page))
-			goto huge_unlock;
-
-		if (pmd_young(orig_pmd)) {
-			pmdp_invalidate(vma, addr, pmd);
-			orig_pmd = pmd_mkold(orig_pmd);
-
-			set_pmd_at(mm, addr, pmd, orig_pmd);
-			tlb_remove_tlb_entry(tlb, pmd, addr);
+			if (isolate_lru_page(page))
+				return 0;
+			ClearPageReferenced(page);
+			test_and_clear_page_young(page);
+			list_add(&page->lru, &page_list);
+			reclaim_pages(&page_list);
+			goto next;
+		case MADV_PMDP_ERROR:
+			spin_unlock(ptl);
+			goto next;
+		case MADV_PMDP_SPLIT:
+			; /* go through */
 		}
-
-		ClearPageReferenced(page);
-		test_and_clear_page_young(page);
-		list_add(&page->lru, &page_list);
-huge_unlock:
-		spin_unlock(ptl);
-		reclaim_pages(&page_list);
-		return 0;
 	}
 
 	if (pmd_trans_unstable(pmd))
 		return 0;
-regular_page:
+
 	tlb_change_page_size(tlb, PAGE_SIZE);
 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	flush_tlb_batched_pending(mm);
@@ -631,6 +652,7 @@ static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(orig_pte, ptl);
 	reclaim_pages(&page_list);
+next:
 	cond_resched();
 
 	return 0;
@@ -700,12 +722,26 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 	pte_t *orig_pte, *pte, ptent;
 	struct page *page;
 	int nr_swap = 0;
-	unsigned long next;
 
-	next = pmd_addr_end(addr, end);
-	if (pmd_trans_huge(*pmd))
-		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
+	if (pmd_trans_huge(*pmd)) {
+		ptl = pmd_trans_huge_lock(pmd, vma);
+		if (!ptl)
+			return 0;
+
+		switch (madvise_pmdp_reset_or_split(walk, pmd, ptl, addr, end,
+							true, true)) {
+		case MADV_PMDP_RESET:
+			page = pmd_page(*pmd);
+			spin_unlock(ptl);
+			mark_page_lazyfree(page);
 			goto next;
+		case MADV_PMDP_ERROR:
+			spin_unlock(ptl);
+			goto next;
+		case MADV_PMDP_SPLIT:
+			; /* go through */
+		}
+	}
 
 	if (pmd_trans_unstable(pmd))
 		return 0;
@@ -817,8 +853,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 	}
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(orig_pte, ptl);
-	cond_resched();
 next:
+	cond_resched();
 	return 0;
 }
 
-- 
2.22.0.410.gd8fdbe21b5-goog

^ permalink raw reply related

* Re: [PATCH v3 1/5] mm: introduce MADV_COLD
From: Dave Hansen @ 2019-06-27 13:13 UTC (permalink / raw)
  To: Minchan Kim, Andrew Morton
  Cc: linux-mm, LKML, linux-api, Michal Hocko, Johannes Weiner,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb,
	Kirill A . Shutemov
In-Reply-To: <20190627115405.255259-2-minchan@kernel.org>

On 6/27/19 4:54 AM, Minchan Kim wrote:
> This patch introduces the new MADV_COLD hint to madvise(2) syscall.
> MADV_COLD can be used by a process to mark a memory range as not expected
> to be used in the near future. The hint can help kernel in deciding which
> pages to evict early during memory pressure.
> 
> It works for every LRU pages like MADV_[DONTNEED|FREE]. IOW, It moves
> 
> 	active file page -> inactive file LRU
> 	active anon page -> inacdtive anon LRU

Is the LRU behavior part of the interface or the implementation?

I ask because we've got something in between tossing something down the
LRU and swapping it: page migration.  Specifically, on a system with
slower memory media (like persistent memory) we just migrate a page
instead of discarding it at reclaim:

> https://lore.kernel.org/linux-mm/20190321200157.29678-4-keith.busch@intel.com/

So let's say I have a page I want to evict from DRAM to the next slower
tier of memory.  Do I use MADV_COLD or MADV_PAGEOUT?  If the LRU
behavior is part of the interface itself, then MADV_COLD doesn't work.

Do you think we'll need a third MADV_ flag for our automatic migration
behavior?  MADV_REALLYCOLD?  MADV_MIGRATEOUT?

^ permalink raw reply

* Re: [PATCH v3 1/2] pid: add pidfd_open()
From: Konstantin Khlebnikov @ 2019-06-27 13:57 UTC (permalink / raw)
  To: Christian Brauner, jannh, oleg, viro, torvalds, linux-kernel,
	arnd
  Cc: linux-ia64, linux-sh, linux-mips, dhowells, joel, linux-kselftest,
	sparclinux, elena.reshetova, linux-arch, linux-s390, dancol,
	kernel-team, serge, linux-xtensa, keescook, linux-m68k, luto,
	tglx, surenb, linux-arm-kernel, linux-parisc, linux-api, cyphar,
	luto, ebiederm, linux-alpha, akpm, linuxppc-dev
In-Reply-To: <20190520155630.21684-1-christian@brauner.io>

On 20.05.2019 18:56, Christian Brauner wrote:
> This adds the pidfd_open() syscall. It allows a caller to retrieve pollable
> pidfds for a process which did not get created via CLONE_PIDFD, i.e. for a
> process that is created via traditional fork()/clone() calls that is only
> referenced by a PID:
> 
> int pidfd = pidfd_open(1234, 0);
> ret = pidfd_send_signal(pidfd, SIGSTOP, NULL, 0);
> 
> With the introduction of pidfds through CLONE_PIDFD it is possible to
> created pidfds at process creation time.
> However, a lot of processes get created with traditional PID-based calls
> such as fork() or clone() (without CLONE_PIDFD). For these processes a
> caller can currently not create a pollable pidfd. This is a problem for
> Android's low memory killer (LMK) and service managers such as systemd.
> Both are examples of tools that want to make use of pidfds to get reliable
> notification of process exit for non-parents (pidfd polling) and race-free
> signal sending (pidfd_send_signal()). They intend to switch to this API for
> process supervision/management as soon as possible. Having no way to get
> pollable pidfds from PID-only processes is one of the biggest blockers for
> them in adopting this api. With pidfd_open() making it possible to retrieve
> pidfds for PID-based processes we enable them to adopt this api.
> 
> In line with Arnd's recent changes to consolidate syscall numbers across
> architectures, I have added the pidfd_open() syscall to all architectures
> at the same time.

As I see pidfd_open() works only within current pid-namespace.

Have you considered separate argument for pidns-fd or flag for opening pid in
pid-ns referred by nsproxy->pid_ns_for_children set by setns.

This could be used for use cases I've tried to cover by syscall "translate_pid"
https://lkml.org/lkml/2018/6/1/788

> 
> Signed-off-by: Christian Brauner <christian@brauner.io>
> Reviewed-by: Oleg Nesterov <oleg@redhat.com>
> Acked-by: Arnd Bergmann <arnd@arndb.de>
> Cc: "Eric W. Biederman" <ebiederm@xmission.com>
> Cc: Kees Cook <keescook@chromium.org>
> Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Jann Horn <jannh@google.com>
> Cc: David Howells <dhowells@redhat.com>
> Cc: Andy Lutomirsky <luto@kernel.org>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Aleksa Sarai <cyphar@cyphar.com>
> Cc: Linus Torvalds <torvalds@linux-foundation.org>
> Cc: Al Viro <viro@zeniv.linux.org.uk>
> Cc: linux-api@vger.kernel.org
> ---
> v1:
> - kbuild test robot <lkp@intel.com>:
>    - add missing entry for pidfd_open to arch/arm/tools/syscall.tbl
> - Oleg Nesterov <oleg@redhat.com>:
>    - use simpler thread-group leader check
> v2:
> - Oleg Nesterov <oleg@redhat.com>:
>    - avoid using additional variable
>    - remove unneeded comment
> - Arnd Bergmann <arnd@arndb.de>:
>    - switch from 428 to 434 since the new mount api has taken it
>    - bump syscall numbers in arch/arm64/include/asm/unistd.h
> - Joel Fernandes (Google) <joel@joelfernandes.org>:
>    - switch from ESRCH to EINVAL when the passed-in pid does not refer to a
>      thread-group leader
> - Christian Brauner <christian@brauner.io>:
>    - rebase on v5.2-rc1
>    - adapt syscall number to account for new mount api syscalls
> v3:
> - Arnd Bergmann <arnd@arndb.de>:
>    - add missing syscall entries for mips-o32 and mips-n64
> ---
>   arch/alpha/kernel/syscalls/syscall.tbl      |  1 +
>   arch/arm/tools/syscall.tbl                  |  1 +
>   arch/arm64/include/asm/unistd.h             |  2 +-
>   arch/arm64/include/asm/unistd32.h           |  2 +
>   arch/ia64/kernel/syscalls/syscall.tbl       |  1 +
>   arch/m68k/kernel/syscalls/syscall.tbl       |  1 +
>   arch/microblaze/kernel/syscalls/syscall.tbl |  1 +
>   arch/mips/kernel/syscalls/syscall_n32.tbl   |  1 +
>   arch/mips/kernel/syscalls/syscall_n64.tbl   |  1 +
>   arch/mips/kernel/syscalls/syscall_o32.tbl   |  1 +
>   arch/parisc/kernel/syscalls/syscall.tbl     |  1 +
>   arch/powerpc/kernel/syscalls/syscall.tbl    |  1 +
>   arch/s390/kernel/syscalls/syscall.tbl       |  1 +
>   arch/sh/kernel/syscalls/syscall.tbl         |  1 +
>   arch/sparc/kernel/syscalls/syscall.tbl      |  1 +
>   arch/x86/entry/syscalls/syscall_32.tbl      |  1 +
>   arch/x86/entry/syscalls/syscall_64.tbl      |  1 +
>   arch/xtensa/kernel/syscalls/syscall.tbl     |  1 +
>   include/linux/pid.h                         |  1 +
>   include/linux/syscalls.h                    |  1 +
>   include/uapi/asm-generic/unistd.h           |  4 +-
>   kernel/fork.c                               |  2 +-
>   kernel/pid.c                                | 43 +++++++++++++++++++++
>   23 files changed, 68 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
> index 9e7704e44f6d..1db9bbcfb84e 100644
> --- a/arch/alpha/kernel/syscalls/syscall.tbl
> +++ b/arch/alpha/kernel/syscalls/syscall.tbl
> @@ -473,3 +473,4 @@
>   541	common	fsconfig			sys_fsconfig
>   542	common	fsmount				sys_fsmount
>   543	common	fspick				sys_fspick
> +544	common	pidfd_open			sys_pidfd_open
> diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
> index aaf479a9e92d..81e6e1817c45 100644
> --- a/arch/arm/tools/syscall.tbl
> +++ b/arch/arm/tools/syscall.tbl
> @@ -447,3 +447,4 @@
>   431	common	fsconfig			sys_fsconfig
>   432	common	fsmount				sys_fsmount
>   433	common	fspick				sys_fspick
> +434	common	pidfd_open			sys_pidfd_open
> diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
> index 70e6882853c0..e8f7d95a1481 100644
> --- a/arch/arm64/include/asm/unistd.h
> +++ b/arch/arm64/include/asm/unistd.h
> @@ -44,7 +44,7 @@
>   #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
>   #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
>   
> -#define __NR_compat_syscalls		434
> +#define __NR_compat_syscalls		435
>   #endif
>   
>   #define __ARCH_WANT_SYS_CLONE
> diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
> index c39e90600bb3..7a3158ccd68e 100644
> --- a/arch/arm64/include/asm/unistd32.h
> +++ b/arch/arm64/include/asm/unistd32.h
> @@ -886,6 +886,8 @@ __SYSCALL(__NR_fsconfig, sys_fsconfig)
>   __SYSCALL(__NR_fsmount, sys_fsmount)
>   #define __NR_fspick 433
>   __SYSCALL(__NR_fspick, sys_fspick)
> +#define __NR_pidfd_open 434
> +__SYSCALL(__NR_pidfd_open, sys_pidfd_open)
>   
>   /*
>    * Please add new compat syscalls above this comment and update
> diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
> index e01df3f2f80d..ecc44926737b 100644
> --- a/arch/ia64/kernel/syscalls/syscall.tbl
> +++ b/arch/ia64/kernel/syscalls/syscall.tbl
> @@ -354,3 +354,4 @@
>   431	common	fsconfig			sys_fsconfig
>   432	common	fsmount				sys_fsmount
>   433	common	fspick				sys_fspick
> +434	common	pidfd_open			sys_pidfd_open
> diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
> index 7e3d0734b2f3..9a3eb2558568 100644
> --- a/arch/m68k/kernel/syscalls/syscall.tbl
> +++ b/arch/m68k/kernel/syscalls/syscall.tbl
> @@ -433,3 +433,4 @@
>   431	common	fsconfig			sys_fsconfig
>   432	common	fsmount				sys_fsmount
>   433	common	fspick				sys_fspick
> +434	common	pidfd_open			sys_pidfd_open
> diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
> index 26339e417695..ad706f83c755 100644
> --- a/arch/microblaze/kernel/syscalls/syscall.tbl
> +++ b/arch/microblaze/kernel/syscalls/syscall.tbl
> @@ -439,3 +439,4 @@
>   431	common	fsconfig			sys_fsconfig
>   432	common	fsmount				sys_fsmount
>   433	common	fspick				sys_fspick
> +434	common	pidfd_open			sys_pidfd_open
> diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
> index 0e2dd68ade57..97035e19ad03 100644
> --- a/arch/mips/kernel/syscalls/syscall_n32.tbl
> +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
> @@ -372,3 +372,4 @@
>   431	n32	fsconfig			sys_fsconfig
>   432	n32	fsmount				sys_fsmount
>   433	n32	fspick				sys_fspick
> +434	n32	pidfd_open			sys_pidfd_open
> diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
> index 5eebfa0d155c..d7292722d3b0 100644
> --- a/arch/mips/kernel/syscalls/syscall_n64.tbl
> +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
> @@ -348,3 +348,4 @@
>   431	n64	fsconfig			sys_fsconfig
>   432	n64	fsmount				sys_fsmount
>   433	n64	fspick				sys_fspick
> +434	n64	pidfd_open			sys_pidfd_open
> diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
> index 3cc1374e02d0..dba084c92f14 100644
> --- a/arch/mips/kernel/syscalls/syscall_o32.tbl
> +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
> @@ -421,3 +421,4 @@
>   431	o32	fsconfig			sys_fsconfig
>   432	o32	fsmount				sys_fsmount
>   433	o32	fspick				sys_fspick
> +434	o32	pidfd_open			sys_pidfd_open
> diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
> index c9e377d59232..5022b9e179c2 100644
> --- a/arch/parisc/kernel/syscalls/syscall.tbl
> +++ b/arch/parisc/kernel/syscalls/syscall.tbl
> @@ -430,3 +430,4 @@
>   431	common	fsconfig			sys_fsconfig
>   432	common	fsmount				sys_fsmount
>   433	common	fspick				sys_fspick
> +434	common	pidfd_open			sys_pidfd_open
> diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
> index 103655d84b4b..f2c3bda2d39f 100644
> --- a/arch/powerpc/kernel/syscalls/syscall.tbl
> +++ b/arch/powerpc/kernel/syscalls/syscall.tbl
> @@ -515,3 +515,4 @@
>   431	common	fsconfig			sys_fsconfig
>   432	common	fsmount				sys_fsmount
>   433	common	fspick				sys_fspick
> +434	common	pidfd_open			sys_pidfd_open
> diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
> index e822b2964a83..6ebacfeaf853 100644
> --- a/arch/s390/kernel/syscalls/syscall.tbl
> +++ b/arch/s390/kernel/syscalls/syscall.tbl
> @@ -436,3 +436,4 @@
>   431  common	fsconfig		sys_fsconfig			sys_fsconfig
>   432  common	fsmount			sys_fsmount			sys_fsmount
>   433  common	fspick			sys_fspick			sys_fspick
> +434  common	pidfd_open		sys_pidfd_open			sys_pidfd_open
> diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
> index 016a727d4357..834c9c7d79fa 100644
> --- a/arch/sh/kernel/syscalls/syscall.tbl
> +++ b/arch/sh/kernel/syscalls/syscall.tbl
> @@ -436,3 +436,4 @@
>   431	common	fsconfig			sys_fsconfig
>   432	common	fsmount				sys_fsmount
>   433	common	fspick				sys_fspick
> +434	common	pidfd_open			sys_pidfd_open
> diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
> index e047480b1605..c58e71f21129 100644
> --- a/arch/sparc/kernel/syscalls/syscall.tbl
> +++ b/arch/sparc/kernel/syscalls/syscall.tbl
> @@ -479,3 +479,4 @@
>   431	common	fsconfig			sys_fsconfig
>   432	common	fsmount				sys_fsmount
>   433	common	fspick				sys_fspick
> +434	common	pidfd_open			sys_pidfd_open
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index ad968b7bac72..43e4429a5272 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -438,3 +438,4 @@
>   431	i386	fsconfig		sys_fsconfig			__ia32_sys_fsconfig
>   432	i386	fsmount			sys_fsmount			__ia32_sys_fsmount
>   433	i386	fspick			sys_fspick			__ia32_sys_fspick
> +434	i386	pidfd_open		sys_pidfd_open			__ia32_sys_pidfd_open
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index b4e6f9e6204a..1bee0a77fdd3 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -355,6 +355,7 @@
>   431	common	fsconfig		__x64_sys_fsconfig
>   432	common	fsmount			__x64_sys_fsmount
>   433	common	fspick			__x64_sys_fspick
> +434	common	pidfd_open		__x64_sys_pidfd_open
>   
>   #
>   # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
> index 5fa0ee1c8e00..782b81945ccc 100644
> --- a/arch/xtensa/kernel/syscalls/syscall.tbl
> +++ b/arch/xtensa/kernel/syscalls/syscall.tbl
> @@ -404,3 +404,4 @@
>   431	common	fsconfig			sys_fsconfig
>   432	common	fsmount				sys_fsmount
>   433	common	fspick				sys_fspick
> +434	common	pidfd_open			sys_pidfd_open
> diff --git a/include/linux/pid.h b/include/linux/pid.h
> index 3c8ef5a199ca..c938a92eab99 100644
> --- a/include/linux/pid.h
> +++ b/include/linux/pid.h
> @@ -67,6 +67,7 @@ struct pid
>   extern struct pid init_struct_pid;
>   
>   extern const struct file_operations pidfd_fops;
> +extern int pidfd_create(struct pid *pid);
>   
>   static inline struct pid *get_pid(struct pid *pid)
>   {
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index e2870fe1be5b..989055e0b501 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -929,6 +929,7 @@ asmlinkage long sys_clock_adjtime32(clockid_t which_clock,
>   				struct old_timex32 __user *tx);
>   asmlinkage long sys_syncfs(int fd);
>   asmlinkage long sys_setns(int fd, int nstype);
> +asmlinkage long sys_pidfd_open(pid_t pid, unsigned int flags);
>   asmlinkage long sys_sendmmsg(int fd, struct mmsghdr __user *msg,
>   			     unsigned int vlen, unsigned flags);
>   asmlinkage long sys_process_vm_readv(pid_t pid,
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index a87904daf103..e5684a4512c0 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -844,9 +844,11 @@ __SYSCALL(__NR_fsconfig, sys_fsconfig)
>   __SYSCALL(__NR_fsmount, sys_fsmount)
>   #define __NR_fspick 433
>   __SYSCALL(__NR_fspick, sys_fspick)
> +#define __NR_pidfd_open 434
> +__SYSCALL(__NR_pidfd_open, sys_pidfd_open)
>   
>   #undef __NR_syscalls
> -#define __NR_syscalls 434
> +#define __NR_syscalls 435
>   
>   /*
>    * 32 bit systems traditionally used different
> diff --git a/kernel/fork.c b/kernel/fork.c
> index b4cba953040a..c3df226f47a1 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1724,7 +1724,7 @@ const struct file_operations pidfd_fops = {
>    * Return: On success, a cloexec pidfd is returned.
>    *         On error, a negative errno number will be returned.
>    */
> -static int pidfd_create(struct pid *pid)
> +int pidfd_create(struct pid *pid)
>   {
>   	int fd;
>   
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 89548d35eefb..8fc9d94f6ac1 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -37,6 +37,7 @@
>   #include <linux/syscalls.h>
>   #include <linux/proc_ns.h>
>   #include <linux/proc_fs.h>
> +#include <linux/sched/signal.h>
>   #include <linux/sched/task.h>
>   #include <linux/idr.h>
>   
> @@ -450,6 +451,48 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
>   	return idr_get_next(&ns->idr, &nr);
>   }
>   
> +/**
> + * pidfd_open() - Open new pid file descriptor.
> + *
> + * @pid:   pid for which to retrieve a pidfd
> + * @flags: flags to pass
> + *
> + * This creates a new pid file descriptor with the O_CLOEXEC flag set for
> + * the process identified by @pid. Currently, the process identified by
> + * @pid must be a thread-group leader. This restriction currently exists
> + * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
> + * be used with CLONE_THREAD) and pidfd polling (only supports thread group
> + * leaders).
> + *
> + * Return: On success, a cloexec pidfd is returned.
> + *         On error, a negative errno number will be returned.
> + */
> +SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
> +{
> +	int fd, ret;
> +	struct pid *p;
> +
> +	if (flags)
> +		return -EINVAL;
> +
> +	if (pid <= 0)
> +		return -EINVAL;
> +
> +	p = find_get_pid(pid);
> +	if (!p)
> +		return -ESRCH;
> +
> +	ret = 0;
> +	rcu_read_lock();
> +	if (!pid_task(p, PIDTYPE_TGID))
> +		ret = -EINVAL;
> +	rcu_read_unlock();
> +
> +	fd = ret ?: pidfd_create(p);
> +	put_pid(p);
> +	return fd;
> +}
> +
>   void __init pid_idr_init(void)
>   {
>   	/* Verify no one has done anything silly: */
> 

^ permalink raw reply

* Re: [PATCH v3 1/5] mm: introduce MADV_COLD
From: Michal Hocko @ 2019-06-27 14:02 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Minchan Kim, Andrew Morton, linux-mm, LKML, linux-api,
	Johannes Weiner, Tim Murray, Joel Fernandes, Suren Baghdasaryan,
	Daniel Colascione, Shakeel Butt, Sonny Rao, oleksandr, hdanton,
	lizeb, Kirill A . Shutemov
In-Reply-To: <343599f9-3d99-b74f-1732-368e584fa5ef@intel.com>

On Thu 27-06-19 06:13:36, Dave Hansen wrote:
> On 6/27/19 4:54 AM, Minchan Kim wrote:
> > This patch introduces the new MADV_COLD hint to madvise(2) syscall.
> > MADV_COLD can be used by a process to mark a memory range as not expected
> > to be used in the near future. The hint can help kernel in deciding which
> > pages to evict early during memory pressure.
> > 
> > It works for every LRU pages like MADV_[DONTNEED|FREE]. IOW, It moves
> > 
> > 	active file page -> inactive file LRU
> > 	active anon page -> inacdtive anon LRU
> 
> Is the LRU behavior part of the interface or the implementation?
> 
> I ask because we've got something in between tossing something down the
> LRU and swapping it: page migration.  Specifically, on a system with
> slower memory media (like persistent memory) we just migrate a page
> instead of discarding it at reclaim:

But we already do have interfaces for migrating the memory
(move_pages(2)). Why should this interface duplicate that interface?
I believe the only purpose of these two new madvise modes is to provide
a non-destructive MADV_{DONTNEED,FREE} alteternatives. In other words,
pageout vs. age interface.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: [PATCH V33 24/30] bpf: Restrict bpf when kernel lockdown is in confidentiality mode
From: Stephen Smalley @ 2019-06-27 14:35 UTC (permalink / raw)
  To: Andy Lutomirski, James Morris
  Cc: Andy Lutomirski, Matthew Garrett, linux-security, LKML, Linux API,
	David Howells, Alexei Starovoitov, Matthew Garrett,
	Network Development, Chun-Yi Lee, Daniel Borkmann,
	linux-security-module
In-Reply-To: <6E53376F-01BB-4795-BC02-24F9CAE00001@amacapital.net>

On 6/26/19 8:57 PM, Andy Lutomirski wrote:
> 
> 
>> On Jun 26, 2019, at 1:22 PM, James Morris <jmorris@namei.org> wrote:
>>
>> [Adding the LSM mailing list: missed this patchset initially]
>>
>>> On Thu, 20 Jun 2019, Andy Lutomirski wrote:
>>>
>>> This patch exemplifies why I don't like this approach:
>>>
>>>> @@ -97,6 +97,7 @@ enum lockdown_reason {
>>>>         LOCKDOWN_INTEGRITY_MAX,
>>>>         LOCKDOWN_KCORE,
>>>>         LOCKDOWN_KPROBES,
>>>> +       LOCKDOWN_BPF,
>>>>         LOCKDOWN_CONFIDENTIALITY_MAX,
>>>
>>>> --- a/security/lockdown/lockdown.c
>>>> +++ b/security/lockdown/lockdown.c
>>>> @@ -33,6 +33,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = {
>>>>         [LOCKDOWN_INTEGRITY_MAX] = "integrity",
>>>>         [LOCKDOWN_KCORE] = "/proc/kcore access",
>>>>         [LOCKDOWN_KPROBES] = "use of kprobes",
>>>> +       [LOCKDOWN_BPF] = "use of bpf",
>>>>         [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality",
>>>
>>> The text here says "use of bpf", but what this patch is *really* doing
>>> is locking down use of BPF to read kernel memory.  If the details
>>> change, then every LSM needs to get updated, and we risk breaking user
>>> policies that are based on LSMs that offer excessively fine
>>> granularity.
>>
>> Can you give an example of how the details might change?
>>
>>> I'd be more comfortable if the LSM only got to see "confidentiality"
>>> or "integrity".
>>
>> These are not sufficient for creating a useful policy for the SELinux
>> case.
>>
>>
> 
> I may have misunderstood, but I thought that SELinux mainly needed to allow certain privileged programs to bypass the policy.  Is there a real example of what SELinux wants to do that can’t be done in the simplified model?
> 
> The think that specifically makes me uneasy about exposing all of these precise actions to LSMs is that they will get exposed to userspace in a way that forces us to treat them as stable ABIs.

There are two scenarios where finer-grained distinctions make sense:

- Users may need to enable specific functionality that falls under the 
umbrella of "confidentiality" or "integrity" lockdown.  Finer-grained 
lockdown reasons free them from having to make an all-or-nothing choice 
between lost functionality or no lockdown at all. This can be supported 
directly by the lockdown module without any help from SELinux or other 
security modules; we just need the ability to specify these 
finer-grained lockdown levels via the boot parameters and securityfs nodes.

- Different processes/programs may need to use different sets of 
functionality restricted via lockdown confidentiality or integrity 
categories.  If we have to allow all-or-none for the set of 
interfaces/functionality covered by the generic confidentiality or 
integrity categories, then we'll end up having to choose between lost 
functionality or overprivileged processes, neither of which is optimal.

Is it truly the case that everything under the "confidentiality" 
category poses the same level of risk to kernel confidentiality, and 
similarly for everything under the "integrity" category?  If not, then 
being able to distinguish them definitely has benefit.

I'm still not clear though on how/if this will compose with or be 
overridden by other security modules.  We would need some means for 
another security module to take over lockdown decisions once it has 
initialized (including policy load), and to be able to access state that 
is currently private to the lockdown module, like the level.

^ permalink raw reply

* Re: [PATCH v3 1/5] mm: introduce MADV_COLD
From: Dave Hansen @ 2019-06-27 14:36 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Minchan Kim, Andrew Morton, linux-mm, LKML, linux-api,
	Johannes Weiner, Tim Murray, Joel Fernandes, Suren Baghdasaryan,
	Daniel Colascione, Shakeel Butt, Sonny Rao, oleksandr, hdanton,
	lizeb, Kirill A . Shutemov
In-Reply-To: <20190627140203.GB5303@dhcp22.suse.cz>

On 6/27/19 7:02 AM, Michal Hocko wrote:
>> Is the LRU behavior part of the interface or the implementation?
>>
>> I ask because we've got something in between tossing something down the
>> LRU and swapping it: page migration.  Specifically, on a system with
>> slower memory media (like persistent memory) we just migrate a page
>> instead of discarding it at reclaim:
> But we already do have interfaces for migrating the memory
> (move_pages(2)). Why should this interface duplicate that interface?
> I believe the only purpose of these two new madvise modes is to provide
> a non-destructive MADV_{DONTNEED,FREE} alteternatives. In other words,
> pageout vs. age interface.

The existing interface's problem for this case is that it has to know
exact locations where the memory is and where it should go.  For
instance, if you have two sockets, you very likely want to demote DRAM
to the persistent memory DIMM sitting next to it and not go
cross-socket.  To do _that_, you need to know where the existing
allocation lies so you can find the appropriate destination node.

That's not a problem for existing NUMA-enlightened apps, but it is for
everything else.

For MADV_COLD, if we defined it like this, I think we could use it for
both purposes (demotion and LRU movement):

	Pages in the specified regions will be treated as less-recently-
	accessed compared to pages in the system with similar access
	frequencies.  In contrast to MADV_DONTNEED, the contents of the
	region are preserved.

It would be nice not to talk about reclaim at all since we're not
promising reclaim per se.

^ permalink raw reply

* Re: [PATCH v3 1/5] mm: introduce MADV_COLD
From: Michal Hocko @ 2019-06-27 14:53 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Minchan Kim, Andrew Morton, linux-mm, LKML, linux-api,
	Johannes Weiner, Tim Murray, Joel Fernandes, Suren Baghdasaryan,
	Daniel Colascione, Shakeel Butt, Sonny Rao, oleksandr, hdanton,
	lizeb, Kirill A . Shutemov
In-Reply-To: <d9341eb3-08eb-3c2b-9786-00b8a4f59953@intel.com>

On Thu 27-06-19 07:36:50, Dave Hansen wrote:
[...]
> For MADV_COLD, if we defined it like this, I think we could use it for
> both purposes (demotion and LRU movement):
> 
> 	Pages in the specified regions will be treated as less-recently-
> 	accessed compared to pages in the system with similar access
> 	frequencies.  In contrast to MADV_DONTNEED, the contents of the

you meant s@MADV_DONTNEED@MADV_FREE@ I suppose

> 	region are preserved.
> 
> It would be nice not to talk about reclaim at all since we're not
> promising reclaim per se.

Well, I guess this is just an implementation detail. MADV_FREE is really
only about aging. It is up to the kernel what to do during the reclaim
and the advice doesn't and shouldn't make any difference here.

Now MADV_PAGEOUT would be more tricky in that direction because it
defines an immediate action to page out the range. I do understand your
argument about NUMA unaware applications which might want to get
something like MADV_DEMOTE which would move a page to a secondary memory
(whatever that is) but I think this is asking for its own madvise.
MADV_PAGEOUT has a quite simple semnatic - move to the backing storage -
and I would rather not make it more complex.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: [PATCH V34 09/29] kexec_file: Restrict at runtime if the kernel is locked down
From: Matthew Garrett @ 2019-06-27 15:28 UTC (permalink / raw)
  To: James Morris
  Cc: LSM List, Linux Kernel Mailing List, Linux API, Jiri Bohac,
	David Howells, kexec
In-Reply-To: <alpine.LRH.2.21.1906271423070.16512@namei.org>

On Wed, Jun 26, 2019 at 9:59 PM James Morris <jmorris@namei.org> wrote:
> This is not a criticism of the patch but a related issue which I haven't
> seen discussed (apologies if it has).
>
> If signed code is loaded into ring 0, verified by the kernel, then
> executed, you still lose your secure/trusted/verified boot state. If the
> currently running kernel has been runtime-compromised, any signature
> verification performed by the kernel cannot be trusted.
>
> This problem is out of scope for the lockdown threat model (which
> naturally cannot include a compromised kernel), but folk should be aware
> that signature-verified kexec does not provide equivalent assurance to a
> full reboot on a secure-boot system.

By that metric, on a secure boot system how do we determine that code
running in the firmware environment wasn't compromised before it
launched the initial signed kernel?

^ permalink raw reply

* Re: [PATCH V34 19/29] Lock down module params that specify hardware parameters (eg. ioport)
From: Matthew Garrett @ 2019-06-27 15:30 UTC (permalink / raw)
  To: Daniel Axtens
  Cc: James Morris, LSM List, Linux Kernel Mailing List, Linux API,
	David Howells, Alan Cox
In-Reply-To: <87ef3f3ihh.fsf@dja-thinkpad.axtens.net>

On Wed, Jun 26, 2019 at 6:49 PM Daniel Axtens <dja@axtens.net> wrote:
>
> Matthew Garrett <matthewgarrett@google.com> writes:
> > +     if (kp->flags & KERNEL_PARAM_FL_HWPARAM &&
> > +         security_locked_down(LOCKDOWN_MODULE_PARAMETERS))
> > +             return false;
> > +     return true;
> >  }
>
> Should this test occur before tainting the kernel?

Seems reasonable.

^ permalink raw reply

* Re: [PATCH bpf-next v9 05/10] bpf,landlock: Add a new map type: inode
From: Mickaël Salaün @ 2019-06-27 16:18 UTC (permalink / raw)
  To: Al Viro, Mickaël Salaün
  Cc: linux-kernel, Aleksa Sarai, Alexei Starovoitov, Andrew Morton,
	Andy Lutomirski, Arnaldo Carvalho de Melo, Casey Schaufler,
	Daniel Borkmann, David Drysdale, David S . Miller,
	Eric W . Biederman, James Morris, Jann Horn, John Johansen,
	Jonathan Corbet, Kees Cook, Michael Kerrisk, Paul Moore,
	Sargun Dhillon, Serge E . Hallyn, Shuah Khan, Stephen Smalley,
	Tejun Heo, Tetsuo Handa <penguin-kern>
In-Reply-To: <20190625225201.GJ17978@ZenIV.linux.org.uk>


On 26/06/2019 00:52, Al Viro wrote:
> On Tue, Jun 25, 2019 at 11:52:34PM +0200, Mickaël Salaün wrote:
>> +/* must call iput(inode) after this call */
>> +static struct inode *inode_from_fd(int ufd, bool check_access)
>> +{
>> +    struct inode *ret;
>> +    struct fd f;
>> +    int deny;
>> +
>> +    f = fdget(ufd);
>> +    if (unlikely(!f.file || !file_inode(f.file))) {
>> +            ret = ERR_PTR(-EBADF);
>> +            goto put_fd;
>> +    }
>
> Just when does one get a NULL file_inode()?  The reason I'm asking is
> that arseloads of code would break if one managed to create such
> a beast...

I didn't find any API documentation about this guarantee, so I followed
a defensive programming approach. I'll remove the file_inode() check.

>
> Incidentally, that should be return ERR_PTR(-EBADF); fdput() is wrong there.

Right, I'll fix that.

>
>> +    }
>> +    /* check if the FD is tied to a mount point */
>> +    /* TODO: add this check when called from an eBPF program too */
>> +    if (unlikely(!f.file->f_path.mnt
>
> Again, the same question - when the hell can that happen?

Defensive programming again, I'll remove it.

> If you are
> sitting on an exploitable roothole, do share it...
>
>  || f.file->f_path.mnt->mnt_flags &
>> +                            MNT_INTERNAL)) {
>> +            ret = ERR_PTR(-EINVAL);
>> +            goto put_fd;
>
> What does it have to do with mountpoints, anyway?

I want to only manage inodes tied to a userspace-visible file system
(this check may not be enough though). It doesn't make sense to be able
to add inodes which are not mounted, to this kind of map.

>
>> +/* called from syscall */
>> +static int sys_inode_map_delete_elem(struct bpf_map *map, struct inode *key)
>> +{
>> +    struct inode_array *array = container_of(map, struct inode_array, map);
>> +    struct inode *inode;
>> +    int i;
>> +
>> +    WARN_ON_ONCE(!rcu_read_lock_held());
>> +    for (i = 0; i < array->map.max_entries; i++) {
>> +            if (array->elems[i].inode == key) {
>> +                    inode = xchg(&array->elems[i].inode, NULL);
>> +                    array->nb_entries--;
>
> Umm...  Is that intended to be atomic in any sense?

nb_entries is not used as a bound check but to avoid walking uselessly
through the (pre-allocated) array when adding a new element, but I'll
use an atomic to avoid inconsistencies anyway.

>
>> +                    iput(inode);
>> +                    return 0;
>> +            }
>> +    }
>> +    return -ENOENT;
>> +}
>> +
>> +/* called from syscall */
>> +int bpf_inode_map_delete_elem(struct bpf_map *map, int *key)
>> +{
>> +    struct inode *inode;
>> +    int err;
>> +
>> +    inode = inode_from_fd(*key, false);
>> +    if (IS_ERR(inode))
>> +            return PTR_ERR(inode);
>> +    err = sys_inode_map_delete_elem(map, inode);
>> +    iput(inode);
>> +    return err;
>> +}
>
> Wait a sec...  So we have those beasties that can have long-term
> references to arbitrary inodes stuck in them?  What will happen
> if you get umount(2) called while such a thing exists?

I though an umount would be denied but no, we get a self-destructed busy
inode and a bug!
What about wrapping the inode's superblock->s_op->destroy_inode() to
first remove the element from the map and then call the real
destroy_inode(), if any?
Or I could update fs/inode.c:destroy_inode() to call inode->free_inode()
if it is set, and set it when such inode is referenced by a map?
Or maybe I could hold the referencing file in the map and then wrap its
f_op?


--
Mickaël Salaün
ANSSI/SDE/ST/LAM

Les données à caractère personnel recueillies et traitées dans le cadre de cet échange, le sont à seule fin d’exécution d’une relation professionnelle et s’opèrent dans cette seule finalité et pour la durée nécessaire à cette relation. Si vous souhaitez faire usage de vos droits de consultation, de rectification et de suppression de vos données, veuillez contacter contact.rgpd@sgdsn.gouv.fr. Si vous avez reçu ce message par erreur, nous vous remercions d’en informer l’expéditeur et de détruire le message. The personal data collected and processed during this exchange aims solely at completing a business relationship and is limited to the necessary duration of that relationship. If you wish to use your rights of consultation, rectification and deletion of your data, please contact: contact.rgpd@sgdsn.gouv.fr. If you have received this message in error, we thank you for informing the sender and destroying the message.

^ permalink raw reply

* Re: [PATCH v4 2/2] fpga: dfl: fme: add performance reporting support
From: Greg KH @ 2019-06-27 16:53 UTC (permalink / raw)
  To: Wu Hao
  Cc: mdf, linux-fpga, linux-kernel, linux-api, atull, Luwei Kang,
	Xu Yilun
In-Reply-To: <1561612195-6081-3-git-send-email-hao.wu@intel.com>

On Thu, Jun 27, 2019 at 01:09:55PM +0800, Wu Hao wrote:
> This patch adds support for performance reporting private feature
> for FPGA Management Engine (FME). Now it supports several different
> performance counters, including 'basic', 'cache', 'fabric', 'vtd'
> and 'vtd_sip'. It allows user to use standard linux tools to access
> these performance counters.
> 
> e.g. List all events by "perf list"
> 
>   perf list | grep fme
> 
>   fme0/cache_read_hit/                         [Kernel PMU event]
>   fme0/cache_read_miss/                        [Kernel PMU event]
>   ...
> 
>   fme0/fab_mmio_read/                          [Kernel PMU event]
>   fme0/fab_mmio_write/                         [Kernel PMU event]
>   ...
> 
>   fme0/fab_port_mmio_read,portid=?/            [Kernel PMU event]
>   fme0/fab_port_mmio_write,portid=?/           [Kernel PMU event]
>   ...
> 
>   fme0/vtd_port_devtlb_1g_fill,portid=?/       [Kernel PMU event]
>   fme0/vtd_port_devtlb_2m_fill,portid=?/       [Kernel PMU event]
>   ...
> 
>   fme0/vtd_sip_iotlb_1g_hit/                   [Kernel PMU event]
>   fme0/vtd_sip_iotlb_1g_miss/                  [Kernel PMU event]
>   ...
> 
>   fme0/clock                                   [Kernel PMU event]
>   ...
> 
> e.g. check increased counter value after run one application using
> "perf stat" command.
> 
>  perf stat -e fme0/fab_mmio_read/,fme0/fab_mmio_write/, ./test
> 
>  Performance counter stats for './test':
> 
>                  1      fme0/fab_mmio_read/
>                  2      fme0/fab_mmio_write/
> 
>        1.009496520 seconds time elapsed
> 
> Please note that fabric counters support both fab_* and fab_port_*, but
> actually they are sharing one set of performance counters in hardware.
> If user wants to monitor overall data events on fab_* then fab_port_*
> can't be supported at the same time, see example below:
> 
> perf stat -e fme0/fab_mmio_read/,fme0/fab_port_mmio_write,portid=0/
> 
>  Performance counter stats for 'system wide':
> 
>                  0      fme0/fab_mmio_read/
>    <not supported>      fme0/fab_port_mmio_write,portid=0/
> 
>        2.141064085 seconds time elapsed
> 
> Signed-off-by: Luwei Kang <luwei.kang@intel.com>
> Signed-off-by: Xu Yilun <yilun.xu@intel.com>
> Signed-off-by: Wu Hao <hao.wu@intel.com>
> ---
> v3: replace scnprintf with sprintf in sysfs interfaces.
>     update sysfs doc kernel version and date.
>     fix sysfs doc issue for fabric counter.
>     refine PERF_OBJ_ATTR_* macro, doesn't count on __ATTR anymore.
>     introduce PERF_OBJ_ATTR_F_* macro, as it needs to use different
>     filenames for some of the sysfs attributes.
>     remove kobject_del when destroy kobject, kobject_put is enough.
>     do sysfs_remove_groups first when destroying perf_obj.
>     WARN_ON_ONCE in case internal parms are wrong in read_*_count().
> v4: rework this patch to use standard perf API as user interfaces.
> ---
>  drivers/fpga/Makefile       |   1 +
>  drivers/fpga/dfl-fme-main.c |   4 +
>  drivers/fpga/dfl-fme-perf.c | 871 ++++++++++++++++++++++++++++++++++++++++++++
>  drivers/fpga/dfl-fme.h      |   2 +
>  4 files changed, 878 insertions(+)
>  create mode 100644 drivers/fpga/dfl-fme-perf.c
> 
> diff --git a/drivers/fpga/Makefile b/drivers/fpga/Makefile
> index 4865b74..d8e21df 100644
> --- a/drivers/fpga/Makefile
> +++ b/drivers/fpga/Makefile
> @@ -40,6 +40,7 @@ obj-$(CONFIG_FPGA_DFL_FME_REGION)	+= dfl-fme-region.o
>  obj-$(CONFIG_FPGA_DFL_AFU)		+= dfl-afu.o
>  
>  dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o dfl-fme-error.o
> +dfl-fme-objs += dfl-fme-perf.o
>  dfl-afu-objs := dfl-afu-main.o dfl-afu-region.o dfl-afu-dma-region.o
>  dfl-afu-objs += dfl-afu-error.o
>  
> diff --git a/drivers/fpga/dfl-fme-main.c b/drivers/fpga/dfl-fme-main.c
> index 9225b68..a11c112 100644
> --- a/drivers/fpga/dfl-fme-main.c
> +++ b/drivers/fpga/dfl-fme-main.c
> @@ -639,6 +639,10 @@ static void fme_power_mgmt_uinit(struct platform_device *pdev,
>  		.ops = &fme_power_mgmt_ops,
>  	},
>  	{
> +		.id_table = fme_perf_id_table,
> +		.ops = &fme_perf_ops,
> +	},
> +	{
>  		.ops = NULL,
>  	},
>  };
> diff --git a/drivers/fpga/dfl-fme-perf.c b/drivers/fpga/dfl-fme-perf.c
> new file mode 100644
> index 0000000..0d7768a
> --- /dev/null
> +++ b/drivers/fpga/dfl-fme-perf.c
> @@ -0,0 +1,871 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Driver for FPGA Management Engine (FME) Global Performance Reporting
> + *
> + * Copyright 2019 Intel Corporation, Inc.
> + *
> + * Authors:
> + *   Kang Luwei <luwei.kang@intel.com>
> + *   Xiao Guangrong <guangrong.xiao@linux.intel.com>
> + *   Wu Hao <hao.wu@intel.com>
> + *   Xu Yilun <yilun.xu@intel.com>
> + *   Joseph Grecco <joe.grecco@intel.com>
> + *   Enno Luebbers <enno.luebbers@intel.com>
> + *   Tim Whisonant <tim.whisonant@intel.com>
> + *   Ananda Ravuri <ananda.ravuri@intel.com>
> + *   Mitchel, Henry <henry.mitchel@intel.com>
> + */
> +
> +#include <linux/perf_event.h>
> +#include "dfl.h"
> +#include "dfl-fme.h"
> +
> +/*
> + * Performance Counter Registers for Cache.
> + *
> + * Cache Events are listed below as CACHE_EVNT_*.
> + */
> +#define CACHE_CTRL			0x8
> +#define CACHE_RESET_CNTR		BIT_ULL(0)
> +#define CACHE_FREEZE_CNTR		BIT_ULL(8)
> +#define CACHE_CTRL_EVNT			GENMASK_ULL(19, 16)
> +#define CACHE_EVNT_RD_HIT		0x0
> +#define CACHE_EVNT_WR_HIT		0x1
> +#define CACHE_EVNT_RD_MISS		0x2
> +#define CACHE_EVNT_WR_MISS		0x3
> +#define CACHE_EVNT_RSVD			0x4
> +#define CACHE_EVNT_HOLD_REQ		0x5
> +#define CACHE_EVNT_DATA_WR_PORT_CONTEN	0x6
> +#define CACHE_EVNT_TAG_WR_PORT_CONTEN	0x7
> +#define CACHE_EVNT_TX_REQ_STALL		0x8
> +#define CACHE_EVNT_RX_REQ_STALL		0x9
> +#define CACHE_EVNT_EVICTIONS		0xa
> +#define CACHE_CHANNEL_SEL		BIT_ULL(20)
> +#define CACHE_CHANNEL_RD		0
> +#define CACHE_CHANNEL_WR		1
> +#define CACHE_CNTR0			0x10
> +#define CACHE_CNTR1			0x18
> +#define CACHE_CNTR_EVNT_CNTR		GENMASK_ULL(47, 0)
> +#define CACHE_CNTR_EVNT			GENMASK_ULL(63, 60)
> +
> +/*
> + * Performance Counter Registers for Fabric.
> + *
> + * Fabric Events are listed below as FAB_EVNT_*
> + */
> +#define FAB_CTRL			0x20
> +#define FAB_RESET_CNTR			BIT_ULL(0)
> +#define FAB_FREEZE_CNTR			BIT_ULL(8)
> +#define FAB_CTRL_EVNT			GENMASK_ULL(19, 16)
> +#define FAB_EVNT_PCIE0_RD		0x0
> +#define FAB_EVNT_PCIE0_WR		0x1
> +#define FAB_EVNT_PCIE1_RD		0x2
> +#define FAB_EVNT_PCIE1_WR		0x3
> +#define FAB_EVNT_UPI_RD			0x4
> +#define FAB_EVNT_UPI_WR			0x5
> +#define FAB_EVNT_MMIO_RD		0x6
> +#define FAB_EVNT_MMIO_WR		0x7
> +#define FAB_PORT_ID			GENMASK_ULL(21, 20)
> +#define FAB_PORT_FILTER			BIT_ULL(23)
> +#define FAB_PORT_FILTER_DISABLE		0
> +#define FAB_PORT_FILTER_ENABLE		1
> +#define FAB_CNTR			0x28
> +#define FAB_CNTR_EVNT_CNTR		GENMASK_ULL(59, 0)
> +#define FAB_CNTR_EVNT			GENMASK_ULL(63, 60)
> +
> +/*
> + * Performance Counter Registers for Clock.
> + *
> + * Clock Counter can't be reset or frozen by SW.
> + */
> +#define CLK_CNTR			0x30
> +#define BASIC_EVNT_CLK			0x0
> +
> +/*
> + * Performance Counter Registers for IOMMU / VT-D.
> + *
> + * VT-D Events are listed below as VTD_EVNT_* and VTD_SIP_EVNT_*
> + */
> +#define VTD_CTRL			0x38
> +#define VTD_RESET_CNTR			BIT_ULL(0)
> +#define VTD_FREEZE_CNTR			BIT_ULL(8)
> +#define VTD_CTRL_EVNT			GENMASK_ULL(19, 16)
> +#define VTD_EVNT_AFU_MEM_RD_TRANS	0x0
> +#define VTD_EVNT_AFU_MEM_WR_TRANS	0x1
> +#define VTD_EVNT_AFU_DEVTLB_RD_HIT	0x2
> +#define VTD_EVNT_AFU_DEVTLB_WR_HIT	0x3
> +#define VTD_EVNT_DEVTLB_4K_FILL		0x4
> +#define VTD_EVNT_DEVTLB_2M_FILL		0x5
> +#define VTD_EVNT_DEVTLB_1G_FILL		0x6
> +#define VTD_CNTR			0x40
> +#define VTD_CNTR_EVNT_CNTR		GENMASK_ULL(47, 0)
> +#define VTD_CNTR_EVNT			GENMASK_ULL(63, 60)
> +
> +#define VTD_SIP_CTRL			0x48
> +#define VTD_SIP_RESET_CNTR		BIT_ULL(0)
> +#define VTD_SIP_FREEZE_CNTR		BIT_ULL(8)
> +#define VTD_SIP_CTRL_EVNT		GENMASK_ULL(19, 16)
> +#define VTD_SIP_EVNT_IOTLB_4K_HIT	0x0
> +#define VTD_SIP_EVNT_IOTLB_2M_HIT	0x1
> +#define VTD_SIP_EVNT_IOTLB_1G_HIT	0x2
> +#define VTD_SIP_EVNT_SLPWC_L3_HIT	0x3
> +#define VTD_SIP_EVNT_SLPWC_L4_HIT	0x4
> +#define VTD_SIP_EVNT_RCC_HIT		0x5
> +#define VTD_SIP_EVNT_IOTLB_4K_MISS	0x6
> +#define VTD_SIP_EVNT_IOTLB_2M_MISS	0x7
> +#define VTD_SIP_EVNT_IOTLB_1G_MISS	0x8
> +#define VTD_SIP_EVNT_SLPWC_L3_MISS	0x9
> +#define VTD_SIP_EVNT_SLPWC_L4_MISS	0xa
> +#define VTD_SIP_EVNT_RCC_MISS		0xb
> +#define VTD_SIP_CNTR			0X50
> +#define VTD_SIP_CNTR_EVNT_CNTR		GENMASK_ULL(47, 0)
> +#define VTD_SIP_CNTR_EVNT		GENMASK_ULL(63, 60)
> +
> +#define PERF_TIMEOUT			30
> +
> +#define PERF_MAX_PORT_NUM		1
> +
> +/**
> + * struct fme_perf_priv - priv data structure for fme perf driver
> + *
> + * @dev: parent device.
> + * @ioaddr: mapped base address of mmio region.
> + * @pmu: pmu data structure for fme perf counters.
> + * @id: id of this fme performance report private feature.
> + * @fab_users: current user number on fabric counters.
> + * @fab_port_id: used to indicate current working mode of fabric counters.
> + * @fab_lock: lock to protect fabric counters working mode.
> + * @events_group: events attribute group for fme perf pmu.
> + * @attr_groups: attribute groups for fme perf pmu.
> + */
> +struct fme_perf_priv {
> +	struct device *dev;
> +	void __iomem *ioaddr;
> +	struct pmu pmu;
> +	u64 id;
> +
> +	u32 fab_users;
> +	u32 fab_port_id;
> +	spinlock_t fab_lock;
> +
> +	struct attribute_group events_group;
> +	const struct attribute_group *attr_groups[4];
> +};
> +
> +/**
> + * struct fme_perf_event_attr - fme perf event attribute
> + *
> + * @attr: device attribute of fme perf event.
> + * @event_id: id of fme perf event.
> + * @event_type: type of fme perf event.
> + * @is_port_event: indicate if this is a port based event.
> + * @data: private data for fme perf event.
> + */
> +struct fme_perf_event_attr {
> +	struct device_attribute attr;
> +	u32 event_id;
> +	u32 event_type;
> +	bool is_port_event;
> +	u64 data;
> +};
> +
> +/**
> + * struct fme_perf_event_ops - callbacks for fme perf events
> + *
> + * @event_init: callback invoked during event init.
> + * @event_destroy: callback invoked during event destroy.
> + * @read_counter: callback to read hardware counters.
> + */
> +struct fme_perf_event_ops {
> +	int (*event_init)(struct fme_perf_priv *priv, u32 event,
> +			  u32 port_id, u64 data);
> +	void (*event_destroy)(struct fme_perf_priv *priv, u32 event,
> +			      u32 port_id, u64 data);
> +	u64 (*read_counter)(struct fme_perf_priv *priv, u32 event,
> +			    u32 port_id, u64 data);
> +};
> +
> +/**
> + * struct fme_perf_event_group - fme perf groups
> + *
> + * @ev_attrs: fme perf event attributes.
> + * @num: events number in this group.
> + * @ops: same callbacks shared by all fme perf events in this group.
> + */
> +struct fme_perf_event_group {
> +	struct fme_perf_event_attr *ev_attrs;
> +	unsigned int num;
> +	struct fme_perf_event_ops *ops;
> +};
> +
> +#define to_fme_perf_priv(_pmu)	container_of(_pmu, struct fme_perf_priv, pmu)
> +
> +static cpumask_t fme_perf_cpumask = CPU_MASK_CPU0;
> +
> +static ssize_t cpumask_show(struct device *dev,
> +			    struct device_attribute *attr, char *buf)
> +{
> +	return cpumap_print_to_pagebuf(true, buf, &fme_perf_cpumask);
> +}
> +static DEVICE_ATTR_RO(cpumask);
> +
> +static struct attribute *fme_perf_cpumask_attrs[] = {
> +	&dev_attr_cpumask.attr,
> +	NULL,
> +};
> +
> +static struct attribute_group fme_perf_cpumask_group = {
> +	.attrs = fme_perf_cpumask_attrs,
> +};
> +
> +#define FME_EVENT_MASK		GENMASK_ULL(11, 0)
> +#define FME_EVTYPE_MASK		GENMASK_ULL(15, 12)
> +#define FME_EVTYPE_BASIC	0
> +#define FME_EVTYPE_CACHE	1
> +#define FME_EVTYPE_FABRIC	2
> +#define FME_EVTYPE_VTD		3
> +#define FME_EVTYPE_VTD_SIP	4
> +#define FME_EVTYPE_MAX		FME_EVTYPE_VTD_SIP
> +#define FME_PORTID_MASK		GENMASK_ULL(23, 16)
> +#define FME_PORTID_ROOT		(0xffU)
> +
> +PMU_FORMAT_ATTR(event,		"config:0-11");
> +PMU_FORMAT_ATTR(evtype,		"config:12-15");
> +PMU_FORMAT_ATTR(portid,		"config:16-23");
> +
> +static struct attribute *fme_perf_format_attrs[] = {
> +	&format_attr_event.attr,
> +	&format_attr_evtype.attr,
> +	&format_attr_portid.attr,
> +	NULL,
> +};
> +
> +static struct attribute_group fme_perf_format_group = {
> +	.name = "format",
> +	.attrs = fme_perf_format_attrs,
> +};

No Documentation/ABI/ entries for these sysfs files?

thanks,

greg k-h

^ permalink raw reply

* Re: [PATCH bpf-next v9 05/10] bpf,landlock: Add a new map type: inode
From: Al Viro @ 2019-06-27 16:56 UTC (permalink / raw)
  To: Mickaël Salaün
  Cc: Mickaël Salaün, linux-kernel, Aleksa Sarai,
	Alexei Starovoitov, Andrew Morton, Andy Lutomirski,
	Arnaldo Carvalho de Melo, Casey Schaufler, Daniel Borkmann,
	David Drysdale, David S . Miller, Eric W . Biederman,
	James Morris, Jann Horn, John Johansen, Jonathan Corbet,
	Kees Cook, Michael Kerrisk, Paul Moore
In-Reply-To: <79bac827-4092-8a4d-9dc6-6019419b2486@ssi.gouv.fr>

On Thu, Jun 27, 2019 at 06:18:12PM +0200, Mickaël Salaün wrote:

> >> +/* called from syscall */
> >> +static int sys_inode_map_delete_elem(struct bpf_map *map, struct inode *key)
> >> +{
> >> +    struct inode_array *array = container_of(map, struct inode_array, map);
> >> +    struct inode *inode;
> >> +    int i;
> >> +
> >> +    WARN_ON_ONCE(!rcu_read_lock_held());
> >> +    for (i = 0; i < array->map.max_entries; i++) {
> >> +            if (array->elems[i].inode == key) {
> >> +                    inode = xchg(&array->elems[i].inode, NULL);
> >> +                    array->nb_entries--;
> >
> > Umm...  Is that intended to be atomic in any sense?
> 
> nb_entries is not used as a bound check but to avoid walking uselessly
> through the (pre-allocated) array when adding a new element, but I'll
> use an atomic to avoid inconsistencies anyway.


> > Wait a sec...  So we have those beasties that can have long-term
> > references to arbitrary inodes stuck in them?  What will happen
> > if you get umount(2) called while such a thing exists?
> 
> I though an umount would be denied but no, we get a self-destructed busy
> inode and a bug!
> What about wrapping the inode's superblock->s_op->destroy_inode() to
> first remove the element from the map and then call the real
> destroy_inode(), if any?

What do you mean, _the_ map?  I don't see anything to prevent insertion
of references to the same inode into any number of those...

> Or I could update fs/inode.c:destroy_inode() to call inode->free_inode()
> if it is set, and set it when such inode is referenced by a map?
> Or maybe I could hold the referencing file in the map and then wrap its
> f_op?

First of all, anything including the word "wrap" is a non-starter.
We really don't need the headache associated with the locking needed
to replace the method tables on the fly, or with the code checking that
->f_op points to given method table, etc.  That's not going to fly,
especially since you'd end up _chaining_ those (again, the same reference
can go in more than once).

Nothing is allowed to change the method tables of live objects, period.
Once a struct file is opened, its ->f_op is never going to change and
it entirely belongs to the device driver or filesystem it resides on.
Nothing else (not VFS, not VM, not some LSM module, etc.) has any business
touching that.  The same goes for inodes, dentries, etc.

What kind of behaviour do you want there?  Do you want the inodes you've
referenced there to be forgotten on e.g. memory pressure?  The thing is,
I don't see how "it's getting freed" could map onto any semantics that
might be useful for you - it looks like the wrong event for that.

^ permalink raw reply

* Re: [PATCH v3 0/5] Introduce MADV_COLD and MADV_PAGEOUT
From: Kirill A. Shutemov @ 2019-06-27 18:06 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, linux-mm, LKML, linux-api, Michal Hocko,
	Johannes Weiner, Tim Murray, Joel Fernandes, Suren Baghdasaryan,
	Daniel Colascione, Shakeel Butt, Sonny Rao, oleksandr, hdanton,
	lizeb, Dave Hansen, Kirill A . Shutemov
In-Reply-To: <20190627115405.255259-1-minchan@kernel.org>

On Thu, Jun 27, 2019 at 08:54:00PM +0900, Minchan Kim wrote:
> - Problem
> 
> Naturally, cached apps were dominant consumers of memory on the system.
> However, they were not significant consumers of swap even though they are
> good candidate for swap. Under investigation, swapping out only begins
> once the low zone watermark is hit and kswapd wakes up, but the overall
> allocation rate in the system might trip lmkd thresholds and cause a cached
> process to be killed(we measured performance swapping out vs. zapping the
> memory by killing a process. Unsurprisingly, zapping is 10x times faster
> even though we use zram which is much faster than real storage) so kill
> from lmkd will often satisfy the high zone watermark, resulting in very
> few pages actually being moved to swap.

Maybe we should look if we do The Right Thing™ at system-wide level before
introducing new API? How changing swappiness affects your workloads? What
is swappiness value in your setup?

-- 
 Kirill A. Shutemov

^ permalink raw reply

* Re: [PATCH V33 24/30] bpf: Restrict bpf when kernel lockdown is in confidentiality mode
From: James Morris @ 2019-06-27 18:06 UTC (permalink / raw)
  To: Stephen Smalley
  Cc: Andy Lutomirski, Andy Lutomirski, Matthew Garrett, linux-security,
	LKML, Linux API, David Howells, Alexei Starovoitov,
	Matthew Garrett, Network Development, Chun-Yi Lee,
	Daniel Borkmann, linux-security-module
In-Reply-To: <bce70c8b-9efd-6362-d536-cfbbcf70b0b7@tycho.nsa.gov>

On Thu, 27 Jun 2019, Stephen Smalley wrote:

> There are two scenarios where finer-grained distinctions make sense:
> 
> - Users may need to enable specific functionality that falls under the
> umbrella of "confidentiality" or "integrity" lockdown.  Finer-grained lockdown
> reasons free them from having to make an all-or-nothing choice between lost
> functionality or no lockdown at all.

Agreed. This will be used for more than just UEFI secure boot on desktops, 
e.g. embedded systems using verified boot, where finer grained policy will 
be needed for what are sometimes very specific use-cases (which may be 
also covered by other mitigations).

> This can be supported directly by the
> lockdown module without any help from SELinux or other security modules; we
> just need the ability to specify these finer-grained lockdown levels via the
> boot parameters and securityfs nodes.

If the lockdown LSM implements fine grained policy (rather than the simple 
coarse grained policy), I'd suggest adding a new lockdown level of 
'custom' which by default enables all hooks but allows selective 
disablement via params/sysfs.

This would be simpler than telling users to use a different lockdown LSM 
for this.

> - Different processes/programs may need to use different sets of functionality
> restricted via lockdown confidentiality or integrity categories.  If we have
> to allow all-or-none for the set of interfaces/functionality covered by the
> generic confidentiality or integrity categories, then we'll end up having to
> choose between lost functionality or overprivileged processes, neither of
> which is optimal.
> 
> Is it truly the case that everything under the "confidentiality" category
> poses the same level of risk to kernel confidentiality, and similarly for
> everything under the "integrity" category?  If not, then being able to
> distinguish them definitely has benefit.

Good question. We can't know the answer to this unless we know how an 
attacker might leverage access.

The value here IMHO is more in allowing tradeoffs to be made by system 
designers vs. disabling lockdown entirely.

> I'm still not clear though on how/if this will compose with or be overridden
> by other security modules.  We would need some means for another security
> module to take over lockdown decisions once it has initialized (including
> policy load), and to be able to access state that is currently private to the
> lockdown module, like the level.

Why not utilize stacking (restrictively), similarly to capabilities?


-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply

* Re: [PATCH V34 09/29] kexec_file: Restrict at runtime if the kernel is locked down
From: James Morris @ 2019-06-27 18:14 UTC (permalink / raw)
  To: Matthew Garrett
  Cc: Jiri Bohac, Linux API, kexec-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	Linux Kernel Mailing List, David Howells, LSM List
In-Reply-To: <CACdnJusJeCYPKVFHiu6yn+mfqQe5k0RqZhbCUZEkxtXx_shMmw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On Thu, 27 Jun 2019, Matthew Garrett wrote:

> By that metric, on a secure boot system how do we determine that code
> running in the firmware environment wasn't compromised before it
> launched the initial signed kernel?

Remote attestation tied to a hardware root of trust, before allowing 
access to any further resources.


-- 
James Morris
<jmorris-gx6/JNMH7DfYtjvyW6yDsg@public.gmane.org>

^ permalink raw reply

* Re: [PATCH V33 24/30] bpf: Restrict bpf when kernel lockdown is in confidentiality mode
From: Stephen Smalley @ 2019-06-27 20:16 UTC (permalink / raw)
  To: James Morris
  Cc: Andy Lutomirski, Andy Lutomirski, Matthew Garrett, linux-security,
	LKML, Linux API, David Howells, Alexei Starovoitov,
	Matthew Garrett, Network Development, Chun-Yi Lee,
	Daniel Borkmann, linux-security-module
In-Reply-To: <alpine.LRH.2.21.1906280332500.17363@namei.org>

On 6/27/19 2:06 PM, James Morris wrote:
> On Thu, 27 Jun 2019, Stephen Smalley wrote:
> 
>> There are two scenarios where finer-grained distinctions make sense:
>>
>> - Users may need to enable specific functionality that falls under the
>> umbrella of "confidentiality" or "integrity" lockdown.  Finer-grained lockdown
>> reasons free them from having to make an all-or-nothing choice between lost
>> functionality or no lockdown at all.
> 
> Agreed. This will be used for more than just UEFI secure boot on desktops,
> e.g. embedded systems using verified boot, where finer grained policy will
> be needed for what are sometimes very specific use-cases (which may be
> also covered by other mitigations).
> 
>> This can be supported directly by the
>> lockdown module without any help from SELinux or other security modules; we
>> just need the ability to specify these finer-grained lockdown levels via the
>> boot parameters and securityfs nodes.
> 
> If the lockdown LSM implements fine grained policy (rather than the simple
> coarse grained policy), I'd suggest adding a new lockdown level of
> 'custom' which by default enables all hooks but allows selective
> disablement via params/sysfs.
> 
> This would be simpler than telling users to use a different lockdown LSM
> for this.
> 
>> - Different processes/programs may need to use different sets of functionality
>> restricted via lockdown confidentiality or integrity categories.  If we have
>> to allow all-or-none for the set of interfaces/functionality covered by the
>> generic confidentiality or integrity categories, then we'll end up having to
>> choose between lost functionality or overprivileged processes, neither of
>> which is optimal.
>>
>> Is it truly the case that everything under the "confidentiality" category
>> poses the same level of risk to kernel confidentiality, and similarly for
>> everything under the "integrity" category?  If not, then being able to
>> distinguish them definitely has benefit.
> 
> Good question. We can't know the answer to this unless we know how an
> attacker might leverage access.
> 
> The value here IMHO is more in allowing tradeoffs to be made by system
> designers vs. disabling lockdown entirely.
> 
>> I'm still not clear though on how/if this will compose with or be overridden
>> by other security modules.  We would need some means for another security
>> module to take over lockdown decisions once it has initialized (including
>> policy load), and to be able to access state that is currently private to the
>> lockdown module, like the level.
> 
> Why not utilize stacking (restrictively), similarly to capabilities?

That would only allow the LSM to further lock down the system above the 
lockdown level set at boot, not grant exemptions for specific 
functionality/interfaces required by the user or by a specific 
process/program. You'd have to boot with lockdown=none (or your 
lockdown=custom suggestion) in order for the LSM to allow anything 
covered by the integrity or confidentiality levels.  And then the kernel 
would be unprotected prior to full initialization of the LSM, including 
policy load.

It seems like one would want to be able to boot with lockdown=integrity 
to protect the kernel initially, then switch over to allowing the LSM to 
selectively override it.

^ permalink raw reply

* Re: [PATCH v3 0/5] Introduce MADV_COLD and MADV_PAGEOUT
From: Minchan Kim @ 2019-06-27 23:12 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Andrew Morton, linux-mm, LKML, linux-api, Michal Hocko,
	Johannes Weiner, Tim Murray, Joel Fernandes, Suren Baghdasaryan,
	Daniel Colascione, Shakeel Butt, Sonny Rao, oleksandr, hdanton,
	lizeb, Dave Hansen, Kirill A . Shutemov
In-Reply-To: <20190627180601.xcppuzia3gk57lq2@box>

On Thu, Jun 27, 2019 at 09:06:01PM +0300, Kirill A. Shutemov wrote:
> On Thu, Jun 27, 2019 at 08:54:00PM +0900, Minchan Kim wrote:
> > - Problem
> > 
> > Naturally, cached apps were dominant consumers of memory on the system.
> > However, they were not significant consumers of swap even though they are
> > good candidate for swap. Under investigation, swapping out only begins
> > once the low zone watermark is hit and kswapd wakes up, but the overall
> > allocation rate in the system might trip lmkd thresholds and cause a cached
> > process to be killed(we measured performance swapping out vs. zapping the
> > memory by killing a process. Unsurprisingly, zapping is 10x times faster
> > even though we use zram which is much faster than real storage) so kill
> > from lmkd will often satisfy the high zone watermark, resulting in very
> > few pages actually being moved to swap.
> 
> Maybe we should look if we do The Right Thing™ at system-wide level before
> introducing new API? How changing swappiness affects your workloads? What
> is swappiness value in your setup?

It was 100. Even, I tried 150 and 200 with simple hack of swappiness.
However, it caused too excessive swpout.

Anyway, systen-level tune is generally good but if process has hint, that
should work better and that's why advise API is.

^ permalink raw reply

* Re: [PATCH V33 24/30] bpf: Restrict bpf when kernel lockdown is in confidentiality mode
From: Matthew Garrett @ 2019-06-27 23:16 UTC (permalink / raw)
  To: Stephen Smalley
  Cc: James Morris, Andy Lutomirski, Andy Lutomirski, linux-security,
	LKML, Linux API, David Howells, Alexei Starovoitov,
	Network Development, Chun-Yi Lee, Daniel Borkmann, LSM List
In-Reply-To: <de8b15eb-ba6c-847a-7435-42742203d4a5@tycho.nsa.gov>

On Thu, Jun 27, 2019 at 1:16 PM Stephen Smalley <sds@tycho.nsa.gov> wrote:
> That would only allow the LSM to further lock down the system above the
> lockdown level set at boot, not grant exemptions for specific
> functionality/interfaces required by the user or by a specific
> process/program. You'd have to boot with lockdown=none (or your
> lockdown=custom suggestion) in order for the LSM to allow anything
> covered by the integrity or confidentiality levels.  And then the kernel
> would be unprotected prior to full initialization of the LSM, including
> policy load.
>
> It seems like one would want to be able to boot with lockdown=integrity
> to protect the kernel initially, then switch over to allowing the LSM to
> selectively override it.

One option would be to allow modules to be "unstacked" at runtime, but
there's still something of a problem here - how do you ensure that
your userland can be trusted to load a new policy before it does so?
If you're able to assert that your early userland is trustworthy
(perhaps because it's in an initramfs that's part of your signed boot
payload), there's maybe an argument that most of the lockdown
integrity guarantees are unnecessary before handoff - just using the
lockdown LSM to protect against attacks via kernel parameters would be
sufficient.

^ permalink raw reply

* Re: [PATCH V34 09/29] kexec_file: Restrict at runtime if the kernel is locked down
From: Matthew Garrett @ 2019-06-27 23:17 UTC (permalink / raw)
  To: James Morris
  Cc: LSM List, Linux Kernel Mailing List, Linux API, Jiri Bohac,
	David Howells, kexec
In-Reply-To: <alpine.LRH.2.21.1906280411370.18880@namei.org>

On Thu, Jun 27, 2019 at 11:14 AM James Morris <jmorris@namei.org> wrote:
>
> On Thu, 27 Jun 2019, Matthew Garrett wrote:
>
> > By that metric, on a secure boot system how do we determine that code
> > running in the firmware environment wasn't compromised before it
> > launched the initial signed kernel?
>
> Remote attestation tied to a hardware root of trust, before allowing
> access to any further resources.

If you use IMA you can get the same guarantees over kexec.

^ permalink raw reply

* Re: [PATCH V33 24/30] bpf: Restrict bpf when kernel lockdown is in confidentiality mode
From: Andy Lutomirski @ 2019-06-27 23:23 UTC (permalink / raw)
  To: Matthew Garrett
  Cc: Stephen Smalley, James Morris, Andy Lutomirski, linux-security,
	LKML, Linux API, David Howells, Alexei Starovoitov,
	Network Development, Chun-Yi Lee, Daniel Borkmann, LSM List
In-Reply-To: <CACdnJuuG8cR7h9v3pNcBKsxyckAzpKuBJs1GQxsz77jk5DRoQA@mail.gmail.com>

On Thu, Jun 27, 2019 at 4:16 PM Matthew Garrett <mjg59@google.com> wrote:
>
> On Thu, Jun 27, 2019 at 1:16 PM Stephen Smalley <sds@tycho.nsa.gov> wrote:
> > That would only allow the LSM to further lock down the system above the
> > lockdown level set at boot, not grant exemptions for specific
> > functionality/interfaces required by the user or by a specific
> > process/program. You'd have to boot with lockdown=none (or your
> > lockdown=custom suggestion) in order for the LSM to allow anything
> > covered by the integrity or confidentiality levels.  And then the kernel
> > would be unprotected prior to full initialization of the LSM, including
> > policy load.
> >
> > It seems like one would want to be able to boot with lockdown=integrity
> > to protect the kernel initially, then switch over to allowing the LSM to
> > selectively override it.
>
> One option would be to allow modules to be "unstacked" at runtime, but
> there's still something of a problem here - how do you ensure that
> your userland can be trusted to load a new policy before it does so?
> If you're able to assert that your early userland is trustworthy
> (perhaps because it's in an initramfs that's part of your signed boot
> payload), there's maybe an argument that most of the lockdown
> integrity guarantees are unnecessary before handoff - just using the
> lockdown LSM to protect against attacks via kernel parameters would be
> sufficient.

I think that, if you don't trust your system enough to avoid
compromising itself before policy load, then your MAC policy is more
or less dead in the water.  It seems to be that it ought to be good
enough to boot with lockdown=none and then have a real policy loaded
along with the rest of the MAC policy.  Or, for applications that need
to be stricter, you accept that MAC policy can't override lockdown.

^ permalink raw reply

* Re: [PATCH V33 24/30] bpf: Restrict bpf when kernel lockdown is in confidentiality mode
From: Andy Lutomirski @ 2019-06-27 23:27 UTC (permalink / raw)
  To: Stephen Smalley
  Cc: James Morris, Andy Lutomirski, Matthew Garrett, linux-security,
	LKML, Linux API, David Howells, Alexei Starovoitov,
	Matthew Garrett, Network Development, Chun-Yi Lee,
	Daniel Borkmann, LSM List
In-Reply-To: <bce70c8b-9efd-6362-d536-cfbbcf70b0b7@tycho.nsa.gov>

On Thu, Jun 27, 2019 at 7:35 AM Stephen Smalley <sds@tycho.nsa.gov> wrote:
>
> On 6/26/19 8:57 PM, Andy Lutomirski wrote:
> >
> >
> >> On Jun 26, 2019, at 1:22 PM, James Morris <jmorris@namei.org> wrote:
> >>
> >> [Adding the LSM mailing list: missed this patchset initially]
> >>
> >>> On Thu, 20 Jun 2019, Andy Lutomirski wrote:
> >>>
> >>> This patch exemplifies why I don't like this approach:
> >>>
> >>>> @@ -97,6 +97,7 @@ enum lockdown_reason {
> >>>>         LOCKDOWN_INTEGRITY_MAX,
> >>>>         LOCKDOWN_KCORE,
> >>>>         LOCKDOWN_KPROBES,
> >>>> +       LOCKDOWN_BPF,
> >>>>         LOCKDOWN_CONFIDENTIALITY_MAX,
> >>>
> >>>> --- a/security/lockdown/lockdown.c
> >>>> +++ b/security/lockdown/lockdown.c
> >>>> @@ -33,6 +33,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = {
> >>>>         [LOCKDOWN_INTEGRITY_MAX] = "integrity",
> >>>>         [LOCKDOWN_KCORE] = "/proc/kcore access",
> >>>>         [LOCKDOWN_KPROBES] = "use of kprobes",
> >>>> +       [LOCKDOWN_BPF] = "use of bpf",
> >>>>         [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality",
> >>>
> >>> The text here says "use of bpf", but what this patch is *really* doing
> >>> is locking down use of BPF to read kernel memory.  If the details
> >>> change, then every LSM needs to get updated, and we risk breaking user
> >>> policies that are based on LSMs that offer excessively fine
> >>> granularity.
> >>
> >> Can you give an example of how the details might change?
> >>
> >>> I'd be more comfortable if the LSM only got to see "confidentiality"
> >>> or "integrity".
> >>
> >> These are not sufficient for creating a useful policy for the SELinux
> >> case.
> >>
> >>
> >
> > I may have misunderstood, but I thought that SELinux mainly needed to allow certain privileged programs to bypass the policy.  Is there a real example of what SELinux wants to do that can’t be done in the simplified model?
> >
> > The think that specifically makes me uneasy about exposing all of these precise actions to LSMs is that they will get exposed to userspace in a way that forces us to treat them as stable ABIs.
>
> There are two scenarios where finer-grained distinctions make sense:
>
> - Users may need to enable specific functionality that falls under the
> umbrella of "confidentiality" or "integrity" lockdown.  Finer-grained
> lockdown reasons free them from having to make an all-or-nothing choice
> between lost functionality or no lockdown at all. This can be supported
> directly by the lockdown module without any help from SELinux or other
> security modules; we just need the ability to specify these
> finer-grained lockdown levels via the boot parameters and securityfs nodes.
>
> - Different processes/programs may need to use different sets of
> functionality restricted via lockdown confidentiality or integrity
> categories.  If we have to allow all-or-none for the set of
> interfaces/functionality covered by the generic confidentiality or
> integrity categories, then we'll end up having to choose between lost
> functionality or overprivileged processes, neither of which is optimal.
>
> Is it truly the case that everything under the "confidentiality"
> category poses the same level of risk to kernel confidentiality, and
> similarly for everything under the "integrity" category?  If not, then
> being able to distinguish them definitely has benefit.
>

They're really quite similar in my mind.  Certainly some things in the
"integrity" category give absolutely trivial control over the kernel
(e.g. modules) while others make it quite challenging (ioperm), but
the end result is very similar.  And quite a few "confidentiality"
things genuinely do allow all kernel memory to be read.

I agree that finer-grained distinctions could be useful. My concern is
that it's a tradeoff, and the other end of the tradeoff is an ABI
stability issue.  If someone decides down the road that some feature
that is currently "integrity" can be split into a narrow "integrity"
feature and a "confidentiality" feature then, if the user policy knows
about the individual features, there's a risk of breaking people's
systems.  If we keep the fine-grained control, do we have a clear
compatibility story?

^ permalink raw reply

* Re: [PATCH v2 bpf-next 1/4] bpf: unprivileged BPF access via /dev/bpf
From: Andy Lutomirski @ 2019-06-27 23:42 UTC (permalink / raw)
  To: Andy Lutomirski, Kees Cook, Linux API
  Cc: Song Liu, Network Development, bpf, Alexei Starovoitov,
	Daniel Borkmann, kernel-team, lmb, Jann Horn, Greg KH
In-Reply-To: <21894f45-70d8-dfca-8c02-044f776c5e05@kernel.org>

[sigh, I finally set up lore nntp, and I goofed some addresses.  Hi
Kees and linux-api.]

On Thu, Jun 27, 2019 at 4:40 PM Andy Lutomirski <luto@kernel.org> wrote:
>
> On 6/27/19 1:19 PM, Song Liu wrote:
> > This patch introduce unprivileged BPF access. The access control is
> > achieved via device /dev/bpf. Users with write access to /dev/bpf are able
> > to call sys_bpf().
> >
> > Two ioctl command are added to /dev/bpf:
> >
> > The two commands enable/disable permission to call sys_bpf() for current
> > task. This permission is noted by bpf_permitted in task_struct. This
> > permission is inherited during clone(CLONE_THREAD).
> >
> > Helper function bpf_capable() is added to check whether the task has got
> > permission via /dev/bpf.
> >
>
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index 0e079b2298f8..79dc4d641cf3 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -9134,7 +9134,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
> >               env->insn_aux_data[i].orig_idx = i;
> >       env->prog = *prog;
> >       env->ops = bpf_verifier_ops[env->prog->type];
> > -     is_priv = capable(CAP_SYS_ADMIN);
> > +     is_priv = bpf_capable(CAP_SYS_ADMIN);
>
> Huh?  This isn't a hardening measure -- the "is_priv" verifier mode
> allows straight-up leaks of private kernel state to user mode.
>
> (For that matter, the pending lockdown stuff should possibly consider
> this a "confidentiality" issue.)
>
>
> I have a bigger issue with this patch, though: it's a really awkward way
> to pretend to have capabilities.  For bpf, it seems like you could make
> this be a *real* capability without too much pain since there's only one
> syscall there.  Just find a way to pass an fd to /dev/bpf into the
> syscall.  If this means you need a new bpf_with_cap() syscall that takes
> an extra argument, so be it.  The old bpf() syscall can just translate
> to bpf_with_cap(..., -1).
>
> For a while, I've considered a scheme I call "implicit rights".  There
> would be a directory in /dev called /dev/implicit_rights.  This would
> either be part of devtmpfs or a whole new filesystem -- it would *not*
> be any other filesystem.  The contents would be files that can't be read
> or written and exist only in memory.  You create them with a privileged
> syscall.  Certain actions that are sensitive but not at the level of
> CAP_SYS_ADMIN (use of large-attack-surface bpf stuff, creation of user
> namespaces, profiling the kernel, etc) could require an "implicit
> right".  When you do them, if you don't have CAP_SYS_ADMIN, the kernel
> would do a path walk for, say, /dev/implicit_rights/bpf and, if the
> object exists, can be opened, and actually refers to the "bpf" rights
> object, then the action is allowed.  Otherwise it's denied.
>
> This is extensible, and it doesn't require the rather ugly per-task
> state of whether it's enabled.
>
> For things like creation of user namespaces, there's an existing API,
> and the default is that it works without privilege.  Switching it to an
> implicit right has the benefit of not requiring code changes to programs
> that already work as non-root.
>
> But, for BPF in particular, this type of compatibility issue doesn't
> exist now.  You already can't use most eBPF functionality without
> privilege.  New bpf-using programs meant to run without privilege are
> *new*, so they can use a new improved API.  So, rather than adding this
> obnoxious ioctl, just make the API explicit, please.
>
> Also, please cc: linux-abi next time.

^ permalink raw reply

* Re: [PATCH v3 1/5] mm: introduce MADV_COLD
From: Minchan Kim @ 2019-06-27 23:46 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Andrew Morton, linux-mm, LKML, linux-api, Michal Hocko,
	Johannes Weiner, Tim Murray, Joel Fernandes, Suren Baghdasaryan,
	Daniel Colascione, Shakeel Butt, Sonny Rao, oleksandr, hdanton,
	lizeb, Kirill A . Shutemov
In-Reply-To: <343599f9-3d99-b74f-1732-368e584fa5ef@intel.com>

On Thu, Jun 27, 2019 at 06:13:36AM -0700, Dave Hansen wrote:
> On 6/27/19 4:54 AM, Minchan Kim wrote:
> > This patch introduces the new MADV_COLD hint to madvise(2) syscall.
> > MADV_COLD can be used by a process to mark a memory range as not expected
> > to be used in the near future. The hint can help kernel in deciding which
> > pages to evict early during memory pressure.
> > 
> > It works for every LRU pages like MADV_[DONTNEED|FREE]. IOW, It moves
> > 
> > 	active file page -> inactive file LRU
> > 	active anon page -> inacdtive anon LRU
> 
> Is the LRU behavior part of the interface or the implementation?

It's a just implementation. What user should expect with this API is they just
informs to the kernel "this memory in the regions wouldn't access in the near
future" so how kernel will handle memory in there is up to the kernel.

> 
> I ask because we've got something in between tossing something down the
> LRU and swapping it: page migration.  Specifically, on a system with
> slower memory media (like persistent memory) we just migrate a page
> instead of discarding it at reclaim:
> 
> > https://lore.kernel.org/linux-mm/20190321200157.29678-4-keith.busch@intel.com/
> 
> So let's say I have a page I want to evict from DRAM to the next slower
> tier of memory.  Do I use MADV_COLD or MADV_PAGEOUT?  If the LRU
> behavior is part of the interface itself, then MADV_COLD doesn't work.

IMHO, if it's one of storage in the memory hierarchy, that shouldn't be transparent
for the user? What I meant is VM moves inactive pages to the persistent memory
before the reclaiming. IOW, VM would have one more level LRU or extened inactive
LRU to cover the persistent memory.

> 
> Do you think we'll need a third MADV_ flag for our automatic migration
> behavior?  MADV_REALLYCOLD?  MADV_MIGRATEOUT?

I believe it depends on how we abstract the persistent memory of cache hierarchy.
If we abstract it as diffrent storage with DRAM, manybe, that should be part of
other syscall like like move_pages. 
If we abstract it as part of DRAM, that should be part of additional LRU
or extended inactive LRU.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox