* [PATCH v3 3/5] mm: account nr_isolated_xxx in [isolate|putback]_lru_page
From: Minchan Kim @ 2019-06-27 11:54 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-mm, LKML, linux-api, Michal Hocko, Johannes Weiner,
Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
Kirill A . Shutemov, Minchan Kim
In-Reply-To: <20190627115405.255259-1-minchan@kernel.org>
The isolate counting is pecpu counter so it would be not huge gain
to work them by batch. Rather than complicating to make them batch,
let's make it more stright-foward via adding the counting logic
into [isolate|putback]_lru_page API.
* v1
* fix accounting bug - Hillf
Link: http://lkml.kernel.org/r/20190531165927.GA20067@cmpxchg.org
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
mm/compaction.c | 2 --
mm/gup.c | 7 +------
mm/khugepaged.c | 3 ---
mm/memory-failure.c | 3 ---
mm/memory_hotplug.c | 4 ----
mm/mempolicy.c | 6 +-----
mm/migrate.c | 37 ++++++++-----------------------------
mm/vmscan.c | 22 ++++++++++++++++------
8 files changed, 26 insertions(+), 58 deletions(-)
diff --git a/mm/compaction.c b/mm/compaction.c
index 9e1b9acb116b..c6591682deda 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -982,8 +982,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* Successfully isolated */
del_page_from_lru_list(page, lruvec, page_lru(page));
- inc_node_page_state(page,
- NR_ISOLATED_ANON + page_is_file_cache(page));
isolate_success:
list_add(&page->lru, &cc->migratepages);
diff --git a/mm/gup.c b/mm/gup.c
index 7dde2e3a1963..aec3a2b7e61b 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1473,13 +1473,8 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk,
drain_allow = false;
}
- if (!isolate_lru_page(head)) {
+ if (!isolate_lru_page(head))
list_add_tail(&head->lru, &cma_page_list);
- mod_node_page_state(page_pgdat(head),
- NR_ISOLATED_ANON +
- page_is_file_cache(head),
- hpage_nr_pages(head));
- }
}
}
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 0f7419938008..7da34e198ec5 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -503,7 +503,6 @@ void __khugepaged_exit(struct mm_struct *mm)
static void release_pte_page(struct page *page)
{
- dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
unlock_page(page);
putback_lru_page(page);
}
@@ -602,8 +601,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
result = SCAN_DEL_PAGE_LRU;
goto out;
}
- inc_node_page_state(page,
- NR_ISOLATED_ANON + page_is_file_cache(page));
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageLRU(page), page);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 7e08cbf3ba49..3586e8226e4e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1795,9 +1795,6 @@ static int __soft_offline_page(struct page *page, int flags)
* so use !__PageMovable instead for LRU page's mapping
* cannot have PAGE_MAPPING_MOVABLE.
*/
- if (!__PageMovable(page))
- inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
list_add(&page->lru, &pagelist);
ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index dfab21dc33dc..68577c677b46 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1384,10 +1384,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
if (!ret) { /* Success */
list_add_tail(&page->lru, &source);
- if (!__PageMovable(page))
- inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
-
} else {
pr_warn("failed to isolate pfn %lx\n", pfn);
dump_page(page, "isolation failed");
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 64562809bf3b..03081f3404ca 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -994,12 +994,8 @@ static int migrate_page_add(struct page *page, struct list_head *pagelist,
* Avoid migrating a page that is shared with others.
*/
if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
- if (!isolate_lru_page(head)) {
+ if (!isolate_lru_page(head))
list_add_tail(&head->lru, pagelist);
- mod_node_page_state(page_pgdat(head),
- NR_ISOLATED_ANON + page_is_file_cache(head),
- hpage_nr_pages(head));
- }
}
return 0;
diff --git a/mm/migrate.c b/mm/migrate.c
index 572b4bc85d76..5583324c01e7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -190,8 +190,6 @@ void putback_movable_pages(struct list_head *l)
unlock_page(page);
put_page(page);
} else {
- mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
- page_is_file_cache(page), -hpage_nr_pages(page));
putback_lru_page(page);
}
}
@@ -1181,10 +1179,17 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
return -ENOMEM;
if (page_count(page) == 1) {
+ bool is_lru = !__PageMovable(page);
+
/* page was freed from under us. So we are done. */
ClearPageActive(page);
ClearPageUnevictable(page);
- if (unlikely(__PageMovable(page))) {
+ if (likely(is_lru))
+ mod_node_page_state(page_pgdat(page),
+ NR_ISOLATED_ANON +
+ page_is_file_cache(page),
+ -hpage_nr_pages(page));
+ else {
lock_page(page);
if (!PageMovable(page))
__ClearPageIsolated(page);
@@ -1210,15 +1215,6 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
* restored.
*/
list_del(&page->lru);
-
- /*
- * Compaction can migrate also non-LRU pages which are
- * not accounted to NR_ISOLATED_*. They can be recognized
- * as __PageMovable
- */
- if (likely(!__PageMovable(page)))
- mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
- page_is_file_cache(page), -hpage_nr_pages(page));
}
/*
@@ -1572,9 +1568,6 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
err = 0;
list_add_tail(&head->lru, pagelist);
- mod_node_page_state(page_pgdat(head),
- NR_ISOLATED_ANON + page_is_file_cache(head),
- hpage_nr_pages(head));
}
out_putpage:
/*
@@ -1890,8 +1883,6 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
{
- int page_lru;
-
VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
/* Avoid migrating to a node that is nearly full */
@@ -1913,10 +1904,6 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
return 0;
}
- page_lru = page_is_file_cache(page);
- mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
- hpage_nr_pages(page));
-
/*
* Isolating the page has taken another reference, so the
* caller's reference can be safely dropped without the page
@@ -1971,8 +1958,6 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
if (nr_remaining) {
if (!list_empty(&migratepages)) {
list_del(&page->lru);
- dec_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
putback_lru_page(page);
}
isolated = 0;
@@ -2002,7 +1987,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
pg_data_t *pgdat = NODE_DATA(node);
int isolated = 0;
struct page *new_page = NULL;
- int page_lru = page_is_file_cache(page);
unsigned long start = address & HPAGE_PMD_MASK;
new_page = alloc_pages_node(node,
@@ -2048,8 +2032,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
/* Retake the callers reference and putback on LRU */
get_page(page);
putback_lru_page(page);
- mod_node_page_state(page_pgdat(page),
- NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
goto out_unlock;
}
@@ -2099,9 +2081,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
- mod_node_page_state(page_pgdat(page),
- NR_ISOLATED_ANON + page_lru,
- -HPAGE_PMD_NR);
return isolated;
out_fail:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 49e9ee4d771d..223ce5da08f0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1014,6 +1014,9 @@ int remove_mapping(struct address_space *mapping, struct page *page)
void putback_lru_page(struct page *page)
{
lru_cache_add(page);
+ mod_node_page_state(page_pgdat(page),
+ NR_ISOLATED_ANON + page_is_file_cache(page),
+ -hpage_nr_pages(page));
put_page(page); /* drop ref from isolate */
}
@@ -1479,6 +1482,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
nr_reclaimed += nr_pages;
+ mod_node_page_state(pgdat, NR_ISOLATED_ANON +
+ page_is_file_cache(page),
+ -nr_pages);
/*
* Is there need to periodically free_page_list? It would
* appear not as the counts should be low
@@ -1554,7 +1560,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
TTU_IGNORE_ACCESS, &dummy_stat, true);
list_splice(&clean_pages, page_list);
- mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
return ret;
}
@@ -1630,6 +1635,9 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
*/
ClearPageLRU(page);
ret = 0;
+ __mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
+ page_is_file_cache(page),
+ hpage_nr_pages(page));
}
return ret;
@@ -1761,6 +1769,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
total_scan, skipped, nr_taken, mode, lru);
update_lru_sizes(lruvec, lru, nr_zone_taken);
+
return nr_taken;
}
@@ -1809,6 +1818,9 @@ int isolate_lru_page(struct page *page)
ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, lru);
ret = 0;
+ mod_node_page_state(pgdat, NR_ISOLATED_ANON +
+ page_is_file_cache(page),
+ hpage_nr_pages(page));
}
spin_unlock_irq(&pgdat->lru_lock);
}
@@ -1900,6 +1912,9 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
list_move(&page->lru, &lruvec->lists[lru]);
+ __mod_node_page_state(pgdat, NR_ISOLATED_ANON +
+ page_is_file_cache(page),
+ -hpage_nr_pages(page));
if (put_page_testzero(page)) {
__ClearPageLRU(page);
__ClearPageActive(page);
@@ -1977,7 +1992,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
&nr_scanned, sc, lru);
- __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
@@ -2003,8 +2017,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
move_pages_to_lru(lruvec, &page_list);
- __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
-
spin_unlock_irq(&pgdat->lru_lock);
mem_cgroup_uncharge_list(&page_list);
@@ -2063,7 +2075,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, lru);
- __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
__count_vm_events(PGREFILL, nr_scanned);
@@ -2132,7 +2143,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
__count_vm_events(PGDEACTIVATE, nr_deactivate);
__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
- __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&pgdat->lru_lock);
mem_cgroup_uncharge_list(&l_active);
--
2.22.0.410.gd8fdbe21b5-goog
^ permalink raw reply related
* [PATCH v3 2/5] mm: change PAGEREF_RECLAIM_CLEAN with PAGE_REFRECLAIM
From: Minchan Kim @ 2019-06-27 11:54 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-mm, LKML, linux-api, Michal Hocko, Johannes Weiner,
Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
Kirill A . Shutemov, Minchan Kim
In-Reply-To: <20190627115405.255259-1-minchan@kernel.org>
The local variable references in shrink_page_list is PAGEREF_RECLAIM_CLEAN
as default. It is for preventing to reclaim dirty pages when CMA try to
migrate pages. Strictly speaking, we don't need it because CMA didn't allow
to write out by .may_writepage = 0 in reclaim_clean_pages_from_list.
Moreover, it has a problem to prevent anonymous pages's swap out even
though force_reclaim = true in shrink_page_list on upcoming patch.
So this patch makes references's default value to PAGEREF_RECLAIM and
rename force_reclaim with ignore_references to make it more clear.
This is a preparatory work for next patch.
* RFCv1
* use ignore_referecnes as parameter name - hannes
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
mm/vmscan.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9e3292ee5c7c..49e9ee4d771d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1117,7 +1117,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
struct scan_control *sc,
enum ttu_flags ttu_flags,
struct reclaim_stat *stat,
- bool force_reclaim)
+ bool ignore_references)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
@@ -1131,7 +1131,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
struct address_space *mapping;
struct page *page;
int may_enter_fs;
- enum page_references references = PAGEREF_RECLAIM_CLEAN;
+ enum page_references references = PAGEREF_RECLAIM;
bool dirty, writeback;
unsigned int nr_pages;
@@ -1262,7 +1262,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
}
- if (!force_reclaim)
+ if (!ignore_references)
references = page_check_references(page, sc);
switch (references) {
--
2.22.0.410.gd8fdbe21b5-goog
^ permalink raw reply related
* [PATCH v3 1/5] mm: introduce MADV_COLD
From: Minchan Kim @ 2019-06-27 11:54 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-mm, LKML, linux-api, Michal Hocko, Johannes Weiner,
Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
Kirill A . Shutemov, Minchan Kim
In-Reply-To: <20190627115405.255259-1-minchan@kernel.org>
When a process expects no accesses to a certain memory range, it could
give a hint to kernel that the pages can be reclaimed when memory pressure
happens but data should be preserved for future use. This could reduce
workingset eviction so it ends up increasing performance.
This patch introduces the new MADV_COLD hint to madvise(2) syscall.
MADV_COLD can be used by a process to mark a memory range as not expected
to be used in the near future. The hint can help kernel in deciding which
pages to evict early during memory pressure.
It works for every LRU pages like MADV_[DONTNEED|FREE]. IOW, It moves
active file page -> inactive file LRU
active anon page -> inacdtive anon LRU
Unlike MADV_FREE, it doesn't move active anonymous pages to inactive
file LRU's head because MADV_COLD is a little bit different symantic.
MADV_FREE means it's okay to discard when the memory pressure because
the content of the page is *garbage* so freeing such pages is almost zero
overhead since we don't need to swap out and access afterward causes just
minor fault. Thus, it would make sense to put those freeable pages in
inactive file LRU to compete other used-once pages. It makes sense for
implmentaion point of view, too because it's not swapbacked memory any
longer until it would be re-dirtied. Even, it could give a bonus to make
them be reclaimed on swapless system. However, MADV_COLD doesn't mean
garbage so reclaiming them requires swap-out/in in the end so it's bigger
cost. Since we have designed VM LRU aging based on cost-model, anonymous
cold pages would be better to position inactive anon's LRU list, not file
LRU. Furthermore, it would help to avoid unnecessary scanning if system
doesn't have a swap device. Let's start simpler way without adding
complexity at this moment. However, keep in mind, too that it's a caveat
that workloads with a lot of pages cache are likely to ignore MADV_COLD
on anonymous memory because we rarely age anonymous LRU lists.
* man-page material
MADV_COLD (since Linux x.x)
Do not expect access in the near future so under memory pressure, pages
in the specified regions could be reclaimed more aggressively compared
to other pages in the system. The difference with MADV_DONTNEED is it
doesn't change the semantics of memory access in the specified regions.
Thus, it will keep the up-to-date contents of the region.
MADV_COLD cannot be applied to locked pages, Huge TLB pages, or VM_PFNMAP
pages.
* v2
* add up the warn with lots of page cache workload - mhocko
* add man page stuff - dave
* v1
* remove page_mapcount filter - hannes, mhocko
* remove idle page handling - joelaf
* RFCv2
* add more description - mhocko
* RFCv1
* renaming from MADV_COOL to MADV_COLD - hannes
* internal review
* use clear_page_youn in deactivate_page - joelaf
* Revise the description - surenb
* Renaming from MADV_WARM to MADV_COOL - surenb
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
include/linux/swap.h | 1 +
include/uapi/asm-generic/mman-common.h | 1 +
mm/internal.h | 2 +-
mm/madvise.c | 180 ++++++++++++++++++++++++-
mm/oom_kill.c | 2 +-
mm/swap.c | 42 ++++++
6 files changed, 224 insertions(+), 4 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index de2c67a33b7e..0ce997edb8bb 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -340,6 +340,7 @@ extern void lru_add_drain_cpu(int cpu);
extern void lru_add_drain_all(void);
extern void rotate_reclaimable_page(struct page *page);
extern void deactivate_file_page(struct page *page);
+extern void deactivate_page(struct page *page);
extern void mark_page_lazyfree(struct page *page);
extern void swap_setup(void);
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index ef4623f03156..d7b4231eea63 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -47,6 +47,7 @@
#define MADV_SEQUENTIAL 2 /* expect sequential page references */
#define MADV_WILLNEED 3 /* will need these pages */
#define MADV_DONTNEED 4 /* don't need these pages */
+#define MADV_COLD 5 /* deactivatie these pages */
/* common parameters: try to keep these consistent across architectures */
#define MADV_FREE 8 /* free pages only if memory pressure */
diff --git a/mm/internal.h b/mm/internal.h
index f53a14d67538..c61b215ff265 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -39,7 +39,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf);
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
-static inline bool can_madv_dontneed_vma(struct vm_area_struct *vma)
+static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
}
diff --git a/mm/madvise.c b/mm/madvise.c
index 628022e674a7..7abb8e54bc7a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -40,6 +40,7 @@ static int madvise_need_mmap_write(int behavior)
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
+ case MADV_COLD:
case MADV_FREE:
return 0;
default:
@@ -307,6 +308,178 @@ static long madvise_willneed(struct vm_area_struct *vma,
return 0;
}
+static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct mmu_gather *tlb = walk->private;
+ struct mm_struct *mm = tlb->mm;
+ struct vm_area_struct *vma = walk->vma;
+ pte_t *orig_pte, *pte, ptent;
+ spinlock_t *ptl;
+ struct page *page;
+ unsigned long next;
+
+ next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*pmd)) {
+ pmd_t orig_pmd;
+
+ tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (!ptl)
+ return 0;
+
+ orig_pmd = *pmd;
+ if (is_huge_zero_pmd(orig_pmd))
+ goto huge_unlock;
+
+ if (unlikely(!pmd_present(orig_pmd))) {
+ VM_BUG_ON(thp_migration_supported() &&
+ !is_pmd_migration_entry(orig_pmd));
+ goto huge_unlock;
+ }
+
+ page = pmd_page(orig_pmd);
+ if (next - addr != HPAGE_PMD_SIZE) {
+ int err;
+
+ if (page_mapcount(page) != 1)
+ goto huge_unlock;
+
+ get_page(page);
+ spin_unlock(ptl);
+ lock_page(page);
+ err = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (!err)
+ goto regular_page;
+ return 0;
+ }
+
+ if (pmd_young(orig_pmd)) {
+ pmdp_invalidate(vma, addr, pmd);
+ orig_pmd = pmd_mkold(orig_pmd);
+
+ set_pmd_at(mm, addr, pmd, orig_pmd);
+ tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ }
+
+ test_and_clear_page_young(page);
+ deactivate_page(page);
+huge_unlock:
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+
+regular_page:
+ tlb_change_page_size(tlb, PAGE_SIZE);
+ orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ flush_tlb_batched_pending(mm);
+ arch_enter_lazy_mmu_mode();
+ for (; addr < end; pte++, addr += PAGE_SIZE) {
+ ptent = *pte;
+
+ if (pte_none(ptent))
+ continue;
+
+ if (!pte_present(ptent))
+ continue;
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page)
+ continue;
+
+ /*
+ * Creating a THP page is expensive so split it only if we
+ * are sure it's worth. Split it if we are only owner.
+ */
+ if (PageTransCompound(page)) {
+ if (page_mapcount(page) != 1)
+ break;
+ get_page(page);
+ if (!trylock_page(page)) {
+ put_page(page);
+ break;
+ }
+ pte_unmap_unlock(orig_pte, ptl);
+ if (split_huge_page(page)) {
+ unlock_page(page);
+ put_page(page);
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
+ break;
+ }
+ unlock_page(page);
+ put_page(page);
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ pte--;
+ addr -= PAGE_SIZE;
+ continue;
+ }
+
+ VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+ if (pte_young(ptent)) {
+ ptent = ptep_get_and_clear_full(mm, addr, pte,
+ tlb->fullmm);
+ ptent = pte_mkold(ptent);
+ set_pte_at(mm, addr, pte, ptent);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ }
+
+ /*
+ * We are deactivating a page for accelerating reclaiming.
+ * VM couldn't reclaim the page unless we clear PG_young.
+ * As a side effect, it makes confuse idle-page tracking
+ * because they will miss recent referenced history.
+ */
+ test_and_clear_page_young(page);
+ deactivate_page(page);
+ }
+
+ arch_enter_lazy_mmu_mode();
+ pte_unmap_unlock(orig_pte, ptl);
+ cond_resched();
+
+ return 0;
+}
+
+static void madvise_cold_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ struct mm_walk cold_walk = {
+ .pmd_entry = madvise_cold_pte_range,
+ .mm = vma->vm_mm,
+ .private = tlb,
+ };
+
+ tlb_start_vma(tlb, vma);
+ walk_page_range(addr, end, &cold_walk);
+ tlb_end_vma(tlb, vma);
+}
+
+static long madvise_cold(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start_addr, unsigned long end_addr)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_gather tlb;
+
+ *prev = vma;
+ if (!can_madv_lru_vma(vma))
+ return -EINVAL;
+
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+ madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
+ tlb_finish_mmu(&tlb, start_addr, end_addr);
+
+ return 0;
+}
+
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
@@ -519,7 +692,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
int behavior)
{
*prev = vma;
- if (!can_madv_dontneed_vma(vma))
+ if (!can_madv_lru_vma(vma))
return -EINVAL;
if (!userfaultfd_remove(vma, start, end)) {
@@ -541,7 +714,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
*/
return -ENOMEM;
}
- if (!can_madv_dontneed_vma(vma))
+ if (!can_madv_lru_vma(vma))
return -EINVAL;
if (end > vma->vm_end) {
/*
@@ -695,6 +868,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
return madvise_remove(vma, prev, start, end);
case MADV_WILLNEED:
return madvise_willneed(vma, prev, start, end);
+ case MADV_COLD:
+ return madvise_cold(vma, prev, start, end);
case MADV_FREE:
case MADV_DONTNEED:
return madvise_dontneed_free(vma, prev, start, end, behavior);
@@ -716,6 +891,7 @@ madvise_behavior_valid(int behavior)
case MADV_WILLNEED:
case MADV_DONTNEED:
case MADV_FREE:
+ case MADV_COLD:
#ifdef CONFIG_KSM
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 6de5c354d6ca..2140a6f8db63 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -523,7 +523,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
set_bit(MMF_UNSTABLE, &mm->flags);
for (vma = mm->mmap ; vma; vma = vma->vm_next) {
- if (!can_madv_dontneed_vma(vma))
+ if (!can_madv_lru_vma(vma))
continue;
/*
diff --git a/mm/swap.c b/mm/swap.c
index 607c48229a1d..a91859d061f3 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -47,6 +47,7 @@ int page_cluster;
static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
@@ -538,6 +539,22 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
update_page_reclaim_stat(lruvec, file, 0);
}
+static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+ void *arg)
+{
+ if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ int file = page_is_file_cache(page);
+ int lru = page_lru_base_type(page);
+
+ del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+ ClearPageActive(page);
+ ClearPageReferenced(page);
+ add_page_to_lru_list(page, lruvec, lru);
+
+ __count_vm_events(PGDEACTIVATE, hpage_nr_pages(page));
+ update_page_reclaim_stat(lruvec, file, 0);
+ }
+}
static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
void *arg)
@@ -590,6 +607,10 @@ void lru_add_drain_cpu(int cpu)
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
+ pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+ if (pagevec_count(pvec))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+
pvec = &per_cpu(lru_lazyfree_pvecs, cpu);
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
@@ -623,6 +644,26 @@ void deactivate_file_page(struct page *page)
}
}
+/*
+ * deactivate_page - deactivate a page
+ * @page: page to deactivate
+ *
+ * deactivate_page() moves @page to the inactive list if @page was on the active
+ * list and was not an unevictable page. This is done to accelerate the reclaim
+ * of @page.
+ */
+void deactivate_page(struct page *page)
+{
+ if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+
+ get_page(page);
+ if (!pagevec_add(pvec, page) || PageCompound(page))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+ put_cpu_var(lru_deactivate_pvecs);
+ }
+}
+
/**
* mark_page_lazyfree - make an anon page lazyfree
* @page: page to deactivate
@@ -687,6 +728,7 @@ void lru_add_drain_all(void)
if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
+ pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
need_activate_page_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
--
2.22.0.410.gd8fdbe21b5-goog
^ permalink raw reply related
* [PATCH v3 0/5] Introduce MADV_COLD and MADV_PAGEOUT
From: Minchan Kim @ 2019-06-27 11:54 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-mm, LKML, linux-api, Michal Hocko, Johannes Weiner,
Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
Kirill A . Shutemov, Minchan Kim
This patch is part of previous series:
https://lore.kernel.org/lkml/20190531064313.193437-1-minchan@kernel.org/T/#u
Originally, it was created for external madvise hinting feature.
https://lkml.org/lkml/2019/5/31/463
Michal wanted to separte the discussion from external hinting interface
so this patchset includes only first part of my entire patchset
- introduce MADV_COLD and MADV_PAGEOUT hint to madvise.
However, I keep entire description for others for easier understanding
why this kinds of hint was born.
Thanks.
This patchset is against on next-20190530.
Below is description of previous entire patchset.
================= &< =====================
- Background
The Android terminology used for forking a new process and starting an app
from scratch is a cold start, while resuming an existing app is a hot start.
While we continually try to improve the performance of cold starts, hot
starts will always be significantly less power hungry as well as faster so
we are trying to make hot start more likely than cold start.
To increase hot start, Android userspace manages the order that apps should
be killed in a process called ActivityManagerService. ActivityManagerService
tracks every Android app or service that the user could be interacting with
at any time and translates that into a ranked list for lmkd(low memory
killer daemon). They are likely to be killed by lmkd if the system has to
reclaim memory. In that sense they are similar to entries in any other cache.
Those apps are kept alive for opportunistic performance improvements but
those performance improvements will vary based on the memory requirements of
individual workloads.
- Problem
Naturally, cached apps were dominant consumers of memory on the system.
However, they were not significant consumers of swap even though they are
good candidate for swap. Under investigation, swapping out only begins
once the low zone watermark is hit and kswapd wakes up, but the overall
allocation rate in the system might trip lmkd thresholds and cause a cached
process to be killed(we measured performance swapping out vs. zapping the
memory by killing a process. Unsurprisingly, zapping is 10x times faster
even though we use zram which is much faster than real storage) so kill
from lmkd will often satisfy the high zone watermark, resulting in very
few pages actually being moved to swap.
- Approach
The approach we chose was to use a new interface to allow userspace to
proactively reclaim entire processes by leveraging platform information.
This allowed us to bypass the inaccuracy of the kernel’s LRUs for pages
that are known to be cold from userspace and to avoid races with lmkd
by reclaiming apps as soon as they entered the cached state. Additionally,
it could provide many chances for platform to use much information to
optimize memory efficiency.
To achieve the goal, the patchset introduce two new options for madvise.
One is MADV_COLD which will deactivate activated pages and the other is
MADV_PAGEOUT which will reclaim private pages instantly. These new options
complement MADV_DONTNEED and MADV_FREE by adding non-destructive ways to
gain some free memory space. MADV_PAGEOUT is similar to MADV_DONTNEED in a way
that it hints the kernel that memory region is not currently needed and
should be reclaimed immediately; MADV_COLD is similar to MADV_FREE in a way
that it hints the kernel that memory region is not currently needed and
should be reclaimed when memory pressure rises.
Minchan Kim (5):
mm: introduce MADV_COLD
mm: change PAGEREF_RECLAIM_CLEAN with PAGE_REFRECLAIM
mm: account nr_isolated_xxx in [isolate|putback]_lru_page
mm: introduce MADV_PAGEOUT
mm: factor out pmd young/dirty bit handling and THP split
include/linux/huge_mm.h | 3 -
include/linux/swap.h | 2 +
include/uapi/asm-generic/mman-common.h | 2 +
mm/compaction.c | 2 -
mm/gup.c | 7 +-
mm/huge_memory.c | 74 -----
mm/internal.h | 2 +-
mm/khugepaged.c | 3 -
mm/madvise.c | 438 ++++++++++++++++++++++++-
mm/memory-failure.c | 3 -
mm/memory_hotplug.c | 4 -
mm/mempolicy.c | 6 +-
mm/migrate.c | 37 +--
mm/oom_kill.c | 2 +-
mm/swap.c | 42 +++
mm/vmscan.c | 86 ++++-
16 files changed, 566 insertions(+), 147 deletions(-)
--
2.22.0.410.gd8fdbe21b5-goog
^ permalink raw reply
* Re: [PATCH] binfmt_elf: Extract .note.gnu.property from an ELF file
From: Florian Weimer @ 2019-06-27 9:38 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Dave Martin, Yu-cheng Yu, X86 ML, H. Peter Anvin, Thomas Gleixner,
Ingo Molnar, LKML, open list:DOCUMENTATION, Linux-MM, linux-arch,
Linux API, Arnd Bergmann, Balbir Singh, Cyrill Gorcunov,
Dave Hansen, Eugene Syromiatnikov, H.J. Lu, Jann Horn,
Jonathan Corbet, Kees Cook, Mike Kravetz <mike.kravet>
In-Reply-To: <CALCETrVZCzh+KFCF6ijuf4QEPn=R2gJ8FHLpyFd=n+pNOMMMjA@mail.gmail.com>
* Andy Lutomirski:
> Also, I don't think there's any actual requirement that the upstream
> kernel recognize existing CET-enabled RHEL 8 binaries as being
> CET-enabled. I tend to think that RHEL 8 jumped the gun here.
The ABI was supposed to be finalized and everyone involved thought it
had been reviewed by the GNU gABI community and other interested
parties. It had been included in binutils for several releases.
>From my point of view, the kernel is just a consumer of the ABI. The
kernel would not change an instruction encoding if it doesn't like it
for some reason, either.
> While the upstream kernel should make some reasonble effort to make
> sure that RHEL 8 binaries will continue to run, I don't see why we
> need to go out of our way to keep the full set of mitigations
> available for binaries that were developed against a non-upstream
> kernel.
They were developed against the ABI specification.
I do not have a strong opinion what the kernel should do going forward.
I just want to make clear what happened.
Thanks,
Florian
^ permalink raw reply
* Re: [PATCH] binfmt_elf: Extract .note.gnu.property from an ELF file
From: Dave Martin @ 2019-06-27 9:27 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Yu-cheng Yu, X86 ML, H. Peter Anvin, Thomas Gleixner, Ingo Molnar,
LKML, open list:DOCUMENTATION, Linux-MM, linux-arch, Linux API,
Arnd Bergmann, Balbir Singh, Cyrill Gorcunov, Dave Hansen,
Eugene Syromiatnikov, Florian Weimer, H.J. Lu, Jann Horn,
Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit,
Oleg Nesterov, Pavel Machek
In-Reply-To: <CALCETrVZCzh+KFCF6ijuf4QEPn=R2gJ8FHLpyFd=n+pNOMMMjA@mail.gmail.com>
On Wed, Jun 26, 2019 at 10:14:07AM -0700, Andy Lutomirski wrote:
> On Thu, May 2, 2019 at 4:10 AM Dave Martin <Dave.Martin@arm.com> wrote:
[...]
> > A couple of questions before I look in more detail:
> >
> > 1) Can we rely on PT_GNU_PROPERTY being present in the phdrs to describe
> > the NT_GNU_PROPERTY_TYPE_0 note? If so, we can avoid trying to parse
> > irrelevant PT_NOTE segments.
> >
> >
> > 2) Are there standard types for things like the program property header?
> > If not, can we add something in elf.h? We should try to coordinate with
> > libc on that. Something like
> >
>
> Where did PT_GNU_PROPERTY come from? Are there actual docs for it?
> Can someone here tell us what the actual semantics of this new ELF
> thingy are? From some searching, it seems like it's kind of an ELF
> note but kind of not. An actual description would be fantastic.
https://github.com/hjl-tools/linux-abi/wiki/linux-abi-draft.pdf
I don't know _when_ it was added, and the description is minimal, but
it's there.
(I'd say it's fairly obvious how it should be used, but it could do with
some clarification...)
> Also, I don't think there's any actual requirement that the upstream
> kernel recognize existing CET-enabled RHEL 8 binaries as being
> CET-enabled. I tend to think that RHEL 8 jumped the gun here. While
> the upstream kernel should make some reasonble effort to make sure
> that RHEL 8 binaries will continue to run, I don't see why we need to
> go out of our way to keep the full set of mitigations available for
> binaries that were developed against a non-upstream kernel.
If that's an accpetable approach, it should certainly make our life
easier.
> In fact, if we handle the legacy bitmap differently from RHEL 8, we
> may *have* to make sure that we don't recognize existing RHEL 8
> binaries as CET-enabled.
Can't comment on that. If the existing RHEL 8 binaries strictly don't
have the PT_GNU_PROPERTY phdr, then this might serve a dual purpose ...
otherwise, x86 might need some additional annotation for new binaries.
I'll leave it for others to comment.
Cheers
---Dave
^ permalink raw reply
* Re: [PATCH v5 17/18] kernel/sysctl-test: Add null pointer test for sysctl.c:proc_dointvec()
From: Luis Chamberlain @ 2019-06-27 6:10 UTC (permalink / raw)
To: Iurii Zaikin, linux-api-u79uwXL29TY76Z2rM5mHXA,
Michael Kerrisk (man-pages)
Cc: pmladek-IBi9RG/b67k, linux-doc-u79uwXL29TY76Z2rM5mHXA,
peterz-wEGCiKHe2LqWVfeAwA7xHQ, amir73il-Re5JQEeQqe8AvxtiuMwx3w,
Brendan Higgins, dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
Alexander.Levin-0li6OtcxBFHby3iVrkZq2A,
yamada.masahiro-uWyLwvC0a2jby3iVrkZq2A,
mpe-Gsx/Oe8HsFggBc27wqDAHg,
linux-kselftest-u79uwXL29TY76Z2rM5mHXA,
shuah-DgEjT+Ai2ygdnm+yROfE0A, robh-DgEjT+Ai2ygdnm+yROfE0A,
linux-nvdimm-hn68Rpc1hR1g9hUCZPvPmw,
frowand.list-Re5JQEeQqe8AvxtiuMwx3w,
knut.omang-QHcLZuEGTsvQT0dZR+AlfA,
kieran.bingham-ryLnwIuWjnjg/C1BVhZhaw, wfg-VuQAYsv1563Yd54FQh9/CA,
joel-U3u1mxZcP9KHXe+LvDLADg, David Rientjes,
jdike-OPE4K8JWMJJBDgjK7y7TUQ,
dan.carpenter-QHcLZuEGTsvQT0dZR+AlfA,
devicetree-u79uwXL29TY76Z2rM5mHXA,
linux-kbuild-u79uwXL29TY76Z2rM5mHXA, Tim.Bird-7U/KSKJipcs,
linux-um-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
rostedt-nx8X9YLhiw1AfugRpC6u6w, julia.lawall-L2FTfq7BK8M,
jpoimboe-H+wXaHxf7aLQT0dZR+AlfA, kunit-dev-/JYPxA39Uh5TLH3MbocFFw,
tytso-3s7WtUTddSA, richard-/L3Ra7n9ekc, Stephen Boyd,
gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r,
rdunlap-wEGCiKHe2LqWVfeAwA7xHQ,
linux-kernel-u79uwXL29TaqPxH82wqD4g
In-Reply-To: <CAAXuY3p+kVhjQ4LYtzormqVcH2vKu1abc_K9Z0XY=JX=bp8NcQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
On Wed, Jun 26, 2019 at 09:07:43PM -0700, Iurii Zaikin wrote:
> On Tue, Jun 25, 2019 at 7:17 PM Luis Chamberlain <mcgrof-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org> wrote:
> > > +static void sysctl_test_dointvec_table_maxlen_unset(struct kunit *test)
> > > +{
> > > + struct ctl_table table = {
> > > + .procname = "foo",
> > > + .data = &test_data.int_0001,
> > > + .maxlen = 0,
> > > + .mode = 0644,
> > > + .proc_handler = proc_dointvec,
> > > + .extra1 = &i_zero,
> > > + .extra2 = &i_one_hundred,
> > > + };
> > > + void *buffer = kunit_kzalloc(test, sizeof(int), GFP_USER);
> > > + size_t len;
> > > + loff_t pos;
> > > +
> > > + len = 1234;
> > > + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, 0, buffer, &len, &pos));
> > > + KUNIT_EXPECT_EQ(test, (size_t)0, len);
> > > + len = 1234;
> > > + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, 1, buffer, &len, &pos));
> > > + KUNIT_EXPECT_EQ(test, (size_t)0, len);
> > > +}
> >
> > In a way this is also testing for general kernel API changes. This is and the
> > last one were good examples. So this is not just testing functionality
> > here. There is no wrong or write answer if 0 or -EINVAL was returned
> > other than the fact that we have been doing this for years.
> >
> > Its a perhaps small but important difference for some of these tests. I
> > *do* think its worth clarifying through documentation which ones are
> > testing for API consistency Vs proper correctness.
>
> You make a good point that the test codifies the existing behavior of
> the function in lieu of formal documentation. However, the test cases
> were derived from examining the source code of the function under test
> and attempting to cover all branches. The assertions were added only
> for the values that appeared to be set deliberately in the
> implementation. And it makes sense to me to test that the code does
> exactly what the implementation author intended.
I'm not arguing against adding them. I'm suggesting that it is different
to test for API than for correctness of intended functionality, and
it would be wise to make it clear which test cases are for API and which
for correctness.
This will come up later for other kunit tests and it would be great
to set precendent so that other kunit tests can follow similar
practices to ensure its clear what is API realted Vs correctness of
intended functionality.
In fact, I'm not yet sure if its possible to test public kernel API to
userspace with kunit, but if it is possible... well, that could make
linux-api folks happy as they could enable us to codify interpreation of
what is expected into kunit test cases, and we'd ensure that the
codified interpretation is not only documented in man pages but also
through formal kunit test cases.
A regression in linux-api then could be formalized through a proper
kunit tests case. And if an API evolves, it would force developers to
update the respective kunit which codifies that contract.
> > > +static void sysctl_test_dointvec_single_less_int_min(struct kunit *test)
> > > +{
> > > + struct ctl_table table = {
> > > + .procname = "foo",
> > > + .data = &test_data.int_0001,
> > > + .maxlen = sizeof(int),
> > > + .mode = 0644,
> > > + .proc_handler = proc_dointvec,
> > > + .extra1 = &i_zero,
> > > + .extra2 = &i_one_hundred,
> > > + };
> > > + char input[32];
> > > + size_t len = sizeof(input) - 1;
> > > + loff_t pos = 0;
> > > + unsigned long abs_of_less_than_min = (unsigned long)INT_MAX
> > > + - (INT_MAX + INT_MIN) + 1;
> > > +
> > > + KUNIT_EXPECT_LT(test,
> > > + (size_t)snprintf(input, sizeof(input), "-%lu",
> > > + abs_of_less_than_min),
> > > + sizeof(input));
> > > +
> > > + table.data = kunit_kzalloc(test, sizeof(int), GFP_USER);
> > > + KUNIT_EXPECT_EQ(test, -EINVAL,
> > > + proc_dointvec(&table, 1, input, &len, &pos));
> > > + KUNIT_EXPECT_EQ(test, sizeof(input) - 1, len);
> > > + KUNIT_EXPECT_EQ(test, 0, ((int *)table.data)[0]);
> > > +}
> >
> > API test.
> >
> Not sure why.
Because you are codifying that we *definitely* return -EINVAL on
overlow. Some parts of the kernel return -ERANGE for overflows for
instance.
It would be a generic test for overflow if it would just test
for any error.
It is a fine and good test to keep. All these tests are good to keep.
> I believe there has been a real bug with int overflow in
> proc_dointvec.
> Covering it with test seems like a good idea.
Oh definitely.
> > > +static void sysctl_test_dointvec_single_greater_int_max(struct kunit *test)
> > > +{
> > > + struct ctl_table table = {
> > > + .procname = "foo",
> > > + .data = &test_data.int_0001,
> > > + .maxlen = sizeof(int),
> > > + .mode = 0644,
> > > + .proc_handler = proc_dointvec,
> > > + .extra1 = &i_zero,
> > > + .extra2 = &i_one_hundred,
> > > + };
> > > + char input[32];
> > > + size_t len = sizeof(input) - 1;
> > > + loff_t pos = 0;
> > > + unsigned long greater_than_max = (unsigned long)INT_MAX + 1;
> > > +
> > > + KUNIT_EXPECT_GT(test, greater_than_max, (unsigned long)INT_MAX);
> > > + KUNIT_EXPECT_LT(test, (size_t)snprintf(input, sizeof(input), "%lu",
> > > + greater_than_max),
> > > + sizeof(input));
> > > + table.data = kunit_kzalloc(test, sizeof(int), GFP_USER);
> > > + KUNIT_EXPECT_EQ(test, -EINVAL,
> > > + proc_dointvec(&table, 1, input, &len, &pos));
> > > + KUNIT_EXPECT_EQ(test, sizeof(input) - 1, len);
> > > + KUNIT_EXPECT_EQ(test, 0, ((int *)table.data)[0]);
> > > +}
> > > +
> >
> > API test.
> >
> > > +static struct kunit_case sysctl_test_cases[] = {
> > > + KUNIT_CASE(sysctl_test_dointvec_null_tbl_data),
> > > + KUNIT_CASE(sysctl_test_dointvec_table_maxlen_unset),
> > > + KUNIT_CASE(sysctl_test_dointvec_table_len_is_zero),
> > > + KUNIT_CASE(sysctl_test_dointvec_table_read_but_position_set),
> > > + KUNIT_CASE(sysctl_test_dointvec_happy_single_positive),
> > > + KUNIT_CASE(sysctl_test_dointvec_happy_single_negative),
> > > + KUNIT_CASE(sysctl_test_dointvec_single_less_int_min),
> > > + KUNIT_CASE(sysctl_test_dointvec_single_greater_int_max),
> > > + {}
> > > +};
> >
> > Oh all are API tests.. perhaps then just rename then
> > sysctl_test_cases to sysctl_api_test_cases.
> >
> > Would be good to add at least *two* other tests cases for this
> > example, one which does a valid read and one which does a valid write.
> Added valid reads. There already are 2 valid writes.
Thanks.
> > If that is done either we add another kunit test module for correctness
> > or just extend the above and use prefix / postfixes on the functions
> > to distinguish between API / correctness somehow.
> >
> > > +
> > > +static struct kunit_module sysctl_test_module = {
> > > + .name = "sysctl_test",
> > > + .test_cases = sysctl_test_cases,
> > > +};
> > > +
> > > +module_test(sysctl_test_module);
> > > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> > > index cbdfae3798965..389b8986f5b77 100644
> > > --- a/lib/Kconfig.debug
> > > +++ b/lib/Kconfig.debug
> > > @@ -1939,6 +1939,16 @@ config TEST_SYSCTL
> > >
> > > If unsure, say N.
> > >
> > > +config SYSCTL_KUNIT_TEST
> > > + bool "KUnit test for sysctl"
> > > + depends on KUNIT
> > > + help
> > > + This builds the proc sysctl unit test, which runs on boot. For more
> > > + information on KUnit and unit tests in general please refer to the
> > > + KUnit documentation in Documentation/dev-tools/kunit/.
> >
> > A little more description here would help. It is testing for API and
> > hopefully also correctness (if extended with those two examples I
> > mentioned).
> >
> Added "Tests the API contract and implementation correctness of sysctl."
Yes, much clearer, thanks!
Luis
^ permalink raw reply
* [PATCH v4 2/2] fpga: dfl: fme: add performance reporting support
From: Wu Hao @ 2019-06-27 5:09 UTC (permalink / raw)
To: mdf, linux-fpga, linux-kernel
Cc: linux-api, gregkh, atull, Wu Hao, Luwei Kang, Xu Yilun
In-Reply-To: <1561612195-6081-1-git-send-email-hao.wu@intel.com>
This patch adds support for performance reporting private feature
for FPGA Management Engine (FME). Now it supports several different
performance counters, including 'basic', 'cache', 'fabric', 'vtd'
and 'vtd_sip'. It allows user to use standard linux tools to access
these performance counters.
e.g. List all events by "perf list"
perf list | grep fme
fme0/cache_read_hit/ [Kernel PMU event]
fme0/cache_read_miss/ [Kernel PMU event]
...
fme0/fab_mmio_read/ [Kernel PMU event]
fme0/fab_mmio_write/ [Kernel PMU event]
...
fme0/fab_port_mmio_read,portid=?/ [Kernel PMU event]
fme0/fab_port_mmio_write,portid=?/ [Kernel PMU event]
...
fme0/vtd_port_devtlb_1g_fill,portid=?/ [Kernel PMU event]
fme0/vtd_port_devtlb_2m_fill,portid=?/ [Kernel PMU event]
...
fme0/vtd_sip_iotlb_1g_hit/ [Kernel PMU event]
fme0/vtd_sip_iotlb_1g_miss/ [Kernel PMU event]
...
fme0/clock [Kernel PMU event]
...
e.g. check increased counter value after run one application using
"perf stat" command.
perf stat -e fme0/fab_mmio_read/,fme0/fab_mmio_write/, ./test
Performance counter stats for './test':
1 fme0/fab_mmio_read/
2 fme0/fab_mmio_write/
1.009496520 seconds time elapsed
Please note that fabric counters support both fab_* and fab_port_*, but
actually they are sharing one set of performance counters in hardware.
If user wants to monitor overall data events on fab_* then fab_port_*
can't be supported at the same time, see example below:
perf stat -e fme0/fab_mmio_read/,fme0/fab_port_mmio_write,portid=0/
Performance counter stats for 'system wide':
0 fme0/fab_mmio_read/
<not supported> fme0/fab_port_mmio_write,portid=0/
2.141064085 seconds time elapsed
Signed-off-by: Luwei Kang <luwei.kang@intel.com>
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
---
v3: replace scnprintf with sprintf in sysfs interfaces.
update sysfs doc kernel version and date.
fix sysfs doc issue for fabric counter.
refine PERF_OBJ_ATTR_* macro, doesn't count on __ATTR anymore.
introduce PERF_OBJ_ATTR_F_* macro, as it needs to use different
filenames for some of the sysfs attributes.
remove kobject_del when destroy kobject, kobject_put is enough.
do sysfs_remove_groups first when destroying perf_obj.
WARN_ON_ONCE in case internal parms are wrong in read_*_count().
v4: rework this patch to use standard perf API as user interfaces.
---
drivers/fpga/Makefile | 1 +
drivers/fpga/dfl-fme-main.c | 4 +
drivers/fpga/dfl-fme-perf.c | 871 ++++++++++++++++++++++++++++++++++++++++++++
drivers/fpga/dfl-fme.h | 2 +
4 files changed, 878 insertions(+)
create mode 100644 drivers/fpga/dfl-fme-perf.c
diff --git a/drivers/fpga/Makefile b/drivers/fpga/Makefile
index 4865b74..d8e21df 100644
--- a/drivers/fpga/Makefile
+++ b/drivers/fpga/Makefile
@@ -40,6 +40,7 @@ obj-$(CONFIG_FPGA_DFL_FME_REGION) += dfl-fme-region.o
obj-$(CONFIG_FPGA_DFL_AFU) += dfl-afu.o
dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o dfl-fme-error.o
+dfl-fme-objs += dfl-fme-perf.o
dfl-afu-objs := dfl-afu-main.o dfl-afu-region.o dfl-afu-dma-region.o
dfl-afu-objs += dfl-afu-error.o
diff --git a/drivers/fpga/dfl-fme-main.c b/drivers/fpga/dfl-fme-main.c
index 9225b68..a11c112 100644
--- a/drivers/fpga/dfl-fme-main.c
+++ b/drivers/fpga/dfl-fme-main.c
@@ -639,6 +639,10 @@ static void fme_power_mgmt_uinit(struct platform_device *pdev,
.ops = &fme_power_mgmt_ops,
},
{
+ .id_table = fme_perf_id_table,
+ .ops = &fme_perf_ops,
+ },
+ {
.ops = NULL,
},
};
diff --git a/drivers/fpga/dfl-fme-perf.c b/drivers/fpga/dfl-fme-perf.c
new file mode 100644
index 0000000..0d7768a
--- /dev/null
+++ b/drivers/fpga/dfl-fme-perf.c
@@ -0,0 +1,871 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Driver for FPGA Management Engine (FME) Global Performance Reporting
+ *
+ * Copyright 2019 Intel Corporation, Inc.
+ *
+ * Authors:
+ * Kang Luwei <luwei.kang@intel.com>
+ * Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ * Wu Hao <hao.wu@intel.com>
+ * Xu Yilun <yilun.xu@intel.com>
+ * Joseph Grecco <joe.grecco@intel.com>
+ * Enno Luebbers <enno.luebbers@intel.com>
+ * Tim Whisonant <tim.whisonant@intel.com>
+ * Ananda Ravuri <ananda.ravuri@intel.com>
+ * Mitchel, Henry <henry.mitchel@intel.com>
+ */
+
+#include <linux/perf_event.h>
+#include "dfl.h"
+#include "dfl-fme.h"
+
+/*
+ * Performance Counter Registers for Cache.
+ *
+ * Cache Events are listed below as CACHE_EVNT_*.
+ */
+#define CACHE_CTRL 0x8
+#define CACHE_RESET_CNTR BIT_ULL(0)
+#define CACHE_FREEZE_CNTR BIT_ULL(8)
+#define CACHE_CTRL_EVNT GENMASK_ULL(19, 16)
+#define CACHE_EVNT_RD_HIT 0x0
+#define CACHE_EVNT_WR_HIT 0x1
+#define CACHE_EVNT_RD_MISS 0x2
+#define CACHE_EVNT_WR_MISS 0x3
+#define CACHE_EVNT_RSVD 0x4
+#define CACHE_EVNT_HOLD_REQ 0x5
+#define CACHE_EVNT_DATA_WR_PORT_CONTEN 0x6
+#define CACHE_EVNT_TAG_WR_PORT_CONTEN 0x7
+#define CACHE_EVNT_TX_REQ_STALL 0x8
+#define CACHE_EVNT_RX_REQ_STALL 0x9
+#define CACHE_EVNT_EVICTIONS 0xa
+#define CACHE_CHANNEL_SEL BIT_ULL(20)
+#define CACHE_CHANNEL_RD 0
+#define CACHE_CHANNEL_WR 1
+#define CACHE_CNTR0 0x10
+#define CACHE_CNTR1 0x18
+#define CACHE_CNTR_EVNT_CNTR GENMASK_ULL(47, 0)
+#define CACHE_CNTR_EVNT GENMASK_ULL(63, 60)
+
+/*
+ * Performance Counter Registers for Fabric.
+ *
+ * Fabric Events are listed below as FAB_EVNT_*
+ */
+#define FAB_CTRL 0x20
+#define FAB_RESET_CNTR BIT_ULL(0)
+#define FAB_FREEZE_CNTR BIT_ULL(8)
+#define FAB_CTRL_EVNT GENMASK_ULL(19, 16)
+#define FAB_EVNT_PCIE0_RD 0x0
+#define FAB_EVNT_PCIE0_WR 0x1
+#define FAB_EVNT_PCIE1_RD 0x2
+#define FAB_EVNT_PCIE1_WR 0x3
+#define FAB_EVNT_UPI_RD 0x4
+#define FAB_EVNT_UPI_WR 0x5
+#define FAB_EVNT_MMIO_RD 0x6
+#define FAB_EVNT_MMIO_WR 0x7
+#define FAB_PORT_ID GENMASK_ULL(21, 20)
+#define FAB_PORT_FILTER BIT_ULL(23)
+#define FAB_PORT_FILTER_DISABLE 0
+#define FAB_PORT_FILTER_ENABLE 1
+#define FAB_CNTR 0x28
+#define FAB_CNTR_EVNT_CNTR GENMASK_ULL(59, 0)
+#define FAB_CNTR_EVNT GENMASK_ULL(63, 60)
+
+/*
+ * Performance Counter Registers for Clock.
+ *
+ * Clock Counter can't be reset or frozen by SW.
+ */
+#define CLK_CNTR 0x30
+#define BASIC_EVNT_CLK 0x0
+
+/*
+ * Performance Counter Registers for IOMMU / VT-D.
+ *
+ * VT-D Events are listed below as VTD_EVNT_* and VTD_SIP_EVNT_*
+ */
+#define VTD_CTRL 0x38
+#define VTD_RESET_CNTR BIT_ULL(0)
+#define VTD_FREEZE_CNTR BIT_ULL(8)
+#define VTD_CTRL_EVNT GENMASK_ULL(19, 16)
+#define VTD_EVNT_AFU_MEM_RD_TRANS 0x0
+#define VTD_EVNT_AFU_MEM_WR_TRANS 0x1
+#define VTD_EVNT_AFU_DEVTLB_RD_HIT 0x2
+#define VTD_EVNT_AFU_DEVTLB_WR_HIT 0x3
+#define VTD_EVNT_DEVTLB_4K_FILL 0x4
+#define VTD_EVNT_DEVTLB_2M_FILL 0x5
+#define VTD_EVNT_DEVTLB_1G_FILL 0x6
+#define VTD_CNTR 0x40
+#define VTD_CNTR_EVNT_CNTR GENMASK_ULL(47, 0)
+#define VTD_CNTR_EVNT GENMASK_ULL(63, 60)
+
+#define VTD_SIP_CTRL 0x48
+#define VTD_SIP_RESET_CNTR BIT_ULL(0)
+#define VTD_SIP_FREEZE_CNTR BIT_ULL(8)
+#define VTD_SIP_CTRL_EVNT GENMASK_ULL(19, 16)
+#define VTD_SIP_EVNT_IOTLB_4K_HIT 0x0
+#define VTD_SIP_EVNT_IOTLB_2M_HIT 0x1
+#define VTD_SIP_EVNT_IOTLB_1G_HIT 0x2
+#define VTD_SIP_EVNT_SLPWC_L3_HIT 0x3
+#define VTD_SIP_EVNT_SLPWC_L4_HIT 0x4
+#define VTD_SIP_EVNT_RCC_HIT 0x5
+#define VTD_SIP_EVNT_IOTLB_4K_MISS 0x6
+#define VTD_SIP_EVNT_IOTLB_2M_MISS 0x7
+#define VTD_SIP_EVNT_IOTLB_1G_MISS 0x8
+#define VTD_SIP_EVNT_SLPWC_L3_MISS 0x9
+#define VTD_SIP_EVNT_SLPWC_L4_MISS 0xa
+#define VTD_SIP_EVNT_RCC_MISS 0xb
+#define VTD_SIP_CNTR 0X50
+#define VTD_SIP_CNTR_EVNT_CNTR GENMASK_ULL(47, 0)
+#define VTD_SIP_CNTR_EVNT GENMASK_ULL(63, 60)
+
+#define PERF_TIMEOUT 30
+
+#define PERF_MAX_PORT_NUM 1
+
+/**
+ * struct fme_perf_priv - priv data structure for fme perf driver
+ *
+ * @dev: parent device.
+ * @ioaddr: mapped base address of mmio region.
+ * @pmu: pmu data structure for fme perf counters.
+ * @id: id of this fme performance report private feature.
+ * @fab_users: current user number on fabric counters.
+ * @fab_port_id: used to indicate current working mode of fabric counters.
+ * @fab_lock: lock to protect fabric counters working mode.
+ * @events_group: events attribute group for fme perf pmu.
+ * @attr_groups: attribute groups for fme perf pmu.
+ */
+struct fme_perf_priv {
+ struct device *dev;
+ void __iomem *ioaddr;
+ struct pmu pmu;
+ u64 id;
+
+ u32 fab_users;
+ u32 fab_port_id;
+ spinlock_t fab_lock;
+
+ struct attribute_group events_group;
+ const struct attribute_group *attr_groups[4];
+};
+
+/**
+ * struct fme_perf_event_attr - fme perf event attribute
+ *
+ * @attr: device attribute of fme perf event.
+ * @event_id: id of fme perf event.
+ * @event_type: type of fme perf event.
+ * @is_port_event: indicate if this is a port based event.
+ * @data: private data for fme perf event.
+ */
+struct fme_perf_event_attr {
+ struct device_attribute attr;
+ u32 event_id;
+ u32 event_type;
+ bool is_port_event;
+ u64 data;
+};
+
+/**
+ * struct fme_perf_event_ops - callbacks for fme perf events
+ *
+ * @event_init: callback invoked during event init.
+ * @event_destroy: callback invoked during event destroy.
+ * @read_counter: callback to read hardware counters.
+ */
+struct fme_perf_event_ops {
+ int (*event_init)(struct fme_perf_priv *priv, u32 event,
+ u32 port_id, u64 data);
+ void (*event_destroy)(struct fme_perf_priv *priv, u32 event,
+ u32 port_id, u64 data);
+ u64 (*read_counter)(struct fme_perf_priv *priv, u32 event,
+ u32 port_id, u64 data);
+};
+
+/**
+ * struct fme_perf_event_group - fme perf groups
+ *
+ * @ev_attrs: fme perf event attributes.
+ * @num: events number in this group.
+ * @ops: same callbacks shared by all fme perf events in this group.
+ */
+struct fme_perf_event_group {
+ struct fme_perf_event_attr *ev_attrs;
+ unsigned int num;
+ struct fme_perf_event_ops *ops;
+};
+
+#define to_fme_perf_priv(_pmu) container_of(_pmu, struct fme_perf_priv, pmu)
+
+static cpumask_t fme_perf_cpumask = CPU_MASK_CPU0;
+
+static ssize_t cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return cpumap_print_to_pagebuf(true, buf, &fme_perf_cpumask);
+}
+static DEVICE_ATTR_RO(cpumask);
+
+static struct attribute *fme_perf_cpumask_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static struct attribute_group fme_perf_cpumask_group = {
+ .attrs = fme_perf_cpumask_attrs,
+};
+
+#define FME_EVENT_MASK GENMASK_ULL(11, 0)
+#define FME_EVTYPE_MASK GENMASK_ULL(15, 12)
+#define FME_EVTYPE_BASIC 0
+#define FME_EVTYPE_CACHE 1
+#define FME_EVTYPE_FABRIC 2
+#define FME_EVTYPE_VTD 3
+#define FME_EVTYPE_VTD_SIP 4
+#define FME_EVTYPE_MAX FME_EVTYPE_VTD_SIP
+#define FME_PORTID_MASK GENMASK_ULL(23, 16)
+#define FME_PORTID_ROOT (0xffU)
+
+PMU_FORMAT_ATTR(event, "config:0-11");
+PMU_FORMAT_ATTR(evtype, "config:12-15");
+PMU_FORMAT_ATTR(portid, "config:16-23");
+
+static struct attribute *fme_perf_format_attrs[] = {
+ &format_attr_event.attr,
+ &format_attr_evtype.attr,
+ &format_attr_portid.attr,
+ NULL,
+};
+
+static struct attribute_group fme_perf_format_group = {
+ .name = "format",
+ .attrs = fme_perf_format_attrs,
+};
+
+static ssize_t fme_perf_event_sysfs_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ struct fme_perf_event_attr *ev_attr =
+ container_of(attr, struct fme_perf_event_attr, attr);
+ char *buf = page;
+
+ buf += sprintf(buf, "event=0x%02x", ev_attr->event_id);
+ buf += sprintf(buf, ",evtype=0x%02x", ev_attr->event_type);
+
+ if (ev_attr->is_port_event)
+ buf += sprintf(buf, ",portid=?\n");
+ else
+ buf += sprintf(buf, ",portid=0x%02x\n", FME_PORTID_ROOT);
+
+ return (ssize_t)(buf - page);
+}
+
+#define FME_EVENT_ATTR(_name) \
+ __ATTR(_name, 0444, fme_perf_event_sysfs_show, NULL)
+
+#define FME_EVENT_BASIC(_name, _event) { \
+ .attr = FME_EVENT_ATTR(_name), \
+ .event_id = _event, \
+ .event_type = FME_EVTYPE_BASIC, \
+ .is_port_event = false, \
+}
+
+/* data is used to save hardware channel information for cache events */
+#define FME_EVENT_CACHE(_name, _event, _data) { \
+ .attr = FME_EVENT_ATTR(cache_##_name), \
+ .event_id = _event, \
+ .event_type = FME_EVTYPE_CACHE, \
+ .is_port_event = false, \
+ .data = _data, \
+}
+
+#define FME_EVENT_FABRIC(_name, _event) { \
+ .attr = FME_EVENT_ATTR(fab_##_name), \
+ .event_id = _event, \
+ .event_type = FME_EVTYPE_FABRIC, \
+ .is_port_event = false, \
+}
+
+#define FME_EVENT_FABRIC_PORT(_name, _event) { \
+ .attr = FME_EVENT_ATTR(fab_port_##_name), \
+ .event_id = _event, \
+ .event_type = FME_EVTYPE_FABRIC, \
+ .is_port_event = true, \
+}
+
+#define FME_EVENT_VTD_PORT(_name, _event) { \
+ .attr = FME_EVENT_ATTR(vtd_port_##_name), \
+ .event_id = _event, \
+ .event_type = FME_EVTYPE_VTD, \
+ .is_port_event = true, \
+}
+
+#define FME_EVENT_VTD_SIP(_name, _event) { \
+ .attr = FME_EVENT_ATTR(vtd_sip_##_name), \
+ .event_id = _event, \
+ .event_type = FME_EVTYPE_VTD_SIP, \
+ .is_port_event = false, \
+}
+
+static struct fme_perf_event_attr fme_perf_basic_events[] = {
+ FME_EVENT_BASIC(clock, BASIC_EVNT_CLK),
+};
+
+static struct fme_perf_event_attr fme_perf_cache_events[] = {
+ FME_EVENT_CACHE(read_hit, CACHE_EVNT_RD_HIT, CACHE_CHANNEL_RD),
+ FME_EVENT_CACHE(read_miss, CACHE_EVNT_RD_MISS, CACHE_CHANNEL_RD),
+ FME_EVENT_CACHE(write_hit, CACHE_EVNT_WR_HIT, CACHE_CHANNEL_WR),
+ FME_EVENT_CACHE(write_miss, CACHE_EVNT_WR_MISS, CACHE_CHANNEL_WR),
+ FME_EVENT_CACHE(hold_request, CACHE_EVNT_HOLD_REQ, CACHE_CHANNEL_RD),
+ FME_EVENT_CACHE(data_write_port_contention,
+ CACHE_EVNT_DATA_WR_PORT_CONTEN, CACHE_CHANNEL_WR),
+ FME_EVENT_CACHE(tag_write_port_contention,
+ CACHE_EVNT_TAG_WR_PORT_CONTEN, CACHE_CHANNEL_WR),
+ FME_EVENT_CACHE(tx_req_stall, CACHE_EVNT_TX_REQ_STALL,
+ CACHE_CHANNEL_RD),
+ FME_EVENT_CACHE(rx_req_stall, CACHE_EVNT_RX_REQ_STALL,
+ CACHE_CHANNEL_RD),
+ FME_EVENT_CACHE(eviction, CACHE_EVNT_EVICTIONS, CACHE_CHANNEL_RD),
+};
+
+static struct fme_perf_event_attr fme_perf_fab_events[] = {
+ FME_EVENT_FABRIC(pcie0_read, FAB_EVNT_PCIE0_RD),
+ FME_EVENT_FABRIC(pcie0_write, FAB_EVNT_PCIE0_WR),
+ FME_EVENT_FABRIC(pcie1_read, FAB_EVNT_PCIE1_RD),
+ FME_EVENT_FABRIC(pcie1_write, FAB_EVNT_PCIE1_WR),
+ FME_EVENT_FABRIC(upi_read, FAB_EVNT_UPI_RD),
+ FME_EVENT_FABRIC(upi_write, FAB_EVNT_UPI_WR),
+ FME_EVENT_FABRIC(mmio_read, FAB_EVNT_MMIO_RD),
+ FME_EVENT_FABRIC(mmio_write, FAB_EVNT_MMIO_WR),
+
+ FME_EVENT_FABRIC_PORT(pcie0_read, FAB_EVNT_PCIE0_RD),
+ FME_EVENT_FABRIC_PORT(pcie0_write, FAB_EVNT_PCIE0_WR),
+ FME_EVENT_FABRIC_PORT(pcie1_read, FAB_EVNT_PCIE1_RD),
+ FME_EVENT_FABRIC_PORT(pcie1_write, FAB_EVNT_PCIE1_WR),
+ FME_EVENT_FABRIC_PORT(upi_read, FAB_EVNT_UPI_RD),
+ FME_EVENT_FABRIC_PORT(upi_write, FAB_EVNT_UPI_WR),
+ FME_EVENT_FABRIC_PORT(mmio_read, FAB_EVNT_MMIO_RD),
+ FME_EVENT_FABRIC_PORT(mmio_write, FAB_EVNT_MMIO_WR),
+};
+
+static struct fme_perf_event_attr fme_perf_vtd_events[] = {
+ FME_EVENT_VTD_PORT(read_transaction, VTD_EVNT_AFU_MEM_RD_TRANS),
+ FME_EVENT_VTD_PORT(write_transaction, VTD_EVNT_AFU_MEM_WR_TRANS),
+ FME_EVENT_VTD_PORT(devtlb_read_hit, VTD_EVNT_AFU_DEVTLB_RD_HIT),
+ FME_EVENT_VTD_PORT(devtlb_write_hit, VTD_EVNT_AFU_DEVTLB_WR_HIT),
+ FME_EVENT_VTD_PORT(devtlb_4k_fill, VTD_EVNT_DEVTLB_4K_FILL),
+ FME_EVENT_VTD_PORT(devtlb_2m_fill, VTD_EVNT_DEVTLB_2M_FILL),
+ FME_EVENT_VTD_PORT(devtlb_1g_fill, VTD_EVNT_DEVTLB_1G_FILL),
+};
+
+static struct fme_perf_event_attr fme_perf_vtd_sip_events[] = {
+ FME_EVENT_VTD_SIP(iotlb_4k_hit, VTD_SIP_EVNT_IOTLB_4K_HIT),
+ FME_EVENT_VTD_SIP(iotlb_2m_hit, VTD_SIP_EVNT_IOTLB_2M_HIT),
+ FME_EVENT_VTD_SIP(iotlb_1g_hit, VTD_SIP_EVNT_IOTLB_1G_HIT),
+ FME_EVENT_VTD_SIP(slpwc_l3_hit, VTD_SIP_EVNT_SLPWC_L3_HIT),
+ FME_EVENT_VTD_SIP(slpwc_l4_hit, VTD_SIP_EVNT_SLPWC_L4_HIT),
+ FME_EVENT_VTD_SIP(rcc_hit, VTD_SIP_EVNT_RCC_HIT),
+ FME_EVENT_VTD_SIP(iotlb_4k_miss, VTD_SIP_EVNT_IOTLB_4K_MISS),
+ FME_EVENT_VTD_SIP(iotlb_2m_miss, VTD_SIP_EVNT_IOTLB_2M_MISS),
+ FME_EVENT_VTD_SIP(iotlb_1g_miss, VTD_SIP_EVNT_IOTLB_1G_MISS),
+ FME_EVENT_VTD_SIP(slpwc_l3_miss, VTD_SIP_EVNT_SLPWC_L3_MISS),
+ FME_EVENT_VTD_SIP(slpwc_l4_miss, VTD_SIP_EVNT_SLPWC_L4_MISS),
+ FME_EVENT_VTD_SIP(rcc_miss, VTD_SIP_EVNT_RCC_MISS),
+};
+
+static u64 basic_read_event_counter(struct fme_perf_priv *priv,
+ u32 event, u32 port_id, u64 data)
+{
+ void __iomem *base = priv->ioaddr;
+
+ if (event == BASIC_EVNT_CLK)
+ return readq(base + CLK_CNTR);
+
+ return 0;
+}
+
+static struct fme_perf_event_ops fme_perf_basic_ops = {
+ .read_counter = basic_read_event_counter,
+};
+
+static u64 cache_read_event_counter(struct fme_perf_priv *priv,
+ u32 event, u32 port_id, u64 data)
+{
+ void __iomem *base = priv->ioaddr;
+ u8 channel = (u8)data;
+ u64 v, count;
+
+ /* set channel access type and cache event code. */
+ v = readq(base + CACHE_CTRL);
+ v &= ~(CACHE_CHANNEL_SEL | CACHE_CTRL_EVNT);
+ v |= FIELD_PREP(CACHE_CHANNEL_SEL, channel);
+ v |= FIELD_PREP(CACHE_CTRL_EVNT, event);
+ writeq(v, base + CACHE_CTRL);
+
+ if (readq_poll_timeout_atomic(base + CACHE_CNTR0, v,
+ FIELD_GET(CACHE_CNTR_EVNT, v) == event,
+ 1, PERF_TIMEOUT)) {
+ dev_err(priv->dev, "timeout, unmatched cache event code in counter register.\n");
+ return 0;
+ }
+
+ v = readq(base + CACHE_CNTR0);
+ count = FIELD_GET(CACHE_CNTR_EVNT_CNTR, v);
+ v = readq(base + CACHE_CNTR1);
+ count += FIELD_GET(CACHE_CNTR_EVNT_CNTR, v);
+
+ return count;
+}
+
+static struct fme_perf_event_ops fme_perf_cache_ops = {
+ .read_counter = cache_read_event_counter,
+};
+
+static int fabric_event_init(struct fme_perf_priv *priv,
+ u32 event, u32 port_id, u64 data)
+{
+ void __iomem *base = priv->ioaddr;
+ int ret = 0;
+ u64 v;
+
+ /*
+ * as fabric counter set only can be in either overall or port mode.
+ * In overall mode, it counts overall data for FPGA, and in port mode,
+ * it is configured to monitor on one individual port.
+ *
+ * so every time, a new event is initialized, driver checks
+ * current working mode and if someone is using this counter set.
+ */
+ spin_lock(&priv->fab_lock);
+ if (priv->fab_users && priv->fab_port_id != port_id) {
+ dev_dbg(priv->dev, "conflict fabric event monitoring mode.\n");
+ ret = -EOPNOTSUPP;
+ goto exit;
+ }
+
+ priv->fab_users++;
+
+ /*
+ * skip if current working mode matches, otherwise change the working
+ * mode per input port_id, to monitor overall data or another port.
+ */
+ if (priv->fab_port_id == port_id)
+ goto exit;
+
+ priv->fab_port_id = port_id;
+
+ v = readq(base + FAB_CTRL);
+ v &= ~(FAB_PORT_FILTER | FAB_PORT_ID);
+
+ if (port_id == FME_PORTID_ROOT) {
+ v |= FIELD_PREP(FAB_PORT_FILTER, FAB_PORT_FILTER_DISABLE);
+ } else {
+ v |= FIELD_PREP(FAB_PORT_FILTER, FAB_PORT_FILTER_ENABLE);
+ v |= FIELD_PREP(FAB_PORT_ID, port_id);
+ }
+ writeq(v, base + FAB_CTRL);
+
+exit:
+ spin_unlock(&priv->fab_lock);
+ return ret;
+}
+
+static void fabric_event_destroy(struct fme_perf_priv *priv,
+ u32 event, u32 port_id, u64 data)
+{
+ spin_lock(&priv->fab_lock);
+ priv->fab_users--;
+ spin_unlock(&priv->fab_lock);
+}
+
+static u64 fabric_read_event_counter(struct fme_perf_priv *priv,
+ u32 event, u32 port_id, u64 data)
+{
+ void __iomem *base = priv->ioaddr;
+ u64 v;
+
+ v = readq(base + FAB_CTRL);
+ v &= ~FAB_CTRL_EVNT;
+ v |= FIELD_PREP(FAB_CTRL_EVNT, event);
+ writeq(v, base + FAB_CTRL);
+
+ if (readq_poll_timeout_atomic(base + FAB_CNTR, v,
+ FIELD_GET(FAB_CNTR_EVNT, v) == event,
+ 1, PERF_TIMEOUT)) {
+ dev_err(priv->dev, "timeout, unmatched fab event code in counter register.\n");
+ return 0;
+ }
+
+ v = readq(base + FAB_CNTR);
+ return FIELD_GET(FAB_CNTR_EVNT_CNTR, v);
+}
+
+static struct fme_perf_event_ops fme_perf_fab_ops = {
+ .event_init = fabric_event_init,
+ .event_destroy = fabric_event_destroy,
+ .read_counter = fabric_read_event_counter,
+};
+
+static u64 vtd_read_event_counter(struct fme_perf_priv *priv,
+ u32 event, u32 port_id, u64 data)
+{
+ void __iomem *base = priv->ioaddr;
+ u64 v;
+
+ event += port_id;
+
+ v = readq(base + VTD_CTRL);
+ v &= ~VTD_CTRL_EVNT;
+ v |= FIELD_PREP(VTD_CTRL_EVNT, event);
+ writeq(v, base + VTD_CTRL);
+
+ if (readq_poll_timeout_atomic(base + VTD_CNTR, v,
+ FIELD_GET(VTD_CNTR_EVNT, v) == event,
+ 1, PERF_TIMEOUT)) {
+ dev_err(priv->dev, "timeout, unmatched vtd event code in counter register.\n");
+ return 0;
+ }
+
+ v = readq(base + VTD_CNTR);
+ return FIELD_GET(VTD_CNTR_EVNT_CNTR, v);
+}
+
+static struct fme_perf_event_ops fme_perf_vtd_ops = {
+ .read_counter = vtd_read_event_counter,
+};
+
+static u64 vtd_sip_read_event_counter(struct fme_perf_priv *priv,
+ u32 event, u32 port_id, u64 data)
+{
+ void __iomem *base = priv->ioaddr;
+ u64 v;
+
+ v = readq(base + VTD_SIP_CTRL);
+ v &= ~VTD_SIP_CTRL_EVNT;
+ v |= FIELD_PREP(VTD_SIP_CTRL_EVNT, event);
+ writeq(v, base + VTD_SIP_CTRL);
+
+ if (readq_poll_timeout_atomic(base + VTD_SIP_CNTR, v,
+ FIELD_GET(VTD_SIP_CNTR_EVNT, v) == event,
+ 1, PERF_TIMEOUT)) {
+ dev_err(priv->dev, "timeout, unmatched vtd sip event code in counter register\n");
+ return 0;
+ }
+
+ v = readq(base + VTD_SIP_CNTR);
+ return FIELD_GET(VTD_SIP_CNTR_EVNT_CNTR, v);
+}
+
+static struct fme_perf_event_ops fme_perf_vtd_sip_ops = {
+ .read_counter = vtd_sip_read_event_counter,
+};
+
+#define FME_EVENT_GROUP(_name) { \
+ .ev_attrs = fme_perf_##_name##_events, \
+ .num = ARRAY_SIZE(fme_perf_##_name##_events), \
+ .ops = &fme_perf_##_name##_ops, \
+}
+
+/* event group array is indexed by FME_EVTYPE_* */
+static struct fme_perf_event_group fme_perf_event_groups[] = {
+ FME_EVENT_GROUP(basic),
+ FME_EVENT_GROUP(cache),
+ FME_EVENT_GROUP(fab),
+ FME_EVENT_GROUP(vtd),
+ FME_EVENT_GROUP(vtd_sip),
+};
+
+static struct fme_perf_event_attr *
+get_event_attr(u32 event_id, u32 event_type, u32 port_id)
+{
+ bool is_port_event = (port_id != FME_PORTID_ROOT);
+ struct fme_perf_event_group *group;
+ unsigned int i;
+
+ if (event_type > FME_EVTYPE_MAX)
+ return NULL;
+
+ group = &fme_perf_event_groups[event_type];
+
+ for (i = 0; i < group->num; i++) {
+ if (event_id == group->ev_attrs[i].event_id &&
+ is_port_event == group->ev_attrs[i].is_port_event)
+ return &group->ev_attrs[i];
+ }
+
+ return NULL;
+}
+
+static struct fme_perf_event_ops *get_event_ops(u32 event_type)
+{
+ return fme_perf_event_groups[event_type].ops;
+}
+
+static void fme_perf_event_destroy(struct perf_event *event)
+{
+ struct fme_perf_event_ops *ops = get_event_ops(event->hw.event_base);
+ struct fme_perf_priv *priv = to_fme_perf_priv(event->pmu);
+
+ if (ops->event_destroy)
+ ops->event_destroy(priv, event->hw.idx,
+ event->hw.config_base, event->hw.config);
+}
+
+static int fme_perf_event_init(struct perf_event *event)
+{
+ struct fme_perf_priv *priv = to_fme_perf_priv(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
+ struct fme_perf_event_attr *ev_attr;
+ u32 event_id, event_type, port_id;
+ struct fme_perf_event_ops *ops;
+
+ /* test the event attr type check for PMU enumeration */
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ /*
+ * fme counters are shared across all cores.
+ * Therefore, it does not support per-process mode.
+ * Also, it does not support event sampling mode.
+ */
+ if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
+ return -EINVAL;
+
+ if (event->cpu < 0)
+ return -EINVAL;
+
+ event_id = FIELD_GET(FME_EVENT_MASK, event->attr.config);
+ event_type = FIELD_GET(FME_EVTYPE_MASK, event->attr.config);
+ port_id = FIELD_GET(FME_PORTID_MASK, event->attr.config);
+
+ ev_attr = get_event_attr(event_id, event_type, port_id);
+ if (!ev_attr)
+ return -EINVAL;
+
+ if ((ev_attr->is_port_event && port_id >= PERF_MAX_PORT_NUM) ||
+ (!ev_attr->is_port_event && port_id != FME_PORTID_ROOT))
+ return -EINVAL;
+
+ hwc->event_base = event_type;
+ hwc->idx = (int)event_id;
+ hwc->config_base = port_id;
+ hwc->config = ev_attr->data;
+
+ event->destroy = fme_perf_event_destroy;
+
+ dev_dbg(priv->dev,
+ "%s eventid=0x%x, evtype=0x%x, portid=0x%x, data=0x%llx\n",
+ __func__, event_id, event_type, port_id, ev_attr->data);
+
+ ops = get_event_ops(event->hw.event_base);
+
+ if (ops->event_init)
+ return ops->event_init(priv, hwc->idx,
+ hwc->config_base, hwc->config);
+
+ return 0;
+}
+
+static void fme_perf_event_update(struct perf_event *event)
+{
+ struct fme_perf_event_ops *ops = get_event_ops(event->hw.event_base);
+ struct fme_perf_priv *priv = to_fme_perf_priv(event->pmu);
+ u64 now, prev, delta;
+
+ now = ops->read_counter(priv, (u32)event->hw.idx,
+ event->hw.config_base, event->hw.config);
+ prev = local64_read(&event->hw.prev_count);
+ delta = now - prev;
+
+ local64_add(delta, &event->count);
+}
+
+static void fme_perf_event_start(struct perf_event *event, int flags)
+{
+ struct fme_perf_event_ops *ops = get_event_ops(event->hw.event_base);
+ struct fme_perf_priv *priv = to_fme_perf_priv(event->pmu);
+ u64 count;
+
+ count = ops->read_counter(priv, (u32)event->hw.idx,
+ event->hw.config_base, event->hw.config);
+ local64_set(&event->hw.prev_count, count);
+}
+
+static void fme_perf_event_stop(struct perf_event *event, int flags)
+{
+ fme_perf_event_update(event);
+}
+
+static int fme_perf_event_add(struct perf_event *event, int flags)
+{
+ if (flags & PERF_EF_START)
+ fme_perf_event_start(event, flags);
+
+ return 0;
+}
+
+static void fme_perf_event_del(struct perf_event *event, int flags)
+{
+ fme_perf_event_stop(event, PERF_EF_UPDATE);
+}
+
+static void fme_perf_event_read(struct perf_event *event)
+{
+ fme_perf_event_update(event);
+}
+
+static int fme_perf_setup_attrs(struct fme_perf_priv *priv)
+{
+ struct fme_perf_event_group *group;
+ unsigned int i, idx = 0, num = 0;
+ struct attribute **attrs;
+
+ /*
+ * if feature id is FME_FEATURE_ID_GLOBAL_IPERF, hardware supports
+ * all performance counters, otherwise only basic and fabric counters.
+ */
+ num += fme_perf_event_groups[FME_EVTYPE_BASIC].num;
+ num += fme_perf_event_groups[FME_EVTYPE_FABRIC].num;
+
+ if (priv->id == FME_FEATURE_ID_GLOBAL_IPERF) {
+ num += fme_perf_event_groups[FME_EVTYPE_CACHE].num;
+ num += fme_perf_event_groups[FME_EVTYPE_VTD].num;
+ num += fme_perf_event_groups[FME_EVTYPE_VTD_SIP].num;
+ }
+
+ attrs = devm_kcalloc(priv->dev, num + 1, sizeof(*attrs), GFP_KERNEL);
+ if (!attrs)
+ return -ENOMEM;
+
+ group = &fme_perf_event_groups[FME_EVTYPE_BASIC];
+ for (i = 0; i < group->num; i++)
+ attrs[idx++] = &group->ev_attrs[i].attr.attr;
+
+ group = &fme_perf_event_groups[FME_EVTYPE_FABRIC];
+ for (i = 0; i < group->num; i++)
+ attrs[idx++] = &group->ev_attrs[i].attr.attr;
+
+ if (priv->id == FME_FEATURE_ID_GLOBAL_IPERF) {
+ group = &fme_perf_event_groups[FME_EVTYPE_CACHE];
+ for (i = 0; i < group->num; i++)
+ attrs[idx++] = &group->ev_attrs[i].attr.attr;
+
+ group = &fme_perf_event_groups[FME_EVTYPE_VTD];
+ for (i = 0; i < group->num; i++)
+ attrs[idx++] = &group->ev_attrs[i].attr.attr;
+
+ group = &fme_perf_event_groups[FME_EVTYPE_VTD_SIP];
+ for (i = 0; i < group->num; i++)
+ attrs[idx++] = &group->ev_attrs[i].attr.attr;
+ }
+
+ priv->events_group.name = "events";
+ priv->events_group.attrs = attrs;
+
+ priv->attr_groups[0] = &fme_perf_format_group;
+ priv->attr_groups[1] = &fme_perf_cpumask_group;
+ priv->attr_groups[2] = &priv->events_group;
+
+ return 0;
+}
+
+static void fme_perf_setup_hardware(struct fme_perf_priv *priv)
+{
+ void __iomem *base = priv->ioaddr;
+ u64 v;
+
+ /* read and save current working mode for fabric counters */
+ v = readq(base + FAB_CTRL);
+
+ if (FIELD_GET(FAB_PORT_FILTER, v) == FAB_PORT_FILTER_DISABLE)
+ priv->fab_port_id = FME_PORTID_ROOT;
+ else
+ priv->fab_port_id = FIELD_GET(FAB_PORT_ID, v);
+}
+
+static int fme_perf_pmu_register(struct platform_device *pdev,
+ struct fme_perf_priv *priv)
+{
+ struct pmu *pmu = &priv->pmu;
+ char *name;
+ int ret;
+
+ spin_lock_init(&priv->fab_lock);
+
+ ret = fme_perf_setup_attrs(priv);
+ if (ret)
+ return ret;
+
+ fme_perf_setup_hardware(priv);
+
+ pmu->task_ctx_nr = perf_invalid_context;
+ pmu->attr_groups = priv->attr_groups;
+ pmu->event_init = fme_perf_event_init;
+ pmu->add = fme_perf_event_add;
+ pmu->del = fme_perf_event_del;
+ pmu->start = fme_perf_event_start;
+ pmu->stop = fme_perf_event_stop;
+ pmu->read = fme_perf_event_read;
+ pmu->capabilities = PERF_PMU_CAP_NO_INTERRUPT |
+ PERF_PMU_CAP_NO_EXCLUDE;
+
+ name = devm_kasprintf(priv->dev, GFP_KERNEL, "fme%d", pdev->id);
+
+ ret = perf_pmu_register(pmu, name, -1);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static void fme_perf_pmu_unregister(struct fme_perf_priv *priv)
+{
+ perf_pmu_unregister(&priv->pmu);
+}
+
+static int fme_perf_init(struct platform_device *pdev,
+ struct dfl_feature *feature)
+{
+ struct fme_perf_priv *priv;
+ int ret;
+
+ dev_dbg(&pdev->dev, "FME Perf Init\n");
+
+ priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ priv->dev = &pdev->dev;
+ priv->ioaddr = feature->ioaddr;
+ priv->id = feature->id;
+
+ ret = fme_perf_pmu_register(pdev, priv);
+ if (ret)
+ return ret;
+
+ feature->priv = priv;
+ return 0;
+}
+
+static void fme_perf_uinit(struct platform_device *pdev,
+ struct dfl_feature *feature)
+{
+ struct fme_perf_priv *priv = feature->priv;
+
+ fme_perf_pmu_unregister(priv);
+}
+
+const struct dfl_feature_id fme_perf_id_table[] = {
+ {.id = FME_FEATURE_ID_GLOBAL_IPERF,},
+ {.id = FME_FEATURE_ID_GLOBAL_DPERF,},
+ {0,}
+};
+
+const struct dfl_feature_ops fme_perf_ops = {
+ .init = fme_perf_init,
+ .uinit = fme_perf_uinit,
+};
diff --git a/drivers/fpga/dfl-fme.h b/drivers/fpga/dfl-fme.h
index 5fbe3f5..dc71048 100644
--- a/drivers/fpga/dfl-fme.h
+++ b/drivers/fpga/dfl-fme.h
@@ -39,5 +39,7 @@ struct dfl_fme {
extern const struct dfl_feature_id fme_pr_mgmt_id_table[];
extern const struct dfl_feature_ops fme_global_err_ops;
extern const struct dfl_feature_id fme_global_err_id_table[];
+extern const struct dfl_feature_ops fme_perf_ops;
+extern const struct dfl_feature_id fme_perf_id_table[];
#endif /* __DFL_FME_H */
--
1.8.3.1
^ permalink raw reply related
* [PATCH v4 1/2] Documentation: fpga: dfl: add description for performance reporting support
From: Wu Hao @ 2019-06-27 5:09 UTC (permalink / raw)
To: mdf, linux-fpga, linux-kernel; +Cc: linux-api, gregkh, atull, Xu Yilun, Wu Hao
In-Reply-To: <1561612195-6081-1-git-send-email-hao.wu@intel.com>
From: Xu Yilun <yilun.xu@intel.com>
This patch adds description for performance reporting support for
Device Feature List (DFL) based FPGA.
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
---
Documentation/fpga/dfl.txt | 83 ++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 83 insertions(+)
diff --git a/Documentation/fpga/dfl.txt b/Documentation/fpga/dfl.txt
index 652917f..6acd1b5 100644
--- a/Documentation/fpga/dfl.txt
+++ b/Documentation/fpga/dfl.txt
@@ -115,6 +115,11 @@ More functions are exposed through sysfs
management information (current temperature, thresholds, threshold status,
etc.).
+ Performance reporting
+ performance counters are exposed through perf PMU APIs. Standard perf tool
+ can be used to monitor all available perf events. Please see performance
+ counter section below for more detailed information.
+
FIU - PORT
==========
@@ -368,6 +373,84 @@ The device nodes used for ioctl() or mmap() can be referenced through:
/sys/class/fpga_region/<regionX>/<dfl-port.n>/dev
+Performance Counters
+====================
+Performance reporting is one private feature implemented in FME. It could
+supports several independent, system-wide, device counter sets in hardware to
+monitor and count for performance events, including "basic", "cache", "fabric",
+"vtd" and "vtd_sip" counters. Users could use standard perf tool to monitor
+FPGA cache hit/miss rate, transaction number, interface clock counter of AFU
+and other FPGA performance events.
+
+Different FPGA devices may have different counter sets, it depends on hardware
+implementation. e.g. some discrete FPGA cards don't have any cache. User could
+use "perf list" to check which perf events are supported by target hardware.
+
+In order to allow user to use standard perf API to access these performance
+counters, driver creates a perf PMU, and related sysfs interfaces in
+/sys/bus/event_source/devices/fme* to describe available perf events and
+configuration options.
+
+The "format" directory describes the format of the config field of struct
+perf_event_attr. There are 3 bitfields for config, "evtype" defines which type
+the perf event belongs to. "event" is the identity of the event within its
+category. "portid" is introduced to decide counters set to monitor on FPGA
+overall data or a specific port.
+
+The "events" directory describes the configuration templates for all available
+events which can be used with perf tool directly. For example, fab_mmio_read
+has the configuration "event=0x06,evtype=0x02,portid=0xff", which shows this
+event belongs to fabric type (0x02), the local event id is 0x06 and it is for
+overall monitoring (portid=0xff).
+
+Example usage of perf can be:
+
+$# perf list |grep fme
+
+ fme0/fab_mmio_read/ [Kernel PMU event]
+<...>
+ fme0/fab_port_mmio_read,portid=?/ [Kernel PMU event]
+<...>
+
+$# perf stat -a -e fme0/fab_mmio_read/ <command>
+or
+$# perf stat -a -e fme0/event=0x06,evtype=0x02,portid=0xff/ <command>
+or
+$# perf stat -a -e fme0/config=0xff2006/ <command>
+
+Another example, fab_port_mmio_read monitors mmio read of a specific port. So
+its configuration template is "event=0x06,evtype=0x01,portid=?". The portid
+should be explicitly set.
+
+Its usage of perf can be:
+
+$# perf stat -a -e fme0/fab_port_mmio_read,portid=0x0/ <command>
+or
+$# perf stat -a -e fme0/event=0x06,evtype=0x02,portid=0x0/ <command>
+or
+$# perf stat -a -e fme0/config=0x2006/ <command>
+
+Please note for fabric counters, overall perf events (fab_*) and port perf
+events (fab_port_*) actually share one set of counters in hardware, so it can't
+monitor both at the same time. If this set of counters is configured to monitor
+overall data, then per port perf data is not supported. See below example.
+
+$# perf stat -e fme0/fab_mmio_read/,fme0/fab_port_mmio_write,\
+ portid=0/ sleep 1
+
+ Performance counter stats for 'system wide':
+
+ 3 fme0/fab_mmio_read/
+ <not supported> fme0/fab_port_mmio_write,portid=0x0/
+
+ 1.001750904 seconds time elapsed
+
+The driver also provides a "cpumask" sysfs attribute, which always shows fixed
+value cpu0 as all perf events are from system-wide counters on FPGA device.
+
+The current driver does not support sampling. So "perf record" is unsupported.
+
+
Add new FIUs support
====================
It's possible that developers made some new function blocks (FIUs) under this
--
1.8.3.1
^ permalink raw reply related
* [PATCH v4 0/2] add performance reporting support to FPGA DFL drivers
From: Wu Hao @ 2019-06-27 5:09 UTC (permalink / raw)
To: mdf, linux-fpga, linux-kernel; +Cc: linux-api, gregkh, atull, Wu Hao
This patchset adds performance reporting support for FPGA DFL drivers. It
introduces one pmu to expose userspace interfaces via standard perf API.
User could use standard perf tool to access perf events exposed via pmu.
This patchset is splitted from patchset[1] for better review, and version 3
patch could be found here[2]. Please note that this patchset needs to be
applied on top of patchset[3][4].
Main changes from v3:
- add more descriptions in doc, including how to use perf tool for these
hardware counters. (patch #1)
- use standard perf API instead of sysfs entries. (patch #2)
[1]https://lkml.org/lkml/2019/5/27/11
[2]https://lkml.org/lkml/2019/5/27/18
[3]https://lkml.org/lkml/2019/6/27/29
[4]https://lkml.org/lkml/2019/6/27/49
Wu Hao (1):
fpga: dfl: fme: add performance reporting support
Xu Yilun (1):
Documentation: fpga: dfl: add description for performance reporting
support
Documentation/fpga/dfl.txt | 83 +++++
drivers/fpga/Makefile | 1 +
drivers/fpga/dfl-fme-main.c | 4 +
drivers/fpga/dfl-fme-perf.c | 871 ++++++++++++++++++++++++++++++++++++++++++++
drivers/fpga/dfl-fme.h | 2 +
5 files changed, 961 insertions(+)
create mode 100644 drivers/fpga/dfl-fme-perf.c
--
1.8.3.1
^ permalink raw reply
* Re: [PATCH V34 09/29] kexec_file: Restrict at runtime if the kernel is locked down
From: James Morris @ 2019-06-27 4:59 UTC (permalink / raw)
To: Matthew Garrett
Cc: linux-security-module, linux-kernel, linux-api, Jiri Bohac,
David Howells, Matthew Garrett, kexec
In-Reply-To: <20190622000358.19895-10-matthewgarrett@google.com>
On Fri, 21 Jun 2019, Matthew Garrett wrote:
> From: Jiri Bohac <jbohac@suse.cz>
>
> When KEXEC_SIG is not enabled, kernel should not load images through
> kexec_file systemcall if the kernel is locked down.
This is not a criticism of the patch but a related issue which I haven't
seen discussed (apologies if it has).
If signed code is loaded into ring 0, verified by the kernel, then
executed, you still lose your secure/trusted/verified boot state. If the
currently running kernel has been runtime-compromised, any signature
verification performed by the kernel cannot be trusted.
This problem is out of scope for the lockdown threat model (which
naturally cannot include a compromised kernel), but folk should be aware
that signature-verified kexec does not provide equivalent assurance to a
full reboot on a secure-boot system.
Potential mitigations here include runtime integrity verification of the
kernel via a separate security monitor (hypervisor, SMM, TEE etc.) or some
kind of platform support for kexec verification.
--
James Morris
<jmorris@namei.org>
^ permalink raw reply
* [PATCH v4 3/3] fpga: dfl: fme: add power management support
From: Wu Hao @ 2019-06-27 4:53 UTC (permalink / raw)
To: mdf, linux-fpga, linux-kernel
Cc: linux-api, linux-hwmon, jdelvare, linux, atull, gregkh, Wu Hao,
Luwei Kang, Xu Yilun
In-Reply-To: <1561611218-5800-1-git-send-email-hao.wu@intel.com>
This patch adds support for power management private feature under
FPGA Management Engine (FME). This private feature driver registers
a hwmon for power (power1_input), thresholds information, e.g.
(power1_max / crit / max_alarm / crit_alarm) and also read-only sysfs
interfaces for other power management information. For configuration,
user could write threshold values via above power1_max / crit sysfs
interface under hwmon too.
Signed-off-by: Luwei Kang <luwei.kang@intel.com>
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
---
v2: create a dfl_fme_power hwmon to expose power sysfs interfaces.
move all sysfs interfaces under hwmon
consumed --> hwmon power1_input
threshold1 --> hwmon power1_cap
threshold2 --> hwmon power1_crit
threshold1_status --> hwmon power1_cap_status
threshold2_status --> hwmon power1_crit_status
xeon_limit --> hwmon power1_xeon_limit
fpga_limit --> hwmon power1_fpga_limit
ltr --> hwmon power1_ltr
v3: rename some hwmon sysfs interfaces to follow hwmon ABI.
power1_cap --> power1_max
power1_cap_status --> power1_max_alarm
power1_crit_status --> power1_crit_alarm
update sysfs doc for above sysfs interface changes.
replace scnprintf with sprintf in sysfs interface.
v4: use HWMON_CHANNEL_INFO.
update date in sysfs doc.
---
Documentation/ABI/testing/sysfs-platform-dfl-fme | 67 +++++++
drivers/fpga/dfl-fme-main.c | 221 +++++++++++++++++++++++
2 files changed, 288 insertions(+)
diff --git a/Documentation/ABI/testing/sysfs-platform-dfl-fme b/Documentation/ABI/testing/sysfs-platform-dfl-fme
index 2cd17dc..a669548 100644
--- a/Documentation/ABI/testing/sysfs-platform-dfl-fme
+++ b/Documentation/ABI/testing/sysfs-platform-dfl-fme
@@ -127,6 +127,7 @@ Contact: Wu Hao <hao.wu@intel.com>
Description: Read-Only. Read this file to get the name of hwmon device, it
supports values:
'dfl_fme_thermal' - thermal hwmon device name
+ 'dfl_fme_power' - power hwmon device name
What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/temp1_input
Date: June 2019
@@ -183,3 +184,69 @@ Description: Read-Only. Read this file to get the policy of hardware threshold1
(see 'temp1_max'). It only supports two values (policies):
0 - AP2 state (90% throttling)
1 - AP1 state (50% throttling)
+
+What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/power1_input
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-Only. It returns current FPGA power consumption in uW.
+
+What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/power1_max
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-Write. Read this file to get current hardware power
+ threshold1 in uW. If power consumption rises at or above
+ this threshold, hardware starts 50% throttling.
+ Write this file to set current hardware power threshold1 in uW.
+ As hardware only accepts values in Watts, so input value will
+ be round down per Watts (< 1 watts part will be discarded).
+ Write fails with -EINVAL if input parsing fails or input isn't
+ in the valid range (0 - 127000000 uW).
+
+What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/power1_crit
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-Write. Read this file to get current hardware power
+ threshold2 in uW. If power consumption rises at or above
+ this threshold, hardware starts 90% throttling.
+ Write this file to set current hardware power threshold2 in uW.
+ As hardware only accepts values in Watts, so input value will
+ be round down per Watts (< 1 watts part will be discarded).
+ Write fails with -EINVAL if input parsing fails or input isn't
+ in the valid range (0 - 127000000 uW).
+
+What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/power1_max_alarm
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-only. It returns 1 if power consumption is currently at or
+ above hardware threshold1 (see 'power1_max'), otherwise 0.
+
+What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/power1_crit_alarm
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-only. It returns 1 if power consumption is currently at or
+ above hardware threshold2 (see 'power1_crit'), otherwise 0.
+
+What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/power1_xeon_limit
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-Only. It returns power limit for XEON in uW.
+
+What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/power1_fpga_limit
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-Only. It returns power limit for FPGA in uW.
+
+What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/power1_ltr
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-only. Read this file to get current Latency Tolerance
+ Reporting (ltr) value. This ltr impacts the CPU low power
+ state in integrated solution.
diff --git a/drivers/fpga/dfl-fme-main.c b/drivers/fpga/dfl-fme-main.c
index 59ff9f1..9225b68 100644
--- a/drivers/fpga/dfl-fme-main.c
+++ b/drivers/fpga/dfl-fme-main.c
@@ -400,6 +400,223 @@ static void fme_thermal_mgmt_uinit(struct platform_device *pdev,
.uinit = fme_thermal_mgmt_uinit,
};
+#define FME_PWR_STATUS 0x8
+#define FME_LATENCY_TOLERANCE BIT_ULL(18)
+#define PWR_CONSUMED GENMASK_ULL(17, 0)
+
+#define FME_PWR_THRESHOLD 0x10
+#define PWR_THRESHOLD1 GENMASK_ULL(6, 0) /* in Watts */
+#define PWR_THRESHOLD2 GENMASK_ULL(14, 8) /* in Watts */
+#define PWR_THRESHOLD_MAX 0x7f /* in Watts */
+#define PWR_THRESHOLD1_STATUS BIT_ULL(16)
+#define PWR_THRESHOLD2_STATUS BIT_ULL(17)
+
+#define FME_PWR_XEON_LIMIT 0x18
+#define XEON_PWR_LIMIT GENMASK_ULL(14, 0) /* in 0.1 Watts */
+#define XEON_PWR_EN BIT_ULL(15)
+#define FME_PWR_FPGA_LIMIT 0x20
+#define FPGA_PWR_LIMIT GENMASK_ULL(14, 0) /* in 0.1 Watts */
+#define FPGA_PWR_EN BIT_ULL(15)
+
+#define PWR_THRESHOLD_MAX_IN_UW (PWR_THRESHOLD_MAX * 1000000)
+
+static int power_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
+ u32 attr, int channel, long *val)
+{
+ struct dfl_feature *feature = dev_get_drvdata(dev);
+ u64 v;
+
+ switch (attr) {
+ case hwmon_power_input:
+ v = readq(feature->ioaddr + FME_PWR_STATUS);
+ *val = (long)(FIELD_GET(PWR_CONSUMED, v) * 1000000);
+ break;
+ case hwmon_power_max:
+ v = readq(feature->ioaddr + FME_PWR_THRESHOLD);
+ *val = (long)(FIELD_GET(PWR_THRESHOLD1, v) * 1000000);
+ break;
+ case hwmon_power_crit:
+ v = readq(feature->ioaddr + FME_PWR_THRESHOLD);
+ *val = (long)(FIELD_GET(PWR_THRESHOLD2, v) * 1000000);
+ break;
+ case hwmon_power_max_alarm:
+ v = readq(feature->ioaddr + FME_PWR_THRESHOLD);
+ *val = (long)FIELD_GET(PWR_THRESHOLD1_STATUS, v);
+ break;
+ case hwmon_power_crit_alarm:
+ v = readq(feature->ioaddr + FME_PWR_THRESHOLD);
+ *val = (long)FIELD_GET(PWR_THRESHOLD2_STATUS, v);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int power_hwmon_write(struct device *dev, enum hwmon_sensor_types type,
+ u32 attr, int channel, long val)
+{
+ struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
+ struct dfl_feature *feature = dev_get_drvdata(dev);
+ int ret = 0;
+ u64 v;
+
+ if (val < 0 || val > PWR_THRESHOLD_MAX_IN_UW)
+ return -EINVAL;
+
+ val = val / 1000000;
+
+ mutex_lock(&pdata->lock);
+
+ switch (attr) {
+ case hwmon_power_max:
+ v = readq(feature->ioaddr + FME_PWR_THRESHOLD);
+ v &= ~PWR_THRESHOLD1;
+ v |= FIELD_PREP(PWR_THRESHOLD1, val);
+ writeq(v, feature->ioaddr + FME_PWR_THRESHOLD);
+ break;
+ case hwmon_power_crit:
+ v = readq(feature->ioaddr + FME_PWR_THRESHOLD);
+ v &= ~PWR_THRESHOLD2;
+ v |= FIELD_PREP(PWR_THRESHOLD2, val);
+ writeq(v, feature->ioaddr + FME_PWR_THRESHOLD);
+ break;
+ default:
+ ret = -EOPNOTSUPP;
+ break;
+ }
+
+ mutex_unlock(&pdata->lock);
+
+ return ret;
+}
+
+static umode_t power_hwmon_attrs_visible(const void *drvdata,
+ enum hwmon_sensor_types type,
+ u32 attr, int channel)
+{
+ switch (attr) {
+ case hwmon_power_input:
+ case hwmon_power_max_alarm:
+ case hwmon_power_crit_alarm:
+ return 0444;
+ case hwmon_power_max:
+ case hwmon_power_crit:
+ return 0644;
+ }
+
+ return 0;
+}
+
+static const struct hwmon_ops power_hwmon_ops = {
+ .is_visible = power_hwmon_attrs_visible,
+ .read = power_hwmon_read,
+ .write = power_hwmon_write,
+};
+
+static const struct hwmon_channel_info *power_hwmon_info[] = {
+ HWMON_CHANNEL_INFO(power, HWMON_P_INPUT |
+ HWMON_P_MAX | HWMON_P_MAX_ALARM |
+ HWMON_P_CRIT | HWMON_P_CRIT_ALARM),
+ NULL
+};
+
+static const struct hwmon_chip_info power_hwmon_chip_info = {
+ .ops = &power_hwmon_ops,
+ .info = power_hwmon_info,
+};
+
+static ssize_t power1_xeon_limit_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct dfl_feature *feature = dev_get_drvdata(dev);
+ u16 xeon_limit = 0;
+ u64 v;
+
+ v = readq(feature->ioaddr + FME_PWR_XEON_LIMIT);
+
+ if (FIELD_GET(XEON_PWR_EN, v))
+ xeon_limit = FIELD_GET(XEON_PWR_LIMIT, v);
+
+ return sprintf(buf, "%u\n", xeon_limit * 100000);
+}
+
+static ssize_t power1_fpga_limit_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct dfl_feature *feature = dev_get_drvdata(dev);
+ u16 fpga_limit = 0;
+ u64 v;
+
+ v = readq(feature->ioaddr + FME_PWR_FPGA_LIMIT);
+
+ if (FIELD_GET(FPGA_PWR_EN, v))
+ fpga_limit = FIELD_GET(FPGA_PWR_LIMIT, v);
+
+ return sprintf(buf, "%u\n", fpga_limit * 100000);
+}
+
+static ssize_t power1_ltr_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct dfl_feature *feature = dev_get_drvdata(dev);
+ u64 v;
+
+ v = readq(feature->ioaddr + FME_PWR_STATUS);
+
+ return sprintf(buf, "%u\n",
+ (unsigned int)FIELD_GET(FME_LATENCY_TOLERANCE, v));
+}
+
+static DEVICE_ATTR_RO(power1_xeon_limit);
+static DEVICE_ATTR_RO(power1_fpga_limit);
+static DEVICE_ATTR_RO(power1_ltr);
+
+static struct attribute *power_extra_attrs[] = {
+ &dev_attr_power1_xeon_limit.attr,
+ &dev_attr_power1_fpga_limit.attr,
+ &dev_attr_power1_ltr.attr,
+ NULL
+};
+
+ATTRIBUTE_GROUPS(power_extra);
+
+static int fme_power_mgmt_init(struct platform_device *pdev,
+ struct dfl_feature *feature)
+{
+ struct device *hwmon;
+
+ dev_dbg(&pdev->dev, "FME Power Management Init.\n");
+
+ hwmon = devm_hwmon_device_register_with_info(&pdev->dev,
+ "dfl_fme_power", feature,
+ &power_hwmon_chip_info,
+ power_extra_groups);
+ if (IS_ERR(hwmon)) {
+ dev_err(&pdev->dev, "Fail to register power hwmon\n");
+ return PTR_ERR(hwmon);
+ }
+
+ return 0;
+}
+
+static void fme_power_mgmt_uinit(struct platform_device *pdev,
+ struct dfl_feature *feature)
+{
+ dev_dbg(&pdev->dev, "FME Power Management UInit.\n");
+}
+
+static const struct dfl_feature_id fme_power_mgmt_id_table[] = {
+ {.id = FME_FEATURE_ID_POWER_MGMT,},
+ {0,}
+};
+
+static const struct dfl_feature_ops fme_power_mgmt_ops = {
+ .init = fme_power_mgmt_init,
+ .uinit = fme_power_mgmt_uinit,
+};
+
static struct dfl_feature_driver fme_feature_drvs[] = {
{
.id_table = fme_hdr_id_table,
@@ -418,6 +635,10 @@ static void fme_thermal_mgmt_uinit(struct platform_device *pdev,
.ops = &fme_thermal_mgmt_ops,
},
{
+ .id_table = fme_power_mgmt_id_table,
+ .ops = &fme_power_mgmt_ops,
+ },
+ {
.ops = NULL,
},
};
--
1.8.3.1
^ permalink raw reply related
* [PATCH v4 2/3] fpga: dfl: fme: add thermal management support
From: Wu Hao @ 2019-06-27 4:53 UTC (permalink / raw)
To: mdf, linux-fpga, linux-kernel
Cc: linux-api, linux-hwmon, jdelvare, linux, atull, gregkh, Wu Hao,
Luwei Kang, Russ Weight, Xu Yilun
In-Reply-To: <1561611218-5800-1-git-send-email-hao.wu@intel.com>
This patch adds support to thermal management private feature for DFL
FPGA Management Engine (FME). This private feature driver registers
a hwmon for thermal/temperature monitoring (hwmon temp1_input).
If hardware automatic throttling is supported by this hardware, then
driver also exposes sysfs interfaces under hwmon for thresholds
(temp1_max/ crit/ emergency), threshold alarms (temp1_max_alarm/
temp1_crit_alarm) and throttling policy (temp1_max_policy).
Signed-off-by: Luwei Kang <luwei.kang@intel.com>
Signed-off-by: Russ Weight <russell.h.weight@intel.com>
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
---
v2: create a dfl_fme_thermal hwmon to expose thermal information.
move all sysfs interfaces under hwmon
tempareture --> hwmon temp1_input
threshold1 --> hwmon temp1_alarm
threshold2 --> hwmon temp1_crit
trip_threshold --> hwmon temp1_emergency
threshold1_status --> hwmon temp1_alarm_status
threshold2_status --> hwmon temp1_crit_status
threshold1_policy --> hwmon temp1_alarm_policy
v3: rename some hwmon sysfs interfaces to follow hwmon ABI.
temp1_alarm --> temp1_max
temp1_alarm_status --> temp1_max_alarm
temp1_crit_status --> temp1_crit_alarm
temp1_alarm_policy --> temp1_max_policy
update sysfs doc for above sysfs interface changes.
replace scnprintf with sprintf in sysfs interface.
v4: use HWMON_CHANNEL_INFO.
rebase, and update date in sysfs doc.
---
Documentation/ABI/testing/sysfs-platform-dfl-fme | 64 ++++++++
drivers/fpga/Kconfig | 2 +-
drivers/fpga/dfl-fme-main.c | 187 +++++++++++++++++++++++
3 files changed, 252 insertions(+), 1 deletion(-)
diff --git a/Documentation/ABI/testing/sysfs-platform-dfl-fme b/Documentation/ABI/testing/sysfs-platform-dfl-fme
index 86eef83..2cd17dc 100644
--- a/Documentation/ABI/testing/sysfs-platform-dfl-fme
+++ b/Documentation/ABI/testing/sysfs-platform-dfl-fme
@@ -119,3 +119,67 @@ Description: Write-only. Write error code to this file to clear all errors
logged in errors, first_error and next_error. Write fails with
-EINVAL if input parsing fails or input error code doesn't
match.
+
+What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/name
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-Only. Read this file to get the name of hwmon device, it
+ supports values:
+ 'dfl_fme_thermal' - thermal hwmon device name
+
+What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/temp1_input
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-Only. It returns FPGA device temperature in millidegrees
+ Celsius.
+
+What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/temp1_max
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-Only. It returns hardware threshold1 temperature in
+ millidegrees Celsius. If temperature rises at or above this
+ threshold, hardware starts 50% or 90% throttling (see
+ 'temp1_max_policy').
+
+What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/temp1_crit
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-Only. It returns hardware threshold2 temperature in
+ millidegrees Celsius. If temperature rises at or above this
+ threshold, hardware starts 100% throttling.
+
+What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/temp1_emergency
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-Only. It returns hardware trip threshold temperature in
+ millidegrees Celsius. If temperature rises at or above this
+ threshold, a fatal event will be triggered to board management
+ controller (BMC) to shutdown FPGA.
+
+What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/temp1_max_alarm
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-only. It returns 1 if temperature is currently at or above
+ hardware threshold1 (see 'temp1_max'), otherwise 0.
+
+What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/temp1_crit_alarm
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-only. It returns 1 if temperature is currently at or above
+ hardware threshold2 (see 'temp1_crit'), otherwise 0.
+
+What: /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/temp1_max_policy
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-Only. Read this file to get the policy of hardware threshold1
+ (see 'temp1_max'). It only supports two values (policies):
+ 0 - AP2 state (90% throttling)
+ 1 - AP1 state (50% throttling)
diff --git a/drivers/fpga/Kconfig b/drivers/fpga/Kconfig
index 8072c19..48f6224 100644
--- a/drivers/fpga/Kconfig
+++ b/drivers/fpga/Kconfig
@@ -155,7 +155,7 @@ config FPGA_DFL
config FPGA_DFL_FME
tristate "FPGA DFL FME Driver"
- depends on FPGA_DFL
+ depends on FPGA_DFL && HWMON
help
The FPGA Management Engine (FME) is a feature device implemented
under Device Feature List (DFL) framework. Select this option to
diff --git a/drivers/fpga/dfl-fme-main.c b/drivers/fpga/dfl-fme-main.c
index 4490cf4..59ff9f1 100644
--- a/drivers/fpga/dfl-fme-main.c
+++ b/drivers/fpga/dfl-fme-main.c
@@ -14,6 +14,8 @@
* Henry Mitchel <henry.mitchel@intel.com>
*/
+#include <linux/hwmon.h>
+#include <linux/hwmon-sysfs.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/uaccess.h>
@@ -217,6 +219,187 @@ static long fme_hdr_ioctl(struct platform_device *pdev,
.ioctl = fme_hdr_ioctl,
};
+#define FME_THERM_THRESHOLD 0x8
+#define TEMP_THRESHOLD1 GENMASK_ULL(6, 0)
+#define TEMP_THRESHOLD1_EN BIT_ULL(7)
+#define TEMP_THRESHOLD2 GENMASK_ULL(14, 8)
+#define TEMP_THRESHOLD2_EN BIT_ULL(15)
+#define TRIP_THRESHOLD GENMASK_ULL(30, 24)
+#define TEMP_THRESHOLD1_STATUS BIT_ULL(32) /* threshold1 reached */
+#define TEMP_THRESHOLD2_STATUS BIT_ULL(33) /* threshold2 reached */
+/* threshold1 policy: 0 - AP2 (90% throttle) / 1 - AP1 (50% throttle) */
+#define TEMP_THRESHOLD1_POLICY BIT_ULL(44)
+
+#define FME_THERM_RDSENSOR_FMT1 0x10
+#define FPGA_TEMPERATURE GENMASK_ULL(6, 0)
+
+#define FME_THERM_CAP 0x20
+#define THERM_NO_THROTTLE BIT_ULL(0)
+
+#define MD_PRE_DEG
+
+static bool fme_thermal_throttle_support(void __iomem *base)
+{
+ u64 v = readq(base + FME_THERM_CAP);
+
+ return FIELD_GET(THERM_NO_THROTTLE, v) ? false : true;
+}
+
+static umode_t thermal_hwmon_attrs_visible(const void *drvdata,
+ enum hwmon_sensor_types type,
+ u32 attr, int channel)
+{
+ const struct dfl_feature *feature = drvdata;
+
+ /* temperature is always supported, and check hardware cap for others */
+ if (attr == hwmon_temp_input)
+ return 0444;
+
+ return fme_thermal_throttle_support(feature->ioaddr) ? 0444 : 0;
+}
+
+static int thermal_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
+ u32 attr, int channel, long *val)
+{
+ struct dfl_feature *feature = dev_get_drvdata(dev);
+ u64 v;
+
+ switch (attr) {
+ case hwmon_temp_input:
+ v = readq(feature->ioaddr + FME_THERM_RDSENSOR_FMT1);
+ *val = (long)(FIELD_GET(FPGA_TEMPERATURE, v) * 1000);
+ break;
+ case hwmon_temp_max:
+ v = readq(feature->ioaddr + FME_THERM_THRESHOLD);
+ *val = (long)(FIELD_GET(TEMP_THRESHOLD1, v) * 1000);
+ break;
+ case hwmon_temp_crit:
+ v = readq(feature->ioaddr + FME_THERM_THRESHOLD);
+ *val = (long)(FIELD_GET(TEMP_THRESHOLD2, v) * 1000);
+ break;
+ case hwmon_temp_emergency:
+ v = readq(feature->ioaddr + FME_THERM_THRESHOLD);
+ *val = (long)(FIELD_GET(TRIP_THRESHOLD, v) * 1000);
+ break;
+ case hwmon_temp_max_alarm:
+ v = readq(feature->ioaddr + FME_THERM_THRESHOLD);
+ *val = (long)FIELD_GET(TEMP_THRESHOLD1_STATUS, v);
+ break;
+ case hwmon_temp_crit_alarm:
+ v = readq(feature->ioaddr + FME_THERM_THRESHOLD);
+ *val = (long)FIELD_GET(TEMP_THRESHOLD2_STATUS, v);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static const struct hwmon_ops thermal_hwmon_ops = {
+ .is_visible = thermal_hwmon_attrs_visible,
+ .read = thermal_hwmon_read,
+};
+
+static const struct hwmon_channel_info *thermal_hwmon_info[] = {
+ HWMON_CHANNEL_INFO(temp, HWMON_T_INPUT | HWMON_T_EMERGENCY |
+ HWMON_T_MAX | HWMON_T_MAX_ALARM |
+ HWMON_T_CRIT | HWMON_T_CRIT_ALARM),
+ NULL
+};
+
+static const struct hwmon_chip_info thermal_hwmon_chip_info = {
+ .ops = &thermal_hwmon_ops,
+ .info = thermal_hwmon_info,
+};
+
+static ssize_t temp1_max_policy_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct dfl_feature *feature = dev_get_drvdata(dev);
+ u64 v;
+
+ v = readq(feature->ioaddr + FME_THERM_THRESHOLD);
+
+ return sprintf(buf, "%u\n",
+ (unsigned int)FIELD_GET(TEMP_THRESHOLD1_POLICY, v));
+}
+
+static DEVICE_ATTR_RO(temp1_max_policy);
+
+static struct attribute *thermal_extra_attrs[] = {
+ &dev_attr_temp1_max_policy.attr,
+ NULL,
+};
+
+static umode_t thermal_extra_attrs_visible(struct kobject *kobj,
+ struct attribute *attr, int index)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct dfl_feature *feature = dev_get_drvdata(dev);
+
+ return fme_thermal_throttle_support(feature->ioaddr) ? attr->mode : 0;
+}
+
+static const struct attribute_group thermal_extra_group = {
+ .attrs = thermal_extra_attrs,
+ .is_visible = thermal_extra_attrs_visible,
+};
+__ATTRIBUTE_GROUPS(thermal_extra);
+
+static int fme_thermal_mgmt_init(struct platform_device *pdev,
+ struct dfl_feature *feature)
+{
+ struct device *hwmon;
+
+ dev_dbg(&pdev->dev, "FME Thermal Management Init.\n");
+
+ /*
+ * create hwmon to allow userspace monitoring temperature and other
+ * threshold information.
+ *
+ * temp1_input -> FPGA device temperature
+ * temp1_max -> hardware threshold 1 -> 50% or 90% throttling
+ * temp1_crit -> hardware threshold 2 -> 100% throttling
+ * temp1_emergency -> hardware trip_threshold to shutdown FPGA
+ * temp1_max_alarm -> hardware threshold 1 alarm
+ * temp1_crit_alarm -> hardware threshold 2 alarm
+ *
+ * create device specific sysfs interfaces, e.g. read temp1_max_policy
+ * to understand the actual hardware throttling action (50% vs 90%).
+ *
+ * If hardware doesn't support automatic throttling per thresholds,
+ * then all above sysfs interfaces are not visible except temp1_input
+ * for temperature.
+ */
+ hwmon = devm_hwmon_device_register_with_info(&pdev->dev,
+ "dfl_fme_thermal", feature,
+ &thermal_hwmon_chip_info,
+ thermal_extra_groups);
+ if (IS_ERR(hwmon)) {
+ dev_err(&pdev->dev, "Fail to register thermal hwmon\n");
+ return PTR_ERR(hwmon);
+ }
+
+ return 0;
+}
+
+static void fme_thermal_mgmt_uinit(struct platform_device *pdev,
+ struct dfl_feature *feature)
+{
+ dev_dbg(&pdev->dev, "FME Thermal Management UInit.\n");
+}
+
+static const struct dfl_feature_id fme_thermal_mgmt_id_table[] = {
+ {.id = FME_FEATURE_ID_THERMAL_MGMT,},
+ {0,}
+};
+
+static const struct dfl_feature_ops fme_thermal_mgmt_ops = {
+ .init = fme_thermal_mgmt_init,
+ .uinit = fme_thermal_mgmt_uinit,
+};
+
static struct dfl_feature_driver fme_feature_drvs[] = {
{
.id_table = fme_hdr_id_table,
@@ -231,6 +414,10 @@ static long fme_hdr_ioctl(struct platform_device *pdev,
.ops = &fme_global_err_ops,
},
{
+ .id_table = fme_thermal_mgmt_id_table,
+ .ops = &fme_thermal_mgmt_ops,
+ },
+ {
.ops = NULL,
},
};
--
1.8.3.1
^ permalink raw reply related
* [PATCH v4 1/3] Documentation: fpga: dfl: add descriptions for thermal/power management interfaces
From: Wu Hao @ 2019-06-27 4:53 UTC (permalink / raw)
To: mdf, linux-fpga, linux-kernel
Cc: linux-api, linux-hwmon, jdelvare, linux, atull, gregkh, Xu Yilun,
Wu Hao
In-Reply-To: <1561611218-5800-1-git-send-email-hao.wu@intel.com>
From: Xu Yilun <yilun.xu@intel.com>
This patch add introductions to thermal/power interfaces. They are
implemented as hwmon sysfs interfaces by thermal/power private
feature drivers.
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
---
v4: rebased.
---
Documentation/fpga/dfl.txt | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/Documentation/fpga/dfl.txt b/Documentation/fpga/dfl.txt
index a22631f..652917f 100644
--- a/Documentation/fpga/dfl.txt
+++ b/Documentation/fpga/dfl.txt
@@ -105,6 +105,16 @@ More functions are exposed through sysfs
error reporting sysfs interfaces allow user to read errors detected by the
hardware, and clear the logged errors.
+ Power management (dfl_fme_power hwmon)
+ power management hwmon sysfs interfaces allow user to read power management
+ information (power consumption, thresholds, threshold status, limits, etc.)
+ and configure power thresholds for different throttling levels.
+
+ Thermal management (dfl_fme_thermal hwmon)
+ thermal management hwmon sysfs interfaces allow user to read thermal
+ management information (current temperature, thresholds, threshold status,
+ etc.).
+
FIU - PORT
==========
--
1.8.3.1
^ permalink raw reply related
* [PATCH v4 0/3] add thermal/power management features for FPGA DFL drivers
From: Wu Hao @ 2019-06-27 4:53 UTC (permalink / raw)
To: mdf, linux-fpga, linux-kernel
Cc: linux-api, linux-hwmon, jdelvare, linux, atull, gregkh, Wu Hao
This patchset adds thermal and power management features for FPGA DFL
drivers. Both patches are using hwmon as userspace interfaces.
Main changes from v3:
- use HWMON_CHANNEL_INFO.
Main changes from v2:
- switch to standard hwmon APIs for thermal hwmon:
temp1_alarm --> temp1_max
temp1_alarm_status --> temp1_max_alarm
temp1_crit_status --> temp1_crit_alarm
temp1_alarm_policy --> temp1_max_policy
- switch to standard hwmon APIs for power hwmon:
power1_cap --> power1_max
power1_cap_status --> power1_max_alarm
power1_crit_status --> power1_crit_alarm
Please note that this patchset is generated on top of this patchset.
[PATCH v4 00/15] add new features for FPGA DFL drivers
- https://lkml.org/lkml/2019/6/27/29
Wu Hao (2):
fpga: dfl: fme: add thermal management support
fpga: dfl: fme: add power management support
Xu Yilun (1):
Documentation: fpga: dfl: add descriptions for thermal/power
management interfaces
Documentation/ABI/testing/sysfs-platform-dfl-fme | 131 ++++++++
Documentation/fpga/dfl.txt | 10 +
drivers/fpga/Kconfig | 2 +-
drivers/fpga/dfl-fme-main.c | 408 +++++++++++++++++++++++
4 files changed, 550 insertions(+), 1 deletion(-)
--
1.8.3.1
^ permalink raw reply
* [PATCH v4 15/15] fpga: dfl: fme: add global error reporting support
From: Wu Hao @ 2019-06-27 4:44 UTC (permalink / raw)
To: mdf, linux-fpga, linux-kernel
Cc: linux-api, yilun.xu, hao.wu, gregkh, atull, Luwei Kang,
Ananda Ravuri
In-Reply-To: <1561610695-5414-1-git-send-email-hao.wu@intel.com>
This patch adds support for global error reporting for FPGA
Management Engine (FME), it introduces sysfs interfaces to
report different error detected by the hardware, and allow
user to clear errors or inject error for testing purpose.
Signed-off-by: Luwei Kang <luwei.kang@intel.com>
Signed-off-by: Ananda Ravuri <ananda.ravuri@intel.com>
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Acked-by: Alan Tull <atull@kernel.org>
---
v2: fix issues found in sysfs doc.
fix returned error code issues for writable sysfs interfaces.
(use -EINVAL if input doesn't match error code)
reorder the sysfs groups in code.
v3: code rebase.
replace scnprintf with sprintf in sysfs interfaces.
update sysfs doc kernel version and date.
v4: update sysfs doc date.
---
Documentation/ABI/testing/sysfs-platform-dfl-fme | 75 +++++
drivers/fpga/Makefile | 2 +-
drivers/fpga/dfl-fme-error.c | 385 +++++++++++++++++++++++
drivers/fpga/dfl-fme-main.c | 4 +
drivers/fpga/dfl-fme.h | 2 +
drivers/fpga/dfl.h | 2 +
6 files changed, 469 insertions(+), 1 deletion(-)
create mode 100644 drivers/fpga/dfl-fme-error.c
diff --git a/Documentation/ABI/testing/sysfs-platform-dfl-fme b/Documentation/ABI/testing/sysfs-platform-dfl-fme
index 99cd3b2..86eef83 100644
--- a/Documentation/ABI/testing/sysfs-platform-dfl-fme
+++ b/Documentation/ABI/testing/sysfs-platform-dfl-fme
@@ -44,3 +44,78 @@ Description: Read-only. It returns socket_id to indicate which socket
this FPGA belongs to, only valid for integrated solution.
User only needs this information, in case standard numa node
can't provide correct information.
+
+What: /sys/bus/platform/devices/dfl-fme.0/errors/revision
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-only. Read this file to get the revision of this global
+ error reporting private feature.
+
+What: /sys/bus/platform/devices/dfl-fme.0/errors/pcie0_errors
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-Write. Read this file for errors detected on pcie0 link.
+ Write this file to clear errors logged in pcie0_errors. Write
+ fails with -EINVAL if input parsing fails or input error code
+ doesn't match.
+
+What: /sys/bus/platform/devices/dfl-fme.0/errors/pcie1_errors
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-Write. Read this file for errors detected on pcie1 link.
+ Write this file to clear errors logged in pcie1_errors. Write
+ fails with -EINVAL if input parsing fails or input error code
+ doesn't match.
+
+What: /sys/bus/platform/devices/dfl-fme.0/errors/nonfatal_errors
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-only. It returns non-fatal errors detected.
+
+What: /sys/bus/platform/devices/dfl-fme.0/errors/catfatal_errors
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-only. It returns catastrophic and fatal errors detected.
+
+What: /sys/bus/platform/devices/dfl-fme.0/errors/inject_error
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-Write. Read this file to check errors injected. Write this
+ file to inject errors for testing purpose. Write fails with
+ -EINVAL if input parsing fails or input inject error code isn't
+ supported.
+
+What: /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/errors
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-only. Read this file to get errors detected by hardware.
+
+What: /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/first_error
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-only. Read this file to get the first error detected by
+ hardware.
+
+What: /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/next_error
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-only. Read this file to get the second error detected by
+ hardware.
+
+What: /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/clear
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Write-only. Write error code to this file to clear all errors
+ logged in errors, first_error and next_error. Write fails with
+ -EINVAL if input parsing fails or input error code doesn't
+ match.
diff --git a/drivers/fpga/Makefile b/drivers/fpga/Makefile
index 7255891..4865b74 100644
--- a/drivers/fpga/Makefile
+++ b/drivers/fpga/Makefile
@@ -39,7 +39,7 @@ obj-$(CONFIG_FPGA_DFL_FME_BRIDGE) += dfl-fme-br.o
obj-$(CONFIG_FPGA_DFL_FME_REGION) += dfl-fme-region.o
obj-$(CONFIG_FPGA_DFL_AFU) += dfl-afu.o
-dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o
+dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o dfl-fme-error.o
dfl-afu-objs := dfl-afu-main.o dfl-afu-region.o dfl-afu-dma-region.o
dfl-afu-objs += dfl-afu-error.o
diff --git a/drivers/fpga/dfl-fme-error.c b/drivers/fpga/dfl-fme-error.c
new file mode 100644
index 0000000..cdea108
--- /dev/null
+++ b/drivers/fpga/dfl-fme-error.c
@@ -0,0 +1,385 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Driver for FPGA Management Engine Error Management
+ *
+ * Copyright 2019 Intel Corporation, Inc.
+ *
+ * Authors:
+ * Kang Luwei <luwei.kang@intel.com>
+ * Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ * Wu Hao <hao.wu@intel.com>
+ * Joseph Grecco <joe.grecco@intel.com>
+ * Enno Luebbers <enno.luebbers@intel.com>
+ * Tim Whisonant <tim.whisonant@intel.com>
+ * Ananda Ravuri <ananda.ravuri@intel.com>
+ * Mitchel, Henry <henry.mitchel@intel.com>
+ */
+
+#include <linux/uaccess.h>
+
+#include "dfl.h"
+#include "dfl-fme.h"
+
+#define FME_ERROR_MASK 0x8
+#define FME_ERROR 0x10
+#define MBP_ERROR BIT_ULL(6)
+#define PCIE0_ERROR_MASK 0x18
+#define PCIE0_ERROR 0x20
+#define PCIE1_ERROR_MASK 0x28
+#define PCIE1_ERROR 0x30
+#define FME_FIRST_ERROR 0x38
+#define FME_NEXT_ERROR 0x40
+#define RAS_NONFAT_ERROR_MASK 0x48
+#define RAS_NONFAT_ERROR 0x50
+#define RAS_CATFAT_ERROR_MASK 0x58
+#define RAS_CATFAT_ERROR 0x60
+#define RAS_ERROR_INJECT 0x68
+#define INJECT_ERROR_MASK GENMASK_ULL(2, 0)
+
+static ssize_t revision_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct device *err_dev = dev->parent;
+ void __iomem *base;
+
+ base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+ return sprintf(buf, "%u\n", dfl_feature_revision(base));
+}
+static DEVICE_ATTR_RO(revision);
+
+static ssize_t pcie0_errors_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct device *err_dev = dev->parent;
+ void __iomem *base;
+
+ base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+ return sprintf(buf, "0x%llx\n",
+ (unsigned long long)readq(base + PCIE0_ERROR));
+}
+
+static ssize_t pcie0_errors_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
+ struct device *err_dev = dev->parent;
+ void __iomem *base;
+ int ret = 0;
+ u64 v, val;
+
+ if (kstrtou64(buf, 0, &val))
+ return -EINVAL;
+
+ base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+ mutex_lock(&pdata->lock);
+ writeq(GENMASK_ULL(63, 0), base + PCIE0_ERROR_MASK);
+
+ v = readq(base + PCIE0_ERROR);
+ if (val == v)
+ writeq(v, base + PCIE0_ERROR);
+ else
+ ret = -EINVAL;
+
+ writeq(0ULL, base + PCIE0_ERROR_MASK);
+ mutex_unlock(&pdata->lock);
+ return ret ? ret : count;
+}
+static DEVICE_ATTR_RW(pcie0_errors);
+
+static ssize_t pcie1_errors_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct device *err_dev = dev->parent;
+ void __iomem *base;
+
+ base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+ return sprintf(buf, "0x%llx\n",
+ (unsigned long long)readq(base + PCIE1_ERROR));
+}
+
+static ssize_t pcie1_errors_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
+ struct device *err_dev = dev->parent;
+ void __iomem *base;
+ int ret = 0;
+ u64 v, val;
+
+ if (kstrtou64(buf, 0, &val))
+ return -EINVAL;
+
+ base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+ mutex_lock(&pdata->lock);
+ writeq(GENMASK_ULL(63, 0), base + PCIE1_ERROR_MASK);
+
+ v = readq(base + PCIE1_ERROR);
+ if (val == v)
+ writeq(v, base + PCIE1_ERROR);
+ else
+ ret = -EINVAL;
+
+ writeq(0ULL, base + PCIE1_ERROR_MASK);
+ mutex_unlock(&pdata->lock);
+ return ret ? ret : count;
+}
+static DEVICE_ATTR_RW(pcie1_errors);
+
+static ssize_t nonfatal_errors_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct device *err_dev = dev->parent;
+ void __iomem *base;
+
+ base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+ return sprintf(buf, "0x%llx\n",
+ (unsigned long long)readq(base + RAS_NONFAT_ERROR));
+}
+static DEVICE_ATTR_RO(nonfatal_errors);
+
+static ssize_t catfatal_errors_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct device *err_dev = dev->parent;
+ void __iomem *base;
+
+ base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+ return sprintf(buf, "0x%llx\n",
+ (unsigned long long)readq(base + RAS_CATFAT_ERROR));
+}
+static DEVICE_ATTR_RO(catfatal_errors);
+
+static ssize_t inject_error_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct device *err_dev = dev->parent;
+ void __iomem *base;
+ u64 v;
+
+ base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+ v = readq(base + RAS_ERROR_INJECT);
+
+ return sprintf(buf, "0x%llx\n",
+ (unsigned long long)FIELD_GET(INJECT_ERROR_MASK, v));
+}
+
+static ssize_t inject_error_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
+ struct device *err_dev = dev->parent;
+ void __iomem *base;
+ u8 inject_error;
+ u64 v;
+
+ if (kstrtou8(buf, 0, &inject_error))
+ return -EINVAL;
+
+ if (inject_error & ~INJECT_ERROR_MASK)
+ return -EINVAL;
+
+ base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+ mutex_lock(&pdata->lock);
+ v = readq(base + RAS_ERROR_INJECT);
+ v &= ~INJECT_ERROR_MASK;
+ v |= FIELD_PREP(INJECT_ERROR_MASK, inject_error);
+ writeq(v, base + RAS_ERROR_INJECT);
+ mutex_unlock(&pdata->lock);
+
+ return count;
+}
+static DEVICE_ATTR_RW(inject_error);
+
+static struct attribute *errors_attrs[] = {
+ &dev_attr_revision.attr,
+ &dev_attr_pcie0_errors.attr,
+ &dev_attr_pcie1_errors.attr,
+ &dev_attr_nonfatal_errors.attr,
+ &dev_attr_catfatal_errors.attr,
+ &dev_attr_inject_error.attr,
+ NULL,
+};
+
+static struct attribute_group errors_attr_group = {
+ .attrs = errors_attrs,
+};
+
+static ssize_t errors_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct device *err_dev = dev->parent;
+ void __iomem *base;
+
+ base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+ return sprintf(buf, "0x%llx\n",
+ (unsigned long long)readq(base + FME_ERROR));
+}
+static DEVICE_ATTR_RO(errors);
+
+static ssize_t first_error_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct device *err_dev = dev->parent;
+ void __iomem *base;
+
+ base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+ return sprintf(buf, "0x%llx\n",
+ (unsigned long long)readq(base + FME_FIRST_ERROR));
+}
+static DEVICE_ATTR_RO(first_error);
+
+static ssize_t next_error_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct device *err_dev = dev->parent;
+ void __iomem *base;
+
+ base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+ return sprintf(buf, "0x%llx\n",
+ (unsigned long long)readq(base + FME_NEXT_ERROR));
+}
+static DEVICE_ATTR_RO(next_error);
+
+static ssize_t clear_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
+ struct device *err_dev = dev->parent;
+ void __iomem *base;
+ u64 v, val;
+ int ret = 0;
+
+ if (kstrtou64(buf, 0, &val))
+ return -EINVAL;
+
+ base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+ mutex_lock(&pdata->lock);
+ writeq(GENMASK_ULL(63, 0), base + FME_ERROR_MASK);
+
+ v = readq(base + FME_ERROR);
+ if (val == v) {
+ writeq(v, base + FME_ERROR);
+ v = readq(base + FME_FIRST_ERROR);
+ writeq(v, base + FME_FIRST_ERROR);
+ v = readq(base + FME_NEXT_ERROR);
+ writeq(v, base + FME_NEXT_ERROR);
+ } else {
+ ret = -EINVAL;
+ }
+
+ /* Workaround: disable MBP_ERROR if feature revision is 0 */
+ writeq(dfl_feature_revision(base) ? 0ULL : MBP_ERROR,
+ base + FME_ERROR_MASK);
+ mutex_unlock(&pdata->lock);
+ return ret ? ret : count;
+}
+static DEVICE_ATTR_WO(clear);
+
+static struct attribute *fme_errors_attrs[] = {
+ &dev_attr_errors.attr,
+ &dev_attr_first_error.attr,
+ &dev_attr_next_error.attr,
+ &dev_attr_clear.attr,
+ NULL,
+};
+
+static struct attribute_group fme_errors_attr_group = {
+ .attrs = fme_errors_attrs,
+ .name = "fme-errors",
+};
+
+static const struct attribute_group *error_groups[] = {
+ &fme_errors_attr_group,
+ &errors_attr_group,
+ NULL
+};
+
+static void fme_error_enable(struct dfl_feature *feature)
+{
+ void __iomem *base = feature->ioaddr;
+
+ /* Workaround: disable MBP_ERROR if revision is 0 */
+ writeq(dfl_feature_revision(feature->ioaddr) ? 0ULL : MBP_ERROR,
+ base + FME_ERROR_MASK);
+ writeq(0ULL, base + PCIE0_ERROR_MASK);
+ writeq(0ULL, base + PCIE1_ERROR_MASK);
+ writeq(0ULL, base + RAS_NONFAT_ERROR_MASK);
+ writeq(0ULL, base + RAS_CATFAT_ERROR_MASK);
+}
+
+static void err_dev_release(struct device *dev)
+{
+ kfree(dev);
+}
+
+static int fme_global_err_init(struct platform_device *pdev,
+ struct dfl_feature *feature)
+{
+ struct device *dev;
+ int ret = 0;
+
+ dev_dbg(&pdev->dev, "FME Global Error Reporting Init.\n");
+
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
+
+ dev->parent = &pdev->dev;
+ dev->release = err_dev_release;
+ dev_set_name(dev, "errors");
+
+ fme_error_enable(feature);
+
+ ret = device_register(dev);
+ if (ret) {
+ put_device(dev);
+ return ret;
+ }
+
+ ret = sysfs_create_groups(&dev->kobj, error_groups);
+ if (ret) {
+ device_unregister(dev);
+ return ret;
+ }
+
+ feature->priv = dev;
+
+ return ret;
+}
+
+static void fme_global_err_uinit(struct platform_device *pdev,
+ struct dfl_feature *feature)
+{
+ struct device *dev = feature->priv;
+
+ dev_dbg(&pdev->dev, "FME Global Error Reporting UInit.\n");
+
+ sysfs_remove_groups(&dev->kobj, error_groups);
+ device_unregister(dev);
+}
+
+const struct dfl_feature_id fme_global_err_id_table[] = {
+ {.id = FME_FEATURE_ID_GLOBAL_ERR,},
+ {0,}
+};
+
+const struct dfl_feature_ops fme_global_err_ops = {
+ .init = fme_global_err_init,
+ .uinit = fme_global_err_uinit,
+};
diff --git a/drivers/fpga/dfl-fme-main.c b/drivers/fpga/dfl-fme-main.c
index 2d69b8f..4490cf4 100644
--- a/drivers/fpga/dfl-fme-main.c
+++ b/drivers/fpga/dfl-fme-main.c
@@ -227,6 +227,10 @@ static long fme_hdr_ioctl(struct platform_device *pdev,
.ops = &fme_pr_mgmt_ops,
},
{
+ .id_table = fme_global_err_id_table,
+ .ops = &fme_global_err_ops,
+ },
+ {
.ops = NULL,
},
};
diff --git a/drivers/fpga/dfl-fme.h b/drivers/fpga/dfl-fme.h
index 7a021c4..5fbe3f5 100644
--- a/drivers/fpga/dfl-fme.h
+++ b/drivers/fpga/dfl-fme.h
@@ -37,5 +37,7 @@ struct dfl_fme {
extern const struct dfl_feature_ops fme_pr_mgmt_ops;
extern const struct dfl_feature_id fme_pr_mgmt_id_table[];
+extern const struct dfl_feature_ops fme_global_err_ops;
+extern const struct dfl_feature_id fme_global_err_id_table[];
#endif /* __DFL_FME_H */
diff --git a/drivers/fpga/dfl.h b/drivers/fpga/dfl.h
index fbc57f0..6c32080 100644
--- a/drivers/fpga/dfl.h
+++ b/drivers/fpga/dfl.h
@@ -197,12 +197,14 @@ struct dfl_feature_driver {
* feature dev (platform device)'s reources.
* @ioaddr: mapped mmio resource address.
* @ops: ops of this sub feature.
+ * @priv: priv data of this feature.
*/
struct dfl_feature {
u64 id;
int resource_index;
void __iomem *ioaddr;
const struct dfl_feature_ops *ops;
+ void *priv;
};
#define DEV_STATUS_IN_USE 0
--
1.8.3.1
^ permalink raw reply related
* [PATCH v4 14/15] fpga: dfl: fme: add capability sysfs interfaces
From: Wu Hao @ 2019-06-27 4:44 UTC (permalink / raw)
To: mdf, linux-fpga, linux-kernel
Cc: linux-api, yilun.xu, hao.wu, gregkh, atull, Luwei Kang
In-Reply-To: <1561610695-5414-1-git-send-email-hao.wu@intel.com>
This patch adds 3 read-only sysfs interfaces for FPGA Management Engine
(FME) block for capabilities including cache_size, fabric_version and
socket_id.
Signed-off-by: Luwei Kang <luwei.kang@intel.com>
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Acked-by: Alan Tull <atull@kernel.org>
---
v3: replace scnprintf with sprintf in sysfs interfaces.
update sysfs doc kernel version and date.
v4: update sysfs doc date.
---
Documentation/ABI/testing/sysfs-platform-dfl-fme | 23 ++++++++++++
drivers/fpga/dfl-fme-main.c | 48 ++++++++++++++++++++++++
2 files changed, 71 insertions(+)
diff --git a/Documentation/ABI/testing/sysfs-platform-dfl-fme b/Documentation/ABI/testing/sysfs-platform-dfl-fme
index 8fa4feb..99cd3b2 100644
--- a/Documentation/ABI/testing/sysfs-platform-dfl-fme
+++ b/Documentation/ABI/testing/sysfs-platform-dfl-fme
@@ -21,3 +21,26 @@ Contact: Wu Hao <hao.wu@intel.com>
Description: Read-only. It returns Bitstream (static FPGA region) meta
data, which includes the synthesis date, seed and other
information of this static FPGA region.
+
+What: /sys/bus/platform/devices/dfl-fme.0/cache_size
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-only. It returns cache size of this FPGA device.
+
+What: /sys/bus/platform/devices/dfl-fme.0/fabric_version
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-only. It returns fabric version of this FPGA device.
+ Userspace applications need this information to select
+ best data channels per different fabric design.
+
+What: /sys/bus/platform/devices/dfl-fme.0/socket_id
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-only. It returns socket_id to indicate which socket
+ this FPGA belongs to, only valid for integrated solution.
+ User only needs this information, in case standard numa node
+ can't provide correct information.
diff --git a/drivers/fpga/dfl-fme-main.c b/drivers/fpga/dfl-fme-main.c
index 38c6342..2d69b8f 100644
--- a/drivers/fpga/dfl-fme-main.c
+++ b/drivers/fpga/dfl-fme-main.c
@@ -75,10 +75,58 @@ static ssize_t bitstream_metadata_show(struct device *dev,
}
static DEVICE_ATTR_RO(bitstream_metadata);
+static ssize_t cache_size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ void __iomem *base;
+ u64 v;
+
+ base = dfl_get_feature_ioaddr_by_id(dev, FME_FEATURE_ID_HEADER);
+
+ v = readq(base + FME_HDR_CAP);
+
+ return sprintf(buf, "%u\n",
+ (unsigned int)FIELD_GET(FME_CAP_CACHE_SIZE, v));
+}
+static DEVICE_ATTR_RO(cache_size);
+
+static ssize_t fabric_version_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ void __iomem *base;
+ u64 v;
+
+ base = dfl_get_feature_ioaddr_by_id(dev, FME_FEATURE_ID_HEADER);
+
+ v = readq(base + FME_HDR_CAP);
+
+ return sprintf(buf, "%u\n",
+ (unsigned int)FIELD_GET(FME_CAP_FABRIC_VERID, v));
+}
+static DEVICE_ATTR_RO(fabric_version);
+
+static ssize_t socket_id_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ void __iomem *base;
+ u64 v;
+
+ base = dfl_get_feature_ioaddr_by_id(dev, FME_FEATURE_ID_HEADER);
+
+ v = readq(base + FME_HDR_CAP);
+
+ return sprintf(buf, "%u\n",
+ (unsigned int)FIELD_GET(FME_CAP_SOCKET_ID, v));
+}
+static DEVICE_ATTR_RO(socket_id);
+
static const struct attribute *fme_hdr_attrs[] = {
&dev_attr_ports_num.attr,
&dev_attr_bitstream_id.attr,
&dev_attr_bitstream_metadata.attr,
+ &dev_attr_cache_size.attr,
+ &dev_attr_fabric_version.attr,
+ &dev_attr_socket_id.attr,
NULL,
};
--
1.8.3.1
^ permalink raw reply related
* [PATCH v4 13/15] fpga: dfl: afu: add STP (SignalTap) support
From: Wu Hao @ 2019-06-27 4:44 UTC (permalink / raw)
To: mdf, linux-fpga, linux-kernel; +Cc: linux-api, yilun.xu, hao.wu, gregkh, atull
In-Reply-To: <1561610695-5414-1-git-send-email-hao.wu@intel.com>
STP (SignalTap) is one of the private features under the port for
debugging. This patch adds private feature driver support for it
to allow userspace applications to mmap related mmio region and
provide STP service.
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Acked-by: Moritz Fischer <mdf@kernel.org>
Acked-by: Alan Tull <atull@kernel.org>
---
drivers/fpga/dfl-afu-main.c | 34 ++++++++++++++++++++++++++++++++++
1 file changed, 34 insertions(+)
diff --git a/drivers/fpga/dfl-afu-main.c b/drivers/fpga/dfl-afu-main.c
index bcf6e28..8241ace 100644
--- a/drivers/fpga/dfl-afu-main.c
+++ b/drivers/fpga/dfl-afu-main.c
@@ -513,6 +513,36 @@ static void port_afu_uinit(struct platform_device *pdev,
.uinit = port_afu_uinit,
};
+static int port_stp_init(struct platform_device *pdev,
+ struct dfl_feature *feature)
+{
+ struct resource *res = &pdev->resource[feature->resource_index];
+
+ dev_dbg(&pdev->dev, "PORT STP Init.\n");
+
+ return afu_mmio_region_add(dev_get_platdata(&pdev->dev),
+ DFL_PORT_REGION_INDEX_STP,
+ resource_size(res), res->start,
+ DFL_PORT_REGION_MMAP | DFL_PORT_REGION_READ |
+ DFL_PORT_REGION_WRITE);
+}
+
+static void port_stp_uinit(struct platform_device *pdev,
+ struct dfl_feature *feature)
+{
+ dev_dbg(&pdev->dev, "PORT STP UInit.\n");
+}
+
+static const struct dfl_feature_id port_stp_id_table[] = {
+ {.id = PORT_FEATURE_ID_STP,},
+ {0,}
+};
+
+static const struct dfl_feature_ops port_stp_ops = {
+ .init = port_stp_init,
+ .uinit = port_stp_uinit,
+};
+
static struct dfl_feature_driver port_feature_drvs[] = {
{
.id_table = port_hdr_id_table,
@@ -527,6 +557,10 @@ static void port_afu_uinit(struct platform_device *pdev,
.ops = &port_err_ops,
},
{
+ .id_table = port_stp_id_table,
+ .ops = &port_stp_ops,
+ },
+ {
.ops = NULL,
}
};
--
1.8.3.1
^ permalink raw reply related
* [PATCH v4 12/15] fpga: dfl: afu: add error reporting support.
From: Wu Hao @ 2019-06-27 4:44 UTC (permalink / raw)
To: mdf, linux-fpga, linux-kernel; +Cc: linux-api, yilun.xu, hao.wu, gregkh, atull
In-Reply-To: <1561610695-5414-1-git-send-email-hao.wu@intel.com>
Error reporting is one important private feature, it reports error
detected on port and accelerated function unit (AFU). It introduces
several sysfs interfaces to allow userspace to check and clear
errors detected by hardware.
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Acked-by: Alan Tull <atull@kernel.org>
---
v2: add more error code description for error clear sysfs in doc.
return -EINVAL instead of -EBUSY when input error code doesn't
match in error clear sysfs.
v3: replace scnprintf with sprintf in sysfs interfaces.
update sysfs doc kernel version and date.
v4: update sysfs doc date.
---
Documentation/ABI/testing/sysfs-platform-dfl-port | 39 ++++
drivers/fpga/Makefile | 1 +
drivers/fpga/dfl-afu-error.c | 225 ++++++++++++++++++++++
drivers/fpga/dfl-afu-main.c | 4 +
drivers/fpga/dfl-afu.h | 4 +
5 files changed, 273 insertions(+)
create mode 100644 drivers/fpga/dfl-afu-error.c
diff --git a/Documentation/ABI/testing/sysfs-platform-dfl-port b/Documentation/ABI/testing/sysfs-platform-dfl-port
index 04ea7f2..4aeca94 100644
--- a/Documentation/ABI/testing/sysfs-platform-dfl-port
+++ b/Documentation/ABI/testing/sysfs-platform-dfl-port
@@ -79,3 +79,42 @@ KernelVersion: 5.3
Contact: Wu Hao <hao.wu@intel.com>
Description: Read-only. Read this file to get the status of issued command
to userclck_freqcntrcmd.
+
+What: /sys/bus/platform/devices/dfl-port.0/errors/revision
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-only. Read this file to get the revision of this error
+ reporting private feature.
+
+What: /sys/bus/platform/devices/dfl-port.0/errors/errors
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-only. Read this file to get errors detected on port and
+ Accelerated Function Unit (AFU).
+
+What: /sys/bus/platform/devices/dfl-port.0/errors/first_error
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-only. Read this file to get the first error detected by
+ hardware.
+
+What: /sys/bus/platform/devices/dfl-port.0/errors/first_malformed_req
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-only. Read this file to get the first malformed request
+ captured by hardware.
+
+What: /sys/bus/platform/devices/dfl-port.0/errors/clear
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Write-only. Write error code to this file to clear errors.
+ Write fails with -EINVAL if input parsing fails or input error
+ code doesn't match.
+ Write fails with -EBUSY or -ETIMEDOUT if error can't be cleared
+ as hardware is in low power state (-EBUSY) or not responding
+ (-ETIMEDOUT).
diff --git a/drivers/fpga/Makefile b/drivers/fpga/Makefile
index 312b937..7255891 100644
--- a/drivers/fpga/Makefile
+++ b/drivers/fpga/Makefile
@@ -41,6 +41,7 @@ obj-$(CONFIG_FPGA_DFL_AFU) += dfl-afu.o
dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o
dfl-afu-objs := dfl-afu-main.o dfl-afu-region.o dfl-afu-dma-region.o
+dfl-afu-objs += dfl-afu-error.o
# Drivers for FPGAs which implement DFL
obj-$(CONFIG_FPGA_DFL_PCI) += dfl-pci.o
diff --git a/drivers/fpga/dfl-afu-error.c b/drivers/fpga/dfl-afu-error.c
new file mode 100644
index 0000000..f20dbdf
--- /dev/null
+++ b/drivers/fpga/dfl-afu-error.c
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Driver for FPGA Accelerated Function Unit (AFU) Error Reporting
+ *
+ * Copyright 2019 Intel Corporation, Inc.
+ *
+ * Authors:
+ * Wu Hao <hao.wu@linux.intel.com>
+ * Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ * Joseph Grecco <joe.grecco@intel.com>
+ * Enno Luebbers <enno.luebbers@intel.com>
+ * Tim Whisonant <tim.whisonant@intel.com>
+ * Ananda Ravuri <ananda.ravuri@intel.com>
+ * Mitchel Henry <henry.mitchel@intel.com>
+ */
+
+#include <linux/uaccess.h>
+
+#include "dfl-afu.h"
+
+#define PORT_ERROR_MASK 0x8
+#define PORT_ERROR 0x10
+#define PORT_FIRST_ERROR 0x18
+#define PORT_MALFORMED_REQ0 0x20
+#define PORT_MALFORMED_REQ1 0x28
+
+#define ERROR_MASK GENMASK_ULL(63, 0)
+
+/* mask or unmask port errors by the error mask register. */
+static void __port_err_mask(struct device *dev, bool mask)
+{
+ void __iomem *base;
+
+ base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_ERROR);
+
+ writeq(mask ? ERROR_MASK : 0, base + PORT_ERROR_MASK);
+}
+
+/* clear port errors. */
+static int __port_err_clear(struct device *dev, u64 err)
+{
+ struct platform_device *pdev = to_platform_device(dev);
+ void __iomem *base_err, *base_hdr;
+ int ret;
+ u64 v;
+
+ base_err = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_ERROR);
+ base_hdr = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_HEADER);
+
+ /*
+ * clear Port Errors
+ *
+ * - Check for AP6 State
+ * - Halt Port by keeping Port in reset
+ * - Set PORT Error mask to all 1 to mask errors
+ * - Clear all errors
+ * - Set Port mask to all 0 to enable errors
+ * - All errors start capturing new errors
+ * - Enable Port by pulling the port out of reset
+ */
+
+ /* if device is still in AP6 power state, can not clear any error. */
+ v = readq(base_hdr + PORT_HDR_STS);
+ if (FIELD_GET(PORT_STS_PWR_STATE, v) == PORT_STS_PWR_STATE_AP6) {
+ dev_err(dev, "Could not clear errors, device in AP6 state.\n");
+ return -EBUSY;
+ }
+
+ /* Halt Port by keeping Port in reset */
+ ret = __port_disable(pdev);
+ if (ret)
+ return ret;
+
+ /* Mask all errors */
+ __port_err_mask(dev, true);
+
+ /* Clear errors if err input matches with current port errors.*/
+ v = readq(base_err + PORT_ERROR);
+
+ if (v == err) {
+ writeq(v, base_err + PORT_ERROR);
+
+ v = readq(base_err + PORT_FIRST_ERROR);
+ writeq(v, base_err + PORT_FIRST_ERROR);
+ } else {
+ ret = -EINVAL;
+ }
+
+ /* Clear mask */
+ __port_err_mask(dev, false);
+
+ /* Enable the Port by clear the reset */
+ __port_enable(pdev);
+
+ return ret;
+}
+
+static ssize_t revision_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ void __iomem *base;
+
+ base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_ERROR);
+
+ return sprintf(buf, "%u\n", dfl_feature_revision(base));
+}
+static DEVICE_ATTR_RO(revision);
+
+static ssize_t errors_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct dfl_feature_platform_data *pdata = dev_get_platdata(dev);
+ void __iomem *base;
+ u64 error;
+
+ base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_ERROR);
+
+ mutex_lock(&pdata->lock);
+ error = readq(base + PORT_ERROR);
+ mutex_unlock(&pdata->lock);
+
+ return sprintf(buf, "0x%llx\n", (unsigned long long)error);
+}
+static DEVICE_ATTR_RO(errors);
+
+static ssize_t first_error_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct dfl_feature_platform_data *pdata = dev_get_platdata(dev);
+ void __iomem *base;
+ u64 error;
+
+ base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_ERROR);
+
+ mutex_lock(&pdata->lock);
+ error = readq(base + PORT_FIRST_ERROR);
+ mutex_unlock(&pdata->lock);
+
+ return sprintf(buf, "0x%llx\n", (unsigned long long)error);
+}
+static DEVICE_ATTR_RO(first_error);
+
+static ssize_t first_malformed_req_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct dfl_feature_platform_data *pdata = dev_get_platdata(dev);
+ void __iomem *base;
+ u64 req0, req1;
+
+ base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_ERROR);
+
+ mutex_lock(&pdata->lock);
+ req0 = readq(base + PORT_MALFORMED_REQ0);
+ req1 = readq(base + PORT_MALFORMED_REQ1);
+ mutex_unlock(&pdata->lock);
+
+ return sprintf(buf, "0x%016llx%016llx\n",
+ (unsigned long long)req1, (unsigned long long)req0);
+}
+static DEVICE_ATTR_RO(first_malformed_req);
+
+static ssize_t clear_store(struct device *dev, struct device_attribute *attr,
+ const char *buff, size_t count)
+{
+ struct dfl_feature_platform_data *pdata = dev_get_platdata(dev);
+ u64 value;
+ int ret;
+
+ if (kstrtou64(buff, 0, &value))
+ return -EINVAL;
+
+ mutex_lock(&pdata->lock);
+ ret = __port_err_clear(dev, value);
+ mutex_unlock(&pdata->lock);
+
+ return ret ? ret : count;
+}
+static DEVICE_ATTR_WO(clear);
+
+static struct attribute *port_err_attrs[] = {
+ &dev_attr_revision.attr,
+ &dev_attr_errors.attr,
+ &dev_attr_first_error.attr,
+ &dev_attr_first_malformed_req.attr,
+ &dev_attr_clear.attr,
+ NULL,
+};
+
+static struct attribute_group port_err_attr_group = {
+ .attrs = port_err_attrs,
+ .name = "errors",
+};
+
+static int port_err_init(struct platform_device *pdev,
+ struct dfl_feature *feature)
+{
+ struct dfl_feature_platform_data *pdata = dev_get_platdata(&pdev->dev);
+
+ dev_dbg(&pdev->dev, "PORT ERR Init.\n");
+
+ mutex_lock(&pdata->lock);
+ __port_err_mask(&pdev->dev, false);
+ mutex_unlock(&pdata->lock);
+
+ return sysfs_create_group(&pdev->dev.kobj, &port_err_attr_group);
+}
+
+static void port_err_uinit(struct platform_device *pdev,
+ struct dfl_feature *feature)
+{
+ dev_dbg(&pdev->dev, "PORT ERR UInit.\n");
+
+ sysfs_remove_group(&pdev->dev.kobj, &port_err_attr_group);
+}
+
+const struct dfl_feature_id port_err_id_table[] = {
+ {.id = PORT_FEATURE_ID_ERROR,},
+ {0,}
+};
+
+const struct dfl_feature_ops port_err_ops = {
+ .init = port_err_init,
+ .uinit = port_err_uinit,
+};
diff --git a/drivers/fpga/dfl-afu-main.c b/drivers/fpga/dfl-afu-main.c
index c8bc0b5..bcf6e28 100644
--- a/drivers/fpga/dfl-afu-main.c
+++ b/drivers/fpga/dfl-afu-main.c
@@ -523,6 +523,10 @@ static void port_afu_uinit(struct platform_device *pdev,
.ops = &port_afu_ops,
},
{
+ .id_table = port_err_id_table,
+ .ops = &port_err_ops,
+ },
+ {
.ops = NULL,
}
};
diff --git a/drivers/fpga/dfl-afu.h b/drivers/fpga/dfl-afu.h
index 35e60c5..c3182a2 100644
--- a/drivers/fpga/dfl-afu.h
+++ b/drivers/fpga/dfl-afu.h
@@ -100,4 +100,8 @@ int afu_dma_map_region(struct dfl_feature_platform_data *pdata,
struct dfl_afu_dma_region *
afu_dma_region_find(struct dfl_feature_platform_data *pdata,
u64 iova, u64 size);
+
+extern const struct dfl_feature_ops port_err_ops;
+extern const struct dfl_feature_id port_err_id_table[];
+
#endif /* __DFL_AFU_H */
--
1.8.3.1
^ permalink raw reply related
* [PATCH v4 11/15] fpga: dfl: afu: export __port_enable/disable function.
From: Wu Hao @ 2019-06-27 4:44 UTC (permalink / raw)
To: mdf, linux-fpga, linux-kernel; +Cc: linux-api, yilun.xu, hao.wu, gregkh, atull
In-Reply-To: <1561610695-5414-1-git-send-email-hao.wu@intel.com>
As these two functions are used by other private features. e.g.
in error reporting private feature, it requires to check port status
and reset port for error clearing.
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Acked-by: Moritz Fischer <mdf@kernel.org>
Acked-by: Alan Tull <atull@kernel.org>
---
drivers/fpga/dfl-afu-main.c | 25 ++++++++++++++-----------
drivers/fpga/dfl-afu.h | 3 +++
2 files changed, 17 insertions(+), 11 deletions(-)
diff --git a/drivers/fpga/dfl-afu-main.c b/drivers/fpga/dfl-afu-main.c
index 65b3e89..c8bc0b5 100644
--- a/drivers/fpga/dfl-afu-main.c
+++ b/drivers/fpga/dfl-afu-main.c
@@ -24,14 +24,16 @@
#define DRV_VERSION "0.8"
/**
- * port_enable - enable a port
+ * __port_enable - enable a port
* @pdev: port platform device.
*
* Enable Port by clear the port soft reset bit, which is set by default.
* The AFU is unable to respond to any MMIO access while in reset.
- * port_enable function should only be used after port_disable function.
+ * __port_enable function should only be used after __port_disable function.
+ *
+ * The caller needs to hold lock for protection.
*/
-static void port_enable(struct platform_device *pdev)
+void __port_enable(struct platform_device *pdev)
{
struct dfl_feature_platform_data *pdata = dev_get_platdata(&pdev->dev);
void __iomem *base;
@@ -54,13 +56,14 @@ static void port_enable(struct platform_device *pdev)
#define RST_POLL_TIMEOUT 1000 /* us */
/**
- * port_disable - disable a port
+ * __port_disable - disable a port
* @pdev: port platform device.
*
- * Disable Port by setting the port soft reset bit, it puts the port into
- * reset.
+ * Disable Port by setting the port soft reset bit, it puts the port into reset.
+ *
+ * The caller needs to hold lock for protection.
*/
-static int port_disable(struct platform_device *pdev)
+int __port_disable(struct platform_device *pdev)
{
struct dfl_feature_platform_data *pdata = dev_get_platdata(&pdev->dev);
void __iomem *base;
@@ -106,9 +109,9 @@ static int __port_reset(struct platform_device *pdev)
{
int ret;
- ret = port_disable(pdev);
+ ret = __port_disable(pdev);
if (!ret)
- port_enable(pdev);
+ __port_enable(pdev);
return ret;
}
@@ -805,9 +808,9 @@ static int port_enable_set(struct platform_device *pdev, bool enable)
mutex_lock(&pdata->lock);
if (enable)
- port_enable(pdev);
+ __port_enable(pdev);
else
- ret = port_disable(pdev);
+ ret = __port_disable(pdev);
mutex_unlock(&pdata->lock);
return ret;
diff --git a/drivers/fpga/dfl-afu.h b/drivers/fpga/dfl-afu.h
index 0c7630a..35e60c5 100644
--- a/drivers/fpga/dfl-afu.h
+++ b/drivers/fpga/dfl-afu.h
@@ -79,6 +79,9 @@ struct dfl_afu {
struct dfl_feature_platform_data *pdata;
};
+void __port_enable(struct platform_device *pdev);
+int __port_disable(struct platform_device *pdev);
+
void afu_mmio_region_init(struct dfl_feature_platform_data *pdata);
int afu_mmio_region_add(struct dfl_feature_platform_data *pdata,
u32 region_index, u64 region_size, u64 phys, u32 flags);
--
1.8.3.1
^ permalink raw reply related
* [PATCH v4 10/15] fpga: dfl: add id_table for dfl private feature driver
From: Wu Hao @ 2019-06-27 4:44 UTC (permalink / raw)
To: mdf, linux-fpga, linux-kernel; +Cc: linux-api, yilun.xu, hao.wu, gregkh, atull
In-Reply-To: <1561610695-5414-1-git-send-email-hao.wu@intel.com>
This patch adds id_table for each dfl private feature driver,
it allows to reuse same private feature driver to match and support
multiple dfl private features.
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Acked-by: Moritz Fischer <mdf@kernel.org>
Acked-by: Alan Tull <atull@kernel.org>
---
drivers/fpga/dfl-afu-main.c | 14 ++++++++++++--
drivers/fpga/dfl-fme-main.c | 11 ++++++++---
drivers/fpga/dfl-fme-pr.c | 7 ++++++-
drivers/fpga/dfl-fme.h | 3 ++-
drivers/fpga/dfl.c | 21 +++++++++++++++++++--
drivers/fpga/dfl.h | 21 +++++++++++++++------
6 files changed, 62 insertions(+), 15 deletions(-)
diff --git a/drivers/fpga/dfl-afu-main.c b/drivers/fpga/dfl-afu-main.c
index 8b434a4..65b3e89 100644
--- a/drivers/fpga/dfl-afu-main.c
+++ b/drivers/fpga/dfl-afu-main.c
@@ -435,6 +435,11 @@ static void port_hdr_uinit(struct platform_device *pdev,
return ret;
}
+static const struct dfl_feature_id port_hdr_id_table[] = {
+ {.id = PORT_FEATURE_ID_HEADER,},
+ {0,}
+};
+
static const struct dfl_feature_ops port_hdr_ops = {
.init = port_hdr_init,
.uinit = port_hdr_uinit,
@@ -495,6 +500,11 @@ static void port_afu_uinit(struct platform_device *pdev,
sysfs_remove_files(&pdev->dev.kobj, port_afu_attrs);
}
+static const struct dfl_feature_id port_afu_id_table[] = {
+ {.id = PORT_FEATURE_ID_AFU,},
+ {0,}
+};
+
static const struct dfl_feature_ops port_afu_ops = {
.init = port_afu_init,
.uinit = port_afu_uinit,
@@ -502,11 +512,11 @@ static void port_afu_uinit(struct platform_device *pdev,
static struct dfl_feature_driver port_feature_drvs[] = {
{
- .id = PORT_FEATURE_ID_HEADER,
+ .id_table = port_hdr_id_table,
.ops = &port_hdr_ops,
},
{
- .id = PORT_FEATURE_ID_AFU,
+ .id_table = port_afu_id_table,
.ops = &port_afu_ops,
},
{
diff --git a/drivers/fpga/dfl-fme-main.c b/drivers/fpga/dfl-fme-main.c
index 8b2a337..38c6342 100644
--- a/drivers/fpga/dfl-fme-main.c
+++ b/drivers/fpga/dfl-fme-main.c
@@ -158,6 +158,11 @@ static long fme_hdr_ioctl(struct platform_device *pdev,
return -ENODEV;
}
+static const struct dfl_feature_id fme_hdr_id_table[] = {
+ {.id = FME_FEATURE_ID_HEADER,},
+ {0,}
+};
+
static const struct dfl_feature_ops fme_hdr_ops = {
.init = fme_hdr_init,
.uinit = fme_hdr_uinit,
@@ -166,12 +171,12 @@ static long fme_hdr_ioctl(struct platform_device *pdev,
static struct dfl_feature_driver fme_feature_drvs[] = {
{
- .id = FME_FEATURE_ID_HEADER,
+ .id_table = fme_hdr_id_table,
.ops = &fme_hdr_ops,
},
{
- .id = FME_FEATURE_ID_PR_MGMT,
- .ops = &pr_mgmt_ops,
+ .id_table = fme_pr_mgmt_id_table,
+ .ops = &fme_pr_mgmt_ops,
},
{
.ops = NULL,
diff --git a/drivers/fpga/dfl-fme-pr.c b/drivers/fpga/dfl-fme-pr.c
index cd94ba8..52f1745 100644
--- a/drivers/fpga/dfl-fme-pr.c
+++ b/drivers/fpga/dfl-fme-pr.c
@@ -483,7 +483,12 @@ static long fme_pr_ioctl(struct platform_device *pdev,
return ret;
}
-const struct dfl_feature_ops pr_mgmt_ops = {
+const struct dfl_feature_id fme_pr_mgmt_id_table[] = {
+ {.id = FME_FEATURE_ID_PR_MGMT,},
+ {0}
+};
+
+const struct dfl_feature_ops fme_pr_mgmt_ops = {
.init = pr_mgmt_init,
.uinit = pr_mgmt_uinit,
.ioctl = fme_pr_ioctl,
diff --git a/drivers/fpga/dfl-fme.h b/drivers/fpga/dfl-fme.h
index de20755..7a021c4 100644
--- a/drivers/fpga/dfl-fme.h
+++ b/drivers/fpga/dfl-fme.h
@@ -35,6 +35,7 @@ struct dfl_fme {
struct dfl_feature_platform_data *pdata;
};
-extern const struct dfl_feature_ops pr_mgmt_ops;
+extern const struct dfl_feature_ops fme_pr_mgmt_ops;
+extern const struct dfl_feature_id fme_pr_mgmt_id_table[];
#endif /* __DFL_FME_H */
diff --git a/drivers/fpga/dfl.c b/drivers/fpga/dfl.c
index 28d61b6..1bb2b58 100644
--- a/drivers/fpga/dfl.c
+++ b/drivers/fpga/dfl.c
@@ -14,6 +14,8 @@
#include "dfl.h"
+#define DRV_VERSION "0.8"
+
static DEFINE_MUTEX(dfl_id_mutex);
/*
@@ -281,6 +283,21 @@ static int dfl_feature_instance_init(struct platform_device *pdev,
return ret;
}
+static bool dfl_feature_drv_match(struct dfl_feature *feature,
+ struct dfl_feature_driver *driver)
+{
+ const struct dfl_feature_id *ids = driver->id_table;
+
+ if (ids) {
+ while (ids->id) {
+ if (ids->id == feature->id)
+ return true;
+ ids++;
+ }
+ }
+ return false;
+}
+
/**
* dfl_fpga_dev_feature_init - init for sub features of dfl feature device
* @pdev: feature device.
@@ -301,8 +318,7 @@ int dfl_fpga_dev_feature_init(struct platform_device *pdev,
while (drv->ops) {
dfl_fpga_dev_for_each_feature(pdata, feature) {
- /* match feature and drv using id */
- if (feature->id == drv->id) {
+ if (dfl_feature_drv_match(feature, drv)) {
ret = dfl_feature_instance_init(pdev, pdata,
feature, drv);
if (ret)
@@ -1178,3 +1194,4 @@ static void __exit dfl_fpga_exit(void)
MODULE_DESCRIPTION("FPGA Device Feature List (DFL) Support");
MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
+MODULE_VERSION(DRV_VERSION);
diff --git a/drivers/fpga/dfl.h b/drivers/fpga/dfl.h
index 3c5dc3a..fbc57f0 100644
--- a/drivers/fpga/dfl.h
+++ b/drivers/fpga/dfl.h
@@ -30,8 +30,8 @@
/* plus one for fme device */
#define MAX_DFL_FEATURE_DEV_NUM (MAX_DFL_FPGA_PORT_NUM + 1)
-/* Reserved 0x0 for Header Group Register and 0xff for AFU */
-#define FEATURE_ID_FIU_HEADER 0x0
+/* Reserved 0xfe for Header Group Register and 0xff for AFU */
+#define FEATURE_ID_FIU_HEADER 0xfe
#define FEATURE_ID_AFU 0xff
#define FME_FEATURE_ID_HEADER FEATURE_ID_FIU_HEADER
@@ -169,13 +169,22 @@ struct dfl_fpga_port_ops {
int dfl_fpga_check_port_id(struct platform_device *pdev, void *pport_id);
/**
- * struct dfl_feature_driver - sub feature's driver
+ * struct dfl_feature_id - dfl private feature id
*
- * @id: sub feature id.
- * @ops: ops of this sub feature.
+ * @id: unique dfl private feature id.
*/
-struct dfl_feature_driver {
+struct dfl_feature_id {
u64 id;
+};
+
+/**
+ * struct dfl_feature_driver - dfl private feature driver
+ *
+ * @id_table: id_table for dfl private features supported by this driver.
+ * @ops: ops of this dfl private feature driver.
+ */
+struct dfl_feature_driver {
+ const struct dfl_feature_id *id_table;
const struct dfl_feature_ops *ops;
};
--
1.8.3.1
^ permalink raw reply related
* [PATCH v4 09/15] fpga: dfl: afu: add userclock sysfs interfaces.
From: Wu Hao @ 2019-06-27 4:44 UTC (permalink / raw)
To: mdf, linux-fpga, linux-kernel
Cc: linux-api, yilun.xu, hao.wu, gregkh, atull, Ananda Ravuri,
Russ Weight
In-Reply-To: <1561610695-5414-1-git-send-email-hao.wu@intel.com>
This patch introduces userclock sysfs interfaces for AFU, user
could use these interfaces for clock setting to AFU.
Please note that, this is only working for port header feature
with revision 0, for later revisions, userclock setting is moved
to a separated private feature, so one revision sysfs interface
is exposed to userspace application for this purpose too.
Signed-off-by: Ananda Ravuri <ananda.ravuri@intel.com>
Signed-off-by: Russ Weight <russell.h.weight@intel.com>
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Acked-by: Alan Tull <atull@kernel.org>
---
v3: replace scnprintf with sprintf in sysfs interfaces.
update sysfs doc kernel version and date.
v4: update sysfs doc date.
---
Documentation/ABI/testing/sysfs-platform-dfl-port | 35 +++++++
drivers/fpga/dfl-afu-main.c | 113 +++++++++++++++++++++-
drivers/fpga/dfl.h | 4 +
3 files changed, 151 insertions(+), 1 deletion(-)
diff --git a/Documentation/ABI/testing/sysfs-platform-dfl-port b/Documentation/ABI/testing/sysfs-platform-dfl-port
index 17b37d1..04ea7f2 100644
--- a/Documentation/ABI/testing/sysfs-platform-dfl-port
+++ b/Documentation/ABI/testing/sysfs-platform-dfl-port
@@ -44,3 +44,38 @@ Contact: Wu Hao <hao.wu@intel.com>
Description: Read-write. Read and set AFU latency tolerance reporting value.
Set ltr to 1 if the AFU can tolerate latency >= 40us or set it
to 0 if it is latency sensitive.
+
+What: /sys/bus/platform/devices/dfl-port.0/revision
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-only. Read this file to get the revision of port header
+ feature.
+
+What: /sys/bus/platform/devices/dfl-port.0/userclk_freqcmd
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Write-only. User writes command to this interface to set
+ userclock to AFU.
+
+What: /sys/bus/platform/devices/dfl-port.0/userclk_freqsts
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-only. Read this file to get the status of issued command
+ to userclck_freqcmd.
+
+What: /sys/bus/platform/devices/dfl-port.0/userclk_freqcntrcmd
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Write-only. User writes command to this interface to set
+ userclock counter.
+
+What: /sys/bus/platform/devices/dfl-port.0/userclk_freqcntrsts
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-only. Read this file to get the status of issued command
+ to userclck_freqcntrcmd.
diff --git a/drivers/fpga/dfl-afu-main.c b/drivers/fpga/dfl-afu-main.c
index 040ed8a..8b434a4 100644
--- a/drivers/fpga/dfl-afu-main.c
+++ b/drivers/fpga/dfl-afu-main.c
@@ -144,6 +144,17 @@ static int port_get_id(struct platform_device *pdev)
static DEVICE_ATTR_RO(id);
static ssize_t
+revision_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ void __iomem *base;
+
+ base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_HEADER);
+
+ return sprintf(buf, "%x\n", dfl_feature_revision(base));
+}
+static DEVICE_ATTR_RO(revision);
+
+static ssize_t
ltr_show(struct device *dev, struct device_attribute *attr, char *buf)
{
struct dfl_feature_platform_data *pdata = dev_get_platdata(dev);
@@ -278,6 +289,7 @@ static int port_get_id(struct platform_device *pdev)
static const struct attribute *port_hdr_attrs[] = {
&dev_attr_id.attr,
+ &dev_attr_revision.attr,
&dev_attr_ltr.attr,
&dev_attr_ap1_event.attr,
&dev_attr_ap2_event.attr,
@@ -285,14 +297,112 @@ static int port_get_id(struct platform_device *pdev)
NULL,
};
+static ssize_t
+userclk_freqcmd_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct dfl_feature_platform_data *pdata = dev_get_platdata(dev);
+ u64 userclk_freq_cmd;
+ void __iomem *base;
+
+ if (kstrtou64(buf, 0, &userclk_freq_cmd))
+ return -EINVAL;
+
+ base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_HEADER);
+
+ mutex_lock(&pdata->lock);
+ writeq(userclk_freq_cmd, base + PORT_HDR_USRCLK_CMD0);
+ mutex_unlock(&pdata->lock);
+
+ return count;
+}
+static DEVICE_ATTR_WO(userclk_freqcmd);
+
+static ssize_t
+userclk_freqcntrcmd_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct dfl_feature_platform_data *pdata = dev_get_platdata(dev);
+ u64 userclk_freqcntr_cmd;
+ void __iomem *base;
+
+ if (kstrtou64(buf, 0, &userclk_freqcntr_cmd))
+ return -EINVAL;
+
+ base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_HEADER);
+
+ mutex_lock(&pdata->lock);
+ writeq(userclk_freqcntr_cmd, base + PORT_HDR_USRCLK_CMD1);
+ mutex_unlock(&pdata->lock);
+
+ return count;
+}
+static DEVICE_ATTR_WO(userclk_freqcntrcmd);
+
+static ssize_t
+userclk_freqsts_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ u64 userclk_freqsts;
+ void __iomem *base;
+
+ base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_HEADER);
+
+ userclk_freqsts = readq(base + PORT_HDR_USRCLK_STS0);
+
+ return sprintf(buf, "0x%llx\n", (unsigned long long)userclk_freqsts);
+}
+static DEVICE_ATTR_RO(userclk_freqsts);
+
+static ssize_t
+userclk_freqcntrsts_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ u64 userclk_freqcntrsts;
+ void __iomem *base;
+
+ base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_HEADER);
+
+ userclk_freqcntrsts = readq(base + PORT_HDR_USRCLK_STS1);
+
+ return sprintf(buf, "0x%llx\n",
+ (unsigned long long)userclk_freqcntrsts);
+}
+static DEVICE_ATTR_RO(userclk_freqcntrsts);
+
+static const struct attribute *port_hdr_userclk_attrs[] = {
+ &dev_attr_userclk_freqcmd.attr,
+ &dev_attr_userclk_freqcntrcmd.attr,
+ &dev_attr_userclk_freqsts.attr,
+ &dev_attr_userclk_freqcntrsts.attr,
+ NULL,
+};
+
static int port_hdr_init(struct platform_device *pdev,
struct dfl_feature *feature)
{
+ int ret;
+
dev_dbg(&pdev->dev, "PORT HDR Init.\n");
port_reset(pdev);
- return sysfs_create_files(&pdev->dev.kobj, port_hdr_attrs);
+ ret = sysfs_create_files(&pdev->dev.kobj, port_hdr_attrs);
+ if (ret)
+ return ret;
+
+ /*
+ * if revision > 0, the userclock will be moved from port hdr register
+ * region to a separated private feature.
+ */
+ if (dfl_feature_revision(feature->ioaddr) > 0)
+ return 0;
+
+ ret = sysfs_create_files(&pdev->dev.kobj, port_hdr_userclk_attrs);
+ if (ret)
+ sysfs_remove_files(&pdev->dev.kobj, port_hdr_attrs);
+
+ return ret;
}
static void port_hdr_uinit(struct platform_device *pdev,
@@ -300,6 +410,7 @@ static void port_hdr_uinit(struct platform_device *pdev,
{
dev_dbg(&pdev->dev, "PORT HDR UInit.\n");
+ sysfs_remove_files(&pdev->dev.kobj, port_hdr_userclk_attrs);
sysfs_remove_files(&pdev->dev.kobj, port_hdr_attrs);
}
diff --git a/drivers/fpga/dfl.h b/drivers/fpga/dfl.h
index 1525098..3c5dc3a 100644
--- a/drivers/fpga/dfl.h
+++ b/drivers/fpga/dfl.h
@@ -120,6 +120,10 @@
#define PORT_HDR_CAP 0x30
#define PORT_HDR_CTRL 0x38
#define PORT_HDR_STS 0x40
+#define PORT_HDR_USRCLK_CMD0 0x50
+#define PORT_HDR_USRCLK_CMD1 0x58
+#define PORT_HDR_USRCLK_STS0 0x60
+#define PORT_HDR_USRCLK_STS1 0x68
/* Port Capability Register Bitfield */
#define PORT_CAP_PORT_NUM GENMASK_ULL(1, 0) /* ID of this port */
--
1.8.3.1
^ permalink raw reply related
* [PATCH v4 08/15] fpga: dfl: afu: add AFU state related sysfs interfaces
From: Wu Hao @ 2019-06-27 4:44 UTC (permalink / raw)
To: mdf, linux-fpga, linux-kernel
Cc: linux-api, yilun.xu, hao.wu, gregkh, atull, Ananda Ravuri
In-Reply-To: <1561610695-5414-1-git-send-email-hao.wu@intel.com>
This patch introduces more sysfs interfaces for Accelerated
Function Unit (AFU). These interfaces allow users to read
current AFU Power State (APx), read / clear AFU Power (APx)
events which are sticky to identify transient APx state,
and manage AFU's LTR (latency tolerance reporting).
Signed-off-by: Ananda Ravuri <ananda.ravuri@intel.com>
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Acked-by: Alan Tull <atull@kernel.org>
---
v3: replace scnprintf with sprintf in sysfs interfaces.
update sysfs doc kernel version and date.
v4: update sysfs doc date.
---
Documentation/ABI/testing/sysfs-platform-dfl-port | 30 +++++
drivers/fpga/dfl-afu-main.c | 140 ++++++++++++++++++++++
drivers/fpga/dfl.h | 11 ++
3 files changed, 181 insertions(+)
diff --git a/Documentation/ABI/testing/sysfs-platform-dfl-port b/Documentation/ABI/testing/sysfs-platform-dfl-port
index 6a92dda..17b37d1 100644
--- a/Documentation/ABI/testing/sysfs-platform-dfl-port
+++ b/Documentation/ABI/testing/sysfs-platform-dfl-port
@@ -14,3 +14,33 @@ Description: Read-only. User can program different PR bitstreams to FPGA
Accelerator Function Unit (AFU) for different functions. It
returns uuid which could be used to identify which PR bitstream
is programmed in this AFU.
+
+What: /sys/bus/platform/devices/dfl-port.0/power_state
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-only. It reports the APx (AFU Power) state, different APx
+ means different throttling level. When reading this file, it
+ returns "0" - Normal / "1" - AP1 / "2" - AP2 / "6" - AP6.
+
+What: /sys/bus/platform/devices/dfl-port.0/ap1_event
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-write. Read or set 1 to clear AP1 (AFU Power State 1)
+ event. It's used to indicate transient AP1 state.
+
+What: /sys/bus/platform/devices/dfl-port.0/ap2_event
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-write. Read or set 1 to clear AP2 (AFU Power State 2)
+ event. It's used to indicate transient AP2 state.
+
+What: /sys/bus/platform/devices/dfl-port.0/ltr
+Date: June 2019
+KernelVersion: 5.3
+Contact: Wu Hao <hao.wu@intel.com>
+Description: Read-write. Read and set AFU latency tolerance reporting value.
+ Set ltr to 1 if the AFU can tolerate latency >= 40us or set it
+ to 0 if it is latency sensitive.
diff --git a/drivers/fpga/dfl-afu-main.c b/drivers/fpga/dfl-afu-main.c
index 02baa6a..040ed8a 100644
--- a/drivers/fpga/dfl-afu-main.c
+++ b/drivers/fpga/dfl-afu-main.c
@@ -21,6 +21,8 @@
#include "dfl-afu.h"
+#define DRV_VERSION "0.8"
+
/**
* port_enable - enable a port
* @pdev: port platform device.
@@ -141,8 +143,145 @@ static int port_get_id(struct platform_device *pdev)
}
static DEVICE_ATTR_RO(id);
+static ssize_t
+ltr_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ struct dfl_feature_platform_data *pdata = dev_get_platdata(dev);
+ void __iomem *base;
+ u64 v;
+
+ base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_HEADER);
+
+ mutex_lock(&pdata->lock);
+ v = readq(base + PORT_HDR_CTRL);
+ mutex_unlock(&pdata->lock);
+
+ return sprintf(buf, "%x\n", (u8)FIELD_GET(PORT_CTRL_LATENCY, v));
+}
+
+static ssize_t
+ltr_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct dfl_feature_platform_data *pdata = dev_get_platdata(dev);
+ void __iomem *base;
+ u8 ltr;
+ u64 v;
+
+ if (kstrtou8(buf, 0, <r) || ltr > 1)
+ return -EINVAL;
+
+ base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_HEADER);
+
+ mutex_lock(&pdata->lock);
+ v = readq(base + PORT_HDR_CTRL);
+ v &= ~PORT_CTRL_LATENCY;
+ v |= FIELD_PREP(PORT_CTRL_LATENCY, ltr);
+ writeq(v, base + PORT_HDR_CTRL);
+ mutex_unlock(&pdata->lock);
+
+ return count;
+}
+static DEVICE_ATTR_RW(ltr);
+
+static ssize_t
+ap1_event_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ struct dfl_feature_platform_data *pdata = dev_get_platdata(dev);
+ void __iomem *base;
+ u64 v;
+
+ base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_HEADER);
+
+ mutex_lock(&pdata->lock);
+ v = readq(base + PORT_HDR_STS);
+ mutex_unlock(&pdata->lock);
+
+ return sprintf(buf, "%x\n", (u8)FIELD_GET(PORT_STS_AP1_EVT, v));
+}
+
+static ssize_t
+ap1_event_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct dfl_feature_platform_data *pdata = dev_get_platdata(dev);
+ void __iomem *base;
+ u8 ap1_event;
+
+ if (kstrtou8(buf, 0, &ap1_event) || ap1_event != 1)
+ return -EINVAL;
+
+ base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_HEADER);
+
+ mutex_lock(&pdata->lock);
+ writeq(PORT_STS_AP1_EVT, base + PORT_HDR_STS);
+ mutex_unlock(&pdata->lock);
+
+ return count;
+}
+static DEVICE_ATTR_RW(ap1_event);
+
+static ssize_t
+ap2_event_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct dfl_feature_platform_data *pdata = dev_get_platdata(dev);
+ void __iomem *base;
+ u64 v;
+
+ base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_HEADER);
+
+ mutex_lock(&pdata->lock);
+ v = readq(base + PORT_HDR_STS);
+ mutex_unlock(&pdata->lock);
+
+ return sprintf(buf, "%x\n", (u8)FIELD_GET(PORT_STS_AP2_EVT, v));
+}
+
+static ssize_t
+ap2_event_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct dfl_feature_platform_data *pdata = dev_get_platdata(dev);
+ void __iomem *base;
+ u8 ap2_event;
+
+ if (kstrtou8(buf, 0, &ap2_event) || ap2_event != 1)
+ return -EINVAL;
+
+ base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_HEADER);
+
+ mutex_lock(&pdata->lock);
+ writeq(PORT_STS_AP2_EVT, base + PORT_HDR_STS);
+ mutex_unlock(&pdata->lock);
+
+ return count;
+}
+static DEVICE_ATTR_RW(ap2_event);
+
+static ssize_t
+power_state_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ struct dfl_feature_platform_data *pdata = dev_get_platdata(dev);
+ void __iomem *base;
+ u64 v;
+
+ base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_HEADER);
+
+ mutex_lock(&pdata->lock);
+ v = readq(base + PORT_HDR_STS);
+ mutex_unlock(&pdata->lock);
+
+ return sprintf(buf, "0x%x\n", (u8)FIELD_GET(PORT_STS_PWR_STATE, v));
+}
+static DEVICE_ATTR_RO(power_state);
+
static const struct attribute *port_hdr_attrs[] = {
&dev_attr_id.attr,
+ &dev_attr_ltr.attr,
+ &dev_attr_ap1_event.attr,
+ &dev_attr_ap2_event.attr,
+ &dev_attr_power_state.attr,
NULL,
};
@@ -634,3 +773,4 @@ static void __exit afu_exit(void)
MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS("platform:dfl-port");
+MODULE_VERSION(DRV_VERSION);
diff --git a/drivers/fpga/dfl.h b/drivers/fpga/dfl.h
index 1350e8e..1525098 100644
--- a/drivers/fpga/dfl.h
+++ b/drivers/fpga/dfl.h
@@ -119,6 +119,7 @@
#define PORT_HDR_NEXT_AFU NEXT_AFU
#define PORT_HDR_CAP 0x30
#define PORT_HDR_CTRL 0x38
+#define PORT_HDR_STS 0x40
/* Port Capability Register Bitfield */
#define PORT_CAP_PORT_NUM GENMASK_ULL(1, 0) /* ID of this port */
@@ -130,6 +131,16 @@
/* Latency tolerance reporting. '1' >= 40us, '0' < 40us.*/
#define PORT_CTRL_LATENCY BIT_ULL(2)
#define PORT_CTRL_SFTRST_ACK BIT_ULL(4) /* HW ack for reset */
+
+/* Port Status Register Bitfield */
+#define PORT_STS_AP2_EVT BIT_ULL(13) /* AP2 event detected */
+#define PORT_STS_AP1_EVT BIT_ULL(12) /* AP1 event detected */
+#define PORT_STS_PWR_STATE GENMASK_ULL(11, 8) /* AFU power states */
+#define PORT_STS_PWR_STATE_NORM 0
+#define PORT_STS_PWR_STATE_AP1 1 /* 50% throttling */
+#define PORT_STS_PWR_STATE_AP2 2 /* 90% throttling */
+#define PORT_STS_PWR_STATE_AP6 6 /* 100% throttling */
+
/**
* struct dfl_fpga_port_ops - port ops
*
--
1.8.3.1
^ permalink raw reply related
* [PATCH v4 07/15] fpga: dfl: pci: enable SRIOV support.
From: Wu Hao @ 2019-06-27 4:44 UTC (permalink / raw)
To: mdf, linux-fpga, linux-kernel
Cc: linux-api, yilun.xu, hao.wu, gregkh, atull, Zhang Yi Z
In-Reply-To: <1561610695-5414-1-git-send-email-hao.wu@intel.com>
This patch enables the standard sriov support. It allows user to
enable SRIOV (and VFs), then user could pass through accelerators
(VFs) into virtual machine or use VFs directly in host.
Signed-off-by: Zhang Yi Z <yi.z.zhang@intel.com>
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Acked-by: Alan Tull <atull@kernel.org>
Acked-by: Moritz Fischer <mdf@kernel.org>
---
drivers/fpga/dfl-pci.c | 40 ++++++++++++++++++++++++++++++++++++++++
drivers/fpga/dfl.c | 41 +++++++++++++++++++++++++++++++++++++++++
drivers/fpga/dfl.h | 1 +
3 files changed, 82 insertions(+)
diff --git a/drivers/fpga/dfl-pci.c b/drivers/fpga/dfl-pci.c
index 66b5720..2fa571b 100644
--- a/drivers/fpga/dfl-pci.c
+++ b/drivers/fpga/dfl-pci.c
@@ -223,8 +223,46 @@ int cci_pci_probe(struct pci_dev *pcidev, const struct pci_device_id *pcidevid)
return ret;
}
+static int cci_pci_sriov_configure(struct pci_dev *pcidev, int num_vfs)
+{
+ struct cci_drvdata *drvdata = pci_get_drvdata(pcidev);
+ struct dfl_fpga_cdev *cdev = drvdata->cdev;
+ int ret = 0;
+
+ mutex_lock(&cdev->lock);
+
+ if (!num_vfs) {
+ /*
+ * disable SRIOV and then put released ports back to default
+ * PF access mode.
+ */
+ pci_disable_sriov(pcidev);
+
+ __dfl_fpga_cdev_config_port_vf(cdev, false);
+
+ } else if (cdev->released_port_num == num_vfs) {
+ /*
+ * only enable SRIOV if cdev has matched released ports, put
+ * released ports into VF access mode firstly.
+ */
+ __dfl_fpga_cdev_config_port_vf(cdev, true);
+
+ ret = pci_enable_sriov(pcidev, num_vfs);
+ if (ret)
+ __dfl_fpga_cdev_config_port_vf(cdev, false);
+ } else {
+ ret = -EINVAL;
+ }
+
+ mutex_unlock(&cdev->lock);
+ return ret;
+}
+
static void cci_pci_remove(struct pci_dev *pcidev)
{
+ if (dev_is_pf(&pcidev->dev))
+ cci_pci_sriov_configure(pcidev, 0);
+
cci_remove_feature_devs(pcidev);
pci_disable_pcie_error_reporting(pcidev);
}
@@ -234,6 +272,7 @@ static void cci_pci_remove(struct pci_dev *pcidev)
.id_table = cci_pcie_id_tbl,
.probe = cci_pci_probe,
.remove = cci_pci_remove,
+ .sriov_configure = cci_pci_sriov_configure,
};
module_pci_driver(cci_pci_driver);
@@ -241,3 +280,4 @@ static void cci_pci_remove(struct pci_dev *pcidev)
MODULE_DESCRIPTION("FPGA DFL PCIe Device Driver");
MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
+MODULE_VERSION(DRV_VERSION);
diff --git a/drivers/fpga/dfl.c b/drivers/fpga/dfl.c
index 308c808..28d61b6 100644
--- a/drivers/fpga/dfl.c
+++ b/drivers/fpga/dfl.c
@@ -1112,6 +1112,47 @@ int dfl_fpga_cdev_config_port(struct dfl_fpga_cdev *cdev,
}
EXPORT_SYMBOL_GPL(dfl_fpga_cdev_config_port);
+static void config_port_vf(struct device *fme_dev, int port_id, bool is_vf)
+{
+ void __iomem *base;
+ u64 v;
+
+ base = dfl_get_feature_ioaddr_by_id(fme_dev, FME_FEATURE_ID_HEADER);
+
+ v = readq(base + FME_HDR_PORT_OFST(port_id));
+
+ v &= ~FME_PORT_OFST_ACC_CTRL;
+ v |= FIELD_PREP(FME_PORT_OFST_ACC_CTRL,
+ is_vf ? FME_PORT_OFST_ACC_VF : FME_PORT_OFST_ACC_PF);
+
+ writeq(v, base + FME_HDR_PORT_OFST(port_id));
+}
+
+/**
+ * __dfl_fpga_cdev_config_port_vf - configure port to VF access mode
+ *
+ * @cdev: parent container device.
+ * @if_vf: true for VF access mode, and false for PF access mode
+ *
+ * Return: 0 on success, negative error code otherwise.
+ *
+ * This function is needed in sriov configuration routine. It could be used to
+ * configures the released ports access mode to VF or PF.
+ * The caller needs to hold lock for protection.
+ */
+void __dfl_fpga_cdev_config_port_vf(struct dfl_fpga_cdev *cdev, bool is_vf)
+{
+ struct dfl_feature_platform_data *pdata;
+
+ list_for_each_entry(pdata, &cdev->port_dev_list, node) {
+ if (device_is_registered(&pdata->dev->dev))
+ continue;
+
+ config_port_vf(cdev->fme_dev, pdata->id, is_vf);
+ }
+}
+EXPORT_SYMBOL_GPL(__dfl_fpga_cdev_config_port_vf);
+
static int __init dfl_fpga_init(void)
{
int ret;
diff --git a/drivers/fpga/dfl.h b/drivers/fpga/dfl.h
index 63f39ab..1350e8e 100644
--- a/drivers/fpga/dfl.h
+++ b/drivers/fpga/dfl.h
@@ -421,5 +421,6 @@ struct platform_device *
int dfl_fpga_cdev_config_port(struct dfl_fpga_cdev *cdev,
u32 port_id, bool release);
+void __dfl_fpga_cdev_config_port_vf(struct dfl_fpga_cdev *cdev, bool is_vf);
#endif /* __FPGA_DFL_H */
--
1.8.3.1
^ permalink raw reply related
* [PATCH v4 06/15] fpga: dfl: fme: add DFL_FPGA_FME_PORT_RELEASE/ASSIGN ioctl support.
From: Wu Hao @ 2019-06-27 4:44 UTC (permalink / raw)
To: mdf, linux-fpga, linux-kernel
Cc: linux-api, yilun.xu, hao.wu, gregkh, atull, Zhang Yi Z
In-Reply-To: <1561610695-5414-1-git-send-email-hao.wu@intel.com>
In order to support virtualization usage via PCIe SRIOV, this patch
adds two ioctls under FPGA Management Engine (FME) to release and
assign back the port device. In order to safely turn Port from PF
into VF and enable PCIe SRIOV, it requires user to invoke this
PORT_RELEASE ioctl to release port firstly to remove userspace
interfaces, and then configure the PF/VF access register in FME.
After disable SRIOV, it requires user to invoke this PORT_ASSIGN
ioctl to attach the port back to PF.
Ioctl interfaces:
* DFL_FPGA_FME_PORT_RELEASE
Release platform device of given port, it deletes port platform
device to remove related userspace interfaces on PF, then
configures PF/VF access mode to VF.
* DFL_FPGA_FME_PORT_ASSIGN
Assign platform device of given port back to PF, it configures
PF/VF access mode to PF, then adds port platform device back to
re-enable related userspace interfaces on PF.
Signed-off-by: Zhang Yi Z <yi.z.zhang@intel.com>
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Acked-by: Alan Tull <atull@kernel.org>
Acked-by: Moritz Fischer <mdf@kernel.org>
---
v3: code rebase.
v4: no change.
---
drivers/fpga/dfl-fme-main.c | 54 +++++++++++++++++++++
drivers/fpga/dfl.c | 107 +++++++++++++++++++++++++++++++++++++-----
drivers/fpga/dfl.h | 10 ++++
include/uapi/linux/fpga-dfl.h | 32 +++++++++++++
4 files changed, 191 insertions(+), 12 deletions(-)
diff --git a/drivers/fpga/dfl-fme-main.c b/drivers/fpga/dfl-fme-main.c
index 076d74f..8b2a337 100644
--- a/drivers/fpga/dfl-fme-main.c
+++ b/drivers/fpga/dfl-fme-main.c
@@ -16,6 +16,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
+#include <linux/uaccess.h>
#include <linux/fpga-dfl.h>
#include "dfl.h"
@@ -105,9 +106,62 @@ static void fme_hdr_uinit(struct platform_device *pdev,
sysfs_remove_files(&pdev->dev.kobj, fme_hdr_attrs);
}
+static long fme_hdr_ioctl_release_port(struct dfl_feature_platform_data *pdata,
+ void __user *arg)
+{
+ struct dfl_fpga_cdev *cdev = pdata->dfl_cdev;
+ struct dfl_fpga_fme_port_release release;
+ unsigned long minsz;
+
+ minsz = offsetofend(struct dfl_fpga_fme_port_release, port_id);
+
+ if (copy_from_user(&release, arg, minsz))
+ return -EFAULT;
+
+ if (release.argsz < minsz || release.flags)
+ return -EINVAL;
+
+ return dfl_fpga_cdev_config_port(cdev, release.port_id, true);
+}
+
+static long fme_hdr_ioctl_assign_port(struct dfl_feature_platform_data *pdata,
+ void __user *arg)
+{
+ struct dfl_fpga_cdev *cdev = pdata->dfl_cdev;
+ struct dfl_fpga_fme_port_assign assign;
+ unsigned long minsz;
+
+ minsz = offsetofend(struct dfl_fpga_fme_port_assign, port_id);
+
+ if (copy_from_user(&assign, arg, minsz))
+ return -EFAULT;
+
+ if (assign.argsz < minsz || assign.flags)
+ return -EINVAL;
+
+ return dfl_fpga_cdev_config_port(cdev, assign.port_id, false);
+}
+
+static long fme_hdr_ioctl(struct platform_device *pdev,
+ struct dfl_feature *feature,
+ unsigned int cmd, unsigned long arg)
+{
+ struct dfl_feature_platform_data *pdata = dev_get_platdata(&pdev->dev);
+
+ switch (cmd) {
+ case DFL_FPGA_FME_PORT_RELEASE:
+ return fme_hdr_ioctl_release_port(pdata, (void __user *)arg);
+ case DFL_FPGA_FME_PORT_ASSIGN:
+ return fme_hdr_ioctl_assign_port(pdata, (void __user *)arg);
+ }
+
+ return -ENODEV;
+}
+
static const struct dfl_feature_ops fme_hdr_ops = {
.init = fme_hdr_init,
.uinit = fme_hdr_uinit,
+ .ioctl = fme_hdr_ioctl,
};
static struct dfl_feature_driver fme_feature_drvs[] = {
diff --git a/drivers/fpga/dfl.c b/drivers/fpga/dfl.c
index 4b66aaa..308c808 100644
--- a/drivers/fpga/dfl.c
+++ b/drivers/fpga/dfl.c
@@ -231,16 +231,20 @@ void dfl_fpga_port_ops_del(struct dfl_fpga_port_ops *ops)
*/
int dfl_fpga_check_port_id(struct platform_device *pdev, void *pport_id)
{
- struct dfl_fpga_port_ops *port_ops = dfl_fpga_port_ops_get(pdev);
- int port_id;
+ struct dfl_feature_platform_data *pdata = dev_get_platdata(&pdev->dev);
+ struct dfl_fpga_port_ops *port_ops;
+
+ if (pdata->id != FEATURE_DEV_ID_UNUSED)
+ return pdata->id == *(int *)pport_id;
+ port_ops = dfl_fpga_port_ops_get(pdev);
if (!port_ops || !port_ops->get_id)
return 0;
- port_id = port_ops->get_id(pdev);
+ pdata->id = port_ops->get_id(pdev);
dfl_fpga_port_ops_put(port_ops);
- return port_id == *(int *)pport_id;
+ return pdata->id == *(int *)pport_id;
}
EXPORT_SYMBOL_GPL(dfl_fpga_check_port_id);
@@ -474,6 +478,7 @@ static int build_info_commit_dev(struct build_feature_devs_info *binfo)
pdata->dev = fdev;
pdata->num = binfo->feature_num;
pdata->dfl_cdev = binfo->cdev;
+ pdata->id = FEATURE_DEV_ID_UNUSED;
mutex_init(&pdata->lock);
lockdep_set_class_and_name(&pdata->lock, &dfl_pdata_keys[type],
dfl_pdata_key_strings[type]);
@@ -973,25 +978,27 @@ void dfl_fpga_feature_devs_remove(struct dfl_fpga_cdev *cdev)
{
struct dfl_feature_platform_data *pdata, *ptmp;
- remove_feature_devs(cdev);
-
mutex_lock(&cdev->lock);
- if (cdev->fme_dev) {
- /* the fme should be unregistered. */
- WARN_ON(device_is_registered(cdev->fme_dev));
+ if (cdev->fme_dev)
put_device(cdev->fme_dev);
- }
list_for_each_entry_safe(pdata, ptmp, &cdev->port_dev_list, node) {
struct platform_device *port_dev = pdata->dev;
- /* the port should be unregistered. */
- WARN_ON(device_is_registered(&port_dev->dev));
+ /* remove released ports */
+ if (!device_is_registered(&port_dev->dev)) {
+ dfl_id_free(feature_dev_id_type(port_dev),
+ port_dev->id);
+ platform_device_put(port_dev);
+ }
+
list_del(&pdata->node);
put_device(&port_dev->dev);
}
mutex_unlock(&cdev->lock);
+ remove_feature_devs(cdev);
+
fpga_region_unregister(cdev->region);
devm_kfree(cdev->parent, cdev);
}
@@ -1029,6 +1036,82 @@ struct platform_device *
}
EXPORT_SYMBOL_GPL(__dfl_fpga_cdev_find_port);
+static int attach_port_dev(struct dfl_fpga_cdev *cdev, u32 port_id)
+{
+ struct platform_device *port_pdev;
+ int ret = -ENODEV;
+
+ mutex_lock(&cdev->lock);
+ port_pdev = __dfl_fpga_cdev_find_port(cdev, &port_id,
+ dfl_fpga_check_port_id);
+ if (!port_pdev)
+ goto unlock_exit;
+
+ if (device_is_registered(&port_pdev->dev)) {
+ ret = -EBUSY;
+ goto put_dev_exit;
+ }
+
+ ret = platform_device_add(port_pdev);
+ if (ret)
+ goto put_dev_exit;
+
+ dfl_feature_dev_use_end(dev_get_platdata(&port_pdev->dev));
+ cdev->released_port_num--;
+put_dev_exit:
+ put_device(&port_pdev->dev);
+unlock_exit:
+ mutex_unlock(&cdev->lock);
+ return ret;
+}
+
+static int detach_port_dev(struct dfl_fpga_cdev *cdev, u32 port_id)
+{
+ struct platform_device *port_pdev;
+ int ret = -ENODEV;
+
+ mutex_lock(&cdev->lock);
+ port_pdev = __dfl_fpga_cdev_find_port(cdev, &port_id,
+ dfl_fpga_check_port_id);
+ if (!port_pdev)
+ goto unlock_exit;
+
+ if (!device_is_registered(&port_pdev->dev)) {
+ ret = -EBUSY;
+ goto put_dev_exit;
+ }
+
+ ret = dfl_feature_dev_use_begin(dev_get_platdata(&port_pdev->dev));
+ if (ret)
+ goto put_dev_exit;
+
+ platform_device_del(port_pdev);
+ cdev->released_port_num++;
+put_dev_exit:
+ put_device(&port_pdev->dev);
+unlock_exit:
+ mutex_unlock(&cdev->lock);
+ return ret;
+}
+
+/**
+ * dfl_fpga_cdev_config_port - configure a port feature dev
+ * @cdev: parent container device.
+ * @port_id: id of the port feature device.
+ * @release: release port or assign port back.
+ *
+ * This function allows user to release port platform device or assign it back.
+ * e.g. to safely turn one port from PF into VF for PCI device SRIOV support,
+ * release port platform device is one necessary step.
+ */
+int dfl_fpga_cdev_config_port(struct dfl_fpga_cdev *cdev,
+ u32 port_id, bool release)
+{
+ return release ? detach_port_dev(cdev, port_id) :
+ attach_port_dev(cdev, port_id);
+}
+EXPORT_SYMBOL_GPL(dfl_fpga_cdev_config_port);
+
static int __init dfl_fpga_init(void)
{
int ret;
diff --git a/drivers/fpga/dfl.h b/drivers/fpga/dfl.h
index 8851c6c..63f39ab 100644
--- a/drivers/fpga/dfl.h
+++ b/drivers/fpga/dfl.h
@@ -183,6 +183,8 @@ struct dfl_feature {
#define DEV_STATUS_IN_USE 0
+#define FEATURE_DEV_ID_UNUSED (-1)
+
/**
* struct dfl_feature_platform_data - platform data for feature devices
*
@@ -191,6 +193,7 @@ struct dfl_feature {
* @cdev: cdev of feature dev.
* @dev: ptr to platform device linked with this platform data.
* @dfl_cdev: ptr to container device.
+ * @id: id used for this feature device.
* @disable_count: count for port disable.
* @num: number for sub features.
* @dev_status: dev status (e.g. DEV_STATUS_IN_USE).
@@ -203,6 +206,7 @@ struct dfl_feature_platform_data {
struct cdev cdev;
struct platform_device *dev;
struct dfl_fpga_cdev *dfl_cdev;
+ int id;
unsigned int disable_count;
unsigned long dev_status;
void *private;
@@ -378,6 +382,7 @@ int dfl_fpga_enum_info_add_dfl(struct dfl_fpga_enum_info *info,
* @fme_dev: FME feature device under this container device.
* @lock: mutex lock to protect the port device list.
* @port_dev_list: list of all port feature devices under this container device.
+ * @released_port_num: released port number under this container device.
*/
struct dfl_fpga_cdev {
struct device *parent;
@@ -385,6 +390,7 @@ struct dfl_fpga_cdev {
struct device *fme_dev;
struct mutex lock;
struct list_head port_dev_list;
+ int released_port_num;
};
struct dfl_fpga_cdev *
@@ -412,4 +418,8 @@ struct platform_device *
return pdev;
}
+
+int dfl_fpga_cdev_config_port(struct dfl_fpga_cdev *cdev,
+ u32 port_id, bool release);
+
#endif /* __FPGA_DFL_H */
diff --git a/include/uapi/linux/fpga-dfl.h b/include/uapi/linux/fpga-dfl.h
index 2e324e5..e9a00e0 100644
--- a/include/uapi/linux/fpga-dfl.h
+++ b/include/uapi/linux/fpga-dfl.h
@@ -176,4 +176,36 @@ struct dfl_fpga_fme_port_pr {
#define DFL_FPGA_FME_PORT_PR _IO(DFL_FPGA_MAGIC, DFL_FME_BASE + 0)
+/**
+ * DFL_FPGA_FME_PORT_RELEASE - _IOW(DFL_FPGA_MAGIC, DFL_FME_BASE + 1,
+ * struct dfl_fpga_fme_port_release)
+ *
+ * Driver releases the port per Port ID provided by caller.
+ * Return: 0 on success, -errno on failure.
+ */
+struct dfl_fpga_fme_port_release {
+ /* Input */
+ __u32 argsz; /* Structure length */
+ __u32 flags; /* Zero for now */
+ __u32 port_id;
+};
+
+#define DFL_FPGA_FME_PORT_RELEASE _IO(DFL_FPGA_MAGIC, DFL_FME_BASE + 1)
+
+/**
+ * DFL_FPGA_FME_PORT_ASSIGN - _IOW(DFL_FPGA_MAGIC, DFL_FME_BASE + 2,
+ * struct dfl_fpga_fme_port_assign)
+ *
+ * Driver assigns the port back per Port ID provided by caller.
+ * Return: 0 on success, -errno on failure.
+ */
+struct dfl_fpga_fme_port_assign {
+ /* Input */
+ __u32 argsz; /* Structure length */
+ __u32 flags; /* Zero for now */
+ __u32 port_id;
+};
+
+#define DFL_FPGA_FME_PORT_ASSIGN _IO(DFL_FPGA_MAGIC, DFL_FME_BASE + 2)
+
#endif /* _UAPI_LINUX_FPGA_DFL_H */
--
1.8.3.1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox