* [RFC PATCH 2/4] page_pool: basic implementation of page_pool
From: Jesper Dangaard Brouer @ 2016-12-20 13:28 UTC (permalink / raw)
To: linux-mm, Alexander Duyck
Cc: willemdebruijn.kernel, netdev, john.fastabend, Saeed Mahameed,
Jesper Dangaard Brouer, bjorn.topel, Alexei Starovoitov,
Tariq Toukan
In-Reply-To: <20161220132444.18788.50875.stgit@firesoul>
The focus in this patch is getting the API around page_pool figured out.
The internal data structures for returning page_pool pages is not optimal.
This implementation use ptr_ring for recycling, which is known not to scale
in case of multiple remote CPUs releasing/returning pages.
A bulking interface into the page allocator is also left for later. (This
requires cooperation will Mel Gorman, who just send me some PoC patches for this).
---
include/linux/mm.h | 6 +
include/linux/mm_types.h | 11 +
include/linux/page-flags.h | 13 +
include/linux/page_pool.h | 158 +++++++++++++++
include/linux/skbuff.h | 2
include/trace/events/mmflags.h | 3
mm/Makefile | 3
mm/page_alloc.c | 10 +
mm/page_pool.c | 423 ++++++++++++++++++++++++++++++++++++++++
mm/slub.c | 4
10 files changed, 627 insertions(+), 6 deletions(-)
create mode 100644 include/linux/page_pool.h
create mode 100644 mm/page_pool.c
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4424784ac374..11b4d8fb280b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -23,6 +23,7 @@
#include <linux/page_ext.h>
#include <linux/err.h>
#include <linux/page_ref.h>
+#include <linux/page_pool.h>
struct mempolicy;
struct anon_vma;
@@ -765,6 +766,11 @@ static inline void put_page(struct page *page)
{
page = compound_head(page);
+ if (PagePool(page)) {
+ page_pool_put_page(page);
+ return;
+ }
+
if (put_page_testzero(page))
__put_page(page);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 08d947fc4c59..c74dea967f99 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -47,6 +47,12 @@ struct page {
unsigned long flags; /* Atomic flags, some possibly
* updated asynchronously */
union {
+ /* DISCUSS: Considered moving page_pool pointer here,
+ * but I'm unsure if 'mapping' is needed for userspace
+ * mapping the page, as this is a use-case the
+ * page_pool need to support in the future. (Basically
+ * mapping a NIC RX ring into userspace).
+ */
struct address_space *mapping; /* If low bit clear, points to
* inode address_space, or NULL.
* If page mapped as anonymous
@@ -63,6 +69,7 @@ struct page {
union {
pgoff_t index; /* Our offset within mapping. */
void *freelist; /* sl[aou]b first free object */
+ dma_addr_t dma_addr; /* used by page_pool */
/* page_deferred_list().prev -- second tail page */
};
@@ -117,6 +124,8 @@ struct page {
* avoid collision and false-positive PageTail().
*/
union {
+ /* XXX: Idea reuse lru list, in page_pool to align with PCP */
+
struct list_head lru; /* Pageout list, eg. active_list
* protected by zone_lru_lock !
* Can be used as a generic list
@@ -189,6 +198,8 @@ struct page {
#endif
#endif
struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */
+ /* XXX: Sure page_pool will have no users of "private"? */
+ struct page_pool *pool;
};
#ifdef CONFIG_MEMCG
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 74e4dda91238..253d7f7cf89f 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -91,7 +91,8 @@ enum pageflags {
PG_mappedtodisk, /* Has blocks allocated on-disk */
PG_reclaim, /* To be reclaimed asap */
PG_swapbacked, /* Page is backed by RAM/swap */
- PG_unevictable, /* Page is "unevictable" */
+/*20*/ PG_unevictable, /* Page is "unevictable" */
+// XXX stable flag?
#ifdef CONFIG_MMU
PG_mlocked, /* Page is vma mlocked */
#endif
@@ -101,6 +102,8 @@ enum pageflags {
#ifdef CONFIG_MEMORY_FAILURE
PG_hwpoison, /* hardware poisoned page. Don't touch */
#endif
+ /* Question: can we squeeze in here and avoid CONFIG_64BIT hacks?*/
+ PG_pool, // XXX macros called: SetPagePool / PagePool
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
PG_young,
PG_idle,
@@ -347,6 +350,12 @@ PAGEFLAG_FALSE(HWPoison)
#define __PG_HWPOISON 0
#endif
+// XXX: Define some macros for page_pool
+// XXX: avoiding atomic set_bit() operation (like slab)
+// XXX: PF_HEAD vs PF_ANY vs PF_NO_TAIL????
+__PAGEFLAG(Pool, pool, PF_ANY)
+
+
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
TESTPAGEFLAG(Young, young, PF_ANY)
SETPAGEFLAG(Young, young, PF_ANY)
@@ -700,7 +709,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
/*
* Flags checked when a page is freed. Pages being freed should not have
* these flags set. It they are, there is a problem.
- */
+ */ /* XXX add PG_pool here??? */
#define PAGE_FLAGS_CHECK_AT_FREE \
(1UL << PG_lru | 1UL << PG_locked | \
1UL << PG_private | 1UL << PG_private_2 | \
diff --git a/include/linux/page_pool.h b/include/linux/page_pool.h
new file mode 100644
index 000000000000..6f8f2ff6d758
--- /dev/null
+++ b/include/linux/page_pool.h
@@ -0,0 +1,158 @@
+/*
+ * page_pool.h
+ *
+ * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
+ * Copyright (C) 2016 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The page_pool is primarily motivated by two things (1) performance
+ * and (2) changing the memory model for drivers.
+ *
+ * Drivers have developed performance workarounds when the speed of
+ * the page allocator and the DMA APIs became too slow for their HW
+ * needs. The page pool solves them on a general level providing
+ * performance gains and benefits that local driver recycling hacks
+ * cannot realize.
+ *
+ * A fundamental property is that pages are returned to the page_pool.
+ * This property allow a certain class of optimizations, which is to
+ * move setup and tear-down operations out of the fast-path, sometimes
+ * known as constructor/destruction operations. DMA map/unmap is one
+ * example of operations this applies to. Certain page alloc/free
+ * validations can also be avoided in the fast-path. Another example
+ * could be pre-mapping pages into userspace, and clearing them
+ * (memset-zero) outside the fast-path.
+ *
+ * This API is only meant for streaming DMA, which map/unmap frequently.
+ */
+#ifndef _LINUX_PAGE_POOL_H
+#define _LINUX_PAGE_POOL_H
+
+/*
+ * NOTES on page flags (PG_pool)... we might have a problem with
+ * enough page flags on 32 bit systems, example see PG_idle + PG_young
+ * include/linux/page_idle.h and CONFIG_IDLE_PAGE_TRACKING
+ */
+
+#include <linux/ptr_ring.h>
+
+//#include <linux/dma-mapping.h>
+#include <linux/dma-direction.h>
+
+// Not-used-atm #define PP_FLAG_NAPI 0x1
+#define PP_FLAG_ALL 0
+
+/*
+ * Fast allocation side cache array/stack
+ *
+ * The cache size and refill watermark is related to the network
+ * use-case. The NAPI budget is 64 packets. After a NAPI poll the RX
+ * ring is usually refilled and the max consumed elements will be 64,
+ * thus a natural max size of objects needed in the cache.
+ *
+ * Keeping room for more objects, is due to XDP_DROP use-case. As
+ * XDP_DROP allows the opportunity to recycle objects directly into
+ * this array, as it shares the same softirq/NAPI protection. If
+ * cache is already full (or partly full) then the XDP_DROP recycles
+ * would have to take a slower code path.
+ */
+#define PP_ALLOC_CACHE_SIZE 128
+#define PP_ALLOC_CACHE_REFILL 64
+struct pp_alloc_cache {
+ u32 count ____cacheline_aligned_in_smp;
+ u32 refill; /* not used atm */
+ void *cache[PP_ALLOC_CACHE_SIZE];
+};
+
+/*
+ * Extensible params struct. Focus on currently implemented features,
+ * extend later. Restriction, subsequently added members value of zero
+ * must gives the previous behaviour. Avoids need to update every
+ * driver simultaniously (given likely in difference subsystems).
+ */
+struct page_pool_params {
+ u32 size; /* caller sets size of struct */
+ unsigned int order;
+ unsigned long flags;
+ /* Associated with a specific device, for DMA pre-mapping purposes */
+ struct device *dev;
+ /* Numa node id to allocate from pages from */
+ int nid;
+ enum dma_data_direction dma_dir; /* DMA mapping direction */
+ unsigned int pool_size;
+ char end_marker[0]; /* must be last struct member */
+};
+#define PAGE_POOL_PARAMS_SIZE offsetof(struct page_pool_params, end_marker)
+
+struct page_pool {
+ struct page_pool_params p;
+
+ /*
+ * Data structure for allocation side
+ *
+ * Drivers allocation side usually already perform some kind
+ * of resource protection. Piggyback on this protection, and
+ * require driver to protect allocation side.
+ *
+ * For NIC drivers this means, allocate a page_pool per
+ * RX-queue. As the RX-queue is already protected by
+ * Softirq/BH scheduling and napi_schedule. NAPI schedule
+ * guarantee that a single napi_struct will only be scheduled
+ * on a single CPU (see napi_schedule).
+ */
+ struct pp_alloc_cache alloc;
+
+ /* Data structure for storing recycled pages.
+ *
+ * Returning/freeing pages is more complicated synchronization
+ * wise, because free's can happen on remote CPUs, with no
+ * association with allocation resource.
+ *
+ * For now use ptr_ring, as it separates consumer and
+ * producer, which is a common use-case. The ptr_ring is not
+ * though as the final data structure, expecting this to
+ * change into a more advanced data structure with more
+ * integration with page_alloc.c and data structs per CPU for
+ * returning pages in bulk.
+ *
+ */
+ struct ptr_ring ring;
+
+ /* TODO: Domain "id" add later, for RX zero-copy validation */
+
+ /* TODO: Need list pointers for keeping page_pool object on a
+ * cleanup list, given pages can be "outstanding" even after
+ * e.g. driver is unloaded.
+ */
+};
+
+struct page* page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
+
+static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
+{
+ gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN | __GFP_COLD);
+ return page_pool_alloc_pages(pool, gfp);
+}
+
+struct page_pool *page_pool_create(const struct page_pool_params *params);
+
+void page_pool_destroy(struct page_pool *pool);
+
+/* Never call this directly, use helpers below */
+void __page_pool_put_page(struct page *page, bool allow_direct);
+
+static inline void page_pool_put_page(struct page *page)
+{
+ __page_pool_put_page(page, false);
+}
+/* Very limited use-cases allow recycle direct */
+static inline void page_pool_recycle_direct(struct page *page)
+{
+ __page_pool_put_page(page, true);
+}
+
+#endif /* _LINUX_PAGE_POOL_H */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ac7fa34db8a7..84294278039d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2584,7 +2584,7 @@ static inline void __skb_frag_ref(skb_frag_t *frag)
* @f: the fragment offset.
*
* Takes an additional reference on the @f'th paged fragment of @skb.
- */
+ */ // XXX
static inline void skb_frag_ref(struct sk_buff *skb, int f)
{
__skb_frag_ref(&skb_shinfo(skb)->frags[f]);
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 5a81ab48a2fb..ee15ca659ea1 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -99,7 +99,8 @@
{1UL << PG_mappedtodisk, "mappedtodisk" }, \
{1UL << PG_reclaim, "reclaim" }, \
{1UL << PG_swapbacked, "swapbacked" }, \
- {1UL << PG_unevictable, "unevictable" } \
+ {1UL << PG_unevictable, "unevictable" }, \
+ {1UL << PG_pool, "pool" } \
IF_HAVE_PG_MLOCK(PG_mlocked, "mlocked" ) \
IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \
IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \
diff --git a/mm/Makefile b/mm/Makefile
index 295bd7a9f76b..dbe5a7181e28 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -100,3 +100,6 @@ obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
+
+# Hack enable for compile testing
+obj-y += page_pool.o
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2c6d5f64feca..655db05f0c1c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3873,6 +3873,11 @@ EXPORT_SYMBOL(get_zeroed_page);
void __free_pages(struct page *page, unsigned int order)
{
+ if (PagePool(page)) {
+ page_pool_put_page(page);
+ return;
+ }
+
if (put_page_testzero(page)) {
if (order == 0)
free_hot_cold_page(page, false);
@@ -4000,6 +4005,11 @@ void __free_page_frag(void *addr)
{
struct page *page = virt_to_head_page(addr);
+ if (PagePool(page)) {
+ page_pool_put_page(page);
+ return;
+ }
+
if (unlikely(put_page_testzero(page)))
__free_pages_ok(page, compound_order(page));
}
diff --git a/mm/page_pool.c b/mm/page_pool.c
new file mode 100644
index 000000000000..74138d5fe86d
--- /dev/null
+++ b/mm/page_pool.c
@@ -0,0 +1,423 @@
+/*
+ * page_pool.c
+ */
+
+/* Using the page pool from a driver, involves
+ *
+ * 1. Creating/allocating a page_pool per RX ring for the NIC
+ * 2. Using pages from page_pool to populate RX ring
+ * 3. Page pool will call dma_map/unmap
+ * 4. Driver is responsible for dma_sync part
+ * 5. On page put/free the page is returned to the page_pool
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include <linux/page_pool.h>
+#include <linux/dma-direction.h>
+#include <linux/dma-mapping.h>
+#include <linux/page-flags.h>
+#include <linux/mm.h> /* for __put_page() */
+
+/*
+ * The struct page_pool (likely) cannot be embedded into another
+ * structure, because freeing this struct depend on outstanding pages,
+ * which can point back to the page_pool. Thus, don't export "init".
+ */
+int page_pool_init(struct page_pool *pool,
+ const struct page_pool_params *params)
+{
+ int ring_qsize = 1024; /* Default */
+ int param_copy_sz;
+
+ if (!pool)
+ return -EFAULT;
+
+ /* Allow kernel devel trees and driver to progress at different rates */
+ param_copy_sz = PAGE_POOL_PARAMS_SIZE;
+ memset(&pool->p, 0, param_copy_sz);
+ if (params->size < param_copy_sz) {
+ /*
+ * Older module calling newer kernel, handled by only
+ * copying supplied size, and keep remaining params zero
+ */
+ param_copy_sz = params->size;
+ } else if (params->size > param_copy_sz) {
+ /*
+ * Newer module calling older kernel. Need to validate
+ * no new features were requested.
+ */
+ unsigned char *addr = (unsigned char*)params + param_copy_sz;
+ unsigned char *end = (unsigned char*)params + params->size;
+
+ for (; addr < end; addr++) {
+ if (*addr != 0)
+ return -E2BIG;
+ }
+ }
+ memcpy(&pool->p, params, param_copy_sz);
+
+ /* Validate only known flags were used */
+ if (pool->p.flags & ~(PP_FLAG_ALL))
+ return -EINVAL;
+
+ if (pool->p.pool_size)
+ ring_qsize = pool->p.pool_size;
+
+ /* ptr_ring is not meant as final struct, see page_pool.h */
+ if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
+ return -ENOMEM;
+ }
+
+ /*
+ * DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
+ * DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
+ * which is the XDP_TX use-case.
+ */
+ if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
+ (pool->p.dma_dir != DMA_BIDIRECTIONAL))
+ return -EINVAL;
+
+ return 0;
+}
+
+struct page_pool *page_pool_create(const struct page_pool_params *params)
+{
+ struct page_pool *pool;
+ int err = 0;
+
+ if (params->size < offsetof(struct page_pool_params, nid)) {
+ WARN(1, "Fix page_pool_params->size code\n");
+ return NULL;
+ }
+
+ pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
+ err = page_pool_init(pool, params);
+ if (err < 0) {
+ pr_warn("%s() gave up with errno %d\n", __func__, err);
+ kfree(pool);
+ return ERR_PTR(err);
+ }
+ return pool;
+}
+EXPORT_SYMBOL(page_pool_create);
+
+/* fast path */
+static struct page *__page_pool_get_cached(struct page_pool *pool)
+{
+ struct page *page;
+
+ /* FIXME: use another test for safe-context, caller should
+ * simply provide this guarantee
+ */
+ if (likely(in_serving_softirq())) { // FIXME add use of PP_FLAG_NAPI
+ struct ptr_ring *r;
+
+ if (likely(pool->alloc.count)) {
+ /* Fast-path */
+ page = pool->alloc.cache[--pool->alloc.count];
+ return page;
+ }
+ /* Slower-path: Alloc array empty, time to refill */
+ r = &pool->ring;
+ /* Open-coded bulk ptr_ring consumer.
+ *
+ * Discussion: ATM the ring consumer lock is not
+ * really needed due to the softirq/NAPI protection,
+ * but later MM-layer need the ability to reclaim
+ * pages on the ring. Thus, keeping the locks.
+ */
+ spin_lock(&r->consumer_lock);
+ while ((page = __ptr_ring_consume(r))) {
+ if (pool->alloc.count == PP_ALLOC_CACHE_REFILL)
+ break;
+ pool->alloc.cache[pool->alloc.count++] = page;
+ }
+ spin_unlock(&r->consumer_lock);
+ return page;
+ }
+
+ /* Slow-path: Get page from locked ring queue */
+ page = ptr_ring_consume(&pool->ring);
+ return page;
+}
+
+/* slow path */
+noinline
+static struct page *__page_pool_alloc_pages(struct page_pool *pool,
+ gfp_t _gfp)
+{
+ struct page *page;
+ gfp_t gfp = _gfp;
+ dma_addr_t dma;
+
+ /* We could always set __GFP_COMP, and avoid this branch, as
+ * prep_new_page() can handle order-0 with __GFP_COMP.
+ */
+ if (pool->p.order)
+ gfp |= __GFP_COMP;
+ /*
+ * Discuss GFP flags: e.g
+ * __GFP_NOWARN + __GFP_NORETRY + __GFP_NOMEMALLOC
+ */
+
+ /*
+ * FUTURE development:
+ *
+ * Current slow-path essentially falls back to single page
+ * allocations, which doesn't improve performance. This code
+ * need bulk allocation support from the page allocator code.
+ *
+ * For now, page pool recycle cache is not refilled. Hint:
+ * when pages are returned, they will go into the recycle
+ * cache.
+ */
+
+ /* Cache was empty, do real allocation */
+ page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
+ if (!page)
+ return NULL;
+
+ /* FIXME: Add accounting of pages.
+ *
+ * TODO: Look into memcg_charge_slab/memcg_uncharge_slab
+ *
+ * What if page comes from pfmemalloc reserves?
+ * Should we abort to help memory pressure? (test err code path!)
+ * Code see SetPageSlabPfmemalloc(), __ClearPageSlabPfmemalloc()
+ * and page_is_pfmemalloc(page)
+ */
+
+ /* Setup DMA mapping:
+ * This mapping is kept for lifetime of page, until leaving pool.
+ */
+ dma = dma_map_page(pool->p.dev, page, 0,
+ (PAGE_SIZE << pool->p.order),
+ pool->p.dma_dir);
+ if (dma_mapping_error(pool->p.dev, dma)) {
+ put_page(page);
+ return NULL;
+ }
+ page->dma_addr = dma;
+
+ /* IDEA: When page just alloc'ed is should/must have refcnt 1.
+ * Should we do refcnt inc tricks to keep page mapped/owned by
+ * page_pool infrastructure? (like page_frag code)
+ */
+
+ /* TODO: Init fields in struct page. See slub code allocate_slab()
+ *
+ */
+ page->pool = pool; /* Save pool the page MUST be returned to */
+ __SetPagePool(page); /* Mark page with flag */
+
+ return page;
+}
+
+
+/* For using page_pool replace: alloc_pages() API calls, but provide
+ * synchronization guarantee for allocation side.
+ */
+struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
+{
+ struct page *page;
+
+ /* Fast-path: Get a page from cache */
+ page = __page_pool_get_cached(pool);
+ if (page)
+ return page;
+
+ /* Slow-path: cache empty, do real allocation */
+ page = __page_pool_alloc_pages(pool, gfp);
+ return page;
+}
+EXPORT_SYMBOL(page_pool_alloc_pages);
+
+/* Cleanup page_pool state from page */
+// Ideas taken from __free_slab()
+static void __page_pool_clean_page(struct page *page)
+{
+ struct page_pool *pool;
+
+ VM_BUG_ON_PAGE(!PagePool(page), page);
+
+ // mod_zone_page_state() ???
+
+ pool = page->pool;
+ __ClearPagePool(page);
+
+ /* DMA unmap */
+ dma_unmap_page(pool->p.dev, page->dma_addr,
+ PAGE_SIZE << pool->p.order,
+ pool->p.dma_dir);
+ page->dma_addr = 0;
+ /* Q: Use DMA macros???
+ *
+ * dma_unmap_page(pool->p.dev, dma_unmap_addr(page,dma_addr),
+ * PAGE_SIZE << pool->p.order,
+ * pool->p.dma_dir);
+ * dma_unmap_addr_set(page, dma_addr, 0);
+ */
+
+ /* FUTURE: Use Alex Duyck's DMA_ATTR_SKIP_CPU_SYNC changes
+ *
+ * dma_unmap_page_attrs(pool->p.dev, page->dma_addr,
+ * PAGE_SIZE << pool->p.order,
+ * pool->p.dma_dir,
+ * DMA_ATTR_SKIP_CPU_SYNC);
+ */
+
+ // page_mapcount_reset(page); // ??
+ // page->mapping = NULL; // ??
+
+ // Not really needed, but good for provoking bugs
+ page->pool = (void *)0xDEADBEE0;
+
+ /* FIXME: Add accounting of pages here!
+ *
+ * Look into: memcg_uncharge_page_pool(page, order, pool);
+ */
+
+ // FIXME: do we need this??? likely not as slub does not...
+// if (unlikely(is_zone_device_page(page)))
+// put_zone_device_page(page);
+
+}
+
+/* Return a page to the page allocator, cleaning up our state */
+static void __page_pool_return_page(struct page *page)
+{
+ struct page_pool *pool = page->pool;
+
+ __page_pool_clean_page(page);
+ /*
+ * Given page pool state and flags were just cleared, the page
+ * must be freed here. Thus, code invariant assumes
+ * refcnt==1, as __free_pages() call put_page_testzero().
+ */
+ __free_pages(page, pool->p.order);
+}
+
+bool __page_pool_recycle_into_ring(struct page_pool *pool,
+ struct page *page)
+{
+ int ret;
+ /* TODO: Use smarter data structure for recycle cache. Using
+ * ptr_ring will not scale when multiple remote CPUs want to
+ * recycle pages.
+ */
+
+ /* Need BH protection when free occurs from userspace e.g
+ * __kfree_skb() called via {tcp,inet,sock}_recvmsg
+ *
+ * Problematic for several reasons: (1) it is more costly,
+ * (2) the BH unlock can cause (re)sched of softirq.
+ *
+ * BH protection not needed if current is serving softirq
+ */
+ if (in_serving_softirq())
+ ret = ptr_ring_produce(&pool->ring, page);
+ else
+ ret = ptr_ring_produce_bh(&pool->ring, page);
+
+ return (ret == 0) ? true : false;
+}
+
+/*
+ * Only allow direct recycling in very special circumstances, into the
+ * alloc cache. E.g. XDP_DROP use-case.
+ *
+ * Caller must provide appropiate safe context.
+ */
+static bool __page_pool_recycle_direct(struct page *page,
+ struct page_pool *pool)
+{
+ // BUG_ON(!in_serving_softirq());
+
+ if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
+ return false;
+
+ /* Caller MUST have verified/know (page_ref_count(page) == 1) */
+ pool->alloc.cache[pool->alloc.count++] = page;
+ return true;
+}
+
+void __page_pool_put_page(struct page *page, bool allow_direct)
+{
+ struct page_pool *pool = page->pool;
+
+ /* This is a fast-path optimization, that avoids an atomic
+ * operation, in the case where a single object is (refcnt)
+ * using the page.
+ *
+ * refcnt == 1 means page_pool owns page, and can recycle it.
+ */
+ if (likely(page_ref_count(page) == 1)) {
+ /* Read barrier implicit paired with full MB of atomic ops */
+ smp_rmb();
+
+ if (allow_direct)
+ if (__page_pool_recycle_direct(page, pool))
+ return;
+
+ if (!__page_pool_recycle_into_ring(pool, page)) {
+ /* Cache full, do real __free_pages() */
+ __page_pool_return_page(page);
+ }
+ return;
+ }
+ /*
+ * Many drivers splitting up the page into fragments, and some
+ * want to keep doing this to save memory. The put_page_testzero()
+ * function as a refcnt decrement, and should not return true.
+ */
+ if (unlikely(put_page_testzero(page))) {
+ /*
+ * Reaching refcnt zero should not be possible,
+ * indicate code error. Don't crash but warn, handle
+ * case by not-recycling, but return page to page
+ * allocator.
+ */
+ WARN(1, "%s() violating page_pool invariance refcnt:%d\n",
+ __func__, page_ref_count(page));
+ /* Cleanup state before directly returning page */
+ __page_pool_clean_page(page);
+ __put_page(page);
+ }
+}
+EXPORT_SYMBOL(__page_pool_put_page);
+
+static void __destructor_put_page(void *ptr)
+{
+ struct page *page = ptr;
+
+ /* Verify the refcnt invariant of cached pages */
+ if (!(page_ref_count(page) == 1)) {
+ pr_crit("%s() page_pool refcnt %d violation\n",
+ __func__, page_ref_count(page));
+ BUG();
+ }
+ __page_pool_return_page(page);
+}
+
+/* Cleanup and release resources */
+void page_pool_destroy(struct page_pool *pool)
+{
+ /* Empty recycle ring */
+ ptr_ring_cleanup(&pool->ring, __destructor_put_page);
+
+ /* FIXME-mem-leak: cleanup array/stack cache
+ * pool->alloc. Driver usually will destroy RX ring after
+ * making sure nobody can alloc from it, thus it should be
+ * safe to just empty cache here
+ */
+
+ /* FIXME: before releasing the page_pool memory, we MUST make
+ * sure no pages points back this page_pool.
+ */
+ kfree(pool);
+}
+EXPORT_SYMBOL(page_pool_destroy);
diff --git a/mm/slub.c b/mm/slub.c
index 067598a00849..7de478c20464 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1572,8 +1572,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
page->objects = oo_objects(oo);
order = compound_order(page);
- page->slab_cache = s;
- __SetPageSlab(page);
+ page->slab_cache = s; // Example: Saving kmem_cache in struct page
+ __SetPageSlab(page); // Example: Setting flag
if (page_is_pfmemalloc(page))
SetPageSlabPfmemalloc(page);
^ permalink raw reply related
* [RFC PATCH 1/4] doc: page_pool introduction documentation
From: Jesper Dangaard Brouer @ 2016-12-20 13:28 UTC (permalink / raw)
To: linux-mm, Alexander Duyck
Cc: willemdebruijn.kernel, netdev, john.fastabend, Saeed Mahameed,
Jesper Dangaard Brouer, bjorn.topel, Alexei Starovoitov,
Tariq Toukan
In-Reply-To: <20161220132444.18788.50875.stgit@firesoul>
Copied from:
https://prototype-kernel.readthedocs.io/en/latest/vm/page_pool/introduction.html
~/git/prototype-kernel/kernel/Documentation/vm/page_pool/introduction.rst
This will be updated from above links before upstream submit.
Also this need to be "linked" into new kernel doc system.
---
Documentation/vm/page_pool/introduction.rst | 71 +++++++++++++++++++++++++++
1 file changed, 71 insertions(+)
create mode 100644 Documentation/vm/page_pool/introduction.rst
diff --git a/Documentation/vm/page_pool/introduction.rst b/Documentation/vm/page_pool/introduction.rst
new file mode 100644
index 000000000000..db03b02f218c
--- /dev/null
+++ b/Documentation/vm/page_pool/introduction.rst
@@ -0,0 +1,71 @@
+============
+Introduction
+============
+
+The page_pool is a generic API for drivers that have a need for a pool
+of recycling pages used for streaming DMA.
+
+
+Motivation
+==========
+
+The page_pool is primarily motivated by two things (1) performance
+and (2) changing the memory model for drivers.
+
+Drivers have developed performance workarounds when the speed of the
+page allocator and the DMA APIs became too slow for their HW
+needs. The page pool solves them on a general level providing
+performance gains and benefits that local driver recycling hacks
+cannot realize.
+
+A fundamental property is that pages are returned to the page_pool.
+This property allow a certain class of optimizations, which is to move
+setup and tear-down operations out of the fast-path, sometimes known as
+constructor/destruction operations. DMA map/unmap is one example of
+operations this applies to. Certain page alloc/free validations can
+also be avoided in the fast-path. Another example could be
+pre-mapping pages into userspace, and clearing them (memset-zero)
+outside the fast-path.
+
+Memory model
+============
+
+Once drivers are converted to using page_pool API, then it will become
+easier change the underlying memory model backing the driver with
+pages (without changing the driver).
+
+One prime use-case is NIC zero-copy RX into userspace. As DaveM
+describes in his `Google-plus post`_, the mapping and unmapping
+operations in the address space of the process has a cost that cancels
+out most of the gains of such zero-copy schemes.
+
+This mapping cost can solved the same way as the keeping DMA mapped
+trick. By keeping the pages VM-mapped to userspace. This is a layer
+that can be added later to the page_pool. It will likely be
+beneficial to also consider using huge-pages (as backing) to reduce
+the TLB-stress.
+
+.. _Google-plus post:
+ https://plus.google.com/+DavidMiller/posts/EUDiGoXD6Xv
+
+Advantages
+==========
+
+Advantages of a recycling page pool as bullet points:
+
+1) Faster than going through page-allocator. Given a specialized
+ allocator require less checks, and can piggyback on drivers
+ resource protection (for alloc-side).
+
+2) DMA IOMMU mapping cost is removed by keeping pages mapped.
+
+3) Makes DMA pages writable by predictable DMA unmap point.
+
+4) OOM protection at device level, as having a feedback-loop knows
+ number of outstanding pages.
+
+5) Flexible memory model allowing zero-copy RX, solving memory early
+ demux (does depend on HW filters into RX queues)
+
+6) Less fragmentation of the page buddy algorithm, when driver
+ maintains a steady-state working-set.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related
* [RFC PATCH 0/4] page_pool proof-of-concept early code
From: Jesper Dangaard Brouer @ 2016-12-20 13:28 UTC (permalink / raw)
To: linux-mm, Alexander Duyck
Cc: willemdebruijn.kernel, netdev, john.fastabend, Saeed Mahameed,
Jesper Dangaard Brouer, bjorn.topel, Alexei Starovoitov,
Tariq Toukan
This is an RFC patchset of my *work-in-progress* page_pool implemenation.
This is NOT ready for inclusion. People asked to see the code, so here we go.
This patchset is focused providing a generic replacement for the
driver page recycle caches. Where mlx5 is the first user in patch-3.
Notice that patch-2 is more "MM-invasive" (modifies put_page) than
patch-4 which is less MM-agressive (scaled back based on input from
Mel Gorman).
I do know that all page-flags are used (for 32bit), thus I'm open to
suggestions/ideas on howto work-around this (need some way to identify
a page belongs to a page pool).
This patchset is the bare-minimum PoC that allows me to benchmarks
these ideas and see if performance is going in the right direction.
It is not safe, e.g. unloading the driver can crash the kernel.
---
Jesper Dangaard Brouer (4):
doc: page_pool introduction documentation
page_pool: basic implementation of page_pool
mlx5: use page_pool
page_pool: change refcnt model
Documentation/vm/page_pool/introduction.rst | 71 ++++
drivers/net/ethernet/mellanox/mlx5/core/en.h | 1
drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 28 +
drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 47 ++
include/linux/mm.h | 1
include/linux/mm_types.h | 11 +
include/linux/page-flags.h | 13 +
include/linux/page_pool.h | 168 +++++++++
include/linux/skbuff.h | 2
include/trace/events/mmflags.h | 3
mm/Makefile | 3
mm/page_alloc.c | 6
mm/page_pool.c | 402 +++++++++++++++++++++
mm/slub.c | 4
mm/swap.c | 3
15 files changed, 741 insertions(+), 22 deletions(-)
create mode 100644 Documentation/vm/page_pool/introduction.rst
create mode 100644 include/linux/page_pool.h
create mode 100644 mm/page_pool.c
--
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply
* [PATCH 2/2] bpf: do not use KMALLOC_SHIFT_MAX
From: Michal Hocko @ 2016-12-20 13:06 UTC (permalink / raw)
To: Andrew Morton
Cc: Cristopher Lameter, Alexei Starovoitov, Andrey Konovalov, netdev,
linux-mm, LKML, Michal Hocko
In-Reply-To: <20161220130659.16461-1-mhocko@kernel.org>
From: Michal Hocko <mhocko@suse.com>
01b3f52157ff ("bpf: fix allocation warnings in bpf maps and integer
overflow") has added checks for the maximum allocateable size. It
(ab)used KMALLOC_SHIFT_MAX for that purpose. While this is not incorrect
it is not very clean because we already have KMALLOC_MAX_SIZE for this
very reason so let's change both checks to use KMALLOC_MAX_SIZE instead.
The original motivation for using KMALLOC_SHIFT_MAX was to work around
an incorrect KMALLOC_MAX_SIZE which could lead to allocation warnings
but it is no longer needed since "slab: make sure that KMALLOC_MAX_SIZE
will fit into MAX_ORDER".
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Christoph Lameter <cl@linux.com>
---
kernel/bpf/arraymap.c | 2 +-
kernel/bpf/hashtab.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index a2ac051c342f..229a5d5df977 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -56,7 +56,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
attr->value_size == 0 || attr->map_flags)
return ERR_PTR(-EINVAL);
- if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1))
+ if (attr->value_size > KMALLOC_MAX_SIZE)
/* if value_size is bigger, the user space won't be able to
* access the elements.
*/
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index ad1bc67aff1b..c5ec7dc71c84 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -181,7 +181,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
*/
goto free_htab;
- if (htab->map.value_size >= (1 << (KMALLOC_SHIFT_MAX - 1)) -
+ if (htab->map.value_size >= KMALLOC_MAX_SIZE -
MAX_BPF_STACK - sizeof(struct htab_elem))
/* if value_size is bigger, the user space won't be able to
* access the elements via bpf syscall. This check also makes
--
2.10.2
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related
* [PATCH 1/2] mm, slab: make sure that KMALLOC_MAX_SIZE will fit into MAX_ORDER
From: Michal Hocko @ 2016-12-20 13:06 UTC (permalink / raw)
To: Andrew Morton
Cc: Cristopher Lameter, Alexei Starovoitov, Andrey Konovalov, netdev,
linux-mm, LKML, Michal Hocko
In-Reply-To: <20161220130659.16461-1-mhocko@kernel.org>
From: Michal Hocko <mhocko@suse.com>
Andrey Konovalov has reported the following warning triggered by
the syzkaller fuzzer.
WARNING: CPU: 1 PID: 9935 at mm/page_alloc.c:3511
__alloc_pages_nodemask+0x159c/0x1e20
Kernel panic - not syncing: panic_on_warn set ...
CPU: 1 PID: 9935 Comm: syz-executor0 Not tainted 4.9.0-rc7+ #34
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
ffff88006949f2c8 ffffffff81f96b8a ffffffff00000200 1ffff1000d293dec
ffffed000d293de4 0000000000000a06 0000000041b58ab3 ffffffff8598b510
ffffffff81f968f8 0000000041b58ab3 ffffffff85942a58 ffffffff81432860
Call Trace:
[< inline >] __dump_stack lib/dump_stack.c:15
[<ffffffff81f96b8a>] dump_stack+0x292/0x398 lib/dump_stack.c:51
[<ffffffff8168c88e>] panic+0x1cb/0x3a9 kernel/panic.c:179
[<ffffffff812b80b4>] __warn+0x1c4/0x1e0 kernel/panic.c:542
[<ffffffff812b831c>] warn_slowpath_null+0x2c/0x40 kernel/panic.c:585
[< inline >] __alloc_pages_slowpath mm/page_alloc.c:3511
[<ffffffff816c08ac>] __alloc_pages_nodemask+0x159c/0x1e20 mm/page_alloc.c:3781
[<ffffffff817cde17>] alloc_pages_current+0x1c7/0x6b0 mm/mempolicy.c:2072
[< inline >] alloc_pages include/linux/gfp.h:469
[<ffffffff8172fd8f>] kmalloc_order+0x1f/0x70 mm/slab_common.c:1015
[<ffffffff8172fdff>] kmalloc_order_trace+0x1f/0x160 mm/slab_common.c:1026
[< inline >] kmalloc_large include/linux/slab.h:422
[<ffffffff817e01f0>] __kmalloc+0x210/0x2d0 mm/slub.c:3723
[< inline >] kmalloc include/linux/slab.h:495
[<ffffffff832262a7>] ep_write_iter+0x167/0xb50 drivers/usb/gadget/legacy/inode.c:664
[< inline >] new_sync_write fs/read_write.c:499
[<ffffffff817fdcd3>] __vfs_write+0x483/0x760 fs/read_write.c:512
[<ffffffff817ff720>] vfs_write+0x170/0x4e0 fs/read_write.c:560
[< inline >] SYSC_write fs/read_write.c:607
[<ffffffff81803b2b>] SyS_write+0xfb/0x230 fs/read_write.c:599
[<ffffffff84f47ec1>] entry_SYSCALL_64_fastpath+0x1f/0xc2
The issue is caused by a lack of size check for the request size in
ep_write_iter which should be fixed. It, however, points to another
problem, that SLUB defines KMALLOC_MAX_SIZE too large because the its
KMALLOC_SHIFT_MAX is (MAX_ORDER + PAGE_SHIFT) which means that the
resulting page allocator request might be MAX_ORDER which is too large
(see __alloc_pages_slowpath). The same applies to the SLOB allocator
which allows even larger sizes. Make sure that they are capped properly
and never request more than MAX_ORDER order.
Reported-by: Andrey Konovalov <andreyknvl@google.com>
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Christoph Lameter <cl@linux.com>
---
include/linux/slab.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 084b12bad198..4c5363566815 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -226,7 +226,7 @@ static inline const char *__check_heap_object(const void *ptr,
* (PAGE_SIZE*2). Larger requests are passed to the page allocator.
*/
#define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1)
-#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT)
+#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1)
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW 3
#endif
@@ -239,7 +239,7 @@ static inline const char *__check_heap_object(const void *ptr,
* be allocated from the same page.
*/
#define KMALLOC_SHIFT_HIGH PAGE_SHIFT
-#define KMALLOC_SHIFT_MAX 30
+#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1)
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW 3
#endif
--
2.10.2
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related
* [PATCH 0/2 v2] mm, slab: consolidate KMALLOC_MAX_SIZE
From: Michal Hocko @ 2016-12-20 13:06 UTC (permalink / raw)
To: Andrew Morton
Cc: Cristopher Lameter, Alexei Starovoitov, Andrey Konovalov, netdev,
linux-mm, LKML
Hi,
this is the second version of the patchset previously posted here [1].
Alexei has insisted on the patches reordering which I've done in this
series. I've also updated the changelog of the second patch to mention
why KMALLOC_SHIFT_MAX has been used.
Andrey has revealed a discrepancy between KMALLOC_MAX_SIZE and the
maximum supported page allocator size [2]. The underlying problem
should be fixed in the ep_write_iter code of course, but I do not feel
qualified to do that. The discrepancy which it reveals (see patch 2)
is worth fixing anyway, though.
While I was looking into the code, I've noticed that the only code which
uses KMALLOC_SHIFT_MAX outside of the slab code is bpf so I've updated
it to use KMALLOC_MAX_SIZE instead. There shouldn't be any real reason
to use KMALLOC_SHIFT_MAX which is a slab internal constant same as
KMALLOC_SHIFT_{LOW,HIGH}
[1] http://lkml.kernel.org/r/20161215164722.21586-1-mhocko@kernel.org
[2] http://lkml.kernel.org/r/CAAeHK+ztusS68DejO8AH3nn-EfiYQpD5FmBwmqKG8BWvoqPNqQ@mail.gm
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply
* [PATCH] stmmac: enable rx queues
From: Joao Pinto @ 2016-12-20 12:55 UTC (permalink / raw)
To: peppe.cavallaro, davem
Cc: hock.leong.kweh, niklas.cassel, pavel, linux-kernel, netdev,
Joao Pinto
When the hardware is synthesized with multiple queues, all queues are
disabled for default. This patch adds the rx queues configuration.
This patch was successfully tested in a Synopsys QoS Reference design.
Signed-off-by: Joao Pinto <jpinto@synopsys.com>
---
drivers/net/ethernet/stmicro/stmmac/common.h | 2 ++
drivers/net/ethernet/stmicro/stmmac/dwmac4.h | 4 ++++
drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 11 +++++++++++
drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 21 +++++++++++++++++++++
4 files changed, 38 insertions(+)
diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index b13a144..61bab50 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -454,6 +454,8 @@ struct stmmac_ops {
void (*core_init)(struct mac_device_info *hw, int mtu);
/* Enable and verify that the IPC module is supported */
int (*rx_ipc)(struct mac_device_info *hw);
+ /* Enable RX Queues */
+ void (*rx_queue_enable)(struct mac_device_info *hw, u32 queue);
/* Dump MAC registers */
void (*dump_regs)(struct mac_device_info *hw);
/* Handle extra events on specific interrupts hw dependent */
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
index 3e8d4fe..fd013bd 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
@@ -22,6 +22,7 @@
#define GMAC_HASH_TAB_32_63 0x00000014
#define GMAC_RX_FLOW_CTRL 0x00000090
#define GMAC_QX_TX_FLOW_CTRL(x) (0x70 + x * 4)
+#define GMAC_RXQ_CTRL0 0x000000a0
#define GMAC_INT_STATUS 0x000000b0
#define GMAC_INT_EN 0x000000b4
#define GMAC_PCS_BASE 0x000000e0
@@ -44,6 +45,9 @@
#define GMAC_MAX_PERFECT_ADDRESSES 128
+/* MAC RX Queue Enable*/
+#define GMAC_RX_QUEUE_ENABLE(queue) BIT(queue * 2)
+
/* MAC Flow Control RX */
#define GMAC_RX_FLOW_CTRL_RFE BIT(0)
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
index eaed7cb..7ec1887 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
@@ -59,6 +59,16 @@ static void dwmac4_core_init(struct mac_device_info *hw, int mtu)
writel(value, ioaddr + GMAC_INT_EN);
}
+static void dwmac4_rx_queue_enable(struct mac_device_info *hw, u32 queue)
+{
+ void __iomem *ioaddr = hw->pcsr;
+ u32 value = readl(ioaddr + GMAC_RXQ_CTRL0);
+
+ value |= GMAC_RX_QUEUE_ENABLE(queue);
+
+ writel(value, ioaddr + GMAC_RXQ_CTRL0);
+}
+
static void dwmac4_dump_regs(struct mac_device_info *hw)
{
void __iomem *ioaddr = hw->pcsr;
@@ -392,6 +402,7 @@ static void dwmac4_debug(void __iomem *ioaddr, struct stmmac_extra_stats *x)
static const struct stmmac_ops dwmac4_ops = {
.core_init = dwmac4_core_init,
.rx_ipc = dwmac4_rx_ipc_enable,
+ .rx_queue_enable = dwmac4_rx_queue_enable,
.dump_regs = dwmac4_dump_regs,
.host_irq_status = dwmac4_irq_status,
.flow_ctrl = dwmac4_flow_ctrl,
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 3e40578..e30034d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1271,6 +1271,24 @@ static void free_dma_desc_resources(struct stmmac_priv *priv)
}
/**
+ * stmmac_mac_enable_rx_queues - Enable MAC rx queues
+ * @priv: driver private structure
+ * Description: It is used for enabling the rx queues in the MAC
+ */
+static void stmmac_mac_enable_rx_queues(struct stmmac_priv *priv)
+{
+ int rx_count = priv->dma_cap.number_rx_channel;
+ int queue = 0;
+
+ /* If GMAC does not have multiqueues, then this is not necessary*/
+ if (rx_count == 1)
+ return;
+
+ for (queue = 0; queue < rx_count; queue++)
+ priv->hw->mac->rx_queue_enable(priv->hw, queue);
+}
+
+/**
* stmmac_dma_operation_mode - HW DMA operation mode
* @priv: driver private structure
* Description: it is used for configuring the DMA operation mode register in
@@ -1691,6 +1709,9 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp)
/* Initialize the MAC Core */
priv->hw->mac->core_init(priv->hw, dev->mtu);
+ /* Initialize MAC RX Queues */
+ stmmac_mac_enable_rx_queues(priv);
+
ret = priv->hw->mac->rx_ipc(priv->hw);
if (!ret) {
netdev_warn(priv->dev, "RX IPC Checksum Offload disabled\n");
--
2.9.3
^ permalink raw reply related
* Re: [PATCH net] virtio_net: reject XDP programs using header adjustment
From: Jakub Kicinski @ 2016-12-20 12:30 UTC (permalink / raw)
To: John Fastabend; +Cc: netdev, kafai, Daniel Borkmann, alexei.starovoitov, mst
In-Reply-To: <58581E01.7070902@gmail.com>
On Mon, Dec 19, 2016 at 5:50 PM, John Fastabend
<john.fastabend@gmail.com> wrote:
> On 16-12-19 07:05 AM, Jakub Kicinski wrote:
>> commit 17bedab27231 ("bpf: xdp: Allow head adjustment in XDP prog")
>> added a new XDP helper to prepend and remove data from a frame.
>> Make virtio_net reject programs making use of this helper until
>> proper support is added.
>>
>> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
>> ---
>> drivers/net/virtio_net.c | 5 +++++
>> 1 file changed, 5 insertions(+)
>>
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index 08327e005ccc..db761f37783e 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -1677,6 +1677,11 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
>> u16 xdp_qp = 0, curr_qp;
>> int i, err;
>>
>> + if (prog && prog->xdp_adjust_head) {
>> + netdev_warn(dev, "Does not support bpf_xdp_adjust_head()\n");
>> + return -EOPNOTSUPP;
>> + }
>> +
>> if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
>> virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6)) {
>> netdev_warn(dev, "can't set XDP while host is implementing LRO, disable LRO first\n");
>>
>
> Acked-by: John Fastabend <john.r.fastabend@intel.com>
>
> Thanks patch looks good. Alternatively we could push a "bug fix" to
> support the adjust header feature depending on how DaveM and MST feel
> about that. I don't have a strong opinion but I have the patch on my
> queue it just needs some more testing.
Cool! I thought to ask you what your plans are but then this patch is so
trivial I decided to just post it :) I'm perfectly happy with dropping it
for now and reposting after ~rc5 if needed.
^ permalink raw reply
* Re: mlx4: Bug in XDP_TX + 16 rx-queues
From: Tariq Toukan @ 2016-12-20 12:02 UTC (permalink / raw)
To: Martin KaFai Lau, Tariq Toukan
Cc: Saeed Mahameed, netdev@vger.kernel.org, Alexei Starovoitov
In-Reply-To: <20161219233709.GA29858@kafai-mba.local>
Thanks Martin, nice catch!
On 20/12/2016 1:37 AM, Martin KaFai Lau wrote:
> Hi Tariq,
>
> On Sat, Dec 17, 2016 at 02:18:03AM -0800, Martin KaFai Lau wrote:
>> Hi All,
>>
>> I have been debugging with XDP_TX and 16 rx-queues.
>>
>> 1) When 16 rx-queues is used and an XDP prog is doing XDP_TX,
>> it seems that the packet cannot be XDP_TX out if the pkt
>> is received from some particular CPUs (/rx-queues).
>>
>> 2) If 8 rx-queues is used, it does not have problem.
>>
>> 3) The 16 rx-queues problem also went away after reverting these
>> two patches:
>> 15fca2c8eb41 net/mlx4_en: Add ethtool statistics for XDP cases
>> 67f8b1dcb9ee net/mlx4_en: Refactor the XDP forwarding rings scheme
>>
> After taking a closer look at 67f8b1dcb9ee ("net/mlx4_en: Refactor the XDP forwarding rings scheme")
> and armed with the fact that '>8 rx-queues does not work', I have
> made the attached change that fixed the issue.
>
> Making change in mlx4_en_fill_qp_context() could be an easier fix
> but I think this change will be easier for discussion purpose.
>
> I don't want to lie that I know anything about how this variable
> works in CX3. If this change makes sense, I can cook up a diff.
> Otherwise, can you shed some light on what could be happening
> and hopefully can lead to a diff?
>
> Thanks
> --Martin
>
>
> diff --git i/drivers/net/ethernet/mellanox/mlx4/en_netdev.c w/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> index bcd955339058..b3bfb987e493 100644
> --- i/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> +++ w/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> @@ -1638,10 +1638,10 @@ int mlx4_en_start_port(struct net_device *dev)
>
> /* Configure tx cq's and rings */
> for (t = 0 ; t < MLX4_EN_NUM_TX_TYPES; t++) {
> - u8 num_tx_rings_p_up = t == TX ? priv->num_tx_rings_p_up : 1;
The bug lies in this line.
Number of rings per UP in case of TX_XDP should be
priv->tx_ring_num[TX_XDP ], not 1.
Please try the following fix.
I can prepare and send it once the window opens again.
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index bcd955339058..edbe200ac2fa 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -1638,7 +1638,8 @@ int mlx4_en_start_port(struct net_device *dev)
/* Configure tx cq's and rings */
for (t = 0 ; t < MLX4_EN_NUM_TX_TYPES; t++) {
- u8 num_tx_rings_p_up = t == TX ? priv->num_tx_rings_p_up
: 1;
+ u8 num_tx_rings_p_up = t == TX ?
+ priv->num_tx_rings_p_up : priv->tx_ring_num[t];
for (i = 0; i < priv->tx_ring_num[t]; i++) {
/* Configure cq */
> -
> for (i = 0; i < priv->tx_ring_num[t]; i++) {
> /* Configure cq */
> + int user_prio;
> +
> cq = priv->tx_cq[t][i];
> err = mlx4_en_activate_cq(priv, cq, i);
> if (err) {
> @@ -1660,9 +1660,14 @@ int mlx4_en_start_port(struct net_device *dev)
>
> /* Configure ring */
> tx_ring = priv->tx_ring[t][i];
> + if (t != TX_XDP)
> + user_prio = i / priv->num_tx_rings_p_up;
> + else
> + user_prio = i & 0x07;
> +
> err = mlx4_en_activate_tx_ring(priv, tx_ring,
> cq->mcq.cqn,
> - i / num_tx_rings_p_up);
> + user_prio);
> if (err) {
> en_err(priv, "Failed allocating Tx ring\n");
> mlx4_en_deactivate_cq(priv, cq);
Regards,
Tariq Toukan.
^ permalink raw reply related
* Re: [PATCH net-next] ixgbevf: fix 'Etherleak' in ixgbevf
From: Weilong Chen @ 2016-12-20 11:50 UTC (permalink / raw)
To: Alexander Duyck
Cc: Jeff Kirsher, intel-wired-lan, Netdev,
linux-kernel@vger.kernel.org, wangkefeng.wang
In-Reply-To: <CAKgT0UfA9VG4NQZLLxXvobtfp22yPpPssSM4Ge94KFa1RVnhyg@mail.gmail.com>
Hi,
Thanks for you reply.
We test you patch, but the problem is still there, it seems do not work.
I'm not sure why ixgbe use the limit 17. The kenel use ETH_ZLEN (60)
with out FCS. A lot of drivers such as e1000 use it. Any explaination?
Thanks.
On 2016/12/16 0:13, Alexander Duyck wrote:
> On Thu, Dec 15, 2016 at 3:40 AM, Weilong Chen <chenweilong@huawei.com> wrote:
>> Nessus report the vf appears to leak memory in network packets.
>> Fix this by padding all small packets manually.
>>
>> And the CVE-2003-0001.
>> https://ofirarkin.files.wordpress.com/2008/11/atstake_etherleak_report.pdf
>>
>> Signed-off-by: Weilong Chen <chenweilong@huawei.com>
>> ---
>> drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 7 +++++++
>> 1 file changed, 7 insertions(+)
>>
>> diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
>> index 6d4bef5..137a154 100644
>> --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
>> +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
>> @@ -3654,6 +3654,13 @@ static int ixgbevf_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
>> return NETDEV_TX_OK;
>> }
>>
>> + /* On PCI/PCI-X HW, if packet size is less than ETH_ZLEN,
>> + * packets may get corrupted during padding by HW.
>> + * To WA this issue, pad all small packets manually.
>> + */
>> + if (eth_skb_pad(skb))
>> + return NETDEV_TX_OK;
>> +
>
> So the patch description for this probably isn't correct. It looks
> like the problem isn't leaking data it is the fact that the frames
> aren't being padded to prevent malicious events. The only issue is
> the patch is padding by a bit too much. I would recommend replacing
> this with the following from ixgbe:
>
> /*
> * The minimum packet size for olinfo paylen is 17 so pad the skb
> * in order to meet this minimum size requirement.
> */
> if (skb_put_padto(skb, 17))
> return NETDEV_TX_OK;
>
>
>> tx_ring = adapter->tx_ring[skb->queue_mapping];
>>
>> /* need: 1 descriptor per page * PAGE_SIZE/IXGBE_MAX_DATA_PER_TXD,
>> --
>> 1.7.12
>>
>
> .
>
^ permalink raw reply
* Re: wl1251 & mac address & calibration data
From: Kalle Valo @ 2016-12-20 11:47 UTC (permalink / raw)
To: Arend Van Spriel
Cc: Pali Rohár, Daniel Wagner, Luis R. Rodriguez, Tom Gundersen,
Johannes Berg, Ming Lei, Mimi Zohar, Bjorn Andersson,
Rafał Miłecki, Sebastian Reichel, Pavel Machek,
Michal Kazior, Ivaylo Dimitrov, Aaro Koskinen, Tony Lindgren,
linux-wireless, Network Development, linux-kernel@vger.kernel.org
In-Reply-To: <fd180271-1cd4-9891-e3d5-ae9cd0fb088b@broadcom.com>
Arend Van Spriel <arend.vanspriel@broadcom.com> writes:
> On 18-12-2016 13:09, Pali Rohár wrote:
>
>> File wl1251-nvs.bin is provided by linux-firmware package and contains
>> default data which should be overriden by model specific calibrated
>> data.
>
> Ah. Someone thought it was a good idea to provide the "one ring to rule
> them all". Nice.
Yes, that was a bad idea. wl1251-nvs.bin in linux-firmware.git should be
renamed to wl1251-nvs.bin.example, or something like that, as it should
be only installed to a real system only if there's no real calibration
data available (only for developers to use, not real users).
>> But overwriting that one file is not possible as it next update of
>> linux-firmware package will overwrite it back. It break any normal usage
>> of package management.
>>
>> Also it is ridiculously broken by design if some "boot" files needs to
>> be overwritten to initialize hardware properly. To not break booting you
>> need to overwrite that file before first boot. But without booting
>> device you cannot read calibration data. So some hack with autoreboot
>> after boot is needed.
Providing the calibration data via Device Tree is the proper way to
solve this. Yes yes, I know N900 doesn't support it but that's a
deficiency in N900, not Linux.
>> And how to detect that we have real overwritten calibration data and
>> not default one from linux-firmware? Any heuristic or checks will be
>> broken here. And no, nothing like you need to reboot your device now
>> (and similar concept) from windows world is not accepted.
>
> Well. After reading and creating calibration data you could just rebind
> the driver to the device to have it probed again.
Or load wl1251 as a module and make sure calibration data is installed
before the module is loaded. LEDE does that with ath10k:
https://git.lede-project.org/?p=source.git;a=blob;f=target/linux/ar71xx/base-files/etc/hotplug.d/firmware/11-ath10k-caldata;h=97875bd79a579a0010da3f60324b6ec966fe9c6a;hb=HEAD
> But yeah, the default one from linux-firmware should never have been
> there in the first place.
Agreed.
--
Kalle Valo
^ permalink raw reply
* Re: Soft lockup in tc_classify
From: Daniel Borkmann @ 2016-12-20 11:47 UTC (permalink / raw)
To: Shahar Klein, Cong Wang
Cc: Or Gerlitz, Linux Netdev List, Roi Dayan, David Miller,
Jiri Pirko, John Fastabend, Hadar Hen Zion
In-Reply-To: <5a985705-11e5-1575-a049-723accb97608@mellanox.com>
Hi Shahar,
On 12/20/2016 07:22 AM, Shahar Klein wrote:
> On 12/19/2016 7:58 PM, Cong Wang wrote:
>> On Mon, Dec 19, 2016 at 8:39 AM, Shahar Klein <shahark@mellanox.com> wrote:
>>> On 12/13/2016 12:51 AM, Cong Wang wrote:
>>>>
>>>> On Mon, Dec 12, 2016 at 1:18 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>>>>
>>>>> On Mon, Dec 12, 2016 at 3:28 PM, Daniel Borkmann <daniel@iogearbox.net>
>>>>> wrote:
>>>>>
>>>>>> Note that there's still the RCU fix missing for the deletion race that
>>>>>> Cong will still send out, but you say that the only thing you do is to
>>>>>> add a single rule, but no other operation in involved during that test?
>>>>>
>>>>> What's missing to have the deletion race fixed? making a patch or
>>>>> testing to a patch which was sent?
>>>>
>>>> If you think it would help for this problem, here is my patch rebased
>>>> on the latest net-next.
>>>>
>>>> Again, I don't see how it could help this case yet, especially I don't
>>>> see how we could have a loop in this singly linked list.
>>>
>>> I've applied cong's patch and hit a different lockup(full log attached):
>>
>> Are you sure this is really different? For me, it is still inside the loop
>> in tc_classify(), with only a slightly different offset.
>>
>>>
>>> Daniel suggested I'll add a print:
>>> case RTM_DELTFILTER:
>>> - err = tp->ops->delete(tp, fh);
>>> + printk(KERN_ERR "DEBUGG:SK %s:%d\n", __func__, __LINE__);
>>> + err = tp->ops->delete(tp, fh, &last);
>>> if (err == 0) {
>>>
>>> and I couldn't see this print in the output.....
>>
>> Hmm, that is odd, if this never prints, then my patch should not make any
>> difference.
>>
>> There are still two other cases where we could change tp->next, so do you
>> mind to add two more printk's for debugging?
>>
>> Attached is the delta patch.
>>
>> Thanks!
>
> I've added a slightly different debug print:
> @@ -368,11 +375,12 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
> if (tp_created) {
> RCU_INIT_POINTER(tp->next, rtnl_dereference(*back));
> rcu_assign_pointer(*back, tp);
> + printk(KERN_ERR "DEBUGG:SK add/change filter by: %pf tp=%p tp->next=%p\n", tp->ops->get, tp, tp->next);
> }
> tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER, false);
I'm curious, could you be a bit more verbose why you didn't go with Cong's
debug patch?
In particular, why you removed the hunk from the condition 'n->nlmsg_type ==
RTM_DELTFILTER && t->tcm_handle == 0' where we delete the whole tp instance?
Is it because if you have that printk() there, then the issue doesn't trigger
for you anymore? Or any other reason?
How many CPUs does your test machine have, I suspect more than 1, right?
So iff RTM_DELTFILTER with tcm_handle of 0 really played a role in this, I'm
wondering whether there was a subtle deletion + add race where the newly added
filter on the other CPU still saw a stale pointer in the list. But just a wild
guess at this point.
Hmm, could you try this below to see whether the issue still appears?
Thanks,
Daniel
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 3fbba79..4eee1cb 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -317,7 +317,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
struct tcf_proto *next = rtnl_dereference(tp->next);
- RCU_INIT_POINTER(*back, next);
+ rcu_assign_pointer(*back, next);
tfilter_notify(net, skb, n, tp, fh,
RTM_DELTFILTER, false);
> full output attached:
>
> [ 283.290271] Mirror/redirect action on
> [ 283.305031] DEBUGG:SK add/change filter by: fl_get [cls_flower] tp=ffff9432d704df60 tp->next= (null)
> [ 283.322563] DEBUGG:SK add/change filter by: fl_get [cls_flower] tp=ffff9436e718d240 tp->next= (null)
> [ 283.359997] GACT probability on
> [ 283.365923] DEBUGG:SK add/change filter by: fl_get [cls_flower] tp=ffff9436e718d3c0 tp->next=ffff9436e718d240
> [ 283.378725] DEBUGG:SK add/change filter by: fl_get [cls_flower] tp=ffff9436e718d3c0 tp->next=ffff9436e718d3c0
> [ 283.391310] DEBUGG:SK add/change filter by: fl_get [cls_flower] tp=ffff9436e718d3c0 tp->next=ffff9436e718d3c0
> [ 283.403923] DEBUGG:SK add/change filter by: fl_get [cls_flower] tp=ffff9436e718d3c0 tp->next=ffff9436e718d3c0
> [ 283.416542] DEBUGG:SK add/change filter by: fl_get [cls_flower] tp=ffff9436e718d3c0 tp->next=ffff9436e718d3c0
> [ 308.538571] NMI watchdog: BUG: soft lockup - CPU#0 stuck for 23s! [swapper/0:0]
>
> Thanks
> Shahar
^ permalink raw reply related
* [PATCH][V2] qed: fix memory leak of a qed_spq_entry on error failure paths
From: Colin King @ 2016-12-20 11:44 UTC (permalink / raw)
To: Yuval Mintz, Ariel Elior, everest-linux-l2, netdev; +Cc: linux-kernel
From: Colin Ian King <colin.king@canonical.com>
A qed_spq_entry entry is allocated by qed_sp_init_request but is not
kfree'd if an error occurs, causing a memory leak. Fix this by
returning the previously allocated spq entry and also setting *pp_ent
to NULL to be safe.
Thanks to Yuval Mintz for suggestions on how to improve my original
fix.
Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
drivers/net/ethernet/qlogic/qed/qed_sp_commands.c | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index a39ef2e..d2034fa 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -55,8 +55,10 @@ int qed_sp_init_request(struct qed_hwfn *p_hwfn,
break;
case QED_SPQ_MODE_BLOCK:
- if (!p_data->p_comp_data)
- return -EINVAL;
+ if (!p_data->p_comp_data) {
+ rc = -EINVAL;
+ goto err;
+ }
p_ent->comp_cb.cookie = p_data->p_comp_data->cookie;
break;
@@ -71,7 +73,8 @@ int qed_sp_init_request(struct qed_hwfn *p_hwfn,
default:
DP_NOTICE(p_hwfn, "Unknown SPQE completion mode %d\n",
p_ent->comp_mode);
- return -EINVAL;
+ rc = -EINVAL;
+ goto err;
}
DP_VERBOSE(p_hwfn, QED_MSG_SPQ,
@@ -85,6 +88,10 @@ int qed_sp_init_request(struct qed_hwfn *p_hwfn,
memset(&p_ent->ramrod, 0, sizeof(p_ent->ramrod));
return 0;
+err:
+ qed_spq_return_entry(p_hwfn, *pp_ent);
+ *pp_ent = NULL;
+ return rc;
}
static enum tunnel_clss qed_tunn_get_clss_type(u8 type)
--
2.10.2
^ permalink raw reply related
* Re: [PATCHv2 net 2/2] sctp: not copying duplicate addrs to the assoc's bind address list
From: Marcelo Ricardo Leitner @ 2016-12-20 11:30 UTC (permalink / raw)
To: Xin Long; +Cc: network dev, linux-sctp, davem, Neil Horman
In-Reply-To: <5a0037123617b525dfe456db1055770f39fb1193.1482212764.git.lucien.xin@gmail.com>
On Tue, Dec 20, 2016 at 01:49:50PM +0800, Xin Long wrote:
> sctp.local_addr_list is a global address list that is supposed to include
> all the local addresses. sctp updates this list according to NETDEV_UP/
> NETDEV_DOWN notifications.
>
> However, if multiple NICs have the same address, the global list would
> have duplicate addresses. Even if for one NIC, promote secondaries in
> __inet_del_ifa can also lead to accumulating duplicate addresses.
>
> When sctp binds address 'ANY' and creates a connection, it copies all
> the addresses from global list into asoc's bind addr list, which makes
> sctp pack the duplicate addresses into INIT/INIT_ACK packets.
>
> This patch is to filter the duplicate addresses when copying the addrs
> from global list in sctp_copy_local_addr_list and unpacking addr_param
> from cookie in sctp_raw_to_bind_addrs to asoc's bind addr list.
>
> Note that we can't filter the duplicate addrs when global address list
> gets updated, As NETDEV_DOWN event may remove an addr that still exists
> in another NIC.
>
> Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> ---
> net/sctp/bind_addr.c | 3 +++
> net/sctp/protocol.c | 3 +++
> 2 files changed, 6 insertions(+)
>
> diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
> index 401c607..1ebc184 100644
> --- a/net/sctp/bind_addr.c
> +++ b/net/sctp/bind_addr.c
> @@ -292,6 +292,8 @@ int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list,
> }
>
> af->from_addr_param(&addr, rawaddr, htons(port), 0);
> + if (sctp_bind_addr_state(bp, &addr) != -1)
> + goto next;
> retval = sctp_add_bind_addr(bp, &addr, sizeof(addr),
> SCTP_ADDR_SRC, gfp);
> if (retval) {
> @@ -300,6 +302,7 @@ int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list,
> break;
> }
>
> +next:
> len = ntohs(param->length);
> addrs_len -= len;
> raw_addr_list += len;
> diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
> index da5d82b..616a942 100644
> --- a/net/sctp/protocol.c
> +++ b/net/sctp/protocol.c
> @@ -220,6 +220,9 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp,
> !(copy_flags & SCTP_ADDR6_PEERSUPP)))
> continue;
>
> + if (sctp_bind_addr_state(bp, &addr->a) != -1)
> + continue;
> +
> error = sctp_add_bind_addr(bp, &addr->a, sizeof(addr->a),
> SCTP_ADDR_SRC, GFP_ATOMIC);
> if (error)
> --
> 2.1.0
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-sctp" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
^ permalink raw reply
* Re: [PATCHv2 net 1/2] sctp: reduce indent level in sctp_copy_local_addr_list
From: Marcelo Ricardo Leitner @ 2016-12-20 11:29 UTC (permalink / raw)
To: Xin Long; +Cc: network dev, linux-sctp, davem, Neil Horman
In-Reply-To: <b4d6428c565955e9e81e3b4623a6458e7567e593.1482212764.git.lucien.xin@gmail.com>
On Tue, Dec 20, 2016 at 01:49:49PM +0800, Xin Long wrote:
> This patch is to reduce indent level by using continue when the addr
> is not allowed, and also drop end_copy by using break.
>
> Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> ---
> net/sctp/protocol.c | 37 +++++++++++++++++++------------------
> 1 file changed, 19 insertions(+), 18 deletions(-)
>
> diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
> index 7b523e3..da5d82b 100644
> --- a/net/sctp/protocol.c
> +++ b/net/sctp/protocol.c
> @@ -205,26 +205,27 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp,
> list_for_each_entry_rcu(addr, &net->sctp.local_addr_list, list) {
> if (!addr->valid)
> continue;
> - if (sctp_in_scope(net, &addr->a, scope)) {
> - /* Now that the address is in scope, check to see if
> - * the address type is really supported by the local
> - * sock as well as the remote peer.
> - */
> - if ((((AF_INET == addr->a.sa.sa_family) &&
> - (copy_flags & SCTP_ADDR4_PEERSUPP))) ||
> - (((AF_INET6 == addr->a.sa.sa_family) &&
> - (copy_flags & SCTP_ADDR6_ALLOWED) &&
> - (copy_flags & SCTP_ADDR6_PEERSUPP)))) {
> - error = sctp_add_bind_addr(bp, &addr->a,
> - sizeof(addr->a),
> - SCTP_ADDR_SRC, GFP_ATOMIC);
> - if (error)
> - goto end_copy;
> - }
> - }
> + if (!sctp_in_scope(net, &addr->a, scope))
> + continue;
> +
> + /* Now that the address is in scope, check to see if
> + * the address type is really supported by the local
> + * sock as well as the remote peer.
> + */
> + if (addr->a.sa.sa_family == AF_INET &&
> + !(copy_flags & SCTP_ADDR4_PEERSUPP))
> + continue;
> + if (addr->a.sa.sa_family == AF_INET6 &&
> + (!(copy_flags & SCTP_ADDR6_ALLOWED) ||
> + !(copy_flags & SCTP_ADDR6_PEERSUPP)))
> + continue;
> +
> + error = sctp_add_bind_addr(bp, &addr->a, sizeof(addr->a),
> + SCTP_ADDR_SRC, GFP_ATOMIC);
> + if (error)
> + break;
> }
>
> -end_copy:
> rcu_read_unlock();
> return error;
> }
> --
> 2.1.0
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-sctp" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
^ permalink raw reply
* [PATCH] stmmac: CSR clock configuration fix
From: Joao Pinto @ 2016-12-20 11:21 UTC (permalink / raw)
To: peppe.cavallaro, davem
Cc: hock.leong.kweh, niklas.cassel, pavel, linux-kernel, netdev,
Joao Pinto
When testing stmmac with my QoS reference design I checked a problem in the
CSR clock configuration that was impossibilitating the phy discovery, since
every read operation returned 0x0000ffff. This patch fixes the issue.
Signed-off-by: Joao Pinto <jpinto@synopsys.com>
---
drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
index 23322fd..fda01f7 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
@@ -81,8 +81,8 @@ static int stmmac_mdio_read(struct mii_bus *bus, int phyaddr, int phyreg)
value |= (phyaddr << priv->hw->mii.addr_shift)
& priv->hw->mii.addr_mask;
value |= (phyreg << priv->hw->mii.reg_shift) & priv->hw->mii.reg_mask;
- value |= (priv->clk_csr & priv->hw->mii.clk_csr_mask)
- << priv->hw->mii.clk_csr_shift;
+ value |= (priv->clk_csr << priv->hw->mii.clk_csr_shift)
+ & priv->hw->mii.clk_csr_mask;
if (priv->plat->has_gmac4)
value |= MII_GMAC4_READ;
@@ -122,8 +122,8 @@ static int stmmac_mdio_write(struct mii_bus *bus, int phyaddr, int phyreg,
& priv->hw->mii.addr_mask;
value |= (phyreg << priv->hw->mii.reg_shift) & priv->hw->mii.reg_mask;
- value |= ((priv->clk_csr & priv->hw->mii.clk_csr_mask)
- << priv->hw->mii.clk_csr_shift);
+ value |= (priv->clk_csr << priv->hw->mii.clk_csr_shift)
+ & priv->hw->mii.clk_csr_mask;
if (priv->plat->has_gmac4)
value |= MII_GMAC4_WRITE;
--
2.9.3
^ permalink raw reply related
* [PATCHv2 net] netfilter: check duplicate config when initializing in ipt_CLUSTERIP
From: Xin Long @ 2016-12-20 11:14 UTC (permalink / raw)
To: network dev, netfilter-devel; +Cc: davem, pablo, Marcelo Ricardo Leitner
Now when adding an ipt_CLUSTERIP rule, it only checks duplicate config in
clusterip_config_find_get(). But after that, there may be still another
thread to insert a config with the same ip, then it leaves proc_create_data
to do duplicate check.
It's more reasonable to check duplicate config by ipt_CLUSTERIP itself,
instead of checking it by proc fs duplicate file check. Before, when proc
fs allowed duplicate name files in a directory, It could even crash kernel
because of use-after-free.
This patch is to check duplicate config under the protection of clusterip
net lock when initializing a new config and correct the return err.
Note that it also moves proc file node creation after adding new config, as
proc_create_data may sleep, it couldn't be called under the clusterip_net
lock. clusterip_config_find_get returns NULL if c->pde is null to make sure
it can't be used until the proc file node creation is done.
v1->v2:
correct the err clusterip_config_init returns.
Suggested-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
net/ipv4/netfilter/ipt_CLUSTERIP.c | 34 +++++++++++++++++++++++-----------
1 file changed, 23 insertions(+), 11 deletions(-)
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 21db00d..a6b8c1a 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -144,7 +144,7 @@ clusterip_config_find_get(struct net *net, __be32 clusterip, int entry)
rcu_read_lock_bh();
c = __clusterip_config_find(net, clusterip);
if (c) {
- if (unlikely(!atomic_inc_not_zero(&c->refcount)))
+ if (!c->pde || unlikely(!atomic_inc_not_zero(&c->refcount)))
c = NULL;
else if (entry)
atomic_inc(&c->entries);
@@ -166,14 +166,15 @@ clusterip_config_init_nodelist(struct clusterip_config *c,
static struct clusterip_config *
clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
- struct net_device *dev)
+ struct net_device *dev)
{
+ struct net *net = dev_net(dev);
struct clusterip_config *c;
- struct clusterip_net *cn = net_generic(dev_net(dev), clusterip_net_id);
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
c = kzalloc(sizeof(*c), GFP_ATOMIC);
if (!c)
- return NULL;
+ return ERR_PTR(-ENOMEM);
c->dev = dev;
c->clusterip = ip;
@@ -185,6 +186,17 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
atomic_set(&c->refcount, 1);
atomic_set(&c->entries, 1);
+ spin_lock_bh(&cn->lock);
+ if (__clusterip_config_find(net, ip)) {
+ spin_unlock_bh(&cn->lock);
+ kfree(c);
+
+ return ERR_PTR(-EBUSY);
+ }
+
+ list_add_rcu(&c->list, &cn->configs);
+ spin_unlock_bh(&cn->lock);
+
#ifdef CONFIG_PROC_FS
{
char buffer[16];
@@ -195,16 +207,16 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
cn->procdir,
&clusterip_proc_fops, c);
if (!c->pde) {
+ spin_lock_bh(&cn->lock);
+ list_del_rcu(&c->list);
+ spin_unlock_bh(&cn->lock);
kfree(c);
- return NULL;
+
+ return ERR_PTR(-ENOMEM);
}
}
#endif
- spin_lock_bh(&cn->lock);
- list_add_rcu(&c->list, &cn->configs);
- spin_unlock_bh(&cn->lock);
-
return c;
}
@@ -410,9 +422,9 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
config = clusterip_config_init(cipinfo,
e->ip.dst.s_addr, dev);
- if (!config) {
+ if (IS_ERR(config)) {
dev_put(dev);
- return -ENOMEM;
+ return PTR_ERR(config);
}
dev_mc_add(config->dev, config->clustermac);
}
--
2.1.0
^ permalink raw reply related
* Re: [PATCH net] netfilter: check duplicate config when initializing in ipt_CLUSTERIP
From: Xin Long @ 2016-12-20 11:14 UTC (permalink / raw)
To: Pablo Neira Ayuso
Cc: network dev, netfilter-devel, davem, Marcelo Ricardo Leitner
In-Reply-To: <20161220004803.GA14656@salvia>
On Tue, Dec 20, 2016 at 8:48 AM, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> On Thu, Dec 15, 2016 at 12:31:40PM +0800, Xin Long wrote:
>> @@ -185,6 +186,17 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
>> atomic_set(&c->refcount, 1);
>> atomic_set(&c->entries, 1);
>>
>> + spin_lock_bh(&cn->lock);
>> + if (__clusterip_config_find(net, ip)) {
>> + spin_unlock_bh(&cn->lock);
>> + kfree(c);
>> +
>> + return NULL;
>> + }
>
> This is going to result in ENOMEM error report to userspace on race,
> which can be confusing. Time for clusterip_config_init() to return
> PTR_ERR()?
will post v2 with PTR_ERR, thanks.
>
>> +
>> + list_add_rcu(&c->list, &cn->configs);
>> + spin_unlock_bh(&cn->lock);
>> +
>> #ifdef CONFIG_PROC_FS
>> {
>> char buffer[16];
^ permalink raw reply
* Re: Potential issues (security and otherwise) with the current cgroup-bpf API
From: Daniel Mack @ 2016-12-20 10:21 UTC (permalink / raw)
To: Andy Lutomirski, Alexei Starovoitov
Cc: Andy Lutomirski, Mickaël Salaün, Kees Cook, Jann Horn,
Tejun Heo, David Ahern, David S. Miller, Thomas Graf,
Michael Kerrisk, Peter Zijlstra, Linux API,
linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
Network Development
In-Reply-To: <CALCETrXymvAo-9zhQe=amToz_fs9XGniK2KLZv5Fxc66qcUx6A-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
Hi,
On 12/20/2016 04:50 AM, Andy Lutomirski wrote:
> On Mon, Dec 19, 2016 at 7:18 PM, Alexei Starovoitov
> <alexei.starovoitov-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
>> On Mon, Dec 19, 2016 at 04:25:32PM -0800, Andy Lutomirski wrote:
>>> I think we're still talking past each other. A big part of the point
>>> of changing it is that none of this is specific to bpf. You could (in
>>
>> the hooks and context passed into the program is very much bpf specific.
>> That's what I've been trying to convey all along.
>
> You mean BPF_CGROUP_RUN_PROG_INET_SOCK(sk)? There is nothing bpf
> specfic about the hook except that the name of this macro has "BPF" in
> it. There is nothing whatsoever that's bpf-specific about the context
> -- sk is not bpf-specific at all.
>
> The only thing bpf-specific about it is that it currently only invokes
> bpf programs. That could easily change.
I'm not sure if I follow. The code as it currently stands only supports
attaching bpf programs to cgroups which have been created using
BPF_PROG_LOAD. If cgroups would support other program types in the
future, then they would need to be stored in different data types
anyway, and the bpf syscall multiplexer would be the wrong entry point
to access them anyway.
Whether we add bpf-specific code to the cgroup file parsers or
cgroup-specific code to the bpf layer does not make much of a semantic
difference, does it? As a matter of fact, my very first implementation
of this patch set implemented a cgroup controller that would allow
writing strings like "ingress 5" to its control file, where 5 is the fd
number that came out of BPF_PROG_LOAD. The main reason we decided to
ditch that was that echoing fd numbers into a text file seemed way worse
than going through a proper syscall layer with it, and ioctls are
unavailable on pseudo-fs.
The idea was rather to allow attaching bpf programs to other things than
just cgroups as well, which is why we called the member of 'union
bpf_attr' 'target_fd', and a cgroup is just one type a target here.
>> i'm assuming 'baadf00d' is bpf program fd expressed a text string?
>> and kernel needs to parse above? will you allow capital and lower
>> case for 'bpf:' ? and mixed case too? spaces and tabs allowed or not?
>> can program fd expressed as decimal or hex or both?
>> how do you return the error? as a text string for user space
>> to parse?
>
> No. The kernel does not parse it because you cannot write this to the
> file. You set a bpf filter with ioctl and pass an fd.
An ioctl on what file, exactly?
> If you *read*
> the file, you get the same bpf program hash that fdinfo on the bpf
> object would show -- this is for debugging and (eventually) CRIU.
We need a debugging facility at some point, I agree to that. As the code
currently stands, that would rather need to go into the bpf(2) syscall
though, as setting a program through bpf(2) and reading it through
cgroupfs is really nasty.
>> so you're proposing to add a bunch of hard coded logic to the kernel.
>> First to parse such text into some sort of syntax tree or list/set
>> and then have hard coded logic specifically for these two use cases?
>> While above two can be implemented as trivial bpf programs already?!
>> That goes 180% degree vs bpf philosophy. bpf is about moving
>> the specific code out of the kernel and keeping kernel generic that
>> it can solve as many use cases as possible by being programmable.
>
> I'm not seriously proposing implementing these. My point is that
> *bpf*, while wonderful, is not the be-all-and-end-all of kernel
> configurability, and other types of hooks might want to be hooked in
> here.
Sure, but nobody claimed it to be that be-all-and-end-all thing. It's
just one thing that a cgroup is now able to accommodate, and because
that new feature is specific to bpf, we decided to hook up the uapi to
the bpf syscall.
> So if I set up a cgroup that's monitored and call it /cgroup/a and
> enable delegation and if the program running there wants to do its own
> monitoring in /cgroup/a/b (via delegation), then you really want the
> outer monitor to silently drop events coming from /cgroup/a/b?
That's a fair point, and we've discussed it as well. The issue is, as
Alexei already pointed out, that we do not want to traverse the tree up
to the root for nested cgroups due to the runtime costs in the
networking fast-path. After all, we're running the bpf program for each
packet in flight. Hence, we opted for the approach to only look at the
leaf node for now, with the ability to open it up further in the future
using flags during attach etc.
> The current approach to bpf hooks will bite you down the road. David
> Ahern is already proposing using it for something that is not tracing
> at all, and someone will want that in a container, and there will be a
> problem.
Hmm, I thought we've sorted out the concerns about that by making sure
that we
a) lock-down the API sufficiently so it doesn't cause any security
issues in its current form, and
b) make it possible to extend the functionality in the future by adding
flags to the command struct etc.
And I hoped we achieved that after discussing it for so long.
> How about slowing down a wee bit and trying to come up with cgroup
> hook semantics that work for all of these use cases?
I'm all for discussing things, but I don't this was done in a rush.
I do agree though that adding functionality to cgroups that is not
limited to resource control is a delicate thing to do, which is why I
cc'ed cgroups@ in my patches. I should have also added linux-api@ I
guess, sorry I missed that.
> I think my proposal is quite close to workable.
So let's talk about how to proceed. I've seen different bits of your
proposal in different mails, and I think a summary of it would help the
discussion.
Thanks,
Daniel
^ permalink raw reply
* [PATCH for bnxt_re V3 19/21] bnxt_re: Set uverbs command mask
From: Selvin Xavier @ 2016-12-20 9:13 UTC (permalink / raw)
To: dledford, linux-rdma
Cc: netdev, michael.chan, Selvin Xavier, Eddie Wai, Devesh Sharma,
Somnath Kotur, Sriharsha Basavapatna
In-Reply-To: <1482225211-22423-1-git-send-email-selvin.xavier@broadcom.com>
This patch exports available uverbs command mask to the IB stack.
Also, populates some of the missing parameters in the ibdev structure
used for registration.
Signed-off-by: Eddie Wai <eddie.wai@broadcom.com>
Signed-off-by: Devesh Sharma <devesh.sharma@broadcom.com>
Signed-off-by: Somnath Kotur <somnath.kotur@broadcom.com>
Signed-off-by: Sriharsha Basavapatna <sriharsha.basavapatna@broadcom.com>
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
---
drivers/infiniband/hw/bnxtre/bnxt_re_main.c | 35 +++++++++++++++++++++++++++++
1 file changed, 35 insertions(+)
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_re_main.c b/drivers/infiniband/hw/bnxtre/bnxt_re_main.c
index ba082f1..035e85a 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_re_main.c
+++ b/drivers/infiniband/hw/bnxtre/bnxt_re_main.c
@@ -61,6 +61,7 @@
#include "bnxt_qplib_rcfw.h"
#include "bnxt_re.h"
#include "bnxt_re_ib_verbs.h"
+#include <rdma/bnxtre-abi.h>
#include "bnxt.h"
static char version[] =
BNXT_RE_DESC " v" ROCE_DRV_MODULE_VERSION "\n";
@@ -435,8 +436,42 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
strlen(BNXT_RE_DESC) + 5);
ibdev->phys_port_cnt = 1;
+ bnxt_qplib_get_guid(rdev->netdev->dev_addr, (u8 *)&ibdev->node_guid);
+
ibdev->num_comp_vectors = 1;
ibdev->dma_device = &rdev->en_dev->pdev->dev;
+ ibdev->local_dma_lkey = BNXT_QPLIB_RSVD_LKEY;
+
+ /* User space */
+ ibdev->uverbs_abi_ver = BNXT_RE_ABI_VERSION;
+ ibdev->uverbs_cmd_mask =
+ (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+ (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_REG_MR) |
+ (1ull << IB_USER_VERBS_CMD_REREG_MR) |
+ (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_QP) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_AH) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_AH) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_AH) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_AH);
+ /* POLL_CQ and REQ_NOTIFY_CQ is directly handled in libbnxt_re */
+
+ /* Kernel verbs */
ibdev->query_device = bnxt_re_query_device;
ibdev->modify_device = bnxt_re_modify_device;
--
2.5.5
^ permalink raw reply related
* [PATCH for bnxt_re V3 18/21] bnxt_re: Support for DCB
From: Selvin Xavier @ 2016-12-20 9:13 UTC (permalink / raw)
To: dledford, linux-rdma
Cc: netdev, michael.chan, Selvin Xavier, Eddie Wai, Devesh Sharma,
Somnath Kotur, Sriharsha Basavapatna
In-Reply-To: <1482225211-22423-1-git-send-email-selvin.xavier@broadcom.com>
This patch queries the configured RoCE APP Priority on the host
using the dcbnl API and programs the RoCE FW with the corresponding
Traffic Class(es) for the priority.
v2: Fixed some sparse warning and cleanup of function
bnxt_re_query_hwrm_pri2cos
v3: Adds bnxt_qplib_map_tc2cos as a part of this patch. Uses
ROCE_V2_UDP_DPORT instead of BNXT_RE_ROCE_V2_PORT_NO.
Signed-off-by: Eddie Wai <eddie.wai@broadcom.com>
Signed-off-by: Devesh Sharma <devesh.sharma@broadcom.com>
Signed-off-by: Somnath Kotur <somnath.kotur@broadcom.com>
Signed-off-by: Sriharsha Basavapatna <sriharsha.basavapatna@broadcom.com>
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
---
drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.c | 37 +++++++
drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.h | 1 +
drivers/infiniband/hw/bnxtre/bnxt_re.h | 5 +
drivers/infiniband/hw/bnxtre/bnxt_re_main.c | 140 +++++++++++++++++++++++++++
4 files changed, 183 insertions(+)
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.c b/drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.c
index 4f77af8..aaafcf99 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.c
+++ b/drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.c
@@ -799,3 +799,40 @@ int bnxt_qplib_free_fast_reg_page_list(struct bnxt_qplib_res *res,
bnxt_qplib_free_hwq(res->pdev, &frpl->hwq);
return 0;
}
+
+int bnxt_qplib_map_tc2cos(struct bnxt_qplib_res *res, u16 *cids)
+{
+ struct bnxt_qplib_rcfw *rcfw = res->rcfw;
+ struct cmdq_map_tc_to_cos req;
+ struct creq_map_tc_to_cos_resp *resp;
+ u16 cmd_flags = 0;
+ int tleft;
+
+ RCFW_CMD_PREP(req, MAP_TC_TO_COS, cmd_flags);
+ req.cos0 = cpu_to_le16(cids[0]);
+ req.cos1 = cpu_to_le16(cids[1]);
+
+ resp = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, NULL, 0);
+ if (!resp) {
+ dev_err(&res->pdev->dev, "QPLIB: SP: MAP_TC2COS send failed");
+ return -EINVAL;
+ }
+
+ tleft = bnxt_qplib_rcfw_block_for_resp(rcfw, le16_to_cpu(req.cookie));
+ if (!tleft) {
+ dev_err(&res->pdev->dev, "QPLIB: SP: MAP_TC2COS timed out");
+ return -ETIMEDOUT;
+ }
+
+ if (resp->status ||
+ le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
+ dev_err(&res->pdev->dev, "QPLIB: SP: MAP_TC2COS failed ");
+ dev_err(&res->pdev->dev,
+ "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x",
+ resp->status, le16_to_cpu(req.cookie),
+ le16_to_cpu(resp->cookie));
+ return -EINVAL;
+ }
+
+ return 0;
+}
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.h b/drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.h
index 3358f6d..96437bb 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.h
+++ b/drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.h
@@ -156,4 +156,5 @@ int bnxt_qplib_alloc_fast_reg_page_list(struct bnxt_qplib_res *res,
struct bnxt_qplib_frpl *frpl, int max);
int bnxt_qplib_free_fast_reg_page_list(struct bnxt_qplib_res *res,
struct bnxt_qplib_frpl *frpl);
+int bnxt_qplib_map_tc2cos(struct bnxt_qplib_res *res, u16 *cids);
#endif /* __BNXT_QPLIB_SP_H__*/
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_re.h b/drivers/infiniband/hw/bnxtre/bnxt_re.h
index 6281e23..5565be8 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_re.h
+++ b/drivers/infiniband/hw/bnxtre/bnxt_re.h
@@ -45,6 +45,8 @@
#define BNXT_RE_REF_WAIT_COUNT 10
#define BNXT_RE_DESC "Broadcom NetXtreme-C/E RoCE Driver"
+#define BNXT_RE_ROCE_V1_ETH_TYPE 0x8915
+
#define BNXT_RE_PAGE_SIZE_4K BIT(12)
#define BNXT_RE_PAGE_SIZE_8K BIT(13)
#define BNXT_RE_PAGE_SIZE_64K BIT(16)
@@ -95,6 +97,9 @@ struct bnxt_re_dev {
int id;
+ struct delayed_work worker;
+ u8 cur_prio_map;
+
/* FP Notification Queue (CQ & SRQ) */
struct tasklet_struct nq_task;
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_re_main.c b/drivers/infiniband/hw/bnxtre/bnxt_re_main.c
index 3a45b53..ba082f1 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_re_main.c
+++ b/drivers/infiniband/hw/bnxtre/bnxt_re_main.c
@@ -44,6 +44,7 @@
#include <linux/rculist.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
+#include <net/dcbnl.h>
#include <net/ipv6.h>
#include <net/addrconf.h>
@@ -743,6 +744,50 @@ static void bnxt_re_dispatch_event(struct ib_device *ibdev, struct ib_qp *qp,
ib_dispatch_event(&ib_event);
}
+#define HWRM_QUEUE_PRI2COS_QCFG_INPUT_FLAGS_IVLAN 0x02
+static int bnxt_re_query_hwrm_pri2cos(struct bnxt_re_dev *rdev, u8 dir,
+ u64 *cid_map)
+{
+ struct hwrm_queue_pri2cos_qcfg_input req = {0};
+ struct bnxt *bp = netdev_priv(rdev->netdev);
+ struct hwrm_queue_pri2cos_qcfg_output resp;
+ struct bnxt_en_dev *en_dev = rdev->en_dev;
+ struct bnxt_fw_msg fw_msg;
+ u32 flags = 0;
+ u8 *qcfgmap, *tmp_map;
+ int rc = 0, i;
+
+ if (!cid_map)
+ return -EINVAL;
+
+ memset(&fw_msg, 0, sizeof(fw_msg));
+ bnxt_re_init_hwrm_hdr(rdev, (void *)&req,
+ HWRM_QUEUE_PRI2COS_QCFG, -1, -1);
+ flags |= (dir & 0x01);
+ flags |= HWRM_QUEUE_PRI2COS_QCFG_INPUT_FLAGS_IVLAN;
+ req.flags = cpu_to_le32(flags);
+ req.port_id = bp->pf.port_id;
+
+ bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp,
+ sizeof(resp), DFLT_HWRM_CMD_TIMEOUT);
+ rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, BNXT_ROCE_ULP, &fw_msg);
+ if (rc)
+ return rc;
+
+ if (resp.queue_cfg_info) {
+ dev_warn(rdev_to_dev(rdev),
+ "Asymmetric cos queue configuration detected");
+ dev_warn(rdev_to_dev(rdev),
+ " on device, QoS may not be fully functional\n");
+ }
+ qcfgmap = &resp.pri0_cos_queue_id;
+ tmp_map = (u8 *)cid_map;
+ for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++)
+ tmp_map[i] = qcfgmap[i];
+
+ return rc;
+}
+
static bool bnxt_re_is_qp1_or_shadow_qp(struct bnxt_re_dev *rdev,
struct bnxt_re_qp *qp)
{
@@ -783,6 +828,80 @@ static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev, bool qp_wait)
}
}
+static u32 bnxt_re_get_priority_mask(struct bnxt_re_dev *rdev)
+{
+ u32 prio_map = 0, tmp_map = 0;
+ struct net_device *netdev;
+ struct dcb_app app;
+
+ netdev = rdev->netdev;
+
+ memset(&app, 0, sizeof(app));
+ app.selector = IEEE_8021QAZ_APP_SEL_ETHERTYPE;
+ app.protocol = BNXT_RE_ROCE_V1_ETH_TYPE;
+ tmp_map = dcb_ieee_getapp_mask(netdev, &app);
+ prio_map = tmp_map;
+
+ app.selector = IEEE_8021QAZ_APP_SEL_DGRAM;
+ app.protocol = ROCE_V2_UDP_DPORT;
+ tmp_map = dcb_ieee_getapp_mask(netdev, &app);
+ prio_map |= tmp_map;
+
+ if (!prio_map)
+ prio_map = -EFAULT;
+ return prio_map;
+}
+
+static void bnxt_re_parse_cid_map(u8 prio_map, u8 *cid_map, u16 *cosq)
+{
+ u16 prio;
+ u8 id;
+
+ for (prio = 0, id = 0; prio < 8; prio++) {
+ if (prio_map & (1 << prio)) {
+ cosq[id] = cid_map[prio];
+ id++;
+ if (id == 2) /* Max 2 tcs supported */
+ break;
+ }
+ }
+}
+
+static int bnxt_re_setup_qos(struct bnxt_re_dev *rdev)
+{
+ u8 prio_map = 0;
+ u64 cid_map;
+ int rc;
+
+ /* Get priority for roce */
+ rc = bnxt_re_get_priority_mask(rdev);
+ if (rc < 0)
+ return rc;
+ prio_map = (u8)rc;
+
+ if (prio_map == rdev->cur_prio_map)
+ return 0;
+ rdev->cur_prio_map = prio_map;
+ /* Get cosq id for this priority */
+ rc = bnxt_re_query_hwrm_pri2cos(rdev, 0, &cid_map);
+ if (rc) {
+ dev_warn(rdev_to_dev(rdev), "no cos for p_mask %x\n", prio_map);
+ return rc;
+ }
+ /* Parse CoS IDs for app priority */
+ bnxt_re_parse_cid_map(prio_map, (u8 *)&cid_map, rdev->cosq);
+
+ /* Config BONO. */
+ rc = bnxt_qplib_map_tc2cos(&rdev->qplib_res, rdev->cosq);
+ if (rc) {
+ dev_warn(rdev_to_dev(rdev), "no tc for cos{%x, %x}\n",
+ rdev->cosq[0], rdev->cosq[1]);
+ return rc;
+ }
+
+ return 0;
+}
+
static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev, bool lock_wait)
{
int i, rc;
@@ -794,6 +913,9 @@ static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev, bool lock_wait)
/* Cleanup ib dev */
bnxt_re_unregister_ib(rdev);
}
+ if (test_and_clear_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags))
+ cancel_delayed_work(&rdev->worker);
+
bnxt_re_cleanup_res(rdev);
bnxt_re_free_res(rdev, lock_wait);
@@ -836,6 +958,16 @@ static void bnxt_re_set_resource_limits(struct bnxt_re_dev *rdev)
rdev->dev_attr.tqm_alloc_reqs[i];
}
+/* worker thread for polling periodic events. Now used for QoS programming*/
+static void bnxt_re_worker(struct work_struct *work)
+{
+ struct bnxt_re_dev *rdev = container_of(work, struct bnxt_re_dev,
+ worker.work);
+
+ bnxt_re_setup_qos(rdev);
+ schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000));
+}
+
static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
{
int i, j, rc;
@@ -920,6 +1052,14 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
goto fail;
}
+ rc = bnxt_re_setup_qos(rdev);
+ if (rc)
+ pr_info("RoCE priority not yet configured\n");
+
+ INIT_DELAYED_WORK(&rdev->worker, bnxt_re_worker);
+ set_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags);
+ schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000));
+
/* Register ib dev */
rc = bnxt_re_register_ib(rdev);
if (rc) {
--
2.5.5
^ permalink raw reply related
* [PATCH for bnxt_re V3 15/21] bnxt_re: Support post_recv
From: Selvin Xavier @ 2016-12-20 9:13 UTC (permalink / raw)
To: dledford, linux-rdma
Cc: netdev, michael.chan, Selvin Xavier, Eddie Wai, Devesh Sharma,
Somnath Kotur, Sriharsha Basavapatna
In-Reply-To: <1482225211-22423-1-git-send-email-selvin.xavier@broadcom.com>
Enables the fastpath verb ib_post_recv.
v3: Fixes sparse warnings
Signed-off-by: Eddie Wai <eddie.wai@broadcom.com>
Signed-off-by: Devesh Sharma <devesh.sharma@broadcom.com>
Signed-off-by: Somnath Kotur <somnath.kotur@broadcom.com>
Signed-off-by: Sriharsha Basavapatna <sriharsha.basavapatna@broadcom.com>
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
---
drivers/infiniband/hw/bnxtre/bnxt_qplib_fp.c | 100 +++++++++++++++++++
drivers/infiniband/hw/bnxtre/bnxt_qplib_fp.h | 8 ++
drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.c | 123 ++++++++++++++++++++++++
drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.h | 2 +
drivers/infiniband/hw/bnxtre/bnxt_re_main.c | 2 +
5 files changed, 235 insertions(+)
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_qplib_fp.c b/drivers/infiniband/hw/bnxtre/bnxt_qplib_fp.c
index d1ccf66..eeafb2a 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_qplib_fp.c
+++ b/drivers/infiniband/hw/bnxtre/bnxt_qplib_fp.c
@@ -1097,6 +1097,37 @@ void *bnxt_qplib_get_qp1_sq_buf(struct bnxt_qplib_qp *qp,
return NULL;
}
+u32 bnxt_qplib_get_rq_prod_index(struct bnxt_qplib_qp *qp)
+{
+ struct bnxt_qplib_q *rq = &qp->rq;
+
+ return HWQ_CMP(rq->hwq.prod, &rq->hwq);
+}
+
+dma_addr_t bnxt_qplib_get_qp_buf_from_index(struct bnxt_qplib_qp *qp, u32 index)
+{
+ return (qp->rq_hdr_buf_map + index * qp->rq_hdr_buf_size);
+}
+
+void *bnxt_qplib_get_qp1_rq_buf(struct bnxt_qplib_qp *qp,
+ struct bnxt_qplib_sge *sge)
+{
+ struct bnxt_qplib_q *rq = &qp->rq;
+ u32 sw_prod;
+
+ memset(sge, 0, sizeof(*sge));
+
+ if (qp->rq_hdr_buf) {
+ sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq);
+ sge->addr = (dma_addr_t)(qp->rq_hdr_buf_map +
+ sw_prod * qp->rq_hdr_buf_size);
+ sge->lkey = 0xFFFFFFFF;
+ sge->size = qp->rq_hdr_buf_size;
+ return qp->rq_hdr_buf + sw_prod * sge->size;
+ }
+ return NULL;
+}
+
void bnxt_qplib_post_send_db(struct bnxt_qplib_qp *qp)
{
struct bnxt_qplib_q *sq = &qp->sq;
@@ -1346,6 +1377,75 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp,
return rc;
}
+void bnxt_qplib_post_recv_db(struct bnxt_qplib_qp *qp)
+{
+ struct bnxt_qplib_q *rq = &qp->rq;
+ struct dbr_dbr db_msg = { 0 };
+ u32 sw_prod;
+
+ sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq);
+ db_msg.index = cpu_to_le32((sw_prod << DBR_DBR_INDEX_SFT) &
+ DBR_DBR_INDEX_MASK);
+ db_msg.type_xid =
+ cpu_to_le32(((qp->id << DBR_DBR_XID_SFT) & DBR_DBR_XID_MASK) |
+ DBR_DBR_TYPE_RQ);
+
+ /* Flush the writes to HW Rx WQE before the ringing Rx DB */
+ wmb();
+ __iowrite64_copy(qp->dpi->dbr, &db_msg, sizeof(db_msg) / sizeof(u64));
+}
+
+int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp,
+ struct bnxt_qplib_swqe *wqe)
+{
+ struct bnxt_qplib_q *rq = &qp->rq;
+ struct rq_wqe *rqe, **rqe_ptr;
+ struct sq_sge *hw_sge;
+ u32 sw_prod;
+ int i, rc = 0;
+
+ if (qp->state == CMDQ_MODIFY_QP_NEW_STATE_ERR) {
+ dev_err(&rq->hwq.pdev->dev,
+ "QPLIB: FP: QP (0x%x) is in the 0x%x state",
+ qp->id, qp->state);
+ rc = -EINVAL;
+ goto done;
+ }
+ if (HWQ_CMP((rq->hwq.prod + 1), &rq->hwq) ==
+ HWQ_CMP(rq->hwq.cons, &rq->hwq)) {
+ dev_err(&rq->hwq.pdev->dev,
+ "QPLIB: FP: QP (0x%x) RQ is full!", qp->id);
+ rc = -EINVAL;
+ goto done;
+ }
+ sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq);
+ rq->swq[sw_prod].wr_id = wqe->wr_id;
+
+ rqe_ptr = (struct rq_wqe **)rq->hwq.pbl_ptr;
+ rqe = &rqe_ptr[RQE_PG(sw_prod)][RQE_IDX(sw_prod)];
+
+ memset(rqe, 0, BNXT_QPLIB_MAX_RQE_ENTRY_SIZE);
+
+ /* Calculate wqe_size16 and data_len */
+ for (i = 0, hw_sge = (struct sq_sge *)rqe->data;
+ i < wqe->num_sge; i++, hw_sge++) {
+ hw_sge->va_or_pa = cpu_to_le64(wqe->sg_list[i].addr);
+ hw_sge->l_key = cpu_to_le32(wqe->sg_list[i].lkey);
+ hw_sge->size = cpu_to_le32(wqe->sg_list[i].size);
+ }
+ rqe->wqe_type = wqe->type;
+ rqe->flags = wqe->flags;
+ rqe->wqe_size = wqe->num_sge +
+ ((offsetof(typeof(*rqe), data) + 15) >> 4);
+
+ /* Supply the rqe->wr_id index to the wr_id_tbl for now */
+ rqe->wr_id[0] = cpu_to_le32(sw_prod);
+
+ rq->hwq.prod++;
+done:
+ return rc;
+}
+
/* CQ */
/* Spinlock must be held */
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_qplib_fp.h b/drivers/infiniband/hw/bnxtre/bnxt_qplib_fp.h
index 0a87920..e160050 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_qplib_fp.h
+++ b/drivers/infiniband/hw/bnxtre/bnxt_qplib_fp.h
@@ -420,9 +420,17 @@ int bnxt_qplib_query_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp);
int bnxt_qplib_destroy_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp);
void *bnxt_qplib_get_qp1_sq_buf(struct bnxt_qplib_qp *qp,
struct bnxt_qplib_sge *sge);
+void *bnxt_qplib_get_qp1_rq_buf(struct bnxt_qplib_qp *qp,
+ struct bnxt_qplib_sge *sge);
+u32 bnxt_qplib_get_rq_prod_index(struct bnxt_qplib_qp *qp);
+dma_addr_t bnxt_qplib_get_qp_buf_from_index(struct bnxt_qplib_qp *qp,
+ u32 index);
void bnxt_qplib_post_send_db(struct bnxt_qplib_qp *qp);
int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp,
struct bnxt_qplib_swqe *wqe);
+void bnxt_qplib_post_recv_db(struct bnxt_qplib_qp *qp);
+int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp,
+ struct bnxt_qplib_swqe *wqe);
int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq);
int bnxt_qplib_destroy_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq);
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.c b/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.c
index ff406e9..20a66ec 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.c
+++ b/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.c
@@ -1637,6 +1637,51 @@ static int bnxt_re_build_qp1_send_v2(struct bnxt_re_qp *qp,
return rc;
}
+/* For the MAD layer, it only provides the recv SGE the size of
+ * ib_grh + MAD datagram. No Ethernet headers, Ethertype, BTH, DETH,
+ * nor RoCE iCRC. The Cu+ solution must provide buffer for the entire
+ * receive packet (334 bytes) with no VLAN and then copy the GRH
+ * and the MAD datagram out to the provided SGE.
+ */
+static int bnxt_re_build_qp1_shadow_qp_recv(struct bnxt_re_qp *qp,
+ struct ib_recv_wr *wr,
+ struct bnxt_qplib_swqe *wqe,
+ int payload_size)
+{
+ struct bnxt_qplib_sge ref, sge;
+ u32 rq_prod_index;
+ struct bnxt_re_sqp_entries *sqp_entry;
+
+ rq_prod_index = bnxt_qplib_get_rq_prod_index(&qp->qplib_qp);
+
+ if (bnxt_qplib_get_qp1_rq_buf(&qp->qplib_qp, &sge)) {
+ /* Create 1 SGE to receive the entire
+ * ethernet packet
+ */
+ /* Save the reference from ULP */
+ ref.addr = wqe->sg_list[0].addr;
+ ref.lkey = wqe->sg_list[0].lkey;
+ ref.size = wqe->sg_list[0].size;
+
+ sqp_entry = &qp->rdev->sqp_tbl[rq_prod_index];
+
+ /* SGE 1 */
+ wqe->sg_list[0].addr = sge.addr;
+ wqe->sg_list[0].lkey = sge.lkey;
+ wqe->sg_list[0].size = BNXT_QPLIB_MAX_QP1_RQ_HDR_SIZE_V2;
+ sge.size -= wqe->sg_list[0].size;
+
+ sqp_entry->sge.addr = ref.addr;
+ sqp_entry->sge.lkey = ref.lkey;
+ sqp_entry->sge.size = ref.size;
+ /* Store the wrid for reporting completion */
+ sqp_entry->wrid = wqe->wr_id;
+ /* change the wqe->wrid to table index */
+ wqe->wr_id = rq_prod_index;
+ }
+ return 0;
+}
+
static int is_ud_qp(struct bnxt_re_qp *qp)
{
return qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_UD;
@@ -1983,6 +2028,84 @@ int bnxt_re_post_send(struct ib_qp *ib_qp, struct ib_send_wr *wr,
return rc;
}
+static int bnxt_re_post_recv_shadow_qp(struct bnxt_re_dev *rdev,
+ struct bnxt_re_qp *qp,
+ struct ib_recv_wr *wr)
+{
+ struct bnxt_qplib_swqe wqe;
+ int rc = 0, payload_sz = 0;
+
+ memset(&wqe, 0, sizeof(wqe));
+ while (wr) {
+ /* House keeping */
+ memset(&wqe, 0, sizeof(wqe));
+
+ /* Common */
+ wqe.num_sge = wr->num_sge;
+ if (wr->num_sge > qp->qplib_qp.rq.max_sge) {
+ dev_err(rdev_to_dev(rdev),
+ "Limit exceeded for Receive SGEs");
+ rc = -EINVAL;
+ goto bad;
+ }
+ payload_sz = bnxt_re_build_sgl(wr->sg_list, wqe.sg_list,
+ wr->num_sge);
+ wqe.wr_id = wr->wr_id;
+ wqe.type = BNXT_QPLIB_SWQE_TYPE_RECV;
+
+ if (!rc)
+ rc = bnxt_qplib_post_recv(&qp->qplib_qp, &wqe);
+bad:
+ if (rc)
+ break;
+
+ wr = wr->next;
+ }
+ bnxt_qplib_post_recv_db(&qp->qplib_qp);
+ return rc;
+}
+
+int bnxt_re_post_recv(struct ib_qp *ib_qp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp);
+ struct bnxt_qplib_swqe wqe;
+ int rc = 0, payload_sz = 0;
+
+ while (wr) {
+ /* House keeping */
+ memset(&wqe, 0, sizeof(wqe));
+
+ /* Common */
+ wqe.num_sge = wr->num_sge;
+ if (wr->num_sge > qp->qplib_qp.rq.max_sge) {
+ dev_err(rdev_to_dev(qp->rdev),
+ "Limit exceeded for Receive SGEs");
+ rc = -EINVAL;
+ goto bad;
+ }
+
+ payload_sz = bnxt_re_build_sgl(wr->sg_list, wqe.sg_list,
+ wr->num_sge);
+ wqe.wr_id = wr->wr_id;
+ wqe.type = BNXT_QPLIB_SWQE_TYPE_RECV;
+
+ if (ib_qp->qp_type == IB_QPT_GSI)
+ rc = bnxt_re_build_qp1_shadow_qp_recv(qp, wr, &wqe,
+ payload_sz);
+ if (!rc)
+ rc = bnxt_qplib_post_recv(&qp->qplib_qp, &wqe);
+bad:
+ if (rc) {
+ *bad_wr = wr;
+ break;
+ }
+ wr = wr->next;
+ }
+ bnxt_qplib_post_recv_db(&qp->qplib_qp);
+ return rc;
+}
+
/* Completion Queues */
int bnxt_re_destroy_cq(struct ib_cq *ib_cq)
{
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.h b/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.h
index becdcdc..9f3dd49 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.h
+++ b/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.h
@@ -164,6 +164,8 @@ int bnxt_re_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
int bnxt_re_destroy_qp(struct ib_qp *qp);
int bnxt_re_post_send(struct ib_qp *qp, struct ib_send_wr *send_wr,
struct ib_send_wr **bad_send_wr);
+int bnxt_re_post_recv(struct ib_qp *qp, struct ib_recv_wr *recv_wr,
+ struct ib_recv_wr **bad_recv_wr);
struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
const struct ib_cq_init_attr *attr,
struct ib_ucontext *context,
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_re_main.c b/drivers/infiniband/hw/bnxtre/bnxt_re_main.c
index 03a34c0..adaa663 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_re_main.c
+++ b/drivers/infiniband/hw/bnxtre/bnxt_re_main.c
@@ -463,6 +463,8 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
ibdev->destroy_qp = bnxt_re_destroy_qp;
ibdev->post_send = bnxt_re_post_send;
+ ibdev->post_recv = bnxt_re_post_recv;
+
ibdev->create_cq = bnxt_re_create_cq;
ibdev->destroy_cq = bnxt_re_destroy_cq;
ibdev->req_notify_cq = bnxt_re_req_notify_cq;
--
2.5.5
^ permalink raw reply related
* [PATCH for bnxt_re V3 14/21] bnxt_re: Support post_send verb
From: Selvin Xavier @ 2016-12-20 9:13 UTC (permalink / raw)
To: dledford, linux-rdma
Cc: netdev, michael.chan, Selvin Xavier, Eddie Wai, Devesh Sharma,
Somnath Kotur, Sriharsha Basavapatna
In-Reply-To: <1482225211-22423-1-git-send-email-selvin.xavier@broadcom.com>
Enables the ib_post_send fastpath verb for posting Send work
requests on QPs.
v2: Fixed some sparse warnings
v3: Fixes endianness related warnings reported by sparse. Changes
some of the macros to inline functions.
Signed-off-by: Eddie Wai <eddie.wai@broadcom.com>
Signed-off-by: Devesh Sharma <devesh.sharma@broadcom.com>
Signed-off-by: Somnath Kotur <somnath.kotur@broadcom.com>
Signed-off-by: Sriharsha Basavapatna <sriharsha.basavapatna@broadcom.com>
---
drivers/infiniband/hw/bnxtre/bnxt_qplib_fp.c | 268 ++++++++++++
drivers/infiniband/hw/bnxtre/bnxt_qplib_fp.h | 8 +
drivers/infiniband/hw/bnxtre/bnxt_re.h | 4 +
drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.c | 544 +++++++++++++++++++++++-
drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.h | 2 +
drivers/infiniband/hw/bnxtre/bnxt_re_main.c | 1 +
6 files changed, 824 insertions(+), 3 deletions(-)
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_qplib_fp.c b/drivers/infiniband/hw/bnxtre/bnxt_qplib_fp.c
index 8fc5928..d1ccf66 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_qplib_fp.c
+++ b/drivers/infiniband/hw/bnxtre/bnxt_qplib_fp.c
@@ -1078,6 +1078,274 @@ int bnxt_qplib_destroy_qp(struct bnxt_qplib_res *res,
return 0;
}
+void *bnxt_qplib_get_qp1_sq_buf(struct bnxt_qplib_qp *qp,
+ struct bnxt_qplib_sge *sge)
+{
+ struct bnxt_qplib_q *sq = &qp->sq;
+ u32 sw_prod;
+
+ memset(sge, 0, sizeof(*sge));
+
+ if (qp->sq_hdr_buf) {
+ sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq);
+ sge->addr = (dma_addr_t)(qp->sq_hdr_buf_map +
+ sw_prod * qp->sq_hdr_buf_size);
+ sge->lkey = 0xFFFFFFFF;
+ sge->size = qp->sq_hdr_buf_size;
+ return qp->sq_hdr_buf + sw_prod * sge->size;
+ }
+ return NULL;
+}
+
+void bnxt_qplib_post_send_db(struct bnxt_qplib_qp *qp)
+{
+ struct bnxt_qplib_q *sq = &qp->sq;
+ struct dbr_dbr db_msg = { 0 };
+ u32 sw_prod;
+
+ sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq);
+
+ db_msg.index = cpu_to_le32((sw_prod << DBR_DBR_INDEX_SFT) &
+ DBR_DBR_INDEX_MASK);
+ db_msg.type_xid =
+ cpu_to_le32(((qp->id << DBR_DBR_XID_SFT) & DBR_DBR_XID_MASK) |
+ DBR_DBR_TYPE_SQ);
+ /* Flush all the WQE writes to HW */
+ wmb();
+ __iowrite64_copy(qp->dpi->dbr, &db_msg, sizeof(db_msg) / sizeof(u64));
+}
+
+int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp,
+ struct bnxt_qplib_swqe *wqe)
+{
+ struct bnxt_qplib_q *sq = &qp->sq;
+ struct bnxt_qplib_swq *swq;
+ struct sq_send *hw_sq_send_hdr, **hw_sq_send_ptr;
+ struct sq_sge *hw_sge;
+ u32 sw_prod;
+ u8 wqe_size16;
+ int i, rc = 0, data_len = 0, pkt_num = 0;
+ __le32 temp32;
+
+ if (qp->state != CMDQ_MODIFY_QP_NEW_STATE_RTS) {
+ rc = -EINVAL;
+ goto done;
+ }
+ if (HWQ_CMP((sq->hwq.prod + 1), &sq->hwq) ==
+ HWQ_CMP(sq->hwq.cons, &sq->hwq)) {
+ rc = -ENOMEM;
+ goto done;
+ }
+ sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq);
+ swq = &sq->swq[sw_prod];
+ swq->wr_id = wqe->wr_id;
+ swq->type = wqe->type;
+ swq->flags = wqe->flags;
+ if (qp->sig_type)
+ swq->flags |= SQ_SEND_FLAGS_SIGNAL_COMP;
+ swq->start_psn = sq->psn & BTH_PSN_MASK;
+
+ hw_sq_send_ptr = (struct sq_send **)sq->hwq.pbl_ptr;
+ hw_sq_send_hdr = &hw_sq_send_ptr[get_sqe_pg(sw_prod)]
+ [get_sqe_idx(sw_prod)];
+
+ memset(hw_sq_send_hdr, 0, BNXT_QPLIB_MAX_SQE_ENTRY_SIZE);
+
+ if (wqe->flags & BNXT_QPLIB_SWQE_FLAGS_INLINE) {
+ /* Copy the inline data */
+ if (wqe->inline_len > BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH) {
+ dev_warn(&sq->hwq.pdev->dev,
+ "QPLIB: Inline data length > 96 detected");
+ data_len = BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH;
+ } else {
+ data_len = wqe->inline_len;
+ }
+ memcpy(hw_sq_send_hdr->data, wqe->inline_data, data_len);
+ wqe_size16 = (data_len + 15) >> 4;
+ } else {
+ for (i = 0, hw_sge = (struct sq_sge *)hw_sq_send_hdr->data;
+ i < wqe->num_sge; i++, hw_sge++) {
+ hw_sge->va_or_pa = cpu_to_le64(wqe->sg_list[i].addr);
+ hw_sge->l_key = cpu_to_le32(wqe->sg_list[i].lkey);
+ hw_sge->size = cpu_to_le32(wqe->sg_list[i].size);
+ data_len += wqe->sg_list[i].size;
+ }
+ /* Each SGE entry = 1 WQE size16 */
+ wqe_size16 = wqe->num_sge;
+ }
+
+ /* Specifics */
+ switch (wqe->type) {
+ case BNXT_QPLIB_SWQE_TYPE_SEND:
+ if (qp->type == CMDQ_CREATE_QP1_TYPE_GSI) {
+ /* Assemble info for Raw Ethertype QPs */
+ struct sq_send_raweth_qp1 *sqe =
+ (struct sq_send_raweth_qp1 *)hw_sq_send_hdr;
+
+ sqe->wqe_type = wqe->type;
+ sqe->flags = wqe->flags;
+ sqe->wqe_size = wqe_size16 +
+ ((offsetof(typeof(*sqe), data) + 15) >> 4);
+ sqe->cfa_action = cpu_to_le16(wqe->rawqp1.cfa_action);
+ sqe->lflags = cpu_to_le16(wqe->rawqp1.lflags);
+ sqe->length = cpu_to_le32(data_len);
+ sqe->cfa_meta = cpu_to_le32((wqe->rawqp1.cfa_meta &
+ SQ_SEND_RAWETH_QP1_CFA_META_VLAN_VID_MASK) <<
+ SQ_SEND_RAWETH_QP1_CFA_META_VLAN_VID_SFT);
+
+ break;
+ }
+ /* else, just fall thru */
+ case BNXT_QPLIB_SWQE_TYPE_SEND_WITH_IMM:
+ case BNXT_QPLIB_SWQE_TYPE_SEND_WITH_INV:
+ {
+ struct sq_send *sqe = (struct sq_send *)hw_sq_send_hdr;
+
+ sqe->wqe_type = wqe->type;
+ sqe->flags = wqe->flags;
+ sqe->wqe_size = wqe_size16 +
+ ((offsetof(typeof(*sqe), data) + 15) >> 4);
+ sqe->inv_key_or_imm_data = cpu_to_le32(
+ wqe->send.inv_key);
+ if (qp->type == CMDQ_CREATE_QP_TYPE_UD) {
+ sqe->q_key = cpu_to_le32(wqe->send.q_key);
+ sqe->dst_qp = cpu_to_le32(
+ wqe->send.dst_qp & SQ_SEND_DST_QP_MASK);
+ sqe->length = cpu_to_le32(data_len);
+ sqe->avid = cpu_to_le32(wqe->send.avid &
+ SQ_SEND_AVID_MASK);
+ sq->psn = (sq->psn + 1) & BTH_PSN_MASK;
+ } else {
+ sqe->length = cpu_to_le32(data_len);
+ sqe->dst_qp = 0;
+ sqe->avid = 0;
+ if (qp->mtu)
+ pkt_num = (data_len + qp->mtu - 1) / qp->mtu;
+ if (!pkt_num)
+ pkt_num = 1;
+ sq->psn = (sq->psn + pkt_num) & BTH_PSN_MASK;
+ }
+ break;
+ }
+ case BNXT_QPLIB_SWQE_TYPE_RDMA_WRITE:
+ case BNXT_QPLIB_SWQE_TYPE_RDMA_WRITE_WITH_IMM:
+ case BNXT_QPLIB_SWQE_TYPE_RDMA_READ:
+ {
+ struct sq_rdma *sqe = (struct sq_rdma *)hw_sq_send_hdr;
+
+ sqe->wqe_type = wqe->type;
+ sqe->flags = wqe->flags;
+ sqe->wqe_size = wqe_size16 +
+ ((offsetof(typeof(*sqe), data) + 15) >> 4);
+ sqe->imm_data = cpu_to_le32(wqe->rdma.inv_key);
+ sqe->length = cpu_to_le32((u32)data_len);
+ sqe->remote_va = cpu_to_le64(wqe->rdma.remote_va);
+ sqe->remote_key = cpu_to_le32(wqe->rdma.r_key);
+ if (qp->mtu)
+ pkt_num = (data_len + qp->mtu - 1) / qp->mtu;
+ if (!pkt_num)
+ pkt_num = 1;
+ sq->psn = (sq->psn + pkt_num) & BTH_PSN_MASK;
+ break;
+ }
+ case BNXT_QPLIB_SWQE_TYPE_ATOMIC_CMP_AND_SWP:
+ case BNXT_QPLIB_SWQE_TYPE_ATOMIC_FETCH_AND_ADD:
+ {
+ struct sq_atomic *sqe = (struct sq_atomic *)hw_sq_send_hdr;
+
+ sqe->wqe_type = wqe->type;
+ sqe->flags = wqe->flags;
+ sqe->remote_key = cpu_to_le32(wqe->atomic.r_key);
+ sqe->remote_va = cpu_to_le64(wqe->atomic.remote_va);
+ sqe->swap_data = cpu_to_le64(wqe->atomic.swap_data);
+ sqe->cmp_data = cpu_to_le64(wqe->atomic.cmp_data);
+ if (qp->mtu)
+ pkt_num = (data_len + qp->mtu - 1) / qp->mtu;
+ if (!pkt_num)
+ pkt_num = 1;
+ sq->psn = (sq->psn + pkt_num) & BTH_PSN_MASK;
+ break;
+ }
+ case BNXT_QPLIB_SWQE_TYPE_LOCAL_INV:
+ {
+ struct sq_localinvalidate *sqe =
+ (struct sq_localinvalidate *)hw_sq_send_hdr;
+
+ sqe->wqe_type = wqe->type;
+ sqe->flags = wqe->flags;
+ sqe->inv_l_key = cpu_to_le32(wqe->local_inv.inv_l_key);
+
+ break;
+ }
+ case BNXT_QPLIB_SWQE_TYPE_FAST_REG_MR:
+ {
+ struct sq_fr_pmr *sqe = (struct sq_fr_pmr *)hw_sq_send_hdr;
+
+ sqe->wqe_type = wqe->type;
+ sqe->flags = wqe->flags;
+ sqe->access_cntl = wqe->frmr.access_cntl |
+ SQ_FR_PMR_ACCESS_CNTL_LOCAL_WRITE;
+ sqe->zero_based_page_size_log =
+ (wqe->frmr.pg_sz_log & SQ_FR_PMR_PAGE_SIZE_LOG_MASK) <<
+ SQ_FR_PMR_PAGE_SIZE_LOG_SFT |
+ (wqe->frmr.zero_based ? SQ_FR_PMR_ZERO_BASED : 0);
+ sqe->l_key = cpu_to_le32(wqe->frmr.l_key);
+ temp32 = cpu_to_le32(wqe->frmr.length);
+ memcpy(sqe->length, &temp32, sizeof(wqe->frmr.length));
+ sqe->numlevels_pbl_page_size_log =
+ ((wqe->frmr.pbl_pg_sz_log <<
+ SQ_FR_PMR_PBL_PAGE_SIZE_LOG_SFT) &
+ SQ_FR_PMR_PBL_PAGE_SIZE_LOG_MASK) |
+ ((wqe->frmr.levels << SQ_FR_PMR_NUMLEVELS_SFT) &
+ SQ_FR_PMR_NUMLEVELS_MASK);
+
+ for (i = 0; i < wqe->frmr.page_list_len; i++)
+ wqe->frmr.pbl_ptr[i] = cpu_to_le64(
+ wqe->frmr.page_list[i] |
+ PTU_PTE_VALID);
+ sqe->pblptr = cpu_to_le64(wqe->frmr.pbl_dma_ptr);
+ sqe->va = cpu_to_le64(wqe->frmr.va);
+
+ break;
+ }
+ case BNXT_QPLIB_SWQE_TYPE_BIND_MW:
+ {
+ struct sq_bind *sqe = (struct sq_bind *)hw_sq_send_hdr;
+
+ sqe->wqe_type = wqe->type;
+ sqe->flags = wqe->flags;
+ sqe->access_cntl = wqe->bind.access_cntl;
+ sqe->mw_type_zero_based = wqe->bind.mw_type |
+ (wqe->bind.zero_based ? SQ_BIND_ZERO_BASED : 0);
+ sqe->parent_l_key = cpu_to_le32(wqe->bind.parent_l_key);
+ sqe->l_key = cpu_to_le32(wqe->bind.r_key);
+ sqe->va = cpu_to_le64(wqe->bind.va);
+ temp32 = cpu_to_le32(wqe->bind.length);
+ memcpy(&sqe->length, &temp32, sizeof(wqe->bind.length));
+ break;
+ }
+ default:
+ /* Bad wqe, return error */
+ rc = -EINVAL;
+ goto done;
+ }
+ swq->next_psn = sq->psn & BTH_PSN_MASK;
+ if (swq->psn_search) {
+ swq->psn_search->opcode_start_psn = cpu_to_le32(
+ ((swq->start_psn << SQ_PSN_SEARCH_START_PSN_SFT) &
+ SQ_PSN_SEARCH_START_PSN_MASK) |
+ ((wqe->type << SQ_PSN_SEARCH_OPCODE_SFT) &
+ SQ_PSN_SEARCH_OPCODE_MASK));
+ swq->psn_search->flags_next_psn = cpu_to_le32(
+ ((swq->next_psn << SQ_PSN_SEARCH_NEXT_PSN_SFT) &
+ SQ_PSN_SEARCH_NEXT_PSN_MASK));
+ }
+
+ sq->hwq.prod++;
+done:
+ return rc;
+}
+
/* CQ */
/* Spinlock must be held */
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_qplib_fp.h b/drivers/infiniband/hw/bnxtre/bnxt_qplib_fp.h
index e62c750..0a87920 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_qplib_fp.h
+++ b/drivers/infiniband/hw/bnxtre/bnxt_qplib_fp.h
@@ -38,6 +38,9 @@
#ifndef __BNXT_QPLIB_FP_H__
#define __BNXT_QPLIB_FP_H__
+
+#define BNXT_QPLIB_ETHTYPE_ROCEV1 0x8915
+
struct bnxt_qplib_sge {
u64 addr;
u32 lkey;
@@ -415,6 +418,11 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp);
int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp);
int bnxt_qplib_query_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp);
int bnxt_qplib_destroy_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp);
+void *bnxt_qplib_get_qp1_sq_buf(struct bnxt_qplib_qp *qp,
+ struct bnxt_qplib_sge *sge);
+void bnxt_qplib_post_send_db(struct bnxt_qplib_qp *qp);
+int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp,
+ struct bnxt_qplib_swqe *wqe);
int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq);
int bnxt_qplib_destroy_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq);
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_re.h b/drivers/infiniband/hw/bnxtre/bnxt_re.h
index f53ebd9..6281e23 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_re.h
+++ b/drivers/infiniband/hw/bnxtre/bnxt_re.h
@@ -131,6 +131,10 @@ struct bnxt_re_dev {
#define to_bnxt_re_dev(ptr, member) \
container_of((ptr), struct bnxt_re_dev, member)
+#define BNXT_RE_ROCE_V1_PACKET 0
+#define BNXT_RE_ROCEV2_IPV4_PACKET 2
+#define BNXT_RE_ROCEV2_IPV6_PACKET 3
+
static inline struct device *rdev_to_dev(struct bnxt_re_dev *rdev)
{
if (rdev)
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.c b/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.c
index 9df6fb8..ff406e9 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.c
+++ b/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.c
@@ -60,6 +60,20 @@
#include "bnxt_re_ib_verbs.h"
#include <rdma/bnxtre-abi.h>
+static int bnxt_re_build_sgl(struct ib_sge *ib_sg_list,
+ struct bnxt_qplib_sge *sg_list, int num)
+{
+ int i, total = 0;
+
+ for (i = 0; i < num; i++) {
+ sg_list[i].addr = ib_sg_list[i].addr;
+ sg_list[i].lkey = ib_sg_list[i].lkey;
+ sg_list[i].size = ib_sg_list[i].length;
+ total += sg_list[i].size;
+ }
+ return total;
+}
+
/* Device */
struct net_device *bnxt_re_get_netdev(struct ib_device *ibdev, u8 port_num)
{
@@ -698,8 +712,6 @@ static u8 __from_ib_qp_type(enum ib_qp_type type)
return CMDQ_CREATE_QP_TYPE_RC;
case IB_QPT_UD:
return CMDQ_CREATE_QP_TYPE_UD;
- case IB_QPT_RAW_ETHERTYPE:
- return CMDQ_CREATE_QP_TYPE_RAW_ETHERTYPE;
default:
return IB_QPT_MAX;
}
@@ -873,7 +885,6 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd,
struct bnxt_re_dev *rdev = pd->rdev;
struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr;
struct bnxt_re_qp *qp;
- struct bnxt_re_srq *srq;
struct bnxt_re_cq *cq;
int rc, entries;
@@ -1445,6 +1456,533 @@ int bnxt_re_query_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr,
return 0;
}
+/* Routine for sending QP1 packets for RoCE V1 an V2
+ */
+static int bnxt_re_build_qp1_send_v2(struct bnxt_re_qp *qp,
+ struct ib_send_wr *wr,
+ struct bnxt_qplib_swqe *wqe,
+ int payload_size)
+{
+ struct ib_device *ibdev = &qp->rdev->ibdev;
+ struct bnxt_re_ah *ah = container_of(ud_wr(wr)->ah, struct bnxt_re_ah,
+ ib_ah);
+ struct bnxt_qplib_ah *qplib_ah = &ah->qplib_ah;
+ struct bnxt_qplib_sge sge;
+ union ib_gid sgid;
+ u8 nw_type;
+ u16 ether_type;
+ struct ib_gid_attr sgid_attr;
+ union ib_gid dgid;
+ bool is_eth = false;
+ bool is_vlan = false;
+ bool is_grh = false;
+ bool is_udp = false;
+ u8 ip_version = 0;
+ u16 vlan_id = 0xFFFF;
+ void *buf;
+ int i, rc = 0, size;
+
+ memset(&qp->qp1_hdr, 0, sizeof(qp->qp1_hdr));
+
+ rc = ib_get_cached_gid(ibdev, 1,
+ qplib_ah->host_sgid_index, &sgid,
+ &sgid_attr);
+ if (rc) {
+ dev_err(rdev_to_dev(qp->rdev),
+ "Failed to query gid at index %d",
+ qplib_ah->host_sgid_index);
+ return rc;
+ }
+ if (sgid_attr.ndev) {
+ if (is_vlan_dev(sgid_attr.ndev))
+ vlan_id = vlan_dev_vlan_id(sgid_attr.ndev);
+ dev_put(sgid_attr.ndev);
+ }
+ /* Get network header type for this GID */
+ nw_type = ib_gid_to_network_type(sgid_attr.gid_type, &sgid);
+ switch (nw_type) {
+ case RDMA_NETWORK_IPV4:
+ nw_type = BNXT_RE_ROCEV2_IPV4_PACKET;
+ break;
+ case RDMA_NETWORK_IPV6:
+ nw_type = BNXT_RE_ROCEV2_IPV6_PACKET;
+ break;
+ default:
+ nw_type = BNXT_RE_ROCE_V1_PACKET;
+ break;
+ }
+ memcpy(&dgid.raw, &qplib_ah->dgid, 16);
+ is_udp = sgid_attr.gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP;
+ if (is_udp) {
+ if (ipv6_addr_v4mapped((struct in6_addr *)&sgid)) {
+ ip_version = 4;
+ ether_type = ETH_P_IP;
+ } else {
+ ip_version = 6;
+ ether_type = ETH_P_IPV6;
+ }
+ is_grh = false;
+ } else {
+ ether_type = BNXT_QPLIB_ETHTYPE_ROCEV1;
+ is_grh = true;
+ }
+
+ is_eth = true;
+ is_vlan = (vlan_id && (vlan_id < 0x1000)) ? true : false;
+
+ ib_ud_header_init(payload_size, !is_eth, is_eth, is_vlan, is_grh,
+ ip_version, is_udp, 0, &qp->qp1_hdr);
+
+ /* ETH */
+ ether_addr_copy(qp->qp1_hdr.eth.dmac_h, ah->qplib_ah.dmac);
+ ether_addr_copy(qp->qp1_hdr.eth.smac_h, qp->qplib_qp.smac);
+
+ /* For vlan, check the sgid for vlan existence */
+
+ if (!is_vlan) {
+ qp->qp1_hdr.eth.type = cpu_to_be16(ether_type);
+ } else {
+ qp->qp1_hdr.vlan.type = cpu_to_be16(ether_type);
+ qp->qp1_hdr.vlan.tag = cpu_to_be16(vlan_id);
+ }
+
+ if (is_grh || (ip_version == 6)) {
+ memcpy(qp->qp1_hdr.grh.source_gid.raw, sgid.raw, sizeof(sgid));
+ memcpy(qp->qp1_hdr.grh.destination_gid.raw, qplib_ah->dgid.data,
+ sizeof(sgid));
+ qp->qp1_hdr.grh.hop_limit = qplib_ah->hop_limit;
+ }
+
+ if (ip_version == 4) {
+ qp->qp1_hdr.ip4.tos = 0;
+ qp->qp1_hdr.ip4.id = 0;
+ qp->qp1_hdr.ip4.frag_off = htons(IP_DF);
+ qp->qp1_hdr.ip4.ttl = qplib_ah->hop_limit;
+
+ memcpy(&qp->qp1_hdr.ip4.saddr, sgid.raw + 12, 4);
+ memcpy(&qp->qp1_hdr.ip4.daddr, qplib_ah->dgid.data + 12, 4);
+ qp->qp1_hdr.ip4.check = ib_ud_ip4_csum(&qp->qp1_hdr);
+ }
+
+ if (is_udp) {
+ qp->qp1_hdr.udp.dport = htons(ROCE_V2_UDP_DPORT);
+ qp->qp1_hdr.udp.sport = htons(0x8CD1);
+ qp->qp1_hdr.udp.csum = 0;
+ }
+
+ /* BTH */
+ if (wr->opcode == IB_WR_SEND_WITH_IMM) {
+ qp->qp1_hdr.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+ qp->qp1_hdr.immediate_present = 1;
+ } else {
+ qp->qp1_hdr.bth.opcode = IB_OPCODE_UD_SEND_ONLY;
+ }
+ if (wr->send_flags & IB_SEND_SOLICITED)
+ qp->qp1_hdr.bth.solicited_event = 1;
+ /* pad_count */
+ qp->qp1_hdr.bth.pad_count = (4 - payload_size) & 3;
+
+ /* P_key for QP1 is for all members */
+ qp->qp1_hdr.bth.pkey = cpu_to_be16(0xFFFF);
+ qp->qp1_hdr.bth.destination_qpn = IB_QP1;
+ qp->qp1_hdr.bth.ack_req = 0;
+ qp->send_psn++;
+ qp->send_psn &= BTH_PSN_MASK;
+ qp->qp1_hdr.bth.psn = cpu_to_be32(qp->send_psn);
+ /* DETH */
+ /* Use the priviledged Q_Key for QP1 */
+ qp->qp1_hdr.deth.qkey = cpu_to_be32(IB_QP1_QKEY);
+ qp->qp1_hdr.deth.source_qpn = IB_QP1;
+
+ /* Pack the QP1 to the transmit buffer */
+ buf = bnxt_qplib_get_qp1_sq_buf(&qp->qplib_qp, &sge);
+ if (buf) {
+ size = ib_ud_header_pack(&qp->qp1_hdr, buf);
+ for (i = wqe->num_sge; i; i--) {
+ wqe->sg_list[i].addr = wqe->sg_list[i - 1].addr;
+ wqe->sg_list[i].lkey = wqe->sg_list[i - 1].lkey;
+ wqe->sg_list[i].size = wqe->sg_list[i - 1].size;
+ }
+
+ /*
+ * Max Header buf size for IPV6 RoCE V2 is 86,
+ * which is same as the QP1 SQ header buffer.
+ * Header buf size for IPV4 RoCE V2 can be 66.
+ * ETH(14) + VLAN(4)+ IP(20) + UDP (8) + BTH(20).
+ * Subtract 20 bytes from QP1 SQ header buf size
+ */
+ if (is_udp && ip_version == 4)
+ sge.size -= 20;
+ /*
+ * Max Header buf size for RoCE V1 is 78.
+ * ETH(14) + VLAN(4) + GRH(40) + BTH(20).
+ * Subtract 8 bytes from QP1 SQ header buf size
+ */
+ if (!is_udp)
+ sge.size -= 8;
+
+ /* Subtract 4 bytes for non vlan packets */
+ if (!is_vlan)
+ sge.size -= 4;
+
+ wqe->sg_list[0].addr = sge.addr;
+ wqe->sg_list[0].lkey = sge.lkey;
+ wqe->sg_list[0].size = sge.size;
+ wqe->num_sge++;
+
+ } else {
+ dev_err(rdev_to_dev(qp->rdev), "QP1 buffer is empty!");
+ rc = -ENOMEM;
+ }
+ return rc;
+}
+
+static int is_ud_qp(struct bnxt_re_qp *qp)
+{
+ return qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_UD;
+}
+
+static int bnxt_re_build_send_wqe(struct bnxt_re_qp *qp,
+ struct ib_send_wr *wr,
+ struct bnxt_qplib_swqe *wqe)
+{
+ struct bnxt_re_ah *ah = NULL;
+
+ if (is_ud_qp(qp)) {
+ ah = container_of(ud_wr(wr)->ah, struct bnxt_re_ah, ib_ah);
+ wqe->send.q_key = ud_wr(wr)->remote_qkey;
+ wqe->send.dst_qp = ud_wr(wr)->remote_qpn;
+ wqe->send.avid = ah->qplib_ah.id;
+ }
+ switch (wr->opcode) {
+ case IB_WR_SEND:
+ wqe->type = BNXT_QPLIB_SWQE_TYPE_SEND;
+ break;
+ case IB_WR_SEND_WITH_IMM:
+ wqe->type = BNXT_QPLIB_SWQE_TYPE_SEND_WITH_IMM;
+ wqe->send.imm_data = wr->ex.imm_data;
+ break;
+ case IB_WR_SEND_WITH_INV:
+ wqe->type = BNXT_QPLIB_SWQE_TYPE_SEND_WITH_INV;
+ wqe->send.inv_key = wr->ex.invalidate_rkey;
+ break;
+ default:
+ return -EINVAL;
+ }
+ if (wr->send_flags & IB_SEND_SIGNALED)
+ wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SIGNAL_COMP;
+ if (wr->send_flags & IB_SEND_FENCE)
+ wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_UC_FENCE;
+ if (wr->send_flags & IB_SEND_SOLICITED)
+ wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SOLICIT_EVENT;
+ if (wr->send_flags & IB_SEND_INLINE)
+ wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_INLINE;
+
+ return 0;
+}
+
+static int bnxt_re_build_rdma_wqe(struct ib_send_wr *wr,
+ struct bnxt_qplib_swqe *wqe)
+{
+ switch (wr->opcode) {
+ case IB_WR_RDMA_WRITE:
+ wqe->type = BNXT_QPLIB_SWQE_TYPE_RDMA_WRITE;
+ break;
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ wqe->type = BNXT_QPLIB_SWQE_TYPE_RDMA_WRITE_WITH_IMM;
+ wqe->rdma.imm_data = wr->ex.imm_data;
+ break;
+ case IB_WR_RDMA_READ:
+ wqe->type = BNXT_QPLIB_SWQE_TYPE_RDMA_READ;
+ wqe->rdma.inv_key = wr->ex.invalidate_rkey;
+ break;
+ default:
+ return -EINVAL;
+ }
+ wqe->rdma.remote_va = rdma_wr(wr)->remote_addr;
+ wqe->rdma.r_key = rdma_wr(wr)->rkey;
+ if (wr->send_flags & IB_SEND_SIGNALED)
+ wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SIGNAL_COMP;
+ if (wr->send_flags & IB_SEND_FENCE)
+ wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_UC_FENCE;
+ if (wr->send_flags & IB_SEND_SOLICITED)
+ wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SOLICIT_EVENT;
+ if (wr->send_flags & IB_SEND_INLINE)
+ wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_INLINE;
+
+ return 0;
+}
+
+static int bnxt_re_build_atomic_wqe(struct ib_send_wr *wr,
+ struct bnxt_qplib_swqe *wqe)
+{
+ switch (wr->opcode) {
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ wqe->type = BNXT_QPLIB_SWQE_TYPE_ATOMIC_CMP_AND_SWP;
+ wqe->atomic.swap_data = atomic_wr(wr)->swap;
+ break;
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ wqe->type = BNXT_QPLIB_SWQE_TYPE_ATOMIC_FETCH_AND_ADD;
+ wqe->atomic.cmp_data = atomic_wr(wr)->compare_add;
+ break;
+ default:
+ return -EINVAL;
+ }
+ wqe->atomic.remote_va = atomic_wr(wr)->remote_addr;
+ wqe->atomic.r_key = atomic_wr(wr)->rkey;
+ if (wr->send_flags & IB_SEND_SIGNALED)
+ wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SIGNAL_COMP;
+ if (wr->send_flags & IB_SEND_FENCE)
+ wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_UC_FENCE;
+ if (wr->send_flags & IB_SEND_SOLICITED)
+ wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SOLICIT_EVENT;
+ return 0;
+}
+
+static int bnxt_re_build_inv_wqe(struct ib_send_wr *wr,
+ struct bnxt_qplib_swqe *wqe)
+{
+ wqe->type = BNXT_QPLIB_SWQE_TYPE_LOCAL_INV;
+ wqe->local_inv.inv_l_key = wr->ex.invalidate_rkey;
+
+ if (wr->send_flags & IB_SEND_SIGNALED)
+ wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SIGNAL_COMP;
+ if (wr->send_flags & IB_SEND_FENCE)
+ wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_UC_FENCE;
+ if (wr->send_flags & IB_SEND_SOLICITED)
+ wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SOLICIT_EVENT;
+
+ return 0;
+}
+
+static int bnxt_re_build_reg_wqe(struct ib_reg_wr *wr,
+ struct bnxt_qplib_swqe *wqe)
+{
+ struct bnxt_re_mr *mr = container_of(wr->mr, struct bnxt_re_mr, ib_mr);
+ struct bnxt_qplib_frpl *qplib_frpl = &mr->qplib_frpl;
+ int access = wr->access;
+
+ wqe->frmr.pbl_ptr = (__le64 *)qplib_frpl->hwq.pbl_ptr[0];
+ wqe->frmr.pbl_dma_ptr = qplib_frpl->hwq.pbl_dma_ptr[0];
+ wqe->frmr.page_list = mr->pages;
+ wqe->frmr.page_list_len = mr->npages;
+ wqe->frmr.levels = qplib_frpl->hwq.level + 1;
+ wqe->type = BNXT_QPLIB_SWQE_TYPE_REG_MR;
+
+ if (wr->wr.send_flags & IB_SEND_FENCE)
+ wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_UC_FENCE;
+ if (wr->wr.send_flags & IB_SEND_SIGNALED)
+ wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SIGNAL_COMP;
+
+ if (access & IB_ACCESS_LOCAL_WRITE)
+ wqe->frmr.access_cntl |= SQ_FR_PMR_ACCESS_CNTL_LOCAL_WRITE;
+ if (access & IB_ACCESS_REMOTE_READ)
+ wqe->frmr.access_cntl |= SQ_FR_PMR_ACCESS_CNTL_REMOTE_READ;
+ if (access & IB_ACCESS_REMOTE_WRITE)
+ wqe->frmr.access_cntl |= SQ_FR_PMR_ACCESS_CNTL_REMOTE_WRITE;
+ if (access & IB_ACCESS_REMOTE_ATOMIC)
+ wqe->frmr.access_cntl |= SQ_FR_PMR_ACCESS_CNTL_REMOTE_ATOMIC;
+ if (access & IB_ACCESS_MW_BIND)
+ wqe->frmr.access_cntl |= SQ_FR_PMR_ACCESS_CNTL_WINDOW_BIND;
+
+ wqe->frmr.l_key = wr->key;
+ wqe->frmr.length = wr->mr->length;
+ wqe->frmr.pbl_pg_sz_log = (wr->mr->page_size >> PAGE_SHIFT_4K) - 1;
+ wqe->frmr.va = wr->mr->iova;
+ return 0;
+}
+
+static int bnxt_re_copy_inline_data(struct bnxt_re_dev *rdev,
+ struct ib_send_wr *wr,
+ struct bnxt_qplib_swqe *wqe)
+{
+ /* Copy the inline data to the data field */
+ u8 *in_data;
+ u32 i, sge_len;
+ void *sge_addr;
+
+ in_data = wqe->inline_data;
+ for (i = 0; i < wr->num_sge; i++) {
+ sge_addr = (void *)(unsigned long)
+ wr->sg_list[i].addr;
+ sge_len = wr->sg_list[i].length;
+
+ if ((sge_len + wqe->inline_len) >
+ BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH) {
+ dev_err(rdev_to_dev(rdev),
+ "Inline data size requested > supported value");
+ return -EINVAL;
+ }
+ sge_len = wr->sg_list[i].length;
+
+ memcpy(in_data, sge_addr, sge_len);
+ in_data += wr->sg_list[i].length;
+ wqe->inline_len += wr->sg_list[i].length;
+ }
+ return wqe->inline_len;
+}
+
+static int bnxt_re_copy_wr_payload(struct bnxt_re_dev *rdev,
+ struct ib_send_wr *wr,
+ struct bnxt_qplib_swqe *wqe)
+{
+ int payload_sz = 0;
+
+ if (wr->send_flags & IB_SEND_INLINE)
+ payload_sz = bnxt_re_copy_inline_data(rdev, wr, wqe);
+ else
+ payload_sz = bnxt_re_build_sgl(wr->sg_list, wqe->sg_list,
+ wqe->num_sge);
+
+ return payload_sz;
+}
+
+static int bnxt_re_post_send_shadow_qp(struct bnxt_re_dev *rdev,
+ struct bnxt_re_qp *qp,
+ struct ib_send_wr *wr)
+{
+ struct bnxt_qplib_swqe wqe;
+ int rc = 0, payload_sz = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->sq_lock, flags);
+ memset(&wqe, 0, sizeof(wqe));
+ while (wr) {
+ /* House keeping */
+ memset(&wqe, 0, sizeof(wqe));
+
+ /* Common */
+ wqe.num_sge = wr->num_sge;
+ if (wr->num_sge > qp->qplib_qp.sq.max_sge) {
+ dev_err(rdev_to_dev(rdev),
+ "Limit exceeded for Send SGEs");
+ rc = -EINVAL;
+ goto bad;
+ }
+
+ payload_sz = bnxt_re_copy_wr_payload(qp->rdev, wr, &wqe);
+ if (payload_sz < 0) {
+ rc = -EINVAL;
+ goto bad;
+ }
+ wqe.wr_id = wr->wr_id;
+
+ wqe.type = BNXT_QPLIB_SWQE_TYPE_SEND;
+
+ rc = bnxt_re_build_send_wqe(qp, wr, &wqe);
+ if (!rc)
+ rc = bnxt_qplib_post_send(&qp->qplib_qp, &wqe);
+bad:
+ if (rc) {
+ dev_err(rdev_to_dev(rdev),
+ "Post send failed opcode = %#x rc = %d",
+ wr->opcode, rc);
+ break;
+ }
+ wr = wr->next;
+ }
+ bnxt_qplib_post_send_db(&qp->qplib_qp);
+ spin_unlock_irqrestore(&qp->sq_lock, flags);
+ return rc;
+}
+
+int bnxt_re_post_send(struct ib_qp *ib_qp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr)
+{
+ struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp);
+ struct bnxt_qplib_swqe wqe;
+ int rc = 0, payload_sz = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->sq_lock, flags);
+ while (wr) {
+ /* House keeping */
+ memset(&wqe, 0, sizeof(wqe));
+
+ /* Common */
+ wqe.num_sge = wr->num_sge;
+ if (wr->num_sge > qp->qplib_qp.sq.max_sge) {
+ dev_err(rdev_to_dev(qp->rdev),
+ "Limit exceeded for Send SGEs");
+ rc = -EINVAL;
+ goto bad;
+ }
+
+ payload_sz = bnxt_re_copy_wr_payload(qp->rdev, wr, &wqe);
+ if (payload_sz < 0) {
+ rc = -EINVAL;
+ goto bad;
+ }
+ wqe.wr_id = wr->wr_id;
+
+ switch (wr->opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_IMM:
+ if (ib_qp->qp_type == IB_QPT_GSI) {
+ rc = bnxt_re_build_qp1_send_v2(qp, wr, &wqe,
+ payload_sz);
+ if (rc)
+ goto bad;
+ wqe.rawqp1.lflags |=
+ SQ_SEND_RAWETH_QP1_LFLAGS_ROCE_CRC;
+ }
+ switch (wr->send_flags) {
+ case IB_SEND_IP_CSUM:
+ wqe.rawqp1.lflags |=
+ SQ_SEND_RAWETH_QP1_LFLAGS_IP_CHKSUM;
+ break;
+ default:
+ break;
+ }
+ /* Fall thru to build the wqe */
+ case IB_WR_SEND_WITH_INV:
+ rc = bnxt_re_build_send_wqe(qp, wr, &wqe);
+ break;
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ case IB_WR_RDMA_READ:
+ rc = bnxt_re_build_rdma_wqe(wr, &wqe);
+ break;
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ rc = bnxt_re_build_atomic_wqe(wr, &wqe);
+ break;
+ case IB_WR_RDMA_READ_WITH_INV:
+ dev_err(rdev_to_dev(qp->rdev),
+ "RDMA Read with Invalidate is not supported");
+ rc = -EINVAL;
+ goto bad;
+ case IB_WR_LOCAL_INV:
+ rc = bnxt_re_build_inv_wqe(wr, &wqe);
+ break;
+ case IB_WR_REG_MR:
+ rc = bnxt_re_build_reg_wqe(reg_wr(wr), &wqe);
+ break;
+ default:
+ /* Unsupported WRs */
+ dev_err(rdev_to_dev(qp->rdev),
+ "WR (%#x) is not supported", wr->opcode);
+ rc = -EINVAL;
+ goto bad;
+ }
+ if (!rc)
+ rc = bnxt_qplib_post_send(&qp->qplib_qp, &wqe);
+bad:
+ if (rc) {
+ dev_err(rdev_to_dev(qp->rdev),
+ "post_send failed op:%#x qps = %#x rc = %d\n",
+ wr->opcode, qp->qplib_qp.state, rc);
+ *bad_wr = wr;
+ break;
+ }
+ wr = wr->next;
+ }
+ bnxt_qplib_post_send_db(&qp->qplib_qp);
+ spin_unlock_irqrestore(&qp->sq_lock, flags);
+
+ return rc;
+}
+
/* Completion Queues */
int bnxt_re_destroy_cq(struct ib_cq *ib_cq)
{
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.h b/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.h
index 75ee88a..becdcdc 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.h
+++ b/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.h
@@ -162,6 +162,8 @@ int bnxt_re_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
int bnxt_re_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
int bnxt_re_destroy_qp(struct ib_qp *qp);
+int bnxt_re_post_send(struct ib_qp *qp, struct ib_send_wr *send_wr,
+ struct ib_send_wr **bad_send_wr);
struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
const struct ib_cq_init_attr *attr,
struct ib_ucontext *context,
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_re_main.c b/drivers/infiniband/hw/bnxtre/bnxt_re_main.c
index 2396730..03a34c0 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_re_main.c
+++ b/drivers/infiniband/hw/bnxtre/bnxt_re_main.c
@@ -462,6 +462,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
ibdev->query_qp = bnxt_re_query_qp;
ibdev->destroy_qp = bnxt_re_destroy_qp;
+ ibdev->post_send = bnxt_re_post_send;
ibdev->create_cq = bnxt_re_create_cq;
ibdev->destroy_cq = bnxt_re_destroy_cq;
ibdev->req_notify_cq = bnxt_re_req_notify_cq;
--
2.5.5
^ permalink raw reply related
* [PATCH for bnxt_re V3 12/21] bnxt_re: Support memory registration verbs
From: Selvin Xavier @ 2016-12-20 9:13 UTC (permalink / raw)
To: dledford, linux-rdma
Cc: netdev, michael.chan, Selvin Xavier, Eddie Wai, Devesh Sharma,
Somnath Kotur, Sriharsha Basavapatna
In-Reply-To: <1482225211-22423-1-git-send-email-selvin.xavier@broadcom.com>
This patch implements the kernel and user memory region registration
supported by the bnxt_re driver.
This includes the user MR, FRMR, FMR and DMA MR support.
v3: Moves bnxt_qplib_map_tc2cos to DCB patch as the functionality is used
by DCB patch. Also, removed some unwanted macros.
Signed-off-by: Eddie Wai <eddie.wai@broadcom.com>
Signed-off-by: Devesh Sharma <devesh.sharma@broadcom.com>
Signed-off-by: Somnath Kotur <somnath.kotur@broadcom.com>
Signed-off-by: Sriharsha Basavapatna <sriharsha.basavapatna@broadcom.com>
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
---
drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.c | 289 ++++++++++++++++++
drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.h | 41 +++
drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.c | 376 ++++++++++++++++++++++++
drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.h | 44 +++
drivers/infiniband/hw/bnxtre/bnxt_re_main.c | 11 +
5 files changed, 761 insertions(+)
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.c b/drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.c
index 132dd29..4f77af8 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.c
+++ b/drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.c
@@ -510,3 +510,292 @@ int bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah)
}
return 0;
}
+
+/* MRW */
+int bnxt_qplib_free_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw)
+{
+ struct bnxt_qplib_rcfw *rcfw = res->rcfw;
+ struct cmdq_deallocate_key req;
+ struct creq_deallocate_key_resp *resp;
+ u16 cmd_flags = 0;
+
+ if (mrw->lkey == 0xFFFFFFFF) {
+ dev_info(&res->pdev->dev,
+ "QPLIB: SP: Free a reserved lkey MRW");
+ return 0;
+ }
+
+ RCFW_CMD_PREP(req, DEALLOCATE_KEY, cmd_flags);
+
+ req.mrw_flags = mrw->type;
+
+ if ((mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE1) ||
+ (mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2A) ||
+ (mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B))
+ req.key = cpu_to_le32(mrw->rkey);
+ else
+ req.key = cpu_to_le32(mrw->lkey);
+
+ resp = (struct creq_deallocate_key_resp *)
+ bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+ NULL, 0);
+ if (!resp) {
+ dev_err(&res->pdev->dev, "QPLIB: SP: FREE_MR send failed");
+ return -EINVAL;
+ }
+ if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, le16_to_cpu(req.cookie))) {
+ /* Cmd timed out */
+ dev_err(&res->pdev->dev, "QPLIB: SP: FREE_MR timed out");
+ return -ETIMEDOUT;
+ }
+ if (resp->status ||
+ le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
+ dev_err(&res->pdev->dev, "QPLIB: SP: FREE_MR failed ");
+ dev_err(&res->pdev->dev,
+ "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x",
+ resp->status, le16_to_cpu(req.cookie),
+ le16_to_cpu(resp->cookie));
+ return -EINVAL;
+ }
+ /* Free the qplib's MRW memory */
+ if (mrw->hwq.max_elements)
+ bnxt_qplib_free_hwq(res->pdev, &mrw->hwq);
+
+ return 0;
+}
+
+int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw)
+{
+ struct bnxt_qplib_rcfw *rcfw = res->rcfw;
+ struct cmdq_allocate_mrw req;
+ struct creq_allocate_mrw_resp *resp;
+ u16 cmd_flags = 0;
+ unsigned long tmp;
+
+ RCFW_CMD_PREP(req, ALLOCATE_MRW, cmd_flags);
+
+ req.pd_id = cpu_to_le32(mrw->pd->id);
+ req.mrw_flags = mrw->type;
+ if ((mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR &&
+ mrw->flags & BNXT_QPLIB_FR_PMR) ||
+ mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2A ||
+ mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B)
+ req.access = CMDQ_ALLOCATE_MRW_ACCESS_CONSUMER_OWNED_KEY;
+ tmp = (unsigned long)mrw;
+ req.mrw_handle = cpu_to_le64(tmp);
+
+ resp = (struct creq_allocate_mrw_resp *)
+ bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+ NULL, 0);
+ if (!resp) {
+ dev_err(&rcfw->pdev->dev, "QPLIB: SP: ALLOC_MRW send failed");
+ return -EINVAL;
+ }
+ if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, le16_to_cpu(req.cookie))) {
+ /* Cmd timed out */
+ dev_err(&rcfw->pdev->dev, "QPLIB: SP: ALLOC_MRW timed out");
+ return -ETIMEDOUT;
+ }
+ if (resp->status ||
+ le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
+ dev_err(&rcfw->pdev->dev, "QPLIB: SP: ALLOC_MRW failed ");
+ dev_err(&rcfw->pdev->dev,
+ "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x",
+ resp->status, le16_to_cpu(req.cookie),
+ le16_to_cpu(resp->cookie));
+ return -EINVAL;
+ }
+ if ((mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE1) ||
+ (mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2A) ||
+ (mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B))
+ mrw->rkey = le32_to_cpu(resp->xid);
+ else
+ mrw->lkey = le32_to_cpu(resp->xid);
+ return 0;
+}
+
+int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw,
+ bool block)
+{
+ struct bnxt_qplib_rcfw *rcfw = res->rcfw;
+ struct cmdq_deregister_mr req;
+ struct creq_deregister_mr_resp *resp;
+ u16 cmd_flags = 0;
+ int rc;
+
+ RCFW_CMD_PREP(req, DEREGISTER_MR, cmd_flags);
+
+ req.lkey = cpu_to_le32(mrw->lkey);
+ resp = (struct creq_deregister_mr_resp *)
+ bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+ NULL, block);
+ if (!resp) {
+ dev_err(&rcfw->pdev->dev, "QPLIB: SP: DEREG_MR send failed");
+ return -EINVAL;
+ }
+ if (block)
+ rc = bnxt_qplib_rcfw_block_for_resp(rcfw,
+ le16_to_cpu(req.cookie));
+ else
+ rc = bnxt_qplib_rcfw_wait_for_resp(rcfw,
+ le16_to_cpu(req.cookie));
+ if (!rc) {
+ /* Cmd timed out */
+ dev_err(&res->pdev->dev, "QPLIB: SP: DEREG_MR timed out");
+ return -ETIMEDOUT;
+ }
+ if (resp->status ||
+ le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
+ dev_err(&rcfw->pdev->dev, "QPLIB: SP: DEREG_MR failed ");
+ dev_err(&rcfw->pdev->dev,
+ "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x",
+ resp->status, le16_to_cpu(req.cookie),
+ le16_to_cpu(resp->cookie));
+ return -EINVAL;
+ }
+
+ /* Free the qplib's MR memory */
+ if (mrw->hwq.max_elements) {
+ mrw->va = 0;
+ mrw->total_size = 0;
+ bnxt_qplib_free_hwq(res->pdev, &mrw->hwq);
+ }
+
+ return 0;
+}
+
+int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
+ u64 *pbl_tbl, int num_pbls, bool block)
+{
+ struct bnxt_qplib_rcfw *rcfw = res->rcfw;
+ struct cmdq_register_mr req;
+ struct creq_register_mr_resp *resp;
+ u16 cmd_flags = 0, level;
+ int pg_ptrs, pages, i, rc;
+ dma_addr_t **pbl_ptr;
+ u32 pg_size;
+
+ if (num_pbls) {
+ pg_ptrs = roundup_pow_of_two(num_pbls);
+ pages = pg_ptrs >> MAX_PBL_LVL_1_PGS_SHIFT;
+ if (!pages)
+ pages++;
+
+ if (pages > MAX_PBL_LVL_1_PGS) {
+ dev_err(&res->pdev->dev, "QPLIB: SP: Reg MR pages ");
+ dev_err(&res->pdev->dev,
+ "requested (0x%x) exceeded max (0x%x)",
+ pages, MAX_PBL_LVL_1_PGS);
+ return -ENOMEM;
+ }
+ /* Free the hwq if it already exist, must be a rereg */
+ if (mr->hwq.max_elements)
+ bnxt_qplib_free_hwq(res->pdev, &mr->hwq);
+
+ mr->hwq.max_elements = pages;
+ rc = bnxt_qplib_alloc_init_hwq(res->pdev, &mr->hwq, NULL, 0,
+ &mr->hwq.max_elements,
+ PAGE_SIZE, 0, PAGE_SIZE,
+ HWQ_TYPE_CTX);
+ if (rc) {
+ dev_err(&res->pdev->dev,
+ "SP: Reg MR memory allocation failed");
+ return -ENOMEM;
+ }
+ /* Write to the hwq */
+ pbl_ptr = (dma_addr_t **)mr->hwq.pbl_ptr;
+ for (i = 0; i < num_pbls; i++)
+ pbl_ptr[PTR_PG(i)][PTR_IDX(i)] =
+ (pbl_tbl[i] & PAGE_MASK) | PTU_PTE_VALID;
+ }
+
+ RCFW_CMD_PREP(req, REGISTER_MR, cmd_flags);
+
+ /* Configure the request */
+ if (mr->hwq.level == PBL_LVL_MAX) {
+ level = 0;
+ req.pbl = 0;
+ pg_size = PAGE_SIZE;
+ } else {
+ level = mr->hwq.level + 1;
+ req.pbl = cpu_to_le64(mr->hwq.pbl[PBL_LVL_0].pg_map_arr[0]);
+ pg_size = mr->hwq.pbl[PBL_LVL_0].pg_size;
+ }
+ req.log2_pg_size_lvl = (level << CMDQ_REGISTER_MR_LVL_SFT) |
+ ((ilog2(pg_size) <<
+ CMDQ_REGISTER_MR_LOG2_PG_SIZE_SFT) &
+ CMDQ_REGISTER_MR_LOG2_PG_SIZE_MASK);
+ req.access = (mr->flags & 0xFFFF);
+ req.va = cpu_to_le64(mr->va);
+ req.key = cpu_to_le32(mr->lkey);
+ req.mr_size = cpu_to_le64(mr->total_size);
+
+ resp = (struct creq_register_mr_resp *)
+ bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+ NULL, block);
+ if (!resp) {
+ dev_err(&res->pdev->dev, "SP: REG_MR send failed");
+ rc = -EINVAL;
+ goto fail;
+ }
+ if (block)
+ rc = bnxt_qplib_rcfw_block_for_resp(rcfw,
+ le16_to_cpu(req.cookie));
+ else
+ rc = bnxt_qplib_rcfw_wait_for_resp(rcfw,
+ le16_to_cpu(req.cookie));
+ if (!rc) {
+ /* Cmd timed out */
+ dev_err(&res->pdev->dev, "SP: REG_MR timed out");
+ rc = -ETIMEDOUT;
+ goto fail;
+ }
+ if (resp->status ||
+ le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
+ dev_err(&res->pdev->dev, "QPLIB: SP: REG_MR failed ");
+ dev_err(&res->pdev->dev,
+ "QPLIB: SP: with status 0x%x cmdq 0x%x resp 0x%x",
+ resp->status, le16_to_cpu(req.cookie),
+ le16_to_cpu(resp->cookie));
+ rc = -EINVAL;
+ goto fail;
+ }
+ return 0;
+
+fail:
+ if (mr->hwq.max_elements)
+ bnxt_qplib_free_hwq(res->pdev, &mr->hwq);
+ return rc;
+}
+
+int bnxt_qplib_alloc_fast_reg_page_list(struct bnxt_qplib_res *res,
+ struct bnxt_qplib_frpl *frpl,
+ int max_pg_ptrs)
+{
+ int pg_ptrs, pages, rc;
+
+ /* Re-calculate the max to fit the HWQ allocation model */
+ pg_ptrs = roundup_pow_of_two(max_pg_ptrs);
+ pages = pg_ptrs >> MAX_PBL_LVL_1_PGS_SHIFT;
+ if (!pages)
+ pages++;
+
+ if (pages > MAX_PBL_LVL_1_PGS)
+ return -ENOMEM;
+
+ frpl->hwq.max_elements = pages;
+ rc = bnxt_qplib_alloc_init_hwq(res->pdev, &frpl->hwq, NULL, 0,
+ &frpl->hwq.max_elements, PAGE_SIZE, 0,
+ PAGE_SIZE, HWQ_TYPE_CTX);
+ if (!rc)
+ frpl->max_pg_ptrs = pg_ptrs;
+
+ return rc;
+}
+
+int bnxt_qplib_free_fast_reg_page_list(struct bnxt_qplib_res *res,
+ struct bnxt_qplib_frpl *frpl)
+{
+ bnxt_qplib_free_hwq(res->pdev, &frpl->hwq);
+ return 0;
+}
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.h b/drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.h
index 26eac17..3358f6d 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.h
+++ b/drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.h
@@ -94,6 +94,34 @@ struct bnxt_qplib_ah {
u8 nw_type;
};
+struct bnxt_qplib_mrw {
+ struct bnxt_qplib_pd *pd;
+ int type;
+ u32 flags;
+#define BNXT_QPLIB_FR_PMR 0x80000000
+ u32 lkey;
+ u32 rkey;
+#define BNXT_QPLIB_RSVD_LKEY 0xFFFFFFFF
+ u64 va;
+ u64 total_size;
+ u32 npages;
+ u64 mr_handle;
+ struct bnxt_qplib_hwq hwq;
+};
+
+struct bnxt_qplib_frpl {
+ int max_pg_ptrs;
+ struct bnxt_qplib_hwq hwq;
+};
+
+#define BNXT_QPLIB_ACCESS_LOCAL_WRITE BIT(0)
+#define BNXT_QPLIB_ACCESS_REMOTE_READ BIT(1)
+#define BNXT_QPLIB_ACCESS_REMOTE_WRITE BIT(2)
+#define BNXT_QPLIB_ACCESS_REMOTE_ATOMIC BIT(3)
+#define BNXT_QPLIB_ACCESS_MW_BIND BIT(4)
+#define BNXT_QPLIB_ACCESS_ZERO_BASED BIT(5)
+#define BNXT_QPLIB_ACCESS_ON_DEMAND BIT(6)
+
int bnxt_qplib_get_sgid(struct bnxt_qplib_res *res,
struct bnxt_qplib_sgid_tbl *sgid_tbl, int index,
struct bnxt_qplib_gid *gid);
@@ -115,4 +143,17 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw,
struct bnxt_qplib_dev_attr *attr);
int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah);
int bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah);
+int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res,
+ struct bnxt_qplib_mrw *mrw);
+int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw,
+ bool block);
+int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
+ u64 *pbl_tbl, int num_pbls, bool block);
+int bnxt_qplib_free_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr);
+int bnxt_qplib_alloc_fast_reg_mr(struct bnxt_qplib_res *res,
+ struct bnxt_qplib_mrw *mr, int max);
+int bnxt_qplib_alloc_fast_reg_page_list(struct bnxt_qplib_res *res,
+ struct bnxt_qplib_frpl *frpl, int max);
+int bnxt_qplib_free_fast_reg_page_list(struct bnxt_qplib_res *res,
+ struct bnxt_qplib_frpl *frpl);
#endif /* __BNXT_QPLIB_SP_H__*/
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.c b/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.c
index c085dbd..9efa12f 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.c
+++ b/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.c
@@ -639,6 +639,48 @@ int bnxt_re_query_ah(struct ib_ah *ib_ah, struct ib_ah_attr *ah_attr)
return 0;
}
+static int __from_ib_access_flags(int iflags)
+{
+ int qflags = 0;
+
+ if (iflags & IB_ACCESS_LOCAL_WRITE)
+ qflags |= BNXT_QPLIB_ACCESS_LOCAL_WRITE;
+ if (iflags & IB_ACCESS_REMOTE_READ)
+ qflags |= BNXT_QPLIB_ACCESS_REMOTE_READ;
+ if (iflags & IB_ACCESS_REMOTE_WRITE)
+ qflags |= BNXT_QPLIB_ACCESS_REMOTE_WRITE;
+ if (iflags & IB_ACCESS_REMOTE_ATOMIC)
+ qflags |= BNXT_QPLIB_ACCESS_REMOTE_ATOMIC;
+ if (iflags & IB_ACCESS_MW_BIND)
+ qflags |= BNXT_QPLIB_ACCESS_MW_BIND;
+ if (iflags & IB_ZERO_BASED)
+ qflags |= BNXT_QPLIB_ACCESS_ZERO_BASED;
+ if (iflags & IB_ACCESS_ON_DEMAND)
+ qflags |= BNXT_QPLIB_ACCESS_ON_DEMAND;
+ return qflags;
+};
+
+static enum ib_access_flags __to_ib_access_flags(int qflags)
+{
+ enum ib_access_flags iflags = 0;
+
+ if (qflags & BNXT_QPLIB_ACCESS_LOCAL_WRITE)
+ iflags |= IB_ACCESS_LOCAL_WRITE;
+ if (qflags & BNXT_QPLIB_ACCESS_REMOTE_WRITE)
+ iflags |= IB_ACCESS_REMOTE_WRITE;
+ if (qflags & BNXT_QPLIB_ACCESS_REMOTE_READ)
+ iflags |= IB_ACCESS_REMOTE_READ;
+ if (qflags & BNXT_QPLIB_ACCESS_REMOTE_ATOMIC)
+ iflags |= IB_ACCESS_REMOTE_ATOMIC;
+ if (qflags & BNXT_QPLIB_ACCESS_MW_BIND)
+ iflags |= IB_ACCESS_MW_BIND;
+ if (qflags & BNXT_QPLIB_ACCESS_ZERO_BASED)
+ iflags |= IB_ZERO_BASED;
+ if (qflags & BNXT_QPLIB_ACCESS_ON_DEMAND)
+ iflags |= IB_ACCESS_ON_DEMAND;
+ return iflags;
+};
+
/* Completion Queues */
int bnxt_re_destroy_cq(struct ib_cq *ib_cq)
{
@@ -785,6 +827,340 @@ int bnxt_re_req_notify_cq(struct ib_cq *ib_cq,
return 0;
}
+/* Memory Regions */
+struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *ib_pd, int mr_access_flags)
+{
+ struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
+ struct bnxt_re_dev *rdev = pd->rdev;
+ struct bnxt_re_mr *mr;
+ u64 pbl = 0;
+ int rc;
+
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ mr->rdev = rdev;
+ mr->qplib_mr.pd = &pd->qplib_pd;
+ mr->qplib_mr.flags = __from_ib_access_flags(mr_access_flags);
+ mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR;
+
+ /* Allocate and register 0 as the address */
+ rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr);
+ if (rc)
+ goto fail;
+
+ mr->qplib_mr.hwq.level = PBL_LVL_MAX;
+ mr->qplib_mr.total_size = -1; /* Infinte length */
+ rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, &pbl, 0, false);
+ if (rc)
+ goto fail_mr;
+
+ mr->ib_mr.lkey = mr->qplib_mr.lkey;
+ if (mr_access_flags & (IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_ATOMIC))
+ mr->ib_mr.rkey = mr->ib_mr.lkey;
+ atomic_inc(&rdev->mr_count);
+
+ return &mr->ib_mr;
+
+fail_mr:
+ bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr);
+fail:
+ kfree(mr);
+ return ERR_PTR(rc);
+}
+
+int bnxt_re_dereg_mr(struct ib_mr *ib_mr)
+{
+ struct bnxt_re_mr *mr = container_of(ib_mr, struct bnxt_re_mr, ib_mr);
+ struct bnxt_re_dev *rdev = mr->rdev;
+ int rc = 0;
+
+ if (mr->npages && mr->pages) {
+ rc = bnxt_qplib_free_fast_reg_page_list(&rdev->qplib_res,
+ &mr->qplib_frpl);
+ kfree(mr->pages);
+ mr->npages = 0;
+ mr->pages = NULL;
+ }
+ rc = bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr);
+
+ if (!IS_ERR(mr->ib_umem) && mr->ib_umem)
+ ib_umem_release(mr->ib_umem);
+
+ kfree(mr);
+ atomic_dec(&rdev->mr_count);
+ return rc;
+}
+
+static int bnxt_re_set_page(struct ib_mr *ib_mr, u64 addr)
+{
+ struct bnxt_re_mr *mr = container_of(ib_mr, struct bnxt_re_mr, ib_mr);
+
+ if (unlikely(mr->npages == mr->qplib_frpl.max_pg_ptrs))
+ return -ENOMEM;
+
+ mr->pages[mr->npages++] = addr;
+ return 0;
+}
+
+int bnxt_re_map_mr_sg(struct ib_mr *ib_mr, struct scatterlist *sg, int sg_nents,
+ unsigned int *sg_offset)
+{
+ struct bnxt_re_mr *mr = container_of(ib_mr, struct bnxt_re_mr, ib_mr);
+
+ mr->npages = 0;
+ return ib_sg_to_pages(ib_mr, sg, sg_nents, sg_offset, bnxt_re_set_page);
+}
+
+struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type type,
+ u32 max_num_sg)
+{
+ struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
+ struct bnxt_re_dev *rdev = pd->rdev;
+ struct bnxt_re_mr *mr = NULL;
+ int rc;
+
+ if (type != IB_MR_TYPE_MEM_REG) {
+ dev_dbg(rdev_to_dev(rdev), "MR type 0x%x not supported", type);
+ return ERR_PTR(-EINVAL);
+ }
+ if (max_num_sg > MAX_PBL_LVL_1_PGS)
+ return ERR_PTR(-EINVAL);
+
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ mr->rdev = rdev;
+ mr->qplib_mr.pd = &pd->qplib_pd;
+ mr->qplib_mr.flags = BNXT_QPLIB_FR_PMR;
+ mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR;
+
+ rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr);
+ if (rc)
+ goto fail;
+
+ mr->ib_mr.lkey = mr->qplib_mr.lkey;
+ mr->ib_mr.rkey = mr->ib_mr.lkey;
+
+ mr->pages = kcalloc(max_num_sg, sizeof(u64), GFP_KERNEL);
+ if (!mr->pages) {
+ rc = -ENOMEM;
+ goto fail;
+ }
+ rc = bnxt_qplib_alloc_fast_reg_page_list(&rdev->qplib_res,
+ &mr->qplib_frpl, max_num_sg);
+ if (rc) {
+ dev_err(rdev_to_dev(rdev),
+ "Failed to allocate HW FR page list");
+ goto fail_mr;
+ }
+
+ atomic_inc(&rdev->mr_count);
+ return &mr->ib_mr;
+
+fail_mr:
+ bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr);
+fail:
+ kfree(mr->pages);
+ kfree(mr);
+ return ERR_PTR(rc);
+}
+
+/* Fast Memory Regions */
+struct ib_fmr *bnxt_re_alloc_fmr(struct ib_pd *ib_pd, int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr)
+{
+ struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
+ struct bnxt_re_dev *rdev = pd->rdev;
+ struct bnxt_re_fmr *fmr;
+ int rc;
+
+ if (fmr_attr->max_pages > MAX_PBL_LVL_2_PGS ||
+ fmr_attr->max_maps > rdev->dev_attr.max_map_per_fmr) {
+ dev_err(rdev_to_dev(rdev), "Allocate FMR exceeded Max limit");
+ return ERR_PTR(-ENOMEM);
+ }
+ fmr = kzalloc(sizeof(*fmr), GFP_KERNEL);
+ if (!fmr)
+ return ERR_PTR(-ENOMEM);
+
+ fmr->rdev = rdev;
+ fmr->qplib_fmr.pd = &pd->qplib_pd;
+ fmr->qplib_fmr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR;
+
+ rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &fmr->qplib_fmr);
+ if (rc)
+ goto fail;
+
+ fmr->qplib_fmr.flags = __from_ib_access_flags(mr_access_flags);
+ fmr->ib_fmr.lkey = fmr->qplib_fmr.lkey;
+ fmr->ib_fmr.rkey = fmr->ib_fmr.lkey;
+
+ atomic_inc(&rdev->mr_count);
+ return &fmr->ib_fmr;
+fail:
+ kfree(fmr);
+ return ERR_PTR(rc);
+}
+
+int bnxt_re_map_phys_fmr(struct ib_fmr *ib_fmr, u64 *page_list, int list_len,
+ u64 iova)
+{
+ struct bnxt_re_fmr *fmr = container_of(ib_fmr, struct bnxt_re_fmr,
+ ib_fmr);
+ struct bnxt_re_dev *rdev = fmr->rdev;
+ int rc;
+
+ fmr->qplib_fmr.va = iova;
+ fmr->qplib_fmr.total_size = list_len * PAGE_SIZE;
+
+ rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &fmr->qplib_fmr, page_list,
+ list_len, true);
+ if (rc)
+ dev_err(rdev_to_dev(rdev), "Failed to map FMR for lkey = 0x%x!",
+ fmr->ib_fmr.lkey);
+ return rc;
+}
+
+int bnxt_re_unmap_fmr(struct list_head *fmr_list)
+{
+ struct bnxt_re_dev *rdev;
+ struct bnxt_re_fmr *fmr;
+ struct ib_fmr *ib_fmr;
+ int rc = 0;
+
+ /* Validate each FMRs inside the fmr_list */
+ list_for_each_entry(ib_fmr, fmr_list, list) {
+ fmr = container_of(ib_fmr, struct bnxt_re_fmr, ib_fmr);
+ rdev = fmr->rdev;
+
+ if (rdev) {
+ rc = bnxt_qplib_dereg_mrw(&rdev->qplib_res,
+ &fmr->qplib_fmr, true);
+ if (rc)
+ break;
+ }
+ }
+ return rc;
+}
+
+int bnxt_re_dealloc_fmr(struct ib_fmr *ib_fmr)
+{
+ struct bnxt_re_fmr *fmr = container_of(ib_fmr, struct bnxt_re_fmr,
+ ib_fmr);
+ struct bnxt_re_dev *rdev = fmr->rdev;
+ int rc;
+
+ rc = bnxt_qplib_free_mrw(&rdev->qplib_res, &fmr->qplib_fmr);
+ if (rc)
+ dev_err(rdev_to_dev(rdev), "Failed to free FMR");
+
+ kfree(fmr);
+ atomic_dec(&rdev->mr_count);
+ return rc;
+}
+
+/* uverbs */
+struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length,
+ u64 virt_addr, int mr_access_flags,
+ struct ib_udata *udata)
+{
+ struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
+ struct bnxt_re_dev *rdev = pd->rdev;
+ struct bnxt_re_mr *mr;
+ struct ib_umem *umem;
+ u64 *pbl_tbl, *pbl_tbl_orig;
+ int i, umem_pgs, pages, page_shift, rc;
+ struct scatterlist *sg;
+ int entry;
+
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ mr->rdev = rdev;
+ mr->qplib_mr.pd = &pd->qplib_pd;
+ mr->qplib_mr.flags = __from_ib_access_flags(mr_access_flags);
+ mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_MR;
+
+ umem = ib_umem_get(ib_pd->uobject->context, start, length,
+ mr_access_flags, 0);
+ if (IS_ERR(umem)) {
+ dev_err(rdev_to_dev(rdev), "Failed to get umem");
+ rc = -EFAULT;
+ goto free_mr;
+ }
+ mr->ib_umem = umem;
+
+ rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr);
+ if (rc) {
+ dev_err(rdev_to_dev(rdev), "Failed to allocate MR");
+ goto release_umem;
+ }
+ /* The fixed portion of the rkey is the same as the lkey */
+ mr->ib_mr.rkey = mr->qplib_mr.rkey;
+
+ mr->qplib_mr.va = virt_addr;
+ umem_pgs = ib_umem_page_count(umem);
+ if (!umem_pgs) {
+ dev_err(rdev_to_dev(rdev), "umem is invalid!");
+ rc = -EINVAL;
+ goto free_mrw;
+ }
+ mr->qplib_mr.total_size = length;
+
+ pbl_tbl = kcalloc(umem_pgs, sizeof(u64 *), GFP_KERNEL);
+ if (!pbl_tbl) {
+ rc = -EINVAL;
+ goto free_mrw;
+ }
+ pbl_tbl_orig = pbl_tbl;
+
+ page_shift = ilog2(umem->page_size);
+ if (umem->hugetlb) {
+ dev_err(rdev_to_dev(rdev), "umem hugetlb not supported!");
+ rc = -EFAULT;
+ goto fail;
+ }
+ if (umem->page_size != PAGE_SIZE) {
+ dev_err(rdev_to_dev(rdev), "umem page size unsupported!");
+ rc = -EFAULT;
+ goto fail;
+ }
+ /* Map umem buf ptrs to the PBL */
+ for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+ pages = sg_dma_len(sg) >> page_shift;
+ for (i = 0; i < pages; i++, pbl_tbl++)
+ *pbl_tbl = sg_dma_address(sg) + (i << page_shift);
+ }
+ rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, pbl_tbl_orig,
+ umem_pgs, false);
+ if (rc) {
+ dev_err(rdev_to_dev(rdev), "Failed to register user MR");
+ goto fail;
+ }
+
+ kfree(pbl_tbl_orig);
+
+ mr->ib_mr.lkey = mr->qplib_mr.lkey;
+ mr->ib_mr.rkey = mr->qplib_mr.lkey;
+ atomic_inc(&rdev->mr_count);
+
+ return &mr->ib_mr;
+fail:
+ kfree(pbl_tbl_orig);
+free_mrw:
+ bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr);
+release_umem:
+ ib_umem_release(umem);
+free_mr:
+ kfree(mr);
+ return ERR_PTR(rc);
+}
+
struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev,
struct ib_udata *udata)
{
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.h b/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.h
index 14c9e02..ba9a4c9 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.h
+++ b/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.h
@@ -70,6 +70,34 @@ struct bnxt_re_cq {
struct ib_umem *umem;
};
+struct bnxt_re_mr {
+ struct bnxt_re_dev *rdev;
+ struct ib_mr ib_mr;
+ struct ib_umem *ib_umem;
+ struct bnxt_qplib_mrw qplib_mr;
+ u32 npages;
+ u64 *pages;
+ struct bnxt_qplib_frpl qplib_frpl;
+};
+
+struct bnxt_re_frpl {
+ struct bnxt_re_dev *rdev;
+ struct bnxt_qplib_frpl qplib_frpl;
+ u64 *page_list;
+};
+
+struct bnxt_re_fmr {
+ struct bnxt_re_dev *rdev;
+ struct ib_fmr ib_fmr;
+ struct bnxt_qplib_mrw qplib_fmr;
+};
+
+struct bnxt_re_mw {
+ struct bnxt_re_dev *rdev;
+ struct ib_mw ib_mw;
+ struct bnxt_qplib_mrw qplib_mw;
+};
+
struct bnxt_re_ucontext {
struct bnxt_re_dev *rdev;
struct ib_ucontext ib_uctx;
@@ -119,6 +147,22 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
struct ib_udata *udata);
int bnxt_re_destroy_cq(struct ib_cq *cq);
int bnxt_re_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
+struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *pd, int mr_access_flags);
+
+int bnxt_re_map_mr_sg(struct ib_mr *ib_mr, struct scatterlist *sg, int sg_nents,
+ unsigned int *sg_offset);
+struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type mr_type,
+ u32 max_num_sg);
+int bnxt_re_dereg_mr(struct ib_mr *mr);
+struct ib_fmr *bnxt_re_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr);
+int bnxt_re_map_phys_fmr(struct ib_fmr *fmr, u64 *page_list, int list_len,
+ u64 iova);
+int bnxt_re_unmap_fmr(struct list_head *fmr_list);
+int bnxt_re_dealloc_fmr(struct ib_fmr *fmr);
+struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int mr_access_flags,
+ struct ib_udata *udata);
struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev,
struct ib_udata *udata);
int bnxt_re_dealloc_ucontext(struct ib_ucontext *context);
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_re_main.c b/drivers/infiniband/hw/bnxtre/bnxt_re_main.c
index f270b4c..8ef465c 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_re_main.c
+++ b/drivers/infiniband/hw/bnxtre/bnxt_re_main.c
@@ -459,6 +459,17 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
ibdev->create_cq = bnxt_re_create_cq;
ibdev->destroy_cq = bnxt_re_destroy_cq;
ibdev->req_notify_cq = bnxt_re_req_notify_cq;
+
+ ibdev->get_dma_mr = bnxt_re_get_dma_mr;
+ ibdev->dereg_mr = bnxt_re_dereg_mr;
+ ibdev->alloc_mr = bnxt_re_alloc_mr;
+ ibdev->map_mr_sg = bnxt_re_map_mr_sg;
+ ibdev->alloc_fmr = bnxt_re_alloc_fmr;
+ ibdev->map_phys_fmr = bnxt_re_map_phys_fmr;
+ ibdev->unmap_fmr = bnxt_re_unmap_fmr;
+ ibdev->dealloc_fmr = bnxt_re_dealloc_fmr;
+
+ ibdev->reg_user_mr = bnxt_re_reg_user_mr;
ibdev->alloc_ucontext = bnxt_re_alloc_ucontext;
ibdev->dealloc_ucontext = bnxt_re_dealloc_ucontext;
ibdev->mmap = bnxt_re_mmap;
--
2.5.5
^ permalink raw reply related
* [PATCH for bnxt_re V3 09/21] bnxt_re: Support for GID related verbs
From: Selvin Xavier @ 2016-12-20 9:13 UTC (permalink / raw)
To: dledford, linux-rdma
Cc: netdev, michael.chan, Selvin Xavier, Eddie Wai, Devesh Sharma,
Somnath Kotur, Sriharsha Basavapatna
In-Reply-To: <1482225211-22423-1-git-send-email-selvin.xavier@broadcom.com>
Implements add GID, del GID, get_netdev and pkey related verbs.
v3: Fixes some sparse warning related to endianness check. Removes
macros which are just wrapper for standard defines.
Signed-off-by: Eddie Wai <eddie.wai@broadcom.com>
Signed-off-by: Devesh Sharma <devesh.sharma@broadcom.com>
Signed-off-by: Somnath Kotur <somnath.kotur@broadcom.com>
Signed-off-by: Sriharsha Basavapatna <sriharsha.basavapatna@broadcom.com>
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
---
drivers/infiniband/hw/bnxtre/bnxt_qplib_res.c | 5 +
drivers/infiniband/hw/bnxtre/bnxt_qplib_res.h | 3 +
drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.c | 218 ++++++++++++++++++++++++
drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.h | 11 ++
drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.c | 123 +++++++++++++
drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.h | 18 ++
drivers/infiniband/hw/bnxtre/bnxt_re_main.c | 7 +
7 files changed, 385 insertions(+)
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_qplib_res.c b/drivers/infiniband/hw/bnxtre/bnxt_qplib_res.c
index 7004f97..d5677b2 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_qplib_res.c
+++ b/drivers/infiniband/hw/bnxtre/bnxt_qplib_res.c
@@ -507,6 +507,11 @@ static void bnxt_qplib_cleanup_sgid_tbl(struct bnxt_qplib_res *res,
{
int i;
+ for (i = 0; i < sgid_tbl->max; i++) {
+ if (memcmp(&sgid_tbl->tbl[i], &bnxt_qplib_gid_zero,
+ sizeof(bnxt_qplib_gid_zero)))
+ bnxt_qplib_del_sgid(sgid_tbl, &sgid_tbl->tbl[i], true);
+ }
memset(sgid_tbl->tbl, 0, sizeof(struct bnxt_qplib_gid) * sgid_tbl->max);
memset(sgid_tbl->hw_id, -1, sizeof(u16) * sgid_tbl->max);
sgid_tbl->active = 0;
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_qplib_res.h b/drivers/infiniband/hw/bnxtre/bnxt_qplib_res.h
index fc9c38c..6596ee2 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_qplib_res.h
+++ b/drivers/infiniband/hw/bnxtre/bnxt_qplib_res.h
@@ -186,6 +186,9 @@ struct bnxt_qplib_res {
struct bnxt_qplib_dpi_tbl dpi_tbl;
};
+#define to_bnxt_qplib(ptr, type, member) \
+ container_of(ptr, type, member)
+
struct bnxt_qplib_pd;
struct bnxt_qplib_dev_attr;
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.c b/drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.c
index 103f9ab..2bdfaf0 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.c
+++ b/drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.c
@@ -46,6 +46,10 @@
#include "bnxt_qplib_res.h"
#include "bnxt_qplib_rcfw.h"
#include "bnxt_qplib_sp.h"
+
+const struct bnxt_qplib_gid bnxt_qplib_gid_zero = {{ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 } };
+
/* Device */
int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw,
struct bnxt_qplib_dev_attr *attr)
@@ -129,6 +133,220 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw,
return 0;
}
+/* SGID */
+int bnxt_qplib_get_sgid(struct bnxt_qplib_res *res,
+ struct bnxt_qplib_sgid_tbl *sgid_tbl, int index,
+ struct bnxt_qplib_gid *gid)
+{
+ if (index > sgid_tbl->max) {
+ dev_err(&res->pdev->dev,
+ "QPLIB: Index %d exceeded SGID table max (%d)",
+ index, sgid_tbl->max);
+ return -EINVAL;
+ }
+ memcpy(gid, &sgid_tbl->tbl[index], sizeof(*gid));
+ return 0;
+}
+
+int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
+ struct bnxt_qplib_gid *gid, bool update)
+{
+ struct bnxt_qplib_res *res = to_bnxt_qplib(sgid_tbl,
+ struct bnxt_qplib_res,
+ sgid_tbl);
+ struct bnxt_qplib_rcfw *rcfw = res->rcfw;
+ int index;
+
+ if (!sgid_tbl) {
+ dev_err(&res->pdev->dev, "QPLIB: SGID table not allocated");
+ return -EINVAL;
+ }
+ /* Do we need a sgid_lock here? */
+ if (!sgid_tbl->active) {
+ dev_err(&res->pdev->dev,
+ "QPLIB: SGID table has no active entries");
+ return -ENOMEM;
+ }
+ for (index = 0; index < sgid_tbl->max; index++) {
+ if (!memcmp(&sgid_tbl->tbl[index], gid, sizeof(*gid)))
+ break;
+ }
+ if (index == sgid_tbl->max) {
+ dev_warn(&res->pdev->dev, "GID not found in the SGID table");
+ return 0;
+ }
+ /* Remove GID from the SGID table */
+ if (update) {
+ struct cmdq_delete_gid req;
+ struct creq_delete_gid_resp *resp;
+ u16 cmd_flags = 0;
+
+ RCFW_CMD_PREP(req, DELETE_GID, cmd_flags);
+ if (sgid_tbl->hw_id[index] == 0xFFFF) {
+ dev_err(&res->pdev->dev,
+ "QPLIB: GID entry contains an invalid HW id");
+ return -EINVAL;
+ }
+ req.gid_index = cpu_to_le16(sgid_tbl->hw_id[index]);
+ resp = (struct creq_delete_gid_resp *)
+ bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, NULL,
+ 0);
+ if (!resp) {
+ dev_err(&res->pdev->dev,
+ "QPLIB: SP: DELETE_GID send failed");
+ return -EINVAL;
+ }
+ if (!bnxt_qplib_rcfw_wait_for_resp(rcfw,
+ le16_to_cpu(req.cookie))) {
+ /* Cmd timed out */
+ dev_err(&res->pdev->dev,
+ "QPLIB: SP: DELETE_GID timed out");
+ return -ETIMEDOUT;
+ }
+ if (resp->status ||
+ le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
+ dev_err(&res->pdev->dev,
+ "QPLIB: SP: DELETE_GID failed ");
+ dev_err(&res->pdev->dev,
+ "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x",
+ resp->status, le16_to_cpu(req.cookie),
+ le16_to_cpu(resp->cookie));
+ return -EINVAL;
+ }
+ }
+ memcpy(&sgid_tbl->tbl[index], &bnxt_qplib_gid_zero,
+ sizeof(bnxt_qplib_gid_zero));
+ sgid_tbl->active--;
+ dev_dbg(&res->pdev->dev,
+ "QPLIB: SGID deleted hw_id[0x%x] = 0x%x active = 0x%x",
+ index, sgid_tbl->hw_id[index], sgid_tbl->active);
+ sgid_tbl->hw_id[index] = (u16)-1;
+
+ /* unlock */
+ return 0;
+}
+
+int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
+ struct bnxt_qplib_gid *gid, u8 *smac, u16 vlan_id,
+ bool update, u32 *index)
+{
+ struct bnxt_qplib_res *res = to_bnxt_qplib(sgid_tbl,
+ struct bnxt_qplib_res,
+ sgid_tbl);
+ struct bnxt_qplib_rcfw *rcfw = res->rcfw;
+ int i, free_idx, rc = 0;
+
+ if (!sgid_tbl) {
+ dev_err(&res->pdev->dev, "QPLIB: SGID table not allocated");
+ return -EINVAL;
+ }
+ /* Do we need a sgid_lock here? */
+ if (sgid_tbl->active == sgid_tbl->max) {
+ dev_err(&res->pdev->dev, "QPLIB: SGID table is full");
+ return -ENOMEM;
+ }
+ free_idx = sgid_tbl->max;
+ for (i = 0; i < sgid_tbl->max; i++) {
+ if (!memcmp(&sgid_tbl->tbl[i], gid, sizeof(*gid))) {
+ dev_dbg(&res->pdev->dev,
+ "QPLIB: SGID entry already exist in entry %d!",
+ i);
+ *index = i;
+ return -EALREADY;
+ } else if (!memcmp(&sgid_tbl->tbl[i], &bnxt_qplib_gid_zero,
+ sizeof(bnxt_qplib_gid_zero)) &&
+ free_idx == sgid_tbl->max) {
+ free_idx = i;
+ }
+ }
+ if (free_idx == sgid_tbl->max) {
+ dev_err(&res->pdev->dev,
+ "QPLIB: SGID table is FULL but count is not MAX??");
+ return -ENOMEM;
+ }
+ if (update) {
+ struct cmdq_add_gid req;
+ struct creq_add_gid_resp *resp;
+ u16 cmd_flags = 0;
+ u32 temp32[4];
+ u16 temp16[3];
+
+ RCFW_CMD_PREP(req, ADD_GID, cmd_flags);
+
+ memcpy(temp32, gid->data, sizeof(struct bnxt_qplib_gid));
+ req.gid[0] = cpu_to_be32(temp32[3]);
+ req.gid[1] = cpu_to_be32(temp32[2]);
+ req.gid[2] = cpu_to_be32(temp32[1]);
+ req.gid[3] = cpu_to_be32(temp32[0]);
+ if (vlan_id != 0xFFFF)
+ req.vlan = cpu_to_le16((vlan_id &
+ CMDQ_ADD_GID_VLAN_VLAN_ID_MASK) |
+ CMDQ_ADD_GID_VLAN_TPID_TPID_8100 |
+ CMDQ_ADD_GID_VLAN_VLAN_EN);
+
+ /* MAC in network format */
+ memcpy(temp16, smac, 6);
+ req.src_mac[0] = cpu_to_be16(temp16[0]);
+ req.src_mac[1] = cpu_to_be16(temp16[1]);
+ req.src_mac[2] = cpu_to_be16(temp16[2]);
+
+ resp = (struct creq_add_gid_resp *)
+ bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+ NULL, 0);
+ if (!resp) {
+ dev_err(&res->pdev->dev,
+ "QPLIB: SP: ADD_GID send failed");
+ return -EINVAL;
+ }
+ if (!bnxt_qplib_rcfw_wait_for_resp(rcfw,
+ le16_to_cpu(req.cookie))) {
+ /* Cmd timed out */
+ dev_err(&res->pdev->dev,
+ "QPIB: SP: ADD_GID timed out");
+ return -ETIMEDOUT;
+ }
+ if (resp->status ||
+ le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
+ dev_err(&res->pdev->dev, "QPLIB: SP: ADD_GID failed ");
+ dev_err(&res->pdev->dev,
+ "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x",
+ resp->status, le16_to_cpu(req.cookie),
+ le16_to_cpu(resp->cookie));
+ return -EINVAL;
+ }
+ sgid_tbl->hw_id[free_idx] = le32_to_cpu(resp->xid);
+ }
+ /* Add GID to the sgid_tbl */
+ memcpy(&sgid_tbl->tbl[free_idx], gid, sizeof(*gid));
+ sgid_tbl->active++;
+ dev_dbg(&res->pdev->dev,
+ "QPLIB: SGID added hw_id[0x%x] = 0x%x active = 0x%x",
+ free_idx, sgid_tbl->hw_id[free_idx], sgid_tbl->active);
+
+ *index = free_idx;
+ /* unlock */
+ return rc;
+}
+
+/* pkeys */
+int bnxt_qplib_get_pkey(struct bnxt_qplib_res *res,
+ struct bnxt_qplib_pkey_tbl *pkey_tbl, u16 index,
+ u16 *pkey)
+{
+ if (index == 0xFFFF) {
+ *pkey = 0xFFFF;
+ return 0;
+ }
+ if (index > pkey_tbl->max) {
+ dev_err(&res->pdev->dev,
+ "QPLIB: Index %d exceeded PKEY table max (%d)",
+ index, pkey_tbl->max);
+ return -EINVAL;
+ }
+ memcpy(pkey, &pkey_tbl->tbl[index], sizeof(*pkey));
+ return 0;
+}
+
int bnxt_qplib_del_pkey(struct bnxt_qplib_res *res,
struct bnxt_qplib_pkey_tbl *pkey_tbl, u16 *pkey,
bool update)
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.h b/drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.h
index 7de6600..8e20924 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.h
+++ b/drivers/infiniband/hw/bnxtre/bnxt_qplib_sp.h
@@ -78,6 +78,17 @@ struct bnxt_qplib_gid {
u8 data[16];
};
+int bnxt_qplib_get_sgid(struct bnxt_qplib_res *res,
+ struct bnxt_qplib_sgid_tbl *sgid_tbl, int index,
+ struct bnxt_qplib_gid *gid);
+int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
+ struct bnxt_qplib_gid *gid, bool update);
+int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
+ struct bnxt_qplib_gid *gid, u8 *mac, u16 vlan_id,
+ bool update, u32 *index);
+int bnxt_qplib_get_pkey(struct bnxt_qplib_res *res,
+ struct bnxt_qplib_pkey_tbl *pkey_tbl, u16 index,
+ u16 *pkey);
int bnxt_qplib_del_pkey(struct bnxt_qplib_res *res,
struct bnxt_qplib_pkey_tbl *pkey_tbl, u16 *pkey,
bool update);
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.c b/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.c
index ede0e54..5185487 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.c
+++ b/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.c
@@ -60,6 +60,22 @@
#include "bnxt_re_ib_verbs.h"
#include <rdma/bnxtre-abi.h>
+/* Device */
+struct net_device *bnxt_re_get_netdev(struct ib_device *ibdev, u8 port_num)
+{
+ struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
+ struct net_device *netdev = NULL;
+
+ rcu_read_lock();
+ if (rdev)
+ netdev = rdev->netdev;
+ if (netdev)
+ dev_hold(netdev);
+
+ rcu_read_unlock();
+ return netdev;
+}
+
int bnxt_re_query_device(struct ib_device *ibdev,
struct ib_device_attr *ib_attr,
struct ib_udata *udata)
@@ -272,6 +288,113 @@ int bnxt_re_get_port_immutable(struct ib_device *ibdev, u8 port_num,
immutable->max_mad_size = IB_MGMT_MAD_SIZE;
return 0;
}
+
+int bnxt_re_query_pkey(struct ib_device *ibdev, u8 port_num,
+ u16 index, u16 *pkey)
+{
+ struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
+
+ /* Ignore port_num */
+
+ memset(pkey, 0, sizeof(*pkey));
+ return bnxt_qplib_get_pkey(&rdev->qplib_res,
+ &rdev->qplib_res.pkey_tbl, index, pkey);
+}
+
+int bnxt_re_query_gid(struct ib_device *ibdev, u8 port_num,
+ int index, union ib_gid *gid)
+{
+ struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
+ int rc = 0;
+
+ /* Ignore port_num */
+ memset(gid, 0, sizeof(*gid));
+ rc = bnxt_qplib_get_sgid(&rdev->qplib_res,
+ &rdev->qplib_res.sgid_tbl, index,
+ (struct bnxt_qplib_gid *)gid);
+ return rc;
+}
+
+int bnxt_re_del_gid(struct ib_device *ibdev, u8 port_num,
+ unsigned int index, void **context)
+{
+ int rc = 0;
+ struct bnxt_re_gid_ctx *ctx, **ctx_tbl;
+ struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
+ struct bnxt_qplib_sgid_tbl *sgid_tbl = &rdev->qplib_res.sgid_tbl;
+
+ /* Delete the entry from the hardware */
+ ctx = *context;
+ if (!ctx)
+ return -EINVAL;
+
+ if (sgid_tbl && sgid_tbl->active) {
+ if (ctx->idx >= sgid_tbl->max)
+ return -EINVAL;
+ ctx->refcnt--;
+ if (!ctx->refcnt) {
+ rc = bnxt_qplib_del_sgid
+ (sgid_tbl,
+ &sgid_tbl->tbl[ctx->idx], true);
+ if (rc)
+ dev_err(rdev_to_dev(rdev),
+ "Failed to remove GID: %#x", rc);
+ ctx_tbl = sgid_tbl->ctx;
+ ctx_tbl[ctx->idx] = NULL;
+ kfree(ctx);
+ }
+ } else {
+ return -EINVAL;
+ }
+ return rc;
+}
+
+int bnxt_re_add_gid(struct ib_device *ibdev, u8 port_num,
+ unsigned int index, const union ib_gid *gid,
+ const struct ib_gid_attr *attr, void **context)
+{
+ int rc;
+ u32 tbl_idx = 0;
+ u16 vlan_id = 0xFFFF;
+ struct bnxt_re_gid_ctx *ctx, **ctx_tbl;
+ struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
+ struct bnxt_qplib_sgid_tbl *sgid_tbl = &rdev->qplib_res.sgid_tbl;
+
+ if ((attr->ndev) && is_vlan_dev(attr->ndev))
+ vlan_id = vlan_dev_vlan_id(attr->ndev);
+
+ rc = bnxt_qplib_add_sgid(sgid_tbl, (struct bnxt_qplib_gid *)gid,
+ rdev->qplib_res.netdev->dev_addr,
+ vlan_id, true, &tbl_idx);
+ if (rc == -EALREADY) {
+ ctx_tbl = sgid_tbl->ctx;
+ ctx_tbl[tbl_idx]->refcnt++;
+ *context = ctx_tbl[tbl_idx];
+ return 0;
+ }
+
+ if (rc < 0) {
+ dev_err(rdev_to_dev(rdev), "Failed to add GID: %#x", rc);
+ return rc;
+ }
+
+ ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ ctx_tbl = sgid_tbl->ctx;
+ ctx->idx = tbl_idx;
+ ctx->refcnt = 1;
+ ctx_tbl[tbl_idx] = ctx;
+
+ return rc;
+}
+
+enum rdma_link_layer bnxt_re_get_link_layer(struct ib_device *ibdev,
+ u8 port_num)
+{
+ return IB_LINK_LAYER_ETHERNET;
+}
+
/* Protection Domains */
int bnxt_re_dealloc_pd(struct ib_pd *ib_pd)
{
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.h b/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.h
index 255a0ad..c10fd63 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.h
+++ b/drivers/infiniband/hw/bnxtre/bnxt_re_ib_verbs.h
@@ -39,6 +39,11 @@
#ifndef __BNXT_RE_IB_VERBS_H__
#define __BNXT_RE_IB_VERBS_H__
+struct bnxt_re_gid_ctx {
+ u32 idx;
+ u32 refcnt;
+};
+
struct bnxt_re_pd {
struct bnxt_re_dev *rdev;
struct ib_pd ib_pd;
@@ -54,6 +59,8 @@ struct bnxt_re_ucontext {
spinlock_t sh_lock; /* protect shpg */
};
+struct net_device *bnxt_re_get_netdev(struct ib_device *ibdev, u8 port_num);
+
int bnxt_re_query_device(struct ib_device *ibdev,
struct ib_device_attr *ib_attr,
struct ib_udata *udata);
@@ -67,6 +74,17 @@ int bnxt_re_modify_port(struct ib_device *ibdev, u8 port_num,
struct ib_port_modify *port_modify);
int bnxt_re_get_port_immutable(struct ib_device *ibdev, u8 port_num,
struct ib_port_immutable *immutable);
+int bnxt_re_query_pkey(struct ib_device *ibdev, u8 port_num,
+ u16 index, u16 *pkey);
+int bnxt_re_del_gid(struct ib_device *ibdev, u8 port_num,
+ unsigned int index, void **context);
+int bnxt_re_add_gid(struct ib_device *ibdev, u8 port_num,
+ unsigned int index, const union ib_gid *gid,
+ const struct ib_gid_attr *attr, void **context);
+int bnxt_re_query_gid(struct ib_device *ibdev, u8 port_num,
+ int index, union ib_gid *gid);
+enum rdma_link_layer bnxt_re_get_link_layer(struct ib_device *ibdev,
+ u8 port_num);
struct ib_pd *bnxt_re_alloc_pd(struct ib_device *ibdev,
struct ib_ucontext *context,
struct ib_udata *udata);
diff --git a/drivers/infiniband/hw/bnxtre/bnxt_re_main.c b/drivers/infiniband/hw/bnxtre/bnxt_re_main.c
index afadf21..8893ebf 100644
--- a/drivers/infiniband/hw/bnxtre/bnxt_re_main.c
+++ b/drivers/infiniband/hw/bnxtre/bnxt_re_main.c
@@ -442,6 +442,13 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
ibdev->query_port = bnxt_re_query_port;
ibdev->modify_port = bnxt_re_modify_port;
ibdev->get_port_immutable = bnxt_re_get_port_immutable;
+ ibdev->query_pkey = bnxt_re_query_pkey;
+ ibdev->query_gid = bnxt_re_query_gid;
+ ibdev->get_netdev = bnxt_re_get_netdev;
+ ibdev->add_gid = bnxt_re_add_gid;
+ ibdev->del_gid = bnxt_re_del_gid;
+ ibdev->get_link_layer = bnxt_re_get_link_layer;
+
ibdev->alloc_pd = bnxt_re_alloc_pd;
ibdev->dealloc_pd = bnxt_re_dealloc_pd;
ibdev->alloc_ucontext = bnxt_re_alloc_ucontext;
--
2.5.5
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox