* [PATCH 4/9] net: thunderx: Cleanup receive buffer allocation
From: sunil.kovvuri @ 2017-05-02 13:06 UTC (permalink / raw)
To: netdev; +Cc: linux-kernel, linux-arm-kernel, Sunil Goutham
In-Reply-To: <1493730418-24606-1-git-send-email-sunil.kovvuri@gmail.com>
From: Sunil Goutham <sgoutham@cavium.com>
Get rid of unnecessary double pointer references and type casting
in receive buffer allocation code.
Signed-off-by: Sunil Goutham <sgoutham@cavium.com>
---
drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
index 90c5bc7d..e4a02a9 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
@@ -145,7 +145,7 @@ static struct pgcache *nicvf_alloc_page(struct nicvf *nic,
/* Allocate buffer for packet reception */
static inline int nicvf_alloc_rcv_buffer(struct nicvf *nic, struct rbdr *rbdr,
- gfp_t gfp, u32 buf_len, u64 **rbuf)
+ gfp_t gfp, u32 buf_len, u64 *rbuf)
{
struct pgcache *pgcache = NULL;
@@ -172,10 +172,10 @@ static inline int nicvf_alloc_rcv_buffer(struct nicvf *nic, struct rbdr *rbdr,
nic->rb_page = pgcache->page;
ret:
/* HW will ensure data coherency, CPU sync not required */
- *rbuf = (u64 *)((u64)dma_map_page_attrs(&nic->pdev->dev, nic->rb_page,
- nic->rb_page_offset, buf_len,
- DMA_FROM_DEVICE,
- DMA_ATTR_SKIP_CPU_SYNC));
+ *rbuf = (u64)dma_map_page_attrs(&nic->pdev->dev, nic->rb_page,
+ nic->rb_page_offset, buf_len,
+ DMA_FROM_DEVICE,
+ DMA_ATTR_SKIP_CPU_SYNC);
if (dma_mapping_error(&nic->pdev->dev, (dma_addr_t)*rbuf)) {
if (!nic->rb_page_offset)
__free_pages(nic->rb_page, 0);
@@ -212,7 +212,7 @@ static int nicvf_init_rbdr(struct nicvf *nic, struct rbdr *rbdr,
int ring_len, int buf_size)
{
int idx;
- u64 *rbuf;
+ u64 rbuf;
struct rbdr_entry_t *desc;
int err;
@@ -257,7 +257,7 @@ static int nicvf_init_rbdr(struct nicvf *nic, struct rbdr *rbdr,
}
desc = GET_RBDR_DESC(rbdr, idx);
- desc->buf_addr = (u64)rbuf & ~(NICVF_RCV_BUF_ALIGN_BYTES - 1);
+ desc->buf_addr = rbuf & ~(NICVF_RCV_BUF_ALIGN_BYTES - 1);
}
nicvf_get_page(nic);
@@ -330,7 +330,7 @@ static void nicvf_refill_rbdr(struct nicvf *nic, gfp_t gfp)
int refill_rb_cnt;
struct rbdr *rbdr;
struct rbdr_entry_t *desc;
- u64 *rbuf;
+ u64 rbuf;
int new_rb = 0;
refill:
@@ -364,7 +364,7 @@ static void nicvf_refill_rbdr(struct nicvf *nic, gfp_t gfp)
break;
desc = GET_RBDR_DESC(rbdr, tail);
- desc->buf_addr = (u64)rbuf & ~(NICVF_RCV_BUF_ALIGN_BYTES - 1);
+ desc->buf_addr = rbuf & ~(NICVF_RCV_BUF_ALIGN_BYTES - 1);
refill_rb_cnt--;
new_rb++;
}
--
2.7.4
^ permalink raw reply related
* [PATCH 2/9] net: thunderx: Optimize RBDR descriptor handling
From: sunil.kovvuri @ 2017-05-02 13:06 UTC (permalink / raw)
To: netdev; +Cc: Sunil Goutham, linux-kernel, linux-arm-kernel
In-Reply-To: <1493730418-24606-1-git-send-email-sunil.kovvuri@gmail.com>
From: Sunil Goutham <sgoutham@cavium.com>
Receive buffer's physical address or iova will anyway not
go beyond 49bits, since it is the max supported HW address.
As per perf, updating bitfields i.e buf_addr:42 in RBDR
descriptor entry consumes lots of cpu cycles, hence changed
it to a 64bit field with alignment requirements taken care of.
Signed-off-by: Sunil Goutham <sgoutham@cavium.com>
---
drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 8 ++++----
drivers/net/ethernet/cavium/thunder/q_struct.h | 10 +---------
2 files changed, 5 insertions(+), 13 deletions(-)
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
index 12f9709..dfc85a1 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
@@ -257,7 +257,7 @@ static int nicvf_init_rbdr(struct nicvf *nic, struct rbdr *rbdr,
}
desc = GET_RBDR_DESC(rbdr, idx);
- desc->buf_addr = (u64)rbuf >> NICVF_RCV_BUF_ALIGN;
+ desc->buf_addr = (u64)rbuf & ~(NICVF_RCV_BUF_ALIGN_BYTES - 1);
}
nicvf_get_page(nic);
@@ -286,7 +286,7 @@ static void nicvf_free_rbdr(struct nicvf *nic, struct rbdr *rbdr)
/* Release page references */
while (head != tail) {
desc = GET_RBDR_DESC(rbdr, head);
- buf_addr = ((u64)desc->buf_addr) << NICVF_RCV_BUF_ALIGN;
+ buf_addr = desc->buf_addr;
phys_addr = nicvf_iova_to_phys(nic, buf_addr);
dma_unmap_page_attrs(&nic->pdev->dev, buf_addr, RCV_FRAG_LEN,
DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
@@ -297,7 +297,7 @@ static void nicvf_free_rbdr(struct nicvf *nic, struct rbdr *rbdr)
}
/* Release buffer of tail desc */
desc = GET_RBDR_DESC(rbdr, tail);
- buf_addr = ((u64)desc->buf_addr) << NICVF_RCV_BUF_ALIGN;
+ buf_addr = desc->buf_addr;
phys_addr = nicvf_iova_to_phys(nic, buf_addr);
dma_unmap_page_attrs(&nic->pdev->dev, buf_addr, RCV_FRAG_LEN,
DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
@@ -364,7 +364,7 @@ static void nicvf_refill_rbdr(struct nicvf *nic, gfp_t gfp)
break;
desc = GET_RBDR_DESC(rbdr, tail);
- desc->buf_addr = (u64)rbuf >> NICVF_RCV_BUF_ALIGN;
+ desc->buf_addr = (u64)rbuf & ~(NICVF_RCV_BUF_ALIGN_BYTES - 1);
refill_rb_cnt--;
new_rb++;
}
diff --git a/drivers/net/ethernet/cavium/thunder/q_struct.h b/drivers/net/ethernet/cavium/thunder/q_struct.h
index f363472..e47205a 100644
--- a/drivers/net/ethernet/cavium/thunder/q_struct.h
+++ b/drivers/net/ethernet/cavium/thunder/q_struct.h
@@ -359,15 +359,7 @@ union cq_desc_t {
};
struct rbdr_entry_t {
-#if defined(__BIG_ENDIAN_BITFIELD)
- u64 rsvd0:15;
- u64 buf_addr:42;
- u64 cache_align:7;
-#elif defined(__LITTLE_ENDIAN_BITFIELD)
- u64 cache_align:7;
- u64 buf_addr:42;
- u64 rsvd0:15;
-#endif
+ u64 buf_addr;
};
/* TCP reassembly context */
--
2.7.4
^ permalink raw reply related
* [PATCH 1/9] net: thunderx: Support for page recycling
From: sunil.kovvuri @ 2017-05-02 13:06 UTC (permalink / raw)
To: netdev; +Cc: linux-kernel, linux-arm-kernel, Sunil Goutham
In-Reply-To: <1493730418-24606-1-git-send-email-sunil.kovvuri@gmail.com>
From: Sunil Goutham <sgoutham@cavium.com>
Adds support for page recycling for allocating receive buffers
to reduce cost of refilling RBDR ring. Also got rid of using
compound pages when pagesize is 4K, only order-0 pages now.
Only page is recycled, DMA mappings still needs to be done for
every receive buffer allocated due to following constraints
- Cannot have just one receive buffer per 64KB page.
- There is just one buffer ring shared across 8 Rx queues, so
buffers of same page can go to any Rx queue.
- HW gives buffer address where packet has been DMA'ed and not
the index into buffer ring.
This makes it not possible to resue DMA mapping info. So unfortunately
have to go through costly mapping route for every buffer.
Signed-off-by: Sunil Goutham <sgoutham@cavium.com>
---
drivers/net/ethernet/cavium/thunder/nic.h | 4 +-
.../net/ethernet/cavium/thunder/nicvf_ethtool.c | 3 +-
drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 121 ++++++++++++++++++---
drivers/net/ethernet/cavium/thunder/nicvf_queues.h | 11 ++
4 files changed, 119 insertions(+), 20 deletions(-)
diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h
index 6fb4421..dca6aed 100644
--- a/drivers/net/ethernet/cavium/thunder/nic.h
+++ b/drivers/net/ethernet/cavium/thunder/nic.h
@@ -252,12 +252,14 @@ struct nicvf_drv_stats {
u64 tx_csum_overflow;
/* driver debug stats */
- u64 rcv_buffer_alloc_failures;
u64 tx_tso;
u64 tx_timeout;
u64 txq_stop;
u64 txq_wake;
+ u64 rcv_buffer_alloc_failures;
+ u64 page_alloc;
+
struct u64_stats_sync syncp;
};
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c b/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c
index 02a986c..a89db5f 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c
@@ -100,11 +100,12 @@ static const struct nicvf_stat nicvf_drv_stats[] = {
NICVF_DRV_STAT(tx_csum_overlap),
NICVF_DRV_STAT(tx_csum_overflow),
- NICVF_DRV_STAT(rcv_buffer_alloc_failures),
NICVF_DRV_STAT(tx_tso),
NICVF_DRV_STAT(tx_timeout),
NICVF_DRV_STAT(txq_stop),
NICVF_DRV_STAT(txq_wake),
+ NICVF_DRV_STAT(rcv_buffer_alloc_failures),
+ NICVF_DRV_STAT(page_alloc),
};
static const struct nicvf_stat nicvf_queue_stats[] = {
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
index 7b0fd8d..12f9709 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
@@ -19,8 +19,6 @@
#include "q_struct.h"
#include "nicvf_queues.h"
-#define NICVF_PAGE_ORDER ((PAGE_SIZE <= 4096) ? PAGE_ALLOC_COSTLY_ORDER : 0)
-
static inline u64 nicvf_iova_to_phys(struct nicvf *nic, dma_addr_t dma_addr)
{
/* Translation is installed only when IOMMU is present */
@@ -90,33 +88,88 @@ static void nicvf_free_q_desc_mem(struct nicvf *nic, struct q_desc_mem *dmem)
dmem->base = NULL;
}
-/* Allocate buffer for packet reception
- * HW returns memory address where packet is DMA'ed but not a pointer
- * into RBDR ring, so save buffer address at the start of fragment and
- * align the start address to a cache aligned address
+/* Allocate a new page or recycle one if possible
+ *
+ * We cannot optimize dma mapping here, since
+ * 1. It's only one RBDR ring for 8 Rx queues.
+ * 2. CQE_RX gives address of the buffer where pkt has been DMA'ed
+ * and not idx into RBDR ring, so can't refer to saved info.
+ * 3. There are multiple receive buffers per page
*/
-static inline int nicvf_alloc_rcv_buffer(struct nicvf *nic, gfp_t gfp,
- u32 buf_len, u64 **rbuf)
+static struct pgcache *nicvf_alloc_page(struct nicvf *nic,
+ struct rbdr *rbdr, gfp_t gfp)
{
- int order = NICVF_PAGE_ORDER;
+ struct page *page = NULL;
+ struct pgcache *pgcache, *next;
+
+ /* Check if page is already allocated */
+ pgcache = &rbdr->pgcache[rbdr->pgidx];
+ page = pgcache->page;
+ /* Check if page can be recycled */
+ if (page && (page_ref_count(page) != 1))
+ page = NULL;
+
+ if (!page) {
+ page = alloc_pages(gfp | __GFP_COMP | __GFP_NOWARN, 0);
+ if (!page)
+ return NULL;
+
+ this_cpu_inc(nic->pnicvf->drv_stats->page_alloc);
+
+ /* Check for space */
+ if (rbdr->pgalloc >= rbdr->pgcnt) {
+ /* Page can still be used */
+ nic->rb_page = page;
+ return NULL;
+ }
+
+ /* Save the page in page cache */
+ pgcache->page = page;
+ rbdr->pgalloc++;
+ }
+
+ /* Take extra page reference for recycling */
+ page_ref_add(page, 1);
+
+ rbdr->pgidx++;
+ rbdr->pgidx &= (rbdr->pgcnt - 1);
+
+ /* Prefetch refcount of next page in page cache */
+ next = &rbdr->pgcache[rbdr->pgidx];
+ page = next->page;
+ if (page)
+ prefetch(&page->_refcount);
+
+ return pgcache;
+}
+
+/* Allocate buffer for packet reception */
+static inline int nicvf_alloc_rcv_buffer(struct nicvf *nic, struct rbdr *rbdr,
+ gfp_t gfp, u32 buf_len, u64 **rbuf)
+{
+ struct pgcache *pgcache = NULL;
/* Check if request can be accomodated in previous allocated page */
if (nic->rb_page &&
- ((nic->rb_page_offset + buf_len) < (PAGE_SIZE << order))) {
+ ((nic->rb_page_offset + buf_len) <= PAGE_SIZE)) {
nic->rb_pageref++;
goto ret;
}
nicvf_get_page(nic);
+ nic->rb_page = NULL;
- /* Allocate a new page */
- nic->rb_page = alloc_pages(gfp | __GFP_COMP | __GFP_NOWARN,
- order);
- if (!nic->rb_page) {
+ /* Get new page, either recycled or new one */
+ pgcache = nicvf_alloc_page(nic, rbdr, gfp);
+ if (!pgcache && !nic->rb_page) {
this_cpu_inc(nic->pnicvf->drv_stats->rcv_buffer_alloc_failures);
return -ENOMEM;
}
+
nic->rb_page_offset = 0;
+ /* Check if it's recycled */
+ if (pgcache)
+ nic->rb_page = pgcache->page;
ret:
/* HW will ensure data coherency, CPU sync not required */
*rbuf = (u64 *)((u64)dma_map_page_attrs(&nic->pdev->dev, nic->rb_page,
@@ -125,7 +178,7 @@ static inline int nicvf_alloc_rcv_buffer(struct nicvf *nic, gfp_t gfp,
DMA_ATTR_SKIP_CPU_SYNC));
if (dma_mapping_error(&nic->pdev->dev, (dma_addr_t)*rbuf)) {
if (!nic->rb_page_offset)
- __free_pages(nic->rb_page, order);
+ __free_pages(nic->rb_page, 0);
nic->rb_page = NULL;
return -ENOMEM;
}
@@ -177,10 +230,26 @@ static int nicvf_init_rbdr(struct nicvf *nic, struct rbdr *rbdr,
rbdr->head = 0;
rbdr->tail = 0;
+ /* Initialize page recycling stuff.
+ *
+ * Can't use single buffer per page especially with 64K pages.
+ * On embedded platforms i.e 81xx/83xx available memory itself
+ * is low and minimum ring size of RBDR is 8K, that takes away
+ * lots of memory.
+ */
+ rbdr->pgcnt = ring_len / (PAGE_SIZE / buf_size);
+ rbdr->pgcnt = roundup_pow_of_two(rbdr->pgcnt);
+ rbdr->pgcache = kzalloc(sizeof(*rbdr->pgcache) *
+ rbdr->pgcnt, GFP_KERNEL);
+ if (!rbdr->pgcache)
+ return -ENOMEM;
+ rbdr->pgidx = 0;
+ rbdr->pgalloc = 0;
+
nic->rb_page = NULL;
for (idx = 0; idx < ring_len; idx++) {
- err = nicvf_alloc_rcv_buffer(nic, GFP_KERNEL, RCV_FRAG_LEN,
- &rbuf);
+ err = nicvf_alloc_rcv_buffer(nic, rbdr, GFP_KERNEL,
+ RCV_FRAG_LEN, &rbuf);
if (err) {
/* To free already allocated and mapped ones */
rbdr->tail = idx - 1;
@@ -201,6 +270,7 @@ static void nicvf_free_rbdr(struct nicvf *nic, struct rbdr *rbdr)
{
int head, tail;
u64 buf_addr, phys_addr;
+ struct pgcache *pgcache;
struct rbdr_entry_t *desc;
if (!rbdr)
@@ -234,6 +304,18 @@ static void nicvf_free_rbdr(struct nicvf *nic, struct rbdr *rbdr)
if (phys_addr)
put_page(virt_to_page(phys_to_virt(phys_addr)));
+ /* Sync page cache info */
+ smp_rmb();
+
+ /* Release additional page references held for recycling */
+ head = 0;
+ while (head < rbdr->pgcnt) {
+ pgcache = &rbdr->pgcache[head];
+ if (pgcache->page && page_ref_count(pgcache->page) != 0)
+ put_page(pgcache->page);
+ head++;
+ }
+
/* Free RBDR ring */
nicvf_free_q_desc_mem(nic, &rbdr->dmem);
}
@@ -269,13 +351,16 @@ static void nicvf_refill_rbdr(struct nicvf *nic, gfp_t gfp)
else
refill_rb_cnt = qs->rbdr_len - qcount - 1;
+ /* Sync page cache info */
+ smp_rmb();
+
/* Start filling descs from tail */
tail = nicvf_queue_reg_read(nic, NIC_QSET_RBDR_0_1_TAIL, rbdr_idx) >> 3;
while (refill_rb_cnt) {
tail++;
tail &= (rbdr->dmem.q_len - 1);
- if (nicvf_alloc_rcv_buffer(nic, gfp, RCV_FRAG_LEN, &rbuf))
+ if (nicvf_alloc_rcv_buffer(nic, rbdr, gfp, RCV_FRAG_LEN, &rbuf))
break;
desc = GET_RBDR_DESC(rbdr, tail);
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
index 10cb4b8..da48366 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
@@ -213,6 +213,11 @@ struct q_desc_mem {
void *unalign_base;
};
+struct pgcache {
+ struct page *page;
+ u64 dma_addr;
+};
+
struct rbdr {
bool enable;
u32 dma_size;
@@ -222,6 +227,12 @@ struct rbdr {
u32 head;
u32 tail;
struct q_desc_mem dmem;
+
+ /* For page recycling */
+ int pgidx;
+ int pgcnt;
+ int pgalloc;
+ struct pgcache *pgcache;
} ____cacheline_aligned_in_smp;
struct rcv_queue {
--
2.7.4
^ permalink raw reply related
* [PATCH 0/9] net: thunderx: Adds XDP support
From: sunil.kovvuri @ 2017-05-02 13:06 UTC (permalink / raw)
To: netdev; +Cc: linux-kernel, linux-arm-kernel, Sunil Goutham
From: Sunil Goutham <sgoutham@cavium.com>
This patch series adds support for XDP to ThunderX NIC driver
which is used on CN88xx, CN81xx and CN83xx platforms.
Patches 1-4 are performance improvement and cleanup patches
which are done keeping XDP performance bottlenecks in view.
Rest of the patches adds actual XDP support.
Sunil Goutham (9):
net: thunderx: Support for page recycling
net: thunderx: Optimize RBDR descriptor handling
net: thunderx: Optimize CQE_TX handling
net: thunderx: Cleanup receive buffer allocation
net: thunderx: Add basic XDP support
net: thunderx: Add support for XDP_DROP
net: thunderx: Add support for XDP_TX
net: thunderx: Support for XDP header adjustment
net: thunderx: Optimize page recycling for XDP
drivers/net/ethernet/cavium/thunder/nic.h | 10 +-
.../net/ethernet/cavium/thunder/nicvf_ethtool.c | 29 +-
drivers/net/ethernet/cavium/thunder/nicvf_main.c | 313 +++++++++++++++--
drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 387 +++++++++++++++++----
drivers/net/ethernet/cavium/thunder/nicvf_queues.h | 32 +-
drivers/net/ethernet/cavium/thunder/q_struct.h | 10 +-
6 files changed, 657 insertions(+), 124 deletions(-)
--
2.7.4
^ permalink raw reply
* Re: [PATCH v2 net-next 0/7] Extend socket timestamping API
From: Miroslav Lichvar @ 2017-05-02 12:55 UTC (permalink / raw)
To: netdev; +Cc: Richard Cochran, Willem de Bruijn
In-Reply-To: <20170502101103.30444-1-mlichvar@redhat.com>
Hm, I see that net-next was closed. I missed the annoucement. Sorry
for the spam.
On Tue, May 02, 2017 at 02:46:02PM +0200, Miroslav Lichvar wrote:
> Changes v1->v2:
> - added separate patch for new NAPI functions
> - split code from __sock_recv_timestamp() for better readability
> - fixed RCU locking
> - fixed compiler warning (missing case in switch in first patch)
> - inline sw_tx_timestamp() in its only user
--
Miroslav Lichvar
^ permalink raw reply
* Re: net/smc and the RDMA core
From: Ursula Braun @ 2017-05-02 12:41 UTC (permalink / raw)
To: Parav Pandit, Bart Van Assche, hch-jcswGhMUV9g@public.gmane.org,
davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org
Cc: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <HE1PR0502MB30048AFD086C4B0D535BFC52D1140-692Kmc8YnlL9PhveBwpv4cDSnupUy6xnnBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
On 05/01/2017 07:55 PM, Parav Pandit wrote:
> Hi Bart, Ursula, Dave,
>
> I am particularly concerned about SMC as address family.
> It should not be treated as address family, but rather an additional protocol similar for socket type SOCK_STREAM.
We tried to avoid changes of the kernel TCP code. A new address family
seemed to be a feasible way to achieve this.
> While doing performance benchmarking last month and while porting few database application,
>
> I encountered a major hurdle where user space library heavily depend on AF_INET and AF_INET6 family through get_addrinfo and other friend functions.
> Adding or treating AF_SMC as AF_INET just doesn't sound right.
>
> Most user space code doesn't care for the protocol field, but do handle domain field.
>
> I personally believe it's not too late to modify SMC to drop expose AF_SMC and have it exposed through new protocol that can be exposed through socket() API.
>
> Parav
>
>> -----Original Message-----
>> From: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org [mailto:linux-rdma-
>> owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org] On Behalf Of Bart Van Assche
>> Sent: Monday, May 1, 2017 12:30 PM
>> To: hch-jcswGhMUV9g@public.gmane.org; davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org; ubraun-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org
>> Cc: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
>> Subject: Re: net/smc and the RDMA core
>>
>> On Mon, 2017-05-01 at 18:33 +0200, Christoph Hellwig wrote:
>>> Hi Ursual, hi netdev reviewers,
>>>
>>> how did the smc protocol manage to get merged without any review on
>>> linux-rdma at all? As the results it seems it's very substandard in
>>> terms of RDMA API usage, e.g. it neither uses the proper CQ API nor
>>> the RDMA R/W API, and other will probably find additional issues as
>>> well.
>>
>> Hello Dave and Ursula,
>>
>> It seems very rude to me to have merged the SMC protocol driver without
>> having involved the linux-rdma community. Anyway, I have the following
>> questions for Dave and Ursula:
>> * Since the Linux kernel is standards based: where can we find the standard
>> that defines the SMC wire protocol? If this protocol has not been
>> standardized yet: in what file (other than *.[ch]) in the Linux kernel
>> tree has this protocol been documented?
>> * What are the differences between the SMC protocol, the SDP protocol and
>> the rsockets protocol? How do existing implementations for these protocols
>> compare to each other from a performance point of view? If no performance
>> comparison between these protocols is available, shouldn't the performance
>> of these protocols have been compared with each other before a review of
>> the SMC driver even started?
>> * What are the reasons why the SDP driver was never accepted upstream? Do
>> the arguments why SDP was not accepted upstream also apply to the SMC
>> driver (SDP = Sockets Direct Protocol)?
>> * Since SMC has to be selected by specifying AF_SMC, how are users expected
>> to specify whether AF_INET, AF_INET6 or yet another address family should
>> be used to set up a connection between SMC endpoints?
>> * Is the SMC driver limited to RoCE? Are you aware that the rsockets library
>> supports multiple transport layers (RoCE, IB and iWARP)?
>> * Since functionality that is similar what the SMC driver provides already
>> exists in user space (rsockets), why has this functionality been
>> reimplemented as a kernel driver (SMC)?
>>
>> Bart.--
>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body
>> of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at
>> http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: net/smc and the RDMA core
From: Ursula Braun @ 2017-05-02 12:34 UTC (permalink / raw)
To: Bart Van Assche, hch-jcswGhMUV9g@public.gmane.org,
davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org
Cc: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <1493659776.2665.7.camel-XdAiOPVOjttBDgjK7y7TUQ@public.gmane.org>
On 05/01/2017 07:29 PM, Bart Van Assche wrote:
> On Mon, 2017-05-01 at 18:33 +0200, Christoph Hellwig wrote:
>> Hi Ursual, hi netdev reviewers,
>>
>> how did the smc protocol manage to get merged without any review
>> on linux-rdma at all? As the results it seems it's very substandard
>> in terms of RDMA API usage, e.g. it neither uses the proper CQ API
>> nor the RDMA R/W API, and other will probably find additional issues
>> as well.
>
> Hello Dave and Ursula,
>
> It seems very rude to me to have merged the SMC protocol driver without
> having involved the linux-rdma community. Anyway, I have the following
> questions for Dave and Ursula:
> * Since the Linux kernel is standards based: where can we find the standard
> that defines the SMC wire protocol? If this protocol has not been
> standardized yet: in what file (other than *.[ch]) in the Linux kernel
> tree has this protocol been documented?
Hello Bart,
The protocol is standardized, see: http://www.rfc-editor.org/info/rfc7609.
I described this and some more protocol details in my patch series
overview mail, see for instance:
http://marc.info/?l=linux-s390&m=148397751211964&w=2
This description explains the reasons to come up with SMC-R.
> * What are the differences between the SMC protocol, the SDP protocol and
> the rsockets protocol? How do existing implementations for these protocols
> compare to each other from a performance point of view? If no performance
> comparison between these protocols is available, shouldn't the performance
> of these protocols have been compared with each other before a review of
> the SMC driver even started?
> * What are the reasons why the SDP driver was never accepted upstream? Do
> the arguments why SDP was not accepted upstream also apply to the SMC
> driver (SDP = Sockets Direct Protocol)?
> * Since SMC has to be selected by specifying AF_SMC, how are users expected
> to specify whether AF_INET, AF_INET6 or yet another address family should
> be used to set up a connection between SMC
> endpoints?
The IPv6 support in SMC-R is on our todo-list.
> * Is the SMC driver limited to RoCE? Are you aware that the rsockets library
> supports multiple transport layers (RoCE, IB and iWARP)?
For now, only RoCE is supported. Other transports might be added in the future.
> * Since functionality that is similar what the SMC driver provides already
> exists in user space (rsockets), why has this functionality been
> reimplemented as a kernel driver (SMC)?
>
> Bart.
>
Regards, Ursula
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* IPsec PFP support on linux
From: Sowmini Varadhan @ 2017-05-02 12:32 UTC (permalink / raw)
To: netdev, herbert, linux-crypto, swan, steffen.klassert, borisp,
ilant
Cc: sowmini.varadhan
I have a question about linux support for IPsec PFP (as defined in
rfc 4301). I am assuming this exists, and is accessible from uspace,
in which case I need some hints on how to set it up.
Assuming I have a server listening at port 5001 that I want to
secure via ipsec. Suppose I want to make sure that each TCP/UDP 5-tuple
sending packets to port 5001 gets its own SA.
RFC4301 has this:
- SPD-S: For traffic that is to be protected using IPsec, the
entry consists of the values of the selectors that apply to the
traffic to be protected via AH or ESP, controls on how to
create SAs based on these selectors, ...
and further down
If IPsec processing is specified for
an entry, a "populate from packet" (PFP) flag may be asserted for
one or more of the selectors in the SPD entry (Local IP address;
Remote IP address; Next Layer Protocol; and, depending on Next
Layer Protocol, Local port and Remote port, or ICMP type/code, or
Mobility Header type). If asserted for a given selector X, the
flag indicates that the SA to be created should take its value for
X from the value in the packet. Otherwise, the SA should take its
value(s) for X from the value(s) in the SPD entry.
A google search produces a discarded patch
http://marc.info/?l=linux-netdev&m=119746758904140
but its not clear to me how to set this up (if PFP works fine,
as suggested by Herbert's response above)
I tried experimenting with IP_XFRM_POLICY from a simple udp client but
(a) that seems to require a SPI and reqid to set up the SPD
(b) I see the SADB_ACQUIRE upcall being triggered after the local port
is bound (and SADB entry is set up for the lport). But ike phase2
does not converge for the lport specific sadb added
by the bind (even in quick mode)
My understanding is that pluto shoud be generating spi's to make sure
they are sufficiently unique/random etc. so (a) makes me think I'm
either not setting this up or not using this correctly.
Any hints/sample code/RTFMs would be helpful (documentation for
IP_XFRM_POLICY seems scant, afaict). I'd be happy to share my
udp client program, if it can provide more context to my question.
--Sowmini
^ permalink raw reply
* [net-next PATCH 4/4] samples/bpf: export map_data[] for more info on maps
From: Jesper Dangaard Brouer @ 2017-05-02 12:32 UTC (permalink / raw)
To: kafai
Cc: netdev, eric, Daniel Borkmann, Alexei Starovoitov,
Jesper Dangaard Brouer
In-Reply-To: <149372826543.22268.3617359219409721129.stgit@firesoul>
Giving *_user.c side tools access to map_data[] provides easier
access to information on the maps being loaded. Still provide
the guarantee that the order maps are being defined in inside the
_kern.c file corresponds with the order in the array. Now user
tools are not blind, but can inspect and verify the maps that got
loaded from the ELF binary.
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
samples/bpf/bpf_load.h | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 4d4fd4678a64..ca0563d04744 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -24,12 +24,18 @@ struct bpf_map_data {
typedef void (*fixup_map_cb)(struct bpf_map_data *map, int idx);
-extern int map_fd[MAX_MAPS];
extern int prog_fd[MAX_PROGS];
extern int event_fd[MAX_PROGS];
extern char bpf_log_buf[BPF_LOG_BUF_SIZE];
extern int prog_cnt;
+/* There is a one-to-one mapping between map_fd[] and map_data[].
+ * The map_data[] just contains more rich info on the given map.
+ */
+extern int map_fd[MAX_MAPS];
+extern struct bpf_map_data map_data[MAX_MAPS];
+extern int map_data_count;
+
/* parses elf file compiled by llvm .c->.o
* . parses 'maps' section and creates maps via BPF syscall
* . parses 'license' section and passes it to syscall
^ permalink raw reply related
* [net-next PATCH 3/4] samples/bpf: load_bpf.c make callback fixup more flexible
From: Jesper Dangaard Brouer @ 2017-05-02 12:32 UTC (permalink / raw)
To: kafai
Cc: netdev, eric, Daniel Borkmann, Alexei Starovoitov,
Jesper Dangaard Brouer
In-Reply-To: <149372826543.22268.3617359219409721129.stgit@firesoul>
Do this change before others start to use this callback.
Change map_perf_test_user.c which seems to be the only user.
This patch extends capabilities of commit 9fd63d05f3e8 ("bpf:
Allow bpf sample programs (*_user.c) to change bpf_map_def").
Give fixup callback access to struct bpf_map_data, instead of
only stuct bpf_map_def. This add flexibility to allow userspace
to reassign the map file descriptor. This is very useful when
wanting to share maps between several bpf programs.
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
samples/bpf/bpf_load.c | 17 ++++++++---------
samples/bpf/bpf_load.h | 10 ++++++++--
samples/bpf/map_perf_test_user.c | 14 +++++++-------
3 files changed, 23 insertions(+), 18 deletions(-)
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index fedec29c7817..74456b3eb89a 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -39,13 +39,6 @@ int event_fd[MAX_PROGS];
int prog_cnt;
int prog_array_fd = -1;
-/* Keeping relevant info on maps */
-struct bpf_map_data {
- int fd;
- char *name;
- size_t elf_offset;
- struct bpf_map_def def;
-};
struct bpf_map_data map_data[MAX_MAPS];
int map_data_count = 0;
@@ -202,8 +195,14 @@ static int load_maps(struct bpf_map_data *maps, int nr_maps,
int i;
for (i = 0; i < nr_maps; i++) {
- if (fixup_map)
- fixup_map(&maps[i].def, maps[i].name, i);
+ if (fixup_map) {
+ fixup_map(&maps[i], i);
+ /* Allow userspace to assign map FD prior to creation */
+ if (maps[i].fd != -1) {
+ map_fd[i] = maps[i].fd;
+ continue;
+ }
+ }
if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) {
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 05822f83173a..4d4fd4678a64 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -15,8 +15,14 @@ struct bpf_map_def {
unsigned int inner_map_idx;
};
-typedef void (*fixup_map_cb)(struct bpf_map_def *map, const char *map_name,
- int idx);
+struct bpf_map_data {
+ int fd;
+ char *name;
+ size_t elf_offset;
+ struct bpf_map_def def;
+};
+
+typedef void (*fixup_map_cb)(struct bpf_map_data *map, int idx);
extern int map_fd[MAX_MAPS];
extern int prog_fd[MAX_PROGS];
diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c
index 6ac778153315..1a8894b5ac51 100644
--- a/samples/bpf/map_perf_test_user.c
+++ b/samples/bpf/map_perf_test_user.c
@@ -320,21 +320,21 @@ static void fill_lpm_trie(void)
assert(!r);
}
-static void fixup_map(struct bpf_map_def *map, const char *name, int idx)
+static void fixup_map(struct bpf_map_data *map, int idx)
{
int i;
- if (!strcmp("inner_lru_hash_map", name)) {
+ if (!strcmp("inner_lru_hash_map", map->name)) {
inner_lru_hash_idx = idx;
- inner_lru_hash_size = map->max_entries;
+ inner_lru_hash_size = map->def.max_entries;
}
- if (!strcmp("array_of_lru_hashs", name)) {
+ if (!strcmp("array_of_lru_hashs", map->name)) {
if (inner_lru_hash_idx == -1) {
printf("inner_lru_hash_map must be defined before array_of_lru_hashs\n");
exit(1);
}
- map->inner_map_idx = inner_lru_hash_idx;
+ map->def.inner_map_idx = inner_lru_hash_idx;
array_of_lru_hashs_idx = idx;
}
@@ -345,9 +345,9 @@ static void fixup_map(struct bpf_map_def *map, const char *name, int idx)
/* Only change the max_entries for the enabled test(s) */
for (i = 0; i < NR_TESTS; i++) {
- if (!strcmp(test_map_names[i], name) &&
+ if (!strcmp(test_map_names[i], map->name) &&
(check_test_flags(i))) {
- map->max_entries = num_map_entries;
+ map->def.max_entries = num_map_entries;
}
}
}
^ permalink raw reply related
* [net-next PATCH 0/4] Improve bpf ELF-loader under samples/bpf
From: Jesper Dangaard Brouer @ 2017-05-02 12:31 UTC (permalink / raw)
To: kafai
Cc: netdev, eric, Daniel Borkmann, Alexei Starovoitov,
Jesper Dangaard Brouer
This series improves and fixes bpf ELF loader and programs under
samples/bpf. The bpf_load.c created some hard to debug issues when
the struct (bpf_map_def) used in the ELF maps section format changed
in commit fb30d4b71214 ("bpf: Add tests for map-in-map").
This was hotfixed in commit 409526bea3c3 ("samples/bpf: bpf_load.c
detect and abort if ELF maps section size is wrong") by detecting the
issue and aborting the program.
In most situations the bpf-loader should be able to handle these kind
of changes to the struct size. This patch series aim to do proper
backward and forward compabilility handling when loading ELF files.
This series also adjust the callback that was introduced in commit
9fd63d05f3e8 ("bpf: Allow bpf sample programs (*_user.c) to change
bpf_map_def") to use the new bpf_map_data structure, before more users
start to use this callback.
Hoping these changes can make the merge window, as above mentioned
commits have not been merged yet, and it would be good to avoid users
hitting these issues.
---
Jesper Dangaard Brouer (4):
samples/bpf: adjust rlimit RLIMIT_MEMLOCK for traceex2, tracex3 and tracex4
samples/bpf: make bpf_load.c code compatible with ELF maps section changes
samples/bpf: load_bpf.c make callback fixup more flexible
samples/bpf: export map_data[] for more info on maps
samples/bpf/bpf_load.c | 229 ++++++++++++++++++++++++++------------
samples/bpf/bpf_load.h | 18 ++-
samples/bpf/map_perf_test_user.c | 14 +-
samples/bpf/tracex2_user.c | 7 +
samples/bpf/tracex3_user.c | 7 +
samples/bpf/tracex4_user.c | 8 +
6 files changed, 201 insertions(+), 82 deletions(-)
^ permalink raw reply
* [net-next PATCH 1/4] samples/bpf: adjust rlimit RLIMIT_MEMLOCK for traceex2, tracex3 and tracex4
From: Jesper Dangaard Brouer @ 2017-05-02 12:31 UTC (permalink / raw)
To: kafai
Cc: netdev, eric, Daniel Borkmann, Alexei Starovoitov,
Jesper Dangaard Brouer
In-Reply-To: <149372826543.22268.3617359219409721129.stgit@firesoul>
Needed to adjust max locked memory RLIMIT_MEMLOCK for testing these bpf samples
as these are using more and larger maps than can fit in distro default 64Kbytes limit.
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
samples/bpf/tracex2_user.c | 7 +++++++
samples/bpf/tracex3_user.c | 7 +++++++
samples/bpf/tracex4_user.c | 8 ++++++++
3 files changed, 22 insertions(+)
diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c
index ded9804c5034..7fee0f1ba9a3 100644
--- a/samples/bpf/tracex2_user.c
+++ b/samples/bpf/tracex2_user.c
@@ -4,6 +4,7 @@
#include <signal.h>
#include <linux/bpf.h>
#include <string.h>
+#include <sys/resource.h>
#include "libbpf.h"
#include "bpf_load.h"
@@ -112,6 +113,7 @@ static void int_exit(int sig)
int main(int ac, char **argv)
{
+ struct rlimit r = {1024*1024, RLIM_INFINITY};
char filename[256];
long key, next_key, value;
FILE *f;
@@ -119,6 +121,11 @@ int main(int ac, char **argv)
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return 1;
+ }
+
signal(SIGINT, int_exit);
/* start 'ping' in the background to have some kfree_skb events */
diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c
index 8f7d199d5945..fe372239d505 100644
--- a/samples/bpf/tracex3_user.c
+++ b/samples/bpf/tracex3_user.c
@@ -11,6 +11,7 @@
#include <stdbool.h>
#include <string.h>
#include <linux/bpf.h>
+#include <sys/resource.h>
#include "libbpf.h"
#include "bpf_load.h"
@@ -112,11 +113,17 @@ static void print_hist(int fd)
int main(int ac, char **argv)
{
+ struct rlimit r = {1024*1024, RLIM_INFINITY};
char filename[256];
int i;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return 1;
+ }
+
if (load_bpf_file(filename)) {
printf("%s", bpf_log_buf);
return 1;
diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c
index 03449f773cb1..22c644f1f4c3 100644
--- a/samples/bpf/tracex4_user.c
+++ b/samples/bpf/tracex4_user.c
@@ -12,6 +12,8 @@
#include <string.h>
#include <time.h>
#include <linux/bpf.h>
+#include <sys/resource.h>
+
#include "libbpf.h"
#include "bpf_load.h"
@@ -50,11 +52,17 @@ static void print_old_objects(int fd)
int main(int ac, char **argv)
{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
char filename[256];
int i;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)");
+ return 1;
+ }
+
if (load_bpf_file(filename)) {
printf("%s", bpf_log_buf);
return 1;
^ permalink raw reply related
* [net-next PATCH 2/4] samples/bpf: make bpf_load.c code compatible with ELF maps section changes
From: Jesper Dangaard Brouer @ 2017-05-02 12:31 UTC (permalink / raw)
To: kafai
Cc: netdev, eric, Daniel Borkmann, Alexei Starovoitov,
Jesper Dangaard Brouer
In-Reply-To: <149372826543.22268.3617359219409721129.stgit@firesoul>
This patch does proper parsing of the ELF "maps" section, in-order to
be both backwards and forwards compatible with changes to the map
definition struct bpf_map_def, which gets compiled into the ELF file.
The assumption is that new features with value zero, means that they
are not in-use. For backward compatibility where loading an ELF file
with a smaller struct bpf_map_def, only copy objects ELF size, leaving
rest of loaders struct zero. For forward compatibility where ELF file
have a larger struct bpf_map_def, only copy loaders own struct size
and verify that rest of the larger struct is zero, assuming this means
the newer feature was not activated, thus it should be safe for this
older loader to load this newer ELF file.
Fixes: fb30d4b71214 ("bpf: Add tests for map-in-map")
Fixes: 409526bea3c3 ("samples/bpf: bpf_load.c detect and abort if ELF maps section size is wrong")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
samples/bpf/bpf_load.c | 224 +++++++++++++++++++++++++++++++++---------------
1 file changed, 155 insertions(+), 69 deletions(-)
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 4221dc359453..fedec29c7817 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -39,6 +39,16 @@ int event_fd[MAX_PROGS];
int prog_cnt;
int prog_array_fd = -1;
+/* Keeping relevant info on maps */
+struct bpf_map_data {
+ int fd;
+ char *name;
+ size_t elf_offset;
+ struct bpf_map_def def;
+};
+struct bpf_map_data map_data[MAX_MAPS];
+int map_data_count = 0;
+
static int populate_prog_array(const char *event, int prog_fd)
{
int ind = atoi(event), err;
@@ -186,42 +196,39 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
return 0;
}
-static int load_maps(struct bpf_map_def *maps, int nr_maps,
- const char **map_names, fixup_map_cb fixup_map)
+static int load_maps(struct bpf_map_data *maps, int nr_maps,
+ fixup_map_cb fixup_map)
{
int i;
- /*
- * Warning: Using "maps" pointing to ELF data_maps->d_buf as
- * an array of struct bpf_map_def is a wrong assumption about
- * the ELF maps section format.
- */
+
for (i = 0; i < nr_maps; i++) {
if (fixup_map)
- fixup_map(&maps[i], map_names[i], i);
+ fixup_map(&maps[i].def, maps[i].name, i);
- if (maps[i].type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
- maps[i].type == BPF_MAP_TYPE_HASH_OF_MAPS) {
- int inner_map_fd = map_fd[maps[i].inner_map_idx];
+ if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
+ maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+ int inner_map_fd = map_fd[maps[i].def.inner_map_idx];
- map_fd[i] = bpf_create_map_in_map(maps[i].type,
- maps[i].key_size,
- inner_map_fd,
- maps[i].max_entries,
- maps[i].map_flags);
+ map_fd[i] = bpf_create_map_in_map(maps[i].def.type,
+ maps[i].def.key_size,
+ inner_map_fd,
+ maps[i].def.max_entries,
+ maps[i].def.map_flags);
} else {
- map_fd[i] = bpf_create_map(maps[i].type,
- maps[i].key_size,
- maps[i].value_size,
- maps[i].max_entries,
- maps[i].map_flags);
+ map_fd[i] = bpf_create_map(maps[i].def.type,
+ maps[i].def.key_size,
+ maps[i].def.value_size,
+ maps[i].def.max_entries,
+ maps[i].def.map_flags);
}
if (map_fd[i] < 0) {
printf("failed to create a map: %d %s\n",
errno, strerror(errno));
return 1;
}
+ maps[i].fd = map_fd[i];
- if (maps[i].type == BPF_MAP_TYPE_PROG_ARRAY)
+ if (maps[i].def.type == BPF_MAP_TYPE_PROG_ARRAY)
prog_array_fd = map_fd[i];
}
return 0;
@@ -251,7 +258,8 @@ static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname,
}
static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols,
- GElf_Shdr *shdr, struct bpf_insn *insn)
+ GElf_Shdr *shdr, struct bpf_insn *insn,
+ struct bpf_map_data *maps, int nr_maps)
{
int i, nrels;
@@ -261,6 +269,8 @@ static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols,
GElf_Sym sym;
GElf_Rel rel;
unsigned int insn_idx;
+ bool match = false;
+ int j, map_idx;
gelf_getrel(data, i, &rel);
@@ -274,11 +284,21 @@ static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols,
return 1;
}
insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD;
- /*
- * Warning: Using sizeof(struct bpf_map_def) here is a
- * wrong assumption about ELF maps section format
- */
- insn[insn_idx].imm = map_fd[sym.st_value / sizeof(struct bpf_map_def)];
+
+ /* Match FD relocation against recorded map_data[] offset */
+ for (map_idx = 0; map_idx < nr_maps; map_idx++) {
+ if (maps[map_idx].elf_offset == sym.st_value) {
+ match = true;
+ break;
+ }
+ }
+ if (match) {
+ insn[insn_idx].imm = maps[map_idx].fd;
+ } else {
+ printf("invalid relo for insn[%d] no map_data match\n",
+ insn_idx);
+ return 1;
+ }
}
return 0;
@@ -297,40 +317,112 @@ static int cmp_symbols(const void *l, const void *r)
return 0;
}
-static int get_sorted_map_names(Elf *elf, Elf_Data *symbols, int maps_shndx,
- int strtabidx, char **map_names)
+static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx,
+ Elf *elf, Elf_Data *symbols, int strtabidx)
{
- GElf_Sym map_symbols[MAX_MAPS];
- int i, nr_maps = 0;
+ int map_sz_elf, map_sz_copy;
+ bool validate_zero = false;
+ Elf_Data *data_maps;
+ int i, nr_maps;
+ GElf_Sym *sym;
+ Elf_Scn *scn;
+ int copy_sz;
+
+ if (maps_shndx < 0)
+ return -EINVAL;
+ if (!symbols)
+ return -EINVAL;
+
+ /* Get data for maps section via elf index */
+ scn = elf_getscn(elf, maps_shndx);
+ if (scn)
+ data_maps = elf_getdata(scn, NULL);
+ if (!scn || !data_maps) {
+ printf("Failed to get Elf_Data from maps section %d\n",
+ maps_shndx);
+ return -EINVAL;
+ }
- for (i = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) {
- assert(nr_maps < MAX_MAPS);
- if (!gelf_getsym(symbols, i, &map_symbols[nr_maps]))
+ /* For each map get corrosponding symbol table entry */
+ sym = calloc(MAX_MAPS+1, sizeof(GElf_Sym));
+ for (i = 0, nr_maps = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) {
+ assert(nr_maps < MAX_MAPS+1);
+ if (!gelf_getsym(symbols, i, &sym[nr_maps]))
continue;
- if (map_symbols[nr_maps].st_shndx != maps_shndx)
+ if (sym[nr_maps].st_shndx != maps_shndx)
continue;
+ /* Only increment iif maps section */
nr_maps++;
}
- qsort(map_symbols, nr_maps, sizeof(GElf_Sym), cmp_symbols);
+ /* Align to map_fd[] order, via sort on offset in sym.st_value */
+ qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols);
+
+ /* Keeping compatible with ELF maps section changes
+ * ------------------------------------------------
+ * The program size of struct bpf_map_def is known by loader
+ * code, but struct stored in ELF file can be different.
+ *
+ * Unfortunately sym[i].st_size is zero. To calculate the
+ * struct size stored in the ELF file, assume all struct have
+ * the same size, and simply divide with number of map
+ * symbols.
+ */
+ map_sz_elf = data_maps->d_size / nr_maps;
+ map_sz_copy = sizeof(struct bpf_map_def);
+ if (map_sz_elf < map_sz_copy) {
+ /*
+ * Backward compat, loading older ELF file with
+ * smaller struct, keeping remaining bytes zero.
+ */
+ map_sz_copy = map_sz_elf;
+ } else if (map_sz_elf > map_sz_copy) {
+ /*
+ * Forward compat, loading newer ELF file with larger
+ * struct with unknown features. Assume zero means
+ * feature not used. Thus, validate rest of struct
+ * data is zero.
+ */
+ validate_zero = true;
+ }
+ /* Memcpy relevant part of ELF maps data to loader maps */
for (i = 0; i < nr_maps; i++) {
- char *map_name;
-
- map_name = elf_strptr(elf, strtabidx, map_symbols[i].st_name);
- if (!map_name) {
- printf("cannot get map symbol\n");
- return -1;
- }
-
- map_names[i] = strdup(map_name);
- if (!map_names[i]) {
+ unsigned char *addr, *end;
+ struct bpf_map_def *def;
+ const char *map_name;
+ size_t offset;
+
+ map_name = elf_strptr(elf, strtabidx, sym[i].st_name);
+ maps[i].name = strdup(map_name);
+ if (!maps[i].name) {
printf("strdup(%s): %s(%d)\n", map_name,
strerror(errno), errno);
- return -1;
+ free(sym);
+ return -errno;
+ }
+
+ /* Symbol value is offset into ELF maps section data area */
+ offset = sym[i].st_value;
+ def = (struct bpf_map_def *)(data_maps->d_buf + offset);
+ maps[i].elf_offset = offset;
+ memset(&maps[i].def, 0, sizeof(struct bpf_map_def));
+ memcpy(&maps[i].def, def, map_sz_copy);
+
+ /* Verify no newer features were requested */
+ if (validate_zero) {
+ addr = (unsigned char*) def + map_sz_copy;
+ end = (unsigned char*) def + map_sz_elf;
+ for (; addr < end; addr++) {
+ if (*addr != 0) {
+ free(sym);
+ return -EFBIG;
+ }
+ }
}
}
+ free(sym);
return nr_maps;
}
@@ -341,7 +433,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
GElf_Ehdr ehdr;
GElf_Shdr shdr, shdr_prog;
Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL;
- char *shname, *shname_prog, *map_names[MAX_MAPS] = { NULL };
+ char *shname, *shname_prog;
+ int nr_maps = 0;
/* reset global variables */
kern_version = 0;
@@ -389,8 +482,12 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
}
memcpy(&kern_version, data->d_buf, sizeof(int));
} else if (strcmp(shname, "maps") == 0) {
+ int j;
+
maps_shndx = i;
data_maps = data;
+ for (j = 0; j < MAX_MAPS; j++)
+ map_data[j].fd = -1;
} else if (shdr.sh_type == SHT_SYMTAB) {
strtabidx = shdr.sh_link;
symbols = data;
@@ -405,27 +502,17 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
}
if (data_maps) {
- int nr_maps;
- int prog_elf_map_sz;
-
- nr_maps = get_sorted_map_names(elf, symbols, maps_shndx,
- strtabidx, map_names);
- if (nr_maps < 0)
- goto done;
-
- /* Deduce map struct size stored in ELF maps section */
- prog_elf_map_sz = data_maps->d_size / nr_maps;
- if (prog_elf_map_sz != sizeof(struct bpf_map_def)) {
- printf("Error: ELF maps sec wrong size (%d/%lu),"
- " old kern.o file?\n",
- prog_elf_map_sz, sizeof(struct bpf_map_def));
+ nr_maps = load_elf_maps_section(map_data, maps_shndx,
+ elf, symbols, strtabidx);
+ if (nr_maps < 0) {
+ printf("Error: Failed loading ELF maps (errno:%d):%s\n",
+ nr_maps, strerror(-nr_maps));
ret = 1;
goto done;
}
-
- if (load_maps(data_maps->d_buf, nr_maps,
- (const char **)map_names, fixup_map))
+ if (load_maps(map_data, nr_maps, fixup_map))
goto done;
+ map_data_count = nr_maps;
processed_sec[maps_shndx] = true;
}
@@ -453,7 +540,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
processed_sec[shdr.sh_info] = true;
processed_sec[i] = true;
- if (parse_relo_and_apply(data, symbols, &shdr, insns))
+ if (parse_relo_and_apply(data, symbols, &shdr, insns,
+ map_data, nr_maps))
continue;
if (memcmp(shname_prog, "kprobe/", 7) == 0 ||
@@ -488,8 +576,6 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
ret = 0;
done:
- for (i = 0; i < MAX_MAPS; i++)
- free(map_names[i]);
close(fd);
return ret;
}
^ permalink raw reply related
* Re: net/smc and the RDMA core
From: Ursula Braun @ 2017-05-02 12:25 UTC (permalink / raw)
To: Christoph Hellwig, David S. Miller
Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20170501163311.GA22209-jcswGhMUV9g@public.gmane.org>
On 05/01/2017 06:33 PM, Christoph Hellwig wrote:
> Hi Ursual, hi netdev reviewers,
>
> how did the smc protocol manage to get merged without any review
> on linux-rdma at all? As the results it seems it's very substandard
> in terms of RDMA API usage, e.g. it neither uses the proper CQ API
> nor the RDMA R/W API, and other will probably find additional issues
> as well.
>
Hi Christoph,
We have been posting SMC-R patches on netdev since 2015, there was never
any secrecy about it. Still sorry for omitting linux-rdma, will include
with future postings from now on.
Of course, we are open to any further code reviews, so if you can point
out specific issues, we will be happy to work with you to get them
addressed!
Regards, Ursula
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [patch net-next repost] net: sched: add helpers to handle extended actions
From: Jiri Pirko @ 2017-05-02 12:07 UTC (permalink / raw)
To: Jamal Hadi Salim; +Cc: netdev, davem, xiyou.wangcong, mlxsw
In-Reply-To: <35328c1b-4571-f115-50c6-5f0c46ced3cc@mojatatu.com>
Tue, May 02, 2017 at 01:59:20PM CEST, jhs@mojatatu.com wrote:
>On 17-05-02 04:12 AM, Jiri Pirko wrote:
>> From: Jiri Pirko <jiri@mellanox.com>
>>
>> Jump is now the only one using value action opcode. This is going to
>> change soon. So introduce helpers to work with this. Convert TC_ACT_JUMP.
>>
>> This also fixes the TC_ACT_JUMP check, which is incorrectly done as a
>> bit check, not a value check.
>>
>> Fixes: e0ee84ded796 ("net sched actions: Complete the JUMPX opcode")
>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>> ---
>> Dave, I'm sending this for -net-next although I know it is closed. But
>> the mentioned commit is not yet in -net. Feel free to take this either
>> to -net-next or -net, whatever suits you better. Thanks.
>
>I think you are pushing the boundary a little calling it a bug fix
Well, it is a bugfix. Otherwise we could not use bit 1 for anything else
then jump in the future. This is also UAPI. That's why I'm pushing it as
a fix.
>and this could go with your patch series instead.
>The name TC_ACT_EXT_CMP should be TC_ACT_EXT_CMP_OPCODE
I was thinking about it as well, I would like to keep it shorter. The
current name is quite appropriate and it is clear what the macro does.
>Other than that:
>
>Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Thanks.
^ permalink raw reply
* Re: [oss-drivers] Re: [PATCH/RFC net-next 0/4] net/sched: cls_flower: avoid false matching of truncated packets
From: Simon Horman @ 2017-05-02 12:04 UTC (permalink / raw)
To: Jamal Hadi Salim
Cc: Jiri Pirko, Cong Wang, Dinan Gunawardena, netdev, oss-drivers
In-Reply-To: <c88b57d8-81a5-78e5-add6-33941f912d73@mojatatu.com>
On Mon, May 01, 2017 at 09:35:47PM -0400, Jamal Hadi Salim wrote:
> On 17-05-01 06:36 AM, Simon Horman wrote:
> >On Sun, Apr 30, 2017 at 09:51:30AM -0400, Jamal Hadi Salim wrote:
> >>On 17-04-28 10:14 AM, Simon Horman wrote:
>
> [..]
>
> >>minimal some flag should qualify it as "truncated".
> >
> >Would changing TCA_FLOWER_HEADER_PARSE_ERR_ACT to
> >TCA_FLOWER_META_TRUNCATED help?
> >
>
> I think so - as long as you are able to recognize the truncated
> vs real 0 port/type etc.
That is the intention of the patchset.
I will make the change and repost.
^ permalink raw reply
* Re: [patch net-next repost] net: sched: add helpers to handle extended actions
From: Jamal Hadi Salim @ 2017-05-02 11:59 UTC (permalink / raw)
To: Jiri Pirko, netdev; +Cc: davem, xiyou.wangcong, mlxsw
In-Reply-To: <1493712720-1501-1-git-send-email-jiri@resnulli.us>
On 17-05-02 04:12 AM, Jiri Pirko wrote:
> From: Jiri Pirko <jiri@mellanox.com>
>
> Jump is now the only one using value action opcode. This is going to
> change soon. So introduce helpers to work with this. Convert TC_ACT_JUMP.
>
> This also fixes the TC_ACT_JUMP check, which is incorrectly done as a
> bit check, not a value check.
>
> Fixes: e0ee84ded796 ("net sched actions: Complete the JUMPX opcode")
> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
> ---
> Dave, I'm sending this for -net-next although I know it is closed. But
> the mentioned commit is not yet in -net. Feel free to take this either
> to -net-next or -net, whatever suits you better. Thanks.
I think you are pushing the boundary a little calling it a bug fix
and this could go with your patch series instead.
The name TC_ACT_EXT_CMP should be TC_ACT_EXT_CMP_OPCODE
Other than that:
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
cheers,
jamal
^ permalink raw reply
* Re: [patch net-next] net: sched: add helpers to handle extended actions
From: Jamal Hadi Salim @ 2017-05-02 11:57 UTC (permalink / raw)
To: Jiri Pirko; +Cc: netdev, davem, xiyou.wangcong, mlxsw
In-Reply-To: <20170502055134.GB1877@nanopsycho.orion>
On 17-05-02 01:51 AM, Jiri Pirko wrote:
> Sun, Apr 30, 2017 at 04:08:15PM CEST, jhs@mojatatu.com wrote:
>> Jiri,
>>
>> With "goto chain X" this will have to be more generalized. Maybe
>> we have 0xAXXXXXXX Where "A" recognizes the extension with
>> current values ACT_JUMP(0x1) and GOTO_CHAIN(maybe 0x2)
>> and the rest "XXXXXXX" is a free floating parameter values
>> which carry the goto count for ACT_JUMP and GOTO_CHAIN chain-id.
>
> That is exactly what this patch is doing.
>
You are right, sorry ;->
cheers,
jamal
^ permalink raw reply
* Re: [PATCH] flower: check unused bits in MPLS fields
From: Simon Horman @ 2017-05-02 11:44 UTC (permalink / raw)
To: Jamal Hadi Salim
Cc: Benjamin LaHaise, netdev, David Miller, Jakub Kicinski,
Jiri Pirko
In-Reply-To: <af1de251-299e-a037-f4b1-21cb22051bf9@mojatatu.com>
On Mon, May 01, 2017 at 09:37:00PM -0400, Jamal Hadi Salim wrote:
> On 17-05-01 09:58 AM, Benjamin LaHaise wrote:
> >Since several of the the netlink attributes used to configure the flower
> >classifier's MPLS TC, BOS and Label fields have additional bits which are
> >unused, check those bits to ensure that they are actually 0 as suggested
> >by Jamal.
> >
> >Signed-off-by: Benjamin LaHaise <benjamin.lahaise@netronome.com>
> >Cc: David Miller <davem@davemloft.net>
> >Cc: Jamal Hadi Salim <jhs@mojatatu.com>
> >Cc: Simon Horman <simon.horman@netronome.com>
> >Cc: Jakub Kicinski <kubakici@wp.pl>
> >Cc: Jiri Pirko <jiri@resnulli.us>
>
> Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
^ permalink raw reply
* RE: [PATCH net v4 00/12] Fix possbile memleaks when fail to register_netdevice
From: Gao Feng @ 2017-05-02 11:10 UTC (permalink / raw)
To: davem, jiri, mareklindner, sw, a, kuznet, jmorris, yoshfuji,
kaber, steffen.klassert, herbert, netdev
In-Reply-To: <cover.1493699451.git.gfree.wind@foxmail.com>
Hi all,
> From: gfree.wind@foxmail.com [mailto:gfree.wind@foxmail.com]
> Sent: Tuesday, May 2, 2017 1:59 PM
> drivers/net/dummy.c | 14 +++++++++++---
> drivers/net/ifb.c | 33 +++++++++++++++++++++++----------
> drivers/net/loopback.c | 15 ++++++++++++++-
> drivers/net/team/team.c | 15 ++++++++++++---
> drivers/net/veth.c | 15 ++++++++++++++-
> net/8021q/vlan_dev.c | 17 +++++++++++++----
> net/batman-adv/soft-interface.c | 18 +++++++++++++++---
> net/ipv4/ip_tunnel.c | 11 ++++++++++-
> net/ipv6/ip6_gre.c | 17 +++++++++++++----
> net/ipv6/ip6_tunnel.c | 11 ++++++++++-
> net/ipv6/ip6_vti.c | 11 ++++++++++-
> net/ipv6/sit.c | 17 +++++++++++++----
> 12 files changed, 158 insertions(+), 36 deletions(-)
>
> --
> v4: Make patches as one sery of patches, per David Miler
> v3: Split one patch to multiple commits, per David Ahern
> v2: Move the free in ndo_uninit when fail to register, per Herbert Xu
> v1: initial version
>
>
Because I sent these patches too fast today, so that my email server blocks
my account today.
The left patches (08~12) would be sent after my account is unlocked.
Maybe tomorrow.
I am very sorry about this, and try to change another mail server next time.
Best Regards
Feng
^ permalink raw reply
* RE: [PATCH net v3] driver: veth: Fix one possbile memleak when fail to register_netdevice
From: Gao Feng @ 2017-05-02 11:03 UTC (permalink / raw)
To: 'Xin Long'
Cc: 'davem', jarod, 'Stephen Hemminger', dsa,
'network dev'
In-Reply-To: <CADvbK_cEHXTSXx5Qa17TfeUXnYiBq8ptR+=5e-fbzPF0r4DNhQ@mail.gmail.com>
> From: Xin Long [mailto:lucien.xin@gmail.com]
> Sent: Tuesday, May 2, 2017 3:56 PM
> On Sat, Apr 29, 2017 at 11:51 AM, <gfree.wind@foxmail.com> wrote:
> > From: Gao Feng <gfree.wind@foxmail.com>
[...]
> > -static void veth_dev_free(struct net_device *dev)
> > +static void veth_destructor_free(struct net_device *dev)
> > {
> > free_percpu(dev->vstats);
> > +}
> not sure why you needed to add this function.
> to use free_percpu() directly may be clearer.
Because both of ndo_uninit and destructor need to perform same free statements.
It is good at maintain the codes with the common function.
>
> > +
> > +static void veth_dev_uninit(struct net_device *dev) {
> call free_percpu() here, no need to check dev->reg_state.
> free_percpu will just return if dev->vstats is NULL.
It would break the original design if don't check the reg_state.
The original logic is that free the resources in the destructor, not in ndo_init.
BTW, because I send multiple patches too fast today, the email server blocks my account.
So I have to reply you with a different email account. Sorry.
Best Regards
Feng
>
[...]
^ permalink raw reply
* [PATCH] ipx: call ipxitf_put() in ioctl error path
From: Dan Carpenter @ 2017-05-02 10:58 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev, security@kernel.org, secalert@redhat.com
In-Reply-To: <143C0AFC63FC204CB0C55BB88F3A8ABB33419ED1@EX02.corp.qihoo.net>
We should call ipxitf_put() if the copy_to_user() fails.
Reported-by: 李强 <liqiang6-s@360.cn>
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 8a9219ff2e77..fa31ef29e3fa 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -1168,11 +1168,10 @@ static int ipxitf_ioctl(unsigned int cmd, void __user *arg)
sipx->sipx_network = ipxif->if_netnum;
memcpy(sipx->sipx_node, ipxif->if_node,
sizeof(sipx->sipx_node));
- rc = -EFAULT;
+ rc = 0;
if (copy_to_user(arg, &ifr, sizeof(ifr)))
- break;
+ rc = -EFAULT;
ipxitf_put(ipxif);
- rc = 0;
break;
}
case SIOCAIPXITFCRT:
^ permalink raw reply related
* RE: [PATCH net v3] driver: veth: Fix one possbile memleak when fail to register_netdevice
From: Gao Feng @ 2017-05-02 10:51 UTC (permalink / raw)
To: 'David Ahern', davem, jarod, lucien.xin, stephen, netdev
In-Reply-To: <92eab1fc-cd70-1561-1f94-665e77725750@cumulusnetworks.com>
> From: David Ahern [mailto:dsa@cumulusnetworks.com]
> Sent: Monday, May 1, 2017 11:08 PM
> On 4/28/17 9:51 PM, gfree.wind@foxmail.com wrote:
> > diff --git a/drivers/net/veth.c b/drivers/net/veth.c index
> > 8c39d6d..418376a 100644
> > --- a/drivers/net/veth.c
> > +++ b/drivers/net/veth.c
> > @@ -224,9 +224,21 @@ static int veth_dev_init(struct net_device *dev)
> > return 0;
> > }
> >
> > -static void veth_dev_free(struct net_device *dev)
> > +static void veth_destructor_free(struct net_device *dev)
>
> _destructor in the name is confusing since veth_dev_free is the
> dev->destructor. Perhaps that should be veth_free_stats.
Because I want to emphasize it should be invoked in the destructor.
What's your opinion ?
[...]
> Functionally, it looks good to me.
>
> Acked-by: David Ahern <dsa@cumulusnetworks.com>
Thanks David.
I have sent the v4 patches with a series according to David's advice.
BTW, because I send multiple patches too fast today, the email server blocks
my account.
So I have to reply you with a different email account. Sorry.
Regards
Feng
^ permalink raw reply
* [PATCH v2 net-next 5/7] net: don't make false software transmit timestamps
From: Miroslav Lichvar @ 2017-05-02 10:11 UTC (permalink / raw)
To: netdev
Cc: Richard Cochran, Willem de Bruijn, Soheil Hassas Yeganeh,
Keller, Jacob E, Denny Page, Jiri Benc
In-Reply-To: <20170502101103.30444-1-mlichvar@redhat.com>
If software timestamping is enabled by the SO_TIMESTAMP(NS) option
when a message without timestamp is already waiting in the queue, the
__sock_recv_timestamp() function will read the current time to make a
timestamp in order to always have something for the application.
However, this applies also to outgoing packets looped back to the error
queue when hardware timestamping is enabled by the SO_TIMESTAMPING
option. A software transmit timestamp made after the actual transmission
is added to messages with hardware timestamps.
Modify the function to save the current time as a software timestamp
only if it's for a received packet (i.e. it's not in the error queue).
CC: Richard Cochran <richardcochran@gmail.com>
CC: Willem de Bruijn <willemb@google.com>
Signed-off-by: Miroslav Lichvar <mlichvar@redhat.com>
---
net/socket.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/net/socket.c b/net/socket.c
index da4d4ab..fe7e5bc 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -692,7 +692,8 @@ static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb)
void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb)
{
- int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP);
+ int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP) &&
+ !skb_is_err_queue(skb);
struct scm_timestamping tss;
int empty = 1;
struct skb_shared_hwtstamps *shhwtstamps =
--
2.9.3
^ permalink raw reply related
* [PATCH v2 net-next 7/7] net: ethernet: update drivers to make both SW and HW TX timestamps
From: Miroslav Lichvar @ 2017-05-02 10:11 UTC (permalink / raw)
To: netdev
Cc: Richard Cochran, Willem de Bruijn, Soheil Hassas Yeganeh,
Keller, Jacob E, Denny Page, Jiri Benc
In-Reply-To: <20170502101103.30444-1-mlichvar@redhat.com>
Some drivers were calling the skb_tx_timestamp() function only when
a hardware timestamp was not requested. Now that applications can use
the SOF_TIMESTAMPING_OPT_TX_SWHW option to request both software and
hardware timestamps, the drivers need to be modified to unconditionally
call skb_tx_timestamp().
CC: Richard Cochran <richardcochran@gmail.com>
CC: Willem de Bruijn <willemb@google.com>
Signed-off-by: Miroslav Lichvar <mlichvar@redhat.com>
---
drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 3 +--
drivers/net/ethernet/intel/e1000e/netdev.c | 4 ++--
drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c | 3 +--
drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 6 ++----
4 files changed, 6 insertions(+), 10 deletions(-)
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 89b21d7..5a2ad9c 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -1391,8 +1391,7 @@ static void xgbe_prep_tx_tstamp(struct xgbe_prv_data *pdata,
spin_unlock_irqrestore(&pdata->tstamp_lock, flags);
}
- if (!XGMAC_GET_BITS(packet->attributes, TX_PACKET_ATTRIBUTES, PTP))
- skb_tx_timestamp(skb);
+ skb_tx_timestamp(skb);
}
static void xgbe_prep_vlan(struct sk_buff *skb, struct xgbe_packet_data *packet)
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 0ff9295..6ed3bc4 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -5868,10 +5868,10 @@ static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb,
adapter->tx_hwtstamp_skb = skb_get(skb);
adapter->tx_hwtstamp_start = jiffies;
schedule_work(&adapter->tx_hwtstamp_work);
- } else {
- skb_tx_timestamp(skb);
}
+ skb_tx_timestamp(skb);
+
netdev_sent_queue(netdev, skb->len);
e1000_tx_queue(tx_ring, tx_flags, count);
/* Make sure there is space in the ring for the next send. */
diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c
index d54490d..50c182c 100644
--- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c
+++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c
@@ -1418,8 +1418,7 @@ static netdev_tx_t sxgbe_xmit(struct sk_buff *skb, struct net_device *dev)
priv->hw->desc->tx_enable_tstamp(first_desc);
}
- if (!tqueue->hwts_tx_en)
- skb_tx_timestamp(skb);
+ skb_tx_timestamp(skb);
priv->hw->dma->enable_dma_transmission(priv->ioaddr, txq_index);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 3115700..7f857ee 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2880,8 +2880,7 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
priv->xstats.tx_set_ic_bit++;
}
- if (!priv->hwts_tx_en)
- skb_tx_timestamp(skb);
+ skb_tx_timestamp(skb);
if (unlikely((skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) &&
priv->hwts_tx_en)) {
@@ -3084,8 +3083,7 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
priv->xstats.tx_set_ic_bit++;
}
- if (!priv->hwts_tx_en)
- skb_tx_timestamp(skb);
+ skb_tx_timestamp(skb);
/* Ready to fill the first descriptor and set the OWN bit w/o any
* problems because all the descriptors are actually ready to be
--
2.9.3
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox