* Re: [PATCH bpf-next v5 03/11] xsk: add support to allow unaligned chunk placement
From: Maxim Mikityanskiy @ 2019-08-27 7:36 UTC (permalink / raw)
To: Kevin Laatz
Cc: netdev@vger.kernel.org, ast@kernel.org, daniel@iogearbox.net,
bjorn.topel@intel.com, magnus.karlsson@intel.com,
jakub.kicinski@netronome.com, jonathan.lemon@gmail.com,
Saeed Mahameed, stephen@networkplumber.org,
bruce.richardson@intel.com, ciara.loftus@intel.com,
bpf@vger.kernel.org, intel-wired-lan@lists.osuosl.org
In-Reply-To: <20190822014427.49800-4-kevin.laatz@intel.com>
On 2019-08-22 04:44, Kevin Laatz wrote:
> Currently, addresses are chunk size aligned. This means, we are very
> restricted in terms of where we can place chunk within the umem. For
> example, if we have a chunk size of 2k, then our chunks can only be placed
> at 0,2k,4k,6k,8k... and so on (ie. every 2k starting from 0).
>
> This patch introduces the ability to use unaligned chunks. With these
> changes, we are no longer bound to having to place chunks at a 2k (or
> whatever your chunk size is) interval. Since we are no longer dealing with
> aligned chunks, they can now cross page boundaries. Checks for page
> contiguity have been added in order to keep track of which pages are
> followed by a physically contiguous page.
>
> Signed-off-by: Kevin Laatz <kevin.laatz@intel.com>
> Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
> Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
>
> ---
> v2:
> - Add checks for the flags coming from userspace
> - Fix how we get chunk_size in xsk_diag.c
> - Add defines for masking the new descriptor format
> - Modified the rx functions to use new descriptor format
> - Modified the tx functions to use new descriptor format
>
> v3:
> - Add helper function to do address/offset masking/addition
>
> v4:
> - fixed page_start calculation in __xsk_rcv_memcpy().
> - move offset handling to the xdp_umem_get_* functions
> - modified the len field in xdp_umem_reg struct. We now use 16 bits from
> this for the flags field.
> - removed next_pg_contig field from xdp_umem_page struct. Using low 12
> bits of addr to store flags instead.
> - other minor changes based on review comments
>
> v5:
> - Added accessors for getting addr and offset
> - Added helper function to add offset to addr
> - Fixed offset handling in xsk_rcv
> - Removed bitfields from xdp_umem_reg
> - Added struct size checking for xdp_umem_reg in xsk_setsockopt to handle
> different versions of the struct.
> - fix conflicts after 'bpf-af-xdp-wakeup' was merged.
> ---
> include/net/xdp_sock.h | 75 +++++++++++++++++++++++++++--
> include/uapi/linux/if_xdp.h | 9 ++++
> net/xdp/xdp_umem.c | 19 ++++++--
> net/xdp/xsk.c | 96 +++++++++++++++++++++++++++++--------
> net/xdp/xsk_diag.c | 2 +-
> net/xdp/xsk_queue.h | 68 ++++++++++++++++++++++----
> 6 files changed, 232 insertions(+), 37 deletions(-)
>
> diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> index f023b9940d64..c9398ce7960f 100644
> --- a/include/net/xdp_sock.h
> +++ b/include/net/xdp_sock.h
> @@ -16,6 +16,13 @@
> struct net_device;
> struct xsk_queue;
>
> +/* Masks for xdp_umem_page flags.
> + * The low 12-bits of the addr will be 0 since this is the page address, so we
> + * can use them for flags.
> + */
> +#define XSK_NEXT_PG_CONTIG_SHIFT 0
> +#define XSK_NEXT_PG_CONTIG_MASK (1ULL << XSK_NEXT_PG_CONTIG_SHIFT)
> +
> struct xdp_umem_page {
> void *addr;
> dma_addr_t dma;
> @@ -27,8 +34,12 @@ struct xdp_umem_fq_reuse {
> u64 handles[];
> };
>
> -/* Flags for the umem flags field. */
> -#define XDP_UMEM_USES_NEED_WAKEUP (1 << 0)
> +/* Flags for the umem flags field.
> + *
> + * The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public
> + * flags. See inlude/uapi/include/linux/if_xdp.h.
> + */
> +#define XDP_UMEM_USES_NEED_WAKEUP (1 << 1)
>
> struct xdp_umem {
> struct xsk_queue *fq;
> @@ -124,14 +135,36 @@ void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
> int xsk_map_inc(struct xsk_map *map);
> void xsk_map_put(struct xsk_map *map);
>
> +static inline u64 xsk_umem_extract_addr(u64 addr)
> +{
> + return addr & XSK_UNALIGNED_BUF_ADDR_MASK;
> +}
> +
> +static inline u64 xsk_umem_extract_offset(u64 addr)
> +{
> + return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;
> +}
> +
> +static inline u64 xsk_umem_add_offset_to_addr(u64 addr)
> +{
> + return xsk_umem_extract_addr(addr) + xsk_umem_extract_offset(addr);
> +}
> +
> static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
> {
> - return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1));
> + unsigned long page_addr;
> +
> + addr = xsk_umem_add_offset_to_addr(addr);
> + page_addr = (unsigned long)umem->pages[addr >> PAGE_SHIFT].addr;
> +
> + return (char *)(page_addr & PAGE_MASK) + (addr & ~PAGE_MASK);
> }
>
> static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
> {
> - return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1));
> + addr = xsk_umem_add_offset_to_addr(addr);
> +
> + return umem->pages[addr >> PAGE_SHIFT].dma + (addr & ~PAGE_MASK);
> }
>
> /* Reuse-queue aware version of FILL queue helpers */
> @@ -172,6 +205,19 @@ static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr)
>
> rq->handles[rq->length++] = addr;
> }
> +
> +/* Handle the offset appropriately depending on aligned or unaligned mode.
> + * For unaligned mode, we store the offset in the upper 16-bits of the address.
> + * For aligned mode, we simply add the offset to the address.
> + */
> +static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address,
> + u64 offset)
> +{
> + if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG)
> + return address + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
> + else
> + return address + offset;
> +}
> #else
> static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
> {
> @@ -241,6 +287,21 @@ static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
> return NULL;
> }
>
> +static inline u64 xsk_umem_extract_addr(u64 addr)
> +{
> + return 0;
> +}
> +
> +static inline u64 xsk_umem_extract_offset(u64 addr)
> +{
> + return 0;
> +}
> +
> +static inline u64 xsk_umem_add_offset_to_addr(u64 addr)
> +{
> + return 0;
> +}
> +
> static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
> {
> return NULL;
> @@ -290,6 +351,12 @@ static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
> return false;
> }
>
> +static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle,
> + u64 offset)
> +{
> + return 0;
> +}
> +
> #endif /* CONFIG_XDP_SOCKETS */
>
> #endif /* _LINUX_XDP_SOCK_H */
> diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
> index 62b80d57b72a..be328c59389d 100644
> --- a/include/uapi/linux/if_xdp.h
> +++ b/include/uapi/linux/if_xdp.h
> @@ -26,6 +26,9 @@
> */
> #define XDP_USE_NEED_WAKEUP (1 << 3)
>
> +/* Flags for xsk_umem_config flags */
> +#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
> +
> struct sockaddr_xdp {
> __u16 sxdp_family;
> __u16 sxdp_flags;
> @@ -66,6 +69,7 @@ struct xdp_umem_reg {
> __u64 len; /* Length of packet data area */
> __u32 chunk_size;
> __u32 headroom;
> + __u32 flags;
> };
>
> struct xdp_statistics {
> @@ -87,6 +91,11 @@ struct xdp_options {
> #define XDP_UMEM_PGOFF_FILL_RING 0x100000000ULL
> #define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000ULL
>
> +/* Masks for unaligned chunks mode */
> +#define XSK_UNALIGNED_BUF_OFFSET_SHIFT 48
> +#define XSK_UNALIGNED_BUF_ADDR_MASK \
> + ((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1)
> +
> /* Rx/Tx descriptor */
> struct xdp_desc {
> __u64 addr;
> diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
> index 2d65779282a1..e997b263a0dd 100644
> --- a/net/xdp/xdp_umem.c
> +++ b/net/xdp/xdp_umem.c
> @@ -340,6 +340,7 @@ static int xdp_umem_account_pages(struct xdp_umem *umem)
>
> static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
> {
> + bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
> u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
> unsigned int chunks, chunks_per_page;
> u64 addr = mr->addr, size = mr->len;
> @@ -355,7 +356,11 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
> return -EINVAL;
> }
>
> - if (!is_power_of_2(chunk_size))
> + if (mr->flags & ~(XDP_UMEM_UNALIGNED_CHUNK_FLAG |
> + XDP_UMEM_USES_NEED_WAKEUP))
> + return -EINVAL;
> +
> + if (!unaligned_chunks && !is_power_of_2(chunk_size))
> return -EINVAL;
>
> if (!PAGE_ALIGNED(addr)) {
> @@ -372,9 +377,11 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
> if (chunks == 0)
> return -EINVAL;
>
> - chunks_per_page = PAGE_SIZE / chunk_size;
> - if (chunks < chunks_per_page || chunks % chunks_per_page)
> - return -EINVAL;
> + if (!unaligned_chunks) {
> + chunks_per_page = PAGE_SIZE / chunk_size;
> + if (chunks < chunks_per_page || chunks % chunks_per_page)
> + return -EINVAL;
> + }
>
> headroom = ALIGN(headroom, 64);
>
> @@ -383,13 +390,15 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
> return -EINVAL;
>
> umem->address = (unsigned long)addr;
> - umem->chunk_mask = ~((u64)chunk_size - 1);
> + umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK
> + : ~((u64)chunk_size - 1);
> umem->size = size;
> umem->headroom = headroom;
> umem->chunk_size_nohr = chunk_size - headroom;
> umem->npgs = size / PAGE_SIZE;
> umem->pgs = NULL;
> umem->user = NULL;
> + umem->flags = mr->flags;
> INIT_LIST_HEAD(&umem->xsk_list);
> spin_lock_init(&umem->xsk_list_lock);
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index ee4428a892fa..907e5f12338f 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -45,7 +45,7 @@ EXPORT_SYMBOL(xsk_umem_has_addrs);
>
> u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
> {
> - return xskq_peek_addr(umem->fq, addr);
> + return xskq_peek_addr(umem->fq, addr, umem);
> }
> EXPORT_SYMBOL(xsk_umem_peek_addr);
>
> @@ -115,21 +115,43 @@ bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
> }
> EXPORT_SYMBOL(xsk_umem_uses_need_wakeup);
>
> +/* If a buffer crosses a page boundary, we need to do 2 memcpy's, one for
> + * each page. This is only required in copy mode.
> + */
> +static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf,
> + u32 len, u32 metalen)
> +{
> + void *to_buf = xdp_umem_get_data(umem, addr);
> +
> + addr = xsk_umem_add_offset_to_addr(addr);
> + if (xskq_crosses_non_contig_pg(umem, addr, len + metalen)) {
> + void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr;
> + u64 page_start = addr & ~(PAGE_SIZE - 1);
> + u64 first_len = PAGE_SIZE - (addr - page_start);
> +
> + memcpy(to_buf, from_buf, first_len + metalen);
> + memcpy(next_pg_addr, from_buf + first_len, len - first_len);
> +
> + return;
> + }
> +
> + memcpy(to_buf, from_buf, len + metalen);
> +}
> +
> static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
> {
> - void *to_buf, *from_buf;
> + u64 offset = xs->umem->headroom;
> + u64 addr, memcpy_addr;
> + void *from_buf;
> u32 metalen;
> - u64 addr;
> int err;
>
> - if (!xskq_peek_addr(xs->umem->fq, &addr) ||
> + if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
> len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
> xs->rx_dropped++;
> return -ENOSPC;
> }
>
> - addr += xs->umem->headroom;
> -
> if (unlikely(xdp_data_meta_unsupported(xdp))) {
> from_buf = xdp->data;
> metalen = 0;
> @@ -138,9 +160,11 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
> metalen = xdp->data - xdp->data_meta;
> }
>
> - to_buf = xdp_umem_get_data(xs->umem, addr);
> - memcpy(to_buf, from_buf, len + metalen);
> - addr += metalen;
> + memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
> + __xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen);
> +
> + offset += metalen;
> + addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
> err = xskq_produce_batch_desc(xs->rx, addr, len);
> if (!err) {
> xskq_discard_addr(xs->umem->fq);
> @@ -185,6 +209,7 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
> {
> u32 metalen = xdp->data - xdp->data_meta;
> u32 len = xdp->data_end - xdp->data;
> + u64 offset = xs->umem->headroom;
> void *buffer;
> u64 addr;
> int err;
> @@ -196,17 +221,17 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
> goto out_unlock;
> }
>
> - if (!xskq_peek_addr(xs->umem->fq, &addr) ||
> + if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
> len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
> err = -ENOSPC;
> goto out_drop;
> }
>
> - addr += xs->umem->headroom;
> -
> - buffer = xdp_umem_get_data(xs->umem, addr);
> + buffer = xdp_umem_get_data(xs->umem, addr + offset);
> memcpy(buffer, xdp->data_meta, len + metalen);
> - addr += metalen;
> + offset += metalen;
> +
> + addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
> err = xskq_produce_batch_desc(xs->rx, addr, len);
> if (err)
> goto out_drop;
> @@ -250,7 +275,7 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
>
> rcu_read_lock();
> list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
> - if (!xskq_peek_desc(xs->tx, desc))
> + if (!xskq_peek_desc(xs->tx, desc, umem))
> continue;
>
> if (xskq_produce_addr_lazy(umem->cq, desc->addr))
> @@ -304,7 +329,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
> if (xs->queue_id >= xs->dev->real_num_tx_queues)
> goto out;
>
> - while (xskq_peek_desc(xs->tx, &desc)) {
> + while (xskq_peek_desc(xs->tx, &desc, xs->umem)) {
> char *buffer;
> u64 addr;
> u32 len;
> @@ -333,7 +358,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
> skb->dev = xs->dev;
> skb->priority = sk->sk_priority;
> skb->mark = sk->sk_mark;
> - skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
> + skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
> skb->destructor = xsk_destruct_skb;
>
> err = dev_direct_xmit(skb, xs->queue_id);
> @@ -526,6 +551,24 @@ static struct socket *xsk_lookup_xsk_from_fd(int fd)
> return sock;
> }
>
> +/* Check if umem pages are contiguous.
> + * If zero-copy mode, use the DMA address to do the page contiguity check
> + * For all other modes we use addr (kernel virtual address)
> + * Store the result in the low bits of addr.
> + */
> +static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags)
> +{
> + struct xdp_umem_page *pgs = umem->pages;
> + int i, is_contig;
> +
> + for (i = 0; i < umem->npgs - 1; i++) {
> + is_contig = (flags & XDP_ZEROCOPY) ?
> + (pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) :
> + (pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr);
> + pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT;
> + }
> +}
> +
> static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
> {
> struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
> @@ -616,6 +659,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
> err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
> if (err)
> goto out_unlock;
> +
> + xsk_check_page_contiguity(xs->umem, flags);
> }
>
> xs->dev = dev;
> @@ -636,6 +681,13 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
> return err;
> }
>
> +struct xdp_umem_reg_v1 {
> + __u64 addr; /* Start of packet data area */
> + __u64 len; /* Length of packet data area */
> + __u32 chunk_size;
> + __u32 headroom;
> +};
> +
> static int xsk_setsockopt(struct socket *sock, int level, int optname,
> char __user *optval, unsigned int optlen)
> {
> @@ -673,10 +725,16 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
> }
> case XDP_UMEM_REG:
> {
> - struct xdp_umem_reg mr;
> + size_t mr_size = sizeof(struct xdp_umem_reg);
> + struct xdp_umem_reg mr = {};
> struct xdp_umem *umem;
>
> - if (copy_from_user(&mr, optval, sizeof(mr)))
> + if (optlen < sizeof(struct xdp_umem_reg_v1))
> + return -EINVAL;
> + else if (optlen < sizeof(mr))
> + mr_size = sizeof(struct xdp_umem_reg_v1);
> +
> + if (copy_from_user(&mr, optval, mr_size))
> return -EFAULT;
>
> mutex_lock(&xs->mutex);
> diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c
> index d5e06c8e0cbf..9986a759fe06 100644
> --- a/net/xdp/xsk_diag.c
> +++ b/net/xdp/xsk_diag.c
> @@ -56,7 +56,7 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb)
> du.id = umem->id;
> du.size = umem->size;
> du.num_pages = umem->npgs;
> - du.chunk_size = (__u32)(~umem->chunk_mask + 1);
> + du.chunk_size = umem->chunk_size_nohr + umem->headroom;
> du.headroom = umem->headroom;
> du.ifindex = umem->dev ? umem->dev->ifindex : 0;
> du.queue_id = umem->queue_id;
> diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
> index dd9e985c2461..6c67c9d0294f 100644
> --- a/net/xdp/xsk_queue.h
> +++ b/net/xdp/xsk_queue.h
> @@ -134,6 +134,17 @@ static inline bool xskq_has_addrs(struct xsk_queue *q, u32 cnt)
>
> /* UMEM queue */
>
> +static inline bool xskq_crosses_non_contig_pg(struct xdp_umem *umem, u64 addr,
> + u64 length)
> +{
> + bool cross_pg = (addr & (PAGE_SIZE - 1)) + length > PAGE_SIZE;
> + bool next_pg_contig =
> + (unsigned long)umem->pages[(addr >> PAGE_SHIFT)].addr &
> + XSK_NEXT_PG_CONTIG_MASK;
> +
> + return cross_pg && !next_pg_contig;
> +}
> +
> static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
> {
> if (addr >= q->size) {
> @@ -144,23 +155,49 @@ static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
> return true;
> }
>
> -static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr)
> +static inline bool xskq_is_valid_addr_unaligned(struct xsk_queue *q, u64 addr,
> + u64 length,
> + struct xdp_umem *umem)
> +{
> + addr = xsk_umem_add_offset_to_addr(addr);
> + if (addr >= q->size ||
I know I already asked about it, but I feel we need to clarify things
here further.
The `addr >= q->size` check is good, it checks that the address doesn't
overflow the UMEM. However, the new encoding of UMEM handles consists of
two parts: base address of the frame (low bits) and offset (high bits),
that are added together by xsk_umem_add_offset_to_addr. It's possible to
craft an address that has too big base address of the frame (all
0xff...f), some non-zero offset, so after adding them together we pass
`addr < q->size`, but the base address exceeds the UMEM. In my opinion,
we should not allow such addresses, because the base address of the
frame is out of bounds, so we need one more check here. What do you
think about this point?
> + xskq_crosses_non_contig_pg(umem, addr, length)) {
> + q->invalid_descs++;
> + return false;
> + }
> +
> + return true;
> +}
> +
> +static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr,
> + struct xdp_umem *umem)
> {
> while (q->cons_tail != q->cons_head) {
> struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
> unsigned int idx = q->cons_tail & q->ring_mask;
>
> *addr = READ_ONCE(ring->desc[idx]) & q->chunk_mask;
> +
> + if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) {
> + if (xskq_is_valid_addr_unaligned(q, *addr,
> + umem->chunk_size_nohr,
> + umem))
> + return addr;
> + goto out;
> + }
> +
> if (xskq_is_valid_addr(q, *addr))
> return addr;
>
> +out:
> q->cons_tail++;
> }
>
> return NULL;
> }
>
> -static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr)
> +static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr,
> + struct xdp_umem *umem)
> {
> if (q->cons_tail == q->cons_head) {
> smp_mb(); /* D, matches A */
> @@ -171,7 +208,7 @@ static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr)
> smp_rmb();
> }
>
> - return xskq_validate_addr(q, addr);
> + return xskq_validate_addr(q, addr, umem);
> }
>
> static inline void xskq_discard_addr(struct xsk_queue *q)
> @@ -230,8 +267,21 @@ static inline int xskq_reserve_addr(struct xsk_queue *q)
>
> /* Rx/Tx queue */
>
> -static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d)
> +static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d,
> + struct xdp_umem *umem)
> {
> + if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) {
> + if (!xskq_is_valid_addr_unaligned(q, d->addr, d->len, umem))
> + return false;
> +
> + if (d->len > umem->chunk_size_nohr || d->options) {
> + q->invalid_descs++;
> + return false;
> + }
> +
> + return true;
> + }
> +
> if (!xskq_is_valid_addr(q, d->addr))
> return false;
>
> @@ -245,14 +295,15 @@ static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d)
> }
>
> static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q,
> - struct xdp_desc *desc)
> + struct xdp_desc *desc,
> + struct xdp_umem *umem)
> {
> while (q->cons_tail != q->cons_head) {
> struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
> unsigned int idx = q->cons_tail & q->ring_mask;
>
> *desc = READ_ONCE(ring->desc[idx]);
> - if (xskq_is_valid_desc(q, desc))
> + if (xskq_is_valid_desc(q, desc, umem))
> return desc;
>
> q->cons_tail++;
> @@ -262,7 +313,8 @@ static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q,
> }
>
> static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q,
> - struct xdp_desc *desc)
> + struct xdp_desc *desc,
> + struct xdp_umem *umem)
> {
> if (q->cons_tail == q->cons_head) {
> smp_mb(); /* D, matches A */
> @@ -273,7 +325,7 @@ static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q,
> smp_rmb(); /* C, matches B */
> }
>
> - return xskq_validate_desc(q, desc);
> + return xskq_validate_desc(q, desc, umem);
> }
>
> static inline void xskq_discard_desc(struct xsk_queue *q)
>
^ permalink raw reply
* Re: [PATCH -next] net: mlx5: Kconfig: Fix MLX5_CORE_EN dependencies
From: walter harms @ 2019-08-27 7:24 UTC (permalink / raw)
To: Mao Wenan
Cc: saeedm, leon, davem, netdev, linux-rdma, linux-kernel,
kernel-janitors
In-Reply-To: <20190827031251.98881-1-maowenan@huawei.com>
Am 27.08.2019 05:12, schrieb Mao Wenan:
> When MLX5_CORE_EN=y and PCI_HYPERV_INTERFACE is not set, below errors are found:
> drivers/net/ethernet/mellanox/mlx5/core/en_main.o: In function `mlx5e_nic_enable':
> en_main.c:(.text+0xb649): undefined reference to `mlx5e_hv_vhca_stats_create'
> drivers/net/ethernet/mellanox/mlx5/core/en_main.o: In function `mlx5e_nic_disable':
> en_main.c:(.text+0xb8c4): undefined reference to `mlx5e_hv_vhca_stats_destroy'
>
> This because CONFIG_PCI_HYPERV_INTERFACE is newly introduced by 'commit 348dd93e40c1
> ("PCI: hv: Add a Hyper-V PCI interface driver for software backchannel interface"),
> Fix this by making MLX5_CORE_EN imply PCI_HYPERV_INTERFACE.
>
> Fixes: cef35af34d6d ("net/mlx5e: Add mlx5e HV VHCA stats agent")
> Signed-off-by: Mao Wenan <maowenan@huawei.com>
> ---
> drivers/net/ethernet/mellanox/mlx5/core/Kconfig | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
> index 37fef8c..a6a70ce 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
> @@ -35,6 +35,7 @@ config MLX5_CORE_EN
> depends on IPV6=y || IPV6=n || MLX5_CORE=m
OT but ...
is that IPV6 needed at all ? can there be something else that yes or no ?
re,
wh
> select PAGE_POOL
> select DIMLIB
> + imply PCI_HYPERV_INTERFACE
> default n
> ---help---
> Ethernet support in Mellanox Technologies ConnectX-4 NIC.
^ permalink raw reply
* [PATCH net-next] ipv6: shrink struct ipv6_mc_socklist
From: Eric Dumazet @ 2019-08-27 7:08 UTC (permalink / raw)
To: David S . Miller; +Cc: netdev, Eric Dumazet, Eric Dumazet
Remove two holes on 64bit arches, to bring the size
to one cache line exactly.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/net/if_inet6.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index 50037913c9b191cb3793e3f072104c10c4257ff2..a01981d7108f96075b6939f4a78f14e7afe93a4e 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -89,9 +89,9 @@ struct ip6_sf_socklist {
struct ipv6_mc_socklist {
struct in6_addr addr;
int ifindex;
+ unsigned int sfmode; /* MCAST_{INCLUDE,EXCLUDE} */
struct ipv6_mc_socklist __rcu *next;
rwlock_t sflock;
- unsigned int sfmode; /* MCAST_{INCLUDE,EXCLUDE} */
struct ip6_sf_socklist *sflist;
struct rcu_head rcu;
};
--
2.23.0.187.g17f5b7556c-goog
^ permalink raw reply related
* Re: [patch net-next rfc 3/7] net: rtnetlink: add commands to add and delete alternative ifnames
From: Jiri Pirko @ 2019-08-27 7:08 UTC (permalink / raw)
To: David Miller
Cc: jakub.kicinski, dsahern, roopa, netdev, sthemmin, dcbw, mkubecek,
andrew, parav, saeedm, mlxsw
In-Reply-To: <20190826.151819.804077961408964282.davem@davemloft.net>
Tue, Aug 27, 2019 at 12:18:19AM CEST, davem@davemloft.net wrote:
>From: Jakub Kicinski <jakub.kicinski@netronome.com>
>Date: Mon, 26 Aug 2019 15:15:52 -0700
>
>> Weren't there multiple problems with the size of the RTM_NEWLINK
>> notification already? Adding multiple sizeable strings to it won't
>> help there either :S
>
>Indeed.
>
>We even had situations where we had to make the information provided
>in a newlink dump opt-in when we added VF info because the buffers
>glibc was using at the time were too small and this broke so much
>stuff.
>
>I honestly think that the size of link dumps are out of hand as-is.
Okay, so if I understand correctly, on top of separate commands for
add/del of alternative names, you suggest also get/dump to be separate
command and don't fill this up in existing newling/getlink command.
So we'll have:
CMD to add:
RTM_NEWALTIFNAME = 108
#define RTM_NEWALTIFNAME RTM_NEWALTIFNAME
Example msg (user->kernel):
IFLA_IFNAME eth0
IFLA_ALT_IFNAME_MOD somereallyreallylongname
Example msg (user->kernel):
IFLA_ALT_IFNAME somereallyreallylongname
IFLA_ALT_IFNAME_MOD someotherreallyreallylongname
CMD to delete:
RTM_DELALTIFNAME,
#define RTM_DELALTIFNAME RTM_DELALTIFNAME
Example msg (user->kernel):
IFLA_IFNAME eth0
IFLA_ALT_IFNAME_MOD somereallyreallylongname
CMD to get/dump:
RTM_GETALTIFNAME,
#define RTM_GETALTIFNAME RTM_GETALTIFNAME
Example msg (kernel->user):
hdr (with ifindex)
IFLA_ALT_IFNAME_LIST (nest)
IFLA_ALT_IFNAME somereallyreallylongname
IFLA_ALT_IFNAME someotherreallyreallylongname
Correct?
^ permalink raw reply
* Re: [Xen-devel] Question on xen-netfront code to fix a potential ring buffer corruption
From: Juergen Gross @ 2019-08-27 7:04 UTC (permalink / raw)
To: Dongli Zhang, xen-devel; +Cc: Joe Jin, netdev
In-Reply-To: <9284025e-1066-387e-a52f-c46d4c66d2d3@oracle.com>
On 27.08.19 08:43, Dongli Zhang wrote:
> Hi Juergen,
>
> On 8/27/19 2:13 PM, Juergen Gross wrote:
>> On 18.08.19 10:31, Dongli Zhang wrote:
>>> Hi,
>>>
>>> Would you please help confirm why the condition at line 908 is ">="?
>>>
>>> In my opinion, we would only hit "skb_shinfo(skb)->nr_frag == MAX_SKB_FRAGS" at
>>> line 908.
>>>
>>> 890 static RING_IDX xennet_fill_frags(struct netfront_queue *queue,
>>> 891 struct sk_buff *skb,
>>> 892 struct sk_buff_head *list)
>>> 893 {
>>> 894 RING_IDX cons = queue->rx.rsp_cons;
>>> 895 struct sk_buff *nskb;
>>> 896
>>> 897 while ((nskb = __skb_dequeue(list))) {
>>> 898 struct xen_netif_rx_response *rx =
>>> 899 RING_GET_RESPONSE(&queue->rx, ++cons);
>>> 900 skb_frag_t *nfrag = &skb_shinfo(nskb)->frags[0];
>>> 901
>>> 902 if (skb_shinfo(skb)->nr_frags == MAX_SKB_FRAGS) {
>>> 903 unsigned int pull_to = NETFRONT_SKB_CB(skb)->pull_to;
>>> 904
>>> 905 BUG_ON(pull_to < skb_headlen(skb));
>>> 906 __pskb_pull_tail(skb, pull_to - skb_headlen(skb));
>>> 907 }
>>> 908 if (unlikely(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)) {
>>> 909 queue->rx.rsp_cons = ++cons;
>>> 910 kfree_skb(nskb);
>>> 911 return ~0U;
>>> 912 }
>>> 913
>>> 914 skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
>>> 915 skb_frag_page(nfrag),
>>> 916 rx->offset, rx->status, PAGE_SIZE);
>>> 917
>>> 918 skb_shinfo(nskb)->nr_frags = 0;
>>> 919 kfree_skb(nskb);
>>> 920 }
>>> 921
>>> 922 return cons;
>>> 923 }
>>>
>>>
>>> The reason that I ask about this is because I am considering below patch to
>>> avoid a potential xen-netfront ring buffer corruption.
>>>
>>> diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
>>> index 8d33970..48a2162 100644
>>> --- a/drivers/net/xen-netfront.c
>>> +++ b/drivers/net/xen-netfront.c
>>> @@ -906,7 +906,7 @@ static RING_IDX xennet_fill_frags(struct netfront_queue
>>> *queue,
>>> __pskb_pull_tail(skb, pull_to - skb_headlen(skb));
>>> }
>>> if (unlikely(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)) {
>>> - queue->rx.rsp_cons = ++cons;
>>> + queue->rx.rsp_cons = cons + skb_queue_len(list) + 1;
>>> kfree_skb(nskb);
>>> return ~0U;
>>> }
>>>
>>>
>>> If there is skb left in list when we return ~0U, queue->rx.rsp_cons may be set
>>> incorrectly.
>>
>> Sa basically you want to consume the responses for all outstanding skbs
>> in the list?
>>
>
> I think there would be bug if there is skb left in the list.
>
> This is what is implanted in xennet_poll() when there is error of processing
> extra info like below. As at line 1034, if there is error, all outstanding skb
> are consumed.
>
> 985 static int xennet_poll(struct napi_struct *napi, int budget)
> ... ...
> 1028 if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
> 1029 struct xen_netif_extra_info *gso;
> 1030 gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
> 1031
> 1032 if (unlikely(xennet_set_skb_gso(skb, gso))) {
> 1033 __skb_queue_head(&tmpq, skb);
> 1034 queue->rx.rsp_cons += skb_queue_len(&tmpq);
> 1035 goto err;
> 1036 }
> 1037 }
>
> The reason we need to consume all outstanding skb is because
> xennet_get_responses() would reset both queue->rx_skbs[i] and
> queue->grant_rx_ref[i] to NULL before enqueue all outstanding skb to the list
> (e.g., &tmpq), by xennet_get_rx_skb() and xennet_get_rx_ref().
>
> If we do not consume all of them, we would hit NULL in queue->rx_skbs[i] in next
> iteration of while loop in xennet_poll().
>
> That is, if we do not consume all outstanding skb, the queue->rx.rsp_cons may
> refer to a response whose queue->rx_skbs[i] and queue->grant_rx_ref[i] are
> already reset to NULL.
Thanks for confirming. I just wanted to make sure I understand your
patch correctly. :-)
Juergen
^ permalink raw reply
* Re: [PATCH net] xfrm: remove the unnecessary .net_exit for xfrmi
From: Steffen Klassert @ 2019-08-27 7:01 UTC (permalink / raw)
To: Xin Long; +Cc: network dev, davem
In-Reply-To: <120f5509a5c9292b437041e8a4193653adb9a019.1566807951.git.lucien.xin@gmail.com>
On Mon, Aug 26, 2019 at 04:25:51PM +0800, Xin Long wrote:
> The xfrm_if(s) on each netns can be deleted when its xfrmi dev is
> deleted. xfrmi dev's removal can happen when:
>
> 1. Its phydev is being deleted and NETDEV_UNREGISTER event is
> processed in xfrmi_event() from my last patch.
> 2. netns is being removed and all xfrmi devs will be deleted.
> 3. rtnl_link_unregister(&xfrmi_link_ops) in xfrmi_fini() when
> xfrm_interface.ko is being unloaded.
>
> So there's no need to use xfrmi_exit_net() to clean any xfrm_if up.
>
> Signed-off-by: Xin Long <lucien.xin@gmail.com>
We already have a bunch of xfrm interface fixes in the ipsec tree.
Please base this patch onto the ipsec tree and adjust if needed.
Thanks!
^ permalink raw reply
* Re: [PATCH net] xfrm: add NETDEV_UNREGISTER event process for xfrmi
From: Steffen Klassert @ 2019-08-27 6:58 UTC (permalink / raw)
To: Xin Long; +Cc: network dev, davem
In-Reply-To: <fce85b872c03cee379cb30ae46100ff42bea8e0d.1566807905.git.lucien.xin@gmail.com>
On Mon, Aug 26, 2019 at 04:25:05PM +0800, Xin Long wrote:
> When creating a xfrmi dev, it always holds the phydev. The xfrmi dev should
> be deleted when the phydev is being unregistered, so that the phydev can be
> put on time. Otherwise the phydev's deleting will get stuck:
>
> # ip link add dummy10 type dummy
> # ip link add xfrmi10 type xfrm dev dummy10
> # ip link del dummy10
>
> The last command blocks and dmesg shows:
>
> unregister_netdevice: waiting for dummy10 to become free. Usage count = 1
>
> This patch fixes it by adding NETDEV_UNREGISTER event process for xfrmi.
There is already a patch in the ipsec tree that fixes this issue
in a different way. Please base xfrm patches on the ipsec/ipsec-next
tree to avoid double work.
Thanks anyway!
^ permalink raw reply
* [PATCH v2] ath10k: Adjust skb length in ath10k_sdio_mbox_rx_packet
From: Nicolas Boichat @ 2019-08-27 6:58 UTC (permalink / raw)
To: Kalle Valo
Cc: David S . Miller, ath10k, linux-wireless, netdev, linux-kernel,
wgong, Niklas Cassel, Alagu Sankar, briannorris, tientzu
When the FW bundles multiple packets, pkt->act_len may be incorrect
as it refers to the first packet only (however, the FW will only
bundle packets that fit into the same pkt->alloc_len).
Before this patch, the skb length would be set (incorrectly) to
pkt->act_len in ath10k_sdio_mbox_rx_packet, and then later manually
adjusted in ath10k_sdio_mbox_rx_process_packet.
The first problem is that ath10k_sdio_mbox_rx_process_packet does not
use proper skb_put commands to adjust the length (it directly changes
skb->len), so we end up with a mismatch between skb->head + skb->tail
and skb->data + skb->len. This is quite serious, and causes corruptions
in the TCP stack, as the stack tries to coalesce packets, and relies
on skb->tail being correct (that is, skb_tail_pointer must point to
the first byte_after_ the data).
Instead of re-adjusting the size in ath10k_sdio_mbox_rx_process_packet,
this moves the code to ath10k_sdio_mbox_rx_packet, and also add a
bounds check, as skb_put would crash the kernel if not enough space is
available.
Fixes: 8530b4e7b22bc3b ("ath10k: sdio: set skb len for all rx packets")
Signed-off-by: Nicolas Boichat <drinkcat@chromium.org>
---
One simple way to reproduce this issue is this scriplet, that sends a
lot of small packets over SSH (it usually fails before reaching 300):
(for i in `seq 1 300`; do echo $i; sleep 0.1; done) | ssh $IP cat
I was not able to check the original use case why the code was added
(packets > 1500 bytes), as the FW on my board crashes when sending
such large packets.
drivers/net/wireless/ath/ath10k/sdio.c | 25 +++++++++++++++++--------
1 file changed, 17 insertions(+), 8 deletions(-)
diff --git a/drivers/net/wireless/ath/ath10k/sdio.c b/drivers/net/wireless/ath/ath10k/sdio.c
index 8ed4fbd8d6c3888..0a3ac44a13698c1 100644
--- a/drivers/net/wireless/ath/ath10k/sdio.c
+++ b/drivers/net/wireless/ath/ath10k/sdio.c
@@ -381,16 +381,11 @@ static int ath10k_sdio_mbox_rx_process_packet(struct ath10k *ar,
struct ath10k_htc_hdr *htc_hdr = (struct ath10k_htc_hdr *)skb->data;
bool trailer_present = htc_hdr->flags & ATH10K_HTC_FLAG_TRAILER_PRESENT;
enum ath10k_htc_ep_id eid;
- u16 payload_len;
u8 *trailer;
int ret;
- payload_len = le16_to_cpu(htc_hdr->len);
- skb->len = payload_len + sizeof(struct ath10k_htc_hdr);
-
if (trailer_present) {
- trailer = skb->data + sizeof(*htc_hdr) +
- payload_len - htc_hdr->trailer_len;
+ trailer = skb->data + skb->len - htc_hdr->trailer_len;
eid = pipe_id_to_eid(htc_hdr->eid);
@@ -636,9 +631,23 @@ static int ath10k_sdio_mbox_rx_packet(struct ath10k *ar,
ret = ath10k_sdio_readsb(ar, ar_sdio->mbox_info.htc_addr,
skb->data, pkt->alloc_len);
+ if (!ret) {
+ /* Update actual length. The original length may be incorrect,
+ * as the FW will bundle multiple packets as long as their sizes
+ * fit within the same aligned length (pkt->alloc_len).
+ */
+ struct ath10k_htc_hdr *htc_hdr =
+ (struct ath10k_htc_hdr *)skb->data;
+ pkt->act_len = le16_to_cpu(htc_hdr->len) + sizeof(*htc_hdr);
+ if (pkt->act_len <= pkt->alloc_len) {
+ skb_put(skb, pkt->act_len);
+ } else {
+ ath10k_warn(ar, "rx_packet too large (%d > %d)\n",
+ pkt->act_len, pkt->alloc_len);
+ ret = -EMSGSIZE;
+ }
+ }
pkt->status = ret;
- if (!ret)
- skb_put(skb, pkt->act_len);
return ret;
}
--
2.23.0.187.g17f5b7556c-goog
^ permalink raw reply related
* Re: [Xen-devel] Question on xen-netfront code to fix a potential ring buffer corruption
From: Dongli Zhang @ 2019-08-27 6:43 UTC (permalink / raw)
To: Juergen Gross, xen-devel; +Cc: netdev, Joe Jin
In-Reply-To: <130ea0ab-4364-2b77-dc8d-b869e06d1768@suse.com>
Hi Juergen,
On 8/27/19 2:13 PM, Juergen Gross wrote:
> On 18.08.19 10:31, Dongli Zhang wrote:
>> Hi,
>>
>> Would you please help confirm why the condition at line 908 is ">="?
>>
>> In my opinion, we would only hit "skb_shinfo(skb)->nr_frag == MAX_SKB_FRAGS" at
>> line 908.
>>
>> 890 static RING_IDX xennet_fill_frags(struct netfront_queue *queue,
>> 891 struct sk_buff *skb,
>> 892 struct sk_buff_head *list)
>> 893 {
>> 894 RING_IDX cons = queue->rx.rsp_cons;
>> 895 struct sk_buff *nskb;
>> 896
>> 897 while ((nskb = __skb_dequeue(list))) {
>> 898 struct xen_netif_rx_response *rx =
>> 899 RING_GET_RESPONSE(&queue->rx, ++cons);
>> 900 skb_frag_t *nfrag = &skb_shinfo(nskb)->frags[0];
>> 901
>> 902 if (skb_shinfo(skb)->nr_frags == MAX_SKB_FRAGS) {
>> 903 unsigned int pull_to = NETFRONT_SKB_CB(skb)->pull_to;
>> 904
>> 905 BUG_ON(pull_to < skb_headlen(skb));
>> 906 __pskb_pull_tail(skb, pull_to - skb_headlen(skb));
>> 907 }
>> 908 if (unlikely(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)) {
>> 909 queue->rx.rsp_cons = ++cons;
>> 910 kfree_skb(nskb);
>> 911 return ~0U;
>> 912 }
>> 913
>> 914 skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
>> 915 skb_frag_page(nfrag),
>> 916 rx->offset, rx->status, PAGE_SIZE);
>> 917
>> 918 skb_shinfo(nskb)->nr_frags = 0;
>> 919 kfree_skb(nskb);
>> 920 }
>> 921
>> 922 return cons;
>> 923 }
>>
>>
>> The reason that I ask about this is because I am considering below patch to
>> avoid a potential xen-netfront ring buffer corruption.
>>
>> diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
>> index 8d33970..48a2162 100644
>> --- a/drivers/net/xen-netfront.c
>> +++ b/drivers/net/xen-netfront.c
>> @@ -906,7 +906,7 @@ static RING_IDX xennet_fill_frags(struct netfront_queue
>> *queue,
>> __pskb_pull_tail(skb, pull_to - skb_headlen(skb));
>> }
>> if (unlikely(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)) {
>> - queue->rx.rsp_cons = ++cons;
>> + queue->rx.rsp_cons = cons + skb_queue_len(list) + 1;
>> kfree_skb(nskb);
>> return ~0U;
>> }
>>
>>
>> If there is skb left in list when we return ~0U, queue->rx.rsp_cons may be set
>> incorrectly.
>
> Sa basically you want to consume the responses for all outstanding skbs
> in the list?
>
I think there would be bug if there is skb left in the list.
This is what is implanted in xennet_poll() when there is error of processing
extra info like below. As at line 1034, if there is error, all outstanding skb
are consumed.
985 static int xennet_poll(struct napi_struct *napi, int budget)
... ...
1028 if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
1029 struct xen_netif_extra_info *gso;
1030 gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
1031
1032 if (unlikely(xennet_set_skb_gso(skb, gso))) {
1033 __skb_queue_head(&tmpq, skb);
1034 queue->rx.rsp_cons += skb_queue_len(&tmpq);
1035 goto err;
1036 }
1037 }
The reason we need to consume all outstanding skb is because
xennet_get_responses() would reset both queue->rx_skbs[i] and
queue->grant_rx_ref[i] to NULL before enqueue all outstanding skb to the list
(e.g., &tmpq), by xennet_get_rx_skb() and xennet_get_rx_ref().
If we do not consume all of them, we would hit NULL in queue->rx_skbs[i] in next
iteration of while loop in xennet_poll().
That is, if we do not consume all outstanding skb, the queue->rx.rsp_cons may
refer to a response whose queue->rx_skbs[i] and queue->grant_rx_ref[i] are
already reset to NULL.
Dongli Zhang
>>
>> While I am trying to make up a case that would hit the corruption, I could not
>> explain why (unlikely(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)), but not
>> just "==". Perhaps __pskb_pull_tail() may fail although pull_to is less than
>> skb_headlen(skb).
>
> I don't think nr_frags can be larger than MAX_SKB_FRAGS. OTOH I don't
> think it hurts to have a safety net here in order to avoid problems.
>
> Originally this was BUG_ON(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)
> so that might explain the ">=".
>
>
> Juergen
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xenproject.org
> https://lists.xenproject.org/mailman/listinfo/xen-devel
^ permalink raw reply
* Re: KASAN: slab-out-of-bounds Read in sctp_inq_pop
From: Xin Long @ 2019-08-27 6:29 UTC (permalink / raw)
To: syzbot
Cc: davem, LKML, linux-sctp, Marcelo Ricardo Leitner, network dev,
Neil Horman, syzkaller-bugs, Vlad Yasevich
In-Reply-To: <000000000000afc64d0591084876@google.com>
On Tue, Aug 27, 2019 at 1:15 AM syzbot
<syzbot+3ca06c5cb35ee3fc1f89@syzkaller.appspotmail.com> wrote:
>
> Hello,
>
> syzbot found the following crash on:
>
> HEAD commit: 9733a7c6 Add linux-next specific files for 20190823
> git tree: linux-next
> console output: https://syzkaller.appspot.com/x/log.txt?x=143ec11e600000
> kernel config: https://syzkaller.appspot.com/x/.config?x=f6c78a1438582bd1
> dashboard link: https://syzkaller.appspot.com/bug?extid=3ca06c5cb35ee3fc1f89
> compiler: gcc (GCC) 9.0.0 20181231 (experimental)
>
> Unfortunately, I don't have any reproducer for this crash yet.
>
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+3ca06c5cb35ee3fc1f89@syzkaller.appspotmail.com
>
> ==================================================================
> BUG: KASAN: slab-out-of-bounds in sctp_inq_pop+0xafd/0xd80
> net/sctp/inqueue.c:201
> Read of size 2 at addr ffff8880a4e37222 by task syz-executor.3/32407
>
> CPU: 1 PID: 32407 Comm: syz-executor.3 Not tainted 5.3.0-rc5-next-20190823
> #72
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> Google 01/01/2011
> Call Trace:
> __dump_stack lib/dump_stack.c:77 [inline]
> dump_stack+0x172/0x1f0 lib/dump_stack.c:113
> print_address_description.cold+0xd4/0x306 mm/kasan/report.c:351
> __kasan_report.cold+0x1b/0x36 mm/kasan/report.c:482
> kasan_report+0x12/0x17 mm/kasan/common.c:610
> __asan_report_load2_noabort+0x14/0x20 mm/kasan/generic_report.c:130
> sctp_inq_pop+0xafd/0xd80 net/sctp/inqueue.c:201
> sctp_endpoint_bh_rcv+0x184/0x8d0 net/sctp/endpointola.c:335
> sctp_inq_push+0x1e4/0x280 net/sctp/inqueue.c:80
> sctp_rcv+0x2807/0x3590 net/sctp/input.c:256
> sctp6_rcv+0x17/0x30 net/sctp/ipv6.c:1049
> ip6_protocol_deliver_rcu+0x2fe/0x1660 net/ipv6/ip6_input.c:397
> ip6_input_finish+0x84/0x170 net/ipv6/ip6_input.c:438
> NF_HOOK include/linux/netfilter.h:305 [inline]
> NF_HOOK include/linux/netfilter.h:299 [inline]
> ip6_input+0xe4/0x3f0 net/ipv6/ip6_input.c:447
> dst_input include/net/dst.h:442 [inline]
> ip6_sublist_rcv_finish+0x98/0x1e0 net/ipv6/ip6_input.c:84
> ip6_list_rcv_finish net/ipv6/ip6_input.c:118 [inline]
> ip6_sublist_rcv+0x80c/0xcf0 net/ipv6/ip6_input.c:282
> ipv6_list_rcv+0x373/0x4b0 net/ipv6/ip6_input.c:316
> __netif_receive_skb_list_ptype net/core/dev.c:5049 [inline]
> __netif_receive_skb_list_core+0x1a2/0x9d0 net/core/dev.c:5087
> __netif_receive_skb_list net/core/dev.c:5149 [inline]
> netif_receive_skb_list_internal+0x7eb/0xe60 net/core/dev.c:5244
> gro_normal_list.part.0+0x1e/0xb0 net/core/dev.c:5757
> gro_normal_list net/core/dev.c:5755 [inline]
> gro_normal_one net/core/dev.c:5769 [inline]
> napi_frags_finish net/core/dev.c:5782 [inline]
> napi_gro_frags+0xa6a/0xea0 net/core/dev.c:5855
> tun_get_user+0x2e98/0x3fa0 drivers/net/tun.c:1974
> tun_chr_write_iter+0xbd/0x156 drivers/net/tun.c:2020
> call_write_iter include/linux/fs.h:1890 [inline]
> do_iter_readv_writev+0x5f8/0x8f0 fs/read_write.c:693
> do_iter_write fs/read_write.c:976 [inline]
> do_iter_write+0x17b/0x380 fs/read_write.c:957
> vfs_writev+0x1b3/0x2f0 fs/read_write.c:1021
> do_writev+0x15b/0x330 fs/read_write.c:1064
> __do_sys_writev fs/read_write.c:1137 [inline]
> __se_sys_writev fs/read_write.c:1134 [inline]
> __x64_sys_writev+0x75/0xb0 fs/read_write.c:1134
> do_syscall_64+0xfa/0x760 arch/x86/entry/common.c:290
> entry_SYSCALL_64_after_hwframe+0x49/0xbe
> RIP: 0033:0x459731
> Code: 75 14 b8 14 00 00 00 0f 05 48 3d 01 f0 ff ff 0f 83 34 b9 fb ff c3 48
> 83 ec 08 e8 fa 2c 00 00 48 89 04 24 b8 14 00 00 00 0f 05 <48> 8b 3c 24 48
> 89 c2 e8 43 2d 00 00 48 89 d0 48 83 c4 08 48 3d 01
> RSP: 002b:00007fb4cd361ba0 EFLAGS: 00000293 ORIG_RAX: 0000000000000014
> RAX: ffffffffffffffda RBX: 000000000000002a RCX: 0000000000459731
> RDX: 0000000000000001 RSI: 00007fb4cd361c00 RDI: 00000000000000f0
> RBP: 000000000075bf20 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000293 R12: 00007fb4cd3626d4
> R13: 00000000004c87e3 R14: 00000000004df640 R15: 00000000ffffffff
>
> Allocated by task 32407:
> save_stack+0x23/0x90 mm/kasan/common.c:69
> set_track mm/kasan/common.c:77 [inline]
> __kasan_kmalloc mm/kasan/common.c:486 [inline]
> __kasan_kmalloc.constprop.0+0xcf/0xe0 mm/kasan/common.c:459
> kasan_slab_alloc+0xf/0x20 mm/kasan/common.c:494
> slab_post_alloc_hook mm/slab.h:584 [inline]
> slab_alloc mm/slab.c:3319 [inline]
> kmem_cache_alloc+0x121/0x710 mm/slab.c:3483
> __build_skb+0x26/0x70 net/core/skbuff.c:310
> __napi_alloc_skb+0x1d2/0x300 net/core/skbuff.c:523
> napi_alloc_skb include/linux/skbuff.h:2801 [inline]
> napi_get_frags net/core/dev.c:5742 [inline]
> napi_get_frags+0x65/0x140 net/core/dev.c:5737
> tun_napi_alloc_frags drivers/net/tun.c:1473 [inline]
> tun_get_user+0x16bd/0x3fa0 drivers/net/tun.c:1834
> tun_chr_write_iter+0xbd/0x156 drivers/net/tun.c:2020
> call_write_iter include/linux/fs.h:1890 [inline]
> do_iter_readv_writev+0x5f8/0x8f0 fs/read_write.c:693
> do_iter_write fs/read_write.c:976 [inline]
> do_iter_write+0x17b/0x380 fs/read_write.c:957
> vfs_writev+0x1b3/0x2f0 fs/read_write.c:1021
> do_writev+0x15b/0x330 fs/read_write.c:1064
> __do_sys_writev fs/read_write.c:1137 [inline]
> __se_sys_writev fs/read_write.c:1134 [inline]
> __x64_sys_writev+0x75/0xb0 fs/read_write.c:1134
> do_syscall_64+0xfa/0x760 arch/x86/entry/common.c:290
> entry_SYSCALL_64_after_hwframe+0x49/0xbe
>
> Freed by task 3891:
> save_stack+0x23/0x90 mm/kasan/common.c:69
> set_track mm/kasan/common.c:77 [inline]
> __kasan_slab_free+0x102/0x150 mm/kasan/common.c:448
> kasan_slab_free+0xe/0x10 mm/kasan/common.c:456
> __cache_free mm/slab.c:3425 [inline]
> kmem_cache_free+0x86/0x320 mm/slab.c:3693
> kfree_skbmem net/core/skbuff.c:623 [inline]
> kfree_skbmem+0xc5/0x150 net/core/skbuff.c:617
> __kfree_skb net/core/skbuff.c:680 [inline]
> consume_skb net/core/skbuff.c:838 [inline]
> consume_skb+0x103/0x3b0 net/core/skbuff.c:832
> skb_free_datagram+0x1b/0x100 net/core/datagram.c:328
> netlink_recvmsg+0x6c6/0xf50 net/netlink/af_netlink.c:1996
> sock_recvmsg_nosec net/socket.c:871 [inline]
> sock_recvmsg net/socket.c:889 [inline]
> sock_recvmsg+0xce/0x110 net/socket.c:885
> ___sys_recvmsg+0x271/0x5a0 net/socket.c:2480
> __sys_recvmsg+0x102/0x1d0 net/socket.c:2537
> __do_sys_recvmsg net/socket.c:2547 [inline]
> __se_sys_recvmsg net/socket.c:2544 [inline]
> __x64_sys_recvmsg+0x78/0xb0 net/socket.c:2544
> do_syscall_64+0xfa/0x760 arch/x86/entry/common.c:290
> entry_SYSCALL_64_after_hwframe+0x49/0xbe
>
> The buggy address belongs to the object at ffff8880a4e37140
> which belongs to the cache skbuff_head_cache of size 224
> The buggy address is located 2 bytes to the right of
> 224-byte region [ffff8880a4e37140, ffff8880a4e37220)
> The buggy address belongs to the page:
> page:ffffea0002938dc0 refcount:1 mapcount:0 mapping:ffff88821b6a3a80
> index:0x0
> flags: 0x1fffc0000000200(slab)
> raw: 01fffc0000000200 ffffea000257fa88 ffffea00023a2008 ffff88821b6a3a80
> raw: 0000000000000000 ffff8880a4e37000 000000010000000c 0000000000000000
> page dumped because: kasan: bad access detected
>
> Memory state around the buggy address:
> ffff8880a4e37100: fc fc fc fc fc fc fc fc 00 00 00 00 00 00 00 00
> ffff8880a4e37180: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> > ffff8880a4e37200: 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc fc
> ^
> ffff8880a4e37280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> ffff8880a4e37300: fb fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc
> ==================================================================
>
>
> ---
> This bug is generated by a bot. It may contain errors.
> See https://goo.gl/tpsmEJ for more information about syzbot.
> syzbot engineers can be reached at syzkaller@googlegroups.com.
>
> syzbot will keep track of this bug report. See:
> https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
#syz fix: net: ipv6: fix listify ip6_rcv_finish in case of forwarding
^ permalink raw reply
* Re: Question on xen-netfront code to fix a potential ring buffer corruption
From: Juergen Gross @ 2019-08-27 6:13 UTC (permalink / raw)
To: Dongli Zhang, xen-devel; +Cc: Joe Jin, netdev
In-Reply-To: <c45b306e-c67b-49f5-8fe8-3913557a8774@default>
On 18.08.19 10:31, Dongli Zhang wrote:
> Hi,
>
> Would you please help confirm why the condition at line 908 is ">="?
>
> In my opinion, we would only hit "skb_shinfo(skb)->nr_frag == MAX_SKB_FRAGS" at
> line 908.
>
> 890 static RING_IDX xennet_fill_frags(struct netfront_queue *queue,
> 891 struct sk_buff *skb,
> 892 struct sk_buff_head *list)
> 893 {
> 894 RING_IDX cons = queue->rx.rsp_cons;
> 895 struct sk_buff *nskb;
> 896
> 897 while ((nskb = __skb_dequeue(list))) {
> 898 struct xen_netif_rx_response *rx =
> 899 RING_GET_RESPONSE(&queue->rx, ++cons);
> 900 skb_frag_t *nfrag = &skb_shinfo(nskb)->frags[0];
> 901
> 902 if (skb_shinfo(skb)->nr_frags == MAX_SKB_FRAGS) {
> 903 unsigned int pull_to = NETFRONT_SKB_CB(skb)->pull_to;
> 904
> 905 BUG_ON(pull_to < skb_headlen(skb));
> 906 __pskb_pull_tail(skb, pull_to - skb_headlen(skb));
> 907 }
> 908 if (unlikely(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)) {
> 909 queue->rx.rsp_cons = ++cons;
> 910 kfree_skb(nskb);
> 911 return ~0U;
> 912 }
> 913
> 914 skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
> 915 skb_frag_page(nfrag),
> 916 rx->offset, rx->status, PAGE_SIZE);
> 917
> 918 skb_shinfo(nskb)->nr_frags = 0;
> 919 kfree_skb(nskb);
> 920 }
> 921
> 922 return cons;
> 923 }
>
>
> The reason that I ask about this is because I am considering below patch to
> avoid a potential xen-netfront ring buffer corruption.
>
> diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
> index 8d33970..48a2162 100644
> --- a/drivers/net/xen-netfront.c
> +++ b/drivers/net/xen-netfront.c
> @@ -906,7 +906,7 @@ static RING_IDX xennet_fill_frags(struct netfront_queue *queue,
> __pskb_pull_tail(skb, pull_to - skb_headlen(skb));
> }
> if (unlikely(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)) {
> - queue->rx.rsp_cons = ++cons;
> + queue->rx.rsp_cons = cons + skb_queue_len(list) + 1;
> kfree_skb(nskb);
> return ~0U;
> }
>
>
> If there is skb left in list when we return ~0U, queue->rx.rsp_cons may be set
> incorrectly.
Sa basically you want to consume the responses for all outstanding skbs
in the list?
>
> While I am trying to make up a case that would hit the corruption, I could not
> explain why (unlikely(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)), but not
> just "==". Perhaps __pskb_pull_tail() may fail although pull_to is less than
> skb_headlen(skb).
I don't think nr_frags can be larger than MAX_SKB_FRAGS. OTOH I don't
think it hurts to have a safety net here in order to avoid problems.
Originally this was BUG_ON(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)
so that might explain the ">=".
Juergen
^ permalink raw reply
* Re: [RFC PATCH net-next] net: phy: force phy suspend when calling phy_stop
From: Heiner Kallweit @ 2019-08-27 5:51 UTC (permalink / raw)
To: Jian Shen, andrew, f.fainelli, davem; +Cc: netdev, forest.zhouchang, linuxarm
In-Reply-To: <1566874020-14334-1-git-send-email-shenjian15@huawei.com>
On 27.08.2019 04:47, Jian Shen wrote:
> Some ethernet drivers may call phy_start() and phy_stop() from
> ndo_open and ndo_close() respectively.
>
> When network cable is unconnected, and operate like below:
> step 1: ifconfig ethX up -> ndo_open -> phy_start ->start
> autoneg, and phy is no link.
> step 2: ifconfig ethX down -> ndo_close -> phy_stop -> just stop
> phy state machine.
> step 3: plugin the network cable, and autoneg complete, then
> LED for link status will be on.
> step 4: ethtool ethX --> see the result of "Link detected" is no.
>
Step 3 and 4 seem to be unrelated to the actual issue.
With which MAC + PHY driver did you observe this?
> This patch forces phy suspend even phydev->link is off.
>
> Signed-off-by: Jian Shen <shenjian15@huawei.com>
> ---
> drivers/net/phy/phy.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
> index f3adea9..0acd5b4 100644
> --- a/drivers/net/phy/phy.c
> +++ b/drivers/net/phy/phy.c
> @@ -911,8 +911,8 @@ void phy_state_machine(struct work_struct *work)
> if (phydev->link) {
> phydev->link = 0;
> phy_link_down(phydev, true);
> - do_suspend = true;
> }
> + do_suspend = true;
> break;
> }
>
>
Heiner
^ permalink raw reply
* Re: [PATCH, RFC] ath10k: Fix skb->len (properly) in ath10k_sdio_mbox_rx_packet
From: Nicolas Boichat @ 2019-08-27 5:23 UTC (permalink / raw)
To: Wen Gong
Cc: kvalo@codeaurora.org, Alagu Sankar, netdev@vger.kernel.org,
briannorris@chromium.org, linux-wireless@vger.kernel.org,
linux-kernel@vger.kernel.org, ath10k@lists.infradead.org,
wgong@codeaurora.org, niklas.cassel@linaro.org,
tientzu@chromium.org, David S . Miller
In-Reply-To: <36878f3488f047978038c844daedd02f@aptaiexm02f.ap.qualcomm.com>
On Tue, Aug 27, 2019 at 11:34 AM Wen Gong <wgong@qti.qualcomm.com> wrote:
>
> > -----Original Message-----
> > From: ath10k <ath10k-bounces@lists.infradead.org> On Behalf Of Nicolas
> > Boichat
> > Sent: Tuesday, August 27, 2019 8:33 AM
> > To: kvalo@codeaurora.org
> > Cc: Alagu Sankar <alagusankar@silex-india.com>; netdev@vger.kernel.org;
> > briannorris@chromium.org; linux-wireless@vger.kernel.org; linux-
> > kernel@vger.kernel.org; ath10k@lists.infradead.org;
> > wgong@codeaurora.org; niklas.cassel@linaro.org; tientzu@chromium.org;
> > David S . Miller <davem@davemloft.net>
> > Subject: [EXT] [PATCH, RFC] ath10k: Fix skb->len (properly) in
> > ath10k_sdio_mbox_rx_packet
> >
> > (not a formal patch, take this as a bug report for now, I can clean
> > up depending on the feedback I get here)
> >
> > There's at least 3 issues here, and the patch fixes 2/3 only, I'm not sure
> > how/if 1 should be handled.
> > 1. ath10k_sdio_mbox_rx_alloc allocating skb of a incorrect size (too
> > small)
> > 2. ath10k_sdio_mbox_rx_packet calling skb_put with that incorrect size.
> > 3. ath10k_sdio_mbox_rx_process_packet attempts to fixup the size, but
> > does not use proper skb_put commands to do so, so we end up with
> > a mismatch between skb->head + skb->tail and skb->data + skb->len.
> >
> > Let's start with 3, this is quite serious as this and causes corruptions
> > in the TCP stack, as the stack tries to coalesce packets, and relies on
> > skb->tail being correct (that is, skb_tail_pointer must point to the
> > first byte _after_ the data): one must never manipulate skb->len
> > directly.
> >
> > Instead, we need to use skb_put to allocate more space (which updates
> > skb->len and skb->tail). But it seems odd to do that in
> > ath10k_sdio_mbox_rx_process_packet, so I move the code to
> > ath10k_sdio_mbox_rx_packet (point 2 above).
> >
> > However, there is still something strange (point 1 above), why is
> > ath10k_sdio_mbox_rx_alloc allocating packets of the incorrect
> > (too small?) size? What happens if the packet is bigger than alloc_len?
> > Does this lead to corruption/lost data?
> >
> > Fixes: 8530b4e7b22bc3b ("ath10k: sdio: set skb len for all rx packets")
> > Signed-off-by: Nicolas Boichat <drinkcat@chromium.org>
> >
> > ---
> >
> > One simple way to test this is this scriplet, that sends a lot of
> > small packets over SSH:
> > (for i in `seq 1 300`; do echo $i; sleep 0.1; done) | ssh $IP cat
> >
> > In my testing it rarely ever reach 300 without failure.
> >
> > drivers/net/wireless/ath/ath10k/sdio.c | 18 ++++++++++++------
> > 1 file changed, 12 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/net/wireless/ath/ath10k/sdio.c
> > b/drivers/net/wireless/ath/ath10k/sdio.c
> > index 8ed4fbd8d6c3888..a9f5002863ee7bb 100644
> > --- a/drivers/net/wireless/ath/ath10k/sdio.c
> > +++ b/drivers/net/wireless/ath/ath10k/sdio.c
> > @@ -381,16 +381,14 @@ static int
> > ath10k_sdio_mbox_rx_process_packet(struct ath10k *ar,
> > struct ath10k_htc_hdr *htc_hdr = (struct ath10k_htc_hdr *)skb->data;
> > bool trailer_present = htc_hdr->flags &
> > ATH10K_HTC_FLAG_TRAILER_PRESENT;
> > enum ath10k_htc_ep_id eid;
> > - u16 payload_len;
> > u8 *trailer;
> > int ret;
> >
> > - payload_len = le16_to_cpu(htc_hdr->len);
> > - skb->len = payload_len + sizeof(struct ath10k_htc_hdr);
> > + /* TODO: Remove this? */
> If the pkt->act_len has set again in ath10k_sdio_mbox_rx_packet, seems not needed.
Sure, will drop.
> > + WARN_ON(skb->len != le16_to_cpu(htc_hdr->len) + sizeof(*htc_hdr));
> >
> > if (trailer_present) {
> > - trailer = skb->data + sizeof(*htc_hdr) +
> > - payload_len - htc_hdr->trailer_len;
> > + trailer = skb->data + skb->len - htc_hdr->trailer_len;
> >
> > eid = pipe_id_to_eid(htc_hdr->eid);
> >
> > @@ -637,8 +635,16 @@ static int ath10k_sdio_mbox_rx_packet(struct
> > ath10k *ar,
> > ret = ath10k_sdio_readsb(ar, ar_sdio->mbox_info.htc_addr,
> > skb->data, pkt->alloc_len);
> > pkt->status = ret;
> > - if (!ret)
> > + if (!ret) {
> > + /* Update actual length. */
> > + /* FIXME: This looks quite wrong, why is pkt->act_len not
> > + * correct in the first place?
> > + */
> Firmware will do bundle for rx packet, and the aligned length by block size(256) of each packet's len is same
> in a bundle.
>
> Eg.
> packet 1 len: 300, aligned length:512
> packet 2 len: 400, aligned length:512
> packet 3 len: 200, aligned length:256
> packet 4 len: 100, aligned length:256
> packet 5 len: 700, aligned length:768
> packet 6 len: 600, aligned length:768
>
> then packet 1,2 will in bundle 1, packet 3,4 in a bundle 2, packet 5,6 in a bundle 3.
>
> For bundle 1, packet 1,2 will both allocate with len 512, and act_len is 300 first,
> then packet 2's len will be overwrite to 400.
>
> For bundle 2, packet 3,4 will both allocate with len 256, and act_len is 200 first,
> then packet 4's len will be overwrite to 100.
>
> For bundle 3, packet 5,6 will both allocate with len 768, and act_len is 700 first,
> then packet 6's len will be overwrite to 600.
Ok thanks, I'll send a v2 with an improved description.
> > + struct ath10k_htc_hdr *htc_hdr =
> > + (struct ath10k_htc_hdr *)skb->data;
> > + pkt->act_len = le16_to_cpu(htc_hdr->len) + sizeof(*htc_hdr);
> > skb_put(skb, pkt->act_len);
> > + }
> >
> > return ret;
> > }
> > --
> > 2.23.0.187.g17f5b7556c-goog
> >
> >
> > _______________________________________________
> > ath10k mailing list
> > ath10k@lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/ath10k
^ permalink raw reply
* Re: [PATCH V2 net 1/2] openvswitch: Properly set L4 keys on "later" IP fragments
From: Pravin Shelar @ 2019-08-27 5:13 UTC (permalink / raw)
To: Greg Rose; +Cc: Linux Kernel Network Developers, Joe Stringer
In-Reply-To: <1566852359-8028-1-git-send-email-gvrose8192@gmail.com>
On Mon, Aug 26, 2019 at 1:46 PM Greg Rose <gvrose8192@gmail.com> wrote:
>
> When IP fragments are reassembled before being sent to conntrack, the
> key from the last fragment is used. Unless there are reordering
> issues, the last fragment received will not contain the L4 ports, so the
> key for the reassembled datagram won't contain them. This patch updates
> the key once we have a reassembled datagram.
>
> The handle_fragments() function works on L3 headers so we pull the L3/L4
> flow key update code from key_extract into a new function
> 'key_extract_l3l4'. Then we add a another new function
> ovs_flow_key_update_l3l4() and export it so that it is accessible by
> handle_fragments() for conntrack packet reassembly.
>
> Co-authored by: Justin Pettit <jpettit@ovn.org>
> Signed-off-by: Greg Rose <gvrose8192@gmail.com>
> ---
> net/openvswitch/conntrack.c | 5 ++
> net/openvswitch/flow.c | 161 ++++++++++++++++++++++++++------------------
> net/openvswitch/flow.h | 1 +
> 3 files changed, 101 insertions(+), 66 deletions(-)
>
...
...
>
> +/**
> + * key_extract - extracts a flow key from an Ethernet frame.
> + * @skb: sk_buff that contains the frame, with skb->data pointing to the
> + * Ethernet header
> + * @key: output flow key
> + *
> + * The caller must ensure that skb->len >= ETH_HLEN.
> + *
> + * Returns 0 if successful, otherwise a negative errno value.
> + *
> + * Initializes @skb header fields as follows:
> + *
> + * - skb->mac_header: the L2 header.
> + *
> + * - skb->network_header: just past the L2 header, or just past the
> + * VLAN header, to the first byte of the L2 payload.
> + *
> + * - skb->transport_header: If key->eth.type is ETH_P_IP or ETH_P_IPV6
> + * on output, then just past the IP header, if one is present and
> + * of a correct length, otherwise the same as skb->network_header.
> + * For other key->eth.type values it is left untouched.
> + *
> + * - skb->protocol: the type of the data starting at skb->network_header.
> + * Equals to key->eth.type.
> + */
> +static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
> +{
> + struct ethhdr *eth;
> +
> + /* Flags are always used as part of stats */
> + key->tp.flags = 0;
> +
> + skb_reset_mac_header(skb);
> +
> + /* Link layer. */
> + clear_vlan(key);
> + if (ovs_key_mac_proto(key) == MAC_PROTO_NONE) {
> + if (unlikely(eth_type_vlan(skb->protocol)))
> + return -EINVAL;
> +
> + skb_reset_network_header(skb);
> + key->eth.type = skb->protocol;
> + } else {
> + eth = eth_hdr(skb);
> + ether_addr_copy(key->eth.src, eth->h_source);
> + ether_addr_copy(key->eth.dst, eth->h_dest);
> +
> + __skb_pull(skb, 2 * ETH_ALEN);
> + /* We are going to push all headers that we pull, so no need to
> + * update skb->csum here.
> + */
> +
> + if (unlikely(parse_vlan(skb, key)))
> + return -ENOMEM;
> +
> + key->eth.type = parse_ethertype(skb);
> + if (unlikely(key->eth.type == htons(0)))
> + return -ENOMEM;
> +
> + /* Multiple tagged packets need to retain TPID to satisfy
> + * skb_vlan_pop(), which will later shift the ethertype into
> + * skb->protocol.
> + */
> + if (key->eth.cvlan.tci & htons(VLAN_CFI_MASK))
> + skb->protocol = key->eth.cvlan.tpid;
> + else
> + skb->protocol = key->eth.type;
> +
> + skb_reset_network_header(skb);
> + __skb_push(skb, skb->data - skb_mac_header(skb));
> + }
> +
> + skb_reset_mac_len(skb);
> +
> + /* Fill out L3/L4 key info, if any */
> + return key_extract_l3l4(skb, key);
> +}
> +
> +/* In the case of conntrack fragment handling it expects L3 headers,
> + * add a helper.
> + */
> +int ovs_flow_key_update_l3l4(struct sk_buff *skb, struct sw_flow_key *key)
> +{
> + int res;
> +
> + res = key_extract_l3l4(skb, key);
> + if (!res)
> + key->mac_proto &= ~SW_FLOW_KEY_INVALID;
> +
Since this is not full key extract, this flag can not be unset.
Otherwise looks good.
^ permalink raw reply
* Re: [patch net-next rfc 3/7] net: rtnetlink: add commands to add and delete alternative ifnames
From: Michal Kubecek @ 2019-08-27 5:09 UTC (permalink / raw)
To: netdev
Cc: David Ahern, David Miller, jakub.kicinski, jiri, roopa, sthemmin,
dcbw, andrew, parav, saeedm, mlxsw
In-Reply-To: <beb8ec07-f28e-4378-e8dd-fa6fe290377b@gmail.com>
On Mon, Aug 26, 2019 at 06:17:08PM -0600, David Ahern wrote:
>
> Something a bit stand alone would be a better choice - like all of the
> VF stuff, stats, per-device type configuration. Yes, that ship has
> sailed, but as I recall that is where the overhead is.
We are not so far from the point where IFLA_VFINFO_LIST grows over 64 KB
so that it does not fit into a netlink attribute. So we will probably
have to rethink that part anyway.
Michal
^ permalink raw reply
* [PATCH v2 2/2] macb: Update compatibility string for SiFive FU540-C000
From: Yash Shah @ 2019-08-27 5:06 UTC (permalink / raw)
To: davem, netdev, devicetree, linux-kernel, linux-riscv
Cc: robh+dt, mark.rutland, nicolas.ferre, palmer, paul.walmsley,
ynezz, sachin.ghadi, Yash Shah
In-Reply-To: <1566882364-23891-1-git-send-email-yash.shah@sifive.com>
Update the compatibility string for SiFive FU540-C000 as per the new
string updated in the binding doc.
Reference:
https://lore.kernel.org/netdev/CAJ2_jOFEVZQat0Yprg4hem4jRrqkB72FKSeQj4p8P5KA-+rgww@mail.gmail.com/
Signed-off-by: Yash Shah <yash.shah@sifive.com>
Acked-by: Nicolas Ferre <nicolas.ferre@microchip.com>
Reviewed-by: Paul Walmsley <paul.walmsley@sifive.com>
Tested-by: Paul Walmsley <paul.walmsley@sifive.com>
---
drivers/net/ethernet/cadence/macb_main.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index 5ca17e6..35b59b5 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -4154,7 +4154,7 @@ static int fu540_c000_init(struct platform_device *pdev)
{ .compatible = "cdns,emac", .data = &emac_config },
{ .compatible = "cdns,zynqmp-gem", .data = &zynqmp_config},
{ .compatible = "cdns,zynq-gem", .data = &zynq_config },
- { .compatible = "sifive,fu540-macb", .data = &fu540_c000_config },
+ { .compatible = "sifive,fu540-c000-gem", .data = &fu540_c000_config },
{ /* sentinel */ }
};
MODULE_DEVICE_TABLE(of, macb_dt_ids);
--
1.9.1
^ permalink raw reply related
* [PATCH v2 1/2] macb: bindings doc: update sifive fu540-c000 binding
From: Yash Shah @ 2019-08-27 5:06 UTC (permalink / raw)
To: davem, netdev, devicetree, linux-kernel, linux-riscv
Cc: robh+dt, mark.rutland, nicolas.ferre, palmer, paul.walmsley,
ynezz, sachin.ghadi, Yash Shah
In-Reply-To: <1566882364-23891-1-git-send-email-yash.shah@sifive.com>
As per the discussion with Nicolas Ferre[0], rename the compatible property
to a more appropriate and specific string.
[0] https://lore.kernel.org/netdev/CAJ2_jOFEVZQat0Yprg4hem4jRrqkB72FKSeQj4p8P5KA-+rgww@mail.gmail.com/
Signed-off-by: Yash Shah <yash.shah@sifive.com>
Acked-by: Nicolas Ferre <nicolas.ferre@microchip.com>
Reviewed-by: Paul Walmsley <paul.walmsley@sifive.com>
Reviewed-by: Rob Herring <robh@kernel.org>
---
Documentation/devicetree/bindings/net/macb.txt | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/Documentation/devicetree/bindings/net/macb.txt b/Documentation/devicetree/bindings/net/macb.txt
index 63c73fa..0b61a90 100644
--- a/Documentation/devicetree/bindings/net/macb.txt
+++ b/Documentation/devicetree/bindings/net/macb.txt
@@ -15,10 +15,10 @@ Required properties:
Use "atmel,sama5d4-gem" for the GEM IP (10/100) available on Atmel sama5d4 SoCs.
Use "cdns,zynq-gem" Xilinx Zynq-7xxx SoC.
Use "cdns,zynqmp-gem" for Zynq Ultrascale+ MPSoC.
- Use "sifive,fu540-macb" for SiFive FU540-C000 SoC.
+ Use "sifive,fu540-c000-gem" for SiFive FU540-C000 SoC.
Or the generic form: "cdns,emac".
- reg: Address and length of the register set for the device
- For "sifive,fu540-macb", second range is required to specify the
+ For "sifive,fu540-c000-gem", second range is required to specify the
address and length of the registers for GEMGXL Management block.
- interrupts: Should contain macb interrupt
- phy-mode: See ethernet.txt file in the same directory.
--
1.9.1
^ permalink raw reply related
* [PATCH v2 0/2] Update ethernet compatible string for SiFive FU540
From: Yash Shah @ 2019-08-27 5:06 UTC (permalink / raw)
To: davem, netdev, devicetree, linux-kernel, linux-riscv
Cc: robh+dt, mark.rutland, nicolas.ferre, palmer, paul.walmsley,
ynezz, sachin.ghadi, Yash Shah
This patch series renames the compatible property to a more appropriate
string. The patchset is based on Linux-5.3-rc6 and tested on SiFive
Unleashed board
Change history:
Since v1:
- Dropped PATCH3 because it's already merged
- Change the reference url in the patch descriptions to point to a
'lore.kernel.org' link instead of 'lkml.org'
Yash Shah (2):
macb: bindings doc: update sifive fu540-c000 binding
macb: Update compatibility string for SiFive FU540-C000
Documentation/devicetree/bindings/net/macb.txt | 4 ++--
drivers/net/ethernet/cadence/macb_main.c | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
--
1.9.1
^ permalink raw reply
* Re: [patch net-next rfc 3/7] net: rtnetlink: add commands to add and delete alternative ifnames
From: Michal Kubecek @ 2019-08-27 4:55 UTC (permalink / raw)
To: netdev
Cc: David Ahern, Jakub Kicinski, Jiri Pirko, Roopa Prabhu,
David Miller, Stephen Hemminger, dcbw, Andrew Lunn, parav,
Saeed Mahameed, mlxsw
In-Reply-To: <5d79fba4-f82e-97a7-7846-fd1de089a95b@gmail.com>
On Mon, Aug 26, 2019 at 03:46:43PM -0600, David Ahern wrote:
> On 8/26/19 10:55 AM, Jakub Kicinski wrote:
> > On Mon, 26 Aug 2019 18:09:16 +0200, Jiri Pirko wrote:
> >> DaveA, Roopa. Do you insist on doing add/remove of altnames in the
> >> existing setlist command using embedded message op attrs? I'm asking
> >> because after some time thinking about it, it still feels wrong to me :/
> >>
> >> If this would be a generic netlink api, we would just add another couple
> >> of commands. What is so different we can't add commands here?
> >> It is also much simpler code. Easy error handling, no need for
> >> rollback, no possibly inconsistent state, etc.
> >
> > +1 the separate op feels like a better uapi to me as well.
> >
> > Perhaps we could redo the iproute2 command line interface to make the
> > name the primary object? Would that address your concern Dave and Roopa?
> >
>
> No, my point is exactly that a name is not a primary object. A name is
> an attribute of a link - something that exists for the convenience of
> userspace only. (Like the 'protocol' for routes, rules and neighbors.)
>
> Currently, names are changed by RTM_NEWLINK/RTM_SETLINK. Aliases are
> added and deleted by RTM_NEWLINK/RTM_SETLINK. Why is an alternative name
> so special that it should have its own API?
There is only one alias so that it makes perfect sense to set it like
any other attribute. But the series introduces a list of alternative
names. So IMHO better analogy would be network addresses - and we do
have RTM_NEWADDR/RTM_DELADDR for them.
> If only 1 alt name was allowed, then RTM_NEWLINK/RTM_SETLINK would
> suffice. Management of it would have the same semantics as an alias -
> empty string means delete, non-empty string sets the value.
>
> So really the push for new RTM commands is to handle an unlimited number
> of alt names with the ability to change / delete any one of them. Has
> the need for multiple alternate ifnames been fully established? (I don't
> recall other than a discussion about parallels to block devices.)
I still believe the biggest problem with so-called "predictable names"
for network devices (which turn out to be not predictable and often not
even persistent) comes from the fact that unlike for block (or
character) devices, we don't have anything like symlink for network
device. This forces udev to choose only one naming scheme with all the
unpleasant consequences. So I see this series as an opportunity to get
rid of this limitation.
Michal
^ permalink raw reply
* Re: [PATCH v5 net-next 04/18] ionic: Add basic lif support
From: Jakub Kicinski @ 2019-08-27 4:42 UTC (permalink / raw)
To: Shannon Nelson; +Cc: netdev, davem
In-Reply-To: <20190826213339.56909-5-snelson@pensando.io>
On Mon, 26 Aug 2019 14:33:25 -0700, Shannon Nelson wrote:
> +static inline bool ionic_is_pf(struct ionic *ionic)
> +{
> + return ionic->pdev &&
> + ionic->pdev->device == PCI_DEVICE_ID_PENSANDO_IONIC_ETH_PF;
> +}
> +
> +static inline bool ionic_is_vf(struct ionic *ionic)
> +{
> + return ionic->pdev &&
> + ionic->pdev->device == PCI_DEVICE_ID_PENSANDO_IONIC_ETH_VF;
> +}
> +
> +static inline bool ionic_is_25g(struct ionic *ionic)
> +{
> + return ionic_is_pf(ionic) &&
> + ionic->pdev->subsystem_device == IONIC_SUBDEV_ID_NAPLES_25;
> +}
> +
> +static inline bool ionic_is_100g(struct ionic *ionic)
> +{
> + return ionic_is_pf(ionic) &&
> + (ionic->pdev->subsystem_device == IONIC_SUBDEV_ID_NAPLES_100_4 ||
> + ionic->pdev->subsystem_device == IONIC_SUBDEV_ID_NAPLES_100_8);
> +}
Again, a bunch of unused stuff.
^ permalink raw reply
* Re: [PATCH v5 net-next 03/18] ionic: Add port management commands
From: Jakub Kicinski @ 2019-08-27 4:36 UTC (permalink / raw)
To: Shannon Nelson; +Cc: netdev, davem
In-Reply-To: <20190826213339.56909-4-snelson@pensando.io>
On Mon, 26 Aug 2019 14:33:24 -0700, Shannon Nelson wrote:
> The port management commands apply to the physical port
> associated with the PCI device, which might be shared among
> several logical interfaces.
>
> Signed-off-by: Shannon Nelson <snelson@pensando.io>
> ---
> drivers/net/ethernet/pensando/ionic/ionic.h | 4 +
> .../ethernet/pensando/ionic/ionic_bus_pci.c | 16 +++
> .../net/ethernet/pensando/ionic/ionic_dev.c | 116 ++++++++++++++++++
> .../net/ethernet/pensando/ionic/ionic_dev.h | 15 +++
> .../net/ethernet/pensando/ionic/ionic_main.c | 86 +++++++++++++
> 5 files changed, 237 insertions(+)
>
> diff --git a/drivers/net/ethernet/pensando/ionic/ionic.h b/drivers/net/ethernet/pensando/ionic/ionic.h
> index 1f3c4a916849..db2ad14899d3 100644
> --- a/drivers/net/ethernet/pensando/ionic/ionic.h
> +++ b/drivers/net/ethernet/pensando/ionic/ionic.h
> @@ -44,4 +44,8 @@ int ionic_identify(struct ionic *ionic);
> int ionic_init(struct ionic *ionic);
> int ionic_reset(struct ionic *ionic);
>
> +int ionic_port_identify(struct ionic *ionic);
> +int ionic_port_init(struct ionic *ionic);
> +int ionic_port_reset(struct ionic *ionic);
> +
> #endif /* _IONIC_H_ */
> diff --git a/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c b/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c
> index 286b4b450a73..804dd43e92a6 100644
> --- a/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c
> +++ b/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c
> @@ -138,12 +138,27 @@ static int ionic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
> goto err_out_teardown;
> }
>
> + /* Configure the ports */
> + err = ionic_port_identify(ionic);
> + if (err) {
> + dev_err(dev, "Cannot identify port: %d, aborting\n", err);
> + goto err_out_reset;
> + }
> +
> + err = ionic_port_init(ionic);
> + if (err) {
> + dev_err(dev, "Cannot init port: %d, aborting\n", err);
> + goto err_out_reset;
> + }
> +
> err = ionic_devlink_register(ionic);
> if (err)
> dev_err(dev, "Cannot register devlink: %d\n", err);
>
> return 0;
>
> +err_out_reset:
> + ionic_reset(ionic);
> err_out_teardown:
> ionic_dev_teardown(ionic);
> err_out_unmap_bars:
> @@ -170,6 +185,7 @@ static void ionic_remove(struct pci_dev *pdev)
> return;
>
> ionic_devlink_unregister(ionic);
> + ionic_port_reset(ionic);
> ionic_reset(ionic);
> ionic_dev_teardown(ionic);
> ionic_unmap_bars(ionic);
> diff --git a/drivers/net/ethernet/pensando/ionic/ionic_dev.c b/drivers/net/ethernet/pensando/ionic/ionic_dev.c
> index 0bf1bd6bd7b1..cddd41a43550 100644
> --- a/drivers/net/ethernet/pensando/ionic/ionic_dev.c
> +++ b/drivers/net/ethernet/pensando/ionic/ionic_dev.c
> @@ -134,3 +134,119 @@ void ionic_dev_cmd_reset(struct ionic_dev *idev)
>
> ionic_dev_cmd_go(idev, &cmd);
> }
> +
> +/* Port commands */
> +void ionic_dev_cmd_port_identify(struct ionic_dev *idev)
> +{
> + union ionic_dev_cmd cmd = {
> + .port_init.opcode = IONIC_CMD_PORT_IDENTIFY,
> + .port_init.index = 0,
> + };
> +
> + ionic_dev_cmd_go(idev, &cmd);
> +}
> +
> +void ionic_dev_cmd_port_init(struct ionic_dev *idev)
> +{
> + union ionic_dev_cmd cmd = {
> + .port_init.opcode = IONIC_CMD_PORT_INIT,
> + .port_init.index = 0,
> + .port_init.info_pa = cpu_to_le64(idev->port_info_pa),
> + };
> +
> + ionic_dev_cmd_go(idev, &cmd);
> +}
> +
> +void ionic_dev_cmd_port_reset(struct ionic_dev *idev)
> +{
> + union ionic_dev_cmd cmd = {
> + .port_reset.opcode = IONIC_CMD_PORT_RESET,
> + .port_reset.index = 0,
> + };
> +
> + ionic_dev_cmd_go(idev, &cmd);
> +}
> +
> +void ionic_dev_cmd_port_state(struct ionic_dev *idev, u8 state)
> +{
> + union ionic_dev_cmd cmd = {
> + .port_setattr.opcode = IONIC_CMD_PORT_SETATTR,
> + .port_setattr.index = 0,
> + .port_setattr.attr = IONIC_PORT_ATTR_STATE,
> + .port_setattr.state = state,
> + };
> +
> + ionic_dev_cmd_go(idev, &cmd);
> +}
> +
> +void ionic_dev_cmd_port_speed(struct ionic_dev *idev, u32 speed)
> +{
> + union ionic_dev_cmd cmd = {
> + .port_setattr.opcode = IONIC_CMD_PORT_SETATTR,
> + .port_setattr.index = 0,
> + .port_setattr.attr = IONIC_PORT_ATTR_SPEED,
> + .port_setattr.speed = cpu_to_le32(speed),
> + };
> +
> + ionic_dev_cmd_go(idev, &cmd);
> +}
> +
> +void ionic_dev_cmd_port_mtu(struct ionic_dev *idev, u32 mtu)
> +{
> + union ionic_dev_cmd cmd = {
> + .port_setattr.opcode = IONIC_CMD_PORT_SETATTR,
> + .port_setattr.index = 0,
> + .port_setattr.attr = IONIC_PORT_ATTR_MTU,
> + .port_setattr.mtu = cpu_to_le32(mtu),
> + };
> +
> + ionic_dev_cmd_go(idev, &cmd);
> +}
> +
> +void ionic_dev_cmd_port_autoneg(struct ionic_dev *idev, u8 an_enable)
> +{
> + union ionic_dev_cmd cmd = {
> + .port_setattr.opcode = IONIC_CMD_PORT_SETATTR,
> + .port_setattr.index = 0,
> + .port_setattr.attr = IONIC_PORT_ATTR_AUTONEG,
> + .port_setattr.an_enable = an_enable,
> + };
> +
> + ionic_dev_cmd_go(idev, &cmd);
> +}
> +
> +void ionic_dev_cmd_port_fec(struct ionic_dev *idev, u8 fec_type)
> +{
> + union ionic_dev_cmd cmd = {
> + .port_setattr.opcode = IONIC_CMD_PORT_SETATTR,
> + .port_setattr.index = 0,
> + .port_setattr.attr = IONIC_PORT_ATTR_FEC,
> + .port_setattr.fec_type = fec_type,
> + };
> +
> + ionic_dev_cmd_go(idev, &cmd);
> +}
> +
> +void ionic_dev_cmd_port_pause(struct ionic_dev *idev, u8 pause_type)
> +{
> + union ionic_dev_cmd cmd = {
> + .port_setattr.opcode = IONIC_CMD_PORT_SETATTR,
> + .port_setattr.index = 0,
> + .port_setattr.attr = IONIC_PORT_ATTR_PAUSE,
> + .port_setattr.pause_type = pause_type,
> + };
> +
> + ionic_dev_cmd_go(idev, &cmd);
> +}
> +
> +void ionic_dev_cmd_port_loopback(struct ionic_dev *idev, u8 loopback_mode)
> +{
> + union ionic_dev_cmd cmd = {
> + .port_setattr.opcode = IONIC_CMD_PORT_SETATTR,
> + .port_setattr.index = 0,
> + .port_setattr.attr = IONIC_PORT_ATTR_LOOPBACK,
> + .port_setattr.loopback_mode = loopback_mode,
> + };
> +
> + ionic_dev_cmd_go(idev, &cmd);
> +}
> diff --git a/drivers/net/ethernet/pensando/ionic/ionic_dev.h b/drivers/net/ethernet/pensando/ionic/ionic_dev.h
> index 30a5206bba4e..5b83f21af18a 100644
> --- a/drivers/net/ethernet/pensando/ionic/ionic_dev.h
> +++ b/drivers/net/ethernet/pensando/ionic/ionic_dev.h
> @@ -122,6 +122,10 @@ struct ionic_dev {
> struct ionic_intr __iomem *intr_ctrl;
> u64 __iomem *intr_status;
>
> + u32 port_info_sz;
> + struct ionic_port_info *port_info;
> + dma_addr_t port_info_pa;
> +
> struct ionic_devinfo dev_info;
> };
>
> @@ -140,4 +144,15 @@ void ionic_dev_cmd_identify(struct ionic_dev *idev, u8 ver);
> void ionic_dev_cmd_init(struct ionic_dev *idev);
> void ionic_dev_cmd_reset(struct ionic_dev *idev);
>
> +void ionic_dev_cmd_port_identify(struct ionic_dev *idev);
> +void ionic_dev_cmd_port_init(struct ionic_dev *idev);
> +void ionic_dev_cmd_port_reset(struct ionic_dev *idev);
> +void ionic_dev_cmd_port_state(struct ionic_dev *idev, u8 state);
> +void ionic_dev_cmd_port_speed(struct ionic_dev *idev, u32 speed);
> +void ionic_dev_cmd_port_mtu(struct ionic_dev *idev, u32 mtu);
> +void ionic_dev_cmd_port_autoneg(struct ionic_dev *idev, u8 an_enable);
> +void ionic_dev_cmd_port_fec(struct ionic_dev *idev, u8 fec_type);
> +void ionic_dev_cmd_port_pause(struct ionic_dev *idev, u8 pause_type);
> +void ionic_dev_cmd_port_loopback(struct ionic_dev *idev, u8 loopback_mode);
I don't think you call most of these functions in this patch.
> #endif /* _IONIC_DEV_H_ */
> diff --git a/drivers/net/ethernet/pensando/ionic/ionic_main.c b/drivers/net/ethernet/pensando/ionic/ionic_main.c
> index f52eb6c50358..47928f184230 100644
> --- a/drivers/net/ethernet/pensando/ionic/ionic_main.c
> +++ b/drivers/net/ethernet/pensando/ionic/ionic_main.c
> @@ -309,6 +309,92 @@ int ionic_reset(struct ionic *ionic)
> return err;
> }
>
> +int ionic_port_identify(struct ionic *ionic)
> +{
> + struct ionic_identity *ident = &ionic->ident;
> + struct ionic_dev *idev = &ionic->idev;
> + size_t sz;
> + int err;
> +
> + mutex_lock(&ionic->dev_cmd_lock);
> +
> + ionic_dev_cmd_port_identify(idev);
> + err = ionic_dev_cmd_wait(ionic, devcmd_timeout);
> + if (!err) {
> + sz = min(sizeof(ident->port), sizeof(idev->dev_cmd_regs->data));
> + memcpy_fromio(&ident->port, &idev->dev_cmd_regs->data, sz);
> + }
> +
> + mutex_unlock(&ionic->dev_cmd_lock);
> +
> + return err;
> +}
> +
> +int ionic_port_init(struct ionic *ionic)
> +{
> + struct ionic_identity *ident = &ionic->ident;
> + struct ionic_dev *idev = &ionic->idev;
> + size_t sz;
> + int err;
> +
> + if (idev->port_info)
> + return 0;
> +
> + idev->port_info_sz = ALIGN(sizeof(*idev->port_info), PAGE_SIZE);
> + idev->port_info = dma_alloc_coherent(ionic->dev, idev->port_info_sz,
> + &idev->port_info_pa,
> + GFP_KERNEL);
> + if (!idev->port_info) {
> + dev_err(ionic->dev, "Failed to allocate port info, aborting\n");
> + return -ENOMEM;
> + }
> +
> + sz = min(sizeof(ident->port.config), sizeof(idev->dev_cmd_regs->data));
> +
> + mutex_lock(&ionic->dev_cmd_lock);
> +
> + memcpy_toio(&idev->dev_cmd_regs->data, &ident->port.config, sz);
> + ionic_dev_cmd_port_init(idev);
> + err = ionic_dev_cmd_wait(ionic, devcmd_timeout);
> +
> + ionic_dev_cmd_port_state(&ionic->idev, IONIC_PORT_ADMIN_STATE_UP);
> + (void)ionic_dev_cmd_wait(ionic, devcmd_timeout);
> +
> + mutex_unlock(&ionic->dev_cmd_lock);
> + if (err) {
> + dev_err(ionic->dev, "Failed to init port\n");
The lifetime of port_info seems a little strange. Why is it left in
place even if the command failed? Doesn't this leak memory?
> + return err;
> + }
> +
> + return 0;
return err; work for both paths
> +}
> +
> +int ionic_port_reset(struct ionic *ionic)
> +{
> + struct ionic_dev *idev = &ionic->idev;
> + int err;
> +
> + if (!idev->port_info)
> + return 0;
> +
> + mutex_lock(&ionic->dev_cmd_lock);
> + ionic_dev_cmd_port_reset(idev);
> + err = ionic_dev_cmd_wait(ionic, devcmd_timeout);
> + mutex_unlock(&ionic->dev_cmd_lock);
> + if (err) {
> + dev_err(ionic->dev, "Failed to reset port\n");
> + return err;
Again, memory leak if command fails? (nothing frees port_info)
> + }
> +
> + dma_free_coherent(ionic->dev, idev->port_info_sz,
> + idev->port_info, idev->port_info_pa);
> +
> + idev->port_info = NULL;
> + idev->port_info_pa = 0;
> +
> + return err;
Well, with current code err can only be 0 at this point.
> +}
> +
> static int __init ionic_init_module(void)
> {
> pr_info("%s %s, ver %s\n",
^ permalink raw reply
* RE: [PATCH 2/4] mdev: Make mdev alias unique among all mdevs
From: Parav Pandit @ 2019-08-27 4:28 UTC (permalink / raw)
To: Mark Bloch, alex.williamson@redhat.com, Jiri Pirko,
kwankhede@nvidia.com, cohuck@redhat.com, davem@davemloft.net
Cc: kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
netdev@vger.kernel.org
In-Reply-To: <6601940a-4832-08d2-e0f6-f9ac24758cdc@mellanox.com>
Hi Mark,
> -----Original Message-----
> From: Mark Bloch <markb@mellanox.com>
> Sent: Tuesday, August 27, 2019 4:32 AM
> To: Parav Pandit <parav@mellanox.com>; alex.williamson@redhat.com; Jiri
> Pirko <jiri@mellanox.com>; kwankhede@nvidia.com; cohuck@redhat.com;
> davem@davemloft.net
> Cc: kvm@vger.kernel.org; linux-kernel@vger.kernel.org;
> netdev@vger.kernel.org
> Subject: Re: [PATCH 2/4] mdev: Make mdev alias unique among all mdevs
>
>
>
> On 8/26/19 1:41 PM, Parav Pandit wrote:
> > Mdev alias should be unique among all the mdevs, so that when such
> > alias is used by the mdev users to derive other objects, there is no
> > collision in a given system.
> >
> > Signed-off-by: Parav Pandit <parav@mellanox.com>
> > ---
> > drivers/vfio/mdev/mdev_core.c | 5 +++++
> > 1 file changed, 5 insertions(+)
> >
> > diff --git a/drivers/vfio/mdev/mdev_core.c
> > b/drivers/vfio/mdev/mdev_core.c index e825ff38b037..6eb37f0c6369
> > 100644
> > --- a/drivers/vfio/mdev/mdev_core.c
> > +++ b/drivers/vfio/mdev/mdev_core.c
> > @@ -375,6 +375,11 @@ int mdev_device_create(struct kobject *kobj,
> struct device *dev,
> > ret = -EEXIST;
> > goto mdev_fail;
> > }
> > + if (tmp->alias && strcmp(tmp->alias, alias) == 0) {
>
> alias can be NULL here no?
>
If alias is NULL, tmp->alias would also be null because for given parent either we have alias or we don’t.
So its not possible to have tmp->alias as null and alias as non null.
But it may be good/defensive to add check for both.
> > + mutex_unlock(&mdev_list_lock);
> > + ret = -EEXIST;
> > + goto mdev_fail;
> > + }
> > }
> >
> > mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
> >
>
> Mark
^ permalink raw reply
* Re: [PATCH v5 net-next 02/18] ionic: Add hardware init and device commands
From: Jakub Kicinski @ 2019-08-27 4:28 UTC (permalink / raw)
To: Andrew Lunn; +Cc: Shannon Nelson, netdev, davem
In-Reply-To: <20190827022628.GD13411@lunn.ch>
On Tue, 27 Aug 2019 04:26:28 +0200, Andrew Lunn wrote:
> > +int ionic_identify(struct ionic *ionic)
> > +{
> > + struct ionic_identity *ident = &ionic->ident;
> > + struct ionic_dev *idev = &ionic->idev;
> > + size_t sz;
> > + int err;
> > +
> > + memset(ident, 0, sizeof(*ident));
> > +
> > + ident->drv.os_type = cpu_to_le32(IONIC_OS_TYPE_LINUX);
> > + ident->drv.os_dist = 0;
> > + strncpy(ident->drv.os_dist_str, utsname()->release,
> > + sizeof(ident->drv.os_dist_str) - 1);
> > + ident->drv.kernel_ver = cpu_to_le32(LINUX_VERSION_CODE);
> > + strncpy(ident->drv.kernel_ver_str, utsname()->version,
> > + sizeof(ident->drv.kernel_ver_str) - 1);
> > + strncpy(ident->drv.driver_ver_str, IONIC_DRV_VERSION,
> > + sizeof(ident->drv.driver_ver_str) - 1);
> > +
> > + mutex_lock(&ionic->dev_cmd_lock);
> > +
>
> I don't know about others, but from a privacy prospective, i'm not so
> happy about this. This is a smart NIC. It could be reporting back to
> Mothership pensando with this information?
>
> I would be happier if there was a privacy statement, right here,
> saying what this information is used for, and an agreement it is not
> used for anything else. If that gets violated, you can then only blame
> yourself when we ripe this out and hard code it to static values.
FWIW seems like a fair ask to me..
^ permalink raw reply
* RE: [PATCH 1/4] mdev: Introduce sha1 based mdev alias
From: Parav Pandit @ 2019-08-27 4:24 UTC (permalink / raw)
To: Alex Williamson
Cc: Jiri Pirko, kwankhede@nvidia.com, cohuck@redhat.com,
davem@davemloft.net, kvm@vger.kernel.org,
linux-kernel@vger.kernel.org, netdev@vger.kernel.org
In-Reply-To: <20190826194456.6edef7d1@x1.home>
> -----Original Message-----
> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Tuesday, August 27, 2019 7:15 AM
> To: Parav Pandit <parav@mellanox.com>
> Cc: Jiri Pirko <jiri@mellanox.com>; kwankhede@nvidia.com;
> cohuck@redhat.com; davem@davemloft.net; kvm@vger.kernel.org; linux-
> kernel@vger.kernel.org; netdev@vger.kernel.org
> Subject: Re: [PATCH 1/4] mdev: Introduce sha1 based mdev alias
>
> On Mon, 26 Aug 2019 15:41:16 -0500
> Parav Pandit <parav@mellanox.com> wrote:
>
> > Whenever a parent requests to generate mdev alias, generate a mdev
> > alias.
> > It is an optional attribute that parent can request to generate for
> > each of its child mdev.
> > mdev alias is generated using sha1 from the mdev name.
> >
> > Signed-off-by: Parav Pandit <parav@mellanox.com>
> > ---
> > drivers/vfio/mdev/mdev_core.c | 98
> +++++++++++++++++++++++++++++++-
> > drivers/vfio/mdev/mdev_private.h | 5 +-
> > drivers/vfio/mdev/mdev_sysfs.c | 13 +++--
> > include/linux/mdev.h | 4 ++
> > 4 files changed, 111 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/vfio/mdev/mdev_core.c
> > b/drivers/vfio/mdev/mdev_core.c index b558d4cfd082..e825ff38b037
> > 100644
> > --- a/drivers/vfio/mdev/mdev_core.c
> > +++ b/drivers/vfio/mdev/mdev_core.c
> > @@ -10,9 +10,11 @@
> > #include <linux/module.h>
> > #include <linux/device.h>
> > #include <linux/slab.h>
> > +#include <linux/mm.h>
> > #include <linux/uuid.h>
> > #include <linux/sysfs.h>
> > #include <linux/mdev.h>
> > +#include <crypto/hash.h>
> >
> > #include "mdev_private.h"
> >
> > @@ -27,6 +29,8 @@ static struct class_compat *mdev_bus_compat_class;
> > static LIST_HEAD(mdev_list); static DEFINE_MUTEX(mdev_list_lock);
> >
> > +static struct crypto_shash *alias_hash;
> > +
> > struct device *mdev_parent_dev(struct mdev_device *mdev) {
> > return mdev->parent->dev;
> > @@ -164,6 +168,18 @@ int mdev_register_device(struct device *dev, const
> struct mdev_parent_ops *ops)
> > goto add_dev_err;
> > }
> >
> > + if (ops->get_alias_length) {
> > + unsigned int digest_size;
> > + unsigned int aligned_len;
> > +
> > + aligned_len = roundup(ops->get_alias_length(), 2);
> > + digest_size = crypto_shash_digestsize(alias_hash);
> > + if (aligned_len / 2 > digest_size) {
> > + ret = -EINVAL;
> > + goto add_dev_err;
> > + }
> > + }
>
> This looks like a sanity check, it could be done outside of the
> parent_list_lock, even before we get a parent device reference.
>
Yes.
> I think we're using a callback for get_alias_length() rather than a fixed field
> to support the mtty module option added in patch 4, right?
Right.
I will move the check outside.
> Its utility is rather limited with no args. I could imagine that if a parent
> wanted to generate an alias that could be incorporated into a string with the
> parent device name that it would be useful to call this with the parent
> device as an arg. I guess we can save that until a user comes along though.
>
Right. We save until user arrives.
I suggest you review the extra complexity I added here for vendor driven alias length, which I think we should do when an actual user comes along.
> There doesn't seem to be anything serializing use of alias_hash.
>
Each sha1 calculation is happening on the new descriptor allocated and initialized using crypto_shash_init().
So it appears to me that each hash calculation can occur in parallel on the individual desc.
> > +
> > parent = kzalloc(sizeof(*parent), GFP_KERNEL);
> > if (!parent) {
> > ret = -ENOMEM;
> > @@ -259,6 +275,7 @@ static void mdev_device_free(struct mdev_device
> *mdev)
> > mutex_unlock(&mdev_list_lock);
> >
> > dev_dbg(&mdev->dev, "MDEV: destroying\n");
> > + kvfree(mdev->alias);
> > kfree(mdev);
> > }
> >
> > @@ -269,18 +286,86 @@ static void mdev_device_release(struct device
> *dev)
> > mdev_device_free(mdev);
> > }
> >
> > -int mdev_device_create(struct kobject *kobj,
> > - struct device *dev, const guid_t *uuid)
> > +static const char *
> > +generate_alias(const char *uuid, unsigned int max_alias_len) {
> > + struct shash_desc *hash_desc;
> > + unsigned int digest_size;
> > + unsigned char *digest;
> > + unsigned int alias_len;
> > + char *alias;
> > + int ret = 0;
> > +
> > + /* Align to multiple of 2 as bin2hex will generate
> > + * even number of bytes.
> > + */
>
> Comment style for non-networking code please.
Ack.
>
> > + alias_len = roundup(max_alias_len, 2);
> > + alias = kvzalloc(alias_len + 1, GFP_KERNEL);
>
> The size we're generating here should be small enough to just use kzalloc(),
Ack.
> probably below too.
>
Descriptor size is 96 bytes long. kvzalloc is more optimal.
> > + if (!alias)
> > + return NULL;
> > +
> > + /* Allocate and init descriptor */
> > + hash_desc = kvzalloc(sizeof(*hash_desc) +
> > + crypto_shash_descsize(alias_hash),
> > + GFP_KERNEL);
> > + if (!hash_desc)
> > + goto desc_err;
> > +
> > + hash_desc->tfm = alias_hash;
> > +
> > + digest_size = crypto_shash_digestsize(alias_hash);
> > +
> > + digest = kvzalloc(digest_size, GFP_KERNEL);
> > + if (!digest) {
> > + ret = -ENOMEM;
> > + goto digest_err;
> > + }
> > + crypto_shash_init(hash_desc);
> > + crypto_shash_update(hash_desc, uuid, UUID_STRING_LEN);
> > + crypto_shash_final(hash_desc, digest);
> > + bin2hex(&alias[0], digest,
>
> &alias[0], ie. alias
Ack.
>
> > + min_t(unsigned int, digest_size, alias_len / 2));
> > + /* When alias length is odd, zero out and additional last byte
> > + * that bin2hex has copied.
> > + */
> > + if (max_alias_len % 2)
> > + alias[max_alias_len] = 0;
>
> Doesn't this give us a null terminated string for odd numbers but not even
> numbers? Probably best to define that we always provide a null terminated
> string then we could do this unconditionally.
>
> > +
> > + kvfree(digest);
> > + kvfree(hash_desc);
> > + return alias;
> > +
> > +digest_err:
> > + kvfree(hash_desc);
> > +desc_err:
> > + kvfree(alias);
> > + return NULL;
> > +}
> > +
> > +int mdev_device_create(struct kobject *kobj, struct device *dev,
> > + const char *uuid_str, const guid_t *uuid)
> > {
> > int ret;
> > struct mdev_device *mdev, *tmp;
> > struct mdev_parent *parent;
> > struct mdev_type *type = to_mdev_type(kobj);
> > + unsigned int alias_len = 0;
> > + const char *alias = NULL;
> >
> > parent = mdev_get_parent(type->parent);
> > if (!parent)
> > return -EINVAL;
> >
> > + if (parent->ops->get_alias_length)
> > + alias_len = parent->ops->get_alias_length();
> > + if (alias_len) {
>
> Why isn't this nested into the branch above?
>
I will nest it. No specific reason to not nest it.
> > + alias = generate_alias(uuid_str, alias_len);
> > + if (!alias) {
> > + ret = -ENOMEM;
>
> Could use an ERR_PTR and propagate an errno.
>
generate_alias() only returns one error type ENOMEM.
When we add more error types, ERR_PTR is useful.
> > + goto alias_fail;
> > + }
> > + }
> > +
> > mutex_lock(&mdev_list_lock);
> >
> > /* Check for duplicate */
> > @@ -300,6 +385,8 @@ int mdev_device_create(struct kobject *kobj,
> > }
> >
> > guid_copy(&mdev->uuid, uuid);
> > + mdev->alias = alias;
> > + alias = NULL;
>
> A comment justifying this null'ing might help prevent it getting culled as
> some point. It appears arbitrary at first look. Thanks,
>
Ack. I will add it.
> Alex
^ permalink raw reply
* Re: [PATCH v5 net-next 02/18] ionic: Add hardware init and device commands
From: Jakub Kicinski @ 2019-08-27 4:24 UTC (permalink / raw)
To: Shannon Nelson; +Cc: netdev, davem
In-Reply-To: <20190826213339.56909-3-snelson@pensando.io>
On Mon, 26 Aug 2019 14:33:23 -0700, Shannon Nelson wrote:
> The ionic device has a small set of PCI registers, including a
> device control and data space, and a large set of message
> commands.
>
> Signed-off-by: Shannon Nelson <snelson@pensando.io>
> diff --git a/drivers/net/ethernet/pensando/ionic/Makefile b/drivers/net/ethernet/pensando/ionic/Makefile
> index f174e8f7bce1..a23d58519c63 100644
> --- a/drivers/net/ethernet/pensando/ionic/Makefile
> +++ b/drivers/net/ethernet/pensando/ionic/Makefile
> @@ -3,4 +3,5 @@
>
> obj-$(CONFIG_IONIC) := ionic.o
>
> -ionic-y := ionic_main.o ionic_bus_pci.o ionic_devlink.o
> +ionic-y := ionic_main.o ionic_bus_pci.o ionic_devlink.o ionic_dev.o \
> + ionic_debugfs.o
> diff --git a/drivers/net/ethernet/pensando/ionic/ionic.h b/drivers/net/ethernet/pensando/ionic/ionic.h
> index d40077161214..1f3c4a916849 100644
> --- a/drivers/net/ethernet/pensando/ionic/ionic.h
> +++ b/drivers/net/ethernet/pensando/ionic/ionic.h
> @@ -4,6 +4,10 @@
> #ifndef _IONIC_H_
> #define _IONIC_H_
>
> +#include "ionic_if.h"
> +#include "ionic_dev.h"
> +#include "ionic_devlink.h"
> +
> #define IONIC_DRV_NAME "ionic"
> #define IONIC_DRV_DESCRIPTION "Pensando Ethernet NIC Driver"
> #define IONIC_DRV_VERSION "0.15.0-k"
> @@ -17,10 +21,27 @@
> #define IONIC_SUBDEV_ID_NAPLES_100_4 0x4001
> #define IONIC_SUBDEV_ID_NAPLES_100_8 0x4002
>
> +#define devcmd_timeout 10
nit: upper case?
> struct ionic {
> struct pci_dev *pdev;
> struct device *dev;
> struct devlink *dl;
> + struct devlink_port dl_port;
devlink_port is not used in this patch
> + struct ionic_dev idev;
> + struct mutex dev_cmd_lock; /* lock for dev_cmd operations */
> + struct dentry *dentry;
> + struct ionic_dev_bar bars[IONIC_BARS_MAX];
> + unsigned int num_bars;
> + struct ionic_identity ident;
> };
> + err = ionic_init(ionic);
> + if (err) {
> + dev_err(dev, "Cannot init device: %d, aborting\n", err);
> + goto err_out_teardown;
> + }
> +
> + err = ionic_devlink_register(ionic);
> + if (err)
> + dev_err(dev, "Cannot register devlink: %d\n", err);
>
> return 0;
> +
> +err_out_teardown:
> + ionic_dev_teardown(ionic);
> +err_out_unmap_bars:
> + ionic_unmap_bars(ionic);
> + pci_release_regions(pdev);
> +err_out_pci_clear_master:
> + pci_clear_master(pdev);
> +err_out_pci_disable_device:
> + pci_disable_device(pdev);
> +err_out_debugfs_del_dev:
> + ionic_debugfs_del_dev(ionic);
> +err_out_clear_drvdata:
> + mutex_destroy(&ionic->dev_cmd_lock);
> + ionic_devlink_free(ionic);
> +
> + return err;
> }
>
> static void ionic_remove(struct pci_dev *pdev)
> {
> struct ionic *ionic = pci_get_drvdata(pdev);
>
> + if (!ionic)
How can this be NULL? Usually if this is NULL that means probe()
failed but 'err' was not set properly. Perhaps WARN_ON() here?
> + return;
> +
> + ionic_devlink_unregister(ionic);
> + ionic_reset(ionic);
> + ionic_dev_teardown(ionic);
> + ionic_unmap_bars(ionic);
> + pci_release_regions(pdev);
> + pci_clear_master(pdev);
> + pci_disable_device(pdev);
> + ionic_debugfs_del_dev(ionic);
> + mutex_destroy(&ionic->dev_cmd_lock);
> ionic_devlink_free(ionic);
> }
>
> diff --git a/drivers/net/ethernet/pensando/ionic/ionic_dev.h b/drivers/net/ethernet/pensando/ionic/ionic_dev.h
> new file mode 100644
> index 000000000000..30a5206bba4e
> --- /dev/null
> +++ b/drivers/net/ethernet/pensando/ionic/ionic_dev.h
> @@ -0,0 +1,143 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/* Copyright(c) 2017 - 2019 Pensando Systems, Inc */
> +
> +#ifndef _IONIC_DEV_H_
> +#define _IONIC_DEV_H_
> +
> +#include <linux/mutex.h>
> +#include <linux/workqueue.h>
> +
> +#include "ionic_if.h"
> +#include "ionic_regs.h"
> +
> +struct ionic_dev_bar {
> + void __iomem *vaddr;
> + phys_addr_t bus_addr;
> + unsigned long len;
> + int res_index;
> +};
> +
> +static inline void ionic_struct_size_checks(void)
> +{
> + /* Registers */
> + BUILD_BUG_ON(sizeof(struct ionic_intr) != 32);
> +
> + BUILD_BUG_ON(sizeof(struct ionic_doorbell) != 8);
> + BUILD_BUG_ON(sizeof(struct ionic_intr_status) != 8);
> +
> + BUILD_BUG_ON(sizeof(union ionic_dev_regs) != 4096);
> + BUILD_BUG_ON(sizeof(union ionic_dev_info_regs) != 2048);
> + BUILD_BUG_ON(sizeof(union ionic_dev_cmd_regs) != 2048);
> +
> + BUILD_BUG_ON(sizeof(struct ionic_lif_stats) != 1024);
> +
> + BUILD_BUG_ON(sizeof(struct ionic_admin_cmd) != 64);
> + BUILD_BUG_ON(sizeof(struct ionic_admin_comp) != 16);
> + BUILD_BUG_ON(sizeof(struct ionic_nop_cmd) != 64);
> + BUILD_BUG_ON(sizeof(struct ionic_nop_comp) != 16);
> +
> + /* Device commands */
> + BUILD_BUG_ON(sizeof(struct ionic_dev_identify_cmd) != 64);
> + BUILD_BUG_ON(sizeof(struct ionic_dev_identify_comp) != 16);
> + BUILD_BUG_ON(sizeof(struct ionic_dev_init_cmd) != 64);
> + BUILD_BUG_ON(sizeof(struct ionic_dev_init_comp) != 16);
> + BUILD_BUG_ON(sizeof(struct ionic_dev_reset_cmd) != 64);
> + BUILD_BUG_ON(sizeof(struct ionic_dev_reset_comp) != 16);
> + BUILD_BUG_ON(sizeof(struct ionic_dev_getattr_cmd) != 64);
> + BUILD_BUG_ON(sizeof(struct ionic_dev_getattr_comp) != 16);
> + BUILD_BUG_ON(sizeof(struct ionic_dev_setattr_cmd) != 64);
> + BUILD_BUG_ON(sizeof(struct ionic_dev_setattr_comp) != 16);
> +
> + /* Port commands */
> + BUILD_BUG_ON(sizeof(struct ionic_port_identify_cmd) != 64);
> + BUILD_BUG_ON(sizeof(struct ionic_port_identify_comp) != 16);
> + BUILD_BUG_ON(sizeof(struct ionic_port_init_cmd) != 64);
> + BUILD_BUG_ON(sizeof(struct ionic_port_init_comp) != 16);
> + BUILD_BUG_ON(sizeof(struct ionic_port_reset_cmd) != 64);
> + BUILD_BUG_ON(sizeof(struct ionic_port_reset_comp) != 16);
> + BUILD_BUG_ON(sizeof(struct ionic_port_getattr_cmd) != 64);
> + BUILD_BUG_ON(sizeof(struct ionic_port_getattr_comp) != 16);
> + BUILD_BUG_ON(sizeof(struct ionic_port_setattr_cmd) != 64);
> + BUILD_BUG_ON(sizeof(struct ionic_port_setattr_comp) != 16);
> +
> + /* LIF commands */
> + BUILD_BUG_ON(sizeof(struct ionic_lif_init_cmd) != 64);
> + BUILD_BUG_ON(sizeof(struct ionic_lif_init_comp) != 16);
> + BUILD_BUG_ON(sizeof(struct ionic_lif_reset_cmd) != 64);
> + BUILD_BUG_ON(sizeof(ionic_lif_reset_comp) != 16);
> + BUILD_BUG_ON(sizeof(struct ionic_lif_getattr_cmd) != 64);
> + BUILD_BUG_ON(sizeof(struct ionic_lif_getattr_comp) != 16);
> + BUILD_BUG_ON(sizeof(struct ionic_lif_setattr_cmd) != 64);
> + BUILD_BUG_ON(sizeof(struct ionic_lif_setattr_comp) != 16);
> +
> + BUILD_BUG_ON(sizeof(struct ionic_q_init_cmd) != 64);
> + BUILD_BUG_ON(sizeof(struct ionic_q_init_comp) != 16);
> + BUILD_BUG_ON(sizeof(struct ionic_q_control_cmd) != 64);
> + BUILD_BUG_ON(sizeof(ionic_q_control_comp) != 16);
> +
> + BUILD_BUG_ON(sizeof(struct ionic_rx_mode_set_cmd) != 64);
> + BUILD_BUG_ON(sizeof(ionic_rx_mode_set_comp) != 16);
> + BUILD_BUG_ON(sizeof(struct ionic_rx_filter_add_cmd) != 64);
> + BUILD_BUG_ON(sizeof(struct ionic_rx_filter_add_comp) != 16);
> + BUILD_BUG_ON(sizeof(struct ionic_rx_filter_del_cmd) != 64);
> + BUILD_BUG_ON(sizeof(ionic_rx_filter_del_comp) != 16);
> +
> + /* RDMA commands */
> + BUILD_BUG_ON(sizeof(struct ionic_rdma_reset_cmd) != 64);
> + BUILD_BUG_ON(sizeof(struct ionic_rdma_queue_cmd) != 64);
> +
> + /* Events */
> + BUILD_BUG_ON(sizeof(struct ionic_notifyq_cmd) != 4);
> + BUILD_BUG_ON(sizeof(union ionic_notifyq_comp) != 64);
> + BUILD_BUG_ON(sizeof(struct ionic_notifyq_event) != 64);
> + BUILD_BUG_ON(sizeof(struct ionic_link_change_event) != 64);
> + BUILD_BUG_ON(sizeof(struct ionic_reset_event) != 64);
> + BUILD_BUG_ON(sizeof(struct ionic_heartbeat_event) != 64);
> + BUILD_BUG_ON(sizeof(struct ionic_log_event) != 64);
> +
> + /* I/O */
> + BUILD_BUG_ON(sizeof(struct ionic_txq_desc) != 16);
> + BUILD_BUG_ON(sizeof(struct ionic_txq_sg_desc) != 128);
> + BUILD_BUG_ON(sizeof(struct ionic_txq_comp) != 16);
> +
> + BUILD_BUG_ON(sizeof(struct ionic_rxq_desc) != 16);
> + BUILD_BUG_ON(sizeof(struct ionic_rxq_sg_desc) != 128);
> + BUILD_BUG_ON(sizeof(struct ionic_rxq_comp) != 16);
static_assert() for all of those? That way you don't need this fake
function.
> +}
> +
> +struct ionic_devinfo {
> + u8 asic_type;
> + u8 asic_rev;
> + char fw_version[IONIC_DEVINFO_FWVERS_BUFLEN + 1];
> + char serial_num[IONIC_DEVINFO_SERIAL_BUFLEN + 1];
> +};
> +
> +struct ionic_dev {
> + union ionic_dev_info_regs __iomem *dev_info_regs;
> + union ionic_dev_cmd_regs __iomem *dev_cmd_regs;
> +
> + u64 __iomem *db_pages;
> + dma_addr_t phy_db_pages;
> +
> + struct ionic_intr __iomem *intr_ctrl;
> + u64 __iomem *intr_status;
> +
> + struct ionic_devinfo dev_info;
> +};
> +
> +struct ionic;
> +
> +void ionic_init_devinfo(struct ionic *ionic);
> +int ionic_dev_setup(struct ionic *ionic);
> +void ionic_dev_teardown(struct ionic *ionic);
> +
> +void ionic_dev_cmd_go(struct ionic_dev *idev, union ionic_dev_cmd *cmd);
> +u8 ionic_dev_cmd_status(struct ionic_dev *idev);
> +bool ionic_dev_cmd_done(struct ionic_dev *idev);
> +void ionic_dev_cmd_comp(struct ionic_dev *idev, union ionic_dev_cmd_comp *comp);
> +
> +void ionic_dev_cmd_identify(struct ionic_dev *idev, u8 ver);
> +void ionic_dev_cmd_init(struct ionic_dev *idev);
> +void ionic_dev_cmd_reset(struct ionic_dev *idev);
> +
> +#endif /* _IONIC_DEV_H_ */
> diff --git a/drivers/net/ethernet/pensando/ionic/ionic_devlink.c b/drivers/net/ethernet/pensando/ionic/ionic_devlink.c
> index e24ef6971cd5..1ca1e33cca04 100644
> --- a/drivers/net/ethernet/pensando/ionic/ionic_devlink.c
> +++ b/drivers/net/ethernet/pensando/ionic/ionic_devlink.c
> @@ -11,8 +11,28 @@
> static int ionic_dl_info_get(struct devlink *dl, struct devlink_info_req *req,
> struct netlink_ext_ack *extack)
> {
> + struct ionic *ionic = devlink_priv(dl);
> + struct ionic_dev *idev = &ionic->idev;
> + char buf[16];
> +
> devlink_info_driver_name_put(req, IONIC_DRV_NAME);
>
> + devlink_info_version_running_put(req,
> + DEVLINK_INFO_VERSION_GENERIC_FW_MGMT,
> + idev->dev_info.fw_version);
Are you sure this is not the FW that controls the data path?
> + snprintf(buf, sizeof(buf), "0x%x", idev->dev_info.asic_type);
> + devlink_info_version_fixed_put(req,
> + DEVLINK_INFO_VERSION_GENERIC_BOARD_ID,
> + buf);
Board ID is not ASIC. This is for identifying a board version with all
its components which surround the main ASIC.
> + snprintf(buf, sizeof(buf), "0x%x", idev->dev_info.asic_rev);
> + devlink_info_version_fixed_put(req,
> + DEVLINK_INFO_VERSION_GENERIC_BOARD_REV,
> + buf);
ditto
> + devlink_info_serial_number_put(req, idev->dev_info.serial_num);
> +
> return 0;
> }
>
> @@ -41,3 +61,22 @@ void ionic_devlink_free(struct ionic *ionic)
> {
> devlink_free(ionic->dl);
> }
> +
> +int ionic_devlink_register(struct ionic *ionic)
> +{
> + int err;
> +
> + err = devlink_register(ionic->dl, ionic->dev);
> + if (err)
> + dev_warn(ionic->dev, "devlink_register failed: %d\n", err);
> +
> + return err;
> +}
> +
> +void ionic_devlink_unregister(struct ionic *ionic)
> +{
> + if (!ionic || !ionic->dl)
> + return;
Impossiblu
> + devlink_unregister(ionic->dl);
> +}
> diff --git a/drivers/net/ethernet/pensando/ionic/ionic_devlink.h b/drivers/net/ethernet/pensando/ionic/ionic_devlink.h
> index 1df50874260a..0690172fc57a 100644
> --- a/drivers/net/ethernet/pensando/ionic/ionic_devlink.h
> +++ b/drivers/net/ethernet/pensando/ionic/ionic_devlink.h
> @@ -8,5 +8,7 @@
>
> struct ionic *ionic_devlink_alloc(struct device *dev);
> void ionic_devlink_free(struct ionic *ionic);
> +int ionic_devlink_register(struct ionic *ionic);
> +void ionic_devlink_unregister(struct ionic *ionic);
>
> #endif /* _IONIC_DEVLINK_H_ */
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox