Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH net-next v2 3/5] virtio_ring: add packed ring support
From: Michael S. Tsirkin @ 2018-11-07 17:48 UTC (permalink / raw)
  To: Tiwei Bie
  Cc: jasowang, virtualization, linux-kernel, netdev, virtio-dev, wexu,
	jfreimann
In-Reply-To: <20180711022711.7090-4-tiwei.bie@intel.com>

On Wed, Jul 11, 2018 at 10:27:09AM +0800, Tiwei Bie wrote:
> This commit introduces the support (without EVENT_IDX) for
> packed ring.
> 
> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> ---
>  drivers/virtio/virtio_ring.c | 495 ++++++++++++++++++++++++++++++++++-
>  1 file changed, 487 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index c4f8abc7445a..f317b485ba54 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -55,12 +55,21 @@
>  #define END_USE(vq)
>  #endif
>  
> +#define _VRING_DESC_F_AVAIL(b)	((__u16)(b) << 7)
> +#define _VRING_DESC_F_USED(b)	((__u16)(b) << 15)
> +
>  struct vring_desc_state {
>  	void *data;			/* Data for callback. */
>  	struct vring_desc *indir_desc;	/* Indirect descriptor, if any. */
>  };
>  
>  struct vring_desc_state_packed {
> +	void *data;			/* Data for callback. */
> +	struct vring_packed_desc *indir_desc; /* Indirect descriptor, if any. */
> +	int num;			/* Descriptor list length. */
> +	dma_addr_t addr;		/* Buffer DMA addr. */
> +	u32 len;			/* Buffer length. */
> +	u16 flags;			/* Descriptor flags. */
>  	int next;			/* The next desc state. */
>  };
>  
> @@ -660,7 +669,6 @@ static bool virtqueue_poll_split(struct virtqueue *_vq, unsigned last_used_idx)
>  {
>  	struct vring_virtqueue *vq = to_vvq(_vq);
>  
> -	virtio_mb(vq->weak_barriers);
>  	return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->vring.used->idx);
>  }
>  
> @@ -757,6 +765,72 @@ static inline unsigned vring_size_packed(unsigned int num, unsigned long align)
>  		& ~(align - 1)) + sizeof(struct vring_packed_desc_event) * 2;
>  }
>  
> +static void vring_unmap_state_packed(const struct vring_virtqueue *vq,
> +				     struct vring_desc_state_packed *state)
> +{
> +	u16 flags;
> +
> +	if (!vring_use_dma_api(vq->vq.vdev))
> +		return;
> +
> +	flags = state->flags;
> +
> +	if (flags & VRING_DESC_F_INDIRECT) {
> +		dma_unmap_single(vring_dma_dev(vq),
> +				 state->addr, state->len,
> +				 (flags & VRING_DESC_F_WRITE) ?
> +				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
> +	} else {
> +		dma_unmap_page(vring_dma_dev(vq),
> +			       state->addr, state->len,
> +			       (flags & VRING_DESC_F_WRITE) ?
> +			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
> +	}
> +}
> +
> +static void vring_unmap_desc_packed(const struct vring_virtqueue *vq,
> +				   struct vring_packed_desc *desc)
> +{
> +	u16 flags;
> +
> +	if (!vring_use_dma_api(vq->vq.vdev))
> +		return;
> +
> +	flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);

BTW this stuff is only used on error etc. Is there a way to
reuse vring_unmap_state_packed?

> +
> +	if (flags & VRING_DESC_F_INDIRECT) {
> +		dma_unmap_single(vring_dma_dev(vq),
> +				 virtio64_to_cpu(vq->vq.vdev, desc->addr),
> +				 virtio32_to_cpu(vq->vq.vdev, desc->len),
> +				 (flags & VRING_DESC_F_WRITE) ?
> +				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
> +	} else {
> +		dma_unmap_page(vring_dma_dev(vq),
> +			       virtio64_to_cpu(vq->vq.vdev, desc->addr),
> +			       virtio32_to_cpu(vq->vq.vdev, desc->len),
> +			       (flags & VRING_DESC_F_WRITE) ?
> +			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
> +	}
> +}
> +
> +static struct vring_packed_desc *alloc_indirect_packed(struct virtqueue *_vq,
> +						       unsigned int total_sg,
> +						       gfp_t gfp)
> +{
> +	struct vring_packed_desc *desc;
> +
> +	/*
> +	 * We require lowmem mappings for the descriptors because
> +	 * otherwise virt_to_phys will give us bogus addresses in the
> +	 * virtqueue.
> +	 */
> +	gfp &= ~__GFP_HIGHMEM;
> +
> +	desc = kmalloc(total_sg * sizeof(struct vring_packed_desc), gfp);
> +
> +	return desc;
> +}
> +
>  static inline int virtqueue_add_packed(struct virtqueue *_vq,
>  				       struct scatterlist *sgs[],
>  				       unsigned int total_sg,
> @@ -766,47 +840,449 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
>  				       void *ctx,
>  				       gfp_t gfp)
>  {
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	struct vring_packed_desc *desc;
> +	struct scatterlist *sg;
> +	unsigned int i, n, descs_used, uninitialized_var(prev), err_idx;
> +	__virtio16 uninitialized_var(head_flags), flags;
> +	u16 head, avail_wrap_counter, id, curr;
> +	bool indirect;
> +
> +	START_USE(vq);
> +
> +	BUG_ON(data == NULL);
> +	BUG_ON(ctx && vq->indirect);
> +
> +	if (unlikely(vq->broken)) {
> +		END_USE(vq);
> +		return -EIO;
> +	}
> +
> +#ifdef DEBUG
> +	{
> +		ktime_t now = ktime_get();
> +
> +		/* No kick or get, with .1 second between?  Warn. */
> +		if (vq->last_add_time_valid)
> +			WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
> +					    > 100);
> +		vq->last_add_time = now;
> +		vq->last_add_time_valid = true;
> +	}
> +#endif
> +
> +	BUG_ON(total_sg == 0);
> +
> +	head = vq->next_avail_idx;
> +	avail_wrap_counter = vq->avail_wrap_counter;
> +
> +	if (virtqueue_use_indirect(_vq, total_sg))
> +		desc = alloc_indirect_packed(_vq, total_sg, gfp);
> +	else {
> +		desc = NULL;
> +		WARN_ON_ONCE(total_sg > vq->vring_packed.num && !vq->indirect);
> +	}
> +
> +	if (desc) {
> +		/* Use a single buffer which doesn't continue */
> +		indirect = true;
> +		/* Set up rest to use this indirect table. */
> +		i = 0;
> +		descs_used = 1;
> +	} else {
> +		indirect = false;
> +		desc = vq->vring_packed.desc;
> +		i = head;
> +		descs_used = total_sg;
> +	}
> +
> +	if (vq->vq.num_free < descs_used) {
> +		pr_debug("Can't add buf len %i - avail = %i\n",
> +			 descs_used, vq->vq.num_free);
> +		/* FIXME: for historical reasons, we force a notify here if
> +		 * there are outgoing parts to the buffer.  Presumably the
> +		 * host should service the ring ASAP. */

I don't think we have a reason to do this for packed ring.
No historical baggage there, right?

> +		if (out_sgs)
> +			vq->notify(&vq->vq);
> +		if (indirect)
> +			kfree(desc);
> +		END_USE(vq);
> +		return -ENOSPC;
> +	}
> +
> +	id = vq->free_head;
> +	BUG_ON(id == vq->vring_packed.num);
> +
> +	curr = id;
> +	for (n = 0; n < out_sgs + in_sgs; n++) {
> +		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> +			dma_addr_t addr = vring_map_one_sg(vq, sg, n < out_sgs ?
> +					       DMA_TO_DEVICE : DMA_FROM_DEVICE);
> +			if (vring_mapping_error(vq, addr))
> +				goto unmap_release;
> +
> +			flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
> +				  (n < out_sgs ? 0 : VRING_DESC_F_WRITE) |
> +				  _VRING_DESC_F_AVAIL(vq->avail_wrap_counter) |
> +				  _VRING_DESC_F_USED(!vq->avail_wrap_counter));
> +			if (!indirect && i == head)
> +				head_flags = flags;
> +			else
> +				desc[i].flags = flags;
> +
> +			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
> +			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
> +			i++;
> +			if (!indirect) {
> +				if (vring_use_dma_api(_vq->vdev)) {
> +					vq->desc_state_packed[curr].addr = addr;
> +					vq->desc_state_packed[curr].len =
> +						sg->length;
> +					vq->desc_state_packed[curr].flags =
> +						virtio16_to_cpu(_vq->vdev,
> +								flags);
> +				}
> +				curr = vq->desc_state_packed[curr].next;
> +
> +				if (i >= vq->vring_packed.num) {
> +					i = 0;
> +					vq->avail_wrap_counter ^= 1;
> +				}
> +			}
> +		}
> +	}
> +
> +	prev = (i > 0 ? i : vq->vring_packed.num) - 1;
> +	desc[prev].id = cpu_to_virtio16(_vq->vdev, id);
> +
> +	/* Last one doesn't continue. */
> +	if (total_sg == 1)
> +		head_flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
> +	else
> +		desc[prev].flags &= cpu_to_virtio16(_vq->vdev,
> +						~VRING_DESC_F_NEXT);
> +
> +	if (indirect) {
> +		/* Now that the indirect table is filled in, map it. */
> +		dma_addr_t addr = vring_map_single(
> +			vq, desc, total_sg * sizeof(struct vring_packed_desc),
> +			DMA_TO_DEVICE);
> +		if (vring_mapping_error(vq, addr))
> +			goto unmap_release;
> +
> +		head_flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT |
> +				      _VRING_DESC_F_AVAIL(avail_wrap_counter) |
> +				      _VRING_DESC_F_USED(!avail_wrap_counter));
> +		vq->vring_packed.desc[head].addr = cpu_to_virtio64(_vq->vdev,
> +								   addr);
> +		vq->vring_packed.desc[head].len = cpu_to_virtio32(_vq->vdev,
> +				total_sg * sizeof(struct vring_packed_desc));
> +		vq->vring_packed.desc[head].id = cpu_to_virtio16(_vq->vdev, id);
> +
> +		if (vring_use_dma_api(_vq->vdev)) {
> +			vq->desc_state_packed[id].addr = addr;
> +			vq->desc_state_packed[id].len = total_sg *
> +					sizeof(struct vring_packed_desc);
> +			vq->desc_state_packed[id].flags =
> +					virtio16_to_cpu(_vq->vdev, head_flags);
> +		}
> +	}
> +
> +	/* We're using some buffers from the free list. */
> +	vq->vq.num_free -= descs_used;
> +
> +	/* Update free pointer */
> +	if (indirect) {
> +		n = head + 1;
> +		if (n >= vq->vring_packed.num) {
> +			n = 0;
> +			vq->avail_wrap_counter ^= 1;
> +		}
> +		vq->next_avail_idx = n;
> +		vq->free_head = vq->desc_state_packed[id].next;
> +	} else {
> +		vq->next_avail_idx = i;
> +		vq->free_head = curr;
> +	}
> +
> +	/* Store token and indirect buffer state. */
> +	vq->desc_state_packed[id].num = descs_used;
> +	vq->desc_state_packed[id].data = data;
> +	if (indirect)
> +		vq->desc_state_packed[id].indir_desc = desc;
> +	else
> +		vq->desc_state_packed[id].indir_desc = ctx;
> +
> +	/* A driver MUST NOT make the first descriptor in the list
> +	 * available before all subsequent descriptors comprising
> +	 * the list are made available. */
> +	virtio_wmb(vq->weak_barriers);
> +	vq->vring_packed.desc[head].flags = head_flags;
> +	vq->num_added += descs_used;
> +
> +	pr_debug("Added buffer head %i to %p\n", head, vq);
> +	END_USE(vq);
> +
> +	return 0;
> +
> +unmap_release:
> +	err_idx = i;
> +	i = head;
> +
> +	for (n = 0; n < total_sg; n++) {
> +		if (i == err_idx)
> +			break;
> +		vring_unmap_desc_packed(vq, &desc[i]);
> +		i++;
> +		if (!indirect && i >= vq->vring_packed.num)
> +			i = 0;
> +	}
> +
> +	vq->avail_wrap_counter = avail_wrap_counter;
> +
> +	if (indirect)
> +		kfree(desc);
> +
> +	END_USE(vq);
>  	return -EIO;
>  }
>  
>  static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
>  {
> -	return false;
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	u16 flags;
> +	bool needs_kick;
> +	u32 snapshot;
> +
> +	START_USE(vq);
> +	/* We need to expose the new flags value before checking notification
> +	 * suppressions. */
> +	virtio_mb(vq->weak_barriers);
> +
> +	snapshot = READ_ONCE(*(u32 *)vq->vring_packed.device);
> +	flags = virtio16_to_cpu(_vq->vdev, (__virtio16)(snapshot >> 16)) & 0x3;
> +
> +#ifdef DEBUG
> +	if (vq->last_add_time_valid) {
> +		WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
> +					      vq->last_add_time)) > 100);
> +	}
> +	vq->last_add_time_valid = false;
> +#endif
> +
> +	needs_kick = (flags != VRING_EVENT_F_DISABLE);
> +	END_USE(vq);
> +	return needs_kick;
> +}
> +
> +static void detach_buf_packed(struct vring_virtqueue *vq,
> +			      unsigned int id, void **ctx)
> +{
> +	struct vring_desc_state_packed *state = NULL;
> +	struct vring_packed_desc *desc;
> +	unsigned int curr, i;
> +
> +	/* Clear data ptr. */
> +	vq->desc_state_packed[id].data = NULL;
> +
> +	curr = id;
> +	for (i = 0; i < vq->desc_state_packed[id].num; i++) {
> +		state = &vq->desc_state_packed[curr];
> +		vring_unmap_state_packed(vq, state);
> +		curr = state->next;
> +	}
> +
> +	BUG_ON(state == NULL);
> +	vq->vq.num_free += vq->desc_state_packed[id].num;
> +	state->next = vq->free_head;
> +	vq->free_head = id;
> +
> +	if (vq->indirect) {
> +		u32 len;
> +
> +		/* Free the indirect table, if any, now that it's unmapped. */
> +		desc = vq->desc_state_packed[id].indir_desc;
> +		if (!desc)
> +			return;
> +
> +		if (vring_use_dma_api(vq->vq.vdev)) {
> +			len = vq->desc_state_packed[id].len;
> +			for (i = 0; i < len / sizeof(struct vring_packed_desc);
> +					i++)
> +				vring_unmap_desc_packed(vq, &desc[i]);
> +		}
> +		kfree(desc);
> +		vq->desc_state_packed[id].indir_desc = NULL;
> +	} else if (ctx) {
> +		*ctx = vq->desc_state_packed[id].indir_desc;
> +	}
> +}
> +
> +static inline bool is_used_desc_packed(const struct vring_virtqueue *vq,
> +				       u16 idx, bool used_wrap_counter)
> +{
> +	u16 flags;
> +	bool avail, used;
> +
> +	flags = virtio16_to_cpu(vq->vq.vdev,
> +				vq->vring_packed.desc[idx].flags);
> +	avail = !!(flags & VRING_DESC_F_AVAIL);
> +	used = !!(flags & VRING_DESC_F_USED);
> +
> +	return avail == used && used == used_wrap_counter;
>  }
>  
>  static inline bool more_used_packed(const struct vring_virtqueue *vq)
>  {
> -	return false;
> +	return is_used_desc_packed(vq, vq->last_used_idx,
> +			vq->used_wrap_counter);
>  }
>  
>  static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
>  					  unsigned int *len,
>  					  void **ctx)
>  {
> -	return NULL;
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	u16 last_used, id;
> +	void *ret;
> +
> +	START_USE(vq);
> +
> +	if (unlikely(vq->broken)) {
> +		END_USE(vq);
> +		return NULL;
> +	}
> +
> +	if (!more_used_packed(vq)) {
> +		pr_debug("No more buffers in queue\n");
> +		END_USE(vq);
> +		return NULL;
> +	}
> +
> +	/* Only get used elements after they have been exposed by host. */
> +	virtio_rmb(vq->weak_barriers);
> +
> +	last_used = vq->last_used_idx;
> +	id = virtio16_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].id);
> +	*len = virtio32_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].len);
> +
> +	if (unlikely(id >= vq->vring_packed.num)) {
> +		BAD_RING(vq, "id %u out of range\n", id);
> +		return NULL;
> +	}
> +	if (unlikely(!vq->desc_state_packed[id].data)) {
> +		BAD_RING(vq, "id %u is not a head!\n", id);
> +		return NULL;
> +	}
> +
> +	vq->last_used_idx += vq->desc_state_packed[id].num;
> +	if (vq->last_used_idx >= vq->vring_packed.num) {
> +		vq->last_used_idx -= vq->vring_packed.num;
> +		vq->used_wrap_counter ^= 1;
> +	}
> +
> +	/* detach_buf_packed clears data, so grab it now. */
> +	ret = vq->desc_state_packed[id].data;
> +	detach_buf_packed(vq, id, ctx);
> +
> +#ifdef DEBUG
> +	vq->last_add_time_valid = false;
> +#endif
> +
> +	END_USE(vq);
> +	return ret;
>  }
>  
>  static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
>  {
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	if (vq->event_flags_shadow != VRING_EVENT_F_DISABLE) {
> +		vq->event_flags_shadow = VRING_EVENT_F_DISABLE;
> +		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> +							vq->event_flags_shadow);
> +	}
>  }
>  
>  static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
>  {
> -	return 0;
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	START_USE(vq);
> +
> +	/* We optimistically turn back on interrupts, then check if there was
> +	 * more to do. */
> +
> +	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> +		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> +		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> +							vq->event_flags_shadow);
> +	}
> +
> +	END_USE(vq);
> +	return vq->last_used_idx | ((u16)vq->used_wrap_counter << 15);
>  }
>  
> -static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
> +static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned off_wrap)
>  {
> -	return false;
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	bool wrap_counter;
> +	u16 used_idx;
> +
> +	wrap_counter = off_wrap >> 15;
> +	used_idx = off_wrap & ~(1 << 15);
> +
> +	return is_used_desc_packed(vq, used_idx, wrap_counter);
>  }
>  
>  static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
>  {
> -	return false;
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	START_USE(vq);
> +
> +	/* We optimistically turn back on interrupts, then check if there was
> +	 * more to do. */
> +
> +	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> +		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> +		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> +							vq->event_flags_shadow);
> +		/* We need to enable interrupts first before re-checking
> +		 * for more used buffers. */
> +		virtio_mb(vq->weak_barriers);
> +	}
> +
> +	if (more_used_packed(vq)) {
> +		END_USE(vq);
> +		return false;
> +	}
> +
> +	END_USE(vq);
> +	return true;
>  }
>  
>  static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq)
>  {
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	unsigned int i;
> +	void *buf;
> +
> +	START_USE(vq);
> +
> +	for (i = 0; i < vq->vring_packed.num; i++) {
> +		if (!vq->desc_state_packed[i].data)
> +			continue;
> +		/* detach_buf clears data, so grab it now. */
> +		buf = vq->desc_state_packed[i].data;
> +		detach_buf_packed(vq, i, NULL);
> +		END_USE(vq);
> +		return buf;
> +	}
> +	/* That should have freed everything. */
> +	BUG_ON(vq->vq.num_free != vq->vring_packed.num);
> +
> +	END_USE(vq);
>  	return NULL;
>  }
>  
> @@ -1083,6 +1559,9 @@ bool virtqueue_poll(struct virtqueue *_vq, unsigned last_used_idx)
>  {
>  	struct vring_virtqueue *vq = to_vvq(_vq);
>  
> +	/* We need to enable interrupts first before re-checking
> +	 * for more used buffers. */
> +	virtio_mb(vq->weak_barriers);
>  	return vq->packed ? virtqueue_poll_packed(_vq, last_used_idx) :
>  			    virtqueue_poll_split(_vq, last_used_idx);
>  }
> -- 
> 2.18.0

^ permalink raw reply

* [PATCH] net/wan/fsl_ucc_hdlc: add BQL support
From: Mathias Thore @ 2018-11-07  8:09 UTC (permalink / raw)
  To: qiang.zhao, linuxppc-dev, netdev, joakim.tjernlund,
	david.gounaris
  Cc: Mathias Thore

Add byte queue limits support in the fsl_ucc_hdlc driver.

Signed-off-by: Mathias Thore <mathias.thore@infinera.com>
---
 drivers/net/wan/fsl_ucc_hdlc.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c
index 4d6409605207..7a42336c8af8 100644
--- a/drivers/net/wan/fsl_ucc_hdlc.c
+++ b/drivers/net/wan/fsl_ucc_hdlc.c
@@ -391,6 +391,7 @@ static netdev_tx_t ucc_hdlc_tx(struct sk_buff *skb, struct net_device *dev)
 		dev_kfree_skb(skb);
 		return -ENOMEM;
 	}
+	netdev_sent_queue(dev, skb->len);
 	spin_lock_irqsave(&priv->lock, flags);
 
 	/* Start from the next BD that should be filled */
@@ -447,6 +448,8 @@ static int hdlc_tx_done(struct ucc_hdlc_private *priv)
 {
 	/* Start from the next BD that should be filled */
 	struct net_device *dev = priv->ndev;
+	unsigned int bytes_sent = 0;
+	int howmany = 0;
 	struct qe_bd *bd;		/* BD pointer */
 	u16 bd_status;
 	int tx_restart = 0;
@@ -474,6 +477,8 @@ static int hdlc_tx_done(struct ucc_hdlc_private *priv)
 		skb = priv->tx_skbuff[priv->skb_dirtytx];
 		if (!skb)
 			break;
+		howmany++;
+		bytes_sent += skb->len;
 		dev->stats.tx_packets++;
 		memset(priv->tx_buffer +
 		       (be32_to_cpu(bd->buf) - priv->dma_tx_addr),
@@ -501,6 +506,7 @@ static int hdlc_tx_done(struct ucc_hdlc_private *priv)
 	if (tx_restart)
 		hdlc_tx_restart(priv);
 
+	netdev_completed_queue(dev, howmany, bytes_sent);
 	return 0;
 }
 
@@ -721,6 +727,7 @@ static int uhdlc_open(struct net_device *dev)
 		priv->hdlc_busy = 1;
 		netif_device_attach(priv->ndev);
 		napi_enable(&priv->napi);
+		netdev_reset_queue(dev);
 		netif_start_queue(dev);
 		hdlc_open(dev);
 	}
@@ -812,6 +819,7 @@ static int uhdlc_close(struct net_device *dev)
 
 	free_irq(priv->ut_info->uf_info.irq, priv);
 	netif_stop_queue(dev);
+	netdev_reset_queue(dev);
 	priv->hdlc_busy = 0;
 
 	return 0;
-- 
2.18.1

^ permalink raw reply related

* [PATCH net-next] net: phy: realtek: load driver for all PHYs with a Realtek OUI
From: Heiner Kallweit @ 2018-11-07  7:52 UTC (permalink / raw)
  To: Florian Fainelli, Andrew Lunn, David Miller; +Cc: netdev@vger.kernel.org

Instead of listing every single PHYID, load the driver for every PHYID
with a Realtek OUI, independent of model number and revision.

This patch also improves two further aspects:
- constify realtek_tbl[]
- the mask should have been 0xffffffff instead of 0x001fffff so far;
  by masking out some bits, a PHY from another vendor could have been
  matched

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
 drivers/net/phy/realtek.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c
index 271e8adc3..7b1c89b38 100644
--- a/drivers/net/phy/realtek.c
+++ b/drivers/net/phy/realtek.c
@@ -305,15 +305,8 @@ static struct phy_driver realtek_drvs[] = {
 
 module_phy_driver(realtek_drvs);
 
-static struct mdio_device_id __maybe_unused realtek_tbl[] = {
-	{ 0x001cc816, 0x001fffff },
-	{ 0x001cc910, 0x001fffff },
-	{ 0x001cc912, 0x001fffff },
-	{ 0x001cc913, 0x001fffff },
-	{ 0x001cc914, 0x001fffff },
-	{ 0x001cc915, 0x001fffff },
-	{ 0x001cc916, 0x001fffff },
-	{ 0x001cc961, 0x001fffff },
+static const struct mdio_device_id __maybe_unused realtek_tbl[] = {
+	{ 0x001cc800, GENMASK(31, 10) },
 	{ }
 };
 
-- 
2.19.1

^ permalink raw reply related

* Re: [PATCH rdma] net/mlx5: Fix XRC SRQ umem valid bits
From: Leon Romanovsky @ 2018-11-07  7:34 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Doug Ledford, Yishai Hadas, RDMA mailing list, Artemy Kovalyov,
	Saeed Mahameed, linux-netdev
In-Reply-To: <20181106221153.GG30476@ziepe.ca>

[-- Attachment #1: Type: text/plain, Size: 2042 bytes --]

On Tue, Nov 06, 2018 at 03:11:53PM -0700, Jason Gunthorpe wrote:
> On Tue, Nov 06, 2018 at 05:10:53PM -0500, Doug Ledford wrote:
> > On Tue, 2018-11-06 at 22:02 +0000, Jason Gunthorpe wrote:
> > > On Tue, Nov 06, 2018 at 04:31:08PM -0500, Doug Ledford wrote:
> > > > On Wed, 2018-10-31 at 12:20 +0200, Leon Romanovsky wrote:
> > > > > From: Yishai Hadas <yishaih@mellanox.com>
> > > > >
> > > > > Adapt XRC SRQ to the latest HW specification with fixed definition
> > > > > around umem valid bits. The previous definition relied on a bit which
> > > > > was taken for other purposes in legacy FW.
> > > > >
> > > > > Fixes: bd37197554eb ("net/mlx5: Update mlx5_ifc with DEVX UID bits")
> > > > > Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
> > > > > Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
> > > > > Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
> > > > > Hi Doug, Jason
> > > > >
> > > > > This commit fixes code sent in this merge window, so I'm not marking it
> > > > > with any rdma-rc/rdma-next. It will be better to be sent during this merge
> > > > > window if you have extra pull request to issue, or as a -rc material, if
> > > > > not.
> > > > >
> > > > > BTW, we didn't combine reserved fields, because our convention is to align such
> > > > > fields to 32 bits for better readability.
> > > > >
> > > > > Thanks
> > > >
> > > > This looks fine.  Let me know when it's in the mlx5-next tree to pull.
> > >
> > > It needs to go to -rc...
> > >
> > > This needs a mlx5-rc branch for this I guess?
> >
> > I don't think so.  As long as it's the first commit in mlx5-next, and
> > mlx5-next is 4.20-rc1 based, then pulling this commit into the -rc tree
> > will only pull the single commit.  Then when we pull into for-next for
> > the first time, we will get this in for-next too.  That seems best to
> > me.
>
> That works too, if Leon is fast :)

Thank you both for the suggestion.

I did it.
99b77fef3c6c net/mlx5: Fix XRC SRQ umem valid bits

It is the first commit and it is based on -rc1.

Thanks

>
> Jason

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* [PATCH] [stable, netdev 4.4+] lan78xx: make sure RX_ADDRL & RX_ADDRH regs are always up to date
From: Paolo Pisati @ 2018-11-07 16:50 UTC (permalink / raw)
  To: Woojung Huh, Microchip Linux Driver Support, netdev, stable
  Cc: linux-usb, linux-kernel

[partial backport upstream 760db29bdc97b73ff60b091315ad787b1deb5cf5]

Upon invocation, lan78xx_init_mac_address() checks that the mac address present
in the RX_ADDRL & RX_ADDRH registers is a valid address; if not, it first tries
to read a new address from an external eeprom or the otp area, and in case both
reads fail (or the address read back is invalid), it randomly generates a new
one.

Unfortunately, due to the way the above logic is laid out,
if both read_eeprom() and read_otp() fail, a new mac address is correctly
generated but is never written back to RX_ADDRL & RX_ADDRH, leaving the chip in an
inconsistent state and with an invalid mac address (e.g. the nic appears to be
completely dead, and doesn't receive any packet, etc):

lan78xx_init_mac_address()
...
if (lan78xx_read_eeprom(addr ...) || lan78xx_read_otp(addr ...)) {
	if (is_valid_ether_addr(addr)) {
		// nop...
	} else {
		random_ether_addr(addr);
	}

	// correctly writes back the new address
	lan78xx_write_reg(RX_ADDRL, addr ...);
	lan78xx_write_reg(RX_ADDRH, addr ...);
} else {
	// XXX if both eeprom and otp read fail, we land here and skip
	// XXX the RX_ADDRL & RX_ADDRH update completely
	random_ether_addr(addr);
}

This bug went unnoticed because lan78xx_read_otp() was buggy itself and would
never fail, up until 4bfc338 "lan78xx: Correctly indicate invalid OTP"
fixed it and as a side effect uncovered this bug.

4.18+ is fine, since the bug was implicitly fixed in 760db29 "lan78xx: Read MAC
address from DT if present" when the address change logic was reorganized, but
it's still present in all stable trees below that: linux-4.4.y, linux-4.9.y,
linux-4.14.y, etc up to linux-4.18.y (not included).

Signed-off-by: Paolo Pisati <p.pisati@gmail.com>
---
 drivers/net/usb/lan78xx.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index 50e2e10a..114dc55 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -1660,13 +1660,6 @@ static void lan78xx_init_mac_address(struct lan78xx_net *dev)
 				netif_dbg(dev, ifup, dev->net,
 					  "MAC address set to random addr");
 			}
-
-			addr_lo = addr[0] | (addr[1] << 8) |
-				  (addr[2] << 16) | (addr[3] << 24);
-			addr_hi = addr[4] | (addr[5] << 8);
-
-			ret = lan78xx_write_reg(dev, RX_ADDRL, addr_lo);
-			ret = lan78xx_write_reg(dev, RX_ADDRH, addr_hi);
 		} else {
 			/* generate random MAC */
 			random_ether_addr(addr);
@@ -1674,6 +1667,11 @@ static void lan78xx_init_mac_address(struct lan78xx_net *dev)
 				  "MAC address set to random addr");
 		}
 	}
+	addr_lo = addr[0] | (addr[1] << 8) | (addr[2] << 16) | (addr[3] << 24);
+	addr_hi = addr[4] | (addr[5] << 8);
+
+	ret = lan78xx_write_reg(dev, RX_ADDRL, addr_lo);
+	ret = lan78xx_write_reg(dev, RX_ADDRH, addr_hi);

 	ret = lan78xx_write_reg(dev, MAF_LO(0), addr_lo);
 	ret = lan78xx_write_reg(dev, MAF_HI(0), addr_hi | MAF_HI_VALID_);
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next] net: phy: make phy_trigger_machine static
From: Heiner Kallweit @ 2018-11-07  7:15 UTC (permalink / raw)
  To: Florian Fainelli, Andrew Lunn, David Miller; +Cc: netdev@vger.kernel.org

phy_trigger_machine() is used in phy.c only, so we can make it static.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
 drivers/net/phy/phy.c | 33 ++++++++++++---------------------
 include/linux/phy.h   |  1 -
 2 files changed, 12 insertions(+), 22 deletions(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 1d73ac330..476578746 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -467,6 +467,18 @@ int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd)
 }
 EXPORT_SYMBOL(phy_mii_ioctl);
 
+static void phy_queue_state_machine(struct phy_device *phydev,
+				    unsigned int secs)
+{
+	mod_delayed_work(system_power_efficient_wq, &phydev->state_queue,
+			 secs * HZ);
+}
+
+static void phy_trigger_machine(struct phy_device *phydev)
+{
+	phy_queue_state_machine(phydev, 0);
+}
+
 static int phy_config_aneg(struct phy_device *phydev)
 {
 	if (phydev->drv->config_aneg)
@@ -620,13 +632,6 @@ int phy_speed_up(struct phy_device *phydev)
 }
 EXPORT_SYMBOL_GPL(phy_speed_up);
 
-static void phy_queue_state_machine(struct phy_device *phydev,
-				    unsigned int secs)
-{
-	mod_delayed_work(system_power_efficient_wq, &phydev->state_queue,
-			 secs * HZ);
-}
-
 /**
  * phy_start_machine - start PHY state machine tracking
  * @phydev: the phy_device struct
@@ -643,20 +648,6 @@ void phy_start_machine(struct phy_device *phydev)
 }
 EXPORT_SYMBOL_GPL(phy_start_machine);
 
-/**
- * phy_trigger_machine - trigger the state machine to run
- *
- * @phydev: the phy_device struct
- *
- * Description: There has been a change in state which requires that the
- *   state machine runs.
- */
-
-void phy_trigger_machine(struct phy_device *phydev)
-{
-	phy_queue_state_machine(phydev, 0);
-}
-
 /**
  * phy_stop_machine - stop the PHY state machine tracking
  * @phydev: target phy_device struct
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 3ea87f774..9e4d49ef4 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1054,7 +1054,6 @@ void phy_change_work(struct work_struct *work);
 void phy_mac_interrupt(struct phy_device *phydev);
 void phy_start_machine(struct phy_device *phydev);
 void phy_stop_machine(struct phy_device *phydev);
-void phy_trigger_machine(struct phy_device *phydev);
 int phy_ethtool_sset(struct phy_device *phydev, struct ethtool_cmd *cmd);
 void phy_ethtool_ksettings_get(struct phy_device *phydev,
 			       struct ethtool_link_ksettings *cmd);
-- 
2.19.1

^ permalink raw reply related

* Re: [PATCH 2/5] VSOCK: support fill data to mergeable rx buffer in host
From: jiangyiwen @ 2018-11-07  7:11 UTC (permalink / raw)
  To: Jason Wang, stefanha, stefanha; +Cc: netdev, kvm, virtualization
In-Reply-To: <8963dba5-c2fb-69d0-4adb-72d751a9dedf@redhat.com>

On 2018/11/7 14:18, Jason Wang wrote:
> 
> On 2018/11/6 下午2:30, jiangyiwen wrote:
>>> Seems duplicated with the one used by vhost-net.
>>>
>>> In packed virtqueue implementation, I plan to move this to vhost.c.
>>>
>> Yes, this code is full copied from vhost-net, if it can be packed into
>> vhost.c, it would be great.
>>
> 
> If you try to reuse vhost-net, you don't even need to care about this :)
> 
> Thanks
> 
> 
> .
> 

Hi Jason,

Thank you for your advice, I will consider your idea. But I don't know
what Stefan's suggestion is. It seems that he doesn't care much
about this community. :(

I still hope this community can have some vitality.

^ permalink raw reply

* Re: [PATCH 3/5] VSOCK: support receive mergeable rx buffer in guest
From: jiangyiwen @ 2018-11-07  7:07 UTC (permalink / raw)
  To: Jason Wang, stefanha; +Cc: netdev, kvm, virtualization
In-Reply-To: <1b67b8ef-5dcc-0383-1b32-d80f294533d3@redhat.com>

On 2018/11/7 14:20, Jason Wang wrote:
> 
> On 2018/11/6 下午2:41, jiangyiwen wrote:
>> On 2018/11/6 12:00, Jason Wang wrote:
>>> On 2018/11/5 下午3:47, jiangyiwen wrote:
>>>> Guest receive mergeable rx buffer, it can merge
>>>> scatter rx buffer into a big buffer and then copy
>>>> to user space.
>>>>
>>>> Signed-off-by: Yiwen Jiang<jiangyiwen@huawei.com>
>>>> ---
>>>>    include/linux/virtio_vsock.h            |  9 ++++
>>>>    net/vmw_vsock/virtio_transport.c        | 75 +++++++++++++++++++++++++++++----
>>>>    net/vmw_vsock/virtio_transport_common.c | 59 ++++++++++++++++++++++----
>>>>    3 files changed, 127 insertions(+), 16 deletions(-)
>>>>
>>>> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
>>>> index da9e1fe..6be3cd7 100644
>>>> --- a/include/linux/virtio_vsock.h
>>>> +++ b/include/linux/virtio_vsock.h
>>>> @@ -13,6 +13,8 @@
>>>>    #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE    (1024 * 4)
>>>>    #define VIRTIO_VSOCK_MAX_BUF_SIZE        0xFFFFFFFFUL
>>>>    #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE        (1024 * 64)
>>>> +/* virtio_vsock_pkt + max_pkt_len(default MAX_PKT_BUF_SIZE) */
>>>> +#define VIRTIO_VSOCK_MAX_MRG_BUF_NUM ((VIRTIO_VSOCK_MAX_PKT_BUF_SIZE / PAGE_SIZE) + 1)
>>>>
>>>>    /* Virtio-vsock feature */
>>>>    #define VIRTIO_VSOCK_F_MRG_RXBUF 0 /* Host can merge receive buffers. */
>>>> @@ -48,6 +50,11 @@ struct virtio_vsock_sock {
>>>>        struct list_head rx_queue;
>>>>    };
>>>>
>>>> +struct virtio_vsock_mrg_rxbuf {
>>>> +    void *buf;
>>>> +    u32 len;
>>>> +};
>>>> +
>>>>    struct virtio_vsock_pkt {
>>>>        struct virtio_vsock_hdr    hdr;
>>>>        struct virtio_vsock_mrg_rxbuf_hdr mrg_rxbuf_hdr;
>>>> @@ -59,6 +66,8 @@ struct virtio_vsock_pkt {
>>>>        u32 len;
>>>>        u32 off;
>>>>        bool reply;
>>>> +    bool mergeable;
>>>> +    struct virtio_vsock_mrg_rxbuf mrg_rxbuf[VIRTIO_VSOCK_MAX_MRG_BUF_NUM];
>>>>    };
>>> It's better to use iov here I think, and drop buf completely.
>>>
>>> And this is better to be done in an independent patch.
>>>
>> You're right, I can use kvec instead of customized structure,
>> in addition, I don't understand about drop buf completely and
>> an independent patch.
> 
> 
> I mean there a void *buf in struct virtio_vsock_pkt. You can drop it and switch to use iov(iter) or other data structure that supports sg.
> 
> Thanks
> 

Yes, I understand your idea, but I don't want to modify the tx processing
method, so I keep the buf.

> 
>>
>> Thanks.
>>
> 
> .
> 

^ permalink raw reply

* Re: RFC: changed error code when binding unix socket twice
From: Petr Vorel @ 2018-11-07 15:56 UTC (permalink / raw)
  To: gregkh, David Miller
  Cc: mkubecek, Networking, Cong Wang, rweikusat,
	Linux Kernel Mailing List, ltp, Cyril Hrubis, junchi.chen,
	Dmitry Vyukov, Naresh Kamboju, Arnd Bergmann
In-Reply-To: <CAK8P3a1q32spcF445Zhw-KMXG2VwFZuMw5C1sYFk3qLXz3HB5w@mail.gmail.com>

Hi

> I forgot that 4.1 has ended a while ago. Greg also sometimes still takes patches
> for 3.18, so that might be a candidate aside from 3.18

Gregkh, David, does it make sense to you to merge commit 0fb44559ffd6 ("af_unix:
move unix_mknod() out of bindlock") to 3.18? If yes, please do so.


> > I guess we need to adjust LTP test to accept either return code as EOL longterm
> > branches probably will not take this patch.

> I'd argue that if we decide that EADDRINUSE is the intended return value,
> it would be appropriate for LTP to warn about kernels that never got the
> backport.

> The alternative would be to not backport the patch further, and then change LTP
> to no longer warn. Note that the bug that got fixed by the 0fb44559ffd6 patch
> is probably more important than the return code, so I would say
> we want the patch backported to anything that people still run anyway,
> especially if they are running LTP to make sure it works correctly.

>         Arnd

Kind regards,
Petr

^ permalink raw reply

* Re: [PATCH 3/5] VSOCK: support receive mergeable rx buffer in guest
From: Jason Wang @ 2018-11-07  6:20 UTC (permalink / raw)
  To: jiangyiwen, stefanha; +Cc: netdev, kvm, virtualization
In-Reply-To: <5BE137B2.5040305@huawei.com>


On 2018/11/6 下午2:41, jiangyiwen wrote:
> On 2018/11/6 12:00, Jason Wang wrote:
>> On 2018/11/5 下午3:47, jiangyiwen wrote:
>>> Guest receive mergeable rx buffer, it can merge
>>> scatter rx buffer into a big buffer and then copy
>>> to user space.
>>>
>>> Signed-off-by: Yiwen Jiang<jiangyiwen@huawei.com>
>>> ---
>>>    include/linux/virtio_vsock.h            |  9 ++++
>>>    net/vmw_vsock/virtio_transport.c        | 75 +++++++++++++++++++++++++++++----
>>>    net/vmw_vsock/virtio_transport_common.c | 59 ++++++++++++++++++++++----
>>>    3 files changed, 127 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
>>> index da9e1fe..6be3cd7 100644
>>> --- a/include/linux/virtio_vsock.h
>>> +++ b/include/linux/virtio_vsock.h
>>> @@ -13,6 +13,8 @@
>>>    #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE    (1024 * 4)
>>>    #define VIRTIO_VSOCK_MAX_BUF_SIZE        0xFFFFFFFFUL
>>>    #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE        (1024 * 64)
>>> +/* virtio_vsock_pkt + max_pkt_len(default MAX_PKT_BUF_SIZE) */
>>> +#define VIRTIO_VSOCK_MAX_MRG_BUF_NUM ((VIRTIO_VSOCK_MAX_PKT_BUF_SIZE / PAGE_SIZE) + 1)
>>>
>>>    /* Virtio-vsock feature */
>>>    #define VIRTIO_VSOCK_F_MRG_RXBUF 0 /* Host can merge receive buffers. */
>>> @@ -48,6 +50,11 @@ struct virtio_vsock_sock {
>>>        struct list_head rx_queue;
>>>    };
>>>
>>> +struct virtio_vsock_mrg_rxbuf {
>>> +    void *buf;
>>> +    u32 len;
>>> +};
>>> +
>>>    struct virtio_vsock_pkt {
>>>        struct virtio_vsock_hdr    hdr;
>>>        struct virtio_vsock_mrg_rxbuf_hdr mrg_rxbuf_hdr;
>>> @@ -59,6 +66,8 @@ struct virtio_vsock_pkt {
>>>        u32 len;
>>>        u32 off;
>>>        bool reply;
>>> +    bool mergeable;
>>> +    struct virtio_vsock_mrg_rxbuf mrg_rxbuf[VIRTIO_VSOCK_MAX_MRG_BUF_NUM];
>>>    };
>> It's better to use iov here I think, and drop buf completely.
>>
>> And this is better to be done in an independent patch.
>>
> You're right, I can use kvec instead of customized structure,
> in addition, I don't understand about drop buf completely and
> an independent patch.


I mean there is a void *buf in struct virtio_vsock_pkt. You can drop it and
switch to using iov(iter) or another data structure that supports sg.

Thanks


>
> Thanks.
>

^ permalink raw reply

* Re: [PATCH 2/5] VSOCK: support fill data to mergeable rx buffer in host
From: Jason Wang @ 2018-11-07  6:18 UTC (permalink / raw)
  To: jiangyiwen, stefanha; +Cc: netdev, kvm, virtualization
In-Reply-To: <5BE134EF.1070009@huawei.com>


On 2018/11/6 下午2:30, jiangyiwen wrote:
>> Seems duplicated with the one used by vhost-net.
>>
>> In packed virtqueue implementation, I plan to move this to vhost.c.
>>
> Yes, this code is full copied from vhost-net, if it can be packed into
> vhost.c, it would be great.
>

If you try to reuse vhost-net, you don't even need to care about this :)

Thanks

^ permalink raw reply

* Re: [PATCH 1/5] VSOCK: support fill mergeable rx buffer in guest
From: Jason Wang @ 2018-11-07  6:17 UTC (permalink / raw)
  To: jiangyiwen, stefanha; +Cc: netdev, kvm, virtualization
In-Reply-To: <5BE13331.7050901@huawei.com>


On 2018/11/6 下午2:22, jiangyiwen wrote:
> On 2018/11/6 11:38, Jason Wang wrote:
>> On 2018/11/5 下午3:45, jiangyiwen wrote:
>>> In driver probing, if virtio has VIRTIO_VSOCK_F_MRG_RXBUF feature,
>>> it will fill mergeable rx buffer, support for host send mergeable
>>> rx buffer. It will fill a page everytime to compact with small
>>> packet and big packet.
>>>
>>> Signed-off-by: Yiwen Jiang<jiangyiwen@huawei.com>
>>> ---
>>>    include/linux/virtio_vsock.h     |  3 ++
>>>    net/vmw_vsock/virtio_transport.c | 72 +++++++++++++++++++++++++++++-----------
>>>    2 files changed, 56 insertions(+), 19 deletions(-)
>>>
>>> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
>>> index e223e26..bf84418 100644
>>> --- a/include/linux/virtio_vsock.h
>>> +++ b/include/linux/virtio_vsock.h
>>> @@ -14,6 +14,9 @@
>>>    #define VIRTIO_VSOCK_MAX_BUF_SIZE        0xFFFFFFFFUL
>>>    #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE        (1024 * 64)
>>>
>>> +/* Virtio-vsock feature */
>>> +#define VIRTIO_VSOCK_F_MRG_RXBUF 0 /* Host can merge receive buffers. */
>>> +
>>>    enum {
>>>        VSOCK_VQ_RX     = 0, /* for host to guest data */
>>>        VSOCK_VQ_TX     = 1, /* for guest to host data */
>>> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
>>> index 5d3cce9..2040a9e 100644
>>> --- a/net/vmw_vsock/virtio_transport.c
>>> +++ b/net/vmw_vsock/virtio_transport.c
>>> @@ -64,6 +64,7 @@ struct virtio_vsock {
>>>        struct virtio_vsock_event event_list[8];
>>>
>>>        u32 guest_cid;
>>> +    bool mergeable;
>>>    };
>>>
>>>    static struct virtio_vsock *virtio_vsock_get(void)
>>> @@ -256,6 +257,25 @@ static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock,
>>>        return 0;
>>>    }
>>>
>>> +static int fill_mergeable_rx_buff(struct virtqueue *vq)
>>> +{
>>> +    void *page = NULL;
>>> +    struct scatterlist sg;
>>> +    int err;
>>> +
>>> +    page = (void *)get_zeroed_page(GFP_KERNEL);
>> Any reason to use zeroed page?
> In previous version, the entire structure of virtio_vsock_pkt is preallocated
> in guest use kzalloc, it is a contiguous zeroed physical memory, but host only
> need to fill virtio_vsock_hdr size.
>
> However, in mergeable rx buffer version, we only fill a page in vring descriptor
> in guest, and I will reserve size of virtio_vsock_pkt in host instead of write
> the total size of virtio_vsock_pkt, for the correctness of structure value,
> we should set zeroed page in advance.


I may be missing something, but it looks to me like only the header needs to be zeroed.


>
>>> +    if (!page)
>>> +        return -ENOMEM;
>>> +
>>> +    sg_init_one(&sg, page, PAGE_SIZE);
>> FYI, for virtio-net we have several optimizations for mergeable rx buffer:
>>
>> - skb_page_frag_refill() which can use high order page and reduce the stress of page allocator
>>
> You're right, initially I want to use a memory poll to manage the rx buffer,
> and then use this in the later optimized patch. Your advice is very great.
>
>> - we don't use fixed buffer size, instead we use EWMA to estimate the possible rx buffer size to avoid internal fragmentation
>>
> Ok, I analysis the feature and consider add it into my patches.
>
>> If we can try to reuse virtio-net driver, we will get those nice features.
>>
> Yes, after all virtio-net has a very good ecological environment, and it also
> do many performance optimization, it is actually a good idea.
>

Yes, so my suggestion is to consider reusing them (unless we find
something that is a real blocker) instead of duplicating code, features
and bugs.

Thanks

^ permalink raw reply

* Re: [RFC PATCH 00/12] net: introduce Qualcomm IPA driver
From: Arnd Bergmann @ 2018-11-07 15:46 UTC (permalink / raw)
  To: Alex Elder
  Cc: David Miller, Bjorn Andersson, Ilias Apalodimas, Rob Herring,
	Mark Rutland, Networking, DTML, linux-arm-msm, linux-soc,
	Linux ARM, Linux Kernel Mailing List, syadagir, mjavid
In-Reply-To: <20181107003250.5832-1-elder@linaro.org>

On Wed, Nov 7, 2018 at 1:33 AM Alex Elder <elder@linaro.org> wrote:
> The code has undergone considerable rework to prepare it for
> incorporation into upstream Linux.  Parts of it bear little
> resemblance to the original driver.  Still, some work remains
> to be done.  The current code and its design had a preliminary
> review, and some changes to the data path implementation were
> recommended.   These have not yet been addressed:
> - Use NAPI for all interfaces, not just RX (and WAN data) endpoints.
> - Do more work in the NAPI poll function, including collecting
>   completed TX requests and posting buffers for RX.
> - Do not use periodic NOP requests as a way to avoid TX interrupts.
> - The NAPI context should be associated with the hardware interrupt
>   (it is now associated with something abstracted from the hardware).
> - Use threaded interrupts, to avoid the need for using spinlocks and
>   atomic variables for synchronizing between workqueue and interrupt
>   context.
> - Have runtime power management enable and disable IPA clock and
>   interconnects.
> Many thanks to Arnd Bergmann, Ilias Apalodimas, and Bjorn Andersson
> for their early feedback.

Thanks for getting the current version out even with the long TODO
list. I've had my first deeper look at some of the patches and found
a few more things that likely require substantial rework. I also think
there is still significant room for simplifying it further, and getting
better performance out of it in the process.

Also, despite the criticism in my patch review, I have to say you've
done a great job at cutting out a lot of the things that were present
in the past, it's good to see that you have come this far with the
cleanup!

      Arnd

^ permalink raw reply

* Re: [RFC PATCH 11/12] soc: qcom: ipa: IPA rmnet interface
From: Dan Williams @ 2018-11-07 15:26 UTC (permalink / raw)
  To: Alex Elder, davem, arnd, bjorn.andersson, ilias.apalodimas
  Cc: netdev, devicetree, linux-arm-msm, linux-soc, linux-arm-kernel,
	linux-kernel, syadagir, mjavid, robh+dt, mark.rutland
In-Reply-To: <20181107003250.5832-12-elder@linaro.org>

On Tue, 2018-11-06 at 18:32 -0600, Alex Elder wrote:
> The IPA uses "rmnet" as a way to present remote network resources as
> if they were local to the AP.  IPA interfaces representing networks
> accessible via the modem are represented as rmnet interfaces,
> implemented by the "rmnet data driver" found here:
>     drivers/net/ethernet/qualcomm/rmnet/

It looks like there's a lot of overlap between this driver and that
one.  Ideally they would be a single driver which could automatically
select the IPA mode for appropriate hardware, or the IPA mode would be
a "subdriver" that bases itself heavily on the existing rmnet driver
even if it doesn't use the same packet movement functions.  Or
something like that.

But as Arnd stated, the ioctls won't fly.  They were also proposed for
the rmnet driver but they are better done as netlink, or via sysfs as
the existing qmi_wwan driver does for some of the same values.

Half the non-extended ioctls aren't even supported/used and should
simply be removed.

The extended ioctls should be evaluated as to whether they are really
needed (eg RMNET_IOCTL_GET_DRIVER_NAME).  I think most of the rest have
either been handled already via the 'rmnet' driver itself (like the MUX
channels using the vlan netlink attributes) or should be added via
netlink-type mechanisms if they are really required.

In general, it would be good to get to a single 'rmnet' driver that has
a single API and supports as much hardware as possible.  There's too
much duplication currently.

Dan

> The IPA is able to perform aggregation of packets, as well as
> checksum offload.  These options (plus others, such as configuring
> MTU size) are configurable using an ioctl interface.  In addition,
> rmnet devices support multiplexing.
> 
> TX packets are handed to the data path layer, and when their
> transmission is complete the notification callback will be
> called.  The data path code posts RX packets to the hardware,
> and when they are filled they are supplied here by a receive
> notification callback.
> 
> The IPA driver currently does not support the modem shutting down
> (or crashing).  But the rmnet_ipa device roughly represents the
> availability of networks reachable by the modem.  If the modem is
> operational, an ipa_rmnet network device will be available.  Modem
> operation is managed by the remoteproc subsystem.
> 
> Note:  This portion of the driver will be heavily affected by
> planned rework on the data path code.
> 
> Signed-off-by: Alex Elder <elder@linaro.org>
> ---
>  drivers/net/ipa/msm_rmnet.h    | 120 +++++
>  drivers/net/ipa/rmnet_config.h |  31 ++
>  drivers/net/ipa/rmnet_ipa.c    | 805
> +++++++++++++++++++++++++++++++++
>  3 files changed, 956 insertions(+)
>  create mode 100644 drivers/net/ipa/msm_rmnet.h
>  create mode 100644 drivers/net/ipa/rmnet_config.h
>  create mode 100644 drivers/net/ipa/rmnet_ipa.c
> 
> diff --git a/drivers/net/ipa/msm_rmnet.h
> b/drivers/net/ipa/msm_rmnet.h
> new file mode 100644
> index 000000000000..042380fd53fb
> --- /dev/null
> +++ b/drivers/net/ipa/msm_rmnet.h
> @@ -0,0 +1,120 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/* Copyright (c) 2018, The Linux Foundation. All rights reserved.
> + * Copyright (C) 2018 Linaro Ltd.
> + */
> +#ifndef _MSM_RMNET_H_
> +#define _MSM_RMNET_H_
> +
> +/* Bitmap macros for RmNET driver operation mode. */
> +#define RMNET_MODE_NONE	    0x00
> +#define RMNET_MODE_LLP_ETH  0x01
> +#define RMNET_MODE_LLP_IP   0x02
> +#define RMNET_MODE_QOS	    0x04
> +
> +/* IOCTL commands
> + * Values chosen to not conflict with other drivers in the ecosystem
> + */
> +
> +#define RMNET_IOCTL_SET_LLP_ETHERNET 0x000089f1 /* Set Ethernet
> protocol  */
> +#define RMNET_IOCTL_SET_LLP_IP	     0x000089f2 /* Set RAWIP
> protocol	  */
> +#define RMNET_IOCTL_GET_LLP	     0x000089f3 /* Get link
> protocol	  */
> +#define RMNET_IOCTL_SET_QOS_ENABLE   0x000089f4 /* Set QoS header
> enabled */
> +#define RMNET_IOCTL_SET_QOS_DISABLE  0x000089f5 /* Set QoS header
> disabled*/
> +#define RMNET_IOCTL_GET_QOS	     0x000089f6 /* Get QoS header
> state	  */
> +#define RMNET_IOCTL_GET_OPMODE	     0x000089f7 /* Get
> operation mode	  */
> +#define RMNET_IOCTL_OPEN	     0x000089f8 /* Open transport
> port	  */
> +#define RMNET_IOCTL_CLOSE	     0x000089f9 /* Close transport
> port	  */
> +#define RMNET_IOCTL_FLOW_ENABLE	     0x000089fa /* Flow
> enable		  */
> +#define RMNET_IOCTL_FLOW_DISABLE     0x000089fb /* Flow disable	
> 	  */
> +#define RMNET_IOCTL_FLOW_SET_HNDL    0x000089fc /* Set flow handle	
>   */
> +#define RMNET_IOCTL_EXTENDED	     0x000089fd /* Extended
> IOCTLs	  */
> +
> +/* RmNet Data Required IOCTLs */
> +#define RMNET_IOCTL_GET_SUPPORTED_FEATURES     0x0000	/* Get
> features	   */
> +#define RMNET_IOCTL_SET_MRU		       0x0001	/*
> Set MRU	   */
> +#define RMNET_IOCTL_GET_MRU		       0x0002	/*
> Get MRU	   */
> +#define RMNET_IOCTL_GET_EPID		       0x0003	/*
> Get endpoint ID */
> +#define RMNET_IOCTL_GET_DRIVER_NAME	       0x0004	/*
> Get driver name */
> +#define RMNET_IOCTL_ADD_MUX_CHANNEL	       0x0005	/*
> Add MUX ID	   */
> +#define RMNET_IOCTL_SET_EGRESS_DATA_FORMAT     0x0006	/* Set
> EDF	   */
> +#define RMNET_IOCTL_SET_INGRESS_DATA_FORMAT    0x0007	/* Set
> IDF	   */
> +#define RMNET_IOCTL_SET_AGGREGATION_COUNT      0x0008	/* Set
> agg count   */
> +#define RMNET_IOCTL_GET_AGGREGATION_COUNT      0x0009	/* Get
> agg count   */
> +#define RMNET_IOCTL_SET_AGGREGATION_SIZE       0x000a	/* Set
> agg size	   */
> +#define RMNET_IOCTL_GET_AGGREGATION_SIZE       0x000b	/* Get
> agg size	   */
> +#define RMNET_IOCTL_FLOW_CONTROL	       0x000c	/* Do
> flow control */
> +#define RMNET_IOCTL_GET_DFLT_CONTROL_CHANNEL   0x000d	/* For
> legacy use  */
> +#define RMNET_IOCTL_GET_HWSW_MAP	       0x000e	/* Get
> HW/SW map   */
> +#define RMNET_IOCTL_SET_RX_HEADROOM	       0x000f	/*
> RX Headroom	   */
> +#define RMNET_IOCTL_GET_EP_PAIR		       0x0010	
> /* Endpoint pair   */
> +#define RMNET_IOCTL_SET_QOS_VERSION	       0x0011	/*
> 8/6 byte QoS hdr*/
> +#define RMNET_IOCTL_GET_QOS_VERSION	       0x0012	/*
> 8/6 byte QoS hdr*/
> +#define RMNET_IOCTL_GET_SUPPORTED_QOS_MODES    0x0013	/* Get
> QoS modes   */
> +#define RMNET_IOCTL_SET_SLEEP_STATE	       0x0014	/*
> Set sleep state */
> +#define RMNET_IOCTL_SET_XLAT_DEV_INFO	       0x0015	/*
> xlat dev name   */
> +#define RMNET_IOCTL_DEREGISTER_DEV	       0x0016	/*
> Dereg a net dev */
> +#define RMNET_IOCTL_GET_SG_SUPPORT	       0x0017	/*
> Query sg support*/
> +
> +/* Return values for the RMNET_IOCTL_GET_SUPPORTED_FEATURES IOCTL */
> +#define RMNET_IOCTL_FEAT_NOTIFY_MUX_CHANNEL		BIT(0)
> +#define RMNET_IOCTL_FEAT_SET_EGRESS_DATA_FORMAT		BIT(1
> )
> +#define RMNET_IOCTL_FEAT_SET_INGRESS_DATA_FORMAT	BIT(2)
> +
> +/* Input values for the RMNET_IOCTL_SET_EGRESS_DATA_FORMAT IOCTL  */
> +#define RMNET_IOCTL_EGRESS_FORMAT_AGGREGATION		BIT(2)
> +#define RMNET_IOCTL_EGRESS_FORMAT_CHECKSUM		BIT(4)
> +
> +/* Input values for the RMNET_IOCTL_SET_INGRESS_DATA_FORMAT IOCTL */
> +#define RMNET_IOCTL_INGRESS_FORMAT_CHECKSUM		BIT(4)
> +#define RMNET_IOCTL_INGRESS_FORMAT_AGG_DATA		BIT(5)
> +
> +/* User space may not have this defined. */
> +#ifndef IFNAMSIZ
> +#define IFNAMSIZ 16
> +#endif
> +
> +struct rmnet_ioctl_extended_s {
> +	u32	extended_ioctl;
> +	union {
> +		u32	data; /* Generic data field for most
> extended IOCTLs */
> +
> +		/* Return values for
> +		 *    RMNET_IOCTL_GET_DRIVER_NAME
> +		 *    RMNET_IOCTL_GET_DFLT_CONTROL_CHANNEL
> +		 */
> +		char	if_name[IFNAMSIZ];
> +
> +		/* Input values for the RMNET_IOCTL_ADD_MUX_CHANNEL
> IOCTL */
> +		struct {
> +			u32	mux_id;
> +			char	vchannel_name[IFNAMSIZ];
> +		} rmnet_mux_val;
> +
> +		/* Input values for the RMNET_IOCTL_FLOW_CONTROL
> IOCTL */
> +		struct {
> +			u8	flow_mode;
> +			u8	mux_id;
> +		} flow_control_prop;
> +
> +		/* Return values for RMNET_IOCTL_GET_EP_PAIR */
> +		struct {
> +			u32	consumer_pipe_num;
> +			u32	producer_pipe_num;
> +		} ipa_ep_pair;
> +
> +		struct {
> +			u32	__data; /* Placeholder for legacy
> data*/
> +			u32	agg_size;
> +			u32	agg_count;
> +		} ingress_format;
> +	} u;
> +};
> +
> +struct rmnet_ioctl_data_s {
> +	union {
> +		u32	operation_mode;
> +		u32	tcm_handle;
> +	} u;
> +};
> +#endif /* _MSM_RMNET_H_ */
> diff --git a/drivers/net/ipa/rmnet_config.h
> b/drivers/net/ipa/rmnet_config.h
> new file mode 100644
> index 000000000000..3b9a549ca1bd
> --- /dev/null
> +++ b/drivers/net/ipa/rmnet_config.h
> @@ -0,0 +1,31 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/* Copyright (c) 2016-2018, The Linux Foundation. All rights
> reserved.
> + * Copyright (C) 2018 Linaro Ltd.
> + */
> +#ifndef _RMNET_CONFIG_H_
> +#define _RMNET_CONFIG_H_
> +
> +#include <linux/types.h>
> +
> +/* XXX We want to use struct rmnet_map_header, but that's currently
> defined in
> + * XXX     drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
> + * XXX We also want to use RMNET_MAP_GET_CD_BIT(Y), defined in the
> same file.
> + */
> +struct rmnet_map_header_s {
> +#ifndef RMNET_USE_BIG_ENDIAN_STRUCTS
> +	u8	pad_len		: 6,
> +		reserved_bit	: 1,
> +		cd_bit		: 1;
> +#else
> +	u8	cd_bit		: 1,
> +		reserved_bit	: 1,
> +		pad_len		: 6;
> +#endif /* RMNET_USE_BIG_ENDIAN_STRUCTS */
> +	u8	mux_id;
> +	u16	pkt_len;
> +}  __aligned(1);
> +
> +#define RMNET_MAP_GET_CD_BIT(Y) (((struct rmnet_map_header_s *)Y-
> >data)->cd_bit)
> +
> +#endif /* _RMNET_CONFIG_H_ */
> diff --git a/drivers/net/ipa/rmnet_ipa.c
> b/drivers/net/ipa/rmnet_ipa.c
> new file mode 100644
> index 000000000000..7006afe3a5ea
> --- /dev/null
> +++ b/drivers/net/ipa/rmnet_ipa.c
> @@ -0,0 +1,805 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/* Copyright (c) 2014-2018, The Linux Foundation. All rights
> reserved.
> + * Copyright (C) 2018 Linaro Ltd.
> + */
> +
> +/* WWAN Transport Network Driver. */
> +
> +#include <linux/completion.h>
> +#include <linux/errno.h>
> +#include <linux/if_arp.h>
> +#include <linux/interrupt.h>
> +#include <linux/init.h>
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/netdevice.h>
> +#include <linux/of_device.h>
> +#include <linux/string.h>
> +#include <linux/skbuff.h>
> +#include <linux/version.h>
> +#include <linux/workqueue.h>
> +#include <net/pkt_sched.h>
> +
> +#include "msm_rmnet.h"
> +#include "rmnet_config.h"
> +#include "ipa_qmi.h"
> +#include "ipa_i.h"
> +
> +#define DRIVER_NAME		"wwan_ioctl"
> +#define IPA_WWAN_DEV_NAME	"rmnet_ipa%d"
> +
> +#define MUX_CHANNEL_MAX		10	/* max mux channels
> */
> +
> +#define NAPI_WEIGHT		60
> +
> +#define WWAN_DATA_LEN		2000
> +#define HEADROOM_FOR_QMAP	8	/* for mux header */
> +#define TAILROOM		0	/* for padding by mux layer
> */
> +
> +#define DEFAULT_OUTSTANDING_HIGH	128
> +#define DEFAULT_OUTSTANDING_HIGH_CTL	(DEFAULT_OUTSTANDING_HIG
> H + 32)
> +#define DEFAULT_OUTSTANDING_LOW		64
> +
> +#define IPA_APPS_WWAN_CONS_RING_COUNT	256
> +#define IPA_APPS_WWAN_PROD_RING_COUNT	512
> +
> +static int ipa_rmnet_poll(struct napi_struct *napi, int budget);
> +
> +/** struct ipa_wwan_private - WWAN private data
> + * @net: network interface struct implemented by this driver
> + * @stats: iface statistics
> + * @outstanding_high: number of outstanding packets allowed
> + * @outstanding_low: number of outstanding packets which shall cause
> + *
> + * WWAN private - holds all relevant info about WWAN driver
> + */
> +struct ipa_wwan_private {
> +	struct net_device_stats stats;
> +	atomic_t outstanding_pkts;
> +	int outstanding_high_ctl;
> +	int outstanding_high;
> +	int outstanding_low;
> +	struct napi_struct napi;
> +};
> +
> +struct rmnet_ipa_context {
> +	struct net_device *dev;
> +	struct mutex mux_id_mutex;		/* protects
> mux_id[] */
> +	u32 mux_id_count;
> +	u32 mux_id[MUX_CHANNEL_MAX];
> +	u32 wan_prod_ep_id;
> +	u32 wan_cons_ep_id;
> +	struct mutex ep_setup_mutex;		/* endpoint
> setup/teardown */
> +};
> +
> +static bool initialized;	/* Avoid duplicate initialization */
> +
> +static struct rmnet_ipa_context rmnet_ipa_ctx_struct;
> +static struct rmnet_ipa_context *rmnet_ipa_ctx =
> &rmnet_ipa_ctx_struct;
> +
> +/** wwan_open() - Opens the wwan network interface */
> +static int ipa_wwan_open(struct net_device *dev)
> +{
> +	struct ipa_wwan_private *wwan_ptr = netdev_priv(dev);
> +
> +	napi_enable(&wwan_ptr->napi);
> +	netif_start_queue(dev);
> +
> +	return 0;
> +}
> +
> +/** ipa_wwan_stop() - Stops the wwan network interface. */
> +static int ipa_wwan_stop(struct net_device *dev)
> +{
> +	netif_stop_queue(dev);
> +
> +	return 0;
> +}
> +
> +/** ipa_wwan_xmit() - Transmits an skb.
> + *
> + * @skb: skb to be transmitted
> + * @dev: network device
> + *
> + * Return codes:
> + * NETDEV_TX_OK: Success
> + * NETDEV_TX_BUSY: Error while transmitting the skb. Try again later
> + */
> +static int ipa_wwan_xmit(struct sk_buff *skb, struct net_device
> *dev)
> +{
> +	struct ipa_wwan_private *wwan_ptr = netdev_priv(dev);
> +	unsigned int skb_len;
> +	int outstanding;
> +
> +	if (skb->protocol != htons(ETH_P_MAP)) {
> +		dev_kfree_skb_any(skb);
> +		dev->stats.tx_dropped++;
> +		return NETDEV_TX_OK;
> +	}
> +
> +	/* Control packets are sent even if queue is stopped.  We
> +	 * always honor the data and control high-water marks.
> +	 */
> +	outstanding = atomic_read(&wwan_ptr->outstanding_pkts);
> +	if (!RMNET_MAP_GET_CD_BIT(skb)) {	/* Data packet? */
> +		if (netif_queue_stopped(dev))
> +			return NETDEV_TX_BUSY;
> +		if (outstanding >= wwan_ptr->outstanding_high)
> +			return NETDEV_TX_BUSY;
> +	} else if (outstanding >= wwan_ptr->outstanding_high_ctl) {
> +		return NETDEV_TX_BUSY;
> +	}
> +
> +	/* both data packets and commands will be routed to
> +	 * IPA_CLIENT_Q6_WAN_CONS based on status configuration.
> +	 */
> +	skb_len = skb->len;
> +	if (ipa_tx_dp(IPA_CLIENT_APPS_WAN_PROD, skb))
> +		return NETDEV_TX_BUSY;
> +
> +	atomic_inc(&wwan_ptr->outstanding_pkts);
> +	dev->stats.tx_packets++;
> +	dev->stats.tx_bytes += skb_len;
> +
> +	return NETDEV_TX_OK;
> +}
> +
> +/** apps_ipa_tx_complete_notify() - Rx notify
> + *
> + * @priv: driver context
> + * @evt: event type
> + * @data: data provided with event
> + *
> + * Check that the packet is the one we sent and release it
> + * This function will be called in defered context in IPA wq.
> + */
> +static void apps_ipa_tx_complete_notify(void *priv, enum
> ipa_dp_evt_type evt,
> +					unsigned long data)
> +{
> +	struct ipa_wwan_private *wwan_ptr;
> +	struct net_device *dev = priv;
> +	struct sk_buff *skb;
> +
> +	skb = (struct sk_buff *)data;
> +
> +	if (dev != rmnet_ipa_ctx->dev) {
> +		dev_kfree_skb_any(skb);
> +		return;
> +	}
> +
> +	if (evt != IPA_WRITE_DONE) {
> +		ipa_err("unsupported evt on Tx callback, Drop the
> packet\n");
> +		dev_kfree_skb_any(skb);
> +		dev->stats.tx_dropped++;
> +		return;
> +	}
> +
> +	wwan_ptr = netdev_priv(dev);
> +	atomic_dec(&wwan_ptr->outstanding_pkts);
> +	__netif_tx_lock_bh(netdev_get_tx_queue(dev, 0));
> +	if (netif_queue_stopped(dev) &&
> +	    atomic_read(&wwan_ptr->outstanding_pkts) <
> +				wwan_ptr->outstanding_low) {
> +		netif_wake_queue(dev);
> +	}
> +
> +	__netif_tx_unlock_bh(netdev_get_tx_queue(dev, 0));
> +	dev_kfree_skb_any(skb);
> +}
> +
> +/** apps_ipa_packet_receive_notify() - Rx notify
> + *
> + * @priv: driver context
> + * @evt: event type
> + * @data: data provided with event
> + *
> + * IPA will pass a packet to the Linux network stack with skb->data
> + */
> +static void apps_ipa_packet_receive_notify(void *priv, enum
> ipa_dp_evt_type evt,
> +					   unsigned long data)
> +{
> +	struct ipa_wwan_private *wwan_ptr;
> +	struct net_device *dev = priv;
> +
> +	wwan_ptr = netdev_priv(dev);
> +	if (evt == IPA_RECEIVE) {
> +		struct sk_buff *skb = (struct sk_buff *)data;
> +		int ret;
> +		unsigned int packet_len = skb->len;
> +
> +		skb->dev = rmnet_ipa_ctx->dev;
> +		skb->protocol = htons(ETH_P_MAP);
> +
> +		ret = netif_receive_skb(skb);
> +		if (ret) {
> +			pr_err_ratelimited("fail on
> netif_receive_skb\n");
> +			dev->stats.rx_dropped++;
> +		}
> +		dev->stats.rx_packets++;
> +		dev->stats.rx_bytes += packet_len;
> +	} else if (evt == IPA_CLIENT_START_POLL) {
> +		napi_schedule(&wwan_ptr->napi);
> +	} else if (evt == IPA_CLIENT_COMP_NAPI) {
> +		napi_complete(&wwan_ptr->napi);
> +	} else {
> +		ipa_err("Invalid evt %d received in
> wan_ipa_receive\n", evt);
> +	}
> +}
> +
> +/** handle_ingress_format() - Ingress data format configuration */
> +static int handle_ingress_format(struct net_device *dev,
> +				 struct rmnet_ioctl_extended_s *in)
> +{
> +	enum ipa_cs_offload_en offload_type;
> +	enum ipa_client_type client;
> +	u32 metadata_offset;
> +	u32 rx_buffer_size;
> +	u32 channel_count;
> +	u32 length_offset;
> +	u32 header_size;
> +	bool aggr_active;
> +	u32 aggr_bytes;
> +	u32 aggr_count;
> +	u32 aggr_size;	/* in KB */
> +	u32 ep_id;
> +	int ret;
> +
> +	client = IPA_CLIENT_APPS_WAN_CONS;
> +	channel_count = IPA_APPS_WWAN_CONS_RING_COUNT;
> +	header_size = sizeof(struct rmnet_map_header_s);
> +	metadata_offset = offsetof(struct rmnet_map_header_s,
> mux_id);
> +	length_offset = offsetof(struct rmnet_map_header_s,
> pkt_len);
> +	offload_type = IPA_CS_OFFLOAD_NONE;
> +	aggr_bytes = IPA_GENERIC_AGGR_BYTE_LIMIT;
> +	aggr_count = IPA_GENERIC_AGGR_PKT_LIMIT;
> +	aggr_active = false;
> +
> +	if (in->u.data & RMNET_IOCTL_INGRESS_FORMAT_CHECKSUM)
> +		offload_type = IPA_CS_OFFLOAD_DL;
> +
> +	if (in->u.data & RMNET_IOCTL_INGRESS_FORMAT_AGG_DATA) {
> +		aggr_bytes = in->u.ingress_format.agg_size;
> +		aggr_count = in->u.ingress_format.agg_count;
> +		aggr_active = true;
> +	}
> +
> +	if (aggr_bytes > ipa_reg_aggr_max_byte_limit())
> +		return -EINVAL;
> +
> +	if (aggr_count > ipa_reg_aggr_max_packet_limit())
> +		return -EINVAL;
> +
> +	/* Compute the buffer size required to handle the requested
> +	 * aggregation byte limit.  The aggr_byte_limit value is
> +	 * expressed as a number of KB, but we derive that value
> +	 * after computing the buffer size to use (in bytes).  The
> +	 * buffer must be sufficient to hold one IPA_MTU-sized
> +	 * packet *after* the limit is reached.
> +	 *
> +	 * (Note that the rx_buffer_size value reflects only the
> +	 * space for data, not any standard metadata or headers.)
> +	 */
> +	rx_buffer_size = ipa_aggr_byte_limit_buf_size(aggr_bytes);
> +
> +	/* Account for the extra IPA_MTU past the limit in the
> +	 * buffer, and convert the result to the KB units the
> +	 * aggr_byte_limit uses.
> +	 */
> +	aggr_size = (rx_buffer_size - IPA_MTU) / SZ_1K;
> +
> +	mutex_lock(&rmnet_ipa_ctx->ep_setup_mutex);
> +
> +	if (rmnet_ipa_ctx->wan_cons_ep_id != IPA_EP_ID_BAD) {
> +		ret = -EBUSY;
> +		goto out_unlock;
> +	}
> +
> +	ret = ipa_ep_alloc(client);
> +	if (ret < 0)
> +		goto out_unlock;
> +	ep_id = ret;
> +
> +	/* Record our endpoint configuration parameters */
> +	ipa_endp_init_hdr_cons(ep_id, header_size, metadata_offset,
> +			       length_offset);
> +	ipa_endp_init_hdr_ext_cons(ep_id, 0, true);
> +	ipa_endp_init_aggr_cons(ep_id, aggr_size, aggr_count, true);
> +	ipa_endp_init_cfg_cons(ep_id, offload_type);
> +	ipa_endp_init_hdr_metadata_mask_cons(ep_id, 0xff000000);
> +	ipa_endp_status_cons(ep_id, !aggr_active);
> +
> +	ipa_ctx->ipa_client_apps_wan_cons_agg_gro = aggr_active;
> +
> +	ret = ipa_ep_setup(ep_id, channel_count, 1, rx_buffer_size,
> +			   apps_ipa_packet_receive_notify, dev);
> +	if (ret)
> +		ipa_ep_free(ep_id);
> +	else
> +		rmnet_ipa_ctx->wan_cons_ep_id = ep_id;
> +out_unlock:
> +	mutex_unlock(&rmnet_ipa_ctx->ep_setup_mutex);
> +
> +	return ret;
> +}
> +
> +/** handle_egress_format() - Egress data format configuration */
> +static int handle_egress_format(struct net_device *dev,
> +				struct rmnet_ioctl_extended_s *e)
> +{
> +	enum ipa_cs_offload_en offload_type;
> +	enum ipa_client_type dst_client;
> +	enum ipa_client_type client;
> +	enum ipa_aggr_type aggr_type;
> +	enum ipa_aggr_en aggr_en;
> +	u32 channel_count;
> +	u32 length_offset;
> +	u32 header_align;
> +	u32 header_offset;
> +	u32 header_size;
> +	u32 ep_id;
> +	int ret;
> +
> +	client = IPA_CLIENT_APPS_WAN_PROD;
> +	dst_client = IPA_CLIENT_APPS_LAN_CONS;
> +	channel_count = IPA_APPS_WWAN_PROD_RING_COUNT;
> +	header_size = sizeof(struct rmnet_map_header_s);
> +	offload_type = IPA_CS_OFFLOAD_NONE;
> +	aggr_en = IPA_BYPASS_AGGR;
> +	aggr_type = 0;	/* ignored if BYPASS */
> +	header_offset = 0;
> +	length_offset = 0;
> +	header_align = 0;
> +
> +	if (e->u.data & RMNET_IOCTL_EGRESS_FORMAT_CHECKSUM) {
> +		offload_type = IPA_CS_OFFLOAD_UL;
> +		header_offset = sizeof(struct rmnet_map_header_s) /
> 4;
> +		header_size += sizeof(u32);
> +	}
> +
> +	if (e->u.data & RMNET_IOCTL_EGRESS_FORMAT_AGGREGATION) {
> +		aggr_en = IPA_ENABLE_DEAGGR;
> +		aggr_type = IPA_QCMAP;
> +		length_offset = offsetof(struct rmnet_map_header_s,
> pkt_len);
> +		header_align = ilog2(sizeof(u32));
> +	}
> +
> +	mutex_lock(&rmnet_ipa_ctx->ep_setup_mutex);
> +
> +	if (rmnet_ipa_ctx->wan_prod_ep_id != IPA_EP_ID_BAD) {
> +		ret = -EBUSY;
> +		goto out_unlock;
> +	}
> +
> +	ret = ipa_ep_alloc(client);
> +	if (ret < 0)
> +		goto out_unlock;
> +	ep_id = ret;
> +
> +	if (aggr_en == IPA_ENABLE_DEAGGR &&
> !ipa_endp_aggr_support(ep_id)) {
> +		ret = -ENOTSUPP;
> +		goto out_unlock;
> +	}
> +
> +	/* We really do want 0 metadata offset */
> +	ipa_endp_init_hdr_prod(ep_id, header_size, 0,
> length_offset);
> +	ipa_endp_init_hdr_ext_prod(ep_id, header_align);
> +	ipa_endp_init_mode_prod(ep_id, IPA_BASIC, dst_client);
> +	ipa_endp_init_aggr_prod(ep_id, aggr_en, aggr_type);
> +	ipa_endp_init_cfg_prod(ep_id, offload_type, header_offset);
> +	ipa_endp_init_seq_prod(ep_id);
> +	ipa_endp_init_deaggr_prod(ep_id);
> +	/* Enable source notification status for exception packets
> +	 * (i.e. QMAP commands) to be routed to modem.
> +	 */
> +	ipa_endp_status_prod(ep_id, true, IPA_CLIENT_Q6_WAN_CONS);
> +
> +	/* Use a deferred interrupting no-op to reduce completion
> interrupts */
> +	ipa_no_intr_init(ep_id);
> +
> +	ret = ipa_ep_setup(ep_id, channel_count, 1, 0,
> +			   apps_ipa_tx_complete_notify, dev);
> +	if (ret)
> +		ipa_ep_free(ep_id);
> +	else
> +		rmnet_ipa_ctx->wan_prod_ep_id = ep_id;
> +
> +out_unlock:
> +	mutex_unlock(&rmnet_ipa_ctx->ep_setup_mutex);
> +
> +	return ret;
> +}
> +
> +/** ipa_wwan_add_mux_channel() - add a mux_id */
> +static int ipa_wwan_add_mux_channel(u32 mux_id)
> +{
> +	int ret;
> +	u32 i;
> +
> +	mutex_lock(&rmnet_ipa_ctx->mux_id_mutex);
> +
> +	if (rmnet_ipa_ctx->mux_id_count >= MUX_CHANNEL_MAX) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +
> +	for (i = 0; i < rmnet_ipa_ctx->mux_id_count; i++)
> +		if (mux_id == rmnet_ipa_ctx->mux_id[i])
> +			break;
> +
> +	/* Record the mux_id if it hasn't already been seen */
> +	if (i == rmnet_ipa_ctx->mux_id_count)
> +		rmnet_ipa_ctx->mux_id[rmnet_ipa_ctx->mux_id_count++] 
> = mux_id;
> +	ret = 0;
> +out:
> +	mutex_unlock(&rmnet_ipa_ctx->mux_id_mutex);
> +
> +	return ret;
> +}
> +
> +/** ipa_wwan_ioctl_extended() - rmnet extended I/O control */
> +static int ipa_wwan_ioctl_extended(struct net_device *dev, void
> __user *data)
> +{
> +	struct rmnet_ioctl_extended_s edata = { };
> +	size_t size = sizeof(edata);
> +
> +	if (copy_from_user(&edata, data, size))
> +		return -EFAULT;
> +
> +	switch (edata.extended_ioctl) {
> +	case RMNET_IOCTL_GET_SUPPORTED_FEATURES:	/* Get
> features */
> +		edata.u.data = RMNET_IOCTL_FEAT_NOTIFY_MUX_CHANNEL;
> +		edata.u.data |=
> RMNET_IOCTL_FEAT_SET_EGRESS_DATA_FORMAT;
> +		edata.u.data |=
> RMNET_IOCTL_FEAT_SET_INGRESS_DATA_FORMAT;
> +		goto copy_out;
> +
> +	case RMNET_IOCTL_GET_EPID:			/* Get
> endpoint ID */
> +		edata.u.data = 1;
> +		goto copy_out;
> +
> +	case RMNET_IOCTL_GET_DRIVER_NAME:		/* Get
> driver name */
> +		memcpy(&edata.u.if_name, rmnet_ipa_ctx->dev->name,
> IFNAMSIZ);
> +		goto copy_out;
> +
> +	case RMNET_IOCTL_ADD_MUX_CHANNEL:		/* Add MUX
> ID */
> +		return
> ipa_wwan_add_mux_channel(edata.u.rmnet_mux_val.mux_id);
> +
> +	case RMNET_IOCTL_SET_EGRESS_DATA_FORMAT:	/* Egress
> data format */
> +		return handle_egress_format(dev, &edata) ? -EFAULT :
> 0;
> +
> +	case RMNET_IOCTL_SET_INGRESS_DATA_FORMAT:	/* Ingress
> format */
> +		return handle_ingress_format(dev, &edata) ? -EFAULT
> : 0;
> +
> +	case RMNET_IOCTL_GET_EP_PAIR:			/* Get
> endpoint pair */
> +		edata.u.ipa_ep_pair.consumer_pipe_num =
> +				ipa_client_ep_id(IPA_CLIENT_APPS_WAN
> _PROD);
> +		edata.u.ipa_ep_pair.producer_pipe_num =
> +				ipa_client_ep_id(IPA_CLIENT_APPS_WAN
> _CONS);
> +		goto copy_out;
> +
> +	case RMNET_IOCTL_GET_SG_SUPPORT:		/* Get SG
> support */
> +		edata.u.data = 1;	/* Scatter/gather is always
> supported */
> +		goto copy_out;
> +
> +	/* Unsupported requests */
> +	case RMNET_IOCTL_SET_MRU:			/* Set MRU
> */
> +	case RMNET_IOCTL_GET_MRU:			/* Get MRU
> */
> +	case RMNET_IOCTL_GET_AGGREGATION_COUNT:		/*
> Get agg count */
> +	case RMNET_IOCTL_SET_AGGREGATION_COUNT:		/*
> Set agg count */
> +	case RMNET_IOCTL_GET_AGGREGATION_SIZE:		/* Get
> agg size */
> +	case RMNET_IOCTL_SET_AGGREGATION_SIZE:		/* Set
> agg size */
> +	case RMNET_IOCTL_FLOW_CONTROL:			/* Do
> flow control */
> +	case RMNET_IOCTL_GET_DFLT_CONTROL_CHANNEL:	/* For
> legacy use */
> +	case RMNET_IOCTL_GET_HWSW_MAP:			/* Get
> HW/SW map */
> +	case RMNET_IOCTL_SET_RX_HEADROOM:		/* Set RX
> Headroom */
> +	case RMNET_IOCTL_SET_QOS_VERSION:		/* Set 8/6
> byte QoS */
> +	case RMNET_IOCTL_GET_QOS_VERSION:		/* Get 8/6
> byte QoS */
> +	case RMNET_IOCTL_GET_SUPPORTED_QOS_MODES:	/* Get QoS
> modes */
> +	case RMNET_IOCTL_SET_SLEEP_STATE:		/* Set
> sleep state */
> +	case RMNET_IOCTL_SET_XLAT_DEV_INFO:		/* xlat
> dev name */
> +	case RMNET_IOCTL_DEREGISTER_DEV:		/*
> Deregister netdev */
> +		return -ENOTSUPP;	/* Defined, but unsupported
> command */
> +
> +	default:
> +		return -EINVAL;		/* Invalid
> (unrecognized) command */
> +	}
> +
> +copy_out:
> +	return copy_to_user(data, &edata, size) ? -EFAULT : 0;
> +}
> +
> +/** ipa_wwan_ioctl() - I/O control for wwan network driver */
> +static int ipa_wwan_ioctl(struct net_device *dev, struct ifreq *ifr,
> int cmd)
> +{
> +	struct rmnet_ioctl_data_s ioctl_data = { };
> +	void __user *data;
> +	size_t size;
> +
> +	data = ifr->ifr_ifru.ifru_data;
> +	size = sizeof(ioctl_data);
> +
> +	switch (cmd) {
> +	/* These features are implied; alternatives are not
> supported */
> +	case RMNET_IOCTL_SET_LLP_IP:		/* RAW IP
> protocol */
> +	case RMNET_IOCTL_SET_QOS_DISABLE:	/* QoS header
> disabled */
> +		return 0;
> +
> +	/* These features are not supported; use alternatives */
> +	case RMNET_IOCTL_SET_LLP_ETHERNET:	/* Ethernet
> protocol */
> +	case RMNET_IOCTL_SET_QOS_ENABLE:	/* QoS header
> enabled */
> +	case RMNET_IOCTL_GET_OPMODE:		/* Get operation
> mode */
> +	case RMNET_IOCTL_FLOW_ENABLE:		/* Flow enable
> */
> +	case RMNET_IOCTL_FLOW_DISABLE:		/* Flow
> disable */
> +	case RMNET_IOCTL_FLOW_SET_HNDL:		/* Set flow
> handle */
> +		return -ENOTSUPP;
> +
> +	case RMNET_IOCTL_GET_LLP:		/* Get link
> protocol */
> +		ioctl_data.u.operation_mode = RMNET_MODE_LLP_IP;
> +		goto copy_out;
> +
> +	case RMNET_IOCTL_GET_QOS:		/* Get QoS header
> state */
> +		ioctl_data.u.operation_mode = RMNET_MODE_NONE;
> +		goto copy_out;
> +
> +	case RMNET_IOCTL_OPEN:			/* Open
> transport port */
> +	case RMNET_IOCTL_CLOSE:			/* Close
> transport port */
> +		return 0;
> +
> +	case RMNET_IOCTL_EXTENDED:		/* Extended IOCTLs
> */
> +		return ipa_wwan_ioctl_extended(dev, data);
> +
> +	default:
> +		return -EINVAL;
> +	}
> +
> +copy_out:
> +	return copy_to_user(data, &ioctl_data, size) ? -EFAULT : 0;
> +}
> +
> +static const struct net_device_ops ipa_wwan_ops_ip = {
> +	.ndo_open	= ipa_wwan_open,
> +	.ndo_stop	= ipa_wwan_stop,
> +	.ndo_start_xmit	= ipa_wwan_xmit,
> +	.ndo_do_ioctl	= ipa_wwan_ioctl,
> +};
> +
> +/** wwan_setup() - Setup the wwan network driver */
> +static void ipa_wwan_setup(struct net_device *dev)
> +{
> +	dev->netdev_ops = &ipa_wwan_ops_ip;
> +	ether_setup(dev);
> +	dev->header_ops = NULL;	 /* No header (override
> ether_setup() value) */
> +	dev->type = ARPHRD_RAWIP;
> +	dev->hard_header_len = 0;
> +	dev->max_mtu = WWAN_DATA_LEN;
> +	dev->mtu = dev->max_mtu;
> +	dev->addr_len = 0;
> +	dev->flags &= ~(IFF_BROADCAST | IFF_MULTICAST);
> +	dev->needed_headroom = HEADROOM_FOR_QMAP;
> +	dev->needed_tailroom = TAILROOM;
> +	dev->watchdog_timeo = msecs_to_jiffies(10 * MSEC_PER_SEC);
> +}
> +
> +/** ipa_wwan_probe() - Network probe function */
> +static int ipa_wwan_probe(struct platform_device *pdev)
> +{
> +	struct ipa_wwan_private *wwan_ptr;
> +	struct net_device *dev;
> +	int ret;
> +
> +	mutex_init(&rmnet_ipa_ctx->ep_setup_mutex);
> +	mutex_init(&rmnet_ipa_ctx->mux_id_mutex);
> +
> +	/* Mark client handles bad until we initialize them */
> +	rmnet_ipa_ctx->wan_prod_ep_id = IPA_EP_ID_BAD;
> +	rmnet_ipa_ctx->wan_cons_ep_id = IPA_EP_ID_BAD;
> +
> +	ret = ipa_modem_smem_init();
> +	if (ret)
> +		goto err_clear_ctx;
> +
> +	/* start A7 QMI service/client */
> +	ipa_qmi_init();
> +
> +	/* initialize wan-driver netdev */
> +	dev = alloc_netdev(sizeof(struct ipa_wwan_private),
> +			   IPA_WWAN_DEV_NAME,
> +			   NET_NAME_UNKNOWN,
> +			   ipa_wwan_setup);
> +	if (!dev) {
> +		ipa_err("no memory for netdev\n");
> +		ret = -ENOMEM;
> +		goto err_clear_ctx;
> +	}
> +	rmnet_ipa_ctx->dev = dev;
> +	wwan_ptr = netdev_priv(dev);
> +	wwan_ptr->outstanding_high_ctl =
> DEFAULT_OUTSTANDING_HIGH_CTL;
> +	wwan_ptr->outstanding_high = DEFAULT_OUTSTANDING_HIGH;
> +	wwan_ptr->outstanding_low = DEFAULT_OUTSTANDING_LOW;
> +	atomic_set(&wwan_ptr->outstanding_pkts, 0);
> +
> +	/* Enable SG support in netdevice. */
> +	dev->hw_features |= NETIF_F_SG;
> +
> +	netif_napi_add(dev, &wwan_ptr->napi, ipa_rmnet_poll,
> NAPI_WEIGHT);
> +	ret = register_netdev(dev);
> +	if (ret) {
> +		ipa_err("unable to register ipa_netdev %d rc=%d\n",
> 0, ret);
> +		goto err_napi_del;
> +	}
> +
> +	/* offline charging mode */
> +	ipa_proxy_clk_unvote();
> +
> +	/* Till the system is suspended, we keep the clock open */
> +	ipa_client_add();
> +
> +	initialized = true;
> +
> +	return 0;
> +
> +err_napi_del:
> +	netif_napi_del(&wwan_ptr->napi);
> +	free_netdev(dev);
> +err_clear_ctx:
> +	memset(&rmnet_ipa_ctx_struct, 0,
> sizeof(rmnet_ipa_ctx_struct));
> +
> +	return ret;
> +}
> +
> +static int ipa_wwan_remove(struct platform_device *pdev)
> +{
> +	struct ipa_wwan_private *wwan_ptr =
> netdev_priv(rmnet_ipa_ctx->dev);
> +
> +	dev_info(&pdev->dev, "rmnet_ipa started
> deinitialization\n");
> +
> +	mutex_lock(&rmnet_ipa_ctx->ep_setup_mutex);
> +
> +	ipa_client_add();
> +
> +	if (rmnet_ipa_ctx->wan_cons_ep_id != IPA_EP_ID_BAD) {
> +		ipa_ep_teardown(rmnet_ipa_ctx->wan_cons_ep_id);
> +		rmnet_ipa_ctx->wan_cons_ep_id = IPA_EP_ID_BAD;
> +	}
> +
> +	if (rmnet_ipa_ctx->wan_prod_ep_id != IPA_EP_ID_BAD) {
> +		ipa_ep_teardown(rmnet_ipa_ctx->wan_prod_ep_id);
> +		rmnet_ipa_ctx->wan_prod_ep_id = IPA_EP_ID_BAD;
> +	}
> +
> +	ipa_client_remove();
> +
> +	netif_napi_del(&wwan_ptr->napi);
> +	mutex_unlock(&rmnet_ipa_ctx->ep_setup_mutex);
> +	unregister_netdev(rmnet_ipa_ctx->dev);
> +
> +	if (rmnet_ipa_ctx->dev)
> +		free_netdev(rmnet_ipa_ctx->dev);
> +	rmnet_ipa_ctx->dev = NULL;
> +
> +	mutex_destroy(&rmnet_ipa_ctx->mux_id_mutex);
> +	mutex_destroy(&rmnet_ipa_ctx->ep_setup_mutex);
> +
> +	initialized = false;
> +
> +	dev_info(&pdev->dev, "rmnet_ipa completed
> deinitialization\n");
> +
> +	return 0;
> +}
> +
> +/** rmnet_ipa_ap_suspend() - suspend callback for runtime_pm
> + * @dev: pointer to device
> + *
> + * This callback will be invoked by the runtime_pm framework when an
> AP suspend
> + * operation is invoked, usually by pressing a suspend button.
> + *
> + * Returns -EAGAIN to runtime_pm framework in case there are pending
> packets
> + * in the Tx queue. This will postpone the suspend operation until
> all the
> + * pending packets will be transmitted.
> + *
> + * In case there are no packets to send, releases the WWAN0_PROD
> entity.
> + * As an outcome, the number of IPA active clients should be
> decremented
> + * until IPA clocks can be gated.
> + */
> +static int rmnet_ipa_ap_suspend(struct device *dev)
> +{
> +	struct net_device *netdev = rmnet_ipa_ctx->dev;
> +	struct ipa_wwan_private *wwan_ptr;
> +	int ret;
> +
> +	if (!netdev) {
> +		ipa_err("netdev is NULL.\n");
> +		ret = 0;
> +		goto bail;
> +	}
> +
> +	netif_tx_lock_bh(netdev);
> +	wwan_ptr = netdev_priv(netdev);
> +	if (!wwan_ptr) {
> +		ipa_err("wwan_ptr is NULL.\n");
> +		ret = 0;
> +		goto unlock_and_bail;
> +	}
> +
> +	/* Do not allow A7 to suspend in case there are outstanding
> packets */
> +	if (atomic_read(&wwan_ptr->outstanding_pkts) != 0) {
> +		ret = -EAGAIN;
> +		goto unlock_and_bail;
> +	}
> +
> +	/* Make sure that there is no Tx operation ongoing */
> +	netif_stop_queue(netdev);
> +
> +	ret = 0;
> +	ipa_client_remove();
> +
> +unlock_and_bail:
> +	netif_tx_unlock_bh(netdev);
> +bail:
> +
> +	return ret;
> +}
> +
> +/** rmnet_ipa_ap_resume() - resume callback for runtime_pm
> + * @dev: pointer to device
> + *
> + * This callback will be invoked by the runtime_pm framework when an
> AP resume
> + * operation is invoked.
> + *
> + * Enables the network interface queue and returns success to the
> + * runtime_pm framework.
> + */
> +static int rmnet_ipa_ap_resume(struct device *dev)
> +{
> +	struct net_device *netdev = rmnet_ipa_ctx->dev;
> +
> +	ipa_client_add();
> +	if (netdev)
> +		netif_wake_queue(netdev);
> +
> +	return 0;
> +}
> +
> +static const struct of_device_id rmnet_ipa_dt_match[] = {
> +	{.compatible = "qcom,rmnet-ipa"},
> +	{},
> +};
> +MODULE_DEVICE_TABLE(of, rmnet_ipa_dt_match);
> +
> +static const struct dev_pm_ops rmnet_ipa_pm_ops = {
> +	.suspend_noirq = rmnet_ipa_ap_suspend,
> +	.resume_noirq = rmnet_ipa_ap_resume,
> +};
> +
> +static struct platform_driver rmnet_ipa_driver = {
> +	.driver = {
> +		.name = "rmnet_ipa",
> +		.owner = THIS_MODULE,
> +		.pm = &rmnet_ipa_pm_ops,
> +		.of_match_table = rmnet_ipa_dt_match,
> +	},
> +	.probe = ipa_wwan_probe,
> +	.remove = ipa_wwan_remove,
> +};
> +
> +int ipa_wwan_init(void)
> +{
> +	if (initialized)
> +		return 0;
> +
> +	return platform_driver_register(&rmnet_ipa_driver);
> +}
> +
> +void ipa_wwan_cleanup(void)
> +{
> +	platform_driver_unregister(&rmnet_ipa_driver);
> +	memset(&rmnet_ipa_ctx_struct, 0,
> sizeof(rmnet_ipa_ctx_struct));
> +}
> +
> +static int ipa_rmnet_poll(struct napi_struct *napi, int budget)
> +{
> +	return ipa_rx_poll(rmnet_ipa_ctx->wan_cons_ep_id, budget);
> +}
> +
> +MODULE_DESCRIPTION("WWAN Network Interface");
> +MODULE_LICENSE("GPL v2");

^ permalink raw reply

* Re: [PATCH net-next] tcp: minor optimization in tcp ack fast path processing
From: Eric Dumazet @ 2018-11-07 15:16 UTC (permalink / raw)
  To: Yafang Shao, davem, edumazet; +Cc: netdev, linux-kernel, Joe Perches
In-Reply-To: <1541589617-1607-2-git-send-email-laoar.shao@gmail.com>



On 11/07/2018 03:20 AM, Yafang Shao wrote:
> Bitwise operation is a little faster.


> So I replace after() with (flag & FLAG_SND_UNA_ADVANCED) as this flag is
> already set before.
> 
> Cc: Joe Perches <joe@perches.com>
> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> ---
>  net/ipv4/tcp_input.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 2868ef2..0167015 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -3610,7 +3610,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
>  	if (flag & FLAG_UPDATE_TS_RECENT)
>  		tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
>  
> -	if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
> +	if (!(flag & FLAG_SLOWPATH) && (flag & FLAG_SND_UNA_ADVANCED)) {
>  		/* Window is constant, pure forward advance.
>  		 * No more checks are required.
>  		 * Note, we use the fact that SND.UNA>=SND.WL2.
> 

What about reducing this to a single conditional jump ?

if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) == FLAG_SND_UNA_ADVANCED)  {

^ permalink raw reply

* Re: [RFC PATCH 07/12] soc: qcom: ipa: IPA register abstraction
From: Arnd Bergmann @ 2018-11-07 15:00 UTC (permalink / raw)
  To: Alex Elder
  Cc: David Miller, Bjorn Andersson, Ilias Apalodimas, Networking, DTML,
	linux-arm-msm, linux-soc, Linux ARM, Linux Kernel Mailing List,
	syadagir, mjavid, Rob Herring, Mark Rutland
In-Reply-To: <20181107003250.5832-8-elder@linaro.org>

On Wed, Nov 7, 2018 at 1:33 AM Alex Elder <elder@linaro.org> wrote:
> diff --git a/drivers/net/ipa/ipa_reg.c b/drivers/net/ipa/ipa_reg.c
> new file mode 100644
> index 000000000000..5e0aa6163235
> --- /dev/null
> +++ b/drivers/net/ipa/ipa_reg.c
> @@ -0,0 +1,972 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
> + * Copyright (C) 2018 Linaro Ltd.
> + */
> +
> +#include <linux/types.h>
> +#include <linux/io.h>
> +#include <linux/bitfield.h>
> +
> +#include "ipa_reg.h"
> +
> +/* I/O remapped base address of IPA register space */
> +static void __iomem *ipa_reg_virt;

This should of course be part of the device structure.

> +/* struct ipa_reg_desc - descriptor for an abstracted hardware register
> + *
> + * @construct - fn to construct the register value from its field structure
> + * @parse - function to parse register field values into its field structure
> + * @offset - register offset relative to base address
> + * @n_ofst - size multiplier for "N-parameterized" registers
> + */
> +struct ipa_reg_desc {
> +       u32 (*construct)(enum ipa_reg reg, const void *fields);
> +       void (*parse)(enum ipa_reg reg, void *fields, u32 val);
> +       u32 offset;
> +       u16 n_ofst;
> +};

Indirect function pointers can be a bit expensive in the post-spectre
days. It's probably not overly important if these are always part of
an MMIO access function, but you should be careful about using
these in the data path.

How many different versions do we have to support in practice?

       Arnd

^ permalink raw reply

* Re: [RFC PATCH 10/12] soc: qcom: ipa: data path
From: Arnd Bergmann @ 2018-11-07 14:55 UTC (permalink / raw)
  To: Alex Elder
  Cc: David Miller, Bjorn Andersson, Ilias Apalodimas, Networking, DTML,
	linux-arm-msm, linux-soc, Linux ARM, Linux Kernel Mailing List,
	syadagir, mjavid, Rob Herring, Mark Rutland
In-Reply-To: <20181107003250.5832-11-elder@linaro.org>

On Wed, Nov 7, 2018 at 1:33 AM Alex Elder <elder@linaro.org> wrote:
>
> This patch contains "ipa_dp.c", which includes the bulk of the data
> path code.  There is an overview in the code of how things operate,
> but there are already plans to rework this portion of the driver.
>
> In particular:
>   - Interrupt handling will be replaced with a threaded interrupt
>     handler.  Currently handling occurs in a combination of
>     interrupt and workqueue context, and this requires locking
>     and atomic operations for proper synchronization.

You probably don't want to use just a threaded IRQ handler to
start the poll function, that would still require an extra indirection.

However, you can probably use the top half of the threaded
handler to request the poll function if necessary but use
the bottom half for anything that does not go through poll.

>   - Currently, only receive endpoints use NAPI.  Transmit
>     completion interrupts are disabled, and are handled in batches
>     by periodically scheduling an interrupting no-op request.
>     The plan is to arrange for transmit requests to generate
>     interrupts, and their completion will be processed with other
>     completions in the NAPI poll function.  This will also allow
>     accurate feedback about packet sojourn time to be provided to
>     queue limiting mechanisms.

Right, that is definitely required here. I also had a look at
the gsi_channel_queue() function, which sits in the middle of
the transmit function and is rather unoptimized. I'd suggest moving
that into the caller so we can see what is going on, and then
optimizing it from there.

>   - Not all receive endpoints use NAPI.  The plan is for *all*
>     endpoints to use NAPI.  And because all endpoints share a
>     common GSI interrupt, a single NAPI structure will be used to
>     manage the processing for all completions on all endpoints.
>   - Receive buffers are posted to the hardware by a workqueue
>     function.  Instead, the plan is to have this done by the
>     NAPI poll routine.

Makes sense, yes.

      Arnd

^ permalink raw reply

* Re: [PATCH net-next 03/11] vxlan: Allow configuration of DF behaviour
From: Stephen Hemminger @ 2018-11-07  5:00 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: David S. Miller, Sabrina Dubroca, Xin Long, netdev
In-Reply-To: <1922d1e13074e73435724523f901f2c97eb3a764.1541533786.git.sbrivio@redhat.com>

On Tue,  6 Nov 2018 22:38:59 +0100
Stefano Brivio <sbrivio@redhat.com> wrote:

> 			df = htons(IP_DF);
>  		}
>  
> +		if (!df) {
> +			if (vxlan->cfg.df == VXLAN_DF_SET) {
> +				df = htons(IP_DF);

I am confused, this looks like this new flag is duplicating the existing tunnel DF flag.
(in info->key.tun.flags). Why is another flag needed?

^ permalink raw reply

* [PATCH] dpaa_eth: add ethtool coalesce control
From: Madalin Bucur @ 2018-11-07 13:53 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel, Madalin Bucur

Allow ethtool control of the DPAA QMan portal interrupt coalescing
settings.

Signed-off-by: Madalin Bucur <madalin.bucur@nxp.com>
---
 drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c | 41 ++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
index 13d6e2272ece..548a7e8893d8 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
@@ -529,6 +529,45 @@ static int dpaa_get_ts_info(struct net_device *net_dev,
 	return 0;
 }
 
+static int dpaa_get_coalesce(struct net_device *dev,
+			     struct ethtool_coalesce *c)
+{
+	struct qman_portal *portal;
+	u32 period;
+	u8 thresh;
+
+	portal = qman_get_affine_portal(smp_processor_id());
+	qman_portal_get_iperiod(portal, &period);
+	qman_dqrr_get_ithresh(portal, &thresh);
+
+	c->rx_coalesce_usecs = period;
+	c->rx_max_coalesced_frames = thresh;
+	c->use_adaptive_rx_coalesce = false;
+
+	return 0;
+}
+
+static int dpaa_set_coalesce(struct net_device *dev,
+			     struct ethtool_coalesce *c)
+{
+	const cpumask_t *cpus = qman_affine_cpus();
+	struct qman_portal *portal;
+	u32 period;
+	u8 thresh;
+	int cpu;
+
+	period = c->rx_coalesce_usecs;
+	thresh = c->rx_max_coalesced_frames;
+
+	for_each_cpu(cpu, cpus) {
+		portal = qman_get_affine_portal(cpu);
+		qman_portal_set_iperiod(portal, period);
+		qman_dqrr_set_ithresh(portal, thresh);
+	}
+
+	return 0;
+}
+
 const struct ethtool_ops dpaa_ethtool_ops = {
 	.get_drvinfo = dpaa_get_drvinfo,
 	.get_msglevel = dpaa_get_msglevel,
@@ -545,4 +584,6 @@ const struct ethtool_ops dpaa_ethtool_ops = {
 	.get_rxnfc = dpaa_get_rxnfc,
 	.set_rxnfc = dpaa_set_rxnfc,
 	.get_ts_info = dpaa_get_ts_info,
+	.get_coalesce = dpaa_get_coalesce,
+	.set_coalesce = dpaa_set_coalesce,
 };
-- 
2.1.0

^ permalink raw reply related

* [PATCH net-next 2/2] br_netfilter: namespace bridge netfilter sysctls
From: Christian Brauner @ 2018-11-07 13:48 UTC (permalink / raw)
  To: davem, netdev, linux-kernel, netfilter-devel, coreteam, bridge
  Cc: tyhicks, pablo, kadlec, fw, roopa, nikolay, Christian Brauner
In-Reply-To: <20181107134859.19896-1-christian@brauner.io>

Currently, the /proc/sys/net/bridge folder is only created in the initial
network namespace. This patch ensures that the /proc/sys/net/bridge folder
is available in each network namespace if the module is loaded and
disappears from all network namespaces when the module is unloaded.

In doing so the patch makes the sysctls:

bridge-nf-call-arptables
bridge-nf-call-ip6tables
bridge-nf-call-iptables
bridge-nf-filter-pppoe-tagged
bridge-nf-filter-vlan-tagged
bridge-nf-pass-vlan-input-dev

apply per network namespace. This unblocks some use-cases where users would
like to e.g. not do bridge filtering for bridges in a specific network
namespace while doing so for bridges located in another network namespace.

The netfilter rules are afaict already per network namespace so it should
be safe for users to specify whether bridge devices inside a network
namespace are supposed to go through iptables et al. or not. Also, this can
already be done per-bridge by setting an option for each individual bridge
via Netlink. It should also be possible to do this for all bridges in a
network namespace via sysctls.

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: Tyler Hicks <tyhicks@canonical.com>
---
 include/net/netfilter/br_netfilter.h |   3 +-
 net/bridge/br_netfilter_hooks.c      | 116 ++++++++++++++++++++-------
 net/bridge/br_netfilter_ipv6.c       |   2 +-
 3 files changed, 91 insertions(+), 30 deletions(-)

diff --git a/include/net/netfilter/br_netfilter.h b/include/net/netfilter/br_netfilter.h
index 74af19c3a8f7..e51f5961272b 100644
--- a/include/net/netfilter/br_netfilter.h
+++ b/include/net/netfilter/br_netfilter.h
@@ -48,7 +48,8 @@ static inline struct rtable *bridge_parent_rtable(const struct net_device *dev)
 	return port ? &port->br->fake_rtable : NULL;
 }
 
-struct net_device *setup_pre_routing(struct sk_buff *skb);
+struct net_device *setup_pre_routing(struct sk_buff *skb,
+				     const struct net *net);
 void br_netfilter_enable(void);
 
 #if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 656a084f4825..8a33268f2750 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -72,17 +72,17 @@ static inline __be16 vlan_proto(const struct sk_buff *skb)
 		return 0;
 }
 
-#define IS_VLAN_IP(skb) \
+#define IS_VLAN_IP(skb, net) \
 	(vlan_proto(skb) == htons(ETH_P_IP) && \
-	 init_net.brnf.filter_vlan_tagged)
+	 net->brnf.filter_vlan_tagged)
 
-#define IS_VLAN_IPV6(skb) \
+#define IS_VLAN_IPV6(skb, net) \
 	(vlan_proto(skb) == htons(ETH_P_IPV6) && \
-	 init_net.brnf.filter_vlan_tagged)
+	 net->brnf.filter_vlan_tagged)
 
-#define IS_VLAN_ARP(skb) \
+#define IS_VLAN_ARP(skb, net) \
 	(vlan_proto(skb) == htons(ETH_P_ARP) &&	\
-	 init_net.brnf.filter_vlan_tagged)
+	 net->brnf.filter_vlan_tagged)
 
 static inline __be16 pppoe_proto(const struct sk_buff *skb)
 {
@@ -90,15 +90,15 @@ static inline __be16 pppoe_proto(const struct sk_buff *skb)
 			    sizeof(struct pppoe_hdr)));
 }
 
-#define IS_PPPOE_IP(skb) \
+#define IS_PPPOE_IP(skb, net) \
 	(skb->protocol == htons(ETH_P_PPP_SES) && \
 	 pppoe_proto(skb) == htons(PPP_IP) && \
-	 init_net.brnf.filter_pppoe_tagged)
+	 net->brnf.filter_pppoe_tagged)
 
-#define IS_PPPOE_IPV6(skb) \
+#define IS_PPPOE_IPV6(skb, net) \
 	(skb->protocol == htons(ETH_P_PPP_SES) && \
 	 pppoe_proto(skb) == htons(PPP_IPV6) && \
-	 init_net.brnf.filter_pppoe_tagged)
+	 net->brnf.filter_pppoe_tagged)
 
 /* largest possible L2 header, see br_nf_dev_queue_xmit() */
 #define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN)
@@ -408,12 +408,14 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_
 	return 0;
 }
 
-static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct net_device *dev)
+static struct net_device *brnf_get_logical_dev(struct sk_buff *skb,
+					       const struct net_device *dev,
+					       const struct net *net)
 {
 	struct net_device *vlan, *br;
 
 	br = bridge_parent(dev);
-	if (init_net.brnf.pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
+	if (net->brnf.pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
 		return br;
 
 	vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto,
@@ -423,7 +425,7 @@ static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct
 }
 
 /* Some common code for IPv4/IPv6 */
-struct net_device *setup_pre_routing(struct sk_buff *skb)
+struct net_device *setup_pre_routing(struct sk_buff *skb, const struct net *net)
 {
 	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
 
@@ -434,7 +436,7 @@ struct net_device *setup_pre_routing(struct sk_buff *skb)
 
 	nf_bridge->in_prerouting = 1;
 	nf_bridge->physindev = skb->dev;
-	skb->dev = brnf_get_logical_dev(skb, skb->dev);
+	skb->dev = brnf_get_logical_dev(skb, skb->dev, net);
 
 	if (skb->protocol == htons(ETH_P_8021Q))
 		nf_bridge->orig_proto = BRNF_PROTO_8021Q;
@@ -469,8 +471,9 @@ static unsigned int br_nf_pre_routing(void *priv,
 		return NF_DROP;
 	br = p->br;
 
-	if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) {
-		if (!init_net.brnf.call_ip6tables &&
+	if (IS_IPV6(skb) || IS_VLAN_IPV6(skb, state->net) ||
+	    IS_PPPOE_IPV6(skb, state->net)) {
+		if (!state->net->brnf.call_ip6tables &&
 		    !br_opt_get(br, BROPT_NF_CALL_IP6TABLES))
 			return NF_ACCEPT;
 
@@ -478,11 +481,12 @@ static unsigned int br_nf_pre_routing(void *priv,
 		return br_nf_pre_routing_ipv6(priv, skb, state);
 	}
 
-	if (!init_net.brnf.call_iptables &&
+	if (!state->net->brnf.call_iptables &&
 	    !br_opt_get(br, BROPT_NF_CALL_IPTABLES))
 		return NF_ACCEPT;
 
-	if (!IS_IP(skb) && !IS_VLAN_IP(skb) && !IS_PPPOE_IP(skb))
+	if (!IS_IP(skb) && !IS_VLAN_IP(skb, state->net) &&
+	    !IS_PPPOE_IP(skb, state->net))
 		return NF_ACCEPT;
 
 	nf_bridge_pull_encap_header_rcsum(skb);
@@ -493,7 +497,7 @@ static unsigned int br_nf_pre_routing(void *priv,
 	nf_bridge_put(skb->nf_bridge);
 	if (!nf_bridge_alloc(skb))
 		return NF_DROP;
-	if (!setup_pre_routing(skb))
+	if (!setup_pre_routing(skb, state->net))
 		return NF_DROP;
 
 	nf_bridge = nf_bridge_info_get(skb);
@@ -515,7 +519,7 @@ static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff
 	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
 	struct net_device *in;
 
-	if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) {
+	if (!IS_ARP(skb) && !IS_VLAN_ARP(skb, net)) {
 
 		if (skb->protocol == htons(ETH_P_IP))
 			nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;
@@ -569,9 +573,11 @@ static unsigned int br_nf_forward_ip(void *priv,
 	if (!parent)
 		return NF_DROP;
 
-	if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb))
+	if (IS_IP(skb) || IS_VLAN_IP(skb, state->net) ||
+	    IS_PPPOE_IP(skb, state->net))
 		pf = NFPROTO_IPV4;
-	else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb))
+	else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb, state->net) ||
+		 IS_PPPOE_IPV6(skb, state->net))
 		pf = NFPROTO_IPV6;
 	else
 		return NF_ACCEPT;
@@ -602,7 +608,7 @@ static unsigned int br_nf_forward_ip(void *priv,
 		skb->protocol = htons(ETH_P_IPV6);
 
 	NF_HOOK(pf, NF_INET_FORWARD, state->net, NULL, skb,
-		brnf_get_logical_dev(skb, state->in),
+		brnf_get_logical_dev(skb, state->in, state->net),
 		parent,	br_nf_forward_finish);
 
 	return NF_STOLEN;
@@ -621,18 +627,18 @@ static unsigned int br_nf_forward_arp(void *priv,
 		return NF_ACCEPT;
 	br = p->br;
 
-	if (!init_net.brnf.call_arptables &&
+	if (!state->net->brnf.call_arptables &&
 	    !br_opt_get(br, BROPT_NF_CALL_ARPTABLES))
 		return NF_ACCEPT;
 
 	if (!IS_ARP(skb)) {
-		if (!IS_VLAN_ARP(skb))
+		if (!IS_VLAN_ARP(skb, state->net))
 			return NF_ACCEPT;
 		nf_bridge_pull_encap_header(skb);
 	}
 
 	if (arp_hdr(skb)->ar_pln != 4) {
-		if (IS_VLAN_ARP(skb))
+		if (IS_VLAN_ARP(skb, state->net))
 			nf_bridge_push_encap_header(skb);
 		return NF_ACCEPT;
 	}
@@ -787,9 +793,11 @@ static unsigned int br_nf_post_routing(void *priv,
 	if (!realoutdev)
 		return NF_DROP;
 
-	if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb))
+	if (IS_IP(skb) || IS_VLAN_IP(skb, state->net) ||
+	    IS_PPPOE_IP(skb, state->net))
 		pf = NFPROTO_IPV4;
-	else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb))
+	else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb, state->net) ||
+		 IS_PPPOE_IPV6(skb, state->net))
 		pf = NFPROTO_IPV6;
 	else
 		return NF_ACCEPT;
@@ -1071,6 +1079,49 @@ static inline void br_netfilter_sysctl_default(struct netns_brnf *brnf)
 	brnf->pass_vlan_indev = 0;
 }
 
+static __net_init int br_netfilter_sysctl_init_net(struct net *net)
+{
+	struct ctl_table *table = brnf_table;
+
+	if (net_eq(net, &init_net))
+		return 0;
+
+	table = kmemdup(table, sizeof(brnf_table), GFP_KERNEL);
+	if (!table)
+		return -ENOMEM;
+
+	table[0].data = &net->brnf.call_arptables;
+	table[1].data = &net->brnf.call_iptables;
+	table[2].data = &net->brnf.call_ip6tables;
+	table[3].data = &net->brnf.filter_vlan_tagged;
+	table[4].data = &net->brnf.filter_pppoe_tagged;
+	table[5].data = &net->brnf.pass_vlan_indev;
+
+	net->brnf.ctl_hdr = register_net_sysctl(net, "net/bridge", table);
+	if (!net->brnf.ctl_hdr) {
+		kfree(table);
+		return -ENOMEM;
+	}
+
+	br_netfilter_sysctl_default(&net->brnf);
+
+	return 0;
+}
+
+static __net_exit void br_netfilter_sysctl_exit_net(struct net *net)
+{
+	if (net_eq(net, &init_net))
+		return;
+
+	unregister_net_sysctl_table(net->brnf.ctl_hdr);
+	kfree(net->brnf.ctl_hdr->ctl_table_arg);
+}
+
+static struct pernet_operations br_netfilter_sysctl_ops = {
+	.init = br_netfilter_sysctl_init_net,
+	.exit = br_netfilter_sysctl_exit_net,
+};
+
 static int __init br_netfilter_init(void)
 {
 	int ret;
@@ -1097,6 +1148,14 @@ static int __init br_netfilter_init(void)
 		unregister_pernet_subsys(&brnf_net_ops);
 		return -ENOMEM;
 	}
+
+	ret = register_pernet_subsys(&br_netfilter_sysctl_ops);
+	if (ret < 0) {
+		unregister_netdevice_notifier(&brnf_notifier);
+		unregister_pernet_subsys(&brnf_net_ops);
+		unregister_net_sysctl_table(init_net.brnf.ctl_hdr);
+		return ret;
+	}
 #endif
 	RCU_INIT_POINTER(nf_br_ops, &br_ops);
 	printk(KERN_NOTICE "Bridge firewalling registered\n");
@@ -1110,6 +1169,7 @@ static void __exit br_netfilter_fini(void)
 	unregister_pernet_subsys(&brnf_net_ops);
 #ifdef CONFIG_SYSCTL
 	unregister_net_sysctl_table(init_net.brnf.ctl_hdr);
+	unregister_pernet_subsys(&br_netfilter_sysctl_ops);
 #endif
 }
 
diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c
index 96c072e71ea2..d2220e502b6f 100644
--- a/net/bridge/br_netfilter_ipv6.c
+++ b/net/bridge/br_netfilter_ipv6.c
@@ -227,7 +227,7 @@ unsigned int br_nf_pre_routing_ipv6(void *priv,
 	nf_bridge_put(skb->nf_bridge);
 	if (!nf_bridge_alloc(skb))
 		return NF_DROP;
-	if (!setup_pre_routing(skb))
+	if (!setup_pre_routing(skb, state->net))
 		return NF_DROP;
 
 	nf_bridge = nf_bridge_info_get(skb);
-- 
2.19.1

^ permalink raw reply related

* [PATCH net-next 1/2] br_netfilter: add struct netns_brnf
From: Christian Brauner @ 2018-11-07 13:48 UTC (permalink / raw)
  To: davem, netdev, linux-kernel, netfilter-devel, coreteam, bridge
  Cc: tyhicks, pablo, kadlec, fw, roopa, nikolay, Christian Brauner
In-Reply-To: <20181107134859.19896-1-christian@brauner.io>

This adds struct netns_brnf in preparation for per-network-namespace
br_netfilter settings. The individual br_netfilter sysctl options are moved
into a central place in struct net. The struct is only included when
the CONFIG_BRIDGE_NETFILTER kconfig option is enabled in the kernel.

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: Tyler Hicks <tyhicks@canonical.com>
---
 include/net/net_namespace.h     |  3 ++
 include/net/netns/netfilter.h   | 16 ++++++++
 net/bridge/br_netfilter_hooks.c | 68 ++++++++++++++++-----------------
 3 files changed, 52 insertions(+), 35 deletions(-)

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 99d4148e0f90..bea0474cd3ea 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -125,6 +125,9 @@ struct net {
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 	struct netns_ct		ct;
 #endif
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+	struct netns_brnf	brnf;
+#endif
 #if defined(CONFIG_NF_TABLES) || defined(CONFIG_NF_TABLES_MODULE)
 	struct netns_nftables	nft;
 #endif
diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h
index ca043342c0eb..eedbd1ac940e 100644
--- a/include/net/netns/netfilter.h
+++ b/include/net/netns/netfilter.h
@@ -35,4 +35,20 @@ struct netns_nf {
 	bool			defrag_ipv6;
 #endif
 };
+
+struct netns_brnf {
+#ifdef CONFIG_SYSCTL
+	struct ctl_table_header *ctl_hdr;
+#endif
+
+	/* default value is 1 */
+	int call_iptables;
+	int call_ip6tables;
+	int call_arptables;
+
+	/* default value is 0 */
+	int filter_vlan_tagged;
+	int filter_pppoe_tagged;
+	int pass_vlan_indev;
+};
 #endif
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index b1b5e8516724..656a084f4825 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -53,23 +53,6 @@ struct brnf_net {
 	bool enabled;
 };
 
-#ifdef CONFIG_SYSCTL
-static struct ctl_table_header *brnf_sysctl_header;
-static int brnf_call_iptables __read_mostly = 1;
-static int brnf_call_ip6tables __read_mostly = 1;
-static int brnf_call_arptables __read_mostly = 1;
-static int brnf_filter_vlan_tagged __read_mostly;
-static int brnf_filter_pppoe_tagged __read_mostly;
-static int brnf_pass_vlan_indev __read_mostly;
-#else
-#define brnf_call_iptables 1
-#define brnf_call_ip6tables 1
-#define brnf_call_arptables 1
-#define brnf_filter_vlan_tagged 0
-#define brnf_filter_pppoe_tagged 0
-#define brnf_pass_vlan_indev 0
-#endif
-
 #define IS_IP(skb) \
 	(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP))
 
@@ -91,15 +74,15 @@ static inline __be16 vlan_proto(const struct sk_buff *skb)
 
 #define IS_VLAN_IP(skb) \
 	(vlan_proto(skb) == htons(ETH_P_IP) && \
-	 brnf_filter_vlan_tagged)
+	 init_net.brnf.filter_vlan_tagged)
 
 #define IS_VLAN_IPV6(skb) \
 	(vlan_proto(skb) == htons(ETH_P_IPV6) && \
-	 brnf_filter_vlan_tagged)
+	 init_net.brnf.filter_vlan_tagged)
 
 #define IS_VLAN_ARP(skb) \
 	(vlan_proto(skb) == htons(ETH_P_ARP) &&	\
-	 brnf_filter_vlan_tagged)
+	 init_net.brnf.filter_vlan_tagged)
 
 static inline __be16 pppoe_proto(const struct sk_buff *skb)
 {
@@ -110,12 +93,12 @@ static inline __be16 pppoe_proto(const struct sk_buff *skb)
 #define IS_PPPOE_IP(skb) \
 	(skb->protocol == htons(ETH_P_PPP_SES) && \
 	 pppoe_proto(skb) == htons(PPP_IP) && \
-	 brnf_filter_pppoe_tagged)
+	 init_net.brnf.filter_pppoe_tagged)
 
 #define IS_PPPOE_IPV6(skb) \
 	(skb->protocol == htons(ETH_P_PPP_SES) && \
 	 pppoe_proto(skb) == htons(PPP_IPV6) && \
-	 brnf_filter_pppoe_tagged)
+	 init_net.brnf.filter_pppoe_tagged)
 
 /* largest possible L2 header, see br_nf_dev_queue_xmit() */
 #define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN)
@@ -430,7 +413,7 @@ static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct
 	struct net_device *vlan, *br;
 
 	br = bridge_parent(dev);
-	if (brnf_pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
+	if (init_net.brnf.pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
 		return br;
 
 	vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto,
@@ -487,7 +470,7 @@ static unsigned int br_nf_pre_routing(void *priv,
 	br = p->br;
 
 	if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) {
-		if (!brnf_call_ip6tables &&
+		if (!init_net.brnf.call_ip6tables &&
 		    !br_opt_get(br, BROPT_NF_CALL_IP6TABLES))
 			return NF_ACCEPT;
 
@@ -495,7 +478,8 @@ static unsigned int br_nf_pre_routing(void *priv,
 		return br_nf_pre_routing_ipv6(priv, skb, state);
 	}
 
-	if (!brnf_call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES))
+	if (!init_net.brnf.call_iptables &&
+	    !br_opt_get(br, BROPT_NF_CALL_IPTABLES))
 		return NF_ACCEPT;
 
 	if (!IS_IP(skb) && !IS_VLAN_IP(skb) && !IS_PPPOE_IP(skb))
@@ -637,7 +621,8 @@ static unsigned int br_nf_forward_arp(void *priv,
 		return NF_ACCEPT;
 	br = p->br;
 
-	if (!brnf_call_arptables && !br_opt_get(br, BROPT_NF_CALL_ARPTABLES))
+	if (!init_net.brnf.call_arptables &&
+	    !br_opt_get(br, BROPT_NF_CALL_ARPTABLES))
 		return NF_ACCEPT;
 
 	if (!IS_ARP(skb)) {
@@ -1032,42 +1017,42 @@ int brnf_sysctl_call_tables(struct ctl_table *ctl, int write,
 static struct ctl_table brnf_table[] = {
 	{
 		.procname	= "bridge-nf-call-arptables",
-		.data		= &brnf_call_arptables,
+		.data		= &init_net.brnf.call_arptables,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= brnf_sysctl_call_tables,
 	},
 	{
 		.procname	= "bridge-nf-call-iptables",
-		.data		= &brnf_call_iptables,
+		.data		= &init_net.brnf.call_iptables,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= brnf_sysctl_call_tables,
 	},
 	{
 		.procname	= "bridge-nf-call-ip6tables",
-		.data		= &brnf_call_ip6tables,
+		.data		= &init_net.brnf.call_ip6tables,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= brnf_sysctl_call_tables,
 	},
 	{
 		.procname	= "bridge-nf-filter-vlan-tagged",
-		.data		= &brnf_filter_vlan_tagged,
+		.data		= &init_net.brnf.filter_vlan_tagged,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= brnf_sysctl_call_tables,
 	},
 	{
 		.procname	= "bridge-nf-filter-pppoe-tagged",
-		.data		= &brnf_filter_pppoe_tagged,
+		.data		= &init_net.brnf.filter_pppoe_tagged,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= brnf_sysctl_call_tables,
 	},
 	{
 		.procname	= "bridge-nf-pass-vlan-input-dev",
-		.data		= &brnf_pass_vlan_indev,
+		.data		= &init_net.brnf.pass_vlan_indev,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= brnf_sysctl_call_tables,
@@ -1076,6 +1061,16 @@ static struct ctl_table brnf_table[] = {
 };
 #endif
 
+static inline void br_netfilter_sysctl_default(struct netns_brnf *brnf)
+{
+	brnf->call_iptables = 1;
+	brnf->call_ip6tables = 1;
+	brnf->call_arptables = 1;
+	brnf->filter_vlan_tagged = 0;
+	brnf->filter_pppoe_tagged = 0;
+	brnf->pass_vlan_indev = 0;
+}
+
 static int __init br_netfilter_init(void)
 {
 	int ret;
@@ -1090,9 +1085,12 @@ static int __init br_netfilter_init(void)
 		return ret;
 	}
 
+	/* Always set default values. Even if CONFIG_SYSCTL is not set. */
+	br_netfilter_sysctl_default(&init_net.brnf);
+
 #ifdef CONFIG_SYSCTL
-	brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table);
-	if (brnf_sysctl_header == NULL) {
+	init_net.brnf.ctl_hdr = register_net_sysctl(&init_net, "net/bridge", brnf_table);
+	if (!init_net.brnf.ctl_hdr) {
 		printk(KERN_WARNING
 		       "br_netfilter: can't register to sysctl.\n");
 		unregister_netdevice_notifier(&brnf_notifier);
@@ -1111,7 +1109,7 @@ static void __exit br_netfilter_fini(void)
 	unregister_netdevice_notifier(&brnf_notifier);
 	unregister_pernet_subsys(&brnf_net_ops);
 #ifdef CONFIG_SYSCTL
-	unregister_net_sysctl_table(brnf_sysctl_header);
+	unregister_net_sysctl_table(init_net.brnf.ctl_hdr);
 #endif
 }
 
-- 
2.19.1

^ permalink raw reply related

* [PATCH net-next 0/2] br_netfilter: enable in non-initial netns
From: Christian Brauner @ 2018-11-07 13:48 UTC (permalink / raw)
  To: davem, netdev, linux-kernel, netfilter-devel, coreteam, bridge
  Cc: tyhicks, pablo, kadlec, fw, roopa, nikolay, Christian Brauner

Hey everyone,

Over time I have seen multiple reports by users who want to run applications
(Kubernetes e.g. via [1]) that require the br_netfilter module in
non-initial network namespaces [2], [3], [4], [5] (there are more issues
where this requirement is reported).
Currently, the /proc/sys/net/bridge folder is only created in the
initial network namespace. This patch series ensures that the
/proc/sys/net/bridge folder is available in each network namespace if
the module is loaded and disappears from all network namespaces when the
module is unloaded.
The patch series also makes the sysctls:

bridge-nf-call-arptables
bridge-nf-call-ip6tables
bridge-nf-call-iptables
bridge-nf-filter-pppoe-tagged
bridge-nf-filter-vlan-tagged
bridge-nf-pass-vlan-input-dev

apply per network namespace. This unblocks some use-cases where users
would like to e.g. not do bridge filtering for bridges in a specific
network namespace while doing so for bridges located in another network
namespace.
The netfilter rules are afaict already per network namespace so it
should be safe for users to specify whether a bridge device inside their
network namespace is supposed to go through iptables et al. or not.
Also, this can already be done by setting an option for each individual
bridge via Netlink. It should also be possible to do this for all
bridges in a network namespace via sysctls.

Thanks!
Christian

[1]: https://github.com/zimmertr/Bootstrap-Kubernetes-with-Ansible
[2]: https://github.com/lxc/lxd/issues/5193
[3]: https://discuss.linuxcontainers.org/t/bridge-nf-call-iptables-and-swap-error-on-lxd-with-kubeadm/2204
[4]: https://github.com/lxc/lxd/issues/3306
[5]: https://gitlab.com/gitlab-org/gitlab-runner/issues/3705

Christian Brauner (2):
  br_netfilter: add struct netns_brnf
  br_netfilter: namespace bridge netfilter sysctls

 include/net/net_namespace.h          |   3 +
 include/net/netfilter/br_netfilter.h |   3 +-
 include/net/netns/netfilter.h        |  16 +++
 net/bridge/br_netfilter_hooks.c      | 166 ++++++++++++++++++---------
 net/bridge/br_netfilter_ipv6.c       |   2 +-
 5 files changed, 134 insertions(+), 56 deletions(-)

-- 
2.19.1

^ permalink raw reply

* Re: [RFC PATCH 11/12] soc: qcom: ipa: IPA rmnet interface
From: Arnd Bergmann @ 2018-11-07 13:30 UTC (permalink / raw)
  To: Alex Elder
  Cc: David Miller, Bjorn Andersson, Ilias Apalodimas, Networking, DTML,
	linux-arm-msm, linux-soc, Linux ARM, Linux Kernel Mailing List,
	syadagir, mjavid, Rob Herring, Mark Rutland
In-Reply-To: <20181107003250.5832-12-elder@linaro.org>

On Wed, Nov 7, 2018 at 1:33 AM Alex Elder <elder@linaro.org> wrote:

> Note:  This portion of the driver will be heavily affected by
> planned rework on the data path code.

Ok. I don't think the ioctl interface has a real chance of getting merged
into the kernel. You should generally not require any custom user space
tools for a driver like this.

> diff --git a/drivers/net/ipa/msm_rmnet.h b/drivers/net/ipa/msm_rmnet.h
> new file mode 100644
> index 000000000000..042380fd53fb
> --- /dev/null
> +++ b/drivers/net/ipa/msm_rmnet.h

Just for the record: if we really wanted to define ioctls, this would go
into 'include/uapi/linux/msm_rmnet.h' and get installed into the
/usr/include hierarchy on all machines.

> +
> +#define RMNET_IOCTL_SET_LLP_ETHERNET 0x000089f1 /* Set Ethernet protocol  */
> +#define RMNET_IOCTL_SET_LLP_IP      0x000089f2 /* Set RAWIP protocol     */
> +#define RMNET_IOCTL_GET_LLP         0x000089f3 /* Get link protocol      */
> +#define RMNET_IOCTL_SET_QOS_ENABLE   0x000089f4 /* Set QoS header enabled */
> +#define RMNET_IOCTL_SET_QOS_DISABLE  0x000089f5 /* Set QoS header disabled*/
> +#define RMNET_IOCTL_GET_QOS         0x000089f6 /* Get QoS header state   */
> +#define RMNET_IOCTL_GET_OPMODE      0x000089f7 /* Get operation mode     */

And the commands would be defined using _IOC/_IOR/_IOW macros that
document which arguments they take.


> +#define RMNET_IOCTL_OPEN            0x000089f8 /* Open transport port    */
> +#define RMNET_IOCTL_CLOSE           0x000089f9 /* Close transport port   */
> +#define RMNET_IOCTL_FLOW_ENABLE             0x000089fa /* Flow enable            */
> +#define RMNET_IOCTL_FLOW_DISABLE     0x000089fb /* Flow disable                  */
> +#define RMNET_IOCTL_FLOW_SET_HNDL    0x000089fc /* Set flow handle       */
> +#define RMNET_IOCTL_EXTENDED        0x000089fd /* Extended IOCTLs        */

'extended' interfaces are obviously out of the question entirely, those
would all need to be separate commands.


> +/* User space may not have this defined. */
> +#ifndef IFNAMSIZ
> +#define IFNAMSIZ 16
> +#endif

This is in <linux/if.h>

> +struct rmnet_ioctl_extended_s {
> +       u32     extended_ioctl;
> +       union {

And unions in the ioctl interfaces also wouldn't work.

> +static bool initialized;       /* Avoid duplicate initialization */
> +
> +static struct rmnet_ipa_context rmnet_ipa_ctx_struct;
> +static struct rmnet_ipa_context *rmnet_ipa_ctx = &rmnet_ipa_ctx_struct;

Global variables like these should be removed.

> +/** ipa_wwan_xmit() - Transmits an skb.
> + *
> + * @skb: skb to be transmitted
> + * @dev: network device
> + *
> + * Return codes:
> + * NETDEV_TX_OK: Success
> + * NETDEV_TX_BUSY: Error while transmitting the skb. Try again later
> + */
> +static int ipa_wwan_xmit(struct sk_buff *skb, struct net_device *dev)
> +{
> +       struct ipa_wwan_private *wwan_ptr = netdev_priv(dev);
> +       unsigned int skb_len;
> +       int outstanding;
> +
> +       if (skb->protocol != htons(ETH_P_MAP)) {
> +               dev_kfree_skb_any(skb);
> +               dev->stats.tx_dropped++;
> +               return NETDEV_TX_OK;
> +       }
> +
> +       /* Control packets are sent even if queue is stopped.  We
> +        * always honor the data and control high-water marks.
> +        */
> +       outstanding = atomic_read(&wwan_ptr->outstanding_pkts);
> +       if (!RMNET_MAP_GET_CD_BIT(skb)) {       /* Data packet? */
> +               if (netif_queue_stopped(dev))
> +                       return NETDEV_TX_BUSY;
> +               if (outstanding >= wwan_ptr->outstanding_high)
> +                       return NETDEV_TX_BUSY;
> +       } else if (outstanding >= wwan_ptr->outstanding_high_ctl) {
> +               return NETDEV_TX_BUSY;
> +       }

This seems to be a poor reimplementation of BQL. Better
use netdev_sent_queue() and netdev_completed_queue()
to do the same thing better.

> +/** apps_ipa_packet_receive_notify() - Rx notify
> + *
> + * @priv: driver context
> + * @evt: event type
> + * @data: data provided with event
> + *
> + * IPA will pass a packet to the Linux network stack with skb->data
> + */
> +static void apps_ipa_packet_receive_notify(void *priv, enum ipa_dp_evt_type evt,
> +                                          unsigned long data)
> +{
> +       struct ipa_wwan_private *wwan_ptr;
> +       struct net_device *dev = priv;
> +
> +       wwan_ptr = netdev_priv(dev);
> +       if (evt == IPA_RECEIVE) {
> +               struct sk_buff *skb = (struct sk_buff *)data;
> +               int ret;
> +               unsigned int packet_len = skb->len;
> +
> +               skb->dev = rmnet_ipa_ctx->dev;
> +               skb->protocol = htons(ETH_P_MAP);
> +
> +               ret = netif_receive_skb(skb);
> +               if (ret) {
> +                       pr_err_ratelimited("fail on netif_receive_skb\n");
> +                       dev->stats.rx_dropped++;
> +               }
> +               dev->stats.rx_packets++;
> +               dev->stats.rx_bytes += packet_len;
> +       } else if (evt == IPA_CLIENT_START_POLL) {
> +               napi_schedule(&wwan_ptr->napi);
> +       } else if (evt == IPA_CLIENT_COMP_NAPI) {
> +               napi_complete(&wwan_ptr->napi);
> +       } else {
> +               ipa_err("Invalid evt %d received in wan_ipa_receive\n", evt);
> +       }
> +}

I don't understand the logic here. Why is this a callback function?
You normally want the data path to be as fast as possible, and the
indirection seems like it would get in the way of that.

Since the function doesn't do much interesting work, could
it be moved into the caller?

> +/** handle_ingress_format() - Ingress data format configuration */
> +static int handle_ingress_format(struct net_device *dev,
> +                                struct rmnet_ioctl_extended_s *in)
> +{

Can you describe how this would be called from user space?
I.e. what is the reason we have to configure anything here?


> +
> +       /* Unsupported requests */
> +       case RMNET_IOCTL_SET_MRU:                       /* Set MRU */
> +       case RMNET_IOCTL_GET_MRU:                       /* Get MRU */
> +       case RMNET_IOCTL_GET_AGGREGATION_COUNT:         /* Get agg count */
> +       case RMNET_IOCTL_SET_AGGREGATION_COUNT:         /* Set agg count */
> +       case RMNET_IOCTL_GET_AGGREGATION_SIZE:          /* Get agg size */
> +       case RMNET_IOCTL_SET_AGGREGATION_SIZE:          /* Set agg size */
> +       case RMNET_IOCTL_FLOW_CONTROL:                  /* Do flow control */
> +       case RMNET_IOCTL_GET_DFLT_CONTROL_CHANNEL:      /* For legacy use */
> +       case RMNET_IOCTL_GET_HWSW_MAP:                  /* Get HW/SW map */
> +       case RMNET_IOCTL_SET_RX_HEADROOM:               /* Set RX Headroom */
> +       case RMNET_IOCTL_SET_QOS_VERSION:               /* Set 8/6 byte QoS */
> +       case RMNET_IOCTL_GET_QOS_VERSION:               /* Get 8/6 byte QoS */
> +       case RMNET_IOCTL_GET_SUPPORTED_QOS_MODES:       /* Get QoS modes */
> +       case RMNET_IOCTL_SET_SLEEP_STATE:               /* Set sleep state */
> +       case RMNET_IOCTL_SET_XLAT_DEV_INFO:             /* xlat dev name */
> +       case RMNET_IOCTL_DEREGISTER_DEV:                /* Deregister netdev */
> +               return -ENOTSUPP;       /* Defined, but unsupported command */
> +
> +       default:
> +               return -EINVAL;         /* Invalid (unrecognized) command */
> +       }
> +
> +copy_out:
> +       return copy_to_user(data, &edata, size) ? -EFAULT : 0;
> +}
> +
> +/** ipa_wwan_ioctl() - I/O control for wwan network driver */
> +static int ipa_wwan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
> +{
> +       struct rmnet_ioctl_data_s ioctl_data = { };
> +       void __user *data;
> +       size_t size;
> +
> +       data = ifr->ifr_ifru.ifru_data;
> +       size = sizeof(ioctl_data);
> +
> +       switch (cmd) {
> +       /* These features are implied; alternatives are not supported */
> +       case RMNET_IOCTL_SET_LLP_IP:            /* RAW IP protocol */
> +       case RMNET_IOCTL_SET_QOS_DISABLE:       /* QoS header disabled */
> +               return 0;
> +
> +       /* These features are not supported; use alternatives */
> +       case RMNET_IOCTL_SET_LLP_ETHERNET:      /* Ethernet protocol */
> +       case RMNET_IOCTL_SET_QOS_ENABLE:        /* QoS header enabled */
> +       case RMNET_IOCTL_GET_OPMODE:            /* Get operation mode */
> +       case RMNET_IOCTL_FLOW_ENABLE:           /* Flow enable */
> +       case RMNET_IOCTL_FLOW_DISABLE:          /* Flow disable */
> +       case RMNET_IOCTL_FLOW_SET_HNDL:         /* Set flow handle */
> +               return -ENOTSUPP;
> +
> +       case RMNET_IOCTL_GET_LLP:               /* Get link protocol */
> +               ioctl_data.u.operation_mode = RMNET_MODE_LLP_IP;
> +               goto copy_out;
> +
> +       case RMNET_IOCTL_GET_QOS:               /* Get QoS header state */
> +               ioctl_data.u.operation_mode = RMNET_MODE_NONE;
> +               goto copy_out;
> +
> +       case RMNET_IOCTL_OPEN:                  /* Open transport port */
> +       case RMNET_IOCTL_CLOSE:                 /* Close transport port */
> +               return 0;
> +
> +       case RMNET_IOCTL_EXTENDED:              /* Extended IOCTLs */
> +               return ipa_wwan_ioctl_extended(dev, data);
> +
> +       default:
> +               return -EINVAL;
> +       }

It would help to remove everything that is a nop or not implemented
or that returns a constant value here, those are clearly not
relevant for the submission here.

> +
> +static const struct of_device_id rmnet_ipa_dt_match[] = {
> +       {.compatible = "qcom,rmnet-ipa"},
> +       {},
> +};

The match string looks overly generic, surely there must be plans
to have future versions of this that might require identification.

     Arnd

^ permalink raw reply

* Join the Illuminati Brotherhood
From: Bright Terry @ 2018-11-02  8:35 UTC (permalink / raw)


Greetings from the Illuminati order, Join the Illuminati Brotherhood for
fame, knowledge, wealth and powers.

^ permalink raw reply

* Re: [RFC PATCH 12/12] soc: qcom: ipa: build and "ipa_i.h"
From: Arnd Bergmann @ 2018-11-07 12:34 UTC (permalink / raw)
  To: Alex Elder
  Cc: David Miller, Bjorn Andersson, Ilias Apalodimas, Networking, DTML,
	linux-arm-msm, linux-soc, Linux ARM, Linux Kernel Mailing List,
	syadagir, mjavid, Rob Herring, Mark Rutland
In-Reply-To: <20181107003250.5832-13-elder@linaro.org>

On Wed, Nov 7, 2018 at 1:33 AM Alex Elder <elder@linaro.org> wrote:
> +config IPA_ASSERT
> +       bool "Enable IPA assertions"
> +       depends on IPA
> +       default y
> +       help
> +        Incorporate IPA assertion verification in the build.  This
> +        cause various design assumptions to be checked at runtime,
> +        generating a report (and a crash) if any assumed condition
> +        does not hold.  You may wish to disable this to avoid the
> +        overhead of checking.

Maybe remove this from the submission.

> +#define ipa_debug(fmt, args...)        dev_dbg(ipa_ctx->dev, fmt, ## args)
> +#define ipa_err(fmt, args...)  dev_err(ipa_ctx->dev, fmt, ## args)

These macros refer to variables in the caller that are not passed as arguments,
which is generally a bad idea. They also trivially wrap a standard kernel
interface, so better just use that directly.

> +#define ipa_bug() \
> +       do {                                                            \
> +               ipa_err("an unrecoverable error has occurred\n");       \
> +               BUG();                                                  \
> +       } while (0)
> +
> +#define ipa_bug_on(condition)                                          \
> +       do {                                                            \
> +               if (condition) {                                \
> +                       ipa_err("ipa_bug_on(%s) failed!\n", #condition); \
> +                       ipa_bug();                                      \
> +               }                                                       \
> +       } while (0)

According to a discussion at the kernel summit, we should generally
try to avoid BUG() as it rarely does anything useful: it crashes the
current task, but in a network driver that usually means killing the
entire kernel since you are not in process context.

Try questioning each one to see if it can possibly happen, or if the
code can be rewritten in a way to guarantee that it cannot.

If continuing after the bug was detected does not cause a security
hole or permanent data corruption, you can also use WARN_ON()
or WARN_ONCE() (without a wrapper).

> +int ipa_wwan_init(void);
> +void ipa_wwan_cleanup(void);
> +
> +int ipa_stop_gsi_channel(u32 ep_id);
> +
> +void ipa_cfg_ep(u32 ep_id);
> +
> +int ipa_tx_dp(enum ipa_client_type dst, struct sk_buff *skb);
> +
> +bool ipa_endp_aggr_support(u32 ep_id);
> +enum ipa_seq_type ipa_endp_seq_type(u32 ep_id);
> +
> +void ipa_endp_init_hdr_cons(u32 ep_id, u32 header_size,
> +                           u32 metadata_offset, u32 length_offset);
> +void ipa_endp_init_hdr_prod(u32 ep_id, u32 header_size,
> +                           u32 metadata_offset, u32 length_offset);

I'm surprised to see many functions that don't take a pointer
to an instance as the first argument, which often indicates
that you have global state variables and the driver won't
work with multiple hardware instances.

      Arnd

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox