Netdev List
 help / color / mirror / Atom feed
* Re: [RFC PATCH V2 2/2] vhost: device IOTLB API
From: Michael S. Tsirkin @ 2016-04-27 11:45 UTC (permalink / raw)
  To: Jason Wang
  Cc: kvm, qemu-devel, netdev, linux-kernel, peterx, virtualization,
	pbonzini
In-Reply-To: <1458873274-13961-3-git-send-email-jasowang@redhat.com>

On Fri, Mar 25, 2016 at 10:34:34AM +0800, Jason Wang wrote:
> This patch tries to implement an device IOTLB for vhost. This could be
> used with for co-operation with userspace(qemu) implementation of DMA
> remapping.
> 
> The idea is simple. When vhost meets an IOTLB miss, it will request
> the assistance of userspace to do the translation, this is done
> through:
> 
> - Fill the translation request in a preset userspace address (This
>   address is set through ioctl VHOST_SET_IOTLB_REQUEST_ENTRY).
> - Notify userspace through eventfd (This eventfd was set through ioctl
>   VHOST_SET_IOTLB_FD).

Why use an eventfd for this? We use them for interrupts because
that happens to be what kvm wants, but here - why don't we
just add a generic support for reading out events
on the vhost fd itself?

> - device IOTLB were started and stopped through VHOST_RUN_IOTLB ioctl
> 
> When userspace finishes the translation, it will update the vhost
> IOTLB through VHOST_UPDATE_IOTLB ioctl. Userspace is also in charge of
> snooping the IOTLB invalidation of IOMMU IOTLB and use
> VHOST_UPDATE_IOTLB to invalidate the possible entry in vhost.

There's one problem here, and that is that VQs still do not undergo
translation.  In theory VQ could be mapped in such a way
that it's not contigious in userspace memory.


> Signed-off-by: Jason Wang <jasowang@redhat.com>

What limits amount of entries that kernel keeps around?
Do we want at least a mod parameter for this?


> ---
>  drivers/vhost/net.c        |   6 +-
>  drivers/vhost/vhost.c      | 301 +++++++++++++++++++++++++++++++++++++++------
>  drivers/vhost/vhost.h      |  17 ++-
>  fs/eventfd.c               |   3 +-
>  include/uapi/linux/vhost.h |  35 ++++++
>  5 files changed, 320 insertions(+), 42 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 481db96..7cbdeed 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -334,7 +334,7 @@ static void handle_tx(struct vhost_net *net)
>  		head = vhost_get_vq_desc(vq, vq->iov,
>  					 ARRAY_SIZE(vq->iov),
>  					 &out, &in,
> -					 NULL, NULL);
> +					 NULL, NULL, VHOST_ACCESS_RO);
>  		/* On error, stop handling until the next kick. */
>  		if (unlikely(head < 0))
>  			break;
> @@ -470,7 +470,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
>  		}
>  		r = vhost_get_vq_desc(vq, vq->iov + seg,
>  				      ARRAY_SIZE(vq->iov) - seg, &out,
> -				      &in, log, log_num);
> +				      &in, log, log_num, VHOST_ACCESS_WO);
>  		if (unlikely(r < 0))
>  			goto err;
>  
> @@ -1083,7 +1083,7 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
>  		r = vhost_dev_ioctl(&n->dev, ioctl, argp);
>  		if (r == -ENOIOCTLCMD)
>  			r = vhost_vring_ioctl(&n->dev, ioctl, argp);
> -		else
> +		else if (ioctl != VHOST_UPDATE_IOTLB)
>  			vhost_net_flush(n);
>  		mutex_unlock(&n->dev.mutex);
>  		return r;
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 32c35a9..1dd43e8 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -280,6 +280,10 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>  	vq->call_ctx = NULL;
>  	vq->call = NULL;
>  	vq->log_ctx = NULL;
> +	vq->iotlb_call = NULL;
> +	vq->iotlb_call_ctx = NULL;
> +	vq->iotlb_request = NULL;
> +	vq->pending_request.flags.type = VHOST_IOTLB_INVALIDATE;
>  	vq->umem = NULL;
>  	vq->is_le = virtio_legacy_is_little_endian();
>  	vhost_vq_reset_user_be(vq);
> @@ -387,8 +391,10 @@ void vhost_dev_init(struct vhost_dev *dev,
>  	dev->log_ctx = NULL;
>  	dev->log_file = NULL;
>  	dev->umem = NULL;
> +	dev->iotlb = NULL;
>  	dev->mm = NULL;
>  	spin_lock_init(&dev->work_lock);
> +	spin_lock_init(&dev->iotlb_lock);
>  	INIT_LIST_HEAD(&dev->work_list);
>  	dev->worker = NULL;
>  
> @@ -537,6 +543,15 @@ void vhost_dev_stop(struct vhost_dev *dev)
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_stop);
>  
> +static void vhost_umem_free(struct vhost_umem *umem,
> +			    struct vhost_umem_node *node)
> +{
> +	vhost_umem_interval_tree_remove(node, &umem->umem_tree);
> +	list_del(&node->link);
> +	kfree(node);
> +	umem->numem--;
> +}
> +
>  static void vhost_umem_clean(struct vhost_umem *umem)
>  {
>  	struct vhost_umem_node *node, *tmp;
> @@ -544,11 +559,9 @@ static void vhost_umem_clean(struct vhost_umem *umem)
>  	if (!umem)
>  		return;
>  
> -	list_for_each_entry_safe(node, tmp, &umem->umem_list, link) {
> -		vhost_umem_interval_tree_remove(node, &umem->umem_tree);
> -		list_del(&node->link);
> -		kvfree(node);
> -	}
> +	list_for_each_entry_safe(node, tmp, &umem->umem_list, link)
> +		vhost_umem_free(umem, node);
> +
>  	kvfree(umem);
>  }
>  
> @@ -580,6 +593,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)
>  	/* No one will access memory at this point */
>  	vhost_umem_clean(dev->umem);
>  	dev->umem = NULL;
> +	vhost_umem_clean(dev->iotlb);
> +	dev->iotlb = NULL;
>  	WARN_ON(!list_empty(&dev->work_list));
>  	if (dev->worker) {
>  		kthread_stop(dev->worker);
> @@ -699,11 +714,61 @@ int vhost_vq_access_ok(struct vhost_virtqueue *vq)
>  }
>  EXPORT_SYMBOL_GPL(vhost_vq_access_ok);
>  
> +static int vhost_new_umem_range(struct vhost_umem *umem,
> +				u64 start, u64 size, u64 end,
> +				u64 userspace_addr, int perm)
> +{
> +	struct vhost_umem_node *tmp, *node = kmalloc(sizeof(*node), GFP_ATOMIC);
> +
> +	if (!node)
> +		return -ENOMEM;
> +
> +	if (umem->numem == VHOST_IOTLB_SIZE) {
> +		tmp = list_last_entry(&umem->umem_list, typeof(*tmp), link);
> +		vhost_umem_free(umem, tmp);
> +	}
> +
> +	node->start = start;
> +	node->size = size;
> +	node->last = end;
> +	node->userspace_addr = userspace_addr;
> +	node->perm = perm;
> +	INIT_LIST_HEAD(&node->link);
> +	list_add_tail(&node->link, &umem->umem_list);
> +	vhost_umem_interval_tree_insert(node, &umem->umem_tree);
> +	umem->numem++;
> +
> +	return 0;
> +}
> +
> +static void vhost_del_umem_range(struct vhost_umem *umem,
> +				 u64 start, u64 end)
> +{
> +	struct vhost_umem_node *node;
> +
> +	while ((node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
> +							   start, end)))
> +		vhost_umem_free(umem, node);
> +}
> +
> +static struct vhost_umem *vhost_umem_alloc(void)
> +{
> +	struct vhost_umem *umem = vhost_kvzalloc(sizeof(*umem));
> +
> +	if (!umem)
> +		return NULL;
> +
> +	umem->umem_tree = RB_ROOT;
> +	umem->numem = 0;
> +	INIT_LIST_HEAD(&umem->umem_list);
> +
> +	return umem;
> +}
> +
>  static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
>  {
>  	struct vhost_memory mem, *newmem;
>  	struct vhost_memory_region *region;
> -	struct vhost_umem_node *node;
>  	struct vhost_umem *newumem, *oldumem;
>  	unsigned long size = offsetof(struct vhost_memory, regions);
>  	int i;
> @@ -725,28 +790,23 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
>  		return -EFAULT;
>  	}
>  
> -	newumem = vhost_kvzalloc(sizeof(*newumem));
> +	newumem = vhost_umem_alloc();
>  	if (!newumem) {
>  		kvfree(newmem);
>  		return -ENOMEM;
>  	}
>  
> -	newumem->umem_tree = RB_ROOT;
> -	INIT_LIST_HEAD(&newumem->umem_list);
> -
>  	for (region = newmem->regions;
>  	     region < newmem->regions + mem.nregions;
>  	     region++) {
> -		node = vhost_kvzalloc(sizeof(*node));
> -		if (!node)
> +		if (vhost_new_umem_range(newumem,
> +					 region->guest_phys_addr,
> +					 region->memory_size,
> +					 region->guest_phys_addr +
> +					 region->memory_size - 1,
> +					 region->userspace_addr,
> +				         VHOST_ACCESS_RW))
>  			goto err;
> -		node->start = region->guest_phys_addr;
> -		node->size = region->memory_size;
> -		node->last = node->start + node->size - 1;
> -		node->userspace_addr = region->userspace_addr;
> -		INIT_LIST_HEAD(&node->link);
> -		list_add_tail(&node->link, &newumem->umem_list);
> -		vhost_umem_interval_tree_insert(node, &newumem->umem_tree);
>  	}
>  
>  	if (!memory_access_ok(d, newumem, 0))
> @@ -782,6 +842,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp)
>  	struct vhost_vring_state s;
>  	struct vhost_vring_file f;
>  	struct vhost_vring_addr a;
> +	struct vhost_vring_iotlb_entry e;
>  	u32 idx;
>  	long r;
>  
> @@ -910,6 +971,35 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp)
>  		} else
>  			filep = eventfp;
>  		break;
> +	case VHOST_SET_VRING_IOTLB_REQUEST:
> +		r = -EFAULT;
> +		if (copy_from_user(&e, argp, sizeof e))
> +			break;
> +		if (!access_ok(VERIFY_WRITE, e.userspace_addr,
> +				sizeof(*vq->iotlb_request)))
> +			break;
> +		r = 0;
> +		vq->iotlb_request = (struct vhost_iotlb_entry __user *)e.userspace_addr;
> +		break;
> +	case VHOST_SET_VRING_IOTLB_CALL:
> +		if (copy_from_user(&f, argp, sizeof f)) {
> +			r = -EFAULT;
> +			break;
> +		}
> +		eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
> +		if (IS_ERR(eventfp)) {
> +			r = PTR_ERR(eventfp);
> +			break;
> +		}
> +		if (eventfp != vq->iotlb_call) {
> +			filep = vq->iotlb_call;
> +			ctx = vq->iotlb_call_ctx;
> +			vq->iotlb_call = eventfp;
> +			vq->iotlb_call_ctx = eventfp ?
> +				eventfd_ctx_fileget(eventfp) : NULL;
> +		} else
> +			filep = eventfp;
> +		break;
>  	case VHOST_SET_VRING_CALL:
>  		if (copy_from_user(&f, argp, sizeof f)) {
>  			r = -EFAULT;
> @@ -977,11 +1067,55 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp)
>  }
>  EXPORT_SYMBOL_GPL(vhost_vring_ioctl);
>  
> +static int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled)
> +{
> +	struct vhost_umem *niotlb, *oiotlb;
> +
> +	if (enabled) {
> +		niotlb = vhost_umem_alloc();
> +		if (!niotlb)
> +			return -ENOMEM;
> +	} else
> +		niotlb = NULL;
> +
> +	spin_lock(&d->iotlb_lock);
> +	oiotlb = d->iotlb;
> +	d->iotlb = niotlb;
> +	spin_unlock(&d->iotlb_lock);
> +
> +	vhost_umem_clean(oiotlb);
> +
> +	return 0;
> +}
> +
> +static void vhost_complete_iotlb_update(struct vhost_dev *d,
> +					struct vhost_iotlb_entry *entry)
> +{
> +	struct vhost_iotlb_entry *req;
> +	struct vhost_virtqueue *vq;
> +	int i;
> +
> +
> +	for (i = 0; i < d->nvqs; i++) {
> +		vq = d->vqs[i];
> +		mutex_lock(&vq->mutex);
> +		req = &vq->pending_request;
> +		if (entry->iova <= req->iova &&
> +		    entry->iova + entry->size - 1 > req->iova &&
> +		    req->flags.type == VHOST_IOTLB_MISS) {
> +			*req = *entry;
> +			vhost_poll_queue(&vq->poll);
> +		}
> +		mutex_unlock(&vq->mutex);
> +	}
> +}
> +
>  /* Caller must have device mutex */
>  long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
>  {
>  	struct file *eventfp, *filep = NULL;
>  	struct eventfd_ctx *ctx = NULL;
> +	struct vhost_iotlb_entry entry;
>  	u64 p;
>  	long r;
>  	int i, fd;
> @@ -1050,6 +1184,52 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
>  		if (filep)
>  			fput(filep);
>  		break;
> +	case VHOST_RUN_IOTLB:
> +		/* FIXME: enable and disabled */
> +		vhost_init_device_iotlb(d, true);
> +		break;
> +	case VHOST_UPDATE_IOTLB:
> +		r = copy_from_user(&entry, argp, sizeof(entry));
> +		if (r < 0) {
> +			r = -EFAULT;
> +			goto done;
> +		}
> +
> +		spin_lock(&d->iotlb_lock);
> +		if (!d->iotlb) {
> +			spin_unlock(&d->iotlb_lock);
> +			r = -EFAULT;
> +			goto done;
> +		}
> +		switch (entry.flags.type) {
> +		case VHOST_IOTLB_UPDATE:
> +			if (entry.flags.valid != VHOST_IOTLB_VALID) {
> +				break;
> +			}
> +			if (vhost_new_umem_range(d->iotlb,
> +						 entry.iova,
> +						 entry.size,
> +						 entry.iova + entry.size - 1,
> +                                                 entry.userspace_addr,
> +                                                 entry.flags.perm)) {
> +				r = -ENOMEM;
> +				break;
> +			}
> +			break;
> +		case VHOST_IOTLB_INVALIDATE:
> +			vhost_del_umem_range(d->iotlb,
> +					     entry.iova,
> +					     entry.iova + entry.size - 1);
> +			break;
> +		default:
> +			r = -EINVAL;
> +		}
> +		spin_unlock(&d->iotlb_lock);
> +
> +		if (!r && entry.flags.type != VHOST_IOTLB_INVALIDATE)
> +			vhost_complete_iotlb_update(d, &entry);
> +
> +		break;
>  	default:
>  		r = -ENOIOCTLCMD;
>  		break;
> @@ -1197,27 +1377,69 @@ int vhost_init_used(struct vhost_virtqueue *vq)
>  }
>  EXPORT_SYMBOL_GPL(vhost_init_used);
>  
> +static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova)
> +{
> +	struct vhost_iotlb_entry *pending = &vq->pending_request;
> +	int ret;
> +
> +	if (!vq->iotlb_call_ctx)
> +		return -EOPNOTSUPP;
> +
> +	if (!vq->iotlb_request)
> +		return -EOPNOTSUPP;
> +
> +	if (pending->flags.type == VHOST_IOTLB_MISS) {
> +		return -EEXIST;
> +	}
> +
> +	pending->iova = iova;
> +	pending->flags.type = VHOST_IOTLB_MISS;
> +
> +	ret = __copy_to_user(vq->iotlb_request, pending,
> +			     sizeof(struct vhost_iotlb_entry));
> +	if (ret) {
> +		goto err;
> +	}
> +
> +	if (vq->iotlb_call_ctx)
> +		eventfd_signal(vq->iotlb_call_ctx, 1);
> +err:
> +	return ret;
> +}
> +
>  static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
> -			  struct iovec iov[], int iov_size)
> +			  struct iovec iov[], int iov_size, int access)
>  {
>  	const struct vhost_umem_node *node;
> -	struct vhost_umem *umem = vq->umem;
> +	struct vhost_dev *dev = vq->dev;
> +	struct vhost_umem *umem = dev->iotlb ? dev->iotlb : dev->umem;
>  	struct iovec *_iov;
>  	u64 s = 0;
>  	int ret = 0;
>  
> +	spin_lock(&dev->iotlb_lock);
> +
>  	while ((u64)len > s) {
>  		u64 size;
>  		if (unlikely(ret >= iov_size)) {
>  			ret = -ENOBUFS;
>  			break;
>  		}
> +
>  		node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
>  							addr, addr + len - 1);
>  		if (node == NULL || node->start > addr) {
> -			ret = -EFAULT;
> +			if (umem != dev->iotlb) {
> +				ret = -EFAULT;
> +				break;
> +			}
> +			ret = -EAGAIN;
> +			break;
> +		} else if (!(node->perm & access)) {
> +			ret = -EPERM;
>  			break;
>  		}
> +
>  		_iov = iov + ret;
>  		size = node->size - addr + node->start;
>  		_iov->iov_len = min((u64)len - s, size);
> @@ -1228,6 +1450,10 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
>  		++ret;
>  	}
>  
> +	spin_unlock(&dev->iotlb_lock);
> +
> +	if (ret == -EAGAIN)
> +		vhost_iotlb_miss(vq, addr);
>  	return ret;
>  }
>  
> @@ -1256,7 +1482,7 @@ static int get_indirect(struct vhost_virtqueue *vq,
>  			struct iovec iov[], unsigned int iov_size,
>  			unsigned int *out_num, unsigned int *in_num,
>  			struct vhost_log *log, unsigned int *log_num,
> -			struct vring_desc *indirect)
> +			struct vring_desc *indirect, int access)
>  {
>  	struct vring_desc desc;
>  	unsigned int i = 0, count, found = 0;
> @@ -1274,9 +1500,10 @@ static int get_indirect(struct vhost_virtqueue *vq,
>  	}
>  
>  	ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect,
> -			     UIO_MAXIOV);
> +			     UIO_MAXIOV, access);
>  	if (unlikely(ret < 0)) {
> -		vq_err(vq, "Translation failure %d in indirect.\n", ret);
> +		if (ret != -EAGAIN)
> +			vq_err(vq, "Translation failure %d in indirect.\n", ret);
>  		return ret;
>  	}
>  	iov_iter_init(&from, READ, vq->indirect, ret, len);
> @@ -1316,10 +1543,11 @@ static int get_indirect(struct vhost_virtqueue *vq,
>  
>  		ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
>  				     vhost32_to_cpu(vq, desc.len), iov + iov_count,
> -				     iov_size - iov_count);
> +				     iov_size - iov_count, access);
>  		if (unlikely(ret < 0)) {
> -			vq_err(vq, "Translation failure %d indirect idx %d\n",
> -			       ret, i);
> +			if (ret != -EAGAIN)
> +				vq_err(vq, "Translation failure %d indirect idx %d\n",
> +					ret, i);
>  			return ret;
>  		}
>  		/* If this is an input descriptor, increment that count. */
> @@ -1355,7 +1583,8 @@ static int get_indirect(struct vhost_virtqueue *vq,
>  int vhost_get_vq_desc(struct vhost_virtqueue *vq,
>  		      struct iovec iov[], unsigned int iov_size,
>  		      unsigned int *out_num, unsigned int *in_num,
> -		      struct vhost_log *log, unsigned int *log_num)
> +		      struct vhost_log *log, unsigned int *log_num,
> +		      int access)
>  {
>  	struct vring_desc desc;
>  	unsigned int i, head, found = 0;
> @@ -1433,10 +1662,11 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
>  		if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
>  			ret = get_indirect(vq, iov, iov_size,
>  					   out_num, in_num,
> -					   log, log_num, &desc);
> +					   log, log_num, &desc, access);
>  			if (unlikely(ret < 0)) {
> -				vq_err(vq, "Failure detected "
> -				       "in indirect descriptor at idx %d\n", i);
> +				if (ret != -EAGAIN)
> +					vq_err(vq, "Failure detected "
> +						"in indirect descriptor at idx %d\n", i);
>  				return ret;
>  			}
>  			continue;
> @@ -1444,10 +1674,11 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
>  
>  		ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
>  				     vhost32_to_cpu(vq, desc.len), iov + iov_count,
> -				     iov_size - iov_count);
> +				     iov_size - iov_count, access);
>  		if (unlikely(ret < 0)) {
> -			vq_err(vq, "Translation failure %d descriptor idx %d\n",
> -			       ret, i);
> +			if (ret != -EAGAIN)
> +				vq_err(vq, "Translation failure %d descriptor idx %d\n",
> +					ret, i);
>  			return ret;
>  		}
>  		if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE)) {
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 5d64393..4365104 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -62,13 +62,15 @@ struct vhost_umem_node {
>  	__u64 last;
>  	__u64 size;
>  	__u64 userspace_addr;
> -	__u64 flags_padding;
> +	__u32 perm;
> +	__u32 flags_padding;
>  	__u64 __subtree_last;
>  };
>  
>  struct vhost_umem {
>  	struct rb_root umem_tree;
>  	struct list_head umem_list;
> +	int numem;
>  };
>  
>  /* The virtqueue structure describes a queue attached to a device. */
> @@ -84,9 +86,13 @@ struct vhost_virtqueue {
>  	struct file *kick;
>  	struct file *call;
>  	struct file *error;
> +	struct file *iotlb_call;
>  	struct eventfd_ctx *call_ctx;
>  	struct eventfd_ctx *error_ctx;
>  	struct eventfd_ctx *log_ctx;
> +	struct eventfd_ctx *iotlb_call_ctx;
> +	struct vhost_iotlb_entry __user *iotlb_request;
> +	struct vhost_iotlb_entry pending_request;
>  
>  	struct vhost_poll poll;
>  
> @@ -135,6 +141,8 @@ struct vhost_virtqueue {
>  #endif
>  };
>  
> +#define VHOST_IOTLB_SIZE 2048
> +
>  struct vhost_dev {
>  	struct mm_struct *mm;
>  	struct mutex mutex;
> @@ -146,6 +154,8 @@ struct vhost_dev {
>  	struct list_head work_list;
>  	struct task_struct *worker;
>  	struct vhost_umem *umem;
> +	struct vhost_umem *iotlb;
> +	spinlock_t iotlb_lock;
>  };
>  
>  void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs);
> @@ -164,7 +174,8 @@ int vhost_log_access_ok(struct vhost_dev *);
>  int vhost_get_vq_desc(struct vhost_virtqueue *,
>  		      struct iovec iov[], unsigned int iov_count,
>  		      unsigned int *out_num, unsigned int *in_num,
> -		      struct vhost_log *log, unsigned int *log_num);
> +		      struct vhost_log *log, unsigned int *log_num,
> +		      int access);
>  void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
>  
>  int vhost_init_used(struct vhost_virtqueue *);
> @@ -183,7 +194,7 @@ int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
>  		    unsigned int log_num, u64 len);
>  
>  #define vq_err(vq, fmt, ...) do {                                  \
> -		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
> +		printk(pr_fmt(fmt), ##__VA_ARGS__);       \
>  		if ((vq)->error_ctx)                               \
>  				eventfd_signal((vq)->error_ctx, 1);\
>  	} while (0)
> diff --git a/fs/eventfd.c b/fs/eventfd.c
> index 8d0c0df..5c0a22f 100644
> --- a/fs/eventfd.c
> +++ b/fs/eventfd.c
> @@ -59,8 +59,9 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
>  	if (ULLONG_MAX - ctx->count < n)
>  		n = ULLONG_MAX - ctx->count;
>  	ctx->count += n;
> -	if (waitqueue_active(&ctx->wqh))
> +	if (waitqueue_active(&ctx->wqh)) {
>  		wake_up_locked_poll(&ctx->wqh, POLLIN);
> +	}
>  	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
>  
>  	return n;
> diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
> index ab373191..5c35ab4 100644
> --- a/include/uapi/linux/vhost.h
> +++ b/include/uapi/linux/vhost.h
> @@ -47,6 +47,32 @@ struct vhost_vring_addr {
>  	__u64 log_guest_addr;
>  };
>  
> +struct vhost_iotlb_entry {
> +	__u64 iova;
> +	__u64 size;
> +	__u64 userspace_addr;

Alignment requirements?

> +	struct {
> +#define VHOST_ACCESS_RO      0x1
> +#define VHOST_ACCESS_WO      0x2
> +#define VHOST_ACCESS_RW      0x3
> +		__u8  perm;
> +#define VHOST_IOTLB_MISS           1
> +#define VHOST_IOTLB_UPDATE         2
> +#define VHOST_IOTLB_INVALIDATE     3
> +		__u8  type;
> +#define VHOST_IOTLB_INVALID        0x1
> +#define VHOST_IOTLB_VALID          0x2
> +		__u8  valid;

why do we need this flag?

> +		__u8  u8_padding;
> +		__u32 padding;
> +	} flags;
> +};
> +
> +struct vhost_vring_iotlb_entry {
> +	unsigned int index;
> +	__u64 userspace_addr;
> +};
> +
>  struct vhost_memory_region {
>  	__u64 guest_phys_addr;
>  	__u64 memory_size; /* bytes */
> @@ -127,6 +153,15 @@ struct vhost_memory {
>  /* Set eventfd to signal an error */
>  #define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
>  
> +/* IOTLB */
> +/* Specify an eventfd file descriptor to signle on IOTLB miss */

typo

> +#define VHOST_SET_VRING_IOTLB_CALL _IOW(VHOST_VIRTIO, 0x23, struct      \
> +                                        vhost_vring_file)
> +#define VHOST_SET_VRING_IOTLB_REQUEST _IOW(VHOST_VIRTIO, 0x25, struct   \
> +                                           vhost_vring_iotlb_entry)
> +#define VHOST_UPDATE_IOTLB _IOW(VHOST_VIRTIO, 0x24, struct vhost_iotlb_entry)
> +#define VHOST_RUN_IOTLB _IOW(VHOST_VIRTIO, 0x26, int)
> +

Is the assumption that userspace must dedicate a thread to running the iotlb? 
I dislike this one.
Please support asynchronous APIs at least optionally to make
userspace make its own threading decisions.

>  /* VHOST_NET specific defines */
>  
>  /* Attach virtio net ring to a raw socket, or tap device.

Don't we need a feature bit for this?
Are we short on feature bits? If yes maybe it's time to add
something like PROTOCOL_FEATURES that we have in vhost-user.

> -- 
> 2.5.0

^ permalink raw reply

* [PATCH] net: phy: at803x: Register 'link_change_notify' only for AT8030
From: Sebastian Frias @ 2016-04-27 11:34 UTC (permalink / raw)
  To: Daniel Mack, David S. Miller, netdev; +Cc: lkml, mason, Sergei Shtylyov

There is no need to register the callback introduced by
commit 13a56b449325 ("net: phy: at803x: Add support for hardware reset")
for non faulty PHYs.

The check on the PHY ID is not necessary anymore and thus has been
removed from the callback implementation as well.

Fixes: 13a56b449325 ("net: phy: at803x: Add support for hardware reset")

Signed-off-by: Sebastian Frias <sf84@laposte.net>
---
 drivers/net/phy/at803x.c | 43 ++++++++++++++++++++-----------------------
 1 file changed, 20 insertions(+), 23 deletions(-)

diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c
index b3ffaee..7fdc676 100644
--- a/drivers/net/phy/at803x.c
+++ b/drivers/net/phy/at803x.c
@@ -353,33 +353,32 @@ static void at803x_link_change_notify(struct phy_device *phydev)
 	struct at803x_priv *priv = phydev->priv;
 
 	/*
-	 * Conduct a hardware reset for AT8030 every time a link loss is
+	 * Conduct a hardware reset for AT8030 (this callback is only
+	 * registered for AT8030 at the moment) every time a link loss is
 	 * signalled. This is necessary to circumvent a hardware bug that
 	 * occurs when the cable is unplugged while TX packets are pending
 	 * in the FIFO. In such cases, the FIFO enters an error mode it
 	 * cannot recover from by software.
 	 */
-	if (phydev->drv->phy_id == ATH8030_PHY_ID) {
-		if (phydev->state == PHY_NOLINK) {
-			if (priv->gpiod_reset && !priv->phy_reset) {
-				struct at803x_context context;
-
-				at803x_context_save(phydev, &context);
-
-				gpiod_set_value(priv->gpiod_reset, 1);
-				msleep(1);
-				gpiod_set_value(priv->gpiod_reset, 0);
-				msleep(1);
-
-				at803x_context_restore(phydev, &context);
-
-				phydev_dbg(phydev, "%s(): phy was reset\n",
-					   __func__);
-				priv->phy_reset = true;
-			}
-		} else {
-			priv->phy_reset = false;
+	if (phydev->state == PHY_NOLINK) {
+		if (priv->gpiod_reset && !priv->phy_reset) {
+			struct at803x_context context;
+
+			at803x_context_save(phydev, &context);
+
+			gpiod_set_value(priv->gpiod_reset, 1);
+			msleep(1);
+			gpiod_set_value(priv->gpiod_reset, 0);
+			msleep(1);
+
+			at803x_context_restore(phydev, &context);
+
+			phydev_dbg(phydev, "%s(): phy was reset\n",
+				   __func__);
+			priv->phy_reset = true;
 		}
+	} else {
+		priv->phy_reset = false;
 	}
 }
 
@@ -391,7 +390,6 @@ static struct phy_driver at803x_driver[] = {
 	.phy_id_mask		= 0xffffffef,
 	.probe			= at803x_probe,
 	.config_init		= at803x_config_init,
-	.link_change_notify	= at803x_link_change_notify,
 	.set_wol		= at803x_set_wol,
 	.get_wol		= at803x_get_wol,
 	.suspend		= at803x_suspend,
@@ -427,7 +425,6 @@ static struct phy_driver at803x_driver[] = {
 	.phy_id_mask		= 0xffffffef,
 	.probe			= at803x_probe,
 	.config_init		= at803x_config_init,
-	.link_change_notify	= at803x_link_change_notify,
 	.set_wol		= at803x_set_wol,
 	.get_wol		= at803x_get_wol,
 	.suspend		= at803x_suspend,
-- 
2.1.4

^ permalink raw reply related

* Re: [RFC PATCH V2 1/2] vhost: convert pre sorted vhost memory array to interval tree
From: Michael S. Tsirkin @ 2016-04-27 11:30 UTC (permalink / raw)
  To: Jason Wang
  Cc: kvm, qemu-devel, netdev, linux-kernel, peterx, virtualization,
	pbonzini
In-Reply-To: <1458873274-13961-2-git-send-email-jasowang@redhat.com>

On Fri, Mar 25, 2016 at 10:34:33AM +0800, Jason Wang wrote:
> Current pre-sorted memory region array has some limitations for future
> device IOTLB conversion:
> 
> 1) need extra work for adding and removing a single region, and it's
>    expected to be slow because of sorting or memory re-allocation.
> 2) need extra work of removing a large range which may intersect
>    several regions with different size.
> 3) need trick for a replacement policy like LRU
> 
> To overcome the above shortcomings, this patch convert it to interval
> tree which can easily address the above issue with almost no extra
> work.
> 
> The patch could be used for:
> 
> - Extend the current API and only let the userspace to send diffs of
>   memory table.
> - Simplify Device IOTLB implementation.

Does this affect performance at all?

> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/vhost/net.c   |   8 +--
>  drivers/vhost/vhost.c | 182 ++++++++++++++++++++++++++++----------------------
>  drivers/vhost/vhost.h |  27 ++++++--
>  3 files changed, 128 insertions(+), 89 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 9eda69e..481db96 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -968,20 +968,20 @@ static long vhost_net_reset_owner(struct vhost_net *n)
>  	struct socket *tx_sock = NULL;
>  	struct socket *rx_sock = NULL;
>  	long err;
> -	struct vhost_memory *memory;
> +	struct vhost_umem *umem;
>  
>  	mutex_lock(&n->dev.mutex);
>  	err = vhost_dev_check_owner(&n->dev);
>  	if (err)
>  		goto done;
> -	memory = vhost_dev_reset_owner_prepare();
> -	if (!memory) {
> +	umem = vhost_dev_reset_owner_prepare();
> +	if (!umem) {
>  		err = -ENOMEM;
>  		goto done;
>  	}
>  	vhost_net_stop(n, &tx_sock, &rx_sock);
>  	vhost_net_flush(n);
> -	vhost_dev_reset_owner(&n->dev, memory);
> +	vhost_dev_reset_owner(&n->dev, umem);
>  	vhost_net_vq_reset(n);
>  done:
>  	mutex_unlock(&n->dev.mutex);
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index ad2146a..32c35a9 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -27,6 +27,7 @@
>  #include <linux/cgroup.h>
>  #include <linux/module.h>
>  #include <linux/sort.h>
> +#include <linux/interval_tree_generic.h>
>  
>  #include "vhost.h"
>  
> @@ -42,6 +43,10 @@ enum {
>  #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
>  #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
>  
> +INTERVAL_TREE_DEFINE(struct vhost_umem_node,
> +		     rb, __u64, __subtree_last,
> +		     START, LAST, , vhost_umem_interval_tree);
> +
>  #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
>  static void vhost_vq_reset_user_be(struct vhost_virtqueue *vq)
>  {
> @@ -275,7 +280,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>  	vq->call_ctx = NULL;
>  	vq->call = NULL;
>  	vq->log_ctx = NULL;
> -	vq->memory = NULL;
> +	vq->umem = NULL;
>  	vq->is_le = virtio_legacy_is_little_endian();
>  	vhost_vq_reset_user_be(vq);
>  }
> @@ -381,7 +386,7 @@ void vhost_dev_init(struct vhost_dev *dev,
>  	mutex_init(&dev->mutex);
>  	dev->log_ctx = NULL;
>  	dev->log_file = NULL;
> -	dev->memory = NULL;
> +	dev->umem = NULL;
>  	dev->mm = NULL;
>  	spin_lock_init(&dev->work_lock);
>  	INIT_LIST_HEAD(&dev->work_list);
> @@ -486,27 +491,36 @@ err_mm:
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_set_owner);
>  
> -struct vhost_memory *vhost_dev_reset_owner_prepare(void)
> +static void *vhost_kvzalloc(unsigned long size)
> +{
> +	void *n = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
> +
> +	if (!n)
> +		n = vzalloc(size);
> +	return n;
> +}
> +
> +struct vhost_umem *vhost_dev_reset_owner_prepare(void)
>  {
> -	return kmalloc(offsetof(struct vhost_memory, regions), GFP_KERNEL);
> +	return vhost_kvzalloc(sizeof(struct vhost_umem));
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare);
>  
>  /* Caller should have device mutex */
> -void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_memory *memory)
> +void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_umem *umem)
>  {
>  	int i;
>  
>  	vhost_dev_cleanup(dev, true);
>  
>  	/* Restore memory to default empty mapping. */
> -	memory->nregions = 0;
> -	dev->memory = memory;
> +	INIT_LIST_HEAD(&umem->umem_list);
> +	dev->umem = umem;
>  	/* We don't need VQ locks below since vhost_dev_cleanup makes sure
>  	 * VQs aren't running.
>  	 */
>  	for (i = 0; i < dev->nvqs; ++i)
> -		dev->vqs[i]->memory = memory;
> +		dev->vqs[i]->umem = umem;
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_reset_owner);
>  
> @@ -523,6 +537,21 @@ void vhost_dev_stop(struct vhost_dev *dev)
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_stop);
>  
> +static void vhost_umem_clean(struct vhost_umem *umem)
> +{
> +	struct vhost_umem_node *node, *tmp;
> +
> +	if (!umem)
> +		return;
> +
> +	list_for_each_entry_safe(node, tmp, &umem->umem_list, link) {
> +		vhost_umem_interval_tree_remove(node, &umem->umem_tree);
> +		list_del(&node->link);
> +		kvfree(node);
> +	}
> +	kvfree(umem);
> +}
> +
>  /* Caller should have device mutex if and only if locked is set */
>  void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)
>  {
> @@ -549,8 +578,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)
>  		fput(dev->log_file);
>  	dev->log_file = NULL;
>  	/* No one will access memory at this point */
> -	kvfree(dev->memory);
> -	dev->memory = NULL;
> +	vhost_umem_clean(dev->umem);
> +	dev->umem = NULL;
>  	WARN_ON(!list_empty(&dev->work_list));
>  	if (dev->worker) {
>  		kthread_stop(dev->worker);
> @@ -576,25 +605,25 @@ static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
>  }
>  
>  /* Caller should have vq mutex and device mutex. */
> -static int vq_memory_access_ok(void __user *log_base, struct vhost_memory *mem,
> +static int vq_memory_access_ok(void __user *log_base, struct vhost_umem *umem,
>  			       int log_all)
>  {
> -	int i;
> +	struct vhost_umem_node *node;
>  
> -	if (!mem)
> +	if (!umem)
>  		return 0;
>  
> -	for (i = 0; i < mem->nregions; ++i) {
> -		struct vhost_memory_region *m = mem->regions + i;
> -		unsigned long a = m->userspace_addr;
> -		if (m->memory_size > ULONG_MAX)
> +	list_for_each_entry(node, &umem->umem_list, link) {
> +		unsigned long a = node->userspace_addr;
> +
> +		if (node->size > ULONG_MAX)
>  			return 0;
>  		else if (!access_ok(VERIFY_WRITE, (void __user *)a,
> -				    m->memory_size))
> +				    node->size))
>  			return 0;
>  		else if (log_all && !log_access_ok(log_base,
> -						   m->guest_phys_addr,
> -						   m->memory_size))
> +						   node->start,
> +						   node->size))
>  			return 0;
>  	}
>  	return 1;
> @@ -602,7 +631,7 @@ static int vq_memory_access_ok(void __user *log_base, struct vhost_memory *mem,
>  
>  /* Can we switch to this memory table? */
>  /* Caller should have device mutex but not vq mutex */
> -static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem,
> +static int memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem,
>  			    int log_all)
>  {
>  	int i;
> @@ -615,7 +644,8 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem,
>  		log = log_all || vhost_has_feature(d->vqs[i], VHOST_F_LOG_ALL);
>  		/* If ring is inactive, will check when it's enabled. */
>  		if (d->vqs[i]->private_data)
> -			ok = vq_memory_access_ok(d->vqs[i]->log_base, mem, log);
> +			ok = vq_memory_access_ok(d->vqs[i]->log_base,
> +						 umem, log);
>  		else
>  			ok = 1;
>  		mutex_unlock(&d->vqs[i]->mutex);
> @@ -642,7 +672,7 @@ static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
>  /* Caller should have device mutex but not vq mutex */
>  int vhost_log_access_ok(struct vhost_dev *dev)
>  {
> -	return memory_access_ok(dev, dev->memory, 1);
> +	return memory_access_ok(dev, dev->umem, 1);
>  }
>  EXPORT_SYMBOL_GPL(vhost_log_access_ok);
>  
> @@ -653,7 +683,7 @@ static int vq_log_access_ok(struct vhost_virtqueue *vq,
>  {
>  	size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
>  
> -	return vq_memory_access_ok(log_base, vq->memory,
> +	return vq_memory_access_ok(log_base, vq->umem,
>  				   vhost_has_feature(vq, VHOST_F_LOG_ALL)) &&
>  		(!vq->log_used || log_access_ok(log_base, vq->log_addr,
>  					sizeof *vq->used +
> @@ -669,28 +699,12 @@ int vhost_vq_access_ok(struct vhost_virtqueue *vq)
>  }
>  EXPORT_SYMBOL_GPL(vhost_vq_access_ok);
>  
> -static int vhost_memory_reg_sort_cmp(const void *p1, const void *p2)
> -{
> -	const struct vhost_memory_region *r1 = p1, *r2 = p2;
> -	if (r1->guest_phys_addr < r2->guest_phys_addr)
> -		return 1;
> -	if (r1->guest_phys_addr > r2->guest_phys_addr)
> -		return -1;
> -	return 0;
> -}
> -
> -static void *vhost_kvzalloc(unsigned long size)
> -{
> -	void *n = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
> -
> -	if (!n)
> -		n = vzalloc(size);
> -	return n;
> -}
> -
>  static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
>  {
> -	struct vhost_memory mem, *newmem, *oldmem;
> +	struct vhost_memory mem, *newmem;
> +	struct vhost_memory_region *region;
> +	struct vhost_umem_node *node;
> +	struct vhost_umem *newumem, *oldumem;
>  	unsigned long size = offsetof(struct vhost_memory, regions);
>  	int i;
>  
> @@ -710,24 +724,52 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
>  		kvfree(newmem);
>  		return -EFAULT;
>  	}
> -	sort(newmem->regions, newmem->nregions, sizeof(*newmem->regions),
> -		vhost_memory_reg_sort_cmp, NULL);
>  
> -	if (!memory_access_ok(d, newmem, 0)) {
> +	newumem = vhost_kvzalloc(sizeof(*newumem));
> +	if (!newumem) {
>  		kvfree(newmem);
> -		return -EFAULT;
> +		return -ENOMEM;
> +	}
> +
> +	newumem->umem_tree = RB_ROOT;
> +	INIT_LIST_HEAD(&newumem->umem_list);
> +
> +	for (region = newmem->regions;
> +	     region < newmem->regions + mem.nregions;
> +	     region++) {
> +		node = vhost_kvzalloc(sizeof(*node));
> +		if (!node)
> +			goto err;
> +		node->start = region->guest_phys_addr;
> +		node->size = region->memory_size;
> +		node->last = node->start + node->size - 1;
> +		node->userspace_addr = region->userspace_addr;
> +		INIT_LIST_HEAD(&node->link);
> +		list_add_tail(&node->link, &newumem->umem_list);
> +		vhost_umem_interval_tree_insert(node, &newumem->umem_tree);
>  	}
> -	oldmem = d->memory;
> -	d->memory = newmem;
> +
> +	if (!memory_access_ok(d, newumem, 0))
> +		goto err;
> +
> +	oldumem = d->umem;
> +	d->umem = newumem;
>  
>  	/* All memory accesses are done under some VQ mutex. */
>  	for (i = 0; i < d->nvqs; ++i) {
>  		mutex_lock(&d->vqs[i]->mutex);
> -		d->vqs[i]->memory = newmem;
> +		d->vqs[i]->umem = newumem;
>  		mutex_unlock(&d->vqs[i]->mutex);
>  	}
> -	kvfree(oldmem);
> +
> +	kvfree(newmem);
> +	vhost_umem_clean(oldumem);
>  	return 0;
> +
> +err:
> +	vhost_umem_clean(newumem);
> +	kvfree(newmem);
> +	return -EFAULT;
>  }
>  
>  long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp)
> @@ -1017,28 +1059,6 @@ done:
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_ioctl);
>  
> -static const struct vhost_memory_region *find_region(struct vhost_memory *mem,
> -						     __u64 addr, __u32 len)
> -{
> -	const struct vhost_memory_region *reg;
> -	int start = 0, end = mem->nregions;
> -
> -	while (start < end) {
> -		int slot = start + (end - start) / 2;
> -		reg = mem->regions + slot;
> -		if (addr >= reg->guest_phys_addr)
> -			end = slot;
> -		else
> -			start = slot + 1;
> -	}
> -
> -	reg = mem->regions + start;
> -	if (addr >= reg->guest_phys_addr &&
> -		reg->guest_phys_addr + reg->memory_size > addr)
> -		return reg;
> -	return NULL;
> -}
> -
>  /* TODO: This is really inefficient.  We need something like get_user()
>   * (instruction directly accesses the data, with an exception table entry
>   * returning -EFAULT). See Documentation/x86/exception-tables.txt.
> @@ -1180,29 +1200,29 @@ EXPORT_SYMBOL_GPL(vhost_init_used);
>  static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
>  			  struct iovec iov[], int iov_size)
>  {
> -	const struct vhost_memory_region *reg;
> -	struct vhost_memory *mem;
> +	const struct vhost_umem_node *node;
> +	struct vhost_umem *umem = vq->umem;
>  	struct iovec *_iov;
>  	u64 s = 0;
>  	int ret = 0;
>  
> -	mem = vq->memory;
>  	while ((u64)len > s) {
>  		u64 size;
>  		if (unlikely(ret >= iov_size)) {
>  			ret = -ENOBUFS;
>  			break;
>  		}
> -		reg = find_region(mem, addr, len);
> -		if (unlikely(!reg)) {
> +		node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
> +							addr, addr + len - 1);
> +		if (node == NULL || node->start > addr) {
>  			ret = -EFAULT;
>  			break;
>  		}
>  		_iov = iov + ret;
> -		size = reg->memory_size - addr + reg->guest_phys_addr;
> +		size = node->size - addr + node->start;
>  		_iov->iov_len = min((u64)len - s, size);
>  		_iov->iov_base = (void __user *)(unsigned long)
> -			(reg->userspace_addr + addr - reg->guest_phys_addr);
> +			(node->userspace_addr + addr - node->start);
>  		s += size;
>  		addr += size;
>  		++ret;
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index d3f7674..5d64393 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -52,6 +52,25 @@ struct vhost_log {
>  	u64 len;
>  };
>  
> +#define START(node) ((node)->start)
> +#define LAST(node) ((node)->last)
> +
> +struct vhost_umem_node {
> +	struct rb_node rb;
> +	struct list_head link;
> +	__u64 start;
> +	__u64 last;
> +	__u64 size;
> +	__u64 userspace_addr;
> +	__u64 flags_padding;
> +	__u64 __subtree_last;
> +};
> +
> +struct vhost_umem {
> +	struct rb_root umem_tree;
> +	struct list_head umem_list;
> +};
> +
>  /* The virtqueue structure describes a queue attached to a device. */
>  struct vhost_virtqueue {
>  	struct vhost_dev *dev;
> @@ -100,7 +119,7 @@ struct vhost_virtqueue {
>  	struct iovec *indirect;
>  	struct vring_used_elem *heads;
>  	/* Protected by virtqueue mutex. */
> -	struct vhost_memory *memory;
> +	struct vhost_umem *umem;
>  	void *private_data;
>  	u64 acked_features;
>  	/* Log write descriptors */
> @@ -117,7 +136,6 @@ struct vhost_virtqueue {
>  };
>  
>  struct vhost_dev {
> -	struct vhost_memory *memory;
>  	struct mm_struct *mm;
>  	struct mutex mutex;
>  	struct vhost_virtqueue **vqs;
> @@ -127,14 +145,15 @@ struct vhost_dev {
>  	spinlock_t work_lock;
>  	struct list_head work_list;
>  	struct task_struct *worker;
> +	struct vhost_umem *umem;
>  };
>  
>  void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs);
>  long vhost_dev_set_owner(struct vhost_dev *dev);
>  bool vhost_dev_has_owner(struct vhost_dev *dev);
>  long vhost_dev_check_owner(struct vhost_dev *);
> -struct vhost_memory *vhost_dev_reset_owner_prepare(void);
> -void vhost_dev_reset_owner(struct vhost_dev *, struct vhost_memory *);
> +struct vhost_umem *vhost_dev_reset_owner_prepare(void);
> +void vhost_dev_reset_owner(struct vhost_dev *, struct vhost_umem *);
>  void vhost_dev_cleanup(struct vhost_dev *, bool locked);
>  void vhost_dev_stop(struct vhost_dev *);
>  long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp);
> -- 
> 2.5.0

^ permalink raw reply

* Re: [PATCH] vhost_net: stop polling socket during rx processing
From: Michael S. Tsirkin @ 2016-04-27 11:28 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <1461656153-24074-1-git-send-email-jasowang@redhat.com>

On Tue, Apr 26, 2016 at 03:35:53AM -0400, Jason Wang wrote:
> We don't stop polling socket during rx processing, this will lead
> unnecessary wakeups from under layer net devices (E.g
> sock_def_readable() form tun). Rx will be slowed down in this
> way. This patch avoids this by stop polling socket during rx
> processing. A small drawback is that this introduces some overheads in
> light load case because of the extra start/stop polling, but single
> netperf TCP_RR does not notice any change. In a super heavy load case,
> e.g using pktgen to inject packet to guest, we get about ~17%
> improvement on pps:
> 
> before: ~1370000 pkt/s
> after:  ~1500000 pkt/s
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Acked-by: Michael S. Tsirkin <mst@redhat.com>

There is one other possible enhancement: we actually have the wait queue
lock taken in _wake_up, but we give it up only to take it again in the
handler.

It would be nicer to just remove the entry when we wake
the vhost thread. Re-add it if required.
I think that something like the below would give you the necessary API.
Pls feel free to use it if you are going to implement a patch on top
doing this - that's not a reason not to include this simple patch
though.

--->

wait: add API to drop a wait_queue_t entry from wake up handler

A wake up handler might want to remove its own wait queue entry to avoid
future wakeups.  In particular, vhost has such a need.  As wait queue
lock is already taken, all we need is an API to remove the entry without
wait_queue_head_t which isn't currently accessible to wake up handlers.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

---

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 27d7a0a..9c6604b 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -191,11 +191,17 @@ __add_wait_queue_tail_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
 }
 
 static inline void
-__remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old)
+__remove_wait_queue_entry(wait_queue_t *old)
 {
 	list_del(&old->task_list);
 }
 
+static inline void
+__remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old)
+{
+	__remove_wait_queue_entry(old);
+}
+
 typedef int wait_bit_action_f(struct wait_bit_key *, int mode);
 void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);

^ permalink raw reply related

* Re: [PATCH] netem: Segment GSO packets on enqueue.
From: Neil Horman @ 2016-04-27 11:27 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, Jamal Hadi Salim, David S. Miller, netem
In-Reply-To: <1461701955.5535.38.camel@edumazet-glaptop3.roam.corp.google.com>

On Tue, Apr 26, 2016 at 01:19:15PM -0700, Eric Dumazet wrote:
> On Tue, 2016-04-26 at 15:00 -0400, Neil Horman wrote:
> > I can understand that, but that raises two questions in my mind:
> > 
> > 1)  Doesn't that make all the statistical manipulation for netem wrong?  That is
> > to say, if netem drops 5% of packets, and it happens to drop a GSO packet, its
> > actually dropping several, instead of the single one required.
> 
> 
> Please take a look at tbf_segment(), where you can find a proper way to
> handle this.
> 
> Note that for the case q->corrupt is 0, we definitely do not want to
> segment TSO packets.
> 
> > 2) How are you getting netem to work with GSO at all?  The warning is triggered
> > for me on every GSO packet, which I think would impact throughput :)
> 
> I mostly use netem to add delays and drops.
> I never had this bug, since q->corrupt = 0
> 

I see what you're saying now, I should only be segmenting the packet if the
qdisc needs to compute the checksum on each packet.  Other packets that aren't
selected to be mangled can pass through un-segmented (assuming they meet any
other queue constraints).

Ok, thanks.  Self-nak.  I'll respin/test and post a new version

Best
Neil

> 
> 
> 

^ permalink raw reply

* [PATCH v3 2/2] pegasus: fixes reported packet length
From: Petko Manolov @ 2016-04-27 11:24 UTC (permalink / raw)
  To: netdev; +Cc: davem, a1291762, johannes, Petko Manolov
In-Reply-To: <1461756290-27421-1-git-send-email-petkan@mip-labs.com>

The default Pegasus setup was to append the status and CRC at the end of each
received packet.  The status bits are used to update various stats, but CRC has
been ignored.  The new default is to not append CRC at the end of RX packets.

Signed-off-by: Petko Manolov <petkan@mip-labs.com>
---
 drivers/net/usb/pegasus.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/usb/pegasus.c b/drivers/net/usb/pegasus.c
index f919e20..82129ee 100644
--- a/drivers/net/usb/pegasus.c
+++ b/drivers/net/usb/pegasus.c
@@ -411,7 +411,7 @@ static int enable_net_traffic(struct net_device *dev, struct usb_device *usb)
 	int ret;
 
 	read_mii_word(pegasus, pegasus->phy, MII_LPA, &linkpart);
-	data[0] = 0xc9;
+	data[0] = 0xc8; /* TX & RX enable, append status, no CRC */
 	data[1] = 0;
 	if (linkpart & (ADVERTISE_100FULL | ADVERTISE_10FULL))
 		data[1] |= 0x20;	/* set full duplex */
@@ -497,7 +497,7 @@ static void read_bulk_callback(struct urb *urb)
 		pkt_len = buf[count - 3] << 8;
 		pkt_len += buf[count - 4];
 		pkt_len &= 0xfff;
-		pkt_len -= 8;
+		pkt_len -= 4;
 	}
 
 	/*
-- 
2.8.0.rc3

^ permalink raw reply related

* [PATCH v3 0/2] pegasus: correct buffer & packet sizes
From: Petko Manolov @ 2016-04-27 11:24 UTC (permalink / raw)
  To: netdev; +Cc: davem, a1291762, johannes, Petko Manolov

As noticed by Lincoln Ramsay <a1291762@gmail.com> some old (usb 1.1) Pegasus
based devices may actually return more bytes than the specified in the datasheet
amount.  That would not be a problem if the allocated space for the SKB was
equal to the parameter passed to usb_fill_bulk_urb().  Some poor bugger (i
really hope it was not me, but 'git blame' is useless in this case, so anyway)
decided to add '+ 8' to the buffer length parameter.  Sometimes the usb transfer
overflows and corrupts the socket structure, leading to kernel panic.

The above doesn't seem to happen for newer (Pegasus2 based) devices which did
help this bug to hide for so long.

The new default is to not include the CRC at the end of each received package.  
So far CRC has been ignored which makes no sense to do it in a first place.

The patch is against v4.6-rc5 and was tested on ADM8515 device by transferring
multiple gigabytes of data over a couple of days without any complaints from the
kernel.  Please apply it to whatever net tree you deem fit.

Changes since v1:

 - split the patch in two parts;
 - corrected the subject lines;

Changes since v2:

 - do not append CRC by default (based on a discussion with Johannes Berg);

Petko Manolov (2):
  pegasus: fixes URB buffer allocation size;
  pegasus: fixes reported packet length

 drivers/net/usb/pegasus.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

-- 
2.8.0.rc3

^ permalink raw reply

* [PATCH v3 1/2] pegasus: fixes URB buffer allocation size;
From: Petko Manolov @ 2016-04-27 11:24 UTC (permalink / raw)
  To: netdev; +Cc: davem, a1291762, johannes, Petko Manolov
In-Reply-To: <1461756290-27421-1-git-send-email-petkan@mip-labs.com>

usb_fill_bulk_urb() receives buffer length parameter 8 bytes larger
than what's allocated by alloc_skb(); This seems to be a problem with
older (pegasus usb-1.1) devices, which may silently return more data
than the maximal packet length.

Reported-by: Lincoln Ramsay <a1291762@gmail.com>
Signed-off-by: Petko Manolov <petkan@mip-labs.com>
---
 drivers/net/usb/pegasus.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/usb/pegasus.c b/drivers/net/usb/pegasus.c
index f840802..f919e20 100644
--- a/drivers/net/usb/pegasus.c
+++ b/drivers/net/usb/pegasus.c
@@ -528,7 +528,7 @@ static void read_bulk_callback(struct urb *urb)
 goon:
 	usb_fill_bulk_urb(pegasus->rx_urb, pegasus->usb,
 			  usb_rcvbulkpipe(pegasus->usb, 1),
-			  pegasus->rx_skb->data, PEGASUS_MTU + 8,
+			  pegasus->rx_skb->data, PEGASUS_MTU,
 			  read_bulk_callback, pegasus);
 	rx_status = usb_submit_urb(pegasus->rx_urb, GFP_ATOMIC);
 	if (rx_status == -ENODEV)
@@ -569,7 +569,7 @@ static void rx_fixup(unsigned long data)
 	}
 	usb_fill_bulk_urb(pegasus->rx_urb, pegasus->usb,
 			  usb_rcvbulkpipe(pegasus->usb, 1),
-			  pegasus->rx_skb->data, PEGASUS_MTU + 8,
+			  pegasus->rx_skb->data, PEGASUS_MTU,
 			  read_bulk_callback, pegasus);
 try_again:
 	status = usb_submit_urb(pegasus->rx_urb, GFP_ATOMIC);
@@ -823,7 +823,7 @@ static int pegasus_open(struct net_device *net)
 
 	usb_fill_bulk_urb(pegasus->rx_urb, pegasus->usb,
 			  usb_rcvbulkpipe(pegasus->usb, 1),
-			  pegasus->rx_skb->data, PEGASUS_MTU + 8,
+			  pegasus->rx_skb->data, PEGASUS_MTU,
 			  read_bulk_callback, pegasus);
 	if ((res = usb_submit_urb(pegasus->rx_urb, GFP_KERNEL))) {
 		if (res == -ENODEV)
-- 
2.8.0.rc3

^ permalink raw reply related

* Re: [net-next PATCH V3 3/5] samples/bpf: add a README file to get users started
From: Naveen N. Rao @ 2016-04-27 11:00 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: netdev, linux-kbuild, bblanco, borkmann, alexei.starovoitov
In-Reply-To: <20160427111634.38305214@redhat.com>

On 2016/04/27 11:16AM, Jesper Dangaard Brouer wrote:
> On Wed, 27 Apr 2016 14:05:22 +0530
> "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> wrote:
> 
> > On 2016/04/27 09:30AM, Jesper Dangaard Brouer wrote:
> > > Getting started with using examples in samples/bpf/ is not
> > > straightforward.  There are several dependencies, and specific
> > > versions of these dependencies.
> > > 
> > > Just compiling the example tool is also slightly obscure, e.g. one
> > > need to call make like:
> > > 
> > >  make samples/bpf/
> > > 
> > > Do notice the "/" slash after the directory name.
> > > 
> > > Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> > > ---
> > >  samples/bpf/README.rst |   75 ++++++++++++++++++++++++++++++++++++++++++++++++
> > >  1 file changed, 75 insertions(+)
> > >  create mode 100644 samples/bpf/README.rst  
> > 
> > Thanks for adding this! A few nits...
> 
> I would prefer if we could apply this patchset and you could followup
> with a patch with your nits...

... and have another patch just for that?
Regardless, I thought the reason we review is so the patch that goes in 
is already in a good shape.

> 
> > > 
> > > diff --git a/samples/bpf/README.rst b/samples/bpf/README.rst
> > > new file mode 100644
> > > index 000000000000..1fa157db905b
> > > --- /dev/null
> > > +++ b/samples/bpf/README.rst
> > > @@ -0,0 +1,75 @@
> > > +eBPF sample programs
> > > +====================
> > > +
> > > +This kernel samples/bpf directory contains a mini eBPF library, test  
> > 	^^^^^^^^^^^^^^^^^^
> > 'This directory contains' should suffice.
> 
> The reason I formulated it like this, was that people will often hit
> this kind of documentation when searching google.

That doesn't make sense - shouldn't they be looking at a README file in 
the local samples/bpf directory first before going to google?

> 
> 
> > > +stubs, verifier test-suite and examples for using eBPF.
> > > +
> > > +Build dependencies
> > > +==================
> > > +
> > > +Compiling requires having installed:
> > > + * clang >= version 3.4.0
> > > + * llvm >= version 3.7.1
> > > +
> > > +Note that LLVM's tool 'llc' must support target 'bpf', list with command::
> > > +
> > > + $ llc --version  
> > 
> > 'llc --version | grep bpf' is probably simpler?
> 
> I wanted to give people the impression of how the output looks like.

But, that won't help someone trying to check if their installed llc has 
bpf support or not.
> 
> > > + LLVM (http://llvm.org/):
> > > +  LLVM version 3.x.y
> > > +  [...]
> > > +  Host CPU: xxx

For instance, is the above output something the user needs to see to 
ensure BPF support for llc?

> > > +
> > > +  Registered Targets:
> > > +    [...]
> > > +    bpf        - BPF (host endian)
> > > +    bpfeb      - BPF (big endian)
> > > +    bpfel      - BPF (little endian)

The above is what really matters. Adding 'grep bpf' makes it explicit on 
what the user needs to look for.

> > > +    [...]
> > > +
> > > +Kernel headers
> > > +--------------
> > > +
> > > +There are usually dependencies to header files of the current kernel.
> > > +To avoid installing devel kernel headers system wide, as a normal
> > > +user, simply call::
> > > +
> > > + make headers_install
> > > +
> > > +This will creates a local "usr/include" directory in the git/build top
> > > +level directory, that the make system automatically pickup first.
> > > +
> > > +Compiling
> > > +=========
> > > +
> > > +For compiling goto kernel top level build directory and run make like::  
> > 
> > For building the BPF samples, issue the below command from the kernel 
> > root directory:
> 
> I like your formulation better, but it it worth a respin of the entire
> patchset? 
> 
> Notice you need the extra "::" ending of the paragraph, to make this
> document format nicely with RST (ReStructuredText).
> 
> The a README with a .rst suffix will be picked up by github and
> displayed as the doc for the directory. Thus I also made sure it
> "compiles" with the rst tools. E.g see how samples/pktgen gets auto
> documented and nicely formatted via github (scroll down):
>  https://github.com/torvalds/linux/tree/master/samples/pktgen

Looks nice, though I wasn't aware we had any text in the kernel tree 
adhering to this formatting.

> 
> > > +
> > > + make samples/bpf/
> > > +
> > > +Do notice the "/" slash after the directory name.
> > > +
> > > +Manually compiling LLVM with 'bpf' support
> > > +------------------------------------------
> > > +
> > > +Since version 3.7.0, LLVM adds a proper LLVM backend target for the
> > > +BPF bytecode architecture.
> > > +
> > > +By default llvm will build all non-experimental backends including bpf.
> > > +To generate a smaller llc binary one can use::
> > > +
> > > + -DLLVM_TARGETS_TO_BUILD="BPF;X86"  
> > 
> > Is the X86 target really needed?
> 
> I'm not sure, but if you want to use clang/llc for something else it is
> useful, and the example usage of the ";" separator syntax makes it
> worth including as an example.

Ok. The reason I asked is if users need to include the appropriate arch 
target depending on where they build this. It doesn't look like X86 or 
other architecture targets are necessary though.

- Naveen


^ permalink raw reply

* [PATCH] fq: split out backlog update logic
From: Michal Kazior @ 2016-04-27 10:59 UTC (permalink / raw)
  To: netdev; +Cc: davem, johannes, Michal Kazior

mac80211 (which will be the first user of the
fq.h) recently started to support software A-MSDU
aggregation. It glues skbuffs together into a
single one so the backlog accounting needs to be
more fine-grained.

To avoid backlog sorting logic duplication split
it up for re-use.

Signed-off-by: Michal Kazior <michal.kazior@tieto.com>
---

While preparing a re-spin of fq_codel for
mac80211-next/master I've noticed that it's
current head has the new software A-MSDU
aggregation.

I'm aware I just recently submitted fq.h.

Sorry for the noise!


 include/net/fq_impl.h | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/include/net/fq_impl.h b/include/net/fq_impl.h
index 02eab7c51adb..163f3ed0f05a 100644
--- a/include/net/fq_impl.h
+++ b/include/net/fq_impl.h
@@ -120,6 +120,24 @@ static struct fq_flow *fq_flow_classify(struct fq *fq,
 	return flow;
 }
 
+static void fq_recalc_backlog(struct fq *fq,
+			      struct fq_tin *tin,
+			      struct fq_flow *flow)
+{
+	struct fq_flow *i;
+
+	if (list_empty(&flow->backlogchain))
+		list_add_tail(&flow->backlogchain, &fq->backlogs);
+
+	i = flow;
+	list_for_each_entry_continue_reverse(i, &fq->backlogs,
+					     backlogchain)
+		if (i->backlog > flow->backlog)
+			break;
+
+	list_move(&flow->backlogchain, &i->backlogchain);
+}
+
 static void fq_tin_enqueue(struct fq *fq,
 			   struct fq_tin *tin,
 			   struct sk_buff *skb,
@@ -127,7 +145,6 @@ static void fq_tin_enqueue(struct fq *fq,
 			   fq_flow_get_default_t get_default_func)
 {
 	struct fq_flow *flow;
-	struct fq_flow *i;
 
 	lockdep_assert_held(&fq->lock);
 
@@ -139,16 +156,7 @@ static void fq_tin_enqueue(struct fq *fq,
 	tin->backlog_packets++;
 	fq->backlog++;
 
-	if (list_empty(&flow->backlogchain))
-		list_add_tail(&flow->backlogchain, &fq->backlogs);
-
-	i = flow;
-	list_for_each_entry_continue_reverse(i, &fq->backlogs,
-					     backlogchain)
-		if (i->backlog > flow->backlog)
-			break;
-
-	list_move(&flow->backlogchain, &i->backlogchain);
+	fq_recalc_backlog(fq, tin, flow);
 
 	if (list_empty(&flow->flowchain)) {
 		flow->deficit = fq->quantum;
-- 
2.1.4

^ permalink raw reply related

* Re: [PATCH] veth: Fix potential memory leak in veth_newlink
From: Nikolay Aleksandrov @ 2016-04-27 10:57 UTC (permalink / raw)
  To: Haishuang Yan, David S. Miller, Toshiaki Makita; +Cc: netdev, linux-kernel
In-Reply-To: <1461753739-4803-1-git-send-email-yanhaishuang@cmss.chinamobile.com>

On 04/27/2016 12:42 PM, Haishuang Yan wrote:
> Free peer netdev when failed to configure peer link or register dev.
> 
> Signed-off-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
> ---
>  drivers/net/veth.c | 1 -
>  1 file changed, 1 deletion(-)
> 
> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> index f37a6e6..8bb9fb8 100644
> --- a/drivers/net/veth.c
> +++ b/drivers/net/veth.c
> @@ -472,7 +472,6 @@ err_register_dev:
>  	/* nothing to do */
>  err_configure_peer:
>  	unregister_netdevice(peer);
> -	return err;
>  
>  err_register_peer:
>  	free_netdev(peer);
> 

No, it won't leak. unregister_netdevice() will queue it on the todo list and at
the next rtnl unlock the peer device will get freed.
In fact calling it like this you'll hit BUG_ON(dev->reg_state != NETREG_UNREGISTERED) in
free_netdev so I guess you didn't even test your patch.

^ permalink raw reply

* Re: [patch] tipc: remove an unnecessary NULL check
From: Xue, Ying @ 2016-04-27 10:51 UTC (permalink / raw)
  To: Dan Carpenter, Jon Maloy
  Cc: netdev@vger.kernel.org, kernel-janitors@vger.kernel.org,
	tipc-discussion@lists.sourceforge.net, David S. Miller
In-Reply-To: <20160427080528.GA22469@mwanda>

> From: Dan Carpenter [mailto:dan.carpenter@oracle.com]
> Sent: 2016年4月27日 16:05
> To: Jon Maloy
> Cc: Xue, Ying; David S. Miller; netdev@vger.kernel.org; tipc-
> discussion@lists.sourceforge.net; kernel-janitors@vger.kernel.org
> Subject: [patch] tipc: remove an unnecessary NULL check
> 
> This is never called with a NULL "buf" and anyway, we dereference 's' on the
> lines before so it would Oops before we reach the check.
> 
> Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>

Acked-by: Ying Xue <ying.xue@windriver.com>

> 
> diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 79de588..0dd0224
> 100644
> --- a/net/tipc/subscr.c
> +++ b/net/tipc/subscr.c
> @@ -326,8 +326,7 @@ static void tipc_subscrb_rcv_cb(struct net *net, int
> conid,
>  		return tipc_subscrp_cancel(s, subscriber);
>  	}
> 
> -	if (s)
> -		tipc_subscrp_subscribe(net, s, subscriber, swap);
> +	tipc_subscrp_subscribe(net, s, subscriber, swap);
>  }
> 
>  /* Handle one request to establish a new subscriber */
------------------------------------------------------------------------------
Find and fix application performance issues faster with Applications Manager
Applications Manager provides deep performance insights into multiple tiers of
your business applications. It resolves application problems quickly and
reduces your MTTR. Get your free trial!
https://ad.doubleclick.net/ddm/clk/302982198;130105516;z
_______________________________________________
tipc-discussion mailing list
tipc-discussion@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/tipc-discussion

^ permalink raw reply

* [PATCH] veth: Fix potential memory leak in veth_newlink
From: Haishuang Yan @ 2016-04-27 10:42 UTC (permalink / raw)
  To: David S. Miller, Toshiaki Makita; +Cc: netdev, linux-kernel, Haishuang Yan

Free peer netdev when failed to configure peer link or register dev.

Signed-off-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
---
 drivers/net/veth.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index f37a6e6..8bb9fb8 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -472,7 +472,6 @@ err_register_dev:
 	/* nothing to do */
 err_configure_peer:
 	unregister_netdevice(peer);
-	return err;
 
 err_register_peer:
 	free_netdev(peer);
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH net-next] drivers/net: add 6WIND SHULTI support
From: Florian Westphal @ 2016-04-27  9:56 UTC (permalink / raw)
  To: Nicolas Dichtel; +Cc: davem, netdev, jiri
In-Reply-To: <1461749838-4613-1-git-send-email-nicolas.dichtel@6wind.com>

Nicolas Dichtel <nicolas.dichtel@6wind.com> wrote:
> This patch adds the support of the 6WIND SHULTI switch. It is a software
> switch doing L2 forwarding.
> 
> This first version implements the minimum needed to get the device working.
> It also implements, via switchdev and rtnetlink, bridge forwarding offload,
> including FDB static entries, FDB learning and FDB ageing.

How is this different from net/bridge?
How is this different from openvswitch?

^ permalink raw reply

* Re: [PATCH net-next] drivers/net: add 6WIND SHULTI support
From: Jiri Pirko @ 2016-04-27  9:50 UTC (permalink / raw)
  To: Nicolas Dichtel; +Cc: davem, netdev
In-Reply-To: <1461749838-4613-1-git-send-email-nicolas.dichtel@6wind.com>

Wed, Apr 27, 2016 at 11:37:18AM CEST, nicolas.dichtel@6wind.com wrote:
>This patch adds the support of the 6WIND SHULTI switch. It is a software
>switch doing L2 forwarding.
>
>This first version implements the minimum needed to get the device working.
>It also implements, via switchdev and rtnetlink, bridge forwarding offload,
>including FDB static entries, FDB learning and FDB ageing.

I have to review this more closely, but at the first glance, this is
triggering a TRAMPOLINE ALERT!

Would you please share more description about this "driver". What is the
purpose, what are benefits, usecases, etc.

Thanks!

^ permalink raw reply

* Re: [PATCH v2 0/2] pegasus: correct buffer sizes
From: Johannes Berg @ 2016-04-27  9:44 UTC (permalink / raw)
  To: Petko Manolov; +Cc: netdev, davem, a1291762
In-Reply-To: <20160427093358.GA30444@p310>

On Wed, 2016-04-27 at 12:33 +0300, Petko Manolov wrote:
> 
> Your guess turned out to be not so wild.  ;)  All Pegasus devices are
> configured  (by the driver) to append CRC at the end of each RX
> packet.  However, the driver reports packet length that does not
> include it.  

Interesting, then my guess was wrong though, since the length is
reported without it, or am I misunderstanding this?

> I doubt the appended CRC is being silently verified by the upper
> layer, bit i might be wrong of course.

It's even "outside" the skb as you describe it, so it can't even be
touched, no?

> Perhaps it is best if instruct the device to not include the CRC as
> it seems ignored anyway.

Yeah, there's no point in passing it over the bus.

johannes

^ permalink raw reply

* [PATCH net-next] drivers/net: add 6WIND SHULTI support
From: Nicolas Dichtel @ 2016-04-27  9:37 UTC (permalink / raw)
  To: davem; +Cc: netdev, jiri, Nicolas Dichtel

This patch adds the support of the 6WIND SHULTI switch. It is a software
switch doing L2 forwarding.

This first version implements the minimum needed to get the device working.
It also implements, via switchdev and rtnetlink, bridge forwarding offload,
including FDB static entries, FDB learning and FDB ageing.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 MAINTAINERS                                        |    9 +
 drivers/net/ethernet/6wind/Kconfig                 |   20 +
 drivers/net/ethernet/6wind/Makefile                |    6 +
 drivers/net/ethernet/6wind/shulti/Kconfig          |   12 +
 drivers/net/ethernet/6wind/shulti/Makefile         |    6 +
 drivers/net/ethernet/6wind/shulti/shulti_core.c    | 1164 ++++++++++++++++++++
 drivers/net/ethernet/6wind/shulti/shulti_private.h |   68 ++
 drivers/net/ethernet/6wind/shulti/shulti_swdev.c   |  233 ++++
 drivers/net/ethernet/Kconfig                       |    1 +
 drivers/net/ethernet/Makefile                      |    1 +
 include/linux/netdevice.h                          |    4 +
 include/uapi/linux/Kbuild                          |    1 +
 include/uapi/linux/shulti.h                        |   87 ++
 13 files changed, 1612 insertions(+)
 create mode 100644 drivers/net/ethernet/6wind/Kconfig
 create mode 100644 drivers/net/ethernet/6wind/Makefile
 create mode 100644 drivers/net/ethernet/6wind/shulti/Kconfig
 create mode 100644 drivers/net/ethernet/6wind/shulti/Makefile
 create mode 100644 drivers/net/ethernet/6wind/shulti/shulti_core.c
 create mode 100644 drivers/net/ethernet/6wind/shulti/shulti_private.h
 create mode 100644 drivers/net/ethernet/6wind/shulti/shulti_swdev.c
 create mode 100644 include/uapi/linux/shulti.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 37691abd894c..608ab3fe0eea 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -166,6 +166,15 @@ L:	linux-hams@vger.kernel.org
 S:	Maintained
 F:	drivers/net/hamradio/6pack.c
 
+6WIND NETWORK DRIVERS
+M: 	Nicolas Dichtel <nicolas.dichtel@6wind.com>
+L:	netdev@vger.kernel.org
+S:	Supported
+W:	http://www.6wind.com/
+Q:	http://patchwork.ozlabs.org/project/netdev/list/
+F:	include/uapi/linux/shulti.h
+F:	drivers/net/ethernet/6wind/
+
 8169 10/100/1000 GIGABIT ETHERNET DRIVER
 M:	Realtek linux nic maintainers <nic_swsd@realtek.com>
 L:	netdev@vger.kernel.org
diff --git a/drivers/net/ethernet/6wind/Kconfig b/drivers/net/ethernet/6wind/Kconfig
new file mode 100644
index 000000000000..495e692a1547
--- /dev/null
+++ b/drivers/net/ethernet/6wind/Kconfig
@@ -0,0 +1,20 @@
+#
+# 6WIND drivers configuration
+#
+
+config NET_VENDOR_6WIND
+	bool "6WIND devices"
+	default y
+	---help---
+	  If you have a network device belonging to this class, say Y.
+
+	  Note that the answer to this question doesn't directly affect the
+	  kernel: saying N will just cause the configurator to skip all
+	  the questions about 6WIND devices. If you say Y, you will be asked
+	  for your specific device in the following questions.
+
+if NET_VENDOR_6WIND
+
+source "drivers/net/ethernet/6wind/shulti/Kconfig"
+
+endif # NET_VENDOR_6WIND
diff --git a/drivers/net/ethernet/6wind/Makefile b/drivers/net/ethernet/6wind/Makefile
new file mode 100644
index 000000000000..7375a2c6e09e
--- /dev/null
+++ b/drivers/net/ethernet/6wind/Makefile
@@ -0,0 +1,6 @@
+#
+#
+# Makefile for the 6WIND device drivers.
+#
+
+obj-$(CONFIG_6WIND_SHULTI) += shulti/
diff --git a/drivers/net/ethernet/6wind/shulti/Kconfig b/drivers/net/ethernet/6wind/shulti/Kconfig
new file mode 100644
index 000000000000..c7daa8461e35
--- /dev/null
+++ b/drivers/net/ethernet/6wind/shulti/Kconfig
@@ -0,0 +1,12 @@
+#
+# 6WIND SHULTI driver configuration
+#
+
+config 6WIND_SHULTI
+	tristate "6WIND SHULTI support"
+	depends on NET_SWITCHDEV
+	---help---
+	  This driver supports 6WIND SHULTI switch device.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called shulti.
diff --git a/drivers/net/ethernet/6wind/shulti/Makefile b/drivers/net/ethernet/6wind/shulti/Makefile
new file mode 100644
index 000000000000..faf4ffe8aac5
--- /dev/null
+++ b/drivers/net/ethernet/6wind/shulti/Makefile
@@ -0,0 +1,6 @@
+#
+# Makefile for the 6WIND SHULTI driver.
+#
+
+obj-$(CONFIG_6WIND_SHULTI)	+= shulti.o
+shulti-objs			:= shulti_core.o shulti_swdev.o
diff --git a/drivers/net/ethernet/6wind/shulti/shulti_core.c b/drivers/net/ethernet/6wind/shulti/shulti_core.c
new file mode 100644
index 000000000000..f5de0b6a946e
--- /dev/null
+++ b/drivers/net/ethernet/6wind/shulti/shulti_core.c
@@ -0,0 +1,1164 @@
+/*
+ * Copyright (c) 2016 6WIND S.A.
+ * Copyright (c) 2016 Nicolas Dichtel <nicolas.dichtel@6wind.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+#include <linux/skbuff.h>
+#include <linux/completion.h>
+#include <linux/rtnetlink.h>
+#include <linux/shulti.h>
+#include <net/genetlink.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+#include <net/switchdev.h>
+
+#include "shulti_private.h"
+
+#define DRV_VERSION	"1.0"
+
+static unsigned int shulti_tx_queues;
+static unsigned int shulti_rx_queues;
+
+static int shulti_net_id __read_mostly;
+struct shulti_net *shulti_pernet(struct net *net)
+{
+	return net_generic(net, shulti_net_id);
+}
+
+struct shulti_priv *__shulti_lookup_port(struct shulti_swdev *swdev,
+					 u32 swdev_portid)
+{
+	struct shulti_priv *port = NULL;
+
+	list_for_each_entry_rcu(port, &swdev->port_list, list)
+		if (port->swdev_portid == swdev_portid)
+			return port;
+
+	return NULL;
+}
+
+/* genl functions */
+struct genl_family shulti_genl_family = {
+	.id = GENL_ID_GENERATE,
+	.hdrsize = sizeof(struct shulti_genl_hdr),
+	.name = SHULTI_NAME,
+	.version = 1,
+	.maxattr = SHULTI_A_MAX,
+	.netnsok = true,
+};
+
+static const struct genl_multicast_group shulti_mcgrp[] = {
+	{ .name = SHULTI_NAME, },
+};
+
+static const struct nla_policy shulti_genl_policy[SHULTI_A_MAX + 1] = {
+	[SHULTI_A_NL_PORTIDS]	= { .type = NLA_NESTED },
+	[SHULTI_A_SWDEV_PORTID]	= { .type = NLA_U32 },
+	[SHULTI_A_RX_QUEUES]	= { .type = NLA_U32 },
+	[SHULTI_A_ERRCODE]	= { .type = NLA_S32 },
+	[SHULTI_A_IFINDEX]	= { .type = NLA_S32 },
+	[SHULTI_A_LINK_STATUS]	= { .type = NLA_U8 },
+	[SHULTI_A_STATS]	= { .len = sizeof(struct rtnl_link_stats64) },
+	[SHULTI_A_VLAN_VID]	= { .len = sizeof(struct shulti_vlan_vid) },
+	[SHULTI_A_DRVINFO]	= { .len = sizeof(struct ethtool_drvinfo) },
+	[SHULTI_A_SETTINGS]	= { .len = sizeof(struct ethtool_cmd) },
+	[SHULTI_A_RX_MODES]	= { .type = NLA_U32 },
+	[SHULTI_A_UC_ADDR]	= { .len = ETH_ALEN },
+	[SHULTI_A_MC_ADDR]	= { .len = ETH_ALEN },
+	[SHULTI_A_STP_STATE]	= { .type = NLA_U8 },
+	[SHULTI_A_BR_STATE_LEARNING] = { .type = NLA_U8 },
+	[SHULTI_A_BR_AGEING_TIME] = { .type = NLA_U32 },
+	[SHULTI_A_BR_FDB]	= { .len = sizeof(struct shulti_br_fdb) },
+};
+
+static int shulti_set_tx_queues(struct shulti_swdev *swdev, unsigned int num)
+{
+	/* rtnl get_num_tx_queues ops is global to the module hence this info
+	 * is global.
+	 */
+	if (shulti_tx_queues && num &&
+	    shulti_tx_queues != num)
+		return -EINVAL;
+
+	swdev->data_nl_portids_cnt = num;
+	shulti_tx_queues = num;
+	return 0;
+}
+
+static void shulti_destroy_port_rcu(struct rcu_head *head)
+{
+	struct shulti_priv *port = container_of(head, struct shulti_priv,
+						rcu);
+	struct net_device *dev = priv2netdev(port);
+
+	port->swdev_id = 0;
+	port->swdev_portid = 0;
+	port->genl_req.portid = 0;
+	INIT_LIST_HEAD_RCU(&port->list);
+	shulti_switchdev_fini(dev);
+	dev_put(dev);
+}
+
+static void __shulti_release_swdev_port(struct shulti_priv *port)
+{
+	list_del_rcu(&port->list);
+	call_rcu(&port->rcu, shulti_destroy_port_rcu);
+}
+
+static void __shulti_release_swdev(struct shulti_swdev *swdev)
+{
+	struct shulti_priv *port, *tmp;
+	LIST_HEAD(swdev_port_list);
+
+	ASSERT_RTNL();
+
+	/* Ensure that it's not already freed */
+	if (!swdev->data_nl_portids)
+		return;
+
+	list_for_each_entry_safe(port, tmp, &swdev->port_list, list) {
+		__shulti_release_swdev_port(port);
+		unregister_netdevice_queue(priv2netdev(port), &swdev_port_list);
+	}
+	unregister_netdevice_many(&swdev_port_list);
+
+	swdev->ctrl_nl_portid = 0;
+	swdev->swdev_id = 0;
+	shulti_set_tx_queues(swdev, 0);
+	shulti_rx_queues = 0;
+	kfree(swdev->data_nl_portids);
+	swdev->data_nl_portids = NULL;
+	module_put(THIS_MODULE);
+}
+
+/* genl commands */
+static int shulti_genl_bind(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net *net = genl_info_net(info);
+	struct shulti_net *pernet = shulti_pernet(net);
+	struct shulti_genl_hdr *hdr = info->userhdr;
+	int err = -EAGAIN;
+	size_t len;
+
+	if (!info->attrs[SHULTI_A_RX_QUEUES] ||
+	    !info->attrs[SHULTI_A_NL_PORTIDS])
+		return -EINVAL;
+
+	if (!try_module_get(THIS_MODULE))
+		return -EAGAIN;
+
+	if (!rtnl_trylock())
+		goto err_mod;
+
+	/* Only one instance is allowed for now. */
+	err = -EBUSY;
+	if (pernet->swdev.swdev_id)
+		goto err_lock;
+
+	/* rtnl get_num_rx_queues ops is global to the module hence this info
+	 * is global.
+	 */
+	err = -EINVAL;
+	if (shulti_rx_queues &&
+	    shulti_rx_queues != nla_get_u32(info->attrs[SHULTI_A_RX_QUEUES]))
+		goto err_lock;
+	shulti_rx_queues = nla_get_u32(info->attrs[SHULTI_A_RX_QUEUES]);
+
+	len = nla_len(info->attrs[SHULTI_A_NL_PORTIDS]);
+	pernet->swdev.data_nl_portids = kmalloc(len, GFP_KERNEL);
+	err = -ENOMEM;
+	if (!pernet->swdev.data_nl_portids)
+		goto err_lock;
+	err = shulti_set_tx_queues(&pernet->swdev, len / sizeof(u32));
+	if (err < 0)
+		goto err_free;
+	memcpy(pernet->swdev.data_nl_portids,
+	       nla_data(info->attrs[SHULTI_A_NL_PORTIDS]), len);
+
+	pernet->swdev.swdev_id = hdr->swdev_id;
+	pernet->swdev.ctrl_nl_portid = info->snd_portid;
+	rtnl_unlock();
+	return 0;
+
+err_free:
+	kfree(pernet->swdev.data_nl_portids);
+	pernet->swdev.data_nl_portids = NULL;
+err_lock:
+	rtnl_unlock();
+err_mod:
+	module_put(THIS_MODULE);
+	return err;
+}
+
+static int shulti_genl_unbind(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net *net = genl_info_net(info);
+	struct shulti_net *pernet = shulti_pernet(net);
+	struct shulti_genl_hdr *hdr = info->userhdr;
+	int err = -EINVAL;
+
+	if (!rtnl_trylock())
+		return -EAGAIN;
+
+	if (!pernet->swdev.swdev_id ||
+	    pernet->swdev.swdev_id != hdr->swdev_id ||
+	    pernet->swdev.ctrl_nl_portid != info->snd_portid)
+		goto end;
+
+	__shulti_release_swdev(&pernet->swdev);
+	err = 0;
+end:
+	rtnl_unlock();
+	return err;
+}
+
+static int shulti_genl_port_info(struct sk_buff *skb, struct genl_info *info)
+{
+	struct shulti_genl_hdr *shulti_hdr = info->userhdr;
+	struct net *net = genl_info_net(info);
+	struct shulti_net *pernet = shulti_pernet(net);
+	int err = -EMSGSIZE, ifindex;
+	struct sk_buff *msg = NULL;
+	struct shulti_priv *port;
+	struct net_device *dev;
+	u32 swdev_portid;
+	size_t len;
+
+	if (!info->attrs[SHULTI_A_SWDEV_PORTID])
+		return -EINVAL;
+
+	if (shulti_hdr->swdev_id != pernet->swdev.swdev_id)
+		return -EINVAL;
+
+	swdev_portid = nla_get_u32(info->attrs[SHULTI_A_SWDEV_PORTID]);
+	rcu_read_lock();
+	port = __shulti_lookup_port(&pernet->swdev, swdev_portid);
+	if (!port) {
+		rcu_read_unlock();
+		return -ENXIO;
+	}
+	dev = priv2netdev(port);
+	ifindex = dev->ifindex;
+	rcu_read_unlock();
+
+	len = genlmsg_total_size(sizeof(struct shulti_genl_hdr))
+	      + nla_total_size(sizeof(uint32_t)) /* SHULTI_A_SWDEV_PORTID */
+	      + nla_total_size(sizeof(int));     /* SHULTI_A_IFINDEX */
+	msg = genlmsg_new(len, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	shulti_hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
+				 &shulti_genl_family, 0, SHULTI_C_PORT_INFO);
+	if (!shulti_hdr)
+		goto err;
+	shulti_hdr->swdev_id = pernet->swdev.swdev_id;
+
+	if (nla_put_u32(msg, SHULTI_A_SWDEV_PORTID, swdev_portid))
+		goto err;
+	if (nla_put_s32(msg, SHULTI_A_IFINDEX, ifindex))
+		goto err;
+
+	genlmsg_end(msg, shulti_hdr);
+	return genlmsg_reply(msg, info);
+err:
+	nlmsg_free(msg);
+	return err;
+}
+
+static int shulti_genl_link_status(struct sk_buff *msg, struct genl_info *info)
+{
+	struct shulti_genl_hdr *shulti_hdr = info->userhdr;
+	struct net *net = genl_info_net(info);
+	struct shulti_net *pernet = shulti_pernet(net);
+	struct shulti_priv *port;
+	struct net_device *dev;
+	int err = -ENOENT;
+	u32 swdev_portid;
+
+	if (!info->attrs[SHULTI_A_SWDEV_PORTID] ||
+	    !info->attrs[SHULTI_A_LINK_STATUS])
+		return -EINVAL;
+
+	if (shulti_hdr->swdev_id != pernet->swdev.swdev_id)
+		return -EINVAL;
+
+	swdev_portid = nla_get_u32(info->attrs[SHULTI_A_SWDEV_PORTID]);
+	rcu_read_lock();
+	port = __shulti_lookup_port(&pernet->swdev, swdev_portid);
+	if (!port)
+		goto out;
+	dev = priv2netdev(port);
+
+	if (nla_get_u8(info->attrs[SHULTI_A_LINK_STATUS]))
+		netif_carrier_on(dev);
+	else
+		netif_carrier_off(dev);
+out:
+	rcu_read_unlock();
+	return err;
+}
+
+static int shulti_genl_packet(struct sk_buff *msg, struct genl_info *info)
+{
+	struct shulti_genl_hdr *shulti_hdr = info->userhdr;
+	struct net *net = genl_info_net(info);
+	struct shulti_net *pernet = shulti_pernet(net);
+	struct shulti_priv *port;
+	struct net_device *dev;
+	struct sk_buff *skb;
+	u32 swdev_portid;
+	size_t len;
+	int err = -ENOENT;
+
+	if (!info->attrs[SHULTI_A_SWDEV_PORTID] ||
+	    !info->attrs[SHULTI_A_PACKET])
+		return -EINVAL;
+
+	if (shulti_hdr->swdev_id != pernet->swdev.swdev_id)
+		return -EINVAL;
+
+	swdev_portid = nla_get_u32(info->attrs[SHULTI_A_SWDEV_PORTID]);
+	rcu_read_lock();
+	port = __shulti_lookup_port(&pernet->swdev, swdev_portid);
+	if (!port)
+		goto out;
+	dev = priv2netdev(port);
+
+	err = -ENETDOWN;
+	if (!(dev->flags & IFF_UP))
+		goto out;
+
+	err = -ENOMEM;
+	len = nla_len(info->attrs[SHULTI_A_PACKET]);
+	skb = netdev_alloc_skb_ip_align(dev, len);
+	if (!skb)
+		goto out;
+	skb_reserve(skb, NET_IP_ALIGN);
+
+	skb_copy_to_linear_data(skb, nla_data(info->attrs[SHULTI_A_PACKET]),
+				len);
+	skb_put(skb, len);
+	skb->protocol = eth_type_trans(skb, dev);
+	netif_rx(skb);
+	err = 0;
+out:
+	rcu_read_unlock();
+	return err;
+}
+
+static int shulti_genl_answer(struct sk_buff *msg, struct genl_info *info,
+			      int attr)
+{
+	struct shulti_genl_hdr *shulti_hdr = info->userhdr;
+	struct net *net = genl_info_net(info);
+	struct shulti_net *pernet = shulti_pernet(net);
+	struct shulti_priv *port;
+	u32 swdev_portid;
+	int err = -ENOENT;
+
+	if (!info->attrs[SHULTI_A_SWDEV_PORTID] ||
+	    (attr && !info->attrs[attr]))
+		return -EINVAL;
+
+	if (shulti_hdr->swdev_id != pernet->swdev.swdev_id)
+		return -EINVAL;
+
+	swdev_portid = nla_get_u32(info->attrs[SHULTI_A_SWDEV_PORTID]);
+	rcu_read_lock();
+	port = __shulti_lookup_port(&pernet->swdev, swdev_portid);
+	if (!port)
+		goto out_rcu_unlock;
+
+	spin_lock(&port->genl_req.lock);
+	err = -ESRCH;
+	/* Is someone waiting for an answer? */
+	if (!port->genl_req.wait_answer)
+		goto out_spin_unlock;
+
+	if (info->attrs[SHULTI_A_ERRCODE])
+		port->genl_req.errcode =
+			nla_get_s32(info->attrs[SHULTI_A_ERRCODE]);
+	else
+		port->genl_req.errcode = 0;
+	if (attr &&
+	    port->genl_req.answer &&
+	    nla_len(info->attrs[attr]) == port->genl_req.answer_len)
+		memcpy(port->genl_req.answer, nla_data(info->attrs[attr]),
+		       nla_len(info->attrs[attr]));
+	complete(&port->genl_req.compl);
+	err = 0;
+out_spin_unlock:
+	spin_unlock(&port->genl_req.lock);
+out_rcu_unlock:
+	rcu_read_unlock();
+	return err;
+}
+
+static int shulti_genl_get_error_code(struct sk_buff *msg,
+				      struct genl_info *info)
+{
+	return shulti_genl_answer(msg, info, 0);
+}
+
+static int shulti_genl_update_stats(struct sk_buff *msg, struct genl_info *info)
+{
+	struct shulti_genl_hdr *shulti_hdr = info->userhdr;
+	struct net *net = genl_info_net(info);
+	struct shulti_net *pernet = shulti_pernet(net);
+	struct shulti_priv *port;
+	u32 swdev_portid;
+	int err = -ENOENT;
+
+	if (!info->attrs[SHULTI_A_SWDEV_PORTID] ||
+	    !info->attrs[SHULTI_A_STATS] ||
+	    shulti_hdr->swdev_id != pernet->swdev.swdev_id)
+		return -EINVAL;
+
+	swdev_portid = nla_get_u32(info->attrs[SHULTI_A_SWDEV_PORTID]);
+	rcu_read_lock();
+	port = __shulti_lookup_port(&pernet->swdev, swdev_portid);
+	if (!port)
+		goto out;
+	memcpy(&port->stats, nla_data(info->attrs[SHULTI_A_STATS]),
+	       nla_len(info->attrs[SHULTI_A_STATS]));
+	err = 0;
+out:
+	rcu_read_unlock();
+	return err;
+}
+
+static int shulti_genl_drvinfo(struct sk_buff *msg, struct genl_info *info)
+{
+	return shulti_genl_answer(msg, info, SHULTI_A_DRVINFO);
+}
+
+static int shulti_genl_get_settings(struct sk_buff *msg,
+				    struct genl_info *info)
+{
+	return shulti_genl_answer(msg, info, SHULTI_A_SETTINGS);
+}
+
+static const struct genl_ops shulti_genl_ops[] = {
+	{
+		.cmd = SHULTI_C_BIND,
+		.doit = shulti_genl_bind,
+		.policy = shulti_genl_policy,
+	},
+	{
+		.cmd = SHULTI_C_UNBIND,
+		.doit = shulti_genl_unbind,
+		.policy = shulti_genl_policy,
+	},
+	{
+		.cmd = SHULTI_C_PORT_INFO,
+		.doit = shulti_genl_port_info,
+		.policy = shulti_genl_policy,
+	},
+	{
+		.cmd = SHULTI_C_LINK_STATUS,
+		.doit = shulti_genl_link_status,
+		.policy = shulti_genl_policy,
+	},
+	{
+		.cmd = SHULTI_C_PACKET,
+		.doit = shulti_genl_packet,
+		.policy = shulti_genl_policy,
+	},
+	{
+		.cmd = SHULTI_C_UPDATE_STATS,
+		.doit = shulti_genl_update_stats,
+		.policy = shulti_genl_policy,
+	},
+	{
+		.cmd = SHULTI_C_VLAN_RX_ADD_VID,
+		.doit = shulti_genl_get_error_code,
+		.policy = shulti_genl_policy,
+	},
+	{
+		.cmd = SHULTI_C_VLAN_RX_DEL_VID,
+		.doit = shulti_genl_get_error_code,
+		.policy = shulti_genl_policy,
+	},
+	{
+		.cmd = SHULTI_C_DRVINFO,
+		.doit = shulti_genl_drvinfo,
+		.policy = shulti_genl_policy,
+	},
+	{
+		.cmd = SHULTI_C_GET_SETTINGS,
+		.doit = shulti_genl_get_settings,
+		.policy = shulti_genl_policy,
+	},
+	{
+		.cmd = SHULTI_C_SET_SETTINGS,
+		.doit = shulti_genl_get_error_code,
+		.policy = shulti_genl_policy,
+	},
+	{
+		.cmd = SHULTI_C_SET_BR_FDB,
+		.doit = shulti_genl_set_br_fdb,
+		.policy = shulti_genl_policy,
+	},
+};
+
+static void shulti_destroy_swdev(struct work_struct *work)
+{
+	struct shulti_swdev *swdev =
+		container_of(work, struct shulti_swdev, destroy_work);
+
+	rtnl_lock();
+	__shulti_release_swdev(swdev);
+	rtnl_unlock();
+}
+
+static int shulti_rcv_nl_event(struct notifier_block *this,
+			       unsigned long event, void *ptr)
+{
+	struct netlink_notify *n = ptr;
+	struct net *net = n->net;
+	struct shulti_net *pernet = shulti_pernet(net);
+	struct shulti_priv *port;
+	struct net_device *dev;
+
+	if (event == NETLINK_URELEASE &&
+	    n->protocol == NETLINK_GENERIC &&
+	    n->portid == pernet->swdev.ctrl_nl_portid) {
+		/* disable tx */
+		list_for_each_entry_rcu(port, &pernet->swdev.port_list, list) {
+			dev = priv2netdev(port);
+			netif_carrier_off(dev);
+			netif_tx_disable(dev);
+		}
+		/* disable rx */
+		pernet->swdev.swdev_id = 0;
+
+		/* schedule cleaning */
+		pernet->swdev.ctrl_nl_portid = 0;
+		schedule_work(&pernet->swdev.destroy_work);
+	}
+
+	return NOTIFY_DONE;
+}
+
+struct sk_buff *shulti_genl_build_req(struct net_device *dev, int cmd,
+				      size_t attrlen, gfp_t flags)
+{
+	struct shulti_priv *priv = netdev_priv(dev);
+	struct shulti_genl_hdr *shulti_hdr;
+	struct sk_buff *msg = NULL;
+	size_t size;
+
+	if (!priv->swdev_id)
+		goto err;
+
+	size = genlmsg_total_size(sizeof(struct shulti_genl_hdr))
+	       + nla_total_size(sizeof(uint32_t)) /* SHULTI_A_SWDEV_PORTID */
+	       + attrlen; /* attributes */
+
+	msg = genlmsg_new(size, flags);
+	if (!msg)
+		goto err;
+
+	shulti_hdr = genlmsg_put(msg, 0, 0, &shulti_genl_family, 0, cmd);
+	if (!shulti_hdr)
+		goto err;
+
+	shulti_hdr->swdev_id = priv->swdev_id;
+	if (nla_put_u32(msg, SHULTI_A_SWDEV_PORTID, priv->swdev_portid))
+		goto err;
+
+	return msg;
+err:
+	nlmsg_free(msg);
+	return NULL;
+}
+
+int shulti_genl_send_req(struct net_device *dev, struct sk_buff *msg,
+			 bool need_answer, void *answer, size_t answer_len)
+{
+	struct shulti_priv *priv = netdev_priv(dev);
+	struct shulti_genl_hdr *shulti_hdr;
+	int err = -EAGAIN;
+
+	shulti_hdr = genlmsg_data(nlmsg_data(nlmsg_hdr(msg)));
+	genlmsg_end(msg, shulti_hdr);
+
+	if (need_answer) {
+		spin_lock(&priv->genl_req.lock);
+		/* Parallel requests are not supported for now */
+		if (priv->genl_req.wait_answer)
+			goto out_locked;
+		priv->genl_req.wait_answer = true;
+		priv->genl_req.errcode = 0;
+		priv->genl_req.answer = answer;
+		priv->genl_req.answer_len = answer_len;
+		reinit_completion(&priv->genl_req.compl);
+		spin_unlock(&priv->genl_req.lock);
+	}
+
+	err = genlmsg_unicast(priv->swdev_net, msg, priv->genl_req.portid);
+	if (err < 0)
+		goto out_unlocked;
+	msg = NULL;
+
+	if (!need_answer)
+		goto out;
+
+	if (wait_for_completion_interruptible_timeout(&priv->genl_req.compl,
+						      msecs_to_jiffies(50))
+	    <= 0)
+		err = -ETIMEDOUT;
+
+out_unlocked:
+	spin_lock(&priv->genl_req.lock);
+	if (!err)
+		err = priv->genl_req.errcode;
+
+out_locked:
+	if (need_answer) {
+		priv->genl_req.wait_answer = false;
+		priv->genl_req.answer = NULL;
+	}
+	spin_unlock(&priv->genl_req.lock);
+	nlmsg_free(msg);
+out:
+	return err;
+}
+
+static struct notifier_block shulti_nl_notifier = {
+	.notifier_call  = shulti_rcv_nl_event,
+};
+
+/* ndo handlers */
+static int shulti_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct shulti_priv *priv = netdev_priv(dev);
+	struct shulti_net *pernet = shulti_pernet(priv->swdev_net);
+	struct shulti_genl_hdr *shulti_hdr;
+	struct sk_buff *msg = NULL;
+	unsigned int hlen, q;
+	struct nlattr *nla;
+	size_t size;
+
+	if (!pernet->swdev.swdev_id)
+		goto err;
+
+	size = genlmsg_total_size(sizeof(struct shulti_genl_hdr))
+	       + nla_total_size(sizeof(uint32_t)); /* SHULTI_A_SWDEV_PORTID */
+
+	hlen = skb_zerocopy_headlen(skb);
+	size += sizeof(struct nlattr) + hlen;
+
+	msg = genlmsg_new(size, GFP_ATOMIC);
+	if (!msg)
+		goto err;
+
+	shulti_hdr = genlmsg_put(msg, 0, 0, &shulti_genl_family, 0,
+				 SHULTI_C_PACKET);
+	if (!shulti_hdr)
+		goto err;
+
+	shulti_hdr->swdev_id = priv->swdev_id;
+	if (nla_put_u32(msg, SHULTI_A_SWDEV_PORTID, priv->swdev_portid))
+		goto err;
+
+	if (skb_tailroom(msg) < sizeof(struct nlattr) + hlen)
+		goto err;
+
+	nla = (struct nlattr *)skb_put(msg, sizeof(struct nlattr));
+	nla->nla_type = SHULTI_A_PACKET;
+	nla->nla_len = nla_attr_size(skb->len);
+
+	if (skb_zerocopy(msg, skb, skb->len, hlen))
+		goto err;
+
+	genlmsg_end(msg, shulti_hdr);
+	q = skb_get_queue_mapping(skb) % shulti_tx_queues;
+	if (genlmsg_unicast(priv->swdev_net, msg,
+			    pernet->swdev.data_nl_portids[q]) < 0)
+		goto err;
+
+	consume_skb(skb);
+	return NETDEV_TX_OK;
+
+err:
+	dev->stats.tx_dropped++;
+	nlmsg_free(msg);
+	kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+static struct rtnl_link_stats64 *
+shulti_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *tot)
+{
+	struct shulti_priv *priv = netdev_priv(dev);
+
+	netdev_stats_to_stats64(tot, &dev->stats);
+	tot->rx_packets += priv->stats.rx_packets;
+	tot->tx_packets += priv->stats.tx_packets;
+	tot->rx_bytes += priv->stats.rx_bytes;
+	tot->tx_bytes += priv->stats.tx_bytes;
+	tot->rx_errors += priv->stats.rx_errors;
+	tot->tx_errors += priv->stats.tx_errors;
+	tot->multicast += priv->stats.multicast;
+	tot->rx_missed_errors += priv->stats.rx_missed_errors;
+
+	return tot;
+}
+
+static void shulti_set_rx_mode(struct net_device *dev)
+{
+	struct netdev_hw_addr *ha;
+	unsigned char *ethaddr;
+	struct sk_buff *msg;
+	struct nlattr *attr;
+	u32 rx_modes = 0;
+	size_t len;
+
+	len = nla_total_size(sizeof(u32)) +
+	      nla_total_size(netdev_uc_count(dev) * ETH_ALEN) +
+	      nla_total_size(netdev_mc_count(dev) * ETH_ALEN);
+
+	msg = shulti_genl_build_req(dev, SHULTI_C_SET_RX_MODE, len,
+				    GFP_ATOMIC);
+	if (!msg)
+		return;
+
+	rx_modes = dev->flags & (IFF_PROMISC | IFF_ALLMULTI);
+	if (nla_put_u32(msg, SHULTI_A_RX_MODES, rx_modes) < 0)
+		goto err;
+
+	attr = nla_reserve(msg, SHULTI_A_UC_ADDR,
+			   netdev_mc_count(dev) * ETH_ALEN);
+	if (!attr)
+		goto err;
+	ethaddr = (unsigned char *)nla_data(attr);
+	netdev_for_each_uc_addr(ha, dev) {
+		ether_addr_copy(ethaddr, ha->addr);
+		ethaddr += ETH_ALEN;
+	}
+
+	attr = nla_reserve(msg, SHULTI_A_MC_ADDR,
+			   netdev_mc_count(dev) * ETH_ALEN);
+	if (!attr)
+		goto err;
+	ethaddr = (unsigned char *)nla_data(attr);
+	netdev_for_each_mc_addr(ha, dev) {
+		ether_addr_copy(ethaddr, ha->addr);
+		ethaddr += ETH_ALEN;
+	}
+
+	shulti_genl_send_req(dev, msg, false, NULL, 0);
+	return;
+err:
+	nlmsg_free(msg);
+}
+
+static u16 shulti_pick_default_tx_queue(void)
+{
+	unsigned int cpu = smp_processor_id();
+
+	return (u16)(cpu % shulti_tx_queues);
+}
+
+static u16 shulti_select_queue(struct net_device *dev, struct sk_buff *skb,
+			       void *accel_priv,
+			       select_queue_fallback_t fallback)
+{
+	u16 queue_index;
+
+	if (skb_rx_queue_recorded(skb))
+		queue_index = skb_get_rx_queue(skb);
+	else
+		queue_index = shulti_pick_default_tx_queue();
+
+	while (unlikely(queue_index >= dev->real_num_tx_queues))
+		queue_index -= dev->real_num_tx_queues;
+
+	return queue_index;
+}
+
+static int shulti_vlan_set_rx_vid(struct net_device *dev, u8 cmd,
+				  __be16 proto, uint16_t vid)
+{
+	struct shulti_vlan_vid vlan_vid = {
+		.proto = proto,
+		.vid = vid,
+	};
+	size_t len = sizeof(vlan_vid);
+	struct sk_buff *msg;
+
+	msg = shulti_genl_build_req(dev, cmd, nla_total_size(len), GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	if (nla_put(msg, SHULTI_A_VLAN_VID, len, &vlan_vid) < 0) {
+		nlmsg_free(msg);
+		return -EMSGSIZE;
+	}
+
+	return shulti_genl_send_req(dev, msg, true, NULL, 0);
+}
+
+static int shulti_vlan_rx_add_vid(struct net_device *dev, __be16 proto,
+				  uint16_t vid)
+{
+	return shulti_vlan_set_rx_vid(dev, SHULTI_C_VLAN_RX_ADD_VID,
+				      proto, vid);
+}
+
+static int shulti_vlan_rx_del_vid(struct net_device *dev, __be16 proto,
+				  uint16_t vid)
+{
+	return shulti_vlan_set_rx_vid(dev, SHULTI_C_VLAN_RX_DEL_VID,
+				      proto, vid);
+}
+
+static int shulti_get_phys_port_id(struct net_device *dev,
+				   struct netdev_phys_item_id *ppid)
+{
+	struct shulti_priv *priv = netdev_priv(dev);
+
+	ppid->id_len = sizeof(priv->swdev_portid);
+	memcpy(ppid->id, &priv->swdev_portid, ppid->id_len);
+
+	return 0;
+}
+
+static int shulti_get_phys_port_name(struct net_device *dev,
+				     char *buf, size_t len)
+{
+	struct shulti_priv *priv = netdev_priv(dev);
+
+	strncpy(buf, priv->phys_port_name, len);
+	return 0;
+}
+
+static const struct net_device_ops shulti_netdev_ops = {
+	.ndo_start_xmit		= shulti_xmit,
+	.ndo_get_stats64	= shulti_get_stats64,
+	.ndo_set_rx_mode	= shulti_set_rx_mode,
+	.ndo_set_mac_address	= eth_mac_addr,
+	.ndo_select_queue	= shulti_select_queue,
+	.ndo_features_check	= passthru_features_check,
+	.ndo_vlan_rx_add_vid	= shulti_vlan_rx_add_vid,
+	.ndo_vlan_rx_kill_vid	= shulti_vlan_rx_del_vid,
+	.ndo_bridge_getlink	= switchdev_port_bridge_getlink,
+	.ndo_bridge_setlink	= switchdev_port_bridge_setlink,
+	.ndo_bridge_dellink	= switchdev_port_bridge_dellink,
+	.ndo_get_phys_port_id	= shulti_get_phys_port_id,
+	.ndo_get_phys_port_name	= shulti_get_phys_port_name,
+};
+
+static void shulti_get_drvinfo(struct net_device *dev,
+			       struct ethtool_drvinfo *info)
+{
+	struct sk_buff *msg;
+
+	msg = shulti_genl_build_req(dev, SHULTI_C_DRVINFO, 0, GFP_KERNEL);
+	if (!msg)
+		goto fallback;
+
+	if (shulti_genl_send_req(dev, msg, true, info, sizeof(*info)) < 0)
+		goto fallback;
+	return;
+
+fallback:
+	strlcpy(info->driver, SHULTI_NAME, sizeof(info->driver));
+	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
+}
+
+static int shulti_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	struct sk_buff *msg;
+
+	msg = shulti_genl_build_req(dev, SHULTI_C_GET_SETTINGS, 0,
+				    GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	return shulti_genl_send_req(dev, msg, true, cmd, sizeof(*cmd));
+}
+
+static int shulti_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	size_t len = sizeof(struct ethtool_cmd);
+	struct sk_buff *msg;
+
+	msg = shulti_genl_build_req(dev, SHULTI_C_SET_SETTINGS,
+				    nla_total_size(len), GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	if (nla_put(msg, SHULTI_A_SETTINGS, len, cmd) < 0) {
+		nlmsg_free(msg);
+		return -EMSGSIZE;
+	}
+
+	return shulti_genl_send_req(dev, msg, true, NULL, 0);
+}
+
+static struct ethtool_ops shulti_ethtool_ops = {
+	.get_drvinfo      = shulti_get_drvinfo,
+	.get_settings     = shulti_get_settings,
+	.set_settings     = shulti_set_settings,
+};
+
+/* For udev */
+static struct device_type shulti_type = {
+	.name = "shulti",
+};
+
+static void shulti_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+
+	SET_NETDEV_DEVTYPE(dev, &shulti_type);
+
+	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+
+	dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST;
+	dev->features = dev->hw_features | NETIF_F_LLTX;
+	dev->vlan_features = dev->features;
+
+	dev->netdev_ops = &shulti_netdev_ops;
+	dev->ethtool_ops = &shulti_ethtool_ops;
+	dev->destructor = free_netdev;
+}
+
+/* rtnl stuff */
+static const struct nla_policy shulti_policy[IFLA_SHULTI_MAX + 1] = {
+	[IFLA_SHULTI_SWDEV_ID]		= { .type = NLA_U32 },
+	[IFLA_SHULTI_SWDEV_PORTID]	= { .type = NLA_U32 },
+	[IFLA_SHULTI_PHYS_PORT_NAME]	= { .type = NLA_STRING,
+					    .len = IFNAMSIZ },
+};
+
+static int shulti_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+	if (!data[IFLA_SHULTI_SWDEV_ID] ||
+	    !data[IFLA_SHULTI_SWDEV_PORTID])
+		return -EINVAL;
+
+	if (tb[IFLA_ADDRESS]) {
+		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
+			return -EINVAL;
+		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
+			return -EADDRNOTAVAIL;
+	}
+
+	return 0;
+}
+
+static int shulti_newlink(struct net *src_net, struct net_device *dev,
+			  struct nlattr *tb[], struct nlattr *data[])
+{
+	struct shulti_net *pernet = shulti_pernet(src_net);
+	struct shulti_priv *priv = netdev_priv(dev);
+	u32 swdev_id, swdev_portid;
+	int err;
+
+	ASSERT_RTNL();
+
+	swdev_id = nla_get_u32(data[IFLA_SHULTI_SWDEV_ID]);
+	swdev_portid = nla_get_u32(data[IFLA_SHULTI_SWDEV_PORTID]);
+
+	if (pernet->swdev.swdev_id != swdev_id)
+		return -EINVAL;
+	if (__shulti_lookup_port(&pernet->swdev, swdev_portid))
+		return -EEXIST;
+	priv->swdev_id = swdev_id;
+	priv->swdev_portid = swdev_portid;
+	priv->swdev_net = src_net;
+	if (data[IFLA_SHULTI_PHYS_PORT_NAME])
+		strcpy(priv->phys_port_name,
+		       nla_data(data[IFLA_SHULTI_PHYS_PORT_NAME]));
+	else
+		memset(priv->phys_port_name, 0, sizeof(priv->phys_port_name));
+	list_add_rcu(&priv->list, &pernet->swdev.port_list);
+	dev_hold(dev);
+	priv->genl_req.portid = pernet->swdev.ctrl_nl_portid;
+	init_completion(&priv->genl_req.compl);
+	priv->genl_req.answer = NULL;
+	spin_lock_init(&priv->genl_req.lock);
+
+	if (!tb[IFLA_ADDRESS])
+		eth_hw_addr_random(dev);
+
+	if (tb[IFLA_IFNAME])
+		nla_strlcpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
+	else
+		snprintf(dev->name, IFNAMSIZ, SHULTI_NAME "%%d");
+
+	shulti_switchdev_init(dev);
+	err = register_netdevice(dev);
+	if (err < 0)
+		goto err;
+
+	netif_carrier_off(dev);
+	return 0;
+
+err:
+	__shulti_release_swdev_port(priv);
+	return err;
+}
+
+static void shulti_dellink(struct net_device *dev, struct list_head *head)
+{
+	struct shulti_priv *priv = netdev_priv(dev);
+
+	ASSERT_RTNL();
+
+	__shulti_release_swdev_port(priv);
+	unregister_netdevice_queue(dev, head);
+}
+
+static int shulti_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	struct shulti_priv *priv = netdev_priv(dev);
+
+	if (nla_put_u32(skb, IFLA_SHULTI_SWDEV_ID, priv->swdev_id) ||
+	    nla_put_u32(skb, IFLA_SHULTI_SWDEV_PORTID, priv->swdev_portid))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static unsigned int shulti_get_num_rx_queues(void)
+{
+	return shulti_rx_queues;
+}
+
+static unsigned int shulti_get_num_tx_queues(void)
+{
+	return shulti_tx_queues;
+}
+
+static struct rtnl_link_ops shulti_link_ops = {
+	.kind			= SHULTI_NAME,
+	.priv_size		= sizeof(struct shulti_priv),
+	.setup			= shulti_setup,
+	.maxtype		= IFLA_SHULTI_MAX,
+	.policy			= shulti_policy,
+	.validate		= shulti_validate,
+	.newlink		= shulti_newlink,
+	.dellink		= shulti_dellink,
+	.fill_info		= shulti_fill_info,
+	.get_num_tx_queues	= shulti_get_num_tx_queues,
+	.get_num_rx_queues	= shulti_get_num_rx_queues,
+};
+
+/* netns init/exit */
+static int __net_init shulti_net_init(struct net *net)
+{
+	struct shulti_net *pernet = shulti_pernet(net);
+
+	memset(&pernet->swdev, 0, sizeof(pernet->swdev));
+	INIT_LIST_HEAD(&pernet->swdev.port_list);
+	INIT_WORK(&pernet->swdev.destroy_work, shulti_destroy_swdev);
+	return 0;
+}
+
+static void __net_exit shulti_net_exit(struct net *net)
+{
+	struct shulti_net *pernet = shulti_pernet(net);
+
+	/* Nothing to do, if we hit this function, all sockets are already
+	 * closed and thus all associated netdevs are already unregistered.
+	 */
+	WARN_ON(pernet->swdev.ctrl_nl_portid);
+}
+
+static struct pernet_operations shulti_net_ops = {
+	.init = shulti_net_init,
+	.exit = shulti_net_exit,
+	.id   = &shulti_net_id,
+	.size = sizeof(struct shulti_net),
+};
+
+/* module init/exit */
+static int __init shulti_module_init(void)
+{
+	int err;
+
+	err = rtnl_link_register(&shulti_link_ops);
+	if (err < 0)
+		goto err;
+
+	err = genl_register_family_with_ops_groups(&shulti_genl_family,
+						   shulti_genl_ops,
+						   shulti_mcgrp);
+	if (err < 0)
+		goto err_rtnl_unreg;
+
+	err = netlink_register_notifier(&shulti_nl_notifier);
+	if (err < 0)
+		goto err_genl_unreg;
+
+	err = register_pernet_device(&shulti_net_ops);
+
+	return 0;
+
+err_genl_unreg:
+	genl_unregister_family(&shulti_genl_family);
+err_rtnl_unreg:
+	rtnl_link_unregister(&shulti_link_ops);
+err:
+	return err;
+}
+
+static void __exit shulti_module_exit(void)
+{
+	netlink_unregister_notifier(&shulti_nl_notifier);
+	genl_unregister_family(&shulti_genl_family);
+	rtnl_link_unregister(&shulti_link_ops);
+}
+
+module_init(shulti_module_init);
+module_exit(shulti_module_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Nicolas Dichtel <nicolas.dichtel@6wind.com>");
+MODULE_DESCRIPTION("6WIND SHULTI driver");
+MODULE_ALIAS_RTNL_LINK(SHULTI_NAME);
diff --git a/drivers/net/ethernet/6wind/shulti/shulti_private.h b/drivers/net/ethernet/6wind/shulti/shulti_private.h
new file mode 100644
index 000000000000..40c0458a2483
--- /dev/null
+++ b/drivers/net/ethernet/6wind/shulti/shulti_private.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2016 6WIND S.A.
+ * Copyright (c) 2016 Nicolas Dichtel <nicolas.dichtel@6wind.com>
+ *
+ */
+
+#ifndef _SHULTI_PRIVATE_H_
+#define _SHULTI_PRIVATE_H_
+
+#include <linux/types.h>
+#include <linux/spinlock_types.h>
+#include <linux/netdevice.h>
+#include <linux/completion.h>
+#include <net/net_namespace.h>
+#include <net/genetlink.h>
+
+struct shulti_priv {
+	u32				swdev_id;
+	u32				swdev_portid;
+	char				phys_port_name[IFNAMSIZ];
+	struct net			*swdev_net;
+	struct list_head		list;
+	struct rcu_head			rcu;
+	struct rtnl_link_stats64	stats;
+	struct {
+		u32			portid;
+		struct completion	compl;
+		bool			wait_answer;
+		int			errcode;
+		void			*answer;
+		size_t			answer_len;
+		/* protect access to genl_req fields */
+		spinlock_t		lock;
+	} genl_req;
+	struct {
+		u32			br_flags;
+		u8			stp_state;
+	} bridge;
+};
+
+struct shulti_swdev {
+	u32			swdev_id;
+	u32			ctrl_nl_portid;
+	u32			*data_nl_portids;
+	u32			data_nl_portids_cnt;
+	struct list_head	port_list;
+	struct work_struct	destroy_work;
+};
+
+struct shulti_net {
+	struct shulti_swdev	swdev;
+};
+
+/* from shulti_core.c */
+struct shulti_net *shulti_pernet(struct net *net);
+struct shulti_priv *__shulti_lookup_port(struct shulti_swdev *swdev,
+					 u32 swdev_portid);
+struct sk_buff *shulti_genl_build_req(struct net_device *dev, int cmd,
+				      size_t attrlen, gfp_t flags);
+int shulti_genl_send_req(struct net_device *dev, struct sk_buff *msg,
+			 bool need_answer, void *answer, size_t answer_len);
+
+/* from shulti_swdev.c */
+void shulti_switchdev_init(struct net_device *dev);
+void shulti_switchdev_fini(struct net_device *dev);
+int shulti_genl_set_br_fdb(struct sk_buff *msg, struct genl_info *info);
+
+#endif /* _SHULTI_PRIVATE_H_ */
diff --git a/drivers/net/ethernet/6wind/shulti/shulti_swdev.c b/drivers/net/ethernet/6wind/shulti/shulti_swdev.c
new file mode 100644
index 000000000000..06dc26ad060f
--- /dev/null
+++ b/drivers/net/ethernet/6wind/shulti/shulti_swdev.c
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2016 6WIND S.A.
+ * Copyright (c) 2016 Nicolas Dichtel <nicolas.dichtel@6wind.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+#include <linux/skbuff.h>
+#include <linux/if_bridge.h>
+#include <linux/shulti.h>
+#include <net/switchdev.h>
+
+#include "shulti_private.h"
+
+int shulti_genl_set_br_fdb(struct sk_buff *msg, struct genl_info *info)
+{
+	struct shulti_genl_hdr *shulti_hdr = info->userhdr;
+	struct net *net = genl_info_net(info);
+	struct shulti_net *pernet = shulti_pernet(net);
+	struct switchdev_notifier_fdb_info swdev_fdb;
+	struct shulti_br_fdb *shulti_fdb;
+	struct shulti_priv *port;
+	u32 swdev_portid;
+	int err = -ENOENT;
+
+	if (!info->attrs[SHULTI_A_SWDEV_PORTID] ||
+	    !info->attrs[SHULTI_A_BR_FDB] ||
+	    shulti_hdr->swdev_id != pernet->swdev.swdev_id)
+		return -EINVAL;
+
+	swdev_portid = nla_get_u32(info->attrs[SHULTI_A_SWDEV_PORTID]);
+	rtnl_lock();
+	port = __shulti_lookup_port(&pernet->swdev, swdev_portid);
+	if (!port)
+		goto out_unlock;
+
+	shulti_fdb = nla_data(info->attrs[SHULTI_A_BR_FDB]);
+	swdev_fdb.addr = shulti_fdb->addr;
+	swdev_fdb.vid = shulti_fdb->vid;
+	if (shulti_fdb->add)
+		call_switchdev_notifiers(SWITCHDEV_FDB_ADD,
+					 priv2netdev(port), &swdev_fdb.info);
+	else
+		call_switchdev_notifiers(SWITCHDEV_FDB_DEL,
+					 priv2netdev(port), &swdev_fdb.info);
+out_unlock:
+	rtnl_unlock();
+	return err;
+}
+
+static int shulti_port_attr_get(struct net_device *dev,
+				struct switchdev_attr *attr)
+{
+	struct shulti_priv *priv = netdev_priv(dev);
+
+	switch (attr->id) {
+	case SWITCHDEV_ATTR_ID_PORT_PARENT_ID:
+		attr->u.ppid.id_len = sizeof(priv->swdev_id);
+		memcpy(&attr->u.ppid.id, &priv->swdev_id, attr->u.ppid.id_len);
+		break;
+	case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS:
+		attr->u.brport_flags = priv->bridge.br_flags;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static void __shulti_destructor(void const *data)
+{
+	struct sk_buff *msg = (struct sk_buff *)data;
+
+	nlmsg_free(msg);
+}
+
+static int __shulti_port_attr_set(struct net_device *dev,
+				  struct switchdev_trans *trans,
+				  u8 cmd, int attrtype, size_t attrlen,
+				  void *attrdata, gfp_t flags)
+{
+	struct sk_buff *msg;
+
+	if (switchdev_trans_ph_prepare(trans)) {
+		struct switchdev_trans_item *elem;
+		size_t len;
+
+		elem = kzalloc(sizeof(*elem), flags);
+		if (!elem)
+			return -ENOMEM;
+
+		len = nla_total_size(attrlen);
+		msg = shulti_genl_build_req(dev, cmd, len, flags);
+		if (!msg) {
+			kfree(elem);
+			return -ENOMEM;
+		}
+		if (nla_put(msg, attrtype, attrlen, attrdata) < 0) {
+			nlmsg_free(msg);
+			kfree(elem);
+			return -EMSGSIZE;
+		}
+		switchdev_trans_item_enqueue(trans, msg, __shulti_destructor,
+					     elem);
+		return 0;
+	}
+
+	msg = switchdev_trans_item_dequeue(trans);
+	return shulti_genl_send_req(dev, msg, false, NULL, 0);
+}
+
+static int shulti_port_attr_stp_state_set(struct net_device *dev,
+					  struct switchdev_trans *trans,
+					  u8 state)
+{
+	struct shulti_priv *priv = netdev_priv(dev);
+
+	if (!switchdev_trans_ph_prepare(trans))
+		priv->bridge.stp_state = state;
+
+	return __shulti_port_attr_set(dev, trans, SHULTI_C_SET_BRIDGE,
+				      SHULTI_A_STP_STATE, sizeof(state),
+				      &state, GFP_KERNEL);
+}
+
+static int shulti_port_attr_br_flags_set(struct net_device *dev,
+					 struct switchdev_trans *trans,
+					 unsigned long flags)
+{
+	struct shulti_priv *priv = netdev_priv(dev);
+	u8 learning = !!(flags & BR_LEARNING);
+
+	if (!((priv->bridge.br_flags ^ flags) & BR_LEARNING))
+		return 0;
+
+	if (!switchdev_trans_ph_prepare(trans))
+		priv->bridge.br_flags = flags;
+
+	return __shulti_port_attr_set(dev, trans, SHULTI_C_SET_BRIDGE,
+				      SHULTI_A_BR_STATE_LEARNING,
+				      sizeof(learning), &learning, GFP_KERNEL);
+}
+
+static int shulti_port_attr_br_ageing_time_set(struct net_device *dev,
+					       struct switchdev_trans *trans,
+					       u32 ageing_time)
+{
+	return __shulti_port_attr_set(dev, trans, SHULTI_C_SET_BRIDGE,
+				      SHULTI_A_BR_AGEING_TIME,
+				      sizeof(ageing_time), &ageing_time,
+				      GFP_KERNEL);
+}
+
+static int shulti_port_attr_set(struct net_device *dev,
+				const struct switchdev_attr *attr,
+				struct switchdev_trans *trans)
+{
+	int err = 0;
+
+	switch (attr->id) {
+	case SWITCHDEV_ATTR_ID_PORT_STP_STATE:
+		err = shulti_port_attr_stp_state_set(dev, trans,
+						     attr->u.stp_state);
+		break;
+	case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS:
+		err = shulti_port_attr_br_flags_set(dev, trans,
+						    attr->u.brport_flags);
+		break;
+	case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME:
+		err = shulti_port_attr_br_ageing_time_set(dev, trans,
+							  attr->u.ageing_time);
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+	return err;
+}
+
+static const struct switchdev_ops shulti_switchdev_ops = {
+	.switchdev_port_attr_get        = shulti_port_attr_get,
+	.switchdev_port_attr_set        = shulti_port_attr_set,
+};
+
+void shulti_switchdev_init(struct net_device *dev)
+{
+	struct shulti_priv *priv = netdev_priv(dev);
+
+	dev->switchdev_ops = &shulti_switchdev_ops;
+	priv->bridge.br_flags = BR_LEARNING_SYNC;
+}
+
+void shulti_switchdev_fini(struct net_device *dev)
+{
+}
diff --git a/drivers/net/ethernet/Kconfig b/drivers/net/ethernet/Kconfig
index 2ffd63463299..5a231b10134a 100644
--- a/drivers/net/ethernet/Kconfig
+++ b/drivers/net/ethernet/Kconfig
@@ -18,6 +18,7 @@ config SUNGEM_PHY
 	tristate
 
 source "drivers/net/ethernet/3com/Kconfig"
+source "drivers/net/ethernet/6wind/Kconfig"
 source "drivers/net/ethernet/adaptec/Kconfig"
 source "drivers/net/ethernet/aeroflex/Kconfig"
 source "drivers/net/ethernet/agere/Kconfig"
diff --git a/drivers/net/ethernet/Makefile b/drivers/net/ethernet/Makefile
index 1d349e9aa9a6..c4800da261b0 100644
--- a/drivers/net/ethernet/Makefile
+++ b/drivers/net/ethernet/Makefile
@@ -3,6 +3,7 @@
 #
 
 obj-$(CONFIG_NET_VENDOR_3COM) += 3com/
+obj-$(CONFIG_NET_VENDOR_6WIND) += 6wind/
 obj-$(CONFIG_NET_VENDOR_8390) += 8390/
 obj-$(CONFIG_NET_VENDOR_ADAPTEC) += adaptec/
 obj-$(CONFIG_GRETH) += aeroflex/
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 18d8394f2e5d..fdec31c15490 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2012,6 +2012,10 @@ static inline void *netdev_priv(const struct net_device *dev)
 	return (char *)dev + ALIGN(sizeof(struct net_device), NETDEV_ALIGN);
 }
 
+#define priv2netdev(priv) \
+	((struct net_device *)((char *)(priv) - \
+			      ALIGN(sizeof(struct net_device), NETDEV_ALIGN)))
+
 /* Set the sysfs physical device reference for the network logical device
  * if set prior to registration will cause a symlink during initialization.
  */
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 813ffb2e22c9..cba49d1d5684 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -363,6 +363,7 @@ header-y += rtc.h
 header-y += rtnetlink.h
 header-y += scc.h
 header-y += sched.h
+header-y += shulti.h
 header-y += scif_ioctl.h
 header-y += screen_info.h
 header-y += sctp.h
diff --git a/include/uapi/linux/shulti.h b/include/uapi/linux/shulti.h
new file mode 100644
index 000000000000..3877d7e2f760
--- /dev/null
+++ b/include/uapi/linux/shulti.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2016 6WIND S.A.
+ * Copyright (c) 2016 Nicolas Dichtel <nicolas.dichtel@6wind.com>
+ *
+ */
+
+#ifndef _UAPI_SHULTI_H_
+#define _UAPI_SHULTI_H_
+
+#include <linux/types.h>
+#include <linux/if_ether.h>
+
+/* rtnl linkinfo attributes */
+enum {
+	IFLA_SHULTI_UNSPEC,
+	IFLA_SHULTI_SWDEV_ID,
+	IFLA_SHULTI_SWDEV_PORTID,
+	IFLA_SHULTI_PHYS_PORT_NAME,
+
+	__IFLA_SHULTI_MAX
+#define IFLA_SHULTI_MAX	(__IFLA_SHULTI_MAX - 1)
+};
+
+/* genl stuff */
+#define SHULTI_NAME  "shulti"
+
+struct shulti_genl_hdr {
+	__u32	swdev_id;
+};
+
+enum {
+	SHULTI_C_UNSPEC = 0,
+	SHULTI_C_BIND,
+	SHULTI_C_UNBIND,
+	SHULTI_C_PORT_INFO,
+	SHULTI_C_LINK_STATUS,
+	SHULTI_C_PACKET,
+	SHULTI_C_UPDATE_STATS,
+	SHULTI_C_VLAN_RX_ADD_VID,
+	SHULTI_C_VLAN_RX_DEL_VID,
+	SHULTI_C_DRVINFO,
+	SHULTI_C_GET_SETTINGS,
+	SHULTI_C_SET_SETTINGS,
+	SHULTI_C_SET_RX_MODE,
+	SHULTI_C_SET_BRIDGE,
+	SHULTI_C_SET_BR_FDB,
+
+	__SHULTI_C_MAX,
+#define SHULTI_C_MAX (__SHULTI_C_MAX - 1)
+};
+
+enum {
+	SHULTI_A_UNSPEC,
+	SHULTI_A_NL_PORTIDS,
+	SHULTI_A_SWDEV_PORTID,
+	SHULTI_A_RX_QUEUES,
+	SHULTI_A_ERRCODE,
+	SHULTI_A_IFINDEX,
+	SHULTI_A_LINK_STATUS,
+	SHULTI_A_PACKET,
+	SHULTI_A_STATS,
+	SHULTI_A_VLAN_VID,
+	SHULTI_A_DRVINFO,
+	SHULTI_A_SETTINGS,
+	SHULTI_A_RX_MODES,
+	SHULTI_A_UC_ADDR,
+	SHULTI_A_MC_ADDR,
+	SHULTI_A_STP_STATE,
+	SHULTI_A_BR_STATE_LEARNING,
+	SHULTI_A_BR_AGEING_TIME,
+	SHULTI_A_BR_FDB,
+
+	__SHULTI_A_MAX
+#define SHULTI_A_MAX	(__SHULTI_A_MAX - 1)
+};
+
+struct shulti_vlan_vid {
+	__be16 proto;
+	__u16 vid;
+};
+
+struct shulti_br_fdb {
+	__u8 add;
+	__u8 addr[ETH_ALEN];
+	__u16 vid;
+};
+#endif /* _UAPI_SHULTI_H_ */
-- 
2.4.2

^ permalink raw reply related

* ixgbe: cannot enable LRO
From: Otto Sabart @ 2016-04-27  9:36 UTC (permalink / raw)
  To: netdev; +Cc: Jirka Hladky, Adam Okuliar

[-- Attachment #1: Type: text/plain, Size: 265 bytes --]


Hello everyone,
does anybody have a problem with LRO on ixge (on latest 4.6-rc5)?
I cannot find a way to enable it.

On stable RHEL7.2 kernel everything works fine.

I opened a bug report [0].

[0] https://bugzilla.kernel.org/show_bug.cgi?id=117291


Thanks!

Ota

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 870 bytes --]

^ permalink raw reply

* [PATCH net v3 2/2] gre: build header correctly for collect metadata tunnels
From: Jiri Benc @ 2016-04-27  9:29 UTC (permalink / raw)
  To: netdev; +Cc: Pravin B Shelar, Thomas Graf, Simon Horman
In-Reply-To: <cover.1461747398.git.jbenc@redhat.com>

In ipgre (i.e. not gretap) + collect metadata mode, the skb was assumed to
contain Ethernet header and was encapsulated as ETH_P_TEB. This is not the
case, the interface is ARPHRD_IPGRE and the protocol to be used for
encapsulation is skb->protocol.

Fixes: 2e15ea390e6f4 ("ip_gre: Add support to collect tunnel metadata.")
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
---
v3: unchanged
v2: unchanged
---
 net/ipv4/ip_gre.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index d0abde4236af..f973e0a58993 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -523,7 +523,8 @@ static struct rtable *gre_get_rt(struct sk_buff *skb,
 	return ip_route_output_key(net, fl);
 }
 
-static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
+static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
+			__be16 proto)
 {
 	struct ip_tunnel_info *tun_info;
 	const struct ip_tunnel_key *key;
@@ -575,7 +576,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 
 	flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
-	build_header(skb, tunnel_hlen, flags, htons(ETH_P_TEB),
+	build_header(skb, tunnel_hlen, flags, proto,
 		     tunnel_id_to_key(tun_info->key.tun_id), 0);
 
 	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ?  htons(IP_DF) : 0;
@@ -616,7 +617,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
 	const struct iphdr *tnl_params;
 
 	if (tunnel->collect_md) {
-		gre_fb_xmit(skb, dev);
+		gre_fb_xmit(skb, dev, skb->protocol);
 		return NETDEV_TX_OK;
 	}
 
@@ -660,7 +661,7 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 
 	if (tunnel->collect_md) {
-		gre_fb_xmit(skb, dev);
+		gre_fb_xmit(skb, dev, htons(ETH_P_TEB));
 		return NETDEV_TX_OK;
 	}
 
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net v3 1/2] gre: do not assign header_ops in collect metadata mode
From: Jiri Benc @ 2016-04-27  9:29 UTC (permalink / raw)
  To: netdev; +Cc: Pravin B Shelar, Thomas Graf, Simon Horman
In-Reply-To: <cover.1461747398.git.jbenc@redhat.com>

In ipgre mode (i.e. not gretap) with collect metadata flag set, the tunnel
is incorrectly assumed to be mGRE in NBMA mode (see commit 6a5f44d7a048c).
This is not the case, we're controlling the encapsulation addresses by
lwtunnel metadata. And anyway, assigning dev->header_ops in collect metadata
mode does not make sense.

Although it would be more user firendly to reject requests that specify
both the collect metadata flag and a remote/local IP address, this would
break current users of gretap or introduce ugly code and differences in
handling ipgre and gretap configuration. Keep the current behavior of
remote/local IP address being ignored in such case.

v3: Back to v1, added explanation paragraph.
v2: Reject configuration specifying both remote/local address and collect
    metadata flag.

Fixes: 2e15ea390e6f4 ("ip_gre: Add support to collect tunnel metadata.")
Signed-off-by: Jiri Benc <jbenc@redhat.com>
---
 net/ipv4/ip_gre.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index af5d1f38217f..d0abde4236af 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -893,7 +893,7 @@ static int ipgre_tunnel_init(struct net_device *dev)
 	netif_keep_dst(dev);
 	dev->addr_len		= 4;
 
-	if (iph->daddr) {
+	if (iph->daddr && !tunnel->collect_md) {
 #ifdef CONFIG_NET_IPGRE_BROADCAST
 		if (ipv4_is_multicast(iph->daddr)) {
 			if (!iph->saddr)
@@ -902,8 +902,9 @@ static int ipgre_tunnel_init(struct net_device *dev)
 			dev->header_ops = &ipgre_header_ops;
 		}
 #endif
-	} else
+	} else if (!tunnel->collect_md) {
 		dev->header_ops = &ipgre_header_ops;
+	}
 
 	return ip_tunnel_init(dev);
 }
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net v3 0/2] gre: fix lwtunnel support
From: Jiri Benc @ 2016-04-27  9:29 UTC (permalink / raw)
  To: netdev; +Cc: Pravin B Shelar, Thomas Graf, Simon Horman

This patchset fixes a few bugs in ipgre metadata mode implementation.

As an example, in this setup:

ip a a 192.168.1.1/24 dev eth0
ip l a gre1 type gre external
ip l s gre1 up
ip a a 192.168.99.1/24 dev gre1
ip r a 192.168.99.2/32 encap ip dst 192.168.1.2 ttl 10 dev gre1
ping 192.168.99.2

the traffic does not go through before this patchset and does as expected
with it applied.

v3: Back to v1 in order not to break existing users. Dropped patch 3, will
    be fixed in iproute2 instead.
v2: Rejecting invalid configuration, added patch 3, dropped patch for
    ETH_P_TEB (will target net-next).

Jiri Benc (2):
  gre: do not assign header_ops in collect metadata mode
  gre: build header correctly for collect metadata tunnels

 net/ipv4/ip_gre.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

-- 
1.8.3.1

^ permalink raw reply

* Re: [PATCH 1/6] bus: Add shared MDIO bus framework
From: Arnd Bergmann @ 2016-04-27  9:28 UTC (permalink / raw)
  To: Anup Patel
  Cc: Andrew Lunn, Florian Fainelli, Pramod Kumar, Rob Herring,
	Catalin Marinas, Will Deacon, Masahiro Yamada, Chen-Yu Tsai,
	Mark Rutland, Device Tree, Pawel Moll, Suzuki K Poulose,
	netdev-u79uwXL29TY76Z2rM5mHXA, Punit Agrawal, Linux Kernel,
	BCM Kernel Feedback, Linux ARM Kernel, Kishon Vijay Abraham I
In-Reply-To: <CAALAos_bPnT96xA883mkkjwUTut=_NJymUZKM=-0ZujdDxNEkA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On Wednesday 27 April 2016 10:16:40 Anup Patel wrote:

> It is really interesting to see the evolution of MDIO bus:
> 
> 1. Traditionally, MDIO controller used to be part of each ethernet controller
> itself so each ethernet controller used to have it's own 2 wire MDIO bus
> 
> 2. Next, we saw SoC with multiple ethernet controllers sharing same MDIO
> bus. In other words, we saw multiple MDIO bus being muxed over single
> MDIO bus with additional bus select lines (I think this is when
> drivers/net/phy/mdio-mux.c APIs were implemented but at this point all
> PHYs on muxed MDIO bus were ethernet PHYs).
> 
> 3. Then, we saw SoC with ethernet switch devices also accessible over
> shared MDIO bus (or Muxed MDIO bus) along with ethernet PHYs (I guess
> this is why we have drivers/net/phy/mdio_device.c which adds
> "mdio_device" for non-ethernet-PHY devices on MDIO bus).
> 
> 4. Now, we have SoC with SATA PHYs, PCIe PHYs, USB PHYs, and Ethernet
> PHYs all accessible over same shared MDIO bus (or Muxed MDIO bus). The
> SATA PHYs and PCIe PHYs are registered to "Generic PHY framework". For
> USB PHYs, we can either register to "Generic PHY framework" or "USB PHY
> framework". For Ethernet PHYs, we register MDIO bus instance to "Ethernet
> MDIO framework".

Thanks for the extra background information.

> The devices on ARM64 SoC has to be within first 4GB and RAM has to start
> from first 4GB to be ARM compliant because ARM64 CPUs have 32bit mode and
> all devices and RAM should be available to 32bit mode with MMU disabled.
> This means that we only have 4GB to fit all devices registers and some
> portion of RAM. Some of these non-Ethernet PHYs have tons of registers so
> as number of PHYs increase in a SoC they will eat-up lot of first 4GB
> address space. Using same MDIO bus for all types of PHYs (SATA, PCIe, USB,
> and Ethernet) is actually a good approach because it actually saves lot of
> first 4GB address space. In future, more devices will be moved to a shared
> MDIO bus which are less frequently accessed.

I think it remains to be seen if anyone other than Broadcom follows
this model. I have no idea if that's likely or not, perhaps everyone
does this, perhaps you are the only ones.

> For Broadcom iProc SoCs, the design choice has already been made to use
> shared MDIO bus for all PHYs. In fact, Broadcom iProc NS2 SoC already has
> a shared MDIO bus for SATA PHYs, USB PHYs, PCIe PHYs, and Ethernet
> PHYs and more Broadcom iProc SoCs are on their way. Of course, there are
> few exceptions in iProc SoCs such as SATA PHYs where we also have memory
> mapped registers to access PHYs but other PHYs don't have such memory
> mapped registers.
> 
> Clearly from above, the traditional 2 wire MDIO bus is now a shared MDIO
> bus with 2-wire plus additional select lines. Also, now we have SoCs (such
> as Broadcom iProc SoCs) which has such shared MDIO bus and I think
> in-future we will have SoCs with a shared MDIO bus for variety of devices.
> 
> For long term, we really need a clean solution to fit shared MDIO based
> PHY drivers in Linux kernel. Also, shared MDIO based PHY drivers should
> not be dependent on any particular IO subsystem (such as Linux Ethernet)
> because there are lot of use-cases where people want strip down kernel
> image by not-compiling IO subsystems which are not required.
> 
> IMHO, we have several options in front of us:
> 1. Use some light-weight framework (such as shared_mdio.c implemented
> by this patchset) under drivers/bus

I think this has been sufficiently NAKed by everyone

> 2. Extend "Generic PHY framework" to allow something like shared MDIO
> bus (as-per Arnd's suggestion)
> 3. Move-out "MDIO-mux APIs" from drivers/net/phy to something like
> drivers/mdio-mux and make it independent of "Linux Ethernet subsystem".
> (... may be more options ...)

while these two really describe the same thing. I think as a first
step, we can reorganize the Kconfig structure to put ethernet PHY
and the MDIO bus as two separate submenus in drivers/phy/Kconfig,
and make the latter independent of CONFIG_NETDEVICES, see patch
below. With that, you should already be able to write a generic
phy driver that registers itself as an MDIO device driver.

We can also debate moving files from drivers/net/phy and
drivers/usb/phy into drivers/phy/{net,mdio,usb}/ as a follow-up,
but the file location is really not all that important here.

	Arnd


diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index befd67df08e1..c58b60e70ab2 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -356,8 +356,6 @@ config NET_SB1000
 
 	  If you don't have this card, of course say N.
 
-source "drivers/net/phy/Kconfig"
-
 source "drivers/net/plip/Kconfig"
 
 source "drivers/net/ppp/Kconfig"
diff --git a/drivers/net/ethernet/Kconfig b/drivers/net/ethernet/Kconfig
index 2ffd63463299..2e2491b344d9 100644
--- a/drivers/net/ethernet/Kconfig
+++ b/drivers/net/ethernet/Kconfig
@@ -11,9 +11,6 @@ menuconfig ETHERNET
 
 if ETHERNET
 
-config MDIO
-	tristate
-
 config SUNGEM_PHY
 	tristate
 
diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index 6dad9a9c356c..58447866fe64 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -3,8 +3,9 @@
 #
 
 menuconfig PHYLIB
-	tristate "PHY Device support and infrastructure"
+	tristate "Ethernet PHY Device support and infrastructure"
 	depends on NETDEVICES
+	select MDIO
 	help
 	  Ethernet controllers are usually attached to PHY
 	  devices.  This option provides infrastructure for
@@ -164,6 +165,16 @@ config FIXED_PHY
 	  PHYs that are not connected to the real MDIO bus.
 
 	  Currently tested with mpc866ads and mpc8349e-mitx.
+endif # PHYLIB
+
+config MDIO
+	tristate
+	help
+	  The MDIO bus is typically used ethernet PHYs, but can also be
+	  used by other PHY drivers.
+
+menu "MDIO bus drivers"
+	depends on MDIO
 
 config MDIO_BITBANG
 	tristate "Support for bitbanged MDIO buses"
@@ -271,7 +282,7 @@ config MDIO_BCM_IPROC
 	  This module provides a driver for the MDIO busses found in the
 	  Broadcom iProc SoC's.
 
-endif # PHYLIB
+endmenu # MDIO
 
 config MICREL_KS8995MA
 	tristate "Micrel KS8995MA 5-ports 10/100 managed Ethernet switch"
diff --git a/drivers/phy/Kconfig b/drivers/phy/Kconfig
index ce29c17359e9..298893744a17 100644
--- a/drivers/phy/Kconfig
+++ b/drivers/phy/Kconfig
@@ -2,7 +2,9 @@
 # PHY
 #
 
-menu "PHY Subsystem"
+menu "PHY drivers"
+
+menu "Generic PHY subsystem"
 
 config GENERIC_PHY
 	bool "PHY Core"
@@ -425,3 +427,7 @@ config PHY_CYGNUS_PCIE
 source "drivers/phy/tegra/Kconfig"
 
 endmenu
+
+source "drivers/net/phy/Kconfig"
+
+endmenu

--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* Re: [net-next PATCH V3 3/5] samples/bpf: add a README file to get users started
From: Jesper Dangaard Brouer @ 2016-04-27  9:16 UTC (permalink / raw)
  To: Naveen N. Rao
  Cc: netdev, linux-kbuild, bblanco, borkmann, alexei.starovoitov,
	brouer
In-Reply-To: <20160427083522.GA15073@naverao1-tp.in.ibm.com>

On Wed, 27 Apr 2016 14:05:22 +0530
"Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> wrote:

> On 2016/04/27 09:30AM, Jesper Dangaard Brouer wrote:
> > Getting started with using examples in samples/bpf/ is not
> > straightforward.  There are several dependencies, and specific
> > versions of these dependencies.
> > 
> > Just compiling the example tool is also slightly obscure, e.g. one
> > need to call make like:
> > 
> >  make samples/bpf/
> > 
> > Do notice the "/" slash after the directory name.
> > 
> > Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> > ---
> >  samples/bpf/README.rst |   75 ++++++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 75 insertions(+)
> >  create mode 100644 samples/bpf/README.rst  
> 
> Thanks for adding this! A few nits...

I would prefer if we could apply this patchset and you could followup
with a patch with your nits...

> > 
> > diff --git a/samples/bpf/README.rst b/samples/bpf/README.rst
> > new file mode 100644
> > index 000000000000..1fa157db905b
> > --- /dev/null
> > +++ b/samples/bpf/README.rst
> > @@ -0,0 +1,75 @@
> > +eBPF sample programs
> > +====================
> > +
> > +This kernel samples/bpf directory contains a mini eBPF library, test  
> 	^^^^^^^^^^^^^^^^^^
> 'This directory contains' should suffice.

The reason I formulated it like this, was that people will often hit
this kind of documentation when searching google.


> > +stubs, verifier test-suite and examples for using eBPF.
> > +
> > +Build dependencies
> > +==================
> > +
> > +Compiling requires having installed:
> > + * clang >= version 3.4.0
> > + * llvm >= version 3.7.1
> > +
> > +Note that LLVM's tool 'llc' must support target 'bpf', list with command::
> > +
> > + $ llc --version  
> 
> 'llc --version | grep bpf' is probably simpler?

I wanted to give people the impression of how the output looks like.
 
> > + LLVM (http://llvm.org/):
> > +  LLVM version 3.x.y
> > +  [...]
> > +  Host CPU: xxx
> > +
> > +  Registered Targets:
> > +    [...]
> > +    bpf        - BPF (host endian)
> > +    bpfeb      - BPF (big endian)
> > +    bpfel      - BPF (little endian)
> > +    [...]
> > +
> > +Kernel headers
> > +--------------
> > +
> > +There are usually dependencies to header files of the current kernel.
> > +To avoid installing devel kernel headers system wide, as a normal
> > +user, simply call::
> > +
> > + make headers_install
> > +
> > +This will creates a local "usr/include" directory in the git/build top
> > +level directory, that the make system automatically pickup first.
> > +
> > +Compiling
> > +=========
> > +
> > +For compiling goto kernel top level build directory and run make like::  
> 
> For building the BPF samples, issue the below command from the kernel 
> root directory:

I like your formulation better, but it it worth a respin of the entire
patchset? 

Notice you need the extra "::" ending of the paragraph, to make this
document format nicely with RST (ReStructuredText).

The a README with a .rst suffix will be picked up by github and
displayed as the doc for the directory. Thus I also made sure it
"compiles" with the rst tools. E.g see how samples/pktgen gets auto
documented and nicely formatted via github (scroll down):
 https://github.com/torvalds/linux/tree/master/samples/pktgen

> > +
> > + make samples/bpf/
> > +
> > +Do notice the "/" slash after the directory name.
> > +
> > +Manually compiling LLVM with 'bpf' support
> > +------------------------------------------
> > +
> > +Since version 3.7.0, LLVM adds a proper LLVM backend target for the
> > +BPF bytecode architecture.
> > +
> > +By default llvm will build all non-experimental backends including bpf.
> > +To generate a smaller llc binary one can use::
> > +
> > + -DLLVM_TARGETS_TO_BUILD="BPF;X86"  
> 
> Is the X86 target really needed?

I'm not sure, but if you want to use clang/llc for something else it is
useful, and the example usage of the ";" separator syntax makes it
worth including as an example.

> > +
> > +Quick sniplet for manually compiling LLVM and clang
> > +(build dependencies are cmake and gcc-c++)::
> > +
> > + $ git clone http://llvm.org/git/llvm.git
> > + $ cd llvm/tools
> > + $ git clone --depth 1 http://llvm.org/git/clang.git
> > + $ cd ..; mkdir build; cd build
> > + $ cmake .. -DLLVM_TARGETS_TO_BUILD="BPF;X86"  
> 					    ^^^
> Here too.
> 
> - Naveen
> 
> > + $ make -j $(getconf _NPROCESSORS_ONLN)
> > +
> > +It is also possible to point make to the newly compiled 'llc' command
> > +via redefining LLC on the make command line::
> > +
> > + make samples/bpf/ LLC=~/git/llvm/build/bin/llc
> > +
> >   
> 



-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* pull-request: mac80211 2016-04-27
From: Johannes Berg @ 2016-04-27  8:57 UTC (permalink / raw)
  To: David Miller
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA

Hi Dave,

While writing some new code yesterday, I found and fixed a per-CPU memory
leak, this pull request has just a single patch addressing that.

Let me know if there's any problem.

Thanks,
johannes



The following changes since commit 8f815cdde3e550e10c2736990d791f60c2ce43eb:

  nl80211: check netlink protocol in socket release notification (2016-04-12 15:39:06 +0200)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211.git tags/mac80211-for-davem-2016-04-27

for you to fetch changes up to e6436be21e77e3659b4ff7e357ab5a8342d132d2:

  mac80211: fix statistics leak if dev_alloc_name() fails (2016-04-27 10:06:58 +0200)

----------------------------------------------------------------
Just a single fix, for a per-CPU memory leak in a
(root user triggerable) error case.

----------------------------------------------------------------
Johannes Berg (1):
      mac80211: fix statistics leak if dev_alloc_name() fails

 net/mac80211/iface.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH net-next 1/1] pch_gbe: replace private tx ring lock with common netif_tx_lock
From: Nikolay Aleksandrov @ 2016-04-27  8:55 UTC (permalink / raw)
  To: Francois Romieu, netdev; +Cc: davem
In-Reply-To: <20160426224921.GA29558@electric-eye.fr.zoreil.com>

On 04/27/2016 12:49 AM, Francois Romieu wrote:
> pch_gbe_tx_ring.tx_lock is only used in the hard_xmit handler and
> in the transmit completion reaper called from NAPI context.
> 
> Signed-off-by: Francois Romieu <romieu@fr.zoreil.com>
> ---
>  CONFIG_COMPILE_TESTed
> 
>  drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h      |  2 --
>  drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c | 10 ++--------
>  2 files changed, 2 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h
> index 2a55d6d..8d710a3 100644
> --- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h
> +++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h
> @@ -481,7 +481,6 @@ struct pch_gbe_buffer {
>  
>  /**
>   * struct pch_gbe_tx_ring - tx ring information
> - * @tx_lock:	spinlock structs
>   * @desc:	pointer to the descriptor ring memory
>   * @dma:	physical address of the descriptor ring
>   * @size:	length of descriptor ring in bytes
> @@ -491,7 +490,6 @@ struct pch_gbe_buffer {
>   * @buffer_info:	array of buffer information structs
>   */
>  struct pch_gbe_tx_ring {
> -	spinlock_t tx_lock;
>  	struct pch_gbe_tx_desc *desc;
>  	dma_addr_t dma;
>  	unsigned int size;
> diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
> index ca4add7..5c8e2f1 100644
> --- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
> +++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
> @@ -1640,7 +1640,7 @@ pch_gbe_clean_tx(struct pch_gbe_adapter *adapter,
>  		   cleaned_count);
>  	if (cleaned_count > 0)  { /*skip this if nothing cleaned*/
>  		/* Recover from running out of Tx resources in xmit_frame */
> -		spin_lock(&tx_ring->tx_lock);
> +		netif_tx_lock(adapter->netdev);
>  		if (unlikely(cleaned && (netif_queue_stopped(adapter->netdev))))
>  		{
>  			netif_wake_queue(adapter->netdev);
> @@ -1652,7 +1652,7 @@ pch_gbe_clean_tx(struct pch_gbe_adapter *adapter,
>  
>  		netdev_dbg(adapter->netdev, "next_to_clean : %d\n",
>  			   tx_ring->next_to_clean);
> -		spin_unlock(&tx_ring->tx_lock);
> +		netif_tx_lock(adapter->netdev);

Shouldn't this be netif_tx_unlock ?

>  	}
>  	return cleaned;
>  }
> @@ -1805,7 +1805,6 @@ int pch_gbe_setup_tx_resources(struct pch_gbe_adapter *adapter,
>  
>  	tx_ring->next_to_use = 0;
>  	tx_ring->next_to_clean = 0;
> -	spin_lock_init(&tx_ring->tx_lock);
>  
>  	for (desNo = 0; desNo < tx_ring->count; desNo++) {
>  		tx_desc = PCH_GBE_TX_DESC(*tx_ring, desNo);
> @@ -2135,13 +2134,9 @@ static int pch_gbe_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
>  {
>  	struct pch_gbe_adapter *adapter = netdev_priv(netdev);
>  	struct pch_gbe_tx_ring *tx_ring = adapter->tx_ring;
> -	unsigned long flags;
> -
> -	spin_lock_irqsave(&tx_ring->tx_lock, flags);
>  
>  	if (unlikely(!PCH_GBE_DESC_UNUSED(tx_ring))) {
>  		netif_stop_queue(netdev);
> -		spin_unlock_irqrestore(&tx_ring->tx_lock, flags);
>  		netdev_dbg(netdev,
>  			   "Return : BUSY  next_to use : 0x%08x  next_to clean : 0x%08x\n",
>  			   tx_ring->next_to_use, tx_ring->next_to_clean);
> @@ -2150,7 +2145,6 @@ static int pch_gbe_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
>  
>  	/* CRC,ITAG no support */
>  	pch_gbe_tx_queue(adapter, tx_ring, skb);
> -	spin_unlock_irqrestore(&tx_ring->tx_lock, flags);
>  	return NETDEV_TX_OK;
>  }
>  
> 

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox