Netdev List
 help / color / mirror / Atom feed
* [PATCH] ethtool: add one ethtool option to set relax ordering mode
From: Mao Wenan @ 2016-12-08  6:51 UTC (permalink / raw)
  To: netdev, jeffrey.t.kirsher
In-Reply-To: <1481179898-10668-1-git-send-email-maowenan@huawei.com>

This patch provides one way to set/unset IXGBE NIC TX and RX
relax ordering mode, which can be set by ethtool.
Relax ordering is one mode of 82599 NIC, to enable this mode
can enhance the performance for some cpu architecure.
example:
ethtool -s enp1s0f0 relaxorder off
ethtool -s enp1s0f0 relaxorder on

Signed-off-by: Mao Wenan <maowenan@huawei.com>
---
 ethtool-copy.h |  6 ++++++
 ethtool.c      | 24 +++++++++++++++++++++++-
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/ethtool-copy.h b/ethtool-copy.h
index 3d299e3..37d93be 100644
--- a/ethtool-copy.h
+++ b/ethtool-copy.h
@@ -1329,6 +1329,8 @@ struct ethtool_per_queue_op {
 #define ETHTOOL_PHY_GTUNABLE	0x0000004e /* Get PHY tunable configuration */
 #define ETHTOOL_PHY_STUNABLE	0x0000004f /* Set PHY tunable configuration */
 
+#define ETHTOOL_SRELAXORDER	0x00000050 /* Set relax ordering mode, on or off*/
+
 /* compatibility with older code */
 #define SPARC_ETH_GSET		ETHTOOL_GSET
 #define SPARC_ETH_SSET		ETHTOOL_SSET
@@ -1558,6 +1560,10 @@ static __inline__ int ethtool_validate_duplex(__u8 duplex)
 #define WAKE_MAGIC		(1 << 5)
 #define WAKE_MAGICSECURE	(1 << 6) /* only meaningful if WAKE_MAGIC */
 
+/* Relax Ordering mode, on or off. */
+#define RELAXORDER_OFF		0x00
+#define RELAXORDER_ON		0x01
+
 /* L2-L4 network traffic flow types */
 #define	TCP_V4_FLOW	0x01	/* hash or spec (tcp_ip4_spec) */
 #define	UDP_V4_FLOW	0x02	/* hash or spec (udp_ip4_spec) */
diff --git a/ethtool.c b/ethtool.c
index 7af039e..acafd71 100644
--- a/ethtool.c
+++ b/ethtool.c
@@ -2738,6 +2738,8 @@ static int do_sset(struct cmd_context *ctx)
 	int msglvl_changed = 0;
 	u32 msglvl_wanted = 0;
 	u32 msglvl_mask = 0;
+	int relaxorder_wanted = -1;
+	int relaxorder_changed = 0;
 	struct cmdline_info cmdline_msglvl[ARRAY_SIZE(flags_msglvl)];
 	int argc = ctx->argc;
 	char **argp = ctx->argp;
@@ -2873,6 +2875,16 @@ static int do_sset(struct cmd_context *ctx)
 					ARRAY_SIZE(cmdline_msglvl));
 				break;
 			}
+		} else if (!strcmp(argp[i], "relaxorder")) {
+			relaxorder_changed = 1;
+			i += 1;
+			if (i >= argc)
+				exit_bad_args();
+			if (!strcmp(argp[i], "on"))
+				relaxorder_wanted = RELAXORDER_ON;
+			else if (!strcmp(argp[i], "off"))
+				relaxorder_wanted = RELAXORDER_OFF;
+			else	exit_bad_args();
 		} else {
 			exit_bad_args();
 		}
@@ -3093,6 +3105,15 @@ static int do_sset(struct cmd_context *ctx)
 		}
 	}
 
+	if (relaxorder_changed) {
+		struct ethtool_value edata;
+
+		edata.cmd = ETHTOOL_SRELAXORDER;
+		edata.data = relaxorder_wanted;
+		err = send_ioctl(ctx, &edata);
+		if (err < 0)
+			perror("Cannot set relax ordering mode");
+	}
 	return 0;
 }
 
@@ -4690,7 +4711,8 @@ static const struct option {
 	  "		[ xcvr internal|external ]\n"
 	  "		[ wol p|u|m|b|a|g|s|d... ]\n"
 	  "		[ sopass %x:%x:%x:%x:%x:%x ]\n"
-	  "		[ msglvl %d | msglvl type on|off ... ]\n" },
+	  "		[ msglvl %d | msglvl type on|off ... ]\n"
+	  "		[ relaxorder on|off ]\n" },
 	{ "-a|--show-pause", 1, do_gpause, "Show pause options" },
 	{ "-A|--pause", 1, do_spause, "Set pause options",
 	  "		[ autoneg on|off ]\n"
-- 
2.7.0

^ permalink raw reply related

* [PATCH] net: add one ethtool option to set relax ordering mode
From: Mao Wenan @ 2016-12-08  6:51 UTC (permalink / raw)
  To: netdev, jeffrey.t.kirsher

This patch provides one way to set/unset IXGBE NIC TX and RX
relax ordering mode, which can be set by ethtool.
Relax ordering is one mode of 82599 NIC, to enable this mode
can enhance the performance for some cpu architecure.
example:
ethtool -s enp1s0f0 relaxorder off
ethtool -s enp1s0f0 relaxorder on

Signed-off-by: Mao Wenan <maowenan@huawei.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 34 ++++++++++++++++++++++++
 include/linux/ethtool.h                          |  2 ++
 include/uapi/linux/ethtool.h                     |  6 +++++
 net/core/ethtool.c                               |  5 ++++
 4 files changed, 47 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index f49f803..9650539 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -493,6 +493,39 @@ static void ixgbe_set_msglevel(struct net_device *netdev, u32 data)
 	adapter->msg_enable = data;
 }
 
+static void ixgbe_set_relaxorder(struct net_device *netdev, u32 data)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_hw *hw = &adapter->hw;
+	u32 i = 0;
+	pr_info("set relax ordering mode : %s\n",data?"on":"off");
+	
+	for (i = 0; i < hw->mac.max_tx_queues; i++) {
+		u32 regval;
+
+		regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL_82599(i));
+		if (data)
+			regval |= IXGBE_DCA_TXCTRL_DESC_WRO_EN;
+		else
+			regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN;
+		IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL_82599(i), regval);
+	}
+
+	for (i = 0; i < hw->mac.max_rx_queues; i++) {
+		u32 regval;
+		
+		regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i));
+		if (data)
+			regval |= (IXGBE_DCA_RXCTRL_DATA_WRO_EN |
+					IXGBE_DCA_RXCTRL_HEAD_WRO_EN);
+		else
+			regval &= ~(IXGBE_DCA_RXCTRL_DATA_WRO_EN |
+					IXGBE_DCA_RXCTRL_HEAD_WRO_EN);
+		IXGBE_WRITE_REG(hw, IXGBE_DCA_RXCTRL(i), regval);
+	}
+
+}
+
 static int ixgbe_get_regs_len(struct net_device *netdev)
 {
 #define IXGBE_REGS_LEN  1139
@@ -3274,6 +3307,7 @@ static const struct ethtool_ops ixgbe_ethtool_ops = {
 	.get_ts_info		= ixgbe_get_ts_info,
 	.get_module_info	= ixgbe_get_module_info,
 	.get_module_eeprom	= ixgbe_get_module_eeprom,
+	.set_relaxorder         = ixgbe_set_relaxorder,
 };
 
 void ixgbe_set_ethtool_ops(struct net_device *netdev)
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 9ded8c6..0fae148 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -286,6 +286,7 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
  *	fields should be ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS
  *	instead of the latter), any change to them will be overwritten
  *	by kernel. Returns a negative error code or zero.
+ * @set_relaxorder: set relax ordering mode, on|off.
  *
  * All operations are optional (i.e. the function pointer may be set
  * to %NULL) and callers must take this into account.  Callers must
@@ -372,5 +373,6 @@ struct ethtool_ops {
 				      struct ethtool_link_ksettings *);
 	int	(*set_link_ksettings)(struct net_device *,
 				      const struct ethtool_link_ksettings *);
+	void    (*set_relaxorder)(struct net_device *, u32);
 };
 #endif /* _LINUX_ETHTOOL_H */
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 8e54723..86349b9 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1314,6 +1314,8 @@ struct ethtool_per_queue_op {
 #define ETHTOOL_GLINKSETTINGS	0x0000004c /* Get ethtool_link_settings */
 #define ETHTOOL_SLINKSETTINGS	0x0000004d /* Set ethtool_link_settings */
 
+#define ETHTOOL_SRELAXORDER	0x00000050 /* Set relax ordering mode, on or off*/
+
 
 /* compatibility with older code */
 #define SPARC_ETH_GSET		ETHTOOL_GSET
@@ -1494,6 +1496,10 @@ static inline int ethtool_validate_speed(__u32 speed)
 #define DUPLEX_FULL		0x01
 #define DUPLEX_UNKNOWN		0xff
 
+/* Relax Ordering mode, on or off. */
+#define RELAXORDER_OFF          0x00
+#define RELAXORDER_ON           0x01
+
 static inline int ethtool_validate_duplex(__u8 duplex)
 {
 	switch (duplex) {
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 047a175..b7629d1 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -2685,6 +2685,11 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_SLINKSETTINGS:
 		rc = ethtool_set_link_ksettings(dev, useraddr);
 		break;
+	case ETHTOOL_SRELAXORDER:
+		rc = ethtool_set_value_void(dev, useraddr,
+					dev->ethtool_ops->set_relaxorder);
+		 break;
+
 	default:
 		rc = -EOPNOTSUPP;
 	}
-- 
2.7.0

^ permalink raw reply related

* Re: [PATCH] linux/types.h: enable endian checks for all sparse builds
From: Bart Van Assche @ 2016-12-08  6:38 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: linux-kernel@vger.kernel.org, Linus Torvalds, Christoph Hellwig,
	Jason Wang, linux-kbuild@vger.kernel.org, Michal Marek,
	Arnd Bergmann, Greg Kroah-Hartman, Matt Mackall, Herbert Xu,
	David Airlie, Gerd Hoffmann, Ohad Ben-Cohen,
	Christian Borntraeger, Cornelia Huck, James E.J. Bottomley,
	David S. Miller, Jens Axboe, Neil Armstrong <narmstr
In-Reply-To: <20161208075152-mutt-send-email-mst@kernel.org>

On 12/07/16 21:54, Michael S. Tsirkin wrote:
> On Thu, Dec 08, 2016 at 05:21:47AM +0000, Bart Van Assche wrote:
>> Additionally, there are notable exceptions to the rule that most drivers
>> are endian-clean, e.g. drivers/scsi/qla2xxx. I would appreciate it if it
>> would remain possible to check such drivers with sparse without enabling
>> endianness checks. Have you considered to change #ifdef __CHECK_ENDIAN__
>> into e.g. #ifndef __DONT_CHECK_ENDIAN__?
>
> The right thing is probably just to fix these, isn't it?
> Until then, why not just ignore the warnings?

Neither option is realistic. With endian-checking enabled the qla2xxx 
driver triggers so many warnings that it becomes a real challenge to 
filter the non-endian warnings out manually:

$ for f in "" CF=-D__CHECK_ENDIAN__; do make M=drivers/scsi/qla2xxx C=2\
     $f | &grep -c ': warning:'; done
4
752

If you think it would be easy to fix the endian warnings triggered by 
the qla2xxx driver, you are welcome to try to fix these.

Bart.

^ permalink raw reply

* Re: [net-next PATCH v5 5/6] virtio_net: add XDP_TX support
From: Michael S. Tsirkin @ 2016-12-08  6:11 UTC (permalink / raw)
  To: John Fastabend
  Cc: daniel, shm, davem, tgraf, alexei.starovoitov, john.r.fastabend,
	netdev, brouer
In-Reply-To: <20161207201245.28121.95418.stgit@john-Precision-Tower-5810>

On Wed, Dec 07, 2016 at 12:12:45PM -0800, John Fastabend wrote:
> This adds support for the XDP_TX action to virtio_net. When an XDP
> program is run and returns the XDP_TX action the virtio_net XDP
> implementation will transmit the packet on a TX queue that aligns
> with the current CPU that the XDP packet was processed on.
> 
> Before sending the packet the header is zeroed.  Also XDP is expected
> to handle checksum correctly so no checksum offload  support is
> provided.
> 
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---
>  drivers/net/virtio_net.c |   99 +++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 92 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 28b1196..8e5b13c 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -330,12 +330,57 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>  	return skb;
>  }
>  
> +static void virtnet_xdp_xmit(struct virtnet_info *vi,
> +			     struct receive_queue *rq,
> +			     struct send_queue *sq,
> +			     struct xdp_buff *xdp)
> +{
> +	struct page *page = virt_to_head_page(xdp->data);
> +	struct virtio_net_hdr_mrg_rxbuf *hdr;
> +	unsigned int num_sg, len;
> +	void *xdp_sent;
> +	int err;
> +
> +	/* Free up any pending old buffers before queueing new ones. */
> +	while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> +		struct page *sent_page = virt_to_head_page(xdp_sent);
> +
> +		if (vi->mergeable_rx_bufs)
> +			put_page(sent_page);
> +		else
> +			give_pages(rq, sent_page);
> +	}

Looks like this is the only place where you do virtqueue_get_buf.
No interrupt handler?
This means that if you fill up the queue, nothing will clean it
and things will get stuck.
Can this be the issue you saw?


> +
> +	/* Zero header and leave csum up to XDP layers */
> +	hdr = xdp->data;
> +	memset(hdr, 0, vi->hdr_len);
> +
> +	num_sg = 1;
> +	sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
> +	err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
> +				   xdp->data, GFP_ATOMIC);
> +	if (unlikely(err)) {
> +		if (vi->mergeable_rx_bufs)
> +			put_page(page);
> +		else
> +			give_pages(rq, page);
> +	} else if (!vi->mergeable_rx_bufs) {
> +		/* If not mergeable bufs must be big packets so cleanup pages */
> +		give_pages(rq, (struct page *)page->private);
> +		page->private = 0;
> +	}
> +
> +	virtqueue_kick(sq->vq);

Is this unconditional kick a work-around for hang
we could not figure out yet?
I guess this helps because it just slows down the guest.
I don't much like it ...

> +}
> +
>  static u32 do_xdp_prog(struct virtnet_info *vi,
> +		       struct receive_queue *rq,
>  		       struct bpf_prog *xdp_prog,
>  		       struct page *page, int offset, int len)
>  {
>  	int hdr_padded_len;
>  	struct xdp_buff xdp;
> +	unsigned int qp;
>  	u32 act;
>  	u8 *buf;
>  
> @@ -353,9 +398,15 @@ static u32 do_xdp_prog(struct virtnet_info *vi,
>  	switch (act) {
>  	case XDP_PASS:
>  		return XDP_PASS;
> +	case XDP_TX:
> +		qp = vi->curr_queue_pairs -
> +			vi->xdp_queue_pairs +
> +			smp_processor_id();
> +		xdp.data = buf + (vi->mergeable_rx_bufs ? 0 : 4);
> +		virtnet_xdp_xmit(vi, rq, &vi->sq[qp], &xdp);
> +		return XDP_TX;
>  	default:
>  		bpf_warn_invalid_xdp_action(act);
> -	case XDP_TX:
>  	case XDP_ABORTED:
>  	case XDP_DROP:
>  		return XDP_DROP;
> @@ -390,9 +441,17 @@ static struct sk_buff *receive_big(struct net_device *dev,
>  
>  		if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
>  			goto err_xdp;
> -		act = do_xdp_prog(vi, xdp_prog, page, 0, len);
> -		if (act == XDP_DROP)
> +		act = do_xdp_prog(vi, rq, xdp_prog, page, 0, len);
> +		switch (act) {
> +		case XDP_PASS:
> +			break;
> +		case XDP_TX:
> +			rcu_read_unlock();
> +			goto xdp_xmit;
> +		case XDP_DROP:
> +		default:
>  			goto err_xdp;
> +		}
>  	}
>  	rcu_read_unlock();
>  
> @@ -407,6 +466,7 @@ static struct sk_buff *receive_big(struct net_device *dev,
>  err:
>  	dev->stats.rx_dropped++;
>  	give_pages(rq, page);
> +xdp_xmit:
>  	return NULL;
>  }
>  
> @@ -425,6 +485,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>  	struct bpf_prog *xdp_prog;
>  	unsigned int truesize;
>  
> +	head_skb = NULL;
> +
>  	rcu_read_lock();
>  	xdp_prog = rcu_dereference(rq->xdp_prog);
>  	if (xdp_prog) {
> @@ -448,9 +510,17 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>  		if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
>  			goto err_xdp;
>  
> -		act = do_xdp_prog(vi, xdp_prog, page, offset, len);
> -		if (act == XDP_DROP)
> +		act = do_xdp_prog(vi, rq, xdp_prog, page, offset, len);
> +		switch (act) {
> +		case XDP_PASS:
> +			break;
> +		case XDP_TX:
> +			rcu_read_unlock();
> +			goto xdp_xmit;
> +		case XDP_DROP:
> +		default:
>  			goto err_xdp;
> +		}
>  	}
>  	rcu_read_unlock();
>  
> @@ -528,6 +598,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>  err_buf:
>  	dev->stats.rx_dropped++;
>  	dev_kfree_skb(head_skb);
> +xdp_xmit:
>  	return NULL;
>  }
>  
> @@ -1734,6 +1805,16 @@ static void free_receive_page_frags(struct virtnet_info *vi)
>  			put_page(vi->rq[i].alloc_frag.page);
>  }
>  
> +static bool is_xdp_queue(struct virtnet_info *vi, int q)
> +{
> +	if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
> +		return false;
> +	else if (q < vi->curr_queue_pairs)
> +		return true;
> +	else
> +		return false;
> +}
> +
>  static void free_unused_bufs(struct virtnet_info *vi)
>  {
>  	void *buf;
> @@ -1741,8 +1822,12 @@ static void free_unused_bufs(struct virtnet_info *vi)
>  
>  	for (i = 0; i < vi->max_queue_pairs; i++) {
>  		struct virtqueue *vq = vi->sq[i].vq;
> -		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> -			dev_kfree_skb(buf);
> +		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
> +			if (!is_xdp_queue(vi, i))
> +				dev_kfree_skb(buf);
> +			else
> +				put_page(virt_to_head_page(buf));
> +		}
>  	}
>  
>  	for (i = 0; i < vi->max_queue_pairs; i++) {

^ permalink raw reply

* Re: [net-next PATCH v5 4/6] virtio_net: add dedicated XDP transmit queues
From: Michael S. Tsirkin @ 2016-12-08  5:59 UTC (permalink / raw)
  To: John Fastabend
  Cc: daniel, shm, davem, tgraf, alexei.starovoitov, john.r.fastabend,
	netdev, brouer
In-Reply-To: <20161207201223.28121.87060.stgit@john-Precision-Tower-5810>

On Wed, Dec 07, 2016 at 12:12:23PM -0800, John Fastabend wrote:
> XDP requires using isolated transmit queues to avoid interference
> with normal networking stack (BQL, NETDEV_TX_BUSY, etc).
> This patch
> adds a XDP queue per cpu when a XDP program is loaded and does not
> expose the queues to the OS via the normal API call to
> netif_set_real_num_tx_queues(). This way the stack will never push
> an skb to these queues.
> 
> However virtio/vhost/qemu implementation only allows for creating
> TX/RX queue pairs at this time so creating only TX queues was not
> possible. And because the associated RX queues are being created I
> went ahead and exposed these to the stack and let the backend use
> them. This creates more RX queues visible to the network stack than
> TX queues which is worth mentioning but does not cause any issues as
> far as I can tell.
> 
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---
>  drivers/net/virtio_net.c |   30 ++++++++++++++++++++++++++++--
>  1 file changed, 28 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index a009299..28b1196 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -114,6 +114,9 @@ struct virtnet_info {
>  	/* # of queue pairs currently used by the driver */
>  	u16 curr_queue_pairs;
>  
> +	/* # of XDP queue pairs currently used by the driver */
> +	u16 xdp_queue_pairs;
> +
>  	/* I like... big packets and I cannot lie! */
>  	bool big_packets;
>  
> @@ -1547,7 +1550,8 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
>  	unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
>  	struct virtnet_info *vi = netdev_priv(dev);
>  	struct bpf_prog *old_prog;
> -	int i;
> +	u16 xdp_qp = 0, curr_qp;
> +	int i, err;
>  
>  	if ((dev->features & NETIF_F_LRO) && prog) {
>  		netdev_warn(dev, "can't set XDP while LRO is on, disable LRO first\n");
> @@ -1564,12 +1568,34 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
>  		return -EINVAL;
>  	}
>  
> +	curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
> +	if (prog)
> +		xdp_qp = nr_cpu_ids;
> +
> +	/* XDP requires extra queues for XDP_TX */
> +	if (curr_qp + xdp_qp > vi->max_queue_pairs) {
> +		netdev_warn(dev, "request %i queues but max is %i\n",
> +			    curr_qp + xdp_qp, vi->max_queue_pairs);
> +		return -ENOMEM;
> +	}

Can't we disable XDP_TX somehow? Many people might only want RX drop,
and extra queues are not always there.


> +
> +	err = virtnet_set_queues(vi, curr_qp + xdp_qp);
> +	if (err) {
> +		dev_warn(&dev->dev, "XDP Device queue allocation failure.\n");
> +		return err;
> +	}
> +
>  	if (prog) {
>  		prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
> -		if (IS_ERR(prog))
> +		if (IS_ERR(prog)) {
> +			virtnet_set_queues(vi, curr_qp);
>  			return PTR_ERR(prog);
> +		}
>  	}
>  
> +	vi->xdp_queue_pairs = xdp_qp;
> +	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
> +
>  	for (i = 0; i < vi->max_queue_pairs; i++) {
>  		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
>  		rcu_assign_pointer(vi->rq[i].xdp_prog, prog);

^ permalink raw reply

* Re: [net-next PATCH v5 3/6] virtio_net: Add XDP support
From: Michael S. Tsirkin @ 2016-12-08  5:54 UTC (permalink / raw)
  To: John Fastabend
  Cc: daniel, shm, davem, tgraf, alexei.starovoitov, john.r.fastabend,
	netdev, brouer
In-Reply-To: <5848EC48.7080904@gmail.com>

On Wed, Dec 07, 2016 at 09:14:48PM -0800, John Fastabend wrote:
> On 16-12-07 08:48 PM, Michael S. Tsirkin wrote:
> > On Wed, Dec 07, 2016 at 12:11:57PM -0800, John Fastabend wrote:
> >> From: John Fastabend <john.fastabend@gmail.com>
> >>
> >> This adds XDP support to virtio_net. Some requirements must be
> >> met for XDP to be enabled depending on the mode. First it will
> >> only be supported with LRO disabled so that data is not pushed
> >> across multiple buffers. Second the MTU must be less than a page
> >> size to avoid having to handle XDP across multiple pages.
> >>
> >> If mergeable receive is enabled this patch only supports the case
> >> where header and data are in the same buf which we can check when
> >> a packet is received by looking at num_buf. If the num_buf is
> >> greater than 1 and a XDP program is loaded the packet is dropped
> >> and a warning is thrown. When any_header_sg is set this does not
> >> happen and both header and data is put in a single buffer as expected
> >> so we check this when XDP programs are loaded.  Subsequent patches
> >> will process the packet in a degraded mode to ensure connectivity
> >> and correctness is not lost even if backend pushes packets into
> >> multiple buffers.
> >>
> >> If big packets mode is enabled and MTU/LRO conditions above are
> >> met then XDP is allowed.
> >>
> >> This patch was tested with qemu with vhost=on and vhost=off where
> >> mergeable and big_packet modes were forced via hard coding feature
> >> negotiation. Multiple buffers per packet was forced via a small
> >> test patch to vhost.c in the vhost=on qemu mode.
> >>
> >> Suggested-by: Shrijeet Mukherjee <shrijeet@gmail.com>
> >> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> > 
> > I'd like to note that I don't think disabling LRO is a good
> > plan long-term. It's really important for virtio performance,
> > so IMHO we need a fix for that.
> > I'm guessing that a subset of XDP programs would be quite
> > happy with just looking at headers, and that is there in the 1st buffer.
> > So how about teaching XDP that there could be a truncated packet?
> > 
> > Then we won't have to disable LRO.
> > 
> 
> Agreed long-term we can drop this requirement this type of improvement
> would also allow working with jumbo frames on nics.
> 
> I don't think it should block this patch series though.
> 
> .John

Right.

^ permalink raw reply

* Re: [PATCH] linux/types.h: enable endian checks for all sparse builds
From: Michael S. Tsirkin @ 2016-12-08  5:53 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: kvm@vger.kernel.org, Neil Armstrong, David Airlie,
	linux-remoteproc@vger.kernel.org, dri-devel@lists.freedesktop.org,
	virtualization@lists.linux-foundation.org,
	linux-s390@vger.kernel.org, James E.J. Bottomley, Herbert Xu,
	linux-scsi@vger.kernel.org, Christoph Hellwig,
	v9fs-developer@lists.sourceforge.net, Asias He, Arnd Bergmann,
	linux-kbuild@vger.kernel.org, Jens Axboe, Michal Marek,
	Stefan Hajnoczi <stef
In-Reply-To: <BLUPR02MB168374E98A6C86E43DF0BCFE81840@BLUPR02MB1683.namprd02.prod.outlook.com>

On Thu, Dec 08, 2016 at 05:21:47AM +0000, Bart Van Assche wrote:
> On 12/07/16 18:29, Michael S. Tsirkin wrote:
> > By now, linux is mostly endian-clean. Enabling endian-ness
> > checks for everyone produces about 200 new sparse warnings for me -
> > less than 10% over the 2000 sparse warnings already there.
> >
> > Not a big deal, OTOH enabling this helps people notice
> > they are introducing new bugs.
> >
> > So let's just drop __CHECK_ENDIAN__. Follow-up patches
> > can drop distinction between __bitwise and __bitwise__.
> 
> Hello Michael,
> 
> This patch makes a whole bunch of ccflags-y += -D__CHECK_ENDIAN__ 
> statements obsolete. Have you considered to remove these statements?

Absolutely. Just waiting for feedback on the idea.

> Additionally, there are notable exceptions to the rule that most drivers 
> are endian-clean, e.g. drivers/scsi/qla2xxx. I would appreciate it if it 
> would remain possible to check such drivers with sparse without enabling 
> endianness checks. Have you considered to change #ifdef __CHECK_ENDIAN__ 
> into e.g. #ifndef __DONT_CHECK_ENDIAN__?
> 
> Thanks,
> 
> Bart.

The right thing is probably just to fix these, isn't it?
Until then, why not just ignore the warnings?

-- 
MST

^ permalink raw reply

* Re: [PATCH] linux/types.h: enable endian checks for all sparse builds
From: Bart Van Assche @ 2016-12-08  5:21 UTC (permalink / raw)
  To: Michael S. Tsirkin, linux-kernel@vger.kernel.org, Linus Torvalds,
	Christoph Hellwig
  Cc: Jason Wang, linux-kbuild@vger.kernel.org, Michal Marek,
	Arnd Bergmann, Greg Kroah-Hartman, Matt Mackall, Herbert Xu,
	David Airlie, Gerd Hoffmann, Ohad Ben-Cohen,
	Christian Borntraeger, Cornelia Huck, James E.J. Bottomley,
	David S. Miller, Jens Axboe, Neil Armstrong, Stefan Hajnoczi,
	Asias He, linux-crypto@vger.kernel.org
In-Reply-To: <1481164052-28036-1-git-send-email-mst@redhat.com>

On 12/07/16 18:29, Michael S. Tsirkin wrote:
> By now, linux is mostly endian-clean. Enabling endian-ness
> checks for everyone produces about 200 new sparse warnings for me -
> less than 10% over the 2000 sparse warnings already there.
>
> Not a big deal, OTOH enabling this helps people notice
> they are introducing new bugs.
>
> So let's just drop __CHECK_ENDIAN__. Follow-up patches
> can drop distinction between __bitwise and __bitwise__.

Hello Michael,

This patch makes a whole bunch of ccflags-y += -D__CHECK_ENDIAN__ 
statements obsolete. Have you considered to remove these statements?

Additionally, there are notable exceptions to the rule that most drivers 
are endian-clean, e.g. drivers/scsi/qla2xxx. I would appreciate it if it 
would remain possible to check such drivers with sparse without enabling 
endianness checks. Have you considered to change #ifdef __CHECK_ENDIAN__ 
into e.g. #ifndef __DONT_CHECK_ENDIAN__?

Thanks,

Bart.

^ permalink raw reply

* Re: [net-next PATCH v5 3/6] virtio_net: Add XDP support
From: John Fastabend @ 2016-12-08  5:14 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: daniel, shm, davem, tgraf, alexei.starovoitov, john.r.fastabend,
	netdev, brouer
In-Reply-To: <20161208064037-mutt-send-email-mst@kernel.org>

On 16-12-07 08:48 PM, Michael S. Tsirkin wrote:
> On Wed, Dec 07, 2016 at 12:11:57PM -0800, John Fastabend wrote:
>> From: John Fastabend <john.fastabend@gmail.com>
>>
>> This adds XDP support to virtio_net. Some requirements must be
>> met for XDP to be enabled depending on the mode. First it will
>> only be supported with LRO disabled so that data is not pushed
>> across multiple buffers. Second the MTU must be less than a page
>> size to avoid having to handle XDP across multiple pages.
>>
>> If mergeable receive is enabled this patch only supports the case
>> where header and data are in the same buf which we can check when
>> a packet is received by looking at num_buf. If the num_buf is
>> greater than 1 and a XDP program is loaded the packet is dropped
>> and a warning is thrown. When any_header_sg is set this does not
>> happen and both header and data is put in a single buffer as expected
>> so we check this when XDP programs are loaded.  Subsequent patches
>> will process the packet in a degraded mode to ensure connectivity
>> and correctness is not lost even if backend pushes packets into
>> multiple buffers.
>>
>> If big packets mode is enabled and MTU/LRO conditions above are
>> met then XDP is allowed.
>>
>> This patch was tested with qemu with vhost=on and vhost=off where
>> mergeable and big_packet modes were forced via hard coding feature
>> negotiation. Multiple buffers per packet was forced via a small
>> test patch to vhost.c in the vhost=on qemu mode.
>>
>> Suggested-by: Shrijeet Mukherjee <shrijeet@gmail.com>
>> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> 
> I'd like to note that I don't think disabling LRO is a good
> plan long-term. It's really important for virtio performance,
> so IMHO we need a fix for that.
> I'm guessing that a subset of XDP programs would be quite
> happy with just looking at headers, and that is there in the 1st buffer.
> So how about teaching XDP that there could be a truncated packet?
> 
> Then we won't have to disable LRO.
> 

Agreed long-term we can drop this requirement this type of improvement
would also allow working with jumbo frames on nics.

I don't think it should block this patch series though.

.John

^ permalink raw reply

* Re: [net-next PATCH v5 3/6] virtio_net: Add XDP support
From: Michael S. Tsirkin @ 2016-12-08  4:48 UTC (permalink / raw)
  To: John Fastabend
  Cc: daniel, shm, davem, tgraf, alexei.starovoitov, john.r.fastabend,
	netdev, brouer
In-Reply-To: <20161207201157.28121.39934.stgit@john-Precision-Tower-5810>

On Wed, Dec 07, 2016 at 12:11:57PM -0800, John Fastabend wrote:
> From: John Fastabend <john.fastabend@gmail.com>
> 
> This adds XDP support to virtio_net. Some requirements must be
> met for XDP to be enabled depending on the mode. First it will
> only be supported with LRO disabled so that data is not pushed
> across multiple buffers. Second the MTU must be less than a page
> size to avoid having to handle XDP across multiple pages.
> 
> If mergeable receive is enabled this patch only supports the case
> where header and data are in the same buf which we can check when
> a packet is received by looking at num_buf. If the num_buf is
> greater than 1 and a XDP program is loaded the packet is dropped
> and a warning is thrown. When any_header_sg is set this does not
> happen and both header and data is put in a single buffer as expected
> so we check this when XDP programs are loaded.  Subsequent patches
> will process the packet in a degraded mode to ensure connectivity
> and correctness is not lost even if backend pushes packets into
> multiple buffers.
> 
> If big packets mode is enabled and MTU/LRO conditions above are
> met then XDP is allowed.
> 
> This patch was tested with qemu with vhost=on and vhost=off where
> mergeable and big_packet modes were forced via hard coding feature
> negotiation. Multiple buffers per packet was forced via a small
> test patch to vhost.c in the vhost=on qemu mode.
> 
> Suggested-by: Shrijeet Mukherjee <shrijeet@gmail.com>
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>

I'd like to note that I don't think disabling LRO is a good
plan long-term. It's really important for virtio performance,
so IMHO we need a fix for that.
I'm guessing that a subset of XDP programs would be quite
happy with just looking at headers, and that is there in the 1st buffer.
So how about teaching XDP that there could be a truncated packet?

Then we won't have to disable LRO.


> ---
>  drivers/net/virtio_net.c |  175 +++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 170 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index a5c47b1..a009299 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -22,6 +22,7 @@
>  #include <linux/module.h>
>  #include <linux/virtio.h>
>  #include <linux/virtio_net.h>
> +#include <linux/bpf.h>
>  #include <linux/scatterlist.h>
>  #include <linux/if_vlan.h>
>  #include <linux/slab.h>
> @@ -81,6 +82,8 @@ struct receive_queue {
>  
>  	struct napi_struct napi;
>  
> +	struct bpf_prog __rcu *xdp_prog;
> +
>  	/* Chain pages by the private ptr. */
>  	struct page *pages;
>  
> @@ -324,6 +327,38 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>  	return skb;
>  }
>  
> +static u32 do_xdp_prog(struct virtnet_info *vi,
> +		       struct bpf_prog *xdp_prog,
> +		       struct page *page, int offset, int len)
> +{
> +	int hdr_padded_len;
> +	struct xdp_buff xdp;
> +	u32 act;
> +	u8 *buf;
> +
> +	buf = page_address(page) + offset;
> +
> +	if (vi->mergeable_rx_bufs)
> +		hdr_padded_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> +	else
> +		hdr_padded_len = sizeof(struct padded_vnet_hdr);
> +
> +	xdp.data = buf + hdr_padded_len;
> +	xdp.data_end = xdp.data + (len - vi->hdr_len);
> +
> +	act = bpf_prog_run_xdp(xdp_prog, &xdp);
> +	switch (act) {
> +	case XDP_PASS:
> +		return XDP_PASS;
> +	default:
> +		bpf_warn_invalid_xdp_action(act);
> +	case XDP_TX:
> +	case XDP_ABORTED:
> +	case XDP_DROP:
> +		return XDP_DROP;
> +	}
> +}
> +
>  static struct sk_buff *receive_small(struct virtnet_info *vi, void *buf, unsigned int len)
>  {
>  	struct sk_buff * skb = buf;
> @@ -340,14 +375,32 @@ static struct sk_buff *receive_big(struct net_device *dev,
>  				   void *buf,
>  				   unsigned int len)
>  {
> +	struct bpf_prog *xdp_prog;
>  	struct page *page = buf;
> -	struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
> +	struct sk_buff *skb;
>  
> +	rcu_read_lock();
> +	xdp_prog = rcu_dereference(rq->xdp_prog);
> +	if (xdp_prog) {
> +		struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
> +		u32 act;
> +
> +		if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
> +			goto err_xdp;
> +		act = do_xdp_prog(vi, xdp_prog, page, 0, len);
> +		if (act == XDP_DROP)
> +			goto err_xdp;
> +	}
> +	rcu_read_unlock();
> +
> +	skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
>  	if (unlikely(!skb))
>  		goto err;
>  
>  	return skb;
>  
> +err_xdp:
> +	rcu_read_unlock();
>  err:
>  	dev->stats.rx_dropped++;
>  	give_pages(rq, page);
> @@ -365,11 +418,42 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>  	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
>  	struct page *page = virt_to_head_page(buf);
>  	int offset = buf - page_address(page);
> -	unsigned int truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
> +	struct sk_buff *head_skb, *curr_skb;
> +	struct bpf_prog *xdp_prog;
> +	unsigned int truesize;
> +
> +	rcu_read_lock();
> +	xdp_prog = rcu_dereference(rq->xdp_prog);
> +	if (xdp_prog) {
> +		u32 act;
> +
> +		/* No known backend devices should send packets with
> +		 * more than a single buffer when XDP conditions are
> +		 * met. However it is not strictly illegal so the case
> +		 * is handled as an exception and a warning is thrown.
> +		 */
> +		if (unlikely(num_buf > 1)) {
> +			bpf_warn_invalid_xdp_buffer();
> +			goto err_xdp;
> +		}
>  
> -	struct sk_buff *head_skb = page_to_skb(vi, rq, page, offset, len,
> -					       truesize);
> -	struct sk_buff *curr_skb = head_skb;
> +		/* Transient failure which in theory could occur if
> +		 * in-flight packets from before XDP was enabled reach
> +		 * the receive path after XDP is loaded. In practice I
> +		 * was not able to create this condition.
> +		 */
> +		if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
> +			goto err_xdp;
> +
> +		act = do_xdp_prog(vi, xdp_prog, page, offset, len);
> +		if (act == XDP_DROP)
> +			goto err_xdp;
> +	}
> +	rcu_read_unlock();
> +
> +	truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
> +	head_skb = page_to_skb(vi, rq, page, offset, len, truesize);
> +	curr_skb = head_skb;
>  
>  	if (unlikely(!curr_skb))
>  		goto err_skb;
> @@ -423,6 +507,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>  	ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
>  	return head_skb;
>  
> +err_xdp:
> +	rcu_read_unlock();
>  err_skb:
>  	put_page(page);
>  	while (--num_buf) {
> @@ -1328,6 +1414,13 @@ static int virtnet_set_channels(struct net_device *dev,
>  	if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
>  		return -EINVAL;
>  
> +	/* For now we don't support modifying channels while XDP is loaded
> +	 * also when XDP is loaded all RX queues have XDP programs so we only
> +	 * need to check a single RX queue.
> +	 */
> +	if (vi->rq[0].xdp_prog)
> +		return -EINVAL;
> +
>  	get_online_cpus();
>  	err = virtnet_set_queues(vi, queue_pairs);
>  	if (!err) {
> @@ -1449,6 +1542,69 @@ static int virtnet_set_features(struct net_device *netdev,
>  	return 0;
>  }
>  
> +static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
> +{
> +	unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
> +	struct virtnet_info *vi = netdev_priv(dev);
> +	struct bpf_prog *old_prog;
> +	int i;
> +
> +	if ((dev->features & NETIF_F_LRO) && prog) {
> +		netdev_warn(dev, "can't set XDP while LRO is on, disable LRO first\n");
> +		return -EINVAL;
> +	}
> +
> +	if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
> +		netdev_warn(dev, "XDP expects header/data in single page\n");
> +		return -EINVAL;
> +	}
> +
> +	if (dev->mtu > max_sz) {
> +		netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
> +		return -EINVAL;
> +	}
> +
> +	if (prog) {
> +		prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
> +		if (IS_ERR(prog))
> +			return PTR_ERR(prog);
> +	}
> +
> +	for (i = 0; i < vi->max_queue_pairs; i++) {
> +		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
> +		rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
> +		if (old_prog)
> +			bpf_prog_put(old_prog);
> +	}
> +
> +	return 0;
> +}
> +
> +static bool virtnet_xdp_query(struct net_device *dev)
> +{
> +	struct virtnet_info *vi = netdev_priv(dev);
> +	int i;
> +
> +	for (i = 0; i < vi->max_queue_pairs; i++) {
> +		if (vi->rq[i].xdp_prog)
> +			return true;
> +	}
> +	return false;
> +}
> +
> +static int virtnet_xdp(struct net_device *dev, struct netdev_xdp *xdp)
> +{
> +	switch (xdp->command) {
> +	case XDP_SETUP_PROG:
> +		return virtnet_xdp_set(dev, xdp->prog);
> +	case XDP_QUERY_PROG:
> +		xdp->prog_attached = virtnet_xdp_query(dev);
> +		return 0;
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
>  static const struct net_device_ops virtnet_netdev = {
>  	.ndo_open            = virtnet_open,
>  	.ndo_stop   	     = virtnet_close,
> @@ -1466,6 +1622,7 @@ static int virtnet_set_features(struct net_device *netdev,
>  	.ndo_busy_poll		= virtnet_busy_poll,
>  #endif
>  	.ndo_set_features	= virtnet_set_features,
> +	.ndo_xdp		= virtnet_xdp,
>  };
>  
>  static void virtnet_config_changed_work(struct work_struct *work)
> @@ -1527,12 +1684,20 @@ static void virtnet_free_queues(struct virtnet_info *vi)
>  
>  static void free_receive_bufs(struct virtnet_info *vi)
>  {
> +	struct bpf_prog *old_prog;
>  	int i;
>  
> +	rtnl_lock();
>  	for (i = 0; i < vi->max_queue_pairs; i++) {
>  		while (vi->rq[i].pages)
>  			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);
> +
> +		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
> +		RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
> +		if (old_prog)
> +			bpf_prog_put(old_prog);
>  	}
> +	rtnl_unlock();
>  }
>  
>  static void free_receive_page_frags(struct virtnet_info *vi)

^ permalink raw reply

* Re: Misalignment, MIPS, and ip_hdr(skb)->version
From: Daniel Kahn Gillmor @ 2016-12-08  4:34 UTC (permalink / raw)
  To: Hannes Frederic Sowa, Jason A. Donenfeld, Netdev, linux-mips
  Cc: LKML, WireGuard mailing list
In-Reply-To: <095cac5b-b757-6f4a-e699-8eedf9ed7221@stressinduktion.org>

On Wed 2016-12-07 19:30:34 -0500, Hannes Frederic Sowa wrote:
> Your custom protocol should be designed in a way you get an aligned ip
> header. Most protocols of the IETF follow this mantra and it is always
> possible to e.g. pad options so you end up on aligned boundaries for the
> next header.

fwiw, i'm not convinced that "most protocols of the IETF follow this
mantra".  we've had multiple discussions in different protocol groups
about shaving or bloating by a few bytes here or there in different
protocols, and i don't think anyone has brought up memory alignment as
an argument in any of the discussions i've followed.

that said, it sure does sound like it would make things simpler to
construct the protocol that way :)

          --dkg

^ permalink raw reply

* Re: net-next closing, README
From: Stephen Hemminger @ 2016-12-08  3:13 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20161207.162845.1424733568267357691.davem@davemloft.net>

On Wed, 07 Dec 2016 16:28:45 -0500 (EST)
David Miller <davem@davemloft.net> wrote:

> The merge window is about to open soon, and next week I will be
> having sporadic internet access while travelling around, therefore
> I am closing net-next up tonight.
> 
> Therefore, please do not submit any new features or cleanups for
> net-next.  Bug fixes for problems introduced in net-next are fine,
> however.
> 
> Thank you.

I have a couple of patches that I would like to get into net-next, but
it is not critical. They replace the hardcoded workarounds with code
that negotiates values with the host. Would these be acceptable?
Sorry for the delay but needed to test on oldest supported version
to ensure negotiation worked.

^ permalink raw reply

* Re: net-next closing, README
From: Stephen Hemminger @ 2016-12-08  3:20 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20161207191345.6e765651@xeon-e3>

On Wed, 7 Dec 2016 19:13:45 -0800
Stephen Hemminger <stephen@networkplumber.org> wrote:

> On Wed, 07 Dec 2016 16:28:45 -0500 (EST)
> David Miller <davem@davemloft.net> wrote:
> 
> > The merge window is about to open soon, and next week I will be
> > having sporadic internet access while travelling around, therefore
> > I am closing net-next up tonight.
> > 
> > Therefore, please do not submit any new features or cleanups for
> > net-next.  Bug fixes for problems introduced in net-next are fine,
> > however.
> > 
> > Thank you.  
> 
> I have a couple of patches that I would like to get into net-next, but
> it is not critical. They replace the hardcoded workarounds with code
> that negotiates values with the host. Would these be acceptable?
> Sorry for the delay but needed to test on oldest supported version
> to ensure negotiation worked.

Never mind, although the changes work on older versions of Windows Server,
the performance would be worse.  Basically old servers don't do UDP checksum
offload but still are capable of handling TCP.  Let me work up a better
solution that handles both cases.

^ permalink raw reply

* [PATCH net v2 1/1] driver: ipvlan: Unlink the upper dev when ipvlan_link_new failed
From: fgao @ 2016-12-08  3:16 UTC (permalink / raw)
  To: davem, maheshb, edumazet, netdev, gfree.wind; +Cc: Gao Feng

From: Gao Feng <fgao@ikuai8.com>

When netdev_upper_dev_unlink failed in ipvlan_link_new, need to
unlink the ipvlan dev with upper dev.

Signed-off-by: Gao Feng <fgao@ikuai8.com>
---
 v2: Rename the label to unlink_netdev, per Mahesh Bandewar
 v1: Initial patch

 drivers/net/ipvlan/ipvlan_main.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index 0fef178..dfbc4ef 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -546,13 +546,15 @@ static int ipvlan_link_new(struct net *src_net, struct net_device *dev,
 	}
 	err = ipvlan_set_port_mode(port, mode);
 	if (err) {
-		goto unregister_netdev;
+		goto unlink_netdev;
 	}
 
 	list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans);
 	netif_stacked_transfer_operstate(phy_dev, dev);
 	return 0;
 
+unlink_netdev:
+	netdev_upper_dev_unlink(phy_dev, dev);
 unregister_netdev:
 	unregister_netdevice(dev);
 destroy_ipvlan_port:
-- 
1.9.1

^ permalink raw reply related

* Re: commit : ppp: add rtnetlink device creation support - breaks netcf on my machine.
From: Brad Campbell @ 2016-12-08  2:29 UTC (permalink / raw)
  To: Thomas Haller, Dan Williams, Guillaume Nault
  Cc: netdev, Thomas Graf, David Miller
In-Reply-To: <1481132622.4116.6.camel@redhat.com>

On 08/12/16 01:43, Thomas Haller wrote:
> On Tue, 2016-12-06 at 17:12 -0600, Dan Williams wrote:
>>
>>> libnl1 rejects the IFLA_INFO_DATA attribute because it expects it
>>> to
>>> contain a sub-attribute. Since the payload size is zero it doesn't
>>> match the policy and parsing fails.
>>>
>>> There's no problem with libnl3 because its policy accepts empty
>>> payloads for NLA_NESTED attributes (see libnl3 commit 4be02ace4826
>
> Hi,
>
> libnl1 is unmaintained these days. I don't think it makes sense to
> backport that patch. The last upstream release was 3+ years ago, with
> no upstream development since then.
>
> IMHO netcf should drop libnl-1 support.
>

G'day Thomas,

I'm not sure anyone was suggesting fixing libnl1, it was more around a 
discussion with regard to a change in the kernel breaking old userspace 
and whether it needs to be fixed in the kernel.

Personally, now I have a solution to *my* immediate problem (that being 
any kernel 4.7 or later prevented libvirtd starting on my servers 
because my netcf was compiled against libnl1) I can upgrade the relevant 
userspace components to work around the issue.

Also, now this issue is a number of months old and I appear to be the 
only person reporting it, maybe it's not worth tackling. I would 
absolutely say that netcf needs to drop libnl1 now though as it *is* 
broken on newer kernels under the right circumstances.

I appreciate the assistance in tracking it down anyway. Thanks guys.

Regards,
Brad

^ permalink raw reply

* [PATCH] linux/types.h: enable endian checks for all sparse builds
From: Michael S. Tsirkin @ 2016-12-08  2:29 UTC (permalink / raw)
  To: linux-kernel, Linus Torvalds, Christoph Hellwig
  Cc: kvm, Neil Armstrong, David Airlie, linux-remoteproc, dri-devel,
	virtualization, linux-s390, James E.J.  Bottomley, Herbert Xu,
	linux-scsi, v9fs-developer, Asias He, Arnd Bergmann, linux-kbuild,
	Jens Axboe, Michal Marek, Stefan Hajnoczi, Matt Mackall,
	Greg Kroah-Hartman, linux-kernel, linux-crypto, netdev,
	David S. Miller

By now, linux is mostly endian-clean. Enabling endian-ness
checks for everyone produces about 200 new sparse warnings for me -
less than 10% over the 2000 sparse warnings already there.

Not a big deal, OTOH enabling this helps people notice
they are introducing new bugs.

So let's just drop __CHECK_ENDIAN__. Follow-up patches
can drop distinction between __bitwise and __bitwise__.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Suggested-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---

Linus, could you ack this for upstream? If yes I'll
merge through my tree as a replacement for enabling
this just for virtio.

 include/uapi/linux/types.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/include/uapi/linux/types.h b/include/uapi/linux/types.h
index acf0979..41e5914 100644
--- a/include/uapi/linux/types.h
+++ b/include/uapi/linux/types.h
@@ -23,11 +23,7 @@
 #else
 #define __bitwise__
 #endif
-#ifdef __CHECK_ENDIAN__
 #define __bitwise __bitwise__
-#else
-#define __bitwise
-#endif
 
 typedef __u16 __bitwise __le16;
 typedef __u16 __bitwise __be16;
-- 
MST

^ permalink raw reply related

* Re: [PATCH net 1/1] driver: ipvlan: Unlink the upper dev when ipvlan_link_new failed
From: Gao Feng @ 2016-12-08  1:44 UTC (permalink / raw)
  To: Mahesh Bandewar (महेश बंडेवार)
  Cc: David Miller, Eric Dumazet, linux-netdev
In-Reply-To: <CAF2d9jgfO=__ympFr-RSeWnKcwzDYDee5kh+Dp_KPk1fsMCHYQ@mail.gmail.com>

On Thu, Dec 8, 2016 at 9:39 AM, Mahesh Bandewar (महेश बंडेवार)
<maheshb@google.com> wrote:
> On Wed, Dec 7, 2016 at 5:21 PM,  <fgao@ikuai8.com> wrote:
>> From: Gao Feng <fgao@ikuai8.com>
>>
>> When netdev_upper_dev_unlink failed in ipvlan_link_new, need to
>> unlink the ipvlan dev with upper dev.
>>
>> Signed-off-by: Gao Feng <fgao@ikuai8.com>
>> ---
>>  drivers/net/ipvlan/ipvlan_main.c | 4 +++-
>>  1 file changed, 3 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
>> index 0fef178..189adbc 100644
>> --- a/drivers/net/ipvlan/ipvlan_main.c
>> +++ b/drivers/net/ipvlan/ipvlan_main.c
>> @@ -546,13 +546,15 @@ static int ipvlan_link_new(struct net *src_net, struct net_device *dev,
>>         }
>>         err = ipvlan_set_port_mode(port, mode);
>>         if (err) {
>> -               goto unregister_netdev;
>> +               goto dev_unlink;
>>         }
>>
>>         list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans);
>>         netif_stacked_transfer_operstate(phy_dev, dev);
>>         return 0;
>>
>> +dev_unlink:
> probably 'unlink_netdev' label inline with other labels used. thanks

OK, it is better name.
I will follow it and send v2 update.

Regards
Feng

>> +       netdev_upper_dev_unlink(phy_dev, dev);
>>  unregister_netdev:
>>         unregister_netdevice(dev);
>>  destroy_ipvlan_port:
>> --
>> 1.9.1
>>
>>

^ permalink raw reply

* Re: [PATCH net 1/1] driver: ipvlan: Unlink the upper dev when ipvlan_link_new failed
From: Mahesh Bandewar (महेश बंडेवार) @ 2016-12-08  1:39 UTC (permalink / raw)
  To: fgao; +Cc: David Miller, Eric Dumazet, linux-netdev, gfree.wind
In-Reply-To: <1481160060-28730-1-git-send-email-fgao@ikuai8.com>

On Wed, Dec 7, 2016 at 5:21 PM,  <fgao@ikuai8.com> wrote:
> From: Gao Feng <fgao@ikuai8.com>
>
> When netdev_upper_dev_unlink failed in ipvlan_link_new, need to
> unlink the ipvlan dev with upper dev.
>
> Signed-off-by: Gao Feng <fgao@ikuai8.com>
> ---
>  drivers/net/ipvlan/ipvlan_main.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
> index 0fef178..189adbc 100644
> --- a/drivers/net/ipvlan/ipvlan_main.c
> +++ b/drivers/net/ipvlan/ipvlan_main.c
> @@ -546,13 +546,15 @@ static int ipvlan_link_new(struct net *src_net, struct net_device *dev,
>         }
>         err = ipvlan_set_port_mode(port, mode);
>         if (err) {
> -               goto unregister_netdev;
> +               goto dev_unlink;
>         }
>
>         list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans);
>         netif_stacked_transfer_operstate(phy_dev, dev);
>         return 0;
>
> +dev_unlink:
probably 'unlink_netdev' label inline with other labels used. thanks
> +       netdev_upper_dev_unlink(phy_dev, dev);
>  unregister_netdev:
>         unregister_netdevice(dev);
>  destroy_ipvlan_port:
> --
> 1.9.1
>
>

^ permalink raw reply

* Re: [Intel-wired-lan] [PATCH 1/1] ixgbe: fcoe: return value of skb_linearize should be handled
From: Rustad, Mark D @ 2016-12-08  1:32 UTC (permalink / raw)
  To: Zhouyi Zhou
  Cc: Kirsher, Jeffrey T, intel-wired-lan@lists.osuosl.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org, Zhouyi Zhou
In-Reply-To: <1481096614-25295-1-git-send-email-zhouzhouyi@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 832 bytes --]

Zhouyi Zhou <zhouzhouyi@gmail.com> wrote:

> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  
> b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> index fee1f29..4926d48 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> @@ -2173,8 +2173,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector  
> *q_vector,
>  				total_rx_bytes += ddp_bytes;
>  				total_rx_packets += DIV_ROUND_UP(ddp_bytes,
>  								 mss);
> -			}
> -			if (!ddp_bytes) {
> +			} else {
>  				dev_kfree_skb_any(skb);
>  				continue;
>  			}

This is changing the logic by treating a negative ddp_bytes value (an error  
return) the same as a 0 value. This is probably wrong and inappropriate for  
this patch in any case.

--
Mark Rustad, Networking Division, Intel Corporation

[-- Attachment #2: Message signed with OpenPGP using GPGMail --]
[-- Type: application/pgp-signature, Size: 841 bytes --]

^ permalink raw reply

* [PATCH net 1/1] driver: ipvlan: Unlink the upper dev when ipvlan_link_new failed
From: fgao @ 2016-12-08  1:21 UTC (permalink / raw)
  To: davem, maheshb, edumazet, netdev, gfree.wind; +Cc: Gao Feng

From: Gao Feng <fgao@ikuai8.com>

When netdev_upper_dev_unlink failed in ipvlan_link_new, need to
unlink the ipvlan dev with upper dev.

Signed-off-by: Gao Feng <fgao@ikuai8.com>
---
 drivers/net/ipvlan/ipvlan_main.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index 0fef178..189adbc 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -546,13 +546,15 @@ static int ipvlan_link_new(struct net *src_net, struct net_device *dev,
 	}
 	err = ipvlan_set_port_mode(port, mode);
 	if (err) {
-		goto unregister_netdev;
+		goto dev_unlink;
 	}
 
 	list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans);
 	netif_stacked_transfer_operstate(phy_dev, dev);
 	return 0;
 
+dev_unlink:
+	netdev_upper_dev_unlink(phy_dev, dev);
 unregister_netdev:
 	unregister_netdevice(dev);
 destroy_ipvlan_port:
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH 1/1] ixgbe: fcoe: return value of skb_linearize should be handled
From: Zhouyi Zhou @ 2016-12-08  1:21 UTC (permalink / raw)
  To: Jeff Kirsher
  Cc: intel-wired-lan, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, Zhouyi Zhou
In-Reply-To: <1481131804.2404.10.camel@intel.com>

Thanks Jeff for your advice,
Sorry for the  my innocence as a Linux kernel rookie.

Zhouyi

On Thu, Dec 8, 2016 at 1:30 AM, Jeff Kirsher
<jeffrey.t.kirsher@intel.com> wrote:
> On Wed, 2016-12-07 at 15:43 +0800, Zhouyi Zhou wrote:
>> Signed-off-by: Zhouyi Zhou <yizhouzhou@ict.ac.cn>
>> Reviewed-by: Cong Wang <xiyou.wangcong@gmail.com>
>> Reviewed-by: Yuval Shaia <yuval.shaia@oracle.com>
>> Reviewed-by: Eric Dumazet <eric.dumazet@gmail.com>
>> ---
>>  drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c | 6 +++++-
>>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 3 +--
>>  2 files changed, 6 insertions(+), 3 deletions(-)
>
> Did Cong, Yuval and Eric give their Reviewed-by offline?  I see they made
> comments and suggests, but never saw them actually give you their Reviewed-
> by.  You cannot automatically add their Reviewed-by, Signed-off-by, etc
> just because someone provides feedback on your patch.

^ permalink raw reply

* RE: [net-next v2 02/19] i40e: simplify txd use count calculation
From: Duyck, Alexander H @ 2016-12-08  1:09 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Kirsher, Jeffrey T, davem@davemloft.net, Williams, Mitch A,
	netdev@vger.kernel.org, nhorman@redhat.com, sassmann@redhat.com,
	jogreene@redhat.com, guru.anbalagane@oracle.com
In-Reply-To: <1481159012.4930.85.camel@edumazet-glaptop3.roam.corp.google.com>

> -----Original Message-----
> From: Eric Dumazet [mailto:eric.dumazet@gmail.com]
> Sent: Wednesday, December 7, 2016 5:04 PM
> To: Duyck, Alexander H <alexander.h.duyck@intel.com>
> Cc: Kirsher, Jeffrey T <jeffrey.t.kirsher@intel.com>; davem@davemloft.net;
> Williams, Mitch A <mitch.a.williams@intel.com>; netdev@vger.kernel.org;
> nhorman@redhat.com; sassmann@redhat.com; jogreene@redhat.com;
> guru.anbalagane@oracle.com
> Subject: Re: [net-next v2 02/19] i40e: simplify txd use count calculation
> 
> On Thu, 2016-12-08 at 00:35 +0000, Duyck, Alexander H wrote:
> 
> > Well there ends up being a few aspects to it.  First we don't need the
> > precision of a full 64b inverse multiplication, that is why we can get
> > away with multiple by 85 and shift.  The assumption is we should never
> > see a buffer larger than 64K for a TSO frame.  That being the case we
> > can do the same thing without having to use a 64b value which isn't an
> > option on 32b architectures.
> >
> > So basically what it comes down to is dealing with the "optimized for
> > size" kernel option, and 32b architectures not being able to do this.
> > Arguably both are corner cases but better to deal with them than take
> > a performance hit we don't have to.
> 
> ok ok ;)
> 
> Too bad the 65536 value is accepted, (is it ?) otherwise
> 
> unsigned int foo(unsigned short size)
> {
> 	return size / 0x3000;
> }
> 
> -> generates the same kind of instructions, with maybe a better
> precision.
> 
> foo:
> 	movzwl	%di, %eax
> 	imull	$43691, %eax, %eax
> 	shrl	$29, %eax
> 	ret

I haven't ever looked all that closely.  I'm assuming the frame size can probably exceed by at least ETH_HLEN - 1 since the IP header length can theoretically reach 64K - 1 before we are maxed out.

We don't really need the extra precision anyway.  We will be off by 1 most likely anyway in many cases since the actual hardware can handle up to 16K - 1 in either the first and/or last buffer of any series since the actual limit is the 14 bits in the Tx descriptor for reporting the buffer length and we try to keep the split between buffers 4K aligned.

- Alex

^ permalink raw reply

* RE: [net-next] icmp: correct return value of icmp_rcv()
From: 张胜举 @ 2016-12-08  1:09 UTC (permalink / raw)
  To: 'Eric Dumazet'; +Cc: netdev
In-Reply-To: <1481120298.4930.3.camel@edumazet-glaptop3.roam.corp.google.com>

> -----Original Message-----
> From: Eric Dumazet [mailto:eric.dumazet@gmail.com]
> Sent: Wednesday, December 07, 2016 10:18 PM
> To: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
> Cc: netdev@vger.kernel.org
> Subject: Re: [net-next] icmp: correct return value of icmp_rcv()
> 
> On Wed, 2016-12-07 at 14:52 +0800, Zhang Shengju wrote:
> > Currently, icmp_rcv() always return zero on a packet delivery upcall.
> >
> > To make its behavior more compliant with the way this API should be
> > used, this patch changes this to let it return NET_RX_SUCCESS when the
> > packet is proper handled, and NET_RX_DROP otherwise.
> >
> > Signed-off-by: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
> > ---
> >  net/ipv4/icmp.c | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 691146a..f79d7a8
> > 100644
> > --- a/net/ipv4/icmp.c
> > +++ b/net/ipv4/icmp.c
> > @@ -1047,12 +1047,12 @@ int icmp_rcv(struct sk_buff *skb)
> >
> >  	if (success)  {
> >  		consume_skb(skb);
> > -		return 0;
> > +		return NET_RX_SUCCESS;
> >  	}
> >
> >  drop:
> >  	kfree_skb(skb);
> > -	return 0;
> > +	return NET_RX_DROP;
> >  csum_error:
> >  	__ICMP_INC_STATS(net, ICMP_MIB_CSUMERRORS);
> >  error:
> 
> 
> I am curious, what external/visible effects do you expect from such a change ?
> 
> We now have a very precise monitoring of where packets are dropped
> (consume_skb()/kfree_skb())
> 
> 

I know that the return value is always ignored, I just to want to make it 
compliant with the way this API required like I said in the comment.

Thanks,

^ permalink raw reply

* Re: [net-next v2 02/19] i40e: simplify txd use count calculation
From: Eric Dumazet @ 2016-12-08  1:03 UTC (permalink / raw)
  To: Duyck, Alexander H
  Cc: Kirsher, Jeffrey T, davem@davemloft.net, Williams, Mitch A,
	netdev@vger.kernel.org, nhorman@redhat.com, sassmann@redhat.com,
	jogreene@redhat.com, guru.anbalagane@oracle.com
In-Reply-To: <B1C1DF2ACD01FD4881736AA51731BAB2A54ADC@ORSMSX107.amr.corp.intel.com>

On Thu, 2016-12-08 at 00:35 +0000, Duyck, Alexander H wrote:

> Well there ends up being a few aspects to it.  First we don't need the
> precision of a full 64b inverse multiplication, that is why we can get
> away with multiple by 85 and shift.  The assumption is we should never
> see a buffer larger than 64K for a TSO frame.  That being the case we
> can do the same thing without having to use a 64b value which isn't an
> option on 32b architectures.
> 
> So basically what it comes down to is dealing with the "optimized for
> size" kernel option, and 32b architectures not being able to do this.
> Arguably both are corner cases but better to deal with them than take
> a performance hit we don't have to.

ok ok ;)

Too bad the 65536 value is accepted, (is it ?) otherwise

unsigned int foo(unsigned short size)
{
	return size / 0x3000;
}

-> generates the same kind of instructions, with maybe a better
precision.

foo:
	movzwl	%di, %eax
	imull	$43691, %eax, %eax
	shrl	$29, %eax
	ret

^ permalink raw reply

* [PATCH 1/1] orinoco: fix improper return value
From: Pan Bian @ 2016-12-08  0:40 UTC (permalink / raw)
  To: Kalle Valo, linux-wireless, netdev; +Cc: linux-kernel, Pan Bian

Function orinoco_ioctl_commit() returns 0 (indicates success) when the
call to orinoco_lock() fails. Thus, the return value is inconsistent with
the execution status. It may be better to return "-EBUSY" when the call 
to orinoco_lock() fails.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=188671

Signed-off-by: Pan Bian <bianpan2016@163.com>
---
 drivers/net/wireless/intersil/orinoco/wext.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/intersil/orinoco/wext.c b/drivers/net/wireless/intersil/orinoco/wext.c
index 1d4dae4..fee57ea 100644
--- a/drivers/net/wireless/intersil/orinoco/wext.c
+++ b/drivers/net/wireless/intersil/orinoco/wext.c
@@ -1314,7 +1314,7 @@ static int orinoco_ioctl_commit(struct net_device *dev,
 		return 0;
 
 	if (orinoco_lock(priv, &flags) != 0)
-		return err;
+		return -EBUSY;
 
 	err = orinoco_commit(priv);
 
-- 
1.9.1

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox