Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next V2 7/7] liquidio VF rx data and ctl path
From: Raghu Vatsavayi @ 2016-12-06 21:06 UTC (permalink / raw)
  To: davem
  Cc: netdev, Raghu Vatsavayi, Raghu Vatsavayi, Derek Chickles,
	Satanand Burla, Felix Manlunas
In-Reply-To: <1481058367-3937-1-git-send-email-rvatsavayi@caviumnetworks.com>

Adds support for the VF receive data path and control path.

Signed-off-by: Raghu Vatsavayi <raghu.vatsavayi@caviumnetworks.com>
Signed-off-by: Derek Chickles <derek.chickles@caviumnetworks.com>
Signed-off-by: Satanand Burla <satananda.burla@caviumnetworks.com>
Signed-off-by: Felix Manlunas <felix.manlunas@caviumnetworks.com>
---
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 412 ++++++++++++++++++++-
 .../net/ethernet/cavium/liquidio/octeon_device.c   |   2 +-
 drivers/net/ethernet/cavium/liquidio/octeon_droq.c |  10 +
 .../ethernet/cavium/liquidio/response_manager.c    |   3 +-
 4 files changed, 423 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index ce5cdcd..e3724c6 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -38,6 +38,8 @@
 
 #define DEFAULT_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK)
 
+/* Bit mask values for lio->ifstate */
+#define   LIO_IFSTATE_DROQ_OPS             0x01
 #define   LIO_IFSTATE_REGISTERED           0x02
 #define   LIO_IFSTATE_RUNNING              0x04
 
@@ -55,6 +57,14 @@ struct liquidio_if_cfg_resp {
 	u64 status;
 };
 
+struct liquidio_rx_ctl_context {
+	int octeon_id;
+
+	wait_queue_head_t wc;
+
+	int cond;
+};
+
 union tx_info {
 	u64 u64;
 	struct {
@@ -177,6 +187,16 @@ static int wait_for_pending_requests(struct octeon_device *oct)
 };
 
 /**
+ * \brief check interface state
+ * @param lio per-network private data
+ * @param state_flag flag state to check
+ */
+static inline int ifstate_check(struct lio *lio, int state_flag)
+{
+	return atomic_read(&lio->ifstate) & state_flag;
+}
+
+/**
  * \brief set interface state
  * @param lio per-network private data
  * @param state_flag flag state to set
@@ -510,6 +530,31 @@ static inline void update_link_status(struct net_device *netdev,
 	}
 }
 
+static void update_txq_status(struct octeon_device *oct, int iq_num)
+{
+	struct octeon_instr_queue *iq = oct->instr_queue[iq_num];
+	struct net_device *netdev;
+	struct lio *lio;
+
+	netdev = oct->props[iq->ifidx].netdev;
+	lio = GET_LIO(netdev);
+	if (netif_is_multiqueue(netdev)) {
+		if (__netif_subqueue_stopped(netdev, iq->q_index) &&
+		    lio->linfo.link.s.link_up &&
+		    (!octnet_iq_is_full(oct, iq_num))) {
+			netif_wake_subqueue(netdev, iq->q_index);
+			INCR_INSTRQUEUE_PKT_COUNT(lio->oct_dev, iq_num,
+						  tx_restart, 1);
+		} else {
+			if (!octnet_iq_is_full(oct, lio->txq)) {
+				INCR_INSTRQUEUE_PKT_COUNT(
+				    lio->oct_dev, lio->txq, tx_restart, 1);
+				wake_q(netdev, lio->txq);
+			}
+		}
+	}
+}
+
 static
 int liquidio_schedule_msix_droq_pkt_handler(struct octeon_droq *droq, u64 ret)
 {
@@ -818,6 +863,91 @@ static void octeon_destroy_resources(struct octeon_device *oct)
 }
 
 /**
+ * \brief Callback for rx ctrl
+ * @param status status of request
+ * @param buf pointer to resp structure
+ */
+static void rx_ctl_callback(struct octeon_device *oct,
+			    u32 status, void *buf)
+{
+	struct octeon_soft_command *sc = (struct octeon_soft_command *)buf;
+	struct liquidio_rx_ctl_context *ctx;
+
+	ctx  = (struct liquidio_rx_ctl_context *)sc->ctxptr;
+
+	oct = lio_get_device(ctx->octeon_id);
+	if (status)
+		dev_err(&oct->pci_dev->dev, "rx ctl instruction failed. Status: %llx\n",
+			CVM_CAST64(status));
+	WRITE_ONCE(ctx->cond, 1);
+
+	/* This barrier is required to be sure that the response has been
+	 * written fully before waking up the handler
+	 */
+	wmb();
+
+	wake_up_interruptible(&ctx->wc);
+}
+
+/**
+ * \brief Send Rx control command
+ * @param lio per-network private data
+ * @param start_stop whether to start or stop
+ */
+static void send_rx_ctrl_cmd(struct lio *lio, int start_stop)
+{
+	struct octeon_device *oct = (struct octeon_device *)lio->oct_dev;
+	int ctx_size = sizeof(struct liquidio_rx_ctl_context);
+	struct liquidio_rx_ctl_context *ctx;
+	struct octeon_soft_command *sc;
+	union octnet_cmd *ncmd;
+	int retval;
+
+	if (oct->props[lio->ifidx].rx_on == start_stop)
+		return;
+
+	sc = (struct octeon_soft_command *)
+		octeon_alloc_soft_command(oct, OCTNET_CMD_SIZE,
+					  16, ctx_size);
+
+	ncmd = (union octnet_cmd *)sc->virtdptr;
+	ctx  = (struct liquidio_rx_ctl_context *)sc->ctxptr;
+
+	WRITE_ONCE(ctx->cond, 0);
+	ctx->octeon_id = lio_get_device_id(oct);
+	init_waitqueue_head(&ctx->wc);
+
+	ncmd->u64 = 0;
+	ncmd->s.cmd = OCTNET_CMD_RX_CTL;
+	ncmd->s.param1 = start_stop;
+
+	octeon_swap_8B_data((u64 *)ncmd, (OCTNET_CMD_SIZE >> 3));
+
+	sc->iq_no = lio->linfo.txpciq[0].s.q_no;
+
+	octeon_prepare_soft_command(oct, sc, OPCODE_NIC,
+				    OPCODE_NIC_CMD, 0, 0, 0);
+
+	sc->callback = rx_ctl_callback;
+	sc->callback_arg = sc;
+	sc->wait_time = 5000;
+
+	retval = octeon_send_soft_command(oct, sc);
+	if (retval == IQ_SEND_FAILED) {
+		netif_info(lio, rx_err, lio->netdev, "Failed to send RX Control message\n");
+	} else {
+		/* Sleep on a wait queue till the cond flag indicates that the
+		 * response arrived or timed-out.
+		 */
+		if (sleep_cond(&ctx->wc, &ctx->cond) == -EINTR)
+			return;
+		oct->props[lio->ifidx].rx_on = start_stop;
+	}
+
+	octeon_free_soft_command(oct, sc);
+}
+
+/**
  * \brief Destroy NIC device interface
  * @param oct octeon device
  * @param ifidx which interface to destroy
@@ -828,6 +958,7 @@ static void octeon_destroy_resources(struct octeon_device *oct)
 static void liquidio_destroy_nic_device(struct octeon_device *oct, int ifidx)
 {
 	struct net_device *netdev = oct->props[ifidx].netdev;
+	struct napi_struct *napi, *n;
 	struct lio *lio;
 
 	if (!netdev) {
@@ -843,6 +974,15 @@ static void liquidio_destroy_nic_device(struct octeon_device *oct, int ifidx)
 	if (atomic_read(&lio->ifstate) & LIO_IFSTATE_RUNNING)
 		liquidio_stop(netdev);
 
+	if (oct->props[lio->ifidx].napi_enabled == 1) {
+		list_for_each_entry_safe(napi, n, &netdev->napi_list, dev_list)
+			napi_disable(napi);
+
+		oct->props[lio->ifidx].napi_enabled = 0;
+
+		oct->droq[0]->ops.poll_mode = 0;
+	}
+
 	if (atomic_read(&lio->ifstate) & LIO_IFSTATE_REGISTERED)
 		unregister_netdev(netdev);
 
@@ -863,7 +1003,8 @@ static void liquidio_destroy_nic_device(struct octeon_device *oct, int ifidx)
  */
 static int liquidio_stop_nic_module(struct octeon_device *oct)
 {
-	int i;
+	struct lio *lio;
+	int i, j;
 
 	dev_dbg(&oct->pci_dev->dev, "Stopping network interfaces\n");
 	if (!oct->ifcount) {
@@ -871,6 +1012,17 @@ static int liquidio_stop_nic_module(struct octeon_device *oct)
 		return 1;
 	}
 
+	spin_lock_bh(&oct->cmd_resp_wqlock);
+	oct->cmd_resp_state = OCT_DRV_OFFLINE;
+	spin_unlock_bh(&oct->cmd_resp_wqlock);
+
+	for (i = 0; i < oct->ifcount; i++) {
+		lio = GET_LIO(oct->props[i].netdev);
+		for (j = 0; j < lio->linfo.num_rxpciq; j++)
+			octeon_unregister_droq_ops(oct,
+						   lio->linfo.rxpciq[j].s.q_no);
+	}
+
 	for (i = 0; i < oct->ifcount; i++)
 		liquidio_destroy_nic_device(oct, i);
 
@@ -1091,6 +1243,41 @@ static void free_netsgbuf_with_resp(void *buf)
 }
 
 /**
+ * \brief Setup output queue
+ * @param oct octeon device
+ * @param q_no which queue
+ * @param num_descs how many descriptors
+ * @param desc_size size of each descriptor
+ * @param app_ctx application context
+ */
+static int octeon_setup_droq(struct octeon_device *oct, int q_no, int num_descs,
+			     int desc_size, void *app_ctx)
+{
+	int ret_val;
+
+	dev_dbg(&oct->pci_dev->dev, "Creating Droq: %d\n", q_no);
+	/* droq creation and local register settings. */
+	ret_val = octeon_create_droq(oct, q_no, num_descs, desc_size, app_ctx);
+	if (ret_val < 0)
+		return ret_val;
+
+	if (ret_val == 1) {
+		dev_dbg(&oct->pci_dev->dev, "Using default droq %d\n", q_no);
+		return 0;
+	}
+
+	/* Enable the droq queues */
+	octeon_set_droq_pkt_op(oct, q_no, 1);
+
+	/* Send Credit for Octeon Output queues. Credits are always
+	 * sent after the output queue is enabled.
+	 */
+	writel(oct->droq[q_no]->max_count, oct->droq[q_no]->pkts_credit_reg);
+
+	return ret_val;
+}
+
+/**
  * \brief Callback for getting interface configuration
  * @param status status of request
  * @param buf pointer to resp structure
@@ -1142,6 +1329,155 @@ static u16 select_q(struct net_device *dev, struct sk_buff *skb,
 	return (u16)(qindex % (lio->linfo.num_txpciq));
 }
 
+/** Routine to push packets arriving on Octeon interface up to network layer.
+ * @param octeon_id - octeon device id (unused).
+ * @param skbuff   - skbuff struct to be passed to network layer.
+ * @param len      - size of total data received.
+ * @param rh       - Control header associated with the packet
+ * @param param    - NAPI struct embedded in the droq that got the packet
+ * @param arg      - farg registered in droq_ops (the net_device)
+ */
+static void
+liquidio_push_packet(u32 octeon_id __attribute__((unused)),
+		     void *skbuff,
+		     u32 len,
+		     union octeon_rh *rh,
+		     void *param,
+		     void *arg)
+{
+	struct napi_struct *napi = param;
+	struct octeon_droq *droq =
+		container_of(param, struct octeon_droq, napi);
+	struct net_device *netdev = (struct net_device *)arg;
+	struct sk_buff *skb = (struct sk_buff *)skbuff;
+
+	if (netdev) {
+		struct lio *lio = GET_LIO(netdev);
+		int packet_was_received;
+
+		/* Do not proceed if the interface is not in RUNNING state. */
+		if (!ifstate_check(lio, LIO_IFSTATE_RUNNING)) {
+			recv_buffer_free(skb);
+			droq->stats.rx_dropped++;
+			return;
+		}
+
+		skb->dev = netdev;
+
+		skb_record_rx_queue(skb, droq->q_no);
+		if (likely(len > MIN_SKB_SIZE)) {
+			struct octeon_skb_page_info *pg_info;
+			unsigned char *va;
+
+			pg_info = ((struct octeon_skb_page_info *)(skb->cb));
+			if (pg_info->page) {
+				/* For Paged allocation use the frags */
+				va = page_address(pg_info->page) +
+					pg_info->page_offset;
+				memcpy(skb->data, va, MIN_SKB_SIZE);
+				skb_put(skb, MIN_SKB_SIZE);
+				skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
+						pg_info->page,
+						pg_info->page_offset +
+						MIN_SKB_SIZE,
+						len - MIN_SKB_SIZE,
+						LIO_RXBUFFER_SZ);
+			}
+		} else {
+			struct octeon_skb_page_info *pg_info =
+				((struct octeon_skb_page_info *)(skb->cb));
+			skb_copy_to_linear_data(skb,
+						page_address(pg_info->page) +
+						pg_info->page_offset, len);
+			skb_put(skb, len);
+			put_page(pg_info->page);
+		}
+
+		skb_pull(skb, rh->r_dh.len * 8);
+		skb->protocol = eth_type_trans(skb, skb->dev);
+
+		if ((netdev->features & NETIF_F_RXCSUM) &&
+		    (rh->r_dh.csum_verified & CNNIC_CSUM_VERIFIED))
+			/* checksum has already been verified */
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+		else
+			skb->ip_summed = CHECKSUM_NONE;
+
+		packet_was_received = (napi_gro_receive(napi, skb) != GRO_DROP);
+
+		if (packet_was_received) {
+			droq->stats.rx_bytes_received += len;
+			droq->stats.rx_pkts_received++;
+			netdev->last_rx = jiffies;
+		} else {
+			droq->stats.rx_dropped++;
+			netif_info(lio, rx_err, lio->netdev,
+				   "droq:%d  error rx_dropped:%llu\n",
+				   droq->q_no, droq->stats.rx_dropped);
+		}
+
+	} else {
+		recv_buffer_free(skb);
+	}
+}
+
+/**
+ * \brief callback when receive interrupt occurs and we are in NAPI mode
+ * @param arg pointer to octeon output queue
+ */
+static void liquidio_vf_napi_drv_callback(void *arg)
+{
+	struct octeon_droq *droq = arg;
+
+	napi_schedule_irqoff(&droq->napi);
+}
+
+/**
+ * \brief Entry point for NAPI polling
+ * @param napi NAPI structure
+ * @param budget maximum number of items to process
+ */
+static int liquidio_napi_poll(struct napi_struct *napi, int budget)
+{
+	struct octeon_instr_queue *iq;
+	struct octeon_device *oct;
+	struct octeon_droq *droq;
+	int tx_done = 0, iq_no;
+	int work_done;
+
+	droq = container_of(napi, struct octeon_droq, napi);
+	oct = droq->oct_dev;
+	iq_no = droq->q_no;
+
+	/* Handle Droq descriptors */
+	work_done = octeon_process_droq_poll_cmd(oct, droq->q_no,
+						 POLL_EVENT_PROCESS_PKTS,
+						 budget);
+
+	/* Flush the instruction queue */
+	iq = oct->instr_queue[iq_no];
+	if (iq) {
+		/* Process iq buffers within the budget limits */
+		tx_done = octeon_flush_iq(oct, iq, 1, budget);
+		/* Update iq read-index rather than waiting for next interrupt.
+		 * Return back if tx_done is false.
+		 */
+		update_txq_status(oct, iq_no);
+	} else {
+		dev_err(&oct->pci_dev->dev, "%s: iq (%d) num invalid\n",
+			__func__, iq_no);
+	}
+
+	if ((work_done < budget) && (tx_done)) {
+		napi_complete(napi);
+		octeon_process_droq_poll_cmd(droq->oct_dev, droq->q_no,
+					     POLL_EVENT_ENABLE_INTR, 0);
+		return 0;
+	}
+
+	return (!tx_done) ? (budget) : (work_done);
+}
+
 /**
  * \brief Setup input and output queues
  * @param octeon_dev octeon device
@@ -1153,16 +1489,68 @@ static u16 select_q(struct net_device *dev, struct sk_buff *skb,
  */
 static inline int setup_io_queues(struct octeon_device *octeon_dev, int ifidx)
 {
+	struct octeon_droq_ops droq_ops;
 	struct net_device *netdev;
+	static int cpu_id_modulus;
+	struct octeon_droq *droq;
+	struct napi_struct *napi;
+	static int cpu_id;
 	int num_tx_descs;
 	struct lio *lio;
 	int retval = 0;
-	int q;
+	int q, q_no;
 
 	netdev = octeon_dev->props[ifidx].netdev;
 
 	lio = GET_LIO(netdev);
 
+	memset(&droq_ops, 0, sizeof(struct octeon_droq_ops));
+
+	droq_ops.fptr = liquidio_push_packet;
+	droq_ops.farg = netdev;
+
+	droq_ops.poll_mode = 1;
+	droq_ops.napi_fn = liquidio_vf_napi_drv_callback;
+	cpu_id = 0;
+	cpu_id_modulus = num_present_cpus();
+
+	/* set up DROQs. */
+	for (q = 0; q < lio->linfo.num_rxpciq; q++) {
+		q_no = lio->linfo.rxpciq[q].s.q_no;
+
+		retval = octeon_setup_droq(
+		    octeon_dev, q_no,
+		    CFG_GET_NUM_RX_DESCS_NIC_IF(octeon_get_conf(octeon_dev),
+						lio->ifidx),
+		    CFG_GET_NUM_RX_BUF_SIZE_NIC_IF(octeon_get_conf(octeon_dev),
+						   lio->ifidx),
+		    NULL);
+		if (retval) {
+			dev_err(&octeon_dev->pci_dev->dev,
+				"%s : Runtime DROQ(RxQ) creation failed.\n",
+				__func__);
+			return 1;
+		}
+
+		droq = octeon_dev->droq[q_no];
+		napi = &droq->napi;
+		netif_napi_add(netdev, napi, liquidio_napi_poll, 64);
+
+		/* designate a CPU for this droq */
+		droq->cpu_id = cpu_id;
+		cpu_id++;
+		if (cpu_id >= cpu_id_modulus)
+			cpu_id = 0;
+
+		octeon_register_droq_ops(octeon_dev, q_no, &droq_ops);
+	}
+
+	/* 23XX VF can send/recv control messages (via the first VF-owned
+	 * droq) from the firmware even if the ethX interface is down,
+	 * so that's why poll_mode must be off for the first droq.
+	 */
+	octeon_dev->droq[0]->ops.poll_mode = 0;
+
 	/* set up IQs. */
 	for (q = 0; q < lio->linfo.num_txpciq; q++) {
 		num_tx_descs = CFG_GET_NUM_TX_DESCS_NIC_IF(
@@ -1189,6 +1577,16 @@ static int liquidio_open(struct net_device *netdev)
 {
 	struct lio *lio = GET_LIO(netdev);
 	struct octeon_device *oct = lio->oct_dev;
+	struct napi_struct *napi, *n;
+
+	if (!oct->props[lio->ifidx].napi_enabled) {
+		list_for_each_entry_safe(napi, n, &netdev->napi_list, dev_list)
+			napi_enable(napi);
+
+		oct->props[lio->ifidx].napi_enabled = 1;
+
+		oct->droq[0]->ops.poll_mode = 1;
+	}
 
 	ifstate_set(lio, LIO_IFSTATE_RUNNING);
 
@@ -1198,6 +1596,9 @@ static int liquidio_open(struct net_device *netdev)
 	netif_info(lio, ifup, lio->netdev, "Interface Open, ready for traffic\n");
 	start_txq(netdev);
 
+	/* tell Octeon to start forwarding packets to host */
+	send_rx_ctrl_cmd(lio, 1);
+
 	dev_info(&oct->pci_dev->dev, "%s interface is opened\n", netdev->name);
 
 	return 0;
@@ -1220,6 +1621,9 @@ static int liquidio_stop(struct net_device *netdev)
 	netif_carrier_off(netdev);
 	lio->link_changes++;
 
+	/* tell Octeon to stop forwarding packets to host */
+	send_rx_ctrl_cmd(lio, 0);
+
 	ifstate_reset(lio, LIO_IFSTATE_RUNNING);
 
 	txqs_stop(netdev);
@@ -2016,6 +2420,8 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
 			goto setup_nic_dev_fail;
 		}
 
+		ifstate_set(lio, LIO_IFSTATE_DROQ_OPS);
+
 		/* For VFs, enable Octeon device interrupts here,
 		 * as this is contingent upon IO queue setup
 		 */
@@ -2026,8 +2432,10 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
 		 * tx and rx queues
 		 */
 		lio->txq = lio->linfo.txpciq[0].s.q_no;
+		lio->rxq = lio->linfo.rxpciq[0].s.q_no;
 
 		lio->tx_qsize = octeon_get_tx_qsize(octeon_dev, lio->txq);
+		lio->rx_qsize = octeon_get_rx_qsize(octeon_dev, lio->rxq);
 
 		if (setup_glists(lio, num_iqueues)) {
 			dev_err(&octeon_dev->pci_dev->dev,
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.c b/drivers/net/ethernet/cavium/liquidio/octeon_device.c
index 583818e..a8df493 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_device.c
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.c
@@ -1374,7 +1374,7 @@ void lio_enable_irq(struct octeon_droq *droq, struct octeon_instr_queue *iq)
 	/*write resend. Writing RESEND in SLI_PKTX_CNTS should be enough
 	 *to trigger tx interrupts as well, if they are pending.
 	 */
-	if (oct && OCTEON_CN23XX_PF(oct)) {
+	if (oct && (OCTEON_CN23XX_PF(oct) || OCTEON_CN23XX_VF(oct))) {
 		if (droq)
 			writeq(CN23XX_INTR_RESEND, droq->pkts_sent_reg);
 		/*we race with firmrware here. read and write the IN_DONE_CNTS*/
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_droq.c b/drivers/net/ethernet/cavium/liquidio/octeon_droq.c
index 8bf1ac76..0be87d1 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_droq.c
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_droq.c
@@ -28,6 +28,7 @@
 #include "cn66xx_regs.h"
 #include "cn66xx_device.h"
 #include "cn23xx_pf_device.h"
+#include "cn23xx_vf_device.h"
 
 struct niclist {
 	struct list_head list;
@@ -261,6 +262,11 @@ int octeon_init_droq(struct octeon_device *oct,
 
 		c_pkts_per_intr = (u32)CFG_GET_OQ_PKTS_PER_INTR(conf23);
 		c_refill_threshold = (u32)CFG_GET_OQ_REFILL_THRESHOLD(conf23);
+	} else if (OCTEON_CN23XX_VF(oct)) {
+		struct octeon_config *conf23 = CHIP_CONF(oct, cn23xx_vf);
+
+		c_pkts_per_intr = (u32)CFG_GET_OQ_PKTS_PER_INTR(conf23);
+		c_refill_threshold = (u32)CFG_GET_OQ_REFILL_THRESHOLD(conf23);
 	} else {
 		return 1;
 	}
@@ -889,6 +895,10 @@ static inline void octeon_droq_drop_packets(struct octeon_device *oct,
 			lio_enable_irq(oct->droq[q_no], oct->instr_queue[q_no]);
 		}
 		break;
+
+		case OCTEON_CN23XX_VF_VID:
+			lio_enable_irq(oct->droq[q_no], oct->instr_queue[q_no]);
+		break;
 		}
 		return 0;
 	}
diff --git a/drivers/net/ethernet/cavium/liquidio/response_manager.c b/drivers/net/ethernet/cavium/liquidio/response_manager.c
index fdaf742..2fbaae9 100644
--- a/drivers/net/ethernet/cavium/liquidio/response_manager.c
+++ b/drivers/net/ethernet/cavium/liquidio/response_manager.c
@@ -84,7 +84,8 @@ int lio_process_ordered_list(struct octeon_device *octeon_dev,
 
 		sc = (struct octeon_soft_command *)ordered_sc_list->
 		    head.next;
-		if (OCTEON_CN23XX_PF(octeon_dev)) {
+		if (OCTEON_CN23XX_PF(octeon_dev) ||
+		    OCTEON_CN23XX_VF(octeon_dev)) {
 			rdp = (struct octeon_instr_rdp *)&sc->cmd.cmd3.rdp;
 			rptr = sc->cmd.cmd3.rptr;
 		} else {
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next V2 5/7] liquidio CN23XX: VF xmit
From: Raghu Vatsavayi @ 2016-12-06 21:06 UTC (permalink / raw)
  To: davem
  Cc: netdev, Raghu Vatsavayi, Raghu Vatsavayi, Derek Chickles,
	Satanand Burla, Felix Manlunas
In-Reply-To: <1481058367-3937-1-git-send-email-rvatsavayi@caviumnetworks.com>

Adds support for transmit functionality in VF.

Signed-off-by: Raghu Vatsavayi <raghu.vatsavayi@caviumnetworks.com>
Signed-off-by: Derek Chickles <derek.chickles@caviumnetworks.com>
Signed-off-by: Satanand Burla <satananda.burla@caviumnetworks.com>
Signed-off-by: Felix Manlunas <felix.manlunas@caviumnetworks.com>
---
 .../ethernet/cavium/liquidio/cn23xx_vf_device.c    |  21 ++
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 340 +++++++++++++++++++++
 .../net/ethernet/cavium/liquidio/request_manager.c |   6 +-
 3 files changed, 364 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/cn23xx_vf_device.c b/drivers/net/ethernet/cavium/liquidio/cn23xx_vf_device.c
index 108e487..b6117b6 100644
--- a/drivers/net/ethernet/cavium/liquidio/cn23xx_vf_device.c
+++ b/drivers/net/ethernet/cavium/liquidio/cn23xx_vf_device.c
@@ -529,6 +529,26 @@ static u64 cn23xx_vf_msix_interrupt_handler(void *dev)
 	return ret;
 }
 
+static u32 cn23xx_update_read_index(struct octeon_instr_queue *iq)
+{
+	u32 pkt_in_done = readl(iq->inst_cnt_reg);
+	u32 last_done;
+	u32 new_idx;
+
+	last_done = pkt_in_done - iq->pkt_in_done;
+	iq->pkt_in_done = pkt_in_done;
+
+	/* Modulo of the new index with the IQ size will give us
+	 * the new index.  The iq->reset_instr_cnt is always zero for
+	 * cn23xx, so no extra adjustments are needed.
+	 */
+	new_idx = (iq->octeon_read_index +
+		   (u32)(last_done & CN23XX_PKT_IN_DONE_CNT_MASK)) %
+		  iq->max_count;
+
+	return new_idx;
+}
+
 static void cn23xx_enable_vf_interrupt(struct octeon_device *oct, u8 intr_flag)
 {
 	struct octeon_cn23xx_vf *cn23xx = (struct octeon_cn23xx_vf *)oct->chip;
@@ -660,6 +680,7 @@ int cn23xx_setup_octeon_vf_device(struct octeon_device *oct)
 	oct->fn_list.msix_interrupt_handler = cn23xx_vf_msix_interrupt_handler;
 
 	oct->fn_list.setup_device_regs = cn23xx_setup_vf_device_regs;
+	oct->fn_list.update_iq_read_idx = cn23xx_update_read_index;
 
 	oct->fn_list.enable_interrupt = cn23xx_enable_vf_interrupt;
 	oct->fn_list.disable_interrupt = cn23xx_disable_vf_interrupt;
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index e4ee6ec..cf80722 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -55,6 +55,21 @@ struct liquidio_if_cfg_resp {
 	u64 status;
 };
 
+union tx_info {
+	u64 u64;
+	struct {
+#ifdef __BIG_ENDIAN_BITFIELD
+		u16 gso_size;
+		u16 gso_segs;
+		u32 reserved;
+#else
+		u32 reserved;
+		u16 gso_segs;
+		u16 gso_size;
+#endif
+	} s;
+};
+
 #define OCTNIC_MAX_SG  (MAX_SKB_FRAGS)
 
 #define OCTNIC_GSO_MAX_HEADER_SIZE 128
@@ -255,6 +270,19 @@ static void start_txq(struct net_device *netdev)
 }
 
 /**
+ * \brief Stop a queue
+ * @param netdev network device
+ * @param q which queue to stop
+ */
+static inline void stop_q(struct net_device *netdev, int q)
+{
+	if (netif_is_multiqueue(netdev))
+		netif_stop_subqueue(netdev, q);
+	else
+		netif_stop_queue(netdev);
+}
+
+/**
  * Remove the node at the head of the list. The list would be empty at
  * the end of this call if there are no more nodes in the list.
  */
@@ -945,6 +973,45 @@ static u16 select_q(struct net_device *dev, struct sk_buff *skb,
 }
 
 /**
+ * \brief Setup input and output queues
+ * @param octeon_dev octeon device
+ * @param ifidx Interface index
+ *
+ * Note: Queues are with respect to the octeon device. Thus
+ * an input queue is for egress packets, and output queues
+ * are for ingress packets.
+ */
+static inline int setup_io_queues(struct octeon_device *octeon_dev, int ifidx)
+{
+	struct net_device *netdev;
+	int num_tx_descs;
+	struct lio *lio;
+	int retval = 0;
+	int q;
+
+	netdev = octeon_dev->props[ifidx].netdev;
+
+	lio = GET_LIO(netdev);
+
+	/* set up IQs. */
+	for (q = 0; q < lio->linfo.num_txpciq; q++) {
+		num_tx_descs = CFG_GET_NUM_TX_DESCS_NIC_IF(
+		    octeon_get_conf(octeon_dev), lio->ifidx);
+		retval = octeon_setup_iq(octeon_dev, ifidx, q,
+					 lio->linfo.txpciq[q], num_tx_descs,
+					 netdev_get_tx_queue(netdev, q));
+		if (retval) {
+			dev_err(&octeon_dev->pci_dev->dev,
+				" %s : Runtime IQ(TxQ) creation failed.\n",
+				__func__);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/**
  * \brief Net device open for LiquidIO
  * @param netdev network device
  */
@@ -1180,6 +1247,259 @@ static int liquidio_change_mtu(struct net_device *netdev, int new_mtu)
 	return 0;
 }
 
+/** \brief Transmit network packets to the Octeon interface
+ * @param skb      skbuff struct holding the packet to be transmitted
+ * @param netdev   pointer to network device
+ * @returns whether the packet was transmitted to the device okay or not
+ *             (NETDEV_TX_OK or NETDEV_TX_BUSY)
+ */
+static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+	struct octnet_buf_free_info *finfo;
+	union octnic_cmd_setup cmdsetup;
+	struct octnic_data_pkt ndata;
+	struct octeon_instr_irh *irh;
+	struct oct_iq_stats *stats;
+	struct octeon_device *oct;
+	int q_idx = 0, iq_no = 0;
+	union tx_info *tx_info;
+	struct lio *lio;
+	int status = 0;
+	u64 dptr = 0;
+	u32 tag = 0;
+	int j;
+
+	lio = GET_LIO(netdev);
+	oct = lio->oct_dev;
+
+	if (netif_is_multiqueue(netdev)) {
+		q_idx = skb->queue_mapping;
+		q_idx = (q_idx % (lio->linfo.num_txpciq));
+		tag = q_idx;
+		iq_no = lio->linfo.txpciq[q_idx].s.q_no;
+	} else {
+		iq_no = lio->txq;
+	}
+
+	stats = &oct->instr_queue[iq_no]->stats;
+
+	/* Check for all conditions in which the current packet cannot be
+	 * transmitted.
+	 */
+	if (!(atomic_read(&lio->ifstate) & LIO_IFSTATE_RUNNING) ||
+	    (!lio->linfo.link.s.link_up) || (skb->len <= 0)) {
+		netif_info(lio, tx_err, lio->netdev, "Transmit failed link_status : %d\n",
+			   lio->linfo.link.s.link_up);
+		goto lio_xmit_failed;
+	}
+
+	/* Use space in skb->cb to store info used to unmap and
+	 * free the buffers.
+	 */
+	finfo = (struct octnet_buf_free_info *)skb->cb;
+	finfo->lio = lio;
+	finfo->skb = skb;
+	finfo->sc = NULL;
+
+	/* Prepare the attributes for the data to be passed to OSI. */
+	memset(&ndata, 0, sizeof(struct octnic_data_pkt));
+
+	ndata.buf = finfo;
+
+	ndata.q_no = iq_no;
+
+	if (netif_is_multiqueue(netdev)) {
+		if (octnet_iq_is_full(oct, ndata.q_no)) {
+			/* defer sending if queue is full */
+			netif_info(lio, tx_err, lio->netdev, "Transmit failed iq:%d full\n",
+				   ndata.q_no);
+			stats->tx_iq_busy++;
+			return NETDEV_TX_BUSY;
+		}
+	} else {
+		if (octnet_iq_is_full(oct, lio->txq)) {
+			/* defer sending if queue is full */
+			stats->tx_iq_busy++;
+			netif_info(lio, tx_err, lio->netdev, "Transmit failed iq:%d full\n",
+				   ndata.q_no);
+			return NETDEV_TX_BUSY;
+		}
+	}
+
+	ndata.datasize = skb->len;
+
+	cmdsetup.u64 = 0;
+	cmdsetup.s.iq_no = iq_no;
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
+		cmdsetup.s.transport_csum = 1;
+
+	if (!skb_shinfo(skb)->nr_frags) {
+		cmdsetup.s.u.datasize = skb->len;
+		octnet_prepare_pci_cmd(oct, &ndata.cmd, &cmdsetup, tag);
+		/* Offload checksum calculation for TCP/UDP packets */
+		dptr = dma_map_single(&oct->pci_dev->dev,
+				      skb->data,
+				      skb->len,
+				      DMA_TO_DEVICE);
+		if (dma_mapping_error(&oct->pci_dev->dev, dptr)) {
+			dev_err(&oct->pci_dev->dev, "%s DMA mapping error 1\n",
+				__func__);
+			return NETDEV_TX_BUSY;
+		}
+
+		ndata.cmd.cmd3.dptr = dptr;
+		finfo->dptr = dptr;
+		ndata.reqtype = REQTYPE_NORESP_NET;
+
+	} else {
+		struct skb_frag_struct *frag;
+		struct octnic_gather *g;
+		int i, frags;
+
+		spin_lock(&lio->glist_lock[q_idx]);
+		g = (struct octnic_gather *)list_delete_head(
+		    &lio->glist[q_idx]);
+		spin_unlock(&lio->glist_lock[q_idx]);
+
+		if (!g) {
+			netif_info(lio, tx_err, lio->netdev,
+				   "Transmit scatter gather: glist null!\n");
+			goto lio_xmit_failed;
+		}
+
+		cmdsetup.s.gather = 1;
+		cmdsetup.s.u.gatherptrs = (skb_shinfo(skb)->nr_frags + 1);
+		octnet_prepare_pci_cmd(oct, &ndata.cmd, &cmdsetup, tag);
+
+		memset(g->sg, 0, g->sg_size);
+
+		g->sg[0].ptr[0] = dma_map_single(&oct->pci_dev->dev,
+						 skb->data,
+						 (skb->len - skb->data_len),
+						 DMA_TO_DEVICE);
+		if (dma_mapping_error(&oct->pci_dev->dev, g->sg[0].ptr[0])) {
+			dev_err(&oct->pci_dev->dev, "%s DMA mapping error 2\n",
+				__func__);
+			return NETDEV_TX_BUSY;
+		}
+		add_sg_size(&g->sg[0], (skb->len - skb->data_len), 0);
+
+		frags = skb_shinfo(skb)->nr_frags;
+		i = 1;
+		while (frags--) {
+			frag = &skb_shinfo(skb)->frags[i - 1];
+
+			g->sg[(i >> 2)].ptr[(i & 3)] =
+				dma_map_page(&oct->pci_dev->dev,
+					     frag->page.p,
+					     frag->page_offset,
+					     frag->size,
+					     DMA_TO_DEVICE);
+			if (dma_mapping_error(&oct->pci_dev->dev,
+					      g->sg[i >> 2].ptr[i & 3])) {
+				dma_unmap_single(&oct->pci_dev->dev,
+						 g->sg[0].ptr[0],
+						 skb->len - skb->data_len,
+						 DMA_TO_DEVICE);
+				for (j = 1; j < i; j++) {
+					frag = &skb_shinfo(skb)->frags[j - 1];
+					dma_unmap_page(&oct->pci_dev->dev,
+						       g->sg[j >> 2].ptr[j & 3],
+						       frag->size,
+						       DMA_TO_DEVICE);
+				}
+				dev_err(&oct->pci_dev->dev, "%s DMA mapping error 3\n",
+					__func__);
+				return NETDEV_TX_BUSY;
+			}
+
+			add_sg_size(&g->sg[(i >> 2)], frag->size, (i & 3));
+			i++;
+		}
+
+		dptr = dma_map_single(&oct->pci_dev->dev,
+				      g->sg, g->sg_size,
+				      DMA_TO_DEVICE);
+		if (dma_mapping_error(&oct->pci_dev->dev, dptr)) {
+			dev_err(&oct->pci_dev->dev, "%s DMA mapping error 4\n",
+				__func__);
+			dma_unmap_single(&oct->pci_dev->dev, g->sg[0].ptr[0],
+					 skb->len - skb->data_len,
+					 DMA_TO_DEVICE);
+			for (j = 1; j <= frags; j++) {
+				frag = &skb_shinfo(skb)->frags[j - 1];
+				dma_unmap_page(&oct->pci_dev->dev,
+					       g->sg[j >> 2].ptr[j & 3],
+					       frag->size, DMA_TO_DEVICE);
+			}
+			return NETDEV_TX_BUSY;
+		}
+
+		ndata.cmd.cmd3.dptr = dptr;
+		finfo->dptr = dptr;
+		finfo->g = g;
+
+		ndata.reqtype = REQTYPE_NORESP_NET_SG;
+	}
+
+	irh = (struct octeon_instr_irh *)&ndata.cmd.cmd3.irh;
+	tx_info = (union tx_info *)&ndata.cmd.cmd3.ossp[0];
+
+	if (skb_shinfo(skb)->gso_size) {
+		tx_info->s.gso_size = skb_shinfo(skb)->gso_size;
+		tx_info->s.gso_segs = skb_shinfo(skb)->gso_segs;
+	}
+
+	status = octnet_send_nic_data_pkt(oct, &ndata);
+	if (status == IQ_SEND_FAILED)
+		goto lio_xmit_failed;
+
+	netif_info(lio, tx_queued, lio->netdev, "Transmit queued successfully\n");
+
+	if (status == IQ_SEND_STOP) {
+		dev_err(&oct->pci_dev->dev, "Rcvd IQ_SEND_STOP signal; stopping IQ-%d\n",
+			iq_no);
+		stop_q(lio->netdev, q_idx);
+	}
+
+	netif_trans_update(netdev);
+
+	if (skb_shinfo(skb)->gso_size)
+		stats->tx_done += skb_shinfo(skb)->gso_segs;
+	else
+		stats->tx_done++;
+	stats->tx_tot_bytes += skb->len;
+
+	return NETDEV_TX_OK;
+
+lio_xmit_failed:
+	stats->tx_dropped++;
+	netif_info(lio, tx_err, lio->netdev, "IQ%d Transmit dropped:%llu\n",
+		   iq_no, stats->tx_dropped);
+	if (dptr)
+		dma_unmap_single(&oct->pci_dev->dev, dptr,
+				 ndata.datasize, DMA_TO_DEVICE);
+	tx_buffer_free(skb);
+	return NETDEV_TX_OK;
+}
+
+/** \brief Network device Tx timeout
+ * @param netdev    pointer to network device
+ */
+static void liquidio_tx_timeout(struct net_device *netdev)
+{
+	struct lio *lio;
+
+	lio = GET_LIO(netdev);
+
+	netif_info(lio, tx_err, lio->netdev,
+		   "Transmit timeout tx_dropped:%ld, waking up queues now!!\n",
+		   netdev->stats.tx_dropped);
+	netif_trans_update(netdev);
+	txqs_wake(netdev);
+}
+
 /** Sending command to enable/disable RX checksum offload
  * @param netdev                pointer to network device
  * @param command               OCTNET_CMD_TNL_RX_CSUM_CTL
@@ -1282,8 +1602,10 @@ static int liquidio_set_features(struct net_device *netdev,
 static const struct net_device_ops lionetdevops = {
 	.ndo_open		= liquidio_open,
 	.ndo_stop		= liquidio_stop,
+	.ndo_start_xmit		= liquidio_xmit,
 	.ndo_set_mac_address	= liquidio_set_mac,
 	.ndo_set_rx_mode	= liquidio_set_mcast_list,
+	.ndo_tx_timeout		= liquidio_tx_timeout,
 	.ndo_change_mtu		= liquidio_change_mtu,
 	.ndo_fix_features	= liquidio_fix_features,
 	.ndo_set_features	= liquidio_set_features,
@@ -1507,6 +1829,24 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
 		/* Copy MAC Address to OS network device structure */
 		ether_addr_copy(netdev->dev_addr, mac);
 
+		if (setup_io_queues(octeon_dev, i)) {
+			dev_err(&octeon_dev->pci_dev->dev, "I/O queues creation failed\n");
+			goto setup_nic_dev_fail;
+		}
+
+		/* For VFs, enable Octeon device interrupts here,
+		 * as this is contingent upon IO queue setup
+		 */
+		octeon_dev->fn_list.enable_interrupt(octeon_dev,
+						     OCTEON_ALL_INTR);
+
+		/* By default all interfaces on a single Octeon uses the same
+		 * tx and rx queues
+		 */
+		lio->txq = lio->linfo.txpciq[0].s.q_no;
+
+		lio->tx_qsize = octeon_get_tx_qsize(octeon_dev, lio->txq);
+
 		if (setup_glists(lio, num_iqueues)) {
 			dev_err(&octeon_dev->pci_dev->dev,
 				"Gather list allocation failed\n");
diff --git a/drivers/net/ethernet/cavium/liquidio/request_manager.c b/drivers/net/ethernet/cavium/liquidio/request_manager.c
index ea2b7e4..3ce6675 100644
--- a/drivers/net/ethernet/cavium/liquidio/request_manager.c
+++ b/drivers/net/ethernet/cavium/liquidio/request_manager.c
@@ -394,7 +394,7 @@ static inline void __copy_cmd_into_iq(struct octeon_instr_queue *iq,
 		case REQTYPE_SOFT_COMMAND:
 			sc = buf;
 
-			if (OCTEON_CN23XX_PF(oct))
+			if (OCTEON_CN23XX_PF(oct) || OCTEON_CN23XX_VF(oct))
 				irh = (struct octeon_instr_irh *)
 					&sc->cmd.cmd3.irh;
 			else
@@ -607,7 +607,7 @@ static void check_db_timeout(struct work_struct *work)
 
 	oct_cfg = octeon_get_conf(oct);
 
-	if (OCTEON_CN23XX_PF(oct)) {
+	if (OCTEON_CN23XX_PF(oct) || OCTEON_CN23XX_VF(oct)) {
 		ih3 = (struct octeon_instr_ih3 *)&sc->cmd.cmd3.ih3;
 
 		ih3->pkind = oct->instr_queue[sc->iq_no]->txpciq.s.pkind;
@@ -700,7 +700,7 @@ int octeon_send_soft_command(struct octeon_device *oct,
 	struct octeon_instr_irh *irh;
 	u32 len;
 
-	if (OCTEON_CN23XX_PF(oct)) {
+	if (OCTEON_CN23XX_PF(oct) || OCTEON_CN23XX_VF(oct)) {
 		ih3 =  (struct octeon_instr_ih3 *)&sc->cmd.cmd3.ih3;
 		if (ih3->dlengsz) {
 			WARN_ON(!sc->dmadptr);
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next V2 0/7] liquidio VF data path
From: Raghu Vatsavayi @ 2016-12-06 21:06 UTC (permalink / raw)
  To: davem; +Cc: netdev, Raghu Vatsavayi

Dave,

The following patch series adds support for VF data path
related features. I have removed unnecessary "void *"
casting. Please apply the patches in the following order, as
some of them depend on earlier patches.

Raghu Vatsavayi (7):
  liquidio CN23XX: VF offload features
  liquidio CN23XX: VF link status
  liquidio CN23XX: VF mac address
  liquidio CN23XX: VF scatter gather lists
  liquidio CN23XX: VF xmit
  liquidio CN23XX: VF TX buffers
  liquidio VF rx data and ctl path

 .../ethernet/cavium/liquidio/cn23xx_vf_device.c    |   21 +
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 2094 +++++++++++++++++++-
 .../net/ethernet/cavium/liquidio/liquidio_common.h |    1 +
 .../net/ethernet/cavium/liquidio/octeon_device.c   |    5 +-
 drivers/net/ethernet/cavium/liquidio/octeon_droq.c |   10 +
 .../net/ethernet/cavium/liquidio/octeon_network.h  |    1 +
 .../net/ethernet/cavium/liquidio/request_manager.c |    6 +-
 .../ethernet/cavium/liquidio/response_manager.c    |    3 +-
 8 files changed, 2124 insertions(+), 17 deletions(-)

-- 
1.8.3.1

^ permalink raw reply

* [PATCH net-next V2 3/7] liquidio CN23XX: VF mac address
From: Raghu Vatsavayi @ 2016-12-06 21:06 UTC (permalink / raw)
  To: davem
  Cc: netdev, Raghu Vatsavayi, Raghu Vatsavayi, Derek Chickles,
	Satanand Burla, Felix Manlunas
In-Reply-To: <1481058367-3937-1-git-send-email-rvatsavayi@caviumnetworks.com>

Adds support for configuring the MTU, multicast list and MAC address.

Signed-off-by: Raghu Vatsavayi <raghu.vatsavayi@caviumnetworks.com>
Signed-off-by: Derek Chickles <derek.chickles@caviumnetworks.com>
Signed-off-by: Satanand Burla <satananda.burla@caviumnetworks.com>
Signed-off-by: Felix Manlunas <felix.manlunas@caviumnetworks.com>
---
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 195 +++++++++++++++++++++
 .../net/ethernet/cavium/liquidio/liquidio_common.h |   1 +
 .../net/ethernet/cavium/liquidio/octeon_network.h  |   1 +
 3 files changed, 197 insertions(+)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index eddd77f..0e23e2f 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -864,6 +864,194 @@ static int liquidio_stop(struct net_device *netdev)
 	return 0;
 }
 
+/**
+ * \brief Build the Octeon interface flag mask for a net device
+ * @param netdev network device
+ *
+ * Translates the IFF_* bits of netdev->flags into OCTNET_IFFLAG_* bits.
+ * OCTNET_IFFLAG_UNICAST is always set. With IFF_MULTICAST set, a list
+ * longer than MAX_OCTEON_MULTICAST_ADDR also forces OCTNET_IFFLAG_ALLMULTI,
+ * since that many addresses cannot be handled individually.
+ */
+static inline enum octnet_ifflags get_new_flags(struct net_device *netdev)
+{
+	enum octnet_ifflags mask = OCTNET_IFFLAG_UNICAST;
+
+	if (netdev->flags & IFF_BROADCAST)
+		mask |= OCTNET_IFFLAG_BROADCAST;
+
+	if (netdev->flags & IFF_PROMISC)
+		mask |= OCTNET_IFFLAG_PROMISC;
+
+	if (netdev->flags & IFF_ALLMULTI)
+		mask |= OCTNET_IFFLAG_ALLMULTI;
+
+	if (netdev->flags & IFF_MULTICAST) {
+		mask |= OCTNET_IFFLAG_MULTICAST;
+
+		/* More addresses than we can handle: accept them all */
+		if (netdev_mc_count(netdev) > MAX_OCTEON_MULTICAST_ADDR)
+			mask |= OCTNET_IFFLAG_ALLMULTI;
+	}
+
+	return mask;
+}
+
+static void liquidio_set_uc_list(struct net_device *netdev)
+{
+	struct lio *lio = GET_LIO(netdev);
+	struct octeon_device *oct = lio->oct_dev;
+	int uc_count = netdev_uc_count(netdev);
+	struct octnic_ctrl_pkt nctrl;
+	struct netdev_hw_addr *ha;
+	u64 *mac;
+
+	if (lio->netdev_uc_count == uc_count)
+		return;
+
+	if (uc_count > MAX_NCTRL_UDD) {
+		dev_err(&oct->pci_dev->dev, "too many MAC addresses in netdev uc list\n");
+		return;
+	}
+
+	memset(&nctrl, 0, sizeof(struct octnic_ctrl_pkt));
+	nctrl.ncmd.s.cmd = OCTNET_CMD_SET_UC_LIST;
+	nctrl.ncmd.s.more = uc_count;
+	nctrl.ncmd.s.param1 = oct->vf_num;
+	nctrl.iq_no = lio->linfo.txpciq[0].s.q_no;
+	nctrl.netpndev = (u64)netdev;
+	nctrl.cb_fn = liquidio_link_ctrl_cmd_completion;
+	/* copy each address into bytes 2..7 of its own udd word */
+	mac = &nctrl.udd[0];
+	netdev_for_each_uc_addr(ha, netdev) {
+		ether_addr_copy(((u8 *)mac) + 2, ha->addr);
+		mac++;
+	}
+
+	/* Cache the count only if the command was queued; else retry next time */
+	if (octnet_send_nic_ctrl_pkt(lio->oct_dev, &nctrl) >= 0)
+		lio->netdev_uc_count = uc_count;
+}
+
+/**
+ * \brief Net device set_rx_mode handler (multicast list, then unicast list)
+ * @param netdev network device
+ */
+static void liquidio_set_mcast_list(struct net_device *netdev)
+{
+	int mc_count = min(netdev_mc_count(netdev), MAX_OCTEON_MULTICAST_ADDR);
+	struct lio *lio = GET_LIO(netdev);
+	struct octeon_device *oct = lio->oct_dev;
+	struct octnic_ctrl_pkt nctrl;
+	struct netdev_hw_addr *ha;
+	u64 *mc;
+	int ret;
+
+	memset(&nctrl, 0, sizeof(struct octnic_ctrl_pkt));
+
+	/* Create a ctrl pkt command to be sent to core app. */
+	nctrl.ncmd.u64 = 0;
+	nctrl.ncmd.s.cmd = OCTNET_CMD_SET_MULTI_LIST;
+	nctrl.ncmd.s.param1 = get_new_flags(netdev);
+	nctrl.ncmd.s.param2 = mc_count;
+	nctrl.ncmd.s.more = mc_count;
+	nctrl.netpndev = (u64)netdev;
+	nctrl.cb_fn = liquidio_link_ctrl_cmd_completion;
+
+	/* copy at most mc_count addresses into the udd */
+	mc = &nctrl.udd[0];
+	netdev_for_each_mc_addr(ha, netdev) {
+		*mc = 0;
+		ether_addr_copy(((u8 *)mc) + 2, ha->addr);
+		/* no need to swap bytes; stop after the mc_count-th entry */
+		if (++mc >= &nctrl.udd[mc_count])
+			break;
+	}
+
+	nctrl.iq_no = lio->linfo.txpciq[0].s.q_no;
+
+	/* Apparently, any activity in this call from the kernel has to
+	 * be atomic. So we won't wait for response.
+	 */
+	nctrl.wait_time = 0;
+
+	ret = octnet_send_nic_ctrl_pkt(lio->oct_dev, &nctrl);
+	if (ret < 0) {
+		dev_err(&oct->pci_dev->dev, "DEVFLAGS change failed in core (ret: 0x%x)\n",
+			ret);
+	}
+
+	liquidio_set_uc_list(netdev);
+}
+
+/**
+ * \brief Net device set_mac_address handler
+ * @param netdev network device
+ * @param p struct sockaddr holding the requested MAC address
+ */
+static int liquidio_set_mac(struct net_device *netdev, void *p)
+{
+	struct lio *lio = GET_LIO(netdev);
+	struct octeon_device *oct = lio->oct_dev;
+	struct octnic_ctrl_pkt nctrl;
+	struct sockaddr *sa = p;
+	int ret;
+
+	if (!is_valid_ether_addr(sa->sa_data))
+		return -EADDRNOTAVAIL;
+
+	if (ether_addr_equal(sa->sa_data, netdev->dev_addr))
+		return 0;
+
+	if (lio->linfo.macaddr_is_admin_asgnd)
+		return -EPERM;
+
+	memset(&nctrl, 0, sizeof(struct octnic_ctrl_pkt));
+	nctrl.ncmd.u64 = 0;
+	nctrl.ncmd.s.cmd = OCTNET_CMD_CHANGE_MACADDR;
+	nctrl.ncmd.s.more = 1;
+	nctrl.ncmd.s.param1 = 0;
+	nctrl.wait_time = 100;
+	nctrl.cb_fn = liquidio_link_ctrl_cmd_completion;
+	nctrl.netpndev = (u64)netdev;
+	nctrl.iq_no = lio->linfo.txpciq[0].s.q_no;
+
+	/* The MAC Address is presented in network byte order. */
+	nctrl.udd[0] = 0;
+	ether_addr_copy((u8 *)&nctrl.udd[0] + 2, sa->sa_data);
+
+	ret = octnet_send_nic_ctrl_pkt(lio->oct_dev, &nctrl);
+	if (ret < 0) {
+		dev_err(&oct->pci_dev->dev, "MAC Address change failed\n");
+		return -ENOMEM;
+	}
+
+	memcpy(netdev->dev_addr, sa->sa_data, netdev->addr_len);
+	ether_addr_copy(((u8 *)&lio->linfo.hw_addr) + 2, sa->sa_data);
+	return 0;
+}
+
+/**
+ * \brief Net device change_mtu handler
+ * @param netdev network device
+ * @param new_mtu MTU to record in both lio->mtu and netdev->mtu
+ */
+static int liquidio_change_mtu(struct net_device *netdev, int new_mtu)
+{
+	struct lio *lio = GET_LIO(netdev);
+	struct octeon_device *oct = lio->oct_dev;
+
+	/* Log first, while netdev->mtu still holds the previous value */
+	netif_info(lio, probe, lio->netdev, "MTU Changed from %d to %d\n",
+		   netdev->mtu, new_mtu);
+	dev_info(&oct->pci_dev->dev, "%s MTU Changed from %d to %d\n",
+		 netdev->name, netdev->mtu, new_mtu);
+
+	netdev->mtu = new_mtu;
+	lio->mtu = new_mtu;
+	return 0;
+}
+
 /** Sending command to enable/disable RX checksum offload
  * @param netdev                pointer to network device
  * @param command               OCTNET_CMD_TNL_RX_CSUM_CTL
@@ -966,6 +1154,9 @@ static int liquidio_set_features(struct net_device *netdev,
 static const struct net_device_ops lionetdevops = {
 	.ndo_open		= liquidio_open,
 	.ndo_stop		= liquidio_stop,
+	.ndo_set_mac_address	= liquidio_set_mac,
+	.ndo_set_rx_mode	= liquidio_set_mcast_list,
+	.ndo_change_mtu		= liquidio_change_mtu,
 	.ndo_fix_features	= liquidio_fix_features,
 	.ndo_set_features	= liquidio_set_features,
 	.ndo_select_queue	= select_q,
@@ -1165,6 +1356,10 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
 
 		netdev->hw_features = lio->dev_capability;
 
+		/* MTU range: 68 - 16000 */
+		netdev->min_mtu = LIO_MIN_MTU_SIZE;
+		netdev->max_mtu = LIO_MAX_MTU_SIZE;
+
 		/* Point to the  properties for octeon device to which this
 		 * interface belongs.
 		 */
diff --git a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
index f308ee4..ba329f6 100644
--- a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
+++ b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
@@ -212,6 +212,7 @@ static inline void add_sg_size(struct octeon_sg_entry *sg_entry,
 
 #define   OCTNET_CMD_ID_ACTIVE         0x1a
 
+#define   OCTNET_CMD_SET_UC_LIST       0x1b
 #define   OCTNET_CMD_SET_VF_LINKSTATE  0x1c
 #define   OCTNET_CMD_VXLAN_PORT_ADD    0x0
 #define   OCTNET_CMD_VXLAN_PORT_DEL    0x1
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_network.h b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
index e94edc8..6bb8941 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_network.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
@@ -123,6 +123,7 @@ struct lio {
 	/* work queue for  link status */
 	struct cavium_wq	link_status_wq;
 
+	int netdev_uc_count;
 };
 
 #define LIO_SIZE         (sizeof(struct lio))
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next V2 2/7] liquidio CN23XX: VF link status
From: Raghu Vatsavayi @ 2016-12-06 21:06 UTC (permalink / raw)
  To: davem
  Cc: netdev, Raghu Vatsavayi, Raghu Vatsavayi, Derek Chickles,
	Satanand Burla, Felix Manlunas
In-Reply-To: <1481058367-3937-1-git-send-email-rvatsavayi@caviumnetworks.com>

Adds support for VF link status related changes.

Signed-off-by: Raghu Vatsavayi <raghu.vatsavayi@caviumnetworks.com>
Signed-off-by: Derek Chickles <derek.chickles@caviumnetworks.com>
Signed-off-by: Satanand Burla <satananda.burla@caviumnetworks.com>
Signed-off-by: Felix Manlunas <felix.manlunas@caviumnetworks.com>
---
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 249 +++++++++++++++++++++
 1 file changed, 249 insertions(+)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index 81a578f..eddd77f 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -163,6 +163,186 @@ static inline void ifstate_reset(struct lio *lio, int state_flag)
 	atomic_set(&lio->ifstate, (atomic_read(&lio->ifstate) & ~(state_flag)));
 }
 
+/**
+ * \brief Stop every Tx queue of the device
+ * @param netdev network device
+ */
+static inline void txqs_stop(struct net_device *netdev)
+{
+	int q;
+
+	if (!netif_is_multiqueue(netdev)) {
+		netif_stop_queue(netdev);
+		return;
+	}
+	for (q = 0; q < netdev->num_tx_queues; q++)
+		netif_stop_subqueue(netdev, q);
+}
+
+/**
+ * \brief Start every Tx queue of the device
+ * @param netdev network device
+ */
+static inline void txqs_start(struct net_device *netdev)
+{
+	int q;
+
+	if (!netif_is_multiqueue(netdev)) {
+		netif_start_queue(netdev);
+		return;
+	}
+	for (q = 0; q < netdev->num_tx_queues; q++)
+		netif_start_subqueue(netdev, q);
+}
+
+/**
+ * \brief Wake Tx queues
+ * @param netdev network device
+ *
+ * tx_restart is counted on the instruction queue behind each queue woken.
+ */
+static inline void txqs_wake(struct net_device *netdev)
+{
+	struct lio *lio = GET_LIO(netdev);
+	int q, iq;
+
+	if (!netif_is_multiqueue(netdev)) {
+		INCR_INSTRQUEUE_PKT_COUNT(lio->oct_dev, lio->txq,
+					  tx_restart, 1);
+		netif_wake_queue(netdev);
+		return;
+	}
+
+	for (q = 0; q < netdev->num_tx_queues; q++) {
+		iq = lio->linfo.txpciq[q % (lio->linfo.num_txpciq)].s.q_no;
+		if (!__netif_subqueue_stopped(netdev, q))
+			continue;
+		INCR_INSTRQUEUE_PKT_COUNT(lio->oct_dev, iq, tx_restart, 1);
+		netif_wake_subqueue(netdev, q);
+	}
+}
+
+/**
+ * \brief Start the Tx queues, but only while the link is up
+ * @param netdev network device
+ */
+static void start_txq(struct net_device *netdev)
+{
+	struct lio *lio = GET_LIO(netdev);
+
+	/* Leave the queues untouched when the link is down */
+	if (!lio->linfo.link.s.link_up)
+		return;
+	txqs_start(netdev);
+}
+
+/**
+ * \brief Log the link state once LIO_IFSTATE_REGISTERED is set
+ * @param netdev network device
+ */
+static void print_link_info(struct net_device *netdev)
+{
+	struct lio *lio = GET_LIO(netdev);
+	struct oct_link_info *linfo = &lio->linfo;
+
+	if (!(atomic_read(&lio->ifstate) & LIO_IFSTATE_REGISTERED))
+		return;
+
+	if (!linfo->link.s.link_up) {
+		netif_info(lio, link, lio->netdev, "Link Down\n");
+	} else {
+		netif_info(lio, link, lio->netdev, "%d Mbps %s Duplex UP\n",
+			   linfo->link.s.speed,
+			   (linfo->link.s.duplex) ? "Full" : "Half");
+	}
+}
+
+/**
+ * \brief Work handler that raises a NETDEV_CHANGEMTU notification
+ * @param work work_struct, cast back to the struct cavium_wk that carries it
+ */
+static void octnet_link_status_change(struct work_struct *work)
+{
+	struct cavium_wk *item = (struct cavium_wk *)work;
+	struct lio *lio = (struct lio *)item->ctxptr;
+
+	rtnl_lock();
+	call_netdevice_notifiers(NETDEV_CHANGEMTU, lio->netdev);
+	rtnl_unlock();
+}
+
+/**
+ * \brief Create the workqueue and delayed work used for link status changes
+ * @param netdev network device
+ * @return 0 on success, -1 if the workqueue could not be allocated
+ */
+static inline int setup_link_status_change_wq(struct net_device *netdev)
+{
+	struct lio *lio = GET_LIO(netdev);
+	struct cavium_wq *lswq = &lio->link_status_wq;
+
+	lswq->wq = alloc_workqueue("link-status", WQ_MEM_RECLAIM, 0);
+	if (!lswq->wq) {
+		dev_err(&lio->oct_dev->pci_dev->dev,
+			"unable to create cavium link status wq\n");
+		return -1;
+	}
+
+	INIT_DELAYED_WORK(&lswq->wk.work, octnet_link_status_change);
+	lswq->wk.ctxptr = lio;
+	return 0;
+}
+
+static inline void cleanup_link_status_change_wq(struct net_device *netdev)
+{
+	struct lio *lio = GET_LIO(netdev);
+
+	if (!lio->link_status_wq.wq)
+		return;	/* no workqueue to tear down */
+	cancel_delayed_work_sync(&lio->link_status_wq.wk.work);
+	destroy_workqueue(lio->link_status_wq.wq);
+}
+
+/**
+ * \brief Update link status
+ * @param netdev network device
+ * @param ls link status structure
+ *
+ * Called on receipt of a link status response from the core application.
+ * Does nothing while the interface is closed or when ls matches the cache.
+ */
+static inline void update_link_status(struct net_device *netdev,
+				      union oct_link_status *ls)
+{
+	struct lio *lio = GET_LIO(netdev);
+	struct octeon_device *oct = lio->oct_dev;
+
+	if (!(lio->intf_open) || (lio->linfo.link.u64 == ls->u64))
+		return;
+	lio->linfo.link.u64 = ls->u64;
+	print_link_info(netdev);
+	lio->link_changes++;
+
+	if (!lio->linfo.link.s.link_up) {
+		netif_carrier_off(netdev);
+		txqs_stop(netdev);
+	} else {
+		netif_carrier_on(netdev);
+		txqs_wake(netdev);
+	}
+
+	if (lio->linfo.link.s.mtu >= netdev->mtu)
+		return;
+
+	dev_warn(&oct->pci_dev->dev,
+		 "PF has changed the MTU for gmx port. Reducing the mtu from %d to %d\n",
+		 netdev->mtu, lio->linfo.link.s.mtu);
+	lio->mtu = lio->linfo.link.s.mtu;
+	netdev->mtu = lio->linfo.link.s.mtu;
+	queue_delayed_work(lio->link_status_wq.wq,
+			   &lio->link_status_wq.wk.work, 0);
+}
+
 static
 int liquidio_schedule_msix_droq_pkt_handler(struct octeon_droq *droq, u64 ret)
 {
@@ -499,6 +679,8 @@ static void liquidio_destroy_nic_device(struct octeon_device *oct, int ifidx)
 	if (atomic_read(&lio->ifstate) & LIO_IFSTATE_REGISTERED)
 		unregister_netdev(netdev);
 
+	cleanup_link_status_change_wq(netdev);
+
 	free_netdev(netdev);
 
 	oct->props[ifidx].gmxport = -1;
@@ -635,6 +817,28 @@ static u16 select_q(struct net_device *dev, struct sk_buff *skb,
 }
 
 /**
+ * \brief Net device open for LiquidIO
+ * @param netdev network device
+ */
+static int liquidio_open(struct net_device *netdev)
+{
+	struct lio *lio = GET_LIO(netdev);
+
+	ifstate_set(lio, LIO_IFSTATE_RUNNING);
+
+	/* From here on update_link_status() acts on link reports */
+	lio->intf_open = 1;
+
+	netif_info(lio, ifup, lio->netdev, "Interface Open, ready for traffic\n");
+	/* start_txq() starts the Tx queues only if the link is already up */
+	start_txq(netdev);
+
+	dev_info(&lio->oct_dev->pci_dev->dev, "%s interface is opened\n",
+		 netdev->name);
+	return 0;
+}
+
+/**
  * \brief Net device stop for LiquidIO
  * @param netdev network device
  */
@@ -653,6 +857,8 @@ static int liquidio_stop(struct net_device *netdev)
 
 	ifstate_reset(lio, LIO_IFSTATE_RUNNING);
 
+	txqs_stop(netdev);
+
 	dev_info(&oct->pci_dev->dev, "%s interface is stopped\n", netdev->name);
 
 	return 0;
@@ -758,11 +964,47 @@ static int liquidio_set_features(struct net_device *netdev,
 }
 
 static const struct net_device_ops lionetdevops = {
+	.ndo_open		= liquidio_open,
+	.ndo_stop		= liquidio_stop,
 	.ndo_fix_features	= liquidio_fix_features,
 	.ndo_set_features	= liquidio_set_features,
 	.ndo_select_queue	= select_q,
 };
 
+/* Dispatch handler for OPCODE_NIC_INFO: apply a link status report */
+static int lio_nic_info(struct octeon_recv_info *recv_info, void *buf)
+{
+	struct octeon_recv_pkt *pkt = recv_info->recv_pkt;
+	struct octeon_device *oct = buf;
+	union oct_link_status *ls;
+	int port, idx;
+
+	if (pkt->buffer_size[0] == sizeof(*ls)) {
+		port = pkt->rh.r_nic_info.gmxport;
+		ls = (union oct_link_status *)get_rbd(pkt->buffer_ptr[0]);
+
+		octeon_swap_8B_data((u64 *)ls, (sizeof(*ls)) >> 3);
+
+		/* Only the first interface on this gmxport is updated */
+		for (idx = 0; idx < oct->ifcount; idx++) {
+			if (oct->props[idx].gmxport != port)
+				continue;
+			update_link_status(oct->props[idx].netdev, ls);
+			break;
+		}
+	} else {
+		dev_err(&oct->pci_dev->dev, "Malformed NIC_INFO, len=%d, ifidx=%d\n",
+			pkt->buffer_size[0],
+			pkt->rh.r_nic_info.gmxport);
+	}
+
+	/* The packet buffers and recv_info are released on every path */
+	for (idx = 0; idx < pkt->buffer_count; idx++)
+		recv_buffer_free(pkt->buffer_ptr[idx]);
+	octeon_free_recv_info(recv_info);
+	return 0;
+}
+
 /**
  * \brief Setup network interfaces
  * @param octeon_dev  octeon device
@@ -788,6 +1030,10 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
 
 	ifidx_or_pfnum = octeon_dev->pf_num;
 
+	/* This is to handle link status changes */
+	octeon_register_dispatch_fn(octeon_dev, OPCODE_NIC, OPCODE_NIC_INFO,
+				    lio_nic_info, octeon_dev);
+
 	for (i = 0; i < octeon_dev->ifcount; i++) {
 		resp_size = sizeof(struct liquidio_if_cfg_resp);
 		ctx_size = sizeof(struct liquidio_if_cfg_context);
@@ -946,6 +1192,9 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
 			liquidio_set_feature(netdev, OCTNET_CMD_VERBOSE_ENABLE,
 					     0);
 
+		if (setup_link_status_change_wq(netdev))
+			goto setup_nic_dev_fail;
+
 		/* Register the network device with the OS */
 		if (register_netdev(netdev)) {
 			dev_err(&octeon_dev->pci_dev->dev, "Device registration failed\n");
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next V2 4/7] liquidio CN23XX: VF scatter gather lists
From: Raghu Vatsavayi @ 2016-12-06 21:06 UTC (permalink / raw)
  To: davem
  Cc: netdev, Raghu Vatsavayi, Raghu Vatsavayi, Derek Chickles,
	Satanand Burla, Felix Manlunas
In-Reply-To: <1481058367-3937-1-git-send-email-rvatsavayi@caviumnetworks.com>

Adds support for VF scatter gather lists.

Signed-off-by: Raghu Vatsavayi <raghu.vatsavayi@caviumnetworks.com>
Signed-off-by: Derek Chickles <derek.chickles@caviumnetworks.com>
Signed-off-by: Satanand Burla <satananda.burla@caviumnetworks.com>
Signed-off-by: Felix Manlunas <felix.manlunas@caviumnetworks.com>
---
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 134 +++++++++++++++++++++
 1 file changed, 134 insertions(+)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index 0e23e2f..e4ee6ec 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -55,10 +55,28 @@ struct liquidio_if_cfg_resp {
 	u64 status;
 };
 
+#define OCTNIC_MAX_SG  (MAX_SKB_FRAGS)
+
 #define OCTNIC_GSO_MAX_HEADER_SIZE 128
 #define OCTNIC_GSO_MAX_SIZE \
 		(CN23XX_DEFAULT_INPUT_JABBER - OCTNIC_GSO_MAX_HEADER_SIZE)
 
+struct octnic_gather {
+	/* Node linking this entry into one of the lio->glist[] lists. */
+	struct list_head list;
+
+	/* Size of the gather component at sg in bytes (excludes padding). */
+	int sg_size;
+
+	/* Bytes sg was advanced for 8B alignment; subtracted before kfree. */
+	int adjust;
+
+	/* Gather component that can accommodate max sized fragment list
+	 * received from the IP layer.
+	 */
+	struct octeon_sg_entry *sg;
+};
+
 struct octeon_device_priv {
 	/* Tasklet structures for this device. */
 	struct tasklet_struct droq_tasklet;
@@ -237,6 +255,114 @@ static void start_txq(struct net_device *netdev)
 }
 
 /**
+ * Remove the node at the head of the list. The list would be empty at
+ * the end of this call if there are no more nodes in the list.
+ */
+static inline struct list_head *list_delete_head(struct list_head *root)
+{
+	struct list_head *head_node = NULL;
+
+	/* A root pointing back at itself in both directions is empty */
+	if ((root->prev != root) || (root->next != root))
+		head_node = root->next;
+
+	if (!head_node)
+		return NULL;
+
+	list_del(head_node);
+	return head_node;
+}
+
+/**
+ * \brief Delete gather lists (pointers are cleared, so a repeat call is safe)
+ * @param lio per-network private data
+ */
+static void delete_glists(struct lio *lio)
+{
+	struct octnic_gather *g;
+	int i;
+
+	if (!lio->glist)
+		return;
+
+	for (i = 0; i < lio->linfo.num_txpciq; i++) {
+		while ((g = (struct octnic_gather *)
+			    list_delete_head(&lio->glist[i]))) {
+			/* undo the alignment shift applied at allocation */
+			if (g->sg)
+				kfree((void *)((unsigned long)g->sg -
+						g->adjust));
+			kfree(g);
+		}
+	}
+
+	kfree(lio->glist);
+	lio->glist = NULL;
+	kfree(lio->glist_lock);
+	lio->glist_lock = NULL;
+}
+
+/**
+ * \brief Setup gather lists
+ * @param lio per-network private data
+ * @param num_iqs number of per-queue gather lists (and locks) to create
+ * @return 0 on success, 1 if any allocation failed
+ */
+static int setup_glists(struct lio *lio, int num_iqs)
+{
+	struct octnic_gather *g;
+	int i, j;
+
+	/* kcalloc() checks the count * size multiplication for overflow */
+	lio->glist_lock =
+	    kcalloc(num_iqs, sizeof(*lio->glist_lock), GFP_KERNEL);
+	if (!lio->glist_lock)
+		return 1;
+
+	lio->glist = kcalloc(num_iqs, sizeof(*lio->glist), GFP_KERNEL);
+	if (!lio->glist) {
+		kfree(lio->glist_lock);
+		lio->glist_lock = NULL;
+		return 1;
+	}
+
+	for (i = 0; i < num_iqs; i++) {
+		spin_lock_init(&lio->glist_lock[i]);
+		INIT_LIST_HEAD(&lio->glist[i]);
+
+		for (j = 0; j < lio->tx_qsize; j++) {
+			g = kzalloc(sizeof(*g), GFP_KERNEL);
+			if (!g)
+				break;
+
+			g->sg_size = ((ROUNDUP4(OCTNIC_MAX_SG) >> 2) *
+				      OCT_SG_ENTRY_SIZE);
+
+			g->sg = kmalloc(g->sg_size + 8, GFP_KERNEL);
+			if (!g->sg) {
+				kfree(g);
+				break;
+			}
+
+			/* sg must be 64-bit aligned; the 8 spare bytes allow it */
+			if (((unsigned long)g->sg) & 7) {
+				g->adjust = 8 - (((unsigned long)g->sg) & 7);
+				g->sg = (struct octeon_sg_entry *)
+					((unsigned long)g->sg + g->adjust);
+			}
+			list_add_tail(&g->list, &lio->glist[i]);
+		}
+
+		if (j != lio->tx_qsize) {
+			delete_glists(lio);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/**
  * \brief Print link information
  * @param netdev network device
  */
@@ -681,6 +807,8 @@ static void liquidio_destroy_nic_device(struct octeon_device *oct, int ifidx)
 
 	cleanup_link_status_change_wq(netdev);
 
+	delete_glists(lio);
+
 	free_netdev(netdev);
 
 	oct->props[ifidx].gmxport = -1;
@@ -1379,6 +1507,12 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
 		/* Copy MAC Address to OS network device structure */
 		ether_addr_copy(netdev->dev_addr, mac);
 
+		if (setup_glists(lio, num_iqueues)) {
+			dev_err(&octeon_dev->pci_dev->dev,
+				"Gather list allocation failed\n");
+			goto setup_nic_dev_fail;
+		}
+
 		if (netdev->features & NETIF_F_LRO)
 			liquidio_set_feature(netdev, OCTNET_CMD_LRO_ENABLE,
 					     OCTNIC_LROIPV4 | OCTNIC_LROIPV6);
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next V2 6/7] liquidio CN23XX: VF TX buffers
From: Raghu Vatsavayi @ 2016-12-06 21:06 UTC (permalink / raw)
  To: davem
  Cc: netdev, Raghu Vatsavayi, Raghu Vatsavayi, Derek Chickles,
	Satanand Burla, Felix Manlunas
In-Reply-To: <1481058367-3937-1-git-send-email-rvatsavayi@caviumnetworks.com>

Adds support for freeing VF xmit buffers.

Signed-off-by: Raghu Vatsavayi <raghu.vatsavayi@caviumnetworks.com>
Signed-off-by: Derek Chickles <derek.chickles@caviumnetworks.com>
Signed-off-by: Satanand Burla <satananda.burla@caviumnetworks.com>
Signed-off-by: Felix Manlunas <felix.manlunas@caviumnetworks.com>
---
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 182 +++++++++++++++++++++
 1 file changed, 182 insertions(+)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index cf80722..ce5cdcd 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -270,6 +270,19 @@ static void start_txq(struct net_device *netdev)
 }
 
 /**
+ * \brief Wake a queue
+ * @param netdev network device
+ * @param q which queue to wake
+ */
+static inline void wake_q(struct net_device *netdev, int q)
+{
+	if (!netif_is_multiqueue(netdev))
+		netif_wake_queue(netdev);	/* single queue: q is unused */
+	else
+		netif_wake_subqueue(netdev, q);
+}
+
+/**
  * \brief Stop a queue
  * @param netdev network device
  * @param q which queue to stop
@@ -920,6 +933,163 @@ static int octeon_pci_os_setup(struct octeon_device *oct)
 	return 0;
 }
 
+/* Pick the index into lio->glist[]/glist_lock[] that an skb belongs to */
+static inline int skb_iq(struct lio *lio, struct sk_buff *skb)
+{
+	/* single queue: everything uses index 0 */
+	if (!netif_is_multiqueue(lio->netdev))
+		return 0;
+
+	return skb->queue_mapping % lio->linfo.num_txpciq;
+}
+
+/**
+ * \brief Check Tx queue state for a given network buffer
+ * @param lio per-network private data
+ * @param skb network buffer
+ * @return 0 if the instruction queue is full, otherwise 1
+ */
+static inline int check_txq_state(struct lio *lio, struct sk_buff *skb)
+{
+	int subq, hwq;
+
+	if (!netif_is_multiqueue(lio->netdev)) {
+		hwq = lio->txq;
+		subq = hwq;
+	} else {
+		subq = skb->queue_mapping;
+		hwq = lio->linfo.txpciq[(subq % (lio->linfo.num_txpciq))].s.q_no;
+	}
+
+	if (octnet_iq_is_full(lio->oct_dev, hwq))
+		return 0;
+
+	if (!__netif_subqueue_stopped(lio->netdev, subq))
+		return 1;
+	INCR_INSTRQUEUE_PKT_COUNT(lio->oct_dev, hwq, tx_restart, 1);
+	wake_q(lio->netdev, subq);
+	return 1;
+}
+
+/**
+ * \brief Unmap and free network buffer
+ * @param buf buffer; read as a struct octnet_buf_free_info carrying the
+ *        skb, its lio and the DMA address to unmap
+ */
+static void free_netbuf(void *buf)
+{
+	struct octnet_buf_free_info *info = buf;
+	struct sk_buff *skb = info->skb;
+	struct lio *lio = info->lio;
+
+	/* Release the DMA mapping recorded in info->dptr */
+	dma_unmap_single(&lio->oct_dev->pci_dev->dev, info->dptr, skb->len,
+			 DMA_TO_DEVICE);
+
+	/* May wake the Tx queue this skb maps to */
+	check_txq_state(lio, skb);
+
+	/* The skb is released last, after its fields were used above */
+	tx_buffer_free(skb);
+}
+
+/**
+ * \brief Unmap and free gather buffer
+ * @param buf buffer; read as a struct octnet_buf_free_info
+ *
+ * Gather slot 0 is unmapped as the linear part of the skb; fragment n
+ * (1-based) sits in slot n, four slots per octeon_sg_entry.
+ */
+static void free_netsgbuf(void *buf)
+{
+	struct octnet_buf_free_info *info = buf;
+	struct octnic_gather *gather = info->g;
+	struct sk_buff *skb = info->skb;
+	struct lio *lio = info->lio;
+	struct pci_dev *pdev = lio->oct_dev->pci_dev;
+	int nr_frags = skb_shinfo(skb)->nr_frags;
+	int slot, iq;
+
+	dma_unmap_single(&pdev->dev,
+			 gather->sg[0].ptr[0], (skb->len - skb->data_len),
+			 DMA_TO_DEVICE);
+
+	for (slot = 1; slot <= nr_frags; slot++) {
+		struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[slot - 1];
+
+		pci_unmap_page(pdev,
+			       gather->sg[(slot >> 2)].ptr[(slot & 3)],
+			       frag->size, DMA_TO_DEVICE);
+	}
+
+	/* The gather component itself is unmapped as well */
+	dma_unmap_single(&pdev->dev,
+			 info->dptr, gather->sg_size,
+			 DMA_TO_DEVICE);
+
+	/* Put the gather node back on its queue's list */
+	iq = skb_iq(lio, skb);
+
+	spin_lock(&lio->glist_lock[iq]);
+	list_add_tail(&gather->list, &lio->glist[iq]);
+	spin_unlock(&lio->glist_lock[iq]);
+
+	check_txq_state(lio, skb); /* mq support: sub-queue state check */
+
+	tx_buffer_free(skb);
+}
+
+/**
+ * \brief Unmap and free gather buffer with response
+ * @param buf buffer; read as the octeon_soft_command whose callback_arg
+ *        is the skb
+ *
+ * Releases the DMA mappings and the gather node; the skb is not freed here.
+ */
+static void free_netsgbuf_with_resp(void *buf)
+{
+	struct octeon_soft_command *sc = buf;
+	struct sk_buff *skb = (struct sk_buff *)sc->callback_arg;
+	struct octnet_buf_free_info *info =
+		(struct octnet_buf_free_info *)&skb->cb;
+	struct octnic_gather *gather = info->g;
+	struct lio *lio = info->lio;
+	struct pci_dev *pdev = lio->oct_dev->pci_dev;
+	int nr_frags = skb_shinfo(skb)->nr_frags;
+	int slot, iq;
+
+	/* Slot 0 is unmapped as the linear part of the skb */
+	dma_unmap_single(&pdev->dev,
+			 gather->sg[0].ptr[0], (skb->len - skb->data_len),
+			 DMA_TO_DEVICE);
+
+	/* Fragment n sits in gather slot n, four slots per sg entry */
+	for (slot = 1; slot <= nr_frags; slot++) {
+		struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[slot - 1];
+
+		pci_unmap_page(pdev,
+			       gather->sg[(slot >> 2)].ptr[(slot & 3)],
+			       frag->size, DMA_TO_DEVICE);
+	}
+
+	/* The gather component itself is unmapped as well */
+	dma_unmap_single(&pdev->dev,
+			 info->dptr, gather->sg_size,
+			 DMA_TO_DEVICE);
+
+	/* Put the gather node back on its queue's list */
+	iq = skb_iq(lio, skb);
+
+	spin_lock(&lio->glist_lock[iq]);
+	list_add_tail(&gather->list, &lio->glist[iq]);
+	spin_unlock(&lio->glist_lock[iq]);
+
+	/* Don't free the skb yet */
+
+	check_txq_state(lio, skb);
+}
+
 /**
  * \brief Callback for getting interface configuration
  * @param status status of request
@@ -1675,6 +1845,18 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
 	octeon_register_dispatch_fn(octeon_dev, OPCODE_NIC, OPCODE_NIC_INFO,
 				    lio_nic_info, octeon_dev);
 
+	/* REQTYPE_RESP_NET and REQTYPE_SOFT_COMMAND do not have free functions.
+	 * They are handled directly.
+	 */
+	octeon_register_reqtype_free_fn(octeon_dev, REQTYPE_NORESP_NET,
+					free_netbuf);
+
+	octeon_register_reqtype_free_fn(octeon_dev, REQTYPE_NORESP_NET_SG,
+					free_netsgbuf);
+
+	octeon_register_reqtype_free_fn(octeon_dev, REQTYPE_RESP_NET_SG,
+					free_netsgbuf_with_resp);
+
 	for (i = 0; i < octeon_dev->ifcount; i++) {
 		resp_size = sizeof(struct liquidio_if_cfg_resp);
 		ctx_size = sizeof(struct liquidio_if_cfg_context);
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next V2 1/7] liquidio CN23XX: VF offload features
From: Raghu Vatsavayi @ 2016-12-06 21:06 UTC (permalink / raw)
  To: davem
  Cc: netdev, Raghu Vatsavayi, Raghu Vatsavayi, Derek Chickles,
	Satanand Burla, Felix Manlunas
In-Reply-To: <1481058367-3937-1-git-send-email-rvatsavayi@caviumnetworks.com>

Adds support for VF link initialization and offload features.

Signed-off-by: Raghu Vatsavayi <raghu.vatsavayi@caviumnetworks.com>
Signed-off-by: Derek Chickles <derek.chickles@caviumnetworks.com>
Signed-off-by: Satanand Burla <satananda.burla@caviumnetworks.com>
Signed-off-by: Felix Manlunas <felix.manlunas@caviumnetworks.com>
---
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 562 +++++++++++++++++++++
 .../net/ethernet/cavium/liquidio/octeon_device.c   |   3 +
 2 files changed, 565 insertions(+)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index e6321f3..81a578f 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -22,7 +22,9 @@
 #include "octeon_iq.h"
 #include "response_manager.h"
 #include "octeon_device.h"
+#include "octeon_nic.h"
 #include "octeon_main.h"
+#include "octeon_network.h"
 #include "cn23xx_vf_device.h"
 
 MODULE_AUTHOR("Cavium Networks, <support@cavium.com>");
@@ -30,6 +32,33 @@
 MODULE_LICENSE("GPL");
 MODULE_VERSION(LIQUIDIO_VERSION);
 
+static int debug = -1;
+module_param(debug, int, 0644);
+MODULE_PARM_DESC(debug, "NETIF_MSG debug bits");
+
+#define DEFAULT_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK)
+
+#define   LIO_IFSTATE_REGISTERED           0x02
+#define   LIO_IFSTATE_RUNNING              0x04
+
+struct liquidio_if_cfg_context {
+	int octeon_id;
+
+	wait_queue_head_t wc;
+
+	int cond;
+};
+
+struct liquidio_if_cfg_resp {
+	u64 rh;
+	struct liquidio_if_cfg_info cfg_info;
+	u64 status;
+};
+
+#define OCTNIC_GSO_MAX_HEADER_SIZE 128
+#define OCTNIC_GSO_MAX_SIZE \
+		(CN23XX_DEFAULT_INPUT_JABBER - OCTNIC_GSO_MAX_HEADER_SIZE)
+
 struct octeon_device_priv {
 	/* Tasklet structures for this device. */
 	struct tasklet_struct droq_tasklet;
@@ -40,6 +69,7 @@ struct octeon_device_priv {
 liquidio_vf_probe(struct pci_dev *pdev, const struct pci_device_id *ent);
 static void liquidio_vf_remove(struct pci_dev *pdev);
 static int octeon_device_init(struct octeon_device *oct);
+static int liquidio_stop(struct net_device *netdev);
 
 static int lio_wait_for_oq_pkts(struct octeon_device *oct)
 {
@@ -113,6 +143,26 @@ static int wait_for_pending_requests(struct octeon_device *oct)
 	.remove		= liquidio_vf_remove,
 };
 
+/**
+ * \brief set interface state
+ * @param lio per-network private data
+ * @param state_flag flag state to set
+ */
+static inline void ifstate_set(struct lio *lio, int state_flag)
+{
+	atomic_set(&lio->ifstate, (atomic_read(&lio->ifstate) | state_flag));
+}
+
+/**
+ * \brief clear interface state
+ * @param lio per-network private data
+ * @param state_flag flag state to clear
+ */
+static inline void ifstate_reset(struct lio *lio, int state_flag)
+{
+	atomic_set(&lio->ifstate, (atomic_read(&lio->ifstate) & ~(state_flag)));
+}
+
 static
 int liquidio_schedule_msix_droq_pkt_handler(struct octeon_droq *droq, u64 ret)
 {
@@ -316,6 +366,7 @@ static void octeon_destroy_resources(struct octeon_device *oct)
 		/* No more instructions will be forwarded. */
 		atomic_set(&oct->status, OCT_DEV_IN_RESET);
 
+		oct->app_mode = CVM_DRV_INVALID_APP;
 		dev_dbg(&oct->pci_dev->dev, "Device state is now %s\n",
 			lio_get_state_string(&oct->status));
 
@@ -420,6 +471,63 @@ static void octeon_destroy_resources(struct octeon_device *oct)
 }
 
 /**
+ * \brief Destroy NIC device interface
+ * @param oct octeon device
+ * @param ifidx which interface to destroy
+ *
+ * Cleanup associated with each interface for an Octeon device when the NIC
+ * module is being unloaded or if initialization fails during load.
+ */
+static void liquidio_destroy_nic_device(struct octeon_device *oct, int ifidx)
+{
+	struct net_device *netdev = oct->props[ifidx].netdev;
+	struct lio *lio;
+
+	if (!netdev) {
+		dev_err(&oct->pci_dev->dev, "%s No netdevice ptr for index %d\n",
+			__func__, ifidx);
+		return;
+	}
+
+	lio = GET_LIO(netdev);
+
+	dev_dbg(&oct->pci_dev->dev, "NIC device cleanup\n");
+
+	if (atomic_read(&lio->ifstate) & LIO_IFSTATE_RUNNING)
+		liquidio_stop(netdev);
+
+	if (atomic_read(&lio->ifstate) & LIO_IFSTATE_REGISTERED)
+		unregister_netdev(netdev);
+
+	free_netdev(netdev);
+
+	oct->props[ifidx].gmxport = -1;
+
+	oct->props[ifidx].netdev = NULL;
+}
+
+/**
+ * \brief Stop complete NIC functionality
+ * @param oct octeon device
+ */
+static int liquidio_stop_nic_module(struct octeon_device *oct)
+{
+	int i;
+
+	dev_dbg(&oct->pci_dev->dev, "Stopping network interfaces\n");
+	if (!oct->ifcount) {
+		dev_err(&oct->pci_dev->dev, "Init for Octeon was not completed\n");
+		return 1;
+	}
+
+	for (i = 0; i < oct->ifcount; i++)
+		liquidio_destroy_nic_device(oct, i);
+
+	dev_dbg(&oct->pci_dev->dev, "Network interfaces stopped\n");
+	return 0;
+}
+
+/**
  * \brief Cleans up resources at unload time
  * @param pdev PCI device structure
  */
@@ -429,6 +537,9 @@ static void liquidio_vf_remove(struct pci_dev *pdev)
 
 	dev_dbg(&oct_dev->pci_dev->dev, "Stopping device\n");
 
+	if (oct_dev->app_mode == CVM_DRV_NIC_APP)
+		liquidio_stop_nic_module(oct_dev);
+
 	/* Reset the octeon device and cleanup all memory allocated for
 	 * the octeon device by driver.
 	 */
@@ -472,6 +583,452 @@ static int octeon_pci_os_setup(struct octeon_device *oct)
 }
 
 /**
+ * \brief Callback for getting interface configuration
+ * @param status status of request
+ * @param buf pointer to resp structure
+ */
+static void if_cfg_callback(struct octeon_device *oct,
+			    u32 status __attribute__((unused)), void *buf)
+{
+	struct octeon_soft_command *sc = (struct octeon_soft_command *)buf;
+	struct liquidio_if_cfg_context *ctx;
+	struct liquidio_if_cfg_resp *resp;
+
+	resp = (struct liquidio_if_cfg_resp *)sc->virtrptr;
+	ctx = (struct liquidio_if_cfg_context *)sc->ctxptr;
+
+	oct = lio_get_device(ctx->octeon_id);
+	if (resp->status)
+		dev_err(&oct->pci_dev->dev, "nic if cfg instruction failed. Status: %llx\n",
+			CVM_CAST64(resp->status));
+	WRITE_ONCE(ctx->cond, 1);
+
+	snprintf(oct->fw_info.liquidio_firmware_version, 32, "%s",
+		 resp->cfg_info.liquidio_firmware_version);
+
+	/* This barrier is required to be sure that the response has been
+	 * written fully before waking up the handler
+	 */
+	wmb();
+
+	wake_up_interruptible(&ctx->wc);
+}
+
+/**
+ * \brief Select queue based on hash
+ * @param dev Net device
+ * @param skb sk_buff structure
+ * @returns selected queue number
+ */
+static u16 select_q(struct net_device *dev, struct sk_buff *skb,
+		    void *accel_priv __attribute__((unused)),
+		    select_queue_fallback_t fallback __attribute__((unused)))
+{
+	struct lio *lio;
+	u32 qindex;
+
+	lio = GET_LIO(dev);
+
+	qindex = skb_tx_hash(dev, skb);
+
+	return (u16)(qindex % (lio->linfo.num_txpciq));
+}
+
+/**
+ * \brief Net device stop for LiquidIO
+ * @param netdev network device
+ */
+static int liquidio_stop(struct net_device *netdev)
+{
+	struct lio *lio = GET_LIO(netdev);
+	struct octeon_device *oct = lio->oct_dev;
+
+	netif_info(lio, ifdown, lio->netdev, "Stopping interface!\n");
+	/* Inform that netif carrier is down */
+	lio->intf_open = 0;
+	lio->linfo.link.s.link_up = 0;
+
+	netif_carrier_off(netdev);
+	lio->link_changes++;
+
+	ifstate_reset(lio, LIO_IFSTATE_RUNNING);
+
+	dev_info(&oct->pci_dev->dev, "%s interface is stopped\n", netdev->name);
+
+	return 0;
+}
+
+/** Sending command to enable/disable RX checksum offload
+ * @param netdev                pointer to network device
+ * @param command               OCTNET_CMD_TNL_RX_CSUM_CTL
+ * @param rx_cmd                OCTNET_CMD_RXCSUM_ENABLE/
+ *                              OCTNET_CMD_RXCSUM_DISABLE
+ * @returns                     SUCCESS or FAILURE
+ */
+static int liquidio_set_rxcsum_command(struct net_device *netdev, int command,
+				       u8 rx_cmd)
+{
+	struct lio *lio = GET_LIO(netdev);
+	struct octeon_device *oct = lio->oct_dev;
+	struct octnic_ctrl_pkt nctrl;
+	int ret = 0;
+
+	nctrl.ncmd.u64 = 0;
+	nctrl.ncmd.s.cmd = command;
+	nctrl.ncmd.s.param1 = rx_cmd;
+	nctrl.iq_no = lio->linfo.txpciq[0].s.q_no;
+	nctrl.wait_time = 100;
+	nctrl.netpndev = (u64)netdev;
+	nctrl.cb_fn = liquidio_link_ctrl_cmd_completion;
+
+	ret = octnet_send_nic_ctrl_pkt(lio->oct_dev, &nctrl);
+	if (ret < 0) {
+		dev_err(&oct->pci_dev->dev, "DEVFLAGS RXCSUM change failed in core (ret:0x%x)\n",
+			ret);
+	}
+	return ret;
+}
+
+/** \brief Net device fix features
+ * @param netdev  pointer to network device
+ * @param request features requested
+ * @returns updated features list
+ */
+static netdev_features_t liquidio_fix_features(struct net_device *netdev,
+					       netdev_features_t request)
+{
+	struct lio *lio = netdev_priv(netdev);
+
+	if ((request & NETIF_F_RXCSUM) &&
+	    !(lio->dev_capability & NETIF_F_RXCSUM))
+		request &= ~NETIF_F_RXCSUM;
+
+	if ((request & NETIF_F_HW_CSUM) &&
+	    !(lio->dev_capability & NETIF_F_HW_CSUM))
+		request &= ~NETIF_F_HW_CSUM;
+
+	if ((request & NETIF_F_TSO) && !(lio->dev_capability & NETIF_F_TSO))
+		request &= ~NETIF_F_TSO;
+
+	if ((request & NETIF_F_TSO6) && !(lio->dev_capability & NETIF_F_TSO6))
+		request &= ~NETIF_F_TSO6;
+
+	if ((request & NETIF_F_LRO) && !(lio->dev_capability & NETIF_F_LRO))
+		request &= ~NETIF_F_LRO;
+
+	/* Disable LRO if RXCSUM is off */
+	if (!(request & NETIF_F_RXCSUM) && (netdev->features & NETIF_F_LRO) &&
+	    (lio->dev_capability & NETIF_F_LRO))
+		request &= ~NETIF_F_LRO;
+
+	return request;
+}
+
+/** \brief Net device set features
+ * @param netdev  pointer to network device
+ * @param features features to enable/disable
+ */
+static int liquidio_set_features(struct net_device *netdev,
+				 netdev_features_t features)
+{
+	struct lio *lio = netdev_priv(netdev);
+
+	if (!((netdev->features ^ features) & NETIF_F_LRO))
+		return 0;
+
+	if ((features & NETIF_F_LRO) && (lio->dev_capability & NETIF_F_LRO))
+		liquidio_set_feature(netdev, OCTNET_CMD_LRO_ENABLE,
+				     OCTNIC_LROIPV4 | OCTNIC_LROIPV6);
+	else if (!(features & NETIF_F_LRO) &&
+		 (lio->dev_capability & NETIF_F_LRO))
+		liquidio_set_feature(netdev, OCTNET_CMD_LRO_DISABLE,
+				     OCTNIC_LROIPV4 | OCTNIC_LROIPV6);
+	if (!(netdev->features & NETIF_F_RXCSUM) &&
+	    (lio->enc_dev_capability & NETIF_F_RXCSUM) &&
+	    (features & NETIF_F_RXCSUM))
+		liquidio_set_rxcsum_command(netdev, OCTNET_CMD_TNL_RX_CSUM_CTL,
+					    OCTNET_CMD_RXCSUM_ENABLE);
+	else if ((netdev->features & NETIF_F_RXCSUM) &&
+		 (lio->enc_dev_capability & NETIF_F_RXCSUM) &&
+		 !(features & NETIF_F_RXCSUM))
+		liquidio_set_rxcsum_command(netdev, OCTNET_CMD_TNL_RX_CSUM_CTL,
+					    OCTNET_CMD_RXCSUM_DISABLE);
+
+	return 0;
+}
+
+static const struct net_device_ops lionetdevops = {
+	.ndo_fix_features	= liquidio_fix_features,
+	.ndo_set_features	= liquidio_set_features,
+	.ndo_select_queue	= select_q,
+};
+
+/**
+ * \brief Setup network interfaces
+ * @param octeon_dev  octeon device
+ *
+ * Called during init time for each device. It assumes the NIC
+ * is already up and running.  The link information for each
+ * interface is passed in link_info.
+ */
+static int setup_nic_devices(struct octeon_device *octeon_dev)
+{
+	int retval, num_iqueues, num_oqueues;
+	struct liquidio_if_cfg_context *ctx;
+	u32 resp_size, ctx_size, data_size;
+	struct liquidio_if_cfg_resp *resp;
+	struct octeon_soft_command *sc;
+	union oct_nic_if_cfg if_cfg;
+	struct octdev_props *props;
+	struct net_device *netdev;
+	struct lio_version *vdata;
+	struct lio *lio = NULL;
+	u8 mac[ETH_ALEN], i, j;
+	u32 ifidx_or_pfnum;
+
+	ifidx_or_pfnum = octeon_dev->pf_num;
+
+	for (i = 0; i < octeon_dev->ifcount; i++) {
+		resp_size = sizeof(struct liquidio_if_cfg_resp);
+		ctx_size = sizeof(struct liquidio_if_cfg_context);
+		data_size = sizeof(struct lio_version);
+		sc = (struct octeon_soft_command *)
+			octeon_alloc_soft_command(octeon_dev, data_size,
+						  resp_size, ctx_size);
+		resp = (struct liquidio_if_cfg_resp *)sc->virtrptr;
+		ctx  = (struct liquidio_if_cfg_context *)sc->ctxptr;
+		vdata = (struct lio_version *)sc->virtdptr;
+
+		*((u64 *)vdata) = 0;
+		vdata->major = cpu_to_be16(LIQUIDIO_BASE_MAJOR_VERSION);
+		vdata->minor = cpu_to_be16(LIQUIDIO_BASE_MINOR_VERSION);
+		vdata->micro = cpu_to_be16(LIQUIDIO_BASE_MICRO_VERSION);
+
+		WRITE_ONCE(ctx->cond, 0);
+		ctx->octeon_id = lio_get_device_id(octeon_dev);
+		init_waitqueue_head(&ctx->wc);
+
+		if_cfg.u64 = 0;
+
+		if_cfg.s.num_iqueues = octeon_dev->sriov_info.rings_per_vf;
+		if_cfg.s.num_oqueues = octeon_dev->sriov_info.rings_per_vf;
+		if_cfg.s.base_queue = 0;
+
+		sc->iq_no = 0;
+
+		octeon_prepare_soft_command(octeon_dev, sc, OPCODE_NIC,
+					    OPCODE_NIC_IF_CFG, 0, if_cfg.u64,
+					    0);
+
+		sc->callback = if_cfg_callback;
+		sc->callback_arg = sc;
+		sc->wait_time = 5000;
+
+		retval = octeon_send_soft_command(octeon_dev, sc);
+		if (retval == IQ_SEND_FAILED) {
+			dev_err(&octeon_dev->pci_dev->dev,
+				"iq/oq config failed status: %x\n", retval);
+			/* Soft instr is freed by driver in case of failure. */
+			goto setup_nic_dev_fail;
+		}
+
+		/* Sleep on a wait queue till the cond flag indicates that the
+		 * response arrived or timed-out.
+		 */
+		if (sleep_cond(&ctx->wc, &ctx->cond) == -EINTR) {
+			dev_err(&octeon_dev->pci_dev->dev, "Wait interrupted\n");
+			goto setup_nic_wait_intr;
+		}
+
+		retval = resp->status;
+		if (retval) {
+			dev_err(&octeon_dev->pci_dev->dev, "iq/oq config failed\n");
+			goto setup_nic_dev_fail;
+		}
+
+		octeon_swap_8B_data((u64 *)(&resp->cfg_info),
+				    (sizeof(struct liquidio_if_cfg_info)) >> 3);
+
+		num_iqueues = hweight64(resp->cfg_info.iqmask);
+		num_oqueues = hweight64(resp->cfg_info.oqmask);
+
+		if (!(num_iqueues) || !(num_oqueues)) {
+			dev_err(&octeon_dev->pci_dev->dev,
+				"Got bad iqueues (%016llx) or oqueues (%016llx) from firmware.\n",
+				resp->cfg_info.iqmask, resp->cfg_info.oqmask);
+			goto setup_nic_dev_fail;
+		}
+		dev_dbg(&octeon_dev->pci_dev->dev,
+			"interface %d, iqmask %016llx, oqmask %016llx, numiqueues %d, numoqueues %d\n",
+			i, resp->cfg_info.iqmask, resp->cfg_info.oqmask,
+			num_iqueues, num_oqueues);
+
+		netdev = alloc_etherdev_mq(LIO_SIZE, num_iqueues);
+
+		if (!netdev) {
+			dev_err(&octeon_dev->pci_dev->dev, "Device allocation failed\n");
+			goto setup_nic_dev_fail;
+		}
+
+		SET_NETDEV_DEV(netdev, &octeon_dev->pci_dev->dev);
+
+		/* Associate the routines that will handle different
+		 * netdev tasks.
+		 */
+		netdev->netdev_ops = &lionetdevops;
+
+		lio = GET_LIO(netdev);
+
+		memset(lio, 0, sizeof(struct lio));
+
+		lio->ifidx = ifidx_or_pfnum;
+
+		props = &octeon_dev->props[i];
+		props->gmxport = resp->cfg_info.linfo.gmxport;
+		props->netdev = netdev;
+
+		lio->linfo.num_rxpciq = num_oqueues;
+		lio->linfo.num_txpciq = num_iqueues;
+
+		for (j = 0; j < num_oqueues; j++) {
+			lio->linfo.rxpciq[j].u64 =
+			    resp->cfg_info.linfo.rxpciq[j].u64;
+		}
+		for (j = 0; j < num_iqueues; j++) {
+			lio->linfo.txpciq[j].u64 =
+			    resp->cfg_info.linfo.txpciq[j].u64;
+		}
+
+		lio->linfo.hw_addr = resp->cfg_info.linfo.hw_addr;
+		lio->linfo.gmxport = resp->cfg_info.linfo.gmxport;
+		lio->linfo.link.u64 = resp->cfg_info.linfo.link.u64;
+		lio->linfo.macaddr_is_admin_asgnd =
+			resp->cfg_info.linfo.macaddr_is_admin_asgnd;
+
+		lio->msg_enable = netif_msg_init(debug, DEFAULT_MSG_ENABLE);
+
+		lio->dev_capability = NETIF_F_HIGHDMA
+				      | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM
+				      | NETIF_F_SG | NETIF_F_RXCSUM
+				      | NETIF_F_TSO | NETIF_F_TSO6
+				      | NETIF_F_GRO
+				      | NETIF_F_LRO;
+		netif_set_gso_max_size(netdev, OCTNIC_GSO_MAX_SIZE);
+
+		netdev->features = (lio->dev_capability & ~NETIF_F_LRO);
+
+		netdev->hw_features = lio->dev_capability;
+
+		/* Point to the  properties for octeon device to which this
+		 * interface belongs.
+		 */
+		lio->oct_dev = octeon_dev;
+		lio->octprops = props;
+		lio->netdev = netdev;
+
+		dev_dbg(&octeon_dev->pci_dev->dev,
+			"if%d gmx: %d hw_addr: 0x%llx\n", i,
+			lio->linfo.gmxport, CVM_CAST64(lio->linfo.hw_addr));
+
+		/* 64-bit swap required on LE machines */
+		octeon_swap_8B_data(&lio->linfo.hw_addr, 1);
+		for (j = 0; j < ETH_ALEN; j++)
+			mac[j] = *((u8 *)(((u8 *)&lio->linfo.hw_addr) + 2 + j));
+
+		/* Copy MAC Address to OS network device structure */
+		ether_addr_copy(netdev->dev_addr, mac);
+
+		if (netdev->features & NETIF_F_LRO)
+			liquidio_set_feature(netdev, OCTNET_CMD_LRO_ENABLE,
+					     OCTNIC_LROIPV4 | OCTNIC_LROIPV6);
+
+		if ((debug != -1) && (debug & NETIF_MSG_HW))
+			liquidio_set_feature(netdev, OCTNET_CMD_VERBOSE_ENABLE,
+					     0);
+
+		/* Register the network device with the OS */
+		if (register_netdev(netdev)) {
+			dev_err(&octeon_dev->pci_dev->dev, "Device registration failed\n");
+			goto setup_nic_dev_fail;
+		}
+
+		dev_dbg(&octeon_dev->pci_dev->dev,
+			"Setup NIC ifidx:%d mac:%02x%02x%02x%02x%02x%02x\n",
+			i, mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
+		netif_carrier_off(netdev);
+		lio->link_changes++;
+
+		ifstate_set(lio, LIO_IFSTATE_REGISTERED);
+
+		/* Sending command to firmware to enable Rx checksum offload
+		 * by default at the time of setup of Liquidio driver for
+		 * this device
+		 */
+		liquidio_set_rxcsum_command(netdev, OCTNET_CMD_TNL_RX_CSUM_CTL,
+					    OCTNET_CMD_RXCSUM_ENABLE);
+		liquidio_set_feature(netdev, OCTNET_CMD_TNL_TX_CSUM_CTL,
+				     OCTNET_CMD_TXCSUM_ENABLE);
+
+		dev_dbg(&octeon_dev->pci_dev->dev,
+			"NIC ifidx:%d Setup successful\n", i);
+
+		octeon_free_soft_command(octeon_dev, sc);
+	}
+
+	return 0;
+
+setup_nic_dev_fail:
+
+	octeon_free_soft_command(octeon_dev, sc);
+
+setup_nic_wait_intr:
+
+	while (i--) {
+		dev_err(&octeon_dev->pci_dev->dev,
+			"NIC ifidx:%d Setup failed\n", i);
+		liquidio_destroy_nic_device(octeon_dev, i);
+	}
+	return -ENODEV;
+}
+
+/**
+ * \brief initialize the NIC
+ * @param oct octeon device
+ *
+ * This initialization routine is called once the Octeon device application is
+ * up and running
+ */
+static int liquidio_init_nic_module(struct octeon_device *oct)
+{
+	int num_nic_ports = 1;
+	int i, retval = 0;
+
+	dev_dbg(&oct->pci_dev->dev, "Initializing network interfaces\n");
+
+	/* Only the default iq and oq were initialized; initialize the rest
+	 * as well, and run the port_config command for each port.
+	 */
+	oct->ifcount = num_nic_ports;
+	memset(oct->props, 0,
+	       sizeof(struct octdev_props) * num_nic_ports);
+
+	for (i = 0; i < MAX_OCTEON_LINKS; i++)
+		oct->props[i].gmxport = -1;
+
+	retval = setup_nic_devices(oct);
+	if (retval) {
+		dev_err(&oct->pci_dev->dev, "Setup NIC devices failed\n");
+		goto octnet_init_failure;
+	}
+
+octnet_init_failure:
+
+	oct->ifcount = 0;
+
+	return retval;
+}
+
+/**
  * \brief Device initialization for each Octeon device that is probed
  * @param octeon_dev  octeon device
  */
@@ -498,6 +1055,8 @@ static int octeon_device_init(struct octeon_device *oct)
 
 	atomic_set(&oct->status, OCT_DEV_PCI_MAP_DONE);
 
+	oct->app_mode = CVM_DRV_NIC_APP;
+
 	/* Initialize the dispatch mechanism used to push packets arriving on
 	 * Octeon Output queues.
 	 */
@@ -594,6 +1153,9 @@ static int octeon_device_init(struct octeon_device *oct)
 
 	atomic_set(&oct->status, OCT_DEV_RUNNING);
 
+	if (liquidio_init_nic_module(oct))
+		return 1;
+
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.c b/drivers/net/ethernet/cavium/liquidio/octeon_device.c
index 6d54032..583818e 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_device.c
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.c
@@ -1221,6 +1221,9 @@ struct octeon_config *octeon_get_conf(struct octeon_device *oct)
 	} else if (OCTEON_CN23XX_PF(oct)) {
 		default_oct_conf = (struct octeon_config *)
 			(CHIP_CONF(oct, cn23xx_pf));
+	} else if (OCTEON_CN23XX_VF(oct)) {
+		default_oct_conf = (struct octeon_config *)
+			(CHIP_CONF(oct, cn23xx_vf));
 	}
 	return default_oct_conf;
 }
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH nf-next v2] netfilter: xt_bpf: support ebpf
From: Willem de Bruijn @ 2016-12-06 21:25 UTC (permalink / raw)
  To: netfilter-devel; +Cc: fw, eric.dumazet, pablo, netdev, daniel, Willem de Bruijn

From: Willem de Bruijn <willemb@google.com>

Add support for attaching an eBPF object by file descriptor.

The iptables binary can be called with a path to an ELF object or a
pinned bpf object. Also pass the mode and path to the kernel to be
able to return them later for iptables dump and save.

Signed-off-by: Willem de Bruijn <willemb@google.com>

---

Changes
  v1 -> v2
    - define XT_BPF_PATH_MAX (== 512: does not grow structure size)
---
 include/uapi/linux/netfilter/xt_bpf.h | 21 ++++++++
 net/netfilter/xt_bpf.c                | 96 +++++++++++++++++++++++++++++------
 2 files changed, 101 insertions(+), 16 deletions(-)

diff --git a/include/uapi/linux/netfilter/xt_bpf.h b/include/uapi/linux/netfilter/xt_bpf.h
index 1fad2c2..b97725a 100644
--- a/include/uapi/linux/netfilter/xt_bpf.h
+++ b/include/uapi/linux/netfilter/xt_bpf.h
@@ -2,9 +2,11 @@
 #define _XT_BPF_H
 
 #include <linux/filter.h>
+#include <linux/limits.h>
 #include <linux/types.h>
 
 #define XT_BPF_MAX_NUM_INSTR	64
+#define XT_BPF_PATH_MAX		(XT_BPF_MAX_NUM_INSTR * sizeof(struct sock_filter))
 
 struct bpf_prog;
 
@@ -16,4 +18,23 @@ struct xt_bpf_info {
 	struct bpf_prog *filter __attribute__((aligned(8)));
 };
 
+enum xt_bpf_modes {
+	XT_BPF_MODE_BYTECODE,
+	XT_BPF_MODE_FD_PINNED,
+	XT_BPF_MODE_FD_ELF,
+};
+
+struct xt_bpf_info_v1 {
+	__u16 mode;
+	__u16 bpf_program_num_elem;
+	__s32 fd;
+	union {
+		struct sock_filter bpf_program[XT_BPF_MAX_NUM_INSTR];
+		char path[XT_BPF_PATH_MAX];
+	};
+
+	/* only used in the kernel */
+	struct bpf_prog *filter __attribute__((aligned(8)));
+};
+
 #endif /*_XT_BPF_H */
diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
index dffee9d47..2dedaa2 100644
--- a/net/netfilter/xt_bpf.c
+++ b/net/netfilter/xt_bpf.c
@@ -10,6 +10,7 @@
 #include <linux/module.h>
 #include <linux/skbuff.h>
 #include <linux/filter.h>
+#include <linux/bpf.h>
 
 #include <linux/netfilter/xt_bpf.h>
 #include <linux/netfilter/x_tables.h>
@@ -20,15 +21,15 @@ MODULE_LICENSE("GPL");
 MODULE_ALIAS("ipt_bpf");
 MODULE_ALIAS("ip6t_bpf");
 
-static int bpf_mt_check(const struct xt_mtchk_param *par)
+static int __bpf_mt_check_bytecode(struct sock_filter *insns, __u16 len,
+				   struct bpf_prog **ret)
 {
-	struct xt_bpf_info *info = par->matchinfo;
 	struct sock_fprog_kern program;
 
-	program.len = info->bpf_program_num_elem;
-	program.filter = info->bpf_program;
+	program.len = len;
+	program.filter = insns;
 
-	if (bpf_prog_create(&info->filter, &program)) {
+	if (bpf_prog_create(ret, &program)) {
 		pr_info("bpf: check failed: parse error\n");
 		return -EINVAL;
 	}
@@ -36,6 +37,42 @@ static int bpf_mt_check(const struct xt_mtchk_param *par)
 	return 0;
 }
 
+static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret)
+{
+	struct bpf_prog *prog;
+
+	prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	*ret = prog;
+	return 0;
+}
+
+static int bpf_mt_check(const struct xt_mtchk_param *par)
+{
+	struct xt_bpf_info *info = par->matchinfo;
+
+	return __bpf_mt_check_bytecode(info->bpf_program,
+				       info->bpf_program_num_elem,
+				       &info->filter);
+}
+
+static int bpf_mt_check_v1(const struct xt_mtchk_param *par)
+{
+	struct xt_bpf_info_v1 *info = par->matchinfo;
+
+	if (info->mode == XT_BPF_MODE_BYTECODE)
+		return __bpf_mt_check_bytecode(info->bpf_program,
+					       info->bpf_program_num_elem,
+					       &info->filter);
+	else if (info->mode == XT_BPF_MODE_FD_PINNED ||
+		 info->mode == XT_BPF_MODE_FD_ELF)
+		return __bpf_mt_check_fd(info->fd, &info->filter);
+	else
+		return -EINVAL;
+}
+
 static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_bpf_info *info = par->matchinfo;
@@ -43,31 +80,58 @@ static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	return BPF_PROG_RUN(info->filter, skb);
 }
 
+static bool bpf_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_bpf_info_v1 *info = par->matchinfo;
+
+	return !!bpf_prog_run_save_cb(info->filter, (struct sk_buff *) skb);
+}
+
 static void bpf_mt_destroy(const struct xt_mtdtor_param *par)
 {
 	const struct xt_bpf_info *info = par->matchinfo;
+
+	bpf_prog_destroy(info->filter);
+}
+
+static void bpf_mt_destroy_v1(const struct xt_mtdtor_param *par)
+{
+	const struct xt_bpf_info_v1 *info = par->matchinfo;
+
 	bpf_prog_destroy(info->filter);
 }
 
-static struct xt_match bpf_mt_reg __read_mostly = {
-	.name		= "bpf",
-	.revision	= 0,
-	.family		= NFPROTO_UNSPEC,
-	.checkentry	= bpf_mt_check,
-	.match		= bpf_mt,
-	.destroy	= bpf_mt_destroy,
-	.matchsize	= sizeof(struct xt_bpf_info),
-	.me		= THIS_MODULE,
+static struct xt_match bpf_mt_reg[] __read_mostly = {
+	{
+		.name		= "bpf",
+		.revision	= 0,
+		.family		= NFPROTO_UNSPEC,
+		.checkentry	= bpf_mt_check,
+		.match		= bpf_mt,
+		.destroy	= bpf_mt_destroy,
+		.matchsize	= sizeof(struct xt_bpf_info),
+		.me		= THIS_MODULE,
+	},
+	{
+		.name		= "bpf",
+		.revision	= 1,
+		.family		= NFPROTO_UNSPEC,
+		.checkentry	= bpf_mt_check_v1,
+		.match		= bpf_mt_v1,
+		.destroy	= bpf_mt_destroy_v1,
+		.matchsize	= sizeof(struct xt_bpf_info_v1),
+		.me		= THIS_MODULE,
+	},
 };
 
 static int __init bpf_mt_init(void)
 {
-	return xt_register_match(&bpf_mt_reg);
+	return xt_register_matches(bpf_mt_reg, ARRAY_SIZE(bpf_mt_reg));
 }
 
 static void __exit bpf_mt_exit(void)
 {
-	xt_unregister_match(&bpf_mt_reg);
+	xt_unregister_matches(bpf_mt_reg, ARRAY_SIZE(bpf_mt_reg));
 }
 
 module_init(bpf_mt_init);
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* Re: [PATCH v2 net-next 3/4] mlx4: xdp: Reserve headroom for receiving packet when XDP prog is active
From: Saeed Mahameed @ 2016-12-06 21:40 UTC (permalink / raw)
  To: Martin KaFai Lau
  Cc: Linux Netdev List, Alexei Starovoitov, Brenden Blanco,
	Daniel Borkmann, David Miller, Jesper Dangaard Brouer,
	Saeed Mahameed, Tariq Toukan, Kernel Team
In-Reply-To: <20161206182718.GA16682@kafai-mba.local>

On Tue, Dec 6, 2016 at 8:27 PM, Martin KaFai Lau <kafai@fb.com> wrote:
> On Tue, Dec 06, 2016 at 06:50:47PM +0200, Saeed Mahameed wrote:
>> On Mon, Dec 5, 2016 at 9:55 PM, Martin KaFai Lau <kafai@fb.com> wrote:
>> > On Mon, Dec 05, 2016 at 02:54:06AM +0200, Saeed Mahameed wrote:
>> >> On Sun, Dec 4, 2016 at 5:17 AM, Martin KaFai Lau <kafai@fb.com> wrote:
>> >> > Reserve XDP_PACKET_HEADROOM and honor bpf_xdp_adjust_head()
>> >> > when XDP prog is active.  This patch only affects the code
>> >> > path when XDP is active.
>> >> >
>> >> > Signed-off-by: Martin KaFai Lau <kafai@fb.com>
>> >> > ---
>> >>
>> >> Hi Martin, Sorry for the late review, i have some comments below
>> >>
>> >> >  drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 17 +++++++++++++++--
>> >> >  drivers/net/ethernet/mellanox/mlx4/en_rx.c     | 23 +++++++++++++++++------
>> >> >  drivers/net/ethernet/mellanox/mlx4/en_tx.c     |  9 +++++----
>> >> >  drivers/net/ethernet/mellanox/mlx4/mlx4_en.h   |  3 ++-
>> >> >  4 files changed, 39 insertions(+), 13 deletions(-)
>> >> >
>> >> > diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
>> >> > index 311c14153b8b..094a13b52cf6 100644
>> >> > --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
>> >> > +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
>> >> > @@ -51,7 +51,8 @@
>> >> >  #include "mlx4_en.h"
>> >> >  #include "en_port.h"
>> >> >
>> >> > -#define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN)))
>> >> > +#define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) - \
>> >> > +                                  XDP_PACKET_HEADROOM))
>> >> >
>> >> >  int mlx4_en_setup_tc(struct net_device *dev, u8 up)
>> >> >  {
>> >> > @@ -1551,6 +1552,7 @@ int mlx4_en_start_port(struct net_device *dev)
>> >> >         struct mlx4_en_tx_ring *tx_ring;
>> >> >         int rx_index = 0;
>> >> >         int err = 0;
>> >> > +       int mtu;
>> >> >         int i, t;
>> >> >         int j;
>> >> >         u8 mc_list[16] = {0};
>> >> > @@ -1684,8 +1686,12 @@ int mlx4_en_start_port(struct net_device *dev)
>> >> >         }
>> >> >
>> >> >         /* Configure port */
>> >> > +       mtu = priv->rx_skb_size + ETH_FCS_LEN;
>> >> > +       if (priv->tx_ring_num[TX_XDP])
>> >> > +               mtu += XDP_PACKET_HEADROOM;
>> >> > +
>> >>
>> >> Why would the physical MTU care for the headroom you preserve for XDP prog?
>> >> This is the wire MTU, it shouldn't be changed, please keep it as
>> >> before, any preservation you make in packets buffers are needed only
>> >> for FWD case or modify case (HW or wire should not care about them).
>> >
>> > Thanks for your feedback!
>>
>> Just doing my job :))
>>
>> >
>> > FWD:
>> > packet received from a port
>> > => process by a XDP prog
>> > => XDP_TX out to the same port.
>> >
>> > For example, if the received packet has 1500 payload and the XDP prog
>> > encapsulates it in an IPv6 header (+40 bytes).  After testing, it cannot
>> > be sent out due to the HW/wire MTU is 1500.
>> >
>> > Even the wire MTU info was passed to the XDP prog, there is not much a
>> > XDP prog could do here other than dropping it.
>> >
>> > Hence, this patch gives guarantee to the XDP prog such that
>> > it can always send out what it has received + XDP_PACKET_HEADROOM.
>> >
>>
>> Still i am not convinced ! this is against common sense,
>> this means that the XDP prog can send packets larger than the  MTU
>> seen on netdev!
>>
>> anyway if a packet with the size (MTU + XDP_PACKET_HEADROOM) was sent
>> from XDP ring and HW allowed it to exit somehow (with the code you
>> provided :)), most likely it will be dropped
>> at the other end.
> The MTU of our receiver side is larger than 1500.
>
> If the otherside could not handle >1500, we could lower the box running
> XDP prog to 1460.
>

This is exactly the user confusion we are trying to avoid.

Genuinely lowering the other side, or dropping packets in the XDP program
that are not eligible for edit & FWD (packets > MTU - required headroom),
will create the same effect. Why don't you use this approach?

Dropping "large" packets in XDP seems the best solution.

> Just ensure we are on the same page.  The rx MTU stays the same (1500)
> because the rx_desc's byte_count is not raised by XDP_PACKET_HEADROOM.
>

Yea it is clear,

One more reason not to do this: packets that used to be dropped by the HW
for the "large MTU" drop cause will now pass the HW check but fail with an
RX error (the RX buffers are smaller than a wire-MTU-sized packet). This
counts as an error in both mlx5/4, which is not acceptable.

>>
>> I still think XDP prog should not be allowed to FW packets larger than
>> the MTU seen on the netdev and you shouldn't modify the wire MTU just
>> for this case.
>>
>> >>
>> >> >         err = mlx4_SET_PORT_general(mdev->dev, priv->port,
>> >> > -                                   priv->rx_skb_size + ETH_FCS_LEN,
>> >> > +                                   mtu,
>> >> >                                     priv->prof->tx_pause,
>> >> >                                     priv->prof->tx_ppp,
>> >> >                                     priv->prof->rx_pause,
>> >> > @@ -2255,6 +2261,13 @@ static bool mlx4_en_check_xdp_mtu(struct net_device *dev, int mtu)
>> >> >  {
>> >> >         struct mlx4_en_priv *priv = netdev_priv(dev);
>> >> >
>> >> > +       if (mtu + XDP_PACKET_HEADROOM > priv->max_mtu) {
>> >> > +               en_err(priv,
>> >> > +                      "Device max mtu:%d does not allow %d bytes reserved headroom for XDP prog\n",
>> >> > +                      priv->max_mtu, XDP_PACKET_HEADROOM);
>> >> > +               return false;
>> >> > +       }
>> >> > +
>> >> >         if (mtu > MLX4_EN_MAX_XDP_MTU) {
>> >> >                 en_err(priv, "mtu:%d > max:%d when XDP prog is attached\n",
>> >> >                        mtu, MLX4_EN_MAX_XDP_MTU);
>> >> > diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
>> >> > index 23e9d04d1ef4..324771ac929e 100644
>> >> > --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
>> >> > +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
>> >> > @@ -96,7 +96,6 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
>> >> >         struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS];
>> >> >         const struct mlx4_en_frag_info *frag_info;
>> >> >         struct page *page;
>> >> > -       dma_addr_t dma;
>> >> >         int i;
>> >> >
>> >> >         for (i = 0; i < priv->num_frags; i++) {
>> >> > @@ -115,9 +114,10 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
>> >> >
>> >> >         for (i = 0; i < priv->num_frags; i++) {
>> >> >                 frags[i] = ring_alloc[i];
>> >> > -               dma = ring_alloc[i].dma + ring_alloc[i].page_offset;
>> >> > +               frags[i].page_offset += priv->frag_info[i].rx_headroom;
>> >>
>> >> I don't see any need for headroom on frag_info other than frag0 (which
>> >> is where the packet starts).
>> >> What is the meaning of a headroom of a frag in a middle of a packet ?
>> >>
>> >> if you agree with me then, you can use XDP_PACKET_HEADROOM as is where
>> >> needed (i.e frag0 page offset) and remove
>> >> "priv->frag_info[i].rx_headroom"
>> >>
>> >> ...
>> >>
>> >> After going through the code a little bit i see that this code is
>> >> shared between XDP and common path, and you didn't want to add boolean
>> >> conditions.
>> >>
>> >> Ok i see what you did here.
>> >>
>> >> Maybe we can pass headroom as a function parameter and split frag0
>> >> handling from the rest ?
>> >> If it is too much then i am ok with the code as it is,
>> > Right, this patch does the boolean check (XDP active or not) early on
>> > in mlx4_en_calc_rx_buf() (i.e. out of the fast path) and store
>> > the result in priv->frag_info[0].rx_headroom.
>> >
>> > Just want to ensure I understand your comment correctly.
>> > You prefer not to store the boolean test result in frag_info[0].rx_headroom
>> > since it is redundant to !!priv->tx_ring_num[TX_XDP] and rx_headroom is also
>> > confusing for frag[1-3].
>> >
>> > Instead, do the XDP [in]active test before calling mlx4_en_alloc_frags()
>> > and then only adjust frags[0].page_offset by +XDP_PACKET_HEADROOM if is needed.
>> > It could be done either by passing an extra argument to mlx4_en_alloc_frags()
>> > or completely separate mlx4_en_alloc_frags().  I am fine with this also.
>> >
>>
>> Correct, but if this change will add extra checks to the data path
>> then I am ok with the current code.
> Right, the check has to be done somewhere in the data path.
> Lets stay with the current approach then.
>
>>
>> >
>> >>
>> >> > +               rx_desc->data[i].addr = cpu_to_be64(frags[i].dma +
>> >> > +                                                   frags[i].page_offset);
>> >> >                 ring_alloc[i] = page_alloc[i];
>> >> > -               rx_desc->data[i].addr = cpu_to_be64(dma);
>> >> >         }
>> >> >
>> >> >         return 0;
>> >> > @@ -250,7 +250,8 @@ static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
>> >> >
>> >> >         if (ring->page_cache.index > 0) {
>> >> >                 frags[0] = ring->page_cache.buf[--ring->page_cache.index];
>> >> > -               rx_desc->data[0].addr = cpu_to_be64(frags[0].dma);
>> >> > +               rx_desc->data[0].addr = cpu_to_be64(frags[0].dma +
>> >> > +                                                   frags[0].page_offset);
>> >> >                 return 0;
>> >> >         }
>> >> >
>> >> > @@ -889,6 +890,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
>> >> >                 if (xdp_prog) {
>> >> >                         struct xdp_buff xdp;
>> >> >                         dma_addr_t dma;
>> >> > +                       void *pg_addr, *orig_data;
>> >> >                         u32 act;
>> >> >
>> >> >                         dma = be64_to_cpu(rx_desc->data[0].addr);
>> >> > @@ -896,11 +898,18 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
>> >> >                                                 priv->frag_info[0].frag_size,
>> >> >                                                 DMA_FROM_DEVICE);
>> >> >
>> >> > -                       xdp.data = page_address(frags[0].page) +
>> >> > -                                                       frags[0].page_offset;
>> >> > +                       pg_addr = page_address(frags[0].page);
>> >> > +                       orig_data = pg_addr + frags[0].page_offset;
>> >> > +                       xdp.data = orig_data;
>> >> >                         xdp.data_end = xdp.data + length;
>> >> >
>> >> >                         act = bpf_prog_run_xdp(xdp_prog, &xdp);
>> >> > +
>> >> > +                       if (xdp.data != orig_data) {
>> >> > +                               length = xdp.data_end - xdp.data;
>> >> > +                               frags[0].page_offset = xdp.data - pg_addr;
>> >> > +                       }
>> >> > +
>> >> >
>> >>
>> >> is this needed only for XDP FWD case ?
>> > No. It is also for PASS.
>> >
>>
>> I see.
>>
>> >> is this the only way to detect that the user modified the packet
>> >> headers (comparing pointers, before and after) ?
>> > Yes
>> >
>> >>
>> >> if the answer is yes, it should be faster to unconditionally reset
>> >> packet offset and length on XDP_FWD :
>> >> case XDP_FWD:
>> >>    length = xdp.data_end - xdp.data;
>> >>    frags[0].page_offset = xdp.data - pg_addr;
>> >>
>> >>
>> >> >                         switch (act) {
>> >> >                         case XDP_PASS:
>> >> >                                 break;
>> >> > @@ -1180,6 +1189,7 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
>> >> >                  */
>> >> >                 priv->frag_info[0].frag_stride = PAGE_SIZE;
>> >> >                 priv->frag_info[0].dma_dir = PCI_DMA_BIDIRECTIONAL;
>> >> > +               priv->frag_info[0].rx_headroom = XDP_PACKET_HEADROOM;
>> >> >                 i = 1;
>> >> >         } else {
>> >> >                 int buf_size = 0;
>> >> > @@ -1194,6 +1204,7 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
>> >> >                                 ALIGN(priv->frag_info[i].frag_size,
>> >> >                                       SMP_CACHE_BYTES);
>> >> >                         priv->frag_info[i].dma_dir = PCI_DMA_FROMDEVICE;
>> >> > +                       priv->frag_info[i].rx_headroom = 0;
>> >>
>> >> IMHO, redundant. as you see here frag0 and other frags handling are
>> >> separated, maybe we can do the same in mlx4_en_alloc_frags.
>> >>
>> >> >                         buf_size += priv->frag_info[i].frag_size;
>> >> >                         i++;
>> >> >                 }
>> >> > diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
>> >> > index 4b597dca5c52..9e5f38cefe5f 100644
>> >> > --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
>> >> > +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
>> >> > @@ -354,7 +354,7 @@ u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv,
>> >> >         struct mlx4_en_rx_alloc frame = {
>> >> >                 .page = tx_info->page,
>> >> >                 .dma = tx_info->map0_dma,
>> >> > -               .page_offset = 0,
>> >> > +               .page_offset = XDP_PACKET_HEADROOM,
>> >> >                 .page_size = PAGE_SIZE,
>> >> >         };
>> >> >
>> >> > @@ -1132,7 +1132,7 @@ netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring,
>> >> >         tx_info->page = frame->page;
>> >> >         frame->page = NULL;
>> >> >         tx_info->map0_dma = dma;
>> >> > -       tx_info->map0_byte_count = length;
>> >> > +       tx_info->map0_byte_count = length + frame->page_offset;
>> >>
>> >> Didn't you already take care of length by the following code:
>> >>                        if (xdp.data != orig_data) {
>> >>                                length = xdp.data_end - xdp.data;
>> >>                                frags[0].page_offset = xdp.data - pg_addr;
>> >>                         }
>> >>
>> > Before this patch, length always assumes the data starts at the beginning
>> > of the page and dma is the start of the page.  Hence, adding
>> > frame->page_offset back to the length here.
>> >
>> > However, if I read the codes correctly, I think the map0_byte_count (before or
>> > after this patch) does not matter since it is only used in dma_unmap_page() and
>> > PAGE_SIZE is always used in dma_unmap_page() for this code path.  Hence, I think
>> > we can just set map0_byte_count to PAGE_SIZE here.
>> >
>>
>> Right, in mlx4_alloc_pages we always map with PAGE_SIZE <<  order
>>  dma = dma_map_page(priv->ddev, page, 0, PAGE_SIZE << order,
>>   frag_info->dma_dir);
>> for XDP order is always 0, so you can safely set it to PAGE_SIZE.
>>
>> >> and here  frame->page_offset is not really page offset, it can only be
>> >> XDP_PACKET_HEADROOM.
>> > Note that the XDP prog can call bpf_xdp_adjust_head() to add a header.
>> > The XDP prog can extend up to XDP_PACKET_HEADROOM (256) bytes but it
>> > can also (and usually) only add 40 bytes IPv6 header and then XDP_TX it out.
>> >
>>
>> I see.
>>
>> >>
>> >> >         tx_info->nr_txbb = nr_txbb;
>> >> >         tx_info->nr_bytes = max_t(unsigned int, length, ETH_ZLEN);
>> >> >         tx_info->data_offset = (void *)data - (void *)tx_desc;
>> >> > @@ -1141,9 +1141,10 @@ netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring,
>> >> >         tx_info->linear = 1;
>> >> >         tx_info->inl = 0;
>> >> >
>> >> > -       dma_sync_single_for_device(priv->ddev, dma, length, PCI_DMA_TODEVICE);
>> >> > +       dma_sync_single_range_for_device(priv->ddev, dma, frame->page_offset,
>> >> > +                                        length, PCI_DMA_TODEVICE);
>> >> >
>> >> > -       data->addr = cpu_to_be64(dma);
>> >> > +       data->addr = cpu_to_be64(dma + frame->page_offset);
>> >> >         data->lkey = ring->mr_key;
>> >> >         dma_wmb();
>> >> >         data->byte_count = cpu_to_be32(length);
>> >> > diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
>> >> > index 20a936428f4a..ba1c6cd0cc79 100644
>> >> > --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
>> >> > +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
>> >> > @@ -475,7 +475,8 @@ struct mlx4_en_frag_info {
>> >> >         u16 frag_prefix_size;
>> >> >         u32 frag_stride;
>> >> >         enum dma_data_direction dma_dir;
>> >> > -       int order;
>> >> > +       u16 order;
>> >> > +       u16 rx_headroom;
>> >> >  };
>> >> >
>> >> >  #ifdef CONFIG_MLX4_EN_DCB
>> >> > --
>> >> > 2.5.1
>> >> >

^ permalink raw reply

* [PATCH net] netvsc: reduce maximum GSO size
From: Stephen Hemminger @ 2016-12-06 21:43 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger

Hyper-V (and Azure) support using NVGRE which requires some extra space
for encapsulation headers. Because of this the largest allowed TSO
packet is reduced.

For older releases, hard code a fixed reduced value.  For next release,
there is a better solution which uses result of host offload
negotiation.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
---
Please queue this for stable as well.

 drivers/net/hyperv/netvsc_drv.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index f638215..c9140c3 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -47,6 +47,10 @@
 				 NETIF_F_TSO | \
 				 NETIF_F_TSO6 | \
 				 NETIF_F_HW_CSUM)
+
+/* Restrict GSO size to account for NVGRE */
+#define NETVSC_GSO_MAX_SIZE	62768
+
 static int ring_size = 128;
 module_param(ring_size, int, S_IRUGO);
 MODULE_PARM_DESC(ring_size, "Ring buffer size (# of pages)");
@@ -1400,6 +1404,7 @@ static int netvsc_probe(struct hv_device *dev,
 	nvdev = net_device_ctx->nvdev;
 	netif_set_real_num_tx_queues(net, nvdev->num_chn);
 	netif_set_real_num_rx_queues(net, nvdev->num_chn);
+	netif_set_gso_max_size(net, NETVSC_GSO_MAX_SIZE);
 
 	ret = register_netdev(net);
 	if (ret != 0) {
-- 
2.10.2

^ permalink raw reply related

* Re: [Intel-wired-lan] [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
From: tndave @ 2016-12-06 22:04 UTC (permalink / raw)
  To: Alexander Duyck; +Cc: Jeff Kirsher, intel-wired-lan, Netdev
In-Reply-To: <CAKgT0UfTP+BrvDBzUJAVr9-DRCKgM7T3aS=LgRic8UZz8x82eg@mail.gmail.com>



On 12/06/2016 09:10 AM, Alexander Duyck wrote:
> On Mon, Dec 5, 2016 at 2:23 PM, tndave <tushar.n.dave@oracle.com> wrote:
>>
>>
>> On 12/05/2016 01:54 PM, Alexander Duyck wrote:
>>>
>>> On Mon, Dec 5, 2016 at 9:07 AM, Tushar Dave <tushar.n.dave@oracle.com>
>>> wrote:
>>>>
>>>> Unlike previous generation NIC (e.g. ixgbe) i40e doesn't seem to have
>>>> standard CSR where PCIe relaxed ordering can be set. Without PCIe relax
>>>> ordering enabled, i40e performance is significantly low on SPARC.
>>>>
>>>> This patch sets PCIe relax ordering for SPARC arch by setting dma attr
>>>> DMA_ATTR_WEAK_ORDERING for every tx and rx DMA map/unmap.
>>>> This has shown 10x increase in performance numbers.
>>>>
>>>> e.g.
>>>> iperf TCP test with 10 threads on SPARC S7
>>>>
>>>> Test 1: Without this patch
>>>>
>>>> # iperf -s
>>>> ------------------------------------------------------------
>>>> Server listening on TCP port 5001
>>>> TCP window size: 85.3 KByte (default)
>>>> ------------------------------------------------------------
>>>> [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40926
>>>> [  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40934
>>>> [  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40930
>>>> [  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40928
>>>> [  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40922
>>>> [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40932
>>>> [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40920
>>>> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40924
>>>> [ 14] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40982
>>>> [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40980
>>>> [ ID] Interval       Transfer     Bandwidth
>>>> [  4]  0.0-20.0 sec   566 MBytes   237 Mbits/sec
>>>> [  5]  0.0-20.0 sec   532 MBytes   223 Mbits/sec
>>>> [  6]  0.0-20.0 sec   537 MBytes   225 Mbits/sec
>>>> [  8]  0.0-20.0 sec   546 MBytes   229 Mbits/sec
>>>> [ 11]  0.0-20.0 sec   592 MBytes   248 Mbits/sec
>>>> [  7]  0.0-20.0 sec   539 MBytes   226 Mbits/sec
>>>> [  9]  0.0-20.0 sec   572 MBytes   240 Mbits/sec
>>>> [ 10]  0.0-20.0 sec   604 MBytes   253 Mbits/sec
>>>> [ 14]  0.0-20.0 sec   567 MBytes   238 Mbits/sec
>>>> [ 12]  0.0-20.0 sec   511 MBytes   214 Mbits/sec
>>>> [SUM]  0.0-20.0 sec  5.44 GBytes  2.33 Gbits/sec
>>>>
>>>> Test 2: with this patch:
>>>>
>>>> # iperf -s
>>>> ------------------------------------------------------------
>>>> Server listening on TCP port 5001
>>>> TCP window size: 85.3 KByte (default)
>>>> ------------------------------------------------------------
>>>> TCP: request_sock_TCP: Possible SYN flooding on port 5001. Sending
>>>> cookies.  Check SNMP counters.
>>>> [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46876
>>>> [  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46874
>>>> [  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46872
>>>> [  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46880
>>>> [  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46878
>>>> [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46884
>>>> [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46886
>>>> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46890
>>>> [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46888
>>>> [ 13] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46882
>>>> [ ID] Interval       Transfer     Bandwidth
>>>> [  4]  0.0-20.0 sec  7.45 GBytes  3.19 Gbits/sec
>>>> [  5]  0.0-20.0 sec  7.48 GBytes  3.21 Gbits/sec
>>>> [  7]  0.0-20.0 sec  7.34 GBytes  3.15 Gbits/sec
>>>> [  8]  0.0-20.0 sec  7.42 GBytes  3.18 Gbits/sec
>>>> [  9]  0.0-20.0 sec  7.24 GBytes  3.11 Gbits/sec
>>>> [ 10]  0.0-20.0 sec  7.40 GBytes  3.17 Gbits/sec
>>>> [ 12]  0.0-20.0 sec  7.49 GBytes  3.21 Gbits/sec
>>>> [  6]  0.0-20.0 sec  7.30 GBytes  3.13 Gbits/sec
>>>> [ 11]  0.0-20.0 sec  7.44 GBytes  3.19 Gbits/sec
>>>> [ 13]  0.0-20.0 sec  7.22 GBytes  3.10 Gbits/sec
>>>> [SUM]  0.0-20.0 sec  73.8 GBytes  31.6 Gbits/sec
>>>>
>>>> NOTE: In my testing, this patch does _not_ show any harm to i40e
>>>> performance numbers on x86.
>>>>
>>>> Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
>>>
>>>
>>> You went through and replaced all of the dma_unmap/map_page calls with
>>> dma_map/unmap_single_attrs.  I would prefer you didn't do that.  I have
>>
>> Yes, because currently there is no DMA API for dma_map/unmap_page with dma
>> attr*
>>>
>>> patches to add the ability to map and unmap pages with attributes that
>>> should be available for 4.10-rc1 so if you could wait on this patch
>>> until then it would be preferred.
>>
>> :-) thanks. I will wait until your patches are out.
>>
>>>
>>>> ---
>>>>  drivers/net/ethernet/intel/i40e/i40e_txrx.c | 69
>>>> ++++++++++++++++++++---------
>>>>  drivers/net/ethernet/intel/i40e/i40e_txrx.h |  1 +
>>>>  2 files changed, 49 insertions(+), 21 deletions(-)
>>>>
>>>> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>>>> b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>>>> index 6287bf6..800dca7 100644
>>>> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>>>> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>>>> @@ -551,15 +551,17 @@ static void i40e_unmap_and_free_tx_resource(struct
>>>> i40e_ring *ring,
>>>>                 else
>>>>                         dev_kfree_skb_any(tx_buffer->skb);
>>>>                 if (dma_unmap_len(tx_buffer, len))
>>>> -                       dma_unmap_single(ring->dev,
>>>> -                                        dma_unmap_addr(tx_buffer, dma),
>>>> -                                        dma_unmap_len(tx_buffer, len),
>>>> -                                        DMA_TO_DEVICE);
>>>> +                       dma_unmap_single_attrs(ring->dev,
>>>> +                                              dma_unmap_addr(tx_buffer,
>>>> dma),
>>>> +                                              dma_unmap_len(tx_buffer,
>>>> len),
>>>> +                                              DMA_TO_DEVICE,
>>>> +                                              ring->dma_attrs);
>>>>         } else if (dma_unmap_len(tx_buffer, len)) {
>>>> -               dma_unmap_page(ring->dev,
>>>> -                              dma_unmap_addr(tx_buffer, dma),
>>>> -                              dma_unmap_len(tx_buffer, len),
>>>> -                              DMA_TO_DEVICE);
>>>> +               dma_unmap_single_attrs(ring->dev,
>>>> +                                      dma_unmap_addr(tx_buffer, dma),
>>>> +                                      dma_unmap_len(tx_buffer, len),
>>>> +                                      DMA_TO_DEVICE,
>>>> +                                      ring->dma_attrs);
>>>>         }
>>>>
>>>>         tx_buffer->next_to_watch = NULL;
>>>> @@ -662,6 +664,8 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>>>         struct i40e_tx_buffer *tx_buf;
>>>>         struct i40e_tx_desc *tx_head;
>>>>         struct i40e_tx_desc *tx_desc;
>>>> +       dma_addr_t addr;
>>>> +       size_t size;
>>>>         unsigned int total_bytes = 0, total_packets = 0;
>>>>         unsigned int budget = vsi->work_limit;
>>>>
>>>> @@ -696,10 +700,11 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>>>                 napi_consume_skb(tx_buf->skb, napi_budget);
>>>>
>>>>                 /* unmap skb header data */
>>>> -               dma_unmap_single(tx_ring->dev,
>>>> -                                dma_unmap_addr(tx_buf, dma),
>>>> -                                dma_unmap_len(tx_buf, len),
>>>> -                                DMA_TO_DEVICE);
>>>> +               dma_unmap_single_attrs(tx_ring->dev,
>>>> +                                      dma_unmap_addr(tx_buf, dma),
>>>> +                                      dma_unmap_len(tx_buf, len),
>>>> +                                      DMA_TO_DEVICE,
>>>> +                                      tx_ring->dma_attrs);
>>>>
>>>>                 /* clear tx_buffer data */
>>>>                 tx_buf->skb = NULL;
>>>> @@ -717,12 +722,15 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>>>                                 tx_desc = I40E_TX_DESC(tx_ring, 0);
>>>>                         }
>>>>
>>>> +                       addr = dma_unmap_addr(tx_buf, dma);
>>>> +                       size = dma_unmap_len(tx_buf, len);
>>>
>>>
>>> On some architectures this change could lead to issues since
>>> dma_unmap_len could be 0 meaning that addr would never be used.
>>
>> I see. Thanks.
>>
>>>
>>>>                         /* unmap any remaining paged data */
>>>>                         if (dma_unmap_len(tx_buf, len)) {
>>>> -                               dma_unmap_page(tx_ring->dev,
>>>> -                                              dma_unmap_addr(tx_buf,
>>>> dma),
>>>> -                                              dma_unmap_len(tx_buf,
>>>> len),
>>>> -                                              DMA_TO_DEVICE);
>>>> +                               dma_unmap_single_attrs(tx_ring->dev,
>>>> +                                                      addr,
>>>> +                                                      size,
>>>> +                                                      DMA_TO_DEVICE,
>>>> +
>>>> tx_ring->dma_attrs);
>>>>                                 dma_unmap_len_set(tx_buf, len, 0);
>>>>                         }
>>>>                 }
>>>> @@ -1010,6 +1018,11 @@ int i40e_setup_tx_descriptors(struct i40e_ring
>>>> *tx_ring)
>>>>          */
>>>>         tx_ring->size += sizeof(u32);
>>>>         tx_ring->size = ALIGN(tx_ring->size, 4096);
>>>> +#ifdef CONFIG_SPARC
>>>> +       tx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
>>>> +#else
>>>> +       tx_ring->dma_attrs = 0;
>>>> +#endif
>>>>         tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
>>>>                                            &tx_ring->dma, GFP_KERNEL);
>>>>         if (!tx_ring->desc) {
>>>
>>>
>>> Also not a fan of adding yet another ring attribute.  Is there any reason why
>>> you couldn't simply add a set of inline functions at the start of
>>> i40e_txrx.c that could replace the DMA map/unmap operations in this
>>> code but pass either 0 or DMA_ATTR_WEAK_ORDERING as needed for the
>>> drivers?  Then the x86 code doesn't have to change while the SPARC
>>> code will be able to be passed the attribute.
>>
>> Sure I can do that.
>>
>> I will follow up with patch after your patches for map/unmap page with dma
>> attr will be out.
>>
>> Thanks.
>>
>> -Tushar
>>
>
> I was thinking about it and I realized we can probably simplify this
> even further.  In the case of most other architectures the
> DMA_ATTR_WEAK_ORDERING has no effect anyway.  So from what I can
> tell there is probably no reason not to just always pass that
> attribute with the DMA mappings.  From what I can tell the only
> other architecture that uses this is the PowerPC Cell architecture.
Yes, besides SPARC64, only PowerPC Cell architecture uses
DMA_ATTR_WEAK_ORDERING; I guess it should be okay to always pass
DMA_ATTR_WEAK_ORDERING.
>
> Also I was wondering if you actually needed to enable this attribute
> for both Rx and Tx buffers or just Rx buffers?  The patch that
> enabled DMA_ATTR_WEAK_ORDERING for Sparc64 seems to call out writes,
> but I didn't see anything about reads.  I'm just wondering if
> changing the code for Tx has any effect?  If not you could probably
> drop those changes and just focus on Rx.
The patch I sent enabled DMA_ATTR_WEAK_ORDERING for sparc64 so that
write to & read from both rx and tx dma buffers can be relaxed order.

Passing DMA_ATTR_WEAK_ORDERING for tx dma buff doesn't have the same
impact as it has with DMA_ATTR_WEAK_ORDERING and rx dma buffers.
However, I can only confirm if DMA_ATTR_WEAK_ORDERING is not needed at
all for tx dma buffer after collecting some more data!

Thanks.

-Tushar

>
> Thanks.
>
> - Alex
>

^ permalink raw reply

* Re: [PATCH v2 net-next 3/4] mlx4: xdp: Reserve headroom for receiving packet when XDP prog is active
From: Martin KaFai Lau @ 2016-12-06 22:25 UTC (permalink / raw)
  To: Saeed Mahameed
  Cc: Linux Netdev List, Alexei Starovoitov, Brenden Blanco,
	Daniel Borkmann, David Miller, Jesper Dangaard Brouer,
	Saeed Mahameed, Tariq Toukan, Kernel Team
In-Reply-To: <CALzJLG_+1fAv=i2uRF7cd_GZh2v66Pry=HusHmh6a_Nfq78CHg@mail.gmail.com>

On Tue, Dec 06, 2016 at 11:40:19PM +0200, Saeed Mahameed wrote:
> On Tue, Dec 6, 2016 at 8:27 PM, Martin KaFai Lau <kafai@fb.com> wrote:
> > On Tue, Dec 06, 2016 at 06:50:47PM +0200, Saeed Mahameed wrote:
> >> On Mon, Dec 5, 2016 at 9:55 PM, Martin KaFai Lau <kafai@fb.com> wrote:
> >> > On Mon, Dec 05, 2016 at 02:54:06AM +0200, Saeed Mahameed wrote:
> >> >> On Sun, Dec 4, 2016 at 5:17 AM, Martin KaFai Lau <kafai@fb.com> wrote:
> >> >> > Reserve XDP_PACKET_HEADROOM and honor bpf_xdp_adjust_head()
> >> >> > when XDP prog is active.  This patch only affects the code
> >> >> > path when XDP is active.
> >> >> >
> >> >> > Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> >> >> > ---
> >> >>
> >> >> Hi Martin, Sorry for the late review, i have some comments below
> >> >>
> >> >> >  drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 17 +++++++++++++++--
> >> >> >  drivers/net/ethernet/mellanox/mlx4/en_rx.c     | 23 +++++++++++++++++------
> >> >> >  drivers/net/ethernet/mellanox/mlx4/en_tx.c     |  9 +++++----
> >> >> >  drivers/net/ethernet/mellanox/mlx4/mlx4_en.h   |  3 ++-
> >> >> >  4 files changed, 39 insertions(+), 13 deletions(-)
> >> >> >
> >> >> > diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> >> >> > index 311c14153b8b..094a13b52cf6 100644
> >> >> > --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> >> >> > +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> >> >> > @@ -51,7 +51,8 @@
> >> >> >  #include "mlx4_en.h"
> >> >> >  #include "en_port.h"
> >> >> >
> >> >> > -#define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN)))
> >> >> > +#define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) - \
> >> >> > +                                  XDP_PACKET_HEADROOM))
> >> >> >
> >> >> >  int mlx4_en_setup_tc(struct net_device *dev, u8 up)
> >> >> >  {
> >> >> > @@ -1551,6 +1552,7 @@ int mlx4_en_start_port(struct net_device *dev)
> >> >> >         struct mlx4_en_tx_ring *tx_ring;
> >> >> >         int rx_index = 0;
> >> >> >         int err = 0;
> >> >> > +       int mtu;
> >> >> >         int i, t;
> >> >> >         int j;
> >> >> >         u8 mc_list[16] = {0};
> >> >> > @@ -1684,8 +1686,12 @@ int mlx4_en_start_port(struct net_device *dev)
> >> >> >         }
> >> >> >
> >> >> >         /* Configure port */
> >> >> > +       mtu = priv->rx_skb_size + ETH_FCS_LEN;
> >> >> > +       if (priv->tx_ring_num[TX_XDP])
> >> >> > +               mtu += XDP_PACKET_HEADROOM;
> >> >> > +
> >> >>
> >> >> Why would the physical MTU care for the headroom you preserve for XDP prog?
> >> >> This is the wire MTU, it shouldn't be changed, please keep it as
> >> >> before, any preservation you make in packets buffers are needed only
> >> >> for FWD case or modify case (HW or wire should not care about them).
> >> >
> >> > Thanks for your feedback!
> >>
> >> Just doing my job :))
> >>
> >> >
> >> > FWD:
> >> > packet received from a port
> >> > => process by a XDP prog
> >> > => XDP_TX out to the same port.
> >> >
> >> > For example, if the received packet has 1500 payload and the XDP prog
> >> > encapsulates it in an IPv6 header (+40 bytes).  After testing, it cannot
> >> > be sent out due to the HW/wire MTU is 1500.
> >> >
> >> > Even the wire MTU info was passed to the XDP prog, there is not much a
> >> > XDP prog could do here other than dropping it.
> >> >
> >> > Hence, this patch gives guarantee to the XDP prog such that
> >> > it can always send out what it has received + XDP_PACKET_HEADROOM.
> >> >
> >>
> >> Still i am not convinced ! this is against common sense,
> >> this means that the XDP prog can send packets larger than the  MTU
> >> seen on netdev!
> >>
> >> anyway if a packet with the size (MTU + XDP_PACKET_HEADROOM) was sent
> >> from XDP ring and HW allowed it to exit somehow (with the code you
> >> provided :)), most likely it will be dropped
> >> at the other end.
> > The MTU of our receiver side is larger than 1500.
> >
> > If the other side could not handle >1500, we could lower the MTU of the box
> > running the XDP prog to 1460.
> >
>
> This is exactly the user confusion we are trying to avoid.
>
> Genuinely lowering the other side or dropping packets in XDP program
> that are not eligible for edit&FWD (packets > MTU - required headroom
> )  will create the same effect. why don't you use this approach ?
>
> dropping "large" packets in XDP seems the best solution.
Within the DC, yes we have absolute control on what to expect and we can even
lower the other end easily if it is needed.  However, it may not be the case
for machines sitting at some exotic location.

After this thread, I think this bit may require more thoughts/discussions.
I will drop it now and revisit later since it is not user ABI related.

For now, lets check and drop at the driver side since the driver has the MTU
info.

>
> > Just ensure we are on the same page.  The rx MTU stays the same (1500)
> > because the rx_desc's byte_count is not raised by XDP_PACKET_HEADROOM.
> >
>
> Yea it is clear,
>
> One more reason not to do this: now packets that were dropped due to
> "large MTU" HW drop cause, will now pass the HW check but will fail on
> RX error (RX buffers are smaller than the wire MTU sized packet) this
> counts as an error in both mlx5/4 which is not acceptable.
>
> >>
> >> I still think XDP prog should not be allowed to FW packets larger than
> >> the MTU seen on the netdev and you shouldn't modify the wire MTU just
> >> for this case.
> >>
> >> >>
> >> >> >         err = mlx4_SET_PORT_general(mdev->dev, priv->port,
> >> >> > -                                   priv->rx_skb_size + ETH_FCS_LEN,
> >> >> > +                                   mtu,
> >> >> >                                     priv->prof->tx_pause,
> >> >> >                                     priv->prof->tx_ppp,
> >> >> >                                     priv->prof->rx_pause,
> >> >> > @@ -2255,6 +2261,13 @@ static bool mlx4_en_check_xdp_mtu(struct net_device *dev, int mtu)
> >> >> >  {
> >> >> >         struct mlx4_en_priv *priv = netdev_priv(dev);
> >> >> >
> >> >> > +       if (mtu + XDP_PACKET_HEADROOM > priv->max_mtu) {
> >> >> > +               en_err(priv,
> >> >> > +                      "Device max mtu:%d does not allow %d bytes reserved headroom for XDP prog\n",
> >> >> > +                      priv->max_mtu, XDP_PACKET_HEADROOM);
> >> >> > +               return false;
> >> >> > +       }
> >> >> > +
> >> >> >         if (mtu > MLX4_EN_MAX_XDP_MTU) {
> >> >> >                 en_err(priv, "mtu:%d > max:%d when XDP prog is attached\n",
> >> >> >                        mtu, MLX4_EN_MAX_XDP_MTU);
> >> >> > diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> >> >> > index 23e9d04d1ef4..324771ac929e 100644
> >> >> > --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> >> >> > +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> >> >> > @@ -96,7 +96,6 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
> >> >> >         struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS];
> >> >> >         const struct mlx4_en_frag_info *frag_info;
> >> >> >         struct page *page;
> >> >> > -       dma_addr_t dma;
> >> >> >         int i;
> >> >> >
> >> >> >         for (i = 0; i < priv->num_frags; i++) {
> >> >> > @@ -115,9 +114,10 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
> >> >> >
> >> >> >         for (i = 0; i < priv->num_frags; i++) {
> >> >> >                 frags[i] = ring_alloc[i];
> >> >> > -               dma = ring_alloc[i].dma + ring_alloc[i].page_offset;
> >> >> > +               frags[i].page_offset += priv->frag_info[i].rx_headroom;
> >> >>
> >> >> I don't see any need for headroom on frag_info other than frag0 (which
> >> >> is where the packet starts).
> >> >> What is the meaning of a headroom of a frag in a middle of a packet ?
> >> >>
> >> >> if you agree with me then, you can use XDP_PACKET_HEADROOM as is where
> >> >> needed (i.e frag0 page offset) and remove
> >> >> "priv->frag_info[i].rx_headroom"
> >> >>
> >> >> ...
> >> >>
> >> >> After going through the code a little bit i see that this code is
> >> >> shared between XDP and common path, and you didn't want to add boolean
> >> >> conditions.
> >> >>
> >> >> Ok i see what you did here.
> >> >>
> >> >> Maybe we can pass headroom as a function parameter and split frag0
> >> >> handling from the rest ?
> >> >> If it is too much then i am ok with the code as it is,
> >> > Right, this patch does the boolean check (XDP active or not) early on
> >> > in mlx4_en_calc_rx_buf() (i.e. out of the fast path) and store
> >> > the result in priv->frag_info[0].rx_headroom.
> >> >
> >> > Just want to ensure I understand your comment correctly.
> >> > You prefer not to store the boolean test result in frag_info[0].rx_headroom
> >> > since it is redundant to !!priv->tx_ring_num[TX_XDP] and rx_headroom is also
> >> > confusing for frag[1-3].
> >> >
> >> > Instead, do the XDP [in]active test before calling mlx4_en_alloc_frags()
> >> > and then only adjust frags[0].page_offset by +XDP_PACKET_HEADROOM if is needed.
> >> > It could be done either by passing an extra argument to mlx4_en_alloc_frags()
> >> > or completely separate mlx4_en_alloc_frags().  I am fine with this also.
> >> >
> >>
> >> Correct, but if this change will add extra checks to the data path
> >> then I am ok with the current code.
> > Right, the check has to be done somewhere in the data path.
> > Lets stay with the current approach then.
> >
> >>
> >> >
> >> >>
> >> >> > +               rx_desc->data[i].addr = cpu_to_be64(frags[i].dma +
> >> >> > +                                                   frags[i].page_offset);
> >> >> >                 ring_alloc[i] = page_alloc[i];
> >> >> > -               rx_desc->data[i].addr = cpu_to_be64(dma);
> >> >> >         }
> >> >> >
> >> >> >         return 0;
> >> >> > @@ -250,7 +250,8 @@ static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
> >> >> >
> >> >> >         if (ring->page_cache.index > 0) {
> >> >> >                 frags[0] = ring->page_cache.buf[--ring->page_cache.index];
> >> >> > -               rx_desc->data[0].addr = cpu_to_be64(frags[0].dma);
> >> >> > +               rx_desc->data[0].addr = cpu_to_be64(frags[0].dma +
> >> >> > +                                                   frags[0].page_offset);
> >> >> >                 return 0;
> >> >> >         }
> >> >> >
> >> >> > @@ -889,6 +890,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
> >> >> >                 if (xdp_prog) {
> >> >> >                         struct xdp_buff xdp;
> >> >> >                         dma_addr_t dma;
> >> >> > +                       void *pg_addr, *orig_data;
> >> >> >                         u32 act;
> >> >> >
> >> >> >                         dma = be64_to_cpu(rx_desc->data[0].addr);
> >> >> > @@ -896,11 +898,18 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
> >> >> >                                                 priv->frag_info[0].frag_size,
> >> >> >                                                 DMA_FROM_DEVICE);
> >> >> >
> >> >> > -                       xdp.data = page_address(frags[0].page) +
> >> >> > -                                                       frags[0].page_offset;
> >> >> > +                       pg_addr = page_address(frags[0].page);
> >> >> > +                       orig_data = pg_addr + frags[0].page_offset;
> >> >> > +                       xdp.data = orig_data;
> >> >> >                         xdp.data_end = xdp.data + length;
> >> >> >
> >> >> >                         act = bpf_prog_run_xdp(xdp_prog, &xdp);
> >> >> > +
> >> >> > +                       if (xdp.data != orig_data) {
> >> >> > +                               length = xdp.data_end - xdp.data;
> >> >> > +                               frags[0].page_offset = xdp.data - pg_addr;
> >> >> > +                       }
> >> >> > +
> >> >> >
> >> >>
> >> >> is this needed only for XDP FWD case ?
> >> > No. It is also for PASS.
> >> >
> >>
> >> I see.
> >>
> >> >> is this the only way to detect that the user modified the packet
> >> >> headers (comparing pointers, before and after) ?
> >> > Yes
> >> >
> >> >>
> >> >> if the answer is yes, it should be faster to unconditionally reset
> >> >> packet offset and lenght on XDP_FWD :
> >> >> case XDP_FWD:
> >> >>    length = xdp.data_end - xdp.data;
> >> >>    frags[0].page_offset = xdp.data - pg_addr;
> >> >>
> >> >>
> >> >> >                         switch (act) {
> >> >> >                         case XDP_PASS:
> >> >> >                                 break;
> >> >> > @@ -1180,6 +1189,7 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
> >> >> >                  */
> >> >> >                 priv->frag_info[0].frag_stride = PAGE_SIZE;
> >> >> >                 priv->frag_info[0].dma_dir = PCI_DMA_BIDIRECTIONAL;
> >> >> > +               priv->frag_info[0].rx_headroom = XDP_PACKET_HEADROOM;
> >> >> >                 i = 1;
> >> >> >         } else {
> >> >> >                 int buf_size = 0;
> >> >> > @@ -1194,6 +1204,7 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
> >> >> >                                 ALIGN(priv->frag_info[i].frag_size,
> >> >> >                                       SMP_CACHE_BYTES);
> >> >> >                         priv->frag_info[i].dma_dir = PCI_DMA_FROMDEVICE;
> >> >> > +                       priv->frag_info[i].rx_headroom = 0;
> >> >>
> >> >> IMHO, redundant. as you see here frag0 and other frags handling are
> >> >> separated, maybe we can do the same in mlx4_en_alloc_frags.
> >> >>
> >> >> >                         buf_size += priv->frag_info[i].frag_size;
> >> >> >                         i++;
> >> >> >                 }
> >> >> > diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
> >> >> > index 4b597dca5c52..9e5f38cefe5f 100644
> >> >> > --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
> >> >> > +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
> >> >> > @@ -354,7 +354,7 @@ u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv,
> >> >> >         struct mlx4_en_rx_alloc frame = {
> >> >> >                 .page = tx_info->page,
> >> >> >                 .dma = tx_info->map0_dma,
> >> >> > -               .page_offset = 0,
> >> >> > +               .page_offset = XDP_PACKET_HEADROOM,
> >> >> >                 .page_size = PAGE_SIZE,
> >> >> >         };
> >> >> >
> >> >> > @@ -1132,7 +1132,7 @@ netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring,
> >> >> >         tx_info->page = frame->page;
> >> >> >         frame->page = NULL;
> >> >> >         tx_info->map0_dma = dma;
> >> >> > -       tx_info->map0_byte_count = length;
> >> >> > +       tx_info->map0_byte_count = length + frame->page_offset;
> >> >>
> >> >> Didn't you already take care of lenght by the following code:
> >> >>                        if (xdp.data != orig_data) {
> >> >>                                length = xdp.data_end - xdp.data;
> >> >>                                frags[0].page_offset = xdp.data - pg_addr;
> >> >>                         }
> >> >>
> >> > Before this patch, length always assumes the data starts at the beginning
> >> > of the page and dma is the start of the page.  Hence, adding
> >> > framg->page_offset back to the length here.
> >> >
> >> > However, if I read the codes correctly, I think the map0_byte_count (before or
> >> > after this patch) does not matter since it is only used in dma_unmap_page() and
> >> > PAGE_SIZE is always used in dma_unmap_page() for this code patch.  Hence, I think
> >> > we can just set map0_byte_count to PAGE_SIZE here.
> >> >
> >>
> >> Right, in mlx4_alloc_pages we always map with PAGE_SIZE <<  order
> >>  dma = dma_map_page(priv->ddev, page, 0, PAGE_SIZE << order,
> >>   frag_info->dma_dir);
> >> for XDP order is always 0, so you can safely set it to PAGE_SIZE.
> >>
> >> >> and here  frame->page_offset is not really page offset, it can only be
> >> >> XDP_PACKET_HEADROOM.
> >> > Note that the XDP prog can call bpf_xdp_adjust_head() to add a header.
> >> > The XDP prog can extend up to XDP_PACKET_HEADROOM (256) bytes but it
> >> > can also (and usually) only add 40 bytes IPv6 header and then XDP_TX it out.
> >> >
> >>
> >> I see.
> >>
> >> >>
> >> >> >         tx_info->nr_txbb = nr_txbb;
> >> >> >         tx_info->nr_bytes = max_t(unsigned int, length, ETH_ZLEN);
> >> >> >         tx_info->data_offset = (void *)data - (void *)tx_desc;
> >> >> > @@ -1141,9 +1141,10 @@ netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring,
> >> >> >         tx_info->linear = 1;
> >> >> >         tx_info->inl = 0;
> >> >> >
> >> >> > -       dma_sync_single_for_device(priv->ddev, dma, length, PCI_DMA_TODEVICE);
> >> >> > +       dma_sync_single_range_for_device(priv->ddev, dma, frame->page_offset,
> >> >> > +                                        length, PCI_DMA_TODEVICE);
> >> >> >
> >> >> > -       data->addr = cpu_to_be64(dma);
> >> >> > +       data->addr = cpu_to_be64(dma + frame->page_offset);
> >> >> >         data->lkey = ring->mr_key;
> >> >> >         dma_wmb();
> >> >> >         data->byte_count = cpu_to_be32(length);
> >> >> > diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
> >> >> > index 20a936428f4a..ba1c6cd0cc79 100644
> >> >> > --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
> >> >> > +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
> >> >> > @@ -475,7 +475,8 @@ struct mlx4_en_frag_info {
> >> >> >         u16 frag_prefix_size;
> >> >> >         u32 frag_stride;
> >> >> >         enum dma_data_direction dma_dir;
> >> >> > -       int order;
> >> >> > +       u16 order;
> >> >> > +       u16 rx_headroom;
> >> >> >  };
> >> >> >
> >> >> >  #ifdef CONFIG_MLX4_EN_DCB
> >> >> > --
> >> >> > 2.5.1
> >> >> >

^ permalink raw reply

* [PATCH net-next 0/2] Add ethtool set regs support
From: Saeed Mahameed @ 2016-12-06 22:33 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, John W . Linville, Saeed Mahameed

Hi Dave,

This series adds the support for setting device registers from user
space ethtool.

Currently ethtool only allows to get device registers,
we extend ethtool functionality to also set device registers, by
introducing set_regs to ethtool_ops which will be invoked when
user space requests "ETHTOOL_SREGS", for example via ethtool user app:

ethtool -D DEVNAME [ file FILENAME ] is used to set registers in
the device using vendor specific binary registers data provided via
stdin/file. Changes made by this option can be queried using get
regs -d flag.

This simple ethtool change will give HW vendors the flexibility to set
pure HW configurations (not directly related to netdev resources states
and rings), without the need of vendor proprietary tools and hacks.

2nd patch adds the support for ethtool set/get_regs in mlx5e driver.

Important Note: With this extension we will allow HW vendors to access (set) their 
device register without the need for them to open their format, hence the binary
file passed on ethtool -D DEVNAME.

This means that the device driver MUST check for correctness/validity of the
registers data sent to it and whether this register is permitted to be set from user space
in order to prevent the user from accessing/setting registers/Device configurations
that are already standardized by the kernel/stack user APIs, or not allowed to be seen/set by user.

The mlx5 driver has a list of registers to which access is allowed and will check the validity of the user
request before forwarding it to HW registers. Mlx5 will allow only mlx5 specific
configurations to be set (e.g. Device Diag Counters for HW performance debugging and analysis)
which have no standard API to access them.

Comments and redirections are more than welcome

This series was generated against commit:
b0da4f743db5 ("net: calxeda: xgmac: use new api ethtool_{get|set}_link_ksettings")

Thanks,
Saeed.

Gal Pressman (2):
  ethtool: Add set regs -D option support
  net/mlx5e: Add ethtool get/set reg support

 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c      |  19 ----
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  12 +++
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |  21 ++++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   8 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_regs.c  | 116 +++++++++++++++++++++
 include/linux/ethtool.h                            |   1 +
 include/linux/mlx5/mlx5_ifc.h                      |  22 ++++
 include/uapi/linux/ethtool.h                       |   1 +
 net/core/ethtool.c                                 |  31 ++++++
 10 files changed, 213 insertions(+), 20 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_regs.c

-- 
2.7.4

^ permalink raw reply

* [PATCH net-next 2/2] net/mlx5e: Add ethtool get/set reg support
From: Saeed Mahameed @ 2016-12-06 22:33 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, John W . Linville, Gal Pressman, Dmitry Teif,
	Saeed Mahameed
In-Reply-To: <1481063590-7727-1-git-send-email-saeedm@mellanox.com>

From: Gal Pressman <galp@mellanox.com>

Add ethtool -[dD] callbacks support for get and set registers.
This interface allows users to query and change device registers.

Add the support for set/get DIAGNOSTIC_PARAMS/COUNTERS registers.

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Dmitry Teif <dimat@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c      |  19 ----
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  12 +++
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |  21 ++++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   8 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_regs.c  | 116 +++++++++++++++++++++
 include/linux/mlx5/mlx5_ifc.h                      |  22 ++++
 7 files changed, 180 insertions(+), 20 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_regs.c

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 9f43beb..b24564c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -8,6 +8,6 @@ mlx5_core-y :=	main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
 mlx5_core-$(CONFIG_MLX5_CORE_EN) += wq.o eswitch.o eswitch_offloads.o \
 		en_main.o en_common.o en_fs.o en_ethtool.o en_tx.o \
 		en_rx.o en_rx_am.o en_txrx.o en_clock.o vxlan.o \
-		en_tc.o en_arfs.o en_rep.o en_fs_ethtool.o en_selftest.o
+		en_tc.o en_arfs.o en_rep.o en_fs_ethtool.o en_selftest.o en_regs.o
 
 mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) +=  en_dcbnl.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index b0448b5..f8b6c83 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -650,25 +650,6 @@ static int cmd_status_to_err(u8 status)
 	}
 }
 
-struct mlx5_ifc_mbox_out_bits {
-	u8         status[0x8];
-	u8         reserved_at_8[0x18];
-
-	u8         syndrome[0x20];
-
-	u8         reserved_at_40[0x40];
-};
-
-struct mlx5_ifc_mbox_in_bits {
-	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
-
-	u8         reserved_at_20[0x10];
-	u8         op_mod[0x10];
-
-	u8         reserved_at_40[0x40];
-};
-
 void mlx5_cmd_mbox_status(void *out, u8 *status, u32 *syndrome)
 {
 	*status = MLX5_GET(mbox_out, out, status);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 63dd639..fcc296b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -659,6 +659,11 @@ struct mlx5e_tir {
 	struct list_head  list;
 };
 
+struct mlx5e_reg {
+	u8  data_in[MLX5_ST_SZ_BYTES(mbox_in)];
+	u8 *data_out;
+};
+
 enum {
 	MLX5E_TC_PRIO = 0,
 	MLX5E_NIC_PRIO
@@ -713,6 +718,7 @@ struct mlx5e_priv {
 	struct mlx5e_stats         stats;
 	struct mlx5e_tstamp        tstamp;
 	u16 q_counter;
+	struct mlx5e_reg          *reg;
 #ifdef CONFIG_MLX5_CORE_EN_DCB
 	struct mlx5e_dcbx          dcbx;
 #endif
@@ -803,6 +809,12 @@ int mlx5e_get_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
 void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params,
 				 u8 cq_period_mode);
 
+struct mlx5e_reg *mlx5e_regs_init(void);
+int mlx5e_regs_set(struct net_device *dev, void *buff, int inlen);
+void mlx5e_regs_get(struct net_device *dev, void *buff);
+int mlx5e_regs_get_len(void);
+void mlx5e_regs_destroy(struct mlx5e_reg *reg);
+
 static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq,
 				      struct mlx5_wqe_ctrl_seg *ctrl, int bf_sz)
 {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 352462a..6adc9ea 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -1552,6 +1552,23 @@ static u32 mlx5e_get_priv_flags(struct net_device *netdev)
 	return priv->params.pflags;
 }
 
+static int mlx5e_get_regs_len(struct net_device *dev)
+{
+	return mlx5e_regs_get_len();
+}
+
+static void mlx5e_get_regs(struct net_device *dev, struct ethtool_regs *regs,
+			   void *buff)
+{
+	mlx5e_regs_get(dev, buff);
+}
+
+static int mlx5e_set_regs(struct net_device *dev, struct ethtool_regs *regs,
+			  u8 *data)
+{
+	return mlx5e_regs_set(dev, data, regs->len);
+}
+
 static int mlx5e_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd)
 {
 	int err = 0;
@@ -1605,4 +1622,8 @@ const struct ethtool_ops mlx5e_ethtool_ops = {
 	.get_priv_flags    = mlx5e_get_priv_flags,
 	.set_priv_flags    = mlx5e_set_priv_flags,
 	.self_test         = mlx5e_self_test,
+	.get_regs_len      = mlx5e_get_regs_len,
+	.get_regs          = mlx5e_get_regs,
+	.set_regs          = mlx5e_set_regs,
+
 };
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 9def5cc..e1905ba 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3686,6 +3686,11 @@ static void mlx5e_nic_init(struct mlx5_core_dev *mdev,
 
 	mlx5e_build_nic_netdev_priv(mdev, netdev, profile, ppriv);
 	mlx5e_build_nic_netdev(netdev);
+
+	priv->reg = mlx5e_regs_init();
+	if (!priv->reg)
+		mlx5_core_warn(mdev, "Failed to allocate mlx5e_reg\n");
+
 	mlx5e_vxlan_init(priv);
 }
 
@@ -3696,6 +3701,9 @@ static void mlx5e_nic_cleanup(struct mlx5e_priv *priv)
 
 	mlx5e_vxlan_cleanup(priv);
 
+	if (priv->reg)
+		mlx5e_regs_destroy(priv->reg);
+
 	if (MLX5_CAP_GEN(mdev, vport_group_manager))
 		mlx5_eswitch_unregister_vport_rep(esw, 0);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_regs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_regs.c
new file mode 100644
index 0000000..a83df1f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_regs.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx5/driver.h>
+#include "mlx5_core.h"
+#include "en.h"
+
+#define MLX5E_MAX_REG_LEN             4096
+#define MLX5E_MAX_CMD_OUT_LEN (MLX5E_MAX_REG_LEN - MLX5_ST_SZ_BYTES(mbox_in))
+
+static void reg_out_alloc(struct mlx5e_reg *reg)
+{
+	if (reg->data_out) {
+		memset(reg->data_out, 0, MLX5E_MAX_CMD_OUT_LEN);
+		return;
+	}
+
+	reg->data_out = mlx5_vzalloc(MLX5E_MAX_CMD_OUT_LEN);
+}
+
+struct mlx5e_reg *mlx5e_regs_init(void)
+{
+	return kzalloc(sizeof(struct mlx5e_reg), GFP_KERNEL);
+}
+
+void mlx5e_regs_destroy(struct mlx5e_reg *reg)
+{
+	kvfree(reg->data_out);
+	kfree(reg);
+}
+
+static bool opcode_valid(u16 opcode)
+{
+	switch (opcode) {
+	case MLX5_CMD_OP_QUERY_HCA_CAP:
+	case MLX5_CMD_OP_QUERY_DIAGNOSTIC_PARAMS:
+	case MLX5_CMD_OP_SET_DIAGNOSTIC_PARAMS:
+	case MLX5_CMD_OP_QUERY_DIAGNOSTICS_COUNTERS:
+		return true;
+	}
+
+	return false;
+}
+
+int mlx5e_regs_set(struct net_device *dev, void *buff, int inlen)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5e_reg *reg = priv->reg;
+	u16 opcode;
+
+	if (!reg)
+		return -ENOMEM;
+
+	opcode = MLX5_GET(mbox_in, buff, opcode);
+	if (!opcode_valid(opcode))
+		return -EINVAL;
+
+	reg_out_alloc(reg);
+	if (!reg->data_out)
+		return -ENOMEM;
+
+	memcpy(reg->data_in, buff, sizeof(reg->data_in));
+
+	return mlx5_cmd_exec(mdev, buff, inlen, reg->data_out,
+			     MLX5E_MAX_CMD_OUT_LEN);
+}
+
+void mlx5e_regs_get(struct net_device *dev, void *buff)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5e_reg *reg = priv->reg;
+
+	if (!reg)
+		return;
+
+	if (reg->data_out) {
+		memcpy(buff, reg->data_in, sizeof(reg->data_in));
+		memcpy(buff + sizeof(reg->data_in), reg->data_out,
+		       MLX5E_MAX_CMD_OUT_LEN);
+	}
+}
+
+int mlx5e_regs_get_len(void)
+{
+	return MLX5E_MAX_REG_LEN;
+}
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index a5f0fbe..9738b70 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -169,6 +169,9 @@ enum {
 	MLX5_CMD_OP_DEALLOC_XRCD                  = 0x80f,
 	MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN        = 0x816,
 	MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN      = 0x817,
+	MLX5_CMD_OP_QUERY_DIAGNOSTIC_PARAMS       = 0x819,
+	MLX5_CMD_OP_SET_DIAGNOSTIC_PARAMS         = 0x820,
+	MLX5_CMD_OP_QUERY_DIAGNOSTICS_COUNTERS    = 0x821,
 	MLX5_CMD_OP_QUERY_CONG_STATUS             = 0x822,
 	MLX5_CMD_OP_MODIFY_CONG_STATUS            = 0x823,
 	MLX5_CMD_OP_QUERY_CONG_PARAMS             = 0x824,
@@ -230,6 +233,25 @@ enum {
 	MLX5_CMD_OP_MAX
 };
 
+struct mlx5_ifc_mbox_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_mbox_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0x40];
+};
+
 struct mlx5_ifc_flow_table_fields_supported_bits {
 	u8         outer_dmac[0x1];
 	u8         outer_smac[0x1];
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next 1/2] ethtool: Add set regs -D option support
From: Saeed Mahameed @ 2016-12-06 22:33 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, John W . Linville, Gal Pressman, Dmitry Teif,
	Saeed Mahameed
In-Reply-To: <1481063590-7727-1-git-send-email-saeedm@mellanox.com>

From: Gal Pressman <galp@mellanox.com>

Currently ethtool only allows us to get device registers, in this patch
we extend this functionality to also set device registers.
ethtool -D DEVNAME [ file FILENAME ] is used to set registers in
the device using vendor specific binary registers data provided via
stdin/file. Changes made by this option can be queried using get
regs -d flag.

Example:
$ ethtool -D eth1 file /tmp/mlx5_regs

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Dmitry Teif <dimat@mellanox.com>
CC: John W. Linville <linville@tuxdriver.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/ethtool.h      |  1 +
 include/uapi/linux/ethtool.h |  1 +
 net/core/ethtool.c           | 31 +++++++++++++++++++++++++++++++
 3 files changed, 33 insertions(+)

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 9ded8c6..c9f5d37 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -305,6 +305,7 @@ struct ethtool_ops {
 	void	(*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *);
 	int	(*get_regs_len)(struct net_device *);
 	void	(*get_regs)(struct net_device *, struct ethtool_regs *, void *);
+	int	(*set_regs)(struct net_device *, struct ethtool_regs *, u8 *);
 	void	(*get_wol)(struct net_device *, struct ethtool_wolinfo *);
 	int	(*set_wol)(struct net_device *, struct ethtool_wolinfo *);
 	u32	(*get_msglevel)(struct net_device *);
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index f0db778..f81c6fd 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1330,6 +1330,7 @@ struct ethtool_per_queue_op {
 #define ETHTOOL_SLINKSETTINGS	0x0000004d /* Set ethtool_link_settings */
 #define ETHTOOL_PHY_GTUNABLE	0x0000004e /* Get PHY tunable configuration */
 #define ETHTOOL_PHY_STUNABLE	0x0000004f /* Set PHY tunable configuration */
+#define ETHTOOL_SREGS		0x00000050 /* Set NIC registers */
 
 /* compatibility with older code */
 #define SPARC_ETH_GSET		ETHTOOL_GSET
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index e23766c..5548565 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1424,6 +1424,34 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
 	return ret;
 }
 
+static int ethtool_set_regs(struct net_device *dev, char __user *useraddr)
+{
+	void __user *userbuf = useraddr + offsetof(struct ethtool_regs, data);
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_regs regs;
+	int ret = 0;
+	u8 *data;
+
+	if (!ops->set_regs || !ops->get_regs_len)
+		return -EOPNOTSUPP;
+	if (copy_from_user(&regs, useraddr, sizeof(regs)))
+		return -EFAULT;
+
+	data = kmalloc(PAGE_SIZE, GFP_USER);
+	if (!data)
+		return -ENOMEM;
+
+	ret = -EFAULT;
+	if (copy_from_user(data, userbuf, regs.len))
+		goto out;
+
+	ret = ops->set_regs(dev, &regs, data);
+
+out:
+	kfree(data);
+	return ret;
+}
+
 static int ethtool_reset(struct net_device *dev, char __user *useraddr)
 {
 	struct ethtool_value reset;
@@ -2597,6 +2625,9 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_GREGS:
 		rc = ethtool_get_regs(dev, useraddr);
 		break;
+	case ETHTOOL_SREGS:
+		rc = ethtool_set_regs(dev, useraddr);
+		break;
 	case ETHTOOL_GWOL:
 		rc = ethtool_get_wol(dev, useraddr);
 		break;
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH net-next 7/7] bnxt_en: Add interface to support RDMA driver.
From: kbuild test robot @ 2016-12-06 22:33 UTC (permalink / raw)
  To: Michael Chan
  Cc: kbuild-all, davem, netdev, selvin.xavier, somnath.kotur, dledford,
	linux-rdma
In-Reply-To: <1481044178-25193-8-git-send-email-michael.chan@broadcom.com>

[-- Attachment #1: Type: text/plain, Size: 5469 bytes --]

Hi Michael,

[auto build test WARNING on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Michael-Chan/bnxt_en-Add-interface-to-support-RDMA-driver/20161207-053721
config: i386-allmodconfig (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

Note: it may well be a FALSE warning. FWIW you are at least aware of it now.
http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings

All warnings (new ones prefixed by >>):

   drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c: In function 'bnxt_unregister_dev':
>> drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c:97:4: warning: 'i' may be used uninitialized in this function [-Wmaybe-uninitialized]
      i++;
      ~^~
   drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c: In function 'bnxt_ulp_stop':
>> drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c:219:6: warning: 'ops' may be used uninitialized in this function [-Wmaybe-uninitialized]
      if (!ops || !ops->ulp_stop)
         ^

vim +/i +97 drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c

    91		RCU_INIT_POINTER(ulp->ulp_ops, NULL);
    92		synchronize_rcu();
    93		ulp->max_async_event_id = 0;
    94		ulp->async_events_bmap = NULL;
    95		while (atomic_read(&ulp->ref_count) != 0 && i < 10) {
    96			msleep(100);
  > 97			i++;
    98		}
    99		return 0;
   100	}
   101	
   102	static int bnxt_req_msix_vecs(struct bnxt_en_dev *edev, int ulp_id,
   103				      struct bnxt_msix_entry *ent, int num_msix)
   104	{
   105		struct net_device *dev = edev->net;
   106		struct bnxt *bp = netdev_priv(dev);
   107		int max_idx, max_cp_rings;
   108		int avail_msix, i, idx;
   109	
   110		ASSERT_RTNL();
   111		if (ulp_id != BNXT_ROCE_ULP)
   112			return -EINVAL;
   113	
   114		if (!(bp->flags & BNXT_FLAG_USING_MSIX))
   115			return -ENODEV;
   116	
   117		max_cp_rings = bnxt_get_max_func_cp_rings(bp);
   118		max_idx = min_t(int, bp->total_irqs, max_cp_rings);
   119		avail_msix = max_idx - bp->cp_nr_rings;
   120		if (!avail_msix)
   121			return -ENOMEM;
   122		if (avail_msix > num_msix)
   123			avail_msix = num_msix;
   124	
   125		idx = max_idx - avail_msix;
   126		for (i = 0; i < avail_msix; i++) {
   127			ent[i].vector = bp->irq_tbl[idx + i].vector;
   128			ent[i].ring_idx = idx + i;
   129			ent[i].db_offset = (idx + i) * 0x80;
   130		}
   131		bnxt_set_max_func_irqs(bp, max_idx - avail_msix);
   132		bnxt_set_max_func_cp_rings(bp, max_cp_rings - avail_msix);
   133		edev->ulp_tbl[ulp_id].msix_requested = avail_msix;
   134		return avail_msix;
   135	}
   136	
   137	static int bnxt_free_msix_vecs(struct bnxt_en_dev *edev, int ulp_id)
   138	{
   139		struct net_device *dev = edev->net;
   140		struct bnxt *bp = netdev_priv(dev);
   141		int max_cp_rings, msix_requested;
   142	
   143		ASSERT_RTNL();
   144		if (ulp_id != BNXT_ROCE_ULP)
   145			return -EINVAL;
   146	
   147		max_cp_rings = bnxt_get_max_func_cp_rings(bp);
   148		msix_requested = edev->ulp_tbl[ulp_id].msix_requested;
   149		bnxt_set_max_func_cp_rings(bp, max_cp_rings + msix_requested);
   150		edev->ulp_tbl[ulp_id].msix_requested = 0;
   151		bnxt_set_max_func_irqs(bp, bp->total_irqs);
   152		return 0;
   153	}
   154	
   155	void bnxt_subtract_ulp_resources(struct bnxt *bp, int ulp_id)
   156	{
   157		ASSERT_RTNL();
   158		if (bnxt_ulp_registered(bp->edev, ulp_id)) {
   159			struct bnxt_en_dev *edev = bp->edev;
   160			unsigned int msix_req, max;
   161	
   162			msix_req = edev->ulp_tbl[ulp_id].msix_requested;
   163			max = bnxt_get_max_func_cp_rings(bp);
   164			bnxt_set_max_func_cp_rings(bp, max - msix_req);
   165			max = bnxt_get_max_func_stat_ctxs(bp);
   166			bnxt_set_max_func_stat_ctxs(bp, max - 1);
   167		}
   168	}
   169	
   170	static int bnxt_send_msg(struct bnxt_en_dev *edev, int ulp_id,
   171				 struct bnxt_fw_msg *fw_msg)
   172	{
   173		struct net_device *dev = edev->net;
   174		struct bnxt *bp = netdev_priv(dev);
   175		struct input *req;
   176		int rc;
   177	
   178		mutex_lock(&bp->hwrm_cmd_lock);
   179		req = fw_msg->msg;
   180		req->resp_addr = cpu_to_le64(bp->hwrm_cmd_resp_dma_addr);
   181		rc = _hwrm_send_message(bp, fw_msg->msg, fw_msg->msg_len,
   182					fw_msg->timeout);
   183		if (!rc) {
   184			struct output *resp = bp->hwrm_cmd_resp_addr;
   185			u32 len = le16_to_cpu(resp->resp_len);
   186	
   187			if (fw_msg->resp_max_len < len)
   188				len = fw_msg->resp_max_len;
   189	
   190			memcpy(fw_msg->resp, resp, len);
   191		}
   192		mutex_unlock(&bp->hwrm_cmd_lock);
   193		return rc;
   194	}
   195	
   196	static void bnxt_ulp_get(struct bnxt_ulp *ulp)
   197	{
   198		atomic_inc(&ulp->ref_count);
   199	}
   200	
   201	static void bnxt_ulp_put(struct bnxt_ulp *ulp)
   202	{
   203		atomic_dec(&ulp->ref_count);
   204	}
   205	
   206	void bnxt_ulp_stop(struct bnxt *bp)
   207	{
   208		struct bnxt_en_dev *edev = bp->edev;
   209		struct bnxt_ulp_ops *ops;
   210		int i;
   211	
   212		if (!edev)
   213			return;
   214	
   215		for (i = 0; i < BNXT_MAX_ULP; i++) {
   216			struct bnxt_ulp *ulp = &edev->ulp_tbl[i];
   217	
   218			rtnl_dereference(ulp->ulp_ops);
 > 219			if (!ops || !ops->ulp_stop)
   220				continue;
   221			ops->ulp_stop(ulp->handle);
   222		}

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 57018 bytes --]

^ permalink raw reply

* Re: [PATCH nf-next] netfilter: xt_bpf: support ebpf
From: Willem de Bruijn @ 2016-12-06 22:44 UTC (permalink / raw)
  To: Florian Westphal
  Cc: Willem de Bruijn, Pablo Neira Ayuso, netfilter-devel,
	Network Development, Daniel Borkmann, Eric Dumazet
In-Reply-To: <20161206002027.GD16819@breakpoint.cc>

On Mon, Dec 5, 2016 at 7:20 PM, Florian Westphal <fw@strlen.de> wrote:
> Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:
>> While we're discussing the patch, another question, about revisions: I
>> tested both modified and original iptables binaries on both standard
>> and modified kernels. It all works as expected, except for the case
>> where both binaries are used on a single kernel. For instance:
>>
>>   iptables -A OUTPUT -m bpf --bytecode "`./nfbpf_compile RAW 'udp port
>> 8000'`" -j LOG
>>   ./iptables.new -L
>>
>> Here the new binary will interpret the object as xt_bpf_match_v1, but
>> iptables has inserted xt_bpf_match. The same problem happens the other
>> way around. A new binary can be made robust to detect old structs, but
>> not the other way around. Specific to bpf, the existing xt_bpf code
>> has an unfortunate bug that it always prints at least one line of
>> code, even if ->bpf_program_num_elems == 0.
>>
>> I notice that other extensions also do not necessarily only extend
>> struct vN in vN+1. Is the above a known issue?
>
> Yes, I guess noone ever bothered to fix this.
>
> The kernel blob should contain the match/target revision number,
> so userspace can in fact see that 'this is bpf v42', but iirc
> the netfilter userspace just loads the highest userspace revision
> supported by the kernel (which is then different for the 2 iptables
> binaries).

We can fall back on not parsing contents on mismatch:

diff --git a/iptables/iptables.c b/iptables/iptables.c
index 540d111..ada7c94 100644
--- a/iptables/iptables.c
+++ b/iptables/iptables.c
@@ -504,7 +504,8 @@ print_match(const struct xt_entry_match *m,
                xtables_find_match(m->u.user.name, XTF_TRY_LOAD, NULL);

        if (match) {
-               if (match->print)
+               if (match->print &&
+                   m->u.user.revision == match->revision)
                        match->print(ip, m, numeric);
                else
                        printf("%s ", match->name);

> But we *could* display message like 'kernel uses revision 2 but I can
> only find 0 and 1' or fall back to the lower supported revision without
> guess-the-struct-by-size games.

That's a good idea. A special case printf() with a notice, then.

^ permalink raw reply related

* Re: [PATCH net-next 0/2] Add ethtool set regs support
From: Stephen Hemminger @ 2016-12-06 22:45 UTC (permalink / raw)
  To: Saeed Mahameed; +Cc: David S. Miller, netdev, John W . Linville
In-Reply-To: <1481063590-7727-1-git-send-email-saeedm@mellanox.com>

On Wed,  7 Dec 2016 00:33:08 +0200
Saeed Mahameed <saeedm@mellanox.com> wrote:

> This simple ethtool change will give HW vendors the flexibility to set
> pure HW configurations (not directly related to netdev resources states
> and rings), without the need of vendor proprietary tools and hacks.

The danger is you need to restrict the kernel to only allow setting
safe registers (and this is HW dependent).  There are cases like secure
boot where it is expected that even root is not allowed to modify
all memory.

Also supporting closed format of device registers is not in the interest
of promoting open source.

I am not saying I fundamentally disagree with supporting this, but it
is a bigger step than you make it out to be.

^ permalink raw reply

* Re: [PATCH net-next V2 1/2] net/sched: cls_flower: Add support for matching on flags
From: kbuild test robot @ 2016-12-06 23:03 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: kbuild-all, David S. Miller, netdev, Jiri Pirko, Roi Dayan,
	Hadar Har-Zion, Or Gerlitz
In-Reply-To: <1481037486-27195-2-git-send-email-ogerlitz@mellanox.com>

[-- Attachment #1: Type: text/plain, Size: 1819 bytes --]

Hi Or,

[auto build test ERROR on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Or-Gerlitz/net-sched-cls_flower-Add-support-for-matching-on-flags/20161207-012247
config: arm-allmodconfig (attached as .config)
compiler: arm-linux-gnueabi-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=arm 

Note: the linux-review/Or-Gerlitz/net-sched-cls_flower-Add-support-for-matching-on-flags/20161207-012247 HEAD 591ecce02e6ed3dab17d5c45a3f7368581c596ce builds fine.
      It only hurts bisectability.

All errors (new ones prefixed by >>):

   In file included from include/net/pkt_cls.h:4:0,
                    from drivers/net/ethernet/mellanox/mlx5/core/en_tc.c:34:
>> include/uapi/linux/pkt_cls.h:470:37: error: implicit declaration of function 'BIT' [-Werror=implicit-function-declaration]
     TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = BIT(0),
                                        ^~~
>> include/uapi/linux/pkt_cls.h:470:2: error: enumerator value for 'TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT' is not an integer constant
     TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = BIT(0),
     ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   cc1: some warnings being treated as errors

vim +/BIT +470 include/uapi/linux/pkt_cls.h

   464		__TCA_FLOWER_MAX,
   465	};
   466	
   467	#define TCA_FLOWER_MAX (__TCA_FLOWER_MAX - 1)
   468	
   469	enum {
 > 470		TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = BIT(0),
   471	};
   472	
   473	/* Match-all classifier */

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 59574 bytes --]

^ permalink raw reply

* Soft lockup in inet_put_port on 4.6
From: Tom Herbert @ 2016-12-06 23:06 UTC (permalink / raw)
  To: Linux Kernel Network Developers, Josef Bacik

Hello,

We are seeing a fair number of machines getting into soft lockup on the 4.6
kernel. As near as I can tell this is happening on the spinlock in the
bind hash bucket. When inet_csk_get_port exits and does spin_unlock_bh,
the TCP timer runs and we hit a lockup in inet_put_port (presumably on the
same lock). It seems like the lock isn't properly being unlocked
somewhere, but I don't readily see it.

Any ideas?

Thanks,
Tom

NMI watchdog: BUG: soft lockup - CPU#22 stuck for 22s! [proxygend:4152094]
Modules linked in: fuse nf_log_ipv6 ip6t_REJECT nf_reject_ipv6
nf_log_ipv4 nf_log_common xt_LOG ipt_REJECT nf_reject_ipv4 xt_limit
xt_multiport ipip ip_tunnel tunnel4 ip6_tunnel tunnel6 coretemp mptctl
mptbase cls_bpf ipmi_watchdog tcp_diag inet_diag ip6table_filter
xt_NFLOG nfnetlink_log xt_comment xt_statistic iptable_filter xt_mark
tpm_crb i2c_piix4 dm_crypt loop ipmi_devintf acpi_cpufreq iTCO_wdt
iTCO_vendor_support ipmi_si ipmi_msghandler efivars i2c_i801 sg
lpc_ich mfd_core hpilo xhci_pci xhci_hcd button nvme nvme_core
CPU: 22 PID: 4152094 Comm: proxygend Tainted: G W L
4.6.7-13_fbk3_1119_g367d67b #13
Hardware name: HP ProLiant DL380 Gen9/ProLiant DL380 Gen9, BIOS P89 12/27/2015
task: ffff88168c52d100 ti: ffff881c12fb0000 task.ti: ffff881c12fb0000
RIP: 0010:[<ffffffff810b87b8>] [<ffffffff810b87b8>]
queued_spin_lock_slowpath+0xf8/0x170
RSP: 0018:ffff883fff303da0 EFLAGS: 00000246
RAX: 0000000000000000 RBX: ffff881257163e00 RCX: 0000000000000001
RDX: ffff883fff375e40 RSI: 00000000005c0000 RDI: ffffc90018d6bae0
RBP: ffff883fff303da0 R08: ffff883fff315e40 R09: 0000000000000000
R10: 0000000000000020 R11: 00000000000001c0 R12: ffffc90018d6bae0
R13: ffffffff820f8a80 R14: ffff881257163f30 R15: 0000000000000000
FS: 00007fa7bb7ff700(0000) GS:ffff883fff300000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007ff4be114d90 CR3: 000000243f99c000 CR4: 00000000003406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Stack: ffff883fff303db0 ffffffff817e5910 ffff883fff303dd8 ffffffff8172f6b4
ffff881257163e00 0000000000000007 0000000000000004 ffff883fff303e00
ffffffff81733237 ffff881257163e00 0000000000000000 ffffffff81ce7cc0
Call Trace:
<IRQ>
[<ffffffff817e5910>] _raw_spin_lock+0x20/0x30
[<ffffffff8172f6b4>] inet_put_port+0x54/0xb0
[<ffffffff81733237>] tcp_set_state+0x67/0xc0
[<ffffffff81733a33>] tcp_done+0x33/0x90
[<ffffffff81746431>] tcp_write_err+0x31/0x50
[<ffffffff81746bc9>] tcp_retransmit_timer+0x119/0x7d0
[<ffffffff81747460>] ? tcp_write_timer_handler+0x1e0/0x1e0
[<ffffffff8174730e>] tcp_write_timer_handler+0x8e/0x1e0
[<ffffffff817474c7>] tcp_write_timer+0x67/0x70
[<ffffffff810ccc35>] call_timer_fn+0x35/0x120
[<ffffffff81747460>] ? tcp_write_timer_handler+0x1e0/0x1e0
[<ffffffff810cd01c>] run_timer_softirq+0x1fc/0x2b0
[<ffffffff817e811c>] __do_softirq+0xcc/0x26c
[<ffffffff817e753c>] do_softirq_own_stack+0x1c/0x30 <EOI>
[<ffffffff8107b481>] do_softirq+0x31/0x40
[<ffffffff8107b508>] __local_bh_enable_ip+0x78/0x80
[<ffffffff817e572a>] _raw_spin_unlock_bh +0x1a/0x20
[<ffffffff81730a61>] inet_csk_get_port+0x1c1/0x5a0
[<ffffffff816c7637>] ? sock_poll+0x47/0xb0
[<ffffffff817313f5>] inet_csk_listen_start+0x65/0xc0
[<ffffffff8175ea8c>] inet_listen+0x9c/0xe0
[<ffffffff816c8560>] SyS_listen+0x80/0x90
[<ffffffff817e5adb>] entry_SYSCALL_64_fastpath+0x13/0x8f
Code: c1 ea 0c 83 e8 01 83 e2 30 48 98 48 81 c2 40 5e 01 00 48 03 14
c5 c0 d4 d1 81 4c 89 02 41 8b 40 08 85 c0 75 0a f3 90 41 8b 40 08 <85>
c0 74 f6 4d 8b 08 4d 85 c9 74 08 41 0f 0d 09 eb 02 f3 90 8b

^ permalink raw reply

* Re: commit : ppp: add rtnetlink device creation support - breaks netcf on my machine.
From: Guillaume Nault @ 2016-12-06 23:08 UTC (permalink / raw)
  To: Brad Campbell; +Cc: netdev, Thomas Graf, David Miller
In-Reply-To: <5d537b7e-97e9-709c-7b3e-61280cc264f8@fnarfbargle.com>

(Cc Thomas and David)

On Tue, Dec 06, 2016 at 03:47:20PM +0800, Brad Campbell wrote:
> On 06/12/16 01:53, Guillaume Nault wrote:
> > > 
> > Probably not a mistake on your side. I've started looking at netcf'
> > source code, but haven't found anything that could explain your issue.
> > It'd really help if you could provide steps to reproduce the bug.
> 
> Further to my message this morning, I started with a clean linux.git
> 4.9.0-rc7-00198-g0cb65c8 and did two runs. One untouched and one with the
> identified patch reverted. I logged both of these with NLCB=debug, then
> split out the ppp section and diffed them.
> 
> It appears the only difference of note is the new ATTR 18. I did a diff of
> the entire dump for both and nothing else popped out.
> 
Thanks for the detailed report. Things are getting clear now.

> 
> brad@test:~$ diff -u ppp-ok ppp-fail
> --- ppp-ok	2016-12-06 13:32:04.358393578 +0800
> +++ ppp-fail	2016-12-06 13:32:18.577864406 +0800
> @@ -1,10 +1,10 @@
>  --------------------------   BEGIN NETLINK MESSAGE
> ---------------------------
>    [HEADER] 16 octets
> -    .nlmsg_len = 628
> +    .nlmsg_len = 644
>      .nlmsg_type = 16 <route/link::new>
>      .nlmsg_flags = 2 <MULTI>
> -    .nlmsg_seq = 1481001940
> -    .nlmsg_pid = 7462
> +    .nlmsg_seq = 1481002252
> +    .nlmsg_pid = 7376
>    [PAYLOAD] 16 octets
>      00 00 00 02 0a 00 00 00 d1 10 01 00 00 00 00 00       ................
>    [ATTR 03] 5 octets
> @@ -71,6 +71,8 @@
>      00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> ..................
>      00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> ..................
>      00 00 00 00 00 00                                     ......
> +  [ATTR 18] 12 octets
> +    08 00 01 00 70 70 70 00 04 00 02 00                   ....ppp.....
>    [ATTR 26] 132 octets
>      84 00 02 00 80 00 01 00 01 00 00 00 00 00 00 00 00 00
> ..................
>      00 00 01 00 00 00 01 00 00 00 01 00 00 00 01 00 00 00
> ..................
> @@ -81,3 +83,4 @@
>      00 00 00 00 10 27 00 00 e8 03 00 00 00 00 00 00 00 00
> .....'............
>      00 00 00 00 00 00                                     ......
>  ---------------------------  END NETLINK MESSAGE
> ---------------------------
> 
'ATTR 18' is the IFLA_LINKINFO attribute. It contains two sub-attributes:
  * IFLA_INFO_KIND ('08 00 01 00 70 70 70 00'), containing the "ppp"
    string,
  * and IFLA_INFO_DATA ('04 00 02 00') which has no payload because,
    currently, ppp has no device specific data to return.

> Running with NLDBG=4 seems to generate this :
> DBG<2>: While picking up for 0x26d2e00 <route/link>, recvmsgs() returned
> -34:  (errno = Numerical result out of range)DBG<1>: Clearing cache
> 0x26d2e00 <route/link>...
> 
libnl1 rejects the IFLA_INFO_DATA attribute because it expects it to
contain a sub-attribute. Since the payload size is zero it doesn't
match the policy and parsing fails.

There's no problem with libnl3 because its policy accepts empty
payloads for NLA_NESTED attributes (see libnl3 commit 4be02ace4826 "Be
liberal when receiving an empty nested attribute").

I think empty nested attributes make perfect sense. At least we accept
them from user space since commit ea5693ccc553 ("netlink: allow empty
nested attributes"), so it should be fine to generate some from the
kernel.
OTOH, since some user space programs broke because of this, it might be
better to always add attributes in the .fill_info() callbacks, to work
around libnl1's policy. David, Thomas, do you have any opinion on this?

^ permalink raw reply

* Re: commit : ppp: add rtnetlink device creation support - breaks netcf on my machine.
From: Dan Williams @ 2016-12-06 23:12 UTC (permalink / raw)
  To: Guillaume Nault, Brad Campbell; +Cc: netdev, Thomas Graf, David Miller, thaller
In-Reply-To: <20161206230853.ukyg75cyxugdwg4a@alphalink.fr>

On Wed, 2016-12-07 at 00:08 +0100, Guillaume Nault wrote:
> (Cc Thomas and David)

CC Thomas Haller who is the current libnl maintainer...

Dan

> On Tue, Dec 06, 2016 at 03:47:20PM +0800, Brad Campbell wrote:
> > 
> > On 06/12/16 01:53, Guillaume Nault wrote:
> > > 
> > > > 
> > > > 
> > > Probably not a mistake on your side. I've started looking at
> > > netcf'
> > > source code, but haven't found anything that could explain your
> > > issue.
> > > It'd really help if you could provide steps to reproduce the bug.
> > 
> > Further to my message this morning, I started with a clean
> > linux.git
> > 4.9.0-rc7-00198-g0cb65c8 and did two runs. One untouched and one
> > with the
> > identified patch reverted. I logged both of these with NLCB=debug,
> > then
> > split out the ppp section and diffed them.
> > 
> > It appears the only difference of note is the new ATTR 18. I did a
> > diff of
> > the entire dump for both and nothing else popped out.
> > 
> Thanks for the detailed report. Things are getting clear now.
> 
> > 
> > 
> > brad@test:~$ diff -u ppp-ok ppp-fail
> > --- ppp-ok	2016-12-06 13:32:04.358393578 +0800
> > +++ ppp-fail	2016-12-06 13:32:18.577864406 +0800
> > @@ -1,10 +1,10 @@
> >  --------------------------   BEGIN NETLINK MESSAGE
> > ---------------------------
> >    [HEADER] 16 octets
> > -    .nlmsg_len = 628
> > +    .nlmsg_len = 644
> >      .nlmsg_type = 16 <route/link::new>
> >      .nlmsg_flags = 2 <MULTI>
> > -    .nlmsg_seq = 1481001940
> > -    .nlmsg_pid = 7462
> > +    .nlmsg_seq = 1481002252
> > +    .nlmsg_pid = 7376
> >    [PAYLOAD] 16 octets
> >      00 00 00 02 0a 00 00 00 d1 10 01 00 00 00 00
> > 00       ................
> >    [ATTR 03] 5 octets
> > @@ -71,6 +71,8 @@
> >      00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> > ..................
> >      00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> > ..................
> >      00 00 00 00 00 00                                     ......
> > +  [ATTR 18] 12 octets
> > +    08 00 01 00 70 70 70 00 04 00 02
> > 00                   ....ppp.....
> >    [ATTR 26] 132 octets
> >      84 00 02 00 80 00 01 00 01 00 00 00 00 00 00 00 00 00
> > ..................
> >      00 00 01 00 00 00 01 00 00 00 01 00 00 00 01 00 00 00
> > ..................
> > @@ -81,3 +83,4 @@
> >      00 00 00 00 10 27 00 00 e8 03 00 00 00 00 00 00 00 00
> > .....'............
> >      00 00 00 00 00 00                                     ......
> >  ---------------------------  END NETLINK MESSAGE
> > ---------------------------
> > 
> 'ATTR 18' is the IFLA_LINKINFO attribute. It contains two sub-
> attributes:
>   * IFLA_INFO_KIND ('08 00 01 00 70 70 70 00'), containing the "ppp"
>     string,
>   * and IFLA_INFO_DATA ('04 00 02 00') which has no payload because,
>     currently, ppp has no device specific data to return.
> 
> > 
> > Running with NLDBG=4 seems to generate this :
> > DBG<2>: While picking up for 0x26d2e00 <route/link>, recvmsgs()
> > returned
> > -34:  (errno = Numerical result out of range)DBG<1>: Clearing cache
> > 0x26d2e00 <route/link>...
> > 
> libnl1 rejects the IFLA_INFO_DATA attribute because it expects it to
> contain a sub-attribute. Since the payload size is zero it doesn't
> match the policy and parsing fails.
> 
> There's no problem with libnl3 because its policy accepts empty
> payloads for NLA_NESTED attributes (see libnl3 commit 4be02ace4826
> "Be
> liberal when receiving an empty nested attribute").
> 
> I think empty nested attributes make perfect sense. At least we
> accept
> them from user space since commit ea5693ccc553 ("netlink: allow empty
> nested attributes"), so it should be fine to generate some from the
> kernel.
> OTOH, since some user space programs broke because of this, it might
> be
> better to always add attributes in the .fill_info() callbacks, to
> work
> around libnl1's policy. David, Thomas, do you have any opinion on
> this?

^ permalink raw reply

* ixgbe Port cannot load, "failed to register GSI"
From: Ben Greear @ 2016-12-06 23:22 UTC (permalink / raw)
  To: netdev

We put 3 10-g dual-port ixgbe NICs and 4 4-port I350 NICs in a 2U rackmount, and one of the ixgbe ports
fails to come up.  This previously worked before reboot, so maybe it is a race somehow.  Kernel is 4.4.11+,
but with no hacks to the ixgbe or I350 drivers.

Anyone know if there is some sort of way to make this work reliably?

dmesg | grep ixgbe

[    5.803307] ixgbe: Intel(R) 10 Gigabit PCI Express Network Driver - version 4.2.1-k
[    5.803309] ixgbe: Copyright (c) 1999-2015 Intel Corporation.
[    5.952119] ixgbe 0000:04:00.0: Multiqueue Enabled: Rx Queue count = 8, Tx Queue count = 8
[    5.952245] ixgbe 0000:04:00.0: PCI Express bandwidth of 32GT/s available
[    5.952246] ixgbe 0000:04:00.0: (Speed:5.0GT/s, Width: x8, Encoding Loss:20%)
[    5.952328] ixgbe 0000:04:00.0: MAC: 2, PHY: 15, SFP+: 5, PBA No: FFFFFF-0FF
[    5.952330] ixgbe 0000:04:00.0: 00:e0:ed:77:09:16
[    5.954004] ixgbe 0000:04:00.0: Intel(R) 10 Gigabit Network Connection
[    6.102346] ixgbe 0000:04:00.1: Multiqueue Enabled: Rx Queue count = 8, Tx Queue count = 8
[    6.102475] ixgbe 0000:04:00.1: PCI Express bandwidth of 32GT/s available
[    6.102478] ixgbe 0000:04:00.1: (Speed:5.0GT/s, Width: x8, Encoding Loss:20%)
[    6.102562] ixgbe 0000:04:00.1: MAC: 2, PHY: 15, SFP+: 6, PBA No: FFFFFF-0FF
[    6.102564] ixgbe 0000:04:00.1: 00:e0:ed:77:09:17
[    6.104869] ixgbe 0000:04:00.1: Intel(R) 10 Gigabit Network Connection
[    6.253429] ixgbe 0000:05:00.0: Multiqueue Enabled: Rx Queue count = 8, Tx Queue count = 8
[    6.253558] ixgbe 0000:05:00.0: PCI Express bandwidth of 32GT/s available
[    6.253560] ixgbe 0000:05:00.0: (Speed:5.0GT/s, Width: x8, Encoding Loss:20%)
[    6.253644] ixgbe 0000:05:00.0: MAC: 2, PHY: 15, SFP+: 5, PBA No: FFFFFF-0FF
[    6.253646] ixgbe 0000:05:00.0: 00:e0:ed:79:06:50
[    6.255855] ixgbe 0000:05:00.0: Intel(R) 10 Gigabit Network Connection
[    6.404128] ixgbe 0000:05:00.1: Multiqueue Enabled: Rx Queue count = 8, Tx Queue count = 8
[    6.404254] ixgbe 0000:05:00.1: PCI Express bandwidth of 32GT/s available
[    6.404255] ixgbe 0000:05:00.1: (Speed:5.0GT/s, Width: x8, Encoding Loss:20%)
[    6.404337] ixgbe 0000:05:00.1: MAC: 2, PHY: 15, SFP+: 6, PBA No: FFFFFF-0FF
[    6.404339] ixgbe 0000:05:00.1: 00:e0:ed:79:06:51
[    6.405914] ixgbe 0000:05:00.1: Intel(R) 10 Gigabit Network Connection
[    6.554373] ixgbe 0000:06:00.0: Multiqueue Enabled: Rx Queue count = 8, Tx Queue count = 8
[    6.554501] ixgbe 0000:06:00.0: PCI Express bandwidth of 32GT/s available
[    6.554504] ixgbe 0000:06:00.0: (Speed:5.0GT/s, Width: x8, Encoding Loss:20%)
[    6.554588] ixgbe 0000:06:00.0: MAC: 2, PHY: 15, SFP+: 5, PBA No: FFFFFF-0FF
[    6.554590] ixgbe 0000:06:00.0: 00:e0:ed:79:06:56
[    6.556994] ixgbe 0000:06:00.0: Intel(R) 10 Gigabit Network Connection
[    6.557160] ixgbe 0000:06:00.1: PCI INT B: failed to register GSI
[    6.557169] ixgbe: probe of 0000:06:00.1 failed with error -28

Thanks,
Ben
-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply

* Re: [PATCH net-next 2/7] bnxt_en: Enable MSIX early in bnxt_init_one().
From: kbuild test robot @ 2016-12-06 23:55 UTC (permalink / raw)
  To: Michael Chan
  Cc: kbuild-all-JC7UmRfGjtg, davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	selvin.xavier-dY08KVG/lbpWk0Htik3J/w,
	somnath.kotur-dY08KVG/lbpWk0Htik3J/w,
	dledford-H+wXaHxf7aLQT0dZR+AlfA,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1481044178-25193-3-git-send-email-michael.chan-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>

[-- Attachment #1: Type: text/plain, Size: 8301 bytes --]

Hi Michael,

[auto build test WARNING on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Michael-Chan/bnxt_en-Add-interface-to-support-RDMA-driver/20161207-053721
config: i386-randconfig-h1-12070631 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All warnings (new ones prefixed by >>):

   drivers/net/ethernet/broadcom/bnxt/bnxt.c: In function 'bnxt_get_max_func_irqs':
>> drivers/net/ethernet/broadcom/bnxt/bnxt.c:4818:1: warning: control reaches end of non-void function [-Wreturn-type]
    }
    ^
   Cyclomatic Complexity 5 include/linux/compiler.h:__read_once_size
   Cyclomatic Complexity 5 include/linux/compiler.h:__write_once_size
   Cyclomatic Complexity 2 arch/x86/include/asm/bitops.h:set_bit
   Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:__set_bit
   Cyclomatic Complexity 2 arch/x86/include/asm/bitops.h:clear_bit
   Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:__clear_bit
   Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:test_and_set_bit
   Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:test_and_clear_bit
   Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:constant_test_bit
   Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:variable_test_bit
   Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:fls
   Cyclomatic Complexity 1 include/linux/log2.h:__ilog2_u32
   Cyclomatic Complexity 1 include/linux/list.h:INIT_LIST_HEAD
   Cyclomatic Complexity 1 include/asm-generic/getorder.h:__get_order
   Cyclomatic Complexity 1 include/linux/err.h:ERR_PTR
   Cyclomatic Complexity 1 arch/x86/include/asm/irqflags.h:arch_irqs_disabled_flags
   Cyclomatic Complexity 1 arch/x86/include/asm/processor.h:prefetch
   Cyclomatic Complexity 1 arch/x86/include/asm/atomic.h:atomic_read
   Cyclomatic Complexity 1 arch/x86/include/asm/atomic.h:atomic_set
   Cyclomatic Complexity 1 arch/x86/include/asm/atomic.h:atomic_inc
   Cyclomatic Complexity 1 arch/x86/include/asm/atomic.h:atomic_cmpxchg
   Cyclomatic Complexity 5 arch/x86/include/asm/preempt.h:__preempt_count_add
   Cyclomatic Complexity 1 include/linux/bottom_half.h:__local_bh_disable_ip
   Cyclomatic Complexity 1 include/linux/bottom_half.h:local_bh_disable
   Cyclomatic Complexity 1 include/linux/spinlock.h:spinlock_check
   Cyclomatic Complexity 1 include/linux/spinlock.h:spin_lock
   Cyclomatic Complexity 1 include/linux/spinlock.h:spin_lock_bh
   Cyclomatic Complexity 1 include/linux/spinlock.h:spin_unlock
   Cyclomatic Complexity 1 include/linux/spinlock.h:spin_unlock_bh
   Cyclomatic Complexity 1 include/linux/workqueue.h:__init_work
   Cyclomatic Complexity 1 arch/x86/include/asm/topology.h:numa_node_id
   Cyclomatic Complexity 1 include/linux/topology.h:numa_mem_id
   Cyclomatic Complexity 1 include/linux/gfp.h:gfp_zonelist
   Cyclomatic Complexity 1 include/linux/gfp.h:node_zonelist
   Cyclomatic Complexity 1 include/linux/kasan.h:kasan_kmalloc
   Cyclomatic Complexity 28 include/linux/slab.h:kmalloc_index
   Cyclomatic Complexity 1 include/linux/slab.h:kmem_cache_alloc_trace
   Cyclomatic Complexity 1 include/linux/slab.h:kmalloc_order_trace
   Cyclomatic Complexity 67 include/linux/slab.h:kmalloc_large
   Cyclomatic Complexity 5 include/linux/slab.h:kmalloc
   Cyclomatic Complexity 1 include/linux/slab.h:kzalloc
   Cyclomatic Complexity 1 arch/x86/include/asm/io.h:readl
   Cyclomatic Complexity 1 arch/x86/include/asm/io.h:writel
   Cyclomatic Complexity 1 include/linux/device.h:dev_get_drvdata
   Cyclomatic Complexity 1 include/linux/device.h:dev_set_drvdata
   Cyclomatic Complexity 1 include/linux/pci.h:pci_is_bridge
   Cyclomatic Complexity 1 include/linux/mm.h:lowmem_page_address
   Cyclomatic Complexity 1 include/linux/mm.h:page_is_pfmemalloc
   Cyclomatic Complexity 1 include/linux/pci.h:pci_disable_msix
   Cyclomatic Complexity 1 include/linux/pci.h:pci_enable_msix_range
   Cyclomatic Complexity 1 include/linux/pci.h:pci_get_drvdata
   Cyclomatic Complexity 1 include/linux/pci.h:pci_set_drvdata
   Cyclomatic Complexity 1 include/linux/dma-debug.h:debug_dma_map_page
   Cyclomatic Complexity 1 include/linux/dma-debug.h:debug_dma_mapping_error
   Cyclomatic Complexity 1 include/linux/dma-debug.h:debug_dma_unmap_page
   Cyclomatic Complexity 1 include/linux/dma-debug.h:debug_dma_alloc_coherent
   Cyclomatic Complexity 1 include/linux/dma-debug.h:debug_dma_free_coherent
   Cyclomatic Complexity 1 include/linux/dma-debug.h:debug_dma_sync_single_for_cpu
   Cyclomatic Complexity 1 include/linux/dma-debug.h:debug_dma_sync_single_for_device
   Cyclomatic Complexity 1 include/linux/kmemcheck.h:kmemcheck_mark_initialized
   Cyclomatic Complexity 1 include/linux/dma-mapping.h:valid_dma_direction
   Cyclomatic Complexity 1 arch/x86/include/asm/dma-mapping.h:get_dma_ops
   Cyclomatic Complexity 2 include/linux/dma-mapping.h:dma_mapping_error
   Cyclomatic Complexity 1 include/linux/dynamic_queue_limits.h:dql_avail
   Cyclomatic Complexity 1 include/linux/skbuff.h:skb_frag_size
   Cyclomatic Complexity 1 include/linux/skbuff.h:skb_frag_size_set
   Cyclomatic Complexity 1 include/linux/skbuff.h:__skb_set_hash
   Cyclomatic Complexity 1 include/linux/skbuff.h:skb_set_hash
   Cyclomatic Complexity 1 include/linux/skbuff.h:skb_end_pointer
   Cyclomatic Complexity 1 include/linux/skbuff.h:skb_headlen
   Cyclomatic Complexity 1 include/linux/skbuff.h:skb_reserve
   Cyclomatic Complexity 1 include/linux/skbuff.h:skb_inner_transport_header
   Cyclomatic Complexity 1 include/linux/skbuff.h:skb_inner_network_header
   Cyclomatic Complexity 1 include/linux/skbuff.h:skb_transport_header
   Cyclomatic Complexity 1 include/linux/skbuff.h:skb_transport_offset
   Cyclomatic Complexity 1 include/linux/skbuff.h:skb_inner_network_header_len
   Cyclomatic Complexity 1 include/linux/skbuff.h:skb_inner_network_offset
   Cyclomatic Complexity 1 include/linux/skbuff.h:skb_frag_page
   Cyclomatic Complexity 1 include/linux/skbuff.h:__skb_frag_set_page
   Cyclomatic Complexity 1 include/linux/skbuff.h:skb_get_queue_mapping
   Cyclomatic Complexity 1 include/linux/skbuff.h:skb_record_rx_queue
   Cyclomatic Complexity 1 include/linux/skbuff.h:skb_is_gso
   Cyclomatic Complexity 1 include/linux/skbuff.h:skb_checksum_none_assert
   Cyclomatic Complexity 1 include/linux/netdevice.h:napi_disable_pending
   Cyclomatic Complexity 3 include/linux/netdevice.h:napi_schedule_prep
   Cyclomatic Complexity 1 include/linux/netdevice.h:netdev_get_num_tc
   Cyclomatic Complexity 1 include/linux/netdevice.h:netdev_get_tx_queue
   Cyclomatic Complexity 1 include/linux/netdevice.h:netdev_priv
   Cyclomatic Complexity 1 include/linux/netdevice.h:netif_tx_stop_queue
   Cyclomatic Complexity 1 include/linux/netdevice.h:netif_tx_queue_stopped
   Cyclomatic Complexity 1 include/linux/netdevice.h:netif_running
   Cyclomatic Complexity 1 include/linux/netdevice.h:netif_carrier_ok
   Cyclomatic Complexity 1 include/linux/netdevice.h:__netif_tx_lock
   Cyclomatic Complexity 1 include/linux/netdevice.h:__netif_tx_unlock
   Cyclomatic Complexity 1 include/linux/netdevice.h:netif_addr_lock_bh
   Cyclomatic Complexity 1 include/linux/netdevice.h:netif_addr_unlock_bh
   Cyclomatic Complexity 1 include/linux/etherdevice.h:is_zero_ether_addr
   Cyclomatic Complexity 1 include/linux/etherdevice.h:is_multicast_ether_addr

vim +4818 drivers/net/ethernet/broadcom/bnxt/bnxt.c

  4802		if (bp->flags & BNXT_FLAG_USING_MSIX)
  4803			bnxt_setup_msix(bp);
  4804		else
  4805			bnxt_setup_inta(bp);
  4806	
  4807		rc = bnxt_set_real_num_queues(bp);
  4808		return rc;
  4809	}
  4810	
  4811	static unsigned int bnxt_get_max_func_irqs(struct bnxt *bp)
  4812	{
  4813		if (BNXT_PF(bp))
  4814			return bp->pf.max_irqs;
  4815	#if defined(CONFIG_BNXT_SRIOV)
  4816		return bp->vf.max_irqs;
  4817	#endif
> 4818	}
  4819	
  4820	void bnxt_set_max_func_irqs(struct bnxt *bp, unsigned int max_irqs)
  4821	{
  4822		if (BNXT_PF(bp))
  4823			bp->pf.max_irqs = max_irqs;
  4824	#if defined(CONFIG_BNXT_SRIOV)
  4825		else
  4826			bp->vf.max_irqs = max_irqs;

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 28906 bytes --]

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox