Netdev List

* [PATCH v9 5/8] thunderbolt: Networking transmit and receive
From: Amir Levy @ 2016-11-09 14:20 UTC (permalink / raw)
  To: gregkh
  Cc: andreas.noever, bhelgaas, corbet, linux-kernel, linux-pci, netdev,
	linux-doc, mario_limonciello, thunderbolt-linux, mika.westerberg,
	tomas.winkler, xiong.y.zhang, Amir Levy
In-Reply-To: <1478701208-4585-1-git-send-email-amir.jer.levy@intel.com>

This patch provides the handling interface for sending and receiving
network packets between the hosts over the full communication route
(using the communication path established in the previous patch).

The Thunderbolt Network driver interfaces the Linux network stack
and the hardware controller configuration to handle packet transmissions:
  +----------------+            +----------------+
  |Host 1          |            |Host 2          |
  |                |            |                |
  |   +-------+    |            |    +-------+   |
  |   |Network|    |            |    |Network|   |
  |   |Stack  |    |            |    |Stack  |   |
  |   +-------+    |            |    +-------+   |
  |       ^        |            |        ^       |
  |       |        |            |        |       |
  |       v        |            |        v       |
  | +-----------+  |            |  +-----------+ |
  | |Thunderbolt|  |            |  |Thunderbolt| |
  | |Networking |  |            |  |Networking | |
  | |Driver     |  |            |  |Driver     | |
  | +-----------+  |            |  +-----------+ |
  |       ^        |            |        ^       |
  |       |        |            |        |       |
  |       v        |            |        v       |
  | +-----------+  |            |  +-----------+ |
  | |Thunderbolt|  |            |  |Thunderbolt| |
  | |Controller |<-+------------+->|Controller | |
  | +-----------+  |            |  +-----------+ |
  +----------------+            +----------------+

Signed-off-by: Amir Levy <amir.jer.levy@intel.com>
---
 drivers/thunderbolt/icm/icm_nhi.c |   15 +
 drivers/thunderbolt/icm/net.c     | 1471 +++++++++++++++++++++++++++++++++++++
 2 files changed, 1486 insertions(+)

diff --git a/drivers/thunderbolt/icm/icm_nhi.c b/drivers/thunderbolt/icm/icm_nhi.c
index edc910b..b1cc347 100644
--- a/drivers/thunderbolt/icm/icm_nhi.c
+++ b/drivers/thunderbolt/icm/icm_nhi.c
@@ -928,6 +928,7 @@ static irqreturn_t nhi_msi(int __always_unused irq, void *data)
 {
 	struct tbt_nhi_ctxt *nhi_ctxt = data;
 	u32 isr0, isr1, imr0, imr1;
+	int i;
 
 	/* clear on read */
 	isr0 = ioread32(nhi_ctxt->iobase + REG_RING_NOTIFY_BASE);
@@ -950,6 +951,20 @@ static irqreturn_t nhi_msi(int __always_unused irq, void *data)
 
 	spin_unlock(&nhi_ctxt->lock);
 
+	for (i = 0; i < nhi_ctxt->num_ports; ++i) {
+		struct net_device *net_dev =
+				nhi_ctxt->net_devices[i].net_dev;
+		if (net_dev) {
+			u8 path = PATH_FROM_PORT(nhi_ctxt->num_paths, i);
+
+			if (isr0 & REG_RING_INT_RX_PROCESSED(
+					path, nhi_ctxt->num_paths))
+				tbt_net_rx_msi(net_dev);
+			if (isr0 & REG_RING_INT_TX_PROCESSED(path))
+				tbt_net_tx_msi(net_dev);
+		}
+	}
+
 	if (isr0 & REG_RING_INT_RX_PROCESSED(TBT_ICM_RING_NUM,
 					     nhi_ctxt->num_paths))
 		schedule_work(&nhi_ctxt->icm_msgs_work);
diff --git a/drivers/thunderbolt/icm/net.c b/drivers/thunderbolt/icm/net.c
index beeafb3..cf985dd 100644
--- a/drivers/thunderbolt/icm/net.c
+++ b/drivers/thunderbolt/icm/net.c
@@ -124,6 +124,17 @@ struct approve_inter_domain_connection_cmd {
 
 };
 
+struct tbt_frame_header {
+	/* size of the data with the frame */
+	__le32 frame_size;
+	/* running index on the frames */
+	__le16 frame_index;
+	/* ID of the frame to match frames to specific packet */
+	__le16 frame_id;
+	/* how many frames assembles a full packet */
+	__le32 frame_count;
+};
+
 enum neg_event {
 	RECEIVE_LOGOUT = NUM_MEDIUM_STATUSES,
 	RECEIVE_LOGIN_RESPONSE,
@@ -131,15 +142,81 @@ enum neg_event {
 	NUM_NEG_EVENTS
 };
 
+enum frame_status {
+	GOOD_FRAME,
+	GOOD_AS_FIRST_FRAME,
+	GOOD_AS_FIRST_MULTICAST_FRAME,
+	FRAME_NOT_READY,
+	FRAME_ERROR,
+};
+
+enum packet_filter {
+	/* all multicast MAC addresses */
+	PACKET_TYPE_ALL_MULTICAST,
+	/* all types of MAC addresses: multicast, unicast and broadcast */
+	PACKET_TYPE_PROMISCUOUS,
+	/* all unicast MAC addresses */
+	PACKET_TYPE_UNICAST_PROMISCUOUS,
+};
+
 enum disconnect_path_stage {
 	STAGE_1 = BIT(0),
 	STAGE_2 = BIT(1)
 };
 
+struct tbt_net_stats {
+	u64 tx_packets;
+	u64 tx_bytes;
+	u64 tx_errors;
+	u64 rx_packets;
+	u64 rx_bytes;
+	u64 rx_length_errors;
+	u64 rx_over_errors;
+	u64 rx_crc_errors;
+	u64 rx_missed_errors;
+	u64 multicast;
+};
+
+static const char tbt_net_gstrings_stats[][ETH_GSTRING_LEN] = {
+	"tx_packets",
+	"tx_bytes",
+	"tx_errors",
+	"rx_packets",
+	"rx_bytes",
+	"rx_length_errors",
+	"rx_over_errors",
+	"rx_crc_errors",
+	"rx_missed_errors",
+	"multicast",
+};
+
+struct tbt_buffer {
+	dma_addr_t dma;
+	union {
+		struct tbt_frame_header *hdr;
+		struct page *page;
+	};
+	u32 page_offset;
+};
+
+struct tbt_desc_ring {
+	/* pointer to the descriptor ring memory */
+	struct tbt_buf_desc *desc;
+	/* physical address of the descriptor ring */
+	dma_addr_t dma;
+	/* array of buffer structs */
+	struct tbt_buffer *buffers;
+	/* last descriptor that was associated with a buffer */
+	u16 last_allocated;
+	/* next descriptor to check for DD status bit */
+	u16 next_to_clean;
+};
+
 /**
  *  struct tbt_port - the basic tbt_port structure
  *  @tbt_nhi_ctxt:		context of the nhi controller.
  *  @net_dev:			networking device object.
+ *  @napi:			network API
  *  @login_retry_work:		work queue for sending login requests.
  *  @login_response_work:	work queue for sending login responses.
  *  @work_struct logout_work:	work queue for sending logout requests.
@@ -155,6 +232,11 @@ enum disconnect_path_stage {
  *  @login_retry_count:		counts number of login retries sent.
  *  @local_depth:		depth of the remote peer in the chain.
  *  @transmit_path:		routing parameter for the icm.
+ *  @tx_ring:			transmit ring from where the packets are sent.
+ *  @rx_ring:			receive ring  where the packets are received.
+ *  @stats:			network statistics of the rx/tx packets.
+ *  @packet_filters:		defines filters for the received packets.
+ *  @multicast_hash_table:	hash table of multicast addresses.
  *  @frame_id:			counting ID of frames.
  *  @num:			port number.
  *  @local_path:		routing parameter for the icm.
@@ -164,6 +246,7 @@ enum disconnect_path_stage {
 struct tbt_port {
 	struct tbt_nhi_ctxt *nhi_ctxt;
 	struct net_device *net_dev;
+	struct napi_struct napi;
 	struct delayed_work login_retry_work;
 	struct work_struct login_response_work;
 	struct work_struct logout_work;
@@ -179,6 +262,17 @@ struct tbt_port {
 	u8 login_retry_count;
 	u8 local_depth;
 	u8 transmit_path;
+	struct tbt_desc_ring tx_ring ____cacheline_aligned_in_smp;
+	struct tbt_desc_ring rx_ring;
+	struct tbt_net_stats stats;
+	u32 packet_filters;
+	/*
+	 * hash table of 1024 boolean entries with hashing of
+	 * the multicast address
+	 */
+	u32 multicast_hash_table[DIV_ROUND_UP(
+					TBT_NET_MULTICAST_HASH_TABLE_SIZE,
+					BITS_PER_U32)];
 	u16 frame_id;
 	u8 num;
 	u8 local_path;
@@ -225,6 +319,8 @@ static void tbt_net_tear_down(struct net_device *net_dev, bool send_logout)
 		      (port->local_path * REG_OPTS_STEP);
 		u32 rx_reg_val = ioread32(rx_reg) & ~REG_OPTS_E2E_EN;
 
+		napi_disable(&port->napi);
+
 		tx_reg = iobase + REG_TX_OPTIONS_BASE +
 			 (port->local_path * REG_OPTS_STEP);
 		tx_reg_val = ioread32(tx_reg) & ~REG_OPTS_E2E_EN;
@@ -266,8 +362,1336 @@ static void tbt_net_tear_down(struct net_device *net_dev, bool send_logout)
 				       port->nhi_ctxt->num_paths);
 		spin_unlock_irqrestore(&port->nhi_ctxt->lock, flags);
 	}
+
+	port->rx_ring.next_to_clean = 0;
+	port->rx_ring.last_allocated = TBT_NET_NUM_RX_BUFS - 1;
+
+}
+
+void tbt_net_tx_msi(struct net_device *net_dev)
+{
+	struct tbt_port *port = netdev_priv(net_dev);
+	void __iomem *iobase = port->nhi_ctxt->iobase;
+	u32 prod_cons, prod, cons;
+
+	prod_cons = ioread32(TBT_RING_CONS_PROD_REG(iobase, REG_TX_RING_BASE,
+						    port->local_path));
+	prod = TBT_REG_RING_PROD_EXTRACT(prod_cons);
+	cons = TBT_REG_RING_CONS_EXTRACT(prod_cons);
+	if (prod >= TBT_NET_NUM_TX_BUFS || cons >= TBT_NET_NUM_TX_BUFS)
+		return;
+
+	if (TBT_NUM_BUFS_BETWEEN(prod, cons, TBT_NET_NUM_TX_BUFS) >=
+							TX_WAKE_THRESHOLD) {
+		netif_wake_queue(port->net_dev);
+	} else {
+		spin_lock(&port->nhi_ctxt->lock);
+		/* enable TX interrupt */
+		RING_INT_ENABLE_TX(iobase, port->local_path);
+		spin_unlock(&port->nhi_ctxt->lock);
+	}
+}
+
+static irqreturn_t tbt_net_tx_msix(int __always_unused irq, void *data)
+{
+	struct tbt_port *port = data;
+	void __iomem *iobase = port->nhi_ctxt->iobase;
+	u32 prod_cons, prod, cons;
+
+	prod_cons = ioread32(TBT_RING_CONS_PROD_REG(iobase,
+						    REG_TX_RING_BASE,
+						    port->local_path));
+	prod = TBT_REG_RING_PROD_EXTRACT(prod_cons);
+	cons = TBT_REG_RING_CONS_EXTRACT(prod_cons);
+	if (prod < TBT_NET_NUM_TX_BUFS && cons < TBT_NET_NUM_TX_BUFS &&
+	    TBT_NUM_BUFS_BETWEEN(prod, cons, TBT_NET_NUM_TX_BUFS) >=
+							TX_WAKE_THRESHOLD) {
+		spin_lock(&port->nhi_ctxt->lock);
+		/* disable TX interrupt */
+		RING_INT_DISABLE_TX(iobase, port->local_path);
+		spin_unlock(&port->nhi_ctxt->lock);
+
+		netif_wake_queue(port->net_dev);
+	}
+
+	return IRQ_HANDLED;
+}
+
+void tbt_net_rx_msi(struct net_device *net_dev)
+{
+	struct tbt_port *port = netdev_priv(net_dev);
+
+	napi_schedule_irqoff(&port->napi);
+}
+
+static irqreturn_t tbt_net_rx_msix(int __always_unused irq, void *data)
+{
+	struct tbt_port *port = data;
+
+	if (likely(napi_schedule_prep(&port->napi))) {
+		struct tbt_nhi_ctxt *nhi_ctx = port->nhi_ctxt;
+
+		spin_lock(&nhi_ctx->lock);
+		/* disable RX interrupt */
+		RING_INT_DISABLE_RX(nhi_ctx->iobase, port->local_path,
+				    nhi_ctx->num_paths);
+		spin_unlock(&nhi_ctx->lock);
+
+		__napi_schedule_irqoff(&port->napi);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static void tbt_net_pull_tail(struct sk_buff *skb)
+{
+	skb_frag_t *frag = &skb_shinfo(skb)->frags[0];
+	unsigned int pull_len;
+	unsigned char *va;
+
+	/*
+	 * it is valid to use page_address instead of kmap since we are
+	 * working with pages allocated out of the lomem pool
+	 */
+	va = skb_frag_address(frag);
+
+	pull_len = eth_get_headlen(va, TBT_NET_RX_HDR_SIZE);
+
+	/* align pull length to size of long to optimize memcpy performance */
+	skb_copy_to_linear_data(skb, va, ALIGN(pull_len, sizeof(long)));
+
+	/* update all of the pointers */
+	skb_frag_size_sub(frag, pull_len);
+	frag->page_offset += pull_len;
+	skb->data_len -= pull_len;
+	skb->tail += pull_len;
+}
+
+static inline bool tbt_net_alloc_mapped_page(struct device *dev,
+					     struct tbt_buffer *buf, gfp_t gfp)
+{
+	if (!buf->page) {
+		buf->page = alloc_page(gfp | __GFP_COLD);
+		if (unlikely(!buf->page))
+			return false;
+
+		buf->dma = dma_map_page(dev, buf->page, 0, PAGE_SIZE,
+					DMA_FROM_DEVICE);
+		if (dma_mapping_error(dev, buf->dma)) {
+			__free_page(buf->page);
+			buf->page = NULL;
+			return false;
+		}
+		buf->page_offset = 0;
+	}
+	return true;
+}
+
+static bool tbt_net_alloc_rx_buffers(struct device *dev,
+				     struct tbt_desc_ring *rx_ring,
+				     u16 cleaned_count, void __iomem *reg,
+				     gfp_t gfp)
+{
+	u16 i = (rx_ring->last_allocated + 1) & (TBT_NET_NUM_RX_BUFS - 1);
+	bool res = false;
+
+	while (cleaned_count--) {
+		struct tbt_buf_desc *desc = &rx_ring->desc[i];
+		struct tbt_buffer *buf = &rx_ring->buffers[i];
+
+		/* making sure next_to_clean won't get old buffer */
+		desc->attributes = cpu_to_le32(DESC_ATTR_REQ_STS |
+					       DESC_ATTR_INT_EN);
+		if (tbt_net_alloc_mapped_page(dev, buf, gfp)) {
+			res = true;
+			rx_ring->last_allocated = i;
+			i = (i + 1) & (TBT_NET_NUM_RX_BUFS - 1);
+			desc->phys = cpu_to_le64(buf->dma + buf->page_offset);
+		} else {
+			break;
+		}
+	}
+
+	if (res) {
+		iowrite32((rx_ring->last_allocated << REG_RING_CONS_SHIFT) &
+			  REG_RING_CONS_MASK, reg);
+	}
+
+	return res;
+}
+
+static inline bool tbt_net_multicast_mac_set(const u32 *multicast_hash_table,
+					     const u8 *ether_addr)
+{
+	u16 hash_val = TBT_NET_ETHER_ADDR_HASH(ether_addr);
+
+	return !!(multicast_hash_table[hash_val / BITS_PER_U32] &
+		  BIT(hash_val % BITS_PER_U32));
+}
+
+static enum frame_status tbt_net_check_frame(struct tbt_port *port,
+					     u16 frame_num, u32 *count,
+					     u16 index, u16 *id, u32 *size)
+{
+	struct tbt_desc_ring *rx_ring = &port->rx_ring;
+	__le32 desc_attr = rx_ring->desc[frame_num].attributes;
+	enum frame_status res = GOOD_AS_FIRST_FRAME;
+	u32 len, frame_count, frame_size;
+	struct tbt_frame_header *hdr;
+
+	if (!(desc_attr & cpu_to_le32(DESC_ATTR_DESC_DONE)))
+		return FRAME_NOT_READY;
+
+	rmb(); /* read other fields from desc after checking DD */
+
+	if (unlikely(desc_attr & cpu_to_le32(DESC_ATTR_RX_CRC_ERR))) {
+		++port->stats.rx_crc_errors;
+		goto err;
+	} else if (unlikely(desc_attr &
+				cpu_to_le32(DESC_ATTR_RX_BUF_OVRN_ERR))) {
+		++port->stats.rx_over_errors;
+		goto err;
+	}
+
+	len = (le32_to_cpu(desc_attr) & DESC_ATTR_LEN_MASK)
+	      >> DESC_ATTR_LEN_SHIFT;
+	if (len == 0)
+		len = TBT_RING_MAX_FRAME_SIZE;
+	/* should be greater than just header i.e. contains data */
+	if (unlikely(len <= sizeof(struct tbt_frame_header))) {
+		++port->stats.rx_length_errors;
+		goto err;
+	}
+
+	prefetchw(rx_ring->buffers[frame_num].page);
+	hdr = page_address(rx_ring->buffers[frame_num].page) +
+				rx_ring->buffers[frame_num].page_offset;
+	/* prefetch first cache line of first page */
+	prefetch(hdr);
+
+	/* we are reusing so sync this buffer for CPU use */
+	dma_sync_single_range_for_cpu(&port->nhi_ctxt->pdev->dev,
+				      rx_ring->buffers[frame_num].dma,
+				      rx_ring->buffers[frame_num].page_offset,
+				      TBT_RING_MAX_FRAME_SIZE,
+				      DMA_FROM_DEVICE);
+
+	frame_count = le32_to_cpu(hdr->frame_count);
+	frame_size = le32_to_cpu(hdr->frame_size);
+
+	if (unlikely((frame_size > len - sizeof(struct tbt_frame_header)) ||
+		     (frame_size == 0))) {
+		++port->stats.rx_length_errors;
+		goto err;
+	}
+	/*
+	 * In case we're in the middle of packet, validate the frame header
+	 * based on first fragment of the packet
+	 */
+	if (*count) {
+		/* check the frame count fits the count field */
+		if (frame_count != *count) {
+			++port->stats.rx_length_errors;
+			goto check_as_first;
+		}
+
+		/*
+		 * check the frame identifiers are incremented correctly,
+		 * and id is matching
+		 */
+		if ((le16_to_cpu(hdr->frame_index) != index) ||
+		    (le16_to_cpu(hdr->frame_id) != *id)) {
+			++port->stats.rx_missed_errors;
+			goto check_as_first;
+		}
+
+		*size += frame_size;
+		if (*size > TBT_NET_MTU) {
+			++port->stats.rx_length_errors;
+			goto err;
+		}
+		res = GOOD_FRAME;
+	} else { /* start of packet, validate the frame header */
+		const u8 *addr;
+
+check_as_first:
+		rx_ring->next_to_clean = frame_num;
+
+		/* validate the first packet has a valid frame count */
+		if (unlikely(frame_count == 0 ||
+			     frame_count > (TBT_NET_NUM_RX_BUFS / 4))) {
+			++port->stats.rx_length_errors;
+			goto err;
+		}
+
+		/* validate the first packet has a valid frame index */
+		if (hdr->frame_index != 0) {
+			++port->stats.rx_missed_errors;
+			goto err;
+		}
+
+		BUILD_BUG_ON(TBT_NET_RX_HDR_SIZE > TBT_RING_MAX_FRM_DATA_SZ);
+		if ((frame_count > 1) && (frame_size < TBT_NET_RX_HDR_SIZE)) {
+			++port->stats.rx_length_errors;
+			goto err;
+		}
+
+		addr = (u8 *)(hdr + 1);
+
+		/* check the packet can go through the filter */
+		if (is_multicast_ether_addr(addr)) {
+			if (!is_broadcast_ether_addr(addr)) {
+				if ((port->packet_filters &
+				     (BIT(PACKET_TYPE_PROMISCUOUS) |
+				      BIT(PACKET_TYPE_ALL_MULTICAST))) ||
+				    tbt_net_multicast_mac_set(
+					port->multicast_hash_table, addr))
+					res = GOOD_AS_FIRST_MULTICAST_FRAME;
+				else
+					goto err;
+			}
+		} else if (!(port->packet_filters &
+			     (BIT(PACKET_TYPE_PROMISCUOUS) |
+			      BIT(PACKET_TYPE_UNICAST_PROMISCUOUS))) &&
+			   !ether_addr_equal(port->net_dev->dev_addr, addr)) {
+			goto err;
+		}
+
+		*size = frame_size;
+		*count = frame_count;
+		*id = le16_to_cpu(hdr->frame_id);
+	}
+
+#if (PREFETCH_STRIDE < 128)
+	prefetch((u8 *)hdr + PREFETCH_STRIDE);
+#endif
+
+	return res;
+
+err:
+	rx_ring->next_to_clean = (frame_num + 1) & (TBT_NET_NUM_RX_BUFS - 1);
+	return FRAME_ERROR;
+}
+
+static inline unsigned int tbt_net_max_frm_data_size(
+						__maybe_unused u32 frame_size)
+{
+#if (TBT_NUM_FRAMES_PER_PAGE > 1)
+	return ALIGN(frame_size + sizeof(struct tbt_frame_header),
+		     L1_CACHE_BYTES) -
+	       sizeof(struct tbt_frame_header);
+#else
+	return TBT_RING_MAX_FRM_DATA_SZ;
+#endif
+}
+
+static int tbt_net_poll(struct napi_struct *napi, int budget)
+{
+	struct tbt_port *port = container_of(napi, struct tbt_port, napi);
+	void __iomem *reg = TBT_RING_CONS_PROD_REG(port->nhi_ctxt->iobase,
+						   REG_RX_RING_BASE,
+						   port->local_path);
+	struct tbt_desc_ring *rx_ring = &port->rx_ring;
+	u16 cleaned_count = TBT_NUM_BUFS_BETWEEN(rx_ring->last_allocated,
+						 rx_ring->next_to_clean,
+						 TBT_NET_NUM_RX_BUFS);
+	unsigned long flags;
+	int rx_packets = 0;
+
+loop:
+	while (likely(rx_packets < budget)) {
+		struct sk_buff *skb;
+		enum frame_status status;
+		bool multicast = false;
+		u32 frame_count = 0, size;
+		u16 j, frame_id;
+		int i;
+
+		/*
+		 * return some buffers to hardware, one at a time is too slow
+		 * so allocate  TBT_NET_RX_BUFFER_WRITE buffers at the same time
+		 */
+		if (cleaned_count >= TBT_NET_RX_BUFFER_WRITE) {
+			tbt_net_alloc_rx_buffers(&port->nhi_ctxt->pdev->dev,
+						 rx_ring, cleaned_count, reg,
+						 GFP_ATOMIC);
+			cleaned_count = 0;
+		}
+
+		status = tbt_net_check_frame(port, rx_ring->next_to_clean,
+					     &frame_count, 0, &frame_id,
+					     &size);
+		if (status == FRAME_NOT_READY)
+			break;
+
+		if (status == FRAME_ERROR) {
+			++cleaned_count;
+			continue;
+		}
+
+		multicast = (status == GOOD_AS_FIRST_MULTICAST_FRAME);
+
+		/*
+		 *  i is incremented up to the frame_count frames received,
+		 *  j cyclicly goes over the location from the next frame
+		 *  to clean in the ring
+		 */
+		j = (rx_ring->next_to_clean + 1);
+		j &= (TBT_NET_NUM_RX_BUFS - 1);
+		for (i = 1; i < frame_count; ++i) {
+			status = tbt_net_check_frame(port, j, &frame_count, i,
+						     &frame_id, &size);
+			if (status == FRAME_NOT_READY)
+				goto out;
+
+			j = (j + 1) & (TBT_NET_NUM_RX_BUFS - 1);
+
+			/* if a new frame is found, start over */
+			if (status == GOOD_AS_FIRST_FRAME ||
+			    status == GOOD_AS_FIRST_MULTICAST_FRAME) {
+				multicast = (status ==
+					     GOOD_AS_FIRST_MULTICAST_FRAME);
+				cleaned_count += i;
+				i = 0;
+				continue;
+			}
+
+			if (status == FRAME_ERROR) {
+				cleaned_count += (i + 1);
+				goto loop;
+			}
+		}
+
+		/* allocate a skb to store the frags */
+		skb = netdev_alloc_skb_ip_align(port->net_dev,
+						TBT_NET_RX_HDR_SIZE);
+		if (unlikely(!skb))
+			break;
+
+		/*
+		 * we will be copying header into skb->data in
+		 * tbt_net_pull_tail so it is in our interest to prefetch
+		 * it now to avoid a possible cache miss
+		 */
+		prefetchw(skb->data);
+
+		/*
+		 * if overall size of packet smaller than TBT_NET_RX_HDR_SIZE
+		 * which is a small buffer size we decided to allocate
+		 * as the base to RX
+		 */
+		if (size <= TBT_NET_RX_HDR_SIZE) {
+			struct tbt_buffer *buf =
+				&(rx_ring->buffers[rx_ring->next_to_clean]);
+			u8 *va = page_address(buf->page) + buf->page_offset +
+				 sizeof(struct tbt_frame_header);
+
+			memcpy(__skb_put(skb, size), va,
+			       ALIGN(size, sizeof(long)));
+
+			/*
+			 * Reuse buffer as-is,
+			 * just make sure it is local
+			 * Access to local memory is faster than non-local
+			 * memory so let's reuse.
+			 * If not local, let's free it and reallocate later.
+			 */
+			if (likely(page_to_nid(buf->page) == numa_node_id()))
+				/* sync the buffer for use by the device */
+				dma_sync_single_range_for_device(
+						&port->nhi_ctxt->pdev->dev,
+						buf->dma, buf->page_offset,
+						TBT_RING_MAX_FRAME_SIZE,
+						DMA_FROM_DEVICE);
+			else {
+				/* this page cannot be reused so discard it */
+				put_page(buf->page);
+				buf->page = NULL;
+				dma_unmap_page(&port->nhi_ctxt->pdev->dev,
+					       buf->dma, PAGE_SIZE,
+					       DMA_FROM_DEVICE);
+			}
+			rx_ring->next_to_clean = (rx_ring->next_to_clean + 1) &
+						 (TBT_NET_NUM_RX_BUFS - 1);
+		} else {
+			for (i = 0; i < frame_count;  ++i) {
+				struct tbt_buffer *buf = &(rx_ring->buffers[
+						rx_ring->next_to_clean]);
+				struct tbt_frame_header *hdr =
+						page_address(buf->page) +
+						buf->page_offset;
+				u32 frm_size = le32_to_cpu(hdr->frame_size);
+
+				unsigned int truesize =
+					tbt_net_max_frm_data_size(frm_size);
+
+				/* add frame to skb struct */
+				skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
+						buf->page,
+						sizeof(struct tbt_frame_header)
+							+ buf->page_offset,
+						frm_size, truesize);
+
+#if (TBT_NUM_FRAMES_PER_PAGE > 1)
+				/* move offset up to the next cache line */
+				buf->page_offset += (truesize +
+					sizeof(struct tbt_frame_header));
+
+				/*
+				 * we can reuse buffer if there is space
+				 * available and it is local
+				 */
+				if (page_to_nid(buf->page) == numa_node_id()
+				    && buf->page_offset <=
+					PAGE_SIZE - TBT_RING_MAX_FRAME_SIZE) {
+					/*
+					 * bump ref count on page before
+					 * it is given to the stack
+					 */
+					get_page(buf->page);
+					/*
+					 * sync the buffer for use by the
+					 * device
+					 */
+					dma_sync_single_range_for_device(
+						&port->nhi_ctxt->pdev->dev,
+						buf->dma, buf->page_offset,
+						TBT_RING_MAX_FRAME_SIZE,
+						DMA_FROM_DEVICE);
+				} else
+#endif
+				{
+					buf->page = NULL;
+					dma_unmap_page(
+						&port->nhi_ctxt->pdev->dev,
+						buf->dma, PAGE_SIZE,
+						DMA_FROM_DEVICE);
+				}
+
+				rx_ring->next_to_clean =
+						(rx_ring->next_to_clean + 1) &
+						(TBT_NET_NUM_RX_BUFS - 1);
+			}
+			/*
+			 * place header from the first
+			 * fragment in linear portion of buffer
+			 */
+			tbt_net_pull_tail(skb);
+		}
+
+		/*
+		 * The Thunderbolt medium doesn't have any restriction on
+		 * minimum frame size, thus doesn't need any padding in
+		 * transmit.
+		 * The network stack accepts Runt Ethernet frames,
+		 * therefor there is neither padding in receive.
+		 */
+
+		skb->protocol = eth_type_trans(skb, port->net_dev);
+		napi_gro_receive(&port->napi, skb);
+
+		++rx_packets;
+		port->stats.rx_bytes += size;
+		if (multicast)
+			++port->stats.multicast;
+		cleaned_count += frame_count;
+	}
+
+out:
+	port->stats.rx_packets += rx_packets;
+
+	if (cleaned_count)
+		tbt_net_alloc_rx_buffers(&port->nhi_ctxt->pdev->dev,
+					 rx_ring, cleaned_count, reg,
+					 GFP_ATOMIC);
+
+	/* If all work not completed, return budget and keep polling */
+	if (rx_packets >= budget)
+		return budget;
+
+	/* Work is done so exit the polling mode and re-enable the interrupt */
+	napi_complete(napi);
+
+	spin_lock_irqsave(&port->nhi_ctxt->lock, flags);
+	/* enable RX interrupt */
+	RING_INT_ENABLE_RX(port->nhi_ctxt->iobase, port->local_path,
+			   port->nhi_ctxt->num_paths);
+
+	spin_unlock_irqrestore(&port->nhi_ctxt->lock, flags);
+
+	return 0;
+}
+
+static int tbt_net_open(struct net_device *net_dev)
+{
+	struct tbt_port *port = netdev_priv(net_dev);
+	int res = 0;
+	int i, j;
+
+	/* change link state to off until path establishment finishes */
+	netif_carrier_off(net_dev);
+
+	/*
+	 * if we previously succeeded to allocate msix entries,
+	 * now request IRQ for them:
+	 *  2=tx data port 0,
+	 *  3=rx data port 0,
+	 *  4=tx data port 1,
+	 *  5=rx data port 1,
+	 *  ...
+	 *  if not, if msi is used, nhi_msi will handle icm & data paths
+	 */
+	if (port->nhi_ctxt->msix_entries) {
+		char name[] = "tbt-net-xx-xx";
+
+		scnprintf(name, sizeof(name), "tbt-net-rx-%02u", port->num);
+		res = devm_request_irq(&port->nhi_ctxt->pdev->dev,
+			port->nhi_ctxt->msix_entries[3+(port->num*2)].vector,
+			tbt_net_rx_msix, 0, name, port);
+		if (res) {
+			netif_err(port, ifup, net_dev, "request_irq %s failed %d\n",
+				  name, res);
+			goto out;
+		}
+		name[8] = 't';
+		res = devm_request_irq(&port->nhi_ctxt->pdev->dev,
+			port->nhi_ctxt->msix_entries[2+(port->num*2)].vector,
+			tbt_net_tx_msix, 0, name, port);
+		if (res) {
+			netif_err(port, ifup, net_dev, "request_irq %s failed %d\n",
+				  name, res);
+			goto request_irq_failure;
+		}
+	}
+	/*
+	 * Verifying that all buffer sizes are well defined.
+	 * Starting with frame(s) will not tip over the
+	 * page boundary
+	 */
+	BUILD_BUG_ON(TBT_NUM_FRAMES_PER_PAGE < 1);
+	/*
+	 * Just to make sure we have enough place for containing
+	 * 3 max MTU packets for TX
+	 */
+	BUILD_BUG_ON((TBT_NET_NUM_TX_BUFS * TBT_RING_MAX_FRAME_SIZE) <
+		     (TBT_NET_MTU * 3));
+	/* make sure the number of TX Buffers is power of 2 */
+	BUILD_BUG_ON_NOT_POWER_OF_2(TBT_NET_NUM_TX_BUFS);
+	/*
+	 * Just to make sure we have enough place for containing
+	 * 3 max MTU packets for RX
+	 */
+	BUILD_BUG_ON((TBT_NET_NUM_RX_BUFS * TBT_RING_MAX_FRAME_SIZE) <
+		     (TBT_NET_MTU * 3));
+	/* make sure the number of RX Buffers is power of 2 */
+	BUILD_BUG_ON_NOT_POWER_OF_2(TBT_NET_NUM_RX_BUFS);
+
+	port->rx_ring.last_allocated = TBT_NET_NUM_RX_BUFS - 1;
+
+	port->tx_ring.buffers = vzalloc(TBT_NET_NUM_TX_BUFS *
+					sizeof(struct tbt_buffer));
+	if (!port->tx_ring.buffers)
+		goto ring_alloc_failure;
+	port->rx_ring.buffers = vzalloc(TBT_NET_NUM_RX_BUFS *
+					sizeof(struct tbt_buffer));
+	if (!port->rx_ring.buffers)
+		goto ring_alloc_failure;
+
+	/*
+	 * Allocate TX and RX descriptors
+	 * if the total size is less than a page, do a central allocation
+	 * Otherwise, split TX and RX
+	 */
+	if (TBT_NET_SIZE_TOTAL_DESCS <= PAGE_SIZE) {
+		port->tx_ring.desc = dmam_alloc_coherent(
+				&port->nhi_ctxt->pdev->dev,
+				TBT_NET_SIZE_TOTAL_DESCS,
+				&port->tx_ring.dma,
+				GFP_KERNEL | __GFP_ZERO);
+		if (!port->tx_ring.desc)
+			goto ring_alloc_failure;
+		/* RX starts where TX finishes */
+		port->rx_ring.desc = &port->tx_ring.desc[TBT_NET_NUM_TX_BUFS];
+		port->rx_ring.dma = port->tx_ring.dma +
+			(TBT_NET_NUM_TX_BUFS * sizeof(struct tbt_buf_desc));
+	} else {
+		port->tx_ring.desc = dmam_alloc_coherent(
+				&port->nhi_ctxt->pdev->dev,
+				TBT_NET_NUM_TX_BUFS *
+						sizeof(struct tbt_buf_desc),
+				&port->tx_ring.dma,
+				GFP_KERNEL | __GFP_ZERO);
+		if (!port->tx_ring.desc)
+			goto ring_alloc_failure;
+		port->rx_ring.desc = dmam_alloc_coherent(
+				&port->nhi_ctxt->pdev->dev,
+				TBT_NET_NUM_RX_BUFS *
+						sizeof(struct tbt_buf_desc),
+				&port->rx_ring.dma,
+				GFP_KERNEL | __GFP_ZERO);
+		if (!port->rx_ring.desc)
+			goto rx_desc_alloc_failure;
+	}
+
+	/* allocate TX buffers and configure the descriptors */
+	for (i = 0; i < TBT_NET_NUM_TX_BUFS; i++) {
+		port->tx_ring.buffers[i].hdr = dma_alloc_coherent(
+			&port->nhi_ctxt->pdev->dev,
+			TBT_NUM_FRAMES_PER_PAGE * TBT_RING_MAX_FRAME_SIZE,
+			&port->tx_ring.buffers[i].dma,
+			GFP_KERNEL);
+		if (!port->tx_ring.buffers[i].hdr)
+			goto buffers_alloc_failure;
+
+		port->tx_ring.desc[i].phys =
+				cpu_to_le64(port->tx_ring.buffers[i].dma);
+		port->tx_ring.desc[i].attributes =
+				cpu_to_le32(DESC_ATTR_REQ_STS |
+					    TBT_NET_DESC_ATTR_SOF_EOF);
+
+		/*
+		 * In case the page is bigger than the frame size,
+		 * make the next buffer descriptor points
+		 * on the next frame memory address within the page
+		 */
+		for (i++, j = 1; (i < TBT_NET_NUM_TX_BUFS) &&
+				 (j < TBT_NUM_FRAMES_PER_PAGE); i++, j++) {
+			port->tx_ring.buffers[i].dma =
+				port->tx_ring.buffers[i - 1].dma +
+				TBT_RING_MAX_FRAME_SIZE;
+			port->tx_ring.buffers[i].hdr =
+				(void *)(port->tx_ring.buffers[i - 1].hdr) +
+				TBT_RING_MAX_FRAME_SIZE;
+			/* move the next offset i.e. TBT_RING_MAX_FRAME_SIZE */
+			port->tx_ring.buffers[i].page_offset =
+				port->tx_ring.buffers[i - 1].page_offset +
+				TBT_RING_MAX_FRAME_SIZE;
+			port->tx_ring.desc[i].phys =
+				cpu_to_le64(port->tx_ring.buffers[i].dma);
+			port->tx_ring.desc[i].attributes =
+				cpu_to_le32(DESC_ATTR_REQ_STS |
+					    TBT_NET_DESC_ATTR_SOF_EOF);
+		}
+		i--;
+	}
+
+	port->negotiation_status =
+			BIT(port->nhi_ctxt->net_devices[port->num].medium_sts);
+	if (port->negotiation_status == BIT(MEDIUM_READY_FOR_CONNECTION)) {
+		port->login_retry_count = 0;
+		queue_delayed_work(port->nhi_ctxt->net_workqueue,
+				   &port->login_retry_work, 0);
+	}
+
+	netif_info(port, ifup, net_dev, "Thunderbolt(TM) Networking port %u - ready for ThunderboltIP negotiation\n",
+		   port->num);
+	return 0;
+
+buffers_alloc_failure:
+	/*
+	 * Rollback the Tx buffers that were already allocated
+	 * until the failure
+	 */
+	for (i--; i >= 0; i--) {
+		/* free only for first buffer allocation */
+		if (port->tx_ring.buffers[i].page_offset == 0)
+			dma_free_coherent(&port->nhi_ctxt->pdev->dev,
+					  TBT_NUM_FRAMES_PER_PAGE *
+						TBT_RING_MAX_FRAME_SIZE,
+					  port->tx_ring.buffers[i].hdr,
+					  port->tx_ring.buffers[i].dma);
+		port->tx_ring.buffers[i].hdr = NULL;
+	}
+	/*
+	 * For central allocation, free all
+	 * otherwise free RX and then TX separately
+	 */
+	if (TBT_NET_SIZE_TOTAL_DESCS <= PAGE_SIZE) {
+		dmam_free_coherent(&port->nhi_ctxt->pdev->dev,
+				   TBT_NET_SIZE_TOTAL_DESCS,
+				   port->tx_ring.desc,
+				   port->tx_ring.dma);
+		port->rx_ring.desc = NULL;
+	} else {
+		dmam_free_coherent(&port->nhi_ctxt->pdev->dev,
+				   TBT_NET_NUM_RX_BUFS *
+						sizeof(struct tbt_buf_desc),
+				   port->rx_ring.desc,
+				   port->rx_ring.dma);
+		port->rx_ring.desc = NULL;
+rx_desc_alloc_failure:
+		dmam_free_coherent(&port->nhi_ctxt->pdev->dev,
+				   TBT_NET_NUM_TX_BUFS *
+						sizeof(struct tbt_buf_desc),
+				   port->tx_ring.desc,
+				   port->tx_ring.dma);
+	}
+	port->tx_ring.desc = NULL;
+ring_alloc_failure:
+	vfree(port->tx_ring.buffers);
+	port->tx_ring.buffers = NULL;
+	vfree(port->rx_ring.buffers);
+	port->rx_ring.buffers = NULL;
+	res = -ENOMEM;
+	netif_err(port, ifup, net_dev, "Thunderbolt(TM) Networking port %u - unable to allocate memory\n",
+		  port->num);
+
+	if (!port->nhi_ctxt->msix_entries)
+		goto out;
+
+	devm_free_irq(&port->nhi_ctxt->pdev->dev,
+		      port->nhi_ctxt->msix_entries[2 + (port->num * 2)].vector,
+		      port);
+request_irq_failure:
+	devm_free_irq(&port->nhi_ctxt->pdev->dev,
+		      port->nhi_ctxt->msix_entries[3 + (port->num * 2)].vector,
+		      port);
+out:
+	return res;
+}
+
+static int tbt_net_close(struct net_device *net_dev)
+{
+	struct tbt_port *port = netdev_priv(net_dev);
+	int i;
+
+	/*
+	 * Close connection, disable rings, flow controls
+	 * and interrupts
+	 */
+	tbt_net_tear_down(net_dev, !(port->negotiation_status &
+				     BIT(RECEIVE_LOGOUT)));
+
+	cancel_work_sync(&port->login_response_work);
+	cancel_work_sync(&port->logout_work);
+	cancel_work_sync(&port->status_reply_work);
+	cancel_work_sync(&port->approve_inter_domain_work);
+
+	/* Rollback the Tx buffers that were allocated */
+	for (i = 0; i < TBT_NET_NUM_TX_BUFS; i++) {
+		if (port->tx_ring.buffers[i].page_offset == 0)
+			dma_free_coherent(&port->nhi_ctxt->pdev->dev,
+					  TBT_NUM_FRAMES_PER_PAGE *
+						TBT_RING_MAX_FRAME_SIZE,
+					  port->tx_ring.buffers[i].hdr,
+					  port->tx_ring.buffers[i].dma);
+		port->tx_ring.buffers[i].hdr = NULL;
+	}
+	/* Unmap the Rx buffers that were allocated */
+	for (i = 0; i < TBT_NET_NUM_RX_BUFS; i++)
+		if (port->rx_ring.buffers[i].page) {
+			put_page(port->rx_ring.buffers[i].page);
+			port->rx_ring.buffers[i].page = NULL;
+			dma_unmap_page(&port->nhi_ctxt->pdev->dev,
+				       port->rx_ring.buffers[i].dma, PAGE_SIZE,
+				       DMA_FROM_DEVICE);
+		}
+
+	/*
+	 * For central allocation, free all
+	 * otherwise free RX and then TX separately
+	 */
+	if (TBT_NET_SIZE_TOTAL_DESCS <= PAGE_SIZE) {
+		dmam_free_coherent(&port->nhi_ctxt->pdev->dev,
+				   TBT_NET_SIZE_TOTAL_DESCS,
+				   port->tx_ring.desc,
+				   port->tx_ring.dma);
+		port->rx_ring.desc = NULL;
+	} else {
+		dmam_free_coherent(&port->nhi_ctxt->pdev->dev,
+				   TBT_NET_NUM_RX_BUFS *
+						sizeof(struct tbt_buf_desc),
+				   port->rx_ring.desc,
+				   port->rx_ring.dma);
+		port->rx_ring.desc = NULL;
+		dmam_free_coherent(&port->nhi_ctxt->pdev->dev,
+				   TBT_NET_NUM_TX_BUFS *
+						sizeof(struct tbt_buf_desc),
+				   port->tx_ring.desc,
+				   port->tx_ring.dma);
+	}
+	port->tx_ring.desc = NULL;
+
+	vfree(port->tx_ring.buffers);
+	port->tx_ring.buffers = NULL;
+	vfree(port->rx_ring.buffers);
+	port->rx_ring.buffers = NULL;
+
+	devm_free_irq(&port->nhi_ctxt->pdev->dev,
+		      port->nhi_ctxt->msix_entries[3 + (port->num * 2)].vector,
+		      port);
+	devm_free_irq(&port->nhi_ctxt->pdev->dev,
+		      port->nhi_ctxt->msix_entries[2 + (port->num * 2)].vector,
+		      port);
+
+	netif_info(port, ifdown, net_dev, "Thunderbolt(TM) Networking port %u - is down\n",
+		   port->num);
+
+	return 0;
+}
+
+static bool tbt_net_xmit_csum(struct sk_buff *skb,
+			      struct tbt_desc_ring *tx_ring, u32 first,
+			      u32 last, u32 frame_count)
+{
+
+	struct tbt_frame_header *hdr = tx_ring->buffers[first].hdr;
+	__wsum wsum = (__force __wsum)htonl(skb->len -
+					    skb_transport_offset(skb));
+	int offset = skb_transport_offset(skb);
+	__sum16 *tucso;  /* TCP UDP Checksum Segment Offset */
+	__be16 protocol = skb->protocol;
+	u8 *dest = (u8 *)(hdr + 1);
+	int len;
+
+	if (skb->ip_summed != CHECKSUM_PARTIAL) {
+		for (; first != last;
+			first = (first + 1) & (TBT_NET_NUM_TX_BUFS - 1)) {
+			hdr = tx_ring->buffers[first].hdr;
+			hdr->frame_count = cpu_to_le32(frame_count);
+		}
+		return true;
+	}
+
+	if (protocol == htons(ETH_P_8021Q)) {
+		struct vlan_hdr *vhdr, vh;
+
+		vhdr = skb_header_pointer(skb, ETH_HLEN, sizeof(vh), &vh);
+		if (!vhdr)
+			return false;
+
+		protocol = vhdr->h_vlan_encapsulated_proto;
+	}
+
+	/*
+	 * Data points on the beginning of packet.
+	 * Check is the checksum absolute place in the
+	 * packet.
+	 * ipcso will update IP checksum.
+	 * tucso will update TCP/UPD checksum.
+	 */
+	if (protocol == htons(ETH_P_IP)) {
+		__sum16 *ipcso = (__sum16 *)(dest +
+			((u8 *)&(ip_hdr(skb)->check) - skb->data));
+
+		*ipcso = 0;
+		*ipcso = ip_fast_csum(dest + skb_network_offset(skb),
+				      ip_hdr(skb)->ihl);
+		if (ip_hdr(skb)->protocol == IPPROTO_TCP)
+			tucso = (__sum16 *)(dest +
+				((u8 *)&(tcp_hdr(skb)->check) - skb->data));
+		else if (ip_hdr(skb)->protocol == IPPROTO_UDP)
+			tucso = (__sum16 *)(dest +
+				((u8 *)&(udp_hdr(skb)->check) - skb->data));
+		else
+			return false;
+
+		*tucso = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
+					    ip_hdr(skb)->daddr, 0,
+					    ip_hdr(skb)->protocol, 0);
+	} else if (skb_is_gso(skb)) {
+		if (skb_is_gso_v6(skb)) {
+			tucso = (__sum16 *)(dest +
+				((u8 *)&(tcp_hdr(skb)->check) - skb->data));
+			*tucso = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+						  &ipv6_hdr(skb)->daddr,
+						  0, IPPROTO_TCP, 0);
+		} else if ((protocol == htons(ETH_P_IPV6)) &&
+			   (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)) {
+			tucso = (__sum16 *)(dest +
+				((u8 *)&(udp_hdr(skb)->check) - skb->data));
+			*tucso = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+						  &ipv6_hdr(skb)->daddr,
+						  0, IPPROTO_UDP, 0);
+		} else {
+			return false;
+		}
+	} else if (protocol == htons(ETH_P_IPV6)) {
+		tucso = (__sum16 *)(dest + skb_checksum_start_offset(skb) +
+				    skb->csum_offset);
+		*tucso = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+					  &ipv6_hdr(skb)->daddr,
+					  0, ipv6_hdr(skb)->nexthdr, 0);
+	} else {
+		return false;
+	}
+
+	/* First frame was headers, rest of the frames is data */
+	for (; first != last; first = (first + 1) & (TBT_NET_NUM_TX_BUFS - 1),
+								offset = 0) {
+		hdr = tx_ring->buffers[first].hdr;
+		dest = (u8 *)(hdr + 1) + offset;
+		len = le32_to_cpu(hdr->frame_size) - offset;
+		wsum = csum_partial(dest, len, wsum);
+		hdr->frame_count = cpu_to_le32(frame_count);
+	}
+	*tucso = csum_fold(wsum);
+
+	return true;
+}
+
+static netdev_tx_t tbt_net_xmit_frame(struct sk_buff *skb,
+				      struct net_device *net_dev)
+{
+	struct tbt_port *port = netdev_priv(net_dev);
+	void __iomem *iobase = port->nhi_ctxt->iobase;
+	void __iomem *reg = TBT_RING_CONS_PROD_REG(iobase,
+						   REG_TX_RING_BASE,
+						   port->local_path);
+	struct tbt_desc_ring *tx_ring = &port->tx_ring;
+	struct tbt_frame_header *hdr;
+	u32 prod_cons, prod, cons, first;
+	/* len equivalent to the fragment length */
+	unsigned int len = skb_headlen(skb);
+	/* data_len is overall packet length */
+	unsigned int data_len = skb->len;
+	u32 frm_idx, frag_num = 0;
+	const u8 *src = skb->data;
+	bool unmap = false;
+	__le32 *attr;
+	u8 *dest;
+
+	if (unlikely(data_len == 0 || data_len > TBT_NET_MTU))
+		goto invalid_packet;
+
+	prod_cons = ioread32(reg);
+	prod = TBT_REG_RING_PROD_EXTRACT(prod_cons);
+	cons = TBT_REG_RING_CONS_EXTRACT(prod_cons);
+	if (prod >= TBT_NET_NUM_TX_BUFS || cons >= TBT_NET_NUM_TX_BUFS)
+		goto tx_error;
+
+	if (data_len > (TBT_NUM_BUFS_BETWEEN(prod, cons, TBT_NET_NUM_TX_BUFS) *
+			TBT_RING_MAX_FRM_DATA_SZ)) {
+		unsigned long flags;
+
+		netif_stop_queue(net_dev);
+
+		spin_lock_irqsave(&port->nhi_ctxt->lock, flags);
+		/*
+		 * Enable TX interrupt to be notified about available buffers
+		 * and restart transmission upon this.
+		 */
+		RING_INT_ENABLE_TX(iobase, port->local_path);
+		spin_unlock_irqrestore(&port->nhi_ctxt->lock, flags);
+
+		return NETDEV_TX_BUSY;
+	}
+
+	first = prod;
+	attr = &tx_ring->desc[prod].attributes;
+	hdr = tx_ring->buffers[prod].hdr;
+	dest = (u8 *)(hdr + 1);
+	/* if overall packet is bigger than the frame data size */
+	for (frm_idx = 0; data_len > TBT_RING_MAX_FRM_DATA_SZ; ++frm_idx) {
+		u32 size_left = TBT_RING_MAX_FRM_DATA_SZ;
+
+		*attr &= cpu_to_le32(~(DESC_ATTR_LEN_MASK |
+				      DESC_ATTR_INT_EN |
+				      DESC_ATTR_DESC_DONE));
+		hdr->frame_size = cpu_to_le32(TBT_RING_MAX_FRM_DATA_SZ);
+		hdr->frame_index = cpu_to_le16(frm_idx);
+		hdr->frame_id = cpu_to_le16(port->frame_id);
+
+		do {
+			if (len > size_left) {
+				/*
+				 * Copy data onto tx buffer data with full
+				 * frame size then break
+				 * and go to next frame
+				 */
+				memcpy(dest, src, size_left);
+				len -= size_left;
+				dest += size_left;
+				src += size_left;
+				break;
+			}
+
+			memcpy(dest, src, len);
+			size_left -= len;
+			dest += len;
+
+			if (unmap) {
+				kunmap_atomic((void *)src);
+				unmap = false;
+			}
+			/*
+			 * Ensure all fragments have been processed
+			 */
+			if (frag_num < skb_shinfo(skb)->nr_frags) {
+				const skb_frag_t *frag =
+					&(skb_shinfo(skb)->frags[frag_num]);
+				len = skb_frag_size(frag);
+				/* map and then unmap quickly */
+				src = kmap_atomic(skb_frag_page(frag)) +
+							frag->page_offset;
+				unmap = true;
+				++frag_num;
+			} else if (unlikely(size_left > 0)) {
+				goto invalid_packet;
+			}
+		} while (size_left > 0);
+
+		data_len -= TBT_RING_MAX_FRM_DATA_SZ;
+		prod = (prod + 1) & (TBT_NET_NUM_TX_BUFS - 1);
+		attr = &tx_ring->desc[prod].attributes;
+		hdr = tx_ring->buffers[prod].hdr;
+		dest = (u8 *)(hdr + 1);
+	}
+
+	*attr &= cpu_to_le32(~(DESC_ATTR_LEN_MASK | DESC_ATTR_DESC_DONE));
+	/* Enable the interrupts, for resuming from stop queue later (if so) */
+	*attr |= cpu_to_le32(DESC_ATTR_INT_EN |
+		(((sizeof(struct tbt_frame_header) + data_len) <<
+		  DESC_ATTR_LEN_SHIFT) & DESC_ATTR_LEN_MASK));
+	hdr->frame_size = cpu_to_le32(data_len);
+	hdr->frame_index = cpu_to_le16(frm_idx);
+	hdr->frame_id = cpu_to_le16(port->frame_id);
+
+	/* In case  the remaining data_len is smaller than a frame */
+	while (len < data_len) {
+		memcpy(dest, src, len);
+		data_len -= len;
+		dest += len;
+
+		if (unmap) {
+			kunmap_atomic((void *)src);
+			unmap = false;
+		}
+
+		if (frag_num < skb_shinfo(skb)->nr_frags) {
+			const skb_frag_t *frag =
+					&(skb_shinfo(skb)->frags[frag_num]);
+			len = skb_frag_size(frag);
+			src = kmap_atomic(skb_frag_page(frag)) +
+							frag->page_offset;
+			unmap = true;
+			++frag_num;
+		} else if (unlikely(data_len > 0)) {
+			goto invalid_packet;
+		}
+	}
+	memcpy(dest, src, data_len);
+	if (unmap) {
+		kunmap_atomic((void *)src);
+		unmap = false;
+	}
+
+	++frm_idx;
+	prod = (prod + 1) & (TBT_NET_NUM_TX_BUFS - 1);
+
+	if (!tbt_net_xmit_csum(skb, tx_ring, first, prod, frm_idx))
+		goto invalid_packet;
+
+	if (port->match_frame_id)
+		++port->frame_id;
+
+	prod_cons &= ~REG_RING_PROD_MASK;
+	prod_cons |= (prod << REG_RING_PROD_SHIFT) & REG_RING_PROD_MASK;
+	wmb(); /* make sure producer update is done after buffers are ready */
+	iowrite32(prod_cons, reg);
+
+	++port->stats.tx_packets;
+	port->stats.tx_bytes += skb->len;
+
+	dev_consume_skb_any(skb);
+	return NETDEV_TX_OK;
+
+invalid_packet:
+	netif_err(port, tx_err, net_dev, "port %u invalid transmit packet\n",
+		  port->num);
+tx_error:
+	++port->stats.tx_errors;
+	dev_kfree_skb_any(skb);
+	return NETDEV_TX_OK;
 }
 
+static void tbt_net_set_rx_mode(struct net_device *net_dev)
+{
+	struct tbt_port *port = netdev_priv(net_dev);
+	struct netdev_hw_addr *ha;
+
+	if (net_dev->flags & IFF_PROMISC)
+		port->packet_filters |= BIT(PACKET_TYPE_PROMISCUOUS);
+	else
+		port->packet_filters &= ~BIT(PACKET_TYPE_PROMISCUOUS);
+	if (net_dev->flags & IFF_ALLMULTI)
+		port->packet_filters |= BIT(PACKET_TYPE_ALL_MULTICAST);
+	else
+		port->packet_filters &= ~BIT(PACKET_TYPE_ALL_MULTICAST);
+
+	/* if you have more than a single MAC address */
+	if (netdev_uc_count(net_dev) > 1)
+		port->packet_filters |= BIT(PACKET_TYPE_UNICAST_PROMISCUOUS);
+	/* if have a single MAC address */
+	else if (netdev_uc_count(net_dev) == 1) {
+		netdev_for_each_uc_addr(ha, net_dev)
+			/* checks whether the MAC is what we set */
+			if (ether_addr_equal(ha->addr, net_dev->dev_addr))
+				port->packet_filters &=
+					~BIT(PACKET_TYPE_UNICAST_PROMISCUOUS);
+			else
+				port->packet_filters |=
+					BIT(PACKET_TYPE_UNICAST_PROMISCUOUS);
+	} else {
+		port->packet_filters &= ~BIT(PACKET_TYPE_UNICAST_PROMISCUOUS);
+	}
+
+	/* Populate the multicast hash table with received MAC addresses */
+	memset(port->multicast_hash_table, 0,
+	       sizeof(port->multicast_hash_table));
+	netdev_for_each_mc_addr(ha, net_dev) {
+		u16 hash_val = TBT_NET_ETHER_ADDR_HASH(ha->addr);
+
+		port->multicast_hash_table[hash_val / BITS_PER_U32] |=
+						BIT(hash_val % BITS_PER_U32);
+	}
+
+}
+
+static struct rtnl_link_stats64 *tbt_net_get_stats64(
+					struct net_device *net_dev,
+					struct rtnl_link_stats64 *stats)
+{
+	struct tbt_port *port = netdev_priv(net_dev);
+
+	memset(stats, 0, sizeof(*stats));
+	stats->tx_packets = port->stats.tx_packets;
+	stats->tx_bytes = port->stats.tx_bytes;
+	stats->tx_errors = port->stats.tx_errors;
+	stats->rx_packets = port->stats.rx_packets;
+	stats->rx_bytes = port->stats.rx_bytes;
+	stats->rx_length_errors = port->stats.rx_length_errors;
+	stats->rx_over_errors = port->stats.rx_over_errors;
+	stats->rx_crc_errors = port->stats.rx_crc_errors;
+	stats->rx_missed_errors = port->stats.rx_missed_errors;
+	stats->rx_errors = stats->rx_length_errors + stats->rx_over_errors +
+			   stats->rx_crc_errors + stats->rx_missed_errors;
+	stats->multicast = port->stats.multicast;
+	return stats;
+}
+
+static int tbt_net_set_mac_address(struct net_device *net_dev, void *addr)
+{
+	struct sockaddr *saddr = addr;
+
+	if (!is_valid_ether_addr(saddr->sa_data))
+		return -EADDRNOTAVAIL;
+
+	memcpy(net_dev->dev_addr, saddr->sa_data, net_dev->addr_len);
+
+	return 0;
+}
+
+static int tbt_net_change_mtu(struct net_device *net_dev, int new_mtu)
+{
+	struct tbt_port *port = netdev_priv(net_dev);
+
+	/* MTU < 68 is an error and causes problems on some kernels */
+	if (new_mtu < 68 || new_mtu > (TBT_NET_MTU - ETH_HLEN))
+		return -EINVAL;
+
+	netif_info(port, probe, net_dev, "Thunderbolt(TM) Networking port %u - changing MTU from %u to %d\n",
+		   port->num, net_dev->mtu, new_mtu);
+
+	net_dev->mtu = new_mtu;
+
+	return 0;
+}
+
+static const struct net_device_ops tbt_netdev_ops = {
+	/* called when the network is up'ed */
+	.ndo_open		= tbt_net_open,
+	/* called when the network is down'ed */
+	.ndo_stop		= tbt_net_close,
+	.ndo_start_xmit		= tbt_net_xmit_frame,
+	.ndo_set_rx_mode	= tbt_net_set_rx_mode,
+	.ndo_get_stats64	= tbt_net_get_stats64,
+	.ndo_set_mac_address	= tbt_net_set_mac_address,
+	.ndo_change_mtu		= tbt_net_change_mtu,
+	.ndo_validate_addr	= eth_validate_addr,
+};
+
+static int tbt_net_get_settings(__maybe_unused struct net_device *net_dev,
+				struct ethtool_cmd *ecmd)
+{
+	ecmd->supported |= SUPPORTED_20000baseKR2_Full;
+	ecmd->advertising |= ADVERTISED_20000baseKR2_Full;
+	ecmd->autoneg = AUTONEG_DISABLE;
+	ecmd->transceiver = XCVR_INTERNAL;
+	ecmd->supported |= SUPPORTED_FIBRE;
+	ecmd->advertising |= ADVERTISED_FIBRE;
+	ecmd->port = PORT_FIBRE;
+	ethtool_cmd_speed_set(ecmd, SPEED_20000);
+	ecmd->duplex = DUPLEX_FULL;
+
+	return 0;
+}
+
+
+static u32 tbt_net_get_msglevel(struct net_device *net_dev)
+{
+	struct tbt_port *port = netdev_priv(net_dev);
+
+	return port->msg_enable;
+}
+
+static void tbt_net_set_msglevel(struct net_device *net_dev, u32 data)
+{
+	struct tbt_port *port = netdev_priv(net_dev);
+
+	port->msg_enable = data;
+}
+
+static void tbt_net_get_strings(__maybe_unused struct net_device *net_dev,
+				u32 stringset, u8 *data)
+{
+	if (stringset == ETH_SS_STATS)
+		memcpy(data, tbt_net_gstrings_stats,
+		       sizeof(tbt_net_gstrings_stats));
+}
+
+static void tbt_net_get_ethtool_stats(struct net_device *net_dev,
+				      __maybe_unused struct ethtool_stats *sts,
+				      u64 *data)
+{
+	struct tbt_port *port = netdev_priv(net_dev);
+
+	memcpy(data, &port->stats, sizeof(port->stats));
+}
+
+static int tbt_net_get_sset_count(__maybe_unused struct net_device *net_dev,
+				  int sset)
+{
+	if (sset == ETH_SS_STATS)
+		return sizeof(tbt_net_gstrings_stats) / ETH_GSTRING_LEN;
+	return -EOPNOTSUPP;
+}
+
+static void tbt_net_get_drvinfo(struct net_device *net_dev,
+				struct ethtool_drvinfo *drvinfo)
+{
+	struct tbt_port *port = netdev_priv(net_dev);
+
+	strlcpy(drvinfo->driver, "Thunderbolt(TM) Networking",
+		sizeof(drvinfo->driver));
+	strlcpy(drvinfo->version, DRV_VERSION, sizeof(drvinfo->version));
+
+	strlcpy(drvinfo->bus_info, pci_name(port->nhi_ctxt->pdev),
+		sizeof(drvinfo->bus_info));
+	drvinfo->n_stats = tbt_net_get_sset_count(net_dev, ETH_SS_STATS);
+}
+
+static const struct ethtool_ops tbt_net_ethtool_ops = {
+	.get_settings		= tbt_net_get_settings,
+	.get_drvinfo		= tbt_net_get_drvinfo,
+	.get_link		= ethtool_op_get_link,
+	.get_msglevel		= tbt_net_get_msglevel,
+	.set_msglevel		= tbt_net_set_msglevel,
+	.get_strings		= tbt_net_get_strings,
+	.get_ethtool_stats	= tbt_net_get_ethtool_stats,
+	.get_sset_count		= tbt_net_get_sset_count,
+};
+
 static inline int send_message(struct tbt_port *port, const char *func,
 				enum pdf_value pdf, u32 msg_len,
 				const void *msg)
@@ -496,6 +1920,10 @@ void negotiation_events(struct net_device *net_dev,
 		/* configure TX ring */
 		reg = iobase + REG_TX_RING_BASE +
 		      (port->local_path * REG_RING_STEP);
+		iowrite32(lower_32_bits(port->tx_ring.dma),
+			  reg + REG_RING_PHYS_LO_OFFSET);
+		iowrite32(upper_32_bits(port->tx_ring.dma),
+			  reg + REG_RING_PHYS_HI_OFFSET);
 
 		tx_ring_conf = (TBT_NET_NUM_TX_BUFS << REG_RING_SIZE_SHIFT) &
 				REG_RING_SIZE_MASK;
@@ -538,6 +1966,10 @@ void negotiation_events(struct net_device *net_dev,
 		 */
 		reg = iobase + REG_RX_RING_BASE +
 		      (port->local_path * REG_RING_STEP);
+		iowrite32(lower_32_bits(port->rx_ring.dma),
+			  reg + REG_RING_PHYS_LO_OFFSET);
+		iowrite32(upper_32_bits(port->rx_ring.dma),
+			  reg + REG_RING_PHYS_HI_OFFSET);
 
 		rx_ring_conf = (TBT_NET_NUM_RX_BUFS << REG_RING_SIZE_SHIFT) &
 				REG_RING_SIZE_MASK;
@@ -547,6 +1979,17 @@ void negotiation_events(struct net_device *net_dev,
 				REG_RING_BUF_SIZE_MASK;
 
 		iowrite32(rx_ring_conf, reg + REG_RING_SIZE_OFFSET);
+		/* allocate RX buffers and configure the descriptors */
+		if (!tbt_net_alloc_rx_buffers(&port->nhi_ctxt->pdev->dev,
+					      &port->rx_ring,
+					      TBT_NET_NUM_RX_BUFS,
+					      reg + REG_RING_CONS_PROD_OFFSET,
+					      GFP_KERNEL)) {
+			netif_err(port, link, net_dev, "Thunderbolt(TM) Networking port %u - no memory for receive buffers\n",
+				  port->num);
+			tbt_net_tear_down(net_dev, true);
+			break;
+		}
 
 		spin_lock_irqsave(&port->nhi_ctxt->lock, flags);
 		/* enable RX interrupt */
@@ -559,6 +2002,7 @@ void negotiation_events(struct net_device *net_dev,
 		netif_info(port, link, net_dev, "Thunderbolt(TM) Networking port %u - ready\n",
 			   port->num);
 
+		napi_enable(&port->napi);
 		netif_carrier_on(net_dev);
 		netif_start_queue(net_dev);
 		break;
@@ -769,15 +2213,42 @@ struct net_device *nhi_alloc_etherdev(struct tbt_nhi_ctxt *nhi_ctxt,
 	scnprintf(net_dev->name, sizeof(net_dev->name), "tbtnet%%dp%hhu",
 		  port_num);
 
+	net_dev->netdev_ops = &tbt_netdev_ops;
+
+	netif_napi_add(net_dev, &port->napi, tbt_net_poll, NAPI_POLL_WEIGHT);
+
+	net_dev->hw_features = NETIF_F_SG |
+			       NETIF_F_ALL_TSO |
+			       NETIF_F_UFO |
+			       NETIF_F_GRO |
+			       NETIF_F_IP_CSUM |
+			       NETIF_F_IPV6_CSUM;
+	net_dev->features = net_dev->hw_features;
+	if (nhi_ctxt->pci_using_dac)
+		net_dev->features |= NETIF_F_HIGHDMA;
+
 	INIT_DELAYED_WORK(&port->login_retry_work, login_retry);
 	INIT_WORK(&port->login_response_work, login_response);
 	INIT_WORK(&port->logout_work, logout);
 	INIT_WORK(&port->status_reply_work, status_reply);
 	INIT_WORK(&port->approve_inter_domain_work, approve_inter_domain);
 
+	net_dev->ethtool_ops = &tbt_net_ethtool_ops;
+
+	tbt_net_change_mtu(net_dev, TBT_NET_MTU - ETH_HLEN);
+
+	if (register_netdev(net_dev))
+		goto err_register;
+
+	netif_carrier_off(net_dev);
+
 	netif_info(port, probe, net_dev,
 		   "Thunderbolt(TM) Networking port %u - MAC Address: %pM\n",
 		   port_num, net_dev->dev_addr);
 
 	return net_dev;
+
+err_register:
+	free_netdev(net_dev);
+	return NULL;
 }
-- 
2.7.4

^ permalink raw reply related