DPDK-dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3 2/4] net/e1000: add firmware version get
From: Qiming Yang @ 2016-12-27 12:30 UTC (permalink / raw)
  To: dev, thomas.monjalon; +Cc: remy.horton, ferruh.yigit, Qiming Yang
In-Reply-To: <1482841816-54143-1-git-send-email-qiming.yang@intel.com>

This patch adds a new function eth_igb_fw_version_get.

Signed-off-by: Qiming Yang <qiming.yang@intel.com>
---
v3 changes:
 * use eth_igb_fw_version_get(struct rte_eth_dev *dev, u32 *fw_major,
   u32 *fw_minor, u32 *fw_patch, u32 *etrack_id) instead
   of eth_igb_fw_version_get(struct rte_eth_dev *dev, char *fw_version,
   int fw_length). Add statement in /doc/guides/nics/features/igb.ini.
---
---
 doc/guides/nics/features/igb.ini |  1 +
 drivers/net/e1000/igb_ethdev.c   | 43 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+)

diff --git a/doc/guides/nics/features/igb.ini b/doc/guides/nics/features/igb.ini
index 9fafe72..ffd87ba 100644
--- a/doc/guides/nics/features/igb.ini
+++ b/doc/guides/nics/features/igb.ini
@@ -39,6 +39,7 @@ EEPROM dump          = Y
 Registers dump       = Y
 BSD nic_uio          = Y
 Linux UIO            = Y
+FW version           = Y
 Linux VFIO           = Y
 x86-32               = Y
 x86-64               = Y
diff --git a/drivers/net/e1000/igb_ethdev.c b/drivers/net/e1000/igb_ethdev.c
index 4a15447..25344b7 100644
--- a/drivers/net/e1000/igb_ethdev.c
+++ b/drivers/net/e1000/igb_ethdev.c
@@ -120,6 +120,8 @@ static int eth_igb_xstats_get_names(struct rte_eth_dev *dev,
 				    unsigned limit);
 static void eth_igb_stats_reset(struct rte_eth_dev *dev);
 static void eth_igb_xstats_reset(struct rte_eth_dev *dev);
+static void eth_igb_fw_version_get(struct rte_eth_dev *dev, u32 *fw_major,
+		u32 *fw_minor, u32 *fw_patch, u32 *etrack_id);
 static void eth_igb_infos_get(struct rte_eth_dev *dev,
 			      struct rte_eth_dev_info *dev_info);
 static const uint32_t *eth_igb_supported_ptypes_get(struct rte_eth_dev *dev);
@@ -389,6 +391,7 @@ static const struct eth_dev_ops eth_igb_ops = {
 	.xstats_get_names     = eth_igb_xstats_get_names,
 	.stats_reset          = eth_igb_stats_reset,
 	.xstats_reset         = eth_igb_xstats_reset,
+	.fw_version_get       = eth_igb_fw_version_get,
 	.dev_infos_get        = eth_igb_infos_get,
 	.dev_supported_ptypes_get = eth_igb_supported_ptypes_get,
 	.mtu_set              = eth_igb_mtu_set,
@@ -1981,6 +1984,46 @@ eth_igbvf_stats_reset(struct rte_eth_dev *dev)
 }
 
 static void
+eth_igb_fw_version_get(struct rte_eth_dev *dev, u32 *fw_major, u32 *fw_minor,
+			u32 *fw_patch, u32 *etrack_id)
+{
+	struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct e1000_fw_version fw;
+
+	e1000_get_fw_version(hw, &fw);
+
+	switch (hw->mac.type) {
+	case e1000_i210:
+	case e1000_i211:
+		if (!(e1000_get_flash_presence_i210(hw))) {
+			*fw_major = fw.invm_major;
+			*fw_minor = fw.invm_minor;
+			break;
+		}
+		/* fall through */
+	default:
+		/* if option rom is valid, display its version too*/
+		if (fw.or_valid) {
+			*fw_major = fw.eep_major;
+			*fw_minor = fw.eep_minor;
+			*etrack_id = fw.etrack_id;
+			*fw_patch = fw.or_patch;
+		/* no option rom */
+		} else {
+			if (fw.etrack_id != 0X0000) {
+			*fw_major = fw.eep_major;
+			*fw_minor = fw.eep_minor;
+			*etrack_id = fw.etrack_id;
+			} else {
+			*fw_major = fw.eep_major;
+			*fw_minor = fw.eep_minor;
+			}
+		}
+		break;
+	}
+}
+
+static void
 eth_igb_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 {
 	struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
-- 
2.7.4

^ permalink raw reply related

* [PATCH v3 1/4] ethdev: add firmware information get
From: Qiming Yang @ 2016-12-27 12:30 UTC (permalink / raw)
  To: dev, thomas.monjalon; +Cc: remy.horton, ferruh.yigit, Qiming Yang
In-Reply-To: <1482841816-54143-1-git-send-email-qiming.yang@intel.com>

This patch adds a new API 'rte_eth_dev_fw_info_get' for fetching
firmware related information by a given device.

Signed-off-by: Qiming Yang <qiming.yang@intel.com>
Acked-by: Remy Horton <remy.horton@intel.com>
---
v2 changes:
* modified some comment statements.
v3 changes:
* change API, use rte_eth_dev_fw_info_get(uint8_t port_id,
  uint32_t *fw_major, uint32_t *fw_minor, uint32_t *fw_patch,
  uint32_t *etrack_id) instead of rte_eth_dev_fwver_get(uint8_t port_id,
  char *fw_version, int fw_length).
  Add statement in /doc/guides/nics/features/default.ini and
  release_17_02.rst.
---
---
 doc/guides/nics/features/default.ini   |  1 +
 doc/guides/rel_notes/release_17_02.rst |  4 ++++
 lib/librte_ether/rte_ethdev.c          | 14 ++++++++++++++
 lib/librte_ether/rte_ethdev.h          | 23 +++++++++++++++++++++++
 lib/librte_ether/rte_ether_version.map |  1 +
 5 files changed, 43 insertions(+)

diff --git a/doc/guides/nics/features/default.ini b/doc/guides/nics/features/default.ini
index f1bf9bf..8237ee4 100644
--- a/doc/guides/nics/features/default.ini
+++ b/doc/guides/nics/features/default.ini
@@ -66,3 +66,4 @@ x86-64               =
 Usage doc            =
 Design doc           =
 Perf doc             =
+FW version           =
diff --git a/doc/guides/rel_notes/release_17_02.rst b/doc/guides/rel_notes/release_17_02.rst
index 180af82..f6dc6c0 100644
--- a/doc/guides/rel_notes/release_17_02.rst
+++ b/doc/guides/rel_notes/release_17_02.rst
@@ -52,6 +52,10 @@ New Features
   See the :ref:`Generic flow API <Generic_flow_API>` documentation for more
   information.
 
+* **Added firmware information get API.**
+ Added a new function ``rte_eth_dev_fw_info_get()`` to fetch firmware related
+ information by a given device. Information includes major firmware version,
+ minor firmware version, patch number and etrack id.
 
 Resolved Issues
 ---------------
diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index 280f0db..f399f09 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -1586,6 +1586,20 @@ rte_eth_dev_set_rx_queue_stats_mapping(uint8_t port_id, uint16_t rx_queue_id,
 }
 
 void
+rte_eth_dev_fw_info_get(uint8_t port_id, uint32_t *fw_major, uint32_t *fw_minor,
+	uint32_t *fw_patch, uint32_t *etrack_id)
+{
+	struct rte_eth_dev *dev;
+
+	RTE_ETH_VALID_PORTID_OR_RET(port_id);
+	dev = &rte_eth_devices[port_id];
+
+	RTE_FUNC_PTR_OR_RET(*dev->dev_ops->fw_version_get);
+	(*dev->dev_ops->fw_version_get)(dev, fw_major, fw_minor,
+					fw_patch, etrack_id);
+}
+
+void
 rte_eth_dev_info_get(uint8_t port_id, struct rte_eth_dev_info *dev_info)
 {
 	struct rte_eth_dev *dev;
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index fb51754..829f652 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -1150,6 +1150,11 @@ typedef uint32_t (*eth_rx_queue_count_t)(struct rte_eth_dev *dev,
 typedef int (*eth_rx_descriptor_done_t)(void *rxq, uint16_t offset);
 /**< @internal Check DD bit of specific RX descriptor */
 
+typedef void (*eth_fw_version_get_t)(struct rte_eth_dev *dev,
+		uint32_t *fw_major, uint32_t *fw_minor,
+		uint32_t *fw_patch, uint32_t *etrack_id);
+/**< @internal Get firmware information of an Ethernet device. */
+
 typedef void (*eth_rxq_info_get_t)(struct rte_eth_dev *dev,
 	uint16_t rx_queue_id, struct rte_eth_rxq_info *qinfo);
 
@@ -1457,6 +1462,7 @@ struct eth_dev_ops {
 	eth_txq_info_get_t         txq_info_get; /**< retrieve TX queue information. */
 	eth_dev_supported_ptypes_get_t dev_supported_ptypes_get;
 	/**< Get packet types supported and identified by device. */
+	eth_fw_version_get_t       fw_version_get; /**< Get firmware version. */
 
 	vlan_filter_set_t          vlan_filter_set; /**< Filter VLAN Setup. */
 	vlan_tpid_set_t            vlan_tpid_set; /**< Outer/Inner VLAN TPID Setup. */
@@ -2395,6 +2401,23 @@ void rte_eth_macaddr_get(uint8_t port_id, struct ether_addr *mac_addr);
 void rte_eth_dev_info_get(uint8_t port_id, struct rte_eth_dev_info *dev_info);
 
 /**
+ * Retrieve the firmware version of a device.
+ *
+ * @param port_id
+ *   The port identifier of the device.
+ * @param fw_major
+ *   A array pointer to store the major firmware version of a device.
+ * @param fw_minor
+ *   A array pointer to store the minor firmware version of a device.
+ * @param fw_patch
+ *   A array pointer to store the firmware patch number of a device.
+ * @param etrack_id
+ *   A array pointer to store the nvm version of a device.
+ */
+void rte_eth_dev_fw_info_get(uint8_t port_id, uint32_t *fw_major,
+	uint32_t *fw_minor, uint32_t *fw_patch, uint32_t *etrack_id);
+
+/**
  * Retrieve the supported packet types of an Ethernet device.
  *
  * When a packet type is announced as supported, it *must* be recognized by
diff --git a/lib/librte_ether/rte_ether_version.map b/lib/librte_ether/rte_ether_version.map
index a021781..a89bfaa 100644
--- a/lib/librte_ether/rte_ether_version.map
+++ b/lib/librte_ether/rte_ether_version.map
@@ -156,5 +156,6 @@ DPDK_17.02 {
 	rte_flow_flush;
 	rte_flow_query;
 	rte_flow_validate;
+	rte_eth_dev_fw_info_get;
 
 } DPDK_16.11;
-- 
2.7.4

^ permalink raw reply related

* [PATCH v3 0/4] new API 'rte_eth_dev_fw_info_get'
From: Qiming Yang @ 2016-12-27 12:30 UTC (permalink / raw)
  To: dev, thomas.monjalon; +Cc: remy.horton, ferruh.yigit, Qiming Yang
In-Reply-To: <1481008582-69416-1-git-send-email-qiming.yang@intel.com>

These four patches add a new function ``rte_eth_dev_fw_info_get()``
to fetch firmware related information for a given device.
The information includes the major firmware version, minor firmware
version, patch number and etrack id.

Qiming Yang (4):
  ethdev: add firmware information get
  net/e1000: add firmware version get
  net/ixgbe: add firmware version get
  net/i40e: add firmware version get

 doc/guides/nics/features/default.ini   |  1 +
 doc/guides/nics/features/i40e.ini      |  1 +
 doc/guides/nics/features/igb.ini       |  1 +
 doc/guides/nics/features/ixgbe.ini     |  1 +
 doc/guides/rel_notes/release_17_02.rst |  4 ++++
 drivers/net/e1000/igb_ethdev.c         | 43 ++++++++++++++++++++++++++++++++++
 drivers/net/i40e/i40e_ethdev.c         | 15 ++++++++++++
 drivers/net/ixgbe/ixgbe_ethdev.c       | 17 ++++++++++++++
 lib/librte_ether/rte_ethdev.c          | 14 +++++++++++
 lib/librte_ether/rte_ethdev.h          | 23 ++++++++++++++++++
 lib/librte_ether/rte_ether_version.map |  1 +
 11 files changed, 121 insertions(+)

-- 
2.7.4

^ permalink raw reply

* Re: [PATCH] net/mlx5: fix multi segment packet send
From: Adrien Mazarguil @ 2016-12-27 10:21 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev, stable
In-Reply-To: <1482766116-6202-1-git-send-email-shahafs@mellanox.com>

Hi Shahaf,

On Mon, Dec 26, 2016 at 05:28:36PM +0200, Shahaf Shuler wrote:
> Dseg pointer is not initialised when the first segment is inlined
> causing a segmentation fault in such situation.
> 
> Fixes: 2a66cf378954 ("net/mlx5: support inline send")
> 
> CC: stable@dpdk.org
> Signed-off-by: Shahaf Shuler <shahafs@mellanox.com>

Thanks for fixing this bug, a few comments below.

> ---
>  drivers/net/mlx5/mlx5_rxtx.c | 6 +++++-
>  1 file changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
> index 97810e8..d6688c6 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.c
> +++ b/drivers/net/mlx5/mlx5_rxtx.c
> @@ -483,7 +483,7 @@
>  				assert(addr <= addr_end);
>  			}
>  			/*
> -			 * 2 DWORDs consumed by the WQE header + 1 DSEG +
> +			 * 2 DWORDs consumed by the WQE header + ETH segment +
>  			 * the size of the inline part of the packet.
>  			 */
>  			ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
> @@ -498,6 +498,10 @@
>  			} else if (!segs_n) {
>  				goto next_pkt;
>  			} else {
> +				/* dseg will be advance as part of next_seg*/

Nit-picking here, there is a missing space in the above comment.

> +				dseg = (volatile rte_v128u32_t *)

rte_v128u32_t does not exist (yet) in the tree, this patch therefore depends
on "eal: define generic vector types" [1]. Such dependencies should be
mentioned in a notes section of a patch (after a three-dash line).

Regarding stable@dpdk.org, in case the vector types patch is not applied on
the stable branch, you'll also have to provide your own definition.

> +					((uintptr_t)wqe +
> +					 ((ds - 1) * MLX5_WQE_DWORD_SIZE));
>  				goto next_seg;
>  			}
>  		} else {
> -- 
> 1.8.3.1
> 

Otherwise,

Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>

[1] http://dpdk.org/ml/archives/dev/2016-November/050261.html

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply

* [PATCH 2/2] app/testpmd: remove explicit ixgbe link request
From: Jerin Jacob @ 2016-12-27 10:09 UTC (permalink / raw)
  To: dev; +Cc: konstantin.ananyev, helin.zhang, thomas.monjalon, Jerin Jacob
In-Reply-To: <1482833398-30145-1-git-send-email-jerin.jacob@caviumnetworks.com>

Moved the explicit ixgbe driver link request from the
app/testpmd Makefile to mk/rte.app.mk in order to:
1) Maintain the correct link ordering (from higher level libraries
to lower level libraries).
2) In shared lib configuration, let any application, not just testpmd,
use the PMD-specific APIs exposed by ixgbe.

Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 app/test-pmd/Makefile | 2 --
 mk/rte.app.mk         | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/app/test-pmd/Makefile b/app/test-pmd/Makefile
index 5988c3e..96e0c67 100644
--- a/app/test-pmd/Makefile
+++ b/app/test-pmd/Makefile
@@ -59,8 +59,6 @@ SRCS-y += csumonly.c
 SRCS-y += icmpecho.c
 SRCS-$(CONFIG_RTE_LIBRTE_IEEE1588) += ieee1588fwd.c
 
-_LDLIBS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += -lrte_pmd_ixgbe
-
 CFLAGS_cmdline.o := -D_GNU_SOURCE
 
 # this application needs libraries first
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index f75f0e2..aee235c 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -101,6 +101,7 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_CFGFILE)        += -lrte_cfgfile
 
 _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_BOND)       += -lrte_pmd_bond
 _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_XENVIRT)    += -lrte_pmd_xenvirt -lxenstore
+_LDLIBS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD)      += -lrte_pmd_ixgbe
 
 ifeq ($(CONFIG_RTE_BUILD_SHARED_LIB),n)
 # plugins (link only if static libraries)
@@ -114,7 +115,6 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_ENA_PMD)        += -lrte_pmd_ena
 _LDLIBS-$(CONFIG_RTE_LIBRTE_ENIC_PMD)       += -lrte_pmd_enic
 _LDLIBS-$(CONFIG_RTE_LIBRTE_FM10K_PMD)      += -lrte_pmd_fm10k
 _LDLIBS-$(CONFIG_RTE_LIBRTE_I40E_PMD)       += -lrte_pmd_i40e
-_LDLIBS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD)      += -lrte_pmd_ixgbe
 _LDLIBS-$(CONFIG_RTE_LIBRTE_MLX4_PMD)       += -lrte_pmd_mlx4 -libverbs
 _LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD)       += -lrte_pmd_mlx5 -libverbs
 _LDLIBS-$(CONFIG_RTE_LIBRTE_MPIPE_PMD)      += -lrte_pmd_mpipe -lgxio
-- 
2.5.5

^ permalink raw reply related

* [PATCH 1/2] net/ixgbe: remove unused global variable
From: Jerin Jacob @ 2016-12-27 10:09 UTC (permalink / raw)
  To: dev; +Cc: konstantin.ananyev, helin.zhang, thomas.monjalon, Jerin Jacob

Removed unused "reg_info" global variable from ixgbe driver.

cat build/app/testpmd.map | grep "Allocating common symbols" -A 15
Allocating common symbols
Common symbol   size    file
reg_info        0x18    build/lib/librte_pmd_ixgbe.a(ixgbe_ethdev.o)

Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 drivers/net/ixgbe/ixgbe_regs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ixgbe/ixgbe_regs.h b/drivers/net/ixgbe/ixgbe_regs.h
index 773e169..2aa4820 100644
--- a/drivers/net/ixgbe/ixgbe_regs.h
+++ b/drivers/net/ixgbe/ixgbe_regs.h
@@ -41,7 +41,7 @@ struct reg_info {
 	uint32_t count;
 	uint32_t stride;
 	const char *name;
-} reg_info;
+};
 
 static const struct reg_info ixgbe_regs_general[] = {
 	{IXGBE_CTRL, 1, 1, "IXGBE_CTRL"},
-- 
2.5.5

^ permalink raw reply related

* [PATCH v2 4/4] lib/librte_vhost: improve vhost perf using rte_memset
From: Zhiyong Yang @ 2016-12-27 10:04 UTC (permalink / raw)
  To: dev
  Cc: yuanhan.liu, thomas.monjalon, bruce.richardson,
	konstantin.ananyev, pablo.de.lara.guarch, Zhiyong Yang
In-Reply-To: <1482833098-38096-1-git-send-email-zhiyong.yang@intel.com>

Using rte_memset instead of copy_virtio_net_hdr can bring a 3%~4%
performance improvement on IA platforms in virtio/vhost
non-mergeable loopback testing.

Two key points have been considered:
1. One variable initialization could be saved, which involves memory
store.
2. copy_virtio_net_hdr involves both load (from stack, the virtio_hdr
var) and store (to virtio driver memory), while rte_memset just involves
store.

Signed-off-by: Zhiyong Yang <zhiyong.yang@intel.com>
---

Changes in V2:

Modify release_17_02.rst description.

 doc/guides/rel_notes/release_17_02.rst |  7 +++++++
 lib/librte_vhost/virtio_net.c          | 18 +++++++++++-------
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/doc/guides/rel_notes/release_17_02.rst b/doc/guides/rel_notes/release_17_02.rst
index 180af82..3d39cde 100644
--- a/doc/guides/rel_notes/release_17_02.rst
+++ b/doc/guides/rel_notes/release_17_02.rst
@@ -52,6 +52,13 @@ New Features
   See the :ref:`Generic flow API <Generic_flow_API>` documentation for more
   information.
 
+* **Introduced rte_memset on IA platform.**
+
+  A performance drop was observed in some cases on Ivybridge when DPDK code called
+  the glibc function memset, so a more efficient function was needed to replace
+  it. The function rte_memset supports three types of instruction sets:
+  sse & avx (128 bits), avx2 (256 bits) and avx512 (512 bits), and has better
+  performance than glibc memset.
 
 Resolved Issues
 ---------------
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 595f67c..392b31b 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -37,6 +37,7 @@
 
 #include <rte_mbuf.h>
 #include <rte_memcpy.h>
+#include <rte_memset.h>
 #include <rte_ether.h>
 #include <rte_ip.h>
 #include <rte_virtio_net.h>
@@ -194,7 +195,7 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs,
 	uint32_t cpy_len;
 	struct vring_desc *desc;
 	uint64_t desc_addr;
-	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
+	struct virtio_net_hdr *virtio_hdr;
 
 	desc = &descs[desc_idx];
 	desc_addr = gpa_to_vva(dev, desc->addr);
@@ -208,8 +209,9 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs,
 
 	rte_prefetch0((void *)(uintptr_t)desc_addr);
 
-	virtio_enqueue_offload(m, &virtio_hdr.hdr);
-	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
+	virtio_hdr = (struct virtio_net_hdr *)(uintptr_t)desc_addr;
+	rte_memset(virtio_hdr, 0, sizeof(*virtio_hdr));
+	virtio_enqueue_offload(m, virtio_hdr);
 	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
 	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
 
@@ -459,7 +461,6 @@ static inline int __attribute__((always_inline))
 copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m,
 			    struct buf_vector *buf_vec, uint16_t num_buffers)
 {
-	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
 	uint32_t vec_idx = 0;
 	uint64_t desc_addr;
 	uint32_t mbuf_offset, mbuf_avail;
@@ -480,7 +481,6 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m,
 	hdr_phys_addr = buf_vec[vec_idx].buf_addr;
 	rte_prefetch0((void *)(uintptr_t)hdr_addr);
 
-	virtio_hdr.num_buffers = num_buffers;
 	LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
 		dev->vid, num_buffers);
 
@@ -512,8 +512,12 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m,
 		}
 
 		if (hdr_addr) {
-			virtio_enqueue_offload(hdr_mbuf, &virtio_hdr.hdr);
-			copy_virtio_net_hdr(dev, hdr_addr, virtio_hdr);
+			struct virtio_net_hdr_mrg_rxbuf *hdr =
+			(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
+
+			rte_memset(&(hdr->hdr), 0, sizeof(hdr->hdr));
+			hdr->num_buffers = num_buffers;
+			virtio_enqueue_offload(hdr_mbuf, &(hdr->hdr));
 			vhost_log_write(dev, hdr_phys_addr, dev->vhost_hlen);
 			PRINT_PACKET(dev, (uintptr_t)hdr_addr,
 				     dev->vhost_hlen, 0);
-- 
2.7.4

^ permalink raw reply related

* [PATCH v2 3/4] app/test: add performance autotest for rte_memset
From: Zhiyong Yang @ 2016-12-27 10:04 UTC (permalink / raw)
  To: dev
  Cc: yuanhan.liu, thomas.monjalon, bruce.richardson,
	konstantin.ananyev, pablo.de.lara.guarch, Zhiyong Yang
In-Reply-To: <1482833098-38096-1-git-send-email-zhiyong.yang@intel.com>

This file implements the perf autotest for rte_memset. Running it
produces perf data comparing rte_memset with memset.
We can see that the perf of rte_memset is clearly better than that of
glibc memset, especially for small N bytes.
The first column shows the N size for memset & rte_memset.
The second column lists a set of numbers for rte_memset vs. memset perf
in cache.
The third column lists a set of numbers for rte_memset vs. memset perf
in memory.

The following data was collected on Haswell.

** rte_memset() - memset perf tests
        (C = compile-time constant) **
======== ======= ======== ======= ========
   Size memset in cache  memset in mem
(bytes)        (ticks)        (ticks)
------- -------------- ---------------
============= 32B aligned ================
      1       3 -    8      14 -  115
      3       4 -    8      19 -  125
      6       3 -    7      19 -  125
      8       3 -    6      19 -  124
     12       3 -    6      19 -  124
     15       3 -    6      19 -  125
     16       3 -    8      13 -  125
     32       3 -    7      19 -  133
     64       3 -    7      28 -  162
     65       6 -    8      41 -  182
    128       6 -   13      54 -  199
    192       8 -   13      77 -  273
    255       8 -   16     100 -  222
    512      17 -   14     187 -  247
    768      22 -   20     270 -  362
   1024      29 -   28     329 -  377
   2048      63 -   57     564 -  601
   4096     104 -  102     993 - 1025
   8192     200 -  211    1831 - 2270
------ -------------- -------------- ------
C     6       2 -    2      19 -   19
C    64       2 -    6      28 -   33
C   128       3 -   12      54 -   59
C   192       5 -   29      77 -   83
C   256       6 -   35     100 -  105
C   512      12 -   60     188 -  195
C   768      18 -   20     271 -  362
C  1024      24 -   29     329 -  377

Signed-off-by: Zhiyong Yang <zhiyong.yang@intel.com>
---

Change in V2:

Add perf comparison data between rte_memset and memset on haswell.

 app/test/Makefile           |   1 +
 app/test/test_memset_perf.c | 348 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 349 insertions(+)
 create mode 100644 app/test/test_memset_perf.c

diff --git a/app/test/Makefile b/app/test/Makefile
index 82da3f3..1c3e7f1 100644
--- a/app/test/Makefile
+++ b/app/test/Makefile
@@ -124,6 +124,7 @@ SRCS-y += test_memcpy.c
 SRCS-y += test_memcpy_perf.c
 
 SRCS-y += test_memset.c
+SRCS-y += test_memset_perf.c
 
 SRCS-$(CONFIG_RTE_LIBRTE_HASH) += test_hash.c
 SRCS-$(CONFIG_RTE_LIBRTE_HASH) += test_thash.c
diff --git a/app/test/test_memset_perf.c b/app/test/test_memset_perf.c
new file mode 100644
index 0000000..83b15b5
--- /dev/null
+++ b/app/test/test_memset_perf.c
@@ -0,0 +1,348 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_random.h>
+#include <rte_malloc.h>
+#include <rte_memset.h>
+#include "test.h"
+
+/*
+ * Set this to the maximum buffer size you want to test. If it is 0, then the
+ * values in the buf_sizes[] array below will be used.
+ */
+#define TEST_VALUE_RANGE        0
+
+/* List of buffer sizes to test */
+#if TEST_VALUE_RANGE == 0
+static size_t buf_sizes[] = {
+	1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 31, 32, 33, 63, 64, 65,
+	70, 85, 96, 105, 115, 127, 128, 129, 161, 191, 192, 193, 255, 256,
+	257, 319, 320, 321, 383, 384, 385, 447, 448, 449, 511, 512, 513,
+	767, 768, 769, 1023, 1024, 1025, 1518, 1522, 1536, 1600, 2048, 2560,
+	3072, 3584, 4096, 4608, 5120, 5632, 6144, 6656, 7168, 7680, 8192
+};
+/* MUST be as large as largest packet size above */
+#define SMALL_BUFFER_SIZE 8192
+#else /* TEST_VALUE_RANGE != 0 */
+static size_t buf_sizes[TEST_VALUE_RANGE];
+#define SMALL_BUFFER_SIZE       TEST_VALUE_RANGE
+#endif /* TEST_VALUE_RANGE == 0 */
+
+/*
+ * Arrays of this size are used for measuring uncached memory accesses by
+ * picking a random location within the buffer. Make this smaller if there are
+ * memory allocation errors.
+ */
+#define LARGE_BUFFER_SIZE       (100 * 1024 * 1024)
+
+/* How many times to run timing loop for performance tests */
+#define TEST_ITERATIONS         1000000
+#define TEST_BATCH_SIZE         100
+
+/* Data is aligned on this many bytes (power of 2) */
+#ifdef RTE_MACHINE_CPUFLAG_AVX512F
+#define ALIGNMENT_UNIT          64
+#elif defined RTE_MACHINE_CPUFLAG_AVX2
+#define ALIGNMENT_UNIT          32
+#else /* RTE_MACHINE_CPUFLAG */
+#define ALIGNMENT_UNIT          16
+#endif /* RTE_MACHINE_CPUFLAG */
+
+/*
+ * Pointers used in performance tests. The two large buffers are for uncached
+ * access where random addresses within the buffer are used for each
+ * memset. The two small buffers are for cached access.
+ */
+static uint8_t *large_buf_read, *large_buf_write;
+static uint8_t *small_buf_read, *small_buf_write;
+
+/* Initialise data buffers. */
+static int
+init_buffers(void)
+{
+	unsigned int i;
+
+	large_buf_read = rte_malloc("memset", LARGE_BUFFER_SIZE
+					+ ALIGNMENT_UNIT, ALIGNMENT_UNIT);
+	if (large_buf_read == NULL)
+		goto error_large_buf_read;
+
+	large_buf_write = rte_malloc("memset", LARGE_BUFFER_SIZE
+					+ ALIGNMENT_UNIT, ALIGNMENT_UNIT);
+	if (large_buf_write == NULL)
+		goto error_large_buf_write;
+
+	small_buf_read = rte_malloc("memset", SMALL_BUFFER_SIZE
+					+ ALIGNMENT_UNIT, ALIGNMENT_UNIT);
+	if (small_buf_read == NULL)
+		goto error_small_buf_read;
+
+	small_buf_write = rte_malloc("memset", SMALL_BUFFER_SIZE
+					+ ALIGNMENT_UNIT, ALIGNMENT_UNIT);
+	if (small_buf_write == NULL)
+		goto error_small_buf_write;
+
+	for (i = 0; i < LARGE_BUFFER_SIZE; i++)
+		large_buf_read[i] = rte_rand();
+	for (i = 0; i < SMALL_BUFFER_SIZE; i++)
+		small_buf_read[i] = rte_rand();
+
+	return 0;
+
+error_small_buf_write:
+	rte_free(small_buf_read);
+error_small_buf_read:
+	rte_free(large_buf_write);
+error_large_buf_write:
+	rte_free(large_buf_read);
+error_large_buf_read:
+	printf("ERROR: not enough memory\n");
+	return -1;
+}
+
+/* Cleanup data buffers */
+static void
+free_buffers(void)
+{
+	rte_free(large_buf_read);
+	rte_free(large_buf_write);
+	rte_free(small_buf_read);
+	rte_free(small_buf_write);
+}
+
+/*
+ * Get a random offset into large array, with enough space needed to perform
+ * max memset size. Offset is aligned, uoffset is used for unalignment setting.
+ */
+static inline size_t
+get_rand_offset(size_t uoffset)
+{
+	return ((rte_rand() % (LARGE_BUFFER_SIZE - SMALL_BUFFER_SIZE)) &
+			~(ALIGNMENT_UNIT - 1)) + uoffset;
+}
+
+/* Fill in destination addresses. */
+static inline void
+fill_addr_arrays(size_t *dst_addr, int is_dst_cached, size_t dst_uoffset)
+{
+	unsigned int i;
+
+	for (i = 0; i < TEST_BATCH_SIZE; i++)
+		dst_addr[i] = (is_dst_cached) ? dst_uoffset :
+					get_rand_offset(dst_uoffset);
+}
+
+/*
+ * WORKAROUND: For some reason the first test doing an uncached write
+ * takes a very long time (~25 times longer than is expected). So we do
+ * it once without timing.
+ */
+static void
+do_uncached_write(uint8_t *dst, int is_dst_cached, size_t size)
+{
+	unsigned int i, j;
+	size_t dst_addrs[TEST_BATCH_SIZE];
+	int ch = rte_rand() & 0xff;
+
+	for (i = 0; i < (TEST_ITERATIONS / TEST_BATCH_SIZE); i++) {
+		fill_addr_arrays(dst_addrs, is_dst_cached, 0);
+		for (j = 0; j < TEST_BATCH_SIZE; j++)
+			rte_memset(dst+dst_addrs[j], ch, size);
+	}
+}
+
+/*
+ * Run a single memset performance test. This is a macro to ensure that if
+ * the "size" parameter is a constant it won't be converted to a variable.
+ */
+#define SINGLE_PERF_TEST(dst, is_dst_cached, dst_uoffset, size)             \
+do {                                                                        \
+	unsigned int iter, t;                                               \
+	size_t dst_addrs[TEST_BATCH_SIZE];                                  \
+	uint64_t start_time, total_time = 0;                                \
+	uint64_t total_time2 = 0;                                           \
+	int ch = rte_rand() & 0xff;                                         \
+									    \
+	for (iter = 0; iter < (TEST_ITERATIONS / TEST_BATCH_SIZE); iter++) {\
+	fill_addr_arrays(dst_addrs, is_dst_cached, dst_uoffset);            \
+	start_time = rte_rdtsc();                                           \
+	for (t = 0; t < TEST_BATCH_SIZE; t++)                               \
+		rte_memset(dst+dst_addrs[t], ch, size);                      \
+	total_time += rte_rdtsc() - start_time;                             \
+	}                                                                   \
+	for (iter = 0; iter < (TEST_ITERATIONS / TEST_BATCH_SIZE); iter++) {\
+	fill_addr_arrays(dst_addrs, is_dst_cached, dst_uoffset);            \
+	start_time = rte_rdtsc();                                           \
+	for (t = 0; t < TEST_BATCH_SIZE; t++)                               \
+		memset(dst+dst_addrs[t], ch, size);                         \
+	total_time2 += rte_rdtsc() - start_time;                            \
+	}                                                                   \
+	printf("%8.0f -",  (double)total_time / TEST_ITERATIONS);           \
+	printf("%5.0f",  (double)total_time2 / TEST_ITERATIONS);            \
+} while (0)
+
+/* Run aligned memset tests. */
+#define ALL_PERF_TESTS_FOR_SIZE(n)                                       \
+do {                                                                     \
+	if (__builtin_constant_p(n))                                     \
+		printf("\nC%6u", (unsigned int)n);                       \
+	else                                                             \
+		printf("\n%7u", (unsigned int)n);                        \
+	SINGLE_PERF_TEST(small_buf_write, 1, 0, n);                      \
+	SINGLE_PERF_TEST(large_buf_write, 0, 0, n);                      \
+} while (0)
+
+/* Run unaligned memset tests */
+#define ALL_PERF_TESTS_FOR_SIZE_UNALIGNED(n)                             \
+do {                                                                     \
+	if (__builtin_constant_p(n))                                     \
+		printf("\nC%6u", (unsigned int)n);                       \
+	else                                                             \
+		printf("\n%7u", (unsigned int)n);                        \
+	SINGLE_PERF_TEST(small_buf_write, 1, 1, n);                      \
+	SINGLE_PERF_TEST(large_buf_write, 0, 1, n);                      \
+} while (0)
+
+/* Run memset tests for constant length */
+#define ALL_PERF_TEST_FOR_CONSTANT                                       \
+do {                                                                     \
+	TEST_CONSTANT(6U); TEST_CONSTANT(64U); TEST_CONSTANT(128U);      \
+	TEST_CONSTANT(192U); TEST_CONSTANT(256U); TEST_CONSTANT(512U);   \
+	TEST_CONSTANT(768U); TEST_CONSTANT(1024U); TEST_CONSTANT(1536U); \
+} while (0)
+
+/* Run all memset tests for aligned constant cases */
+static inline void
+perf_test_constant_aligned(void)
+{
+#define TEST_CONSTANT ALL_PERF_TESTS_FOR_SIZE
+	ALL_PERF_TEST_FOR_CONSTANT;
+#undef TEST_CONSTANT
+}
+
+/* Run all memset tests for unaligned constant cases */
+static inline void
+perf_test_constant_unaligned(void)
+{
+#define TEST_CONSTANT ALL_PERF_TESTS_FOR_SIZE_UNALIGNED
+	ALL_PERF_TEST_FOR_CONSTANT;
+#undef TEST_CONSTANT
+}
+
+/* Run all memset tests for aligned variable cases */
+static inline void
+perf_test_variable_aligned(void)
+{
+	unsigned int n = sizeof(buf_sizes) / sizeof(buf_sizes[0]);
+	unsigned int i;
+
+	for (i = 0; i < n; i++)
+		ALL_PERF_TESTS_FOR_SIZE((size_t)buf_sizes[i]);
+}
+
+/* Run all memset tests for unaligned variable cases */
+static inline void
+perf_test_variable_unaligned(void)
+{
+	unsigned int n = sizeof(buf_sizes) / sizeof(buf_sizes[0]);
+	unsigned int i;
+
+	for (i = 0; i < n; i++)
+		ALL_PERF_TESTS_FOR_SIZE_UNALIGNED((size_t)buf_sizes[i]);
+}
+
+/* Run all memset tests */
+static int
+perf_test(void)
+{
+	int ret;
+
+	ret = init_buffers();
+	if (ret != 0)
+		return ret;
+
+#if TEST_VALUE_RANGE != 0
+	/* Set up buf_sizes array, if required */
+	unsigned int i;
+
+	for (i = 0; i < TEST_VALUE_RANGE; i++)
+		buf_sizes[i] = i;
+#endif
+
+	/* See function comment */
+	do_uncached_write(large_buf_write, 0, SMALL_BUFFER_SIZE);
+
+	printf("\n** rte_memset() - memset perf tests \t\n  \
+	(C = compile-time constant) **\n"
+		"======== ======= ======== ======= ========\n"
+		"   Size memset in cache  memset in mem\n"
+		"(bytes)        (ticks)        (ticks)\n"
+		"------- -------------- ---------------");
+
+	printf("\n============= %2dB aligned ================", ALIGNMENT_UNIT);
+	/* Do aligned tests where size is a variable */
+	perf_test_variable_aligned();
+	printf("\n------ -------------- -------------- ------");
+	/* Do aligned tests where size is a compile-time constant */
+	perf_test_constant_aligned();
+	printf("\n============= Unaligned ===================");
+	/* Do unaligned tests where size is a variable */
+	perf_test_variable_unaligned();
+	printf("\n------ -------------- -------------- ------");
+	/* Do unaligned tests where size is a compile-time constant */
+	perf_test_constant_unaligned();
+	printf("\n====== ============== ============== =======\n\n");
+
+	free_buffers();
+
+	return 0;
+}
+
+static int
+test_memset_perf(void)
+{
+	int ret;
+
+	ret = perf_test();
+	if (ret != 0)
+		return -1;
+	return 0;
+}
+
+REGISTER_TEST_COMMAND(memset_perf_autotest, test_memset_perf);
-- 
2.7.4

^ permalink raw reply related

* [PATCH v2 2/4] app/test: add functional autotest for rte_memset
From: Zhiyong Yang @ 2016-12-27 10:04 UTC (permalink / raw)
  To: dev
  Cc: yuanhan.liu, thomas.monjalon, bruce.richardson,
	konstantin.ananyev, pablo.de.lara.guarch, Zhiyong Yang
In-Reply-To: <1482833098-38096-1-git-send-email-zhiyong.yang@intel.com>

This file implements the functional autotest for rte_memset, which
validates that the new function rte_memset works correctly. The
implementation of test_memcpy.c is used as a reference.

Usage:
step 1: run ./x86_64-native-linuxapp-gcc/app/test
step 2: run the command memset_autotest at run time.

Signed-off-by: Zhiyong Yang <zhiyong.yang@intel.com>
---
 app/test/Makefile      |   2 +
 app/test/test_memset.c | 158 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 160 insertions(+)
 create mode 100644 app/test/test_memset.c

diff --git a/app/test/Makefile b/app/test/Makefile
index 5be023a..82da3f3 100644
--- a/app/test/Makefile
+++ b/app/test/Makefile
@@ -123,6 +123,8 @@ SRCS-y += test_logs.c
 SRCS-y += test_memcpy.c
 SRCS-y += test_memcpy_perf.c
 
+SRCS-y += test_memset.c
+
 SRCS-$(CONFIG_RTE_LIBRTE_HASH) += test_hash.c
 SRCS-$(CONFIG_RTE_LIBRTE_HASH) += test_thash.c
 SRCS-$(CONFIG_RTE_LIBRTE_HASH) += test_hash_perf.c
diff --git a/app/test/test_memset.c b/app/test/test_memset.c
new file mode 100644
index 0000000..c9020bf
--- /dev/null
+++ b/app/test/test_memset.c
@@ -0,0 +1,158 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <rte_common.h>
+#include <rte_random.h>
+#include <rte_memset.h>
+#include "test.h"
+
+/*
+ * Set this to the maximum buffer size you want to test. If it is 0, then the
+ * values in the buf_sizes[] array below will be used.
+ */
+#define TEST_VALUE_RANGE 0
+#define MAX_INT8 127
+#define MIN_INT8 -128
+/* List of buffer sizes to test */
+#if TEST_VALUE_RANGE == 0
+static size_t buf_sizes[] = {
+	0, 1, 7, 8, 9, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128, 129,
+	255, 256, 257, 320, 384, 511, 512, 513, 1023, 1024, 1025, 1518,
+	1522, 1600, 2048, 3072, 4096, 5120, 6144, 7168, 8192
+};
+/* MUST be as large as the largest buffer size above */
+#define BUFFER_SIZE       8192
+#else /* TEST_VALUE_RANGE != 0 */
+static size_t buf_sizes[TEST_VALUE_RANGE];
+#define BUFFER_SIZE       TEST_VALUE_RANGE
+#endif /* TEST_VALUE_RANGE == 0 */
+
+/* Data is aligned on this many bytes (power of 2) */
+#define ALIGNMENT_UNIT 32
+
+/*
+ * Create two buffers and initialize the reference buffer (ref_buff) with
+ * random values. The other buffer (dest_buff) is a copy of the reference.
+ * Set a region of dest_buff to ch with rte_memset and then check that every
+ * byte in that region was set. The bytes outside the set region are also
+ * compared against ref_buff to make sure they were not changed.
+ */
+static int
+test_single_memset(unsigned int off_dst, int ch, size_t size)
+{
+	unsigned int i;
+	uint8_t dest_buff[BUFFER_SIZE + ALIGNMENT_UNIT];
+	uint8_t ref_buff[BUFFER_SIZE + ALIGNMENT_UNIT];
+	void *ret;
+
+	/* Setup buffers */
+	for (i = 0; i < BUFFER_SIZE + ALIGNMENT_UNIT; i++) {
+		ref_buff[i] = (uint8_t) rte_rand();
+		dest_buff[i] = ref_buff[i];
+	}
+	/* Do the rte_memset */
+	ret = rte_memset(dest_buff + off_dst, ch, size);
+	if (ret != (dest_buff + off_dst)) {
+		printf("rte_memset() returned %p, not %p\n",
+		       ret, dest_buff + off_dst);
+	}
+	/* Check nothing before offset was affected */
+	for (i = 0; i < off_dst; i++) {
+		if (dest_buff[i] != ref_buff[i]) {
+			printf("rte_memset() failed for %u bytes (offsets=%u): \
+			       [modified before start of dst].\n",
+			       (unsigned int)size, off_dst);
+			return -1;
+		}
+	}
+	/* Check every byte was set */
+	for (i = 0; i < size; i++) {
+		if (dest_buff[i + off_dst] != (uint8_t)ch) {
+			printf("rte_memset() failed for %u bytes (offsets=%u): \
+			       [didn't memset byte %u].\n",
+			       (unsigned int)size, off_dst, i);
+			return -1;
+		}
+	}
+	/* Check nothing after memset was affected */
+	for (i = off_dst + size; i < BUFFER_SIZE + ALIGNMENT_UNIT; i++) {
+		if (dest_buff[i] != ref_buff[i]) {
+			printf("rte_memset() failed for %u bytes (offsets=%u): \
+			      [memset too many].\n",
+			       (unsigned int)size, off_dst);
+			return -1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Check functionality for various buffer sizes and data offsets/alignments.
+ */
+static int
+func_test(void)
+{
+	unsigned int off_dst, i;
+	unsigned int num_buf_sizes = sizeof(buf_sizes) / sizeof(buf_sizes[0]);
+	int ret;
+	int j;
+
+	for (j = MIN_INT8; j <= MAX_INT8; j++) {
+		for (off_dst = 0; off_dst < ALIGNMENT_UNIT; off_dst++) {
+			for (i = 0; i < num_buf_sizes; i++) {
+				ret = test_single_memset(off_dst, j,
+							 buf_sizes[i]);
+				if (ret != 0)
+					return -1;
+			}
+		}
+	}
+	return 0;
+}
+
+static int
+test_memset(void)
+{
+	int ret;
+
+	ret = func_test();
+	if (ret != 0)
+		return -1;
+	return 0;
+}
+
+REGISTER_TEST_COMMAND(memset_autotest, test_memset);
-- 
2.7.4

^ permalink raw reply related

* [PATCH v2 1/4] eal/common: introduce rte_memset on IA platform
From: Zhiyong Yang @ 2016-12-27 10:04 UTC (permalink / raw)
  To: dev
  Cc: yuanhan.liu, thomas.monjalon, bruce.richardson,
	konstantin.ananyev, pablo.de.lara.guarch, Zhiyong Yang
In-Reply-To: <1482833098-38096-1-git-send-email-zhiyong.yang@intel.com>

A performance drop has been observed in some cases when DPDK code calls the
glibc function memset. Please refer to the discussion about memset in
http://dpdk.org/ml/archives/dev/2016-October/048628.html
It is necessary to introduce a more efficient function to fix it.
One important thing about rte_memset is that we get clear control over
which instruction flow is used. This patch supports instruction sets
such as SSE & AVX (128 bits), AVX2 (256 bits) and AVX512 (512 bits).
rte_memset makes full use of vectorization and inline functions to improve
performance on IA. In addition, cache line and memory alignment are fully
taken into consideration.

Signed-off-by: Zhiyong Yang <zhiyong.yang@intel.com>
---

Changes in V2:

Rename rte_memset.h -> rte_memset_64.h and create a file rte_memset.h
for each arch.

 .../common/include/arch/arm/rte_memset.h           |  36 ++
 .../common/include/arch/ppc_64/rte_memset.h        |  36 ++
 .../common/include/arch/tile/rte_memset.h          |  36 ++
 .../common/include/arch/x86/rte_memset.h           |  51 +++
 .../common/include/arch/x86/rte_memset_64.h        | 378 +++++++++++++++++++++
 lib/librte_eal/common/include/generic/rte_memset.h |  52 +++
 6 files changed, 589 insertions(+)
 create mode 100644 lib/librte_eal/common/include/arch/arm/rte_memset.h
 create mode 100644 lib/librte_eal/common/include/arch/ppc_64/rte_memset.h
 create mode 100644 lib/librte_eal/common/include/arch/tile/rte_memset.h
 create mode 100644 lib/librte_eal/common/include/arch/x86/rte_memset.h
 create mode 100644 lib/librte_eal/common/include/arch/x86/rte_memset_64.h
 create mode 100644 lib/librte_eal/common/include/generic/rte_memset.h

diff --git a/lib/librte_eal/common/include/arch/arm/rte_memset.h b/lib/librte_eal/common/include/arch/arm/rte_memset.h
new file mode 100644
index 0000000..6945f6d
--- /dev/null
+++ b/lib/librte_eal/common/include/arch/arm/rte_memset.h
@@ -0,0 +1,36 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of RehiveTech nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_MEMSET_ARM_H_
+#define _RTE_MEMSET_ARM_H_
+
+#define rte_memset memset
+
+#endif /* _RTE_MEMSET_ARM_H_ */
diff --git a/lib/librte_eal/common/include/arch/ppc_64/rte_memset.h b/lib/librte_eal/common/include/arch/ppc_64/rte_memset.h
new file mode 100644
index 0000000..0d73f05
--- /dev/null
+++ b/lib/librte_eal/common/include/arch/ppc_64/rte_memset.h
@@ -0,0 +1,36 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of IBM Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _RTE_MEMSET_PPC_64_H_
+#define _RTE_MEMSET_PPC_64_H_
+
+#define rte_memset memset
+
+#endif /* _RTE_MEMSET_PPC_64_H_ */
diff --git a/lib/librte_eal/common/include/arch/tile/rte_memset.h b/lib/librte_eal/common/include/arch/tile/rte_memset.h
new file mode 100644
index 0000000..e8a1aa1
--- /dev/null
+++ b/lib/librte_eal/common/include/arch/tile/rte_memset.h
@@ -0,0 +1,36 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of EZchip Semiconductor nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _RTE_MEMSET_TILE_H_
+#define _RTE_MEMSET_TILE_H_
+
+#define rte_memset memset
+
+#endif /* _RTE_MEMSET_TILE_H_ */
diff --git a/lib/librte_eal/common/include/arch/x86/rte_memset.h b/lib/librte_eal/common/include/arch/x86/rte_memset.h
new file mode 100644
index 0000000..86e0812
--- /dev/null
+++ b/lib/librte_eal/common/include/arch/x86/rte_memset.h
@@ -0,0 +1,51 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_MEMSET_X86_H_
+#define _RTE_MEMSET_X86_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef RTE_ARCH_X86_64
+#include "rte_memset_64.h"
+#else
+#define rte_memset memset
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_MEMSET_X86_H_ */
diff --git a/lib/librte_eal/common/include/arch/x86/rte_memset_64.h b/lib/librte_eal/common/include/arch/x86/rte_memset_64.h
new file mode 100644
index 0000000..f25d344
--- /dev/null
+++ b/lib/librte_eal/common/include/arch/x86/rte_memset_64.h
@@ -0,0 +1,378 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_MEMSET_X86_64_H_
+#define _RTE_MEMSET_X86_64_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @file
+ *
+ * Functions for vectorised implementation of memset().
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <rte_vect.h>
+
+static inline void *
+rte_memset(void *dst, int a, size_t n) __attribute__((always_inline));
+
+static inline void
+rte_memset_less16(void *dst, int a, size_t n)
+{
+	uintptr_t dstu = (uintptr_t)dst;
+
+	if (n >= 8) {
+		uint16_t b = ((uint8_t)a | (((uint8_t)a) << 8));
+		uint16_t c = ((uint8_t)a | (((uint8_t)a) << 8));
+		uint32_t d = b | c << 16;
+		uint64_t e = d | ((uint64_t)d << 32);
+
+		*(uint64_t *)dstu = e;
+		*(uint64_t *)((uint8_t *)dstu + n - 8) = e;
+	} else {
+		if (n & 0x01) {
+			*(uint8_t *)dstu = (uint8_t)a;
+			dstu = (uintptr_t)((uint8_t *)dstu + 1);
+		}
+		if (n & 0x02) {
+			*(uint16_t *)dstu = (uint8_t)a | (((uint8_t)a) << 8);
+			dstu = (uintptr_t)((uint16_t *)dstu + 1);
+		}
+		if (n & 0x04) {
+			uint16_t b = ((uint8_t)a | (((uint8_t)a) << 8));
+
+			*(uint32_t *)dstu = (uint32_t)(b | (b << 16));
+			dstu = (uintptr_t)((uint32_t *)dstu + 1);
+		}
+	}
+}
+
+static inline void
+rte_memset16(uint8_t *dst, int8_t a)
+{
+	__m128i xmm0;
+
+	xmm0 = _mm_set1_epi8(a);
+	_mm_storeu_si128((__m128i *)dst, xmm0);
+}
+
+static inline void
+rte_memset_17to32(void *dst, int a, size_t n)
+{
+	rte_memset16((uint8_t *)dst, a);
+	rte_memset16((uint8_t *)dst - 16 + n, a);
+}
+
+#ifdef RTE_MACHINE_CPUFLAG_AVX512
+
+/**
+ * AVX512 implementation below
+ */
+
+static inline void
+rte_memset32(uint8_t *dst, int8_t a)
+{
+	__m256i ymm0;
+
+	ymm0 = _mm256_set1_epi8(a);
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
+}
+
+static inline void
+rte_memset64(uint8_t *dst, int8_t a)
+{
+	__m512i zmm0;
+
+	zmm0 = _mm512_set1_epi8(a);
+	_mm512_storeu_si512((void *)dst, zmm0);
+}
+
+static inline void
+rte_memset128blocks(uint8_t *dst, int8_t a, size_t n)
+{
+	__m512i zmm0;
+
+	zmm0 = _mm512_set1_epi8(a);
+	while (n >= 128) {
+		n -= 128;
+		_mm512_store_si512((void *)(dst + 0 * 64), zmm0);
+		_mm512_store_si512((void *)(dst + 1 * 64), zmm0);
+		dst = dst + 128;
+	}
+}
+
+static inline void *
+rte_memset(void *dst, int a, size_t n)
+{
+	void *ret = dst;
+	size_t dstofss;
+	size_t bits;
+
+	if (n < 16) {
+		rte_memset_less16(dst, a, n);
+		return ret;
+	} else if (n == 16) {
+		rte_memset16((uint8_t *)dst, a);
+		return ret;
+	}
+	if (n <= 32) {
+		rte_memset_17to32(dst, a, n);
+		return ret;
+	}
+	if (n <= 64) {
+		rte_memset32((uint8_t *)dst, a);
+		rte_memset32((uint8_t *)dst - 32 + n, a);
+		return ret;
+	}
+	if (n >= 256) {
+		dstofss = ((uintptr_t)dst & 0x3F);
+		if (dstofss > 0) {
+			dstofss = 64 - dstofss;
+			n -= dstofss;
+			rte_memset64((uint8_t *)dst, a);
+			dst = (uint8_t *)dst + dstofss;
+		}
+		rte_memset128blocks((uint8_t *)dst, a, n);
+		bits = n;
+		n = n & 127;
+		bits -= n;
+		dst = (uint8_t *)dst + bits;
+	}
+	if (n > 128) {
+		n -= 128;
+		rte_memset64((uint8_t *)dst, a);
+		rte_memset64((uint8_t *)dst + 64, a);
+		dst = (uint8_t *)dst + 128;
+	}
+	if (n > 64) {
+		rte_memset64((uint8_t *)dst, a);
+		rte_memset64((uint8_t *)dst - 64 + n, a);
+		return ret;
+	}
+	if (n > 0)
+		rte_memset64((uint8_t *)dst - 64 + n, a);
+	return ret;
+}
+
+#elif defined RTE_MACHINE_CPUFLAG_AVX2
+
+/**
+ *  AVX2 implementation below
+ */
+
+static inline void
+rte_memset32(uint8_t *dst, int8_t a)
+{
+	__m256i ymm0;
+
+	ymm0 = _mm256_set1_epi8(a);
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
+}
+
+static inline void
+rte_memset_33to64(void *dst, int a, size_t n)
+{
+	rte_memset32((uint8_t *)dst, a);
+	rte_memset32((uint8_t *)dst - 32 + n, a);
+}
+
+static inline void
+rte_memset64blocks(uint8_t *dst, int8_t a, size_t n)
+{
+	__m256i ymm0;
+
+	ymm0 = _mm256_set1_epi8(a);
+	while (n >= 64) {
+		n -= 64;
+		_mm256_store_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
+		_mm256_store_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm0);
+		dst = (uint8_t *)dst + 64;
+
+	}
+}
+
+static inline void *
+rte_memset(void *dst, int a, size_t n)
+{
+	void *ret = dst;
+	size_t dstofss;
+	size_t bits;
+
+	if (n < 16) {
+		rte_memset_less16(dst, a, n);
+		return ret;
+	} else if (n == 16) {
+		rte_memset16((uint8_t *)dst, a);
+		return ret;
+	}
+	if (n <= 32) {
+		rte_memset_17to32(dst, a, n);
+		return ret;
+	}
+	if (n <= 64) {
+		rte_memset_33to64(dst, a, n);
+		return ret;
+	}
+	if (n > 64) {
+		dstofss = (uintptr_t)dst & 0x1F;
+		if (dstofss > 0) {
+			dstofss = 32 - dstofss;
+			n -= dstofss;
+			rte_memset32((uint8_t *)dst, a);
+			dst = (uint8_t *)dst + dstofss;
+		}
+		rte_memset64blocks((uint8_t *)dst, a, n);
+		bits = n;
+		n = n & 63;
+		bits -= n;
+		dst = (uint8_t *)dst + bits;
+	}
+	if (n > 32) {
+		rte_memset_33to64(dst, a, n);
+		return ret;
+	}
+	if (n > 0)
+		rte_memset32((uint8_t *)dst - 32 + n, a);
+	return ret;
+}
+
+#else /* RTE_MACHINE_CPUFLAG */
+
+/**
+ * SSE && AVX implementation below
+ */
+
+static inline void
+rte_memset32(uint8_t *dst, int8_t a)
+{
+	__m128i xmm0 = _mm_set1_epi8(a);
+
+	_mm_storeu_si128((__m128i *)dst, xmm0);
+	_mm_storeu_si128((__m128i *)(dst + 16), xmm0);
+}
+
+static inline void
+rte_memset16blocks(uint8_t *dst, int8_t a, size_t n)
+{
+	__m128i xmm0 = _mm_set1_epi8(a);
+
+	while (n >= 16) {
+		n -= 16;
+		_mm_store_si128((__m128i *)(dst + 0 * 16), xmm0);
+		dst = (uint8_t *)dst + 16;
+	}
+}
+
+static inline void
+rte_memset64blocks(uint8_t *dst, int8_t a, size_t n)
+{
+	__m128i xmm0 = _mm_set1_epi8(a);
+
+	while (n >= 64) {
+		n -= 64;
+		_mm_store_si128((__m128i *)(dst + 0 * 16), xmm0);
+		_mm_store_si128((__m128i *)(dst + 1 * 16), xmm0);
+		_mm_store_si128((__m128i *)(dst + 2 * 16), xmm0);
+		_mm_store_si128((__m128i *)(dst + 3 * 16), xmm0);
+		dst = (uint8_t *)dst + 64;
+	}
+}
+
+static inline void *
+rte_memset(void *dst, int a, size_t n)
+{
+	void *ret = dst;
+	size_t dstofss;
+	size_t bits;
+
+	if (n < 16) {
+		rte_memset_less16(dst, a, n);
+		return ret;
+	} else if (n == 16) {
+		rte_memset16((uint8_t *)dst, a);
+		return ret;
+	}
+	if (n <= 32) {
+		rte_memset_17to32(dst, a, n);
+		return ret;
+	}
+	if (n <= 48) {
+		rte_memset32((uint8_t *)dst, a);
+		rte_memset16((uint8_t *)dst - 16 + n, a);
+		return ret;
+	}
+	if (n <= 64) {
+		rte_memset32((uint8_t *)dst, a);
+		rte_memset16((uint8_t *)dst + 32, a);
+		rte_memset16((uint8_t *)dst - 16 + n, a);
+		return ret;
+	}
+	if (n > 64) {
+		dstofss = (uintptr_t)dst & 0xF;
+		if (dstofss > 0) {
+			dstofss = 16 - dstofss;
+			n -= dstofss;
+			rte_memset16((uint8_t *)dst, a);
+			dst = (uint8_t *)dst + dstofss;
+		}
+		rte_memset64blocks((uint8_t *)dst, a, n);
+		bits = n;
+		n &= 63;
+		bits -= n;
+		dst = (uint8_t *)dst + bits;
+		rte_memset16blocks((uint8_t *)dst, a, n);
+		bits = n;
+		n &= 0xf;
+		bits -= n;
+		dst = (uint8_t *)dst + bits;
+		if (n > 0) {
+			rte_memset16((uint8_t *)dst - 16 + n, a);
+			return ret;
+		}
+	}
+	return ret;
+}
+
+#endif /* RTE_MACHINE_CPUFLAG */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_MEMSET_X86_64_H_ */
diff --git a/lib/librte_eal/common/include/generic/rte_memset.h b/lib/librte_eal/common/include/generic/rte_memset.h
new file mode 100644
index 0000000..b03a7d0
--- /dev/null
+++ b/lib/librte_eal/common/include/generic/rte_memset.h
@@ -0,0 +1,52 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_MEMSET_H_
+#define _RTE_MEMSET_H_
+
+/**
+ * @file
+ *
+ * Functions for vectorised implementation of memset().
+ */
+#ifdef _RTE_MEMSET_X86_64_H_
+
+static void *
+rte_memset(void *dst, int a, size_t n);
+
+#else
+
+#define rte_memset memset
+
+#endif
+#endif /* _RTE_MEMSET_H_ */
-- 
2.7.4

^ permalink raw reply related

* [PATCH v2 0/4] eal/common: introduce rte_memset and related test
From: Zhiyong Yang @ 2016-12-27 10:04 UTC (permalink / raw)
  To: dev
  Cc: yuanhan.liu, thomas.monjalon, bruce.richardson,
	konstantin.ananyev, pablo.de.lara.guarch
In-Reply-To: <1480926387-63838-2-git-send-email-zhiyong.yang@intel.com>

DPDK code suffers a significant performance drop in some cases when calling
the glibc function memset. Please refer to the discussion about memset in
http://dpdk.org/ml/archives/dev/2016-October/048628.html
It is necessary to introduce a more efficient function to fix it.
One important thing about rte_memset is that we get clear control over
which instruction flow is used.

This patchset introduces rte_memset to provide a more efficient
implementation, which brings an obvious perf improvement, especially
for small N bytes in most application scenarios.

Patch 1 implements rte_memset in the file rte_memset.h on IA platform.
The file supports three types of instruction sets including sse & avx
(128bits), avx2(256bits) and avx512(512bits). rte_memset makes use of
vectorization and inline functions to improve the perf on IA. In addition,
cache line and memory alignment are fully taken into consideration.

Patch 2 implements a functional autotest to validate whether the
function works correctly.

Patch 3 implements performance autotests separately for cache and memory.
We can see the perf of rte_memset is obviously better than glibc memset,
especially for small N bytes.

Patch 4: using rte_memset instead of copy_virtio_net_hdr brings a 3%~4%
performance improvement on IA platform in virtio/vhost non-mergeable
loopback testing.

Changes in V2:

Patch 1:
Rename rte_memset.h -> rte_memset_64.h and create a file rte_memset.h
for each arch.

Patch 3:
Add the perf comparison data between rte_memset and memset on haswell.

Patch 4:
Modify release_17_02.rst description.

Zhiyong Yang (4):
  eal/common: introduce rte_memset on IA platform
  app/test: add functional autotest for rte_memset
  app/test: add performance autotest for rte_memset
  lib/librte_vhost: improve vhost perf using rte_memset

 app/test/Makefile                                  |   3 +
 app/test/test_memset.c                             | 158 +++++++++
 app/test/test_memset_perf.c                        | 348 +++++++++++++++++++
 doc/guides/rel_notes/release_17_02.rst             |   7 +
 .../common/include/arch/arm/rte_memset.h           |  36 ++
 .../common/include/arch/ppc_64/rte_memset.h        |  36 ++
 .../common/include/arch/tile/rte_memset.h          |  36 ++
 .../common/include/arch/x86/rte_memset.h           |  51 +++
 .../common/include/arch/x86/rte_memset_64.h        | 378 +++++++++++++++++++++
 lib/librte_eal/common/include/generic/rte_memset.h |  52 +++
 lib/librte_vhost/virtio_net.c                      |  18 +-
 11 files changed, 1116 insertions(+), 7 deletions(-)
 create mode 100644 app/test/test_memset.c
 create mode 100644 app/test/test_memset_perf.c
 create mode 100644 lib/librte_eal/common/include/arch/arm/rte_memset.h
 create mode 100644 lib/librte_eal/common/include/arch/ppc_64/rte_memset.h
 create mode 100644 lib/librte_eal/common/include/arch/tile/rte_memset.h
 create mode 100644 lib/librte_eal/common/include/arch/x86/rte_memset.h
 create mode 100644 lib/librte_eal/common/include/arch/x86/rte_memset_64.h
 create mode 100644 lib/librte_eal/common/include/generic/rte_memset.h

-- 
2.7.4

^ permalink raw reply

* [PATCH v2 29/29] net/vmxnet3: use eal I/O device memory read/write API
From: Jerin Jacob @ 2016-12-27  9:49 UTC (permalink / raw)
  To: dev
  Cc: konstantin.ananyev, thomas.monjalon, bruce.richardson, jianbo.liu,
	viktorin, santosh.shukla, Yong Wang, Jerin Jacob
In-Reply-To: <1482832175-27199-1-git-send-email-jerin.jacob@caviumnetworks.com>

From: Santosh Shukla <santosh.shukla@caviumnetworks.com>

Replace the raw I/O device memory read/write access with eal
abstraction for I/O device memory read/write access to fix
portability issues across different architectures.

CC: Yong Wang <yongwang@vmware.com>
Signed-off-by: Santosh Shukla <santosh.shukla@caviumnetworks.com>
Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 drivers/net/vmxnet3/vmxnet3_ethdev.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_ethdev.h b/drivers/net/vmxnet3/vmxnet3_ethdev.h
index 7d3b11e..85c00e4 100644
--- a/drivers/net/vmxnet3/vmxnet3_ethdev.h
+++ b/drivers/net/vmxnet3/vmxnet3_ethdev.h
@@ -34,6 +34,8 @@
 #ifndef _VMXNET3_ETHDEV_H_
 #define _VMXNET3_ETHDEV_H_
 
+#include <rte_io.h>
+
 #define VMXNET3_MAX_MAC_ADDRS 1
 
 /* UPT feature to negotiate */
@@ -120,7 +122,7 @@ struct vmxnet3_hw {
 
 /* Config space read/writes */
 
-#define VMXNET3_PCI_REG(reg) (*((volatile uint32_t *)(reg)))
+#define VMXNET3_PCI_REG(reg) rte_read32(reg)
 
 static inline uint32_t
 vmxnet3_read_addr(volatile void *addr)
@@ -128,9 +130,7 @@ vmxnet3_read_addr(volatile void *addr)
 	return VMXNET3_PCI_REG(addr);
 }
 
-#define VMXNET3_PCI_REG_WRITE(reg, value) do { \
-	VMXNET3_PCI_REG((reg)) = (value); \
-} while(0)
+#define VMXNET3_PCI_REG_WRITE(reg, value) rte_write32((value), (reg))
 
 #define VMXNET3_PCI_BAR0_REG_ADDR(hw, reg) \
 	((volatile uint32_t *)((char *)(hw)->hw_addr0 + (reg)))
-- 
2.5.5

^ permalink raw reply related

* [PATCH v2 28/29] net/virtio: use eal I/O device memory read/write API
From: Jerin Jacob @ 2016-12-27  9:49 UTC (permalink / raw)
  To: dev
  Cc: konstantin.ananyev, thomas.monjalon, bruce.richardson, jianbo.liu,
	viktorin, santosh.shukla, Huawei Xie, Yuanhan Liu, Jerin Jacob
In-Reply-To: <1482832175-27199-1-git-send-email-jerin.jacob@caviumnetworks.com>

From: Santosh Shukla <santosh.shukla@caviumnetworks.com>

Replace the raw I/O device memory read/write access with eal
abstraction for I/O device memory read/write access to fix
portability issues across different architectures.

CC: Huawei Xie <huawei.xie@intel.com>
CC: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Signed-off-by: Santosh Shukla <santosh.shukla@caviumnetworks.com>
Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
Acked-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
---
 drivers/net/virtio/virtio_pci.c | 97 +++++++++++++----------------------------
 1 file changed, 31 insertions(+), 66 deletions(-)

diff --git a/drivers/net/virtio/virtio_pci.c b/drivers/net/virtio/virtio_pci.c
index 9b47165..7c1cb4c 100644
--- a/drivers/net/virtio/virtio_pci.c
+++ b/drivers/net/virtio/virtio_pci.c
@@ -37,6 +37,8 @@
  #include <fcntl.h>
 #endif
 
+#include <rte_io.h>
+
 #include "virtio_pci.h"
 #include "virtio_logs.h"
 #include "virtqueue.h"
@@ -316,48 +318,11 @@ static const struct virtio_pci_ops legacy_ops = {
 	.notify_queue	= legacy_notify_queue,
 };
 
-
-static inline uint8_t
-io_read8(uint8_t *addr)
-{
-	return *(volatile uint8_t *)addr;
-}
-
-static inline void
-io_write8(uint8_t val, uint8_t *addr)
-{
-	*(volatile uint8_t *)addr = val;
-}
-
-static inline uint16_t
-io_read16(uint16_t *addr)
-{
-	return *(volatile uint16_t *)addr;
-}
-
-static inline void
-io_write16(uint16_t val, uint16_t *addr)
-{
-	*(volatile uint16_t *)addr = val;
-}
-
-static inline uint32_t
-io_read32(uint32_t *addr)
-{
-	return *(volatile uint32_t *)addr;
-}
-
-static inline void
-io_write32(uint32_t val, uint32_t *addr)
-{
-	*(volatile uint32_t *)addr = val;
-}
-
 static inline void
 io_write64_twopart(uint64_t val, uint32_t *lo, uint32_t *hi)
 {
-	io_write32(val & ((1ULL << 32) - 1), lo);
-	io_write32(val >> 32,		     hi);
+	rte_write32(val & ((1ULL << 32) - 1), lo);
+	rte_write32(val >> 32,		     hi);
 }
 
 static void
@@ -369,13 +334,13 @@ modern_read_dev_config(struct virtio_hw *hw, size_t offset,
 	uint8_t old_gen, new_gen;
 
 	do {
-		old_gen = io_read8(&hw->common_cfg->config_generation);
+		old_gen = rte_read8(&hw->common_cfg->config_generation);
 
 		p = dst;
 		for (i = 0;  i < length; i++)
-			*p++ = io_read8((uint8_t *)hw->dev_cfg + offset + i);
+			*p++ = rte_read8((uint8_t *)hw->dev_cfg + offset + i);
 
-		new_gen = io_read8(&hw->common_cfg->config_generation);
+		new_gen = rte_read8(&hw->common_cfg->config_generation);
 	} while (old_gen != new_gen);
 }
 
@@ -387,7 +352,7 @@ modern_write_dev_config(struct virtio_hw *hw, size_t offset,
 	const uint8_t *p = src;
 
 	for (i = 0;  i < length; i++)
-		io_write8(*p++, (uint8_t *)hw->dev_cfg + offset + i);
+		rte_write8((*p++), (((uint8_t *)hw->dev_cfg) + offset + i));
 }
 
 static uint64_t
@@ -395,11 +360,11 @@ modern_get_features(struct virtio_hw *hw)
 {
 	uint32_t features_lo, features_hi;
 
-	io_write32(0, &hw->common_cfg->device_feature_select);
-	features_lo = io_read32(&hw->common_cfg->device_feature);
+	rte_write32(0, &hw->common_cfg->device_feature_select);
+	features_lo = rte_read32(&hw->common_cfg->device_feature);
 
-	io_write32(1, &hw->common_cfg->device_feature_select);
-	features_hi = io_read32(&hw->common_cfg->device_feature);
+	rte_write32(1, &hw->common_cfg->device_feature_select);
+	features_hi = rte_read32(&hw->common_cfg->device_feature);
 
 	return ((uint64_t)features_hi << 32) | features_lo;
 }
@@ -407,25 +372,25 @@ modern_get_features(struct virtio_hw *hw)
 static void
 modern_set_features(struct virtio_hw *hw, uint64_t features)
 {
-	io_write32(0, &hw->common_cfg->guest_feature_select);
-	io_write32(features & ((1ULL << 32) - 1),
-		&hw->common_cfg->guest_feature);
+	rte_write32(0, &hw->common_cfg->guest_feature_select);
+	rte_write32(features & ((1ULL << 32) - 1),
+		    &hw->common_cfg->guest_feature);
 
-	io_write32(1, &hw->common_cfg->guest_feature_select);
-	io_write32(features >> 32,
-		&hw->common_cfg->guest_feature);
+	rte_write32(1, &hw->common_cfg->guest_feature_select);
+	rte_write32(features >> 32,
+		    &hw->common_cfg->guest_feature);
 }
 
 static uint8_t
 modern_get_status(struct virtio_hw *hw)
 {
-	return io_read8(&hw->common_cfg->device_status);
+	return rte_read8(&hw->common_cfg->device_status);
 }
 
 static void
 modern_set_status(struct virtio_hw *hw, uint8_t status)
 {
-	io_write8(status, &hw->common_cfg->device_status);
+	rte_write8(status, &hw->common_cfg->device_status);
 }
 
 static void
@@ -438,21 +403,21 @@ modern_reset(struct virtio_hw *hw)
 static uint8_t
 modern_get_isr(struct virtio_hw *hw)
 {
-	return io_read8(hw->isr);
+	return rte_read8(hw->isr);
 }
 
 static uint16_t
 modern_set_config_irq(struct virtio_hw *hw, uint16_t vec)
 {
-	io_write16(vec, &hw->common_cfg->msix_config);
-	return io_read16(&hw->common_cfg->msix_config);
+	rte_write16(vec, &hw->common_cfg->msix_config);
+	return rte_read16(&hw->common_cfg->msix_config);
 }
 
 static uint16_t
 modern_get_queue_num(struct virtio_hw *hw, uint16_t queue_id)
 {
-	io_write16(queue_id, &hw->common_cfg->queue_select);
-	return io_read16(&hw->common_cfg->queue_size);
+	rte_write16(queue_id, &hw->common_cfg->queue_select);
+	return rte_read16(&hw->common_cfg->queue_size);
 }
 
 static int
@@ -470,7 +435,7 @@ modern_setup_queue(struct virtio_hw *hw, struct virtqueue *vq)
 							 ring[vq->vq_nentries]),
 				   VIRTIO_PCI_VRING_ALIGN);
 
-	io_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
 
 	io_write64_twopart(desc_addr, &hw->common_cfg->queue_desc_lo,
 				      &hw->common_cfg->queue_desc_hi);
@@ -479,11 +444,11 @@ modern_setup_queue(struct virtio_hw *hw, struct virtqueue *vq)
 	io_write64_twopart(used_addr, &hw->common_cfg->queue_used_lo,
 				      &hw->common_cfg->queue_used_hi);
 
-	notify_off = io_read16(&hw->common_cfg->queue_notify_off);
+	notify_off = rte_read16(&hw->common_cfg->queue_notify_off);
 	vq->notify_addr = (void *)((uint8_t *)hw->notify_base +
 				notify_off * hw->notify_off_multiplier);
 
-	io_write16(1, &hw->common_cfg->queue_enable);
+	rte_write16(1, &hw->common_cfg->queue_enable);
 
 	PMD_INIT_LOG(DEBUG, "queue %u addresses:", vq->vq_queue_index);
 	PMD_INIT_LOG(DEBUG, "\t desc_addr: %" PRIx64, desc_addr);
@@ -498,7 +463,7 @@ modern_setup_queue(struct virtio_hw *hw, struct virtqueue *vq)
 static void
 modern_del_queue(struct virtio_hw *hw, struct virtqueue *vq)
 {
-	io_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
 
 	io_write64_twopart(0, &hw->common_cfg->queue_desc_lo,
 				  &hw->common_cfg->queue_desc_hi);
@@ -507,13 +472,13 @@ modern_del_queue(struct virtio_hw *hw, struct virtqueue *vq)
 	io_write64_twopart(0, &hw->common_cfg->queue_used_lo,
 				  &hw->common_cfg->queue_used_hi);
 
-	io_write16(0, &hw->common_cfg->queue_enable);
+	rte_write16(0, &hw->common_cfg->queue_enable);
 }
 
 static void
 modern_notify_queue(struct virtio_hw *hw __rte_unused, struct virtqueue *vq)
 {
-	io_write16(1, vq->notify_addr);
+	rte_write16(1, vq->notify_addr);
 }
 
 static const struct virtio_pci_ops modern_ops = {
-- 
2.5.5

^ permalink raw reply related

* [PATCH v2 27/29] net/thunderx: use eal I/O device memory read/write API
From: Jerin Jacob @ 2016-12-27  9:49 UTC (permalink / raw)
  To: dev
  Cc: konstantin.ananyev, thomas.monjalon, bruce.richardson, jianbo.liu,
	viktorin, santosh.shukla, Jerin Jacob
In-Reply-To: <1482832175-27199-1-git-send-email-jerin.jacob@caviumnetworks.com>

Replace the raw I/O device memory read/write access with eal
abstraction for I/O device memory read/write access to fix portability
issues across different architectures.

Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 drivers/net/thunderx/base/nicvf_plat.h | 36 ++++------------------------------
 1 file changed, 4 insertions(+), 32 deletions(-)

diff --git a/drivers/net/thunderx/base/nicvf_plat.h b/drivers/net/thunderx/base/nicvf_plat.h
index 83c1844..3754e1b 100644
--- a/drivers/net/thunderx/base/nicvf_plat.h
+++ b/drivers/net/thunderx/base/nicvf_plat.h
@@ -69,31 +69,15 @@
 #include <rte_ether.h>
 #define NICVF_MAC_ADDR_SIZE ETHER_ADDR_LEN
 
+#include <rte_io.h>
+#define nicvf_addr_write(addr, val) rte_write64_relaxed((val), (void *)(addr))
+#define nicvf_addr_read(addr) rte_read64_relaxed((void *)(addr))
+
 /* ARM64 specific functions */
 #if defined(RTE_ARCH_ARM64)
 #define nicvf_prefetch_store_keep(_ptr) ({\
 	asm volatile("prfm pstl1keep, %a0\n" : : "p" (_ptr)); })
 
-static inline void __attribute__((always_inline))
-nicvf_addr_write(uintptr_t addr, uint64_t val)
-{
-	asm volatile(
-		    "str %x[val], [%x[addr]]"
-		    :
-		    : [val] "r" (val), [addr] "r" (addr));
-}
-
-static inline uint64_t __attribute__((always_inline))
-nicvf_addr_read(uintptr_t addr)
-{
-	uint64_t val;
-
-	asm volatile(
-		    "ldr %x[val], [%x[addr]]"
-		    : [val] "=r" (val)
-		    : [addr] "r" (addr));
-	return val;
-}
 
 #define NICVF_LOAD_PAIR(reg1, reg2, addr) ({		\
 			asm volatile(			\
@@ -106,18 +90,6 @@ nicvf_addr_read(uintptr_t addr)
 
 #define nicvf_prefetch_store_keep(_ptr) do {} while (0)
 
-static inline void __attribute__((always_inline))
-nicvf_addr_write(uintptr_t addr, uint64_t val)
-{
-	*(volatile uint64_t *)addr = val;
-}
-
-static inline uint64_t __attribute__((always_inline))
-nicvf_addr_read(uintptr_t addr)
-{
-	return	*(volatile uint64_t *)addr;
-}
-
 #define NICVF_LOAD_PAIR(reg1, reg2, addr)		\
 do {							\
 	reg1 = nicvf_addr_read((uintptr_t)addr);	\
-- 
2.5.5

^ permalink raw reply related

* [PATCH v2 26/29] net/qede: use eal I/O device memory read/write API
From: Jerin Jacob @ 2016-12-27  9:49 UTC (permalink / raw)
  To: dev
  Cc: konstantin.ananyev, thomas.monjalon, bruce.richardson, jianbo.liu,
	viktorin, santosh.shukla, Harish Patil, Rasesh Mody, Jerin Jacob
In-Reply-To: <1482832175-27199-1-git-send-email-jerin.jacob@caviumnetworks.com>

From: Santosh Shukla <santosh.shukla@caviumnetworks.com>

Replace the raw I/O device memory read/write access with eal
abstraction for I/O device memory read/write access to fix
portability issues across different architectures.

CC: Harish Patil <harish.patil@cavium.com>
CC: Rasesh Mody <rasesh.mody@cavium.com>
Signed-off-by: Santosh Shukla <santosh.shukla@caviumnetworks.com>
Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 drivers/net/qede/base/bcm_osal.h      | 20 +++++++++++---------
 drivers/net/qede/base/ecore_int_api.h | 28 +++++++++++++++++++++++-----
 drivers/net/qede/base/ecore_spq.c     |  3 ++-
 drivers/net/qede/qede_rxtx.c          |  2 +-
 4 files changed, 37 insertions(+), 16 deletions(-)

diff --git a/drivers/net/qede/base/bcm_osal.h b/drivers/net/qede/base/bcm_osal.h
index 0b446f2..33d43c6 100644
--- a/drivers/net/qede/base/bcm_osal.h
+++ b/drivers/net/qede/base/bcm_osal.h
@@ -18,6 +18,7 @@
 #include <rte_cycles.h>
 #include <rte_debug.h>
 #include <rte_ether.h>
+#include <rte_io.h>
 
 /* Forward declaration */
 struct ecore_dev;
@@ -113,18 +114,18 @@ void *osal_dma_alloc_coherent_aligned(struct ecore_dev *, dma_addr_t *,
 
 /* HW reads/writes */
 
-#define DIRECT_REG_RD(_dev, _reg_addr) \
-	(*((volatile u32 *) (_reg_addr)))
+#define DIRECT_REG_RD(_dev, _reg_addr) rte_read32(_reg_addr)
 
 #define REG_RD(_p_hwfn, _reg_offset) \
 	DIRECT_REG_RD(_p_hwfn,		\
 			((u8 *)(uintptr_t)(_p_hwfn->regview) + (_reg_offset)))
 
-#define DIRECT_REG_WR16(_reg_addr, _val) \
-	(*((volatile u16 *)(_reg_addr)) = _val)
+#define DIRECT_REG_WR16(_reg_addr, _val) rte_write16((_val), (_reg_addr))
 
-#define DIRECT_REG_WR(_dev, _reg_addr, _val) \
-	(*((volatile u32 *)(_reg_addr)) = _val)
+#define DIRECT_REG_WR(_dev, _reg_addr, _val) rte_write32((_val), (_reg_addr))
+
+#define DIRECT_REG_WR_RELAXED(_dev, _reg_addr, _val) \
+	rte_write32_relaxed((_val), (_reg_addr))
 
 #define REG_WR(_p_hwfn, _reg_offset, _val) \
 	DIRECT_REG_WR(NULL,  \
@@ -134,9 +135,10 @@ void *osal_dma_alloc_coherent_aligned(struct ecore_dev *, dma_addr_t *,
 	DIRECT_REG_WR16(((u8 *)(uintptr_t)(_p_hwfn->regview) + \
 			(_reg_offset)), (u16)_val)
 
-#define DOORBELL(_p_hwfn, _db_addr, _val) \
-	DIRECT_REG_WR(_p_hwfn, \
-	     ((u8 *)(uintptr_t)(_p_hwfn->doorbells) + (_db_addr)), (u32)_val)
+#define DOORBELL(_p_hwfn, _db_addr, _val)				\
+	DIRECT_REG_WR_RELAXED((_p_hwfn),				\
+			      ((u8 *)(uintptr_t)(_p_hwfn->doorbells) +	\
+			      (_db_addr)), (u32)_val)
 
 /* Mutexes */
 
diff --git a/drivers/net/qede/base/ecore_int_api.h b/drivers/net/qede/base/ecore_int_api.h
index fc873e7..a0d6a43 100644
--- a/drivers/net/qede/base/ecore_int_api.h
+++ b/drivers/net/qede/base/ecore_int_api.h
@@ -120,19 +120,37 @@ static OSAL_INLINE void __internal_ram_wr(void *p_hwfn,
 }
 
 #ifdef ECORE_CONFIG_DIRECT_HWFN
+static OSAL_INLINE void __internal_ram_wr_relaxed(struct ecore_hwfn *p_hwfn,
+						  void OSAL_IOMEM * addr,
+						  int size, u32 *data)
+#else
+static OSAL_INLINE void __internal_ram_wr_relaxed(void *p_hwfn,
+						  void OSAL_IOMEM * addr,
+						  int size, u32 *data)
+#endif
+{
+	unsigned int i;
+
+	for (i = 0; i < size / sizeof(*data); i++)
+		DIRECT_REG_WR_RELAXED(p_hwfn, &((u32 OSAL_IOMEM *)addr)[i],
+				      data[i]);
+}
+
+#ifdef ECORE_CONFIG_DIRECT_HWFN
 static OSAL_INLINE void internal_ram_wr(struct ecore_hwfn *p_hwfn,
-					void OSAL_IOMEM *addr,
-					int size, u32 *data)
+						void OSAL_IOMEM * addr,
+						int size, u32 *data)
 {
-	__internal_ram_wr(p_hwfn, addr, size, data);
+	__internal_ram_wr_relaxed(p_hwfn, addr, size, data);
 }
 #else
 static OSAL_INLINE void internal_ram_wr(void OSAL_IOMEM *addr,
-					int size, u32 *data)
+						int size, u32 *data)
 {
-	__internal_ram_wr(OSAL_NULL, addr, size, data);
+	__internal_ram_wr_relaxed(OSAL_NULL, addr, size, data);
 }
 #endif
+
 #endif
 
 struct ecore_hwfn;
diff --git a/drivers/net/qede/base/ecore_spq.c b/drivers/net/qede/base/ecore_spq.c
index 0d744dd..6e5ce5d 100644
--- a/drivers/net/qede/base/ecore_spq.c
+++ b/drivers/net/qede/base/ecore_spq.c
@@ -248,7 +248,8 @@ static enum _ecore_status_t ecore_spq_hw_post(struct ecore_hwfn *p_hwfn,
 	/* make sure the SPQE is updated before the doorbell */
 	OSAL_WMB(p_hwfn->p_dev);
 
-	DOORBELL(p_hwfn, DB_ADDR(p_spq->cid, DQ_DEMS_LEGACY), *(u32 *)&db);
+	DOORBELL(p_hwfn, DB_ADDR(p_spq->cid, DQ_DEMS_LEGACY),
+		 *(u32 *)&db);
 
 	/* make sure doorbell is rang */
 	OSAL_WMB(p_hwfn->p_dev);
diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c
index 2e181c8..e1e9956 100644
--- a/drivers/net/qede/qede_rxtx.c
+++ b/drivers/net/qede/qede_rxtx.c
@@ -1246,7 +1246,7 @@ qede_xmit_pkts(void *p_txq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 	txq->tx_db.data.bd_prod = bd_prod;
 	rte_wmb();
 	rte_compiler_barrier();
-	DIRECT_REG_WR(edev, txq->doorbell_addr, txq->tx_db.raw);
+	DIRECT_REG_WR_RELAXED(edev, txq->doorbell_addr, txq->tx_db.raw);
 	rte_wmb();
 
 	/* Check again for Tx completions */
-- 
2.5.5

^ permalink raw reply related

* [PATCH v2 25/29] net/nfp: use eal I/O device memory read/write API
From: Jerin Jacob @ 2016-12-27  9:49 UTC (permalink / raw)
  To: dev
  Cc: konstantin.ananyev, thomas.monjalon, bruce.richardson, jianbo.liu,
	viktorin, santosh.shukla, Alejandro Lucero, Jerin Jacob
In-Reply-To: <1482832175-27199-1-git-send-email-jerin.jacob@caviumnetworks.com>

From: Santosh Shukla <santosh.shukla@caviumnetworks.com>

Replace the raw I/O device memory read/write access with eal
abstraction for I/O device memory read/write access to fix
portability issues across different architectures.

CC: Alejandro Lucero <alejandro.lucero@netronome.com>
Signed-off-by: Santosh Shukla <santosh.shukla@caviumnetworks.com>
Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 drivers/net/nfp/nfp_net_pmd.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/net/nfp/nfp_net_pmd.h b/drivers/net/nfp/nfp_net_pmd.h
index c180972..f11b32e 100644
--- a/drivers/net/nfp/nfp_net_pmd.h
+++ b/drivers/net/nfp/nfp_net_pmd.h
@@ -121,25 +121,26 @@ struct nfp_net_adapter;
 #define NFD_CFG_MINOR_VERSION_of(x) (((x) >> 0) & 0xff)
 
 #include <linux/types.h>
+#include <rte_io.h>
 
 static inline uint8_t nn_readb(volatile const void *addr)
 {
-	return *((volatile const uint8_t *)(addr));
+	return rte_read8(addr);
 }
 
 static inline void nn_writeb(uint8_t val, volatile void *addr)
 {
-	*((volatile uint8_t *)(addr)) = val;
+	rte_write8(val, addr);
 }
 
 static inline uint32_t nn_readl(volatile const void *addr)
 {
-	return *((volatile const uint32_t *)(addr));
+	return rte_read32(addr);
 }
 
 static inline void nn_writel(uint32_t val, volatile void *addr)
 {
-	*((volatile uint32_t *)(addr)) = val;
+	rte_write32(val, addr);
 }
 
 static inline uint64_t nn_readq(volatile void *addr)
-- 
2.5.5

^ permalink raw reply related

* [PATCH v2 24/29] net/ixgbe: use eal I/O device memory read/write API
From: Jerin Jacob @ 2016-12-27  9:49 UTC (permalink / raw)
  To: dev
  Cc: konstantin.ananyev, thomas.monjalon, bruce.richardson, jianbo.liu,
	viktorin, santosh.shukla, Helin Zhang, Jerin Jacob
In-Reply-To: <1482832175-27199-1-git-send-email-jerin.jacob@caviumnetworks.com>

From: Santosh Shukla <santosh.shukla@caviumnetworks.com>

Replace the raw I/O device memory read/write access with eal
abstraction for I/O device memory read/write access to fix
portability issues across different architectures.

CC: Helin Zhang <helin.zhang@intel.com>
CC: Konstantin Ananyev <konstantin.ananyev@intel.com>
Signed-off-by: Santosh Shukla <santosh.shukla@caviumnetworks.com>
Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 drivers/net/ixgbe/base/ixgbe_osdep.h | 11 +++++++----
 drivers/net/ixgbe/ixgbe_rxtx.c       | 13 +++++++------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ixgbe/base/ixgbe_osdep.h b/drivers/net/ixgbe/base/ixgbe_osdep.h
index 77f0af5..9b874b8 100644
--- a/drivers/net/ixgbe/base/ixgbe_osdep.h
+++ b/drivers/net/ixgbe/base/ixgbe_osdep.h
@@ -44,6 +44,7 @@
 #include <rte_cycles.h>
 #include <rte_log.h>
 #include <rte_byteorder.h>
+#include <rte_io.h>
 
 #include "../ixgbe_logs.h"
 #include "../ixgbe_bypass_defines.h"
@@ -121,16 +122,18 @@ typedef int		bool;
 
 #define prefetch(x) rte_prefetch0(x)
 
-#define IXGBE_PCI_REG(reg) (*((volatile uint32_t *)(reg)))
+#define IXGBE_PCI_REG(reg) rte_read32(reg)
 
 static inline uint32_t ixgbe_read_addr(volatile void* addr)
 {
 	return rte_le_to_cpu_32(IXGBE_PCI_REG(addr));
 }
 
-#define IXGBE_PCI_REG_WRITE(reg, value) do { \
-	IXGBE_PCI_REG((reg)) = (rte_cpu_to_le_32(value)); \
-} while(0)
+#define IXGBE_PCI_REG_WRITE(reg, value)			\
+	rte_write32((rte_cpu_to_le_32(value)), reg)
+
+#define IXGBE_PCI_REG_WRITE_RELAXED(reg, value)		\
+	rte_write32_relaxed((rte_cpu_to_le_32(value)), reg)
 
 #define IXGBE_PCI_REG_ADDR(hw, reg) \
 	((volatile uint32_t *)((char *)(hw)->hw_addr + (reg)))
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index b2d9f45..81544bb 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -321,7 +321,7 @@ tx_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	/* update tail pointer */
 	rte_wmb();
-	IXGBE_PCI_REG_WRITE(txq->tdt_reg_addr, txq->tx_tail);
+	IXGBE_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, txq->tx_tail);
 
 	return nb_pkts;
 }
@@ -897,7 +897,7 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 	PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
 		   (unsigned) txq->port_id, (unsigned) txq->queue_id,
 		   (unsigned) tx_id, (unsigned) nb_tx);
-	IXGBE_PCI_REG_WRITE(txq->tdt_reg_addr, tx_id);
+	IXGBE_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, tx_id);
 	txq->tx_tail = tx_id;
 
 	return nb_tx;
@@ -1581,7 +1581,8 @@ rx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 
 		/* update tail pointer */
 		rte_wmb();
-		IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, cur_free_trigger);
+		IXGBE_PCI_REG_WRITE_RELAXED(rxq->rdt_reg_addr,
+					    cur_free_trigger);
 	}
 
 	if (rxq->rx_tail >= rxq->nb_rx_desc)
@@ -1985,8 +1986,8 @@ ixgbe_recv_pkts_lro(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts,
 
 			if (!ixgbe_rx_alloc_bufs(rxq, false)) {
 				rte_wmb();
-				IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr,
-						    next_rdt);
+				IXGBE_PCI_REG_WRITE_RELAXED(rxq->rdt_reg_addr,
+							    next_rdt);
 				nb_hold -= rxq->rx_free_thresh;
 			} else {
 				PMD_RX_LOG(DEBUG, "RX bulk alloc failed "
@@ -2157,7 +2158,7 @@ ixgbe_recv_pkts_lro(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts,
 			   rxq->port_id, rxq->queue_id, rx_id, nb_hold, nb_rx);
 
 		rte_wmb();
-		IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, prev_id);
+		IXGBE_PCI_REG_WRITE_RELAXED(rxq->rdt_reg_addr, prev_id);
 		nb_hold = 0;
 	}
 
-- 
2.5.5

^ permalink raw reply related

* [PATCH v2 23/29] net/i40e: use eal I/O device memory read/write API
From: Jerin Jacob @ 2016-12-27  9:49 UTC (permalink / raw)
  To: dev
  Cc: konstantin.ananyev, thomas.monjalon, bruce.richardson, jianbo.liu,
	viktorin, santosh.shukla, Helin Zhang, Jingjing Wu, Satha Rao,
	Jerin Jacob
In-Reply-To: <1482832175-27199-1-git-send-email-jerin.jacob@caviumnetworks.com>

From: Santosh Shukla <santosh.shukla@caviumnetworks.com>

Replace the raw I/O device memory read/write access with eal abstraction
for I/O device memory read/write access to fix portability issues across
different architectures.

CC: Helin Zhang <helin.zhang@intel.com>
CC: Jingjing Wu <jingjing.wu@intel.com>
Signed-off-by: Santosh Shukla <santosh.shukla@caviumnetworks.com>
Signed-off-by: Satha Rao <skoteshwar@caviumnetworks.com>
Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 drivers/net/i40e/base/i40e_osdep.h | 10 +++++++---
 drivers/net/i40e/i40e_rxtx.c       |  4 ++--
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/net/i40e/base/i40e_osdep.h b/drivers/net/i40e/base/i40e_osdep.h
index 38e7ba5..c57ecde 100644
--- a/drivers/net/i40e/base/i40e_osdep.h
+++ b/drivers/net/i40e/base/i40e_osdep.h
@@ -44,6 +44,7 @@
 #include <rte_cycles.h>
 #include <rte_spinlock.h>
 #include <rte_log.h>
+#include <rte_io.h>
 
 #include "../i40e_logs.h"
 
@@ -153,15 +154,18 @@ do {                                                            \
  * I40E_PRTQF_FD_MSK
  */
 
-#define I40E_PCI_REG(reg)         (*((volatile uint32_t *)(reg)))
+#define I40E_PCI_REG(reg)		rte_read32(reg)
 #define I40E_PCI_REG_ADDR(a, reg) \
 	((volatile uint32_t *)((char *)(a)->hw_addr + (reg)))
 static inline uint32_t i40e_read_addr(volatile void *addr)
 {
 	return rte_le_to_cpu_32(I40E_PCI_REG(addr));
 }
-#define I40E_PCI_REG_WRITE(reg, value) \
-	do { I40E_PCI_REG((reg)) = rte_cpu_to_le_32(value); } while (0)
+
+#define I40E_PCI_REG_WRITE(reg, value)		\
+	rte_write32((rte_cpu_to_le_32(value)), reg)
+#define I40E_PCI_REG_WRITE_RELAXED(reg, value)	\
+	rte_write32_relaxed((rte_cpu_to_le_32(value)), reg)
 
 #define I40E_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_GLGEN_STAT)
 #define I40EVF_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_VFGEN_RSTAT)
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 7ae7d9f..5c41a90 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -1228,7 +1228,7 @@ i40e_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 		   (unsigned) txq->port_id, (unsigned) txq->queue_id,
 		   (unsigned) tx_id, (unsigned) nb_tx);
 
-	I40E_PCI_REG_WRITE(txq->qtx_tail, tx_id);
+	I40E_PCI_REG_WRITE_RELAXED(txq->qtx_tail, tx_id);
 	txq->tx_tail = tx_id;
 
 	return nb_tx;
@@ -1380,7 +1380,7 @@ tx_xmit_pkts(struct i40e_tx_queue *txq,
 
 	/* Update the tx tail register */
 	rte_wmb();
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WRITE_RELAXED(txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
-- 
2.5.5

^ permalink raw reply related

* [PATCH v2 22/29] net/fm10k: use eal I/O device memory read/write API
From: Jerin Jacob @ 2016-12-27  9:49 UTC (permalink / raw)
  To: dev
  Cc: konstantin.ananyev, thomas.monjalon, bruce.richardson, jianbo.liu,
	viktorin, santosh.shukla, Jing Chen, Jerin Jacob
In-Reply-To: <1482832175-27199-1-git-send-email-jerin.jacob@caviumnetworks.com>

From: Santosh Shukla <santosh.shukla@caviumnetworks.com>

Replace the raw I/O device memory read/write access with eal
abstraction for I/O device memory read/write access to fix
portability issues across different architectures.

CC: Jing Chen <jing.d.chen@intel.com>
Signed-off-by: Santosh Shukla <santosh.shukla@caviumnetworks.com>
Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 drivers/net/fm10k/base/fm10k_osdep.h | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/net/fm10k/base/fm10k_osdep.h b/drivers/net/fm10k/base/fm10k_osdep.h
index a21daa2..f07b678 100644
--- a/drivers/net/fm10k/base/fm10k_osdep.h
+++ b/drivers/net/fm10k/base/fm10k_osdep.h
@@ -39,6 +39,8 @@ POSSIBILITY OF SUCH DAMAGE.
 #include <rte_atomic.h>
 #include <rte_byteorder.h>
 #include <rte_cycles.h>
+#include <rte_io.h>
+
 #include "../fm10k_logs.h"
 
 /* TODO: this does not look like it should be used... */
@@ -88,17 +90,16 @@ typedef int        bool;
 #endif
 
 /* offsets are WORD offsets, not BYTE offsets */
-#define FM10K_WRITE_REG(hw, reg, val)    \
-	((((volatile uint32_t *)(hw)->hw_addr)[(reg)]) = ((uint32_t)(val)))
-#define FM10K_READ_REG(hw, reg)          \
-	(((volatile uint32_t *)(hw)->hw_addr)[(reg)])
+#define FM10K_WRITE_REG(hw, reg, val)		\
+	rte_write32((val), ((hw)->hw_addr + (reg)))
+
+#define FM10K_READ_REG(hw, reg) rte_read32(((hw)->hw_addr + (reg)))
+
 #define FM10K_WRITE_FLUSH(a) FM10K_READ_REG(a, FM10K_CTRL)
 
-#define FM10K_PCI_REG(reg) (*((volatile uint32_t *)(reg)))
+#define FM10K_PCI_REG(reg) rte_read32(reg)
 
-#define FM10K_PCI_REG_WRITE(reg, value) do { \
-	FM10K_PCI_REG((reg)) = (value); \
-} while (0)
+#define FM10K_PCI_REG_WRITE(reg, value) rte_write32((value), (reg))
 
 /* not implemented */
 #define FM10K_READ_PCI_WORD(hw, reg)     0
-- 
2.5.5

^ permalink raw reply related

* [PATCH v2 21/29] net/enic: use eal I/O device memory read/write API
From: Jerin Jacob @ 2016-12-27  9:49 UTC (permalink / raw)
  To: dev
  Cc: konstantin.ananyev, thomas.monjalon, bruce.richardson, jianbo.liu,
	viktorin, santosh.shukla, John Daley, Nelson Escobar, Jerin Jacob
In-Reply-To: <1482832175-27199-1-git-send-email-jerin.jacob@caviumnetworks.com>

From: Santosh Shukla <santosh.shukla@caviumnetworks.com>

Replace the raw I/O device memory read/write access with eal
abstraction for I/O device memory read/write access to fix portability
issues across different architectures.

CC: John Daley <johndale@cisco.com>
CC: Nelson Escobar <neescoba@cisco.com>
Signed-off-by: Santosh Shukla <santosh.shukla@caviumnetworks.com>
Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 drivers/net/enic/enic_compat.h | 27 +++++++++++++++++++--------
 drivers/net/enic/enic_rxtx.c   |  9 +++++----
 2 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/drivers/net/enic/enic_compat.h b/drivers/net/enic/enic_compat.h
index 5dbd983..fc58bb4 100644
--- a/drivers/net/enic/enic_compat.h
+++ b/drivers/net/enic/enic_compat.h
@@ -41,6 +41,7 @@
 #include <rte_atomic.h>
 #include <rte_malloc.h>
 #include <rte_log.h>
+#include <rte_io.h>
 
 #define ENIC_PAGE_ALIGN 4096UL
 #define ENIC_ALIGN      ENIC_PAGE_ALIGN
@@ -95,42 +96,52 @@ typedef         unsigned long long  dma_addr_t;
 
 static inline uint32_t ioread32(volatile void *addr)
 {
-	return *(volatile uint32_t *)addr;
+	return rte_read32(addr);
 }
 
 static inline uint16_t ioread16(volatile void *addr)
 {
-	return *(volatile uint16_t *)addr;
+	return rte_read16(addr);
 }
 
 static inline uint8_t ioread8(volatile void *addr)
 {
-	return *(volatile uint8_t *)addr;
+	return rte_read8(addr);
 }
 
 static inline void iowrite32(uint32_t val, volatile void *addr)
 {
-	*(volatile uint32_t *)addr = val;
+	rte_write32(val, addr);
+}
+
+static inline void iowrite32_relaxed(uint32_t val, volatile void *addr)
+{
+	rte_write32_relaxed(val, addr);
 }
 
 static inline void iowrite16(uint16_t val, volatile void *addr)
 {
-	*(volatile uint16_t *)addr = val;
+	rte_write16(val, addr);
 }
 
 static inline void iowrite8(uint8_t val, volatile void *addr)
 {
-	*(volatile uint8_t *)addr = val;
+	rte_write8(val, addr);
 }
 
 static inline unsigned int readl(volatile void __iomem *addr)
 {
-	return *(volatile unsigned int *)addr;
+	return rte_read32(addr);
+}
+
+static inline unsigned int readl_relaxed(volatile void __iomem *addr)
+{
+	return rte_read32_relaxed(addr);
 }
 
 static inline void writel(unsigned int val, volatile void __iomem *addr)
 {
-	*(volatile unsigned int *)addr = val;
+	rte_write32(val, addr);
 }
 
 #define min_t(type, x, y) ({                    \
diff --git a/drivers/net/enic/enic_rxtx.c b/drivers/net/enic/enic_rxtx.c
index f762a26..382d1ab 100644
--- a/drivers/net/enic/enic_rxtx.c
+++ b/drivers/net/enic/enic_rxtx.c
@@ -380,10 +380,11 @@ enic_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 
 		rte_mb();
 		if (data_rq->in_use)
-			iowrite32(data_rq->posted_index,
-				  &data_rq->ctrl->posted_index);
+			iowrite32_relaxed(data_rq->posted_index,
+					  &data_rq->ctrl->posted_index);
 		rte_compiler_barrier();
-		iowrite32(sop_rq->posted_index, &sop_rq->ctrl->posted_index);
+		iowrite32_relaxed(sop_rq->posted_index,
+				  &sop_rq->ctrl->posted_index);
 	}
 
 
@@ -550,7 +551,7 @@ uint16_t enic_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 	}
  post:
 	rte_wmb();
-	iowrite32(head_idx, &wq->ctrl->posted_index);
+	iowrite32_relaxed(head_idx, &wq->ctrl->posted_index);
  done:
 	wq->ring.desc_avail = wq_desc_avail;
 	wq->head_idx = head_idx;
-- 
2.5.5

^ permalink raw reply related

* [PATCH v2 20/29] net/ena: use eal I/O device memory read/write API
From: Jerin Jacob @ 2016-12-27  9:49 UTC (permalink / raw)
  To: dev
  Cc: konstantin.ananyev, thomas.monjalon, bruce.richardson, jianbo.liu,
	viktorin, santosh.shukla, Jan Medala, Jakub Palider, Jerin Jacob
In-Reply-To: <1482832175-27199-1-git-send-email-jerin.jacob@caviumnetworks.com>

From: Santosh Shukla <santosh.shukla@caviumnetworks.com>

Replace raw I/O device memory read/write accesses with the EAL I/O
device memory read/write abstraction to fix portability issues across
different architectures.

CC: Jan Medala <jan@semihalf.com>
CC: Jakub Palider <jpa@semihalf.com>
Signed-off-by: Santosh Shukla <santosh.shukla@caviumnetworks.com>
Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
Acked-by: Jan Medala <jan@semihalf.com>
---
 drivers/net/ena/base/ena_eth_com.h   |  2 +-
 drivers/net/ena/base/ena_plat_dpdk.h | 11 +++++++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ena/base/ena_eth_com.h b/drivers/net/ena/base/ena_eth_com.h
index 71a880c..ee62685 100644
--- a/drivers/net/ena/base/ena_eth_com.h
+++ b/drivers/net/ena/base/ena_eth_com.h
@@ -118,7 +118,7 @@ static inline int ena_com_write_sq_doorbell(struct ena_com_io_sq *io_sq)
 	ena_trc_dbg("write submission queue doorbell for queue: %d tail: %d\n",
 		    io_sq->qid, tail);
 
-	ENA_REG_WRITE32(tail, io_sq->db_addr);
+	ENA_REG_WRITE32_RELAXED(tail, io_sq->db_addr);
 
 	return 0;
 }
diff --git a/drivers/net/ena/base/ena_plat_dpdk.h b/drivers/net/ena/base/ena_plat_dpdk.h
index 87c3bf1..09d540a 100644
--- a/drivers/net/ena/base/ena_plat_dpdk.h
+++ b/drivers/net/ena/base/ena_plat_dpdk.h
@@ -48,6 +48,7 @@
 #include <rte_malloc.h>
 #include <rte_memzone.h>
 #include <rte_spinlock.h>
+#include <rte_io.h>
 
 #include <sys/time.h>
 
@@ -226,15 +227,21 @@ typedef uint64_t dma_addr_t;
 
 static inline void writel(u32 value, volatile void  *addr)
 {
-	*(volatile u32 *)addr = value;
+	rte_write32(value, addr);
+}
+
+static inline void writel_relaxed(u32 value, volatile void  *addr)
+{
+	rte_write32_relaxed(value, addr);
 }
 
 static inline u32 readl(const volatile void *addr)
 {
-	return *(const volatile u32 *)addr;
+	return rte_read32(addr);
 }
 
 #define ENA_REG_WRITE32(value, reg) writel((value), (reg))
+#define ENA_REG_WRITE32_RELAXED(value, reg) writel_relaxed((value), (reg))
 #define ENA_REG_READ32(reg) readl((reg))
 
 #define ATOMIC32_INC(i32_ptr) rte_atomic32_inc(i32_ptr)
-- 
2.5.5

^ permalink raw reply related

* [PATCH v2 19/29] net/e1000: use eal I/O device memory read/write API
From: Jerin Jacob @ 2016-12-27  9:49 UTC (permalink / raw)
  To: dev
  Cc: konstantin.ananyev, thomas.monjalon, bruce.richardson, jianbo.liu,
	viktorin, santosh.shukla, Wenzhuo Lu, Jerin Jacob
In-Reply-To: <1482832175-27199-1-git-send-email-jerin.jacob@caviumnetworks.com>

From: Santosh Shukla <santosh.shukla@caviumnetworks.com>

Replace raw I/O device memory read/write accesses with the EAL I/O
device memory read/write abstraction to fix portability issues across
different architectures.

CC: Wenzhuo Lu <wenzhuo.lu@intel.com>
Signed-off-by: Santosh Shukla <santosh.shukla@caviumnetworks.com>
Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 drivers/net/e1000/base/e1000_osdep.h | 18 ++++++++++--------
 drivers/net/e1000/em_rxtx.c          |  2 +-
 drivers/net/e1000/igb_rxtx.c         |  2 +-
 3 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/net/e1000/base/e1000_osdep.h b/drivers/net/e1000/base/e1000_osdep.h
index 47a1948..b886804 100644
--- a/drivers/net/e1000/base/e1000_osdep.h
+++ b/drivers/net/e1000/base/e1000_osdep.h
@@ -44,6 +44,7 @@
 #include <rte_log.h>
 #include <rte_debug.h>
 #include <rte_byteorder.h>
+#include <rte_io.h>
 
 #include "../e1000_logs.h"
 
@@ -94,17 +95,18 @@ typedef int		bool;
 
 #define E1000_WRITE_FLUSH(a) E1000_READ_REG(a, E1000_STATUS)
 
-#define E1000_PCI_REG(reg) (*((volatile uint32_t *)(reg)))
+#define E1000_PCI_REG(reg)	rte_read32(reg)
 
-#define E1000_PCI_REG16(reg) (*((volatile uint16_t *)(reg)))
+#define E1000_PCI_REG16(reg)	rte_read16(reg)
 
-#define E1000_PCI_REG_WRITE(reg, value) do { \
-	E1000_PCI_REG((reg)) = (rte_cpu_to_le_32(value)); \
-} while (0)
+#define E1000_PCI_REG_WRITE(reg, value)			\
+	rte_write32((rte_cpu_to_le_32(value)), reg)
 
-#define E1000_PCI_REG_WRITE16(reg, value) do { \
-	E1000_PCI_REG16((reg)) = (rte_cpu_to_le_16(value)); \
-} while (0)
+#define E1000_PCI_REG_WRITE_RELAXED(reg, value)		\
+	rte_write32_relaxed((rte_cpu_to_le_32(value)), reg)
+
+#define E1000_PCI_REG_WRITE16(reg, value)		\
+	rte_write16((rte_cpu_to_le_16(value)), reg)
 
 #define E1000_PCI_REG_ADDR(hw, reg) \
 	((volatile uint32_t *)((char *)(hw)->hw_addr + (reg)))
diff --git a/drivers/net/e1000/em_rxtx.c b/drivers/net/e1000/em_rxtx.c
index 41f51c0..6ec38d4 100644
--- a/drivers/net/e1000/em_rxtx.c
+++ b/drivers/net/e1000/em_rxtx.c
@@ -610,7 +610,7 @@ eth_em_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 	PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
 		(unsigned) txq->port_id, (unsigned) txq->queue_id,
 		(unsigned) tx_id, (unsigned) nb_tx);
-	E1000_PCI_REG_WRITE(txq->tdt_reg_addr, tx_id);
+	E1000_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, tx_id);
 	txq->tx_tail = tx_id;
 
 	return nb_tx;
diff --git a/drivers/net/e1000/igb_rxtx.c b/drivers/net/e1000/igb_rxtx.c
index dbd37ac..61edbfb 100644
--- a/drivers/net/e1000/igb_rxtx.c
+++ b/drivers/net/e1000/igb_rxtx.c
@@ -605,7 +605,7 @@ eth_igb_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 	/*
 	 * Set the Transmit Descriptor Tail (TDT).
 	 */
-	E1000_PCI_REG_WRITE(txq->tdt_reg_addr, tx_id);
+	E1000_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, tx_id);
 	PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
 		   (unsigned) txq->port_id, (unsigned) txq->queue_id,
 		   (unsigned) tx_id, (unsigned) nb_tx);
-- 
2.5.5

^ permalink raw reply related

* [PATCH v2 18/29] net/cxgbe: use eal I/O device memory read/write API
From: Jerin Jacob @ 2016-12-27  9:49 UTC (permalink / raw)
  To: dev
  Cc: konstantin.ananyev, thomas.monjalon, bruce.richardson, jianbo.liu,
	viktorin, santosh.shukla, Rahul Lakkireddy, Jerin Jacob
In-Reply-To: <1482832175-27199-1-git-send-email-jerin.jacob@caviumnetworks.com>

From: Santosh Shukla <santosh.shukla@caviumnetworks.com>

Replace raw I/O device memory read/write accesses with the EAL I/O
device memory read/write abstraction to fix portability issues across
different architectures.

CC: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Signed-off-by: Santosh Shukla <santosh.shukla@caviumnetworks.com>
Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 drivers/net/cxgbe/base/adapter.h | 34 ++++++++++++++++++++++++++++------
 drivers/net/cxgbe/cxgbe_compat.h |  8 +++++++-
 drivers/net/cxgbe/sge.c          | 10 +++++-----
 3 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/drivers/net/cxgbe/base/adapter.h b/drivers/net/cxgbe/base/adapter.h
index 5e3bd50..beb1e3e 100644
--- a/drivers/net/cxgbe/base/adapter.h
+++ b/drivers/net/cxgbe/base/adapter.h
@@ -37,6 +37,7 @@
 #define __T4_ADAPTER_H__
 
 #include <rte_mbuf.h>
+#include <rte_io.h>
 
 #include "cxgbe_compat.h"
 #include "t4_regs_values.h"
@@ -324,7 +325,7 @@ struct adapter {
 	int use_unpacked_mode; /* unpacked rx mode state */
 };
 
-#define CXGBE_PCI_REG(reg) (*((volatile uint32_t *)(reg)))
+#define CXGBE_PCI_REG(reg) rte_read32(reg)
 
 static inline uint64_t cxgbe_read_addr64(volatile void *addr)
 {
@@ -350,16 +351,21 @@ static inline uint32_t cxgbe_read_addr(volatile void *addr)
 #define CXGBE_READ_REG64(adap, reg) \
 	cxgbe_read_addr64(CXGBE_PCI_REG_ADDR((adap), (reg)))
 
-#define CXGBE_PCI_REG_WRITE(reg, value) ({ \
-	CXGBE_PCI_REG((reg)) = (value); })
+#define CXGBE_PCI_REG_WRITE(reg, value) rte_write32((value), (reg))
+
+#define CXGBE_PCI_REG_WRITE_RELAXED(reg, value) \
+	rte_write32_relaxed((value), (reg))
 
 #define CXGBE_WRITE_REG(adap, reg, value) \
 	CXGBE_PCI_REG_WRITE(CXGBE_PCI_REG_ADDR((adap), (reg)), (value))
 
+#define CXGBE_WRITE_REG_RELAXED(adap, reg, value) \
+	CXGBE_PCI_REG_WRITE_RELAXED(CXGBE_PCI_REG_ADDR((adap), (reg)), (value))
+
 static inline uint64_t cxgbe_write_addr64(volatile void *addr, uint64_t val)
 {
-	CXGBE_PCI_REG(addr) = val;
-	CXGBE_PCI_REG(((volatile uint8_t *)(addr) + 4)) = (val >> 32);
+	CXGBE_PCI_REG_WRITE(addr, val);
+	CXGBE_PCI_REG_WRITE(((volatile uint8_t *)(addr) + 4), (val >> 32));
 	return val;
 }
 
@@ -383,7 +389,7 @@ static inline u32 t4_read_reg(struct adapter *adapter, u32 reg_addr)
 }
 
 /**
- * t4_write_reg - write a HW register
+ * t4_write_reg - write a HW register with barrier
  * @adapter: the adapter
  * @reg_addr: the register address
  * @val: the value to write
@@ -398,6 +404,22 @@ static inline void t4_write_reg(struct adapter *adapter, u32 reg_addr, u32 val)
 }
 
 /**
+ * t4_write_reg_relaxed - write a HW register with no barrier
+ * @adapter: the adapter
+ * @reg_addr: the register address
+ * @val: the value to write
+ *
+ * Write a 32-bit value into the given HW register.
+ */
+static inline void t4_write_reg_relaxed(struct adapter *adapter, u32 reg_addr,
+					u32 val)
+{
+	CXGBE_DEBUG_REG(adapter, "setting register 0x%x to 0x%x\n", reg_addr,
+			val);
+	CXGBE_WRITE_REG_RELAXED(adapter, reg_addr, val);
+}
+
+/**
  * t4_read_reg64 - read a 64-bit HW register
  * @adapter: the adapter
  * @reg_addr: the register address
diff --git a/drivers/net/cxgbe/cxgbe_compat.h b/drivers/net/cxgbe/cxgbe_compat.h
index e68f8f5..1551cbf 100644
--- a/drivers/net/cxgbe/cxgbe_compat.h
+++ b/drivers/net/cxgbe/cxgbe_compat.h
@@ -45,6 +45,7 @@
 #include <rte_cycles.h>
 #include <rte_spinlock.h>
 #include <rte_log.h>
+#include <rte_io.h>
 
 #define dev_printf(level, fmt, args...) \
 	RTE_LOG(level, PMD, "rte_cxgbe_pmd: " fmt, ## args)
@@ -254,7 +255,7 @@ static inline unsigned long ilog2(unsigned long n)
 
 static inline void writel(unsigned int val, volatile void __iomem *addr)
 {
-	*(volatile unsigned int *)addr = val;
+	rte_write32(val, addr);
 }
 
 static inline void writeq(u64 val, volatile void __iomem *addr)
@@ -263,4 +264,9 @@ static inline void writeq(u64 val, volatile void __iomem *addr)
 	writel(val >> 32, (void *)((uintptr_t)addr + 4));
 }
 
+static inline void writel_relaxed(unsigned int val, volatile void __iomem *addr)
+{
+	rte_write32_relaxed(val, addr);
+}
+
 #endif /* _CXGBE_COMPAT_H_ */
diff --git a/drivers/net/cxgbe/sge.c b/drivers/net/cxgbe/sge.c
index 736f08c..fc03a0c 100644
--- a/drivers/net/cxgbe/sge.c
+++ b/drivers/net/cxgbe/sge.c
@@ -338,12 +338,12 @@ static inline void ring_fl_db(struct adapter *adap, struct sge_fl *q)
 		 * mechanism.
 		 */
 		if (unlikely(!q->bar2_addr)) {
-			t4_write_reg(adap, MYPF_REG(A_SGE_PF_KDOORBELL),
-				     val | V_QID(q->cntxt_id));
+			t4_write_reg_relaxed(adap, MYPF_REG(A_SGE_PF_KDOORBELL),
+					     val | V_QID(q->cntxt_id));
 		} else {
-			writel(val | V_QID(q->bar2_qid),
-			       (void *)((uintptr_t)q->bar2_addr +
-			       SGE_UDB_KDOORBELL));
+			writel_relaxed(val | V_QID(q->bar2_qid),
+				       (void *)((uintptr_t)q->bar2_addr +
+				       SGE_UDB_KDOORBELL));
 
 			/*
 			 * This Write memory Barrier will force the write to
-- 
2.5.5

^ permalink raw reply related

* [PATCH v2 17/29] net/bnx2x: use eal I/O device memory read/write API
From: Jerin Jacob @ 2016-12-27  9:49 UTC (permalink / raw)
  To: dev
  Cc: konstantin.ananyev, thomas.monjalon, bruce.richardson, jianbo.liu,
	viktorin, santosh.shukla, Harish Patil, Rasesh Mody, Jerin Jacob
In-Reply-To: <1482832175-27199-1-git-send-email-jerin.jacob@caviumnetworks.com>

From: Santosh Shukla <santosh.shukla@caviumnetworks.com>

Replace raw I/O device memory read/write accesses with the EAL I/O
device memory read/write abstraction to fix portability issues across
different architectures.

CC: Harish Patil <harish.patil@cavium.com>
CC: Rasesh Mody <rasesh.mody@cavium.com>
Signed-off-by: Santosh Shukla <santosh.shukla@caviumnetworks.com>
Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 drivers/net/bnx2x/bnx2x.h | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/drivers/net/bnx2x/bnx2x.h b/drivers/net/bnx2x/bnx2x.h
index 5cefea4..59064d8 100644
--- a/drivers/net/bnx2x/bnx2x.h
+++ b/drivers/net/bnx2x/bnx2x.h
@@ -18,6 +18,7 @@
 
 #include <rte_byteorder.h>
 #include <rte_spinlock.h>
+#include <rte_io.h>
 
 #if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
 #ifndef __LITTLE_ENDIAN
@@ -1419,8 +1420,7 @@ bnx2x_reg_write8(struct bnx2x_softc *sc, size_t offset, uint8_t val)
 {
 	PMD_DEBUG_PERIODIC_LOG(DEBUG, "offset=0x%08lx val=0x%02x",
 			       (unsigned long)offset, val);
-	*((volatile uint8_t*)
-	  ((uintptr_t)sc->bar[BAR0].base_addr + offset)) = val;
+	rte_write8(val, ((uint8_t *)sc->bar[BAR0].base_addr + offset));
 }
 
 static inline void
@@ -1433,8 +1433,8 @@ bnx2x_reg_write16(struct bnx2x_softc *sc, size_t offset, uint16_t val)
 #endif
 	PMD_DEBUG_PERIODIC_LOG(DEBUG, "offset=0x%08lx val=0x%04x",
 			       (unsigned long)offset, val);
-	*((volatile uint16_t*)
-	  ((uintptr_t)sc->bar[BAR0].base_addr + offset)) = val;
+	rte_write16(val, ((uint8_t *)sc->bar[BAR0].base_addr + offset));
+
 }
 
 static inline void
@@ -1448,8 +1448,7 @@ bnx2x_reg_write32(struct bnx2x_softc *sc, size_t offset, uint32_t val)
 
 	PMD_DEBUG_PERIODIC_LOG(DEBUG, "offset=0x%08lx val=0x%08x",
 			       (unsigned long)offset, val);
-	*((volatile uint32_t*)
-	  ((uintptr_t)sc->bar[BAR0].base_addr + offset)) = val;
+	rte_write32(val, ((uint8_t *)sc->bar[BAR0].base_addr + offset));
 }
 
 static inline uint8_t
@@ -1457,8 +1456,7 @@ bnx2x_reg_read8(struct bnx2x_softc *sc, size_t offset)
 {
 	uint8_t val;
 
-	val = (uint8_t)(*((volatile uint8_t*)
-			  ((uintptr_t)sc->bar[BAR0].base_addr + offset)));
+	val = rte_read8((uint8_t *)sc->bar[BAR0].base_addr + offset);
 	PMD_DEBUG_PERIODIC_LOG(DEBUG, "offset=0x%08lx val=0x%02x",
 			       (unsigned long)offset, val);
 
@@ -1476,8 +1474,7 @@ bnx2x_reg_read16(struct bnx2x_softc *sc, size_t offset)
 			    (unsigned long)offset);
 #endif
 
-	val = (uint16_t)(*((volatile uint16_t*)
-			   ((uintptr_t)sc->bar[BAR0].base_addr + offset)));
+	val = rte_read16(((uint8_t *)sc->bar[BAR0].base_addr + offset));
 	PMD_DEBUG_PERIODIC_LOG(DEBUG, "offset=0x%08lx val=0x%08x",
 			       (unsigned long)offset, val);
 
@@ -1495,8 +1492,7 @@ bnx2x_reg_read32(struct bnx2x_softc *sc, size_t offset)
 			    (unsigned long)offset);
 #endif
 
-	val = (uint32_t)(*((volatile uint32_t*)
-			   ((uintptr_t)sc->bar[BAR0].base_addr + offset)));
+	val = rte_read32(((uint8_t *)sc->bar[BAR0].base_addr + offset));
 	PMD_DEBUG_PERIODIC_LOG(DEBUG, "offset=0x%08lx val=0x%08x",
 			       (unsigned long)offset, val);
 
@@ -1560,11 +1556,9 @@ bnx2x_reg_read32(struct bnx2x_softc *sc, size_t offset)
 #define DPM_TRIGGER_TYPE 0x40
 
 /* Doorbell macro */
-#define BNX2X_DB_WRITE(db_bar, val) \
-	*((volatile uint32_t *)(db_bar)) = (val)
+#define BNX2X_DB_WRITE(db_bar, val) rte_write32_relaxed((val), (db_bar))
 
-#define BNX2X_DB_READ(db_bar) \
-	*((volatile uint32_t *)(db_bar))
+#define BNX2X_DB_READ(db_bar) rte_read32_relaxed(db_bar)
 
 #define DOORBELL_ADDR(sc, offset) \
 	(volatile uint32_t *)(((char *)(sc)->bar[BAR1].base_addr + (offset)))
-- 
2.5.5

^ permalink raw reply related

* [PATCH v2 16/29] net/bnxt: use eal I/O device memory read/write API
From: Jerin Jacob @ 2016-12-27  9:49 UTC (permalink / raw)
  To: dev
  Cc: konstantin.ananyev, thomas.monjalon, bruce.richardson, jianbo.liu,
	viktorin, santosh.shukla, Stephen Hurd, Ajit Khaparde,
	Jerin Jacob
In-Reply-To: <1482832175-27199-1-git-send-email-jerin.jacob@caviumnetworks.com>

From: Santosh Shukla <santosh.shukla@caviumnetworks.com>

Replace raw I/O device memory read/write accesses with the EAL I/O
device memory read/write abstraction to fix portability issues across
different architectures.

CC: Stephen Hurd <stephen.hurd@broadcom.com>
CC: Ajit Khaparde <ajit.khaparde@broadcom.com>
Signed-off-by: Santosh Shukla <santosh.shukla@caviumnetworks.com>
Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 drivers/net/bnxt/bnxt_cpr.h  | 13 ++++++++-----
 drivers/net/bnxt/bnxt_hwrm.c |  7 +++++--
 drivers/net/bnxt/bnxt_txr.h  |  6 +++---
 3 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/drivers/net/bnxt/bnxt_cpr.h b/drivers/net/bnxt/bnxt_cpr.h
index f9f2adb..83e5376 100644
--- a/drivers/net/bnxt/bnxt_cpr.h
+++ b/drivers/net/bnxt/bnxt_cpr.h
@@ -34,6 +34,8 @@
 #ifndef _BNXT_CPR_H_
 #define _BNXT_CPR_H_
 
+#include <rte_io.h>
+
 #define CMP_VALID(cmp, raw_cons, ring)					\
 	(!!(((struct cmpl_base *)(cmp))->info3_v & CMPL_BASE_V) ==	\
 	 !((raw_cons) & ((ring)->ring_size)))
@@ -50,13 +52,14 @@
 #define DB_CP_FLAGS		(DB_KEY_CP | DB_IDX_VALID | DB_IRQ_DIS)
 
 #define B_CP_DB_REARM(cpr, raw_cons)					\
-		(*(uint32_t *)((cpr)->cp_doorbell) = (DB_CP_REARM_FLAGS | \
-				RING_CMP(cpr->cp_ring_struct, raw_cons)))
+	rte_write32((DB_CP_REARM_FLAGS |				\
+		    RING_CMP(((cpr)->cp_ring_struct), raw_cons)),	\
+		    ((cpr)->cp_doorbell))
 
 #define B_CP_DIS_DB(cpr, raw_cons)					\
-		rte_smp_wmb();						\
-		(*(uint32_t *)((cpr)->cp_doorbell) = (DB_CP_FLAGS |	\
-				RING_CMP(cpr->cp_ring_struct, raw_cons)))
+	rte_write32((DB_CP_FLAGS |					\
+		    RING_CMP(((cpr)->cp_ring_struct), raw_cons)),	\
+		    ((cpr)->cp_doorbell))
 
 struct bnxt_ring;
 struct bnxt_cp_ring_info {
diff --git a/drivers/net/bnxt/bnxt_hwrm.c b/drivers/net/bnxt/bnxt_hwrm.c
index 07e7124..c182152 100644
--- a/drivers/net/bnxt/bnxt_hwrm.c
+++ b/drivers/net/bnxt/bnxt_hwrm.c
@@ -50,6 +50,8 @@
 #include "bnxt_vnic.h"
 #include "hsi_struct_def_dpdk.h"
 
+#include <rte_io.h>
+
 #define HWRM_CMD_TIMEOUT		2000
 
 /*
@@ -72,7 +74,7 @@ static int bnxt_hwrm_send_message_locked(struct bnxt *bp, void *msg,
 	/* Write request msg to hwrm channel */
 	for (i = 0; i < msg_len; i += 4) {
 		bar = (uint8_t *)bp->bar0 + i;
-		*(volatile uint32_t *)bar = *data;
+		rte_write32(*data, bar);
 		data++;
 	}
 
@@ -80,11 +82,12 @@ static int bnxt_hwrm_send_message_locked(struct bnxt *bp, void *msg,
 	for (; i < bp->max_req_len; i += 4) {
 		bar = (uint8_t *)bp->bar0 + i;
 		*(volatile uint32_t *)bar = 0;
+		rte_write32(0, bar);
 	}
 
 	/* Ring channel doorbell */
 	bar = (uint8_t *)bp->bar0 + 0x100;
-	*(volatile uint32_t *)bar = 1;
+	rte_write32(1, bar);
 
 	/* Poll for the valid bit */
 	for (i = 0; i < HWRM_CMD_TIMEOUT; i++) {
diff --git a/drivers/net/bnxt/bnxt_txr.h b/drivers/net/bnxt/bnxt_txr.h
index 4c16101..5b09711 100644
--- a/drivers/net/bnxt/bnxt_txr.h
+++ b/drivers/net/bnxt/bnxt_txr.h
@@ -34,12 +34,12 @@
 #ifndef _BNXT_TXR_H_
 #define _BNXT_TXR_H_
 
+#include <rte_io.h>
+
 #define MAX_TX_RINGS	16
 #define BNXT_TX_PUSH_THRESH 92
 
-#define B_TX_DB(db, prod)						\
-		rte_smp_wmb();						\
-		(*(uint32_t *)db = (DB_KEY_TX | prod))
+#define B_TX_DB(db, prod)	rte_write32((DB_KEY_TX | (prod)), db)
 
 struct bnxt_tx_ring_info {
 	uint16_t		tx_prod;
-- 
2.5.5

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox