Netdev List
 help / color / mirror / Atom feed
* [PULL] virtio: virtio 1.0 support, misc patches
From: Michael S. Tsirkin @ 2014-12-11 12:02 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: sergei.shtylyov, kvm, mst, netdev, linux-kernel, virtualization,
	pbonzini, ben, David Miller, thuth

The following changes since commit b2776bf7149bddd1f4161f14f79520f17fc1d71d:

  Linux 3.18 (2014-12-07 14:21:05 -0800)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git tags/for_linus

for you to fetch changes up to 803cd18f7b5e6c7ad6bee9571ae8f4450190ab58:

  virtio_ccw: finalize_features error handling (2014-12-09 16:32:41 +0200)

Note: some net drivers are affected by these patches.
David said he's fine with merging these patches through
my tree.
Rusty's on vacation, he acked using my tree for these, too.

----------------------------------------------------------------
virtio: virtio 1.0 support, misc patches

This adds a lot of infrastructure for virtio 1.0 support.
Notable missing pieces: virtio pci, virtio balloon (needs spec extension),
vhost scsi.

Plus, there are some minor fixes in a couple of places.

Cc: David Miller <davem@davemloft.net>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

----------------------------------------------------------------
Cornelia Huck (4):
      virtio: allow transports to get avail/used addresses
      KVM: s390: virtio-ccw revision 1 SET_VQ
      KVM: s390: enable virtio-ccw revision 1
      virtio_ccw: finalize_features error handling

Jason Wang (1):
      vhost: remove unnecessary forward declarations in vhost.h

Michael S. Tsirkin (64):
      virtio: add low-level APIs for feature bits
      virtio: use u32, not bitmap for features
      mic_virtio: robust feature array size calculation
      virtio: add support for 64 bit features.
      virtio: assert 32 bit features in transports
      virtio_ccw: add support for 64 bit features.
      virtio: add virtio 1.0 feature bit
      virtio: memory access APIs
      virtio_ring: switch to new memory access APIs
      virtio_config: endian conversion for v1.0
      virtio: set FEATURES_OK
      virtio: simplify feature bit handling
      virtio: add legacy feature table support
      virtio_net: v1.0 endianness
      virtio_blk: v1.0 support
      KVM: s390 allow virtio_ccw status writes to fail
      virtio_blk: make serial attribute static
      virtio_blk: fix race at module removal
      virtio_net: pass vi around
      virtio_net: get rid of virtio_net_hdr/skb_vnet_hdr
      virtio_net: stricter short buffer length checks
      virtio_net: bigger header when VERSION_1 is set
      virtio_net: disable mac write for virtio 1.0
      virtio_net: enable v1.0 support
      vhost: make features 64 bit
      vhost: add memory access wrappers
      vhost/net: force len for TX to host endian
      vhost: switch to __get/__put_user exclusively
      vhost: virtio 1.0 endian-ness support
      vhost/net: virtio 1.0 byte swap
      vhost/net: larger header for virtio 1.0
      vhost/net: enable virtio 1.0
      tun: move internal flag defines out of uapi
      tun: drop most type defines
      tun: add VNET_LE flag
      tun: TUN_VNET_LE support, fix sparse warnings for virtio headers
      macvtap: TUN_VNET_LE support
      virtio_scsi: v1.0 support
      virtio_scsi: move to uapi
      virtio_scsi: export to userspace
      vhost/scsi: partial virtio 1.0 support
      af_packet: virtio 1.0 stubs
      virtio_console: virtio 1.0 support
      virtio_balloon: add legacy_only flag
      virtio: make VIRTIO_F_VERSION_1 a transport bit
      virtio: drop VIRTIO_F_VERSION_1 from drivers
      virtio_console: fix sparse warnings
      virtio: add API to detect legacy devices
      virtio_ccw: legacy: don't negotiate rev 1/features
      virtio: allow finalize_features to fail
      virtio_ccw: rev 1 devices set VIRTIO_F_VERSION_1
      virtio_balloon: drop legacy_only driver flag
      virtio: drop legacy_only driver flag
      virtio_pci: add isr field
      virtio_pci: fix coding style for structs
      virtio_pci: free up vq->priv
      virtio_pci: use priv for vq notification
      virtio_pci: delete vqs indirectly
      virtio_pci: setup vqs indirectly
      virtio_pci: setup config vector indirectly
      virtio_pci: split out legacy device support
      virtio_pci: update file descriptions and copyright
      virtio_pci: rename virtio_pci -> virtio_pci_common
      virtio_ccw: future-proof finalize_features

Thomas Huth (1):
      KVM: s390: Set virtio-ccw transport revision

 drivers/vhost/vhost.h                  |  41 +-
 drivers/virtio/virtio_pci_common.h     | 136 ++++++
 include/linux/virtio.h                 |  12 +-
 include/linux/virtio_byteorder.h       |  59 +++
 include/linux/virtio_config.h          | 103 ++++-
 include/uapi/linux/if_tun.h            |  17 +-
 include/uapi/linux/virtio_blk.h        |  15 +-
 include/uapi/linux/virtio_config.h     |   9 +-
 include/uapi/linux/virtio_console.h    |   7 +-
 include/uapi/linux/virtio_net.h        |  15 +-
 include/uapi/linux/virtio_ring.h       |  45 +-
 include/{ => uapi}/linux/virtio_scsi.h | 106 ++---
 include/uapi/linux/virtio_types.h      |  46 ++
 tools/virtio/linux/virtio.h            |  22 +-
 tools/virtio/linux/virtio_config.h     |   2 +-
 drivers/block/virtio_blk.c             |  74 +--
 drivers/char/virtio_console.c          |  39 +-
 drivers/lguest/lguest_device.c         |  17 +-
 drivers/misc/mic/card/mic_virtio.c     |  14 +-
 drivers/net/macvtap.c                  |  68 ++-
 drivers/net/tun.c                      | 168 +++----
 drivers/net/virtio_net.c               | 161 +++----
 drivers/remoteproc/remoteproc_virtio.c |  11 +-
 drivers/s390/kvm/kvm_virtio.c          |  11 +-
 drivers/s390/kvm/virtio_ccw.c          | 203 +++++++--
 drivers/scsi/virtio_scsi.c             |  50 +-
 drivers/vhost/net.c                    |  31 +-
 drivers/vhost/scsi.c                   |  22 +-
 drivers/vhost/vhost.c                  |  93 ++--
 drivers/virtio/virtio.c                | 102 ++++-
 drivers/virtio/virtio_mmio.c           |  17 +-
 drivers/virtio/virtio_pci.c            | 802 ---------------------------------
 drivers/virtio/virtio_pci_common.c     | 464 +++++++++++++++++++
 drivers/virtio/virtio_pci_legacy.c     | 326 ++++++++++++++
 drivers/virtio/virtio_ring.c           | 109 +++--
 net/packet/af_packet.c                 |  35 +-
 tools/virtio/virtio_test.c             |   5 +-
 tools/virtio/vringh_test.c             |  16 +-
 drivers/virtio/Makefile                |   1 +
 include/uapi/linux/Kbuild              |   2 +
 40 files changed, 2048 insertions(+), 1428 deletions(-)
 create mode 100644 drivers/virtio/virtio_pci_common.h
 create mode 100644 include/linux/virtio_byteorder.h
 rename include/{ => uapi}/linux/virtio_scsi.h (73%)
 create mode 100644 include/uapi/linux/virtio_types.h
 delete mode 100644 drivers/virtio/virtio_pci.c
 create mode 100644 drivers/virtio/virtio_pci_common.c
 create mode 100644 drivers/virtio/virtio_pci_legacy.c

^ permalink raw reply

* [PATCH net v2] Fix race condition between vxlan_sock_add and vxlan_sock_release
From: Marcelo Ricardo Leitner @ 2014-12-11 12:02 UTC (permalink / raw)
  To: netdev

Currently, when trying to reuse a socket, vxlan_sock_add will grab
vn->sock_lock, locate a reusable socket, inc refcount and release
vn->sock_lock.

But vxlan_sock_release() will first decrement refcount, and then grab
that lock. refcnt operations are atomic but as currently we have
deferred works which hold vs->refcnt each, this might happen, leading to
a use after free (specially after vxlan_igmp_leave):

  CPU 1                            CPU 2

deferred work                    vxlan_sock_add
  ...                              ...
                                   spin_lock(&vn->sock_lock)
                                   vs = vxlan_find_sock();
  vxlan_sock_release
    dec vs->refcnt, reaches 0
    spin_lock(&vn->sock_lock)
                                   vxlan_sock_hold(vs), refcnt=1
                                   spin_unlock(&vn->sock_lock)
    hlist_del_rcu(&vs->hlist);
    vxlan_notify_del_rx_port(vs)
    spin_unlock(&vn->sock_lock)


So when we look for a reusable socket, we check if it wasn't freed
already before reusing it.

Signed-off-by: Marcelo Ricardo Leitner <mleitner@redhat.com>
Fixes: 7c47cedf43a8b3 ("vxlan: move IGMP join/leave to work queue")
---

Notes:
    v1->v2: addressed Dave's comment on better to use atomic_add_unless()
    than grabbing the lock earlier on vxlan_sock_release()
    
    Note that there are two search&reuse places, on vxlan_init() and
    vxlan_sock_add(), both handled.

 drivers/net/vxlan.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 31ecb03368c6dc3d581fdbd30b409b88190f3c71..49d9f229199851c48f5a9e6f1b282b42cedc2a41 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1985,9 +1985,8 @@ static int vxlan_init(struct net_device *dev)
 	spin_lock(&vn->sock_lock);
 	vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET,
 			     vxlan->dst_port);
-	if (vs) {
+	if (vs && atomic_add_unless(&vs->refcnt, 1, 0)) {
 		/* If we have a socket with same port already, reuse it */
-		atomic_inc(&vs->refcnt);
 		vxlan_vs_add_dev(vs, vxlan);
 	} else {
 		/* otherwise make new socket outside of RTNL */
@@ -2389,12 +2388,9 @@ struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
 
 	spin_lock(&vn->sock_lock);
 	vs = vxlan_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port);
-	if (vs) {
-		if (vs->rcv == rcv)
-			atomic_inc(&vs->refcnt);
-		else
+	if (vs && ((vs->rcv != rcv) ||
+		   !atomic_add_unless(&vs->refcnt, 1, 0)))
 			vs = ERR_PTR(-EBUSY);
-	}
 	spin_unlock(&vn->sock_lock);
 
 	if (!vs)
-- 
1.9.3

^ permalink raw reply related

* Re: [PATCH net-next RESEND] net: Do not call ndo_dflt_fdb_dump if ndo_fdb_dump is defined.
From: Jamal Hadi Salim @ 2014-12-11 11:49 UTC (permalink / raw)
  To: David Miller, h.sokolowski; +Cc: netdev, Roopa Prabhu, Vlad Yasevich
In-Reply-To: <20141210.233239.472984361665334371.davem@davemloft.net>

On 12/10/14 23:32, David Miller wrote:
> From: "Hubert Sokolowski" <h.sokolowski@wit.edu.pl>
> Date: Wed, 10 Dec 2014 19:37:01 -0000
>
>> This change restores the semantic that was present
>> before 5e6d243587990a588143b9da3974833649595587
>> "bridge: netlink dump interface at par with brctl"
>> on how ndo_dflt_fdb_dump is called.
>> This semantic is still used for add and del operations
>> so let's keep it consistent.
>> Driver can still call ndo_dflt_fdb_dump from inside
>> its own fdb_dump routine when needed.
>>
>> Signed-off-by: Hubert Sokolowski <h.sokolowski@wit.edu.pl>
>
> Jamal, please review.
>

It wont work. As pointed out by Roopa in
the other email dev->uc/mc will not get dumped with this
change. Vlad will be in a better position to comment.
CCing Vlad.

Hubert, immediate gratification never works on netdev.
I advised you to run the commit tests in at least
2 emails when you contacted me privately before posting.
It would have chewed about 5 minutes of your time.
I am sure it cost Roopa at least 1 hour. And if Dave
had sucked in your innocent looking patch we'd be playing
damage control after which is a lot more expensive.

cheers,
jamal

^ permalink raw reply

* [PATCH net-next v9 3/3] net: hisilicon: new hip04 ethernet driver
From: Ding Tianhong @ 2014-12-11 11:42 UTC (permalink / raw)
  To: zhangfei.gao, davem, linux, arnd, f.fainelli, sergei.shtylyov,
	mark.rutland, David.Laight, eric.dumazet, xuwei5
  Cc: linux-arm-kernel, netdev, devicetree
In-Reply-To: <1418298150-4944-1-git-send-email-dingtianhong@huawei.com>

From: Zhangfei Gao <zhangfei.gao@linaro.org>

Support Hisilicon hip04 ethernet driver, including 100M / 1000M controller.
The controller has no tx done interrupt, reclaim xmitted buffer in the poll.

Signed-off-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
---
 drivers/net/ethernet/hisilicon/Makefile    |   2 +-
 drivers/net/ethernet/hisilicon/hip04_eth.c | 876 +++++++++++++++++++++++++++++
 2 files changed, 877 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/hisilicon/hip04_eth.c

diff --git a/drivers/net/ethernet/hisilicon/Makefile b/drivers/net/ethernet/hisilicon/Makefile
index 40115a7..6c14540 100644
--- a/drivers/net/ethernet/hisilicon/Makefile
+++ b/drivers/net/ethernet/hisilicon/Makefile
@@ -3,4 +3,4 @@
 #
 
 obj-$(CONFIG_HIX5HD2_GMAC) += hix5hd2_gmac.o
-obj-$(CONFIG_HIP04_ETH) += hip04_mdio.o
+obj-$(CONFIG_HIP04_ETH) += hip04_mdio.o hip04_eth.o
diff --git a/drivers/net/ethernet/hisilicon/hip04_eth.c b/drivers/net/ethernet/hisilicon/hip04_eth.c
new file mode 100644
index 0000000..9d37b67
--- /dev/null
+++ b/drivers/net/ethernet/hisilicon/hip04_eth.c
@@ -0,0 +1,876 @@
+
+/* Copyright (c) 2014 Linaro Ltd.
+ * Copyright (c) 2014 Hisilicon Limited.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/etherdevice.h>
+#include <linux/platform_device.h>
+#include <linux/interrupt.h>
+#include <linux/of_address.h>
+#include <linux/phy.h>
+#include <linux/of_mdio.h>
+#include <linux/of_net.h>
+#include <linux/mfd/syscon.h>
+#include <linux/regmap.h>
+
+#define PPE_CFG_RX_ADDR			0x100
+#define PPE_CFG_POOL_GRP		0x300
+#define PPE_CFG_RX_BUF_SIZE		0x400
+#define PPE_CFG_RX_FIFO_SIZE		0x500
+#define PPE_CURR_BUF_CNT		0xa200
+
+#define GE_DUPLEX_TYPE			0x08
+#define GE_MAX_FRM_SIZE_REG		0x3c
+#define GE_PORT_MODE			0x40
+#define GE_PORT_EN			0x44
+#define GE_SHORT_RUNTS_THR_REG		0x50
+#define GE_TX_LOCAL_PAGE_REG		0x5c
+#define GE_TRANSMIT_CONTROL_REG		0x60
+#define GE_CF_CRC_STRIP_REG		0x1b0
+#define GE_MODE_CHANGE_REG		0x1b4
+#define GE_RECV_CONTROL_REG		0x1e0
+#define GE_STATION_MAC_ADDRESS		0x210
+#define PPE_CFG_CPU_ADD_ADDR		0x580
+#define PPE_CFG_MAX_FRAME_LEN_REG	0x408
+#define PPE_CFG_BUS_CTRL_REG		0x424
+#define PPE_CFG_RX_CTRL_REG		0x428
+#define PPE_CFG_RX_PKT_MODE_REG		0x438
+#define PPE_CFG_QOS_VMID_GEN		0x500
+#define PPE_CFG_RX_PKT_INT		0x538
+#define PPE_INTEN			0x600
+#define PPE_INTSTS			0x608
+#define PPE_RINT			0x604
+#define PPE_CFG_STS_MODE		0x700
+#define PPE_HIS_RX_PKT_CNT		0x804
+
+/* REG_INTERRUPT */
+#define RCV_INT				BIT(10)
+#define RCV_NOBUF			BIT(8)
+#define RCV_DROP			BIT(7)
+#define TX_DROP				BIT(6)
+#define DEF_INT_ERR			(RCV_NOBUF | RCV_DROP | TX_DROP)
+#define DEF_INT_MASK			(RCV_INT | DEF_INT_ERR)
+
+/* TX descriptor config */
+#define TX_FREE_MEM			BIT(0)
+#define TX_READ_ALLOC_L3		BIT(1)
+#define TX_FINISH_CACHE_INV		BIT(2)
+#define TX_CLEAR_WB			BIT(4)
+#define TX_L3_CHECKSUM			BIT(5)
+#define TX_LOOP_BACK			BIT(11)
+
+/* RX error */
+#define RX_PKT_DROP			BIT(0)
+#define RX_L2_ERR			BIT(1)
+#define RX_PKT_ERR			(RX_PKT_DROP | RX_L2_ERR)
+
+#define SGMII_SPEED_1000		0x08
+#define SGMII_SPEED_100			0x07
+#define SGMII_SPEED_10			0x06
+#define MII_SPEED_100			0x01
+#define MII_SPEED_10			0x00
+
+#define GE_DUPLEX_FULL			BIT(0)
+#define GE_DUPLEX_HALF			0x00
+#define GE_MODE_CHANGE_EN		BIT(0)
+
+#define GE_TX_AUTO_NEG			BIT(5)
+#define GE_TX_ADD_CRC			BIT(6)
+#define GE_TX_SHORT_PAD_THROUGH		BIT(7)
+
+#define GE_RX_STRIP_CRC			BIT(0)
+#define GE_RX_STRIP_PAD			BIT(3)
+#define GE_RX_PAD_EN			BIT(4)
+
+#define GE_AUTO_NEG_CTL			BIT(0)
+
+#define GE_RX_INT_THRESHOLD		BIT(6)
+#define GE_RX_TIMEOUT			0x04
+
+#define GE_RX_PORT_EN			BIT(1)
+#define GE_TX_PORT_EN			BIT(2)
+
+#define PPE_CFG_STS_RX_PKT_CNT_RC	BIT(12)
+
+#define PPE_CFG_RX_PKT_ALIGN		BIT(18)
+#define PPE_CFG_QOS_VMID_MODE		BIT(14)
+#define PPE_CFG_QOS_VMID_GRP_SHIFT	8
+
+#define PPE_CFG_RX_FIFO_FSFU		BIT(11)
+#define PPE_CFG_RX_DEPTH_SHIFT		16
+#define PPE_CFG_RX_START_SHIFT		0
+#define PPE_CFG_RX_CTRL_ALIGN_SHIFT	11
+
+#define PPE_CFG_BUS_LOCAL_REL		BIT(14)
+#define PPE_CFG_BUS_BIG_ENDIEN		BIT(0)
+
+#define RX_DESC_NUM			128
+#define TX_DESC_NUM			256
+#define TX_NEXT(N)			(((N) + 1) & (TX_DESC_NUM-1))
+#define RX_NEXT(N)			(((N) + 1) & (RX_DESC_NUM-1))
+
+#define GMAC_PPE_RX_PKT_MAX_LEN		379
+#define GMAC_MAX_PKT_LEN		1516
+#define GMAC_MIN_PKT_LEN		31
+#define RX_BUF_SIZE			1600
+#define RESET_TIMEOUT			1000
+#define TX_TIMEOUT			(6 * HZ)
+
+#define DRV_NAME			"hip04-ether"
+
+struct tx_desc {
+	u32 send_addr;
+	u32 send_size;
+	u32 next_addr;
+	u32 cfg;
+	u32 wb_addr;
+} __aligned(64);
+
+struct rx_desc {
+	u16 reserved_16;
+	u16 pkt_len;
+	u32 reserve1[3];
+	u32 pkt_err;
+	u32 reserve2[4];
+};
+
+struct hip04_priv {
+	void __iomem *base;
+	int phy_mode;
+	int chan;
+	unsigned int port;
+	unsigned int speed;
+	unsigned int duplex;
+	unsigned int reg_inten;
+
+	struct napi_struct napi;
+	struct net_device *ndev;
+
+	struct tx_desc *tx_desc;
+	dma_addr_t tx_desc_dma;
+	struct sk_buff *tx_skb[TX_DESC_NUM];
+	dma_addr_t tx_phys[TX_DESC_NUM];
+	unsigned int tx_head;
+	unsigned int tx_tail;
+	int tx_count;
+	unsigned long last_tx;
+
+	unsigned char *rx_buf[RX_DESC_NUM];
+	dma_addr_t rx_phys[RX_DESC_NUM];
+	unsigned int rx_head;
+	unsigned int rx_buf_size;
+
+	struct device_node *phy_node;
+	struct phy_device *phy;
+	struct regmap *map;
+	struct work_struct tx_timeout_task;
+
+	struct workqueue_struct *wq;
+	struct delayed_work tx_clean_task;
+};
+
+static void hip04_config_port(struct net_device *ndev, u32 speed, u32 duplex)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	u32 val;
+
+	priv->speed = speed;
+	priv->duplex = duplex;
+
+	switch (priv->phy_mode) {
+	case PHY_INTERFACE_MODE_SGMII:
+		if (speed == SPEED_1000)
+			val = SGMII_SPEED_1000;
+		else if (speed == SPEED_100)
+			val = SGMII_SPEED_100;
+		else
+			val = SGMII_SPEED_10;
+		break;
+	case PHY_INTERFACE_MODE_MII:
+		if (speed == SPEED_100)
+			val = MII_SPEED_100;
+		else
+			val = MII_SPEED_10;
+		break;
+	default:
+		netdev_warn(ndev, "not supported mode\n");
+		val = MII_SPEED_10;
+		break;
+	}
+	writel_relaxed(val, priv->base + GE_PORT_MODE);
+
+	val = duplex ? GE_DUPLEX_FULL : GE_DUPLEX_HALF;
+	writel_relaxed(val, priv->base + GE_DUPLEX_TYPE);
+
+	val = GE_MODE_CHANGE_EN;
+	writel_relaxed(val, priv->base + GE_MODE_CHANGE_REG);
+}
+
+static void hip04_reset_ppe(struct hip04_priv *priv)
+{
+	u32 val, tmp, timeout = 0;
+
+	do {
+		regmap_read(priv->map, priv->port * 4 + PPE_CURR_BUF_CNT, &val);
+		regmap_read(priv->map, priv->port * 4 + PPE_CFG_RX_ADDR, &tmp);
+		if (timeout++ > RESET_TIMEOUT)
+			break;
+	} while (val & 0xfff);
+}
+
+static void hip04_config_fifo(struct hip04_priv *priv)
+{
+	u32 val;
+
+	val = readl_relaxed(priv->base + PPE_CFG_STS_MODE);
+	val |= PPE_CFG_STS_RX_PKT_CNT_RC;
+	writel_relaxed(val, priv->base + PPE_CFG_STS_MODE);
+
+	val = BIT(priv->port);
+	regmap_write(priv->map, priv->port * 4 + PPE_CFG_POOL_GRP, val);
+
+	val = priv->port << PPE_CFG_QOS_VMID_GRP_SHIFT;
+	val |= PPE_CFG_QOS_VMID_MODE;
+	writel_relaxed(val, priv->base + PPE_CFG_QOS_VMID_GEN);
+
+	val = RX_BUF_SIZE;
+	regmap_write(priv->map, priv->port * 4 + PPE_CFG_RX_BUF_SIZE, val);
+
+	val = RX_DESC_NUM << PPE_CFG_RX_DEPTH_SHIFT;
+	val |= PPE_CFG_RX_FIFO_FSFU;
+	val |= priv->chan << PPE_CFG_RX_START_SHIFT;
+	regmap_write(priv->map, priv->port * 4 + PPE_CFG_RX_FIFO_SIZE, val);
+
+	val = NET_IP_ALIGN << PPE_CFG_RX_CTRL_ALIGN_SHIFT;
+	writel_relaxed(val, priv->base + PPE_CFG_RX_CTRL_REG);
+
+	val = PPE_CFG_RX_PKT_ALIGN;
+	writel_relaxed(val, priv->base + PPE_CFG_RX_PKT_MODE_REG);
+
+	val = PPE_CFG_BUS_LOCAL_REL | PPE_CFG_BUS_BIG_ENDIEN;
+	writel_relaxed(val, priv->base + PPE_CFG_BUS_CTRL_REG);
+
+	val = GMAC_PPE_RX_PKT_MAX_LEN;
+	writel_relaxed(val, priv->base + PPE_CFG_MAX_FRAME_LEN_REG);
+
+	val = GMAC_MAX_PKT_LEN;
+	writel_relaxed(val, priv->base + GE_MAX_FRM_SIZE_REG);
+
+	val = GMAC_MIN_PKT_LEN;
+	writel_relaxed(val, priv->base + GE_SHORT_RUNTS_THR_REG);
+
+	val = readl_relaxed(priv->base + GE_TRANSMIT_CONTROL_REG);
+	val |= GE_TX_AUTO_NEG | GE_TX_ADD_CRC | GE_TX_SHORT_PAD_THROUGH;
+	writel_relaxed(val, priv->base + GE_TRANSMIT_CONTROL_REG);
+
+	val = GE_RX_STRIP_CRC;
+	writel_relaxed(val, priv->base + GE_CF_CRC_STRIP_REG);
+
+	val = readl_relaxed(priv->base + GE_RECV_CONTROL_REG);
+	val |= GE_RX_STRIP_PAD | GE_RX_PAD_EN;
+	writel_relaxed(val, priv->base + GE_RECV_CONTROL_REG);
+
+	val = GE_AUTO_NEG_CTL;
+	writel_relaxed(val, priv->base + GE_TX_LOCAL_PAGE_REG);
+}
+
+static void hip04_mac_enable(struct net_device *ndev)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	u32 val;
+
+	/* enable tx & rx */
+	val = readl_relaxed(priv->base + GE_PORT_EN);
+	val |= GE_RX_PORT_EN | GE_TX_PORT_EN;
+	writel_relaxed(val, priv->base + GE_PORT_EN);
+
+	/* clear rx int */
+	val = RCV_INT;
+	writel_relaxed(val, priv->base + PPE_RINT);
+
+	/* config recv int */
+	val = GE_RX_INT_THRESHOLD | GE_RX_TIMEOUT;
+	writel_relaxed(val, priv->base + PPE_CFG_RX_PKT_INT);
+
+	/* enable interrupt */
+	priv->reg_inten = DEF_INT_MASK;
+	writel_relaxed(priv->reg_inten, priv->base + PPE_INTEN);
+}
+
+static void hip04_mac_disable(struct net_device *ndev)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	u32 val;
+
+	/* disable int */
+	priv->reg_inten &= ~(DEF_INT_MASK);
+	writel_relaxed(priv->reg_inten, priv->base + PPE_INTEN);
+
+	/* disable tx & rx */
+	val = readl_relaxed(priv->base + GE_PORT_EN);
+	val &= ~(GE_RX_PORT_EN | GE_TX_PORT_EN);
+	writel_relaxed(val, priv->base + GE_PORT_EN);
+}
+
+static void hip04_set_xmit_desc(struct hip04_priv *priv, dma_addr_t phys)
+{
+	writel(phys, priv->base + PPE_CFG_CPU_ADD_ADDR);
+}
+
+static void hip04_set_recv_desc(struct hip04_priv *priv, dma_addr_t phys)
+{
+	regmap_write(priv->map, priv->port * 4 + PPE_CFG_RX_ADDR, phys);
+}
+
+static u32 hip04_recv_cnt(struct hip04_priv *priv)
+{
+	return readl(priv->base + PPE_HIS_RX_PKT_CNT);
+}
+
+static void hip04_update_mac_address(struct net_device *ndev)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+
+	writel_relaxed(((ndev->dev_addr[0] << 8) | (ndev->dev_addr[1])),
+		       priv->base + GE_STATION_MAC_ADDRESS);
+	writel_relaxed(((ndev->dev_addr[2] << 24) | (ndev->dev_addr[3] << 16) |
+			(ndev->dev_addr[4] << 8) | (ndev->dev_addr[5])),
+		       priv->base + GE_STATION_MAC_ADDRESS + 4);
+}
+
+static int hip04_set_mac_address(struct net_device *ndev, void *addr)
+{
+	eth_mac_addr(ndev, addr);
+	hip04_update_mac_address(ndev);
+	return 0;
+}
+
+static void hip04_tx_reclaim(struct net_device *ndev, bool force)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	unsigned tx_tail = priv->tx_tail;
+	struct tx_desc *desc;
+	unsigned int bytes_compl = 0, pkts_compl = 0;
+
+	if (priv->tx_count == 0)
+		goto out;
+
+	while ((tx_tail != priv->tx_head) || (priv->tx_count == TX_DESC_NUM)) {
+		desc = &priv->tx_desc[priv->tx_tail];
+		if (desc->send_addr != 0) {
+			if (force)
+				desc->send_addr = 0;
+			else
+				break;
+		}
+
+		if (priv->tx_phys[tx_tail]) {
+			dma_unmap_single(&ndev->dev, priv->tx_phys[tx_tail],
+					 priv->tx_skb[tx_tail]->len,
+					 DMA_TO_DEVICE);
+			priv->tx_phys[tx_tail] = 0;
+		}
+		pkts_compl++;
+		bytes_compl += priv->tx_skb[tx_tail]->len;
+		dev_kfree_skb(priv->tx_skb[tx_tail]);
+		priv->tx_skb[tx_tail] = NULL;
+		tx_tail = TX_NEXT(tx_tail);
+		priv->tx_count--;
+
+		if (priv->tx_count <= 0)
+			break;
+	}
+
+	priv->tx_tail = tx_tail;
+
+	/* Ensure tx_tail & tx_count visible to xmit */
+	smp_mb();
+out:
+
+	if (pkts_compl || bytes_compl)
+		netdev_completed_queue(ndev, pkts_compl, bytes_compl);
+
+	if (unlikely(netif_queue_stopped(ndev)) &&
+	    (priv->tx_count < TX_DESC_NUM))
+		netif_wake_queue(ndev);
+}
+
+static void hip04_tx_clean_monitor(struct work_struct *work)
+{
+	struct hip04_priv *priv = container_of(work, struct hip04_priv,
+					       tx_clean_task.work);
+	struct net_device *ndev = priv->ndev;
+	int delta_in_ticks = msecs_to_jiffies(1000);
+
+	if (!time_in_range(jiffies, priv->last_tx,
+			   priv->last_tx + delta_in_ticks)) {
+		netif_tx_lock(ndev);
+		hip04_tx_reclaim(ndev, false);
+		netif_tx_unlock(ndev);
+	}
+	queue_delayed_work(priv->wq, &priv->tx_clean_task, delta_in_ticks);
+}
+
+static int hip04_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	struct net_device_stats *stats = &ndev->stats;
+	unsigned int tx_head = priv->tx_head;
+	struct tx_desc *desc = &priv->tx_desc[tx_head];
+	dma_addr_t phys;
+
+	if (priv->tx_count >= TX_DESC_NUM) {
+		netif_stop_queue(ndev);
+		return NETDEV_TX_BUSY;
+	}
+
+	hip04_tx_reclaim(ndev, false);
+
+	phys = dma_map_single(&ndev->dev, skb->data, skb->len, DMA_TO_DEVICE);
+	if (dma_mapping_error(&ndev->dev, phys)) {
+		dev_kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
+
+	priv->tx_skb[tx_head] = skb;
+	priv->tx_phys[tx_head] = phys;
+	desc->send_addr = cpu_to_be32(phys);
+	desc->send_size = cpu_to_be32(skb->len);
+	desc->cfg = cpu_to_be32(TX_CLEAR_WB | TX_FINISH_CACHE_INV);
+	phys = priv->tx_desc_dma + tx_head * sizeof(struct tx_desc);
+	desc->wb_addr = cpu_to_be32(phys);
+	skb_tx_timestamp(skb);
+
+	/* Don't wait up for transmitted skbs to be freed. */
+	skb_orphan(skb);
+
+	hip04_set_xmit_desc(priv, phys);
+	priv->tx_head = TX_NEXT(tx_head);
+	netdev_sent_queue(ndev, skb->len);
+
+	stats->tx_bytes += skb->len;
+	stats->tx_packets++;
+	priv->tx_count++;
+	priv->last_tx = jiffies;
+
+	/* Ensure tx_head & tx_count update visible to tx reclaim */
+	smp_mb();
+
+	return NETDEV_TX_OK;
+}
+
+static int hip04_rx_poll(struct napi_struct *napi, int budget)
+{
+	struct hip04_priv *priv = container_of(napi, struct hip04_priv, napi);
+	struct net_device *ndev = priv->ndev;
+	struct net_device_stats *stats = &ndev->stats;
+	unsigned int cnt = hip04_recv_cnt(priv);
+	struct rx_desc *desc;
+	struct sk_buff *skb;
+	unsigned char *buf;
+	bool last = false;
+	dma_addr_t phys;
+	int rx = 0;
+	u16 len;
+	u32 err;
+
+	while (cnt && !last) {
+		buf = priv->rx_buf[priv->rx_head];
+		skb = build_skb(buf, priv->rx_buf_size);
+		if (unlikely(!skb))
+			net_dbg_ratelimited("build_skb failed\n");
+
+		dma_unmap_single(&ndev->dev, priv->rx_phys[priv->rx_head],
+				 RX_BUF_SIZE, DMA_FROM_DEVICE);
+		priv->rx_phys[priv->rx_head] = 0;
+
+		desc = (struct rx_desc *)skb->data;
+		len = be16_to_cpu(desc->pkt_len);
+		err = be32_to_cpu(desc->pkt_err);
+
+		if (0 == len) {
+			dev_kfree_skb_any(skb);
+			last = true;
+		} else if ((err & RX_PKT_ERR) || (len >= GMAC_MAX_PKT_LEN)) {
+			dev_kfree_skb_any(skb);
+			stats->rx_dropped++;
+			stats->rx_errors++;
+		} else {
+			skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+			skb_put(skb, len);
+			skb->protocol = eth_type_trans(skb, ndev);
+			napi_gro_receive(&priv->napi, skb);
+			stats->rx_packets++;
+			stats->rx_bytes += len;
+			rx++;
+		}
+
+		buf = netdev_alloc_frag(priv->rx_buf_size);
+		if (!buf)
+			return -ENOMEM;
+		phys = dma_map_single(&ndev->dev, buf,
+				      RX_BUF_SIZE, DMA_FROM_DEVICE);
+		if (dma_mapping_error(&ndev->dev, phys))
+			return -EIO;
+		priv->rx_buf[priv->rx_head] = buf;
+		priv->rx_phys[priv->rx_head] = phys;
+		hip04_set_recv_desc(priv, phys);
+
+		priv->rx_head = RX_NEXT(priv->rx_head);
+		if (rx >= budget)
+			goto done;
+
+		if (--cnt == 0)
+			cnt = hip04_recv_cnt(priv);
+	}
+
+	if (!(priv->reg_inten & RCV_INT)) {
+		/* enable rx interrupt */
+		priv->reg_inten |= RCV_INT;
+		writel_relaxed(priv->reg_inten, priv->base + PPE_INTEN);
+	}
+	napi_complete(napi);
+done:
+	return rx;
+}
+
+static irqreturn_t hip04_mac_interrupt(int irq, void *dev_id)
+{
+	struct net_device *ndev = (struct net_device *)dev_id;
+	struct hip04_priv *priv = netdev_priv(ndev);
+	struct net_device_stats *stats = &ndev->stats;
+	u32 ists = readl_relaxed(priv->base + PPE_INTSTS);
+
+	writel_relaxed(DEF_INT_MASK, priv->base + PPE_RINT);
+
+	if (unlikely(ists & DEF_INT_ERR)) {
+		if (ists & (RCV_NOBUF | RCV_DROP))
+			stats->rx_errors++;
+			stats->rx_dropped++;
+			netdev_err(ndev, "rx drop\n");
+		if (ists & TX_DROP) {
+			stats->tx_dropped++;
+			netdev_err(ndev, "tx drop\n");
+		}
+	}
+
+	if (ists & RCV_INT) {
+		/* disable rx interrupt */
+		priv->reg_inten &= ~(RCV_INT);
+		writel_relaxed(priv->reg_inten, priv->base + PPE_INTEN);
+		napi_schedule(&priv->napi);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static void hip04_adjust_link(struct net_device *ndev)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	struct phy_device *phy = priv->phy;
+
+	if ((priv->speed != phy->speed) || (priv->duplex != phy->duplex)) {
+		hip04_config_port(ndev, phy->speed, phy->duplex);
+		phy_print_status(phy);
+	}
+}
+
+static int hip04_mac_open(struct net_device *ndev)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	int i;
+
+	priv->rx_head = 0;
+	priv->tx_head = 0;
+	priv->tx_tail = 0;
+	priv->tx_count = 0;
+	hip04_reset_ppe(priv);
+
+	for (i = 0; i < RX_DESC_NUM; i++) {
+		dma_addr_t phys;
+
+		phys = dma_map_single(&ndev->dev, priv->rx_buf[i],
+				      RX_BUF_SIZE, DMA_FROM_DEVICE);
+		if (dma_mapping_error(&ndev->dev, phys))
+			return -EIO;
+
+		priv->rx_phys[i] = phys;
+		hip04_set_recv_desc(priv, phys);
+	}
+
+	if (priv->phy)
+		phy_start(priv->phy);
+
+	netdev_reset_queue(ndev);
+	netif_start_queue(ndev);
+	hip04_mac_enable(ndev);
+	napi_enable(&priv->napi);
+
+	INIT_DELAYED_WORK(&priv->tx_clean_task, hip04_tx_clean_monitor);
+	queue_delayed_work(priv->wq, &priv->tx_clean_task, 0);
+
+	return 0;
+}
+
+static int hip04_mac_stop(struct net_device *ndev)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	int i;
+
+	cancel_delayed_work_sync(&priv->tx_clean_task);
+
+	napi_disable(&priv->napi);
+	netif_stop_queue(ndev);
+	hip04_mac_disable(ndev);
+	hip04_tx_reclaim(ndev, true);
+	hip04_reset_ppe(priv);
+
+	if (priv->phy)
+		phy_stop(priv->phy);
+
+	for (i = 0; i < RX_DESC_NUM; i++) {
+		if (priv->rx_phys[i]) {
+			dma_unmap_single(&ndev->dev, priv->rx_phys[i],
+					 RX_BUF_SIZE, DMA_FROM_DEVICE);
+			priv->rx_phys[i] = 0;
+		}
+	}
+
+	return 0;
+}
+
+static void hip04_timeout(struct net_device *ndev)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+
+	schedule_work(&priv->tx_timeout_task);
+}
+
+static void hip04_tx_timeout_task(struct work_struct *work)
+{
+	struct hip04_priv *priv;
+
+	priv = container_of(work, struct hip04_priv, tx_timeout_task);
+	hip04_mac_stop(priv->ndev);
+	hip04_mac_open(priv->ndev);
+}
+
+static struct net_device_stats *hip04_get_stats(struct net_device *ndev)
+{
+	return &ndev->stats;
+}
+
+static struct net_device_ops hip04_netdev_ops = {
+	.ndo_open		= hip04_mac_open,
+	.ndo_stop		= hip04_mac_stop,
+	.ndo_get_stats		= hip04_get_stats,
+	.ndo_start_xmit		= hip04_mac_start_xmit,
+	.ndo_set_mac_address	= hip04_set_mac_address,
+	.ndo_tx_timeout         = hip04_timeout,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_change_mtu		= eth_change_mtu,
+};
+
+static int hip04_alloc_ring(struct net_device *ndev, struct device *d)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	int i;
+
+	priv->tx_desc = dma_alloc_coherent(d,
+			TX_DESC_NUM * sizeof(struct tx_desc),
+			&priv->tx_desc_dma, GFP_KERNEL);
+	if (!priv->tx_desc)
+		return -ENOMEM;
+
+	priv->rx_buf_size = RX_BUF_SIZE +
+			    SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	for (i = 0; i < RX_DESC_NUM; i++) {
+		priv->rx_buf[i] = netdev_alloc_frag(priv->rx_buf_size);
+		if (!priv->rx_buf[i])
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void hip04_free_ring(struct net_device *ndev, struct device *d)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	int i;
+
+	for (i = 0; i < RX_DESC_NUM; i++)
+		if (priv->rx_buf[i])
+			put_page(virt_to_head_page(priv->rx_buf[i]));
+
+	for (i = 0; i < TX_DESC_NUM; i++)
+		if (priv->tx_skb[i])
+			dev_kfree_skb_any(priv->tx_skb[i]);
+
+	dma_free_coherent(d, TX_DESC_NUM * sizeof(struct tx_desc),
+			  priv->tx_desc, priv->tx_desc_dma);
+}
+
+static int hip04_mac_probe(struct platform_device *pdev)
+{
+	struct device *d = &pdev->dev;
+	struct device_node *node = d->of_node;
+	struct of_phandle_args arg;
+	struct net_device *ndev;
+	struct hip04_priv *priv;
+	struct resource *res;
+	unsigned int irq;
+	int ret;
+
+	ndev = alloc_etherdev(sizeof(struct hip04_priv));
+	if (!ndev)
+		return -ENOMEM;
+
+	priv = netdev_priv(ndev);
+	priv->ndev = ndev;
+	platform_set_drvdata(pdev, ndev);
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	priv->base = devm_ioremap_resource(d, res);
+	if (IS_ERR(priv->base)) {
+		ret = PTR_ERR(priv->base);
+		goto init_fail;
+	}
+
+	ret = of_parse_phandle_with_fixed_args(node, "port-handle", 2, 0, &arg);
+	if (ret < 0) {
+		dev_warn(d, "no port-handle\n");
+		goto init_fail;
+	}
+
+	priv->port = arg.args[0];
+	priv->chan = arg.args[1] * RX_DESC_NUM;
+
+	priv->map = syscon_node_to_regmap(arg.np);
+	if (IS_ERR(priv->map)) {
+		dev_warn(d, "no syscon hisilicon,hip04-ppe\n");
+		ret = PTR_ERR(priv->map);
+		goto init_fail;
+	}
+
+	priv->phy_mode = of_get_phy_mode(node);
+	if (priv->phy_mode < 0) {
+		dev_warn(d, "not find phy-mode\n");
+		ret = -EINVAL;
+		goto init_fail;
+	}
+
+	irq = platform_get_irq(pdev, 0);
+	if (irq <= 0) {
+		ret = -EINVAL;
+		goto init_fail;
+	}
+
+	ret = devm_request_irq(d, irq, hip04_mac_interrupt,
+			       0, pdev->name, ndev);
+	if (ret) {
+		netdev_err(ndev, "devm_request_irq failed\n");
+		goto init_fail;
+	}
+
+	priv->phy_node = of_parse_phandle(node, "phy-handle", 0);
+	if (priv->phy_node) {
+		priv->phy = of_phy_connect(ndev, priv->phy_node,
+			&hip04_adjust_link, 0, priv->phy_mode);
+		if (!priv->phy) {
+			ret = -EPROBE_DEFER;
+			goto init_fail;
+		}
+	}
+
+	priv->wq = create_singlethread_workqueue(ndev->name);
+	if (!priv->wq) {
+		ret = -ENOMEM;
+		goto init_fail;
+	}
+
+	INIT_WORK(&priv->tx_timeout_task, hip04_tx_timeout_task);
+
+	ether_setup(ndev);
+	ndev->netdev_ops = &hip04_netdev_ops;
+	ndev->watchdog_timeo = TX_TIMEOUT;
+	ndev->priv_flags |= IFF_UNICAST_FLT;
+	ndev->irq = irq;
+	netif_napi_add(ndev, &priv->napi, hip04_rx_poll, NAPI_POLL_WEIGHT);
+	SET_NETDEV_DEV(ndev, &pdev->dev);
+
+	hip04_reset_ppe(priv);
+	if (priv->phy_mode == PHY_INTERFACE_MODE_MII)
+		hip04_config_port(ndev, SPEED_100, DUPLEX_FULL);
+
+	hip04_config_fifo(priv);
+	random_ether_addr(ndev->dev_addr);
+	hip04_update_mac_address(ndev);
+
+	ret = hip04_alloc_ring(ndev, d);
+	if (ret) {
+		netdev_err(ndev, "alloc ring fail\n");
+		goto alloc_fail;
+	}
+
+	ret = register_netdev(ndev);
+	if (ret) {
+		free_netdev(ndev);
+		goto alloc_fail;
+	}
+
+	return 0;
+
+alloc_fail:
+	hip04_free_ring(ndev, d);
+init_fail:
+	of_node_put(priv->phy_node);
+	free_netdev(ndev);
+	return ret;
+}
+
+static int hip04_remove(struct platform_device *pdev)
+{
+	struct net_device *ndev = platform_get_drvdata(pdev);
+	struct hip04_priv *priv = netdev_priv(ndev);
+	struct device *d = &pdev->dev;
+
+	if (priv->phy)
+		phy_disconnect(priv->phy);
+
+	hip04_free_ring(ndev, d);
+	unregister_netdev(ndev);
+	free_irq(ndev->irq, ndev);
+	of_node_put(priv->phy_node);
+	cancel_work_sync(&priv->tx_timeout_task);
+	if (priv->wq)
+		destroy_workqueue(priv->wq);
+	free_netdev(ndev);
+
+	return 0;
+}
+
+static const struct of_device_id hip04_mac_match[] = {
+	{ .compatible = "hisilicon,hip04-mac" },
+	{ }
+};
+
+static struct platform_driver hip04_mac_driver = {
+	.probe	= hip04_mac_probe,
+	.remove	= hip04_remove,
+	.driver	= {
+		.name		= DRV_NAME,
+		.owner		= THIS_MODULE,
+		.of_match_table	= hip04_mac_match,
+	},
+};
+module_platform_driver(hip04_mac_driver);
+
+MODULE_DESCRIPTION("HISILICON P04 Ethernet driver");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("platform:hip04-ether");
-- 
1.8.0

^ permalink raw reply related

* [PATCH net-next v9 2/3] net: hisilicon: new hip04 MDIO driver
From: Ding Tianhong @ 2014-12-11 11:42 UTC (permalink / raw)
  To: zhangfei.gao, davem, linux, arnd, f.fainelli, sergei.shtylyov,
	mark.rutland, David.Laight, eric.dumazet, xuwei5
  Cc: linux-arm-kernel, netdev, devicetree
In-Reply-To: <1418298150-4944-1-git-send-email-dingtianhong@huawei.com>

From: Zhangfei Gao <zhangfei.gao@linaro.org>

Hisilicon hip04 platform mdio driver
Reuse Marvell phy drivers/net/phy/marvell.c

Signed-off-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
---
 drivers/net/ethernet/hisilicon/Kconfig      |   9 ++
 drivers/net/ethernet/hisilicon/Makefile     |   1 +
 drivers/net/ethernet/hisilicon/hip04_mdio.c | 186 ++++++++++++++++++++++++++++
 3 files changed, 196 insertions(+)
 create mode 100644 drivers/net/ethernet/hisilicon/hip04_mdio.c

diff --git a/drivers/net/ethernet/hisilicon/Kconfig b/drivers/net/ethernet/hisilicon/Kconfig
index e942173..a54d897 100644
--- a/drivers/net/ethernet/hisilicon/Kconfig
+++ b/drivers/net/ethernet/hisilicon/Kconfig
@@ -24,4 +24,13 @@ config HIX5HD2_GMAC
 	help
 	  This selects the hix5hd2 mac family network device.
 
+config HIP04_ETH
+	tristate "HISILICON P04 Ethernet support"
+	select PHYLIB
+	select MARVELL_PHY
+	select MFD_SYSCON
+	---help---
+	  If you wish to compile a kernel for a hardware with hisilicon p04 SoC and
+	  want to use the internal ethernet then you should answer Y to this.
+
 endif # NET_VENDOR_HISILICON
diff --git a/drivers/net/ethernet/hisilicon/Makefile b/drivers/net/ethernet/hisilicon/Makefile
index 9175e846..40115a7 100644
--- a/drivers/net/ethernet/hisilicon/Makefile
+++ b/drivers/net/ethernet/hisilicon/Makefile
@@ -3,3 +3,4 @@
 #
 
 obj-$(CONFIG_HIX5HD2_GMAC) += hix5hd2_gmac.o
+obj-$(CONFIG_HIP04_ETH) += hip04_mdio.o
diff --git a/drivers/net/ethernet/hisilicon/hip04_mdio.c b/drivers/net/ethernet/hisilicon/hip04_mdio.c
new file mode 100644
index 0000000..b3bac25
--- /dev/null
+++ b/drivers/net/ethernet/hisilicon/hip04_mdio.c
@@ -0,0 +1,186 @@
+/* Copyright (c) 2014 Linaro Ltd.
+ * Copyright (c) 2014 Hisilicon Limited.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/io.h>
+#include <linux/of_mdio.h>
+#include <linux/delay.h>
+
+#define MDIO_CMD_REG		0x0
+#define MDIO_ADDR_REG		0x4
+#define MDIO_WDATA_REG		0x8
+#define MDIO_RDATA_REG		0xc
+#define MDIO_STA_REG		0x10
+
+#define MDIO_START		BIT(14)
+#define MDIO_R_VALID		BIT(1)
+#define MDIO_READ	        (BIT(12) | BIT(11) | MDIO_START)
+#define MDIO_WRITE	        (BIT(12) | BIT(10) | MDIO_START)
+
+struct hip04_mdio_priv {
+	void __iomem *base;
+};
+
+#define WAIT_TIMEOUT 10
+static int hip04_mdio_wait_ready(struct mii_bus *bus)
+{
+	struct hip04_mdio_priv *priv = bus->priv;
+	int i;
+
+	for (i = 0; readl_relaxed(priv->base + MDIO_CMD_REG) & MDIO_START; i++) {
+		if (i == WAIT_TIMEOUT)
+			return -ETIMEDOUT;
+		msleep(20);
+	}
+
+	return 0;
+}
+
+static int hip04_mdio_read(struct mii_bus *bus, int mii_id, int regnum)
+{
+	struct hip04_mdio_priv *priv = bus->priv;
+	u32 val;
+	int ret;
+
+	ret = hip04_mdio_wait_ready(bus);
+	if (ret < 0)
+		goto out;
+
+	val = regnum | (mii_id << 5) | MDIO_READ;
+	writel_relaxed(val, priv->base + MDIO_CMD_REG);
+
+	ret = hip04_mdio_wait_ready(bus);
+	if (ret < 0)
+		goto out;
+
+	val = readl_relaxed(priv->base + MDIO_STA_REG);
+	if (val & MDIO_R_VALID) {
+		dev_err(bus->parent, "SMI bus read not valid\n");
+		ret = -ENODEV;
+		goto out;
+	}
+
+	val = readl_relaxed(priv->base + MDIO_RDATA_REG);
+	ret = val & 0xFFFF;
+out:
+	return ret;
+}
+
+static int hip04_mdio_write(struct mii_bus *bus, int mii_id,
+			    int regnum, u16 value)
+{
+	struct hip04_mdio_priv *priv = bus->priv;
+	u32 val;
+	int ret;
+
+	ret = hip04_mdio_wait_ready(bus);
+	if (ret < 0)
+		goto out;
+
+	writel_relaxed(value, priv->base + MDIO_WDATA_REG);
+	val = regnum | (mii_id << 5) | MDIO_WRITE;
+	writel_relaxed(val, priv->base + MDIO_CMD_REG);
+out:
+	return ret;
+}
+
+static int hip04_mdio_reset(struct mii_bus *bus)
+{
+	int temp, i;
+
+	for (i = 0; i < PHY_MAX_ADDR; i++) {
+		hip04_mdio_write(bus, i, 22, 0);
+		temp = hip04_mdio_read(bus, i, MII_BMCR);
+		if (temp < 0)
+			continue;
+
+		temp |= BMCR_RESET;
+		if (hip04_mdio_write(bus, i, MII_BMCR, temp) < 0)
+			continue;
+	}
+
+	mdelay(500);
+	return 0;
+}
+
+static int hip04_mdio_probe(struct platform_device *pdev)
+{
+	struct resource *r;
+	struct mii_bus *bus;
+	struct hip04_mdio_priv *priv;
+	int ret;
+
+	bus = mdiobus_alloc_size(sizeof(struct hip04_mdio_priv));
+	if (!bus) {
+		dev_err(&pdev->dev, "Cannot allocate MDIO bus\n");
+		return -ENOMEM;
+	}
+
+	bus->name = "hip04_mdio_bus";
+	bus->read = hip04_mdio_read;
+	bus->write = hip04_mdio_write;
+	bus->reset = hip04_mdio_reset;
+	snprintf(bus->id, MII_BUS_ID_SIZE, "%s-mii", dev_name(&pdev->dev));
+	bus->parent = &pdev->dev;
+	priv = bus->priv;
+
+	r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	priv->base = devm_ioremap_resource(&pdev->dev, r);
+	if (IS_ERR(priv->base)) {
+		ret = PTR_ERR(priv->base);
+		goto out_mdio;
+	}
+
+	ret = of_mdiobus_register(bus, pdev->dev.of_node);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "Cannot register MDIO bus (%d)\n", ret);
+		goto out_mdio;
+	}
+
+	platform_set_drvdata(pdev, bus);
+
+	return 0;
+
+out_mdio:
+	mdiobus_free(bus);
+	return ret;
+}
+
+static int hip04_mdio_remove(struct platform_device *pdev)
+{
+	struct mii_bus *bus = platform_get_drvdata(pdev);
+
+	mdiobus_unregister(bus);
+	mdiobus_free(bus);
+
+	return 0;
+}
+
+static const struct of_device_id hip04_mdio_match[] = {
+	{ .compatible = "hisilicon,hip04-mdio" },
+	{ }
+};
+MODULE_DEVICE_TABLE(of, hip04_mdio_match);
+
+static struct platform_driver hip04_mdio_driver = {
+	.probe = hip04_mdio_probe,
+	.remove = hip04_mdio_remove,
+	.driver = {
+		.name = "hip04-mdio",
+		.owner = THIS_MODULE,
+		.of_match_table = hip04_mdio_match,
+	},
+};
+
+module_platform_driver(hip04_mdio_driver);
+
+MODULE_DESCRIPTION("HISILICON P04 MDIO interface driver");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("platform:hip04-mdio");
-- 
1.8.0

^ permalink raw reply related

* [PATCH net-next v9 1/3] Documentation: add Device tree bindings for Hisilicon hip04 ethernet
From: Ding Tianhong @ 2014-12-11 11:42 UTC (permalink / raw)
  To: zhangfei.gao, davem, linux, arnd, f.fainelli, sergei.shtylyov,
	mark.rutland, David.Laight, eric.dumazet, xuwei5
  Cc: linux-arm-kernel, netdev, devicetree
In-Reply-To: <1418298150-4944-1-git-send-email-dingtianhong@huawei.com>

From: Zhangfei Gao <zhangfei.gao@linaro.org>

This patch adds the Device Tree bindings for the Hisilicon hip04
Ethernet controller, including 100M / 1000M controller.

Signed-off-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
---
 .../bindings/net/hisilicon-hip04-net.txt           | 88 ++++++++++++++++++++++
 1 file changed, 88 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/hisilicon-hip04-net.txt

diff --git a/Documentation/devicetree/bindings/net/hisilicon-hip04-net.txt b/Documentation/devicetree/bindings/net/hisilicon-hip04-net.txt
new file mode 100644
index 0000000..988fc69
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/hisilicon-hip04-net.txt
@@ -0,0 +1,88 @@
+Hisilicon hip04 Ethernet Controller
+
+* Ethernet controller node
+
+Required properties:
+- compatible: should be "hisilicon,hip04-mac".
+- reg: address and length of the register set for the device.
+- interrupts: interrupt for the device.
+- port-handle: <phandle port channel>
+	phandle, specifies a reference to the syscon ppe node
+	port, port number connected to the controller
+	channel, recv channel start from channel * number (RX_DESC_NUM)
+- phy-mode: see ethernet.txt [1].
+
+Optional properties:
+- phy-handle: see ethernet.txt [1].
+
+[1] Documentation/devicetree/bindings/net/ethernet.txt
+
+
+* Ethernet ppe node:
+Control rx & tx fifos of all ethernet controllers.
+Have 2048 recv channels shared by all ethernet controllers, only if no overlap.
+Each controller's recv channel start from channel * number (RX_DESC_NUM).
+
+Required properties:
+- compatible: "hisilicon,hip04-ppe", "syscon".
+- reg: address and length of the register set for the device.
+
+
+* MDIO bus node:
+
+Required properties:
+
+- compatible: should be "hisilicon,hip04-mdio".
+- Inherits from MDIO bus node binding [2]
+[2] Documentation/devicetree/bindings/net/phy.txt
+
+Example:
+	mdio {
+		compatible = "hisilicon,hip04-mdio";
+		reg = <0x28f1000 0x1000>;
+		#address-cells = <1>;
+		#size-cells = <0>;
+
+		phy0: ethernet-phy@0 {
+			compatible = "ethernet-phy-ieee802.3-c22";
+			reg = <0>;
+			marvell,reg-init = <18 0x14 0 0x8001>;
+		};
+
+		phy1: ethernet-phy@1 {
+			compatible = "ethernet-phy-ieee802.3-c22";
+			reg = <1>;
+			marvell,reg-init = <18 0x14 0 0x8001>;
+		};
+	};
+
+	ppe: ppe@28c0000 {
+		compatible = "hisilicon,hip04-ppe", "syscon";
+		reg = <0x28c0000 0x10000>;
+	};
+
+	fe: ethernet@28b0000 {
+		compatible = "hisilicon,hip04-mac";
+		reg = <0x28b0000 0x10000>;
+		interrupts = <0 413 4>;
+		phy-mode = "mii";
+		port-handle = <&ppe 31 0>;
+	};
+
+	ge0: ethernet@2800000 {
+		compatible = "hisilicon,hip04-mac";
+		reg = <0x2800000 0x10000>;
+		interrupts = <0 402 4>;
+		phy-mode = "sgmii";
+		port-handle = <&ppe 0 1>;
+		phy-handle = <&phy0>;
+	};
+
+	ge8: ethernet@2880000 {
+		compatible = "hisilicon,hip04-mac";
+		reg = <0x2880000 0x10000>;
+		interrupts = <0 410 4>;
+		phy-mode = "sgmii";
+		port-handle = <&ppe 8 2>;
+		phy-handle = <&phy1>;
+	};
-- 
1.8.0

^ permalink raw reply related

* [PATCH net-next v9 0/3] add hisilicon hip04 ethernet driver
From: Ding Tianhong @ 2014-12-11 11:42 UTC (permalink / raw)
  To: zhangfei.gao-QSEj5FYQhm4dnm+yROfE0A, davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	linux-lFZ/pmaqli7XmaaqVzeoHQ, arnd-r2nGTMty4D4,
	f.fainelli-Re5JQEeQqe8AvxtiuMwx3w,
	sergei.shtylyov-M4DtvfQ/ZS1MRgGoP+s0PdBPR1lH4CV8,
	mark.rutland-5wv7dgnIgG8, David.Laight-ZS65k/vG3HxXrIkS9f7CXA,
	eric.dumazet-Re5JQEeQqe8AvxtiuMwx3w,
	xuwei5-C8/M+/jPZTeaMJb+Lgu22Q
  Cc: linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	netdev-u79uwXL29TY76Z2rM5mHXA, devicetree-u79uwXL29TY76Z2rM5mHXA

v9:
- There is no tx completion interrupts to free DMAd Tx packets, it means taht
  we rely on new tx packets arriving to run the destructors of completed packets,
  which open up space in their sockets's send queues. Sometimes we don't get such
  new packets causing Tx to stall, a single UDP transmitter is a good example of
  this situation, so we need a clean up workqueue to reclaims completed packets,
  the workqueue will only free the last packets which is already stay for several jiffies.
  Also fix some format cleanups.

v8:
- Use poll to reclaim xmitted buffer as workaround since no tx done interrupt 

v7:
- Remove select NET_CORE in 0002

v6:
- Suggest by Russell: Use netdev_sent_queue & netdev_completed_queue to solve latency issue 
  Also shorten the period of timer, which is used to wakeup the queue since no
  tx completed interrupt.

v5:
  no big change, fix typo

v4:
- Modify accoringly to the suggetion from Arnd, Florian, Eric, David
  Use of_parse_phandle_with_fixed_args & syscon_node_to_regmap get ppe info
  Add skb_orphan() and tx_timer for reclaim since no tx_finished interrupt
  Update timeout, and move of_phy_connect to probe to reuse open/stop

v3:
- Suggest from Arnd, use syscon & regmap_write/read to replace static void __iomem *ppebase.
  Modify hisilicon-hip04-net.txt accrordingly to suggestion from Florian and Sergei.

v2:
- Got many suggestions from Russell, Arnd, Florian, Mark and Sergei
  Remove memcpy, use dma_map/unmap_single, use dma_alloc_coherent rather than dma_pool, etc.
  Refer property in ethernet.txt, change ppe description, etc.

Zhangfei Gao (3):
  Documentation: add Device tree bindings for Hisilicon hip04 ethernet
  net: hisilicon: new hip04 MDIO driver
  net: hisilicon: new hip04 ethernet driver

 .../bindings/net/hisilicon-hip04-net.txt           |  88 +++
 drivers/net/ethernet/hisilicon/Kconfig             |   9 +
 drivers/net/ethernet/hisilicon/Makefile            |   1 +
 drivers/net/ethernet/hisilicon/hip04_eth.c         | 876 +++++++++++++++++++++
 drivers/net/ethernet/hisilicon/hip04_mdio.c        | 186 +++++
 5 files changed, 1160 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/hisilicon-hip04-net.txt
 create mode 100644 drivers/net/ethernet/hisilicon/hip04_eth.c
 create mode 100644 drivers/net/ethernet/hisilicon/hip04_mdio.c

-- 
1.8.0


--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: OOPS: net/ipv6/datagram.c (line 260) ipv6_local_error
From: Steffen Klassert @ 2014-12-11 11:37 UTC (permalink / raw)
  To: Chris Ruehl; +Cc: netdev, davem
In-Reply-To: <5487CDE9.4070606@gtsys.com.hk>

On Wed, Dec 10, 2014 at 12:36:57PM +0800, Chris Ruehl wrote:
> Hi all,
> 
> We running a Dell server which crash frequently with (dell crash
> video snapshot) vanilla 3.14.25
> 
> 
> 
> The capture don't sadly don't show the full trace, so we lack on
> information.
> 1st line I can see in the crash video from the idrac :
> tcp_transmit_skb+0x461
> 
> The null pointer happen:
>  Type "apropos word" to search for commands related to "word"...
> Reading symbols from net/ipv6/datagram.o...done.
> (gdb) list *(ipv6_local_error+0x17)
> 0xae7 is in ipv6_local_error (net/ipv6/datagram.c:260).
> 255        struct ipv6_pinfo *np = inet6_sk(sk);
> 256        struct sock_exterr_skb *serr;
> 257        struct ipv6hdr *iph;
> 258        struct sk_buff *skb;
> 259
> 260        if (!np->recverr)
> 261            return;
> 262
> 263        skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC);
> 264        if (!skb)
> (gdb) quit
> 
> 
> We running a 6in4 with ipsec tunnel on the 6. I found a pull request from
> Steffen Klassert
> here:
> http://article.gmane.org/gmane.linux.network/281469
> 
> Which might be relevant to this problem.
> 
> For time being I add a
> 
>         if (np == NULL){
>                 LIMIT_NETDEBUG(KERN_DEBUG "ipv6_pinfo is NULL\n");
>                 return;
>         }
> 
> as work around to stop the server crashing

Looks like ipv6_local_error() got an ipv4 socket. You could
extend your workaround to something like the below. This
should give a full backtrace and the socket family.

 
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index cc11396..cf3a5d8 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -258,6 +258,13 @@ void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info)
 	struct ipv6hdr *iph;
 	struct sk_buff *skb;
 
+	if (np == NULL) {
+		WARN_ON_ONCE(1);
+		if (net_ratelimit())
+			printk(KERN_DEBUG "ipv6_pinfo is NULL, sk family %d\n", sk->sk_family);
+		return;
+	}
+
 	if (!np->recverr)
 		return;
 

^ permalink raw reply related

* Re: default enable sparse __CHECK_ENDIAN__ (was: Re: [PATCH v7 2/3] net: Add Keystone NetCP ethernet driver)
From: Marcel Holtmann @ 2014-12-11 11:18 UTC (permalink / raw)
  To: Joe Perches
  Cc: David S. Miller, Andrew Morton, Christopher Li, Michal Marek,
	m-karicheri2, Network Development, linux-arm-kernel, kernel list,
	devicetree, Rob Herring, grant.likely, linux-sparse
In-Reply-To: <1418267099.18092.28.camel@perches.com>

Hi Joe,

>>> Are you referring to the static code analyser sparse that is invoked
>>> through?
>> You have to explicitly enable endian checking, it's not on by
>> default.
> 
> There don't seem to be thousands of warnings anymore.
> 
> Maybe it's time to default enable it when using C=?
> 
> from: Documentation/sparse.txt:
> 
> The optional make variable CF can be used to pass arguments to sparse.  The
> build system passes -Wbitwise to sparse automatically.  To perform endianness
> checks, you may define __CHECK_ENDIAN__:
> 
>        make C=2 CF="-D__CHECK_ENDIAN__"
> 
> These checks are disabled by default as they generate a host of warnings.

actually a few subsystems use this in their Makefile:

	subdir-ccflags-y += -D__CHECK_ENDIAN__

We could start with that to enable endian checks by default in various places.

Regards

Marcel


^ permalink raw reply

* Re: [PATCH iproute2] ip: Simplify executing ip cmd within namespace
From: vadim4j @ 2014-12-11 10:57 UTC (permalink / raw)
  To: Nicolas Dichtel; +Cc: Vadim Kochan, netdev
In-Reply-To: <548978CD.80404@6wind.com>

On Thu, Dec 11, 2014 at 11:58:21AM +0100, Nicolas Dichtel wrote:
> Le 10/12/2014 23:56, Vadim Kochan a écrit :
> >From: Vadim Kochan <vadim4j@gmail.com>
> >
> >Added new '-ns' option to simplify executing following cmd:
> >
> >     ip netns exec NETNS ip OPTIONS COMMAND OBJECT
> >
> >     to
> >
> >     ip -ns NETNS OPTIONS COMMAND OBJECT
> >
> >e.g.:
> >
> >     ip -ns vnet0 link add br0 type bridge
> >
> >Signed-off-by: Vadim Kochan <vadim4j@gmail.com>
> >---
> >May be new option should have better name than '-ns' ?
> What about 'ip -netns' to be explicit like other options?
> user may still use 'ip -n' at the end.
> 
> 
> Regards,
> Nicolas

May be left '-n' for some other future option, but use the following
options: -net[ns] and -ns ? What do you think ?

Thanks,

^ permalink raw reply

* Re: [RFC PATCH net-next 1/1] net: Support for switch port configuration
From: Jiri Pirko @ 2014-12-11 11:01 UTC (permalink / raw)
  To: Varlese, Marco
  Cc: John Fastabend, netdev@vger.kernel.org,
	stephen@networkplumber.org, Fastabend, John R,
	roopa@cumulusnetworks.com, sfeldma@gmail.com,
	linux-kernel@vger.kernel.org
In-Reply-To: <C4896FB061E7DE4AAC93031BDCA044B104AC3914@IRSMSX108.ger.corp.intel.com>

Thu, Dec 11, 2014 at 10:59:42AM CET, marco.varlese@intel.com wrote:
>> -----Original Message-----
>> From: John Fastabend [mailto:john.fastabend@gmail.com]
>> Sent: Wednesday, December 10, 2014 5:04 PM
>> To: Jiri Pirko
>> Cc: Varlese, Marco; netdev@vger.kernel.org;
>> stephen@networkplumber.org; Fastabend, John R;
>> roopa@cumulusnetworks.com; sfeldma@gmail.com; linux-
>> kernel@vger.kernel.org
>> Subject: Re: [RFC PATCH net-next 1/1] net: Support for switch port
>> configuration
>> 
>> On 12/10/2014 08:50 AM, Jiri Pirko wrote:
>> > Wed, Dec 10, 2014 at 05:23:40PM CET, marco.varlese@intel.com wrote:
>> >> From: Marco Varlese <marco.varlese@intel.com>
>> >>
>> >> Switch hardware offers a list of attributes that are configurable on
>> >> a per port basis.
>> >> This patch provides a mechanism to configure switch ports by adding
>> >> an NDO for setting specific values to specific attributes.
>> >> There will be a separate patch that extends iproute2 to call the new
>> >> NDO.
>> >
>> >
>> > What are these attributes? Can you give some examples. I'm asking
>> > because there is a plan to pass generic attributes to switch ports
>> > replacing current specific ndo_switch_port_stp_update. In this case,
>> > bridge is setting that attribute.
>> >
>> > Is there need to set something directly from userspace or does it make
>> > rather sense to use involved bridge/ovs/bond ? I think that both will
>> > be needed.
>> 
>> +1
>> 
>> I think for many attributes it would be best to have both. The in kernel callers
>> and netlink userspace can use the same driver ndo_ops.
>> 
>> But then we don't _require_ any specific bridge/ovs/etc module. And we
>> may have some attributes that are not specific to any existing software
>> module. I'm guessing Marco has some examples of these.
>> 
>> [...]
>> 
>> 
>> --
>> John Fastabend         Intel Corporation
>
>We do have a need to configure the attributes directly from user-space and I have identified the tool to do that in iproute2.
>
>An example of attributes are:
>* enabling/disabling of learning of source addresses on a given port (you can imagine the attribute called LEARNING for example);
>* internal loopback control (i.e. LOOPBACK) which will control how the flow of traffic behaves from the switch fabric towards an egress port;
>* flooding for broadcast/multicast/unicast type of packets (i.e. BFLOODING, MFLOODING, UFLOODING);
>
>Some attributes would be of the type enabled/disabled while other will allow specific values to allow the user to configure different behaviours of that feature on that particular port on that platform.
>
>One thing to mention - as John stated as well - there might be some attributes that are not specific to any software module but rather have to do with the actual hardware/platform to configure.
>
>I hope this clarifies some points.

It does. Makes sense. We need to expose this attr set/get for both
in-kernel and userspace use cases.

Please adjust you patch for this. Also, as a second patch, it would be
great if you can convert ndo_switch_port_stp_update to this new ndo.

Thanks.


>
>-----------------------------------------------------------
>Marco Varlese		-	Intel Corporation
>-----------------------------------------------------------
>
>

^ permalink raw reply

* Re: [PATCH iproute2] ip: Simplify executing ip cmd within namespace
From: Nicolas Dichtel @ 2014-12-11 10:58 UTC (permalink / raw)
  To: Vadim Kochan, netdev
In-Reply-To: <1418252195-2612-1-git-send-email-vadim4j@gmail.com>

Le 10/12/2014 23:56, Vadim Kochan a écrit :
> From: Vadim Kochan <vadim4j@gmail.com>
>
> Added new '-ns' option to simplify executing following cmd:
>
>      ip netns exec NETNS ip OPTIONS COMMAND OBJECT
>
>      to
>
>      ip -ns NETNS OPTIONS COMMAND OBJECT
>
> e.g.:
>
>      ip -ns vnet0 link add br0 type bridge
>
> Signed-off-by: Vadim Kochan <vadim4j@gmail.com>
> ---
> May be new option should have better name than '-ns' ?
What about 'ip -netns' to be explicit like other options?
user may still use 'ip -n' at the end.


Regards,
Nicolas

^ permalink raw reply

* Re: [PATCH net-next 0/3] Kill arch_fast_hash
From: Herbert Xu @ 2014-12-11 10:35 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: davem, netdev, tgraf, hannes
In-Reply-To: <1418225592-29322-1-git-send-email-dborkman@redhat.com>

Daniel Borkmann <dborkman@redhat.com> wrote:
> Due to the size of changes I have based this against net-next,
> also given 3.18 is already out. I've split this into 3 parts,
> the first two to remove existing users (so they can optionally
> go to stable) and the last one to kill the remaining library bits.
> 
> Let me know if there are any issues.

Thanks for the fast response Daniel!
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* [PATCH v2 2/4 net-next] net: bcmgenet: rework Tx queue init
From: Petri Gynther @ 2014-12-11 10:20 UTC (permalink / raw)
  To: netdev; +Cc: davem, f.fainelli

1. Rename bcmgenet_init_multiq() to bcmgenet_init_tx_queues()
2. Move Tx default queue init inside bcmgenet_init_tx_queues()

Signed-off-by: Petri Gynther <pgynther@google.com>
---
 drivers/net/ethernet/broadcom/genet/bcmgenet.c | 81 +++++++++++---------------
 1 file changed, 35 insertions(+), 46 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index ea7a137..d46ff12 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -1775,78 +1775,73 @@ static int bcmgenet_init_rx_ring(struct bcmgenet_priv *priv,
 	return ret;
 }
 
-/* init multi xmit queues, only available for GENET2+
- * the queue is partitioned as follows:
+/* Initialize Tx queues
  *
- * queue 0 - 3 is priority based, each one has 32 descriptors,
+ * Queues 0-3 are priority-based, each one has 32 descriptors,
  * with queue 0 being the highest priority queue.
  *
- * queue 16 is the default tx queue with GENET_DEFAULT_BD_CNT
- * descriptors: 256 - (number of tx queues * bds per queues) = 128
- * descriptors.
+ * Queue 16 is the default Tx queue with
+ * GENET_DEFAULT_BD_CNT = 256 - 4 * 32 = 128 descriptors.
  *
- * The transmit control block pool is then partitioned as following:
- * - tx_cbs[0...127] are for queue 16
- * - tx_ring_cbs[0] points to tx_cbs[128..159]
- * - tx_ring_cbs[1] points to tx_cbs[160..191]
- * - tx_ring_cbs[2] points to tx_cbs[192..223]
- * - tx_ring_cbs[3] points to tx_cbs[224..255]
+ * The transmit control block pool is then partitioned as follows:
+ * - Tx queue 0 uses tx_cbs[0..31]
+ * - Tx queue 1 uses tx_cbs[32..63]
+ * - Tx queue 2 uses tx_cbs[64..95]
+ * - Tx queue 3 uses tx_cbs[96..127]
+ * - Tx queue 16 uses tx_cbs[128..255]
  */
-static void bcmgenet_init_multiq(struct net_device *dev)
+static void bcmgenet_init_tx_queues(struct net_device *dev)
 {
 	struct bcmgenet_priv *priv = netdev_priv(dev);
-	unsigned int i, dma_enable;
-	u32 reg, dma_ctrl, ring_cfg = 0;
+	u32 i, dma_enable;
+	u32 dma_ctrl, ring_cfg;
 	u32 dma_priority[3] = {0, 0, 0};
 
-	if (!netif_is_multiqueue(dev)) {
-		netdev_warn(dev, "called with non multi queue aware HW\n");
-		return;
-	}
-
 	dma_ctrl = bcmgenet_tdma_readl(priv, DMA_CTRL);
 	dma_enable = dma_ctrl & DMA_EN;
 	dma_ctrl &= ~DMA_EN;
 	bcmgenet_tdma_writel(priv, dma_ctrl, DMA_CTRL);
 
+	dma_ctrl = 0;
+	ring_cfg = 0;
+
 	/* Enable strict priority arbiter mode */
 	bcmgenet_tdma_writel(priv, DMA_ARBITER_SP, DMA_ARB_CTRL);
 
+	/* Initialize Tx priority queues */
 	for (i = 0; i < priv->hw_params->tx_queues; i++) {
-		/* first 64 tx_cbs are reserved for default tx queue
-		 * (ring 16)
-		 */
 		bcmgenet_init_tx_ring(priv, i, priv->hw_params->bds_cnt,
 				      i * priv->hw_params->bds_cnt,
 				      (i + 1) * priv->hw_params->bds_cnt);
-
-		/* Configure ring as descriptor ring and setup priority */
-		ring_cfg |= 1 << i;
-		dma_ctrl |= 1 << (i + DMA_RING_BUF_EN_SHIFT);
-
+		ring_cfg |= (1 << i);
+		dma_ctrl |= (1 << (i + DMA_RING_BUF_EN_SHIFT));
 		dma_priority[DMA_PRIO_REG_INDEX(i)] |=
 			((GENET_Q0_PRIORITY + i) << DMA_PRIO_REG_SHIFT(i));
 	}
 
-	/* Set ring 16 priority and program the hardware registers */
+	/* Initialize Tx default queue 16 */
+	bcmgenet_init_tx_ring(priv, DESC_INDEX, GENET_DEFAULT_BD_CNT,
+			      priv->hw_params->tx_queues *
+			      priv->hw_params->bds_cnt,
+			      TOTAL_DESC);
+	ring_cfg |= (1 << DESC_INDEX);
+	dma_ctrl |= (1 << (DESC_INDEX + DMA_RING_BUF_EN_SHIFT));
 	dma_priority[DMA_PRIO_REG_INDEX(DESC_INDEX)] |=
 		((GENET_Q0_PRIORITY + priv->hw_params->tx_queues) <<
 		 DMA_PRIO_REG_SHIFT(DESC_INDEX));
+
+	/* Set Tx queue priorities */
 	bcmgenet_tdma_writel(priv, dma_priority[0], DMA_PRIORITY_0);
 	bcmgenet_tdma_writel(priv, dma_priority[1], DMA_PRIORITY_1);
 	bcmgenet_tdma_writel(priv, dma_priority[2], DMA_PRIORITY_2);
 
-	/* Enable rings */
-	reg = bcmgenet_tdma_readl(priv, DMA_RING_CFG);
-	reg |= ring_cfg;
-	bcmgenet_tdma_writel(priv, reg, DMA_RING_CFG);
+	/* Enable Tx queues */
+	bcmgenet_tdma_writel(priv, ring_cfg, DMA_RING_CFG);
 
-	/* Configure ring as descriptor ring and re-enable DMA if enabled */
-	reg = bcmgenet_tdma_readl(priv, DMA_CTRL);
-	reg |= dma_ctrl;
+	/* Enable Tx DMA */
 	if (dma_enable)
-		reg |= DMA_EN;
-	bcmgenet_tdma_writel(priv, reg, DMA_CTRL);
+		dma_ctrl |= DMA_EN;
+	bcmgenet_tdma_writel(priv, dma_ctrl, DMA_CTRL);
 }
 
 static int bcmgenet_dma_teardown(struct bcmgenet_priv *priv)
@@ -1949,14 +1944,8 @@ static int bcmgenet_init_dma(struct bcmgenet_priv *priv)
 		return -ENOMEM;
 	}
 
-	/* initialize multi xmit queue */
-	bcmgenet_init_multiq(priv->dev);
-
-	/* initialize special ring 16 */
-	bcmgenet_init_tx_ring(priv, DESC_INDEX, GENET_DEFAULT_BD_CNT,
-			      priv->hw_params->tx_queues *
-			      priv->hw_params->bds_cnt,
-			      TOTAL_DESC);
+	/* Initialize Tx queues */
+	bcmgenet_init_tx_queues(priv->dev);
 
 	return 0;
 }
-- 
2.2.0.rc0.207.ga3a616c

^ permalink raw reply related

* [PATCH v2 4/4 net-next] net: bcmgenet: rename bcmgenet_hw_params->bds_cnt and GENET_DEFAULT_BD_CNT
From: Petri Gynther @ 2014-12-11 10:21 UTC (permalink / raw)
  To: netdev; +Cc: davem, f.fainelli

bcmgenet_hw_params->bds_cnt and GENET_DEFAULT_BD_CNT are used only in Tx init.
Rename them accordingly:
- bcmgenet_hw_params->bds_cnt => bcmgenet_hw_params->tx_bds_per_q
- GENET_DEFAULT_BD_CNT => GENET_Q16_TX_BD_CNT

Signed-off-by: Petri Gynther <pgynther@google.com>
---
 drivers/net/ethernet/broadcom/genet/bcmgenet.c | 29 +++++++++++++-------------
 drivers/net/ethernet/broadcom/genet/bcmgenet.h |  2 +-
 2 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index 4a7744b..025c8a6 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -54,8 +54,8 @@
 /* Default highest priority queue for multi queue support */
 #define GENET_Q0_PRIORITY	0
 
-#define GENET_DEFAULT_BD_CNT	\
-	(TOTAL_DESC - priv->hw_params->tx_queues * priv->hw_params->bds_cnt)
+#define GENET_Q16_TX_BD_CNT	\
+	(TOTAL_DESC - priv->hw_params->tx_queues * priv->hw_params->tx_bds_per_q)
 
 #define RX_BUF_LENGTH		2048
 #define SKB_ALIGNMENT		32
@@ -1781,7 +1781,7 @@ static int bcmgenet_init_rx_ring(struct bcmgenet_priv *priv,
  * with queue 0 being the highest priority queue.
  *
  * Queue 16 is the default Tx queue with
- * GENET_DEFAULT_BD_CNT = 256 - 4 * 32 = 128 descriptors.
+ * GENET_Q16_TX_BD_CNT = 256 - 4 * 32 = 128 descriptors.
  *
  * The transmit control block pool is then partitioned as follows:
  * - Tx queue 0 uses tx_cbs[0..31]
@@ -1810,9 +1810,9 @@ static void bcmgenet_init_tx_queues(struct net_device *dev)
 
 	/* Initialize Tx priority queues */
 	for (i = 0; i < priv->hw_params->tx_queues; i++) {
-		bcmgenet_init_tx_ring(priv, i, priv->hw_params->bds_cnt,
-				      i * priv->hw_params->bds_cnt,
-				      (i + 1) * priv->hw_params->bds_cnt);
+		bcmgenet_init_tx_ring(priv, i, priv->hw_params->tx_bds_per_q,
+				      i * priv->hw_params->tx_bds_per_q,
+				      (i + 1) * priv->hw_params->tx_bds_per_q);
 		ring_cfg |= (1 << i);
 		dma_ctrl |= (1 << (i + DMA_RING_BUF_EN_SHIFT));
 		dma_priority[DMA_PRIO_REG_INDEX(i)] |=
@@ -1820,9 +1820,9 @@ static void bcmgenet_init_tx_queues(struct net_device *dev)
 	}
 
 	/* Initialize Tx default queue 16 */
-	bcmgenet_init_tx_ring(priv, DESC_INDEX, GENET_DEFAULT_BD_CNT,
+	bcmgenet_init_tx_ring(priv, DESC_INDEX, GENET_Q16_TX_BD_CNT,
 			      priv->hw_params->tx_queues *
-			      priv->hw_params->bds_cnt,
+			      priv->hw_params->tx_bds_per_q,
 			      TOTAL_DESC);
 	ring_cfg |= (1 << DESC_INDEX);
 	dma_ctrl |= (1 << (DESC_INDEX + DMA_RING_BUF_EN_SHIFT));
@@ -2426,8 +2426,8 @@ static const struct net_device_ops bcmgenet_netdev_ops = {
 static struct bcmgenet_hw_params bcmgenet_hw_params[] = {
 	[GENET_V1] = {
 		.tx_queues = 0,
+		.tx_bds_per_q = 0,
 		.rx_queues = 0,
-		.bds_cnt = 0,
 		.bp_in_en_shift = 16,
 		.bp_in_mask = 0xffff,
 		.hfb_filter_cnt = 16,
@@ -2439,8 +2439,8 @@ static struct bcmgenet_hw_params bcmgenet_hw_params[] = {
 	},
 	[GENET_V2] = {
 		.tx_queues = 4,
+		.tx_bds_per_q = 32,
 		.rx_queues = 4,
-		.bds_cnt = 32,
 		.bp_in_en_shift = 16,
 		.bp_in_mask = 0xffff,
 		.hfb_filter_cnt = 16,
@@ -2455,8 +2455,8 @@ static struct bcmgenet_hw_params bcmgenet_hw_params[] = {
 	},
 	[GENET_V3] = {
 		.tx_queues = 4,
+		.tx_bds_per_q = 32,
 		.rx_queues = 4,
-		.bds_cnt = 32,
 		.bp_in_en_shift = 17,
 		.bp_in_mask = 0x1ffff,
 		.hfb_filter_cnt = 48,
@@ -2471,8 +2471,8 @@ static struct bcmgenet_hw_params bcmgenet_hw_params[] = {
 	},
 	[GENET_V4] = {
 		.tx_queues = 4,
+		.tx_bds_per_q = 32,
 		.rx_queues = 4,
-		.bds_cnt = 32,
 		.bp_in_en_shift = 17,
 		.bp_in_mask = 0x1ffff,
 		.hfb_filter_cnt = 48,
@@ -2572,14 +2572,15 @@ static void bcmgenet_set_hw_params(struct bcmgenet_priv *priv)
 #endif
 
 	pr_debug("Configuration for version: %d\n"
-		"TXq: %1d, RXq: %1d, BDs: %1d\n"
+		"TXq: %1d, TXqBDs: %1d, RXq: %1d\n"
 		"BP << en: %2d, BP msk: 0x%05x\n"
 		"HFB count: %2d, QTAQ msk: 0x%05x\n"
 		"TBUF: 0x%04x, HFB: 0x%04x, HFBreg: 0x%04x\n"
 		"RDMA: 0x%05x, TDMA: 0x%05x\n"
 		"Words/BD: %d\n",
 		priv->version,
-		params->tx_queues, params->rx_queues, params->bds_cnt,
+		params->tx_queues, params->tx_bds_per_q,
+		params->rx_queues,
 		params->bp_in_en_shift, params->bp_in_mask,
 		params->hfb_filter_cnt, params->qtag_mask,
 		params->tbuf_offset, params->hfb_offset,
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.h b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
index b36ddec..3a8a90f 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.h
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
@@ -503,8 +503,8 @@ enum bcmgenet_version {
  */
 struct bcmgenet_hw_params {
 	u8		tx_queues;
+	u8		tx_bds_per_q;
 	u8		rx_queues;
-	u8		bds_cnt;
 	u8		bp_in_en_shift;
 	u32		bp_in_mask;
 	u8		hfb_filter_cnt;
-- 
2.2.0.rc0.207.ga3a616c

^ permalink raw reply related

* [PATCH v2 3/4 net-next] net: bcmgenet: precalculate TxCB->bd_addr
From: Petri Gynther @ 2014-12-11 10:20 UTC (permalink / raw)
  To: netdev; +Cc: davem, f.fainelli

There is 1-to-1 mapping between TxCBs and TxBDs. Precalculate TxCB->bd_addr
once in bcmgenet_init_dma() instead of doing it over and over needlessly in
bcmgenet_get_txcb().

Signed-off-by: Petri Gynther <pgynther@google.com>
---
 drivers/net/ethernet/broadcom/genet/bcmgenet.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index d46ff12..4a7744b 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -920,7 +920,7 @@ static struct enet_cb *bcmgenet_get_txcb(struct bcmgenet_priv *priv,
 
 	tx_cb_ptr = ring->cbs;
 	tx_cb_ptr += ring->write_ptr - ring->cb_ptr;
-	tx_cb_ptr->bd_addr = priv->tx_bds + ring->write_ptr * DMA_DESC_SIZE;
+
 	/* Advancing local write pointer */
 	if (ring->write_ptr == ring->end_ptr)
 		ring->write_ptr = ring->cb_ptr;
@@ -1918,6 +1918,8 @@ static void bcmgenet_fini_dma(struct bcmgenet_priv *priv)
 static int bcmgenet_init_dma(struct bcmgenet_priv *priv)
 {
 	int ret;
+	unsigned int i;
+	struct enet_cb *cb;
 
 	netif_dbg(priv, hw, priv->dev, "bcmgenet: init_edma\n");
 
@@ -1944,6 +1946,11 @@ static int bcmgenet_init_dma(struct bcmgenet_priv *priv)
 		return -ENOMEM;
 	}
 
+	for (i = 0; i < priv->num_tx_bds; i++) {
+		cb = priv->tx_cbs + i;
+		cb->bd_addr = priv->tx_bds + i * DMA_DESC_SIZE;
+	}
+
 	/* Initialize Tx queues */
 	bcmgenet_init_tx_queues(priv->dev);
 
-- 
2.2.0.rc0.207.ga3a616c

^ permalink raw reply related

* [PATCH v2 1/4 net-next] net: bcmgenet: bcmgenet_init_tx_ring() cleanup
From: Petri Gynther @ 2014-12-11 10:20 UTC (permalink / raw)
  To: netdev; +Cc: davem, f.fainelli

Signed-off-by: Petri Gynther <pgynther@google.com>
---
 drivers/net/ethernet/broadcom/genet/bcmgenet.c | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index 7078bd3..ea7a137 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -1680,17 +1680,14 @@ static int init_umac(struct bcmgenet_priv *priv)
 	return 0;
 }
 
-/* Initialize all house-keeping variables for a TX ring, along
- * with corresponding hardware registers
- */
+/* Initialize a Tx ring along with corresponding hardware registers */
 static void bcmgenet_init_tx_ring(struct bcmgenet_priv *priv,
 				  unsigned int index, unsigned int size,
-				  unsigned int write_ptr, unsigned int end_ptr)
+				  unsigned int start_ptr, unsigned int end_ptr)
 {
 	struct bcmgenet_tx_ring *ring = &priv->tx_rings[index];
 	u32 words_per_bd = WORDS_PER_BD(priv);
 	u32 flow_period_val = 0;
-	unsigned int first_bd;
 
 	spin_lock_init(&ring->lock);
 	ring->index = index;
@@ -1703,12 +1700,12 @@ static void bcmgenet_init_tx_ring(struct bcmgenet_priv *priv,
 		ring->int_enable = bcmgenet_tx_ring_int_enable;
 		ring->int_disable = bcmgenet_tx_ring_int_disable;
 	}
-	ring->cbs = priv->tx_cbs + write_ptr;
+	ring->cbs = priv->tx_cbs + start_ptr;
 	ring->size = size;
 	ring->c_index = 0;
 	ring->free_bds = size;
-	ring->write_ptr = write_ptr;
-	ring->cb_ptr = write_ptr;
+	ring->write_ptr = start_ptr;
+	ring->cb_ptr = start_ptr;
 	ring->end_ptr = end_ptr - 1;
 	ring->prod_index = 0;
 
@@ -1719,22 +1716,18 @@ static void bcmgenet_init_tx_ring(struct bcmgenet_priv *priv,
 	bcmgenet_tdma_ring_writel(priv, index, 0, TDMA_PROD_INDEX);
 	bcmgenet_tdma_ring_writel(priv, index, 0, TDMA_CONS_INDEX);
 	bcmgenet_tdma_ring_writel(priv, index, 1, DMA_MBUF_DONE_THRESH);
-	/* Disable rate control for now */
 	bcmgenet_tdma_ring_writel(priv, index, flow_period_val,
 				  TDMA_FLOW_PERIOD);
-	/* Unclassified traffic goes to ring 16 */
 	bcmgenet_tdma_ring_writel(priv, index,
 				  ((size << DMA_RING_SIZE_SHIFT) |
 				   RX_BUF_LENGTH), DMA_RING_BUF_SIZE);
 
-	first_bd = write_ptr;
-
 	/* Set start and end address, read and write pointers */
-	bcmgenet_tdma_ring_writel(priv, index, first_bd * words_per_bd,
+	bcmgenet_tdma_ring_writel(priv, index, start_ptr * words_per_bd,
 				  DMA_START_ADDR);
-	bcmgenet_tdma_ring_writel(priv, index, first_bd * words_per_bd,
+	bcmgenet_tdma_ring_writel(priv, index, start_ptr * words_per_bd,
 				  TDMA_READ_PTR);
-	bcmgenet_tdma_ring_writel(priv, index, first_bd,
+	bcmgenet_tdma_ring_writel(priv, index, start_ptr * words_per_bd,
 				  TDMA_WRITE_PTR);
 	bcmgenet_tdma_ring_writel(priv, index, end_ptr * words_per_bd - 1,
 				  DMA_END_ADDR);
-- 
2.2.0.rc0.207.ga3a616c

^ permalink raw reply related

* Re: [RFC PATCH 0/3] Faster than SLAB caching of SKBs with qmempool (backed by alf_queue)
From: Jesper Dangaard Brouer @ 2014-12-11 10:18 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: netdev, linux-kernel, linux-mm, linux-api, Eric Dumazet,
	David S. Miller, Hannes Frederic Sowa, Alexander Duyck,
	Alexei Starovoitov, Paul E. McKenney, Mathieu Desnoyers,
	Steven Rostedt, brouer
In-Reply-To: <alpine.DEB.2.11.1412101339480.22982@gentwo.org>

On Wed, 10 Dec 2014 13:51:32 -0600 (CST)
Christoph Lameter <cl@linux.com> wrote:

> On Wed, 10 Dec 2014, Jesper Dangaard Brouer wrote:
> 
> > One of the building blocks for achieving this speedup is a cmpxchg
> > based Lock-Free queue that supports bulking, named alf_queue for
> > Array-based Lock-Free queue.  By bulking elements (pointers) from the
> > queue, the cost of the cmpxchg (approx 8 ns) is amortized over several
> > elements.
> 
> This is a bit of an issue since the design of the SLUB allocator is such
> that you should pick up an object, apply some processing and then take the
> next one. The fetching of an object warms up the first cacheline and this
> is tied into the way free objects are linked in SLUB.
> 
> So a bulk fetch from SLUB will not that effective and cause the touching
> of many cachelines if we are dealing with just a few objects. If we are
> looking at whole slab pages with all objects then SLUB can be effective
> since we do not have to build up the linked pointer structure in each
> page. SLAB has a different architecture there and a bulk fetch there is
> possible without touching objects even for small sets since the freelist
> management is separate from the objects.
> 
> If you do this bulking then you will later access cache cold objects?
> Doesnt that negate the benefit that you gain? Or are these objects written
> to by hardware and therefore by necessity cache cold?

Cache warmup is a concern, but perhaps it's the callers responsibility
to prefetch for their use-case.  For qmempool I do have patches that
prefetch elems when going from the sharedq to the localq (per CPU), but
I didn't see much gain, and I could prove my point (of being faster than
slab) without it.  And I would use/need the slab bulk interface to add
elems to sharedq which I consider semi-cache cold.


> We could provide a faster bulk alloc/free function.
> 
> 	int kmem_cache_alloc_array(struct kmem_cache *s, gfp_t flags,
> 		size_t objects, void **array)

I like it :-)

> and this could be optimized by each slab allocator to provide fast
> population of objects in that array. We then assume that the number of
> objects is in the hundreds or so right?

I'm already seeing a benefit with 16 packets alloc/free "bulking".

On RX we have a "budget" of 64 packets/descriptors (taken from the NIC
RX ring) that need SKBs.

On TX packets are put into the TX ring, and later at TX completion the
TX ring is cleaned up, as many as 256 (as e.g. in the ixgbe driver).

Scientific articles on userspace networking (like netmap) report that
they need at least 8 packet bulking to see wirespeed 10G at 64 bytes.


> The corresponding free function
> 
> 	void kmem_cache_free_array(struct kmem_cache *s,
> 		size_t objects, void **array)
> 
> 
> I think the queue management of the array can be improved by using a
> similar technique as used the SLUB allocator using the cmpxchg_local.
> cmpxchg_local is much faster than a full cmpxchg and we are operating on
> per cpu structures anyways. So the overhead could still be reduced.

I think you missed that the per cpu localq is already not using cmpxchg
(it is a SPSC queue).  The sharedq (MPMC queue) does need and use the
locked cmpxchg.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Sr. Network Kernel Developer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH v2 1/1] net/macb: add TX multiqueue support for gem
From: Cyrille Pitchen @ 2014-12-11 10:16 UTC (permalink / raw)
  To: nicolas.ferre, davem, linux-arm-kernel, netdev, soren.brinkmann
  Cc: linux-kernel, Cyrille Pitchen
In-Reply-To: <cover.1418291637.git.cyrille.pitchen@atmel.com>

gem devices designed with multiqueue CANNOT work without this patch.

When probing a gem device, the driver must first prepare and enable the
peripheral clock before accessing I/O registers. The second step is to read the
MID register to find whether the device is a gem or an old macb IP.
For gem devices, it reads the Design Configuration Register 6 (DCFG6) to
compute to total number of queues, whereas macb devices always have a single
queue.
Only then it can call alloc_etherdev_mq() with the correct number of queues.
This is the reason why the order of some initializations has been changed in
macb_probe().
Eventually, the dedicated IRQ and TX ring buffer descriptors are initialized
for each queue.

For backward compatibility reasons, queue0 uses the legacy registers ISR, IER,
IDR, IMR, TBQP and RBQP. On the other hand, the other queues use new registers
ISR[1..7], IER[1..7], IDR[1..7], IMR[1..7], TBQP[1..7] and RBQP[1..7].
Except this hardware detail there is no real difference between queue0 and the
others. The driver hides that thanks to the struct macb_queue.
This structure allows us to share a common set of functions for all the queues.

Besides when a TX error occurs, the gem MUST be halted before writing any of
the TBQP registers to reset the relevant queue. An immediate side effect is
that the other queues too aren't processed anymore by the gem.
So macb_tx_error_task() calls netif_tx_stop_all_queues() to notify the Linux
network engine that all transmissions are stopped.

Also macb_tx_error_task() now calls spin_lock_irqsave() to prevent the
interrupt handlers of the other queues from running as each of them may wake
its associated queue up (please refer to macb_tx_interrupt()).

Finally, as all queues have previously been stopped, they should be restarted
calling netif_tx_start_all_queues() and setting the TSTART bit into the Network
Control Register. Before this patch, when dealing with a single queue, the
driver used to defer the reset of the faulting queue and the write of the
TSTART bit until the next call of macb_start_xmit().
As explained before, this bit is now set by macb_tx_error_task() too. That's
why the faulting queue MUST be reset by setting the TX_USED bit in its first
buffer descriptor before writing the TSTART bit.

Queue 0 always exits and is the lowest priority when other queues are available.
The higher the index of the queue is, the higher its priority is.

When transmitting frames, the TX queue is selected by the skb->queue_mapping
value. So queue discipline can be used to define the queue priority policy.

Signed-off-by: Cyrille Pitchen <cyrille.pitchen@atmel.com>
---
 drivers/net/ethernet/cadence/macb.c | 451 +++++++++++++++++++++++-------------
 drivers/net/ethernet/cadence/macb.h |  71 +++++-
 2 files changed, 358 insertions(+), 164 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c
index 41113e5..ab8116f 100644
--- a/drivers/net/ethernet/cadence/macb.c
+++ b/drivers/net/ethernet/cadence/macb.c
@@ -66,23 +66,25 @@ static unsigned int macb_tx_ring_wrap(unsigned int index)
 	return index & (TX_RING_SIZE - 1);
 }
 
-static struct macb_dma_desc *macb_tx_desc(struct macb *bp, unsigned int index)
+static struct macb_dma_desc *macb_tx_desc(struct macb_queue *queue,
+					  unsigned int index)
 {
-	return &bp->tx_ring[macb_tx_ring_wrap(index)];
+	return &queue->tx_ring[macb_tx_ring_wrap(index)];
 }
 
-static struct macb_tx_skb *macb_tx_skb(struct macb *bp, unsigned int index)
+static struct macb_tx_skb *macb_tx_skb(struct macb_queue *queue,
+				       unsigned int index)
 {
-	return &bp->tx_skb[macb_tx_ring_wrap(index)];
+	return &queue->tx_skb[macb_tx_ring_wrap(index)];
 }
 
-static dma_addr_t macb_tx_dma(struct macb *bp, unsigned int index)
+static dma_addr_t macb_tx_dma(struct macb_queue *queue, unsigned int index)
 {
 	dma_addr_t offset;
 
 	offset = macb_tx_ring_wrap(index) * sizeof(struct macb_dma_desc);
 
-	return bp->tx_ring_dma + offset;
+	return queue->tx_ring_dma + offset;
 }
 
 static unsigned int macb_rx_ring_wrap(unsigned int index)
@@ -490,38 +492,50 @@ static void macb_tx_unmap(struct macb *bp, struct macb_tx_skb *tx_skb)
 
 static void macb_tx_error_task(struct work_struct *work)
 {
-	struct macb	*bp = container_of(work, struct macb, tx_error_task);
+	struct macb_queue	*queue = container_of(work, struct macb_queue,
+						      tx_error_task);
+	struct macb		*bp = queue->bp;
 	struct macb_tx_skb	*tx_skb;
+	struct macb_dma_desc	*desc;
 	struct sk_buff		*skb;
 	unsigned int		tail;
+	unsigned long		flags;
+
+	netdev_vdbg(bp->dev, "macb_tx_error_task: q = %u, t = %u, h = %u\n",
+		    (unsigned int)(queue - bp->queues),
+		    queue->tx_tail, queue->tx_head);
 
-	netdev_vdbg(bp->dev, "macb_tx_error_task: t = %u, h = %u\n",
-		    bp->tx_tail, bp->tx_head);
+	/*
+	 * Prevent the queue IRQ handlers from running: each of them may call
+	 * macb_tx_interrupt(), which in turn may call netif_wake_subqueue().
+	 * As explained below, we have to halt the transmission before updating
+	 * TBQP registers so we call netif_tx_stop_all_queues() to notify the
+	 * network engine about the macb/gem being halted.
+	 */
+	spin_lock_irqsave(&bp->lock, flags);
 
 	/* Make sure nobody is trying to queue up new packets */
-	netif_stop_queue(bp->dev);
+	netif_tx_stop_all_queues(bp->dev);
 
 	/*
 	 * Stop transmission now
 	 * (in case we have just queued new packets)
+	 * macb/gem must be halted to write TBQP register
 	 */
 	if (macb_halt_tx(bp))
 		/* Just complain for now, reinitializing TX path can be good */
 		netdev_err(bp->dev, "BUG: halt tx timed out\n");
 
-	/* No need for the lock here as nobody will interrupt us anymore */
-
 	/*
 	 * Treat frames in TX queue including the ones that caused the error.
 	 * Free transmit buffers in upper layer.
 	 */
-	for (tail = bp->tx_tail; tail != bp->tx_head; tail++) {
-		struct macb_dma_desc	*desc;
-		u32			ctrl;
+	for (tail = queue->tx_tail; tail != queue->tx_head; tail++) {
+		u32	ctrl;
 
-		desc = macb_tx_desc(bp, tail);
+		desc = macb_tx_desc(queue, tail);
 		ctrl = desc->ctrl;
-		tx_skb = macb_tx_skb(bp, tail);
+		tx_skb = macb_tx_skb(queue, tail);
 		skb = tx_skb->skb;
 
 		if (ctrl & MACB_BIT(TX_USED)) {
@@ -529,7 +543,7 @@ static void macb_tx_error_task(struct work_struct *work)
 			while (!skb) {
 				macb_tx_unmap(bp, tx_skb);
 				tail++;
-				tx_skb = macb_tx_skb(bp, tail);
+				tx_skb = macb_tx_skb(queue, tail);
 				skb = tx_skb->skb;
 			}
 
@@ -558,45 +572,56 @@ static void macb_tx_error_task(struct work_struct *work)
 		macb_tx_unmap(bp, tx_skb);
 	}
 
+	/* Set end of TX queue */
+	desc = macb_tx_desc(queue, 0);
+	desc->addr = 0;
+	desc->ctrl = MACB_BIT(TX_USED);
+
 	/* Make descriptor updates visible to hardware */
 	wmb();
 
 	/* Reinitialize the TX desc queue */
-	macb_writel(bp, TBQP, bp->tx_ring_dma);
+	queue_writel(queue, TBQP, queue->tx_ring_dma);
 	/* Make TX ring reflect state of hardware */
-	bp->tx_head = bp->tx_tail = 0;
-
-	/* Now we are ready to start transmission again */
-	netif_wake_queue(bp->dev);
+	queue->tx_head = 0;
+	queue->tx_tail = 0;
 
 	/* Housework before enabling TX IRQ */
 	macb_writel(bp, TSR, macb_readl(bp, TSR));
-	macb_writel(bp, IER, MACB_TX_INT_FLAGS);
+	queue_writel(queue, IER, MACB_TX_INT_FLAGS);
+
+	/* Now we are ready to start transmission again */
+	netif_tx_start_all_queues(bp->dev);
+	macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
+
+	spin_unlock_irqrestore(&bp->lock, flags);
 }
 
-static void macb_tx_interrupt(struct macb *bp)
+static void macb_tx_interrupt(struct macb_queue *queue)
 {
 	unsigned int tail;
 	unsigned int head;
 	u32 status;
+	struct macb *bp = queue->bp;
+	u16 queue_index = queue - bp->queues;
 
 	status = macb_readl(bp, TSR);
 	macb_writel(bp, TSR, status);
 
 	if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
-		macb_writel(bp, ISR, MACB_BIT(TCOMP));
+		queue_writel(queue, ISR, MACB_BIT(TCOMP));
 
 	netdev_vdbg(bp->dev, "macb_tx_interrupt status = 0x%03lx\n",
 		(unsigned long)status);
 
-	head = bp->tx_head;
-	for (tail = bp->tx_tail; tail != head; tail++) {
+	head = queue->tx_head;
+	for (tail = queue->tx_tail; tail != head; tail++) {
 		struct macb_tx_skb	*tx_skb;
 		struct sk_buff		*skb;
 		struct macb_dma_desc	*desc;
 		u32			ctrl;
 
-		desc = macb_tx_desc(bp, tail);
+		desc = macb_tx_desc(queue, tail);
 
 		/* Make hw descriptor updates visible to CPU */
 		rmb();
@@ -611,7 +636,7 @@ static void macb_tx_interrupt(struct macb *bp)
 
 		/* Process all buffers of the current transmitted frame */
 		for (;; tail++) {
-			tx_skb = macb_tx_skb(bp, tail);
+			tx_skb = macb_tx_skb(queue, tail);
 			skb = tx_skb->skb;
 
 			/* First, update TX stats if needed */
@@ -634,11 +659,11 @@ static void macb_tx_interrupt(struct macb *bp)
 		}
 	}
 
-	bp->tx_tail = tail;
-	if (netif_queue_stopped(bp->dev)
-			&& CIRC_CNT(bp->tx_head, bp->tx_tail,
-				    TX_RING_SIZE) <= MACB_TX_WAKEUP_THRESH)
-		netif_wake_queue(bp->dev);
+	queue->tx_tail = tail;
+	if (__netif_subqueue_stopped(bp->dev, queue_index) &&
+	    CIRC_CNT(queue->tx_head, queue->tx_tail,
+		     TX_RING_SIZE) <= MACB_TX_WAKEUP_THRESH)
+		netif_wake_subqueue(bp->dev, queue_index);
 }
 
 static void gem_rx_refill(struct macb *bp)
@@ -949,11 +974,12 @@ static int macb_poll(struct napi_struct *napi, int budget)
 
 static irqreturn_t macb_interrupt(int irq, void *dev_id)
 {
-	struct net_device *dev = dev_id;
-	struct macb *bp = netdev_priv(dev);
+	struct macb_queue *queue = dev_id;
+	struct macb *bp = queue->bp;
+	struct net_device *dev = bp->dev;
 	u32 status;
 
-	status = macb_readl(bp, ISR);
+	status = queue_readl(queue, ISR);
 
 	if (unlikely(!status))
 		return IRQ_NONE;
@@ -963,11 +989,13 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
 	while (status) {
 		/* close possible race with dev_close */
 		if (unlikely(!netif_running(dev))) {
-			macb_writel(bp, IDR, -1);
+			queue_writel(queue, IDR, -1);
 			break;
 		}
 
-		netdev_vdbg(bp->dev, "isr = 0x%08lx\n", (unsigned long)status);
+		netdev_vdbg(bp->dev, "queue = %u, isr = 0x%08lx\n",
+			    (unsigned int)(queue - bp->queues),
+			    (unsigned long)status);
 
 		if (status & MACB_RX_INT_FLAGS) {
 			/*
@@ -977,9 +1005,9 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
 			 * is already scheduled, so disable interrupts
 			 * now.
 			 */
-			macb_writel(bp, IDR, MACB_RX_INT_FLAGS);
+			queue_writel(queue, IDR, MACB_RX_INT_FLAGS);
 			if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
-				macb_writel(bp, ISR, MACB_BIT(RCOMP));
+				queue_writel(queue, ISR, MACB_BIT(RCOMP));
 
 			if (napi_schedule_prep(&bp->napi)) {
 				netdev_vdbg(bp->dev, "scheduling RX softirq\n");
@@ -988,17 +1016,17 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
 		}
 
 		if (unlikely(status & (MACB_TX_ERR_FLAGS))) {
-			macb_writel(bp, IDR, MACB_TX_INT_FLAGS);
-			schedule_work(&bp->tx_error_task);
+			queue_writel(queue, IDR, MACB_TX_INT_FLAGS);
+			schedule_work(&queue->tx_error_task);
 
 			if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
-				macb_writel(bp, ISR, MACB_TX_ERR_FLAGS);
+				queue_writel(queue, ISR, MACB_TX_ERR_FLAGS);
 
 			break;
 		}
 
 		if (status & MACB_BIT(TCOMP))
-			macb_tx_interrupt(bp);
+			macb_tx_interrupt(queue);
 
 		/*
 		 * Link change detection isn't possible with RMII, so we'll
@@ -1013,7 +1041,7 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
 				bp->hw_stats.macb.rx_overruns++;
 
 			if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
-				macb_writel(bp, ISR, MACB_BIT(ISR_ROVR));
+				queue_writel(queue, ISR, MACB_BIT(ISR_ROVR));
 		}
 
 		if (status & MACB_BIT(HRESP)) {
@@ -1025,10 +1053,10 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
 			netdev_err(dev, "DMA bus error: HRESP not OK\n");
 
 			if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
-				macb_writel(bp, ISR, MACB_BIT(HRESP));
+				queue_writel(queue, ISR, MACB_BIT(HRESP));
 		}
 
-		status = macb_readl(bp, ISR);
+		status = queue_readl(queue, ISR);
 	}
 
 	spin_unlock(&bp->lock);
@@ -1043,10 +1071,14 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
  */
 static void macb_poll_controller(struct net_device *dev)
 {
+	struct macb *bp = netdev_priv(dev);
+	struct macb_queue *queue;
 	unsigned long flags;
+	unsigned int q;
 
 	local_irq_save(flags);
-	macb_interrupt(dev->irq, dev);
+	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue)
+		macb_interrupt(dev->irq, queue);
 	local_irq_restore(flags);
 }
 #endif
@@ -1058,10 +1090,11 @@ static inline unsigned int macb_count_tx_descriptors(struct macb *bp,
 }
 
 static unsigned int macb_tx_map(struct macb *bp,
+				struct macb_queue *queue,
 				struct sk_buff *skb)
 {
 	dma_addr_t mapping;
-	unsigned int len, entry, i, tx_head = bp->tx_head;
+	unsigned int len, entry, i, tx_head = queue->tx_head;
 	struct macb_tx_skb *tx_skb = NULL;
 	struct macb_dma_desc *desc;
 	unsigned int offset, size, count = 0;
@@ -1075,7 +1108,7 @@ static unsigned int macb_tx_map(struct macb *bp,
 	while (len) {
 		size = min(len, bp->max_tx_length);
 		entry = macb_tx_ring_wrap(tx_head);
-		tx_skb = &bp->tx_skb[entry];
+		tx_skb = &queue->tx_skb[entry];
 
 		mapping = dma_map_single(&bp->pdev->dev,
 					 skb->data + offset,
@@ -1104,7 +1137,7 @@ static unsigned int macb_tx_map(struct macb *bp,
 		while (len) {
 			size = min(len, bp->max_tx_length);
 			entry = macb_tx_ring_wrap(tx_head);
-			tx_skb = &bp->tx_skb[entry];
+			tx_skb = &queue->tx_skb[entry];
 
 			mapping = skb_frag_dma_map(&bp->pdev->dev, frag,
 						   offset, size, DMA_TO_DEVICE);
@@ -1143,14 +1176,14 @@ static unsigned int macb_tx_map(struct macb *bp,
 	i = tx_head;
 	entry = macb_tx_ring_wrap(i);
 	ctrl = MACB_BIT(TX_USED);
-	desc = &bp->tx_ring[entry];
+	desc = &queue->tx_ring[entry];
 	desc->ctrl = ctrl;
 
 	do {
 		i--;
 		entry = macb_tx_ring_wrap(i);
-		tx_skb = &bp->tx_skb[entry];
-		desc = &bp->tx_ring[entry];
+		tx_skb = &queue->tx_skb[entry];
+		desc = &queue->tx_ring[entry];
 
 		ctrl = (u32)tx_skb->size;
 		if (eof) {
@@ -1167,17 +1200,17 @@ static unsigned int macb_tx_map(struct macb *bp,
 		 */
 		wmb();
 		desc->ctrl = ctrl;
-	} while (i != bp->tx_head);
+	} while (i != queue->tx_head);
 
-	bp->tx_head = tx_head;
+	queue->tx_head = tx_head;
 
 	return count;
 
 dma_error:
 	netdev_err(bp->dev, "TX DMA map failed\n");
 
-	for (i = bp->tx_head; i != tx_head; i++) {
-		tx_skb = macb_tx_skb(bp, i);
+	for (i = queue->tx_head; i != tx_head; i++) {
+		tx_skb = macb_tx_skb(queue, i);
 
 		macb_tx_unmap(bp, tx_skb);
 	}
@@ -1187,14 +1220,16 @@ dma_error:
 
 static int macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
+	u16 queue_index = skb_get_queue_mapping(skb);
 	struct macb *bp = netdev_priv(dev);
+	struct macb_queue *queue = &bp->queues[queue_index];
 	unsigned long flags;
 	unsigned int count, nr_frags, frag_size, f;
 
 #if defined(DEBUG) && defined(VERBOSE_DEBUG)
 	netdev_vdbg(bp->dev,
-		   "start_xmit: len %u head %p data %p tail %p end %p\n",
-		   skb->len, skb->head, skb->data,
+		   "start_xmit: queue %hu len %u head %p data %p tail %p end %p\n",
+		   queue_index, skb->len, skb->head, skb->data,
 		   skb_tail_pointer(skb), skb_end_pointer(skb));
 	print_hex_dump(KERN_DEBUG, "data: ", DUMP_PREFIX_OFFSET, 16, 1,
 		       skb->data, 16, true);
@@ -1214,16 +1249,16 @@ static int macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	spin_lock_irqsave(&bp->lock, flags);
 
 	/* This is a hard error, log it. */
-	if (CIRC_SPACE(bp->tx_head, bp->tx_tail, TX_RING_SIZE) < count) {
-		netif_stop_queue(dev);
+	if (CIRC_SPACE(queue->tx_head, queue->tx_tail, TX_RING_SIZE) < count) {
+		netif_stop_subqueue(dev, queue_index);
 		spin_unlock_irqrestore(&bp->lock, flags);
 		netdev_dbg(bp->dev, "tx_head = %u, tx_tail = %u\n",
-			   bp->tx_head, bp->tx_tail);
+			   queue->tx_head, queue->tx_tail);
 		return NETDEV_TX_BUSY;
 	}
 
 	/* Map socket buffer for DMA transfer */
-	if (!macb_tx_map(bp, skb)) {
+	if (!macb_tx_map(bp, queue, skb)) {
 		dev_kfree_skb_any(skb);
 		goto unlock;
 	}
@@ -1235,8 +1270,8 @@ static int macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
 
-	if (CIRC_SPACE(bp->tx_head, bp->tx_tail, TX_RING_SIZE) < 1)
-		netif_stop_queue(dev);
+	if (CIRC_SPACE(queue->tx_head, queue->tx_tail, TX_RING_SIZE) < 1)
+		netif_stop_subqueue(dev, queue_index);
 
 unlock:
 	spin_unlock_irqrestore(&bp->lock, flags);
@@ -1304,20 +1339,24 @@ static void macb_free_rx_buffers(struct macb *bp)
 
 static void macb_free_consistent(struct macb *bp)
 {
-	if (bp->tx_skb) {
-		kfree(bp->tx_skb);
-		bp->tx_skb = NULL;
-	}
+	struct macb_queue *queue;
+	unsigned int q;
+
 	bp->macbgem_ops.mog_free_rx_buffers(bp);
 	if (bp->rx_ring) {
 		dma_free_coherent(&bp->pdev->dev, RX_RING_BYTES,
 				  bp->rx_ring, bp->rx_ring_dma);
 		bp->rx_ring = NULL;
 	}
-	if (bp->tx_ring) {
-		dma_free_coherent(&bp->pdev->dev, TX_RING_BYTES,
-				  bp->tx_ring, bp->tx_ring_dma);
-		bp->tx_ring = NULL;
+
+	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
+		kfree(queue->tx_skb);
+		queue->tx_skb = NULL;
+		if (queue->tx_ring) {
+			dma_free_coherent(&bp->pdev->dev, TX_RING_BYTES,
+					  queue->tx_ring, queue->tx_ring_dma);
+			queue->tx_ring = NULL;
+		}
 	}
 }
 
@@ -1354,12 +1393,27 @@ static int macb_alloc_rx_buffers(struct macb *bp)
 
 static int macb_alloc_consistent(struct macb *bp)
 {
+	struct macb_queue *queue;
+	unsigned int q;
 	int size;
 
-	size = TX_RING_SIZE * sizeof(struct macb_tx_skb);
-	bp->tx_skb = kmalloc(size, GFP_KERNEL);
-	if (!bp->tx_skb)
-		goto out_err;
+	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
+		size = TX_RING_BYTES;
+		queue->tx_ring = dma_alloc_coherent(&bp->pdev->dev, size,
+						    &queue->tx_ring_dma,
+						    GFP_KERNEL);
+		if (!queue->tx_ring)
+			goto out_err;
+		netdev_dbg(bp->dev,
+			   "Allocated TX ring for queue %u of %d bytes at %08lx (mapped %p)\n",
+			   q, size, (unsigned long)queue->tx_ring_dma,
+			   queue->tx_ring);
+
+		size = TX_RING_SIZE * sizeof(struct macb_tx_skb);
+		queue->tx_skb = kmalloc(size, GFP_KERNEL);
+		if (!queue->tx_skb)
+			goto out_err;
+	}
 
 	size = RX_RING_BYTES;
 	bp->rx_ring = dma_alloc_coherent(&bp->pdev->dev, size,
@@ -1370,15 +1424,6 @@ static int macb_alloc_consistent(struct macb *bp)
 		   "Allocated RX ring of %d bytes at %08lx (mapped %p)\n",
 		   size, (unsigned long)bp->rx_ring_dma, bp->rx_ring);
 
-	size = TX_RING_BYTES;
-	bp->tx_ring = dma_alloc_coherent(&bp->pdev->dev, size,
-					 &bp->tx_ring_dma, GFP_KERNEL);
-	if (!bp->tx_ring)
-		goto out_err;
-	netdev_dbg(bp->dev,
-		   "Allocated TX ring of %d bytes at %08lx (mapped %p)\n",
-		   size, (unsigned long)bp->tx_ring_dma, bp->tx_ring);
-
 	if (bp->macbgem_ops.mog_alloc_rx_buffers(bp))
 		goto out_err;
 
@@ -1391,15 +1436,22 @@ out_err:
 
 static void gem_init_rings(struct macb *bp)
 {
+	struct macb_queue *queue;
+	unsigned int q;
 	int i;
 
-	for (i = 0; i < TX_RING_SIZE; i++) {
-		bp->tx_ring[i].addr = 0;
-		bp->tx_ring[i].ctrl = MACB_BIT(TX_USED);
+	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
+		for (i = 0; i < TX_RING_SIZE; i++) {
+			queue->tx_ring[i].addr = 0;
+			queue->tx_ring[i].ctrl = MACB_BIT(TX_USED);
+		}
+		queue->tx_ring[TX_RING_SIZE - 1].ctrl |= MACB_BIT(TX_WRAP);
+		queue->tx_head = 0;
+		queue->tx_tail = 0;
 	}
-	bp->tx_ring[TX_RING_SIZE - 1].ctrl |= MACB_BIT(TX_WRAP);
 
-	bp->rx_tail = bp->rx_prepared_head = bp->tx_head = bp->tx_tail = 0;
+	bp->rx_tail = 0;
+	bp->rx_prepared_head = 0;
 
 	gem_rx_refill(bp);
 }
@@ -1418,16 +1470,21 @@ static void macb_init_rings(struct macb *bp)
 	bp->rx_ring[RX_RING_SIZE - 1].addr |= MACB_BIT(RX_WRAP);
 
 	for (i = 0; i < TX_RING_SIZE; i++) {
-		bp->tx_ring[i].addr = 0;
-		bp->tx_ring[i].ctrl = MACB_BIT(TX_USED);
+		bp->queues[0].tx_ring[i].addr = 0;
+		bp->queues[0].tx_ring[i].ctrl = MACB_BIT(TX_USED);
+		bp->queues[0].tx_head = 0;
+		bp->queues[0].tx_tail = 0;
 	}
-	bp->tx_ring[TX_RING_SIZE - 1].ctrl |= MACB_BIT(TX_WRAP);
+	bp->queues[0].tx_ring[TX_RING_SIZE - 1].ctrl |= MACB_BIT(TX_WRAP);
 
-	bp->rx_tail = bp->tx_head = bp->tx_tail = 0;
+	bp->rx_tail = 0;
 }
 
 static void macb_reset_hw(struct macb *bp)
 {
+	struct macb_queue *queue;
+	unsigned int q;
+
 	/*
 	 * Disable RX and TX (XXX: Should we halt the transmission
 	 * more gracefully?)
@@ -1442,8 +1499,10 @@ static void macb_reset_hw(struct macb *bp)
 	macb_writel(bp, RSR, -1);
 
 	/* Disable all interrupts */
-	macb_writel(bp, IDR, -1);
-	macb_readl(bp, ISR);
+	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
+		queue_writel(queue, IDR, -1);
+		queue_readl(queue, ISR);
+	}
 }
 
 static u32 gem_mdc_clk_div(struct macb *bp)
@@ -1540,6 +1599,9 @@ static void macb_configure_dma(struct macb *bp)
 
 static void macb_init_hw(struct macb *bp)
 {
+	struct macb_queue *queue;
+	unsigned int q;
+
 	u32 config;
 
 	macb_reset_hw(bp);
@@ -1565,16 +1627,18 @@ static void macb_init_hw(struct macb *bp)
 
 	/* Initialize TX and RX buffers */
 	macb_writel(bp, RBQP, bp->rx_ring_dma);
-	macb_writel(bp, TBQP, bp->tx_ring_dma);
+	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
+		queue_writel(queue, TBQP, queue->tx_ring_dma);
+
+		/* Enable interrupts */
+		queue_writel(queue, IER,
+			     MACB_RX_INT_FLAGS |
+			     MACB_TX_INT_FLAGS |
+			     MACB_BIT(HRESP));
+	}
 
 	/* Enable TX and RX */
 	macb_writel(bp, NCR, MACB_BIT(RE) | MACB_BIT(TE) | MACB_BIT(MPE));
-
-	/* Enable interrupts */
-	macb_writel(bp, IER, (MACB_RX_INT_FLAGS
-			      | MACB_TX_INT_FLAGS
-			      | MACB_BIT(HRESP)));
-
 }
 
 /*
@@ -1736,7 +1800,7 @@ static int macb_open(struct net_device *dev)
 	/* schedule a link state check */
 	phy_start(bp->phy_dev);
 
-	netif_start_queue(dev);
+	netif_tx_start_all_queues(dev);
 
 	return 0;
 }
@@ -1746,7 +1810,7 @@ static int macb_close(struct net_device *dev)
 	struct macb *bp = netdev_priv(dev);
 	unsigned long flags;
 
-	netif_stop_queue(dev);
+	netif_tx_stop_all_queues(dev);
 	napi_disable(&bp->napi);
 
 	if (bp->phy_dev)
@@ -1895,8 +1959,8 @@ static void macb_get_regs(struct net_device *dev, struct ethtool_regs *regs,
 	regs->version = (macb_readl(bp, MID) & ((1 << MACB_REV_SIZE) - 1))
 			| MACB_GREGS_VERSION;
 
-	tail = macb_tx_ring_wrap(bp->tx_tail);
-	head = macb_tx_ring_wrap(bp->tx_head);
+	tail = macb_tx_ring_wrap(bp->queues[0].tx_tail);
+	head = macb_tx_ring_wrap(bp->queues[0].tx_head);
 
 	regs_buff[0]  = macb_readl(bp, NCR);
 	regs_buff[1]  = macb_or_gem_readl(bp, NCFGR);
@@ -1909,8 +1973,8 @@ static void macb_get_regs(struct net_device *dev, struct ethtool_regs *regs,
 
 	regs_buff[8]  = tail;
 	regs_buff[9]  = head;
-	regs_buff[10] = macb_tx_dma(bp, tail);
-	regs_buff[11] = macb_tx_dma(bp, head);
+	regs_buff[10] = macb_tx_dma(&bp->queues[0], tail);
+	regs_buff[11] = macb_tx_dma(&bp->queues[0], head);
 
 	if (macb_is_gem(bp)) {
 		regs_buff[12] = gem_readl(bp, USRIO);
@@ -2061,16 +2125,44 @@ static void macb_configure_caps(struct macb *bp)
 	netdev_dbg(bp->dev, "Cadence caps 0x%08x\n", bp->caps);
 }
 
+static void macb_probe_queues(void __iomem *mem,
+			      unsigned int *queue_mask,
+			      unsigned int *num_queues)
+{
+	unsigned int q;
+	u32 mid;
+
+	*queue_mask = 0x1;
+	*num_queues = 1;
+
+	/* is it macb or gem ? */
+	mid = __raw_readl(mem + MACB_MID);
+	if (MACB_BFEXT(IDNUM, mid) != 0x2)
+		return;
+
+	/* bit 0 is never set but queue 0 always exists */
+	*queue_mask = __raw_readl(mem + GEM_DCFG6) & 0xff;
+	*queue_mask |= 0x1;
+
+	for (q = 1; q < MACB_MAX_QUEUES; ++q)
+		if (*queue_mask & (1 << q))
+			(*num_queues)++;
+}
+
 static int __init macb_probe(struct platform_device *pdev)
 {
 	struct macb_platform_data *pdata;
 	struct resource *regs;
 	struct net_device *dev;
 	struct macb *bp;
+	struct macb_queue *queue;
 	struct phy_device *phydev;
 	u32 config;
 	int err = -ENXIO;
 	const char *mac;
+	void __iomem *mem;
+	unsigned int q, queue_mask, num_queues, q_irq = 0;
+	struct clk *pclk, *hclk, *tx_clk;
 
 	regs = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (!regs) {
@@ -2078,72 +2170,106 @@ static int __init macb_probe(struct platform_device *pdev)
 		goto err_out;
 	}
 
-	err = -ENOMEM;
-	dev = alloc_etherdev(sizeof(*bp));
-	if (!dev)
-		goto err_out;
-
-	SET_NETDEV_DEV(dev, &pdev->dev);
-
-	bp = netdev_priv(dev);
-	bp->pdev = pdev;
-	bp->dev = dev;
-
-	spin_lock_init(&bp->lock);
-	INIT_WORK(&bp->tx_error_task, macb_tx_error_task);
-
-	bp->pclk = devm_clk_get(&pdev->dev, "pclk");
-	if (IS_ERR(bp->pclk)) {
-		err = PTR_ERR(bp->pclk);
+	pclk = devm_clk_get(&pdev->dev, "pclk");
+	if (IS_ERR(pclk)) {
+		err = PTR_ERR(pclk);
 		dev_err(&pdev->dev, "failed to get macb_clk (%u)\n", err);
-		goto err_out_free_dev;
+		goto err_out;
 	}
 
-	bp->hclk = devm_clk_get(&pdev->dev, "hclk");
-	if (IS_ERR(bp->hclk)) {
-		err = PTR_ERR(bp->hclk);
+	hclk = devm_clk_get(&pdev->dev, "hclk");
+	if (IS_ERR(hclk)) {
+		err = PTR_ERR(hclk);
 		dev_err(&pdev->dev, "failed to get hclk (%u)\n", err);
-		goto err_out_free_dev;
+		goto err_out;
 	}
 
-	bp->tx_clk = devm_clk_get(&pdev->dev, "tx_clk");
+	tx_clk = devm_clk_get(&pdev->dev, "tx_clk");
 
-	err = clk_prepare_enable(bp->pclk);
+	err = clk_prepare_enable(pclk);
 	if (err) {
 		dev_err(&pdev->dev, "failed to enable pclk (%u)\n", err);
-		goto err_out_free_dev;
+		goto err_out;
 	}
 
-	err = clk_prepare_enable(bp->hclk);
+	err = clk_prepare_enable(hclk);
 	if (err) {
 		dev_err(&pdev->dev, "failed to enable hclk (%u)\n", err);
 		goto err_out_disable_pclk;
 	}
 
-	if (!IS_ERR(bp->tx_clk)) {
-		err = clk_prepare_enable(bp->tx_clk);
+	if (!IS_ERR(tx_clk)) {
+		err = clk_prepare_enable(tx_clk);
 		if (err) {
 			dev_err(&pdev->dev, "failed to enable tx_clk (%u)\n",
-					err);
+				err);
 			goto err_out_disable_hclk;
 		}
 	}
 
-	bp->regs = devm_ioremap(&pdev->dev, regs->start, resource_size(regs));
-	if (!bp->regs) {
+	err = -ENOMEM;
+	mem = devm_ioremap(&pdev->dev, regs->start, resource_size(regs));
+	if (!mem) {
 		dev_err(&pdev->dev, "failed to map registers, aborting.\n");
-		err = -ENOMEM;
 		goto err_out_disable_clocks;
 	}
 
-	dev->irq = platform_get_irq(pdev, 0);
-	err = devm_request_irq(&pdev->dev, dev->irq, macb_interrupt, 0,
-			dev->name, dev);
-	if (err) {
-		dev_err(&pdev->dev, "Unable to request IRQ %d (error %d)\n",
-			dev->irq, err);
+	macb_probe_queues(mem, &queue_mask, &num_queues);
+	dev = alloc_etherdev_mq(sizeof(*bp), num_queues);
+	if (!dev)
 		goto err_out_disable_clocks;
+
+	SET_NETDEV_DEV(dev, &pdev->dev);
+
+	bp = netdev_priv(dev);
+	bp->pdev = pdev;
+	bp->dev = dev;
+	bp->regs = mem;
+	bp->num_queues = num_queues;
+	bp->pclk = pclk;
+	bp->hclk = hclk;
+	bp->tx_clk = tx_clk;
+
+	bp->queues[0].bp = bp;
+	bp->queues[0].ISR  = MACB_ISR;
+	bp->queues[0].IER  = MACB_IER;
+	bp->queues[0].IDR  = MACB_IDR;
+	bp->queues[0].IMR  = MACB_IMR;
+	bp->queues[0].TBQP = MACB_TBQP;
+	for (q = 1, queue = &bp->queues[1]; q < MACB_MAX_QUEUES; ++q) {
+		if (!(queue_mask & (1 << q)))
+			continue;
+
+		queue->bp = bp;
+		queue->ISR  = (q-1) * sizeof(u32) + GEM_ISR1;
+		queue->IER  = (q-1) * sizeof(u32) + GEM_IER1;
+		queue->IDR  = (q-1) * sizeof(u32) + GEM_IDR1;
+		queue->IMR  = (q-1) * sizeof(u32) + GEM_IMR1;
+		queue->TBQP = (q-1) * sizeof(u32) + GEM_TBQP1;
+		queue++;
+	}
+
+	spin_lock_init(&bp->lock);
+
+	for (q = 0, queue = bp->queues; q < MACB_MAX_QUEUES; ++q) {
+		if (!(queue_mask & (1 << q)))
+			continue;
+
+		queue->irq = platform_get_irq(pdev, q);
+		err = devm_request_irq(&pdev->dev, queue->irq, macb_interrupt,
+				       0, dev->name, queue);
+		if (err) {
+			dev_err(&pdev->dev,
+				"Unable to request IRQ %d (error %d)\n",
+				queue->irq, err);
+			goto err_out_free_irq;
+		}
+
+		INIT_WORK(&queue->tx_error_task, macb_tx_error_task);
+		queue++;
+		q_irq++;
 	}
+	dev->irq = bp->queues[0].irq;
 
 	dev->netdev_ops = &macb_netdev_ops;
 	netif_napi_add(dev, &bp->napi, macb_poll, 64);
@@ -2219,7 +2345,7 @@ static int __init macb_probe(struct platform_device *pdev)
 	err = register_netdev(dev);
 	if (err) {
 		dev_err(&pdev->dev, "Cannot register net device, aborting.\n");
-		goto err_out_disable_clocks;
+		goto err_out_free_irq;
 	}
 
 	err = macb_mii_init(bp);
@@ -2242,15 +2368,17 @@ static int __init macb_probe(struct platform_device *pdev)
 
 err_out_unregister_netdev:
 	unregister_netdev(dev);
+err_out_free_irq:
+	for (q = 0, queue = bp->queues; q < q_irq; ++q, ++queue)
+		devm_free_irq(&pdev->dev, queue->irq, queue);
+	free_netdev(dev);
 err_out_disable_clocks:
-	if (!IS_ERR(bp->tx_clk))
-		clk_disable_unprepare(bp->tx_clk);
+	if (!IS_ERR(tx_clk))
+		clk_disable_unprepare(tx_clk);
 err_out_disable_hclk:
-	clk_disable_unprepare(bp->hclk);
+	clk_disable_unprepare(hclk);
 err_out_disable_pclk:
-	clk_disable_unprepare(bp->pclk);
-err_out_free_dev:
-	free_netdev(dev);
+	clk_disable_unprepare(pclk);
 err_out:
 	return err;
 }
@@ -2259,6 +2387,8 @@ static int __exit macb_remove(struct platform_device *pdev)
 {
 	struct net_device *dev;
 	struct macb *bp;
+	struct macb_queue *queue;
+	unsigned int q;
 
 	dev = platform_get_drvdata(pdev);
 
@@ -2270,11 +2400,14 @@ static int __exit macb_remove(struct platform_device *pdev)
 		kfree(bp->mii_bus->irq);
 		mdiobus_free(bp->mii_bus);
 		unregister_netdev(dev);
+		queue = bp->queues;
+		for (q = 0; q < bp->num_queues; ++q, ++queue)
+			devm_free_irq(&pdev->dev, queue->irq, queue);
+		free_netdev(dev);
 		if (!IS_ERR(bp->tx_clk))
 			clk_disable_unprepare(bp->tx_clk);
 		clk_disable_unprepare(bp->hclk);
 		clk_disable_unprepare(bp->pclk);
-		free_netdev(dev);
 	}
 
 	return 0;
diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h
index 517c09d..28d4e23 100644
--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
@@ -12,6 +12,7 @@
 
 #define MACB_GREGS_NBR 16
 #define MACB_GREGS_VERSION 1
+#define MACB_MAX_QUEUES 8
 
 /* MACB register offsets */
 #define MACB_NCR				0x0000
@@ -88,6 +89,48 @@
 #define GEM_DCFG5				0x0290
 #define GEM_DCFG6				0x0294
 #define GEM_DCFG7				0x0298
+#define GEM_ISR1				0x0400
+#define GEM_ISR2				0x0404
+#define GEM_ISR3				0x0408
+#define GEM_ISR4				0x040c
+#define GEM_ISR5				0x0410
+#define GEM_ISR6				0x0414
+#define GEM_ISR7				0x0418
+#define GEM_TBQP1				0x0440
+#define GEM_TBQP2				0x0444
+#define GEM_TBQP3				0x0448
+#define GEM_TBQP4				0x044c
+#define GEM_TBQP5				0x0450
+#define GEM_TBQP6				0x0454
+#define GEM_TBQP7				0x0458
+#define GEM_RBQP1				0x0480
+#define GEM_RBQP2				0x0484
+#define GEM_RBQP3				0x0488
+#define GEM_RBQP4				0x048c
+#define GEM_RBQP5				0x0490
+#define GEM_RBQP6				0x0494
+#define GEM_RBQP7				0x0498
+#define GEM_IER1				0x0600
+#define GEM_IER2				0x0604
+#define GEM_IER3				0x0608
+#define GEM_IER4				0x060c
+#define GEM_IER5				0x0610
+#define GEM_IER6				0x0614
+#define GEM_IER7				0x0618
+#define GEM_IDR1				0x0620
+#define GEM_IDR2				0x0624
+#define GEM_IDR3				0x0628
+#define GEM_IDR4				0x062c
+#define GEM_IDR5				0x0630
+#define GEM_IDR6				0x0634
+#define GEM_IDR7				0x0638
+#define GEM_IMR1				0x0640
+#define GEM_IMR2				0x0644
+#define GEM_IMR3				0x0648
+#define GEM_IMR4				0x064c
+#define GEM_IMR5				0x0650
+#define GEM_IMR6				0x0654
+#define GEM_IMR7				0x0658
 
 /* Bitfields in NCR */
 #define MACB_LB_OFFSET				0
@@ -376,6 +419,10 @@
 	__raw_readl((port)->regs + GEM_##reg)
 #define gem_writel(port, reg, value)			\
 	__raw_writel((value), (port)->regs + GEM_##reg)
+#define queue_readl(queue, reg)				\
+	__raw_readl((queue)->bp->regs + queue->reg)
+#define queue_writel(queue, reg, value)			\
+	__raw_writel((value), (queue)->bp->regs + queue->reg)
 
 /*
  * Conditional GEM/MACB macros.  These perform the operation to the correct
@@ -597,6 +644,23 @@ struct macb_config {
 	unsigned int		dma_burst_length;
 };
 
+struct macb_queue {
+	struct macb		*bp;
+	int			irq;
+
+	unsigned int		ISR;
+	unsigned int		IER;
+	unsigned int		IDR;
+	unsigned int		IMR;
+	unsigned int		TBQP;
+
+	unsigned int		tx_head, tx_tail;
+	struct macb_dma_desc	*tx_ring;
+	struct macb_tx_skb	*tx_skb;
+	dma_addr_t		tx_ring_dma;
+	struct work_struct	tx_error_task;
+};
+
 struct macb {
 	void __iomem		*regs;
 
@@ -607,9 +671,8 @@ struct macb {
 	void			*rx_buffers;
 	size_t			rx_buffer_size;
 
-	unsigned int		tx_head, tx_tail;
-	struct macb_dma_desc	*tx_ring;
-	struct macb_tx_skb	*tx_skb;
+	unsigned int		num_queues;
+	struct macb_queue	queues[MACB_MAX_QUEUES];
 
 	spinlock_t		lock;
 	struct platform_device	*pdev;
@@ -618,7 +681,6 @@ struct macb {
 	struct clk		*tx_clk;
 	struct net_device	*dev;
 	struct napi_struct	napi;
-	struct work_struct	tx_error_task;
 	struct net_device_stats	stats;
 	union {
 		struct macb_stats	macb;
@@ -626,7 +688,6 @@ struct macb {
 	}			hw_stats;
 
 	dma_addr_t		rx_ring_dma;
-	dma_addr_t		tx_ring_dma;
 	dma_addr_t		rx_buffers_dma;
 
 	struct macb_or_gem_ops	macbgem_ops;
-- 
1.8.2.2

^ permalink raw reply related

* [PATCH v2 0/1] net/macb: add TX multiqueue support for gem
From: Cyrille Pitchen @ 2014-12-11 10:16 UTC (permalink / raw)
  To: nicolas.ferre, davem, linux-arm-kernel, netdev, soren.brinkmann
  Cc: linux-kernel, Cyrille Pitchen

ChangeLog

v2:
fix compilation warnings for netdev_vdbg("%u...", queue - bp->queues, ...) calls

v1:
At the first look this patch may look quite big but it cannot be splitted.
Each queue has its own dedicated IRQ, which should be handled.
Also the Transmit Base Queue Pointer register of each available queue must be
initialized before starting the transmission, otherwise the transmission will be
halted immediately as HRESP errors are likely to occur.
In addition, some fields had to be moved from struct macb into struct macb_queue
so a common code could manage the queues.

This patch was applied to net-next and tested on a sama5d36ek board, which embeds
both macb and gem IPs, to check the backward compatibility.

Also it was tested on a sama5dx FPGA platform with a gem designed to use 3 queues.
Then we used the tc program to set a queue discipline policy as describe in the
Documentation/networking/multiqueue.txt: we successfully used each queue.

Cyrille Pitchen (1):
  net/macb: add TX multiqueue support for gem

 drivers/net/ethernet/cadence/macb.c | 451 +++++++++++++++++++++++-------------
 drivers/net/ethernet/cadence/macb.h |  71 +++++-
 2 files changed, 358 insertions(+), 164 deletions(-)

-- 
1.8.2.2

^ permalink raw reply

* [PATCH 1/1] net/macb: fix compilation warning for print_hex_dump() called with skb->mac_header
From: Cyrille Pitchen @ 2014-12-11 10:15 UTC (permalink / raw)
  To: nicolas.ferre, davem, linux-arm-kernel, netdev, soren.brinkmann
  Cc: linux-kernel, Cyrille Pitchen

Signed-off-by: Cyrille Pitchen <cyrille.pitchen@atmel.com>
---
 drivers/net/ethernet/cadence/macb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c
index 41113e5..9ddc9bf 100644
--- a/drivers/net/ethernet/cadence/macb.c
+++ b/drivers/net/ethernet/cadence/macb.c
@@ -776,7 +776,7 @@ static int gem_rx(struct macb *bp, int budget)
 		netdev_vdbg(bp->dev, "received skb of length %u, csum: %08x\n",
 			    skb->len, skb->csum);
 		print_hex_dump(KERN_DEBUG, " mac: ", DUMP_PREFIX_ADDRESS, 16, 1,
-			       skb->mac_header, 16, true);
+			       skb_mac_header(skb), 16, true);
 		print_hex_dump(KERN_DEBUG, "data: ", DUMP_PREFIX_ADDRESS, 16, 1,
 			       skb->data, 32, true);
 #endif
-- 
1.8.2.2

^ permalink raw reply related

* RE: [RFC PATCH net-next 1/1] net: Support for switch port configuration
From: Varlese, Marco @ 2014-12-11  9:59 UTC (permalink / raw)
  To: John Fastabend, Jiri Pirko
  Cc: netdev@vger.kernel.org, stephen@networkplumber.org,
	Fastabend, John R, roopa@cumulusnetworks.com, sfeldma@gmail.com,
	linux-kernel@vger.kernel.org
In-Reply-To: <54887CF7.70708@gmail.com>

> -----Original Message-----
> From: John Fastabend [mailto:john.fastabend@gmail.com]
> Sent: Wednesday, December 10, 2014 5:04 PM
> To: Jiri Pirko
> Cc: Varlese, Marco; netdev@vger.kernel.org;
> stephen@networkplumber.org; Fastabend, John R;
> roopa@cumulusnetworks.com; sfeldma@gmail.com; linux-
> kernel@vger.kernel.org
> Subject: Re: [RFC PATCH net-next 1/1] net: Support for switch port
> configuration
> 
> On 12/10/2014 08:50 AM, Jiri Pirko wrote:
> > Wed, Dec 10, 2014 at 05:23:40PM CET, marco.varlese@intel.com wrote:
> >> From: Marco Varlese <marco.varlese@intel.com>
> >>
> >> Switch hardware offers a list of attributes that are configurable on
> >> a per port basis.
> >> This patch provides a mechanism to configure switch ports by adding
> >> an NDO for setting specific values to specific attributes.
> >> There will be a separate patch that extends iproute2 to call the new
> >> NDO.
> >
> >
> > What are these attributes? Can you give some examples. I'm asking
> > because there is a plan to pass generic attributes to switch ports
> > replacing current specific ndo_switch_port_stp_update. In this case,
> > bridge is setting that attribute.
> >
> > Is there need to set something directly from userspace or does it make
> > rather sense to use involved bridge/ovs/bond ? I think that both will
> > be needed.
> 
> +1
> 
> I think for many attributes it would be best to have both. The in kernel callers
> and netlink userspace can use the same driver ndo_ops.
> 
> But then we don't _require_ any specific bridge/ovs/etc module. And we
> may have some attributes that are not specific to any existing software
> module. I'm guessing Marco has some examples of these.
> 
> [...]
> 
> 
> --
> John Fastabend         Intel Corporation

We do have a need to configure the attributes directly from user-space and I have identified the tool to do that in iproute2.

An example of attributes are:
* enabling/disabling of learning of source addresses on a given port (you can imagine the attribute called LEARNING for example);
* internal loopback control (i.e. LOOPBACK) which will control how the flow of traffic behaves from the switch fabric towards an egress port;
* flooding for broadcast/multicast/unicast type of packets (i.e. BFLOODING, MFLOODING, UFLOODING);

Some attributes would be of the type enabled/disabled while other will allow specific values to allow the user to configure different behaviours of that feature on that particular port on that platform.

One thing to mention - as John stated as well - there might be some attributes that are not specific to any software module but rather have to do with the actual hardware/platform to configure.

I hope this clarifies some points.

-----------------------------------------------------------
Marco Varlese		-	Intel Corporation
-----------------------------------------------------------

^ permalink raw reply

* Goedemorgen
From: Loans Engine® @ 2014-12-11  9:05 UTC (permalink / raw)


Goedemorgen
 Een lening krijgen op 3% op zowel korte als lange termijn. Leningen motor is een globale leningen groep die werd opgericht om te voorzien in de behoeften van economisch depressief cliënten. Wij werken in alle categorieën van leningen. Voer de volgende gegevens als u geïnteresseerd bent.
Volledige naam:
Geslacht:
Land:
Het vereiste bedrag:
Duur:
Missie:
We moeten deze volledige informatie voor lening verwerking.
E-mail ons: loan.engine@outlook.com
Hartelijke groeten
Ana wit

^ permalink raw reply

* [PATCH V2 net-next 09/10] net/mlx4: Refactor QUERY_PORT
From: Or Gerlitz @ 2014-12-11  8:57 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Matan Barak, Amir Vadai, Tal Alon, Jack Morgenstein,
	Or Gerlitz
In-Reply-To: <1418288280-334-1-git-send-email-ogerlitz@mellanox.com>

From: Matan Barak <matanb@mellanox.com>

Currently QUERY_PORT is done as a part of QUERY_DEV_CAP firmware command.

Since we would like to use it without querying all device capabilities,
extract this part to be a function of its own.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/fw.c   |  141 +++++++++++++++++------------
 drivers/net/ethernet/mellanox/mlx4/fw.h   |   37 +++++----
 drivers/net/ethernet/mellanox/mlx4/main.c |   71 ++++++++++-----
 3 files changed, 154 insertions(+), 95 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index 622bffa..073b3d1 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -886,61 +886,10 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	if (field32 & (1 << 21))
 		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_80_VFS;
 
-	if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
-		for (i = 1; i <= dev_cap->num_ports; ++i) {
-			MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET);
-			dev_cap->max_vl[i]	   = field >> 4;
-			MLX4_GET(field, outbox, QUERY_DEV_CAP_MTU_WIDTH_OFFSET);
-			dev_cap->ib_mtu[i]	   = field >> 4;
-			dev_cap->max_port_width[i] = field & 0xf;
-			MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_GID_OFFSET);
-			dev_cap->max_gids[i]	   = 1 << (field & 0xf);
-			MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PKEY_OFFSET);
-			dev_cap->max_pkeys[i]	   = 1 << (field & 0xf);
-		}
-	} else {
-#define QUERY_PORT_SUPPORTED_TYPE_OFFSET	0x00
-#define QUERY_PORT_MTU_OFFSET			0x01
-#define QUERY_PORT_ETH_MTU_OFFSET		0x02
-#define QUERY_PORT_WIDTH_OFFSET			0x06
-#define QUERY_PORT_MAX_GID_PKEY_OFFSET		0x07
-#define QUERY_PORT_MAX_MACVLAN_OFFSET		0x0a
-#define QUERY_PORT_MAX_VL_OFFSET		0x0b
-#define QUERY_PORT_MAC_OFFSET			0x10
-#define QUERY_PORT_TRANS_VENDOR_OFFSET		0x18
-#define QUERY_PORT_WAVELENGTH_OFFSET		0x1c
-#define QUERY_PORT_TRANS_CODE_OFFSET		0x20
-
-		for (i = 1; i <= dev_cap->num_ports; ++i) {
-			err = mlx4_cmd_box(dev, 0, mailbox->dma, i, 0, MLX4_CMD_QUERY_PORT,
-					   MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
-			if (err)
-				goto out;
-
-			MLX4_GET(field, outbox, QUERY_PORT_SUPPORTED_TYPE_OFFSET);
-			dev_cap->supported_port_types[i] = field & 3;
-			dev_cap->suggested_type[i] = (field >> 3) & 1;
-			dev_cap->default_sense[i] = (field >> 4) & 1;
-			MLX4_GET(field, outbox, QUERY_PORT_MTU_OFFSET);
-			dev_cap->ib_mtu[i]	   = field & 0xf;
-			MLX4_GET(field, outbox, QUERY_PORT_WIDTH_OFFSET);
-			dev_cap->max_port_width[i] = field & 0xf;
-			MLX4_GET(field, outbox, QUERY_PORT_MAX_GID_PKEY_OFFSET);
-			dev_cap->max_gids[i]	   = 1 << (field >> 4);
-			dev_cap->max_pkeys[i]	   = 1 << (field & 0xf);
-			MLX4_GET(field, outbox, QUERY_PORT_MAX_VL_OFFSET);
-			dev_cap->max_vl[i]	   = field & 0xf;
-			MLX4_GET(field, outbox, QUERY_PORT_MAX_MACVLAN_OFFSET);
-			dev_cap->log_max_macs[i]  = field & 0xf;
-			dev_cap->log_max_vlans[i] = field >> 4;
-			MLX4_GET(dev_cap->eth_mtu[i], outbox, QUERY_PORT_ETH_MTU_OFFSET);
-			MLX4_GET(dev_cap->def_mac[i], outbox, QUERY_PORT_MAC_OFFSET);
-			MLX4_GET(field32, outbox, QUERY_PORT_TRANS_VENDOR_OFFSET);
-			dev_cap->trans_type[i] = field32 >> 24;
-			dev_cap->vendor_oui[i] = field32 & 0xffffff;
-			MLX4_GET(dev_cap->wavelength[i], outbox, QUERY_PORT_WAVELENGTH_OFFSET);
-			MLX4_GET(dev_cap->trans_code[i], outbox, QUERY_PORT_TRANS_CODE_OFFSET);
-		}
+	for (i = 1; i <= dev_cap->num_ports; i++) {
+		err = mlx4_QUERY_PORT(dev, i, dev_cap->port_cap + i);
+		if (err)
+			goto out;
 	}
 
 	mlx4_dbg(dev, "Base MM extensions: flags %08x, rsvd L_Key %08x\n",
@@ -977,8 +926,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	mlx4_dbg(dev, "Max CQEs: %d, max WQEs: %d, max SRQ WQEs: %d\n",
 		 dev_cap->max_cq_sz, dev_cap->max_qp_sz, dev_cap->max_srq_sz);
 	mlx4_dbg(dev, "Local CA ACK delay: %d, max MTU: %d, port width cap: %d\n",
-		 dev_cap->local_ca_ack_delay, 128 << dev_cap->ib_mtu[1],
-		 dev_cap->max_port_width[1]);
+		 dev_cap->local_ca_ack_delay, 128 << dev_cap->port_cap[1].ib_mtu,
+		 dev_cap->port_cap[1].max_port_width);
 	mlx4_dbg(dev, "Max SQ desc size: %d, max SQ S/G: %d\n",
 		 dev_cap->max_sq_desc_sz, dev_cap->max_sq_sg);
 	mlx4_dbg(dev, "Max RQ desc size: %d, max RQ S/G: %d\n",
@@ -995,6 +944,84 @@ out:
 	return err;
 }
 
+int mlx4_QUERY_PORT(struct mlx4_dev *dev, int port, struct mlx4_port_cap *port_cap)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 *outbox;
+	u8 field;
+	u32 field32;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
+		err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_DEV_CAP,
+				   MLX4_CMD_TIME_CLASS_A,
+				   MLX4_CMD_NATIVE);
+
+		if (err)
+			goto out;
+
+		MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET);
+		port_cap->max_vl	   = field >> 4;
+		MLX4_GET(field, outbox, QUERY_DEV_CAP_MTU_WIDTH_OFFSET);
+		port_cap->ib_mtu	   = field >> 4;
+		port_cap->max_port_width = field & 0xf;
+		MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_GID_OFFSET);
+		port_cap->max_gids	   = 1 << (field & 0xf);
+		MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PKEY_OFFSET);
+		port_cap->max_pkeys	   = 1 << (field & 0xf);
+	} else {
+#define QUERY_PORT_SUPPORTED_TYPE_OFFSET	0x00
+#define QUERY_PORT_MTU_OFFSET			0x01
+#define QUERY_PORT_ETH_MTU_OFFSET		0x02
+#define QUERY_PORT_WIDTH_OFFSET			0x06
+#define QUERY_PORT_MAX_GID_PKEY_OFFSET		0x07
+#define QUERY_PORT_MAX_MACVLAN_OFFSET		0x0a
+#define QUERY_PORT_MAX_VL_OFFSET		0x0b
+#define QUERY_PORT_MAC_OFFSET			0x10
+#define QUERY_PORT_TRANS_VENDOR_OFFSET		0x18
+#define QUERY_PORT_WAVELENGTH_OFFSET		0x1c
+#define QUERY_PORT_TRANS_CODE_OFFSET		0x20
+
+		err = mlx4_cmd_box(dev, 0, mailbox->dma, port, 0, MLX4_CMD_QUERY_PORT,
+				   MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+		if (err)
+			goto out;
+
+		MLX4_GET(field, outbox, QUERY_PORT_SUPPORTED_TYPE_OFFSET);
+		port_cap->supported_port_types = field & 3;
+		port_cap->suggested_type = (field >> 3) & 1;
+		port_cap->default_sense = (field >> 4) & 1;
+		MLX4_GET(field, outbox, QUERY_PORT_MTU_OFFSET);
+		port_cap->ib_mtu	   = field & 0xf;
+		MLX4_GET(field, outbox, QUERY_PORT_WIDTH_OFFSET);
+		port_cap->max_port_width = field & 0xf;
+		MLX4_GET(field, outbox, QUERY_PORT_MAX_GID_PKEY_OFFSET);
+		port_cap->max_gids	   = 1 << (field >> 4);
+		port_cap->max_pkeys	   = 1 << (field & 0xf);
+		MLX4_GET(field, outbox, QUERY_PORT_MAX_VL_OFFSET);
+		port_cap->max_vl	   = field & 0xf;
+		MLX4_GET(field, outbox, QUERY_PORT_MAX_MACVLAN_OFFSET);
+		port_cap->log_max_macs  = field & 0xf;
+		port_cap->log_max_vlans = field >> 4;
+		MLX4_GET(port_cap->eth_mtu, outbox, QUERY_PORT_ETH_MTU_OFFSET);
+		MLX4_GET(port_cap->def_mac, outbox, QUERY_PORT_MAC_OFFSET);
+		MLX4_GET(field32, outbox, QUERY_PORT_TRANS_VENDOR_OFFSET);
+		port_cap->trans_type = field32 >> 24;
+		port_cap->vendor_oui = field32 & 0xffffff;
+		MLX4_GET(port_cap->wavelength, outbox, QUERY_PORT_WAVELENGTH_OFFSET);
+		MLX4_GET(port_cap->trans_code, outbox, QUERY_PORT_TRANS_CODE_OFFSET);
+	}
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
 #define DEV_CAP_EXT_2_FLAG_VLAN_CONTROL (1 << 26)
 #define DEV_CAP_EXT_2_FLAG_80_VFS	(1 << 21)
 #define DEV_CAP_EXT_2_FLAG_FSM		(1 << 20)
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h
index 0e910a4..744398b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.h
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.h
@@ -43,6 +43,25 @@ struct mlx4_mod_stat_cfg {
 	u8 log_pg_sz_m;
 };
 
+struct mlx4_port_cap {
+	u8  supported_port_types;
+	u8  suggested_type;
+	u8  default_sense;
+	u8  log_max_macs;
+	u8  log_max_vlans;
+	int ib_mtu;
+	int max_port_width;
+	int max_vl;
+	int max_gids;
+	int max_pkeys;
+	u64 def_mac;
+	u16 eth_mtu;
+	int trans_type;
+	int vendor_oui;
+	u16 wavelength;
+	u64 trans_code;
+};
+
 struct mlx4_dev_cap {
 	int max_srq_sz;
 	int max_qp_sz;
@@ -67,17 +86,6 @@ struct mlx4_dev_cap {
 	int local_ca_ack_delay;
 	int num_ports;
 	u32 max_msg_sz;
-	int ib_mtu[MLX4_MAX_PORTS + 1];
-	int max_port_width[MLX4_MAX_PORTS + 1];
-	int max_vl[MLX4_MAX_PORTS + 1];
-	int max_gids[MLX4_MAX_PORTS + 1];
-	int max_pkeys[MLX4_MAX_PORTS + 1];
-	u64 def_mac[MLX4_MAX_PORTS + 1];
-	u16 eth_mtu[MLX4_MAX_PORTS + 1];
-	int trans_type[MLX4_MAX_PORTS + 1];
-	int vendor_oui[MLX4_MAX_PORTS + 1];
-	u16 wavelength[MLX4_MAX_PORTS + 1];
-	u64 trans_code[MLX4_MAX_PORTS + 1];
 	u16 stat_rate_support;
 	int fs_log_max_ucast_qp_range_size;
 	int fs_max_num_qp_per_entry;
@@ -115,12 +123,8 @@ struct mlx4_dev_cap {
 	u64 max_icm_sz;
 	int max_gso_sz;
 	int max_rss_tbl_sz;
-	u8  supported_port_types[MLX4_MAX_PORTS + 1];
-	u8  suggested_type[MLX4_MAX_PORTS + 1];
-	u8  default_sense[MLX4_MAX_PORTS + 1];
-	u8  log_max_macs[MLX4_MAX_PORTS + 1];
-	u8  log_max_vlans[MLX4_MAX_PORTS + 1];
 	u32 max_counters;
+	struct mlx4_port_cap port_cap[MLX4_MAX_PORTS + 1];
 };
 
 struct mlx4_func_cap {
@@ -217,6 +221,7 @@ struct mlx4_set_ib_param {
 };
 
 int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap);
+int mlx4_QUERY_PORT(struct mlx4_dev *dev, int port, struct mlx4_port_cap *port_cap);
 int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u8 gen_or_port,
 			struct mlx4_func_cap *func_cap);
 int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 3bfe90b..6173b80 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -254,6 +254,46 @@ static void mlx4_enable_cqe_eqe_stride(struct mlx4_dev *dev)
 	}
 }
 
+static int _mlx4_dev_port(struct mlx4_dev *dev, int port,
+			  struct mlx4_port_cap *port_cap)
+{
+	dev->caps.vl_cap[port]	    = port_cap->max_vl;
+	dev->caps.ib_mtu_cap[port]	    = port_cap->ib_mtu;
+	dev->phys_caps.gid_phys_table_len[port]  = port_cap->max_gids;
+	dev->phys_caps.pkey_phys_table_len[port] = port_cap->max_pkeys;
+	/* set gid and pkey table operating lengths by default
+	 * to non-sriov values
+	 */
+	dev->caps.gid_table_len[port]  = port_cap->max_gids;
+	dev->caps.pkey_table_len[port] = port_cap->max_pkeys;
+	dev->caps.port_width_cap[port] = port_cap->max_port_width;
+	dev->caps.eth_mtu_cap[port]    = port_cap->eth_mtu;
+	dev->caps.def_mac[port]        = port_cap->def_mac;
+	dev->caps.supported_type[port] = port_cap->supported_port_types;
+	dev->caps.suggested_type[port] = port_cap->suggested_type;
+	dev->caps.default_sense[port] = port_cap->default_sense;
+	dev->caps.trans_type[port]	    = port_cap->trans_type;
+	dev->caps.vendor_oui[port]     = port_cap->vendor_oui;
+	dev->caps.wavelength[port]     = port_cap->wavelength;
+	dev->caps.trans_code[port]     = port_cap->trans_code;
+
+	return 0;
+}
+
+static int mlx4_dev_port(struct mlx4_dev *dev, int port,
+			 struct mlx4_port_cap *port_cap)
+{
+	int err = 0;
+
+	err = mlx4_QUERY_PORT(dev, port, port_cap);
+
+	if (err)
+		mlx4_err(dev, "QUERY_PORT command failed.\n");
+
+	return err;
+}
+
+#define MLX4_A0_STEERING_TABLE_SIZE	256
 static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 {
 	int err;
@@ -289,24 +329,11 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 				      dev->caps.num_sys_eqs :
 				      MLX4_MAX_EQ_NUM;
 	for (i = 1; i <= dev->caps.num_ports; ++i) {
-		dev->caps.vl_cap[i]	    = dev_cap->max_vl[i];
-		dev->caps.ib_mtu_cap[i]	    = dev_cap->ib_mtu[i];
-		dev->phys_caps.gid_phys_table_len[i]  = dev_cap->max_gids[i];
-		dev->phys_caps.pkey_phys_table_len[i] = dev_cap->max_pkeys[i];
-		/* set gid and pkey table operating lengths by default
-		 * to non-sriov values */
-		dev->caps.gid_table_len[i]  = dev_cap->max_gids[i];
-		dev->caps.pkey_table_len[i] = dev_cap->max_pkeys[i];
-		dev->caps.port_width_cap[i] = dev_cap->max_port_width[i];
-		dev->caps.eth_mtu_cap[i]    = dev_cap->eth_mtu[i];
-		dev->caps.def_mac[i]        = dev_cap->def_mac[i];
-		dev->caps.supported_type[i] = dev_cap->supported_port_types[i];
-		dev->caps.suggested_type[i] = dev_cap->suggested_type[i];
-		dev->caps.default_sense[i] = dev_cap->default_sense[i];
-		dev->caps.trans_type[i]	    = dev_cap->trans_type[i];
-		dev->caps.vendor_oui[i]     = dev_cap->vendor_oui[i];
-		dev->caps.wavelength[i]     = dev_cap->wavelength[i];
-		dev->caps.trans_code[i]     = dev_cap->trans_code[i];
+		err = _mlx4_dev_port(dev, i, dev_cap->port_cap + i);
+		if (err) {
+			mlx4_err(dev, "QUERY_PORT command failed, aborting\n");
+			return err;
+		}
 	}
 
 	dev->caps.uar_page_size	     = PAGE_SIZE;
@@ -415,13 +442,13 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 			dev->caps.possible_type[i] = dev->caps.port_type[i];
 		}
 
-		if (dev->caps.log_num_macs > dev_cap->log_max_macs[i]) {
-			dev->caps.log_num_macs = dev_cap->log_max_macs[i];
+		if (dev->caps.log_num_macs > dev_cap->port_cap[i].log_max_macs) {
+			dev->caps.log_num_macs = dev_cap->port_cap[i].log_max_macs;
 			mlx4_warn(dev, "Requested number of MACs is too much for port %d, reducing to %d\n",
 				  i, 1 << dev->caps.log_num_macs);
 		}
-		if (dev->caps.log_num_vlans > dev_cap->log_max_vlans[i]) {
-			dev->caps.log_num_vlans = dev_cap->log_max_vlans[i];
+		if (dev->caps.log_num_vlans > dev_cap->port_cap[i].log_max_vlans) {
+			dev->caps.log_num_vlans = dev_cap->port_cap[i].log_max_vlans;
 			mlx4_warn(dev, "Requested number of VLANs is too much for port %d, reducing to %d\n",
 				  i, 1 << dev->caps.log_num_vlans);
 		}
-- 
1.7.1

^ permalink raw reply related

* [PATCH V2 net-next 10/10] net/mlx4: Add support for A0 steering
From: Or Gerlitz @ 2014-12-11  8:58 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Matan Barak, Amir Vadai, Tal Alon, Jack Morgenstein,
	Or Gerlitz
In-Reply-To: <1418288280-334-1-git-send-email-ogerlitz@mellanox.com>

From: Matan Barak <matanb@mellanox.com>

Add the required firmware commands for A0 steering and a way to enable
that. The firmware support focuses on INIT_HCA, QUERY_HCA, QUERY_PORT,
QUERY_DEV_CAP and QUERY_FUNC_CAP commands. Those commands are used
to configure and query the device.

The different A0 DMFS (steering) modes are:

Static - optimized performance, but flow steering rules are
limited. This mode should be choosed explicitly by the user
in order to be used.

Dynamic - this mode should be explicitly choosed by the user.
In this mode, the FW works in optimized steering mode as long as
it can and afterwards automatically drops to classic (full) DMFS.

Disable - this mode should be explicitly choosed by the user.
The user instructs the system not to use optimized steering, even if
the FW supports Dynamic A0 DMFS (and thus will be able to use optimized
steering in Default A0 DMFS mode).

Default - this mode is implicitly choosed. In this mode, if the FW
supports Dynamic A0 DMFS, it'll work in this mode. Otherwise, it'll
work at Disable A0 DMFS mode.

Under SRIOV configuration, when the A0 steering mode is enabled,
older guest VF drivers who aren't using the RX QP allocation flag
(MLX4_RESERVE_A0_QP) will get a QP from the general range and
fail when attempting to register a steering rule. To avoid that,
the PF context behaviour is changed once on A0 static mode, to
require support for the allocation flag in VF drivers too.

In order to enable A0 steering, we use log_num_mgm_entry_size param.
If the value of the parameter is not positive, we treat the absolute
value of log_num_mgm_entry_size as a bit field. Setting bit 2 of this
bit field enables static A0 steering.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c |    3 +-
 drivers/net/ethernet/mellanox/mlx4/fw.c        |   48 ++++++++-
 drivers/net/ethernet/mellanox/mlx4/fw.h        |    4 +
 drivers/net/ethernet/mellanox/mlx4/main.c      |  132 ++++++++++++++++++++++--
 drivers/net/ethernet/mellanox/mlx4/mlx4.h      |    2 -
 drivers/net/ethernet/mellanox/mlx4/qp.c        |    4 +-
 include/linux/mlx4/device.h                    |   17 +++-
 7 files changed, 191 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 568e1f4..6ff214d 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2594,7 +2594,8 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
 			NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX;
 
 	if (mdev->dev->caps.steering_mode ==
-	    MLX4_STEERING_MODE_DEVICE_MANAGED)
+	    MLX4_STEERING_MODE_DEVICE_MANAGED &&
+	    mdev->dev->caps.dmfs_high_steer_mode != MLX4_STEERING_DMFS_A0_STATIC)
 		dev->hw_features |= NETIF_F_NTUPLE;
 
 	if (mdev->dev->caps.steering_mode != MLX4_STEERING_MODE_A0)
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index 073b3d1..ef3b95b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -144,7 +144,8 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags)
 		[15] = "Ethernet Backplane autoneg support",
 		[16] = "CONFIG DEV support",
 		[17] = "Asymmetric EQs support",
-		[18] = "More than 80 VFs support"
+		[18] = "More than 80 VFs support",
+		[19] = "Performance optimized for limited rule configuration flow steering support"
 	};
 	int i;
 
@@ -680,6 +681,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 #define QUERY_DEV_CAP_FW_REASSIGN_MAC		0x9d
 #define QUERY_DEV_CAP_VXLAN			0x9e
 #define QUERY_DEV_CAP_MAD_DEMUX_OFFSET		0xb0
+#define QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_BASE_OFFSET	0xa8
+#define QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_RANGE_OFFSET	0xac
 
 	dev_cap->flags2 = 0;
 	mailbox = mlx4_alloc_cmd_mailbox(dev);
@@ -876,6 +879,13 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	if (field32 & (1 << 0))
 		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_MAD_DEMUX;
 
+	MLX4_GET(dev_cap->dmfs_high_rate_qpn_base, outbox,
+		 QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_BASE_OFFSET);
+	dev_cap->dmfs_high_rate_qpn_base &= MGM_QPN_MASK;
+	MLX4_GET(dev_cap->dmfs_high_rate_qpn_range, outbox,
+		 QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_RANGE_OFFSET);
+	dev_cap->dmfs_high_rate_qpn_range &= MGM_QPN_MASK;
+
 	MLX4_GET(field32, outbox, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET);
 	if (field32 & (1 << 16))
 		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_UPDATE_QP;
@@ -935,6 +945,10 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	mlx4_dbg(dev, "Max GSO size: %d\n", dev_cap->max_gso_sz);
 	mlx4_dbg(dev, "Max counters: %d\n", dev_cap->max_counters);
 	mlx4_dbg(dev, "Max RSS Table size: %d\n", dev_cap->max_rss_tbl_sz);
+	mlx4_dbg(dev, "DMFS high rate steer QPn base: %d\n",
+		 dev_cap->dmfs_high_rate_qpn_base);
+	mlx4_dbg(dev, "DMFS high rate steer QPn range: %d\n",
+		 dev_cap->dmfs_high_rate_qpn_range);
 
 	dump_dev_cap_flags(dev, dev_cap->flags);
 	dump_dev_cap_flags2(dev, dev_cap->flags2);
@@ -996,6 +1010,7 @@ int mlx4_QUERY_PORT(struct mlx4_dev *dev, int port, struct mlx4_port_cap *port_c
 		port_cap->supported_port_types = field & 3;
 		port_cap->suggested_type = (field >> 3) & 1;
 		port_cap->default_sense = (field >> 4) & 1;
+		port_cap->dmfs_optimized_state = (field >> 5) & 1;
 		MLX4_GET(field, outbox, QUERY_PORT_MTU_OFFSET);
 		port_cap->ib_mtu	   = field & 0xf;
 		MLX4_GET(field, outbox, QUERY_PORT_WIDTH_OFFSET);
@@ -1530,6 +1545,12 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
 	struct mlx4_cmd_mailbox *mailbox;
 	__be32 *inbox;
 	int err;
+	static const u8 a0_dmfs_hw_steering[] =  {
+		[MLX4_STEERING_DMFS_A0_DEFAULT]		= 0,
+		[MLX4_STEERING_DMFS_A0_DYNAMIC]		= 1,
+		[MLX4_STEERING_DMFS_A0_STATIC]		= 2,
+		[MLX4_STEERING_DMFS_A0_DISABLE]		= 3
+	};
 
 #define INIT_HCA_IN_SIZE		 0x200
 #define INIT_HCA_VERSION_OFFSET		 0x000
@@ -1563,6 +1584,7 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
 #define  INIT_HCA_FS_PARAM_OFFSET         0x1d0
 #define  INIT_HCA_FS_BASE_OFFSET          (INIT_HCA_FS_PARAM_OFFSET + 0x00)
 #define  INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET  (INIT_HCA_FS_PARAM_OFFSET + 0x12)
+#define  INIT_HCA_FS_A0_OFFSET		  (INIT_HCA_FS_PARAM_OFFSET + 0x18)
 #define  INIT_HCA_FS_LOG_TABLE_SZ_OFFSET  (INIT_HCA_FS_PARAM_OFFSET + 0x1b)
 #define  INIT_HCA_FS_ETH_BITS_OFFSET      (INIT_HCA_FS_PARAM_OFFSET + 0x21)
 #define  INIT_HCA_FS_ETH_NUM_ADDRS_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x22)
@@ -1673,8 +1695,11 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
 		/* Enable Ethernet flow steering
 		 * with udp unicast and tcp unicast
 		 */
-		MLX4_PUT(inbox, (u8) (MLX4_FS_UDP_UC_EN | MLX4_FS_TCP_UC_EN),
-			 INIT_HCA_FS_ETH_BITS_OFFSET);
+		if (dev->caps.dmfs_high_steer_mode !=
+		    MLX4_STEERING_DMFS_A0_STATIC)
+			MLX4_PUT(inbox,
+				 (u8)(MLX4_FS_UDP_UC_EN | MLX4_FS_TCP_UC_EN),
+				 INIT_HCA_FS_ETH_BITS_OFFSET);
 		MLX4_PUT(inbox, (u16) MLX4_FS_NUM_OF_L2_ADDR,
 			 INIT_HCA_FS_ETH_NUM_ADDRS_OFFSET);
 		/* Enable IPoIB flow steering
@@ -1684,6 +1709,13 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
 			 INIT_HCA_FS_IB_BITS_OFFSET);
 		MLX4_PUT(inbox, (u16) MLX4_FS_NUM_OF_L2_ADDR,
 			 INIT_HCA_FS_IB_NUM_ADDRS_OFFSET);
+
+		if (dev->caps.dmfs_high_steer_mode !=
+		    MLX4_STEERING_DMFS_A0_NOT_SUPPORTED)
+			MLX4_PUT(inbox,
+				 ((u8)(a0_dmfs_hw_steering[dev->caps.dmfs_high_steer_mode]
+				       << 6)),
+				 INIT_HCA_FS_A0_OFFSET);
 	} else {
 		MLX4_PUT(inbox, param->mc_base,	INIT_HCA_MC_BASE_OFFSET);
 		MLX4_PUT(inbox, param->log_mc_entry_sz,
@@ -1734,6 +1766,12 @@ int mlx4_QUERY_HCA(struct mlx4_dev *dev,
 	u32 dword_field;
 	int err;
 	u8 byte_field;
+	static const u8 a0_dmfs_query_hw_steering[] =  {
+		[0] = MLX4_STEERING_DMFS_A0_DEFAULT,
+		[1] = MLX4_STEERING_DMFS_A0_DYNAMIC,
+		[2] = MLX4_STEERING_DMFS_A0_STATIC,
+		[3] = MLX4_STEERING_DMFS_A0_DISABLE
+	};
 
 #define QUERY_HCA_GLOBAL_CAPS_OFFSET	0x04
 #define QUERY_HCA_CORE_CLOCK_OFFSET	0x0c
@@ -1786,6 +1824,10 @@ int mlx4_QUERY_HCA(struct mlx4_dev *dev,
 			 INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET);
 		MLX4_GET(param->log_mc_table_sz, outbox,
 			 INIT_HCA_FS_LOG_TABLE_SZ_OFFSET);
+		MLX4_GET(byte_field, outbox,
+			 INIT_HCA_FS_A0_OFFSET);
+		param->dmfs_high_steer_mode =
+			a0_dmfs_query_hw_steering[(byte_field >> 6) & 3];
 	} else {
 		MLX4_GET(param->mc_base, outbox, INIT_HCA_MC_BASE_OFFSET);
 		MLX4_GET(param->log_mc_entry_sz, outbox,
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h
index 744398b..794e282 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.h
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.h
@@ -60,6 +60,7 @@ struct mlx4_port_cap {
 	int vendor_oui;
 	u16 wavelength;
 	u64 trans_code;
+	u8 dmfs_optimized_state;
 };
 
 struct mlx4_dev_cap {
@@ -124,6 +125,8 @@ struct mlx4_dev_cap {
 	int max_gso_sz;
 	int max_rss_tbl_sz;
 	u32 max_counters;
+	u32 dmfs_high_rate_qpn_base;
+	u32 dmfs_high_rate_qpn_range;
 	struct mlx4_port_cap port_cap[MLX4_MAX_PORTS + 1];
 };
 
@@ -194,6 +197,7 @@ struct mlx4_init_hca_param {
 	u8  mw_enabled;  /* Enable memory windows */
 	u8  uar_page_sz; /* log pg sz in 4k chunks */
 	u8  steering_mode; /* for QUERY_HCA */
+	u8  dmfs_high_steer_mode; /* for QUERY_HCA */
 	u64 dev_cap_enabled;
 	u16 cqe_size; /* For use only when CQE stride feature enabled */
 	u16 eqe_size; /* For use only when EQE stride feature enabled */
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 6173b80..e25436b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -105,7 +105,8 @@ MODULE_PARM_DESC(enable_64b_cqe_eqe,
 		 "Enable 64 byte CQEs/EQEs when the FW supports this (default: True)");
 
 #define PF_CONTEXT_BEHAVIOUR_MASK	(MLX4_FUNC_CAP_64B_EQE_CQE | \
-					 MLX4_FUNC_CAP_EQE_CQE_STRIDE)
+					 MLX4_FUNC_CAP_EQE_CQE_STRIDE | \
+					 MLX4_FUNC_CAP_DMFS_A0_STATIC)
 
 static char mlx4_version[] =
 	DRV_NAME ": Mellanox ConnectX core driver v"
@@ -463,8 +464,28 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		(1 << dev->caps.log_num_vlans) *
 		dev->caps.num_ports;
 	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH] = MLX4_NUM_FEXCH;
+
+	if (dev_cap->dmfs_high_rate_qpn_base > 0 &&
+	    dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN)
+		dev->caps.dmfs_high_rate_qpn_base = dev_cap->dmfs_high_rate_qpn_base;
+	else
+		dev->caps.dmfs_high_rate_qpn_base =
+			dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW];
+
+	if (dev_cap->dmfs_high_rate_qpn_range > 0 &&
+	    dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN) {
+		dev->caps.dmfs_high_rate_qpn_range = dev_cap->dmfs_high_rate_qpn_range;
+		dev->caps.dmfs_high_steer_mode = MLX4_STEERING_DMFS_A0_DEFAULT;
+		dev->caps.flags2 |= MLX4_DEV_CAP_FLAG2_FS_A0;
+	} else {
+		dev->caps.dmfs_high_steer_mode = MLX4_STEERING_DMFS_A0_NOT_SUPPORTED;
+		dev->caps.dmfs_high_rate_qpn_base =
+			dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW];
+		dev->caps.dmfs_high_rate_qpn_range = MLX4_A0_STEERING_TABLE_SIZE;
+	}
+
 	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_RSS_RAW_ETH] =
-		MLX4_A0_STEERING_TABLE_SIZE;
+		dev->caps.dmfs_high_rate_qpn_range;
 
 	dev->caps.reserved_qps = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] +
 		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] +
@@ -753,7 +774,8 @@ static int mlx4_slave_cap(struct mlx4_dev *dev)
 
 	if ((func_cap.pf_context_behaviour | PF_CONTEXT_BEHAVIOUR_MASK) !=
 	    PF_CONTEXT_BEHAVIOUR_MASK) {
-		mlx4_err(dev, "Unknown pf context behaviour\n");
+		mlx4_err(dev, "Unknown pf context behaviour %x known flags %x\n",
+			 func_cap.pf_context_behaviour, PF_CONTEXT_BEHAVIOUR_MASK);
 		return -ENOSYS;
 	}
 
@@ -1640,10 +1662,46 @@ static int choose_log_fs_mgm_entry_size(int qp_per_entry)
 	return (i <= MLX4_MAX_MGM_LOG_ENTRY_SIZE) ? i : -1;
 }
 
+static const char *dmfs_high_rate_steering_mode_str(int dmfs_high_steer_mode)
+{
+	switch (dmfs_high_steer_mode) {
+	case MLX4_STEERING_DMFS_A0_DEFAULT:
+		return "default performance";
+
+	case MLX4_STEERING_DMFS_A0_DYNAMIC:
+		return "dynamic hybrid mode";
+
+	case MLX4_STEERING_DMFS_A0_STATIC:
+		return "performance optimized for limited rule configuration (static)";
+
+	case MLX4_STEERING_DMFS_A0_DISABLE:
+		return "disabled performance optimized steering";
+
+	case MLX4_STEERING_DMFS_A0_NOT_SUPPORTED:
+		return "performance optimized steering not supported";
+
+	default:
+		return "Unrecognized mode";
+	}
+}
+
+#define MLX4_DMFS_A0_STEERING			(1UL << 2)
+
 static void choose_steering_mode(struct mlx4_dev *dev,
 				 struct mlx4_dev_cap *dev_cap)
 {
-	if (mlx4_log_num_mgm_entry_size == -1 &&
+	if (mlx4_log_num_mgm_entry_size <= 0) {
+		if ((-mlx4_log_num_mgm_entry_size) & MLX4_DMFS_A0_STEERING) {
+			if (dev->caps.dmfs_high_steer_mode ==
+			    MLX4_STEERING_DMFS_A0_NOT_SUPPORTED)
+				mlx4_err(dev, "DMFS high rate mode not supported\n");
+			else
+				dev->caps.dmfs_high_steer_mode =
+					MLX4_STEERING_DMFS_A0_STATIC;
+		}
+	}
+
+	if (mlx4_log_num_mgm_entry_size <= 0 &&
 	    dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_FS_EN &&
 	    (!mlx4_is_mfunc(dev) ||
 	     (dev_cap->fs_max_num_qp_per_entry >= (dev->num_vfs + 1))) &&
@@ -1656,6 +1714,9 @@ static void choose_steering_mode(struct mlx4_dev *dev,
 		dev->caps.fs_log_max_ucast_qp_range_size =
 			dev_cap->fs_log_max_ucast_qp_range_size;
 	} else {
+		if (dev->caps.dmfs_high_steer_mode !=
+		    MLX4_STEERING_DMFS_A0_NOT_SUPPORTED)
+			dev->caps.dmfs_high_steer_mode = MLX4_STEERING_DMFS_A0_DISABLE;
 		if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER &&
 		    dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER)
 			dev->caps.steering_mode = MLX4_STEERING_MODE_B0;
@@ -1682,7 +1743,8 @@ static void choose_tunnel_offload_mode(struct mlx4_dev *dev,
 				       struct mlx4_dev_cap *dev_cap)
 {
 	if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED &&
-	    dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS)
+	    dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS &&
+	    dev->caps.dmfs_high_steer_mode != MLX4_STEERING_DMFS_A0_STATIC)
 		dev->caps.tunnel_offload_mode = MLX4_TUNNEL_OFFLOAD_MODE_VXLAN;
 	else
 		dev->caps.tunnel_offload_mode = MLX4_TUNNEL_OFFLOAD_MODE_NONE;
@@ -1691,6 +1753,35 @@ static void choose_tunnel_offload_mode(struct mlx4_dev *dev,
 		 == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) ? "vxlan" : "none");
 }
 
+static int mlx4_validate_optimized_steering(struct mlx4_dev *dev)
+{
+	int i;
+	struct mlx4_port_cap port_cap;
+
+	if (dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_NOT_SUPPORTED)
+		return -EINVAL;
+
+	for (i = 1; i <= dev->caps.num_ports; i++) {
+		if (mlx4_dev_port(dev, i, &port_cap)) {
+			mlx4_err(dev,
+				 "QUERY_DEV_CAP command failed, can't veify DMFS high rate steering.\n");
+		} else if ((dev->caps.dmfs_high_steer_mode !=
+			    MLX4_STEERING_DMFS_A0_DEFAULT) &&
+			   (port_cap.dmfs_optimized_state ==
+			    !!(dev->caps.dmfs_high_steer_mode ==
+			    MLX4_STEERING_DMFS_A0_DISABLE))) {
+			mlx4_err(dev,
+				 "DMFS high rate steer mode differ, driver requested %s but %s in FW.\n",
+				 dmfs_high_rate_steering_mode_str(
+					dev->caps.dmfs_high_steer_mode),
+				 (port_cap.dmfs_optimized_state ?
+					"enabled" : "disabled"));
+		}
+	}
+
+	return 0;
+}
+
 static int mlx4_init_fw(struct mlx4_dev *dev)
 {
 	struct mlx4_mod_stat_cfg   mlx4_cfg;
@@ -1743,6 +1834,10 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
 		choose_steering_mode(dev, &dev_cap);
 		choose_tunnel_offload_mode(dev, &dev_cap);
 
+		if (dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_STATIC &&
+		    mlx4_is_master(dev))
+			dev->caps.function_caps |= MLX4_FUNC_CAP_DMFS_A0_STATIC;
+
 		err = mlx4_get_phys_port_id(dev);
 		if (err)
 			mlx4_err(dev, "Fail to get physical port id\n");
@@ -1829,6 +1924,24 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
 				mlx4_err(dev, "Failed to map internal clock. Timestamping is not supported\n");
 			}
 		}
+
+		if (dev->caps.dmfs_high_steer_mode !=
+		    MLX4_STEERING_DMFS_A0_NOT_SUPPORTED) {
+			if (mlx4_validate_optimized_steering(dev))
+				mlx4_warn(dev, "Optimized steering validation failed\n");
+
+			if (dev->caps.dmfs_high_steer_mode ==
+			    MLX4_STEERING_DMFS_A0_DISABLE) {
+				dev->caps.dmfs_high_rate_qpn_base =
+					dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW];
+				dev->caps.dmfs_high_rate_qpn_range =
+					MLX4_A0_STEERING_TABLE_SIZE;
+			}
+
+			mlx4_dbg(dev, "DMFS high rate steer mode is: %s\n",
+				 dmfs_high_rate_steering_mode_str(
+					dev->caps.dmfs_high_steer_mode));
+		}
 	} else {
 		err = mlx4_init_slave(dev);
 		if (err) {
@@ -3201,10 +3314,11 @@ static int __init mlx4_verify_params(void)
 		port_type_array[0] = true;
 	}
 
-	if (mlx4_log_num_mgm_entry_size != -1 &&
-	    (mlx4_log_num_mgm_entry_size < MLX4_MIN_MGM_LOG_ENTRY_SIZE ||
-	     mlx4_log_num_mgm_entry_size > MLX4_MAX_MGM_LOG_ENTRY_SIZE)) {
-		pr_warn("mlx4_core: mlx4_log_num_mgm_entry_size (%d) not in legal range (-1 or %d..%d)\n",
+	if (mlx4_log_num_mgm_entry_size < -7 ||
+	    (mlx4_log_num_mgm_entry_size > 0 &&
+	     (mlx4_log_num_mgm_entry_size < MLX4_MIN_MGM_LOG_ENTRY_SIZE ||
+	      mlx4_log_num_mgm_entry_size > MLX4_MAX_MGM_LOG_ENTRY_SIZE))) {
+		pr_warn("mlx4_core: mlx4_log_num_mgm_entry_size (%d) not in legal range (-7..0 or %d..%d)\n",
 			mlx4_log_num_mgm_entry_size,
 			MLX4_MIN_MGM_LOG_ENTRY_SIZE,
 			MLX4_MAX_MGM_LOG_ENTRY_SIZE);
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index cebd118..bdd4eea 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -689,8 +689,6 @@ enum mlx4_qp_table_zones {
 	MLX4_QP_TABLE_ZONE_NUM
 };
 
-#define MLX4_A0_STEERING_TABLE_SIZE    256
-
 struct mlx4_qp_table {
 	struct mlx4_bitmap	*bitmap_gen;
 	struct mlx4_zone_allocator *zones;
diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c
index d8d040c..1586ecc 100644
--- a/drivers/net/ethernet/mellanox/mlx4/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx4/qp.c
@@ -712,8 +712,8 @@ int mlx4_init_qp_table(struct mlx4_dev *dev)
 	int k;
 	int fixed_reserved_from_bot_rv = 0;
 	int bottom_reserved_for_rss_bitmap;
-	u32 max_table_offset = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] +
-		MLX4_A0_STEERING_TABLE_SIZE;
+	u32 max_table_offset = dev->caps.dmfs_high_rate_qpn_base +
+			dev->caps.dmfs_high_rate_qpn_range;
 
 	spin_lock_init(&qp_table->lock);
 	INIT_RADIX_TREE(&dev->qp_table_tree, GFP_ATOMIC);
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 39890cd..25c791e 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -117,6 +117,14 @@ enum {
 	MLX4_STEERING_MODE_DEVICE_MANAGED
 };
 
+enum {
+	MLX4_STEERING_DMFS_A0_DEFAULT,
+	MLX4_STEERING_DMFS_A0_DYNAMIC,
+	MLX4_STEERING_DMFS_A0_STATIC,
+	MLX4_STEERING_DMFS_A0_DISABLE,
+	MLX4_STEERING_DMFS_A0_NOT_SUPPORTED
+};
+
 static inline const char *mlx4_steering_mode_str(int steering_mode)
 {
 	switch (steering_mode) {
@@ -191,7 +199,8 @@ enum {
 	MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP	= 1LL <<  15,
 	MLX4_DEV_CAP_FLAG2_CONFIG_DEV		= 1LL <<  16,
 	MLX4_DEV_CAP_FLAG2_SYS_EQS		= 1LL <<  17,
-	MLX4_DEV_CAP_FLAG2_80_VFS		= 1LL <<  18
+	MLX4_DEV_CAP_FLAG2_80_VFS		= 1LL <<  18,
+	MLX4_DEV_CAP_FLAG2_FS_A0		= 1LL <<  19
 };
 
 enum {
@@ -225,7 +234,8 @@ enum {
 
 enum {
 	MLX4_FUNC_CAP_64B_EQE_CQE	= 1L << 0,
-	MLX4_FUNC_CAP_EQE_CQE_STRIDE	= 1L << 1
+	MLX4_FUNC_CAP_EQE_CQE_STRIDE	= 1L << 1,
+	MLX4_FUNC_CAP_DMFS_A0_STATIC	= 1L << 2
 };
 
 
@@ -482,6 +492,7 @@ struct mlx4_caps {
 	int			reserved_mcgs;
 	int			num_qp_per_mgm;
 	int			steering_mode;
+	int			dmfs_high_steer_mode;
 	int			fs_log_max_ucast_qp_range_size;
 	int			num_pds;
 	int			reserved_pds;
@@ -522,6 +533,8 @@ struct mlx4_caps {
 	int			tunnel_offload_mode;
 	u8			rx_checksum_flags_port[MLX4_MAX_PORTS + 1];
 	u8			alloc_res_qp_mask;
+	u32			dmfs_high_rate_qpn_base;
+	u32			dmfs_high_rate_qpn_range;
 };
 
 struct mlx4_buf_list {
-- 
1.7.1

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox