Netdev List

Netdev List
 help / color / mirror / Atom feed

* [rdma-rc PATCH 0/2] iw_cxgb4: Adjust the cq/qp mask
From: Raju Rangoju @ 2019-02-14 12:10 UTC (permalink / raw)
  To: jgg, davem, linux-rdma; +Cc: netdev, swise, rajur

Export the LLD sge_host_page_size field to ULDs via
cxgb4_lld_info, so that iw_cxgb4 can adjust the cq/qp
mask based on no.of bar2 pages in a host page.

This series has both net(cxgb4) and rdma(iw_cxgb4) changes,
and I would request this merge via rdma repo.

I have made sure this series applies cleanly on both net-next
and rdma-for-rc and doesn't cause any merge conflicts.

Raju Rangoju (2):
  cxgb4: export sge_host_page_size to ulds
  iw_cxgb4: cq/qp mask depends on bar2 pages in a host page

 drivers/infiniband/hw/cxgb4/device.c           | 15 +++++++++++++--
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c |  1 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h |  1 +
 3 files changed, 15 insertions(+), 2 deletions(-)

-- 
2.13.0

^ permalink raw reply

* [rdma-rc PATCH 1/2] cxgb4: export sge_host_page_size to ulds
From: Raju Rangoju @ 2019-02-14 12:10 UTC (permalink / raw)
  To: jgg, davem, linux-rdma; +Cc: netdev, swise, rajur
In-Reply-To: <20190214121054.11693-1-rajur@chelsio.com>

Export the sge_host_page_size field to ULDs via
cxgb4_lld_info, so that iw_cxgb4 can make use of
this in calculating the correct qp/cq mask.

Fixes: 2391b0030e ("cxgb4: Remove SGE_HOST_PAGE_SIZE dependency on page
size")
Signed-off-by: Raju Rangoju <rajur@chelsio.com>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c | 1 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c
index c041f44324db..b3654598a2d5 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c
@@ -660,6 +660,7 @@ static void uld_init(struct adapter *adap, struct cxgb4_lld_info *lld)
 	lld->cclk_ps = 1000000000 / adap->params.vpd.cclk;
 	lld->udb_density = 1 << adap->params.sge.eq_qpp;
 	lld->ucq_density = 1 << adap->params.sge.iq_qpp;
+	lld->sge_host_page_size = 1 << (adap->params.sge.hps + 10);
 	lld->filt_mode = adap->params.tp.vlan_pri_map;
 	/* MODQ_REQ_MAP sets queues 0-3 to chan 0-3 */
 	for (i = 0; i < NCHAN; i++)
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
index 5fa9a2d5fc4b..21da34a4ca24 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
@@ -336,6 +336,7 @@ struct cxgb4_lld_info {
 	unsigned int cclk_ps;                /* Core clock period in psec */
 	unsigned short udb_density;          /* # of user DB/page */
 	unsigned short ucq_density;          /* # of user CQs/page */
+	unsigned int sge_host_page_size;     /* SGE host page size */
 	unsigned short filt_mode;            /* filter optional components */
 	unsigned short tx_modq[NCHAN];       /* maps each tx channel to a */
 					     /* scheduler queue */
-- 
2.13.0


^ permalink raw reply related

* [rdma-rc PATCH 2/2] iw_cxgb4: cq/qp mask depends on bar2 pages in a host page
From: Raju Rangoju @ 2019-02-14 12:10 UTC (permalink / raw)
  To: jgg, davem, linux-rdma; +Cc: netdev, swise, rajur
In-Reply-To: <20190214121054.11693-1-rajur@chelsio.com>

Adjust the cq/qp mask based on no.of bar2 pages in a host page.

For user-mode rdma, the granularity of the BAR2 memory mapped
to a user rdma process during queue allocation must be based
on the host page size. The lld attributes udb_density and
ucq_density are used to figure out how many sge contexts are
in a bar2 page. So the rdev->qpmask and rdev->cqmask in
iw_cxgb4 need to now be adjusted based on how many sge bar2
pages are in a host page.

Fixes: 2391b0030e ("cxgb4: Remove SGE_HOST_PAGE_SIZE dependency on page size")
Signed-off-by: Raju Rangoju <rajur@chelsio.com>
---
 drivers/infiniband/hw/cxgb4/device.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
index c13c0ba30f63..d499cd61c0e8 100644
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -783,6 +783,7 @@ void c4iw_init_dev_ucontext(struct c4iw_rdev *rdev,
 static int c4iw_rdev_open(struct c4iw_rdev *rdev)
 {
 	int err;
+	unsigned int factor;
 
 	c4iw_init_dev_ucontext(rdev, &rdev->uctx);
 
@@ -806,8 +807,18 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev)
 		return -EINVAL;
 	}
 
-	rdev->qpmask = rdev->lldi.udb_density - 1;
-	rdev->cqmask = rdev->lldi.ucq_density - 1;
+	/* This implementation requires a sge_host_page_size <= PAGE_SIZE. */
+	if (rdev->lldi.sge_host_page_size > PAGE_SIZE) {
+		pr_err("%s: unsupported sge host page size %u\n",
+		       pci_name(rdev->lldi.pdev),
+		       rdev->lldi.sge_host_page_size);
+		return -EINVAL;
+	}
+
+	factor = PAGE_SIZE / rdev->lldi.sge_host_page_size;
+	rdev->qpmask = (rdev->lldi.udb_density * factor) - 1;
+	rdev->cqmask = (rdev->lldi.ucq_density * factor) - 1;
+
 	pr_debug("dev %s stag start 0x%0x size 0x%0x num stags %d pbl start 0x%0x size 0x%0x rq start 0x%0x size 0x%0x qp qid start %u size %u cq qid start %u size %u srq size %u\n",
 		 pci_name(rdev->lldi.pdev), rdev->lldi.vr->stag.start,
 		 rdev->lldi.vr->stag.size, c4iw_num_stags(rdev),
-- 
2.13.0


^ permalink raw reply related

* Re: [PATCH 5/9] perf, bpf: save bpf_prog_info in a rbtree in perf_env
From: Jiri Olsa @ 2019-02-14 12:26 UTC (permalink / raw)
  To: Song Liu; +Cc: netdev, linux-kernel, ast, daniel, kernel-team, peterz, acme
In-Reply-To: <20190209011705.2160185-6-songliubraving@fb.com>

On Fri, Feb 08, 2019 at 05:17:01PM -0800, Song Liu wrote:

SNIP

> diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h
> index d01b8355f4ca..5894a177b7cf 100644
> --- a/tools/perf/util/env.h
> +++ b/tools/perf/util/env.h
> @@ -3,7 +3,10 @@
>  #define __PERF_ENV_H
>  
>  #include <linux/types.h>
> +#include <linux/rbtree.h>
>  #include "cpumap.h"
> +#include "rwsem.h"
> +#include "bpf-event.h"
>  
>  struct cpu_topology_map {
>  	int	socket_id;
> @@ -64,6 +67,8 @@ struct perf_env {
>  	struct memory_node	*memory_nodes;
>  	unsigned long long	 memory_bsize;
>  	u64                     clockid_res_ns;
> +	struct rw_semaphore	bpf_info_lock;

why's the lock needed?

jirka

^ permalink raw reply

* Re: [PATCH 5/9] perf, bpf: save bpf_prog_info in a rbtree in perf_env
From: Jiri Olsa @ 2019-02-14 12:33 UTC (permalink / raw)
  To: Song Liu; +Cc: netdev, linux-kernel, ast, daniel, kernel-team, peterz, acme
In-Reply-To: <20190209011705.2160185-6-songliubraving@fb.com>

On Fri, Feb 08, 2019 at 05:17:01PM -0800, Song Liu wrote:
> bpf_prog_info contains information necessary to annotate bpf programs.
> This patch saves bpf_prog_info for bpf programs loaded in the system.
> 
> perf-record saves bpf_prog_info information as headers to perf.data.
> A new header type HEADER_BPF_PROG_INFO is introduced for this data.

please move those 2 changes into separate patches then

it's hard to make comments when I don't see the rest of
the patches on the list please resend the patchset

thanks,
jirka

^ permalink raw reply

* Re: [PATCH] net: phy: at803x: disable delay only for RGMII mode
From: Niklas Cassel @ 2019-02-14 12:39 UTC (permalink / raw)
  To: Peter Ujfalusi
  Cc: Marc Gonzalez, Andrew Lunn, Florian Fainelli, Vinod Koul,
	David S Miller, linux-arm-msm, Bjorn Andersson, netdev,
	Nori, Sekhar
In-Reply-To: <3356ff05-8d08-591e-03bf-9d846f79097b@ti.com>

On Thu, Feb 14, 2019 at 12:49:36PM +0200, Peter Ujfalusi wrote:
> Hi Niklas,
> 
> On 13/02/2019 19.40, Niklas Cassel wrote:
> > On Wed, Feb 13, 2019 at 02:40:18PM +0100, Marc Gonzalez wrote:
> >> On 13/02/2019 14:29, Andrew Lunn wrote:
> >>
> >>>> So we have these modes:
> >>>>
> >>>> PHY_INTERFACE_MODE_RGMII: TX and RX delays disabled
> >>>> PHY_INTERFACE_MODE_RGMII_ID: TX and RX delays enabled
> >>>> PHY_INTERFACE_MODE_RGMII_RXID: RX delay enabled, TX delay disabled
> >>>> PHY_INTERFACE_MODE_RGMII_TXID: TX delay enabled, RX delay disabled
> >>>>
> >>>> What I don't like with this patch, is that if we specify phy-mode
> >>>> PHY_INTERFACE_MODE_RGMII_TXID, this patch will enable TX delay,
> >>>> but RX delay will not be explicitly set.
> >>>
> >>> That is not the behaviour we want. It is best to assume the device is
> >>> in a random state, and correctly enable/disable all delays as
> >>> requested. Only leave the hardware alone if PHY_INTERFACE_MODE_NA is
> >>> used.
> >>
> >> That's what my patch did:
> >> https://www.spinics.net/lists/netdev/msg445053.html
> >>
> >> But see Florian's remarks:
> >> https://www.spinics.net/lists/netdev/msg445133.html
> > 
> > Hello Marc,
> > 
> > I saw that comment from Florian. However that was way back in 2017.
> > Maybe the phy-modes were not as well defined back then?
> > 
> > Andrew recently suggested to fix the driver so that it conforms with the
> > phy-modes, and fix any SoC that specified an incorrect phy-mode in DT
> > and thus relied upon the broken behavior of the PHY driver:
> > https://www.spinics.net/lists/netdev/msg445133.html
> > 
> > 
> > So, I've rebased your old patch, see attachment.
> > I suggest that Peter test it on am335x-evm.
> 
> with the patch + s/rgmii-txid/rgmii-id in the am335x-evmsk.dts ethernet
> is working.
> I don't have am335x-evm to test, but it has the same PHY as evmsk.
> 

Florian's concern was that this PHY driver looked at "phy-mode" from the
perspective of the MAC rather than the PHY.
However, if s/rgmii-txid/rgmii-id is the correct fix for am335x-evm,
then this means that this PHY driver was just broken.

If the driver had misinterpreted the perspective, then the correct
fix for am335x-evm would have been s/rgmii-txid/rgmii-rxid.

So considering that this driver seems to be really broken
(rather then just inverted perspective),
perhaps we can merge the patch I attached in my previous email after all?
(Together with a s/rgmii-txid/rgmii-id in the am335x-evmsk.dts.)


Kind regards,
Niklas

> > am335x-evm appears to rely on the current broken behavior of the PHY
> > driver, so we will probably need to fix the am335x-evm according to this:
> > https://www.spinics.net/lists/netdev/msg445117.html
> > and merge that as well.
> > 
> > 
> > Andrew, Florian, do you both agree?
> > 
> > 
> > Kind regards,
> > Niklas
> > 
> 
> - Péter
> 
> Texas Instruments Finland Oy, Porkkalankatu 22, 00180 Helsinki.
> Y-tunnus/Business ID: 0615521-4. Kotipaikka/Domicile: Helsinki

^ permalink raw reply

* Re: TC stats / hw offload question
From: Jamal Hadi Salim @ 2019-02-14 12:39 UTC (permalink / raw)
  To: Edward Cree, netdev
  Cc: Jiri Pirko, Cong Wang, Or Gerlitz, Andy Gospodarek, PJ Waskiewicz,
	Anjali Singhai Jain, Jakub Kicinski
In-Reply-To: <11ab2dc0-ec39-18cd-d170-0d5f954198b9@solarflare.com>

On 2019-02-11 6:44 a.m., Edward Cree wrote:

>> Hasnt been necessary thus far.
>> Is your end goal to match and count?
> My end goal is to implement TC offload in some hw we're designing
>   here at Solarflare.  So I'm trying to determine what hardware is
>   expected/required to do.
> It might be possible to design our new hw so that we can attach a
>   counter to every action, if that's what TC wants. 

It makes sense to have a counter on every action - even if it is
for debugging purposes. The two most basic actions are "drop" or
"accept". In TC speak the default action is "classid x:y" which
typically is to select a queue or give the flow some identity
(one should be able to use the same action on h/w ingress as well
to select a rx DMA ring for example, but that seems uncommon).

Note, your counters should also be shareable; example, count all
the drops in one counter across multiple flows as in the following
case where counter index 1 is used.

tc flower match foo action drop index 1
tc flower match bar action drop index 1

>   But since the
>   other vendors don't seem to do that, I wondered if there was a
>   reason, or if perhaps the counter resources (and PCI bw to read
>   them) could be saved if all those separate counters aren't really
>   needed.  

Probably nobody has paid attention or asked as you did.
Will let the h/w folks speak for themselves. My understanding
based on experience is counters are cheap. Most modern NICs
and ASICs have a gazillion of them at their disposal.

> Right now the design we are considering would only count
>   packets as-matched, i.e. before any edits.  That's fine for encap
>   — you can calculate the bytes correction in SW — but not for decap
>   since in principle the length of the RXed outer headers could
>   vary (e.g. you might have IP options there).
>

ok, so not much in terms of other types of actions.
But for abstraction sake maybe use "flowid x:y" and have counters
associated with that. Or even make this optional and only attach
a counter if someone says "action ok" and allow them to specify
the counter index (assuming you architecture has an indexed table
of counters).

cheers,
jamal

^ permalink raw reply

* [PATCH net-next v2 0/2]cxgb4/cxgb4vfSupport for SGE doorbell queue timer
From: Vishal Kulkarni @ 2019-02-14 12:49 UTC (permalink / raw)
  To: netdev, davem; +Cc: nirranjan, indranil, dt, Vishal Kulkarni

This series of patchs add SGE doorbell queue timer for faster DMA completions.

Patch 1 Implements SGE doorbell queue timer

Patch 2 Adds ethtool capability to set/get SGE doorbell queue timer tick

---
v2
- Reverse christmas tree formatting for local variables.
---

Vishal Kulkarni (2):
  cxgb4/cxgb4vf: Add support for SGE doorbell queue timer
  cxgb4: Add capability to get/set SGE Doorbell Queue Timer Tick

 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h         |  11 +-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c | 189 +++++++++++-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c    |  29 +-
 drivers/net/ethernet/chelsio/cxgb4/sge.c           | 322 +++++++++++++++++----
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c         |  41 +++
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.h         |   1 +
 drivers/net/ethernet/chelsio/cxgb4/t4_values.h     |   6 +
 drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h      |  24 +-
 drivers/net/ethernet/chelsio/cxgb4vf/sge.c         |  27 +-
 9 files changed, 571 insertions(+), 79 deletions(-)

-- 
1.8.3.1


^ permalink raw reply

* [PATCH net-next v2 1/2] cxgb4/cxgb4vf: Add support for SGE doorbell queue timer
From: Vishal Kulkarni @ 2019-02-14 12:49 UTC (permalink / raw)
  To: netdev, davem; +Cc: nirranjan, indranil, dt, Vishal Kulkarni
In-Reply-To: <1550148556-3531-1-git-send-email-vishal@chelsio.com>

T6 introduced a Timer Mechanism in SGE called the
SGE Doorbell Queue Timer. With this we can now configure
TX Queues to get CIDX Updates when:

    Time(CIDX == PIDX) >= Timer

Previously we rely on TX Queue Status Page updates by hardware
for DMA completions. This will make Hardware/Firmware actually
deliver the CIDX Updates as Ingress Queue messages with
commensurate Interrupts.

So we now have a new RX Path component for processing CIDX Updates
and reclaiming TX Descriptors faster.

Original work by: Casey Leedom <leedom@chelsio.com>

Signed-off-by: Vishal Kulkarni <vishal@chelsio.com>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h      |  10 +-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c |  19 +-
 drivers/net/ethernet/chelsio/cxgb4/sge.c        | 322 +++++++++++++++++++-----
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c      |  41 +++
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.h      |   1 +
 drivers/net/ethernet/chelsio/cxgb4/t4_values.h  |   6 +
 drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h   |  24 +-
 drivers/net/ethernet/chelsio/cxgb4vf/sge.c      |  27 +-
 8 files changed, 375 insertions(+), 75 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 568715a..68d0d45 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -617,6 +617,7 @@ enum {                                 /* adapter flags */
 	FW_OFLD_CONN       = (1 << 9),
 	ROOT_NO_RELAXED_ORDERING = (1 << 10),
 	SHUTTING_DOWN	   = (1 << 11),
+	SGE_DBQ_TIMER      = (1 << 12),
 };
 
 enum {
@@ -756,6 +757,8 @@ struct sge_eth_txq {                /* state for an SGE Ethernet Tx queue */
 #ifdef CONFIG_CHELSIO_T4_DCB
 	u8 dcb_prio;		    /* DCB Priority bound to queue */
 #endif
+	u8 dbqt;                    /* SGE Doorbell Queue Timer in use */
+	unsigned int dbqtimerix;    /* SGE Doorbell Queue Timer Index */
 	unsigned long tso;          /* # of TSO requests */
 	unsigned long tx_cso;       /* # of Tx checksum offloads */
 	unsigned long vlan_ins;     /* # of Tx VLAN insertions */
@@ -816,6 +819,7 @@ struct sge {
 	u16 nqs_per_uld;	    /* # of Rx queues per ULD */
 	u16 timer_val[SGE_NTIMERS];
 	u8 counter_val[SGE_NCOUNTERS];
+	u16 dbqtimer_val[SGE_NDBQTIMERS];
 	u32 fl_pg_order;            /* large page allocation size */
 	u32 stat_len;               /* length of status page at ring end */
 	u32 pktshift;               /* padding between CPL & packet data */
@@ -1402,7 +1406,7 @@ int t4_sge_alloc_rxq(struct adapter *adap, struct sge_rspq *iq, bool fwevtq,
 		     rspq_flush_handler_t flush_handler, int cong);
 int t4_sge_alloc_eth_txq(struct adapter *adap, struct sge_eth_txq *txq,
 			 struct net_device *dev, struct netdev_queue *netdevq,
-			 unsigned int iqid);
+			 unsigned int iqid, u8 dbqt);
 int t4_sge_alloc_ctrl_txq(struct adapter *adap, struct sge_ctrl_txq *txq,
 			  struct net_device *dev, unsigned int iqid,
 			  unsigned int cmplqid);
@@ -1415,6 +1419,8 @@ int t4_sge_alloc_uld_txq(struct adapter *adap, struct sge_uld_txq *txq,
 int t4_sge_init(struct adapter *adap);
 void t4_sge_start(struct adapter *adap);
 void t4_sge_stop(struct adapter *adap);
+int t4_sge_eth_txq_egress_update(struct adapter *adap, struct sge_eth_txq *q,
+				 int maxreclaim);
 void cxgb4_set_ethtool_ops(struct net_device *netdev);
 int cxgb4_write_rss(const struct port_info *pi, const u16 *queues);
 enum cpl_tx_tnl_lso_type cxgb_encap_offload_supported(struct sk_buff *skb);
@@ -1821,6 +1827,8 @@ int t4_ctrl_eq_free(struct adapter *adap, unsigned int mbox, unsigned int pf,
 int t4_ofld_eq_free(struct adapter *adap, unsigned int mbox, unsigned int pf,
 		    unsigned int vf, unsigned int eqid);
 int t4_sge_ctxt_flush(struct adapter *adap, unsigned int mbox, int ctxt_type);
+int t4_read_sge_dbqtimers(struct adapter *adap, unsigned int ndbqtimers,
+			  u16 *dbqtimers);
 void t4_handle_get_port_info(struct port_info *pi, const __be64 *rpl);
 int t4_update_port_info(struct port_info *pi);
 int t4_get_link_params(struct port_info *pi, unsigned int *link_okp,
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index adf75d1..bdd11a6 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -575,7 +575,7 @@ static int fwevtq_handler(struct sge_rspq *q, const __be64 *rsp,
 			struct sge_eth_txq *eq;
 
 			eq = container_of(txq, struct sge_eth_txq, q);
-			netif_tx_wake_queue(eq->txq);
+			t4_sge_eth_txq_egress_update(q->adap, eq, -1);
 		} else {
 			struct sge_uld_txq *oq;
 
@@ -933,10 +933,13 @@ static int setup_sge_queues(struct adapter *adap)
 			q->rspq.idx = j;
 			memset(&q->stats, 0, sizeof(q->stats));
 		}
-		for (j = 0; j < pi->nqsets; j++, t++) {
+
+		q = &s->ethrxq[pi->first_qset];
+		for (j = 0; j < pi->nqsets; j++, t++, q++) {
 			err = t4_sge_alloc_eth_txq(adap, t, dev,
 					netdev_get_tx_queue(dev, j),
-					s->fw_evtq.cntxt_id);
+					q->rspq.cntxt_id,
+					!!(adap->flags & SGE_DBQ_TIMER));
 			if (err)
 				goto freeout;
 		}
@@ -958,7 +961,7 @@ static int setup_sge_queues(struct adapter *adap)
 	if (!is_t4(adap->params.chip)) {
 		err = t4_sge_alloc_eth_txq(adap, &s->ptptxq, adap->port[0],
 					   netdev_get_tx_queue(adap->port[0], 0)
-					   , s->fw_evtq.cntxt_id);
+					   , s->fw_evtq.cntxt_id, false);
 		if (err)
 			goto freeout;
 	}
@@ -4325,6 +4328,14 @@ static int adap_init0(struct adapter *adap)
 	if (ret < 0)
 		goto bye;
 
+	/* Grab the SGE Doorbell Queue Timer values.  If successful, that
+	 * indicates that the Firmware and Hardware support this.
+	 */
+	ret = t4_read_sge_dbqtimers(adap, ARRAY_SIZE(adap->sge.dbqtimer_val),
+				    adap->sge.dbqtimer_val);
+	if (!ret)
+		adap->flags |= SGE_DBQ_TIMER;
+
 	if (is_bypass_device(adap->pdev->device))
 		adap->params.bypass = 1;
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c
index fc0bc64..f18493f 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c
@@ -80,9 +80,10 @@
  * Max number of Tx descriptors we clean up at a time.  Should be modest as
  * freeing skbs isn't cheap and it happens while holding locks.  We just need
  * to free packets faster than they arrive, we eventually catch up and keep
- * the amortized cost reasonable.  Must be >= 2 * TXQ_STOP_THRES.
+ * the amortized cost reasonable.  Must be >= 2 * TXQ_STOP_THRES.  It should
+ * also match the CIDX Flush Threshold.
  */
-#define MAX_TX_RECLAIM 16
+#define MAX_TX_RECLAIM 32
 
 /*
  * Max number of Rx buffers we replenish at a time.  Again keep this modest,
@@ -401,31 +402,52 @@ static inline int reclaimable(const struct sge_txq *q)
 }
 
 /**
- *	cxgb4_reclaim_completed_tx - reclaims completed Tx descriptors
+ *	reclaim_completed_tx - reclaims completed TX Descriptors
  *	@adap: the adapter
  *	@q: the Tx queue to reclaim completed descriptors from
+ *	@maxreclaim: the maximum number of TX Descriptors to reclaim or -1
  *	@unmap: whether the buffers should be unmapped for DMA
  *
- *	Reclaims Tx descriptors that the SGE has indicated it has processed,
- *	and frees the associated buffers if possible.  Called with the Tx
- *	queue locked.
+ *	Reclaims Tx Descriptors that the SGE has indicated it has processed,
+ *	and frees the associated buffers if possible.  If @max == -1, then
+ *	we'll use a defaiult maximum.  Called with the TX Queue locked.
  */
-inline void cxgb4_reclaim_completed_tx(struct adapter *adap, struct sge_txq *q,
-					bool unmap)
+static inline int reclaim_completed_tx(struct adapter *adap, struct sge_txq *q,
+				       int maxreclaim, bool unmap)
 {
-	int avail = reclaimable(q);
+	int reclaim = reclaimable(q);
 
-	if (avail) {
+	if (reclaim) {
 		/*
 		 * Limit the amount of clean up work we do at a time to keep
 		 * the Tx lock hold time O(1).
 		 */
-		if (avail > MAX_TX_RECLAIM)
-			avail = MAX_TX_RECLAIM;
+		if (maxreclaim < 0)
+			maxreclaim = MAX_TX_RECLAIM;
+		if (reclaim > maxreclaim)
+			reclaim = maxreclaim;
 
-		free_tx_desc(adap, q, avail, unmap);
-		q->in_use -= avail;
+		free_tx_desc(adap, q, reclaim, unmap);
+		q->in_use -= reclaim;
 	}
+
+	return reclaim;
+}
+
+/**
+ *	cxgb4_reclaim_completed_tx - reclaims completed Tx descriptors
+ *	@adap: the adapter
+ *	@q: the Tx queue to reclaim completed descriptors from
+ *	@unmap: whether the buffers should be unmapped for DMA
+ *
+ *	Reclaims Tx descriptors that the SGE has indicated it has processed,
+ *	and frees the associated buffers if possible.  Called with the Tx
+ *	queue locked.
+ */
+void cxgb4_reclaim_completed_tx(struct adapter *adap, struct sge_txq *q,
+				bool unmap)
+{
+	(void)reclaim_completed_tx(adap, q, -1, unmap);
 }
 EXPORT_SYMBOL(cxgb4_reclaim_completed_tx);
 
@@ -1288,6 +1310,44 @@ static inline void t6_fill_tnl_lso(struct sk_buff *skb,
 }
 
 /**
+ *	t4_sge_eth_txq_egress_update - handle Ethernet TX Queue update
+ *	@adap: the adapter
+ *	@eq: the Ethernet TX Queue
+ *	@maxreclaim: the maximum number of TX Descriptors to reclaim or -1
+ *
+ *	We're typically called here to update the state of an Ethernet TX
+ *	Queue with respect to the hardware's progress in consuming the TX
+ *	Work Requests that we've put on that Egress Queue.  This happens
+ *	when we get Egress Queue Update messages and also prophylactically
+ *	in regular timer-based Ethernet TX Queue maintenance.
+ */
+int t4_sge_eth_txq_egress_update(struct adapter *adap, struct sge_eth_txq *eq,
+				 int maxreclaim)
+{
+	struct sge_txq *q = &eq->q;
+	unsigned int reclaimed;
+
+	if (!q->in_use || !__netif_tx_trylock(eq->txq))
+		return 0;
+
+	/* Reclaim pending completed TX Descriptors. */
+	reclaimed = reclaim_completed_tx(adap, &eq->q, maxreclaim, true);
+
+	/* If the TX Queue is currently stopped and there's now more than half
+	 * the queue available, restart it.  Otherwise bail out since the rest
+	 * of what we want do here is with the possibility of shipping any
+	 * currently buffered Coalesced TX Work Request.
+	 */
+	if (netif_tx_queue_stopped(eq->txq) && txq_avail(q) > (q->size / 2)) {
+		netif_tx_wake_queue(eq->txq);
+		eq->q.restarts++;
+	}
+
+	__netif_tx_unlock(eq->txq);
+	return reclaimed;
+}
+
+/**
  *	cxgb4_eth_xmit - add a packet to an Ethernet Tx queue
  *	@skb: the packet
  *	@dev: the egress net device
@@ -1357,7 +1417,7 @@ static netdev_tx_t cxgb4_eth_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 	skb_tx_timestamp(skb);
 
-	cxgb4_reclaim_completed_tx(adap, &q->q, true);
+	reclaim_completed_tx(adap, &q->q, -1, true);
 	cntrl = TXPKT_L4CSUM_DIS_F | TXPKT_IPCSUM_DIS_F;
 
 #ifdef CONFIG_CHELSIO_T4_FCOE
@@ -1400,8 +1460,25 @@ static netdev_tx_t cxgb4_eth_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	wr_mid = FW_WR_LEN16_V(DIV_ROUND_UP(flits, 2));
 	if (unlikely(credits < ETHTXQ_STOP_THRES)) {
+		/* After we're done injecting the Work Request for this
+		 * packet, we'll be below our "stop threshold" so stop the TX
+		 * Queue now and schedule a request for an SGE Egress Queue
+		 * Update message. The queue will get started later on when
+		 * the firmware processes this Work Request and sends us an
+		 * Egress Queue Status Update message indicating that space
+		 * has opened up.
+		 */
 		eth_txq_stop(q);
-		wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F;
+
+		/* If we're using the SGE Doorbell Queue Timer facility, we
+		 * don't need to ask the Firmware to send us Egress Queue CIDX
+		 * Updates: the Hardware will do this automatically.  And
+		 * since we send the Ingress Queue CIDX Updates to the
+		 * corresponding Ethernet Response Queue, we'll get them very
+		 * quickly.
+		 */
+		if (!q->dbqt)
+			wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F;
 	}
 
 	wr = (void *)&q->q.desc[q->q.pidx];
@@ -1671,7 +1748,7 @@ static netdev_tx_t cxgb4_vf_eth_xmit(struct sk_buff *skb,
 	/* Take this opportunity to reclaim any TX Descriptors whose DMA
 	 * transfers have completed.
 	 */
-	cxgb4_reclaim_completed_tx(adapter, &txq->q, true);
+	reclaim_completed_tx(adapter, &txq->q, -1, true);
 
 	/* Calculate the number of flits and TX Descriptors we're going to
 	 * need along with how many TX Descriptors will be left over after
@@ -1715,7 +1792,16 @@ static netdev_tx_t cxgb4_vf_eth_xmit(struct sk_buff *skb,
 		 * has opened up.
 		 */
 		eth_txq_stop(txq);
-		wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F;
+
+		/* If we're using the SGE Doorbell Queue Timer facility, we
+		 * don't need to ask the Firmware to send us Egress Queue CIDX
+		 * Updates: the Hardware will do this automatically.  And
+		 * since we send the Ingress Queue CIDX Updates to the
+		 * corresponding Ethernet Response Queue, we'll get them very
+		 * quickly.
+		 */
+		if (!txq->dbqt)
+			wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F;
 	}
 
 	/* Start filling in our Work Request.  Note that we do _not_ handle
@@ -2794,6 +2880,74 @@ static int t4_tx_hststamp(struct adapter *adapter, struct sk_buff *skb,
 }
 
 /**
+ *	t4_tx_completion_handler - handle CPL_SGE_EGR_UPDATE messages
+ *	@rspq: Ethernet RX Response Queue associated with Ethernet TX Queue
+ *	@rsp: Response Entry pointer into Response Queue
+ *	@gl: Gather List pointer
+ *
+ *	For adapters which support the SGE Doorbell Queue Timer facility,
+ *	we configure the Ethernet TX Queues to send CIDX Updates to the
+ *	Associated Ethernet RX Response Queue with CPL_SGE_EGR_UPDATE
+ *	messages.  This adds a small load to PCIe Link RX bandwidth and,
+ *	potentially, higher CPU Interrupt load, but allows us to respond
+ *	much more quickly to the CIDX Updates.  This is important for
+ *	Upper Layer Software which isn't willing to have a large amount
+ *	of TX Data outstanding before receiving DMA Completions.
+ */
+static void t4_tx_completion_handler(struct sge_rspq *rspq,
+				     const __be64 *rsp,
+				     const struct pkt_gl *gl)
+{
+	u8 opcode = ((const struct rss_header *)rsp)->opcode;
+	struct port_info *pi = netdev_priv(rspq->netdev);
+	struct adapter *adapter = rspq->adap;
+	struct sge *s = &adapter->sge;
+	struct sge_eth_txq *txq;
+
+	/* skip RSS header */
+	rsp++;
+
+	/* FW can send EGR_UPDATEs encapsulated in a CPL_FW4_MSG.
+	 */
+	if (unlikely(opcode == CPL_FW4_MSG &&
+		     ((const struct cpl_fw4_msg *)rsp)->type ==
+							FW_TYPE_RSSCPL)) {
+		rsp++;
+		opcode = ((const struct rss_header *)rsp)->opcode;
+		rsp++;
+	}
+
+	if (unlikely(opcode != CPL_SGE_EGR_UPDATE)) {
+		pr_info("%s: unexpected FW4/CPL %#x on Rx queue\n",
+			__func__, opcode);
+		return;
+	}
+
+	txq = &s->ethtxq[pi->first_qset + rspq->idx];
+
+	/* We've got the Hardware Consumer Index Update in the Egress Update
+	 * message.  If we're using the SGE Doorbell Queue Timer mechanism,
+	 * these Egress Update messages will be our sole CIDX Updates we get
+	 * since we don't want to chew up PCIe bandwidth for both Ingress
+	 * Messages and Status Page writes.  However, The code which manages
+	 * reclaiming successfully DMA'ed TX Work Requests uses the CIDX value
+	 * stored in the Status Page at the end of the TX Queue.  It's easiest
+	 * to simply copy the CIDX Update value from the Egress Update message
+	 * to the Status Page.  Also note that no Endian issues need to be
+	 * considered here since both are Big Endian and we're just copying
+	 * bytes consistently ...
+	 */
+	if (txq->dbqt) {
+		struct cpl_sge_egr_update *egr;
+
+		egr = (struct cpl_sge_egr_update *)rsp;
+		WRITE_ONCE(txq->q.stat->cidx, egr->cidx);
+	}
+
+	t4_sge_eth_txq_egress_update(adapter, txq, -1);
+}
+
+/**
  *	t4_ethrx_handler - process an ingress ethernet packet
  *	@q: the response queue that received the packet
  *	@rsp: the response queue descriptor holding the RX_PKT message
@@ -2816,6 +2970,15 @@ int t4_ethrx_handler(struct sge_rspq *q, const __be64 *rsp,
 	struct port_info *pi;
 	int ret = 0;
 
+	/* If we're looking at TX Queue CIDX Update, handle that separately
+	 * and return.
+	 */
+	if (unlikely((*(u8 *)rsp == CPL_FW4_MSG) ||
+		     (*(u8 *)rsp == CPL_SGE_EGR_UPDATE))) {
+		t4_tx_completion_handler(q, rsp, si);
+		return 0;
+	}
+
 	if (unlikely(*(u8 *)rsp == cpl_trace_pkt))
 		return handle_trace_pkt(q->adap, si);
 
@@ -3289,10 +3452,10 @@ static void sge_rx_timer_cb(struct timer_list *t)
 
 static void sge_tx_timer_cb(struct timer_list *t)
 {
-	unsigned long m;
-	unsigned int i, budget;
 	struct adapter *adap = from_timer(adap, t, sge.tx_timer);
 	struct sge *s = &adap->sge;
+	unsigned long m, period;
+	unsigned int i, budget;
 
 	for (i = 0; i < BITS_TO_LONGS(s->egr_sz); i++)
 		for (m = s->txq_maperr[i]; m; m &= m - 1) {
@@ -3320,29 +3483,29 @@ static void sge_tx_timer_cb(struct timer_list *t)
 	budget = MAX_TIMER_TX_RECLAIM;
 	i = s->ethtxq_rover;
 	do {
-		struct sge_eth_txq *q = &s->ethtxq[i];
-
-		if (q->q.in_use &&
-		    time_after_eq(jiffies, q->txq->trans_start + HZ / 100) &&
-		    __netif_tx_trylock(q->txq)) {
-			int avail = reclaimable(&q->q);
-
-			if (avail) {
-				if (avail > budget)
-					avail = budget;
-
-				free_tx_desc(adap, &q->q, avail, true);
-				q->q.in_use -= avail;
-				budget -= avail;
-			}
-			__netif_tx_unlock(q->txq);
-		}
+		budget -= t4_sge_eth_txq_egress_update(adap, &s->ethtxq[i],
+						       budget);
+		if (!budget)
+			break;
 
 		if (++i >= s->ethqsets)
 			i = 0;
-	} while (budget && i != s->ethtxq_rover);
+	} while (i != s->ethtxq_rover);
 	s->ethtxq_rover = i;
-	mod_timer(&s->tx_timer, jiffies + (budget ? TX_QCHECK_PERIOD : 2));
+
+	if (budget == 0) {
+		/* If we found too many reclaimable packets schedule a timer
+		 * in the near future to continue where we left off.
+		 */
+		period = 2;
+	} else {
+		/* We reclaimed all reclaimable TX Descriptors, so reschedule
+		 * at the normal period.
+		 */
+		period = TX_QCHECK_PERIOD;
+	}
+
+	mod_timer(&s->tx_timer, jiffies + period);
 }
 
 /**
@@ -3421,7 +3584,8 @@ int t4_sge_alloc_rxq(struct adapter *adap, struct sge_rspq *iq, bool fwevtq,
 							:  FW_IQ_IQTYPE_OFLD));
 
 	if (fl) {
-		enum chip_type chip = CHELSIO_CHIP_VERSION(adap->params.chip);
+		unsigned int chip_ver =
+			CHELSIO_CHIP_VERSION(adap->params.chip);
 
 		/* Allocate the ring for the hardware free list (with space
 		 * for its status page) along with the associated software
@@ -3459,10 +3623,10 @@ int t4_sge_alloc_rxq(struct adapter *adap, struct sge_rspq *iq, bool fwevtq,
 		 * the smaller 64-byte value there).
 		 */
 		c.fl0dcaen_to_fl0cidxfthresh =
-			htons(FW_IQ_CMD_FL0FBMIN_V(chip <= CHELSIO_T5 ?
+			htons(FW_IQ_CMD_FL0FBMIN_V(chip_ver <= CHELSIO_T5 ?
 						   FETCHBURSTMIN_128B_X :
-						   FETCHBURSTMIN_64B_X) |
-			      FW_IQ_CMD_FL0FBMAX_V((chip <= CHELSIO_T5) ?
+						   FETCHBURSTMIN_64B_T6_X) |
+			      FW_IQ_CMD_FL0FBMAX_V((chip_ver <= CHELSIO_T5) ?
 						   FETCHBURSTMAX_512B_X :
 						   FETCHBURSTMAX_256B_X));
 		c.fl0size = htons(flsz);
@@ -3584,14 +3748,24 @@ static void init_txq(struct adapter *adap, struct sge_txq *q, unsigned int id)
 	adap->sge.egr_map[id - adap->sge.egr_start] = q;
 }
 
+/**
+ *	t4_sge_alloc_eth_txq - allocate an Ethernet TX Queue
+ *	@adap: the adapter
+ *	@txq: the SGE Ethernet TX Queue to initialize
+ *	@dev: the Linux Network Device
+ *	@netdevq: the corresponding Linux TX Queue
+ *	@iqid: the Ingress Queue to which to deliver CIDX Update messages
+ *	@dbqt: whether this TX Queue will use the SGE Doorbell Queue Timers
+ */
 int t4_sge_alloc_eth_txq(struct adapter *adap, struct sge_eth_txq *txq,
 			 struct net_device *dev, struct netdev_queue *netdevq,
-			 unsigned int iqid)
+			 unsigned int iqid, u8 dbqt)
 {
-	int ret, nentries;
-	struct fw_eq_eth_cmd c;
-	struct sge *s = &adap->sge;
+	unsigned int chip_ver = CHELSIO_CHIP_VERSION(adap->params.chip);
 	struct port_info *pi = netdev_priv(dev);
+	struct sge *s = &adap->sge;
+	struct fw_eq_eth_cmd c;
+	int ret, nentries;
 
 	/* Add status entries */
 	nentries = txq->q.size + s->stat_len / sizeof(struct tx_desc);
@@ -3610,19 +3784,47 @@ int t4_sge_alloc_eth_txq(struct adapter *adap, struct sge_eth_txq *txq,
 			    FW_EQ_ETH_CMD_VFN_V(0));
 	c.alloc_to_len16 = htonl(FW_EQ_ETH_CMD_ALLOC_F |
 				 FW_EQ_ETH_CMD_EQSTART_F | FW_LEN16(c));
-	c.viid_pkd = htonl(FW_EQ_ETH_CMD_AUTOEQUEQE_F |
-			   FW_EQ_ETH_CMD_VIID_V(pi->viid));
+
+	/* For TX Ethernet Queues using the SGE Doorbell Queue Timer
+	 * mechanism, we use Ingress Queue messages for Hardware Consumer
+	 * Index Updates on the TX Queue.  Otherwise we have the Hardware
+	 * write the CIDX Updates into the Status Page at the end of the
+	 * TX Queue.
+	 */
+	c.autoequiqe_to_viid = htonl((dbqt
+				      ? FW_EQ_ETH_CMD_AUTOEQUIQE_F
+				      : FW_EQ_ETH_CMD_AUTOEQUEQE_F) |
+				     FW_EQ_ETH_CMD_VIID_V(pi->viid));
+
 	c.fetchszm_to_iqid =
-		htonl(FW_EQ_ETH_CMD_HOSTFCMODE_V(HOSTFCMODE_STATUS_PAGE_X) |
+		htonl(FW_EQ_ETH_CMD_HOSTFCMODE_V(dbqt
+						 ? HOSTFCMODE_INGRESS_QUEUE_X
+						 : HOSTFCMODE_STATUS_PAGE_X) |
 		      FW_EQ_ETH_CMD_PCIECHN_V(pi->tx_chan) |
 		      FW_EQ_ETH_CMD_FETCHRO_F | FW_EQ_ETH_CMD_IQID_V(iqid));
+
+	/* Note that the CIDX Flush Threshold should match MAX_TX_RECLAIM. */
 	c.dcaen_to_eqsize =
-		htonl(FW_EQ_ETH_CMD_FBMIN_V(FETCHBURSTMIN_64B_X) |
+		htonl(FW_EQ_ETH_CMD_FBMIN_V(chip_ver <= CHELSIO_T5
+					    ? FETCHBURSTMIN_64B_X
+					    : FETCHBURSTMIN_64B_T6_X) |
 		      FW_EQ_ETH_CMD_FBMAX_V(FETCHBURSTMAX_512B_X) |
 		      FW_EQ_ETH_CMD_CIDXFTHRESH_V(CIDXFLUSHTHRESH_32_X) |
 		      FW_EQ_ETH_CMD_EQSIZE_V(nentries));
+
 	c.eqaddr = cpu_to_be64(txq->q.phys_addr);
 
+	/* If we're using the SGE Doorbell Queue Timer mechanism, pass in the
+	 * currently configured Timer Index.  THis can be changed later via an
+	 * ethtool -C tx-usecs {Timer Val} command.  Note that the SGE
+	 * Doorbell Queue mode is currently automatically enabled in the
+	 * Firmware by setting either AUTOEQUEQE or AUTOEQUIQE ...
+	 */
+	if (dbqt)
+		c.timeren_timerix =
+			cpu_to_be32(FW_EQ_ETH_CMD_TIMEREN_F |
+				    FW_EQ_ETH_CMD_TIMERIX_V(txq->dbqtimerix));
+
 	ret = t4_wr_mbox(adap, adap->mbox, &c, sizeof(c), &c);
 	if (ret) {
 		kfree(txq->q.sdesc);
@@ -3639,6 +3841,8 @@ int t4_sge_alloc_eth_txq(struct adapter *adap, struct sge_eth_txq *txq,
 	txq->txq = netdevq;
 	txq->tso = txq->tx_cso = txq->vlan_ins = 0;
 	txq->mapping_err = 0;
+	txq->dbqt = dbqt;
+
 	return 0;
 }
 
@@ -3646,10 +3850,11 @@ int t4_sge_alloc_ctrl_txq(struct adapter *adap, struct sge_ctrl_txq *txq,
 			  struct net_device *dev, unsigned int iqid,
 			  unsigned int cmplqid)
 {
-	int ret, nentries;
-	struct fw_eq_ctrl_cmd c;
-	struct sge *s = &adap->sge;
+	unsigned int chip_ver = CHELSIO_CHIP_VERSION(adap->params.chip);
 	struct port_info *pi = netdev_priv(dev);
+	struct sge *s = &adap->sge;
+	struct fw_eq_ctrl_cmd c;
+	int ret, nentries;
 
 	/* Add status entries */
 	nentries = txq->q.size + s->stat_len / sizeof(struct tx_desc);
@@ -3673,7 +3878,9 @@ int t4_sge_alloc_ctrl_txq(struct adapter *adap, struct sge_ctrl_txq *txq,
 		      FW_EQ_CTRL_CMD_PCIECHN_V(pi->tx_chan) |
 		      FW_EQ_CTRL_CMD_FETCHRO_F | FW_EQ_CTRL_CMD_IQID_V(iqid));
 	c.dcaen_to_eqsize =
-		htonl(FW_EQ_CTRL_CMD_FBMIN_V(FETCHBURSTMIN_64B_X) |
+		htonl(FW_EQ_CTRL_CMD_FBMIN_V(chip_ver <= CHELSIO_T5
+					     ? FETCHBURSTMIN_64B_X
+					     : FETCHBURSTMIN_64B_T6_X) |
 		      FW_EQ_CTRL_CMD_FBMAX_V(FETCHBURSTMAX_512B_X) |
 		      FW_EQ_CTRL_CMD_CIDXFTHRESH_V(CIDXFLUSHTHRESH_32_X) |
 		      FW_EQ_CTRL_CMD_EQSIZE_V(nentries));
@@ -3713,6 +3920,7 @@ int t4_sge_alloc_uld_txq(struct adapter *adap, struct sge_uld_txq *txq,
 			 struct net_device *dev, unsigned int iqid,
 			 unsigned int uld_type)
 {
+	unsigned int chip_ver = CHELSIO_CHIP_VERSION(adap->params.chip);
 	int ret, nentries;
 	struct fw_eq_ofld_cmd c;
 	struct sge *s = &adap->sge;
@@ -3743,7 +3951,9 @@ int t4_sge_alloc_uld_txq(struct adapter *adap, struct sge_uld_txq *txq,
 		      FW_EQ_OFLD_CMD_PCIECHN_V(pi->tx_chan) |
 		      FW_EQ_OFLD_CMD_FETCHRO_F | FW_EQ_OFLD_CMD_IQID_V(iqid));
 	c.dcaen_to_eqsize =
-		htonl(FW_EQ_OFLD_CMD_FBMIN_V(FETCHBURSTMIN_64B_X) |
+		htonl(FW_EQ_OFLD_CMD_FBMIN_V(chip_ver <= CHELSIO_T5
+					     ? FETCHBURSTMIN_64B_X
+					     : FETCHBURSTMIN_64B_T6_X) |
 		      FW_EQ_OFLD_CMD_FBMAX_V(FETCHBURSTMAX_512B_X) |
 		      FW_EQ_OFLD_CMD_CIDXFTHRESH_V(CIDXFLUSHTHRESH_32_X) |
 		      FW_EQ_OFLD_CMD_EQSIZE_V(nentries));
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index c5e5466..27af347 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -6713,6 +6713,47 @@ int t4_sge_ctxt_flush(struct adapter *adap, unsigned int mbox, int ctxt_type)
 }
 
 /**
+ *	t4_read_sge_dbqtimers - reag SGE Doorbell Queue Timer values
+ *	@adap - the adapter
+ *	@ndbqtimers: size of the provided SGE Doorbell Queue Timer table
+ *	@dbqtimers: SGE Doorbell Queue Timer table
+ *
+ *	Reads the SGE Doorbell Queue Timer values into the provided table.
+ *	Returns 0 on success (Firmware and Hardware support this feature),
+ *	an error on failure.
+ */
+int t4_read_sge_dbqtimers(struct adapter *adap, unsigned int ndbqtimers,
+			  u16 *dbqtimers)
+{
+	int ret, dbqtimerix;
+
+	ret = 0;
+	dbqtimerix = 0;
+	while (dbqtimerix < ndbqtimers) {
+		int nparams, param;
+		u32 params[7], vals[7];
+
+		nparams = ndbqtimers - dbqtimerix;
+		if (nparams > ARRAY_SIZE(params))
+			nparams = ARRAY_SIZE(params);
+
+		for (param = 0; param < nparams; param++)
+			params[param] =
+			  (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) |
+			   FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_DBQ_TIMER) |
+			   FW_PARAMS_PARAM_Y_V(dbqtimerix + param));
+		ret = t4_query_params(adap, adap->mbox, adap->pf, 0,
+				      nparams, params, vals);
+		if (ret)
+			break;
+
+		for (param = 0; param < nparams; param++)
+			dbqtimers[dbqtimerix++] = vals[param];
+	}
+	return ret;
+}
+
+/**
  *      t4_fw_hello - establish communication with FW
  *      @adap: the adapter
  *      @mbox: mailbox to use for the FW command
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.h b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.h
index 361d503..002fc62 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.h
@@ -91,6 +91,7 @@ enum {
 	SGE_CTXT_SIZE = 24,       /* size of SGE context */
 	SGE_NTIMERS = 6,          /* # of interrupt holdoff timer values */
 	SGE_NCOUNTERS = 4,        /* # of interrupt packet counter values */
+	SGE_NDBQTIMERS = 8,       /* # of Doorbell Queue Timer values */
 	SGE_MAX_IQ_SIZE = 65520,
 
 	SGE_TIMER_RSTRT_CNTR = 6, /* restart RX packet threshold counter */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_values.h b/drivers/net/ethernet/chelsio/cxgb4/t4_values.h
index f6558cb..eb1aa82 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_values.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_values.h
@@ -71,12 +71,18 @@
 #define FETCHBURSTMIN_64B_X		2
 #define FETCHBURSTMIN_128B_X		3
 
+/* T6 and later use a single-bit encoding for FetchBurstMin */
+#define FETCHBURSTMIN_64B_T6_X		0
+#define FETCHBURSTMIN_128B_T6_X		1
+
 #define FETCHBURSTMAX_256B_X		2
 #define FETCHBURSTMAX_512B_X		3
 
+#define HOSTFCMODE_INGRESS_QUEUE_X	1
 #define HOSTFCMODE_STATUS_PAGE_X	2
 
 #define CIDXFLUSHTHRESH_32_X		5
+#define CIDXFLUSHTHRESH_128_X		7
 
 #define UPDATEDELIVERY_INTERRUPT_X	1
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
index 1d9b3e1..631f166 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
@@ -1254,6 +1254,8 @@ enum fw_params_param_dev {
 	FW_PARAMS_PARAM_DEV_RDMA_WRITE_WITH_IMM = 0x21,
 	FW_PARAMS_PARAM_DEV_RI_WRITE_CMPL_WR    = 0x24,
 	FW_PARAMS_PARAM_DEV_OPAQUE_VIID_SMT_EXTN = 0x27,
+	FW_PARAMS_PARAM_DEV_DBQ_TIMER	= 0x29,
+	FW_PARAMS_PARAM_DEV_DBQ_TIMERTICK = 0x2A,
 };
 
 /*
@@ -1322,6 +1324,7 @@ enum fw_params_param_dmaq {
 	FW_PARAMS_PARAM_DMAQ_EQ_CMPLIQID_CTRL = 0x11,
 	FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH = 0x12,
 	FW_PARAMS_PARAM_DMAQ_EQ_DCBPRIO_ETH = 0x13,
+	FW_PARAMS_PARAM_DMAQ_EQ_TIMERIX	= 0x15,
 	FW_PARAMS_PARAM_DMAQ_CONM_CTXT = 0x20,
 };
 
@@ -1751,8 +1754,8 @@ struct fw_eq_eth_cmd {
 	__be32 fetchszm_to_iqid;
 	__be32 dcaen_to_eqsize;
 	__be64 eqaddr;
-	__be32 viid_pkd;
-	__be32 r8_lo;
+	__be32 autoequiqe_to_viid;
+	__be32 timeren_timerix;
 	__be64 r9;
 };
 
@@ -1847,6 +1850,10 @@ struct fw_eq_eth_cmd {
 #define FW_EQ_ETH_CMD_EQSIZE_S		0
 #define FW_EQ_ETH_CMD_EQSIZE_V(x)	((x) << FW_EQ_ETH_CMD_EQSIZE_S)
 
+#define FW_EQ_ETH_CMD_AUTOEQUIQE_S	31
+#define FW_EQ_ETH_CMD_AUTOEQUIQE_V(x)	((x) << FW_EQ_ETH_CMD_AUTOEQUIQE_S)
+#define FW_EQ_ETH_CMD_AUTOEQUIQE_F	FW_EQ_ETH_CMD_AUTOEQUIQE_V(1U)
+
 #define FW_EQ_ETH_CMD_AUTOEQUEQE_S	30
 #define FW_EQ_ETH_CMD_AUTOEQUEQE_V(x)	((x) << FW_EQ_ETH_CMD_AUTOEQUEQE_S)
 #define FW_EQ_ETH_CMD_AUTOEQUEQE_F	FW_EQ_ETH_CMD_AUTOEQUEQE_V(1U)
@@ -1854,6 +1861,19 @@ struct fw_eq_eth_cmd {
 #define FW_EQ_ETH_CMD_VIID_S	16
 #define FW_EQ_ETH_CMD_VIID_V(x)	((x) << FW_EQ_ETH_CMD_VIID_S)
 
+#define FW_EQ_ETH_CMD_TIMEREN_S		3
+#define FW_EQ_ETH_CMD_TIMEREN_M		0x1
+#define FW_EQ_ETH_CMD_TIMEREN_V(x)	((x) << FW_EQ_ETH_CMD_TIMEREN_S)
+#define FW_EQ_ETH_CMD_TIMEREN_G(x)	\
+    (((x) >> FW_EQ_ETH_CMD_TIMEREN_S) & FW_EQ_ETH_CMD_TIMEREN_M)
+#define FW_EQ_ETH_CMD_TIMEREN_F	FW_EQ_ETH_CMD_TIMEREN_V(1U)
+
+#define FW_EQ_ETH_CMD_TIMERIX_S		0
+#define FW_EQ_ETH_CMD_TIMERIX_M		0x7
+#define FW_EQ_ETH_CMD_TIMERIX_V(x)	((x) << FW_EQ_ETH_CMD_TIMERIX_S)
+#define FW_EQ_ETH_CMD_TIMERIX_G(x)	\
+    (((x) >> FW_EQ_ETH_CMD_TIMERIX_S) & FW_EQ_ETH_CMD_TIMERIX_M)
+
 struct fw_eq_ctrl_cmd {
 	__be32 op_to_vfn;
 	__be32 alloc_to_len16;
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
index 1d534f0..11d2ba0 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
@@ -2268,7 +2268,7 @@ int t4vf_sge_alloc_rxq(struct adapter *adapter, struct sge_rspq *rspq,
 	cmd.iqaddr = cpu_to_be64(rspq->phys_addr);
 
 	if (fl) {
-		enum chip_type chip =
+		unsigned int chip_ver =
 			CHELSIO_CHIP_VERSION(adapter->params.chip);
 		/*
 		 * Allocate the ring for the hardware free list (with space
@@ -2319,10 +2319,10 @@ int t4vf_sge_alloc_rxq(struct adapter *adapter, struct sge_rspq *rspq,
 		 */
 		cmd.fl0dcaen_to_fl0cidxfthresh =
 			cpu_to_be16(
-				FW_IQ_CMD_FL0FBMIN_V(chip <= CHELSIO_T5 ?
-						     FETCHBURSTMIN_128B_X :
-						     FETCHBURSTMIN_64B_X) |
-				FW_IQ_CMD_FL0FBMAX_V((chip <= CHELSIO_T5) ?
+				FW_IQ_CMD_FL0FBMIN_V(chip_ver <= CHELSIO_T5
+						     ? FETCHBURSTMIN_128B_X
+						     : FETCHBURSTMIN_64B_T6_X) |
+				FW_IQ_CMD_FL0FBMAX_V((chip_ver <= CHELSIO_T5) ?
 						     FETCHBURSTMAX_512B_X :
 						     FETCHBURSTMAX_256B_X));
 		cmd.fl0size = cpu_to_be16(flsz);
@@ -2411,10 +2411,11 @@ int t4vf_sge_alloc_eth_txq(struct adapter *adapter, struct sge_eth_txq *txq,
 			   struct net_device *dev, struct netdev_queue *devq,
 			   unsigned int iqid)
 {
+	unsigned int chip_ver = CHELSIO_CHIP_VERSION(adapter->params.chip);
+	struct port_info *pi = netdev_priv(dev);
+	struct fw_eq_eth_cmd cmd, rpl;
 	struct sge *s = &adapter->sge;
 	int ret, nentries;
-	struct fw_eq_eth_cmd cmd, rpl;
-	struct port_info *pi = netdev_priv(dev);
 
 	/*
 	 * Calculate the size of the hardware TX Queue (including the Status
@@ -2448,17 +2449,19 @@ int t4vf_sge_alloc_eth_txq(struct adapter *adapter, struct sge_eth_txq *txq,
 	cmd.alloc_to_len16 = cpu_to_be32(FW_EQ_ETH_CMD_ALLOC_F |
 					 FW_EQ_ETH_CMD_EQSTART_F |
 					 FW_LEN16(cmd));
-	cmd.viid_pkd = cpu_to_be32(FW_EQ_ETH_CMD_AUTOEQUEQE_F |
-				   FW_EQ_ETH_CMD_VIID_V(pi->viid));
+	cmd.autoequiqe_to_viid = cpu_to_be32(FW_EQ_ETH_CMD_AUTOEQUEQE_F |
+					     FW_EQ_ETH_CMD_VIID_V(pi->viid));
 	cmd.fetchszm_to_iqid =
 		cpu_to_be32(FW_EQ_ETH_CMD_HOSTFCMODE_V(SGE_HOSTFCMODE_STPG) |
 			    FW_EQ_ETH_CMD_PCIECHN_V(pi->port_id) |
 			    FW_EQ_ETH_CMD_IQID_V(iqid));
 	cmd.dcaen_to_eqsize =
-		cpu_to_be32(FW_EQ_ETH_CMD_FBMIN_V(SGE_FETCHBURSTMIN_64B) |
-			    FW_EQ_ETH_CMD_FBMAX_V(SGE_FETCHBURSTMAX_512B) |
+		cpu_to_be32(FW_EQ_ETH_CMD_FBMIN_V(chip_ver <= CHELSIO_T5
+						  ? FETCHBURSTMIN_64B_X
+						  : FETCHBURSTMIN_64B_T6_X) |
+			    FW_EQ_ETH_CMD_FBMAX_V(FETCHBURSTMAX_512B_X) |
 			    FW_EQ_ETH_CMD_CIDXFTHRESH_V(
-						SGE_CIDXFLUSHTHRESH_32) |
+						CIDXFLUSHTHRESH_32_X) |
 			    FW_EQ_ETH_CMD_EQSIZE_V(nentries));
 	cmd.eqaddr = cpu_to_be64(txq->q.phys_addr);
 
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH net-next v2 2/2] cxgb4: Add capability to get/set SGE Doorbell Queue Timer Tick
From: Vishal Kulkarni @ 2019-02-14 12:49 UTC (permalink / raw)
  To: netdev, davem; +Cc: nirranjan, indranil, dt, Vishal Kulkarni
In-Reply-To: <1550148556-3531-1-git-send-email-vishal@chelsio.com>

This patch gets/sets SGE Doorbell Queue timer ticks via ethtool

Original work by: Casey Leedom <leedom@chelsio.com>

Signed-off-by: Vishal Kulkarni <vishal@chelsio.com>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h         |   1 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c | 189 ++++++++++++++++++++-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c    |  14 +-
 3 files changed, 198 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 68d0d45..b7b0eb1 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -819,6 +819,7 @@ struct sge {
 	u16 nqs_per_uld;	    /* # of Rx queues per ULD */
 	u16 timer_val[SGE_NTIMERS];
 	u8 counter_val[SGE_NCOUNTERS];
+	u16 dbqtimer_tick;
 	u16 dbqtimer_val[SGE_NDBQTIMERS];
 	u32 fl_pg_order;            /* large page allocation size */
 	u32 stat_len;               /* length of status page at ring end */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
index 7960435..65b8dc7 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
@@ -932,11 +932,190 @@ static int get_adaptive_rx_setting(struct net_device *dev)
 	return q->rspq.adaptive_rx;
 }
 
-static int set_coalesce(struct net_device *dev, struct ethtool_coalesce *c)
+/* Return the current global Adapter SGE Doorbell Queue Timer Tick for all
+ * Ethernet TX Queues.
+ */
+static int get_dbqtimer_tick(struct net_device *dev)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adap = pi->adapter;
+
+	if (!(adap->flags & SGE_DBQ_TIMER))
+		return 0;
+
+	return adap->sge.dbqtimer_tick;
+}
+
+/* Return the SGE Doorbell Queue Timer Value for the Ethernet TX Queues
+ * associated with a Network Device.
+ */
+static int get_dbqtimer(struct net_device *dev)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adap = pi->adapter;
+	struct sge_eth_txq *txq;
+
+	txq = &adap->sge.ethtxq[pi->first_qset];
+
+	if (!(adap->flags & SGE_DBQ_TIMER))
+		return 0;
+
+	/* all of the TX Queues use the same Timer Index */
+	return adap->sge.dbqtimer_val[txq->dbqtimerix];
+}
+
+/* Set the global Adapter SGE Doorbell Queue Timer Tick for all Ethernet TX
+ * Queues.  This is the fundamental "Tick" that sets the scale of values which
+ * can be used.  Individual Ethernet TX Queues index into a relatively small
+ * array of Tick Multipliers.  Changing the base Tick will thus change all of
+ * the resulting Timer Values associated with those multipliers for all
+ * Ethernet TX Queues.
+ */
+static int set_dbqtimer_tick(struct net_device *dev, int usecs)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adap = pi->adapter;
+	struct sge *s = &adap->sge;
+	u32 param, val;
+	int ret;
+
+	if (!(adap->flags & SGE_DBQ_TIMER))
+		return 0;
+
+	/* return early if it's the same Timer Tick we're already using */
+	if (s->dbqtimer_tick == usecs)
+		return 0;
+
+	/* attempt to set the new Timer Tick value */
+	param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) |
+		 FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_DBQ_TIMERTICK));
+	val = usecs;
+	ret = t4_set_params(adap, adap->mbox, adap->pf, 0, 1, &param, &val);
+	if (ret)
+		return ret;
+	s->dbqtimer_tick = usecs;
+
+	/* if successful, reread resulting dependent Timer values */
+	ret = t4_read_sge_dbqtimers(adap, ARRAY_SIZE(s->dbqtimer_val),
+				    s->dbqtimer_val);
+	return ret;
+}
+
+/* Set the SGE Doorbell Queue Timer Value for the Ethernet TX Queues
+ * associated with a Network Device.  There is a relatively small array of
+ * possible Timer Values so we need to pick the closest value available.
+ */
+static int set_dbqtimer(struct net_device *dev, int usecs)
+{
+	int qix, timerix, min_timerix, delta, min_delta;
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adap = pi->adapter;
+	struct sge *s = &adap->sge;
+	struct sge_eth_txq *txq;
+	u32 param, val;
+	int ret;
+
+	if (!(adap->flags & SGE_DBQ_TIMER))
+		return 0;
+
+	/* Find the SGE Doorbell Timer Value that's closest to the requested
+	 * value.
+	 */
+	min_delta = INT_MAX;
+	min_timerix = 0;
+	for (timerix = 0; timerix < ARRAY_SIZE(s->dbqtimer_val); timerix++) {
+		delta = s->dbqtimer_val[timerix] - usecs;
+		if (delta < 0)
+			delta = -delta;
+		if (delta < min_delta) {
+			min_delta = delta;
+			min_timerix = timerix;
+		}
+	}
+
+	/* Return early if it's the same Timer Index we're already using.
+	 * We use the same Timer Index for all of the TX Queues for an
+	 * interface so it's only necessary to check the first one.
+	 */
+	txq = &s->ethtxq[pi->first_qset];
+	if (txq->dbqtimerix == min_timerix)
+		return 0;
+
+	for (qix = 0; qix < pi->nqsets; qix++, txq++) {
+		if (adap->flags & FULL_INIT_DONE) {
+			param =
+			 (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DMAQ) |
+			  FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DMAQ_EQ_TIMERIX) |
+			  FW_PARAMS_PARAM_YZ_V(txq->q.cntxt_id));
+			val = min_timerix;
+			ret = t4_set_params(adap, adap->mbox, adap->pf, 0,
+					    1, &param, &val);
+			if (ret)
+				return ret;
+		}
+		txq->dbqtimerix = min_timerix;
+	}
+	return 0;
+}
+
+/* Set the global Adapter SGE Doorbell Queue Timer Tick for all Ethernet TX
+ * Queues and the Timer Value for the Ethernet TX Queues associated with a
+ * Network Device.  Since changing the global Tick changes all of the
+ * available Timer Values, we need to do this first before selecting the
+ * resulting closest Timer Value.  Moreover, since the Tick is global,
+ * changing it affects the Timer Values for all Network Devices on the
+ * adapter.  So, before changing the Tick, we grab all of the current Timer
+ * Values for other Network Devices on this Adapter and then attempt to select
+ * new Timer Values which are close to the old values ...
+ */
+static int set_dbqtimer_tickval(struct net_device *dev,
+				int tick_usecs, int timer_usecs)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adap = pi->adapter;
+	int timer[MAX_NPORTS];
+	unsigned int port;
+	int ret;
+
+	/* Grab the other adapter Network Interface current timers and fill in
+	 * the new one for this Network Interface.
+	 */
+	for_each_port(adap, port)
+		if (port == pi->port_id)
+			timer[port] = timer_usecs;
+		else
+			timer[port] = get_dbqtimer(adap->port[port]);
+
+	/* Change the global Tick first ... */
+	ret = set_dbqtimer_tick(dev, tick_usecs);
+	if (ret)
+		return ret;
+
+	/* ... and then set all of the Network Interface Timer Values ... */
+	for_each_port(adap, port) {
+		ret = set_dbqtimer(adap->port[port], timer[port]);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int set_coalesce(struct net_device *dev,
+			struct ethtool_coalesce *coalesce)
 {
-	set_adaptive_rx_setting(dev, c->use_adaptive_rx_coalesce);
-	return set_rx_intr_params(dev, c->rx_coalesce_usecs,
-				  c->rx_max_coalesced_frames);
+	int ret;
+
+	set_adaptive_rx_setting(dev, coalesce->use_adaptive_rx_coalesce);
+
+	ret = set_rx_intr_params(dev, coalesce->rx_coalesce_usecs,
+				 coalesce->rx_max_coalesced_frames);
+	if (ret)
+		return ret;
+
+	return set_dbqtimer_tickval(dev,
+				    coalesce->tx_coalesce_usecs_irq,
+				    coalesce->tx_coalesce_usecs);
 }
 
 static int get_coalesce(struct net_device *dev, struct ethtool_coalesce *c)
@@ -949,6 +1128,8 @@ static int get_coalesce(struct net_device *dev, struct ethtool_coalesce *c)
 	c->rx_max_coalesced_frames = (rq->intr_params & QINTR_CNT_EN_F) ?
 		adap->sge.counter_val[rq->pktcnt_idx] : 0;
 	c->use_adaptive_rx_coalesce = get_adaptive_rx_setting(dev);
+	c->tx_coalesce_usecs_irq = get_dbqtimer_tick(dev);
+	c->tx_coalesce_usecs = get_dbqtimer(dev);
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index bdd11a6..bcbac24 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -4331,8 +4331,18 @@ static int adap_init0(struct adapter *adap)
 	/* Grab the SGE Doorbell Queue Timer values.  If successful, that
 	 * indicates that the Firmware and Hardware support this.
 	 */
-	ret = t4_read_sge_dbqtimers(adap, ARRAY_SIZE(adap->sge.dbqtimer_val),
-				    adap->sge.dbqtimer_val);
+	params[0] = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) |
+		    FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_DBQ_TIMERTICK));
+	ret = t4_query_params(adap, adap->mbox, adap->pf, 0,
+			      1, params, val);
+
+	if (!ret) {
+		adap->sge.dbqtimer_tick = val[0];
+		ret = t4_read_sge_dbqtimers(adap,
+					    ARRAY_SIZE(adap->sge.dbqtimer_val),
+					    adap->sge.dbqtimer_val);
+	}
+
 	if (!ret)
 		adap->flags |= SGE_DBQ_TIMER;
 
-- 
1.8.3.1


^ permalink raw reply related

* Re: [PATCH net-next 7/9] net: bridge: Stop calling switchdev_port_attr_get()
From: Ido Schimmel @ 2019-02-14 13:02 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: netdev@vger.kernel.org, David S. Miller, open list,
	open list:STAGING SUBSYSTEM, moderated list:ETHERNET BRIDGE,
	Jiri Pirko, andrew@lunn.ch, vivien.didelot@gmail.com
In-Reply-To: <20190214112002.GA8698@splinter>

On Thu, Feb 14, 2019 at 01:20:02PM +0200, Ido Schimmel wrote:
> On Wed, Feb 13, 2019 at 02:06:36PM -0800, Florian Fainelli wrote:
> > Now that all switchdev drivers have been converted to checking the
> > bridge port flags during the prepare phase of the
> > switchdev_port_attr_set() when the process
> > SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS, we can avoid calling
> > switchdev_port_attr_get() with
> > SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT.
> > 
> > Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
> > ---
> >  net/bridge/br_switchdev.c | 16 +++++++---------
> >  1 file changed, 7 insertions(+), 9 deletions(-)
> > 
> > diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
> > index db9e8ab96d48..8f88f8a1a7fa 100644
> > --- a/net/bridge/br_switchdev.c
> > +++ b/net/bridge/br_switchdev.c
> > @@ -64,29 +64,27 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p,
> >  {
> >  	struct switchdev_attr attr = {
> >  		.orig_dev = p->dev,
> > -		.id = SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT,
> > +		.id = SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS,
> > +		.u.brport_flags = flags,
> >  	};
> >  	int err;
> >  
> >  	if (mask & ~BR_PORT_FLAGS_HW_OFFLOAD)
> >  		return 0;
> >  
> > -	err = switchdev_port_attr_get(p->dev, &attr);
> > -	if (err == -EOPNOTSUPP)
> > -		return 0;
> > -	if (err)
> > +	err = switchdev_port_attr_set(p->dev, &attr);
> > +	if (err && err != -EOPNOTSUPP)
> >  		return err;
> >  
> > -	/* Check if specific bridge flag attribute offload is supported */
> > -	if (!(attr.u.brport_flags_support & mask)) {
> > +	if (err == -EOPNOTSUPP) {
> >  		br_warn(p->br, "bridge flag offload is not supported %u(%s)\n",
> >  			(unsigned int)p->port_no, p->dev->name);
> > -		return -EOPNOTSUPP;
> > +		return err;
> >  	}
> 
> I see that you return -EOPNOTSUPP from drivers in case of unsupported
> flags. I believe this is problematic (I'll test soon). The same return
> code is used by:
> 
> 1. Switch drivers to indicate unsupported flags
> 2. switchdev code to indicate unsupported netdev (no switchdev ops)
> 
> I guess that with this patch any attempt to set bridge port flags on
> veth/dummy device will result in an error.

Yea, that's the case. You can test with
tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh and other
bridge-related tests we have there.

Another problem is that during PORT_PRE_BRIDGE_FLAGS you pass 'flags'
and not 'mask'. This breaks mlxsw (and probably others as well) given
BR_BCAST_FLOOD is set by default.

> 
> >  
> >  	attr.id = SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS;
> >  	attr.flags = SWITCHDEV_F_DEFER;
> > -	attr.u.brport_flags = flags;
> > +
> >  	err = switchdev_port_attr_set(p->dev, &attr);
> >  	if (err) {
> >  		br_warn(p->br, "error setting offload flag on port %u(%s)\n",
> > -- 
> > 2.17.1
> > 

^ permalink raw reply

* [PATCH net] i40e: fix XDP_REDIRECT/XDP xmit ring cleanup race
From: Björn Töpel @ 2019-02-14 13:03 UTC (permalink / raw)
  To: intel-wired-lan
  Cc: Björn Töpel, magnus.karlsson, magnus.karlsson, brouer,
	netdev, maciej.fijalkowski

From: Björn Töpel <bjorn.topel@intel.com>

When the driver clears the XDP xmit ring due to re-configuration or
teardown, in-progress ndo_xdp_xmit must be taken into consideration.

The ndo_xdp_xmit function is typically called from a napi context that
the driver does not control. Therefore, we must be careful not to
clear the XDP ring, while the call is on-going. This patch adds a
synchronize_rcu() to wait for napi(s) (preempt-disable regions and
softirqs), prior clearing the queue. Further, the __I40E_CONFIG_BUSY
flag is checked in the ndo_xdp_xmit implementation to avoid touching
the XDP xmit queue during re-configuration.

Fixes: d9314c474d4f ("i40e: add support for XDP_REDIRECT")
Fixes: 123cecd427b6 ("i40e: added queue pair disable/enable functions")
Reported-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 14 ++++++++++++--
 drivers/net/ethernet/intel/i40e/i40e_txrx.c |  4 +++-
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index f52e2c46e6a7..6a54a1e2f4ae 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -6725,8 +6725,13 @@ void i40e_down(struct i40e_vsi *vsi)
 
 	for (i = 0; i < vsi->num_queue_pairs; i++) {
 		i40e_clean_tx_ring(vsi->tx_rings[i]);
-		if (i40e_enabled_xdp_vsi(vsi))
+		if (i40e_enabled_xdp_vsi(vsi)) {
+			/* Make sure that in-progress ndo_xdp_xmit
+			 * calls are completed.
+			 */
+			synchronize_rcu();
 			i40e_clean_tx_ring(vsi->xdp_rings[i]);
+		}
 		i40e_clean_rx_ring(vsi->rx_rings[i]);
 	}
 
@@ -11955,8 +11960,13 @@ static void i40e_queue_pair_reset_stats(struct i40e_vsi *vsi, int queue_pair)
 static void i40e_queue_pair_clean_rings(struct i40e_vsi *vsi, int queue_pair)
 {
 	i40e_clean_tx_ring(vsi->tx_rings[queue_pair]);
-	if (i40e_enabled_xdp_vsi(vsi))
+	if (i40e_enabled_xdp_vsi(vsi)) {
+		/* Make sure that in-progress ndo_xdp_xmit calls are
+		 * completed.
+		 */
+		synchronize_rcu();
 		i40e_clean_tx_ring(vsi->xdp_rings[queue_pair]);
+	}
 	i40e_clean_rx_ring(vsi->rx_rings[queue_pair]);
 }
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index a7e14e98889f..6c97667d20ef 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -3709,6 +3709,7 @@ int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
 	struct i40e_netdev_priv *np = netdev_priv(dev);
 	unsigned int queue_index = smp_processor_id();
 	struct i40e_vsi *vsi = np->vsi;
+	struct i40e_pf *pf = vsi->back;
 	struct i40e_ring *xdp_ring;
 	int drops = 0;
 	int i;
@@ -3716,7 +3717,8 @@ int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
 	if (test_bit(__I40E_VSI_DOWN, vsi->state))
 		return -ENETDOWN;
 
-	if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
+	if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs ||
+	    test_bit(__I40E_CONFIG_BUSY, pf->state))
 		return -ENXIO;
 
 	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
-- 
2.19.1


^ permalink raw reply related

* Re: [RFC PATCH net-next 2/5] net: 8021q: vlan_dev: add vid tag for uc and mc address lists
From: Ivan Khoronzhuk @ 2019-02-14 13:03 UTC (permalink / raw)
  To: Florian Fainelli; +Cc: davem, linux-omap, netdev, linux-kernel, jiri, andrew
In-Reply-To: <BF485558-C29E-4F25-84B5-4C6C80BDF337@gmail.com>

On Wed, Feb 13, 2019 at 08:49:39PM -0800, Florian Fainelli wrote:
>
>
>On February 13, 2019 8:17:16 AM PST, Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org> wrote:
>>On Tue, Jan 22, 2019 at 03:12:41PM +0200, Ivan Khoronzhuk wrote:
>>>On Mon, Jan 21, 2019 at 03:37:41PM -0800, Florian Fainelli wrote:
>>>>On 12/4/18 3:42 PM, Ivan Khoronzhuk wrote:
>>>>>On Tue, Dec 04, 2018 at 11:49:27AM -0800, Florian Fainelli wrote:
>>>
>>>[...]
>>>
>>>>
>>>>Ivan, based on the recent submission I copied you on [1], it sounds
>>like
>>>>we want to move ahead with your proposal to extend netdev_hw_addr
>>with a
>>>>vid member.
>>>>
>>>>On second thought, your approach is good and if we enclose the vid
>>>>member within an #if IS_ENABLED(CONFIG_VLAN)8021Q) we should be good
>>for
>>>>most foreseeable use cases, if not, we can always introduce a
>>variable
>>>>size/defined context in the future.
>>>>
>>>>Can you resubmit this patch series as non-RFC in the next few days so
>>I
>>>>can also repost mine [1] and take advantage of these changes for
>>>>multicast over VLAN when VLAN filtering is globally enabled on the
>>device.
>>>>
>>>>[1]: https://www.spinics.net/lists/netdev/msg544722.html
>>>>
>>>>Thanks!
>>>
>>>Yes, sure. I can start to do that in several days.
>>>Just a little busy right now.
>>>
>>>Just before doing this, maybe some comments could be added as it has
>>more
>>>attention now. Meanwhile I can send alternative variant but based on
>>>real dev splitting addresses between vlans. In this approach it leaves
>>address
>>>space w/o vid extension but requires more changes to vlan core.
>>Drawback here
>>>that to change one address alg traverses all related vlan addresses,
>>it can be
>>>cpu/time wasteful, if it's done regularly, but saves memory....
>>>
>>>Basically it's implemented locally in cpsw and requires more changes
>>to move
>>>it as some vlan core auxiliary functions to be reused. But it can work
>>only
>>>with vlans directly on top of real dev, which is fixable.
>>>
>>>Core function here:
>>>__hw_addr_ref_sync_dev
>>>it is called only for address the link of which was
>>increased/decreased, thus
>>>update made only on one address, comparing it for every vlan dev.
>>>
>>>It was added with this patch:
>>>[1] net: core: dev_addr_lists: add auxiliary func to handle reference
>>>address update e7946760de5852f32
>>>
>>>And used by this patch:
>>>[2] net: ethernet: ti: cpsw: fix vlan mcast 15180eca569bfe1d4d
>>>
>>>So, idea is to move [2] to be vlan core auxiliary function to be
>>reused
>>>by NIC drivers.
>>>
>>>But potentially it can bring a little more changes I assume:
>>>
>>>1) add priv_flag |= IFF_IV_FLT (independent vlan filtering). It allows
>>to reuse
>>>this flag for farther changes, probably for per vlan allmulti or so.
>>>
>>>2) real dev has to have complete list for vlans, not only their vids,
>>but also
>>>all vlandevs in device chain above it. So changes in add_vid can be
>>required.
>>>Vlan core can assign vlan dev pointer to real device only after it's
>>completely
>>>initialized. And for propagation reasons it requires every device in
>>>infrastructure to be aware. That seems doable, but depends not only on
>>me.
>>>
>>>3) Move code from [2] to be auxiliary vlan core API for setting mc and
>>uc.
>>>From this patch only one function is cpsw specific: cpsw_set_mc(). The
>>rest can
>>>be applicable on every NIC supporting IFF_IV_FLT.
>>>
>>>4) Move code from link below to do the same but for uc addresses:
>>>https://git.linaro.org/people/ivan.khoronzhuk/tsn_kernel.git/commit/?h=ucast_vlan_fix&id=ebc88a7d8758759322d9ff88f25f8bac51ce7219
>>>here only one func cpsw specific: cpsw_set_uc()
>>>the rest can be generic.
>>>
>>>As third alternative, we can think about how to reduce memory for
>>addresses by
>>>reusing them or else, but this is as continuation of addr+vid
>>approach, and API
>>>probably would be the same.
>>>
>>>Then all this can be compared for proper decision.
>>
>>
>>Hi Florian,
>>
>>After several more investigations and tries probably better left this
>>idea as is.
>
>Thank you for keeping the thread alive, does that mean you are going to resubmit this patch series as-is (rebased) or are you saying that you are abandoning the idea and leaving the situation the way it is in cpsw?

I will resubmit this one. But:

I have to try one more approach before this.
The idea is to create simple rx flt device tree while mc/us sync.
Then use it at real device to dispatch addresses.

It increases hw_addr struct a little and code base,
But:
- no need to keep linearly all vlan addresses in one address space.
- replicates RX filtering struct of net devices,
  (but not logical tree of netdevs)
- keeps devs info per address.
- no need to change addr lenth and modify existent API
- access at any net dev to above rx flt device structure per address
- potentially can be use not only for vlan devs identification but for
  other rx path offloads.

Idea is simple but not completely verified it yet,
need a little bit more time to prove/clean ...or drop it.

>
>>
>>Here actually several explanations for this:
>>1) If even assume that we can get access to vlan devices in the above
>>ndev
>>tree (we can) that doesn't guarantee that receive vlan filters are set
>>replicating this structure. For example bond device can have one active
>>slave
>>but both of them in the tree having vid set, in this case addresses are
>>syched only with active slave, no filters should be applied to not
>>active slave.
>>this can be achieved only each address has vid context.
>>
>>2) According to 1) rx filters device structure can be created while
>>mc_sync()
>>in each rx_mode(), and then used as orthogonal info. I've tried and it
>>looks
>>not cool and consumes anyway memory and even if it's less it's still
>>not very
>>scalable. (+ no normal signal "in complex structure case" when address
>>should
>>be undated to avoid redundant cpu cycles). Not sure it can have
>>practical
>>results and be universal enouph.
>>
>>3) Assuming that every device in the tree (bond, team or else) is legal
>>to
>>modify its own address space, the real end device cannot be sure the
>>vlan device
>>address spaces reflects vid addresses that device tree want's from him.
>>According to this each address in address space must hold its own
>>context at
>>every device and this context is comparable with address size.
>>
>>>-- Regards,
>>>Ivan Khoronzhuk
>
>-- 
>Florian

-- 
Regards,
Ivan Khoronzhuk

^ permalink raw reply

* Re: [PATCH net-next 3/9] mlxsw: spectrum: Check bridge flags during prepare phase
From: Ido Schimmel @ 2019-02-14 13:11 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: netdev@vger.kernel.org, David S. Miller, open list,
	open list:STAGING SUBSYSTEM, moderated list:ETHERNET BRIDGE,
	Jiri Pirko, andrew@lunn.ch, vivien.didelot@gmail.com
In-Reply-To: <20190213220638.1552-4-f.fainelli@gmail.com>

On Wed, Feb 13, 2019 at 02:06:32PM -0800, Florian Fainelli wrote:
> In preparation for getting rid of switchdev_port_attr_get(), have mlxsw
> check for the bridge flags being set through switchdev_port_attr_set()
> when the SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS attribute identifier is
> used.
> 
> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
> ---
>  .../ethernet/mellanox/mlxsw/spectrum_switchdev.c   | 14 +++++++++++---
>  1 file changed, 11 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
> index 1f492b7dbea8..7616eab50035 100644
> --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
> +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
> @@ -598,13 +598,17 @@ mlxsw_sp_bridge_port_learning_set(struct mlxsw_sp_port *mlxsw_sp_port,
>  static int mlxsw_sp_port_attr_br_flags_set(struct mlxsw_sp_port *mlxsw_sp_port,
>  					   struct switchdev_trans *trans,
>  					   struct net_device *orig_dev,
> -					   unsigned long brport_flags)
> +					   unsigned long brport_flags,
> +					   bool pre_set)
>  {
>  	struct mlxsw_sp_bridge_port *bridge_port;
>  	int err;
>  
> -	if (switchdev_trans_ph_prepare(trans))
> +	if (switchdev_trans_ph_prepare(trans) && pre_set) {
> +		if (brport_flags & ~(BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD))
> +			return -EOPNOTSUPP;
>  		return 0;
> +	}

When we get SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS we only want to
perform a check. With this code in case it's not prepare phase, then we
continue to set the flags. Better do:

if (pre_set) {
	if (switchdev_trans_ph_commit(trans))
		return 0;
	// perform check here
}

>  
>  	bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp_port->mlxsw_sp->bridge,
>  						orig_dev);
> @@ -833,6 +837,7 @@ static int mlxsw_sp_port_attr_set(struct net_device *dev,
>  				  struct switchdev_trans *trans)
>  {
>  	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
> +	bool pre_set = false;
>  	int err;
>  
>  	switch (attr->id) {
> @@ -841,10 +846,13 @@ static int mlxsw_sp_port_attr_set(struct net_device *dev,
>  						       attr->orig_dev,
>  						       attr->u.stp_state);
>  		break;
> +	case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS:
> +		pre_set = true;	/* fall through */
>  	case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS:
>  		err = mlxsw_sp_port_attr_br_flags_set(mlxsw_sp_port, trans,
>  						      attr->orig_dev,
> -						      attr->u.brport_flags);
> +						      attr->u.brport_flags,
> +						      pre_set);
>  		break;
>  	case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME:
>  		err = mlxsw_sp_port_attr_br_ageing_set(mlxsw_sp_port, trans,
> -- 
> 2.17.1
> 

^ permalink raw reply

* [PATCH net-next 1/2] net/mlx5e: Introduce mlx5e_flow_esw_attr_init() helper
From: xiangxia.m.yue @ 2019-02-12  3:39 UTC (permalink / raw)
  To: saeedm, gerlitz.or; +Cc: netdev, Tonghao Zhang

From: Tonghao Zhang <xiangxia.m.yue@gmail.com>

Introduce the mlx5e_flow_esw_attr_init() helper
for simplifying codes.

Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 41 +++++++++++++++++--------
 1 file changed, 29 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index e943775..98b002c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -2736,6 +2736,30 @@ static bool is_peer_flow_needed(struct mlx5e_tc_flow *flow)
 	return err;
 }
 
+static void
+mlx5e_flow_esw_attr_init(struct mlx5_esw_flow_attr *esw_attr,
+			 struct mlx5e_priv *priv,
+			 struct mlx5e_tc_flow_parse_attr *parse_attr,
+			 struct tc_cls_flower_offload *f,
+			 struct mlx5_eswitch_rep *in_rep,
+			 struct mlx5_core_dev *in_mdev)
+{
+	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+
+	esw_attr->parse_attr = parse_attr;
+	esw_attr->chain = f->common.chain_index;
+	esw_attr->prio = TC_H_MAJ(f->common.prio) >> 16;
+
+	esw_attr->in_rep = in_rep;
+	esw_attr->in_mdev = in_mdev;
+
+	if (MLX5_CAP_ESW(esw->dev, counter_eswitch_affinity) ==
+	    MLX5_COUNTER_SOURCE_ESWITCH)
+		esw_attr->counter_dev = in_mdev;
+	else
+		esw_attr->counter_dev = priv->mdev;
+}
+
 static struct mlx5e_tc_flow *
 __mlx5e_add_fdb_flow(struct mlx5e_priv *priv,
 		     struct tc_cls_flower_offload *f,
@@ -2757,28 +2781,21 @@ static bool is_peer_flow_needed(struct mlx5e_tc_flow *flow)
 			       &parse_attr, &flow);
 	if (err)
 		goto out;
+
 	parse_attr->filter_dev = filter_dev;
-	flow->esw_attr->parse_attr = parse_attr;
+	mlx5e_flow_esw_attr_init(flow->esw_attr,
+				 priv, parse_attr,
+				 f, in_rep, in_mdev);
+
 	err = parse_cls_flower(flow->priv, flow, &parse_attr->spec,
 			       f, filter_dev);
 	if (err)
 		goto err_free;
 
-	flow->esw_attr->chain = f->common.chain_index;
-	flow->esw_attr->prio = TC_H_MAJ(f->common.prio) >> 16;
 	err = parse_tc_fdb_actions(priv, &rule->action, parse_attr, flow, extack);
 	if (err)
 		goto err_free;
 
-	flow->esw_attr->in_rep = in_rep;
-	flow->esw_attr->in_mdev = in_mdev;
-
-	if (MLX5_CAP_ESW(esw->dev, counter_eswitch_affinity) ==
-	    MLX5_COUNTER_SOURCE_ESWITCH)
-		flow->esw_attr->counter_dev = in_mdev;
-	else
-		flow->esw_attr->counter_dev = priv->mdev;
-
 	err = mlx5e_tc_add_fdb_flow(priv, parse_attr, flow, extack);
 	if (err)
 		goto err_free;
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH net-next 2/2] net/mlx5e: Remove 'parse_attr' argument in mlx5e_tc_add_fdb_flow()
From: xiangxia.m.yue @ 2019-02-12  3:39 UTC (permalink / raw)
  To: saeedm, gerlitz.or; +Cc: netdev, Tonghao Zhang
In-Reply-To: <1549942783-56833-1-git-send-email-xiangxia.m.yue@gmail.com>

From: Tonghao Zhang <xiangxia.m.yue@gmail.com>

This patch is a little improvement. Simplify the mlx5e_tc_add_fdb_flow().

Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 98b002c..15cf26d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -929,13 +929,13 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv,
 
 static int
 mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
-		      struct mlx5e_tc_flow_parse_attr *parse_attr,
 		      struct mlx5e_tc_flow *flow,
 		      struct netlink_ext_ack *extack)
 {
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
 	u32 max_chain = mlx5_eswitch_get_chain_range(esw);
 	struct mlx5_esw_flow_attr *attr = flow->esw_attr;
+	struct mlx5e_tc_flow_parse_attr *parse_attr = attr->parse_attr;
 	u16 max_prio = mlx5_eswitch_get_prio_range(esw);
 	struct net_device *out_dev, *encap_dev = NULL;
 	struct mlx5_fc *counter = NULL;
@@ -967,7 +967,7 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv,
 		if (!(attr->dests[out_index].flags & MLX5_ESW_DEST_ENCAP))
 			continue;
 
-		mirred_ifindex = attr->parse_attr->mirred_ifindex[out_index];
+		mirred_ifindex = parse_attr->mirred_ifindex[out_index];
 		out_dev = __dev_get_by_index(dev_net(priv->netdev),
 					     mirred_ifindex);
 		err = mlx5e_attach_encap(priv,
@@ -2796,7 +2796,7 @@ static bool is_peer_flow_needed(struct mlx5e_tc_flow *flow)
 	if (err)
 		goto err_free;
 
-	err = mlx5e_tc_add_fdb_flow(priv, parse_attr, flow, extack);
+	err = mlx5e_tc_add_fdb_flow(priv, flow, extack);
 	if (err)
 		goto err_free;
 
-- 
1.8.3.1


^ permalink raw reply related

* Re: [PATCH] net: phy: at803x: disable delay only for RGMII mode
From: Peter Ujfalusi @ 2019-02-14 13:22 UTC (permalink / raw)
  To: Niklas Cassel
  Cc: Marc Gonzalez, Andrew Lunn, Florian Fainelli, Vinod Koul,
	David S Miller, linux-arm-msm, Bjorn Andersson, netdev,
	Nori, Sekhar
In-Reply-To: <20190214123922.GA28897@centauri.ideon.se>

Hi Niklas,

On 14/02/2019 14.39, Niklas Cassel wrote:
>>> So, I've rebased your old patch, see attachment.
>>> I suggest that Peter test it on am335x-evm.
>>
>> with the patch + s/rgmii-txid/rgmii-id in the am335x-evmsk.dts ethernet
>> is working.
>> I don't have am335x-evm to test, but it has the same PHY as evmsk.
>>
> 
> Florian's concern was that this PHY driver looked at "phy-mode" from the
> perspective of the MAC rather than the PHY.
> However, if s/rgmii-txid/rgmii-id is the correct fix for am335x-evm,
> then this means that this PHY driver was just broken.
> 
> If the driver had misinterpreted the perspective, then the correct
> fix for am335x-evm would have been s/rgmii-txid/rgmii-rxid.

Not sure if I got this right, but:
rgmii-id/txid/rxid is the delay mode between PHY and MAC, right?
on the PHY node it is from the PHY perspective, right?

The errata I have mentioned for am335x say:
"The reset state of RGMII1_IDMODE (bit 4) and RGMII2_IDMODE (bit 5) in
the GMII_SEL register enables internal delay mode on the transmit clock
of the respective RGMII port. The AM335x device does not support
internal delay mode, so RGMII1_IDMODE and RGMII2_IDMODE must be set to 1b."

If the delay mode on the transmit clock is not working on the am335x,
then this translate that the rxid needs to be enabled on the PHY side?

But then why it worked when only the txid was enabled and rxid was not
on the PHY side, and why it works if both txid and rxid is enabled?

Just tried w/ your patch and setting rgmii-rxid for am335x-evmsk and
ethernet is not working, it only works w/ rgmii-id (so both tx and rx
delay is enabled on the PHY side?)

> So considering that this driver seems to be really broken
> (rather then just inverted perspective),
> perhaps we can merge the patch I attached in my previous email after all?
> (Together with a s/rgmii-txid/rgmii-id in the am335x-evmsk.dts.)

at the same time am335x-evm.dts needs to have the same change and most
likely other boards which uses the same PHY needs to be checked?

PS: sorry for my lack of knowledge on the networking stuff...

- Péter

Texas Instruments Finland Oy, Porkkalankatu 22, 00180 Helsinki.
Y-tunnus/Business ID: 0615521-4. Kotipaikka/Domicile: Helsinki

^ permalink raw reply

* Re: [PATCH  2/8] net: ethernet: stmmac: update to support all PHY config for stm32mp157c.
From: Andrew Lunn @ 2019-02-14 13:40 UTC (permalink / raw)
  To: Christophe Roullier
  Cc: robh, davem, joabreu, mark.rutland, mcoquelin.stm32,
	alexandre.torgue, peppe.cavallaro, linux-stm32, linux-kernel,
	devicetree, linux-arm-kernel, netdev
In-Reply-To: <1550126763-22669-3-git-send-email-christophe.roullier@st.com>

 On Thu, Feb 14, 2019 at 07:45:57AM +0100, Christophe Roullier wrote:
> @@ -131,19 +185,19 @@ static int stm32mp1_set_mode(struct plat_stmmacenet_data *plat_dat)
>  	case PHY_INTERFACE_MODE_RGMII:
>  		val = SYSCFG_PMCR_ETH_SEL_RGMII;
> -		if (dwmac->int_phyclk)
> +		if (dwmac->eth_clk_sel_reg)
>  			val |= SYSCFG_PMCR_ETH_CLK_SEL;
>  		pr_debug("SYSCFG init : PHY_INTERFACE_MODE_RGMII\n");
>  		break;

Hi Christophe

This code should handle all 4 PHY_INTERFACE_MODE_RGMII* values.

     Andrew

^ permalink raw reply

* Re: [PATCH  7/8] ARM: dts: stm32: Add Ethernet support on stm32h7 SOC and activate it for eval and disco boards
From: Andrew Lunn @ 2019-02-14 13:44 UTC (permalink / raw)
  To: Christophe Roullier
  Cc: robh, davem, joabreu, mark.rutland, mcoquelin.stm32,
	alexandre.torgue, peppe.cavallaro, linux-stm32, linux-kernel,
	devicetree, linux-arm-kernel, netdev
In-Reply-To: <1550126763-22669-8-git-send-email-christophe.roullier@st.com>

On Thu, Feb 14, 2019 at 07:46:02AM +0100, Christophe Roullier wrote:
> +	mdio0 {
> +		#address-cells = <1>;
> +		#size-cells = <0>;
> +		compatible = "snps,dwmac-mdio";
> +		phy1: ethernet-phy@1 {
> +			reg = <0>;
> +		};

The reg value should be the same as ethernet-phy@X.

    Andrew
> +	mdio0 {
> +		#address-cells = <1>;
> +		#size-cells = <0>;
> +		compatible = "snps,dwmac-mdio";
> +		phy1: ethernet-phy@1 {
> +			reg = <0>;
> +		};

Here as well.

     Andrew

^ permalink raw reply

* Re: [PATCH iproute2] lib/libnetlink: ensure a minimum of 32KB for the buffer used in rtnl_recvmsg()
From: Michal Kubecek @ 2019-02-14 13:49 UTC (permalink / raw)
  To: netdev
  Cc: David Ahern, Eric Dumazet, Stephen Hemminger, Eric Dumazet,
	Hangbin Liu, Phil Sutter
In-Reply-To: <b42f0dcb-3c8c-9797-a9f1-da71642e26cc@gmail.com>

On Tue, Feb 12, 2019 at 07:04:17PM -0700, David Ahern wrote:
> 
> Do we know of any single message sizes > 32k? 2d34851cd341 cites
> increasing VF's but at some point there is a limit. If not, the whole
> PEEK thing should go away and we just malloc 32k (or 64k) buffers for
> each recvmsg.

IFLA_VF_LIST is by far the biggest thing I have seen so far. I don't
remember exact numbers but the issue with 32KB buffer (for the whole
RTM_NELINK message) was encountered by some of our customers with NICs
having 120 or 128 VFs.

There is a bigger issue with IFLA_VFINFO_LIST, though, as it's an
attribute so that netlink limits its size to 64 KB. IIRC with current
size of IFLA_VF_INFO this would be reached with 270-280 VFs (I'm sure
the number was higer than 256 but not too much higher.)

This would mean unless we let something else grow too much, the whole
message shouldn't get much bigger than 64 KB. And if we can find some
other solution (e.g. passing VF information in separate messages if
client declares support), even 32 KB would be more than enough.

Michal Kubecek

^ permalink raw reply

* Re: [PATCH iproute2] lib/libnetlink: ensure a minimum of 32KB for the buffer used in rtnl_recvmsg()
From: Michal Kubecek @ 2019-02-14 13:51 UTC (permalink / raw)
  To: netdev
  Cc: Hangbin Liu, David Ahern, Eric Dumazet, Stephen Hemminger,
	Phil Sutter
In-Reply-To: <20190213062012.GC10051@dhcp-12-139.nay.redhat.com>

On Wed, Feb 13, 2019 at 02:20:12PM +0800, Hangbin Liu wrote:
>  
> > > Do we know of any single message sizes > 32k? 2d34851cd341 cites
> > > increasing VF's but at some point there is a limit. If not, the whole
> > > PEEK thing should go away and we just malloc 32k (or 64k) buffers for
> > > each recvmsg.
> 
> Apart from the 200 VFs example, I think there will be more and more virtual
> interfaces be used in cloud environment, like openstack/OVS, so useing 32K
> or 64K is still not safe.

Many intefraces are not a problem. Those will have separate messages so
that they don't need to fit into one buffer.

Michal Kubecek

^ permalink raw reply

* Re: [PATCH iproute2] iplink: document XDP subcommand to force the XDP mode.
From: Matteo Croce @ 2019-02-14 14:01 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, David Ahern, Stephen Hemminger, Jakub Kicinski
In-Reply-To: <20190213140403.5fcf761e@shemminger-XPS-13-9360>

On Wed, Feb 13, 2019 at 11:04 PM Stephen Hemminger
<stephen@networkplumber.org> wrote:
>
> On Wed, 13 Feb 2019 15:40:30 +0100
> Matteo Croce <mcroce@redhat.com> wrote:
>
> > When attaching an eBPF program to a device, ip link can force the XDP mode
> > by using the xdp{generic,drv,offload} keyword instead of just 'xdp'.
> > Document this behaviour also in the help output.
> >
> > Signed-off-by: Matteo Croce <mcroce@redhat.com>
> > Fixes: 14683814 ("bpf: add xdpdrv for requesting XDP driver mode")
> > Fixes: 1b5e8094 ("bpf: allow requesting XDP HW offload")
>
> Applied, thanks.
> The man page already has this as well.
>

Yes, I found it just after I made the patch. However, it could be nice
to have the generic "xdp" and a command like "type" or "mode" to
specify the XDP mode, eg.
ip link set dev eth0 xdp mode [ auto | generic | drv | offload ]
I was trying to add it, but unfortunately it seems that the arguments
aren't parsed in a loop, and are required to be in the exact order.
Would this change make sense?

Regards,

--
Matteo Croce
per aspera ad upstream

^ permalink raw reply

* [v3 PATCH 0/4] mac80211: Fix incorrect usage of rhashtable walk API
From: Herbert Xu @ 2019-02-14 14:02 UTC (permalink / raw)
  To: David Miller, johannes, linux-wireless, netdev, j, tgraf,
	johannes.berg, Julia Lawall
In-Reply-To: <20190213143853.labj6zdcsoupkris@gondor.apana.org.au>

Hi:

v3 fixes a bug in patch two where we would return a NULL pointer
when we should return an existing path.

The first two patches in this series are bug fixes and should be
backported to stable.

They fixes a number of issues with the use of the rhashtable API
in mac80211.  First of all it converts the use of rashtable walks over
to a simple linked list.  This is because an rhashtable walk is
inherently unstable and not meant for uses that require stability,
e.g., when you're trying to lookup an object to delete.

It also fixes a potential memory leak when the rhashtable insertion
fails (which can occur due to OOM).

The third patch is a code-cleanup to mac80211 while the last patch
removes an obsolete rhashtable API.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* [PATCH 2/4] mac80211: Free mpath object when rhashtable insertion fails
From: Herbert Xu @ 2019-02-14 14:03 UTC (permalink / raw)
  To: David Miller, johannes, linux-wireless, netdev, j, tgraf,
	johannes.berg, Julia Lawall
In-Reply-To: <20190214140236.omt74prxhkfaasue@gondor.apana.org.au>

When rhashtable insertion fails the mesh table code doesn't free
the now-orphan mesh path object.  This patch fixes that.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 net/mac80211/mesh_pathtbl.c |   17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index 884a0d212e8b..c3a7396fb955 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -436,17 +436,15 @@ struct mesh_path *mesh_path_add(struct ieee80211_sub_if_data *sdata,
 	} while (unlikely(ret == -EEXIST && !mpath));
 	spin_unlock_bh(&tbl->walk_lock);
 
-	if (ret && ret != -EEXIST)
-		return ERR_PTR(ret);
-
-	/* At this point either new_mpath was added, or we found a
-	 * matching entry already in the table; in the latter case
-	 * free the unnecessary new entry.
-	 */
-	if (ret == -EEXIST) {
+	if (ret) {
 		kfree(new_mpath);
+
+		if (ret != -EEXIST)
+			return ERR_PTR(ret);
+
 		new_mpath = mpath;
 	}
+
 	sdata->u.mesh.mesh_paths_generation++;
 	return new_mpath;
 }
@@ -481,6 +479,9 @@ int mpp_path_add(struct ieee80211_sub_if_data *sdata,
 		hlist_add_head_rcu(&new_mpath->walk_list, &tbl->walk_head);
 	spin_unlock_bh(&tbl->walk_lock);
 
+	if (ret)
+		kfree(new_mpath);
+
 	sdata->u.mesh.mpp_paths_generation++;
 	return ret;
 }

^ permalink raw reply related

* [PATCH 4/4] rhashtable: Remove obsolete rhashtable_walk_init function
From: Herbert Xu @ 2019-02-14 14:03 UTC (permalink / raw)
  To: David Miller, johannes, linux-wireless, netdev, j, tgraf,
	johannes.berg, Julia Lawall
In-Reply-To: <20190214140236.omt74prxhkfaasue@gondor.apana.org.au>

The rhashtable_walk_init function has been obsolete for more than
two years.  This patch finally converts its last users over to
rhashtable_walk_enter and removes it.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 include/linux/rhashtable.h |    8 --------
 lib/rhashtable.c           |    2 +-
 lib/test_rhashtable.c      |    9 ++-------
 net/ipv6/ila/ila_xlat.c    |   15 +++------------
 net/netlink/af_netlink.c   |   10 +---------
 5 files changed, 7 insertions(+), 37 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 20f9c6af7473..ae9c0f71f311 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -1113,14 +1113,6 @@ static inline int rhashtable_replace_fast(
 	return err;
 }
 
-/* Obsolete function, do not use in new code. */
-static inline int rhashtable_walk_init(struct rhashtable *ht,
-				       struct rhashtable_iter *iter, gfp_t gfp)
-{
-	rhashtable_walk_enter(ht, iter);
-	return 0;
-}
-
 /**
  * rhltable_walk_enter - Initialise an iterator
  * @hlt:	Table to walk over
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 852ffa5160f1..0a105d4af166 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -682,7 +682,7 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_enter);
  * rhashtable_walk_exit - Free an iterator
  * @iter:	Hash table Iterator
  *
- * This function frees resources allocated by rhashtable_walk_init.
+ * This function frees resources allocated by rhashtable_walk_enter.
  */
 void rhashtable_walk_exit(struct rhashtable_iter *iter)
 {
diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
index 6a8ac7626797..d96962eea942 100644
--- a/lib/test_rhashtable.c
+++ b/lib/test_rhashtable.c
@@ -177,16 +177,11 @@ static int __init test_rht_lookup(struct rhashtable *ht, struct test_obj *array,
 
 static void test_bucket_stats(struct rhashtable *ht, unsigned int entries)
 {
-	unsigned int err, total = 0, chain_len = 0;
+	unsigned int total = 0, chain_len = 0;
 	struct rhashtable_iter hti;
 	struct rhash_head *pos;
 
-	err = rhashtable_walk_init(ht, &hti, GFP_KERNEL);
-	if (err) {
-		pr_warn("Test failed: allocation error");
-		return;
-	}
-
+	rhashtable_walk_enter(ht, &hti);
 	rhashtable_walk_start(&hti);
 
 	while ((pos = rhashtable_walk_next(&hti))) {
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 17c455ff69ff..ae6cd4cef8db 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -385,10 +385,7 @@ int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info)
 	spinlock_t *lock;
 	int ret;
 
-	ret = rhashtable_walk_init(&ilan->xlat.rhash_table, &iter, GFP_KERNEL);
-	if (ret)
-		goto done;
-
+	rhashtable_walk_enter(&ilan->xlat.rhash_table, &iter);
 	rhashtable_walk_start(&iter);
 
 	for (;;) {
@@ -509,23 +506,17 @@ int ila_xlat_nl_dump_start(struct netlink_callback *cb)
 	struct net *net = sock_net(cb->skb->sk);
 	struct ila_net *ilan = net_generic(net, ila_net_id);
 	struct ila_dump_iter *iter;
-	int ret;
 
 	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
 	if (!iter)
 		return -ENOMEM;
 
-	ret = rhashtable_walk_init(&ilan->xlat.rhash_table, &iter->rhiter,
-				   GFP_KERNEL);
-	if (ret) {
-		kfree(iter);
-		return ret;
-	}
+	rhashtable_walk_enter(&ilan->xlat.rhash_table, &iter->rhiter);
 
 	iter->skip = 0;
 	cb->args[0] = (long)iter;
 
-	return ret;
+	return 0;
 }
 
 int ila_xlat_nl_dump_done(struct netlink_callback *cb)
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 3c023d6120f6..f369cc751cea 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2541,15 +2541,7 @@ struct nl_seq_iter {
 
 static int netlink_walk_start(struct nl_seq_iter *iter)
 {
-	int err;
-
-	err = rhashtable_walk_init(&nl_table[iter->link].hash, &iter->hti,
-				   GFP_KERNEL);
-	if (err) {
-		iter->link = MAX_LINKS;
-		return err;
-	}
-
+	rhashtable_walk_enter(&nl_table[iter->link].hash, &iter->hti);
 	rhashtable_walk_start(&iter->hti);
 
 	return 0;

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox