Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH] net: ethernet: mediatek: Add MT7628/88 SoC support
From: René van Dorst @ 2019-07-17 12:53 UTC (permalink / raw)
  To: Stefan Roese
  Cc: netdev, linux-mediatek, Sean Wang, Felix Fietkau, John Crispin
In-Reply-To: <20190717110243.14240-1-sr@denx.de>

Quoting Stefan Roese <sr@denx.de>:

Hi Stefan,

So comments below.

> This patch adds support for the MediaTek MT7628/88 SoCs to the common
> MediaTek ethernet driver. Some minor changes are needed for this and
> a bigger change, as the MT7628 does not support QDMA (only PDMA).
>
> Signed-off-by: Stefan Roese <sr@denx.de>
> Cc: René van Dorst <opensource@vdorst.com>
> Cc: Sean Wang <sean.wang@mediatek.com>
> Cc: Felix Fietkau <nbd@openwrt.org>
> Cc: John Crispin <john@phrozen.org>
> ---
>  .../devicetree/bindings/net/mediatek-net.txt  |   1 +
>  drivers/net/ethernet/mediatek/mtk_eth_path.c  |   4 +
>  drivers/net/ethernet/mediatek/mtk_eth_soc.c   | 490 ++++++++++++++----
>  drivers/net/ethernet/mediatek/mtk_eth_soc.h   |  39 +-
>  4 files changed, 424 insertions(+), 110 deletions(-)
>
> diff --git a/Documentation/devicetree/bindings/net/mediatek-net.txt  
> b/Documentation/devicetree/bindings/net/mediatek-net.txt
> index 770ff98d4524..ec6793562148 100644
> --- a/Documentation/devicetree/bindings/net/mediatek-net.txt
> +++ b/Documentation/devicetree/bindings/net/mediatek-net.txt
> @@ -11,6 +11,7 @@ Required properties:
>  		"mediatek,mt2701-eth": for MT2701 SoC
>  		"mediatek,mt7623-eth", "mediatek,mt2701-eth": for MT7623 SoC
>  		"mediatek,mt7622-eth": for MT7622 SoC
> +		"mediatek,mt7628-eth": for MT7628/88 SoC
>  		"mediatek,mt7629-eth": for MT7629 SoC
>  - reg: Address and length of the register set for the device
>  - interrupts: Should contain the three frame engines interrupts in numeric
> diff --git a/drivers/net/ethernet/mediatek/mtk_eth_path.c  
> b/drivers/net/ethernet/mediatek/mtk_eth_path.c
> index 7f05880cf9ef..28960e4c4e43 100644
> --- a/drivers/net/ethernet/mediatek/mtk_eth_path.c
> +++ b/drivers/net/ethernet/mediatek/mtk_eth_path.c
> @@ -315,6 +315,10 @@ int mtk_setup_hw_path(struct mtk_eth *eth, int  
> mac_id, int phymode)
>  {
>  	int err;
>
> +	/* No mux'ing for MT7628/88 */
> +	if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628))
> +		return 0;
> +
>  	switch (phymode) {
>  	case PHY_INTERFACE_MODE_TRGMII:
>  	case PHY_INTERFACE_MODE_RGMII_TXID:
> diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c  
> b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
> index b20b3a5a1ebb..1f248ef6ef88 100644
> --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
> +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
> @@ -323,11 +323,14 @@ static int mtk_phy_connect(struct net_device *dev)
>  		goto err_phy;
>  	}
>
> -	/* put the gmac into the right mode */
> -	regmap_read(eth->ethsys, ETHSYS_SYSCFG0, &val);
> -	val &= ~SYSCFG0_GE_MODE(SYSCFG0_GE_MASK, mac->id);
> -	val |= SYSCFG0_GE_MODE(mac->ge_mode, mac->id);
> -	regmap_write(eth->ethsys, ETHSYS_SYSCFG0, val);
> +	/* No MT7628/88 support for now */
> +	if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
> +		/* put the gmac into the right mode */
> +		regmap_read(eth->ethsys, ETHSYS_SYSCFG0, &val);
> +		val &= ~SYSCFG0_GE_MODE(SYSCFG0_GE_MASK, mac->id);
> +		val |= SYSCFG0_GE_MODE(mac->ge_mode, mac->id);
> +		regmap_write(eth->ethsys, ETHSYS_SYSCFG0, val);
> +	}
>
>  	/* couple phydev to net_device */
>  	if (mtk_phy_connect_node(eth, mac, np))
> @@ -395,8 +398,8 @@ static inline void mtk_tx_irq_disable(struct  
> mtk_eth *eth, u32 mask)
>  	u32 val;
>
>  	spin_lock_irqsave(&eth->tx_irq_lock, flags);
> -	val = mtk_r32(eth, MTK_QDMA_INT_MASK);
> -	mtk_w32(eth, val & ~mask, MTK_QDMA_INT_MASK);
> +	val = mtk_r32(eth, eth->tx_int_mask_reg);
> +	mtk_w32(eth, val & ~mask, eth->tx_int_mask_reg);
>  	spin_unlock_irqrestore(&eth->tx_irq_lock, flags);
>  }
>
> @@ -406,8 +409,8 @@ static inline void mtk_tx_irq_enable(struct  
> mtk_eth *eth, u32 mask)
>  	u32 val;
>
>  	spin_lock_irqsave(&eth->tx_irq_lock, flags);
> -	val = mtk_r32(eth, MTK_QDMA_INT_MASK);
> -	mtk_w32(eth, val | mask, MTK_QDMA_INT_MASK);
> +	val = mtk_r32(eth, eth->tx_int_mask_reg);
> +	mtk_w32(eth, val | mask, eth->tx_int_mask_reg);
>  	spin_unlock_irqrestore(&eth->tx_irq_lock, flags);
>  }
>
> @@ -437,6 +440,7 @@ static int mtk_set_mac_address(struct net_device  
> *dev, void *p)
>  {
>  	int ret = eth_mac_addr(dev, p);
>  	struct mtk_mac *mac = netdev_priv(dev);
> +	struct mtk_eth *eth = mac->hw;
>  	const char *macaddr = dev->dev_addr;
>
>  	if (ret)
> @@ -446,11 +450,19 @@ static int mtk_set_mac_address(struct  
> net_device *dev, void *p)
>  		return -EBUSY;
>
>  	spin_lock_bh(&mac->hw->page_lock);
> -	mtk_w32(mac->hw, (macaddr[0] << 8) | macaddr[1],
> -		MTK_GDMA_MAC_ADRH(mac->id));
> -	mtk_w32(mac->hw, (macaddr[2] << 24) | (macaddr[3] << 16) |
> -		(macaddr[4] << 8) | macaddr[5],
> -		MTK_GDMA_MAC_ADRL(mac->id));
> +	if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
> +		mtk_w32(mac->hw, (macaddr[0] << 8) | macaddr[1],
> +			MT7628_SDM_MAC_ADRH);
> +		mtk_w32(mac->hw, (macaddr[2] << 24) | (macaddr[3] << 16) |
> +			(macaddr[4] << 8) | macaddr[5],
> +			MT7628_SDM_MAC_ADRL);
> +	} else {
> +		mtk_w32(mac->hw, (macaddr[0] << 8) | macaddr[1],
> +			MTK_GDMA_MAC_ADRH(mac->id));
> +		mtk_w32(mac->hw, (macaddr[2] << 24) | (macaddr[3] << 16) |
> +			(macaddr[4] << 8) | macaddr[5],
> +			MTK_GDMA_MAC_ADRL(mac->id));
> +	}
>  	spin_unlock_bh(&mac->hw->page_lock);
>
>  	return 0;
> @@ -626,19 +638,47 @@ static inline struct mtk_tx_buf  
> *mtk_desc_to_tx_buf(struct mtk_tx_ring *ring,
>  	return &ring->buf[idx];
>  }
>
> +static struct mtk_tx_dma *qdma_to_pdma(struct mtk_tx_ring *ring,
> +				       struct mtk_tx_dma *dma)
> +{
> +	return ring->dma_pdma - ring->dma + dma;
> +}
> +
> +static int txd_to_idx(struct mtk_tx_ring *ring, struct mtk_tx_dma *dma)
> +{
> +	return ((u32)dma - (u32)ring->dma) / sizeof(*dma);
> +}
> +
>  static void mtk_tx_unmap(struct mtk_eth *eth, struct mtk_tx_buf *tx_buf)
>  {
> -	if (tx_buf->flags & MTK_TX_FLAGS_SINGLE0) {
> -		dma_unmap_single(eth->dev,
> -				 dma_unmap_addr(tx_buf, dma_addr0),
> -				 dma_unmap_len(tx_buf, dma_len0),
> -				 DMA_TO_DEVICE);
> -	} else if (tx_buf->flags & MTK_TX_FLAGS_PAGE0) {
> -		dma_unmap_page(eth->dev,
> -			       dma_unmap_addr(tx_buf, dma_addr0),
> -			       dma_unmap_len(tx_buf, dma_len0),
> -			       DMA_TO_DEVICE);
> +	if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
> +		if (dma_unmap_len(tx_buf, dma_len0)) {
> +			dma_unmap_page(eth->dev,
> +				       dma_unmap_addr(tx_buf, dma_addr0),
> +				       dma_unmap_len(tx_buf, dma_len0),
> +				       DMA_TO_DEVICE);
> +		}
> +
> +		if (dma_unmap_len(tx_buf, dma_len1)) {
> +			dma_unmap_page(eth->dev,
> +				       dma_unmap_addr(tx_buf, dma_addr1),
> +				       dma_unmap_len(tx_buf, dma_len1),
> +				       DMA_TO_DEVICE);
> +		}
> +	} else {
> +		if (tx_buf->flags & MTK_TX_FLAGS_SINGLE0) {
> +			dma_unmap_single(eth->dev,
> +					 dma_unmap_addr(tx_buf, dma_addr0),
> +					 dma_unmap_len(tx_buf, dma_len0),
> +					 DMA_TO_DEVICE);
> +		} else if (tx_buf->flags & MTK_TX_FLAGS_PAGE0) {
> +			dma_unmap_page(eth->dev,
> +				       dma_unmap_addr(tx_buf, dma_addr0),
> +				       dma_unmap_len(tx_buf, dma_len0),
> +				       DMA_TO_DEVICE);
> +		}
>  	}
> +
>  	tx_buf->flags = 0;
>  	if (tx_buf->skb &&
>  	    (tx_buf->skb != (struct sk_buff *)MTK_DMA_DUMMY_DESC))
> @@ -646,19 +686,45 @@ static void mtk_tx_unmap(struct mtk_eth *eth,  
> struct mtk_tx_buf *tx_buf)
>  	tx_buf->skb = NULL;
>  }
>
> +static void setup_tx_buf(struct mtk_eth *eth, struct mtk_tx_buf *tx_buf,
> +			 struct mtk_tx_dma *txd, dma_addr_t mapped_addr,
> +			 size_t size, int idx)
> +{
> +	if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
> +		if (idx & 1) {
> +			txd->txd3 = mapped_addr;
> +			txd->txd2 |= TX_DMA_PLEN1(size);
> +			dma_unmap_addr_set(tx_buf, dma_addr1, mapped_addr);
> +			dma_unmap_len_set(tx_buf, dma_len1, size);
> +		} else {
> +			tx_buf->skb = (struct sk_buff *)MTK_DMA_DUMMY_DESC;
> +			txd->txd1 = mapped_addr;
> +			txd->txd2 = TX_DMA_PLEN0(size);
> +			dma_unmap_addr_set(tx_buf, dma_addr0, mapped_addr);
> +			dma_unmap_len_set(tx_buf, dma_len0, size);
> +		}
> +	} else {
> +		dma_unmap_addr_set(tx_buf, dma_addr0, mapped_addr);
> +		dma_unmap_len_set(tx_buf, dma_len0, size);
> +	}
> +}
> +
>  static int mtk_tx_map(struct sk_buff *skb, struct net_device *dev,
>  		      int tx_num, struct mtk_tx_ring *ring, bool gso)
>  {
>  	struct mtk_mac *mac = netdev_priv(dev);
>  	struct mtk_eth *eth = mac->hw;
>  	struct mtk_tx_dma *itxd, *txd;
> +	struct mtk_tx_dma *itxd_pdma, *txd_pdma;
>  	struct mtk_tx_buf *itx_buf, *tx_buf;
>  	dma_addr_t mapped_addr;
>  	unsigned int nr_frags;
>  	int i, n_desc = 1;
>  	u32 txd4 = 0, fport;
> +	int k = 0;
>
>  	itxd = ring->next_free;
> +	itxd_pdma = qdma_to_pdma(ring, itxd);
>  	if (itxd == ring->last_free)
>  		return -ENOMEM;
>
> @@ -689,12 +755,14 @@ static int mtk_tx_map(struct sk_buff *skb,  
> struct net_device *dev,
>  	itx_buf->flags |= MTK_TX_FLAGS_SINGLE0;
>  	itx_buf->flags |= (!mac->id) ? MTK_TX_FLAGS_FPORT0 :
>  			  MTK_TX_FLAGS_FPORT1;
> -	dma_unmap_addr_set(itx_buf, dma_addr0, mapped_addr);
> -	dma_unmap_len_set(itx_buf, dma_len0, skb_headlen(skb));
> +	setup_tx_buf(eth, itx_buf, itxd_pdma, mapped_addr, skb_headlen(skb),
> +		     k++);
>
>  	/* TX SG offload */
>  	txd = itxd;
> +	txd_pdma = qdma_to_pdma(ring, txd);
>  	nr_frags = skb_shinfo(skb)->nr_frags;
> +
>  	for (i = 0; i < nr_frags; i++) {
>  		struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i];
>  		unsigned int offset = 0;
> @@ -703,12 +771,20 @@ static int mtk_tx_map(struct sk_buff *skb,  
> struct net_device *dev,
>  		while (frag_size) {
>  			bool last_frag = false;
>  			unsigned int frag_map_size;
> +			bool new_desc = true;
> +
> +			if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628) &&
> +			    !(i & 0x1)) {
> +				new_desc = false;
> +			} else {
> +				txd = mtk_qdma_phys_to_virt(ring, txd->txd2);
> +				txd_pdma = qdma_to_pdma(ring, txd);
> +				if (txd == ring->last_free)
> +					goto err_dma;
> +
> +				n_desc++;
> +			}
>
> -			txd = mtk_qdma_phys_to_virt(ring, txd->txd2);
> -			if (txd == ring->last_free)
> -				goto err_dma;
> -
> -			n_desc++;
>  			frag_map_size = min(frag_size, MTK_TX_DMA_BUF_LEN);
>  			mapped_addr = skb_frag_dma_map(eth->dev, frag, offset,
>  						       frag_map_size,
> @@ -727,14 +803,16 @@ static int mtk_tx_map(struct sk_buff *skb,  
> struct net_device *dev,
>  			WRITE_ONCE(txd->txd4, fport);
>
>  			tx_buf = mtk_desc_to_tx_buf(ring, txd);
> -			memset(tx_buf, 0, sizeof(*tx_buf));
> +			if (new_desc)
> +				memset(tx_buf, 0, sizeof(*tx_buf));
>  			tx_buf->skb = (struct sk_buff *)MTK_DMA_DUMMY_DESC;
>  			tx_buf->flags |= MTK_TX_FLAGS_PAGE0;
>  			tx_buf->flags |= (!mac->id) ? MTK_TX_FLAGS_FPORT0 :
>  					 MTK_TX_FLAGS_FPORT1;
>
> -			dma_unmap_addr_set(tx_buf, dma_addr0, mapped_addr);
> -			dma_unmap_len_set(tx_buf, dma_len0, frag_map_size);
> +			setup_tx_buf(eth, tx_buf, txd_pdma, mapped_addr,
> +				     frag_map_size, k++);
> +
>  			frag_size -= frag_map_size;
>  			offset += frag_map_size;
>  		}
> @@ -746,6 +824,12 @@ static int mtk_tx_map(struct sk_buff *skb,  
> struct net_device *dev,
>  	WRITE_ONCE(itxd->txd4, txd4);
>  	WRITE_ONCE(itxd->txd3, (TX_DMA_SWC | TX_DMA_PLEN0(skb_headlen(skb)) |
>  				(!nr_frags * TX_DMA_LS0)));
> +	if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
> +		if (k & 0x1)
> +			txd_pdma->txd2 |= TX_DMA_LS0;
> +		else
> +			txd_pdma->txd2 |= TX_DMA_LS1;
> +	}
>
>  	netdev_sent_queue(dev, skb->len);
>  	skb_tx_timestamp(skb);
> @@ -758,9 +842,15 @@ static int mtk_tx_map(struct sk_buff *skb,  
> struct net_device *dev,
>  	 */
>  	wmb();
>
> -	if (netif_xmit_stopped(netdev_get_tx_queue(dev, 0)) ||
> -	    !netdev_xmit_more())
> -		mtk_w32(eth, txd->txd2, MTK_QTX_CTX_PTR);
> +	if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
> +		int next_idx = NEXT_DESP_IDX(txd_to_idx(ring, txd),
> +					     ring->dma_size);
> +		mtk_w32(eth, next_idx, MT7628_TX_CTX_IDX0);
> +	} else {
> +		if (netif_xmit_stopped(netdev_get_tx_queue(dev, 0)) ||
> +		    !netdev_xmit_more())
> +			mtk_w32(eth, txd->txd2, MTK_QTX_CTX_PTR);
> +	}
>
>  	return 0;
>
> @@ -772,7 +862,11 @@ static int mtk_tx_map(struct sk_buff *skb,  
> struct net_device *dev,
>  		mtk_tx_unmap(eth, tx_buf);
>
>  		itxd->txd3 = TX_DMA_LS0 | TX_DMA_OWNER_CPU;
> +		if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628))
> +			itxd_pdma->txd2 = TX_DMA_DESP2_DEF;
> +
>  		itxd = mtk_qdma_phys_to_virt(ring, itxd->txd2);
> +		itxd_pdma = qdma_to_pdma(ring, itxd);
>  	} while (itxd != txd);
>
>  	return -ENOMEM;
> @@ -902,7 +996,7 @@ static struct mtk_rx_ring  
> *mtk_get_rx_ring(struct mtk_eth *eth)
>
>  	for (i = 0; i < MTK_MAX_RX_RING_NUM; i++) {
>  		ring = &eth->rx_ring[i];
> -		idx = NEXT_RX_DESP_IDX(ring->calc_idx, ring->dma_size);
> +		idx = NEXT_DESP_IDX(ring->calc_idx, ring->dma_size);
>  		if (ring->dma[idx].rxd2 & RX_DMA_DONE) {
>  			ring->calc_idx_update = true;
>  			return ring;
> @@ -945,13 +1039,13 @@ static int mtk_poll_rx(struct napi_struct  
> *napi, int budget,
>  		struct net_device *netdev;
>  		unsigned int pktlen;
>  		dma_addr_t dma_addr;
> -		int mac = 0;
> +		int mac;
>
>  		ring = mtk_get_rx_ring(eth);
>  		if (unlikely(!ring))
>  			goto rx_done;
>
> -		idx = NEXT_RX_DESP_IDX(ring->calc_idx, ring->dma_size);
> +		idx = NEXT_DESP_IDX(ring->calc_idx, ring->dma_size);
>  		rxd = &ring->dma[idx];
>  		data = ring->data[idx];
>
> @@ -960,9 +1054,13 @@ static int mtk_poll_rx(struct napi_struct  
> *napi, int budget,
>  			break;
>
>  		/* find out which mac the packet come from. values start at 1 */
> -		mac = (trxd.rxd4 >> RX_DMA_FPORT_SHIFT) &
> -		      RX_DMA_FPORT_MASK;
> -		mac--;
> +		if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
> +			mac = 0;
> +		} else {
> +			mac = (trxd.rxd4 >> RX_DMA_FPORT_SHIFT) &
> +				RX_DMA_FPORT_MASK;
> +			mac--;
> +		}
>
>  		if (unlikely(mac < 0 || mac >= MTK_MAC_COUNT ||
>  			     !eth->netdev[mac]))
> @@ -980,7 +1078,8 @@ static int mtk_poll_rx(struct napi_struct  
> *napi, int budget,
>  			goto release_desc;
>  		}
>  		dma_addr = dma_map_single(eth->dev,
> -					  new_data + NET_SKB_PAD,
> +					  new_data + NET_SKB_PAD +
> +					  eth->ip_align,
>  					  ring->buf_size,
>  					  DMA_FROM_DEVICE);
>  		if (unlikely(dma_mapping_error(eth->dev, dma_addr))) {
> @@ -1003,7 +1102,7 @@ static int mtk_poll_rx(struct napi_struct  
> *napi, int budget,
>  		pktlen = RX_DMA_GET_PLEN0(trxd.rxd2);
>  		skb->dev = netdev;
>  		skb_put(skb, pktlen);
> -		if (trxd.rxd4 & RX_DMA_L4_VALID)
> +		if (trxd.rxd4 & eth->rx_dma_l4_valid)
>  			skb->ip_summed = CHECKSUM_UNNECESSARY;
>  		else
>  			skb_checksum_none_assert(skb);
> @@ -1020,7 +1119,10 @@ static int mtk_poll_rx(struct napi_struct  
> *napi, int budget,
>  		rxd->rxd1 = (unsigned int)dma_addr;
>
>  release_desc:
> -		rxd->rxd2 = RX_DMA_PLEN0(ring->buf_size);
> +		if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628))
> +			rxd->rxd2 = RX_DMA_LSO;
> +		else
> +			rxd->rxd2 = RX_DMA_PLEN0(ring->buf_size);
>
>  		ring->calc_idx = idx;
>
> @@ -1039,19 +1141,14 @@ static int mtk_poll_rx(struct napi_struct  
> *napi, int budget,
>  	return done;
>  }
>
> -static int mtk_poll_tx(struct mtk_eth *eth, int budget)
> +static int mtk_poll_tx_qdma(struct mtk_eth *eth, int budget,
> +			    unsigned int *done, unsigned int *bytes)
>  {
>  	struct mtk_tx_ring *ring = &eth->tx_ring;
>  	struct mtk_tx_dma *desc;
>  	struct sk_buff *skb;
>  	struct mtk_tx_buf *tx_buf;
> -	unsigned int done[MTK_MAX_DEVS];
> -	unsigned int bytes[MTK_MAX_DEVS];
>  	u32 cpu, dma;
> -	int total = 0, i;
> -
> -	memset(done, 0, sizeof(done));
> -	memset(bytes, 0, sizeof(bytes));
>
>  	cpu = mtk_r32(eth, MTK_QTX_CRX_PTR);
>  	dma = mtk_r32(eth, MTK_QTX_DRX_PTR);
> @@ -1089,6 +1186,62 @@ static int mtk_poll_tx(struct mtk_eth *eth,  
> int budget)
>
>  	mtk_w32(eth, cpu, MTK_QTX_CRX_PTR);
>
> +	return budget;
> +}
> +
> +static int mtk_poll_tx_pdma(struct mtk_eth *eth, int budget,
> +			    unsigned int *done, unsigned int *bytes)
> +{
> +	struct mtk_tx_ring *ring = &eth->tx_ring;
> +	struct mtk_tx_dma *desc;
> +	struct sk_buff *skb;
> +	struct mtk_tx_buf *tx_buf;
> +	u32 cpu, dma;
> +
> +	cpu = ring->cpu_idx;
> +	dma = mtk_r32(eth, MT7628_TX_DTX_IDX0);
> +
> +	while ((cpu != dma) && budget) {
> +		tx_buf = &ring->buf[cpu];
> +		skb = tx_buf->skb;
> +		if (!skb)
> +			break;
> +
> +		if (skb != (struct sk_buff *)MTK_DMA_DUMMY_DESC) {
> +			bytes[0] += skb->len;
> +			done[0]++;
> +			budget--;
> +		}
> +
> +		mtk_tx_unmap(eth, tx_buf);
> +
> +		desc = &ring->dma[cpu];
> +		ring->last_free = desc;
> +		atomic_inc(&ring->free_count);
> +
> +		cpu = NEXT_DESP_IDX(cpu, ring->dma_size);
> +	}
> +
> +	ring->cpu_idx = cpu;
> +
> +	return budget;
> +}
> +
> +static int mtk_poll_tx(struct mtk_eth *eth, int budget)
> +{
> +	struct mtk_tx_ring *ring = &eth->tx_ring;
> +	unsigned int done[MTK_MAX_DEVS];
> +	unsigned int bytes[MTK_MAX_DEVS];
> +	int total = 0, i;
> +
> +	memset(done, 0, sizeof(done));
> +	memset(bytes, 0, sizeof(bytes));
> +
> +	if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628))
> +		budget = mtk_poll_tx_pdma(eth, budget, done, bytes);
> +	else
> +		budget = mtk_poll_tx_qdma(eth, budget, done, bytes);
> +
>  	for (i = 0; i < MTK_MAC_COUNT; i++) {
>  		if (!eth->netdev[i] || !done[i])
>  			continue;
> @@ -1120,8 +1273,12 @@ static int mtk_napi_tx(struct napi_struct  
> *napi, int budget)
>  	u32 status, mask;
>  	int tx_done = 0;
>
> -	mtk_handle_status_irq(eth);
> -	mtk_w32(eth, MTK_TX_DONE_INT, MTK_QMTK_INT_STATUS);
> +	if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
> +		mtk_w32(eth, MTK_TX_DONE_INT, MTK_PDMA_INT_STATUS);
> +	} else {
> +		mtk_handle_status_irq(eth);
> +		mtk_w32(eth, MTK_TX_DONE_INT, MTK_QMTK_INT_STATUS);
> +	}
>  	tx_done = mtk_poll_tx(eth, budget);
>
>  	if (unlikely(netif_msg_intr(eth))) {
> @@ -1135,7 +1292,10 @@ static int mtk_napi_tx(struct napi_struct  
> *napi, int budget)
>  	if (tx_done == budget)
>  		return budget;
>
> -	status = mtk_r32(eth, MTK_QMTK_INT_STATUS);
> +	if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628))
> +		status = mtk_r32(eth, MTK_PDMA_INT_STATUS);
> +	else
> +		status = mtk_r32(eth, MTK_QMTK_INT_STATUS);
>  	if (status & MTK_TX_DONE_INT)
>  		return budget;
>
> @@ -1202,6 +1362,24 @@ static int mtk_tx_alloc(struct mtk_eth *eth)
>  		ring->dma[i].txd3 = TX_DMA_LS0 | TX_DMA_OWNER_CPU;
>  	}
>
> +	/* On MT7688 (PDMA only) this driver uses the ring->dma structs
> +	 * only as the framework. The real HW descriptors are the PDMA
> +	 * descriptors in ring->dma_pdma.
> +	 */
> +	if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
> +		ring->dma_pdma = dma_alloc_coherent(eth->dev, MTK_DMA_SIZE * sz,
> +						    &ring->phys_pdma,
> +						    GFP_ATOMIC);
> +		if (!ring->dma_pdma)
> +			goto no_tx_mem;
> +
> +		for (i = 0; i < MTK_DMA_SIZE; i++) {
> +			ring->dma_pdma[i].txd2 = TX_DMA_DESP2_DEF;
> +			ring->dma_pdma[i].txd4 = 0;
> +		}
> +	}
> +
> +	ring->dma_size = MTK_DMA_SIZE;
>  	atomic_set(&ring->free_count, MTK_DMA_SIZE - 2);
>  	ring->next_free = &ring->dma[0];
>  	ring->last_free = &ring->dma[MTK_DMA_SIZE - 1];
> @@ -1212,15 +1390,23 @@ static int mtk_tx_alloc(struct mtk_eth *eth)
>  	 */
>  	wmb();
>
> -	mtk_w32(eth, ring->phys, MTK_QTX_CTX_PTR);
> -	mtk_w32(eth, ring->phys, MTK_QTX_DTX_PTR);
> -	mtk_w32(eth,
> -		ring->phys + ((MTK_DMA_SIZE - 1) * sz),
> -		MTK_QTX_CRX_PTR);
> -	mtk_w32(eth,
> -		ring->phys + ((MTK_DMA_SIZE - 1) * sz),
> -		MTK_QTX_DRX_PTR);
> -	mtk_w32(eth, (QDMA_RES_THRES << 8) | QDMA_RES_THRES, MTK_QTX_CFG(0));
> +	if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
> +		mtk_w32(eth, ring->phys_pdma, MT7628_TX_BASE_PTR0);
> +		mtk_w32(eth, MTK_DMA_SIZE, MT7628_TX_MAX_CNT0);
> +		mtk_w32(eth, 0, MT7628_TX_CTX_IDX0);
> +		mtk_w32(eth, MT7628_PST_DTX_IDX0, MTK_PDMA_RST_IDX);
> +	} else {
> +		mtk_w32(eth, ring->phys, MTK_QTX_CTX_PTR);
> +		mtk_w32(eth, ring->phys, MTK_QTX_DTX_PTR);
> +		mtk_w32(eth,
> +			ring->phys + ((MTK_DMA_SIZE - 1) * sz),
> +			MTK_QTX_CRX_PTR);
> +		mtk_w32(eth,
> +			ring->phys + ((MTK_DMA_SIZE - 1) * sz),
> +			MTK_QTX_DRX_PTR);
> +		mtk_w32(eth, (QDMA_RES_THRES << 8) | QDMA_RES_THRES,
> +			MTK_QTX_CFG(0));
> +	}
>
>  	return 0;
>
> @@ -1247,6 +1433,14 @@ static void mtk_tx_clean(struct mtk_eth *eth)
>  				  ring->phys);
>  		ring->dma = NULL;
>  	}
> +
> +	if (ring->dma_pdma) {
> +		dma_free_coherent(eth->dev,
> +				  MTK_DMA_SIZE * sizeof(*ring->dma_pdma),
> +				  ring->dma_pdma,
> +				  ring->phys_pdma);
> +		ring->dma_pdma = NULL;
> +	}
>  }
>
>  static int mtk_rx_alloc(struct mtk_eth *eth, int ring_no, int rx_flag)
> @@ -1294,14 +1488,17 @@ static int mtk_rx_alloc(struct mtk_eth *eth,  
> int ring_no, int rx_flag)
>
>  	for (i = 0; i < rx_dma_size; i++) {
>  		dma_addr_t dma_addr = dma_map_single(eth->dev,
> -				ring->data[i] + NET_SKB_PAD,
> +				ring->data[i] + NET_SKB_PAD + eth->ip_align,
>  				ring->buf_size,
>  				DMA_FROM_DEVICE);
>  		if (unlikely(dma_mapping_error(eth->dev, dma_addr)))
>  			return -ENOMEM;
>  		ring->dma[i].rxd1 = (unsigned int)dma_addr;
>
> -		ring->dma[i].rxd2 = RX_DMA_PLEN0(ring->buf_size);
> +		if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628))
> +			ring->dma[i].rxd2 = RX_DMA_LSO;
> +		else
> +			ring->dma[i].rxd2 = RX_DMA_PLEN0(ring->buf_size);
>  	}
>  	ring->dma_size = rx_dma_size;
>  	ring->calc_idx_update = false;
> @@ -1617,9 +1814,16 @@ static int mtk_dma_busy_wait(struct mtk_eth *eth)
>  	unsigned long t_start = jiffies;
>
>  	while (1) {
> -		if (!(mtk_r32(eth, MTK_QDMA_GLO_CFG) &
> -		      (MTK_RX_DMA_BUSY | MTK_TX_DMA_BUSY)))
> -			return 0;
> +		if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
> +			if (!(mtk_r32(eth, MTK_PDMA_GLO_CFG) &
> +			      (MTK_RX_DMA_BUSY | MTK_TX_DMA_BUSY)))
> +				return 0;
> +		} else {
> +			if (!(mtk_r32(eth, MTK_QDMA_GLO_CFG) &
> +			      (MTK_RX_DMA_BUSY | MTK_TX_DMA_BUSY)))
> +				return 0;
> +		}
> +
>  		if (time_after(jiffies, t_start + MTK_DMA_BUSY_TIMEOUT))
>  			break;
>  	}
> @@ -1636,20 +1840,24 @@ static int mtk_dma_init(struct mtk_eth *eth)
>  	if (mtk_dma_busy_wait(eth))
>  		return -EBUSY;
>
> -	/* QDMA needs scratch memory for internal reordering of the
> -	 * descriptors
> -	 */
> -	err = mtk_init_fq_dma(eth);
> -	if (err)
> -		return err;
> +	if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
> +		/* QDMA needs scratch memory for internal reordering of the
> +		 * descriptors
> +		 */
> +		err = mtk_init_fq_dma(eth);
> +		if (err)
> +			return err;
> +	}
>
>  	err = mtk_tx_alloc(eth);
>  	if (err)
>  		return err;
>
> -	err = mtk_rx_alloc(eth, 0, MTK_RX_FLAGS_QDMA);
> -	if (err)
> -		return err;
> +	if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
> +		err = mtk_rx_alloc(eth, 0, MTK_RX_FLAGS_QDMA);
> +		if (err)
> +			return err;
> +	}
>
>  	err = mtk_rx_alloc(eth, 0, MTK_RX_FLAGS_NORMAL);
>  	if (err)
> @@ -1666,10 +1874,14 @@ static int mtk_dma_init(struct mtk_eth *eth)
>  			return err;
>  	}
>
> -	/* Enable random early drop and set drop threshold automatically */
> -	mtk_w32(eth, FC_THRES_DROP_MODE | FC_THRES_DROP_EN | FC_THRES_MIN,
> -		MTK_QDMA_FC_THRES);
> -	mtk_w32(eth, 0x0, MTK_QDMA_HRED2);
> +	if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
> +		/* Enable random early drop and set drop threshold
> +		 * automatically
> +		 */
> +		mtk_w32(eth, FC_THRES_DROP_MODE | FC_THRES_DROP_EN |
> +			FC_THRES_MIN, MTK_QDMA_FC_THRES);
> +		mtk_w32(eth, 0x0, MTK_QDMA_HRED2);
> +	}
>
>  	return 0;
>  }
> @@ -1740,14 +1952,23 @@ static irqreturn_t mtk_handle_irq_tx(int  
> irq, void *_eth)
>  static irqreturn_t mtk_handle_irq(int irq, void *_eth)
>  {
>  	struct mtk_eth *eth = _eth;
> +	u32 status;
>
> +	status = mtk_r32(eth, MTK_PDMA_INT_STATUS);
>  	if (mtk_r32(eth, MTK_PDMA_INT_MASK) & MTK_RX_DONE_INT) {
>  		if (mtk_r32(eth, MTK_PDMA_INT_STATUS) & MTK_RX_DONE_INT)
>  			mtk_handle_irq_rx(irq, _eth);
>  	}
> -	if (mtk_r32(eth, MTK_QDMA_INT_MASK) & MTK_TX_DONE_INT) {
> -		if (mtk_r32(eth, MTK_QMTK_INT_STATUS) & MTK_TX_DONE_INT)
> -			mtk_handle_irq_tx(irq, _eth);
> +	if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
> +		if (mtk_r32(eth, MTK_PDMA_INT_MASK) & MTK_TX_DONE_INT) {
> +			if (mtk_r32(eth, MTK_PDMA_INT_STATUS) & MTK_TX_DONE_INT)
> +				mtk_handle_irq_tx(irq, _eth);
> +		}
> +	} else {
> +		if (mtk_r32(eth, MTK_QDMA_INT_MASK) & MTK_TX_DONE_INT) {
> +			if (mtk_r32(eth, MTK_QMTK_INT_STATUS) & MTK_TX_DONE_INT)
> +				mtk_handle_irq_tx(irq, _eth);
> +		}
>  	}
>
>  	return IRQ_HANDLED;
> @@ -1778,17 +1999,23 @@ static int mtk_start_dma(struct mtk_eth *eth)
>  		return err;
>  	}
>
> -	mtk_w32(eth,
> -		MTK_TX_WB_DDONE | MTK_TX_DMA_EN |
> -		MTK_DMA_SIZE_16DWORDS | MTK_NDP_CO_PRO |
> -		MTK_RX_DMA_EN | MTK_RX_2B_OFFSET |
> -		MTK_RX_BT_32DWORDS,
> -		MTK_QDMA_GLO_CFG);
> +	if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
> +		mtk_w32(eth,
> +			MTK_TX_WB_DDONE | MTK_TX_DMA_EN |
> +			MTK_DMA_SIZE_16DWORDS | MTK_NDP_CO_PRO |
> +			MTK_RX_DMA_EN | MTK_RX_2B_OFFSET |
> +			MTK_RX_BT_32DWORDS,
> +			MTK_QDMA_GLO_CFG);
>
> -	mtk_w32(eth,
> -		MTK_RX_DMA_EN | rx_2b_offset |
> -		MTK_RX_BT_32DWORDS | MTK_MULTI_EN,
> -		MTK_PDMA_GLO_CFG);
> +		mtk_w32(eth,
> +			MTK_RX_DMA_EN | rx_2b_offset |
> +			MTK_RX_BT_32DWORDS | MTK_MULTI_EN,
> +			MTK_PDMA_GLO_CFG);
> +	} else {
> +		mtk_w32(eth, MTK_TX_WB_DDONE | MTK_TX_DMA_EN | MTK_RX_DMA_EN |
> +			MTK_MULTI_EN | MTK_PDMA_SIZE_8DWORDS,
> +			MTK_PDMA_GLO_CFG);
> +	}
>
>  	return 0;
>  }
> @@ -1816,7 +2043,6 @@ static int mtk_open(struct net_device *dev)
>
>  	phy_start(dev->phydev);
>  	netif_start_queue(dev);
> -
>  	return 0;
>  }
>
> @@ -1860,7 +2086,8 @@ static int mtk_stop(struct net_device *dev)
>  	napi_disable(&eth->tx_napi);
>  	napi_disable(&eth->rx_napi);
>
> -	mtk_stop_dma(eth, MTK_QDMA_GLO_CFG);
> +	if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628))
> +		mtk_stop_dma(eth, MTK_QDMA_GLO_CFG);
>  	mtk_stop_dma(eth, MTK_PDMA_GLO_CFG);
>
>  	mtk_dma_free(eth);
> @@ -1922,6 +2149,24 @@ static int mtk_hw_init(struct mtk_eth *eth)
>  	if (ret)
>  		goto err_disable_pm;
>
> +	if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
> +		ret = device_reset(eth->dev);
> +		if (ret) {
> +			dev_err(eth->dev, "MAC reset failed!\n");
> +			goto err_disable_pm;
> +		}
> +
> +		/* enable interrupt delay for RX */
> +		mtk_w32(eth, MTK_PDMA_DELAY_RX_DELAY, MTK_PDMA_DELAY_INT);
> +
> +		/* disable delay and normal interrupt */
> +		mtk_tx_irq_disable(eth, ~0);
> +		mtk_rx_irq_disable(eth, ~0);
> +
> +		return 0;
> +	}
> +
> +	/* Non-MT7628 handling... */
>  	ethsys_reset(eth, RSTCTRL_FE);
>  	ethsys_reset(eth, RSTCTRL_PPE);
>
> @@ -2425,13 +2670,13 @@ static int mtk_add_mac(struct mtk_eth *eth,  
> struct device_node *np)
>  	eth->netdev[id]->netdev_ops = &mtk_netdev_ops;
>  	eth->netdev[id]->base_addr = (unsigned long)eth->base;
>
> -	eth->netdev[id]->hw_features = MTK_HW_FEATURES;
> +	eth->netdev[id]->hw_features = eth->soc->hw_features;
>  	if (eth->hwlro)
>  		eth->netdev[id]->hw_features |= NETIF_F_LRO;
>
> -	eth->netdev[id]->vlan_features = MTK_HW_FEATURES &
> +	eth->netdev[id]->vlan_features = eth->soc->hw_features &
>  		~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX);
> -	eth->netdev[id]->features |= MTK_HW_FEATURES;
> +	eth->netdev[id]->features |= eth->soc->hw_features;
>  	eth->netdev[id]->ethtool_ops = &mtk_ethtool_ops;
>
>  	eth->netdev[id]->irq = eth->irq[0];
> @@ -2463,15 +2708,26 @@ static int mtk_probe(struct platform_device *pdev)
>  	if (IS_ERR(eth->base))
>  		return PTR_ERR(eth->base);
>
> +	if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
> +		eth->tx_int_mask_reg = MTK_PDMA_INT_MASK;
> +		eth->rx_dma_l4_valid = RX_DMA_L4_VALID_PDMA;
> +		eth->ip_align = NET_IP_ALIGN;
> +	} else {
> +		eth->tx_int_mask_reg = MTK_QDMA_INT_MASK;
> +		eth->rx_dma_l4_valid = RX_DMA_L4_VALID;
> +	}
> +
>  	spin_lock_init(&eth->page_lock);
>  	spin_lock_init(&eth->tx_irq_lock);
>  	spin_lock_init(&eth->rx_irq_lock);
>
> -	eth->ethsys = syscon_regmap_lookup_by_phandle(pdev->dev.of_node,
> -						      "mediatek,ethsys");
> -	if (IS_ERR(eth->ethsys)) {
> -		dev_err(&pdev->dev, "no ethsys regmap found\n");
> -		return PTR_ERR(eth->ethsys);
> +	if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
> +		eth->ethsys = syscon_regmap_lookup_by_phandle(pdev->dev.of_node,
> +							      "mediatek,ethsys");
> +		if (IS_ERR(eth->ethsys)) {
> +			dev_err(&pdev->dev, "no ethsys regmap found\n");
> +			return PTR_ERR(eth->ethsys);
> +		}
>  	}
>
>  	if (MTK_HAS_CAPS(eth->soc->caps, MTK_INFRA)) {
> @@ -2570,9 +2826,12 @@ static int mtk_probe(struct platform_device *pdev)
>  	if (err)
>  		goto err_free_dev;
>
> -	err = mtk_mdio_init(eth);
> -	if (err)
> -		goto err_free_dev;
> +	/* No MT7628/88 support yet */
> +	if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
> +		err = mtk_mdio_init(eth);
> +		if (err)
> +			goto err_free_dev;
> +	}
>
>  	for (i = 0; i < MTK_MAX_DEVS; i++) {
>  		if (!eth->netdev[i])
> @@ -2635,12 +2894,14 @@ static int mtk_remove(struct platform_device *pdev)
>
>  static const struct mtk_soc_data mt2701_data = {
>  	.caps = MT7623_CAPS | MTK_HWLRO,
> +	.hw_features = MTK_HW_FEATURES,
>  	.required_clks = MT7623_CLKS_BITMAP,
>  	.required_pctl = true,
>  };
>
>  static const struct mtk_soc_data mt7621_data = {
>  	.caps = MT7621_CAPS,
> +	.hw_features = MTK_HW_FEATURES,
>  	.required_clks = MT7621_CLKS_BITMAP,
>  	.required_pctl = false,
>  };
> @@ -2648,19 +2909,29 @@ static const struct mtk_soc_data mt7621_data = {
>  static const struct mtk_soc_data mt7622_data = {
>  	.ana_rgc3 = 0x2028,
>  	.caps = MT7622_CAPS | MTK_HWLRO,
> +	.hw_features = MTK_HW_FEATURES,
>  	.required_clks = MT7622_CLKS_BITMAP,
>  	.required_pctl = false,
>  };
>
>  static const struct mtk_soc_data mt7623_data = {
>  	.caps = MT7623_CAPS | MTK_HWLRO,
> +	.hw_features = MTK_HW_FEATURES,
>  	.required_clks = MT7623_CLKS_BITMAP,
>  	.required_pctl = true,
>  };
>
> +static const struct mtk_soc_data mt7628_data = {
> +	.caps = MT7628_CAPS,
> +	.hw_features = MTK_HW_FEATURES_MT7628,
> +	.required_clks = MT7628_CLKS_BITMAP,
> +	.required_pctl = false,
> +};
> +
>  static const struct mtk_soc_data mt7629_data = {
>  	.ana_rgc3 = 0x128,
>  	.caps = MT7629_CAPS | MTK_HWLRO,
> +	.hw_features = MTK_HW_FEATURES,
>  	.required_clks = MT7629_CLKS_BITMAP,
>  	.required_pctl = false,
>  };
> @@ -2670,6 +2941,7 @@ const struct of_device_id of_mtk_match[] = {
>  	{ .compatible = "mediatek,mt7621-eth", .data = &mt7621_data},
>  	{ .compatible = "mediatek,mt7622-eth", .data = &mt7622_data},
>  	{ .compatible = "mediatek,mt7623-eth", .data = &mt7623_data},
> +	{ .compatible = "mediatek,mt7628-eth", .data = &mt7628_data},
>  	{ .compatible = "mediatek,mt7629-eth", .data = &mt7629_data},
>  	{},
>  };
> diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.h  
> b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
> index bab94f763e2c..c3866d6451e2 100644
> --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h
> +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
> @@ -39,7 +39,8 @@
>  				 NETIF_F_SG | NETIF_F_TSO | \
>  				 NETIF_F_TSO6 | \
>  				 NETIF_F_IPV6_CSUM)
> -#define NEXT_RX_DESP_IDX(X, Y)	(((X) + 1) & ((Y) - 1))
> +#define MTK_HW_FEATURES_MT7628	(NETIF_F_SG | NETIF_F_RXCSUM)
> +#define NEXT_DESP_IDX(X, Y)	(((X) + 1) & ((Y) - 1))
>
>  #define MTK_MAX_RX_RING_NUM	4
>  #define MTK_HW_LRO_DMA_SIZE	8
> @@ -118,6 +119,7 @@
>  /* PDMA Global Configuration Register */
>  #define MTK_PDMA_GLO_CFG	0xa04
>  #define MTK_MULTI_EN		BIT(10)
> +#define MTK_PDMA_SIZE_8DWORDS	(1 << 4)
>
>  /* PDMA Reset Index Register */
>  #define MTK_PDMA_RST_IDX	0xa08
> @@ -276,11 +278,18 @@
>  #define TX_DMA_OWNER_CPU	BIT(31)
>  #define TX_DMA_LS0		BIT(30)
>  #define TX_DMA_PLEN0(_x)	(((_x) & MTK_TX_DMA_BUF_LEN) << 16)
> +#define TX_DMA_PLEN1(_x)	((_x) & MTK_TX_DMA_BUF_LEN)
>  #define TX_DMA_SWC		BIT(14)
>  #define TX_DMA_SDL(_x)		(((_x) & 0x3fff) << 16)
>
> +/* PDMA on MT7628 */
> +#define TX_DMA_DONE		BIT(31)
> +#define TX_DMA_LS1		BIT(14)
> +#define TX_DMA_DESP2_DEF	(TX_DMA_LS0 | TX_DMA_DONE)
> +
>  /* QDMA descriptor rxd2 */
>  #define RX_DMA_DONE		BIT(31)
> +#define RX_DMA_LSO		BIT(30)
>  #define RX_DMA_PLEN0(_x)	(((_x) & 0x3fff) << 16)
>  #define RX_DMA_GET_PLEN0(_x)	(((_x) >> 16) & 0x3fff)
>
> @@ -289,6 +298,7 @@
>
>  /* QDMA descriptor rxd4 */
>  #define RX_DMA_L4_VALID		BIT(24)
> +#define RX_DMA_L4_VALID_PDMA	BIT(30)		/* when PDMA is used */
>  #define RX_DMA_FPORT_SHIFT	19
>  #define RX_DMA_FPORT_MASK	0x7
>
> @@ -412,6 +422,19 @@
>  #define CO_QPHY_SEL            BIT(0)
>  #define GEPHY_MAC_SEL          BIT(1)
>
> +/* MT7628/88 specific stuff */
> +#define MT7628_PDMA_OFFSET	0x0800
> +#define MT7628_SDM_OFFSET	0x0c00
> +
> +#define MT7628_TX_BASE_PTR0	(MT7628_PDMA_OFFSET + 0x00)
> +#define MT7628_TX_MAX_CNT0	(MT7628_PDMA_OFFSET + 0x04)
> +#define MT7628_TX_CTX_IDX0	(MT7628_PDMA_OFFSET + 0x08)
> +#define MT7628_TX_DTX_IDX0	(MT7628_PDMA_OFFSET + 0x0c)
> +#define MT7628_PST_DTX_IDX0	BIT(0)
> +
> +#define MT7628_SDM_MAC_ADRL	(MT7628_SDM_OFFSET + 0x0c)
> +#define MT7628_SDM_MAC_ADRH	(MT7628_SDM_OFFSET + 0x10)
> +
>  struct mtk_rx_dma {
>  	unsigned int rxd1;
>  	unsigned int rxd2;
> @@ -509,6 +532,7 @@ enum mtk_clks_map {
>  				 BIT(MTK_CLK_SGMII_CK) | \
>  				 BIT(MTK_CLK_ETH2PLL))
>  #define MT7621_CLKS_BITMAP	(0)
> +#define MT7628_CLKS_BITMAP	(0)
>  #define MT7629_CLKS_BITMAP	(BIT(MTK_CLK_ETHIF) | BIT(MTK_CLK_ESW) |  \
>  				 BIT(MTK_CLK_GP0) | BIT(MTK_CLK_GP1) | \
>  				 BIT(MTK_CLK_GP2) | BIT(MTK_CLK_FE) | \
> @@ -563,6 +587,10 @@ struct mtk_tx_ring {
>  	struct mtk_tx_dma *last_free;
>  	u16 thresh;
>  	atomic_t free_count;
> +	int dma_size;
> +	struct mtk_tx_dma *dma_pdma;	/* For MT7628/88 PDMA handling */
> +	dma_addr_t phys_pdma;
> +	int cpu_idx;
>  };
>
>  /* PDMA rx ring mode */
> @@ -604,6 +632,7 @@ enum mkt_eth_capabilities {
>  	MTK_HWLRO_BIT,
>  	MTK_SHARED_INT_BIT,
>  	MTK_TRGMII_MT7621_CLK_BIT,
> +	MTK_SOC_MT7628,

This should be MTK_SOC_MT7628_BIT, this only defines the bit number!

and futher on #define MTK_SOC_MT7628 BIT(MTK_SOC_MT7628_BIT)

Based on this commit [0], MT7621 also needs the PDMA for the RX path.
I know that is not your issue but I think it is better to add a extra
capability bit for the PDMA bits so it can also be used on other socs.

Greats,

René

[0] https://lkml.org/lkml/2018/3/14/1038


>  	/* MUX BITS*/
>  	MTK_ETH_MUX_GDM1_TO_GMAC1_ESW_BIT,
> @@ -696,6 +725,8 @@ enum mkt_eth_capabilities {
>
>  #define MT7623_CAPS  (MTK_GMAC1_RGMII | MTK_GMAC1_TRGMII | MTK_GMAC2_RGMII)
>
> +#define MT7628_CAPS  (MTK_SHARED_INT | MTK_SOC_MT7628)
> +
>  #define MT7629_CAPS  (MTK_GMAC1_SGMII | MTK_GMAC2_SGMII |  
> MTK_GMAC2_GEPHY | \
>  		      MTK_GDM1_ESW | MTK_MUX_GDM1_TO_GMAC1_ESW | \
>  		      MTK_MUX_GMAC2_GMAC0_TO_GEPHY | \
> @@ -707,6 +738,7 @@ enum mkt_eth_capabilities {
>   * @ana_rgc3:                   The offset for register ANA_RGC3 related to
>   *				sgmiisys syscon
>   * @caps			Flags shown the extra capability for the SoC
> + * @hw_features			Flags shown HW features
>   * @required_clks		Flags shown the bitmap for required clocks on
>   *				the target SoC
>   * @required_pctl		A bool value to show whether the SoC requires
> @@ -717,6 +749,7 @@ struct mtk_soc_data {
>  	u32		caps;
>  	u32		required_clks;
>  	bool		required_pctl;
> +	netdev_features_t hw_features;
>  };
>
>  /* currently no SoC has more than 2 macs */
> @@ -810,6 +843,10 @@ struct mtk_eth {
>  	unsigned long			state;
>
>  	const struct mtk_soc_data	*soc;
> +
> +	u32				tx_int_mask_reg;
> +	u32				rx_dma_l4_valid;
> +	int				ip_align;
>  };
>
>  /* struct mtk_mac -	the structure that holds the info about the MACs of the
> --
> 2.22.0




^ permalink raw reply

* Re: [PATCH RFC 0/4] Add support to directly attach BPF program to ftrace
From: Joel Fernandes @ 2019-07-17 13:01 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: linux-kernel, Adrian Ratiu, Alexei Starovoitov, bpf,
	Brendan Gregg, connoro, Daniel Borkmann, duyuchao, Ingo Molnar,
	jeffv, Karim Yaghmour, kernel-team, linux-kselftest,
	Manali Shukla, Manjo Raja Rao, Martin KaFai Lau, Masami Hiramatsu,
	Matt Mullins, Michal Gregorczyk, Michal Gregorczyk,
	Mohammad Husain, namhyung, namhyung, netdev, paul.chaignon,
	primiano, Qais Yousef, Shuah Khan, Song Liu, Srinivas Ramana,
	Steven Rostedt, Tamir Carmeli, Yonghong Song
In-Reply-To: <20190717012406.lugqemvubixfdd6v@ast-mbp.dhcp.thefacebook.com>

On Tue, Jul 16, 2019 at 06:24:07PM -0700, Alexei Starovoitov wrote:
[snip]
> > > > > I don't see why a new bpf node for a trace event is a bad idea, really.
> > > > 
> > > > See the patches for kprobe/uprobe FD-based api and the reasons behind it.
> > > > tldr: text is racy, doesn't scale, poor security, etc.
> > > 
> > > Is it possible to use perf without CAP_SYS_ADMIN and control security at the
> > > per-event level? We are selective about who can access which event, using
> > > selinux. That's how our ftrace-based tracers work. Its fine grained per-event
> > > control. That's where I was going with the tracefs approach since we get that
> > > granularity using the file system.
> 
> android's choice of selinux is not a factor in deciding kernel apis.
> It's completely separate discusion wether disallowing particular tracepoints
> for given user make sense at all.
> Just because you can hack it in via selinux blocking particular
> /sys/debug/tracing/ directory and convince yourself that it's somehow
> makes android more secure. It doesn't mean that all new api should fit
> into this model.

Its not like a hack, it is just control of which tracefs node can be
accessed and which cannot be since the tracing can run on production systems
out in the field and there are several concerns to address like security,
privacy etc. It is not just for debugging usecases. We do collect traces out
in the field where these issues are real and cannot be ignored.

SELinux model is deny everything, and then selectively grant access to what
is needed. The VFS and security LSM hooks provide this control quite well. I am
not sure if such control is possible through perf hence I asked the question.

> I think allowing one tracepoint and disallowing another is pointless
> from security point of view. Tracing bpf program can do bpf_probe_read
> of anything.

I think the assumption here is the user controls the program instructions at
runtime, but that's not the case. The BPF program we are loading is not
dynamically generated, it is built at build time and it is loaded from a
secure verified partition, so even though it can do bpf_probe_read, it is
still not something that the user can change. And, we are planning to make it
even more secure by making it kernel verify the program at load time as well
(you were on some discussions about that a few months ago).

thanks,

 - Joel


^ permalink raw reply

* Re: [PATCH v2 1/3] Bluetooth: btintel: Add firmware lock function
From: Marcel Holtmann @ 2019-07-17 13:36 UTC (permalink / raw)
  To: Kai-Heng Feng
  Cc: Johannes Berg, emmanuel.grumbach, luciano.coelho, Johan Hedberg,
	linuxwifi, linux-wireless, netdev, linux-bluetooth, linux-kernel
In-Reply-To: <20190717074920.21624-1-kai.heng.feng@canonical.com>

Hi Kai-Heng,

> When Intel 8260 starts to load Bluetooth firmware and WiFi firmware, by
> calling btintel_download_firmware() and iwl_pcie_load_given_ucode_8000()
> respectively, the Bluetooth btintel_download_firmware() aborts half way:
> [   11.950216] Bluetooth: hci0: Failed to send firmware data (-38)
> 
> Let btusb and iwlwifi load firmwares exclusively can avoid the issue, so
> introduce a lock to use in btusb and iwlwifi.
> 
> This issue still occurs with latest WiFi and Bluetooth firmwares.
> 
> BugLink: https://bugs.launchpad.net/bugs/1832988
> 
> Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
> ---
> v2:
> - Add bug report link.
> - Rebase on latest wireless-next.
> 
> drivers/bluetooth/btintel.c   | 14 ++++++++++++++
> drivers/bluetooth/btintel.h   | 10 ++++++++++
> include/linux/intel-wifi-bt.h |  8 ++++++++
> 3 files changed, 32 insertions(+)
> create mode 100644 include/linux/intel-wifi-bt.h
> 
> diff --git a/drivers/bluetooth/btintel.c b/drivers/bluetooth/btintel.c
> index bb99c8653aab..93ab18d6ddad 100644
> --- a/drivers/bluetooth/btintel.c
> +++ b/drivers/bluetooth/btintel.c
> @@ -20,6 +20,8 @@
> 
> #define BDADDR_INTEL (&(bdaddr_t) {{0x00, 0x8b, 0x9e, 0x19, 0x03, 0x00}})
> 
> +static DEFINE_MUTEX(firmware_lock);
> +
> int btintel_check_bdaddr(struct hci_dev *hdev)
> {
> 	struct hci_rp_read_bd_addr *bda;
> @@ -709,6 +711,18 @@ int btintel_download_firmware(struct hci_dev *hdev, const struct firmware *fw,
> }
> EXPORT_SYMBOL_GPL(btintel_download_firmware);
> 
> +void btintel_firmware_lock(void)
> +{
> +	mutex_lock(&firmware_lock);
> +}
> +EXPORT_SYMBOL_GPL(btintel_firmware_lock);
> +
> +void btintel_firmware_unlock(void)
> +{
> +	mutex_unlock(&firmware_lock);
> +}
> +EXPORT_SYMBOL_GPL(btintel_firmware_unlock);
> +

so I am not in favor of this solution. The hardware guys should start looking into fixing the firmware loading and provide proper firmware that can be loaded at the same time.

I am also not for sure penalizing all Intel Bluetooth/WiFi combos only because one of them has a bug during simultaneous loading of WiFi and Bluetooth firmware.

Frankly it would be better to detect a failed load and try a second time instead of trying to lock each other out. The cross-contamination of WiFi and Bluetooth drivers is just not clean.

Regards

Marcel


^ permalink raw reply

* Re: Request for backport of 96125bf9985a75db00496dd2bc9249b777d2b19b
From: Dave Taht @ 2019-07-17 13:45 UTC (permalink / raw)
  To: Loganaden Velvindron; +Cc: netdev
In-Reply-To: <CAOp4FwQszD4ocAx6hWud5uvzv5EtuTOpYqJ10XhR5gxkXSZvFQ@mail.gmail.com>

On Mon, Jul 15, 2019 at 11:01 AM Loganaden Velvindron
<loganaden@gmail.com> wrote:
>
> On Fri, Jul 5, 2019 at 6:15 PM Loganaden Velvindron <loganaden@gmail.com> wrote:
> >
> > Hi folks,
> >
> > I read the guidelines for LTS/stable.
> > https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
> >
> >
> > Although this is not a bugfix, I am humbly submitting a request so
> > that commit id
> > -- 96125bf9985a75db00496dd2bc9249b777d2b19b Allow 0.0.0.0/8 as a valid
> > address range --  is backported to all LTS kernels.
> >
> > My motivation for such a request is that we need this patch to be as
> > widely deployed as possible and as early as possible for interop and
> > hopefully move into better utilization of ipv4 addresses space. Hence
> > my request for it be added to -stable.
> >
>
> Any feedback ?
>
> > Kind regards,
> > //Logan

I am perfectly willing to wait a year or so on the -stable front to
see what, if any, problems that ensue from mainlining this in 5.3.
It's straightforward for distros that wish to do this backport (like
openwrt) to do it now, and other OSes will take longer than this to
adopt, regardless.


-- 

Dave Täht
CTO, TekLibre, LLC
http://www.teklibre.com
Tel: 1-831-205-9740

^ permalink raw reply

* Re: [PATCH v2 1/3] Bluetooth: btintel: Add firmware lock function
From: Kai-Heng Feng @ 2019-07-17 14:08 UTC (permalink / raw)
  To: Marcel Holtmann
  Cc: Johannes Berg, Grumbach, Emmanuel, Luciano Coelho, Johan Hedberg,
	linuxwifi, linux-wireless, netdev, linux-bluetooth, linux-kernel
In-Reply-To: <7CE1949F-76D2-4D27-82B6-02124E62DF5C@holtmann.org>

at 21:36, Marcel Holtmann <marcel@holtmann.org> wrote:

> Hi Kai-Heng,
>
>> When Intel 8260 starts to load Bluetooth firmware and WiFi firmware, by
>> calling btintel_download_firmware() and iwl_pcie_load_given_ucode_8000()
>> respectively, the Bluetooth btintel_download_firmware() aborts half way:
>> [   11.950216] Bluetooth: hci0: Failed to send firmware data (-38)
>>
>> Let btusb and iwlwifi load firmwares exclusively can avoid the issue, so
>> introduce a lock to use in btusb and iwlwifi.
>>
>> This issue still occurs with latest WiFi and Bluetooth firmwares.
>>
>> BugLink: https://bugs.launchpad.net/bugs/1832988
>>
>> Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
>> ---
>> v2:
>> - Add bug report link.
>> - Rebase on latest wireless-next.
>>
>> drivers/bluetooth/btintel.c   | 14 ++++++++++++++
>> drivers/bluetooth/btintel.h   | 10 ++++++++++
>> include/linux/intel-wifi-bt.h |  8 ++++++++
>> 3 files changed, 32 insertions(+)
>> create mode 100644 include/linux/intel-wifi-bt.h
>>
>> diff --git a/drivers/bluetooth/btintel.c b/drivers/bluetooth/btintel.c
>> index bb99c8653aab..93ab18d6ddad 100644
>> --- a/drivers/bluetooth/btintel.c
>> +++ b/drivers/bluetooth/btintel.c
>> @@ -20,6 +20,8 @@
>>
>> #define BDADDR_INTEL (&(bdaddr_t) {{0x00, 0x8b, 0x9e, 0x19, 0x03, 0x00}})
>>
>> +static DEFINE_MUTEX(firmware_lock);
>> +
>> int btintel_check_bdaddr(struct hci_dev *hdev)
>> {
>> 	struct hci_rp_read_bd_addr *bda;
>> @@ -709,6 +711,18 @@ int btintel_download_firmware(struct hci_dev *hdev,  
>> const struct firmware *fw,
>> }
>> EXPORT_SYMBOL_GPL(btintel_download_firmware);
>>
>> +void btintel_firmware_lock(void)
>> +{
>> +	mutex_lock(&firmware_lock);
>> +}
>> +EXPORT_SYMBOL_GPL(btintel_firmware_lock);
>> +
>> +void btintel_firmware_unlock(void)
>> +{
>> +	mutex_unlock(&firmware_lock);
>> +}
>> +EXPORT_SYMBOL_GPL(btintel_firmware_unlock);
>> +
>
> so I am not in favor of this solution. The hardware guys should start  
> looking into fixing the firmware loading and provide proper firmware that  
> can be loaded at the same time.

Of course it’s much better to fix from hardware side.

>
> I am also not for sure penalizing all Intel Bluetooth/WiFi combos only  
> because one of them has a bug during simultaneous loading of WiFi and  
> Bluetooth firmware.

Yes, it’s not ideal.

>
> Frankly it would be better to detect a failed load and try a second time  
> instead of trying to lock each other out. The cross-contamination of WiFi  
> and Bluetooth drivers is just not clean.

Ok. Where do you think is better to handle it, Bluetooth core or USB core?

Kai-Heng

>
> Regards
>
> Marcel



^ permalink raw reply

* [PATCH] net: dsa: sja1105: Add missing spin_unlock
From: YueHaibing @ 2019-07-17 14:12 UTC (permalink / raw)
  To: andrew, vivien.didelot, f.fainelli, davem, olteanv
  Cc: linux-kernel, netdev, YueHaibing

It should call spin_unlock() before return NULL.
Detected by Coccinelle.

Reported-by: Hulk Robot <hulkci@huawei.com>
Fixes: f3097be21bf1 net: ("dsa: sja1105: Add a state machine for RX timestamping")
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
---
 net/dsa/tag_sja1105.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index 1d96c9d..26363d7 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -216,6 +216,7 @@ static struct sk_buff
 		if (!skb) {
 			dev_err_ratelimited(dp->ds->dev,
 					    "Failed to copy stampable skb\n");
+			spin_unlock(&sp->data->meta_lock);
 			return NULL;
 		}
 		sja1105_transfer_meta(skb, meta);
-- 
2.7.4



^ permalink raw reply related

* Re: [PATCH iproute2-rc 2/8] rdma: Add "stat qp show" support
From: Leon Romanovsky @ 2019-07-17 14:23 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, David Ahern, Mark Zhang, RDMA mailing list
In-Reply-To: <20190716120128.6beab22e@hermes.lan>

On Tue, Jul 16, 2019 at 12:01:28PM -0700, Stephen Hemminger wrote:
> On Wed, 10 Jul 2019 10:24:49 +0300
> Leon Romanovsky <leon@kernel.org> wrote:
>
> > From: Mark Zhang <markz@mellanox.com>
> >
> > This patch presents link, id, task name, lqpn, as well as all sub
> > counters of a QP counter.
> > A QP counter is a dynamically allocated statistic counter that is
> > bound with one or more QPs. It has several sub-counters, each is
> > used for a different purpose.
> >
> > Examples:
> > $ rdma stat qp show
> > link mlx5_2/1 cntn 5 pid 31609 comm client.1 rx_write_requests 0
> > rx_read_requests 0 rx_atomic_requests 0 out_of_buffer 0 out_of_sequence 0
> > duplicate_request 0 rnr_nak_retry_err 0 packet_seq_err 0
> > implied_nak_seq_err 0 local_ack_timeout_err 0 resp_local_length_error 0
> > resp_cqe_error 0 req_cqe_error 0 req_remote_invalid_request 0
> > req_remote_access_errors 0 resp_remote_access_errors 0
> > resp_cqe_flush_error 0 req_cqe_flush_error 0
> >     LQPN: <178>
> > $ rdma stat show link rocep1s0f5/1
> > link rocep1s0f5/1 rx_write_requests 0 rx_read_requests 0 rx_atomic_requests 0 out_of_buffer 0 duplicate_request 0
> > rnr_nak_retry_err 0 packet_seq_err 0 implied_nak_seq_err 0 local_ack_timeout_err 0 resp_local_length_error 0 resp_cqe_error 0
> > req_cqe_error 0 req_remote_invalid_request 0 req_remote_access_errors 0 resp_remote_access_errors 0 resp_cqe_flush_error 0
> > req_cqe_flush_error 0 rp_cnp_ignored 0 rp_cnp_handled 0 np_ecn_marked_roce_packets 0 np_cnp_sent 0
> > $ rdma stat show link rocep1s0f5/1 -p
> > link rocep1s0f5/1
> >     rx_write_requests 0
> >     rx_read_requests 0
> >     rx_atomic_requests 0
> >     out_of_buffer 0
> >     duplicate_request 0
> >     rnr_nak_retry_err 0
> >     packet_seq_err 0
> >     implied_nak_seq_err 0
> >     local_ack_timeout_err 0
> >     resp_local_length_error 0
> >     resp_cqe_error 0
> >     req_cqe_error 0
> >     req_remote_invalid_request 0
> >     req_remote_access_errors 0
> >     resp_remote_access_errors 0
> >     resp_cqe_flush_error 0
> >     req_cqe_flush_error 0
> >     rp_cnp_ignored 0
> >     rp_cnp_handled 0
> >     np_ecn_marked_roce_packets 0
> >     np_cnp_sent 0
> >
> > Signed-off-by: Mark Zhang <markz@mellanox.com>
> > Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
> > ---
> >  rdma/Makefile |   2 +-
> >  rdma/rdma.c   |   3 +-
> >  rdma/rdma.h   |   1 +
> >  rdma/stat.c   | 268 ++++++++++++++++++++++++++++++++++++++++++++++++++
> >  rdma/utils.c  |   7 ++
> >  5 files changed, 279 insertions(+), 2 deletions(-)
> >  create mode 100644 rdma/stat.c
> >
>
> Headers have been merged, but this patch does not apply cleanly to current iproute2

Strange, it applied for me cleanly and latest commit in my iproute2
local repo is d035cc1b "ip tunnel: warn when changing IPv6 tunnel without tunnel name"

I will resend the series with fixed typo.

Thanks

>

^ permalink raw reply

* Re: [PATCH V3 00/15] Packed virtqueue support for vhost
From: Michael S. Tsirkin @ 2019-07-17 14:28 UTC (permalink / raw)
  To: Jason Wang
  Cc: kvm, virtualization, netdev, linux-kernel, jfreimann, tiwei.bie,
	maxime.coquelin
In-Reply-To: <af066030-96f1-4a8d-4864-7b6b903477a6@redhat.com>

On Wed, Jul 17, 2019 at 08:27:28PM +0800, Jason Wang wrote:
> 
> On 2019/7/17 下午7:02, Michael S. Tsirkin wrote:
> > On Wed, Jul 17, 2019 at 06:52:40AM -0400, Jason Wang wrote:
> > > Hi all:
> > > 
> > > This series implements packed virtqueues which were described
> > > at [1]. In this version we try to address the performance regression
> > > saw by V2. The root cause is packed virtqueue need more times of
> > > userspace memory accesssing which turns out to be very
> > > expensive. Thanks to the help of 7f466032dc9e ("vhost: access vq
> > > metadata through kernel virtual address"), such overhead cold be
> > > eliminated. So in this version, we can see about 2% improvement for
> > > packed virtqueue on PPS.
> > Great job, thanks!
> > Pls allow a bit more review time than usual as this is a big patchset.
> > Should be done by Tuesday.
> > -next material anyway.
> 
> 
> Sure, just to confirm, I think this should go for your vhost tree?.
> 
> Thanks

I think this makes sense, yes.

^ permalink raw reply

* [PATCH iproute2-rc v1 0/7] Statistics counter support
From: Leon Romanovsky @ 2019-07-17 14:31 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Leon Romanovsky, netdev, David Ahern, Mark Zhang,
	RDMA mailing list

From: Leon Romanovsky <leonro@mellanox.com>

Changelog v0->v1:
 * Fixed typo in manual page (Gal)
 * Rebased on top of d035cc1b "ip tunnel: warn when changing IPv6 tunnel without tunnel name"
 * Dropped update header file because it was already merged.

---------------------------------------------------------------------------------------

Hi,

This is supplementary part of accepted to rdma-next kernel series,
that kernel series provided an option to get various counters: global
and per-objects.

Currently, all counters are printed in format similar to other
device/link properties, while "-p" option will print them in table like
format.

[leonro@server ~]$ rdma stat show
link mlx5_0/1 rx_write_requests 0 rx_read_requests 0 rx_atomic_requests
0 out_of_buffer 0 duplicate_request 0 rnr_nak_retry_err 0 packet_seq_err
0 implied_nak_seq_err 0 local_ack_timeout_err 0 resp_local_length_error
0 resp_cqe_error 0 req_cqe_error 0 req_remote_invalid_request 0
req_remote_access_errors 0 resp_remote_access_errors 0
resp_cqe_flush_error 0 req_cqe_flush_error 0 rp_cnp_ignored 0
rp_cnp_handled 0 np_ecn_marked_roce_packets 0 np_cnp_sent 0

[leonro@server ~]$ rdma stat show -p
link mlx5_0/1
	rx_write_requests 0
	rx_read_requests 0
	rx_atomic_requests 0
	out_of_buffer 0
	duplicate_request 0
	rnr_nak_retry_err 0
	packet_seq_err 0
	implied_nak_seq_err 0
	local_ack_timeout_err 0
	resp_local_length_error 0
	resp_cqe_error 0
	req_cqe_error 0
	req_remote_invalid_request 0
	req_remote_access_errors 0
	resp_remote_access_errors 0
	resp_cqe_flush_error 0
	req_cqe_flush_error 0
	rp_cnp_ignored 0
	rp_cnp_handled 0
	np_ecn_marked_roce_packets 0
	np_cnp_sent 0

Thanks

Mark Zhang (7):
  rdma: Add "stat qp show" support
  rdma: Add get per-port counter mode support
  rdma: Add rdma statistic counter per-port auto mode support
  rdma: Make get_port_from_argv() returns valid port in strict port mode
  rdma: Add stat manual mode support
  rdma: Add default counter show support
  rdma: Document counter statistic

 man/man8/rdma-dev.8       |   1 +
 man/man8/rdma-link.8      |   1 +
 man/man8/rdma-resource.8  |   1 +
 man/man8/rdma-statistic.8 | 167 +++++++++
 man/man8/rdma.8           |   7 +-
 rdma/Makefile             |   2 +-
 rdma/rdma.c               |   3 +-
 rdma/rdma.h               |   1 +
 rdma/stat.c               | 759 ++++++++++++++++++++++++++++++++++++++
 rdma/utils.c              |  17 +-
 10 files changed, 954 insertions(+), 5 deletions(-)
 create mode 100644 man/man8/rdma-statistic.8
 create mode 100644 rdma/stat.c

--
2.20.1


^ permalink raw reply

* [PATCH iproute2-rc v1 2/7] rdma: Add get per-port counter mode support
From: Leon Romanovsky @ 2019-07-17 14:31 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Leon Romanovsky, netdev, David Ahern, Mark Zhang,
	RDMA mailing list
In-Reply-To: <20190717143157.27205-1-leon@kernel.org>

From: Mark Zhang <markz@mellanox.com>

Add an interface to show which mode is active. Two modes are supported:
- "auto": In this mode all QPs belong to one category are bind automatically
  to a single counter set. Currently only "qp type" is supported;
- "manual": In this mode QPs are bound to a counter manually.

Examples:
$ rdma statistic qp mode
0/1: mlx5_0/1: qp auto off
1/1: mlx5_1/1: qp auto off
2/1: mlx5_2/1: qp auto type on
3/1: mlx5_3/1: qp auto off

$ rdma statistic qp mode link mlx5_0
0/1: mlx5_0/1: qp auto off

Signed-off-by: Mark Zhang <markz@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 rdma/stat.c  | 140 +++++++++++++++++++++++++++++++++++++++++++++++++++
 rdma/utils.c |   2 +
 2 files changed, 142 insertions(+)

diff --git a/rdma/stat.c b/rdma/stat.c
index da35ef7d..0c239851 100644
--- a/rdma/stat.c
+++ b/rdma/stat.c
@@ -13,13 +13,152 @@ static int stat_help(struct rd *rd)
 	pr_out("Usage: %s [ OPTIONS ] statistic { COMMAND | help }\n", rd->filename);
 	pr_out("       %s statistic OBJECT show\n", rd->filename);
 	pr_out("       %s statistic OBJECT show link [ DEV/PORT_INDEX ] [ FILTER-NAME FILTER-VALUE ]\n", rd->filename);
+	pr_out("       %s statistic OBJECT mode\n", rd->filename);
+	pr_out("where  OBJECT: = { qp }\n");
 	pr_out("Examples:\n");
 	pr_out("       %s statistic qp show\n", rd->filename);
 	pr_out("       %s statistic qp show link mlx5_2/1\n", rd->filename);
+	pr_out("       %s statistic qp mode\n", rd->filename);
+	pr_out("       %s statistic qp mode link mlx5_0\n", rd->filename);
 
 	return 0;
 }
 
+struct counter_param {
+	char *name;
+	uint32_t attr;
+};
+
+static struct counter_param auto_params[] = {
+	{ "type", RDMA_COUNTER_MASK_QP_TYPE, },
+	{ NULL },
+};
+
+static int prepare_auto_mode_str(struct nlattr **tb, uint32_t mask,
+				 char *output, int len)
+{
+	char s[] = "qp auto";
+	int i, outlen = strlen(s);
+
+	memset(output, 0, len);
+	snprintf(output, len, "%s", s);
+
+	if (mask) {
+		for (i = 0; auto_params[i].name != NULL; i++) {
+			if (mask & auto_params[i].attr) {
+				outlen += strlen(auto_params[i].name) + 1;
+				if (outlen >= len)
+					return -EINVAL;
+				strcat(output, " ");
+				strcat(output, auto_params[i].name);
+			}
+		}
+
+		if (outlen + strlen(" on") >= len)
+			return -EINVAL;
+		strcat(output, " on");
+	} else {
+		if (outlen + strlen(" off") >= len)
+			return -EINVAL;
+		strcat(output, " off");
+	}
+
+	return 0;
+}
+
+static int qp_link_get_mode_parse_cb(const struct nlmsghdr *nlh, void *data)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX] = {};
+	uint32_t mode = 0, mask = 0;
+	char output[128] = {};
+	struct rd *rd = data;
+	uint32_t idx, port;
+	const char *name;
+
+	mnl_attr_parse(nlh, 0, rd_attr_cb, tb);
+	if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_DEV_NAME])
+		return MNL_CB_ERROR;
+
+	if (!tb[RDMA_NLDEV_ATTR_PORT_INDEX]) {
+		pr_err("This tool doesn't support switches yet\n");
+		return MNL_CB_ERROR;
+	}
+
+	idx = mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	port = mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+	name = mnl_attr_get_str(tb[RDMA_NLDEV_ATTR_DEV_NAME]);
+	if (tb[RDMA_NLDEV_ATTR_STAT_MODE])
+		mode = mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]);
+
+	if (mode == RDMA_COUNTER_MODE_AUTO) {
+		if (!tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK])
+			return MNL_CB_ERROR;
+		mask = mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]);
+		prepare_auto_mode_str(tb, mask, output, sizeof(output));
+	} else {
+		snprintf(output, sizeof(output), "qp auto off");
+	}
+
+	if (rd->json_output) {
+		jsonw_uint_field(rd->jw, "ifindex", idx);
+		jsonw_uint_field(rd->jw, "port", port);
+		jsonw_string_field(rd->jw, "mode", output);
+	} else {
+		pr_out("%u/%u: %s/%u: %s\n", idx, port, name, port, output);
+	}
+
+	return MNL_CB_OK;
+}
+
+static int stat_one_qp_link_get_mode(struct rd *rd)
+{
+	uint32_t seq;
+	int ret;
+
+	if (!rd->port_idx)
+		return 0;
+
+	rd_prepare_msg(rd, RDMA_NLDEV_CMD_STAT_GET,
+		       &seq, (NLM_F_REQUEST | NLM_F_ACK));
+
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_DEV_INDEX, rd->dev_idx);
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_PORT_INDEX, rd->port_idx);
+	/* Make RDMA_NLDEV_ATTR_STAT_MODE valid so that kernel knows
+	 * return only mode instead of all counters
+	 */
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_STAT_MODE,
+			 RDMA_COUNTER_MODE_MANUAL);
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_STAT_RES, RDMA_NLDEV_ATTR_RES_QP);
+	ret = rd_send_msg(rd);
+	if (ret)
+		return ret;
+
+	if (rd->json_output)
+		jsonw_start_object(rd->jw);
+	ret = rd_recv_msg(rd, qp_link_get_mode_parse_cb, rd, seq);
+	if (rd->json_output)
+		jsonw_end_object(rd->jw);
+
+	return ret;
+}
+
+static int stat_qp_link_get_mode(struct rd *rd)
+{
+	return rd_exec_link(rd, stat_one_qp_link_get_mode, false);
+}
+
+static int stat_qp_get_mode(struct rd *rd)
+{
+	const struct rd_cmd cmds[] = {
+		{ NULL,		stat_qp_link_get_mode },
+		{ "link",	stat_qp_link_get_mode },
+		{ "help",	stat_help },
+		{ 0 }
+	};
+
+	return rd_exec_cmd(rd, cmds, "parameter");
+}
+
 static int res_get_hwcounters(struct rd *rd, struct nlattr *hwc_table, bool print)
 {
 	struct nlattr *nla_entry;
@@ -248,6 +387,7 @@ static int stat_qp(struct rd *rd)
 		{ NULL,		stat_qp_show },
 		{ "show",	stat_qp_show },
 		{ "list",	stat_qp_show },
+		{ "mode",	stat_qp_get_mode },
 		{ "help",	stat_help },
 		{ 0 }
 	};
diff --git a/rdma/utils.c b/rdma/utils.c
index 7bc0439a..9c885ad7 100644
--- a/rdma/utils.c
+++ b/rdma/utils.c
@@ -443,6 +443,8 @@ static const enum mnl_attr_data_type nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 	[RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY] = MNL_TYPE_NESTED,
 	[RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME] = MNL_TYPE_NUL_STRING,
 	[RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE] = MNL_TYPE_U64,
+	[RDMA_NLDEV_ATTR_STAT_MODE] = MNL_TYPE_U32,
+	[RDMA_NLDEV_ATTR_STAT_RES] = MNL_TYPE_U32,
 };
 
 int rd_attr_check(const struct nlattr *attr, int *typep)
-- 
2.20.1


^ permalink raw reply related

* [PATCH iproute2-rc v1 1/7] rdma: Add "stat qp show" support
From: Leon Romanovsky @ 2019-07-17 14:31 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Leon Romanovsky, netdev, David Ahern, Mark Zhang,
	RDMA mailing list
In-Reply-To: <20190717143157.27205-1-leon@kernel.org>

From: Mark Zhang <markz@mellanox.com>

This patch presents link, id, task name, lqpn, as well as all sub
counters of a QP counter.
A QP counter is a dynamically allocated statistic counter that is
bound with one or more QPs. It has several sub-counters, each is
used for a different purpose.

Examples:
$ rdma stat qp show
link mlx5_2/1 cntn 5 pid 31609 comm client.1 rx_write_requests 0
rx_read_requests 0 rx_atomic_requests 0 out_of_buffer 0 out_of_sequence 0
duplicate_request 0 rnr_nak_retry_err 0 packet_seq_err 0
implied_nak_seq_err 0 local_ack_timeout_err 0 resp_local_length_error 0
resp_cqe_error 0 req_cqe_error 0 req_remote_invalid_request 0
req_remote_access_errors 0 resp_remote_access_errors 0
resp_cqe_flush_error 0 req_cqe_flush_error 0
    LQPN: <178>
$ rdma stat show link rocep1s0f5/1
link rocep1s0f5/1 rx_write_requests 0 rx_read_requests 0 rx_atomic_requests 0 out_of_buffer 0 duplicate_request 0
rnr_nak_retry_err 0 packet_seq_err 0 implied_nak_seq_err 0 local_ack_timeout_err 0 resp_local_length_error 0 resp_cqe_error 0
req_cqe_error 0 req_remote_invalid_request 0 req_remote_access_errors 0 resp_remote_access_errors 0 resp_cqe_flush_error 0
req_cqe_flush_error 0 rp_cnp_ignored 0 rp_cnp_handled 0 np_ecn_marked_roce_packets 0 np_cnp_sent 0
$ rdma stat show link rocep1s0f5/1 -p
link rocep1s0f5/1
    rx_write_requests 0
    rx_read_requests 0
    rx_atomic_requests 0
    out_of_buffer 0
    duplicate_request 0
    rnr_nak_retry_err 0
    packet_seq_err 0
    implied_nak_seq_err 0
    local_ack_timeout_err 0
    resp_local_length_error 0
    resp_cqe_error 0
    req_cqe_error 0
    req_remote_invalid_request 0
    req_remote_access_errors 0
    resp_remote_access_errors 0
    resp_cqe_flush_error 0
    req_cqe_flush_error 0
    rp_cnp_ignored 0
    rp_cnp_handled 0
    np_ecn_marked_roce_packets 0
    np_cnp_sent 0

Signed-off-by: Mark Zhang <markz@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 rdma/Makefile |   2 +-
 rdma/rdma.c   |   3 +-
 rdma/rdma.h   |   1 +
 rdma/stat.c   | 268 ++++++++++++++++++++++++++++++++++++++++++++++++++
 rdma/utils.c  |   7 ++
 5 files changed, 279 insertions(+), 2 deletions(-)
 create mode 100644 rdma/stat.c

diff --git a/rdma/Makefile b/rdma/Makefile
index 4847f27e..e3f550bf 100644
--- a/rdma/Makefile
+++ b/rdma/Makefile
@@ -7,7 +7,7 @@ ifeq ($(HAVE_MNL),y)
 CFLAGS += -I./include/uapi/
 
 RDMA_OBJ = rdma.o utils.o dev.o link.o res.o res-pd.o res-mr.o res-cq.o \
-	   res-cmid.o res-qp.o sys.o
+	   res-cmid.o res-qp.o sys.o stat.o
 
 TARGETS += rdma
 endif
diff --git a/rdma/rdma.c b/rdma/rdma.c
index e9f1b4bb..4e34da92 100644
--- a/rdma/rdma.c
+++ b/rdma/rdma.c
@@ -11,7 +11,7 @@ static void help(char *name)
 {
 	pr_out("Usage: %s [ OPTIONS ] OBJECT { COMMAND | help }\n"
 	       "       %s [ -f[orce] ] -b[atch] filename\n"
-	       "where  OBJECT := { dev | link | resource | system | help }\n"
+	       "where  OBJECT := { dev | link | resource | system | statistic | help }\n"
 	       "       OPTIONS := { -V[ersion] | -d[etails] | -j[son] | -p[retty]}\n", name, name);
 }
 
@@ -30,6 +30,7 @@ static int rd_cmd(struct rd *rd, int argc, char **argv)
 		{ "link",	cmd_link },
 		{ "resource",	cmd_res },
 		{ "system",	cmd_sys },
+		{ "statistic",	cmd_stat },
 		{ 0 }
 	};
 
diff --git a/rdma/rdma.h b/rdma/rdma.h
index 885a751e..23157743 100644
--- a/rdma/rdma.h
+++ b/rdma/rdma.h
@@ -94,6 +94,7 @@ int cmd_dev(struct rd *rd);
 int cmd_link(struct rd *rd);
 int cmd_res(struct rd *rd);
 int cmd_sys(struct rd *rd);
+int cmd_stat(struct rd *rd);
 int rd_exec_cmd(struct rd *rd, const struct rd_cmd *c, const char *str);
 int rd_exec_dev(struct rd *rd, int (*cb)(struct rd *rd));
 int rd_exec_require_dev(struct rd *rd, int (*cb)(struct rd *rd));
diff --git a/rdma/stat.c b/rdma/stat.c
new file mode 100644
index 00000000..da35ef7d
--- /dev/null
+++ b/rdma/stat.c
@@ -0,0 +1,268 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * rdma.c	RDMA tool
+ * Authors:     Mark Zhang <markz@mellanox.com>
+ */
+
+#include "rdma.h"
+#include "res.h"
+#include <inttypes.h>
+
+static int stat_help(struct rd *rd)
+{
+	pr_out("Usage: %s [ OPTIONS ] statistic { COMMAND | help }\n", rd->filename);
+	pr_out("       %s statistic OBJECT show\n", rd->filename);
+	pr_out("       %s statistic OBJECT show link [ DEV/PORT_INDEX ] [ FILTER-NAME FILTER-VALUE ]\n", rd->filename);
+	pr_out("Examples:\n");
+	pr_out("       %s statistic qp show\n", rd->filename);
+	pr_out("       %s statistic qp show link mlx5_2/1\n", rd->filename);
+
+	return 0;
+}
+
+static int res_get_hwcounters(struct rd *rd, struct nlattr *hwc_table, bool print)
+{
+	struct nlattr *nla_entry;
+	const char *nm;
+	uint64_t v;
+	int err;
+
+	mnl_attr_for_each_nested(nla_entry, hwc_table) {
+		struct nlattr *hw_line[RDMA_NLDEV_ATTR_MAX] = {};
+
+		err = mnl_attr_parse_nested(nla_entry, rd_attr_cb, hw_line);
+		if (err != MNL_CB_OK)
+			return -EINVAL;
+
+		if (!hw_line[RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME] ||
+		    !hw_line[RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE]) {
+			return -EINVAL;
+		}
+
+		if (!print)
+			continue;
+
+		nm = mnl_attr_get_str(hw_line[RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME]);
+		v = mnl_attr_get_u64(hw_line[RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE]);
+		if (rd->pretty_output && !rd->json_output)
+			newline_indent(rd);
+		res_print_uint(rd, nm, v, hw_line[RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME]);
+	}
+
+	return MNL_CB_OK;
+}
+
+static int res_counter_line(struct rd *rd, const char *name, int index,
+		       struct nlattr **nla_line)
+{
+	uint32_t cntn, port = 0, pid = 0, qpn;
+	struct nlattr *hwc_table, *qp_table;
+	struct nlattr *nla_entry;
+	const char *comm = NULL;
+	bool isfirst;
+	int err;
+
+	if (nla_line[RDMA_NLDEV_ATTR_PORT_INDEX])
+		port = mnl_attr_get_u32(nla_line[RDMA_NLDEV_ATTR_PORT_INDEX]);
+
+	hwc_table = nla_line[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS];
+	qp_table = nla_line[RDMA_NLDEV_ATTR_RES_QP];
+	if (!hwc_table || !qp_table ||
+	    !nla_line[RDMA_NLDEV_ATTR_STAT_COUNTER_ID])
+		return MNL_CB_ERROR;
+
+	cntn = mnl_attr_get_u32(nla_line[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]);
+	if (rd_is_filtered_attr(rd, "cntn", cntn,
+				nla_line[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]))
+		return MNL_CB_OK;
+
+	if (nla_line[RDMA_NLDEV_ATTR_RES_PID]) {
+		pid = mnl_attr_get_u32(nla_line[RDMA_NLDEV_ATTR_RES_PID]);
+		comm = get_task_name(pid);
+	}
+	if (rd_is_filtered_attr(rd, "pid", pid,
+				nla_line[RDMA_NLDEV_ATTR_RES_PID]))
+		return MNL_CB_OK;
+
+	if (nla_line[RDMA_NLDEV_ATTR_RES_KERN_NAME])
+		comm = (char *)mnl_attr_get_str(
+			nla_line[RDMA_NLDEV_ATTR_RES_KERN_NAME]);
+
+	mnl_attr_for_each_nested(nla_entry, qp_table) {
+		struct nlattr *qp_line[RDMA_NLDEV_ATTR_MAX] = {};
+
+		err = mnl_attr_parse_nested(nla_entry, rd_attr_cb, qp_line);
+		if (err != MNL_CB_OK)
+			return -EINVAL;
+
+		if (!qp_line[RDMA_NLDEV_ATTR_RES_LQPN])
+			return -EINVAL;
+
+		qpn = mnl_attr_get_u32(qp_line[RDMA_NLDEV_ATTR_RES_LQPN]);
+		if (rd_is_filtered_attr(rd, "lqpn", qpn,
+					qp_line[RDMA_NLDEV_ATTR_RES_LQPN]))
+			return MNL_CB_OK;
+	}
+
+	err = res_get_hwcounters(rd, hwc_table, false);
+	if (err != MNL_CB_OK)
+		return err;
+
+	if (rd->json_output) {
+		jsonw_string_field(rd->jw, "ifname", name);
+		if (port)
+			jsonw_uint_field(rd->jw, "port", port);
+		jsonw_uint_field(rd->jw, "cntn", cntn);
+	} else {
+		if (port)
+			pr_out("link %s/%u cntn %u ", name, port, cntn);
+		else
+			pr_out("dev %s cntn %u ", name, cntn);
+	}
+
+	res_print_uint(rd, "pid", pid, nla_line[RDMA_NLDEV_ATTR_RES_PID]);
+	print_comm(rd, comm, nla_line);
+
+	res_get_hwcounters(rd, hwc_table, true);
+
+	isfirst = true;
+	mnl_attr_for_each_nested(nla_entry, qp_table) {
+		struct nlattr *qp_line[RDMA_NLDEV_ATTR_MAX] = {};
+
+		if (isfirst && !rd->json_output)
+			pr_out("\n    LQPN: <");
+
+		err = mnl_attr_parse_nested(nla_entry, rd_attr_cb, qp_line);
+		if (err != MNL_CB_OK)
+			return -EINVAL;
+
+		if (!qp_line[RDMA_NLDEV_ATTR_RES_LQPN])
+			return -EINVAL;
+
+		qpn = mnl_attr_get_u32(qp_line[RDMA_NLDEV_ATTR_RES_LQPN]);
+		if (rd->json_output) {
+			jsonw_uint_field(rd->jw, "lqpn", qpn);
+		} else {
+			if (isfirst)
+				pr_out("%d", qpn);
+			else
+				pr_out(", %d", qpn);
+		}
+		isfirst = false;
+	}
+
+	if (!rd->json_output)
+		pr_out(">\n");
+	return MNL_CB_OK;
+}
+
+static int stat_qp_show_parse_cb(const struct nlmsghdr *nlh, void *data)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX] = {};
+	struct nlattr *nla_table, *nla_entry;
+	struct rd *rd = data;
+	const char *name;
+	uint32_t idx;
+	int ret;
+
+	mnl_attr_parse(nlh, 0, rd_attr_cb, tb);
+	if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
+	    !tb[RDMA_NLDEV_ATTR_STAT_COUNTER])
+		return MNL_CB_ERROR;
+
+	name = mnl_attr_get_str(tb[RDMA_NLDEV_ATTR_DEV_NAME]);
+	idx = mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	nla_table = tb[RDMA_NLDEV_ATTR_STAT_COUNTER];
+
+	mnl_attr_for_each_nested(nla_entry, nla_table) {
+		struct nlattr *nla_line[RDMA_NLDEV_ATTR_MAX] = {};
+
+		ret = mnl_attr_parse_nested(nla_entry, rd_attr_cb, nla_line);
+		if (ret != MNL_CB_OK)
+			break;
+
+		ret = res_counter_line(rd, name, idx, nla_line);
+		if (ret != MNL_CB_OK)
+			break;
+	}
+
+	return ret;
+}
+
+static const struct filters stat_valid_filters[MAX_NUMBER_OF_FILTERS] = {
+	{ .name = "cntn", .is_number = true },
+	{ .name = "lqpn", .is_number = true },
+	{ .name = "pid", .is_number = true },
+};
+
+static int stat_qp_show_one_link(struct rd *rd)
+{
+	int flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP;
+	uint32_t seq;
+	int ret;
+
+	if (!rd->port_idx)
+		return 0;
+
+	ret = rd_build_filter(rd, stat_valid_filters);
+	if (ret)
+		return ret;
+
+	rd_prepare_msg(rd, RDMA_NLDEV_CMD_STAT_GET, &seq, flags);
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_DEV_INDEX, rd->dev_idx);
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_PORT_INDEX, rd->port_idx);
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_STAT_RES, RDMA_NLDEV_ATTR_RES_QP);
+	ret = rd_send_msg(rd);
+	if (ret)
+		return ret;
+
+	if (rd->json_output)
+		jsonw_start_object(rd->jw);
+	ret = rd_recv_msg(rd, stat_qp_show_parse_cb, rd, seq);
+	if (rd->json_output)
+		jsonw_end_object(rd->jw);
+
+	return ret;
+}
+
+static int stat_qp_show_link(struct rd *rd)
+{
+	return rd_exec_link(rd, stat_qp_show_one_link, false);
+}
+
+static int stat_qp_show(struct rd *rd)
+{
+	const struct rd_cmd cmds[] = {
+		{ NULL,		stat_qp_show_link },
+		{ "link",	stat_qp_show_link },
+		{ "help",	stat_help },
+		{ 0 }
+	};
+
+	return rd_exec_cmd(rd, cmds, "parameter");
+}
+
+static int stat_qp(struct rd *rd)
+{
+	const struct rd_cmd cmds[] =  {
+		{ NULL,		stat_qp_show },
+		{ "show",	stat_qp_show },
+		{ "list",	stat_qp_show },
+		{ "help",	stat_help },
+		{ 0 }
+	};
+
+	return rd_exec_cmd(rd, cmds, "parameter");
+}
+
+int cmd_stat(struct rd *rd)
+{
+	const struct rd_cmd cmds[] =  {
+		{ NULL,		stat_help },
+		{ "help",	stat_help },
+		{ "qp",		stat_qp },
+		{ 0 }
+	};
+
+	return rd_exec_cmd(rd, cmds, "statistic command");
+}
diff --git a/rdma/utils.c b/rdma/utils.c
index 558d1c29..7bc0439a 100644
--- a/rdma/utils.c
+++ b/rdma/utils.c
@@ -436,6 +436,13 @@ static const enum mnl_attr_data_type nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 	[RDMA_NLDEV_ATTR_DRIVER_S64] = MNL_TYPE_U64,
 	[RDMA_NLDEV_ATTR_DRIVER_U64] = MNL_TYPE_U64,
 	[RDMA_NLDEV_SYS_ATTR_NETNS_MODE] = MNL_TYPE_U8,
+	[RDMA_NLDEV_ATTR_STAT_COUNTER] = MNL_TYPE_NESTED,
+	[RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY] = MNL_TYPE_NESTED,
+	[RDMA_NLDEV_ATTR_STAT_COUNTER_ID] = MNL_TYPE_U32,
+	[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS] = MNL_TYPE_NESTED,
+	[RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY] = MNL_TYPE_NESTED,
+	[RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME] = MNL_TYPE_NUL_STRING,
+	[RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE] = MNL_TYPE_U64,
 };
 
 int rd_attr_check(const struct nlattr *attr, int *typep)
-- 
2.20.1


^ permalink raw reply related

* [PATCH iproute2-rc v1 3/7] rdma: Add rdma statistic counter per-port auto mode support
From: Leon Romanovsky @ 2019-07-17 14:31 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Leon Romanovsky, netdev, David Ahern, Mark Zhang,
	RDMA mailing list
In-Reply-To: <20190717143157.27205-1-leon@kernel.org>

From: Mark Zhang <markz@mellanox.com>

With per-QP statistic counter support, a user is allowed to monitor
specific QPs categories, which are bound to/unbound from counters
dynamically allocated/deallocated.

In per-port "auto" mode, QPs are bound to counters automatically
according to common criteria. For example a per "type"(qp type)
scheme, where in each process all QPs have same qp type are bind
automatically to a single counter.
Currently only "type" (qp type) is supported. Examples:

$ rdma statistic qp set link mlx5_2/1 auto type on
$ rdma statistic qp set link mlx5_2/1 auto off

Signed-off-by: Mark Zhang <markz@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 rdma/stat.c  | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 rdma/utils.c |  1 +
 2 files changed, 88 insertions(+)

diff --git a/rdma/stat.c b/rdma/stat.c
index 0c239851..ad1cc063 100644
--- a/rdma/stat.c
+++ b/rdma/stat.c
@@ -14,12 +14,17 @@ static int stat_help(struct rd *rd)
 	pr_out("       %s statistic OBJECT show\n", rd->filename);
 	pr_out("       %s statistic OBJECT show link [ DEV/PORT_INDEX ] [ FILTER-NAME FILTER-VALUE ]\n", rd->filename);
 	pr_out("       %s statistic OBJECT mode\n", rd->filename);
+	pr_out("       %s statistic OBJECT set COUNTER_SCOPE [DEV/PORT_INDEX] auto {CRITERIA | off}\n", rd->filename);
 	pr_out("where  OBJECT: = { qp }\n");
+	pr_out("       CRITERIA : = { type }\n");
+	pr_out("       COUNTER_SCOPE: = { link | dev }\n");
 	pr_out("Examples:\n");
 	pr_out("       %s statistic qp show\n", rd->filename);
 	pr_out("       %s statistic qp show link mlx5_2/1\n", rd->filename);
 	pr_out("       %s statistic qp mode\n", rd->filename);
 	pr_out("       %s statistic qp mode link mlx5_0\n", rd->filename);
+	pr_out("       %s statistic qp set link mlx5_2/1 auto type on\n", rd->filename);
+	pr_out("       %s statistic qp set link mlx5_2/1 auto off\n", rd->filename);
 
 	return 0;
 }
@@ -381,6 +386,87 @@ static int stat_qp_show(struct rd *rd)
 	return rd_exec_cmd(rd, cmds, "parameter");
 }
 
+static int stat_qp_set_link_auto_sendmsg(struct rd *rd, uint32_t mask)
+{
+	uint32_t seq;
+
+	rd_prepare_msg(rd, RDMA_NLDEV_CMD_STAT_SET,
+		       &seq, (NLM_F_REQUEST | NLM_F_ACK));
+
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_DEV_INDEX, rd->dev_idx);
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_PORT_INDEX, rd->port_idx);
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_STAT_RES, RDMA_NLDEV_ATTR_RES_QP);
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_STAT_MODE,
+			 RDMA_COUNTER_MODE_AUTO);
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, mask);
+
+	return rd_sendrecv_msg(rd, seq);
+}
+
+static int stat_one_qp_set_link_auto_off(struct rd *rd)
+{
+	return stat_qp_set_link_auto_sendmsg(rd, 0);
+}
+
+static int stat_one_qp_set_auto_type_on(struct rd *rd)
+{
+	return stat_qp_set_link_auto_sendmsg(rd, RDMA_COUNTER_MASK_QP_TYPE);
+}
+
+static int stat_one_qp_set_link_auto_type(struct rd *rd)
+{
+	const struct rd_cmd cmds[] = {
+		{ NULL,		stat_help },
+		{ "on",		stat_one_qp_set_auto_type_on },
+		{ 0 }
+	};
+
+	return rd_exec_cmd(rd, cmds, "parameter");
+}
+
+static int stat_one_qp_set_link_auto(struct rd *rd)
+{
+	const struct rd_cmd cmds[] = {
+		{ NULL,		stat_one_qp_link_get_mode },
+		{ "off",	stat_one_qp_set_link_auto_off },
+		{ "type",	stat_one_qp_set_link_auto_type },
+		{ 0 }
+	};
+
+	return rd_exec_cmd(rd, cmds, "parameter");
+}
+
+static int stat_one_qp_set_link(struct rd *rd)
+{
+	const struct rd_cmd cmds[] = {
+		{ NULL,		stat_one_qp_link_get_mode },
+		{ "auto",	stat_one_qp_set_link_auto },
+		{ 0 }
+	};
+
+	if (!rd->port_idx)
+		return 0;
+
+	return rd_exec_cmd(rd, cmds, "parameter");
+}
+
+static int stat_qp_set_link(struct rd *rd)
+{
+	return rd_exec_link(rd, stat_one_qp_set_link, false);
+}
+
+static int stat_qp_set(struct rd *rd)
+{
+	const struct rd_cmd cmds[] = {
+		{ NULL,		stat_help },
+		{ "link",	stat_qp_set_link },
+		{ "help",	stat_help },
+		{ 0 }
+	};
+
+	return rd_exec_cmd(rd, cmds, "parameter");
+}
+
 static int stat_qp(struct rd *rd)
 {
 	const struct rd_cmd cmds[] =  {
@@ -388,6 +474,7 @@ static int stat_qp(struct rd *rd)
 		{ "show",	stat_qp_show },
 		{ "list",	stat_qp_show },
 		{ "mode",	stat_qp_get_mode },
+		{ "set",	stat_qp_set },
 		{ "help",	stat_help },
 		{ 0 }
 	};
diff --git a/rdma/utils.c b/rdma/utils.c
index 9c885ad7..aed1a3d0 100644
--- a/rdma/utils.c
+++ b/rdma/utils.c
@@ -445,6 +445,7 @@ static const enum mnl_attr_data_type nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 	[RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE] = MNL_TYPE_U64,
 	[RDMA_NLDEV_ATTR_STAT_MODE] = MNL_TYPE_U32,
 	[RDMA_NLDEV_ATTR_STAT_RES] = MNL_TYPE_U32,
+	[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK] = MNL_TYPE_U32,
 };
 
 int rd_attr_check(const struct nlattr *attr, int *typep)
-- 
2.20.1


^ permalink raw reply related

* [PATCH iproute2-rc v1 4/7] rdma: Make get_port_from_argv() returns valid port in strict port mode
From: Leon Romanovsky @ 2019-07-17 14:31 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Leon Romanovsky, netdev, David Ahern, Mark Zhang,
	RDMA mailing list
In-Reply-To: <20190717143157.27205-1-leon@kernel.org>

From: Mark Zhang <markz@mellanox.com>

When strict_port is set, make get_port_from_argv() returns failure if
no valid port is specified.

Signed-off-by: Mark Zhang <markz@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 rdma/utils.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/rdma/utils.c b/rdma/utils.c
index aed1a3d0..95b669f3 100644
--- a/rdma/utils.c
+++ b/rdma/utils.c
@@ -56,7 +56,7 @@ bool rd_no_arg(struct rd *rd)
  * mlx5_1/1    | 1          | false
  * mlx5_1/-    | 0          | false
  *
- * In strict mode, /- will return error.
+ * In strict port mode, a non-0 port must be provided
  */
 static int get_port_from_argv(struct rd *rd, uint32_t *port,
 			      bool *is_dump_all, bool strict_port)
@@ -64,7 +64,7 @@ static int get_port_from_argv(struct rd *rd, uint32_t *port,
 	char *slash;
 
 	*port = 0;
-	*is_dump_all = true;
+	*is_dump_all = strict_port ? false : true;
 
 	slash = strchr(rd_argv(rd), '/');
 	/* if no port found, return 0 */
@@ -83,6 +83,9 @@ static int get_port_from_argv(struct rd *rd, uint32_t *port,
 		if (!*port && strlen(slash))
 			return -EINVAL;
 	}
+	if (strict_port && (*port == 0))
+		return -EINVAL;
+
 	return 0;
 }
 
-- 
2.20.1


^ permalink raw reply related

* [PATCH iproute2-rc v1 6/7] rdma: Add default counter show support
From: Leon Romanovsky @ 2019-07-17 14:31 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Leon Romanovsky, netdev, David Ahern, Mark Zhang,
	RDMA mailing list
In-Reply-To: <20190717143157.27205-1-leon@kernel.org>

From: Mark Zhang <markz@mellanox.com>

Show default counter statistics, which are same through the sysfs
interface: /sys/class/infiniband/<dev>/ports/<port>/hw_counters/

Example:
$ rdma stat show link mlx5_2/1
link mlx5_2/1 rx_write_requests 8 rx_read_requests 4 rx_atomic_requests 0
out_of_buffer 0 out_of_sequence 0 duplicate_request 0 rnr_nak_retry_err 0
packet_seq_err 0 implied_nak_seq_err 0 local_ack_timeout_err 0
resp_local_length_error 0 resp_cqe_error 0 req_cqe_error 0
req_remote_invalid_request 0 req_remote_access_errors 0
resp_remote_access_errors 0 resp_cqe_flush_error 0 req_cqe_flush_error 0
rp_cnp_ignored 0 rp_cnp_handled 0 np_ecn_marked_roce_packets 0
np_cnp_sent 0 rx_icrc_encapsulated 0

Signed-off-by: Mark Zhang <markz@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 rdma/stat.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 73 insertions(+), 1 deletion(-)

diff --git a/rdma/stat.c b/rdma/stat.c
index 942c1ac3..ef0bbcf1 100644
--- a/rdma/stat.c
+++ b/rdma/stat.c
@@ -17,6 +17,8 @@ static int stat_help(struct rd *rd)
 	pr_out("       %s statistic OBJECT set COUNTER_SCOPE [DEV/PORT_INDEX] auto {CRITERIA | off}\n", rd->filename);
 	pr_out("       %s statistic OBJECT bind COUNTER_SCOPE [DEV/PORT_INDEX] [OBJECT-ID] [COUNTER-ID]\n", rd->filename);
 	pr_out("       %s statistic OBJECT unbind COUNTER_SCOPE [DEV/PORT_INDEX] [COUNTER-ID]\n", rd->filename);
+	pr_out("       %s statistic show\n", rd->filename);
+	pr_out("       %s statistic show link [ DEV/PORT_INDEX ]\n", rd->filename);
 	pr_out("where  OBJECT: = { qp }\n");
 	pr_out("       CRITERIA : = { type }\n");
 	pr_out("       COUNTER_SCOPE: = { link | dev }\n");
@@ -31,6 +33,8 @@ static int stat_help(struct rd *rd)
 	pr_out("       %s statistic qp bind link mlx5_2/1 lqpn 178 cntn 4\n", rd->filename);
 	pr_out("       %s statistic qp unbind link mlx5_2/1 cntn 4\n", rd->filename);
 	pr_out("       %s statistic qp unbind link mlx5_2/1 cntn 4 lqpn 178\n", rd->filename);
+	pr_out("       %s statistic show\n", rd->filename);
+	pr_out("       %s statistic show link mlx5_2/1\n", rd->filename);
 
 	return 0;
 }
@@ -674,10 +678,78 @@ static int stat_qp(struct rd *rd)
 	return rd_exec_cmd(rd, cmds, "parameter");
 }
 
+static int stat_show_parse_cb(const struct nlmsghdr *nlh, void *data)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX] = {};
+	struct rd *rd = data;
+	const char *name;
+	uint32_t port;
+	int ret;
+
+	mnl_attr_parse(nlh, 0, rd_attr_cb, tb);
+	if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
+	    !tb[RDMA_NLDEV_ATTR_PORT_INDEX] ||
+	    !tb[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS])
+		return MNL_CB_ERROR;
+
+	name = mnl_attr_get_str(tb[RDMA_NLDEV_ATTR_DEV_NAME]);
+	port = mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+	if (rd->json_output) {
+		jsonw_string_field(rd->jw, "ifname", name);
+		jsonw_uint_field(rd->jw, "port", port);
+	} else {
+		pr_out("link %s/%u ", name, port);
+	}
+
+	ret = res_get_hwcounters(rd, tb[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS], true);
+
+	if (!rd->json_output)
+		pr_out("\n");
+	return ret;
+}
+
+static int stat_show_one_link(struct rd *rd)
+{
+	int flags = NLM_F_REQUEST | NLM_F_ACK;
+	uint32_t seq;
+	int ret;
+
+	if (!rd->port_idx)
+		return 0;
+
+	rd_prepare_msg(rd, RDMA_NLDEV_CMD_STAT_GET, &seq,  flags);
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_DEV_INDEX, rd->dev_idx);
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_PORT_INDEX, rd->port_idx);
+	ret = rd_send_msg(rd);
+	if (ret)
+		return ret;
+
+	return rd_recv_msg(rd, stat_show_parse_cb, rd, seq);
+}
+
+static int stat_show_link(struct rd *rd)
+{
+	return rd_exec_link(rd, stat_show_one_link, false);
+}
+
+static int stat_show(struct rd *rd)
+{
+	const struct rd_cmd cmds[] = {
+		{ NULL,		stat_show_link },
+		{ "link",	stat_show_link },
+		{ "help",	stat_help },
+		{ 0 }
+	};
+
+	return rd_exec_cmd(rd, cmds, "parameter");
+}
+
 int cmd_stat(struct rd *rd)
 {
 	const struct rd_cmd cmds[] =  {
-		{ NULL,		stat_help },
+		{ NULL,		stat_show },
+		{ "show",	stat_show },
+		{ "list",	stat_show },
 		{ "help",	stat_help },
 		{ "qp",		stat_qp },
 		{ 0 }
-- 
2.20.1


^ permalink raw reply related

* [PATCH iproute2-rc v1 7/7] rdma: Document counter statistic
From: Leon Romanovsky @ 2019-07-17 14:31 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Leon Romanovsky, netdev, David Ahern, Mark Zhang,
	RDMA mailing list
In-Reply-To: <20190717143157.27205-1-leon@kernel.org>

From: Mark Zhang <markz@mellanox.com>

Add document of accessing the QP counter, including bind/unbind a QP
to a counter manually or automatically, and dump counter statistics.

Signed-off-by: Mark Zhang <markz@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 man/man8/rdma-dev.8       |   1 +
 man/man8/rdma-link.8      |   1 +
 man/man8/rdma-resource.8  |   1 +
 man/man8/rdma-statistic.8 | 167 ++++++++++++++++++++++++++++++++++++++
 man/man8/rdma.8           |   7 +-
 5 files changed, 176 insertions(+), 1 deletion(-)
 create mode 100644 man/man8/rdma-statistic.8

diff --git a/man/man8/rdma-dev.8 b/man/man8/rdma-dev.8
index 38e34b3b..e77e7cd0 100644
--- a/man/man8/rdma-dev.8
+++ b/man/man8/rdma-dev.8
@@ -77,6 +77,7 @@ previously created using iproute2 ip command.
 .BR rdma-link (8),
 .BR rdma-resource (8),
 .BR rdma-system (8),
+.BR rdma-statistic (8),
 .br
 
 .SH AUTHOR
diff --git a/man/man8/rdma-link.8 b/man/man8/rdma-link.8
index b3b40de7..32f80228 100644
--- a/man/man8/rdma-link.8
+++ b/man/man8/rdma-link.8
@@ -97,6 +97,7 @@ Removes RXE link rxe_eth0
 .BR rdma (8),
 .BR rdma-dev (8),
 .BR rdma-resource (8),
+.BR rdma-statistic (8),
 .br
 
 .SH AUTHOR
diff --git a/man/man8/rdma-resource.8 b/man/man8/rdma-resource.8
index 40b073db..05030d0a 100644
--- a/man/man8/rdma-resource.8
+++ b/man/man8/rdma-resource.8
@@ -103,6 +103,7 @@ Show CQs belonging to pid 30489
 .BR rdma (8),
 .BR rdma-dev (8),
 .BR rdma-link (8),
+.BR rdma-statistic (8),
 .br
 
 .SH AUTHOR
diff --git a/man/man8/rdma-statistic.8 b/man/man8/rdma-statistic.8
new file mode 100644
index 00000000..dea6ff24
--- /dev/null
+++ b/man/man8/rdma-statistic.8
@@ -0,0 +1,167 @@
+.TH RDMA\-STATISTIC 8 "17 Mar 2019" "iproute2" "Linux"
+.SH NAME
+rdma-statistic \- RDMA statistic counter configuration
+.SH SYNOPSIS
+.sp
+.ad l
+.in +8
+.ti -8
+.B rdma
+.RI "[ " OPTIONS " ]"
+.B statistic
+.RI  " { " COMMAND " | "
+.BR help " }"
+.sp
+
+.ti -8
+.B rdma statistic
+.RI "[ " OBJECT " ]"
+.B show
+
+.ti -8
+.B rdma statistic
+.RI "[ " OBJECT " ]"
+.B show link
+.RI "[ " DEV/PORT_INDX " ]"
+
+.ti -8
+.B rdma statistic
+.IR OBJECT
+.B mode
+
+.ti -8
+.B rdma statistic
+.IR OBJECT
+.B set
+.IR COUNTER_SCOPE
+.RI "[ " DEV/PORT_INDEX "]"
+.B auto
+.RI "{ " CRITERIA " | "
+.BR off " }"
+
+.ti -8
+.B rdma statistic
+.IR OBJECT
+.B bind
+.IR COUNTER_SCOPE
+.RI "[ " DEV/PORT_INDEX "]"
+.RI "[ " OBJECT-ID " ]"
+.RI "[ " COUNTER-ID " ]"
+
+.ti -8
+.B rdma statistic
+.IR OBJECT
+.B unbind
+.IR COUNTER_SCOPE
+.RI "[ " DEV/PORT_INDEX "]"
+.RI "[ " COUNTER-ID " ]"
+.RI "[ " OBJECT-ID " ]"
+
+.ti -8
+.IR COUNTER_SCOPE " := "
+.RB "{ " link " | " dev " }"
+
+.ti -8
+.IR OBJECT " := "
+.RB "{ " qp " }"
+
+.ti -8
+.IR CRITERIA " := "
+.RB "{ " type " }"
+
+.SH "DESCRIPTION"
+.SS rdma statistic [object] show - Queries the specified RDMA device for RDMA and driver-specific statistics. Show the default hw counters if object is not specified
+
+.PP
+.I "DEV"
+- specifies counters on this RDMA device to show.
+
+.I "PORT_INDEX"
+- specifies counters on this RDMA port to show.
+
+.SS rdma statistic <object> set - configure counter statistic auto-mode for a specific device/port
+In auto mode all objects belong to one category are bind automatically to a single counter set.
+
+.SS rdma statistic <object> bind - manually bind an object (e.g., a qp) with a counter
+When bound the statistics of this object are available in this counter.
+
+.SS rdma statistic <object> unbind - manually unbind an object (e.g., a qp) from the counter previously bound
+When unbound the statistics of this object are no longer available in this counter; And if object id is not specified then all objects on this counter will be unbound.
+
+.I "COUNTER-ID"
+- specifies the id of the counter to be bound.
+If this argument is omitted then a new counter will be allocated.
+
+.SH "EXAMPLES"
+.PP
+rdma statistic show
+.RS 4
+Shows the state of the default counter of all RDMA devices on the system.
+.RE
+.PP
+rdma statistic show link mlx5_2/1
+.RS 4
+Shows the state of the default counter of specified RDMA port
+.RE
+.PP
+rdma statistic qp show
+.RS 4
+Shows the state of all qp counters of all RDMA devices on the system.
+.RE
+.PP
+rdma statistic qp show link mlx5_2/1
+.RS 4
+Shows the state of all qp counters of specified RDMA port.
+.RE
+.PP
+rdma statistic qp show link mlx5_2 pid 30489
+.RS 4
+Shows the state of all qp counters of specified RDMA port and belonging to pid 30489
+.RE
+.PP
+rdma statistic qp mode
+.RS 4
+List current counter mode on all devices
+.RE
+.PP
+rdma statistic qp mode link mlx5_2/1
+.RS 4
+List current counter mode of device mlx5_2 port 1
+.RE
+.PP
+rdma statistic qp set link mlx5_2/1 auto type on
+.RS 4
+On device mlx5_2 port 1, for each new QP bind it with a counter automatically. Per counter for QPs with same qp type in each process. Currently only "type" is supported.
+.RE
+.PP
+rdma statistic qp set link mlx5_2/1 auto off
+.RS 4
+Turn-off auto mode on device mlx5_2 port 1. The allocated counters can be manually accessed.
+.RE
+.PP
+rdma statistic qp bind link mlx5_2/1 lqpn 178
+.RS 4
+On device mlx5_2 port 1, allocate a counter and bind the specified qp on it
+.RE
+.PP
+rdma statistic qp unbind link mlx5_2/1 cntn 4 lqpn 178
+.RS 4
+On device mlx5_2 port 1, bind the specified qp on the specified counter
+.RE
+.PP
+rdma statistic qp unbind link mlx5_2/1 cntn 4
+.RS 4
+On device mlx5_2 port 1, unbind all QPs on the specified counter. After that this counter will be released automatically by the kernel.
+
+.RE
+.PP
+
+.SH SEE ALSO
+.BR rdma (8),
+.BR rdma-dev (8),
+.BR rdma-link (8),
+.BR rdma-resource (8),
+.br
+
+.SH AUTHOR
+Mark Zhang <markz@mellanox.com>
diff --git a/man/man8/rdma.8 b/man/man8/rdma.8
index 3ae33987..ef29b1c6 100644
--- a/man/man8/rdma.8
+++ b/man/man8/rdma.8
@@ -19,7 +19,7 @@ rdma \- RDMA tool
 
 .ti -8
 .IR OBJECT " := { "
-.BR dev " | " link " | " system " }"
+.BR dev " | " link " | " system " | " statistic " }"
 .sp
 
 .ti -8
@@ -74,6 +74,10 @@ Generate JSON output.
 .B sys
 - RDMA subsystem related.
 
+.TP
+.B statistic
+- RDMA counter statistic related.
+
 .PP
 The names of all objects may be written in full or
 abbreviated form, for example
@@ -112,6 +116,7 @@ Exit status is 0 if command was successful or a positive integer upon failure.
 .BR rdma-link (8),
 .BR rdma-resource (8),
 .BR rdma-system (8),
+.BR rdma-statistic (8),
 .br
 
 .SH REPORTING BUGS
-- 
2.20.1


^ permalink raw reply related

* [PATCH iproute2-rc v1 5/7] rdma: Add stat manual mode support
From: Leon Romanovsky @ 2019-07-17 14:31 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Leon Romanovsky, netdev, David Ahern, Mark Zhang,
	RDMA mailing list
In-Reply-To: <20190717143157.27205-1-leon@kernel.org>

From: Mark Zhang <markz@mellanox.com>

In manual mode a QP can be manually bound to a counter. If the counter
id(cntn) is not specified that kernel will allocate one. After a
successful bind, the cntn can be seen through "rdma statistic qp show".
And in unbind if lqpn is not specified then all QPs on this counter will
be unbound.
The manual and auto mode are mutual-exclusive.

Examples:
$ rdma statistic qp bind link mlx5_2/1 lqpn 178
$ rdma statistic qp bind link mlx5_2/1 lqpn 178 cntn 4
$ rdma statistic qp unbind link mlx5_2/1 cntn 4
$ rdma statistic qp unbind link mlx5_2/1 cntn 4 lqpn 178

Signed-off-by: Mark Zhang <markz@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 rdma/stat.c | 192 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 192 insertions(+)

diff --git a/rdma/stat.c b/rdma/stat.c
index ad1cc063..942c1ac3 100644
--- a/rdma/stat.c
+++ b/rdma/stat.c
@@ -15,6 +15,8 @@ static int stat_help(struct rd *rd)
 	pr_out("       %s statistic OBJECT show link [ DEV/PORT_INDEX ] [ FILTER-NAME FILTER-VALUE ]\n", rd->filename);
 	pr_out("       %s statistic OBJECT mode\n", rd->filename);
 	pr_out("       %s statistic OBJECT set COUNTER_SCOPE [DEV/PORT_INDEX] auto {CRITERIA | off}\n", rd->filename);
+	pr_out("       %s statistic OBJECT bind COUNTER_SCOPE [DEV/PORT_INDEX] [OBJECT-ID] [COUNTER-ID]\n", rd->filename);
+	pr_out("       %s statistic OBJECT unbind COUNTER_SCOPE [DEV/PORT_INDEX] [COUNTER-ID]\n", rd->filename);
 	pr_out("where  OBJECT: = { qp }\n");
 	pr_out("       CRITERIA : = { type }\n");
 	pr_out("       COUNTER_SCOPE: = { link | dev }\n");
@@ -25,6 +27,10 @@ static int stat_help(struct rd *rd)
 	pr_out("       %s statistic qp mode link mlx5_0\n", rd->filename);
 	pr_out("       %s statistic qp set link mlx5_2/1 auto type on\n", rd->filename);
 	pr_out("       %s statistic qp set link mlx5_2/1 auto off\n", rd->filename);
+	pr_out("       %s statistic qp bind link mlx5_2/1 lqpn 178\n", rd->filename);
+	pr_out("       %s statistic qp bind link mlx5_2/1 lqpn 178 cntn 4\n", rd->filename);
+	pr_out("       %s statistic qp unbind link mlx5_2/1 cntn 4\n", rd->filename);
+	pr_out("       %s statistic qp unbind link mlx5_2/1 cntn 4 lqpn 178\n", rd->filename);
 
 	return 0;
 }
@@ -467,6 +473,190 @@ static int stat_qp_set(struct rd *rd)
 	return rd_exec_cmd(rd, cmds, "parameter");
 }
 
+static int stat_get_arg(struct rd *rd, const char *arg)
+{
+	int value = 0;
+	char *endp;
+
+	if (strcmpx(rd_argv(rd), arg) != 0)
+		return -EINVAL;
+
+	rd_arg_inc(rd);
+	value = strtol(rd_argv(rd), &endp, 10);
+	rd_arg_inc(rd);
+
+	return value;
+}
+
+static int stat_one_qp_bind(struct rd *rd)
+{
+	int lqpn = 0, cntn = 0, ret;
+	uint32_t seq;
+
+	if (rd_no_arg(rd)) {
+		stat_help(rd);
+		return -EINVAL;
+	}
+
+	ret = rd_build_filter(rd, stat_valid_filters);
+	if (ret)
+		return ret;
+
+	lqpn = stat_get_arg(rd, "lqpn");
+
+	rd_prepare_msg(rd, RDMA_NLDEV_CMD_STAT_SET,
+		       &seq, (NLM_F_REQUEST | NLM_F_ACK));
+
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_STAT_MODE,
+			 RDMA_COUNTER_MODE_MANUAL);
+
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_STAT_RES, RDMA_NLDEV_ATTR_RES_QP);
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_DEV_INDEX, rd->dev_idx);
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_PORT_INDEX, rd->port_idx);
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_RES_LQPN, lqpn);
+
+	if (rd_argc(rd)) {
+		cntn = stat_get_arg(rd, "cntn");
+		mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_STAT_COUNTER_ID,
+				 cntn);
+	}
+
+	return rd_sendrecv_msg(rd, seq);
+}
+
+static int do_stat_qp_unbind_lqpn(struct rd *rd, uint32_t cntn, uint32_t lqpn)
+{
+	uint32_t seq;
+
+	rd_prepare_msg(rd, RDMA_NLDEV_CMD_STAT_DEL,
+		       &seq, (NLM_F_REQUEST | NLM_F_ACK));
+
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_STAT_MODE,
+			 RDMA_COUNTER_MODE_MANUAL);
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_STAT_RES, RDMA_NLDEV_ATTR_RES_QP);
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_DEV_INDEX, rd->dev_idx);
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_PORT_INDEX, rd->port_idx);
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn);
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_RES_LQPN, lqpn);
+
+	return rd_sendrecv_msg(rd, seq);
+}
+
+static int stat_get_counter_parse_cb(const struct nlmsghdr *nlh, void *data)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX] = {};
+	struct nlattr *nla_table, *nla_entry;
+	struct rd *rd = data;
+	uint32_t lqpn, cntn;
+	int err;
+
+	mnl_attr_parse(nlh, 0, rd_attr_cb, tb);
+
+	if (!tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID])
+		return MNL_CB_ERROR;
+	cntn = mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]);
+
+	nla_table = tb[RDMA_NLDEV_ATTR_RES_QP];
+	if (!nla_table)
+		return MNL_CB_ERROR;
+
+	mnl_attr_for_each_nested(nla_entry, nla_table) {
+		struct nlattr *nla_line[RDMA_NLDEV_ATTR_MAX] = {};
+
+		err = mnl_attr_parse_nested(nla_entry, rd_attr_cb, nla_line);
+		if (err != MNL_CB_OK)
+			return -EINVAL;
+
+		if (!nla_line[RDMA_NLDEV_ATTR_RES_LQPN])
+			return -EINVAL;
+
+		lqpn = mnl_attr_get_u32(nla_line[RDMA_NLDEV_ATTR_RES_LQPN]);
+		err = do_stat_qp_unbind_lqpn(rd, cntn, lqpn);
+		if (err)
+			return MNL_CB_ERROR;
+	}
+
+	return MNL_CB_OK;
+}
+
+static int stat_one_qp_unbind(struct rd *rd)
+{
+	int flags = NLM_F_REQUEST | NLM_F_ACK, ret;
+	char buf[MNL_SOCKET_BUFFER_SIZE];
+	int lqpn = 0, cntn = 0;
+	unsigned int portid;
+	uint32_t seq;
+
+	ret = rd_build_filter(rd, stat_valid_filters);
+	if (ret)
+		return ret;
+
+	cntn = stat_get_arg(rd, "cntn");
+	if (rd_argc(rd)) {
+		lqpn = stat_get_arg(rd, "lqpn");
+		return do_stat_qp_unbind_lqpn(rd, cntn, lqpn);
+	}
+
+	rd_prepare_msg(rd, RDMA_NLDEV_CMD_STAT_GET, &seq, flags);
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_DEV_INDEX, rd->dev_idx);
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_PORT_INDEX, rd->port_idx);
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_STAT_RES, RDMA_NLDEV_ATTR_RES_QP);
+	mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn);
+	ret = rd_send_msg(rd);
+	if (ret)
+		return ret;
+
+
+	/* Can't use rd_recv_msg() since the callback also calls it (recursively),
+	 * then rd_recv_msg() always return -1 here
+	 */
+	portid = mnl_socket_get_portid(rd->nl);
+	ret = mnl_socket_recvfrom(rd->nl, buf, sizeof(buf));
+	if (ret <= 0)
+		return ret;
+
+	ret = mnl_cb_run(buf, ret, seq, portid, stat_get_counter_parse_cb, rd);
+	mnl_socket_close(rd->nl);
+	if (ret != MNL_CB_OK)
+		return ret;
+
+	return 0;
+}
+
+static int stat_qp_bind_link(struct rd *rd)
+{
+	return rd_exec_link(rd, stat_one_qp_bind, true);
+}
+
+static int stat_qp_bind(struct rd *rd)
+{
+	const struct rd_cmd cmds[] = {
+		{ NULL,		stat_help },
+		{ "link",	stat_qp_bind_link },
+		{ "help",	stat_help },
+		{ 0 },
+	};
+
+	return rd_exec_cmd(rd, cmds, "parameter");
+}
+
+static int stat_qp_unbind_link(struct rd *rd)
+{
+	return rd_exec_link(rd, stat_one_qp_unbind, true);
+}
+
+static int stat_qp_unbind(struct rd *rd)
+{
+	const struct rd_cmd cmds[] = {
+		{ NULL,		stat_help },
+		{ "link",	stat_qp_unbind_link },
+		{ "help",	stat_help },
+		{ 0 },
+	};
+
+	return rd_exec_cmd(rd, cmds, "parameter");
+}
+
 static int stat_qp(struct rd *rd)
 {
 	const struct rd_cmd cmds[] =  {
@@ -475,6 +665,8 @@ static int stat_qp(struct rd *rd)
 		{ "list",	stat_qp_show },
 		{ "mode",	stat_qp_get_mode },
 		{ "set",	stat_qp_set },
+		{ "bind",	stat_qp_bind },
+		{ "unbind",	stat_qp_unbind },
 		{ "help",	stat_help },
 		{ 0 }
 	};
-- 
2.20.1


^ permalink raw reply related

* Re: [PATCH v4 3/5] vsock/virtio: fix locking in virtio_transport_inc_tx_pkt()
From: Michael S. Tsirkin @ 2019-07-17 14:51 UTC (permalink / raw)
  To: Stefano Garzarella
  Cc: netdev, linux-kernel, Stefan Hajnoczi, David S. Miller,
	virtualization, Jason Wang, kvm
In-Reply-To: <20190717113030.163499-4-sgarzare@redhat.com>

On Wed, Jul 17, 2019 at 01:30:28PM +0200, Stefano Garzarella wrote:
> fwd_cnt and last_fwd_cnt are protected by rx_lock, so we should use
> the same spinlock also if we are in the TX path.
> 
> Move also buf_alloc under the same lock.
> 
> Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>

Wait a second is this a bugfix?
If it's used under the wrong lock won't values get corrupted?
Won't traffic then stall or more data get to sent than
credits?

> ---
>  include/linux/virtio_vsock.h            | 2 +-
>  net/vmw_vsock/virtio_transport_common.c | 4 ++--
>  2 files changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
> index 49fc9d20bc43..4c7781f4b29b 100644
> --- a/include/linux/virtio_vsock.h
> +++ b/include/linux/virtio_vsock.h
> @@ -35,7 +35,6 @@ struct virtio_vsock_sock {
>  
>  	/* Protected by tx_lock */
>  	u32 tx_cnt;
> -	u32 buf_alloc;
>  	u32 peer_fwd_cnt;
>  	u32 peer_buf_alloc;
>  
> @@ -43,6 +42,7 @@ struct virtio_vsock_sock {
>  	u32 fwd_cnt;
>  	u32 last_fwd_cnt;
>  	u32 rx_bytes;
> +	u32 buf_alloc;
>  	struct list_head rx_queue;
>  };
>  
> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
> index a85559d4d974..34a2b42313b7 100644
> --- a/net/vmw_vsock/virtio_transport_common.c
> +++ b/net/vmw_vsock/virtio_transport_common.c
> @@ -210,11 +210,11 @@ static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs,
>  
>  void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt)
>  {
> -	spin_lock_bh(&vvs->tx_lock);
> +	spin_lock_bh(&vvs->rx_lock);
>  	vvs->last_fwd_cnt = vvs->fwd_cnt;
>  	pkt->hdr.fwd_cnt = cpu_to_le32(vvs->fwd_cnt);
>  	pkt->hdr.buf_alloc = cpu_to_le32(vvs->buf_alloc);
> -	spin_unlock_bh(&vvs->tx_lock);
> +	spin_unlock_bh(&vvs->rx_lock);
>  }
>  EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt);
>  
> -- 
> 2.20.1

^ permalink raw reply

* Re: [PATCH v4 4/5] vhost/vsock: split packets to send using multiple buffers
From: Michael S. Tsirkin @ 2019-07-17 14:54 UTC (permalink / raw)
  To: Stefano Garzarella
  Cc: netdev, linux-kernel, Stefan Hajnoczi, David S. Miller,
	virtualization, Jason Wang, kvm
In-Reply-To: <20190717113030.163499-5-sgarzare@redhat.com>

On Wed, Jul 17, 2019 at 01:30:29PM +0200, Stefano Garzarella wrote:
> If the packets to sent to the guest are bigger than the buffer
> available, we can split them, using multiple buffers and fixing
> the length in the packet header.
> This is safe since virtio-vsock supports only stream sockets.
> 
> Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>

So how does it work right now? If an app
does sendmsg with a 64K buffer and the other
side publishes 4K buffers - does it just stall?


> ---
>  drivers/vhost/vsock.c                   | 66 ++++++++++++++++++-------
>  net/vmw_vsock/virtio_transport_common.c | 15 ++++--
>  2 files changed, 60 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> index 6c8390a2af52..9f57736fe15e 100644
> --- a/drivers/vhost/vsock.c
> +++ b/drivers/vhost/vsock.c
> @@ -102,7 +102,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
>  		struct iov_iter iov_iter;
>  		unsigned out, in;
>  		size_t nbytes;
> -		size_t len;
> +		size_t iov_len, payload_len;
>  		int head;
>  
>  		spin_lock_bh(&vsock->send_pkt_list_lock);
> @@ -147,8 +147,24 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
>  			break;
>  		}
>  
> -		len = iov_length(&vq->iov[out], in);
> -		iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len);
> +		iov_len = iov_length(&vq->iov[out], in);
> +		if (iov_len < sizeof(pkt->hdr)) {
> +			virtio_transport_free_pkt(pkt);
> +			vq_err(vq, "Buffer len [%zu] too small\n", iov_len);
> +			break;
> +		}
> +
> +		iov_iter_init(&iov_iter, READ, &vq->iov[out], in, iov_len);
> +		payload_len = pkt->len - pkt->off;
> +
> +		/* If the packet is greater than the space available in the
> +		 * buffer, we split it using multiple buffers.
> +		 */
> +		if (payload_len > iov_len - sizeof(pkt->hdr))
> +			payload_len = iov_len - sizeof(pkt->hdr);
> +
> +		/* Set the correct length in the header */
> +		pkt->hdr.len = cpu_to_le32(payload_len);
>  
>  		nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
>  		if (nbytes != sizeof(pkt->hdr)) {
> @@ -157,33 +173,47 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
>  			break;
>  		}
>  
> -		nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);
> -		if (nbytes != pkt->len) {
> +		nbytes = copy_to_iter(pkt->buf + pkt->off, payload_len,
> +				      &iov_iter);
> +		if (nbytes != payload_len) {
>  			virtio_transport_free_pkt(pkt);
>  			vq_err(vq, "Faulted on copying pkt buf\n");
>  			break;
>  		}
>  
> -		vhost_add_used(vq, head, sizeof(pkt->hdr) + pkt->len);
> +		vhost_add_used(vq, head, sizeof(pkt->hdr) + payload_len);
>  		added = true;
>  
> -		if (pkt->reply) {
> -			int val;
> -
> -			val = atomic_dec_return(&vsock->queued_replies);
> -
> -			/* Do we have resources to resume tx processing? */
> -			if (val + 1 == tx_vq->num)
> -				restart_tx = true;
> -		}
> -
>  		/* Deliver to monitoring devices all correctly transmitted
>  		 * packets.
>  		 */
>  		virtio_transport_deliver_tap_pkt(pkt);
>  
> -		total_len += pkt->len;
> -		virtio_transport_free_pkt(pkt);
> +		pkt->off += payload_len;
> +		total_len += payload_len;
> +
> +		/* If we didn't send all the payload we can requeue the packet
> +		 * to send it with the next available buffer.
> +		 */
> +		if (pkt->off < pkt->len) {
> +			spin_lock_bh(&vsock->send_pkt_list_lock);
> +			list_add(&pkt->list, &vsock->send_pkt_list);
> +			spin_unlock_bh(&vsock->send_pkt_list_lock);
> +		} else {
> +			if (pkt->reply) {
> +				int val;
> +
> +				val = atomic_dec_return(&vsock->queued_replies);
> +
> +				/* Do we have resources to resume tx
> +				 * processing?
> +				 */
> +				if (val + 1 == tx_vq->num)
> +					restart_tx = true;
> +			}
> +
> +			virtio_transport_free_pkt(pkt);
> +		}
>  	} while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
>  	if (added)
>  		vhost_signal(&vsock->dev, vq);
> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
> index 34a2b42313b7..56fab3f03d0e 100644
> --- a/net/vmw_vsock/virtio_transport_common.c
> +++ b/net/vmw_vsock/virtio_transport_common.c
> @@ -97,8 +97,17 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
>  	struct virtio_vsock_pkt *pkt = opaque;
>  	struct af_vsockmon_hdr *hdr;
>  	struct sk_buff *skb;
> +	size_t payload_len;
> +	void *payload_buf;
>  
> -	skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + pkt->len,
> +	/* A packet could be split to fit the RX buffer, so we can retrieve
> +	 * the payload length from the header and the buffer pointer taking
> +	 * care of the offset in the original packet.
> +	 */
> +	payload_len = le32_to_cpu(pkt->hdr.len);
> +	payload_buf = pkt->buf + pkt->off;
> +
> +	skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + payload_len,
>  			GFP_ATOMIC);
>  	if (!skb)
>  		return NULL;
> @@ -138,8 +147,8 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
>  
>  	skb_put_data(skb, &pkt->hdr, sizeof(pkt->hdr));
>  
> -	if (pkt->len) {
> -		skb_put_data(skb, pkt->buf, pkt->len);
> +	if (payload_len) {
> +		skb_put_data(skb, payload_buf, payload_len);
>  	}
>  
>  	return skb;
> -- 
> 2.20.1

^ permalink raw reply

* Re: [PATCH v4 5/5] vsock/virtio: change the maximum packet size allowed
From: Michael S. Tsirkin @ 2019-07-17 14:59 UTC (permalink / raw)
  To: Stefano Garzarella
  Cc: netdev, linux-kernel, Stefan Hajnoczi, David S. Miller,
	virtualization, Jason Wang, kvm
In-Reply-To: <20190717113030.163499-6-sgarzare@redhat.com>

On Wed, Jul 17, 2019 at 01:30:30PM +0200, Stefano Garzarella wrote:
> Since now we are able to split packets, we can avoid limiting
> their sizes to VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE.
> Instead, we can use VIRTIO_VSOCK_MAX_PKT_BUF_SIZE as the max
> packet size.
> 
> Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>


OK so this is kind of like GSO where we are passing
64K packets to the vsock and then split at the
low level.


> ---
>  net/vmw_vsock/virtio_transport_common.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
> index 56fab3f03d0e..94cc0fa3e848 100644
> --- a/net/vmw_vsock/virtio_transport_common.c
> +++ b/net/vmw_vsock/virtio_transport_common.c
> @@ -181,8 +181,8 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
>  	vvs = vsk->trans;
>  
>  	/* we can send less than pkt_len bytes */
> -	if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE)
> -		pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
> +	if (pkt_len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE)
> +		pkt_len = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
>  
>  	/* virtio_transport_get_credit might return less than pkt_len credit */
>  	pkt_len = virtio_transport_get_credit(vvs, pkt_len);
> -- 
> 2.20.1

^ permalink raw reply

* Re: IPv6 L2TP issues related to 93531c67
From: Paul Donohue @ 2019-07-17 15:37 UTC (permalink / raw)
  To: David Ahern; +Cc: David S. Miller, Alexey Kuznetsov, Hideaki YOSHIFUJI, netdev
In-Reply-To: <22e3eabc-526d-8265-ac39-a6aefc9ef7db@gmail.com>

On Wed, Jul 17, 2019 at 05:11:21AM -0600, David Ahern wrote:
> This fixes the test script (whitespace damaged but simple enough to
> manually patch). See if it fixes the problem with your more complex
> setup. If so I will send a formal patch.

Yes! I applied this on top of f632a8170a6b667ee4e3f552087588f0fe13c4bb (master branch), and it fixes the problem on my systems.

Thank you very much!  I really appreciate all of your work on Linux networking!

^ permalink raw reply

* Re: KASAN: use-after-free Read in nr_release
From: syzbot @ 2019-07-17 16:11 UTC (permalink / raw)
  To: davem, hdanton, linux-hams, linux-kernel, netdev, ralf,
	syzkaller-bugs
In-Reply-To: <0000000000007e8b70058acbd60f@google.com>

syzbot has found a reproducer for the following crash on:

HEAD commit:    192f0f8e Merge tag 'powerpc-5.3-1' of git://git.kernel.org..
git tree:       net-next
console output: https://syzkaller.appspot.com/x/log.txt?x=171bde00600000
kernel config:  https://syzkaller.appspot.com/x/.config?x=87305c3ca9c25c70
dashboard link: https://syzkaller.appspot.com/bug?extid=6eaef7158b19e3fec3a0
compiler:       gcc (GCC) 9.0.0 20181231 (experimental)
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=15882cd0600000

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+6eaef7158b19e3fec3a0@syzkaller.appspotmail.com

==================================================================
BUG: KASAN: use-after-free in atomic_read  
/./include/asm-generic/atomic-instrumented.h:26 [inline]
BUG: KASAN: use-after-free in refcount_inc_not_zero_checked+0x81/0x200  
/lib/refcount.c:123
Read of size 4 at addr ffff88807be6b6c0 by task syz-executor.0/11548

CPU: 0 PID: 11548 Comm: syz-executor.0 Not tainted 5.2.0+ #66
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Call Trace:
  __dump_stack /lib/dump_stack.c:77 [inline]
  dump_stack+0x172/0x1f0 /lib/dump_stack.c:113
  print_address_description.cold+0xd4/0x306 /mm/kasan/report.c:351
  __kasan_report.cold+0x1b/0x36 /mm/kasan/report.c:482
  kasan_report+0x12/0x20 /mm/kasan/common.c:612
  check_memory_region_inline /mm/kasan/generic.c:185 [inline]
  check_memory_region+0x134/0x1a0 /mm/kasan/generic.c:192
  __kasan_check_read+0x11/0x20 /mm/kasan/common.c:92
  atomic_read /./include/asm-generic/atomic-instrumented.h:26 [inline]
  refcount_inc_not_zero_checked+0x81/0x200 /lib/refcount.c:123
  refcount_inc_checked+0x17/0x70 /lib/refcount.c:156
  sock_hold /./include/net/sock.h:649 [inline]
  nr_release+0x62/0x3e0 /net/netrom/af_netrom.c:520
  __sock_release+0xce/0x280 /net/socket.c:586
  sock_close+0x1e/0x30 /net/socket.c:1264
  __fput+0x2ff/0x890 /fs/file_table.c:280
  ____fput+0x16/0x20 /fs/file_table.c:313
  task_work_run+0x145/0x1c0 /kernel/task_work.c:113
  tracehook_notify_resume /./include/linux/tracehook.h:185 [inline]
  exit_to_usermode_loop+0x316/0x380 /arch/x86/entry/common.c:163
  prepare_exit_to_usermode /arch/x86/entry/common.c:194 [inline]
  syscall_return_slowpath /arch/x86/entry/common.c:274 [inline]
  do_syscall_64+0x5a9/0x6a0 /arch/x86/entry/common.c:299
  entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x413501
Code: 75 14 b8 03 00 00 00 0f 05 48 3d 01 f0 ff ff 0f 83 04 1b 00 00 c3 48  
83 ec 08 e8 0a fc ff ff 48 89 04 24 b8 03 00 00 00 0f 05 <48> 8b 3c 24 48  
89 c2 e8 53 fc ff ff 48 89 d0 48 83 c4 08 48 3d 01
RSP: 002b:00007ffe5eb40550 EFLAGS: 00000293 ORIG_RAX: 0000000000000003
RAX: 0000000000000000 RBX: 0000000000000004 RCX: 0000000000413501
RDX: 0000001b2be20000 RSI: 0000000000000000 RDI: 0000000000000003
RBP: 0000000000000001 R08: ffffffffffffffff R09: ffffffffffffffff
R10: 00007ffe5eb40630 R11: 0000000000000293 R12: 000000000075c9a0
R13: 000000000075c9a0 R14: 0000000000760a68 R15: ffffffffffffffff

Allocated by task 0:
  save_stack+0x23/0x90 /mm/kasan/common.c:69
  set_track /mm/kasan/common.c:77 [inline]
  __kasan_kmalloc /mm/kasan/common.c:487 [inline]
  __kasan_kmalloc.constprop.0+0xcf/0xe0 /mm/kasan/common.c:460
  kasan_kmalloc+0x9/0x10 /mm/kasan/common.c:501
  __do_kmalloc /mm/slab.c:3655 [inline]
  __kmalloc+0x163/0x780 /mm/slab.c:3664
  kmalloc /./include/linux/slab.h:557 [inline]
  sk_prot_alloc+0x23a/0x310 /net/core/sock.c:1603
  sk_alloc+0x39/0xf70 /net/core/sock.c:1657
  nr_make_new /net/netrom/af_netrom.c:476 [inline]
  nr_rx_frame+0x733/0x1e80 /net/netrom/af_netrom.c:959
  nr_loopback_timer+0x7b/0x170 /net/netrom/nr_loopback.c:59
  call_timer_fn+0x1ac/0x780 /kernel/time/timer.c:1322
  expire_timers /kernel/time/timer.c:1366 [inline]
  __run_timers /kernel/time/timer.c:1685 [inline]
  __run_timers /kernel/time/timer.c:1653 [inline]
  run_timer_softirq+0x697/0x17a0 /kernel/time/timer.c:1698
  __do_softirq+0x262/0x98c /kernel/softirq.c:292

Freed by task 11551:
  save_stack+0x23/0x90 /mm/kasan/common.c:69
  set_track /mm/kasan/common.c:77 [inline]
  __kasan_slab_free+0x102/0x150 /mm/kasan/common.c:449
  kasan_slab_free+0xe/0x10 /mm/kasan/common.c:457
  __cache_free /mm/slab.c:3425 [inline]
  kfree+0x10a/0x2c0 /mm/slab.c:3756
  sk_prot_free /net/core/sock.c:1640 [inline]
  __sk_destruct+0x4f7/0x6e0 /net/core/sock.c:1726
  sk_destruct+0x86/0xa0 /net/core/sock.c:1734
  __sk_free+0xfb/0x360 /net/core/sock.c:1745
  sk_free+0x42/0x50 /net/core/sock.c:1756
  sock_put /./include/net/sock.h:1725 [inline]
  sock_efree+0x61/0x80 /net/core/sock.c:2042
  skb_release_head_state+0xeb/0x260 /net/core/skbuff.c:652
  skb_release_all+0x16/0x60 /net/core/skbuff.c:663
  __kfree_skb /net/core/skbuff.c:679 [inline]
  kfree_skb /net/core/skbuff.c:697 [inline]
  kfree_skb+0x101/0x3c0 /net/core/skbuff.c:691
  nr_accept+0x570/0x720 /net/netrom/af_netrom.c:819
  __sys_accept4+0x34e/0x6a0 /net/socket.c:1750
  __do_sys_accept4 /net/socket.c:1785 [inline]
  __se_sys_accept4 /net/socket.c:1782 [inline]
  __x64_sys_accept4+0x97/0xf0 /net/socket.c:1782
  do_syscall_64+0xfd/0x6a0 /arch/x86/entry/common.c:296
  entry_SYSCALL_64_after_hwframe+0x49/0xbe

The buggy address belongs to the object at ffff88807be6b640
  which belongs to the cache kmalloc-2k of size 2048
The buggy address is located 128 bytes inside of
  2048-byte region [ffff88807be6b640, ffff88807be6be40)
The buggy address belongs to the page:
page:ffffea0001ef9a80 refcount:1 mapcount:0 mapping:ffff8880aa400e00  
index:0x0 compound_mapcount: 0
flags: 0x1fffc0000010200(slab|head)
raw: 01fffc0000010200 ffffea0001ef9708 ffffea0002453708 ffff8880aa400e00
raw: 0000000000000000 ffff88807be6a540 0000000100000003 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
  ffff88807be6b580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
  ffff88807be6b600: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb
> ffff88807be6b680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                                            ^
  ffff88807be6b700: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
  ffff88807be6b780: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
==================================================================
------------[ cut here ]------------
ODEBUG: activate not available (active state 0) object type: timer_list  
hint: nr_t1timer_expiry+0x0/0x340 /net/netrom/nr_timer.c:157
WARNING: CPU: 0 PID: 11548 at lib/debugobjects.c:481  
debug_print_object+0x168/0x250 /lib/debugobjects.c:481
Modules linked in:
CPU: 0 PID: 11548 Comm: syz-executor.0 Tainted: G    B             5.2.0+  
#66
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
RIP: 0010:debug_print_object+0x168/0x250 /lib/debugobjects.c:481
Code: dd a0 48 c5 87 48 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 b5 00 00 00 48  
8b 14 dd a0 48 c5 87 48 c7 c7 00 3e c5 87 e8 f0 b1 07 fe <0f> 0b 83 05 13  
86 66 06 01 48 83 c4 20 5b 41 5c 41 5d 41 5e 5d c3
RSP: 0018:ffff88809151faf0 EFLAGS: 00010082
RAX: 0000000000000000 RBX: 0000000000000005 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffffffff815c1016 RDI: ffffed10122a3f50
RBP: ffff88809151fb30 R08: ffff8880943fe300 R09: ffffed1015d040f1
R10: ffffed1015d040f0 R11: ffff8880ae820787 R12: 0000000000000001
R13: ffffffff88db4ca0 R14: ffffffff8161a860 R15: 1ffff110122a3f6c
FS:  0000555555737940(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fada90cddb8 CR3: 00000000a7f80000 CR4: 00000000001406f0
Call Trace:
  debug_object_activate+0x2e5/0x470 /lib/debugobjects.c:680
  debug_timer_activate /kernel/time/timer.c:710 [inline]
  __mod_timer /kernel/time/timer.c:1035 [inline]
  mod_timer+0x452/0xc10 /kernel/time/timer.c:1096
  sk_reset_timer+0x24/0x60 /net/core/sock.c:2821
  nr_start_t1timer+0x6e/0xa0 /net/netrom/nr_timer.c:52
  nr_release+0x1de/0x3e0 /net/netrom/af_netrom.c:537
  __sock_release+0xce/0x280 /net/socket.c:586
  sock_close+0x1e/0x30 /net/socket.c:1264
  __fput+0x2ff/0x890 /fs/file_table.c:280
  ____fput+0x16/0x20 /fs/file_table.c:313
  task_work_run+0x145/0x1c0 /kernel/task_work.c:113
  tracehook_notify_resume /./include/linux/tracehook.h:185 [inline]
  exit_to_usermode_loop+0x316/0x380 /arch/x86/entry/common.c:163
  prepare_exit_to_usermode /arch/x86/entry/common.c:194 [inline]
  syscall_return_slowpath /arch/x86/entry/common.c:274 [inline]
  do_syscall_64+0x5a9/0x6a0 /arch/x86/entry/common.c:299
  entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x413501
Code: 75 14 b8 03 00 00 00 0f 05 48 3d 01 f0 ff ff 0f 83 04 1b 00 00 c3 48  
83 ec 08 e8 0a fc ff ff 48 89 04 24 b8 03 00 00 00 0f 05 <48> 8b 3c 24 48  
89 c2 e8 53 fc ff ff 48 89 d0 48 83 c4 08 48 3d 01
RSP: 002b:00007ffe5eb40550 EFLAGS: 00000293 ORIG_RAX: 0000000000000003
RAX: 0000000000000000 RBX: 0000000000000004 RCX: 0000000000413501
RDX: 0000001b2be20000 RSI: 0000000000000000 RDI: 0000000000000003
RBP: 0000000000000001 R08: ffffffffffffffff R09: ffffffffffffffff
R10: 00007ffe5eb40630 R11: 0000000000000293 R12: 000000000075c9a0
R13: 000000000075c9a0 R14: 0000000000760a68 R15: ffffffffffffffff
irq event stamp: 1316
hardirqs last  enabled at (1315): [<ffffffff873119e8>]  
__raw_spin_unlock_irq /./include/linux/spinlock_api_smp.h:168 [inline]
hardirqs last  enabled at (1315): [<ffffffff873119e8>]  
_raw_spin_unlock_irq+0x28/0x90 /kernel/locking/spinlock.c:199
hardirqs last disabled at (1316): [<ffffffff8731216f>]  
__raw_spin_lock_irqsave /./include/linux/spinlock_api_smp.h:108 [inline]
hardirqs last disabled at (1316): [<ffffffff8731216f>]  
_raw_spin_lock_irqsave+0x6f/0xcd /kernel/locking/spinlock.c:159
softirqs last  enabled at (1168): [<ffffffff812923fe>] memcpy  
/./include/linux/string.h:359 [inline]
softirqs last  enabled at (1168): [<ffffffff812923fe>]  
fpu__copy+0x17e/0x8c0 /arch/x86/kernel/fpu/core.c:195
softirqs last disabled at (1166): [<ffffffff81292327>] fpu__copy+0xa7/0x8c0  
/arch/x86/kernel/fpu/core.c:183
---[ end trace c9359faa0df5eab0 ]---


^ permalink raw reply

* Re: [PATCH v4 13/15] docs: ABI: testing: make the files compatible with ReST output
From: Jonathan Cameron @ 2019-07-17 16:13 UTC (permalink / raw)
  To: Mauro Carvalho Chehab
  Cc: gregkh, Rafael J. Wysocki, Len Brown, Jonathan Cameron,
	Hartmut Knaack, Lars-Peter Clausen, Peter Meerwald-Stadler,
	Peter Rosin, Benson Leung, Enric Balletbo i Serra, Guenter Roeck,
	Maxime Coquelin, Alexandre Torgue, Fabrice Gasnier,
	Frederic Barrat, Andrew Donnellan, Sebastian Reichel,
	Heikki Krogerus, Boris Ostrovsky, Juergen Gross,
	Stefano Stabellini, Mike Kravetz, Nicolas Ferre,
	Alexandre Belloni, Ludovic Desroches, Richard Cochran,
	Jonathan Corbet, linux-acpi, linux-iio, linux-stm32,
	linux-arm-kernel, linuxppc-dev, linux-pm, linux-usb, xen-devel,
	linux-mm, netdev, linux-doc
In-Reply-To: <88d15fa38167e3f2e73e65e1c1a1f39bca0267b4.1563365880.git.mchehab+samsung@kernel.org>

On Wed, 17 Jul 2019 09:28:17 -0300
Mauro Carvalho Chehab <mchehab+samsung@kernel.org> wrote:

> Some files over there won't parse well by Sphinx.
> 
> Fix them.
> 
> Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Hi Mauro,

Does feel like this one should perhaps have been broken up a touch!

For the IIO ones I've eyeballed it rather than testing the results

Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>



^ permalink raw reply

* Re: [PATCH bpf] bpf: fix narrower loads on s390
From: Y Song @ 2019-07-17 16:25 UTC (permalink / raw)
  To: Ilya Leoshkevich; +Cc: bpf, netdev, gor, heiko.carstens
In-Reply-To: <4311B5C3-8D1B-4958-9CDE-450662A7851D@linux.ibm.com>

On Wed, Jul 17, 2019 at 3:36 AM Ilya Leoshkevich <iii@linux.ibm.com> wrote:
>
> > Am 17.07.2019 um 11:21 schrieb Ilya Leoshkevich <iii@linux.ibm.com>:
> >
> >> Am 17.07.2019 um 07:11 schrieb Y Song <ys114321@gmail.com>:
> >>
> >> [sorry, resend again as previous one has come text messed out due to
> >> networking issues]
> >>
> >> On Tue, Jul 16, 2019 at 10:08 PM Y Song <ys114321@gmail.com> wrote:
> >>>
> >>> On Tue, Jul 16, 2019 at 4:59 AM Ilya Leoshkevich <iii@linux.ibm.com> wrote:
> >>>>
> >>>> test_pkt_md_access is failing on s390, since the associated eBPF prog
> >>>> returns TC_ACT_SHOT, which in turn happens because loading a part of a
> >>>> struct __sk_buff field produces an incorrect result.
> >>>>
> >>>> The problem is that when verifier emits the code to replace partial load
> >>>> of a field with a full load, a shift and a bitwise AND, it assumes that
> >>>> the machine is little endian.
> >>>>
> >>>> Adjust shift count calculation to account for endianness.
> >>>>
> >>>> Fixes: 31fd85816dbe ("bpf: permits narrower load from bpf program context fields")
> >>>> Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
> >>>> ---
> >>>> kernel/bpf/verifier.c | 8 ++++++--
> >>>> 1 file changed, 6 insertions(+), 2 deletions(-)
> >>>>
> >>>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> >>>> index 5900cbb966b1..3f9353653558 100644
> >>>> --- a/kernel/bpf/verifier.c
> >>>> +++ b/kernel/bpf/verifier.c
> >>>> @@ -8616,8 +8616,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
> >>>>               }
> >>>>
> >>>>               if (is_narrower_load && size < target_size) {
> >>>> -                       u8 shift = (off & (size_default - 1)) * 8;
> >>>> -
> >>>> +                       u8 load_off = off & (size_default - 1);
> >>>> +#ifdef __LITTLE_ENDIAN
> >>>> +                       u8 shift = load_off * 8;
> >>>> +#else
> >>>> +                       u8 shift = (size_default - (load_off + size)) * 8;
> >>>> +#endif
> >>>
> >> All the values are in register. The shifting operations should be the
> >> same for big endian and little endian, e.g., value 64 >> 2 = 16 when
> >> value "64" is in register. So I did not see a problem here.
> >>
> >> Could you elaborate which field access in test_pkt_md_access
> >> caused problem?
> >
> > The very first one: TEST_FIELD(__u8,  len, 0xFF);
> >
> >> It would be good if you can give detailed memory layout and register values
> >> to illustrate the problem.
> >
> > Suppose len = 0x11223344. On a big endian system, this would be
> >
> > 11 22 33 44
> >
> > Now, we would like to do *(u8 *)&len, the desired result is 0x11.
> > Verifier should emit the following: ((*(u32 *)&len) >> 24) & 0xff, but as
> > of today it misses the shift.
> >
> > On a little endian system the layout is:
> >
> > 44 33 22 11
> >
> > and the desired result is different - 0x44. Verifier correctly emits
> > (*(u32 *)&len) & 0xff.
>
> I’ve just realized, that this example does not reflect what the test is
> doing on big-endian systems (there is an #ifdef for those).
>
> Here is a better one: len=0x11223344 and we would like to do
> ((u8 *)&len)[3].
>
> len is represented as `11 22 33 44` in memory, so the desired result is
> 0x44. It can be obtained by doing (*(u32 *)&len) & 0xff, but today the
> verifier does ((*(u32 *)&len) >> 24) & 0xff instead.

What you described above for the memory layout all makes sense.
The root cause is for big endian, we should do *((u8 *)&len + 3).
This is exactly what macros in test_pkt_md_access.c tries to do.

if  __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define TEST_FIELD(TYPE, FIELD, MASK)                                   \
        {                                                               \
                TYPE tmp = *(volatile TYPE *)&skb->FIELD;               \
                if (tmp != ((*(volatile __u32 *)&skb->FIELD) & MASK))   \
                        return TC_ACT_SHOT;                             \
        }
#else
#define TEST_FIELD_OFFSET(a, b) ((sizeof(a) - sizeof(b)) / sizeof(b))
#define TEST_FIELD(TYPE, FIELD, MASK)                                   \
        {                                                               \
                TYPE tmp = *((volatile TYPE *)&skb->FIELD +             \
                              TEST_FIELD_OFFSET(skb->FIELD, TYPE));     \
                if (tmp != ((*(volatile __u32 *)&skb->FIELD) & MASK))   \
                        return TC_ACT_SHOT;                             \
        }
#endif

Could you check whether your __BYTE_ORDER__ is set
correctly or not for this case? You may need to tweak Makefile
if you are doing cross compilation, I am not sure how as I
did not have environment.

^ permalink raw reply

* Re: [PATCH net-next v1] fix: taprio: Change type of txtime-delay parameter to u32
From: Patel, Vedang @ 2019-07-17 17:32 UTC (permalink / raw)
  To: David Miller
  Cc: netdev@vger.kernel.org, Kirsher, Jeffrey T, Jamal Hadi Salim,
	Cong Wang, Jiri Pirko, intel-wired-lan@lists.osuosl.org,
	Gomes, Vinicius, l@dorileo.org, Jakub Kicinski, Murali Karicheri,
	Sergei Shtylyov, Eric Dumazet, Brown, Aaron F, Stephen Hemminger
In-Reply-To: <20190716.141904.308520366333461345.davem@davemloft.net>



> On Jul 16, 2019, at 2:19 PM, David Miller <davem@davemloft.net> wrote:
> 
> From: Vedang Patel <vedang.patel@intel.com>
> Date: Tue, 16 Jul 2019 12:52:18 -0700
> 
>> During the review of the iproute2 patches for txtime-assist mode, it was
>> pointed out that it does not make sense for the txtime-delay parameter to
>> be negative. So, change the type of the parameter from s32 to u32.
>> 
>> Fixes: 4cfd5779bd6e ("taprio: Add support for txtime-assist mode")
>> Reported-by: Stephen Hemminger <stephen@networkplumber.org>
>> Signed-off-by: Vedang Patel <vedang.patel@intel.com>
> 
> You should have targetted this at 'net' as that's the only tree open
> right now.
> 
> I'll apply this.

Sorry about that.

I will keep this in mind from next time. 

Thanks,
Vedang

^ permalink raw reply

* [PATCH net-next v2 0/2] Fix batched event generation for skbedit action
From: Roman Mashak @ 2019-07-17 17:36 UTC (permalink / raw)
  To: davem; +Cc: netdev, kernel, jhs, xiyou.wangcong, jiri, Roman Mashak

When adding or deleting a batch of entries, the kernel sends up to
TCA_ACT_MAX_PRIO (defined to 32 in kernel) entries in an event to user
space. However it does not consider that the action sizes may vary and
require different skb sizes.

For example, consider the following script adding 32 entries with all
supported skbedit parameters (in order to maximize netlink messages size):

% cat tc-batch.sh
TC="sudo /mnt/iproute2.git/tc/tc"

$TC actions flush action skbedit
for i in `seq 1 $1`;
do
   cmd="action skbedit queue_mapping 2 priority 10 mark 7/0xaabbccdd ptype host inheritdsfield index $i "
   args=$args$cmd
done
$TC actions add $args
%
% ./tc-batch.sh 32
Error: Failed to fill netlink attributes while adding TC action.
We have an error talking to the kernel
%

patch 1 adds callback in tc_action_ops of skbedit action, which calculates
the action size, and passes size to tcf_add_notify()/tcf_del_notify().

patch 2 updates the TDC test suite with relevant skbedit test cases.

v2:
   Added Fixes: tag
   Added cover letter with details on the patchset

Roman Mashak (2):
  net sched: update skbedit action for batched events operations
  tc-testing: updated skbedit action tests with batch create/delete

 net/sched/act_skbedit.c                            | 12 ++++++
 .../tc-testing/tc-tests/actions/skbedit.json       | 47 ++++++++++++++++++++++
 2 files changed, 59 insertions(+)

-- 
2.7.4


^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox