Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH net-next v2 06/10] net: bcmgenet: add main driver file
From: Francois Romieu @ 2014-02-13 10:35 UTC (permalink / raw)
  To: Florian Fainelli; +Cc: netdev, davem, cernekee, devicetree
In-Reply-To: <1392269395-23513-7-git-send-email-f.fainelli@gmail.com>

Florian Fainelli <f.fainelli@gmail.com> :
[...]
> diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
> new file mode 100644
> index 0000000..ab71e81
> --- /dev/null
> +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
[...]
> +static int bcmgenet_set_rx_csum(struct net_device *dev,
> +				netdev_features_t wanted)
> +{
> +	struct bcmgenet_priv *priv = netdev_priv(dev);
> +	u32 rbuf_chk_ctrl;
> +	int rx_csum_en;
> +
> +	rx_csum_en = !!(wanted & NETIF_F_RXCSUM);

It's a bool.

> +
> +	spin_lock_bh(&priv->bh_lock);
> +	rbuf_chk_ctrl = bcmgenet_rbuf_readl(priv, RBUF_CHK_CTRL);
> +
> +	/* enable rx checksumming */
> +	if (!rx_csum_en)
> +		rbuf_chk_ctrl &= ~RBUF_RXCHK_EN;
> +	else
> +		rbuf_chk_ctrl |= RBUF_RXCHK_EN;
> +	priv->desc_rxchk_en = rx_csum_en;
> +	bcmgenet_rbuf_writel(priv, rbuf_chk_ctrl, RBUF_CHK_CTRL);
> +
> +	spin_unlock_bh(&priv->bh_lock);
> +
> +	return 0;
> +}
> +static int bcmgenet_set_tx_csum(struct net_device *dev,

Missing empty line.

[...]
> +static void bcmgenet_update_mib_counters(struct bcmgenet_priv *priv)
> +{
> +	int i, j = 0;
> +
> +	for (i = 0; i < BCMGENET_STATS_LEN; i++) {
> +		const struct bcmgenet_stats *s;
> +		u32 val = 0;
> +		char *p;
> +		u8 offset = 0;

Xmas tree please.

[...]
> +static void bcmgenet_get_ethtool_stats(struct net_device *dev,
> +					struct ethtool_stats *stats,
> +					u64 *data)
> +{
> +	struct bcmgenet_priv *priv = netdev_priv(dev);
> +	int i;
> +
> +	mutex_lock(&priv->mib_mutex);
> +	if (netif_running(dev))
> +		bcmgenet_update_mib_counters(priv);
> +
> +	for (i = 0; i < BCMGENET_STATS_LEN; i++) {
> +		const struct bcmgenet_stats *s;
> +		char *p;
> +
> +		s = &bcmgenet_gstrings_stats[i];
> +		if (s->type == BCMGENET_STAT_NETDEV)
> +			p = (char *)&dev->stats;
> +		else
> +			p = (char *)priv;
> +		p += s->stat_offset;
> +		data[i] = *(u32 *)p;
> +	}
> +	mutex_unlock(&priv->mib_mutex);

The mutex is not used anywhere else and dev_ethtool runs under RTNL.

[...]
> +/* Assign skb to RX DMA descriptor. */
> +static int bcmgenet_alloc_rx_buffers(struct bcmgenet_priv *priv)
> +{
> +	struct enet_cb *cb;

Wrong scope.

> +	int ret = 0;
> +	int i;
> +	u32 reg;
> +
> +	netif_dbg(priv, hw, priv->dev, "%s:\n", __func__);
> +
> +	/* This function may be called from irq bottom-half. */
> +	spin_lock_bh(&priv->bh_lock);

The Rx part of the NAPI handler directly calls bcmgenet_rx_refill through
bcmgenet_desc_rx. bcmgenet_poll does not sync with bh_lock.

Either some factoring was forgotten or some legacy locking / comment was
left in place (there should not be any ->open vs ->poll race)

> +
> +	/* loop here for each buffer needing assign */
> +	for (i = 0; i < priv->num_rx_bds; i++) {
> +		cb = &priv->rx_cbs[priv->rx_bd_assign_index];
> +		if (cb->skb)
> +			continue;
> +
> +		/* set the DMA descriptor length once and for all
> +		 * it will only change if we support dynamically sizing
> +		 * priv->rx_buf_len, but we do not
> +		 */
> +		dmadesc_set_length_status(priv, priv->rx_bd_assign_ptr,
> +				priv->rx_buf_len << DMA_BUFLENGTH_SHIFT);
> +
> +		ret = bcmgenet_rx_refill(priv, cb);
> +		if (ret)
> +			break;
> +
> +	}
> +
> +	/* Enable rx DMA incase it was disabled due to running out of rx BD */

Nit: nothing proves even a single descriptor suceeded allocation.

[...]
> +static int reset_umac(struct bcmgenet_priv *priv)
> +{
> +	struct device *kdev = &priv->pdev->dev;
> +	unsigned int timeout = 0;
> +	u32 reg;
> +
> +	/* 7358a0/7552a0: bad default in RBUF_FLUSH_CTRL.umac_sw_rst */
> +	bcmgenet_rbuf_ctrl_set(priv, 0);
> +	udelay(10);
> +
> +	/* disable MAC while updating its registers */
> +	bcmgenet_umac_writel(priv, 0, UMAC_CMD);
> +
> +	/* issue soft reset, wait for it to complete */
> +	bcmgenet_umac_writel(priv, CMD_SW_RESET, UMAC_CMD);
> +	while (timeout++ < 1000) {
> +		reg = bcmgenet_umac_readl(priv, UMAC_CMD);
> +		if (!(reg & CMD_SW_RESET))
> +			break;

			return 0;

> +		udelay(1);
> +	}
> +
> +	if (timeout == 1000) {
> +		dev_err(kdev,
> +			"timeout waiting for MAC to come out of resetn\n");
> +		return -ETIMEDOUT;
> +	}
> +
> +	return 0;
> +}
> +
> +/* init_umac: Initializes the uniMac controller */

Useless.

> +static int init_umac(struct bcmgenet_priv *priv)
> +{
[...]
> +static void bcmgenet_init_multiq(struct net_device *dev)
> +{
> +	struct bcmgenet_priv *priv = netdev_priv(dev);
> +	unsigned int i, dma_enable;
> +	u32 reg, dma_ctrl, ring_cfg = 0, dma_priority = 0;
> +
> +	if (!netif_is_multiqueue(dev)) {
> +		netdev_warn(dev, "called with non multi queue aware HW\n");
> +		return;
> +	}
> +
> +	dma_ctrl = bcmgenet_tdma_readl(priv, DMA_CTRL);
> +	dma_enable = dma_ctrl & DMA_EN;
> +	dma_ctrl &= ~DMA_EN;
> +	bcmgenet_tdma_writel(priv, dma_ctrl, DMA_CTRL);
> +
> +	/* Enable strict priority arbiter mode */
> +	bcmgenet_tdma_writel(priv, DMA_ARBITER_SP, DMA_ARB_CTRL);
> +
> +	for (i = 0; i < priv->hw_params->tx_queues; i++) {
> +		/* first 64 tx_cbs are reserved for default tx queue
> +		 * (ring 16)
> +		 */
> +		bcmgenet_init_tx_ring(priv, i, priv->hw_params->bds_cnt,
> +					i * priv->hw_params->bds_cnt,
> +					(i + 1) * priv->hw_params->bds_cnt);
> +
> +		/* Configure ring as decriptor ring and setup priority */
> +		ring_cfg |= (1 << i);
> +		dma_priority |= ((GENET_Q0_PRIORITY + i) <<
> +				(GENET_MAX_MQ_CNT + 1) * i);
> +		dma_ctrl |= (1 << (i + DMA_RING_BUF_EN_SHIFT));

Excess parenthesis.

[...]
> +/* NAPI polling method*/
> +static int bcmgenet_poll(struct napi_struct *napi, int budget)
> +{
> +	struct bcmgenet_priv *priv = container_of(napi,
> +			struct bcmgenet_priv, napi);
> +	unsigned int work_done;
> +
> +	work_done = bcmgenet_desc_rx(priv, budget);
> +
> +	/* tx reclaim */
> +	bcmgenet_tx_reclaim(priv->dev, &priv->tx_rings[DESC_INDEX]);

You may move the quick Tx reclaim before the slower Rx protocol processing.

> +	/* Advancing our consumer index*/
> +	priv->rx_c_index += work_done;
> +	priv->rx_c_index &= DMA_C_INDEX_MASK;
> +	bcmgenet_rdma_ring_writel(priv, DESC_INDEX,
> +				priv->rx_c_index, RDMA_CONS_INDEX);
> +	if (work_done < budget) {
> +		napi_complete(napi);
> +		bcmgenet_intrl2_0_writel(priv,
> +			UMAC_IRQ_RXDMA_BDONE, INTRL2_CPU_MASK_CLEAR);
> +	}
> +
> +	return work_done;
> +}
> +
> +/* Interrupt bottom half */
> +static void bcmgenet_irq_task(struct work_struct *work)
> +{
> +	struct bcmgenet_priv *priv = container_of(
> +			work, struct bcmgenet_priv, bcmgenet_irq_work);
> +	struct net_device *dev;

	struct net_device *dev = priv->dev;

> +	u32 reg;
> +
> +	dev = priv->dev;
> +
> +	netif_dbg(priv, intr, dev, "%s\n", __func__);
> +	/* Cable plugged/unplugged event */
> +	if (priv->phy_interface == PHY_INTERFACE_MODE_INTERNAL) {
> +		if (priv->irq0_stat & UMAC_IRQ_PHY_DET_R) {
> +			priv->irq0_stat &= ~UMAC_IRQ_PHY_DET_R;
> +			netif_crit(priv, link, dev,
> +				"cable plugged in, powering up\n");
> +			bcmgenet_power_up(priv, GENET_POWER_CABLE_SENSE);
> +		} else if (priv->irq0_stat & UMAC_IRQ_PHY_DET_F) {
> +			priv->irq0_stat &= ~UMAC_IRQ_PHY_DET_F;
> +			netif_crit(priv, link, dev,
> +				"cable unplugged, powering down\n");
> +			bcmgenet_power_down(priv, GENET_POWER_CABLE_SENSE);
> +		}
> +	}
> +	if (priv->irq0_stat & UMAC_IRQ_MPD_R) {
> +		priv->irq0_stat &= ~UMAC_IRQ_MPD_R;
> +		netif_crit(priv, wol, dev,
> +			"magic packet detected, waking up\n");
> +		/* disable mpd interrupt */
> +		bcmgenet_intrl2_0_writel(priv,
> +			UMAC_IRQ_MPD_R, INTRL2_CPU_MASK_SET);
> +		/* disable CRC forward.*/
> +		reg = bcmgenet_umac_readl(priv, UMAC_CMD);
> +		reg &= ~CMD_CRC_FWD;
> +		bcmgenet_umac_writel(priv, reg, UMAC_CMD);
> +		priv->crc_fwd_en = 0;
> +		bcmgenet_power_up(priv, GENET_POWER_WOL_MAGIC);
> +
> +	} else if (priv->irq0_stat & (UMAC_IRQ_HFB_SM | UMAC_IRQ_HFB_MM)) {
> +		priv->irq0_stat &= ~(UMAC_IRQ_HFB_SM | UMAC_IRQ_HFB_MM);
> +		netif_crit(priv, wol, dev,
> +			"ACPI pattern matched, waking up\n");
> +		/* disable HFB match interrupts */
> +		bcmgenet_intrl2_0_writel(priv,
> +			UMAC_IRQ_HFB_SM | UMAC_IRQ_HFB_MM, INTRL2_CPU_MASK_SET);
> +		bcmgenet_power_up(priv, GENET_POWER_WOL_ACPI);
> +	}

It smells of half-backed wol / runtime power support. Imvho it deserves
some comment in the changelog message to hint about its maturity.

> +
> +	/* Link UP/DOWN event */
> +	if ((priv->hw_params->flags & GENET_HAS_MDIO_INTR) &&
> +		(priv->irq0_stat & (UMAC_IRQ_LINK_UP|UMAC_IRQ_LINK_DOWN))) {
> +		if (priv->phydev)
> +			phy_mac_interrupt(priv->phydev,
> +				(priv->irq0_stat & UMAC_IRQ_LINK_UP));
> +		priv->irq0_stat &= ~(UMAC_IRQ_LINK_UP|UMAC_IRQ_LINK_DOWN);
> +	}
> +}
> +
> +/* bcmgenet_isr1: interrupt handler for ring buffer. */
> +static irqreturn_t bcmgenet_isr1(int irq, void *dev_id)
> +{
> +	struct bcmgenet_priv *priv = dev_id;
> +	unsigned int index;

Wrong scope.

> +
> +	/* Save irq status for bottom-half processing. */
> +	priv->irq1_stat =
> +		bcmgenet_intrl2_1_readl(priv, INTRL2_CPU_STAT) &
> +		~priv->int1_mask;
> +	/* clear inerrupts*/
> +	bcmgenet_intrl2_1_writel(priv, priv->irq1_stat, INTRL2_CPU_CLEAR);
> +
> +	netif_dbg(priv, intr, priv->dev,
> +		"%s: IRQ=0x%x\n", __func__, priv->irq1_stat);
> +	/* Check the MBDONE interrupts.
> +	 * packet is done, reclaim descriptors
> +	 */
> +	if (priv->irq1_stat & 0x0000ffff) {
> +		index = 0;
> +		for (index = 0; index < 16; index++) {

Proofread patrol alert :o)

(...]
> +static int bcmgenet_dma_teardown(struct bcmgenet_priv *priv)
> +{
> +	int timeout = 0;
> +	u32 reg;
> +
> +	/* Disable TDMA to stop add more frames in TX DMA */
> +	reg = bcmgenet_tdma_readl(priv, DMA_CTRL);
> +	reg &= ~DMA_EN;
> +	bcmgenet_tdma_writel(priv, reg, DMA_CTRL);
> +
> +	/* Check TDMA status register to confirm TDMA is disabled */
> +	while (!(bcmgenet_tdma_readl(priv, DMA_STATUS) & DMA_DISABLED)) {
> +		if (timeout++ == 5000) {
> +			netdev_warn(priv->dev,
> +				"Timed out while disabling TX DMA\n");
> +			return -ETIMEDOUT;
> +		}
> +		udelay(1);
> +	}

On the stylistic side, the driver hesitates between "while", "for" timeout
and conditions loop. I'd go for the boring "int i; for (i = 0; i < max; i++)"
style but it's your call.

On the worrying side, even if Tx DMA does not stop, the driver should
try to disable Rx DMA.

> +
> +	/* Wait 10ms for packet drain in both tx and rx dma */
> +	usleep_range(10000, 20000);
> +
> +	/* Disable RDMA */
> +	reg = bcmgenet_rdma_readl(priv, DMA_CTRL);
> +	reg &= ~DMA_EN;
> +	bcmgenet_rdma_writel(priv, reg, DMA_CTRL);
> +
> +	timeout = 0;
> +	/* Check RDMA status register to confirm RDMA is disabled */
> +	while (!(bcmgenet_rdma_readl(priv, DMA_STATUS) & DMA_DISABLED)) {
> +		if (timeout++ == 5000) {
> +			netdev_warn(priv->dev,
> +				"Timed out while disabling RX DMA\n");
> +			return -ETIMEDOUT;
> +		}
> +		udelay(1);
> +	}
> +
> +	return 0;
> +}
[...]
> +static void bcmgenet_timeout(struct net_device *dev)
> +{
> +	struct bcmgenet_priv *priv = netdev_priv(dev);
> +
> +	BUG_ON(dev == NULL);

dev == NULL should be noticed quickly.

> +
> +	netif_dbg(priv, tx_err, dev, "bcmgenet_timeout\n");
> +
> +	dev->trans_start = jiffies;
> +
> +	dev->stats.tx_errors++;
> +
> +	netif_tx_wake_all_queues(dev);

dev_watchdog already complains (loudly).

Is it really supposed to recover ?

> +}
> +
> +#define MAX_MC_COUNT	16
> +
> +static inline void bcmgenet_set_mdf_addr(struct bcmgenet_priv *priv,
> +					 unsigned char *addr,
> +					 int *i,
> +					 int *mc)
> +{
> +	u32 reg;
> +
> +	bcmgenet_umac_writel(priv, addr[0] << 8 | addr[1],
> +			UMAC_MDF_ADDR + (*i * 4));
> +	bcmgenet_umac_writel(priv,
> +			addr[2] << 24 | addr[3] << 16 |
> +			addr[4] << 8 | addr[5],
> +			UMAC_MDF_ADDR + ((*i + 1) * 4));

I would not expect such an indent to pass beyond davem.

> +	reg = bcmgenet_umac_readl(priv, UMAC_MDF_CTRL);
> +	reg |= (1 << (MAX_MC_COUNT - *mc));
> +	bcmgenet_umac_writel(priv, reg, UMAC_MDF_CTRL);
> +	*i += 2;
> +	(*mc)++;
> +}
> +
> +static void bcmgenet_set_rx_mode(struct net_device *dev)
> +{
> +	struct bcmgenet_priv *priv = netdev_priv(dev);
> +	struct netdev_hw_addr *ha;
> +	int i, mc;
> +	u32 reg;
> +
> +	netif_dbg(priv, hw, dev, "%s: %08X\n", __func__, dev->flags);
> +
> +	/* Promiscous mode */
> +	reg = bcmgenet_umac_readl(priv, UMAC_CMD);
> +	if (dev->flags & IFF_PROMISC) {
> +		reg |= CMD_PROMISC;
> +		bcmgenet_umac_writel(priv, reg, UMAC_CMD);
> +		bcmgenet_umac_writel(priv, 0, UMAC_MDF_CTRL);
> +		return;
> +	} else {
> +		reg &= ~CMD_PROMISC;
> +		bcmgenet_umac_writel(priv, reg, UMAC_CMD);
> +	}
> +
> +	/* UniMac doesn't support ALLMULTI */
> +	if (dev->flags & IFF_ALLMULTI)
> +		return;

The driver did not fulfill the request. It could complain to help user.

[...]
> +static int bcmgenet_set_mac_addr(struct net_device *dev, void *p)
> +{
> +	struct sockaddr *addr = p;
> +
> +	if (netif_running(dev))
> +		return -EBUSY;

Add a comment to specify if it is a hardware shortcoming or something else ?

> +
> +	ether_addr_copy(dev->dev_addr, addr->sa_data);
> +
> +	return 0;
> +}

[...]
> +static const struct net_device_ops bcmgenet_netdev_ops = {
> +	.ndo_open = bcmgenet_open,
> +	.ndo_stop = bcmgenet_close,
> +	.ndo_start_xmit = bcmgenet_xmit,
> +	.ndo_select_queue = bcmgenet_select_queue,
> +	.ndo_tx_timeout = bcmgenet_timeout,
> +	.ndo_set_rx_mode = bcmgenet_set_rx_mode,
> +	.ndo_set_mac_address = bcmgenet_set_mac_addr,
> +	.ndo_do_ioctl = bcmgenet_ioctl,
> +	.ndo_set_features = bcmgenet_set_features,
> +};

Please use tabs before '=' to line things up.

[...]
> +static int bcmgenet_probe(struct platform_device *pdev)
> +{
> +	struct device_node *dn = pdev->dev.of_node;
> +	struct bcmgenet_priv *priv;
> +	struct net_device *dev;
> +	const void *macaddr;
> +	struct resource *r;
> +	int err = -EIO;
> +
> +	/* Up to GENET_MAX_MQ_CNT + 1 TX queues and a single RX queue */
> +	dev = alloc_etherdev_mqs(sizeof(*priv), GENET_MAX_MQ_CNT + 1, 1);
> +	if (!dev) {
> +		dev_err(&pdev->dev, "can't allocate net device\n");
> +		return -ENOMEM;
> +	}
> +
> +	priv = netdev_priv(dev);
> +	priv->irq0 = platform_get_irq(pdev, 0);
> +	priv->irq1 = platform_get_irq(pdev, 1);
> +	if (!priv->irq0 || !priv->irq1) {
> +		dev_err(&pdev->dev, "can't find IRQs\n");
> +		err = -EINVAL;
> +		goto err;
> +	}
> +
> +	macaddr = of_get_mac_address(dn);
> +	if (!macaddr) {
> +		dev_err(&pdev->dev, "can't find MAC address\n");
> +		err = -EINVAL;
> +		goto err;
> +	}
> +
> +	r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
> +	priv->base = devm_request_and_ioremap(&pdev->dev, r);
> +	if (!priv->base) {
> +		dev_err(&pdev->dev, "can't ioremap\n");
> +		err = -EINVAL;
> +		goto err;
> +	}
> +
> +	dev->base_addr = (unsigned long)priv->base;

base_addr in net_device is a legacy hack.

> +	SET_NETDEV_DEV(dev, &pdev->dev);
> +	dev_set_drvdata(&pdev->dev, dev);
> +	ether_addr_copy(dev->dev_addr, macaddr);
> +	dev->irq = priv->irq0;

And so is irq.

-- 
Ueimor

^ permalink raw reply

* Re: [PATCH net-next v2 04/10] net: phy: add Broadcom BCM7xxx internal PHY driver
From: Francois Romieu @ 2014-02-13 10:34 UTC (permalink / raw)
  To: Florian Fainelli; +Cc: netdev, davem, cernekee, devicetree
In-Reply-To: <1392269395-23513-5-git-send-email-f.fainelli@gmail.com>

Florian Fainelli <f.fainelli@gmail.com> :
[...]
> diff --git a/drivers/net/phy/bcm7xxx.c b/drivers/net/phy/bcm7xxx.c
> new file mode 100644
> index 0000000..6aea6e2
> --- /dev/null
> +++ b/drivers/net/phy/bcm7xxx.c
[...]
> +static int bcm7445_config_init(struct phy_device *phydev)
> +{
> +	int ret;

It could be declared after 'i' below.

> +	const struct bcm7445_regs {

static const

> +		int reg;
> +		u16 value;
> +	} bcm7445_regs_cfg[] = {
> +		/* increases ADC latency by 24ns */
> +		{ 0x17, 0x0038 },
> +		{ 0x15, 0xAB95 },
> +		/* increases internal 1V LDO voltage by 5% */
> +		{ 0x17, 0x2038 },
> +		{ 0x15, 0xBB22 },
> +		/* reduce RX low pass filter corner frequency */
> +		{ 0x17, 0x6038 },
> +		{ 0x15, 0xFFC5 },
> +		/* reduce RX high pass filter corner frequency */
> +		{ 0x17, 0x003a },
> +		{ 0x15, 0x2002 },
> +	};
> +	unsigned int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(bcm7445_regs_cfg); i++) {
> +		ret = phy_write(phydev,
> +				bcm7445_regs_cfg[i].reg,
> +				bcm7445_regs_cfg[i].value);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +static void phy_write_exp(struct phy_device *phydev,
> +					u16 reg, u16 value)

static void phy_write_exp(struct phy_device *phydev, u16 reg, u16 value)

> +{
> +	phy_write(phydev, 0x17, 0xf00 | reg);
> +	phy_write(phydev, 0x15, value);
> +}
> +
> +static void phy_write_misc(struct phy_device *phydev,
> +					u16 reg, u16 chl, u16 value)
   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ all tabs that don't line up

static void phy_write_misc(struct phy_device *phydev,
			   u16 reg, u16 chl, u16 value)

static void phy_write_misc(struct phy_device *phydev, u16 reg, u16 chl,
			   u16 value)

static void phy_write_misc(struct phy_device *phydev, u16 reg, u16 chl, u16 val)


> +{
> +	int tmp;
> +
> +	phy_write(phydev, 0x18, 0x7);
> +
> +	tmp = phy_read(phydev, 0x18);
> +	tmp |= 0x800;
> +	phy_write(phydev, 0x18, tmp);
> +
> +	tmp = (chl * 0x2000) | reg;
> +	phy_write(phydev, 0x17, tmp);
> +
> +	phy_write(phydev, 0x15, value);

You may use some #define for the 0x15, 0x17 and 0x18 values.

> +}
> +
> +static int bcm7xxx_28nm_afe_config_init(struct phy_device *phydev)
> +{
> +	/* write AFE_RXCONFIG_0 */
> +	phy_write_misc(phydev, 0x38, 0x0000, 0xeb19);
> +
> +	/* write AFE_RXCONFIG_1 */
> +	phy_write_misc(phydev, 0x38, 0x0001, 0x9a3f);
> +
> +	/* write AFE_RX_LP_COUNTER */
> +	phy_write_misc(phydev, 0x38, 0x0003, 0x7fc7);
> +
> +	/* write AFE_HPF_TRIM_OTHERS */
> +	phy_write_misc(phydev, 0x3A, 0x0000, 0x000b);
> +
> +	/* write AFTE_TX_CONFIG */
> +	phy_write_misc(phydev, 0x39, 0x0000, 0x0800);

Some #define may be welcome to replace the comments.

[...]
> +static int bcm7xxx_28nm_config_init(struct phy_device *phydev)
> +{
> +	int ret;
> +
> +	ret = bcm7445_config_init(phydev);
> +	if (ret)
> +		return ret;
> +
> +	return bcm7xxx_28nm_afe_config_init(phydev);
> +}
> +
> +static int phy_set_clr_bits(struct phy_device *dev, int location,
> +					int set_mask, int clr_mask)
> +{
> +	int v, ret;
> +
> +	v = phy_read(dev, location);
> +	if (v < 0)
> +		return v;
> +
> +	v &= ~clr_mask;
> +	v |= set_mask;
> +
> +	ret = phy_write(dev, location, v);
> +	if (ret < 0)
> +		return ret;
> +
> +	return v;
> +}
> +
> +static int bcm7xxx_config_init(struct phy_device *phydev)
> +{
> +	/* Enable 64 clock MDIO */
> +	phy_write(phydev, 0x1d, 0x1000);
> +	phy_read(phydev, 0x1d);
> +
> +	/* Workaround only required for 100Mbits/sec */
> +	if (!(phydev->dev_flags & PHY_BRCM_100MBPS_WAR))
> +		return 0;
> +
> +	/* set shadow mode 2 */
> +	phy_set_clr_bits(phydev, 0x1f, 0x0004, 0x0004);

phy_set_clr_bits returned status code is not checked.

> +
> +	/* set iddq_clkbias */
> +	phy_write(phydev, 0x14, 0x0F00);
> +	udelay(10);
> +
> +	/* reset iddq_clkbias */
> +	phy_write(phydev, 0x14, 0x0C00);
> +
> +	phy_write(phydev, 0x13, 0x7555);
> +
> +	/* reset shadow mode 2 */
> +	phy_set_clr_bits(phydev, 0x1f, 0x0004, 0);

phy_set_clr_bits returned status code is not checked.

> +
> +	return 0;
> +}
> +
> +/* Workaround for putting the PHY in IDDQ mode, required
> + * for all BCM7XXX PHYs
> + */
> +static int bcm7xxx_suspend(struct phy_device *phydev)

Factor out with bcm7445_config_init and some helper ?

> +{
> +	int ret;
> +	const struct bcm7xxx_regs {
> +		int reg;
> +		u16 value;
> +	} bcm7xxx_suspend_cfg[] = {
> +		{ 0x1f, 0x008b },
> +		{ 0x10, 0x01c0 },
> +		{ 0x14, 0x7000 },
> +		{ 0x1f, 0x000f },
> +		{ 0x10, 0x20d0 },
> +		{ 0x1f, 0x000b },
> +	};
> +	unsigned int i;

-- 
Ueimor

^ permalink raw reply

* Re: ip_set: protocol %u message -- useful?
From: Jozsef Kadlecsik @ 2014-02-13 10:30 UTC (permalink / raw)
  To: Ilia Mirkin
  Cc: netdev, linux-kernel@vger.kernel.org, Patrick McHardy,
	Vitaly Lavrov
In-Reply-To: <CAKb7UvhsLi2Sv4SWA2nVTgoFV8dzrY=3R1-SUk9D-aa0VdizSA@mail.gmail.com>

On Thu, 13 Feb 2014, Ilia Mirkin wrote:

> I've recently noticed a lot of
> 
> [356872.380595] ip_set: protocol 6

That means the ip_set module has been loaded in multiple times.
 
> messages in my dmesg. This might be because of some local
> configuration changes I've made, or perhaps a kernel upgrade. Either
> way, it appears this message has been a pr_notice since the original
> code added it in a7b4f989a62 ("netfilter: ipset: IP set core
> support").
> 
> Does this message provide a lot of value? Or could it be made into a pr_debug?

That's a report message on the protocol version used by the ipset 
subsystem. There was (and possibly will be) multiple protocols, so it 
helps to catch basic userpsace/kernelspace communication issues.
 
> FWIW commit 1785e8f473 ("netfiler: ipset: Add net namespace for
> ipset"), merged in v3.13-rc1 changed the code around which may have
> made it more likely to appear in dmesg (with the namespace stuff). Not
> sure though. I don't (purposely) use namespaces.

Namespaces can explain why you see the message so many times, but then you 
must have namespaces activated.
 
> I'm happy to put together a patch demoting it to pr_debug if people
> think it's OK.

I'm fine with it, it's OK.

Best regards,
Jozsef
-
E-mail  : kadlec@blackhole.kfki.hu, kadlecsik.jozsef@wigner.mta.hu
PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt
Address : Wigner Research Centre for Physics, Hungarian Academy of Sciences
          H-1525 Budapest 114, POB. 49, Hungary

^ permalink raw reply

* Re: [PATCH net v2] vhost: fix a theoretical race in device cleanup
From: Jason Wang @ 2014-02-13 10:01 UTC (permalink / raw)
  To: Michael S. Tsirkin, linux-kernel, David Miller
  Cc: kvm, virtio-dev, virtualization, netdev
In-Reply-To: <1392284540-2031-1-git-send-email-mst@redhat.com>

On 02/13/2014 05:45 PM, Michael S. Tsirkin wrote:
> vhost_zerocopy_callback accesses VQ right after it drops a ubuf
> reference.  In theory, this could race with device removal which waits
> on the ubuf kref, and crash on use after free.
>
> Do all accesses within rcu read side critical section, and synchronize
> on release.
>
> Since callbacks are always invoked from bh, synchronize_rcu_bh seems
> enough and will help release complete a bit faster.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>
> This is was previously posted as part of patch
> series, but it's an independent fix really.
> Theoretical race so not needed for stable I think.
>
> changes from v1:
> 	fixed typo in commit log
>
>  drivers/vhost/net.c | 6 ++++++
>  1 file changed, 6 insertions(+)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index b12176f..f1be80d 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -308,6 +308,8 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
>  	struct vhost_virtqueue *vq = ubufs->vq;
>  	int cnt;
>  
> +	rcu_read_lock_bh();
> +
>  	/* set len to mark this desc buffers done DMA */
>  	vq->heads[ubuf->desc].len = success ?
>  		VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
> @@ -322,6 +324,8 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
>  	 */
>  	if (cnt <= 1 || !(cnt % 16))
>  		vhost_poll_queue(&vq->poll);
> +
> +	rcu_read_unlock_bh();
>  }
>  
>  /* Expects to be always run from workqueue - which acts as
> @@ -804,6 +808,8 @@ static int vhost_net_release(struct inode *inode, struct file *f)
>  		fput(tx_sock->file);
>  	if (rx_sock)
>  		fput(rx_sock->file);
> +	/* Make sure no callbacks are outstanding */
> +	synchronize_rcu_bh();
>  	/* We do an extra flush before freeing memory,
>  	 * since jobs can re-queue themselves. */
>  	vhost_net_flush(n);

Acked-by: Jason Wang <jasowang@redhat.com>

^ permalink raw reply

* Re: [PATCH net v2] vhost: fix ref cnt checking deadlock
From: Jason Wang @ 2014-02-13 10:00 UTC (permalink / raw)
  To: Michael S. Tsirkin, linux-kernel, David Miller
  Cc: Qin Chuanyu, kvm, virtio-dev, virtualization, netdev
In-Reply-To: <1392284448-1977-1-git-send-email-mst@redhat.com>

On 02/13/2014 05:42 PM, Michael S. Tsirkin wrote:
> vhost checked the counter within the refcnt before decrementing.  It
> really wanted to know that it is the one that has the last reference, as
> a way to batch freeing resources a bit more efficiently.
>
> Note: we only let refcount go to 0 on device release.
>
> This works well but we now access the ref counter twice so there's a
> race: all users might see a high count and decide to defer freeing
> resources.
> In the end no one initiates freeing resources until the last reference
> is gone (which is on VM shotdown so might happen after a looooong time).
>
> Let's do what we probably should have done straight away:
> switch from kref to plain atomic, documenting the
> semantics, return the refcount value atomically after decrement,
> then use that to avoid the deadlock.
>
> Reported-by: Qin Chuanyu <qinchuanyu@huawei.com>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>
> This patch is needed for 3.14 and -stable.
>
>  drivers/vhost/net.c | 41 ++++++++++++++++++++---------------------
>  1 file changed, 20 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 831eb4f..b12176f 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -70,7 +70,12 @@ enum {
>  };
>  
>  struct vhost_net_ubuf_ref {
> -	struct kref kref;
> +	/* refcount follows semantics similar to kref:
> +	 *  0: object is released
> +	 *  1: no outstanding ubufs
> +	 * >1: outstanding ubufs
> +	 */
> +	atomic_t refcount;
>  	wait_queue_head_t wait;
>  	struct vhost_virtqueue *vq;
>  };
> @@ -116,14 +121,6 @@ static void vhost_net_enable_zcopy(int vq)
>  	vhost_net_zcopy_mask |= 0x1 << vq;
>  }
>  
> -static void vhost_net_zerocopy_done_signal(struct kref *kref)
> -{
> -	struct vhost_net_ubuf_ref *ubufs;
> -
> -	ubufs = container_of(kref, struct vhost_net_ubuf_ref, kref);
> -	wake_up(&ubufs->wait);
> -}
> -
>  static struct vhost_net_ubuf_ref *
>  vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
>  {
> @@ -134,21 +131,24 @@ vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
>  	ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL);
>  	if (!ubufs)
>  		return ERR_PTR(-ENOMEM);
> -	kref_init(&ubufs->kref);
> +	atomic_set(&ubufs->refcount, 1);
>  	init_waitqueue_head(&ubufs->wait);
>  	ubufs->vq = vq;
>  	return ubufs;
>  }
>  
> -static void vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs)
> +static int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs)
>  {
> -	kref_put(&ubufs->kref, vhost_net_zerocopy_done_signal);
> +	int r = atomic_sub_return(1, &ubufs->refcount);
> +	if (unlikely(!r))
> +		wake_up(&ubufs->wait);
> +	return r;
>  }
>  
>  static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs)
>  {
> -	kref_put(&ubufs->kref, vhost_net_zerocopy_done_signal);
> -	wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount));
> +	vhost_net_ubuf_put(ubufs);
> +	wait_event(ubufs->wait, !atomic_read(&ubufs->refcount));
>  }
>  
>  static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs)
> @@ -306,22 +306,21 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
>  {
>  	struct vhost_net_ubuf_ref *ubufs = ubuf->ctx;
>  	struct vhost_virtqueue *vq = ubufs->vq;
> -	int cnt = atomic_read(&ubufs->kref.refcount);
> +	int cnt;
>  
>  	/* set len to mark this desc buffers done DMA */
>  	vq->heads[ubuf->desc].len = success ?
>  		VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
> -	vhost_net_ubuf_put(ubufs);
> +	cnt = vhost_net_ubuf_put(ubufs);
>  
>  	/*
>  	 * Trigger polling thread if guest stopped submitting new buffers:
> -	 * in this case, the refcount after decrement will eventually reach 1
> -	 * so here it is 2.
> +	 * in this case, the refcount after decrement will eventually reach 1.
>  	 * We also trigger polling periodically after each 16 packets
>  	 * (the value 16 here is more or less arbitrary, it's tuned to trigger
>  	 * less than 10% of times).
>  	 */
> -	if (cnt <= 2 || !(cnt % 16))
> +	if (cnt <= 1 || !(cnt % 16))
>  		vhost_poll_queue(&vq->poll);
>  }
>  
> @@ -420,7 +419,7 @@ static void handle_tx(struct vhost_net *net)
>  			msg.msg_control = ubuf;
>  			msg.msg_controllen = sizeof(ubuf);
>  			ubufs = nvq->ubufs;
> -			kref_get(&ubufs->kref);
> +			atomic_inc(&ubufs->refcount);
>  			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
>  		} else {
>  			msg.msg_control = NULL;
> @@ -785,7 +784,7 @@ static void vhost_net_flush(struct vhost_net *n)
>  		vhost_net_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs);
>  		mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
>  		n->tx_flush = false;
> -		kref_init(&n->vqs[VHOST_NET_VQ_TX].ubufs->kref);
> +		atomic_set(&n->vqs[VHOST_NET_VQ_TX].ubufs->refcount, 1);
>  		mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
>  	}
>  }

Acked-by: Jason Wang <jasowang@redhat.com>

^ permalink raw reply

* [PATCH net v2] vhost: fix a theoretical race in device cleanup
From: Michael S. Tsirkin @ 2014-02-13  9:45 UTC (permalink / raw)
  To: linux-kernel, David Miller; +Cc: virtio-dev, netdev, kvm, virtualization

vhost_zerocopy_callback accesses VQ right after it drops a ubuf
reference.  In theory, this could race with device removal which waits
on the ubuf kref, and crash on use after free.

Do all accesses within rcu read side critical section, and synchronize
on release.

Since callbacks are always invoked from bh, synchronize_rcu_bh seems
enough and will help release complete a bit faster.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---

This is was previously posted as part of patch
series, but it's an independent fix really.
Theoretical race so not needed for stable I think.

changes from v1:
	fixed typo in commit log

 drivers/vhost/net.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index b12176f..f1be80d 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -308,6 +308,8 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
 	struct vhost_virtqueue *vq = ubufs->vq;
 	int cnt;
 
+	rcu_read_lock_bh();
+
 	/* set len to mark this desc buffers done DMA */
 	vq->heads[ubuf->desc].len = success ?
 		VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
@@ -322,6 +324,8 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
 	 */
 	if (cnt <= 1 || !(cnt % 16))
 		vhost_poll_queue(&vq->poll);
+
+	rcu_read_unlock_bh();
 }
 
 /* Expects to be always run from workqueue - which acts as
@@ -804,6 +808,8 @@ static int vhost_net_release(struct inode *inode, struct file *f)
 		fput(tx_sock->file);
 	if (rx_sock)
 		fput(rx_sock->file);
+	/* Make sure no callbacks are outstanding */
+	synchronize_rcu_bh();
 	/* We do an extra flush before freeing memory,
 	 * since jobs can re-queue themselves. */
 	vhost_net_flush(n);
-- 
MST

^ permalink raw reply related

* [PATCH net v2] vhost: fix ref cnt checking deadlock
From: Michael S. Tsirkin @ 2014-02-13  9:42 UTC (permalink / raw)
  To: linux-kernel, David Miller
  Cc: virtio-dev, kvm, netdev, virtualization, Qin Chuanyu

vhost checked the counter within the refcnt before decrementing.  It
really wanted to know that it is the one that has the last reference, as
a way to batch freeing resources a bit more efficiently.

Note: we only let refcount go to 0 on device release.

This works well but we now access the ref counter twice so there's a
race: all users might see a high count and decide to defer freeing
resources.
In the end no one initiates freeing resources until the last reference
is gone (which is on VM shotdown so might happen after a looooong time).

Let's do what we probably should have done straight away:
switch from kref to plain atomic, documenting the
semantics, return the refcount value atomically after decrement,
then use that to avoid the deadlock.

Reported-by: Qin Chuanyu <qinchuanyu@huawei.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---

This patch is needed for 3.14 and -stable.

 drivers/vhost/net.c | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 831eb4f..b12176f 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -70,7 +70,12 @@ enum {
 };
 
 struct vhost_net_ubuf_ref {
-	struct kref kref;
+	/* refcount follows semantics similar to kref:
+	 *  0: object is released
+	 *  1: no outstanding ubufs
+	 * >1: outstanding ubufs
+	 */
+	atomic_t refcount;
 	wait_queue_head_t wait;
 	struct vhost_virtqueue *vq;
 };
@@ -116,14 +121,6 @@ static void vhost_net_enable_zcopy(int vq)
 	vhost_net_zcopy_mask |= 0x1 << vq;
 }
 
-static void vhost_net_zerocopy_done_signal(struct kref *kref)
-{
-	struct vhost_net_ubuf_ref *ubufs;
-
-	ubufs = container_of(kref, struct vhost_net_ubuf_ref, kref);
-	wake_up(&ubufs->wait);
-}
-
 static struct vhost_net_ubuf_ref *
 vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
 {
@@ -134,21 +131,24 @@ vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
 	ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL);
 	if (!ubufs)
 		return ERR_PTR(-ENOMEM);
-	kref_init(&ubufs->kref);
+	atomic_set(&ubufs->refcount, 1);
 	init_waitqueue_head(&ubufs->wait);
 	ubufs->vq = vq;
 	return ubufs;
 }
 
-static void vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs)
+static int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs)
 {
-	kref_put(&ubufs->kref, vhost_net_zerocopy_done_signal);
+	int r = atomic_sub_return(1, &ubufs->refcount);
+	if (unlikely(!r))
+		wake_up(&ubufs->wait);
+	return r;
 }
 
 static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs)
 {
-	kref_put(&ubufs->kref, vhost_net_zerocopy_done_signal);
-	wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount));
+	vhost_net_ubuf_put(ubufs);
+	wait_event(ubufs->wait, !atomic_read(&ubufs->refcount));
 }
 
 static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs)
@@ -306,22 +306,21 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
 {
 	struct vhost_net_ubuf_ref *ubufs = ubuf->ctx;
 	struct vhost_virtqueue *vq = ubufs->vq;
-	int cnt = atomic_read(&ubufs->kref.refcount);
+	int cnt;
 
 	/* set len to mark this desc buffers done DMA */
 	vq->heads[ubuf->desc].len = success ?
 		VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
-	vhost_net_ubuf_put(ubufs);
+	cnt = vhost_net_ubuf_put(ubufs);
 
 	/*
 	 * Trigger polling thread if guest stopped submitting new buffers:
-	 * in this case, the refcount after decrement will eventually reach 1
-	 * so here it is 2.
+	 * in this case, the refcount after decrement will eventually reach 1.
 	 * We also trigger polling periodically after each 16 packets
 	 * (the value 16 here is more or less arbitrary, it's tuned to trigger
 	 * less than 10% of times).
 	 */
-	if (cnt <= 2 || !(cnt % 16))
+	if (cnt <= 1 || !(cnt % 16))
 		vhost_poll_queue(&vq->poll);
 }
 
@@ -420,7 +419,7 @@ static void handle_tx(struct vhost_net *net)
 			msg.msg_control = ubuf;
 			msg.msg_controllen = sizeof(ubuf);
 			ubufs = nvq->ubufs;
-			kref_get(&ubufs->kref);
+			atomic_inc(&ubufs->refcount);
 			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
 		} else {
 			msg.msg_control = NULL;
@@ -785,7 +784,7 @@ static void vhost_net_flush(struct vhost_net *n)
 		vhost_net_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs);
 		mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
 		n->tx_flush = false;
-		kref_init(&n->vqs[VHOST_NET_VQ_TX].ubufs->kref);
+		atomic_set(&n->vqs[VHOST_NET_VQ_TX].ubufs->refcount, 1);
 		mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
 	}
 }
-- 
MST

^ permalink raw reply related

* Re: bnx2x + SR-IOV, no internal L2 switching
From: Yoann Juet @ 2014-02-13  9:42 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: netdev, Ariel Elior
In-Reply-To: <1392246640.15615.37.camel@deadeye.wl.decadent.org.uk>


[-- Attachment #1.1: Type: text/plain, Size: 563 bytes --]

Hi Ben,

 > Are you're using the ISC DHCP client, which puts the interface in
> promiscuous mode?  If the Broadcom NIC supports promiscuous mode on VFs,
> that may explain what you're seeing.

No, I put the interface into promiscuous mode with Wireshark's tshark 
command line tool.

> I think these VFs don't support promiscuous mode.  Anyway, the ixgbevf
> driver silently ignores it.

This could be an explanation. Ariel Elior sent to me a patch. Test in 
progress.

Regards,
-- 
Université de Nantes - Direction des Systèmes d'Information

[-- Attachment #1.2: yoann_juet.vcf --]
[-- Type: text/x-vcard, Size: 377 bytes --]

begin:vcard
fn:Yoann Juet
n:Juet;Yoann
org;quoted-printable;quoted-printable:Direction des Syst=C3=A8mes d'Information;P=C3=B4le R=C3=A9seau
adr;quoted-printable:BP 92208;;2 rue de la Houssini=C3=A8re;Nantes Cedex 3;;44322;France
email;internet:yoann.juet@univ-nantes.fr
tel;work:02.53.48.49.26
tel;fax:02.53.48.49.09
tel;cell:06.73.15.42.19
version:2.1
end:vcard


[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 3256 bytes --]

^ permalink raw reply

* Re: nonagle flags for TSQ
From: John Ogness @ 2014-02-13  9:34 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <1391792743.10160.59.camel@edumazet-glaptop2.roam.corp.google.com>

Hi Eric,

Since the patch has already been pulled, I don't know how much energy
you want to spend to track this down. I am acting as a middle-man for
the person with the problem, which is the reason for the large latency
in my responses.

On 2014-02-07, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> I cant see how TSQ would even trigger with this test.

The problem is appearing on a kernel with the PREEMPT_RT patch
(3.10.20-rt17). I do not see anything in the PREEMPT_RT patch that would
affect the use of TSQ.

> What value do you have on /proc/sys/net/ipv4/tcp_limit_output_bytes ?

conf 	: icmp_echo_ignore_all 	: 0
icmp_echo_ignore_broadcasts 	: 1
icmp_echo_sysrq 	: 0
icmp_errors_use_inbound_ifaddr 	: 0
icmp_ignore_bogus_error_responses 	: 1
icmp_ratelimit 	: 1000
icmp_ratemask 	: 6168
igmp_max_memberships 	: 20
igmp_max_msf 	: 10
inet_peer_maxttl 	: 600
inet_peer_minttl 	: 120
inet_peer_threshold 	: 65664
ip_default_ttl 	: 64
ip_dynaddr 	: 0
ip_early_demux 	: 1
ip_forward 	: 0
ip_local_port_range 	: 32768	61000
ip_local_reserved_ports 	: 
ip_no_pmtu_disc 	: 0
ip_nonlocal_bind 	: 0
ipfrag_high_thresh 	: 4194304
ipfrag_low_thresh 	: 3145728
ipfrag_max_dist 	: 64
ipfrag_secret_interval 	: 600
ipfrag_time 	: 30
neigh 	: ping_group_range 	: 1	0
route 	: tcp_abort_on_overflow 	: 0
tcp_adv_win_scale 	: 1
tcp_allowed_congestion_control 	: cubic reno
tcp_app_win 	: 31
tcp_available_congestion_control 	: cubic reno
tcp_base_mss 	: 512
tcp_challenge_ack_limit 	: 100
tcp_congestion_control 	: cubic
tcp_dsack 	: 1
tcp_early_retrans 	: 3
tcp_ecn 	: 2
tcp_fack 	: 1
tcp_fastopen 	: 0
tcp_fastopen_key 	: fe6450bc-a4524655-c76e138c-10df4490
tcp_fin_timeout 	: 60
tcp_frto 	: 2
tcp_keepalive_intvl 	: 75
tcp_keepalive_probes 	: 9
tcp_keepalive_time 	: 7200
tcp_limit_output_bytes 	: 131072
tcp_low_latency 	: 0
tcp_max_orphans 	: 4096
tcp_max_ssthresh 	: 0
tcp_max_syn_backlog 	: 128
tcp_max_tw_buckets 	: 4096
tcp_mem 	: 20187	26919	40374
tcp_min_tso_segs 	: 2
tcp_moderate_rcvbuf 	: 1
tcp_mtu_probing 	: 0
tcp_no_metrics_save 	: 0
tcp_orphan_retries 	: 0
tcp_reordering 	: 3
tcp_retrans_collapse 	: 1
tcp_retries1 	: 3
tcp_retries2 	: 15
tcp_rfc1337 	: 0
tcp_rmem 	: 4096	87380	6291456
tcp_sack 	: 1
tcp_slow_start_after_idle 	: 1
tcp_stdurg 	: 0
tcp_syn_retries 	: 6
tcp_synack_retries 	: 5
tcp_syncookies 	: 1
tcp_thin_dupack 	: 0
tcp_thin_linear_timeouts 	: 0
tcp_timestamps 	: 1
tcp_tso_win_divisor 	: 3
tcp_tw_recycle 	: 0
tcp_tw_reuse 	: 0
tcp_window_scaling 	: 1
tcp_wmem 	: 4096	16384	4194304
tcp_workaround_signed_windows 	: 0
udp_mem 	: 20319	27093	40638
udp_rmem_min 	: 4096
udp_wmem_min 	: 4096

John Ogness

^ permalink raw reply

* ip_set: protocol %u message -- useful?
From: Ilia Mirkin @ 2014-02-13  9:30 UTC (permalink / raw)
  To: netdev, linux-kernel@vger.kernel.org, Jozsef Kadlecsik,
	Patrick McHardy, Vitaly Lavrov

Hello,

I've recently noticed a lot of

[356872.380595] ip_set: protocol 6

messages in my dmesg. This might be because of some local
configuration changes I've made, or perhaps a kernel upgrade. Either
way, it appears this message has been a pr_notice since the original
code added it in a7b4f989a62 ("netfilter: ipset: IP set core
support").

Does this message provide a lot of value? Or could it be made into a pr_debug?

FWIW commit 1785e8f473 ("netfiler: ipset: Add net namespace for
ipset"), merged in v3.13-rc1 changed the code around which may have
made it more likely to appear in dmesg (with the namespace stuff). Not
sure though. I don't (purposely) use namespaces.

I'm happy to put together a patch demoting it to pr_debug if people
think it's OK.

Thanks,

  -ilia

^ permalink raw reply

* RE: [PATCH net-next 10/14] ethtool: Expand documentation of struct ethtool_stats
From: David Laight @ 2014-02-13  9:17 UTC (permalink / raw)
  To: 'Ben Hutchings', David Miller; +Cc: netdev@vger.kernel.org
In-Reply-To: <1392243283.15615.14.camel@deadeye.wl.decadent.org.uk>

From: Of Ben Hutchings
> -/* for dumping NIC-specific statistics */
> +/**
> + * struct ethtool_stats - device-specific statistics
> + * @cmd: Command number = %ETHTOOL_GSTATS
> + * @n_stats: On return, the number of statistics
> + * @data: Array of statistics
> + *
> + * Users must use %ETHTOOL_GSSET_INFO or %ETHTOOL_GDRVINFO to find the
> + * number of statistics that will be returned.  They must allocate a
> + * buffer of the appropriate size (8 * number of statistics)
> + * immediately following this structure.
> + */
>  struct ethtool_stats {
> -	__u32	cmd;		/* ETHTOOL_GSTATS */
> -	__u32	n_stats;	/* number of u64's being returned */
> +	__u32	cmd;
> +	__u32	n_stats;
>  	__u64	data[0];
>  };

Hmmm... Although that allows some script to generate documentation,
for anyone looking at the heeder file it seems a retrograde step.

	David


^ permalink raw reply

* Re: [PATCH] usbnet: remove generic hard_header_len check
From: Bjørn Mork @ 2014-02-13  9:05 UTC (permalink / raw)
  To: Emil Goode
  Cc: Steve Glendinning, Oliver Neukum, David S. Miller, Freddy Xin,
	Eric Dumazet, Ming Lei, Paul Gortmaker, Jeff Kirsher,
	Liu Junliang, Octavian Purdila, linux-usb, netdev, linux-kernel
In-Reply-To: <1392249836-27151-1-git-send-email-emilgoode@gmail.com>

Emil Goode <emilgoode@gmail.com> writes:

> This patch removes a generic hard_header_len check from the usbnet
> module that is causing dropped packages under certain circumstances
> for devices that send rx packets that cross urb boundaries.
>
> One example is the AX88772B which occasionally send rx packets that
> cross urb boundaries where the remaining partial packet is sent with
> no hardware header. When the buffer with a partial packet is of less
> number of octets than the value of hard_header_len the buffer is
> discarded by the usbnet module.
>
> With AX88772B this can be reproduced by using ping with a packet
> size between 1965-1976.
>
> The bug has been reported here:
>
> https://bugzilla.kernel.org/show_bug.cgi?id=29082
>
> This patch introduces the following changes:
> - Removes the generic hard_header_len check in the rx_complete
>   function in the usbnet module.
> - Introduces a ETH_HLEN check for skbs that are not cloned from
>   within a rx_fixup callback.
> - For safety a hard_header_len check is added to each rx_fixup
>   callback function that could be affected by this change.
>   These extra checks could possibly be removed by someone
>   who has the hardware to test.
>
> The changes place full responsibility on the rx_fixup callback
> functions that clone skbs to only pass valid skbs to the
> usbnet_skb_return function.
>
> Signed-off-by: Emil Goode <emilgoode@gmail.com>
> Reported-by: Igor Gnatenko <i.gnatenko.brain@gmail.com>


Great work!  Looks good to me.

Just a couple of questions:

> diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
> index ff5c871..b2e2aee 100644
> --- a/drivers/net/usb/qmi_wwan.c
> +++ b/drivers/net/usb/qmi_wwan.c
> @@ -80,10 +80,10 @@ static int qmi_wwan_rx_fixup(struct usbnet *dev, struct sk_buff *skb)
>  {
>  	__be16 proto;
>  
> -	/* usbnet rx_complete guarantees that skb->len is at least
> -	 * hard_header_len, so we can inspect the dest address without
> -	 * checking skb->len
> -	 */
> +	/* This check is no longer done by usbnet */
> +	if (skb->len < dev->net->hard_header_len)
> +		return 0;
> +
>  	switch (skb->data[0] & 0xf0) {
>  	case 0x40:
>  		proto = htons(ETH_P_IP);
> diff --git a/drivers/net/usb/rndis_host.c b/drivers/net/usb/rndis_host.c
> index a48bc0f..524a47a 100644
> --- a/drivers/net/usb/rndis_host.c
> +++ b/drivers/net/usb/rndis_host.c
> @@ -492,6 +492,10 @@ EXPORT_SYMBOL_GPL(rndis_unbind);
>   */
>  int rndis_rx_fixup(struct usbnet *dev, struct sk_buff *skb)
>  {
> +	/* This check is no longer done by usbnet */
> +	if (skb->len < dev->net->hard_header_len)
> +		return 0;
> +

Wouldn't it be better to test against ETH_HLEN, since that is a constant
and "obviously correct" in this case?


> diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
> index 4671da7..dd10d58 100644
> --- a/drivers/net/usb/usbnet.c
> +++ b/drivers/net/usb/usbnet.c
> @@ -542,17 +542,19 @@ static inline void rx_process (struct usbnet *dev, struct sk_buff *skb)
>  	}
>  	// else network stack removes extra byte if we forced a short packet
>  
> -	if (skb->len) {
> -		/* all data was already cloned from skb inside the driver */
> -		if (dev->driver_info->flags & FLAG_MULTI_PACKET)
> -			dev_kfree_skb_any(skb);
> -		else
> -			usbnet_skb_return(dev, skb);
> +	/* all data was already cloned from skb inside the driver */
> +	if (dev->driver_info->flags & FLAG_MULTI_PACKET)
> +		goto done;
> +
> +	if (skb->len < ETH_HLEN) {
> +		dev->net->stats.rx_errors++;
> +		dev->net->stats.rx_length_errors++;
> +		netif_dbg(dev, rx_err, dev->net, "rx length %d\n", skb->len);
> +	} else {
> +		usbnet_skb_return(dev, skb);
>  		return;
>  	}
>  
> -	netif_dbg(dev, rx_err, dev->net, "drop\n");
> -	dev->net->stats.rx_errors++;
>  done:
>  	skb_queue_tail(&dev->done, skb);
>  }


At first glance I wondered where the dev_kfree_skb_any(skb) went.  But
then I realized that you leave that to the common rx_cleanup path, using
the dev->done queue.  Is that right?  If so, then I guess it could use a
bit of explaining in the commit message?

If I'm wrong, then I'm still looking for the explanation of where the
dev_kfree_skb_any went :-)



Bjørn

^ permalink raw reply

* [PATCH -next V2] gre: return more precise errno value when adding tunnel fails
From: Florian Westphal @ 2014-02-13  8:58 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal

Currently this always returns ENOBUFS, because the return value of
__ip_tunnel_create is discarded.

A more common failure is a duplicate name (EEXIST).  Propagate the real
error code so userspace can display a more meaningful error message.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 Changes since v1:
  Use ERR_CAST() (suggested by Ben Hutchings)

 net/ipv4/ip_tunnel.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 50228be..f137c5d 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -443,7 +443,7 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
 	fbt = netdev_priv(itn->fb_tunnel_dev);
 	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
 	if (IS_ERR(dev))
-		return NULL;
+		return ERR_CAST(dev);
 
 	dev->mtu = ip_tunnel_bind_dev(dev);
 
@@ -796,9 +796,13 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
 
 		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
 
-		if (!t && (cmd == SIOCADDTUNNEL))
+		if (!t && (cmd == SIOCADDTUNNEL)) {
 			t = ip_tunnel_create(net, itn, p);
-
+			if (IS_ERR(t)) {
+				err = PTR_ERR(t);
+				break;
+			}
+		}
 		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
 			if (t != NULL) {
 				if (t->dev != dev) {
@@ -825,8 +829,9 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
 		if (t) {
 			err = 0;
 			ip_tunnel_update(itn, t, dev, p, true);
-		} else
-			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+		} else {
+			err = -ENOENT;
+		}
 		break;
 
 	case SIOCDELTUNNEL:
-- 
1.8.1.5

^ permalink raw reply related

* Re: [PATCH] ipv4: arp: process only if ipv4 address configured
From: Florian Westphal @ 2014-02-13  8:56 UTC (permalink / raw)
  To: David Miller; +Cc: hannes, fw, netdev
In-Reply-To: <20140212.192520.955911778570827692.davem@davemloft.net>

David Miller <davem@davemloft.net> wrote:
> From: Hannes Frederic Sowa <hannes@stressinduktion.org>
> Date: Thu, 13 Feb 2014 01:24:13 +0100
> 
> > I actually expect arp answers for ip addresses bound to loopback even from an
> > interface without ip address, if we strictly conform to the week end host
> > model in linux.
> 
> Agreed.

Thanks for your inpout everyone.  I've self-rejected the patch.

^ permalink raw reply

* [PATCH] irtty-sir.c: Do not set_termios() on irtty_close()
From: Tommie Gannert @ 2014-02-13  8:52 UTC (permalink / raw)
  To: Samuel Ortiz; +Cc: netdev, linux-kernel

From: Tommie Gannert <tommie@gannert.se>

Issuing set_termios() from irtty_close() causes kernel Oops for
unplugged usb-serial devices.

Since no other tty_ldisc calls set_termios() on close and no tty driver
seem to check if tty->device_data is NULL or not on entry to set_termios(),
the only solution I can come up with is to remove the irtty_stop_receiver()
call, which only updates termios.

Signed-off-by: Tommie Gannert <tommie@gannert.se>

---
I know very little of this code, and I'm not sure this is a good solution,
so here's some background:

I have a gadget using IrLAP over RS-232, and it's connected to a
USB-RS232 converter. I have udev rules to start/stop irattach on USB
connect/disconnect, but I see an oops if I simply disconnect the device
while irattach is still running:

Crash log:

[  631.791294] BUG: unable to handle kernel NULL pointer dereference at 0000000000000024
[  631.791390] IP: [<ffffffffa0c07e42>] ftdi_set_termios+0x42/0x670 [ftdi_sio]
...
[  631.793963]  [<ffffffffa0b7fba3>] serial_set_termios+0x43/0x90 [usbserial]
[  631.794031]  [<ffffffffa0c013ec>] irtty_close+0x10c/0x190 [irtty_sir]
[  631.794096]  [<ffffffff81356e88>] tty_ldisc_close.isra.1+0x38/0x50
[  631.794157]  [<ffffffff81356eb8>] tty_ldisc_kill+0x18/0x90
[  631.794209]  [<ffffffff81357894>] tty_ldisc_release+0x34/0x90
[  631.794266]  [<ffffffff8134fe00>] tty_release+0x470/0x600

There is a comment in irtty_close() speculating about potential problems
with usb-serial. I'm not sure if that comment belongs to sirdev_put_instance() or
the irtty_stop_receiver() call. I would guess both, so I let it be.

The effect of this is that /dev/ttyUSB* is still in use, and thus leaking
at least dev nodes. This is a minimal patch that solves that oops.
--- linux-3.12/drivers/net/irda/irtty-sir.c.orig	2014-02-12 21:36:46.132496089 +0000
+++ linux-3.12/drivers/net/irda/irtty-sir.c	2014-02-12 21:57:21.635843884 +0000
@@ -521,7 +521,6 @@ static void irtty_close(struct tty_struc
  	sirdev_put_instance(priv->dev);
  
  	/* Stop tty */
-	irtty_stop_receiver(tty, TRUE);
  	clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags);
  	if (tty->ops->stop)
  		tty->ops->stop(tty);

^ permalink raw reply

* Re: [PATCH net] net: Clear local_df only if crossing namespace.
From: Nicolas Dichtel @ 2014-02-13  8:50 UTC (permalink / raw)
  To: Pravin Shelar
  Cc: David Miller, netdev, Templin, Fred L, Steffen Klassert,
	Hannes Frederic Sowa
In-Reply-To: <CALnjE+ogMJ=axPo6Xq3a7xO2W5fLY1utwcACRdeoCPgYv4EQsg@mail.gmail.com>

Le 12/02/2014 18:05, Pravin Shelar a écrit :
> On Wed, Feb 12, 2014 at 1:32 AM, Nicolas Dichtel
> <nicolas.dichtel@6wind.com> wrote:
>> Le 12/02/2014 05:26, Pravin Shelar a écrit :
>>
>>> On Mon, Feb 10, 2014 at 6:11 PM, Hannes Frederic Sowa
>>> <hannes@stressinduktion.org> wrote:
>>>>
>>>> On Mon, Feb 10, 2014 at 01:00:14PM -0800, Pravin Shelar wrote:
>>>>>
>>>>> On Fri, Feb 7, 2014 at 4:58 PM, Hannes Frederic Sowa
>>>>> <hannes@stressinduktion.org> wrote:
>>>>>>
>>>>>> May I know because of wich vport, vxlan or gre, you did this change?
>>>>>>
>>>>> It affects both gre and vxlan.
>>>>
>>>>
>>>> Ok, thanks.
>>>>
>>>>>> I am feeling a bit uncomfortable handling remote and local packets that
>>>>>> differently on lower tunnel output (local_df is mostly set on locally
>>>>>> originating packets).
>>>>>
>>>>>
>>>>> For ip traffic it make sense to turn on local_df only for local
>>>>> traffic, since for remote case we can send icmp (frag-needed) back to
>>>>> source. No such thing exist for OVS tunnels. ICMP packet are not
>>>>> returned to source for the tunnels. That is why to be on safe side,
>>>>> local_df is turned on for tunnels in OVS.
>>>>
>>>>
>>>> I have a proposal:
>>>>
>>>> I don't like it that much because of the many arguments. But I currently
>>>> don't see another easy solution. Maybe we should make bool xnet an enum
>>>> and
>>>> test with bitops?
>>>>
>>>> I left the clearing of local_df in skb_scrub_packet as we need it for the
>>>> dev_forward_skb case and it should be done that in any case.
>>>>
>>>> This diff is slightly compile tested. ;)
>>>>
>>>> I can test and make proper submit if you agree.
>>>>
>>>> What do you think?
>>>>
>>>
>>> I am not sure why the caller can not just set skb->local_df before
>>> calling iptunnel_xmit() rather than passing extra arg to this
>>> function?
>>> There are not that many caller of this function.
>>
>> The benefit is that it ensures that future callers will think about this
>> point
>> ;-)
>>
>
> But that add extra test cases in fast path.
> For example OVS we can not reset skb->mark in skb_scrub_packet(). I am
> going to send patch for that too. Do you think I should add another
> argument for skb-mark clear too ?
Maybe this will be the same argument than local_df: 'bool ovs' (probably find a
better name ;-))

^ permalink raw reply

* [PATCH v2 net-next] ipv4: ip_forward: perform skb->pkt_type check at the beginning
From: Denis Kirjanov @ 2014-02-13  4:58 UTC (permalink / raw)
  To: netdev, davem; +Cc: Denis Kirjanov

Packets which have L2 address different from ours should be
already filtered before entering into ip_forward().

Perform that check at the beginning to avoid processing such packets.

Signed-off-by: Denis Kirjanov <kda@linux-powerpc.org>
---
v1 -> v2: Fixed identation
---
 net/ipv4/ip_forward.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index e9f1217..d9d9290 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -59,6 +59,10 @@ int ip_forward(struct sk_buff *skb)
 	struct rtable *rt;	/* Route we use */
 	struct ip_options *opt	= &(IPCB(skb)->opt);
 
+	/* that should never happen */
+	if (skb->pkt_type != PACKET_HOST)
+		goto drop;
+
 	if (skb_warn_if_lro(skb))
 		goto drop;
 
@@ -68,9 +72,6 @@ int ip_forward(struct sk_buff *skb)
 	if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
 		return NET_RX_SUCCESS;
 
-	if (skb->pkt_type != PACKET_HOST)
-		goto drop;
-
 	skb_forward_csum(skb);
 
 	/*
-- 
1.8.0.2

^ permalink raw reply related

* [PATCH v2 net-next] net: remove useless if check from register_netdevice()
From: Denis Kirjanov @ 2014-02-13  4:47 UTC (permalink / raw)
  To: davem, netdev; +Cc: Denis Kirjanov

remove useless if check from register_netdevice()

Signed-off-by: Denis Kirjanov <kda@linux-powerpc.org>
---
v1 -> v2: Fixed identation
---
 net/core/dev.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 4ad1b78..21a72ad 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5876,8 +5876,7 @@ int register_netdevice(struct net_device *dev)
 	if (dev->netdev_ops->ndo_init) {
 		ret = dev->netdev_ops->ndo_init(dev);
 		if (ret) {
-			if (ret > 0)
-				ret = -EIO;
+			ret = -EIO;
 			goto out;
 		}
 	}
-- 
1.8.0.2

^ permalink raw reply related

* Re: [PATCH 2/3] net: UDP gro_receive accept csum=0
From: Or Gerlitz @ 2014-02-13  8:06 UTC (permalink / raw)
  To: Tom Herbert, davem, netdev, Joseph Gasparakis; +Cc: Jerry Chu, Eric Dumazet
In-Reply-To: <alpine.DEB.2.02.1402110928070.7090@tomh.mtv.corp.google.com>

On 11/02/2014 19:43, Tom Herbert wrote:
> The code to validate checksum in UDP gro_receive explictly checks
> against driver having set CHECKSUM_COMPLETE. This does not perform
> GRO on UDP packets with a checksum of zero (no checksum needed).
> This patch adds the condition to allow UDP checksum to be zero.
> Signed-off-by: Tom Herbert <therbert@google.com>
> ---
>   net/ipv4/udp_offload.c | 13 ++++++++-----
>   1 file changed, 8 insertions(+), 5 deletions(-)
>
> diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
> index 25f5cee..4db7796 100644
> --- a/net/ipv4/udp_offload.c
> +++ b/net/ipv4/udp_offload.c
> @@ -156,13 +156,9 @@ static struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *s
>   	unsigned int hlen, off;
>   	int flush = 1;
>   
> -	if (NAPI_GRO_CB(skb)->udp_mark ||
> -	    (!skb->encapsulation && skb->ip_summed != CHECKSUM_COMPLETE))
> +	if (NAPI_GRO_CB(skb)->udp_mark)
>   		goto out;
>   
> -	/* mark that this skb passed once through the udp gro layer */
> -	NAPI_GRO_CB(skb)->udp_mark = 1;
> -
>   	off  = skb_gro_offset(skb);
>   	hlen = off + sizeof(*uh);
>   	uh   = skb_gro_header_fast(skb, off);
> @@ -172,6 +168,13 @@ static struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *s
>   			goto out;
>   	}
>   
> +	if (!skb->encapsulation &&
> +	    skb->ip_summed != CHECKSUM_COMPLETE && uh->check != 0)
> +		goto out;
> +
> +	/* mark that this skb passed once through the udp gro layer */
> +	NAPI_GRO_CB(skb)->udp_mark = 1;
> +

Hi Tom,

Considering the patch just "as is" vs. the current code, its OK.

However, as skbs have only one indicator for the status of the checksum 
checks done by the receiving hardware, the basic assertion I thought we 
needed here is to reject skb if either it has the udp mark set or the 
encapsulation field is false, this is according to the conventions set 
by these two commits

0afb166 vxlan: Add capability of Rx checksum offload for inner packet
6a674e9 net: Add support for hardware-offloaded encapsulation

B/c after finalizing the GRO work and decapsulating, skb injected up 
into the TCP stack with ip_summed equals to CHECKSUM_NONE are rejected. 
If this assumption is wrong, maybe we can remove testing the ip_summed 
field here altogether?

Or.

^ permalink raw reply

* [net-next 4/5] ixgbe: Add WoL support for a new device
From: Aaron Brown @ 2014-02-13  8:00 UTC (permalink / raw)
  To: davem; +Cc: Mark Rustad, netdev, gospo, sassmann, Aaron Brown
In-Reply-To: <1392278450-27062-1-git-send-email-aaron.f.brown@intel.com>

From: Mark Rustad <mark.d.rustad@intel.com>

Add WoL support for port 0 of a new 82599-based device.

Signed-off-by: Mark Rustad <mark.d.rustad@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Aaron Brown <aaron.f.brown@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_type.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 84ca49b..e65a5be 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -7800,6 +7800,7 @@ int ixgbe_wol_supported(struct ixgbe_adapter *adapter, u16 device_id,
 	case IXGBE_DEV_ID_82599_SFP:
 		/* Only these subdevices could supports WOL */
 		switch (subdevice_id) {
+		case IXGBE_SUBDEV_ID_82599_SFP_WOL0:
 		case IXGBE_SUBDEV_ID_82599_560FLR:
 			/* only support first port */
 			if (hw->bus.func != 0)
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
index 0d39cfc..9283cff 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
@@ -54,6 +54,7 @@
 #define IXGBE_DEV_ID_82599_BACKPLANE_FCOE       0x152a
 #define IXGBE_DEV_ID_82599_SFP_FCOE      0x1529
 #define IXGBE_SUBDEV_ID_82599_SFP        0x11A9
+#define IXGBE_SUBDEV_ID_82599_SFP_WOL0   0x1071
 #define IXGBE_SUBDEV_ID_82599_RNDC       0x1F72
 #define IXGBE_SUBDEV_ID_82599_560FLR     0x17D0
 #define IXGBE_SUBDEV_ID_82599_SP_560FLR  0x211B
-- 
1.8.5.GIT

^ permalink raw reply related

* [net-next 5/5] ixgbe: fixup warning regarding max_vfs parameter
From: Aaron Brown @ 2014-02-13  8:00 UTC (permalink / raw)
  To: davem; +Cc: Jacob Keller, netdev, gospo, sassmann, Aaron Brown
In-Reply-To: <1392278450-27062-1-git-send-email-aaron.f.brown@intel.com>

From: Jacob Keller <jacob.e.keller@intel.com>

The max_vfs parameter for ixgbe is deprecated in favor of using the
sysfs sriov_numvfs field to assign VFs as needed, instead of fixing the
value at module load time. The current message only indicates that you
should use this, without adequately explaining how to do so.

This patch modifies the warning message to include the command necessary
to spawn VFs via sysfs, and points users to the Documentation for more
information.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Aaron Brown <aaron.f.brown@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index e65a5be..bbd41a1 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -5068,8 +5068,18 @@ static int ixgbe_sw_init(struct ixgbe_adapter *adapter)
 	hw->fc.disable_fc_autoneg = ixgbe_device_supports_autoneg_fc(hw);
 
 #ifdef CONFIG_PCI_IOV
-	if (max_vfs > 0)
-		e_dev_warn("Enabling SR-IOV VFs using the max_vfs module parameter is deprecated - please use the pci sysfs interface instead.\n");
+	if (max_vfs > 0) {
+		e_dev_warn("Enabling SR-IOV VFs using the max_vfs module parameter is deprecated.\n");
+		e_dev_warn("Please use the pci sysfs interface instead. Ex:\n");
+		e_dev_warn("echo '%d' > /sys/bus/pci/devices/%04x:%02x:%02x.%1x/sriov_numvfs\n",
+			   max_vfs,
+			   pci_domain_nr(pdev->bus),
+			   pdev->bus->number,
+			   PCI_SLOT(pdev->devfn),
+			   PCI_FUNC(pdev->devfn)
+			   );
+		e_dev_warn("See 'Documentation/PCI/pci-iov-howto.txt for more information.\n");
+	}
 
 	/* assign number of SR-IOV VFs */
 	if (hw->mac.type != ixgbe_mac_82598EB) {
-- 
1.8.5.GIT

^ permalink raw reply related

* [net-next 3/5] ixgbe: don't use magic size number to assign ptp_caps.name
From: Aaron Brown @ 2014-02-13  8:00 UTC (permalink / raw)
  To: davem; +Cc: Jacob Keller, netdev, gospo, sassmann, Aaron Brown
In-Reply-To: <1392278450-27062-1-git-send-email-aaron.f.brown@intel.com>

From: Jacob Keller <jacob.e.keller@intel.com>

Rather than using a magic size number, just use sizeof since that will
work and is more robust to future changes.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Aaron Brown <aaron.f.brown@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
index 95ed8ea..5697214 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
@@ -849,7 +849,9 @@ void ixgbe_ptp_init(struct ixgbe_adapter *adapter)
 
 	switch (adapter->hw.mac.type) {
 	case ixgbe_mac_X540:
-		snprintf(adapter->ptp_caps.name, 16, "%s", netdev->name);
+		snprintf(adapter->ptp_caps.name,
+			 sizeof(adapter->ptp_caps.name),
+			 "%s", netdev->name);
 		adapter->ptp_caps.owner = THIS_MODULE;
 		adapter->ptp_caps.max_adj = 250000000;
 		adapter->ptp_caps.n_alarm = 0;
@@ -863,7 +865,9 @@ void ixgbe_ptp_init(struct ixgbe_adapter *adapter)
 		adapter->ptp_caps.enable = ixgbe_ptp_enable;
 		break;
 	case ixgbe_mac_82599EB:
-		snprintf(adapter->ptp_caps.name, 16, "%s", netdev->name);
+		snprintf(adapter->ptp_caps.name,
+			 sizeof(adapter->ptp_caps.name),
+			 "%s", netdev->name);
 		adapter->ptp_caps.owner = THIS_MODULE;
 		adapter->ptp_caps.max_adj = 250000000;
 		adapter->ptp_caps.n_alarm = 0;
-- 
1.8.5.GIT

^ permalink raw reply related

* [net-next 2/5] ixgbe: implement SIOCGHWTSTAMP ioctl
From: Aaron Brown @ 2014-02-13  8:00 UTC (permalink / raw)
  To: davem; +Cc: Jacob Keller, netdev, gospo, sassmann, Aaron Brown
In-Reply-To: <1392278450-27062-1-git-send-email-aaron.f.brown@intel.com>

From: Jacob Keller <jacob.e.keller@intel.com>

This patch adds support for the new SIOCGHWTSTAMP ioctl, which enables
a process to determine the current timestamp configuration. In order to
implement this, store a copy of the timestamp configuration. In addition,
we can remove the 'int cmd' parameter as the new set_ts_config function
doesn't use it. I also fixed a typo in the function description.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Aaron Brown <aaron.f.brown@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h      |  5 ++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  4 ++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c  | 35 +++++++++++++++++----------
 3 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 0186ea2..d7c7f13 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -765,6 +765,7 @@ struct ixgbe_adapter {
 	struct ptp_clock_info ptp_caps;
 	struct work_struct ptp_tx_work;
 	struct sk_buff *ptp_tx_skb;
+	struct hwtstamp_config tstamp_config;
 	unsigned long ptp_tx_start;
 	unsigned long last_overflow_check;
 	unsigned long last_rx_ptp_check;
@@ -958,8 +959,8 @@ static inline void ixgbe_ptp_rx_hwtstamp(struct ixgbe_ring *rx_ring,
 	rx_ring->last_rx_timestamp = jiffies;
 }
 
-int ixgbe_ptp_hwtstamp_ioctl(struct ixgbe_adapter *adapter, struct ifreq *ifr,
-			     int cmd);
+int ixgbe_ptp_set_ts_config(struct ixgbe_adapter *adapter, struct ifreq *ifr);
+int ixgbe_ptp_get_ts_config(struct ixgbe_adapter *adapter, struct ifreq *ifr);
 void ixgbe_ptp_start_cyclecounter(struct ixgbe_adapter *adapter);
 void ixgbe_ptp_reset(struct ixgbe_adapter *adapter);
 void ixgbe_ptp_check_pps_event(struct ixgbe_adapter *adapter, u32 eicr);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 7824559..84ca49b 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -7149,7 +7149,9 @@ static int ixgbe_ioctl(struct net_device *netdev, struct ifreq *req, int cmd)
 
 	switch (cmd) {
 	case SIOCSHWTSTAMP:
-		return ixgbe_ptp_hwtstamp_ioctl(adapter, req, cmd);
+		return ixgbe_ptp_set_ts_config(adapter, req);
+	case SIOCGHWTSTAMP:
+		return ixgbe_ptp_get_ts_config(adapter, req);
 	default:
 		return mdio_mii_ioctl(&adapter->hw.phy.mdio, if_mii(req), cmd);
 	}
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
index 5184e2a..95ed8ea 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
@@ -576,14 +576,21 @@ void __ixgbe_ptp_rx_hwtstamp(struct ixgbe_q_vector *q_vector,
 	shhwtstamps->hwtstamp = ns_to_ktime(ns);
 }
 
+int ixgbe_ptp_get_ts_config(struct ixgbe_adapter *adapter, struct ifreq *ifr)
+{
+	struct hwtstamp_config *config = &adapter->tstamp_config;
+
+	return copy_to_user(ifr->ifr_data, config,
+			    sizeof(*config)) ? -EFAULT : 0;
+}
+
 /**
- * ixgbe_ptp_hwtstamp_ioctl - control hardware time stamping
+ * ixgbe_ptp_set_ts_config - control hardware time stamping
  * @adapter: pointer to adapter struct
  * @ifreq: ioctl data
- * @cmd: particular ioctl requested
  *
  * Outgoing time stamping can be enabled and disabled. Play nice and
- * disable it when requested, although it shouldn't case any overhead
+ * disable it when requested, although it shouldn't cause any overhead
  * when no packet needs it. At most one packet in the queue may be
  * marked for time stamping, otherwise it would be impossible to tell
  * for sure to which packet the hardware time stamp belongs.
@@ -599,25 +606,24 @@ void __ixgbe_ptp_rx_hwtstamp(struct ixgbe_q_vector *q_vector,
  * Event mode. This more accurately tells the user what the hardware is going
  * to do anyways.
  */
-int ixgbe_ptp_hwtstamp_ioctl(struct ixgbe_adapter *adapter,
-			     struct ifreq *ifr, int cmd)
+int ixgbe_ptp_set_ts_config(struct ixgbe_adapter *adapter, struct ifreq *ifr)
 {
 	struct ixgbe_hw *hw = &adapter->hw;
-	struct hwtstamp_config config;
+	struct hwtstamp_config *config = &adapter->tstamp_config;
 	u32 tsync_tx_ctl = IXGBE_TSYNCTXCTL_ENABLED;
 	u32 tsync_rx_ctl = IXGBE_TSYNCRXCTL_ENABLED;
 	u32 tsync_rx_mtrl = PTP_EV_PORT << 16;
 	bool is_l2 = false;
 	u32 regval;
 
-	if (copy_from_user(&config, ifr->ifr_data, sizeof(config)))
+	if (copy_from_user(config, ifr->ifr_data, sizeof(*config)))
 		return -EFAULT;
 
 	/* reserved for future extensions */
-	if (config.flags)
+	if (config->flags)
 		return -EINVAL;
 
-	switch (config.tx_type) {
+	switch (config->tx_type) {
 	case HWTSTAMP_TX_OFF:
 		tsync_tx_ctl = 0;
 	case HWTSTAMP_TX_ON:
@@ -626,7 +632,7 @@ int ixgbe_ptp_hwtstamp_ioctl(struct ixgbe_adapter *adapter,
 		return -ERANGE;
 	}
 
-	switch (config.rx_filter) {
+	switch (config->rx_filter) {
 	case HWTSTAMP_FILTER_NONE:
 		tsync_rx_ctl = 0;
 		tsync_rx_mtrl = 0;
@@ -650,7 +656,7 @@ int ixgbe_ptp_hwtstamp_ioctl(struct ixgbe_adapter *adapter,
 	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
 		tsync_rx_ctl |= IXGBE_TSYNCRXCTL_TYPE_EVENT_V2;
 		is_l2 = true;
-		config.rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;
+		config->rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;
 		break;
 	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
 	case HWTSTAMP_FILTER_ALL:
@@ -661,7 +667,7 @@ int ixgbe_ptp_hwtstamp_ioctl(struct ixgbe_adapter *adapter,
 		 * Delay_Req messages and hardware does not support
 		 * timestamping all packets => return error
 		 */
-		config.rx_filter = HWTSTAMP_FILTER_NONE;
+		config->rx_filter = HWTSTAMP_FILTER_NONE;
 		return -ERANGE;
 	}
 
@@ -702,7 +708,7 @@ int ixgbe_ptp_hwtstamp_ioctl(struct ixgbe_adapter *adapter,
 	regval = IXGBE_READ_REG(hw, IXGBE_TXSTMPH);
 	regval = IXGBE_READ_REG(hw, IXGBE_RXSTMPH);
 
-	return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ?
+	return copy_to_user(ifr->ifr_data, config, sizeof(*config)) ?
 		-EFAULT : 0;
 }
 
@@ -809,6 +815,9 @@ void ixgbe_ptp_reset(struct ixgbe_adapter *adapter)
 	IXGBE_WRITE_REG(hw, IXGBE_SYSTIMH, 0x00000000);
 	IXGBE_WRITE_FLUSH(hw);
 
+	/* Reset the saved tstamp_config */
+	memset(&adapter->tstamp_config, 0, sizeof(adapter->tstamp_config));
+
 	ixgbe_ptp_start_cyclecounter(adapter);
 
 	spin_lock_irqsave(&adapter->tmreg_lock, flags);
-- 
1.8.5.GIT

^ permalink raw reply related

* [net-next 1/5] ixgbe: modify behavior on receiving a HW ECC error.
From: Aaron Brown @ 2014-02-13  8:00 UTC (permalink / raw)
  To: davem; +Cc: Don Skidmore, netdev, gospo, sassmann, Aaron Brown
In-Reply-To: <1392278450-27062-1-git-send-email-aaron.f.brown@intel.com>

From: Don Skidmore <donald.c.skidmore@intel.com>

Currently when we noticed a HW ECC error we would  request the use reload
the driver to force a reset of the part.  This was done due to the mistaken
believe that a normal reset would not be sufficient.  Well it turns out it
would be so now we just schedule a reset upon seeing the ECC.

Signed-off-by: Don Skidmore <donald.c.skidmore@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Aaron Brown <aaron.f.brown@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 6d4ada7..7824559 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -2630,9 +2630,12 @@ static irqreturn_t ixgbe_msix_other(int irq, void *data)
 	switch (hw->mac.type) {
 	case ixgbe_mac_82599EB:
 	case ixgbe_mac_X540:
-		if (eicr & IXGBE_EICR_ECC)
-			e_info(link, "Received unrecoverable ECC Err, please "
-			       "reboot\n");
+		if (eicr & IXGBE_EICR_ECC) {
+			e_info(link, "Received ECC Err, initiating reset\n");
+			adapter->flags2 |= IXGBE_FLAG2_RESET_REQUESTED;
+			ixgbe_service_event_schedule(adapter);
+			IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_ECC);
+		}
 		/* Handle Flow Director Full threshold interrupt */
 		if (eicr & IXGBE_EICR_FLOW_DIR) {
 			int reinit_count = 0;
@@ -2846,9 +2849,12 @@ static irqreturn_t ixgbe_intr(int irq, void *data)
 		ixgbe_check_sfp_event(adapter, eicr);
 		/* Fall through */
 	case ixgbe_mac_X540:
-		if (eicr & IXGBE_EICR_ECC)
-			e_info(link, "Received unrecoverable ECC err, please "
-				     "reboot\n");
+		if (eicr & IXGBE_EICR_ECC) {
+			e_info(link, "Received ECC Err, initiating reset\n");
+			adapter->flags2 |= IXGBE_FLAG2_RESET_REQUESTED;
+			ixgbe_service_event_schedule(adapter);
+			IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_ECC);
+		}
 		ixgbe_check_overtemp_event(adapter, eicr);
 		break;
 	default:
-- 
1.8.5.GIT

^ permalink raw reply related

* [net-next 0/5] Intel Wired LAN Driver Updates
From: Aaron Brown @ 2014-02-13  8:00 UTC (permalink / raw)
  To: davem; +Cc: Aaron Brown, netdev, gospo, sassmann

This series contains updates to the ixgbe driver.

Don makes recovery from a HW ECC error just schedule a reset as it turns
out the previous behaviour of forcing the user to reload is not necessary.

Jacob adds support for the new SIOCGHWTSTAMP ioctl, removes a magic number
from the ptp_caps.name assignment and makes a warning message much more
useful.

Mark add WOL support for a new part.

Don Skidmore (1):
  ixgbe: modify behavior on receiving a HW ECC error.

Jacob Keller (3):
  ixgbe: implement SIOCGHWTSTAMP ioctl
  ixgbe: don't use magic size number to assign ptp_caps.name
  ixgbe: fixup warning regarding max_vfs parameter

Mark Rustad (1):
  ixgbe: Add WoL support for a new device

 drivers/net/ethernet/intel/ixgbe/ixgbe.h      |  5 ++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 37 +++++++++++++++++------
 drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c  | 43 +++++++++++++++++----------
 drivers/net/ethernet/intel/ixgbe/ixgbe_type.h |  1 +
 4 files changed, 60 insertions(+), 26 deletions(-)

-- 
1.8.5.GIT

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox