* [PATCH v6 1/4] net: macb: add phylink support
From: Parshuram Thombare @ 2019-07-10 14:37 UTC (permalink / raw)
To: andrew, nicolas.ferre, davem, f.fainelli
Cc: linux, netdev, hkallweit1, linux-kernel, rafalc, piotrs, aniljoy,
arthurm, stevenh, pthombar, mparab
In-Reply-To: <1562769391-31803-1-git-send-email-pthombar@cadence.com>
This patch replaces phylib APIs with phylink APIs.
Signed-off-by: Parshuram Thombare <pthombar@cadence.com>
---
drivers/net/ethernet/cadence/Kconfig | 2 +-
drivers/net/ethernet/cadence/macb.h | 3 +
drivers/net/ethernet/cadence/macb_main.c | 332 +++++++++++++----------
3 files changed, 187 insertions(+), 150 deletions(-)
diff --git a/drivers/net/ethernet/cadence/Kconfig b/drivers/net/ethernet/cadence/Kconfig
index f4b3bd85dfe3..53b50c24d9c9 100644
--- a/drivers/net/ethernet/cadence/Kconfig
+++ b/drivers/net/ethernet/cadence/Kconfig
@@ -22,7 +22,7 @@ if NET_VENDOR_CADENCE
config MACB
tristate "Cadence MACB/GEM support"
depends on HAS_DMA && COMMON_CLK
- select PHYLIB
+ select PHYLINK
---help---
The Cadence MACB ethernet interface is found on many Atmel AT32 and
AT91 parts. This driver also supports the Cadence GEM (Gigabit
diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h
index 03983bd46eef..a4007057b35e 100644
--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
@@ -11,6 +11,7 @@
#include <linux/ptp_clock_kernel.h>
#include <linux/net_tstamp.h>
#include <linux/interrupt.h>
+#include <linux/phylink.h>
#if defined(CONFIG_ARCH_DMA_ADDR_T_64BIT) || defined(CONFIG_MACB_USE_HWSTAMP)
#define MACB_EXT_DESC
@@ -1232,6 +1233,8 @@ struct macb {
u32 rx_intr_mask;
struct macb_pm_data pm_data;
+ struct phylink *pl;
+ struct phylink_config pl_config;
};
#ifdef CONFIG_MACB_USE_HWSTAMP
diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index 5ca17e62dc3e..ce064eb9252a 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -36,6 +36,7 @@
#include <linux/tcp.h>
#include <linux/iopoll.h>
#include <linux/pm_runtime.h>
+#include <linux/phylink.h>
#include "macb.h"
/* This structure is only used for MACB on SiFive FU540 devices */
@@ -433,115 +434,160 @@ static void macb_set_tx_clk(struct clk *clk, int speed, struct net_device *dev)
netdev_err(dev, "adjusting tx_clk failed.\n");
}
-static void macb_handle_link_change(struct net_device *dev)
+static void gem_phylink_validate(struct phylink_config *pl_config,
+ unsigned long *supported,
+ struct phylink_link_state *state)
{
- struct macb *bp = netdev_priv(dev);
- struct phy_device *phydev = dev->phydev;
+ struct net_device *netdev = to_net_dev(pl_config->dev);
+ struct macb *bp = netdev_priv(netdev);
+ __ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
+
+ switch (state->interface) {
+ case PHY_INTERFACE_MODE_GMII:
+ case PHY_INTERFACE_MODE_RGMII:
+ if (!macb_is_gem(bp))
+ goto empty_set;
+ break;
+ default:
+ break;
+ }
+
+ switch (state->interface) {
+ case PHY_INTERFACE_MODE_GMII:
+ case PHY_INTERFACE_MODE_RGMII:
+ if (bp->caps & MACB_CAPS_GIGABIT_MODE_AVAILABLE) {
+ phylink_set(mask, 1000baseT_Full);
+ phylink_set(mask, 1000baseX_Full);
+ if (!(bp->caps & MACB_CAPS_NO_GIGABIT_HALF))
+ phylink_set(mask, 1000baseT_Half);
+ }
+ /* fallthrough */
+ case PHY_INTERFACE_MODE_MII:
+ case PHY_INTERFACE_MODE_RMII:
+ phylink_set(mask, 10baseT_Half);
+ phylink_set(mask, 10baseT_Full);
+ phylink_set(mask, 100baseT_Half);
+ phylink_set(mask, 100baseT_Full);
+ break;
+ default:
+ goto empty_set;
+ }
+
+ linkmode_and(supported, supported, mask);
+ linkmode_and(state->advertising, state->advertising, mask);
+ return;
+
+empty_set:
+ linkmode_zero(supported);
+}
+
+static int gem_phylink_mac_link_state(struct phylink_config *pl_config,
+ struct phylink_link_state *state)
+{
+ return -EOPNOTSUPP;
+}
+
+static void gem_mac_config(struct phylink_config *pl_config, unsigned int mode,
+ const struct phylink_link_state *state)
+{
+ struct net_device *netdev = to_net_dev(pl_config->dev);
+ struct macb *bp = netdev_priv(netdev);
+ bool change_interface = bp->phy_interface != state->interface;
unsigned long flags;
- int status_change = 0;
spin_lock_irqsave(&bp->lock, flags);
- if (phydev->link) {
- if ((bp->speed != phydev->speed) ||
- (bp->duplex != phydev->duplex)) {
- u32 reg;
+ if (change_interface)
+ bp->phy_interface = state->interface;
- reg = macb_readl(bp, NCFGR);
- reg &= ~(MACB_BIT(SPD) | MACB_BIT(FD));
- if (macb_is_gem(bp))
- reg &= ~GEM_BIT(GBE);
+ if (!phylink_autoneg_inband(mode) &&
+ (bp->speed != state->speed ||
+ bp->duplex != state->duplex)) {
+ u32 reg;
- if (phydev->duplex)
- reg |= MACB_BIT(FD);
- if (phydev->speed == SPEED_100)
- reg |= MACB_BIT(SPD);
- if (phydev->speed == SPEED_1000 &&
- bp->caps & MACB_CAPS_GIGABIT_MODE_AVAILABLE)
- reg |= GEM_BIT(GBE);
-
- macb_or_gem_writel(bp, NCFGR, reg);
+ reg = macb_readl(bp, NCFGR);
+ reg &= ~(MACB_BIT(SPD) | MACB_BIT(FD));
+ if (macb_is_gem(bp))
+ reg &= ~GEM_BIT(GBE);
+ if (state->duplex)
+ reg |= MACB_BIT(FD);
- bp->speed = phydev->speed;
- bp->duplex = phydev->duplex;
- status_change = 1;
+ switch (state->speed) {
+ case SPEED_1000:
+ reg |= GEM_BIT(GBE);
+ break;
+ case SPEED_100:
+ reg |= MACB_BIT(SPD);
+ break;
+ default:
+ break;
}
- }
+ macb_or_gem_writel(bp, NCFGR, reg);
- if (phydev->link != bp->link) {
- if (!phydev->link) {
- bp->speed = 0;
- bp->duplex = -1;
- }
- bp->link = phydev->link;
+ bp->speed = state->speed;
+ bp->duplex = state->duplex;
- status_change = 1;
+ if (state->link)
+ macb_set_tx_clk(bp->tx_clk, state->speed, netdev);
}
spin_unlock_irqrestore(&bp->lock, flags);
+}
- if (status_change) {
- if (phydev->link) {
- /* Update the TX clock rate if and only if the link is
- * up and there has been a link change.
- */
- macb_set_tx_clk(bp->tx_clk, phydev->speed, dev);
+static void gem_mac_link_up(struct phylink_config *pl_config, unsigned int mode,
+ phy_interface_t interface, struct phy_device *phy)
+{
+ struct net_device *netdev = to_net_dev(pl_config->dev);
+ struct macb *bp = netdev_priv(netdev);
- netif_carrier_on(dev);
- netdev_info(dev, "link up (%d/%s)\n",
- phydev->speed,
- phydev->duplex == DUPLEX_FULL ?
- "Full" : "Half");
- } else {
- netif_carrier_off(dev);
- netdev_info(dev, "link down\n");
- }
- }
+ bp->link = 1;
+ /* Enable TX and RX */
+ macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(RE) | MACB_BIT(TE));
+}
+
+static void gem_mac_link_down(struct phylink_config *pl_config,
+ unsigned int mode, phy_interface_t interface)
+{
+ struct net_device *netdev = to_net_dev(pl_config->dev);
+ struct macb *bp = netdev_priv(netdev);
+
+ bp->link = 0;
+ /* Disable TX and RX */
+ macb_writel(bp, NCR,
+ macb_readl(bp, NCR) & ~(MACB_BIT(RE) | MACB_BIT(TE)));
}
+static const struct phylink_mac_ops gem_phylink_ops = {
+ .validate = gem_phylink_validate,
+ .mac_link_state = gem_phylink_mac_link_state,
+ .mac_config = gem_mac_config,
+ .mac_link_up = gem_mac_link_up,
+ .mac_link_down = gem_mac_link_down,
+};
+
/* based on au1000_eth. c*/
-static int macb_mii_probe(struct net_device *dev)
+static int macb_mii_probe(struct net_device *dev, phy_interface_t phy_mode)
{
struct macb *bp = netdev_priv(dev);
struct phy_device *phydev;
struct device_node *np;
- int ret, i;
+ int ret;
np = bp->pdev->dev.of_node;
ret = 0;
- if (np) {
- if (of_phy_is_fixed_link(np)) {
- bp->phy_node = of_node_get(np);
- } else {
- bp->phy_node = of_parse_phandle(np, "phy-handle", 0);
- /* fallback to standard phy registration if no
- * phy-handle was found nor any phy found during
- * dt phy registration
- */
- if (!bp->phy_node && !phy_find_first(bp->mii_bus)) {
- for (i = 0; i < PHY_MAX_ADDR; i++) {
- phydev = mdiobus_scan(bp->mii_bus, i);
- if (IS_ERR(phydev) &&
- PTR_ERR(phydev) != -ENODEV) {
- ret = PTR_ERR(phydev);
- break;
- }
- }
-
- if (ret)
- return -ENODEV;
- }
- }
+ bp->pl_config.dev = &dev->dev;
+ bp->pl_config.type = PHYLINK_NETDEV;
+ bp->pl = phylink_create(&bp->pl_config, of_fwnode_handle(np),
+ phy_mode, &gem_phylink_ops);
+ if (IS_ERR(bp->pl)) {
+ netdev_err(dev,
+ "error creating PHYLINK: %ld\n", PTR_ERR(bp->pl));
+ return PTR_ERR(bp->pl);
}
- if (bp->phy_node) {
- phydev = of_phy_connect(dev, bp->phy_node,
- &macb_handle_link_change, 0,
- bp->phy_interface);
- if (!phydev)
- return -ENODEV;
- } else {
+ ret = phylink_of_phy_connect(bp->pl, np, 0);
+ if (ret == -ENODEV && bp->mii_bus) {
phydev = phy_find_first(bp->mii_bus);
if (!phydev) {
netdev_err(dev, "no PHY found\n");
@@ -549,32 +595,22 @@ static int macb_mii_probe(struct net_device *dev)
}
/* attach the mac to the phy */
- ret = phy_connect_direct(dev, phydev, &macb_handle_link_change,
- bp->phy_interface);
+ ret = phylink_connect_phy(bp->pl, phydev);
if (ret) {
netdev_err(dev, "Could not attach to PHY\n");
return ret;
}
}
- /* mask with MAC supported features */
- if (macb_is_gem(bp) && bp->caps & MACB_CAPS_GIGABIT_MODE_AVAILABLE)
- phy_set_max_speed(phydev, SPEED_1000);
- else
- phy_set_max_speed(phydev, SPEED_100);
-
- if (bp->caps & MACB_CAPS_NO_GIGABIT_HALF)
- phy_remove_link_mode(phydev,
- ETHTOOL_LINK_MODE_1000baseT_Half_BIT);
-
bp->link = 0;
- bp->speed = 0;
- bp->duplex = -1;
+ bp->speed = SPEED_UNKNOWN;
+ bp->duplex = DUPLEX_UNKNOWN;
+ bp->phy_interface = PHY_INTERFACE_MODE_MAX;
- return 0;
+ return ret;
}
-static int macb_mii_init(struct macb *bp)
+static int macb_mii_init(struct macb *bp, phy_interface_t phy_mode)
{
struct device_node *np;
int err = -ENXIO;
@@ -599,22 +635,12 @@ static int macb_mii_init(struct macb *bp)
dev_set_drvdata(&bp->dev->dev, bp->mii_bus);
np = bp->pdev->dev.of_node;
- if (np && of_phy_is_fixed_link(np)) {
- if (of_phy_register_fixed_link(np) < 0) {
- dev_err(&bp->pdev->dev,
- "broken fixed-link specification %pOF\n", np);
- goto err_out_free_mdiobus;
- }
-
- err = mdiobus_register(bp->mii_bus);
- } else {
- err = of_mdiobus_register(bp->mii_bus, np);
- }
+ err = of_mdiobus_register(bp->mii_bus, np);
if (err)
goto err_out_free_fixed_link;
- err = macb_mii_probe(bp->dev);
+ err = macb_mii_probe(bp->dev, phy_mode);
if (err)
goto err_out_unregister_bus;
@@ -625,7 +651,6 @@ static int macb_mii_init(struct macb *bp)
err_out_free_fixed_link:
if (np && of_phy_is_fixed_link(np))
of_phy_deregister_fixed_link(np);
-err_out_free_mdiobus:
of_node_put(bp->phy_node);
mdiobus_free(bp->mii_bus);
err_out:
@@ -2418,12 +2443,6 @@ static int macb_open(struct net_device *dev)
/* carrier starts down */
netif_carrier_off(dev);
- /* if the phy is not yet register, retry later*/
- if (!dev->phydev) {
- err = -EAGAIN;
- goto pm_exit;
- }
-
/* RX buffers initialization */
macb_init_rx_buffer_size(bp, bufsz);
@@ -2441,7 +2460,7 @@ static int macb_open(struct net_device *dev)
macb_init_hw(bp);
/* schedule a link state check */
- phy_start(dev->phydev);
+ phylink_start(bp->pl);
netif_tx_start_all_queues(dev);
@@ -2468,8 +2487,7 @@ static int macb_close(struct net_device *dev)
for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue)
napi_disable(&queue->napi);
- if (dev->phydev)
- phy_stop(dev->phydev);
+ phylink_stop(bp->pl);
spin_lock_irqsave(&bp->lock, flags);
macb_reset_hw(bp);
@@ -3158,6 +3176,23 @@ static int gem_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd)
return ret;
}
+static int gem_ethtool_get_link_ksettings(struct net_device *netdev,
+ struct ethtool_link_ksettings *cmd)
+{
+ struct macb *bp = netdev_priv(netdev);
+
+ return phylink_ethtool_ksettings_get(bp->pl, cmd);
+}
+
+static int
+gem_ethtool_set_link_ksettings(struct net_device *netdev,
+ const struct ethtool_link_ksettings *cmd)
+{
+ struct macb *bp = netdev_priv(netdev);
+
+ return phylink_ethtool_ksettings_set(bp->pl, cmd);
+}
+
static const struct ethtool_ops macb_ethtool_ops = {
.get_regs_len = macb_get_regs_len,
.get_regs = macb_get_regs,
@@ -3165,8 +3200,8 @@ static const struct ethtool_ops macb_ethtool_ops = {
.get_ts_info = ethtool_op_get_ts_info,
.get_wol = macb_get_wol,
.set_wol = macb_set_wol,
- .get_link_ksettings = phy_ethtool_get_link_ksettings,
- .set_link_ksettings = phy_ethtool_set_link_ksettings,
+ .get_link_ksettings = gem_ethtool_get_link_ksettings,
+ .set_link_ksettings = gem_ethtool_set_link_ksettings,
.get_ringparam = macb_get_ringparam,
.set_ringparam = macb_set_ringparam,
};
@@ -3179,8 +3214,8 @@ static const struct ethtool_ops gem_ethtool_ops = {
.get_ethtool_stats = gem_get_ethtool_stats,
.get_strings = gem_get_ethtool_strings,
.get_sset_count = gem_get_sset_count,
- .get_link_ksettings = phy_ethtool_get_link_ksettings,
- .set_link_ksettings = phy_ethtool_set_link_ksettings,
+ .get_link_ksettings = gem_ethtool_get_link_ksettings,
+ .set_link_ksettings = gem_ethtool_set_link_ksettings,
.get_ringparam = macb_get_ringparam,
.set_ringparam = macb_set_ringparam,
.get_rxnfc = gem_get_rxnfc,
@@ -3189,17 +3224,13 @@ static const struct ethtool_ops gem_ethtool_ops = {
static int macb_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
{
- struct phy_device *phydev = dev->phydev;
struct macb *bp = netdev_priv(dev);
if (!netif_running(dev))
return -EINVAL;
- if (!phydev)
- return -ENODEV;
-
if (!bp->ptp_info)
- return phy_mii_ioctl(phydev, rq, cmd);
+ return phylink_mii_ioctl(bp->pl, rq, cmd);
switch (cmd) {
case SIOCSHWTSTAMP:
@@ -3207,7 +3238,7 @@ static int macb_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
case SIOCGHWTSTAMP:
return bp->ptp_info->get_hwtst(dev, rq);
default:
- return phy_mii_ioctl(phydev, rq, cmd);
+ return phylink_mii_ioctl(bp->pl, rq, cmd);
}
}
@@ -3709,7 +3740,7 @@ static int at91ether_open(struct net_device *dev)
MACB_BIT(HRESP));
/* schedule a link state check */
- phy_start(dev->phydev);
+ phylink_start(lp->pl);
netif_start_queue(dev);
@@ -4182,13 +4213,12 @@ static int macb_probe(struct platform_device *pdev)
struct clk *tsu_clk = NULL;
unsigned int queue_mask, num_queues;
bool native_io;
- struct phy_device *phydev;
struct net_device *dev;
struct resource *regs;
void __iomem *mem;
const char *mac;
struct macb *bp;
- int err, val;
+ int err, val, phy_mode;
regs = platform_get_resource(pdev, IORESOURCE_MEM, 0);
mem = devm_ioremap_resource(&pdev->dev, regs);
@@ -4309,24 +4339,20 @@ static int macb_probe(struct platform_device *pdev)
macb_get_hwaddr(bp);
}
- err = of_get_phy_mode(np);
- if (err < 0)
+ phy_mode = of_get_phy_mode(np);
+ if (phy_mode < 0)
/* not found in DT, MII by default */
- bp->phy_interface = PHY_INTERFACE_MODE_MII;
- else
- bp->phy_interface = err;
+ phy_mode = PHY_INTERFACE_MODE_MII;
/* IP specific init */
err = init(pdev);
if (err)
goto err_out_free_netdev;
- err = macb_mii_init(bp);
+ err = macb_mii_init(bp, phy_mode);
if (err)
goto err_out_free_netdev;
- phydev = dev->phydev;
-
netif_carrier_off(dev);
err = register_netdev(dev);
@@ -4338,8 +4364,6 @@ static int macb_probe(struct platform_device *pdev)
tasklet_init(&bp->hresp_err_tasklet, macb_hresp_error_task,
(unsigned long)bp);
- phy_attached_info(phydev);
-
netdev_info(dev, "Cadence %s rev 0x%08x at 0x%08lx irq %d (%pM)\n",
macb_is_gem(bp) ? "GEM" : "MACB", macb_readl(bp, MID),
dev->base_addr, dev->irq, dev->dev_addr);
@@ -4350,7 +4374,9 @@ static int macb_probe(struct platform_device *pdev)
return 0;
err_out_unregister_mdio:
- phy_disconnect(dev->phydev);
+ rtnl_lock();
+ phylink_disconnect_phy(bp->pl);
+ rtnl_unlock();
mdiobus_unregister(bp->mii_bus);
of_node_put(bp->phy_node);
if (np && of_phy_is_fixed_link(np))
@@ -4384,13 +4410,18 @@ static int macb_remove(struct platform_device *pdev)
if (dev) {
bp = netdev_priv(dev);
- if (dev->phydev)
- phy_disconnect(dev->phydev);
+ if (bp->pl) {
+ rtnl_lock();
+ phylink_disconnect_phy(bp->pl);
+ rtnl_unlock();
+ }
mdiobus_unregister(bp->mii_bus);
if (np && of_phy_is_fixed_link(np))
of_phy_deregister_fixed_link(np);
dev->phydev = NULL;
mdiobus_free(bp->mii_bus);
+ if (bp->pl)
+ phylink_destroy(bp->pl);
unregister_netdev(dev);
pm_runtime_disable(&pdev->dev);
@@ -4433,8 +4464,9 @@ static int __maybe_unused macb_suspend(struct device *dev)
for (q = 0, queue = bp->queues; q < bp->num_queues;
++q, ++queue)
napi_disable(&queue->napi);
- phy_stop(netdev->phydev);
- phy_suspend(netdev->phydev);
+ phylink_stop(bp->pl);
+ if (netdev->phydev)
+ phy_suspend(netdev->phydev);
spin_lock_irqsave(&bp->lock, flags);
macb_reset_hw(bp);
spin_unlock_irqrestore(&bp->lock, flags);
@@ -4482,9 +4514,11 @@ static int __maybe_unused macb_resume(struct device *dev)
for (q = 0, queue = bp->queues; q < bp->num_queues;
++q, ++queue)
napi_enable(&queue->napi);
- phy_resume(netdev->phydev);
- phy_init_hw(netdev->phydev);
- phy_start(netdev->phydev);
+ if (netdev->phydev) {
+ phy_resume(netdev->phydev);
+ phy_init_hw(netdev->phydev);
+ }
+ phylink_start(bp->pl);
}
bp->macbgem_ops.mog_init_rings(bp);
--
2.17.1
^ permalink raw reply related
* Re: [PATCH net-next v6 06/15] ethtool: netlink bitset handling
From: Michal Kubecek @ 2019-07-10 14:37 UTC (permalink / raw)
To: netdev
Cc: Jiri Pirko, David Miller, Jakub Kicinski, Andrew Lunn,
Florian Fainelli, John Linville, Stephen Hemminger, Johannes Berg,
linux-kernel
In-Reply-To: <20190710125943.GC2291@nanopsycho>
On Wed, Jul 10, 2019 at 02:59:43PM +0200, Jiri Pirko wrote:
> Wed, Jul 10, 2019 at 02:38:03PM CEST, mkubecek@suse.cz wrote:
> >On Tue, Jul 09, 2019 at 04:18:17PM +0200, Jiri Pirko wrote:
> >>
> >> I understand. So how about avoiding the bitfield altogether and just
> >> having an array of either bits of strings or combinations?
> >>
> >> ETHTOOL_CMD_SETTINGS_SET (U->K)
> >> ETHTOOL_A_HEADER
> >> ETHTOOL_A_DEV_NAME = "eth3"
> >> ETHTOOL_A_SETTINGS_PRIV_FLAGS
> >> ETHTOOL_A_SETTINGS_PRIV_FLAG
> >> ETHTOOL_A_FLAG_NAME = "legacy-rx"
> >> ETHTOOL_A_FLAG_VALUE (NLA_FLAG)
> >>
> >> or the same with index instead of string
> >>
> >> ETHTOOL_CMD_SETTINGS_SET (U->K)
> >> ETHTOOL_A_HEADER
> >> ETHTOOL_A_DEV_NAME = "eth3"
> >> ETHTOOL_A_SETTINGS_PRIV_FLAGS
> >> ETHTOOL_A_SETTINGS_PRIV_FLAG
> >> ETHTOOL_A_FLAG_INDEX = 0
> >> ETHTOOL_A_FLAG_VALUE (NLA_FLAG)
> >>
> >>
> >> For set you can combine both when you want to set multiple bits:
> >>
> >> ETHTOOL_CMD_SETTINGS_SET (U->K)
> >> ETHTOOL_A_HEADER
> >> ETHTOOL_A_DEV_NAME = "eth3"
> >> ETHTOOL_A_SETTINGS_PRIV_FLAGS
> >> ETHTOOL_A_SETTINGS_PRIV_FLAG
> >> ETHTOOL_A_FLAG_INDEX = 2
> >> ETHTOOL_A_FLAG_VALUE (NLA_FLAG)
> >> ETHTOOL_A_SETTINGS_PRIV_FLAG
> >> ETHTOOL_A_FLAG_INDEX = 8
> >> ETHTOOL_A_FLAG_VALUE (NLA_FLAG)
> >> ETHTOOL_A_SETTINGS_PRIV_FLAG
> >> ETHTOOL_A_FLAG_NAME = "legacy-rx"
> >> ETHTOOL_A_FLAG_VALUE (NLA_FLAG)
> >>
> >>
> >> For get this might be a bit bigger message:
> >>
> >> ETHTOOL_CMD_SETTINGS_GET_REPLY (K->U)
> >> ETHTOOL_A_HEADER
> >> ETHTOOL_A_DEV_NAME = "eth3"
> >> ETHTOOL_A_SETTINGS_PRIV_FLAGS
> >> ETHTOOL_A_SETTINGS_PRIV_FLAG
> >> ETHTOOL_A_FLAG_INDEX = 0
> >> ETHTOOL_A_FLAG_NAME = "legacy-rx"
> >> ETHTOOL_A_FLAG_VALUE (NLA_FLAG)
> >> ETHTOOL_A_SETTINGS_PRIV_FLAG
> >> ETHTOOL_A_FLAG_INDEX = 1
> >> ETHTOOL_A_FLAG_NAME = "vf-ipsec"
> >> ETHTOOL_A_FLAG_VALUE (NLA_FLAG)
> >> ETHTOOL_A_SETTINGS_PRIV_FLAG
> >> ETHTOOL_A_FLAG_INDEX = 8
> >> ETHTOOL_A_FLAG_NAME = "something-else"
> >> ETHTOOL_A_FLAG_VALUE (NLA_FLAG)
> >
> >This is perfect for "one shot" applications but not so much for long
> >running ones, either "ethtool --monitor" or management or monitoring
> >daemons. Repeating the names in every notification message would be
> >a waste, it's much more convenient to load the strings only once and
>
> Yeah, for those applications, the ETHTOOL_A_FLAG_NAME could be omitted
>
>
> >cache them. Even if we omit the names in notifications (and possibly the
> >GET replies if client opts for it), this format still takes 12-16 bytes
> >per bit.
> >
> >So the problem I'm trying to address is that there are two types of
> >clients with very different mode of work and different preferences.
> >
> >Looking at the bitset.c, I would rather say that most of the complexity
> >and ugliness comes from dealing with both unsigned long based bitmaps
> >and u32 based ones. Originally, there were functions working with
> >unsigned long based bitmaps and the variants with "32" suffix were
> >wrappers around them which converted u32 bitmaps to unsigned long ones
> >and back. This became a problem when kernel started issuing warnings
> >about variable length arrays as getting rid of them meant two kmalloc()
> >and two kfree() for each u32 bitmap operation, even if most of the
> >bitmaps are rather short in practice.
> >
> >Maybe the wrapper could do something like
> >
> >int ethnl_put_bitset32(const u32 *value, const u32 *mask,
> > unsigned int size, ...)
> >{
> > unsigned long fixed_value[2], fixed_mask[2];
> > unsigned long *tmp_value = fixed_value;
> > unsigned long *tmp_mask = fixed_mask;
> >
> > if (size > sizeof(fixed_value) * BITS_PER_BYTE) {
> > tmp_value = bitmap_alloc(size);
> > if (!tmp_value)
> > return -ENOMEM;
> > tmp_mask = bitmap_alloc(size);
> > if (!tmp_mask) {
> > kfree(tmp_value);
> > return -ENOMEM;
> > }
> > }
> >
> > bitmap_from_arr32(tmp_value, value, size);
> > bitmap_from_arr32(tmp_mask, mask, size);
> > ret = ethnl_put_bitset(tmp_value, tmp_mask, size, ...);
> >}
> >
> >This way we would make bitset.c code cleaner while avoiding allocating
> >short bitmaps (which is the most common case).
>
> I'm primarily concerned about the uapi. Plus if the uapi approach is united
> for both index and string, we can omit this whole bitset abomination...
I'm afraid I don't understand this comment. Whatever the representation
of bitmaps (both simple bitmaps and value/mask pairs) is going to be, we
will need a function for parsing them (currently ethnl_update_bitset())
and a function for filling them into the message (currently
ethnl_put_bitset()). Unless you are suggesting to write a copy of
essentially the same parser and composer for each of the bitsets (there
are 15 of them already and 4 NLA_BITFIELD32 attributes which I'm
seriously considering to replace with arbitrary length bitsets as well
to make the UAPI as future proof as possible).
After all, what you suggested above is exactly the same structure as my
bitset in verbose form, except you omit size (which is a problem, as
discussed in other part of the thread) and put the contents of BITS
container directly under the main container.
Michal
^ permalink raw reply
* [PATCH v6 0/5] net: macb: cover letter
From: Parshuram Thombare @ 2019-07-10 14:36 UTC (permalink / raw)
To: andrew, nicolas.ferre, davem, f.fainelli
Cc: linux, netdev, hkallweit1, linux-kernel, rafalc, piotrs, aniljoy,
arthurm, stevenh, pthombar, mparab
Hello !
This is the 6th version of the patch set containing the following patches
for Cadence ethernet controller driver.
1. 0001-net-macb-add-phylink-support.patch
Replace phylib API's with phylink API's.
2. 0002-net-macb-add-support-for-sgmii-MAC-PHY-interface.patch
This patch adds support for SGMII mode.
3. 0004-net-macb-add-support-for-c45-PHY.patch
This patch is to support C45 PHY.
4. 0005-net-macb-add-support-for-high-speed-interface
This patch adds support for 10G USXGMII PCS in fixed mode.
Changes in v2:
1. Dropped patch configuring TI PHY DP83867 from
Cadence PCI wrapper driver.
2. Removed code registering emulated PHY for fixed mode.
3. Code reformatting as per Andrew's and Florian's suggestions.
Changes in v3:
Based on Russell's suggestions
1. Configure MAC in mac_config only for non in-band modes
2. Handle dynamic phy_mode changes in mac_config
3. Move MAC configurations to mac_config
4. Removed seemingly redundant check for phylink handle
5. Removed code from mac_an_restart and mac_link_state
now just return -EOPNOTSUPP
Changes in v4:
1. Removed PHY_INTERFACE_MODE_2500BASEX, PHY_INTERFACE_MODE_1000BASEX and
2.5G PHY_INTERFACE_MODE_SGMII phy modes from supported modes
Changes in v5:
1. Code refactoring
Changes in v6:
1. Allow phylink to validate particular phy_mode support by hardware.
2. Remove device tree parameter and 5G serdes rate for USXGMII
Regards,
Parshuram Thombare
Parshuram Thombare (4):
net: macb: add phylink support
net: macb: add support for sgmii MAC-PHY interface
net: macb: add support for c45 PHY
net: macb: add support for high speed interface
drivers/net/ethernet/cadence/Kconfig | 2 +-
drivers/net/ethernet/cadence/macb.h | 115 ++++-
drivers/net/ethernet/cadence/macb_main.c | 543 ++++++++++++++++-------
3 files changed, 483 insertions(+), 177 deletions(-)
--
2.17.1
^ permalink raw reply
* Re: [PATCH] [net-next] davinci_cpdma: don't cast dma_addr_t to pointer
From: Ivan Khoronzhuk @ 2019-07-10 14:26 UTC (permalink / raw)
To: Arnd Bergmann
Cc: David S. Miller, Grygorii Strashko, Andrew Lunn, Ilias Apalodimas,
linux-omap, netdev, linux-kernel
In-Reply-To: <20190710080106.24237-1-arnd@arndb.de>
On Wed, Jul 10, 2019 at 10:00:33AM +0200, Arnd Bergmann wrote:
>dma_addr_t may be 64-bit wide on 32-bit architectures, so it is not
>valid to cast between it and a pointer:
>
>drivers/net/ethernet/ti/davinci_cpdma.c: In function 'cpdma_chan_submit_si':
>drivers/net/ethernet/ti/davinci_cpdma.c:1047:12: error: cast from pointer to integer of different size [-Werror=pointer-to-int-cast]
>drivers/net/ethernet/ti/davinci_cpdma.c: In function 'cpdma_chan_idle_submit_mapped':
>drivers/net/ethernet/ti/davinci_cpdma.c:1114:12: error: cast to pointer from integer of different size [-Werror=int-to-pointer-cast]
>drivers/net/ethernet/ti/davinci_cpdma.c: In function 'cpdma_chan_submit_mapped':
>drivers/net/ethernet/ti/davinci_cpdma.c:1164:12: error: cast to pointer from integer of different size [-Werror=int-to-pointer-cast]
>
>Solve this by using two separate members in 'struct submit_info'.
>Since this avoids the use of the 'flag' member, the structure does
>not even grow in typical configurations.
>
>Fixes: 6670acacd59e ("net: ethernet: ti: davinci_cpdma: add dma mapped submit")
>Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Although "flags" could be used for something else (who knows), this looks OK.
Reviewed-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
--
Regards,
Ivan Khoronzhuk
^ permalink raw reply
* Re: [net][PATCH 5/5] rds: avoid version downgrade to legitimate newer peer connections
From: Yanjun Zhu @ 2019-07-10 14:26 UTC (permalink / raw)
To: Santosh Shilimkar, netdev, davem
In-Reply-To: <1562736764-31752-6-git-send-email-santosh.shilimkar@oracle.com>
On 2019/7/10 13:32, Santosh Shilimkar wrote:
> Connections with legitimate tos values can get into usual connection
> race. It can result in consumer reject. We don't want tos value or
> protocol version to be demoted for such connections otherwise
> peers would end up with different tos values, which can result in
> no connection. Example a peer initiated connection with say
> tos 8 while usual connection racing can get downgraded to tos 0
> which is not desirable.
>
> Patch fixes above issue introduced by commit
> commit d021fabf525f ("rds: rdma: add consumer reject")
>
> Reported-by: Yanjun Zhu <yanjun.zhu@oracle.com>
> Tested-by: Yanjun Zhu <yanjun.zhu@oracle.com>
Thanks. I am OK with this.
Zhu Yanjun
> Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
> ---
> net/rds/rdma_transport.c | 6 ++++--
> 1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
> index 9db455d..ff74c4b 100644
> --- a/net/rds/rdma_transport.c
> +++ b/net/rds/rdma_transport.c
> @@ -117,8 +117,10 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
> ((*err) <= RDS_RDMA_REJ_INCOMPAT))) {
> pr_warn("RDS/RDMA: conn <%pI6c, %pI6c> rejected, dropping connection\n",
> &conn->c_laddr, &conn->c_faddr);
> - conn->c_proposed_version = RDS_PROTOCOL_COMPAT_VERSION;
> - conn->c_tos = 0;
> +
> + if (!conn->c_tos)
> + conn->c_proposed_version = RDS_PROTOCOL_COMPAT_VERSION;
> +
> rds_conn_drop(conn);
> }
> rdsdebug("Connection rejected: %s\n",
^ permalink raw reply
* Re: [net][PATCH 4/5] rds: Return proper "tos" value to user-space
From: Yanjun Zhu @ 2019-07-10 14:25 UTC (permalink / raw)
To: Santosh Shilimkar, netdev, davem
In-Reply-To: <1562736764-31752-5-git-send-email-santosh.shilimkar@oracle.com>
On 2019/7/10 13:32, Santosh Shilimkar wrote:
> From: Gerd Rausch <gerd.rausch@oracle.com>
>
> The proper "tos" value needs to be returned
> to user-space (sockopt RDS_INFO_CONNECTIONS).
>
> Fixes: 3eb450367d08 ("rds: add type of service(tos) infrastructure")
> Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
> Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
Thanks. I am OK with this.
Zhu Yanjun
> Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
> ---
> net/rds/connection.c | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/net/rds/connection.c b/net/rds/connection.c
> index 7ea134f..ed7f213 100644
> --- a/net/rds/connection.c
> +++ b/net/rds/connection.c
> @@ -736,6 +736,7 @@ static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
> cinfo->next_rx_seq = cp->cp_next_rx_seq;
> cinfo->laddr = conn->c_laddr.s6_addr32[3];
> cinfo->faddr = conn->c_faddr.s6_addr32[3];
> + cinfo->tos = conn->c_tos;
> strncpy(cinfo->transport, conn->c_trans->t_name,
> sizeof(cinfo->transport));
> cinfo->flags = 0;
^ permalink raw reply
* Re: [net][PATCH 3/5] rds: Accept peer connection reject messages due to incompatible version
From: Yanjun Zhu @ 2019-07-10 14:24 UTC (permalink / raw)
To: Santosh Shilimkar, netdev, davem
In-Reply-To: <1562736764-31752-4-git-send-email-santosh.shilimkar@oracle.com>
On 2019/7/10 13:32, Santosh Shilimkar wrote:
> From: Gerd Rausch <gerd.rausch@oracle.com>
>
> Prior to
> commit d021fabf525ff ("rds: rdma: add consumer reject")
>
> function "rds_rdma_cm_event_handler_cmn" would always honor a rejected
> connection attempt by issuing a "rds_conn_drop".
>
> The commit mentioned above added a "break", eliminating
> the "fallthrough" case and made the "rds_conn_drop" rather conditional:
>
> Now it only happens if a "consumer defined" reject (i.e. "rdma_reject")
> carries an integer-value of "1" inside "private_data":
>
> if (!conn)
> break;
> err = (int *)rdma_consumer_reject_data(cm_id, event, &len);
> if (!err || (err && ((*err) == RDS_RDMA_REJ_INCOMPAT))) {
> pr_warn("RDS/RDMA: conn <%pI6c, %pI6c> rejected, dropping connection\n",
> &conn->c_laddr, &conn->c_faddr);
> conn->c_proposed_version = RDS_PROTOCOL_COMPAT_VERSION;
> rds_conn_drop(conn);
> }
> rdsdebug("Connection rejected: %s\n",
> rdma_reject_msg(cm_id, event->status));
> break;
> /* FALLTHROUGH */
> A number of issues are worth mentioning here:
> #1) Previous versions of the RDS code simply rejected a connection
> by calling "rdma_reject(cm_id, NULL, 0);"
> So the value of the payload in "private_data" will not be "1",
> but "0".
>
> #2) Now the code has become dependent on host byte order and sizing.
> If one peer is big-endian, the other is little-endian,
> or there's a difference in sizeof(int) (e.g. ILP64 vs LP64),
> the *err check does not work as intended.
>
> #3) There is no check for "len" to see if the data behind *err is even valid.
> Luckily, it appears that the "rdma_reject(cm_id, NULL, 0)" will always
> carry 148 bytes of zeroized payload.
> But that should probably not be relied upon here.
>
> #4) With the added "break;",
> we might as well drop the misleading "/* FALLTHROUGH */" comment.
>
> This commit does _not_ address issue #2, as the sender would have to
> agree on a byte order as well.
>
> Here is the sequence of messages in this observed error-scenario:
> Host-A is pre-QoS changes (excluding the commit mentioned above)
> Host-B is post-QoS changes (including the commit mentioned above)
>
> #1 Host-B
> issues a connection request via function "rds_conn_path_transition"
> connection state transitions to "RDS_CONN_CONNECTING"
>
> #2 Host-A
> rejects the incompatible connection request (from #1)
> It does so by calling "rdma_reject(cm_id, NULL, 0);"
>
> #3 Host-B
> receives an "RDMA_CM_EVENT_REJECTED" event (from #2)
> But since the code is changed in the way described above,
> it won't drop the connection here, simply because "*err == 0".
>
> #4 Host-A
> issues a connection request
>
> #5 Host-B
> receives an "RDMA_CM_EVENT_CONNECT_REQUEST" event
> and ends up calling "rds_ib_cm_handle_connect".
> But since the state is already in "RDS_CONN_CONNECTING"
> (as of #1) it will end up issuing a "rdma_reject" without
> dropping the connection:
> if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
> /* Wait and see - our connect may still be succeeding */
> rds_ib_stats_inc(s_ib_connect_raced);
> }
> goto out;
>
> #6 Host-A
> receives an "RDMA_CM_EVENT_REJECTED" event (from #5),
> drops the connection and tries again (goto #4) until it gives up.
>
> Tested-by: Zhu Yanjun <yanjun.zhu@oracle.com>
Thanks
Zhu Yanjun
> Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
> Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
> ---
> net/rds/rdma_transport.c | 5 +++--
> 1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
> index 46bce83..9db455d 100644
> --- a/net/rds/rdma_transport.c
> +++ b/net/rds/rdma_transport.c
> @@ -112,7 +112,9 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
> if (!conn)
> break;
> err = (int *)rdma_consumer_reject_data(cm_id, event, &len);
> - if (!err || (err && ((*err) == RDS_RDMA_REJ_INCOMPAT))) {
> + if (!err ||
> + (err && len >= sizeof(*err) &&
> + ((*err) <= RDS_RDMA_REJ_INCOMPAT))) {
> pr_warn("RDS/RDMA: conn <%pI6c, %pI6c> rejected, dropping connection\n",
> &conn->c_laddr, &conn->c_faddr);
> conn->c_proposed_version = RDS_PROTOCOL_COMPAT_VERSION;
> @@ -122,7 +124,6 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
> rdsdebug("Connection rejected: %s\n",
> rdma_reject_msg(cm_id, event->status));
> break;
> - /* FALLTHROUGH */
> case RDMA_CM_EVENT_ADDR_ERROR:
> case RDMA_CM_EVENT_ROUTE_ERROR:
> case RDMA_CM_EVENT_CONNECT_ERROR:
^ permalink raw reply
* Re: [PATCH] [net-next] net/mlx5e: avoid uninitialized variable use
From: Tariq Toukan @ 2019-07-10 14:22 UTC (permalink / raw)
To: Arnd Bergmann, Saeed Mahameed, Leon Romanovsky, David S. Miller
Cc: Tariq Toukan, Eran Ben Elisha, Boris Pismenny,
netdev@vger.kernel.org, linux-rdma@vger.kernel.org,
linux-kernel@vger.kernel.org, clang-built-linux@googlegroups.com
In-Reply-To: <20190710130638.1846846-1-arnd@arndb.de>
On 7/10/2019 4:06 PM, Arnd Bergmann wrote:
> clang points to a variable being used in an unexpected
> code path:
>
> drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c:251:2: warning: variable 'rec_seq_sz' is used uninitialized whenever switch default is taken [-Wsometimes-uninitialized]
> default:
> ^~~~~~~
> drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c:255:46: note: uninitialized use occurs here
> skip_static_post = !memcmp(rec_seq, &rn_be, rec_seq_sz);
> ^~~~~~~~~~
>
> From looking at the function logic, it seems that there is no
> sensible way to continue here, so just return early and hope
> for the best.
>
> Fixes: d2ead1f360e8 ("net/mlx5e: Add kTLS TX HW offload support")
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>
> ---
> drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c
> index 3f5f4317a22b..5c08891806f0 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c
> @@ -250,6 +250,7 @@ tx_post_resync_params(struct mlx5e_txqsq *sq,
> }
> default:
> WARN_ON(1);
> + return;
> }
>
> skip_static_post = !memcmp(rec_seq, &rn_be, rec_seq_sz);
>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Thanks!
^ permalink raw reply
* Re: [net][PATCH 1/5] rds: fix reordering with composite message notification
From: Yanjun Zhu @ 2019-07-10 14:23 UTC (permalink / raw)
To: Santosh Shilimkar, netdev, davem
In-Reply-To: <1562736764-31752-2-git-send-email-santosh.shilimkar@oracle.com>
On 2019/7/10 13:32, Santosh Shilimkar wrote:
> RDS composite message(rdma + control) user notification needs to be
> triggered once the full message is delivered and such a fix was
> added as part of commit 941f8d55f6d61 ("RDS: RDMA: Fix the composite
> message user notification"). But rds_send_remove_from_sock is missing
> data part notify check and hence at times the user doesn't get
> notification which isn't desirable.
>
> One way is to fix the rds_send_remove_from_sock to check for that case
> but considering the ordering complexity with completion handler and
> rdma + control messages are always dispatched back to back in same send
> context, just delaying the signaled completion on rdma work request also
> gets the desired behaviour, i.e. notifying the application only after
> RDMA + control message send completes. So patch updates the earlier
> fix with this approach. The delay signaling completions of rdma op
> till the control message send completes fix was done by Venkat
> Venkatsubra in downstream kernel.
>
> Reviewed-and-tested-by: Zhu Yanjun <yanjun.zhu@oracle.com>
Thanks. I am fine with this.
Zhu Yanjun
> Reviewed-by: Gerd Rausch <gerd.rausch@oracle.com>
> Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
> ---
> net/rds/ib_send.c | 29 +++++++++++++----------------
> net/rds/rdma.c | 10 ----------
> net/rds/rds.h | 1 -
> net/rds/send.c | 4 +---
> 4 files changed, 14 insertions(+), 30 deletions(-)
>
> diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
> index 18f2341..dfe6237 100644
> --- a/net/rds/ib_send.c
> +++ b/net/rds/ib_send.c
> @@ -69,6 +69,16 @@ static void rds_ib_send_complete(struct rds_message *rm,
> complete(rm, notify_status);
> }
>
> +static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
> + struct rm_data_op *op,
> + int wc_status)
> +{
> + if (op->op_nents)
> + ib_dma_unmap_sg(ic->i_cm_id->device,
> + op->op_sg, op->op_nents,
> + DMA_TO_DEVICE);
> +}
> +
> static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
> struct rm_rdma_op *op,
> int wc_status)
> @@ -129,21 +139,6 @@ static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
> rds_ib_stats_inc(s_ib_atomic_fadd);
> }
>
> -static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
> - struct rm_data_op *op,
> - int wc_status)
> -{
> - struct rds_message *rm = container_of(op, struct rds_message, data);
> -
> - if (op->op_nents)
> - ib_dma_unmap_sg(ic->i_cm_id->device,
> - op->op_sg, op->op_nents,
> - DMA_TO_DEVICE);
> -
> - if (rm->rdma.op_active && rm->data.op_notify)
> - rds_ib_send_unmap_rdma(ic, &rm->rdma, wc_status);
> -}
> -
> /*
> * Unmap the resources associated with a struct send_work.
> *
> @@ -902,7 +897,9 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
> send->s_queued = jiffies;
> send->s_op = NULL;
>
> - nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify);
> + if (!op->op_notify)
> + nr_sig += rds_ib_set_wr_signal_state(ic, send,
> + op->op_notify);
>
> send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
> send->s_rdma_wr.remote_addr = remote_addr;
> diff --git a/net/rds/rdma.c b/net/rds/rdma.c
> index b340ed4..916f5ec 100644
> --- a/net/rds/rdma.c
> +++ b/net/rds/rdma.c
> @@ -641,16 +641,6 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
> }
> op->op_notifier->n_user_token = args->user_token;
> op->op_notifier->n_status = RDS_RDMA_SUCCESS;
> -
> - /* Enable rmda notification on data operation for composite
> - * rds messages and make sure notification is enabled only
> - * for the data operation which follows it so that application
> - * gets notified only after full message gets delivered.
> - */
> - if (rm->data.op_sg) {
> - rm->rdma.op_notify = 0;
> - rm->data.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
> - }
> }
>
> /* The cookie contains the R_Key of the remote memory region, and
> diff --git a/net/rds/rds.h b/net/rds/rds.h
> index 0d8f67c..f0066d1 100644
> --- a/net/rds/rds.h
> +++ b/net/rds/rds.h
> @@ -476,7 +476,6 @@ struct rds_message {
> } rdma;
> struct rm_data_op {
> unsigned int op_active:1;
> - unsigned int op_notify:1;
> unsigned int op_nents;
> unsigned int op_count;
> unsigned int op_dmasg;
> diff --git a/net/rds/send.c b/net/rds/send.c
> index 166dd57..031b1e9 100644
> --- a/net/rds/send.c
> +++ b/net/rds/send.c
> @@ -491,14 +491,12 @@ void rds_rdma_send_complete(struct rds_message *rm, int status)
> struct rm_rdma_op *ro;
> struct rds_notifier *notifier;
> unsigned long flags;
> - unsigned int notify = 0;
>
> spin_lock_irqsave(&rm->m_rs_lock, flags);
>
> - notify = rm->rdma.op_notify | rm->data.op_notify;
> ro = &rm->rdma;
> if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
> - ro->op_active && notify && ro->op_notifier) {
> + ro->op_active && ro->op_notify && ro->op_notifier) {
> notifier = ro->op_notifier;
> rs = rm->m_rs;
> sock_hold(rds_rs_to_sk(rs));
^ permalink raw reply
* [PATCH nf-next] net/mlx5e: Fix kernel NULL pointer dereference
From: wenxu @ 2019-07-10 14:18 UTC (permalink / raw)
To: pablo, davem; +Cc: netdev
From: wenxu <wenxu@ucloud.cn>
[ 3444.666552] BUG: kernel NULL pointer dereference, address: 0000000000000000
[ 3444.666631] #PF: supervisor read access in kernel mode
[ 3444.666701] #PF: error_code(0x0000) - not-present page
[ 3444.666769] PGD 8000000812dd7067 P4D 8000000812dd7067 PUD 8207cc067 PMD 0
[ 3444.666843] Oops: 0000 [#1] SMP PTI
[ 3444.666910] CPU: 17 PID: 27387 Comm: nft Kdump: loaded Tainted: G O 5.2.0-rc6+ #1
[ 3444.666987] Hardware name: Huawei Technologies Co., Ltd. RH1288 V3/BC11HGSC0, BIOS 3.57 02/26/2017
[ 3444.667071] RIP: 0010:flow_block_cb_setup_simple+0x127/0x240
[ 3444.667141] Code: 02 48 89 43 08 31 c0 48 83 c4 10 5b 41 5c 41 5d 41 5e 41 5f 5d c3 48 83 c4 10 b8 a1 ff ff ff 5b 41 5c 41 5d 41 5e 41 5f 5d c3 <49> 8b 04 24 49 39 c4 75 0a eb 2f 48 8b 00 49 39 c4 74 27 4c 3b 68
[ 3444.668201] RSP: 0018:ffffc90007b7b888 EFLAGS: 00010246
[ 3444.668595] RAX: 0000000000000000 RBX: ffff8890439a9b40 RCX: ffff88904d5008c0
[ 3444.668992] RDX: ffffffffa0879850 RSI: 0000000000000000 RDI: ffffc90007b7b908
[ 3444.669389] RBP: ffffc90007b7b8c0 R08: ffff88904d5008c0 R09: 0000000000000001
[ 3444.669787] R10: ffff88885a797d00 R11: ffff8890439a9b00 R12: 0000000000000000
[ 3444.670186] R13: ffffffffa0879850 R14: ffffc90007b7b908 R15: ffffffff823a8480
[ 3444.670588] FS: 00007f357c2fa740(0000) GS:ffff88885fe40000(0000) knlGS:0000000000000000
[ 3444.671313] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 3444.671705] CR2: 0000000000000000 CR3: 00000001a1600002 CR4: 00000000001626e0
[ 3444.672103] Call Trace:
[ 3444.672505] ? jump_label_update+0x5f/0xc0
[ 3444.672933] mlx5e_rep_setup_tc+0x32/0x40 [mlx5_core]
[ 3444.673335] nft_flow_offload_chain+0xd0/0x1d0 [nf_tables]
[ 3444.673729] nft_flow_rule_offload_commit+0x91/0x11b [nf_tables]
[ 3444.674129] nf_tables_commit+0x90/0xe30 [nf_tables]
[ 3444.674529] nfnetlink_rcv_batch+0x3b9/0x750 [nfnetlink]
Init the driver_block_list parameter
Fixes: 955bcb6ea0df ("drivers: net: use flow block API")
Signed-off-by: wenxu <wenxu@ucloud.cn>
---
drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 10ef90a..90c6de9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -1182,7 +1182,7 @@ static int mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type,
switch (type) {
case TC_SETUP_BLOCK:
- return flow_block_cb_setup_simple(type_data, NULL,
+ return flow_block_cb_setup_simple(type_data, &mlx5e_block_cb_list,
mlx5e_rep_setup_tc_cb,
priv, priv, true);
default:
--
1.8.3.1
^ permalink raw reply related
* [PATCH RFC 4/4] selftests/bpf: Add test for ftrace-based BPF attach/detach
From: Joel Fernandes (Google) @ 2019-07-10 14:15 UTC (permalink / raw)
To: linux-kernel
Cc: Joel Fernandes (Google), Adrian Ratiu, Alexei Starovoitov, bpf,
Brendan Gregg, connoro, Daniel Borkmann, duyuchao, Ingo Molnar,
jeffv, Karim Yaghmour, kernel-team, linux-kselftest,
Manali Shukla, Manjo Raja Rao, Martin KaFai Lau, Masami Hiramatsu,
Matt Mullins, Michal Gregorczyk, Michal Gregorczyk,
Mohammad Husain, namhyung, namhyung, netdev, paul.chaignon,
primiano, Qais Yousef, Shuah Khan, Song Liu, Srinivas Ramana,
Steven Rostedt, Tamir Carmeli, Yonghong Song
In-Reply-To: <20190710141548.132193-1-joel@joelfernandes.org>
Here we add support for testing the attach and detach of a BPF program
to a tracepoint through tracefs.
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
.../raw_tp_writable_test_ftrace_run.c | 89 +++++++++++++++++++
1 file changed, 89 insertions(+)
create mode 100644 tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_ftrace_run.c
diff --git a/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_ftrace_run.c b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_ftrace_run.c
new file mode 100644
index 000000000000..7b42e3a69b71
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_ftrace_run.c
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <test_progs.h>
+#include <linux/nbd.h>
+
+void test_raw_tp_writable_test_ftrace_run(void)
+{
+ __u32 duration = 0;
+ char error[4096];
+ int ret;
+
+ const struct bpf_insn trace_program[] = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ };
+
+ struct bpf_load_program_attr load_attr = {
+ .prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
+ .license = "GPL v2",
+ .insns = trace_program,
+ .insns_cnt = sizeof(trace_program) / sizeof(struct bpf_insn),
+ .log_level = 2,
+ };
+
+ int bpf_fd = bpf_load_program_xattr(&load_attr, error, sizeof(error));
+
+ if (CHECK(bpf_fd < 0, "bpf_raw_tracepoint_writable loaded",
+ "failed: %d errno %d\n", bpf_fd, errno))
+ return;
+
+ const struct bpf_insn skb_program[] = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ };
+
+ struct bpf_load_program_attr skb_load_attr = {
+ .prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
+ .license = "GPL v2",
+ .insns = skb_program,
+ .insns_cnt = sizeof(skb_program) / sizeof(struct bpf_insn),
+ };
+
+ int filter_fd =
+ bpf_load_program_xattr(&skb_load_attr, error, sizeof(error));
+ if (CHECK(filter_fd < 0, "test_program_loaded", "failed: %d errno %d\n",
+ filter_fd, errno))
+ goto out_bpffd;
+
+ ret = bpf_raw_tracepoint_ftrace_attach("bpf_test_run",
+ "bpf_test_finish",
+ bpf_fd);
+ if (CHECK(ret < 0, "bpf_raw_tracepoint_ftrace_attach",
+ "failed: %d errno %d\n", ret, errno))
+ goto out_filterfd;
+
+ char test_skb[128] = {
+ 0,
+ };
+
+ __u32 prog_ret;
+ int err = bpf_prog_test_run(filter_fd, 1, test_skb, sizeof(test_skb), 0,
+ 0, &prog_ret, 0);
+ CHECK(err != 42, "test_run",
+ "tracepoint did not modify return value\n");
+ CHECK(prog_ret != 0, "test_run_ret",
+ "socket_filter did not return 0\n");
+
+ ret = bpf_raw_tracepoint_ftrace_detach("bpf_test_run",
+ "bpf_test_finish",
+ bpf_fd);
+ if (CHECK(ret < 0, "bpf_raw_tracepoint_ftrace_detach",
+ "failed: %d errno %d\n", ret, errno))
+ goto out_filterfd;
+
+ err = bpf_prog_test_run(filter_fd, 1, test_skb, sizeof(test_skb), 0, 0,
+ &prog_ret, 0);
+ CHECK(err != 0, "test_run_notrace",
+ "test_run failed with %d errno %d\n", err, errno);
+ CHECK(prog_ret != 0, "test_run_ret_notrace",
+ "socket_filter did not return 0\n");
+
+out_filterfd:
+ close(filter_fd);
+out_bpffd:
+ close(bpf_fd);
+}
--
2.22.0.410.gd8fdbe21b5-goog
^ permalink raw reply related
* [PATCH RFC 3/4] lib/bpf: Add support for ftrace event attach and detach
From: Joel Fernandes (Google) @ 2019-07-10 14:15 UTC (permalink / raw)
To: linux-kernel
Cc: Joel Fernandes (Google), Adrian Ratiu, Alexei Starovoitov, bpf,
Brendan Gregg, connoro, Daniel Borkmann, duyuchao, Ingo Molnar,
jeffv, Karim Yaghmour, kernel-team, linux-kselftest,
Manali Shukla, Manjo Raja Rao, Martin KaFai Lau, Masami Hiramatsu,
Matt Mullins, Michal Gregorczyk, Michal Gregorczyk,
Mohammad Husain, namhyung, namhyung, netdev, paul.chaignon,
primiano, Qais Yousef, Shuah Khan, Song Liu, Srinivas Ramana,
Steven Rostedt, Tamir Carmeli, Yonghong Song
In-Reply-To: <20190710141548.132193-1-joel@joelfernandes.org>
Add the needed library support in this commit.
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
tools/lib/bpf/bpf.c | 53 ++++++++++++++++++++++++++++++++++++++++
tools/lib/bpf/bpf.h | 4 +++
tools/lib/bpf/libbpf.map | 2 ++
3 files changed, 59 insertions(+)
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index c4a48086dc9a..28c5a7d00d14 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -24,6 +24,9 @@
#include <stdlib.h>
#include <string.h>
#include <memory.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
#include <unistd.h>
#include <asm/unistd.h>
#include <linux/bpf.h>
@@ -57,6 +60,8 @@
#define min(x, y) ((x) < (y) ? (x) : (y))
#endif
+#define TRACEFS "/sys/kernel/debug/tracing"
+
static inline __u64 ptr_to_u64(const void *ptr)
{
return (__u64) (unsigned long) ptr;
@@ -658,6 +663,54 @@ int bpf_raw_tracepoint_open(const char *name, int prog_fd)
return sys_bpf(BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
}
+int bpf_raw_tracepoint_ftrace_attach(const char *subsys, const char *name,
+ int prog_fd)
+{
+ char buf[256];
+ int len, ret, tfd;
+
+ sprintf(buf, "%s/events/%s/%s/bpf", TRACEFS, subsys, name);
+ tfd = open(buf, O_WRONLY);
+ if (tfd < 0)
+ return tfd;
+
+ sprintf(buf, "attach:%d", prog_fd);
+ len = strlen(buf);
+ ret = write(tfd, buf, len);
+
+ if (ret < 0)
+ goto err;
+ if (ret != len)
+ ret = -1;
+err:
+ close(tfd);
+ return ret;
+}
+
+int bpf_raw_tracepoint_ftrace_detach(const char *subsys, const char *name,
+ int prog_fd)
+{
+ char buf[256];
+ int len, ret, tfd;
+
+ sprintf(buf, "%s/events/%s/%s/bpf", TRACEFS, subsys, name);
+ tfd = open(buf, O_WRONLY);
+ if (tfd < 0)
+ return tfd;
+
+ sprintf(buf, "detach:%d", prog_fd);
+ len = strlen(buf);
+ ret = write(tfd, buf, len);
+
+ if (ret < 0)
+ goto err;
+ if (ret != len)
+ ret = -1;
+err:
+ close(tfd);
+ return ret;
+}
+
int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
bool do_log)
{
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 9593fec75652..5b9c44658037 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -163,6 +163,10 @@ LIBBPF_API int bpf_prog_query(int target_fd, enum bpf_attach_type type,
__u32 query_flags, __u32 *attach_flags,
__u32 *prog_ids, __u32 *prog_cnt);
LIBBPF_API int bpf_raw_tracepoint_open(const char *name, int prog_fd);
+LIBBPF_API int bpf_raw_tracepoint_ftrace_attach(const char *subsys,
+ const char *name, int prog_fd);
+LIBBPF_API int bpf_raw_tracepoint_ftrace_detach(const char *subsys,
+ const char *name, int prog_fd);
LIBBPF_API int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf,
__u32 log_buf_size, bool do_log);
LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf,
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 673001787cba..fca377b688c2 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -163,4 +163,6 @@ LIBBPF_0.0.3 {
bpf_map__is_internal;
bpf_map_freeze;
btf__finalize_data;
+ bpf_raw_tracepoint_ftrace_attach;
+ bpf_raw_tracepoint_ftrace_detach;
} LIBBPF_0.0.2;
--
2.22.0.410.gd8fdbe21b5-goog
^ permalink raw reply related
* [PATCH RFC 2/4] trace/bpf: Add support for attach/detach of ftrace events to BPF
From: Joel Fernandes (Google) @ 2019-07-10 14:15 UTC (permalink / raw)
To: linux-kernel
Cc: Joel Fernandes (Google), Adrian Ratiu, Alexei Starovoitov, bpf,
Brendan Gregg, connoro, Daniel Borkmann, duyuchao, Ingo Molnar,
jeffv, Karim Yaghmour, kernel-team, linux-kselftest,
Manali Shukla, Manjo Raja Rao, Martin KaFai Lau, Masami Hiramatsu,
Matt Mullins, Michal Gregorczyk, Michal Gregorczyk,
Mohammad Husain, namhyung, namhyung, netdev, paul.chaignon,
primiano, Qais Yousef, Shuah Khan, Song Liu, Srinivas Ramana,
Steven Rostedt, Tamir Carmeli, Yonghong Song
In-Reply-To: <20190710141548.132193-1-joel@joelfernandes.org>
Add a new bpf file to each trace event. The following commands can be
written into it:
attach:<fd> Attaches BPF prog fd to tracepoint
detach:<fd> Detaches BPF prog fd from tracepoint
Reading the bpf file will show all the attached programs to the
tracepoint.
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
include/linux/bpf_trace.h | 6 ++
include/linux/trace_events.h | 1 +
kernel/trace/bpf_trace.c | 169 +++++++++++++++++++++++++++++++++++
kernel/trace/trace.h | 1 +
kernel/trace/trace_events.c | 9 +-
5 files changed, 184 insertions(+), 2 deletions(-)
diff --git a/include/linux/bpf_trace.h b/include/linux/bpf_trace.h
index 4a593827fd87..1fe73501809c 100644
--- a/include/linux/bpf_trace.h
+++ b/include/linux/bpf_trace.h
@@ -9,6 +9,12 @@
struct bpf_raw_tracepoint {
struct bpf_raw_event_map *btp;
struct bpf_prog *prog;
+ /*
+ * Multiple programs can be attached to a tracepoint,
+ * All of these are linked to each other and can be reached
+ * from the event's bpf_attach file in tracefs.
+ */
+ struct list_head event_attached;
};
struct bpf_raw_tracepoint *bpf_raw_tracepoint_open(char *tp_name, int prog_fd);
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 8a62731673f7..525f2ac44aa3 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -371,6 +371,7 @@ struct trace_event_file {
struct trace_array *tr;
struct trace_subsystem_dir *system;
struct list_head triggers;
+ struct list_head bpf_attached;
/*
* 32 bit flags:
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index c4b543bc617f..28621ad88c12 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1469,3 +1469,172 @@ struct bpf_raw_tracepoint *bpf_raw_tracepoint_open(char *tp_name, int prog_fd)
bpf_put_raw_tracepoint(btp);
return ERR_PTR(err);
}
+
+enum event_bpf_cmd { BPF_ATTACH, BPF_DETACH };
+#define BPF_CMD_BUF_LEN 32
+
+static ssize_t
+event_bpf_attach_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ int err, prog_fd, cmd_num, len;
+ struct trace_event_call *call;
+ struct trace_event_file *file;
+ struct bpf_raw_tracepoint *raw_tp, *next;
+ char buf[BPF_CMD_BUF_LEN], *end, *tok;
+ enum event_bpf_cmd cmd;
+ struct bpf_prog *prog;
+ bool prog_put = true;
+
+ len = min((int)cnt, BPF_CMD_BUF_LEN - 1);
+
+ err = copy_from_user(buf, ubuf, len);
+ if (err)
+ return err;
+ buf[len] = 0;
+
+ /* Parse 2 arguments of format: <cmd>:<fd> */
+ end = &buf[0];
+ cmd_num = 1;
+ while (cmd_num < 3) {
+ tok = strsep(&end, ":");
+ if (!tok)
+ return -EINVAL;
+
+ switch (cmd_num) {
+ case 1:
+ if (!strncmp(tok, "attach", 6))
+ cmd = BPF_ATTACH;
+ else if (!strncmp(tok, "detach", 6))
+ cmd = BPF_DETACH;
+ else
+ return -EINVAL;
+ break;
+ case 2:
+ err = kstrtoint(tok, 10, &prog_fd);
+ if (err)
+ return err;
+ break;
+ }
+ cmd_num++;
+ }
+ if (cmd_num != 3)
+ return -EINVAL;
+
+ file = event_file_data(filp);
+ /* Command is to attach fd to tracepoint */
+ if (cmd == BPF_ATTACH) {
+ mutex_lock(&event_mutex);
+ call = file->event_call;
+
+ raw_tp = bpf_raw_tracepoint_open((char *)call->tp->name,
+ prog_fd);
+ if (IS_ERR(raw_tp)) {
+ mutex_unlock(&event_mutex);
+ return PTR_ERR(raw_tp);
+ }
+
+ list_add(&raw_tp->event_attached, &file->bpf_attached);
+ mutex_unlock(&event_mutex);
+ *ppos += cnt;
+ return cnt;
+ }
+
+ /* Command is to detach fd from tracepoint */
+ prog = bpf_prog_get(prog_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ mutex_lock(&event_mutex);
+ list_for_each_entry_safe(raw_tp, next, &file->bpf_attached,
+ event_attached) {
+ if (raw_tp->prog == prog) {
+ list_del(&raw_tp->event_attached);
+ bpf_raw_tracepoint_close(raw_tp);
+ prog_put = false;
+ break;
+ }
+ }
+ mutex_unlock(&event_mutex);
+
+ if (prog_put)
+ bpf_prog_put(prog);
+ *ppos += cnt;
+ return cnt;
+}
+
+static void *event_bpf_attach_next(struct seq_file *m, void *t, loff_t *pos)
+{
+ struct trace_event_file *file = event_file_data(m->private);
+
+ return seq_list_next(t, &file->bpf_attached, pos);
+}
+
+static void *event_bpf_attach_start(struct seq_file *m, loff_t *pos)
+{
+ struct trace_event_file *event_file;
+
+ /* ->stop() is called even if ->start() fails */
+ mutex_lock(&event_mutex);
+ event_file = event_file_data(m->private);
+ if (unlikely(!event_file))
+ return ERR_PTR(-ENODEV);
+
+ if (list_empty(&event_file->bpf_attached))
+ return NULL;
+
+ return seq_list_start(&event_file->bpf_attached, *pos);
+}
+
+static void event_bpf_attach_stop(struct seq_file *m, void *t)
+{
+ mutex_unlock(&event_mutex);
+}
+
+static int event_bpf_attach_show(struct seq_file *m, void *v)
+{
+ struct bpf_raw_tracepoint *raw_tp;
+
+ raw_tp = list_entry(v, struct bpf_raw_tracepoint, event_attached);
+ seq_printf(m, "prog id: %u\n", raw_tp->prog->aux->id);
+ return 0;
+}
+
+static const struct seq_operations event_bpf_attach_seq_ops = {
+ .start = event_bpf_attach_start,
+ .next = event_bpf_attach_next,
+ .stop = event_bpf_attach_stop,
+ .show = event_bpf_attach_show,
+};
+
+static int event_bpf_attach_open(struct inode *inode, struct file *file)
+{
+ int ret = 0;
+
+ mutex_lock(&event_mutex);
+
+ if (unlikely(!event_file_data(file))) {
+ mutex_unlock(&event_mutex);
+ return -ENODEV;
+ }
+
+ if (file->f_mode & FMODE_READ) {
+ ret = seq_open(file, &event_bpf_attach_seq_ops);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+
+ m->private = file;
+ }
+ }
+
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
+const struct file_operations event_bpf_attach_fops = {
+ .open = event_bpf_attach_open,
+ .read = seq_read,
+ .write = event_bpf_attach_write,
+ .llseek = default_llseek,
+};
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 005f08629b8b..e33828d24eb2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1582,6 +1582,7 @@ extern struct list_head ftrace_events;
extern const struct file_operations event_trigger_fops;
extern const struct file_operations event_hist_fops;
+extern const struct file_operations event_bpf_attach_fops;
#ifdef CONFIG_HIST_TRIGGERS
extern int register_trigger_hist_cmd(void);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 67851fb66b6b..79420d5efaef 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2018,8 +2018,10 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
trace_create_file("trigger", 0644, file->dir, file,
&event_trigger_fops);
- trace_create_file("bpf_attach", 0644, file->dir, file,
- &bpf_attach_trigger_fops);
+#ifdef CONFIG_BPF_EVENTS
+ trace_create_file("bpf", 0644, file->dir, file,
+ &event_bpf_attach_fops);
+#endif
}
#ifdef CONFIG_HIST_TRIGGERS
@@ -2267,6 +2269,9 @@ trace_create_new_event(struct trace_event_call *call,
atomic_set(&file->sm_ref, 0);
atomic_set(&file->tm_ref, 0);
INIT_LIST_HEAD(&file->triggers);
+#ifdef CONFIG_BPF_EVENTS
+ INIT_LIST_HEAD(&file->bpf_attached);
+#endif
list_add(&file->list, &tr->events);
return file;
--
2.22.0.410.gd8fdbe21b5-goog
^ permalink raw reply related
* [PATCH RFC 1/4] Move bpf_raw_tracepoint functionality into bpf_trace.c
From: Joel Fernandes (Google) @ 2019-07-10 14:15 UTC (permalink / raw)
To: linux-kernel
Cc: Joel Fernandes (Google), Adrian Ratiu, Alexei Starovoitov, bpf,
Brendan Gregg, connoro, Daniel Borkmann, duyuchao, Ingo Molnar,
jeffv, Karim Yaghmour, kernel-team, linux-kselftest,
Manali Shukla, Manjo Raja Rao, Martin KaFai Lau, Masami Hiramatsu,
Matt Mullins, Michal Gregorczyk, Michal Gregorczyk,
Mohammad Husain, namhyung, namhyung, netdev, paul.chaignon,
primiano, Qais Yousef, Shuah Khan, Song Liu, Srinivas Ramana,
Steven Rostedt, Tamir Carmeli, Yonghong Song
In-Reply-To: <20190710141548.132193-1-joel@joelfernandes.org>
In preparation to use raw tracepoints for BPF directly from ftrace, move
the bpf_raw_tracepoint functionality into bpf_trace.c
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
include/linux/bpf_trace.h | 10 ++++++
kernel/bpf/syscall.c | 69 ++++++-------------------------------
kernel/trace/bpf_trace.c | 56 ++++++++++++++++++++++++++++++
kernel/trace/trace_events.c | 3 ++
4 files changed, 80 insertions(+), 58 deletions(-)
diff --git a/include/linux/bpf_trace.h b/include/linux/bpf_trace.h
index ddf896abcfb6..4a593827fd87 100644
--- a/include/linux/bpf_trace.h
+++ b/include/linux/bpf_trace.h
@@ -4,4 +4,14 @@
#include <trace/events/xdp.h>
+#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
+
+struct bpf_raw_tracepoint {
+ struct bpf_raw_event_map *btp;
+ struct bpf_prog *prog;
+};
+
+struct bpf_raw_tracepoint *bpf_raw_tracepoint_open(char *tp_name, int prog_fd);
+void bpf_raw_tracepoint_close(struct bpf_raw_tracepoint *tp);
+
#endif /* __LINUX_BPF_TRACE_H__ */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 42d17f730780..2001949b33f1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1737,21 +1737,11 @@ static int bpf_obj_get(const union bpf_attr *attr)
attr->file_flags);
}
-struct bpf_raw_tracepoint {
- struct bpf_raw_event_map *btp;
- struct bpf_prog *prog;
-};
-
static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp)
{
struct bpf_raw_tracepoint *raw_tp = filp->private_data;
- if (raw_tp->prog) {
- bpf_probe_unregister(raw_tp->btp, raw_tp->prog);
- bpf_prog_put(raw_tp->prog);
- }
- bpf_put_raw_tracepoint(raw_tp->btp);
- kfree(raw_tp);
+ bpf_raw_tracepoint_close(raw_tp);
return 0;
}
@@ -1761,64 +1751,27 @@ static const struct file_operations bpf_raw_tp_fops = {
.write = bpf_dummy_write,
};
-#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
-
-static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
+static int bpf_raw_tracepoint_open_syscall(const union bpf_attr *attr)
{
- struct bpf_raw_tracepoint *raw_tp;
- struct bpf_raw_event_map *btp;
- struct bpf_prog *prog;
+ int tp_fd;
char tp_name[128];
- int tp_fd, err;
+ struct bpf_raw_tracepoint *raw_tp;
if (strncpy_from_user(tp_name, u64_to_user_ptr(attr->raw_tracepoint.name),
sizeof(tp_name) - 1) < 0)
return -EFAULT;
tp_name[sizeof(tp_name) - 1] = 0;
- btp = bpf_get_raw_tracepoint(tp_name);
- if (!btp)
- return -ENOENT;
-
- raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER);
- if (!raw_tp) {
- err = -ENOMEM;
- goto out_put_btp;
- }
- raw_tp->btp = btp;
-
- prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
- if (IS_ERR(prog)) {
- err = PTR_ERR(prog);
- goto out_free_tp;
- }
- if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT &&
- prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) {
- err = -EINVAL;
- goto out_put_prog;
- }
-
- err = bpf_probe_register(raw_tp->btp, prog);
- if (err)
- goto out_put_prog;
+ raw_tp = bpf_raw_tracepoint_open(tp_name, attr->raw_tracepoint.prog_fd);
+ if (IS_ERR(raw_tp))
+ return PTR_ERR(raw_tp);
- raw_tp->prog = prog;
tp_fd = anon_inode_getfd("bpf-raw-tracepoint", &bpf_raw_tp_fops, raw_tp,
O_CLOEXEC);
- if (tp_fd < 0) {
- bpf_probe_unregister(raw_tp->btp, prog);
- err = tp_fd;
- goto out_put_prog;
- }
- return tp_fd;
+ if (tp_fd < 0)
+ bpf_probe_unregister(raw_tp->btp, raw_tp->prog);
-out_put_prog:
- bpf_prog_put(prog);
-out_free_tp:
- kfree(raw_tp);
-out_put_btp:
- bpf_put_raw_tracepoint(btp);
- return err;
+ return tp_fd;
}
static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
@@ -2848,7 +2801,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
err = bpf_obj_get_info_by_fd(&attr, uattr);
break;
case BPF_RAW_TRACEPOINT_OPEN:
- err = bpf_raw_tracepoint_open(&attr);
+ err = bpf_raw_tracepoint_open_syscall(&attr);
break;
case BPF_BTF_LOAD:
err = bpf_btf_load(&attr);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 1c9a4745e596..c4b543bc617f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/bpf_perf_event.h>
+#include <linux/bpf_trace.h>
#include <linux/filter.h>
#include <linux/uaccess.h>
#include <linux/ctype.h>
@@ -1413,3 +1414,58 @@ static int __init bpf_event_init(void)
fs_initcall(bpf_event_init);
#endif /* CONFIG_MODULES */
+
+void bpf_raw_tracepoint_close(struct bpf_raw_tracepoint *raw_tp)
+{
+ if (raw_tp->prog) {
+ bpf_probe_unregister(raw_tp->btp, raw_tp->prog);
+ bpf_prog_put(raw_tp->prog);
+ }
+ bpf_put_raw_tracepoint(raw_tp->btp);
+ kfree(raw_tp);
+}
+
+struct bpf_raw_tracepoint *bpf_raw_tracepoint_open(char *tp_name, int prog_fd)
+{
+ struct bpf_raw_tracepoint *raw_tp;
+ struct bpf_raw_event_map *btp;
+ struct bpf_prog *prog;
+ int err;
+
+ btp = bpf_get_raw_tracepoint(tp_name);
+ if (!btp)
+ return ERR_PTR(-ENOENT);
+
+ raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER);
+ if (!raw_tp) {
+ err = -ENOMEM;
+ goto out_put_btp;
+ }
+ raw_tp->btp = btp;
+
+ prog = bpf_prog_get(prog_fd);
+ if (IS_ERR(prog)) {
+ err = PTR_ERR(prog);
+ goto out_free_tp;
+ }
+ if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT &&
+ prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) {
+ err = -EINVAL;
+ goto out_put_prog;
+ }
+
+ err = bpf_probe_register(raw_tp->btp, prog);
+ if (err)
+ goto out_put_prog;
+
+ raw_tp->prog = prog;
+ return raw_tp;
+
+out_put_prog:
+ bpf_prog_put(prog);
+out_free_tp:
+ kfree(raw_tp);
+out_put_btp:
+ bpf_put_raw_tracepoint(btp);
+ return ERR_PTR(err);
+}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0ce3db67f556..67851fb66b6b 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2017,6 +2017,9 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
trace_create_file("trigger", 0644, file->dir, file,
&event_trigger_fops);
+
+ trace_create_file("bpf_attach", 0644, file->dir, file,
+ &bpf_attach_trigger_fops);
}
#ifdef CONFIG_HIST_TRIGGERS
--
2.22.0.410.gd8fdbe21b5-goog
^ permalink raw reply related
* [PATCH RFC 0/4] Add support to directly attach BPF program to ftrace
From: Joel Fernandes (Google) @ 2019-07-10 14:15 UTC (permalink / raw)
To: linux-kernel
Cc: Joel Fernandes (Google), Adrian Ratiu, Alexei Starovoitov, bpf,
Brendan Gregg, connoro, Daniel Borkmann, duyuchao, Ingo Molnar,
jeffv, Karim Yaghmour, kernel-team, linux-kselftest,
Manali Shukla, Manjo Raja Rao, Martin KaFai Lau, Masami Hiramatsu,
Matt Mullins, Michal Gregorczyk, Michal Gregorczyk,
Mohammad Husain, namhyung, namhyung, netdev, paul.chaignon,
primiano, Qais Yousef, Shuah Khan, Song Liu, Srinivas Ramana,
Steven Rostedt, Tamir Carmeli, Yonghong Song
Hi,
These patches make it possible to attach BPF programs directly to tracepoints
using ftrace (/sys/kernel/debug/tracing) without needing the process doing the
attach to be alive. This has the following benefits:
1. Simplified Security: In Android, we have finer-grained security controls to
specific ftrace trace events using SELinux labels. We control precisely who is
allowed to enable an ftrace event already. By adding a node to ftrace for
attaching BPF programs, we can use the same mechanism to further control who is
allowed to attach to a trace event.
2. Process lifetime: In Android we are adding use cases where a tracing program
needs to be attached all the time to a tracepoint, for the full lifetime of
the system, such as gathering statistics where there is no need for a detach
for the full system lifetime. With perf or bpf(2)'s BPF_RAW_TRACEPOINT_OPEN, this
means keeping a process alive all the time. However, in Android our BPF loader
currently (for hardened security) involves just starting a process at boot
time, doing the BPF program loading, and then pinning the programs to /sys/fs/bpf. We
don't keep this process alive all the time. It is more suitable to do a
one-shot attach of the program using ftrace and not need to have a process
alive all the time anymore for this. Such a process also needs elevated
privileges since tracepoint program loading currently requires CAP_SYS_ADMIN
anyway, so by design Android's bpfloader runs once at init and exits.
This series adds a new bpf file at /sys/kernel/debug/tracing/events/X/Y/bpf.
The following commands can be written into it:
attach:<fd> Attaches BPF prog fd to tracepoint
detach:<fd> Detaches BPF prog fd from tracepoint
Reading the bpf file will show all the attached programs to the tracepoint.
Joel Fernandes (Google) (4):
Move bpf_raw_tracepoint functionality into bpf_trace.c
trace/bpf: Add support for attach/detach of ftrace events to BPF
lib/bpf: Add support for ftrace event attach and detach
selftests/bpf: Add test for ftrace-based BPF attach/detach
include/linux/bpf_trace.h | 16 ++
include/linux/trace_events.h | 1 +
kernel/bpf/syscall.c | 69 +-----
kernel/trace/bpf_trace.c | 225 ++++++++++++++++++
kernel/trace/trace.h | 1 +
kernel/trace/trace_events.c | 8 +
tools/lib/bpf/bpf.c | 53 +++++
tools/lib/bpf/bpf.h | 4 +
tools/lib/bpf/libbpf.map | 2 +
.../raw_tp_writable_test_ftrace_run.c | 89 +++++++
10 files changed, 410 insertions(+), 58 deletions(-)
create mode 100644 tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_ftrace_run.c
--
2.22.0.410.gd8fdbe21b5-goog
^ permalink raw reply
* Re: [PATCH net-next iproute2 v2 2/2] devlink: Introduce PCI PF and VF port flavour and attribute
From: Jiri Pirko @ 2019-07-10 13:57 UTC (permalink / raw)
To: Parav Pandit; +Cc: netdev, stephen, dsahern, jiri
In-Reply-To: <20190710123952.6877-2-parav@mellanox.com>
Wed, Jul 10, 2019 at 02:39:52PM CEST, parav@mellanox.com wrote:
>Introduce PCI PF and VF port flavour and port attributes such as PF
>number and VF number.
>
>$ devlink port show
>pci/0000:05:00.0/0: type eth netdev eth0 flavour pcipf pfnum 0
>pci/0000:05:00.0/1: type eth netdev eth1 flavour pcivf pfnum 0 vfnum 0
>pci/0000:05:00.0/2: type eth netdev eth2 flavour pcivf pfnum 0 vfnum 1
>
>Signed-off-by: Parav Pandit <parav@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
^ permalink raw reply
* [PATCH net] net: fix use-after-free in __netif_receive_skb_core
From: Sabrina Dubroca @ 2019-07-10 13:52 UTC (permalink / raw)
To: netdev; +Cc: Sabrina Dubroca, Edward Cree, Andreas Steinmetz
When __netif_receive_skb_core handles a shared skb, it can be
reallocated in a few different places:
- the device's rx_handler
- vlan_do_receive
- skb_vlan_untag
To deal with that, rx_handlers and vlan_do_receive get passed a
reference to the skb, and skb_vlan_untag just returns the new
skb. This was not a problem until commit 88eb1944e18c ("net: core:
propagate SKB lists through packet_type lookup"), which moved the
final handling of the skb via pt_prev out of
__netif_receive_skb_core. After this commit, when the skb is
reallocated by __netif_receive_skb_core, KASAN reports a
use-after-free on the old skb:
BUG: KASAN: use-after-free in __netif_receive_skb_one_core+0x15c/0x180
Call Trace:
<IRQ>
__netif_receive_skb_one_core+0x15c/0x180
process_backlog+0x1b5/0x630
? net_rx_action+0x247/0xd00
net_rx_action+0x3fa/0xd00
? napi_complete_done+0x360/0x360
__do_softirq+0x257/0xa0b
do_softirq_own_stack+0x2a/0x40
</IRQ>
? __dev_queue_xmit+0x12ba/0x3120
do_softirq+0x5d/0x60
[...]
Allocated by task 505:
__kasan_kmalloc.constprop.0+0xd6/0x140
kmem_cache_alloc+0xd4/0x2e0
skb_clone+0x106/0x300
deliver_clone+0x3f/0xa0
maybe_deliver+0x1c0/0x2b0
br_flood+0xd4/0x320
br_dev_xmit+0xbc0/0x1080
dev_hard_start_xmit+0x139/0x750
__dev_queue_xmit+0x24eb/0x3120
packet_sendmsg+0x1bfa/0x50e0
[...]
Freed by task 505:
__kasan_slab_free+0x138/0x1e0
kmem_cache_free+0xa2/0x2e0
macsec_handle_frame+0xa24/0x2e60
__netif_receive_skb_core+0xe2a/0x2c90
__netif_receive_skb_one_core+0x96/0x180
process_backlog+0x1b5/0x630
net_rx_action+0x3fa/0xd00
__do_softirq+0x257/0xa0b
The solution is to pass a reference to the skb to
__netif_receive_skb_core, as we already do with the rx_handlers, so
that its callers use the new skb.
Fixes: 88eb1944e18c ("net: core: propagate SKB lists through packet_type lookup")
Reported-by: Andreas Steinmetz <ast@domdv.de>
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
---
net/core/dev.c | 26 ++++++++++++++++++++------
1 file changed, 20 insertions(+), 6 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index d6edd218babd..0bbf6d2a9c32 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4809,11 +4809,12 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
return 0;
}
-static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
+static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
struct packet_type **ppt_prev)
{
struct packet_type *ptype, *pt_prev;
rx_handler_func_t *rx_handler;
+ struct sk_buff *skb = *pskb;
struct net_device *orig_dev;
bool deliver_exact = false;
int ret = NET_RX_DROP;
@@ -4852,6 +4853,7 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
skb = skb_vlan_untag(skb);
+ *pskb = skb;
if (unlikely(!skb))
goto out;
}
@@ -4878,6 +4880,7 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
#ifdef CONFIG_NET_INGRESS
if (static_branch_unlikely(&ingress_needed_key)) {
skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
+ *pskb = skb;
if (!skb)
goto out;
@@ -4891,11 +4894,14 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
goto drop;
if (skb_vlan_tag_present(skb)) {
+ bool ret2;
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
- if (vlan_do_receive(&skb))
+ ret2 = vlan_do_receive(pskb);
+ skb = *pskb;
+ if (ret2)
goto another_round;
else if (unlikely(!skb))
goto out;
@@ -4903,11 +4909,14 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
rx_handler = rcu_dereference(skb->dev->rx_handler);
if (rx_handler) {
+ rx_handler_result_t res;
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
- switch (rx_handler(&skb)) {
+ res = rx_handler(pskb);
+ skb = *pskb;
+ switch (res) {
case RX_HANDLER_CONSUMED:
ret = NET_RX_SUCCESS;
goto out;
@@ -4931,15 +4940,20 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
skb->pkt_type = PACKET_OTHERHOST;
} else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
+ bool ret2;
+
/* Outer header is 802.1P with vlan 0, inner header is
* 802.1Q or 802.1AD and vlan_do_receive() above could
* not find vlan dev for vlan id 0.
*/
__vlan_hwaccel_clear_tag(skb);
skb = skb_vlan_untag(skb);
+ *pskb = skb;
if (unlikely(!skb))
goto out;
- if (vlan_do_receive(&skb))
+ ret2 = vlan_do_receive(pskb);
+ skb = *pskb;
+ if (ret2)
/* After stripping off 802.1P header with vlan 0
* vlan dev is found for inner header.
*/
@@ -5004,7 +5018,7 @@ static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
struct packet_type *pt_prev = NULL;
int ret;
- ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+ ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
if (pt_prev)
ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
skb->dev, pt_prev, orig_dev);
@@ -5082,7 +5096,7 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo
struct packet_type *pt_prev = NULL;
skb_list_del_init(skb);
- __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+ __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
if (!pt_prev)
continue;
if (pt_curr != pt_prev || od_curr != orig_dev) {
--
2.22.0
^ permalink raw reply related
* [PATCH net-next] net/sched: Fix kernel NULL pointer dereference
From: wenxu @ 2019-07-10 13:45 UTC (permalink / raw)
To: pablo, davem; +Cc: netfilter-devel, netdev
From: wenxu <wenxu@ucloud.cn>
[ 697.665184] BUG: kernel NULL pointer dereference, address: 0000000000000030
[ 697.665550] #PF: supervisor read access in kernel mode
[ 697.665906] #PF: error_code(0x0000) - not-present page
[ 697.666297] PGD 800000104e636067 P4D 800000104e636067 PUD ff4b02067 PMD 0
[ 697.666710] Oops: 0000 [#1] SMP PTI
[ 697.667115] CPU: 31 PID: 24466 Comm: modprobe Kdump: loaded Tainted: G O 5.2.0-rc6+ #1
[ 697.667867] Hardware name: Huawei Technologies Co., Ltd. RH1288 V3/BC11HGSC0, BIOS 3.57 02/26/2017
[ 697.668620] RIP: 0010:tc_indr_block_ing_cmd.isra.52+0x4c/0xb0
[ 697.669029] Code: 83 ec 40 65 48 8b 04 25 28 00 00 00 48 89 45 e8 31 c0 f3 48 ab 48 8b 06 49 8b b3 e8 04 00 00 44 89 45 b0 c7 45 b4 01 00 00 00 <8b> 48 30 48 89 75 c0 85 c9 48 8d 4d b0 0f 95 45 b8 48 85 c0 4c 8d
[ 697.670132] RSP: 0018:ffffc90007bf7958 EFLAGS: 00010246
[ 697.670537] RAX: 0000000000000000 RBX: ffff88905e2cbae8 RCX: 0000000000000000
[ 697.670938] RDX: ffff88905e2cbcd8 RSI: ffffffff823a8480 RDI: ffffc90007bf7990
[ 697.671352] RBP: ffffc90007bf79a8 R08: 0000000000000000 R09: ffff88905e2cbcc0
[ 697.671761] R10: ffff888107c07780 R11: ffff88902c249000 R12: ffff88905e2cbcd0
[ 697.672173] R13: ffff88905e2cbac0 R14: ffff88885596bc00 R15: ffff88905e2cbcc0
[ 697.672582] FS: 00007fe0b4095740(0000) GS:ffff88905fbc0000(0000) knlGS:0000000000000000
[ 697.673335] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 697.673746] CR2: 0000000000000030 CR3: 0000000ff46b4005 CR4: 00000000001606e0
[ 697.674156] Call Trace:
[ 697.674563] __tc_indr_block_cb_register+0x11e/0x3c0
[ 697.674998] mlx5e_nic_rep_netdevice_event+0x9e/0x110 [mlx5_core]
[ 697.675411] notifier_call_chain+0x53/0xa0
[ 697.675812] raw_notifier_call_chain+0x16/0x20
[ 697.676223] call_netdevice_notifiers_info+0x2d/0x60
[ 697.676633] register_netdevice+0x3fa/0x500
Get indr_dev->block only after checking that it is not NULL.
Fixes: 955bcb6ea0df ("drivers: net: use flow block API")
Signed-off-by: wenxu <wenxu@ucloud.cn>
---
net/sched/cls_api.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 638c1bc..be899f7 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -684,13 +684,14 @@ static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev,
.command = command,
.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
.net = dev_net(indr_dev->dev),
- .block_shared = tcf_block_shared(indr_dev->block),
};
INIT_LIST_HEAD(&bo.cb_list);
if (!indr_dev->block)
return;
+ bo.block_shared = tcf_block_shared(indr_dev->block);
+
indr_block_cb->cb(indr_dev->dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
&bo);
tcf_block_setup(indr_dev->block, &bo);
--
1.8.3.1
^ permalink raw reply related
* [PATCH net] ipv6: fix static key imbalance in fl_create()
From: Eric Dumazet @ 2019-07-10 13:40 UTC (permalink / raw)
To: David S . Miller
Cc: netdev, Eric Dumazet, Eric Dumazet, Willem de Bruijn, syzbot
In-Reply-To: <20190710134011.221210-1-edumazet@google.com>
fl_create() should call static_branch_deferred_inc() only in
case of success.
Also we should not call fl_free() in error path, as this could
cause a static key imbalance.
jump label: negative count!
WARNING: CPU: 0 PID: 15907 at kernel/jump_label.c:221 static_key_slow_try_dec kernel/jump_label.c:221 [inline]
WARNING: CPU: 0 PID: 15907 at kernel/jump_label.c:221 static_key_slow_try_dec+0x1ab/0x1d0 kernel/jump_label.c:206
Kernel panic - not syncing: panic_on_warn set ...
CPU: 0 PID: 15907 Comm: syz-executor.2 Not tainted 5.2.0-rc6+ #62
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x172/0x1f0 lib/dump_stack.c:113
panic+0x2cb/0x744 kernel/panic.c:219
__warn.cold+0x20/0x4d kernel/panic.c:576
report_bug+0x263/0x2b0 lib/bug.c:186
fixup_bug arch/x86/kernel/traps.c:179 [inline]
fixup_bug arch/x86/kernel/traps.c:174 [inline]
do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:272
do_invalid_op+0x37/0x50 arch/x86/kernel/traps.c:291
invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:986
RIP: 0010:static_key_slow_try_dec kernel/jump_label.c:221 [inline]
RIP: 0010:static_key_slow_try_dec+0x1ab/0x1d0 kernel/jump_label.c:206
Code: c0 e8 e9 3e e5 ff 83 fb 01 0f 85 32 ff ff ff e8 5b 3d e5 ff 45 31 ff eb a0 e8 51 3d e5 ff 48 c7 c7 40 99 92 87 e8 13 75 b7 ff <0f> 0b eb 8b 4c 89 e7 e8 a9 c0 1e 00 e9 de fe ff ff e8 bf 6d b7 ff
RSP: 0018:ffff88805f9c7450 EFLAGS: 00010286
RAX: 0000000000000000 RBX: 00000000ffffffff RCX: 0000000000000000
RDX: 000000000000e3e1 RSI: ffffffff815adb06 RDI: ffffed100bf38e7c
RBP: ffff88805f9c74e0 R08: ffff88806acf0700 R09: ffffed1015d060a9
R10: ffffed1015d060a8 R11: ffff8880ae830547 R12: ffffffff89832ce0
R13: ffff88805f9c74b8 R14: 1ffff1100bf38e8b R15: 00000000ffffff01
__static_key_slow_dec_deferred+0x65/0x110 kernel/jump_label.c:272
fl_free+0xa9/0xe0 net/ipv6/ip6_flowlabel.c:121
fl_create+0x6af/0x9f0 net/ipv6/ip6_flowlabel.c:457
ipv6_flowlabel_opt+0x80e/0x2730 net/ipv6/ip6_flowlabel.c:624
do_ipv6_setsockopt.isra.0+0x2119/0x4100 net/ipv6/ipv6_sockglue.c:825
ipv6_setsockopt+0xf6/0x170 net/ipv6/ipv6_sockglue.c:944
tcp_setsockopt net/ipv4/tcp.c:3131 [inline]
tcp_setsockopt+0x8f/0xe0 net/ipv4/tcp.c:3125
sock_common_setsockopt+0x94/0xd0 net/core/sock.c:3130
__sys_setsockopt+0x253/0x4b0 net/socket.c:2080
__do_sys_setsockopt net/socket.c:2096 [inline]
__se_sys_setsockopt net/socket.c:2093 [inline]
__x64_sys_setsockopt+0xbe/0x150 net/socket.c:2093
do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301
entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x4597c9
Code: fd b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 cb b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007f2670556c78 EFLAGS: 00000246 ORIG_RAX: 0000000000000036
RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 00000000004597c9
RDX: 0000000000000020 RSI: 0000000000000029 RDI: 0000000000000003
RBP: 000000000075bfc8 R08: 000000000000fdf7 R09: 0000000000000000
R10: 0000000020000000 R11: 0000000000000246 R12: 00007f26705576d4
R13: 00000000004cec00 R14: 00000000004dd520 R15: 00000000ffffffff
Kernel Offset: disabled
Rebooting in 86400 seconds..
Fixes: 59c820b2317f ("ipv6: elide flowlabel check if no exclusive leases exist")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
---
net/ipv6/ip6_flowlabel.c | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index ad284b1fd308a646f27f715f35d9759fd50c5902..d64b83e856428195c1ecc963a263155c8b4528d0 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -435,8 +435,6 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
}
fl->dst = freq->flr_dst;
atomic_set(&fl->users, 1);
- if (fl_shared_exclusive(fl) || fl->opt)
- static_branch_deferred_inc(&ipv6_flowlabel_exclusive);
switch (fl->share) {
case IPV6_FL_S_EXCL:
case IPV6_FL_S_ANY:
@@ -451,10 +449,15 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
err = -EINVAL;
goto done;
}
+ if (fl_shared_exclusive(fl) || fl->opt)
+ static_branch_deferred_inc(&ipv6_flowlabel_exclusive);
return fl;
done:
- fl_free(fl);
+ if (fl) {
+ kfree(fl->opt);
+ kfree(fl);
+ }
*err_p = err;
return NULL;
}
--
2.22.0.410.gd8fdbe21b5-goog
^ permalink raw reply related
* [PATCH net] ipv6: fix potential crash in ip6_datagram_dst_update()
From: Eric Dumazet @ 2019-07-10 13:40 UTC (permalink / raw)
To: David S . Miller
Cc: netdev, Eric Dumazet, Eric Dumazet, Willem de Bruijn, syzbot
In-Reply-To: <20190710134011.221210-1-edumazet@google.com>
Willem forgot to change one of the calls to fl6_sock_lookup(),
which can now return an error or NULL.
syzbot reported :
kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] PREEMPT SMP KASAN
CPU: 1 PID: 31763 Comm: syz-executor.0 Not tainted 5.2.0-rc6+ #63
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:ip6_datagram_dst_update+0x559/0xc30 net/ipv6/datagram.c:83
Code: 00 00 e8 ea 29 3f fb 4d 85 f6 0f 84 96 04 00 00 e8 dc 29 3f fb 49 8d 7e 20 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 16 06 00 00 4d 8b 6e 20 e8 b4 29 3f fb 4c 89 ee
RSP: 0018:ffff88809ba97ae0 EFLAGS: 00010207
RAX: dffffc0000000000 RBX: ffff8880a81254b0 RCX: ffffc90008118000
RDX: 0000000000000003 RSI: ffffffff86319a84 RDI: 000000000000001e
RBP: ffff88809ba97c10 R08: ffff888065e9e700 R09: ffffed1015d26c80
R10: ffffed1015d26c7f R11: ffff8880ae9363fb R12: ffff8880a8124f40
R13: 0000000000000001 R14: fffffffffffffffe R15: ffff88809ba97b40
FS: 00007f38e606a700(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000202c0140 CR3: 00000000a026a000 CR4: 00000000001406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
__ip6_datagram_connect+0x5e9/0x1390 net/ipv6/datagram.c:246
ip6_datagram_connect+0x30/0x50 net/ipv6/datagram.c:269
ip6_datagram_connect_v6_only+0x69/0x90 net/ipv6/datagram.c:281
inet_dgram_connect+0x14a/0x2d0 net/ipv4/af_inet.c:571
__sys_connect+0x264/0x330 net/socket.c:1824
__do_sys_connect net/socket.c:1835 [inline]
__se_sys_connect net/socket.c:1832 [inline]
__x64_sys_connect+0x73/0xb0 net/socket.c:1832
do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301
entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x4597c9
Code: fd b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 cb b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007f38e6069c78 EFLAGS: 00000246 ORIG_RAX: 000000000000002a
RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00000000004597c9
RDX: 000000000000001c RSI: 0000000020000040 RDI: 0000000000000003
RBP: 000000000075bf20 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00007f38e606a6d4
R13: 00000000004bfd07 R14: 00000000004d1838 R15: 00000000ffffffff
Modules linked in:
RIP: 0010:ip6_datagram_dst_update+0x559/0xc30 net/ipv6/datagram.c:83
Code: 00 00 e8 ea 29 3f fb 4d 85 f6 0f 84 96 04 00 00 e8 dc 29 3f fb 49 8d 7e 20 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 16 06 00 00 4d 8b 6e 20 e8 b4 29 3f fb 4c 89 ee
Fixes: 59c820b2317f ("ipv6: elide flowlabel check if no exclusive leases exist")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
---
net/ipv6/datagram.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 9d78c907b918a98cbb9e80154a038e31b6bddd11..9ab897ded4df52d882cda1414ef0159f3eb1765a 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -74,7 +74,7 @@ int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr)
if (np->sndflow && (np->flow_label & IPV6_FLOWLABEL_MASK)) {
flowlabel = fl6_sock_lookup(sk, np->flow_label);
- if (!flowlabel)
+ if (IS_ERR(flowlabel))
return -EINVAL;
}
ip6_datagram_flow_key_init(&fl6, sk);
--
2.22.0.410.gd8fdbe21b5-goog
^ permalink raw reply related
* [PATCH net] ipv6: tcp: fix flowlabels reflection for RST packets
From: Eric Dumazet @ 2019-07-10 13:40 UTC (permalink / raw)
To: David S . Miller; +Cc: netdev, Eric Dumazet, Eric Dumazet, Marek Majkowski
In 323a53c41292 ("ipv6: tcp: enable flowlabel reflection in some RST packets")
and 50a8accf1062 ("ipv6: tcp: send consistent flowlabel in TIME_WAIT state")
we took care of IPv6 flowlabel reflections for two cases.
This patch takes care of the remaining case, when the RST packet
is sent on behalf of a 'full' socket.
In Marek's use case, this was a socket in TCP_CLOSE state.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Marek Majkowski <marek@cloudflare.com>
Tested-by: Marek Majkowski <marek@cloudflare.com>
---
net/ipv6/tcp_ipv6.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d56a9019a0feb5a34312ec353c555f44b8c09b3d..5da069e91cacca4e84a3e41dae4746c9d38fcc46 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -984,8 +984,13 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
if (sk) {
oif = sk->sk_bound_dev_if;
- if (sk_fullsock(sk))
+ if (sk_fullsock(sk)) {
+ const struct ipv6_pinfo *np = tcp_inet6_sk(sk);
+
trace_tcp_send_reset(sk, skb);
+ if (np->repflow)
+ label = ip6_flowlabel(ipv6h);
+ }
if (sk->sk_state == TCP_TIME_WAIT)
label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel);
} else {
--
2.22.0.410.gd8fdbe21b5-goog
^ permalink raw reply related
* [PATCH] libertas: Add missing sentinel at end of if_usb.c fw_table
From: Kevin Easton @ 2019-07-10 13:31 UTC (permalink / raw)
To: linux-wireless
Cc: andreyknvl, davem, kvalo, libertas-dev, linux-kernel, syzbot,
netdev, syzkaller-bugs
This sentinel tells the firmware loading process when to stop.
Reported-and-tested-by: syzbot+98156c174c5a2cad9f8f@syzkaller.appspotmail.com
Signed-off-by: Kevin Easton <kevin@guarana.org>
---
drivers/net/wireless/marvell/libertas/if_usb.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/wireless/marvell/libertas/if_usb.c b/drivers/net/wireless/marvell/libertas/if_usb.c
index f1622f0ff8c9..fe3142d85d1e 100644
--- a/drivers/net/wireless/marvell/libertas/if_usb.c
+++ b/drivers/net/wireless/marvell/libertas/if_usb.c
@@ -50,7 +50,8 @@ static const struct lbs_fw_table fw_table[] = {
{ MODEL_8388, "libertas/usb8388_v5.bin", NULL },
{ MODEL_8388, "libertas/usb8388.bin", NULL },
{ MODEL_8388, "usb8388.bin", NULL },
- { MODEL_8682, "libertas/usb8682.bin", NULL }
+ { MODEL_8682, "libertas/usb8682.bin", NULL },
+ { 0, NULL, NULL }
};
static const struct usb_device_id if_usb_table[] = {
--
2.11.0
^ permalink raw reply related
* Re: [rdma 14/16] RDMA/irdma: Add ABI definitions
From: Jason Gunthorpe @ 2019-07-10 13:32 UTC (permalink / raw)
To: Henry Orosco
Cc: Saleem, Shiraz, Leon Romanovsky, Kirsher, Jeffrey T,
dledford@redhat.com, davem@davemloft.net, Ismail, Mustafa,
linux-rdma@vger.kernel.org, netdev@vger.kernel.org,
nhorman@redhat.com, sassmann@redhat.com, poswald@suse.com,
Ertman, David M
In-Reply-To: <20190709205613.GA7440@horosco-MOBL2.amr.corp.intel.com>
On Tue, Jul 09, 2019 at 03:56:13PM -0500, Henry Orosco wrote:
> On Mon, Jul 08, 2019 at 02:13:39PM +0000, Jason Gunthorpe wrote:
> > On Sat, Jul 06, 2019 at 04:15:20PM +0000, Saleem, Shiraz wrote:
> > > > Subject: Re: [rdma 14/16] RDMA/irdma: Add ABI definitions
> > > >
> > > > On Fri, Jul 05, 2019 at 04:42:19PM +0000, Saleem, Shiraz wrote:
> > > > > > Subject: Re: [rdma 14/16] RDMA/irdma: Add ABI definitions
> > > > > >
> > > > > > On Thu, Jul 04, 2019 at 10:40:21AM +0300, Leon Romanovsky wrote:
> > > > > > > On Wed, Jul 03, 2019 at 07:12:57PM -0700, Jeff Kirsher wrote:
> > > > > > > > From: Mustafa Ismail <mustafa.ismail@intel.com>
> > > > > > > >
> > > > > > > > Add ABI definitions for irdma.
> > > > > > > >
> > > > > > > > Signed-off-by: Mustafa Ismail <mustafa.ismail@intel.com>
> > > > > > > > Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
> > > > > > > > include/uapi/rdma/irdma-abi.h | 130
> > > > > > > > ++++++++++++++++++++++++++++++++++
> > > > > > > > 1 file changed, 130 insertions(+) create mode 100644
> > > > > > > > include/uapi/rdma/irdma-abi.h
> > > > > > > >
> > > > > > > > diff --git a/include/uapi/rdma/irdma-abi.h
> > > > > > > > b/include/uapi/rdma/irdma-abi.h new file mode 100644 index
> > > > > > > > 000000000000..bdfbda4c829e
> > > > > > > > +++ b/include/uapi/rdma/irdma-abi.h
> > > > > > > > @@ -0,0 +1,130 @@
> > > > > > > > +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
> > > > > > > > +/* Copyright (c) 2006 - 2019 Intel Corporation. All rights reserved.
> > > > > > > > + * Copyright (c) 2005 Topspin Communications. All rights reserved.
> > > > > > > > + * Copyright (c) 2005 Cisco Systems. All rights reserved.
> > > > > > > > + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
> > > > > > > > + */
> > > > > > > > +
> > > > > > > > +#ifndef IRDMA_ABI_H
> > > > > > > > +#define IRDMA_ABI_H
> > > > > > > > +
> > > > > > > > +#include <linux/types.h>
> > > > > > > > +
> > > > > > > > +/* irdma must support legacy GEN_1 i40iw kernel
> > > > > > > > + * and user-space whose last ABI ver is 5 */ #define
> > > > > > > > +IRDMA_ABI_VER
> > > > > > > > +6
> > > > > > >
> > > > > > > Can you please elaborate about it more?
> > > > > > > There is no irdma code in RDMA yet, so it makes me wonder why new
> > > > > > > define shouldn't start from 1.
> > > > > >
> > > > > > It is because they are ABI compatible with the current user space,
> > > > > > which raises the question why we even have this confusing header file..
> > > > >
> > > > > It is because we need to support current providers/i40iw user-space.
> > > > > Our user-space patch series will introduce a new provider (irdma)
> > > > > whose ABI ver. is also 6 (capable of supporting X722 and which will
> > > > > work with i40iw driver on older kernels) and removes providers/i40iw from rdma-
> > > > core.
> > > >
> > > > Why on earth would we do that?
> > > >
> > > A unified library providers/irdma to go in hand with the driver irdma and uses the ABI header.
> > > It can support the new network device e810 and existing x722 iWARP device. It obsoletes
> > > providers/i40iw and extends its ABI. So why keep providers/i40iw around in rdma-core?
> >
> > Why rewrite a perfectly good userspace that is compatible with the
> > future and past kernels?
> >
> > Is there something so wrong with the userspace provider to need this?
> >
>
> Yes, the issue is that providers/i40iw was never designed to work with a unified driver
> which supports multiple hardware generations.
But Shiraz said it works fine with the new kernel driver.. So what is
actually the problem?
Jason
^ permalink raw reply
* [PATCH] ipv6: Use ipv6_authlen for len
From: yangxingwu @ 2019-07-10 13:14 UTC (permalink / raw)
To: davem
Cc: kuznet, yoshfuji, netdev, linux-kernel, pablo, kadlec, fw,
netfilter-devel, coreteam, yangxingwu
The length of AH header is computed manually as (hp->hdrlen+2)<<2.
However, in include/linux/ipv6.h, a macro named ipv6_authlen is
already defined for exactly the same job. This commit replaces
the manual computation code with the macro.
Signed-off-by: yangxingwu <xingwu.yang@gmail.com>
---
net/ipv6/ah6.c | 4 ++--
net/ipv6/exthdrs_core.c | 2 +-
net/ipv6/ip6_tunnel.c | 2 +-
net/ipv6/netfilter/ip6t_ah.c | 2 +-
net/ipv6/netfilter/ip6t_ipv6header.c | 2 +-
net/ipv6/netfilter/nf_conntrack_reasm.c | 2 +-
net/ipv6/netfilter/nf_log_ipv6.c | 2 +-
7 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 68b9e92..626c64b 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -464,7 +464,7 @@ static void ah6_input_done(struct crypto_async_request *base, int err)
struct ah_data *ahp = x->data;
struct ip_auth_hdr *ah = ip_auth_hdr(skb);
int hdr_len = skb_network_header_len(skb);
- int ah_hlen = (ah->hdrlen + 2) << 2;
+ int ah_hlen = ipv6_authlen(ah);
if (err)
goto out;
@@ -546,7 +546,7 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb)
ahash = ahp->ahash;
nexthdr = ah->nexthdr;
- ah_hlen = (ah->hdrlen + 2) << 2;
+ ah_hlen = ipv6_authlen(ah);
if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index 11a43ee..b358f1a 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -266,7 +266,7 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
} else if (nexthdr == NEXTHDR_AUTH) {
if (flags && (*flags & IP6_FH_F_AUTH) && (target < 0))
break;
- hdrlen = (hp->hdrlen + 2) << 2;
+ hdrlen = ipv6_authlen(hp);
} else
hdrlen = ipv6_optlen(hp);
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index b80fde1..3134fbb 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -416,7 +416,7 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw)
break;
optlen = 8;
} else if (nexthdr == NEXTHDR_AUTH) {
- optlen = (hdr->hdrlen + 2) << 2;
+ optlen = ipv6_authlen(hdr);
} else {
optlen = ipv6_optlen(hdr);
}
diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c
index 0228ff3..4e15a14 100644
--- a/net/ipv6/netfilter/ip6t_ah.c
+++ b/net/ipv6/netfilter/ip6t_ah.c
@@ -55,7 +55,7 @@ static bool ah_mt6(const struct sk_buff *skb, struct xt_action_param *par)
return false;
}
- hdrlen = (ah->hdrlen + 2) << 2;
+ hdrlen = ipv6_authlen(ah);
pr_debug("IPv6 AH LEN %u %u ", hdrlen, ah->hdrlen);
pr_debug("RES %04X ", ah->reserved);
diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c
index fd439f8..0fc6326 100644
--- a/net/ipv6/netfilter/ip6t_ipv6header.c
+++ b/net/ipv6/netfilter/ip6t_ipv6header.c
@@ -71,7 +71,7 @@
if (nexthdr == NEXTHDR_FRAGMENT)
hdrlen = 8;
else if (nexthdr == NEXTHDR_AUTH)
- hdrlen = (hp->hdrlen + 2) << 2;
+ hdrlen = ipv6_authlen(hp);
else
hdrlen = ipv6_optlen(hp);
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 84322ce..16de015 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -421,7 +421,7 @@ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
if (skb_copy_bits(skb, start, &hdr, sizeof(hdr)))
BUG();
if (nexthdr == NEXTHDR_AUTH)
- hdrlen = (hdr.hdrlen+2)<<2;
+ hdrlen = ipv6_authlen(&hdr);
else
hdrlen = ipv6_optlen(&hdr);
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index 549c511..f53bd8f 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -155,7 +155,7 @@ static void dump_ipv6_packet(struct net *net, struct nf_log_buf *m,
}
- hdrlen = (hp->hdrlen+2)<<2;
+ hdrlen = ipv6_authlen(hp);
break;
case IPPROTO_ESP:
if (logflags & NF_LOG_IPOPT) {
--
1.8.3.1
^ permalink raw reply related
* [PATCH ipsec] xfrm interface: fix list corruption for x-netns
From: Nicolas Dichtel @ 2019-07-10 13:11 UTC (permalink / raw)
To: steffen.klassert, davem; +Cc: netdev, Nicolas Dichtel, Julien Floret
dev_net(dev) is the netns of the device and xi->net is the link netns,
where the device has been linked.
changelink() must operate in the link netns to avoid a corruption of
the xfrm lists.
Note that xi->net and dev_net(xi->phydev) are always the same.
Before the patch, the xfrmi lists may be corrupted and can later trigger a
kernel panic.
Fixes: f203b76d7809 ("xfrm: Add virtual xfrm interfaces")
Reported-by: Julien Floret <julien.floret@6wind.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Tested-by: Julien Floret <julien.floret@6wind.com>
---
net/xfrm/xfrm_interface.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c
index a60d391f7ebe..9c5bc8dcf608 100644
--- a/net/xfrm/xfrm_interface.c
+++ b/net/xfrm/xfrm_interface.c
@@ -503,7 +503,7 @@ static int xfrmi_change(struct xfrm_if *xi, const struct xfrm_if_parms *p)
static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p)
{
- struct net *net = dev_net(xi->dev);
+ struct net *net = xi->net;
struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
int err;
@@ -663,9 +663,9 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[],
struct nlattr *data[],
struct netlink_ext_ack *extack)
{
- struct net *net = dev_net(dev);
+ struct xfrm_if *xi = netdev_priv(dev);
+ struct net *net = xi->net;
struct xfrm_if_parms p;
- struct xfrm_if *xi;
xfrmi_netlink_parms(data, &p);
xi = xfrmi_locate(net, &p);
@@ -707,7 +707,7 @@ static struct net *xfrmi_get_link_net(const struct net_device *dev)
{
struct xfrm_if *xi = netdev_priv(dev);
- return dev_net(xi->phydev);
+ return xi->net;
}
static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = {
--
2.21.0
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox