* [PATCH net-next 09/12] amd-xgbe: Always attempt link training in KR mode
From: Tom Lendacky @ 2018-05-21 21:59 UTC (permalink / raw)
To: netdev; +Cc: David Miller
In-Reply-To: <20180521215818.8135.83100.stgit@tlendack-t1.amdoffice.net>
Link training is always attempted when in KR mode, but the code is
structured to check if link training has been enabled before attempting
to perform it. Since that check will always be true, simplify the code
to always enable and start link training during KR auto-negotiation.
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
drivers/net/ethernet/amd/xgbe/xgbe-mdio.c | 69 +++++++----------------------
1 file changed, 16 insertions(+), 53 deletions(-)
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index 9c39c72..450b89c 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -216,31 +216,8 @@ static void xgbe_an_clear_interrupts_all(struct xgbe_prv_data *pdata)
xgbe_an37_clear_interrupts(pdata);
}
-static void xgbe_an73_enable_kr_training(struct xgbe_prv_data *pdata)
-{
- unsigned int reg;
-
- reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
-
- reg |= XGBE_KR_TRAINING_ENABLE;
- XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
-}
-
-static void xgbe_an73_disable_kr_training(struct xgbe_prv_data *pdata)
-{
- unsigned int reg;
-
- reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
-
- reg &= ~XGBE_KR_TRAINING_ENABLE;
- XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
-}
-
static void xgbe_kr_mode(struct xgbe_prv_data *pdata)
{
- /* Enable KR training */
- xgbe_an73_enable_kr_training(pdata);
-
/* Set MAC to 10G speed */
pdata->hw_if.set_speed(pdata, SPEED_10000);
@@ -250,9 +227,6 @@ static void xgbe_kr_mode(struct xgbe_prv_data *pdata)
static void xgbe_kx_2500_mode(struct xgbe_prv_data *pdata)
{
- /* Disable KR training */
- xgbe_an73_disable_kr_training(pdata);
-
/* Set MAC to 2.5G speed */
pdata->hw_if.set_speed(pdata, SPEED_2500);
@@ -262,9 +236,6 @@ static void xgbe_kx_2500_mode(struct xgbe_prv_data *pdata)
static void xgbe_kx_1000_mode(struct xgbe_prv_data *pdata)
{
- /* Disable KR training */
- xgbe_an73_disable_kr_training(pdata);
-
/* Set MAC to 1G speed */
pdata->hw_if.set_speed(pdata, SPEED_1000);
@@ -278,9 +249,6 @@ static void xgbe_sfi_mode(struct xgbe_prv_data *pdata)
if (pdata->kr_redrv)
return xgbe_kr_mode(pdata);
- /* Disable KR training */
- xgbe_an73_disable_kr_training(pdata);
-
/* Set MAC to 10G speed */
pdata->hw_if.set_speed(pdata, SPEED_10000);
@@ -290,9 +258,6 @@ static void xgbe_sfi_mode(struct xgbe_prv_data *pdata)
static void xgbe_x_mode(struct xgbe_prv_data *pdata)
{
- /* Disable KR training */
- xgbe_an73_disable_kr_training(pdata);
-
/* Set MAC to 1G speed */
pdata->hw_if.set_speed(pdata, SPEED_1000);
@@ -302,9 +267,6 @@ static void xgbe_x_mode(struct xgbe_prv_data *pdata)
static void xgbe_sgmii_1000_mode(struct xgbe_prv_data *pdata)
{
- /* Disable KR training */
- xgbe_an73_disable_kr_training(pdata);
-
/* Set MAC to 1G speed */
pdata->hw_if.set_speed(pdata, SPEED_1000);
@@ -314,9 +276,6 @@ static void xgbe_sgmii_1000_mode(struct xgbe_prv_data *pdata)
static void xgbe_sgmii_100_mode(struct xgbe_prv_data *pdata)
{
- /* Disable KR training */
- xgbe_an73_disable_kr_training(pdata);
-
/* Set MAC to 1G speed */
pdata->hw_if.set_speed(pdata, SPEED_1000);
@@ -425,6 +384,12 @@ static void xgbe_an73_set(struct xgbe_prv_data *pdata, bool enable,
{
unsigned int reg;
+ /* Disable KR training for now */
+ reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
+ reg &= ~XGBE_KR_TRAINING_ENABLE;
+ XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
+
+ /* Update AN settings */
reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_CTRL1);
reg &= ~MDIO_AN_CTRL1_ENABLE;
@@ -522,21 +487,19 @@ static enum xgbe_an xgbe_an73_tx_training(struct xgbe_prv_data *pdata,
XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_FECCTRL, reg);
/* Start KR training */
- reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
- if (reg & XGBE_KR_TRAINING_ENABLE) {
- if (pdata->phy_if.phy_impl.kr_training_pre)
- pdata->phy_if.phy_impl.kr_training_pre(pdata);
+ if (pdata->phy_if.phy_impl.kr_training_pre)
+ pdata->phy_if.phy_impl.kr_training_pre(pdata);
- reg |= XGBE_KR_TRAINING_START;
- XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL,
- reg);
+ reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
+ reg |= XGBE_KR_TRAINING_ENABLE;
+ reg |= XGBE_KR_TRAINING_START;
+ XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
- netif_dbg(pdata, link, pdata->netdev,
- "KR training initiated\n");
+ netif_dbg(pdata, link, pdata->netdev,
+ "KR training initiated\n");
- if (pdata->phy_if.phy_impl.kr_training_post)
- pdata->phy_if.phy_impl.kr_training_post(pdata);
- }
+ if (pdata->phy_if.phy_impl.kr_training_post)
+ pdata->phy_if.phy_impl.kr_training_post(pdata);
return XGBE_AN_PAGE_RECEIVED;
}
^ permalink raw reply related
* [PATCH net-next 10/12] amd-xgbe: Advertise FEC support with the KR re-driver
From: Tom Lendacky @ 2018-05-21 21:59 UTC (permalink / raw)
To: netdev; +Cc: David Miller
In-Reply-To: <20180521215818.8135.83100.stgit@tlendack-t1.amdoffice.net>
When a KR re-driver is present, indicate the FEC support is available
during auto-negotiation.
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index 141bb13..dd747f6 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -1720,6 +1720,10 @@ static void xgbe_phy_an_advertising(struct xgbe_prv_data *pdata,
XGBE_CLR_ADV(dlks, 1000baseKX_Full);
XGBE_CLR_ADV(dlks, 10000baseKR_Full);
+ /* Advertise FEC support is present */
+ if (pdata->fec_ability & MDIO_PMA_10GBR_FECABLE_ABLE)
+ XGBE_SET_ADV(dlks, 10000baseR_FEC);
+
switch (phy_data->port_mode) {
case XGBE_PORT_MODE_BACKPLANE:
XGBE_SET_ADV(dlks, 10000baseKR_Full);
^ permalink raw reply related
* [PATCH net-next 11/12] amd-xgbe: Update the BelFuse quirk to support SGMII
From: Tom Lendacky @ 2018-05-21 22:00 UTC (permalink / raw)
To: netdev; +Cc: David Miller
In-Reply-To: <20180521215818.8135.83100.stgit@tlendack-t1.amdoffice.net>
Instead of using a quirk to make the BelFuse 1GBT-SFP06 part look like
a 1000baseX part, program the SFP PHY to support SGMII and 10/100/1000
baseT.
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c | 109 +++++++++++++++++++--------
1 file changed, 75 insertions(+), 34 deletions(-)
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index dd747f6..194a569 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -860,6 +860,9 @@ static bool xgbe_phy_finisar_phy_quirks(struct xgbe_prv_data *pdata)
struct xgbe_phy_data *phy_data = pdata->phy_data;
unsigned int phy_id = phy_data->phydev->phy_id;
+ if (phy_data->port_mode != XGBE_PORT_MODE_SFP)
+ return false;
+
if ((phy_id & 0xfffffff0) != 0x01ff0cc0)
return false;
@@ -885,8 +888,80 @@ static bool xgbe_phy_finisar_phy_quirks(struct xgbe_prv_data *pdata)
return true;
}
+static bool xgbe_phy_belfuse_phy_quirks(struct xgbe_prv_data *pdata)
+{
+ struct xgbe_phy_data *phy_data = pdata->phy_data;
+ struct xgbe_sfp_eeprom *sfp_eeprom = &phy_data->sfp_eeprom;
+ unsigned int phy_id = phy_data->phydev->phy_id;
+ int reg;
+
+ if (phy_data->port_mode != XGBE_PORT_MODE_SFP)
+ return false;
+
+ if (memcmp(&sfp_eeprom->base[XGBE_SFP_BASE_VENDOR_NAME],
+ XGBE_BEL_FUSE_VENDOR, XGBE_SFP_BASE_VENDOR_NAME_LEN))
+ return false;
+
+ if (memcmp(&sfp_eeprom->base[XGBE_SFP_BASE_VENDOR_PN],
+ XGBE_BEL_FUSE_PARTNO, XGBE_SFP_BASE_VENDOR_PN_LEN))
+ return false;
+
+ if ((phy_id & 0xfffffff0) != 0x03625d10)
+ return false;
+
+ /* Disable RGMII mode */
+ phy_write(phy_data->phydev, 0x18, 0x7007);
+ reg = phy_read(phy_data->phydev, 0x18);
+ phy_write(phy_data->phydev, 0x18, reg & ~0x0080);
+
+ /* Enable fiber register bank */
+ phy_write(phy_data->phydev, 0x1c, 0x7c00);
+ reg = phy_read(phy_data->phydev, 0x1c);
+ reg &= 0x03ff;
+ reg &= ~0x0001;
+ phy_write(phy_data->phydev, 0x1c, 0x8000 | 0x7c00 | reg | 0x0001);
+
+ /* Power down SerDes */
+ reg = phy_read(phy_data->phydev, 0x00);
+ phy_write(phy_data->phydev, 0x00, reg | 0x00800);
+
+ /* Configure SGMII-to-Copper mode */
+ phy_write(phy_data->phydev, 0x1c, 0x7c00);
+ reg = phy_read(phy_data->phydev, 0x1c);
+ reg &= 0x03ff;
+ reg &= ~0x0006;
+ phy_write(phy_data->phydev, 0x1c, 0x8000 | 0x7c00 | reg | 0x0004);
+
+ /* Power up SerDes */
+ reg = phy_read(phy_data->phydev, 0x00);
+ phy_write(phy_data->phydev, 0x00, reg & ~0x00800);
+
+ /* Enable copper register bank */
+ phy_write(phy_data->phydev, 0x1c, 0x7c00);
+ reg = phy_read(phy_data->phydev, 0x1c);
+ reg &= 0x03ff;
+ reg &= ~0x0001;
+ phy_write(phy_data->phydev, 0x1c, 0x8000 | 0x7c00 | reg);
+
+ /* Power up SerDes */
+ reg = phy_read(phy_data->phydev, 0x00);
+ phy_write(phy_data->phydev, 0x00, reg & ~0x00800);
+
+ phy_data->phydev->supported = PHY_GBIT_FEATURES;
+ phy_data->phydev->supported |= SUPPORTED_Pause | SUPPORTED_Asym_Pause;
+ phy_data->phydev->advertising = phy_data->phydev->supported;
+
+ netif_dbg(pdata, drv, pdata->netdev,
+ "BelFuse PHY quirk in place\n");
+
+ return true;
+}
+
static void xgbe_phy_external_phy_quirks(struct xgbe_prv_data *pdata)
{
+ if (xgbe_phy_belfuse_phy_quirks(pdata))
+ return;
+
if (xgbe_phy_finisar_phy_quirks(pdata))
return;
}
@@ -1027,37 +1102,6 @@ static bool xgbe_phy_check_sfp_mod_absent(struct xgbe_phy_data *phy_data)
return false;
}
-static bool xgbe_phy_belfuse_parse_quirks(struct xgbe_prv_data *pdata)
-{
- struct xgbe_phy_data *phy_data = pdata->phy_data;
- struct xgbe_sfp_eeprom *sfp_eeprom = &phy_data->sfp_eeprom;
-
- if (memcmp(&sfp_eeprom->base[XGBE_SFP_BASE_VENDOR_NAME],
- XGBE_BEL_FUSE_VENDOR, XGBE_SFP_BASE_VENDOR_NAME_LEN))
- return false;
-
- if (!memcmp(&sfp_eeprom->base[XGBE_SFP_BASE_VENDOR_PN],
- XGBE_BEL_FUSE_PARTNO, XGBE_SFP_BASE_VENDOR_PN_LEN)) {
- phy_data->sfp_base = XGBE_SFP_BASE_1000_SX;
- phy_data->sfp_cable = XGBE_SFP_CABLE_ACTIVE;
- phy_data->sfp_speed = XGBE_SFP_SPEED_1000;
- if (phy_data->sfp_changed)
- netif_dbg(pdata, drv, pdata->netdev,
- "Bel-Fuse SFP quirk in place\n");
- return true;
- }
-
- return false;
-}
-
-static bool xgbe_phy_sfp_parse_quirks(struct xgbe_prv_data *pdata)
-{
- if (xgbe_phy_belfuse_parse_quirks(pdata))
- return true;
-
- return false;
-}
-
static void xgbe_phy_sfp_parse_eeprom(struct xgbe_prv_data *pdata)
{
struct xgbe_phy_data *phy_data = pdata->phy_data;
@@ -1076,9 +1120,6 @@ static void xgbe_phy_sfp_parse_eeprom(struct xgbe_prv_data *pdata)
phy_data->sfp_tx_fault = xgbe_phy_check_sfp_tx_fault(phy_data);
phy_data->sfp_rx_los = xgbe_phy_check_sfp_rx_los(phy_data);
- if (xgbe_phy_sfp_parse_quirks(pdata))
- return;
-
/* Assume ACTIVE cable unless told it is PASSIVE */
if (sfp_base[XGBE_SFP_BASE_CABLE] & XGBE_SFP_BASE_CABLE_PASSIVE) {
phy_data->sfp_cable = XGBE_SFP_CABLE_PASSIVE;
^ permalink raw reply related
* [PATCH net-next 12/12] amd-xgbe: Improve SFP 100Mbps auto-negotiation
From: Tom Lendacky @ 2018-05-21 22:00 UTC (permalink / raw)
To: netdev; +Cc: David Miller
In-Reply-To: <20180521215818.8135.83100.stgit@tlendack-t1.amdoffice.net>
After changing speed to 100Mbps as a result of auto-negotiation (AN),
some 10/100/1000Mbps SFPs indicate a successful link (no faults or loss
of signal), but cannot successfully transmit or receive data. These
SFPs required an extra auto-negotiation (AN) after the speed change in
order to operate properly. Add a quirk for these SFPs so that if the
outcome of the AN actually results in changing to a new speed, re-initiate
AN at that new speed.
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
drivers/net/ethernet/amd/xgbe/xgbe-mdio.c | 77 +++++++++++++++------------
drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c | 6 ++
drivers/net/ethernet/amd/xgbe/xgbe.h | 1
3 files changed, 50 insertions(+), 34 deletions(-)
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index 450b89c..4b5d625 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -331,13 +331,15 @@ static void xgbe_switch_mode(struct xgbe_prv_data *pdata)
xgbe_change_mode(pdata, pdata->phy_if.phy_impl.switch_mode(pdata));
}
-static void xgbe_set_mode(struct xgbe_prv_data *pdata,
+static bool xgbe_set_mode(struct xgbe_prv_data *pdata,
enum xgbe_mode mode)
{
if (mode == xgbe_cur_mode(pdata))
- return;
+ return false;
xgbe_change_mode(pdata, mode);
+
+ return true;
}
static bool xgbe_use_mode(struct xgbe_prv_data *pdata,
@@ -1178,21 +1180,23 @@ static int xgbe_phy_config_fixed(struct xgbe_prv_data *pdata)
return 0;
}
-static int __xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
+static int __xgbe_phy_config_aneg(struct xgbe_prv_data *pdata, bool set_mode)
{
int ret;
+ mutex_lock(&pdata->an_mutex);
+
set_bit(XGBE_LINK_INIT, &pdata->dev_state);
pdata->link_check = jiffies;
ret = pdata->phy_if.phy_impl.an_config(pdata);
if (ret)
- return ret;
+ goto out;
if (pdata->phy.autoneg != AUTONEG_ENABLE) {
ret = xgbe_phy_config_fixed(pdata);
if (ret || !pdata->kr_redrv)
- return ret;
+ goto out;
netif_dbg(pdata, link, pdata->netdev, "AN redriver support\n");
} else {
@@ -1202,24 +1206,27 @@ static int __xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
/* Disable auto-negotiation interrupt */
disable_irq(pdata->an_irq);
- /* Start auto-negotiation in a supported mode */
- if (xgbe_use_mode(pdata, XGBE_MODE_KR)) {
- xgbe_set_mode(pdata, XGBE_MODE_KR);
- } else if (xgbe_use_mode(pdata, XGBE_MODE_KX_2500)) {
- xgbe_set_mode(pdata, XGBE_MODE_KX_2500);
- } else if (xgbe_use_mode(pdata, XGBE_MODE_KX_1000)) {
- xgbe_set_mode(pdata, XGBE_MODE_KX_1000);
- } else if (xgbe_use_mode(pdata, XGBE_MODE_SFI)) {
- xgbe_set_mode(pdata, XGBE_MODE_SFI);
- } else if (xgbe_use_mode(pdata, XGBE_MODE_X)) {
- xgbe_set_mode(pdata, XGBE_MODE_X);
- } else if (xgbe_use_mode(pdata, XGBE_MODE_SGMII_1000)) {
- xgbe_set_mode(pdata, XGBE_MODE_SGMII_1000);
- } else if (xgbe_use_mode(pdata, XGBE_MODE_SGMII_100)) {
- xgbe_set_mode(pdata, XGBE_MODE_SGMII_100);
- } else {
- enable_irq(pdata->an_irq);
- return -EINVAL;
+ if (set_mode) {
+ /* Start auto-negotiation in a supported mode */
+ if (xgbe_use_mode(pdata, XGBE_MODE_KR)) {
+ xgbe_set_mode(pdata, XGBE_MODE_KR);
+ } else if (xgbe_use_mode(pdata, XGBE_MODE_KX_2500)) {
+ xgbe_set_mode(pdata, XGBE_MODE_KX_2500);
+ } else if (xgbe_use_mode(pdata, XGBE_MODE_KX_1000)) {
+ xgbe_set_mode(pdata, XGBE_MODE_KX_1000);
+ } else if (xgbe_use_mode(pdata, XGBE_MODE_SFI)) {
+ xgbe_set_mode(pdata, XGBE_MODE_SFI);
+ } else if (xgbe_use_mode(pdata, XGBE_MODE_X)) {
+ xgbe_set_mode(pdata, XGBE_MODE_X);
+ } else if (xgbe_use_mode(pdata, XGBE_MODE_SGMII_1000)) {
+ xgbe_set_mode(pdata, XGBE_MODE_SGMII_1000);
+ } else if (xgbe_use_mode(pdata, XGBE_MODE_SGMII_100)) {
+ xgbe_set_mode(pdata, XGBE_MODE_SGMII_100);
+ } else {
+ enable_irq(pdata->an_irq);
+ ret = -EINVAL;
+ goto out;
+ }
}
/* Disable and stop any in progress auto-negotiation */
@@ -1239,16 +1246,7 @@ static int __xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
xgbe_an_init(pdata);
xgbe_an_restart(pdata);
- return 0;
-}
-
-static int xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
-{
- int ret;
-
- mutex_lock(&pdata->an_mutex);
-
- ret = __xgbe_phy_config_aneg(pdata);
+out:
if (ret)
set_bit(XGBE_LINK_ERR, &pdata->dev_state);
else
@@ -1259,6 +1257,16 @@ static int xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
return ret;
}
+static int xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
+{
+ return __xgbe_phy_config_aneg(pdata, true);
+}
+
+static int xgbe_phy_reconfig_aneg(struct xgbe_prv_data *pdata)
+{
+ return __xgbe_phy_config_aneg(pdata, false);
+}
+
static bool xgbe_phy_aneg_done(struct xgbe_prv_data *pdata)
{
return (pdata->an_result == XGBE_AN_COMPLETE);
@@ -1315,7 +1323,8 @@ static void xgbe_phy_status_result(struct xgbe_prv_data *pdata)
pdata->phy.duplex = DUPLEX_FULL;
- xgbe_set_mode(pdata, mode);
+ if (xgbe_set_mode(pdata, mode) && pdata->an_again)
+ xgbe_phy_reconfig_aneg(pdata);
}
static void xgbe_phy_status(struct xgbe_prv_data *pdata)
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index 194a569..3ceb4f9 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -902,6 +902,9 @@ static bool xgbe_phy_belfuse_phy_quirks(struct xgbe_prv_data *pdata)
XGBE_BEL_FUSE_VENDOR, XGBE_SFP_BASE_VENDOR_NAME_LEN))
return false;
+ /* For Bel-Fuse, use the extra AN flag */
+ pdata->an_again = 1;
+
if (memcmp(&sfp_eeprom->base[XGBE_SFP_BASE_VENDOR_PN],
XGBE_BEL_FUSE_PARTNO, XGBE_SFP_BASE_VENDOR_PN_LEN))
return false;
@@ -978,6 +981,9 @@ static int xgbe_phy_find_phy_device(struct xgbe_prv_data *pdata)
if (phy_data->phydev)
return 0;
+ /* Clear the extra AN flag */
+ pdata->an_again = 0;
+
/* Check for the use of an external PHY */
if (phy_data->phydev_mode == XGBE_MDIO_MODE_NONE)
return 0;
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h
index 7a412cf..47bcbcf 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe.h
@@ -1261,6 +1261,7 @@ struct xgbe_prv_data {
enum xgbe_rx kr_state;
enum xgbe_rx kx_state;
struct work_struct an_work;
+ unsigned int an_again;
unsigned int an_supported;
unsigned int parallel_detect;
unsigned int fec_ability;
^ permalink raw reply related
* Re: [PATCH net] tuntap: raise EPOLLOUT on device up
From: Michael S. Tsirkin @ 2018-05-21 22:06 UTC (permalink / raw)
To: Jason Wang; +Cc: netdev, linux-kernel, Hannes Frederic Sowa, Eric Dumazet
In-Reply-To: <e178f169-43c4-8ac4-8334-bc3eba1bd10e@redhat.com>
On Sat, May 19, 2018 at 09:09:11AM +0800, Jason Wang wrote:
>
>
> On 2018年05月18日 22:46, Michael S. Tsirkin wrote:
> > On Fri, May 18, 2018 at 10:11:54PM +0800, Jason Wang wrote:
> > >
> > > On 2018年05月18日 22:06, Michael S. Tsirkin wrote:
> > > > On Fri, May 18, 2018 at 10:00:31PM +0800, Jason Wang wrote:
> > > > > On 2018年05月18日 21:26, Jason Wang wrote:
> > > > > > On 2018年05月18日 21:13, Michael S. Tsirkin wrote:
> > > > > > > On Fri, May 18, 2018 at 09:00:43PM +0800, Jason Wang wrote:
> > > > > > > > We return -EIO on device down but can not raise EPOLLOUT after it was
> > > > > > > > up. This may confuse user like vhost which expects tuntap to raise
> > > > > > > > EPOLLOUT to re-enable its TX routine after tuntap is down. This could
> > > > > > > > be easily reproduced by transmitting packets from VM while down and up
> > > > > > > > the tap device. Fixing this by set SOCKWQ_ASYNC_NOSPACE on -EIO.
> > > > > > > >
> > > > > > > > Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
> > > > > > > > Cc: Eric Dumazet <edumazet@google.com>
> > > > > > > > Fixes: 1bd4978a88ac2 ("tun: honor IFF_UP in tun_get_user()")
> > > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > > > ---
> > > > > > > > drivers/net/tun.c | 4 +++-
> > > > > > > > 1 file changed, 3 insertions(+), 1 deletion(-)
> > > > > > > >
> > > > > > > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> > > > > > > > index d45ac37..1b29761 100644
> > > > > > > > --- a/drivers/net/tun.c
> > > > > > > > +++ b/drivers/net/tun.c
> > > > > > > > @@ -1734,8 +1734,10 @@ static ssize_t tun_get_user(struct
> > > > > > > > tun_struct *tun, struct tun_file *tfile,
> > > > > > > > int skb_xdp = 1;
> > > > > > > > bool frags = tun_napi_frags_enabled(tun);
> > > > > > > > - if (!(tun->dev->flags & IFF_UP))
> > > > > > > > + if (!(tun->dev->flags & IFF_UP)) {
> > > > > > > Isn't this racy? What if flag is cleared at this point?
> > > > > > I think you mean "set at this point"? Then yes, so we probably need to
> > > > > > set the bit during tun_net_close().
> > > > > >
> > > > > > Thanks
> > > > > Looks no need, vhost will poll socket after it see EIO. So we are ok here?
> > > > >
> > > > > Thanks
> > > > In fact I don't even understand why does this help any longer.
> > > >
> > > We disable tx polling and only enable it on demand for a better rx
> > > performance. You may want to have a look at :
> > >
> > > commit feb8892cb441c742d4220cf7ced001e7fa070731
> > > Author: Jason Wang <jasowang@redhat.com>
> > > Date: Mon Nov 13 11:45:34 2017 +0800
> > >
> > > vhost_net: conditionally enable tx polling
> > >
> > > Thanks
> >
> > Question is, what looks at SOCKWQ_ASYNC_NOSPACE.
> > I think it's tested when packet is transmitted,
> > but there is no guarantee here any packet will
> > ever be transmitted.
> >
>
> Well, actually, I do plan to disable vq polling from the beginning. But
> looks like you do not want this:
>
> See https://patchwork.kernel.org/patch/10034025/
>
> Thanks
Not sure I understand what you are saying, it's enabling polling we are
talking about.
--
MST
^ permalink raw reply
* Re: [PATCH net] tuntap: raise EPOLLOUT on device up
From: Michael S. Tsirkin @ 2018-05-21 22:08 UTC (permalink / raw)
To: David Miller; +Cc: jasowang, netdev, linux-kernel, hannes, edumazet
In-Reply-To: <20180521.114742.427929977852677864.davem@davemloft.net>
On Mon, May 21, 2018 at 11:47:42AM -0400, David Miller wrote:
> From: Jason Wang <jasowang@redhat.com>
> Date: Fri, 18 May 2018 21:00:43 +0800
>
> > We return -EIO on device down but can not raise EPOLLOUT after it was
> > up. This may confuse user like vhost which expects tuntap to raise
> > EPOLLOUT to re-enable its TX routine after tuntap is down. This could
> > be easily reproduced by transmitting packets from VM while down and up
> > the tap device. Fixing this by set SOCKWQ_ASYNC_NOSPACE on -EIO.
> >
> > Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
> > Cc: Eric Dumazet <edumazet@google.com>
> > Fixes: 1bd4978a88ac2 ("tun: honor IFF_UP in tun_get_user()")
> > Signed-off-by: Jason Wang <jasowang@redhat.com>
>
> I'm no so sure what to do with this patch.
>
> Like Michael says, this flag bit is only checks upon transmit which
> may or may not happen after this point. It doesn't seem to be
> guaranteed.
Jason, can't we detect a link up transition and respond accordingly?
What do you think?
--
MST
^ permalink raw reply
* [PATCH net-next 0/2] tcp: reduce quickack pressure for ECN
From: Eric Dumazet @ 2018-05-21 22:08 UTC (permalink / raw)
To: David S . Miller
Cc: netdev, Van Jacobson, Neal Cardwell, Yuchung Cheng,
Soheil Hassas Yeganeh, Eric Dumazet, Eric Dumazet
Small patch series changing TCP behavior vs quickack and ECN
First patch is a refactoring, adding parameter to tcp_incr_quickack()
and tcp_enter_quickack_mode() helpers.
Second patch implements the change, lowering number of ACK packets
sent after an ECN event.
Eric Dumazet (2):
tcp: add max_quickacks param to tcp_incr_quickack and
tcp_enter_quickack_mode
tcp: do not aggressively quick ack after ECN events
net/ipv4/tcp_input.c | 24 +++++++++++++-----------
1 file changed, 13 insertions(+), 11 deletions(-)
--
2.17.0.441.gb46fe60e1d-goog
^ permalink raw reply
* [PATCH net-next 1/2] tcp: add max_quickacks param to tcp_incr_quickack and tcp_enter_quickack_mode
From: Eric Dumazet @ 2018-05-21 22:08 UTC (permalink / raw)
To: David S . Miller
Cc: netdev, Van Jacobson, Neal Cardwell, Yuchung Cheng,
Soheil Hassas Yeganeh, Eric Dumazet, Eric Dumazet
In-Reply-To: <20180521220857.229273-1-edumazet@google.com>
We want to add finer control of the number of ACK packets sent after
ECN events.
This patch is not changing current behavior, it only enables following
change.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/ipv4/tcp_input.c | 24 +++++++++++++-----------
1 file changed, 13 insertions(+), 11 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index aebb29ab2fdf2ceaa182cd11928f145a886149ff..2e970e9f4e09d966b703af2d14d521a4328eba7e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -203,21 +203,23 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
}
}
-static void tcp_incr_quickack(struct sock *sk)
+static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
{
struct inet_connection_sock *icsk = inet_csk(sk);
unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
if (quickacks == 0)
quickacks = 2;
+ quickacks = min(quickacks, max_quickacks);
if (quickacks > icsk->icsk_ack.quick)
- icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
+ icsk->icsk_ack.quick = quickacks;
}
-static void tcp_enter_quickack_mode(struct sock *sk)
+static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- tcp_incr_quickack(sk);
+
+ tcp_incr_quickack(sk, max_quickacks);
icsk->icsk_ack.pingpong = 0;
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
@@ -261,7 +263,7 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
* it is probably a retransmit.
*/
if (tp->ecn_flags & TCP_ECN_SEEN)
- tcp_enter_quickack_mode((struct sock *)tp);
+ tcp_enter_quickack_mode((struct sock *)tp, TCP_MAX_QUICKACKS);
break;
case INET_ECN_CE:
if (tcp_ca_needs_ecn((struct sock *)tp))
@@ -269,7 +271,7 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
/* Better not delay acks, sender can have a very low cwnd */
- tcp_enter_quickack_mode((struct sock *)tp);
+ tcp_enter_quickack_mode((struct sock *)tp, TCP_MAX_QUICKACKS);
tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
}
tp->ecn_flags |= TCP_ECN_SEEN;
@@ -686,7 +688,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
/* The _first_ data packet received, initialize
* delayed ACK engine.
*/
- tcp_incr_quickack(sk);
+ tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
icsk->icsk_ack.ato = TCP_ATO_MIN;
} else {
int m = now - icsk->icsk_ack.lrcvtime;
@@ -702,7 +704,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
/* Too long gap. Apparently sender failed to
* restart window, so that we send ACKs quickly.
*/
- tcp_incr_quickack(sk);
+ tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
sk_mem_reclaim(sk);
}
}
@@ -4179,7 +4181,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
- tcp_enter_quickack_mode(sk);
+ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
@@ -4706,7 +4708,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
out_of_window:
- tcp_enter_quickack_mode(sk);
+ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
inet_csk_schedule_ack(sk);
drop:
tcp_drop(sk, skb);
@@ -5790,7 +5792,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* to stand against the temptation 8) --ANK
*/
inet_csk_schedule_ack(sk);
- tcp_enter_quickack_mode(sk);
+ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
TCP_DELACK_MAX, TCP_RTO_MAX);
--
2.17.0.441.gb46fe60e1d-goog
^ permalink raw reply related
* [PATCH net-next 2/2] tcp: do not aggressively quick ack after ECN events
From: Eric Dumazet @ 2018-05-21 22:08 UTC (permalink / raw)
To: David S . Miller
Cc: netdev, Van Jacobson, Neal Cardwell, Yuchung Cheng,
Soheil Hassas Yeganeh, Eric Dumazet, Eric Dumazet
In-Reply-To: <20180521220857.229273-1-edumazet@google.com>
ECN signals currently forces TCP to enter quickack mode for
up to 16 (TCP_MAX_QUICKACKS) following incoming packets.
We believe this is not needed, and only sending one immediate ack
for the current packet should be enough.
This should reduce the extra load noticed in DCTCP environments,
after congestion events.
This is part 2 of our effort to reduce pure ACK packets.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/ipv4/tcp_input.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2e970e9f4e09d966b703af2d14d521a4328eba7e..1191cac72109f2f7e2b688ddbc1d404151d274d6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -263,7 +263,7 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
* it is probably a retransmit.
*/
if (tp->ecn_flags & TCP_ECN_SEEN)
- tcp_enter_quickack_mode((struct sock *)tp, TCP_MAX_QUICKACKS);
+ tcp_enter_quickack_mode((struct sock *)tp, 1);
break;
case INET_ECN_CE:
if (tcp_ca_needs_ecn((struct sock *)tp))
@@ -271,7 +271,7 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
/* Better not delay acks, sender can have a very low cwnd */
- tcp_enter_quickack_mode((struct sock *)tp, TCP_MAX_QUICKACKS);
+ tcp_enter_quickack_mode((struct sock *)tp, 1);
tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
}
tp->ecn_flags |= TCP_ECN_SEEN;
--
2.17.0.441.gb46fe60e1d-goog
^ permalink raw reply related
* Reply
From: conapreb @ 2018-05-21 15:29 UTC (permalink / raw)
To: Recipients
Are you free for discussion?
^ permalink raw reply
* Re: [RFC PATCH net-next 10/12] vhost_net: build xdp buff
From: Michael S. Tsirkin @ 2018-05-21 22:21 UTC (permalink / raw)
To: Jesse Brandeburg; +Cc: Jason Wang, kvm, virtualization, netdev, linux-kernel
In-Reply-To: <20180521095611.00005caa@intel.com>
On Mon, May 21, 2018 at 09:56:11AM -0700, Jesse Brandeburg wrote:
> On Mon, 21 May 2018 17:04:31 +0800 Jason wrote:
> > This patch implement build XDP buffers in vhost_net. The idea is do
> > userspace copy in vhost_net and build XDP buff based on the
> > page. Vhost_net can then submit one or an array of XDP buffs to
> > underlayer socket (e.g TUN). TUN can choose to do XDP or call
> > build_skb() to build skb. To support build skb, vnet header were also
> > stored into the header of the XDP buff.
> >
> > This userspace copy and XDP buffs building is key to achieve XDP
> > batching in TUN, since TUN does not need to care about userspace copy
> > and then can disable premmption for several XDP buffs to achieve
> > batching from XDP.
> >
> > TODO: reserve headroom based on the TUN XDP.
> >
> > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > ---
> > drivers/vhost/net.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 74 insertions(+)
> >
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index f0639d7..1209e84 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -492,6 +492,80 @@ static bool vhost_has_more_pkts(struct vhost_net *net,
> > likely(!vhost_exceeds_maxpend(net));
> > }
> >
> > +#define VHOST_NET_HEADROOM 256
> > +#define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
> > +
> > +static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
> > + struct iov_iter *from,
> > + struct xdp_buff *xdp)
> > +{
> > + struct vhost_virtqueue *vq = &nvq->vq;
> > + struct page_frag *alloc_frag = ¤t->task_frag;
> > + struct virtio_net_hdr *gso;
> > + size_t len = iov_iter_count(from);
> > + int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> > + int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + VHOST_NET_HEADROOM
> > + + nvq->sock_hlen);
> > + int sock_hlen = nvq->sock_hlen;
> > + void *buf;
> > + int copied;
> > +
> > + if (len < nvq->sock_hlen)
> > + return -EFAULT;
> > +
> > + if (SKB_DATA_ALIGN(len + pad) +
> > + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
> > + return -ENOSPC;
> > +
> > + buflen += SKB_DATA_ALIGN(len + pad);
>
> maybe store the result of SKB_DATA_ALIGN in a local instead of doing
> the work twice?
I don't mind, but I guess gcc can always do it itself?
> > + alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
> > + if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
> > + return -ENOMEM;
> > +
> > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > +
> > + /* We store two kinds of metadata in the header which will be
> > + * used for XDP_PASS to do build_skb():
> > + * offset 0: buflen
> > + * offset sizeof(int): vnet header
> > + */
> > + copied = copy_page_from_iter(alloc_frag->page,
> > + alloc_frag->offset + sizeof(int), sock_hlen, from);
> > + if (copied != sock_hlen)
> > + return -EFAULT;
> > +
> > + gso = (struct virtio_net_hdr *)(buf + sizeof(int));
> > +
> > + if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
> > + vhost16_to_cpu(vq, gso->csum_start) +
> > + vhost16_to_cpu(vq, gso->csum_offset) + 2 >
> > + vhost16_to_cpu(vq, gso->hdr_len)) {
> > + gso->hdr_len = cpu_to_vhost16(vq,
> > + vhost16_to_cpu(vq, gso->csum_start) +
> > + vhost16_to_cpu(vq, gso->csum_offset) + 2);
> > +
> > + if (vhost16_to_cpu(vq, gso->hdr_len) > len)
> > + return -EINVAL;
> > + }
> > +
> > + len -= sock_hlen;
> > + copied = copy_page_from_iter(alloc_frag->page,
> > + alloc_frag->offset + pad,
> > + len, from);
> > + if (copied != len)
> > + return -EFAULT;
> > +
> > + xdp->data_hard_start = buf;
> > + xdp->data = buf + pad;
> > + xdp->data_end = xdp->data + len;
> > + *(int *)(xdp->data_hard_start)= buflen;
>
> space before =
>
> > +
> > + get_page(alloc_frag->page);
> > + alloc_frag->offset += buflen;
> > +
> > + return 0;
> > +}
> > +
> > static void handle_tx_copy(struct vhost_net *net)
> > {
> > struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
^ permalink raw reply
* Re: [PATCH net-next] r8169: perform reset synchronously in __rtl8169_resume
From: Heiner Kallweit @ 2018-05-21 22:38 UTC (permalink / raw)
To: Francois Romieu
Cc: David Miller, Realtek linux nic maintainers,
netdev@vger.kernel.org
In-Reply-To: <20180521212422.GA25067@electric-eye.fr.zoreil.com>
Am 21.05.2018 um 23:24 schrieb Francois Romieu:
> Heiner Kallweit <hkallweit1@gmail.com> :
> [...]
>> diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
>> index 75dfac024..1eb4f625a 100644
>> --- a/drivers/net/ethernet/realtek/r8169.c
>> +++ b/drivers/net/ethernet/realtek/r8169.c
>> @@ -7327,9 +7327,9 @@ static void __rtl8169_resume(struct net_device *dev)
>> rtl_lock_work(tp);
>> napi_enable(&tp->napi);
>> set_bit(RTL_FLAG_TASK_ENABLED, tp->wk.flags);
>> + if (netif_running(dev))
>> + rtl_reset_work(tp);
>> rtl_unlock_work(tp);
>> -
>> - rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING);
>> }
>>
>> static int rtl8169_resume(struct device *device)
>
> netif_running() returns true before rtl_open(): issuing rtl_reset_work()
> synchronously against uninitialized rings right at the start of rtl_open()
> won't work as expected.
>
rtl8169_runtime_resume() has a check for tp->TxDescArray at the beginning.
Therefore it will return immediately before even calling
__rtl8169_resume(), because tp->TxDescArray is set in rtl_open() only.
So I don't think the described issue exists.
However I have to admit that I wasn't aware that netif_running() is true
already when entering rtl_open(), so thanks for the hint.
> pm_runtime_get_sync() could be issued later in rtl_open() (but I would
> not mind something different with runtime_{resume/suspend} in open/close).
>
^ permalink raw reply
* Re: [PATCH v2] ath10k: transmit queued frames after waking queues
From: Rajkumar Manoharan @ 2018-05-21 23:11 UTC (permalink / raw)
To: Niklas Cassel
Cc: Kalle Valo, David S. Miller, ath10k, linux-wireless, netdev,
linux-kernel, linux-wireless-owner
In-Reply-To: <20180521204359.23884-1-niklas.cassel@linaro.org>
On 2018-05-21 13:43, Niklas Cassel wrote:
> The following problem was observed when running iperf:
[...]
>
> In order to avoid trying to flush the queue every time we free a frame,
> only do this when there are 3 or less frames pending, and while we
> actually have frames in the queue. This logic was copied from
> mt76_txq_schedule (mt76), one of few other drivers that are actually
> using wake_tx_queue.
>
> Suggested-by: Toke Høiland-Jørgensen <toke@toke.dk>
> Signed-off-by: Niklas Cassel <niklas.cassel@linaro.org>
> ---
> Changes since V1: use READ_ONCE() to disallow the compiler
> optimizing things in undesirable ways.
>
> drivers/net/wireless/ath/ath10k/txrx.c | 3 +++
> 1 file changed, 3 insertions(+)
>
> diff --git a/drivers/net/wireless/ath/ath10k/txrx.c
> b/drivers/net/wireless/ath/ath10k/txrx.c
> index cda164f6e9f6..264cf0bd5c00 100644
> --- a/drivers/net/wireless/ath/ath10k/txrx.c
> +++ b/drivers/net/wireless/ath/ath10k/txrx.c
> @@ -95,6 +95,9 @@ int ath10k_txrx_tx_unref(struct ath10k_htt *htt,
> wake_up(&htt->empty_tx_wq);
> spin_unlock_bh(&htt->tx_lock);
>
> + if (READ_ONCE(htt->num_pending_tx) <= 3 && !list_empty(&ar->txqs))
> + ath10k_mac_tx_push_pending(ar);
> +
Niklas,
Sorry for the late response. ath10k_mac_tx_push_pending is already
called
at the end of NAPI handler. Isn't that enough to process pending frames?
Earlier we observed performance issues in calling push_pending from each
tx completion. IMHO this change may introduce the same problem again.
-Rajkumar
^ permalink raw reply
* Re: [PATCH bpf-next 1/5] bpf: Hooks for sys_sendmsg
From: Martin KaFai Lau @ 2018-05-21 23:16 UTC (permalink / raw)
To: Andrey Ignatov; +Cc: netdev, davem, ast, daniel, kernel-team
In-Reply-To: <654b59700a21ed2342b12f30a583ba9329ef850b.1526694154.git.rdna@fb.com>
On Fri, May 18, 2018 at 07:21:09PM -0700, Andrey Ignatov wrote:
> In addition to already existing BPF hooks for sys_bind and sys_connect,
> the patch provides new hooks for sys_sendmsg.
>
> It leverages existing BPF program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR`
> that provides access to socket itlself (properties like family, type,
> protocol) and user-passed `struct sockaddr *` so that BPF program can
> override destination IP and port for system calls such as sendto(2) or
> sendmsg(2) and/or assign source IP to the socket.
>
> The hooks are implemented as two new attach types:
> `BPF_CGROUP_UDP4_SENDMSG` and `BPF_CGROUP_UDP6_SENDMSG` for UDPv4 and
> UDPv6 correspondingly.
>
> UDPv4 and UDPv6 separate attach types for same reason as sys_bind and
> sys_connect hooks, i.e. to prevent reading from / writing to e.g.
> user_ip6 fields when user passes sockaddr_in since it'd be out-of-bound.
>
> The difference with already existing hooks is sys_sendmsg are
> implemented only for unconnected UDP.
>
> For TCP it doesn't make sense to change user-provided `struct sockaddr *`
> at sendto(2)/sendmsg(2) time since socket either was already connected
> and has source/destination set or wasn't connected and call to
> sendto(2)/sendmsg(2) would lead to ENOTCONN anyway.
>
> Connected UDP is already handled by sys_connect hooks that can override
> source/destination at connect time and use fast-path later, i.e. these
> hooks don't affect UDP fast-path.
>
> Rewriting source IP is implemented differently than that in sys_connect
> hooks. When sys_sendmsg is used with unconnected UDP it doesn't work to
> just bind socket to desired local IP address since source IP can be set
> on per-packet basis by using ancillary data (cmsg(3)). So no matter if
> socket is bound or not, source IP has to be rewritten on every call to
> sys_sendmsg.
>
> To do so two new fields are added to UAPI `struct bpf_sock_addr`;
> * `msg_src_ip4` to set source IPv4 for UDPv4;
> * `msg_src_ip6` to set source IPv6 for UDPv6.
>
> Signed-off-by: Andrey Ignatov <rdna@fb.com>
> Acked-by: Alexei Starovoitov <ast@kernel.org>
> ---
> include/linux/bpf-cgroup.h | 23 +++++++++++++++++------
> include/linux/filter.h | 1 +
> include/uapi/linux/bpf.h | 8 ++++++++
> kernel/bpf/cgroup.c | 11 ++++++++++-
> kernel/bpf/syscall.c | 8 ++++++++
> net/core/filter.c | 39 +++++++++++++++++++++++++++++++++++++++
> net/ipv4/udp.c | 20 ++++++++++++++++++--
> net/ipv6/udp.c | 17 +++++++++++++++++
> 8 files changed, 118 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> index 30d15e6..46f01ba 100644
> --- a/include/linux/bpf-cgroup.h
> +++ b/include/linux/bpf-cgroup.h
> @@ -66,7 +66,8 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
>
> int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
> struct sockaddr *uaddr,
> - enum bpf_attach_type type);
> + enum bpf_attach_type type,
> + void *t_ctx);
>
> int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
> struct bpf_sock_ops_kern *sock_ops,
> @@ -120,16 +121,18 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
> ({ \
> int __ret = 0; \
> if (cgroup_bpf_enabled) \
> - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type); \
> + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \
> + NULL); \
> __ret; \
> })
>
> -#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type) \
> +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) \
> ({ \
> int __ret = 0; \
> if (cgroup_bpf_enabled) { \
> lock_sock(sk); \
> - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type); \
> + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \
> + t_ctx); \
> release_sock(sk); \
> } \
> __ret; \
> @@ -151,10 +154,16 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
> BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT)
>
> #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) \
> - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT)
> + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT, NULL)
>
> #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) \
> - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT)
> + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT, NULL)
> +
> +#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) \
> + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_SENDMSG, t_ctx)
> +
> +#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) \
> + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_SENDMSG, t_ctx)
>
> #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \
> ({ \
> @@ -197,6 +206,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
> #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; })
> #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; })
> #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; })
> +#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr) ({ 0; })
> +#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr) ({ 0; })
> #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
> #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
>
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index d358d18..d90abda 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -1010,6 +1010,7 @@ struct bpf_sock_addr_kern {
> * only two (src and dst) are available at convert_ctx_access time
> */
> u64 tmp_reg;
> + void *t_ctx; /* Attach type specific context. */
> };
>
> struct bpf_sock_ops_kern {
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 97446bb..b70ad2c 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -158,6 +158,8 @@ enum bpf_attach_type {
> BPF_CGROUP_INET6_CONNECT,
> BPF_CGROUP_INET4_POST_BIND,
> BPF_CGROUP_INET6_POST_BIND,
> + BPF_CGROUP_UDP4_SENDMSG,
> + BPF_CGROUP_UDP6_SENDMSG,
> __MAX_BPF_ATTACH_TYPE
> };
>
> @@ -2247,6 +2249,12 @@ struct bpf_sock_addr {
> __u32 family; /* Allows 4-byte read, but no write */
> __u32 type; /* Allows 4-byte read, but no write */
> __u32 protocol; /* Allows 4-byte read, but no write */
> + __u32 msg_src_ip4; /* Allows 1,2,4-byte read an 4-byte write.
> + * Stored in network byte order.
> + */
> + __u32 msg_src_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write.
> + * Stored in network byte order.
> + */
> };
>
> /* User bpf_sock_ops struct to access socket values and specify request ops
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index 43171a0..f7c00bd 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -500,6 +500,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
> * @sk: sock struct that will use sockaddr
> * @uaddr: sockaddr struct provided by user
> * @type: The type of program to be exectuted
> + * @t_ctx: Pointer to attach type specific context
> *
> * socket is expected to be of type INET or INET6.
> *
> @@ -508,12 +509,15 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
> */
> int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
> struct sockaddr *uaddr,
> - enum bpf_attach_type type)
> + enum bpf_attach_type type,
> + void *t_ctx)
> {
> struct bpf_sock_addr_kern ctx = {
> .sk = sk,
> .uaddr = uaddr,
> + .t_ctx = t_ctx,
> };
> + struct sockaddr_storage unspec;
> struct cgroup *cgrp;
> int ret;
>
> @@ -523,6 +527,11 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
> if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
> return 0;
>
> + if (!ctx.uaddr) {
> + memset(&unspec, 0, sizeof(unspec));
> + ctx.uaddr = (struct sockaddr *)&unspec;
> + }
> +
> cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
> ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
>
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index bfcde94..11a5a95 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -1247,6 +1247,8 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
> case BPF_CGROUP_INET6_BIND:
> case BPF_CGROUP_INET4_CONNECT:
> case BPF_CGROUP_INET6_CONNECT:
> + case BPF_CGROUP_UDP4_SENDMSG:
> + case BPF_CGROUP_UDP6_SENDMSG:
> return 0;
> default:
> return -EINVAL;
> @@ -1563,6 +1565,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
> case BPF_CGROUP_INET6_BIND:
> case BPF_CGROUP_INET4_CONNECT:
> case BPF_CGROUP_INET6_CONNECT:
> + case BPF_CGROUP_UDP4_SENDMSG:
> + case BPF_CGROUP_UDP6_SENDMSG:
> ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
> break;
> case BPF_CGROUP_SOCK_OPS:
> @@ -1633,6 +1637,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
> case BPF_CGROUP_INET6_BIND:
> case BPF_CGROUP_INET4_CONNECT:
> case BPF_CGROUP_INET6_CONNECT:
> + case BPF_CGROUP_UDP4_SENDMSG:
> + case BPF_CGROUP_UDP6_SENDMSG:
> ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
> break;
> case BPF_CGROUP_SOCK_OPS:
> @@ -1690,6 +1696,8 @@ static int bpf_prog_query(const union bpf_attr *attr,
> case BPF_CGROUP_INET6_POST_BIND:
> case BPF_CGROUP_INET4_CONNECT:
> case BPF_CGROUP_INET6_CONNECT:
> + case BPF_CGROUP_UDP4_SENDMSG:
> + case BPF_CGROUP_UDP6_SENDMSG:
> case BPF_CGROUP_SOCK_OPS:
> case BPF_CGROUP_DEVICE:
> break;
> diff --git a/net/core/filter.c b/net/core/filter.c
> index aec5eba..f696dc9 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -5010,6 +5010,7 @@ static bool sock_addr_is_valid_access(int off, int size,
> switch (prog->expected_attach_type) {
> case BPF_CGROUP_INET4_BIND:
> case BPF_CGROUP_INET4_CONNECT:
> + case BPF_CGROUP_UDP4_SENDMSG:
> break;
> default:
> return false;
> @@ -5019,6 +5020,24 @@ static bool sock_addr_is_valid_access(int off, int size,
> switch (prog->expected_attach_type) {
> case BPF_CGROUP_INET6_BIND:
> case BPF_CGROUP_INET6_CONNECT:
> + case BPF_CGROUP_UDP6_SENDMSG:
> + break;
> + default:
> + return false;
> + }
> + break;
> + case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
> + switch (prog->expected_attach_type) {
> + case BPF_CGROUP_UDP4_SENDMSG:
> + break;
> + default:
> + return false;
> + }
> + break;
> + case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
> + msg_src_ip6[3]):
> + switch (prog->expected_attach_type) {
> + case BPF_CGROUP_UDP6_SENDMSG:
> break;
> default:
> return false;
> @@ -5029,6 +5048,9 @@ static bool sock_addr_is_valid_access(int off, int size,
> switch (off) {
> case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
> case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
> + case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
> + case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
> + msg_src_ip6[3]):
> /* Only narrow read access allowed for now. */
> if (type == BPF_READ) {
> bpf_ctx_record_field_size(info, size_default);
> @@ -5783,6 +5805,23 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
> *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg,
> SK_FL_PROTO_SHIFT);
> break;
> +
> + case offsetof(struct bpf_sock_addr, msg_src_ip4):
> + /* Treat t_ctx as struct in_addr for msg_src_ip4. */
> + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
> + struct bpf_sock_addr_kern, struct in_addr, t_ctx,
> + s_addr, BPF_SIZE(si->code), 0, tmp_reg);
> + break;
> +
> + case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
> + msg_src_ip6[3]):
> + off = si->off;
> + off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]);
> + /* Treat t_ctx as struct in6_addr for msg_src_ip6. */
> + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
> + struct bpf_sock_addr_kern, struct in6_addr, t_ctx,
> + s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg);
> + break;
> }
>
> return insn - insn_buf;
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index ff4d4ba..a1f9ba2 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -900,6 +900,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> {
> struct inet_sock *inet = inet_sk(sk);
> struct udp_sock *up = udp_sk(sk);
> + DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
> struct flowi4 fl4_stack;
> struct flowi4 *fl4;
> int ulen = len;
> @@ -954,8 +955,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> /*
> * Get and verify the address.
> */
> - if (msg->msg_name) {
> - DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
> + if (usin) {
> if (msg->msg_namelen < sizeof(*usin))
> return -EINVAL;
> if (usin->sin_family != AF_INET) {
> @@ -1009,6 +1009,22 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> rcu_read_unlock();
> }
>
> + if (!connected) {
> + err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
> + (struct sockaddr *)usin, &ipc.addr);
> + if (err)
> + goto out_free;
> + if (usin) {
> + if (usin->sin_port == 0) {
> + /* BPF program set invalid port. Reject it. */
> + err = -EINVAL;
> + goto out_free;
> + }
> + daddr = usin->sin_addr.s_addr;
> + dport = usin->sin_port;
> + }
> + }
> +
> saddr = ipc.addr;
> ipc.addr = faddr = daddr;
>
> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> index 2839c1b..6f580ea 100644
> --- a/net/ipv6/udp.c
> +++ b/net/ipv6/udp.c
> @@ -1315,6 +1315,22 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> fl6.saddr = np->saddr;
> fl6.fl6_sport = inet->inet_sport;
>
> + if (!connected) {
> + err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
> + (struct sockaddr *)sin6, &fl6.saddr);
> + if (err)
> + goto out_no_dst;
> + if (sin6) {
> + if (sin6->sin6_port == 0) {
> + /* BPF program set invalid port. Reject it. */
> + err = -EINVAL;
> + goto out_no_dst;
> + }
> + fl6.fl6_dport = sin6->sin6_port;
> + fl6.daddr = sin6->sin6_addr;
Could the bpf_prog change sin6 to a v4mapped address?
> + }
> + }
> +
> final_p = fl6_update_dst(&fl6, opt, &final);
It seems fl6_update_dst() may update fl6->daddr.
Is it fine (or inline with the expectation after running
a bpf_prog that has changed user_ip6)?
> if (final_p)
> connected = false;
> @@ -1394,6 +1410,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>
> out:
> dst_release(dst);
> +out_no_dst:
A nit. If dst is init to NULL, can a new exit label
be avoided?
> fl6_sock_release(flowlabel);
> txopt_put(opt_to_free);
> if (!err)
> --
> 2.9.5
>
^ permalink raw reply
* Re: [PATCH bpf-next 1/5] bpf: Hooks for sys_sendmsg
From: kbuild test robot @ 2018-05-21 23:33 UTC (permalink / raw)
To: Andrey Ignatov
Cc: kbuild-all, netdev, Andrey Ignatov, davem, ast, daniel,
kernel-team
In-Reply-To: <654b59700a21ed2342b12f30a583ba9329ef850b.1526694154.git.rdna@fb.com>
[-- Attachment #1: Type: text/plain, Size: 11021 bytes --]
Hi Andrey,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on bpf-next/master]
url: https://github.com/0day-ci/linux/commits/Andrey-Ignatov/bpf-Hooks-for-sys_sendmsg/20180522-065614
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: x86_64-randconfig-x003-201820 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64
All errors (new ones prefixed by >>):
net/ipv4/udp.c: In function 'udp_sendmsg':
>> net/ipv4/udp.c:1014:44: error: macro "BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK" passed 3 arguments, but takes just 2
(struct sockaddr *)usin, &ipc.addr);
^
>> net/ipv4/udp.c:1013:9: error: 'BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK' undeclared (first use in this function); did you mean 'BPF_CGROUP_UDP4_SENDMSG'?
err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BPF_CGROUP_UDP4_SENDMSG
net/ipv4/udp.c:1013:9: note: each undeclared identifier is reported only once for each function it appears in
--
net/ipv6/udp.c: In function 'udpv6_sendmsg':
>> net/ipv6/udp.c:1320:44: error: macro "BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK" passed 3 arguments, but takes just 2
(struct sockaddr *)sin6, &fl6.saddr);
^
>> net/ipv6/udp.c:1319:9: error: 'BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK' undeclared (first use in this function); did you mean 'BPF_CGROUP_UDP6_SENDMSG'?
err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BPF_CGROUP_UDP6_SENDMSG
net/ipv6/udp.c:1319:9: note: each undeclared identifier is reported only once for each function it appears in
vim +/BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK +1014 net/ipv4/udp.c
898
899 int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
900 {
901 struct inet_sock *inet = inet_sk(sk);
902 struct udp_sock *up = udp_sk(sk);
903 DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
904 struct flowi4 fl4_stack;
905 struct flowi4 *fl4;
906 int ulen = len;
907 struct ipcm_cookie ipc;
908 struct rtable *rt = NULL;
909 int free = 0;
910 int connected = 0;
911 __be32 daddr, faddr, saddr;
912 __be16 dport;
913 u8 tos;
914 int err, is_udplite = IS_UDPLITE(sk);
915 int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
916 int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
917 struct sk_buff *skb;
918 struct ip_options_data opt_copy;
919
920 if (len > 0xFFFF)
921 return -EMSGSIZE;
922
923 /*
924 * Check the flags.
925 */
926
927 if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
928 return -EOPNOTSUPP;
929
930 ipc.opt = NULL;
931 ipc.tx_flags = 0;
932 ipc.ttl = 0;
933 ipc.tos = -1;
934
935 getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
936
937 fl4 = &inet->cork.fl.u.ip4;
938 if (up->pending) {
939 /*
940 * There are pending frames.
941 * The socket lock must be held while it's corked.
942 */
943 lock_sock(sk);
944 if (likely(up->pending)) {
945 if (unlikely(up->pending != AF_INET)) {
946 release_sock(sk);
947 return -EINVAL;
948 }
949 goto do_append_data;
950 }
951 release_sock(sk);
952 }
953 ulen += sizeof(struct udphdr);
954
955 /*
956 * Get and verify the address.
957 */
958 if (usin) {
959 if (msg->msg_namelen < sizeof(*usin))
960 return -EINVAL;
961 if (usin->sin_family != AF_INET) {
962 if (usin->sin_family != AF_UNSPEC)
963 return -EAFNOSUPPORT;
964 }
965
966 daddr = usin->sin_addr.s_addr;
967 dport = usin->sin_port;
968 if (dport == 0)
969 return -EINVAL;
970 } else {
971 if (sk->sk_state != TCP_ESTABLISHED)
972 return -EDESTADDRREQ;
973 daddr = inet->inet_daddr;
974 dport = inet->inet_dport;
975 /* Open fast path for connected socket.
976 Route will not be used, if at least one option is set.
977 */
978 connected = 1;
979 }
980
981 ipc.sockc.tsflags = sk->sk_tsflags;
982 ipc.addr = inet->inet_saddr;
983 ipc.oif = sk->sk_bound_dev_if;
984 ipc.gso_size = up->gso_size;
985
986 if (msg->msg_controllen) {
987 err = udp_cmsg_send(sk, msg, &ipc.gso_size);
988 if (err > 0)
989 err = ip_cmsg_send(sk, msg, &ipc,
990 sk->sk_family == AF_INET6);
991 if (unlikely(err < 0)) {
992 kfree(ipc.opt);
993 return err;
994 }
995 if (ipc.opt)
996 free = 1;
997 connected = 0;
998 }
999 if (!ipc.opt) {
1000 struct ip_options_rcu *inet_opt;
1001
1002 rcu_read_lock();
1003 inet_opt = rcu_dereference(inet->inet_opt);
1004 if (inet_opt) {
1005 memcpy(&opt_copy, inet_opt,
1006 sizeof(*inet_opt) + inet_opt->opt.optlen);
1007 ipc.opt = &opt_copy.opt;
1008 }
1009 rcu_read_unlock();
1010 }
1011
1012 if (!connected) {
> 1013 err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
> 1014 (struct sockaddr *)usin, &ipc.addr);
1015 if (err)
1016 goto out_free;
1017 if (usin) {
1018 if (usin->sin_port == 0) {
1019 /* BPF program set invalid port. Reject it. */
1020 err = -EINVAL;
1021 goto out_free;
1022 }
1023 daddr = usin->sin_addr.s_addr;
1024 dport = usin->sin_port;
1025 }
1026 }
1027
1028 saddr = ipc.addr;
1029 ipc.addr = faddr = daddr;
1030
1031 sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags);
1032
1033 if (ipc.opt && ipc.opt->opt.srr) {
1034 if (!daddr) {
1035 err = -EINVAL;
1036 goto out_free;
1037 }
1038 faddr = ipc.opt->opt.faddr;
1039 connected = 0;
1040 }
1041 tos = get_rttos(&ipc, inet);
1042 if (sock_flag(sk, SOCK_LOCALROUTE) ||
1043 (msg->msg_flags & MSG_DONTROUTE) ||
1044 (ipc.opt && ipc.opt->opt.is_strictroute)) {
1045 tos |= RTO_ONLINK;
1046 connected = 0;
1047 }
1048
1049 if (ipv4_is_multicast(daddr)) {
1050 if (!ipc.oif)
1051 ipc.oif = inet->mc_index;
1052 if (!saddr)
1053 saddr = inet->mc_addr;
1054 connected = 0;
1055 } else if (!ipc.oif) {
1056 ipc.oif = inet->uc_index;
1057 } else if (ipv4_is_lbcast(daddr) && inet->uc_index) {
1058 /* oif is set, packet is to local broadcast and
1059 * and uc_index is set. oif is most likely set
1060 * by sk_bound_dev_if. If uc_index != oif check if the
1061 * oif is an L3 master and uc_index is an L3 slave.
1062 * If so, we want to allow the send using the uc_index.
1063 */
1064 if (ipc.oif != inet->uc_index &&
1065 ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
1066 inet->uc_index)) {
1067 ipc.oif = inet->uc_index;
1068 }
1069 }
1070
1071 if (connected)
1072 rt = (struct rtable *)sk_dst_check(sk, 0);
1073
1074 if (!rt) {
1075 struct net *net = sock_net(sk);
1076 __u8 flow_flags = inet_sk_flowi_flags(sk);
1077
1078 fl4 = &fl4_stack;
1079
1080 flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
1081 RT_SCOPE_UNIVERSE, sk->sk_protocol,
1082 flow_flags,
1083 faddr, saddr, dport, inet->inet_sport,
1084 sk->sk_uid);
1085
1086 security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
1087 rt = ip_route_output_flow(net, fl4, sk);
1088 if (IS_ERR(rt)) {
1089 err = PTR_ERR(rt);
1090 rt = NULL;
1091 if (err == -ENETUNREACH)
1092 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
1093 goto out;
1094 }
1095
1096 err = -EACCES;
1097 if ((rt->rt_flags & RTCF_BROADCAST) &&
1098 !sock_flag(sk, SOCK_BROADCAST))
1099 goto out;
1100 if (connected)
1101 sk_dst_set(sk, dst_clone(&rt->dst));
1102 }
1103
1104 if (msg->msg_flags&MSG_CONFIRM)
1105 goto do_confirm;
1106 back_from_confirm:
1107
1108 saddr = fl4->saddr;
1109 if (!ipc.addr)
1110 daddr = ipc.addr = fl4->daddr;
1111
1112 /* Lockless fast path for the non-corking case. */
1113 if (!corkreq) {
1114 struct inet_cork cork;
1115
1116 skb = ip_make_skb(sk, fl4, getfrag, msg, ulen,
1117 sizeof(struct udphdr), &ipc, &rt,
1118 &cork, msg->msg_flags);
1119 err = PTR_ERR(skb);
1120 if (!IS_ERR_OR_NULL(skb))
1121 err = udp_send_skb(skb, fl4, &cork);
1122 goto out;
1123 }
1124
1125 lock_sock(sk);
1126 if (unlikely(up->pending)) {
1127 /* The socket is already corked while preparing it. */
1128 /* ... which is an evident application bug. --ANK */
1129 release_sock(sk);
1130
1131 net_dbg_ratelimited("socket already corked\n");
1132 err = -EINVAL;
1133 goto out;
1134 }
1135 /*
1136 * Now cork the socket to pend data.
1137 */
1138 fl4 = &inet->cork.fl.u.ip4;
1139 fl4->daddr = daddr;
1140 fl4->saddr = saddr;
1141 fl4->fl4_dport = dport;
1142 fl4->fl4_sport = inet->inet_sport;
1143 up->pending = AF_INET;
1144
1145 do_append_data:
1146 up->len += ulen;
1147 err = ip_append_data(sk, fl4, getfrag, msg, ulen,
1148 sizeof(struct udphdr), &ipc, &rt,
1149 corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
1150 if (err)
1151 udp_flush_pending_frames(sk);
1152 else if (!corkreq)
1153 err = udp_push_pending_frames(sk);
1154 else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
1155 up->pending = 0;
1156 release_sock(sk);
1157
1158 out:
1159 ip_rt_put(rt);
1160 out_free:
1161 if (free)
1162 kfree(ipc.opt);
1163 if (!err)
1164 return len;
1165 /*
1166 * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting
1167 * ENOBUFS might not be good (it's not tunable per se), but otherwise
1168 * we don't have a good statistic (IpOutDiscards but it can be too many
1169 * things). We could add another new stat but at least for now that
1170 * seems like overkill.
1171 */
1172 if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
1173 UDP_INC_STATS(sock_net(sk),
1174 UDP_MIB_SNDBUFERRORS, is_udplite);
1175 }
1176 return err;
1177
1178 do_confirm:
1179 if (msg->msg_flags & MSG_PROBE)
1180 dst_confirm_neigh(&rt->dst, &fl4->daddr);
1181 if (!(msg->msg_flags&MSG_PROBE) || len)
1182 goto back_from_confirm;
1183 err = 0;
1184 goto out;
1185 }
1186 EXPORT_SYMBOL(udp_sendmsg);
1187
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 27551 bytes --]
^ permalink raw reply
* [PATCH net] netfilter: provide correct argument to nla_strlcpy()
From: Eric Dumazet @ 2018-05-21 23:33 UTC (permalink / raw)
To: David S . Miller
Cc: netdev, Eric Dumazet, Eric Dumazet, Florian Westphal,
Pablo Neira Ayuso
Recent patch forgot to remove nla_data(), upsetting syzkaller a bit.
BUG: KASAN: slab-out-of-bounds in nla_strlcpy+0x13d/0x150 lib/nlattr.c:314
Read of size 1 at addr ffff8801ad1f4fdd by task syz-executor189/4509
CPU: 1 PID: 4509 Comm: syz-executor189 Not tainted 4.17.0-rc6+ #62
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x1b9/0x294 lib/dump_stack.c:113
print_address_description+0x6c/0x20b mm/kasan/report.c:256
kasan_report_error mm/kasan/report.c:354 [inline]
kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412
__asan_report_load1_noabort+0x14/0x20 mm/kasan/report.c:430
nla_strlcpy+0x13d/0x150 lib/nlattr.c:314
nfnl_acct_new+0x574/0xc50 net/netfilter/nfnetlink_acct.c:118
nfnetlink_rcv_msg+0xdb5/0xff0 net/netfilter/nfnetlink.c:212
netlink_rcv_skb+0x172/0x440 net/netlink/af_netlink.c:2448
nfnetlink_rcv+0x1fe/0x1ba0 net/netfilter/nfnetlink.c:513
netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline]
netlink_unicast+0x58b/0x740 net/netlink/af_netlink.c:1336
netlink_sendmsg+0x9f0/0xfa0 net/netlink/af_netlink.c:1901
sock_sendmsg_nosec net/socket.c:629 [inline]
sock_sendmsg+0xd5/0x120 net/socket.c:639
sock_write_iter+0x35a/0x5a0 net/socket.c:908
call_write_iter include/linux/fs.h:1784 [inline]
new_sync_write fs/read_write.c:474 [inline]
__vfs_write+0x64d/0x960 fs/read_write.c:487
vfs_write+0x1f8/0x560 fs/read_write.c:549
ksys_write+0xf9/0x250 fs/read_write.c:598
__do_sys_write fs/read_write.c:610 [inline]
__se_sys_write fs/read_write.c:607 [inline]
__x64_sys_write+0x73/0xb0 fs/read_write.c:607
Fixes: 4e09fc873d92 ("netfilter: prefer nla_strlcpy for dealing with NLA_STRING attributes")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Reported-by: syzbot <syzkaller@googlegroups.com>
---
net/netfilter/nfnetlink_acct.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index 6ddf89183e7b47e6c029b28cf5b524c73a790498..a0e5adf0b3b6ddbcd01f956d25dda01c611a7663 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -115,7 +115,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl,
nfacct->flags = flags;
}
- nla_strlcpy(nfacct->name, nla_data(tb[NFACCT_NAME]), NFACCT_NAME_MAX);
+ nla_strlcpy(nfacct->name, tb[NFACCT_NAME], NFACCT_NAME_MAX);
if (tb[NFACCT_BYTES]) {
atomic64_set(&nfacct->bytes,
--
2.17.0.441.gb46fe60e1d-goog
^ permalink raw reply related
* Re: [Cake] [PATCH net-next v14 4/7] sch_cake: Add NAT awareness to packet classifier
From: Marcelo Ricardo Leitner @ 2018-05-21 23:34 UTC (permalink / raw)
To: Toke Høiland-Jørgensen; +Cc: netdev, cake, netfilter-devel
In-Reply-To: <152693495866.32668.5164616056948127124.stgit@alrua-kau>
[Cc'ing netfilter-devel@ for awareness]
On Mon, May 21, 2018 at 10:35:58PM +0200, Toke Høiland-Jørgensen wrote:
> When CAKE is deployed on a gateway that also performs NAT (which is a
> common deployment mode), the host fairness mechanism cannot distinguish
> internal hosts from each other, and so fails to work correctly.
>
> To fix this, we add an optional NAT awareness mode, which will query the
> kernel conntrack mechanism to obtain the pre-NAT addresses for each packet
> and use that in the flow and host hashing.
>
> When the shaper is enabled and the host is already performing NAT, the cost
> of this lookup is negligible. However, in unlimited mode with no NAT being
> performed, there is a significant CPU cost at higher bandwidths. For this
> reason, the feature is turned off by default.
>
> Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
> ---
> net/sched/sch_cake.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 79 insertions(+)
>
> diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
> index 92623160d43e..04364993ce19 100644
> --- a/net/sched/sch_cake.c
> +++ b/net/sched/sch_cake.c
> @@ -71,6 +71,12 @@
> #include <net/tcp.h>
> #include <net/flow_dissector.h>
>
> +#if IS_REACHABLE(CONFIG_NF_CONNTRACK)
> +#include <net/netfilter/nf_conntrack_core.h>
> +#include <net/netfilter/nf_conntrack_zones.h>
> +#include <net/netfilter/nf_conntrack.h>
> +#endif
> +
> #define CAKE_SET_WAYS (8)
> #define CAKE_MAX_TINS (8)
> #define CAKE_QUEUES (1024)
> @@ -516,6 +522,60 @@ static bool cobalt_should_drop(struct cobalt_vars *vars,
> return drop;
> }
>
> +#if IS_REACHABLE(CONFIG_NF_CONNTRACK)
> +
> +static void cake_update_flowkeys(struct flow_keys *keys,
> + const struct sk_buff *skb)
> +{
> + const struct nf_conntrack_tuple *tuple;
> + enum ip_conntrack_info ctinfo;
> + struct nf_conn *ct;
> + bool rev = false;
> +
> + if (tc_skb_protocol(skb) != htons(ETH_P_IP))
> + return;
> +
> + ct = nf_ct_get(skb, &ctinfo);
> + if (ct) {
> + tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo));
> + } else {
> + const struct nf_conntrack_tuple_hash *hash;
> + struct nf_conntrack_tuple srctuple;
> +
> + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
> + NFPROTO_IPV4, dev_net(skb->dev),
> + &srctuple))
> + return;
> +
> + hash = nf_conntrack_find_get(dev_net(skb->dev),
> + &nf_ct_zone_dflt,
> + &srctuple);
> + if (!hash)
> + return;
> +
> + rev = true;
> + ct = nf_ct_tuplehash_to_ctrack(hash);
> + tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir);
> + }
> +
> + keys->addrs.v4addrs.src = rev ? tuple->dst.u3.ip : tuple->src.u3.ip;
> + keys->addrs.v4addrs.dst = rev ? tuple->src.u3.ip : tuple->dst.u3.ip;
> +
> + if (keys->ports.ports) {
> + keys->ports.src = rev ? tuple->dst.u.all : tuple->src.u.all;
> + keys->ports.dst = rev ? tuple->src.u.all : tuple->dst.u.all;
> + }
> + if (rev)
> + nf_ct_put(ct);
> +}
> +#else
> +static void cake_update_flowkeys(struct flow_keys *keys,
> + const struct sk_buff *skb)
> +{
> + /* There is nothing we can do here without CONNTRACK */
> +}
> +#endif
> +
> /* Cake has several subtle multiple bit settings. In these cases you
> * would be matching triple isolate mode as well.
> */
> @@ -543,6 +603,9 @@ static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
> skb_flow_dissect_flow_keys(skb, &keys,
> FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
>
> + if (flow_mode & CAKE_FLOW_NAT_FLAG)
> + cake_update_flowkeys(&keys, skb);
> +
> /* flow_hash_from_keys() sorts the addresses by value, so we have
> * to preserve their order in a separate data structure to treat
> * src and dst host addresses as independently selectable.
> @@ -1894,6 +1957,18 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt,
> if (err < 0)
> return err;
>
> + if (tb[TCA_CAKE_NAT]) {
> +#if IS_REACHABLE(CONFIG_NF_CONNTRACK)
> + q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
> + q->flow_mode |= CAKE_FLOW_NAT_FLAG *
> + !!nla_get_u32(tb[TCA_CAKE_NAT]);
> +#else
> + NL_SET_ERR_MSG_ATTR(extack, "No conntrack support in kernel",
> + tb[TCA_CAKE_NAT]);
> + return -EOPNOTSUPP;
> +#endif
> + }
> +
> if (tb[TCA_CAKE_BASE_RATE64])
> q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
>
> @@ -2066,6 +2141,10 @@ static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
> if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
> goto nla_put_failure;
>
> + if (nla_put_u32(skb, TCA_CAKE_NAT,
> + !!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
> + goto nla_put_failure;
> +
> return nla_nest_end(skb, opts);
>
> nla_put_failure:
>
> _______________________________________________
> Cake mailing list
> Cake@lists.bufferbloat.net
> https://lists.bufferbloat.net/listinfo/cake
^ permalink raw reply
* Re: [PATCH net] netfilter: provide correct argument to nla_strlcpy()
From: Eric Dumazet @ 2018-05-21 23:34 UTC (permalink / raw)
To: David Miller; +Cc: netdev, Eric Dumazet, Florian Westphal, Pablo Neira Ayuso
In-Reply-To: <20180521233320.66082-1-edumazet@google.com>
On Mon, May 21, 2018 at 4:33 PM Eric Dumazet <edumazet@google.com> wrote:
> Recent patch forgot to remove nla_data(), upsetting syzkaller a bit.
Humpff.
I forgot to add one file in the change.
Will send V2.
^ permalink raw reply
* [PATCH v2 net] netfilter: provide correct argument to nla_strlcpy()
From: Eric Dumazet @ 2018-05-21 23:35 UTC (permalink / raw)
To: David S . Miller
Cc: netdev, Eric Dumazet, Eric Dumazet, Florian Westphal,
Pablo Neira Ayuso
Recent patch forgot to remove nla_data(), upsetting syzkaller a bit.
BUG: KASAN: slab-out-of-bounds in nla_strlcpy+0x13d/0x150 lib/nlattr.c:314
Read of size 1 at addr ffff8801ad1f4fdd by task syz-executor189/4509
CPU: 1 PID: 4509 Comm: syz-executor189 Not tainted 4.17.0-rc6+ #62
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x1b9/0x294 lib/dump_stack.c:113
print_address_description+0x6c/0x20b mm/kasan/report.c:256
kasan_report_error mm/kasan/report.c:354 [inline]
kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412
__asan_report_load1_noabort+0x14/0x20 mm/kasan/report.c:430
nla_strlcpy+0x13d/0x150 lib/nlattr.c:314
nfnl_acct_new+0x574/0xc50 net/netfilter/nfnetlink_acct.c:118
nfnetlink_rcv_msg+0xdb5/0xff0 net/netfilter/nfnetlink.c:212
netlink_rcv_skb+0x172/0x440 net/netlink/af_netlink.c:2448
nfnetlink_rcv+0x1fe/0x1ba0 net/netfilter/nfnetlink.c:513
netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline]
netlink_unicast+0x58b/0x740 net/netlink/af_netlink.c:1336
netlink_sendmsg+0x9f0/0xfa0 net/netlink/af_netlink.c:1901
sock_sendmsg_nosec net/socket.c:629 [inline]
sock_sendmsg+0xd5/0x120 net/socket.c:639
sock_write_iter+0x35a/0x5a0 net/socket.c:908
call_write_iter include/linux/fs.h:1784 [inline]
new_sync_write fs/read_write.c:474 [inline]
__vfs_write+0x64d/0x960 fs/read_write.c:487
vfs_write+0x1f8/0x560 fs/read_write.c:549
ksys_write+0xf9/0x250 fs/read_write.c:598
__do_sys_write fs/read_write.c:610 [inline]
__se_sys_write fs/read_write.c:607 [inline]
__x64_sys_write+0x73/0xb0 fs/read_write.c:607
Fixes: 4e09fc873d92 ("netfilter: prefer nla_strlcpy for dealing with NLA_STRING attributes")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Reported-by: syzbot <syzkaller@googlegroups.com>
---
net/netfilter/nfnetlink_acct.c | 2 +-
net/netfilter/nfnetlink_cthelper.c | 4 ++--
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index 6ddf89183e7b47e6c029b28cf5b524c73a790498..a0e5adf0b3b6ddbcd01f956d25dda01c611a7663 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -115,7 +115,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl,
nfacct->flags = flags;
}
- nla_strlcpy(nfacct->name, nla_data(tb[NFACCT_NAME]), NFACCT_NAME_MAX);
+ nla_strlcpy(nfacct->name, tb[NFACCT_NAME], NFACCT_NAME_MAX);
if (tb[NFACCT_BYTES]) {
atomic64_set(&nfacct->bytes,
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index fa026b269b3691d5186e28020eb2b08e93dc3679..cb5b5f2077774c29fb987b7f1eb2d75e9efc2fe1 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -150,7 +150,7 @@ nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy,
return -EINVAL;
nla_strlcpy(expect_policy->name,
- nla_data(tb[NFCTH_POLICY_NAME]), NF_CT_HELPER_NAME_LEN);
+ tb[NFCTH_POLICY_NAME], NF_CT_HELPER_NAME_LEN);
expect_policy->max_expected =
ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX]));
if (expect_policy->max_expected > NF_CT_EXPECT_MAX_CNT)
@@ -235,7 +235,7 @@ nfnl_cthelper_create(const struct nlattr * const tb[],
goto err1;
nla_strlcpy(helper->name,
- nla_data(tb[NFCTH_NAME]), NF_CT_HELPER_NAME_LEN);
+ tb[NFCTH_NAME], NF_CT_HELPER_NAME_LEN);
size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN]));
if (size > FIELD_SIZEOF(struct nf_conn_help, data)) {
ret = -ENOMEM;
--
2.17.0.441.gb46fe60e1d-goog
^ permalink raw reply related
* Re: [Cake] [PATCH net-next v14 6/7] sch_cake: Add overhead compensation support to the rate shaper
From: Marcelo Ricardo Leitner @ 2018-05-21 23:45 UTC (permalink / raw)
To: Toke Høiland-Jørgensen; +Cc: netdev, cake
In-Reply-To: <152693495874.32668.11244869690098290078.stgit@alrua-kau>
On Mon, May 21, 2018 at 10:35:58PM +0200, Toke Høiland-Jørgensen wrote:
> +static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
> +{
> + const struct skb_shared_info *shinfo = skb_shinfo(skb);
> + unsigned int hdr_len, last_len = 0;
> + u32 off = skb_network_offset(skb);
> + u32 len = qdisc_pkt_len(skb);
> + u16 segs = 1;
> +
> + q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
> +
> + if (!shinfo->gso_size)
> + return cake_calc_overhead(q, len, off);
> +
> + /* borrowed from qdisc_pkt_len_init() */
> + hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
> +
> + /* + transport layer */
> + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
> + SKB_GSO_TCPV6))) {
> + const struct tcphdr *th;
> + struct tcphdr _tcphdr;
> +
> + th = skb_header_pointer(skb, skb_transport_offset(skb),
> + sizeof(_tcphdr), &_tcphdr);
> + if (likely(th))
> + hdr_len += __tcp_hdrlen(th);
> + } else {
I didn't see some code limiting GSO packets to just TCP or UDP. Is it
safe to assume that this packet is an UDP one, and not SCTP or ESP,
for example?
> + struct udphdr _udphdr;
> +
> + if (skb_header_pointer(skb, skb_transport_offset(skb),
> + sizeof(_udphdr), &_udphdr))
> + hdr_len += sizeof(struct udphdr);
> + }
> +
> + if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
> + segs = DIV_ROUND_UP(skb->len - hdr_len,
> + shinfo->gso_size);
> + else
> + segs = shinfo->gso_segs;
> +
> + len = shinfo->gso_size + hdr_len;
> + last_len = skb->len - shinfo->gso_size * (segs - 1);
> +
> + return (cake_calc_overhead(q, len, off) * (segs - 1) +
> + cake_calc_overhead(q, last_len, off));
> +}
> +
^ permalink raw reply
* Re: [PATCH bpf-next 1/5] bpf: Hooks for sys_sendmsg
From: kbuild test robot @ 2018-05-22 0:08 UTC (permalink / raw)
To: Andrey Ignatov
Cc: kbuild-all, netdev, Andrey Ignatov, davem, ast, daniel,
kernel-team
In-Reply-To: <654b59700a21ed2342b12f30a583ba9329ef850b.1526694154.git.rdna@fb.com>
[-- Attachment #1: Type: text/plain, Size: 10869 bytes --]
Hi Andrey,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on bpf-next/master]
url: https://github.com/0day-ci/linux/commits/Andrey-Ignatov/bpf-Hooks-for-sys_sendmsg/20180522-065614
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: i386-randconfig-s1-201820 (attached as .config)
compiler: gcc-6 (Debian 6.4.0-9) 6.4.0 20171026
reproduce:
# save the attached .config to linux build tree
make ARCH=i386
All errors (new ones prefixed by >>):
net/ipv4/udp.c: In function 'udp_sendmsg':
net/ipv4/udp.c:1014:44: error: macro "BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK" passed 3 arguments, but takes just 2
(struct sockaddr *)usin, &ipc.addr);
^
>> net/ipv4/udp.c:1013:9: error: 'BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK' undeclared (first use in this function)
err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
net/ipv4/udp.c:1013:9: note: each undeclared identifier is reported only once for each function it appears in
--
net/ipv6/udp.c: In function 'udpv6_sendmsg':
net/ipv6/udp.c:1320:44: error: macro "BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK" passed 3 arguments, but takes just 2
(struct sockaddr *)sin6, &fl6.saddr);
^
>> net/ipv6/udp.c:1319:9: error: 'BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK' undeclared (first use in this function)
err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
net/ipv6/udp.c:1319:9: note: each undeclared identifier is reported only once for each function it appears in
vim +/BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK +1013 net/ipv4/udp.c
898
899 int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
900 {
901 struct inet_sock *inet = inet_sk(sk);
902 struct udp_sock *up = udp_sk(sk);
903 DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
904 struct flowi4 fl4_stack;
905 struct flowi4 *fl4;
906 int ulen = len;
907 struct ipcm_cookie ipc;
908 struct rtable *rt = NULL;
909 int free = 0;
910 int connected = 0;
911 __be32 daddr, faddr, saddr;
912 __be16 dport;
913 u8 tos;
914 int err, is_udplite = IS_UDPLITE(sk);
915 int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
916 int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
917 struct sk_buff *skb;
918 struct ip_options_data opt_copy;
919
920 if (len > 0xFFFF)
921 return -EMSGSIZE;
922
923 /*
924 * Check the flags.
925 */
926
927 if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
928 return -EOPNOTSUPP;
929
930 ipc.opt = NULL;
931 ipc.tx_flags = 0;
932 ipc.ttl = 0;
933 ipc.tos = -1;
934
935 getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
936
937 fl4 = &inet->cork.fl.u.ip4;
938 if (up->pending) {
939 /*
940 * There are pending frames.
941 * The socket lock must be held while it's corked.
942 */
943 lock_sock(sk);
944 if (likely(up->pending)) {
945 if (unlikely(up->pending != AF_INET)) {
946 release_sock(sk);
947 return -EINVAL;
948 }
949 goto do_append_data;
950 }
951 release_sock(sk);
952 }
953 ulen += sizeof(struct udphdr);
954
955 /*
956 * Get and verify the address.
957 */
958 if (usin) {
959 if (msg->msg_namelen < sizeof(*usin))
960 return -EINVAL;
961 if (usin->sin_family != AF_INET) {
962 if (usin->sin_family != AF_UNSPEC)
963 return -EAFNOSUPPORT;
964 }
965
966 daddr = usin->sin_addr.s_addr;
967 dport = usin->sin_port;
968 if (dport == 0)
969 return -EINVAL;
970 } else {
971 if (sk->sk_state != TCP_ESTABLISHED)
972 return -EDESTADDRREQ;
973 daddr = inet->inet_daddr;
974 dport = inet->inet_dport;
975 /* Open fast path for connected socket.
976 Route will not be used, if at least one option is set.
977 */
978 connected = 1;
979 }
980
981 ipc.sockc.tsflags = sk->sk_tsflags;
982 ipc.addr = inet->inet_saddr;
983 ipc.oif = sk->sk_bound_dev_if;
984 ipc.gso_size = up->gso_size;
985
986 if (msg->msg_controllen) {
987 err = udp_cmsg_send(sk, msg, &ipc.gso_size);
988 if (err > 0)
989 err = ip_cmsg_send(sk, msg, &ipc,
990 sk->sk_family == AF_INET6);
991 if (unlikely(err < 0)) {
992 kfree(ipc.opt);
993 return err;
994 }
995 if (ipc.opt)
996 free = 1;
997 connected = 0;
998 }
999 if (!ipc.opt) {
1000 struct ip_options_rcu *inet_opt;
1001
1002 rcu_read_lock();
1003 inet_opt = rcu_dereference(inet->inet_opt);
1004 if (inet_opt) {
1005 memcpy(&opt_copy, inet_opt,
1006 sizeof(*inet_opt) + inet_opt->opt.optlen);
1007 ipc.opt = &opt_copy.opt;
1008 }
1009 rcu_read_unlock();
1010 }
1011
1012 if (!connected) {
> 1013 err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
> 1014 (struct sockaddr *)usin, &ipc.addr);
1015 if (err)
1016 goto out_free;
1017 if (usin) {
1018 if (usin->sin_port == 0) {
1019 /* BPF program set invalid port. Reject it. */
1020 err = -EINVAL;
1021 goto out_free;
1022 }
1023 daddr = usin->sin_addr.s_addr;
1024 dport = usin->sin_port;
1025 }
1026 }
1027
1028 saddr = ipc.addr;
1029 ipc.addr = faddr = daddr;
1030
1031 sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags);
1032
1033 if (ipc.opt && ipc.opt->opt.srr) {
1034 if (!daddr) {
1035 err = -EINVAL;
1036 goto out_free;
1037 }
1038 faddr = ipc.opt->opt.faddr;
1039 connected = 0;
1040 }
1041 tos = get_rttos(&ipc, inet);
1042 if (sock_flag(sk, SOCK_LOCALROUTE) ||
1043 (msg->msg_flags & MSG_DONTROUTE) ||
1044 (ipc.opt && ipc.opt->opt.is_strictroute)) {
1045 tos |= RTO_ONLINK;
1046 connected = 0;
1047 }
1048
1049 if (ipv4_is_multicast(daddr)) {
1050 if (!ipc.oif)
1051 ipc.oif = inet->mc_index;
1052 if (!saddr)
1053 saddr = inet->mc_addr;
1054 connected = 0;
1055 } else if (!ipc.oif) {
1056 ipc.oif = inet->uc_index;
1057 } else if (ipv4_is_lbcast(daddr) && inet->uc_index) {
1058 /* oif is set, packet is to local broadcast and
1059 * and uc_index is set. oif is most likely set
1060 * by sk_bound_dev_if. If uc_index != oif check if the
1061 * oif is an L3 master and uc_index is an L3 slave.
1062 * If so, we want to allow the send using the uc_index.
1063 */
1064 if (ipc.oif != inet->uc_index &&
1065 ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
1066 inet->uc_index)) {
1067 ipc.oif = inet->uc_index;
1068 }
1069 }
1070
1071 if (connected)
1072 rt = (struct rtable *)sk_dst_check(sk, 0);
1073
1074 if (!rt) {
1075 struct net *net = sock_net(sk);
1076 __u8 flow_flags = inet_sk_flowi_flags(sk);
1077
1078 fl4 = &fl4_stack;
1079
1080 flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
1081 RT_SCOPE_UNIVERSE, sk->sk_protocol,
1082 flow_flags,
1083 faddr, saddr, dport, inet->inet_sport,
1084 sk->sk_uid);
1085
1086 security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
1087 rt = ip_route_output_flow(net, fl4, sk);
1088 if (IS_ERR(rt)) {
1089 err = PTR_ERR(rt);
1090 rt = NULL;
1091 if (err == -ENETUNREACH)
1092 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
1093 goto out;
1094 }
1095
1096 err = -EACCES;
1097 if ((rt->rt_flags & RTCF_BROADCAST) &&
1098 !sock_flag(sk, SOCK_BROADCAST))
1099 goto out;
1100 if (connected)
1101 sk_dst_set(sk, dst_clone(&rt->dst));
1102 }
1103
1104 if (msg->msg_flags&MSG_CONFIRM)
1105 goto do_confirm;
1106 back_from_confirm:
1107
1108 saddr = fl4->saddr;
1109 if (!ipc.addr)
1110 daddr = ipc.addr = fl4->daddr;
1111
1112 /* Lockless fast path for the non-corking case. */
1113 if (!corkreq) {
1114 struct inet_cork cork;
1115
1116 skb = ip_make_skb(sk, fl4, getfrag, msg, ulen,
1117 sizeof(struct udphdr), &ipc, &rt,
1118 &cork, msg->msg_flags);
1119 err = PTR_ERR(skb);
1120 if (!IS_ERR_OR_NULL(skb))
1121 err = udp_send_skb(skb, fl4, &cork);
1122 goto out;
1123 }
1124
1125 lock_sock(sk);
1126 if (unlikely(up->pending)) {
1127 /* The socket is already corked while preparing it. */
1128 /* ... which is an evident application bug. --ANK */
1129 release_sock(sk);
1130
1131 net_dbg_ratelimited("socket already corked\n");
1132 err = -EINVAL;
1133 goto out;
1134 }
1135 /*
1136 * Now cork the socket to pend data.
1137 */
1138 fl4 = &inet->cork.fl.u.ip4;
1139 fl4->daddr = daddr;
1140 fl4->saddr = saddr;
1141 fl4->fl4_dport = dport;
1142 fl4->fl4_sport = inet->inet_sport;
1143 up->pending = AF_INET;
1144
1145 do_append_data:
1146 up->len += ulen;
1147 err = ip_append_data(sk, fl4, getfrag, msg, ulen,
1148 sizeof(struct udphdr), &ipc, &rt,
1149 corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
1150 if (err)
1151 udp_flush_pending_frames(sk);
1152 else if (!corkreq)
1153 err = udp_push_pending_frames(sk);
1154 else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
1155 up->pending = 0;
1156 release_sock(sk);
1157
1158 out:
1159 ip_rt_put(rt);
1160 out_free:
1161 if (free)
1162 kfree(ipc.opt);
1163 if (!err)
1164 return len;
1165 /*
1166 * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting
1167 * ENOBUFS might not be good (it's not tunable per se), but otherwise
1168 * we don't have a good statistic (IpOutDiscards but it can be too many
1169 * things). We could add another new stat but at least for now that
1170 * seems like overkill.
1171 */
1172 if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
1173 UDP_INC_STATS(sock_net(sk),
1174 UDP_MIB_SNDBUFERRORS, is_udplite);
1175 }
1176 return err;
1177
1178 do_confirm:
1179 if (msg->msg_flags & MSG_PROBE)
1180 dst_confirm_neigh(&rt->dst, &fl4->daddr);
1181 if (!(msg->msg_flags&MSG_PROBE) || len)
1182 goto back_from_confirm;
1183 err = 0;
1184 goto out;
1185 }
1186 EXPORT_SYMBOL(udp_sendmsg);
1187
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 24538 bytes --]
^ permalink raw reply
* [PATCH net-next v4 0/2] openvswitch: Support conntrack zone limit
From: Yi-Hung Wei @ 2018-05-22 0:16 UTC (permalink / raw)
To: netdev, pshelar; +Cc: Yi-Hung Wei
Currently, nf_conntrack_max is used to limit the maximum number of
conntrack entries in the conntrack table for every network namespace.
For the VMs and containers that reside in the same namespace,
they share the same conntrack table, and the total # of conntrack entries
for all the VMs and containers are limited by nf_conntrack_max. In this
case, if one of the VM/container abuses the usage the conntrack entries,
it blocks the others from committing valid conntrack entries into the
conntrack table. Even if we can possibly put the VM in different network
namespace, the current nf_conntrack_max configuration is kind of rigid
that we cannot limit different VM/container to have different # conntrack
entries.
To address the aforementioned issue, this patch proposes to have a
fine-grained mechanism that could further limit the # of conntrack entries
per-zone. For example, we can designate different zone to different VM,
and set conntrack limit to each zone. By providing this isolation, a
mis-behaved VM only consumes the conntrack entries in its own zone, and
it will not influence other well-behaved VMs. Moreover, the users can
set various conntrack limit to different zone based on their preference.
The proposed implementation utilizes Netfilter's nf_conncount backend
to count the number of connections in a particular zone. If the number of
connection is above a configured limitation, OVS will return ENOMEM to the
userspace. If userspace does not configure the zone limit, the limit
defaults to zero that is no limitation, which is backward compatible to
the behavior without this patch.
The first patch defines the conntrack limit netlink definition, and the
second patch provides the implementation.
v3->v4:
- Addresses comments from Parvin that include simplify netlink API,
and remove unncessary RCU lockings.
- Rebases to master.
v2->v3:
- Addresses comments from Parvin that include using static keys to check
if ovs_ct_limit features is used, only check ct_limit when a ct entry
is unconfirmed, and reports rate limited warning messages when the ct
limit is reached.
- Rebases to master.
v1->v2:
- Fixes commit log typos suggested by Greg.
- Fixes memory free issue that Julia found.
Yi-Hung Wei (2):
openvswitch: Add conntrack limit netlink definition
openvswitch: Support conntrack zone limit
include/uapi/linux/openvswitch.h | 26 ++
net/openvswitch/Kconfig | 3 +-
net/openvswitch/conntrack.c | 541 ++++++++++++++++++++++++++++++++++++++-
net/openvswitch/conntrack.h | 9 +-
net/openvswitch/datapath.c | 7 +-
net/openvswitch/datapath.h | 3 +
6 files changed, 583 insertions(+), 6 deletions(-)
--
2.7.4
^ permalink raw reply
* [PATCH net-next v4 1/2] openvswitch: Add conntrack limit netlink definition
From: Yi-Hung Wei @ 2018-05-22 0:16 UTC (permalink / raw)
To: netdev, pshelar; +Cc: Yi-Hung Wei
In-Reply-To: <1526948165-32443-1-git-send-email-yihung.wei@gmail.com>
Define netlink messages and attributes to support user kernel
communication that uses the conntrack limit feature.
Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
---
include/uapi/linux/openvswitch.h | 26 ++++++++++++++++++++++++++
1 file changed, 26 insertions(+)
diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 713e56ce681f..d8da2b7591f5 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -937,4 +937,30 @@ enum ovs_meter_band_type {
#define OVS_METER_BAND_TYPE_MAX (__OVS_METER_BAND_TYPE_MAX - 1)
+/* Conntrack limit */
+#define OVS_CT_LIMIT_FAMILY "ovs_ct_limit"
+#define OVS_CT_LIMIT_MCGROUP "ovs_ct_limit"
+#define OVS_CT_LIMIT_VERSION 0x1
+
+enum ovs_ct_limit_cmd {
+ OVS_CT_LIMIT_CMD_UNSPEC,
+ OVS_CT_LIMIT_CMD_SET, /* Add or modify ct limit. */
+ OVS_CT_LIMIT_CMD_DEL, /* Delete ct limit. */
+ OVS_CT_LIMIT_CMD_GET /* Get ct limit. */
+};
+
+enum ovs_ct_limit_attr {
+ OVS_CT_LIMIT_ATTR_UNSPEC,
+ OVS_CT_LIMIT_ATTR_ZONE_LIMIT, /* Nested struct ovs_zone_limit. */
+ __OVS_CT_LIMIT_ATTR_MAX
+};
+
+#define OVS_CT_LIMIT_ATTR_MAX (__OVS_CT_LIMIT_ATTR_MAX - 1)
+
+struct ovs_zone_limit {
+ int zone_id;
+ __u32 limit;
+ __u32 count;
+};
+
#endif /* _LINUX_OPENVSWITCH_H */
--
2.7.4
^ permalink raw reply related
* [PATCH net-next v4 2/2] openvswitch: Support conntrack zone limit
From: Yi-Hung Wei @ 2018-05-22 0:16 UTC (permalink / raw)
To: netdev, pshelar; +Cc: Yi-Hung Wei
In-Reply-To: <1526948165-32443-1-git-send-email-yihung.wei@gmail.com>
Currently, nf_conntrack_max is used to limit the maximum number of
conntrack entries in the conntrack table for every network namespace.
For the VMs and containers that reside in the same namespace,
they share the same conntrack table, and the total # of conntrack entries
for all the VMs and containers are limited by nf_conntrack_max. In this
case, if one of the VM/container abuses the usage the conntrack entries,
it blocks the others from committing valid conntrack entries into the
conntrack table. Even if we can possibly put the VM in different network
namespace, the current nf_conntrack_max configuration is kind of rigid
that we cannot limit different VM/container to have different # conntrack
entries.
To address the aforementioned issue, this patch proposes to have a
fine-grained mechanism that could further limit the # of conntrack entries
per-zone. For example, we can designate different zone to different VM,
and set conntrack limit to each zone. By providing this isolation, a
mis-behaved VM only consumes the conntrack entries in its own zone, and
it will not influence other well-behaved VMs. Moreover, the users can
set various conntrack limit to different zone based on their preference.
The proposed implementation utilizes Netfilter's nf_conncount backend
to count the number of connections in a particular zone. If the number of
connection is above a configured limitation, ovs will return ENOMEM to the
userspace. If userspace does not configure the zone limit, the limit
defaults to zero that is no limitation, which is backward compatible to
the behavior without this patch.
The following high leve APIs are provided to the userspace:
- OVS_CT_LIMIT_CMD_SET:
* set default connection limit for all zones
* set the connection limit for a particular zone
- OVS_CT_LIMIT_CMD_DEL:
* remove the connection limit for a particular zone
- OVS_CT_LIMIT_CMD_GET:
* get the default connection limit for all zones
* get the connection limit for a particular zone
Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
---
net/openvswitch/Kconfig | 3 +-
net/openvswitch/conntrack.c | 541 +++++++++++++++++++++++++++++++++++++++++++-
net/openvswitch/conntrack.h | 9 +-
net/openvswitch/datapath.c | 7 +-
net/openvswitch/datapath.h | 3 +
5 files changed, 557 insertions(+), 6 deletions(-)
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index 2650205cdaf9..89da9512ec1e 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -9,7 +9,8 @@ config OPENVSWITCH
(NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \
(!NF_NAT || NF_NAT) && \
(!NF_NAT_IPV4 || NF_NAT_IPV4) && \
- (!NF_NAT_IPV6 || NF_NAT_IPV6)))
+ (!NF_NAT_IPV6 || NF_NAT_IPV6) && \
+ (!NETFILTER_CONNCOUNT || NETFILTER_CONNCOUNT)))
select LIBCRC32C
select MPLS
select NET_MPLS_GSO
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 02fc343feb66..e8bb91420ca9 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -16,8 +16,11 @@
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
+#include <linux/static_key.h>
#include <net/ip.h>
+#include <net/genetlink.h>
#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_count.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
@@ -76,6 +79,31 @@ struct ovs_conntrack_info {
#endif
};
+#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+#define OVS_CT_LIMIT_UNLIMITED 0
+#define OVS_CT_LIMIT_DEFAULT OVS_CT_LIMIT_UNLIMITED
+#define CT_LIMIT_HASH_BUCKETS 512
+static DEFINE_STATIC_KEY_FALSE(ovs_ct_limit_enabled);
+
+struct ovs_ct_limit {
+ /* Elements in ovs_ct_limit_info->limits hash table */
+ struct hlist_node hlist_node;
+ struct rcu_head rcu;
+ u16 zone;
+ u32 limit;
+};
+
+struct ovs_ct_limit_info {
+ u32 default_limit;
+ struct hlist_head *limits;
+ struct nf_conncount_data *data __aligned(8);
+};
+
+static const struct nla_policy ct_limit_policy[OVS_CT_LIMIT_ATTR_MAX + 1] = {
+ [OVS_CT_LIMIT_ATTR_ZONE_LIMIT] = { .type = NLA_NESTED, },
+};
+#endif
+
static bool labels_nonzero(const struct ovs_key_ct_labels *labels);
static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
@@ -1036,6 +1064,89 @@ static bool labels_nonzero(const struct ovs_key_ct_labels *labels)
return false;
}
+#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+static struct hlist_head *ct_limit_hash_bucket(
+ const struct ovs_ct_limit_info *info, u16 zone)
+{
+ return &info->limits[zone & (CT_LIMIT_HASH_BUCKETS - 1)];
+}
+
+/* Call with ovs_mutex */
+static void ct_limit_set(const struct ovs_ct_limit_info *info,
+ struct ovs_ct_limit *new_ct_limit)
+{
+ struct ovs_ct_limit *ct_limit;
+ struct hlist_head *head;
+
+ head = ct_limit_hash_bucket(info, new_ct_limit->zone);
+ hlist_for_each_entry_rcu(ct_limit, head, hlist_node) {
+ if (ct_limit->zone == new_ct_limit->zone) {
+ hlist_replace_rcu(&ct_limit->hlist_node,
+ &new_ct_limit->hlist_node);
+ kfree_rcu(ct_limit, rcu);
+ return;
+ }
+ }
+
+ hlist_add_head_rcu(&new_ct_limit->hlist_node, head);
+}
+
+/* Call with ovs_mutex */
+static void ct_limit_del(const struct ovs_ct_limit_info *info, u16 zone)
+{
+ struct ovs_ct_limit *ct_limit;
+ struct hlist_head *head;
+ struct hlist_node *n;
+
+ head = ct_limit_hash_bucket(info, zone);
+ hlist_for_each_entry_safe(ct_limit, n, head, hlist_node) {
+ if (ct_limit->zone == zone) {
+ hlist_del_rcu(&ct_limit->hlist_node);
+ kfree_rcu(ct_limit, rcu);
+ return;
+ }
+ }
+}
+
+/* Call with RCU read lock */
+static u32 ct_limit_get(const struct ovs_ct_limit_info *info, u16 zone)
+{
+ struct ovs_ct_limit *ct_limit;
+ struct hlist_head *head;
+
+ head = ct_limit_hash_bucket(info, zone);
+ hlist_for_each_entry_rcu(ct_limit, head, hlist_node) {
+ if (ct_limit->zone == zone)
+ return ct_limit->limit;
+ }
+
+ return info->default_limit;
+}
+
+static int ovs_ct_check_limit(struct net *net,
+ const struct ovs_conntrack_info *info,
+ const struct nf_conntrack_tuple *tuple)
+{
+ struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+ const struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
+ u32 per_zone_limit, connections;
+ u32 conncount_key[5];
+
+ conncount_key[0] = info->zone.id;
+
+ per_zone_limit = ct_limit_get(ct_limit_info, info->zone.id);
+ if (per_zone_limit == OVS_CT_LIMIT_UNLIMITED)
+ return 0;
+
+ connections = nf_conncount_count(net, ct_limit_info->data,
+ conncount_key, tuple, &info->zone);
+ if (connections > per_zone_limit)
+ return -ENOMEM;
+
+ return 0;
+}
+#endif
+
/* Lookup connection and confirm if unconfirmed. */
static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
const struct ovs_conntrack_info *info,
@@ -1054,6 +1165,21 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
if (!ct)
return 0;
+#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+ if (static_branch_unlikely(&ovs_ct_limit_enabled)) {
+ if (!nf_ct_is_confirmed(ct)) {
+ err = ovs_ct_check_limit(net, info,
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ if (err) {
+ net_warn_ratelimited("openvswitch: zone: %u "
+ "execeeds conntrack limit\n",
+ info->zone.id);
+ return err;
+ }
+ }
+ }
+#endif
+
/* Set the conntrack event mask if given. NEW and DELETE events have
* their own groups, but the NFNLGRP_CONNTRACK_UPDATE group listener
* typically would receive many kinds of updates. Setting the event
@@ -1655,7 +1781,410 @@ static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info)
nf_ct_tmpl_free(ct_info->ct);
}
-void ovs_ct_init(struct net *net)
+#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+static int ovs_ct_limit_init(struct net *net, struct ovs_net *ovs_net)
+{
+ int i, err;
+
+ ovs_net->ct_limit_info = kmalloc(sizeof(*ovs_net->ct_limit_info),
+ GFP_KERNEL);
+ if (!ovs_net->ct_limit_info)
+ return -ENOMEM;
+
+ ovs_net->ct_limit_info->default_limit = OVS_CT_LIMIT_DEFAULT;
+ ovs_net->ct_limit_info->limits =
+ kmalloc_array(CT_LIMIT_HASH_BUCKETS, sizeof(struct hlist_head),
+ GFP_KERNEL);
+ if (!ovs_net->ct_limit_info->limits) {
+ kfree(ovs_net->ct_limit_info);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < CT_LIMIT_HASH_BUCKETS; i++)
+ INIT_HLIST_HEAD(&ovs_net->ct_limit_info->limits[i]);
+
+ ovs_net->ct_limit_info->data =
+ nf_conncount_init(net, NFPROTO_INET, sizeof(u32));
+
+ if (IS_ERR(ovs_net->ct_limit_info->data)) {
+ err = PTR_ERR(ovs_net->ct_limit_info->data);
+ kfree(ovs_net->ct_limit_info->limits);
+ kfree(ovs_net->ct_limit_info);
+ return err;
+ }
+ return 0;
+}
+
+static void ovs_ct_limit_exit(struct net *net, struct ovs_net *ovs_net)
+{
+ const struct ovs_ct_limit_info *info = ovs_net->ct_limit_info;
+ int i;
+
+ nf_conncount_destroy(net, NFPROTO_INET, info->data);
+ for (i = 0; i < CT_LIMIT_HASH_BUCKETS; ++i) {
+ struct hlist_head *head = &info->limits[i];
+ struct ovs_ct_limit *ct_limit;
+
+ hlist_for_each_entry_rcu(ct_limit, head, hlist_node)
+ kfree_rcu(ct_limit, rcu);
+ }
+ kfree(ovs_net->ct_limit_info->limits);
+ kfree(ovs_net->ct_limit_info);
+}
+
+static struct sk_buff *
+ovs_ct_limit_cmd_reply_start(struct genl_info *info, u8 cmd,
+ struct ovs_header **ovs_reply_header)
+{
+ struct ovs_header *ovs_header = info->userhdr;
+ struct sk_buff *skb;
+
+ skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!skb)
+ return ERR_PTR(-ENOMEM);
+
+ *ovs_reply_header = genlmsg_put(skb, info->snd_portid,
+ info->snd_seq,
+ &dp_ct_limit_genl_family, 0, cmd);
+
+ if (!*ovs_reply_header) {
+ nlmsg_free(skb);
+ return ERR_PTR(-EMSGSIZE);
+ }
+ (*ovs_reply_header)->dp_ifindex = ovs_header->dp_ifindex;
+
+ return skb;
+}
+
+static bool check_zone_id(int zone_id, u16 *pzone)
+{
+ if (zone_id >= 0 && zone_id <= 65535) {
+ *pzone = (u16)zone_id;
+ return true;
+ }
+ return false;
+}
+
+static int ovs_ct_limit_set_zone_limit(struct nlattr *nla_zone_limit,
+ struct ovs_ct_limit_info *info)
+{
+ struct ovs_zone_limit *zone_limit;
+ int rem;
+ u16 zone;
+
+ rem = NLA_ALIGN(nla_len(nla_zone_limit));
+ zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit);
+
+ while (rem >= sizeof(*zone_limit)) {
+ if (unlikely(zone_limit->zone_id == -1)) {
+ ovs_lock();
+ info->default_limit = zone_limit->limit;
+ ovs_unlock();
+ } else if (unlikely(!check_zone_id(
+ zone_limit->zone_id, &zone))) {
+ OVS_NLERR(true, "zone id is out of range");
+ } else {
+ struct ovs_ct_limit *ct_limit;
+
+ ct_limit = kmalloc(sizeof(*ct_limit), GFP_KERNEL);
+ if (!ct_limit)
+ return -ENOMEM;
+
+ ct_limit->zone = zone;
+ ct_limit->limit = zone_limit->limit;
+
+ ovs_lock();
+ ct_limit_set(info, ct_limit);
+ ovs_unlock();
+ }
+ rem -= NLA_ALIGN(sizeof(*zone_limit));
+ zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit +
+ NLA_ALIGN(sizeof(*zone_limit)));
+ }
+
+ if (rem)
+ OVS_NLERR(true, "set zone limit has %d unknown bytes", rem);
+
+ return 0;
+}
+
+static int ovs_ct_limit_del_zone_limit(struct nlattr *nla_zone_limit,
+ struct ovs_ct_limit_info *info)
+{
+ struct ovs_zone_limit *zone_limit;
+ int rem;
+ u16 zone;
+
+ rem = NLA_ALIGN(nla_len(nla_zone_limit));
+ zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit);
+
+ while (rem >= sizeof(*zone_limit)) {
+ if (unlikely(!check_zone_id(zone_limit->zone_id, &zone))) {
+ OVS_NLERR(true, "zone id is out of range");
+ } else {
+ ovs_lock();
+ ct_limit_del(info, zone);
+ ovs_unlock();
+ }
+ rem -= NLA_ALIGN(sizeof(*zone_limit));
+ zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit +
+ NLA_ALIGN(sizeof(*zone_limit)));
+ }
+
+ if (rem)
+ OVS_NLERR(true, "del zone limit has %d unknown bytes", rem);
+
+ return 0;
+}
+
+static int ovs_ct_limit_get_default_limit(struct ovs_ct_limit_info *info,
+ struct sk_buff *reply)
+{
+ struct ovs_zone_limit zone_limit;
+ int err;
+
+ zone_limit.zone_id = -1;
+ zone_limit.limit = info->default_limit;
+ err = nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static int ovs_ct_limit_get_zone_limit(struct net *net,
+ struct nlattr *nla_zone_limit,
+ struct ovs_ct_limit_info *info,
+ struct sk_buff *reply)
+{
+ struct nf_conntrack_zone ct_zone;
+ struct ovs_zone_limit *zone_limit;
+ int rem, err;
+ u32 conncount_key[5];
+ u16 zone;
+
+ rem = NLA_ALIGN(nla_len(nla_zone_limit));
+ zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit);
+
+ while (rem >= sizeof(*zone_limit)) {
+ if (unlikely(zone_limit->zone_id == -1)) {
+ err = ovs_ct_limit_get_default_limit(info, reply);
+ if (err)
+ return err;
+ } else if (unlikely(!check_zone_id(zone_limit->zone_id,
+ &zone))) {
+ OVS_NLERR(true, "zone id is out of range");
+ } else {
+ rcu_read_lock();
+ zone_limit->limit = ct_limit_get(info, zone);
+ rcu_read_unlock();
+
+ nf_ct_zone_init(&ct_zone, zone, NF_CT_DEFAULT_ZONE_DIR,
+ 0);
+ conncount_key[0] = zone;
+ zone_limit->count = nf_conncount_count(
+ net, info->data, conncount_key, NULL, &ct_zone);
+ err = nla_put_nohdr(reply, sizeof(*zone_limit),
+ zone_limit);
+ if (err)
+ return err;
+ }
+ rem -= NLA_ALIGN(sizeof(*zone_limit));
+ zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit +
+ NLA_ALIGN(sizeof(*zone_limit)));
+ }
+
+ if (rem)
+ OVS_NLERR(true, "get zone limit has %d unknown bytes", rem);
+
+ return 0;
+}
+
+static int ovs_ct_limit_get_all_zone_limit(struct net *net,
+ struct ovs_ct_limit_info *info,
+ struct sk_buff *reply)
+{
+ struct nf_conntrack_zone ct_zone;
+ struct ovs_zone_limit zone_limit;
+ struct ovs_ct_limit *ct_limit;
+ struct hlist_head *head;
+ u32 conncount_key[5];
+ int i, err = 0;
+
+ err = ovs_ct_limit_get_default_limit(info, reply);
+ if (err)
+ return err;
+
+ rcu_read_lock();
+ for (i = 0; i < CT_LIMIT_HASH_BUCKETS; ++i) {
+ head = &info->limits[i];
+ hlist_for_each_entry_rcu(ct_limit, head, hlist_node) {
+ zone_limit.zone_id = ct_limit->zone;
+ zone_limit.limit = ct_limit->limit;
+ nf_ct_zone_init(&ct_zone, ct_limit->zone,
+ NF_CT_DEFAULT_ZONE_DIR, 0);
+
+ conncount_key[0] = ct_limit->zone;
+ zone_limit.count = nf_conncount_count(net, info->data,
+ conncount_key, NULL, &ct_zone);
+ err = nla_put_nohdr(reply, sizeof(zone_limit),
+ &zone_limit);
+ if (err)
+ goto exit_err;
+ }
+ }
+
+exit_err:
+ rcu_read_unlock();
+ return err;
+}
+
+static int ovs_ct_limit_cmd_set(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr **a = info->attrs;
+ struct sk_buff *reply;
+ struct ovs_header *ovs_reply_header;
+ struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id);
+ struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
+ int err;
+
+ reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_SET,
+ &ovs_reply_header);
+ if (IS_ERR(reply))
+ return PTR_ERR(reply);
+
+ if (!a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) {
+ err = -EINVAL;
+ goto exit_err;
+ }
+
+ err = ovs_ct_limit_set_zone_limit(a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT],
+ ct_limit_info);
+ if (err)
+ goto exit_err;
+
+ static_branch_enable(&ovs_ct_limit_enabled);
+
+ genlmsg_end(reply, ovs_reply_header);
+ return genlmsg_reply(reply, info);
+
+exit_err:
+ nlmsg_free(reply);
+ return err;
+}
+
+static int ovs_ct_limit_cmd_del(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr **a = info->attrs;
+ struct sk_buff *reply;
+ struct ovs_header *ovs_reply_header;
+ struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id);
+ struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
+ int err;
+
+ reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_DEL,
+ &ovs_reply_header);
+ if (IS_ERR(reply))
+ return PTR_ERR(reply);
+
+ if (!a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) {
+ err = -EINVAL;
+ goto exit_err;
+ }
+
+ err = ovs_ct_limit_del_zone_limit(a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT],
+ ct_limit_info);
+ if (err)
+ goto exit_err;
+
+ genlmsg_end(reply, ovs_reply_header);
+ return genlmsg_reply(reply, info);
+
+exit_err:
+ nlmsg_free(reply);
+ return err;
+}
+
+static int ovs_ct_limit_cmd_get(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr **a = info->attrs;
+ struct nlattr *nla_reply;
+ struct sk_buff *reply;
+ struct ovs_header *ovs_reply_header;
+ struct net *net = sock_net(skb->sk);
+ struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+ struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
+ int err;
+
+ reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_GET,
+ &ovs_reply_header);
+ if (IS_ERR(reply))
+ return PTR_ERR(reply);
+
+ nla_reply = nla_nest_start(reply, OVS_CT_LIMIT_ATTR_ZONE_LIMIT);
+
+ if (a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) {
+ err = ovs_ct_limit_get_zone_limit(
+ net, a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT], ct_limit_info,
+ reply);
+ if (err)
+ goto exit_err;
+ } else {
+ err = ovs_ct_limit_get_all_zone_limit(net, ct_limit_info,
+ reply);
+ if (err)
+ goto exit_err;
+ }
+
+ nla_nest_end(reply, nla_reply);
+ genlmsg_end(reply, ovs_reply_header);
+ return genlmsg_reply(reply, info);
+
+exit_err:
+ nlmsg_free(reply);
+ return err;
+}
+
+static struct genl_ops ct_limit_genl_ops[] = {
+ { .cmd = OVS_CT_LIMIT_CMD_SET,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN
+ * privilege. */
+ .policy = ct_limit_policy,
+ .doit = ovs_ct_limit_cmd_set,
+ },
+ { .cmd = OVS_CT_LIMIT_CMD_DEL,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN
+ * privilege. */
+ .policy = ct_limit_policy,
+ .doit = ovs_ct_limit_cmd_del,
+ },
+ { .cmd = OVS_CT_LIMIT_CMD_GET,
+ .flags = 0, /* OK for unprivileged users. */
+ .policy = ct_limit_policy,
+ .doit = ovs_ct_limit_cmd_get,
+ },
+};
+
+static const struct genl_multicast_group ovs_ct_limit_multicast_group = {
+ .name = OVS_CT_LIMIT_MCGROUP,
+};
+
+struct genl_family dp_ct_limit_genl_family __ro_after_init = {
+ .hdrsize = sizeof(struct ovs_header),
+ .name = OVS_CT_LIMIT_FAMILY,
+ .version = OVS_CT_LIMIT_VERSION,
+ .maxattr = OVS_CT_LIMIT_ATTR_MAX,
+ .netnsok = true,
+ .parallel_ops = true,
+ .ops = ct_limit_genl_ops,
+ .n_ops = ARRAY_SIZE(ct_limit_genl_ops),
+ .mcgrps = &ovs_ct_limit_multicast_group,
+ .n_mcgrps = 1,
+ .module = THIS_MODULE,
+};
+#endif
+
+int ovs_ct_init(struct net *net)
{
unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE;
struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
@@ -1666,12 +2195,22 @@ void ovs_ct_init(struct net *net)
} else {
ovs_net->xt_label = true;
}
+
+#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+ return ovs_ct_limit_init(net, ovs_net);
+#else
+ return 0;
+#endif
}
void ovs_ct_exit(struct net *net)
{
struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+ ovs_ct_limit_exit(net, ovs_net);
+#endif
+
if (ovs_net->xt_label)
nf_connlabels_put(net);
}
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index 399dfdd2c4f9..900dadd70974 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -17,10 +17,11 @@
#include "flow.h"
struct ovs_conntrack_info;
+struct ovs_ct_limit_info;
enum ovs_key_attr;
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
-void ovs_ct_init(struct net *);
+int ovs_ct_init(struct net *);
void ovs_ct_exit(struct net *);
bool ovs_ct_verify(struct net *, enum ovs_key_attr attr);
int ovs_ct_copy_action(struct net *, const struct nlattr *,
@@ -44,7 +45,7 @@ void ovs_ct_free_action(const struct nlattr *a);
#else
#include <linux/errno.h>
-static inline void ovs_ct_init(struct net *net) { }
+static inline int ovs_ct_init(struct net *net) { return 0; }
static inline void ovs_ct_exit(struct net *net) { }
@@ -104,4 +105,8 @@ static inline void ovs_ct_free_action(const struct nlattr *a) { }
#define CT_SUPPORTED_MASK 0
#endif /* CONFIG_NF_CONNTRACK */
+
+#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+extern struct genl_family dp_ct_limit_genl_family;
+#endif
#endif /* ovs_conntrack.h */
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 015e24e08909..a61818e94396 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -2288,6 +2288,9 @@ static struct genl_family * const dp_genl_families[] = {
&dp_flow_genl_family,
&dp_packet_genl_family,
&dp_meter_genl_family,
+#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+ &dp_ct_limit_genl_family,
+#endif
};
static void dp_unregister_genl(int n_families)
@@ -2323,8 +2326,7 @@ static int __net_init ovs_init_net(struct net *net)
INIT_LIST_HEAD(&ovs_net->dps);
INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq);
- ovs_ct_init(net);
- return 0;
+ return ovs_ct_init(net);
}
static void __net_exit list_vports_from_net(struct net *net, struct net *dnet,
@@ -2469,3 +2471,4 @@ MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY);
MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY);
MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY);
MODULE_ALIAS_GENL_FAMILY(OVS_METER_FAMILY);
+MODULE_ALIAS_GENL_FAMILY(OVS_CT_LIMIT_FAMILY);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 523d65526766..c9eb267c6f7e 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -144,6 +144,9 @@ struct dp_upcall_info {
struct ovs_net {
struct list_head dps;
struct work_struct dp_notify_work;
+#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+ struct ovs_ct_limit_info *ct_limit_info;
+#endif
/* Module reference for configuring conntrack. */
bool xt_label;
--
2.7.4
^ permalink raw reply related
* Re: [PATCH] [RFC] bpf: tracing: new helper bpf_get_current_cgroup_ino
From: Y Song @ 2018-05-22 0:24 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Alban Crequy, netdev, linux-kernel, containers, cgroups,
Alban Crequy, tj
In-Reply-To: <20180521162609.lpdrnozowmzdn57m@ast-mbp.dhcp.thefacebook.com>
On Mon, May 21, 2018 at 9:26 AM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Sun, May 13, 2018 at 07:33:18PM +0200, Alban Crequy wrote:
>>
>> +BPF_CALL_2(bpf_get_current_cgroup_ino, u32, hierarchy, u64, flags)
>> +{
>> + // TODO: pick the correct hierarchy instead of the mem controller
>> + struct cgroup *cgrp = task_cgroup(current, memory_cgrp_id);
>> +
>> + if (unlikely(!cgrp))
>> + return -EINVAL;
>> + if (unlikely(hierarchy))
>> + return -EINVAL;
>> + if (unlikely(flags))
>> + return -EINVAL;
>> +
>> + return cgrp->kn->id.ino;
>
> ino only is not enough to identify cgroup. It needs generation number too.
> I don't quite see how hierarchy and flags can be used in the future.
> Also why limit it to memcg?
>
> How about something like this instead:
>
> BPF_CALL_2(bpf_get_current_cgroup_id)
> {
> struct cgroup *cgrp = task_dfl_cgroup(current);
>
> return cgrp->kn->id.id;
> }
> The user space can use fhandle api to get the same 64-bit id.
I think this should work. This will also be useful to bcc as user
space can encode desired id
in the bpf program and compared that id to the current cgroup id, so we can have
cgroup level tracing (esp. stat collection) support. To cope with
cgroup hierarchy, user can use
cgroup-array based approach or explicitly compare against multiple cgroup id's.
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox