* [patch net-next 3/5] mlxsw: Do not pass around driver_priv directly
From: Jiri Pirko @ 2016-04-08 15:45 UTC (permalink / raw)
To: netdev; +Cc: davem, idosch, eladr, yotamg, ogerlitz, roopa, gospo
In-Reply-To: <1460130325-14931-1-git-send-email-jiri@resnulli.us>
From: Jiri Pirko <jiri@mellanox.com>
Instead of that, pass mlxsw_core and use a helper to get driver priv
from driver code. Looks much cleaner that way.
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
---
drivers/net/ethernet/mellanox/mlxsw/core.c | 19 +++++++++++--------
drivers/net/ethernet/mellanox/mlxsw/core.h | 11 +++++++----
drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 17 +++++++++--------
drivers/net/ethernet/mellanox/mlxsw/switchx2.c | 8 ++++----
4 files changed, 31 insertions(+), 24 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index 39161fb..3958195 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -114,6 +114,12 @@ struct mlxsw_core {
/* driver_priv has to be always the last item */
};
+void *mlxsw_core_driver_priv(struct mlxsw_core *mlxsw_core)
+{
+ return mlxsw_core->driver_priv;
+}
+EXPORT_SYMBOL(mlxsw_core_driver_priv);
+
struct mlxsw_rx_listener_item {
struct list_head list;
struct mlxsw_rx_listener rxl;
@@ -795,8 +801,7 @@ static int mlxsw_devlink_port_split(struct devlink *devlink,
return -EINVAL;
if (!mlxsw_core->driver->port_split)
return -EOPNOTSUPP;
- return mlxsw_core->driver->port_split(mlxsw_core->driver_priv,
- port_index, count);
+ return mlxsw_core->driver->port_split(mlxsw_core, port_index, count);
}
static int mlxsw_devlink_port_unsplit(struct devlink *devlink,
@@ -808,8 +813,7 @@ static int mlxsw_devlink_port_unsplit(struct devlink *devlink,
return -EINVAL;
if (!mlxsw_core->driver->port_unsplit)
return -EOPNOTSUPP;
- return mlxsw_core->driver->port_unsplit(mlxsw_core->driver_priv,
- port_index);
+ return mlxsw_core->driver->port_unsplit(mlxsw_core, port_index);
}
static const struct devlink_ops mlxsw_devlink_ops = {
@@ -880,8 +884,7 @@ int mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
if (err)
goto err_devlink_register;
- err = mlxsw_driver->init(mlxsw_core->driver_priv, mlxsw_core,
- mlxsw_bus_info);
+ err = mlxsw_driver->init(mlxsw_core, mlxsw_bus_info);
if (err)
goto err_driver_init;
@@ -892,7 +895,7 @@ int mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
return 0;
err_debugfs_init:
- mlxsw_core->driver->fini(mlxsw_core->driver_priv);
+ mlxsw_core->driver->fini(mlxsw_core);
err_driver_init:
devlink_unregister(devlink);
err_devlink_register:
@@ -918,7 +921,7 @@ void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core)
struct devlink *devlink = priv_to_devlink(mlxsw_core);
mlxsw_core_debugfs_fini(mlxsw_core);
- mlxsw_core->driver->fini(mlxsw_core->driver_priv);
+ mlxsw_core->driver->fini(mlxsw_core);
devlink_unregister(devlink);
mlxsw_emad_fini(mlxsw_core);
mlxsw_core->bus->fini(mlxsw_core->bus_priv);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index 0454212..f3cebef 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -62,6 +62,8 @@ struct mlxsw_driver;
struct mlxsw_bus;
struct mlxsw_bus_info;
+void *mlxsw_core_driver_priv(struct mlxsw_core *mlxsw_core);
+
int mlxsw_core_driver_register(struct mlxsw_driver *mlxsw_driver);
void mlxsw_core_driver_unregister(struct mlxsw_driver *mlxsw_driver);
@@ -192,11 +194,12 @@ struct mlxsw_driver {
const char *kind;
struct module *owner;
size_t priv_size;
- int (*init)(void *driver_priv, struct mlxsw_core *mlxsw_core,
+ int (*init)(struct mlxsw_core *mlxsw_core,
const struct mlxsw_bus_info *mlxsw_bus_info);
- void (*fini)(void *driver_priv);
- int (*port_split)(void *driver_priv, u8 local_port, unsigned int count);
- int (*port_unsplit)(void *driver_priv, u8 local_port);
+ void (*fini)(struct mlxsw_core *mlxsw_core);
+ int (*port_split)(struct mlxsw_core *mlxsw_core, u8 local_port,
+ unsigned int count);
+ int (*port_unsplit)(struct mlxsw_core *mlxsw_core, u8 local_port);
void (*txhdr_construct)(struct sk_buff *skb,
const struct mlxsw_tx_info *tx_info);
u8 txhdr_len;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 8abe1a6..19b3c14 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -1948,9 +1948,10 @@ static u8 mlxsw_sp_cluster_base_port_get(u8 local_port)
return local_port - offset;
}
-static int mlxsw_sp_port_split(void *priv, u8 local_port, unsigned int count)
+static int mlxsw_sp_port_split(struct mlxsw_core *mlxsw_core, u8 local_port,
+ unsigned int count)
{
- struct mlxsw_sp *mlxsw_sp = priv;
+ struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
struct mlxsw_sp_port *mlxsw_sp_port;
u8 width = MLXSW_PORT_MODULE_MAX_WIDTH / count;
u8 module, cur_width, base_port;
@@ -2022,9 +2023,9 @@ err_port_create:
return err;
}
-static int mlxsw_sp_port_unsplit(void *priv, u8 local_port)
+static int mlxsw_sp_port_unsplit(struct mlxsw_core *mlxsw_core, u8 local_port)
{
- struct mlxsw_sp *mlxsw_sp = priv;
+ struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
struct mlxsw_sp_port *mlxsw_sp_port;
u8 module, cur_width, base_port;
unsigned int count;
@@ -2369,10 +2370,10 @@ static int mlxsw_sp_lag_init(struct mlxsw_sp *mlxsw_sp)
return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(slcr), slcr_pl);
}
-static int mlxsw_sp_init(void *priv, struct mlxsw_core *mlxsw_core,
+static int mlxsw_sp_init(struct mlxsw_core *mlxsw_core,
const struct mlxsw_bus_info *mlxsw_bus_info)
{
- struct mlxsw_sp *mlxsw_sp = priv;
+ struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
int err;
mlxsw_sp->core = mlxsw_core;
@@ -2443,9 +2444,9 @@ err_event_register:
return err;
}
-static void mlxsw_sp_fini(void *priv)
+static void mlxsw_sp_fini(struct mlxsw_core *mlxsw_core)
{
- struct mlxsw_sp *mlxsw_sp = priv;
+ struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
mlxsw_sp_switchdev_fini(mlxsw_sp);
mlxsw_sp_traps_fini(mlxsw_sp);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
index 2518c84..3842eab 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
@@ -1447,10 +1447,10 @@ static int mlxsw_sx_flood_init(struct mlxsw_sx *mlxsw_sx)
return mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(sgcr), sgcr_pl);
}
-static int mlxsw_sx_init(void *priv, struct mlxsw_core *mlxsw_core,
+static int mlxsw_sx_init(struct mlxsw_core *mlxsw_core,
const struct mlxsw_bus_info *mlxsw_bus_info)
{
- struct mlxsw_sx *mlxsw_sx = priv;
+ struct mlxsw_sx *mlxsw_sx = mlxsw_core_driver_priv(mlxsw_core);
int err;
mlxsw_sx->core = mlxsw_core;
@@ -1497,9 +1497,9 @@ err_event_register:
return err;
}
-static void mlxsw_sx_fini(void *priv)
+static void mlxsw_sx_fini(struct mlxsw_core *mlxsw_core)
{
- struct mlxsw_sx *mlxsw_sx = priv;
+ struct mlxsw_sx *mlxsw_sx = mlxsw_core_driver_priv(mlxsw_core);
mlxsw_sx_traps_fini(mlxsw_sx);
mlxsw_sx_event_unregister(mlxsw_sx, MLXSW_TRAP_ID_PUDE);
--
2.5.5
^ permalink raw reply related
* [patch net-next 2/5] mlxsw: Pass mlxsw_core as a param of mlxsw_core_skb_transmit*
From: Jiri Pirko @ 2016-04-08 15:45 UTC (permalink / raw)
To: netdev; +Cc: davem, idosch, eladr, yotamg, ogerlitz, roopa, gospo
In-Reply-To: <1460130325-14931-1-git-send-email-jiri@resnulli.us>
From: Jiri Pirko <jiri@mellanox.com>
Instead of passing around driver priv, pass struct mlxsw_core *
directly.
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
---
drivers/net/ethernet/mellanox/mlxsw/core.c | 15 +++------------
drivers/net/ethernet/mellanox/mlxsw/core.h | 5 ++---
drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 4 ++--
drivers/net/ethernet/mellanox/mlxsw/switchx2.c | 4 ++--
4 files changed, 9 insertions(+), 19 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index 004fb8b..39161fb 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -381,7 +381,7 @@ static int __mlxsw_emad_transmit(struct mlxsw_core *mlxsw_core,
mlxsw_core->emad.trans_active = true;
- err = mlxsw_core_skb_transmit(mlxsw_core->driver_priv, skb, tx_info);
+ err = mlxsw_core_skb_transmit(mlxsw_core, skb, tx_info);
if (err) {
dev_err(mlxsw_core->bus_info->dev, "Failed to transmit EMAD (tid=%llx)\n",
mlxsw_core->emad.tid);
@@ -929,26 +929,17 @@ void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core)
}
EXPORT_SYMBOL(mlxsw_core_bus_device_unregister);
-static struct mlxsw_core *__mlxsw_core_get(void *driver_priv)
-{
- return container_of(driver_priv, struct mlxsw_core, driver_priv);
-}
-
-bool mlxsw_core_skb_transmit_busy(void *driver_priv,
+bool mlxsw_core_skb_transmit_busy(struct mlxsw_core *mlxsw_core,
const struct mlxsw_tx_info *tx_info)
{
- struct mlxsw_core *mlxsw_core = __mlxsw_core_get(driver_priv);
-
return mlxsw_core->bus->skb_transmit_busy(mlxsw_core->bus_priv,
tx_info);
}
EXPORT_SYMBOL(mlxsw_core_skb_transmit_busy);
-int mlxsw_core_skb_transmit(void *driver_priv, struct sk_buff *skb,
+int mlxsw_core_skb_transmit(struct mlxsw_core *mlxsw_core, struct sk_buff *skb,
const struct mlxsw_tx_info *tx_info)
{
- struct mlxsw_core *mlxsw_core = __mlxsw_core_get(driver_priv);
-
return mlxsw_core->bus->skb_transmit(mlxsw_core->bus_priv, skb,
tx_info);
}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index 06631a0..0454212 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -75,10 +75,9 @@ struct mlxsw_tx_info {
bool is_emad;
};
-bool mlxsw_core_skb_transmit_busy(void *driver_priv,
+bool mlxsw_core_skb_transmit_busy(struct mlxsw_core *mlxsw_core,
const struct mlxsw_tx_info *tx_info);
-
-int mlxsw_core_skb_transmit(void *driver_priv, struct sk_buff *skb,
+int mlxsw_core_skb_transmit(struct mlxsw_core *mlxsw_core, struct sk_buff *skb,
const struct mlxsw_tx_info *tx_info);
struct mlxsw_rx_listener {
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 3216f2b..8abe1a6 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -390,7 +390,7 @@ static netdev_tx_t mlxsw_sp_port_xmit(struct sk_buff *skb,
u64 len;
int err;
- if (mlxsw_core_skb_transmit_busy(mlxsw_sp, &tx_info))
+ if (mlxsw_core_skb_transmit_busy(mlxsw_sp->core, &tx_info))
return NETDEV_TX_BUSY;
if (unlikely(skb_headroom(skb) < MLXSW_TXHDR_LEN)) {
@@ -414,7 +414,7 @@ static netdev_tx_t mlxsw_sp_port_xmit(struct sk_buff *skb,
/* Due to a race we might fail here because of a full queue. In that
* unlikely case we simply drop the packet.
*/
- err = mlxsw_core_skb_transmit(mlxsw_sp, skb, &tx_info);
+ err = mlxsw_core_skb_transmit(mlxsw_sp->core, skb, &tx_info);
if (!err) {
pcpu_stats = this_cpu_ptr(mlxsw_sp_port->pcpu_stats);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
index 2417f09..2518c84 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
@@ -302,7 +302,7 @@ static netdev_tx_t mlxsw_sx_port_xmit(struct sk_buff *skb,
u64 len;
int err;
- if (mlxsw_core_skb_transmit_busy(mlxsw_sx, &tx_info))
+ if (mlxsw_core_skb_transmit_busy(mlxsw_sx->core, &tx_info))
return NETDEV_TX_BUSY;
if (unlikely(skb_headroom(skb) < MLXSW_TXHDR_LEN)) {
@@ -320,7 +320,7 @@ static netdev_tx_t mlxsw_sx_port_xmit(struct sk_buff *skb,
/* Due to a race we might fail here because of a full queue. In that
* unlikely case we simply drop the packet.
*/
- err = mlxsw_core_skb_transmit(mlxsw_sx, skb, &tx_info);
+ err = mlxsw_core_skb_transmit(mlxsw_sx->core, skb, &tx_info);
if (!err) {
pcpu_stats = this_cpu_ptr(mlxsw_sx_port->pcpu_stats);
--
2.5.5
^ permalink raw reply related
* [patch net-next 1/5] mlxsw: Move devlink port registration into common core code
From: Jiri Pirko @ 2016-04-08 15:45 UTC (permalink / raw)
To: netdev; +Cc: davem, idosch, eladr, yotamg, ogerlitz, roopa, gospo
In-Reply-To: <1460130325-14931-1-git-send-email-jiri@resnulli.us>
From: Jiri Pirko <jiri@mellanox.com>
Remove devlink port reg/unreg from spectrum and switchx2 code and rather
do the common work in core. That also ensures code separation where
devlink is only used in core.c.
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
---
drivers/net/ethernet/mellanox/mlxsw/core.c | 22 ++++++++++++++++++
drivers/net/ethernet/mellanox/mlxsw/core.h | 10 +++++++++
drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 31 +++++++++-----------------
drivers/net/ethernet/mellanox/mlxsw/spectrum.h | 3 +--
drivers/net/ethernet/mellanox/mlxsw/switchx2.c | 30 +++++++++----------------
5 files changed, 55 insertions(+), 41 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index f69f628..004fb8b 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -1358,6 +1358,28 @@ void mlxsw_core_lag_mapping_clear(struct mlxsw_core *mlxsw_core,
}
EXPORT_SYMBOL(mlxsw_core_lag_mapping_clear);
+int mlxsw_core_port_init(struct mlxsw_core *mlxsw_core,
+ struct mlxsw_core_port *mlxsw_core_port, u8 local_port,
+ struct net_device *dev, bool split, u32 split_group)
+{
+ struct devlink *devlink = priv_to_devlink(mlxsw_core);
+ struct devlink_port *devlink_port = &mlxsw_core_port->devlink_port;
+
+ if (split)
+ devlink_port_split_set(devlink_port, split_group);
+ devlink_port_type_eth_set(devlink_port, dev);
+ return devlink_port_register(devlink, devlink_port, local_port);
+}
+EXPORT_SYMBOL(mlxsw_core_port_init);
+
+void mlxsw_core_port_fini(struct mlxsw_core_port *mlxsw_core_port)
+{
+ struct devlink_port *devlink_port = &mlxsw_core_port->devlink_port;
+
+ devlink_port_unregister(devlink_port);
+}
+EXPORT_SYMBOL(mlxsw_core_port_fini);
+
int mlxsw_cmd_exec(struct mlxsw_core *mlxsw_core, u16 opcode, u8 opcode_mod,
u32 in_mod, bool out_mbox_direct,
char *in_mbox, size_t in_mbox_size,
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index c73d1c0..06631a0 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -43,6 +43,7 @@
#include <linux/gfp.h>
#include <linux/types.h>
#include <linux/skbuff.h>
+#include <net/devlink.h>
#include "trap.h"
#include "reg.h"
@@ -131,6 +132,15 @@ u8 mlxsw_core_lag_mapping_get(struct mlxsw_core *mlxsw_core,
void mlxsw_core_lag_mapping_clear(struct mlxsw_core *mlxsw_core,
u16 lag_id, u8 local_port);
+struct mlxsw_core_port {
+ struct devlink_port devlink_port;
+};
+
+int mlxsw_core_port_init(struct mlxsw_core *mlxsw_core,
+ struct mlxsw_core_port *mlxsw_core_port, u8 local_port,
+ struct net_device *dev, bool split, u32 split_group);
+void mlxsw_core_port_fini(struct mlxsw_core_port *mlxsw_core_port);
+
#define MLXSW_CONFIG_PROFILE_SWID_COUNT 8
struct mlxsw_swid_config {
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 507263a..3216f2b 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -50,7 +50,6 @@
#include <linux/bitops.h>
#include <linux/list.h>
#include <linux/dcbnl.h>
-#include <net/devlink.h>
#include <net/switchdev.h>
#include <generated/utsrelease.h>
@@ -1685,9 +1684,7 @@ static int mlxsw_sp_port_ets_init(struct mlxsw_sp_port *mlxsw_sp_port)
static int __mlxsw_sp_port_create(struct mlxsw_sp *mlxsw_sp, u8 local_port,
bool split, u8 module, u8 width)
{
- struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
struct mlxsw_sp_port *mlxsw_sp_port;
- struct devlink_port *devlink_port;
struct net_device *dev;
size_t bytes;
int err;
@@ -1740,16 +1737,6 @@ static int __mlxsw_sp_port_create(struct mlxsw_sp *mlxsw_sp, u8 local_port,
*/
dev->hard_header_len += MLXSW_TXHDR_LEN;
- devlink_port = &mlxsw_sp_port->devlink_port;
- if (mlxsw_sp_port->split)
- devlink_port_split_set(devlink_port, module);
- err = devlink_port_register(devlink, devlink_port, local_port);
- if (err) {
- dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to register devlink port\n",
- mlxsw_sp_port->local_port);
- goto err_devlink_port_register;
- }
-
err = mlxsw_sp_port_system_port_mapping_set(mlxsw_sp_port);
if (err) {
dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to set system port mapping\n",
@@ -1812,7 +1799,14 @@ static int __mlxsw_sp_port_create(struct mlxsw_sp *mlxsw_sp, u8 local_port,
goto err_register_netdev;
}
- devlink_port_type_eth_set(devlink_port, dev);
+ err = mlxsw_core_port_init(mlxsw_sp->core, &mlxsw_sp_port->core_port,
+ mlxsw_sp_port->local_port, dev,
+ mlxsw_sp_port->split, module);
+ if (err) {
+ dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to init core port\n",
+ mlxsw_sp_port->local_port);
+ goto err_core_port_init;
+ }
err = mlxsw_sp_port_vlan_init(mlxsw_sp_port);
if (err)
@@ -1822,6 +1816,8 @@ static int __mlxsw_sp_port_create(struct mlxsw_sp *mlxsw_sp, u8 local_port,
return 0;
err_port_vlan_init:
+ mlxsw_core_port_fini(&mlxsw_sp_port->core_port);
+err_core_port_init:
unregister_netdev(dev);
err_register_netdev:
err_port_dcb_init:
@@ -1832,8 +1828,6 @@ err_port_mtu_set:
err_port_speed_by_width_set:
err_port_swid_set:
err_port_system_port_mapping_set:
- devlink_port_unregister(&mlxsw_sp_port->devlink_port);
-err_devlink_port_register:
err_dev_addr_init:
free_percpu(mlxsw_sp_port->pcpu_stats);
err_alloc_stats:
@@ -1887,16 +1881,13 @@ static void mlxsw_sp_port_vports_fini(struct mlxsw_sp_port *mlxsw_sp_port)
static void mlxsw_sp_port_remove(struct mlxsw_sp *mlxsw_sp, u8 local_port)
{
struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp->ports[local_port];
- struct devlink_port *devlink_port;
if (!mlxsw_sp_port)
return;
mlxsw_sp->ports[local_port] = NULL;
- devlink_port = &mlxsw_sp_port->devlink_port;
- devlink_port_type_clear(devlink_port);
+ mlxsw_core_port_fini(&mlxsw_sp_port->core_port);
unregister_netdev(mlxsw_sp_port->dev); /* This calls ndo_stop */
mlxsw_sp_port_dcb_fini(mlxsw_sp_port);
- devlink_port_unregister(devlink_port);
mlxsw_sp_port_vports_fini(mlxsw_sp_port);
mlxsw_sp_port_switchdev_fini(mlxsw_sp_port);
mlxsw_sp_port_swid_set(mlxsw_sp_port, MLXSW_PORT_SWID_DISABLED_PORT);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index 47610a5..361b0c2 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -44,7 +44,6 @@
#include <linux/list.h>
#include <linux/dcbnl.h>
#include <net/switchdev.h>
-#include <net/devlink.h>
#include "port.h"
#include "core.h"
@@ -166,6 +165,7 @@ struct mlxsw_sp_port_pcpu_stats {
};
struct mlxsw_sp_port {
+ struct mlxsw_core_port core_port; /* must be first */
struct net_device *dev;
struct mlxsw_sp_port_pcpu_stats __percpu *pcpu_stats;
struct mlxsw_sp *mlxsw_sp;
@@ -198,7 +198,6 @@ struct mlxsw_sp_port {
unsigned long *untagged_vlans;
/* VLAN interfaces */
struct list_head vports_list;
- struct devlink_port devlink_port;
};
static inline bool
diff --git a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
index c49447f..2417f09 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
@@ -43,7 +43,6 @@
#include <linux/device.h>
#include <linux/skbuff.h>
#include <linux/if_vlan.h>
-#include <net/devlink.h>
#include <net/switchdev.h>
#include <generated/utsrelease.h>
@@ -75,11 +74,11 @@ struct mlxsw_sx_port_pcpu_stats {
};
struct mlxsw_sx_port {
+ struct mlxsw_core_port core_port; /* must be first */
struct net_device *dev;
struct mlxsw_sx_port_pcpu_stats __percpu *pcpu_stats;
struct mlxsw_sx *mlxsw_sx;
u8 local_port;
- struct devlink_port devlink_port;
};
/* tx_hdr_version
@@ -956,9 +955,7 @@ mlxsw_sx_port_mac_learning_mode_set(struct mlxsw_sx_port *mlxsw_sx_port,
static int mlxsw_sx_port_create(struct mlxsw_sx *mlxsw_sx, u8 local_port)
{
- struct devlink *devlink = priv_to_devlink(mlxsw_sx->core);
struct mlxsw_sx_port *mlxsw_sx_port;
- struct devlink_port *devlink_port;
struct net_device *dev;
bool usable;
int err;
@@ -1012,14 +1009,6 @@ static int mlxsw_sx_port_create(struct mlxsw_sx *mlxsw_sx, u8 local_port)
goto port_not_usable;
}
- devlink_port = &mlxsw_sx_port->devlink_port;
- err = devlink_port_register(devlink, devlink_port, local_port);
- if (err) {
- dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to register devlink port\n",
- mlxsw_sx_port->local_port);
- goto err_devlink_port_register;
- }
-
err = mlxsw_sx_port_system_port_mapping_set(mlxsw_sx_port);
if (err) {
dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to set system port mapping\n",
@@ -1077,11 +1066,19 @@ static int mlxsw_sx_port_create(struct mlxsw_sx *mlxsw_sx, u8 local_port)
goto err_register_netdev;
}
- devlink_port_type_eth_set(devlink_port, dev);
+ err = mlxsw_core_port_init(mlxsw_sx->core, &mlxsw_sx_port->core_port,
+ mlxsw_sx_port->local_port, dev, false, 0);
+ if (err) {
+ dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to init core port\n",
+ mlxsw_sx_port->local_port);
+ goto err_core_port_init;
+ }
mlxsw_sx->ports[local_port] = mlxsw_sx_port;
return 0;
+err_core_port_init:
+ unregister_netdev(dev);
err_register_netdev:
err_port_mac_learning_mode_set:
err_port_stp_state_set:
@@ -1090,8 +1087,6 @@ err_port_mtu_set:
err_port_speed_set:
err_port_swid_set:
err_port_system_port_mapping_set:
- devlink_port_unregister(&mlxsw_sx_port->devlink_port);
-err_devlink_port_register:
port_not_usable:
err_port_module_check:
err_dev_addr_get:
@@ -1104,15 +1099,12 @@ err_alloc_stats:
static void mlxsw_sx_port_remove(struct mlxsw_sx *mlxsw_sx, u8 local_port)
{
struct mlxsw_sx_port *mlxsw_sx_port = mlxsw_sx->ports[local_port];
- struct devlink_port *devlink_port;
if (!mlxsw_sx_port)
return;
- devlink_port = &mlxsw_sx_port->devlink_port;
- devlink_port_type_clear(devlink_port);
+ mlxsw_core_port_fini(&mlxsw_sx_port->core_port);
unregister_netdev(mlxsw_sx_port->dev); /* This calls ndo_stop */
mlxsw_sx_port_swid_set(mlxsw_sx_port, MLXSW_PORT_SWID_DISABLED_PORT);
- devlink_port_unregister(devlink_port);
free_percpu(mlxsw_sx_port->pcpu_stats);
free_netdev(mlxsw_sx_port->dev);
}
--
2.5.5
^ permalink raw reply related
* [patch net-next 0/5] mlxsw: small driver update
From: Jiri Pirko @ 2016-04-08 15:45 UTC (permalink / raw)
To: netdev; +Cc: davem, idosch, eladr, yotamg, ogerlitz, roopa, gospo
From: Jiri Pirko <jiri@mellanox.com>
Cosmetics, in preparation to sharedbuffer patchset.
Jiri Pirko (5):
mlxsw: Move devlink port registration into common core code
mlxsw: Pass mlxsw_core as a param of mlxsw_core_skb_transmit*
mlxsw: Do not pass around driver_priv directly
mlxsw: reg: Share direction enum between SBPR, SBCM, SBPM
mlxsw: reg: Fix SBPM register name
drivers/net/ethernet/mellanox/mlxsw/core.c | 56 ++++++++++++++--------
drivers/net/ethernet/mellanox/mlxsw/core.h | 26 +++++++---
drivers/net/ethernet/mellanox/mlxsw/reg.h | 27 ++++-------
drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 52 +++++++++-----------
drivers/net/ethernet/mellanox/mlxsw/spectrum.h | 3 +-
.../net/ethernet/mellanox/mlxsw/spectrum_buffers.c | 20 ++++----
drivers/net/ethernet/mellanox/mlxsw/switchx2.c | 42 +++++++---------
7 files changed, 114 insertions(+), 112 deletions(-)
--
2.5.5
^ permalink raw reply
* Re: [PATCH net-next] ipv6, token: allow for clearing the current device token
From: Hannes Frederic Sowa @ 2016-04-08 15:36 UTC (permalink / raw)
To: Bjørn Mork; +Cc: Daniel Borkmann, davem, robbat2, netdev
In-Reply-To: <874mbct95a.fsf@nemi.mork.no>
On 08.04.2016 17:25, Bjørn Mork wrote:
> Hannes Frederic Sowa <hannes@stressinduktion.org> writes:
>
>> On Fri, Apr 8, 2016, at 16:18, Bjørn Mork wrote:
>>> Daniel Borkmann <daniel@iogearbox.net> writes:
>>>
>>>>
>>>> if (!token)
>>>> return -EINVAL;
>>>> - if (ipv6_addr_any(token))
>>>> - return -EINVAL;
>>>> if (dev->flags & (IFF_LOOPBACK | IFF_NOARP))
>>>> return -EINVAL;
>>>
>>> Not directly related to the patch in question. It just made me aware of
>>> this restriction...
>>>
>>> I realize that I'm a few years late here, but what's with the IFF_NOARP?
>>> Is that just because we can't do DAD for the token based addresses? How
>>> is that different from manually configuring the whole address?
>>
>> IFF_NOARP is kind of the equivalent to no neighbor discovery. If you set
>> a token and never get in a router advertisement you never create a
>> tokenized ip address, thus the feature is useless.
>
> You can get router advertisements with IFF_NOARP. You cannot lookup L2
> addresses, but the L3 prefix info is still as useful as with any other
> interface.
Of course router advertisements can be send and received with IFF_NOARP
and probably we act on them as usual, as you showed. Looking in the
source we don't really specify what those flags mean/do for IPv6. So I
think you can assume that it is in there because of history.
I would absolutely not mind if you remove the limitation for IFF_ARP.
Bye,
Hannes
^ permalink raw reply
* Re: [RFC v5 0/5] Add virtio transport for AF_VSOCK
From: Ian Campbell @ 2016-04-08 15:35 UTC (permalink / raw)
To: Stefan Hajnoczi, kvm
Cc: netdev, Michael S. Tsirkin, Matt Benjamin, Christoffer Dall,
Alex Bennée, marius vlad, areis, Claudio Imbrenda, Greg Kurz,
virtualization
In-Reply-To: <1459520587-12337-1-git-send-email-stefanha@redhat.com>
On Fri, 2016-04-01 at 15:23 +0100, Stefan Hajnoczi wrote:
> This series is based on Michael Tsirkin's vhost branch (v4.5-rc6).
>
> I'm about to process Claudio Imbrenda's locking fixes for virtio-vsock but
> first I want to share the latest version of the code. Several people are
> playing with vsock now so sharing the latest code should avoid duplicate work.
Thanks for this, I've been using it in my project and it mostly seems
fine.
One wrinkle I came across, which I'm not sure if it is by design or a
problem is that I can see this sequence coming from the guest (with
other activity in between):
1) OP_SHUTDOWN w/ flags == SHUTDOWN_RX
2) OP_SHUTDOWN w/ flags == SHUTDOWN_TX
3) OP_SHUTDOWN w/ flags == SHUTDOWN_TX|SHUTDOWN_RX
I orignally had my backend close things down at #2, however this meant
that when #3 arrived it was for a non-existent socket (or, worse, an
active one if the ports got reused). I checked v5 of the spec
proposal[0] which says:
If these bits are set and there are no more virtqueue buffers
pending the socket is disconnected.
but I'm not entirely sure if this behaviour contradicts this or not
(the bits have both been set at #2, but not at the same time).
BTW, how does one tell if there are no more virtqueue buffers pending
or not while processing the op?
Another thing I noticed, which is really more to do with the generic
AF_VSOCK bits than anything to do with your patches is that there is no
limitations on which vsock ports a non-privileged user can bind to and
relatedly that there is no netns support so e.g. users in unproivileged
containers can bind to any vsock port and talk to the host, which might
be undesirable. For my use for now I just went with the big hammer
approach of denying access from anything other than init_net
namespace[1] while I consider what the right answer is.
Ian.
[0] http://thread.gmane.org/gmane.comp.emulators.virtio.devel/1092
[1]
From 366c9c42afb9bd54f92f72518470c09e46f12e88 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@docker.com>
Date: Mon, 4 Apr 2016 14:50:10 +0100
Subject: [PATCH] VSOCK: Only allow host network namespace to use AF_VSOCK.
The VSOCK addressing schema does not really lend itself to simply creating an
alternative end point address within a namespace.
Signed-off-by: Ian Campbell <ian.campbell@docker.com>
---
net/vmw_vsock/af_vsock.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 1e5f5ed..cdb3dd3 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1840,6 +1840,9 @@ static const struct proto_ops vsock_stream_ops = {
static int vsock_create(struct net *net, struct socket *sock,
int protocol, int kern)
{
+ if (!net_eq(net, &init_net))
+ return -EAFNOSUPPORT;
+
if (!sock)
return -EINVAL;
--
2.8.0.rc3
^ permalink raw reply related
* [PATCH] can: mcp251x: Replace create_freezable_workqueue with alloc_workqueue
From: Amitoj Kaur Chawla @ 2016-04-08 15:32 UTC (permalink / raw)
To: wg, mkl, linux-can, netdev, linux-kernel; +Cc: tj
Replace scheduled to be removed create_freezable_workqueue with
alloc_workqueue.
priv->wq should be explicitly set as freezable to ensure it is frozen
in the suspend sequence and work items are drained so that no new work
item starts execution until thawed. Thus, use of WQ_FREEZABLE flag
here is required.
WQ_MEM_RECLAIM flag has been set here to ensure forward progress
regardless of memory pressure.
The order of execution is not important so set @max_active as 0.
Signed-off-by: Amitoj Kaur Chawla <amitoj1606@gmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
drivers/net/can/spi/mcp251x.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/can/spi/mcp251x.c b/drivers/net/can/spi/mcp251x.c
index 74a7dfe..cf36d26 100644
--- a/drivers/net/can/spi/mcp251x.c
+++ b/drivers/net/can/spi/mcp251x.c
@@ -961,7 +961,8 @@ static int mcp251x_open(struct net_device *net)
goto open_unlock;
}
- priv->wq = create_freezable_workqueue("mcp251x_wq");
+ priv->wq = alloc_workqueue("mcp251x_wq", WQ_FREEZABLE | WQ_MEM_RECLAIM,
+ 0);
INIT_WORK(&priv->tx_work, mcp251x_tx_work_handler);
INIT_WORK(&priv->restart_work, mcp251x_restart_work_handler);
--
1.9.1
^ permalink raw reply related
* Re: [PATCH net-next] ipv6, token: allow for clearing the current device token
From: Bjørn Mork @ 2016-04-08 15:25 UTC (permalink / raw)
To: Hannes Frederic Sowa; +Cc: Daniel Borkmann, davem, robbat2, netdev
In-Reply-To: <1460126038.1452331.572938089.68D8127C@webmail.messagingengine.com>
Hannes Frederic Sowa <hannes@stressinduktion.org> writes:
> On Fri, Apr 8, 2016, at 16:18, Bjørn Mork wrote:
>> Daniel Borkmann <daniel@iogearbox.net> writes:
>>
>> >
>> > if (!token)
>> > return -EINVAL;
>> > - if (ipv6_addr_any(token))
>> > - return -EINVAL;
>> > if (dev->flags & (IFF_LOOPBACK | IFF_NOARP))
>> > return -EINVAL;
>>
>> Not directly related to the patch in question. It just made me aware of
>> this restriction...
>>
>> I realize that I'm a few years late here, but what's with the IFF_NOARP?
>> Is that just because we can't do DAD for the token based addresses? How
>> is that different from manually configuring the whole address?
>
> IFF_NOARP is kind of the equivalent to no neighbor discovery. If you set
> a token and never get in a router advertisement you never create a
> tokenized ip address, thus the feature is useless.
You can get router advertisements with IFF_NOARP. You cannot lookup L2
addresses, but the L3 prefix info is still as useful as with any other
interface.
Doing
tshark -nVi any -f ip6
while bringing up the POINTOPOINT NOARP interface shown at the end shows
the expected RS and RA:
Frame 1: 64 bytes on wire (512 bits), 64 bytes captured (512 bits) on interface 0
Interface id: 0 (any)
Encapsulation type: Linux cooked-mode capture (25)
Arrival Time: Apr 8, 2016 17:18:36.094554456 CEST
[Time shift for this packet: 0.000000000 seconds]
Epoch Time: 1460128716.094554456 seconds
[Time delta from previous captured frame: 0.000000000 seconds]
[Time delta from previous displayed frame: 0.000000000 seconds]
[Time since reference or first frame: 0.000000000 seconds]
Frame Number: 1
Frame Length: 64 bytes (512 bits)
Capture Length: 64 bytes (512 bits)
[Frame is marked: False]
[Frame is ignored: False]
[Protocols in frame: sll:ethertype:ipv6:icmpv6]
Linux cooked capture
Packet type: Sent by us (4)
Link-layer address type: 65534
Link-layer address length: 0
Protocol: IPv6 (0x86dd)
Internet Protocol Version 6, Src: fe80::8019:47ef:17a1:8c38, Dst: ff02::2
0110 .... = Version: 6
.... 0000 0000 .... .... .... .... .... = Traffic class: 0x00 (DSCP: CS0, ECN: Not-ECT)
.... 0000 00.. .... .... .... .... .... = Differentiated Services Codepoint: Default (0)
.... .... ..00 .... .... .... .... .... = Explicit Congestion Notification: Not ECN-Capable Transport (0)
.... .... .... 0000 0000 0000 0000 0000 = Flowlabel: 0x00000000
Payload length: 8
Next header: ICMPv6 (58)
Hop limit: 255
Source: fe80::8019:47ef:17a1:8c38
Destination: ff02::2
[Source GeoIP: Unknown]
[Destination GeoIP: Unknown]
Internet Control Message Protocol v6
Type: Router Solicitation (133)
Code: 0
Checksum: 0x1155 [correct]
Reserved: 00000000
Frame 2: 120 bytes on wire (960 bits), 120 bytes captured (960 bits) on interface 0
Interface id: 0 (any)
Encapsulation type: Linux cooked-mode capture (25)
Arrival Time: Apr 8, 2016 17:18:36.101806992 CEST
[Time shift for this packet: 0.000000000 seconds]
Epoch Time: 1460128716.101806992 seconds
[Time delta from previous captured frame: 0.007252536 seconds]
[Time delta from previous displayed frame: 0.007252536 seconds]
[Time since reference or first frame: 0.007252536 seconds]
Frame Number: 2
Frame Length: 120 bytes (960 bits)
Capture Length: 120 bytes (960 bits)
[Frame is marked: False]
[Frame is ignored: False]
[Protocols in frame: sll:ethertype:ipv6:icmpv6]
Linux cooked capture
Packet type: Unicast to us (0)
Link-layer address type: 65534
Link-layer address length: 0
Protocol: IPv6 (0x86dd)
Internet Protocol Version 6, Src: fe80::a5a6:793:6bfe:ea1c, Dst: fe80::8019:47ef:17a1:8c38
0110 .... = Version: 6
.... 0000 0000 .... .... .... .... .... = Traffic class: 0x00 (DSCP: CS0, ECN: Not-ECT)
.... 0000 00.. .... .... .... .... .... = Differentiated Services Codepoint: Default (0)
.... .... ..00 .... .... .... .... .... = Explicit Congestion Notification: Not ECN-Capable Transport (0)
.... .... .... 0000 0000 0000 0000 0000 = Flowlabel: 0x00000000
Payload length: 64
Next header: ICMPv6 (58)
Hop limit: 255
Source: fe80::a5a6:793:6bfe:ea1c
Destination: fe80::8019:47ef:17a1:8c38
[Source GeoIP: Unknown]
[Destination GeoIP: Unknown]
Internet Control Message Protocol v6
Type: Router Advertisement (134)
Code: 0
Checksum: 0x96fa [correct]
Cur hop limit: 255
Flags: 0x40
0... .... = Managed address configuration: Not set
.1.. .... = Other configuration: Set
..0. .... = Home Agent: Not set
...0 0... = Prf (Default Router Preference): Medium (0)
.... .0.. = Proxy: Not set
.... ..0. = Reserved: 0
Router lifetime (s): 65535
Reachable time (ms): 0
Retrans timer (ms): 0
ICMPv6 Option (Source link-layer address : 02:50:f3:00:01:00)
Type: Source link-layer address (1)
Length: 1 (8 bytes)
Link-layer address: 02:50:f3:00:01:00
ICMPv6 Option (MTU : 1500)
Type: MTU (5)
Length: 1 (8 bytes)
Reserved
MTU: 1500
ICMPv6 Option (Prefix information : 2a02:2121:81:e578::/64)
Type: Prefix information (3)
Length: 4 (32 bytes)
Prefix Length: 64
Flag: 0xc0
1... .... = On-link flag(L): Set
.1.. .... = Autonomous address-configuration flag(A): Set
..0. .... = Router address flag(R): Not set
...0 0000 = Reserved: 0
Valid Lifetime: 4294967295 (Infinity)
Preferred Lifetime: 4294967295 (Infinity)
Reserved
Prefix: 2a02:2121:81:e578::
nemi:/tmp# ifconfig wwan0
wwan0 Link encap:UNSPEC HWaddr 00-00-00-00-00-00-00-00-00-00-00-00-00-00-00-00
inet addr:10.135.186.66 P-t-P:10.135.186.66 Mask:255.255.255.252
inet6 addr: 2a02:2121:81:e578:bce:7da1:24a5:af48/64 Scope:Global
inet6 addr: fe80::8019:47ef:17a1:8c38/64 Scope:Link
UP POINTOPOINT RUNNING NOARP MULTICAST MTU:1500 Metric:1
RX packets:3 errors:0 dropped:0 overruns:0 frame:0
TX packets:3 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1000
RX bytes:716 (716.0 B) TX bytes:704 (704.0 B)
(the other end is an LTE modem here)
Bjørn
^ permalink raw reply
* Re: [PATCH] [linux-next] Doc: networking: Fix typo in dsa
From: Andrew Lunn @ 2016-04-08 15:19 UTC (permalink / raw)
To: Masanari Iida; +Cc: linux-kernel, corbet, linux-doc, davem, netdev
In-Reply-To: <1460127625-6956-1-git-send-email-standby24x7@gmail.com>
On Sat, Apr 09, 2016 at 12:00:25AM +0900, Masanari Iida wrote:
> This patch fix typos in Documentation/networking/dsa.
>
> Signed-off-by: Masanari Iida <standby24x7@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Thanks
Andrew
> ---
> Documentation/networking/dsa/bcm_sf2.txt | 2 +-
> Documentation/networking/dsa/dsa.txt | 2 +-
> 2 files changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/networking/dsa/bcm_sf2.txt b/Documentation/networking/dsa/bcm_sf2.txt
> index d999d0c1c5b8..eba3a2431e91 100644
> --- a/Documentation/networking/dsa/bcm_sf2.txt
> +++ b/Documentation/networking/dsa/bcm_sf2.txt
> @@ -38,7 +38,7 @@ Implementation details
> ======================
>
> The driver is located in drivers/net/dsa/bcm_sf2.c and is implemented as a DSA
> -driver; see Documentation/networking/dsa/dsa.txt for details on the subsytem
> +driver; see Documentation/networking/dsa/dsa.txt for details on the subsystem
> and what it provides.
>
> The SF2 switch is configured to enable a Broadcom specific 4-bytes switch tag
> diff --git a/Documentation/networking/dsa/dsa.txt b/Documentation/networking/dsa/dsa.txt
> index 3b196c304b73..36f905d9c77c 100644
> --- a/Documentation/networking/dsa/dsa.txt
> +++ b/Documentation/networking/dsa/dsa.txt
> @@ -334,7 +334,7 @@ more specifically with its VLAN filtering portion when configuring VLANs on top
> of per-port slave network devices. Since DSA primarily deals with
> MDIO-connected switches, although not exclusively, SWITCHDEV's
> prepare/abort/commit phases are often simplified into a prepare phase which
> -checks whether the operation is supporte by the DSA switch driver, and a commit
> +checks whether the operation is supported by the DSA switch driver, and a commit
> phase which applies the changes.
>
> As of today, the only SWITCHDEV objects supported by DSA are the FDB and VLAN
> --
> 2.8.0
>
^ permalink raw reply
* [PATCH v2] route: do not cache fib route info on local routes with oif
From: Chris Friesen @ 2016-04-08 15:08 UTC (permalink / raw)
To: Julian Anastasov; +Cc: netdev
In-Reply-To: <alpine.LFD.2.11.1604072305400.2154@ja.home.ssi.bg>
For local routes that require a particular output interface we do not want to
cache the result. Caching the result causes incorrect behaviour when there are
multiple source addresses on the interface. The end result being that if the
intended recipient is waiting on that interface for the packet he won't receive
it because it will be delivered on the loopback interface and the IP_PKTINFO
ipi_ifindex will be set to the loopback interface as well.
This can be tested by running a program such as "dhcp_release" which attempts
to inject a packet on a particular interface so that it is received by another
program on the same board. The receiving process should see an IP_PKTINFO
ipi_ifndex value of the source interface (e.g., eth1) instead of the loopback
interface (e.g., lo). The packet will still appear on the loopback interface
in tcpdump but the important aspect is that the CMSG info is correct.
Sample dhcp_release command line:
dhcp_release eth1 192.168.204.222 02:11:33:22:44:66
Signed-off-by: Allain Legacy <allain.legacy@windriver.com>
Signed off-by: Chris Friesen <chris.friesen@windriver.com>
---
net/ipv4/route.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 02c6229..437a377 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2045,6 +2045,18 @@ static struct rtable *__mkroute_output(const struct
fib_result *res,
*/
if (fi && res->prefixlen < 4)
fi = NULL;
+ } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
+ (orig_oif != dev_out->ifindex)) {
+ /* For local routes that require a particular output interface
+ * we do not want to cache the result. Caching the result
+ * causes incorrect behaviour when there are multiple source
+ * addresses on the interface, the end result being that if the
+ * intended recipient is waiting on that interface for the
+ * packet he won't receive it because it will be delivered on
+ * the loopback interface and the IP_PKTINFO ipi_ifindex will
+ * be set to the loopback interface as well.
+ */
+ fi = NULL;
}
fnhe = NULL;
^ permalink raw reply related
* Re: [RFC PATCH] possible bug in handling of ipv4 route caching
From: Chris Friesen @ 2016-04-08 15:00 UTC (permalink / raw)
To: Julian Anastasov; +Cc: netdev
In-Reply-To: <alpine.LFD.2.11.1604072305400.2154@ja.home.ssi.bg>
On 04/07/2016 03:20 PM, Julian Anastasov wrote:
> On Thu, 7 Apr 2016, Chris Friesen wrote:
>
>> Hi,
>>
>> We think we may have found a bug in the handling of ipv4 route caching,
>> and are curious what you think.
>>
>> For local routes that require a particular output interface we do not
>> want to cache the result. Caching the result causes incorrect behaviour
>> when there are multiple source addresses on the interface. The end
>> result being that if the intended recipient is waiting on that interface
>> for the packet he won't receive it because it will be delivered on the
>> loopback interface and the IP_PKTINFO ipi_ifindex will be set to the
>> loopback interface as well.
>> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
>> index 02c6229..e965d4b 100644
>> --- a/net/ipv4/route.c
>> +++ b/net/ipv4/route.c
>> @@ -2045,6 +2045,17 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
>> */
>> if (fi && res->prefixlen < 4)
>> fi = NULL;
>> + } else if ((type == RTN_LOCAL) && (orig_oif != 0)) {
>
> So, we can be more specific. Can this work?:
>
> } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
> (orig_oif != dev_out->ifindex)) {
>
> I.e. we should allow to cache orig_oif=LOOPBACK_IFINDEX
> but eth1 should not be cached.
Yes, we think that will work. New patch to follow.
Chris
^ permalink raw reply
* [PATCH] [linux-next] Doc: networking: Fix typo in dsa
From: Masanari Iida @ 2016-04-08 15:00 UTC (permalink / raw)
To: linux-kernel, corbet, linux-doc, davem, netdev; +Cc: Masanari Iida
This patch fix typos in Documentation/networking/dsa.
Signed-off-by: Masanari Iida <standby24x7@gmail.com>
---
Documentation/networking/dsa/bcm_sf2.txt | 2 +-
Documentation/networking/dsa/dsa.txt | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/Documentation/networking/dsa/bcm_sf2.txt b/Documentation/networking/dsa/bcm_sf2.txt
index d999d0c1c5b8..eba3a2431e91 100644
--- a/Documentation/networking/dsa/bcm_sf2.txt
+++ b/Documentation/networking/dsa/bcm_sf2.txt
@@ -38,7 +38,7 @@ Implementation details
======================
The driver is located in drivers/net/dsa/bcm_sf2.c and is implemented as a DSA
-driver; see Documentation/networking/dsa/dsa.txt for details on the subsytem
+driver; see Documentation/networking/dsa/dsa.txt for details on the subsystem
and what it provides.
The SF2 switch is configured to enable a Broadcom specific 4-bytes switch tag
diff --git a/Documentation/networking/dsa/dsa.txt b/Documentation/networking/dsa/dsa.txt
index 3b196c304b73..36f905d9c77c 100644
--- a/Documentation/networking/dsa/dsa.txt
+++ b/Documentation/networking/dsa/dsa.txt
@@ -334,7 +334,7 @@ more specifically with its VLAN filtering portion when configuring VLANs on top
of per-port slave network devices. Since DSA primarily deals with
MDIO-connected switches, although not exclusively, SWITCHDEV's
prepare/abort/commit phases are often simplified into a prepare phase which
-checks whether the operation is supporte by the DSA switch driver, and a commit
+checks whether the operation is supported by the DSA switch driver, and a commit
phase which applies the changes.
As of today, the only SWITCHDEV objects supported by DSA are the FDB and VLAN
--
2.8.0
^ permalink raw reply related
* Re: [PATCH RFC] net: decrease the length of backlog queue immediately after it's detached from sk
From: Eric Dumazet @ 2016-04-08 14:44 UTC (permalink / raw)
To: Yang Yingliang; +Cc: netdev, davem, Ding Tianhong
In-Reply-To: <5707939B.2030907@huawei.com>
On Fri, 2016-04-08 at 19:18 +0800, Yang Yingliang wrote:
> I expand tcp_adv_win_scale and tcp_rmem. It has no effect.
Try :
echo -2 >/proc/sys/net/ipv4/tcp_adv_win_scale
And restart your flows.
^ permalink raw reply
* Re: [PATCH net-next] ipv6, token: allow for clearing the current device token
From: Hannes Frederic Sowa @ 2016-04-08 14:33 UTC (permalink / raw)
To: Bjørn Mork, Daniel Borkmann; +Cc: davem, robbat2, netdev
In-Reply-To: <878u0otc96.fsf@nemi.mork.no>
On Fri, Apr 8, 2016, at 16:18, Bjørn Mork wrote:
> Daniel Borkmann <daniel@iogearbox.net> writes:
>
> >
> > if (!token)
> > return -EINVAL;
> > - if (ipv6_addr_any(token))
> > - return -EINVAL;
> > if (dev->flags & (IFF_LOOPBACK | IFF_NOARP))
> > return -EINVAL;
>
> Not directly related to the patch in question. It just made me aware of
> this restriction...
>
> I realize that I'm a few years late here, but what's with the IFF_NOARP?
> Is that just because we can't do DAD for the token based addresses? How
> is that different from manually configuring the whole address?
IFF_NOARP is kind of the equivalent to no neighbor discovery. If you set
a token and never get in a router advertisement you never create a
tokenized ip address, thus the feature is useless.
Bye,
Hannes
^ permalink raw reply
* Re: qdisc spin lock
From: Eric Dumazet @ 2016-04-08 14:19 UTC (permalink / raw)
To: Michael Ma; +Cc: Cong Wang, Linux Kernel Network Developers
In-Reply-To: <CAAmHdhxagKnLP1_5ZW7HTsVBu0TSFYKCvNstAEWN-NHrdnvvVQ@mail.gmail.com>
On Thu, 2016-03-31 at 16:48 -0700, Michael Ma wrote:
> I didn't really know that multiple qdiscs can be isolated using MQ so
> that each txq can be associated with a particular qdisc. Also we don't
> really have multiple interfaces...
>
> With this MQ solution we'll still need to assign transmit queues to
> different classes by doing some math on the bandwidth limit if I
> understand correctly, which seems to be less convenient compared with
> a solution purely within HTB.
>
> I assume that with this solution I can still share qdisc among
> multiple transmit queues - please let me know if this is not the case.
Note that this MQ + HTB thing works well, unless you use a bonding
device. (Or you need the MQ+HTB on the slaves, with no way of sharing
tokens between the slaves)
https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=bb1d912323d5dd50e1079e389f4e964be14f0ae3
bonding can not really be used as a true MQ device yet.
I might send a patch to disable this 'bonding feature' if no slave sets
a queue_id.
^ permalink raw reply
* Re: [PATCH net-next] ipv6, token: allow for clearing the current device token
From: Bjørn Mork @ 2016-04-08 14:18 UTC (permalink / raw)
To: Daniel Borkmann; +Cc: davem, hannes, robbat2, netdev
In-Reply-To: <307b4d32099f606d0fbe0ce9fecd3a039b796361.1460123261.git.daniel@iogearbox.net>
Daniel Borkmann <daniel@iogearbox.net> writes:
>
> if (!token)
> return -EINVAL;
> - if (ipv6_addr_any(token))
> - return -EINVAL;
> if (dev->flags & (IFF_LOOPBACK | IFF_NOARP))
> return -EINVAL;
Not directly related to the patch in question. It just made me aware of
this restriction...
I realize that I'm a few years late here, but what's with the IFF_NOARP?
Is that just because we can't do DAD for the token based addresses? How
is that different from manually configuring the whole address?
Bjørn
^ permalink raw reply
* Re: [PATCH net-next] ipv6, token: allow for clearing the current device token
From: Hannes Frederic Sowa @ 2016-04-08 13:57 UTC (permalink / raw)
To: Daniel Borkmann, davem; +Cc: robbat2, netdev
In-Reply-To: <307b4d32099f606d0fbe0ce9fecd3a039b796361.1460123261.git.daniel@iogearbox.net>
On 08.04.2016 15:55, Daniel Borkmann wrote:
> The original tokenized iid support implemented via f53adae4eae5 ("net: ipv6:
> add tokenized interface identifier support") didn't allow for clearing a
> device token as it was intended that this addressing mode was the only one
> active for globally scoped IPv6 addresses. Later we relaxed that restriction
> via 617fe29d45bd ("net: ipv6: only invalidate previously tokenized addresses"),
> and we should also allow for clearing tokens as there's no good reason why
> it shouldn't be allowed.
>
> Fixes: 617fe29d45bd ("net: ipv6: only invalidate previously tokenized addresses")
> Reported-by: Robin H. Johnson <robbat2@gentoo.org>
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Thanks,
Hannes
^ permalink raw reply
* [PATCH net-next] ipv6, token: allow for clearing the current device token
From: Daniel Borkmann @ 2016-04-08 13:55 UTC (permalink / raw)
To: davem; +Cc: hannes, robbat2, netdev, Daniel Borkmann
The original tokenized iid support implemented via f53adae4eae5 ("net: ipv6:
add tokenized interface identifier support") didn't allow for clearing a
device token as it was intended that this addressing mode was the only one
active for globally scoped IPv6 addresses. Later we relaxed that restriction
via 617fe29d45bd ("net: ipv6: only invalidate previously tokenized addresses"),
and we should also allow for clearing tokens as there's no good reason why
it shouldn't be allowed.
Fixes: 617fe29d45bd ("net: ipv6: only invalidate previously tokenized addresses")
Reported-by: Robin H. Johnson <robbat2@gentoo.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
---
Dave, I think net-next is fine, but don't have any objections if you rather
want to put it into net. Thanks!
net/ipv6/addrconf.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 27aed1a..a6c9927 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4995,15 +4995,13 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token)
{
struct inet6_ifaddr *ifp;
struct net_device *dev = idev->dev;
- bool update_rs = false;
+ bool clear_token, update_rs = false;
struct in6_addr ll_addr;
ASSERT_RTNL();
if (!token)
return -EINVAL;
- if (ipv6_addr_any(token))
- return -EINVAL;
if (dev->flags & (IFF_LOOPBACK | IFF_NOARP))
return -EINVAL;
if (!ipv6_accept_ra(idev))
@@ -5018,10 +5016,13 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token)
write_unlock_bh(&idev->lock);
+ clear_token = ipv6_addr_any(token);
+ if (clear_token)
+ goto update_lft;
+
if (!idev->dead && (idev->if_flags & IF_READY) &&
!ipv6_get_lladdr(dev, &ll_addr, IFA_F_TENTATIVE |
IFA_F_OPTIMISTIC)) {
-
/* If we're not ready, then normal ifup will take care
* of this. Otherwise, we need to request our rs here.
*/
@@ -5029,6 +5030,7 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token)
update_rs = true;
}
+update_lft:
write_lock_bh(&idev->lock);
if (update_rs) {
--
1.9.3
^ permalink raw reply related
* Please get back to me.
From: Nawal @ 2016-04-08 13:29 UTC (permalink / raw)
To: Recipients
Hi, my name is Nawal denison, i need your assistance to move my family
out of Egypt to your country due to the conflict and killings going on here . I have $3. million dollars. which i will use to Establish A
viable joint company with you for your help, attach is my pic and my daughter
Thank you.
Nawal denison
^ permalink raw reply
* Re: [PATCH] iproute2: tc_bpf.c: fix building with musl libc
From: Daniel Borkmann @ 2016-04-08 13:14 UTC (permalink / raw)
To: Gustavo Zacarias, netdev
In-Reply-To: <1460120373-32096-1-git-send-email-gustavo@zacarias.com.ar>
On 04/08/2016 02:59 PM, Gustavo Zacarias wrote:
> We need limits.h for PATH_MAX, fixes:
>
> tc_bpf.c: In function ‘bpf_map_selfcheck_pinned’:
> tc_bpf.c:222:12: error: ‘PATH_MAX’ undeclared (first use in this
> function)
> char file[PATH_MAX], buff[4096];
>
> Signed-off-by: Gustavo Zacarias <gustavo@zacarias.com.ar>
Fine with me, thanks!
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
^ permalink raw reply
* [PATCH net-next] sock: tigthen lockdep checks for sock_owned_by_user
From: Hannes Frederic Sowa @ 2016-04-08 13:11 UTC (permalink / raw)
To: netdev-u79uwXL29TY76Z2rM5mHXA
Cc: linux-cifs-u79uwXL29TY76Z2rM5mHXA,
linux-bluetooth-u79uwXL29TY76Z2rM5mHXA,
linux-nfs-u79uwXL29TY76Z2rM5mHXA
sock_owned_by_user should not be used without socket lock held. It seems
to be a common practice to check .owned before lock reclassification, so
provide a little help to abstract this check away.
Cc: linux-cifs-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: linux-bluetooth-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Signed-off-by: Hannes Frederic Sowa <hannes-tFNcAqjVMyqKXQKiL6tip0B+6BGkLq7r@public.gmane.org>
---
fs/cifs/connect.c | 4 ++--
include/net/sock.h | 44 +++++++++++++++++++++++++++++---------------
net/bluetooth/af_bluetooth.c | 2 +-
net/llc/llc_proc.c | 2 +-
net/sunrpc/svcsock.c | 3 +--
net/sunrpc/xprtsock.c | 3 +--
6 files changed, 35 insertions(+), 23 deletions(-)
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a763cd3d9e7c80..3e37e52639394b 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2918,7 +2918,7 @@ static inline void
cifs_reclassify_socket4(struct socket *sock)
{
struct sock *sk = sock->sk;
- BUG_ON(sock_owned_by_user(sk));
+ BUG_ON(!sock_allow_reclassification(sk));
sock_lock_init_class_and_name(sk, "slock-AF_INET-CIFS",
&cifs_slock_key[0], "sk_lock-AF_INET-CIFS", &cifs_key[0]);
}
@@ -2927,7 +2927,7 @@ static inline void
cifs_reclassify_socket6(struct socket *sock)
{
struct sock *sk = sock->sk;
- BUG_ON(sock_owned_by_user(sk));
+ BUG_ON(!sock_allow_reclassification(sk));
sock_lock_init_class_and_name(sk, "slock-AF_INET6-CIFS",
&cifs_slock_key[1], "sk_lock-AF_INET6-CIFS", &cifs_key[1]);
}
diff --git a/include/net/sock.h b/include/net/sock.h
index 81d6fecec0a2c0..baba58770ac593 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1316,21 +1316,6 @@ static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
__kfree_skb(skb);
}
-/* Used by processes to "lock" a socket state, so that
- * interrupts and bottom half handlers won't change it
- * from under us. It essentially blocks any incoming
- * packets, so that we won't get any new data or any
- * packets that change the state of the socket.
- *
- * While locked, BH processing will add new packets to
- * the backlog queue. This queue is processed by the
- * owner of the socket lock right before it is released.
- *
- * Since ~2.3.5 it is also exclusive sleep lock serializing
- * accesses from user process context.
- */
-#define sock_owned_by_user(sk) ((sk)->sk_lock.owned)
-
static inline void sock_release_ownership(struct sock *sk)
{
if (sk->sk_lock.owned) {
@@ -1403,6 +1388,35 @@ static inline void unlock_sock_fast(struct sock *sk, bool slow)
spin_unlock_bh(&sk->sk_lock.slock);
}
+/* Used by processes to "lock" a socket state, so that
+ * interrupts and bottom half handlers won't change it
+ * from under us. It essentially blocks any incoming
+ * packets, so that we won't get any new data or any
+ * packets that change the state of the socket.
+ *
+ * While locked, BH processing will add new packets to
+ * the backlog queue. This queue is processed by the
+ * owner of the socket lock right before it is released.
+ *
+ * Since ~2.3.5 it is also exclusive sleep lock serializing
+ * accesses from user process context.
+ */
+
+static inline bool sock_owned_by_user(const struct sock *sk)
+{
+#ifdef CONFIG_LOCKDEP
+ WARN_ON(!lockdep_sock_is_held(sk));
+#endif
+ return sk->sk_lock.owned;
+}
+
+/* no reclassification while locks are held */
+static inline bool sock_allow_reclassification(const struct sock *csk)
+{
+ struct sock *sk = (struct sock *)csk;
+
+ return !sk->sk_lock.owned && !spin_is_locked(&sk->sk_lock.slock);
+}
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
struct proto *prot, int kern);
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 955eda93e66f32..3df7aefb766335 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -65,7 +65,7 @@ static const char *const bt_slock_key_strings[BT_MAX_PROTO] = {
void bt_sock_reclassify_lock(struct sock *sk, int proto)
{
BUG_ON(!sk);
- BUG_ON(sock_owned_by_user(sk));
+ BUG_ON(!sock_allow_reclassification(sk));
sock_lock_init_class_and_name(sk,
bt_slock_key_strings[proto], &bt_slock_key[proto],
diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c
index 1a3c7e0f5d0de3..29c509c54bb22d 100644
--- a/net/llc/llc_proc.c
+++ b/net/llc/llc_proc.c
@@ -195,7 +195,7 @@ static int llc_seq_core_show(struct seq_file *seq, void *v)
timer_pending(&llc->pf_cycle_timer.timer),
timer_pending(&llc->rej_sent_timer.timer),
timer_pending(&llc->busy_state_timer.timer),
- !!sk->sk_backlog.tail, !!sock_owned_by_user(sk));
+ !!sk->sk_backlog.tail, !!sk->sk_lock.owned);
out:
return 0;
}
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 1413cdcc131c4a..bfe0a06530f6aa 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -85,8 +85,7 @@ static void svc_reclassify_socket(struct socket *sock)
{
struct sock *sk = sock->sk;
- WARN_ON_ONCE(sock_owned_by_user(sk));
- if (sock_owned_by_user(sk))
+ if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
return;
switch (sk->sk_family) {
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 65e759569e4873..b7d6b76dede095 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1881,8 +1881,7 @@ static inline void xs_reclassify_socket6(struct socket *sock)
static inline void xs_reclassify_socket(int family, struct socket *sock)
{
- WARN_ON_ONCE(sock_owned_by_user(sock->sk));
- if (sock_owned_by_user(sock->sk))
+ if (WARN_ON_ONCE(!sock_allow_reclassification(sock->sk)))
return;
switch (family) {
--
2.5.5
^ permalink raw reply related
* [PATCH] iproute2: tc_bpf.c: fix building with musl libc
From: Gustavo Zacarias @ 2016-04-08 12:59 UTC (permalink / raw)
To: netdev; +Cc: Gustavo Zacarias
We need limits.h for PATH_MAX, fixes:
tc_bpf.c: In function ‘bpf_map_selfcheck_pinned’:
tc_bpf.c:222:12: error: ‘PATH_MAX’ undeclared (first use in this
function)
char file[PATH_MAX], buff[4096];
Signed-off-by: Gustavo Zacarias <gustavo@zacarias.com.ar>
---
tc/tc_bpf.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/tc/tc_bpf.c b/tc/tc_bpf.c
index d94af82..042e76f 100644
--- a/tc/tc_bpf.c
+++ b/tc/tc_bpf.c
@@ -20,6 +20,7 @@
#include <errno.h>
#include <fcntl.h>
#include <stdarg.h>
+#include <limits.h>
#ifdef HAVE_ELF
#include <libelf.h>
--
2.7.3
^ permalink raw reply related
* Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter
From: Jesper Dangaard Brouer @ 2016-04-08 12:33 UTC (permalink / raw)
To: Brenden Blanco
Cc: davem, netdev, tom, alexei.starovoitov, ogerlitz, daniel,
eric.dumazet, ecree, john.fastabend, tgraf, johannes,
eranlinuxmellanox, lorenzo, brouer, linux-mm
In-Reply-To: <20160408123614.2a15a346@redhat.com>
On Fri, 8 Apr 2016 12:36:14 +0200 Jesper Dangaard Brouer <brouer@redhat.com> wrote:
> > +/* user return codes for PHYS_DEV prog type */
> > +enum bpf_phys_dev_action {
> > + BPF_PHYS_DEV_DROP,
> > + BPF_PHYS_DEV_OK,
> > +};
>
> I can imagine these extra return codes:
>
> BPF_PHYS_DEV_MODIFIED, /* Packet page/payload modified */
> BPF_PHYS_DEV_STOLEN, /* E.g. forward use-case */
> BPF_PHYS_DEV_SHARED, /* Queue for async processing, e.g. tcpdump use-case */
>
> The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
> which we can look at when we get that far...
I want to point out something which is quite FUNDAMENTAL, for
understanding these return codes (and network stack).
At driver RX time, the network stack basically have two ways of
building an SKB, which is send up the stack.
Option-A (fastest): The packet page is writable. The SKB can be
allocated and skb->data/head can point directly to the page. And
we place/write skb_shared_info in the end/tail-room. (This is done by
calling build_skb()).
Option-B (slower): The packet page is read-only. The SKB cannot point
skb->data/head directly to the page, because skb_shared_info need to be
written into skb->end (slightly hidden via skb_shinfo() casting). To
get around this, a separate piece of memory is allocated (speedup by
__alloc_page_frag) for pointing skb->data/head, so skb_shared_info can
be written. (This is done when calling netdev/napi_alloc_skb()).
Drivers then need to copy over packet headers, and assign + adjust
skb_shinfo(skb)->frags[0] offset to skip copied headers.
Unfortunately most drivers use option-B. Due to cost of calling the
page allocator. It is only slightly most expensive to get a larger
compound page from the page allocator, which then can be partitioned into
page-fragments, thus amortizing the page alloc cost. Unfortunately the
cost is added later, when constructing the SKB.
Another reason for option-B, is that archs with expensive IOMMU
requirements (like PowerPC), don't need to dma_unmap on every packet,
but only on the compound page level.
Side-note: Most drivers have a "copy-break" optimization. Especially
for option-B, when copying header data anyhow. For small packet, one
might as well free (or recycle) the RX page, if header size fits into
the newly allocated memory (for skb_shared_info).
For the early filter drop (DDoS use-case), it does not matter that the
packet-page is read-only.
BUT for the future XDP (eXpress Data Path) use-case it does matter. If
we ever want to see speeds comparable to DPDK, then drivers to
need to implement option-A, as this allow forwarding at the packet-page
level.
I hope, my future page-pool facility can remove/hide the cost calling
the page allocator.
Back to the return codes, thus:
-------------------------------
BPF_PHYS_DEV_SHARED requires driver use option-B, when constructing
the SKB, and treat packet data as read-only.
BPF_PHYS_DEV_MODIFIED requires driver to provide a writable packet-page.
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
Author of http://www.iptv-analyzer.org
LinkedIn: http://www.linkedin.com/in/brouer
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply
* Re: [PATCH] ipv6: rework the lock in addrconf_permanent_addr
From: Sergei Shtylyov @ 2016-04-08 11:50 UTC (permalink / raw)
To: roy.qing.li, netdev
In-Reply-To: <1460107359-8937-1-git-send-email-roy.qing.li@gmail.com>
Hello.
On 4/8/2016 12:22 PM, roy.qing.li@gmail.com wrote:
> From: Li RongQing <roy.qing.li@gmail.com>
>
> 1. nothing of idev is changed, so read lock is enough
> 2. ifp is changed, so used ifp->lock or cmpxchg to protect it
>
> Signed-off-by: Li RongQing <roy.qing.li@gmail.com>
> ---
> net/ipv6/addrconf.c | 26 ++++++++++++++++++++------
> 1 file changed, 20 insertions(+), 6 deletions(-)
>
> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
> index 27aed1a..f6e7605b 100644
> --- a/net/ipv6/addrconf.c
> +++ b/net/ipv6/addrconf.c
[...]
> @@ -3212,7 +3221,12 @@ static int fixup_permanent_addr(struct inet6_dev *idev,
> if (unlikely(IS_ERR(rt)))
> return PTR_ERR(rt);
>
> - ifp->rt = rt;
> + prev = cmpxchg(&ifp->rt, NULL, rt);
> +
> + /*if cmpxchg failed*/
Please add spaces after /* and before */
> + if (prev) {
> + ip6_rt_put(rt);
> + }
{} not needed here, please remove.
[...]
MBR, Sergei
^ permalink raw reply
* Re: [RFC PATCH v2 4/5] mlx4: add support for fast rx drop bpf program
From: Jesper Dangaard Brouer @ 2016-04-08 11:41 UTC (permalink / raw)
To: Brenden Blanco
Cc: davem, netdev, tom, alexei.starovoitov, ogerlitz, daniel,
eric.dumazet, ecree, john.fastabend, tgraf, johannes,
eranlinuxmellanox, lorenzo, brouer
In-Reply-To: <1460090930-11219-4-git-send-email-bblanco@plumgrid.com>
On Thu, 7 Apr 2016 21:48:49 -0700 Brenden Blanco <bblanco@plumgrid.com> wrote:
> +int mlx4_call_bpf(struct bpf_prog *prog, void *data, unsigned int length)
> +{
> + struct sk_buff *skb = this_cpu_ptr(&percpu_bpf_phys_dev_md);
> + int ret;
> +
> + build_bpf_phys_dev_md(skb, data, length);
> +
> + rcu_read_lock();
> + ret = BPF_PROG_RUN(prog, (void *)skb);
> + rcu_read_unlock();
> +
> + return ret;
> +}
[...]
> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> index 86bcfe5..287da02 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> @@ -840,6 +843,23 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
> l2_tunnel = (dev->hw_enc_features & NETIF_F_RXCSUM) &&
> (cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_L2_TUNNEL));
>
> + /* A bpf program gets first chance to drop the packet. It may
> + * read bytes but not past the end of the frag.
> + */
> + if (prog) {
> + struct ethhdr *ethh;
> + dma_addr_t dma;
> +
> + dma = be64_to_cpu(rx_desc->data[0].addr);
> + dma_sync_single_for_cpu(priv->ddev, dma, sizeof(*ethh),
> + DMA_FROM_DEVICE);
> + ethh = page_address(frags[0].page) +
> + frags[0].page_offset;
> + if (mlx4_call_bpf(prog, ethh, frags[0].page_size) ==
> + BPF_PHYS_DEV_DROP)
> + goto next;
> + }
> +
Do the program need to know if the packet-page we handover is
considered 'read-only' at the call site? Or do we rely on my idea
requiring the registration to "know" this...
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
Author of http://www.iptv-analyzer.org
LinkedIn: http://www.linkedin.com/in/brouer
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox