Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 6/7] qlcnic: fix bug in LRO descriptor access macro
From: Sony Chacko @ 2012-12-04 13:33 UTC (permalink / raw)
  To: davem; +Cc: netdev, Dept_NX_Linux_NIC_Driver, Sony Chacko
In-Reply-To: <1354628038-2234-1-git-send-email-sony.chacko@qlogic.com>

From: Sony Chacko <sony.chacko@qlogic.com>

Signed-off-by: Sony Chacko <sony.chacko@qlogic.com>
---
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c
index 880a9ca..6f82812 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c
@@ -66,7 +66,7 @@
 	(((sts_data) >> 58) & 0x03F)
 
 #define qlcnic_get_lro_sts_refhandle(sts_data) 	\
-	((sts_data) & 0x0FFFF)
+	((sts_data) & 0x07FFF)
 #define qlcnic_get_lro_sts_length(sts_data)	\
 	(((sts_data) >> 16) & 0x0FFFF)
 #define qlcnic_get_lro_sts_l2_hdr_offset(sts_data)	\
-- 
1.7.1

^ permalink raw reply related

* [PATCH 5/7] qlcnic: update NIC partition interface routines
From: Sony Chacko @ 2012-12-04 13:33 UTC (permalink / raw)
  To: davem; +Cc: netdev, Dept_NX_Linux_NIC_Driver, Sony Chacko
In-Reply-To: <1354628038-2234-1-git-send-email-sony.chacko@qlogic.com>

From: Sony Chacko <sony.chacko@qlogic.com>

Refactor 82xx driver to support new adapter
Update routines to support variable number of NIC partitions

Signed-off-by: Rajesh Borundia <rajesh.borundia@qlogic.com>
Signed-off-by: Sony Chacko <sony.chacko@qlogic.com>
---
 drivers/net/ethernet/qlogic/qlcnic/qlcnic.h      |    1 +
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c  |   39 +++++++--
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c |   97 +++++++++++++---------
 3 files changed, 89 insertions(+), 48 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h
index 4d85c70..5379024 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h
@@ -1051,6 +1051,7 @@ struct qlcnic_npar_info {
 	u8	mac_anti_spoof;
 	u8	promisc_mode;
 	u8	offload_flags;
+	u8      pci_func;
 };
 
 struct qlcnic_eswitch {
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c
index c5cfbaa..58f094c 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c
@@ -7,6 +7,18 @@
 
 #include "qlcnic.h"
 
+static int qlcnic_is_valid_nic_func(struct qlcnic_adapter *adapter, u8 pci_func)
+{
+	int i;
+
+	for (i = 0; i < adapter->ahw->act_pci_func; i++) {
+		if (adapter->npars[i].pci_func == pci_func)
+			return i;
+	}
+
+	return -1;
+}
+
 static u32
 qlcnic_poll_rsp(struct qlcnic_adapter *adapter)
 {
@@ -817,11 +829,14 @@ int qlcnic_get_pci_info(struct qlcnic_adapter *adapter,
 	qlcnic_issue_cmd(adapter, &cmd);
 	err = cmd.rsp.cmd;
 
+	adapter->ahw->act_pci_func = 0;
 	if (err == QLCNIC_RCODE_SUCCESS) {
 		for (i = 0; i < QLCNIC_MAX_PCI_FUNC; i++, npar++, pci_info++) {
 			pci_info->id = le16_to_cpu(npar->id);
 			pci_info->active = le16_to_cpu(npar->active);
 			pci_info->type = le16_to_cpu(npar->type);
+			if (pci_info->type == QLCNIC_TYPE_NIC)
+				adapter->ahw->act_pci_func++;
 			pci_info->default_port =
 				le16_to_cpu(npar->default_port);
 			pci_info->tx_min_bw =
@@ -1016,12 +1031,13 @@ int qlcnic_get_eswitch_stats(struct qlcnic_adapter *adapter, const u8 eswitch,
 	esw_stats->numbytes = QLCNIC_STATS_NOT_AVAIL;
 	esw_stats->context_id = eswitch;
 
-	for (i = 0; i < QLCNIC_MAX_PCI_FUNC; i++) {
+	for (i = 0; i < adapter->ahw->act_pci_func; i++) {
 		if (adapter->npars[i].phy_port != eswitch)
 			continue;
 
 		memset(&port_stats, 0, sizeof(struct __qlcnic_esw_statistics));
-		if (qlcnic_get_port_stats(adapter, i, rx_tx, &port_stats))
+		if (qlcnic_get_port_stats(adapter, adapter->npars[i].pci_func,
+					  rx_tx, &port_stats))
 			continue;
 
 		esw_stats->size = port_stats.size;
@@ -1120,7 +1136,7 @@ op_type = 1 for port vlan_id
 int qlcnic_config_switch_port(struct qlcnic_adapter *adapter,
 		struct qlcnic_esw_func_cfg *esw_cfg)
 {
-	int err = -EIO;
+	int err = -EIO, index;
 	u32 arg1, arg2 = 0;
 	struct qlcnic_cmd_args cmd;
 	u8 pci_func;
@@ -1128,7 +1144,10 @@ int qlcnic_config_switch_port(struct qlcnic_adapter *adapter,
 	if (adapter->ahw->op_mode != QLCNIC_MGMT_FUNC)
 		return err;
 	pci_func = esw_cfg->pci_func;
-	arg1 = (adapter->npars[pci_func].phy_port & BIT_0);
+	index = qlcnic_is_valid_nic_func(adapter, pci_func);
+	if (index < 0)
+		return err;
+	arg1 = (adapter->npars[index].phy_port & BIT_0);
 	arg1 |= (pci_func << 8);
 
 	if (__qlcnic_get_eswitch_port_config(adapter, &arg1, &arg2))
@@ -1192,11 +1211,17 @@ qlcnic_get_eswitch_port_config(struct qlcnic_adapter *adapter,
 			struct qlcnic_esw_func_cfg *esw_cfg)
 {
 	u32 arg1, arg2;
+	int index;
 	u8 phy_port;
-	if (adapter->ahw->op_mode == QLCNIC_MGMT_FUNC)
-		phy_port = adapter->npars[esw_cfg->pci_func].phy_port;
-	else
+
+	if (adapter->ahw->op_mode == QLCNIC_MGMT_FUNC) {
+		index = qlcnic_is_valid_nic_func(adapter, esw_cfg->pci_func);
+		if (index < 0)
+			return -EIO;
+		phy_port = adapter->npars[index].phy_port;
+	} else {
 		phy_port = adapter->ahw->physical_port;
+	}
 	arg1 = phy_port;
 	arg1 |= (esw_cfg->pci_func << 8);
 	if (__qlcnic_get_eswitch_port_config(adapter, &arg1, &arg2))
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
index bfcd004..d94d3e9 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
@@ -91,6 +91,9 @@ static void qlcnic_set_netdev_features(struct qlcnic_adapter *,
 static int qlcnic_vlan_rx_add(struct net_device *, u16);
 static int qlcnic_vlan_rx_del(struct net_device *, u16);
 
+#define QLCNIC_IS_TSO_CAPABLE(adapter)	\
+	((adapter)->ahw->capabilities & QLCNIC_FW_CAPABILITY_TSO)
+
 /*  PCI Device ID Table  */
 #define ENTRY(device) \
 	{PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, (device)), \
@@ -369,19 +372,25 @@ qlcnic_cleanup_pci_map(struct qlcnic_adapter *adapter)
 		iounmap(adapter->ahw->pci_base0);
 }
 
-static int
-qlcnic_init_pci_info(struct qlcnic_adapter *adapter)
+static int qlcnic_init_pci_info(struct qlcnic_adapter *adapter)
 {
 	struct qlcnic_pci_info *pci_info;
-	int i, ret = 0;
+	int i, ret = 0, j = 0;
+	u16 act_pci_func;
 	u8 pfn;
 
 	pci_info = kcalloc(QLCNIC_MAX_PCI_FUNC, sizeof(*pci_info), GFP_KERNEL);
 	if (!pci_info)
 		return -ENOMEM;
 
+	ret = qlcnic_get_pci_info(adapter, pci_info);
+	if (ret)
+		goto err_pci_info;
+
+	act_pci_func = adapter->ahw->act_pci_func;
+
 	adapter->npars = kzalloc(sizeof(struct qlcnic_npar_info) *
-				QLCNIC_MAX_PCI_FUNC, GFP_KERNEL);
+				 act_pci_func, GFP_KERNEL);
 	if (!adapter->npars) {
 		ret = -ENOMEM;
 		goto err_pci_info;
@@ -394,21 +403,25 @@ qlcnic_init_pci_info(struct qlcnic_adapter *adapter)
 		goto err_npars;
 	}
 
-	ret = qlcnic_get_pci_info(adapter, pci_info);
-	if (ret)
-		goto err_eswitch;
-
 	for (i = 0; i < QLCNIC_MAX_PCI_FUNC; i++) {
 		pfn = pci_info[i].id;
+
 		if (pfn >= QLCNIC_MAX_PCI_FUNC) {
 			ret = QL_STATUS_INVALID_PARAM;
 			goto err_eswitch;
 		}
-		adapter->npars[pfn].active = (u8)pci_info[i].active;
-		adapter->npars[pfn].type = (u8)pci_info[i].type;
-		adapter->npars[pfn].phy_port = (u8)pci_info[i].default_port;
-		adapter->npars[pfn].min_bw = pci_info[i].tx_min_bw;
-		adapter->npars[pfn].max_bw = pci_info[i].tx_max_bw;
+
+		if (!pci_info[i].active ||
+		    (pci_info[i].type != QLCNIC_TYPE_NIC))
+			continue;
+
+		adapter->npars[j].pci_func = pfn;
+		adapter->npars[j].active = (u8)pci_info[i].active;
+		adapter->npars[j].type = (u8)pci_info[i].type;
+		adapter->npars[j].phy_port = (u8)pci_info[i].default_port;
+		adapter->npars[j].min_bw = pci_info[i].tx_min_bw;
+		adapter->npars[j].max_bw = pci_info[i].tx_max_bw;
+		j++;
 	}
 
 	for (i = 0; i < QLCNIC_NIU_MAX_XG_PORTS; i++)
@@ -436,7 +449,7 @@ qlcnic_set_function_modes(struct qlcnic_adapter *adapter)
 	u32 ref_count;
 	int i, ret = 1;
 	u32 data = QLCNIC_MGMT_FUNC;
-	void __iomem *priv_op = adapter->ahw->pci_base0 + QLCNIC_DRV_OP_MODE;
+	struct qlcnic_hardware_context *ahw = adapter->ahw;
 
 	/* If other drivers are not in use set their privilege level */
 	ref_count = QLCRD32(adapter, QLCNIC_CRB_DRV_ACTIVE);
@@ -445,21 +458,20 @@ qlcnic_set_function_modes(struct qlcnic_adapter *adapter)
 		goto err_lock;
 
 	if (qlcnic_config_npars) {
-		for (i = 0; i < QLCNIC_MAX_PCI_FUNC; i++) {
-			id = i;
-			if (adapter->npars[i].type != QLCNIC_TYPE_NIC ||
-				id == adapter->ahw->pci_func)
+		for (i = 0; i < ahw->act_pci_func; i++) {
+			id = adapter->npars[i].pci_func;
+			if (id == ahw->pci_func)
 				continue;
 			data |= (qlcnic_config_npars &
 					QLC_DEV_SET_DRV(0xf, id));
 		}
 	} else {
-		data = readl(priv_op);
-		data = (data & ~QLC_DEV_SET_DRV(0xf, adapter->ahw->pci_func)) |
+		data = QLCRD32(adapter, QLCNIC_DRV_OP_MODE);
+		data = (data & ~QLC_DEV_SET_DRV(0xf, ahw->pci_func)) |
 			(QLC_DEV_SET_DRV(QLCNIC_MGMT_FUNC,
-			adapter->ahw->pci_func));
+					 ahw->pci_func));
 	}
-	writel(data, priv_op);
+	QLCWR32(adapter, QLCNIC_DRV_OP_MODE, data);
 	qlcnic_api_unlock(adapter);
 err_lock:
 	return ret;
@@ -632,6 +644,7 @@ qlcnic_initialize_nic(struct qlcnic_adapter *adapter)
 	int err;
 	struct qlcnic_info nic_info;
 
+	memset(&nic_info, 0, sizeof(struct qlcnic_info));
 	err = qlcnic_get_nic_info(adapter, &nic_info, adapter->ahw->pci_func);
 	if (err)
 		return err;
@@ -798,8 +811,7 @@ qlcnic_check_eswitch_mode(struct qlcnic_adapter *adapter)
 	return err;
 }
 
-static int
-qlcnic_set_default_offload_settings(struct qlcnic_adapter *adapter)
+static int qlcnic_set_default_offload_settings(struct qlcnic_adapter *adapter)
 {
 	struct qlcnic_esw_func_cfg esw_cfg;
 	struct qlcnic_npar_info *npar;
@@ -808,16 +820,16 @@ qlcnic_set_default_offload_settings(struct qlcnic_adapter *adapter)
 	if (adapter->need_fw_reset)
 		return 0;
 
-	for (i = 0; i < QLCNIC_MAX_PCI_FUNC; i++) {
-		if (adapter->npars[i].type != QLCNIC_TYPE_NIC)
-			continue;
+	for (i = 0; i < adapter->ahw->act_pci_func; i++) {
 		memset(&esw_cfg, 0, sizeof(struct qlcnic_esw_func_cfg));
-		esw_cfg.pci_func = i;
-		esw_cfg.offload_flags = BIT_0;
+		esw_cfg.pci_func = adapter->npars[i].pci_func;
 		esw_cfg.mac_override = BIT_0;
 		esw_cfg.promisc_mode = BIT_0;
-		if (adapter->ahw->capabilities  & QLCNIC_FW_CAPABILITY_TSO)
-			esw_cfg.offload_flags |= (BIT_1 | BIT_2);
+		if (qlcnic_82xx_check(adapter)) {
+			esw_cfg.offload_flags = BIT_0;
+			if (QLCNIC_IS_TSO_CAPABLE(adapter))
+				esw_cfg.offload_flags |= (BIT_1 | BIT_2);
+		}
 		if (qlcnic_config_switch_port(adapter, &esw_cfg))
 			return -EIO;
 		npar = &adapter->npars[i];
@@ -855,22 +867,24 @@ qlcnic_reset_eswitch_config(struct qlcnic_adapter *adapter,
 	return 0;
 }
 
-static int
-qlcnic_reset_npar_config(struct qlcnic_adapter *adapter)
+static int qlcnic_reset_npar_config(struct qlcnic_adapter *adapter)
 {
 	int i, err;
 	struct qlcnic_npar_info *npar;
 	struct qlcnic_info nic_info;
+	u8 pci_func;
 
-	if (!adapter->need_fw_reset)
-		return 0;
+	if (qlcnic_82xx_check(adapter))
+		if (!adapter->need_fw_reset)
+			return 0;
 
 	/* Set the NPAR config data after FW reset */
-	for (i = 0; i < QLCNIC_MAX_PCI_FUNC; i++) {
+	for (i = 0; i < adapter->ahw->act_pci_func; i++) {
 		npar = &adapter->npars[i];
-		if (npar->type != QLCNIC_TYPE_NIC)
-			continue;
-		err = qlcnic_get_nic_info(adapter, &nic_info, i);
+		pci_func = npar->pci_func;
+		memset(&nic_info, 0, sizeof(struct qlcnic_info));
+		err = qlcnic_get_nic_info(adapter,
+					  &nic_info, pci_func);
 		if (err)
 			return err;
 		nic_info.min_tx_bw = npar->min_bw;
@@ -881,11 +895,12 @@ qlcnic_reset_npar_config(struct qlcnic_adapter *adapter)
 
 		if (npar->enable_pm) {
 			err = qlcnic_config_port_mirroring(adapter,
-							npar->dest_npar, 1, i);
+							   npar->dest_npar, 1,
+							   pci_func);
 			if (err)
 				return err;
 		}
-		err = qlcnic_reset_eswitch_config(adapter, npar, i);
+		err = qlcnic_reset_eswitch_config(adapter, npar, pci_func);
 		if (err)
 			return err;
 	}
-- 
1.7.1

^ permalink raw reply related

* [PATCH 7/7] qlcnic: rename module params with module_param_named
From: Sony Chacko @ 2012-12-04 13:33 UTC (permalink / raw)
  To: davem; +Cc: netdev, Dept_NX_Linux_NIC_Driver, Sony Chacko
In-Reply-To: <1354628038-2234-1-git-send-email-sony.chacko@qlogic.com>

From: Sony Chacko <sony.chacko@qlogic.com>

Add qlcnic prefix to qlcnic driver module parameters.

Signed-off-by: Sony Chacko <sony.chacko@qlogic.com>
---
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c |   32 +++++++++++-----------
 1 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
index d94d3e9..8b566c2 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
@@ -34,21 +34,21 @@ static int qlcnic_mac_learn;
 module_param(qlcnic_mac_learn, int, 0444);
 MODULE_PARM_DESC(qlcnic_mac_learn, "Mac Filter (0=disabled, 1=enabled)");
 
-static int use_msi = 1;
-module_param(use_msi, int, 0444);
+static int qlcnic_use_msi = 1;
 MODULE_PARM_DESC(use_msi, "MSI interrupt (0=disabled, 1=enabled");
+module_param_named(use_msi, qlcnic_use_msi, int, 0444);
 
-static int use_msi_x = 1;
-module_param(use_msi_x, int, 0444);
+static int qlcnic_use_msi_x = 1;
 MODULE_PARM_DESC(use_msi_x, "MSI-X interrupt (0=disabled, 1=enabled");
+module_param_named(use_msi_x, qlcnic_use_msi_x, int, 0444);
 
-static int auto_fw_reset = 1;
-module_param(auto_fw_reset, int, 0644);
+static int qlcnic_auto_fw_reset = 1;
 MODULE_PARM_DESC(auto_fw_reset, "Auto firmware reset (0=disabled, 1=enabled");
+module_param_named(auto_fw_reset, qlcnic_auto_fw_reset, int, 0644);
 
-static int load_fw_file;
-module_param(load_fw_file, int, 0444);
+static int qlcnic_load_fw_file;
 MODULE_PARM_DESC(load_fw_file, "Load firmware from (0=flash, 1=file");
+module_param_named(load_fw_file, qlcnic_load_fw_file, int, 0444);
 
 static int qlcnic_config_npars;
 module_param(qlcnic_config_npars, int, 0444);
@@ -317,7 +317,7 @@ static void qlcnic_enable_msi_legacy(struct qlcnic_adapter *adapter)
 	struct qlcnic_hardware_context *ahw = adapter->ahw;
 	struct pci_dev *pdev = adapter->pdev;
 
-	if (use_msi && !pci_enable_msi(pdev)) {
+	if (qlcnic_use_msi && !pci_enable_msi(pdev)) {
 		adapter->flags |= QLCNIC_MSI_ENABLED;
 		offset = msi_tgt_status[adapter->ahw->pci_func];
 		adapter->tgt_status_reg = qlcnic_get_ioaddr(adapter->ahw,
@@ -631,7 +631,7 @@ qlcnic_check_options(struct qlcnic_adapter *adapter)
 		adapter->max_rxd = MAX_RCV_DESCRIPTORS_1G;
 	}
 
-	adapter->ahw->msix_supported = !!use_msi_x;
+	adapter->ahw->msix_supported = !!qlcnic_use_msi_x;
 
 	adapter->num_txd = MAX_CMD_DESCRIPTORS;
 
@@ -961,7 +961,7 @@ qlcnic_start_firmware(struct qlcnic_adapter *adapter)
 	else if (!err)
 		goto check_fw_status;
 
-	if (load_fw_file)
+	if (qlcnic_load_fw_file)
 		qlcnic_request_firmware(adapter);
 	else {
 		err = qlcnic_check_flash_fw_ver(adapter);
@@ -2558,7 +2558,7 @@ qlcnic_check_health(struct qlcnic_adapter *adapter)
 		if (adapter->need_fw_reset)
 			goto detach;
 
-		if (adapter->ahw->reset_context && auto_fw_reset) {
+		if (adapter->ahw->reset_context && qlcnic_auto_fw_reset) {
 			qlcnic_reset_hw_context(adapter);
 			adapter->netdev->trans_start = jiffies;
 		}
@@ -2573,7 +2573,7 @@ qlcnic_check_health(struct qlcnic_adapter *adapter)
 
 	qlcnic_dev_request_reset(adapter);
 
-	if (auto_fw_reset)
+	if (qlcnic_auto_fw_reset)
 		clear_bit(__QLCNIC_FW_ATTACHED, &adapter->state);
 
 	dev_err(&adapter->pdev->dev, "firmware hang detected\n");
@@ -2598,8 +2598,8 @@ detach:
 	adapter->dev_state = (state == QLCNIC_DEV_NEED_QUISCENT) ? state :
 		QLCNIC_DEV_NEED_RESET;
 
-	if (auto_fw_reset &&
-		!test_and_set_bit(__QLCNIC_RESETTING, &adapter->state)) {
+	if (qlcnic_auto_fw_reset && !test_and_set_bit(__QLCNIC_RESETTING,
+						      &adapter->state)) {
 
 		qlcnic_schedule_work(adapter, qlcnic_detach_work, 0);
 		QLCDB(adapter, DRV, "fw recovery scheduled.\n");
@@ -2784,7 +2784,7 @@ qlcnicvf_start_firmware(struct qlcnic_adapter *adapter)
 
 int qlcnic_validate_max_rss(struct net_device *netdev, u8 max_hw, u8 val)
 {
-	if (!use_msi_x && !use_msi) {
+	if (!qlcnic_use_msi_x && !qlcnic_use_msi) {
 		netdev_info(netdev, "no msix or msi support, hence no rss\n");
 		return -EINVAL;
 	}
-- 
1.7.1

^ permalink raw reply related

* RE: [net-next PATCH V3-evictor] net: frag evictor,avoid killing warm frag queues
From: David Laight @ 2012-12-04 14:32 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, Eric Dumazet, David S. Miller,
	Florian Westphal
  Cc: netdev, Thomas Graf, Paul E. McKenney, Cong Wang, Herbert Xu
In-Reply-To: <20121204133007.20215.52566.stgit@dragon>

> The fragmentation evictor system has a very unfortunate eviction
> system for killing fragments when the system is put under pressure.
> 
> If packets are coming in too fast, the evictor code kills "warm"
> fragments too quickly, resulting in close to zero throughput, as
> fragments are killed before they have a chance to complete.
> 
> This is related to the bad interaction with the LRU (Least Recently
> Used) list.  Under load the LRU list sort-of changes meaning/behavior.
> When the LRU head is very new/warm, then the head is most likely the
> one with most fragments and the tail (latest used or added element)
> with least.
> 
> Solved by introducing a creation "jiffy" timestamp (creation_ts).
> If eviction of an element is attempted in the same jiffy, then perform
> tail drop on the LRU list instead.

I'm not at all sure a 'same tick' test is actually sensible.

Some quick 'ball park' maths:
10Mbps is about 1kB per jiffy (1kHz tick).
So at 10Mbps you'll see a frame every 1.5 jiffies.
So if the source of any fragment stream is some remote ADSL
connection (rather than a local Ge LAN connection) you'll
never manage to assemble the entire datagram.
Anyone connecting from dialup will fail miserably.

This means you need to keep some initial fragments for much
longer than 1 jiffy before discarding them.

Anything with more than one in-sequence fragment is a good
candidate for keeping (especially if the inter-fragment time
is being monitored).

Anything with out of sequence fragments is a very good choice
for discard.

But yes, discarding some 'new' fragments is very likely necessary
in order to manage to actually receive anything.

Sometimes I've wondered whether discarding some transmits would
help when the rx side is congested (and vice versa). I've never tried
modelling that one with any traffic flows! But forcing the remote
to time out might reduce the overall traffic; the trouble is that the
discards would have to be carefully chosen!
(I was thinking about this mainly with regard to routers that
are having issues feeding outbound data into a narrow pipe.)

	David

^ permalink raw reply

* Re: [PATCH net-next 2/3] virtio_net: multiqueue support
From: Jason Wang @ 2012-12-04 14:45 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: krkumar2, kvm, netdev, linux-kernel, virtualization, bhutchings,
	jwhan, davem, shiyer
In-Reply-To: <20121204132422.GD7499@redhat.com>

On Tuesday, December 04, 2012 03:24:22 PM Michael S. Tsirkin wrote:
> I found some bugs, see below.
> Also some style nitpicking, this is not mandatory to address.

Thanks for the review.
> 
> On Tue, Dec 04, 2012 at 07:07:57PM +0800, Jason Wang wrote:
> > This addes multiqueue support to virtio_net driver. In multiple queue
> > modes, the driver expects the number of queue paris is equal to the
> > number of vcpus. To eliminate the contention bettwen vcpus and
> > virtqueues, per-cpu virtqueue pairs
> > were implemented through:
> Lots of typos above - try running ispell on it :)
> 

Sure.
> > - select the txq based on the smp processor id.
> > - smp affinity hint were set to the vcpu that owns the queue pairs.
> > 
> > Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
> > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > ---
> > 
> >  drivers/net/virtio_net.c        |  472
> >  ++++++++++++++++++++++++++++++--------- include/uapi/linux/virtio_net.h
> >  |   16 ++
> >  2 files changed, 385 insertions(+), 103 deletions(-)
> > 
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 266f712..912f5b2 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -81,16 +81,25 @@ struct virtnet_info {
> > 
> >  	struct virtio_device *vdev;
> >  	struct virtqueue *cvq;
> >  	struct net_device *dev;
> > 
> > -	struct send_queue sq;
> > -	struct receive_queue rq;
> > +	struct send_queue *sq;
> > +	struct receive_queue *rq;
> > 
> >  	unsigned int status;
> > 
> > +	/* Max # of queue pairs supported by the device */
> > +	u16 max_queue_pairs;
> > +
> > +	/* # of queue pairs currently used by the driver */
> > +	u16 curr_queue_pairs;
> > +
> > 
> >  	/* I like... big packets and I cannot lie! */
> >  	bool big_packets;
> >  	
> >  	/* Host will merge rx buffers for big packets (shake it! shake it!) */
> >  	bool mergeable_rx_bufs;
> > 
> > +	/* Has control virtqueue */
> > +	bool has_cvq;
> > +
> > 
> >  	/* enable config space updates */
> >  	bool config_enable;
> > 
> > @@ -125,6 +134,32 @@ struct padded_vnet_hdr {
> > 
> >  	char padding[6];
> >  
> >  };
> > 
> > +static const struct ethtool_ops virtnet_ethtool_ops;
> > +
> > +
> > +/* Converting between virtqueue no. and kernel tx/rx queue no.
> > + * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
> > + */
> > +static int vq2txq(struct virtqueue *vq)
> > +{
> > +	return (virtqueue_get_queue_index(vq) - 1) / 2;
> > +}
> > +
> > +static int txq2vq(int txq)
> > +{
> > +	return txq * 2 + 1;
> > +}
> > +
> > +static int vq2rxq(struct virtqueue *vq)
> > +{
> > +	return virtqueue_get_queue_index(vq) / 2;
> > +}
> > +
> > +static int rxq2vq(int rxq)
> > +{
> > +	return rxq * 2;
> > +}
> > +
> > 
> >  static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb)
> >  {
> >  
> >  	return (struct skb_vnet_hdr *)skb->cb;
> > 
> > @@ -165,7 +200,7 @@ static void skb_xmit_done(struct virtqueue *vq)
> > 
> >  	virtqueue_disable_cb(vq);
> >  	
> >  	/* We were probably waiting for more output buffers. */
> > 
> > -	netif_wake_queue(vi->dev);
> > +	netif_wake_subqueue(vi->dev, vq2txq(vq));
> > 
> >  }
> >  
> >  static void set_skb_frag(struct sk_buff *skb, struct page *page,
> > 
> > @@ -502,7 +537,7 @@ static bool try_fill_recv(struct receive_queue *rq,
> > gfp_t gfp)> 
> >  static void skb_recv_done(struct virtqueue *rvq)
> >  {
> >  
> >  	struct virtnet_info *vi = rvq->vdev->priv;
> > 
> > -	struct receive_queue *rq = &vi->rq;
> > +	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
> > 
> >  	/* Schedule NAPI, Suppress further interrupts if successful. */
> >  	if (napi_schedule_prep(&rq->napi)) {
> > 
> > @@ -532,15 +567,21 @@ static void refill_work(struct work_struct *work)
> > 
> >  	struct virtnet_info *vi =
> >  	
> >  		container_of(work, struct virtnet_info, refill.work);
> >  	
> >  	bool still_empty;
> > 
> > +	int i;
> > +
> > +	for (i = 0; i < vi->max_queue_pairs; i++) {
> > +		struct receive_queue *rq = &vi->rq[i];
> > 
> > -	napi_disable(&vi->rq.napi);
> > -	still_empty = !try_fill_recv(&vi->rq, GFP_KERNEL);
> > -	virtnet_napi_enable(&vi->rq);
> > +		napi_disable(&rq->napi);
> > +		still_empty = !try_fill_recv(rq, GFP_KERNEL);
> > +		virtnet_napi_enable(rq);
> > 
> > -	/* In theory, this can happen: if we don't get any buffers in
> > -	 * we will *never* try to fill again. */
> > -	if (still_empty)
> > -		schedule_delayed_work(&vi->refill, HZ/2);
> > +		/* In theory, this can happen: if we don't get any buffers in
> > +		 * we will *never* try to fill again.
> > +		 */
> > +		if (still_empty)
> > +			schedule_delayed_work(&vi->refill, HZ/2);
> > +	}
> > 
> >  }
> >  
> >  static int virtnet_poll(struct napi_struct *napi, int budget)
> > 
> > @@ -650,7 +691,8 @@ static int xmit_skb(struct send_queue *sq, struct
> > sk_buff *skb)> 
> >  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device
> >  *dev)
> >  {
> >  
> >  	struct virtnet_info *vi = netdev_priv(dev);
> > 
> > -	struct send_queue *sq = &vi->sq;
> > +	int qnum = skb_get_queue_mapping(skb);
> > +	struct send_queue *sq = &vi->sq[qnum];
> > 
> >  	int capacity;
> >  	
> >  	/* Free up any pending old buffers before queueing new ones. */
> > 
> > @@ -664,13 +706,14 @@ static netdev_tx_t start_xmit(struct sk_buff *skb,
> > struct net_device *dev)> 
> >  		if (likely(capacity == -ENOMEM)) {
> >  		
> >  			if (net_ratelimit())
> >  			
> >  				dev_warn(&dev->dev,
> > 
> > -					 "TX queue failure: out of memory\n");
> > +					 "TXQ (%d) failure: out of memory\n",
> > +					 qnum);
> > 
> >  		} else {
> >  		
> >  			dev->stats.tx_fifo_errors++;
> >  			if (net_ratelimit())
> >  			
> >  				dev_warn(&dev->dev,
> > 
> > -					 "Unexpected TX queue failure: %d\n",
> > -					 capacity);
> > +					 "Unexpected TXQ (%d) failure: %d\n",
> > +					 qnum, capacity);
> > 
> >  		}
> >  		dev->stats.tx_dropped++;
> >  		kfree_skb(skb);
> > 
> > @@ -685,12 +728,12 @@ static netdev_tx_t start_xmit(struct sk_buff *skb,
> > struct net_device *dev)> 
> >  	/* Apparently nice girls don't return TX_BUSY; stop the queue
> >  	
> >  	 * before it gets out of hand.  Naturally, this wastes entries. */
> >  	
> >  	if (capacity < 2+MAX_SKB_FRAGS) {
> > 
> > -		netif_stop_queue(dev);
> > +		netif_stop_subqueue(dev, qnum);
> > 
> >  		if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
> >  		
> >  			/* More just got used, free them then recheck. */
> >  			capacity += free_old_xmit_skbs(sq);
> >  			if (capacity >= 2+MAX_SKB_FRAGS) {
> > 
> > -				netif_start_queue(dev);
> > +				netif_start_subqueue(dev, qnum);
> > 
> >  				virtqueue_disable_cb(sq->vq);
> >  			
> >  			}
> >  		
> >  		}
> > 
> > @@ -758,23 +801,13 @@ static struct rtnl_link_stats64
> > *virtnet_stats(struct net_device *dev,> 
> >  static void virtnet_netpoll(struct net_device *dev)
> >  {
> >  
> >  	struct virtnet_info *vi = netdev_priv(dev);
> > 
> > +	int i;
> > 
> > -	napi_schedule(&vi->rq.napi);
> > +	for (i = 0; i < vi->curr_queue_pairs; i++)
> > +		napi_schedule(&vi->rq[i].napi);
> > 
> >  }
> >  #endif
> > 
> > -static int virtnet_open(struct net_device *dev)
> > -{
> > -	struct virtnet_info *vi = netdev_priv(dev);
> > -
> > -	/* Make sure we have some buffers: if oom use wq. */
> > -	if (!try_fill_recv(&vi->rq, GFP_KERNEL))
> > -		schedule_delayed_work(&vi->refill, 0);
> > -
> > -	virtnet_napi_enable(&vi->rq);
> > -	return 0;
> > -}
> > -
> > 
> >  /*
> >  
> >   * Send command via the control virtqueue and check status.  Commands
> >   * supported by the hypervisor, as indicated by feature bits, should
> > 
> > @@ -830,13 +863,51 @@ static void virtnet_ack_link_announce(struct
> > virtnet_info *vi)> 
> >  	rtnl_unlock();
> >  
> >  }
> > 
> > +/* Caller check the support of cvq and multiqueue. */
> > +static int virtnet_set_queues(struct virtnet_info *vi)
> > +{
> > +	struct scatterlist sg;
> > +	struct virtio_net_ctrl_rfs s;
> > +	struct net_device *dev = vi->dev;
> > +
> > +	s.virtqueue_pairs = vi->curr_queue_pairs;
> > +	sg_init_one(&sg, &s, sizeof(s));
> > +
> > +	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RFS,
> > +				  VIRTIO_NET_CTRL_RFS_VQ_PAIRS_SET, &sg, 1, 0)){
> > +		dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
> > +			 vi->curr_queue_pairs);
> > +		return -EINVAL;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static int virtnet_open(struct net_device *dev)
> 
> Why move this here? diff will be smaller if you don't.

Ok, will move it back.
> 
> > +{
> > +	struct virtnet_info *vi = netdev_priv(dev);
> > +	int i;
> > +
> > +	for (i = 0; i < vi->max_queue_pairs; i++) {
> > +		/* Make sure we have some buffers: if oom use wq. */
> > +		if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
> > +			schedule_delayed_work(&vi->refill, 0);
> > +		virtnet_napi_enable(&vi->rq[i]);
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > 
> >  static int virtnet_close(struct net_device *dev)
> >  {
> >  
> >  	struct virtnet_info *vi = netdev_priv(dev);
> > 
> > +	int i;
> > 
> >  	/* Make sure refill_work doesn't re-enable napi! */
> >  	cancel_delayed_work_sync(&vi->refill);
> > 
> > -	napi_disable(&vi->rq.napi);
> > +
> > +	for (i = 0; i < vi->max_queue_pairs; i++)
> > +		napi_disable(&vi->rq[i].napi);
> > 
> >  	return 0;
> >  
> >  }
> > 
> > @@ -948,8 +1019,8 @@ static void virtnet_get_ringparam(struct net_device
> > *dev,> 
> >  {
> >  
> >  	struct virtnet_info *vi = netdev_priv(dev);
> > 
> > -	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq.vq);
> > -	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq.vq);
> > +	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
> > +	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
> > 
> >  	ring->rx_pending = ring->rx_max_pending;
> >  	ring->tx_pending = ring->tx_max_pending;
> >  
> >  }
> > 
> > @@ -967,12 +1038,6 @@ static void virtnet_get_drvinfo(struct net_device
> > *dev,> 
> >  }
> > 
> > -static const struct ethtool_ops virtnet_ethtool_ops = {
> > -	.get_drvinfo = virtnet_get_drvinfo,
> > -	.get_link = ethtool_op_get_link,
> > -	.get_ringparam = virtnet_get_ringparam,
> > -};
> > -
> > 
> >  #define MIN_MTU 68
> >  #define MAX_MTU 65535
> > 
> > @@ -984,6 +1049,21 @@ static int virtnet_change_mtu(struct net_device
> > *dev, int new_mtu)> 
> >  	return 0;
> >  
> >  }
> > 
> > +/* To avoid contending a lock hold by a vcpu who would exit to host,
> > select the + * txq based on the processor id.
> > + * TODO: handle cpu hotplug.
> > + */
> > +static u16 virtnet_select_queue(struct net_device *dev, struct sk_buff
> > *skb) +{
> > +	int txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) :
> > +		  smp_processor_id();
> > +
> > +	while (unlikely(txq >= dev->real_num_tx_queues))
> > +		txq -= dev->real_num_tx_queues;
> > +
> > +	return txq;
> > +}
> > +
> > 
> >  static const struct net_device_ops virtnet_netdev = {
> >  
> >  	.ndo_open            = virtnet_open,
> >  	.ndo_stop   	     = virtnet_close,
> > 
> > @@ -995,6 +1075,7 @@ static const struct net_device_ops virtnet_netdev = {
> > 
> >  	.ndo_get_stats64     = virtnet_stats,
> >  	.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
> >  	.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
> > 
> > +	.ndo_select_queue     = virtnet_select_queue,
> > 
> >  #ifdef CONFIG_NET_POLL_CONTROLLER
> >  
> >  	.ndo_poll_controller = virtnet_netpoll,
> >  
> >  #endif
> > 
> > @@ -1030,10 +1111,10 @@ static void virtnet_config_changed_work(struct
> > work_struct *work)> 
> >  	if (vi->status & VIRTIO_NET_S_LINK_UP) {
> >  	
> >  		netif_carrier_on(vi->dev);
> > 
> > -		netif_wake_queue(vi->dev);
> > +		netif_tx_wake_all_queues(vi->dev);
> > 
> >  	} else {
> >  	
> >  		netif_carrier_off(vi->dev);
> > 
> > -		netif_stop_queue(vi->dev);
> > +		netif_tx_stop_all_queues(vi->dev);
> > 
> >  	}
> >  
> >  done:
> >  	mutex_unlock(&vi->config_lock);
> > 
> > @@ -1046,41 +1127,219 @@ static void virtnet_config_changed(struct
> > virtio_device *vdev)> 
> >  	schedule_work(&vi->config_work);
> >  
> >  }
> > 
> > -static int init_vqs(struct virtnet_info *vi)
> > +static void free_receive_bufs(struct virtnet_info *vi)
> > 
> >  {
> > 
> > -	struct virtqueue *vqs[3];
> > -	vq_callback_t *callbacks[] = { skb_recv_done, skb_xmit_done, NULL};
> > -	const char *names[] = { "input", "output", "control" };
> > -	int nvqs, err;
> > +	int i;
> > 
> > -	/* We expect two virtqueues, receive then send,
> > -	 * and optionally control. */
> > -	nvqs = virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) ? 3 : 2;
> > +	for (i = 0; i < vi->max_queue_pairs; i++) {
> > +		while (vi->rq[i].pages)
> > +			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);
> > +	}
> > +}
> > 
> > -	err = vi->vdev->config->find_vqs(vi->vdev, nvqs, vqs, callbacks, names);
> > -	if (err)
> > -		return err;
> > +/* Free memory allocated for send and receive queues */
> > +static void virtnet_free_queues(struct virtnet_info *vi)
> 
> I think it's cleaner to open-code this, this way
> during error handling you can have:
> 
> 	kfree(vi->rq);
> err_rq:
> 	kfree(vi->sq);
> err_sq:
> 
> without tricks like malloc them both then goto end.

Ok.
> 
> > +{
> > +	kfree(vi->rq);
> > +	vi->rq = NULL;
> > +	kfree(vi->sq);
> > +	vi->sq = NULL;
> 
> I think = NULL is not needed - we never call this twice.

Sure, will remove it.
> 
> > +}
> > +
> > +static void free_unused_bufs(struct virtnet_info *vi)
> > +{
> > +	void *buf;
> > +	int i;
> > +
> > +	for (i = 0; i < vi->max_queue_pairs; i++) {
> > +		struct virtqueue *vq = vi->sq[i].vq;
> > +		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> > +			dev_kfree_skb(buf);
> > +	}
> > +
> > +	for (i = 0; i < vi->max_queue_pairs; i++) {
> > +		struct virtqueue *vq = vi->rq[i].vq;
> > +
> > +		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
> > +			if (vi->mergeable_rx_bufs || vi->big_packets)
> > +				give_pages(&vi->rq[i], buf);
> > +			else
> > +				dev_kfree_skb(buf);
> > +			--vi->rq[i].num;
> > +		}
> > +		BUG_ON(vi->rq[i].num != 0);
> > +	}
> > +}
> > +
> > +static void virtnet_set_affinity(struct virtnet_info *vi, bool set)
> > +{
> > +	int i;
> > +
> > +	/* Don't set the affinity hint when in single queue mode or we have too
> > +	 * much online cpus.
> > +	 */
> 
> Pls remove this comment, or replace with one explaining
> the motivation for this logic.

Will replace with the motivation.
> 
> > +	if (vi->curr_queue_pairs == 1 ||
> > +	    vi->max_queue_pairs > num_online_cpus())
> 
> If we have less it's not a good idea either, is it?
> So check vi->max_queue_pairs != num_online_cpus().

Ok.
> 
> > +		set = false;
> 
> This will overwrite affinity if it was set by userspace.
> Just
> 	if (set)
> 		return;
> will not have this problem.

But we need to handle the situation when switching back to sq from mq mode.
Otherwise we may still get the affinity hint used in mq.  Is this kind of
overwrite unavoidable, or is there some method to detect whether userspace
has written something new?
> 
> > +	for (i = 0; i < vi->max_queue_pairs; i++) {
> > +		int cpu = set ? i : -1;
> > +		virtqueue_set_affinity(vi->rq[i].vq, cpu);
> > +		virtqueue_set_affinity(vi->sq[i].vq, cpu);
> > +	}
> > +}
> > +
> > +static void virtnet_del_vqs(struct virtnet_info *vi)
> 
> It might be a good idea to add this function in previous
> patch.

Yes, good suggestion.
> 
> > +{
> > +	struct virtio_device *vdev = vi->vdev;
> > +
> > +	virtnet_set_affinity(vi, false);
> > +
> > +	vdev->config->del_vqs(vdev);
> > 
> > -	vi->rq.vq = vqs[0];
> > -	vi->sq.vq = vqs[1];
> > +	virtnet_free_queues(vi);
> > +}
> > 
> > -	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) {
> > -		vi->cvq = vqs[2];
> > +static int virtnet_find_vqs(struct virtnet_info *vi)
> > +{
> > +	vq_callback_t **callbacks;
> > +	struct virtqueue **vqs;
> > +	int ret = -ENOMEM;
> > +	int i, total_vqs;
> > +	char **names;
> > +
> > +	/* We expect 1 RX virtqueue followed by 1 TX virtqueue, followd by
> 
> followed
> 

Ok, will correct it.
> > +	 * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
> > +	 * possible control vq.
> > +	 */
> > +	total_vqs = vi->max_queue_pairs * 2 +
> > +		    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);
> > +
> > +	/* Allocate space for find_vqs parameters */
> > +	vqs = kzalloc(total_vqs * sizeof(*vqs), GFP_KERNEL);
> > +	callbacks = kzalloc(total_vqs * sizeof(*callbacks), GFP_KERNEL);
> > +	if (!vqs || !callbacks)
> > +		goto err_mem;
> > +	names = kzalloc(total_vqs * sizeof(*names), GFP_KERNEL);
> 
> Why kzalloc? You seem to fill in all of it. Pls just use kmalloc
> and initialize fields.
> 

Right.
> > +	if (!names)
> > +		goto err_mem;
> 
> Since you have separate goto here it's more consistent
> to use a separate label and two if tests above.
> 

Sure.
> > +
> > +	/* Parameters for control virtqueue, if any */
> > +	if (vi->has_cvq) {
> > +		callbacks[total_vqs - 1] = NULL;
> > +		names[total_vqs - 1] = kasprintf(GFP_KERNEL, "control");
> > +	}
> > 
> > +	/* Allocate/initialize parameters for send/receive virtqueues */
> > +	for (i = 0; i < vi->max_queue_pairs; i++) {
> > +		callbacks[rxq2vq(i)] = skb_recv_done;
> > +		callbacks[txq2vq(i)] = skb_xmit_done;
> > +		names[rxq2vq(i)] = kasprintf(GFP_KERNEL, "input.%d", i);
> > +		names[txq2vq(i)] = kasprintf(GFP_KERNEL, "output.%d", i);
> > +	}
> 
> We would need to check kasprintf return value.

Looks like a better method is to make the name a member of receive_queue
and send_queue, and use sprintf here.
> Also if you allocate names from slab we'll need to free them
> later.

Then it could be freed when the send_queue and receive_queue is freed.
> It's probably easier to just use fixed names for now -
> it's not like the index is really useful.

Looks useful for debugging e.g. check whether the irq distribution is as 
expected.
> 
> > +
> > +	ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
> > +					 (const char **)names);
> 
> Please avoid casts, use a proper type for names.

I think we need a minor change in this API: we need to allocate the names
dynamically, so they could not be a const char **.
> 
> > +	if (ret)
> > +		goto err_names;
> > +
> > +	if (vi->has_cvq) {
> > +		vi->cvq = vqs[total_vqs - 1];
> > 
> >  		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
> >  		
> >  			vi->dev->features |= NETIF_F_HW_VLAN_FILTER;
> >  	
> >  	}
> > 
> > +
> > +	for (i = 0; i < vi->max_queue_pairs; i++) {
> > +		vi->rq[i].vq = vqs[rxq2vq(i)];
> > +		vi->sq[i].vq = vqs[txq2vq(i)];
> > +	}
> > +
> > +	kfree(callbacks);
> > +	kfree(vqs);
> 
> Who frees names if there's no error?
> 

The virtio core does not copy the name, so it needs this, and it is only used
for debugging if I'm reading the code correctly.
> > +
> > +	return 0;
> > +
> > +err_names:
> > +	for (i = 0; i < total_vqs * 2; i++)
> 
> Why * 2?
> This looks like a bug.

Yes, it's a bug.
> 
> > +		kfree(names[i]);
> > +	kfree(names);
> > +
> > +err_mem:
> > +	kfree(callbacks);
> > +	kfree(vqs);
> > +
> > +	return ret;
> > +}
> > +
> > +static int virtnet_alloc_queues(struct virtnet_info *vi)
> > +{
> > +	int i;
> > +
> > +	vi->sq = kzalloc(sizeof(vi->sq[0]) * vi->max_queue_pairs, GFP_KERNEL);
> > +	vi->rq = kzalloc(sizeof(vi->rq[0]) * vi->max_queue_pairs, GFP_KERNEL);
> 
> While equivalent, *vi->rq is clearer IMHO.

Ok.
> 
> > +	if (!vi->rq || !vi->sq)
> > +		goto err;
> > +
> > +	INIT_DELAYED_WORK(&vi->refill, refill_work);
> > +	/* setup initial receive and send queue parameters */
> 
> Pls remove this comment, it's confuses more than it clarifies.

Sure.
> 
> > +	for (i = 0; i < vi->max_queue_pairs; i++) {
> > +		vi->rq[i].pages = NULL;
> > +		netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
> > +			       napi_weight);
> > +
> > +		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
> > +		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
> > +	}
> > +
> > +
> 
> Extra empty line.
> 

Will remove this line.
> > +	return 0;
> > +
> > +err:
> > +	virtnet_free_queues(vi);
> > +	return -ENOMEM;
> > +}
> > +
> > +static int init_vqs(struct virtnet_info *vi)
> > +{
> > +	int ret;
> > +
> > +	/* Allocate send & receive queues */
> > +	ret = virtnet_alloc_queues(vi);
> > +	if (ret)
> > +		goto err;
> > +
> > +	ret = virtnet_find_vqs(vi);
> > +	if (ret)
> > +		goto err_free;
> > +
> > +	virtnet_set_affinity(vi, true);
> > 
> >  	return 0;
> > 
> > +
> > +err_free:
> > +	virtnet_free_queues(vi);
> > +err:
> > +	return ret;
> > 
> >  }
> >  
> >  static int virtnet_probe(struct virtio_device *vdev)
> >  {
> > 
> > -	int err;
> > +	int i, err;
> > 
> >  	struct net_device *dev;
> >  	struct virtnet_info *vi;
> > 
> > +	u16 max_queue_pairs;
> > +
> > +	/* Find if host supports multiqueue virtio_net device */
> > +	err = virtio_config_val(vdev, VIRTIO_NET_F_RFS,
> > +				offsetof(struct virtio_net_config,
> > +				max_virtqueue_pairs), &max_queue_pairs);
> > +
> > +	/* We need at least 2 queue's */
> > +	if (err || max_queue_pairs < VIRTIO_NET_CTRL_RFS_VQ_PAIRS_MIN ||
> > +	    max_queue_pairs > VIRTIO_NET_CTRL_RFS_VQ_PAIRS_MAX)
> 
> Check has_cvq as well.
> 

Sure.
> > +		max_queue_pairs = 1;
> > 
> >  	/* Allocate ourselves a network device with room for our info */
> > 
> > -	dev = alloc_etherdev(sizeof(struct virtnet_info));
> > +	dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
> > 
> >  	if (!dev)
> >  	
> >  		return -ENOMEM;
> > 
> > @@ -1127,22 +1386,17 @@ static int virtnet_probe(struct virtio_device
> > *vdev)> 
> >  	/* Set up our device-specific information */
> >  	vi = netdev_priv(dev);
> > 
> > -	netif_napi_add(dev, &vi->rq.napi, virtnet_poll, napi_weight);
> > 
> >  	vi->dev = dev;
> >  	vi->vdev = vdev;
> >  	vdev->priv = vi;
> > 
> > -	vi->rq.pages = NULL;
> > 
> >  	vi->stats = alloc_percpu(struct virtnet_stats);
> >  	err = -ENOMEM;
> >  	if (vi->stats == NULL)
> >  	
> >  		goto free;
> > 
> > -	INIT_DELAYED_WORK(&vi->refill, refill_work);
> > 
> >  	mutex_init(&vi->config_lock);
> >  	vi->config_enable = true;
> >  	INIT_WORK(&vi->config_work, virtnet_config_changed_work);
> > 
> > -	sg_init_table(vi->rq.sg, ARRAY_SIZE(vi->rq.sg));
> > -	sg_init_table(vi->sq.sg, ARRAY_SIZE(vi->sq.sg));
> > 
> >  	/* If we can receive ANY GSO packets, we must allocate large ones. */
> >  	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
> > 
> > @@ -1153,10 +1407,21 @@ static int virtnet_probe(struct virtio_device
> > *vdev)> 
> >  	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
> >  	
> >  		vi->mergeable_rx_bufs = true;
> > 
> > +	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
> > +		vi->has_cvq = true;
> > +
> > +	/* Use single tx/rx queue pair as default */
> > +	vi->curr_queue_pairs = 1;
> > +	vi->max_queue_pairs = max_queue_pairs;
> > +
> > +	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
> > 
> >  	err = init_vqs(vi);
> >  	if (err)
> >  	
> >  		goto free_stats;
> > 
> > +	netif_set_real_num_tx_queues(dev, 1);
> > +	netif_set_real_num_rx_queues(dev, 1);
> > +
> > 
> >  	err = register_netdev(dev);
> >  	if (err) {
> >  	
> >  		pr_debug("virtio_net: registering device failed\n");
> > 
> > @@ -1164,12 +1429,15 @@ static int virtnet_probe(struct virtio_device
> > *vdev)> 
> >  	}
> >  	
> >  	/* Last of all, set up some receive buffers. */
> > 
> > -	try_fill_recv(&vi->rq, GFP_KERNEL);
> > -
> > -	/* If we didn't even get one input buffer, we're useless. */
> > -	if (vi->rq.num == 0) {
> > -		err = -ENOMEM;
> > -		goto unregister;
> > +	for (i = 0; i < vi->max_queue_pairs; i++) {
> > +		try_fill_recv(&vi->rq[i], GFP_KERNEL);
> > +
> > +		/* If we didn't even get one input buffer, we're useless. */
> > +		if (vi->rq[i].num == 0) {
> > +			free_unused_bufs(vi);
> > +			err = -ENOMEM;
> > +			goto free_recv_bufs;
> > +		}
> > 
> >  	}
> >  	
> >  	/* Assume link up if device can't report link status,
> > 
> > @@ -1182,13 +1450,19 @@ static int virtnet_probe(struct virtio_device
> > *vdev)> 
> >  		netif_carrier_on(dev);
> >  	
> >  	}
> > 
> > -	pr_debug("virtnet: registered device %s\n", dev->name);
> > +	pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
> > +		 dev->name, max_queue_pairs);
> > +
> > 
> >  	return 0;
> > 
> > -unregister:
> > +free_recv_bufs:
> > +	free_receive_bufs(vi);
> > 
> >  	unregister_netdev(dev);
> > 
> > +
> > 
> >  free_vqs:
> > -	vdev->config->del_vqs(vdev);
> > +	cancel_delayed_work_sync(&vi->refill);
> > +	virtnet_del_vqs(vi);
> > +
> > 
> >  free_stats:
> >  	free_percpu(vi->stats);
> >  
> >  free:
> > @@ -1196,28 +1470,6 @@ free:
> >  	return err;
> >  
> >  }
> > 
> > -static void free_unused_bufs(struct virtnet_info *vi)
> > -{
> > -	void *buf;
> > -	while (1) {
> > -		buf = virtqueue_detach_unused_buf(vi->sq.vq);
> > -		if (!buf)
> > -			break;
> > -		dev_kfree_skb(buf);
> > -	}
> > -	while (1) {
> > -		buf = virtqueue_detach_unused_buf(vi->rq.vq);
> > -		if (!buf)
> > -			break;
> > -		if (vi->mergeable_rx_bufs || vi->big_packets)
> > -			give_pages(&vi->rq, buf);
> > -		else
> > -			dev_kfree_skb(buf);
> > -		--vi->rq.num;
> > -	}
> > -	BUG_ON(vi->rq.num != 0);
> > -}
> > -
> > 
> >  static void remove_vq_common(struct virtnet_info *vi)
> >  {
> >  
> >  	vi->vdev->config->reset(vi->vdev);
> > 
> > @@ -1225,10 +1477,9 @@ static void remove_vq_common(struct virtnet_info
> > *vi)> 
> >  	/* Free unused buffers in both send and recv, if any. */
> >  	free_unused_bufs(vi);
> > 
> > -	vi->vdev->config->del_vqs(vi->vdev);
> > +	free_receive_bufs(vi);
> > 
> > -	while (vi->rq.pages)
> > -		__free_pages(get_a_page(&vi->rq, GFP_KERNEL), 0);
> > +	virtnet_del_vqs(vi);
> > 
> >  }
> >  
> >  static void __devexit virtnet_remove(struct virtio_device *vdev)
> > 
> > @@ -1254,6 +1505,7 @@ static void __devexit virtnet_remove(struct
> > virtio_device *vdev)> 
> >  static int virtnet_freeze(struct virtio_device *vdev)
> >  {
> >  
> >  	struct virtnet_info *vi = vdev->priv;
> > 
> > +	int i;
> > 
> >  	/* Prevent config work handler from accessing the device */
> >  	mutex_lock(&vi->config_lock);
> > 
> > @@ -1264,7 +1516,10 @@ static int virtnet_freeze(struct virtio_device
> > *vdev)> 
> >  	cancel_delayed_work_sync(&vi->refill);
> >  	
> >  	if (netif_running(vi->dev))
> > 
> > -		napi_disable(&vi->rq.napi);
> > +		for (i = 0; i < vi->max_queue_pairs; i++) {
> > +			napi_disable(&vi->rq[i].napi);
> > +			netif_napi_del(&vi->rq[i].napi);
> > +		}
> > 
> >  	remove_vq_common(vi);
> > 
> > @@ -1276,24 +1531,29 @@ static int virtnet_freeze(struct virtio_device
> > *vdev)> 
> >  static int virtnet_restore(struct virtio_device *vdev)
> >  {
> >  
> >  	struct virtnet_info *vi = vdev->priv;
> > 
> > -	int err;
> > +	int err, i;
> > 
> >  	err = init_vqs(vi);
> >  	if (err)
> >  	
> >  		return err;
> >  	
> >  	if (netif_running(vi->dev))
> > 
> > -		virtnet_napi_enable(&vi->rq);
> > +		for (i = 0; i < vi->max_queue_pairs; i++)
> > +			virtnet_napi_enable(&vi->rq[i]);
> > 
> >  	netif_device_attach(vi->dev);
> > 
> > -	if (!try_fill_recv(&vi->rq, GFP_KERNEL))
> > -		schedule_delayed_work(&vi->refill, 0);
> > +	for (i = 0; i < vi->max_queue_pairs; i++)
> > +		if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
> > +			schedule_delayed_work(&vi->refill, 0);
> > 
> >  	mutex_lock(&vi->config_lock);
> >  	vi->config_enable = true;
> >  	mutex_unlock(&vi->config_lock);
> > 
> > +	if (vi->has_cvq && virtio_has_feature(vi->vdev, VIRTIO_NET_F_RFS))
> > +		virtnet_set_queues(vi);
> > +
> 
> I think it's easier to test
> if (curr_queue_pairs == max_queue_pairs)
> within virtnet_set_queues and make it
> a NOP if so.

Still need to send the command during restore since we reset the device during 
freezing.
> 
> >  	return 0;
> >  
> >  }
> >  #endif
> > 
> > @@ -1311,7 +1571,7 @@ static unsigned int features[] = {
> > 
> >  	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO,
> >  	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ,
> >  	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN,
> > 
> > -	VIRTIO_NET_F_GUEST_ANNOUNCE,
> > +	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_RFS,
> > 
> >  };
> >  
> >  static struct virtio_driver virtio_net_driver = {
> > 
> > @@ -1329,6 +1589,12 @@ static struct virtio_driver virtio_net_driver = {
> > 
> >  #endif
> >  };
> > 
> > +static const struct ethtool_ops virtnet_ethtool_ops = {
> > +	.get_drvinfo = virtnet_get_drvinfo,
> > +	.get_link = ethtool_op_get_link,
> > +	.get_ringparam = virtnet_get_ringparam,
> > +};
> > +
> > 
> >  static int __init init(void)
> >  {
> >  
> >  	return register_virtio_driver(&virtio_net_driver);
> > 
> > diff --git a/include/uapi/linux/virtio_net.h
> > b/include/uapi/linux/virtio_net.h index 2470f54..6056cec 100644
> > --- a/include/uapi/linux/virtio_net.h
> > +++ b/include/uapi/linux/virtio_net.h
> > @@ -51,6 +51,7 @@
> > 
> >  #define VIRTIO_NET_F_CTRL_RX_EXTRA 20	/* Extra RX mode control support */
> >  #define VIRTIO_NET_F_GUEST_ANNOUNCE 21	/* Guest can announce device on
> >  the
> >  
> >  					 * network */
> > 
> > +#define VIRTIO_NET_F_RFS	22	/* Device supports multiple TXQ/RXQ */
> 
> Should be
> /* Device supports Receive Flow Steering. */

Ok.
> 
> >  #define VIRTIO_NET_S_LINK_UP	1	/* Link is up */
> >  #define VIRTIO_NET_S_ANNOUNCE	2	/* Announcement is needed */
> > 
> > @@ -60,6 +61,8 @@ struct virtio_net_config {
> > 
> >  	__u8 mac[6];
> >  	/* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */
> >  	__u16 status;
> > 
> > +	/* Total number of RX/TX queues */
> 
> Better comment:
> /* Maximum number of each of transmit and receive queues;
>  * see VIRTIO_NET_F_RFS and VIRTIO_NET_CTRL_RFS.
>  * Legal values are between 1 and 0x8000
>  */

Sure.
> 
> > +	__u16 max_virtqueue_pairs;
> > 
> >  } __attribute__((packed));
> >  
> >  /* This is the first element of the scatter-gather list.  If you don't
> > 
> > @@ -166,4 +169,17 @@ struct virtio_net_ctrl_mac {
> > 
> >  #define VIRTIO_NET_CTRL_ANNOUNCE       3
> >  
> >   #define VIRTIO_NET_CTRL_ANNOUNCE_ACK         0
> > 
> > +/*
> > + * Control multiqueue
> 
> Here's a better comment:
> 
> Control Receive Flow Steering
> 
>  The command VIRTIO_NET_CTRL_RFS_VQ_PAIRS_SET
>  enables Receive Flow Steering, specifying the number of the transmit and
> receive queues that will be used.
>  After the command is consumed and acked by the device,
>  the device will not steer new packets on receive virtqueues
>  other than specified nor read from transmit virtqueues other than
> specified. Accordingly, driver should not transmit new packets
>  on virtqueues other than specified.
> 

Sure, thanks for the reminder.
> > + *
> 
> Remove this empty line.

Ok.
> 
> > + */
> > +struct virtio_net_ctrl_rfs {
> 
> /* Number of each of transmit and receive queues to use;
>  * Legal values are between 1 and max_virtqueue_pairs
>  */
> 
> > +	u16 virtqueue_pairs;
> > +};
> > +
> > +#define VIRTIO_NET_CTRL_RFS   4
> > + #define VIRTIO_NET_CTRL_RFS_VQ_PAIRS_SET        0
> 
> /* Value range for max_virtqueue_pairsfor and virtqueue_pairs above */
> 
> > + #define VIRTIO_NET_CTRL_RFS_VQ_PAIRS_MIN        1
> > + #define VIRTIO_NET_CTRL_RFS_VQ_PAIRS_MAX        0x8000
> > +
> > 
> >  #endif /* _LINUX_VIRTIO_NET_H */
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH net-next 3/3] virtio-net: change the number of queues through ethtool
From: Jason Wang @ 2012-12-04 14:46 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: krkumar2, kvm, netdev, linux-kernel, virtualization, bhutchings,
	jwhan, davem, shiyer
In-Reply-To: <20121204134959.GG7499@redhat.com>

On Tuesday, December 04, 2012 03:49:59 PM Michael S. Tsirkin wrote:
> On Tue, Dec 04, 2012 at 07:07:58PM +0800, Jason Wang wrote:
> > This patch implement the ethtool_{set|get}_channels method of ethool to
> > allow user to change the number of queues dymaically when the device is
> > running. This would let the user to configure it on demand.
> > 
> > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > ---
> > 
> >  drivers/net/virtio_net.c |   44
> >  ++++++++++++++++++++++++++++++++++++++++++++ 1 files changed, 44
> >  insertions(+), 0 deletions(-)
> > 
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 912f5b2..b9f9887 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -1589,10 +1589,54 @@ static struct virtio_driver virtio_net_driver = {
> > 
> >  #endif
> >  };
> > 
> > +/* TODO: Eliminate OOO packets during switching */
> > +static int virtnet_set_channels(struct net_device *dev,
> > +				struct ethtool_channels *channels)
> > +{
> > +	struct virtnet_info *vi = netdev_priv(dev);
> > +	u16 queue_pairs = channels->combined_count;
> > +	u16 old_queue_pairs = vi->curr_queue_pairs;
> > +
> > +	/* We don't support separate rx/tx channels.
> > +	 * We don't allow setting 'other' channels.
> > +	 */
> > +	if (channels->rx_count || channels->tx_count || channels->other_count)
> > +		return -EINVAL;
> > +
> > +	if (queue_pairs > vi->max_queue_pairs)
> > +		return -EINVAL;
> > +
> > +	vi->curr_queue_pairs = queue_pairs;
> > +	if (virtnet_set_queues(vi) == 0) {
> > +		netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
> > +		netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);
> 
> Just use queue_pairs - it's shorter.

Ok.
> 
> > +
> > +		virtnet_set_affinity(vi, true);
> > +	} else
> > +		vi->curr_queue_pairs = old_queue_pairs;
> 
> Should be
> 	ret = virtnet_set_queues(vi);
> 	if (ret) {
> 		vi->curr_queue_pairs = old_queue_pairs;
> 		return ret;
> 	}
> otherwise we loose error reporting.
> 

Right.
> Also it's better if virtnet_set_queues
> gets queue_pairs as parameter and set curr_queue_pairs
> on success.

True, looks simpler than current method, will do it in next version.
> 
> > +
> > +	return 0;
> > +}
> > +
> > +static void virtnet_get_channels(struct net_device *dev,
> > +				 struct ethtool_channels *channels)
> > +{
> > +	struct virtnet_info *vi = netdev_priv(dev);
> > +
> > +	channels->combined_count = vi->curr_queue_pairs;
> > +	channels->max_combined = vi->max_queue_pairs;
> > +	channels->max_other = 0;
> > +	channels->rx_count = 0;
> > +	channels->tx_count = 0;
> > +	channels->other_count = 0;
> > +}
> > +
> > 
> >  static const struct ethtool_ops virtnet_ethtool_ops = {
> >  
> >  	.get_drvinfo = virtnet_get_drvinfo,
> >  	.get_link = ethtool_op_get_link,
> >  	.get_ringparam = virtnet_get_ringparam,
> > 
> > +	.set_channels = virtnet_set_channels,
> > +	.get_channels = virtnet_get_channels,
> > 
> >  };
> >  
> >  static int __init init(void)
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [net-next PATCH V3-evictor] net: frag evictor, avoid killing warm frag queues
From: Eric Dumazet @ 2012-12-04 14:47 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: David S. Miller, Florian Westphal, netdev, Thomas Graf,
	Paul E. McKenney, Cong Wang, Herbert Xu
In-Reply-To: <20121204133007.20215.52566.stgit@dragon>

On Tue, 2012-12-04 at 14:30 +0100, Jesper Dangaard Brouer wrote:
> The fragmentation evictor system have a very unfortunate eviction
> system for killing fragment, when the system is put under pressure.
> 
> If packets are coming in too fast, the evictor code kills "warm"
> fragments too quickly.  Resulting in close to zero throughput, as
> fragments are killed before they have a chance to complete
> 
> This is related to the bad interaction with the LRU (Least Recently
> Used) list.  Under load the LRU list sort-of changes meaning/behavior.
> When the LRU head is very new/warm, then the head is most likely the
> one with most fragments and the tail (latest used or added element)
> with least.
> 
> Solved by, introducing a creation "jiffie" timestamp (creation_ts).
> If the element is tried evicted in same jiffie, then perform tail drop
> on the LRU list instead.
> 
> Signed-off-by: Jesper Dangaard Brouer <jbrouer@redhat.com>

This would only 'work' if a reassembled packet can be done/completed
under one jiffie.

For 64KB packets, this means a 100Mb link won't be able to deliver a
reassembled packet under IP frags load if HZ=1000

LRU goal is to be able to select the oldest inet_frag_queue, because in
typical networks, packet losses are really happening and this is why
some packets won't complete their reassembly. They naturally will be
found on LRU head, and they probably are very fat (for example a single
packet was lost for the inet_frag_queue)

Choosing the most recent inet_frag_queue is exactly the opposite
strategy. We pay the huge cost of maintaining a central LRU, and we
exactly misuse it.

As long as an inet_frag_queue receives new fragments and is moved to the
LRU tail, it's a candidate for being kept, not a candidate for being
evicted.

Only when an inet_frag_queue is the oldest one, it becomes a candidate
for eviction.

I think you are trying to solve a configuration/tuning problem by
changing a valid strategy.

What's wrong with admitting high_thresh/low_thresh default values should
be updated, now some people apparently want to use IP fragments in
production ?

Let's say we allow to use 1 % of memory for frags, instead of the current
256 KB limit, which was chosen decades ago.

Only in very severe DOS attacks, LRU head 'creation_ts' would possibly
be <= 1ms. And under severe DOS attacks, I am afraid there is nothing we
can do.

(We could eventually avoid LRU hassle and choose instead a random drop
strategy)

high_thresh/low_thresh should be changed from 'int' to 'long' as well,
so that a 64bit host could use more than 2GB for frag storage.

^ permalink raw reply

* Re: net, batman: lockdep circular dependency warning
From: Sven Eckelmann @ 2012-12-04 14:51 UTC (permalink / raw)
  To: b.a.t.m.a.n, netdev; +Cc: Sasha Levin
In-Reply-To: <50A15E1B.1010001@oracle.com>

[-- Attachment #1: Type: text/plain, Size: 4997 bytes --]

Hi,

thanks for your report. It seems nobody else wanted to give an answer... so I 
try to give a small overview.

On Monday 12 November 2012 15:37:47 Sasha Levin wrote:
> Hi all,
> 
> While fuzzing with trinity inside a KVM tools (lkvm) guest running latest
> -next kernel, I've stumbled on the following:
> 
> [ 1002.969392] ======================================================
> [ 1002.971639] [ INFO: possible circular locking dependency detected ]
> [ 1002.975805] 3.7.0-rc5-next-20121112-sasha-00018-g2f4ce0e #127 Tainted: G 
>       W [ 1002.983691]
> ------------------------------------------------------- [ 1002.983691]
> trinity-child18/8149 is trying to acquire lock:
> [ 1002.983691]  (s_active#313){++++.+}, at: [<ffffffff812f9941>]
> sysfs_addrm_finish+0x31/0x60 [ 1002.983691]
> [ 1002.983691] but task is already holding lock:
> [ 1002.983691]  (rtnl_mutex){+.+.+.}, at: [<ffffffff834fcc62>]
> rtnl_lock+0x12/0x20 [ 1002.983691]
> [ 1002.983691] which lock already depends on the new lock.

It is known that batman-adv has a problem with the attaching/detaching of 
interfaces over this sysfs. The cause of this problem is related to the fact 
that batman-adv not only creates its own net_devices, but also unregisters 
net_devices. This unregister will add a new element in the net_todo_list. This 
will cause a rtnl_lock when it calls netdev_wait_allrefs (there are some 
conditions, but we just ignore them for now). So the whole exercise of using 
rtnl_trylock was useless.

This extra rtnl_lock can cause a deadlock as you found out because it is 
activated through a sysfs file and therefore the s_active mutex is locked (we 
have the dependency s_active -> rtnl_mutex, but other users have rtnl_mutex -> 
s_active).

So, what to do? There are different possibilities. We have to keep in mind 
that there is a patchset (not yet accepted by the batman-adv maintainers) 
which allows to use `ip link` or compatible tools to create/destroy batman-adv 
devices and attach/detach other devices.

1. Remove the sysfs interface to attach/detach net_devices (which
   destroys/creates batman-adv devices)

   This is not really backward compatible and therefore not really acceptable.
   Marek Lindner and Simon Wunderlich are also against forcing users to
   require special tools to add/configure batman-adv devices (even batctl, ip
   and so on).

2. Ignore the possible deadlock

   (sry, fill in your own comment...)

3. Add workarounds in the core net code

   Simon Wunderlich already tried it... I personally think it is not the right
   way because it more likely to introduce more bugs by hiding a batman-adv
   bug. And these bugs are a lot harder to find... trust me

   For example the usage of __rtnl_unlock will let this bug appear in
   other places which use rtnl_trylock. This is caused by the fact that the
   todo item isn't processed by __rtnl_unlock (this is the whole idea of
   calling it) and therefore the todo work stays in net_todo_list. Another
   user of rtnl_trylock will now call rtnl_unlock and doesn't expect an entry in
   net_todo_list because he never unregistered a device. So he now has the
   problem of batman-adv (what an unsocial läderlappen).

   And moving everybody using rtnl_trylock to __rtnl_unlock still has the
   problem that batman-adv doesn't immediately work on its todo and so
   maybe causes other side effects because... the notifications weren't
   sent and therefore the refcount of the unregistered device didn't go
   to zero.

   (I'll leave other side effects as homework for the reader)

4. Don't automatically remove batman-adv devices

   The current approach is to automatically unregister batman-adv devices
   when they don't have attached slave-devices (hardif called by batman-adv).
   Removing this will slightly change the behaviour, but the interface can
   still be removed using `ip link del dev bat0` or a similar tool.

5. Add a workaround solution and promote the use of the standard interface

   So, the basic problem is the s_active mutex locked by the sysfs interface.
   An idea is to postpone the part which needs the rtnl_mutex to a later time.
   This has obviously the problem that we cannot return an error code to the
   caller when the device creation failed in the postponed part. This problem
   can be reduced slightly by moving only the unregister part, but now I'll leave
   this out for simplicity of the description.

   A possible implementation would create a work_struct and add it to
   batadv_event_workqueue. This work item has to contain all information given
   by the user (which hardif, name of the batman-adv device).

   Simon Wunderlich already disliked this workaround, but Antonio Quartulli
   tried to encourage an RFC implementation. I've preferred a textual
   description rather than a patch missing explanations of the other
   alternatives.

Kind regards,
	Sven

[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* Re: [PATCH net-next 2/3] virtio_net: multiqueue support
From: Michael S. Tsirkin @ 2012-12-04 15:11 UTC (permalink / raw)
  To: Jason Wang
  Cc: krkumar2, kvm, netdev, linux-kernel, virtualization, bhutchings,
	jwhan, davem, shiyer
In-Reply-To: <7748384.YyJVzCCbsc@jason-thinkpad-t430s>

On Tue, Dec 04, 2012 at 10:45:33PM +0800, Jason Wang wrote:
> On Tuesday, December 04, 2012 03:24:22 PM Michael S. Tsirkin wrote:
> > I found some bugs, see below.
> > Also some style nitpicking, this is not mandatory to address.
> 
> Thanks for the reviewing.
> > 
> > On Tue, Dec 04, 2012 at 07:07:57PM +0800, Jason Wang wrote:
> > > This addes multiqueue support to virtio_net driver. In multiple queue
> > > modes, the driver expects the number of queue paris is equal to the
> > > number of vcpus. To eliminate the contention bettwen vcpus and
> > > virtqueues, per-cpu virtqueue pairs
> > > were implemented through:
> > Lots of typos above - try running ispell on it :)
> > 
> 
> Sure.
> > > - select the txq based on the smp processor id.
> > > - smp affinity hint were set to the vcpu that owns the queue pairs.
> > > 
> > > Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > ---
> > > 
> > >  drivers/net/virtio_net.c        |  472
> > >  ++++++++++++++++++++++++++++++--------- include/uapi/linux/virtio_net.h
> > >  |   16 ++
> > >  2 files changed, 385 insertions(+), 103 deletions(-)
> > > 
> > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > index 266f712..912f5b2 100644
> > > --- a/drivers/net/virtio_net.c
> > > +++ b/drivers/net/virtio_net.c
> > > @@ -81,16 +81,25 @@ struct virtnet_info {
> > > 
> > >  	struct virtio_device *vdev;
> > >  	struct virtqueue *cvq;
> > >  	struct net_device *dev;
> > > 
> > > -	struct send_queue sq;
> > > -	struct receive_queue rq;
> > > +	struct send_queue *sq;
> > > +	struct receive_queue *rq;
> > > 
> > >  	unsigned int status;
> > > 
> > > +	/* Max # of queue pairs supported by the device */
> > > +	u16 max_queue_pairs;
> > > +
> > > +	/* # of queue pairs currently used by the driver */
> > > +	u16 curr_queue_pairs;
> > > +
> > > 
> > >  	/* I like... big packets and I cannot lie! */
> > >  	bool big_packets;
> > >  	
> > >  	/* Host will merge rx buffers for big packets (shake it! shake it!) */
> > >  	bool mergeable_rx_bufs;
> > > 
> > > +	/* Has control virtqueue */
> > > +	bool has_cvq;
> > > +
> > > 
> > >  	/* enable config space updates */
> > >  	bool config_enable;
> > > 
> > > @@ -125,6 +134,32 @@ struct padded_vnet_hdr {
> > > 
> > >  	char padding[6];
> > >  
> > >  };
> > > 
> > > +static const struct ethtool_ops virtnet_ethtool_ops;
> > > +
> > > +
> > > +/* Converting between virtqueue no. and kernel tx/rx queue no.
> > > + * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
> > > + */
> > > +static int vq2txq(struct virtqueue *vq)
> > > +{
> > > +	return (virtqueue_get_queue_index(vq) - 1) / 2;
> > > +}
> > > +
> > > +static int txq2vq(int txq)
> > > +{
> > > +	return txq * 2 + 1;
> > > +}
> > > +
> > > +static int vq2rxq(struct virtqueue *vq)
> > > +{
> > > +	return virtqueue_get_queue_index(vq) / 2;
> > > +}
> > > +
> > > +static int rxq2vq(int rxq)
> > > +{
> > > +	return rxq * 2;
> > > +}
> > > +
> > > 
> > >  static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb)
> > >  {
> > >  
> > >  	return (struct skb_vnet_hdr *)skb->cb;
> > > 
> > > @@ -165,7 +200,7 @@ static void skb_xmit_done(struct virtqueue *vq)
> > > 
> > >  	virtqueue_disable_cb(vq);
> > >  	
> > >  	/* We were probably waiting for more output buffers. */
> > > 
> > > -	netif_wake_queue(vi->dev);
> > > +	netif_wake_subqueue(vi->dev, vq2txq(vq));
> > > 
> > >  }
> > >  
> > >  static void set_skb_frag(struct sk_buff *skb, struct page *page,
> > > 
> > > @@ -502,7 +537,7 @@ static bool try_fill_recv(struct receive_queue *rq,
> > > gfp_t gfp)> 
> > >  static void skb_recv_done(struct virtqueue *rvq)
> > >  {
> > >  
> > >  	struct virtnet_info *vi = rvq->vdev->priv;
> > > 
> > > -	struct receive_queue *rq = &vi->rq;
> > > +	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
> > > 
> > >  	/* Schedule NAPI, Suppress further interrupts if successful. */
> > >  	if (napi_schedule_prep(&rq->napi)) {
> > > 
> > > @@ -532,15 +567,21 @@ static void refill_work(struct work_struct *work)
> > > 
> > >  	struct virtnet_info *vi =
> > >  	
> > >  		container_of(work, struct virtnet_info, refill.work);
> > >  	
> > >  	bool still_empty;
> > > 
> > > +	int i;
> > > +
> > > +	for (i = 0; i < vi->max_queue_pairs; i++) {
> > > +		struct receive_queue *rq = &vi->rq[i];
> > > 
> > > -	napi_disable(&vi->rq.napi);
> > > -	still_empty = !try_fill_recv(&vi->rq, GFP_KERNEL);
> > > -	virtnet_napi_enable(&vi->rq);
> > > +		napi_disable(&rq->napi);
> > > +		still_empty = !try_fill_recv(rq, GFP_KERNEL);
> > > +		virtnet_napi_enable(rq);
> > > 
> > > -	/* In theory, this can happen: if we don't get any buffers in
> > > -	 * we will *never* try to fill again. */
> > > -	if (still_empty)
> > > -		schedule_delayed_work(&vi->refill, HZ/2);
> > > +		/* In theory, this can happen: if we don't get any buffers in
> > > +		 * we will *never* try to fill again.
> > > +		 */
> > > +		if (still_empty)
> > > +			schedule_delayed_work(&vi->refill, HZ/2);
> > > +	}
> > > 
> > >  }
> > >  
> > >  static int virtnet_poll(struct napi_struct *napi, int budget)
> > > 
> > > @@ -650,7 +691,8 @@ static int xmit_skb(struct send_queue *sq, struct
> > > sk_buff *skb)> 
> > >  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device
> > >  *dev)
> > >  {
> > >  
> > >  	struct virtnet_info *vi = netdev_priv(dev);
> > > 
> > > -	struct send_queue *sq = &vi->sq;
> > > +	int qnum = skb_get_queue_mapping(skb);
> > > +	struct send_queue *sq = &vi->sq[qnum];
> > > 
> > >  	int capacity;
> > >  	
> > >  	/* Free up any pending old buffers before queueing new ones. */
> > > 
> > > @@ -664,13 +706,14 @@ static netdev_tx_t start_xmit(struct sk_buff *skb,
> > > struct net_device *dev)> 
> > >  		if (likely(capacity == -ENOMEM)) {
> > >  		
> > >  			if (net_ratelimit())
> > >  			
> > >  				dev_warn(&dev->dev,
> > > 
> > > -					 "TX queue failure: out of memory\n");
> > > +					 "TXQ (%d) failure: out of memory\n",
> > > +					 qnum);
> > > 
> > >  		} else {
> > >  		
> > >  			dev->stats.tx_fifo_errors++;
> > >  			if (net_ratelimit())
> > >  			
> > >  				dev_warn(&dev->dev,
> > > 
> > > -					 "Unexpected TX queue failure: %d\n",
> > > -					 capacity);
> > > +					 "Unexpected TXQ (%d) failure: %d\n",
> > > +					 qnum, capacity);
> > > 
> > >  		}
> > >  		dev->stats.tx_dropped++;
> > >  		kfree_skb(skb);
> > > 
> > > @@ -685,12 +728,12 @@ static netdev_tx_t start_xmit(struct sk_buff *skb,
> > > struct net_device *dev)> 
> > >  	/* Apparently nice girls don't return TX_BUSY; stop the queue
> > >  	
> > >  	 * before it gets out of hand.  Naturally, this wastes entries. */
> > >  	
> > >  	if (capacity < 2+MAX_SKB_FRAGS) {
> > > 
> > > -		netif_stop_queue(dev);
> > > +		netif_stop_subqueue(dev, qnum);
> > > 
> > >  		if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
> > >  		
> > >  			/* More just got used, free them then recheck. */
> > >  			capacity += free_old_xmit_skbs(sq);
> > >  			if (capacity >= 2+MAX_SKB_FRAGS) {
> > > 
> > > -				netif_start_queue(dev);
> > > +				netif_start_subqueue(dev, qnum);
> > > 
> > >  				virtqueue_disable_cb(sq->vq);
> > >  			
> > >  			}
> > >  		
> > >  		}
> > > 
> > > @@ -758,23 +801,13 @@ static struct rtnl_link_stats64
> > > *virtnet_stats(struct net_device *dev,> 
> > >  static void virtnet_netpoll(struct net_device *dev)
> > >  {
> > >  
> > >  	struct virtnet_info *vi = netdev_priv(dev);
> > > 
> > > +	int i;
> > > 
> > > -	napi_schedule(&vi->rq.napi);
> > > +	for (i = 0; i < vi->curr_queue_pairs; i++)
> > > +		napi_schedule(&vi->rq[i].napi);
> > > 
> > >  }
> > >  #endif
> > > 
> > > -static int virtnet_open(struct net_device *dev)
> > > -{
> > > -	struct virtnet_info *vi = netdev_priv(dev);
> > > -
> > > -	/* Make sure we have some buffers: if oom use wq. */
> > > -	if (!try_fill_recv(&vi->rq, GFP_KERNEL))
> > > -		schedule_delayed_work(&vi->refill, 0);
> > > -
> > > -	virtnet_napi_enable(&vi->rq);
> > > -	return 0;
> > > -}
> > > -
> > > 
> > >  /*
> > >  
> > >   * Send command via the control virtqueue and check status.  Commands
> > >   * supported by the hypervisor, as indicated by feature bits, should
> > > 
> > > @@ -830,13 +863,51 @@ static void virtnet_ack_link_announce(struct
> > > virtnet_info *vi)> 
> > >  	rtnl_unlock();
> > >  
> > >  }
> > > 
> > > +/* Caller check the support of cvq and multiqueue. */
> > > +static int virtnet_set_queues(struct virtnet_info *vi)
> > > +{
> > > +	struct scatterlist sg;
> > > +	struct virtio_net_ctrl_rfs s;
> > > +	struct net_device *dev = vi->dev;
> > > +
> > > +	s.virtqueue_pairs = vi->curr_queue_pairs;
> > > +	sg_init_one(&sg, &s, sizeof(s));
> > > +
> > > +	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RFS,
> > > +				  VIRTIO_NET_CTRL_RFS_VQ_PAIRS_SET, &sg, 1, 0)){
> > > +		dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
> > > +			 vi->curr_queue_pairs);
> > > +		return -EINVAL;
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static int virtnet_open(struct net_device *dev)
> > 
> > Why move this here? diff will be smaller if you don't.
> 
> Ok, will move it back.
> > 
> > > +{
> > > +	struct virtnet_info *vi = netdev_priv(dev);
> > > +	int i;
> > > +
> > > +	for (i = 0; i < vi->max_queue_pairs; i++) {
> > > +		/* Make sure we have some buffers: if oom use wq. */
> > > +		if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
> > > +			schedule_delayed_work(&vi->refill, 0);
> > > +		virtnet_napi_enable(&vi->rq[i]);
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > 
> > >  static int virtnet_close(struct net_device *dev)
> > >  {
> > >  
> > >  	struct virtnet_info *vi = netdev_priv(dev);
> > > 
> > > +	int i;
> > > 
> > >  	/* Make sure refill_work doesn't re-enable napi! */
> > >  	cancel_delayed_work_sync(&vi->refill);
> > > 
> > > -	napi_disable(&vi->rq.napi);
> > > +
> > > +	for (i = 0; i < vi->max_queue_pairs; i++)
> > > +		napi_disable(&vi->rq[i].napi);
> > > 
> > >  	return 0;
> > >  
> > >  }
> > > 
> > > @@ -948,8 +1019,8 @@ static void virtnet_get_ringparam(struct net_device
> > > *dev,> 
> > >  {
> > >  
> > >  	struct virtnet_info *vi = netdev_priv(dev);
> > > 
> > > -	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq.vq);
> > > -	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq.vq);
> > > +	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
> > > +	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
> > > 
> > >  	ring->rx_pending = ring->rx_max_pending;
> > >  	ring->tx_pending = ring->tx_max_pending;
> > >  
> > >  }
> > > 
> > > @@ -967,12 +1038,6 @@ static void virtnet_get_drvinfo(struct net_device
> > > *dev,> 
> > >  }
> > > 
> > > -static const struct ethtool_ops virtnet_ethtool_ops = {
> > > -	.get_drvinfo = virtnet_get_drvinfo,
> > > -	.get_link = ethtool_op_get_link,
> > > -	.get_ringparam = virtnet_get_ringparam,
> > > -};
> > > -
> > > 
> > >  #define MIN_MTU 68
> > >  #define MAX_MTU 65535
> > > 
> > > @@ -984,6 +1049,21 @@ static int virtnet_change_mtu(struct net_device
> > > *dev, int new_mtu)> 
> > >  	return 0;
> > >  
> > >  }
> > > 
> > > +/* To avoid contending a lock hold by a vcpu who would exit to host,
> > > select the + * txq based on the processor id.
> > > + * TODO: handle cpu hotplug.
> > > + */
> > > +static u16 virtnet_select_queue(struct net_device *dev, struct sk_buff
> > > *skb) +{
> > > +	int txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) :
> > > +		  smp_processor_id();
> > > +
> > > +	while (unlikely(txq >= dev->real_num_tx_queues))
> > > +		txq -= dev->real_num_tx_queues;
> > > +
> > > +	return txq;
> > > +}
> > > +
> > > 
> > >  static const struct net_device_ops virtnet_netdev = {
> > >  
> > >  	.ndo_open            = virtnet_open,
> > >  	.ndo_stop   	     = virtnet_close,
> > > 
> > > @@ -995,6 +1075,7 @@ static const struct net_device_ops virtnet_netdev = {
> > > 
> > >  	.ndo_get_stats64     = virtnet_stats,
> > >  	.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
> > >  	.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
> > > 
> > > +	.ndo_select_queue     = virtnet_select_queue,
> > > 
> > >  #ifdef CONFIG_NET_POLL_CONTROLLER
> > >  
> > >  	.ndo_poll_controller = virtnet_netpoll,
> > >  
> > >  #endif
> > > 
> > > @@ -1030,10 +1111,10 @@ static void virtnet_config_changed_work(struct
> > > work_struct *work)> 
> > >  	if (vi->status & VIRTIO_NET_S_LINK_UP) {
> > >  	
> > >  		netif_carrier_on(vi->dev);
> > > 
> > > -		netif_wake_queue(vi->dev);
> > > +		netif_tx_wake_all_queues(vi->dev);
> > > 
> > >  	} else {
> > >  	
> > >  		netif_carrier_off(vi->dev);
> > > 
> > > -		netif_stop_queue(vi->dev);
> > > +		netif_tx_stop_all_queues(vi->dev);
> > > 
> > >  	}
> > >  
> > >  done:
> > >  	mutex_unlock(&vi->config_lock);
> > > 
> > > @@ -1046,41 +1127,219 @@ static void virtnet_config_changed(struct
> > > virtio_device *vdev)> 
> > >  	schedule_work(&vi->config_work);
> > >  
> > >  }
> > > 
> > > -static int init_vqs(struct virtnet_info *vi)
> > > +static void free_receive_bufs(struct virtnet_info *vi)
> > > 
> > >  {
> > > 
> > > -	struct virtqueue *vqs[3];
> > > -	vq_callback_t *callbacks[] = { skb_recv_done, skb_xmit_done, NULL};
> > > -	const char *names[] = { "input", "output", "control" };
> > > -	int nvqs, err;
> > > +	int i;
> > > 
> > > -	/* We expect two virtqueues, receive then send,
> > > -	 * and optionally control. */
> > > -	nvqs = virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) ? 3 : 2;
> > > +	for (i = 0; i < vi->max_queue_pairs; i++) {
> > > +		while (vi->rq[i].pages)
> > > +			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);
> > > +	}
> > > +}
> > > 
> > > -	err = vi->vdev->config->find_vqs(vi->vdev, nvqs, vqs, callbacks, names);
> > > -	if (err)
> > > -		return err;
> > > +/* Free memory allocated for send and receive queues */
> > > +static void virtnet_free_queues(struct virtnet_info *vi)
> > 
> > I think it's cleaner to open-code this, this way
> > during error handling you can have:
> > 
> > 	kfree(vi->rq);
> > err_rq:
> > 	kfree(vi->sq);
> > err_sq:
> > 
> > without tricks like malloc them both then goto end.
> 
> Ok.
> > 
> > > +{
> > > +	kfree(vi->rq);
> > > +	vi->rq = NULL;
> > > +	kfree(vi->sq);
> > > +	vi->sq = NULL;
> > 
> > I think = NULL is not needed - we never call this twice.
> 
> Sure, will remove it.
> > 
> > > +}
> > > +
> > > +static void free_unused_bufs(struct virtnet_info *vi)
> > > +{
> > > +	void *buf;
> > > +	int i;
> > > +
> > > +	for (i = 0; i < vi->max_queue_pairs; i++) {
> > > +		struct virtqueue *vq = vi->sq[i].vq;
> > > +		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> > > +			dev_kfree_skb(buf);
> > > +	}
> > > +
> > > +	for (i = 0; i < vi->max_queue_pairs; i++) {
> > > +		struct virtqueue *vq = vi->rq[i].vq;
> > > +
> > > +		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
> > > +			if (vi->mergeable_rx_bufs || vi->big_packets)
> > > +				give_pages(&vi->rq[i], buf);
> > > +			else
> > > +				dev_kfree_skb(buf);
> > > +			--vi->rq[i].num;
> > > +		}
> > > +		BUG_ON(vi->rq[i].num != 0);
> > > +	}
> > > +}
> > > +
> > > +static void virtnet_set_affinity(struct virtnet_info *vi, bool set)
> > > +{
> > > +	int i;
> > > +
> > > +	/* Don't set the affinity hint when in single queue mode or we have too
> > > +	 * much online cpus.
> > > +	 */
> > 
> > Pls remove this comment, or replace with one explaining
> > the motivation for this logic.
> 
> Will replace with the motivation.
> > 
> > > +	if (vi->curr_queue_pairs == 1 ||
> > > +	    vi->max_queue_pairs > num_online_cpus())
> > 
> > If we have less it's not a good idea either, is it?
> > So check vi->max_queue_pairs != num_online_cpus().
> 
> Ok.
> > 
> > > +		set = false;
> > 
> > This will overwrite affinity if it was set by userspace.
> > Just
> > 	if (set)
> > 		return;
> > will not have this problem.
> 
> But we need to handle the situation when switching back to sq from mq mode.
> Otherwise we may still get the affinity hint used in mq.
> This kind of overwrite
> is unavoidable, or is there some method to detect whether userspace wrote
> something new?

If we didn't set the affinity originally we should not overwrite it.
I think this means we need a flag that tells us that
virtio set the affinity.

> > 
> > > +	for (i = 0; i < vi->max_queue_pairs; i++) {
> > > +		int cpu = set ? i : -1;
> > > +		virtqueue_set_affinity(vi->rq[i].vq, cpu);
> > > +		virtqueue_set_affinity(vi->sq[i].vq, cpu);
> > > +	}
> > > +}
> > > +
> > > +static void virtnet_del_vqs(struct virtnet_info *vi)
> > 
> > It might be a good idea to add this function in previous
> > patch.
> 
> Yes, good suggestion.
> > 
> > > +{
> > > +	struct virtio_device *vdev = vi->vdev;
> > > +
> > > +	virtnet_set_affinity(vi, false);
> > > +
> > > +	vdev->config->del_vqs(vdev);
> > > 
> > > -	vi->rq.vq = vqs[0];
> > > -	vi->sq.vq = vqs[1];
> > > +	virtnet_free_queues(vi);
> > > +}
> > > 
> > > -	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) {
> > > -		vi->cvq = vqs[2];
> > > +static int virtnet_find_vqs(struct virtnet_info *vi)
> > > +{
> > > +	vq_callback_t **callbacks;
> > > +	struct virtqueue **vqs;
> > > +	int ret = -ENOMEM;
> > > +	int i, total_vqs;
> > > +	char **names;
> > > +
> > > +	/* We expect 1 RX virtqueue followed by 1 TX virtqueue, followd by
> > 
> > followed
> > 
> 
> Ok, will correct it.
> > > +	 * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
> > > +	 * possible control vq.
> > > +	 */
> > > +	total_vqs = vi->max_queue_pairs * 2 +
> > > +		    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);
> > > +
> > > +	/* Allocate space for find_vqs parameters */
> > > +	vqs = kzalloc(total_vqs * sizeof(*vqs), GFP_KERNEL);
> > > +	callbacks = kzalloc(total_vqs * sizeof(*callbacks), GFP_KERNEL);
> > > +	if (!vqs || !callbacks)
> > > +		goto err_mem;
> > > +	names = kzalloc(total_vqs * sizeof(*names), GFP_KERNEL);
> > 
> > Why kzalloc? You seem to fill in all of it. Pls just use kmalloc
> > and initialize fields.
> > 
> 
> Right.
> > > +	if (!names)
> > > +		goto err_mem;
> > 
> > Since you have separate goto here it's more consistent
> > to use a separate label and two if tests above.
> > 
> 
> Sure.
> > > +
> > > +	/* Parameters for control virtqueue, if any */
> > > +	if (vi->has_cvq) {
> > > +		callbacks[total_vqs - 1] = NULL;
> > > +		names[total_vqs - 1] = kasprintf(GFP_KERNEL, "control");
> > > +	}
> > > 
> > > +	/* Allocate/initialize parameters for send/receive virtqueues */
> > > +	for (i = 0; i < vi->max_queue_pairs; i++) {
> > > +		callbacks[rxq2vq(i)] = skb_recv_done;
> > > +		callbacks[txq2vq(i)] = skb_xmit_done;
> > > +		names[rxq2vq(i)] = kasprintf(GFP_KERNEL, "input.%d", i);
> > > +		names[txq2vq(i)] = kasprintf(GFP_KERNEL, "output.%d", i);
> > > +	}
> > 
> > We would need to check kasprintf return value.
> 
> Looks like a better method is to make the name a member of receive_queue
> and send_queue, and use sprintf here.
> > Also if you allocate names from slab we'll need to free them
> > later.
> 
> Then it could be freed when the send_queue and receive_queue is freed.
> > It's probably easier to just use fixed names for now -
> > it's not like the index is really useful.
> 
> Looks useful for debugging e.g. check whether the irq distribution is as 
> expected.

Well it doesn't really matter which one goes where, right?
As long as interrupts are distributed well.

> > 
> > > +
> > > +	ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
> > > +					 (const char **)names);
> > 
> > Please avoid casts, use a proper type for names.
> 
> I think we need a minor change in this API: we need to allocate the names
> dynamically, so they could not be a const char **.

I don't see why. Any use that allocates on the fly as
you did would leak memory. Any use like you suggest
e.g. allocating as part of send/receive structure
would be fine.

> > 
> > > +	if (ret)
> > > +		goto err_names;
> > > +
> > > +	if (vi->has_cvq) {
> > > +		vi->cvq = vqs[total_vqs - 1];
> > > 
> > >  		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
> > >  		
> > >  			vi->dev->features |= NETIF_F_HW_VLAN_FILTER;
> > >  	
> > >  	}
> > > 
> > > +
> > > +	for (i = 0; i < vi->max_queue_pairs; i++) {
> > > +		vi->rq[i].vq = vqs[rxq2vq(i)];
> > > +		vi->sq[i].vq = vqs[txq2vq(i)];
> > > +	}
> > > +
> > > +	kfree(callbacks);
> > > +	kfree(vqs);
> > 
> > Who frees names if there's no error?
> > 
> 
> The virtio core does not copy the name, so it needs this, and it is only used
> for debugging, if I'm reading the code correctly.

No, virtio core does not free either individual vq name or the names
array passed in. So this leaks memory.

> > > +
> > > +	return 0;
> > > +
> > > +err_names:
> > > +	for (i = 0; i < total_vqs * 2; i++)
> > 
> > Why * 2?
> > This looks like a bug.
> 
> Yes, it's a bug.
> > 
> > > +		kfree(names[i]);
> > > +	kfree(names);
> > > +
> > > +err_mem:
> > > +	kfree(callbacks);
> > > +	kfree(vqs);
> > > +
> > > +	return ret;
> > > +}
> > > +
> > > +static int virtnet_alloc_queues(struct virtnet_info *vi)
> > > +{
> > > +	int i;
> > > +
> > > +	vi->sq = kzalloc(sizeof(vi->sq[0]) * vi->max_queue_pairs, GFP_KERNEL);
> > > +	vi->rq = kzalloc(sizeof(vi->rq[0]) * vi->max_queue_pairs, GFP_KERNEL);
> > 
> > While equivalent, *vi->rq is clearer IMHO.
> 
> Ok.
> > 
> > > +	if (!vi->rq || !vi->sq)
> > > +		goto err;
> > > +
> > > +	INIT_DELAYED_WORK(&vi->refill, refill_work);
> > > +	/* setup initial receive and send queue parameters */
> > 
> > Pls remove this comment, it confuses more than it clarifies.
> 
> Sure.
> > 
> > > +	for (i = 0; i < vi->max_queue_pairs; i++) {
> > > +		vi->rq[i].pages = NULL;
> > > +		netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
> > > +			       napi_weight);
> > > +
> > > +		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
> > > +		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
> > > +	}
> > > +
> > > +
> > 
> > Extra empty line.
> > 
> 
> Will remove this line.
> > > +	return 0;
> > > +
> > > +err:
> > > +	virtnet_free_queues(vi);
> > > +	return -ENOMEM;
> > > +}
> > > +
> > > +static int init_vqs(struct virtnet_info *vi)
> > > +{
> > > +	int ret;
> > > +
> > > +	/* Allocate send & receive queues */
> > > +	ret = virtnet_alloc_queues(vi);
> > > +	if (ret)
> > > +		goto err;
> > > +
> > > +	ret = virtnet_find_vqs(vi);
> > > +	if (ret)
> > > +		goto err_free;
> > > +
> > > +	virtnet_set_affinity(vi, true);
> > > 
> > >  	return 0;
> > > 
> > > +
> > > +err_free:
> > > +	virtnet_free_queues(vi);
> > > +err:
> > > +	return ret;
> > > 
> > >  }
> > >  
> > >  static int virtnet_probe(struct virtio_device *vdev)
> > >  {
> > > 
> > > -	int err;
> > > +	int i, err;
> > > 
> > >  	struct net_device *dev;
> > >  	struct virtnet_info *vi;
> > > 
> > > +	u16 max_queue_pairs;
> > > +
> > > +	/* Find if host supports multiqueue virtio_net device */
> > > +	err = virtio_config_val(vdev, VIRTIO_NET_F_RFS,
> > > +				offsetof(struct virtio_net_config,
> > > +				max_virtqueue_pairs), &max_queue_pairs);
> > > +
> > > +	/* We need at least 2 queue's */
> > > +	if (err || max_queue_pairs < VIRTIO_NET_CTRL_RFS_VQ_PAIRS_MIN ||
> > > +	    max_queue_pairs > VIRTIO_NET_CTRL_RFS_VQ_PAIRS_MAX)
> > 
> > Check has_cvq as well.
> > 
> 
> Sure.
> > > +		max_queue_pairs = 1;
> > > 
> > >  	/* Allocate ourselves a network device with room for our info */
> > > 
> > > -	dev = alloc_etherdev(sizeof(struct virtnet_info));
> > > +	dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
> > > 
> > >  	if (!dev)
> > >  	
> > >  		return -ENOMEM;
> > > 
> > > @@ -1127,22 +1386,17 @@ static int virtnet_probe(struct virtio_device
> > > *vdev)> 
> > >  	/* Set up our device-specific information */
> > >  	vi = netdev_priv(dev);
> > > 
> > > -	netif_napi_add(dev, &vi->rq.napi, virtnet_poll, napi_weight);
> > > 
> > >  	vi->dev = dev;
> > >  	vi->vdev = vdev;
> > >  	vdev->priv = vi;
> > > 
> > > -	vi->rq.pages = NULL;
> > > 
> > >  	vi->stats = alloc_percpu(struct virtnet_stats);
> > >  	err = -ENOMEM;
> > >  	if (vi->stats == NULL)
> > >  	
> > >  		goto free;
> > > 
> > > -	INIT_DELAYED_WORK(&vi->refill, refill_work);
> > > 
> > >  	mutex_init(&vi->config_lock);
> > >  	vi->config_enable = true;
> > >  	INIT_WORK(&vi->config_work, virtnet_config_changed_work);
> > > 
> > > -	sg_init_table(vi->rq.sg, ARRAY_SIZE(vi->rq.sg));
> > > -	sg_init_table(vi->sq.sg, ARRAY_SIZE(vi->sq.sg));
> > > 
> > >  	/* If we can receive ANY GSO packets, we must allocate large ones. */
> > >  	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
> > > 
> > > @@ -1153,10 +1407,21 @@ static int virtnet_probe(struct virtio_device
> > > *vdev)> 
> > >  	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
> > >  	
> > >  		vi->mergeable_rx_bufs = true;
> > > 
> > > +	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
> > > +		vi->has_cvq = true;
> > > +
> > > +	/* Use single tx/rx queue pair as default */
> > > +	vi->curr_queue_pairs = 1;
> > > +	vi->max_queue_pairs = max_queue_pairs;
> > > +
> > > +	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
> > > 
> > >  	err = init_vqs(vi);
> > >  	if (err)
> > >  	
> > >  		goto free_stats;
> > > 
> > > +	netif_set_real_num_tx_queues(dev, 1);
> > > +	netif_set_real_num_rx_queues(dev, 1);
> > > +
> > > 
> > >  	err = register_netdev(dev);
> > >  	if (err) {
> > >  	
> > >  		pr_debug("virtio_net: registering device failed\n");
> > > 
> > > @@ -1164,12 +1429,15 @@ static int virtnet_probe(struct virtio_device
> > > *vdev)> 
> > >  	}
> > >  	
> > >  	/* Last of all, set up some receive buffers. */
> > > 
> > > -	try_fill_recv(&vi->rq, GFP_KERNEL);
> > > -
> > > -	/* If we didn't even get one input buffer, we're useless. */
> > > -	if (vi->rq.num == 0) {
> > > -		err = -ENOMEM;
> > > -		goto unregister;
> > > +	for (i = 0; i < vi->max_queue_pairs; i++) {
> > > +		try_fill_recv(&vi->rq[i], GFP_KERNEL);
> > > +
> > > +		/* If we didn't even get one input buffer, we're useless. */
> > > +		if (vi->rq[i].num == 0) {
> > > +			free_unused_bufs(vi);
> > > +			err = -ENOMEM;
> > > +			goto free_recv_bufs;
> > > +		}
> > > 
> > >  	}
> > >  	
> > >  	/* Assume link up if device can't report link status,
> > > 
> > > @@ -1182,13 +1450,19 @@ static int virtnet_probe(struct virtio_device
> > > *vdev)> 
> > >  		netif_carrier_on(dev);
> > >  	
> > >  	}
> > > 
> > > -	pr_debug("virtnet: registered device %s\n", dev->name);
> > > +	pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
> > > +		 dev->name, max_queue_pairs);
> > > +
> > > 
> > >  	return 0;
> > > 
> > > -unregister:
> > > +free_recv_bufs:
> > > +	free_receive_bufs(vi);
> > > 
> > >  	unregister_netdev(dev);
> > > 
> > > +
> > > 
> > >  free_vqs:
> > > -	vdev->config->del_vqs(vdev);
> > > +	cancel_delayed_work_sync(&vi->refill);
> > > +	virtnet_del_vqs(vi);
> > > +
> > > 
> > >  free_stats:
> > >  	free_percpu(vi->stats);
> > >  
> > >  free:
> > > @@ -1196,28 +1470,6 @@ free:
> > >  	return err;
> > >  
> > >  }
> > > 
> > > -static void free_unused_bufs(struct virtnet_info *vi)
> > > -{
> > > -	void *buf;
> > > -	while (1) {
> > > -		buf = virtqueue_detach_unused_buf(vi->sq.vq);
> > > -		if (!buf)
> > > -			break;
> > > -		dev_kfree_skb(buf);
> > > -	}
> > > -	while (1) {
> > > -		buf = virtqueue_detach_unused_buf(vi->rq.vq);
> > > -		if (!buf)
> > > -			break;
> > > -		if (vi->mergeable_rx_bufs || vi->big_packets)
> > > -			give_pages(&vi->rq, buf);
> > > -		else
> > > -			dev_kfree_skb(buf);
> > > -		--vi->rq.num;
> > > -	}
> > > -	BUG_ON(vi->rq.num != 0);
> > > -}
> > > -
> > > 
> > >  static void remove_vq_common(struct virtnet_info *vi)
> > >  {
> > >  
> > >  	vi->vdev->config->reset(vi->vdev);
> > > 
> > > @@ -1225,10 +1477,9 @@ static void remove_vq_common(struct virtnet_info
> > > *vi)> 
> > >  	/* Free unused buffers in both send and recv, if any. */
> > >  	free_unused_bufs(vi);
> > > 
> > > -	vi->vdev->config->del_vqs(vi->vdev);
> > > +	free_receive_bufs(vi);
> > > 
> > > -	while (vi->rq.pages)
> > > -		__free_pages(get_a_page(&vi->rq, GFP_KERNEL), 0);
> > > +	virtnet_del_vqs(vi);
> > > 
> > >  }
> > >  
> > >  static void __devexit virtnet_remove(struct virtio_device *vdev)
> > > 
> > > @@ -1254,6 +1505,7 @@ static void __devexit virtnet_remove(struct
> > > virtio_device *vdev)> 
> > >  static int virtnet_freeze(struct virtio_device *vdev)
> > >  {
> > >  
> > >  	struct virtnet_info *vi = vdev->priv;
> > > 
> > > +	int i;
> > > 
> > >  	/* Prevent config work handler from accessing the device */
> > >  	mutex_lock(&vi->config_lock);
> > > 
> > > @@ -1264,7 +1516,10 @@ static int virtnet_freeze(struct virtio_device
> > > *vdev)> 
> > >  	cancel_delayed_work_sync(&vi->refill);
> > >  	
> > >  	if (netif_running(vi->dev))
> > > 
> > > -		napi_disable(&vi->rq.napi);
> > > +		for (i = 0; i < vi->max_queue_pairs; i++) {
> > > +			napi_disable(&vi->rq[i].napi);
> > > +			netif_napi_del(&vi->rq[i].napi);
> > > +		}
> > > 
> > >  	remove_vq_common(vi);
> > > 
> > > @@ -1276,24 +1531,29 @@ static int virtnet_freeze(struct virtio_device
> > > *vdev)> 
> > >  static int virtnet_restore(struct virtio_device *vdev)
> > >  {
> > >  
> > >  	struct virtnet_info *vi = vdev->priv;
> > > 
> > > -	int err;
> > > +	int err, i;
> > > 
> > >  	err = init_vqs(vi);
> > >  	if (err)
> > >  	
> > >  		return err;
> > >  	
> > >  	if (netif_running(vi->dev))
> > > 
> > > -		virtnet_napi_enable(&vi->rq);
> > > +		for (i = 0; i < vi->max_queue_pairs; i++)
> > > +			virtnet_napi_enable(&vi->rq[i]);
> > > 
> > >  	netif_device_attach(vi->dev);
> > > 
> > > -	if (!try_fill_recv(&vi->rq, GFP_KERNEL))
> > > -		schedule_delayed_work(&vi->refill, 0);
> > > +	for (i = 0; i < vi->max_queue_pairs; i++)
> > > +		if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
> > > +			schedule_delayed_work(&vi->refill, 0);
> > > 
> > >  	mutex_lock(&vi->config_lock);
> > >  	vi->config_enable = true;
> > >  	mutex_unlock(&vi->config_lock);
> > > 
> > > +	if (vi->has_cvq && virtio_has_feature(vi->vdev, VIRTIO_NET_F_RFS))
> > > +		virtnet_set_queues(vi);
> > > +
> > 
> > I think it's easier to test
> > if (curr_queue_pairs == max_queue_pairs)
> > within virtnet_set_queues and make it
> > a NOP if so.
> 
> Still need to send the command during restore since we reset the device during 
> freezing.


Then maybe check vi->has_cvq && virtio_has_feature(vi->vdev,
VIRTIO_NET_F_RFS) in there?

> > 
> > >  	return 0;
> > >  
> > >  }
> > >  #endif
> > > 
> > > @@ -1311,7 +1571,7 @@ static unsigned int features[] = {
> > > 
> > >  	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO,
> > >  	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ,
> > >  	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN,
> > > 
> > > -	VIRTIO_NET_F_GUEST_ANNOUNCE,
> > > +	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_RFS,
> > > 
> > >  };
> > >  
> > >  static struct virtio_driver virtio_net_driver = {
> > > 
> > > @@ -1329,6 +1589,12 @@ static struct virtio_driver virtio_net_driver = {
> > > 
> > >  #endif
> > >  };
> > > 
> > > +static const struct ethtool_ops virtnet_ethtool_ops = {
> > > +	.get_drvinfo = virtnet_get_drvinfo,
> > > +	.get_link = ethtool_op_get_link,
> > > +	.get_ringparam = virtnet_get_ringparam,
> > > +};
> > > +
> > > 
> > >  static int __init init(void)
> > >  {
> > >  
> > >  	return register_virtio_driver(&virtio_net_driver);
> > > 
> > > diff --git a/include/uapi/linux/virtio_net.h
> > > b/include/uapi/linux/virtio_net.h index 2470f54..6056cec 100644
> > > --- a/include/uapi/linux/virtio_net.h
> > > +++ b/include/uapi/linux/virtio_net.h
> > > @@ -51,6 +51,7 @@
> > > 
> > >  #define VIRTIO_NET_F_CTRL_RX_EXTRA 20	/* Extra RX mode control support 
> */
> > >  #define VIRTIO_NET_F_GUEST_ANNOUNCE 21	/* Guest can announce device on
> > >  the
> > >  
> > >  					 * network */
> > > 
> > > +#define VIRTIO_NET_F_RFS	22	/* Device supports multiple TXQ/RXQ */
> > 
> > Should be
> > /* Device supports Receive Flow Steering. */
> 
> Ok.
> > 
> > >  #define VIRTIO_NET_S_LINK_UP	1	/* Link is up */
> > >  #define VIRTIO_NET_S_ANNOUNCE	2	/* Announcement is needed */
> > > 
> > > @@ -60,6 +61,8 @@ struct virtio_net_config {
> > > 
> > >  	__u8 mac[6];
> > >  	/* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */
> > >  	__u16 status;
> > > 
> > > +	/* Total number of RX/TX queues */
> > 
> > Better comment:
> > /* Maximum number of each of transmit and receive queues;
> >  * see VIRTIO_NET_F_RFS and VIRTIO_NET_CTRL_RFS.
> >  * Legal values are between 1 and 0x8000
> >  */
> 
> Sure.
> > 
> > > +	__u16 max_virtqueue_pairs;
> > > 
> > >  } __attribute__((packed));
> > >  
> > >  /* This is the first element of the scatter-gather list.  If you don't
> > > 
> > > @@ -166,4 +169,17 @@ struct virtio_net_ctrl_mac {
> > > 
> > >  #define VIRTIO_NET_CTRL_ANNOUNCE       3
> > >  
> > >   #define VIRTIO_NET_CTRL_ANNOUNCE_ACK         0
> > > 
> > > +/*
> > > + * Control multiqueue
> > 
> > Here's a better comment:
> > 
> > Control Receive Flow Steering
> > 
> >  The command VIRTIO_NET_CTRL_RFS_VQ_PAIRS_SET
> >  enables Receive Flow Steering, specifying the number of the transmit and
> > receive queues that will be used.
> >  After the command is consumed and acked by the device,
> >  the device will not steer new packets on receive virtqueues
> >  other than specified nor read from transmit virtqueues other than
> > specified. Accordingly, driver should not transmit new packets
> >  on virtqueues other than specified.
> > 
> 
> Sure, thanks for the reminder.
> > > + *
> > 
> > Remove this empty line.
> 
> Ok.
> > 
> > > + */
> > > +struct virtio_net_ctrl_rfs {
> > 
> > /* Number of each of transmit and receive queues to use;
> >  * Legal values are between 1 and max_virtqueue_pairs
> >  */
> > 
> > > +	u16 virtqueue_pairs;
> > > +};
> > > +
> > > +#define VIRTIO_NET_CTRL_RFS   4
> > > + #define VIRTIO_NET_CTRL_RFS_VQ_PAIRS_SET        0
> > 
> > /* Value range for max_virtqueue_pairs and virtqueue_pairs above */
> > 
> > > + #define VIRTIO_NET_CTRL_RFS_VQ_PAIRS_MIN        1
> > > + #define VIRTIO_NET_CTRL_RFS_VQ_PAIRS_MAX        0x8000
> > > +
> > > 
> > >  #endif /* _LINUX_VIRTIO_NET_H */
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [RFC PATCH 2/2] tun: fix LSM/SELinux labeling of tun/tap devices
From: Michael S. Tsirkin @ 2012-12-04 15:24 UTC (permalink / raw)
  To: Jason Wang; +Cc: Paul Moore, netdev, linux-security-module, selinux
In-Reply-To: <7659411.O2Or69Bf6n@jason-thinkpad-t430s>

On Tue, Dec 04, 2012 at 09:24:43PM +0800, Jason Wang wrote:
> On Monday, December 03, 2012 11:22:29 AM Paul Moore wrote:
> > On Monday, December 03, 2012 06:15:42 PM Jason Wang wrote:
> > > On 11/30/2012 06:06 AM, Paul Moore wrote:
> > > > This patch corrects some problems with LSM/SELinux that were introduced
> > > > with the multiqueue patchset.  The problem stems from the fact that the
> > > > multiqueue work changed the relationship between the tun device and its
> > > > associated socket; before the socket persisted for the life of the
> > > > device, however after the multiqueue changes the socket only persisted
> > > > for the life of the userspace connection (fd open).  For non-persistent
> > > > devices this is not an issue, but for persistent devices this can cause
> > > > the tun device to lose its SELinux label.
> > > > 
> > > > We correct this problem by adding an opaque LSM security blob to the
> > > > tun device struct which allows us to have the LSM security state, e.g.
> > > > SELinux labeling information, persist for the lifetime of the tun
> > > > device.
> > 
> > ...
> > 
> > > > -static int selinux_tun_dev_attach(struct sock *sk)
> > > > +static int selinux_tun_dev_attach(struct sock *sk, void *security)
> > > > 
> > > >  {
> > > > 
> > > > +	struct tun_security_struct *tunsec = security;
> > > > 
> > > >  	struct sk_security_struct *sksec = sk->sk_security;
> > > >  	u32 sid = current_sid();
> > > >  	int err;
> > > > 
> > > > +	/* we don't currently perform any NetLabel based labeling here ...
> > > > 
> > > >  	err = avc_has_perm(sid, sksec->sid, SECCLASS_TUN_SOCKET,
> > > >  	
> > > >  			   TUN_SOCKET__RELABELFROM, NULL);
> > > >  	
> > > >  	if (err)
> > > >  	
> > > >  		return err;
> > > > 
> > > > -	err = avc_has_perm(sid, sid, SECCLASS_TUN_SOCKET,
> > > > +	err = avc_has_perm(sid, tunsec->sid, SECCLASS_TUN_SOCKET,
> > > > 
> > > >  			   TUN_SOCKET__RELABELTO, NULL);
> > > >  	
> > > >  	if (err)
> > > >  	
> > > >  		return err;
> > > > 
> > > > -	sksec->sid = sid;
> > > > +	sksec->sid = tunsec->sid;
> > > > +	sksec->sclass = SECCLASS_TUN_SOCKET;
> > > 
> > > I'm not sure whether this is correct, looks like we need to differ between
> > > TUNSETQUEUE and TUNSETIFF. When userspace call TUNSETIFF for persistent
> > > device, looks like we need change the sid of tunsec like in the past.
> > 
> > It may be that I'm misunderstanding TUNSETQUEUE and/or TUNSETIFF.  Can you
> > elaborate as to why they should be different?
> 
> If I understand correctly, before multiqueue patchset, TUNSETIFF is used to:
> 
> 1) Create the tun/tap network device
> 2) For persistent device, re-attach the fd to the network device / socket. In 
> this case, we call selinux_tun_dev_attach() to relabel the socket sid (in fact 
> also the device's since the socket were persistent also) to the sid of process 
> that calls TUNSETIFF.
> 
> So, after the changes of multiqueue, we need try to preserve those policy. The 
> interesting part is the introducing of TUNSETQUEUE, it's used to attach more 
> file descriptors/sockets to a tun/tap device after at least one file descriptor 
> were attached to the tun/tap device through TUNSETIFF. So I think maybe we 
> need differ those two ioctls. This patch looks fine for TUNSETQUEUE, but for 
> TUNSETIFF, we need relabel the tunsec to the process that calling TUNSETIFF 
> for persistent device?

Basically, it looks like currently once you get a tun fd,
you can attach it to any device even if normally
selinux would prevent you from accessing it.

If we reuse selinux_tun_dev_attach, we won't need to
change selinux policy, with a new capability we will need to change it
to allow libvirt to do TUNSETQUEUE.


> 
> btw. Current code does allow calling TUNSETQUEUE to a persistent tun/tap 
> device with no file attached. It should be a bug and need to be fixed.

Is this a problem? You can always
attach
set queue
detach

and it would be hard to prevent this ...

> > 
> > One thing that I think we probably should change is the relabelto/from
> > permissions in the function above (selinux_tun_dev_attach()); in the case
> > where the socket does not yet have a label, e.g. 'sksec->sid == 0', we
> > should probably skip the relabel permissions since we want to assign the
> > TUN device label regardless in this case.
> 
> I'm not familiar with the selinux, have a quick glance of the code, looks like 
> the label has been initialized to SECINITSID_KERNEL in 
> selinux_socket_post_create().
> 
> Thanks

^ permalink raw reply

* Re: [RFT PATCH] 8139cp: properly support change of MTU values [v2]
From: David Woodhouse @ 2012-12-04 15:44 UTC (permalink / raw)
  To: Francois Romieu; +Cc: jogreene, David Miller, netdev
In-Reply-To: <20121203204608.GA9815@electric-eye.fr.zoreil.com>

[-- Attachment #1: Type: text/plain, Size: 934 bytes --]

On Mon, 2012-12-03 at 21:46 +0100, Francois Romieu wrote:
> David Miller <davem@davemloft.net> :
> [...]
> > I've applied this to net-next, if it triggers any problems we have
> > some time to work it out before 3.8 is released.
> 
> I have bounced the messages to David Woodhouse since he authored the
> last 8139cp changes in net-next and owns the hardware to notice
> regressions.

Thanks. 

This almost works. The patch itself is fine, but the device can't
receive packets larger than 2266 bytes (ping -s 2238). After that, I get
rx_fifo errors. I think the RX FIFO is only 2KiB on the 8139cp, isn't
it? So after that it's dependent on how fast it can shovel it out across
the PCI bus. Which is "not fast" in this case.

Transmit appears to be fine.

-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation




[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 6171 bytes --]

^ permalink raw reply

* Re: [RFC PATCH 2/2] tun: fix LSM/SELinux labeling of tun/tap devices
From: Paul Moore @ 2012-12-04 16:18 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-security-module, selinux, mst
In-Reply-To: <7659411.O2Or69Bf6n@jason-thinkpad-t430s>

On Tuesday, December 04, 2012 09:24:43 PM Jason Wang wrote:
> On Monday, December 03, 2012 11:22:29 AM Paul Moore wrote:
> > It may be that I'm misunderstanding TUNSETQUEUE and/or TUNSETIFF.  Can you
> > elaborate as to why they should be different?
> 
> If I understand correctly, before multiqueue patchset, TUNSETIFF is used to:
> 
> 1) Create the tun/tap network device
> 2) For persistent device, re-attach the fd to the network device / socket.
> In this case, we call selinux_tun_dev_attach() to relabel the socket sid (in
> fact also the device's since the socket were persistent also) to the sid of
> process that calls TUNSETIFF.
> 
> So, after the changes of multiqueue, we need try to preserve those policy.
> The interesting part is the introducing of TUNSETQUEUE, it's used to attach
> more file descriptors/sockets to a tun/tap device after at least one file
> descriptor were attached to the tun/tap device through TUNSETIFF. So I
> think maybe we need differ those two ioctls. This patch looks fine for
> TUNSETQUEUE, but for TUNSETIFF, we need relabel the tunsec to the process
> that calling TUNSETIFF for persistent device?

Okay, based on your explanation of TUNSETQUEUE, the steps below are what I 
believe we need to do ... if you disagree speak up quickly please.

A. TUNSETIFF (new, non-persistent device)

[Allocate and initialize the tun_struct LSM state based on the calling 
process, use this state to label the TUN socket.]

1. Call security_tun_dev_create() which authorizes the action.
2. Call security_tun_dev_alloc_security() which allocates the tun_struct LSM 
blob and SELinux sets some internal blob state to record the label of the 
calling process.
3. Call security_tun_dev_attach() which sets the label of the TUN socket to 
match the label stored in the tun_struct LSM blob during A2.  No authorization 
is done at this point since the socket is new/unlabeled.

B. TUNSETIFF (existing, persistent device)

[Relabel the existing tun_struct LSM state based on the calling process, use
this state to label the TUN socket.]

1. Attempt to relabel/reset the tun_struct LSM blob from the currently stored 
value, set during A2, to the label of the current calling process. *** THIS IS 
NOT CURRENTLY DONE IN THE RFC PATCH ***
2. Call security_tun_dev_attach() which sets the label of the TUN socket to
match the label stored in the tun_struct LSM blob during B1. No authorization 
is done at this point since the socket is new/unlabeled.

C. TUNSETQUEUE

[Use the existing tun_struct LSM state to label the new TUN socket.]

1. Call security_tun_dev_attach() which sets the label of the TUN socket to 
match the label stored in the tun_struct LSM blob set during either A2 or B1.  
No authorization is done at this point since the socket is new/unlabeled.

> btw. Current code does allow calling TUNSETQUEUE to a persistent tun/tap
> device with no file attached. It should be a bug and need to be fixed.

Since you wrote that code will you be submitting a patch to fix that problem?

> > One thing that I think we probably should change is the relabelto/from
> > permissions in the function above (selinux_tun_dev_attach()); in the case
> > where the socket does not yet have a label, e.g. 'sksec->sid == 0', we
> > should probably skip the relabel permissions since we want to assign the
> > TUN device label regardless in this case.
> 
> I'm not familiar with the selinux, have a quick glance of the code, looks
> like the label has been initialized to SECINITSID_KERNEL in
> selinux_socket_post_create().

Unless I've missed something in your changes, the multiqueue code never calls 
any socket code which ends up calling {security,selinux}_socket_post_create(); 
I believe you only call sk_alloc() which ends up calling 
{security,selinux}_sk_alloc() which sets SECINITSID_UNLABELED (I mistakenly 
wrote 0 instead in my earlier email which is technically SECSID_NULL).  Either 
way, I still think the logic I originally described above is correct.

-- 
paul moore
security and virtualization @ redhat

^ permalink raw reply

* Re: [PATCH net-next v2] bridge: implement multicast fast leave
From: Stephen Hemminger @ 2012-12-04 16:44 UTC (permalink / raw)
  To: Cong Wang; +Cc: netdev, bridge, Herbert Xu, David S. Miller
In-Reply-To: <1354615000-19660-1-git-send-email-amwang@redhat.com>

On Tue,  4 Dec 2012 17:56:40 +0800
Cong Wang <amwang@redhat.com> wrote:

> V2: make the toggle per-port
> 
> Fast leave allows the bridge to immediately stop the multicast
> traffic on a port that receives an IGMP Leave when IGMP snooping is
> enabled; no timeouts are observed.
> 
> Cc: Herbert Xu <herbert@gondor.apana.org.au>
> Cc: Stephen Hemminger <shemminger@vyatta.com>
> Cc: "David S. Miller" <davem@davemloft.net>
> Signed-off-by: Cong Wang <amwang@redhat.com>

Ok. as is.

A good followup change would be to migrate
these multicast flags into the existing flags bits (see hairpin etc),
and make them manageable via netlink.

^ permalink raw reply

* Re: [PATCH net-next v2] bridge: export multicast database via netlink
From: Stephen Hemminger @ 2012-12-04 16:53 UTC (permalink / raw)
  To: Cong Wang
  Cc: netdev, bridge, Herbert Xu, Jesper Dangaard Brouer, Thomas Graf,
	David S. Miller
In-Reply-To: <1354539824-7898-1-git-send-email-amwang@redhat.com>

On Mon,  3 Dec 2012 21:03:43 +0800
Cong Wang <amwang@redhat.com> wrote:

> V2: drop patch 1/2, export ifindex directly
>     Redesign netlink attributes
>     Improve netlink seq check
>     Handle IPv6 addr as well
> 
> TODO: remove debugging printk's
> 
> This patch exports bridge multicast database via netlink
> message type RTM_GETMDB. Similar to fdb, but currently bridge-specific.
> We may need to support modify multicast database too (RTM_{ADD,DEL}MDB).
> 
> Cc: Herbert Xu <herbert@gondor.apana.org.au>
> Cc: Stephen Hemminger <shemminger@vyatta.com>
> Cc: "David S. Miller" <davem@davemloft.net>
> Cc: Thomas Graf <tgraf@suug.ch>
> Cc: Jesper Dangaard Brouer <brouer@redhat.com>
> Signed-off-by: Cong Wang <amwang@redhat.com>
>     

Minor nit reported by checkpatch was the messages should be using the api
which provides the most info in the log to identify.

WARNING: Prefer netdev_info(netdev, ... then dev_info(dev, ... then pr_info(...  to printk(KERN_INFO ...
#190: FILE: net/bridge/br_mdb.c:28:
+		printk(KERN_INFO "no router on bridge\n")

There is a set of macros already for use in bridging code:
                br_info(br, "no router on bridge\n");

^ permalink raw reply

* Re: [PATCH] vxlan: Fix error that was resulting in VXLAN MTU size being 10 bytes too large
From: Alexander Duyck @ 2012-12-04 17:12 UTC (permalink / raw)
  To: Naoto MATSUMOTO; +Cc: Joseph Glanville, Stephen Hemminger, David Miller, netdev
In-Reply-To: <20121204094809.9689.C42C3789@sakura.ad.jp>

On 12/03/2012 04:48 PM, Naoto MATSUMOTO wrote:
> 
> Hi all
> 
> Sharing my testlab result with you ;-)
> 
> A First Look At VXLAN over Infiniband Network On Linux 3.7-rc7
> http://slidesha.re/TsCKWc
> 
> Please enjoy it.
> 
> --
> Naoto

What was the MTU for the underlying IPoIB devices in your test?  Just
wondering because the MTU on the VXLAN is showing as 65470 which would
imply the IPoIB devices should have a 65520 MTU and I just wanted to
confirm that.

Thanks,

Alex

^ permalink raw reply

* Re: [PATCH v2] net/macb: Use non-coherent memory for rx buffers
From: Nicolas Ferre @ 2012-12-04 17:16 UTC (permalink / raw)
  To: David Laight
  Cc: David S. Miller, netdev, linux-arm-kernel, linux-kernel,
	Joachim Eastwood, Jean-Christophe PLAGNIOL-VILLARD,
	Havard Skinnemoen
In-Reply-To: <AE90C24D6B3A694183C094C60CF0A2F6026B70D6@saturn3.aculab.com>

On 12/03/2012 03:25 PM, David Laight :
>> On 12/03/2012 01:43 PM, David Laight :
>>>> Allocate regular pages to use as backing for the RX ring and use the
>>>> DMA API to sync the caches. This should give a bit better performance
>>>> since it allows the CPU to do burst transfers from memory. It is also
>>>> a necessary step on the way to reduce the amount of copying done by
>>>> the driver.
>>>
>>> I've not tried to understand the patches, but you have to be
>>> very careful using non-snooped memory for descriptor rings.
>>> No amount of DMA API calls can sort out some of the issues.
>>
>> David,
>>
>> Maybe I have not described the patch properly but the non-coherent
>> memory is not used for descriptor rings. It is used for DMA buffers
>> pointed out by descriptors (that are allocated as coherent memory).
>>
>> As buffers are filled up by the interface DMA and then, afterwards, used
>> by the driver to pass data to the net layer, it seems to me that the use
>> of non-coherent memory is sensible.
> 
> Ah, ok - difficult to actually determine from a fast read of the code.
> So you invalidate (I think that is the right term) all the cache lines
> that are part of each rx buffer before giving it back to the MAC unit.
> (Maybe that first time, and just those cache lines that might have been
> written to after reception - I'd worry about whether the CRC is written
> into the rx buffer!)

If I understand well, you mean that the call to:

		dma_sync_single_range_for_device(&bp->pdev->dev, phys,
				pg_offset, frag_len, DMA_FROM_DEVICE);

in the rx path after having copied the data to skb is not needed?
That is also the conclusion that I reached after thinking about
this again... I will check this.

For the CRC, my driver is not using the CRC offloading feature for the
moment. So no CRC is written by the device.

> I was wondering if the code needs to do per page allocations?
> Perhaps that is necessary to avoid needing a large block of
> contiguous physical memory (and virtual addresses)?

The page management seems interesting for future management of RX
buffers as skb fragments: that will allow to avoid copying received data.

> I know from some experiments done many years ago that a data
> copy in the MAC tx and rx path isn't necessarily as bad as
> people may think - especially if it removes complicated
> 'buffer loaning' schemes and/or iommu setup (or bounce
> buffers due to limited hardware memory addressing).
> 
> The rx copy can usually be made to be a 'whole word' copy
> (ie you copy the two bytes of garbage that (mis)align the
> destination MAC address, and some bytes after the CRC.
> With some hardware I believe it is possible for the cache
> controller to do cache-line aligned copies very quickly!
> (Some very new x86 cpus might be doing this for 'rep movsd'.)

Well, on our side, the "memory bus" resource is precious, so I imagine
that even with an optimized copy, limiting the use of this resource
should be better.

> The copy in the rx path is also better for short packets
> that can end up queued for userspace (although a copy in
> the socket code would solve that one).

Sure, some patches by Haavard that I am working on at the moment are
taking care of copying in any cases the first 62 bytes (+2 bytes
alignment) for each packet so that we cover the case of short packets
and headers...

Thanks for your comments, best regards,
-- 
Nicolas Ferre

^ permalink raw reply

* Re: [PATCH] net/macb: Use dmapool to align descriptors on 64bits
From: Nicolas Ferre @ 2012-12-04 17:18 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: David S. Miller, netdev, linux-arm-kernel, linux-kernel,
	Joachim Eastwood, Jean-Christophe PLAGNIOL-VILLARD
In-Reply-To: <1354543290.7030.26.camel@deadeye.wl.decadent.org.uk>

On 12/03/2012 03:01 PM, Ben Hutchings :
> On Mon, 2012-12-03 at 13:15 +0100, Nicolas Ferre wrote:
>> Depending on datapath, some revisions of GEM need
>> 64bits aligned descriptors. Use dmapool to allocate
>> these descriptors.
>> Note that different size between RX and TX rings
>> leads to the creation of two pools.
>>
>> Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
> [...]
> 
> dma_alloc_coherent() allocates whole pages, which I think is quite
> enough alignment.  You can't save memory by doing this, since each pool
> needs at least one page.  So what is this change meant to achieve?

Well... I guess we can forget this patch then ;-)

Thanks, best regards,
-- 
Nicolas Ferre

^ permalink raw reply

* Re: userspace utils for linux-can
From: Marc Kleine-Budde @ 2012-12-04 17:19 UTC (permalink / raw)
  To: ftpadmin
  Cc: Oliver Hartkopp, Wolfgang Grandegger, David Miller,
	Linux Netdev List
In-Reply-To: <50B4D2B8.8020707@pengutronix.de>

[-- Attachment #1: Type: text/plain, Size: 657 bytes --]

On 11/27/2012 03:48 PM, Marc Kleine-Budde wrote:
> Hello Kernel.org Admins,
> 
> I'd like to put the tarballs of the linux-can userspace testing
> utilities on the kernel.org infrastructure, is this possible? I was
> thinking about somewhere under:
> 
> http://kernel.org/pub/linux/utils/net
> 
> A possible directory name could be "can-utils"

ping

regards,
MArc

-- 
Pengutronix e.K.                  | Marc Kleine-Budde           |
Industrial Linux Solutions        | Phone: +49-231-2826-924     |
Vertretung West/Dortmund          | Fax:   +49-5121-206917-5555 |
Amtsgericht Hildesheim, HRA 2686  | http://www.pengutronix.de   |


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 261 bytes --]

^ permalink raw reply

* [PATCH 6/6] netfilter: nf_nat: Handle routing changes in MASQUERADE target
From: pablo @ 2012-12-04 17:31 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1354642293-4114-1-git-send-email-pablo@netfilter.org>

From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>

When the route changes (backup default route, VPNs) which affect a
masqueraded target, the packets were sent out with the outdated source
address. The patch addresses the issue by comparing the outgoing interface
directly with the masqueraded interface in the nat table.

Events are inefficient in this case, because it'd require adding route
events to the network core and then scanning the whole conntrack table
and re-checking the route for every entry.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_nat.h    |   15 +++++++++++++++
 net/ipv4/netfilter/iptable_nat.c  |    4 ++++
 net/ipv6/netfilter/ip6table_nat.c |    4 ++++
 3 files changed, 23 insertions(+)

diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
index bd8eea7..ad14a79 100644
--- a/include/net/netfilter/nf_nat.h
+++ b/include/net/netfilter/nf_nat.h
@@ -68,4 +68,19 @@ static inline struct nf_conn_nat *nfct_nat(const struct nf_conn *ct)
 #endif
 }
 
+static inline bool nf_nat_oif_changed(unsigned int hooknum,
+				      enum ip_conntrack_info ctinfo,
+				      struct nf_conn_nat *nat,
+				      const struct net_device *out)
+{
+#if IS_ENABLED(CONFIG_IP_NF_TARGET_MASQUERADE) || \
+    IS_ENABLED(CONFIG_IP6_NF_TARGET_MASQUERADE)
+	return nat->masq_index && hooknum == NF_INET_POST_ROUTING &&
+	       CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL &&
+	       nat->masq_index != out->ifindex;
+#else
+	return false;
+#endif
+}
+
 #endif
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index ac635a7..da2c8a3 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -134,6 +134,10 @@ nf_nat_ipv4_fn(unsigned int hooknum,
 		/* ESTABLISHED */
 		NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
 			     ctinfo == IP_CT_ESTABLISHED_REPLY);
+		if (nf_nat_oif_changed(hooknum, ctinfo, nat, out)) {
+			nf_ct_kill_acct(ct, ctinfo, skb);
+			return NF_DROP;
+		}
 	}
 
 	return nf_nat_packet(ct, ctinfo, hooknum, skb);
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index fa84cf8..6c8ae24 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -137,6 +137,10 @@ nf_nat_ipv6_fn(unsigned int hooknum,
 		/* ESTABLISHED */
 		NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
 			     ctinfo == IP_CT_ESTABLISHED_REPLY);
+		if (nf_nat_oif_changed(hooknum, ctinfo, nat, out)) {
+			nf_ct_kill_acct(ct, ctinfo, skb);
+			return NF_DROP;
+		}
 	}
 
 	return nf_nat_packet(ct, ctinfo, hooknum, skb);
-- 
1.7.10.4


^ permalink raw reply related

* [PATCH 0/6] netfilter updates for net-next
From: pablo @ 2012-12-04 17:31 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev

From: Pablo Neira Ayuso <pablo@netfilter.org>

Hi David,

If time allows, please pull the following 6 updates for your
net-next tree:

* Remove limitation in the maximum number of supported sets in ipset.
  Now ipset automagically increments the number of slots in the array
  of sets by 64 new spare slots, from Jozsef Kadlecsik.

* Partially remove the generic queue infrastructure now that ip_queue
  is gone. Its only client is nfnetlink_queue now, from Florian
  Westphal.

* Add missing attribute policy checkings in ctnetlink, from Florian
  Westphal.

* Automagically kill conntrack entries that use the wrong output
  interface for the masquerading case in case of routing changes,
  from Jozsef Kadlecsik.

* Two patches to improve ct object traceability. Now ct objects are
  always placed in any of the existing lists. This allows us to dump
  the content of unconfirmed and dying conntracks via ctnetlink as
  a way to provide more instrumentation in case you suspect leaks,
  from myself.

You can pull these changes from:

git://1984.lsi.us.es/nf-next master

Thanks!

Florian Westphal (2):
  netfilter: kill support for per-af queue backends
  netfilter: ctnetlink: nla_policy updates

Jozsef Kadlecsik (2):
  netfilter: ipset: Increase the number of maximal sets automatically
  netfilter: nf_nat: Handle routing changes in MASQUERADE target

Pablo Neira Ayuso (2):
  netfilter: nf_conntrack: improve nf_conn object traceability
  netfilter: ctnetlink: dump entries from the dying and unconfirmed lists

 include/net/netfilter/nf_conntrack.h               |    2 +-
 include/net/netfilter/nf_nat.h                     |   15 ++
 include/net/netfilter/nf_queue.h                   |    8 +-
 include/uapi/linux/netfilter/nfnetlink_conntrack.h |    2 +
 net/ipv4/netfilter/iptable_nat.c                   |    4 +
 net/ipv6/netfilter/ip6table_nat.c                  |    4 +
 net/netfilter/core.c                               |    2 -
 net/netfilter/ipset/ip_set_core.c                  |  243 +++++++++++++-------
 net/netfilter/nf_conntrack_core.c                  |   25 +-
 net/netfilter/nf_conntrack_netlink.c               |  118 +++++++++-
 net/netfilter/nf_conntrack_proto_tcp.c             |    2 +
 net/netfilter/nf_queue.c                           |  152 ++----------
 net/netfilter/nfnetlink_queue_core.c               |   14 +-
 13 files changed, 332 insertions(+), 259 deletions(-)

--
1.7.10.4

^ permalink raw reply

* [PATCH 2/6] netfilter: nf_conntrack: improve nf_conn object traceability
From: pablo @ 2012-12-04 17:31 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1354642293-4114-1-git-send-email-pablo@netfilter.org>

From: Pablo Neira Ayuso <pablo@netfilter.org>

This patch modifies the conntrack subsystem so that all existing
allocated conntrack objects can be found in any of the following
places:

* the hash table, this is the typical place for alive conntrack objects.
* the unconfirmed list, this is the place for newly created conntrack objects
  that are still traversing the stack.
* the dying list, this is where you can find conntrack objects that are dying
  or that should die anytime soon (eg. once the destroy event is delivered to
  the conntrackd daemon).

Thus, we make sure that we follow the track for all existing conntrack
objects. This patch, together with some extension of the ctnetlink interface
to dump the content of the dying and unconfirmed lists, will help in case
to debug suspected nf_conn object leaks.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack.h |    2 +-
 net/netfilter/nf_conntrack_core.c    |   25 +++++++++----------------
 net/netfilter/nf_conntrack_netlink.c |    2 +-
 3 files changed, 11 insertions(+), 18 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index f1494fe..caca0c4 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -182,7 +182,7 @@ __nf_conntrack_find(struct net *net, u16 zone,
 
 extern int nf_conntrack_hash_check_insert(struct nf_conn *ct);
 extern void nf_ct_delete_from_lists(struct nf_conn *ct);
-extern void nf_ct_insert_dying_list(struct nf_conn *ct);
+extern void nf_ct_dying_timeout(struct nf_conn *ct);
 
 extern void nf_conntrack_flush_report(struct net *net, u32 pid, int report);
 
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 0f241be..af17516 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -221,11 +221,9 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	 * too. */
 	nf_ct_remove_expectations(ct);
 
-	/* We overload first tuple to link into unconfirmed list. */
-	if (!nf_ct_is_confirmed(ct)) {
-		BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
-		hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
-	}
+	/* We overload first tuple to link into unconfirmed or dying list.*/
+	BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
+	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
 
 	NF_CT_STAT_INC(net, delete);
 	spin_unlock_bh(&nf_conntrack_lock);
@@ -247,6 +245,9 @@ void nf_ct_delete_from_lists(struct nf_conn *ct)
 	 * Otherwise we can get spurious warnings. */
 	NF_CT_STAT_INC(net, delete_list);
 	clean_from_lists(ct);
+	/* add this conntrack to the dying list */
+	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+			     &net->ct.dying);
 	spin_unlock_bh(&nf_conntrack_lock);
 }
 EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists);
@@ -268,31 +269,23 @@ static void death_by_event(unsigned long ul_conntrack)
 	}
 	/* we've got the event delivered, now it's dying */
 	set_bit(IPS_DYING_BIT, &ct->status);
-	spin_lock(&nf_conntrack_lock);
-	hlist_nulls_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
-	spin_unlock(&nf_conntrack_lock);
 	nf_ct_put(ct);
 }
 
-void nf_ct_insert_dying_list(struct nf_conn *ct)
+void nf_ct_dying_timeout(struct nf_conn *ct)
 {
 	struct net *net = nf_ct_net(ct);
 	struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct);
 
 	BUG_ON(ecache == NULL);
 
-	/* add this conntrack to the dying list */
-	spin_lock_bh(&nf_conntrack_lock);
-	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
-			     &net->ct.dying);
-	spin_unlock_bh(&nf_conntrack_lock);
 	/* set a new timer to retry event delivery */
 	setup_timer(&ecache->timeout, death_by_event, (unsigned long)ct);
 	ecache->timeout.expires = jiffies +
 		(random32() % net->ct.sysctl_events_retry_timeout);
 	add_timer(&ecache->timeout);
 }
-EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list);
+EXPORT_SYMBOL_GPL(nf_ct_dying_timeout);
 
 static void death_by_timeout(unsigned long ul_conntrack)
 {
@@ -307,7 +300,7 @@ static void death_by_timeout(unsigned long ul_conntrack)
 	    unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
 		/* destroy event was not delivered */
 		nf_ct_delete_from_lists(ct);
-		nf_ct_insert_dying_list(ct);
+		nf_ct_dying_timeout(ct);
 		return;
 	}
 	set_bit(IPS_DYING_BIT, &ct->status);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 7bbfb3d..34370a9 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -989,7 +989,7 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
 					      nlmsg_report(nlh)) < 0) {
 			nf_ct_delete_from_lists(ct);
 			/* we failed to report the event, try later */
-			nf_ct_insert_dying_list(ct);
+			nf_ct_dying_timeout(ct);
 			nf_ct_put(ct);
 			return 0;
 		}
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 1/6] netfilter: ipset: Increase the number of maximal sets automatically
From: pablo @ 2012-12-04 17:31 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1354642293-4114-1-git-send-email-pablo@netfilter.org>

From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>

The max number of sets was hardcoded at kernel configuration time and
could only be modified via a module parameter. The patch adds the support
of increasing the max number of sets automatically, as needed.

The array of sets is incremented by 64 new slots if we run out of
empty slots. The absolute limit for the maximal number of sets
is 65534.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipset/ip_set_core.c |  243 ++++++++++++++++++++++++-------------
 1 file changed, 160 insertions(+), 83 deletions(-)

diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index fed899f..6d6d8f2 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -28,9 +28,10 @@ static LIST_HEAD(ip_set_type_list);		/* all registered set types */
 static DEFINE_MUTEX(ip_set_type_mutex);		/* protects ip_set_type_list */
 static DEFINE_RWLOCK(ip_set_ref_lock);		/* protects the set refs */
 
-static struct ip_set **ip_set_list;		/* all individual sets */
+static struct ip_set * __rcu *ip_set_list;	/* all individual sets */
 static ip_set_id_t ip_set_max = CONFIG_IP_SET_MAX; /* max number of sets */
 
+#define IP_SET_INC	64
 #define STREQ(a, b)	(strncmp(a, b, IPSET_MAXNAMELEN) == 0)
 
 static unsigned int max_sets;
@@ -42,6 +43,12 @@ MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
 MODULE_DESCRIPTION("core IP set support");
 MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
 
+/* When the nfnl mutex is held: */
+#define nfnl_dereference(p)		\
+	rcu_dereference_protected(p, 1)
+#define nfnl_set(id)			\
+	nfnl_dereference(ip_set_list)[id]
+
 /*
  * The set types are implemented in modules and registered set types
  * can be found in ip_set_type_list. Adding/deleting types is
@@ -321,19 +328,19 @@ EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6);
  */
 
 static inline void
-__ip_set_get(ip_set_id_t index)
+__ip_set_get(struct ip_set *set)
 {
 	write_lock_bh(&ip_set_ref_lock);
-	ip_set_list[index]->ref++;
+	set->ref++;
 	write_unlock_bh(&ip_set_ref_lock);
 }
 
 static inline void
-__ip_set_put(ip_set_id_t index)
+__ip_set_put(struct ip_set *set)
 {
 	write_lock_bh(&ip_set_ref_lock);
-	BUG_ON(ip_set_list[index]->ref == 0);
-	ip_set_list[index]->ref--;
+	BUG_ON(set->ref == 0);
+	set->ref--;
 	write_unlock_bh(&ip_set_ref_lock);
 }
 
@@ -344,12 +351,25 @@ __ip_set_put(ip_set_id_t index)
  * so it can't be destroyed (or changed) under our foot.
  */
 
+static inline struct ip_set *
+ip_set_rcu_get(ip_set_id_t index)
+{
+	struct ip_set *set;
+
+	rcu_read_lock();
+	/* ip_set_list itself needs to be protected */
+	set = rcu_dereference(ip_set_list)[index];
+	rcu_read_unlock();
+
+	return set;
+}
+
 int
 ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
 	    const struct xt_action_param *par,
 	    const struct ip_set_adt_opt *opt)
 {
-	struct ip_set *set = ip_set_list[index];
+	struct ip_set *set = ip_set_rcu_get(index);
 	int ret = 0;
 
 	BUG_ON(set == NULL);
@@ -388,7 +408,7 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
 	   const struct xt_action_param *par,
 	   const struct ip_set_adt_opt *opt)
 {
-	struct ip_set *set = ip_set_list[index];
+	struct ip_set *set = ip_set_rcu_get(index);
 	int ret;
 
 	BUG_ON(set == NULL);
@@ -411,7 +431,7 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
 	   const struct xt_action_param *par,
 	   const struct ip_set_adt_opt *opt)
 {
-	struct ip_set *set = ip_set_list[index];
+	struct ip_set *set = ip_set_rcu_get(index);
 	int ret = 0;
 
 	BUG_ON(set == NULL);
@@ -440,14 +460,17 @@ ip_set_get_byname(const char *name, struct ip_set **set)
 	ip_set_id_t i, index = IPSET_INVALID_ID;
 	struct ip_set *s;
 
+	rcu_read_lock();
 	for (i = 0; i < ip_set_max; i++) {
-		s = ip_set_list[i];
+		s = rcu_dereference(ip_set_list)[i];
 		if (s != NULL && STREQ(s->name, name)) {
-			__ip_set_get(i);
+			__ip_set_get(s);
 			index = i;
 			*set = s;
+			break;
 		}
 	}
+	rcu_read_unlock();
 
 	return index;
 }
@@ -462,8 +485,13 @@ EXPORT_SYMBOL_GPL(ip_set_get_byname);
 void
 ip_set_put_byindex(ip_set_id_t index)
 {
-	if (ip_set_list[index] != NULL)
-		__ip_set_put(index);
+	struct ip_set *set;
+
+	rcu_read_lock();
+	set = rcu_dereference(ip_set_list)[index];
+	if (set != NULL)
+		__ip_set_put(set);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(ip_set_put_byindex);
 
@@ -477,7 +505,7 @@ EXPORT_SYMBOL_GPL(ip_set_put_byindex);
 const char *
 ip_set_name_byindex(ip_set_id_t index)
 {
-	const struct ip_set *set = ip_set_list[index];
+	const struct ip_set *set = ip_set_rcu_get(index);
 
 	BUG_ON(set == NULL);
 	BUG_ON(set->ref == 0);
@@ -501,11 +529,18 @@ EXPORT_SYMBOL_GPL(ip_set_name_byindex);
 ip_set_id_t
 ip_set_nfnl_get(const char *name)
 {
+	ip_set_id_t i, index = IPSET_INVALID_ID;
 	struct ip_set *s;
-	ip_set_id_t index;
 
 	nfnl_lock();
-	index = ip_set_get_byname(name, &s);
+	for (i = 0; i < ip_set_max; i++) {
+		s = nfnl_set(i);
+		if (s != NULL && STREQ(s->name, name)) {
+			__ip_set_get(s);
+			index = i;
+			break;
+		}
+	}
 	nfnl_unlock();
 
 	return index;
@@ -521,12 +556,15 @@ EXPORT_SYMBOL_GPL(ip_set_nfnl_get);
 ip_set_id_t
 ip_set_nfnl_get_byindex(ip_set_id_t index)
 {
+	struct ip_set *set;
+
 	if (index > ip_set_max)
 		return IPSET_INVALID_ID;
 
 	nfnl_lock();
-	if (ip_set_list[index])
-		__ip_set_get(index);
+	set = nfnl_set(index);
+	if (set)
+		__ip_set_get(set);
 	else
 		index = IPSET_INVALID_ID;
 	nfnl_unlock();
@@ -545,8 +583,11 @@ EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex);
 void
 ip_set_nfnl_put(ip_set_id_t index)
 {
+	struct ip_set *set;
 	nfnl_lock();
-	ip_set_put_byindex(index);
+	set = nfnl_set(index);
+	if (set != NULL)
+		__ip_set_put(set);
 	nfnl_unlock();
 }
 EXPORT_SYMBOL_GPL(ip_set_nfnl_put);
@@ -603,41 +644,46 @@ static const struct nla_policy ip_set_create_policy[IPSET_ATTR_CMD_MAX + 1] = {
 	[IPSET_ATTR_DATA]	= { .type = NLA_NESTED },
 };
 
-static ip_set_id_t
-find_set_id(const char *name)
+static struct ip_set *
+find_set_and_id(const char *name, ip_set_id_t *id)
 {
-	ip_set_id_t i, index = IPSET_INVALID_ID;
-	const struct ip_set *set;
+	struct ip_set *set = NULL;
+	ip_set_id_t i;
 
-	for (i = 0; index == IPSET_INVALID_ID && i < ip_set_max; i++) {
-		set = ip_set_list[i];
-		if (set != NULL && STREQ(set->name, name))
-			index = i;
+	*id = IPSET_INVALID_ID;
+	for (i = 0; i < ip_set_max; i++) {
+		set = nfnl_set(i);
+		if (set != NULL && STREQ(set->name, name)) {
+			*id = i;
+			break;
+		}
 	}
-	return index;
+	return (*id == IPSET_INVALID_ID ? NULL : set);
 }
 
 static inline struct ip_set *
 find_set(const char *name)
 {
-	ip_set_id_t index = find_set_id(name);
+	ip_set_id_t id;
 
-	return index == IPSET_INVALID_ID ? NULL : ip_set_list[index];
+	return find_set_and_id(name, &id);
 }
 
 static int
 find_free_id(const char *name, ip_set_id_t *index, struct ip_set **set)
 {
+	struct ip_set *s;
 	ip_set_id_t i;
 
 	*index = IPSET_INVALID_ID;
 	for (i = 0;  i < ip_set_max; i++) {
-		if (ip_set_list[i] == NULL) {
+		s = nfnl_set(i);
+		if (s == NULL) {
 			if (*index == IPSET_INVALID_ID)
 				*index = i;
-		} else if (STREQ(name, ip_set_list[i]->name)) {
+		} else if (STREQ(name, s->name)) {
 			/* Name clash */
-			*set = ip_set_list[i];
+			*set = s;
 			return -EEXIST;
 		}
 	}
@@ -730,10 +776,9 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 	 * and check clashing.
 	 */
 	ret = find_free_id(set->name, &index, &clash);
-	if (ret != 0) {
+	if (ret == -EEXIST) {
 		/* If this is the same set and requested, ignore error */
-		if (ret == -EEXIST &&
-		    (flags & IPSET_FLAG_EXIST) &&
+		if ((flags & IPSET_FLAG_EXIST) &&
 		    STREQ(set->type->name, clash->type->name) &&
 		    set->type->family == clash->type->family &&
 		    set->type->revision_min == clash->type->revision_min &&
@@ -741,13 +786,36 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 		    set->variant->same_set(set, clash))
 			ret = 0;
 		goto cleanup;
-	}
+	} else if (ret == -IPSET_ERR_MAX_SETS) {
+		struct ip_set **list, **tmp;
+		ip_set_id_t i = ip_set_max + IP_SET_INC;
+
+		if (i < ip_set_max || i == IPSET_INVALID_ID)
+			/* Wraparound */
+			goto cleanup;
+
+		list = kzalloc(sizeof(struct ip_set *) * i, GFP_KERNEL);
+		if (!list)
+			goto cleanup;
+		/* nfnl mutex is held, both lists are valid */
+		tmp = nfnl_dereference(ip_set_list);
+		memcpy(list, tmp, sizeof(struct ip_set *) * ip_set_max);
+		rcu_assign_pointer(ip_set_list, list);
+		/* Make sure all current packets have passed through */
+		synchronize_net();
+		/* Use new list */
+		index = ip_set_max;
+		ip_set_max = i;
+		kfree(tmp);
+		ret = 0;
+	} else if (ret)
+		goto cleanup;
 
 	/*
 	 * Finally! Add our shiny new set to the list, and be done.
 	 */
 	pr_debug("create: '%s' created with index %u!\n", set->name, index);
-	ip_set_list[index] = set;
+	nfnl_set(index) = set;
 
 	return ret;
 
@@ -772,10 +840,10 @@ ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = {
 static void
 ip_set_destroy_set(ip_set_id_t index)
 {
-	struct ip_set *set = ip_set_list[index];
+	struct ip_set *set = nfnl_set(index);
 
 	pr_debug("set: %s\n",  set->name);
-	ip_set_list[index] = NULL;
+	nfnl_set(index) = NULL;
 
 	/* Must call it without holding any lock */
 	set->variant->destroy(set);
@@ -788,6 +856,7 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
 	       const struct nlmsghdr *nlh,
 	       const struct nlattr * const attr[])
 {
+	struct ip_set *s;
 	ip_set_id_t i;
 	int ret = 0;
 
@@ -807,22 +876,24 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
 	read_lock_bh(&ip_set_ref_lock);
 	if (!attr[IPSET_ATTR_SETNAME]) {
 		for (i = 0; i < ip_set_max; i++) {
-			if (ip_set_list[i] != NULL && ip_set_list[i]->ref) {
+			s = nfnl_set(i);
+			if (s != NULL && s->ref) {
 				ret = -IPSET_ERR_BUSY;
 				goto out;
 			}
 		}
 		read_unlock_bh(&ip_set_ref_lock);
 		for (i = 0; i < ip_set_max; i++) {
-			if (ip_set_list[i] != NULL)
+			s = nfnl_set(i);
+			if (s != NULL)
 				ip_set_destroy_set(i);
 		}
 	} else {
-		i = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME]));
-		if (i == IPSET_INVALID_ID) {
+		s = find_set_and_id(nla_data(attr[IPSET_ATTR_SETNAME]), &i);
+		if (s == NULL) {
 			ret = -ENOENT;
 			goto out;
-		} else if (ip_set_list[i]->ref) {
+		} else if (s->ref) {
 			ret = -IPSET_ERR_BUSY;
 			goto out;
 		}
@@ -853,21 +924,24 @@ ip_set_flush(struct sock *ctnl, struct sk_buff *skb,
 	     const struct nlmsghdr *nlh,
 	     const struct nlattr * const attr[])
 {
+	struct ip_set *s;
 	ip_set_id_t i;
 
 	if (unlikely(protocol_failed(attr)))
 		return -IPSET_ERR_PROTOCOL;
 
 	if (!attr[IPSET_ATTR_SETNAME]) {
-		for (i = 0; i < ip_set_max; i++)
-			if (ip_set_list[i] != NULL)
-				ip_set_flush_set(ip_set_list[i]);
+		for (i = 0; i < ip_set_max; i++) {
+			s = nfnl_set(i);
+			if (s != NULL)
+				ip_set_flush_set(s);
+		}
 	} else {
-		i = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME]));
-		if (i == IPSET_INVALID_ID)
+		s = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+		if (s == NULL)
 			return -ENOENT;
 
-		ip_set_flush_set(ip_set_list[i]);
+		ip_set_flush_set(s);
 	}
 
 	return 0;
@@ -889,7 +963,7 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb,
 	      const struct nlmsghdr *nlh,
 	      const struct nlattr * const attr[])
 {
-	struct ip_set *set;
+	struct ip_set *set, *s;
 	const char *name2;
 	ip_set_id_t i;
 	int ret = 0;
@@ -911,8 +985,8 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb,
 
 	name2 = nla_data(attr[IPSET_ATTR_SETNAME2]);
 	for (i = 0; i < ip_set_max; i++) {
-		if (ip_set_list[i] != NULL &&
-		    STREQ(ip_set_list[i]->name, name2)) {
+		s = nfnl_set(i);
+		if (s != NULL && STREQ(s->name, name2)) {
 			ret = -IPSET_ERR_EXIST_SETNAME2;
 			goto out;
 		}
@@ -947,17 +1021,14 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
 		     attr[IPSET_ATTR_SETNAME2] == NULL))
 		return -IPSET_ERR_PROTOCOL;
 
-	from_id = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME]));
-	if (from_id == IPSET_INVALID_ID)
+	from = find_set_and_id(nla_data(attr[IPSET_ATTR_SETNAME]), &from_id);
+	if (from == NULL)
 		return -ENOENT;
 
-	to_id = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME2]));
-	if (to_id == IPSET_INVALID_ID)
+	to = find_set_and_id(nla_data(attr[IPSET_ATTR_SETNAME2]), &to_id);
+	if (to == NULL)
 		return -IPSET_ERR_EXIST_SETNAME2;
 
-	from = ip_set_list[from_id];
-	to = ip_set_list[to_id];
-
 	/* Features must not change.
 	 * Not an artificial restriction anymore, as we must prevent
 	 * possible loops created by swapping in setlist type of sets. */
@@ -971,8 +1042,8 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
 
 	write_lock_bh(&ip_set_ref_lock);
 	swap(from->ref, to->ref);
-	ip_set_list[from_id] = to;
-	ip_set_list[to_id] = from;
+	nfnl_set(from_id) = to;
+	nfnl_set(to_id) = from;
 	write_unlock_bh(&ip_set_ref_lock);
 
 	return 0;
@@ -992,7 +1063,7 @@ static int
 ip_set_dump_done(struct netlink_callback *cb)
 {
 	if (cb->args[2]) {
-		pr_debug("release set %s\n", ip_set_list[cb->args[1]]->name);
+		pr_debug("release set %s\n", nfnl_set(cb->args[1])->name);
 		ip_set_put_byindex((ip_set_id_t) cb->args[1]);
 	}
 	return 0;
@@ -1030,8 +1101,11 @@ dump_init(struct netlink_callback *cb)
 	 */
 
 	if (cda[IPSET_ATTR_SETNAME]) {
-		index = find_set_id(nla_data(cda[IPSET_ATTR_SETNAME]));
-		if (index == IPSET_INVALID_ID)
+		struct ip_set *set;
+
+		set = find_set_and_id(nla_data(cda[IPSET_ATTR_SETNAME]),
+				      &index);
+		if (set == NULL)
 			return -ENOENT;
 
 		dump_type = DUMP_ONE;
@@ -1081,7 +1155,7 @@ dump_last:
 		 dump_type, dump_flags, cb->args[1]);
 	for (; cb->args[1] < max; cb->args[1]++) {
 		index = (ip_set_id_t) cb->args[1];
-		set = ip_set_list[index];
+		set = nfnl_set(index);
 		if (set == NULL) {
 			if (dump_type == DUMP_ONE) {
 				ret = -ENOENT;
@@ -1100,7 +1174,7 @@ dump_last:
 		if (!cb->args[2]) {
 			/* Start listing: make sure set won't be destroyed */
 			pr_debug("reference set\n");
-			__ip_set_get(index);
+			__ip_set_get(set);
 		}
 		nlh = start_msg(skb, NETLINK_CB(cb->skb).portid,
 				cb->nlh->nlmsg_seq, flags,
@@ -1159,7 +1233,7 @@ next_set:
 release_refcount:
 	/* If there was an error or set is done, release set */
 	if (ret || !cb->args[2]) {
-		pr_debug("release set %s\n", ip_set_list[index]->name);
+		pr_debug("release set %s\n", nfnl_set(index)->name);
 		ip_set_put_byindex(index);
 		cb->args[2] = 0;
 	}
@@ -1409,17 +1483,15 @@ ip_set_header(struct sock *ctnl, struct sk_buff *skb,
 	const struct ip_set *set;
 	struct sk_buff *skb2;
 	struct nlmsghdr *nlh2;
-	ip_set_id_t index;
 	int ret = 0;
 
 	if (unlikely(protocol_failed(attr) ||
 		     attr[IPSET_ATTR_SETNAME] == NULL))
 		return -IPSET_ERR_PROTOCOL;
 
-	index = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME]));
-	if (index == IPSET_INVALID_ID)
+	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+	if (set == NULL)
 		return -ENOENT;
-	set = ip_set_list[index];
 
 	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
 	if (skb2 == NULL)
@@ -1684,6 +1756,7 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
 	}
 	case IP_SET_OP_GET_BYNAME: {
 		struct ip_set_req_get_set *req_get = data;
+		ip_set_id_t id;
 
 		if (*len != sizeof(struct ip_set_req_get_set)) {
 			ret = -EINVAL;
@@ -1691,12 +1764,14 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
 		}
 		req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0';
 		nfnl_lock();
-		req_get->set.index = find_set_id(req_get->set.name);
+		find_set_and_id(req_get->set.name, &id);
+		req_get->set.index = id;
 		nfnl_unlock();
 		goto copy;
 	}
 	case IP_SET_OP_GET_BYINDEX: {
 		struct ip_set_req_get_set *req_get = data;
+		struct ip_set *set;
 
 		if (*len != sizeof(struct ip_set_req_get_set) ||
 		    req_get->set.index >= ip_set_max) {
@@ -1704,9 +1779,8 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
 			goto done;
 		}
 		nfnl_lock();
-		strncpy(req_get->set.name,
-			ip_set_list[req_get->set.index]
-				? ip_set_list[req_get->set.index]->name : "",
+		set = nfnl_set(req_get->set.index);
+		strncpy(req_get->set.name, set ? set->name : "",
 			IPSET_MAXNAMELEN);
 		nfnl_unlock();
 		goto copy;
@@ -1737,6 +1811,7 @@ static struct nf_sockopt_ops so_set __read_mostly = {
 static int __init
 ip_set_init(void)
 {
+	struct ip_set **list;
 	int ret;
 
 	if (max_sets)
@@ -1744,22 +1819,22 @@ ip_set_init(void)
 	if (ip_set_max >= IPSET_INVALID_ID)
 		ip_set_max = IPSET_INVALID_ID - 1;
 
-	ip_set_list = kzalloc(sizeof(struct ip_set *) * ip_set_max,
-			      GFP_KERNEL);
-	if (!ip_set_list)
+	list = kzalloc(sizeof(struct ip_set *) * ip_set_max, GFP_KERNEL);
+	if (!list)
 		return -ENOMEM;
 
+	rcu_assign_pointer(ip_set_list, list);
 	ret = nfnetlink_subsys_register(&ip_set_netlink_subsys);
 	if (ret != 0) {
 		pr_err("ip_set: cannot register with nfnetlink.\n");
-		kfree(ip_set_list);
+		kfree(list);
 		return ret;
 	}
 	ret = nf_register_sockopt(&so_set);
 	if (ret != 0) {
 		pr_err("SO_SET registry failed: %d\n", ret);
 		nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
-		kfree(ip_set_list);
+		kfree(list);
 		return ret;
 	}
 
@@ -1770,10 +1845,12 @@ ip_set_init(void)
 static void __exit
 ip_set_fini(void)
 {
+	struct ip_set **list = rcu_dereference_protected(ip_set_list, 1);
+
 	/* There can't be any existing set */
 	nf_unregister_sockopt(&so_set);
 	nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
-	kfree(ip_set_list);
+	kfree(list);
 	pr_debug("these are the famous last words\n");
 }
 
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 3/6] netfilter: ctnetlink: dump entries from the dying and unconfirmed lists
From: pablo @ 2012-12-04 17:31 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1354642293-4114-1-git-send-email-pablo@netfilter.org>

From: Pablo Neira Ayuso <pablo@netfilter.org>

This patch adds a new operation to dump the content of the dying and
unconfirmed lists.

Under some situations, the global conntrack counter can be inconsistent
with the number of entries that we can dump from the conntrack table.
The way to resolve this is to allow dumping the content of the unconfirmed
and dying lists; so far it was not possible to look at their content.

This provides some extra instrumentation to resolve problematic situations
in which anyone suspects memory leaks.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nfnetlink_conntrack.h |    2 +
 net/netfilter/nf_conntrack_netlink.c               |  108 ++++++++++++++++++++
 2 files changed, 110 insertions(+)

diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
index 43bfe3e..86e930c 100644
--- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
@@ -9,6 +9,8 @@ enum cntl_msg_types {
 	IPCTNL_MSG_CT_GET_CTRZERO,
 	IPCTNL_MSG_CT_GET_STATS_CPU,
 	IPCTNL_MSG_CT_GET_STATS,
+	IPCTNL_MSG_CT_GET_DYING,
+	IPCTNL_MSG_CT_GET_UNCONFIRMED,
 
 	IPCTNL_MSG_MAX
 };
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 34370a9..c24a00a 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1089,6 +1089,112 @@ out:
 	return err == -EAGAIN ? -ENOBUFS : err;
 }
 
+static int ctnetlink_done_list(struct netlink_callback *cb)
+{
+	if (cb->args[1])
+		nf_ct_put((struct nf_conn *)cb->args[1]);
+	return 0;
+}
+
+static int
+ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb,
+		    struct hlist_nulls_head *list)
+{
+	struct nf_conn *ct, *last;
+	struct nf_conntrack_tuple_hash *h;
+	struct hlist_nulls_node *n;
+	struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+	u_int8_t l3proto = nfmsg->nfgen_family;
+	int res;
+
+	if (cb->args[2])
+		return 0;
+
+	spin_lock_bh(&nf_conntrack_lock);
+	last = (struct nf_conn *)cb->args[1];
+restart:
+	hlist_nulls_for_each_entry(h, n, list, hnnode) {
+		ct = nf_ct_tuplehash_to_ctrack(h);
+		if (l3proto && nf_ct_l3num(ct) != l3proto)
+			continue;
+		if (cb->args[1]) {
+			if (ct != last)
+				continue;
+			cb->args[1] = 0;
+		}
+		rcu_read_lock();
+		res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
+					  cb->nlh->nlmsg_seq,
+					  NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+					  ct);
+		rcu_read_unlock();
+		if (res < 0) {
+			nf_conntrack_get(&ct->ct_general);
+			cb->args[1] = (unsigned long)ct;
+			goto out;
+		}
+	}
+	if (cb->args[1]) {
+		cb->args[1] = 0;
+		goto restart;
+	} else
+		cb->args[2] = 1;
+out:
+	spin_unlock_bh(&nf_conntrack_lock);
+	if (last)
+		nf_ct_put(last);
+
+	return skb->len;
+}
+
+static int
+ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+
+	return ctnetlink_dump_list(skb, cb, &net->ct.dying);
+}
+
+static int
+ctnetlink_get_ct_dying(struct sock *ctnl, struct sk_buff *skb,
+		       const struct nlmsghdr *nlh,
+		       const struct nlattr * const cda[])
+{
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.dump = ctnetlink_dump_dying,
+			.done = ctnetlink_done_list,
+		};
+		return netlink_dump_start(ctnl, skb, nlh, &c);
+	}
+
+	return -EOPNOTSUPP;
+}
+
+static int
+ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+
+	return ctnetlink_dump_list(skb, cb, &net->ct.unconfirmed);
+}
+
+static int
+ctnetlink_get_ct_unconfirmed(struct sock *ctnl, struct sk_buff *skb,
+			     const struct nlmsghdr *nlh,
+			     const struct nlattr * const cda[])
+{
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.dump = ctnetlink_dump_unconfirmed,
+			.done = ctnetlink_done_list,
+		};
+		return netlink_dump_start(ctnl, skb, nlh, &c);
+	}
+
+	return -EOPNOTSUPP;
+}
+
 #ifdef CONFIG_NF_NAT_NEEDED
 static int
 ctnetlink_parse_nat_setup(struct nf_conn *ct,
@@ -2712,6 +2818,8 @@ static const struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
 					    .policy = ct_nla_policy },
 	[IPCTNL_MSG_CT_GET_STATS_CPU]	= { .call = ctnetlink_stat_ct_cpu },
 	[IPCTNL_MSG_CT_GET_STATS]	= { .call = ctnetlink_stat_ct },
+	[IPCTNL_MSG_CT_GET_DYING]	= { .call = ctnetlink_get_ct_dying },
+	[IPCTNL_MSG_CT_GET_UNCONFIRMED]	= { .call = ctnetlink_get_ct_unconfirmed },
 };
 
 static const struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 4/6] netfilter: kill support for per-af queue backends
From: pablo @ 2012-12-04 17:31 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1354642293-4114-1-git-send-email-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

We used to have several queueing backends, but nowadays only
nfnetlink_queue remains.

In light of this there doesn't seem to be a good reason to
support per-af registering -- just hook up nfnetlink_queue on module
load and remove it on unload.

This means that the userspace BIND/UNBIND_PF commands are now obsolete;
the kernel will ignore them.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_queue.h     |    8 +-
 net/netfilter/core.c                 |    2 -
 net/netfilter/nf_queue.c             |  152 +++-------------------------------
 net/netfilter/nfnetlink_queue_core.c |   14 ++--
 4 files changed, 20 insertions(+), 156 deletions(-)

diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
index 252fd10..fb1c0be 100644
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -21,14 +21,10 @@ struct nf_queue_entry {
 struct nf_queue_handler {
 	int			(*outfn)(struct nf_queue_entry *entry,
 					 unsigned int queuenum);
-	char			*name;
 };
 
-extern int nf_register_queue_handler(u_int8_t pf,
-				     const struct nf_queue_handler *qh);
-extern int nf_unregister_queue_handler(u_int8_t pf,
-				       const struct nf_queue_handler *qh);
-extern void nf_unregister_queue_handlers(const struct nf_queue_handler *qh);
+void nf_register_queue_handler(const struct nf_queue_handler *qh);
+void nf_unregister_queue_handler(void);
 extern void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict);
 
 #endif /* _NF_QUEUE_H */
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 68912dad..a9c488b 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -295,8 +295,6 @@ void __init netfilter_init(void)
 		panic("cannot create netfilter proc entry");
 #endif
 
-	if (netfilter_queue_init() < 0)
-		panic("cannot initialize nf_queue");
 	if (netfilter_log_init() < 0)
 		panic("cannot initialize nf_log");
 }
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 8d2cf9e..d812c12 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -14,84 +14,32 @@
 #include "nf_internals.h"
 
 /*
- * A queue handler may be registered for each protocol.  Each is protected by
- * long term mutex.  The handler must provide an an outfn() to accept packets
- * for queueing and must reinject all packets it receives, no matter what.
+ * Hook for nfnetlink_queue to register its queue handler.
+ * We do this so that most of the NFQUEUE code can be modular.
+ *
+ * Once the queue is registered it must reinject all packets it
+ * receives, no matter what.
  */
-static const struct nf_queue_handler __rcu *queue_handler[NFPROTO_NUMPROTO] __read_mostly;
-
-static DEFINE_MUTEX(queue_handler_mutex);
+static const struct nf_queue_handler __rcu *queue_handler __read_mostly;
 
 /* return EBUSY when somebody else is registered, return EEXIST if the
  * same handler is registered, return 0 in case of success. */
-int nf_register_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh)
+void nf_register_queue_handler(const struct nf_queue_handler *qh)
 {
-	int ret;
-	const struct nf_queue_handler *old;
-
-	if (pf >= ARRAY_SIZE(queue_handler))
-		return -EINVAL;
-
-	mutex_lock(&queue_handler_mutex);
-	old = rcu_dereference_protected(queue_handler[pf],
-					lockdep_is_held(&queue_handler_mutex));
-	if (old == qh)
-		ret = -EEXIST;
-	else if (old)
-		ret = -EBUSY;
-	else {
-		rcu_assign_pointer(queue_handler[pf], qh);
-		ret = 0;
-	}
-	mutex_unlock(&queue_handler_mutex);
-
-	return ret;
+	/* should never happen, we only have one queueing backend in kernel */
+	WARN_ON(rcu_access_pointer(queue_handler));
+	rcu_assign_pointer(queue_handler, qh);
 }
 EXPORT_SYMBOL(nf_register_queue_handler);
 
 /* The caller must flush their queue before this */
-int nf_unregister_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh)
+void nf_unregister_queue_handler(void)
 {
-	const struct nf_queue_handler *old;
-
-	if (pf >= ARRAY_SIZE(queue_handler))
-		return -EINVAL;
-
-	mutex_lock(&queue_handler_mutex);
-	old = rcu_dereference_protected(queue_handler[pf],
-					lockdep_is_held(&queue_handler_mutex));
-	if (old && old != qh) {
-		mutex_unlock(&queue_handler_mutex);
-		return -EINVAL;
-	}
-
-	RCU_INIT_POINTER(queue_handler[pf], NULL);
-	mutex_unlock(&queue_handler_mutex);
-
+	RCU_INIT_POINTER(queue_handler, NULL);
 	synchronize_rcu();
-
-	return 0;
 }
 EXPORT_SYMBOL(nf_unregister_queue_handler);
 
-void nf_unregister_queue_handlers(const struct nf_queue_handler *qh)
-{
-	u_int8_t pf;
-
-	mutex_lock(&queue_handler_mutex);
-	for (pf = 0; pf < ARRAY_SIZE(queue_handler); pf++)  {
-		if (rcu_dereference_protected(
-				queue_handler[pf],
-				lockdep_is_held(&queue_handler_mutex)
-				) == qh)
-			RCU_INIT_POINTER(queue_handler[pf], NULL);
-	}
-	mutex_unlock(&queue_handler_mutex);
-
-	synchronize_rcu();
-}
-EXPORT_SYMBOL_GPL(nf_unregister_queue_handlers);
-
 static void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
 {
 	/* Release those devices we held, or Alexey will kill me. */
@@ -137,7 +85,7 @@ static int __nf_queue(struct sk_buff *skb,
 	/* QUEUE == DROP if no one is waiting, to be safe. */
 	rcu_read_lock();
 
-	qh = rcu_dereference(queue_handler[pf]);
+	qh = rcu_dereference(queue_handler);
 	if (!qh) {
 		status = -ESRCH;
 		goto err_unlock;
@@ -344,77 +292,3 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
 	kfree(entry);
 }
 EXPORT_SYMBOL(nf_reinject);
-
-#ifdef CONFIG_PROC_FS
-static void *seq_start(struct seq_file *seq, loff_t *pos)
-{
-	if (*pos >= ARRAY_SIZE(queue_handler))
-		return NULL;
-
-	return pos;
-}
-
-static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
-{
-	(*pos)++;
-
-	if (*pos >= ARRAY_SIZE(queue_handler))
-		return NULL;
-
-	return pos;
-}
-
-static void seq_stop(struct seq_file *s, void *v)
-{
-
-}
-
-static int seq_show(struct seq_file *s, void *v)
-{
-	int ret;
-	loff_t *pos = v;
-	const struct nf_queue_handler *qh;
-
-	rcu_read_lock();
-	qh = rcu_dereference(queue_handler[*pos]);
-	if (!qh)
-		ret = seq_printf(s, "%2lld NONE\n", *pos);
-	else
-		ret = seq_printf(s, "%2lld %s\n", *pos, qh->name);
-	rcu_read_unlock();
-
-	return ret;
-}
-
-static const struct seq_operations nfqueue_seq_ops = {
-	.start	= seq_start,
-	.next	= seq_next,
-	.stop	= seq_stop,
-	.show	= seq_show,
-};
-
-static int nfqueue_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &nfqueue_seq_ops);
-}
-
-static const struct file_operations nfqueue_file_ops = {
-	.owner	 = THIS_MODULE,
-	.open	 = nfqueue_open,
-	.read	 = seq_read,
-	.llseek	 = seq_lseek,
-	.release = seq_release,
-};
-#endif /* PROC_FS */
-
-
-int __init netfilter_queue_init(void)
-{
-#ifdef CONFIG_PROC_FS
-	if (!proc_create("nf_queue", S_IRUGO,
-			 proc_net_netfilter, &nfqueue_file_ops))
-		return -1;
-#endif
-	return 0;
-}
-
diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
index e12d44e..3158d87 100644
--- a/net/netfilter/nfnetlink_queue_core.c
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -809,7 +809,6 @@ static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = {
 };
 
 static const struct nf_queue_handler nfqh = {
-	.name 	= "nf_queue",
 	.outfn	= &nfqnl_enqueue_packet,
 };
 
@@ -827,14 +826,10 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
 	if (nfqa[NFQA_CFG_CMD]) {
 		cmd = nla_data(nfqa[NFQA_CFG_CMD]);
 
-		/* Commands without queue context - might sleep */
+		/* Obsolete commands without queue context */
 		switch (cmd->command) {
-		case NFQNL_CFG_CMD_PF_BIND:
-			return nf_register_queue_handler(ntohs(cmd->pf),
-							 &nfqh);
-		case NFQNL_CFG_CMD_PF_UNBIND:
-			return nf_unregister_queue_handler(ntohs(cmd->pf),
-							   &nfqh);
+		case NFQNL_CFG_CMD_PF_BIND: return 0;
+		case NFQNL_CFG_CMD_PF_UNBIND: return 0;
 		}
 	}
 
@@ -1074,6 +1069,7 @@ static int __init nfnetlink_queue_init(void)
 #endif
 
 	register_netdevice_notifier(&nfqnl_dev_notifier);
+	nf_register_queue_handler(&nfqh);
 	return status;
 
 #ifdef CONFIG_PROC_FS
@@ -1087,7 +1083,7 @@ cleanup_netlink_notifier:
 
 static void __exit nfnetlink_queue_fini(void)
 {
-	nf_unregister_queue_handlers(&nfqh);
+	nf_unregister_queue_handler();
 	unregister_netdevice_notifier(&nfqnl_dev_notifier);
 #ifdef CONFIG_PROC_FS
 	remove_proc_entry("nfnetlink_queue", proc_net_netfilter);
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 5/6] netfilter: ctnetlink: nla_policy updates
From: pablo @ 2012-12-04 17:31 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1354642293-4114-1-git-send-email-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

Add stricter checking for a few attributes.
Note that these changes don't fix any bug in the current code base.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_netlink.c   |    8 ++++++--
 net/netfilter/nf_conntrack_proto_tcp.c |    2 ++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index c24a00a..4e078cd 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -898,7 +898,8 @@ ctnetlink_parse_zone(const struct nlattr *attr, u16 *zone)
 }
 
 static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = {
-	[CTA_HELP_NAME]		= { .type = NLA_NUL_STRING },
+	[CTA_HELP_NAME]		= { .type = NLA_NUL_STRING,
+				    .len = NF_CT_HELPER_NAME_LEN - 1 },
 };
 
 static inline int
@@ -932,6 +933,8 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {
 	[CTA_ID]		= { .type = NLA_U32 },
 	[CTA_NAT_DST]		= { .type = NLA_NESTED },
 	[CTA_TUPLE_MASTER]	= { .type = NLA_NESTED },
+	[CTA_NAT_SEQ_ADJ_ORIG]  = { .type = NLA_NESTED },
+	[CTA_NAT_SEQ_ADJ_REPLY] = { .type = NLA_NESTED },
 	[CTA_ZONE]		= { .type = NLA_U16 },
 	[CTA_MARK_MASK]		= { .type = NLA_U32 },
 };
@@ -2322,7 +2325,8 @@ static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
 	[CTA_EXPECT_MASK]	= { .type = NLA_NESTED },
 	[CTA_EXPECT_TIMEOUT]	= { .type = NLA_U32 },
 	[CTA_EXPECT_ID]		= { .type = NLA_U32 },
-	[CTA_EXPECT_HELP_NAME]	= { .type = NLA_NUL_STRING },
+	[CTA_EXPECT_HELP_NAME]	= { .type = NLA_NUL_STRING,
+				    .len = NF_CT_HELPER_NAME_LEN - 1 },
 	[CTA_EXPECT_ZONE]	= { .type = NLA_U16 },
 	[CTA_EXPECT_FLAGS]	= { .type = NLA_U32 },
 	[CTA_EXPECT_CLASS]	= { .type = NLA_U32 },
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 61f9285..83876e9 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1353,6 +1353,8 @@ static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = {
 	[CTA_TIMEOUT_TCP_TIME_WAIT]	= { .type = NLA_U32 },
 	[CTA_TIMEOUT_TCP_CLOSE]		= { .type = NLA_U32 },
 	[CTA_TIMEOUT_TCP_SYN_SENT2]	= { .type = NLA_U32 },
+	[CTA_TIMEOUT_TCP_RETRANS]	= { .type = NLA_U32 },
+	[CTA_TIMEOUT_TCP_UNACK]		= { .type = NLA_U32 },
 };
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
 
-- 
1.7.10.4

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox