Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 3/9] bnx2x: add a select queue callback
From: Vladislav Zolotarov @ 2010-12-13 15:44 UTC (permalink / raw)
  To: Dave Miller
  Cc: netdev list, Eilon Greenstein, Shmulik Ravid-Rabinovitz,
	Dmitry Kravkov

This callback required to allow FCoE traffic to be
sent on separate priority queue from other L2 traffic,
which is managed by PFC in HW.

Signed-off-by: Vladislav Zolotarov <vladz@broadcom.com>
Signed-off-by: Shmulik Ravid-Rabinovitz <shmulikr@broadcom.com>
Signed-off-by: Dmitry Kravkov <dmitry@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>

---
 drivers/net/bnx2x/bnx2x_cmn.c  |   29 +++++++++++++++++++++++++++++
 drivers/net/bnx2x/bnx2x_cmn.h  |    3 +++
 drivers/net/bnx2x/bnx2x_main.c |    1 +
 3 files changed, 37 insertions(+), 0 deletions(-)

diff --git a/drivers/net/bnx2x/bnx2x_cmn.c b/drivers/net/bnx2x/bnx2x_cmn.c
index fa12365..e698b07 100644
--- a/drivers/net/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/bnx2x/bnx2x_cmn.c
@@ -1167,6 +1167,35 @@ void bnx2x_netif_stop(struct bnx2x *bp, int disable_hw)
 	netif_tx_disable(bp->dev);
 }
 
+u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb)
+{
+#ifdef BCM_CNIC
+	struct bnx2x *bp = netdev_priv(dev);
+	if (NO_FCOE(bp))
+		return skb_tx_hash(dev, skb);
+	else {
+		struct ethhdr *hdr = (struct ethhdr *)skb->data;
+		u16 ether_type = ntohs(hdr->h_proto);
+
+		/* Skip VLAN tag if present */
+		if (ether_type == ETH_P_8021Q) {
+			struct vlan_ethhdr *vhdr =
+				(struct vlan_ethhdr *)skb->data;
+
+			ether_type = ntohs(vhdr->h_vlan_encapsulated_proto);
+		}
+
+		/* If ethertype is FCoE or FIP - use FCoE ring */
+		if ((ether_type == ETH_P_FCOE) || (ether_type == ETH_P_FIP))
+			return bnx2x_fcoe(bp, index);
+	}
+#endif
+	/* Select a none-FCoE queue:  if FCoE is enabled, exclude FCoE L2 ring
+	 */
+	return __skb_tx_hash(dev, skb,
+			dev->real_num_tx_queues - FCOE_CONTEXT_USE);
+}
+
 void bnx2x_set_num_queues(struct bnx2x *bp)
 {
 	switch (bp->multi_mode) {
diff --git a/drivers/net/bnx2x/bnx2x_cmn.h b/drivers/net/bnx2x/bnx2x_cmn.h
index 4bb0113..258f0c0 100644
--- a/drivers/net/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/bnx2x/bnx2x_cmn.h
@@ -343,6 +343,9 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode);
 /* hard_xmit callback */
 netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev);
 
+/* select_queue callback */
+u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb);
+
 int bnx2x_change_mac_addr(struct net_device *dev, void *p);
 
 /* NAPI poll Rx part */
diff --git a/drivers/net/bnx2x/bnx2x_main.c b/drivers/net/bnx2x/bnx2x_main.c
index e6e2746..563b2cb 100644
--- a/drivers/net/bnx2x/bnx2x_main.c
+++ b/drivers/net/bnx2x/bnx2x_main.c
@@ -8977,6 +8977,7 @@ static const struct net_device_ops bnx2x_netdev_ops = {
 	.ndo_open		= bnx2x_open,
 	.ndo_stop		= bnx2x_close,
 	.ndo_start_xmit		= bnx2x_start_xmit,
+	.ndo_select_queue	= bnx2x_select_queue,
 	.ndo_set_multicast_list	= bnx2x_set_rx_mode,
 	.ndo_set_mac_address	= bnx2x_change_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
-- 
1.7.0.4





^ permalink raw reply related

* [PATCH net-next 2/9] bnx2x: add FCoE ring
From: Vladislav Zolotarov @ 2010-12-13 15:44 UTC (permalink / raw)
  To: Dave Miller; +Cc: netdev list, Eilon Greenstein, shmulikr, Dmitry Kravkov

Includes new driver structures and FW/HW configuration for FCoE ring

Signed-off-by: Vladislav Zolotarov <vladz@broadcom.com>
Signed-off-by: Shmulik Ravid-Rabinovitz <shmulikr@broadcom.com>
Signed-off-by: Dmitry Kravkov <dmitry@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
---
 drivers/net/bnx2x/bnx2x.h         |  118 +++++++++++++--
 drivers/net/bnx2x/bnx2x_cmn.c     |   81 ++++++++--
 drivers/net/bnx2x/bnx2x_cmn.h     |   53 ++++++-
 drivers/net/bnx2x/bnx2x_ethtool.c |  290 ++++++++++++++++++----------------
 drivers/net/bnx2x/bnx2x_main.c    |  315 +++++++++++++++++++++++++++++--------
 drivers/net/bnx2x/bnx2x_stats.c   |   13 +-
 drivers/net/bnx2x/bnx2x_stats.h   |    2 -
 7 files changed, 631 insertions(+), 241 deletions(-)

diff --git a/drivers/net/bnx2x/bnx2x.h b/drivers/net/bnx2x/bnx2x.h
index 7e4d682..475725c 100644
--- a/drivers/net/bnx2x/bnx2x.h
+++ b/drivers/net/bnx2x/bnx2x.h
@@ -13,6 +13,8 @@
 
 #ifndef BNX2X_H
 #define BNX2X_H
+#include <linux/netdevice.h>
+#include <linux/types.h>
 
 /* compilation time flags */
 
@@ -199,10 +201,25 @@ void bnx2x_panic_dump(struct bnx2x *bp);
 /* EQ completions */
 #define HC_SP_INDEX_EQ_CONS			7
 
+/* FCoE L2 connection completions */
+#define HC_SP_INDEX_ETH_FCOE_TX_CQ_CONS		6
+#define HC_SP_INDEX_ETH_FCOE_RX_CQ_CONS		4
 /* iSCSI L2 */
 #define HC_SP_INDEX_ETH_ISCSI_CQ_CONS		5
 #define HC_SP_INDEX_ETH_ISCSI_RX_CQ_CONS	1
 
+/* Special clients parameters */
+
+/* SB indices */
+/* FCoE L2 */
+#define BNX2X_FCOE_L2_RX_INDEX \
+	(&bp->def_status_blk->sp_sb.\
+	index_values[HC_SP_INDEX_ETH_FCOE_RX_CQ_CONS])
+
+#define BNX2X_FCOE_L2_TX_INDEX \
+	(&bp->def_status_blk->sp_sb.\
+	index_values[HC_SP_INDEX_ETH_FCOE_TX_CQ_CONS])
+
 /**
  *  CIDs and CLIDs:
  *  CLIDs below is a CLID for func 0, then the CLID for other
@@ -215,12 +232,19 @@ void bnx2x_panic_dump(struct bnx2x *bp);
 #define BNX2X_ISCSI_ETH_CL_ID		17
 #define BNX2X_ISCSI_ETH_CID		17
 
+/* FCoE L2 */
+#define BNX2X_FCOE_ETH_CL_ID		18
+#define BNX2X_FCOE_ETH_CID		18
+
 /** Additional rings budgeting */
 #ifdef BCM_CNIC
 #define CNIC_CONTEXT_USE		1
+#define FCOE_CONTEXT_USE		1
 #else
 #define CNIC_CONTEXT_USE		0
+#define FCOE_CONTEXT_USE		0
 #endif /* BCM_CNIC */
+#define NONE_ETH_CONTEXT_USE	(FCOE_CONTEXT_USE)
 
 #define AEU_IN_ATTN_BITS_PXPPCICLOCKCLIENT_PARITY_ERROR \
 	AEU_INPUTS_ATTN_BITS_PXPPCICLOCKCLIENT_PARITY_ERROR
@@ -401,6 +425,17 @@ struct bnx2x_fastpath {
 };
 
 #define bnx2x_fp(bp, nr, var)		(bp->fp[nr].var)
+#ifdef BCM_CNIC
+/* FCoE L2 `fastpath' is right after the eth entries */
+#define FCOE_IDX			BNX2X_NUM_ETH_QUEUES(bp)
+#define bnx2x_fcoe_fp(bp)		(&bp->fp[FCOE_IDX])
+#define bnx2x_fcoe(bp, var)		(bnx2x_fcoe_fp(bp)->var)
+#define IS_FCOE_FP(fp)			(fp->index == FCOE_IDX)
+#define IS_FCOE_IDX(idx)		((idx) == FCOE_IDX)
+#else
+#define IS_FCOE_FP(fp)		false
+#define IS_FCOE_IDX(idx)	false
+#endif
 
 
 /* MC hsi */
@@ -669,7 +704,9 @@ struct bnx2x_port {
 enum {
 	CAM_ETH_LINE = 0,
 	CAM_ISCSI_ETH_LINE,
-	CAM_MAX_PF_LINE = CAM_ISCSI_ETH_LINE
+	CAM_FIP_ETH_LINE,
+	CAM_FIP_MCAST_LINE,
+	CAM_MAX_PF_LINE = CAM_FIP_MCAST_LINE
 };
 /* number of MACs per function in NIG memory - used for SI mode */
 #define NIG_LLH_FUNC_MEM_SIZE		16
@@ -714,6 +751,14 @@ enum {
  */
 #define L2_FP_COUNT(cid_cnt)	((cid_cnt) - CNIC_CONTEXT_USE)
 
+/*
+ * The number of FP-SB allocated by the driver == max number of regular L2
+ * queues + 1 for the CNIC which also consumes an FP-SB
+ */
+#define FP_SB_COUNT(cid_cnt)	((cid_cnt) - FCOE_CONTEXT_USE)
+#define NUM_IGU_SB_REQUIRED(cid_cnt) \
+				(FP_SB_COUNT(cid_cnt) - NONE_ETH_CONTEXT_USE)
+
 union cdu_context {
 	struct eth_context eth;
 	char pad[1024];
@@ -726,7 +771,8 @@ union cdu_context {
 
 #ifdef BCM_CNIC
 #define CNIC_ISCSI_CID_MAX	256
-#define CNIC_CID_MAX		(CNIC_ISCSI_CID_MAX)
+#define CNIC_FCOE_CID_MAX	2048
+#define CNIC_CID_MAX		(CNIC_ISCSI_CID_MAX + CNIC_FCOE_CID_MAX)
 #define CNIC_ILT_LINES		DIV_ROUND_UP(CNIC_CID_MAX, ILT_PAGE_CIDS)
 #endif
 
@@ -922,6 +968,10 @@ struct bnx2x {
 #define DISABLE_MSI_FLAG		0x200
 #define BP_NOMCP(bp)			(bp->flags & NO_MCP_FLAG)
 #define MF_FUNC_DIS			0x1000
+#define FCOE_MACS_SET			0x2000
+#define NO_FCOE_FLAG			0x4000
+
+#define NO_FCOE(bp)		((bp)->flags & NO_FCOE_FLAG)
 
 	int			pf_num;	/* absolute PF number */
 	int			pfid;	/* per-path PF number */
@@ -1069,7 +1119,8 @@ struct bnx2x {
 	u16			cnic_kwq_pending;
 	u16			cnic_spq_pending;
 	struct mutex		cnic_mutex;
-	u8			iscsi_mac[6];
+	u8			iscsi_mac[ETH_ALEN];
+	u8			fip_mac[ETH_ALEN];
 #endif
 
 	int			dmae_ready;
@@ -1159,10 +1210,17 @@ struct bnx2x {
 #define RSS_IPV6_TCP_CAP	0x0008
 
 #define BNX2X_NUM_QUEUES(bp)	(bp->num_queues)
+#define BNX2X_NUM_ETH_QUEUES(bp) (BNX2X_NUM_QUEUES(bp) - NONE_ETH_CONTEXT_USE)
+
+/* ethtool statistics are displayed for all regular ethernet queues and the
+ * fcoe L2 queue if not disabled
+ */
+#define BNX2X_NUM_STAT_QUEUES(bp) (NO_FCOE(bp) ? BNX2X_NUM_ETH_QUEUES(bp) : \
+			   (BNX2X_NUM_ETH_QUEUES(bp) + FCOE_CONTEXT_USE))
+
 #define is_multi(bp)		(BNX2X_NUM_QUEUES(bp) > 1)
 
 #define BNX2X_MAX_QUEUES(bp)	(bp->igu_sb_cnt - CNIC_CONTEXT_USE)
-#define is_eth_multi(bp)	(BNX2X_NUM_ETH_QUEUES(bp) > 1)
 
 #define RSS_IPV4_CAP_MASK						\
 	TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV4_CAPABILITY
@@ -1255,6 +1313,7 @@ struct bnx2x_client_ramrod_params {
 	u16 cl_id;
 	u32 cid;
 	u8 poll;
+#define CLIENT_IS_FCOE			0x01
 #define CLIENT_IS_LEADING_RSS		0x02
 	u8 flags;
 };
@@ -1287,11 +1346,54 @@ struct bnx2x_func_init_params {
 	u16		spq_prod;	/* valid iff FUNC_FLG_SPQ */
 };
 
+#define for_each_eth_queue(bp, var) \
+			for (var = 0; var < BNX2X_NUM_ETH_QUEUES(bp); var++)
+
+#define for_each_nondefault_eth_queue(bp, var) \
+			for (var = 1; var < BNX2X_NUM_ETH_QUEUES(bp); var++)
+
+#define for_each_napi_queue(bp, var) \
+	for (var = 0; \
+		var < BNX2X_NUM_ETH_QUEUES(bp) + FCOE_CONTEXT_USE; var++) \
+		if (skip_queue(bp, var))	\
+			continue;		\
+		else
+
 #define for_each_queue(bp, var) \
-			for (var = 0; var < BNX2X_NUM_QUEUES(bp); var++)
+	for (var = 0; var < BNX2X_NUM_QUEUES(bp); var++) \
+		if (skip_queue(bp, var))	\
+			continue;		\
+		else
+
+#define for_each_rx_queue(bp, var) \
+	for (var = 0; var < BNX2X_NUM_QUEUES(bp); var++) \
+		if (skip_rx_queue(bp, var))	\
+			continue;		\
+		else
+
+#define for_each_tx_queue(bp, var) \
+	for (var = 0; var < BNX2X_NUM_QUEUES(bp); var++) \
+		if (skip_tx_queue(bp, var))	\
+			continue;		\
+		else
+
 #define for_each_nondefault_queue(bp, var) \
-			for (var = 1; var < BNX2X_NUM_QUEUES(bp); var++)
+	for (var = 1; var < BNX2X_NUM_QUEUES(bp); var++) \
+		if (skip_queue(bp, var))	\
+			continue;		\
+		else
 
+/* skip rx queue
+ * if FCOE l2 support is diabled and this is the fcoe L2 queue
+ */
+#define skip_rx_queue(bp, idx)	(NO_FCOE(bp) && IS_FCOE_IDX(idx))
+
+/* skip tx queue
+ * if FCOE l2 support is diabled and this is the fcoe L2 queue
+ */
+#define skip_tx_queue(bp, idx)	(NO_FCOE(bp) && IS_FCOE_IDX(idx))
+
+#define skip_queue(bp, idx)	(NO_FCOE(bp) && IS_FCOE_IDX(idx))
 
 #define WAIT_RAMROD_POLL	0x01
 #define WAIT_RAMROD_COMMON	0x02
@@ -1615,10 +1717,6 @@ static inline u32 reg_poll(struct bnx2x *bp, u32 reg, u32 expected, int ms,
 	MAC_CONFIGURATION_ENTRY_ACTION_TYPE) == \
 	(T_ETH_MAC_COMMAND_INVALIDATE))
 
-#define CAM_INVALIDATE(x) \
-	(x.target_table_entry.flags = TSTORM_CAM_TARGET_TABLE_ENTRY_ACTION_TYPE)
-
-
 /* Number of u32 elements in MC hash array */
 #define MC_HASH_SIZE			8
 #define MC_HASH_OFFSET(bp, i)		(BAR_TSTRORM_INTMEM + \
diff --git a/drivers/net/bnx2x/bnx2x_cmn.c b/drivers/net/bnx2x/bnx2x_cmn.c
index 236c00c..fa12365 100644
--- a/drivers/net/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/bnx2x/bnx2x_cmn.c
@@ -827,7 +827,7 @@ void bnx2x_init_rx_rings(struct bnx2x *bp)
 	DP(NETIF_MSG_IFUP,
 	   "mtu %d  rx_buf_size %d\n", bp->dev->mtu, bp->rx_buf_size);
 
-	for_each_queue(bp, j) {
+	for_each_rx_queue(bp, j) {
 		struct bnx2x_fastpath *fp = &bp->fp[j];
 
 		if (!fp->disable_tpa) {
@@ -880,7 +880,7 @@ void bnx2x_init_rx_rings(struct bnx2x *bp)
 		}
 	}
 
-	for_each_queue(bp, j) {
+	for_each_rx_queue(bp, j) {
 		struct bnx2x_fastpath *fp = &bp->fp[j];
 
 		fp->rx_bd_cons = 0;
@@ -911,7 +911,7 @@ static void bnx2x_free_tx_skbs(struct bnx2x *bp)
 {
 	int i;
 
-	for_each_queue(bp, i) {
+	for_each_tx_queue(bp, i) {
 		struct bnx2x_fastpath *fp = &bp->fp[i];
 
 		u16 bd_cons = fp->tx_bd_cons;
@@ -929,7 +929,7 @@ static void bnx2x_free_rx_skbs(struct bnx2x *bp)
 {
 	int i, j;
 
-	for_each_queue(bp, j) {
+	for_each_rx_queue(bp, j) {
 		struct bnx2x_fastpath *fp = &bp->fp[j];
 
 		for (i = 0; i < NUM_RX_BD; i++) {
@@ -970,7 +970,7 @@ static void bnx2x_free_msix_irqs(struct bnx2x *bp)
 #ifdef BCM_CNIC
 	offset++;
 #endif
-	for_each_queue(bp, i) {
+	for_each_eth_queue(bp, i) {
 		DP(NETIF_MSG_IFDOWN, "about to release fp #%d->%d irq  "
 		   "state %x\n", i, bp->msix_table[i + offset].vector,
 		   bnx2x_fp(bp, i, state));
@@ -1004,14 +1004,14 @@ int bnx2x_enable_msix(struct bnx2x *bp)
 	   bp->msix_table[msix_vec].entry, bp->msix_table[msix_vec].entry);
 	msix_vec++;
 #endif
-	for_each_queue(bp, i) {
+	for_each_eth_queue(bp, i) {
 		bp->msix_table[msix_vec].entry = msix_vec;
 		DP(NETIF_MSG_IFUP, "msix_table[%d].entry = %d "
 		   "(fastpath #%u)\n", msix_vec, msix_vec, i);
 		msix_vec++;
 	}
 
-	req_cnt = BNX2X_NUM_QUEUES(bp) + CNIC_CONTEXT_USE + 1;
+	req_cnt = BNX2X_NUM_ETH_QUEUES(bp) + CNIC_CONTEXT_USE + 1;
 
 	rc = pci_enable_msix(bp->pdev, &bp->msix_table[0], req_cnt);
 
@@ -1067,7 +1067,7 @@ static int bnx2x_req_msix_irqs(struct bnx2x *bp)
 #ifdef BCM_CNIC
 	offset++;
 #endif
-	for_each_queue(bp, i) {
+	for_each_eth_queue(bp, i) {
 		struct bnx2x_fastpath *fp = &bp->fp[i];
 		snprintf(fp->name, sizeof(fp->name), "%s-fp-%d",
 			 bp->dev->name, i);
@@ -1084,7 +1084,7 @@ static int bnx2x_req_msix_irqs(struct bnx2x *bp)
 		fp->state = BNX2X_FP_STATE_IRQ;
 	}
 
-	i = BNX2X_NUM_QUEUES(bp);
+	i = BNX2X_NUM_ETH_QUEUES(bp);
 	offset = 1 + CNIC_CONTEXT_USE;
 	netdev_info(bp->dev, "using MSI-X  IRQs: sp %d  fp[%d] %d"
 	       " ... fp[%d] %d\n",
@@ -1131,7 +1131,7 @@ static void bnx2x_napi_enable(struct bnx2x *bp)
 {
 	int i;
 
-	for_each_queue(bp, i)
+	for_each_napi_queue(bp, i)
 		napi_enable(&bnx2x_fp(bp, i, napi));
 }
 
@@ -1139,7 +1139,7 @@ static void bnx2x_napi_disable(struct bnx2x *bp)
 {
 	int i;
 
-	for_each_queue(bp, i)
+	for_each_napi_queue(bp, i)
 		napi_disable(&bnx2x_fp(bp, i, napi));
 }
 
@@ -1181,7 +1181,22 @@ void bnx2x_set_num_queues(struct bnx2x *bp)
 		bp->num_queues = 1;
 		break;
 	}
+
+	/* Add special queues */
+	bp->num_queues += NONE_ETH_CONTEXT_USE;
+}
+
+#ifdef BCM_CNIC
+static inline void bnx2x_set_fcoe_eth_macs(struct bnx2x *bp)
+{
+	if (!NO_FCOE(bp)) {
+		if (!IS_MF_SD(bp))
+			bnx2x_set_fip_eth_mac_addr(bp, 1);
+		bnx2x_set_all_enode_macs(bp, 1);
+		bp->flags |= FCOE_MACS_SET;
+	}
 }
+#endif
 
 static void bnx2x_release_firmware(struct bnx2x *bp)
 {
@@ -1191,6 +1206,20 @@ static void bnx2x_release_firmware(struct bnx2x *bp)
 	release_firmware(bp->firmware);
 }
 
+static inline int bnx2x_set_real_num_queues(struct bnx2x *bp)
+{
+	int rc, num = bp->num_queues;
+
+#ifdef BCM_CNIC
+	if (NO_FCOE(bp))
+		num -= FCOE_CONTEXT_USE;
+
+#endif
+	netif_set_real_num_tx_queues(bp->dev, num);
+	rc = netif_set_real_num_rx_queues(bp->dev, num);
+	return rc;
+}
+
 /* must be called with rtnl_lock */
 int bnx2x_nic_load(struct bnx2x *bp, int load_mode)
 {
@@ -1217,10 +1246,9 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode)
 	if (bnx2x_alloc_mem(bp))
 		return -ENOMEM;
 
-	netif_set_real_num_tx_queues(bp->dev, bp->num_queues);
-	rc = netif_set_real_num_rx_queues(bp->dev, bp->num_queues);
+	rc = bnx2x_set_real_num_queues(bp);
 	if (rc) {
-		BNX2X_ERR("Unable to update real_num_rx_queues\n");
+		BNX2X_ERR("Unable to set real_num_queues\n");
 		goto load_error0;
 	}
 
@@ -1228,6 +1256,10 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode)
 		bnx2x_fp(bp, i, disable_tpa) =
 					((bp->flags & TPA_ENABLE_FLAG) == 0);
 
+#ifdef BCM_CNIC
+	/* We don't want TPA on FCoE L2 ring */
+	bnx2x_fcoe(bp, disable_tpa) = 1;
+#endif
 	bnx2x_napi_enable(bp);
 
 	/* Send LOAD_REQUEST command to MCP
@@ -1358,6 +1390,10 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode)
 	/* Now when Clients are configured we are ready to work */
 	bp->state = BNX2X_STATE_OPEN;
 
+#ifdef BCM_CNIC
+	bnx2x_set_fcoe_eth_macs(bp);
+#endif
+
 	bnx2x_set_eth_mac(bp, 1);
 
 	if (bp->port.pmf)
@@ -1416,7 +1452,7 @@ load_error3:
 
 	/* Free SKBs, SGEs, TPA pool and driver internals */
 	bnx2x_free_skbs(bp);
-	for_each_queue(bp, i)
+	for_each_rx_queue(bp, i)
 		bnx2x_free_rx_sge_range(bp, bp->fp + i, NUM_RX_SGE);
 
 	/* Release IRQs */
@@ -1487,7 +1523,7 @@ int bnx2x_nic_unload(struct bnx2x *bp, int unload_mode)
 
 	/* Free SKBs, SGEs, TPA pool and driver internals */
 	bnx2x_free_skbs(bp);
-	for_each_queue(bp, i)
+	for_each_rx_queue(bp, i)
 		bnx2x_free_rx_sge_range(bp, bp->fp + i, NUM_RX_SGE);
 
 	bnx2x_free_mem(bp);
@@ -1591,6 +1627,17 @@ int bnx2x_poll(struct napi_struct *napi, int budget)
 
 		/* Fall out from the NAPI loop if needed */
 		if (!(bnx2x_has_rx_work(fp) || bnx2x_has_tx_work(fp))) {
+#ifdef BCM_CNIC
+			/* No need to update SB for FCoE L2 ring as long as
+			 * it's connected to the default SB and the SB
+			 * has been updated when NAPI was scheduled.
+			 */
+			if (IS_FCOE_FP(fp)) {
+				napi_complete(napi);
+				break;
+			}
+#endif
+
 			bnx2x_update_fpsb_idx(fp);
 			/* bnx2x_has_rx_work() reads the status block,
 			 * thus we need to ensure that status block indices
@@ -2255,7 +2302,7 @@ int __devinit bnx2x_alloc_mem_bp(struct bnx2x *bp)
 	bp->fp = fp;
 
 	/* msix table */
-	tbl = kzalloc((bp->l2_cid_count + 1) * sizeof(*tbl),
+	tbl = kzalloc((FP_SB_COUNT(bp->l2_cid_count) + 1) * sizeof(*tbl),
 				  GFP_KERNEL);
 	if (!tbl)
 		goto alloc_err;
diff --git a/drivers/net/bnx2x/bnx2x_cmn.h b/drivers/net/bnx2x/bnx2x_cmn.h
index cb8f2a0..4bb0113 100644
--- a/drivers/net/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/bnx2x/bnx2x_cmn.h
@@ -242,6 +242,30 @@ int bnx2x_release_hw_lock(struct bnx2x *bp, u32 resource);
  */
 void bnx2x_set_eth_mac(struct bnx2x *bp, int set);
 
+#ifdef BCM_CNIC
+/**
+ * Set/Clear FIP MAC(s) at the next enties in the CAM after the ETH
+ * MAC(s). This function will wait until the ramdord completion
+ * returns.
+ *
+ * @param bp driver handle
+ * @param set set or clear the CAM entry
+ *
+ * @return 0 if cussess, -ENODEV if ramrod doesn't return.
+ */
+int bnx2x_set_fip_eth_mac_addr(struct bnx2x *bp, int set);
+
+/**
+ * Set/Clear ALL_ENODE mcast MAC.
+ *
+ * @param bp
+ * @param set
+ *
+ * @return int
+ */
+int bnx2x_set_all_enode_macs(struct bnx2x *bp, int set);
+#endif
+
 /**
  * Set MAC filtering configurations.
  *
@@ -695,7 +719,7 @@ static inline void bnx2x_add_all_napi(struct bnx2x *bp)
 	int i;
 
 	/* Add NAPI objects */
-	for_each_queue(bp, i)
+	for_each_napi_queue(bp, i)
 		netif_napi_add(bp->dev, &bnx2x_fp(bp, i, napi),
 			       bnx2x_poll, BNX2X_NAPI_WEIGHT);
 }
@@ -704,7 +728,7 @@ static inline void bnx2x_del_all_napi(struct bnx2x *bp)
 {
 	int i;
 
-	for_each_queue(bp, i)
+	for_each_napi_queue(bp, i)
 		netif_napi_del(&bnx2x_fp(bp, i, napi));
 }
 
@@ -870,7 +894,7 @@ static inline void bnx2x_init_tx_rings(struct bnx2x *bp)
 {
 	int i, j;
 
-	for_each_queue(bp, j) {
+	for_each_tx_queue(bp, j) {
 		struct bnx2x_fastpath *fp = &bp->fp[j];
 
 		for (i = 1; i <= NUM_TX_RINGS; i++) {
@@ -949,7 +973,30 @@ static inline void bnx2x_set_next_page_rx_cq(struct bnx2x_fastpath *fp)
 	}
 }
 
+#ifdef BCM_CNIC
+static inline void bnx2x_init_fcoe_fp(struct bnx2x *bp)
+{
+	bnx2x_fcoe(bp, cl_id) = BNX2X_FCOE_ETH_CL_ID +
+		BP_E1HVN(bp) * NONE_ETH_CONTEXT_USE;
+	bnx2x_fcoe(bp, cid) = BNX2X_FCOE_ETH_CID;
+	bnx2x_fcoe(bp, fw_sb_id) = DEF_SB_ID;
+	bnx2x_fcoe(bp, igu_sb_id) = bp->igu_dsb_id;
+	bnx2x_fcoe(bp, bp) = bp;
+	bnx2x_fcoe(bp, state) = BNX2X_FP_STATE_CLOSED;
+	bnx2x_fcoe(bp, index) = FCOE_IDX;
+	bnx2x_fcoe(bp, rx_cons_sb) = BNX2X_FCOE_L2_RX_INDEX;
+	bnx2x_fcoe(bp, tx_cons_sb) = BNX2X_FCOE_L2_TX_INDEX;
+	/* qZone id equals to FW (per path) client id */
+	bnx2x_fcoe(bp, cl_qzone_id) = bnx2x_fcoe(bp, cl_id) +
+		BP_PORT(bp)*(CHIP_IS_E2(bp) ? ETH_MAX_RX_CLIENTS_E2 :
+				ETH_MAX_RX_CLIENTS_E1H);
+	/* init shortcut */
+	bnx2x_fcoe(bp, ustorm_rx_prods_offset) = CHIP_IS_E2(bp) ?
+	    USTORM_RX_PRODS_E2_OFFSET(bnx2x_fcoe(bp, cl_qzone_id)) :
+	    USTORM_RX_PRODS_E1X_OFFSET(BP_PORT(bp), bnx2x_fcoe_fp(bp)->cl_id);
 
+}
+#endif
 
 static inline void __storm_memset_struct(struct bnx2x *bp,
 					 u32 addr, size_t size, u32 *data)
diff --git a/drivers/net/bnx2x/bnx2x_ethtool.c b/drivers/net/bnx2x/bnx2x_ethtool.c
index bd94827..99c672d 100644
--- a/drivers/net/bnx2x/bnx2x_ethtool.c
+++ b/drivers/net/bnx2x/bnx2x_ethtool.c
@@ -25,6 +25,143 @@
 #include "bnx2x_cmn.h"
 #include "bnx2x_dump.h"
 
+/* Note: in the format strings below %s is replaced by the queue-name which is
+ * either its index or 'fcoe' for the fcoe queue. Make sure the format string
+ * length does not exceed ETH_GSTRING_LEN - MAX_QUEUE_NAME_LEN + 2
+ */
+#define MAX_QUEUE_NAME_LEN	4
+static const struct {
+	long offset;
+	int size;
+	char string[ETH_GSTRING_LEN];
+} bnx2x_q_stats_arr[] = {
+/* 1 */	{ Q_STATS_OFFSET32(total_bytes_received_hi), 8, "[%s]: rx_bytes" },
+	{ Q_STATS_OFFSET32(error_bytes_received_hi),
+						8, "[%s]: rx_error_bytes" },
+	{ Q_STATS_OFFSET32(total_unicast_packets_received_hi),
+						8, "[%s]: rx_ucast_packets" },
+	{ Q_STATS_OFFSET32(total_multicast_packets_received_hi),
+						8, "[%s]: rx_mcast_packets" },
+	{ Q_STATS_OFFSET32(total_broadcast_packets_received_hi),
+						8, "[%s]: rx_bcast_packets" },
+	{ Q_STATS_OFFSET32(no_buff_discard_hi),	8, "[%s]: rx_discards" },
+	{ Q_STATS_OFFSET32(rx_err_discard_pkt),
+					 4, "[%s]: rx_phy_ip_err_discards"},
+	{ Q_STATS_OFFSET32(rx_skb_alloc_failed),
+					 4, "[%s]: rx_skb_alloc_discard" },
+	{ Q_STATS_OFFSET32(hw_csum_err), 4, "[%s]: rx_csum_offload_errors" },
+
+/* 10 */{ Q_STATS_OFFSET32(total_bytes_transmitted_hi),	8, "[%s]: tx_bytes" },
+	{ Q_STATS_OFFSET32(total_unicast_packets_transmitted_hi),
+						8, "[%s]: tx_ucast_packets" },
+	{ Q_STATS_OFFSET32(total_multicast_packets_transmitted_hi),
+						8, "[%s]: tx_mcast_packets" },
+	{ Q_STATS_OFFSET32(total_broadcast_packets_transmitted_hi),
+						8, "[%s]: tx_bcast_packets" }
+};
+
+#define BNX2X_NUM_Q_STATS ARRAY_SIZE(bnx2x_q_stats_arr)
+
+static const struct {
+	long offset;
+	int size;
+	u32 flags;
+#define STATS_FLAGS_PORT		1
+#define STATS_FLAGS_FUNC		2
+#define STATS_FLAGS_BOTH		(STATS_FLAGS_FUNC | STATS_FLAGS_PORT)
+	char string[ETH_GSTRING_LEN];
+} bnx2x_stats_arr[] = {
+/* 1 */	{ STATS_OFFSET32(total_bytes_received_hi),
+				8, STATS_FLAGS_BOTH, "rx_bytes" },
+	{ STATS_OFFSET32(error_bytes_received_hi),
+				8, STATS_FLAGS_BOTH, "rx_error_bytes" },
+	{ STATS_OFFSET32(total_unicast_packets_received_hi),
+				8, STATS_FLAGS_BOTH, "rx_ucast_packets" },
+	{ STATS_OFFSET32(total_multicast_packets_received_hi),
+				8, STATS_FLAGS_BOTH, "rx_mcast_packets" },
+	{ STATS_OFFSET32(total_broadcast_packets_received_hi),
+				8, STATS_FLAGS_BOTH, "rx_bcast_packets" },
+	{ STATS_OFFSET32(rx_stat_dot3statsfcserrors_hi),
+				8, STATS_FLAGS_PORT, "rx_crc_errors" },
+	{ STATS_OFFSET32(rx_stat_dot3statsalignmenterrors_hi),
+				8, STATS_FLAGS_PORT, "rx_align_errors" },
+	{ STATS_OFFSET32(rx_stat_etherstatsundersizepkts_hi),
+				8, STATS_FLAGS_PORT, "rx_undersize_packets" },
+	{ STATS_OFFSET32(etherstatsoverrsizepkts_hi),
+				8, STATS_FLAGS_PORT, "rx_oversize_packets" },
+/* 10 */{ STATS_OFFSET32(rx_stat_etherstatsfragments_hi),
+				8, STATS_FLAGS_PORT, "rx_fragments" },
+	{ STATS_OFFSET32(rx_stat_etherstatsjabbers_hi),
+				8, STATS_FLAGS_PORT, "rx_jabbers" },
+	{ STATS_OFFSET32(no_buff_discard_hi),
+				8, STATS_FLAGS_BOTH, "rx_discards" },
+	{ STATS_OFFSET32(mac_filter_discard),
+				4, STATS_FLAGS_PORT, "rx_filtered_packets" },
+	{ STATS_OFFSET32(xxoverflow_discard),
+				4, STATS_FLAGS_PORT, "rx_fw_discards" },
+	{ STATS_OFFSET32(brb_drop_hi),
+				8, STATS_FLAGS_PORT, "rx_brb_discard" },
+	{ STATS_OFFSET32(brb_truncate_hi),
+				8, STATS_FLAGS_PORT, "rx_brb_truncate" },
+	{ STATS_OFFSET32(pause_frames_received_hi),
+				8, STATS_FLAGS_PORT, "rx_pause_frames" },
+	{ STATS_OFFSET32(rx_stat_maccontrolframesreceived_hi),
+				8, STATS_FLAGS_PORT, "rx_mac_ctrl_frames" },
+	{ STATS_OFFSET32(nig_timer_max),
+			4, STATS_FLAGS_PORT, "rx_constant_pause_events" },
+/* 20 */{ STATS_OFFSET32(rx_err_discard_pkt),
+				4, STATS_FLAGS_BOTH, "rx_phy_ip_err_discards"},
+	{ STATS_OFFSET32(rx_skb_alloc_failed),
+				4, STATS_FLAGS_BOTH, "rx_skb_alloc_discard" },
+	{ STATS_OFFSET32(hw_csum_err),
+				4, STATS_FLAGS_BOTH, "rx_csum_offload_errors" },
+
+	{ STATS_OFFSET32(total_bytes_transmitted_hi),
+				8, STATS_FLAGS_BOTH, "tx_bytes" },
+	{ STATS_OFFSET32(tx_stat_ifhcoutbadoctets_hi),
+				8, STATS_FLAGS_PORT, "tx_error_bytes" },
+	{ STATS_OFFSET32(total_unicast_packets_transmitted_hi),
+				8, STATS_FLAGS_BOTH, "tx_ucast_packets" },
+	{ STATS_OFFSET32(total_multicast_packets_transmitted_hi),
+				8, STATS_FLAGS_BOTH, "tx_mcast_packets" },
+	{ STATS_OFFSET32(total_broadcast_packets_transmitted_hi),
+				8, STATS_FLAGS_BOTH, "tx_bcast_packets" },
+	{ STATS_OFFSET32(tx_stat_dot3statsinternalmactransmiterrors_hi),
+				8, STATS_FLAGS_PORT, "tx_mac_errors" },
+	{ STATS_OFFSET32(rx_stat_dot3statscarriersenseerrors_hi),
+				8, STATS_FLAGS_PORT, "tx_carrier_errors" },
+/* 30 */{ STATS_OFFSET32(tx_stat_dot3statssinglecollisionframes_hi),
+				8, STATS_FLAGS_PORT, "tx_single_collisions" },
+	{ STATS_OFFSET32(tx_stat_dot3statsmultiplecollisionframes_hi),
+				8, STATS_FLAGS_PORT, "tx_multi_collisions" },
+	{ STATS_OFFSET32(tx_stat_dot3statsdeferredtransmissions_hi),
+				8, STATS_FLAGS_PORT, "tx_deferred" },
+	{ STATS_OFFSET32(tx_stat_dot3statsexcessivecollisions_hi),
+				8, STATS_FLAGS_PORT, "tx_excess_collisions" },
+	{ STATS_OFFSET32(tx_stat_dot3statslatecollisions_hi),
+				8, STATS_FLAGS_PORT, "tx_late_collisions" },
+	{ STATS_OFFSET32(tx_stat_etherstatscollisions_hi),
+				8, STATS_FLAGS_PORT, "tx_total_collisions" },
+	{ STATS_OFFSET32(tx_stat_etherstatspkts64octets_hi),
+				8, STATS_FLAGS_PORT, "tx_64_byte_packets" },
+	{ STATS_OFFSET32(tx_stat_etherstatspkts65octetsto127octets_hi),
+			8, STATS_FLAGS_PORT, "tx_65_to_127_byte_packets" },
+	{ STATS_OFFSET32(tx_stat_etherstatspkts128octetsto255octets_hi),
+			8, STATS_FLAGS_PORT, "tx_128_to_255_byte_packets" },
+	{ STATS_OFFSET32(tx_stat_etherstatspkts256octetsto511octets_hi),
+			8, STATS_FLAGS_PORT, "tx_256_to_511_byte_packets" },
+/* 40 */{ STATS_OFFSET32(tx_stat_etherstatspkts512octetsto1023octets_hi),
+			8, STATS_FLAGS_PORT, "tx_512_to_1023_byte_packets" },
+	{ STATS_OFFSET32(etherstatspkts1024octetsto1522octets_hi),
+			8, STATS_FLAGS_PORT, "tx_1024_to_1522_byte_packets" },
+	{ STATS_OFFSET32(etherstatspktsover1522octets_hi),
+			8, STATS_FLAGS_PORT, "tx_1523_to_9022_byte_packets" },
+	{ STATS_OFFSET32(pause_frames_sent_hi),
+				8, STATS_FLAGS_PORT, "tx_pause_frames" }
+};
+
+#define BNX2X_NUM_STATS		ARRAY_SIZE(bnx2x_stats_arr)
+
 static int bnx2x_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 {
 	struct bnx2x *bp = netdev_priv(dev);
@@ -1318,7 +1455,7 @@ static int bnx2x_test_registers(struct bnx2x *bp)
 
 			save_val = REG_RD(bp, offset);
 
-			REG_WR(bp, offset, (wr_val & mask));
+			REG_WR(bp, offset, wr_val & mask);
 
 			val = REG_RD(bp, offset);
 
@@ -1689,7 +1826,7 @@ static int bnx2x_test_intr(struct bnx2x *bp)
 	config->hdr.client_id = bp->fp->cl_id;
 	config->hdr.reserved1 = 0;
 
-	bp->set_mac_pending++;
+	bp->set_mac_pending = 1;
 	smp_wmb();
 	rc = bnx2x_sp_post(bp, RAMROD_CMD_ID_COMMON_SET_MAC, 0,
 			   U64_HI(bnx2x_sp_mapping(bp, mac_config)),
@@ -1787,134 +1924,6 @@ static void bnx2x_self_test(struct net_device *dev,
 #endif
 }
 
-static const struct {
-	long offset;
-	int size;
-	u8 string[ETH_GSTRING_LEN];
-} bnx2x_q_stats_arr[BNX2X_NUM_Q_STATS] = {
-/* 1 */	{ Q_STATS_OFFSET32(total_bytes_received_hi), 8, "[%d]: rx_bytes" },
-	{ Q_STATS_OFFSET32(error_bytes_received_hi),
-						8, "[%d]: rx_error_bytes" },
-	{ Q_STATS_OFFSET32(total_unicast_packets_received_hi),
-						8, "[%d]: rx_ucast_packets" },
-	{ Q_STATS_OFFSET32(total_multicast_packets_received_hi),
-						8, "[%d]: rx_mcast_packets" },
-	{ Q_STATS_OFFSET32(total_broadcast_packets_received_hi),
-						8, "[%d]: rx_bcast_packets" },
-	{ Q_STATS_OFFSET32(no_buff_discard_hi),	8, "[%d]: rx_discards" },
-	{ Q_STATS_OFFSET32(rx_err_discard_pkt),
-					 4, "[%d]: rx_phy_ip_err_discards"},
-	{ Q_STATS_OFFSET32(rx_skb_alloc_failed),
-					 4, "[%d]: rx_skb_alloc_discard" },
-	{ Q_STATS_OFFSET32(hw_csum_err), 4, "[%d]: rx_csum_offload_errors" },
-
-/* 10 */{ Q_STATS_OFFSET32(total_bytes_transmitted_hi),	8, "[%d]: tx_bytes" },
-	{ Q_STATS_OFFSET32(total_unicast_packets_transmitted_hi),
-						8, "[%d]: tx_ucast_packets" },
-	{ Q_STATS_OFFSET32(total_multicast_packets_transmitted_hi),
-						8, "[%d]: tx_mcast_packets" },
-	{ Q_STATS_OFFSET32(total_broadcast_packets_transmitted_hi),
-						8, "[%d]: tx_bcast_packets" }
-};
-
-static const struct {
-	long offset;
-	int size;
-	u32 flags;
-#define STATS_FLAGS_PORT		1
-#define STATS_FLAGS_FUNC		2
-#define STATS_FLAGS_BOTH		(STATS_FLAGS_FUNC | STATS_FLAGS_PORT)
-	u8 string[ETH_GSTRING_LEN];
-} bnx2x_stats_arr[BNX2X_NUM_STATS] = {
-/* 1 */	{ STATS_OFFSET32(total_bytes_received_hi),
-				8, STATS_FLAGS_BOTH, "rx_bytes" },
-	{ STATS_OFFSET32(error_bytes_received_hi),
-				8, STATS_FLAGS_BOTH, "rx_error_bytes" },
-	{ STATS_OFFSET32(total_unicast_packets_received_hi),
-				8, STATS_FLAGS_BOTH, "rx_ucast_packets" },
-	{ STATS_OFFSET32(total_multicast_packets_received_hi),
-				8, STATS_FLAGS_BOTH, "rx_mcast_packets" },
-	{ STATS_OFFSET32(total_broadcast_packets_received_hi),
-				8, STATS_FLAGS_BOTH, "rx_bcast_packets" },
-	{ STATS_OFFSET32(rx_stat_dot3statsfcserrors_hi),
-				8, STATS_FLAGS_PORT, "rx_crc_errors" },
-	{ STATS_OFFSET32(rx_stat_dot3statsalignmenterrors_hi),
-				8, STATS_FLAGS_PORT, "rx_align_errors" },
-	{ STATS_OFFSET32(rx_stat_etherstatsundersizepkts_hi),
-				8, STATS_FLAGS_PORT, "rx_undersize_packets" },
-	{ STATS_OFFSET32(etherstatsoverrsizepkts_hi),
-				8, STATS_FLAGS_PORT, "rx_oversize_packets" },
-/* 10 */{ STATS_OFFSET32(rx_stat_etherstatsfragments_hi),
-				8, STATS_FLAGS_PORT, "rx_fragments" },
-	{ STATS_OFFSET32(rx_stat_etherstatsjabbers_hi),
-				8, STATS_FLAGS_PORT, "rx_jabbers" },
-	{ STATS_OFFSET32(no_buff_discard_hi),
-				8, STATS_FLAGS_BOTH, "rx_discards" },
-	{ STATS_OFFSET32(mac_filter_discard),
-				4, STATS_FLAGS_PORT, "rx_filtered_packets" },
-	{ STATS_OFFSET32(xxoverflow_discard),
-				4, STATS_FLAGS_PORT, "rx_fw_discards" },
-	{ STATS_OFFSET32(brb_drop_hi),
-				8, STATS_FLAGS_PORT, "rx_brb_discard" },
-	{ STATS_OFFSET32(brb_truncate_hi),
-				8, STATS_FLAGS_PORT, "rx_brb_truncate" },
-	{ STATS_OFFSET32(pause_frames_received_hi),
-				8, STATS_FLAGS_PORT, "rx_pause_frames" },
-	{ STATS_OFFSET32(rx_stat_maccontrolframesreceived_hi),
-				8, STATS_FLAGS_PORT, "rx_mac_ctrl_frames" },
-	{ STATS_OFFSET32(nig_timer_max),
-			4, STATS_FLAGS_PORT, "rx_constant_pause_events" },
-/* 20 */{ STATS_OFFSET32(rx_err_discard_pkt),
-				4, STATS_FLAGS_BOTH, "rx_phy_ip_err_discards"},
-	{ STATS_OFFSET32(rx_skb_alloc_failed),
-				4, STATS_FLAGS_BOTH, "rx_skb_alloc_discard" },
-	{ STATS_OFFSET32(hw_csum_err),
-				4, STATS_FLAGS_BOTH, "rx_csum_offload_errors" },
-
-	{ STATS_OFFSET32(total_bytes_transmitted_hi),
-				8, STATS_FLAGS_BOTH, "tx_bytes" },
-	{ STATS_OFFSET32(tx_stat_ifhcoutbadoctets_hi),
-				8, STATS_FLAGS_PORT, "tx_error_bytes" },
-	{ STATS_OFFSET32(total_unicast_packets_transmitted_hi),
-				8, STATS_FLAGS_BOTH, "tx_ucast_packets" },
-	{ STATS_OFFSET32(total_multicast_packets_transmitted_hi),
-				8, STATS_FLAGS_BOTH, "tx_mcast_packets" },
-	{ STATS_OFFSET32(total_broadcast_packets_transmitted_hi),
-				8, STATS_FLAGS_BOTH, "tx_bcast_packets" },
-	{ STATS_OFFSET32(tx_stat_dot3statsinternalmactransmiterrors_hi),
-				8, STATS_FLAGS_PORT, "tx_mac_errors" },
-	{ STATS_OFFSET32(rx_stat_dot3statscarriersenseerrors_hi),
-				8, STATS_FLAGS_PORT, "tx_carrier_errors" },
-/* 30 */{ STATS_OFFSET32(tx_stat_dot3statssinglecollisionframes_hi),
-				8, STATS_FLAGS_PORT, "tx_single_collisions" },
-	{ STATS_OFFSET32(tx_stat_dot3statsmultiplecollisionframes_hi),
-				8, STATS_FLAGS_PORT, "tx_multi_collisions" },
-	{ STATS_OFFSET32(tx_stat_dot3statsdeferredtransmissions_hi),
-				8, STATS_FLAGS_PORT, "tx_deferred" },
-	{ STATS_OFFSET32(tx_stat_dot3statsexcessivecollisions_hi),
-				8, STATS_FLAGS_PORT, "tx_excess_collisions" },
-	{ STATS_OFFSET32(tx_stat_dot3statslatecollisions_hi),
-				8, STATS_FLAGS_PORT, "tx_late_collisions" },
-	{ STATS_OFFSET32(tx_stat_etherstatscollisions_hi),
-				8, STATS_FLAGS_PORT, "tx_total_collisions" },
-	{ STATS_OFFSET32(tx_stat_etherstatspkts64octets_hi),
-				8, STATS_FLAGS_PORT, "tx_64_byte_packets" },
-	{ STATS_OFFSET32(tx_stat_etherstatspkts65octetsto127octets_hi),
-			8, STATS_FLAGS_PORT, "tx_65_to_127_byte_packets" },
-	{ STATS_OFFSET32(tx_stat_etherstatspkts128octetsto255octets_hi),
-			8, STATS_FLAGS_PORT, "tx_128_to_255_byte_packets" },
-	{ STATS_OFFSET32(tx_stat_etherstatspkts256octetsto511octets_hi),
-			8, STATS_FLAGS_PORT, "tx_256_to_511_byte_packets" },
-/* 40 */{ STATS_OFFSET32(tx_stat_etherstatspkts512octetsto1023octets_hi),
-			8, STATS_FLAGS_PORT, "tx_512_to_1023_byte_packets" },
-	{ STATS_OFFSET32(etherstatspkts1024octetsto1522octets_hi),
-			8, STATS_FLAGS_PORT, "tx_1024_to_1522_byte_packets" },
-	{ STATS_OFFSET32(etherstatspktsover1522octets_hi),
-			8, STATS_FLAGS_PORT, "tx_1523_to_9022_byte_packets" },
-	{ STATS_OFFSET32(pause_frames_sent_hi),
-				8, STATS_FLAGS_PORT, "tx_pause_frames" }
-};
-
 #define IS_PORT_STAT(i) \
 	((bnx2x_stats_arr[i].flags & STATS_FLAGS_BOTH) == STATS_FLAGS_PORT)
 #define IS_FUNC_STAT(i)		(bnx2x_stats_arr[i].flags & STATS_FLAGS_FUNC)
@@ -1929,7 +1938,8 @@ static int bnx2x_get_sset_count(struct net_device *dev, int stringset)
 	switch (stringset) {
 	case ETH_SS_STATS:
 		if (is_multi(bp)) {
-			num_stats = BNX2X_NUM_Q_STATS * bp->num_queues;
+			num_stats = BNX2X_NUM_STAT_QUEUES(bp) *
+				BNX2X_NUM_Q_STATS;
 			if (!IS_MF_MODE_STAT(bp))
 				num_stats += BNX2X_NUM_STATS;
 		} else {
@@ -1955,15 +1965,25 @@ static void bnx2x_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
 {
 	struct bnx2x *bp = netdev_priv(dev);
 	int i, j, k;
+	char queue_name[MAX_QUEUE_NAME_LEN+1];
 
 	switch (stringset) {
 	case ETH_SS_STATS:
 		if (is_multi(bp)) {
 			k = 0;
-			for_each_queue(bp, i) {
+			for_each_napi_queue(bp, i) {
+				memset(queue_name, 0, sizeof(queue_name));
+
+				if (IS_FCOE_IDX(i))
+					sprintf(queue_name, "fcoe");
+				else
+					sprintf(queue_name, "%d", i);
+
 				for (j = 0; j < BNX2X_NUM_Q_STATS; j++)
-					sprintf(buf + (k + j)*ETH_GSTRING_LEN,
-						bnx2x_q_stats_arr[j].string, i);
+					snprintf(buf + (k + j)*ETH_GSTRING_LEN,
+						ETH_GSTRING_LEN,
+						bnx2x_q_stats_arr[j].string,
+						queue_name);
 				k += BNX2X_NUM_Q_STATS;
 			}
 			if (IS_MF_MODE_STAT(bp))
@@ -1997,7 +2017,7 @@ static void bnx2x_get_ethtool_stats(struct net_device *dev,
 
 	if (is_multi(bp)) {
 		k = 0;
-		for_each_queue(bp, i) {
+		for_each_napi_queue(bp, i) {
 			hw_stats = (u32 *)&bp->fp[i].eth_q_stats;
 			for (j = 0; j < BNX2X_NUM_Q_STATS; j++) {
 				if (bnx2x_q_stats_arr[j].size == 0) {
diff --git a/drivers/net/bnx2x/bnx2x_main.c b/drivers/net/bnx2x/bnx2x_main.c
index 0068a1d..e6e2746 100644
--- a/drivers/net/bnx2x/bnx2x_main.c
+++ b/drivers/net/bnx2x/bnx2x_main.c
@@ -121,6 +121,10 @@ MODULE_PARM_DESC(debug, " Default debug msglevel");
 
 static struct workqueue_struct *bnx2x_wq;
 
+#ifdef BCM_CNIC
+static u8 ALL_ENODE_MACS[] = {0x01, 0x10, 0x18, 0x01, 0x00, 0x01};
+#endif
+
 enum bnx2x_board_type {
 	BCM57710 = 0,
 	BCM57711 = 1,
@@ -921,7 +925,7 @@ void bnx2x_panic_dump(struct bnx2x *bp)
 	       sp_sb_data.p_func.vf_valid);
 
 
-	for_each_queue(bp, i) {
+	for_each_eth_queue(bp, i) {
 		struct bnx2x_fastpath *fp = &bp->fp[i];
 		int loop;
 		struct hc_status_block_data_e2 sb_data_e2;
@@ -961,6 +965,10 @@ void bnx2x_panic_dump(struct bnx2x *bp)
 
 		/* host sb data */
 
+#ifdef BCM_CNIC
+		if (IS_FCOE_FP(fp))
+			continue;
+#endif
 		BNX2X_ERR("     run indexes (");
 		for (j = 0; j < HC_SB_MAX_SM; j++)
 			pr_cont("0x%x%s",
@@ -1029,7 +1037,7 @@ void bnx2x_panic_dump(struct bnx2x *bp)
 #ifdef BNX2X_STOP_ON_ERROR
 	/* Rings */
 	/* Rx */
-	for_each_queue(bp, i) {
+	for_each_rx_queue(bp, i) {
 		struct bnx2x_fastpath *fp = &bp->fp[i];
 
 		start = RX_BD(le16_to_cpu(*fp->rx_cons_sb) - 10);
@@ -1063,7 +1071,7 @@ void bnx2x_panic_dump(struct bnx2x *bp)
 	}
 
 	/* Tx */
-	for_each_queue(bp, i) {
+	for_each_tx_queue(bp, i) {
 		struct bnx2x_fastpath *fp = &bp->fp[i];
 
 		start = TX_BD(le16_to_cpu(*fp->tx_cons_sb) - 10);
@@ -1298,7 +1306,7 @@ void bnx2x_int_disable_sync(struct bnx2x *bp, int disable_hw)
 #ifdef BCM_CNIC
 		offset++;
 #endif
-		for_each_queue(bp, i)
+		for_each_eth_queue(bp, i)
 			synchronize_irq(bp->msix_table[i + offset].vector);
 	} else
 		synchronize_irq(bp->pdev->irq);
@@ -1420,7 +1428,7 @@ irqreturn_t bnx2x_interrupt(int irq, void *dev_instance)
 		return IRQ_HANDLED;
 #endif
 
-	for_each_queue(bp, i) {
+	for_each_eth_queue(bp, i) {
 		struct bnx2x_fastpath *fp = &bp->fp[i];
 
 		mask = 0x2 << (fp->index + CNIC_CONTEXT_USE);
@@ -2253,6 +2261,15 @@ u32 bnx2x_fw_command(struct bnx2x *bp, u32 command, u32 param)
 	return rc;
 }
 
+static u8 stat_counter_valid(struct bnx2x *bp, struct bnx2x_fastpath *fp)
+{
+#ifdef BCM_CNIC
+	if (IS_FCOE_FP(fp) && IS_MF(bp))
+		return false;
+#endif
+	return true;
+}
+
 /* must be called under rtnl_lock */
 static void bnx2x_rxq_set_mac_filters(struct bnx2x *bp, u16 cl_id, u32 filters)
 {
@@ -2411,7 +2428,8 @@ static inline u16 bnx2x_get_cl_flags(struct bnx2x *bp,
 	if (!fp->disable_tpa)
 		flags |= QUEUE_FLG_TPA;
 
-	flags |= QUEUE_FLG_STATS;
+	flags = stat_counter_valid(bp, fp) ?
+			(flags | QUEUE_FLG_STATS) : (flags & ~QUEUE_FLG_STATS);
 
 	return flags;
 }
@@ -2471,7 +2489,10 @@ static void bnx2x_pf_rx_cl_prep(struct bnx2x *bp,
 	rxq_init->cache_line_log = BNX2X_RX_ALIGN_SHIFT;
 	rxq_init->fw_sb_id = fp->fw_sb_id;
 
-	rxq_init->sb_cq_index = U_SB_ETH_RX_CQ_INDEX;
+	if (IS_FCOE_FP(fp))
+		rxq_init->sb_cq_index = HC_SP_INDEX_ETH_FCOE_RX_CQ_CONS;
+	else
+		rxq_init->sb_cq_index = U_SB_ETH_RX_CQ_INDEX;
 
 	rxq_init->cid = HW_CID(bp, fp->cid);
 
@@ -2491,6 +2512,12 @@ static void bnx2x_pf_tx_cl_prep(struct bnx2x *bp,
 	txq_init->sb_cq_index = C_SB_ETH_TX_CQ_INDEX;
 	txq_init->traffic_type = LLFC_TRAFFIC_TYPE_NW;
 	txq_init->fw_sb_id = fp->fw_sb_id;
+
+	if (IS_FCOE_FP(fp)) {
+		txq_init->sb_cq_index = HC_SP_INDEX_ETH_FCOE_TX_CQ_CONS;
+		txq_init->traffic_type = LLFC_TRAFFIC_TYPE_FCOE;
+	}
+
 	txq_init->hc_rate = bp->tx_ticks ? (1000000 / bp->tx_ticks) : 0;
 }
 
@@ -3689,8 +3716,11 @@ static void bnx2x_eq_int(struct bnx2x *bp)
 #ifdef BCM_CNIC
 			if (!bnx2x_cnic_handle_cfc_del(bp, cid, elem))
 				goto next_spqe;
+			if (cid == BNX2X_FCOE_ETH_CID)
+				bnx2x_fcoe(bp, state) = BNX2X_FP_STATE_CLOSED;
+			else
 #endif
-			bnx2x_fp(bp, cid, state) =
+				bnx2x_fp(bp, cid, state) =
 						BNX2X_FP_STATE_CLOSED;
 
 			goto next_spqe;
@@ -3766,7 +3796,13 @@ static void bnx2x_sp_task(struct work_struct *work)
 
 	/* SP events: STAT_QUERY and others */
 	if (status & BNX2X_DEF_SB_IDX) {
+#ifdef BCM_CNIC
+		struct bnx2x_fastpath *fp = bnx2x_fcoe_fp(bp);
 
+		if ((!NO_FCOE(bp)) &&
+			(bnx2x_has_rx_work(fp) || bnx2x_has_tx_work(fp)))
+			napi_schedule(&bnx2x_fcoe(bp, napi));
+#endif
 		/* Handle EQ completions */
 		bnx2x_eq_int(bp);
 
@@ -4149,7 +4185,7 @@ void bnx2x_update_coalesce(struct bnx2x *bp)
 {
 	int i;
 
-	for_each_queue(bp, i)
+	for_each_eth_queue(bp, i)
 		bnx2x_update_coalesce_sb(bp, bp->fp[i].fw_sb_id,
 					 bp->rx_ticks, bp->tx_ticks);
 }
@@ -4197,13 +4233,16 @@ static void bnx2x_init_ind_table(struct bnx2x *bp)
 	for (i = 0; i < TSTORM_INDIRECTION_TABLE_SIZE; i++)
 		REG_WR8(bp, BAR_TSTRORM_INTMEM +
 			TSTORM_INDIRECTION_TABLE_OFFSET(func) + i,
-			bp->fp->cl_id + (i % bp->num_queues));
+			bp->fp->cl_id + (i % (bp->num_queues -
+				NONE_ETH_CONTEXT_USE)));
 }
 
 void bnx2x_set_storm_rx_mode(struct bnx2x *bp)
 {
 	int mode = bp->rx_mode;
+	int port = BP_PORT(bp);
 	u16 cl_id;
+	u32 def_q_filters = 0;
 
 	/* All but management unicast packets should pass to the host as well */
 	u32 llh_mask =
@@ -4214,30 +4253,42 @@ void bnx2x_set_storm_rx_mode(struct bnx2x *bp)
 
 	switch (mode) {
 	case BNX2X_RX_MODE_NONE: /* no Rx */
-		cl_id = BP_L_ID(bp);
-		bnx2x_rxq_set_mac_filters(bp, cl_id, BNX2X_ACCEPT_NONE);
+		def_q_filters = BNX2X_ACCEPT_NONE;
+#ifdef BCM_CNIC
+		if (!NO_FCOE(bp)) {
+			cl_id = bnx2x_fcoe(bp, cl_id);
+			bnx2x_rxq_set_mac_filters(bp, cl_id, BNX2X_ACCEPT_NONE);
+		}
+#endif
 		break;
 
 	case BNX2X_RX_MODE_NORMAL:
-		cl_id = BP_L_ID(bp);
-		bnx2x_rxq_set_mac_filters(bp, cl_id,
-			BNX2X_ACCEPT_UNICAST |
-			BNX2X_ACCEPT_BROADCAST |
-			BNX2X_ACCEPT_MULTICAST);
+		def_q_filters |= BNX2X_ACCEPT_UNICAST | BNX2X_ACCEPT_BROADCAST |
+				BNX2X_ACCEPT_MULTICAST;
+#ifdef BCM_CNIC
+		cl_id = bnx2x_fcoe(bp, cl_id);
+		bnx2x_rxq_set_mac_filters(bp, cl_id, BNX2X_ACCEPT_UNICAST |
+					  BNX2X_ACCEPT_MULTICAST);
+#endif
 		break;
 
 	case BNX2X_RX_MODE_ALLMULTI:
-		cl_id = BP_L_ID(bp);
-		bnx2x_rxq_set_mac_filters(bp, cl_id,
-			BNX2X_ACCEPT_UNICAST |
-			BNX2X_ACCEPT_BROADCAST |
-			BNX2X_ACCEPT_ALL_MULTICAST);
+		def_q_filters |= BNX2X_ACCEPT_UNICAST | BNX2X_ACCEPT_BROADCAST |
+				BNX2X_ACCEPT_ALL_MULTICAST;
+#ifdef BCM_CNIC
+		cl_id = bnx2x_fcoe(bp, cl_id);
+		bnx2x_rxq_set_mac_filters(bp, cl_id, BNX2X_ACCEPT_UNICAST |
+					  BNX2X_ACCEPT_MULTICAST);
+#endif
 		break;
 
 	case BNX2X_RX_MODE_PROMISC:
-		cl_id = BP_L_ID(bp);
-		bnx2x_rxq_set_mac_filters(bp, cl_id, BNX2X_PROMISCUOUS_MODE);
-
+		def_q_filters |= BNX2X_PROMISCUOUS_MODE;
+#ifdef BCM_CNIC
+		cl_id = bnx2x_fcoe(bp, cl_id);
+		bnx2x_rxq_set_mac_filters(bp, cl_id, BNX2X_ACCEPT_UNICAST |
+					  BNX2X_ACCEPT_MULTICAST);
+#endif
 		/* pass management unicast packets as well */
 		llh_mask |= NIG_LLH0_BRB1_DRV_MASK_REG_LLH0_BRB1_DRV_MASK_UNCST;
 		break;
@@ -4247,20 +4298,24 @@ void bnx2x_set_storm_rx_mode(struct bnx2x *bp)
 		break;
 	}
 
+	cl_id = BP_L_ID(bp);
+	bnx2x_rxq_set_mac_filters(bp, cl_id, def_q_filters);
+
 	REG_WR(bp,
-	       BP_PORT(bp) ? NIG_REG_LLH1_BRB1_DRV_MASK :
-			     NIG_REG_LLH0_BRB1_DRV_MASK,
-	       llh_mask);
+	       (port ? NIG_REG_LLH1_BRB1_DRV_MASK :
+		       NIG_REG_LLH0_BRB1_DRV_MASK), llh_mask);
 
 	DP(NETIF_MSG_IFUP, "rx mode %d\n"
 		"drop_ucast 0x%x\ndrop_mcast 0x%x\ndrop_bcast 0x%x\n"
-		"accp_ucast 0x%x\naccp_mcast 0x%x\naccp_bcast 0x%x\n", mode,
+		"accp_ucast 0x%x\naccp_mcast 0x%x\naccp_bcast 0x%x\n"
+		"unmatched_ucast 0x%x\n", mode,
 		bp->mac_filters.ucast_drop_all,
 		bp->mac_filters.mcast_drop_all,
 		bp->mac_filters.bcast_drop_all,
 		bp->mac_filters.ucast_accept_all,
 		bp->mac_filters.mcast_accept_all,
-		bp->mac_filters.bcast_accept_all
+		bp->mac_filters.bcast_accept_all,
+		bp->mac_filters.unmatched_unicast
 	);
 
 	storm_memset_mac_filters(bp, &bp->mac_filters, BP_FUNC(bp));
@@ -4369,9 +4424,11 @@ void bnx2x_nic_init(struct bnx2x *bp, u32 load_code)
 {
 	int i;
 
-	for_each_queue(bp, i)
+	for_each_eth_queue(bp, i)
 		bnx2x_init_fp_sb(bp, i);
 #ifdef BCM_CNIC
+	if (!NO_FCOE(bp))
+		bnx2x_init_fcoe_fp(bp);
 
 	bnx2x_init_sb(bp, bp->cnic_sb_mapping,
 		      BNX2X_VF_ID_INVALID, false,
@@ -5877,6 +5934,15 @@ void bnx2x_free_mem(struct bnx2x *bp)
 	/* fastpath */
 	/* Common */
 	for_each_queue(bp, i) {
+#ifdef BCM_CNIC
+		/* FCoE client uses default status block */
+		if (IS_FCOE_IDX(i)) {
+			union host_hc_status_block *sb =
+				&bnx2x_fp(bp, i, status_blk);
+			memset(sb, 0, sizeof(union host_hc_status_block));
+			bnx2x_fp(bp, i, status_blk_mapping) = 0;
+		} else {
+#endif
 		/* status blocks */
 		if (CHIP_IS_E2(bp))
 			BNX2X_PCI_FREE(bnx2x_fp(bp, i, status_blk.e2_sb),
@@ -5886,9 +5952,12 @@ void bnx2x_free_mem(struct bnx2x *bp)
 			BNX2X_PCI_FREE(bnx2x_fp(bp, i, status_blk.e1x_sb),
 				       bnx2x_fp(bp, i, status_blk_mapping),
 				       sizeof(struct host_hc_status_block_e1x));
+#ifdef BCM_CNIC
+		}
+#endif
 	}
 	/* Rx */
-	for_each_queue(bp, i) {
+	for_each_rx_queue(bp, i) {
 
 		/* fastpath rx rings: rx_buf rx_desc rx_comp */
 		BNX2X_FREE(bnx2x_fp(bp, i, rx_buf_ring));
@@ -5908,7 +5977,7 @@ void bnx2x_free_mem(struct bnx2x *bp)
 			       BCM_PAGE_SIZE * NUM_RX_SGE_PAGES);
 	}
 	/* Tx */
-	for_each_queue(bp, i) {
+	for_each_tx_queue(bp, i) {
 
 		/* fastpath tx rings: tx_buf tx_desc */
 		BNX2X_FREE(bnx2x_fp(bp, i, tx_buf_ring));
@@ -5992,15 +6061,20 @@ int bnx2x_alloc_mem(struct bnx2x *bp)
 		union host_hc_status_block *sb = &bnx2x_fp(bp, i, status_blk);
 		bnx2x_fp(bp, i, bp) = bp;
 		/* status blocks */
-		if (CHIP_IS_E2(bp))
-			BNX2X_PCI_ALLOC(sb->e2_sb,
-				&bnx2x_fp(bp, i, status_blk_mapping),
-				sizeof(struct host_hc_status_block_e2));
-		else
-			BNX2X_PCI_ALLOC(sb->e1x_sb,
-				&bnx2x_fp(bp, i, status_blk_mapping),
-				sizeof(struct host_hc_status_block_e1x));
-
+#ifdef BCM_CNIC
+		if (!IS_FCOE_IDX(i)) {
+#endif
+			if (CHIP_IS_E2(bp))
+				BNX2X_PCI_ALLOC(sb->e2_sb,
+				    &bnx2x_fp(bp, i, status_blk_mapping),
+				    sizeof(struct host_hc_status_block_e2));
+			else
+				BNX2X_PCI_ALLOC(sb->e1x_sb,
+				    &bnx2x_fp(bp, i, status_blk_mapping),
+				    sizeof(struct host_hc_status_block_e1x));
+#ifdef BCM_CNIC
+		}
+#endif
 		set_sb_shortcuts(bp, i);
 	}
 	/* Rx */
@@ -6410,7 +6484,8 @@ static int bnx2x_set_iscsi_eth_mac_addr(struct bnx2x *bp, int set)
 {
 	u8 cam_offset = (CHIP_IS_E1(bp) ? ((BP_PORT(bp) ? 32 : 0) + 2) :
 			 bnx2x_e1h_cam_offset(bp, CAM_ISCSI_ETH_LINE));
-	u32 iscsi_l2_cl_id = BNX2X_ISCSI_ETH_CL_ID;
+	u32 iscsi_l2_cl_id = BNX2X_ISCSI_ETH_CL_ID +
+		BP_E1HVN(bp) * NONE_ETH_CONTEXT_USE;
 	u32 cl_bit_vec = (1 << iscsi_l2_cl_id);
 
 	/* Send a SET_MAC ramrod */
@@ -6418,6 +6493,50 @@ static int bnx2x_set_iscsi_eth_mac_addr(struct bnx2x *bp, int set)
 			       cam_offset, 0);
 
 	bnx2x_set_mac_in_nig(bp, set, bp->iscsi_mac, LLH_CAM_ISCSI_ETH_LINE);
+
+	return 0;
+}
+
+/**
+ * Set FCoE L2 MAC(s) at the next enties in the CAM after the
+ * ETH MAC(s). This function will wait until the ramdord
+ * completion returns.
+ *
+ * @param bp driver handle
+ * @param set set or clear the CAM entry
+ *
+ * @return 0 if cussess, -ENODEV if ramrod doesn't return.
+ */
+int bnx2x_set_fip_eth_mac_addr(struct bnx2x *bp, int set)
+{
+	u32 cl_bit_vec = (1 << bnx2x_fcoe(bp, cl_id));
+	/**
+	 * CAM allocation for E1H
+	 * eth unicasts: by func number
+	 * iscsi: by func number
+	 * fip unicast: by func number
+	 * fip multicast: by func number
+	 */
+	bnx2x_set_mac_addr_gen(bp, set, bp->fip_mac,
+		cl_bit_vec, bnx2x_e1h_cam_offset(bp, CAM_FIP_ETH_LINE), 0);
+
+	return 0;
+}
+
+int bnx2x_set_all_enode_macs(struct bnx2x *bp, int set)
+{
+	u32 cl_bit_vec = (1 << bnx2x_fcoe(bp, cl_id));
+
+	/**
+	 * CAM allocation for E1H
+	 * eth unicasts: by func number
+	 * iscsi: by func number
+	 * fip unicast: by func number
+	 * fip multicast: by func number
+	 */
+	bnx2x_set_mac_addr_gen(bp, set, ALL_ENODE_MACS,	cl_bit_vec,
+		bnx2x_e1h_cam_offset(bp, CAM_FIP_MCAST_LINE), 0);
+
 	return 0;
 }
 #endif
@@ -6435,6 +6554,8 @@ static void bnx2x_fill_cl_init_data(struct bnx2x *bp,
 	data->general.statistics_counter_id = params->rxq_params.stat_id;
 	data->general.statistics_en_flg =
 		(params->rxq_params.flags & QUEUE_FLG_STATS) ? 1 : 0;
+	data->general.is_fcoe_flg =
+		(params->ramrod_params.flags & CLIENT_IS_FCOE) ? 1 : 0;
 	data->general.activate_flg = activate;
 	data->general.sp_client_id = params->rxq_params.spcl_id;
 
@@ -6503,7 +6624,9 @@ static void bnx2x_fill_cl_init_data(struct bnx2x *bp,
 	data->fc.safc_group_num = params->txq_params.cos;
 	data->fc.safc_group_en_flg =
 		(params->txq_params.flags & QUEUE_FLG_COS) ? 1 : 0;
-	data->fc.traffic_type = LLFC_TRAFFIC_TYPE_NW;
+	data->fc.traffic_type =
+		(params->ramrod_params.flags & CLIENT_IS_FCOE) ?
+		LLFC_TRAFFIC_TYPE_FCOE : LLFC_TRAFFIC_TYPE_NW;
 }
 
 static inline void bnx2x_set_ctx_validation(struct eth_context *cxt, u32 cid)
@@ -6602,7 +6725,7 @@ static int __devinit bnx2x_set_int_mode(struct bnx2x *bp)
 		bnx2x_enable_msi(bp);
 		/* falling through... */
 	case INT_MODE_INTx:
-		bp->num_queues = 1;
+		bp->num_queues = 1 + NONE_ETH_CONTEXT_USE;
 		DP(NETIF_MSG_IFUP, "set number of queues to 1\n");
 		break;
 	default:
@@ -6625,8 +6748,8 @@ static int __devinit bnx2x_set_int_mode(struct bnx2x *bp)
 					  "enable MSI-X (%d), "
 					  "set number of queues to %d\n",
 				   bp->num_queues,
-				   1);
-			bp->num_queues = 1;
+				   1 + NONE_ETH_CONTEXT_USE);
+			bp->num_queues = 1 + NONE_ETH_CONTEXT_USE;
 
 			if (!(bp->flags & DISABLE_MSI_FLAG))
 				bnx2x_enable_msi(bp);
@@ -6747,7 +6870,9 @@ int bnx2x_setup_client(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 	struct bnx2x_client_init_params params = { {0} };
 	int rc;
 
-	bnx2x_ack_sb(bp, fp->igu_sb_id, USTORM_ID, 0,
+	/* reset IGU state skip FCoE L2 queue */
+	if (!IS_FCOE_FP(fp))
+		bnx2x_ack_sb(bp, fp->igu_sb_id, USTORM_ID, 0,
 			     IGU_INT_ENABLE, 0);
 
 	params.ramrod_params.pstate = &fp->state;
@@ -6755,6 +6880,12 @@ int bnx2x_setup_client(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 	params.ramrod_params.index = fp->index;
 	params.ramrod_params.cid = fp->cid;
 
+#ifdef BCM_CNIC
+	if (IS_FCOE_FP(fp))
+		params.ramrod_params.flags |= CLIENT_IS_FCOE;
+
+#endif
+
 	if (is_leading)
 		params.ramrod_params.flags |= CLIENT_IS_LEADING_RSS;
 
@@ -6839,7 +6970,7 @@ static void bnx2x_reset_func(struct bnx2x *bp)
 	REG_WR8(bp, BAR_USTRORM_INTMEM + USTORM_FUNC_EN_OFFSET(func), 0);
 
 	/* FP SBs */
-	for_each_queue(bp, i) {
+	for_each_eth_queue(bp, i) {
 		struct bnx2x_fastpath *fp = &bp->fp[i];
 		REG_WR8(bp,
 			BAR_CSTRORM_INTMEM +
@@ -6959,6 +7090,20 @@ static void bnx2x_reset_chip(struct bnx2x *bp, u32 reset_code)
 	}
 }
 
+#ifdef BCM_CNIC
+static inline void bnx2x_del_fcoe_eth_macs(struct bnx2x *bp)
+{
+	if (bp->flags & FCOE_MACS_SET) {
+		if (!IS_MF_SD(bp))
+			bnx2x_set_fip_eth_mac_addr(bp, 0);
+
+		bnx2x_set_all_enode_macs(bp, 0);
+
+		bp->flags &= ~FCOE_MACS_SET;
+	}
+}
+#endif
+
 void bnx2x_chip_cleanup(struct bnx2x *bp, int unload_mode)
 {
 	int port = BP_PORT(bp);
@@ -6966,7 +7111,7 @@ void bnx2x_chip_cleanup(struct bnx2x *bp, int unload_mode)
 	int i, cnt, rc;
 
 	/* Wait until tx fastpath tasks complete */
-	for_each_queue(bp, i) {
+	for_each_tx_queue(bp, i) {
 		struct bnx2x_fastpath *fp = &bp->fp[i];
 
 		cnt = 1000;
@@ -7006,13 +7151,7 @@ void bnx2x_chip_cleanup(struct bnx2x *bp, int unload_mode)
 	}
 
 #ifdef BCM_CNIC
-	/* Clear iSCSI L2 MAC */
-	mutex_lock(&bp->cnic_mutex);
-	if (bp->cnic_flags & BNX2X_CNIC_FLAG_MAC_SET) {
-		bnx2x_set_iscsi_eth_mac_addr(bp, 0);
-		bp->cnic_flags &= ~BNX2X_CNIC_FLAG_MAC_SET;
-	}
-	mutex_unlock(&bp->cnic_mutex);
+	bnx2x_del_fcoe_eth_macs(bp);
 #endif
 
 	if (unload_mode == UNLOAD_NORMAL)
@@ -7865,7 +8004,7 @@ static void __devinit bnx2x_get_igu_cam_info(struct bnx2x *bp)
 	bp->igu_sb_cnt = 0;
 	if (CHIP_INT_MODE_IS_BC(bp)) {
 		bp->igu_sb_cnt = min_t(u8, FP_SB_MAX_E1x,
-				       bp->l2_cid_count);
+				       NUM_IGU_SB_REQUIRED(bp->l2_cid_count));
 
 		bp->igu_base_sb = (CHIP_MODE_IS_4_PORT(bp) ? pfid : vn) *
 			FP_SB_MAX_E1x;
@@ -7896,7 +8035,8 @@ static void __devinit bnx2x_get_igu_cam_info(struct bnx2x *bp)
 			}
 		}
 	}
-	bp->igu_sb_cnt = min_t(u8, bp->igu_sb_cnt, bp->l2_cid_count);
+	bp->igu_sb_cnt = min_t(u8, bp->igu_sb_cnt,
+				   NUM_IGU_SB_REQUIRED(bp->l2_cid_count));
 	if (bp->igu_sb_cnt == 0)
 		BNX2X_ERR("CAM configuration error\n");
 }
@@ -8312,6 +8452,17 @@ static void __devinit bnx2x_get_mac_hwinfo(struct bnx2x *bp)
 	memcpy(bp->link_params.mac_addr, bp->dev->dev_addr, ETH_ALEN);
 	memcpy(bp->dev->perm_addr, bp->dev->dev_addr, ETH_ALEN);
 
+#ifdef BCM_CNIC
+	/* Inform the upper layers about FCoE MAC */
+	if (!CHIP_IS_E1x(bp)) {
+		if (IS_MF_SD(bp))
+			memcpy(bp->fip_mac, bp->dev->dev_addr,
+			       sizeof(bp->fip_mac));
+		else
+			memcpy(bp->fip_mac, bp->iscsi_mac,
+			       sizeof(bp->fip_mac));
+	}
+#endif
 }
 
 static int __devinit bnx2x_get_hwinfo(struct bnx2x *bp)
@@ -8328,7 +8479,8 @@ static int __devinit bnx2x_get_hwinfo(struct bnx2x *bp)
 
 		bp->igu_dsb_id = DEF_SB_IGU_ID;
 		bp->igu_base_sb = 0;
-		bp->igu_sb_cnt = min_t(u8, FP_SB_MAX_E1x, bp->l2_cid_count);
+		bp->igu_sb_cnt = min_t(u8, FP_SB_MAX_E1x,
+				       NUM_IGU_SB_REQUIRED(bp->l2_cid_count));
 	} else {
 		bp->common.int_block = INT_BLOCK_IGU;
 		val = REG_RD(bp, IGU_REG_BLOCK_CONFIGURATION);
@@ -9263,7 +9415,7 @@ static int __devinit bnx2x_init_one(struct pci_dev *pdev,
 		return -ENODEV;
 	}
 
-	cid_count += CNIC_CONTEXT_USE;
+	cid_count += NONE_ETH_CONTEXT_USE + CNIC_CONTEXT_USE;
 
 	/* dev zeroed in init_etherdev */
 	dev = alloc_etherdev_mq(sizeof(*bp), cid_count);
@@ -9292,6 +9444,13 @@ static int __devinit bnx2x_init_one(struct pci_dev *pdev,
 	/* calc qm_cid_count */
 	bp->qm_cid_count = bnx2x_set_qm_cid_count(bp, cid_count);
 
+#ifdef BCM_CNIC
+	/* disable FCOE L2 queue for E1x*/
+	if (CHIP_IS_E1x(bp))
+		bp->flags |= NO_FCOE_FLAG;
+
+#endif
+
 	/* Configure interupt mode: try to enable MSI-X/MSI if
 	 * needed, set bp->num_queues appropriately.
 	 */
@@ -9306,6 +9465,15 @@ static int __devinit bnx2x_init_one(struct pci_dev *pdev,
 		goto init_one_exit;
 	}
 
+#ifdef BCM_CNIC
+	if (!NO_FCOE(bp)) {
+		/* Add storage MAC address */
+		rtnl_lock();
+		dev_addr_add(bp->dev, bp->fip_mac, NETDEV_HW_ADDR_T_SAN);
+		rtnl_unlock();
+	}
+#endif
+
 	bnx2x_get_pcie_width_speed(bp, &pcie_width, &pcie_speed);
 
 	netdev_info(dev, "%s (%c%d) PCI-E x%d %s found at mem %lx,"
@@ -9349,6 +9517,15 @@ static void __devexit bnx2x_remove_one(struct pci_dev *pdev)
 	}
 	bp = netdev_priv(dev);
 
+#ifdef BCM_CNIC
+	/* Delete storage MAC address */
+	if (!NO_FCOE(bp)) {
+		rtnl_lock();
+		dev_addr_del(bp->dev, bp->fip_mac, NETDEV_HW_ADDR_T_SAN);
+		rtnl_unlock();
+	}
+#endif
+
 	unregister_netdev(dev);
 
 	/* Delete all NAPI objects */
@@ -9398,7 +9575,7 @@ static int bnx2x_eeh_nic_unload(struct bnx2x *bp)
 	/* Free SKBs, SGEs, TPA pool and driver internals */
 	bnx2x_free_skbs(bp);
 
-	for_each_queue(bp, i)
+	for_each_rx_queue(bp, i)
 		bnx2x_free_rx_sge_range(bp, bp->fp + i, NUM_RX_SGE);
 
 	bnx2x_free_mem(bp);
@@ -9625,7 +9802,8 @@ static void bnx2x_cnic_sp_post(struct bnx2x *bp, int count)
 				break;
 			else
 				atomic_dec(&bp->spq_left);
-		} else if (type == ISCSI_CONNECTION_TYPE) {
+		} else if ((type == ISCSI_CONNECTION_TYPE) ||
+			   (type == FCOE_CONNECTION_TYPE)) {
 			if (bp->cnic_spq_pending >=
 			    bp->cnic_eth_dev.max_kwqe_pending)
 				break;
@@ -9772,6 +9950,9 @@ static int bnx2x_drv_ctl(struct net_device *dev, struct drv_ctl_info *ctl)
 	case DRV_CTL_START_L2_CMD: {
 		u32 cli = ctl->data.ring.client_id;
 
+		/* Clear FCoE FIP and ALL ENODE MACs addresses first */
+		bnx2x_del_fcoe_eth_macs(bp);
+
 		/* Set iSCSI MAC address */
 		bnx2x_set_iscsi_eth_mac_addr(bp, 1);
 
@@ -9893,10 +10074,6 @@ static int bnx2x_unregister_cnic(struct net_device *dev)
 	struct cnic_eth_dev *cp = &bp->cnic_eth_dev;
 
 	mutex_lock(&bp->cnic_mutex);
-	if (bp->cnic_flags & BNX2X_CNIC_FLAG_MAC_SET) {
-		bp->cnic_flags &= ~BNX2X_CNIC_FLAG_MAC_SET;
-		bnx2x_set_iscsi_eth_mac_addr(bp, 0);
-	}
 	cp->drv_state = 0;
 	rcu_assign_pointer(bp->cnic_ops, NULL);
 	mutex_unlock(&bp->cnic_mutex);
@@ -9927,7 +10104,9 @@ struct cnic_eth_dev *bnx2x_cnic_probe(struct net_device *dev)
 	cp->drv_ctl = bnx2x_drv_ctl;
 	cp->drv_register_cnic = bnx2x_register_cnic;
 	cp->drv_unregister_cnic = bnx2x_unregister_cnic;
-	cp->iscsi_l2_client_id = BNX2X_ISCSI_ETH_CL_ID;
+	cp->fcoe_init_cid = BNX2X_FCOE_ETH_CID;
+	cp->iscsi_l2_client_id = BNX2X_ISCSI_ETH_CL_ID +
+		BP_E1HVN(bp) * NONE_ETH_CONTEXT_USE;
 	cp->iscsi_l2_cid = BNX2X_ISCSI_ETH_CID;
 
 	DP(BNX2X_MSG_SP, "page_size %d, tbl_offset %d, tbl_lines %d, "
diff --git a/drivers/net/bnx2x/bnx2x_stats.c b/drivers/net/bnx2x/bnx2x_stats.c
index 4733c83..6e4d9b1 100644
--- a/drivers/net/bnx2x/bnx2x_stats.c
+++ b/drivers/net/bnx2x/bnx2x_stats.c
@@ -160,7 +160,7 @@ static void bnx2x_storm_stats_post(struct bnx2x *bp)
 
 		ramrod_data.drv_counter = bp->stats_counter++;
 		ramrod_data.collect_port = bp->port.pmf ? 1 : 0;
-		for_each_queue(bp, i)
+		for_each_eth_queue(bp, i)
 			ramrod_data.ctr_id_vector |= (1 << bp->fp[i].cl_id);
 
 		rc = bnx2x_sp_post(bp, RAMROD_CMD_ID_COMMON_STAT_QUERY, 0,
@@ -766,7 +766,7 @@ static int bnx2x_storm_stats_update(struct bnx2x *bp)
 	estats->no_buff_discard_hi = 0;
 	estats->no_buff_discard_lo = 0;
 
-	for_each_queue(bp, i) {
+	for_each_eth_queue(bp, i) {
 		struct bnx2x_fastpath *fp = &bp->fp[i];
 		int cl_id = fp->cl_id;
 		struct tstorm_per_client_stats *tclient =
@@ -996,7 +996,7 @@ static void bnx2x_net_stats_update(struct bnx2x *bp)
 	nstats->tx_bytes = bnx2x_hilo(&estats->total_bytes_transmitted_hi);
 
 	tmp = estats->mac_discard;
-	for_each_queue(bp, i)
+	for_each_rx_queue(bp, i)
 		tmp += le32_to_cpu(bp->fp[i].old_tclient.checksum_discard);
 	nstats->rx_dropped = tmp;
 
@@ -1087,7 +1087,7 @@ static void bnx2x_stats_update(struct bnx2x *bp)
 		       bp->dev->name,
 		       estats->brb_drop_lo, estats->brb_truncate_lo);
 
-		for_each_queue(bp, i) {
+		for_each_eth_queue(bp, i) {
 			struct bnx2x_fastpath *fp = &bp->fp[i];
 			struct bnx2x_eth_q_stats *qstats = &fp->eth_q_stats;
 
@@ -1101,7 +1101,7 @@ static void bnx2x_stats_update(struct bnx2x *bp)
 			       fp->rx_calls, fp->rx_pkt);
 		}
 
-		for_each_queue(bp, i) {
+		for_each_eth_queue(bp, i) {
 			struct bnx2x_fastpath *fp = &bp->fp[i];
 			struct bnx2x_eth_q_stats *qstats = &fp->eth_q_stats;
 			struct netdev_queue *txq =
@@ -1381,7 +1381,8 @@ void bnx2x_stats_init(struct bnx2x *bp)
 		memset(&fp->eth_q_stats, 0, sizeof(struct bnx2x_eth_q_stats));
 	}
 
-	for_each_queue(bp, i) {
+	/* FW stats are currently collected for ETH clients only */
+	for_each_eth_queue(bp, i) {
 		/* Set initial stats counter in the stats ramrod data to -1 */
 		int cl_id = bp->fp[i].cl_id;
 
diff --git a/drivers/net/bnx2x/bnx2x_stats.h b/drivers/net/bnx2x/bnx2x_stats.h
index afd15ef..596798c 100644
--- a/drivers/net/bnx2x/bnx2x_stats.h
+++ b/drivers/net/bnx2x/bnx2x_stats.h
@@ -53,7 +53,6 @@ struct bnx2x_eth_q_stats {
 	u32 hw_csum_err;
 };
 
-#define BNX2X_NUM_Q_STATS		13
 #define Q_STATS_OFFSET32(stat_name) \
 			(offsetof(struct bnx2x_eth_q_stats, stat_name) / 4)
 
@@ -225,7 +224,6 @@ struct bnx2x_eth_stats {
 	u32 nig_timer_max;
 };
 
-#define BNX2X_NUM_STATS			43
 #define STATS_OFFSET32(stat_name) \
 			(offsetof(struct bnx2x_eth_stats, stat_name) / 4)
 
-- 
1.7.0.4





^ permalink raw reply related

* [PATCH net-next 1/9] Take the distribution range definition out of skb_tx_hash()
From: Vladislav Zolotarov @ 2010-12-13 15:43 UTC (permalink / raw)
  To: Dave Miller; +Cc: netdev list, Eilon Greenstein

Move the calcualation of the Tx hash for a given hash range into a separate
function and define the skb_tx_hash(), which calculates a Tx hash for a
[0; dev->real_num_tx_queues - 1] hash values range, using this
function (__skb_tx_hash()).

Signed-off-by: Vladislav Zolotarov <vladz@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
---
 include/linux/netdevice.h |   10 ++++++++++
 include/linux/skbuff.h    |    5 +++--
 net/core/dev.c            |   15 ++++++++++-----
 3 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d31bc3c..445e682 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1747,6 +1747,16 @@ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
 		__netif_schedule(txq->qdisc);
 }

+/*
+ * Returns a Tx hash for the given packet when dev->real_num_tx_queues is used
+ * as a distribution range limit for the returned value.
+ */
+static inline u16 skb_tx_hash(const struct net_device *dev,
+			      const struct sk_buff *skb)
+{
+	return __skb_tx_hash(dev, skb, dev->real_num_tx_queues);
+}
+
 /**
  *	netif_is_multiqueue - test if device has multiple transmit queues
  *	@dev: network device
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 19f37a6..4c4bec6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2165,8 +2165,9 @@ static inline bool skb_rx_queue_recorded(const struct sk_buff *skb)
 	return skb->queue_mapping != 0;
 }

-extern u16 skb_tx_hash(const struct net_device *dev,
-		       const struct sk_buff *skb);
+extern u16 __skb_tx_hash(const struct net_device *dev,
+			 const struct sk_buff *skb,
+			 unsigned int num_tx_queues);

 #ifdef CONFIG_XFRM
 static inline struct sec_path *skb_sec_path(struct sk_buff *skb)
diff --git a/net/core/dev.c b/net/core/dev.c
index d28b3a0..b25dd08 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2112,14 +2112,19 @@ out:

 static u32 hashrnd __read_mostly;

-u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
+/*
+ * Returns a Tx hash based on the given packet descriptor a Tx queues' number
+ * to be used as a distribution range.
+ */
+u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
+		  unsigned int num_tx_queues)
 {
 	u32 hash;

 	if (skb_rx_queue_recorded(skb)) {
 		hash = skb_get_rx_queue(skb);
-		while (unlikely(hash >= dev->real_num_tx_queues))
-			hash -= dev->real_num_tx_queues;
+		while (unlikely(hash >= num_tx_queues))
+			hash -= num_tx_queues;
 		return hash;
 	}

@@ -2129,9 +2134,9 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
 		hash = (__force u16) skb->protocol ^ skb->rxhash;
 	hash = jhash_1word(hash, hashrnd);

-	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
+	return (u16) (((u64) hash * num_tx_queues) >> 32);
 }
-EXPORT_SYMBOL(skb_tx_hash);
+EXPORT_SYMBOL(__skb_tx_hash);

 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
 {
--
1.7.0.4





^ permalink raw reply related

* [PATCH net-next 0/9] bnx2x: FCoE support
From: Vladislav Zolotarov @ 2010-12-13 15:43 UTC (permalink / raw)
  To: Dave Miller
  Cc: netdev list, Eilon Greenstein, Dmitry Kravkov,
	Shmulik Ravid-Rabinovitz

Hello Dave,

This patch-series adds FCoE support to 57712 HW. It provides
required intefaces for upper-level driver (CNIC) and handles
FCoE related HW/FW initialization and flows.

Since the FW files are too big (patches 6 and 8) and will not 
pass the mailing list, it is also located at:
http://linux.broadcom.com/eilong/1.62.00-2

It's a second attempt to submit this patch series after fixing
a Tx hash distribution fairness.

Thanks,
Vlad

^ permalink raw reply

* Re: [*v2 PATCH 01/22] IPVS: netns, add basic init per netns.
From: Hans Schillstrom @ 2010-12-13 15:12 UTC (permalink / raw)
  To: Jan Engelhardt
  Cc: horms@verge.net.au, ja@ssi.bg, daniel.lezcano@free.fr,
	wensong@linux-vs.org, lvs-devel@vger.kernel.org,
	netdev@vger.kernel.org, netfilter-devel@vger.kernel.org,
	hans@schillstrom.com
In-Reply-To: <alpine.LNX.2.01.1012131503190.18216@obet.zrqbmnf.qr>

On Mon, 2010-12-13 at 15:08 +0100, Jan Engelhardt wrote:
> On Monday 2010-12-13 14:38, Hans Schillstrom wrote:
> >@@ -28,6 +28,18 @@
> > #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
> > #include <net/netfilter/nf_conntrack.h>
> > #endif
> >+#include <net/net_namespace.h>		/* Netw namespace */
> >+
> >+/*
> >+ * Generic access of ipvs struct
> >+ */
> >+static inline struct netns_ipvs * net_ipvs(struct net* net) {
> >+#ifdef CONFIG_NET_NS
> >+	return net->ipvs;
> >+#else
> >+	return init_net.ipvs;
> >+#endif
> >+}
> 
> This looks redundant, because even in the !CONFIG_NET_NS case,
> 'net' is valid and already points to &init_net.

Yes, that's right I'll remove it.

Thanks
Hans
> 
> >+static void __net_exit __ip_vs_cleanup(struct net *net)
> >+{
> >+	struct netns_ipvs *ipvs = net_ipvs(net);
> >+
> >+	IP_VS_DBG(10, "ipvs netns %d released\n", ipvs->inc);
> >+}
> >+



^ permalink raw reply

* [PATCH 5/5] net: add skb.old_queue_mapping
From: Changli Gao @ 2010-12-13 14:43 UTC (permalink / raw)
  To: Jamal Hadi Salim; +Cc: David S. Miller, Jamal Hadi Salim, netdev, Changli Gao
In-Reply-To: <1292251414-5154-1-git-send-email-xiaosuo@gmail.com>

For the skbs returned from ifb, we should use the queue_mapping
saved before ifb.

We save old queue_mapping in old_queue_mapping just before calling 
dev_queue_xmit, and restore the old_queue_mapping to queue_mapping
just before reinjecting the skb.

dev_pick_tx() use the current queue_mapping for the skbs reinjected
by ifb.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
---
 drivers/net/ifb.c      |    1 +
 include/linux/skbuff.h |    3 +++
 net/core/dev.c         |    5 +++++
 net/sched/act_mirred.c |    1 +
 4 files changed, 10 insertions(+)
diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 16c767b..481e4b1 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -95,6 +95,7 @@ static void ri_tasklet(unsigned long _dev)
 		u64_stats_update_end(&qp->syncp);
 		skb->skb_iif = dev->ifindex;
 
+		skb->queue_mapping = skb->old_queue_mapping;
 		if (from & AT_EGRESS) {
 			dev_queue_xmit(skb);
 		} else if (from & AT_INGRESS) {
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 19f37a6..2ce2a96 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -403,6 +403,9 @@ struct sk_buff {
 	};
 
 	__u16			vlan_tci;
+#ifdef CONFIG_NET_CLS_ACT
+	__u16			old_queue_mapping;
+#endif
 
 	sk_buff_data_t		transport_header;
 	sk_buff_data_t		network_header;
diff --git a/net/core/dev.c b/net/core/dev.c
index d28b3a0..8e97cfd 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2190,6 +2190,11 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
 	int queue_index;
 	const struct net_device_ops *ops = dev->netdev_ops;
 
+#ifdef CONFIG_NET_CLS_ACT
+	if (skb->tc_verd & TC_NCLS)
+		queue_index = skb_get_queue_mapping(skb);
+	else
+#endif
 	if (dev->real_num_tx_queues == 1)
 		queue_index = 0;
 	else if (ops->ndo_select_queue) {
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 0c311be..853eb30 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -197,6 +197,7 @@ static int tcf_mirred(struct sk_buff *skb, struct tc_action *a,
 
 	skb2->skb_iif = skb->dev->ifindex;
 	skb2->dev = dev;
+	skb2->old_queue_mapping = skb->queue_mapping;
 	dev_queue_xmit(skb2);
 	err = 0;
 

^ permalink raw reply related

* [PATCH 4/5] ifb: add multiqueue support
From: Changli Gao @ 2010-12-13 14:43 UTC (permalink / raw)
  To: Jamal Hadi Salim; +Cc: David S. Miller, netdev, Changli Gao
In-Reply-To: <1292251414-5154-1-git-send-email-xiaosuo@gmail.com>

Each ifb NIC has nr_cpu_ids rx queues and nr_cpu_ids queues. Packets
transmitted to ifb are enqueued to the corresponding per cpu tx queues,
and processed in the corresponding per cpu tasklet latter.

The stats are converted to the u64 ones.

tq is a stack variable now. It makes ifb_q_private smaller and tx queue
locked only once in ri_tasklet.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
---
 drivers/net/ifb.c |  211 ++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 141 insertions(+), 70 deletions(-)
diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 57c5cfb..16c767b 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -37,56 +37,63 @@
 #include <net/net_namespace.h>
 
 #define TX_Q_LIMIT    32
+struct ifb_q_private {
+	struct tasklet_struct	ifb_tasklet;
+	struct sk_buff_head	rq;
+	struct u64_stats_sync	syncp;
+	u64			rx_packets;
+	u64			rx_bytes;
+	u64			rx_dropped;
+};
+
 struct ifb_private {
-	struct tasklet_struct   ifb_tasklet;
-	int     tasklet_pending;
-	struct sk_buff_head     rq;
-	struct sk_buff_head     tq;
+	struct ifb_q_private __percpu	*q;
 };
 
 static int numifbs = 2;
 
-static void ri_tasklet(unsigned long dev)
+static void ri_tasklet(unsigned long _dev)
 {
-
-	struct net_device *_dev = (struct net_device *)dev;
-	struct ifb_private *dp = netdev_priv(_dev);
-	struct net_device_stats *stats = &_dev->stats;
+	struct net_device *dev = (struct net_device *)_dev;
+	struct ifb_private *p = netdev_priv(dev);
+	struct ifb_q_private *qp;
 	struct netdev_queue *txq;
 	struct sk_buff *skb;
-
-	txq = netdev_get_tx_queue(_dev, 0);
-	skb = skb_peek(&dp->tq);
-	if (skb == NULL) {
-		if (__netif_tx_trylock(txq)) {
-			skb_queue_splice_tail_init(&dp->rq, &dp->tq);
-			__netif_tx_unlock(txq);
-		} else {
-			/* reschedule */
-			goto resched;
-		}
+	struct sk_buff_head tq;
+
+	__skb_queue_head_init(&tq);
+	txq = netdev_get_tx_queue(dev, raw_smp_processor_id());
+	qp = per_cpu_ptr(p->q, raw_smp_processor_id());
+	if (!__netif_tx_trylock(txq)) {
+		tasklet_schedule(&qp->ifb_tasklet);
+		return;
 	}
+	skb_queue_splice_tail_init(&qp->rq, &tq);
+	if (netif_tx_queue_stopped(txq))
+		netif_tx_wake_queue(txq);
+	__netif_tx_unlock(txq);
 
-	while ((skb = skb_dequeue(&dp->tq)) != NULL) {
+	while ((skb = __skb_dequeue(&tq)) != NULL) {
 		u32 from = G_TC_FROM(skb->tc_verd);
 
 		skb->tc_verd = 0;
 		skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
-		stats->tx_packets++;
-		stats->tx_bytes += skb->len;
+		u64_stats_update_begin(&qp->syncp);
+		txq->tx_packets++;
+		txq->tx_bytes += skb->len;
 
 		rcu_read_lock();
 		skb->dev = dev_get_by_index_rcu(&init_net, skb->skb_iif);
 		if (!skb->dev) {
 			rcu_read_unlock();
+			txq->tx_dropped++;
+			u64_stats_update_end(&qp->syncp);
 			dev_kfree_skb(skb);
-			stats->tx_dropped++;
-			if (skb_queue_len(&dp->tq) != 0)
-				goto resched;
-			break;
+			continue;
 		}
 		rcu_read_unlock();
-		skb->skb_iif = _dev->ifindex;
+		u64_stats_update_end(&qp->syncp);
+		skb->skb_iif = dev->ifindex;
 
 		if (from & AT_EGRESS) {
 			dev_queue_xmit(skb);
@@ -96,48 +103,32 @@ static void ri_tasklet(unsigned long dev)
 		} else
 			BUG();
 	}
-
-	if (__netif_tx_trylock(txq)) {
-		skb = skb_peek(&dp->rq);
-		if (skb == NULL) {
-			dp->tasklet_pending = 0;
-			if (netif_queue_stopped(_dev))
-				netif_wake_queue(_dev);
-		} else {
-			__netif_tx_unlock(txq);
-			goto resched;
-		}
-		__netif_tx_unlock(txq);
-	} else {
-resched:
-		dp->tasklet_pending = 1;
-		tasklet_schedule(&dp->ifb_tasklet);
-	}
-
 }
 
 static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct ifb_private *dp = netdev_priv(dev);
-	struct net_device_stats *stats = &dev->stats;
+	struct ifb_private *p = netdev_priv(dev);
+	struct ifb_q_private *qp = per_cpu_ptr(p->q,
+					       skb_get_queue_mapping(skb));
 	u32 from = G_TC_FROM(skb->tc_verd);
 
-	stats->rx_packets++;
-	stats->rx_bytes += skb->len;
+	u64_stats_update_begin(&qp->syncp);
+	qp->rx_packets++;
+	qp->rx_bytes += skb->len;
 
 	if (!(from & (AT_INGRESS|AT_EGRESS)) || !skb->skb_iif) {
+		qp->rx_dropped++;
+		u64_stats_update_end(&qp->syncp);
 		dev_kfree_skb(skb);
-		stats->rx_dropped++;
 		return NETDEV_TX_OK;
 	}
+	u64_stats_update_end(&qp->syncp);
 
-	__skb_queue_tail(&dp->rq, skb);
-	if (!dp->tasklet_pending) {
-		dp->tasklet_pending = 1;
-		tasklet_schedule(&dp->ifb_tasklet);
-	}
+	__skb_queue_tail(&qp->rq, skb);
+	if (skb_queue_len(&qp->rq) == 1)
+		tasklet_schedule(&qp->ifb_tasklet);
 
-	if (skb_queue_len(&dp->rq) >= dev->tx_queue_len)
+	if (skb_queue_len(&qp->rq) >= dev->tx_queue_len)
 		netif_stop_queue(dev);
 
 	return NETDEV_TX_OK;
@@ -145,33 +136,103 @@ static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 
 static int ifb_close(struct net_device *dev)
 {
-	struct ifb_private *dp = netdev_priv(dev);
-
-	tasklet_kill(&dp->ifb_tasklet);
-	netif_stop_queue(dev);
-	__skb_queue_purge(&dp->rq);
-	__skb_queue_purge(&dp->tq);
+	struct ifb_private *p = netdev_priv(dev);
+	struct ifb_q_private *qp;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		qp = per_cpu_ptr(p->q, cpu);
+		tasklet_kill(&qp->ifb_tasklet);
+		netif_tx_stop_queue(netdev_get_tx_queue(dev, cpu));
+		__skb_queue_purge(&qp->rq);
+	}
 
 	return 0;
 }
 
 static int ifb_open(struct net_device *dev)
 {
-	struct ifb_private *dp = netdev_priv(dev);
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		netif_tx_start_queue(netdev_get_tx_queue(dev, cpu));
+
+	return 0;
+}
+
+static int ifb_init(struct net_device *dev)
+{
+	struct ifb_private *p = netdev_priv(dev);
+	struct ifb_q_private *q;
+	int cpu;
 
-	tasklet_init(&dp->ifb_tasklet, ri_tasklet, (unsigned long)dev);
-	__skb_queue_head_init(&dp->rq);
-	__skb_queue_head_init(&dp->tq);
-	netif_start_queue(dev);
+	p->q = alloc_percpu(struct ifb_q_private);
+	if (!p->q)
+		return -ENOMEM;
+	for_each_possible_cpu(cpu) {
+		q = per_cpu_ptr(p->q, cpu);
+		tasklet_init(&q->ifb_tasklet, ri_tasklet, (unsigned long)dev);
+		__skb_queue_head_init(&q->rq);
+	}
 
 	return 0;
 }
 
+static void ifb_uninit(struct net_device *dev)
+{
+	struct ifb_private *p = netdev_priv(dev);
+
+	free_percpu(p->q);
+}
+
+static u16 ifb_select_queue(struct net_device *dev, struct sk_buff *skb)
+{
+	return smp_processor_id();
+}
+
+static struct rtnl_link_stats64 *ifb_get_stats64(struct net_device *dev,
+		struct rtnl_link_stats64 *stats)
+{
+	struct ifb_private *p = netdev_priv(dev);
+	struct ifb_q_private *q;
+	struct netdev_queue *txq;
+	int cpu;
+	u64 rx_packets, rx_bytes, rx_dropped;
+	u64 tx_packets, tx_bytes, tx_dropped;
+	unsigned int start;
+
+	for_each_possible_cpu(cpu) {
+		q = per_cpu_ptr(p->q, cpu);
+		txq = netdev_get_tx_queue(dev, cpu);
+		do {
+			start = u64_stats_fetch_begin_bh(&q->syncp);
+			rx_packets = q->rx_packets;
+			rx_bytes = q->rx_bytes;
+			rx_dropped = q->rx_dropped;
+			tx_packets = txq->tx_packets;
+			tx_bytes = txq->tx_bytes;
+			tx_dropped = txq->tx_dropped;
+		} while (u64_stats_fetch_retry_bh(&q->syncp, start));
+		stats->rx_packets += rx_packets;
+		stats->rx_bytes += rx_bytes;
+		stats->rx_dropped += rx_dropped;
+		stats->tx_packets += tx_packets;
+		stats->tx_bytes += tx_bytes;
+		stats->tx_dropped += tx_dropped;
+	}
+
+	return stats;
+}
+
 static const struct net_device_ops ifb_netdev_ops = {
+	.ndo_init		= ifb_init,
+	.ndo_uninit		= ifb_uninit,
 	.ndo_open		= ifb_open,
 	.ndo_stop		= ifb_close,
 	.ndo_start_xmit		= ifb_xmit,
 	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_select_queue	= ifb_select_queue,
+	.ndo_get_stats64	= ifb_get_stats64,
 };
 
 static void ifb_setup(struct net_device *dev)
@@ -202,11 +263,21 @@ static int ifb_validate(struct nlattr *tb[], struct nlattr *data[])
 	return 0;
 }
 
+static int ifb_get_tx_queues(struct net *net, struct nlattr *tb[],
+			     unsigned int *tx_queues,
+			     unsigned int *real_tx_queues)
+{
+	*real_tx_queues = *tx_queues = nr_cpu_ids;
+
+	return 0;
+}
+
 static struct rtnl_link_ops ifb_link_ops __read_mostly = {
 	.kind		= "ifb",
 	.priv_size	= sizeof(struct ifb_private),
 	.setup		= ifb_setup,
 	.validate	= ifb_validate,
+	.get_tx_queues	= ifb_get_tx_queues,
 };
 
 /* Number of ifb devices to be set up by this module. */
@@ -218,8 +289,8 @@ static int __init ifb_init_one(int index)
 	struct net_device *dev_ifb;
 	int err;
 
-	dev_ifb = alloc_netdev(sizeof(struct ifb_private), "ifb%d", ifb_setup);
-
+	dev_ifb = alloc_netdev_mq(sizeof(struct ifb_private), "ifb%d",
+				  ifb_setup, nr_cpu_ids);
 	if (!dev_ifb)
 		return -ENOMEM;
 

^ permalink raw reply related

* [PATCH 3/5] ifb: fix tx_queue_len overlimit
From: Changli Gao @ 2010-12-13 14:43 UTC (permalink / raw)
  To: Jamal Hadi Salim; +Cc: David S. Miller, netdev, Changli Gao
In-Reply-To: <1292251414-5154-1-git-send-email-xiaosuo@gmail.com>

We should check the length of rq after enqueuing, otherwise the length
of rq will be over tx_queue_len.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
---
 drivers/net/ifb.c |    6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 1628d01..57c5cfb 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -131,15 +131,15 @@ static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 		return NETDEV_TX_OK;
 	}

-	if (skb_queue_len(&dp->rq) >= dev->tx_queue_len)
-		netif_stop_queue(dev);
-
 	__skb_queue_tail(&dp->rq, skb);
 	if (!dp->tasklet_pending) {
 		dp->tasklet_pending = 1;
 		tasklet_schedule(&dp->ifb_tasklet);
 	}

+	if (skb_queue_len(&dp->rq) >= dev->tx_queue_len)
+		netif_stop_queue(dev);
+
 	return NETDEV_TX_OK;
 }

^ permalink raw reply related

* [PATCH 2/5] ifb: code cleanup
From: Changli Gao @ 2010-12-13 14:43 UTC (permalink / raw)
  To: Jamal Hadi Salim; +Cc: David S. Miller, netdev, Changli Gao
In-Reply-To: <1292251414-5154-1-git-send-email-xiaosuo@gmail.com>

Clean up code according to scripts/checkpatch.pl

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
---
 drivers/net/ifb.c |   22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)
diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 0667a61..1628d01 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -15,14 +15,14 @@
 	by Patrick McHardy and then maintained by Andre Correa.
 
 	You need the tc action  mirror or redirect to feed this device
-       	packets.
+	packets.
 
 	This program is free software; you can redistribute it and/or
 	modify it under the terms of the GNU General Public License
 	as published by the Free Software Foundation; either version
 	2 of the License, or (at your option) any later version.
 
-  	Authors:	Jamal Hadi Salim (2005)
+	Authors:	Jamal Hadi Salim (2005)
 
 */
 
@@ -56,7 +56,8 @@ static void ri_tasklet(unsigned long dev)
 	struct sk_buff *skb;
 
 	txq = netdev_get_tx_queue(_dev, 0);
-	if ((skb = skb_peek(&dp->tq)) == NULL) {
+	skb = skb_peek(&dp->tq);
+	if (skb == NULL) {
 		if (__netif_tx_trylock(txq)) {
 			skb_queue_splice_tail_init(&dp->rq, &dp->tq);
 			__netif_tx_unlock(txq);
@@ -72,7 +73,7 @@ static void ri_tasklet(unsigned long dev)
 		skb->tc_verd = 0;
 		skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
 		stats->tx_packets++;
-		stats->tx_bytes +=skb->len;
+		stats->tx_bytes += skb->len;
 
 		rcu_read_lock();
 		skb->dev = dev_get_by_index_rcu(&init_net, skb->skb_iif);
@@ -97,7 +98,8 @@ static void ri_tasklet(unsigned long dev)
 	}
 
 	if (__netif_tx_trylock(txq)) {
-		if ((skb = skb_peek(&dp->rq)) == NULL) {
+		skb = skb_peek(&dp->rq);
+		if (skb == NULL) {
 			dp->tasklet_pending = 0;
 			if (netif_queue_stopped(_dev))
 				netif_wake_queue(_dev);
@@ -121,7 +123,7 @@ static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 	u32 from = G_TC_FROM(skb->tc_verd);
 
 	stats->rx_packets++;
-	stats->rx_bytes+=skb->len;
+	stats->rx_bytes += skb->len;
 
 	if (!(from & (AT_INGRESS|AT_EGRESS)) || !skb->skb_iif) {
 		dev_kfree_skb(skb);
@@ -129,9 +131,8 @@ static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 		return NETDEV_TX_OK;
 	}
 
-	if (skb_queue_len(&dp->rq) >= dev->tx_queue_len) {
+	if (skb_queue_len(&dp->rq) >= dev->tx_queue_len)
 		netif_stop_queue(dev);
-	}
 
 	__skb_queue_tail(&dp->rq, skb);
 	if (!dp->tasklet_pending) {
@@ -150,6 +151,7 @@ static int ifb_close(struct net_device *dev)
 	netif_stop_queue(dev);
 	__skb_queue_purge(&dp->rq);
 	__skb_queue_purge(&dp->tq);
+
 	return 0;
 }
 
@@ -196,6 +198,7 @@ static int ifb_validate(struct nlattr *tb[], struct nlattr *data[])
 		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
 			return -EADDRNOTAVAIL;
 	}
+
 	return 0;
 }
 
@@ -215,8 +218,7 @@ static int __init ifb_init_one(int index)
 	struct net_device *dev_ifb;
 	int err;
 
-	dev_ifb = alloc_netdev(sizeof(struct ifb_private),
-				 "ifb%d", ifb_setup);
+	dev_ifb = alloc_netdev(sizeof(struct ifb_private), "ifb%d", ifb_setup);
 
 	if (!dev_ifb)
 		return -ENOMEM;

^ permalink raw reply related

* [PATCH 1/5] ifb: remove function declarations
From: Changli Gao @ 2010-12-13 14:43 UTC (permalink / raw)
  To: Jamal Hadi Salim; +Cc: David S. Miller, netdev, Changli Gao

Restructure to remove function declarations.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
---
 drivers/net/ifb.c |   51 +++++++++++++++++++++++----------------------------
 1 file changed, 23 insertions(+), 28 deletions(-)
diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index bfa03db..0667a61 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -46,11 +46,6 @@ struct ifb_private {
 
 static int numifbs = 2;
 
-static void ri_tasklet(unsigned long dev);
-static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev);
-static int ifb_open(struct net_device *dev);
-static int ifb_close(struct net_device *dev);
-
 static void ri_tasklet(unsigned long dev)
 {
 
@@ -119,29 +114,6 @@ resched:
 
 }
 
-static const struct net_device_ops ifb_netdev_ops = {
-	.ndo_open	= ifb_open,
-	.ndo_stop	= ifb_close,
-	.ndo_start_xmit	= ifb_xmit,
-	.ndo_validate_addr = eth_validate_addr,
-};
-
-static void ifb_setup(struct net_device *dev)
-{
-	/* Initialize the device structure. */
-	dev->destructor = free_netdev;
-	dev->netdev_ops = &ifb_netdev_ops;
-
-	/* Fill in device structure with ethernet-generic values. */
-	ether_setup(dev);
-	dev->tx_queue_len = TX_Q_LIMIT;
-
-	dev->flags |= IFF_NOARP;
-	dev->flags &= ~IFF_MULTICAST;
-	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
-	random_ether_addr(dev->dev_addr);
-}
-
 static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ifb_private *dp = netdev_priv(dev);
@@ -193,6 +165,29 @@ static int ifb_open(struct net_device *dev)
 	return 0;
 }
 
+static const struct net_device_ops ifb_netdev_ops = {
+	.ndo_open		= ifb_open,
+	.ndo_stop		= ifb_close,
+	.ndo_start_xmit		= ifb_xmit,
+	.ndo_validate_addr	= eth_validate_addr,
+};
+
+static void ifb_setup(struct net_device *dev)
+{
+	/* Initialize the device structure. */
+	dev->destructor = free_netdev;
+	dev->netdev_ops = &ifb_netdev_ops;
+
+	/* Fill in device structure with ethernet-generic values. */
+	ether_setup(dev);
+	dev->tx_queue_len = TX_Q_LIMIT;
+
+	dev->flags |= IFF_NOARP;
+	dev->flags &= ~IFF_MULTICAST;
+	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+	random_ether_addr(dev->dev_addr);
+}
+
 static int ifb_validate(struct nlattr *tb[], struct nlattr *data[])
 {
 	if (tb[IFLA_ADDRESS]) {

^ permalink raw reply related

* [PATCH net-next-2.6] net: add dev_close_many
From: Octavian Purdila @ 2010-12-13 14:18 UTC (permalink / raw)
  To: netdev; +Cc: Lucian Adrian Grijincu, Vlad Dogaru, Octavian Purdila

Add dev_close_many and dev_deactivate_many to factorize another
expensive sync-rcu operation in the netdevice unregister path.

$ modprobe dummy numdummies=10000
$ ip link set dev dummy* up
$ time rmmod dummy

Without the patch           With the patch

real    0m 24.63s           real    0m 5.15s
user    0m 0.00s            user    0m 0.00s
sys     0m 6.05s            sys     0m 5.14s

Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
---
 include/net/sch_generic.h |    1 +
 net/core/dev.c            |  121 ++++++++++++++++++++++++++++----------------
 net/sched/sch_generic.c   |   29 ++++++++---
 3 files changed, 100 insertions(+), 51 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index ea1f8a8..786cc39 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -321,6 +321,7 @@ extern void dev_init_scheduler(struct net_device *dev);
 extern void dev_shutdown(struct net_device *dev);
 extern void dev_activate(struct net_device *dev);
 extern void dev_deactivate(struct net_device *dev);
+extern void dev_deactivate_many(struct list_head *head);
 extern struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
 				     struct Qdisc *qdisc);
 extern void qdisc_reset(struct Qdisc *qdisc);
diff --git a/net/core/dev.c b/net/core/dev.c
index d28b3a0..7cab19f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1222,51 +1222,88 @@ int dev_open(struct net_device *dev)
 }
 EXPORT_SYMBOL(dev_open);
 
-static int __dev_close(struct net_device *dev)
+static int __dev_close_many(struct list_head *head)
 {
-	const struct net_device_ops *ops = dev->netdev_ops;
+	struct net_device *dev;
 
-	ASSERT_RTNL();
-	might_sleep();
+	list_for_each_entry(dev, head, unreg_list) {
+		ASSERT_RTNL();
+		might_sleep();
 
-	/*
-	 *	Tell people we are going down, so that they can
-	 *	prepare to death, when device is still operating.
-	 */
-	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
+		/*
+		 *	Tell people we are going down, so that they can
+		 *	prepare to death, when device is still operating.
+		 */
+		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
 
-	clear_bit(__LINK_STATE_START, &dev->state);
+		clear_bit(__LINK_STATE_START, &dev->state);
 
-	/* Synchronize to scheduled poll. We cannot touch poll list,
-	 * it can be even on different cpu. So just clear netif_running().
-	 *
-	 * dev->stop() will invoke napi_disable() on all of it's
-	 * napi_struct instances on this device.
-	 */
-	smp_mb__after_clear_bit(); /* Commit netif_running(). */
+		/* Synchronize to scheduled poll. We cannot touch poll list, it
+		 * can be even on different cpu. So just clear netif_running().
+		 *
+		 * dev->stop() will invoke napi_disable() on all of it's
+		 * napi_struct instances on this device.
+		 */
+		smp_mb__after_clear_bit(); /* Commit netif_running(). */
+	}
 
-	dev_deactivate(dev);
+	dev_deactivate_many(head);
 
-	/*
-	 *	Call the device specific close. This cannot fail.
-	 *	Only if device is UP
-	 *
-	 *	We allow it to be called even after a DETACH hot-plug
-	 *	event.
-	 */
-	if (ops->ndo_stop)
-		ops->ndo_stop(dev);
+	list_for_each_entry(dev, head, unreg_list) {
+		const struct net_device_ops *ops = dev->netdev_ops;
 
-	/*
-	 *	Device is now down.
-	 */
+		/*
+		 *	Call the device specific close. This cannot fail.
+		 *	Only if device is UP
+		 *
+		 *	We allow it to be called even after a DETACH hot-plug
+		 *	event.
+		 */
+		if (ops->ndo_stop)
+			ops->ndo_stop(dev);
+
+		/*
+		 *	Device is now down.
+		 */
+
+		dev->flags &= ~IFF_UP;
+
+		/*
+		 *	Shutdown NET_DMA
+		 */
+		net_dmaengine_put();
+	}
 
-	dev->flags &= ~IFF_UP;
+	return 0;
+}
+
+static int __dev_close(struct net_device *dev)
+{
+	LIST_HEAD(single);
+
+	list_add(&dev->unreg_list, &single);
+	return __dev_close_many(&single);
+}
+
+int dev_close_many(struct list_head *head)
+{
+	struct net_device *dev, *tmp;
+
+	list_for_each_entry_safe(dev, tmp, head, unreg_list)
+		if (!(dev->flags & IFF_UP)) {
+			list_del(&dev->unreg_list);
+			continue;
+		}
+
+	__dev_close_many(head);
 
 	/*
-	 *	Shutdown NET_DMA
+	 * Tell people we are down
 	 */
-	net_dmaengine_put();
+	list_for_each_entry(dev, head, unreg_list) {
+		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
+		call_netdevice_notifiers(NETDEV_DOWN, dev);
+	}
 
 	return 0;
 }
@@ -1282,16 +1319,10 @@ static int __dev_close(struct net_device *dev)
  */
 int dev_close(struct net_device *dev)
 {
-	if (!(dev->flags & IFF_UP))
-		return 0;
-
-	__dev_close(dev);
+	LIST_HEAD(single);
 
-	/*
-	 * Tell people we are down
-	 */
-	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
-	call_netdevice_notifiers(NETDEV_DOWN, dev);
+	list_add(&dev->unreg_list, &single);
+	dev_close_many(&single);
 
 	return 0;
 }
@@ -4958,10 +4989,12 @@ static void rollback_registered_many(struct list_head *head)
 		}
 
 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
+	}
 
-		/* If device is running, close it first. */
-		dev_close(dev);
+	/* If device is running, close it first. */
+	dev_close_many(head);
 
+	list_for_each_entry(dev, head, unreg_list) {
 		/* And unlink it from device chain. */
 		unlist_netdevice(dev);
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 0918834..34dc598 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -810,20 +810,35 @@ static bool some_qdisc_is_busy(struct net_device *dev)
 	return false;
 }
 
-void dev_deactivate(struct net_device *dev)
+void dev_deactivate_many(struct list_head *head)
 {
-	netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc);
-	if (dev_ingress_queue(dev))
-		dev_deactivate_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
+	struct net_device *dev;
 
-	dev_watchdog_down(dev);
+	list_for_each_entry(dev, head, unreg_list) {
+		netdev_for_each_tx_queue(dev, dev_deactivate_queue,
+					 &noop_qdisc);
+		if (dev_ingress_queue(dev))
+			dev_deactivate_queue(dev, dev_ingress_queue(dev),
+					     &noop_qdisc);
+
+		dev_watchdog_down(dev);
+	}
 
 	/* Wait for outstanding qdisc-less dev_queue_xmit calls. */
 	synchronize_rcu();
 
 	/* Wait for outstanding qdisc_run calls. */
-	while (some_qdisc_is_busy(dev))
-		yield();
+	list_for_each_entry(dev, head, unreg_list)
+		while (some_qdisc_is_busy(dev))
+			yield();
+}
+
+void dev_deactivate(struct net_device *dev)
+{
+	LIST_HEAD(single);
+
+	list_add(&dev->unreg_list, &single);
+	dev_deactivate_many(&single);
 }
 
 static void dev_init_scheduler_queue(struct net_device *dev,
-- 
1.7.1


^ permalink raw reply related

* Re: [*v2 PATCH 01/22] IPVS: netns, add basic init per netns.
From: Jan Engelhardt @ 2010-12-13 14:08 UTC (permalink / raw)
  To: Hans Schillstrom
  Cc: horms, ja, daniel.lezcano, wensong, lvs-devel, netdev,
	netfilter-devel, hans
In-Reply-To: <1292247510-753-2-git-send-email-hans.schillstrom@ericsson.com>


On Monday 2010-12-13 14:38, Hans Schillstrom wrote:
>@@ -28,6 +28,18 @@
> #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
> #include <net/netfilter/nf_conntrack.h>
> #endif
>+#include <net/net_namespace.h>		/* Netw namespace */
>+
>+/*
>+ * Generic access of ipvs struct
>+ */
>+static inline struct netns_ipvs * net_ipvs(struct net* net) {
>+#ifdef CONFIG_NET_NS
>+	return net->ipvs;
>+#else
>+	return init_net.ipvs;
>+#endif
>+}

This looks redundant, because even in the !CONFIG_NET_NS case,
'net' is valid and already points to &init_net.

>+static void __net_exit __ip_vs_cleanup(struct net *net)
>+{
>+	struct netns_ipvs *ipvs = net_ipvs(net);
>+
>+	IP_VS_DBG(10, "ipvs netns %d released\n", ipvs->inc);
>+}
>+

^ permalink raw reply

* [PATCH] ehea: Use the standard logging functions
From: leitao @ 2010-12-13 14:04 UTC (permalink / raw)
  To: davem; +Cc: netdev, linux-kernel, Joe Perches, Breno Leitao
In-Reply-To: <98d5a9b37b40f9733324b0f488522f2d4e990676.1291861915.git.joe@perches.com>

Joe,

I fixed some compilation errors and re-generate the patch. Thanks!

Author: Joe Perches <joe@perches.com>

Remove ehea_error, ehea_info and ehea_debug macros.
Use pr_fmt, pr_<level>, netdev_<level> and netif_<level> as appropriate.
Fix messages to use trailing "\n", some messages had an extra one
as the old ehea_<level> macros added a trailing "\n".
Coalesced long format strings.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Breno Leitao <leitao@linux.vnet.ibm.com>
---
 drivers/net/ehea/ehea.h         |   13 --
 drivers/net/ehea/ehea_ethtool.c |   18 +-
 drivers/net/ehea/ehea_main.c    |  415 +++++++++++++++++++--------------------
 drivers/net/ehea/ehea_phyp.c    |   40 ++--
 drivers/net/ehea/ehea_qmr.c     |   89 +++++----
 5 files changed, 278 insertions(+), 297 deletions(-)

diff --git a/drivers/net/ehea/ehea.h b/drivers/net/ehea/ehea.h
index 1f2a675..a724a2d 100644
--- a/drivers/net/ehea/ehea.h
+++ b/drivers/net/ehea/ehea.h
@@ -130,19 +130,6 @@
 
 /* utility functions */
 
-#define ehea_info(fmt, args...) \
-	printk(KERN_INFO DRV_NAME ": " fmt "\n", ## args)
-
-#define ehea_error(fmt, args...) \
-	printk(KERN_ERR DRV_NAME ": Error in %s: " fmt "\n", __func__, ## args)
-
-#ifdef DEBUG
-#define ehea_debug(fmt, args...) \
-	printk(KERN_DEBUG DRV_NAME ": " fmt, ## args)
-#else
-#define ehea_debug(fmt, args...) do {} while (0)
-#endif
-
 void ehea_dump(void *adr, int len, char *msg);
 
 #define EHEA_BMASK(pos, length) (((pos) << 16) + (length))
diff --git a/drivers/net/ehea/ehea_ethtool.c b/drivers/net/ehea/ehea_ethtool.c
index 1f37ee6..afebf20 100644
--- a/drivers/net/ehea/ehea_ethtool.c
+++ b/drivers/net/ehea/ehea_ethtool.c
@@ -26,6 +26,8 @@
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include "ehea.h"
 #include "ehea_phyp.h"
 
@@ -118,10 +120,10 @@ doit:
 	ret = ehea_set_portspeed(port, sp);
 
 	if (!ret)
-		ehea_info("%s: Port speed successfully set: %dMbps "
-			  "%s Duplex",
-			  port->netdev->name, port->port_speed,
-			  port->full_duplex == 1 ? "Full" : "Half");
+		netdev_info(dev,
+			    "Port speed successfully set: %dMbps %s Duplex\n",
+			    port->port_speed,
+			    port->full_duplex == 1 ? "Full" : "Half");
 out:
 	return ret;
 }
@@ -134,10 +136,10 @@ static int ehea_nway_reset(struct net_device *dev)
 	ret = ehea_set_portspeed(port, EHEA_SPEED_AUTONEG);
 
 	if (!ret)
-		ehea_info("%s: Port speed successfully set: %dMbps "
-			  "%s Duplex",
-			  port->netdev->name, port->port_speed,
-			  port->full_duplex == 1 ? "Full" : "Half");
+		netdev_info(port->netdev,
+			    "Port speed successfully set: %dMbps %s Duplex\n",
+			    port->port_speed,
+			    port->full_duplex == 1 ? "Full" : "Half");
 	return ret;
 }
 
diff --git a/drivers/net/ehea/ehea_main.c b/drivers/net/ehea/ehea_main.c
index 81e5b7b..0dfef6d 100644
--- a/drivers/net/ehea/ehea_main.c
+++ b/drivers/net/ehea/ehea_main.c
@@ -26,6 +26,8 @@
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/in.h>
 #include <linux/ip.h>
 #include <linux/tcp.h>
@@ -135,8 +137,8 @@ void ehea_dump(void *adr, int len, char *msg)
 	int x;
 	unsigned char *deb = adr;
 	for (x = 0; x < len; x += 16) {
-		printk(DRV_NAME " %s adr=%p ofs=%04x %016llx %016llx\n", msg,
-			  deb, x, *((u64 *)&deb[0]), *((u64 *)&deb[8]));
+		pr_info("%s adr=%p ofs=%04x %016llx %016llx\n",
+			msg, deb, x, *((u64 *)&deb[0]), *((u64 *)&deb[8]));
 		deb += 16;
 	}
 }
@@ -336,7 +338,7 @@ static struct net_device_stats *ehea_get_stats(struct net_device *dev)
 
 	cb2 = (void *)get_zeroed_page(GFP_KERNEL);
 	if (!cb2) {
-		ehea_error("no mem for cb2");
+		netdev_err(dev, "no mem for cb2\n");
 		goto out;
 	}
 
@@ -344,7 +346,7 @@ static struct net_device_stats *ehea_get_stats(struct net_device *dev)
 				      port->logical_port_id,
 				      H_PORT_CB2, H_PORT_CB2_ALL, cb2);
 	if (hret != H_SUCCESS) {
-		ehea_error("query_ehea_port failed");
+		netdev_err(dev, "query_ehea_port failed\n");
 		goto out_herr;
 	}
 
@@ -399,7 +401,7 @@ static void ehea_refill_rq1(struct ehea_port_res *pr, int index, int nr_of_wqes)
 			skb_arr_rq1[index] = netdev_alloc_skb(dev,
 							      EHEA_L_PKT_SIZE);
 			if (!skb_arr_rq1[index]) {
-				ehea_info("Unable to allocate enough skb in the array\n");
+				netdev_info(dev, "Unable to allocate enough skb in the array\n");
 				pr->rq1_skba.os_skbs = fill_wqes - i;
 				break;
 			}
@@ -423,14 +425,14 @@ static void ehea_init_fill_rq1(struct ehea_port_res *pr, int nr_rq1a)
 	int i;
 
 	if (nr_rq1a > pr->rq1_skba.len) {
-		ehea_error("NR_RQ1A bigger than skb array len\n");
+		netdev_err(dev, "NR_RQ1A bigger than skb array len\n");
 		return;
 	}
 
 	for (i = 0; i < nr_rq1a; i++) {
 		skb_arr_rq1[i] = netdev_alloc_skb(dev, EHEA_L_PKT_SIZE);
 		if (!skb_arr_rq1[i]) {
-			ehea_info("No enough memory to allocate skb array\n");
+			netdev_info(dev, "Not enough memory to allocate skb array\n");
 			break;
 		}
 	}
@@ -468,8 +470,9 @@ static int ehea_refill_rq_def(struct ehea_port_res *pr,
 		if (!skb) {
 			q_skba->os_skbs = fill_wqes - i;
 			if (q_skba->os_skbs == q_skba->len - 2) {
-				ehea_info("%s: rq%i ran dry - no mem for skb",
-					  pr->port->netdev->name, rq_nr);
+				netdev_info(pr->port->netdev,
+					    "rq%i ran dry - no mem for skb\n",
+					    rq_nr);
 				ret = -ENOMEM;
 			}
 			break;
@@ -634,8 +637,8 @@ static int ehea_treat_poll_error(struct ehea_port_res *pr, int rq,
 
 	if (cqe->status & EHEA_CQE_STAT_FAT_ERR_MASK) {
 		if (netif_msg_rx_err(pr->port)) {
-			ehea_error("Critical receive error for QP %d. "
-				   "Resetting port.", pr->qp->init_attr.qp_nr);
+			pr_err("Critical receive error for QP %d. Resetting port.\n",
+			       pr->qp->init_attr.qp_nr);
 			ehea_dump(cqe, sizeof(*cqe), "CQE");
 		}
 		ehea_schedule_port_reset(pr->port);
@@ -737,13 +740,13 @@ static int ehea_proc_rwqes(struct net_device *dev,
 							  skb_arr_rq1_len,
 							  wqe_index);
 				if (unlikely(!skb)) {
-					if (netif_msg_rx_err(port))
-						ehea_error("LL rq1: skb=NULL");
+					netif_err(port, rx_err, dev,
+						  "LL rq1: skb=NULL\n");
 
 					skb = netdev_alloc_skb(dev,
 							       EHEA_L_PKT_SIZE);
 					if (!skb) {
-						ehea_info("Not enough memory to allocate skb\n");
+						netdev_info(dev, "Not enough memory to allocate skb\n");
 						break;
 					}
 				}
@@ -755,8 +758,8 @@ static int ehea_proc_rwqes(struct net_device *dev,
 				skb = get_skb_by_index(skb_arr_rq2,
 						       skb_arr_rq2_len, cqe);
 				if (unlikely(!skb)) {
-					if (netif_msg_rx_err(port))
-						ehea_error("rq2: skb=NULL");
+					netif_err(port, rx_err, dev,
+						  "rq2: skb=NULL\n");
 					break;
 				}
 				ehea_fill_skb(dev, skb, cqe);
@@ -766,8 +769,8 @@ static int ehea_proc_rwqes(struct net_device *dev,
 				skb = get_skb_by_index(skb_arr_rq3,
 						       skb_arr_rq3_len, cqe);
 				if (unlikely(!skb)) {
-					if (netif_msg_rx_err(port))
-						ehea_error("rq3: skb=NULL");
+					netif_err(port, rx_err, dev,
+						  "rq3: skb=NULL\n");
 					break;
 				}
 				ehea_fill_skb(dev, skb, cqe);
@@ -839,7 +842,7 @@ static void check_sqs(struct ehea_port *port)
 					 msecs_to_jiffies(100));
 
 		if (!ret) {
-			ehea_error("HW/SW queues out of sync");
+			pr_err("HW/SW queues out of sync\n");
 			ehea_schedule_port_reset(pr->port);
 			return;
 		}
@@ -872,14 +875,14 @@ static struct ehea_cqe *ehea_proc_cqes(struct ehea_port_res *pr, int my_quota)
 		}
 
 		if (cqe->status & EHEA_CQE_STAT_ERR_MASK) {
-			ehea_error("Bad send completion status=0x%04X",
-				   cqe->status);
+			pr_err("Bad send completion status=0x%04X\n",
+			       cqe->status);
 
 			if (netif_msg_tx_err(pr->port))
 				ehea_dump(cqe, sizeof(*cqe), "Send CQE");
 
 			if (cqe->status & EHEA_CQE_STAT_RESET_MASK) {
-				ehea_error("Resetting port");
+				pr_err("Resetting port\n");
 				ehea_schedule_port_reset(pr->port);
 				break;
 			}
@@ -997,8 +1000,8 @@ static irqreturn_t ehea_qp_aff_irq_handler(int irq, void *param)
 
 	while (eqe) {
 		qp_token = EHEA_BMASK_GET(EHEA_EQE_QP_TOKEN, eqe->entry);
-		ehea_error("QP aff_err: entry=0x%llx, token=0x%x",
-			   eqe->entry, qp_token);
+		pr_err("QP aff_err: entry=0x%llx, token=0x%x\n",
+		       eqe->entry, qp_token);
 
 		qp = port->port_res[qp_token].qp;
 
@@ -1016,7 +1019,7 @@ static irqreturn_t ehea_qp_aff_irq_handler(int irq, void *param)
 	}
 
 	if (reset_port) {
-		ehea_error("Resetting port");
+		pr_err("Resetting port\n");
 		ehea_schedule_port_reset(port);
 	}
 
@@ -1044,7 +1047,7 @@ int ehea_sense_port_attr(struct ehea_port *port)
 	/* may be called via ehea_neq_tasklet() */
 	cb0 = (void *)get_zeroed_page(GFP_ATOMIC);
 	if (!cb0) {
-		ehea_error("no mem for cb0");
+		pr_err("no mem for cb0\n");
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -1136,7 +1139,7 @@ int ehea_set_portspeed(struct ehea_port *port, u32 port_speed)
 
 	cb4 = (void *)get_zeroed_page(GFP_KERNEL);
 	if (!cb4) {
-		ehea_error("no mem for cb4");
+		pr_err("no mem for cb4\n");
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -1187,16 +1190,16 @@ int ehea_set_portspeed(struct ehea_port *port, u32 port_speed)
 				break;
 			}
 		} else {
-			ehea_error("Failed sensing port speed");
+			pr_err("Failed sensing port speed\n");
 			ret = -EIO;
 		}
 	} else {
 		if (hret == H_AUTHORITY) {
-			ehea_info("Hypervisor denied setting port speed");
+			pr_info("Hypervisor denied setting port speed\n");
 			ret = -EPERM;
 		} else {
 			ret = -EIO;
-			ehea_error("Failed setting port speed");
+			pr_err("Failed setting port speed\n");
 		}
 	}
 	if (!prop_carrier_state || (port->phy_link == EHEA_PHY_LINK_UP))
@@ -1213,80 +1216,78 @@ static void ehea_parse_eqe(struct ehea_adapter *adapter, u64 eqe)
 	u8 ec;
 	u8 portnum;
 	struct ehea_port *port;
+	struct net_device *dev;
 
 	ec = EHEA_BMASK_GET(NEQE_EVENT_CODE, eqe);
 	portnum = EHEA_BMASK_GET(NEQE_PORTNUM, eqe);
 	port = ehea_get_port(adapter, portnum);
+	dev = port->netdev;
 
 	switch (ec) {
 	case EHEA_EC_PORTSTATE_CHG:	/* port state change */
 
 		if (!port) {
-			ehea_error("unknown portnum %x", portnum);
+			netdev_err(dev, "unknown portnum %x\n", portnum);
 			break;
 		}
 
 		if (EHEA_BMASK_GET(NEQE_PORT_UP, eqe)) {
-			if (!netif_carrier_ok(port->netdev)) {
+			if (!netif_carrier_ok(dev)) {
 				ret = ehea_sense_port_attr(port);
 				if (ret) {
-					ehea_error("failed resensing port "
-						   "attributes");
+					netdev_err(dev, "failed resensing port attributes\n");
 					break;
 				}
 
-				if (netif_msg_link(port))
-					ehea_info("%s: Logical port up: %dMbps "
-						  "%s Duplex",
-						  port->netdev->name,
-						  port->port_speed,
-						  port->full_duplex ==
-						  1 ? "Full" : "Half");
+				netif_info(port, link, dev,
+					   "Logical port up: %dMbps %s Duplex\n",
+					   port->port_speed,
+					   port->full_duplex == 1 ?
+					   "Full" : "Half");
 
-				netif_carrier_on(port->netdev);
-				netif_wake_queue(port->netdev);
+				netif_carrier_on(dev);
+				netif_wake_queue(dev);
 			}
 		} else
-			if (netif_carrier_ok(port->netdev)) {
-				if (netif_msg_link(port))
-					ehea_info("%s: Logical port down",
-						  port->netdev->name);
-				netif_carrier_off(port->netdev);
-				netif_stop_queue(port->netdev);
+			if (netif_carrier_ok(dev)) {
+				netif_info(port, link, dev,
+					   "Logical port down\n");
+				netif_carrier_off(dev);
+				netif_stop_queue(dev);
 			}
 
 		if (EHEA_BMASK_GET(NEQE_EXTSWITCH_PORT_UP, eqe)) {
 			port->phy_link = EHEA_PHY_LINK_UP;
-			if (netif_msg_link(port))
-				ehea_info("%s: Physical port up",
-					  port->netdev->name);
+			netif_info(port, link, dev,
+				   "Physical port up\n");
 			if (prop_carrier_state)
-				netif_carrier_on(port->netdev);
+				netif_carrier_on(dev);
 		} else {
 			port->phy_link = EHEA_PHY_LINK_DOWN;
-			if (netif_msg_link(port))
-				ehea_info("%s: Physical port down",
-					  port->netdev->name);
+			netif_info(port, link, dev,
+				   "Physical port down\n");
 			if (prop_carrier_state)
-				netif_carrier_off(port->netdev);
+				netif_carrier_off(dev);
 		}
 
 		if (EHEA_BMASK_GET(NEQE_EXTSWITCH_PRIMARY, eqe))
-			ehea_info("External switch port is primary port");
+			netdev_info(dev,
+				    "External switch port is primary port\n");
 		else
-			ehea_info("External switch port is backup port");
+			netdev_info(dev,
+				    "External switch port is backup port\n");
 
 		break;
 	case EHEA_EC_ADAPTER_MALFUNC:
-		ehea_error("Adapter malfunction");
+		netdev_err(dev, "Adapter malfunction\n");
 		break;
 	case EHEA_EC_PORT_MALFUNC:
-		ehea_info("Port malfunction: Device: %s", port->netdev->name);
-		netif_carrier_off(port->netdev);
-		netif_stop_queue(port->netdev);
+		netdev_info(dev, "Port malfunction\n");
+		netif_carrier_off(dev);
+		netif_stop_queue(dev);
 		break;
 	default:
-		ehea_error("unknown event code %x, eqe=0x%llX", ec, eqe);
+		netdev_err(dev, "unknown event code %x, eqe=0x%llX\n", ec, eqe);
 		break;
 	}
 }
@@ -1298,13 +1299,13 @@ static void ehea_neq_tasklet(unsigned long data)
 	u64 event_mask;
 
 	eqe = ehea_poll_eq(adapter->neq);
-	ehea_debug("eqe=%p", eqe);
+	pr_debug("eqe=%p\n", eqe);
 
 	while (eqe) {
-		ehea_debug("*eqe=%lx", eqe->entry);
+		pr_debug("*eqe=%lx\n", (unsigned long) eqe->entry);
 		ehea_parse_eqe(adapter, eqe->entry);
 		eqe = ehea_poll_eq(adapter->neq);
-		ehea_debug("next eqe=%p", eqe);
+		pr_debug("next eqe=%p\n", eqe);
 	}
 
 	event_mask = EHEA_BMASK_SET(NELR_PORTSTATE_CHG, 1)
@@ -1353,14 +1354,14 @@ static int ehea_reg_interrupts(struct net_device *dev)
 				  ehea_qp_aff_irq_handler,
 				  IRQF_DISABLED, port->int_aff_name, port);
 	if (ret) {
-		ehea_error("failed registering irq for qp_aff_irq_handler:"
-			   "ist=%X", port->qp_eq->attr.ist1);
+		netdev_err(dev, "failed registering irq for qp_aff_irq_handler:ist=%X\n",
+			   port->qp_eq->attr.ist1);
 		goto out_free_qpeq;
 	}
 
-	if (netif_msg_ifup(port))
-		ehea_info("irq_handle 0x%X for function qp_aff_irq_handler "
-			  "registered", port->qp_eq->attr.ist1);
+	netif_info(port, ifup, dev,
+		   "irq_handle 0x%X for function qp_aff_irq_handler registered\n",
+		   port->qp_eq->attr.ist1);
 
 
 	for (i = 0; i < port->num_def_qps + port->num_add_tx_qps; i++) {
@@ -1372,14 +1373,13 @@ static int ehea_reg_interrupts(struct net_device *dev)
 					  IRQF_DISABLED, pr->int_send_name,
 					  pr);
 		if (ret) {
-			ehea_error("failed registering irq for ehea_queue "
-				   "port_res_nr:%d, ist=%X", i,
-				   pr->eq->attr.ist1);
+			netdev_err(dev, "failed registering irq for ehea_queue port_res_nr:%d, ist=%X\n",
+				   i, pr->eq->attr.ist1);
 			goto out_free_req;
 		}
-		if (netif_msg_ifup(port))
-			ehea_info("irq_handle 0x%X for function ehea_queue_int "
-				  "%d registered", pr->eq->attr.ist1, i);
+		netif_info(port, ifup, dev,
+			   "irq_handle 0x%X for function ehea_queue_int %d registered\n",
+			   pr->eq->attr.ist1, i);
 	}
 out:
 	return ret;
@@ -1410,16 +1410,16 @@ static void ehea_free_interrupts(struct net_device *dev)
 	for (i = 0; i < port->num_def_qps + port->num_add_tx_qps; i++) {
 		pr = &port->port_res[i];
 		ibmebus_free_irq(pr->eq->attr.ist1, pr);
-		if (netif_msg_intr(port))
-			ehea_info("free send irq for res %d with handle 0x%X",
-				  i, pr->eq->attr.ist1);
+		netif_info(port, intr, dev,
+			   "free send irq for res %d with handle 0x%X\n",
+			   i, pr->eq->attr.ist1);
 	}
 
 	/* associated events */
 	ibmebus_free_irq(port->qp_eq->attr.ist1, port);
-	if (netif_msg_intr(port))
-		ehea_info("associated event interrupt for handle 0x%X freed",
-			  port->qp_eq->attr.ist1);
+	netif_info(port, intr, dev,
+		   "associated event interrupt for handle 0x%X freed\n",
+		   port->qp_eq->attr.ist1);
 }
 
 static int ehea_configure_port(struct ehea_port *port)
@@ -1488,7 +1488,7 @@ int ehea_gen_smrs(struct ehea_port_res *pr)
 out_free:
 	ehea_rem_mr(&pr->send_mr);
 out:
-	ehea_error("Generating SMRS failed\n");
+	pr_err("Generating SMRS failed\n");
 	return -EIO;
 }
 
@@ -1543,7 +1543,7 @@ static int ehea_init_port_res(struct ehea_port *port, struct ehea_port_res *pr,
 
 	pr->eq = ehea_create_eq(adapter, eq_type, EHEA_MAX_ENTRIES_EQ, 0);
 	if (!pr->eq) {
-		ehea_error("create_eq failed (eq)");
+		pr_err("create_eq failed (eq)\n");
 		goto out_free;
 	}
 
@@ -1551,7 +1551,7 @@ static int ehea_init_port_res(struct ehea_port *port, struct ehea_port_res *pr,
 				     pr->eq->fw_handle,
 				     port->logical_port_id);
 	if (!pr->recv_cq) {
-		ehea_error("create_cq failed (cq_recv)");
+		pr_err("create_cq failed (cq_recv)\n");
 		goto out_free;
 	}
 
@@ -1559,19 +1559,19 @@ static int ehea_init_port_res(struct ehea_port *port, struct ehea_port_res *pr,
 				     pr->eq->fw_handle,
 				     port->logical_port_id);
 	if (!pr->send_cq) {
-		ehea_error("create_cq failed (cq_send)");
+		pr_err("create_cq failed (cq_send)\n");
 		goto out_free;
 	}
 
 	if (netif_msg_ifup(port))
-		ehea_info("Send CQ: act_nr_cqes=%d, Recv CQ: act_nr_cqes=%d",
-			  pr->send_cq->attr.act_nr_of_cqes,
-			  pr->recv_cq->attr.act_nr_of_cqes);
+		pr_info("Send CQ: act_nr_cqes=%d, Recv CQ: act_nr_cqes=%d\n",
+			pr->send_cq->attr.act_nr_of_cqes,
+			pr->recv_cq->attr.act_nr_of_cqes);
 
 	init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL);
 	if (!init_attr) {
 		ret = -ENOMEM;
-		ehea_error("no mem for ehea_qp_init_attr");
+		pr_err("no mem for ehea_qp_init_attr\n");
 		goto out_free;
 	}
 
@@ -1596,18 +1596,18 @@ static int ehea_init_port_res(struct ehea_port *port, struct ehea_port_res *pr,
 
 	pr->qp = ehea_create_qp(adapter, adapter->pd, init_attr);
 	if (!pr->qp) {
-		ehea_error("create_qp failed");
+		pr_err("create_qp failed\n");
 		ret = -EIO;
 		goto out_free;
 	}
 
 	if (netif_msg_ifup(port))
-		ehea_info("QP: qp_nr=%d\n act_nr_snd_wqe=%d\n nr_rwqe_rq1=%d\n "
-			  "nr_rwqe_rq2=%d\n nr_rwqe_rq3=%d", init_attr->qp_nr,
-			  init_attr->act_nr_send_wqes,
-			  init_attr->act_nr_rwqes_rq1,
-			  init_attr->act_nr_rwqes_rq2,
-			  init_attr->act_nr_rwqes_rq3);
+		pr_info("QP: qp_nr=%d\n act_nr_snd_wqe=%d\n nr_rwqe_rq1=%d\n nr_rwqe_rq2=%d\n nr_rwqe_rq3=%d\n",
+			init_attr->qp_nr,
+			init_attr->act_nr_send_wqes,
+			init_attr->act_nr_rwqes_rq1,
+			init_attr->act_nr_rwqes_rq2,
+			init_attr->act_nr_rwqes_rq3);
 
 	pr->sq_skba_size = init_attr->act_nr_send_wqes + 1;
 
@@ -1758,7 +1758,7 @@ static void write_swqe2_TSO(struct sk_buff *skb,
 			swqe->descriptors++;
 		}
 	} else
-		ehea_error("cannot handle fragmented headers");
+		pr_err("cannot handle fragmented headers\n");
 }
 
 static void write_swqe2_nonTSO(struct sk_buff *skb,
@@ -1854,8 +1854,8 @@ static int ehea_broadcast_reg_helper(struct ehea_port *port, u32 hcallid)
 				     port->logical_port_id,
 				     reg_type, port->mac_addr, 0, hcallid);
 	if (hret != H_SUCCESS) {
-		ehea_error("%sregistering bc address failed (tagged)",
-			   hcallid == H_REG_BCMC ? "" : "de");
+		pr_err("%sregistering bc address failed (tagged)\n",
+		       hcallid == H_REG_BCMC ? "" : "de");
 		ret = -EIO;
 		goto out_herr;
 	}
@@ -1866,8 +1866,8 @@ static int ehea_broadcast_reg_helper(struct ehea_port *port, u32 hcallid)
 				     port->logical_port_id,
 				     reg_type, port->mac_addr, 0, hcallid);
 	if (hret != H_SUCCESS) {
-		ehea_error("%sregistering bc address failed (vlan)",
-			   hcallid == H_REG_BCMC ? "" : "de");
+		pr_err("%sregistering bc address failed (vlan)\n",
+		       hcallid == H_REG_BCMC ? "" : "de");
 		ret = -EIO;
 	}
 out_herr:
@@ -1889,7 +1889,7 @@ static int ehea_set_mac_addr(struct net_device *dev, void *sa)
 
 	cb0 = (void *)get_zeroed_page(GFP_KERNEL);
 	if (!cb0) {
-		ehea_error("no mem for cb0");
+		pr_err("no mem for cb0\n");
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -1937,11 +1937,11 @@ out:
 static void ehea_promiscuous_error(u64 hret, int enable)
 {
 	if (hret == H_AUTHORITY)
-		ehea_info("Hypervisor denied %sabling promiscuous mode",
-			  enable == 1 ? "en" : "dis");
+		pr_info("Hypervisor denied %sabling promiscuous mode\n",
+			enable == 1 ? "en" : "dis");
 	else
-		ehea_error("failed %sabling promiscuous mode",
-			   enable == 1 ? "en" : "dis");
+		pr_err("failed %sabling promiscuous mode\n",
+		       enable == 1 ? "en" : "dis");
 }
 
 static void ehea_promiscuous(struct net_device *dev, int enable)
@@ -1955,7 +1955,7 @@ static void ehea_promiscuous(struct net_device *dev, int enable)
 
 	cb7 = (void *)get_zeroed_page(GFP_ATOMIC);
 	if (!cb7) {
-		ehea_error("no mem for cb7");
+		pr_err("no mem for cb7\n");
 		goto out;
 	}
 
@@ -2015,7 +2015,7 @@ static int ehea_drop_multicast_list(struct net_device *dev)
 		hret = ehea_multicast_reg_helper(port, mc_entry->macaddr,
 						 H_DEREG_BCMC);
 		if (hret) {
-			ehea_error("failed deregistering mcast MAC");
+			pr_err("failed deregistering mcast MAC\n");
 			ret = -EIO;
 		}
 
@@ -2038,7 +2038,8 @@ static void ehea_allmulti(struct net_device *dev, int enable)
 			if (!hret)
 				port->allmulti = 1;
 			else
-				ehea_error("failed enabling IFF_ALLMULTI");
+				netdev_err(dev,
+					   "failed enabling IFF_ALLMULTI\n");
 		}
 	} else
 		if (!enable) {
@@ -2047,7 +2048,8 @@ static void ehea_allmulti(struct net_device *dev, int enable)
 			if (!hret)
 				port->allmulti = 0;
 			else
-				ehea_error("failed disabling IFF_ALLMULTI");
+				netdev_err(dev,
+					   "failed disabling IFF_ALLMULTI\n");
 		}
 }
 
@@ -2058,7 +2060,7 @@ static void ehea_add_multicast_entry(struct ehea_port *port, u8 *mc_mac_addr)
 
 	ehea_mcl_entry = kzalloc(sizeof(*ehea_mcl_entry), GFP_ATOMIC);
 	if (!ehea_mcl_entry) {
-		ehea_error("no mem for mcl_entry");
+		pr_err("no mem for mcl_entry\n");
 		return;
 	}
 
@@ -2071,7 +2073,7 @@ static void ehea_add_multicast_entry(struct ehea_port *port, u8 *mc_mac_addr)
 	if (!hret)
 		list_add(&ehea_mcl_entry->list, &port->mc_list->list);
 	else {
-		ehea_error("failed registering mcast MAC");
+		pr_err("failed registering mcast MAC\n");
 		kfree(ehea_mcl_entry);
 	}
 }
@@ -2104,9 +2106,8 @@ static void ehea_set_multicast_list(struct net_device *dev)
 		}
 
 		if (netdev_mc_count(dev) > port->adapter->max_mc_mac) {
-			ehea_info("Mcast registration limit reached (0x%llx). "
-				  "Use ALLMULTI!",
-				  port->adapter->max_mc_mac);
+			pr_info("Mcast registration limit reached (0x%llx). Use ALLMULTI!\n",
+				port->adapter->max_mc_mac);
 			goto out;
 		}
 
@@ -2312,10 +2313,10 @@ static int ehea_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 	pr->swqe_id_counter += 1;
 
-	if (netif_msg_tx_queued(port)) {
-		ehea_info("post swqe on QP %d", pr->qp->init_attr.qp_nr);
+	netif_info(port, tx_queued, dev,
+		   "post swqe on QP %d\n", pr->qp->init_attr.qp_nr);
+	if (netif_msg_tx_queued(port))
 		ehea_dump(swqe, 512, "swqe");
-	}
 
 	if (unlikely(test_bit(__EHEA_STOP_XFER, &ehea_driver_flags))) {
 		netif_stop_queue(dev);
@@ -2351,14 +2352,14 @@ static void ehea_vlan_rx_register(struct net_device *dev,
 
 	cb1 = (void *)get_zeroed_page(GFP_KERNEL);
 	if (!cb1) {
-		ehea_error("no mem for cb1");
+		pr_err("no mem for cb1\n");
 		goto out;
 	}
 
 	hret = ehea_h_modify_ehea_port(adapter->handle, port->logical_port_id,
 				       H_PORT_CB1, H_PORT_CB1_ALL, cb1);
 	if (hret != H_SUCCESS)
-		ehea_error("modify_ehea_port failed");
+		pr_err("modify_ehea_port failed\n");
 
 	free_page((unsigned long)cb1);
 out:
@@ -2375,14 +2376,14 @@ static void ehea_vlan_rx_add_vid(struct net_device *dev, unsigned short vid)
 
 	cb1 = (void *)get_zeroed_page(GFP_KERNEL);
 	if (!cb1) {
-		ehea_error("no mem for cb1");
+		pr_err("no mem for cb1\n");
 		goto out;
 	}
 
 	hret = ehea_h_query_ehea_port(adapter->handle, port->logical_port_id,
 				      H_PORT_CB1, H_PORT_CB1_ALL, cb1);
 	if (hret != H_SUCCESS) {
-		ehea_error("query_ehea_port failed");
+		pr_err("query_ehea_port failed\n");
 		goto out;
 	}
 
@@ -2392,7 +2393,7 @@ static void ehea_vlan_rx_add_vid(struct net_device *dev, unsigned short vid)
 	hret = ehea_h_modify_ehea_port(adapter->handle, port->logical_port_id,
 				       H_PORT_CB1, H_PORT_CB1_ALL, cb1);
 	if (hret != H_SUCCESS)
-		ehea_error("modify_ehea_port failed");
+		pr_err("modify_ehea_port failed\n");
 out:
 	free_page((unsigned long)cb1);
 	return;
@@ -2410,14 +2411,14 @@ static void ehea_vlan_rx_kill_vid(struct net_device *dev, unsigned short vid)
 
 	cb1 = (void *)get_zeroed_page(GFP_KERNEL);
 	if (!cb1) {
-		ehea_error("no mem for cb1");
+		pr_err("no mem for cb1\n");
 		goto out;
 	}
 
 	hret = ehea_h_query_ehea_port(adapter->handle, port->logical_port_id,
 				      H_PORT_CB1, H_PORT_CB1_ALL, cb1);
 	if (hret != H_SUCCESS) {
-		ehea_error("query_ehea_port failed");
+		pr_err("query_ehea_port failed\n");
 		goto out;
 	}
 
@@ -2427,7 +2428,7 @@ static void ehea_vlan_rx_kill_vid(struct net_device *dev, unsigned short vid)
 	hret = ehea_h_modify_ehea_port(adapter->handle, port->logical_port_id,
 				       H_PORT_CB1, H_PORT_CB1_ALL, cb1);
 	if (hret != H_SUCCESS)
-		ehea_error("modify_ehea_port failed");
+		pr_err("modify_ehea_port failed\n");
 out:
 	free_page((unsigned long)cb1);
 }
@@ -2449,7 +2450,7 @@ int ehea_activate_qp(struct ehea_adapter *adapter, struct ehea_qp *qp)
 	hret = ehea_h_query_ehea_qp(adapter->handle, 0, qp->fw_handle,
 				    EHEA_BMASK_SET(H_QPCB0_ALL, 0xFFFF), cb0);
 	if (hret != H_SUCCESS) {
-		ehea_error("query_ehea_qp failed (1)");
+		pr_err("query_ehea_qp failed (1)\n");
 		goto out;
 	}
 
@@ -2458,14 +2459,14 @@ int ehea_activate_qp(struct ehea_adapter *adapter, struct ehea_qp *qp)
 				     EHEA_BMASK_SET(H_QPCB0_QP_CTL_REG, 1), cb0,
 				     &dummy64, &dummy64, &dummy16, &dummy16);
 	if (hret != H_SUCCESS) {
-		ehea_error("modify_ehea_qp failed (1)");
+		pr_err("modify_ehea_qp failed (1)\n");
 		goto out;
 	}
 
 	hret = ehea_h_query_ehea_qp(adapter->handle, 0, qp->fw_handle,
 				    EHEA_BMASK_SET(H_QPCB0_ALL, 0xFFFF), cb0);
 	if (hret != H_SUCCESS) {
-		ehea_error("query_ehea_qp failed (2)");
+		pr_err("query_ehea_qp failed (2)\n");
 		goto out;
 	}
 
@@ -2474,14 +2475,14 @@ int ehea_activate_qp(struct ehea_adapter *adapter, struct ehea_qp *qp)
 				     EHEA_BMASK_SET(H_QPCB0_QP_CTL_REG, 1), cb0,
 				     &dummy64, &dummy64, &dummy16, &dummy16);
 	if (hret != H_SUCCESS) {
-		ehea_error("modify_ehea_qp failed (2)");
+		pr_err("modify_ehea_qp failed (2)\n");
 		goto out;
 	}
 
 	hret = ehea_h_query_ehea_qp(adapter->handle, 0, qp->fw_handle,
 				    EHEA_BMASK_SET(H_QPCB0_ALL, 0xFFFF), cb0);
 	if (hret != H_SUCCESS) {
-		ehea_error("query_ehea_qp failed (3)");
+		pr_err("query_ehea_qp failed (3)\n");
 		goto out;
 	}
 
@@ -2490,14 +2491,14 @@ int ehea_activate_qp(struct ehea_adapter *adapter, struct ehea_qp *qp)
 				     EHEA_BMASK_SET(H_QPCB0_QP_CTL_REG, 1), cb0,
 				     &dummy64, &dummy64, &dummy16, &dummy16);
 	if (hret != H_SUCCESS) {
-		ehea_error("modify_ehea_qp failed (3)");
+		pr_err("modify_ehea_qp failed (3)\n");
 		goto out;
 	}
 
 	hret = ehea_h_query_ehea_qp(adapter->handle, 0, qp->fw_handle,
 				    EHEA_BMASK_SET(H_QPCB0_ALL, 0xFFFF), cb0);
 	if (hret != H_SUCCESS) {
-		ehea_error("query_ehea_qp failed (4)");
+		pr_err("query_ehea_qp failed (4)\n");
 		goto out;
 	}
 
@@ -2518,7 +2519,7 @@ static int ehea_port_res_setup(struct ehea_port *port, int def_qps,
 				   EHEA_MAX_ENTRIES_EQ, 1);
 	if (!port->qp_eq) {
 		ret = -EINVAL;
-		ehea_error("ehea_create_eq failed (qp_eq)");
+		pr_err("ehea_create_eq failed (qp_eq)\n");
 		goto out_kill_eq;
 	}
 
@@ -2599,27 +2600,27 @@ static int ehea_up(struct net_device *dev)
 	ret = ehea_port_res_setup(port, port->num_def_qps,
 				  port->num_add_tx_qps);
 	if (ret) {
-		ehea_error("port_res_failed");
+		netdev_err(dev, "port_res_failed\n");
 		goto out;
 	}
 
 	/* Set default QP for this port */
 	ret = ehea_configure_port(port);
 	if (ret) {
-		ehea_error("ehea_configure_port failed. ret:%d", ret);
+		netdev_err(dev, "ehea_configure_port failed. ret:%d\n", ret);
 		goto out_clean_pr;
 	}
 
 	ret = ehea_reg_interrupts(dev);
 	if (ret) {
-		ehea_error("reg_interrupts failed. ret:%d", ret);
+		netdev_err(dev, "reg_interrupts failed. ret:%d\n", ret);
 		goto out_clean_pr;
 	}
 
 	for (i = 0; i < port->num_def_qps + port->num_add_tx_qps; i++) {
 		ret = ehea_activate_qp(port->adapter, port->port_res[i].qp);
 		if (ret) {
-			ehea_error("activate_qp failed");
+			netdev_err(dev, "activate_qp failed\n");
 			goto out_free_irqs;
 		}
 	}
@@ -2627,7 +2628,7 @@ static int ehea_up(struct net_device *dev)
 	for (i = 0; i < port->num_def_qps; i++) {
 		ret = ehea_fill_port_res(&port->port_res[i]);
 		if (ret) {
-			ehea_error("out_free_irqs");
+			netdev_err(dev, "out_free_irqs\n");
 			goto out_free_irqs;
 		}
 	}
@@ -2650,7 +2651,7 @@ out_clean_pr:
 	ehea_clean_all_portres(port);
 out:
 	if (ret)
-		ehea_info("Failed starting %s. ret=%i", dev->name, ret);
+		netdev_info(dev, "Failed starting. ret=%i\n", ret);
 
 	ehea_update_bcmc_registrations();
 	ehea_update_firmware_handles();
@@ -2681,8 +2682,7 @@ static int ehea_open(struct net_device *dev)
 
 	mutex_lock(&port->port_lock);
 
-	if (netif_msg_ifup(port))
-		ehea_info("enabling port %s", dev->name);
+	netif_info(port, ifup, dev, "enabling port\n");
 
 	ret = ehea_up(dev);
 	if (!ret) {
@@ -2717,8 +2717,7 @@ static int ehea_down(struct net_device *dev)
 
 	ret = ehea_clean_all_portres(port);
 	if (ret)
-		ehea_info("Failed freeing resources for %s. ret=%i",
-			  dev->name, ret);
+		netdev_info(dev, "Failed freeing resources. ret=%i\n", ret);
 
 	ehea_update_firmware_handles();
 
@@ -2730,8 +2729,7 @@ static int ehea_stop(struct net_device *dev)
 	int ret;
 	struct ehea_port *port = netdev_priv(dev);
 
-	if (netif_msg_ifdown(port))
-		ehea_info("disabling port %s", dev->name);
+	netif_info(port, ifdown, dev, "disabling port\n");
 
 	set_bit(__EHEA_DISABLE_PORT_RESET, &port->flags);
 	cancel_work_sync(&port->reset_task);
@@ -2772,7 +2770,7 @@ static void ehea_flush_sq(struct ehea_port *port)
 			 msecs_to_jiffies(100));
 
 		if (!ret) {
-			ehea_error("WARNING: sq not flushed completely");
+			pr_err("WARNING: sq not flushed completely\n");
 			break;
 		}
 	}
@@ -2808,7 +2806,7 @@ int ehea_stop_qps(struct net_device *dev)
 					    EHEA_BMASK_SET(H_QPCB0_ALL, 0xFFFF),
 					    cb0);
 		if (hret != H_SUCCESS) {
-			ehea_error("query_ehea_qp failed (1)");
+			pr_err("query_ehea_qp failed (1)\n");
 			goto out;
 		}
 
@@ -2820,7 +2818,7 @@ int ehea_stop_qps(struct net_device *dev)
 							    1), cb0, &dummy64,
 					     &dummy64, &dummy16, &dummy16);
 		if (hret != H_SUCCESS) {
-			ehea_error("modify_ehea_qp failed (1)");
+			pr_err("modify_ehea_qp failed (1)\n");
 			goto out;
 		}
 
@@ -2828,14 +2826,14 @@ int ehea_stop_qps(struct net_device *dev)
 					    EHEA_BMASK_SET(H_QPCB0_ALL, 0xFFFF),
 					    cb0);
 		if (hret != H_SUCCESS) {
-			ehea_error("query_ehea_qp failed (2)");
+			pr_err("query_ehea_qp failed (2)\n");
 			goto out;
 		}
 
 		/* deregister shared memory regions */
 		dret = ehea_rem_smrs(pr);
 		if (dret) {
-			ehea_error("unreg shared memory region failed");
+			pr_err("unreg shared memory region failed\n");
 			goto out;
 		}
 	}
@@ -2904,7 +2902,7 @@ int ehea_restart_qps(struct net_device *dev)
 
 		ret = ehea_gen_smrs(pr);
 		if (ret) {
-			ehea_error("creation of shared memory regions failed");
+			netdev_err(dev, "creation of shared memory regions failed\n");
 			goto out;
 		}
 
@@ -2915,7 +2913,7 @@ int ehea_restart_qps(struct net_device *dev)
 					    EHEA_BMASK_SET(H_QPCB0_ALL, 0xFFFF),
 					    cb0);
 		if (hret != H_SUCCESS) {
-			ehea_error("query_ehea_qp failed (1)");
+			netdev_err(dev, "query_ehea_qp failed (1)\n");
 			goto out;
 		}
 
@@ -2927,7 +2925,7 @@ int ehea_restart_qps(struct net_device *dev)
 							    1), cb0, &dummy64,
 					     &dummy64, &dummy16, &dummy16);
 		if (hret != H_SUCCESS) {
-			ehea_error("modify_ehea_qp failed (1)");
+			netdev_err(dev, "modify_ehea_qp failed (1)\n");
 			goto out;
 		}
 
@@ -2935,7 +2933,7 @@ int ehea_restart_qps(struct net_device *dev)
 					    EHEA_BMASK_SET(H_QPCB0_ALL, 0xFFFF),
 					    cb0);
 		if (hret != H_SUCCESS) {
-			ehea_error("query_ehea_qp failed (2)");
+			netdev_err(dev, "query_ehea_qp failed (2)\n");
 			goto out;
 		}
 
@@ -2972,8 +2970,7 @@ static void ehea_reset_port(struct work_struct *work)
 
 	ehea_set_multicast_list(dev);
 
-	if (netif_msg_timer(port))
-		ehea_info("Device %s resetted successfully", dev->name);
+	netif_info(port, timer, dev, "reset successful\n");
 
 	port_napi_enable(port);
 
@@ -2988,7 +2985,7 @@ static void ehea_rereg_mrs(void)
 	int ret, i;
 	struct ehea_adapter *adapter;
 
-	ehea_info("LPAR memory changed - re-initializing driver");
+	pr_info("LPAR memory changed - re-initializing driver\n");
 
 	list_for_each_entry(adapter, &adapter_list, list)
 		if (adapter->active_ports) {
@@ -3020,8 +3017,7 @@ static void ehea_rereg_mrs(void)
 			/* Unregister old memory region */
 			ret = ehea_rem_mr(&adapter->mr);
 			if (ret) {
-				ehea_error("unregister MR failed - driver"
-					   " inoperable!");
+				pr_err("unregister MR failed - driver inoperable!\n");
 				goto out;
 			}
 		}
@@ -3033,8 +3029,7 @@ static void ehea_rereg_mrs(void)
 			/* Register new memory region */
 			ret = ehea_reg_kernel_mr(adapter, &adapter->mr);
 			if (ret) {
-				ehea_error("register MR failed - driver"
-					   " inoperable!");
+				pr_err("register MR failed - driver inoperable!\n");
 				goto out;
 			}
 
@@ -3057,7 +3052,7 @@ static void ehea_rereg_mrs(void)
 				}
 			}
 		}
-	ehea_info("re-initializing driver complete");
+	pr_info("re-initializing driver complete\n");
 out:
 	return;
 }
@@ -3110,7 +3105,7 @@ int ehea_get_jumboframe_status(struct ehea_port *port, int *jumbo)
 	/* (Try to) enable *jumbo frames */
 	cb4 = (void *)get_zeroed_page(GFP_KERNEL);
 	if (!cb4) {
-		ehea_error("no mem for cb4");
+		pr_err("no mem for cb4\n");
 		ret = -ENOMEM;
 		goto out;
 	} else {
@@ -3172,13 +3167,13 @@ static struct device *ehea_register_port(struct ehea_port *port,
 
 	ret = of_device_register(&port->ofdev);
 	if (ret) {
-		ehea_error("failed to register device. ret=%d", ret);
+		pr_err("failed to register device. ret=%d\n", ret);
 		goto out;
 	}
 
 	ret = device_create_file(&port->ofdev.dev, &dev_attr_log_port_id);
 	if (ret) {
-		ehea_error("failed to register attributes, ret=%d", ret);
+		pr_err("failed to register attributes, ret=%d\n", ret);
 		goto out_unreg_of_dev;
 	}
 
@@ -3228,7 +3223,7 @@ struct ehea_port *ehea_setup_single_port(struct ehea_adapter *adapter,
 	dev = alloc_etherdev(sizeof(struct ehea_port));
 
 	if (!dev) {
-		ehea_error("no mem for net_device");
+		pr_err("no mem for net_device\n");
 		ret = -ENOMEM;
 		goto out_err;
 	}
@@ -3282,7 +3277,7 @@ struct ehea_port *ehea_setup_single_port(struct ehea_adapter *adapter,
 
 	ret = register_netdev(dev);
 	if (ret) {
-		ehea_error("register_netdev failed. ret=%d", ret);
+		pr_err("register_netdev failed. ret=%d\n", ret);
 		goto out_unreg_port;
 	}
 
@@ -3290,11 +3285,10 @@ struct ehea_port *ehea_setup_single_port(struct ehea_adapter *adapter,
 
 	ret = ehea_get_jumboframe_status(port, &jumbo);
 	if (ret)
-		ehea_error("failed determining jumbo frame status for %s",
-			   port->netdev->name);
+		netdev_err(dev, "failed determining jumbo frame status\n");
 
-	ehea_info("%s: Jumbo frames are %sabled", dev->name,
-		  jumbo == 1 ? "en" : "dis");
+	netdev_info(dev, "Jumbo frames are %sabled\n",
+		    jumbo == 1 ? "en" : "dis");
 
 	adapter->active_ports++;
 
@@ -3310,8 +3304,8 @@ out_free_ethdev:
 	free_netdev(dev);
 
 out_err:
-	ehea_error("setting up logical port with id=%d failed, ret=%d",
-		   logical_port_id, ret);
+	pr_err("setting up logical port with id=%d failed, ret=%d\n",
+	       logical_port_id, ret);
 	return NULL;
 }
 
@@ -3341,13 +3335,13 @@ static int ehea_setup_ports(struct ehea_adapter *adapter)
 		dn_log_port_id = of_get_property(eth_dn, "ibm,hea-port-no",
 						 NULL);
 		if (!dn_log_port_id) {
-			ehea_error("bad device node: eth_dn name=%s",
-				   eth_dn->full_name);
+			pr_err("bad device node: eth_dn name=%s\n",
+			       eth_dn->full_name);
 			continue;
 		}
 
 		if (ehea_add_adapter_mr(adapter)) {
-			ehea_error("creating MR failed");
+			pr_err("creating MR failed\n");
 			of_node_put(eth_dn);
 			return -EIO;
 		}
@@ -3356,9 +3350,8 @@ static int ehea_setup_ports(struct ehea_adapter *adapter)
 							  *dn_log_port_id,
 							  eth_dn);
 		if (adapter->port[i])
-			ehea_info("%s -> logical port id #%d",
-				  adapter->port[i]->netdev->name,
-				  *dn_log_port_id);
+			netdev_info(adapter->port[i]->netdev,
+				    "logical port id #%d\n", *dn_log_port_id);
 		else
 			ehea_remove_adapter_mr(adapter);
 
@@ -3403,21 +3396,20 @@ static ssize_t ehea_probe_port(struct device *dev,
 	port = ehea_get_port(adapter, logical_port_id);
 
 	if (port) {
-		ehea_info("adding port with logical port id=%d failed. port "
-			  "already configured as %s.", logical_port_id,
-			  port->netdev->name);
+		netdev_info(port->netdev, "adding port with logical port id=%d failed: port already configured\n",
+			    logical_port_id);
 		return -EINVAL;
 	}
 
 	eth_dn = ehea_get_eth_dn(adapter, logical_port_id);
 
 	if (!eth_dn) {
-		ehea_info("no logical port with id %d found", logical_port_id);
+		pr_info("no logical port with id %d found\n", logical_port_id);
 		return -EINVAL;
 	}
 
 	if (ehea_add_adapter_mr(adapter)) {
-		ehea_error("creating MR failed");
+		pr_err("creating MR failed\n");
 		return -EIO;
 	}
 
@@ -3432,8 +3424,8 @@ static ssize_t ehea_probe_port(struct device *dev,
 				break;
 			}
 
-		ehea_info("added %s (logical port id=%d)", port->netdev->name,
-			  logical_port_id);
+		netdev_info(port->netdev, "added: (logical port id=%d)\n",
+			    logical_port_id);
 	} else {
 		ehea_remove_adapter_mr(adapter);
 		return -EIO;
@@ -3456,8 +3448,8 @@ static ssize_t ehea_remove_port(struct device *dev,
 	port = ehea_get_port(adapter, logical_port_id);
 
 	if (port) {
-		ehea_info("removed %s (logical port id=%d)", port->netdev->name,
-			  logical_port_id);
+		netdev_info(port->netdev, "removed: (logical port id=%d)\n",
+			    logical_port_id);
 
 		ehea_shutdown_single_port(port);
 
@@ -3467,8 +3459,8 @@ static ssize_t ehea_remove_port(struct device *dev,
 				break;
 			}
 	} else {
-		ehea_error("removing port with logical port id=%d failed. port "
-			   "not configured.", logical_port_id);
+		pr_err("removing port with logical port id=%d failed. port not configured.\n",
+		       logical_port_id);
 		return -EINVAL;
 	}
 
@@ -3505,7 +3497,7 @@ static int __devinit ehea_probe_adapter(struct platform_device *dev,
 	int ret;
 
 	if (!dev || !dev->dev.of_node) {
-		ehea_error("Invalid ibmebus device probed");
+		pr_err("Invalid ibmebus device probed\n");
 		return -EINVAL;
 	}
 
@@ -3651,17 +3643,17 @@ static int ehea_mem_notifier(struct notifier_block *nb,
 
 	switch (action) {
 	case MEM_CANCEL_OFFLINE:
-		ehea_info("memory offlining canceled");
+		pr_info("memory offlining canceled");
 		/* Readd canceled memory block */
 	case MEM_ONLINE:
-		ehea_info("memory is going online");
+		pr_info("memory is going online");
 		set_bit(__EHEA_STOP_XFER, &ehea_driver_flags);
 		if (ehea_add_sect_bmap(arg->start_pfn, arg->nr_pages))
 			goto out_unlock;
 		ehea_rereg_mrs();
 		break;
 	case MEM_GOING_OFFLINE:
-		ehea_info("memory is going offline");
+		pr_info("memory is going offline");
 		set_bit(__EHEA_STOP_XFER, &ehea_driver_flags);
 		if (ehea_rem_sect_bmap(arg->start_pfn, arg->nr_pages))
 			goto out_unlock;
@@ -3687,7 +3679,7 @@ static int ehea_reboot_notifier(struct notifier_block *nb,
 				unsigned long action, void *unused)
 {
 	if (action == SYS_RESTART) {
-		ehea_info("Reboot: freeing all eHEA resources");
+		pr_info("Reboot: freeing all eHEA resources\n");
 		ibmebus_unregister_driver(&ehea_driver);
 	}
 	return NOTIFY_DONE;
@@ -3703,22 +3695,22 @@ static int check_module_parm(void)
 
 	if ((rq1_entries < EHEA_MIN_ENTRIES_QP) ||
 	    (rq1_entries > EHEA_MAX_ENTRIES_RQ1)) {
-		ehea_info("Bad parameter: rq1_entries");
+		pr_info("Bad parameter: rq1_entries\n");
 		ret = -EINVAL;
 	}
 	if ((rq2_entries < EHEA_MIN_ENTRIES_QP) ||
 	    (rq2_entries > EHEA_MAX_ENTRIES_RQ2)) {
-		ehea_info("Bad parameter: rq2_entries");
+		pr_info("Bad parameter: rq2_entries\n");
 		ret = -EINVAL;
 	}
 	if ((rq3_entries < EHEA_MIN_ENTRIES_QP) ||
 	    (rq3_entries > EHEA_MAX_ENTRIES_RQ3)) {
-		ehea_info("Bad parameter: rq3_entries");
+		pr_info("Bad parameter: rq3_entries\n");
 		ret = -EINVAL;
 	}
 	if ((sq_entries < EHEA_MIN_ENTRIES_QP) ||
 	    (sq_entries > EHEA_MAX_ENTRIES_SQ)) {
-		ehea_info("Bad parameter: sq_entries");
+		pr_info("Bad parameter: sq_entries\n");
 		ret = -EINVAL;
 	}
 
@@ -3738,8 +3730,7 @@ int __init ehea_module_init(void)
 {
 	int ret;
 
-	printk(KERN_INFO "IBM eHEA ethernet device driver (Release %s)\n",
-	       DRV_VERSION);
+	pr_info("IBM eHEA ethernet device driver (Release %s)\n", DRV_VERSION);
 
 	memset(&ehea_fw_handles, 0, sizeof(ehea_fw_handles));
 	memset(&ehea_bcmc_regs, 0, sizeof(ehea_bcmc_regs));
@@ -3757,27 +3748,27 @@ int __init ehea_module_init(void)
 
 	ret = register_reboot_notifier(&ehea_reboot_nb);
 	if (ret)
-		ehea_info("failed registering reboot notifier");
+		pr_info("failed registering reboot notifier\n");
 
 	ret = register_memory_notifier(&ehea_mem_nb);
 	if (ret)
-		ehea_info("failed registering memory remove notifier");
+		pr_info("failed registering memory remove notifier\n");
 
 	ret = crash_shutdown_register(ehea_crash_handler);
 	if (ret)
-		ehea_info("failed registering crash handler");
+		pr_info("failed registering crash handler\n");
 
 	ret = ibmebus_register_driver(&ehea_driver);
 	if (ret) {
-		ehea_error("failed registering eHEA device driver on ebus");
+		pr_err("failed registering eHEA device driver on ebus\n");
 		goto out2;
 	}
 
 	ret = driver_create_file(&ehea_driver.driver,
 				 &driver_attr_capabilities);
 	if (ret) {
-		ehea_error("failed to register capabilities attribute, ret=%d",
-			   ret);
+		pr_err("failed to register capabilities attribute, ret=%d\n",
+		       ret);
 		goto out3;
 	}
 
@@ -3802,7 +3793,7 @@ static void __exit ehea_module_exit(void)
 	unregister_reboot_notifier(&ehea_reboot_nb);
 	ret = crash_shutdown_unregister(ehea_crash_handler);
 	if (ret)
-		ehea_info("failed unregistering crash handler");
+		pr_info("failed unregistering crash handler\n");
 	unregister_memory_notifier(&ehea_mem_nb);
 	kfree(ehea_fw_handles.arr);
 	kfree(ehea_bcmc_regs.arr);
diff --git a/drivers/net/ehea/ehea_phyp.c b/drivers/net/ehea/ehea_phyp.c
index 8fe9dca..0506967 100644
--- a/drivers/net/ehea/ehea_phyp.c
+++ b/drivers/net/ehea/ehea_phyp.c
@@ -26,6 +26,8 @@
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include "ehea_phyp.h"
 
 
@@ -67,12 +69,11 @@ static long ehea_plpar_hcall_norets(unsigned long opcode,
 		}
 
 		if (ret < H_SUCCESS)
-			ehea_error("opcode=%lx ret=%lx"
-				   " arg1=%lx arg2=%lx arg3=%lx arg4=%lx"
-				   " arg5=%lx arg6=%lx arg7=%lx ",
-				   opcode, ret,
-				   arg1, arg2, arg3, arg4, arg5,
-				   arg6, arg7);
+			pr_err("opcode=%lx ret=%lx"
+			       " arg1=%lx arg2=%lx arg3=%lx arg4=%lx"
+			       " arg5=%lx arg6=%lx arg7=%lx\n",
+			       opcode, ret,
+			       arg1, arg2, arg3, arg4, arg5, arg6, arg7);
 
 		return ret;
 	}
@@ -114,19 +115,18 @@ static long ehea_plpar_hcall9(unsigned long opcode,
 		    && (((cb_cat == H_PORT_CB4) && ((arg3 == H_PORT_CB4_JUMBO)
 		    || (arg3 == H_PORT_CB4_SPEED))) || ((cb_cat == H_PORT_CB7)
 		    && (arg3 == H_PORT_CB7_DUCQPN)))))
-			ehea_error("opcode=%lx ret=%lx"
-				   " arg1=%lx arg2=%lx arg3=%lx arg4=%lx"
-				   " arg5=%lx arg6=%lx arg7=%lx arg8=%lx"
-				   " arg9=%lx"
-				   " out1=%lx out2=%lx out3=%lx out4=%lx"
-				   " out5=%lx out6=%lx out7=%lx out8=%lx"
-				   " out9=%lx",
-				   opcode, ret,
-				   arg1, arg2, arg3, arg4, arg5,
-				   arg6, arg7, arg8, arg9,
-				   outs[0], outs[1], outs[2], outs[3],
-				   outs[4], outs[5], outs[6], outs[7],
-				   outs[8]);
+			pr_err("opcode=%lx ret=%lx"
+			       " arg1=%lx arg2=%lx arg3=%lx arg4=%lx"
+			       " arg5=%lx arg6=%lx arg7=%lx arg8=%lx"
+			       " arg9=%lx"
+			       " out1=%lx out2=%lx out3=%lx out4=%lx"
+			       " out5=%lx out6=%lx out7=%lx out8=%lx"
+			       " out9=%lx\n",
+			       opcode, ret,
+			       arg1, arg2, arg3, arg4, arg5,
+			       arg6, arg7, arg8, arg9,
+			       outs[0], outs[1], outs[2], outs[3], outs[4],
+			       outs[5], outs[6], outs[7], outs[8]);
 		return ret;
 	}
 
@@ -515,7 +515,7 @@ u64 ehea_h_register_rpage_mr(const u64 adapter_handle, const u64 mr_handle,
 			     const u64 log_pageaddr, const u64 count)
 {
 	if ((count > 1) && (log_pageaddr & ~PAGE_MASK)) {
-		ehea_error("not on pageboundary");
+		pr_err("not on pageboundary\n");
 		return H_PARAMETER;
 	}
 
diff --git a/drivers/net/ehea/ehea_qmr.c b/drivers/net/ehea/ehea_qmr.c
index 89128b6..cd44bb8 100644
--- a/drivers/net/ehea/ehea_qmr.c
+++ b/drivers/net/ehea/ehea_qmr.c
@@ -26,6 +26,8 @@
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include "ehea.h"
@@ -45,7 +47,7 @@ static void *hw_qpageit_get_inc(struct hw_queue *queue)
 		queue->current_q_offset -= queue->pagesize;
 		retvalue = NULL;
 	} else if (((u64) retvalue) & (EHEA_PAGESIZE-1)) {
-		ehea_error("not on pageboundary");
+		pr_err("not on pageboundary\n");
 		retvalue = NULL;
 	}
 	return retvalue;
@@ -58,15 +60,15 @@ static int hw_queue_ctor(struct hw_queue *queue, const u32 nr_of_pages,
 	int i, k;
 
 	if ((pagesize > PAGE_SIZE) || (!pages_per_kpage)) {
-		ehea_error("pagesize conflict! kernel pagesize=%d, "
-			   "ehea pagesize=%d", (int)PAGE_SIZE, (int)pagesize);
+		pr_err("pagesize conflict! kernel pagesize=%d, ehea pagesize=%d\n",
+		       (int)PAGE_SIZE, (int)pagesize);
 		return -EINVAL;
 	}
 
 	queue->queue_length = nr_of_pages * pagesize;
 	queue->queue_pages = kmalloc(nr_of_pages * sizeof(void *), GFP_KERNEL);
 	if (!queue->queue_pages) {
-		ehea_error("no mem for queue_pages");
+		pr_err("no mem for queue_pages\n");
 		return -ENOMEM;
 	}
 
@@ -130,7 +132,7 @@ struct ehea_cq *ehea_create_cq(struct ehea_adapter *adapter,
 
 	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
 	if (!cq) {
-		ehea_error("no mem for cq");
+		pr_err("no mem for cq\n");
 		goto out_nomem;
 	}
 
@@ -147,7 +149,7 @@ struct ehea_cq *ehea_create_cq(struct ehea_adapter *adapter,
 	hret = ehea_h_alloc_resource_cq(adapter->handle, &cq->attr,
 					&cq->fw_handle, &cq->epas);
 	if (hret != H_SUCCESS) {
-		ehea_error("alloc_resource_cq failed");
+		pr_err("alloc_resource_cq failed\n");
 		goto out_freemem;
 	}
 
@@ -159,7 +161,7 @@ struct ehea_cq *ehea_create_cq(struct ehea_adapter *adapter,
 	for (counter = 0; counter < cq->attr.nr_pages; counter++) {
 		vpage = hw_qpageit_get_inc(&cq->hw_queue);
 		if (!vpage) {
-			ehea_error("hw_qpageit_get_inc failed");
+			pr_err("hw_qpageit_get_inc failed\n");
 			goto out_kill_hwq;
 		}
 
@@ -168,9 +170,8 @@ struct ehea_cq *ehea_create_cq(struct ehea_adapter *adapter,
 					     0, EHEA_CQ_REGISTER_ORIG,
 					     cq->fw_handle, rpage, 1);
 		if (hret < H_SUCCESS) {
-			ehea_error("register_rpage_cq failed ehea_cq=%p "
-				   "hret=%llx counter=%i act_pages=%i",
-				   cq, hret, counter, cq->attr.nr_pages);
+			pr_err("register_rpage_cq failed ehea_cq=%p hret=%llx counter=%i act_pages=%i\n",
+			       cq, hret, counter, cq->attr.nr_pages);
 			goto out_kill_hwq;
 		}
 
@@ -178,14 +179,14 @@ struct ehea_cq *ehea_create_cq(struct ehea_adapter *adapter,
 			vpage = hw_qpageit_get_inc(&cq->hw_queue);
 
 			if ((hret != H_SUCCESS) || (vpage)) {
-				ehea_error("registration of pages not "
-					   "complete hret=%llx\n", hret);
+				pr_err("registration of pages not complete hret=%llx\n",
+				       hret);
 				goto out_kill_hwq;
 			}
 		} else {
 			if (hret != H_PAGE_REGISTERED) {
-				ehea_error("CQ: registration of page failed "
-					   "hret=%llx\n", hret);
+				pr_err("CQ: registration of page failed hret=%llx\n",
+				       hret);
 				goto out_kill_hwq;
 			}
 		}
@@ -241,7 +242,7 @@ int ehea_destroy_cq(struct ehea_cq *cq)
 	}
 
 	if (hret != H_SUCCESS) {
-		ehea_error("destroy CQ failed");
+		pr_err("destroy CQ failed\n");
 		return -EIO;
 	}
 
@@ -259,7 +260,7 @@ struct ehea_eq *ehea_create_eq(struct ehea_adapter *adapter,
 
 	eq = kzalloc(sizeof(*eq), GFP_KERNEL);
 	if (!eq) {
-		ehea_error("no mem for eq");
+		pr_err("no mem for eq\n");
 		return NULL;
 	}
 
@@ -272,21 +273,21 @@ struct ehea_eq *ehea_create_eq(struct ehea_adapter *adapter,
 	hret = ehea_h_alloc_resource_eq(adapter->handle,
 					&eq->attr, &eq->fw_handle);
 	if (hret != H_SUCCESS) {
-		ehea_error("alloc_resource_eq failed");
+		pr_err("alloc_resource_eq failed\n");
 		goto out_freemem;
 	}
 
 	ret = hw_queue_ctor(&eq->hw_queue, eq->attr.nr_pages,
 			    EHEA_PAGESIZE, sizeof(struct ehea_eqe));
 	if (ret) {
-		ehea_error("can't allocate eq pages");
+		pr_err("can't allocate eq pages\n");
 		goto out_freeres;
 	}
 
 	for (i = 0; i < eq->attr.nr_pages; i++) {
 		vpage = hw_qpageit_get_inc(&eq->hw_queue);
 		if (!vpage) {
-			ehea_error("hw_qpageit_get_inc failed");
+			pr_err("hw_qpageit_get_inc failed\n");
 			hret = H_RESOURCE;
 			goto out_kill_hwq;
 		}
@@ -370,7 +371,7 @@ int ehea_destroy_eq(struct ehea_eq *eq)
 	}
 
 	if (hret != H_SUCCESS) {
-		ehea_error("destroy EQ failed");
+		pr_err("destroy EQ failed\n");
 		return -EIO;
 	}
 
@@ -395,7 +396,7 @@ int ehea_qp_alloc_register(struct ehea_qp *qp, struct hw_queue *hw_queue,
 	for (cnt = 0; cnt < nr_pages; cnt++) {
 		vpage = hw_qpageit_get_inc(hw_queue);
 		if (!vpage) {
-			ehea_error("hw_qpageit_get_inc failed");
+			pr_err("hw_qpageit_get_inc failed\n");
 			goto out_kill_hwq;
 		}
 		rpage = virt_to_abs(vpage);
@@ -403,7 +404,7 @@ int ehea_qp_alloc_register(struct ehea_qp *qp, struct hw_queue *hw_queue,
 					     0, h_call_q_selector,
 					     qp->fw_handle, rpage, 1);
 		if (hret < H_SUCCESS) {
-			ehea_error("register_rpage_qp failed");
+			pr_err("register_rpage_qp failed\n");
 			goto out_kill_hwq;
 		}
 	}
@@ -432,7 +433,7 @@ struct ehea_qp *ehea_create_qp(struct ehea_adapter *adapter,
 
 	qp = kzalloc(sizeof(*qp), GFP_KERNEL);
 	if (!qp) {
-		ehea_error("no mem for qp");
+		pr_err("no mem for qp\n");
 		return NULL;
 	}
 
@@ -441,7 +442,7 @@ struct ehea_qp *ehea_create_qp(struct ehea_adapter *adapter,
 	hret = ehea_h_alloc_resource_qp(adapter->handle, init_attr, pd,
 					&qp->fw_handle, &qp->epas);
 	if (hret != H_SUCCESS) {
-		ehea_error("ehea_h_alloc_resource_qp failed");
+		pr_err("ehea_h_alloc_resource_qp failed\n");
 		goto out_freemem;
 	}
 
@@ -455,7 +456,7 @@ struct ehea_qp *ehea_create_qp(struct ehea_adapter *adapter,
 				     init_attr->act_wqe_size_enc_sq, adapter,
 				     0);
 	if (ret) {
-		ehea_error("can't register for sq ret=%x", ret);
+		pr_err("can't register for sq ret=%x\n", ret);
 		goto out_freeres;
 	}
 
@@ -465,7 +466,7 @@ struct ehea_qp *ehea_create_qp(struct ehea_adapter *adapter,
 				     init_attr->act_wqe_size_enc_rq1,
 				     adapter, 1);
 	if (ret) {
-		ehea_error("can't register for rq1 ret=%x", ret);
+		pr_err("can't register for rq1 ret=%x\n", ret);
 		goto out_kill_hwsq;
 	}
 
@@ -476,7 +477,7 @@ struct ehea_qp *ehea_create_qp(struct ehea_adapter *adapter,
 					     init_attr->act_wqe_size_enc_rq2,
 					     adapter, 2);
 		if (ret) {
-			ehea_error("can't register for rq2 ret=%x", ret);
+			pr_err("can't register for rq2 ret=%x\n", ret);
 			goto out_kill_hwr1q;
 		}
 	}
@@ -488,7 +489,7 @@ struct ehea_qp *ehea_create_qp(struct ehea_adapter *adapter,
 					     init_attr->act_wqe_size_enc_rq3,
 					     adapter, 3);
 		if (ret) {
-			ehea_error("can't register for rq3 ret=%x", ret);
+			pr_err("can't register for rq3 ret=%x\n", ret);
 			goto out_kill_hwr2q;
 		}
 	}
@@ -553,7 +554,7 @@ int ehea_destroy_qp(struct ehea_qp *qp)
 	}
 
 	if (hret != H_SUCCESS) {
-		ehea_error("destroy QP failed");
+		pr_err("destroy QP failed\n");
 		return -EIO;
 	}
 
@@ -842,7 +843,7 @@ static u64 ehea_reg_mr_section(int top, int dir, int idx, u64 *pt,
 		    (hret != H_PAGE_REGISTERED)) {
 			ehea_h_free_resource(adapter->handle, mr->handle,
 					     FORCE_FREE);
-			ehea_error("register_rpage_mr failed");
+			pr_err("register_rpage_mr failed\n");
 			return hret;
 		}
 	}
@@ -896,7 +897,7 @@ int ehea_reg_kernel_mr(struct ehea_adapter *adapter, struct ehea_mr *mr)
 
 	pt = (void *)get_zeroed_page(GFP_KERNEL);
 	if (!pt) {
-		ehea_error("no mem");
+		pr_err("no mem\n");
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -906,14 +907,14 @@ int ehea_reg_kernel_mr(struct ehea_adapter *adapter, struct ehea_mr *mr)
 					&mr->handle, &mr->lkey);
 
 	if (hret != H_SUCCESS) {
-		ehea_error("alloc_resource_mr failed");
+		pr_err("alloc_resource_mr failed\n");
 		ret = -EIO;
 		goto out;
 	}
 
 	if (!ehea_bmap) {
 		ehea_h_free_resource(adapter->handle, mr->handle, FORCE_FREE);
-		ehea_error("no busmap available");
+		pr_err("no busmap available\n");
 		ret = -EIO;
 		goto out;
 	}
@@ -929,7 +930,7 @@ int ehea_reg_kernel_mr(struct ehea_adapter *adapter, struct ehea_mr *mr)
 
 	if (hret != H_SUCCESS) {
 		ehea_h_free_resource(adapter->handle, mr->handle, FORCE_FREE);
-		ehea_error("registering mr failed");
+		pr_err("registering mr failed\n");
 		ret = -EIO;
 		goto out;
 	}
@@ -952,7 +953,7 @@ int ehea_rem_mr(struct ehea_mr *mr)
 	hret = ehea_h_free_resource(mr->adapter->handle, mr->handle,
 				    FORCE_FREE);
 	if (hret != H_SUCCESS) {
-		ehea_error("destroy MR failed");
+		pr_err("destroy MR failed\n");
 		return -EIO;
 	}
 
@@ -987,14 +988,14 @@ void print_error_data(u64 *data)
 		length = EHEA_PAGESIZE;
 
 	if (type == EHEA_AER_RESTYPE_QP)
-		ehea_error("QP (resource=%llX) state: AER=0x%llX, AERR=0x%llX, "
-			   "port=%llX", resource, data[6], data[12], data[22]);
+		pr_err("QP (resource=%llX) state: AER=0x%llX, AERR=0x%llX, port=%llX\n",
+		       resource, data[6], data[12], data[22]);
 	else if (type == EHEA_AER_RESTYPE_CQ)
-		ehea_error("CQ (resource=%llX) state: AER=0x%llX", resource,
-			   data[6]);
+		pr_err("CQ (resource=%llX) state: AER=0x%llX\n",
+		       resource, data[6]);
 	else if (type == EHEA_AER_RESTYPE_EQ)
-		ehea_error("EQ (resource=%llX) state: AER=0x%llX", resource,
-			   data[6]);
+		pr_err("EQ (resource=%llX) state: AER=0x%llX\n",
+		       resource, data[6]);
 
 	ehea_dump(data, length, "error data");
 }
@@ -1008,7 +1009,7 @@ u64 ehea_error_data(struct ehea_adapter *adapter, u64 res_handle,
 
 	rblock = (void *)get_zeroed_page(GFP_KERNEL);
 	if (!rblock) {
-		ehea_error("Cannot allocate rblock memory.");
+		pr_err("Cannot allocate rblock memory\n");
 		goto out;
 	}
 
@@ -1020,9 +1021,9 @@ u64 ehea_error_data(struct ehea_adapter *adapter, u64 res_handle,
 		*aerr = rblock[12];
 		print_error_data(rblock);
 	} else if (ret == H_R_STATE) {
-		ehea_error("No error data available: %llX.", res_handle);
+		pr_err("No error data available: %llX\n", res_handle);
 	} else
-		ehea_error("Error data could not be fetched: %llX", res_handle);
+		pr_err("Error data could not be fetched: %llX\n", res_handle);
 
 	free_page((unsigned long)rblock);
 out:
-- 
1.7.3.3

^ permalink raw reply related

* Re: [PATCH] phy: add the IC+ IP1001 driver
From: Peppe CAVALLARO @ 2010-12-13 13:34 UTC (permalink / raw)
  To: Andy Fleming; +Cc: netdev@vger.kernel.org
In-Reply-To: <AANLkTimd4GFzaC9m2Wg4h=ey33gr3shj+v5RQJbOG1Qz@mail.gmail.com>

On 12/11/2010 12:48 AM, Andy Fleming wrote:
>
> On Thu, Dec 9, 2010 at 3:05 AM, Peppe CAVALLARO
> <peppe.cavallaro@st.com> wrote:
> > This patch adds the IC+ IP1001 (Gigabit Ethernet Transceiver) driver.
> > I've had to add an additional delay (2ns) to adjust RX clock phase at
> > GMII/ RGMII interface (according to the PHY data-sheet). This helps to
> > have the RGMII working on some ST platforms.
> >
> > Signed-off-by: Giuseppe Cavallaro <peppe.cavallaro@st.com>
> > ---
> >  drivers/net/phy/Kconfig  |    2 +-
> >  drivers/net/phy/icplus.c |   59
> ++++++++++++++++++++++++++++++++++++++++++----
> >  2 files changed, 55 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
> > index cb3d13e..35fda5a 100644
> > --- a/drivers/net/phy/Kconfig
> > +++ b/drivers/net/phy/Kconfig
> > @@ -64,7 +64,7 @@ config BCM63XX_PHY
> >  config ICPLUS_PHY
> >        tristate "Drivers for ICPlus PHYs"
> >        ---help---
> > -         Currently supports the IP175C PHY.
> > +         Currently supports the IP175C and IP1001 PHYs.
> >
> >  config REALTEK_PHY
> >        tristate "Drivers for Realtek PHYs"
> > diff --git a/drivers/net/phy/icplus.c b/drivers/net/phy/icplus.c
> > index c1d2d25..9a09e24 100644
> > --- a/drivers/net/phy/icplus.c
> > +++ b/drivers/net/phy/icplus.c
> > @@ -30,7 +30,7 @@
> >  #include <asm/irq.h>
> >  #include <asm/uaccess.h>
> >
> > -MODULE_DESCRIPTION("ICPlus IP175C PHY driver");
> > +MODULE_DESCRIPTION("ICPlus IP175C/IC1001 PHY drivers");
> >  MODULE_AUTHOR("Michael Barkowski");
> >  MODULE_LICENSE("GPL");
> >
> > @@ -89,6 +89,33 @@ static int ip175c_config_init(struct phy_device
> *phydev)
> >        return 0;
> >  }
> >
> > +static int ip1001_config_init(struct phy_device *phydev)
> > +{
> > +       int err, value;
> > +
> > +       /* Software Reset PHY */
> > +       value = phy_read(phydev, MII_BMCR);
> > +       value |= BMCR_RESET;
> > +       err = phy_write(phydev, MII_BMCR, value);
> > +       if (err < 0)
> > +               return err;
> > +
> > +       do {
> > +               value = phy_read(phydev, MII_BMCR);
> > +       } while (value & BMCR_RESET);
> > +
> > +       /* Additional delay (2ns) used to adjust RX clock phase
> > +        * at GMII/ RGMII interface */
> > +       value = phy_read(phydev, 16);
> > +       value |= 0x3;
>
>
> Two things:
> 1) Is this generic for all instances of this PHY, or only for the
> particular board you're using?  Frequently these sorts of clock
> adjustments are done to deal with skew on the lines between the PHY
> and the NIC.  If this is a board-specific change, consider using a
> board fixup or passing a flag from the NIC.
>

After a call with the Hw validation,it seems that we
should only add this for the RGMII mode (not necessary for
GMII mode). From the data-sheet I cannot catch this detail.
>From the IC1001 data-sheer IIUC, it's to add input delay to
the GTX_CLK (2ns for GMII/RGMII and 4ns for MII).

I've tested, with this delay, both GMII and RGMII modes
without any issues.

Hmm, I could re-create the patch and only add the IP1001.
Timing adjustment, through R16 setting, could be in another patch.
Let me know.

> 2) Can we have names for register 16 and value 0x3?
>

Yes, we can.

Regards,
Peppe

>
> Andy
>

^ permalink raw reply

* [*v2 PATCH 19/22] IPVS: netns, trash handling
From: Hans Schillstrom @ 2010-12-13 13:38 UTC (permalink / raw)
  To: horms, ja, daniel.lezcano, wensong, lvs-devel, netdev,
	netfilter-devel
  Cc: hans, Hans Schillstrom
In-Reply-To: <1292247510-753-1-git-send-email-hans.schillstrom@ericsson.com>

trash list per namspace,
and reordering of some params in dst struct.

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
---
 include/net/ip_vs.h            |    4 ++--
 include/net/netns/ip_vs.h      |    3 +++
 net/netfilter/ipvs/ip_vs_ctl.c |   23 ++++++++++-------------
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 057c954..943c766 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -644,8 +644,8 @@ struct ip_vs_dest {
 	struct list_head	d_list;   /* for table with all the dests */
 
 	u16			af;		/* address family */
-	union nf_inet_addr	addr;		/* IP address of the server */
 	__be16			port;		/* port number of the server */
+	union nf_inet_addr	addr;		/* IP address of the server */
 	volatile unsigned	flags;		/* dest status flags */
 	atomic_t		conn_flags;	/* flags to copy to conn */
 	atomic_t		weight;		/* server weight */
@@ -672,8 +672,8 @@ struct ip_vs_dest {
 	/* for virtual service */
 	struct ip_vs_service	*svc;		/* service it belongs to */
 	__u16			protocol;	/* which protocol (TCP/UDP) */
-	union nf_inet_addr	vaddr;		/* virtual IP address */
 	__be16			vport;		/* virtual port number */
+	union nf_inet_addr	vaddr;		/* virtual IP address */
 	__u32			vfwmark;	/* firewall mark of service */
 };
 
diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h
index eeb1dd1..b70444d 100644
--- a/include/net/netns/ip_vs.h
+++ b/include/net/netns/ip_vs.h
@@ -82,6 +82,9 @@ struct netns_ipvs {
 	/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
 	struct mutex		ctl_mutex;
 	struct lock_class_key 	ctl_key;	/* ctl_mutex debuging */
+	/* Trash for destinations */
+	struct list_head	dest_trash;
+
 	/* sys-ctl struct */
 	struct ctl_table_header	*sysctl_hdr;
 	struct ctl_table	*sysctl_tbl;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 747bc96..418f6ba 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -253,11 +253,6 @@ static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
 
 /*
- *	Trash for destinations
- */
-static LIST_HEAD(ip_vs_dest_trash);
-
-/*
  *	FTP & NULL virtual service counters
  */
 static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
@@ -645,11 +640,12 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
 		     __be16 dport)
 {
 	struct ip_vs_dest *dest, *nxt;
+	struct netns_ipvs *ipvs = net_ipvs(svc->net);
 
 	/*
 	 * Find the destination in trash
 	 */
-	list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
+	list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
 		IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
 			      "dest->refcnt=%d\n",
 			      dest->vfwmark,
@@ -697,11 +693,12 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
  *  are expired, and the refcnt of each destination in the trash must
  *  be 1, so we simply release them here.
  */
-static void ip_vs_trash_cleanup(void)
+static void ip_vs_trash_cleanup(struct net *net)
 {
 	struct ip_vs_dest *dest, *nxt;
+	struct netns_ipvs *ipvs = net_ipvs(net);
 
-	list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
+	list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
 		list_del(&dest->n_list);
 		ip_vs_dst_reset(dest);
 		__ip_vs_unbind_svc(dest);
@@ -826,7 +823,6 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
 		pr_err("%s(): no memory.\n", __func__);
 		return -ENOMEM;
 	}
-
 	dest->af = svc->af;
 	dest->protocol = svc->protocol;
 	dest->vaddr = svc->addr;
@@ -1004,7 +1000,7 @@ static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
 			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
 			      ntohs(dest->port),
 			      atomic_read(&dest->refcnt));
-		list_add(&dest->n_list, &ip_vs_dest_trash);
+		list_add(&dest->n_list, &ipvs->dest_trash);
 		atomic_inc(&dest->refcnt);
 	}
 }
@@ -3435,6 +3431,7 @@ int __net_init __ip_vs_control_init(struct net *net)
 	for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
 		INIT_LIST_HEAD(&ipvs->rs_table[idx]);
 	}
+	INIT_LIST_HEAD(&ipvs->dest_trash);
 
 	/* procfs stats */
 	ipvs->ctl_stats = kzalloc(sizeof(struct ip_vs_stats), GFP_KERNEL);
@@ -3515,12 +3512,13 @@ static void __net_exit __ip_vs_control_cleanup(struct net *net)
 	if (!net_eq(net, &init_net))	/* netns not enabled yet */
 		return;
 
+	ip_vs_trash_cleanup(net);
 	ip_vs_kill_estimator(net, ipvs->ctl_stats);
+	cancel_rearming_delayed_work(&ipvs->defense_work);
+	cancel_work_sync(&ipvs->defense_work.work);
 	unregister_net_sysctl_table(ipvs->sysctl_hdr);
 	proc_net_remove(net, "ip_vs_stats");
 	proc_net_remove(net, "ip_vs");
-	cancel_rearming_delayed_work(&ipvs->defense_work);
-	cancel_work_sync(&ipvs->defense_work.work);
 	free_percpu(ipvs->ustats);
 	kfree(ipvs->ctl_stats);
 }
@@ -3577,7 +3575,6 @@ err:
 void ip_vs_control_cleanup(void)
 {
 	EnterFunction(2);
-	ip_vs_trash_cleanup();
 	unregister_pernet_subsys(&ipvs_control_ops);
 	ip_vs_genl_unregister();
 	nf_unregister_sockopt(&ip_vs_sockopts);
-- 
1.7.2.3


^ permalink raw reply related

* [*v2 PATCH 14/22] IPVS: netns awareness to ip_vs_sync
From: Hans Schillstrom @ 2010-12-13 13:38 UTC (permalink / raw)
  To: horms, ja, daniel.lezcano, wensong, lvs-devel, netdev,
	netfilter-devel
  Cc: hans, Hans Schillstrom
In-Reply-To: <1292247510-753-1-git-send-email-hans.schillstrom@ericsson.com>

All global variables moved to struct ipvs,
most external changes fixed (i.e. init_net removed)
in sync_buf create  + 4 replaced by sizeof(struct..)

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
---
 include/net/ip_vs.h             |   14 +-
 include/net/netns/ip_vs.h       |   16 ++
 net/netfilter/ipvs/ip_vs_core.c |   14 +-
 net/netfilter/ipvs/ip_vs_ctl.c  |   55 ++++---
 net/netfilter/ipvs/ip_vs_sync.c |  325 +++++++++++++++++++++------------------
 5 files changed, 235 insertions(+), 189 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 34f8d2f..21ba325 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -966,7 +966,7 @@ extern struct ip_vs_stats ip_vs_stats;
 extern const struct ctl_path net_vs_ctl_path[];
 extern int sysctl_ip_vs_sync_ver;
 
-extern void ip_vs_sync_switch_mode(int mode);
+extern void ip_vs_sync_switch_mode(struct net *net, int mode);
 extern struct ip_vs_service *
 ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
 		  const union nf_inet_addr *vaddr, __be16 vport);
@@ -995,14 +995,10 @@ extern struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp);
  *      IPVS sync daemon data and function prototypes
  *      (from ip_vs_sync.c)
  */
-extern volatile int ip_vs_sync_state;
-extern volatile int ip_vs_master_syncid;
-extern volatile int ip_vs_backup_syncid;
-extern char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
-extern char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
-extern int start_sync_thread(int state, char *mcast_ifn, __u8 syncid);
-extern int stop_sync_thread(int state);
-extern void ip_vs_sync_conn(struct ip_vs_conn *cp);
+extern int start_sync_thread(struct net *net, int state, char *mcast_ifn,
+			     __u8 syncid);
+extern int stop_sync_thread(struct net *net, int state);
+extern void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp);
 extern int ip_vs_sync_init(void);
 extern void ip_vs_sync_cleanup(void);
 
diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h
index 6486501..1f00447 100644
--- a/include/net/netns/ip_vs.h
+++ b/include/net/netns/ip_vs.h
@@ -74,6 +74,22 @@ struct netns_ipvs {
 	struct list_head 	est_list;	/* estimator list */
 	spinlock_t		est_lock;
 
+	/* ip_vs_sync */
+	struct list_head	sync_queue;
+	spinlock_t		sync_lock;
+	struct ip_vs_sync_buff  *sync_buff;
+	spinlock_t		sync_buff_lock;
+	struct sockaddr_in 	sync_mcast_addr;
+	struct task_struct 	*master_thread;
+	struct task_struct 	*backup_thread;
+	int 			send_mesg_maxlen;
+	int 			recv_mesg_maxlen;
+	volatile int 		sync_state;
+	volatile int 		master_syncid;
+	volatile int 		backup_syncid;
+	/* multicast interface name */
+	char 			master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
+	char 			backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
 };
 
 #endif /* IP_VS_H_ */
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index c9289b5..e77ebcb 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1463,6 +1463,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	struct ip_vs_protocol *pp;
 	struct ip_vs_conn *cp;
 	int ret, restart, pkts;
+	struct net *net;
+	struct netns_ipvs *ipvs;
 
 	/* Already marked as IPVS request or reply? */
 	if (skb->ipvs_property)
@@ -1541,7 +1543,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	}
 
 	IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
-
+	net = skb_net(skb);
+	ipvs = net_ipvs(net);
 	/* Check the server status */
 	if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
 		/* the destination server is not available */
@@ -1574,12 +1577,13 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	 *
 	 * For ONE_PKT let ip_vs_sync_conn() do the filter work.
 	 */
+
 	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
 		pkts = sysctl_ip_vs_sync_threshold[0];
 	else
 		pkts = atomic_add_return(1, &cp->in_pkts);
 
-	if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
+	if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
 	    cp->protocol == IPPROTO_SCTP) {
 		if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
 			(pkts % sysctl_ip_vs_sync_threshold[1]
@@ -1588,13 +1592,13 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 				 ((cp->state == IP_VS_SCTP_S_CLOSED) ||
 				  (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) ||
 				  (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) {
-			ip_vs_sync_conn(cp);
+			ip_vs_sync_conn(net, cp);
 			goto out;
 		}
 	}
 
 	/* Keep this block last: TCP and others with pp->num_states <= 1 */
-	else if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
+	else if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
 	    (((cp->protocol != IPPROTO_TCP ||
 	       cp->state == IP_VS_TCP_S_ESTABLISHED) &&
 	      (pkts % sysctl_ip_vs_sync_threshold[1]
@@ -1604,7 +1608,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	       (cp->state == IP_VS_TCP_S_CLOSE) ||
 	       (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
 	       (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
-		ip_vs_sync_conn(cp);
+		ip_vs_sync_conn(net, cp);
 out:
 	cp->old_state = cp->state;
 
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 36458e8..86195c2 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1558,7 +1558,8 @@ proc_do_sync_mode(ctl_table *table, int write,
 			/* Restore the correct value */
 			*valp = val;
 		} else {
-			ip_vs_sync_switch_mode(val);
+			struct net *net = current->nsproxy->net_ns;
+			ip_vs_sync_switch_mode(net, val);
 		}
 	}
 	return rc;
@@ -2173,11 +2174,12 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 		goto out_unlock;
 	} else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
 		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
-		ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
+		ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
+					dm->syncid);
 		goto out_unlock;
 	} else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
 		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
-		ret = stop_sync_thread(dm->state);
+		ret = stop_sync_thread(net, dm->state);
 		goto out_unlock;
 	}
 
@@ -2285,7 +2287,8 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
 }
 
 static inline int
-__ip_vs_get_service_entries(struct net *net, const struct ip_vs_get_services *get,
+__ip_vs_get_service_entries(struct net *net,
+			    const struct ip_vs_get_services *get,
 			    struct ip_vs_get_services __user *uptr)
 {
 	int idx, count=0;
@@ -2422,6 +2425,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 	int ret = 0;
 	unsigned int copylen;
 	struct net *net = sock_net(sk);
+	struct netns_ipvs *ipvs = net_ipvs(net);
 
 	BUG_ON(!net);
 	if (!capable(CAP_NET_ADMIN))
@@ -2543,15 +2547,17 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 		struct ip_vs_daemon_user d[2];
 
 		memset(&d, 0, sizeof(d));
-		if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
+		if (ipvs->sync_state & IP_VS_STATE_MASTER) {
 			d[0].state = IP_VS_STATE_MASTER;
-			strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
-			d[0].syncid = ip_vs_master_syncid;
+			strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
+				sizeof(d[0].mcast_ifn));
+			d[0].syncid = ipvs->master_syncid;
 		}
-		if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
+		if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
 			d[1].state = IP_VS_STATE_BACKUP;
-			strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
-			d[1].syncid = ip_vs_backup_syncid;
+			strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
+				sizeof(d[1].mcast_ifn));
+			d[1].syncid = ipvs->backup_syncid;
 		}
 		if (copy_to_user(user, &d, sizeof(d)) != 0)
 			ret = -EFAULT;
@@ -3058,20 +3064,23 @@ nla_put_failure:
 static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
 				   struct netlink_callback *cb)
 {
+	struct net *net = skb_net(skb);
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
 	mutex_lock(&__ip_vs_mutex);
-	if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
+	if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
 		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
-					   ip_vs_master_mcast_ifn,
-					   ip_vs_master_syncid, cb) < 0)
+					   ipvs->master_mcast_ifn,
+					   ipvs->master_syncid, cb) < 0)
 			goto nla_put_failure;
 
 		cb->args[0] = 1;
 	}
 
-	if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
+	if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
 		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
-					   ip_vs_backup_mcast_ifn,
-					   ip_vs_backup_syncid, cb) < 0)
+					   ipvs->backup_mcast_ifn,
+					   ipvs->backup_syncid, cb) < 0)
 			goto nla_put_failure;
 
 		cb->args[1] = 1;
@@ -3083,24 +3092,26 @@ nla_put_failure:
 	return skb->len;
 }
 
-static int ip_vs_genl_new_daemon(struct nlattr **attrs)
+static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
 {
 	if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
 	      attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
 	      attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
 		return -EINVAL;
 
-	return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
+	return start_sync_thread(net,
+				 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
 				 nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
 				 nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
 }
 
-static int ip_vs_genl_del_daemon(struct nlattr **attrs)
+static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
 {
 	if (!attrs[IPVS_DAEMON_ATTR_STATE])
 		return -EINVAL;
 
-	return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
+	return stop_sync_thread(net,
+				nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
 }
 
 static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
@@ -3156,9 +3167,9 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
 		}
 
 		if (cmd == IPVS_CMD_NEW_DAEMON)
-			ret = ip_vs_genl_new_daemon(daemon_attrs);
+			ret = ip_vs_genl_new_daemon(net, daemon_attrs);
 		else
-			ret = ip_vs_genl_del_daemon(daemon_attrs);
+			ret = ip_vs_genl_del_daemon(net, daemon_attrs);
 		goto out;
 	} else if (cmd == IPVS_CMD_ZERO &&
 		   !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 8c8bb4c..29c6bbb 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -192,6 +192,7 @@ union ip_vs_sync_conn {
 #define IPVS_OPT_F_PARAM	(1 << (IPVS_OPT_PARAM-1))
 
 struct ip_vs_sync_thread_data {
+	struct net *net;
 	struct socket *sock;
 	char *buf;
 };
@@ -259,10 +260,6 @@ struct ip_vs_sync_mesg {
 	/* ip_vs_sync_conn entries start here */
 };
 
-/* the maximum length of sync (sending/receiving) message */
-static int sync_send_mesg_maxlen;
-static int sync_recv_mesg_maxlen;
-
 struct ip_vs_sync_buff {
 	struct list_head        list;
 	unsigned long           firstuse;
@@ -273,28 +270,6 @@ struct ip_vs_sync_buff {
 	unsigned char           *end;
 };
 
-
-/* the sync_buff list head and the lock */
-static LIST_HEAD(ip_vs_sync_queue);
-static DEFINE_SPINLOCK(ip_vs_sync_lock);
-
-/* current sync_buff for accepting new conn entries */
-static struct ip_vs_sync_buff   *curr_sb = NULL;
-static DEFINE_SPINLOCK(curr_sb_lock);
-
-/* ipvs sync daemon state */
-volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
-volatile int ip_vs_master_syncid = 0;
-volatile int ip_vs_backup_syncid = 0;
-
-/* multicast interface name */
-char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
-char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
-
-/* sync daemon tasks */
-static struct task_struct *sync_master_thread;
-static struct task_struct *sync_backup_thread;
-
 /* multicast addr */
 static struct sockaddr_in mcast_addr = {
 	.sin_family		= AF_INET,
@@ -324,20 +299,20 @@ static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no)
 	put_unaligned_be32(ho->previous_delta, &no->previous_delta);
 }
 
-static inline struct ip_vs_sync_buff *sb_dequeue(void)
+static inline struct ip_vs_sync_buff *sb_dequeue(struct netns_ipvs *ipvs)
 {
 	struct ip_vs_sync_buff *sb;
 
-	spin_lock_bh(&ip_vs_sync_lock);
-	if (list_empty(&ip_vs_sync_queue)) {
+	spin_lock_bh(&ipvs->sync_lock);
+	if (list_empty(&ipvs->sync_queue)) {
 		sb = NULL;
 	} else {
-		sb = list_entry(ip_vs_sync_queue.next,
+		sb = list_entry(ipvs->sync_queue.next,
 				struct ip_vs_sync_buff,
 				list);
 		list_del(&sb->list);
 	}
-	spin_unlock_bh(&ip_vs_sync_lock);
+	spin_unlock_bh(&ipvs->sync_lock);
 
 	return sb;
 }
@@ -345,25 +320,26 @@ static inline struct ip_vs_sync_buff *sb_dequeue(void)
 /*
  * Create a new sync buffer for Version 1 proto.
  */
-static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
+static inline struct ip_vs_sync_buff *
+ip_vs_sync_buff_create(struct netns_ipvs *ipvs)
 {
 	struct ip_vs_sync_buff *sb;
 
 	if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
 		return NULL;
 
-	if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
+	if (!(sb->mesg=kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC))) {
 		kfree(sb);
 		return NULL;
 	}
 	sb->mesg->reserved = 0;  /* old nr_conns i.e. must be zeo now */
 	sb->mesg->version = SYNC_PROTO_VER;
-	sb->mesg->syncid = ip_vs_master_syncid;
+	sb->mesg->syncid = ipvs->master_syncid;
 	sb->mesg->size = sizeof(struct ip_vs_sync_mesg);
 	sb->mesg->nr_conns = 0;
 	sb->mesg->spare = 0;
 	sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg);
-	sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
+	sb->end = (unsigned char *)sb->mesg + ipvs->send_mesg_maxlen;
 
 	sb->firstuse = jiffies;
 	return sb;
@@ -375,14 +351,16 @@ static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
 	kfree(sb);
 }
 
-static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
+static inline void sb_queue_tail(struct netns_ipvs *ipvs)
 {
-	spin_lock(&ip_vs_sync_lock);
-	if (ip_vs_sync_state & IP_VS_STATE_MASTER)
-		list_add_tail(&sb->list, &ip_vs_sync_queue);
+	struct ip_vs_sync_buff *sb = ipvs->sync_buff;
+
+	spin_lock(&ipvs->sync_lock);
+	if (ipvs->sync_state & IP_VS_STATE_MASTER)
+		list_add_tail(&sb->list, &ipvs->sync_queue);
 	else
 		ip_vs_sync_buff_release(sb);
-	spin_unlock(&ip_vs_sync_lock);
+	spin_unlock(&ipvs->sync_lock);
 }
 
 /*
@@ -390,18 +368,18 @@ static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
  *	than the specified time or the specified time is zero.
  */
 static inline struct ip_vs_sync_buff *
-get_curr_sync_buff(unsigned long time)
+get_curr_sync_buff(struct netns_ipvs *ipvs, unsigned long time)
 {
 	struct ip_vs_sync_buff *sb;
 
-	spin_lock_bh(&curr_sb_lock);
-	if (curr_sb && (time == 0 ||
-			time_before(jiffies - curr_sb->firstuse, time))) {
-		sb = curr_sb;
-		curr_sb = NULL;
+	spin_lock_bh(&ipvs->sync_buff_lock);
+	if (ipvs->sync_buff && (time == 0 ||
+	    time_before(jiffies - ipvs->sync_buff->firstuse, time))) {
+		sb = ipvs->sync_buff;
+		ipvs->sync_buff = NULL;
 	} else
 		sb = NULL;
-	spin_unlock_bh(&curr_sb_lock);
+	spin_unlock_bh(&ipvs->sync_buff_lock);
 	return sb;
 }
 
@@ -409,33 +387,37 @@ get_curr_sync_buff(unsigned long time)
  * Switch mode from sending version 0 or 1
  *  - must handle sync_buf
  */
-void ip_vs_sync_switch_mode(int mode) {
+void ip_vs_sync_switch_mode(struct net *net, int mode)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
 
-	if (!ip_vs_sync_state & IP_VS_STATE_MASTER)
+	if (!ipvs->sync_state & IP_VS_STATE_MASTER)
 		return;
-	if (mode == sysctl_ip_vs_sync_ver || !curr_sb)
+	if (mode == sysctl_ip_vs_sync_ver || !ipvs->sync_buff)
 		return;
 
-	spin_lock_bh(&curr_sb_lock);
+	spin_lock_bh(&ipvs->sync_buff_lock);
 	/* Buffer empty ? then let buf_create do the job  */
-	if ( curr_sb->mesg->size <=  sizeof(struct ip_vs_sync_mesg)) {
-		kfree(curr_sb);
-		curr_sb = NULL;
+	if ( ipvs->sync_buff->mesg->size <=  sizeof(struct ip_vs_sync_mesg)) {
+		kfree(ipvs->sync_buff);
+		ipvs->sync_buff = NULL;
 	} else {
-		spin_lock_bh(&ip_vs_sync_lock);
-		if (ip_vs_sync_state & IP_VS_STATE_MASTER)
-			list_add_tail(&curr_sb->list, &ip_vs_sync_queue);
+		spin_lock_bh(&ipvs->sync_lock);
+		if (ipvs->sync_state & IP_VS_STATE_MASTER)
+			list_add_tail(&ipvs->sync_buff->list,
+				      &ipvs->sync_queue);
 		else
-			ip_vs_sync_buff_release(curr_sb);
-		spin_unlock_bh(&ip_vs_sync_lock);
+			ip_vs_sync_buff_release(ipvs->sync_buff);
+		spin_unlock_bh(&ipvs->sync_lock);
 	}
-	spin_unlock_bh(&curr_sb_lock);
+	spin_unlock_bh(&ipvs->sync_buff_lock);
 }
 
 /*
  * Create a new sync buffer for Version 0 proto.
  */
-static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create_v0(void)
+static inline struct ip_vs_sync_buff *
+ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs)
 {
 	struct ip_vs_sync_buff *sb;
 	struct ip_vs_sync_mesg_v0 *mesg;
@@ -443,16 +425,16 @@ static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create_v0(void)
 	if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
 		return NULL;
 
-	if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
+	if (!(sb->mesg=kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC))) {
 		kfree(sb);
 		return NULL;
 	}
 	mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg;
 	mesg->nr_conns = 0;
-	mesg->syncid = ip_vs_master_syncid;
-	mesg->size = 4;
-	sb->head = (unsigned char *)mesg + 4;
-	sb->end = (unsigned char *)mesg + sync_send_mesg_maxlen;
+	mesg->syncid = ipvs->master_syncid;
+	mesg->size = sizeof(struct ip_vs_sync_mesg_v0);
+	sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0);
+	sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen;
 	sb->firstuse = jiffies;
 	return sb;
 }
@@ -461,8 +443,9 @@ static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create_v0(void)
  *      Version 0 , could be switched in by sys_ctl.
  *      Add an ip_vs_conn information into the current sync_buff.
  */
-void ip_vs_sync_conn_v0(struct ip_vs_conn *cp)
+void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp)
 {
+	struct netns_ipvs *ipvs = net_ipvs(net);
 	struct ip_vs_sync_mesg_v0 *m;
 	struct ip_vs_sync_conn_v0 *s;
 	int len;
@@ -473,10 +456,12 @@ void ip_vs_sync_conn_v0(struct ip_vs_conn *cp)
 	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
 		return;
 
-	spin_lock(&curr_sb_lock);
-	if (!curr_sb) {
-		if (!(curr_sb=ip_vs_sync_buff_create_v0())) {
-			spin_unlock(&curr_sb_lock);
+	spin_lock(&ipvs->sync_buff_lock);
+	if (!ipvs->sync_buff) {
+		ipvs->sync_buff =
+			ip_vs_sync_buff_create_v0(ipvs);
+		if (!ipvs->sync_buff) {
+			spin_unlock(&ipvs->sync_buff_lock);
 			pr_err("ip_vs_sync_buff_create failed.\n");
 			return;
 		}
@@ -484,8 +469,8 @@ void ip_vs_sync_conn_v0(struct ip_vs_conn *cp)
 
 	len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
 		SIMPLE_CONN_SIZE;
-	m = (struct ip_vs_sync_mesg_v0 *)curr_sb->mesg;
-	s = (struct ip_vs_sync_conn_v0 *)curr_sb->head;
+	m = (struct ip_vs_sync_mesg_v0 *)ipvs->sync_buff->mesg;
+	s = (struct ip_vs_sync_conn_v0 *)ipvs->sync_buff->head;
 
 	/* copy members */
 	s->reserved = 0;
@@ -506,18 +491,18 @@ void ip_vs_sync_conn_v0(struct ip_vs_conn *cp)
 
 	m->nr_conns++;
 	m->size += len;
-	curr_sb->head += len;
+	ipvs->sync_buff->head += len;
 
 	/* check if there is a space for next one */
-	if (curr_sb->head + FULL_CONN_SIZE > curr_sb->end) {
-		sb_queue_tail(curr_sb);
-		curr_sb = NULL;
+	if (ipvs->sync_buff->head + FULL_CONN_SIZE > ipvs->sync_buff->end) {
+		sb_queue_tail(ipvs);
+		ipvs->sync_buff = NULL;
 	}
-	spin_unlock(&curr_sb_lock);
+	spin_unlock(&ipvs->sync_buff_lock);
 
 	/* synchronize its controller if it has */
 	if (cp->control)
-		ip_vs_sync_conn(cp->control);
+		ip_vs_sync_conn(net, cp->control);
 }
 
 /*
@@ -525,8 +510,9 @@ void ip_vs_sync_conn_v0(struct ip_vs_conn *cp)
  *      Called by ip_vs_in.
  *      Sending Version 1 messages
  */
-void ip_vs_sync_conn(struct ip_vs_conn *cp)
+void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp)
 {
+	struct netns_ipvs *ipvs = net_ipvs(net);
 	struct ip_vs_sync_mesg *m;
 	union ip_vs_sync_conn *s;
 	__u8 *p;
@@ -534,7 +520,7 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp)
 
 	/* Handle old version of the protocol */
 	if (sysctl_ip_vs_sync_ver == 0) {
-		ip_vs_sync_conn_v0(cp);
+		ip_vs_sync_conn_v0(net, cp);
 		return;
 	}
 	/* Do not sync ONE PACKET */
@@ -551,7 +537,7 @@ sloop:
 		pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN);
 	}
 
-	spin_lock(&curr_sb_lock);
+	spin_lock(&ipvs->sync_buff_lock);
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (cp->af == AF_INET6)
@@ -570,26 +556,26 @@ sloop:
 
 	/* check if there is a space for this one  */
 	pad = 0;
-	if (curr_sb) {
-		pad = (4 - (size_t)curr_sb->head) & 3;
-		if (curr_sb->head + len + pad > curr_sb->end) {
-			sb_queue_tail(curr_sb);
-			curr_sb = NULL;
+	if (ipvs->sync_buff) {
+		pad = (4 - (size_t)ipvs->sync_buff->head) & 3;
+		if (ipvs->sync_buff->head + len + pad > ipvs->sync_buff->end) {
+			sb_queue_tail(ipvs);
+			ipvs->sync_buff = NULL;
 			pad = 0;
 		}
 	}
 
-	if (!curr_sb) {
-		if (!(curr_sb=ip_vs_sync_buff_create())) {
-			spin_unlock(&curr_sb_lock);
+	if (!ipvs->sync_buff) {
+		if (!(ipvs->sync_buff=ip_vs_sync_buff_create(ipvs))) {
+			spin_unlock(&ipvs->sync_buff_lock);
 			pr_err("ip_vs_sync_buff_create failed.\n");
 			return;
 		}
 	}
 
-	m = curr_sb->mesg;
-	p = curr_sb->head;
-	curr_sb->head += pad + len;
+	m = ipvs->sync_buff->mesg;
+	p = ipvs->sync_buff->head;
+	ipvs->sync_buff->head += pad + len;
 	m->size += pad + len;
 	/* Add ev. padding from prev. sync_conn */
 	while (pad--)
@@ -647,7 +633,7 @@ sloop:
 		}
 	}
 
-	spin_unlock(&curr_sb_lock);
+	spin_unlock(&ipvs->sync_buff_lock);
 
 control:
 	/* synchronize its controller if it has */
@@ -699,7 +685,8 @@ ip_vs_conn_fill_param_sync(int af, union ip_vs_sync_conn *sc,
 			buff[pe_name_len]=0;
 			p->pe = __ip_vs_pe_getbyname(buff);
 			if (!p->pe) {
-				IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n", buff);
+				IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n",
+					     buff);
 				return 1;
 			}
 		} else {
@@ -748,7 +735,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
 		 * If it is not found the connection will remain unbound
 		 * but still handled.
 		 */
-		dest = ip_vs_find_dest(&init_net, type, daddr, dport, param->vaddr,
+		dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr,
 				       param->vport, protocol, fwmark);
 
 		/*  Set the approprite ativity flag */
@@ -1089,6 +1076,7 @@ out:
 static void ip_vs_process_message(struct net *net, __u8 *buffer,
 				  const size_t buflen)
 {
+	struct netns_ipvs *ipvs = net_ipvs(net);
 	struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer;
 	__u8 *p, *msg_end;
 	int i, nr_conns;
@@ -1105,7 +1093,7 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer,
 		return;
 	}
 	/* SyncID sanity check */
-	if (ip_vs_backup_syncid != 0 && m2->syncid != ip_vs_backup_syncid) {
+	if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) {
 		IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid);
 		return;
 	}
@@ -1189,8 +1177,9 @@ static int set_mcast_if(struct sock *sk, char *ifname)
 {
 	struct net_device *dev;
 	struct inet_sock *inet = inet_sk(sk);
+	struct net *net = sock_net(sk);
 
-	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+	if ((dev = __dev_get_by_name(net, ifname)) == NULL)
 		return -ENODEV;
 
 	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
@@ -1209,30 +1198,33 @@ static int set_mcast_if(struct sock *sk, char *ifname)
  *	Set the maximum length of sync message according to the
  *	specified interface's MTU.
  */
-static int set_sync_mesg_maxlen(int sync_state)
+static int set_sync_mesg_maxlen(struct net *net, int sync_state)
 {
+	struct netns_ipvs *ipvs = net_ipvs(net);
 	struct net_device *dev;
 	int num;
 
 	if (sync_state == IP_VS_STATE_MASTER) {
-		if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL)
+		dev = __dev_get_by_name(net, ipvs->master_mcast_ifn);
+		if (!dev)
 			return -ENODEV;
 
 		num = (dev->mtu - sizeof(struct iphdr) -
 		       sizeof(struct udphdr) -
 		       SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
-		sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
+		ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
 			SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
 		IP_VS_DBG(7, "setting the maximum length of sync sending "
-			  "message %d.\n", sync_send_mesg_maxlen);
+			  "message %d.\n", ipvs->send_mesg_maxlen);
 	} else if (sync_state == IP_VS_STATE_BACKUP) {
-		if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL)
+		dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn);
+		if (!dev)
 			return -ENODEV;
 
-		sync_recv_mesg_maxlen = dev->mtu -
+		ipvs->recv_mesg_maxlen = dev->mtu -
 			sizeof(struct iphdr) - sizeof(struct udphdr);
 		IP_VS_DBG(7, "setting the maximum length of sync receiving "
-			  "message %d.\n", sync_recv_mesg_maxlen);
+			  "message %d.\n", ipvs->recv_mesg_maxlen);
 	}
 
 	return 0;
@@ -1247,6 +1239,7 @@ static int set_sync_mesg_maxlen(int sync_state)
 static int
 join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
 {
+	struct net *net = sock_net(sk);
 	struct ip_mreqn mreq;
 	struct net_device *dev;
 	int ret;
@@ -1254,7 +1247,7 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
 	memset(&mreq, 0, sizeof(mreq));
 	memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
 
-	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+	if ((dev = __dev_get_by_name(net, ifname)) == NULL)
 		return -ENODEV;
 	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
 		return -EINVAL;
@@ -1271,11 +1264,12 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
 
 static int bind_mcastif_addr(struct socket *sock, char *ifname)
 {
+	struct net *net = sock_net(sock->sk);
 	struct net_device *dev;
 	__be32 addr;
 	struct sockaddr_in sin;
 
-	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+	if ((dev = __dev_get_by_name(net, ifname)) == NULL)
 		return -ENODEV;
 
 	addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
@@ -1297,8 +1291,9 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname)
 /*
  *      Set up sending multicast socket over UDP
  */
-static struct socket * make_send_sock(void)
+static struct socket * make_send_sock(struct net *net)
 {
+	struct netns_ipvs *ipvs = net_ipvs(net);
 	struct socket *sock;
 	int result;
 
@@ -1309,7 +1304,7 @@ static struct socket * make_send_sock(void)
 		return ERR_PTR(result);
 	}
 
-	result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn);
+	result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn);
 	if (result < 0) {
 		pr_err("Error setting outbound mcast interface\n");
 		goto error;
@@ -1318,7 +1313,7 @@ static struct socket * make_send_sock(void)
 	set_mcast_loop(sock->sk, 0);
 	set_mcast_ttl(sock->sk, 1);
 
-	result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn);
+	result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn);
 	if (result < 0) {
 		pr_err("Error binding address of the mcast interface\n");
 		goto error;
@@ -1342,8 +1337,9 @@ static struct socket * make_send_sock(void)
 /*
  *      Set up receiving multicast socket over UDP
  */
-static struct socket * make_receive_sock(void)
+static struct socket * make_receive_sock(struct net *net)
 {
+	struct netns_ipvs *ipvs = net_ipvs(net);
 	struct socket *sock;
 	int result;
 
@@ -1367,7 +1363,7 @@ static struct socket * make_receive_sock(void)
 	/* join the multicast group */
 	result = join_mcast_group(sock->sk,
 			(struct in_addr *) &mcast_addr.sin_addr,
-			ip_vs_backup_mcast_ifn);
+			ipvs->backup_mcast_ifn);
 	if (result < 0) {
 		pr_err("Error joining to the multicast group\n");
 		goto error;
@@ -1438,20 +1434,21 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
 static int sync_thread_master(void *data)
 {
 	struct ip_vs_sync_thread_data *tinfo = data;
+	struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
 	struct ip_vs_sync_buff *sb;
 
 	pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
 		"syncid = %d\n",
-		ip_vs_master_mcast_ifn, ip_vs_master_syncid);
+		ipvs->master_mcast_ifn, ipvs->master_syncid);
 
 	while (!kthread_should_stop()) {
-		while ((sb = sb_dequeue())) {
+		while ((sb = sb_dequeue(ipvs))) {
 			ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
 			ip_vs_sync_buff_release(sb);
 		}
 
-		/* check if entries stay in curr_sb for 2 seconds */
-		sb = get_curr_sync_buff(2 * HZ);
+		/* check if entries stay in ipvs->sync_buff for 2 seconds */
+		sb = get_curr_sync_buff(ipvs, 2 * HZ);
 		if (sb) {
 			ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
 			ip_vs_sync_buff_release(sb);
@@ -1461,12 +1458,12 @@ static int sync_thread_master(void *data)
 	}
 
 	/* clean up the sync_buff queue */
-	while ((sb=sb_dequeue())) {
+	while ((sb=sb_dequeue(ipvs))) {
 		ip_vs_sync_buff_release(sb);
 	}
 
 	/* clean up the current sync_buff */
-	if ((sb = get_curr_sync_buff(0))) {
+	if ((sb = get_curr_sync_buff(ipvs, 0))) {
 		ip_vs_sync_buff_release(sb);
 	}
 
@@ -1481,11 +1478,12 @@ static int sync_thread_master(void *data)
 static int sync_thread_backup(void *data)
 {
 	struct ip_vs_sync_thread_data *tinfo = data;
+	struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
 	int len;
 
 	pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
 		"syncid = %d\n",
-		ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
+		ipvs->backup_mcast_ifn, ipvs->backup_syncid);
 
 	while (!kthread_should_stop()) {
 		wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
@@ -1495,7 +1493,7 @@ static int sync_thread_backup(void *data)
 		/* do we have data now? */
 		while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
 			len = ip_vs_receive(tinfo->sock, tinfo->buf,
-					sync_recv_mesg_maxlen);
+					ipvs->recv_mesg_maxlen);
 			if (len <= 0) {
 				pr_err("receiving message error\n");
 				break;
@@ -1504,7 +1502,7 @@ static int sync_thread_backup(void *data)
 			/* disable bottom half, because it accesses the data
 			   shared by softirq while getting/creating conns */
 			local_bh_disable();
-			ip_vs_process_message(&init_net, tinfo->buf, len);
+			ip_vs_process_message(tinfo->net, tinfo->buf, len);
 			local_bh_enable();
 		}
 	}
@@ -1518,11 +1516,12 @@ static int sync_thread_backup(void *data)
 }
 
 
-int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
+int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
 {
 	struct ip_vs_sync_thread_data *tinfo;
 	struct task_struct **realtask, *task;
 	struct socket *sock;
+	struct netns_ipvs *ipvs = net_ipvs(net);
 	char *name, *buf = NULL;
 	int (*threadfn)(void *data);
 	int result = -ENOMEM;
@@ -1532,27 +1531,27 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
 		  sizeof(struct ip_vs_sync_conn_v0));
 
 	if (state == IP_VS_STATE_MASTER) {
-		if (sync_master_thread)
+		if (ipvs->master_thread)
 			return -EEXIST;
 
-		strlcpy(ip_vs_master_mcast_ifn, mcast_ifn,
-			sizeof(ip_vs_master_mcast_ifn));
-		ip_vs_master_syncid = syncid;
-		realtask = &sync_master_thread;
-		name = "ipvs_syncmaster";
+		strlcpy(ipvs->master_mcast_ifn, mcast_ifn,
+			sizeof(ipvs->master_mcast_ifn));
+		ipvs->master_syncid = syncid;
+		realtask = &ipvs->master_thread;
+		name = "ipvs_master:%d";
 		threadfn = sync_thread_master;
-		sock = make_send_sock();
+		sock = make_send_sock(net);
 	} else if (state == IP_VS_STATE_BACKUP) {
-		if (sync_backup_thread)
+		if (ipvs->backup_thread)
 			return -EEXIST;
 
-		strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn,
-			sizeof(ip_vs_backup_mcast_ifn));
-		ip_vs_backup_syncid = syncid;
-		realtask = &sync_backup_thread;
-		name = "ipvs_syncbackup";
+		strlcpy(ipvs->backup_mcast_ifn, mcast_ifn,
+			sizeof(ipvs->backup_mcast_ifn));
+		ipvs->backup_syncid = syncid;
+		realtask = &ipvs->backup_thread;
+		name = "ipvs_backup:%d";
 		threadfn = sync_thread_backup;
-		sock = make_receive_sock();
+		sock = make_receive_sock(net);
 	} else {
 		return -EINVAL;
 	}
@@ -1562,9 +1561,9 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
 		goto out;
 	}
 
-	set_sync_mesg_maxlen(state);
+	set_sync_mesg_maxlen(net, state);
 	if (state == IP_VS_STATE_BACKUP) {
-		buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL);
+		buf = kmalloc(ipvs->recv_mesg_maxlen, GFP_KERNEL);
 		if (!buf)
 			goto outsocket;
 	}
@@ -1573,10 +1572,11 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
 	if (!tinfo)
 		goto outbuf;
 
+	tinfo->net = net;
 	tinfo->sock = sock;
 	tinfo->buf = buf;
 
-	task = kthread_run(threadfn, tinfo, name);
+	task = kthread_run(threadfn, tinfo, name, ipvs->inc);
 	if (IS_ERR(task)) {
 		result = PTR_ERR(task);
 		goto outtinfo;
@@ -1584,7 +1584,7 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
 
 	/* mark as active */
 	*realtask = task;
-	ip_vs_sync_state |= state;
+	ipvs->sync_state |= state;
 
 	/* increase the module use count */
 	ip_vs_use_count_inc();
@@ -1602,16 +1602,18 @@ out:
 }
 
 
-int stop_sync_thread(int state)
+int stop_sync_thread(struct net *net, int state)
 {
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
 	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
 
 	if (state == IP_VS_STATE_MASTER) {
-		if (!sync_master_thread)
+		if (!ipvs->master_thread)
 			return -ESRCH;
 
 		pr_info("stopping master sync thread %d ...\n",
-			task_pid_nr(sync_master_thread));
+			task_pid_nr(ipvs->master_thread));
 
 		/*
 		 * The lock synchronizes with sb_queue_tail(), so that we don't
@@ -1619,21 +1621,21 @@ int stop_sync_thread(int state)
 		 * progress of stopping the master sync daemon.
 		 */
 
-		spin_lock_bh(&ip_vs_sync_lock);
-		ip_vs_sync_state &= ~IP_VS_STATE_MASTER;
-		spin_unlock_bh(&ip_vs_sync_lock);
-		kthread_stop(sync_master_thread);
-		sync_master_thread = NULL;
+		spin_lock_bh(&ipvs->sync_lock);
+		ipvs->sync_state &= ~IP_VS_STATE_MASTER;
+		spin_unlock_bh(&ipvs->sync_lock);
+		kthread_stop(ipvs->master_thread);
+		ipvs->master_thread = NULL;
 	} else if (state == IP_VS_STATE_BACKUP) {
-		if (!sync_backup_thread)
+		if (!ipvs->backup_thread)
 			return -ESRCH;
 
 		pr_info("stopping backup sync thread %d ...\n",
-			task_pid_nr(sync_backup_thread));
+			task_pid_nr(ipvs->backup_thread));
 
-		ip_vs_sync_state &= ~IP_VS_STATE_BACKUP;
-		kthread_stop(sync_backup_thread);
-		sync_backup_thread = NULL;
+		ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
+		kthread_stop(ipvs->backup_thread);
+		ipvs->backup_thread = NULL;
 	} else {
 		return -EINVAL;
 	}
@@ -1649,13 +1651,30 @@ int stop_sync_thread(int state)
  */
 static int __net_init __ip_vs_sync_init(struct net *net)
 {
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	if (!net_eq(net, &init_net))	/* netns not enabled yet */
+		return -EPERM;
+
+	INIT_LIST_HEAD(&ipvs->sync_queue);
+	spin_lock_init(&ipvs->sync_lock);
+	spin_lock_init(&ipvs->sync_buff_lock);
+
+	ipvs->sync_mcast_addr.sin_family = AF_INET;
+	ipvs->sync_mcast_addr.sin_port = cpu_to_be16(IP_VS_SYNC_PORT);
+	ipvs->sync_mcast_addr.sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP);
 	return 0;
 }
 
 static void __ip_vs_sync_cleanup(struct net *net)
 {
+	if (!net_eq(net, &init_net))	/* netns not enabled yet */
+		return;
+	stop_sync_thread(net, IP_VS_STATE_MASTER);
+	stop_sync_thread(net, IP_VS_STATE_BACKUP);
 	return;
 }
+
 static struct pernet_operations ipvs_sync_ops = {
 	.init = __ip_vs_sync_init,
 	.exit = __ip_vs_sync_cleanup,
-- 
1.7.2.3


^ permalink raw reply related

* [*v2 PATCH 15/22] IPVS: netns, ip_vs_stats and its procfs
From: Hans Schillstrom @ 2010-12-13 13:38 UTC (permalink / raw)
  To: horms, ja, daniel.lezcano, wensong, lvs-devel, netdev,
	netfilter-devel
  Cc: hans, Hans Schillstrom
In-Reply-To: <1292247510-753-1-git-send-email-hans.schillstrom@ericsson.com>

The statistic counter locks for every packet are now removed,
and that statistic is now per CPU, i.e. no locks needed.
However summing is made in ip_vs_est into ip_vs_stats struct
which is moved to ipvs struc.

procfs, ip_vs_stats now have a "per cpu" count and a grand total.
A new function seq_file_single_net() in ip_vs.h created for handling of
single_open_net() since it does not place net ptr in a struct, like others.


/var/lib/lxc # cat /proc/net/ip_vs_stats
       Total Incoming Outgoing         Incoming         Outgoing
CPU    Conns  Packets  Packets            Bytes            Bytes
  0        0        3        1               9D               34
  1        0        1        2               49               70
  2        0        1        2               34               76
  3        1        2        2               70               74
  ~        1        7        7              18A              18E

     Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s
           0        0        0                0                0

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
---
 include/net/ip_vs.h             |   22 ++++++++++++
 include/net/netns/ip_vs.h       |    4 ++
 net/netfilter/ipvs/ip_vs_core.c |   23 ++++++------
 net/netfilter/ipvs/ip_vs_ctl.c  |   72 ++++++++++++++++++++++++++-------------
 net/netfilter/ipvs/ip_vs_est.c  |   26 ++++++++++++++
 5 files changed, 111 insertions(+), 36 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 21ba325..848fcda 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -92,6 +92,19 @@ static inline struct net *skb_sknet(const struct sk_buff *skb) {
 	return &init_net;
 #endif
 }
+/*
+ * This one needed for single_open_net since net is stored directly in
+ * private not as a struct i.e. seq_file_net cant be used.
+ */
+static inline struct net *seq_file_single_net(struct seq_file *seq)
+{
+#ifdef CONFIG_NET_NS
+	return (struct net *)seq->private;
+#else
+	return &init_net;
+#endif
+}
+
 
 /* Connections' size value needed by ip_vs_ctl.c */
 extern int ip_vs_conn_tab_size;
@@ -348,6 +361,15 @@ struct ip_vs_stats {
 
 	spinlock_t              lock;           /* spin lock */
 };
+/*
+ * Helper Macros for per cpu
+ * ipvs->ctl_stats->ustats.count
+ */
+#define IPVS_STAT_INC(ipvs, count)	\
+	__this_cpu_inc((ipvs)->ustats->count)
+
+#define IPVS_STAT_ADD(ipvs, count, value)	\
+	__this_cpu_add((ipvs)->ustats->count, value)
 
 struct dst_entry;
 struct iphdr;
diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h
index 1f00447..b6642f0 100644
--- a/include/net/netns/ip_vs.h
+++ b/include/net/netns/ip_vs.h
@@ -62,6 +62,10 @@ struct netns_ipvs {
 	struct list_head 	sctp_apps[SCTP_APP_TAB_SIZE];
 	spinlock_t		sctp_app_lock;
 #endif
+	/* ip_vs_ctl */
+	struct ip_vs_stats 		*ctl_stats; /* Statistics & estimator */
+	struct ip_vs_stats_user __percpu *ustats;   /* Statistics */
+
 	/* ip_vs_lblc */
 	int 			sysctl_lblc_expiration;
 	struct ctl_table_header	*lblc_ctl_header;
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index e77ebcb..679eb16 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -115,6 +115,8 @@ static inline void
 ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 {
 	struct ip_vs_dest *dest = cp->dest;
+	struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+
 	if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
 		spin_lock(&dest->stats.lock);
 		dest->stats.ustats.inpkts++;
@@ -126,10 +128,8 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 		dest->svc->stats.ustats.inbytes += skb->len;
 		spin_unlock(&dest->svc->stats.lock);
 
-		spin_lock(&ip_vs_stats.lock);
-		ip_vs_stats.ustats.inpkts++;
-		ip_vs_stats.ustats.inbytes += skb->len;
-		spin_unlock(&ip_vs_stats.lock);
+		IPVS_STAT_INC(ipvs, inpkts);
+		IPVS_STAT_ADD(ipvs, inbytes, skb->len);
 	}
 }
 
@@ -138,6 +138,8 @@ static inline void
 ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 {
 	struct ip_vs_dest *dest = cp->dest;
+	struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+
 	if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
 		spin_lock(&dest->stats.lock);
 		dest->stats.ustats.outpkts++;
@@ -149,10 +151,8 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 		dest->svc->stats.ustats.outbytes += skb->len;
 		spin_unlock(&dest->svc->stats.lock);
 
-		spin_lock(&ip_vs_stats.lock);
-		ip_vs_stats.ustats.outpkts++;
-		ip_vs_stats.ustats.outbytes += skb->len;
-		spin_unlock(&ip_vs_stats.lock);
+		IPVS_STAT_INC(ipvs, outpkts);
+		IPVS_STAT_ADD(ipvs, outbytes, skb->len);
 	}
 }
 
@@ -160,6 +160,8 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 static inline void
 ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
 {
+	struct netns_ipvs *ipvs = net_ipvs(svc->net);
+
 	spin_lock(&cp->dest->stats.lock);
 	cp->dest->stats.ustats.conns++;
 	spin_unlock(&cp->dest->stats.lock);
@@ -168,9 +170,7 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
 	svc->stats.ustats.conns++;
 	spin_unlock(&svc->stats.lock);
 
-	spin_lock(&ip_vs_stats.lock);
-	ip_vs_stats.ustats.conns++;
-	spin_unlock(&ip_vs_stats.lock);
+	IPVS_STAT_INC(ipvs, conns);
 }
 
 
@@ -1827,7 +1827,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
 	},
 #endif
 };
-
 /*
  *	Initialize IP Virtual Server netns mem.
  */
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 86195c2..ca7bf30 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -258,8 +258,7 @@ static DECLARE_DELAYED_WORK(defense_work, defense_work_handler);
 
 static void defense_work_handler(struct work_struct *work)
 {
-	struct net *net = &init_net;
-	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct netns_ipvs *ipvs = net_ipvs(&init_net);
 
 	update_defense_level(ipvs);
 	if (atomic_read(&ip_vs_dropentry))
@@ -1498,7 +1497,7 @@ static int ip_vs_zero_all(struct net *net)
 		}
 	}
 
-	ip_vs_zero_stats(&ip_vs_stats);
+	ip_vs_zero_stats(net_ipvs(net)->ctl_stats);
 	return 0;
 }
 
@@ -1988,36 +1987,42 @@ static const struct file_operations ip_vs_info_fops = {
 
 #endif
 
-struct ip_vs_stats ip_vs_stats = {
-	.lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock),
-};
-
 #ifdef CONFIG_PROC_FS
 static int ip_vs_stats_show(struct seq_file *seq, void *v)
 {
+	struct net *net = seq_file_single_net(seq);
+	struct ip_vs_stats *ctl_stats = net_ipvs(net)->ctl_stats;
+	int i;
 
 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
 	seq_puts(seq,
-		 "   Total Incoming Outgoing         Incoming         Outgoing\n");
+		 "       Total Incoming Outgoing         Incoming         Outgoing\n");
 	seq_printf(seq,
-		   "   Conns  Packets  Packets            Bytes            Bytes\n");
+		   "CPU    Conns  Packets  Packets            Bytes            Bytes\n");
 
-	spin_lock_bh(&ip_vs_stats.lock);
-	seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.ustats.conns,
-		   ip_vs_stats.ustats.inpkts, ip_vs_stats.ustats.outpkts,
-		   (unsigned long long) ip_vs_stats.ustats.inbytes,
-		   (unsigned long long) ip_vs_stats.ustats.outbytes);
+	for_each_possible_cpu(i) {
+		struct ip_vs_stats_user *u = per_cpu_ptr(net->ipvs->ustats, i);
+		seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n",
+			    i, u->conns, u->inpkts, u->outpkts,
+			    (__u64) u->inbytes, (__u64) u->outbytes);
+	}
+
+	spin_lock_bh(&ctl_stats->lock);
+	seq_printf(seq, "  ~ %8X %8X %8X %16LX %16LX\n\n", ctl_stats->ustats.conns,
+		   ctl_stats->ustats.inpkts, ctl_stats->ustats.outpkts,
+		   (unsigned long long) ctl_stats->ustats.inbytes,
+		   (unsigned long long) ctl_stats->ustats.outbytes);
 
 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
 	seq_puts(seq,
-		   " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
-	seq_printf(seq,"%8X %8X %8X %16X %16X\n",
-			ip_vs_stats.ustats.cps,
-			ip_vs_stats.ustats.inpps,
-			ip_vs_stats.ustats.outpps,
-			ip_vs_stats.ustats.inbps,
-			ip_vs_stats.ustats.outbps);
-	spin_unlock_bh(&ip_vs_stats.lock);
+		   "     Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
+	seq_printf(seq,"    %8X %8X %8X %16X %16X\n",
+			ctl_stats->ustats.cps,
+			ctl_stats->ustats.inpps,
+			ctl_stats->ustats.outpps,
+			ctl_stats->ustats.inbps,
+			ctl_stats->ustats.outbps);
+	spin_unlock_bh(&ctl_stats->lock);
 
 	return 0;
 }
@@ -3459,6 +3464,18 @@ int __net_init __ip_vs_control_init(struct net *net)
 
 	if (!net_eq(net, &init_net))	/* netns not enabled yet */
 		return -EPERM;
+	/* procfs stats */
+	ipvs->ctl_stats = kzalloc(sizeof(struct ip_vs_stats), GFP_KERNEL);
+	if (ipvs->ctl_stats == NULL) {
+		pr_err("%s(): no memory.\n", __func__);
+		return -ENOMEM;
+	}
+	ipvs->ustats = alloc_percpu(struct ip_vs_stats_user);
+	if (!ipvs->ustats) {
+		pr_err("%s() alloc_percpu failed\n",__func__);
+		goto err_alloc;
+	}
+	spin_lock_init(&ipvs->ctl_stats->lock);
 
 	for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
 		INIT_LIST_HEAD(&ipvs->rs_table[idx]);
@@ -3469,22 +3486,29 @@ int __net_init __ip_vs_control_init(struct net *net)
 	sysctl_header = register_net_sysctl_table(net, net_vs_ctl_path, vs_vars);
 	if (sysctl_header == NULL)
 		goto err_reg;
-	ip_vs_new_estimator(net, &ip_vs_stats);
+	ip_vs_new_estimator(net, ipvs->ctl_stats);
 	return 0;
 
 err_reg:
+	free_percpu(ipvs->ustats);
+err_alloc:
+	kfree(ipvs->ctl_stats);
 	return -ENOMEM;
 }
 
 static void __net_exit __ip_vs_control_cleanup(struct net *net)
 {
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
 	if (!net_eq(net, &init_net))	/* netns not enabled yet */
 		return;
 
-	ip_vs_kill_estimator(net, &ip_vs_stats);
+	ip_vs_kill_estimator(net, ipvs->ctl_stats);
 	unregister_net_sysctl_table(sysctl_header);
 	proc_net_remove(net, "ip_vs_stats");
 	proc_net_remove(net, "ip_vs");
+	free_percpu(ipvs->ustats);
+	kfree(ipvs->ctl_stats);
 }
 
 static struct pernet_operations ipvs_control_ops = {
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index 6bcabed..576de09 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -55,6 +55,31 @@
 static void estimation_timer(unsigned long arg);
 static DEFINE_TIMER(est_timer, estimation_timer, 0, 0);
 
+/*
+ * Make a summary from each cpu
+ */
+static inline void get_stats(struct netns_ipvs *ipvs)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct ip_vs_stats_user *u = per_cpu_ptr(ipvs->ustats, i);
+		if (i) {
+			ipvs->ctl_stats->ustats.conns += u->conns;
+			ipvs->ctl_stats->ustats.inpkts += u->inpkts;
+			ipvs->ctl_stats->ustats.inbytes += u->inbytes;
+			ipvs->ctl_stats->ustats.outpkts += u->outpkts;
+			ipvs->ctl_stats->ustats.outbytes += u->outbytes;
+		} else {
+			ipvs->ctl_stats->ustats.conns = u->conns;
+			ipvs->ctl_stats->ustats.inpkts = u->inpkts;
+			ipvs->ctl_stats->ustats.inbytes = u->inbytes;
+			ipvs->ctl_stats->ustats.outpkts = u->outpkts;
+			ipvs->ctl_stats->ustats.outbytes = u->outbytes;
+		}
+	}
+}
+
 static void estimation_timer(unsigned long arg)
 {
 	struct ip_vs_estimator *e;
@@ -68,6 +93,7 @@ static void estimation_timer(unsigned long arg)
 
 	for_each_net(net) {
 		ipvs = net_ipvs(net);
+		get_stats(ipvs);
 		spin_lock(&ipvs->est_lock);
 		list_for_each_entry(e, &ipvs->est_list, list) {
 			s = container_of(e, struct ip_vs_stats, est);
-- 
1.7.2.3


^ permalink raw reply related

* [*v2 PATCH 08/22] IPVS: netns preparation for proto_sctp
From: Hans Schillstrom @ 2010-12-13 13:38 UTC (permalink / raw)
  To: horms, ja, daniel.lezcano, wensong, lvs-devel, netdev,
	netfilter-devel
  Cc: hans, Hans Schillstrom
In-Reply-To: <1292247510-753-1-git-send-email-hans.schillstrom@ericsson.com>

In this phase (one), all local vars will be moved to ipvs struct.

Remaining work, add param struct net *net to a couple of
functions that is common for all protos and use ip_vs_proto_data

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
---
 include/net/netns/ip_vs.h             |   10 +++-
 net/netfilter/ipvs/ip_vs_proto.c      |    3 +
 net/netfilter/ipvs/ip_vs_proto_sctp.c |  117 +++++++++++++++++----------------
 3 files changed, 73 insertions(+), 57 deletions(-)

diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h
index 4975026..fcb3c7c 100644
--- a/include/net/netns/ip_vs.h
+++ b/include/net/netns/ip_vs.h
@@ -48,7 +48,15 @@ struct netns_ipvs {
 	struct list_head 	udp_apps[UDP_APP_TAB_SIZE];
 	spinlock_t		udp_app_lock;
 #endif
-
+	/* ip_vs_proto_sctp */
+#ifdef CONFIG_IP_VS_PROTO_SCTP
+	#define SCTP_APP_TAB_BITS        4
+	#define SCTP_APP_TAB_SIZE        (1 << SCTP_APP_TAB_BITS)
+	#define SCTP_APP_TAB_MASK        (SCTP_APP_TAB_SIZE - 1)
+	/* Hash table for SCTP application incarnations	 */
+	struct list_head 	sctp_apps[SCTP_APP_TAB_SIZE];
+	spinlock_t		sctp_app_lock;
+#endif
 	/* ip_vs_lblc */
 	int 			sysctl_lblc_expiration;
 	struct ctl_table_header	*lblc_ctl_header;
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index ec71d47..fd3a7a8 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -313,6 +313,9 @@ static int  __net_init  __ip_vs_protocol_init(struct net *net)
 #ifdef CONFIG_IP_VS_PROTO_UDP
 	register_ip_vs_proto_netns(net, &ip_vs_protocol_udp);
 #endif
+#ifdef CONFIG_IP_VS_PROTO_SCTP
+	register_ip_vs_proto_netns(net, &ip_vs_protocol_sctp);
+#endif
 	return 0;
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 521b827..108ae0c 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -862,7 +862,7 @@ static struct ipvs_sctp_nextstate
 /*
  *      Timeout table[state]
  */
-static int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = {
+static const int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = {
 	[IP_VS_SCTP_S_NONE]         =     2 * HZ,
 	[IP_VS_SCTP_S_INIT_CLI]     =     1 * 60 * HZ,
 	[IP_VS_SCTP_S_INIT_SER]     =     1 * 60 * HZ,
@@ -906,17 +906,14 @@ static const char *sctp_state_name(int state)
 	return "?";
 }
 
-static void sctp_timeout_change(struct ip_vs_protocol *pp, int flags)
-{
-}
-
+/*
 static int
 sctp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
 {
 
 return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_SCTP_S_LAST,
 				sctp_state_name_table, sname, to);
-}
+} */
 
 static inline int
 set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
@@ -926,6 +923,7 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
 	unsigned char chunk_type;
 	int event, next_state;
 	int ihl;
+	struct ip_vs_proto_data *pd;
 
 #ifdef CONFIG_IP_VS_IPV6
 	ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
@@ -1001,10 +999,13 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
 			}
 		}
 	}
+	pd = ip_vs_proto_data_get(&init_net, pp->protocol); /* tmp fix */
+	if (likely(pd))
+		cp->timeout = pd->timeout_table[cp->state = next_state];
+	else	/* What to do ? */
+		cp->timeout = sctp_timeouts[cp->state = next_state];
 
-	 cp->timeout = pp->timeout_table[cp->state = next_state];
-
-	 return 1;
+	return 1;
 }
 
 static int
@@ -1020,16 +1021,6 @@ sctp_state_transition(struct ip_vs_conn *cp, int direction,
 	return ret;
 }
 
-/*
- *      Hash table for SCTP application incarnations
- */
-#define SCTP_APP_TAB_BITS        4
-#define SCTP_APP_TAB_SIZE        (1 << SCTP_APP_TAB_BITS)
-#define SCTP_APP_TAB_MASK        (SCTP_APP_TAB_SIZE - 1)
-
-static struct list_head sctp_apps[SCTP_APP_TAB_SIZE];
-static DEFINE_SPINLOCK(sctp_app_lock);
-
 static inline __u16 sctp_app_hashkey(__be16 port)
 {
 	return (((__force u16)port >> SCTP_APP_TAB_BITS) ^ (__force u16)port)
@@ -1042,34 +1033,40 @@ static int sctp_register_app(struct ip_vs_app *inc)
 	__u16 hash;
 	__be16 port = inc->port;
 	int ret = 0;
+	struct netns_ipvs *ipvs = net_ipvs(&init_net);
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, IPPROTO_SCTP);
 
 	hash = sctp_app_hashkey(port);
 
-	spin_lock_bh(&sctp_app_lock);
-	list_for_each_entry(i, &sctp_apps[hash], p_list) {
+	spin_lock_bh(&ipvs->sctp_app_lock);
+	list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) {
 		if (i->port == port) {
 			ret = -EEXIST;
 			goto out;
 		}
 	}
-	list_add(&inc->p_list, &sctp_apps[hash]);
-	atomic_inc(&ip_vs_protocol_sctp.appcnt);
+	list_add(&inc->p_list, &ipvs->sctp_apps[hash]);
+	atomic_inc(&pd->pp->appcnt);
 out:
-	spin_unlock_bh(&sctp_app_lock);
+	spin_unlock_bh(&ipvs->sctp_app_lock);
 
 	return ret;
 }
 
 static void sctp_unregister_app(struct ip_vs_app *inc)
 {
-	spin_lock_bh(&sctp_app_lock);
-	atomic_dec(&ip_vs_protocol_sctp.appcnt);
+	struct netns_ipvs *ipvs = net_ipvs(&init_net);
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, IPPROTO_SCTP);
+
+	spin_lock_bh(&ipvs->sctp_app_lock);
+	atomic_dec(&pd->pp->appcnt);
 	list_del(&inc->p_list);
-	spin_unlock_bh(&sctp_app_lock);
+	spin_unlock_bh(&ipvs->sctp_app_lock);
 }
 
 static int sctp_app_conn_bind(struct ip_vs_conn *cp)
 {
+	struct netns_ipvs *ipvs = net_ipvs(&init_net);
 	int hash;
 	struct ip_vs_app *inc;
 	int result = 0;
@@ -1080,12 +1077,12 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)
 	/* Lookup application incarnations and bind the right one */
 	hash = sctp_app_hashkey(cp->vport);
 
-	spin_lock(&sctp_app_lock);
-	list_for_each_entry(inc, &sctp_apps[hash], p_list) {
+	spin_lock(&ipvs->sctp_app_lock);
+	list_for_each_entry(inc, &ipvs->sctp_apps[hash], p_list) {
 		if (inc->port == cp->vport) {
 			if (unlikely(!ip_vs_app_inc_get(inc)))
 				break;
-			spin_unlock(&sctp_app_lock);
+			spin_unlock(&ipvs->sctp_app_lock);
 
 			IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
 					"%s:%u to app %s on port %u\n",
@@ -1101,43 +1098,51 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)
 			goto out;
 		}
 	}
-	spin_unlock(&sctp_app_lock);
+	spin_unlock(&ipvs->sctp_app_lock);
 out:
 	return result;
 }
 
-static void ip_vs_sctp_init(struct ip_vs_protocol *pp)
+/* ---------------------------------------------
+ *   timeouts is netns related now.
+ * ---------------------------------------------
+ */
+static void __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
 {
-	IP_VS_INIT_HASH_TABLE(sctp_apps);
-	pp->timeout_table = sctp_timeouts;
-}
+	struct netns_ipvs *ipvs = net_ipvs(net);
 
+	ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE);
+	spin_lock_init(&ipvs->tcp_app_lock);
+	pd->timeout_table = ip_vs_create_timeout_table((int*)sctp_timeouts,
+							sizeof(sctp_timeouts));
+}
 
-static void ip_vs_sctp_exit(struct ip_vs_protocol *pp)
+static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd)
 {
-
+	kfree(pd->timeout_table);
 }
 
 struct ip_vs_protocol ip_vs_protocol_sctp = {
-	.name = "SCTP",
-	.protocol = IPPROTO_SCTP,
-	.num_states = IP_VS_SCTP_S_LAST,
-	.dont_defrag = 0,
-	.appcnt = ATOMIC_INIT(0),
-	.init = ip_vs_sctp_init,
-	.exit = ip_vs_sctp_exit,
-	.register_app = sctp_register_app,
+	.name		= "SCTP",
+	.protocol	= IPPROTO_SCTP,
+	.num_states	= IP_VS_SCTP_S_LAST,
+	.dont_defrag	= 0,
+	.init 		= NULL,
+	.exit 		= NULL,
+	.init_netns 	= __ip_vs_sctp_init,
+	.exit_netns 	= __ip_vs_sctp_exit,
+	.register_app	= sctp_register_app,
 	.unregister_app = sctp_unregister_app,
-	.conn_schedule = sctp_conn_schedule,
-	.conn_in_get = ip_vs_conn_in_get_proto,
-	.conn_out_get = ip_vs_conn_out_get_proto,
-	.snat_handler = sctp_snat_handler,
-	.dnat_handler = sctp_dnat_handler,
-	.csum_check = sctp_csum_check,
-	.state_name = sctp_state_name,
+	.conn_schedule	= sctp_conn_schedule,
+	.conn_in_get	= ip_vs_conn_in_get_proto,
+	.conn_out_get	= ip_vs_conn_out_get_proto,
+	.snat_handler	= sctp_snat_handler,
+	.dnat_handler	= sctp_dnat_handler,
+	.csum_check	= sctp_csum_check,
+	.state_name	= sctp_state_name,
 	.state_transition = sctp_state_transition,
-	.app_conn_bind = sctp_app_conn_bind,
-	.debug_packet = ip_vs_tcpudp_debug_packet,
-	.timeout_change = sctp_timeout_change,
-	.set_state_timeout = sctp_set_state_timeout,
+	.app_conn_bind	= sctp_app_conn_bind,
+	.debug_packet	= ip_vs_tcpudp_debug_packet,
+	.timeout_change	= NULL,
+/*	.set_state_timeout = sctp_set_state_timeout, */
 };
-- 
1.7.2.3


^ permalink raw reply related

* [*v2 PATCH 06/22] IPVS: netns preparation for proto_tcp
From: Hans Schillstrom @ 2010-12-13 13:38 UTC (permalink / raw)
  To: horms, ja, daniel.lezcano, wensong, lvs-devel, netdev,
	netfilter-devel
  Cc: hans, Hans Schillstrom
In-Reply-To: <1292247510-753-1-git-send-email-hans.schillstrom@ericsson.com>

In this phase (one), all local vars will be moved to ipvs struct.

Remaining work, add param struct net *net to a couple of
functions that is common for all protos and use all
ip_vs_proto_data

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
---
 include/net/ip_vs.h                  |    7 ++-
 include/net/netns/ip_vs.h            |    8 +++
 net/netfilter/ipvs/ip_vs_ftp.c       |    9 ++-
 net/netfilter/ipvs/ip_vs_proto.c     |   13 ++++-
 net/netfilter/ipvs/ip_vs_proto_tcp.c |   99 +++++++++++++++++++--------------
 5 files changed, 87 insertions(+), 49 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 528c5e8..3765add 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -40,11 +40,12 @@ static inline struct netns_ipvs * net_ipvs(struct net* net) {
 	return init_net.ipvs;
 #endif
 }
+
 /*
  * Get net ptr from skb in traffic cases
  * use skb_sknet when call is from userland (ioctl or netlink)
  */
-static inline struct net *skb_net(struct sk_buff *skb) {
+static inline struct net *skb_net(const struct sk_buff *skb) {
 #ifdef CONFIG_NET_NS
 #ifdef CONFIG_IP_VS_DEBUG
 	/*
@@ -71,7 +72,7 @@ static inline struct net *skb_net(struct sk_buff *skb) {
 #endif
 }
 
-static inline struct net *skb_sknet(struct sk_buff *skb) {
+static inline struct net *skb_sknet(const struct sk_buff *skb) {
 #ifdef CONFIG_NET_NS
 #ifdef CONFIG_IP_VS_DEBUG
 	/* Start with the most likely hit */
@@ -808,7 +809,7 @@ extern void ip_vs_conn_expire_now(struct ip_vs_conn *cp);
 
 extern const char * ip_vs_state_name(__u16 proto, int state);
 
-extern void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp);
+extern void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp);
 extern int ip_vs_check_template(struct ip_vs_conn *ct);
 extern void ip_vs_random_dropentry(void);
 extern int ip_vs_conn_init(void);
diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h
index b7d7815..512cdd0 100644
--- a/include/net/netns/ip_vs.h
+++ b/include/net/netns/ip_vs.h
@@ -32,6 +32,14 @@ struct netns_ipvs {
 	/* ip_vs_proto */
 	#define IP_VS_PROTO_TAB_SIZE	32	/* must be power of 2 */
 	struct ip_vs_proto_data *proto_data_table[IP_VS_PROTO_TAB_SIZE];
+	/* ip_vs_proto_tcp */
+#ifdef CONFIG_IP_VS_PROTO_TCP
+	#define	TCP_APP_TAB_BITS	4
+	#define	TCP_APP_TAB_SIZE	(1 << TCP_APP_TAB_BITS)
+	#define	TCP_APP_TAB_MASK	(TCP_APP_TAB_SIZE - 1)
+	struct list_head 	tcp_apps[TCP_APP_TAB_SIZE];
+	spinlock_t		tcp_app_lock;
+#endif
 
 	/* ip_vs_lblc */
 	int 			sysctl_lblc_expiration;
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 0e762f3..fdcb74b 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -157,6 +157,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	int ret = 0;
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct;
+	struct net *net;
 
 #ifdef CONFIG_IP_VS_IPV6
 	/* This application helper doesn't work with IPv6 yet,
@@ -256,9 +257,9 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 		 * Not setting 'diff' is intentional, otherwise the sequence
 		 * would be adjusted twice.
 		 */
-
+		net = skb_net(skb);
 		cp->app_data = NULL;
-		ip_vs_tcp_conn_listen(n_cp);
+		ip_vs_tcp_conn_listen(net, n_cp);
 		ip_vs_conn_put(n_cp);
 		return ret;
 	}
@@ -287,6 +288,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	union nf_inet_addr to;
 	__be16 port;
 	struct ip_vs_conn *n_cp;
+	struct net *net;
 
 #ifdef CONFIG_IP_VS_IPV6
 	/* This application helper doesn't work with IPv6 yet,
@@ -378,7 +380,8 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	/*
 	 *	Move tunnel to listen state
 	 */
-	ip_vs_tcp_conn_listen(n_cp);
+	net = skb_net(skb);
+	ip_vs_tcp_conn_listen(net, n_cp);
 	ip_vs_conn_put(n_cp);
 
 	return 1;
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index 8caaf3e..90d69c5 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -307,12 +307,23 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
  */
 static int  __net_init  __ip_vs_protocol_init(struct net *net)
 {
+#ifdef CONFIG_IP_VS_PROTO_TCP
+	register_ip_vs_proto_netns(net, &ip_vs_protocol_tcp);
+#endif
 	return 0;
 }
 
 static void __net_exit __ip_vs_protocol_cleanup(struct net *net)
 {
-	/* empty */
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_proto_data *pd;
+	int i;
+
+	/* unregister all the ipvs proto data for this netns */
+	for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
+		while ((pd = ipvs->proto_data_table[i]) != NULL)
+			unregister_ip_vs_proto_netns(net, pd);
+	}
 }
 
 static struct pernet_operations ipvs_proto_ops = {
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 5e4da60..8986b25 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -9,8 +9,12 @@
  *              as published by the Free Software Foundation; either version
  *              2 of the License, or (at your option) any later version.
  *
- * Changes:
+ * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
  *
+ *              Network name space (netns) aware.
+ *              Global data moved to netns i.e struct netns_ipvs
+ *              tcp_timeouts table has copy per netns in a hash table per
+ *              protocol ip_vs_proto_data and is handled by netns
  */
 
 #define KMSG_COMPONENT "IPVS"
@@ -46,8 +50,8 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 	net = skb_net(skb);
 	/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
 	if (th->syn &&
-	    (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol, &iph.daddr,
-				     th->dest))) {
+	    (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
+				     &iph.daddr, th->dest))) {
 		int ignored;
 
 		if (ip_vs_todrop()) {
@@ -345,7 +349,7 @@ static const int tcp_state_off[IP_VS_DIR_LAST] = {
 /*
  *	Timeout table[state]
  */
-static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
+static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
 	[IP_VS_TCP_S_NONE]		=	2*HZ,
 	[IP_VS_TCP_S_ESTABLISHED]	=	15*60*HZ,
 	[IP_VS_TCP_S_SYN_SENT]		=	2*60*HZ,
@@ -459,13 +463,13 @@ static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
 	*/
 	tcp_state_table = (on? tcp_states_dos : tcp_states);
 }
-
+/* Remove dot used
 static int
 tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
 {
 	return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
 				       tcp_state_name_table, sname, to);
-}
+} */
 
 static inline int tcp_state_idx(struct tcphdr *th)
 {
@@ -487,6 +491,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
 	int state_idx;
 	int new_state = IP_VS_TCP_S_CLOSE;
 	int state_off = tcp_state_off[direction];
+	struct ip_vs_proto_data *pd;  /* Temp fix */
 
 	/*
 	 *    Update state offset to INPUT_ONLY if necessary
@@ -542,10 +547,13 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
 		}
 	}
 
-	cp->timeout = pp->timeout_table[cp->state = new_state];
+	pd = ip_vs_proto_data_get(&init_net, pp->protocol);
+	if (likely(pd))
+		cp->timeout = pd->timeout_table[cp->state = new_state];
+	else	/* What to do ? */
+		cp->timeout = tcp_timeouts[cp->state = new_state];
 }
 
-
 /*
  *	Handle state transitions
  */
@@ -573,17 +581,6 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction,
 	return 1;
 }
 
-
-/*
- *	Hash table for TCP application incarnations
- */
-#define	TCP_APP_TAB_BITS	4
-#define	TCP_APP_TAB_SIZE	(1 << TCP_APP_TAB_BITS)
-#define	TCP_APP_TAB_MASK	(TCP_APP_TAB_SIZE - 1)
-
-static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
-static DEFINE_SPINLOCK(tcp_app_lock);
-
 static inline __u16 tcp_app_hashkey(__be16 port)
 {
 	return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
@@ -597,21 +594,23 @@ static int tcp_register_app(struct ip_vs_app *inc)
 	__u16 hash;
 	__be16 port = inc->port;
 	int ret = 0;
+	struct netns_ipvs *ipvs = net_ipvs(&init_net);
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, IPPROTO_TCP);
 
 	hash = tcp_app_hashkey(port);
 
-	spin_lock_bh(&tcp_app_lock);
-	list_for_each_entry(i, &tcp_apps[hash], p_list) {
+	spin_lock_bh(&ipvs->tcp_app_lock);
+	list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
 		if (i->port == port) {
 			ret = -EEXIST;
 			goto out;
 		}
 	}
-	list_add(&inc->p_list, &tcp_apps[hash]);
-	atomic_inc(&ip_vs_protocol_tcp.appcnt);
+	list_add(&inc->p_list, &ipvs->tcp_apps[hash]);
+	atomic_inc(&pd->pp->appcnt);
 
   out:
-	spin_unlock_bh(&tcp_app_lock);
+	spin_unlock_bh(&ipvs->tcp_app_lock);
 	return ret;
 }
 
@@ -619,16 +618,20 @@ static int tcp_register_app(struct ip_vs_app *inc)
 static void
 tcp_unregister_app(struct ip_vs_app *inc)
 {
-	spin_lock_bh(&tcp_app_lock);
-	atomic_dec(&ip_vs_protocol_tcp.appcnt);
+	struct netns_ipvs *ipvs = net_ipvs(&init_net);
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, IPPROTO_TCP);
+
+	spin_lock_bh(&ipvs->tcp_app_lock);
+	atomic_dec(&pd->pp->appcnt);
 	list_del(&inc->p_list);
-	spin_unlock_bh(&tcp_app_lock);
+	spin_unlock_bh(&ipvs->tcp_app_lock);
 }
 
 
 static int
 tcp_app_conn_bind(struct ip_vs_conn *cp)
 {
+	struct netns_ipvs *ipvs = net_ipvs(&init_net);
 	int hash;
 	struct ip_vs_app *inc;
 	int result = 0;
@@ -640,12 +643,12 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
 	/* Lookup application incarnations and bind the right one */
 	hash = tcp_app_hashkey(cp->vport);
 
-	spin_lock(&tcp_app_lock);
-	list_for_each_entry(inc, &tcp_apps[hash], p_list) {
+	spin_lock(&ipvs->tcp_app_lock);
+	list_for_each_entry(inc, &ipvs->tcp_apps[hash], p_list) {
 		if (inc->port == cp->vport) {
 			if (unlikely(!ip_vs_app_inc_get(inc)))
 				break;
-			spin_unlock(&tcp_app_lock);
+			spin_unlock(&ipvs->tcp_app_lock);
 
 			IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
 				      "%s:%u to app %s on port %u\n",
@@ -662,7 +665,7 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
 			goto out;
 		}
 	}
-	spin_unlock(&tcp_app_lock);
+	spin_unlock(&ipvs->tcp_app_lock);
 
   out:
 	return result;
@@ -672,24 +675,34 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
 /*
  *	Set LISTEN timeout. (ip_vs_conn_put will setup timer)
  */
-void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
+void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
 {
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+
 	spin_lock(&cp->lock);
 	cp->state = IP_VS_TCP_S_LISTEN;
-	cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
+	cp->timeout = ( pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
+			   : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
 	spin_unlock(&cp->lock);
 }
 
-
-static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
+/* ---------------------------------------------
+ *   timeouts is netns related now.
+ * ---------------------------------------------
+ */
+static void __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
 {
-	IP_VS_INIT_HASH_TABLE(tcp_apps);
-	pp->timeout_table = tcp_timeouts;
-}
+	struct netns_ipvs *ipvs = net_ipvs(net);
 
+	ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
+	spin_lock_init(&ipvs->tcp_app_lock);
+	pd->timeout_table = ip_vs_create_timeout_table((int*)tcp_timeouts,
+							sizeof(tcp_timeouts));
+}
 
-static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
+static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd)
 {
+	kfree(pd->timeout_table);
 }
 
 
@@ -699,8 +712,10 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
 	.num_states =		IP_VS_TCP_S_LAST,
 	.dont_defrag =		0,
 	.appcnt =		ATOMIC_INIT(0),
-	.init =			ip_vs_tcp_init,
-	.exit =			ip_vs_tcp_exit,
+	.init =			NULL,
+	.exit =			NULL,
+	.init_netns =		__ip_vs_tcp_init,
+	.exit_netns =		__ip_vs_tcp_exit,
 	.register_app =		tcp_register_app,
 	.unregister_app =	tcp_unregister_app,
 	.conn_schedule =	tcp_conn_schedule,
@@ -714,5 +729,5 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
 	.app_conn_bind =	tcp_app_conn_bind,
 	.debug_packet =		ip_vs_tcpudp_debug_packet,
 	.timeout_change =	tcp_timeout_change,
-	.set_state_timeout =	tcp_set_state_timeout,
+/*	.set_state_timeout =	tcp_set_state_timeout, */
 };
-- 
1.7.2.3


^ permalink raw reply related

* [*v2 PATCH 22/22] IPVS: netns, final patch enabling network name space.
From: Hans Schillstrom @ 2010-12-13 13:38 UTC (permalink / raw)
  To: horms, ja, daniel.lezcano, wensong, lvs-devel, netdev,
	netfilter-devel
  Cc: hans, Hans Schillstrom
In-Reply-To: <1292247510-753-1-git-send-email-hans.schillstrom@ericsson.com>

all init_net removed, (except for some alloc related
that needs to be there)

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
---
 net/netfilter/ipvs/ip_vs_app.c  |    3 ---
 net/netfilter/ipvs/ip_vs_conn.c |    5 -----
 net/netfilter/ipvs/ip_vs_core.c |    4 ----
 net/netfilter/ipvs/ip_vs_ctl.c  |    7 +------
 net/netfilter/ipvs/ip_vs_est.c  |    3 ---
 net/netfilter/ipvs/ip_vs_ftp.c  |    6 ------
 net/netfilter/ipvs/ip_vs_sync.c |    5 -----
 7 files changed, 1 insertions(+), 32 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index af2cd83..f10901b 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -581,9 +581,6 @@ static int __net_init __ip_vs_app_init(struct net *net)
 {
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
-	if (!net_eq(net, &init_net))	/* netns not enabled yet */
-		return -EPERM;
-
 	INIT_LIST_HEAD(&ipvs->app_list);
 	__mutex_init(&ipvs->app_mutex,"ipvs->app_mutex", &ipvs->app_key);
 	proc_net_fops_create(net, "ip_vs_app", 0, &ip_vs_app_fops);
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 0200d97..24a17c8 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -1232,8 +1232,6 @@ int __net_init __ip_vs_conn_init(struct net *net)
 {
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
-	if (!net_eq(net, &init_net))	/* netns not enabled yet */
-		return -EPERM;
 	atomic_set(&ipvs->conn_count, 0);
 
 	proc_net_fops_create(net, "ip_vs_conn", 0, &ip_vs_conn_fops);
@@ -1244,9 +1242,6 @@ int __net_init __ip_vs_conn_init(struct net *net)
 /* Cleanup and release all netns related ... */
 static void __net_exit __ip_vs_conn_cleanup(struct net *net)
 {
-	if (!net_eq(net, &init_net))	/* netns not enabled yet */
-		return;
-
 	/* flush all the connection entries first */
 	ip_vs_conn_flush(net);
 	proc_net_remove(net, "ip_vs_conn");
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 7441dcd..bc3fca5 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1852,10 +1852,6 @@ static int __net_init  __ip_vs_init(struct net *net)
 {
 	struct netns_ipvs *ipvs;
 
-	if (!net_eq(net, &init_net)) {
-		pr_err("The final patch for enabling netns is missing\n");
-		return -EPERM;
-	}
 	ipvs = (struct netns_ipvs *)net_generic(net, ip_vs_net_id);
 	if (ipvs == NULL) {
 		pr_err("%s(): no memory.\n", __func__);
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 3883ec6..f1a1ad8 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2544,6 +2544,7 @@ static struct genl_family ip_vs_genl_family = {
 	.name		= IPVS_GENL_NAME,
 	.version	= IPVS_GENL_VERSION,
 	.maxattr	= IPVS_CMD_MAX,
+	.netnsok        = true,         /* Make ipvsadm to work on netns */
 };
 
 /* Policy used for first-level command attributes */
@@ -3413,9 +3414,6 @@ int __net_init __ip_vs_control_init(struct net *net)
 	struct netns_ipvs *ipvs = net_ipvs(net);
 	struct ctl_table *tbl;
 
-	if (!net_eq(net, &init_net))	/* netns not enabled yet */
-		return -EPERM;
-
 	atomic_set(&ipvs->dropentry, 0);
 	spin_lock_init(&ipvs->dropentry_lock);
 	spin_lock_init(&ipvs->droppacket_lock);
@@ -3507,9 +3505,6 @@ static void __net_exit __ip_vs_control_cleanup(struct net *net)
 {
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
-	if (!net_eq(net, &init_net))	/* netns not enabled yet */
-		return;
-
 	ip_vs_trash_cleanup(net);
 	ip_vs_kill_estimator(net, ipvs->ctl_stats);
 	cancel_rearming_delayed_work(&ipvs->defense_work);
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index 576de09..cf8a0f5 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -195,9 +195,6 @@ static int __net_init __ip_vs_estimator_init(struct net *net)
 {
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
-	if (!net_eq(net, &init_net))	/* netns not enabled yet */
-		return -EPERM;
-
 	INIT_LIST_HEAD(&ipvs->est_list);
 	spin_lock_init(&ipvs->est_lock);
 	return 0;
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 78d5980..596b39c 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -412,9 +412,6 @@ static int __net_init __ip_vs_ftp_init(struct net *net)
 	int i, ret;
 	struct ip_vs_app *app = &ip_vs_ftp;
 
-	if (!net_eq(net, &init_net))	/* netns not enabled yet */
-		return -EPERM;
-
 	ret = register_ip_vs_app(net, app);
 	if (ret)
 		return ret;
@@ -441,9 +438,6 @@ static void __ip_vs_ftp_exit(struct net *net)
 {
 	struct ip_vs_app *app = &ip_vs_ftp;
 
-	if (!net_eq(net, &init_net))	/* netns not enabled yet */
-		return;
-
 	unregister_ip_vs_app(net, app);
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 25b4729..f0e020b 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1653,9 +1653,6 @@ static int __net_init __ip_vs_sync_init(struct net *net)
 {
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
-	if (!net_eq(net, &init_net))	/* netns not enabled yet */
-		return -EPERM;
-
 	INIT_LIST_HEAD(&ipvs->sync_queue);
 	spin_lock_init(&ipvs->sync_lock);
 	spin_lock_init(&ipvs->sync_buff_lock);
@@ -1668,8 +1665,6 @@ static int __net_init __ip_vs_sync_init(struct net *net)
 
 static void __ip_vs_sync_cleanup(struct net *net)
 {
-	if (!net_eq(net, &init_net))	/* netns not enabled yet */
-		return;
 	stop_sync_thread(net, IP_VS_STATE_MASTER);
 	stop_sync_thread(net, IP_VS_STATE_BACKUP);
 	return;
-- 
1.7.2.3


^ permalink raw reply related

* [*v2 PATCH 21/22] IPVS: netns, misc init_net removal in core.
From: Hans Schillstrom @ 2010-12-13 13:38 UTC (permalink / raw)
  To: horms, ja, daniel.lezcano, wensong, lvs-devel, netdev,
	netfilter-devel
  Cc: hans, Hans Schillstrom
In-Reply-To: <1292247510-753-1-git-send-email-hans.schillstrom@ericsson.com>

init_net removed in __ip_vs_addr_is_local_v6, and got net as param.

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
---
 net/netfilter/ipvs/ip_vs_core.c |    7 +++++--
 net/netfilter/ipvs/ip_vs_ctl.c  |    9 +++++----
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 34e4f71..7441dcd 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -486,10 +486,12 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 		struct ip_vs_protocol *pp)
 {
+	struct net *net;
 	struct netns_ipvs *ipvs;
 	__be16 _ports[2], *pptr;
 	struct ip_vs_iphdr iph;
 	int unicast;
+
 	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
 
 	pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
@@ -497,18 +499,19 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 		ip_vs_service_put(svc);
 		return NF_DROP;
 	}
+	net = skb_net(skb);
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (svc->af == AF_INET6)
 		unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
 	else
 #endif
-		unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
+		unicast = (inet_addr_type(net, iph.daddr.ip) == RTN_UNICAST);
 
 	/* if it is fwmark-based service, the cache_bypass sysctl is up
 	   and the destination is a non-local unicast, then create
 	   a cache_bypass connection entry */
-	ipvs = net_ipvs(skb_net(skb));
+	ipvs = net_ipvs(net);
 	if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) {
 		int ret, cs;
 		struct ip_vs_conn *cp;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 047a779..3883ec6 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -68,7 +68,8 @@ int ip_vs_get_debug_level(void)
 
 #ifdef CONFIG_IP_VS_IPV6
 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
-static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr)
+static int __ip_vs_addr_is_local_v6(struct net *net,
+				    const struct in6_addr *addr)
 {
 	struct rt6_info *rt;
 	struct flowi fl = {
@@ -79,7 +80,7 @@ static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr)
 				.saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
 	};
 
-	rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
+	rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl);
 	if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
 			return 1;
 
@@ -803,12 +804,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
 		atype = ipv6_addr_type(&udest->addr.in6);
 		if ((!(atype & IPV6_ADDR_UNICAST) ||
 			atype & IPV6_ADDR_LINKLOCAL) &&
-			!__ip_vs_addr_is_local_v6(&udest->addr.in6))
+			!__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
 			return -EINVAL;
 	} else
 #endif
 	{
-		atype = inet_addr_type(&init_net, udest->addr.ip);
+		atype = inet_addr_type(svc->net, udest->addr.ip);
 		if (atype != RTN_LOCAL && atype != RTN_UNICAST)
 			return -EINVAL;
 	}
-- 
1.7.2.3


^ permalink raw reply related

* [*v2 PATCH 20/22] IPVS: netns, svc counters moved in ip_vs_ctl,c
From: Hans Schillstrom @ 2010-12-13 13:38 UTC (permalink / raw)
  To: horms, ja, daniel.lezcano, wensong, lvs-devel, netdev,
	netfilter-devel
  Cc: hans, Hans Schillstrom
In-Reply-To: <1292247510-753-1-git-send-email-hans.schillstrom@ericsson.com>

Last two global vars to be moved,
ip_vs_ftpsvc_counter and ip_vs_nullsvc_counter.

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
---
 include/net/netns/ip_vs.h      |    4 ++++
 net/netfilter/ipvs/ip_vs_ctl.c |   21 +++++++++------------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h
index b70444d..b7e0569 100644
--- a/include/net/netns/ip_vs.h
+++ b/include/net/netns/ip_vs.h
@@ -69,6 +69,7 @@ struct netns_ipvs {
 	struct ip_vs_stats 		 *ctl_stats; /* Statistics & estimator */
 	struct ip_vs_stats_user __percpu *ustats;    /* Statistics */
 	int 			num_services;    /* no of virtual services */
+
 	/* 1/rate drop and drop-entry variables */
 	struct delayed_work	defense_work;   /* Work handler */
 	int 			drop_rate;
@@ -84,6 +85,9 @@ struct netns_ipvs {
 	struct lock_class_key 	ctl_key;	/* ctl_mutex debuging */
 	/* Trash for destinations */
 	struct list_head	dest_trash;
+	/* Service counters */
+	atomic_t 		ftpsvc_counter;
+	atomic_t 		nullsvc_counter;
 
 	/* sys-ctl struct */
 	struct ctl_table_header	*sysctl_hdr;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 418f6ba..047a779 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -252,12 +252,6 @@ static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
 /* the service table hashed by fwmark */
 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
 
-/*
- *	FTP & NULL virtual service counters
- */
-static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
-static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
-
 
 /*
  *	Returns hash value for virtual service
@@ -406,6 +400,7 @@ ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
 		  const union nf_inet_addr *vaddr, __be16 vport)
 {
 	struct ip_vs_service *svc;
+	struct netns_ipvs *ipvs = net_ipvs(net);
 
 	read_lock(&__ip_vs_svc_lock);
 
@@ -423,7 +418,7 @@ ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
 
 	if (svc == NULL
 	    && protocol == IPPROTO_TCP
-	    && atomic_read(&ip_vs_ftpsvc_counter)
+	    && atomic_read(&ipvs->ftpsvc_counter)
 	    && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
 		/*
 		 * Check if ftp service entry exists, the packet
@@ -433,7 +428,7 @@ ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
 	}
 
 	if (svc == NULL
-	    && atomic_read(&ip_vs_nullsvc_counter)) {
+	    && atomic_read(&ipvs->nullsvc_counter)) {
 		/*
 		 * Check if the catch-all port (port zero) exists
 		 */
@@ -1150,9 +1145,9 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
 
 	/* Update the virtual service counters */
 	if (svc->port == FTPPORT)
-		atomic_inc(&ip_vs_ftpsvc_counter);
+		atomic_inc(&ipvs->ftpsvc_counter);
 	else if (svc->port == 0)
-		atomic_inc(&ip_vs_nullsvc_counter);
+		atomic_inc(&ipvs->nullsvc_counter);
 
 	ip_vs_new_estimator(net, &svc->stats);
 
@@ -1333,9 +1328,9 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
 	 *    Update the virtual service counters
 	 */
 	if (svc->port == FTPPORT)
-		atomic_dec(&ip_vs_ftpsvc_counter);
+		atomic_dec(&ipvs->ftpsvc_counter);
 	else if (svc->port == 0)
-		atomic_dec(&ip_vs_nullsvc_counter);
+		atomic_dec(&ipvs->nullsvc_counter);
 
 	/*
 	 *    Free the service if nobody refers to it
@@ -3432,6 +3427,8 @@ int __net_init __ip_vs_control_init(struct net *net)
 		INIT_LIST_HEAD(&ipvs->rs_table[idx]);
 	}
 	INIT_LIST_HEAD(&ipvs->dest_trash);
+	atomic_set(&ipvs->ftpsvc_counter, 0);
+	atomic_set(&ipvs->nullsvc_counter, 0);
 
 	/* procfs stats */
 	ipvs->ctl_stats = kzalloc(sizeof(struct ip_vs_stats), GFP_KERNEL);
-- 
1.7.2.3


^ permalink raw reply related

* [*v2 PATCH 18/22] IPVS: netns, defense work timer.
From: Hans Schillstrom @ 2010-12-13 13:38 UTC (permalink / raw)
  To: horms, ja, daniel.lezcano, wensong, lvs-devel, netdev,
	netfilter-devel
  Cc: hans, Hans Schillstrom
In-Reply-To: <1292247510-753-1-git-send-email-hans.schillstrom@ericsson.com>

This patch makes defense work timer per name-space,
A net ptr had to be added to the ipvs struct,
since it's needed by defense_work_handler.

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
---
 include/net/ip_vs.h             |    2 +-
 include/net/netns/ip_vs.h       |    4 +++-
 net/netfilter/ipvs/ip_vs_conn.c |    5 +++--
 net/netfilter/ipvs/ip_vs_core.c |    1 +
 net/netfilter/ipvs/ip_vs_ctl.c  |   20 +++++++++-----------
 5 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 9b3fd8f..057c954 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -861,7 +861,7 @@ extern const char * ip_vs_state_name(__u16 proto, int state);
 
 extern void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp);
 extern int ip_vs_check_template(struct ip_vs_conn *ct);
-extern void ip_vs_random_dropentry(void);
+extern void ip_vs_random_dropentry(struct net *net);
 extern int ip_vs_conn_init(void);
 extern void ip_vs_conn_cleanup(void);
 
diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h
index b4a40bc..eeb1dd1 100644
--- a/include/net/netns/ip_vs.h
+++ b/include/net/netns/ip_vs.h
@@ -68,9 +68,9 @@ struct netns_ipvs {
 	/* ip_vs_ctl */
 	struct ip_vs_stats 		 *ctl_stats; /* Statistics & estimator */
 	struct ip_vs_stats_user __percpu *ustats;    /* Statistics */
-
 	int 			num_services;    /* no of virtual services */
 	/* 1/rate drop and drop-entry variables */
+	struct delayed_work	defense_work;   /* Work handler */
 	int 			drop_rate;
 	int			drop_counter;
 	atomic_t 		dropentry;
@@ -130,6 +130,8 @@ struct netns_ipvs {
 	/* multicast interface name */
 	char 			master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
 	char 			backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
+	/* net name space ptr */
+	struct net 		*net;            /* Needed by timer routines */
 };
 
 #endif /* IP_VS_H_ */
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 3ccc33a..0200d97 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -1138,7 +1138,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
 }
 
 /* Called from keventd and must protect itself from softirqs */
-void ip_vs_random_dropentry(void)
+void ip_vs_random_dropentry(struct net *net)
 {
 	int idx;
 	struct ip_vs_conn *cp;
@@ -1158,7 +1158,8 @@ void ip_vs_random_dropentry(void)
 			if (cp->flags & IP_VS_CONN_F_TEMPLATE)
 				/* connection template */
 				continue;
-
+			if (!ip_vs_conn_net_eq(cp, net))
+				continue;
 			if (cp->protocol == IPPROTO_TCP) {
 				switch(cp->state) {
 				case IP_VS_TCP_S_SYN_RECV:
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 3a4dad2..34e4f71 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1858,6 +1858,7 @@ static int __net_init  __ip_vs_init(struct net *net)
 		pr_err("%s(): no memory.\n", __func__);
 		return -ENOMEM;
 	}
+	ipvs->net = net;
 	/* Incarnation counters used for creating unique names */
 	ipvs->inc = atomic_read(&ipvs_netns_cnt);
 	atomic_inc(&ipvs_netns_cnt);
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 6982298..747bc96 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -215,18 +215,16 @@ static void update_defense_level(struct netns_ipvs *ipvs)
  *	Timer for checking the defense
  */
 #define DEFENSE_TIMER_PERIOD	1*HZ
-static void defense_work_handler(struct work_struct *work);
-static DECLARE_DELAYED_WORK(defense_work, defense_work_handler);
 
 static void defense_work_handler(struct work_struct *work)
 {
-	struct netns_ipvs *ipvs = net_ipvs(&init_net);
+	struct netns_ipvs *ipvs =
+		container_of(work, struct netns_ipvs, defense_work.work);
 
 	update_defense_level(ipvs);
 	if (atomic_read(&ipvs->dropentry))
-		ip_vs_random_dropentry();
-
-	schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
+		ip_vs_random_dropentry(ipvs->net);
+	schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
 }
 
 int
@@ -3495,6 +3493,9 @@ int __net_init __ip_vs_control_init(struct net *net)
 		goto err_reg;
 	ip_vs_new_estimator(net, ipvs->ctl_stats);
 	ipvs->sysctl_tbl = tbl;
+	/* Schedule defense work */
+	INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
+	schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
 	return 0;
 
 err_reg:
@@ -3518,6 +3519,8 @@ static void __net_exit __ip_vs_control_cleanup(struct net *net)
 	unregister_net_sysctl_table(ipvs->sysctl_hdr);
 	proc_net_remove(net, "ip_vs_stats");
 	proc_net_remove(net, "ip_vs");
+	cancel_rearming_delayed_work(&ipvs->defense_work);
+	cancel_work_sync(&ipvs->defense_work.work);
 	free_percpu(ipvs->ustats);
 	kfree(ipvs->ctl_stats);
 }
@@ -3561,9 +3564,6 @@ int __init ip_vs_control_init(void)
 		goto err_net;
 	}
 
-	/* Hook the defense timer */
-	schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
-
 	LeaveFunction(2);
 	return 0;
 
@@ -3578,8 +3578,6 @@ void ip_vs_control_cleanup(void)
 {
 	EnterFunction(2);
 	ip_vs_trash_cleanup();
-	cancel_rearming_delayed_work(&defense_work);
-	cancel_work_sync(&defense_work.work);
 	unregister_pernet_subsys(&ipvs_control_ops);
 	ip_vs_genl_unregister();
 	nf_unregister_sockopt(&ip_vs_sockopts);
-- 
1.7.2.3


^ permalink raw reply related

* [*v2 PATCH 17/22] IPVS: netns, ip_vs_ctl local vars moved to ipvs struct.
From: Hans Schillstrom @ 2010-12-13 13:38 UTC (permalink / raw)
  To: horms, ja, daniel.lezcano, wensong, lvs-devel, netdev,
	netfilter-devel
  Cc: hans, Hans Schillstrom
In-Reply-To: <1292247510-753-1-git-send-email-hans.schillstrom@ericsson.com>

Moving global vars to ipvs struct, except for svc table lock.

Next patch for ctl will be drop-rate handling.

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
---
 include/net/ip_vs.h                   |   19 +--
 include/net/netns/ip_vs.h             |   42 ++++-
 net/netfilter/ipvs/ip_vs_conn.c       |    7 +-
 net/netfilter/ipvs/ip_vs_core.c       |   38 +++--
 net/netfilter/ipvs/ip_vs_ctl.c        |  329 +++++++++++++++++----------------
 net/netfilter/ipvs/ip_vs_proto_sctp.c |    2 +-
 net/netfilter/ipvs/ip_vs_proto_tcp.c  |    2 +-
 net/netfilter/ipvs/ip_vs_proto_udp.c  |    2 +-
 net/netfilter/ipvs/ip_vs_sync.c       |    9 +-
 9 files changed, 251 insertions(+), 199 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 446e417..9b3fd8f 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1007,13 +1007,6 @@ extern int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 /*
  *      IPVS control data and functions (from ip_vs_ctl.c)
  */
-extern int sysctl_ip_vs_cache_bypass;
-extern int sysctl_ip_vs_expire_nodest_conn;
-extern int sysctl_ip_vs_expire_quiescent_template;
-extern int sysctl_ip_vs_sync_threshold[2];
-extern int sysctl_ip_vs_nat_icmp_send;
-extern int sysctl_ip_vs_conntrack;
-extern int sysctl_ip_vs_snat_reroute;
 extern struct ip_vs_stats ip_vs_stats;
 extern const struct ctl_path net_vs_ctl_path[];
 extern int sysctl_ip_vs_sync_ver;
@@ -1103,11 +1096,11 @@ extern int ip_vs_icmp_xmit_v6
 extern int ip_vs_drop_rate;
 extern int ip_vs_drop_counter;
 
-static __inline__ int ip_vs_todrop(void)
+static __inline__ int ip_vs_todrop(struct netns_ipvs *ipvs)
 {
-	if (!ip_vs_drop_rate) return 0;
-	if (--ip_vs_drop_counter > 0) return 0;
-	ip_vs_drop_counter = ip_vs_drop_rate;
+	if (!ipvs->drop_rate) return 0;
+	if (--ipvs->drop_counter > 0) return 0;
+	ipvs->drop_counter = ipvs->drop_rate;
 	return 1;
 }
 
@@ -1195,9 +1188,9 @@ static inline void ip_vs_notrack(struct sk_buff *skb)
  *      Netfilter connection tracking
  *      (from ip_vs_nfct.c)
  */
-static inline int ip_vs_conntrack_enabled(void)
+static inline int ip_vs_conntrack_enabled(struct netns_ipvs *ipvs)
 {
-	return sysctl_ip_vs_conntrack;
+	return ipvs->sysctl_conntrack;
 }
 
 extern void ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp,
diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h
index f2d5bcd..b4a40bc 100644
--- a/include/net/netns/ip_vs.h
+++ b/include/net/netns/ip_vs.h
@@ -62,12 +62,46 @@ struct netns_ipvs {
 	struct list_head 	sctp_apps[SCTP_APP_TAB_SIZE];
 	spinlock_t		sctp_app_lock;
 #endif
+	/* ip_vs_conn */
+	atomic_t 		conn_count;      /*  connection counter */
+
 	/* ip_vs_ctl */
-	struct ip_vs_stats 		*ctl_stats; /* Statistics & estimator */
-	struct ip_vs_stats_user __percpu *ustats;   /* Statistics */
+	struct ip_vs_stats 		 *ctl_stats; /* Statistics & estimator */
+	struct ip_vs_stats_user __percpu *ustats;    /* Statistics */
+
+	int 			num_services;    /* no of virtual services */
+	/* 1/rate drop and drop-entry variables */
+	int 			drop_rate;
+	int			drop_counter;
+	atomic_t 		dropentry;
+	/* locks in ctl.c */
+	spinlock_t 		dropentry_lock;  /* drop entry handling */
+	spinlock_t		droppacket_lock; /* drop packet handling */
+	spinlock_t		securetcp_lock;  /* state and timeout tables */
+	rwlock_t		rs_lock;         /* real services table */
+	/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
+	struct mutex		ctl_mutex;
+	struct lock_class_key 	ctl_key;	/* ctl_mutex debuging */
+	/* sys-ctl struct */
+	struct ctl_table_header	*sysctl_hdr;
+	struct ctl_table	*sysctl_tbl;
+	/* sysctl variables */
+	int 			sysctl_amemthresh;
+	int 			sysctl_am_droprate;
+	int 			sysctl_drop_entry;
+	int 			sysctl_drop_packet;
+	int 			sysctl_secure_tcp;
+#ifdef CONFIG_IP_VS_NFCT
+	int 			sysctl_conntrack;
+#endif
+	int			sysctl_snat_reroute;
+	int			sysctl_sync_ver;
+	int 			sysctl_cache_bypass;
+	int 			sysctl_expire_nodest_conn;
+	int 			sysctl_expire_quiescent_template;
+	int 			sysctl_sync_threshold[2];
+	int 			sysctl_nat_icmp_send;
 
-	/* ip_vs_conn */
-	atomic_t 		conn_count;         /*  connection counter */
 	/* ip_vs_lblc */
 	int 			sysctl_lblc_expiration;
 	struct ctl_table_header	*lblc_ctl_header;
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 41f93cc..3ccc33a 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -686,13 +686,14 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
 int ip_vs_check_template(struct ip_vs_conn *ct)
 {
 	struct ip_vs_dest *dest = ct->dest;
+	struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(ct));
 
 	/*
 	 * Checking the dest server status.
 	 */
 	if ((dest == NULL) ||
 	    !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
-	    (sysctl_ip_vs_expire_quiescent_template &&
+	    (ipvs->sysctl_expire_quiescent_template &&
 	     (atomic_read(&dest->weight) == 0))) {
 		IP_VS_DBG_BUF(9, "check_template: dest not available for "
 			      "protocol %s s:%s:%d v:%s:%d "
@@ -879,7 +880,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
 	 * IP_VS_CONN_F_ONE_PACKET too.
 	 */
 
-	if (ip_vs_conntrack_enabled())
+	if (ip_vs_conntrack_enabled(ipvs))
 		cp->flags |= IP_VS_CONN_F_NFCT;
 
 	/* Hash it in the ip_vs_conn_tab finally */
@@ -1198,7 +1199,7 @@ static void ip_vs_conn_flush(struct net *net)
 	struct ip_vs_conn *cp;
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
-  flush_again:
+flush_again:
 	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
 		/*
 		 *  Lock is actually needed in this loop.
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 69ba8cc..3a4dad2 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -486,6 +486,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 		struct ip_vs_protocol *pp)
 {
+	struct netns_ipvs *ipvs;
 	__be16 _ports[2], *pptr;
 	struct ip_vs_iphdr iph;
 	int unicast;
@@ -507,7 +508,8 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 	/* if it is fwmark-based service, the cache_bypass sysctl is up
 	   and the destination is a non-local unicast, then create
 	   a cache_bypass connection entry */
-	if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
+	ipvs = net_ipvs(skb_net(skb));
+	if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) {
 		int ret, cs;
 		struct ip_vs_conn *cp;
 		unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
@@ -719,6 +721,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
 				struct ip_vs_protocol *pp,
 				unsigned int offset, unsigned int ihl)
 {
+	struct netns_ipvs *ipvs;
 	unsigned int verdict = NF_DROP;
 
 	if (IP_VS_FWD_METHOD(cp) != 0) {
@@ -740,6 +743,8 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
 	if (!skb_make_writable(skb, offset))
 		goto out;
 
+	ipvs = net_ipvs(skb_net(skb));
+
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6)
 		ip_vs_nat_icmp_v6(skb, pp, cp, 1);
@@ -749,11 +754,11 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6) {
-		if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
+		if (ipvs->sysctl_snat_reroute && ip6_route_me_harder(skb) != 0)
 			goto out;
 	} else
 #endif
-		if ((sysctl_ip_vs_snat_reroute ||
+		if ((ipvs->sysctl_snat_reroute ||
 		     skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
 		    ip_route_me_harder(skb, RTN_LOCAL) != 0)
 			goto out;
@@ -964,6 +969,8 @@ static unsigned int
 handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		struct ip_vs_conn *cp, int ihl)
 {
+	struct netns_ipvs *ipvs;
+
 	IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
 
 	if (!skb_make_writable(skb, ihl))
@@ -998,13 +1005,15 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 	 * if it came from this machine itself.  So re-compute
 	 * the routing information.
 	 */
+	ipvs = net_ipvs(skb_net(skb));
+
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6) {
-		if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
+		if (ipvs->sysctl_snat_reroute && ip6_route_me_harder(skb) != 0)
 			goto drop;
 	} else
 #endif
-		if ((sysctl_ip_vs_snat_reroute ||
+		if ((ipvs->sysctl_snat_reroute ||
 		     skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
 		    ip_route_me_harder(skb, RTN_LOCAL) != 0)
 			goto drop;
@@ -1040,9 +1049,9 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
 	struct ip_vs_iphdr iph;
 	struct ip_vs_protocol *pp;
 	struct ip_vs_conn *cp;
+	struct netns_ipvs *ipvs;
 
 	EnterFunction(11);
-	net = skb_net(skb);
 
 	/* Already marked as IPVS request or reply? */
 	if (skb->ipvs_property)
@@ -1112,11 +1121,14 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
 	/*
 	 * Check if the packet belongs to an existing entry
 	 */
+	net = skb_net(skb);
+	ipvs = net_ipvs(net);
+
 	cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
 
 	if (likely(cp))
 		return handle_response(af, skb, pp, cp, iph.len);
-	if (sysctl_ip_vs_nat_icmp_send &&
+	if (ipvs->sysctl_nat_icmp_send &&
 	    (pp->protocol == IPPROTO_TCP ||
 	     pp->protocol == IPPROTO_UDP ||
 	     pp->protocol == IPPROTO_SCTP)) {
@@ -1552,7 +1564,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
 		/* the destination server is not available */
 
-		if (sysctl_ip_vs_expire_nodest_conn) {
+		if (ipvs->sysctl_expire_nodest_conn) {
 			/* try to expire the connection immediately */
 			ip_vs_conn_expire_now(cp);
 		}
@@ -1582,15 +1594,15 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	 */
 
 	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
-		pkts = sysctl_ip_vs_sync_threshold[0];
+		pkts = ipvs->sysctl_sync_threshold[0];
 	else
 		pkts = atomic_add_return(1, &cp->in_pkts);
 
 	if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
 	    cp->protocol == IPPROTO_SCTP) {
 		if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
-			(pkts % sysctl_ip_vs_sync_threshold[1]
-			 == sysctl_ip_vs_sync_threshold[0])) ||
+			(pkts % ipvs->sysctl_sync_threshold[1]
+			 == ipvs->sysctl_sync_threshold[0])) ||
 				(cp->old_state != cp->state &&
 				 ((cp->state == IP_VS_SCTP_S_CLOSED) ||
 				  (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) ||
@@ -1604,8 +1616,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	else if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
 	    (((cp->protocol != IPPROTO_TCP ||
 	       cp->state == IP_VS_TCP_S_ESTABLISHED) &&
-	      (pkts % sysctl_ip_vs_sync_threshold[1]
-	       == sysctl_ip_vs_sync_threshold[0])) ||
+	      (pkts % ipvs->sysctl_sync_threshold[1]
+	       == ipvs->sysctl_sync_threshold[0])) ||
 	     ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
 	      ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
 	       (cp->state == IP_VS_TCP_S_CLOSE) ||
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index ca7bf30..6982298 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -52,48 +52,10 @@
 
 #include <net/ip_vs.h>
 
-/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
-static DEFINE_MUTEX(__ip_vs_mutex);
-
 /* lock for service table */
 static DEFINE_RWLOCK(__ip_vs_svc_lock);
 
-/* lock for table with the real services */
-static DEFINE_RWLOCK(__ip_vs_rs_lock);
-
-/* lock for state and timeout tables */
-static DEFINE_SPINLOCK(ip_vs_securetcp_lock);
-
-/* lock for drop entry handling */
-static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
-
-/* lock for drop packet handling */
-static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
-
-/* 1/rate drop and drop-entry variables */
-int ip_vs_drop_rate = 0;
-int ip_vs_drop_counter = 0;
-static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
-
-/* number of virtual services */
-static int ip_vs_num_services = 0;
-
 /* sysctl variables */
-static int sysctl_ip_vs_drop_entry = 0;
-static int sysctl_ip_vs_drop_packet = 0;
-static int sysctl_ip_vs_secure_tcp = 0;
-static int sysctl_ip_vs_amemthresh = 1024;
-static int sysctl_ip_vs_am_droprate = 10;
-int sysctl_ip_vs_cache_bypass = 0;
-int sysctl_ip_vs_expire_nodest_conn = 0;
-int sysctl_ip_vs_expire_quiescent_template = 0;
-int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
-int sysctl_ip_vs_nat_icmp_send = 0;
-#ifdef CONFIG_IP_VS_NFCT
-int sysctl_ip_vs_conntrack;
-#endif
-int sysctl_ip_vs_snat_reroute = 1;
-int sysctl_ip_vs_sync_ver = 1;		/* Default version of sync proto */
 
 #ifdef CONFIG_IP_VS_DEBUG
 static int sysctl_ip_vs_debug_level = 0;
@@ -144,73 +106,73 @@ static void update_defense_level(struct netns_ipvs *ipvs)
 	/* si_swapinfo(&i); */
 	/* availmem = availmem - (i.totalswap - i.freeswap); */
 
-	nomem = (availmem < sysctl_ip_vs_amemthresh);
+	nomem = (availmem < ipvs->sysctl_amemthresh);
 
 	local_bh_disable();
 
 	/* drop_entry */
-	spin_lock(&__ip_vs_dropentry_lock);
-	switch (sysctl_ip_vs_drop_entry) {
+	spin_lock(&ipvs->dropentry_lock);
+	switch (ipvs->sysctl_drop_entry) {
 	case 0:
-		atomic_set(&ip_vs_dropentry, 0);
+		atomic_set(&ipvs->dropentry, 0);
 		break;
 	case 1:
 		if (nomem) {
-			atomic_set(&ip_vs_dropentry, 1);
-			sysctl_ip_vs_drop_entry = 2;
+			atomic_set(&ipvs->dropentry, 1);
+			ipvs->sysctl_drop_entry = 2;
 		} else {
-			atomic_set(&ip_vs_dropentry, 0);
+			atomic_set(&ipvs->dropentry, 0);
 		}
 		break;
 	case 2:
 		if (nomem) {
-			atomic_set(&ip_vs_dropentry, 1);
+			atomic_set(&ipvs->dropentry, 1);
 		} else {
-			atomic_set(&ip_vs_dropentry, 0);
-			sysctl_ip_vs_drop_entry = 1;
+			atomic_set(&ipvs->dropentry, 0);
+			ipvs->sysctl_drop_entry = 1;
 		};
 		break;
 	case 3:
-		atomic_set(&ip_vs_dropentry, 1);
+		atomic_set(&ipvs->dropentry, 1);
 		break;
 	}
-	spin_unlock(&__ip_vs_dropentry_lock);
+	spin_unlock(&ipvs->dropentry_lock);
 
 	/* drop_packet */
-	spin_lock(&__ip_vs_droppacket_lock);
-	switch (sysctl_ip_vs_drop_packet) {
+	spin_lock(&ipvs->droppacket_lock);
+	switch (ipvs->sysctl_drop_packet) {
 	case 0:
-		ip_vs_drop_rate = 0;
+		ipvs->drop_rate = 0;
 		break;
 	case 1:
 		if (nomem) {
-			ip_vs_drop_rate = ip_vs_drop_counter
-				= sysctl_ip_vs_amemthresh /
-				(sysctl_ip_vs_amemthresh-availmem);
-			sysctl_ip_vs_drop_packet = 2;
+			ipvs->drop_rate = ipvs->drop_counter
+				= ipvs->sysctl_amemthresh /
+				(ipvs->sysctl_amemthresh-availmem);
+			ipvs->sysctl_drop_packet = 2;
 		} else {
-			ip_vs_drop_rate = 0;
+			ipvs->drop_rate = 0;
 		}
 		break;
 	case 2:
 		if (nomem) {
-			ip_vs_drop_rate = ip_vs_drop_counter
-				= sysctl_ip_vs_amemthresh /
-				(sysctl_ip_vs_amemthresh-availmem);
+			ipvs->drop_rate = ipvs->drop_counter
+				= ipvs->sysctl_amemthresh /
+				(ipvs->sysctl_amemthresh-availmem);
 		} else {
-			ip_vs_drop_rate = 0;
-			sysctl_ip_vs_drop_packet = 1;
+			ipvs->drop_rate = 0;
+			ipvs->sysctl_drop_packet = 1;
 		}
 		break;
 	case 3:
-		ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
+		ipvs->drop_rate = ipvs->sysctl_am_droprate;
 		break;
 	}
-	spin_unlock(&__ip_vs_droppacket_lock);
+	spin_unlock(&ipvs->droppacket_lock);
 
 	/* secure_tcp */
-	spin_lock(&ip_vs_securetcp_lock);
-	switch (sysctl_ip_vs_secure_tcp) {
+	spin_lock(&ipvs->securetcp_lock);
+	switch (ipvs->sysctl_secure_tcp) {
 	case 0:
 		if (old_secure_tcp >= 2)
 			to_change = 0;
@@ -219,7 +181,7 @@ static void update_defense_level(struct netns_ipvs *ipvs)
 		if (nomem) {
 			if (old_secure_tcp < 2)
 				to_change = 1;
-			sysctl_ip_vs_secure_tcp = 2;
+			ipvs->sysctl_secure_tcp = 2;
 		} else {
 			if (old_secure_tcp >= 2)
 				to_change = 0;
@@ -232,7 +194,7 @@ static void update_defense_level(struct netns_ipvs *ipvs)
 		} else {
 			if (old_secure_tcp >= 2)
 				to_change = 0;
-			sysctl_ip_vs_secure_tcp = 1;
+			ipvs->sysctl_secure_tcp = 1;
 		}
 		break;
 	case 3:
@@ -240,10 +202,10 @@ static void update_defense_level(struct netns_ipvs *ipvs)
 			to_change = 1;
 		break;
 	}
-	old_secure_tcp = sysctl_ip_vs_secure_tcp;
+	old_secure_tcp = ipvs->sysctl_secure_tcp;
 	if (to_change >= 0)
-		ip_vs_protocol_timeout_change(ipvs, sysctl_ip_vs_secure_tcp>1);
-	spin_unlock(&ip_vs_securetcp_lock);
+		ip_vs_protocol_timeout_change(ipvs, ipvs->sysctl_secure_tcp>1);
+	spin_unlock(&ipvs->securetcp_lock);
 
 	local_bh_enable();
 }
@@ -261,7 +223,7 @@ static void defense_work_handler(struct work_struct *work)
 	struct netns_ipvs *ipvs = net_ipvs(&init_net);
 
 	update_defense_level(ipvs);
-	if (atomic_read(&ip_vs_dropentry))
+	if (atomic_read(&ipvs->dropentry))
 		ip_vs_random_dropentry();
 
 	schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
@@ -600,7 +562,7 @@ ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
 	 */
 	hash = ip_vs_rs_hashkey(af, daddr, dport);
 
-	read_lock(&__ip_vs_rs_lock);
+	read_lock(&ipvs->rs_lock);
 	list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) {
 		if ((dest->af == af)
 		    && ip_vs_addr_equal(af, &dest->addr, daddr)
@@ -608,11 +570,11 @@ ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
 		    && ((dest->protocol == protocol) ||
 			dest->vfwmark)) {
 			/* HIT */
-			read_unlock(&__ip_vs_rs_lock);
+			read_unlock(&ipvs->rs_lock);
 			return dest;
 		}
 	}
-	read_unlock(&__ip_vs_rs_lock);
+	read_unlock(&ipvs->rs_lock);
 
 	return NULL;
 }
@@ -784,9 +746,9 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
 		 *    Put the real service in rs_table if not present.
 		 *    For now only for NAT!
 		 */
-		write_lock_bh(&__ip_vs_rs_lock);
+		write_lock_bh(&ipvs->rs_lock);
 		ip_vs_rs_hash(ipvs, dest);
-		write_unlock_bh(&__ip_vs_rs_lock);
+		write_unlock_bh(&ipvs->rs_lock);
 	}
 	atomic_set(&dest->conn_flags, conn_flags);
 
@@ -1009,14 +971,16 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
  */
 static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
 {
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
 	ip_vs_kill_estimator(net, &dest->stats);
 
 	/*
 	 *  Remove it from the d-linked list with the real services.
 	 */
-	write_lock_bh(&__ip_vs_rs_lock);
+	write_lock_bh(&ipvs->rs_lock);
 	ip_vs_rs_unhash(dest);
-	write_unlock_bh(&__ip_vs_rs_lock);
+	write_unlock_bh(&ipvs->rs_lock);
 
 	/*
 	 *  Decrease the refcnt of the dest, and free the dest
@@ -1078,7 +1042,6 @@ static int
 ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 {
 	struct ip_vs_dest *dest;
-	struct net *net = svc->net;
 	__be16 dport = udest->port;
 
 	EnterFunction(2);
@@ -1107,7 +1070,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 	/*
 	 *	Delete the destination
 	 */
-	__ip_vs_del_dest(net, dest);
+	__ip_vs_del_dest(svc->net, dest);
 
 	LeaveFunction(2);
 
@@ -1126,6 +1089,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
 	struct ip_vs_scheduler *sched = NULL;
 	struct ip_vs_pe *pe = NULL;
 	struct ip_vs_service *svc = NULL;
+	struct netns_ipvs *ipvs = net_ipvs(net);
 
 	/* increase the module use count */
 	ip_vs_use_count_inc();
@@ -1200,7 +1164,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
 
 	/* Count only IPv4 services for old get/setsockopt interface */
 	if (svc->af == AF_INET)
-		ip_vs_num_services++;
+		ipvs->num_services++;
 
 	/* Hash the service into the service table */
 	write_lock_bh(&__ip_vs_svc_lock);
@@ -1337,12 +1301,13 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
 	struct ip_vs_dest *dest, *nxt;
 	struct ip_vs_scheduler *old_sched;
 	struct ip_vs_pe *old_pe;
+	struct netns_ipvs *ipvs = net_ipvs(svc->net);
 
 	pr_info("%s: enter\n", __func__);
 
 	/* Count only IPv4 services for old get/setsockopt interface */
 	if (svc->af == AF_INET)
-		ip_vs_num_services--;
+		ipvs->num_services--;
 
 	ip_vs_kill_estimator(svc->net, &svc->stats);
 
@@ -1566,42 +1531,31 @@ proc_do_sync_mode(ctl_table *table, int write,
 
 /*
  *	IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
+ *	Do not change order or insert new entries without
+ *	align with netns init in __ip_vs_control_init()
  */
 
 static struct ctl_table vs_vars[] = {
 	{
 		.procname	= "amemthresh",
-		.data		= &sysctl_ip_vs_amemthresh,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
-#ifdef CONFIG_IP_VS_DEBUG
-	{
-		.procname	= "debug_level",
-		.data		= &sysctl_ip_vs_debug_level,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-#endif
 	{
 		.procname	= "am_droprate",
-		.data		= &sysctl_ip_vs_am_droprate,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "drop_entry",
-		.data		= &sysctl_ip_vs_drop_entry,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_do_defense_mode,
 	},
 	{
 		.procname	= "drop_packet",
-		.data		= &sysctl_ip_vs_drop_packet,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_do_defense_mode,
@@ -1609,7 +1563,6 @@ static struct ctl_table vs_vars[] = {
 #ifdef CONFIG_IP_VS_NFCT
 	{
 		.procname	= "conntrack",
-		.data		= &sysctl_ip_vs_conntrack,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
@@ -1617,25 +1570,61 @@ static struct ctl_table vs_vars[] = {
 #endif
 	{
 		.procname	= "secure_tcp",
-		.data		= &sysctl_ip_vs_secure_tcp,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_do_defense_mode,
 	},
 	{
 		.procname	= "snat_reroute",
-		.data		= &sysctl_ip_vs_snat_reroute,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
 	{
 		.procname	= "sync_version",
-		.data		= &sysctl_ip_vs_sync_ver,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_do_sync_mode,
 	},
+	{
+		.procname	= "cache_bypass",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "expire_nodest_conn",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "expire_quiescent_template",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "sync_threshold",
+		.maxlen		= sizeof(((struct netns_ipvs*)0)->sysctl_sync_threshold),
+		.mode		= 0644,
+		.proc_handler	= proc_do_sync_threshold,
+	},
+	{
+		.procname	= "nat_icmp_send",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#ifdef CONFIG_IP_VS_DEBUG
+	{
+		.procname	= "debug_level",
+		.data		= &sysctl_ip_vs_debug_level,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif
 #if 0
 	{
 		.procname	= "timeout_established",
@@ -1722,41 +1711,6 @@ static struct ctl_table vs_vars[] = {
 		.proc_handler	= proc_dointvec_jiffies,
 	},
 #endif
-	{
-		.procname	= "cache_bypass",
-		.data		= &sysctl_ip_vs_cache_bypass,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-	{
-		.procname	= "expire_nodest_conn",
-		.data		= &sysctl_ip_vs_expire_nodest_conn,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-	{
-		.procname	= "expire_quiescent_template",
-		.data		= &sysctl_ip_vs_expire_quiescent_template,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-	{
-		.procname	= "sync_threshold",
-		.data		= &sysctl_ip_vs_sync_threshold,
-		.maxlen		= sizeof(sysctl_ip_vs_sync_threshold),
-		.mode		= 0644,
-		.proc_handler	= proc_do_sync_threshold,
-	},
-	{
-		.procname	= "nat_icmp_send",
-		.data		= &sysctl_ip_vs_nat_icmp_send,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 	{ }
 };
 
@@ -1768,8 +1722,6 @@ const struct ctl_path net_vs_ctl_path[] = {
 };
 EXPORT_SYMBOL_GPL(net_vs_ctl_path);
 
-static struct ctl_table_header * sysctl_header;
-
 #ifdef CONFIG_PROC_FS
 
 struct ip_vs_iter {
@@ -2144,6 +2096,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 	struct ip_vs_service *svc;
 	struct ip_vs_dest_user *udest_compat;
 	struct ip_vs_dest_user_kern udest;
+	struct netns_ipvs *ipvs = net_ipvs(net);
 
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
@@ -2164,7 +2117,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 	/* increase the module use count */
 	ip_vs_use_count_inc();
 
-	if (mutex_lock_interruptible(&__ip_vs_mutex)) {
+	if (mutex_lock_interruptible(&ipvs->ctl_mutex)) {
 		ret = -ERESTARTSYS;
 		goto out_dec;
 	}
@@ -2259,7 +2212,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 	}
 
   out_unlock:
-	mutex_unlock(&__ip_vs_mutex);
+	mutex_unlock(&ipvs->ctl_mutex);
   out_dec:
 	/* decrease the module use count */
 	ip_vs_use_count_dec();
@@ -2452,7 +2405,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 	if (copy_from_user(arg, user, copylen) != 0)
 		return -EFAULT;
 
-	if (mutex_lock_interruptible(&__ip_vs_mutex))
+	if (mutex_lock_interruptible(&ipvs->ctl_mutex))
 		return -ERESTARTSYS;
 
 	switch (cmd) {
@@ -2475,7 +2428,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 		struct ip_vs_getinfo info;
 		info.version = IP_VS_VERSION_CODE;
 		info.size = ip_vs_conn_tab_size;
-		info.num_services = ip_vs_num_services;
+		info.num_services = ipvs->num_services;
 		if (copy_to_user(user, &info, sizeof(info)) != 0)
 			ret = -EFAULT;
 	}
@@ -2574,7 +2527,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 	}
 
   out:
-	mutex_unlock(&__ip_vs_mutex);
+	mutex_unlock(&ipvs->ctl_mutex);
 	return ret;
 }
 
@@ -2756,8 +2709,9 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
 	int start = cb->args[0];
 	struct ip_vs_service *svc;
 	struct net *net = skb_sknet(skb);
+	struct netns_ipvs *ipvs = net_ipvs(skb_sknet(skb));
 
-	mutex_lock(&__ip_vs_mutex);
+	mutex_lock(&ipvs->ctl_mutex);
 	for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
 		list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
 			if (++idx <= start || !net_eq(svc->net, net))
@@ -2781,7 +2735,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
 	}
 
 nla_put_failure:
-	mutex_unlock(&__ip_vs_mutex);
+	mutex_unlock(&ipvs->ctl_mutex);
 	cb->args[0] = idx;
 
 	return skb->len;
@@ -2945,16 +2899,18 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
 	struct ip_vs_service *svc;
 	struct ip_vs_dest *dest;
 	struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
-	struct net *net;
+	struct net *net = skb_sknet(skb);
+	struct netns_ipvs *ipvs = net_ipvs(net);
 
-	mutex_lock(&__ip_vs_mutex);
+
+	mutex_lock(&ipvs->ctl_mutex);
 
 	/* Try to find the service for which to dump destinations */
 	if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
 			IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
 		goto out_err;
 
-	net = skb_sknet(skb);
+
 	svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
 	if (IS_ERR(svc) || svc == NULL)
 		goto out_err;
@@ -2973,7 +2929,7 @@ nla_put_failure:
 	cb->args[0] = idx;
 
 out_err:
-	mutex_unlock(&__ip_vs_mutex);
+	mutex_unlock(&ipvs->ctl_mutex);
 
 	return skb->len;
 }
@@ -3072,7 +3028,7 @@ static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
 	struct net *net = skb_net(skb);
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
-	mutex_lock(&__ip_vs_mutex);
+	mutex_lock(&ipvs->ctl_mutex);
 	if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
 		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
 					   ipvs->master_mcast_ifn,
@@ -3092,7 +3048,7 @@ static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
 	}
 
 nla_put_failure:
-	mutex_unlock(&__ip_vs_mutex);
+	mutex_unlock(&ipvs->ctl_mutex);
 
 	return skb->len;
 }
@@ -3146,11 +3102,13 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
 	int ret = 0, cmd;
 	int need_full_svc = 0, need_full_dest = 0;
 	struct net *net;
+	struct netns_ipvs *ipvs;
 
 	net = skb_sknet(skb);
+	ipvs = net_ipvs(net);
 	cmd = info->genlhdr->cmd;
 
-	mutex_lock(&__ip_vs_mutex);
+	mutex_lock(&ipvs->ctl_mutex);
 
 	if (cmd == IPVS_CMD_FLUSH) {
 		ret = ip_vs_flush(net);
@@ -3246,7 +3204,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
 	}
 
 out:
-	mutex_unlock(&__ip_vs_mutex);
+	mutex_unlock(&ipvs->ctl_mutex);
 
 	return ret;
 }
@@ -3257,8 +3215,10 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
 	void *reply;
 	int ret, cmd, reply_cmd;
 	struct net *net;
+	struct netns_ipvs *ipvs;
 
 	net = skb_sknet(skb);
+	ipvs = net_ipvs(net);
 	cmd = info->genlhdr->cmd;
 
 	if (cmd == IPVS_CMD_GET_SERVICE)
@@ -3276,7 +3236,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
 	if (!msg)
 		return -ENOMEM;
 
-	mutex_lock(&__ip_vs_mutex);
+	mutex_lock(&ipvs->ctl_mutex);
 
 	reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
 	if (reply == NULL)
@@ -3339,7 +3299,7 @@ nla_put_failure:
 out_err:
 	nlmsg_free(msg);
 out:
-	mutex_unlock(&__ip_vs_mutex);
+	mutex_unlock(&ipvs->ctl_mutex);
 
 	return ret;
 }
@@ -3461,9 +3421,23 @@ int __net_init __ip_vs_control_init(struct net *net)
 {
 	int idx;
 	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ctl_table *tbl;
 
 	if (!net_eq(net, &init_net))	/* netns not enabled yet */
 		return -EPERM;
+
+	atomic_set(&ipvs->dropentry, 0);
+	spin_lock_init(&ipvs->dropentry_lock);
+	spin_lock_init(&ipvs->droppacket_lock);
+	spin_lock_init(&ipvs->securetcp_lock);
+	ipvs->rs_lock = __RW_LOCK_UNLOCKED(ipvs->rs_lock);
+	__mutex_init(&ipvs->ctl_mutex,"ipvs->ctl_mutex", &ipvs->ctl_key);
+
+	/* Initialize rs_table */
+	for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
+		INIT_LIST_HEAD(&ipvs->rs_table[idx]);
+	}
+
 	/* procfs stats */
 	ipvs->ctl_stats = kzalloc(sizeof(struct ip_vs_stats), GFP_KERNEL);
 	if (ipvs->ctl_stats == NULL) {
@@ -3480,16 +3454,53 @@ int __net_init __ip_vs_control_init(struct net *net)
 	for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
 		INIT_LIST_HEAD(&ipvs->rs_table[idx]);
 	}
-
 	proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops);
 	proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops);
-	sysctl_header = register_net_sysctl_table(net, net_vs_ctl_path, vs_vars);
-	if (sysctl_header == NULL)
+
+	if (!net_eq(net, &init_net)) {
+		tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
+		if (tbl == NULL)
+			goto err_dup;
+	} else
+		tbl = vs_vars;
+	/* Initialize sysctl defaults */
+	idx = 0;
+	ipvs->sysctl_amemthresh = 1024;
+	tbl[idx++].data = &ipvs->sysctl_amemthresh;
+	ipvs->sysctl_am_droprate = 10;
+	tbl[idx++].data = &ipvs->sysctl_am_droprate;
+	tbl[idx++].data = &ipvs->sysctl_drop_entry;
+	tbl[idx++].data = &ipvs->sysctl_drop_packet;
+#ifdef CONFIG_IP_VS_NFCT
+	tbl[idx++].data = &ipvs->sysctl_conntrack;
+#endif
+	tbl[idx++].data = &ipvs->sysctl_secure_tcp;
+	ipvs->sysctl_snat_reroute = 1;
+	tbl[idx++].data = &ipvs->sysctl_snat_reroute;
+	ipvs->sysctl_sync_ver = 1;
+	tbl[idx++].data = &ipvs->sysctl_sync_ver;
+	tbl[idx++].data = &ipvs->sysctl_cache_bypass;
+	tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
+	tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
+	ipvs->sysctl_sync_threshold[0] = 3;
+	ipvs->sysctl_sync_threshold[1] = 50;
+	tbl[idx].data = &ipvs->sysctl_sync_threshold;
+	tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
+	tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
+
+
+	ipvs->sysctl_hdr = register_net_sysctl_table(net, net_vs_ctl_path,
+						  vs_vars);
+	if (ipvs->sysctl_hdr == NULL)
 		goto err_reg;
 	ip_vs_new_estimator(net, ipvs->ctl_stats);
+	ipvs->sysctl_tbl = tbl;
 	return 0;
 
 err_reg:
+	if (!net_eq(net, &init_net))
+		kfree(tbl);
+err_dup:
 	free_percpu(ipvs->ustats);
 err_alloc:
 	kfree(ipvs->ctl_stats);
@@ -3504,7 +3515,7 @@ static void __net_exit __ip_vs_control_cleanup(struct net *net)
 		return;
 
 	ip_vs_kill_estimator(net, ipvs->ctl_stats);
-	unregister_net_sysctl_table(sysctl_header);
+	unregister_net_sysctl_table(ipvs->sysctl_hdr);
 	proc_net_remove(net, "ip_vs_stats");
 	proc_net_remove(net, "ip_vs");
 	free_percpu(ipvs->ustats);
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index c011830..6ab5b84 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -34,7 +34,7 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 				     &iph.daddr, sh->dest))) {
 		int ignored;
 
-		if (ip_vs_todrop()) {
+		if (ip_vs_todrop(net_ipvs(net))) {
 			/*
 			 * It seems that we are very loaded.
 			 * We have to drop this packet :(
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 2e270b6..440918e 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -54,7 +54,7 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 				     &iph.daddr, th->dest))) {
 		int ignored;
 
-		if (ip_vs_todrop()) {
+		if (ip_vs_todrop(net_ipvs(net))) {
 			/*
 			 * It seems that we are very loaded.
 			 * We have to drop this packet :(
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 882a547..597edf0 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -50,7 +50,7 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 	if (svc) {
 		int ignored;
 
-		if (ip_vs_todrop()) {
+		if (ip_vs_todrop(net_ipvs(net))) {
 			/*
 			 * It seems that we are very loaded.
 			 * We have to drop this packet :(
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 2887c34..25b4729 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -393,7 +393,7 @@ void ip_vs_sync_switch_mode(struct net *net, int mode)
 
 	if (!ipvs->sync_state & IP_VS_STATE_MASTER)
 		return;
-	if (mode == sysctl_ip_vs_sync_ver || !ipvs->sync_buff)
+	if (mode == ipvs->sysctl_sync_ver || !ipvs->sync_buff)
 		return;
 
 	spin_lock_bh(&ipvs->sync_buff_lock);
@@ -519,7 +519,7 @@ void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp)
 	unsigned int len, pe_name_len, pad;
 
 	/* Handle old version of the protocol */
-	if (sysctl_ip_vs_sync_ver == 0) {
+	if (ipvs->sysctl_sync_ver == 0) {
 		ip_vs_sync_conn_v0(net, cp);
 		return;
 	}
@@ -647,7 +647,7 @@ control:
 	if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
 		int pkts = atomic_add_return(1, &cp->in_pkts);
 
-		if (pkts % sysctl_ip_vs_sync_threshold[1] != 1)
+		if (pkts % ipvs->sysctl_sync_threshold[1] != 1)
 			return;
 	}
 	goto sloop;
@@ -721,6 +721,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
 {
 	struct ip_vs_dest *dest;
 	struct ip_vs_conn *cp;
+	struct netns_ipvs *ipvs = net_ipvs(net);
 
 	if (!(flags & IP_VS_CONN_F_TEMPLATE))
 		cp = ip_vs_conn_in_get(param);
@@ -791,7 +792,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
 
 	if (opt)
 		memcpy(&cp->in_seq, opt, sizeof(*opt));
-	atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
+	atomic_set(&cp->in_pkts, ipvs->sysctl_sync_threshold[0]);
 	cp->state = state;
 	cp->old_state = cp->state;
 	/*
-- 
1.7.2.3


^ permalink raw reply related

* [*v2 PATCH 16/22] IPVS: netns, connection hash got net as param.
From: Hans Schillstrom @ 2010-12-13 13:38 UTC (permalink / raw)
  To: horms, ja, daniel.lezcano, wensong, lvs-devel, netdev,
	netfilter-devel
  Cc: hans, Hans Schillstrom
In-Reply-To: <1292247510-753-1-git-send-email-hans.schillstrom@ericsson.com>

Connection hash table is now name space aware.
i.e. net ptr >> 8 is xor:ed to the hash,
and this is the first param to be compared.
The net struct is 0xa40 in size ( a little bit smaller for 32 bit arch:s)
and cache-line aligned, so a ptr >> 5 might be a more clever solution ?

All lookups where net is compared uses net_eq() which returns 1 when netns
is disabled, and the compiler seems to do something clever in that case.

ip_vs_conn_fill_param() have *net as first param now.

Three new inlines added to keep conn struct smaller
when names space is disabled.
- ip_vs_conn_net()
- ip_vs_conn_net_set()
- ip_vs_conn_net_eq()

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
---
 include/net/ip_vs.h                     |   54 ++++++++---
 include/net/netns/ip_vs.h               |    2 +
 net/netfilter/ipvs/ip_vs_conn.c         |  165 ++++++++++++++++++-------------
 net/netfilter/ipvs/ip_vs_core.c         |   15 ++-
 net/netfilter/ipvs/ip_vs_ftp.c          |   14 ++-
 net/netfilter/ipvs/ip_vs_nfct.c         |    6 +-
 net/netfilter/ipvs/ip_vs_proto_ah_esp.c |   15 ++-
 net/netfilter/ipvs/ip_vs_proto_sctp.c   |    2 +-
 net/netfilter/ipvs/ip_vs_proto_tcp.c    |    2 +-
 net/netfilter/ipvs/ip_vs_proto_udp.c    |    2 +-
 net/netfilter/ipvs/ip_vs_sync.c         |   13 +--
 11 files changed, 177 insertions(+), 113 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 848fcda..446e417 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -105,7 +105,6 @@ static inline struct net *seq_file_single_net(struct seq_file *seq)
 #endif
 }
 
-
 /* Connections' size value needed by ip_vs_ctl.c */
 extern int ip_vs_conn_tab_size;
 
@@ -460,6 +459,7 @@ extern struct ip_vs_proto_data * ip_vs_proto_data_get(struct net *net,
 						      unsigned short proto);
 
 struct ip_vs_conn_param {
+	struct net *			net;
 	const union nf_inet_addr	*caddr;
 	const union nf_inet_addr	*vaddr;
 	__be16				cport;
@@ -477,17 +477,19 @@ struct ip_vs_conn_param {
  */
 struct ip_vs_conn {
 	struct list_head        c_list;         /* hashed list heads */
-
+#ifdef CONFIG_NET_NS
+	struct net              *net;           /* Name space */
+#endif
 	/* Protocol, addresses and port numbers */
-	u16                      af;		/* address family */
-	union nf_inet_addr       caddr;          /* client address */
-	union nf_inet_addr       vaddr;          /* virtual address */
-	union nf_inet_addr       daddr;          /* destination address */
-	volatile __u32           flags;          /* status flags */
-	__u32                    fwmark;         /* Fire wall mark from skb */
-	__be16                   cport;
-	__be16                   vport;
-	__be16                   dport;
+	u16                     af;             /* address family */
+	__be16                  cport;
+	__be16                  vport;
+	__be16                  dport;
+	__u32                   fwmark;         /* Fire wall mark from skb */
+	union nf_inet_addr      caddr;          /* client address */
+	union nf_inet_addr      vaddr;          /* virtual address */
+	union nf_inet_addr      daddr;          /* destination address */
+	volatile __u32          flags;          /* status flags */
 	__u16                   protocol;       /* Which protocol (TCP/UDP) */
 
 	/* counter and timer */
@@ -530,6 +532,33 @@ struct ip_vs_conn {
 	__u8			pe_data_len;
 };
 
+/*
+ *  To save some memory in conn table when name space is disabled.
+ */
+static inline struct net *ip_vs_conn_net(const struct ip_vs_conn *cp)
+{
+#ifdef CONFIG_NET_NS
+	return cp->net;
+#else
+	return &init_net;
+#endif
+}
+static inline void ip_vs_conn_net_set(struct ip_vs_conn *cp, struct net *net)
+{
+#ifdef CONFIG_NET_NS
+	cp->net = net;
+#endif
+}
+
+static inline int ip_vs_conn_net_eq(const struct ip_vs_conn *cp,
+				    struct net *net)
+{
+#ifdef CONFIG_NET_NS
+	return cp->net == net;
+#else
+	return 1;
+#endif
+}
 
 /*
  *	Extended internal versions of struct ip_vs_service_user and
@@ -779,13 +808,14 @@ enum {
 	IP_VS_DIR_LAST,
 };
 
-static inline void ip_vs_conn_fill_param(int af, int protocol,
+static inline void ip_vs_conn_fill_param(struct net *net, int af, int protocol,
 					 const union nf_inet_addr *caddr,
 					 __be16 cport,
 					 const union nf_inet_addr *vaddr,
 					 __be16 vport,
 					 struct ip_vs_conn_param *p)
 {
+	p->net = net;
 	p->af = af;
 	p->protocol = protocol;
 	p->caddr = caddr;
diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h
index b6642f0..f2d5bcd 100644
--- a/include/net/netns/ip_vs.h
+++ b/include/net/netns/ip_vs.h
@@ -66,6 +66,8 @@ struct netns_ipvs {
 	struct ip_vs_stats 		*ctl_stats; /* Statistics & estimator */
 	struct ip_vs_stats_user __percpu *ustats;   /* Statistics */
 
+	/* ip_vs_conn */
+	atomic_t 		conn_count;         /*  connection counter */
 	/* ip_vs_lblc */
 	int 			sysctl_lblc_expiration;
 	struct ctl_table_header	*lblc_ctl_header;
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index ef35e5d..41f93cc 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -64,9 +64,6 @@ static struct list_head *ip_vs_conn_tab __read_mostly;
 /*  SLAB cache for IPVS connections */
 static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
 
-/*  counter for current IPVS connections */
-static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
-
 /*  counter for no client port connections */
 static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
 
@@ -76,7 +73,7 @@ static unsigned int ip_vs_conn_rnd __read_mostly;
 /*
  *  Fine locking granularity for big connection hash table
  */
-#define CT_LOCKARRAY_BITS  4
+#define CT_LOCKARRAY_BITS  5
 #define CT_LOCKARRAY_SIZE  (1<<CT_LOCKARRAY_BITS)
 #define CT_LOCKARRAY_MASK  (CT_LOCKARRAY_SIZE-1)
 
@@ -133,19 +130,19 @@ static inline void ct_write_unlock_bh(unsigned key)
 /*
  *	Returns hash value for IPVS connection entry
  */
-static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
+static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned proto,
 				       const union nf_inet_addr *addr,
 				       __be16 port)
 {
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6)
-		return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
-				    (__force u32)port, proto, ip_vs_conn_rnd)
-			& ip_vs_conn_tab_mask;
+		return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
+				    (__force u32)port, proto, ip_vs_conn_rnd) ^
+			((size_t)net>>8)) & ip_vs_conn_tab_mask;
 #endif
-	return jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
-			    ip_vs_conn_rnd)
-		& ip_vs_conn_tab_mask;
+	return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
+			    ip_vs_conn_rnd) ^
+		((size_t)net>>8)) & ip_vs_conn_tab_mask;
 }
 
 static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
@@ -166,15 +163,15 @@ static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
 		port = p->vport;
 	}
 
-	return ip_vs_conn_hashkey(p->af, p->protocol, addr, port);
+	return ip_vs_conn_hashkey(p->net, p->af, p->protocol, addr, port);
 }
 
 static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
 {
 	struct ip_vs_conn_param p;
 
-	ip_vs_conn_fill_param(cp->af, cp->protocol, &cp->caddr, cp->cport,
-			      NULL, 0, &p);
+	ip_vs_conn_fill_param(ip_vs_conn_net(cp), cp->af, cp->protocol,
+			      &cp->caddr, cp->cport, NULL, 0, &p);
 
 	if (cp->pe) {
 		p.pe = cp->pe;
@@ -186,7 +183,7 @@ static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
 }
 
 /*
- *	Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
+ *	Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port.
  *	returns bool success.
  */
 static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
@@ -268,10 +265,10 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
 	ct_read_lock(hash);
 
 	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (cp->af == p->af &&
+		if (ip_vs_conn_net_eq(cp, p->net) && cp->af == p->af &&
+		    p->cport == cp->cport && p->vport == cp->vport &&
 		    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
 		    ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
-		    p->cport == cp->cport && p->vport == cp->vport &&
 		    ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
 		    p->protocol == cp->protocol) {
 			/* HIT */
@@ -313,17 +310,18 @@ ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
 			    struct ip_vs_conn_param *p)
 {
 	__be16 _ports[2], *pptr;
+	struct net *net = skb_net(skb);
 
 	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
 	if (pptr == NULL)
 		return 1;
 
 	if (likely(!inverse))
-		ip_vs_conn_fill_param(af, iph->protocol, &iph->saddr, pptr[0],
-				      &iph->daddr, pptr[1], p);
+		ip_vs_conn_fill_param(net, af, iph->protocol, &iph->saddr,
+				      pptr[0], &iph->daddr, pptr[1], p);
 	else
-		ip_vs_conn_fill_param(af, iph->protocol, &iph->daddr, pptr[1],
-				      &iph->saddr, pptr[0], p);
+		ip_vs_conn_fill_param(net, af, iph->protocol, &iph->daddr,
+				      pptr[1], &iph->saddr, pptr[0], p);
 	return 0;
 }
 
@@ -353,6 +351,8 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
 	ct_read_lock(hash);
 
 	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+		if (!ip_vs_conn_net_eq(cp, p->net))
+			continue;
 		if (p->pe_data && p->pe->ct_match) {
 			if (p->pe == cp->pe && p->pe->ct_match(p, cp))
 				goto out;
@@ -403,10 +403,10 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
 	ct_read_lock(hash);
 
 	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (cp->af == p->af &&
+		if (ip_vs_conn_net_eq(cp, p->net) && cp->af == p->af &&
+		    p->vport == cp->cport && p->cport == cp->dport &&
 		    ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
 		    ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
-		    p->vport == cp->cport && p->cport == cp->dport &&
 		    p->protocol == cp->protocol) {
 			/* HIT */
 			atomic_inc(&cp->refcnt);
@@ -611,8 +611,8 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
 	struct ip_vs_dest *dest;
 
 	if ((cp) && (!cp->dest)) {
-		dest = ip_vs_find_dest(&init_net, cp->af, &cp->daddr, cp->dport,
-				       &cp->vaddr, cp->vport,
+		dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr,
+				       cp->dport, &cp->vaddr, cp->vport,
 				       cp->protocol, cp->fwmark);
 		ip_vs_bind_dest(cp, dest);
 		return dest;
@@ -730,6 +730,7 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
 static void ip_vs_conn_expire(unsigned long data)
 {
 	struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
+	struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
 
 	cp->timeout = 60*HZ;
 
@@ -772,7 +773,7 @@ static void ip_vs_conn_expire(unsigned long data)
 		ip_vs_unbind_dest(cp);
 		if (cp->flags & IP_VS_CONN_F_NO_CPORT)
 			atomic_dec(&ip_vs_conn_no_cport_cnt);
-		atomic_dec(&ip_vs_conn_count);
+		atomic_dec(&ipvs->conn_count);
 
 		kmem_cache_free(ip_vs_conn_cachep, cp);
 		return;
@@ -806,7 +807,9 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
 	       struct ip_vs_dest *dest, __u32 fwmark)
 {
 	struct ip_vs_conn *cp;
-	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, p->protocol);
+	struct netns_ipvs *ipvs = net_ipvs(p->net);
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net,
+							   p->protocol);
 
 	cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
 	if (cp == NULL) {
@@ -816,6 +819,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
 
 	INIT_LIST_HEAD(&cp->c_list);
 	setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
+	ip_vs_conn_net_set(cp, p->net);
 	cp->af		   = p->af;
 	cp->protocol	   = p->protocol;
 	ip_vs_addr_copy(p->af, &cp->caddr, p->caddr);
@@ -846,7 +850,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
 	atomic_set(&cp->n_control, 0);
 	atomic_set(&cp->in_pkts, 0);
 
-	atomic_inc(&ip_vs_conn_count);
+	atomic_inc(&ipvs->conn_count);
 	if (flags & IP_VS_CONN_F_NO_CPORT)
 		atomic_inc(&ip_vs_conn_no_cport_cnt);
 
@@ -888,17 +892,22 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
  *	/proc/net/ip_vs_conn entries
  */
 #ifdef CONFIG_PROC_FS
+struct ip_vs_iter_state {
+	struct seq_net_private p;
+	struct list_head *l;
+};
 
 static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
 {
 	int idx;
 	struct ip_vs_conn *cp;
+	struct ip_vs_iter_state *iter = seq->private;
 
 	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
 		ct_read_lock_bh(idx);
 		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
 			if (pos-- == 0) {
-				seq->private = &ip_vs_conn_tab[idx];
+				iter->l = &ip_vs_conn_tab[idx];
 			return cp;
 			}
 		}
@@ -910,14 +919,17 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
 
 static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	seq->private = NULL;
+	struct ip_vs_iter_state *iter = seq->private;
+
+	iter->l = NULL;
 	return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
 }
 
 static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct ip_vs_conn *cp = v;
-	struct list_head *e, *l = seq->private;
+	struct ip_vs_iter_state *iter = seq->private;
+	struct list_head *e, *l = iter->l;
 	int idx;
 
 	++*pos;
@@ -934,18 +946,19 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	while (++idx < ip_vs_conn_tab_size) {
 		ct_read_lock_bh(idx);
 		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
-			seq->private = &ip_vs_conn_tab[idx];
+			iter->l = &ip_vs_conn_tab[idx];
 			return cp;
 		}
 		ct_read_unlock_bh(idx);
 	}
-	seq->private = NULL;
+	iter->l = NULL;
 	return NULL;
 }
 
 static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
 {
-	struct list_head *l = seq->private;
+	struct ip_vs_iter_state *iter = seq->private;
+	struct list_head *l = iter->l;
 
 	if (l)
 		ct_read_unlock_bh(l - ip_vs_conn_tab);
@@ -959,9 +972,12 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
    "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires PEName PEData\n");
 	else {
 		const struct ip_vs_conn *cp = v;
+		struct net *net = seq_file_net(seq);
 		char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
 		size_t len = 0;
 
+		if (!ip_vs_conn_net_eq(cp, net))
+			return 0;
 		if (cp->pe_data) {
 			pe_data[0] = ' ';
 			len = strlen(cp->pe->name);
@@ -1006,7 +1022,8 @@ static const struct seq_operations ip_vs_conn_seq_ops = {
 
 static int ip_vs_conn_open(struct inode *inode, struct file *file)
 {
-	return seq_open(file, &ip_vs_conn_seq_ops);
+	return seq_open_net(inode, file, &ip_vs_conn_seq_ops,
+			    sizeof(struct ip_vs_iter_state));
 }
 
 static const struct file_operations ip_vs_conn_fops = {
@@ -1033,6 +1050,10 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
    "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Origin Expires\n");
 	else {
 		const struct ip_vs_conn *cp = v;
+		struct net *net = seq_file_net(seq);
+
+		if (!ip_vs_conn_net_eq(cp, net))
+			return 0;
 
 #ifdef CONFIG_IP_VS_IPV6
 		if (cp->af == AF_INET6)
@@ -1069,7 +1090,8 @@ static const struct seq_operations ip_vs_conn_sync_seq_ops = {
 
 static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
 {
-	return seq_open(file, &ip_vs_conn_sync_seq_ops);
+	return seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops,
+			    sizeof(struct ip_vs_iter_state));
 }
 
 static const struct file_operations ip_vs_conn_sync_fops = {
@@ -1170,10 +1192,11 @@ void ip_vs_random_dropentry(void)
 /*
  *      Flush all the connection entries in the ip_vs_conn_tab
  */
-static void ip_vs_conn_flush(void)
+static void ip_vs_conn_flush(struct net *net)
 {
 	int idx;
 	struct ip_vs_conn *cp;
+	struct netns_ipvs *ipvs = net_ipvs(net);
 
   flush_again:
 	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
@@ -1183,7 +1206,8 @@ static void ip_vs_conn_flush(void)
 		ct_write_lock_bh(idx);
 
 		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
-
+			if (!ip_vs_conn_net_eq(cp, net))
+				continue;
 			IP_VS_DBG(4, "del connection\n");
 			ip_vs_conn_expire_now(cp);
 			if (cp->control) {
@@ -1196,7 +1220,7 @@ static void ip_vs_conn_flush(void)
 
 	/* the counter may be not NULL, because maybe some conn entries
 	   are run by slow timer handler or unhashed but still referred */
-	if (atomic_read(&ip_vs_conn_count) != 0) {
+	if (atomic_read(&ipvs->conn_count) != 0) {
 		schedule();
 		goto flush_again;
 	}
@@ -1204,14 +1228,37 @@ static void ip_vs_conn_flush(void)
 
 int __net_init __ip_vs_conn_init(struct net *net)
 {
-	int idx;
+	struct netns_ipvs *ipvs = net_ipvs(net);
 
 	if (!net_eq(net, &init_net))	/* netns not enabled yet */
 		return -EPERM;
+	atomic_set(&ipvs->conn_count, 0);
 
-	/* Compute size and mask */
-	ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
-	ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;
+	proc_net_fops_create(net, "ip_vs_conn", 0, &ip_vs_conn_fops);
+	proc_net_fops_create(net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
+
+	return 0;
+}
+/* Cleanup and release all netns related ... */
+static void __net_exit __ip_vs_conn_cleanup(struct net *net)
+{
+	if (!net_eq(net, &init_net))	/* netns not enabled yet */
+		return;
+
+	/* flush all the connection entries first */
+	ip_vs_conn_flush(net);
+	proc_net_remove(net, "ip_vs_conn");
+	proc_net_remove(net, "ip_vs_conn_sync");
+}
+static struct pernet_operations ipvs_conn_ops = {
+	.init = __ip_vs_conn_init,
+	.exit = __ip_vs_conn_cleanup,
+};
+
+int __init ip_vs_conn_init(void)
+{
+	int rv;
+	int idx;
 
 	/* Compute size and mask */
 	ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
@@ -1249,34 +1296,6 @@ int __net_init __ip_vs_conn_init(struct net *net)
 		rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
 	}
 
-	proc_net_fops_create(net, "ip_vs_conn", 0, &ip_vs_conn_fops);
-	proc_net_fops_create(net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
-
-	return 0;
-}
-/* Cleanup and release all netns related ... */
-static void __net_exit __ip_vs_conn_cleanup(struct net *net)
-{
-	if (!net_eq(net, &init_net))	/* netns not enabled yet */
-		return;
-
-	/* flush all the connection entries first */
-	ip_vs_conn_flush();
-	/* Release the empty cache */
-	kmem_cache_destroy(ip_vs_conn_cachep);
-	proc_net_remove(net, "ip_vs_conn");
-	proc_net_remove(net, "ip_vs_conn_sync");
-	vfree(ip_vs_conn_tab);
-}
-static struct pernet_operations ipvs_conn_ops = {
-	.init = __ip_vs_conn_init,
-	.exit = __ip_vs_conn_cleanup,
-};
-
-int __init ip_vs_conn_init(void)
-{
-	int rv;
-
 	rv = register_pernet_subsys(&ipvs_conn_ops);
 
 	/* calculate the random value for connection hash */
@@ -1287,4 +1306,8 @@ int __init ip_vs_conn_init(void)
 void ip_vs_conn_cleanup(void)
 {
 	unregister_pernet_subsys(&ipvs_conn_ops);
+	/* Release the empty cache */
+	kmem_cache_destroy(ip_vs_conn_cachep);
+	vfree(ip_vs_conn_tab);
+
 }
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 679eb16..69ba8cc 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -193,7 +193,8 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
 			      const union nf_inet_addr *vaddr, __be16 vport,
 			      struct ip_vs_conn_param *p)
 {
-	ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p);
+	ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr,
+			      vport, p);
 	p->pe = svc->pe;
 	if (p->pe && p->pe->fill_param)
 		return p->pe->fill_param(p, skb);
@@ -336,8 +337,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 	/*
 	 *    Create a new connection according to the template
 	 */
-	ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, src_port,
-			      &iph.daddr, dst_port, &param);
+	ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, &iph.saddr,
+			      src_port, &iph.daddr, dst_port, &param);
 
 	cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest, skb->mark);
 	if (cp == NULL) {
@@ -451,8 +452,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 	 */
 	{
 		struct ip_vs_conn_param p;
-		ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr,
-				      pptr[0], &iph.daddr, pptr[1], &p);
+
+		ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol,
+				      &iph.saddr, pptr[0], &iph.daddr, pptr[1],
+				      &p);
 		cp = ip_vs_conn_new(&p, &dest->addr,
 				    dest->port ? dest->port : pptr[1],
 				    flags, dest, skb->mark);
@@ -518,7 +521,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 		IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
 		{
 			struct ip_vs_conn_param p;
-			ip_vs_conn_fill_param(svc->af, iph.protocol,
+			ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol,
 					      &iph.saddr, pptr[0],
 					      &iph.daddr, pptr[1], &p);
 			cp = ip_vs_conn_new(&p, &daddr, 0,
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index b39befd..78d5980 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -198,13 +198,15 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 		 */
 		{
 			struct ip_vs_conn_param p;
-			ip_vs_conn_fill_param(AF_INET, iph->protocol,
-					      &from, port, &cp->caddr, 0, &p);
+			ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET,
+					      iph->protocol, &from, port,
+					      &cp->caddr, 0, &p);
 			n_cp = ip_vs_conn_out_get(&p);
 		}
 		if (!n_cp) {
 			struct ip_vs_conn_param p;
-			ip_vs_conn_fill_param(AF_INET, IPPROTO_TCP, &cp->caddr,
+			ip_vs_conn_fill_param(ip_vs_conn_net(cp),
+					      AF_INET, IPPROTO_TCP, &cp->caddr,
 					      0, &cp->vaddr, port, &p);
 			n_cp = ip_vs_conn_new(&p, &from, port,
 					      IP_VS_CONN_F_NO_CPORT |
@@ -360,9 +362,9 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 
 	{
 		struct ip_vs_conn_param p;
-		ip_vs_conn_fill_param(AF_INET, iph->protocol, &to, port,
-				      &cp->vaddr, htons(ntohs(cp->vport)-1),
-				      &p);
+		ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET,
+				      iph->protocol, &to, port, &cp->vaddr,
+				      htons(ntohs(cp->vport)-1), &p);
 		n_cp = ip_vs_conn_in_get(&p);
 		if (!n_cp) {
 			n_cp = ip_vs_conn_new(&p, &cp->daddr,
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
index 4680647..f454c80 100644
--- a/net/netfilter/ipvs/ip_vs_nfct.c
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -141,6 +141,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
 	struct nf_conntrack_tuple *orig, new_reply;
 	struct ip_vs_conn *cp;
 	struct ip_vs_conn_param p;
+	struct net *net = nf_ct_net(ct);
 
 	if (exp->tuple.src.l3num != PF_INET)
 		return;
@@ -155,7 +156,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
 
 	/* RS->CLIENT */
 	orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
-	ip_vs_conn_fill_param(exp->tuple.src.l3num, orig->dst.protonum,
+	ip_vs_conn_fill_param(net, exp->tuple.src.l3num, orig->dst.protonum,
 			      &orig->src.u3, orig->src.u.tcp.port,
 			      &orig->dst.u3, orig->dst.u.tcp.port, &p);
 	cp = ip_vs_conn_out_get(&p);
@@ -268,7 +269,8 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
 		" for conn " FMT_CONN "\n",
 		__func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
 
-	h = nf_conntrack_find_get(&init_net, NF_CT_DEFAULT_ZONE, &tuple);
+	h = nf_conntrack_find_get(ip_vs_conn_net(cp), NF_CT_DEFAULT_ZONE,
+				  &tuple);
 	if (h) {
 		ct = nf_ct_tuplehash_to_ctrack(h);
 		/* Show what happens instead of calling nf_ct_kill() */
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index b8b37fa..ded1adb 100644
--- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -41,15 +41,16 @@ struct isakmp_hdr {
 #define PORT_ISAKMP	500
 
 static void
-ah_esp_conn_fill_param_proto(int af, const struct ip_vs_iphdr *iph,
-			     int inverse, struct ip_vs_conn_param *p)
+ah_esp_conn_fill_param_proto(struct net *net, int af,
+			     const struct ip_vs_iphdr *iph, int inverse,
+			     struct ip_vs_conn_param *p)
 {
 	if (likely(!inverse))
-		ip_vs_conn_fill_param(af, IPPROTO_UDP,
+		ip_vs_conn_fill_param(net, af, IPPROTO_UDP,
 				      &iph->saddr, htons(PORT_ISAKMP),
 				      &iph->daddr, htons(PORT_ISAKMP), p);
 	else
-		ip_vs_conn_fill_param(af, IPPROTO_UDP,
+		ip_vs_conn_fill_param(net, af, IPPROTO_UDP,
 				      &iph->daddr, htons(PORT_ISAKMP),
 				      &iph->saddr, htons(PORT_ISAKMP), p);
 }
@@ -61,8 +62,9 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
 {
 	struct ip_vs_conn *cp;
 	struct ip_vs_conn_param p;
+	struct net *net = skb_net(skb);
 
-	ah_esp_conn_fill_param_proto(af, iph, inverse, &p);
+	ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p);
 	cp = ip_vs_conn_in_get(&p);
 	if (!cp) {
 		/*
@@ -90,8 +92,9 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb,
 {
 	struct ip_vs_conn *cp;
 	struct ip_vs_conn_param p;
+	struct net *net = skb_net(skb);
 
-	ah_esp_conn_fill_param_proto(af, iph, inverse, &p);
+	ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p);
 	cp = ip_vs_conn_out_get(&p);
 	if (!cp) {
 		IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 3c6d06d..c011830 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -1064,7 +1064,7 @@ static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc)
 
 static int sctp_app_conn_bind(struct ip_vs_conn *cp)
 {
-	struct netns_ipvs *ipvs = net_ipvs(&init_net);
+	struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
 	int hash;
 	struct ip_vs_app *inc;
 	int result = 0;
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 326768d..2e270b6 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -626,7 +626,7 @@ tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
 static int
 tcp_app_conn_bind(struct ip_vs_conn *cp)
 {
-	struct netns_ipvs *ipvs = net_ipvs(&init_net);
+	struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
 	int hash;
 	struct ip_vs_app *inc;
 	int result = 0;
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 7e894d5..882a547 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -396,7 +396,7 @@ udp_unregister_app(struct net *net, struct ip_vs_app *inc)
 
 static int udp_app_conn_bind(struct ip_vs_conn *cp)
 {
-	struct netns_ipvs *ipvs = net_ipvs(&init_net);
+	struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
 	int hash;
 	struct ip_vs_app *inc;
 	int result = 0;
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 29c6bbb..2887c34 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -657,21 +657,21 @@ control:
  *  fill_param used by version 1
  */
 static inline int
-ip_vs_conn_fill_param_sync(int af, union ip_vs_sync_conn *sc,
+ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc,
 			   struct ip_vs_conn_param *p,
 			   __u8 *pe_data, unsigned int pe_data_len,
 			   __u8 *pe_name, unsigned int pe_name_len)
 {
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6)
-		ip_vs_conn_fill_param(af, sc->v6.protocol,
+		ip_vs_conn_fill_param(net, af, sc->v6.protocol,
 				      (const union nf_inet_addr *)&sc->v6.caddr,
 				      sc->v6.cport,
 				      (const union nf_inet_addr *)&sc->v6.vaddr,
 				      sc->v6.vport, p);
 	else
 #endif
-		ip_vs_conn_fill_param(af, sc->v4.protocol,
+		ip_vs_conn_fill_param(net, af, sc->v4.protocol,
 				      (const union nf_inet_addr *)&sc->v4.caddr,
 				      sc->v4.cport,
 				      (const union nf_inet_addr *)&sc->v4.vaddr,
@@ -878,7 +878,7 @@ static void ip_vs_process_message_v0(struct net *net, const char *buffer,
 			}
 		}
 
-		ip_vs_conn_fill_param(AF_INET, s->protocol,
+		ip_vs_conn_fill_param(net, AF_INET, s->protocol,
 				      (const union nf_inet_addr *)&s->caddr,
 				      s->cport,
 				      (const union nf_inet_addr *)&s->vaddr,
@@ -1040,9 +1040,8 @@ static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end)
 			state = 0;
 		}
 	}
-	if (ip_vs_conn_fill_param_sync(af, s, &param,
-					pe_data, pe_data_len,
-					pe_name, pe_name_len)) {
+	if (ip_vs_conn_fill_param_sync(net, af, s, &param, pe_data,
+				       pe_data_len, pe_name, pe_name_len)) {
 		retc = 50;
 		goto out;
 	}
-- 
1.7.2.3


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox