Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 3/5] RDMA/cxgb4: Fix LE hash collision bug for active open connection
From: Vipul Pandya @ 2012-11-29 14:52 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: roland-BHEL68pLQRGGvPXPguhicg, davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	divy-ut6Up61K2wZBDgjK7y7TUQ, dm-ut6Up61K2wZBDgjK7y7TUQ,
	kumaras-ut6Up61K2wZBDgjK7y7TUQ,
	swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW,
	abhishek-ut6Up61K2wZBDgjK7y7TUQ, Vipul Pandya
In-Reply-To: <1354200745-23598-1-git-send-email-vipul-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>

It enables establishing active open connection using fw_ofld_connection work
request when cpl_act_open_rpl says TCAM full error which may be because
of LE hash collision. Current support is only for IPv4 active open connections.

Sets ntuple bits in active open requests. For T4 firmware greater than 1.4.10.0
ntuple bits are required to be set.

Adds nocong and enable_ecn module parameter options.

Signed-off-by: Vipul Pandya <vipul-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
---
 drivers/infiniband/hw/cxgb4/cm.c              |  158 ++++++++++++++++++++++++-
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h        |    1 +
 drivers/net/ethernet/chelsio/cxgb4/t4_msg.h   |   19 +++
 drivers/net/ethernet/chelsio/cxgb4/t4_regs.h  |    2 +
 drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h |  109 +++++++++++++++++
 5 files changed, 283 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 6cfd4d8..560ca39 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -61,6 +61,14 @@ static char *states[] = {
 	NULL,
 };
 
+static int nocong;
+module_param(nocong, int, 0644);
+MODULE_PARM_DESC(nocong, "Turn of congestion control (default=0)");
+
+static int enable_ecn;
+module_param(enable_ecn, int, 0644);
+MODULE_PARM_DESC(enable_ecn, "Enable ECN (default=0/disabled)");
+
 static int dack_mode = 1;
 module_param(dack_mode, int, 0644);
 MODULE_PARM_DESC(dack_mode, "Delayed ack mode (default=1)");
@@ -442,6 +450,50 @@ static int send_abort(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp)
 	return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
 }
 
+#define VLAN_NONE 0xfff
+#define FILTER_SEL_VLAN_NONE 0xffff
+#define FILTER_SEL_WIDTH_P_FC (3+1) /* port uses 3 bits, FCoE one bit */
+#define FILTER_SEL_WIDTH_VIN_P_FC \
+	(6 + 7 + FILTER_SEL_WIDTH_P_FC) /* 6 bits are unused, VF uses 7 bits*/
+#define FILTER_SEL_WIDTH_TAG_P_FC \
+	(3 + FILTER_SEL_WIDTH_VIN_P_FC) /* PF uses 3 bits */
+#define FILTER_SEL_WIDTH_VLD_TAG_P_FC (1 + FILTER_SEL_WIDTH_TAG_P_FC)
+
+static unsigned int select_ntuple(struct c4iw_dev *dev, struct dst_entry *dst,
+				  struct l2t_entry *l2t)
+{
+	unsigned int ntuple = 0;
+	u32 viid;
+
+	switch (dev->rdev.lldi.filt_mode) {
+
+	/* default filter mode */
+	case HW_TPL_FR_MT_PR_IV_P_FC:
+		if (l2t->vlan == VLAN_NONE)
+			ntuple |= FILTER_SEL_VLAN_NONE << FILTER_SEL_WIDTH_P_FC;
+		else {
+			ntuple |= l2t->vlan << FILTER_SEL_WIDTH_P_FC;
+			ntuple |= 1 << FILTER_SEL_WIDTH_VLD_TAG_P_FC;
+		}
+		ntuple |= l2t->lport << S_PORT | IPPROTO_TCP <<
+			  FILTER_SEL_WIDTH_VLD_TAG_P_FC;
+		break;
+	case HW_TPL_FR_MT_PR_OV_P_FC: {
+		viid = cxgb4_port_viid(l2t->neigh->dev);
+
+		ntuple |= FW_VIID_VIN_GET(viid) << FILTER_SEL_WIDTH_P_FC;
+		ntuple |= FW_VIID_PFN_GET(viid) << FILTER_SEL_WIDTH_VIN_P_FC;
+		ntuple |= FW_VIID_VIVLD_GET(viid) << FILTER_SEL_WIDTH_TAG_P_FC;
+		ntuple |= l2t->lport << S_PORT | IPPROTO_TCP <<
+			  FILTER_SEL_WIDTH_VLD_TAG_P_FC;
+		break;
+	}
+	default:
+		break;
+	}
+	return ntuple;
+}
+
 static int send_connect(struct c4iw_ep *ep)
 {
 	struct cpl_act_open_req *req;
@@ -464,7 +516,8 @@ static int send_connect(struct c4iw_ep *ep)
 
 	cxgb4_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx);
 	wscale = compute_wscale(rcv_win);
-	opt0 = KEEP_ALIVE(1) |
+	opt0 = (nocong ? NO_CONG(1) : 0) |
+	       KEEP_ALIVE(1) |
 	       DELACK(1) |
 	       WND_SCALE(wscale) |
 	       MSS_IDX(mtu_idx) |
@@ -475,6 +528,7 @@ static int send_connect(struct c4iw_ep *ep)
 	       ULP_MODE(ULP_MODE_TCPDDP) |
 	       RCV_BUFSIZ(rcv_win>>10);
 	opt2 = RX_CHANNEL(0) |
+	       CCTRL_ECN(enable_ecn) |
 	       RSS_QUEUE_VALID | RSS_QUEUE(ep->rss_qid);
 	if (enable_tcp_timestamps)
 		opt2 |= TSTAMPS_EN(1);
@@ -493,7 +547,7 @@ static int send_connect(struct c4iw_ep *ep)
 	req->local_ip = ep->com.local_addr.sin_addr.s_addr;
 	req->peer_ip = ep->com.remote_addr.sin_addr.s_addr;
 	req->opt0 = cpu_to_be64(opt0);
-	req->params = 0;
+	req->params = cpu_to_be32(select_ntuple(ep->com.dev, ep->dst, ep->l2t));
 	req->opt2 = cpu_to_be32(opt2);
 	return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
 }
@@ -1384,6 +1438,61 @@ static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
 	return 0;
 }
 
+static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid)
+{
+	struct sk_buff *skb;
+	struct fw_ofld_connection_wr *req;
+	unsigned int mtu_idx;
+	int wscale;
+
+	skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
+	req = (struct fw_ofld_connection_wr *)__skb_put(skb, sizeof(*req));
+	memset(req, 0, sizeof(*req));
+	req->op_compl = htonl(V_WR_OP(FW_OFLD_CONNECTION_WR));
+	req->len16_pkd = htonl(FW_WR_LEN16(DIV_ROUND_UP(sizeof(*req), 16)));
+	req->le.filter = cpu_to_be32(select_ntuple(ep->com.dev, ep->dst,
+				     ep->l2t));
+	req->le.lport = ep->com.local_addr.sin_port;
+	req->le.pport = ep->com.remote_addr.sin_port;
+	req->le.u.ipv4.lip = ep->com.local_addr.sin_addr.s_addr;
+	req->le.u.ipv4.pip = ep->com.remote_addr.sin_addr.s_addr;
+	req->tcb.t_state_to_astid =
+			htonl(V_FW_OFLD_CONNECTION_WR_T_STATE(TCP_SYN_SENT) |
+			V_FW_OFLD_CONNECTION_WR_ASTID(atid));
+	req->tcb.cplrxdataack_cplpassacceptrpl =
+			htons(F_FW_OFLD_CONNECTION_WR_CPLRXDATAACK);
+	req->tcb.tx_max = jiffies;
+	cxgb4_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx);
+	wscale = compute_wscale(rcv_win);
+	req->tcb.opt0 = TCAM_BYPASS(1) |
+		(nocong ? NO_CONG(1) : 0) |
+		KEEP_ALIVE(1) |
+		DELACK(1) |
+		WND_SCALE(wscale) |
+		MSS_IDX(mtu_idx) |
+		L2T_IDX(ep->l2t->idx) |
+		TX_CHAN(ep->tx_chan) |
+		SMAC_SEL(ep->smac_idx) |
+		DSCP(ep->tos) |
+		ULP_MODE(ULP_MODE_TCPDDP) |
+		RCV_BUFSIZ(rcv_win >> 10);
+	req->tcb.opt2 = PACE(1) |
+		TX_QUEUE(ep->com.dev->rdev.lldi.tx_modq[ep->tx_chan]) |
+		RX_CHANNEL(0) |
+		CCTRL_ECN(enable_ecn) |
+		RSS_QUEUE_VALID | RSS_QUEUE(ep->rss_qid);
+	if (enable_tcp_timestamps)
+		req->tcb.opt2 |= TSTAMPS_EN(1);
+	if (enable_tcp_sack)
+		req->tcb.opt2 |= SACK_EN(1);
+	if (wscale && enable_tcp_window_scaling)
+		req->tcb.opt2 |= WND_SCALE_EN(1);
+	req->tcb.opt0 = cpu_to_be64(req->tcb.opt0);
+	req->tcb.opt2 = cpu_to_be32(req->tcb.opt2);
+	set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
+	c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+}
+
 /*
  * Return whether a failed active open has allocated a TID
  */
@@ -1420,6 +1529,14 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
 	case CPL_ERR_CONN_RESET:
 	case CPL_ERR_CONN_TIMEDOUT:
 		break;
+	case CPL_ERR_TCAM_FULL:
+		mutex_lock(&dev->rdev.stats.lock);
+		dev->rdev.stats.tcam_full++;
+		mutex_unlock(&dev->rdev.stats.lock);
+		send_fw_act_open_req(ep,
+			GET_TID_TID(GET_AOPEN_ATID(ntohl(rpl->atid_status))));
+		return 0;
+		break;
 	default:
 		printk(KERN_INFO MOD "Active open failure - "
 		       "atid %u status %u errno %d %pI4:%u->%pI4:%u\n",
@@ -1511,14 +1628,15 @@ static void accept_cr(struct c4iw_ep *ep, __be32 peer_ip, struct sk_buff *skb,
 	skb_get(skb);
 	cxgb4_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx);
 	wscale = compute_wscale(rcv_win);
-	opt0 = KEEP_ALIVE(1) |
+	opt0 = (nocong ? NO_CONG(1) : 0) |
+	       KEEP_ALIVE(1) |
 	       DELACK(1) |
 	       WND_SCALE(wscale) |
 	       MSS_IDX(mtu_idx) |
 	       L2T_IDX(ep->l2t->idx) |
 	       TX_CHAN(ep->tx_chan) |
 	       SMAC_SEL(ep->smac_idx) |
-	       DSCP(ep->tos) |
+	       DSCP(ep->tos >> 2) |
 	       ULP_MODE(ULP_MODE_TCPDDP) |
 	       RCV_BUFSIZ(rcv_win>>10);
 	opt2 = RX_CHANNEL(0) |
@@ -1530,6 +1648,15 @@ static void accept_cr(struct c4iw_ep *ep, __be32 peer_ip, struct sk_buff *skb,
 		opt2 |= SACK_EN(1);
 	if (wscale && enable_tcp_window_scaling)
 		opt2 |= WND_SCALE_EN(1);
+	if (enable_ecn) {
+		const struct tcphdr *tcph;
+		u32 hlen = ntohl(req->hdr_len);
+
+		tcph = (const void *)(req + 1) + G_ETH_HDR_LEN(hlen) +
+			G_IP_HDR_LEN(hlen);
+		if (tcph->ece && tcph->cwr)
+			opt2 |= CCTRL_ECN(1);
+	}
 
 	rpl = cplhdr(skb);
 	INIT_TP_WR(rpl, ep->hwtid);
@@ -2649,11 +2776,14 @@ static int fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb)
 	struct cpl_fw6_msg *rpl = cplhdr(skb);
 	struct c4iw_wr_wait *wr_waitp;
 	int ret;
+	u8 opcode;
+	struct cpl_fw6_msg_ofld_connection_wr_rpl *req;
+	struct c4iw_ep *ep;
 
 	PDBG("%s type %u\n", __func__, rpl->type);
 
 	switch (rpl->type) {
-	case 1:
+	case FW6_TYPE_WR_RPL:
 		ret = (int)((be64_to_cpu(rpl->data[0]) >> 8) & 0xff);
 		wr_waitp = (struct c4iw_wr_wait *)(__force unsigned long) rpl->data[1];
 		PDBG("%s wr_waitp %p ret %u\n", __func__, wr_waitp, ret);
@@ -2661,9 +2791,25 @@ static int fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb)
 			c4iw_wake_up(wr_waitp, ret ? -ret : 0);
 		kfree_skb(skb);
 		break;
-	case 2:
+	case FW6_TYPE_CQE:
 		sched(dev, skb);
 		break;
+	case FW6_TYPE_OFLD_CONNECTION_WR_RPL:
+		opcode = *(const u8 *)rpl->data;
+		if (opcode == FW_OFLD_CONNECTION_WR) {
+			req =
+			(struct cpl_fw6_msg_ofld_connection_wr_rpl *)rpl->data;
+			if (req->t_state == TCP_SYN_SENT
+			    && (req->retval == FW_ENOMEM
+				|| req->retval == FW_EADDRINUSE)) {
+				ep = (struct c4iw_ep *)
+				     lookup_atid(dev->rdev.lldi.tids,
+						 req->tid);
+				c4iw_l2t_send(&dev->rdev, skb, ep->l2t);
+				return 0;
+			}
+		}
+		break;
 	default:
 		printk(KERN_ERR MOD "%s unexpected fw6 msg type %u\n", __func__,
 		       rpl->type);
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index 9beb3a9..6a17fde 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -130,6 +130,7 @@ struct c4iw_stats {
 	u64  db_empty;
 	u64  db_drop;
 	u64  db_state_transitions;
+	u64  tcam_full;
 };
 
 struct c4iw_rdev {
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
index 480eb43..5b51c01 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
@@ -193,6 +193,10 @@ struct work_request_hdr {
 	__be64 wr_lo;
 };
 
+/* wr_hi fields */
+#define S_WR_OP    24
+#define V_WR_OP(x) ((__u64)(x) << S_WR_OP)
+
 #define WR_HDR struct work_request_hdr wr
 
 struct cpl_pass_open_req {
@@ -204,12 +208,14 @@ struct cpl_pass_open_req {
 	__be32 peer_ip;
 	__be64 opt0;
 #define TX_CHAN(x)    ((x) << 2)
+#define NO_CONG(x)    ((x) << 4)
 #define DELACK(x)     ((x) << 5)
 #define ULP_MODE(x)   ((x) << 8)
 #define RCV_BUFSIZ(x) ((x) << 12)
 #define DSCP(x)       ((x) << 22)
 #define SMAC_SEL(x)   ((u64)(x) << 28)
 #define L2T_IDX(x)    ((u64)(x) << 36)
+#define TCAM_BYPASS(x) ((u64)(x) << 48)
 #define NAGLE(x)      ((u64)(x) << 49)
 #define WND_SCALE(x)  ((u64)(x) << 50)
 #define KEEP_ALIVE(x) ((u64)(x) << 54)
@@ -247,8 +253,10 @@ struct cpl_pass_accept_rpl {
 #define RSS_QUEUE_VALID      (1 << 10)
 #define RX_COALESCE_VALID(x) ((x) << 11)
 #define RX_COALESCE(x)       ((x) << 12)
+#define PACE(x)	      ((x) << 16)
 #define TX_QUEUE(x)          ((x) << 23)
 #define RX_CHANNEL(x)        ((x) << 26)
+#define CCTRL_ECN(x)         ((x) << 27)
 #define WND_SCALE_EN(x)      ((x) << 28)
 #define TSTAMPS_EN(x)        ((x) << 29)
 #define SACK_EN(x)           ((x) << 30)
@@ -635,6 +643,17 @@ struct cpl_fw6_msg {
 /* cpl_fw6_msg.type values */
 enum {
 	FW6_TYPE_CMD_RPL = 0,
+	FW6_TYPE_WR_RPL = 1,
+	FW6_TYPE_CQE = 2,
+	FW6_TYPE_OFLD_CONNECTION_WR_RPL = 3,
+};
+
+struct cpl_fw6_msg_ofld_connection_wr_rpl {
+	__u64   cookie;
+	__be32  tid;    /* or atid in case of active failure */
+	__u8    t_state;
+	__u8    retval;
+	__u8    rsvd[2];
 };
 
 enum {
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
index 66ab614..bdc1bbc 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
@@ -1065,4 +1065,6 @@
 
 #define A_TP_TX_SCHED_PCMD 0x25
 
+#define S_PORT    1
+
 #endif /* __T4_REGS_H */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
index b4dc560..6e3542f 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
@@ -39,6 +39,11 @@ enum fw_ret_val {
 	FW_ENOEXEC      = 8,            /* Exec format error; inv microcode */
 };
 
+enum fw_retval {
+	FW_ENOMEM               = 12,   /* out of memory */
+	FW_EADDRINUSE           = 98,   /* address already in use */
+};
+
 #define FW_T4VF_SGE_BASE_ADDR      0x0000
 #define FW_T4VF_MPS_BASE_ADDR      0x0100
 #define FW_T4VF_PL_BASE_ADDR       0x0200
@@ -50,6 +55,7 @@ enum fw_wr_opcodes {
 	FW_ULPTX_WR                    = 0x04,
 	FW_TP_WR                       = 0x05,
 	FW_ETH_TX_PKT_WR               = 0x08,
+	FW_OFLD_CONNECTION_WR          = 0x2f,
 	FW_FLOWC_WR                    = 0x0a,
 	FW_OFLD_TX_DATA_WR             = 0x0b,
 	FW_CMD_WR                      = 0x10,
@@ -84,6 +90,7 @@ struct fw_wr_hdr {
 #define FW_WR_LEN16(x)	((x) << 0)
 
 #define HW_TPL_FR_MT_PR_IV_P_FC         0X32B
+#define HW_TPL_FR_MT_PR_OV_P_FC         0X327
 
 /* filter wr reply code in cookie in CPL_SET_TCB_RPL */
 enum fw_filter_wr_cookie {
@@ -378,6 +385,108 @@ struct fw_eth_tx_pkt_wr {
 	__be64 r3;
 };
 
+struct fw_ofld_connection_wr {
+	__be32 op_compl;
+	__be32 len16_pkd;
+	__u64  cookie;
+	__be64 r2;
+	__be64 r3;
+	struct fw_ofld_connection_le {
+		__be32 version_cpl;
+		__be32 filter;
+		__be32 r1;
+		__be16 lport;
+		__be16 pport;
+		union fw_ofld_connection_leip {
+			struct fw_ofld_connection_le_ipv4 {
+				__be32 pip;
+				__be32 lip;
+				__be64 r0;
+				__be64 r1;
+				__be64 r2;
+			} ipv4;
+			struct fw_ofld_connection_le_ipv6 {
+				__be64 pip_hi;
+				__be64 pip_lo;
+				__be64 lip_hi;
+				__be64 lip_lo;
+			} ipv6;
+		} u;
+	} le;
+	struct fw_ofld_connection_tcb {
+		__be32 t_state_to_astid;
+		__be16 cplrxdataack_cplpassacceptrpl;
+		__be16 rcv_adv;
+		__be32 rcv_nxt;
+		__be32 tx_max;
+		__be64 opt0;
+		__be32 opt2;
+		__be32 r1;
+		__be64 r2;
+		__be64 r3;
+	} tcb;
+};
+
+#define S_FW_OFLD_CONNECTION_WR_VERSION                31
+#define M_FW_OFLD_CONNECTION_WR_VERSION                0x1
+#define V_FW_OFLD_CONNECTION_WR_VERSION(x)     \
+	((x) << S_FW_OFLD_CONNECTION_WR_VERSION)
+#define G_FW_OFLD_CONNECTION_WR_VERSION(x)     \
+	(((x) >> S_FW_OFLD_CONNECTION_WR_VERSION) & \
+	M_FW_OFLD_CONNECTION_WR_VERSION)
+#define F_FW_OFLD_CONNECTION_WR_VERSION        \
+	V_FW_OFLD_CONNECTION_WR_VERSION(1U)
+
+#define S_FW_OFLD_CONNECTION_WR_CPL    30
+#define M_FW_OFLD_CONNECTION_WR_CPL    0x1
+#define V_FW_OFLD_CONNECTION_WR_CPL(x) ((x) << S_FW_OFLD_CONNECTION_WR_CPL)
+#define G_FW_OFLD_CONNECTION_WR_CPL(x) \
+	(((x) >> S_FW_OFLD_CONNECTION_WR_CPL) & M_FW_OFLD_CONNECTION_WR_CPL)
+#define F_FW_OFLD_CONNECTION_WR_CPL    V_FW_OFLD_CONNECTION_WR_CPL(1U)
+
+#define S_FW_OFLD_CONNECTION_WR_T_STATE                28
+#define M_FW_OFLD_CONNECTION_WR_T_STATE                0xf
+#define V_FW_OFLD_CONNECTION_WR_T_STATE(x)     \
+	((x) << S_FW_OFLD_CONNECTION_WR_T_STATE)
+#define G_FW_OFLD_CONNECTION_WR_T_STATE(x)     \
+	(((x) >> S_FW_OFLD_CONNECTION_WR_T_STATE) & \
+	M_FW_OFLD_CONNECTION_WR_T_STATE)
+
+#define S_FW_OFLD_CONNECTION_WR_RCV_SCALE      24
+#define M_FW_OFLD_CONNECTION_WR_RCV_SCALE      0xf
+#define V_FW_OFLD_CONNECTION_WR_RCV_SCALE(x)   \
+	((x) << S_FW_OFLD_CONNECTION_WR_RCV_SCALE)
+#define G_FW_OFLD_CONNECTION_WR_RCV_SCALE(x)   \
+	(((x) >> S_FW_OFLD_CONNECTION_WR_RCV_SCALE) & \
+	M_FW_OFLD_CONNECTION_WR_RCV_SCALE)
+
+#define S_FW_OFLD_CONNECTION_WR_ASTID          0
+#define M_FW_OFLD_CONNECTION_WR_ASTID          0xffffff
+#define V_FW_OFLD_CONNECTION_WR_ASTID(x)       \
+	((x) << S_FW_OFLD_CONNECTION_WR_ASTID)
+#define G_FW_OFLD_CONNECTION_WR_ASTID(x)       \
+	(((x) >> S_FW_OFLD_CONNECTION_WR_ASTID) & M_FW_OFLD_CONNECTION_WR_ASTID)
+
+#define S_FW_OFLD_CONNECTION_WR_CPLRXDATAACK   15
+#define M_FW_OFLD_CONNECTION_WR_CPLRXDATAACK   0x1
+#define V_FW_OFLD_CONNECTION_WR_CPLRXDATAACK(x)        \
+	((x) << S_FW_OFLD_CONNECTION_WR_CPLRXDATAACK)
+#define G_FW_OFLD_CONNECTION_WR_CPLRXDATAACK(x)        \
+	(((x) >> S_FW_OFLD_CONNECTION_WR_CPLRXDATAACK) & \
+	M_FW_OFLD_CONNECTION_WR_CPLRXDATAACK)
+#define F_FW_OFLD_CONNECTION_WR_CPLRXDATAACK   \
+	V_FW_OFLD_CONNECTION_WR_CPLRXDATAACK(1U)
+
+#define S_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL       14
+#define M_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL       0x1
+#define V_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL(x)    \
+	((x) << S_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL)
+#define G_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL(x)    \
+	(((x) >> S_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL) & \
+	M_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL)
+#define F_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL       \
+	V_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL(1U)
+
 enum fw_flowc_mnem {
 	FW_FLOWC_MNEM_PFNVFN,		/* PFN [15:8] VFN [7:0] */
 	FW_FLOWC_MNEM_CH,
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH 5/5] RDMA/cxgb4: Fix bug for active and passive LE hash collision path
From: Vipul Pandya @ 2012-11-29 14:52 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: roland-BHEL68pLQRGGvPXPguhicg, davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	divy-ut6Up61K2wZBDgjK7y7TUQ, dm-ut6Up61K2wZBDgjK7y7TUQ,
	kumaras-ut6Up61K2wZBDgjK7y7TUQ,
	swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW,
	abhishek-ut6Up61K2wZBDgjK7y7TUQ, Vipul Pandya
In-Reply-To: <1354200745-23598-1-git-send-email-vipul-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>

Retries active opens for INUSE errors.

Logs any active ofld_connect_wr error replies.

Sends ofld_connect_wr on same ctrlq. It needs to go  on the same control txq as
regular CPL active/passive messages.

Retries on active open replies with EADDRINUSE.

Uses active open fw wr only if active filter region is set.

Adds stat for ofld_connect_wr failures.

This patch also adds debugfs file to show endpoints.

Signed-off-by: Vipul Pandya <vipul-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
---
 drivers/infiniband/hw/cxgb4/cm.c                |  277 ++++++++++++++++-------
 drivers/infiniband/hw/cxgb4/device.c            |  125 ++++++++++
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h          |   32 +++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h      |    1 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c |   19 ++-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h  |    8 +-
 drivers/net/ethernet/chelsio/cxgb4/t4_regs.h    |    2 +
 7 files changed, 376 insertions(+), 88 deletions(-)

diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index b5840b4..255bd3f 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -276,6 +276,7 @@ void _c4iw_free_ep(struct kref *kref)
 		cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid);
 		dst_release(ep->dst);
 		cxgb4_l2t_release(ep->l2t);
+		remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid);
 	}
 	kfree(ep);
 }
@@ -551,6 +552,7 @@ static int send_connect(struct c4iw_ep *ep)
 	req->opt0 = cpu_to_be64(opt0);
 	req->params = cpu_to_be32(select_ntuple(ep->com.dev, ep->dst, ep->l2t));
 	req->opt2 = cpu_to_be32(opt2);
+	set_bit(ACT_OPEN_REQ, &ep->com.history);
 	return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
 }
 
@@ -827,6 +829,7 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb)
 	/* setup the hwtid for this connection */
 	ep->hwtid = tid;
 	cxgb4_insert_tid(t, ep, tid);
+	insert_handle(dev, &dev->hwtid_idr, ep, ep->hwtid);
 
 	ep->snd_seq = be32_to_cpu(req->snd_isn);
 	ep->rcv_seq = be32_to_cpu(req->rcv_isn);
@@ -834,7 +837,9 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb)
 	set_emss(ep, ntohs(req->tcp_opt));
 
 	/* dealloc the atid */
+	remove_handle(ep->com.dev, &ep->com.dev->atid_idr, atid);
 	cxgb4_free_atid(t, atid);
+	set_bit(ACT_ESTAB, &ep->com.history);
 
 	/* start MPA negotiation */
 	send_flowc(ep, NULL);
@@ -860,6 +865,7 @@ static void close_complete_upcall(struct c4iw_ep *ep)
 		ep->com.cm_id->rem_ref(ep->com.cm_id);
 		ep->com.cm_id = NULL;
 		ep->com.qp = NULL;
+		set_bit(CLOSE_UPCALL, &ep->com.history);
 	}
 }
 
@@ -868,6 +874,7 @@ static int abort_connection(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp)
 	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
 	close_complete_upcall(ep);
 	state_set(&ep->com, ABORTING);
+	set_bit(ABORT_CONN, &ep->com.history);
 	return send_abort(ep, skb, gfp);
 }
 
@@ -882,6 +889,7 @@ static void peer_close_upcall(struct c4iw_ep *ep)
 		PDBG("peer close delivered ep %p cm_id %p tid %u\n",
 		     ep, ep->com.cm_id, ep->hwtid);
 		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+		set_bit(DISCONN_UPCALL, &ep->com.history);
 	}
 }
 
@@ -900,6 +908,7 @@ static void peer_abort_upcall(struct c4iw_ep *ep)
 		ep->com.cm_id->rem_ref(ep->com.cm_id);
 		ep->com.cm_id = NULL;
 		ep->com.qp = NULL;
+		set_bit(ABORT_UPCALL, &ep->com.history);
 	}
 }
 
@@ -932,6 +941,7 @@ static void connect_reply_upcall(struct c4iw_ep *ep, int status)
 
 	PDBG("%s ep %p tid %u status %d\n", __func__, ep,
 	     ep->hwtid, status);
+	set_bit(CONN_RPL_UPCALL, &ep->com.history);
 	ep->com.cm_id->event_handler(ep->com.cm_id, &event);
 
 	if (status < 0) {
@@ -972,6 +982,7 @@ static void connect_request_upcall(struct c4iw_ep *ep)
 						ep->parent_ep->com.cm_id,
 						&event);
 	}
+	set_bit(CONNREQ_UPCALL, &ep->com.history);
 	c4iw_put_ep(&ep->parent_ep->com);
 	ep->parent_ep = NULL;
 }
@@ -988,6 +999,7 @@ static void established_upcall(struct c4iw_ep *ep)
 	if (ep->com.cm_id) {
 		PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
 		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+		set_bit(ESTAB_UPCALL, &ep->com.history);
 	}
 }
 
@@ -1373,6 +1385,7 @@ static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb)
 	unsigned int dlen = ntohs(hdr->len);
 	unsigned int tid = GET_TID(hdr);
 	struct tid_info *t = dev->rdev.lldi.tids;
+	__u8 status = hdr->status;
 
 	ep = lookup_tid(t, tid);
 	PDBG("%s ep %p tid %u dlen %u\n", __func__, ep, ep->hwtid, dlen);
@@ -1395,9 +1408,9 @@ static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb)
 	case MPA_REP_SENT:
 		break;
 	default:
-		printk(KERN_ERR MOD "%s Unexpected streaming data."
-		       " ep %p state %d tid %u\n",
-		       __func__, ep, state_read(&ep->com), ep->hwtid);
+		pr_err("%s Unexpected streaming data." \
+		       " ep %p state %d tid %u status %d\n",
+		       __func__, ep, state_read(&ep->com), ep->hwtid, status);
 
 		/*
 		 * The ep will timeout and inform the ULP of the failure.
@@ -1464,6 +1477,7 @@ static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid)
 	req->tcb.cplrxdataack_cplpassacceptrpl =
 			htons(F_FW_OFLD_CONNECTION_WR_CPLRXDATAACK);
 	req->tcb.tx_max = jiffies;
+	req->tcb.rcv_adv = htons(1);
 	cxgb4_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx);
 	wscale = compute_wscale(rcv_win);
 	req->tcb.opt0 = TCAM_BYPASS(1) |
@@ -1491,7 +1505,8 @@ static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid)
 		req->tcb.opt2 |= WND_SCALE_EN(1);
 	req->tcb.opt0 = cpu_to_be64(req->tcb.opt0);
 	req->tcb.opt2 = cpu_to_be32(req->tcb.opt2);
-	set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
+	set_wr_txq(skb, CPL_PRIORITY_CONTROL, ep->ctrlq_idx);
+	set_bit(ACT_OFLD_CONN, &ep->com.history);
 	c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
 }
 
@@ -1504,6 +1519,111 @@ static inline int act_open_has_tid(int status)
 	       status != CPL_ERR_ARP_MISS;
 }
 
+#define ACT_OPEN_RETRY_COUNT 2
+
+static int c4iw_reconnect(struct c4iw_ep *ep)
+{
+	int err = 0;
+	struct rtable *rt;
+	struct port_info *pi;
+	struct net_device *pdev;
+	int step;
+	struct neighbour *neigh;
+
+	PDBG("%s qp %p cm_id %p\n", __func__, ep->com.qp, ep->com.cm_id);
+	init_timer(&ep->timer);
+
+	/*
+	 * Allocate an active TID to initiate a TCP connection.
+	 */
+	ep->atid = cxgb4_alloc_atid(ep->com.dev->rdev.lldi.tids, ep);
+	if (ep->atid == -1) {
+		pr_err("%s - cannot alloc atid.\n", __func__);
+		err = -ENOMEM;
+		goto fail2;
+	}
+	insert_handle(ep->com.dev, &ep->com.dev->atid_idr, ep, ep->atid);
+
+	/* find a route */
+	rt = find_route(ep->com.dev,
+			ep->com.cm_id->local_addr.sin_addr.s_addr,
+			ep->com.cm_id->remote_addr.sin_addr.s_addr,
+			ep->com.cm_id->local_addr.sin_port,
+			ep->com.cm_id->remote_addr.sin_port, 0);
+	if (!rt) {
+		pr_err("%s - cannot find route.\n", __func__);
+		err = -EHOSTUNREACH;
+		goto fail3;
+	}
+	ep->dst = &rt->dst;
+
+	neigh = dst_neigh_lookup(ep->dst,
+			&ep->com.cm_id->remote_addr.sin_addr.s_addr);
+	/* get a l2t entry */
+	if (neigh->dev->flags & IFF_LOOPBACK) {
+		PDBG("%s LOOPBACK\n", __func__);
+		pdev = ip_dev_find(&init_net,
+				ep->com.cm_id->remote_addr.sin_addr.s_addr);
+		ep->l2t = cxgb4_l2t_get(ep->com.dev->rdev.lldi.l2t,
+				neigh, pdev, 0);
+		pi = (struct port_info *)netdev_priv(pdev);
+		ep->mtu = pdev->mtu;
+		ep->tx_chan = cxgb4_port_chan(pdev);
+		ep->smac_idx = (cxgb4_port_viid(pdev) & 0x7F) << 1;
+		dev_put(pdev);
+	} else {
+		ep->l2t = cxgb4_l2t_get(ep->com.dev->rdev.lldi.l2t,
+				neigh, neigh->dev, 0);
+		pi = (struct port_info *)netdev_priv(neigh->dev);
+		ep->mtu = dst_mtu(ep->dst);
+		ep->tx_chan = cxgb4_port_chan(neigh->dev);
+		ep->smac_idx = (cxgb4_port_viid(neigh->dev) &
+				0x7F) << 1;
+	}
+
+	step = ep->com.dev->rdev.lldi.ntxq / ep->com.dev->rdev.lldi.nchan;
+	ep->txq_idx = pi->port_id * step;
+	ep->ctrlq_idx = pi->port_id;
+	step = ep->com.dev->rdev.lldi.nrxq / ep->com.dev->rdev.lldi.nchan;
+	ep->rss_qid = ep->com.dev->rdev.lldi.rxq_ids[pi->port_id * step];
+
+	if (!ep->l2t) {
+		pr_err("%s - cannot alloc l2e.\n", __func__);
+		err = -ENOMEM;
+		goto fail4;
+	}
+
+	PDBG("%s txq_idx %u tx_chan %u smac_idx %u rss_qid %u l2t_idx %u\n",
+	     __func__, ep->txq_idx, ep->tx_chan, ep->smac_idx, ep->rss_qid,
+	     ep->l2t->idx);
+
+	state_set(&ep->com, CONNECTING);
+	ep->tos = 0;
+
+	/* send connect request to rnic */
+	err = send_connect(ep);
+	if (!err)
+		goto out;
+
+	cxgb4_l2t_release(ep->l2t);
+fail4:
+	dst_release(ep->dst);
+fail3:
+	remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid);
+	cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
+fail2:
+	/*
+	 * remember to send notification to upper layer.
+	 * We are in here so the upper layer is not aware that this is
+	 * re-connect attempt and so, upper layer is still waiting for
+	 * response of 1st connect request.
+	 */
+	connect_reply_upcall(ep, -ECONNRESET);
+	c4iw_put_ep(&ep->com);
+out:
+	return err;
+}
+
 static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
 {
 	struct c4iw_ep *ep;
@@ -1524,6 +1644,8 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
 		return 0;
 	}
 
+	set_bit(ACT_OPEN_RPL, &ep->com.history);
+
 	/*
 	 * Log interesting failures.
 	 */
@@ -1532,12 +1654,27 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
 	case CPL_ERR_CONN_TIMEDOUT:
 		break;
 	case CPL_ERR_TCAM_FULL:
-		mutex_lock(&dev->rdev.stats.lock);
-		dev->rdev.stats.tcam_full++;
-		mutex_unlock(&dev->rdev.stats.lock);
-		send_fw_act_open_req(ep,
-			GET_TID_TID(GET_AOPEN_ATID(ntohl(rpl->atid_status))));
-		return 0;
+		if (dev->rdev.lldi.enable_fw_ofld_conn) {
+			mutex_lock(&dev->rdev.stats.lock);
+			dev->rdev.stats.tcam_full++;
+			mutex_unlock(&dev->rdev.stats.lock);
+			send_fw_act_open_req(ep,
+					     GET_TID_TID(GET_AOPEN_ATID(
+					     ntohl(rpl->atid_status))));
+			return 0;
+		}
+		break;
+	case CPL_ERR_CONN_EXIST:
+		if (ep->retry_count++ < ACT_OPEN_RETRY_COUNT) {
+			set_bit(ACT_RETRY_INUSE, &ep->com.history);
+			remove_handle(ep->com.dev, &ep->com.dev->atid_idr,
+					atid);
+			cxgb4_free_atid(t, atid);
+			dst_release(ep->dst);
+			cxgb4_l2t_release(ep->l2t);
+			c4iw_reconnect(ep);
+			return 0;
+		}
 		break;
 	default:
 		printk(KERN_INFO MOD "Active open failure - "
@@ -1556,6 +1693,7 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
 	if (status && act_open_has_tid(status))
 		cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, GET_TID(rpl));
 
+	remove_handle(ep->com.dev, &ep->com.dev->atid_idr, atid);
 	cxgb4_free_atid(t, atid);
 	dst_release(ep->dst);
 	cxgb4_l2t_release(ep->l2t);
@@ -1776,7 +1914,7 @@ out:
 
 static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
 {
-	struct c4iw_ep *child_ep, *parent_ep;
+	struct c4iw_ep *child_ep = NULL, *parent_ep;
 	struct cpl_pass_accept_req *req = cplhdr(skb);
 	unsigned int stid = GET_POPEN_TID(ntohl(req->tos_stid));
 	struct tid_info *t = dev->rdev.lldi.tids;
@@ -1857,6 +1995,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
 	init_timer(&child_ep->timer);
 	cxgb4_insert_tid(t, child_ep, hwtid);
 	accept_cr(child_ep, peer_ip, skb, req);
+	set_bit(PASS_ACCEPT_REQ, &child_ep->com.history);
 	goto out;
 reject:
 	reject_cr(dev, hwtid, peer_ip, skb);
@@ -1880,11 +2019,13 @@ static int pass_establish(struct c4iw_dev *dev, struct sk_buff *skb)
 	     ntohs(req->tcp_opt));
 
 	set_emss(ep, ntohs(req->tcp_opt));
+	insert_handle(dev, &dev->hwtid_idr, ep, ep->hwtid);
 
 	dst_confirm(ep->dst);
 	state_set(&ep->com, MPA_REQ_WAIT);
 	start_ep_timer(ep);
 	send_flowc(ep, skb);
+	set_bit(PASS_ESTAB, &ep->com.history);
 
 	return 0;
 }
@@ -1904,6 +2045,7 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb)
 	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
 	dst_confirm(ep->dst);
 
+	set_bit(PEER_CLOSE, &ep->com.history);
 	mutex_lock(&ep->com.mutex);
 	switch (ep->com.state) {
 	case MPA_REQ_WAIT:
@@ -1983,74 +2125,6 @@ static int is_neg_adv_abort(unsigned int status)
 	       status == CPL_ERR_PERSIST_NEG_ADVICE;
 }
 
-static int c4iw_reconnect(struct c4iw_ep *ep)
-{
-	struct rtable *rt;
-	int err = 0;
-
-	PDBG("%s qp %p cm_id %p\n", __func__, ep->com.qp, ep->com.cm_id);
-	init_timer(&ep->timer);
-
-	/*
-	 * Allocate an active TID to initiate a TCP connection.
-	 */
-	ep->atid = cxgb4_alloc_atid(ep->com.dev->rdev.lldi.tids, ep);
-	if (ep->atid == -1) {
-		printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __func__);
-		err = -ENOMEM;
-		goto fail2;
-	}
-
-	/* find a route */
-	rt = find_route(ep->com.dev,
-			ep->com.cm_id->local_addr.sin_addr.s_addr,
-			ep->com.cm_id->remote_addr.sin_addr.s_addr,
-			ep->com.cm_id->local_addr.sin_port,
-			ep->com.cm_id->remote_addr.sin_port, 0);
-	if (!rt) {
-		printk(KERN_ERR MOD "%s - cannot find route.\n", __func__);
-		err = -EHOSTUNREACH;
-		goto fail3;
-	}
-	ep->dst = &rt->dst;
-
-	err = import_ep(ep, ep->com.cm_id->remote_addr.sin_addr.s_addr,
-			ep->dst, ep->com.dev, false);
-	if (err) {
-		printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__);
-		goto fail4;
-	}
-
-	PDBG("%s txq_idx %u tx_chan %u smac_idx %u rss_qid %u l2t_idx %u\n",
-	     __func__, ep->txq_idx, ep->tx_chan, ep->smac_idx, ep->rss_qid,
-	     ep->l2t->idx);
-
-	state_set(&ep->com, CONNECTING);
-	ep->tos = 0;
-
-	/* send connect request to rnic */
-	err = send_connect(ep);
-	if (!err)
-		goto out;
-
-	cxgb4_l2t_release(ep->l2t);
-fail4:
-	dst_release(ep->dst);
-fail3:
-	cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
-fail2:
-	/*
-	 * remember to send notification to upper layer.
-	 * We are in here so the upper layer is not aware that this is
-	 * re-connect attempt and so, upper layer is still waiting for
-	 * response of 1st connect request.
-	 */
-	connect_reply_upcall(ep, -ECONNRESET);
-	c4iw_put_ep(&ep->com);
-out:
-	return err;
-}
-
 static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
 {
 	struct cpl_abort_req_rss *req = cplhdr(skb);
@@ -2071,6 +2145,7 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
 	}
 	PDBG("%s ep %p tid %u state %u\n", __func__, ep, ep->hwtid,
 	     ep->com.state);
+	set_bit(PEER_ABORT, &ep->com.history);
 
 	/*
 	 * Wake up any threads in rdma_init() or rdma_fini().
@@ -2285,6 +2360,7 @@ int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
 		c4iw_put_ep(&ep->com);
 		return -ECONNRESET;
 	}
+	set_bit(ULP_REJECT, &ep->com.history);
 	BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD);
 	if (mpa_rev == 0)
 		abort_connection(ep, NULL, GFP_KERNEL);
@@ -2314,6 +2390,7 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 	BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD);
 	BUG_ON(!qp);
 
+	set_bit(ULP_ACCEPT, &ep->com.history);
 	if ((conn_param->ord > c4iw_max_read_depth) ||
 	    (conn_param->ird > c4iw_max_read_depth)) {
 		abort_connection(ep, NULL, GFP_KERNEL);
@@ -2437,6 +2514,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 		err = -ENOMEM;
 		goto fail2;
 	}
+	insert_handle(dev, &dev->atid_idr, ep, ep->atid);
 
 	PDBG("%s saddr 0x%x sport 0x%x raddr 0x%x rport 0x%x\n", __func__,
 	     ntohl(cm_id->local_addr.sin_addr.s_addr),
@@ -2482,6 +2560,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 fail4:
 	dst_release(ep->dst);
 fail3:
+	remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid);
 	cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
 fail2:
 	cm_id->rem_ref(cm_id);
@@ -2524,6 +2603,7 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog)
 		err = -ENOMEM;
 		goto fail2;
 	}
+	insert_handle(dev, &dev->stid_idr, ep, ep->stid);
 	state_set(&ep->com, LISTEN);
 	if (dev->rdev.lldi.enable_fw_ofld_conn) {
 		do {
@@ -2531,7 +2611,10 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog)
 				ep->com.dev->rdev.lldi.ports[0], ep->stid,
 				ep->com.local_addr.sin_addr.s_addr,
 				ep->com.local_addr.sin_port,
-				ep->com.dev->rdev.lldi.rxq_ids[0]);
+				0,
+				ep->com.dev->rdev.lldi.rxq_ids[0],
+				0,
+				0);
 			if (err == -EBUSY) {
 				set_current_state(TASK_UNINTERRUPTIBLE);
 				schedule_timeout(usecs_to_jiffies(100));
@@ -2542,6 +2625,7 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog)
 		err = cxgb4_create_server(ep->com.dev->rdev.lldi.ports[0],
 				ep->stid, ep->com.local_addr.sin_addr.s_addr,
 				ep->com.local_addr.sin_port,
+				0,
 				ep->com.dev->rdev.lldi.rxq_ids[0]);
 		if (!err)
 			err = c4iw_wait_for_reply(&ep->com.dev->rdev,
@@ -2587,6 +2671,7 @@ int c4iw_destroy_listen(struct iw_cm_id *cm_id)
 		err = c4iw_wait_for_reply(&ep->com.dev->rdev, &ep->com.wr_wait,
 					  0, 0, __func__);
 	}
+	remove_handle(ep->com.dev, &ep->com.dev->stid_idr, ep->stid);
 	cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, PF_INET);
 done:
 	cm_id->rem_ref(cm_id);
@@ -2650,10 +2735,13 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)
 
 	if (close) {
 		if (abrupt) {
+			set_bit(EP_DISC_ABORT, &ep->com.history);
 			close_complete_upcall(ep);
 			ret = send_abort(ep, NULL, gfp);
-		} else
+		} else {
+			set_bit(EP_DISC_CLOSE, &ep->com.history);
 			ret = send_halfclose(ep, gfp);
+		}
 		if (ret)
 			fatal = 1;
 	}
@@ -2667,6 +2755,7 @@ static void active_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb,
 			struct cpl_fw6_msg_ofld_connection_wr_rpl *req)
 {
 	struct c4iw_ep *ep;
+	int atid = be32_to_cpu(req->tid);
 
 	ep = (struct c4iw_ep *)lookup_atid(dev->rdev.lldi.tids, req->tid);
 	if (!ep)
@@ -2674,15 +2763,35 @@ static void active_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb,
 
 	switch (req->retval) {
 	case FW_ENOMEM:
+		set_bit(ACT_RETRY_NOMEM, &ep->com.history);
+		if (ep->retry_count++ < ACT_OPEN_RETRY_COUNT) {
+			send_fw_act_open_req(ep, atid);
+			return;
+		}
 	case FW_EADDRINUSE:
-		PDBG("%s ofld conn wr ret %d\n", __func__, req->retval);
+		set_bit(ACT_RETRY_INUSE, &ep->com.history);
+		if (ep->retry_count++ < ACT_OPEN_RETRY_COUNT) {
+			send_fw_act_open_req(ep, atid);
+			return;
+		}
 		break;
 	default:
 		pr_info("%s unexpected ofld conn wr retval %d\n",
 		       __func__, req->retval);
 		break;
 	}
+	pr_err("active ofld_connect_wr failure %d atid %d\n",
+	       req->retval, atid);
+	mutex_lock(&dev->rdev.stats.lock);
+	dev->rdev.stats.act_ofld_conn_fails++;
+	mutex_unlock(&dev->rdev.stats.lock);
 	connect_reply_upcall(ep, status2errno(req->retval));
+	state_set(&ep->com, DEAD);
+	remove_handle(dev, &dev->atid_idr, atid);
+	cxgb4_free_atid(dev->rdev.lldi.tids, atid);
+	dst_release(ep->dst);
+	cxgb4_l2t_release(ep->l2t);
+	c4iw_put_ep(&ep->com);
 }
 
 static void passive_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb,
@@ -2696,6 +2805,9 @@ static void passive_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb,
 	BUG_ON(!rpl_skb);
 	if (req->retval) {
 		PDBG("%s passive open failure %d\n", __func__, req->retval);
+		mutex_lock(&dev->rdev.stats.lock);
+		dev->rdev.stats.pas_ofld_conn_fails++;
+		mutex_unlock(&dev->rdev.stats.lock);
 		kfree_skb(rpl_skb);
 	} else {
 		cpl = (struct cpl_pass_accept_req *)cplhdr(rpl_skb);
@@ -2989,6 +3101,7 @@ static void process_timeout(struct c4iw_ep *ep)
 	mutex_lock(&ep->com.mutex);
 	PDBG("%s ep %p tid %u state %d\n", __func__, ep, ep->hwtid,
 	     ep->com.state);
+	set_bit(TIMEDOUT, &ep->com.history);
 	switch (ep->com.state) {
 	case MPA_REQ_SENT:
 		__state_set(&ep->com, ABORTING);
diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
index 6b5b3d1..ba11c76 100644
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -280,6 +280,10 @@ static int stats_show(struct seq_file *seq, void *v)
 		   db_state_str[dev->db_state],
 		   dev->rdev.stats.db_state_transitions);
 	seq_printf(seq, "TCAM_FULL: %10llu\n", dev->rdev.stats.tcam_full);
+	seq_printf(seq, "ACT_OFLD_CONN_FAILS: %10llu\n",
+		   dev->rdev.stats.act_ofld_conn_fails);
+	seq_printf(seq, "PAS_OFLD_CONN_FAILS: %10llu\n",
+		   dev->rdev.stats.pas_ofld_conn_fails);
 	return 0;
 }
 
@@ -310,6 +314,9 @@ static ssize_t stats_clear(struct file *file, const char __user *buf,
 	dev->rdev.stats.db_empty = 0;
 	dev->rdev.stats.db_drop = 0;
 	dev->rdev.stats.db_state_transitions = 0;
+	dev->rdev.stats.tcam_full = 0;
+	dev->rdev.stats.act_ofld_conn_fails = 0;
+	dev->rdev.stats.pas_ofld_conn_fails = 0;
 	mutex_unlock(&dev->rdev.stats.lock);
 	return count;
 }
@@ -323,6 +330,113 @@ static const struct file_operations stats_debugfs_fops = {
 	.write   = stats_clear,
 };
 
+static int dump_ep(int id, void *p, void *data)
+{
+	struct c4iw_ep *ep = p;
+	struct c4iw_debugfs_data *epd = data;
+	int space;
+	int cc;
+
+	space = epd->bufsize - epd->pos - 1;
+	if (space == 0)
+		return 1;
+
+	cc = snprintf(epd->buf + epd->pos, space,
+			"ep %p cm_id %p qp %p state %d flags 0x%lx history 0x%lx "
+			"hwtid %d atid %d %pI4:%d <-> %pI4:%d\n",
+			ep, ep->com.cm_id, ep->com.qp, (int)ep->com.state,
+			ep->com.flags, ep->com.history, ep->hwtid, ep->atid,
+			&ep->com.local_addr.sin_addr.s_addr,
+			ntohs(ep->com.local_addr.sin_port),
+			&ep->com.remote_addr.sin_addr.s_addr,
+			ntohs(ep->com.remote_addr.sin_port));
+	if (cc < space)
+		epd->pos += cc;
+	return 0;
+}
+
+static int dump_listen_ep(int id, void *p, void *data)
+{
+	struct c4iw_listen_ep *ep = p;
+	struct c4iw_debugfs_data *epd = data;
+	int space;
+	int cc;
+
+	space = epd->bufsize - epd->pos - 1;
+	if (space == 0)
+		return 1;
+
+	cc = snprintf(epd->buf + epd->pos, space,
+			"ep %p cm_id %p state %d flags 0x%lx stid %d backlog %d "
+			"%pI4:%d\n", ep, ep->com.cm_id, (int)ep->com.state,
+			ep->com.flags, ep->stid, ep->backlog,
+			&ep->com.local_addr.sin_addr.s_addr,
+			ntohs(ep->com.local_addr.sin_port));
+	if (cc < space)
+		epd->pos += cc;
+	return 0;
+}
+
+static int ep_release(struct inode *inode, struct file *file)
+{
+	struct c4iw_debugfs_data *epd = file->private_data;
+	if (!epd) {
+		pr_info("%s null qpd?\n", __func__);
+		return 0;
+	}
+	vfree(epd->buf);
+	kfree(epd);
+	return 0;
+}
+
+static int ep_open(struct inode *inode, struct file *file)
+{
+	struct c4iw_debugfs_data *epd;
+	int ret = 0;
+	int count = 1;
+
+	epd = kmalloc(sizeof(*epd), GFP_KERNEL);
+	if (!epd) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	epd->devp = inode->i_private;
+	epd->pos = 0;
+
+	spin_lock_irq(&epd->devp->lock);
+	idr_for_each(&epd->devp->hwtid_idr, count_idrs, &count);
+	idr_for_each(&epd->devp->atid_idr, count_idrs, &count);
+	idr_for_each(&epd->devp->stid_idr, count_idrs, &count);
+	spin_unlock_irq(&epd->devp->lock);
+
+	epd->bufsize = count * 160;
+	epd->buf = vmalloc(epd->bufsize);
+	if (!epd->buf) {
+		ret = -ENOMEM;
+		goto err1;
+	}
+
+	spin_lock_irq(&epd->devp->lock);
+	idr_for_each(&epd->devp->hwtid_idr, dump_ep, epd);
+	idr_for_each(&epd->devp->atid_idr, dump_ep, epd);
+	idr_for_each(&epd->devp->stid_idr, dump_listen_ep, epd);
+	spin_unlock_irq(&epd->devp->lock);
+
+	file->private_data = epd;
+	goto out;
+err1:
+	kfree(epd);
+out:
+	return ret;
+}
+
+static const struct file_operations ep_debugfs_fops = {
+	.owner   = THIS_MODULE,
+	.open    = ep_open,
+	.release = ep_release,
+	.read    = debugfs_read,
+};
+
 static int setup_debugfs(struct c4iw_dev *devp)
 {
 	struct dentry *de;
@@ -345,6 +459,11 @@ static int setup_debugfs(struct c4iw_dev *devp)
 	if (de && de->d_inode)
 		de->d_inode->i_size = 4096;
 
+	de = debugfs_create_file("eps", S_IWUSR, devp->debugfs_root,
+			(void *)devp, &ep_debugfs_fops);
+	if (de && de->d_inode)
+		de->d_inode->i_size = 4096;
+
 	return 0;
 }
 
@@ -476,6 +595,9 @@ static void c4iw_dealloc(struct uld_ctx *ctx)
 	idr_destroy(&ctx->dev->cqidr);
 	idr_destroy(&ctx->dev->qpidr);
 	idr_destroy(&ctx->dev->mmidr);
+	idr_destroy(&ctx->dev->hwtid_idr);
+	idr_destroy(&ctx->dev->stid_idr);
+	idr_destroy(&ctx->dev->atid_idr);
 	iounmap(ctx->dev->rdev.oc_mw_kva);
 	ib_dealloc_device(&ctx->dev->ibdev);
 	ctx->dev = NULL;
@@ -533,6 +655,9 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop)
 	idr_init(&devp->cqidr);
 	idr_init(&devp->qpidr);
 	idr_init(&devp->mmidr);
+	idr_init(&devp->hwtid_idr);
+	idr_init(&devp->stid_idr);
+	idr_init(&devp->atid_idr);
 	spin_lock_init(&devp->lock);
 	mutex_init(&devp->rdev.stats.lock);
 	mutex_init(&devp->db_mutex);
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index 6a17fde..9c1644f 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -131,6 +131,8 @@ struct c4iw_stats {
 	u64  db_drop;
 	u64  db_state_transitions;
 	u64  tcam_full;
+	u64  act_ofld_conn_fails;
+	u64  pas_ofld_conn_fails;
 };
 
 struct c4iw_rdev {
@@ -224,6 +226,9 @@ struct c4iw_dev {
 	struct dentry *debugfs_root;
 	enum db_state db_state;
 	int qpcnt;
+	struct idr hwtid_idr;
+	struct idr atid_idr;
+	struct idr stid_idr;
 };
 
 static inline struct c4iw_dev *to_c4iw_dev(struct ib_device *ibdev)
@@ -713,6 +718,31 @@ enum c4iw_ep_flags {
 	CLOSE_SENT		= 3,
 };
 
+enum c4iw_ep_history {
+	ACT_OPEN_REQ            = 0,
+	ACT_OFLD_CONN           = 1,
+	ACT_OPEN_RPL            = 2,
+	ACT_ESTAB               = 3,
+	PASS_ACCEPT_REQ         = 4,
+	PASS_ESTAB              = 5,
+	ABORT_UPCALL            = 6,
+	ESTAB_UPCALL            = 7,
+	CLOSE_UPCALL            = 8,
+	ULP_ACCEPT              = 9,
+	ULP_REJECT              = 10,
+	TIMEDOUT                = 11,
+	PEER_ABORT              = 12,
+	PEER_CLOSE              = 13,
+	CONNREQ_UPCALL          = 14,
+	ABORT_CONN              = 15,
+	DISCONN_UPCALL          = 16,
+	EP_DISC_CLOSE           = 17,
+	EP_DISC_ABORT           = 18,
+	CONN_RPL_UPCALL         = 19,
+	ACT_RETRY_NOMEM         = 20,
+	ACT_RETRY_INUSE         = 21
+};
+
 struct c4iw_ep_common {
 	struct iw_cm_id *cm_id;
 	struct c4iw_qp *qp;
@@ -724,6 +754,7 @@ struct c4iw_ep_common {
 	struct sockaddr_in remote_addr;
 	struct c4iw_wr_wait wr_wait;
 	unsigned long flags;
+	unsigned long history;
 };
 
 struct c4iw_listen_ep {
@@ -761,6 +792,7 @@ struct c4iw_ep {
 	u8 tos;
 	u8 retry_with_mpa_v1;
 	u8 tried_with_mpa_v1;
+	unsigned int retry_count;
 };
 
 static inline struct c4iw_ep *to_ep(struct iw_cm_id *cm_id)
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index b5d299d..682d490 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -530,6 +530,7 @@ struct adapter {
 	struct net_device *port[MAX_NPORTS];
 	u8 chan_map[NCHAN];                   /* channel -> port map */
 
+	u32 filter_mode;
 	unsigned int l2t_start;
 	unsigned int l2t_end;
 	struct l2t_data *l2t;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index b9ce750..97ab36f 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -2688,7 +2688,8 @@ static int tid_init(struct tid_info *t)
  *	Returns <0 on error and one of the %NET_XMIT_* values on success.
  */
 int cxgb4_create_server(const struct net_device *dev, unsigned int stid,
-			__be32 sip, __be16 sport, unsigned int queue)
+			__be32 sip, __be16 sport, __be16 vlan,
+			unsigned int queue)
 {
 	unsigned int chan;
 	struct sk_buff *skb;
@@ -3061,7 +3062,7 @@ static void uld_attach(struct adapter *adap, unsigned int uld)
 	lli.ucq_density = 1 << QUEUESPERPAGEPF0_GET(
 			t4_read_reg(adap, SGE_INGRESS_QUEUES_PER_PAGE_PF) >>
 			(adap->fn * 4));
-	lli.filt_mode = tp_vlan_pri_map;
+	lli.filt_mode = adap->filter_mode;
 	/* MODQ_REQ_MAP sets queues 0-3 to chan 0-3 */
 	for (i = 0; i < NCHAN; i++)
 		lli.tx_modq[i] = i;
@@ -3327,7 +3328,8 @@ static int delete_filter(struct adapter *adapter, unsigned int fidx)
 }
 
 int cxgb4_create_server_filter(const struct net_device *dev, unsigned int stid,
-		__be32 sip, __be16 sport, unsigned int queue)
+		__be32 sip, __be16 sport, __be16 vlan,
+		unsigned int queue, unsigned char port, unsigned char mask)
 {
 	int ret;
 	struct filter_entry *f;
@@ -3361,11 +3363,16 @@ int cxgb4_create_server_filter(const struct net_device *dev, unsigned int stid,
 	f->fs.val.lport = cpu_to_be16(sport);
 	f->fs.mask.lport  = ~0;
 	val = (u8 *)&sip;
-	if ((val[0] | val[1] | val[2] | val[3]) != 0)
+	if ((val[0] | val[1] | val[2] | val[3]) != 0) {
 		for (i = 0; i < 4; i++) {
 			f->fs.val.lip[i] = val[i];
 			f->fs.mask.lip[i] = ~0;
 		}
+		if (adap->filter_mode & F_PORT) {
+			f->fs.val.iport = port;
+			f->fs.mask.iport = mask;
+		}
+	}
 
 	f->fs.dirsteer = 1;
 	f->fs.iq = queue;
@@ -4472,6 +4479,10 @@ static int adap_init0(struct adapter *adap)
 	for (j = 0; j < NCHAN; j++)
 		adap->params.tp.tx_modq[j] = j;
 
+	t4_read_indirect(adap, TP_PIO_ADDR, TP_PIO_DATA,
+			 &adap->filter_mode, 1,
+			 TP_VLAN_PRI_MAP);
+
 	adap->flags |= FW_OK;
 	return 0;
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
index 065bbd5..e2bbc7f 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
@@ -38,6 +38,7 @@
 #include <linux/cache.h>
 #include <linux/spinlock.h>
 #include <linux/skbuff.h>
+#include <linux/inetdevice.h>
 #include <linux/atomic.h>
 
 /* CPL message priority levels */
@@ -151,9 +152,12 @@ void cxgb4_remove_tid(struct tid_info *t, unsigned int qid, unsigned int tid);
 struct in6_addr;
 
 int cxgb4_create_server(const struct net_device *dev, unsigned int stid,
-			__be32 sip, __be16 sport, unsigned int queue);
+			__be32 sip, __be16 sport, __be16 vlan,
+			unsigned int queue);
 int cxgb4_create_server_filter(const struct net_device *dev, unsigned int stid,
-			       __be32 sip, __be16 sport, unsigned int queue);
+			       __be32 sip, __be16 sport, __be16 vlan,
+			       unsigned int queue,
+			       unsigned char port, unsigned char mask);
 int cxgb4_remove_server_filter(const struct net_device *dev, unsigned int stid,
 			       unsigned int queue, bool ipv6);
 static inline void set_wr_txq(struct sk_buff *skb, int prio, int queue)
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
index bdc1bbc..3139221 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
@@ -1066,5 +1066,7 @@
 #define A_TP_TX_SCHED_PCMD 0x25
 
 #define S_PORT    1
+#define V_PORT(x) ((x) << S_PORT)
+#define F_PORT    V_PORT(1U)
 
 #endif /* __T4_REGS_H */
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* Re: [PATCH] solos-pci: don't call vcc->pop() after pclose()
From: Krzysztof Mazur @ 2012-11-29 14:55 UTC (permalink / raw)
  To: David Woodhouse
  Cc: David Laight, chas williams - CONTRACTOR, davem, netdev,
	linux-kernel, nathan
In-Reply-To: <1354200137.21562.167.camel@shinybook.infradead.org>

On Thu, Nov 29, 2012 at 02:42:17PM +0000, David Woodhouse wrote:
> On Thu, 2012-11-29 at 14:20 +0100, Krzysztof Mazur wrote:
> >                 if (card->tx_skb[port] == skb) {
> >                         skb_get(skb);
> >                         solos_pop(SKB_CB(skb)->vcc, skb);
> >                         SKB_CB(skb)->vcc = NULL;
> 
> Um... yes, that would probably work. But it's subtle enough that it
> bothers me. And if it *did* cause any strange issues, it's a rare case
> and would be hard to reproduce/debug. Is it *really* necessary, just to
> speed up the vcc close? I'm inclined to stick with the 'KISS' approach.

That's why I proposed that simple loop that just calls ->pop() and
sets vcc to NULL.

Forget about that, Chas said that we cannot leave close() until we
close that vcc, so we really need to wait.

Krzysiek

^ permalink raw reply

* [PATCH 0/5] Add LE hash collision bug fix for active and passive offloaded connections
From: Vipul Pandya @ 2012-11-29 14:52 UTC (permalink / raw)
  To: linux-rdma, netdev
  Cc: roland, davem, divy, dm, kumaras, swise, abhishek, Vipul Pandya

This patch series fixes the LE hash collision issue in cxgb4 and RDMA/cxgb4
drivers in kernel.org.

If the hash functionality is enabled in T4 then tuple information of active and
passive offloaded connections are stored in DDR3 memory. LE (Lookup Engine)
implements the interface to search this tuple entries using hash algorithm. To
avoid LE hash collision, firmware provides new mechanisms to setup active and
passive open connections.

In case of active open connection, firmware detects LE hash collision situation
and notifies driver. Driver uses fw_ofld_connection work request to offload
that connection. Its tuple information will be stored in TCAM memory array. 

In case of passive open connection, server filter region is created in TCAM.
This region stores the filter which will redirect the incoming SYN packet to
offload queues. After this driver tries to establish the connection using
firmware work request.

This patch series also adds framework for managing filters and to use T4's
filter capabilities.

Following testing has been carried out successfully on this patch series.
1. 1000 active open connections created and saw that tcam_full counter is
getting incremented through debugfs file.
2. Multiple passive open connections were created and ran the same test
repetatively to verify server filter is getting created and deleted properly.

The patch-series is based on Roland's infiniband tree (for-next branch),
and involves changes on cxgb4 and RDMA/cxgb4 drivers. Both linux-rdma and netdev
are included in this post for review.

We have some more bug fixes in RDMA/cxgb4 after this patch series. So, we would
like to request this series to get mereged through Roland's infiniband for-next
tree.

Thanks,
Vipul Pandya

Vipul Pandya (5):
  cxgb4: Add T4 filter support
  cxgb4: Add LE hash collision bug fix path in LLD driver
  RDMA/cxgb4: Fix LE hash collision bug for active open connection
  RDMA/cxgb4: Fix LE hash collision bug for passive open connection
  RDMA/cxgb4: Fix bug for active and passive LE hash collision path

 drivers/infiniband/hw/cxgb4/cm.c                |  789 +++++++++++++++++++---
 drivers/infiniband/hw/cxgb4/device.c            |  210 ++++++-
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h          |   33 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h      |  146 +++++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c |  473 +++++++++++++-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h  |   23 +-
 drivers/net/ethernet/chelsio/cxgb4/l2t.c        |   34 +
 drivers/net/ethernet/chelsio/cxgb4/l2t.h        |    3 +
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c      |   23 +-
 drivers/net/ethernet/chelsio/cxgb4/t4_msg.h     |   66 ++
 drivers/net/ethernet/chelsio/cxgb4/t4_regs.h    |   37 ++
 drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h   |  388 +++++++++++
 12 files changed, 2100 insertions(+), 125 deletions(-)

^ permalink raw reply

* [PATCH 4/5] RDMA/cxgb4: Fix LE hash collision bug for passive open connection
From: Vipul Pandya @ 2012-11-29 14:52 UTC (permalink / raw)
  To: linux-rdma, netdev
  Cc: roland, davem, divy, dm, kumaras, swise, abhishek, Vipul Pandya
In-Reply-To: <1354200745-23598-1-git-send-email-vipul@chelsio.com>

It establishes passive open connection through firmware work request. Passive
open connection will go through this path as now instead of listening server we
create a server filter which will redirect the incoming SYN packet to the
offload queue. After this driver tries to establish the connection using
firmware work request.

Signed-off-by: Vipul Pandya <vipul@chelsio.com>
---
 drivers/infiniband/hw/cxgb4/cm.c                |  411 ++++++++++++++++++++---
 drivers/infiniband/hw/cxgb4/device.c            |   85 +++++-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c |    9 +
 drivers/net/ethernet/chelsio/cxgb4/t4_msg.h     |   46 +++
 4 files changed, 496 insertions(+), 55 deletions(-)

diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 560ca39..b5840b4 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -38,10 +38,12 @@
 #include <linux/inetdevice.h>
 #include <linux/ip.h>
 #include <linux/tcp.h>
+#include <linux/if_vlan.h>
 
 #include <net/neighbour.h>
 #include <net/netevent.h>
 #include <net/route.h>
+#include <net/tcp.h>
 
 #include "iw_cxgb4.h"
 
@@ -1570,13 +1572,14 @@ static int pass_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
 	struct c4iw_listen_ep *ep = lookup_stid(t, stid);
 
 	if (!ep) {
-		printk(KERN_ERR MOD "stid %d lookup failure!\n", stid);
-		return 0;
+		PDBG("%s stid %d lookup failure!\n", __func__, stid);
+		goto out;
 	}
 	PDBG("%s ep %p status %d error %d\n", __func__, ep,
 	     rpl->status, status2errno(rpl->status));
 	c4iw_wake_up(&ep->com.wr_wait, status2errno(rpl->status));
 
+out:
 	return 0;
 }
 
@@ -1780,15 +1783,23 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
 	unsigned int hwtid = GET_TID(req);
 	struct dst_entry *dst;
 	struct rtable *rt;
-	__be32 local_ip, peer_ip;
+	__be32 local_ip, peer_ip = 0;
 	__be16 local_port, peer_port;
 	int err;
+	u16 peer_mss = ntohs(req->tcpopt.mss);
 
 	parent_ep = lookup_stid(t, stid);
-	PDBG("%s parent ep %p tid %u\n", __func__, parent_ep, hwtid);
-
+	if (!parent_ep) {
+		PDBG("%s connect request on invalid stid %d\n", __func__, stid);
+		goto reject;
+	}
 	get_4tuple(req, &local_ip, &peer_ip, &local_port, &peer_port);
 
+	PDBG("%s parent ep %p hwtid %u laddr 0x%x raddr 0x%x lport %d " \
+	     "rport %d peer_mss %d\n", __func__, parent_ep, hwtid,
+	     ntohl(local_ip), ntohl(peer_ip), ntohs(local_port),
+	     ntohs(peer_port), peer_mss);
+
 	if (state_read(&parent_ep->com) != LISTEN) {
 		printk(KERN_ERR "%s - listening ep not in LISTEN\n",
 		       __func__);
@@ -1822,6 +1833,9 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
 		goto reject;
 	}
 
+	if (peer_mss && child_ep->mtu > (peer_mss + 40))
+		child_ep->mtu = peer_mss + 40;
+
 	state_set(&child_ep->com, CONNECTING);
 	child_ep->com.dev = dev;
 	child_ep->com.cm_id = NULL;
@@ -1862,6 +1876,9 @@ static int pass_establish(struct c4iw_dev *dev, struct sk_buff *skb)
 	ep->snd_seq = be32_to_cpu(req->snd_isn);
 	ep->rcv_seq = be32_to_cpu(req->rcv_isn);
 
+	PDBG("%s ep %p hwtid %u tcp_opt 0x%02x\n", __func__, ep, tid,
+	     ntohs(req->tcp_opt));
+
 	set_emss(ep, ntohs(req->tcp_opt));
 
 	dst_confirm(ep->dst);
@@ -2479,7 +2496,6 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog)
 	struct c4iw_dev *dev = to_c4iw_dev(cm_id->device);
 	struct c4iw_listen_ep *ep;
 
-
 	might_sleep();
 
 	ep = alloc_ep(sizeof(*ep), GFP_KERNEL);
@@ -2498,30 +2514,49 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog)
 	/*
 	 * Allocate a server TID.
 	 */
-	ep->stid = cxgb4_alloc_stid(dev->rdev.lldi.tids, PF_INET, ep);
+	if (dev->rdev.lldi.enable_fw_ofld_conn)
+		ep->stid = cxgb4_alloc_sftid(dev->rdev.lldi.tids, PF_INET, ep);
+	else
+		ep->stid = cxgb4_alloc_stid(dev->rdev.lldi.tids, PF_INET, ep);
+
 	if (ep->stid == -1) {
 		printk(KERN_ERR MOD "%s - cannot alloc stid.\n", __func__);
 		err = -ENOMEM;
 		goto fail2;
 	}
-
 	state_set(&ep->com, LISTEN);
-	c4iw_init_wr_wait(&ep->com.wr_wait);
-	err = cxgb4_create_server(ep->com.dev->rdev.lldi.ports[0], ep->stid,
-				  ep->com.local_addr.sin_addr.s_addr,
-				  ep->com.local_addr.sin_port,
-				  ep->com.dev->rdev.lldi.rxq_ids[0]);
-	if (err)
-		goto fail3;
-
-	/* wait for pass_open_rpl */
-	err = c4iw_wait_for_reply(&ep->com.dev->rdev, &ep->com.wr_wait, 0, 0,
-				  __func__);
+	if (dev->rdev.lldi.enable_fw_ofld_conn) {
+		do {
+			err = cxgb4_create_server_filter(
+				ep->com.dev->rdev.lldi.ports[0], ep->stid,
+				ep->com.local_addr.sin_addr.s_addr,
+				ep->com.local_addr.sin_port,
+				ep->com.dev->rdev.lldi.rxq_ids[0]);
+			if (err == -EBUSY) {
+				set_current_state(TASK_UNINTERRUPTIBLE);
+				schedule_timeout(usecs_to_jiffies(100));
+			}
+		} while (err == -EBUSY);
+	} else {
+		c4iw_init_wr_wait(&ep->com.wr_wait);
+		err = cxgb4_create_server(ep->com.dev->rdev.lldi.ports[0],
+				ep->stid, ep->com.local_addr.sin_addr.s_addr,
+				ep->com.local_addr.sin_port,
+				ep->com.dev->rdev.lldi.rxq_ids[0]);
+		if (!err)
+			err = c4iw_wait_for_reply(&ep->com.dev->rdev,
+						  &ep->com.wr_wait,
+						  0, 0, __func__);
+	}
 	if (!err) {
 		cm_id->provider_data = ep;
 		goto out;
 	}
-fail3:
+	pr_err("%s cxgb4_create_server/filter failed err %d " \
+	       "stid %d laddr %08x lport %d\n", \
+	       __func__, err, ep->stid,
+	       ntohl(ep->com.local_addr.sin_addr.s_addr),
+	       ntohs(ep->com.local_addr.sin_port));
 	cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, PF_INET);
 fail2:
 	cm_id->rem_ref(cm_id);
@@ -2540,12 +2575,18 @@ int c4iw_destroy_listen(struct iw_cm_id *cm_id)
 
 	might_sleep();
 	state_set(&ep->com, DEAD);
-	c4iw_init_wr_wait(&ep->com.wr_wait);
-	err = listen_stop(ep);
-	if (err)
-		goto done;
-	err = c4iw_wait_for_reply(&ep->com.dev->rdev, &ep->com.wr_wait, 0, 0,
-				  __func__);
+	if (ep->com.dev->rdev.lldi.enable_fw_ofld_conn) {
+		err = cxgb4_remove_server_filter(
+			ep->com.dev->rdev.lldi.ports[0], ep->stid,
+			ep->com.dev->rdev.lldi.rxq_ids[0], 0);
+	} else {
+		c4iw_init_wr_wait(&ep->com.wr_wait);
+		err = listen_stop(ep);
+		if (err)
+			goto done;
+		err = c4iw_wait_for_reply(&ep->com.dev->rdev, &ep->com.wr_wait,
+					  0, 0, __func__);
+	}
 	cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, PF_INET);
 done:
 	cm_id->rem_ref(cm_id);
@@ -2622,10 +2663,298 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)
 	return ret;
 }
 
-static int async_event(struct c4iw_dev *dev, struct sk_buff *skb)
+static void active_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb,
+			struct cpl_fw6_msg_ofld_connection_wr_rpl *req)
+{
+	struct c4iw_ep *ep;
+
+	ep = (struct c4iw_ep *)lookup_atid(dev->rdev.lldi.tids, req->tid);
+	if (!ep)
+		return;
+
+	switch (req->retval) {
+	case FW_ENOMEM:
+	case FW_EADDRINUSE:
+		PDBG("%s ofld conn wr ret %d\n", __func__, req->retval);
+		break;
+	default:
+		pr_info("%s unexpected ofld conn wr retval %d\n",
+		       __func__, req->retval);
+		break;
+	}
+	connect_reply_upcall(ep, status2errno(req->retval));
+}
+
+static void passive_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb,
+			struct cpl_fw6_msg_ofld_connection_wr_rpl *req)
+{
+	struct sk_buff *rpl_skb;
+	struct cpl_pass_accept_req *cpl;
+	int ret;
+
+	rpl_skb = (struct sk_buff *)cpu_to_be64(req->cookie);
+	BUG_ON(!rpl_skb);
+	if (req->retval) {
+		PDBG("%s passive open failure %d\n", __func__, req->retval);
+		kfree_skb(rpl_skb);
+	} else {
+		cpl = (struct cpl_pass_accept_req *)cplhdr(rpl_skb);
+		OPCODE_TID(cpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_REQ,
+						      htonl(req->tid)));
+		ret = pass_accept_req(dev, rpl_skb);
+		if (!ret)
+			kfree_skb(rpl_skb);
+	}
+	return;
+}
+
+static int deferred_fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb)
 {
 	struct cpl_fw6_msg *rpl = cplhdr(skb);
-	c4iw_ev_dispatch(dev, (struct t4_cqe *)&rpl->data[0]);
+	struct cpl_fw6_msg_ofld_connection_wr_rpl *req;
+
+	switch (rpl->type) {
+	case FW6_TYPE_CQE:
+		c4iw_ev_dispatch(dev, (struct t4_cqe *)&rpl->data[0]);
+		break;
+	case FW6_TYPE_OFLD_CONNECTION_WR_RPL:
+		req = (struct cpl_fw6_msg_ofld_connection_wr_rpl *)rpl->data;
+		switch (req->t_state) {
+		case TCP_SYN_SENT:
+			active_ofld_conn_reply(dev, skb, req);
+			break;
+		case TCP_SYN_RECV:
+			passive_ofld_conn_reply(dev, skb, req);
+			break;
+		default:
+			pr_err("%s unexpected ofld conn wr state %d\n",
+			       __func__, req->t_state);
+			break;
+		}
+		break;
+	}
+	return 0;
+}
+
+static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos)
+{
+	u32 l2info;
+	u16 vlantag, len, hdr_len;
+	u8 intf;
+	struct cpl_rx_pkt *cpl = cplhdr(skb);
+	struct cpl_pass_accept_req *req;
+	struct tcp_options_received tmp_opt;
+
+	/* Store values from cpl_rx_pkt in temporary location. */
+	vlantag = cpl->vlan;
+	len = cpl->len;
+	l2info  = cpl->l2info;
+	hdr_len = cpl->hdr_len;
+	intf = cpl->iff;
+
+	__skb_pull(skb, sizeof(*req) + sizeof(struct rss_header));
+
+	/* We need to parse the TCP options from SYN packet.
+	 * to generate cpl_pass_accept_req.
+	 */
+	memset(&tmp_opt, 0, sizeof(tmp_opt));
+	tcp_clear_options(&tmp_opt);
+	tcp_parse_options(skb, &tmp_opt, 0, 0, NULL);
+
+	req = (struct cpl_pass_accept_req *)__skb_push(skb, sizeof(*req));
+	memset(req, 0, sizeof(*req));
+	req->l2info = cpu_to_be16(V_SYN_INTF(intf) |
+			 V_SYN_MAC_IDX(G_RX_MACIDX(htonl(l2info))) |
+			 F_SYN_XACT_MATCH);
+	req->hdr_len = cpu_to_be32(V_SYN_RX_CHAN(G_RX_CHAN(htonl(l2info))) |
+				V_TCP_HDR_LEN(G_RX_TCPHDR_LEN(htons(hdr_len))) |
+				V_IP_HDR_LEN(G_RX_IPHDR_LEN(htons(hdr_len))) |
+				V_ETH_HDR_LEN(G_RX_ETHHDR_LEN(htonl(l2info))));
+	req->vlan = vlantag;
+	req->len = len;
+	req->tos_stid = cpu_to_be32(PASS_OPEN_TID(stid) |
+				    PASS_OPEN_TOS(tos));
+	req->tcpopt.mss = htons(tmp_opt.mss_clamp);
+	if (tmp_opt.wscale_ok)
+		req->tcpopt.wsf = tmp_opt.snd_wscale;
+	req->tcpopt.tstamp = tmp_opt.saw_tstamp;
+	if (tmp_opt.sack_ok)
+		req->tcpopt.sack = 1;
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_REQ, 0));
+	return;
+}
+
+static void send_fw_pass_open_req(struct c4iw_dev *dev, struct sk_buff *skb,
+				  __be32 laddr, __be16 lport,
+				  __be32 raddr, __be16 rport,
+				  u32 rcv_isn, u32 filter, u16 window,
+				  u32 rss_qid, u8 port_id)
+{
+	struct sk_buff *req_skb;
+	struct fw_ofld_connection_wr *req;
+	struct cpl_pass_accept_req *cpl = cplhdr(skb);
+
+	req_skb = alloc_skb(sizeof(struct fw_ofld_connection_wr), GFP_KERNEL);
+	req = (struct fw_ofld_connection_wr *)__skb_put(req_skb, sizeof(*req));
+	memset(req, 0, sizeof(*req));
+	req->op_compl = htonl(V_WR_OP(FW_OFLD_CONNECTION_WR) | FW_WR_COMPL(1));
+	req->len16_pkd = htonl(FW_WR_LEN16(DIV_ROUND_UP(sizeof(*req), 16)));
+	req->le.version_cpl = htonl(F_FW_OFLD_CONNECTION_WR_CPL);
+	req->le.filter = filter;
+	req->le.lport = lport;
+	req->le.pport = rport;
+	req->le.u.ipv4.lip = laddr;
+	req->le.u.ipv4.pip = raddr;
+	req->tcb.rcv_nxt = htonl(rcv_isn + 1);
+	req->tcb.rcv_adv = htons(window);
+	req->tcb.t_state_to_astid =
+		 htonl(V_FW_OFLD_CONNECTION_WR_T_STATE(TCP_SYN_RECV) |
+			V_FW_OFLD_CONNECTION_WR_RCV_SCALE(cpl->tcpopt.wsf) |
+			V_FW_OFLD_CONNECTION_WR_ASTID(
+			GET_PASS_OPEN_TID(ntohl(cpl->tos_stid))));
+
+	/*
+	 * We store the qid in opt2 which will be used by the firmware
+	 * to send us the wr response.
+	 */
+	req->tcb.opt2 = htonl(V_RSS_QUEUE(rss_qid));
+
+	/*
+	 * We initialize the MSS index in TCB to 0xF.
+	 * So that when driver sends cpl_pass_accept_rpl
+	 * TCB picks up the correct value. If this was 0
+	 * TP will ignore any value > 0 for MSS index.
+	 */
+	req->tcb.opt0 = cpu_to_be64(V_MSS_IDX(0xF));
+	req->cookie = cpu_to_be64((u64)skb);
+
+	set_wr_txq(req_skb, CPL_PRIORITY_CONTROL, port_id);
+	cxgb4_ofld_send(dev->rdev.lldi.ports[0], req_skb);
+}
+
+/*
+ * Handler for CPL_RX_PKT message. Need to handle cpl_rx_pkt
+ * messages when a filter is being used instead of server to
+ * redirect a syn packet. When packets hit filter they are redirected
+ * to the offload queue and driver tries to establish the connection
+ * using firmware work request.
+ */
+static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	int stid;
+	unsigned int filter;
+	struct ethhdr *eh = NULL;
+	struct vlan_ethhdr *vlan_eh = NULL;
+	struct iphdr *iph;
+	struct tcphdr *tcph;
+	struct rss_header *rss = (void *)skb->data;
+	struct cpl_rx_pkt *cpl = (void *)skb->data;
+	struct cpl_pass_accept_req *req = (void *)(rss + 1);
+	struct l2t_entry *e;
+	struct dst_entry *dst;
+	struct rtable *rt;
+	struct c4iw_ep *lep;
+	u16 window;
+	struct port_info *pi;
+	struct net_device *pdev;
+	u16 rss_qid;
+	int step;
+	u32 tx_chan;
+	struct neighbour *neigh;
+
+	/* Drop all non-SYN packets */
+	if (!(cpl->l2info & cpu_to_be32(F_RXF_SYN)))
+		goto reject;
+
+	/*
+	 * Drop all packets which did not hit the filter.
+	 * Unlikely to happen.
+	 */
+	if (!(rss->filter_hit && rss->filter_tid))
+		goto reject;
+
+	/*
+	 * Calculate the server tid from filter hit index from cpl_rx_pkt.
+	 */
+	stid = cpu_to_be32(rss->hash_val) - dev->rdev.lldi.tids->sftid_base
+					  + dev->rdev.lldi.tids->nstids;
+
+	lep = (struct c4iw_ep *)lookup_stid(dev->rdev.lldi.tids, stid);
+	if (!lep) {
+		PDBG("%s connect request on invalid stid %d\n", __func__, stid);
+		goto reject;
+	}
+
+	if (G_RX_ETHHDR_LEN(ntohl(cpl->l2info)) == ETH_HLEN) {
+		eh = (struct ethhdr *)(req + 1);
+		iph = (struct iphdr *)(eh + 1);
+	} else {
+		vlan_eh = (struct vlan_ethhdr *)(req + 1);
+		iph = (struct iphdr *)(vlan_eh + 1);
+		skb->vlan_tci = ntohs(cpl->vlan);
+	}
+
+	if (iph->version != 0x4)
+		goto reject;
+
+	tcph = (struct tcphdr *)(iph + 1);
+	skb_set_network_header(skb, (void *)iph - (void *)rss);
+	skb_set_transport_header(skb, (void *)tcph - (void *)rss);
+	skb_get(skb);
+
+	PDBG("%s lip 0x%x lport %u pip 0x%x pport %u tos %d\n", __func__,
+	     ntohl(iph->daddr), ntohs(tcph->dest), ntohl(iph->saddr),
+	     ntohs(tcph->source), iph->tos);
+
+	rt = find_route(dev, iph->daddr, iph->saddr, tcph->dest, tcph->source,
+			iph->tos);
+	if (!rt) {
+		pr_err("%s - failed to find dst entry!\n",
+		       __func__);
+		goto reject;
+	}
+	dst = &rt->dst;
+	neigh = dst_neigh_lookup_skb(dst, skb);
+
+	if (neigh->dev->flags & IFF_LOOPBACK) {
+		pdev = ip_dev_find(&init_net, iph->daddr);
+		e = cxgb4_l2t_get(dev->rdev.lldi.l2t, neigh,
+				    pdev, 0);
+		pi = (struct port_info *)netdev_priv(pdev);
+		tx_chan = cxgb4_port_chan(pdev);
+		dev_put(pdev);
+	} else {
+		e = cxgb4_l2t_get(dev->rdev.lldi.l2t, neigh,
+					neigh->dev, 0);
+		pi = (struct port_info *)netdev_priv(neigh->dev);
+		tx_chan = cxgb4_port_chan(neigh->dev);
+	}
+	if (!e) {
+		pr_err("%s - failed to allocate l2t entry!\n",
+		       __func__);
+		goto free_dst;
+	}
+
+	step = dev->rdev.lldi.nrxq / dev->rdev.lldi.nchan;
+	rss_qid = dev->rdev.lldi.rxq_ids[pi->port_id * step];
+	window = htons(tcph->window);
+
+	/* Calcuate filter portion for LE region. */
+	filter = cpu_to_be32(select_ntuple(dev, dst, e));
+
+	/*
+	 * Synthesize the cpl_pass_accept_req. We have everything except the
+	 * TID. Once firmware sends a reply with TID we update the TID field
+	 * in cpl and pass it through the regular cpl_pass_accept_req path.
+	 */
+	build_cpl_pass_accept_req(skb, stid, iph->tos);
+	send_fw_pass_open_req(dev, skb, iph->daddr, tcph->dest, iph->saddr,
+			      tcph->source, ntohl(tcph->seq), filter, window,
+			      rss_qid, pi->port_id);
+	cxgb4_l2t_release(e);
+free_dst:
+	dst_release(dst);
+reject:
 	return 0;
 }
 
@@ -2648,7 +2977,8 @@ static c4iw_handler_func work_handlers[NUM_CPL_CMDS] = {
 	[CPL_CLOSE_CON_RPL] = close_con_rpl,
 	[CPL_RDMA_TERMINATE] = terminate,
 	[CPL_FW4_ACK] = fw4_ack,
-	[CPL_FW6_MSG] = async_event
+	[CPL_FW6_MSG] = deferred_fw6_msg,
+	[CPL_RX_PKT] = rx_pkt
 };
 
 static void process_timeout(struct c4iw_ep *ep)
@@ -2776,9 +3106,6 @@ static int fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb)
 	struct cpl_fw6_msg *rpl = cplhdr(skb);
 	struct c4iw_wr_wait *wr_waitp;
 	int ret;
-	u8 opcode;
-	struct cpl_fw6_msg_ofld_connection_wr_rpl *req;
-	struct c4iw_ep *ep;
 
 	PDBG("%s type %u\n", __func__, rpl->type);
 
@@ -2792,23 +3119,8 @@ static int fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb)
 		kfree_skb(skb);
 		break;
 	case FW6_TYPE_CQE:
-		sched(dev, skb);
-		break;
 	case FW6_TYPE_OFLD_CONNECTION_WR_RPL:
-		opcode = *(const u8 *)rpl->data;
-		if (opcode == FW_OFLD_CONNECTION_WR) {
-			req =
-			(struct cpl_fw6_msg_ofld_connection_wr_rpl *)rpl->data;
-			if (req->t_state == TCP_SYN_SENT
-			    && (req->retval == FW_ENOMEM
-				|| req->retval == FW_EADDRINUSE)) {
-				ep = (struct c4iw_ep *)
-				     lookup_atid(dev->rdev.lldi.tids,
-						 req->tid);
-				c4iw_l2t_send(&dev->rdev, skb, ep->l2t);
-				return 0;
-			}
-		}
+		sched(dev, skb);
 		break;
 	default:
 		printk(KERN_ERR MOD "%s unexpected fw6 msg type %u\n", __func__,
@@ -2870,7 +3182,8 @@ c4iw_handler_func c4iw_handlers[NUM_CPL_CMDS] = {
 	[CPL_RDMA_TERMINATE] = sched,
 	[CPL_FW4_ACK] = sched,
 	[CPL_SET_TCB_RPL] = set_tcb_rpl,
-	[CPL_FW6_MSG] = fw6_msg
+	[CPL_FW6_MSG] = fw6_msg,
+	[CPL_RX_PKT] = sched
 };
 
 int __init c4iw_cm_init(void)
diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
index cb4ecd7..6b5b3d1 100644
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -279,6 +279,7 @@ static int stats_show(struct seq_file *seq, void *v)
 	seq_printf(seq, " DB State: %s Transitions %llu\n",
 		   db_state_str[dev->db_state],
 		   dev->rdev.stats.db_state_transitions);
+	seq_printf(seq, "TCAM_FULL: %10llu\n", dev->rdev.stats.tcam_full);
 	return 0;
 }
 
@@ -577,14 +578,76 @@ out:
 	return ctx;
 }
 
+static inline struct sk_buff *copy_gl_to_skb_pkt(const struct pkt_gl *gl,
+						 const __be64 *rsp,
+						 u32 pktshift)
+{
+	struct sk_buff *skb;
+
+	/*
+	 * Allocate space for cpl_pass_accept_req which will be synthesized by
+	 * driver. Once the driver synthesizes the request the skb will go
+	 * through the regular cpl_pass_accept_req processing.
+	 * The math here assumes sizeof cpl_pass_accept_req >= sizeof
+	 * cpl_rx_pkt.
+	 */
+	skb = alloc_skb(gl->tot_len + sizeof(struct cpl_pass_accept_req) +
+			sizeof(struct rss_header) - pktshift, GFP_ATOMIC);
+	if (unlikely(!skb))
+		return NULL;
+
+	 __skb_put(skb, gl->tot_len + sizeof(struct cpl_pass_accept_req) +
+		   sizeof(struct rss_header) - pktshift);
+
+	/*
+	 * This skb will contain:
+	 *   rss_header from the rspq descriptor (1 flit)
+	 *   cpl_rx_pkt struct from the rspq descriptor (2 flits)
+	 *   space for the difference between the size of an
+	 *      rx_pkt and pass_accept_req cpl (1 flit)
+	 *   the packet data from the gl
+	 */
+	skb_copy_to_linear_data(skb, rsp, sizeof(struct cpl_pass_accept_req) +
+				sizeof(struct rss_header));
+	skb_copy_to_linear_data_offset(skb, sizeof(struct rss_header) +
+				       sizeof(struct cpl_pass_accept_req),
+				       gl->va + pktshift,
+				       gl->tot_len - pktshift);
+	return skb;
+}
+
+static inline int recv_rx_pkt(struct c4iw_dev *dev, const struct pkt_gl *gl,
+			   const __be64 *rsp)
+{
+	unsigned int opcode = *(u8 *)rsp;
+	struct sk_buff *skb;
+
+	if (opcode != CPL_RX_PKT)
+		goto out;
+
+	skb = copy_gl_to_skb_pkt(gl , rsp, dev->rdev.lldi.sge_pktshift);
+	if (skb == NULL)
+		goto out;
+
+	if (c4iw_handlers[opcode] == NULL) {
+		pr_info("%s no handler opcode 0x%x...\n", __func__,
+		       opcode);
+		kfree_skb(skb);
+		goto out;
+	}
+	c4iw_handlers[opcode](dev, skb);
+	return 1;
+out:
+	return 0;
+}
+
 static int c4iw_uld_rx_handler(void *handle, const __be64 *rsp,
 			const struct pkt_gl *gl)
 {
 	struct uld_ctx *ctx = handle;
 	struct c4iw_dev *dev = ctx->dev;
 	struct sk_buff *skb;
-	const struct cpl_act_establish *rpl;
-	unsigned int opcode;
+	u8 opcode;
 
 	if (gl == NULL) {
 		/* omit RSS and rsp_ctrl at end of descriptor */
@@ -601,19 +664,29 @@ static int c4iw_uld_rx_handler(void *handle, const __be64 *rsp,
 		u32 qid = be32_to_cpu(rc->pldbuflen_qid);
 		c4iw_ev_handler(dev, qid);
 		return 0;
+	} else if (unlikely(*(u8 *)rsp != *(u8 *)gl->va)) {
+		if (recv_rx_pkt(dev, gl, rsp))
+			return 0;
+
+		pr_info("%s: unexpected FL contents at %p, " \
+		       "RSS %#llx, FL %#llx, len %u\n",
+		       pci_name(ctx->lldi.pdev), gl->va,
+		       (unsigned long long)be64_to_cpu(*rsp),
+		       (unsigned long long)be64_to_cpu(*(u64 *)gl->va),
+		       gl->tot_len);
+
+		return 0;
 	} else {
 		skb = cxgb4_pktgl_to_skb(gl, 128, 128);
 		if (unlikely(!skb))
 			goto nomem;
 	}
 
-	rpl = cplhdr(skb);
-	opcode = rpl->ot.opcode;
-
+	opcode = *(u8 *)rsp;
 	if (c4iw_handlers[opcode])
 		c4iw_handlers[opcode](dev, skb);
 	else
-		printk(KERN_INFO "%s no handler opcode 0x%x...\n", __func__,
+		pr_info("%s no handler opcode 0x%x...\n", __func__,
 		       opcode);
 
 	return 0;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index d0cd65b..b9ce750 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -3337,6 +3337,10 @@ int cxgb4_create_server_filter(const struct net_device *dev, unsigned int stid,
 
 	adap = netdev2adap(dev);
 
+	/* Adjust stid to correct filter index */
+	stid -= adap->tids.nstids;
+	stid += adap->tids.nftids;
+
 	/**
 	 * Check to make sure the filter requested is writable ...
 	 */
@@ -3387,6 +3391,11 @@ int cxgb4_remove_server_filter(const struct net_device *dev, unsigned int stid,
 	struct adapter *adap;
 
 	adap = netdev2adap(dev);
+
+	/* Adjust stid to correct filter index */
+	stid -= adap->tids.nstids;
+	stid += adap->tids.nftids;
+
 	f = &adap->tids.ftid_tab[stid];
 	/* Unlock the filter */
 	f->locked = 0;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
index 5b51c01..1179616 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
@@ -199,6 +199,18 @@ struct work_request_hdr {
 
 #define WR_HDR struct work_request_hdr wr
 
+/* option 0 fields */
+#define S_MSS_IDX    60
+#define M_MSS_IDX    0xF
+#define V_MSS_IDX(x) ((__u64)(x) << S_MSS_IDX)
+#define G_MSS_IDX(x) (((x) >> S_MSS_IDX) & M_MSS_IDX)
+
+/* option 2 fields */
+#define S_RSS_QUEUE    0
+#define M_RSS_QUEUE    0x3FF
+#define V_RSS_QUEUE(x) ((x) << S_RSS_QUEUE)
+#define G_RSS_QUEUE(x) (((x) >> S_RSS_QUEUE) & M_RSS_QUEUE)
+
 struct cpl_pass_open_req {
 	WR_HDR;
 	union opcode_tid ot;
@@ -300,6 +312,9 @@ struct cpl_pass_establish {
 	union opcode_tid ot;
 	__be32 rsvd;
 	__be32 tos_stid;
+#define PASS_OPEN_TID(x) ((x) << 0)
+#define PASS_OPEN_TOS(x) ((x) << 24)
+#define GET_PASS_OPEN_TID(x)	(((x) >> 0) & 0xFFFFFF)
 #define GET_POPEN_TID(x) ((x) & 0xffffff)
 #define GET_POPEN_TOS(x) (((x) >> 24) & 0xff)
 	__be16 mac_idx;
@@ -545,6 +560,37 @@ struct cpl_rx_pkt {
 	__be16 err_vec;
 };
 
+/* rx_pkt.l2info fields */
+#define S_RX_ETHHDR_LEN    0
+#define M_RX_ETHHDR_LEN    0x1F
+#define V_RX_ETHHDR_LEN(x) ((x) << S_RX_ETHHDR_LEN)
+#define G_RX_ETHHDR_LEN(x) (((x) >> S_RX_ETHHDR_LEN) & M_RX_ETHHDR_LEN)
+
+#define S_RX_MACIDX    8
+#define M_RX_MACIDX    0x1FF
+#define V_RX_MACIDX(x) ((x) << S_RX_MACIDX)
+#define G_RX_MACIDX(x) (((x) >> S_RX_MACIDX) & M_RX_MACIDX)
+
+#define S_RXF_SYN    21
+#define V_RXF_SYN(x) ((x) << S_RXF_SYN)
+#define F_RXF_SYN    V_RXF_SYN(1U)
+
+#define S_RX_CHAN    28
+#define M_RX_CHAN    0xF
+#define V_RX_CHAN(x) ((x) << S_RX_CHAN)
+#define G_RX_CHAN(x) (((x) >> S_RX_CHAN) & M_RX_CHAN)
+
+/* rx_pkt.hdr_len fields */
+#define S_RX_TCPHDR_LEN    0
+#define M_RX_TCPHDR_LEN    0x3F
+#define V_RX_TCPHDR_LEN(x) ((x) << S_RX_TCPHDR_LEN)
+#define G_RX_TCPHDR_LEN(x) (((x) >> S_RX_TCPHDR_LEN) & M_RX_TCPHDR_LEN)
+
+#define S_RX_IPHDR_LEN    6
+#define M_RX_IPHDR_LEN    0x3FF
+#define V_RX_IPHDR_LEN(x) ((x) << S_RX_IPHDR_LEN)
+#define G_RX_IPHDR_LEN(x) (((x) >> S_RX_IPHDR_LEN) & M_RX_IPHDR_LEN)
+
 struct cpl_trace_pkt {
 	u8 opcode;
 	u8 intf;
-- 
1.7.1

^ permalink raw reply related

* Re: [PATCH v2 3/3] pppoatm: protect against freeing of vcc
From: Krzysztof Mazur @ 2012-11-29 15:09 UTC (permalink / raw)
  To: David Woodhouse
  Cc: David Laight, chas williams - CONTRACTOR, davem, netdev,
	linux-kernel, nathan
In-Reply-To: <1354141115.21562.101.camel@shinybook.infradead.org>

On Wed, Nov 28, 2012 at 10:18:35PM +0000, David Woodhouse wrote:
> On Wed, 2012-11-28 at 09:21 +0000, David Laight wrote:
> > Even when it might make sense to sleep in close until tx drains
> > there needs to be a finite timeout before it become abortive.
> 
> You are, of course, right. We should never wait for hardware for ever.
> And just to serve me right, I seem to have hit a bug in the latest Solos
> firmware (1.11) which makes it sometimes lock up when I reboot. So it
> never responds to the PKT_PCLOSE packet... and thus it deadlocks when I
> try to kill pppd and unload the module to reset it :)
> 
> New version...
> 
> From 53dd01c08fec5b26006a009b25e4210127fdb27a Mon Sep 17 00:00:00 2001
> From: David Woodhouse <David.Woodhouse@intel.com>
> Date: Tue, 27 Nov 2012 23:49:24 +0000
> Subject: [PATCH] solos-pci: Wait for pending TX to complete when releasing
>  vcc
> 
> We should no longer be calling the old pop routine for the vcc, after
> vcc_release() has completed. Make sure we wait for any pending TX skbs
> to complete, by waiting for our own PKT_PCLOSE control skb to be sent.
> 
> Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
> ---
>  drivers/atm/solos-pci.c | 17 +++++++++++++++--
>  1 file changed, 15 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/atm/solos-pci.c b/drivers/atm/solos-pci.c
> index 9851093..3720670 100644
> --- a/drivers/atm/solos-pci.c
> +++ b/drivers/atm/solos-pci.c
> @@ -92,6 +92,7 @@ struct pkt_hdr {
>  };
>  
>  struct solos_skb_cb {
> +	struct completion c;
>  	struct atm_vcc *vcc;
>  	uint32_t dma_addr;
>  };
> @@ -881,11 +882,18 @@ static void pclose(struct atm_vcc *vcc)
>  	header->vci = cpu_to_le16(vcc->vci);
>  	header->type = cpu_to_le16(PKT_PCLOSE);
>  
> +	init_completion(&SKB_CB(skb)->c);
> +
>  	fpga_queue(card, SOLOS_CHAN(vcc->dev), skb, NULL);
>  
>  	clear_bit(ATM_VF_ADDR, &vcc->flags);
>  	clear_bit(ATM_VF_READY, &vcc->flags);
>  
> +	if (!wait_for_completion_timeout(&SKB_CB(skb)->c,
> +					 jiffies + msecs_to_jiffies(5000)))
> +		dev_warn(&card->dev->dev, "Timeout waiting for VCC close on port %d\n",
> +			 SOLOS_CHAN(vcc->dev));
> +

I don't like two thinks about this patch:

	- if allos_skb(sizeof(*header), GFP_ATOMIC) at beginning of
	  pclose() fails we will crash

	- if card wakes up after this timeout we will probably crash too

That's why proposed different approach, but it has other problems.

Krzysiek

^ permalink raw reply

* Re: TCP and reordering
From: Noel Grandin @ 2012-11-29 15:10 UTC (permalink / raw)
  To: netdev
In-Reply-To: <1354125688.21562.95.camel@shinybook.infradead.org>

David Woodhouse <dwmw2 <at> infradead.org> writes:
> On Wed, 2012-11-28 at 09:16 -0800, Eric Dumazet wrote:
> > BQL was really something to control/limit queueing on ethernet links,
> > not for stacked devices, as stacked devices normally have no queue.
> 
> Stacked devices have more queue than anything else :)
> 

Surely BQL is something that should only be implemented on the device at the 
bottom of the stack, and everything above that shouldn't be bothering?

^ permalink raw reply

* Re: [PATCH v2 3/3] pppoatm: protect against freeing of vcc
From: chas williams - CONTRACTOR @ 2012-11-29 15:37 UTC (permalink / raw)
  To: David Woodhouse
  Cc: David Laight, Krzysztof Mazur, davem, netdev, linux-kernel,
	nathan
In-Reply-To: <1354141115.21562.101.camel@shinybook.infradead.org>

On Wed, 28 Nov 2012 22:18:35 +0000
David Woodhouse <dwmw2@infradead.org> wrote:

> diff --git a/drivers/atm/solos-pci.c b/drivers/atm/solos-pci.c
> index 9851093..3720670 100644
> --- a/drivers/atm/solos-pci.c
> +++ b/drivers/atm/solos-pci.c
> @@ -92,6 +92,7 @@ struct pkt_hdr {
>  };
>  
>  struct solos_skb_cb {
> +	struct completion c;
>  	struct atm_vcc *vcc;
>  	uint32_t dma_addr;
>  };
> @@ -881,11 +882,18 @@ static void pclose(struct atm_vcc *vcc)
>  	header->vci = cpu_to_le16(vcc->vci);
>  	header->type = cpu_to_le16(PKT_PCLOSE);
>  
> +	init_completion(&SKB_CB(skb)->c);
> +
>  	fpga_queue(card, SOLOS_CHAN(vcc->dev), skb, NULL);
>  
>  	clear_bit(ATM_VF_ADDR, &vcc->flags);
>  	clear_bit(ATM_VF_READY, &vcc->flags);

you shouldnt clear ATM_VF_ADDR until the vpi/vci is actually closed and
ready for reuse.  at this point, it isnt. ATM_VF_READY should already
be clear at this point but you should set it before you queue your
PKT_CLOSE.  these flags probably should be handled outside the drivers
since the context for them is pretty clear.  just another patch i never
got around to writing...

checking for ATM_VF_READY in find_vcc() is probably going to give you
grief as well since ATM_VF_READY isnt entirely under your control.  you
need to be able to find the vcc until after pclose() is finished since
your tasklet might have a few packets it is still processing?

^ permalink raw reply

* Re: [PATCH v2 3/3] pppoatm: protect against freeing of vcc
From: David Woodhouse @ 2012-11-29 15:47 UTC (permalink / raw)
  To: Krzysztof Mazur
  Cc: David Laight, chas williams - CONTRACTOR, davem, netdev,
	linux-kernel, nathan
In-Reply-To: <20121129150945.GB16478@shrek.podlesie.net>

[-- Attachment #1: Type: text/plain, Size: 4457 bytes --]

On Thu, 2012-11-29 at 16:09 +0100, Krzysztof Mazur wrote:
> 
> I don't like two thinks about this patch:
> 
>         - if allos_skb(sizeof(*header), GFP_ATOMIC) at beginning of
>           pclose() fails we will crash
> 
>         - if card wakes up after this timeout we will probably crash too
> 
> That's why proposed different approach, but it has other problems.

How about this variant on what you suggested. Yes, we can definitely
remove everything that's in the queue... as long as we use
skb_queue_walk_safe() instead of skb_queue_walk().

We can use GFP_KERNEL instead of GFP_ATOMIC, which at least reduces the
likelihood of failing to close the vcc.

We end up waiting *only* if there is a packet which is *currently* being
DMA'd to the card. And if the card doesn't take that within 5 seconds,
it almost certainly never will. So I can live with that.

I'd definitely want someone with a DMA-capable FPGA to test this
properly, adding printks to it to make sure the interesting path is
being exercised. Nathan, you should be able to trigger it with the same
test that used to just crash the system entirely — send a flood of
packets while you kill br2684ctl.

diff --git a/drivers/atm/solos-pci.c b/drivers/atm/solos-pci.c
index 6258961..2006a39 100644
--- a/drivers/atm/solos-pci.c
+++ b/drivers/atm/solos-pci.c
@@ -92,7 +92,6 @@ struct pkt_hdr {
 };
 
 struct solos_skb_cb {
-	struct completion c;
 	struct atm_vcc *vcc;
 	uint32_t dma_addr;
 };
@@ -868,10 +867,11 @@ static int popen(struct atm_vcc *vcc)
 static void pclose(struct atm_vcc *vcc)
 {
 	struct solos_card *card = vcc->dev->dev_data;
-	struct sk_buff *skb;
+	struct sk_buff *skb, *tmpskb;
 	struct pkt_hdr *header;
+	unsigned char port = SOLOS_CHAN(vcc->dev);
 
-	skb = alloc_skb(sizeof(*header), GFP_ATOMIC);
+	skb = alloc_skb(sizeof(*header), GFP_KERNEL);
 	if (!skb) {
 		dev_warn(&card->dev->dev, "Failed to allocate sk_buff in pclose()\n");
 		return;
@@ -883,22 +883,50 @@ static void pclose(struct atm_vcc *vcc)
 	header->vci = cpu_to_le16(vcc->vci);
 	header->type = cpu_to_le16(PKT_PCLOSE);
 
-	init_completion(&SKB_CB(skb)->c);
-
 	fpga_queue(card, SOLOS_CHAN(vcc->dev), skb, NULL);
 
 	clear_bit(ATM_VF_ADDR, &vcc->flags);
 	clear_bit(ATM_VF_READY, &vcc->flags);
 
-	if (!wait_for_completion_timeout(&SKB_CB(skb)->c,
-					 msecs_to_jiffies(5000)))
-		dev_warn(&card->dev->dev, "Timeout waiting for VCC close on port %d\n",
-			 SOLOS_CHAN(vcc->dev));
+	/* Remove any yet-to-be-transmitted packets from the pending queue */
+	spin_lock(&card->tx_queue_lock);
+	skb_queue_walk_safe(&card->tx_queue[port], skb, tmpskb) {
+		if (SKB_CB(skb)->vcc == vcc) {
+			skb_unlink(skb, &card->tx_queue[port]);
+			solos_pop(vcc, skb);
+		}
+	}
+	spin_unlock(&card->tx_queue_lock);
 
 	/* Hold up vcc_destroy_socket() (our caller) until solos_bh() in the
 	   tasklet has finished processing any incoming packets (and, more to
 	   the point, using the vcc pointer). */
 	tasklet_unlock_wait(&card->tlet);
+
+	/*
+	 * If we're in DMA mode and a skb on this VCC is *currently* being
+	 * submitted, wait for it to finish (using param_wq)
+	 */
+	if (card->using_dma) {
+		DEFINE_WAIT(wait);
+
+		spin_lock(&card->tx_lock);
+		while (card->tx_skb[port] && SKB_CB(card->tx_skb[port])->vcc == vcc) {
+			prepare_to_wait(&card->param_wq, &wait, TASK_UNINTERRUPTIBLE);
+			spin_unlock(&card->tx_lock);
+
+			if (schedule_timeout(5 * HZ)) {
+				dev_warn(&card->dev->dev,
+					 "Timeout waiting for VCC close on port %d\n",
+					 port);
+				goto done_waiting;
+			}
+			spin_lock(&card->tx_lock);
+		}
+		spin_unlock(&card->tx_lock);
+	done_waiting:
+		finish_wait(&card->param_wq, &wait);
+	}
 	return;
 }
 
@@ -1020,12 +1048,15 @@ static uint32_t fpga_tx(struct solos_card *card)
 			if (vcc) {
 				atomic_inc(&vcc->stats->tx);
 				solos_pop(vcc, oldskb);
-			} else {
-				struct pkt_hdr *header = (void *)oldskb->data;
-				if (le16_to_cpu(header->type) == PKT_PCLOSE)
-					complete(&SKB_CB(oldskb)->c);
+
+				/*
+				 * If it's a TX skb on a closed VCC, pclose()
+				 * may be waiting for it...
+				 */
+				if (!test_bit(ATM_VF_READY, &vcc->flags))
+					wake_up(&card->param_wq);
+			} else
 				dev_kfree_skb_irq(oldskb);
-			}
 		}
 	}
 	/* For non-DMA TX, write the 'TX start' bit for all four ports simultaneously */


-- 
dwmw2


[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 6171 bytes --]

^ permalink raw reply related

* RE: [E1000-devel] 82571EB: Detected Hardware Unit Hang
From: Fujinaka, Todd @ 2012-11-29 15:52 UTC (permalink / raw)
  To: Ethan Zhao
  Cc: Joe Jin, Ben Hutchings, Mary Mcgrath, netdev@vger.kernel.org,
	e1000-devel@lists.sf.net, linux-kernel@vger.kernel.org, linux-pci
In-Reply-To: <CABawtvPrz=z_CLCvPeXkky7COyhjPyfBu3QCTbiiPpvr+5bicg@mail.gmail.com>

Someone else pointed this out to me locally. If you have a non-client BIOS, you should be able to set the MaxPayloadSize using setpci. You have to make sure that you're being consistent throughout all the associated links.

Todd Fujinaka
Technical Marketing Engineer
LAN Access Division (LAD)
Intel Corporation
todd.fujinaka@intel.com
(503) 712-4565


-----Original Message-----
From: Ethan Zhao [mailto:ethan.kernel@gmail.com] 
Sent: Wednesday, November 28, 2012 7:10 PM
To: Fujinaka, Todd
Cc: Joe Jin; Ben Hutchings; Mary Mcgrath; netdev@vger.kernel.org; e1000-devel@lists.sf.net; linux-kernel@vger.kernel.org; linux-pci
Subject: Re: [E1000-devel] 82571EB: Detected Hardware Unit Hang

Joe,
    Possibly your customer is running a kernel without source code on a platform whose vendor wouldn't like to fix BIOS issue( Is that a HP/Dell server ?).
    Anyway, to see if is a payload issue or,  you could change the payload size with setpci tool to those devices and set the link retrain bit to trigger the link retraining to debug the issue and identity the root cause.  I thinks it is much easier than modify the BIOS or  eeprom of NIC.

    e.g.
   set device control register to 0f 00   (128 bytes payload size)
   #   setpci -v -s 00:02.0 98.w=000f
   set device link control register to 60h (retrain the link)
   #  setpci -v -s 00:02.0 a0.b=60

  Hope it works,  Just my 2 cents.

Ethan.zhao@oracle.com

On Wed, Nov 28, 2012 at 11:53 PM, Fujinaka, Todd <todd.fujinaka@intel.com> wrote:
> The only EEPROM I know about or can speak to is the one attached to the 82571 and it doesn't set the MaxPayloadSize. That's done by the BIOS.
>
> Todd Fujinaka
> Technical Marketing Engineer
> LAN Access Division (LAD)
> Intel Corporation
> todd.fujinaka@intel.com
> (503) 712-4565
>
>
> -----Original Message-----
> From: Joe Jin [mailto:joe.jin@oracle.com]
> Sent: Wednesday, November 28, 2012 12:31 AM
> To: Ben Hutchings
> Cc: Fujinaka, Todd; Mary Mcgrath; netdev@vger.kernel.org; 
> e1000-devel@lists.sf.net; linux-kernel@vger.kernel.org; linux-pci
> Subject: Re: [E1000-devel] 82571EB: Detected Hardware Unit Hang
>
> On 11/28/12 02:10, Ben Hutchings wrote:
>> On Tue, 2012-11-27 at 17:32 +0000, Fujinaka, Todd wrote:
>>> Forgive me if I'm being too repetitious as I think some of this has 
>>> been mentioned in the past.
>>>
>>> We (and by we I mean the Ethernet part and driver) can only change 
>>> the advertised availability of a larger MaxPayloadSize. The size is 
>>> negotiated by both sides of the link when the link is established.
>>> The driver should not change the size of the link as it would be 
>>> poking at registers outside of its scope and is controlled by the 
>>> upstream bridge (not us).
>> [...]
>>
>> MaxPayloadSize (MPS) is not negotiated between devices but is 
>> programmed by the system firmware (at least for devices present at 
>> boot - the kernel may be responsible in case of hotplug).  You can 
>> use the kernel parameter 'pci=pcie_bus_perf' (or one of several 
>> others) to set a policy that overrides this, but no policy will allow 
>> setting MPS above the device's MaxPayloadSizeSupported (MPSS).
>>
>
> Ben,
>
> Unfortunately I'm using 3.0.x kernel and this is not included in the kernel.
> So I'm trying to use ethtool modify it from eeprom to see if help or no.
>
>
> Todd, I'll review all MaxPayload for all devices, but need to say if it mismatch, customer could not modify it from BIOS for there was not entry at there, to test it, we have to find how to verify if this is the root cause, so still need to find the offset in eeprom.
>
> Thanks in advance,
> Joe
>

^ permalink raw reply

* Re: [PATCH v2 3/3] pppoatm: protect against freeing of vcc
From: David Woodhouse @ 2012-11-29 15:59 UTC (permalink / raw)
  To: chas williams - CONTRACTOR
  Cc: David Laight, Krzysztof Mazur, davem, netdev, linux-kernel,
	nathan
In-Reply-To: <20121129103744.38149dc6@thirdoffive.cmf.nrl.navy.mil>

[-- Attachment #1: Type: text/plain, Size: 1756 bytes --]

On Thu, 2012-11-29 at 10:37 -0500, chas williams - CONTRACTOR wrote:
> you shouldnt clear ATM_VF_ADDR until the vpi/vci is actually closed and
> ready for reuse.  at this point, it isnt.

So I should always wait for the completion of my PKT_CLOSE and only
clear ATM_VF_ADDR when it's actually done?

But can you define 'ready for reuse'? From the moment I clear
ATM_VF_ADDR, another CPU may enter my popen() function to set up another
VCC with the same parameters, and everything should work fine. The
PKT_POPEN will end up on the queue *after* my PKT_PCLOSE for the old
VCC. Any received packets will be dropped until the new VCC gets
ATM_VF_READY set (by the popen function).

What's the actual failure mode, caused by me clearing ATM_VF_ADDR "too
early"?

>  ATM_VF_READY should already be clear at this point but you should set
> it before you queue your PKT_CLOSE. 

I should *set* it? Do you mean clear it? Yes, I see it's cleared by
vcc_destroy_socket()... but all the other ATM drivers also seem to clear
it for themselves, and that would appear to be harmless.

> checking for ATM_VF_READY in find_vcc() is probably going to give you
> grief as well since ATM_VF_READY isnt entirely under your control. 

That's fine. If *anyone* has cleared ATM_VF_READY, I stop sending
packets up it. Or, more to the point, I stop using the damn thing at
all. See commit 1f6ea6e511e5ec730d8e88651da1b7b6e8fd1333.

>  you need to be able to find the vcc until after pclose() is finished since
> your tasklet might have a few packets it is still processing?

The whole point of that check is that the tasklet *won't* be able to
find it any more, and it'll just discard incoming packets for the
obsolescent VCC.

-- 
dwmw2

[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 6171 bytes --]

^ permalink raw reply

* Re: [PATCH v2 3/3] pppoatm: protect against freeing of vcc
From: chas williams - CONTRACTOR @ 2012-11-29 15:59 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Krzysztof Mazur, David Laight, davem, netdev, linux-kernel,
	nathan
In-Reply-To: <1354204077.21562.172.camel@shinybook.infradead.org>

On Thu, 29 Nov 2012 15:47:57 +0000
David Woodhouse <dwmw2@infradead.org> wrote:


> @@ -1020,12 +1048,15 @@ static uint32_t fpga_tx(struct solos_card *card)
>  			if (vcc) {
>  				atomic_inc(&vcc->stats->tx);
>  				solos_pop(vcc, oldskb);
> -			} else {
> -				struct pkt_hdr *header = (void *)oldskb->data;
> -				if (le16_to_cpu(header->type) == PKT_PCLOSE)
> -					complete(&SKB_CB(oldskb)->c);
> +
> +				/*
> +				 * If it's a TX skb on a closed VCC, pclose()
> +				 * may be waiting for it...
> +				 */
> +				if (!test_bit(ATM_VF_READY, &vcc->flags))
> +					wake_up(&card->param_wq);
> +			} else
>  				dev_kfree_skb_irq(oldskb);
> -			}

the part that bothers me (and i dont have the programmer's guide for
the solos hardware) is that you are watching for the PKT_PCLOSE to be
sent to the card.  shouldnt you be watching for the PKT_PCLOSE to be
returned from the card (assuming it does such a thing) so that you can
be assured that the tx/rx for this vpi/vci pair has been "stopped"?

^ permalink raw reply

* Re: [PATCH v2 3/3] pppoatm: protect against freeing of vcc
From: chas williams - CONTRACTOR @ 2012-11-29 16:11 UTC (permalink / raw)
  To: David Woodhouse
  Cc: David Laight, Krzysztof Mazur, davem, netdev, linux-kernel,
	nathan
In-Reply-To: <1354204748.21562.180.camel@shinybook.infradead.org>

On Thu, 29 Nov 2012 15:59:08 +0000
David Woodhouse <dwmw2@infradead.org> wrote:

> On Thu, 2012-11-29 at 10:37 -0500, chas williams - CONTRACTOR wrote:
> > you shouldnt clear ATM_VF_ADDR until the vpi/vci is actually closed and
> > ready for reuse.  at this point, it isnt.
> 
> So I should always wait for the completion of my PKT_CLOSE and only
> clear ATM_VF_ADDR when it's actually done?
> 
> But can you define 'ready for reuse'? From the moment I clear
> ATM_VF_ADDR, another CPU may enter my popen() function to set up another
> VCC with the same parameters, and everything should work fine. The
> PKT_POPEN will end up on the queue *after* my PKT_PCLOSE for the old
> VCC. Any received packets will be dropped until the new VCC gets
> ATM_VF_READY set (by the popen function).
> 
> What's the actual failure mode, caused by me clearing ATM_VF_ADDR "too
> early"?

there may not be one (due to serialization from other parts of the
atm stack) but you "shouldn't" clear ATM_VF_ADDR until the vpi/vci pair
is ready for reuse.  by reuse, i mean that any previous rx/tx data in
the vpi/vci segmentation hardware has been removed/cleared.

> >  ATM_VF_READY should already be clear at this point but you should set
> > it before you queue your PKT_CLOSE. 
> 
> I should *set* it? Do you mean clear it? Yes, I see it's cleared by

sorry, i did mean clear it.

> vcc_destroy_socket()... but all the other ATM drivers also seem to clear
> it for themselves, and that would appear to be harmless.

yeah, like i said, it is spuriously cleared in the drivers and should
probably just be moved to under the control of the next layer up
completely.  drivers/atm should just handle the hardware side, not the
software side.

> > checking for ATM_VF_READY in find_vcc() is probably going to give you
> > grief as well since ATM_VF_READY isnt entirely under your control. 
> 
> That's fine. If *anyone* has cleared ATM_VF_READY, I stop sending
> packets up it. Or, more to the point, I stop using the damn thing at
> all. See commit 1f6ea6e511e5ec730d8e88651da1b7b6e8fd1333.
> 
> >  you need to be able to find the vcc until after pclose() is finished since
> > your tasklet might have a few packets it is still processing?
> 
> The whole point of that check is that the tasklet *won't* be able to
> find it any more, and it'll just discard incoming packets for the
> obsolescent VCC.

that's fine as long as you understand this.  in the case of the he, i
needed to be able to find the vcc until close() is finished so that i
can wakeup the sleeper in the close() routine that is waiting for the
reassembly queue to be cleared/reset.  also, i still needed to find the
vcc for the tx side during close() since i still might need to pop()
skb's that are being sent during the close() while i am still trying to
get the hardware to shutdown the transmit dma engine.

^ permalink raw reply

* [net-next PATCH V2 0/9] net: fragmentation performance scalability on NUMA/SMP systems
From: Jesper Dangaard Brouer @ 2012-11-29 16:10 UTC (permalink / raw)
  To: Eric Dumazet, David S. Miller, Florian Westphal
  Cc: Jesper Dangaard Brouer, netdev, Pablo Neira Ayuso, Thomas Graf,
	Cong Wang, Patrick McHardy, Paul E. McKenney, Herbert Xu

This patchset implements significant performance improvements for
fragmentation handling in the kernel, with a focus on NUMA and SMP
based systems.

This is V2 of the patchset, previously send as an RFC patchset.
(Notice an extra patch is inserted after patch-05, thus shifting the
patch numbers.)

To reviewers:

 Please give me your signoff's or ack's, or comment on the code,
 explaining what I should change.

The fragmentation code today:

 The fragmentation code "protects" kernel resources, by implementing
 some memory resource limitation code.  This is centered around a
 global readers-writer lock, and (per network namespace) an atomic mem
 counter and a LRU (Least-Recently-Used) list.  (Although separate
 global variables and namespace resources, are kept for IPv4, IPv6
 and Netfilter reassembly.)

 The code tries to keep the memory usage between a high and low
 threshold (see: /proc/sys/net/ipv4/ipfrag_{high,low}_thresh).  The
 "evictor" code cleans up fragments, when the high threshold is
 exceeded, and stops only, when the low threshold is reached.

The scalability problem:

 Having a global/central variable for a resource limit is obviously a
 scalability issue on SMP systems, and even amplified on a NUMA based
 system.

 When profiling the code, the scalability problems appeared to be the
 readers-writer lock.  But, surprise, the primary scalability issue
 was caused by the global atomic mem limit counter, which, especially
 on NUMA systems, would prolong the time spend inside the
 readers-writer lock sections.  It is not trivial to remove the
 readers-writer lock, but it is possible to reduce the number of
 writer lock sections.

Testlab:

 My original big-testlab were based on four Intel based 10Gbit/s NICs
 on two identical Sandy-Bridge-E NUMA system.  The testlab
 used/available, while rebasing to net-next, were not as powerful.
 Its based on a single Sandy-Bridge-E NUMA system with the same Intel
 10G NICs, but the generator machine was an old Core-i7 920 with some
 older NICs. This means that I have not been able to generate full 4x
 10G wirespeed.  I have chosen (mostly) to include 2x 10G test results
 due to the generator machine (although the 4x 10G results from the
 big system looks more impressive).

 The tests are performed with netperf -t UDP_STREAM (which default
 send UDP packets with size 65507 bytes, which gets fragmented).  The
 netserver's get numactl pinned and the CPU sockets get smp_affinity
 aligned to the physical NIC connected to its own NUMA node.

Performance results:

 For the impressive 4x 10Gbit/s big-testlab results, performance goes
  from (a collective) 496 Mbit/s to 38463 Mbit/s (per stream 9615 Mbit/s)
  (at packet size 65507 bytes)

 For the results to be fair/meaningful, I'll report the used packet
 size, as (after the fixes) bigger UDP packets scale better, because
 smaller packets will require/create more frag queues to handle.

 I'll report packet size 65507 and three fragments 1472*3=4416 bytes.

 Disabled Ethernet Flow Control (via ethtool -A).  To show the real
 effect of the patches, the system needs to be in an "overload"
 situation.  When Ethernet Flow Control is enabled, the system will
 make the generator back-off, and the code path will be less stressed.
 Thus, I have disabled Ethernet Flow Control.

No patches:
 -------
 Results without any patches, and no flow control:

  2x10G size(65507) result:(7+50)     =57   Mbit/s (gen:9613+9473 Mbit/s)
  2x10G size(4416)  result:(3619+3772)=7391 Mbit/s (gen:8339+9105 Mbit/s)

 The very pure result with large frames is a result of the "evictor"
 code, which gets fixed in patch-01.

Patch-01: net: frag evictor, avoid killing warm frag queues
 -------
 The fragmentation evictor system have a very unfortunate eviction
 system for killing fragment, when the system is put under pressure.
 The evictor code basically kills "warm" fragments too quickly.
 Resulting in a massive, DoS like, performance drop, as seen above
 (no-patch) results with large packets.

 The solution is to avoid killing "warm" fragments, and rather block
 new incoming in case mem limit is exceeded. This is solved by
 introducing a creation time-stamp, which set to "jiffies" in
 inet_frag_alloc().

 UPDATE V2:
  - Drop the INET_FRAG_FIRST_IN idea for detecting dropped "head" packets

  2x10G size(65507) result:(3011+2568)=5579 Mbit/s (gen:9613+9553 Mbit/s)
  2x10G size(4416)  result:(3716+3518)=7234 Mbit/s (gen:9037+8614 Mbit/s)

Patch-02: cache line adjust inet_frag_queue.net (netns)
 -------
 Avoid possible cache-line bounces in struct inet_frag_queue.  By
 moving the net pointer (struct netns_frags) because its placed on the
 same write-often cache-line as e.g. refcnt and lock.

  2x10G size(65507) result:(2960+2613)=5573 Mbit/s (gen:9614+9465 Mbit/s)
  2x10G size(4416)  result:(3858+3650)=7508 Mbit/s (gen:8076+7633 Mbit/s)

 The performance benefit looks small. We can discuss if this patch is
 needed or not.

Patch-03: move LRU list maintenance outside of rwlock
 -------
 Updating the fragmentation queues LRU (Least-Recently-Used) list,
 required taking the hash writer lock.  However, the LRU list isn't
 tied to the hash at all, so we can use a separate lock for it.

 This patch looks like a performance loss for big packets, but the LRU
 locking changes are needed, by later patches.

 UPDATE V2:
  - Don't perform inet_frag_lru_move() outside the q.lock (inet_frag_queue)
    Because there were a theoretical chance of a race between
    inet_frag_lru_move() and fq_unlink() which is called under the
    q.lock. I have not been able to provoke this though (it should
    result in a list poison error)

  2x10G size(65507) result:(2533+2138)=4671 Mbit/s (gen:9612+9461 Mbit/s)
  2x10G size(4416)  result:(3952+3713)=7665 Mbit/s (gen:9168+8415 Mbit/s)

Patch-04: frag helper functions for mem limit tracking
 -------
 This patch is only meant as a preparation patch, towards the next
 patch.  The performance improvement comes from reduce the number
 atomic operation, during freeing of a frag queue, by summing the mem
 accounting before and doing a single atomic dec.

  2x10G size(65507) result:(2475+3101)=5576 Mbit/s (gen:9614+9439 Mbit/s)
  2x10G size(4416)  result:(3928+4129)=8057 Mbit/s (gen:7259+8131 Mbit/s)

Patch-05: per CPU resource, mem limit and LRU list accounting
 -------
 The major performance bottleneck on NUMA systems, is the mem limit
 counter, which is based on an atomic counter.  This patch removes the
 cache-bouncing of the atomic counter, by moving this accounting to be
 bound to each CPU.  The LRU list also need to be done per CPU,
 in-order to keep the accounting straight.

 UPDATE V2:
   - Rename struct cpu_resource -> frag_cpu_limit
   - Move init functions from inet_frag.h to inet_fragment.c
   - Cleanup per CPU in inet_frags_exit_net()

  2x10G size(65507) result:(9603+9458)=19061 Mbit/s (gen:9614+9458 Mbit/s)
  2x10G size(4416)  result:(4871+4848)= 9719 Mbit/s (gen:9107+8378 Mbit/s)

 To compare the benefit of the next patches, its necessary to increase
 the stress on the code, but doing 4x 10Gbit/s tests.

  4x10G size(65507) result:(8631+9337+7534+6928)=32430 Mbit/s
                       (gen:8646+9613+7547+6937 =32743 Mbit/s)
  4x10G size(4416)  result:(2870+2990+2993+3016)=11869 Mbit/s
                       (gen:4819+7767+6893+5043 =24522 Mbit/s)

Patch-06: implement dynamic percpu alloc of frag_cpu_limit
 -------
 Use the percpu API to implement dynamic per CPU allocation of the
 frag_cpu_limit in struct netns_frags.  This replaces the static array
 percpu[NR_CPUS].

 UPDATE V2:
  - This is a new patch.
  - Keeping it separate to get explicit review of this
  - (as this the first time I use the percpu API)

 2x10G size(65507) result:(9603+9367)=18970 Mbit/s (gen: 9614+9379=18993 Mbit/s)
 2x10G size(4416)  result:(4887+4773)= 9660 Mbit/s (gen: 7966+7412=15378 Mbit/s)

 4x10G size(65507) result:(7821+7723+6784+7859)=30187 Mbit/s
                     (gen: 8017+9545+6798+7863 =32223 Mbit/s)
 4x10G size(4416)  result:(2706+2684+2647+2669)=10706 Mbit/s
                     (gen: 4943+7483+7291+4271 =23988 Mbit/s)

 At first sight it looks like performance went down a bit, but as you
 can see in the next patches, my V2 results are (almost) the same.

Patch-07: nqueues_under_LRU_lock
 -------
 This patch just moves the nqueues counter under the LRU lock (and
 per CPU), instead of the write lock, to prepare for next patch.  No
 need for performance testing this part.

Patch-08: hash_bucket_locking
 -------
 This patch implements per hash bucket locking for the frag queue
 hash.  This removes two write locks, and the only remaining write
 lock is for protecting hash rebuild.  This essentially reduces the
 readers-writer lock to a rebuild lock.

 UPDATE V2:
  - Fixed two bugs
  - 1) Missed/too-late read_lock() for protecting hashfn in fq_unlink()
  - 2) Used old hash bucket instead of new dest bucket in inet_frag_secret_rebuild()

  2x10G size(65507) result:(9602+9466)=19068 Mbit/s (gen:9613+9472 Mbit/s)
                 V2 result:(9521+9505)=19026 Mbit/s

  2x10G size(4416)  result:(5024+4925)= 9949 Mbit/s (gen:8581+8957 Mbit/s)
                 V2 result:(5140+5206)=10346 Mbit/s

 To see the real benefit of this patch, we need to crank up the load
 and stress on the code, with 4x 10Gbit/s at small packets,
 improvement at size(4416): before 11869 Mbit/s now 17155 Mbit/s. Also
 note the regression at size(65507) 32430 -> 31021.

  4x10G size(65507) result:(7618+8708+7381+7314)=31021 Mbit/s
                 V2 result:(7488+8350+6834+8562)=31234 Mbit/s
                       (gen:7628+9501+8728+7321 =33178 Mbit/s)

  4x10G size(4416)  result:(4156+4714+4300+3985)=17155 Mbit/s
                 V2 result:(4341+4607+3963+4450)=17361 Mbit/s
                       (gen:6614+5330+7745+5366 =25055 Mbit/s)

 At 4x10G size(4416) I have seen 206 frag queues in use, and hash size is 64.

Patch-09: cache_align_hash_bucket
 -------
 Increase frag queue hash size and assure cache-line alignment to
 avoid false sharing.  Hash size is set to 256, because I have
 observed 206 frag queues in use at 4x10G with packet size 4416 bytes.

  2x10G size(65507) result:(9601+9414)=19015 Mbit/s (gen:9614+9434 Mbit/s)
                 V2 result:(9599+9427)=19026 Mbit/s

  2x10G size(4416)  result:(5421+5268)=10689 Mbit/s (gen:8028+7457 Mbit/s)
                 V2 result:(5377+5336)=10713 Mbit/s

 This does introduce an improvement (although not as big as I
 expected), but most importantly the regression seen in patch-08 4x10G
 at size(65507) is gone (patch-05:32430 Mbits/s -> 32676 Mbit).

  4x10G size(65507) result:(7604+8307+9593+7172)=32676 Mbit/s
                 V2 result:(7612+8063+9580+7265)=32520 Mbit/s
                       (gen:7615+8713+9606+7184 =33118 Mbit/s)

  4x10G size(4416)  result:(4890+4364+4139+4530)=17923 Mbit/s
                 V2 result:(3860+4533+4936+4519)=17848 Mbit/s
                       (gen:5170+6873+5215+7632 =24890 Mbit/s)

 After this patch it looks like the read lock is now the new
 contention point.

DROPPED Patch-10: Hack disable rebuild and remove rw_lock
 -------
 I've done a quick hack patch, that remove the readers-writer lock, by
 disabling/breaking hash rebuilding.  Just to see how big the
 performance gain would be.

 UPDATE V2:
  - I have dropped this patch.  It was just to show the potential.
  - Lets first integrate the other patches, and leave this for the future

  2x10G size(4416) result: 6481+6764 = 13245 Mbit/s (gen: 7652+8077 Mbit/s)

  4x10G size(4416) result:(5610+6283+5735+5238)=22866 Mbit/s
                     (gen: 6530+7860+5967+5238 =25595 Mbit/s)

 And the results show, that its a big win. With 4x10G size(4416)
 before: 17923 Mbit/s -> now: 22866 Mbit/s increase 4943 Mbit/s.
 With 2x10G size(4416) before 10689 Mbit/s -> 13245 Mbit/s
 increase 2556 Mbit/s.

 In the future, I'll work on a real solution for removing the rw_lock
 while still supporting hash rebuilding.  Suggestions and ideas are
 welcome.

This patchset is based upon:
  Davem's net-next tree:
    git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git
  On top of:
    commit ff33c0e1885cda44dd14c79f70df4706f83582a0
    (net: Remove bogus dependencies on INET)

---

Jesper Dangaard Brouer (9):
      net: increase frag queue hash size and cache-line
      net: frag queue locking per hash bucket
      net: frag, move nqueues counter under LRU lock protection
      net: frag, implement dynamic percpu alloc of frag_cpu_limit
      net: frag, per CPU resource, mem limit and LRU list accounting
      net: frag helper functions for mem limit tracking
      net: frag, move LRU list maintenance outside of rwlock
      net: frag cache line adjust inet_frag_queue.net
      net: frag evictor, avoid killing warm frag queues

 include/net/inet_frag.h                 |  114 ++++++++++++++++++++--
 include/net/ipv6.h                      |    4 -
 net/ipv4/inet_fragment.c                |  162 ++++++++++++++++++++++++-------
 net/ipv4/ip_fragment.c                  |   39 +++----
 net/ipv6/netfilter/nf_conntrack_reasm.c |   13 +-
 net/ipv6/reassembly.c                   |   12 +-
 6 files changed, 260 insertions(+), 84 deletions(-)

--
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Sr. Network Kernel Developer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* [net-next PATCH V2 1/9] net: frag evictor, avoid killing warm frag queues
From: Jesper Dangaard Brouer @ 2012-11-29 16:11 UTC (permalink / raw)
  To: Eric Dumazet, David S. Miller, Florian Westphal
  Cc: Jesper Dangaard Brouer, netdev, Pablo Neira Ayuso, Thomas Graf,
	Cong Wang, Patrick McHardy, Paul E. McKenney, Herbert Xu
In-Reply-To: <20121129161019.17754.29670.stgit@dragon>

The fragmentation evictor system have a very unfortunate eviction
system for killing fragment, when the system is put under pressure.
If packets are coming in too fast, the evictor code kills "warm"
fragments too quickly.  Resulting in a massive performance drop,
because we drop frag lists where we have already queue up a lot of
fragments/work, which gets killed before they have a chance to
complete.

This is perhaps amplified by the CPUs fighting for the same lru_list
element q in inet_frag_evictor(), and the atomic_dec_and_test(&q->refcnt)
which will cause another trip round the loop.

The solution idea (orig conceived by Florian Westphal) is to avoid
killing "warm" fragments, and rather block new incoming in case mem
limit is exceeded. This is solved by introducing a creation time-stamp
(creation_ts) in inet_frag_queue, which set to "jiffies" in
inet_frag_alloc().

In inet_frag_evictor() we don't kill "warm" queue elements. A "warm"
queue element is an element reaching inet_frag_evictor() within the
same "jiffie".

To maintain the queue limit (/proc/sys/net/ipv4/ipfrag_high_thresh) we
don't allow, new frag queue's to be allocated in inet_frag_alloc().
This will result in kernel log messages, which have been adjusted to
"ip_frag_create: mem limit reached".  We should consider dropping this
text, to avoid confusing end-users.

Notice, this is not the complete solution to fixing fragmentation
performance, but it allow us to see what is going on.

Original-idea-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Jesper Dangaard Brouer <jbrouer@redhat.com>

---
V2:
 - Drop the INET_FRAG_FIRST_IN idea for detecting dropped "head" packets

 include/net/inet_frag.h  |    1 +
 net/ipv4/inet_fragment.c |   19 +++++++++++++++++++
 net/ipv4/ip_fragment.c   |    6 +++---
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 32786a0..7b897b2 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -24,6 +24,7 @@ struct inet_frag_queue {
 	ktime_t			stamp;
 	int			len;        /* total length of orig datagram */
 	int			meat;
+	u32			creation_ts;/* jiffies when queue was created*/
 	__u8			last_in;    /* first/last segment arrived? */

 #define INET_FRAG_COMPLETE	4
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 4750d2b..9bb6237 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -178,6 +178,18 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)

 		q = list_first_entry(&nf->lru_list,
 				struct inet_frag_queue, lru_list);
+
+		/* queue entry is warm, i.e. new frags are arriving
+		 * too fast.  instead of evicting warm entry, give it
+		 * a chance to complete.  Instead, inet_frag_alloc()
+		 * will fail until more time has elapsed or queue
+		 * completes.
+		 */
+		if (!force && q->creation_ts == (u32) jiffies) {
+			read_unlock(&f->lock);
+			break;
+		}
+
 		atomic_inc(&q->refcnt);
 		read_unlock(&f->lock);

@@ -244,10 +256,17 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 {
 	struct inet_frag_queue *q;

+	/* Guard creations of new frag queues if mem limit reached, as
+	 * we allow warm/recent elements to survive in inet_frag_evictor()
+	 */
+	if (atomic_read(&nf->mem) > nf->high_thresh)
+		return NULL;
+
 	q = kzalloc(f->qsize, GFP_ATOMIC);
 	if (q == NULL)
 		return NULL;

+	q->creation_ts = (u32) jiffies;
 	q->net = nf;
 	f->constructor(q, arg);
 	atomic_add(f->qsize, &nf->mem);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 1cf6a76..ef00d0a 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -300,12 +300,12 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)

 	q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
 	if (q == NULL)
-		goto out_nomem;
+		goto out_memlimit;

 	return container_of(q, struct ipq, q);

-out_nomem:
-	LIMIT_NETDEBUG(KERN_ERR pr_fmt("ip_frag_create: no memory left !\n"));
+out_memlimit:
+	LIMIT_NETDEBUG(KERN_ERR pr_fmt("ip_frag_create: mem limit reached!\n"));
 	return NULL;
 }

^ permalink raw reply related

* [net-next PATCH V2 2/9] net: frag cache line adjust inet_frag_queue.net
From: Jesper Dangaard Brouer @ 2012-11-29 16:11 UTC (permalink / raw)
  To: Eric Dumazet, David S. Miller, Florian Westphal
  Cc: Jesper Dangaard Brouer, netdev, Pablo Neira Ayuso, Thomas Graf,
	Cong Wang, Patrick McHardy, Paul E. McKenney, Herbert Xu
In-Reply-To: <20121129161019.17754.29670.stgit@dragon>

In inet_frag_find() unfortunate cache-line bounces can occur with
struct inet_frag_queue.  As the net pointer (struct netns_frags)
is placed on the same write-often cache-line as e.g. refcnt and
lock. As the hash match check always check (q->net == nf).

This (of-cause) only happens on hash bucket collisions, but as current
hash size is only 64 this makes collisions more likely.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---

 include/net/inet_frag.h |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 7b897b2..1f75316 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -14,7 +14,6 @@ struct netns_frags {
 
 struct inet_frag_queue {
 	struct hlist_node	list;
-	struct netns_frags	*net;
 	struct list_head	lru_list;   /* lru list member */
 	spinlock_t		lock;
 	atomic_t		refcnt;
@@ -24,6 +23,7 @@ struct inet_frag_queue {
 	ktime_t			stamp;
 	int			len;        /* total length of orig datagram */
 	int			meat;
+	struct netns_frags	*net;
 	u32			creation_ts;/* jiffies when queue was created*/
 	__u8			last_in;    /* first/last segment arrived? */
 

^ permalink raw reply related

* [net-next PATCH V2 3/9] net: frag, move LRU list maintenance outside of rwlock
From: Jesper Dangaard Brouer @ 2012-11-29 16:12 UTC (permalink / raw)
  To: Eric Dumazet, David S. Miller, Florian Westphal
  Cc: Jesper Dangaard Brouer, netdev, Pablo Neira Ayuso, Thomas Graf,
	Cong Wang, Patrick McHardy, Paul E. McKenney, Herbert Xu
In-Reply-To: <20121129161019.17754.29670.stgit@dragon>

Updating the fragmentation queues LRU (Least-Recently-Used) list,
required taking the hash writer lock.  However, the LRU list isn't
tied to the hash at all, so we can use a separate lock for it.

This change, in it self, does not improve performance significantly.
But its part of making the fragmentation code scale.

Original-idea-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>

---
V2:
 - Don't perform inet_frag_lru_move() outside the q.lock (inet_frag_queue)
   Because there were a theoretical chance of a race between
   inet_frag_lru_move() and fq_unlink() which is called under the
   q.lock. I have not been able to provoke this though (it should
   result in a list poison error)

 include/net/inet_frag.h                 |   22 ++++++++++++++++++++++
 net/ipv4/inet_fragment.c                |   14 ++++++++------
 net/ipv4/ip_fragment.c                  |    4 +---
 net/ipv6/netfilter/nf_conntrack_reasm.c |    5 ++---
 net/ipv6/reassembly.c                   |    4 +---
 5 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 1f75316..312a3fa 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -5,6 +5,7 @@ struct netns_frags {
 	int			nqueues;
 	atomic_t		mem;
 	struct list_head	lru_list;
+	spinlock_t		lru_lock;
 
 	/* sysctls */
 	int			timeout;
@@ -73,4 +74,25 @@ static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f
 		inet_frag_destroy(q, f, NULL);
 }
 
+static inline void inet_frag_lru_move(struct inet_frag_queue *q)
+{
+	spin_lock(&q->net->lru_lock);
+	list_move_tail(&q->lru_list, &q->net->lru_list);
+	spin_unlock(&q->net->lru_lock);
+}
+
+static inline void inet_frag_lru_del(struct inet_frag_queue *q)
+{
+	spin_lock(&q->net->lru_lock);
+	list_del(&q->lru_list);
+	spin_unlock(&q->net->lru_lock);
+}
+
+static inline void inet_frag_lru_add(struct netns_frags *nf,
+				     struct inet_frag_queue *q)
+{
+	spin_lock(&nf->lru_lock);
+	list_add_tail(&q->lru_list, &nf->lru_list);
+	spin_unlock(&nf->lru_lock);
+}
 #endif
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 9bb6237..4e56587 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -75,6 +75,7 @@ void inet_frags_init_net(struct netns_frags *nf)
 	nf->nqueues = 0;
 	atomic_set(&nf->mem, 0);
 	INIT_LIST_HEAD(&nf->lru_list);
+	spin_lock_init(&nf->lru_lock);
 }
 EXPORT_SYMBOL(inet_frags_init_net);
 
@@ -98,9 +99,9 @@ static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
 {
 	write_lock(&f->lock);
 	hlist_del(&fq->list);
-	list_del(&fq->lru_list);
 	fq->net->nqueues--;
 	write_unlock(&f->lock);
+	inet_frag_lru_del(fq);
 }
 
 void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
@@ -170,9 +171,10 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
 
 	work = atomic_read(&nf->mem) - nf->low_thresh;
 	while (work > 0) {
-		read_lock(&f->lock);
+		spin_lock(&nf->lru_lock);
+
 		if (list_empty(&nf->lru_list)) {
-			read_unlock(&f->lock);
+			spin_unlock(&nf->lru_lock);
 			break;
 		}
 
@@ -186,12 +188,12 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
 		 * completes.
 		 */
 		if (!force && q->creation_ts == (u32) jiffies) {
-			read_unlock(&f->lock);
+			spin_unlock(&nf->lru_lock);
 			break;
 		}
 
 		atomic_inc(&q->refcnt);
-		read_unlock(&f->lock);
+		spin_unlock(&nf->lru_lock);
 
 		spin_lock(&q->lock);
 		if (!(q->last_in & INET_FRAG_COMPLETE))
@@ -245,9 +247,9 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
 
 	atomic_inc(&qp->refcnt);
 	hlist_add_head(&qp->list, &f->hash[hash]);
-	list_add_tail(&qp->lru_list, &nf->lru_list);
 	nf->nqueues++;
 	write_unlock(&f->lock);
+	inet_frag_lru_add(nf, qp);
 	return qp;
 }
 
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index ef00d0a..b2425bf 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -531,9 +531,7 @@ found:
 	    qp->q.meat == qp->q.len)
 		return ip_frag_reasm(qp, prev, dev);
 
-	write_lock(&ip4_frags.lock);
-	list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list);
-	write_unlock(&ip4_frags.lock);
+	inet_frag_lru_move(&qp->q);
 	return -EINPROGRESS;
 
 err:
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 22c8ea9..b0a1c96 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -325,9 +325,8 @@ found:
 		fq->nhoffset = nhoff;
 		fq->q.last_in |= INET_FRAG_FIRST_IN;
 	}
-	write_lock(&nf_frags.lock);
-	list_move_tail(&fq->q.lru_list, &fq->q.net->lru_list);
-	write_unlock(&nf_frags.lock);
+
+	inet_frag_lru_move(&fq->q);
 	return 0;
 
 discard_fq:
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index e5253ec..b373309 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -341,9 +341,7 @@ found:
 	    fq->q.meat == fq->q.len)
 		return ip6_frag_reasm(fq, prev, dev);
 
-	write_lock(&ip6_frags.lock);
-	list_move_tail(&fq->q.lru_list, &fq->q.net->lru_list);
-	write_unlock(&ip6_frags.lock);
+	inet_frag_lru_move(&fq->q);
 	return -1;
 
 discard_fq:

^ permalink raw reply related

* [net-next PATCH V2 4/9] net: frag helper functions for mem limit tracking
From: Jesper Dangaard Brouer @ 2012-11-29 16:12 UTC (permalink / raw)
  To: Eric Dumazet, David S. Miller, Florian Westphal
  Cc: Jesper Dangaard Brouer, netdev, Pablo Neira Ayuso, Thomas Graf,
	Cong Wang, Patrick McHardy, Paul E. McKenney, Herbert Xu
In-Reply-To: <20121129161019.17754.29670.stgit@dragon>

This change is primarily a preparation to ease the extension of memory
limit tracking.

The change does reduce the number atomic operation, during freeing of
a frag queue.  This does introduce a fairly good performance improvement, as
these atomic operations are at the core of the performance problems
seen on NUMA systems.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---

 include/net/inet_frag.h                 |   28 ++++++++++++++++++++++++++++
 include/net/ipv6.h                      |    2 +-
 net/ipv4/inet_fragment.c                |   27 +++++++++++++--------------
 net/ipv4/ip_fragment.c                  |   24 +++++++++++-------------
 net/ipv6/netfilter/nf_conntrack_reasm.c |    6 +++---
 net/ipv6/reassembly.c                   |    6 +++---
 6 files changed, 59 insertions(+), 34 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 312a3fa..9bbef17 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -95,4 +95,32 @@ static inline void inet_frag_lru_add(struct netns_frags *nf,
 	list_add_tail(&q->lru_list, &nf->lru_list);
 	spin_unlock(&nf->lru_lock);
 }
+
+/* Memory Tracking Functions. */
+
+static inline int frag_mem_limit(struct netns_frags *nf)
+{
+	return atomic_read(&nf->mem);
+}
+
+static inline void sub_frag_mem_limit(struct inet_frag_queue *q, int i)
+{
+	atomic_sub(i, &q->net->mem);
+}
+
+static inline void add_frag_mem_limit(struct inet_frag_queue *q, int i)
+{
+	atomic_add(i, &q->net->mem);
+}
+
+static inline void init_frag_mem_limit(struct netns_frags *nf)
+{
+	atomic_set(&nf->mem, 0);
+}
+
+static inline int sum_frag_mem_limit(struct netns_frags *nf)
+{
+	return atomic_read(&nf->mem);
+}
+
 #endif
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 979bf6c..a5c1cf1 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -279,7 +279,7 @@ static inline int ip6_frag_nqueues(struct net *net)
 
 static inline int ip6_frag_mem(struct net *net)
 {
-	return atomic_read(&net->ipv6.frags.mem);
+	return sum_frag_mem_limit(&net->ipv6.frags);
 }
 #endif
 
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 4e56587..0ecacbd 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -73,7 +73,7 @@ EXPORT_SYMBOL(inet_frags_init);
 void inet_frags_init_net(struct netns_frags *nf)
 {
 	nf->nqueues = 0;
-	atomic_set(&nf->mem, 0);
+	init_frag_mem_limit(nf);
 	INIT_LIST_HEAD(&nf->lru_list);
 	spin_lock_init(&nf->lru_lock);
 }
@@ -118,12 +118,8 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
 EXPORT_SYMBOL(inet_frag_kill);
 
 static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
-		struct sk_buff *skb, int *work)
+		struct sk_buff *skb)
 {
-	if (work)
-		*work -= skb->truesize;
-
-	atomic_sub(skb->truesize, &nf->mem);
 	if (f->skb_free)
 		f->skb_free(skb);
 	kfree_skb(skb);
@@ -134,6 +130,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
 {
 	struct sk_buff *fp;
 	struct netns_frags *nf;
+	unsigned int sum, sum_truesize = 0;
 
 	WARN_ON(!(q->last_in & INET_FRAG_COMPLETE));
 	WARN_ON(del_timer(&q->timer) != 0);
@@ -144,13 +141,14 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
 	while (fp) {
 		struct sk_buff *xp = fp->next;
 
-		frag_kfree_skb(nf, f, fp, work);
+		sum_truesize += fp->truesize;
+		frag_kfree_skb(nf, f, fp);
 		fp = xp;
 	}
-
+	sum = sum_truesize + f->qsize;
 	if (work)
-		*work -= f->qsize;
-	atomic_sub(f->qsize, &nf->mem);
+		*work -= sum;
+	sub_frag_mem_limit(q, sum);
 
 	if (f->destructor)
 		f->destructor(q);
@@ -165,11 +163,11 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
 	int work, evicted = 0;
 
 	if (!force) {
-		if (atomic_read(&nf->mem) <= nf->high_thresh)
+		if (frag_mem_limit(nf) <= nf->high_thresh)
 			return 0;
 	}
 
-	work = atomic_read(&nf->mem) - nf->low_thresh;
+	work = frag_mem_limit(nf) - nf->low_thresh;
 	while (work > 0) {
 		spin_lock(&nf->lru_lock);
 
@@ -261,7 +259,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 	/* Guard creations of new frag queues if mem limit reached, as
 	 * we allow warm/recent elements to survive in inet_frag_evictor()
 	 */
-	if (atomic_read(&nf->mem) > nf->high_thresh)
+	if (frag_mem_limit(nf) > nf->high_thresh)
 		return NULL;
 
 	q = kzalloc(f->qsize, GFP_ATOMIC);
@@ -271,7 +269,8 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 	q->creation_ts = (u32) jiffies;
 	q->net = nf;
 	f->constructor(q, arg);
-	atomic_add(f->qsize, &nf->mem);
+	add_frag_mem_limit(q, f->qsize);
+
 	setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
 	spin_lock_init(&q->lock);
 	atomic_set(&q->refcnt, 1);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b2425bf..abb5551 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -122,7 +122,7 @@ int ip_frag_nqueues(struct net *net)
 
 int ip_frag_mem(struct net *net)
 {
-	return atomic_read(&net->ipv4.frags.mem);
+	return sum_frag_mem_limit(&net->ipv4.frags);
 }
 
 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
@@ -161,13 +161,6 @@ static bool ip4_frag_match(struct inet_frag_queue *q, void *a)
 		qp->user == arg->user;
 }
 
-/* Memory Tracking Functions. */
-static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb)
-{
-	atomic_sub(skb->truesize, &nf->mem);
-	kfree_skb(skb);
-}
-
 static void ip4_frag_init(struct inet_frag_queue *q, void *a)
 {
 	struct ipq *qp = container_of(q, struct ipq, q);
@@ -340,6 +333,7 @@ static inline int ip_frag_too_far(struct ipq *qp)
 static int ip_frag_reinit(struct ipq *qp)
 {
 	struct sk_buff *fp;
+	unsigned int sum_truesize = 0;
 
 	if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
 		atomic_inc(&qp->q.refcnt);
@@ -349,9 +343,12 @@ static int ip_frag_reinit(struct ipq *qp)
 	fp = qp->q.fragments;
 	do {
 		struct sk_buff *xp = fp->next;
-		frag_kfree_skb(qp->q.net, fp);
+
+		sum_truesize += fp->truesize;
+		kfree_skb(fp);
 		fp = xp;
 	} while (fp);
+	sub_frag_mem_limit(&qp->q, sum_truesize);
 
 	qp->q.last_in = 0;
 	qp->q.len = 0;
@@ -496,7 +493,8 @@ found:
 				qp->q.fragments = next;
 
 			qp->q.meat -= free_it->len;
-			frag_kfree_skb(qp->q.net, free_it);
+			sub_frag_mem_limit(&qp->q, free_it->truesize);
+			kfree_skb(free_it);
 		}
 	}
 
@@ -519,7 +517,7 @@ found:
 	qp->q.stamp = skb->tstamp;
 	qp->q.meat += skb->len;
 	qp->ecn |= ecn;
-	atomic_add(skb->truesize, &qp->q.net->mem);
+	add_frag_mem_limit(&qp->q, skb->truesize);
 	if (offset == 0)
 		qp->q.last_in |= INET_FRAG_FIRST_IN;
 
@@ -615,7 +613,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 		head->len -= clone->len;
 		clone->csum = 0;
 		clone->ip_summed = head->ip_summed;
-		atomic_add(clone->truesize, &qp->q.net->mem);
+		add_frag_mem_limit(&qp->q, clone->truesize);
 	}
 
 	skb_push(head, head->data - skb_network_header(head));
@@ -643,7 +641,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 		}
 		fp = next;
 	}
-	atomic_sub(sum_truesize, &qp->q.net->mem);
+	sub_frag_mem_limit(&qp->q, sum_truesize);
 
 	head->next = NULL;
 	head->dev = dev;
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index b0a1c96..c088831 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -316,7 +316,7 @@ found:
 	fq->q.meat += skb->len;
 	if (payload_len > fq->q.max_size)
 		fq->q.max_size = payload_len;
-	atomic_add(skb->truesize, &fq->q.net->mem);
+	add_frag_mem_limit(&fq->q, skb->truesize);
 
 	/* The first fragment.
 	 * nhoffset is obtained from the first fragment, of course.
@@ -394,7 +394,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
 		clone->ip_summed = head->ip_summed;
 
 		NFCT_FRAG6_CB(clone)->orig = NULL;
-		atomic_add(clone->truesize, &fq->q.net->mem);
+		add_frag_mem_limit(&fq->q, clone->truesize);
 	}
 
 	/* We have to remove fragment header from datagram and to relocate
@@ -418,7 +418,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
 			head->csum = csum_add(head->csum, fp->csum);
 		head->truesize += fp->truesize;
 	}
-	atomic_sub(head->truesize, &fq->q.net->mem);
+	sub_frag_mem_limit(&fq->q, head->truesize);
 
 	head->local_df = 1;
 	head->next = NULL;
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index b373309..bab2c27 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -327,7 +327,7 @@ found:
 	}
 	fq->q.stamp = skb->tstamp;
 	fq->q.meat += skb->len;
-	atomic_add(skb->truesize, &fq->q.net->mem);
+	add_frag_mem_limit(&fq->q, skb->truesize);
 
 	/* The first fragment.
 	 * nhoffset is obtained from the first fragment, of course.
@@ -427,7 +427,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 		head->len -= clone->len;
 		clone->csum = 0;
 		clone->ip_summed = head->ip_summed;
-		atomic_add(clone->truesize, &fq->q.net->mem);
+		add_frag_mem_limit(&fq->q, clone->truesize);
 	}
 
 	/* We have to remove fragment header from datagram and to relocate
@@ -465,7 +465,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 		}
 		fp = next;
 	}
-	atomic_sub(sum_truesize, &fq->q.net->mem);
+	sub_frag_mem_limit(&fq->q, sum_truesize);
 
 	head->next = NULL;
 	head->dev = dev;

^ permalink raw reply related

* [net-next PATCH V2 5/9] net: frag, per CPU resource, mem limit and LRU list accounting
From: Jesper Dangaard Brouer @ 2012-11-29 16:13 UTC (permalink / raw)
  To: Eric Dumazet, David S. Miller, Florian Westphal
  Cc: Jesper Dangaard Brouer, netdev, Pablo Neira Ayuso, Thomas Graf,
	Cong Wang, Patrick McHardy, Paul E. McKenney, Herbert Xu
In-Reply-To: <20121129161019.17754.29670.stgit@dragon>

The major performance bottleneck on NUMA systems, is the mem limit
counter which is based an atomic counter.  This patch removes the
cache-bouncing of the atomic counter, by moving this accounting to be
bound to each CPU.  The LRU list also need to be done per CPU,
in-order to keep the accounting straight.

If fragments belonging together is "sprayed" across CPUs, performance
will still suffer, but due to NIC rxhashing this is not very common.
Correct accounting in this situation is maintained by recording and
"assigning" a CPU to a frag queue when its allocated (caused by the
first packet associated packet).

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>

---
V2:
 - Rename struct cpu_resource -> frag_cpu_limit
 - Move init functions from inet_frag.h to inet_fragment.c
 - Cleanup per CPU in inet_frags_exit_net()

 include/net/inet_frag.h                 |   64 +++++++++++++++++++------------
 net/ipv4/inet_fragment.c                |   50 ++++++++++++++++++------
 net/ipv4/ip_fragment.c                  |    3 +
 net/ipv6/netfilter/nf_conntrack_reasm.c |    2 -
 net/ipv6/reassembly.c                   |    2 -
 5 files changed, 80 insertions(+), 41 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 9bbef17..8421904 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -1,11 +1,22 @@
 #ifndef __NET_FRAG_H__
 #define __NET_FRAG_H__
 
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+
+/* Need to maintain these resource limits per CPU, else we will kill
+ * performance due to cache-line bouncing
+ */
+struct frag_cpu_limit {
+	atomic_t                mem;
+	struct list_head        lru_list;
+	spinlock_t              lru_lock;
+} ____cacheline_aligned_in_smp;
+
 struct netns_frags {
 	int			nqueues;
-	atomic_t		mem;
-	struct list_head	lru_list;
-	spinlock_t		lru_lock;
+
+	struct frag_cpu_limit	percpu[NR_CPUS];
 
 	/* sysctls */
 	int			timeout;
@@ -26,6 +37,7 @@ struct inet_frag_queue {
 	int			meat;
 	struct netns_frags	*net;
 	u32			creation_ts;/* jiffies when queue was created*/
+	u32			cpu_alloc;  /* used for mem limit accounting */
 	__u8			last_in;    /* first/last segment arrived? */
 
 #define INET_FRAG_COMPLETE	4
@@ -63,7 +75,8 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
 void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
 void inet_frag_destroy(struct inet_frag_queue *q,
 				struct inet_frags *f, int *work);
-int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force);
+int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f,
+		      bool force, int on_cpu);
 struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
 		struct inet_frags *f, void *key, unsigned int hash)
 	__releases(&f->lock);
@@ -74,53 +87,54 @@ static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f
 		inet_frag_destroy(q, f, NULL);
 }
 
+/* LRU (Least Recently Used) resource functions */
+
 static inline void inet_frag_lru_move(struct inet_frag_queue *q)
 {
-	spin_lock(&q->net->lru_lock);
-	list_move_tail(&q->lru_list, &q->net->lru_list);
-	spin_unlock(&q->net->lru_lock);
+	int cpu = q->cpu_alloc;
+	spin_lock(&q->net->percpu[cpu].lru_lock);
+	list_move_tail(&q->lru_list, &q->net->percpu[cpu].lru_list);
+	spin_unlock(&q->net->percpu[cpu].lru_lock);
 }
 
 static inline void inet_frag_lru_del(struct inet_frag_queue *q)
 {
-	spin_lock(&q->net->lru_lock);
+	int cpu = q->cpu_alloc;
+	spin_lock(&q->net->percpu[cpu].lru_lock);
 	list_del(&q->lru_list);
-	spin_unlock(&q->net->lru_lock);
+	spin_unlock(&q->net->percpu[cpu].lru_lock);
 }
 
 static inline void inet_frag_lru_add(struct netns_frags *nf,
 				     struct inet_frag_queue *q)
 {
-	spin_lock(&nf->lru_lock);
-	list_add_tail(&q->lru_list, &nf->lru_list);
-	spin_unlock(&nf->lru_lock);
+	int cpu = q->cpu_alloc;
+	spin_lock(&nf->percpu[cpu].lru_lock);
+	list_add_tail(&q->lru_list, &nf->percpu[cpu].lru_list);
+	spin_unlock(&nf->percpu[cpu].lru_lock);
 }
 
 /* Memory Tracking Functions. */
 
-static inline int frag_mem_limit(struct netns_frags *nf)
-{
-	return atomic_read(&nf->mem);
-}
-
 static inline void sub_frag_mem_limit(struct inet_frag_queue *q, int i)
 {
-	atomic_sub(i, &q->net->mem);
+	int cpu = q->cpu_alloc;
+	atomic_sub(i, &q->net->percpu[cpu].mem);
 }
 
 static inline void add_frag_mem_limit(struct inet_frag_queue *q, int i)
 {
-	atomic_add(i, &q->net->mem);
-}
-
-static inline void init_frag_mem_limit(struct netns_frags *nf)
-{
-	atomic_set(&nf->mem, 0);
+	int cpu = q->cpu_alloc;
+	atomic_add(i, &q->net->percpu[cpu].mem);
 }
 
 static inline int sum_frag_mem_limit(struct netns_frags *nf)
 {
-	return atomic_read(&nf->mem);
+	unsigned int sum = 0;
+	int cpu;
+	for_each_possible_cpu(cpu)
+		sum += atomic_read(&nf->percpu[cpu].mem);
+	return sum;
 }
 
 #endif
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 0ecacbd..068aabe 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -23,6 +23,17 @@
 
 #include <net/inet_frag.h>
 
+static inline int frag_mem_limit_on_cpu(struct netns_frags *nf, int on_cpu)
+{
+	return atomic_read(&nf->percpu[on_cpu].mem);
+}
+
+static inline int frag_mem_limit(struct netns_frags *nf)
+{
+	int cpu = smp_processor_id();
+	return frag_mem_limit_on_cpu(nf, cpu);
+}
+
 static void inet_frag_secret_rebuild(unsigned long dummy)
 {
 	struct inet_frags *f = (struct inet_frags *)dummy;
@@ -70,12 +81,20 @@ void inet_frags_init(struct inet_frags *f)
 }
 EXPORT_SYMBOL(inet_frags_init);
 
+static void inet_frags_init_percpu_limit(struct netns_frags *nf)
+{
+	int cpu;
+	for_each_possible_cpu(cpu) {
+		INIT_LIST_HEAD(&nf->percpu[cpu].lru_list);
+		spin_lock_init(&nf->percpu[cpu].lru_lock);
+		atomic_set(&nf->percpu[cpu].mem, 0);
+	}
+}
+
 void inet_frags_init_net(struct netns_frags *nf)
 {
 	nf->nqueues = 0;
-	init_frag_mem_limit(nf);
-	INIT_LIST_HEAD(&nf->lru_list);
-	spin_lock_init(&nf->lru_lock);
+	inet_frags_init_percpu_limit(nf);
 }
 EXPORT_SYMBOL(inet_frags_init_net);
 
@@ -87,10 +106,12 @@ EXPORT_SYMBOL(inet_frags_fini);
 
 void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
 {
+	int cpu;
 	nf->low_thresh = 0;
 
 	local_bh_disable();
-	inet_frag_evictor(nf, f, true);
+	for_each_possible_cpu(cpu)
+		inet_frag_evictor(nf, f, true, cpu);
 	local_bh_enable();
 }
 EXPORT_SYMBOL(inet_frags_exit_net);
@@ -157,26 +178,28 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
 }
 EXPORT_SYMBOL(inet_frag_destroy);
 
-int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
+int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f,
+		      bool force, int on_cpu)
 {
 	struct inet_frag_queue *q;
 	int work, evicted = 0;
+	int cpu = (likely(on_cpu < 0)) ? smp_processor_id() : on_cpu;
 
 	if (!force) {
-		if (frag_mem_limit(nf) <= nf->high_thresh)
+		if (frag_mem_limit_on_cpu(nf, cpu) <= nf->high_thresh)
 			return 0;
 	}
 
-	work = frag_mem_limit(nf) - nf->low_thresh;
+	work = frag_mem_limit_on_cpu(nf, cpu) - nf->low_thresh;
 	while (work > 0) {
-		spin_lock(&nf->lru_lock);
+		spin_lock(&nf->percpu[cpu].lru_lock);
 
-		if (list_empty(&nf->lru_list)) {
-			spin_unlock(&nf->lru_lock);
+		if (list_empty(&nf->percpu[cpu].lru_list)) {
+			spin_unlock(&nf->percpu[cpu].lru_lock);
 			break;
 		}
 
-		q = list_first_entry(&nf->lru_list,
+		q = list_first_entry(&nf->percpu[cpu].lru_list,
 				struct inet_frag_queue, lru_list);
 
 		/* queue entry is warm, i.e. new frags are arriving
@@ -186,12 +209,12 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
 		 * completes.
 		 */
 		if (!force && q->creation_ts == (u32) jiffies) {
-			spin_unlock(&nf->lru_lock);
+			spin_unlock(&nf->percpu[cpu].lru_lock);
 			break;
 		}
 
 		atomic_inc(&q->refcnt);
-		spin_unlock(&nf->lru_lock);
+		spin_unlock(&nf->percpu[cpu].lru_lock);
 
 		spin_lock(&q->lock);
 		if (!(q->last_in & INET_FRAG_COMPLETE))
@@ -267,6 +290,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 		return NULL;
 
 	q->creation_ts = (u32) jiffies;
+	q->cpu_alloc = (u32) smp_processor_id();
 	q->net = nf;
 	f->constructor(q, arg);
 	add_frag_mem_limit(q, f->qsize);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index abb5551..99944a8 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -18,6 +18,7 @@
  *		John McDonald	:	0 length frag bug.
  *		Alexey Kuznetsov:	SMP races, threading, cleanup.
  *		Patrick McHardy :	LRU queue of frag heads for evictor.
+ *		Jesper Brouer   :	SMP/NUMA scalability
  */
 
 #define pr_fmt(fmt) "IPv4: " fmt
@@ -212,7 +213,7 @@ static void ip_evictor(struct net *net)
 {
 	int evicted;
 
-	evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
+	evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false, -1);
 	if (evicted)
 		IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
 }
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index c088831..8cb1710 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -566,7 +566,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user)
 	fhdr = (struct frag_hdr *)skb_transport_header(clone);
 
 	local_bh_disable();
-	inet_frag_evictor(&net->nf_frag.frags, &nf_frags, false);
+	inet_frag_evictor(&net->nf_frag.frags, &nf_frags, false, -1);
 	local_bh_enable();
 
 	fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr);
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index bab2c27..d1e70dd 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -529,7 +529,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 		return 1;
 	}
 
-	evicted = inet_frag_evictor(&net->ipv6.frags, &ip6_frags, false);
+	evicted = inet_frag_evictor(&net->ipv6.frags, &ip6_frags, false, -1);
 	if (evicted)
 		IP6_ADD_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
 				 IPSTATS_MIB_REASMFAILS, evicted);

^ permalink raw reply related

* [net-next PATCH V2 7/9] net: frag, move nqueues counter under LRU lock protection
From: Jesper Dangaard Brouer @ 2012-11-29 16:15 UTC (permalink / raw)
  To: Eric Dumazet, David S. Miller, Florian Westphal
  Cc: Jesper Dangaard Brouer, netdev, Pablo Neira Ayuso, Thomas Graf,
	Cong Wang, Patrick McHardy, Paul E. McKenney, Herbert Xu
In-Reply-To: <20121129161019.17754.29670.stgit@dragon>

Preparation patch for per hash bucket locking.

This patch just moves the nqueues counter under the LRU lock (and
per CPU), instead of the write lock, to prepare for next patch.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---

 include/net/inet_frag.h  |   19 +++++++++++++++++--
 include/net/ipv6.h       |    2 +-
 net/ipv4/inet_fragment.c |    4 +---
 net/ipv4/ip_fragment.c   |    2 +-
 4 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 3eadf42..f58590f 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -12,11 +12,10 @@ struct frag_cpu_limit {
 	atomic_t                mem;
 	struct list_head        lru_list;
 	spinlock_t              lru_lock;
+	int			nqueues;
 } ____cacheline_aligned_in_smp;
 
 struct netns_frags {
-	int			nqueues;
-
 	struct frag_cpu_limit __percpu *percpu;
 
 	/* sysctls */
@@ -107,6 +106,7 @@ static inline void inet_frag_lru_del(struct inet_frag_queue *q)
 
 	spin_lock(&percpu->lru_lock);
 	list_del(&q->lru_list);
+	percpu->nqueues--;
 	spin_unlock(&percpu->lru_lock);
 }
 
@@ -118,6 +118,7 @@ static inline void inet_frag_lru_add(struct netns_frags *nf,
 
 	spin_lock(&percpu->lru_lock);
 	list_add_tail(&q->lru_list, &percpu->lru_list);
+	percpu->nqueues++;
 	spin_unlock(&percpu->lru_lock);
 }
 
@@ -150,4 +151,18 @@ static inline int sum_frag_mem_limit(struct netns_frags *nf)
 	return sum;
 }
 
+static inline int sum_frag_nqueues(struct netns_frags *nf)
+{
+	unsigned int sum = 0;
+	int cpu;
+	for_each_possible_cpu(cpu) {
+		struct frag_cpu_limit *percpu = per_cpu_ptr(nf->percpu, cpu);
+
+		spin_lock(&percpu->lru_lock);
+		sum += percpu->nqueues;
+		spin_unlock(&percpu->lru_lock);
+	}
+	return sum;
+}
+
 #endif
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index a5c1cf1..27edfcf 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -274,7 +274,7 @@ extern bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb);
 #if IS_ENABLED(CONFIG_IPV6)
 static inline int ip6_frag_nqueues(struct net *net)
 {
-	return net->ipv6.frags.nqueues;
+	return sum_frag_nqueues(&net->ipv6.frags);
 }
 
 static inline int ip6_frag_mem(struct net *net)
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 0099f0c..9b97f2e 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -96,13 +96,13 @@ static int inet_frags_init_percpu_limit(struct netns_frags *nf)
 		INIT_LIST_HEAD(&percpu->lru_list);
 		spin_lock_init(&percpu->lru_lock);
 		atomic_set(&percpu->mem, 0);
+		percpu->nqueues = 0;
 	}
 	return 1;
 }
 
 void inet_frags_init_net(struct netns_frags *nf)
 {
-	nf->nqueues = 0;
 	inet_frags_init_percpu_limit(nf);
 }
 EXPORT_SYMBOL(inet_frags_init_net);
@@ -131,7 +131,6 @@ static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
 {
 	write_lock(&f->lock);
 	hlist_del(&fq->list);
-	fq->net->nqueues--;
 	write_unlock(&f->lock);
 	inet_frag_lru_del(fq);
 }
@@ -280,7 +279,6 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
 
 	atomic_inc(&qp->refcnt);
 	hlist_add_head(&qp->list, &f->hash[hash]);
-	nf->nqueues++;
 	write_unlock(&f->lock);
 	inet_frag_lru_add(nf, qp);
 	return qp;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 99944a8..26fd2b7 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -118,7 +118,7 @@ static struct inet_frags ip4_frags;
 
 int ip_frag_nqueues(struct net *net)
 {
-	return net->ipv4.frags.nqueues;
+	return sum_frag_nqueues(&net->ipv4.frags);
 }
 
 int ip_frag_mem(struct net *net)

^ permalink raw reply related

* [net-next PATCH V2 6/9] net: frag, implement dynamic percpu alloc of frag_cpu_limit
From: Jesper Dangaard Brouer @ 2012-11-29 16:14 UTC (permalink / raw)
  To: Eric Dumazet, David S. Miller, Florian Westphal
  Cc: Jesper Dangaard Brouer, netdev, Pablo Neira Ayuso, Thomas Graf,
	Cong Wang, Patrick McHardy, Paul E. McKenney, Herbert Xu
In-Reply-To: <20121129161019.17754.29670.stgit@dragon>

Use the percpu API to implement dynamic per CPU allocation
of the frag_cpu_limit in struct netns_frags.  This replaces
the static array percpu[NR_CPUS].

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
Its the first time I use the percpu API, please let me know
if I'm using it correctly.

 include/net/inet_frag.h  |   39 ++++++++++++++++++++++++++-------------
 net/ipv4/inet_fragment.c |   34 +++++++++++++++++++++++-----------
 2 files changed, 49 insertions(+), 24 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 8421904..3eadf42 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -3,6 +3,7 @@
 
 #include <linux/spinlock.h>
 #include <linux/atomic.h>
+#include <linux/percpu.h>
 
 /* Need to maintain these resource limits per CPU, else we will kill
  * performance due to cache-line bouncing
@@ -16,7 +17,7 @@ struct frag_cpu_limit {
 struct netns_frags {
 	int			nqueues;
 
-	struct frag_cpu_limit	percpu[NR_CPUS];
+	struct frag_cpu_limit __percpu *percpu;
 
 	/* sysctls */
 	int			timeout;
@@ -92,26 +93,32 @@ static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f
 static inline void inet_frag_lru_move(struct inet_frag_queue *q)
 {
 	int cpu = q->cpu_alloc;
-	spin_lock(&q->net->percpu[cpu].lru_lock);
-	list_move_tail(&q->lru_list, &q->net->percpu[cpu].lru_list);
-	spin_unlock(&q->net->percpu[cpu].lru_lock);
+	struct frag_cpu_limit *percpu = per_cpu_ptr(q->net->percpu, cpu);
+
+	spin_lock(&percpu->lru_lock);
+	list_move_tail(&q->lru_list, &percpu->lru_list);
+	spin_unlock(&percpu->lru_lock);
 }
 
 static inline void inet_frag_lru_del(struct inet_frag_queue *q)
 {
 	int cpu = q->cpu_alloc;
-	spin_lock(&q->net->percpu[cpu].lru_lock);
+	struct frag_cpu_limit *percpu = per_cpu_ptr(q->net->percpu, cpu);
+
+	spin_lock(&percpu->lru_lock);
 	list_del(&q->lru_list);
-	spin_unlock(&q->net->percpu[cpu].lru_lock);
+	spin_unlock(&percpu->lru_lock);
 }
 
 static inline void inet_frag_lru_add(struct netns_frags *nf,
 				     struct inet_frag_queue *q)
 {
 	int cpu = q->cpu_alloc;
-	spin_lock(&nf->percpu[cpu].lru_lock);
-	list_add_tail(&q->lru_list, &nf->percpu[cpu].lru_list);
-	spin_unlock(&nf->percpu[cpu].lru_lock);
+	struct frag_cpu_limit *percpu = per_cpu_ptr(nf->percpu, cpu);
+
+	spin_lock(&percpu->lru_lock);
+	list_add_tail(&q->lru_list, &percpu->lru_list);
+	spin_unlock(&percpu->lru_lock);
 }
 
 /* Memory Tracking Functions. */
@@ -119,21 +126,27 @@ static inline void inet_frag_lru_add(struct netns_frags *nf,
 static inline void sub_frag_mem_limit(struct inet_frag_queue *q, int i)
 {
 	int cpu = q->cpu_alloc;
-	atomic_sub(i, &q->net->percpu[cpu].mem);
+	struct frag_cpu_limit *percpu = per_cpu_ptr(q->net->percpu, cpu);
+	atomic_sub(i, &percpu->mem);
 }
 
 static inline void add_frag_mem_limit(struct inet_frag_queue *q, int i)
 {
 	int cpu = q->cpu_alloc;
-	atomic_add(i, &q->net->percpu[cpu].mem);
+	struct frag_cpu_limit *percpu = per_cpu_ptr(q->net->percpu, cpu);
+	atomic_add(i, &percpu->mem);
 }
 
 static inline int sum_frag_mem_limit(struct netns_frags *nf)
 {
 	unsigned int sum = 0;
 	int cpu;
-	for_each_possible_cpu(cpu)
-		sum += atomic_read(&nf->percpu[cpu].mem);
+
+	for_each_possible_cpu(cpu) {
+		struct frag_cpu_limit *percpu = per_cpu_ptr(nf->percpu, cpu);
+
+		sum += atomic_read(&percpu->mem);
+	}
 	return sum;
 }
 
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 068aabe..0099f0c 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -25,7 +25,8 @@
 
 static inline int frag_mem_limit_on_cpu(struct netns_frags *nf, int on_cpu)
 {
-	return atomic_read(&nf->percpu[on_cpu].mem);
+	struct frag_cpu_limit *percpu = per_cpu_ptr(nf->percpu, on_cpu);
+	return atomic_read(&percpu->mem);
 }
 
 static inline int frag_mem_limit(struct netns_frags *nf)
@@ -81,14 +82,22 @@ void inet_frags_init(struct inet_frags *f)
 }
 EXPORT_SYMBOL(inet_frags_init);
 
-static void inet_frags_init_percpu_limit(struct netns_frags *nf)
+static int inet_frags_init_percpu_limit(struct netns_frags *nf)
 {
 	int cpu;
+
+	nf->percpu = alloc_percpu(struct frag_cpu_limit);
+	if (!nf->percpu)
+		return -ENOMEM;
+
 	for_each_possible_cpu(cpu) {
-		INIT_LIST_HEAD(&nf->percpu[cpu].lru_list);
-		spin_lock_init(&nf->percpu[cpu].lru_lock);
-		atomic_set(&nf->percpu[cpu].mem, 0);
+		struct frag_cpu_limit *percpu = per_cpu_ptr(nf->percpu, cpu);
+
+		INIT_LIST_HEAD(&percpu->lru_list);
+		spin_lock_init(&percpu->lru_lock);
+		atomic_set(&percpu->mem, 0);
 	}
+	return 1;
 }
 
 void inet_frags_init_net(struct netns_frags *nf)
@@ -113,6 +122,8 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
 	for_each_possible_cpu(cpu)
 		inet_frag_evictor(nf, f, true, cpu);
 	local_bh_enable();
+
+	free_percpu(nf->percpu);
 }
 EXPORT_SYMBOL(inet_frags_exit_net);
 
@@ -184,6 +195,7 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f,
 	struct inet_frag_queue *q;
 	int work, evicted = 0;
 	int cpu = (likely(on_cpu < 0)) ? smp_processor_id() : on_cpu;
+	struct frag_cpu_limit *percpu = per_cpu_ptr(nf->percpu, cpu);
 
 	if (!force) {
 		if (frag_mem_limit_on_cpu(nf, cpu) <= nf->high_thresh)
@@ -192,14 +204,14 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f,
 
 	work = frag_mem_limit_on_cpu(nf, cpu) - nf->low_thresh;
 	while (work > 0) {
-		spin_lock(&nf->percpu[cpu].lru_lock);
+		spin_lock(&percpu->lru_lock);
 
-		if (list_empty(&nf->percpu[cpu].lru_list)) {
-			spin_unlock(&nf->percpu[cpu].lru_lock);
+		if (list_empty(&percpu->lru_list)) {
+			spin_unlock(&percpu->lru_lock);
 			break;
 		}
 
-		q = list_first_entry(&nf->percpu[cpu].lru_list,
+		q = list_first_entry(&percpu->lru_list,
 				struct inet_frag_queue, lru_list);
 
 		/* queue entry is warm, i.e. new frags are arriving
@@ -209,12 +221,12 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f,
 		 * completes.
 		 */
 		if (!force && q->creation_ts == (u32) jiffies) {
-			spin_unlock(&nf->percpu[cpu].lru_lock);
+			spin_unlock(&percpu->lru_lock);
 			break;
 		}
 
 		atomic_inc(&q->refcnt);
-		spin_unlock(&nf->percpu[cpu].lru_lock);
+		spin_unlock(&percpu->lru_lock);
 
 		spin_lock(&q->lock);
 		if (!(q->last_in & INET_FRAG_COMPLETE))

^ permalink raw reply related

* [net-next PATCH V2 8/9] net: frag queue locking per hash bucket
From: Jesper Dangaard Brouer @ 2012-11-29 16:15 UTC (permalink / raw)
  To: Eric Dumazet, David S. Miller, Florian Westphal
  Cc: Jesper Dangaard Brouer, netdev, Pablo Neira Ayuso, Thomas Graf,
	Cong Wang, Patrick McHardy, Paul E. McKenney, Herbert Xu
In-Reply-To: <20121129161019.17754.29670.stgit@dragon>

This patch implements per hash bucket locking for the frag queue
hash.  This removes two write locks, and the only remaining write
lock is for protecting hash rebuild.  This essentially reduce the
readers-writer lock to a rebuild lock.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>

---
V2:
 - Fixed two bugs
 - 1) Missed/too-late read_lock() for protecting hashfn in fq_unlink()
 - 2) Uses old hash bucket instead of new dest bucket in inet_frag_secret_rebuild()

 include/net/inet_frag.h  |   10 +++++++-
 net/ipv4/inet_fragment.c |   56 +++++++++++++++++++++++++++++++++++-----------
 2 files changed, 51 insertions(+), 15 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index f58590f..431f68e 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -49,9 +49,15 @@ struct inet_frag_queue {
 
 #define INETFRAGS_HASHSZ		64
 
+
+struct inet_frag_bucket {
+	struct hlist_head	chain;
+	spinlock_t		chain_lock;
+};
+
 struct inet_frags {
-	struct hlist_head	hash[INETFRAGS_HASHSZ];
-	rwlock_t		lock;
+	struct inet_frag_bucket	hash[INETFRAGS_HASHSZ];
+	rwlock_t		lock; /* Rebuild lock */
 	u32			rnd;
 	int			qsize;
 	int			secret_interval;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 9b97f2e..59999e6 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -41,20 +41,27 @@ static void inet_frag_secret_rebuild(unsigned long dummy)
 	unsigned long now = jiffies;
 	int i;
 
+	/* Per bucket lock NOT needed here, due to write lock protection */
 	write_lock(&f->lock);
+
 	get_random_bytes(&f->rnd, sizeof(u32));
 	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
+		struct inet_frag_bucket *hb;
 		struct inet_frag_queue *q;
 		struct hlist_node *p, *n;
 
-		hlist_for_each_entry_safe(q, p, n, &f->hash[i], list) {
+		hb = &f->hash[i];
+		hlist_for_each_entry_safe(q, p, n, &hb->chain, list) {
 			unsigned int hval = f->hashfn(q);
 
 			if (hval != i) {
+				struct inet_frag_bucket *hb_dest;
+
 				hlist_del(&q->list);
 
 				/* Relink to new hash chain. */
-				hlist_add_head(&q->list, &f->hash[hval]);
+				hb_dest = &f->hash[hval];
+				hlist_add_head(&q->list, &hb->chain);
 			}
 		}
 	}
@@ -67,9 +74,12 @@ void inet_frags_init(struct inet_frags *f)
 {
 	int i;
 
-	for (i = 0; i < INETFRAGS_HASHSZ; i++)
-		INIT_HLIST_HEAD(&f->hash[i]);
+	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
+		struct inet_frag_bucket *hb = &f->hash[i];
 
+		spin_lock_init(&hb->chain_lock);
+		INIT_HLIST_HEAD(&hb->chain);
+	}
 	rwlock_init(&f->lock);
 
 	f->rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
@@ -129,9 +139,17 @@ EXPORT_SYMBOL(inet_frags_exit_net);
 
 static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
 {
-	write_lock(&f->lock);
+	struct inet_frag_bucket *hb;
+	unsigned int hash;
+
+	read_lock(&f->lock);
+	hash = f->hashfn(fq);
+	hb = &f->hash[hash];
+
+	spin_lock_bh(&hb->chain_lock);
 	hlist_del(&fq->list);
-	write_unlock(&f->lock);
+	spin_unlock_bh(&hb->chain_lock);
+	read_unlock(&f->lock);
 	inet_frag_lru_del(fq);
 }
 
@@ -245,28 +263,33 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
 		struct inet_frag_queue *qp_in, struct inet_frags *f,
 		void *arg)
 {
+	struct inet_frag_bucket *hb;
 	struct inet_frag_queue *qp;
 #ifdef CONFIG_SMP
 	struct hlist_node *n;
 #endif
 	unsigned int hash;
 
-	write_lock(&f->lock);
+	read_lock(&f->lock); /* Protects against hash rebuild */
 	/*
 	 * While we stayed w/o the lock other CPU could update
 	 * the rnd seed, so we need to re-calculate the hash
 	 * chain. Fortunatelly the qp_in can be used to get one.
 	 */
 	hash = f->hashfn(qp_in);
+	hb = &f->hash[hash];
+	spin_lock_bh(&hb->chain_lock);
+
 #ifdef CONFIG_SMP
 	/* With SMP race we have to recheck hash table, because
 	 * such entry could be created on other cpu, while we
-	 * promoted read lock to write lock.
+	 * promoted read lock to write lock. ***Comment FIXME***
 	 */
-	hlist_for_each_entry(qp, n, &f->hash[hash], list) {
+	hlist_for_each_entry(qp, n, &hb->chain, list) {
 		if (qp->net == nf && f->match(qp, arg)) {
 			atomic_inc(&qp->refcnt);
-			write_unlock(&f->lock);
+			spin_unlock_bh(&hb->chain_lock);
+			read_unlock(&f->lock);
 			qp_in->last_in |= INET_FRAG_COMPLETE;
 			inet_frag_put(qp_in, f);
 			return qp;
@@ -278,8 +301,9 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
 		atomic_inc(&qp->refcnt);
 
 	atomic_inc(&qp->refcnt);
-	hlist_add_head(&qp->list, &f->hash[hash]);
-	write_unlock(&f->lock);
+	hlist_add_head(&qp->list, &hb->chain);
+	spin_unlock_bh(&hb->chain_lock);
+	read_unlock(&f->lock);
 	inet_frag_lru_add(nf, qp);
 	return qp;
 }
@@ -328,16 +352,22 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
 		struct inet_frags *f, void *key, unsigned int hash)
 	__releases(&f->lock)
 {
+	struct inet_frag_bucket *hb;
 	struct inet_frag_queue *q;
 	struct hlist_node *n;
 
-	hlist_for_each_entry(q, n, &f->hash[hash], list) {
+	hb = &f->hash[hash];
+
+	spin_lock_bh(&hb->chain_lock);
+	hlist_for_each_entry(q, n, &hb->chain, list) {
 		if (q->net == nf && f->match(q, key)) {
 			atomic_inc(&q->refcnt);
+			spin_unlock_bh(&hb->chain_lock);
 			read_unlock(&f->lock);
 			return q;
 		}
 	}
+	spin_unlock_bh(&hb->chain_lock);
 	read_unlock(&f->lock);
 
 	return inet_frag_create(nf, f, key);

^ permalink raw reply related

* [net-next PATCH V2 9/9] net: increase frag queue hash size and cache-line
From: Jesper Dangaard Brouer @ 2012-11-29 16:16 UTC (permalink / raw)
  To: Eric Dumazet, David S. Miller, Florian Westphal
  Cc: Jesper Dangaard Brouer, netdev, Pablo Neira Ayuso, Thomas Graf,
	Cong Wang, Patrick McHardy, Paul E. McKenney, Herbert Xu
In-Reply-To: <20121129161019.17754.29670.stgit@dragon>

Increase frag queue hash size and assure cache-line alignment to
avoid false sharing.  Hash size is set to 256, because I have
observed 206 frag queues in use at 4x10G with packet size 4416 bytes
(three fragments).

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---

 include/net/inet_frag.h  |    5 ++---
 net/ipv4/inet_fragment.c |    2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 431f68e..c8ad7e4 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -47,13 +47,12 @@ struct inet_frag_queue {
 	u16			max_size;
 };
 
-#define INETFRAGS_HASHSZ		64
-
+#define INETFRAGS_HASHSZ	256
 
 struct inet_frag_bucket {
 	struct hlist_head	chain;
 	spinlock_t		chain_lock;
-};
+} ____cacheline_aligned_in_smp;
 
 struct inet_frags {
 	struct inet_frag_bucket	hash[INETFRAGS_HASHSZ];
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 59999e6..44c9c75 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -61,7 +61,7 @@ static void inet_frag_secret_rebuild(unsigned long dummy)
 
 				/* Relink to new hash chain. */
 				hb_dest = &f->hash[hval];
-				hlist_add_head(&q->list, &hb->chain);
+				hlist_add_head(&q->list, &hb_dest->chain);
 			}
 		}
 	}

^ permalink raw reply related

* Re: [PATCH v2 3/3] pppoatm: protect against freeing of vcc
From: David Woodhouse @ 2012-11-29 16:24 UTC (permalink / raw)
  To: chas williams - CONTRACTOR
  Cc: Krzysztof Mazur, David Laight, davem, netdev, linux-kernel,
	nathan
In-Reply-To: <20121129105934.3e0c3a04@thirdoffive.cmf.nrl.navy.mil>

[-- Attachment #1: Type: text/plain, Size: 1278 bytes --]

On Thu, 2012-11-29 at 10:59 -0500, chas williams - CONTRACTOR wrote:
> the part that bothers me (and i dont have the programmer's guide for
> the solos hardware) is that you are watching for the PKT_PCLOSE to be
> sent to the card.  shouldnt you be watching for the PKT_PCLOSE to be
> returned from the card (assuming it does such a thing) so that you can
> be assured that the tx/rx for this vpi/vci pair has been "stopped"?

Define "stopped".

For the RX case... the other end may *always* take it upon itself to
send us a packet marked with arbitrary VCI/VPI, right? There's no
connection setup for it "on the wire", in the case of PVC?

So bearing that in mind: from the moment ATM_VF_READY gets cleared, as
far as the ATM core is concerned, we will no longer receive packets on
the given VCC. If we receive any, we'll just complain about receiving
packets for an unknown VCI/VPI.

For the TX case ... yes, we need to be sure we aren't continuing to send
packets after our close() routine completes. We *used* to, but the
resulting ->pop() calls were causing problems, and that's why we're
looking at this code path closer. The currently proposed patches (except
one suggestion from Krzyztof that we both shouted down) would fix that.

-- 
dwmw2

[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 6171 bytes --]

^ permalink raw reply

* Re: [PATCH v2 3/3] pppoatm: protect against freeing of vcc
From: Krzysztof Mazur @ 2012-11-29 16:28 UTC (permalink / raw)
  To: David Woodhouse
  Cc: David Laight, chas williams - CONTRACTOR, davem, netdev,
	linux-kernel, nathan
In-Reply-To: <1354204077.21562.172.camel@shinybook.infradead.org>

On Thu, Nov 29, 2012 at 03:47:57PM +0000, David Woodhouse wrote:
> On Thu, 2012-11-29 at 16:09 +0100, Krzysztof Mazur wrote:
> > 
> > I don't like two thinks about this patch:
> > 
> >         - if allos_skb(sizeof(*header), GFP_ATOMIC) at beginning of
> >           pclose() fails we will crash
> > 
> >         - if card wakes up after this timeout we will probably crash too
> > 
> > That's why proposed different approach, but it has other problems.
> 
> How about this variant on what you suggested. Yes, we can definitely
> remove everything that's in the queue... as long as we use
> skb_queue_walk_safe() instead of skb_queue_walk().
> 
> We can use GFP_KERNEL instead of GFP_ATOMIC, which at least reduces the
> likelihood of failing to close the vcc.
> 
> We end up waiting *only* if there is a packet which is *currently* being
> DMA'd to the card. And if the card doesn't take that within 5 seconds,
> it almost certainly never will. So I can live with that.
> 

Yeah, that shouldn't happen.

> +				if (!test_bit(ATM_VF_READY, &vcc->flags))
> +					wake_up(&card->param_wq);
> +			} else

according to CodingStyle:

+ } else {
>  				dev_kfree_skb_irq(oldskb);
> -			}
+ }

Krzysiek

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox