Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 3/5] RDMA/cxgb4: Fix LE hash collision bug for active open connection
From: Vipul Pandya @ 2012-11-29 14:52 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: roland-BHEL68pLQRGGvPXPguhicg, davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	divy-ut6Up61K2wZBDgjK7y7TUQ, dm-ut6Up61K2wZBDgjK7y7TUQ,
	kumaras-ut6Up61K2wZBDgjK7y7TUQ,
	swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW,
	abhishek-ut6Up61K2wZBDgjK7y7TUQ, Vipul Pandya
In-Reply-To: <1354200745-23598-1-git-send-email-vipul-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>

It enables establishing active open connection using fw_ofld_connection work
request when cpl_act_open_rpl says TCAM full error which may be because
of LE hash collision. Current support is only for IPv4 active open connections.

Sets ntuple bits in active open requests. For T4 firmware greater than 1.4.10.0
ntuple bits are required to be set.

Adds nocong and enable_ecn module parameter options.

Signed-off-by: Vipul Pandya <vipul-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
---
 drivers/infiniband/hw/cxgb4/cm.c              |  158 ++++++++++++++++++++++++-
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h        |    1 +
 drivers/net/ethernet/chelsio/cxgb4/t4_msg.h   |   19 +++
 drivers/net/ethernet/chelsio/cxgb4/t4_regs.h  |    2 +
 drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h |  109 +++++++++++++++++
 5 files changed, 283 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 6cfd4d8..560ca39 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -61,6 +61,14 @@ static char *states[] = {
 	NULL,
 };
 
+static int nocong;
+module_param(nocong, int, 0644);
+MODULE_PARM_DESC(nocong, "Turn of congestion control (default=0)");
+
+static int enable_ecn;
+module_param(enable_ecn, int, 0644);
+MODULE_PARM_DESC(enable_ecn, "Enable ECN (default=0/disabled)");
+
 static int dack_mode = 1;
 module_param(dack_mode, int, 0644);
 MODULE_PARM_DESC(dack_mode, "Delayed ack mode (default=1)");
@@ -442,6 +450,50 @@ static int send_abort(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp)
 	return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
 }
 
+#define VLAN_NONE 0xfff
+#define FILTER_SEL_VLAN_NONE 0xffff
+#define FILTER_SEL_WIDTH_P_FC (3+1) /* port uses 3 bits, FCoE one bit */
+#define FILTER_SEL_WIDTH_VIN_P_FC \
+	(6 + 7 + FILTER_SEL_WIDTH_P_FC) /* 6 bits are unused, VF uses 7 bits*/
+#define FILTER_SEL_WIDTH_TAG_P_FC \
+	(3 + FILTER_SEL_WIDTH_VIN_P_FC) /* PF uses 3 bits */
+#define FILTER_SEL_WIDTH_VLD_TAG_P_FC (1 + FILTER_SEL_WIDTH_TAG_P_FC)
+
+static unsigned int select_ntuple(struct c4iw_dev *dev, struct dst_entry *dst,
+				  struct l2t_entry *l2t)
+{
+	unsigned int ntuple = 0;
+	u32 viid;
+
+	switch (dev->rdev.lldi.filt_mode) {
+
+	/* default filter mode */
+	case HW_TPL_FR_MT_PR_IV_P_FC:
+		if (l2t->vlan == VLAN_NONE)
+			ntuple |= FILTER_SEL_VLAN_NONE << FILTER_SEL_WIDTH_P_FC;
+		else {
+			ntuple |= l2t->vlan << FILTER_SEL_WIDTH_P_FC;
+			ntuple |= 1 << FILTER_SEL_WIDTH_VLD_TAG_P_FC;
+		}
+		ntuple |= l2t->lport << S_PORT | IPPROTO_TCP <<
+			  FILTER_SEL_WIDTH_VLD_TAG_P_FC;
+		break;
+	case HW_TPL_FR_MT_PR_OV_P_FC: {
+		viid = cxgb4_port_viid(l2t->neigh->dev);
+
+		ntuple |= FW_VIID_VIN_GET(viid) << FILTER_SEL_WIDTH_P_FC;
+		ntuple |= FW_VIID_PFN_GET(viid) << FILTER_SEL_WIDTH_VIN_P_FC;
+		ntuple |= FW_VIID_VIVLD_GET(viid) << FILTER_SEL_WIDTH_TAG_P_FC;
+		ntuple |= l2t->lport << S_PORT | IPPROTO_TCP <<
+			  FILTER_SEL_WIDTH_VLD_TAG_P_FC;
+		break;
+	}
+	default:
+		break;
+	}
+	return ntuple;
+}
+
 static int send_connect(struct c4iw_ep *ep)
 {
 	struct cpl_act_open_req *req;
@@ -464,7 +516,8 @@ static int send_connect(struct c4iw_ep *ep)
 
 	cxgb4_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx);
 	wscale = compute_wscale(rcv_win);
-	opt0 = KEEP_ALIVE(1) |
+	opt0 = (nocong ? NO_CONG(1) : 0) |
+	       KEEP_ALIVE(1) |
 	       DELACK(1) |
 	       WND_SCALE(wscale) |
 	       MSS_IDX(mtu_idx) |
@@ -475,6 +528,7 @@ static int send_connect(struct c4iw_ep *ep)
 	       ULP_MODE(ULP_MODE_TCPDDP) |
 	       RCV_BUFSIZ(rcv_win>>10);
 	opt2 = RX_CHANNEL(0) |
+	       CCTRL_ECN(enable_ecn) |
 	       RSS_QUEUE_VALID | RSS_QUEUE(ep->rss_qid);
 	if (enable_tcp_timestamps)
 		opt2 |= TSTAMPS_EN(1);
@@ -493,7 +547,7 @@ static int send_connect(struct c4iw_ep *ep)
 	req->local_ip = ep->com.local_addr.sin_addr.s_addr;
 	req->peer_ip = ep->com.remote_addr.sin_addr.s_addr;
 	req->opt0 = cpu_to_be64(opt0);
-	req->params = 0;
+	req->params = cpu_to_be32(select_ntuple(ep->com.dev, ep->dst, ep->l2t));
 	req->opt2 = cpu_to_be32(opt2);
 	return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
 }
@@ -1384,6 +1438,61 @@ static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
 	return 0;
 }
 
+static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid)
+{
+	struct sk_buff *skb;
+	struct fw_ofld_connection_wr *req;
+	unsigned int mtu_idx;
+	int wscale;
+
+	skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
+	req = (struct fw_ofld_connection_wr *)__skb_put(skb, sizeof(*req));
+	memset(req, 0, sizeof(*req));
+	req->op_compl = htonl(V_WR_OP(FW_OFLD_CONNECTION_WR));
+	req->len16_pkd = htonl(FW_WR_LEN16(DIV_ROUND_UP(sizeof(*req), 16)));
+	req->le.filter = cpu_to_be32(select_ntuple(ep->com.dev, ep->dst,
+				     ep->l2t));
+	req->le.lport = ep->com.local_addr.sin_port;
+	req->le.pport = ep->com.remote_addr.sin_port;
+	req->le.u.ipv4.lip = ep->com.local_addr.sin_addr.s_addr;
+	req->le.u.ipv4.pip = ep->com.remote_addr.sin_addr.s_addr;
+	req->tcb.t_state_to_astid =
+			htonl(V_FW_OFLD_CONNECTION_WR_T_STATE(TCP_SYN_SENT) |
+			V_FW_OFLD_CONNECTION_WR_ASTID(atid));
+	req->tcb.cplrxdataack_cplpassacceptrpl =
+			htons(F_FW_OFLD_CONNECTION_WR_CPLRXDATAACK);
+	req->tcb.tx_max = jiffies;
+	cxgb4_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx);
+	wscale = compute_wscale(rcv_win);
+	req->tcb.opt0 = TCAM_BYPASS(1) |
+		(nocong ? NO_CONG(1) : 0) |
+		KEEP_ALIVE(1) |
+		DELACK(1) |
+		WND_SCALE(wscale) |
+		MSS_IDX(mtu_idx) |
+		L2T_IDX(ep->l2t->idx) |
+		TX_CHAN(ep->tx_chan) |
+		SMAC_SEL(ep->smac_idx) |
+		DSCP(ep->tos) |
+		ULP_MODE(ULP_MODE_TCPDDP) |
+		RCV_BUFSIZ(rcv_win >> 10);
+	req->tcb.opt2 = PACE(1) |
+		TX_QUEUE(ep->com.dev->rdev.lldi.tx_modq[ep->tx_chan]) |
+		RX_CHANNEL(0) |
+		CCTRL_ECN(enable_ecn) |
+		RSS_QUEUE_VALID | RSS_QUEUE(ep->rss_qid);
+	if (enable_tcp_timestamps)
+		req->tcb.opt2 |= TSTAMPS_EN(1);
+	if (enable_tcp_sack)
+		req->tcb.opt2 |= SACK_EN(1);
+	if (wscale && enable_tcp_window_scaling)
+		req->tcb.opt2 |= WND_SCALE_EN(1);
+	req->tcb.opt0 = cpu_to_be64(req->tcb.opt0);
+	req->tcb.opt2 = cpu_to_be32(req->tcb.opt2);
+	set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
+	c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+}
+
 /*
  * Return whether a failed active open has allocated a TID
  */
@@ -1420,6 +1529,14 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
 	case CPL_ERR_CONN_RESET:
 	case CPL_ERR_CONN_TIMEDOUT:
 		break;
+	case CPL_ERR_TCAM_FULL:
+		mutex_lock(&dev->rdev.stats.lock);
+		dev->rdev.stats.tcam_full++;
+		mutex_unlock(&dev->rdev.stats.lock);
+		send_fw_act_open_req(ep,
+			GET_TID_TID(GET_AOPEN_ATID(ntohl(rpl->atid_status))));
+		return 0;
+		break;
 	default:
 		printk(KERN_INFO MOD "Active open failure - "
 		       "atid %u status %u errno %d %pI4:%u->%pI4:%u\n",
@@ -1511,14 +1628,15 @@ static void accept_cr(struct c4iw_ep *ep, __be32 peer_ip, struct sk_buff *skb,
 	skb_get(skb);
 	cxgb4_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx);
 	wscale = compute_wscale(rcv_win);
-	opt0 = KEEP_ALIVE(1) |
+	opt0 = (nocong ? NO_CONG(1) : 0) |
+	       KEEP_ALIVE(1) |
 	       DELACK(1) |
 	       WND_SCALE(wscale) |
 	       MSS_IDX(mtu_idx) |
 	       L2T_IDX(ep->l2t->idx) |
 	       TX_CHAN(ep->tx_chan) |
 	       SMAC_SEL(ep->smac_idx) |
-	       DSCP(ep->tos) |
+	       DSCP(ep->tos >> 2) |
 	       ULP_MODE(ULP_MODE_TCPDDP) |
 	       RCV_BUFSIZ(rcv_win>>10);
 	opt2 = RX_CHANNEL(0) |
@@ -1530,6 +1648,15 @@ static void accept_cr(struct c4iw_ep *ep, __be32 peer_ip, struct sk_buff *skb,
 		opt2 |= SACK_EN(1);
 	if (wscale && enable_tcp_window_scaling)
 		opt2 |= WND_SCALE_EN(1);
+	if (enable_ecn) {
+		const struct tcphdr *tcph;
+		u32 hlen = ntohl(req->hdr_len);
+
+		tcph = (const void *)(req + 1) + G_ETH_HDR_LEN(hlen) +
+			G_IP_HDR_LEN(hlen);
+		if (tcph->ece && tcph->cwr)
+			opt2 |= CCTRL_ECN(1);
+	}
 
 	rpl = cplhdr(skb);
 	INIT_TP_WR(rpl, ep->hwtid);
@@ -2649,11 +2776,14 @@ static int fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb)
 	struct cpl_fw6_msg *rpl = cplhdr(skb);
 	struct c4iw_wr_wait *wr_waitp;
 	int ret;
+	u8 opcode;
+	struct cpl_fw6_msg_ofld_connection_wr_rpl *req;
+	struct c4iw_ep *ep;
 
 	PDBG("%s type %u\n", __func__, rpl->type);
 
 	switch (rpl->type) {
-	case 1:
+	case FW6_TYPE_WR_RPL:
 		ret = (int)((be64_to_cpu(rpl->data[0]) >> 8) & 0xff);
 		wr_waitp = (struct c4iw_wr_wait *)(__force unsigned long) rpl->data[1];
 		PDBG("%s wr_waitp %p ret %u\n", __func__, wr_waitp, ret);
@@ -2661,9 +2791,25 @@ static int fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb)
 			c4iw_wake_up(wr_waitp, ret ? -ret : 0);
 		kfree_skb(skb);
 		break;
-	case 2:
+	case FW6_TYPE_CQE:
 		sched(dev, skb);
 		break;
+	case FW6_TYPE_OFLD_CONNECTION_WR_RPL:
+		opcode = *(const u8 *)rpl->data;
+		if (opcode == FW_OFLD_CONNECTION_WR) {
+			req =
+			(struct cpl_fw6_msg_ofld_connection_wr_rpl *)rpl->data;
+			if (req->t_state == TCP_SYN_SENT
+			    && (req->retval == FW_ENOMEM
+				|| req->retval == FW_EADDRINUSE)) {
+				ep = (struct c4iw_ep *)
+				     lookup_atid(dev->rdev.lldi.tids,
+						 req->tid);
+				c4iw_l2t_send(&dev->rdev, skb, ep->l2t);
+				return 0;
+			}
+		}
+		break;
 	default:
 		printk(KERN_ERR MOD "%s unexpected fw6 msg type %u\n", __func__,
 		       rpl->type);
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index 9beb3a9..6a17fde 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -130,6 +130,7 @@ struct c4iw_stats {
 	u64  db_empty;
 	u64  db_drop;
 	u64  db_state_transitions;
+	u64  tcam_full;
 };
 
 struct c4iw_rdev {
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
index 480eb43..5b51c01 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
@@ -193,6 +193,10 @@ struct work_request_hdr {
 	__be64 wr_lo;
 };
 
+/* wr_hi fields */
+#define S_WR_OP    24
+#define V_WR_OP(x) ((__u64)(x) << S_WR_OP)
+
 #define WR_HDR struct work_request_hdr wr
 
 struct cpl_pass_open_req {
@@ -204,12 +208,14 @@ struct cpl_pass_open_req {
 	__be32 peer_ip;
 	__be64 opt0;
 #define TX_CHAN(x)    ((x) << 2)
+#define NO_CONG(x)    ((x) << 4)
 #define DELACK(x)     ((x) << 5)
 #define ULP_MODE(x)   ((x) << 8)
 #define RCV_BUFSIZ(x) ((x) << 12)
 #define DSCP(x)       ((x) << 22)
 #define SMAC_SEL(x)   ((u64)(x) << 28)
 #define L2T_IDX(x)    ((u64)(x) << 36)
+#define TCAM_BYPASS(x) ((u64)(x) << 48)
 #define NAGLE(x)      ((u64)(x) << 49)
 #define WND_SCALE(x)  ((u64)(x) << 50)
 #define KEEP_ALIVE(x) ((u64)(x) << 54)
@@ -247,8 +253,10 @@ struct cpl_pass_accept_rpl {
 #define RSS_QUEUE_VALID      (1 << 10)
 #define RX_COALESCE_VALID(x) ((x) << 11)
 #define RX_COALESCE(x)       ((x) << 12)
+#define PACE(x)	      ((x) << 16)
 #define TX_QUEUE(x)          ((x) << 23)
 #define RX_CHANNEL(x)        ((x) << 26)
+#define CCTRL_ECN(x)         ((x) << 27)
 #define WND_SCALE_EN(x)      ((x) << 28)
 #define TSTAMPS_EN(x)        ((x) << 29)
 #define SACK_EN(x)           ((x) << 30)
@@ -635,6 +643,17 @@ struct cpl_fw6_msg {
 /* cpl_fw6_msg.type values */
 enum {
 	FW6_TYPE_CMD_RPL = 0,
+	FW6_TYPE_WR_RPL = 1,
+	FW6_TYPE_CQE = 2,
+	FW6_TYPE_OFLD_CONNECTION_WR_RPL = 3,
+};
+
+struct cpl_fw6_msg_ofld_connection_wr_rpl {
+	__u64   cookie;
+	__be32  tid;    /* or atid in case of active failure */
+	__u8    t_state;
+	__u8    retval;
+	__u8    rsvd[2];
 };
 
 enum {
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
index 66ab614..bdc1bbc 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
@@ -1065,4 +1065,6 @@
 
 #define A_TP_TX_SCHED_PCMD 0x25
 
+#define S_PORT    1
+
 #endif /* __T4_REGS_H */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
index b4dc560..6e3542f 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
@@ -39,6 +39,11 @@ enum fw_ret_val {
 	FW_ENOEXEC      = 8,            /* Exec format error; inv microcode */
 };
 
+enum fw_retval {
+	FW_ENOMEM               = 12,   /* out of memory */
+	FW_EADDRINUSE           = 98,   /* address already in use */
+};
+
 #define FW_T4VF_SGE_BASE_ADDR      0x0000
 #define FW_T4VF_MPS_BASE_ADDR      0x0100
 #define FW_T4VF_PL_BASE_ADDR       0x0200
@@ -50,6 +55,7 @@ enum fw_wr_opcodes {
 	FW_ULPTX_WR                    = 0x04,
 	FW_TP_WR                       = 0x05,
 	FW_ETH_TX_PKT_WR               = 0x08,
+	FW_OFLD_CONNECTION_WR          = 0x2f,
 	FW_FLOWC_WR                    = 0x0a,
 	FW_OFLD_TX_DATA_WR             = 0x0b,
 	FW_CMD_WR                      = 0x10,
@@ -84,6 +90,7 @@ struct fw_wr_hdr {
 #define FW_WR_LEN16(x)	((x) << 0)
 
 #define HW_TPL_FR_MT_PR_IV_P_FC         0X32B
+#define HW_TPL_FR_MT_PR_OV_P_FC         0X327
 
 /* filter wr reply code in cookie in CPL_SET_TCB_RPL */
 enum fw_filter_wr_cookie {
@@ -378,6 +385,108 @@ struct fw_eth_tx_pkt_wr {
 	__be64 r3;
 };
 
+struct fw_ofld_connection_wr {
+	__be32 op_compl;
+	__be32 len16_pkd;
+	__u64  cookie;
+	__be64 r2;
+	__be64 r3;
+	struct fw_ofld_connection_le {
+		__be32 version_cpl;
+		__be32 filter;
+		__be32 r1;
+		__be16 lport;
+		__be16 pport;
+		union fw_ofld_connection_leip {
+			struct fw_ofld_connection_le_ipv4 {
+				__be32 pip;
+				__be32 lip;
+				__be64 r0;
+				__be64 r1;
+				__be64 r2;
+			} ipv4;
+			struct fw_ofld_connection_le_ipv6 {
+				__be64 pip_hi;
+				__be64 pip_lo;
+				__be64 lip_hi;
+				__be64 lip_lo;
+			} ipv6;
+		} u;
+	} le;
+	struct fw_ofld_connection_tcb {
+		__be32 t_state_to_astid;
+		__be16 cplrxdataack_cplpassacceptrpl;
+		__be16 rcv_adv;
+		__be32 rcv_nxt;
+		__be32 tx_max;
+		__be64 opt0;
+		__be32 opt2;
+		__be32 r1;
+		__be64 r2;
+		__be64 r3;
+	} tcb;
+};
+
+#define S_FW_OFLD_CONNECTION_WR_VERSION                31
+#define M_FW_OFLD_CONNECTION_WR_VERSION                0x1
+#define V_FW_OFLD_CONNECTION_WR_VERSION(x)     \
+	((x) << S_FW_OFLD_CONNECTION_WR_VERSION)
+#define G_FW_OFLD_CONNECTION_WR_VERSION(x)     \
+	(((x) >> S_FW_OFLD_CONNECTION_WR_VERSION) & \
+	M_FW_OFLD_CONNECTION_WR_VERSION)
+#define F_FW_OFLD_CONNECTION_WR_VERSION        \
+	V_FW_OFLD_CONNECTION_WR_VERSION(1U)
+
+#define S_FW_OFLD_CONNECTION_WR_CPL    30
+#define M_FW_OFLD_CONNECTION_WR_CPL    0x1
+#define V_FW_OFLD_CONNECTION_WR_CPL(x) ((x) << S_FW_OFLD_CONNECTION_WR_CPL)
+#define G_FW_OFLD_CONNECTION_WR_CPL(x) \
+	(((x) >> S_FW_OFLD_CONNECTION_WR_CPL) & M_FW_OFLD_CONNECTION_WR_CPL)
+#define F_FW_OFLD_CONNECTION_WR_CPL    V_FW_OFLD_CONNECTION_WR_CPL(1U)
+
+#define S_FW_OFLD_CONNECTION_WR_T_STATE                28
+#define M_FW_OFLD_CONNECTION_WR_T_STATE                0xf
+#define V_FW_OFLD_CONNECTION_WR_T_STATE(x)     \
+	((x) << S_FW_OFLD_CONNECTION_WR_T_STATE)
+#define G_FW_OFLD_CONNECTION_WR_T_STATE(x)     \
+	(((x) >> S_FW_OFLD_CONNECTION_WR_T_STATE) & \
+	M_FW_OFLD_CONNECTION_WR_T_STATE)
+
+#define S_FW_OFLD_CONNECTION_WR_RCV_SCALE      24
+#define M_FW_OFLD_CONNECTION_WR_RCV_SCALE      0xf
+#define V_FW_OFLD_CONNECTION_WR_RCV_SCALE(x)   \
+	((x) << S_FW_OFLD_CONNECTION_WR_RCV_SCALE)
+#define G_FW_OFLD_CONNECTION_WR_RCV_SCALE(x)   \
+	(((x) >> S_FW_OFLD_CONNECTION_WR_RCV_SCALE) & \
+	M_FW_OFLD_CONNECTION_WR_RCV_SCALE)
+
+#define S_FW_OFLD_CONNECTION_WR_ASTID          0
+#define M_FW_OFLD_CONNECTION_WR_ASTID          0xffffff
+#define V_FW_OFLD_CONNECTION_WR_ASTID(x)       \
+	((x) << S_FW_OFLD_CONNECTION_WR_ASTID)
+#define G_FW_OFLD_CONNECTION_WR_ASTID(x)       \
+	(((x) >> S_FW_OFLD_CONNECTION_WR_ASTID) & M_FW_OFLD_CONNECTION_WR_ASTID)
+
+#define S_FW_OFLD_CONNECTION_WR_CPLRXDATAACK   15
+#define M_FW_OFLD_CONNECTION_WR_CPLRXDATAACK   0x1
+#define V_FW_OFLD_CONNECTION_WR_CPLRXDATAACK(x)        \
+	((x) << S_FW_OFLD_CONNECTION_WR_CPLRXDATAACK)
+#define G_FW_OFLD_CONNECTION_WR_CPLRXDATAACK(x)        \
+	(((x) >> S_FW_OFLD_CONNECTION_WR_CPLRXDATAACK) & \
+	M_FW_OFLD_CONNECTION_WR_CPLRXDATAACK)
+#define F_FW_OFLD_CONNECTION_WR_CPLRXDATAACK   \
+	V_FW_OFLD_CONNECTION_WR_CPLRXDATAACK(1U)
+
+#define S_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL       14
+#define M_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL       0x1
+#define V_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL(x)    \
+	((x) << S_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL)
+#define G_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL(x)    \
+	(((x) >> S_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL) & \
+	M_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL)
+#define F_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL       \
+	V_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL(1U)
+
 enum fw_flowc_mnem {
 	FW_FLOWC_MNEM_PFNVFN,		/* PFN [15:8] VFN [7:0] */
 	FW_FLOWC_MNEM_CH,
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH 2/5] cxgb4: Add LE hash collision bug fix path in LLD driver
From: Vipul Pandya @ 2012-11-29 14:52 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: roland-BHEL68pLQRGGvPXPguhicg, davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	divy-ut6Up61K2wZBDgjK7y7TUQ, dm-ut6Up61K2wZBDgjK7y7TUQ,
	kumaras-ut6Up61K2wZBDgjK7y7TUQ,
	swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW,
	abhishek-ut6Up61K2wZBDgjK7y7TUQ, Vipul Pandya
In-Reply-To: <1354200745-23598-1-git-send-email-vipul-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>

It supports establishing passive open connection through firmware filter work
request. Passive open connection will go through this path as now instead of
listening server we create a server filter which will redirect the incoming SYN
packet to the offload queue.

It divides filter region into regular filters and server filter portion. It
introduces new server filter region which will be exclusively used for creating
server filters. This region will not overlap with regular filter region.

It provides new API cxgb4_alloc_sftid in LLD for getting stid in case of LE
hash collision path. This new stid will be used to open server filter in the
filter region.

Signed-off-by: Vipul Pandya <vipul-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h      |    4 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c |  151 ++++++++++++++++++++++-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h  |   15 ++-
 drivers/net/ethernet/chelsio/cxgb4/t4_regs.h    |   33 +++++
 4 files changed, 195 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 8cfc1ba..b5d299d 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -35,6 +35,8 @@
 #ifndef __CXGB4_H__
 #define __CXGB4_H__
 
+#include "t4_hw.h"
+
 #include <linux/bitops.h>
 #include <linux/cache.h>
 #include <linux/interrupt.h>
@@ -212,6 +214,8 @@ struct tp_err_stats {
 struct tp_params {
 	unsigned int ntxchan;        /* # of Tx channels */
 	unsigned int tre;            /* log2 of core clocks per TP tick */
+	unsigned short tx_modq_map;  /* TX modulation scheduler queue to */
+				     /* channel map */
 
 	uint32_t dack_re;            /* DACK timer resolution */
 	unsigned short tx_modq[NCHAN];	/* channel to modulation queue map */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 4934eda..d0cd65b 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -2498,6 +2498,34 @@ int cxgb4_alloc_stid(struct tid_info *t, int family, void *data)
 EXPORT_SYMBOL(cxgb4_alloc_stid);
 
 /*
+ * Allocate a server filter TID and set it to the supplied value.
+ */
+int cxgb4_alloc_sftid(struct tid_info *t, int family, void *data)
+{
+	int stid;
+
+	spin_lock_bh(&t->stid_lock);
+	if (family == PF_INET) {
+		stid = find_next_zero_bit(t->stid_bmap,
+				t->nstids + t->nsftids, t->nstids);
+		if (stid < (t->nstids + t->nsftids))
+			__set_bit(stid, t->stid_bmap);
+		else
+			stid = -1;
+	} else {
+		stid = -1;
+	}
+	if (stid >= 0) {
+		t->stid_tab[stid].data = data;
+		stid += t->stid_base;
+		t->stids_in_use++;
+	}
+	spin_unlock_bh(&t->stid_lock);
+	return stid;
+}
+EXPORT_SYMBOL(cxgb4_alloc_sftid);
+
+/**
  * Release a server TID.
  */
 void cxgb4_free_stid(struct tid_info *t, unsigned int stid, int family)
@@ -2613,12 +2641,14 @@ static int tid_init(struct tid_info *t)
 	unsigned int stid_bmap_size;
 	unsigned int natids = t->natids;
 
-	stid_bmap_size = BITS_TO_LONGS(t->nstids);
+	stid_bmap_size = BITS_TO_LONGS(t->nstids + t->nsftids);
 	size = t->ntids * sizeof(*t->tid_tab) +
 	       natids * sizeof(*t->atid_tab) +
 	       t->nstids * sizeof(*t->stid_tab) +
+	       t->nsftids * sizeof(*t->stid_tab) +
 	       stid_bmap_size * sizeof(long) +
-	       t->nftids * sizeof(*t->ftid_tab);
+	       t->nftids * sizeof(*t->ftid_tab) +
+	       t->nsftids * sizeof(*t->ftid_tab);
 
 	t->tid_tab = t4_alloc_mem(size);
 	if (!t->tid_tab)
@@ -2626,7 +2656,7 @@ static int tid_init(struct tid_info *t)
 
 	t->atid_tab = (union aopen_entry *)&t->tid_tab[t->ntids];
 	t->stid_tab = (struct serv_entry *)&t->atid_tab[natids];
-	t->stid_bmap = (unsigned long *)&t->stid_tab[t->nstids];
+	t->stid_bmap = (unsigned long *)&t->stid_tab[t->nstids + t->nsftids];
 	t->ftid_tab = (struct filter_entry *)&t->stid_bmap[stid_bmap_size];
 	spin_lock_init(&t->stid_lock);
 	spin_lock_init(&t->atid_lock);
@@ -2642,7 +2672,7 @@ static int tid_init(struct tid_info *t)
 			t->atid_tab[natids - 1].next = &t->atid_tab[natids];
 		t->afree = t->atid_tab;
 	}
-	bitmap_zero(t->stid_bmap, t->nstids);
+	bitmap_zero(t->stid_bmap, t->nstids + t->nsftids);
 	return 0;
 }
 
@@ -3004,6 +3034,7 @@ static void uld_attach(struct adapter *adap, unsigned int uld)
 {
 	void *handle;
 	struct cxgb4_lld_info lli;
+	unsigned short i;
 
 	lli.pdev = adap->pdev;
 	lli.l2t = adap->l2t;
@@ -3030,10 +3061,16 @@ static void uld_attach(struct adapter *adap, unsigned int uld)
 	lli.ucq_density = 1 << QUEUESPERPAGEPF0_GET(
 			t4_read_reg(adap, SGE_INGRESS_QUEUES_PER_PAGE_PF) >>
 			(adap->fn * 4));
+	lli.filt_mode = tp_vlan_pri_map;
+	/* MODQ_REQ_MAP sets queues 0-3 to chan 0-3 */
+	for (i = 0; i < NCHAN; i++)
+		lli.tx_modq[i] = i;
 	lli.gts_reg = adap->regs + MYPF_REG(SGE_PF_GTS);
 	lli.db_reg = adap->regs + MYPF_REG(SGE_PF_KDOORBELL);
 	lli.fw_vers = adap->params.fw_vers;
 	lli.dbfifo_int_thresh = dbfifo_int_thresh;
+	lli.sge_pktshift = adap->sge.pktshift;
+	lli.enable_fw_ofld_conn = adap->flags & FW_OFLD_CONN;
 
 	handle = ulds[uld].add(&lli);
 	if (IS_ERR(handle)) {
@@ -3276,7 +3313,7 @@ static int delete_filter(struct adapter *adapter, unsigned int fidx)
 	struct filter_entry *f;
 	int ret;
 
-	if (fidx >= adapter->tids.nftids)
+	if (fidx >= adapter->tids.nftids + adapter->tids.nsftids)
 		return -EINVAL;
 
 	f = &adapter->tids.ftid_tab[fidx];
@@ -3289,6 +3326,79 @@ static int delete_filter(struct adapter *adapter, unsigned int fidx)
 	return 0;
 }
 
+int cxgb4_create_server_filter(const struct net_device *dev, unsigned int stid,
+		__be32 sip, __be16 sport, unsigned int queue)
+{
+	int ret;
+	struct filter_entry *f;
+	struct adapter *adap;
+	int i;
+	u8 *val;
+
+	adap = netdev2adap(dev);
+
+	/**
+	 * Check to make sure the filter requested is writable ...
+	 */
+	f = &adap->tids.ftid_tab[stid];
+	ret = writable_filter(f);
+	if (ret)
+		return ret;
+
+	/**
+	 * Clear out any old resources being used by the filter before
+	 * we start constructing the new filter.
+	 */
+	if (f->valid)
+		clear_filter(adap, f);
+
+	/* Clear out filter specifications */
+	memset(&f->fs, 0, sizeof(struct ch_filter_specification));
+	f->fs.val.lport = cpu_to_be16(sport);
+	f->fs.mask.lport  = ~0;
+	val = (u8 *)&sip;
+	if ((val[0] | val[1] | val[2] | val[3]) != 0)
+		for (i = 0; i < 4; i++) {
+			f->fs.val.lip[i] = val[i];
+			f->fs.mask.lip[i] = ~0;
+		}
+
+	f->fs.dirsteer = 1;
+	f->fs.iq = queue;
+	/* Mark filter as locked */
+	f->locked = 1;
+	f->fs.rpttid = 1;
+
+	ret = set_filter_wr(adap, stid);
+	if (ret) {
+		clear_filter(adap, f);
+		return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(cxgb4_create_server_filter);
+
+int cxgb4_remove_server_filter(const struct net_device *dev, unsigned int stid,
+		unsigned int queue, bool ipv6)
+{
+	int ret;
+	struct filter_entry *f;
+	struct adapter *adap;
+
+	adap = netdev2adap(dev);
+	f = &adap->tids.ftid_tab[stid];
+	/* Unlock the filter */
+	f->locked = 0;
+
+	ret = delete_filter(adap, stid);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+EXPORT_SYMBOL(cxgb4_remove_server_filter);
+
 static struct rtnl_link_stats64 *cxgb_get_stats(struct net_device *dev,
 						struct rtnl_link_stats64 *ns)
 {
@@ -3535,6 +3645,34 @@ static int adap_init1(struct adapter *adap, struct fw_caps_config_cmd *c)
 	v = t4_read_reg(adap, TP_PIO_DATA);
 	t4_write_reg(adap, TP_PIO_DATA, v & ~CSUM_HAS_PSEUDO_HDR);
 
+	/* first 4 Tx modulation queues point to consecutive Tx channels */
+	adap->params.tp.tx_modq_map = 0xE4;
+	t4_write_reg(adap, A_TP_TX_MOD_QUEUE_REQ_MAP,
+		     V_TX_MOD_QUEUE_REQ_MAP(adap->params.tp.tx_modq_map));
+
+	/* associate each Tx modulation queue with consecutive Tx channels */
+	v = 0x84218421;
+	t4_write_indirect(adap, TP_PIO_ADDR, TP_PIO_DATA,
+			  &v, 1, A_TP_TX_SCHED_HDR);
+	t4_write_indirect(adap, TP_PIO_ADDR, TP_PIO_DATA,
+			  &v, 1, A_TP_TX_SCHED_FIFO);
+	t4_write_indirect(adap, TP_PIO_ADDR, TP_PIO_DATA,
+			  &v, 1, A_TP_TX_SCHED_PCMD);
+
+#define T4_TX_MODQ_10G_WEIGHT_DEFAULT 16 /* in KB units */
+	if (is_offload(adap)) {
+		t4_write_reg(adap, A_TP_TX_MOD_QUEUE_WEIGHT0,
+			     V_TX_MODQ_WEIGHT0(T4_TX_MODQ_10G_WEIGHT_DEFAULT) |
+			     V_TX_MODQ_WEIGHT1(T4_TX_MODQ_10G_WEIGHT_DEFAULT) |
+			     V_TX_MODQ_WEIGHT2(T4_TX_MODQ_10G_WEIGHT_DEFAULT) |
+			     V_TX_MODQ_WEIGHT3(T4_TX_MODQ_10G_WEIGHT_DEFAULT));
+		t4_write_reg(adap, A_TP_TX_MOD_CHANNEL_WEIGHT,
+			     V_TX_MODQ_WEIGHT0(T4_TX_MODQ_10G_WEIGHT_DEFAULT) |
+			     V_TX_MODQ_WEIGHT1(T4_TX_MODQ_10G_WEIGHT_DEFAULT) |
+			     V_TX_MODQ_WEIGHT2(T4_TX_MODQ_10G_WEIGHT_DEFAULT) |
+			     V_TX_MODQ_WEIGHT3(T4_TX_MODQ_10G_WEIGHT_DEFAULT));
+	}
+
 	/* get basic stuff going */
 	return t4_early_init(adap, adap->fn);
 }
@@ -4958,7 +5096,8 @@ static void __devexit remove_one(struct pci_dev *pdev)
 		 */
 		if (adapter->tids.ftid_tab) {
 			struct filter_entry *f = &adapter->tids.ftid_tab[0];
-			for (i = 0; i < adapter->tids.nftids; i++, f++)
+			for (i = 0; i < (adapter->tids.nftids +
+					adapter->tids.nsftids); i++, f++)
 				if (f->valid)
 					clear_filter(adapter, f);
 		}
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
index 59a6133..065bbd5 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
@@ -131,7 +131,7 @@ static inline void *lookup_atid(const struct tid_info *t, unsigned int atid)
 static inline void *lookup_stid(const struct tid_info *t, unsigned int stid)
 {
 	stid -= t->stid_base;
-	return stid < t->nstids ? t->stid_tab[stid].data : NULL;
+	return stid < (t->nstids + t->nsftids) ? t->stid_tab[stid].data : NULL;
 }
 
 static inline void cxgb4_insert_tid(struct tid_info *t, void *data,
@@ -143,6 +143,7 @@ static inline void cxgb4_insert_tid(struct tid_info *t, void *data,
 
 int cxgb4_alloc_atid(struct tid_info *t, void *data);
 int cxgb4_alloc_stid(struct tid_info *t, int family, void *data);
+int cxgb4_alloc_sftid(struct tid_info *t, int family, void *data);
 void cxgb4_free_atid(struct tid_info *t, unsigned int atid);
 void cxgb4_free_stid(struct tid_info *t, unsigned int stid, int family);
 void cxgb4_remove_tid(struct tid_info *t, unsigned int qid, unsigned int tid);
@@ -151,7 +152,10 @@ struct in6_addr;
 
 int cxgb4_create_server(const struct net_device *dev, unsigned int stid,
 			__be32 sip, __be16 sport, unsigned int queue);
-
+int cxgb4_create_server_filter(const struct net_device *dev, unsigned int stid,
+			       __be32 sip, __be16 sport, unsigned int queue);
+int cxgb4_remove_server_filter(const struct net_device *dev, unsigned int stid,
+			       unsigned int queue, bool ipv6);
 static inline void set_wr_txq(struct sk_buff *skb, int prio, int queue)
 {
 	skb_set_queue_mapping(skb, (queue << 1) | prio);
@@ -223,9 +227,16 @@ struct cxgb4_lld_info {
 	unsigned int iscsi_iolen;            /* iSCSI max I/O length */
 	unsigned short udb_density;          /* # of user DB/page */
 	unsigned short ucq_density;          /* # of user CQs/page */
+	unsigned short filt_mode;            /* filter optional components */
+	unsigned short tx_modq[NCHAN];       /* maps each tx channel to a */
+					     /* scheduler queue */
 	void __iomem *gts_reg;               /* address of GTS register */
 	void __iomem *db_reg;                /* address of kernel doorbell */
 	int dbfifo_int_thresh;		     /* doorbell fifo int threshold */
+	unsigned int sge_pktshift;           /* Padding between CPL and */
+					     /*	packet data */
+	bool enable_fw_ofld_conn;            /* Enable connection through fw */
+					     /* WR */
 };
 
 struct cxgb4_uld_info {
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
index a1a8b57..66ab614 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
@@ -1032,4 +1032,37 @@
 #define  ADDRESS(x)     ((x) << ADDRESS_SHIFT)
 
 #define XGMAC_PORT_INT_CAUSE 0x10dc
+
+#define A_TP_TX_MOD_QUEUE_REQ_MAP 0x7e28
+
+#define A_TP_TX_MOD_CHANNEL_WEIGHT 0x7e34
+
+#define S_TX_MOD_QUEUE_REQ_MAP    0
+#define M_TX_MOD_QUEUE_REQ_MAP    0xffffU
+#define V_TX_MOD_QUEUE_REQ_MAP(x) ((x) << S_TX_MOD_QUEUE_REQ_MAP)
+
+#define A_TP_TX_MOD_QUEUE_WEIGHT0 0x7e30
+
+#define S_TX_MODQ_WEIGHT3    24
+#define M_TX_MODQ_WEIGHT3    0xffU
+#define V_TX_MODQ_WEIGHT3(x) ((x) << S_TX_MODQ_WEIGHT3)
+
+#define S_TX_MODQ_WEIGHT2    16
+#define M_TX_MODQ_WEIGHT2    0xffU
+#define V_TX_MODQ_WEIGHT2(x) ((x) << S_TX_MODQ_WEIGHT2)
+
+#define S_TX_MODQ_WEIGHT1    8
+#define M_TX_MODQ_WEIGHT1    0xffU
+#define V_TX_MODQ_WEIGHT1(x) ((x) << S_TX_MODQ_WEIGHT1)
+
+#define S_TX_MODQ_WEIGHT0    0
+#define M_TX_MODQ_WEIGHT0    0xffU
+#define V_TX_MODQ_WEIGHT0(x) ((x) << S_TX_MODQ_WEIGHT0)
+
+#define A_TP_TX_SCHED_HDR 0x23
+
+#define A_TP_TX_SCHED_FIFO 0x24
+
+#define A_TP_TX_SCHED_PCMD 0x25
+
 #endif /* __T4_REGS_H */
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH 1/5] cxgb4: Add T4 filter support
From: Vipul Pandya @ 2012-11-29 14:52 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: roland-BHEL68pLQRGGvPXPguhicg, davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	divy-ut6Up61K2wZBDgjK7y7TUQ, dm-ut6Up61K2wZBDgjK7y7TUQ,
	kumaras-ut6Up61K2wZBDgjK7y7TUQ,
	swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW,
	abhishek-ut6Up61K2wZBDgjK7y7TUQ, Vipul Pandya
In-Reply-To: <1354200745-23598-1-git-send-email-vipul-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>

The T4 architecture is capable of filtering ingress packets at line rate
using the rule in TCAM. If packet hits a rule in the TCAM then it can be either
dropped or passed to the receive queues based on a rule settings.

This patch adds framework for managing filters and to use T4's filter
capabilities. It constructs a Firmware Filter Work Request which writes the
filter at a specified index to get the work done. It hosts shadow copy of
ingress filter entry to check field size limitations and save memory in the
case where the filter table is large.

Signed-off-by: Vipul Pandya <vipul-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h      |  141 +++++++++++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c |  309 ++++++++++++++++++++++-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h  |    2 +
 drivers/net/ethernet/chelsio/cxgb4/l2t.c        |   34 +++
 drivers/net/ethernet/chelsio/cxgb4/l2t.h        |    3 +
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c      |   23 ++-
 drivers/net/ethernet/chelsio/cxgb4/t4_msg.h     |    1 +
 drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h   |  279 ++++++++++++++++++++
 8 files changed, 787 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 378988b..8cfc1ba 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -545,6 +545,139 @@ struct adapter {
 	spinlock_t stats_lock;
 };
 
+/**
+ * Defined bit width of user definable filter tuples
+ */
+#define ETHTYPE_BITWIDTH 16
+#define FRAG_BITWIDTH 1
+#define MACIDX_BITWIDTH 9
+#define FCOE_BITWIDTH 1
+#define IPORT_BITWIDTH 3
+#define MATCHTYPE_BITWIDTH 3
+#define PROTO_BITWIDTH 8
+#define TOS_BITWIDTH 8
+#define PF_BITWIDTH 8
+#define VF_BITWIDTH 8
+#define IVLAN_BITWIDTH 16
+#define OVLAN_BITWIDTH 16
+
+/**
+ * Filter matching rules.  These consist of a set of ingress packet field
+ * (value, mask) tuples.  The associated ingress packet field matches the
+ * tuple when ((field & mask) == value).  (Thus a wildcard "don't care" field
+ * rule can be constructed by specifying a tuple of (0, 0).)  A filter rule
+ * matches an ingress packet when all of the individual individual field
+ * matching rules are true.
+ *
+ * Partial field masks are always valid, however, while it may be easy to
+ * understand their meanings for some fields (e.g. IP address to match a
+ * subnet), for others making sensible partial masks is less intuitive (e.g.
+ * MPS match type) ...
+ *
+ * Most of the following data structures are modeled on T4 capabilities.
+ * Drivers for earlier chips use the subsets which make sense for those chips.
+ * We really need to come up with a hardware-independent mechanism to
+ * represent hardware filter capabilities ...
+ */
+struct ch_filter_tuple {
+	/**
+	 * Compressed header matching field rules.  The TP_VLAN_PRI_MAP
+	 * register selects which of these fields will participate in the
+	 * filter match rules -- up to a maximum of 36 bits.  Because
+	 * TP_VLAN_PRI_MAP is a global register, all filters must use the same
+	 * set of fields.
+	 */
+	uint32_t ethtype:ETHTYPE_BITWIDTH;      /* Ethernet type */
+	uint32_t frag:FRAG_BITWIDTH;            /* IP fragmentation header */
+	uint32_t ivlan_vld:1;                   /* inner VLAN valid */
+	uint32_t ovlan_vld:1;                   /* outer VLAN valid */
+	uint32_t pfvf_vld:1;                    /* PF/VF valid */
+	uint32_t macidx:MACIDX_BITWIDTH;        /* exact match MAC index */
+	uint32_t fcoe:FCOE_BITWIDTH;            /* FCoE packet */
+	uint32_t iport:IPORT_BITWIDTH;          /* ingress port */
+	uint32_t matchtype:MATCHTYPE_BITWIDTH;  /* MPS match type */
+	uint32_t proto:PROTO_BITWIDTH;          /* protocol type */
+	uint32_t tos:TOS_BITWIDTH;              /* TOS/Traffic Type */
+	uint32_t pf:PF_BITWIDTH;                /* PCI-E PF ID */
+	uint32_t vf:VF_BITWIDTH;                /* PCI-E VF ID */
+	uint32_t ivlan:IVLAN_BITWIDTH;          /* inner VLAN */
+	uint32_t ovlan:OVLAN_BITWIDTH;          /* outer VLAN */
+
+	/**
+	 * Uncompressed header matching field rules.  These are always
+	 * available for field rules.
+	 */
+	uint8_t lip[16];        /* local IP address (IPv4 in [3:0]) */
+	uint8_t fip[16];        /* foreign IP address (IPv4 in [3:0]) */
+	uint16_t lport;         /* local port */
+	uint16_t fport;         /* foreign port */
+};
+
+/**
+ * A filter ioctl command.
+ */
+struct ch_filter_specification {
+	/**
+	 * Administrative fields for filter.
+	 */
+	uint32_t hitcnts:1;     /* count filter hits in TCB */
+	uint32_t prio:1;        /* filter has priority over active/server */
+
+	/**
+	 * Fundamental filter typing.  This is the one element of filter
+	 * matching that doesn't exist as a (value, mask) tuple.
+	 */
+	uint32_t type:1;        /* 0 => IPv4, 1 => IPv6 */
+
+	/**
+	 * Packet dispatch information.  Ingress packets which match the
+	 * filter rules will be dropped, passed to the host or switched back
+	 * out as egress packets.
+	 */
+	uint32_t action:2;      /* drop, pass, switch */
+
+	uint32_t rpttid:1;      /* report TID in RSS hash field */
+
+	uint32_t dirsteer:1;    /* 0 => RSS, 1 => steer to iq */
+	uint32_t iq:10;         /* ingress queue */
+
+	uint32_t maskhash:1;    /* dirsteer=0: store RSS hash in TCB */
+	uint32_t dirsteerhash:1;/* dirsteer=1: 0 => TCB contains RSS hash */
+				/*             1 => TCB contains IQ ID */
+
+	/**
+	 * Switch proxy/rewrite fields.  An ingress packet which matches a
+	 * filter with "switch" set will be looped back out as an egress
+	 * packet -- potentially with some Ethernet header rewriting.
+	 */
+	uint32_t eport:2;       /* egress port to switch packet out */
+	uint32_t newdmac:1;     /* rewrite destination MAC address */
+	uint32_t newsmac:1;     /* rewrite source MAC address */
+	uint32_t newvlan:2;     /* rewrite VLAN Tag */
+	uint8_t dmac[ETH_ALEN]; /* new destination MAC address */
+	uint8_t smac[ETH_ALEN]; /* new source MAC address */
+	uint16_t vlan;          /* VLAN Tag to insert */
+
+	/**
+	 * Filter rule value/mask pairs.
+	 */
+	struct ch_filter_tuple val;
+	struct ch_filter_tuple mask;
+};
+
+enum {
+	FILTER_PASS = 0,        /* default */
+	FILTER_DROP,
+	FILTER_SWITCH
+};
+
+enum {
+	VLAN_NOCHANGE = 0,      /* default */
+	VLAN_REMOVE,
+	VLAN_INSERT,
+	VLAN_REWRITE
+};
+
 static inline u32 t4_read_reg(struct adapter *adap, u32 reg_addr)
 {
 	return readl(adap->regs + reg_addr);
@@ -701,6 +834,12 @@ static inline int t4_wr_mbox_ns(struct adapter *adap, int mbox, const void *cmd,
 void t4_write_indirect(struct adapter *adap, unsigned int addr_reg,
 		       unsigned int data_reg, const u32 *vals,
 		       unsigned int nregs, unsigned int start_idx);
+void t4_read_indirect(struct adapter *adap, unsigned int addr_reg,
+		      unsigned int data_reg, u32 *vals, unsigned int nregs,
+		      unsigned int start_idx);
+
+struct fw_filter_wr;
+
 void t4_intr_enable(struct adapter *adapter);
 void t4_intr_disable(struct adapter *adapter);
 int t4_slow_intr_handler(struct adapter *adapter);
@@ -737,6 +876,8 @@ void t4_tp_get_tcp_stats(struct adapter *adap, struct tp_tcp_stats *v4,
 void t4_load_mtus(struct adapter *adap, const unsigned short *mtus,
 		  const unsigned short *alpha, const unsigned short *beta);
 
+void t4_mk_filtdelwr(unsigned int ftid, struct fw_filter_wr *wr, int qid);
+
 void t4_wol_magic_enable(struct adapter *adap, unsigned int port,
 			 const u8 *addr);
 int t4_wol_pat_enable(struct adapter *adap, unsigned int port, unsigned int map,
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 0df1284..4934eda 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -175,6 +175,33 @@ enum {
 	MIN_FL_ENTRIES       = 16
 };
 
+/*
+ * Host shadow copy of ingress filter entry.  This is in host native format
+ * and doesn't match the ordering or bit order, etc. of the hardware of the
+ * firmware command.  The use of bit-field structure elements is purely to
+ * remind ourselves of the field size limitations and save memory in the case
+ * where the filter table is large.
+ */
+struct filter_entry {
+	/*
+	 * Administrative fields for filter.
+	 */
+	u32 valid:1;            /* filter allocated and valid */
+	u32 locked:1;           /* filter is administratively locked */
+
+	u32 pending:1;          /* filter action is pending firmware reply */
+	u32 smtidx:8;           /* Source MAC Table index for smac */
+	struct l2t_entry *l2t;  /* Layer Two Table entry for dmac */
+
+	/**
+	 * The filter itself.  Most of this is a straight copy of information
+	 * provided by the extended ioctl().  Some fields are translated to
+	 * internal forms -- for instance the Ingress Queue ID passed in from
+	 * the ioctl() is translated into the Absolute Ingress Queue ID.
+	 */
+	struct ch_filter_specification fs;
+};
+
 #define DFLT_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK | \
 			 NETIF_MSG_TIMER | NETIF_MSG_IFDOWN | NETIF_MSG_IFUP |\
 			 NETIF_MSG_RX_ERR | NETIF_MSG_TX_ERR)
@@ -325,6 +352,9 @@ enum {
 
 static unsigned int tp_vlan_pri_map = TP_VLAN_PRI_MAP_DEFAULT;
 
+module_param(tp_vlan_pri_map, uint, 0644);
+MODULE_PARM_DESC(tp_vlan_pri_map, "global compressed filter configuration");
+
 static struct dentry *cxgb4_debugfs_root;
 
 static LIST_HEAD(adapter_list);
@@ -507,6 +537,72 @@ static int link_start(struct net_device *dev)
 }
 
 /*
+ * Clear a filter and release any of its resources that we own.  This also
+ * clears the filter's "pending" status.
+ */
+static void clear_filter(struct adapter *adap, struct filter_entry *f)
+{
+	/*
+	 * If the new or old filter have loopback rewriteing rules then we'll
+	 * need to free any existing Layer Two Table (L2T) entries of the old
+	 * filter rule.  The firmware will handle freeing up any Source MAC
+	 * Table (SMT) entries used for rewriting Source MAC Addresses in
+	 * loopback rules.
+	 */
+	if (f->l2t)
+		cxgb4_l2t_release(f->l2t);
+
+	/**
+	 * The zeroing of the filter rule below clears the filter valid,
+	 * pending, locked flags, l2t pointer, etc. so it's all we need for
+	 * this operation.
+	 */
+	memset(f, 0, sizeof(*f));
+}
+
+/**
+ * Handle a filter write/deletion reply.
+ */
+static void filter_rpl(struct adapter *adap, const struct cpl_set_tcb_rpl *rpl)
+{
+	unsigned int idx = GET_TID(rpl);
+	unsigned int nidx = idx - adap->tids.ftid_base;
+	unsigned int ret;
+	struct filter_entry *f;
+
+	if (idx >= adap->tids.ftid_base && nidx <
+	   (adap->tids.nftids + adap->tids.nsftids)) {
+		idx = nidx;
+		ret = GET_TCB_COOKIE(rpl->cookie);
+		f = &adap->tids.ftid_tab[idx];
+
+		if (ret == FW_FILTER_WR_FLT_DELETED) {
+			/**
+			 * Clear the filter when we get confirmation from the
+			 * hardware that the filter has been deleted.
+			 */
+			clear_filter(adap, f);
+		} else if (ret == FW_FILTER_WR_SMT_TBL_FULL) {
+			dev_err(adap->pdev_dev, "filter %u setup failed due to full SMT\n",
+				idx);
+			clear_filter(adap, f);
+		} else if (ret == FW_FILTER_WR_FLT_ADDED) {
+			f->smtidx = (be64_to_cpu(rpl->oldval) >> 24) & 0xff;
+			f->pending = 0;  /* asynchronous setup completed */
+			f->valid = 1;
+		} else {
+			/**
+			 * Something went wrong.  Issue a warning about the
+			 * problem and clear everything out.
+			 */
+			dev_err(adap->pdev_dev, "filter %u setup failed with error %u\n",
+				idx, ret);
+			clear_filter(adap, f);
+		}
+	}
+}
+
+/**
  * Response queue handler for the FW event queue.
  */
 static int fwevtq_handler(struct sge_rspq *q, const __be64 *rsp,
@@ -542,6 +638,10 @@ static int fwevtq_handler(struct sge_rspq *q, const __be64 *rsp,
 		const struct cpl_l2t_write_rpl *p = (void *)rsp;
 
 		do_l2t_write_rpl(q->adap, p);
+	} else if (opcode == CPL_SET_TCB_RPL) {
+		const struct cpl_set_tcb_rpl *p = (void *)rsp;
+
+		filter_rpl(q->adap, p);
 	} else
 		dev_err(q->adap->pdev_dev,
 			"unexpected CPL %#x on FW event queue\n", opcode);
@@ -983,6 +1083,154 @@ static void t4_free_mem(void *addr)
 		kfree(addr);
 }
 
+/*
+ * Send a Work Request to write the filter at a specified index.  We construct
+ * a Firmware Filter Work Request to have the work done and put the indicated
+ * filter into "pending" mode which will prevent any further actions against
+ * it till we get a reply from the firmware on the completion status of the
+ * request.
+ */
+static int set_filter_wr(struct adapter *adapter, int fidx)
+{
+	struct filter_entry *f = &adapter->tids.ftid_tab[fidx];
+	struct sk_buff *skb;
+	struct fw_filter_wr *fwr;
+	unsigned int ftid;
+
+	/**
+	 * If the new filter requires loopback Destination MAC and/or VLAN
+	 * rewriting then we need to allocate a Layer 2 Table (L2T) entry for
+	 * the filter.
+	 */
+	if (f->fs.newdmac || f->fs.newvlan) {
+		/* allocate L2T entry for new filter */
+		f->l2t = t4_l2t_alloc_switching(adapter->l2t);
+		if (f->l2t == NULL)
+			return -EAGAIN;
+		if (t4_l2t_set_switching(adapter, f->l2t, f->fs.vlan,
+					f->fs.eport, f->fs.dmac)) {
+			cxgb4_l2t_release(f->l2t);
+			f->l2t = NULL;
+			return -ENOMEM;
+		}
+	}
+
+	ftid = adapter->tids.ftid_base + fidx;
+
+	skb = alloc_skb(sizeof(*fwr), GFP_KERNEL | __GFP_NOFAIL);
+	fwr = (struct fw_filter_wr *)__skb_put(skb, sizeof(*fwr));
+	memset(fwr, 0, sizeof(*fwr));
+
+	/**
+	 * It would be nice to put most of the following in t4_hw.c but most
+	 * of the work is translating the cxgbtool ch_filter_specification
+	 * into the Work Request and the definition of that structure is
+	 * currently in cxgbtool.h which isn't appropriate to pull into the
+	 * common code.  We may eventually try to come up with a more neutral
+	 * filter specification structure but for now it's easiest to simply
+	 * put this fairly direct code in line ...
+	 */
+	fwr->op_pkd = htonl(FW_WR_OP(FW_FILTER_WR));
+	fwr->len16_pkd = htonl(FW_WR_LEN16(sizeof(*fwr)/16));
+	fwr->tid_to_iq =
+		htonl(V_FW_FILTER_WR_TID(ftid) |
+		      V_FW_FILTER_WR_RQTYPE(f->fs.type) |
+		      V_FW_FILTER_WR_NOREPLY(0) |
+		      V_FW_FILTER_WR_IQ(f->fs.iq));
+	fwr->del_filter_to_l2tix =
+		htonl(V_FW_FILTER_WR_RPTTID(f->fs.rpttid) |
+		      V_FW_FILTER_WR_DROP(f->fs.action == FILTER_DROP) |
+		      V_FW_FILTER_WR_DIRSTEER(f->fs.dirsteer) |
+		      V_FW_FILTER_WR_MASKHASH(f->fs.maskhash) |
+		      V_FW_FILTER_WR_DIRSTEERHASH(f->fs.dirsteerhash) |
+		      V_FW_FILTER_WR_LPBK(f->fs.action == FILTER_SWITCH) |
+		      V_FW_FILTER_WR_DMAC(f->fs.newdmac) |
+		      V_FW_FILTER_WR_SMAC(f->fs.newsmac) |
+		      V_FW_FILTER_WR_INSVLAN(f->fs.newvlan == VLAN_INSERT ||
+					     f->fs.newvlan == VLAN_REWRITE) |
+		      V_FW_FILTER_WR_RMVLAN(f->fs.newvlan == VLAN_REMOVE ||
+					    f->fs.newvlan == VLAN_REWRITE) |
+		      V_FW_FILTER_WR_HITCNTS(f->fs.hitcnts) |
+		      V_FW_FILTER_WR_TXCHAN(f->fs.eport) |
+		      V_FW_FILTER_WR_PRIO(f->fs.prio) |
+		      V_FW_FILTER_WR_L2TIX(f->l2t ? f->l2t->idx : 0));
+	fwr->ethtype = htons(f->fs.val.ethtype);
+	fwr->ethtypem = htons(f->fs.mask.ethtype);
+	fwr->frag_to_ovlan_vldm =
+		(V_FW_FILTER_WR_FRAG(f->fs.val.frag) |
+		 V_FW_FILTER_WR_FRAGM(f->fs.mask.frag) |
+		 V_FW_FILTER_WR_IVLAN_VLD(f->fs.val.ivlan_vld) |
+		 V_FW_FILTER_WR_OVLAN_VLD(f->fs.val.ovlan_vld) |
+		 V_FW_FILTER_WR_IVLAN_VLDM(f->fs.mask.ivlan_vld) |
+		 V_FW_FILTER_WR_OVLAN_VLDM(f->fs.mask.ovlan_vld));
+	fwr->smac_sel = 0;
+	fwr->rx_chan_rx_rpl_iq =
+		htons(V_FW_FILTER_WR_RX_CHAN(0) |
+		      V_FW_FILTER_WR_RX_RPL_IQ(adapter->sge.fw_evtq.abs_id));
+	fwr->maci_to_matchtypem =
+		htonl(V_FW_FILTER_WR_MACI(f->fs.val.macidx) |
+		      V_FW_FILTER_WR_MACIM(f->fs.mask.macidx) |
+		      V_FW_FILTER_WR_FCOE(f->fs.val.fcoe) |
+		      V_FW_FILTER_WR_FCOEM(f->fs.mask.fcoe) |
+		      V_FW_FILTER_WR_PORT(f->fs.val.iport) |
+		      V_FW_FILTER_WR_PORTM(f->fs.mask.iport) |
+		      V_FW_FILTER_WR_MATCHTYPE(f->fs.val.matchtype) |
+		      V_FW_FILTER_WR_MATCHTYPEM(f->fs.mask.matchtype));
+	fwr->ptcl = f->fs.val.proto;
+	fwr->ptclm = f->fs.mask.proto;
+	fwr->ttyp = f->fs.val.tos;
+	fwr->ttypm = f->fs.mask.tos;
+	fwr->ivlan = htons(f->fs.val.ivlan);
+	fwr->ivlanm = htons(f->fs.mask.ivlan);
+	fwr->ovlan = htons(f->fs.val.ovlan);
+	fwr->ovlanm = htons(f->fs.mask.ovlan);
+	memcpy(fwr->lip, f->fs.val.lip, sizeof(fwr->lip));
+	memcpy(fwr->lipm, f->fs.mask.lip, sizeof(fwr->lipm));
+	memcpy(fwr->fip, f->fs.val.fip, sizeof(fwr->fip));
+	memcpy(fwr->fipm, f->fs.mask.fip, sizeof(fwr->fipm));
+	fwr->lp = htons(f->fs.val.lport);
+	fwr->lpm = htons(f->fs.mask.lport);
+	fwr->fp = htons(f->fs.val.fport);
+	fwr->fpm = htons(f->fs.mask.fport);
+	if (f->fs.newsmac)
+		memcpy(fwr->sma, f->fs.smac, sizeof(fwr->sma));
+
+	/**
+	 * Mark the filter as "pending" and ship off the Filter Work Request.
+	 * When we get the Work Request Reply we'll clear the pending status.
+	 */
+	f->pending = 1;
+	set_wr_txq(skb, CPL_PRIORITY_CONTROL, f->fs.val.iport & 0x3);
+	t4_ofld_send(adapter, skb);
+	return 0;
+}
+
+/**
+ * Delete the filter at a specified index.
+ */
+static int del_filter_wr(struct adapter *adapter, int fidx)
+{
+	struct filter_entry *f = &adapter->tids.ftid_tab[fidx];
+	struct sk_buff *skb;
+	struct fw_filter_wr *fwr;
+	unsigned int len, ftid;
+
+	len = sizeof(*fwr);
+	ftid = adapter->tids.ftid_base + fidx;
+
+	skb = alloc_skb(len, GFP_KERNEL | __GFP_NOFAIL);
+	fwr = (struct fw_filter_wr *)__skb_put(skb, len);
+	t4_mk_filtdelwr(ftid, fwr, adapter->sge.fw_evtq.abs_id);
+
+	/**
+	 * Mark the filter as "pending" and ship off the Filter Work Request.
+	 * When we get the Work Request Reply we'll clear the pending status.
+	 */
+	f->pending = 1;
+	t4_mgmt_tx(adapter, skb);
+	return 0;
+}
+
 static inline int is_offload(const struct adapter *adap)
 {
 	return adap->params.offload;
@@ -2195,7 +2443,7 @@ int cxgb4_alloc_atid(struct tid_info *t, void *data)
 	if (t->afree) {
 		union aopen_entry *p = t->afree;
 
-		atid = p - t->atid_tab;
+		atid = (p - t->atid_tab) + t->atid_base;
 		t->afree = p->next;
 		p->data = data;
 		t->atids_in_use++;
@@ -2210,7 +2458,7 @@ EXPORT_SYMBOL(cxgb4_alloc_atid);
  */
 void cxgb4_free_atid(struct tid_info *t, unsigned int atid)
 {
-	union aopen_entry *p = &t->atid_tab[atid];
+	union aopen_entry *p = &t->atid_tab[atid - t->atid_base];
 
 	spin_lock_bh(&t->atid_lock);
 	p->next = t->afree;
@@ -2362,11 +2610,16 @@ EXPORT_SYMBOL(cxgb4_remove_tid);
 static int tid_init(struct tid_info *t)
 {
 	size_t size;
+	unsigned int stid_bmap_size;
 	unsigned int natids = t->natids;
 
-	size = t->ntids * sizeof(*t->tid_tab) + natids * sizeof(*t->atid_tab) +
+	stid_bmap_size = BITS_TO_LONGS(t->nstids);
+	size = t->ntids * sizeof(*t->tid_tab) +
+	       natids * sizeof(*t->atid_tab) +
 	       t->nstids * sizeof(*t->stid_tab) +
-	       BITS_TO_LONGS(t->nstids) * sizeof(long);
+	       stid_bmap_size * sizeof(long) +
+	       t->nftids * sizeof(*t->ftid_tab);
+
 	t->tid_tab = t4_alloc_mem(size);
 	if (!t->tid_tab)
 		return -ENOMEM;
@@ -2374,6 +2627,7 @@ static int tid_init(struct tid_info *t)
 	t->atid_tab = (union aopen_entry *)&t->tid_tab[t->ntids];
 	t->stid_tab = (struct serv_entry *)&t->atid_tab[natids];
 	t->stid_bmap = (unsigned long *)&t->stid_tab[t->nstids];
+	t->ftid_tab = (struct filter_entry *)&t->stid_bmap[stid_bmap_size];
 	spin_lock_init(&t->stid_lock);
 	spin_lock_init(&t->atid_lock);
 
@@ -2999,6 +3253,42 @@ static int cxgb_close(struct net_device *dev)
 	return t4_enable_vi(adapter, adapter->fn, pi->viid, false, false);
 }
 
+/*
+ * Return an error number if the indicated filter isn't writable ...
+ */
+static int writable_filter(struct filter_entry *f)
+{
+	if (f->locked)
+		return -EPERM;
+	if (f->pending)
+		return -EBUSY;
+
+	return 0;
+}
+
+/**
+ * Delete the filter at the specified index (if valid).  The checks for all
+ * the common problems with doing this like the filter being locked, currently
+ * pending in another operation, etc.
+ */
+static int delete_filter(struct adapter *adapter, unsigned int fidx)
+{
+	struct filter_entry *f;
+	int ret;
+
+	if (fidx >= adapter->tids.nftids)
+		return -EINVAL;
+
+	f = &adapter->tids.ftid_tab[fidx];
+	ret = writable_filter(f);
+	if (ret)
+		return ret;
+	if (f->valid)
+		return del_filter_wr(adapter, fidx);
+
+	return 0;
+}
+
 static struct rtnl_link_stats64 *cxgb_get_stats(struct net_device *dev,
 						struct rtnl_link_stats64 *ns)
 {
@@ -4662,6 +4952,17 @@ static void __devexit remove_one(struct pci_dev *pdev)
 		if (adapter->debugfs_root)
 			debugfs_remove_recursive(adapter->debugfs_root);
 
+		/*
+		 * If we allocated filters, free up state associated with any
+		 * valid filters ...
+		 */
+		if (adapter->tids.ftid_tab) {
+			struct filter_entry *f = &adapter->tids.ftid_tab[0];
+			for (i = 0; i < adapter->tids.nftids; i++, f++)
+				if (f->valid)
+					clear_filter(adapter, f);
+		}
+
 		if (adapter->flags & FULL_INIT_DONE)
 			cxgb_down(adapter);
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
index 39bec73..59a6133 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
@@ -97,7 +97,9 @@ struct tid_info {
 
 	union aopen_entry *atid_tab;
 	unsigned int natids;
+	unsigned int atid_base;
 
+	struct filter_entry *ftid_tab;
 	unsigned int nftids;
 	unsigned int ftid_base;
 	unsigned int aftid_base;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/l2t.c b/drivers/net/ethernet/chelsio/cxgb4/l2t.c
index 6ac77a6..abd4180 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/l2t.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/l2t.c
@@ -484,6 +484,40 @@ void t4_l2t_update(struct adapter *adap, struct neighbour *neigh)
 		handle_failed_resolution(adap, arpq);
 }
 
+/*
+ * Allocate an L2T entry for use by a switching rule.  Such need to be
+ * explicitly freed and while busy they are not on any hash chain, so normal
+ * address resolution updates do not see them.
+ */
+struct l2t_entry *t4_l2t_alloc_switching(struct l2t_data *d)
+{
+	struct l2t_entry *e;
+
+	write_lock_bh(&d->lock);
+	e = alloc_l2e(d);
+	if (e) {
+		spin_lock(&e->lock);          /* avoid race with t4_l2t_free */
+		e->state = L2T_STATE_SWITCHING;
+		atomic_set(&e->refcnt, 1);
+		spin_unlock(&e->lock);
+	}
+	write_unlock_bh(&d->lock);
+	return e;
+}
+
+/**
+ * Sets/updates the contents of a switching L2T entry that has been allocated
+ * with an earlier call to @t4_l2t_alloc_switching.
+ */
+int t4_l2t_set_switching(struct adapter *adap, struct l2t_entry *e, u16 vlan,
+		u8 port, u8 *eth_addr)
+{
+	e->vlan = vlan;
+	e->lport = port;
+	memcpy(e->dmac, eth_addr, ETH_ALEN);
+	return write_l2e(adap, e, 0);
+}
+
 struct l2t_data *t4_init_l2t(void)
 {
 	int i;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/l2t.h b/drivers/net/ethernet/chelsio/cxgb4/l2t.h
index 02b31d0..108c0f1 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/l2t.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/l2t.h
@@ -100,6 +100,9 @@ struct l2t_entry *cxgb4_l2t_get(struct l2t_data *d, struct neighbour *neigh,
 				unsigned int priority);
 
 void t4_l2t_update(struct adapter *adap, struct neighbour *neigh);
+struct l2t_entry *t4_l2t_alloc_switching(struct l2t_data *d);
+int t4_l2t_set_switching(struct adapter *adap, struct l2t_entry *e, u16 vlan,
+			 u8 port, u8 *eth_addr);
 struct l2t_data *t4_init_l2t(void);
 void do_l2t_write_rpl(struct adapter *p, const struct cpl_l2t_write_rpl *rpl);
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index 730ae2c..7638181 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -109,7 +109,7 @@ void t4_set_reg_field(struct adapter *adapter, unsigned int addr, u32 mask,
  *	Reads registers that are accessed indirectly through an address/data
  *	register pair.
  */
-static void t4_read_indirect(struct adapter *adap, unsigned int addr_reg,
+void t4_read_indirect(struct adapter *adap, unsigned int addr_reg,
 			     unsigned int data_reg, u32 *vals,
 			     unsigned int nregs, unsigned int start_idx)
 {
@@ -2268,6 +2268,27 @@ int t4_wol_pat_enable(struct adapter *adap, unsigned int port, unsigned int map,
 	return 0;
 }
 
+/*
+ *     t4_mk_filtdelwr - create a delete filter WR
+ *     @ftid: the filter ID
+ *     @wr: the filter work request to populate
+ *     @qid: ingress queue to receive the delete notification
+ *
+ *     Creates a filter work request to delete the supplied filter.  If @qid is
+ *     negative the delete notification is suppressed.
+ */
+void t4_mk_filtdelwr(unsigned int ftid, struct fw_filter_wr *wr, int qid)
+{
+	memset(wr, 0, sizeof(*wr));
+	wr->op_pkd = htonl(FW_WR_OP(FW_FILTER_WR));
+	wr->len16_pkd = htonl(FW_WR_LEN16(sizeof(*wr) / 16));
+	wr->tid_to_iq = htonl(V_FW_FILTER_WR_TID(ftid) |
+			V_FW_FILTER_WR_NOREPLY(qid < 0));
+	wr->del_filter_to_l2tix = htonl(F_FW_FILTER_WR_DEL_FILTER);
+	if (qid >= 0)
+		wr->rx_chan_rx_rpl_iq = htons(V_FW_FILTER_WR_RX_RPL_IQ(qid));
+}
+
 #define INIT_CMD(var, cmd, rd_wr) do { \
 	(var).op_to_write = htonl(FW_CMD_OP(FW_##cmd##_CMD) | \
 				  FW_CMD_REQUEST | FW_CMD_##rd_wr); \
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
index eb71b82..480eb43 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
@@ -332,6 +332,7 @@ struct cpl_set_tcb_field {
 	__be16 word_cookie;
 #define TCB_WORD(x)   ((x) << 0)
 #define TCB_COOKIE(x) ((x) << 5)
+#define GET_TCB_COOKIE(x) (((x) >> 5) & 7)
 	__be64 mask;
 	__be64 val;
 };
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
index a636463..b4dc560 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
@@ -35,6 +35,10 @@
 #ifndef _T4FW_INTERFACE_H_
 #define _T4FW_INTERFACE_H_
 
+enum fw_ret_val {
+	FW_ENOEXEC      = 8,            /* Exec format error; inv microcode */
+};
+
 #define FW_T4VF_SGE_BASE_ADDR      0x0000
 #define FW_T4VF_MPS_BASE_ADDR      0x0100
 #define FW_T4VF_PL_BASE_ADDR       0x0200
@@ -81,6 +85,281 @@ struct fw_wr_hdr {
 
 #define HW_TPL_FR_MT_PR_IV_P_FC         0X32B
 
+/* filter wr reply code in cookie in CPL_SET_TCB_RPL */
+enum fw_filter_wr_cookie {
+	FW_FILTER_WR_SUCCESS,
+	FW_FILTER_WR_FLT_ADDED,
+	FW_FILTER_WR_FLT_DELETED,
+	FW_FILTER_WR_SMT_TBL_FULL,
+	FW_FILTER_WR_EINVAL,
+};
+
+struct fw_filter_wr {
+	__be32 op_pkd;
+	__be32 len16_pkd;
+	__be64 r3;
+	__be32 tid_to_iq;
+	__be32 del_filter_to_l2tix;
+	__be16 ethtype;
+	__be16 ethtypem;
+	__u8   frag_to_ovlan_vldm;
+	__u8   smac_sel;
+	__be16 rx_chan_rx_rpl_iq;
+	__be32 maci_to_matchtypem;
+	__u8   ptcl;
+	__u8   ptclm;
+	__u8   ttyp;
+	__u8   ttypm;
+	__be16 ivlan;
+	__be16 ivlanm;
+	__be16 ovlan;
+	__be16 ovlanm;
+	__u8   lip[16];
+	__u8   lipm[16];
+	__u8   fip[16];
+	__u8   fipm[16];
+	__be16 lp;
+	__be16 lpm;
+	__be16 fp;
+	__be16 fpm;
+	__be16 r7;
+	__u8   sma[6];
+};
+
+#define S_FW_FILTER_WR_TID      12
+#define M_FW_FILTER_WR_TID      0xfffff
+#define V_FW_FILTER_WR_TID(x)   ((x) << S_FW_FILTER_WR_TID)
+#define G_FW_FILTER_WR_TID(x)   \
+	(((x) >> S_FW_FILTER_WR_TID) & M_FW_FILTER_WR_TID)
+
+#define S_FW_FILTER_WR_RQTYPE           11
+#define M_FW_FILTER_WR_RQTYPE           0x1
+#define V_FW_FILTER_WR_RQTYPE(x)        ((x) << S_FW_FILTER_WR_RQTYPE)
+#define G_FW_FILTER_WR_RQTYPE(x)        \
+	(((x) >> S_FW_FILTER_WR_RQTYPE) & M_FW_FILTER_WR_RQTYPE)
+#define F_FW_FILTER_WR_RQTYPE   V_FW_FILTER_WR_RQTYPE(1U)
+
+#define S_FW_FILTER_WR_NOREPLY          10
+#define M_FW_FILTER_WR_NOREPLY          0x1
+#define V_FW_FILTER_WR_NOREPLY(x)       ((x) << S_FW_FILTER_WR_NOREPLY)
+#define G_FW_FILTER_WR_NOREPLY(x)       \
+	(((x) >> S_FW_FILTER_WR_NOREPLY) & M_FW_FILTER_WR_NOREPLY)
+#define F_FW_FILTER_WR_NOREPLY  V_FW_FILTER_WR_NOREPLY(1U)
+
+#define S_FW_FILTER_WR_IQ       0
+#define M_FW_FILTER_WR_IQ       0x3ff
+#define V_FW_FILTER_WR_IQ(x)    ((x) << S_FW_FILTER_WR_IQ)
+#define G_FW_FILTER_WR_IQ(x)    \
+	(((x) >> S_FW_FILTER_WR_IQ) & M_FW_FILTER_WR_IQ)
+
+#define S_FW_FILTER_WR_DEL_FILTER       31
+#define M_FW_FILTER_WR_DEL_FILTER       0x1
+#define V_FW_FILTER_WR_DEL_FILTER(x)    ((x) << S_FW_FILTER_WR_DEL_FILTER)
+#define G_FW_FILTER_WR_DEL_FILTER(x)    \
+	(((x) >> S_FW_FILTER_WR_DEL_FILTER) & M_FW_FILTER_WR_DEL_FILTER)
+#define F_FW_FILTER_WR_DEL_FILTER       V_FW_FILTER_WR_DEL_FILTER(1U)
+
+#define S_FW_FILTER_WR_RPTTID           25
+#define M_FW_FILTER_WR_RPTTID           0x1
+#define V_FW_FILTER_WR_RPTTID(x)        ((x) << S_FW_FILTER_WR_RPTTID)
+#define G_FW_FILTER_WR_RPTTID(x)        \
+	(((x) >> S_FW_FILTER_WR_RPTTID) & M_FW_FILTER_WR_RPTTID)
+#define F_FW_FILTER_WR_RPTTID   V_FW_FILTER_WR_RPTTID(1U)
+
+#define S_FW_FILTER_WR_DROP     24
+#define M_FW_FILTER_WR_DROP     0x1
+#define V_FW_FILTER_WR_DROP(x)  ((x) << S_FW_FILTER_WR_DROP)
+#define G_FW_FILTER_WR_DROP(x)  \
+	(((x) >> S_FW_FILTER_WR_DROP) & M_FW_FILTER_WR_DROP)
+#define F_FW_FILTER_WR_DROP     V_FW_FILTER_WR_DROP(1U)
+
+#define S_FW_FILTER_WR_DIRSTEER         23
+#define M_FW_FILTER_WR_DIRSTEER         0x1
+#define V_FW_FILTER_WR_DIRSTEER(x)      ((x) << S_FW_FILTER_WR_DIRSTEER)
+#define G_FW_FILTER_WR_DIRSTEER(x)      \
+	(((x) >> S_FW_FILTER_WR_DIRSTEER) & M_FW_FILTER_WR_DIRSTEER)
+#define F_FW_FILTER_WR_DIRSTEER V_FW_FILTER_WR_DIRSTEER(1U)
+
+#define S_FW_FILTER_WR_MASKHASH         22
+#define M_FW_FILTER_WR_MASKHASH         0x1
+#define V_FW_FILTER_WR_MASKHASH(x)      ((x) << S_FW_FILTER_WR_MASKHASH)
+#define G_FW_FILTER_WR_MASKHASH(x)      \
+	(((x) >> S_FW_FILTER_WR_MASKHASH) & M_FW_FILTER_WR_MASKHASH)
+#define F_FW_FILTER_WR_MASKHASH V_FW_FILTER_WR_MASKHASH(1U)
+
+#define S_FW_FILTER_WR_DIRSTEERHASH     21
+#define M_FW_FILTER_WR_DIRSTEERHASH     0x1
+#define V_FW_FILTER_WR_DIRSTEERHASH(x)  ((x) << S_FW_FILTER_WR_DIRSTEERHASH)
+#define G_FW_FILTER_WR_DIRSTEERHASH(x)  \
+	(((x) >> S_FW_FILTER_WR_DIRSTEERHASH) & M_FW_FILTER_WR_DIRSTEERHASH)
+#define F_FW_FILTER_WR_DIRSTEERHASH     V_FW_FILTER_WR_DIRSTEERHASH(1U)
+
+#define S_FW_FILTER_WR_LPBK     20
+#define M_FW_FILTER_WR_LPBK     0x1
+#define V_FW_FILTER_WR_LPBK(x)  ((x) << S_FW_FILTER_WR_LPBK)
+#define G_FW_FILTER_WR_LPBK(x)  \
+	(((x) >> S_FW_FILTER_WR_LPBK) & M_FW_FILTER_WR_LPBK)
+#define F_FW_FILTER_WR_LPBK     V_FW_FILTER_WR_LPBK(1U)
+
+#define S_FW_FILTER_WR_DMAC     19
+#define M_FW_FILTER_WR_DMAC     0x1
+#define V_FW_FILTER_WR_DMAC(x)  ((x) << S_FW_FILTER_WR_DMAC)
+#define G_FW_FILTER_WR_DMAC(x)  \
+	(((x) >> S_FW_FILTER_WR_DMAC) & M_FW_FILTER_WR_DMAC)
+#define F_FW_FILTER_WR_DMAC     V_FW_FILTER_WR_DMAC(1U)
+
+#define S_FW_FILTER_WR_SMAC     18
+#define M_FW_FILTER_WR_SMAC     0x1
+#define V_FW_FILTER_WR_SMAC(x)  ((x) << S_FW_FILTER_WR_SMAC)
+#define G_FW_FILTER_WR_SMAC(x)  \
+	(((x) >> S_FW_FILTER_WR_SMAC) & M_FW_FILTER_WR_SMAC)
+#define F_FW_FILTER_WR_SMAC     V_FW_FILTER_WR_SMAC(1U)
+
+#define S_FW_FILTER_WR_INSVLAN          17
+#define M_FW_FILTER_WR_INSVLAN          0x1
+#define V_FW_FILTER_WR_INSVLAN(x)       ((x) << S_FW_FILTER_WR_INSVLAN)
+#define G_FW_FILTER_WR_INSVLAN(x)       \
+	(((x) >> S_FW_FILTER_WR_INSVLAN) & M_FW_FILTER_WR_INSVLAN)
+#define F_FW_FILTER_WR_INSVLAN  V_FW_FILTER_WR_INSVLAN(1U)
+
+#define S_FW_FILTER_WR_RMVLAN           16
+#define M_FW_FILTER_WR_RMVLAN           0x1
+#define V_FW_FILTER_WR_RMVLAN(x)        ((x) << S_FW_FILTER_WR_RMVLAN)
+#define G_FW_FILTER_WR_RMVLAN(x)        \
+	(((x) >> S_FW_FILTER_WR_RMVLAN) & M_FW_FILTER_WR_RMVLAN)
+#define F_FW_FILTER_WR_RMVLAN   V_FW_FILTER_WR_RMVLAN(1U)
+
+#define S_FW_FILTER_WR_HITCNTS          15
+#define M_FW_FILTER_WR_HITCNTS          0x1
+#define V_FW_FILTER_WR_HITCNTS(x)       ((x) << S_FW_FILTER_WR_HITCNTS)
+#define G_FW_FILTER_WR_HITCNTS(x)       \
+	(((x) >> S_FW_FILTER_WR_HITCNTS) & M_FW_FILTER_WR_HITCNTS)
+#define F_FW_FILTER_WR_HITCNTS  V_FW_FILTER_WR_HITCNTS(1U)
+
+#define S_FW_FILTER_WR_TXCHAN           13
+#define M_FW_FILTER_WR_TXCHAN           0x3
+#define V_FW_FILTER_WR_TXCHAN(x)        ((x) << S_FW_FILTER_WR_TXCHAN)
+#define G_FW_FILTER_WR_TXCHAN(x)        \
+	(((x) >> S_FW_FILTER_WR_TXCHAN) & M_FW_FILTER_WR_TXCHAN)
+
+#define S_FW_FILTER_WR_PRIO     12
+#define M_FW_FILTER_WR_PRIO     0x1
+#define V_FW_FILTER_WR_PRIO(x)  ((x) << S_FW_FILTER_WR_PRIO)
+#define G_FW_FILTER_WR_PRIO(x)  \
+	(((x) >> S_FW_FILTER_WR_PRIO) & M_FW_FILTER_WR_PRIO)
+#define F_FW_FILTER_WR_PRIO     V_FW_FILTER_WR_PRIO(1U)
+
+#define S_FW_FILTER_WR_L2TIX    0
+#define M_FW_FILTER_WR_L2TIX    0xfff
+#define V_FW_FILTER_WR_L2TIX(x) ((x) << S_FW_FILTER_WR_L2TIX)
+#define G_FW_FILTER_WR_L2TIX(x) \
+	(((x) >> S_FW_FILTER_WR_L2TIX) & M_FW_FILTER_WR_L2TIX)
+
+#define S_FW_FILTER_WR_FRAG     7
+#define M_FW_FILTER_WR_FRAG     0x1
+#define V_FW_FILTER_WR_FRAG(x)  ((x) << S_FW_FILTER_WR_FRAG)
+#define G_FW_FILTER_WR_FRAG(x)  \
+	(((x) >> S_FW_FILTER_WR_FRAG) & M_FW_FILTER_WR_FRAG)
+#define F_FW_FILTER_WR_FRAG     V_FW_FILTER_WR_FRAG(1U)
+
+#define S_FW_FILTER_WR_FRAGM    6
+#define M_FW_FILTER_WR_FRAGM    0x1
+#define V_FW_FILTER_WR_FRAGM(x) ((x) << S_FW_FILTER_WR_FRAGM)
+#define G_FW_FILTER_WR_FRAGM(x) \
+	(((x) >> S_FW_FILTER_WR_FRAGM) & M_FW_FILTER_WR_FRAGM)
+#define F_FW_FILTER_WR_FRAGM    V_FW_FILTER_WR_FRAGM(1U)
+
+#define S_FW_FILTER_WR_IVLAN_VLD        5
+#define M_FW_FILTER_WR_IVLAN_VLD        0x1
+#define V_FW_FILTER_WR_IVLAN_VLD(x)     ((x) << S_FW_FILTER_WR_IVLAN_VLD)
+#define G_FW_FILTER_WR_IVLAN_VLD(x)     \
+	(((x) >> S_FW_FILTER_WR_IVLAN_VLD) & M_FW_FILTER_WR_IVLAN_VLD)
+#define F_FW_FILTER_WR_IVLAN_VLD        V_FW_FILTER_WR_IVLAN_VLD(1U)
+
+#define S_FW_FILTER_WR_OVLAN_VLD        4
+#define M_FW_FILTER_WR_OVLAN_VLD        0x1
+#define V_FW_FILTER_WR_OVLAN_VLD(x)     ((x) << S_FW_FILTER_WR_OVLAN_VLD)
+#define G_FW_FILTER_WR_OVLAN_VLD(x)     \
+	(((x) >> S_FW_FILTER_WR_OVLAN_VLD) & M_FW_FILTER_WR_OVLAN_VLD)
+#define F_FW_FILTER_WR_OVLAN_VLD        V_FW_FILTER_WR_OVLAN_VLD(1U)
+
+#define S_FW_FILTER_WR_IVLAN_VLDM       3
+#define M_FW_FILTER_WR_IVLAN_VLDM       0x1
+#define V_FW_FILTER_WR_IVLAN_VLDM(x)    ((x) << S_FW_FILTER_WR_IVLAN_VLDM)
+#define G_FW_FILTER_WR_IVLAN_VLDM(x)    \
+	(((x) >> S_FW_FILTER_WR_IVLAN_VLDM) & M_FW_FILTER_WR_IVLAN_VLDM)
+#define F_FW_FILTER_WR_IVLAN_VLDM       V_FW_FILTER_WR_IVLAN_VLDM(1U)
+
+#define S_FW_FILTER_WR_OVLAN_VLDM       2
+#define M_FW_FILTER_WR_OVLAN_VLDM       0x1
+#define V_FW_FILTER_WR_OVLAN_VLDM(x)    ((x) << S_FW_FILTER_WR_OVLAN_VLDM)
+#define G_FW_FILTER_WR_OVLAN_VLDM(x)    \
+	(((x) >> S_FW_FILTER_WR_OVLAN_VLDM) & M_FW_FILTER_WR_OVLAN_VLDM)
+#define F_FW_FILTER_WR_OVLAN_VLDM       V_FW_FILTER_WR_OVLAN_VLDM(1U)
+
+#define S_FW_FILTER_WR_RX_CHAN          15
+#define M_FW_FILTER_WR_RX_CHAN          0x1
+#define V_FW_FILTER_WR_RX_CHAN(x)       ((x) << S_FW_FILTER_WR_RX_CHAN)
+#define G_FW_FILTER_WR_RX_CHAN(x)       \
+	(((x) >> S_FW_FILTER_WR_RX_CHAN) & M_FW_FILTER_WR_RX_CHAN)
+#define F_FW_FILTER_WR_RX_CHAN  V_FW_FILTER_WR_RX_CHAN(1U)
+
+#define S_FW_FILTER_WR_RX_RPL_IQ        0
+#define M_FW_FILTER_WR_RX_RPL_IQ        0x3ff
+#define V_FW_FILTER_WR_RX_RPL_IQ(x)     ((x) << S_FW_FILTER_WR_RX_RPL_IQ)
+#define G_FW_FILTER_WR_RX_RPL_IQ(x)     \
+	(((x) >> S_FW_FILTER_WR_RX_RPL_IQ) & M_FW_FILTER_WR_RX_RPL_IQ)
+
+#define S_FW_FILTER_WR_MACI     23
+#define M_FW_FILTER_WR_MACI     0x1ff
+#define V_FW_FILTER_WR_MACI(x)  ((x) << S_FW_FILTER_WR_MACI)
+#define G_FW_FILTER_WR_MACI(x)  \
+	(((x) >> S_FW_FILTER_WR_MACI) & M_FW_FILTER_WR_MACI)
+
+#define S_FW_FILTER_WR_MACIM    14
+#define M_FW_FILTER_WR_MACIM    0x1ff
+#define V_FW_FILTER_WR_MACIM(x) ((x) << S_FW_FILTER_WR_MACIM)
+#define G_FW_FILTER_WR_MACIM(x) \
+	(((x) >> S_FW_FILTER_WR_MACIM) & M_FW_FILTER_WR_MACIM)
+
+#define S_FW_FILTER_WR_FCOE     13
+#define M_FW_FILTER_WR_FCOE     0x1
+#define V_FW_FILTER_WR_FCOE(x)  ((x) << S_FW_FILTER_WR_FCOE)
+#define G_FW_FILTER_WR_FCOE(x)  \
+	(((x) >> S_FW_FILTER_WR_FCOE) & M_FW_FILTER_WR_FCOE)
+#define F_FW_FILTER_WR_FCOE     V_FW_FILTER_WR_FCOE(1U)
+
+#define S_FW_FILTER_WR_FCOEM    12
+#define M_FW_FILTER_WR_FCOEM    0x1
+#define V_FW_FILTER_WR_FCOEM(x) ((x) << S_FW_FILTER_WR_FCOEM)
+#define G_FW_FILTER_WR_FCOEM(x) \
+	(((x) >> S_FW_FILTER_WR_FCOEM) & M_FW_FILTER_WR_FCOEM)
+#define F_FW_FILTER_WR_FCOEM    V_FW_FILTER_WR_FCOEM(1U)
+
+#define S_FW_FILTER_WR_PORT     9
+#define M_FW_FILTER_WR_PORT     0x7
+#define V_FW_FILTER_WR_PORT(x)  ((x) << S_FW_FILTER_WR_PORT)
+#define G_FW_FILTER_WR_PORT(x)  \
+	(((x) >> S_FW_FILTER_WR_PORT) & M_FW_FILTER_WR_PORT)
+
+#define S_FW_FILTER_WR_PORTM    6
+#define M_FW_FILTER_WR_PORTM    0x7
+#define V_FW_FILTER_WR_PORTM(x) ((x) << S_FW_FILTER_WR_PORTM)
+#define G_FW_FILTER_WR_PORTM(x) \
+	(((x) >> S_FW_FILTER_WR_PORTM) & M_FW_FILTER_WR_PORTM)
+
+#define S_FW_FILTER_WR_MATCHTYPE        3
+#define M_FW_FILTER_WR_MATCHTYPE        0x7
+#define V_FW_FILTER_WR_MATCHTYPE(x)     ((x) << S_FW_FILTER_WR_MATCHTYPE)
+#define G_FW_FILTER_WR_MATCHTYPE(x)     \
+	(((x) >> S_FW_FILTER_WR_MATCHTYPE) & M_FW_FILTER_WR_MATCHTYPE)
+
+#define S_FW_FILTER_WR_MATCHTYPEM       0
+#define M_FW_FILTER_WR_MATCHTYPEM       0x7
+#define V_FW_FILTER_WR_MATCHTYPEM(x)    ((x) << S_FW_FILTER_WR_MATCHTYPEM)
+#define G_FW_FILTER_WR_MATCHTYPEM(x)    \
+	(((x) >> S_FW_FILTER_WR_MATCHTYPEM) & M_FW_FILTER_WR_MATCHTYPEM)
+
 struct fw_ulptx_wr {
 	__be32 op_to_compl;
 	__be32 flowid_len16;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* Re: [PATCH] solos-pci: don't call vcc->pop() after pclose()
From: David Woodhouse @ 2012-11-29 14:42 UTC (permalink / raw)
  To: Krzysztof Mazur
  Cc: David Laight, chas williams - CONTRACTOR, davem, netdev,
	linux-kernel, nathan
In-Reply-To: <20121129132021.GA15425@shrek.podlesie.net>

[-- Attachment #1: Type: text/plain, Size: 709 bytes --]

On Thu, 2012-11-29 at 14:20 +0100, Krzysztof Mazur wrote:
>                 if (card->tx_skb[port] == skb) {
>                         skb_get(skb);
>                         solos_pop(SKB_CB(skb)->vcc, skb);
>                         SKB_CB(skb)->vcc = NULL;

Um... yes, that would probably work. But it's subtle enough that it
bothers me. And if it *did* cause any strange issues, it's a rare case
and would be hard to reproduce/debug. Is it *really* necessary, just to
speed up the vcc close? I'm inclined to stick with the 'KISS' approach.

(Note that the card->tx_skb[port] skb isn't on the queue any more; you'd
do that bit separately and not in the skb_queue_walk() loop.)

-- 
dwmw2


[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 6171 bytes --]

^ permalink raw reply

* Re: [PATCH] solos-pci: don't call vcc->pop() after pclose()
From: chas williams - CONTRACTOR @ 2012-11-29 14:41 UTC (permalink / raw)
  To: Krzysztof Mazur
  Cc: David Woodhouse, David Laight, davem, netdev, linux-kernel,
	nathan
In-Reply-To: <20121129124344.GA7704@shrek.podlesie.net>

On Thu, 29 Nov 2012 13:43:44 +0100
Krzysztof Mazur <krzysiek@podlesie.net> wrote:

> Removing packets from tx_queue is not needed. We can transmit packets
> also after close. We just can't call vcc->pop() after close,
> so we can just set SKB_CB(skb)->vcc of such packets to NULL so fpga_tx()
> won't call vcc->pop().

i dont think you can transmit packets after close().  you can transmit
packets during close() though.  if you transmit after close that means
that you are using the vpi/vci pair that the atm stack thinks is no
longer in use.  additionally after close(), the hardware should be in a
state such that you cannot transmit or receive on the vpi/vci that has
been closed.

close() needs to make sure that any pending tx packets are sent or
otherwise disposed of (like turning off the transmit segmentation
engine, clearing the packets, or whatever).  any partially reassembled
pdu's also need to be cleared as well.

^ permalink raw reply

* Re: [net-next RFC v2] net_cls: traffic counter based on classification control cgroup
From: Daniel Wagner @ 2012-11-29 14:36 UTC (permalink / raw)
  To: Alexey Perevalov
  Cc: Glauber Costa, netdev-u79uwXL29TY76Z2rM5mHXA,
	cgroups-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <50B5F2EE.6050204-Sze3O3UU22JBDgjK7y7TUQ@public.gmane.org>

Hi Alexey

On 28.11.2012 12:18, Alexey Perevalov wrote:
> On 11/28/2012 12:09 PM, Daniel  Wagner wrote:
>>>>> 2) When Daniel exposed his use case to me, it gave me the impression
>>>>> that "counting traffic" is something that is totally doable by
>>>>> having a
>>>>> dedicated interface in a separate namespace. Basically, we already
>>>>> count
>>>>> traffic (rx and tx) for all interfaces anyway, so it suggests that it
>>>>> could be an interesting way to see the problem.
>>>> Moving applications into separate net namespaces is for sure a valid
>>>> solution.
>>>> Though there is a one drawback in this approach. The namespaces need
>>>> to be
>>>> attached to a bridge and then some NATting. That means every
>>>> application
>>>> would get it's own IP address. This might be okay for your certain use
>>>> cases but I am still trying to work around this. Glauber and I had some
>>>> discussion about this and he suggested to allow the physical networking
>>>> device to be attached to several namespaces (e.g. via macvlan). Every
>>>> namespace would get the same IP address. Unfortunately, this would
>>>> result in
>>>> the same mess as several physical devices on a network get the same
>>>> IP address assigned.
>>> Is I truly understand what to make statistics works we need to put
>>> process to separate namespace?
>> If a process lives in its own network namespace then you can
>> count the packets/bytes on the network interface level. The side effect
>> is that is that each namespace is obviously a new network and has to be
>> treated as such.
>>
>>> Approach to keep counter in cgroup hasn't such side effects, but it has
>>> another ).
>> cgroups are not for free. Currently a lot of effort is put into getting
>> a reasonable performance and behavior into cgroups. In this situation
>> any new feature added to cgroups will need a pretty good justification
>> why it is needed and why it cant be done with existing infrastructure.
> I want to figure out in yours proposed design:
>
>     +------------------------------------------------+
>     |network namespace1: pid1, pid2,...              |
>     |                                                |
> +---------------------------+
>     |   network stack,          network iface        |
> |                           |
>     |       nf hooks                                 +------->| physical
> network          |
>     +------------------------------------------------+        |
> interface            |
> |                           |
> |                           |
>     +------------------------------------------------+
> |                           |
>     |network namespace2: pid1, pid2,...              |
> |                           |
>     | +------->|                           |
>     |   network stack,          network iface        |
> |                           |
>     |       nf hooks                                 |
> |                           |
>     +------------------------------------------------+
> |                           |
> +---------------------------+
>     ...                                                          ^
>     +------------------------------------------------+           |
>     |network namespace3: pid1, pid2,...              |           |
>     |                                                |           |
>     |   network stack,          network iface        +-----------+
>     |       nf hooks                                 |
>     +------------------------------------------------+
>
>
> Question, in case of one physical networking device connected to several
> namespaces,

Currently, a physical device can only live in one namespace. The idea 
was to see if that 'limitation' could be removed, e.g. by modifying macvlan.

> is it allow to tweak network packet scheduler (qdisc instance) using
> traffic control tool for one physical network interface?
> The same question is about netfilter hooks. I have seen the code, it
> seems to me nf hooks is registering per network stack now.

As I said this was just and idea and maybe it is not possible.

> CGroup framework has an notification mechanism based on eventfd. For
> example I can just send notification to user space about network activity.
> Is there such mechanism in standard infrastructure to notify user space
> apps on activity on monitored application (maybe nf_queue)?

My current plan is to use IDLETIMER for this stuff.

cheers,
daniel

^ permalink raw reply

* Re: [PATCH v2 3/3] pppoatm: protect against freeing of vcc
From: chas williams - CONTRACTOR @ 2012-11-29 14:29 UTC (permalink / raw)
  To: Krzysztof Mazur
  Cc: David Woodhouse, David Laight, davem, netdev, linux-kernel,
	nathan
In-Reply-To: <20121129105715.GA10226@shrek.podlesie.net>

On Thu, 29 Nov 2012 11:57:15 +0100
Krzysztof Mazur <krzysiek@podlesie.net> wrote:

> or if we really want to call vcc->pop() for such skbs:

you need to call ->pop() to cleaning up the wmem_alloc accounting.
otherwise you will get complaints from the atm stack later about
freeing a vcc that had outstanding data.

^ permalink raw reply

* iputils: ping -I <iface>
From: Jan Synacek @ 2012-11-29 14:12 UTC (permalink / raw)
  To: YOSHIFUJI Hideaki; +Cc: netdev

Hello,

There seems to be a bug(?) when calling ping with -I lo:

$ ping -I lo kernel.org

PING kernel.org (149.20.4.69) from 192.168.1.10 lo: 56(84) bytes of data.
^C

Note that 192.168.1.10 is my primary interface's address (em1). However, no
replies are coming back.

$ ping -I em1 kernel.org

PING kernel.org (149.20.4.69) from 192.168.1.10 em1: 56(84) bytes of data.
64 bytes from pub2.kernel.org (149.20.4.69): icmp_seq=1 ttl=42 time=202 ms
64 bytes from pub2.kernel.org (149.20.4.69): icmp_seq=2 ttl=42 time=187 ms
^C

Works as expected.

I know that binding to loopback probably doesn't make much sense, but I think
that ping should be able to cope with that.

Also, it would be nice to mention the difference between -I <ip> and -I <iface>
in the manpage.

I don't understand the problem clearly enough to write a patch.

Regards,
-- 
Jan Synacek
Software Engineer, BaseOS team Brno, Red Hat

^ permalink raw reply

* pull-request: can-next 2012-11-29
From: Marc Kleine-Budde @ 2012-11-29 13:50 UTC (permalink / raw)
  To: netdev; +Cc: linux-can@vger.kernel.org

[-- Attachment #1: Type: text/plain, Size: 2152 bytes --]

Hello David,

this pull request is for net-next/master. There is a patch by Alexander
Stein fixing a reference counter problem which can make driver
unloading impossible (stable Cc'ed). And several patches by me which
remove an obsolete mechanism from several drivers, which is already
handled at the infrastructure level.

regards,
Marc

---

The following changes since commit 3177bf6f922f62743133abbcbbbb5545f4133b2d:

  net: ethernet: cpsw: fix build warnings for CPSW when CPTS not selected (2012-11-28 17:51:16 -0500)

are available in the git repository at:

  git://gitorious.org/linux-can/linux-can-next.git for-davem

for you to fetch changes up to 823d7a1f761d6404babaab04cc8b1724186cf2c8:

  can: pcan_usb_core: remove obsolete variable open_time (2012-11-29 14:34:06 +0100)

----------------------------------------------------------------
Alexander Stein (1):
      can: Do not call dev_put if restart timer is running upon close

Marc Kleine-Budde (5):
      can: mscan: remove obsolete variable open_time
      can: sja1000: remove obsolete variable open_time
      can: ems_usb: remove obsolete variable open_time
      can: esd_usb2: remove obsolete variable open_time
      can: pcan_usb_core: remove obsolete variable open_time

 drivers/net/can/dev.c                        |    3 +--
 drivers/net/can/mscan/mscan.c                |    8 --------
 drivers/net/can/mscan/mscan.h                |    1 -
 drivers/net/can/sja1000/sja1000.c            |    8 --------
 drivers/net/can/sja1000/sja1000.h            |    1 -
 drivers/net/can/usb/ems_usb.c                |    7 -------
 drivers/net/can/usb/esd_usb2.c               |   10 ----------
 drivers/net/can/usb/peak_usb/pcan_usb_core.c |    5 -----
 drivers/net/can/usb/peak_usb/pcan_usb_core.h |    1 -
 9 files changed, 1 insertion(+), 43 deletions(-)

-- 
Pengutronix e.K.                  | Marc Kleine-Budde           |
Industrial Linux Solutions        | Phone: +49-231-2826-924     |
Vertretung West/Dortmund          | Fax:   +49-5121-206917-5555 |
Amtsgericht Hildesheim, HRA 2686  | http://www.pengutronix.de   |


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 261 bytes --]

^ permalink raw reply

* Re: [PATCH] solos-pci: don't call vcc->pop() after pclose()
From: Krzysztof Mazur @ 2012-11-29 13:20 UTC (permalink / raw)
  To: David Woodhouse
  Cc: David Laight, chas williams - CONTRACTOR, davem, netdev,
	linux-kernel, nathan
In-Reply-To: <1354193837.21562.159.camel@shinybook.infradead.org>

On Thu, Nov 29, 2012 at 12:57:17PM +0000, David Woodhouse wrote:
> On Thu, 2012-11-29 at 13:43 +0100, Krzysztof Mazur wrote:
> > 
> > Removing packets from tx_queue is not needed. We can transmit packets
> > also after close. We just can't call vcc->pop() after close,
> > so we can just set SKB_CB(skb)->vcc of such packets to NULL so
> > fpga_tx() won't call vcc->pop().
> 
> Your patch doesn't do that, does it? You'd want something like
> 
>  if (card->tx_skb[port] && SKB_CB(card->tx_skb[port]->vcc) == vcc)
>     SKB_CB(card->tx_skb[port]->vcc) = NULL;

No, I want to process all queued packets, not just only those that
were already sent do card.

In that case we will need to remove other packets with that vcc
from queue, of couse we can still do that in the same loop, something
like:

	if (SKB_CB(skb)->vcc == vcc) {
		if (card->tx_skb[port] == skb) {
			skb_get(skb);
			solos_pop(SKB_CB(skb)->vcc, skb);
			SKB_CB(skb)->vcc = NULL;
		} else {
			skb_unlink(skb, &card->tx_queue[port]);
			solos_pop(SKB_CB(skb)->vcc, skb);
		}
	}

But I don't think that this optization is needed.

> 
> Under card->tx_lock should suffice.
> 
> And do we just *not* call the ->pop() on that skb ever? And hope that it
> doesn't screw up some other state somewhere? Like if we're doing MLPPP
> and I've implemented BQL for PPP... we might never call
> ppp_completed_queue() for that skb, so even though this *channel* is
> going away, it might still contribute towards the perceived queue on the
> overall PPP netdev?
> 
> Failing to call ->pop() could cause memory leaks and other issues; I
> don't think it's reasonable. I think we *have* to wait for
> card->tx_skb[port] if it's for the VCC we're closing.

We are calling ->pop() in solos_pop() just before SKB_CB(skb)->vcc = NULL,
but we are doing that before we really finish processing that packet,
that's why we do skb_get(skb).

Krzysiek

^ permalink raw reply

* Re: [PATCH] solos-pci: don't call vcc->pop() after pclose()
From: David Woodhouse @ 2012-11-29 12:57 UTC (permalink / raw)
  To: Krzysztof Mazur
  Cc: David Laight, chas williams - CONTRACTOR, davem, netdev,
	linux-kernel, nathan
In-Reply-To: <20121129124344.GA7704@shrek.podlesie.net>

[-- Attachment #1: Type: text/plain, Size: 1095 bytes --]

On Thu, 2012-11-29 at 13:43 +0100, Krzysztof Mazur wrote:
> 
> Removing packets from tx_queue is not needed. We can transmit packets
> also after close. We just can't call vcc->pop() after close,
> so we can just set SKB_CB(skb)->vcc of such packets to NULL so
> fpga_tx() won't call vcc->pop().

Your patch doesn't do that, does it? You'd want something like

 if (card->tx_skb[port] && SKB_CB(card->tx_skb[port]->vcc) == vcc)
    SKB_CB(card->tx_skb[port]->vcc) = NULL;

Under card->tx_lock should suffice.

And do we just *not* call the ->pop() on that skb ever? And hope that it
doesn't screw up some other state somewhere? Like if we're doing MLPPP
and I've implemented BQL for PPP... we might never call
ppp_completed_queue() for that skb, so even though this *channel* is
going away, it might still contribute towards the perceived queue on the
overall PPP netdev?

Failing to call ->pop() could cause memory leaks and other issues; I
don't think it's reasonable. I think we *have* to wait for
card->tx_skb[port] if it's for the VCC we're closing.

-- 
dwmw2

[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 6171 bytes --]

^ permalink raw reply

* [PATCH] solos-pci: don't call vcc->pop() after pclose()
From: Krzysztof Mazur @ 2012-11-29 12:43 UTC (permalink / raw)
  To: David Woodhouse
  Cc: David Laight, chas williams - CONTRACTOR, davem, netdev,
	linux-kernel, nathan
In-Reply-To: <1354190143.21562.145.camel@shinybook.infradead.org>

On Thu, Nov 29, 2012 at 11:55:43AM +0000, David Woodhouse wrote:
> On Thu, 2012-11-29 at 11:57 +0100, Krzysztof Mazur wrote:
> > do we really need to wait here?
> > Why don't just do something like that:
> > 
> > 	tasklet_disable(&card->tlet);
> > 	spin_lock(&card->tx_queue_lock);
> > 	for each skb in queue
> > 		SKB_CB(skb)->vcc = NULL;
> > 	spin_unlock(&card->tx_queue_lock);
> > 	tasklet_enable(&card->tlet);
> > 
> > or if we really want to call vcc->pop() for such skbs:
> > 
> > 	tasklet_disable(&card->tlet);
> > 	spin_lock(&card->tx_queue_lock);
> > 	for each skb in queue {
> > 		skb_get(skb);
> > 		solos_pop(SKB_CB(skb)->vcc, skb);
> > 		SKB_CB(skb)->vcc = NULL;
> > 	}
> > 	spin_unlock(&card->tx_queue_lock);
> > 	tasklet_enable(&card->tlet);
> 
> Yes, we could certainly remove the packets from the tx_queue first.
> 
> However, in the card->using_dma case there might be a skb for this vcc
> *currently* being DMA'd, and we'd still need to wait for that one.

Removing packets from tx_queue is not needed. We can transmit packets
also after close. We just can't call vcc->pop() after close,
so we can just set SKB_CB(skb)->vcc of such packets to NULL so fpga_tx()
won't call vcc->pop().

Maybe I was not precise enough, I'm think that all we need is
something like that:

-- >8 --
Subject: [PATCH] solos-pci: don't call vcc->pop() after pclose()

After atmdev_ops->close() we cannot use vcc->pop() because the vcc may,
and probably will be destroyed.

We can just set vcc for such frames to NULL because fpga_tx() after
completion will call dev_kfree_skb() in that case.

Signed-off-by: Krzysztof Mazur <krzysiek@podlesie.net>
---
 drivers/atm/solos-pci.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/atm/solos-pci.c b/drivers/atm/solos-pci.c
index 9851093..aabe021 100644
--- a/drivers/atm/solos-pci.c
+++ b/drivers/atm/solos-pci.c
@@ -868,6 +868,19 @@ static void pclose(struct atm_vcc *vcc)
 	struct solos_card *card = vcc->dev->dev_data;
 	struct sk_buff *skb;
 	struct pkt_hdr *header;
+	unsigned int port;
+
+	tasklet_disable(&card->tlet);
+	spin_lock(&card->tx_queue_lock);
+	for (port = 0; port < card->nr_ports; port++)
+		skb_queue_walk(&card->tx_queue[port], skb)
+			if (SKB_CB(skb)->vcc == vcc) {
+				skb_get(skb);
+				solos_pop(SKB_CB(skb)->vcc, skb);
+				SKB_CB(skb)->vcc = NULL;
+			}
+	spin_unlock(&card->tx_queue_lock);
+	tasklet_enable(&card->tlet);
 
 	skb = alloc_skb(sizeof(*header), GFP_ATOMIC);
 	if (!skb) {
-- 
1.8.0.411.g71a7da8

^ permalink raw reply related

* [PATCH v2] bonding: fix race condition in bonding_store_slaves_active
From: Nikolay Aleksandrov @ 2012-11-29 11:37 UTC (permalink / raw)
  To: netdev; +Cc: fubar, andy, davem
In-Reply-To: <1353759595-30452-1-git-send-email-nikolay@redhat.com>

 Race between bonding_store_slaves_active() and slave manipulation 
 functions. The bond_for_each_slave use in bonding_store_slaves_active()
 is not protected by any synchronization mechanism.
 NULL pointer dereference is easy to reach.
 Fixed by acquiring the bond->lock for the slave walk.

 v2: Make description text < 75 columns

Signed-off-by: Nikolay Aleksandrov <nikolay@redhat.com>
---
 drivers/net/bonding/bond_sysfs.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index ef8d2a0..ba4f95b 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -1582,6 +1582,7 @@ static ssize_t bonding_store_slaves_active(struct device *d,
 		goto out;
 	}
 
+	read_lock(&bond->lock);
 	bond_for_each_slave(bond, slave, i) {
 		if (!bond_is_active_slave(slave)) {
 			if (new_value)
@@ -1590,6 +1591,7 @@ static ssize_t bonding_store_slaves_active(struct device *d,
 				slave->inactive = 1;
 		}
 	}
+	read_unlock(&bond->lock);
 out:
 	return ret;
 }
-- 
1.7.11.7

^ permalink raw reply related

* Re: [PATCH v2 3/3] pppoatm: protect against freeing of vcc
From: David Woodhouse @ 2012-11-29 11:55 UTC (permalink / raw)
  To: Krzysztof Mazur
  Cc: David Laight, chas williams - CONTRACTOR, davem, netdev,
	linux-kernel, nathan
In-Reply-To: <20121129105715.GA10226@shrek.podlesie.net>

[-- Attachment #1: Type: text/plain, Size: 1675 bytes --]

On Thu, 2012-11-29 at 11:57 +0100, Krzysztof Mazur wrote:
> do we really need to wait here?
> Why don't just do something like that:
> 
> 	tasklet_disable(&card->tlet);
> 	spin_lock(&card->tx_queue_lock);
> 	for each skb in queue
> 		SKB_CB(skb)->vcc = NULL;
> 	spin_unlock(&card->tx_queue_lock);
> 	tasklet_enable(&card->tlet);
> 
> or if we really want to call vcc->pop() for such skbs:
> 
> 	tasklet_disable(&card->tlet);
> 	spin_lock(&card->tx_queue_lock);
> 	for each skb in queue {
> 		skb_get(skb);
> 		solos_pop(SKB_CB(skb)->vcc, skb);
> 		SKB_CB(skb)->vcc = NULL;
> 	}
> 	spin_unlock(&card->tx_queue_lock);
> 	tasklet_enable(&card->tlet);

Yes, we could certainly remove the packets from the tx_queue first.

However, in the card->using_dma case there might be a skb for this vcc
*currently* being DMA'd, and we'd still need to wait for that one.

I suppose we could just have a waitqueue in *every* TX skb, and under
card->tx_lock we could add ourselves to *that* waitqueue. Or just a
global waitqueue for DMA tx_done, perhaps. But waiting for our own
PKT_PCLOSE skb is just 'cleaner' in my view. It's simpler, and it's much
easier to test. Even if I had DMA-capable hardware, I'd have to get the
right timing to properly test that TX-pending-DMA case.

So dequeuing the packets would only serve to make pclose() slightly
faster, rather than simplifying it. It's hardly a fast path that we care
about, and I've also already ensured that there should only be one or
two packets queued per vcc *anyway*. So I'm mostly inclined not to
bother.

(I did fix the timeout argument to wait_for_completion_timeout())

-- 
dwmw2

[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 6171 bytes --]

^ permalink raw reply

* [PATCH v3] bonding: make arp_ip_target parameter checks consistent with sysfs
From: Nikolay Aleksandrov @ 2012-11-29 11:34 UTC (permalink / raw)
  To: netdev; +Cc: fubar, andy, davem
In-Reply-To: <1354121052-6263-1-git-send-email-nikolay@redhat.com>

 The module can be loaded with arp_ip_target="255.255.255.255" which makes
 it impossible to remove as the function in sysfs checks for that value,
 so we make the parameter checks consistent with sysfs.

 v2: Fix formatting
 v3: Make description text < 75 columns

Signed-off-by: Nikolay Aleksandrov <nikolay@redhat.com>
---
 drivers/net/bonding/bond_main.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 5f5b69f..d29159a 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4706,12 +4706,13 @@ static int bond_check_params(struct bond_params *params)
 	     arp_ip_count++) {
 		/* not complete check, but should be good enough to
 		   catch mistakes */
-		if (!isdigit(arp_ip_target[arp_ip_count][0])) {
+		__be32 ip = in_aton(arp_ip_target[arp_ip_count]);
+		if (!isdigit(arp_ip_target[arp_ip_count][0]) ||
+		    ip == 0 || ip == htonl(INADDR_BROADCAST)) {
 			pr_warning("Warning: bad arp_ip_target module parameter (%s), ARP monitoring will not be performed\n",
 				   arp_ip_target[arp_ip_count]);
 			arp_interval = 0;
 		} else {
-			__be32 ip = in_aton(arp_ip_target[arp_ip_count]);
 			arp_target[arp_ip_count] = ip;
 		}
 	}
-- 
1.7.11.7

^ permalink raw reply related

* [PATCH v2] bonding: fix miimon and arp_interval delayed work race conditions
From: Nikolay Aleksandrov @ 2012-11-29 11:31 UTC (permalink / raw)
  To: netdev; +Cc: fubar, andy, davem
In-Reply-To: <1353759471-30323-1-git-send-email-nikolay@redhat.com>

First I would give three observations which will be used later.
Observation 1: if (delayed_work_pending(wq)) cancel_delayed_work(wq)
 This usage is wrong because the pending bit is cleared just before the
 work's fn is executed and if the function re-arms itself we might end up
 with the work still running. It's safe to call cancel_delayed_work_sync()
 even if the work is not queued at all.
Observation 2: Use of INIT_DELAYED_WORK()
 Work needs to be initialized only once prior to (de/en)queueing.
Observation 3: IFF_UP is set only after ndo_open is called

Related race conditions:
1. Race between bonding_store_miimon() and bonding_store_arp_interval()
 Because of Obs.1 we can end up having both works enqueued.
2. Multiple races with INIT_DELAYED_WORK()
 Since the works are not protected by anything between INIT_DELAYED_WORK()
 and calls to (en/de)queue it is possible for races between the following
 functions:
 (races are also possible between the calls to INIT_DELAYED_WORK()
  and workqueue code)
 bonding_store_miimon() - bonding_store_arp_interval(), bond_close(),
			  bond_open(), enqueued functions
 bonding_store_arp_interval() - bonding_store_miimon(), bond_close(),
				bond_open(), enqueued functions
3. By Obs.1 we need to change bond_cancel_all()

Bugs 1 and 2 are fixed by moving all work initializations in bond_open
which by Obs. 2 and Obs. 3 and the fact that we make sure that all works
are cancelled in bond_close(), is guaranteed not to have any work
enqueued.
Also RTNL lock is now acquired in bonding_store_miimon/arp_interval so
they can't race with bond_close and bond_open. The opposing work is
cancelled only if the IFF_UP flag is set and it is cancelled
unconditionally. The opposing work is already cancelled if the interface
is down so no need to cancel it again. This way we don't need new
synchronizations for the bonding workqueue. These bugs (and fixes) are
tied together and belong in the same patch.
Note: I have left 1 line intentionally over 80 characters (84) because I
      didn't like how it looks broken down. If you'd prefer it otherwise,
      then simply break it.

 v2: Make description text < 75 columns

Signed-off-by: Nikolay Aleksandrov <nikolay@redhat.com>
---
 drivers/net/bonding/bond_main.c  | 88 ++++++++++++----------------------------
 drivers/net/bonding/bond_sysfs.c | 34 +++++-----------
 2 files changed, 36 insertions(+), 86 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 5f5b69f..1445c7d 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3459,6 +3459,28 @@ static int bond_xmit_hash_policy_l34(struct sk_buff *skb, int count)
 
 /*-------------------------- Device entry points ----------------------------*/
 
+static void bond_work_init_all(struct bonding *bond)
+{
+	INIT_DELAYED_WORK(&bond->mcast_work,
+			  bond_resend_igmp_join_requests_delayed);
+	INIT_DELAYED_WORK(&bond->alb_work, bond_alb_monitor);
+	INIT_DELAYED_WORK(&bond->mii_work, bond_mii_monitor);
+	if (bond->params.mode == BOND_MODE_ACTIVEBACKUP)
+		INIT_DELAYED_WORK(&bond->arp_work, bond_activebackup_arp_mon);
+	else
+		INIT_DELAYED_WORK(&bond->arp_work, bond_loadbalance_arp_mon);
+	INIT_DELAYED_WORK(&bond->ad_work, bond_3ad_state_machine_handler);
+}
+
+static void bond_work_cancel_all(struct bonding *bond)
+{
+	cancel_delayed_work_sync(&bond->mii_work);
+	cancel_delayed_work_sync(&bond->arp_work);
+	cancel_delayed_work_sync(&bond->alb_work);
+	cancel_delayed_work_sync(&bond->ad_work);
+	cancel_delayed_work_sync(&bond->mcast_work);
+}
+
 static int bond_open(struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
@@ -3481,41 +3503,27 @@ static int bond_open(struct net_device *bond_dev)
 	}
 	read_unlock(&bond->lock);
 
-	INIT_DELAYED_WORK(&bond->mcast_work, bond_resend_igmp_join_requests_delayed);
+	bond_work_init_all(bond);
 
 	if (bond_is_lb(bond)) {
 		/* bond_alb_initialize must be called before the timer
 		 * is started.
 		 */
-		if (bond_alb_initialize(bond, (bond->params.mode == BOND_MODE_ALB))) {
-			/* something went wrong - fail the open operation */
+		if (bond_alb_initialize(bond, (bond->params.mode == BOND_MODE_ALB)))
 			return -ENOMEM;
-		}
-
-		INIT_DELAYED_WORK(&bond->alb_work, bond_alb_monitor);
 		queue_delayed_work(bond->wq, &bond->alb_work, 0);
 	}
 
-	if (bond->params.miimon) {  /* link check interval, in milliseconds. */
-		INIT_DELAYED_WORK(&bond->mii_work, bond_mii_monitor);
+	if (bond->params.miimon)  /* link check interval, in milliseconds. */
 		queue_delayed_work(bond->wq, &bond->mii_work, 0);
-	}
 
 	if (bond->params.arp_interval) {  /* arp interval, in milliseconds. */
-		if (bond->params.mode == BOND_MODE_ACTIVEBACKUP)
-			INIT_DELAYED_WORK(&bond->arp_work,
-					  bond_activebackup_arp_mon);
-		else
-			INIT_DELAYED_WORK(&bond->arp_work,
-					  bond_loadbalance_arp_mon);
-
 		queue_delayed_work(bond->wq, &bond->arp_work, 0);
 		if (bond->params.arp_validate)
 			bond->recv_probe = bond_arp_rcv;
 	}
 
 	if (bond->params.mode == BOND_MODE_8023AD) {
-		INIT_DELAYED_WORK(&bond->ad_work, bond_3ad_state_machine_handler);
 		queue_delayed_work(bond->wq, &bond->ad_work, 0);
 		/* register to receive LACPDUs */
 		bond->recv_probe = bond_3ad_lacpdu_recv;
@@ -3530,34 +3538,10 @@ static int bond_close(struct net_device *bond_dev)
 	struct bonding *bond = netdev_priv(bond_dev);
 
 	write_lock_bh(&bond->lock);
-
 	bond->send_peer_notif = 0;
-
 	write_unlock_bh(&bond->lock);
 
-	if (bond->params.miimon) {  /* link check interval, in milliseconds. */
-		cancel_delayed_work_sync(&bond->mii_work);
-	}
-
-	if (bond->params.arp_interval) {  /* arp interval, in milliseconds. */
-		cancel_delayed_work_sync(&bond->arp_work);
-	}
-
-	switch (bond->params.mode) {
-	case BOND_MODE_8023AD:
-		cancel_delayed_work_sync(&bond->ad_work);
-		break;
-	case BOND_MODE_TLB:
-	case BOND_MODE_ALB:
-		cancel_delayed_work_sync(&bond->alb_work);
-		break;
-	default:
-		break;
-	}
-
-	if (delayed_work_pending(&bond->mcast_work))
-		cancel_delayed_work_sync(&bond->mcast_work);
-
+	bond_work_cancel_all(bond);
 	if (bond_is_lb(bond)) {
 		/* Must be called only after all
 		 * slaves have been released
@@ -4436,26 +4420,6 @@ static void bond_setup(struct net_device *bond_dev)
 	bond_dev->features |= bond_dev->hw_features;
 }
 
-static void bond_work_cancel_all(struct bonding *bond)
-{
-	if (bond->params.miimon && delayed_work_pending(&bond->mii_work))
-		cancel_delayed_work_sync(&bond->mii_work);
-
-	if (bond->params.arp_interval && delayed_work_pending(&bond->arp_work))
-		cancel_delayed_work_sync(&bond->arp_work);
-
-	if (bond->params.mode == BOND_MODE_ALB &&
-	    delayed_work_pending(&bond->alb_work))
-		cancel_delayed_work_sync(&bond->alb_work);
-
-	if (bond->params.mode == BOND_MODE_8023AD &&
-	    delayed_work_pending(&bond->ad_work))
-		cancel_delayed_work_sync(&bond->ad_work);
-
-	if (delayed_work_pending(&bond->mcast_work))
-		cancel_delayed_work_sync(&bond->mcast_work);
-}
-
 /*
 * Destroy a bonding device.
 * Must be under rtnl_lock when this function is called.
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index ef8d2a0..3327a07 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -513,6 +513,8 @@ static ssize_t bonding_store_arp_interval(struct device *d,
 	int new_value, ret = count;
 	struct bonding *bond = to_bond(d);
 
+	if (!rtnl_trylock())
+		return restart_syscall();
 	if (sscanf(buf, "%d", &new_value) != 1) {
 		pr_err("%s: no arp_interval value specified.\n",
 		       bond->dev->name);
@@ -539,10 +541,6 @@ static ssize_t bonding_store_arp_interval(struct device *d,
 		pr_info("%s: ARP monitoring cannot be used with MII monitoring. %s Disabling MII monitoring.\n",
 			bond->dev->name, bond->dev->name);
 		bond->params.miimon = 0;
-		if (delayed_work_pending(&bond->mii_work)) {
-			cancel_delayed_work(&bond->mii_work);
-			flush_workqueue(bond->wq);
-		}
 	}
 	if (!bond->params.arp_targets[0]) {
 		pr_info("%s: ARP monitoring has been set up, but no ARP targets have been specified.\n",
@@ -554,19 +552,12 @@ static ssize_t bonding_store_arp_interval(struct device *d,
 		 * timer will get fired off when the open function
 		 * is called.
 		 */
-		if (!delayed_work_pending(&bond->arp_work)) {
-			if (bond->params.mode == BOND_MODE_ACTIVEBACKUP)
-				INIT_DELAYED_WORK(&bond->arp_work,
-						  bond_activebackup_arp_mon);
-			else
-				INIT_DELAYED_WORK(&bond->arp_work,
-						  bond_loadbalance_arp_mon);
-
-			queue_delayed_work(bond->wq, &bond->arp_work, 0);
-		}
+		cancel_delayed_work_sync(&bond->mii_work);
+		queue_delayed_work(bond->wq, &bond->arp_work, 0);
 	}
 
 out:
+	rtnl_unlock();
 	return ret;
 }
 static DEVICE_ATTR(arp_interval, S_IRUGO | S_IWUSR,
@@ -962,6 +953,8 @@ static ssize_t bonding_store_miimon(struct device *d,
 	int new_value, ret = count;
 	struct bonding *bond = to_bond(d);
 
+	if (!rtnl_trylock())
+		return restart_syscall();
 	if (sscanf(buf, "%d", &new_value) != 1) {
 		pr_err("%s: no miimon value specified.\n",
 		       bond->dev->name);
@@ -993,10 +986,6 @@ static ssize_t bonding_store_miimon(struct device *d,
 				bond->params.arp_validate =
 					BOND_ARP_VALIDATE_NONE;
 			}
-			if (delayed_work_pending(&bond->arp_work)) {
-				cancel_delayed_work(&bond->arp_work);
-				flush_workqueue(bond->wq);
-			}
 		}
 
 		if (bond->dev->flags & IFF_UP) {
@@ -1005,15 +994,12 @@ static ssize_t bonding_store_miimon(struct device *d,
 			 * timer will get fired off when the open function
 			 * is called.
 			 */
-			if (!delayed_work_pending(&bond->mii_work)) {
-				INIT_DELAYED_WORK(&bond->mii_work,
-						  bond_mii_monitor);
-				queue_delayed_work(bond->wq,
-						   &bond->mii_work, 0);
-			}
+			cancel_delayed_work_sync(&bond->arp_work);
+			queue_delayed_work(bond->wq, &bond->mii_work, 0);
 		}
 	}
 out:
+	rtnl_unlock();
 	return ret;
 }
 static DEVICE_ATTR(miimon, S_IRUGO | S_IWUSR,
-- 
1.7.11.7

^ permalink raw reply related

* Re: [net-next RFC] pktgen: don't wait for the device who doesn't free skb immediately after sent
From: Michael S. Tsirkin @ 2012-11-29 11:30 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, virtualization, Stephen Hemminger, davem
In-Reply-To: <1570859.IePPxHAEX1@jason-thinkpad-t430s>

On Thu, Nov 29, 2012 at 06:13:13PM +0800, Jason Wang wrote:
> On Wednesday, November 28, 2012 08:53:05 AM Stephen Hemminger wrote:
> > On Wed, 28 Nov 2012 14:48:52 +0800
> > 
> > Jason Wang <jasowang@redhat.com> wrote:
> > > On 11/28/2012 12:49 AM, Stephen Hemminger wrote:
> > > > On Tue, 27 Nov 2012 14:45:13 +0800
> > > > 
> > > > Jason Wang <jasowang@redhat.com> wrote:
> > > >> On 11/27/2012 01:37 AM, Stephen Hemminger wrote:
> > > >>> On Mon, 26 Nov 2012 15:56:52 +0800
> > > >>> 
> > > >>> Jason Wang <jasowang@redhat.com> wrote:
> > > >>>> Some deivces do not free the old tx skbs immediately after it has
> > > >>>> been sent
> > > >>>> (usually in tx interrupt). One such example is virtio-net which
> > > >>>> optimizes for virt and only free the possible old tx skbs during the
> > > >>>> next packet sending. This would lead the pktgen to wait forever in
> > > >>>> the refcount of the skb if no other pakcet will be sent afterwards.
> > > >>>> 
> > > >>>> Solving this issue by introducing a new flag IFF_TX_SKB_FREE_DELAY
> > > >>>> which could notify the pktgen that the device does not free skb
> > > >>>> immediately after it has been sent and let it not to wait for the
> > > >>>> refcount to be one.
> > > >>>> 
> > > >>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > >>> 
> > > >>> Another alternative would be using skb_orphan() and skb->destructor.
> > > >>> There are other cases where skb's are not freed right away.
> > > >>> --
> > > >>> To unsubscribe from this list: send the line "unsubscribe netdev" in
> > > >>> the body of a message to majordomo@vger.kernel.org
> > > >>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > >> 
> > > >> Hi Stephen:
> > > >> 
> > > >> Do you mean registering a skb->destructor for pktgen then set and check
> > > >> bits in skb->tx_flag?
> > > > 
> > > > Yes. Register a destructor that does something like update a counter
> > > > (number of packets pending), then just spin while number of packets
> > > > pending is over threshold.
> > > > --
> > > 
> > > Not sure this is the best method, since pktgen was used to test the tx
> > > process of the device driver and NIC. If we use skb_orhpan(), we would
> > > miss the test of tx completion part.
> > 
> > There are other places that delay freeing and your solution would mean
> > finding and fixing all those. Code that does that already has to use
> > skb_orphan() to work, and I was looking for a way that could use that.
> > Introducing another flag value seems like a long term burden.
> > 
> 
> Get the point, will draft another version.
>
> > Alternatively, virtio could do cleanup more aggressively. Maybe in response
> > to ring getting half full, or add a cleanup timer or something to avoid the
> > problem.

Timer would prevent complete deadlock but it is very expensive
in the virt scenario.
pulling at ring half full would only help if ring gets half full :)
which it does not have to.

> 
> May worth to try. Another method is that virtio has a feature to notify guest 
> when tx ring is empty, we could free the old tx skbs there.
> But it may brings 
> extra overhead. If we could let virtio_net free the old tx skb timely, it 
> would be easier to bring BQL support to virtio_net also.
> 
> Thanks

We used to use notify on empty - it's still very slow.

> > 
> > 
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe netdev" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH] mISDN: improve bitops usage
From: Akinobu Mita @ 2012-11-29 11:27 UTC (permalink / raw)
  To: netdev; +Cc: Akinobu Mita, Karsten Keil

This improves bitops usages in several points:

- Convert u64 to a proper bitmap declaration.  This enables to remove
  superfluous typecasting from 'u64' to 'unsigned long *'.

- Convert superfluous atomic bitops to non atomic bitops.  The bitmap
  is allocated on the stack and it is not accessed by any other threads,
  so using atomic bitops is not necessary.

- Use find_next_zero_bit and find_next_zero_bit instead of calling
  test_bit() for each bit.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Karsten Keil <isdn@linux-pingi.de>
Cc: netdev@vger.kernel.org
---
 drivers/isdn/mISDN/tei.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/isdn/mISDN/tei.c b/drivers/isdn/mISDN/tei.c
index be88728..592f597 100644
--- a/drivers/isdn/mISDN/tei.c
+++ b/drivers/isdn/mISDN/tei.c
@@ -250,7 +250,7 @@ tei_debug(struct FsmInst *fi, char *fmt, ...)
 static int
 get_free_id(struct manager *mgr)
 {
-	u64		ids = 0;
+	DECLARE_BITMAP(ids, 64) = { [0 ... BITS_TO_LONGS(64) - 1] = 0 };
 	int		i;
 	struct layer2	*l2;
 
@@ -261,11 +261,11 @@ get_free_id(struct manager *mgr)
 			       __func__);
 			return -EBUSY;
 		}
-		test_and_set_bit(l2->ch.nr, (u_long *)&ids);
+		__set_bit(l2->ch.nr, ids);
 	}
-	for (i = 1; i < 64; i++)
-		if (!test_bit(i, (u_long *)&ids))
-			return i;
+	i = find_next_zero_bit(ids, 64, 1);
+	if (i < 64)
+		return i;
 	printk(KERN_WARNING "%s: more as 63 layer2 for one device\n",
 	       __func__);
 	return -EBUSY;
@@ -274,7 +274,7 @@ get_free_id(struct manager *mgr)
 static int
 get_free_tei(struct manager *mgr)
 {
-	u64		ids = 0;
+	DECLARE_BITMAP(ids, 64) = { [0 ... BITS_TO_LONGS(64) - 1] = 0 };
 	int		i;
 	struct layer2	*l2;
 
@@ -288,11 +288,11 @@ get_free_tei(struct manager *mgr)
 			continue;
 		i -= 64;
 
-		test_and_set_bit(i, (u_long *)&ids);
+		__set_bit(i, ids);
 	}
-	for (i = 0; i < 64; i++)
-		if (!test_bit(i, (u_long *)&ids))
-			return i + 64;
+	i = find_first_zero_bit(ids, 64);
+	if (i < 64)
+		return i + 64;
 	printk(KERN_WARNING "%s: more as 63 dynamic tei for one device\n",
 	       __func__);
 	return -1;
-- 
1.7.11.7

^ permalink raw reply related

* Re: [PATCH v2 3/3] pppoatm: protect against freeing of vcc
From: Krzysztof Mazur @ 2012-11-29 10:57 UTC (permalink / raw)
  To: David Woodhouse
  Cc: David Laight, chas williams - CONTRACTOR, davem, netdev,
	linux-kernel, nathan
In-Reply-To: <1354141115.21562.101.camel@shinybook.infradead.org>

On Wed, Nov 28, 2012 at 10:18:35PM +0000, David Woodhouse wrote:
> On Wed, 2012-11-28 at 09:21 +0000, David Laight wrote:
> > Even when it might make sense to sleep in close until tx drains
> > there needs to be a finite timeout before it become abortive.
> 
> You are, of course, right. We should never wait for hardware for ever.
> And just to serve me right, I seem to have hit a bug in the latest Solos
> firmware (1.11) which makes it sometimes lock up when I reboot. So it
> never responds to the PKT_PCLOSE packet... and thus it deadlocks when I
> try to kill pppd and unload the module to reset it :)
> 
> New version...
> 
> From 53dd01c08fec5b26006a009b25e4210127fdb27a Mon Sep 17 00:00:00 2001
> From: David Woodhouse <David.Woodhouse@intel.com>
> Date: Tue, 27 Nov 2012 23:49:24 +0000
> Subject: [PATCH] solos-pci: Wait for pending TX to complete when releasing
>  vcc
> 
> We should no longer be calling the old pop routine for the vcc, after
> vcc_release() has completed. Make sure we wait for any pending TX skbs
> to complete, by waiting for our own PKT_PCLOSE control skb to be sent.
> 
> Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
> ---
>  drivers/atm/solos-pci.c | 17 +++++++++++++++--
>  1 file changed, 15 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/atm/solos-pci.c b/drivers/atm/solos-pci.c
> index 9851093..3720670 100644
> --- a/drivers/atm/solos-pci.c
> +++ b/drivers/atm/solos-pci.c
> @@ -92,6 +92,7 @@ struct pkt_hdr {
>  };
>  
>  struct solos_skb_cb {
> +	struct completion c;
>  	struct atm_vcc *vcc;
>  	uint32_t dma_addr;
>  };
> @@ -881,11 +882,18 @@ static void pclose(struct atm_vcc *vcc)
>  	header->vci = cpu_to_le16(vcc->vci);
>  	header->type = cpu_to_le16(PKT_PCLOSE);
>  
> +	init_completion(&SKB_CB(skb)->c);
> +
>  	fpga_queue(card, SOLOS_CHAN(vcc->dev), skb, NULL);
>  
>  	clear_bit(ATM_VF_ADDR, &vcc->flags);
>  	clear_bit(ATM_VF_READY, &vcc->flags);
>  
> +	if (!wait_for_completion_timeout(&SKB_CB(skb)->c,
> +					 jiffies + msecs_to_jiffies(5000)))
> +		dev_warn(&card->dev->dev, "Timeout waiting for VCC close on port %d\n",
> +			 SOLOS_CHAN(vcc->dev));
> +

do we really need to wait here?

Why don't just do something like that:

	tasklet_disable(&card->tlet);
	spin_lock(&card->tx_queue_lock);
	for each skb in queue
		SKB_CB(skb)->vcc = NULL;
	spin_unlock(&card->tx_queue_lock);
	tasklet_enable(&card->tlet);

or if we really want to call vcc->pop() for such skbs:

	tasklet_disable(&card->tlet);
	spin_lock(&card->tx_queue_lock);
	for each skb in queue {
		skb_get(skb);
		solos_pop(SKB_CB(skb)->vcc, skb);
		SKB_CB(skb)->vcc = NULL;
	}
	spin_unlock(&card->tx_queue_lock);
	tasklet_enable(&card->tlet);

Krzysiek

^ permalink raw reply

* Re: [net-next RFC] pktgen: don't wait for the device who doesn't free skb immediately after sent
From: Jason Wang @ 2012-11-29 10:13 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: mst, netdev, linux-kernel, virtualization, davem
In-Reply-To: <20121128085305.2b0503ce@nehalam.linuxnetplumber.net>

On Wednesday, November 28, 2012 08:53:05 AM Stephen Hemminger wrote:
> On Wed, 28 Nov 2012 14:48:52 +0800
> 
> Jason Wang <jasowang@redhat.com> wrote:
> > On 11/28/2012 12:49 AM, Stephen Hemminger wrote:
> > > On Tue, 27 Nov 2012 14:45:13 +0800
> > > 
> > > Jason Wang <jasowang@redhat.com> wrote:
> > >> On 11/27/2012 01:37 AM, Stephen Hemminger wrote:
> > >>> On Mon, 26 Nov 2012 15:56:52 +0800
> > >>> 
> > >>> Jason Wang <jasowang@redhat.com> wrote:
> > >>>> Some deivces do not free the old tx skbs immediately after it has
> > >>>> been sent
> > >>>> (usually in tx interrupt). One such example is virtio-net which
> > >>>> optimizes for virt and only free the possible old tx skbs during the
> > >>>> next packet sending. This would lead the pktgen to wait forever in
> > >>>> the refcount of the skb if no other pakcet will be sent afterwards.
> > >>>> 
> > >>>> Solving this issue by introducing a new flag IFF_TX_SKB_FREE_DELAY
> > >>>> which could notify the pktgen that the device does not free skb
> > >>>> immediately after it has been sent and let it not to wait for the
> > >>>> refcount to be one.
> > >>>> 
> > >>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
> > >>> 
> > >>> Another alternative would be using skb_orphan() and skb->destructor.
> > >>> There are other cases where skb's are not freed right away.
> > >>> --
> > >>> To unsubscribe from this list: send the line "unsubscribe netdev" in
> > >>> the body of a message to majordomo@vger.kernel.org
> > >>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > >> 
> > >> Hi Stephen:
> > >> 
> > >> Do you mean registering a skb->destructor for pktgen then set and check
> > >> bits in skb->tx_flag?
> > > 
> > > Yes. Register a destructor that does something like update a counter
> > > (number of packets pending), then just spin while number of packets
> > > pending is over threshold.
> > > --
> > 
> > Not sure this is the best method, since pktgen was used to test the tx
> > process of the device driver and NIC. If we use skb_orhpan(), we would
> > miss the test of tx completion part.
> 
> There are other places that delay freeing and your solution would mean
> finding and fixing all those. Code that does that already has to use
> skb_orphan() to work, and I was looking for a way that could use that.
> Introducing another flag value seems like a long term burden.
> 

Get the point, will draft another version.
> Alternatively, virtio could do cleanup more aggressively. Maybe in response
> to ring getting half full, or add a cleanup timer or something to avoid the
> problem.

May worth to try. Another method is that virtio has a feature to notify guest 
when tx ring is empty, we could free the old tx skbs there. But it may brings 
extra overhead. If we could let virtio_net free the old tx skb timely, it 
would be easier to bring BQL support to virtio_net also.

Thanks
> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH] ipv6: unify logic evaluating inet6_dev's accept_ra property
From: Shmulik Ladkani @ 2012-11-29  9:26 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Hideaki YOSHIFUJI, Thomas Graf, Tore Anderson,
	shmulik.ladkani

As of 026359b [ipv6: Send ICMPv6 RSes only when RAs are accepted], the
logic determining whether to send Router Solicitations is identical
to the logic determining whether kernel accepts Router Advertisements.

However the condition itself is repeated in several code locations.

Unify it by introducing 'ipv6_accept_ra()' accessor.

Also, simplify the condition expression, making it more readable.
No semantic change.

Signed-off-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>
---

Not sure about the proper header to place the 'ipv6_accept_ra' accessor.
Currently placed in include/net/ipv6.h.
Other candidates were if_inet6.h and addrconf.h.

 include/net/ipv6.h  |   10 ++++++++++
 net/ipv6/addrconf.c |    3 +--
 net/ipv6/ndisc.c    |   16 ++--------------
 3 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 979bf6c..cd15585 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -271,6 +271,16 @@ struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
 
 extern bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb);
 
+static inline bool ipv6_accept_ra(struct inet6_dev *idev)
+{
+	/*
+	 * If forwarding is enabled, RA are not accepted unless the special
+	 * hybrid mode (accept_ra=2) is enabled.
+	 */
+	return idev->cnf.forwarding ? idev->cnf.accept_ra == 2 :
+	    idev->cnf.accept_ra;
+}
+
 #if IS_ENABLED(CONFIG_IPV6)
 static inline int ip6_frag_nqueues(struct net *net)
 {
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 0424e4e..ca1ed8a 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3005,8 +3005,7 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
 	   router advertisements, start sending router solicitations.
 	 */
 
-	if (((ifp->idev->cnf.accept_ra == 1 && !ifp->idev->cnf.forwarding) ||
-	     ifp->idev->cnf.accept_ra == 2) &&
+	if (ipv6_accept_ra(ifp->idev) &&
 	    ifp->idev->cnf.rtr_solicits > 0 &&
 	    (dev->flags&IFF_LOOPBACK) == 0 &&
 	    (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) {
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 2edce30..980cdc3 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1033,18 +1033,6 @@ errout:
 	rtnl_set_sk_err(net, RTNLGRP_ND_USEROPT, err);
 }
 
-static inline int accept_ra(struct inet6_dev *in6_dev)
-{
-	/*
-	 * If forwarding is enabled, RA are not accepted unless the special
-	 * hybrid mode (accept_ra=2) is enabled.
-	 */
-	if (in6_dev->cnf.forwarding && in6_dev->cnf.accept_ra < 2)
-		return 0;
-
-	return in6_dev->cnf.accept_ra;
-}
-
 static void ndisc_router_discovery(struct sk_buff *skb)
 {
 	struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb);
@@ -1092,7 +1080,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 		return;
 	}
 
-	if (!accept_ra(in6_dev))
+	if (!ipv6_accept_ra(in6_dev))
 		goto skip_linkparms;
 
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
@@ -1248,7 +1236,7 @@ skip_linkparms:
 			     NEIGH_UPDATE_F_ISROUTER);
 	}
 
-	if (!accept_ra(in6_dev))
+	if (!ipv6_accept_ra(in6_dev))
 		goto out;
 
 #ifdef CONFIG_IPV6_ROUTE_INFO
-- 
1.7.9

^ permalink raw reply related

* Re: [PATCH 1/1] Introduce notification events for routing changes
From: Jozsef Kadlecsik @ 2012-11-29  9:05 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, netfilter-devel
In-Reply-To: <20121128.175929.618012822860698468.davem@davemloft.net>

On Wed, 28 Nov 2012, David Miller wrote:

> > The netfilter MASQUERADE target does not handle the case when the routing
> > changes and the source address of existing connections become invalid.
> > The problem can be solved if routing modifications create events to which
> > the MASQUERADE target can subscribe and then delete the affected
> > connections.
> > 
> > The patch adds the required event support for IPv4/IPv6.
> > 
> > Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
> 
> What part of the information are you actually interested in?
> 
> Because just saying that a route is added or removed using fib_info X
> doesn't tell you a whole lot.
[...]

Please discard my patch. I found a simpler and more efficient way to solve 
the issue in netfilter itself.

Best regards,
Jozsef
-
E-mail  : kadlec@blackhole.kfki.hu, kadlecsik.jozsef@wigner.mta.hu
PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt
Address : Wigner Research Centre for Physics, Hungarian Academy of Sciences
          H-1525 Budapest 114, POB. 49, Hungary

^ permalink raw reply

* RDS: sendto() with very large buffer triggering WARNING: at mm/page_alloc.c:2403
From: Tommi Rantala @ 2012-11-29  8:39 UTC (permalink / raw)
  To: netdev, Venkat Venkatsubra, rds-devel; +Cc: Dave Jones

Hello,

Is RDS supposed to cap the sendto() buffer size? Saw the WARNING while
fuzzing with Trinity.

#include <string.h>
#include <arpa/inet.h>
#include <sys/socket.h>

static const char buf[1234000567];

int main(void)
{
        int fd;
        struct sockaddr_in sa;

        fd = socket(21 /* AF_RDS */, SOCK_SEQPACKET, 0);
        if (fd < 0)
                return 1;

        memset(&sa, 0, sizeof(sa));
        sa.sin_family = AF_INET;
        sa.sin_addr.s_addr = inet_addr("127.0.0.1");
        sa.sin_port = htons(11111);

        bind(fd, (struct sockaddr *)&sa, sizeof(sa));

        sendto(fd, buf, sizeof(buf), 0, (struct sockaddr *)&sa, sizeof(sa));

        return 0;
}

$ strace -e sendto ./rds-sendto
sendto(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
1234000567, 0, {sa_family=AF_INET, sin_port=htons(11111),
sin_addr=inet_addr("127.0.0.1")}, 16) = -1 ENOMEM (Cannot allocate
memory)

[ 7421.592595] ------------[ cut here ]------------
[ 7421.592621] WARNING: at mm/page_alloc.c:2403
__alloc_pages_nodemask+0x2c0/0x9f0()
[ 7421.592628] Hardware name: EB1012
[ 7421.592633] Modules linked in:
[ 7421.592645] Pid: 3082, comm: rds-sendto Not tainted 3.7.0-rc7+ #58
[ 7421.592650] Call Trace:
[ 7421.592667]  [<ffffffff810a197b>] warn_slowpath_common+0x7b/0xc0
[ 7421.592678]  [<ffffffff810a19d5>] warn_slowpath_null+0x15/0x20
[ 7421.592689]  [<ffffffff81171900>] __alloc_pages_nodemask+0x2c0/0x9f0
[ 7421.592700]  [<ffffffff81107c90>] ? __lock_acquire+0x3a0/0x9f0
[ 7421.592711]  [<ffffffff81107c90>] ? __lock_acquire+0x3a0/0x9f0
[ 7421.592724]  [<ffffffff811ac6bf>] alloc_pages_current+0x7f/0xf0
[ 7421.592735]  [<ffffffff8116cc19>] __get_free_pages+0x9/0x40
[ 7421.592746]  [<ffffffff811b3d8a>] kmalloc_order_trace+0x3a/0x190
[ 7421.592755]  [<ffffffff81107c90>] ? __lock_acquire+0x3a0/0x9f0
[ 7421.592765]  [<ffffffff811b4f59>] __kmalloc+0x229/0x240
[ 7421.592778]  [<ffffffff81d1b06e>] rds_message_alloc+0x1e/0xa0
[ 7421.592789]  [<ffffffff81d1db66>] rds_sendmsg+0x196/0x720
[ 7421.592802]  [<ffffffff81a72a80>] ? sock_update_classid+0xf0/0x2b0
[ 7421.592813]  [<ffffffff81a6abec>] sock_sendmsg+0xdc/0xf0
[ 7421.592828]  [<ffffffff8118e9e5>] ? might_fault+0x85/0x90
[ 7421.592838]  [<ffffffff8118e99c>] ? might_fault+0x3c/0x90
[ 7421.592848]  [<ffffffff81a6e0fa>] sys_sendto+0xfa/0x130
[ 7421.592859]  [<ffffffff8110953d>] ? trace_hardirqs_on_caller+0x10d/0x1a0
[ 7421.592868]  [<ffffffff811095dd>] ? trace_hardirqs_on+0xd/0x10
[ 7421.592881]  [<ffffffff81e94c9d>] ? _raw_spin_unlock_irq+0x3d/0x70
[ 7421.592892]  [<ffffffff810bab44>] ? ptrace_notify+0x74/0x90
[ 7421.592904]  [<ffffffff81e96250>] tracesys+0xdd/0xe2
[ 7421.592911] ---[ end trace d9d681d0d60abf69 ]---

Tommi

^ permalink raw reply

* [PATCH: net-next] core: make GRO methods static.
From: Rami Rosen @ 2012-11-29  7:55 UTC (permalink / raw)
  To: davem; +Cc: netdev, Rami Rosen

This patch changes three methods to be static and removes their
EXPORT_SYMBOLs in core/dev.c and their external declaration in
netdevice.h. The methods, dev_gro_receive(), napi_frags_finish() and
napi_skb_finish(), which are in the GRO rx path, are not used
outside core/dev.c.

Signed-off-by: Rami Rosen <ramirose@gmail.com>
---
 include/linux/netdevice.h | 6 ------
 net/core/dev.c            | 9 +++------
 2 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e9929ab..18c5dc9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2153,16 +2153,10 @@ extern void dev_kfree_skb_any(struct sk_buff *skb);
 extern int		netif_rx(struct sk_buff *skb);
 extern int		netif_rx_ni(struct sk_buff *skb);
 extern int		netif_receive_skb(struct sk_buff *skb);
-extern gro_result_t	dev_gro_receive(struct napi_struct *napi,
-					struct sk_buff *skb);
-extern gro_result_t	napi_skb_finish(gro_result_t ret, struct sk_buff *skb);
 extern gro_result_t	napi_gro_receive(struct napi_struct *napi,
 					 struct sk_buff *skb);
 extern void		napi_gro_flush(struct napi_struct *napi, bool flush_old);
 extern struct sk_buff *	napi_get_frags(struct napi_struct *napi);
-extern gro_result_t	napi_frags_finish(struct napi_struct *napi,
-					  struct sk_buff *skb,
-					  gro_result_t ret);
 extern gro_result_t	napi_gro_frags(struct napi_struct *napi);
 
 static inline void napi_free_frags(struct napi_struct *napi)
diff --git a/net/core/dev.c b/net/core/dev.c
index 2a5f5586..2f94df2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3592,7 +3592,7 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old)
 }
 EXPORT_SYMBOL(napi_gro_flush);
 
-enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
 	struct sk_buff **pp = NULL;
 	struct packet_offload *ptype;
@@ -3683,7 +3683,6 @@ normal:
 	ret = GRO_NORMAL;
 	goto pull;
 }
-EXPORT_SYMBOL(dev_gro_receive);
 
 static inline gro_result_t
 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
@@ -3710,7 +3709,7 @@ __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 	return dev_gro_receive(napi, skb);
 }
 
-gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
+static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 {
 	switch (ret) {
 	case GRO_NORMAL:
@@ -3736,7 +3735,6 @@ gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 
 	return ret;
 }
-EXPORT_SYMBOL(napi_skb_finish);
 
 static void skb_gro_reset_offset(struct sk_buff *skb)
 {
@@ -3788,7 +3786,7 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)
 }
 EXPORT_SYMBOL(napi_get_frags);
 
-gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
+static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
 			       gro_result_t ret)
 {
 	switch (ret) {
@@ -3813,7 +3811,6 @@ gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
 
 	return ret;
 }
-EXPORT_SYMBOL(napi_frags_finish);
 
 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
 {
-- 
1.7.11.7

^ permalink raw reply related

* [PATCH 1/2] of_mdio: Honour "status=disabled" property of device
From: Alexander Sverdlin @ 2012-11-29  7:45 UTC (permalink / raw)
  To: Stephen Warren, devicetree-discuss-uLR06cmDAlY/bJ5BZ2RsiQ,
	Rob Herring, Grant Likely
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, Barry.Song-kQvG35nSl+M,
	w.sang-bIcnvbaLZ9MEGnE8C9+IrQ, alexander sverdlin

From: Alexander Sverdlin <alexander.sverdlin-uSbOeAmDUekAvxtiuMwx3w@public.gmane.org>

of_mdio: Honour "status=disabled" property of device

Currently of_mdiobus_register() function registers all PHY devices,
independetly from their status property in device tree. According to
"ePAPR 1.1" spec, device should only be registered if there is no
"status" property, or it has "ok" (or "okay") value (see
of_device_is_available()). In case of "platform devices",
of_platform_device_create_pdata() checks for "status" and ensures
that disabled devices are not pupulated. But such check for MDIO buses
was missing until now. Fix it.

Signed-off-by: Alexander Sverdlin <alexander.sverdlin-uSbOeAmDUekAvxtiuMwx3w@public.gmane.org>
---
--- linux.orig/drivers/of/of_mdio.c
+++ linux/drivers/of/of_mdio.c
@@ -53,7 +53,7 @@ int of_mdiobus_register(struct mii_bus *
 		return rc;

 	/* Loop over the child nodes and register a phy_device for each one */
-	for_each_child_of_node(np, child) {
+	for_each_available_child_of_node(np, child) {
 		const __be32 *paddr;
 		u32 addr;
 		int len;

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox