Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v3 net-next 06/12] qed: Add LL2 slowpath handling
From: Michal Kalderon @ 2017-10-09  9:37 UTC (permalink / raw)
  To: davem; +Cc: netdev, linux-rdma, dledford, Michal Kalderon, Ariel Elior
In-Reply-To: <1507541874-18344-1-git-send-email-Michal.Kalderon@cavium.com>

For iWARP unaligned MPA flow, a slowpath event of flushing an
MPA connection that entered an unaligned state is required.
The flush ramrod is received on the ll2 queue, and a pre-registered
callback function is called to handle the flush event.

Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Signed-off-by: Ariel Elior <Ariel.Elior@cavium.com>
---
 drivers/net/ethernet/qlogic/qed/qed_ll2.c | 40 +++++++++++++++++++++++++++++--
 include/linux/qed/qed_ll2_if.h            |  5 ++++
 2 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index 8eb9645..047f556 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -423,6 +423,41 @@ static void qed_ll2_rxq_parse_reg(struct qed_hwfn *p_hwfn,
 }
 
 static int
+qed_ll2_handle_slowpath(struct qed_hwfn *p_hwfn,
+			struct qed_ll2_info *p_ll2_conn,
+			union core_rx_cqe_union *p_cqe,
+			unsigned long *p_lock_flags)
+{
+	struct qed_ll2_rx_queue *p_rx = &p_ll2_conn->rx_queue;
+	struct core_rx_slow_path_cqe *sp_cqe;
+
+	sp_cqe = &p_cqe->rx_cqe_sp;
+	if (sp_cqe->ramrod_cmd_id != CORE_RAMROD_RX_QUEUE_FLUSH) {
+		DP_NOTICE(p_hwfn,
+			  "LL2 - unexpected Rx CQE slowpath ramrod_cmd_id:%d\n",
+			  sp_cqe->ramrod_cmd_id);
+		return -EINVAL;
+	}
+
+	if (!p_ll2_conn->cbs.slowpath_cb) {
+		DP_NOTICE(p_hwfn,
+			  "LL2 - received RX_QUEUE_FLUSH but no callback was provided\n");
+		return -EINVAL;
+	}
+
+	spin_unlock_irqrestore(&p_rx->lock, *p_lock_flags);
+
+	p_ll2_conn->cbs.slowpath_cb(p_ll2_conn->cbs.cookie,
+				    p_ll2_conn->my_id,
+				    le32_to_cpu(sp_cqe->opaque_data.data[0]),
+				    le32_to_cpu(sp_cqe->opaque_data.data[1]));
+
+	spin_lock_irqsave(&p_rx->lock, *p_lock_flags);
+
+	return 0;
+}
+
+static int
 qed_ll2_rxq_handle_completion(struct qed_hwfn *p_hwfn,
 			      struct qed_ll2_info *p_ll2_conn,
 			      union core_rx_cqe_union *p_cqe,
@@ -495,8 +530,8 @@ static int qed_ll2_rxq_completion(struct qed_hwfn *p_hwfn, void *cookie)
 
 		switch (cqe->rx_cqe_sp.type) {
 		case CORE_RX_CQE_TYPE_SLOW_PATH:
-			DP_NOTICE(p_hwfn, "LL2 - unexpected Rx CQE slowpath\n");
-			rc = -EINVAL;
+			rc = qed_ll2_handle_slowpath(p_hwfn, p_ll2_conn,
+						     cqe, &flags);
 			break;
 		case CORE_RX_CQE_TYPE_GSI_OFFLOAD:
 		case CORE_RX_CQE_TYPE_REGULAR:
@@ -1214,6 +1249,7 @@ static int qed_ll2_acquire_connection_tx(struct qed_hwfn *p_hwfn,
 	p_ll2_info->cbs.rx_release_cb = cbs->rx_release_cb;
 	p_ll2_info->cbs.tx_comp_cb = cbs->tx_comp_cb;
 	p_ll2_info->cbs.tx_release_cb = cbs->tx_release_cb;
+	p_ll2_info->cbs.slowpath_cb = cbs->slowpath_cb;
 	p_ll2_info->cbs.cookie = cbs->cookie;
 
 	return 0;
diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h
index 95fdf02..e755954 100644
--- a/include/linux/qed/qed_ll2_if.h
+++ b/include/linux/qed/qed_ll2_if.h
@@ -151,11 +151,16 @@ struct qed_ll2_comp_rx_data {
 				     dma_addr_t first_frag_addr,
 				     bool b_last_fragment, bool b_last_packet);
 
+typedef
+void (*qed_ll2_slowpath_cb)(void *cxt, u8 connection_handle,
+			    u32 opaque_data_0, u32 opaque_data_1);
+
 struct qed_ll2_cbs {
 	qed_ll2_complete_rx_packet_cb rx_comp_cb;
 	qed_ll2_release_rx_packet_cb rx_release_cb;
 	qed_ll2_complete_tx_packet_cb tx_comp_cb;
 	qed_ll2_release_tx_packet_cb tx_release_cb;
+	qed_ll2_slowpath_cb slowpath_cb;
 	void *cookie;
 };
 
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v3 net-next 08/12] qed: Add mpa buffer descriptors for storing and processing mpa fpdus
From: Michal Kalderon @ 2017-10-09  9:37 UTC (permalink / raw)
  To: davem; +Cc: netdev, linux-rdma, dledford, Michal Kalderon, Ariel Elior
In-Reply-To: <1507541874-18344-1-git-send-email-Michal.Kalderon@cavium.com>

The mpa buff is a descriptor for iwarp ll2 buffers that contains
additional information required for aligining fpdu's.
In some cases, an additional packet will arrive which will complete
the alignment of a fpdu, but we won't be able to post the fpdu due to
insufficient place on the tx ring. In this case we can't loose the data
and require storing it for later. Processing is therefore done
in two places, during rx completion, where we initialize a mpa buffer
descriptor and add it to the pending list, and during tx-completion, since
we free up an entry in the tx chain we can process any pending mpa packets.
The mpa buff descriptors are pre-allocated since we have to ensure that
we won't reach a state where we can't store an incoming unaligned packet.
All packets received on the ll2 MUST be processed by the driver at some
stage. Since they are preallocated, we hold a free list.

Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Signed-off-by: Ariel Elior <Ariel.Elior@cavium.com>
---
 drivers/net/ethernet/qlogic/qed/qed_iwarp.c | 116 ++++++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_iwarp.h |  11 +++
 2 files changed, 127 insertions(+)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
index f413621..efd4861 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
@@ -1415,7 +1415,10 @@ int qed_iwarp_alloc(struct qed_hwfn *p_hwfn)
 
 void qed_iwarp_resc_free(struct qed_hwfn *p_hwfn)
 {
+	struct qed_iwarp_info *iwarp_info = &p_hwfn->p_rdma_info->iwarp;
+
 	qed_rdma_bmap_free(p_hwfn, &p_hwfn->p_rdma_info->tcp_cid_map, 1);
+	kfree(iwarp_info->mpa_bufs);
 }
 
 int qed_iwarp_accept(void *rdma_cxt, struct qed_iwarp_accept_in *iparams)
@@ -1716,12 +1719,103 @@ int qed_iwarp_reject(void *rdma_cxt, struct qed_iwarp_reject_in *iparams)
 /* fpdu can be fragmented over maximum 3 bds: header, partial mpa, unaligned */
 #define QED_IWARP_MAX_BDS_PER_FPDU 3
 static void
+qed_iwarp_mpa_get_data(struct qed_hwfn *p_hwfn,
+		       struct unaligned_opaque_data *curr_pkt,
+		       u32 opaque_data0, u32 opaque_data1)
+{
+	u64 opaque_data;
+
+	opaque_data = HILO_64(opaque_data1, opaque_data0);
+	*curr_pkt = *((struct unaligned_opaque_data *)&opaque_data);
+
+	curr_pkt->first_mpa_offset = curr_pkt->tcp_payload_offset +
+				     le16_to_cpu(curr_pkt->first_mpa_offset);
+	curr_pkt->cid = le32_to_cpu(curr_pkt->cid);
+}
+
+/* This function is called when an unaligned or incomplete MPA packet arrives
+ * driver needs to align the packet, perhaps using previous data and send
+ * it down to FW once it is aligned.
+ */
+static int
+qed_iwarp_process_mpa_pkt(struct qed_hwfn *p_hwfn,
+			  struct qed_iwarp_ll2_mpa_buf *mpa_buf)
+{
+	struct qed_iwarp_ll2_buff *buf = mpa_buf->ll2_buf;
+	int rc = -EINVAL;
+
+	qed_iwarp_ll2_post_rx(p_hwfn,
+			      buf,
+			      p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle);
+	return rc;
+}
+
+static void qed_iwarp_process_pending_pkts(struct qed_hwfn *p_hwfn)
+{
+	struct qed_iwarp_info *iwarp_info = &p_hwfn->p_rdma_info->iwarp;
+	struct qed_iwarp_ll2_mpa_buf *mpa_buf = NULL;
+	int rc;
+
+	while (!list_empty(&iwarp_info->mpa_buf_pending_list)) {
+		mpa_buf = list_first_entry(&iwarp_info->mpa_buf_pending_list,
+					   struct qed_iwarp_ll2_mpa_buf,
+					   list_entry);
+
+		rc = qed_iwarp_process_mpa_pkt(p_hwfn, mpa_buf);
+
+		/* busy means break and continue processing later, don't
+		 * remove the buf from the pending list.
+		 */
+		if (rc == -EBUSY)
+			break;
+
+		list_del(&mpa_buf->list_entry);
+		list_add_tail(&mpa_buf->list_entry, &iwarp_info->mpa_buf_list);
+
+		if (rc) {	/* different error, don't continue */
+			DP_NOTICE(p_hwfn, "process pkts failed rc=%d\n", rc);
+			break;
+		}
+	}
+}
+
+static void
 qed_iwarp_ll2_comp_mpa_pkt(void *cxt, struct qed_ll2_comp_rx_data *data)
 {
+	struct qed_iwarp_ll2_mpa_buf *mpa_buf;
 	struct qed_iwarp_info *iwarp_info;
 	struct qed_hwfn *p_hwfn = cxt;
 
 	iwarp_info = &p_hwfn->p_rdma_info->iwarp;
+	mpa_buf = list_first_entry(&iwarp_info->mpa_buf_list,
+				   struct qed_iwarp_ll2_mpa_buf, list_entry);
+	if (!mpa_buf) {
+		DP_ERR(p_hwfn, "No free mpa buf\n");
+		goto err;
+	}
+
+	list_del(&mpa_buf->list_entry);
+	qed_iwarp_mpa_get_data(p_hwfn, &mpa_buf->data,
+			       data->opaque_data_0, data->opaque_data_1);
+
+	DP_VERBOSE(p_hwfn,
+		   QED_MSG_RDMA,
+		   "LL2 MPA CompRx payload_len:0x%x\tfirst_mpa_offset:0x%x\ttcp_payload_offset:0x%x\tflags:0x%x\tcid:0x%x\n",
+		   data->length.packet_length, mpa_buf->data.first_mpa_offset,
+		   mpa_buf->data.tcp_payload_offset, mpa_buf->data.flags,
+		   mpa_buf->data.cid);
+
+	mpa_buf->ll2_buf = data->cookie;
+	mpa_buf->tcp_payload_len = data->length.packet_length -
+				   mpa_buf->data.first_mpa_offset;
+	mpa_buf->data.first_mpa_offset += data->u.placement_offset;
+	mpa_buf->placement_offset = data->u.placement_offset;
+
+	list_add_tail(&mpa_buf->list_entry, &iwarp_info->mpa_buf_pending_list);
+
+	qed_iwarp_process_pending_pkts(p_hwfn);
+	return;
+err:
 	qed_iwarp_ll2_post_rx(p_hwfn, data->cookie,
 			      iwarp_info->ll2_mpa_handle);
 }
@@ -1872,6 +1966,11 @@ static void qed_iwarp_ll2_comp_tx_pkt(void *cxt, u8 connection_handle,
 
 	/* this was originally an rx packet, post it back */
 	qed_iwarp_ll2_post_rx(p_hwfn, buffer, connection_handle);
+
+	if (connection_handle == p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle)
+		qed_iwarp_process_pending_pkts(p_hwfn);
+
+	return;
 }
 
 static void qed_iwarp_ll2_rel_tx_pkt(void *cxt, u8 connection_handle,
@@ -1986,6 +2085,7 @@ static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 	u32 mpa_buff_size;
 	u16 n_ooo_bufs;
 	int rc = 0;
+	int i;
 
 	iwarp_info = &p_hwfn->p_rdma_info->iwarp;
 	iwarp_info->ll2_syn_handle = QED_IWARP_HANDLE_INVAL;
@@ -2094,6 +2194,22 @@ static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 					 iwarp_info->ll2_mpa_handle);
 	if (rc)
 		goto err;
+	/* The mpa_bufs array serves for pending RX packets received on the
+	 * mpa ll2 that don't have place on the tx ring and require later
+	 * processing. We can't fail on allocation of such a struct therefore
+	 * we allocate enough to take care of all rx packets
+	 */
+	iwarp_info->mpa_bufs = kcalloc(data.input.rx_num_desc,
+				       sizeof(*iwarp_info->mpa_bufs),
+				       GFP_KERNEL);
+	if (!iwarp_info->mpa_bufs)
+		goto err;
+
+	INIT_LIST_HEAD(&iwarp_info->mpa_buf_pending_list);
+	INIT_LIST_HEAD(&iwarp_info->mpa_buf_list);
+	for (i = 0; i < data.input.rx_num_desc; i++)
+		list_add_tail(&iwarp_info->mpa_bufs[i].list_entry,
+			      &iwarp_info->mpa_buf_list);
 	return rc;
 err:
 	qed_iwarp_ll2_stop(p_hwfn, p_ptt);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
index 9d33a1f..2c53fe4 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
@@ -60,10 +60,20 @@ struct qed_iwarp_ll2_buff {
 	u32 buff_size;
 };
 
+struct qed_iwarp_ll2_mpa_buf {
+	struct list_head list_entry;
+	struct qed_iwarp_ll2_buff *ll2_buf;
+	struct unaligned_opaque_data data;
+	u16 tcp_payload_len;
+	u8 placement_offset;
+};
+
 struct qed_iwarp_info {
 	struct list_head listen_list;	/* qed_iwarp_listener */
 	struct list_head ep_list;	/* qed_iwarp_ep */
 	struct list_head ep_free_list;	/* pre-allocated ep's */
+	struct list_head mpa_buf_list;	/* list of mpa_bufs */
+	struct list_head mpa_buf_pending_list;
 	spinlock_t iw_lock;	/* for iwarp resources */
 	spinlock_t qp_lock;	/* for teardown races */
 	u32 rcv_wnd_scale;
@@ -77,6 +87,7 @@ struct qed_iwarp_info {
 	u8 peer2peer;
 	enum mpa_negotiation_mode mpa_rev;
 	enum mpa_rtr_type rtr_type;
+	struct qed_iwarp_ll2_mpa_buf *mpa_bufs;
 };
 
 enum qed_iwarp_ep_state {
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v3 net-next 09/12] qed: Add unaligned and packed packet processing
From: Michal Kalderon @ 2017-10-09  9:37 UTC (permalink / raw)
  To: davem; +Cc: netdev, linux-rdma, dledford, Michal Kalderon, Ariel Elior
In-Reply-To: <1507541874-18344-1-git-send-email-Michal.Kalderon@cavium.com>

The fpdu data structure is preallocated per connection.
Each connection stores the current status of the connection:
either nothing pending, or there is a partial fpdu that is waiting for
the rest of the fpdu (incomplete bytes != 0).
The same structure is also used for splitting a packet when there are
packed fpdus. The structure is initialized with all data required
for sending the fpdu back to the FW. A fpdu will always be spanned across
a maximum of 3 tx bds. One for the header, one for the partial fdpu
received and one for the remainder (unaligned) packet.
In case of packed fpdu's, two fragments are used, one for the header
and one for the data.
Corner cases are not handled in the patch for clarity, and will be added
as a separate patch.

Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Signed-off-by: Ariel Elior <Ariel.Elior@cavium.com>
---
 drivers/net/ethernet/qlogic/qed/qed_iwarp.c | 257 ++++++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_iwarp.h |  13 ++
 2 files changed, 270 insertions(+)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
index efd4861..83b147f 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
@@ -1419,6 +1419,7 @@ void qed_iwarp_resc_free(struct qed_hwfn *p_hwfn)
 
 	qed_rdma_bmap_free(p_hwfn, &p_hwfn->p_rdma_info->tcp_cid_map, 1);
 	kfree(iwarp_info->mpa_bufs);
+	kfree(iwarp_info->partial_fpdus);
 }
 
 int qed_iwarp_accept(void *rdma_cxt, struct qed_iwarp_accept_in *iparams)
@@ -1716,8 +1717,170 @@ int qed_iwarp_reject(void *rdma_cxt, struct qed_iwarp_reject_in *iparams)
 	return 0;
 }
 
+static struct qed_iwarp_fpdu *qed_iwarp_get_curr_fpdu(struct qed_hwfn *p_hwfn,
+						      u16 cid)
+{
+	struct qed_iwarp_info *iwarp_info = &p_hwfn->p_rdma_info->iwarp;
+	struct qed_iwarp_fpdu *partial_fpdu;
+	u32 idx;
+
+	idx = cid - qed_cxt_get_proto_cid_start(p_hwfn, PROTOCOLID_IWARP);
+	if (idx >= iwarp_info->max_num_partial_fpdus) {
+		DP_ERR(p_hwfn, "Invalid cid %x max_num_partial_fpdus=%x\n", cid,
+		       iwarp_info->max_num_partial_fpdus);
+		return NULL;
+	}
+
+	partial_fpdu = &iwarp_info->partial_fpdus[idx];
+
+	return partial_fpdu;
+}
+
+enum qed_iwarp_mpa_pkt_type {
+	QED_IWARP_MPA_PKT_PACKED,
+	QED_IWARP_MPA_PKT_PARTIAL,
+	QED_IWARP_MPA_PKT_UNALIGNED
+};
+
+#define QED_IWARP_MPA_FPDU_LENGTH_SIZE (2)
+#define QED_IWARP_MPA_CRC32_DIGEST_SIZE (4)
+
+/* Pad to multiple of 4 */
+#define QED_IWARP_PDU_DATA_LEN_WITH_PAD(data_len) ALIGN(data_len, 4)
+#define QED_IWARP_FPDU_LEN_WITH_PAD(_mpa_len)				   \
+	(QED_IWARP_PDU_DATA_LEN_WITH_PAD((_mpa_len) +			   \
+					 QED_IWARP_MPA_FPDU_LENGTH_SIZE) + \
+					 QED_IWARP_MPA_CRC32_DIGEST_SIZE)
+
 /* fpdu can be fragmented over maximum 3 bds: header, partial mpa, unaligned */
 #define QED_IWARP_MAX_BDS_PER_FPDU 3
+
+char *pkt_type_str[] = {
+	"QED_IWARP_MPA_PKT_PACKED",
+	"QED_IWARP_MPA_PKT_PARTIAL",
+	"QED_IWARP_MPA_PKT_UNALIGNED"
+};
+
+static enum qed_iwarp_mpa_pkt_type
+qed_iwarp_mpa_classify(struct qed_hwfn *p_hwfn,
+		       struct qed_iwarp_fpdu *fpdu,
+		       u16 tcp_payload_len, u8 *mpa_data)
+{
+	enum qed_iwarp_mpa_pkt_type pkt_type;
+	u16 mpa_len;
+
+	if (fpdu->incomplete_bytes) {
+		pkt_type = QED_IWARP_MPA_PKT_UNALIGNED;
+		goto out;
+	}
+
+	mpa_len = ntohs(*((u16 *)(mpa_data)));
+	fpdu->fpdu_length = QED_IWARP_FPDU_LEN_WITH_PAD(mpa_len);
+
+	if (fpdu->fpdu_length <= tcp_payload_len)
+		pkt_type = QED_IWARP_MPA_PKT_PACKED;
+	else
+		pkt_type = QED_IWARP_MPA_PKT_PARTIAL;
+
+out:
+	DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+		   "MPA_ALIGN: %s: fpdu_length=0x%x tcp_payload_len:0x%x\n",
+		   pkt_type_str[pkt_type], fpdu->fpdu_length, tcp_payload_len);
+
+	return pkt_type;
+}
+
+static void
+qed_iwarp_init_fpdu(struct qed_iwarp_ll2_buff *buf,
+		    struct qed_iwarp_fpdu *fpdu,
+		    struct unaligned_opaque_data *pkt_data,
+		    u16 tcp_payload_size, u8 placement_offset)
+{
+	fpdu->mpa_buf = buf;
+	fpdu->pkt_hdr = buf->data_phys_addr + placement_offset;
+	fpdu->pkt_hdr_size = pkt_data->tcp_payload_offset;
+	fpdu->mpa_frag = buf->data_phys_addr + pkt_data->first_mpa_offset;
+	fpdu->mpa_frag_virt = (u8 *)(buf->data) + pkt_data->first_mpa_offset;
+
+	if (tcp_payload_size < fpdu->fpdu_length)
+		fpdu->incomplete_bytes = fpdu->fpdu_length - tcp_payload_size;
+	else
+		fpdu->incomplete_bytes = 0;	/* complete fpdu */
+
+	fpdu->mpa_frag_len = fpdu->fpdu_length - fpdu->incomplete_bytes;
+}
+
+static int
+qed_iwarp_send_fpdu(struct qed_hwfn *p_hwfn,
+		    struct qed_iwarp_fpdu *fpdu,
+		    struct unaligned_opaque_data *curr_pkt,
+		    struct qed_iwarp_ll2_buff *buf,
+		    u16 tcp_payload_size, enum qed_iwarp_mpa_pkt_type pkt_type)
+{
+	struct qed_ll2_tx_pkt_info tx_pkt;
+	u8 ll2_handle;
+	int rc;
+
+	memset(&tx_pkt, 0, sizeof(tx_pkt));
+
+	/* An unaligned packet means it's split over two tcp segments. So the
+	 * complete packet requires 3 bds, one for the header, one for the
+	 * part of the fpdu of the first tcp segment, and the last fragment
+	 * will point to the remainder of the fpdu. A packed pdu, requires only
+	 * two bds, one for the header and one for the data.
+	 */
+	tx_pkt.num_of_bds = (pkt_type == QED_IWARP_MPA_PKT_UNALIGNED) ? 3 : 2;
+	tx_pkt.tx_dest = QED_LL2_TX_DEST_LB;
+	tx_pkt.l4_hdr_offset_w = fpdu->pkt_hdr_size >> 2; /* offset in words */
+
+	/* Send the mpa_buf only with the last fpdu (in case of packed) */
+	if (pkt_type == QED_IWARP_MPA_PKT_UNALIGNED ||
+	    tcp_payload_size <= fpdu->fpdu_length)
+		tx_pkt.cookie = fpdu->mpa_buf;
+
+	tx_pkt.first_frag = fpdu->pkt_hdr;
+	tx_pkt.first_frag_len = fpdu->pkt_hdr_size;
+	tx_pkt.enable_ip_cksum = true;
+	tx_pkt.enable_l4_cksum = true;
+	tx_pkt.calc_ip_len = true;
+	/* vlan overload with enum iwarp_ll2_tx_queues */
+	tx_pkt.vlan = IWARP_LL2_ALIGNED_TX_QUEUE;
+
+	ll2_handle = p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle;
+
+	/* Set first fragment to header */
+	rc = qed_ll2_prepare_tx_packet(p_hwfn, ll2_handle, &tx_pkt, true);
+	if (rc)
+		goto out;
+
+	/* Set second fragment to first part of packet */
+	rc = qed_ll2_set_fragment_of_tx_packet(p_hwfn, ll2_handle,
+					       fpdu->mpa_frag,
+					       fpdu->mpa_frag_len);
+	if (rc)
+		goto out;
+
+	if (!fpdu->incomplete_bytes)
+		goto out;
+
+	/* Set third fragment to second part of the packet */
+	rc = qed_ll2_set_fragment_of_tx_packet(p_hwfn,
+					       ll2_handle,
+					       buf->data_phys_addr +
+					       curr_pkt->first_mpa_offset,
+					       fpdu->incomplete_bytes);
+out:
+	DP_VERBOSE(p_hwfn,
+		   QED_MSG_RDMA,
+		   "MPA_ALIGN: Sent FPDU num_bds=%d first_frag_len=%x, mpa_frag_len=0x%x, incomplete_bytes:0x%x rc=%d\n",
+		   tx_pkt.num_of_bds,
+		   tx_pkt.first_frag_len,
+		   fpdu->mpa_frag_len,
+		   fpdu->incomplete_bytes, rc);
+
+	return rc;
+}
+
 static void
 qed_iwarp_mpa_get_data(struct qed_hwfn *p_hwfn,
 		       struct unaligned_opaque_data *curr_pkt,
@@ -1741,9 +1904,79 @@ int qed_iwarp_reject(void *rdma_cxt, struct qed_iwarp_reject_in *iparams)
 qed_iwarp_process_mpa_pkt(struct qed_hwfn *p_hwfn,
 			  struct qed_iwarp_ll2_mpa_buf *mpa_buf)
 {
+	struct unaligned_opaque_data *curr_pkt = &mpa_buf->data;
 	struct qed_iwarp_ll2_buff *buf = mpa_buf->ll2_buf;
+	enum qed_iwarp_mpa_pkt_type pkt_type;
+	struct qed_iwarp_fpdu *fpdu;
 	int rc = -EINVAL;
+	u8 *mpa_data;
+
+	fpdu = qed_iwarp_get_curr_fpdu(p_hwfn, curr_pkt->cid & 0xffff);
+	if (!fpdu) { /* something corrupt with cid, post rx back */
+		DP_ERR(p_hwfn, "Invalid cid, drop and post back to rx cid=%x\n",
+		       curr_pkt->cid);
+		goto err;
+	}
 
+	do {
+		mpa_data = ((u8 *)(buf->data) + curr_pkt->first_mpa_offset);
+
+		pkt_type = qed_iwarp_mpa_classify(p_hwfn, fpdu,
+						  mpa_buf->tcp_payload_len,
+						  mpa_data);
+
+		switch (pkt_type) {
+		case QED_IWARP_MPA_PKT_PARTIAL:
+			qed_iwarp_init_fpdu(buf, fpdu,
+					    curr_pkt,
+					    mpa_buf->tcp_payload_len,
+					    mpa_buf->placement_offset);
+
+			mpa_buf->tcp_payload_len = 0;
+			break;
+		case QED_IWARP_MPA_PKT_PACKED:
+			qed_iwarp_init_fpdu(buf, fpdu,
+					    curr_pkt,
+					    mpa_buf->tcp_payload_len,
+					    mpa_buf->placement_offset);
+
+			rc = qed_iwarp_send_fpdu(p_hwfn, fpdu, curr_pkt, buf,
+						 mpa_buf->tcp_payload_len,
+						 pkt_type);
+			if (rc) {
+				DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+					   "Can't send FPDU:reset rc=%d\n", rc);
+				memset(fpdu, 0, sizeof(*fpdu));
+				break;
+			}
+
+			mpa_buf->tcp_payload_len -= fpdu->fpdu_length;
+			curr_pkt->first_mpa_offset += fpdu->fpdu_length;
+			break;
+		case QED_IWARP_MPA_PKT_UNALIGNED:
+			rc = qed_iwarp_send_fpdu(p_hwfn, fpdu, curr_pkt, buf,
+						 mpa_buf->tcp_payload_len,
+						 pkt_type);
+			if (rc) {
+				DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+					   "Can't send FPDU:delay rc=%d\n", rc);
+				/* don't reset fpdu -> we need it for next
+				 * classify
+				 */
+				break;
+			}
+
+			mpa_buf->tcp_payload_len -= fpdu->incomplete_bytes;
+			curr_pkt->first_mpa_offset += fpdu->incomplete_bytes;
+			/* The framed PDU was sent - no more incomplete bytes */
+			fpdu->incomplete_bytes = 0;
+			break;
+		}
+	} while (mpa_buf->tcp_payload_len && !rc);
+
+	return rc;
+
+err:
 	qed_iwarp_ll2_post_rx(p_hwfn,
 			      buf,
 			      p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle);
@@ -1989,11 +2222,27 @@ static void qed_iwarp_ll2_rel_tx_pkt(void *cxt, u8 connection_handle,
 	kfree(buffer);
 }
 
+/* The only slowpath for iwarp ll2 is unalign flush. When this completion
+ * is received, need to reset the FPDU.
+ */
 void
 qed_iwarp_ll2_slowpath(void *cxt,
 		       u8 connection_handle,
 		       u32 opaque_data_0, u32 opaque_data_1)
 {
+	struct unaligned_opaque_data unalign_data;
+	struct qed_hwfn *p_hwfn = cxt;
+	struct qed_iwarp_fpdu *fpdu;
+
+	qed_iwarp_mpa_get_data(p_hwfn, &unalign_data,
+			       opaque_data_0, opaque_data_1);
+
+	DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "(0x%x) Flush fpdu\n",
+		   unalign_data.cid);
+
+	fpdu = qed_iwarp_get_curr_fpdu(p_hwfn, (u16)unalign_data.cid);
+	if (fpdu)
+		memset(fpdu, 0, sizeof(*fpdu));
 }
 
 static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
@@ -2194,6 +2443,14 @@ static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 					 iwarp_info->ll2_mpa_handle);
 	if (rc)
 		goto err;
+
+	iwarp_info->partial_fpdus = kcalloc((u16)p_hwfn->p_rdma_info->num_qps,
+					    sizeof(*iwarp_info->partial_fpdus),
+					    GFP_KERNEL);
+	if (!iwarp_info->partial_fpdus)
+		goto err;
+
+	iwarp_info->max_num_partial_fpdus = (u16)p_hwfn->p_rdma_info->num_qps;
 	/* The mpa_bufs array serves for pending RX packets received on the
 	 * mpa ll2 that don't have place on the tx ring and require later
 	 * processing. We can't fail on allocation of such a struct therefore
diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
index 2c53fe4..858755c 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
@@ -68,6 +68,17 @@ struct qed_iwarp_ll2_mpa_buf {
 	u8 placement_offset;
 };
 
+struct qed_iwarp_fpdu {
+	struct qed_iwarp_ll2_buff *mpa_buf;
+	void *mpa_frag_virt;
+	dma_addr_t mpa_frag;
+	dma_addr_t pkt_hdr;
+	u16 mpa_frag_len;
+	u16 fpdu_length;
+	u16 incomplete_bytes;
+	u8 pkt_hdr_size;
+};
+
 struct qed_iwarp_info {
 	struct list_head listen_list;	/* qed_iwarp_listener */
 	struct list_head ep_list;	/* qed_iwarp_ep */
@@ -87,7 +98,9 @@ struct qed_iwarp_info {
 	u8 peer2peer;
 	enum mpa_negotiation_mode mpa_rev;
 	enum mpa_rtr_type rtr_type;
+	struct qed_iwarp_fpdu *partial_fpdus;
 	struct qed_iwarp_ll2_mpa_buf *mpa_bufs;
+	u16 max_num_partial_fpdus;
 };
 
 enum qed_iwarp_ep_state {
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v3 net-next 10/12] qed: Add support for freeing two ll2 buffers for corner cases
From: Michal Kalderon @ 2017-10-09  9:37 UTC (permalink / raw)
  To: davem; +Cc: netdev, linux-rdma, dledford, Michal Kalderon, Ariel Elior
In-Reply-To: <1507541874-18344-1-git-send-email-Michal.Kalderon@cavium.com>

When posting a packet on the ll2 tx, we can provide a cookie that
will be returned upon tx completion. This cookie is the ll2 iwarp buffer
which is then reposted to the rx ring. Part of the unaligned mpa flow
is determining when a buffer can be reposted. Each buffer needs to be
sent only once as a cookie for on the tx ring. In packed fpdu case, only
the last packet will be sent with the buffer, meaning we need to handle the
case that a cookie can be NULL on tx complete. In addition, when a fpdu
splits over two buffers, but there are no more fpdus on the second buffer,
two buffers need to be provided as a cookie. To avoid changing the ll2
interface to provide two cookies, we introduce a piggy buf pointer,
relevant for iWARP only, that holds a pointer to a second buffer that
needs to be released during tx completion.

Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Signed-off-by: Ariel Elior <Ariel.Elior@cavium.com>
---
 drivers/net/ethernet/qlogic/qed/qed_iwarp.c | 25 +++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_iwarp.h |  1 +
 2 files changed, 26 insertions(+)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
index 83b147f..8b17369 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
@@ -1846,6 +1846,12 @@ enum qed_iwarp_mpa_pkt_type {
 	/* vlan overload with enum iwarp_ll2_tx_queues */
 	tx_pkt.vlan = IWARP_LL2_ALIGNED_TX_QUEUE;
 
+	/* special case of unaligned packet and not packed, need to send
+	 * both buffers as cookie to release.
+	 */
+	if (tcp_payload_size == fpdu->incomplete_bytes)
+		fpdu->mpa_buf->piggy_buf = buf;
+
 	ll2_handle = p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle;
 
 	/* Set first fragment to header */
@@ -2195,9 +2201,19 @@ static void qed_iwarp_ll2_comp_tx_pkt(void *cxt, u8 connection_handle,
 				      bool b_last_fragment, bool b_last_packet)
 {
 	struct qed_iwarp_ll2_buff *buffer = cookie;
+	struct qed_iwarp_ll2_buff *piggy;
 	struct qed_hwfn *p_hwfn = cxt;
 
+	if (!buffer)		/* can happen in packed mpa unaligned... */
+		return;
+
 	/* this was originally an rx packet, post it back */
+	piggy = buffer->piggy_buf;
+	if (piggy) {
+		buffer->piggy_buf = NULL;
+		qed_iwarp_ll2_post_rx(p_hwfn, piggy, connection_handle);
+	}
+
 	qed_iwarp_ll2_post_rx(p_hwfn, buffer, connection_handle);
 
 	if (connection_handle == p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle)
@@ -2216,6 +2232,15 @@ static void qed_iwarp_ll2_rel_tx_pkt(void *cxt, u8 connection_handle,
 	if (!buffer)
 		return;
 
+	if (buffer->piggy_buf) {
+		dma_free_coherent(&p_hwfn->cdev->pdev->dev,
+				  buffer->piggy_buf->buff_size,
+				  buffer->piggy_buf->data,
+				  buffer->piggy_buf->data_phys_addr);
+
+		kfree(buffer->piggy_buf);
+	}
+
 	dma_free_coherent(&p_hwfn->cdev->pdev->dev, buffer->buff_size,
 			  buffer->data, buffer->data_phys_addr);
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
index 858755c..58db51a 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
@@ -55,6 +55,7 @@ enum qed_iwarp_qp_state {
 #define QED_IWARP_HANDLE_INVAL		(0xff)
 
 struct qed_iwarp_ll2_buff {
+	struct qed_iwarp_ll2_buff *piggy_buf;
 	void *data;
 	dma_addr_t data_phys_addr;
 	u32 buff_size;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v3 net-next 12/12] qed: Add iWARP support for fpdu spanned over more than two tcp packets
From: Michal Kalderon @ 2017-10-09  9:37 UTC (permalink / raw)
  To: davem; +Cc: netdev, linux-rdma, dledford, Michal Kalderon, Ariel Elior
In-Reply-To: <1507541874-18344-1-git-send-email-Michal.Kalderon@cavium.com>

We continue to maintain a maximum of three buffers per fpdu, to ensure
that there are enough buffers for additional unaligned mpa packets.
To support this, if a fpdu is split over more than two tcp packets, we
use an intermediate buffer to copy the data to the previous buffer, then
we can release the data. We need an intermediate buffer as the initial
buffer partial packet could be located at the end of the packet, not
leaving room for additional data. This is a corner case, and will usually
not be the case.

Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Signed-off-by: Ariel Elior <Ariel.Elior@cavium.com>
---
 drivers/net/ethernet/qlogic/qed/qed_iwarp.c | 193 ++++++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_iwarp.h |   1 +
 2 files changed, 194 insertions(+)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
index 2994942..b2b1f87 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
@@ -1420,6 +1420,7 @@ void qed_iwarp_resc_free(struct qed_hwfn *p_hwfn)
 	qed_rdma_bmap_free(p_hwfn, &p_hwfn->p_rdma_info->tcp_cid_map, 1);
 	kfree(iwarp_info->mpa_bufs);
 	kfree(iwarp_info->partial_fpdus);
+	kfree(iwarp_info->mpa_intermediate_buf);
 }
 
 int qed_iwarp_accept(void *rdma_cxt, struct qed_iwarp_accept_in *iparams)
@@ -1762,6 +1763,11 @@ enum qed_iwarp_mpa_pkt_type {
 	"QED_IWARP_MPA_PKT_UNALIGNED"
 };
 
+static int
+qed_iwarp_recycle_pkt(struct qed_hwfn *p_hwfn,
+		      struct qed_iwarp_fpdu *fpdu,
+		      struct qed_iwarp_ll2_buff *buf);
+
 static enum qed_iwarp_mpa_pkt_type
 qed_iwarp_mpa_classify(struct qed_hwfn *p_hwfn,
 		       struct qed_iwarp_fpdu *fpdu,
@@ -1822,6 +1828,68 @@ enum qed_iwarp_mpa_pkt_type {
 	fpdu->mpa_frag_len = fpdu->fpdu_length - fpdu->incomplete_bytes;
 }
 
+static int
+qed_iwarp_cp_pkt(struct qed_hwfn *p_hwfn,
+		 struct qed_iwarp_fpdu *fpdu,
+		 struct unaligned_opaque_data *pkt_data,
+		 struct qed_iwarp_ll2_buff *buf, u16 tcp_payload_size)
+{
+	u8 *tmp_buf = p_hwfn->p_rdma_info->iwarp.mpa_intermediate_buf;
+	int rc;
+
+	/* need to copy the data from the partial packet stored in fpdu
+	 * to the new buf, for this we also need to move the data currently
+	 * placed on the buf. The assumption is that the buffer is big enough
+	 * since fpdu_length <= mss, we use an intermediate buffer since
+	 * we may need to copy the new data to an overlapping location
+	 */
+	if ((fpdu->mpa_frag_len + tcp_payload_size) > (u16)buf->buff_size) {
+		DP_ERR(p_hwfn,
+		       "MPA ALIGN: Unexpected: buffer is not large enough for split fpdu buff_size = %d mpa_frag_len = %d, tcp_payload_size = %d, incomplete_bytes = %d\n",
+		       buf->buff_size, fpdu->mpa_frag_len,
+		       tcp_payload_size, fpdu->incomplete_bytes);
+		return -EINVAL;
+	}
+
+	DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+		   "MPA ALIGN Copying fpdu: [%p, %d] [%p, %d]\n",
+		   fpdu->mpa_frag_virt, fpdu->mpa_frag_len,
+		   (u8 *)(buf->data) + pkt_data->first_mpa_offset,
+		   tcp_payload_size);
+
+	memcpy(tmp_buf, fpdu->mpa_frag_virt, fpdu->mpa_frag_len);
+	memcpy(tmp_buf + fpdu->mpa_frag_len,
+	       (u8 *)(buf->data) + pkt_data->first_mpa_offset,
+	       tcp_payload_size);
+
+	rc = qed_iwarp_recycle_pkt(p_hwfn, fpdu, fpdu->mpa_buf);
+	if (rc)
+		return rc;
+
+	/* If we managed to post the buffer copy the data to the new buffer
+	 * o/w this will occur in the next round...
+	 */
+	memcpy((u8 *)(buf->data), tmp_buf,
+	       fpdu->mpa_frag_len + tcp_payload_size);
+
+	fpdu->mpa_buf = buf;
+	/* fpdu->pkt_hdr remains as is */
+	/* fpdu->mpa_frag is overridden with new buf */
+	fpdu->mpa_frag = buf->data_phys_addr;
+	fpdu->mpa_frag_virt = buf->data;
+	fpdu->mpa_frag_len += tcp_payload_size;
+
+	fpdu->incomplete_bytes -= tcp_payload_size;
+
+	DP_VERBOSE(p_hwfn,
+		   QED_MSG_RDMA,
+		   "MPA ALIGN: split fpdu buff_size = %d mpa_frag_len = %d, tcp_payload_size = %d, incomplete_bytes = %d\n",
+		   buf->buff_size, fpdu->mpa_frag_len, tcp_payload_size,
+		   fpdu->incomplete_bytes);
+
+	return 0;
+}
+
 static void
 qed_iwarp_update_fpdu_length(struct qed_hwfn *p_hwfn,
 			     struct qed_iwarp_fpdu *fpdu, u8 *mpa_data)
@@ -1843,6 +1911,90 @@ enum qed_iwarp_mpa_pkt_type {
 	}
 }
 
+#define QED_IWARP_IS_RIGHT_EDGE(_curr_pkt) \
+	(GET_FIELD((_curr_pkt)->flags,	   \
+		   UNALIGNED_OPAQUE_DATA_PKT_REACHED_WIN_RIGHT_EDGE))
+
+/* This function is used to recycle a buffer using the ll2 drop option. It
+ * uses the mechanism to ensure that all buffers posted to tx before this one
+ * were completed. The buffer sent here will be sent as a cookie in the tx
+ * completion function and can then be reposted to rx chain when done. The flow
+ * that requires this is the flow where a FPDU splits over more than 3 tcp
+ * segments. In this case the driver needs to re-post a rx buffer instead of
+ * the one received, but driver can't simply repost a buffer it copied from
+ * as there is a case where the buffer was originally a packed FPDU, and is
+ * partially posted to FW. Driver needs to ensure FW is done with it.
+ */
+static int
+qed_iwarp_recycle_pkt(struct qed_hwfn *p_hwfn,
+		      struct qed_iwarp_fpdu *fpdu,
+		      struct qed_iwarp_ll2_buff *buf)
+{
+	struct qed_ll2_tx_pkt_info tx_pkt;
+	u8 ll2_handle;
+	int rc;
+
+	memset(&tx_pkt, 0, sizeof(tx_pkt));
+	tx_pkt.num_of_bds = 1;
+	tx_pkt.tx_dest = QED_LL2_TX_DEST_DROP;
+	tx_pkt.l4_hdr_offset_w = fpdu->pkt_hdr_size >> 2;
+	tx_pkt.first_frag = fpdu->pkt_hdr;
+	tx_pkt.first_frag_len = fpdu->pkt_hdr_size;
+	buf->piggy_buf = NULL;
+	tx_pkt.cookie = buf;
+
+	ll2_handle = p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle;
+
+	rc = qed_ll2_prepare_tx_packet(p_hwfn, ll2_handle, &tx_pkt, true);
+	if (rc)
+		DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+			   "Can't drop packet rc=%d\n", rc);
+
+	DP_VERBOSE(p_hwfn,
+		   QED_MSG_RDMA,
+		   "MPA_ALIGN: send drop tx packet [%lx, 0x%x], buf=%p, rc=%d\n",
+		   (unsigned long int)tx_pkt.first_frag,
+		   tx_pkt.first_frag_len, buf, rc);
+
+	return rc;
+}
+
+static int
+qed_iwarp_win_right_edge(struct qed_hwfn *p_hwfn, struct qed_iwarp_fpdu *fpdu)
+{
+	struct qed_ll2_tx_pkt_info tx_pkt;
+	u8 ll2_handle;
+	int rc;
+
+	memset(&tx_pkt, 0, sizeof(tx_pkt));
+	tx_pkt.num_of_bds = 1;
+	tx_pkt.tx_dest = QED_LL2_TX_DEST_LB;
+	tx_pkt.l4_hdr_offset_w = fpdu->pkt_hdr_size >> 2;
+
+	tx_pkt.first_frag = fpdu->pkt_hdr;
+	tx_pkt.first_frag_len = fpdu->pkt_hdr_size;
+	tx_pkt.enable_ip_cksum = true;
+	tx_pkt.enable_l4_cksum = true;
+	tx_pkt.calc_ip_len = true;
+	/* vlan overload with enum iwarp_ll2_tx_queues */
+	tx_pkt.vlan = IWARP_LL2_ALIGNED_RIGHT_TRIMMED_TX_QUEUE;
+
+	ll2_handle = p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle;
+
+	rc = qed_ll2_prepare_tx_packet(p_hwfn, ll2_handle, &tx_pkt, true);
+	if (rc)
+		DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+			   "Can't send right edge rc=%d\n", rc);
+	DP_VERBOSE(p_hwfn,
+		   QED_MSG_RDMA,
+		   "MPA_ALIGN: Sent right edge FPDU num_bds=%d [%lx, 0x%x], rc=%d\n",
+		   tx_pkt.num_of_bds,
+		   (unsigned long int)tx_pkt.first_frag,
+		   tx_pkt.first_frag_len, rc);
+
+	return rc;
+}
+
 static int
 qed_iwarp_send_fpdu(struct qed_hwfn *p_hwfn,
 		    struct qed_iwarp_fpdu *fpdu,
@@ -1971,6 +2123,20 @@ enum qed_iwarp_mpa_pkt_type {
 					    mpa_buf->tcp_payload_len,
 					    mpa_buf->placement_offset);
 
+			if (!QED_IWARP_IS_RIGHT_EDGE(curr_pkt)) {
+				mpa_buf->tcp_payload_len = 0;
+				break;
+			}
+
+			rc = qed_iwarp_win_right_edge(p_hwfn, fpdu);
+
+			if (rc) {
+				DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+					   "Can't send FPDU:reset rc=%d\n", rc);
+				memset(fpdu, 0, sizeof(*fpdu));
+				break;
+			}
+
 			mpa_buf->tcp_payload_len = 0;
 			break;
 		case QED_IWARP_MPA_PKT_PACKED:
@@ -1994,6 +2160,28 @@ enum qed_iwarp_mpa_pkt_type {
 			break;
 		case QED_IWARP_MPA_PKT_UNALIGNED:
 			qed_iwarp_update_fpdu_length(p_hwfn, fpdu, mpa_data);
+			if (mpa_buf->tcp_payload_len < fpdu->incomplete_bytes) {
+				/* special handling of fpdu split over more
+				 * than 2 segments
+				 */
+				if (QED_IWARP_IS_RIGHT_EDGE(curr_pkt)) {
+					rc = qed_iwarp_win_right_edge(p_hwfn,
+								      fpdu);
+					/* packet will be re-processed later */
+					if (rc)
+						return rc;
+				}
+
+				rc = qed_iwarp_cp_pkt(p_hwfn, fpdu, curr_pkt,
+						      buf,
+						      mpa_buf->tcp_payload_len);
+				if (rc) /* packet will be re-processed later */
+					return rc;
+
+				mpa_buf->tcp_payload_len = 0;
+				break;
+			}
+
 			rc = qed_iwarp_send_fpdu(p_hwfn, fpdu, curr_pkt, buf,
 						 mpa_buf->tcp_payload_len,
 						 pkt_type);
@@ -2510,6 +2698,11 @@ static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 		goto err;
 
 	iwarp_info->max_num_partial_fpdus = (u16)p_hwfn->p_rdma_info->num_qps;
+
+	iwarp_info->mpa_intermediate_buf = kzalloc(mpa_buff_size, GFP_KERNEL);
+	if (!iwarp_info->mpa_intermediate_buf)
+		goto err;
+
 	/* The mpa_bufs array serves for pending RX packets received on the
 	 * mpa ll2 that don't have place on the tx ring and require later
 	 * processing. We can't fail on allocation of such a struct therefore
diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
index c58793a..c1ecd74 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
@@ -107,6 +107,7 @@ struct qed_iwarp_info {
 	enum mpa_rtr_type rtr_type;
 	struct qed_iwarp_fpdu *partial_fpdus;
 	struct qed_iwarp_ll2_mpa_buf *mpa_bufs;
+	u8 *mpa_intermediate_buf;
 	u16 max_num_partial_fpdus;
 };
 
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH 41/47] netfilter: convert hook list to an array
From: Tariq Toukan @ 2017-10-09 10:04 UTC (permalink / raw)
  To: Florian Westphal, Tariq Toukan
  Cc: Pablo Neira Ayuso, netfilter-devel, Aaron Conole, davem, netdev
In-Reply-To: <20171009093132.GA7150@breakpoint.cc>



On 09/10/2017 12:31 PM, Florian Westphal wrote:
> Tariq Toukan <tariqt@mellanox.com> wrote:
>> On 04/09/2017 1:42 AM, Pablo Neira Ayuso wrote:
>>> From: Aaron Conole <aconole@bytheb.org>
>>>
>>> This converts the storage and layout of netfilter hook entries from a
>>> linked list to an array.  After this commit, hook entries will be
>>> stored adjacent in memory.  The next pointer is no longer required.
>>>
>>> The ops pointers are stored at the end of the array as they are only
>>> used in the register/unregister path and in the legacy br_netfilter code.
>>>
>>> nf_unregister_net_hooks() is slower than needed as it just calls
>>> nf_unregister_net_hook in a loop (i.e. at least n synchronize_net()
>>> calls), this will be addressed in followup patch.
>>>
>>> Test setup:
>>>   - ixgbe 10gbit
>>>   - netperf UDP_STREAM, 64 byte packets
>>>   - 5 hooks: (raw + mangle prerouting, mangle+filter input, inet filter):
>>> empty mangle and raw prerouting, mangle and filter input hooks:
>>> 353.9
>>> this patch:
>>> 364.2
>>>
>>> Signed-off-by: Aaron Conole <aconole@bytheb.org>
>>> Signed-off-by: Florian Westphal <fw@strlen.de>
>>> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
>>> ---
>>
>> Hi,
>>
>> We experience a regression in server with iommu enabled.
>> After installing kernel and rebooting the server, it crashes during boot.
>> Please see trace below.
>>
>> Bisecting points to this patch.
> 
> Hmm, strange because
> 
>> [   25.907811] BUG: unable to handle kernel NULL pointer dereference at
>> 000000000000003c
>> [   25.907828] IP: _raw_read_lock_bh+0x15/0x40
> 
> ... this says that ebt_table is NULL (0x3c is the offset of the rwlock).
> 
> If you don't have that fix already, does

No, didn't have it.

> https://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git/commit/?id=e6b72ee88a56bcfe63f72e9c30766484c45bec72
> 
> netfilter: ebtables: fix race condition in frame_filter_net_init()
> 
> resolve this bug for you?
> 

Now I applied the fix and bug is resolved.

Many thanks!
Tariq

^ permalink raw reply

* Re: [patch net-next v2 1/5] net: bridge: Notify on bridge device mrouter state changes
From: Ivan Vecera @ 2017-10-09 10:19 UTC (permalink / raw)
  To: Jiri Pirko, netdev
  Cc: davem, yotamg, idosch, nogahf, mlxsw, nikolay, andrew, stephen,
	nbd, roopa
In-Reply-To: <20171009091535.1315-2-jiri@resnulli.us>

On 9.10.2017 11:15, Jiri Pirko wrote:
> From: Yotam Gigi <yotamg@mellanox.com>
> 
> Add the SWITCHDEV_ATTR_ID_BRIDGE_MROUTER switchdev notification type, used
> to indicate whether the bridge is or isn't mrouter. Notify when the bridge
> changes its state, similarly to the already existing bridged port mrouter
> notifications.
> 
> The notification uses the switchdev_attr.u.mrouter boolean flag to indicate
> the current bridge mrouter status. Thus, it only indicates whether the
> bridge is currently used as an mrouter or not, and does not indicate the
> exact mrouter state of the bridge (learning, permanent, etc.).
> 
> Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
> ---
> v1->v2:
>  - use the timer_pending to distinguish between learning-on and
>    learning-off states
> ---
>  include/net/switchdev.h   |  1 +
>  net/bridge/br_multicast.c | 38 +++++++++++++++++++++++++++++++++++---
>  2 files changed, 36 insertions(+), 3 deletions(-)
> 
> diff --git a/include/net/switchdev.h b/include/net/switchdev.h
> index d767b79..d756fbe 100644
> --- a/include/net/switchdev.h
> +++ b/include/net/switchdev.h
> @@ -51,6 +51,7 @@ enum switchdev_attr_id {
>  	SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME,
>  	SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING,
>  	SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED,
> +	SWITCHDEV_ATTR_ID_BRIDGE_MROUTER,
>  };
>  
>  struct switchdev_attr {
> diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
> index 8dc5c8d..bd50550 100644
> --- a/net/bridge/br_multicast.c
> +++ b/net/bridge/br_multicast.c
> @@ -859,8 +859,32 @@ static void br_multicast_router_expired(unsigned long data)
>  	spin_unlock(&br->multicast_lock);
>  }
>  
> +static void br_mc_router_state_change(struct net_bridge *p,
> +				      bool is_mc_router)
> +{
> +	struct switchdev_attr attr = {
> +		.orig_dev = p->dev,
> +		.id = SWITCHDEV_ATTR_ID_BRIDGE_MROUTER,
> +		.flags = SWITCHDEV_F_DEFER,
> +		.u.mrouter = is_mc_router,
> +	};
> +
> +	switchdev_port_attr_set(p->dev, &attr);
> +}
> +
>  static void br_multicast_local_router_expired(unsigned long data)
>  {
> +	struct net_bridge *br = (struct net_bridge *)data;
> +
> +	spin_lock(&br->multicast_lock);
> +	if (br->multicast_router == MDB_RTR_TYPE_DISABLED ||
> +	    br->multicast_router == MDB_RTR_TYPE_PERM ||
> +	    timer_pending(&br->multicast_router_timer))
> +		goto out;
> +
> +	br_mc_router_state_change(br, false);
> +out:
> +	spin_unlock(&br->multicast_lock);
>  }
>  
>  static void br_multicast_querier_expired(struct net_bridge *br,
> @@ -1364,9 +1388,12 @@ static void br_multicast_mark_router(struct net_bridge *br,
>  	unsigned long now = jiffies;
>  
>  	if (!port) {
> -		if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY)
> +		if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) {
> +			if (!timer_pending(&br->multicast_router_timer))
> +				br_mc_router_state_change(br, true);
>  			mod_timer(&br->multicast_router_timer,
>  				  now + br->multicast_querier_interval);
> +		}
>  		return;
>  	}
>  
> @@ -1952,7 +1979,7 @@ void br_multicast_init(struct net_bridge *br)
>  
>  	spin_lock_init(&br->multicast_lock);
>  	setup_timer(&br->multicast_router_timer,
> -		    br_multicast_local_router_expired, 0);
> +		    br_multicast_local_router_expired, (unsigned long)br);
>  	setup_timer(&br->ip4_other_query.timer,
>  		    br_ip4_multicast_querier_expired, (unsigned long)br);
>  	setup_timer(&br->ip4_own_query.timer, br_ip4_multicast_query_expired,
> @@ -2042,9 +2069,14 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val)
>  	switch (val) {
>  	case MDB_RTR_TYPE_DISABLED:
>  	case MDB_RTR_TYPE_PERM:
> +		br_mc_router_state_change(br, val == MDB_RTR_TYPE_PERM);
>  		del_timer(&br->multicast_router_timer);
> -		/* fall through */
> +		br->multicast_router = val;
> +		err = 0;
> +		break;
>  	case MDB_RTR_TYPE_TEMP_QUERY:
> +		if (br->multicast_router != MDB_RTR_TYPE_TEMP_QUERY)
> +			br_mc_router_state_change(br, false);
>  		br->multicast_router = val;
>  		err = 0;
>  		break;
> 

Reviewed by: Ivan Vecera <ivecera@redhat.com>

^ permalink raw reply

* Re: [patch net-next v2 2/5] net: bridge: Export bridge multicast router state
From: Ivan Vecera @ 2017-10-09 10:19 UTC (permalink / raw)
  To: Jiri Pirko, netdev
  Cc: davem, yotamg, idosch, nogahf, mlxsw, nikolay, andrew, stephen,
	nbd, roopa
In-Reply-To: <20171009091535.1315-3-jiri@resnulli.us>

On 9.10.2017 11:15, Jiri Pirko wrote:
> From: Yotam Gigi <yotamg@mellanox.com>
> 
> Add an access function that, given a bridge netdevice, returns whether the
> bridge device is currently an mrouter or not. The function uses the already
> existing br_multicast_is_router function to check that.
> 
> This function is needed in order to allow ports that join an already
> existing bridge to know the current mrouter state of the bridge device.
> Together with the bridge device mrouter ports switchdev notifications, it
> is possible to have full offloading of the semantics of the bridge device
> mcast router state.
> 
> Due to the fact that the bridge multicast router status can change in
> packet RX path, take the multicast_router bridge spinlock to protect the
> read.
> 
> Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
> Reviewed-by: Nogah Frankel <nogahf@mellanox.com>
> Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
> ---
>  include/linux/if_bridge.h |  5 +++++
>  net/bridge/br_multicast.c | 12 ++++++++++++
>  2 files changed, 17 insertions(+)
> 
> diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
> index 316ee11..02639eb 100644
> --- a/include/linux/if_bridge.h
> +++ b/include/linux/if_bridge.h
> @@ -64,6 +64,7 @@ int br_multicast_list_adjacent(struct net_device *dev,
>  bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto);
>  bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto);
>  bool br_multicast_enabled(const struct net_device *dev);
> +bool br_multicast_router(const struct net_device *dev);
>  #else
>  static inline int br_multicast_list_adjacent(struct net_device *dev,
>  					     struct list_head *br_ip_list)
> @@ -84,6 +85,10 @@ static inline bool br_multicast_enabled(const struct net_device *dev)
>  {
>  	return false;
>  }
> +static inline bool br_multicast_router(const struct net_device *dev)
> +{
> +	return false;
> +}
>  #endif
>  
>  #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING)
> diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
> index bd50550..7947e04 100644
> --- a/net/bridge/br_multicast.c
> +++ b/net/bridge/br_multicast.c
> @@ -2216,6 +2216,18 @@ bool br_multicast_enabled(const struct net_device *dev)
>  }
>  EXPORT_SYMBOL_GPL(br_multicast_enabled);
>  
> +bool br_multicast_router(const struct net_device *dev)
> +{
> +	struct net_bridge *br = netdev_priv(dev);
> +	bool is_router;
> +
> +	spin_lock_bh(&br->multicast_lock);
> +	is_router = br_multicast_is_router(br);
> +	spin_unlock_bh(&br->multicast_lock);
> +	return is_router;
> +}
> +EXPORT_SYMBOL_GPL(br_multicast_router);
> +
>  int br_multicast_set_querier(struct net_bridge *br, unsigned long val)
>  {
>  	unsigned long max_delay;
> 

Reviewed by: Ivan Vecera <ivecera@redhat.com>

^ permalink raw reply

* Re: [patch net-next v2 1/5] net: bridge: Notify on bridge device mrouter state changes
From: Nikolay Aleksandrov @ 2017-10-09 10:52 UTC (permalink / raw)
  To: Jiri Pirko, netdev
  Cc: davem, yotamg, idosch, nogahf, mlxsw, ivecera, andrew, stephen,
	nbd, roopa
In-Reply-To: <20171009091535.1315-2-jiri@resnulli.us>

On 09/10/17 12:15, Jiri Pirko wrote:
> From: Yotam Gigi <yotamg@mellanox.com>
> 
> Add the SWITCHDEV_ATTR_ID_BRIDGE_MROUTER switchdev notification type, used
> to indicate whether the bridge is or isn't mrouter. Notify when the bridge
> changes its state, similarly to the already existing bridged port mrouter
> notifications.
> 
> The notification uses the switchdev_attr.u.mrouter boolean flag to indicate
> the current bridge mrouter status. Thus, it only indicates whether the
> bridge is currently used as an mrouter or not, and does not indicate the
> exact mrouter state of the bridge (learning, permanent, etc.).
> 
> Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
> ---
> v1->v2:
>  - use the timer_pending to distinguish between learning-on and
>    learning-off states
> ---
>  include/net/switchdev.h   |  1 +
>  net/bridge/br_multicast.c | 38 +++++++++++++++++++++++++++++++++++---
>  2 files changed, 36 insertions(+), 3 deletions(-)
> 

Much simpler, I like it. Thanks!

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>

^ permalink raw reply

* Re: [PATCH v3] netfilter: SYNPROXY: fix process non tcp packet bug in {ipv4,ipv6}_synproxy_hook
From: Pablo Neira Ayuso @ 2017-10-09 11:09 UTC (permalink / raw)
  To: Lin Zhang
  Cc: kadlec, fw, davem, kuznet, yoshfuji, netfilter-devel, coreteam,
	netdev
In-Reply-To: <1507221843-4915-1-git-send-email-xiaolou4617@gmail.com>

On Fri, Oct 06, 2017 at 12:44:03AM +0800, Lin Zhang wrote:
> In function {ipv4,ipv6}_synproxy_hook we expect a normal tcp packet,
> but the real server maybe reply an icmp error packet related to the
> exist tcp conntrack, so we will access wrong tcp data.
> 
> For fix it, check for the protocol field and only process tcp traffic.

Applied, thanks. I have mangled the title slightly to:

        netfilter: SYNPROXY: skip non-tcp packet in {ipv4, ipv6}_synproxy_hook

For the record.

^ permalink raw reply

* Re: [PATCH] netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1'
From: Pablo Neira Ayuso @ 2017-10-09 11:18 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Shmulik Ladkani, netfilter-devel, Willem de Bruijn,
	Network Development, Daniel Borkmann, Rafael Buchbinder,
	Shmulik Ladkani
In-Reply-To: <CAF=yD-JbZ_ViWsBWK7Ye=4RYF9BgnMKvD04xE5P26RNRpnmuiQ@mail.gmail.com>

On Fri, Oct 06, 2017 at 01:40:13PM -0400, Willem de Bruijn wrote:
> On Fri, Oct 6, 2017 at 12:02 PM, Shmulik Ladkani <shmulik@nsof.io> wrote:
> > From: Shmulik Ladkani <shmulik.ladkani@gmail.com>
> >
> > Commit 2c16d6033264 ("netfilter: xt_bpf: support ebpf") introduced
> > support for attaching an eBPF object by an fd, with the
> > 'bpf_mt_check_v1' ABI expecting the '.fd' to be specified upon each
> > IPT_SO_SET_REPLACE call.
> >
> > However this breaks subsequent iptables calls:
> >
> >  # iptables -A INPUT -m bpf --object-pinned /sys/fs/bpf/xxx -j ACCEPT
> >  # iptables -A INPUT -s 5.6.7.8 -j ACCEPT
> >  iptables: Invalid argument. Run `dmesg' for more information.
[...]
> >
> > References: [1] https://marc.info/?l=netfilter-devel&m=150564724607440&w=2
> >             [2] https://marc.info/?l=netfilter-devel&m=150575727129880&w=2
> >
> > Cc: Pablo Neira Ayuso <pablo@netfilter.org>
> > Cc: Willem de Bruijn <willemb@google.com>
> > Reported-by: Rafael Buchbinder <rafi@rbk.ms>
> > Signed-off-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>
> 
> Acked-by: Willem de Bruijn <willemb@google.com>

Applied, thanks.

^ permalink raw reply

* Re: netlink backwards compatibility in userspace tools
From: Jason A. Donenfeld @ 2017-10-09 11:34 UTC (permalink / raw)
  To: David Miller; +Cc: Netdev, LKML, Daniel Kahn Gillmor
In-Reply-To: <20171008.214726.1977368725573365543.davem@davemloft.net>

Hi Dave,

That seems wise. Thanks for the advice. A more sophisticated way of
approaching this would be for the kernel to also send a bitmap of
which attributes are "critical" and only warn (or even error) of
_those_ are not understood. But that seems needlessly complex, and so
I think I'll go with your suggestion, and simply warn once with a
shorter message.

Jason

^ permalink raw reply

* [PATCH] netlink: do not set cb_running if dump's start() errs
From: Jason A. Donenfeld @ 2017-10-09 11:56 UTC (permalink / raw)
  To: johannes, davem, Netdev, linux-kernel; +Cc: Jason A. Donenfeld

It turns out that multiple places can call netlink_dump(), which means
it's still possible to dereference partially initialized values in
dump() that were the result of a faulty returned start().

This fixes the issue by calling start() _before_ setting cb_running to
true, so that there's no chance at all of hitting the dump() function
through any indirect paths.

In testing this with several different pieces of tricky code to trigger
these issues, this commit fixes all avenues that I'm aware of.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
---
 net/netlink/af_netlink.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 94c11cf0459d..f34750691c5c 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2266,16 +2266,17 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 	cb->min_dump_alloc = control->min_dump_alloc;
 	cb->skb = skb;

+	if (cb->start) {
+		ret = cb->start(cb);
+		if (ret)
+			goto error_unlock;
+	}
+
 	nlk->cb_running = true;

 	mutex_unlock(nlk->cb_mutex);

-	ret = 0;
-	if (cb->start)
-		ret = cb->start(cb);
-
-	if (!ret)
-		ret = netlink_dump(sk);
+	ret = netlink_dump(sk);

 	sock_put(sk);

-- 
2.14.2

^ permalink raw reply related

* Re: [PATCH] netlink: do not set cb_running if dump's start() errs
From: Jason A. Donenfeld @ 2017-10-09 11:57 UTC (permalink / raw)
  To: Johannes Berg, David Miller, Netdev, LKML; +Cc: Jason A. Donenfeld
In-Reply-To: <20171009115648.25989-1-Jason@zx2c4.com>

Dave - this very likely should be queued up for stable.

Jason

^ permalink raw reply

* Re: [PATCH] netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1'
From: Pablo Neira Ayuso @ 2017-10-09 11:57 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Shmulik Ladkani, netfilter-devel, Willem de Bruijn,
	Network Development, Daniel Borkmann, Rafael Buchbinder,
	Shmulik Ladkani
In-Reply-To: <20171009111823.GA30637@salvia>

On Mon, Oct 09, 2017 at 01:18:23PM +0200, Pablo Neira Ayuso wrote:
> On Fri, Oct 06, 2017 at 01:40:13PM -0400, Willem de Bruijn wrote:
> > On Fri, Oct 6, 2017 at 12:02 PM, Shmulik Ladkani <shmulik@nsof.io> wrote:
> > > From: Shmulik Ladkani <shmulik.ladkani@gmail.com>
> > >
> > > Commit 2c16d6033264 ("netfilter: xt_bpf: support ebpf") introduced
> > > support for attaching an eBPF object by an fd, with the
> > > 'bpf_mt_check_v1' ABI expecting the '.fd' to be specified upon each
> > > IPT_SO_SET_REPLACE call.
> > >
> > > However this breaks subsequent iptables calls:
> > >
> > >  # iptables -A INPUT -m bpf --object-pinned /sys/fs/bpf/xxx -j ACCEPT
> > >  # iptables -A INPUT -s 5.6.7.8 -j ACCEPT
> > >  iptables: Invalid argument. Run `dmesg' for more information.
> [...]
> > >
> > > References: [1] https://marc.info/?l=netfilter-devel&m=150564724607440&w=2
> > >             [2] https://marc.info/?l=netfilter-devel&m=150575727129880&w=2
> > >
> > > Cc: Pablo Neira Ayuso <pablo@netfilter.org>
> > > Cc: Willem de Bruijn <willemb@google.com>
> > > Reported-by: Rafael Buchbinder <rafi@rbk.ms>
> > > Signed-off-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>
> > 
> > Acked-by: Willem de Bruijn <willemb@google.com>
> 
> Applied, thanks.

Hm, I have to keep this back. Compilation breaks here.

net/netfilter/xt_bpf.c: In function ‘__bpf_mt_check_path’:
net/netfilter/xt_bpf.c:59:2: error: implicit declaration of function
‘bpf_obj_get_user’ [-Werror=implicit-function-declaration]
  fd = bpf_obj_get_user(path);
  ^

^ permalink raw reply

* Re: [PATCH] netlink: do not set cb_running if dump's start() errs
From: Johannes Berg @ 2017-10-09 11:58 UTC (permalink / raw)
  To: Jason A. Donenfeld, davem, Netdev, linux-kernel
In-Reply-To: <20171009115648.25989-1-Jason@zx2c4.com>

On Mon, 2017-10-09 at 13:56 +0200, Jason A. Donenfeld wrote:

> @@ -2266,16 +2266,17 @@ int __netlink_dump_start(struct sock *ssk,
> struct sk_buff *skb,
>  	cb->min_dump_alloc = control->min_dump_alloc;
>  	cb->skb = skb;
>  
> +	if (cb->start) {
> +		ret = cb->start(cb);
> +		if (ret)
> +			goto error_unlock;
> +	}
> +
>  	nlk->cb_running = true;
>  
>  	mutex_unlock(nlk->cb_mutex);

Hmm. Now start is invoked with the mutex held, I'm not sure it actually
_matters_, but that should probably be reviewed and mentioned in the
commit log?

johannes

^ permalink raw reply

* Re: [PATCH] netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1'
From: Daniel Borkmann @ 2017-10-09 12:01 UTC (permalink / raw)
  To: Pablo Neira Ayuso, Willem de Bruijn
  Cc: Shmulik Ladkani, netfilter-devel, Willem de Bruijn,
	Network Development, Rafael Buchbinder, Shmulik Ladkani
In-Reply-To: <20171009115736.GA31826@salvia>

Hi Shmulik,

On 10/09/2017 01:57 PM, Pablo Neira Ayuso wrote:
> On Mon, Oct 09, 2017 at 01:18:23PM +0200, Pablo Neira Ayuso wrote:
>> On Fri, Oct 06, 2017 at 01:40:13PM -0400, Willem de Bruijn wrote:
>>> On Fri, Oct 6, 2017 at 12:02 PM, Shmulik Ladkani <shmulik@nsof.io> wrote:
>>>> From: Shmulik Ladkani <shmulik.ladkani@gmail.com>
>>>>
>>>> Commit 2c16d6033264 ("netfilter: xt_bpf: support ebpf") introduced
>>>> support for attaching an eBPF object by an fd, with the
>>>> 'bpf_mt_check_v1' ABI expecting the '.fd' to be specified upon each
>>>> IPT_SO_SET_REPLACE call.
>>>>
>>>> However this breaks subsequent iptables calls:
>>>>
>>>>   # iptables -A INPUT -m bpf --object-pinned /sys/fs/bpf/xxx -j ACCEPT
>>>>   # iptables -A INPUT -s 5.6.7.8 -j ACCEPT
>>>>   iptables: Invalid argument. Run `dmesg' for more information.
>> [...]
>>>>
>>>> References: [1] https://marc.info/?l=netfilter-devel&m=150564724607440&w=2
>>>>              [2] https://marc.info/?l=netfilter-devel&m=150575727129880&w=2
>>>>
>>>> Cc: Pablo Neira Ayuso <pablo@netfilter.org>
>>>> Cc: Willem de Bruijn <willemb@google.com>
>>>> Reported-by: Rafael Buchbinder <rafi@rbk.ms>
>>>> Signed-off-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>
>>>
>>> Acked-by: Willem de Bruijn <willemb@google.com>
>>
>> Applied, thanks.
>
> Hm, I have to keep this back. Compilation breaks here.
>
> net/netfilter/xt_bpf.c: In function ‘__bpf_mt_check_path’:
> net/netfilter/xt_bpf.c:59:2: error: implicit declaration of function
> ‘bpf_obj_get_user’ [-Werror=implicit-function-declaration]
>    fd = bpf_obj_get_user(path);
>    ^

Yeah, probably best to just add a dummy bpf_obj_get_user()
returning an error when CONFIG_BPF_SYSCALL is disabled.

^ permalink raw reply

* Re: [PATCH 10/12] net/mlx4: replace <linux/radix-tree.h> with <linux/radix-tree-root.h>
From: Masahiro Yamada @ 2017-10-09 12:02 UTC (permalink / raw)
  To: Joe Perches
  Cc: David Miller, Linux Kernel Mailing List, Thomas Gleixner,
	Andrew Morton, Ian Abbott, Ingo Molnar, Linus Torvalds,
	linux-rdma, Yishai Hadas, Tariq Toukan, netdev
In-Reply-To: <1507483942.11434.4.camel@perches.com>

2017-10-09 2:32 GMT+09:00 Joe Perches <joe@perches.com>:
> On Mon, 2017-10-09 at 02:29 +0900, Masahiro Yamada wrote:
>> The idea is simple; include necessary headers explicitly.
>
> Try that for kernel.h
>
> There's a reason aggregation of #includes is useful.
>

BTW, talking about <linux/kernel.h>, it is too much aggregation, isn't it?

It exceed 850 lines, and contains lots of unrelated stuff in one header.
Perhaps, it could be a good time to think about splitting?

For example, I see many trace_... things in it.

I wonder if linux/kernel.h is a good home for them, or a separate file.

-- 
Best Regards
Masahiro Yamada

^ permalink raw reply

* [PATCH v3 1/3] net: phy: Remove TI DP83822 from DP83848 driver
From: Dan Murphy @ 2017-10-09 12:03 UTC (permalink / raw)
  To: andrew, f.fainelli; +Cc: netdev, Woojung.Huh, afd, Dan Murphy

Removing the DP83822 device from the DP83848 to
support the TI DP83822 dedicated driver that will
initially support WoL settings.

Signed-off-by: Dan Murphy <dmurphy@ti.com>
---

v3 - No changes made
v2 - There was no v1 on this patch this is new.

 drivers/net/phy/dp83848.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/net/phy/dp83848.c b/drivers/net/phy/dp83848.c
index 3de4fe4dda77..3966d43c5146 100644
--- a/drivers/net/phy/dp83848.c
+++ b/drivers/net/phy/dp83848.c
@@ -20,7 +20,6 @@
 #define TI_DP83620_PHY_ID		0x20005ce0
 #define NS_DP83848C_PHY_ID		0x20005c90
 #define TLK10X_PHY_ID			0x2000a210
-#define TI_DP83822_PHY_ID		0x2000a240
 
 /* Registers */
 #define DP83848_MICR			0x11 /* MII Interrupt Control Register */
@@ -80,7 +79,6 @@ static struct mdio_device_id __maybe_unused dp83848_tbl[] = {
 	{ NS_DP83848C_PHY_ID, 0xfffffff0 },
 	{ TI_DP83620_PHY_ID, 0xfffffff0 },
 	{ TLK10X_PHY_ID, 0xfffffff0 },
-	{ TI_DP83822_PHY_ID, 0xfffffff0 },
 	{ }
 };
 MODULE_DEVICE_TABLE(mdio, dp83848_tbl);
@@ -110,7 +108,6 @@ static struct phy_driver dp83848_driver[] = {
 	DP83848_PHY_DRIVER(NS_DP83848C_PHY_ID, "NS DP83848C 10/100 Mbps PHY"),
 	DP83848_PHY_DRIVER(TI_DP83620_PHY_ID, "TI DP83620 10/100 Mbps PHY"),
 	DP83848_PHY_DRIVER(TLK10X_PHY_ID, "TI TLK10X 10/100 Mbps PHY"),
-	DP83848_PHY_DRIVER(TI_DP83822_PHY_ID, "TI DP83822 10/100 Mbps PHY"),
 };
 module_phy_driver(dp83848_driver);
 
-- 
2.14.0

^ permalink raw reply related

* [PATCH v3 2/3] net: phy: DP83822 initial driver submission
From: Dan Murphy @ 2017-10-09 12:03 UTC (permalink / raw)
  To: andrew, f.fainelli; +Cc: netdev, Woojung.Huh, afd, Dan Murphy
In-Reply-To: <20171009120302.23611-1-dmurphy@ti.com>

Add support for the TI  DP83822 10/100Mbit ethernet phy.

The DP83822 provides flexibility to connect to a MAC through a
standard MII, RMII or RGMII interface.

Datasheet:
http://www.ti.com/product/DP83822I/datasheet

Signed-off-by: Dan Murphy <dmurphy@ti.com>
---

v3 - Fixed WoL indication bit and removed mutex for suspend/resume - 
https://www.mail-archive.com/netdev@vger.kernel.org/msg191891.html and
https://www.mail-archive.com/netdev@vger.kernel.org/msg191665.html

v2 - Updated per comments.  Removed unnessary parantheis, called genphy_suspend/
resume routines and then performing WoL changes, reworked sopass storage and reduced
the number of phy reads, and moved WOL_SECURE_ON - 
https://www.mail-archive.com/netdev@vger.kernel.org/msg191392.html

 drivers/net/phy/Kconfig   |   5 +
 drivers/net/phy/Makefile  |   1 +
 drivers/net/phy/dp83822.c | 302 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 308 insertions(+)
 create mode 100644 drivers/net/phy/dp83822.c

diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index cd931cf9dcc2..8e78a482e09e 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -277,6 +277,11 @@ config DAVICOM_PHY
 	---help---
 	  Currently supports dm9161e and dm9131
 
+config DP83822_PHY
+	tristate "Texas Instruments DP83822 PHY"
+	---help---
+	  Supports the DP83822 PHY.
+
 config DP83848_PHY
 	tristate "Texas Instruments DP83848 PHY"
 	---help---
diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile
index 416df92fbf4f..df3b82ba8550 100644
--- a/drivers/net/phy/Makefile
+++ b/drivers/net/phy/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_CICADA_PHY)	+= cicada.o
 obj-$(CONFIG_CORTINA_PHY)	+= cortina.o
 obj-$(CONFIG_DAVICOM_PHY)	+= davicom.o
 obj-$(CONFIG_DP83640_PHY)	+= dp83640.o
+obj-$(CONFIG_DP83822_PHY)	+= dp83822.o
 obj-$(CONFIG_DP83848_PHY)	+= dp83848.o
 obj-$(CONFIG_DP83867_PHY)	+= dp83867.o
 obj-$(CONFIG_FIXED_PHY)		+= fixed_phy.o
diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c
new file mode 100644
index 000000000000..de196dbc46cd
--- /dev/null
+++ b/drivers/net/phy/dp83822.c
@@ -0,0 +1,302 @@
+/*
+ * Driver for the Texas Instruments DP83822 PHY
+ *
+ * Copyright (C) 2017 Texas Instruments Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/ethtool.h>
+#include <linux/etherdevice.h>
+#include <linux/kernel.h>
+#include <linux/mii.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/phy.h>
+#include <linux/netdevice.h>
+
+#define DP83822_PHY_ID	        0x2000a240
+#define DP83822_DEVADDR		0x1f
+
+#define MII_DP83822_MISR1	0x12
+#define MII_DP83822_MISR2	0x13
+#define MII_DP83822_RESET_CTRL	0x1f
+
+#define DP83822_HW_RESET	BIT(15)
+#define DP83822_SW_RESET	BIT(14)
+
+/* MISR1 bits */
+#define DP83822_RX_ERR_HF_INT_EN	BIT(0)
+#define DP83822_FALSE_CARRIER_HF_INT_EN	BIT(1)
+#define DP83822_ANEG_COMPLETE_INT_EN	BIT(2)
+#define DP83822_DUP_MODE_CHANGE_INT_EN	BIT(3)
+#define DP83822_SPEED_CHANGED_INT_EN	BIT(4)
+#define DP83822_LINK_STAT_INT_EN	BIT(5)
+#define DP83822_ENERGY_DET_INT_EN	BIT(6)
+#define DP83822_LINK_QUAL_INT_EN	BIT(7)
+
+/* MISR2 bits */
+#define DP83822_JABBER_DET_INT_EN	BIT(0)
+#define DP83822_WOL_PKT_INT_EN		BIT(1)
+#define DP83822_SLEEP_MODE_INT_EN	BIT(2)
+#define DP83822_MDI_XOVER_INT_EN	BIT(3)
+#define DP83822_LB_FIFO_INT_EN		BIT(4)
+#define DP83822_PAGE_RX_INT_EN		BIT(5)
+#define DP83822_ANEG_ERR_INT_EN		BIT(6)
+#define DP83822_EEE_ERROR_CHANGE_INT_EN	BIT(7)
+
+/* INT_STAT1 bits */
+#define DP83822_WOL_INT_EN	BIT(4)
+#define DP83822_WOL_INT_STAT	BIT(12)
+
+#define MII_DP83822_RXSOP1	0x04a5
+#define	MII_DP83822_RXSOP2	0x04a6
+#define	MII_DP83822_RXSOP3	0x04a7
+
+/* WoL Registers */
+#define	MII_DP83822_WOL_CFG	0x04a0
+#define	MII_DP83822_WOL_STAT	0x04a1
+#define	MII_DP83822_WOL_DA1	0x04a2
+#define	MII_DP83822_WOL_DA2	0x04a3
+#define	MII_DP83822_WOL_DA3	0x04a4
+
+/* WoL bits */
+#define DP83822_WOL_MAGIC_EN	BIT(1)
+#define DP83822_WOL_SECURE_ON	BIT(5)
+#define DP83822_WOL_EN		BIT(7)
+#define DP83822_WOL_INDICATION_SEL BIT(8)
+#define DP83822_WOL_CLR_INDICATION BIT(11)
+
+static int dp83822_ack_interrupt(struct phy_device *phydev)
+{
+	int err = phy_read(phydev, MII_DP83822_MISR1);
+
+	if (err < 0)
+		return err;
+
+	err = phy_read(phydev, MII_DP83822_MISR2);
+	if (err < 0)
+		return err;
+
+	return 0;
+}
+
+static int dp83822_set_wol(struct phy_device *phydev,
+			   struct ethtool_wolinfo *wol)
+{
+	struct net_device *ndev = phydev->attached_dev;
+	u16 value;
+	const u8 *mac;
+
+	if (wol->wolopts & (WAKE_MAGIC | WAKE_MAGICSECURE)) {
+		mac = (const u8 *)ndev->dev_addr;
+
+		if (!is_valid_ether_addr(mac))
+			return -EINVAL;
+
+		/* MAC addresses start with byte 5, but stored in mac[0].
+		 * 822 PHYs store bytes 4|5, 2|3, 0|1
+		 */
+		phy_write_mmd(phydev, DP83822_DEVADDR,
+			      MII_DP83822_WOL_DA1, (mac[1] << 8) | mac[0]);
+		phy_write_mmd(phydev, DP83822_DEVADDR,
+			      MII_DP83822_WOL_DA2, (mac[3] << 8) | mac[2]);
+		phy_write_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_DA3,
+			      (mac[5] << 8) | mac[4]);
+
+		value = phy_read_mmd(phydev, DP83822_DEVADDR,
+				     MII_DP83822_WOL_CFG);
+		if (wol->wolopts & WAKE_MAGIC)
+			value |= DP83822_WOL_MAGIC_EN;
+		else
+			value &= ~DP83822_WOL_MAGIC_EN;
+
+		if (wol->wolopts & WAKE_MAGICSECURE) {
+			phy_write_mmd(phydev, DP83822_DEVADDR,
+				      MII_DP83822_RXSOP1,
+				      (wol->sopass[1] << 8) | wol->sopass[0]);
+			phy_write_mmd(phydev, DP83822_DEVADDR,
+				      MII_DP83822_RXSOP2,
+				      (wol->sopass[3] << 8) | wol->sopass[2]);
+			phy_write_mmd(phydev, DP83822_DEVADDR,
+				      MII_DP83822_RXSOP3,
+				      (wol->sopass[5] << 8) | wol->sopass[4]);
+			value |= DP83822_WOL_SECURE_ON;
+		} else {
+			value &= ~DP83822_WOL_SECURE_ON;
+		}
+
+		value |= (DP83822_WOL_EN | DP83822_WOL_INDICATION_SEL |
+			  DP83822_WOL_CLR_INDICATION);
+		phy_write_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG,
+			      value);
+	} else {
+		value = phy_read_mmd(phydev, DP83822_DEVADDR,
+				     MII_DP83822_WOL_CFG);
+		value &= ~DP83822_WOL_EN;
+		phy_write_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG,
+			      value);
+	}
+
+	return 0;
+}
+
+static void dp83822_get_wol(struct phy_device *phydev,
+			    struct ethtool_wolinfo *wol)
+{
+	int value;
+	u16 sopass_val;
+
+	wol->supported = (WAKE_MAGIC | WAKE_MAGICSECURE);
+	wol->wolopts = 0;
+
+	value = phy_read_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG);
+	if (value & DP83822_WOL_MAGIC_EN)
+		wol->wolopts |= WAKE_MAGIC;
+
+	if (~value & DP83822_WOL_CLR_INDICATION)
+		wol->wolopts = 0;
+
+	if (value & DP83822_WOL_SECURE_ON) {
+		wol->wolopts |= WAKE_MAGICSECURE;
+	} else {
+		wol->wolopts &= ~WAKE_MAGICSECURE;
+		return;
+	}
+
+	sopass_val = phy_read_mmd(phydev, DP83822_DEVADDR, MII_DP83822_RXSOP1);
+	wol->sopass[0] = (sopass_val & 0xff);
+	wol->sopass[1] = (sopass_val >> 8);
+
+	sopass_val = phy_read_mmd(phydev, DP83822_DEVADDR, MII_DP83822_RXSOP2);
+	wol->sopass[2] = (sopass_val & 0xff);
+	wol->sopass[3] = (sopass_val >> 8);
+
+	sopass_val = phy_read_mmd(phydev, DP83822_DEVADDR, MII_DP83822_RXSOP3);
+	wol->sopass[4] = (sopass_val & 0xff);
+	wol->sopass[5] = (sopass_val >> 8);
+}
+
+static int dp83822_config_intr(struct phy_device *phydev)
+{
+	int misr_status;
+	int err;
+
+	if (phydev->interrupts == PHY_INTERRUPT_ENABLED) {
+		misr_status = phy_read(phydev, MII_DP83822_MISR1);
+		if (misr_status < 0)
+			return misr_status;
+
+		misr_status |= (DP83822_RX_ERR_HF_INT_EN |
+				DP83822_FALSE_CARRIER_HF_INT_EN |
+				DP83822_ANEG_COMPLETE_INT_EN |
+				DP83822_DUP_MODE_CHANGE_INT_EN |
+				DP83822_SPEED_CHANGED_INT_EN |
+				DP83822_LINK_STAT_INT_EN |
+				DP83822_ENERGY_DET_INT_EN |
+				DP83822_LINK_QUAL_INT_EN);
+
+		err = phy_write(phydev, MII_DP83822_MISR1, misr_status);
+		if (err < 0)
+			return err;
+
+		misr_status = phy_read(phydev, MII_DP83822_MISR2);
+		if (misr_status < 0)
+			return misr_status;
+
+		misr_status |= (DP83822_JABBER_DET_INT_EN |
+				DP83822_WOL_PKT_INT_EN |
+				DP83822_SLEEP_MODE_INT_EN |
+				DP83822_MDI_XOVER_INT_EN |
+				DP83822_LB_FIFO_INT_EN |
+				DP83822_PAGE_RX_INT_EN |
+				DP83822_ANEG_ERR_INT_EN |
+				DP83822_EEE_ERROR_CHANGE_INT_EN);
+
+		err = phy_write(phydev, MII_DP83822_MISR2, misr_status);
+	} else {
+		err = phy_write(phydev, MII_DP83822_MISR1, 0);
+		if (err < 0)
+			return err;
+
+		err = phy_write(phydev, MII_DP83822_MISR1, 0);
+	}
+
+	return err;
+}
+
+static int dp83822_phy_reset(struct phy_device *phydev)
+{
+	int err;
+
+	err = phy_write(phydev, MII_DP83822_RESET_CTRL, DP83822_HW_RESET);
+	if (err < 0)
+		return err;
+
+	return 0;
+}
+
+static int dp83822_suspend(struct phy_device *phydev)
+{
+	int value;
+
+	value = phy_read_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG);
+
+	if (!(value & DP83822_WOL_EN))
+		genphy_suspend(phydev);
+
+	return 0;
+}
+
+static int dp83822_resume(struct phy_device *phydev)
+{
+	int value;
+
+	genphy_resume(phydev);
+
+	value = phy_read_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG);
+
+	phy_write_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG, value |
+		      DP83822_WOL_CLR_INDICATION);
+
+
+	return 0;
+}
+
+static struct phy_driver dp83822_driver[] = {
+	{
+	 .phy_id = DP83822_PHY_ID,
+	 .phy_id_mask = 0xfffffff0,
+	 .name = "TI DP83822",
+	 .features = PHY_BASIC_FEATURES,
+	 .flags = PHY_HAS_INTERRUPT,
+	 .config_init = genphy_config_init,
+	 .soft_reset = dp83822_phy_reset,
+	 .get_wol = dp83822_get_wol,
+	 .set_wol = dp83822_set_wol,
+	 .ack_interrupt = dp83822_ack_interrupt,
+	 .config_intr = dp83822_config_intr,
+	 .config_aneg = genphy_config_aneg,
+	 .read_status = genphy_read_status,
+	 .suspend = dp83822_suspend,
+	 .resume = dp83822_resume,
+	 },
+};
+module_phy_driver(dp83822_driver);
+
+static struct mdio_device_id __maybe_unused dp83822_tbl[] = {
+	{ DP83822_PHY_ID, 0xfffffff0 },
+	{ },
+};
+MODULE_DEVICE_TABLE(mdio, dp83822_tbl);
+
+MODULE_DESCRIPTION("Texas Instruments DP83822 PHY driver");
+MODULE_AUTHOR("Dan Murphy <dmurphy@ti.com");
+MODULE_LICENSE("GPL");
-- 
2.14.0

^ permalink raw reply related

* [PATCH v3 3/3] net: phy: Change error to EINVAL for invalid MAC
From: Dan Murphy @ 2017-10-09 12:03 UTC (permalink / raw)
  To: andrew, f.fainelli; +Cc: netdev, Woojung.Huh, afd, Dan Murphy
In-Reply-To: <20171009120302.23611-1-dmurphy@ti.com>

Change the return error code to EINVAL if the MAC
address is not valid in the set_wol function.

Signed-off-by: Dan Murphy <dmurphy@ti.com>
---

v3 - No changes made
v2 - There was no v1 on this patch this is new.

 drivers/net/phy/at803x.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c
index c1e52b9dc58d..5f93e6add563 100644
--- a/drivers/net/phy/at803x.c
+++ b/drivers/net/phy/at803x.c
@@ -167,7 +167,7 @@ static int at803x_set_wol(struct phy_device *phydev,
 		mac = (const u8 *) ndev->dev_addr;
 
 		if (!is_valid_ether_addr(mac))
-			return -EFAULT;
+			return -EINVAL;
 
 		for (i = 0; i < 3; i++) {
 			phy_write(phydev, AT803X_MMD_ACCESS_CONTROL,
-- 
2.14.0

^ permalink raw reply related

* [PATCH] cdc_ether: flag the u-blox TOBY-L2 and SARA-U2 as wwan
From: Aleksander Morgado @ 2017-10-09 12:05 UTC (permalink / raw)
  To: oliver
  Cc: davem, marco.demarco, stefano.godeas, linux-usb, netdev,
	linux-kernel, Aleksander Morgado

The u-blox TOBY-L2 is a LTE Cat 4 module with HSPA+ and 2G fallback.
This module allows switching to different USB profiles with the
'AT+UUSBCONF' command, and provides a ECM network interface when the
'AT+UUSBCONF=2' profile is selected.

The u-blox SARA-U2 is a HSPA module with 2G fallback. The default USB
configuration includes a ECM network interface.

Both these modules are controlled via AT commands through one of the
TTYs exposed. Connecting these modules may be done just by activating
the desired PDP context with 'AT+CGACT=1,<cid>' and then running DHCP
on the ECM interface.

Signed-off-by: Aleksander Morgado <aleksander@aleksander.es>
---
 drivers/net/usb/cdc_ether.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c
index 29c7e2ec0dcb..52ea80bcd639 100644
--- a/drivers/net/usb/cdc_ether.c
+++ b/drivers/net/usb/cdc_ether.c
@@ -560,6 +560,7 @@ static const struct driver_info wwan_info = {
 #define NVIDIA_VENDOR_ID	0x0955
 #define HP_VENDOR_ID		0x03f0
 #define MICROSOFT_VENDOR_ID	0x045e
+#define UBLOX_VENDOR_ID		0x1546
 
 static const struct usb_device_id	products[] = {
 /* BLACKLIST !!
@@ -868,6 +869,18 @@ static const struct usb_device_id	products[] = {
 				      USB_CDC_SUBCLASS_ETHERNET,
 				      USB_CDC_PROTO_NONE),
 	.driver_info = (unsigned long)&zte_cdc_info,
+}, {
+	/* U-blox TOBY-L2 */
+	USB_DEVICE_AND_INTERFACE_INFO(UBLOX_VENDOR_ID, 0x1143, USB_CLASS_COMM,
+				      USB_CDC_SUBCLASS_ETHERNET,
+				      USB_CDC_PROTO_NONE),
+	.driver_info = (unsigned long)&wwan_info,
+}, {
+	/* U-blox SARA-U2 */
+	USB_DEVICE_AND_INTERFACE_INFO(UBLOX_VENDOR_ID, 0x1104, USB_CLASS_COMM,
+				      USB_CDC_SUBCLASS_ETHERNET,
+				      USB_CDC_PROTO_NONE),
+	.driver_info = (unsigned long)&wwan_info,
 }, {
 	USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET,
 			USB_CDC_PROTO_NONE),
-- 
2.14.1

^ permalink raw reply related

* Re: [PATCH] netlink: do not set cb_running if dump's start() errs
From: Jason A. Donenfeld @ 2017-10-09 12:12 UTC (permalink / raw)
  To: Johannes Berg; +Cc: David Miller, Netdev, LKML
In-Reply-To: <1507550326.26041.39.camel@sipsolutions.net>

Hi Johannes,

Yes, indeed, and I think that's actually a good thing. It means that
the starts aren't ever called in parallel, which could be useful if
drivers have any ordering requirements. The change doesn't negatively
effect any existing drivers. I'll resubmit with a larger commit
message explaining this.

Jason

^ permalink raw reply

* [PATCH v2] netlink: do not set cb_running if dump's start() errs
From: Jason A. Donenfeld @ 2017-10-09 12:14 UTC (permalink / raw)
  To: johannes, davem, Netdev, linux-kernel; +Cc: Jason A. Donenfeld
In-Reply-To: <CAHmME9qK84HT6DEC2Z3q-E+J8AtNSp_7GNAcjWSSsKnxX0eAkA@mail.gmail.com>

It turns out that multiple places can call netlink_dump(), which means
it's still possible to dereference partially initialized values in
dump() that were the result of a faulty returned start().

This fixes the issue by calling start() _before_ setting cb_running to
true, so that there's no chance at all of hitting the dump() function
through any indirect paths.

It also moves the call to start() to be when the mutex is held. This has
the nice side effect of serializing invocations to start(), which is
likely desirable anyway. It also prevents any possible other races that
might come out of this logic.

In testing this with several different pieces of tricky code to trigger
these issues, this commit fixes all avenues that I'm aware of.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
---
 net/netlink/af_netlink.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 94c11cf0459d..f34750691c5c 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2266,16 +2266,17 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 	cb->min_dump_alloc = control->min_dump_alloc;
 	cb->skb = skb;

+	if (cb->start) {
+		ret = cb->start(cb);
+		if (ret)
+			goto error_unlock;
+	}
+
 	nlk->cb_running = true;

 	mutex_unlock(nlk->cb_mutex);

-	ret = 0;
-	if (cb->start)
-		ret = cb->start(cb);
-
-	if (!ret)
-		ret = netlink_dump(sk);
+	ret = netlink_dump(sk);

 	sock_put(sk);

-- 
2.14.2

^ permalink raw reply related

* [PATCH v2] netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1'
From: Shmulik Ladkani @ 2017-10-09 12:27 UTC (permalink / raw)
  To: Pablo Neira Ayuso, netfilter-devel
  Cc: netdev, Daniel Borkmann, shmulik, Rafael Buchbinder,
	Shmulik Ladkani, Willem de Bruijn

From: Shmulik Ladkani <shmulik.ladkani@gmail.com>

Commit 2c16d6033264 ("netfilter: xt_bpf: support ebpf") introduced
support for attaching an eBPF object by an fd, with the
'bpf_mt_check_v1' ABI expecting the '.fd' to be specified upon each
IPT_SO_SET_REPLACE call.

However this breaks subsequent iptables calls:

 # iptables -A INPUT -m bpf --object-pinned /sys/fs/bpf/xxx -j ACCEPT
 # iptables -A INPUT -s 5.6.7.8 -j ACCEPT
 iptables: Invalid argument. Run `dmesg' for more information.

That's because iptables works by loading exising rules using
IPT_SO_GET_ENTRIES to userspace, then issuing IPT_SO_SET_REPLACE with
the replacement set.

However, the loaded 'xt_bpf_info_v1' has an arbitrary '.fd' number
(from the initial "iptables -m bpf" invocation) - so when 2nd invocation
occurs, userspace passes a bogus fd number, which leads to
'bpf_mt_check_v1' to fail.

One suggested solution [1] was to hack iptables userspace, to perform a
"entries fixup" immediatley after IPT_SO_GET_ENTRIES, by opening a new,
process-local fd per every 'xt_bpf_info_v1' entry seen.

However, in [2] both Pablo Neira Ayuso and Willem de Bruijn suggested to
depricate the xt_bpf_info_v1 ABI dealing with pinned ebpf objects.

This fix changes the XT_BPF_MODE_FD_PINNED behavior to ignore the given
'.fd' and instead perform an in-kernel lookup for the bpf object given
the provided '.path'.

It also defines an alias for the XT_BPF_MODE_FD_PINNED mode, named
XT_BPF_MODE_PATH_PINNED, to better reflect the fact that the user is
expected to provide the path of the pinned object.

Existing XT_BPF_MODE_FD_ELF behavior (non-pinned fd mode) is preserved.

References: [1] https://marc.info/?l=netfilter-devel&m=150564724607440&w=2
            [2] https://marc.info/?l=netfilter-devel&m=150575727129880&w=2

Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Cc: Willem de Bruijn <willemb@google.com>
Reported-by: Rafael Buchbinder <rafi@rbk.ms>
Signed-off-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>
---
v2: Fix xt_bpf.c compilation if CONFIG_BPF_SYSCALL=n
---
 include/linux/bpf.h                   |  5 +++++
 include/uapi/linux/netfilter/xt_bpf.h |  1 +
 kernel/bpf/inode.c                    |  1 +
 net/netfilter/xt_bpf.c                | 22 ++++++++++++++++++++--
 4 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a67daea731ab..a49b321a1262 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -407,6 +407,11 @@ static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages)
 {
 }
 
+static inline int bpf_obj_get_user(const char __user *pathname)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline struct net_device  *__dev_map_lookup_elem(struct bpf_map *map,
 						       u32 key)
 {
diff --git a/include/uapi/linux/netfilter/xt_bpf.h b/include/uapi/linux/netfilter/xt_bpf.h
index b97725af2ac0..da161b56c79e 100644
--- a/include/uapi/linux/netfilter/xt_bpf.h
+++ b/include/uapi/linux/netfilter/xt_bpf.h
@@ -23,6 +23,7 @@ enum xt_bpf_modes {
 	XT_BPF_MODE_FD_PINNED,
 	XT_BPF_MODE_FD_ELF,
 };
+#define XT_BPF_MODE_PATH_PINNED XT_BPF_MODE_FD_PINNED
 
 struct xt_bpf_info_v1 {
 	__u16 mode;
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index e833ed914358..be1dde967208 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -363,6 +363,7 @@ int bpf_obj_get_user(const char __user *pathname)
 	putname(pname);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(bpf_obj_get_user);
 
 static void bpf_evict_inode(struct inode *inode)
 {
diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
index 38986a95216c..29123934887b 100644
--- a/net/netfilter/xt_bpf.c
+++ b/net/netfilter/xt_bpf.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/syscalls.h>
 #include <linux/skbuff.h>
 #include <linux/filter.h>
 #include <linux/bpf.h>
@@ -49,6 +50,22 @@ static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret)
 	return 0;
 }
 
+static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret)
+{
+	mm_segment_t oldfs = get_fs();
+	int retval, fd;
+
+	set_fs(KERNEL_DS);
+	fd = bpf_obj_get_user(path);
+	set_fs(oldfs);
+	if (fd < 0)
+		return fd;
+
+	retval = __bpf_mt_check_fd(fd, ret);
+	sys_close(fd);
+	return retval;
+}
+
 static int bpf_mt_check(const struct xt_mtchk_param *par)
 {
 	struct xt_bpf_info *info = par->matchinfo;
@@ -66,9 +83,10 @@ static int bpf_mt_check_v1(const struct xt_mtchk_param *par)
 		return __bpf_mt_check_bytecode(info->bpf_program,
 					       info->bpf_program_num_elem,
 					       &info->filter);
-	else if (info->mode == XT_BPF_MODE_FD_PINNED ||
-		 info->mode == XT_BPF_MODE_FD_ELF)
+	else if (info->mode == XT_BPF_MODE_FD_ELF)
 		return __bpf_mt_check_fd(info->fd, &info->filter);
+	else if (info->mode == XT_BPF_MODE_PATH_PINNED)
+		return __bpf_mt_check_path(info->path, &info->filter);
 	else
 		return -EINVAL;
 }
-- 
2.14.2


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox