Linux RDMA and InfiniBand development
 help / color / mirror / Atom feed
* [PATCH] IB/isert: fix unaligned immediate-data handling
@ 2020-08-30 10:32 Sagi Grimberg
  2020-08-31  5:14 ` Christoph Hellwig
  2020-08-31 12:18 ` Jason Gunthorpe
  0 siblings, 2 replies; 14+ messages in thread
From: Sagi Grimberg @ 2020-08-30 10:32 UTC (permalink / raw)
  To: linux-rdma; +Cc: Max Gurtovoy, Stephen Rust, Doug Dumitru

Currently we allocate rx buffers in a single contiguous buffers
for headers (iser and iscsi) and data trailer. This means
that most likely the data starting offset is aligned to 76
bytes (size of both headers).

This worked fine for years, but at some point this broke.
To fix this, we should avoid passing unaligned buffers for
I/O.

Instead, we allocate our recv buffers with some extra space
such that we can have the data portion align to 512 byte boundary.
This also means that we cannot reference headers or data using
structure but rather accessors (as they may move based on alignment).
Also, get rid of the wrong __packed annotation from iser_rx_desc
as this has only harmful effects (not aligned to anything).

This affects the rx descriptors for iscsi login and data plane.

Reported-by: Stephen Rust <srust@blockbridge.com>
Tested-by: Doug Dumitru <doug@dumitru.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/infiniband/ulp/isert/ib_isert.c | 93 +++++++++++++------------
 drivers/infiniband/ulp/isert/ib_isert.h | 41 ++++++++---
 2 files changed, 79 insertions(+), 55 deletions(-)

diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index 61e2f7fc513d..5b6a0ad9faaa 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -140,15 +140,16 @@ isert_alloc_rx_descriptors(struct isert_conn *isert_conn)
 	rx_desc = isert_conn->rx_descs;
 
 	for (i = 0; i < ISERT_QP_MAX_RECV_DTOS; i++, rx_desc++)  {
-		dma_addr = ib_dma_map_single(ib_dev, (void *)rx_desc,
-					ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+		dma_addr = ib_dma_map_single(ib_dev,
+					rx_desc->buf,
+					ISER_RX_SIZE, DMA_FROM_DEVICE);
 		if (ib_dma_mapping_error(ib_dev, dma_addr))
 			goto dma_map_fail;
 
 		rx_desc->dma_addr = dma_addr;
 
 		rx_sg = &rx_desc->rx_sg;
-		rx_sg->addr = rx_desc->dma_addr;
+		rx_sg->addr = rx_desc->dma_addr + isert_get_hdr_offset(rx_desc);
 		rx_sg->length = ISER_RX_PAYLOAD_SIZE;
 		rx_sg->lkey = device->pd->local_dma_lkey;
 		rx_desc->rx_cqe.done = isert_recv_done;
@@ -160,7 +161,7 @@ isert_alloc_rx_descriptors(struct isert_conn *isert_conn)
 	rx_desc = isert_conn->rx_descs;
 	for (j = 0; j < i; j++, rx_desc++) {
 		ib_dma_unmap_single(ib_dev, rx_desc->dma_addr,
-				    ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+				    ISER_RX_SIZE, DMA_FROM_DEVICE);
 	}
 	kfree(isert_conn->rx_descs);
 	isert_conn->rx_descs = NULL;
@@ -181,7 +182,7 @@ isert_free_rx_descriptors(struct isert_conn *isert_conn)
 	rx_desc = isert_conn->rx_descs;
 	for (i = 0; i < ISERT_QP_MAX_RECV_DTOS; i++, rx_desc++)  {
 		ib_dma_unmap_single(ib_dev, rx_desc->dma_addr,
-				    ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+				    ISER_RX_SIZE, DMA_FROM_DEVICE);
 	}
 
 	kfree(isert_conn->rx_descs);
@@ -299,10 +300,10 @@ isert_free_login_buf(struct isert_conn *isert_conn)
 			    ISER_RX_PAYLOAD_SIZE, DMA_TO_DEVICE);
 	kfree(isert_conn->login_rsp_buf);
 
-	ib_dma_unmap_single(ib_dev, isert_conn->login_req_dma,
-			    ISER_RX_PAYLOAD_SIZE,
+	ib_dma_unmap_single(ib_dev, isert_conn->login_desc->dma_addr,
+			    ISER_RX_SIZE,
 			    DMA_FROM_DEVICE);
-	kfree(isert_conn->login_req_buf);
+	kfree(isert_conn->login_desc);
 }
 
 static int
@@ -311,25 +312,25 @@ isert_alloc_login_buf(struct isert_conn *isert_conn,
 {
 	int ret;
 
-	isert_conn->login_req_buf = kzalloc(sizeof(*isert_conn->login_req_buf),
+	isert_conn->login_desc = kzalloc(sizeof(*isert_conn->login_desc),
 			GFP_KERNEL);
-	if (!isert_conn->login_req_buf)
+	if (!isert_conn->login_desc)
 		return -ENOMEM;
 
-	isert_conn->login_req_dma = ib_dma_map_single(ib_dev,
-				isert_conn->login_req_buf,
-				ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
-	ret = ib_dma_mapping_error(ib_dev, isert_conn->login_req_dma);
+	isert_conn->login_desc->dma_addr = ib_dma_map_single(ib_dev,
+				isert_conn->login_desc->buf,
+				ISER_RX_SIZE, DMA_FROM_DEVICE);
+	ret = ib_dma_mapping_error(ib_dev, isert_conn->login_desc->dma_addr);
 	if (ret) {
-		isert_err("login_req_dma mapping error: %d\n", ret);
-		isert_conn->login_req_dma = 0;
-		goto out_free_login_req_buf;
+		isert_err("login_desc dma mapping error: %d\n", ret);
+		isert_conn->login_desc->dma_addr = 0;
+		goto out_free_login_desc;
 	}
 
 	isert_conn->login_rsp_buf = kzalloc(ISER_RX_PAYLOAD_SIZE, GFP_KERNEL);
 	if (!isert_conn->login_rsp_buf) {
 		ret = -ENOMEM;
-		goto out_unmap_login_req_buf;
+		goto out_unmap_login_desc;
 	}
 
 	isert_conn->login_rsp_dma = ib_dma_map_single(ib_dev,
@@ -346,11 +347,11 @@ isert_alloc_login_buf(struct isert_conn *isert_conn,
 
 out_free_login_rsp_buf:
 	kfree(isert_conn->login_rsp_buf);
-out_unmap_login_req_buf:
-	ib_dma_unmap_single(ib_dev, isert_conn->login_req_dma,
-			    ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
-out_free_login_req_buf:
-	kfree(isert_conn->login_req_buf);
+out_unmap_login_desc:
+	ib_dma_unmap_single(ib_dev, isert_conn->login_desc->dma_addr,
+			    ISER_RX_SIZE, DMA_FROM_DEVICE);
+out_free_login_desc:
+	kfree(isert_conn->login_desc);
 	return ret;
 }
 
@@ -476,7 +477,7 @@ isert_connect_release(struct isert_conn *isert_conn)
 	if (isert_conn->qp)
 		isert_destroy_qp(isert_conn);
 
-	if (isert_conn->login_req_buf)
+	if (isert_conn->login_desc)
 		isert_free_login_buf(isert_conn);
 
 	isert_device_put(device);
@@ -862,17 +863,18 @@ isert_login_post_recv(struct isert_conn *isert_conn)
 	int ret;
 
 	memset(&sge, 0, sizeof(struct ib_sge));
-	sge.addr = isert_conn->login_req_dma;
+	sge.addr = isert_conn->login_desc->dma_addr +
+		isert_get_hdr_offset(isert_conn->login_desc);
 	sge.length = ISER_RX_PAYLOAD_SIZE;
 	sge.lkey = isert_conn->device->pd->local_dma_lkey;
 
 	isert_dbg("Setup sge: addr: %llx length: %d 0x%08x\n",
 		sge.addr, sge.length, sge.lkey);
 
-	isert_conn->login_req_buf->rx_cqe.done = isert_login_recv_done;
+	isert_conn->login_desc->rx_cqe.done = isert_login_recv_done;
 
 	memset(&rx_wr, 0, sizeof(struct ib_recv_wr));
-	rx_wr.wr_cqe = &isert_conn->login_req_buf->rx_cqe;
+	rx_wr.wr_cqe = &isert_conn->login_desc->rx_cqe;
 	rx_wr.sg_list = &sge;
 	rx_wr.num_sge = 1;
 
@@ -949,7 +951,7 @@ isert_put_login_tx(struct iscsi_conn *conn, struct iscsi_login *login,
 static void
 isert_rx_login_req(struct isert_conn *isert_conn)
 {
-	struct iser_rx_desc *rx_desc = isert_conn->login_req_buf;
+	struct iser_rx_desc *rx_desc = isert_conn->login_desc;
 	int rx_buflen = isert_conn->login_req_len;
 	struct iscsi_conn *conn = isert_conn->conn;
 	struct iscsi_login *login = conn->conn_login;
@@ -961,7 +963,7 @@ isert_rx_login_req(struct isert_conn *isert_conn)
 
 	if (login->first_request) {
 		struct iscsi_login_req *login_req =
-			(struct iscsi_login_req *)&rx_desc->iscsi_header;
+			(struct iscsi_login_req *)isert_get_iscsi_hdr(rx_desc);
 		/*
 		 * Setup the initial iscsi_login values from the leading
 		 * login request PDU.
@@ -980,13 +982,13 @@ isert_rx_login_req(struct isert_conn *isert_conn)
 		login->tsih		= be16_to_cpu(login_req->tsih);
 	}
 
-	memcpy(&login->req[0], (void *)&rx_desc->iscsi_header, ISCSI_HDR_LEN);
+	memcpy(&login->req[0], isert_get_iscsi_hdr(rx_desc), ISCSI_HDR_LEN);
 
 	size = min(rx_buflen, MAX_KEY_VALUE_PAIRS);
 	isert_dbg("Using login payload size: %d, rx_buflen: %d "
 		  "MAX_KEY_VALUE_PAIRS: %d\n", size, rx_buflen,
 		  MAX_KEY_VALUE_PAIRS);
-	memcpy(login->req_buf, &rx_desc->data[0], size);
+	memcpy(login->req_buf, isert_get_data(rx_desc), size);
 
 	if (login->first_request) {
 		complete(&isert_conn->login_comp);
@@ -1051,14 +1053,15 @@ isert_handle_scsi_cmd(struct isert_conn *isert_conn,
 	if (imm_data_len != data_len) {
 		sg_nents = max(1UL, DIV_ROUND_UP(imm_data_len, PAGE_SIZE));
 		sg_copy_from_buffer(cmd->se_cmd.t_data_sg, sg_nents,
-				    &rx_desc->data[0], imm_data_len);
+				    isert_get_data(rx_desc), imm_data_len);
 		isert_dbg("Copy Immediate sg_nents: %u imm_data_len: %d\n",
 			  sg_nents, imm_data_len);
 	} else {
 		sg_init_table(&isert_cmd->sg, 1);
 		cmd->se_cmd.t_data_sg = &isert_cmd->sg;
 		cmd->se_cmd.t_data_nents = 1;
-		sg_set_buf(&isert_cmd->sg, &rx_desc->data[0], imm_data_len);
+		sg_set_buf(&isert_cmd->sg, isert_get_data(rx_desc),
+				imm_data_len);
 		isert_dbg("Transfer Immediate imm_data_len: %d\n",
 			  imm_data_len);
 	}
@@ -1127,9 +1130,9 @@ isert_handle_iscsi_dataout(struct isert_conn *isert_conn,
 	}
 	isert_dbg("Copying DataOut: sg_start: %p, sg_off: %u "
 		  "sg_nents: %u from %p %u\n", sg_start, sg_off,
-		  sg_nents, &rx_desc->data[0], unsol_data_len);
+		  sg_nents, isert_get_data(rx_desc), unsol_data_len);
 
-	sg_copy_from_buffer(sg_start, sg_nents, &rx_desc->data[0],
+	sg_copy_from_buffer(sg_start, sg_nents, isert_get_data(rx_desc),
 			    unsol_data_len);
 
 	rc = iscsit_check_dataout_payload(cmd, hdr, false);
@@ -1188,7 +1191,7 @@ isert_handle_text_cmd(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd
 	}
 	cmd->text_in_ptr = text_in;
 
-	memcpy(cmd->text_in_ptr, &rx_desc->data[0], payload_length);
+	memcpy(cmd->text_in_ptr, isert_get_data(rx_desc), payload_length);
 
 	return iscsit_process_text_cmd(conn, cmd, hdr);
 }
@@ -1198,7 +1201,7 @@ isert_rx_opcode(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc,
 		uint32_t read_stag, uint64_t read_va,
 		uint32_t write_stag, uint64_t write_va)
 {
-	struct iscsi_hdr *hdr = &rx_desc->iscsi_header;
+	struct iscsi_hdr *hdr = isert_get_iscsi_hdr(rx_desc);
 	struct iscsi_conn *conn = isert_conn->conn;
 	struct iscsi_cmd *cmd;
 	struct isert_cmd *isert_cmd;
@@ -1296,8 +1299,8 @@ isert_recv_done(struct ib_cq *cq, struct ib_wc *wc)
 	struct isert_conn *isert_conn = wc->qp->qp_context;
 	struct ib_device *ib_dev = isert_conn->cm_id->device;
 	struct iser_rx_desc *rx_desc = cqe_to_rx_desc(wc->wr_cqe);
-	struct iscsi_hdr *hdr = &rx_desc->iscsi_header;
-	struct iser_ctrl *iser_ctrl = &rx_desc->iser_header;
+	struct iscsi_hdr *hdr = isert_get_iscsi_hdr(rx_desc);
+	struct iser_ctrl *iser_ctrl = isert_get_iser_hdr(rx_desc);
 	uint64_t read_va = 0, write_va = 0;
 	uint32_t read_stag = 0, write_stag = 0;
 
@@ -1311,7 +1314,7 @@ isert_recv_done(struct ib_cq *cq, struct ib_wc *wc)
 	rx_desc->in_use = true;
 
 	ib_dma_sync_single_for_cpu(ib_dev, rx_desc->dma_addr,
-			ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+			ISER_RX_SIZE, DMA_FROM_DEVICE);
 
 	isert_dbg("DMA: 0x%llx, iSCSI opcode: 0x%02x, ITT: 0x%08x, flags: 0x%02x dlen: %d\n",
 		 rx_desc->dma_addr, hdr->opcode, hdr->itt, hdr->flags,
@@ -1346,7 +1349,7 @@ isert_recv_done(struct ib_cq *cq, struct ib_wc *wc)
 			read_stag, read_va, write_stag, write_va);
 
 	ib_dma_sync_single_for_device(ib_dev, rx_desc->dma_addr,
-			ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+			ISER_RX_SIZE, DMA_FROM_DEVICE);
 }
 
 static void
@@ -1360,8 +1363,8 @@ isert_login_recv_done(struct ib_cq *cq, struct ib_wc *wc)
 		return;
 	}
 
-	ib_dma_sync_single_for_cpu(ib_dev, isert_conn->login_req_dma,
-			ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+	ib_dma_sync_single_for_cpu(ib_dev, isert_conn->login_desc->dma_addr,
+			ISER_RX_SIZE, DMA_FROM_DEVICE);
 
 	isert_conn->login_req_len = wc->byte_len - ISER_HEADERS_LEN;
 
@@ -1376,8 +1379,8 @@ isert_login_recv_done(struct ib_cq *cq, struct ib_wc *wc)
 	complete(&isert_conn->login_req_comp);
 	mutex_unlock(&isert_conn->mutex);
 
-	ib_dma_sync_single_for_device(ib_dev, isert_conn->login_req_dma,
-				ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+	ib_dma_sync_single_for_device(ib_dev, isert_conn->login_desc->dma_addr,
+				ISER_RX_SIZE, DMA_FROM_DEVICE);
 }
 
 static void
diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h
index c55f7d9bfced..7fee4a65e181 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.h
+++ b/drivers/infiniband/ulp/isert/ib_isert.h
@@ -59,9 +59,11 @@
 				ISERT_MAX_TX_MISC_PDUS	+ \
 				ISERT_MAX_RX_MISC_PDUS)
 
-#define ISER_RX_PAD_SIZE	(ISCSI_DEF_MAX_RECV_SEG_LEN + 4096 - \
-		(ISER_RX_PAYLOAD_SIZE + sizeof(u64) + sizeof(struct ib_sge) + \
-		 sizeof(struct ib_cqe) + sizeof(bool)))
+/*
+ * RX size is default of 8k plus headers, but data needs to align to
+ * 512 boundary, so use 1024 to have the extra space for alignment.
+ */
+#define ISER_RX_SIZE		(ISCSI_DEF_MAX_RECV_SEG_LEN + 1024)
 
 /* Maximum support is 16MB I/O size */
 #define ISCSI_ISER_MAX_SG_TABLESIZE	4096
@@ -81,21 +83,41 @@ enum iser_conn_state {
 };
 
 struct iser_rx_desc {
-	struct iser_ctrl iser_header;
-	struct iscsi_hdr iscsi_header;
-	char		data[ISCSI_DEF_MAX_RECV_SEG_LEN];
+	char		buf[ISER_RX_SIZE];
 	u64		dma_addr;
 	struct ib_sge	rx_sg;
 	struct ib_cqe	rx_cqe;
 	bool		in_use;
-	char		pad[ISER_RX_PAD_SIZE];
-} __packed;
+};
 
 static inline struct iser_rx_desc *cqe_to_rx_desc(struct ib_cqe *cqe)
 {
 	return container_of(cqe, struct iser_rx_desc, rx_cqe);
 }
 
+static void *isert_get_iser_hdr(struct iser_rx_desc *desc)
+{
+	return PTR_ALIGN(desc->buf + ISER_HEADERS_LEN, 512) - ISER_HEADERS_LEN;
+}
+
+static size_t isert_get_hdr_offset(struct iser_rx_desc *desc)
+{
+	return isert_get_iser_hdr(desc) - (void *)desc->buf;
+}
+
+static void *isert_get_iscsi_hdr(struct iser_rx_desc *desc)
+{
+	return isert_get_iser_hdr(desc) + sizeof(struct iser_ctrl);
+}
+
+static void *isert_get_data(struct iser_rx_desc *desc)
+{
+	void *data = isert_get_iser_hdr(desc) + ISER_HEADERS_LEN;
+
+	WARN_ON((uintptr_t)data & 511);
+	return data;
+}
+
 struct iser_tx_desc {
 	struct iser_ctrl iser_header;
 	struct iscsi_hdr iscsi_header;
@@ -142,9 +164,8 @@ struct isert_conn {
 	u32			responder_resources;
 	u32			initiator_depth;
 	bool			pi_support;
-	struct iser_rx_desc	*login_req_buf;
+	struct iser_rx_desc	*login_desc;
 	char			*login_rsp_buf;
-	u64			login_req_dma;
 	int			login_req_len;
 	u64			login_rsp_dma;
 	struct iser_rx_desc	*rx_descs;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH] IB/isert: fix unaligned immediate-data handling
  2020-08-30 10:32 [PATCH] IB/isert: fix unaligned immediate-data handling Sagi Grimberg
@ 2020-08-31  5:14 ` Christoph Hellwig
  2020-08-31  5:37   ` Doug Dumitru
  2020-08-31 12:18 ` Jason Gunthorpe
  1 sibling, 1 reply; 14+ messages in thread
From: Christoph Hellwig @ 2020-08-31  5:14 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: linux-rdma, Max Gurtovoy, Stephen Rust, Doug Dumitru

On Sun, Aug 30, 2020 at 03:32:09AM -0700, Sagi Grimberg wrote:
> Currently we allocate rx buffers in a single contiguous buffers
> for headers (iser and iscsi) and data trailer. This means
> that most likely the data starting offset is aligned to 76
> bytes (size of both headers).
> 
> This worked fine for years, but at some point this broke.
> To fix this, we should avoid passing unaligned buffers for
> I/O.
> 
> Instead, we allocate our recv buffers with some extra space
> such that we can have the data portion align to 512 byte boundary.
> This also means that we cannot reference headers or data using
> structure but rather accessors (as they may move based on alignment).
> Also, get rid of the wrong __packed annotation from iser_rx_desc
> as this has only harmful effects (not aligned to anything).
> 
> This affects the rx descriptors for iscsi login and data plane.

What are the symptoms of the breakage?

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] IB/isert: fix unaligned immediate-data handling
  2020-08-31  5:14 ` Christoph Hellwig
@ 2020-08-31  5:37   ` Doug Dumitru
  0 siblings, 0 replies; 14+ messages in thread
From: Doug Dumitru @ 2020-08-31  5:37 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Sagi Grimberg, linux-rdma, Max Gurtovoy, Stephen Rust

On Sun, Aug 30, 2020 at 10:14 PM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Sun, Aug 30, 2020 at 03:32:09AM -0700, Sagi Grimberg wrote:
> > Currently we allocate rx buffers in a single contiguous buffers
> > for headers (iser and iscsi) and data trailer. This means
> > that most likely the data starting offset is aligned to 76
> > bytes (size of both headers).
> >
> > This worked fine for years, but at some point this broke.
> > To fix this, we should avoid passing unaligned buffers for
> > I/O.
> >
> > Instead, we allocate our recv buffers with some extra space
> > such that we can have the data portion align to 512 byte boundary.
> > This also means that we cannot reference headers or data using
> > structure but rather accessors (as they may move based on alignment).
> > Also, get rid of the wrong __packed annotation from iser_rx_desc
> > as this has only harmful effects (not aligned to anything).
> >
> > This affects the rx descriptors for iscsi login and data plane.
>
> What are the symptoms of the breakage?

You don't get the same blocks back that you just wrote.

A simple program that tags sectors with sector numbers gets blocks
that are shifted by 72 bytes.  Shows up basically instantly.
Completely non-functional for pretty much any application.

I tested this on 5.4.61 with both Mellanox Connect X3 (ROCE) and
Chelsio T580 cards (iWarp).  Both required that ImmediateData be
turned off on the target side before the patch.  After the patch, it
works both ways.

This is from a thread that started late last year and went thru about
March.  Sagi then seems to drop it, until I pinged him on it a few
days ago.  He asked me to test the patch and then I did.  My tests
were only two systems and I did not dig into the patch.  I don't think
Sagi has the hardware to test this end to end himself.

I am not sure how much the ImmediateData hurts performance.  I get
wire speed either way on my single socket E5-1650 v3 server.

Doug Dumitru

[posting twice, forgot to turn plain text on, sorry]

-- 
Doug Dumitru
EasyCo LLC

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] IB/isert: fix unaligned immediate-data handling
  2020-08-30 10:32 [PATCH] IB/isert: fix unaligned immediate-data handling Sagi Grimberg
  2020-08-31  5:14 ` Christoph Hellwig
@ 2020-08-31 12:18 ` Jason Gunthorpe
  2020-08-31 16:29   ` Doug Dumitru
  2020-08-31 16:48   ` Sagi Grimberg
  1 sibling, 2 replies; 14+ messages in thread
From: Jason Gunthorpe @ 2020-08-31 12:18 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: linux-rdma, Max Gurtovoy, Stephen Rust, Doug Dumitru

On Sun, Aug 30, 2020 at 03:32:09AM -0700, Sagi Grimberg wrote:
> Currently we allocate rx buffers in a single contiguous buffers
> for headers (iser and iscsi) and data trailer. This means
> that most likely the data starting offset is aligned to 76
> bytes (size of both headers).
> 
> This worked fine for years, but at some point this broke.
> To fix this, we should avoid passing unaligned buffers for
> I/O.

That is a bit vauge - what suddenly broke it?

Jason

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] IB/isert: fix unaligned immediate-data handling
  2020-08-31 12:18 ` Jason Gunthorpe
@ 2020-08-31 16:29   ` Doug Dumitru
  2020-08-31 16:48   ` Sagi Grimberg
  1 sibling, 0 replies; 14+ messages in thread
From: Doug Dumitru @ 2020-08-31 16:29 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Sagi Grimberg, linux-rdma, Max Gurtovoy, Stephen Rust

On Mon, Aug 31, 2020 at 5:18 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Sun, Aug 30, 2020 at 03:32:09AM -0700, Sagi Grimberg wrote:
> > Currently we allocate rx buffers in a single contiguous buffers
> > for headers (iser and iscsi) and data trailer. This means
> > that most likely the data starting offset is aligned to 76
> > bytes (size of both headers).
> >
> > This worked fine for years, but at some point this broke.
> > To fix this, we should avoid passing unaligned buffers for
> > I/O.
>
> That is a bit vauge - what suddenly broke it?

I will try some regression testing to see when it broke.  As this is
RDMA, it takes real hardware and the build/reboot cycle takes a lot
longer.  I should be able to get to this in the middle of the week.

If anyone has a guess of what version to "look around", please let me know.

Doug

>
> Jason



-- 
Doug Dumitru
EasyCo LLC

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] IB/isert: fix unaligned immediate-data handling
  2020-08-31 12:18 ` Jason Gunthorpe
  2020-08-31 16:29   ` Doug Dumitru
@ 2020-08-31 16:48   ` Sagi Grimberg
       [not found]     ` <CAAFE1bcXNdMD5AR80tSFVoYouTrEg_swR73ba2FVEB5UH7MXLQ@mail.gmail.com>
                       ` (2 more replies)
  1 sibling, 3 replies; 14+ messages in thread
From: Sagi Grimberg @ 2020-08-31 16:48 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: linux-rdma, Max Gurtovoy, Stephen Rust, Doug Dumitru, ming Lei


>> Currently we allocate rx buffers in a single contiguous buffers
>> for headers (iser and iscsi) and data trailer. This means
>> that most likely the data starting offset is aligned to 76
>> bytes (size of both headers).
>>
>> This worked fine for years, but at some point this broke.
>> To fix this, we should avoid passing unaligned buffers for
>> I/O.
> 
> That is a bit vauge - what suddenly broke it?

Somewhere around the multipage bvec work that Ming did. The issue was
that brd assumed a 512 aligned page vector. IIRC the discussion settled
that the block layer expects a 512B aligned buffer(s).

Adding Ming to the thread.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] IB/isert: fix unaligned immediate-data handling
       [not found]     ` <CAAFE1bcXNdMD5AR80tSFVoYouTrEg_swR73ba2FVEB5UH7MXLQ@mail.gmail.com>
@ 2020-08-31 16:59       ` Stephen Rust
  0 siblings, 0 replies; 14+ messages in thread
From: Stephen Rust @ 2020-08-31 16:59 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: linux-rdma, Max Gurtovoy, Stephen Rust, Doug Dumitru, ming Lei,
	Sagi Grimberg

Please see the following original thread for the history, and the git
bisect showing which commit this was introduced. Other consumers of
block layer (eg: xfs) also were required to change their alignment
around this time, to be 512B aligned.

https://lore.kernel.org/linux-rdma/20191203041504.GC6245@ming.t460p/T/#m478c0b2a455a4b897320fa859b89910ffd7b6697

Thanks.

Sagi Grimberg <sagi@grimberg.me>
> >> Currently we allocate rx buffers in a single contiguous buffers
> >> for headers (iser and iscsi) and data trailer. This means
> >> that most likely the data starting offset is aligned to 76
> >> bytes (size of both headers).
> >>
> >> This worked fine for years, but at some point this broke.
> >> To fix this, we should avoid passing unaligned buffers for
> >> I/O.
> >
> > That is a bit vauge - what suddenly broke it?
>
> Somewhere around the multipage bvec work that Ming did. The issue was
> that brd assumed a 512 aligned page vector. IIRC the discussion settled
> that the block layer expects a 512B aligned buffer(s).
>
> Adding Ming to the thread.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] IB/isert: fix unaligned immediate-data handling
  2020-08-31 16:48   ` Sagi Grimberg
       [not found]     ` <CAAFE1bcXNdMD5AR80tSFVoYouTrEg_swR73ba2FVEB5UH7MXLQ@mail.gmail.com>
@ 2020-08-31 17:02     ` Doug Dumitru
  2020-08-31 17:59       ` Jason Gunthorpe
  2020-09-01  1:00     ` Ming Lei
  2 siblings, 1 reply; 14+ messages in thread
From: Doug Dumitru @ 2020-08-31 17:02 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Jason Gunthorpe, linux-rdma, Max Gurtovoy, Stephen Rust, ming Lei

On Mon, Aug 31, 2020 at 9:48 AM Sagi Grimberg <sagi@grimberg.me> wrote:
>
>
> >> Currently we allocate rx buffers in a single contiguous buffers
> >> for headers (iser and iscsi) and data trailer. This means
> >> that most likely the data starting offset is aligned to 76
> >> bytes (size of both headers).
> >>
> >> This worked fine for years, but at some point this broke.
> >> To fix this, we should avoid passing unaligned buffers for
> >> I/O.
> >
> > That is a bit vauge - what suddenly broke it?
>
> Somewhere around the multipage bvec work that Ming did. The issue was
> that brd assumed a 512 aligned page vector. IIRC the discussion settled
> that the block layer expects a 512B aligned buffer(s).

I will second that block layers expect "at least" 512B aligned
buffers.  Many of them get less efficient when bvecs are not full, 4K
aligned, pages.

>
> Adding Ming to the thread.



-- 
Doug Dumitru
EasyCo LLC

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] IB/isert: fix unaligned immediate-data handling
  2020-08-31 17:02     ` Doug Dumitru
@ 2020-08-31 17:59       ` Jason Gunthorpe
  2020-09-01  5:56         ` Christoph Hellwig
  0 siblings, 1 reply; 14+ messages in thread
From: Jason Gunthorpe @ 2020-08-31 17:59 UTC (permalink / raw)
  To: doug; +Cc: Sagi Grimberg, linux-rdma, Max Gurtovoy, Stephen Rust, ming Lei

On Mon, Aug 31, 2020 at 10:02:15AM -0700, Doug Dumitru wrote:
> On Mon, Aug 31, 2020 at 9:48 AM Sagi Grimberg <sagi@grimberg.me> wrote:
> >
> >
> > >> Currently we allocate rx buffers in a single contiguous buffers
> > >> for headers (iser and iscsi) and data trailer. This means
> > >> that most likely the data starting offset is aligned to 76
> > >> bytes (size of both headers).
> > >>
> > >> This worked fine for years, but at some point this broke.
> > >> To fix this, we should avoid passing unaligned buffers for
> > >> I/O.
> > >
> > > That is a bit vauge - what suddenly broke it?
> >
> > Somewhere around the multipage bvec work that Ming did. The issue was
> > that brd assumed a 512 aligned page vector. IIRC the discussion settled
> > that the block layer expects a 512B aligned buffer(s).
> 
> I will second that block layers expect "at least" 512B aligned
> buffers.  Many of them get less efficient when bvecs are not full, 4K
> aligned, pages.

Can we put all this info in the commit message please?

Thanks,
Jason

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] IB/isert: fix unaligned immediate-data handling
  2020-08-31 16:48   ` Sagi Grimberg
       [not found]     ` <CAAFE1bcXNdMD5AR80tSFVoYouTrEg_swR73ba2FVEB5UH7MXLQ@mail.gmail.com>
  2020-08-31 17:02     ` Doug Dumitru
@ 2020-09-01  1:00     ` Ming Lei
  2020-09-01  1:10       ` Sagi Grimberg
  2 siblings, 1 reply; 14+ messages in thread
From: Ming Lei @ 2020-09-01  1:00 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Jason Gunthorpe, linux-rdma, Max Gurtovoy, Stephen Rust,
	Doug Dumitru

On Mon, Aug 31, 2020 at 09:48:12AM -0700, Sagi Grimberg wrote:
> 
> > > Currently we allocate rx buffers in a single contiguous buffers
> > > for headers (iser and iscsi) and data trailer. This means
> > > that most likely the data starting offset is aligned to 76
> > > bytes (size of both headers).
> > > 
> > > This worked fine for years, but at some point this broke.
> > > To fix this, we should avoid passing unaligned buffers for
> > > I/O.
> > 
> > That is a bit vauge - what suddenly broke it?
> 
> Somewhere around the multipage bvec work that Ming did. The issue was
> that brd assumed a 512 aligned page vector. IIRC the discussion settled
> that the block layer expects a 512B aligned buffer(s).

I don't think the limit is from multipage bvec, and the limit is
actually from driver since block layer just sets up the vectors which
is passed to driver, see the following discussion:

https://lore.kernel.org/linux-block/20191204230225.GA26189@ming.t460p/

So what is the exact setting for reproducing the issue? Some underlying
driver does require aligned buffer, such as loop/dio, brd, or when kasan
is involved for buffer allocated via kmalloc().


Thanks, 
Ming


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] IB/isert: fix unaligned immediate-data handling
  2020-09-01  1:00     ` Ming Lei
@ 2020-09-01  1:10       ` Sagi Grimberg
  2020-09-01  2:01         ` Ming Lei
  0 siblings, 1 reply; 14+ messages in thread
From: Sagi Grimberg @ 2020-09-01  1:10 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jason Gunthorpe, linux-rdma, Max Gurtovoy, Stephen Rust,
	Doug Dumitru


>>>> Currently we allocate rx buffers in a single contiguous buffers
>>>> for headers (iser and iscsi) and data trailer. This means
>>>> that most likely the data starting offset is aligned to 76
>>>> bytes (size of both headers).
>>>>
>>>> This worked fine for years, but at some point this broke.
>>>> To fix this, we should avoid passing unaligned buffers for
>>>> I/O.
>>>
>>> That is a bit vauge - what suddenly broke it?
>>
>> Somewhere around the multipage bvec work that Ming did. The issue was
>> that brd assumed a 512 aligned page vector. IIRC the discussion settled
>> that the block layer expects a 512B aligned buffer(s).
> 
> I don't think the limit is from multipage bvec, and the limit is
> actually from driver since block layer just sets up the vectors which
> is passed to driver, see the following discussion:

Understood, fact is that this used to work (which is probably why this
was overlooked) and now it doesn't. Assumed it was related to multipage
bvec without a proof :)

I guess the basis was the original report:

 > Hi,
 >
 > We recently began testing 5.4 in preparation for migration from 4.14. One
 > of our tests found reproducible data corruption in 5.x kernels. The test
 > consists of a few basic single-issue writes to an iSER attached ramdisk.
 > The writes are subsequently verified with single-issue reads. We tracked
 > the corruption down using git bisect. The issue appears to have 
started in
 > 5.1 with the following commit:
 >
 > 3d75ca0adef4280650c6690a0c4702a74a6f3c95 block: introduce multi-page bvec
 > helpers

Anyways, I think it's a reasonable requirement and hence this fix...

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] IB/isert: fix unaligned immediate-data handling
  2020-09-01  1:10       ` Sagi Grimberg
@ 2020-09-01  2:01         ` Ming Lei
  0 siblings, 0 replies; 14+ messages in thread
From: Ming Lei @ 2020-09-01  2:01 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Jason Gunthorpe, linux-rdma, Max Gurtovoy, Stephen Rust,
	Doug Dumitru

On Mon, Aug 31, 2020 at 06:10:38PM -0700, Sagi Grimberg wrote:
> 
> > > > > Currently we allocate rx buffers in a single contiguous buffers
> > > > > for headers (iser and iscsi) and data trailer. This means
> > > > > that most likely the data starting offset is aligned to 76
> > > > > bytes (size of both headers).
> > > > > 
> > > > > This worked fine for years, but at some point this broke.
> > > > > To fix this, we should avoid passing unaligned buffers for
> > > > > I/O.
> > > > 
> > > > That is a bit vauge - what suddenly broke it?
> > > 
> > > Somewhere around the multipage bvec work that Ming did. The issue was
> > > that brd assumed a 512 aligned page vector. IIRC the discussion settled
> > > that the block layer expects a 512B aligned buffer(s).
> > 
> > I don't think the limit is from multipage bvec, and the limit is
> > actually from driver since block layer just sets up the vectors which
> > is passed to driver, see the following discussion:
> 
> Understood, fact is that this used to work (which is probably why this
> was overlooked) and now it doesn't. Assumed it was related to multipage
> bvec without a proof :)
> 
> I guess the basis was the original report:
> 
> > Hi,
> >
> > We recently began testing 5.4 in preparation for migration from 4.14. One
> > of our tests found reproducible data corruption in 5.x kernels. The test
> > consists of a few basic single-issue writes to an iSER attached ramdisk.
> > The writes are subsequently verified with single-issue reads. We tracked
> > the corruption down using git bisect. The issue appears to have started in
> > 5.1 with the following commit:
> >
> > 3d75ca0adef4280650c6690a0c4702a74a6f3c95 block: introduce multi-page bvec
> > helpers
> 
> Anyways, I think it's a reasonable requirement and hence this fix...

Agree, your fix is good, even though we can make brd working, however
there might be other drivers in which DMA alignment limit is broken by
the unaligned buffer from upper layer.


thanks,
Ming


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] IB/isert: fix unaligned immediate-data handling
  2020-08-31 17:59       ` Jason Gunthorpe
@ 2020-09-01  5:56         ` Christoph Hellwig
  2020-09-01  7:42           ` Sagi Grimberg
  0 siblings, 1 reply; 14+ messages in thread
From: Christoph Hellwig @ 2020-09-01  5:56 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: doug, Sagi Grimberg, linux-rdma, Max Gurtovoy, Stephen Rust,
	ming Lei

On Mon, Aug 31, 2020 at 02:59:31PM -0300, Jason Gunthorpe wrote:
> Can we put all this info in the commit message please?

Yes, please.  Note that the block layer actually reports the
required dma alignment.  The worst case is the logical block size,
but many devices require less alignment.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] IB/isert: fix unaligned immediate-data handling
  2020-09-01  5:56         ` Christoph Hellwig
@ 2020-09-01  7:42           ` Sagi Grimberg
  0 siblings, 0 replies; 14+ messages in thread
From: Sagi Grimberg @ 2020-09-01  7:42 UTC (permalink / raw)
  To: Christoph Hellwig, Jason Gunthorpe
  Cc: doug, linux-rdma, Max Gurtovoy, Stephen Rust, ming Lei


>> Can we put all this info in the commit message please?

Sent out v2 with an updated commit message.

> Yes, please.  Note that the block layer actually reports the
> required dma alignment.  The worst case is the logical block size,
> but many devices require less alignment.

Given that we don't know the I/O target when we get the command
with the data, its better to make sure the worst case alignment
is met rather than to bounce afterwards when we find an alignment
issue.

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2020-09-01  7:42 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2020-08-30 10:32 [PATCH] IB/isert: fix unaligned immediate-data handling Sagi Grimberg
2020-08-31  5:14 ` Christoph Hellwig
2020-08-31  5:37   ` Doug Dumitru
2020-08-31 12:18 ` Jason Gunthorpe
2020-08-31 16:29   ` Doug Dumitru
2020-08-31 16:48   ` Sagi Grimberg
     [not found]     ` <CAAFE1bcXNdMD5AR80tSFVoYouTrEg_swR73ba2FVEB5UH7MXLQ@mail.gmail.com>
2020-08-31 16:59       ` Stephen Rust
2020-08-31 17:02     ` Doug Dumitru
2020-08-31 17:59       ` Jason Gunthorpe
2020-09-01  5:56         ` Christoph Hellwig
2020-09-01  7:42           ` Sagi Grimberg
2020-09-01  1:00     ` Ming Lei
2020-09-01  1:10       ` Sagi Grimberg
2020-09-01  2:01         ` Ming Lei

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox