* [PATCH] rdma: Fix multiple disk performance degradation
@ 2024-09-14 5:43 zhengbing.huang
From: zhengbing.huang @ 2024-09-14 5:43 UTC
To: drbd-dev
In performance testing of the rdma transport mode, we found that when
two drbd disks were put under high-pressure write I/O at the same time,
the IOPS of each drbd disk dropped by half.

The reason is that when the cq_attr.comp_vector parameter is left
unspecified while creating send_cq and recv_cq, all CQs end up on the
same completion vector and are therefore serviced by the same IRQ.
When multiple disks are stress-tested at the same time, that single
IRQ cannot be serviced in a timely manner and performance degrades.

The fix is to derive the comp_vector index from the local network port
number, so that the CQs of different disks are spread across different
IRQs.
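
For illustration, assuming a device with num_comp_vectors = 8 (an
arbitrary example value, not taken from any particular HCA), the
mapping works out as follows:

	/*
	 * Path on port 7789: comp_vectors = {15578, 15579}
	 *   recv_cq -> 15578 % 8 = 2,  send_cq -> 15579 % 8 = 3
	 * Path on port 7790: comp_vectors = {15580, 15581}
	 *   recv_cq -> 15580 % 8 = 4,  send_cq -> 15581 % 8 = 5
	 */

so the completions of the two disks are serviced by different IRQs.
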
Signed-off-by: zhengbing.huang <zhengbing.huang@easystack.cn>
---
drbd/drbd_transport_rdma.c | 44 ++++++++++++++++++++++++++++++++++----
1 file changed, 40 insertions(+), 4 deletions(-)
diff --git a/drbd/drbd_transport_rdma.c b/drbd/drbd_transport_rdma.c
index 813787c28..8915e60d1 100644
--- a/drbd/drbd_transport_rdma.c
+++ b/drbd/drbd_transport_rdma.c
@@ -2486,10 +2486,43 @@ static int dtr_init_flow(struct dtr_path *path, enum drbd_stream stream)
return err;
}
+static int dtr_get_my_port(struct dtr_path *path)
+{
+ int port = 0;
+ struct sockaddr_storage *addr = (struct sockaddr_storage *)&path->path.my_addr;
+
+ if (addr->ss_family == AF_INET6) {
+ const struct sockaddr_in6 *v6a = (const struct sockaddr_in6 *)addr;
+
+ port = be16_to_cpu(v6a->sin6_port);
+ } else /* AF_INET, AF_SSOCKS, AF_SDP */ {
+ const struct sockaddr_in *v4a = (const struct sockaddr_in *)addr;
+
+ port = be16_to_cpu(v4a->sin_port);
+ }
+
+ return port;
+}
+
+static void dtr_get_comp_vectors(struct dtr_path *path, int cq_num, int *comp_vectors)
+{
+ int i;
+ int tmp_comp_vector = dtr_get_my_port(path) * cq_num;
+
+ for (i = 0; i < cq_num; i++) {
+ comp_vectors[i] = tmp_comp_vector + i;
+ }
+
+ return;
+}
+
static int _dtr_cm_alloc_rdma_res(struct dtr_cm *cm,
enum dtr_alloc_rdma_res_causes *cause)
{
- int err, i, rx_descs_max = 0, tx_descs_max = 0;
+ int err, i, cq_index = 0, rx_descs_max = 0, tx_descs_max = 0;
+ int cq_num = 2; /* recv_cq and send_cq */
+ int comp_vectors[2] = {0}; /* recv_cq and send_cq */
+ struct ib_device *device = cm->id->device;
struct ib_cq_init_attr cq_attr = {};
struct dtr_path *path = cm->path;
@@ -2504,16 +2537,18 @@ static int _dtr_cm_alloc_rdma_res(struct dtr_cm *cm,
/* in 4.9 ib_alloc_pd got the ability to specify flags as second param */
/* so far we don't use flags, but if we start using them, we have to be
* aware that the compat layer removes this parameter for old kernels */
- cm->pd = ib_alloc_pd(cm->id->device, 0);
+ cm->pd = ib_alloc_pd(device, 0);
if (IS_ERR(cm->pd)) {
*cause = IB_ALLOC_PD;
err = PTR_ERR(cm->pd);
goto pd_failed;
}
+ dtr_get_comp_vectors(path, cq_num, comp_vectors);
/* create recv completion queue (CQ) */
cq_attr.cqe = rx_descs_max;
- cm->recv_cq = ib_create_cq(cm->id->device,
+ cq_attr.comp_vector = comp_vectors[cq_index++] % device->num_comp_vectors;
+ cm->recv_cq = ib_create_cq(device,
dtr_rx_cq_event_handler, NULL, cm,
&cq_attr);
if (IS_ERR(cm->recv_cq)) {
@@ -2524,7 +2559,8 @@ static int _dtr_cm_alloc_rdma_res(struct dtr_cm *cm,
/* create send completion queue (CQ) */
cq_attr.cqe = tx_descs_max;
- cm->send_cq = ib_create_cq(cm->id->device,
+ cq_attr.comp_vector = comp_vectors[cq_index] % device->num_comp_vectors;
+ cm->send_cq = ib_create_cq(device,
dtr_tx_cq_event_handler, NULL, cm,
&cq_attr);
if (IS_ERR(cm->send_cq)) {
--
2.17.1
* Re: [PATCH] rdma: Fix multiple disk performance degradation
From: Philipp Reisner @ 2024-09-17 12:29 UTC
To: zhengbing.huang; +Cc: drbd-dev
Hi Zhengbing,
That patch looks interesting.
Please allow me a few days for a proper reply. Currently, I am at a conference.
best regards,
Philipp
On Sat, Sep 14, 2024 at 8:12 AM zhengbing.huang
<zhengbing.huang@easystack.cn> wrote:
> [...]
* Re: [PATCH] rdma: Fix multiple disk performance degradation
From: Philipp Reisner @ 2024-10-16 16:48 UTC
To: zhengbing.huang; +Cc: drbd-dev
Hi Zhengbing,
My impression is that using the port number to distribute the IRQs is
a hack. What about allocating an array that tracks how many CMs are
assigned to each IRQ, and always picking the IRQ that is used by the
fewest other CMs?
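
For illustration only, a minimal sketch of that bookkeeping (all names
here are hypothetical, not existing DRBD code; it assumes one usage
counter per completion vector of the ib_device, protected by a
spinlock):

#include <linux/spinlock.h>

/* One counter per completion vector of an ib_device. The lock and
 * cm_count array would be set up at device init, e.g. with
 * spin_lock_init() and kcalloc(). */
struct dtr_vector_usage {
	spinlock_t lock;
	int num_comp_vectors;	/* copied from ib_device->num_comp_vectors */
	int *cm_count;		/* CMs currently assigned to each vector */
};

/* Claim the completion vector with the fewest assigned CMs. */
static int dtr_claim_comp_vector(struct dtr_vector_usage *u)
{
	int i, best = 0;

	spin_lock(&u->lock);
	for (i = 1; i < u->num_comp_vectors; i++)
		if (u->cm_count[i] < u->cm_count[best])
			best = i;
	u->cm_count[best]++;
	spin_unlock(&u->lock);

	return best;
}

/* Release the claim again when the CM's CQs are destroyed. */
static void dtr_release_comp_vector(struct dtr_vector_usage *u, int vector)
{
	spin_lock(&u->lock);
	u->cm_count[vector]--;
	spin_unlock(&u->lock);
}
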
best regards,
Philipp
On Sat, Sep 14, 2024 at 8:12 AM zhengbing.huang
<zhengbing.huang@easystack.cn> wrote:
> [...]
* Re: Re: [PATCH] rdma: Fix multiple disk performance degradation
From: Zhengbing @ 2024-10-17 6:35 UTC
To: Philipp Reisner; +Cc: drbd-dev
Hi Philipp,
This is indeed a better idea.
I will implement it that way.
best regards,
zhengbing
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: 2024-10-17 00:48:26
To: "zhengbing.huang" <zhengbing.huang@easystack.cn>
Cc: drbd-dev@lists.linbit.com
Subject: Re: [PATCH] rdma: Fix multiple disk performance degradation

>Hi Zhengbing,
>
>My impression is that using the port number to distribute the IRQs is
>a hack. What about allocating an array that tracks how many CMs are
>assigned to each IRQ, and always picking the IRQ that is used by the
>fewest other CMs?
>
>best regards,
> Philipp
>
>On Sat, Sep 14, 2024 at 8:12 AM zhengbing.huang
><zhengbing.huang@easystack.cn> wrote:
>> [...]