* [PATCH] libibverbs: Add huge page support to ibv_madvise_range()
@ 2009-11-29 17:08 Alex Vainman
[not found] ` <4B12AA78.7090401-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
0 siblings, 1 reply; 16+ messages in thread
From: Alex Vainman @ 2009-11-29 17:08 UTC (permalink / raw)
To: roland; +Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA
ibv_reg_mr() fails to register a memory region allocated on huge page and not
the default page size. This happens because ibv_madvise_range() aligns memory
region to the default system page size before calling to madvise() which fails
with EINVAL error. madvise() fails because it expects that the start and end
pointer of the memory range be huge page aligned.
Patch handles the issue by:
1. ibv_fork_init() gets kernel's default huge page size in addition
to the default page size.
2. ibv_madvise_range() first tries aligning users memory range to default
page size and if madvise() fails with EINVAL error then it tries to align
users memory range by huge page size and tries madvise() again.
Signed-off-by: Alex Vaynman <alexv-smomgflXvOZWk0Htik3J/w@public.gmane.org>
---
src/memory.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 68 insertions(+), 1 deletions(-)
diff --git a/src/memory.c b/src/memory.c
index 550015a..73db083 100644
--- a/src/memory.c
+++ b/src/memory.c
@@ -40,6 +40,9 @@
#include <unistd.h>
#include <stdlib.h>
#include <stdint.h>
+#include <ctype.h>
+#include <fcntl.h>
+#include <string.h>
#include "ibverbs.h"
@@ -54,6 +57,8 @@
#define MADV_DOFORK 11
#endif
+#define MEMINFO_SIZE 2048
+
struct ibv_mem_node {
enum {
IBV_RED,
@@ -68,8 +73,51 @@ struct ibv_mem_node {
static struct ibv_mem_node *mm_root;
static pthread_mutex_t mm_mutex = PTHREAD_MUTEX_INITIALIZER;
static int page_size;
+static int huge_page_size;
static int too_late;
+/*
+ * Get the kernel default huge page size.
+ */
+static int get_huge_page_size()
+{
+ int fd;
+ char buf[MEMINFO_SIZE];
+ int mem_file_len;
+ char *p_hpage_val = NULL;
+ char *end_pointer = NULL;
+ char file_name[] = "/proc/meminfo";
+ const char label[] = "Hugepagesize:";
+ int ret_val = 0;
+
+ fd = open(file_name, O_RDONLY);
+ if (fd < 0)
+ return fd;
+
+ mem_file_len = read(fd, buf, sizeof(buf) - 1);
+
+ close(fd);
+ if (mem_file_len < 0)
+ return mem_file_len;
+
+ buf[mem_file_len] = '\0';
+
+ p_hpage_val = strstr(buf, label);
+ if (!p_hpage_val) {
+ errno = EINVAL;
+ return -1;
+ }
+ p_hpage_val += strlen(label);
+
+ errno = 0;
+ ret_val = strtol(p_hpage_val, &end_pointer, 0);
+
+ if (errno != 0)
+ return -1;
+
+ return ret_val * 1024;
+}
+
int ibv_fork_init(void)
{
void *tmp;
@@ -85,6 +133,8 @@ int ibv_fork_init(void)
if (page_size < 0)
return errno;
+ huge_page_size = get_huge_page_size();
+
if (posix_memalign(&tmp, page_size, page_size))
return ENOMEM;
@@ -554,7 +604,8 @@ static struct ibv_mem_node *prepare_to_roll_back(struct ibv_mem_node *node,
return node;
}
-static int ibv_madvise_range(void *base, size_t size, int advice)
+static int ibv_madvise_range_helper(void *base, size_t size, int advice,
+ int page_size)
{
uintptr_t start, end;
struct ibv_mem_node *node, *tmp;
@@ -646,6 +697,22 @@ out:
return ret;
}
+static int ibv_madvise_range(void *base, size_t size, int advice)
+{
+ int ret_val = 0;
+
+ ret_val = ibv_madvise_range_helper(base, size, advice, page_size);
+
+ /*
+ * if memory is backed by huge pages we need to align it
+ * to huge page boundary in order madvise() will succeed.
+ */
+ if (ret_val == -1 && errno == EINVAL && huge_page_size > 0)
+ ret_val = ibv_madvise_range_helper(base, size, advice, huge_page_size);
+
+ return ret_val;
+}
+
int ibv_dontfork_range(void *base, size_t size)
{
if (mm_root)
--
1.5.5
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related [flat|nested] 16+ messages in thread[parent not found: <4B12AA78.7090401-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>]
* Re: [PATCH] libibverbs: Add huge page support to ibv_madvise_range() [not found] ` <4B12AA78.7090401-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> @ 2009-12-08 14:03 ` Alex Vainman [not found] ` <4B1E5CA9.3090707-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> 2010-01-12 14:25 ` Eli Cohen 2010-01-15 18:59 ` Roland Dreier 2 siblings, 1 reply; 16+ messages in thread From: Alex Vainman @ 2009-12-08 14:03 UTC (permalink / raw) To: Roland Dreier Cc: roland, linux-rdma-u79uwXL29TY76Z2rM5mHXA, alexr-smomgflXvOZWk0Htik3J/w Hi Roland, Have you had a chance to look at this patch and at the patch I've sent with this email: "Subject: [PATCH] libibverbs: Add huge page support to ibv_madvise_range() Sent: Sun, 29 Nov 2009 19:08:08 +0200"? Thanks, Alexv Alex Vainman wrote: > ibv_reg_mr() fails to register a memory region allocated on huge page and not > the default page size. This happens because ibv_madvise_range() aligns memory > region to the default system page size before calling to madvise() which fails > with EINVAL error. madvise() fails because it expects that the start and end > pointer of the memory range be huge page aligned. > Patch handles the issue by: > 1. ibv_fork_init() gets kernel's default huge page size in addition > to the default page size. > 2. ibv_madvise_range() first tries aligning users memory range to default > page size and if madvise() fails with EINVAL error then it tries to align > users memory range by huge page size and tries madvise() again. > > Signed-off-by: Alex Vaynman <alexv-smomgflXvOZWk0Htik3J/w@public.gmane.org> > --- > src/memory.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++- > 1 files changed, 68 insertions(+), 1 deletions(-) > > diff --git a/src/memory.c b/src/memory.c > index 550015a..73db083 100644 > --- a/src/memory.c > +++ b/src/memory.c > @@ -40,6 +40,9 @@ > #include <unistd.h> > #include <stdlib.h> > #include <stdint.h> > +#include <ctype.h> > +#include <fcntl.h> > +#include <string.h> > > #include "ibverbs.h" > > @@ -54,6 +57,8 @@ > #define MADV_DOFORK 11 > #endif > > +#define MEMINFO_SIZE 2048 > + > struct ibv_mem_node { > enum { > IBV_RED, > @@ -68,8 +73,51 @@ struct ibv_mem_node { > static struct ibv_mem_node *mm_root; > static pthread_mutex_t mm_mutex = PTHREAD_MUTEX_INITIALIZER; > static int page_size; > +static int huge_page_size; > static int too_late; > > +/* > + * Get the kernel default huge page size. > + */ > +static int get_huge_page_size() > +{ > + int fd; > + char buf[MEMINFO_SIZE]; > + int mem_file_len; > + char *p_hpage_val = NULL; > + char *end_pointer = NULL; > + char file_name[] = "/proc/meminfo"; > + const char label[] = "Hugepagesize:"; > + int ret_val = 0; > + > + fd = open(file_name, O_RDONLY); > + if (fd < 0) > + return fd; > + > + mem_file_len = read(fd, buf, sizeof(buf) - 1); > + > + close(fd); > + if (mem_file_len < 0) > + return mem_file_len; > + > + buf[mem_file_len] = '\0'; > + > + p_hpage_val = strstr(buf, label); > + if (!p_hpage_val) { > + errno = EINVAL; > + return -1; > + } > + p_hpage_val += strlen(label); > + > + errno = 0; > + ret_val = strtol(p_hpage_val, &end_pointer, 0); > + > + if (errno != 0) > + return -1; > + > + return ret_val * 1024; > +} > + > int ibv_fork_init(void) > { > void *tmp; > @@ -85,6 +133,8 @@ int ibv_fork_init(void) > if (page_size < 0) > return errno; > > + huge_page_size = get_huge_page_size(); > + > if (posix_memalign(&tmp, page_size, page_size)) > return ENOMEM; > > @@ -554,7 +604,8 @@ static struct ibv_mem_node *prepare_to_roll_back(struct ibv_mem_node *node, > return node; > } > > -static int ibv_madvise_range(void *base, size_t size, int advice) > +static int ibv_madvise_range_helper(void *base, size_t size, int advice, > + int page_size) > { > uintptr_t start, end; > struct ibv_mem_node *node, *tmp; > @@ -646,6 +697,22 @@ out: > return ret; > } > > +static int ibv_madvise_range(void *base, size_t size, int advice) > +{ > + int ret_val = 0; > + > + ret_val = ibv_madvise_range_helper(base, size, advice, page_size); > + > + /* > + * if memory is backed by huge pages we need to align it > + * to huge page boundary in order madvise() will succeed. > + */ > + if (ret_val == -1 && errno == EINVAL && huge_page_size > 0) > + ret_val = ibv_madvise_range_helper(base, size, advice, huge_page_size); > + > + return ret_val; > +} > + > int ibv_dontfork_range(void *base, size_t size) > { > if (mm_root) -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 16+ messages in thread
[parent not found: <4B1E5CA9.3090707-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>]
* Re: [PATCH] libibverbs: Add huge page support to ibv_madvise_range() [not found] ` <4B1E5CA9.3090707-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> @ 2010-01-12 9:26 ` Alex Vainman 0 siblings, 0 replies; 16+ messages in thread From: Alex Vainman @ 2010-01-12 9:26 UTC (permalink / raw) To: Roland Dreier Cc: roland, linux-rdma-u79uwXL29TY76Z2rM5mHXA, alexr-smomgflXvOZWk0Htik3J/w, alexv-smomgflXvOZWk0Htik3J/w Hi Roland, I would like to ask you if you have an estimation for when you will review the patches below? Thanks, AlexV Alex Vainman wrote: > Hi Roland, > > Have you had a chance to look at this patch and at the patch > I've sent with this email: > "Subject: [PATCH] libibverbs: Add huge page support to ibv_madvise_range() > Sent: Sun, 29 Nov 2009 19:08:08 +0200"? > > Thanks, > Alexv > > > Alex Vainman wrote: >> ibv_reg_mr() fails to register a memory region allocated on huge page and not >> the default page size. This happens because ibv_madvise_range() aligns memory >> region to the default system page size before calling to madvise() which fails >> with EINVAL error. madvise() fails because it expects that the start and end >> pointer of the memory range be huge page aligned. >> Patch handles the issue by: >> 1. ibv_fork_init() gets kernel's default huge page size in addition >> to the default page size. >> 2. ibv_madvise_range() first tries aligning users memory range to default >> page size and if madvise() fails with EINVAL error then it tries to align >> users memory range by huge page size and tries madvise() again. >> >> Signed-off-by: Alex Vaynman <alexv-smomgflXvOZWk0Htik3J/w@public.gmane.org> >> --- >> src/memory.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++- >> 1 files changed, 68 insertions(+), 1 deletions(-) >> >> diff --git a/src/memory.c b/src/memory.c >> index 550015a..73db083 100644 >> --- a/src/memory.c >> +++ b/src/memory.c >> @@ -40,6 +40,9 @@ >> #include <unistd.h> >> #include <stdlib.h> >> #include <stdint.h> >> +#include <ctype.h> >> +#include <fcntl.h> >> +#include <string.h> >> >> #include "ibverbs.h" >> >> @@ -54,6 +57,8 @@ >> #define MADV_DOFORK 11 >> #endif >> >> +#define MEMINFO_SIZE 2048 >> + >> struct ibv_mem_node { >> enum { >> IBV_RED, >> @@ -68,8 +73,51 @@ struct ibv_mem_node { >> static struct ibv_mem_node *mm_root; >> static pthread_mutex_t mm_mutex = PTHREAD_MUTEX_INITIALIZER; >> static int page_size; >> +static int huge_page_size; >> static int too_late; >> >> +/* >> + * Get the kernel default huge page size. >> + */ >> +static int get_huge_page_size() >> +{ >> + int fd; >> + char buf[MEMINFO_SIZE]; >> + int mem_file_len; >> + char *p_hpage_val = NULL; >> + char *end_pointer = NULL; >> + char file_name[] = "/proc/meminfo"; >> + const char label[] = "Hugepagesize:"; >> + int ret_val = 0; >> + >> + fd = open(file_name, O_RDONLY); >> + if (fd < 0) >> + return fd; >> + >> + mem_file_len = read(fd, buf, sizeof(buf) - 1); >> + >> + close(fd); >> + if (mem_file_len < 0) >> + return mem_file_len; >> + >> + buf[mem_file_len] = '\0'; >> + >> + p_hpage_val = strstr(buf, label); >> + if (!p_hpage_val) { >> + errno = EINVAL; >> + return -1; >> + } >> + p_hpage_val += strlen(label); >> + >> + errno = 0; >> + ret_val = strtol(p_hpage_val, &end_pointer, 0); >> + >> + if (errno != 0) >> + return -1; >> + >> + return ret_val * 1024; >> +} >> + >> int ibv_fork_init(void) >> { >> void *tmp; >> @@ -85,6 +133,8 @@ int ibv_fork_init(void) >> if (page_size < 0) >> return errno; >> >> + huge_page_size = get_huge_page_size(); >> + >> if (posix_memalign(&tmp, page_size, page_size)) >> return ENOMEM; >> >> @@ -554,7 +604,8 @@ static struct ibv_mem_node *prepare_to_roll_back(struct ibv_mem_node *node, >> return node; >> } >> >> -static int ibv_madvise_range(void *base, size_t size, int advice) >> +static int ibv_madvise_range_helper(void *base, size_t size, int advice, >> + int page_size) >> { >> uintptr_t start, end; >> struct ibv_mem_node *node, *tmp; >> @@ -646,6 +697,22 @@ out: >> return ret; >> } >> >> +static int ibv_madvise_range(void *base, size_t size, int advice) >> +{ >> + int ret_val = 0; >> + >> + ret_val = ibv_madvise_range_helper(base, size, advice, page_size); >> + >> + /* >> + * if memory is backed by huge pages we need to align it >> + * to huge page boundary in order madvise() will succeed. >> + */ >> + if (ret_val == -1 && errno == EINVAL && huge_page_size > 0) >> + ret_val = ibv_madvise_range_helper(base, size, advice, huge_page_size); >> + >> + return ret_val; >> +} >> + >> int ibv_dontfork_range(void *base, size_t size) >> { >> if (mm_root) > > > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] libibverbs: Add huge page support to ibv_madvise_range() [not found] ` <4B12AA78.7090401-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> 2009-12-08 14:03 ` Alex Vainman @ 2010-01-12 14:25 ` Eli Cohen 2010-01-14 15:12 ` Alex Vainman 2010-01-15 18:59 ` Roland Dreier 2 siblings, 1 reply; 16+ messages in thread From: Eli Cohen @ 2010-01-12 14:25 UTC (permalink / raw) To: alexv-smomgflXvOZWk0Htik3J/w; +Cc: roland, linux-rdma-u79uwXL29TY76Z2rM5mHXA On Sun, Nov 29, 2009 at 07:08:08PM +0200, Alex Vainman wrote: > + p_hpage_val = strstr(buf, label); > + if (!p_hpage_val) { > + errno = EINVAL; > + return -1; > + } > + p_hpage_val += strlen(label); > + > + errno = 0; > + ret_val = strtol(p_hpage_val, &end_pointer, 0); If strtol() fails, you may return with an invalid, non zero value for huge page size. Maybe use some temporary variable to hold the intermediate result. > + > + if (errno != 0) > + return -1; > + > + return ret_val * 1024; > +} > + -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] libibverbs: Add huge page support to ibv_madvise_range() 2010-01-12 14:25 ` Eli Cohen @ 2010-01-14 15:12 ` Alex Vainman 0 siblings, 0 replies; 16+ messages in thread From: Alex Vainman @ 2010-01-14 15:12 UTC (permalink / raw) To: Eli Cohen Cc: alexv-smomgflXvOZWk0Htik3J/w, roland, linux-rdma-u79uwXL29TY76Z2rM5mHXA Hi Eli, Sorry fot the late response. > If strtol() fails, you may return with an invalid, non zero value for > huge page size. Maybe use some temporary variable to hold the > intermediate result. If strtol() fails, for any reason, we always return value <= 0 for huge page size. This indicates that system's huge page size is unknown and the second call to ibv_madvise_range_helper(), with huge page size, will be skipped (as it supposed to be). Thanks, AlexV Eli Cohen Wrote: > On Sun, Nov 29, 2009 at 07:08:08PM +0200, Alex Vainman wrote: >> + p_hpage_val = strstr(buf, label); >> + if (!p_hpage_val) { >> + errno = EINVAL; >> + return -1; >> + } >> + p_hpage_val += strlen(label); >> + >> + errno = 0; >> + ret_val = strtol(p_hpage_val, &end_pointer, 0); > > If strtol() fails, you may return with an invalid, non zero value for > huge page size. Maybe use some temporary variable to hold the > intermediate result. >> + >> + if (errno != 0) >> + return -1; >> + >> + return ret_val * 1024; >> +} >> + > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] libibverbs: Add huge page support to ibv_madvise_range() [not found] ` <4B12AA78.7090401-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> 2009-12-08 14:03 ` Alex Vainman 2010-01-12 14:25 ` Eli Cohen @ 2010-01-15 18:59 ` Roland Dreier [not found] ` <ada8wbzi490.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org> 2 siblings, 1 reply; 16+ messages in thread From: Roland Dreier @ 2010-01-15 18:59 UTC (permalink / raw) To: alexv-smomgflXvOZWk0Htik3J/w; +Cc: roland, linux-rdma-u79uwXL29TY76Z2rM5mHXA > ibv_reg_mr() fails to register a memory region allocated on huge page and not > the default page size. This happens because ibv_madvise_range() aligns memory > region to the default system page size before calling to madvise() which fails > with EINVAL error. madvise() fails because it expects that the start and end > pointer of the memory range be huge page aligned. Seems unfortunate. I wonder if there's a way the kernel madvise could help us here? > +/* > + * Get the kernel default huge page size. > + */ > +static int get_huge_page_size() > +{ > + int fd; > + char buf[MEMINFO_SIZE]; > + int mem_file_len; > + char *p_hpage_val = NULL; > + char *end_pointer = NULL; > + char file_name[] = "/proc/meminfo"; > + const char label[] = "Hugepagesize:"; > + int ret_val = 0; > + > + fd = open(file_name, O_RDONLY); > + if (fd < 0) > + return fd; > + > + mem_file_len = read(fd, buf, sizeof(buf) - 1); > + > + close(fd); > + if (mem_file_len < 0) > + return mem_file_len; > + > + buf[mem_file_len] = '\0'; > + > + p_hpage_val = strstr(buf, label); > + if (!p_hpage_val) { > + errno = EINVAL; > + return -1; > + } > + p_hpage_val += strlen(label); > + > + errno = 0; > + ret_val = strtol(p_hpage_val, &end_pointer, 0); > + > + if (errno != 0) > + return -1; > + > + return ret_val * 1024; > +} This seems to duplicate but only partially a similar function from libhugetlbfs. Is there any way we can just use that directly? eg libhugetlbfs handles the case where there are multiple huge page sizes (and that exists even on mainstream x86 with 2MB and 1GB pages possible on the same system). - R. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 16+ messages in thread
[parent not found: <ada8wbzi490.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org>]
* Re: [PATCH] libibverbs: Add huge page support to ibv_madvise_range() [not found] ` <ada8wbzi490.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org> @ 2010-01-17 9:30 ` Alex Vainman [not found] ` <4B52D8A8.7060804-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> 2010-04-22 7:35 ` Alex Vainman 1 sibling, 1 reply; 16+ messages in thread From: Alex Vainman @ 2010-01-17 9:30 UTC (permalink / raw) To: Roland Dreier Cc: alexv-smomgflXvOZWk0Htik3J/w, roland, linux-rdma-u79uwXL29TY76Z2rM5mHXA > Seems unfortunate. I wonder if there's a way the kernel madvise could > help us here? We've tried to find the best solution for this issue. But we couldn't find another one without changing the API or changing the kernel. > This seems to duplicate but only partially a similar function from > libhugetlbfs. Is there any way we can just use that directly? I will check this. Thanks, AlexV Roland Dreier Wrote: > > ibv_reg_mr() fails to register a memory region allocated on huge page and not > > the default page size. This happens because ibv_madvise_range() aligns memory > > region to the default system page size before calling to madvise() which fails > > with EINVAL error. madvise() fails because it expects that the start and end > > pointer of the memory range be huge page aligned. > > Seems unfortunate. I wonder if there's a way the kernel madvise could > help us here? > > > +/* > > + * Get the kernel default huge page size. > > + */ > > +static int get_huge_page_size() > > +{ > > + int fd; > > + char buf[MEMINFO_SIZE]; > > + int mem_file_len; > > + char *p_hpage_val = NULL; > > + char *end_pointer = NULL; > > + char file_name[] = "/proc/meminfo"; > > + const char label[] = "Hugepagesize:"; > > + int ret_val = 0; > > + > > + fd = open(file_name, O_RDONLY); > > + if (fd < 0) > > + return fd; > > + > > + mem_file_len = read(fd, buf, sizeof(buf) - 1); > > + > > + close(fd); > > + if (mem_file_len < 0) > > + return mem_file_len; > > + > > + buf[mem_file_len] = '\0'; > > + > > + p_hpage_val = strstr(buf, label); > > + if (!p_hpage_val) { > > + errno = EINVAL; > > + return -1; > > + } > > + p_hpage_val += strlen(label); > > + > > + errno = 0; > > + ret_val = strtol(p_hpage_val, &end_pointer, 0); > > + > > + if (errno != 0) > > + return -1; > > + > > + return ret_val * 1024; > > +} > > This seems to duplicate but only partially a similar function from > libhugetlbfs. Is there any way we can just use that directly? eg > libhugetlbfs handles the case where there are multiple huge page sizes > (and that exists even on mainstream x86 with 2MB and 1GB pages possible > on the same system). > > - R. > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 16+ messages in thread
[parent not found: <4B52D8A8.7060804-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>]
* Re: [PATCH] libibverbs: Add huge page support to ibv_madvise_range() [not found] ` <4B52D8A8.7060804-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> @ 2010-01-17 17:19 ` Roland Dreier [not found] ` <adak4vghcoo.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org> 0 siblings, 1 reply; 16+ messages in thread From: Roland Dreier @ 2010-01-17 17:19 UTC (permalink / raw) To: alexv-smomgflXvOZWk0Htik3J/w; +Cc: roland, linux-rdma-u79uwXL29TY76Z2rM5mHXA > But we couldn't find another one without changing the API or changing > the kernel. Changing which API? Changing the kernel is a possibility of course. - R. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 16+ messages in thread
[parent not found: <adak4vghcoo.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org>]
* Re: [PATCH] libibverbs: Add huge page support to ibv_madvise_range() [not found] ` <adak4vghcoo.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org> @ 2010-01-18 12:53 ` Alex Vainman [not found] ` <4B5459E3.2040902-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> 0 siblings, 1 reply; 16+ messages in thread From: Alex Vainman @ 2010-01-18 12:53 UTC (permalink / raw) To: Roland Dreier Cc: alexv-smomgflXvOZWk0Htik3J/w, roland, linux-rdma-u79uwXL29TY76Z2rM5mHXA, alexr-smomgflXvOZWk0Htik3J/w > Changing which API? Verbs API. Since there is no way to figure out (as we know) from user space if a given memory region is backed by a huge page or default one, we can allow the user to supply us this info on memory registration. For that we need to add version of ibv_reg_mr() with additional input parameter for page size (the same for ibv_dontfork_range() and ibv_dofork_range()). > Changing the kernel is a possibility of course. In my opinion if we have the user space solution it is preferable than changing the kernel, which is more complex path. Alex. Roland Dreier Wrote: > > But we couldn't find another one without changing the API or changing > > the kernel. > > Changing which API? Changing the kernel is a possibility of course. > > - R. > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 16+ messages in thread
[parent not found: <4B5459E3.2040902-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>]
* Re: [PATCH] libibverbs: Add huge page support to ibv_madvise_range() [not found] ` <4B5459E3.2040902-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> @ 2010-02-17 14:52 ` Chuck Hartley 0 siblings, 0 replies; 16+ messages in thread From: Chuck Hartley @ 2010-02-17 14:52 UTC (permalink / raw) To: linux-rdma-u79uwXL29TY76Z2rM5mHXA Can someone tell me the current status of this issue? Has a decision been made to update the verbs API in some future release, and if so, which one? Will there be corresponding updates to DAPL? Thanks, Chuck -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] libibverbs: Add huge page support to ibv_madvise_range() [not found] ` <ada8wbzi490.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org> 2010-01-17 9:30 ` Alex Vainman @ 2010-04-22 7:35 ` Alex Vainman [not found] ` <4BCFFC48.4060401-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> 1 sibling, 1 reply; 16+ messages in thread From: Alex Vainman @ 2010-04-22 7:35 UTC (permalink / raw) To: Roland Dreier Cc: alexv-smomgflXvOZWk0Htik3J/w, roland, linux-rdma-u79uwXL29TY76Z2rM5mHXA, alexr-smomgflXvOZWk0Htik3J/w Roland Dreier Wrote: > > ibv_reg_mr() fails to register a memory region allocated on huge page and not > > the default page size. This happens because ibv_madvise_range() aligns memory > > region to the default system page size before calling to madvise() which fails > > with EINVAL error. madvise() fails because it expects that the start and end > > pointer of the memory range be huge page aligned. > > Seems unfortunate. I wonder if there's a way the kernel madvise could > help us here? > > > +/* > > + * Get the kernel default huge page size. > > + */ > > +static int get_huge_page_size() > > +{ > > + int fd; > > + char buf[MEMINFO_SIZE]; > > + int mem_file_len; > > + char *p_hpage_val = NULL; > > + char *end_pointer = NULL; > > + char file_name[] = "/proc/meminfo"; > > + const char label[] = "Hugepagesize:"; > > + int ret_val = 0; > > + > > + fd = open(file_name, O_RDONLY); > > + if (fd < 0) > > + return fd; > > + > > + mem_file_len = read(fd, buf, sizeof(buf) - 1); > > + > > + close(fd); > > + if (mem_file_len < 0) > > + return mem_file_len; > > + > > + buf[mem_file_len] = '\0'; > > + > > + p_hpage_val = strstr(buf, label); > > + if (!p_hpage_val) { > > + errno = EINVAL; > > + return -1; > > + } > > + p_hpage_val += strlen(label); > > + > > + errno = 0; > > + ret_val = strtol(p_hpage_val, &end_pointer, 0); > > + > > + if (errno != 0) > > + return -1; > > + > > + return ret_val * 1024; > > +} > > This seems to duplicate but only partially a similar function from > libhugetlbfs. Is there any way we can just use that directly? eg > libhugetlbfs handles the case where there are multiple huge page sizes > (and that exists even on mainstream x86 with 2MB and 1GB pages possible > on the same system). > > - R. > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > Hi Roland, After the patches, which handle madvise failure, are applied(these pathces were submited under the topic:"libibverbs: Undo changes in memory range tree when madvise() fails"), I would like to renew the discussion about this patch, which actually depends on the above patches, since it may cause madvise failure. >This seems to duplicate but only partially a similar function from >libhugetlbfs. Is there any way we can just use that directly? eg >libhugetlbfs handles the case where there are multiple huge page sizes >(and that exists even on mainstream x86 with 2MB and 1GB pages possible >on the same system). In order to avoid adding additional dependency to libibverbs, maybe we should just to enhance the get_huge_page_size() so it will support multiple huge page sizes? -Alex -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 16+ messages in thread
[parent not found: <4BCFFC48.4060401-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>]
* Re: [PATCH] libibverbs: Add huge page support to ibv_madvise_range() [not found] ` <4BCFFC48.4060401-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> @ 2010-05-06 20:51 ` Roland Dreier [not found] ` <adazl0c92kp.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org> 0 siblings, 1 reply; 16+ messages in thread From: Roland Dreier @ 2010-05-06 20:51 UTC (permalink / raw) To: alexv-smomgflXvOZWk0Htik3J/w Cc: roland, linux-rdma-u79uwXL29TY76Z2rM5mHXA, alexr-smomgflXvOZWk0Htik3J/w > In order to avoid adding additional dependency to libibverbs, maybe > we should just to enhance the get_huge_page_size() so it will support > multiple huge page sizes? I think that does make sense. However see my reply to Alexander Schmidt -- maybe there is something clever we can do with /proc/*/maps to figure out when we need to use this? - R. -- Roland Dreier <rolandd-FYB4Gu1CFyUAvxtiuMwx3w@public.gmane.org> || For corporate legal information go to: http://www.cisco.com/web/about/doing_business/legal/cri/index.html -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 16+ messages in thread
[parent not found: <adazl0c92kp.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org>]
* Re: [PATCH] libibverbs: Add huge page support to ibv_madvise_range() [not found] ` <adazl0c92kp.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org> @ 2010-05-13 14:04 ` Alex Vainman [not found] ` <4BEC06DB.30505-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> 0 siblings, 1 reply; 16+ messages in thread From: Alex Vainman @ 2010-05-13 14:04 UTC (permalink / raw) To: Roland Dreier Cc: alexv-smomgflXvOZWk0Htik3J/w, roland, linux-rdma-u79uwXL29TY76Z2rM5mHXA, alexr-smomgflXvOZWk0Htik3J/w, monis-smomgflXvOZWk0Htik3J/w, ogerlitz-smomgflXvOZWk0Htik3J/w Roland Dreier Wrote: > > In order to avoid adding additional dependency to libibverbs, maybe > > we should just to enhance the get_huge_page_size() so it will support > > multiple huge page sizes? > > I think that does make sense. However see my reply to Alexander > Schmidt -- maybe there is something clever we can do with /proc/*/maps > to figure out when we need to use this? > > - R. Maybe we should use the /proc/pid/pagemap and /proc/kpageflags files? These files let a userspace process find out which physical frame each virtual page is mapped to and to get the properties of each page frame including the page size. The disadvantage of this method is, that it applicable on kernel >= 2.6.25, since this interface is supported starting from the kernel 2.6.25. -Alex -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 16+ messages in thread
[parent not found: <4BEC06DB.30505-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>]
* Re: [PATCH] libibverbs: Add huge page support to ibv_madvise_range() [not found] ` <4BEC06DB.30505-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> @ 2010-05-13 15:50 ` Roland Dreier [not found] ` <adapr0zsswc.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org> 0 siblings, 1 reply; 16+ messages in thread From: Roland Dreier @ 2010-05-13 15:50 UTC (permalink / raw) To: alexv-smomgflXvOZWk0Htik3J/w Cc: roland, linux-rdma-u79uwXL29TY76Z2rM5mHXA, alexr-smomgflXvOZWk0Htik3J/w, monis-smomgflXvOZWk0Htik3J/w, ogerlitz-smomgflXvOZWk0Htik3J/w > Maybe we should use the /proc/pid/pagemap and /proc/kpageflags files? > These files let a userspace process find out which physical frame > each virtual page is mapped to and to get the properties of each page > frame including the page size. Sounds interesting... the only problem is that /proc/kpageflags is root-only, so we can't use that. But does /proc/pid/pagemap give enough information to be useful? > The disadvantage of this method is, that it applicable on kernel >= > 2.6.25, since this interface is supported starting from the kernel > 2.6.25. I think that's OK -- Roland Dreier <rolandd-FYB4Gu1CFyUAvxtiuMwx3w@public.gmane.org> || For corporate legal information go to: http://www.cisco.com/web/about/doing_business/legal/cri/index.html -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 16+ messages in thread
[parent not found: <adapr0zsswc.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org>]
* Re: [PATCH] libibverbs: Add huge page support to ibv_madvise_range() [not found] ` <adapr0zsswc.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org> @ 2010-05-14 0:04 ` Pradeep Satyanarayana [not found] ` <4BEC937F.5000808-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org> 0 siblings, 1 reply; 16+ messages in thread From: Pradeep Satyanarayana @ 2010-05-14 0:04 UTC (permalink / raw) To: Roland Dreier Cc: alexv-smomgflXvOZWk0Htik3J/w, roland, linux-rdma-u79uwXL29TY76Z2rM5mHXA, alexr-smomgflXvOZWk0Htik3J/w, monis-smomgflXvOZWk0Htik3J/w, ogerlitz-smomgflXvOZWk0Htik3J/w Roland Dreier wrote: > > Maybe we should use the /proc/pid/pagemap and /proc/kpageflags files? > > These files let a userspace process find out which physical frame > > each virtual page is mapped to and to get the properties of each page > > frame including the page size. > > Sounds interesting... the only problem is that /proc/kpageflags is > root-only, so we can't use that. But does /proc/pid/pagemap give enough > information to be useful? The most useful information that /proc/pid/pagemap provides the PFN (if the page is not swapped out). In fact there is very useful tool called Documentation/vm/page-types.c which gives a lot of the information that we need. Again the caveat is that it uses /proc/kpageflags. /proc/pid/pagemap also provided "page shift". I am not sure what that is in the context of huge pages. > > > The disadvantage of this method is, that it applicable on kernel >= > > 2.6.25, since this interface is supported starting from the kernel > > 2.6.25. > > I think that's OK Pradeep -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 16+ messages in thread
[parent not found: <4BEC937F.5000808-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>]
* Re: [PATCH] libibverbs: Add huge page support to ibv_madvise_range() [not found] ` <4BEC937F.5000808-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org> @ 2010-05-18 5:29 ` Pradeep Satyanarayana 0 siblings, 0 replies; 16+ messages in thread From: Pradeep Satyanarayana @ 2010-05-18 5:29 UTC (permalink / raw) To: Roland Dreier; +Cc: alexv-smomgflXvOZWk0Htik3J/w, linux-rdma, Or Gerlitz Pradeep Satyanarayana wrote: > Roland Dreier wrote: >> > Maybe we should use the /proc/pid/pagemap and /proc/kpageflags files? >> > These files let a userspace process find out which physical frame >> > each virtual page is mapped to and to get the properties of each page >> > frame including the page size. >> >> Sounds interesting... the only problem is that /proc/kpageflags is >> root-only, so we can't use that. But does /proc/pid/pagemap give enough >> information to be useful? > > The most useful information that /proc/pid/pagemap provides the PFN (if the > page is not swapped out). In fact there is very useful tool called > Documentation/vm/page-types.c which gives a lot of the information that we > need. Again the caveat is that it uses /proc/kpageflags. Roland, Documentation/vm/page-types didn't turn out to be as interesting as expected. It does show the usage of huge pages, but it seems to provide system information and not process specific. The possible alternatives and their limitations are: 1. /proc/pid/maps - usable only with libhugetlfs 2. page-type and friends -/proc/kpageflgs is readable by root only (plus additional limitations described above). 3. /proc/pid/smaps -this provides KernelPageSize/MMUPageSize information (on recent kernels) with and without libhugetlbfs. Parsing this may be slow, but how often would it be used? Given the above, I see /proc/pid/smaps as one alternative that provides a complete solution. Are there others? What are your thoughts? Pradeep -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 16+ messages in thread
end of thread, other threads:[~2010-05-18 5:29 UTC | newest]
Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-11-29 17:08 [PATCH] libibverbs: Add huge page support to ibv_madvise_range() Alex Vainman
[not found] ` <4B12AA78.7090401-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2009-12-08 14:03 ` Alex Vainman
[not found] ` <4B1E5CA9.3090707-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2010-01-12 9:26 ` Alex Vainman
2010-01-12 14:25 ` Eli Cohen
2010-01-14 15:12 ` Alex Vainman
2010-01-15 18:59 ` Roland Dreier
[not found] ` <ada8wbzi490.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org>
2010-01-17 9:30 ` Alex Vainman
[not found] ` <4B52D8A8.7060804-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2010-01-17 17:19 ` Roland Dreier
[not found] ` <adak4vghcoo.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org>
2010-01-18 12:53 ` Alex Vainman
[not found] ` <4B5459E3.2040902-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2010-02-17 14:52 ` Chuck Hartley
2010-04-22 7:35 ` Alex Vainman
[not found] ` <4BCFFC48.4060401-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2010-05-06 20:51 ` Roland Dreier
[not found] ` <adazl0c92kp.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org>
2010-05-13 14:04 ` Alex Vainman
[not found] ` <4BEC06DB.30505-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2010-05-13 15:50 ` Roland Dreier
[not found] ` <adapr0zsswc.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org>
2010-05-14 0:04 ` Pradeep Satyanarayana
[not found] ` <4BEC937F.5000808-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
2010-05-18 5:29 ` Pradeep Satyanarayana
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox