public inbox for linux-rdma@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] libibverbs: Add huge page support to ibv_madvise_range()
@ 2009-11-29 17:08 Alex Vainman
       [not found] ` <4B12AA78.7090401-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 16+ messages in thread
From: Alex Vainman @ 2009-11-29 17:08 UTC (permalink / raw)
  To: roland; +Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA


ibv_reg_mr() fails to register a memory region that is backed by huge pages
rather than pages of the default size. This happens because ibv_madvise_range()
aligns the memory region to the default system page size before calling
madvise(), which then fails with EINVAL. madvise() fails because it expects the
start and end of the memory range to be huge-page aligned.
This patch handles the issue as follows:
1. ibv_fork_init() gets the kernel's default huge page size in addition
   to the default page size.
2. ibv_madvise_range() first tries aligning the user's memory range to the
   default page size; if madvise() fails with EINVAL, it aligns the user's
   memory range to the huge page size and tries madvise() again.

Signed-off-by: Alex Vaynman <alexv-smomgflXvOZWk0Htik3J/w@public.gmane.org>
---
 src/memory.c |   69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 68 insertions(+), 1 deletions(-)

diff --git a/src/memory.c b/src/memory.c
index 550015a..73db083 100644
--- a/src/memory.c
+++ b/src/memory.c
@@ -40,6 +40,9 @@
 #include <unistd.h>
 #include <stdlib.h>
 #include <stdint.h>
+#include <ctype.h>
+#include <fcntl.h>
+#include <string.h>
 
 #include "ibverbs.h"
 
@@ -54,6 +57,8 @@
 #define MADV_DOFORK	11
 #endif
 
+#define MEMINFO_SIZE	2048
+
 struct ibv_mem_node {
 	enum {
 		IBV_RED,
@@ -68,8 +73,51 @@ struct ibv_mem_node {
 static struct ibv_mem_node *mm_root;
 static pthread_mutex_t mm_mutex = PTHREAD_MUTEX_INITIALIZER;
 static int page_size;
+static int huge_page_size;
 static int too_late;
 
+/*
+ * Get the kernel default huge page size.
+ */
+static int get_huge_page_size()
+{
+	int fd;
+	char buf[MEMINFO_SIZE];
+	int mem_file_len;
+	char *p_hpage_val = NULL;
+	char *end_pointer = NULL;
+	char file_name[] = "/proc/meminfo";
+	const char label[] = "Hugepagesize:";
+	int ret_val = 0;
+
+	fd = open(file_name, O_RDONLY);
+	if (fd < 0)
+		return fd;
+
+	mem_file_len = read(fd, buf, sizeof(buf) - 1);
+
+	close(fd);
+	if (mem_file_len < 0)
+		return mem_file_len;
+
+	buf[mem_file_len] = '\0';
+
+	p_hpage_val = strstr(buf, label);
+	if (!p_hpage_val) {
+		errno = EINVAL;
+		return -1;
+	}
+	p_hpage_val += strlen(label);
+
+	errno = 0;
+	ret_val = strtol(p_hpage_val, &end_pointer, 0);
+
+	if (errno != 0)
+		return -1;
+
+	return ret_val * 1024;
+}
+
 int ibv_fork_init(void)
 {
 	void *tmp;
@@ -85,6 +133,8 @@ int ibv_fork_init(void)
 	if (page_size < 0)
 		return errno;
 
+	huge_page_size = get_huge_page_size();
+
 	if (posix_memalign(&tmp, page_size, page_size))
 		return ENOMEM;
 
@@ -554,7 +604,8 @@ static struct ibv_mem_node *prepare_to_roll_back(struct ibv_mem_node *node,
 	return node;
 }
 
-static int ibv_madvise_range(void *base, size_t size, int advice)
+static int ibv_madvise_range_helper(void *base, size_t size, int advice,
+				    int page_size)
 {
 	uintptr_t start, end;
 	struct ibv_mem_node *node, *tmp;
@@ -646,6 +697,22 @@ out:
 	return ret;
 }
 
+static int ibv_madvise_range(void *base, size_t size, int advice)
+{
+	int ret_val = 0;
+
+	ret_val = ibv_madvise_range_helper(base, size, advice, page_size);
+
+	/*
+	 * if memory is backed by huge pages we need to align it
+	 * to huge page boundary in order madvise() will succeed.
+	 */
+	if (ret_val == -1 && errno == EINVAL && huge_page_size > 0)
+		ret_val = ibv_madvise_range_helper(base, size, advice, huge_page_size);
+
+	return ret_val;
+}
+
 int ibv_dontfork_range(void *base, size_t size)
 {
 	if (mm_root)
-- 
1.5.5

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2010-05-18  5:29 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-11-29 17:08 [PATCH] libibverbs: Add huge page support to ibv_madvise_range() Alex Vainman
     [not found] ` <4B12AA78.7090401-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2009-12-08 14:03   ` Alex Vainman
     [not found]     ` <4B1E5CA9.3090707-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2010-01-12  9:26       ` Alex Vainman
2010-01-12 14:25   ` Eli Cohen
2010-01-14 15:12     ` Alex Vainman
2010-01-15 18:59   ` Roland Dreier
     [not found]     ` <ada8wbzi490.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org>
2010-01-17  9:30       ` Alex Vainman
     [not found]         ` <4B52D8A8.7060804-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2010-01-17 17:19           ` Roland Dreier
     [not found]             ` <adak4vghcoo.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org>
2010-01-18 12:53               ` Alex Vainman
     [not found]                 ` <4B5459E3.2040902-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2010-02-17 14:52                   ` Chuck Hartley
2010-04-22  7:35       ` Alex Vainman
     [not found]         ` <4BCFFC48.4060401-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2010-05-06 20:51           ` Roland Dreier
     [not found]             ` <adazl0c92kp.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org>
2010-05-13 14:04               ` Alex Vainman
     [not found]                 ` <4BEC06DB.30505-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2010-05-13 15:50                   ` Roland Dreier
     [not found]                     ` <adapr0zsswc.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org>
2010-05-14  0:04                       ` Pradeep Satyanarayana
     [not found]                         ` <4BEC937F.5000808-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
2010-05-18  5:29                           ` Pradeep Satyanarayana

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox