[PATCH 16/19] staging/lustre/osc: use global osc_rq_pool to reduce memory usage

All of lore.kernel.org
 help / color / mirror / Atom feed

From: green@linuxhacker.ru
To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	devel@driverdev.osuosl.org,
	Andreas Dilger <andreas.dilger@intel.com>
Cc: Linux Kernel Mailing List <linux-kernel@vger.kernel.org>,
	Li Xi <lixi@ddn.com>, Wu Libin <lwu@ddn.com>,
	Wang Shilong <wshilong@ddn.com>,
	Oleg Drokin <oleg.drokin@intel.com>
Subject: [PATCH 16/19] staging/lustre/osc: use global osc_rq_pool to reduce memory usage
Date: Mon, 14 Sep 2015 18:41:32 -0400	[thread overview]
Message-ID: <1442270495-1655259-17-git-send-email-green@linuxhacker.ru> (raw)
In-Reply-To: <1442270495-1655259-1-git-send-email-green@linuxhacker.ru>

From: Li Xi <lixi@ddn.com>

The per-osc request pools consume a lot of memory if there are
hundreds of OSCs on one client. This will be a critical problem
if the client doesn't have sufficient memory for both OSCs and
applications.

This patch replaces per-osc request pools with a global pool
osc_rq_pool. The total memory usage is 5MB by default. And it
can be set by a module parameter of OSC:
"options osc osc_reqpool_mem_max=POOL_SIZE". The unit of POOL_SIZE
is MB. If cl_max_rpcs_in_flight is the same for all OSCs, the
memory usage of the OSC pool can be calculated as:
Min(POOL_SIZE * 1M,
    (cl_max_rpcs_in_flight + 2) * OSC number * OST_MAXREQSIZE)

Also, this patch changes the allocation logic of OSC write requests.
The allocation from osc_rq_pool will only be tried after normal
allocation failed.

Signed-off-by: Wu Libin <lwu@ddn.com>
Signed-off-by: Wang Shilong <wshilong@ddn.com>
Signed-off-by: Li Xi <lixi@ddn.com>
Reviewed-on: http://review.whamcloud.com/15422
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-6770
Reviewed-by: Jinshan Xiong <jinshan.xiong@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Signed-off-by: Oleg Drokin <oleg.drokin@intel.com>
---
 .../staging/lustre/lustre/include/lustre_import.h  |  2 -
 drivers/staging/lustre/lustre/include/lustre_net.h |  7 +-
 drivers/staging/lustre/lustre/include/obd_class.h  |  4 --
 drivers/staging/lustre/lustre/obdclass/genops.c    |  1 -
 drivers/staging/lustre/lustre/osc/lproc_osc.c      | 17 ++++-
 drivers/staging/lustre/lustre/osc/osc_internal.h   |  4 ++
 drivers/staging/lustre/lustre/osc/osc_request.c    | 79 ++++++++++++++++++----
 drivers/staging/lustre/lustre/ptlrpc/client.c      | 21 +++---
 8 files changed, 94 insertions(+), 41 deletions(-)

diff --git a/drivers/staging/lustre/lustre/include/lustre_import.h b/drivers/staging/lustre/lustre/include/lustre_import.h
index 5a38f3d..c8b89a3 100644
--- a/drivers/staging/lustre/lustre/include/lustre_import.h
+++ b/drivers/staging/lustre/lustre/include/lustre_import.h
@@ -306,8 +306,6 @@ struct obd_import {
 	__u32		     imp_msg_magic;
 	__u32		     imp_msghdr_flags;       /* adjusted based on server capability */
 
-	struct ptlrpc_request_pool *imp_rq_pool;	  /* emergency request pool */
-
 	struct imp_at	     imp_at;		 /* adaptive timeout data */
 	time_t		    imp_last_reply_time;    /* for health check */
 };
diff --git a/drivers/staging/lustre/lustre/include/lustre_net.h b/drivers/staging/lustre/lustre/include/lustre_net.h
index 5df493e..313a56c 100644
--- a/drivers/staging/lustre/lustre/include/lustre_net.h
+++ b/drivers/staging/lustre/lustre/include/lustre_net.h
@@ -500,7 +500,7 @@ struct ptlrpc_request_pool {
 	/** Maximum message size that would fit into a request from this pool */
 	int prp_rq_size;
 	/** Function to allocate more requests for this pool */
-	void (*prp_populate)(struct ptlrpc_request_pool *, int);
+	int (*prp_populate)(struct ptlrpc_request_pool *, int);
 };
 
 struct lu_context;
@@ -2381,11 +2381,11 @@ void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc,
 			    struct ptlrpc_request *req);
 
 void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool);
-void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq);
+int ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq);
 
 struct ptlrpc_request_pool *
 ptlrpc_init_rq_pool(int, int,
-		    void (*populate_pool)(struct ptlrpc_request_pool *, int));
+		    int (*populate_pool)(struct ptlrpc_request_pool *, int));
 
 void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req);
 struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp,
@@ -2957,7 +2957,6 @@ void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes);
 
 /* ptlrpc/llog_client.c */
 extern struct llog_operations llog_client_ops;
-
 /** @} net */
 
 #endif
diff --git a/drivers/staging/lustre/lustre/include/obd_class.h b/drivers/staging/lustre/lustre/include/obd_class.h
index 87bb2ce..ce6fa55 100644
--- a/drivers/staging/lustre/lustre/include/obd_class.h
+++ b/drivers/staging/lustre/lustre/include/obd_class.h
@@ -626,10 +626,6 @@ static inline void obd_cleanup_client_import(struct obd_device *obd)
 		CDEBUG(D_CONFIG, "%s: client import never connected\n",
 		       obd->obd_name);
 		ptlrpc_invalidate_import(imp);
-		if (imp->imp_rq_pool) {
-			ptlrpc_free_rq_pool(imp->imp_rq_pool);
-			imp->imp_rq_pool = NULL;
-		}
 		client_destroy_import(imp);
 		obd->u.cli.cl_import = NULL;
 	}
diff --git a/drivers/staging/lustre/lustre/obdclass/genops.c b/drivers/staging/lustre/lustre/obdclass/genops.c
index 370d5b4..594955d 100644
--- a/drivers/staging/lustre/lustre/obdclass/genops.c
+++ b/drivers/staging/lustre/lustre/obdclass/genops.c
@@ -1663,7 +1663,6 @@ static void obd_zombie_export_add(struct obd_export *exp)
 static void obd_zombie_import_add(struct obd_import *imp)
 {
 	LASSERT(!imp->imp_sec);
-	LASSERT(!imp->imp_rq_pool);
 	spin_lock(&obd_zombie_impexp_lock);
 	LASSERT(list_empty(&imp->imp_zombie_chain));
 	zombies_count++;
diff --git a/drivers/staging/lustre/lustre/osc/lproc_osc.c b/drivers/staging/lustre/lustre/osc/lproc_osc.c
index ff6d2e2..c504d15 100644
--- a/drivers/staging/lustre/lustre/osc/lproc_osc.c
+++ b/drivers/staging/lustre/lustre/osc/lproc_osc.c
@@ -96,9 +96,9 @@ static ssize_t max_rpcs_in_flight_store(struct kobject *kobj,
 	struct obd_device *dev = container_of(kobj, struct obd_device,
 					      obd_kobj);
 	struct client_obd *cli = &dev->u.cli;
-	struct ptlrpc_request_pool *pool = cli->cl_import->imp_rq_pool;
 	int rc;
 	unsigned long val;
+	int adding, added, req_count;
 
 	rc = kstrtoul(buffer, 10, &val);
 	if (rc)
@@ -107,8 +107,19 @@ static ssize_t max_rpcs_in_flight_store(struct kobject *kobj,
 	if (val < 1 || val > OSC_MAX_RIF_MAX)
 		return -ERANGE;
 
-	if (pool && val > cli->cl_max_rpcs_in_flight)
-		pool->prp_populate(pool, val-cli->cl_max_rpcs_in_flight);
+	adding = val - cli->cl_max_rpcs_in_flight;
+	req_count = atomic_read(&osc_pool_req_count);
+	if (adding > 0 && req_count < osc_reqpool_maxreqcount) {
+		/*
+		 * There might be some race which will cause over-limit
+		 * allocation, but it is fine.
+		 */
+		if (req_count + adding > osc_reqpool_maxreqcount)
+			adding = osc_reqpool_maxreqcount - req_count;
+
+		added = osc_rq_pool->prp_populate(osc_rq_pool, adding);
+		atomic_add(added, &osc_pool_req_count);
+	}
 
 	client_obd_list_lock(&cli->cl_loi_list_lock);
 	cli->cl_max_rpcs_in_flight = val;
diff --git a/drivers/staging/lustre/lustre/osc/osc_internal.h b/drivers/staging/lustre/lustre/osc/osc_internal.h
index 470698b..7d0a3e2 100644
--- a/drivers/staging/lustre/lustre/osc/osc_internal.h
+++ b/drivers/staging/lustre/lustre/osc/osc_internal.h
@@ -39,6 +39,10 @@
 
 #define OAP_MAGIC 8675309
 
+extern atomic_t osc_pool_req_count;
+extern unsigned int osc_reqpool_maxreqcount;
+extern struct ptlrpc_request_pool *osc_rq_pool;
+
 struct lu_env;
 
 enum async_flags {
diff --git a/drivers/staging/lustre/lustre/osc/osc_request.c b/drivers/staging/lustre/lustre/osc/osc_request.c
index 114c550..f41f762 100644
--- a/drivers/staging/lustre/lustre/osc/osc_request.c
+++ b/drivers/staging/lustre/lustre/osc/osc_request.c
@@ -50,9 +50,18 @@
 #include "../include/lustre_param.h"
 #include "../include/lustre_fid.h"
 #include "../include/obd_class.h"
+#include "../include/obd.h"
 #include "osc_internal.h"
 #include "osc_cl_internal.h"
 
+atomic_t osc_pool_req_count;
+unsigned int osc_reqpool_maxreqcount;
+struct ptlrpc_request_pool *osc_rq_pool;
+
+/* max memory used for request pool, unit is MB */
+static unsigned int osc_reqpool_mem_max = 5;
+module_param(osc_reqpool_mem_max, uint, 0444);
+
 struct osc_brw_async_args {
 	struct obdo       *aa_oa;
 	int		aa_requested_nob;
@@ -1268,7 +1277,7 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,
 	if ((cmd & OBD_BRW_WRITE) != 0) {
 		opc = OST_WRITE;
 		req = ptlrpc_request_alloc_pool(cli->cl_import,
-						cli->cl_import->imp_rq_pool,
+						osc_rq_pool,
 						&RQF_OST_BRW_WRITE);
 	} else {
 		opc = OST_READ;
@@ -3163,6 +3172,9 @@ int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
 	struct client_obd *cli = &obd->u.cli;
 	void *handler;
 	int rc;
+	int adding;
+	int added;
+	int req_count;
 
 	rc = ptlrpcd_addref();
 	if (rc)
@@ -3191,15 +3203,20 @@ int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
 		ptlrpc_lprocfs_register_obd(obd);
 	}
 
-	/* We need to allocate a few requests more, because
-	 * brw_interpret tries to create new requests before freeing
-	 * previous ones, Ideally we want to have 2x max_rpcs_in_flight
-	 * reserved, but I'm afraid that might be too much wasted RAM
-	 * in fact, so 2 is just my guess and still should work. */
-	cli->cl_import->imp_rq_pool =
-		ptlrpc_init_rq_pool(cli->cl_max_rpcs_in_flight + 2,
-				    OST_MAXREQSIZE,
-				    ptlrpc_add_rqs_to_pool);
+	/*
+	 * We try to control the total number of requests with a upper limit
+	 * osc_reqpool_maxreqcount. There might be some race which will cause
+	 * over-limit allocation, but it is fine.
+	 */
+	req_count = atomic_read(&osc_pool_req_count);
+	if (req_count < osc_reqpool_maxreqcount) {
+		adding = cli->cl_max_rpcs_in_flight + 2;
+		if (req_count + adding > osc_reqpool_maxreqcount)
+			adding = osc_reqpool_maxreqcount - req_count;
+
+		added = ptlrpc_add_rqs_to_pool(osc_rq_pool, adding);
+		atomic_add(added, &osc_pool_req_count);
+	}
 
 	INIT_LIST_HEAD(&cli->cl_grant_shrink_list);
 	ns_register_cancel(obd->obd_namespace, osc_cancel_for_recovery);
@@ -3339,6 +3356,8 @@ extern struct lock_class_key osc_ast_guard_class;
 static int __init osc_init(void)
 {
 	struct lprocfs_static_vars lvars = { NULL };
+	unsigned int reqpool_size;
+	unsigned int reqsize;
 	int rc;
 
 	/* print an address of _any_ initialized kernel symbol from this
@@ -3354,14 +3373,45 @@ static int __init osc_init(void)
 
 	rc = class_register_type(&osc_obd_ops, NULL,
 				 LUSTRE_OSC_NAME, &osc_device_type);
-	if (rc) {
-		lu_kmem_fini(osc_caches);
-		return rc;
-	}
+	if (rc)
+		goto out_kmem;
 
 	spin_lock_init(&osc_ast_guard);
 	lockdep_set_class(&osc_ast_guard, &osc_ast_guard_class);
 
+	/* This is obviously too much memory, only prevent overflow here */
+	if (osc_reqpool_mem_max >= 1 << 12 || osc_reqpool_mem_max == 0) {
+		rc = -EINVAL;
+		goto out_type;
+	}
+
+	reqpool_size = osc_reqpool_mem_max << 20;
+
+	reqsize = 1;
+	while (reqsize < OST_MAXREQSIZE)
+		reqsize = reqsize << 1;
+
+	/*
+	 * We don't enlarge the request count in OSC pool according to
+	 * cl_max_rpcs_in_flight. The allocation from the pool will only be
+	 * tried after normal allocation failed. So a small OSC pool won't
+	 * cause much performance degression in most of cases.
+	 */
+	osc_reqpool_maxreqcount = reqpool_size / reqsize;
+
+	atomic_set(&osc_pool_req_count, 0);
+	osc_rq_pool = ptlrpc_init_rq_pool(0, OST_MAXREQSIZE,
+					  ptlrpc_add_rqs_to_pool);
+
+	if (osc_rq_pool)
+		return 0;
+
+	rc = -ENOMEM;
+
+out_type:
+	class_unregister_type(LUSTRE_OSC_NAME);
+out_kmem:
+	lu_kmem_fini(osc_caches);
 	return rc;
 }
 
@@ -3369,6 +3419,7 @@ static void /*__exit*/ osc_exit(void)
 {
 	class_unregister_type(LUSTRE_OSC_NAME);
 	lu_kmem_fini(osc_caches);
+	ptlrpc_free_rq_pool(osc_rq_pool);
 }
 
 MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
diff --git a/drivers/staging/lustre/lustre/ptlrpc/client.c b/drivers/staging/lustre/lustre/ptlrpc/client.c
index 1800db1..90b24fc 100644
--- a/drivers/staging/lustre/lustre/ptlrpc/client.c
+++ b/drivers/staging/lustre/lustre/ptlrpc/client.c
@@ -446,7 +446,7 @@ EXPORT_SYMBOL(ptlrpc_free_rq_pool);
 /**
  * Allocates, initializes and adds \a num_rq requests to the pool \a pool
  */
-void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq)
+int ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq)
 {
 	int i;
 	int size = 1;
@@ -468,11 +468,11 @@ void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq)
 		spin_unlock(&pool->prp_lock);
 		req = ptlrpc_request_cache_alloc(GFP_NOFS);
 		if (!req)
-			return;
+			return i;
 		msg = libcfs_kvzalloc(size, GFP_NOFS);
 		if (!msg) {
 			ptlrpc_request_cache_free(req);
-			return;
+			return i;
 		}
 		req->rq_reqbuf = msg;
 		req->rq_reqbuf_len = size;
@@ -481,6 +481,7 @@ void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq)
 		list_add_tail(&req->rq_list, &pool->prp_req_list);
 	}
 	spin_unlock(&pool->prp_lock);
+	return num_rq;
 }
 EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool);
 
@@ -494,7 +495,7 @@ EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool);
  */
 struct ptlrpc_request_pool *
 ptlrpc_init_rq_pool(int num_rq, int msgsize,
-		    void (*populate_pool)(struct ptlrpc_request_pool *, int))
+		    int (*populate_pool)(struct ptlrpc_request_pool *, int))
 {
 	struct ptlrpc_request_pool *pool;
 
@@ -512,11 +513,6 @@ ptlrpc_init_rq_pool(int num_rq, int msgsize,
 
 	populate_pool(pool, num_rq);
 
-	if (list_empty(&pool->prp_req_list)) {
-		/* have not allocated a single request for the pool */
-		kfree(pool);
-		pool = NULL;
-	}
 	return pool;
 }
 EXPORT_SYMBOL(ptlrpc_init_rq_pool);
@@ -702,11 +698,10 @@ struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp,
 {
 	struct ptlrpc_request *request = NULL;
 
-	if (pool)
-		request = ptlrpc_prep_req_from_pool(pool);
+	request = ptlrpc_request_cache_alloc(GFP_NOFS);
 
-	if (!request)
-		request = ptlrpc_request_cache_alloc(GFP_NOFS);
+	if (!request && pool)
+		request = ptlrpc_prep_req_from_pool(pool);
 
 	if (request) {
 		LASSERTF((unsigned long)imp > 0x1000, "%p", imp);
-- 
2.1.0

next prev parent reply	other threads:[~2015-09-14 22:43 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-09-14 22:41 [PATCH 00/19] Lustre fixes green
2015-09-14 22:41 ` [PATCH 01/19] staging/lustre/lnet: Reenable lnet router debugfs green
2015-09-14 22:41 ` [PATCH 02/19] staging/lustre/obdclass: reorganize busy object accounting green
2015-09-14 22:41 ` [PATCH 03/19] staging/lustre/llite: cleanup open handle for client open failure green
2015-09-14 22:41 ` [PATCH 04/19] staging/lustre/llite: strengthen checks for hsm flags and archive id green
2015-09-14 22:41 ` [PATCH 05/19] staging/lustre/ptlrpc: remove LUSTRE_MSG_MAGIC_V1 support green
2015-09-14 22:41 ` [PATCH 06/19] staging/lustre/lmv: fix potential null pointer dereference green
2015-09-15 13:26   ` Trevor Woerner
2015-09-15 13:57     ` Oleg Drokin
2015-09-14 22:41 ` [PATCH 07/19] staging/lustre/llite: deny non-root user for changelog operations green
2015-09-14 22:41 ` [PATCH 08/19] staging/lustre/o2iblnd: connection refcount fix for kiblnd_post_rx green
2015-09-14 22:41 ` [PATCH 09/19] staging/lustre/osc: LBUG in osc_lru_reclaim green
2015-09-14 22:41 ` [PATCH 10/19] staging/lustre/libcfs: minor fix in cfs_hash_for_each_relax() green
2015-09-14 22:41 ` [PATCH 11/19] staging/lustre/lnet: fix deadloop in ksocknal_push green
2015-09-14 22:41 ` [PATCH 12/19] staging/lustre/o2iblnd: wrong uses of kib_tx_t::tx_nfrags green
2015-09-14 22:41 ` [PATCH 13/19] staging/lustre/llite: ASSERTION( atomic_read(&d->ld_ref) == 0 ) failed green
2015-09-14 22:41 ` [PATCH 14/19] staging/lustre/obdclass: Eliminate hash bucket scans in lu_cache_shrink green
2015-09-14 22:41 ` [PATCH 15/19] staging/lustre: Remove unused MAY_ constants green
2015-09-14 22:41 ` green [this message]
2015-09-14 22:41 ` [PATCH 17/19] staging/lustre/o2iblnd: leak cmid in kiblnd_dev_need_failover green
2015-09-14 22:41 ` [PATCH 18/19] staging/lustre/libcfs: remove unused cfs_timer_done green
2015-09-14 22:41 ` [PATCH 19/19] staging/lustre/ptlrpc: make ptlrpcd threads cpt-aware green

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:5a38f3d dfblob:c8b89a3 dfblob:5df493e dfblob:313a56c
dfblob:87bb2ce dfblob:ce6fa55 dfblob:370d5b4 dfblob:594955d
dfblob:ff6d2e2 dfblob:c504d15 dfblob:470698b dfblob:7d0a3e2
dfblob:114c550 dfblob:f41f762 dfblob:1800db1 dfblob:90b24fc )
 OR (
bs:"[PATCH 16/19] staging/lustre/osc: use global osc_rq_pool to reduce memory usage" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1442270495-1655259-17-git-send-email-green@linuxhacker.ru \
    --to=green@linuxhacker.ru \
    --cc=andreas.dilger@intel.com \
    --cc=devel@driverdev.osuosl.org \
    --cc=gregkh@linuxfoundation.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=lixi@ddn.com \
    --cc=lwu@ddn.com \
    --cc=oleg.drokin@intel.com \
    --cc=wshilong@ddn.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.