lustre-devel-lustre.org archive mirror
 help / color / mirror / Atom feed
From: James Simmons <jsimmons@infradead.org>
To: Andreas Dilger <adilger@whamcloud.com>,
	Oleg Drokin <green@whamcloud.com>, NeilBrown <neilb@suse.de>
Cc: Lai Siyao <lai.siyao@whamcloud.com>,
	Lustre Development List <lustre-devel@lists.lustre.org>
Subject: [lustre-devel] [PATCH 09/15] lustre: dne: dir migrate in QOS mode
Date: Mon,  8 Nov 2021 10:07:37 -0500	[thread overview]
Message-ID: <1636384063-13838-10-git-send-email-jsimmons@infradead.org> (raw)
In-Reply-To: <1636384063-13838-1-git-send-email-jsimmons@infradead.org>

From: Lai Siyao <lai.siyao@whamcloud.com>

Support "lfs migrate -m -1 ..." to migrate a directory to MDTs by
space and inode usage. If the system is balanced, the target MDT is
chosen in round-robin mode; otherwise the less full MDTs will be
chosen, and the most full MDT is avoided.

Another minor change: if a directory is migrated to specific MDTs,
and the target stripe count is more than 1, its subdirs may not be
migrated to the MDT specified in the command, but to the MDT where
the parent stripe is located (the subdir will be striped too), as
this avoids unnecessary remote directories. NB, for a command like
"lfs migrate -m 0,1,2 ...", though the subdir may be located on
any of MDT0, MDT1 or MDT2, its stripes will be striped over these
three MDTs, but for a command like "lfs migrate -m 0 -c 3 ...", the
subdir may be striped on other MDTs if the subdir is not located on
MDT0.

WC-bug-id: https://jira.whamcloud.com/browse/LU-13076
Lustre-commit: 378c7567876b430d0 ("LU-13076 dne: dir migrate in QOS mode")
Signed-off-by: Lai Siyao <lai.siyao@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/44886
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Hongchao Zhang <hongchao@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Signed-off-by: James Simmons <jsimmons@infradead.org>
---
 fs/lustre/lmv/lmv_obd.c | 176 +++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 158 insertions(+), 18 deletions(-)

diff --git a/fs/lustre/lmv/lmv_obd.c b/fs/lustre/lmv/lmv_obd.c
index fb64b6c..b31f943 100644
--- a/fs/lustre/lmv/lmv_obd.c
+++ b/fs/lustre/lmv/lmv_obd.c
@@ -1427,7 +1427,7 @@ static int lmv_close(struct obd_export *exp, struct md_op_data *op_data,
 	return md_close(tgt->ltd_exp, op_data, mod, request);
 }
 
-static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, u32 *mdt,
+static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, u32 mdt,
 					      unsigned short dir_depth)
 {
 	struct lu_tgt_desc *tgt, *cur = NULL;
@@ -1462,7 +1462,7 @@ static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, u32 *mdt,
 
 		tgt->ltd_qos.ltq_usable = 1;
 		lu_tgt_qos_weight_calc(tgt);
-		if (tgt->ltd_index == *mdt)
+		if (tgt->ltd_index == mdt)
 			cur = tgt;
 		total_avail += tgt->ltd_qos.ltq_avail;
 		total_weight += tgt->ltd_qos.ltq_weight;
@@ -1477,7 +1477,6 @@ static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, u32 *mdt,
 	       (total_usable * 256 * (1 + dir_depth / 4));
 	if (cur && cur->ltd_qos.ltq_avail >= rand) {
 		tgt = cur;
-		rc = 0;
 		goto unlock;
 	}
 
@@ -1491,9 +1490,7 @@ static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, u32 *mdt,
 		if (cur_weight < rand)
 			continue;
 
-		*mdt = tgt->ltd_index;
 		ltd_qos_update(&lmv->lmv_mdt_descs, tgt, &total_weight);
-		rc = 0;
 		goto unlock;
 	}
 
@@ -1506,7 +1503,7 @@ static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, u32 *mdt,
 	return tgt;
 }
 
-static struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv, u32 *mdt)
+static struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv)
 {
 	struct lu_tgt_desc *tgt;
 	int i;
@@ -1520,8 +1517,7 @@ static struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv, u32 *mdt)
 		if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
 			continue;
 
-		*mdt = tgt->ltd_index;
-		lmv->lmv_qos_rr_index = (*mdt + 1) %
+		lmv->lmv_qos_rr_index = (tgt->ltd_index + 1) %
 					lmv->lmv_mdt_descs.ltd_tgts_size;
 		spin_unlock(&lmv->lmv_lock);
 
@@ -1532,6 +1528,65 @@ static struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv, u32 *mdt)
 	return ERR_PTR(-ENODEV);
 }
 
+/* locate MDT which is less full (avoid the most full MDT) */
+static struct lu_tgt_desc *lmv_locate_tgt_lf(struct lmv_obd *lmv)
+{
+	struct lu_tgt_desc *min = NULL;
+	struct lu_tgt_desc *tgt;
+	u64 avail = 0;
+	u64 rand;
+
+	if (!ltd_qos_is_usable(&lmv->lmv_mdt_descs))
+		return ERR_PTR(-EAGAIN);
+
+	down_write(&lmv->lmv_qos.lq_rw_sem);
+
+	if (!ltd_qos_is_usable(&lmv->lmv_mdt_descs)) {
+		tgt = ERR_PTR(-EAGAIN);
+		goto unlock;
+	}
+
+	lmv_foreach_tgt(lmv, tgt) {
+		if (!tgt->ltd_exp || !tgt->ltd_active) {
+			tgt->ltd_qos.ltq_usable = 0;
+			continue;
+		}
+
+		tgt->ltd_qos.ltq_usable = 1;
+		lu_tgt_qos_weight_calc(tgt);
+		avail += tgt->ltd_qos.ltq_avail;
+		if (!min || min->ltd_qos.ltq_avail > tgt->ltd_qos.ltq_avail)
+			min = tgt;
+	}
+
+	/* avoid the most full MDT */
+	if (min)
+		avail -= min->ltd_qos.ltq_avail;
+
+	rand = lu_prandom_u64_max(avail);
+	avail = 0;
+	lmv_foreach_connected_tgt(lmv, tgt) {
+		if (!tgt->ltd_qos.ltq_usable)
+			continue;
+
+		if (tgt == min)
+			continue;
+
+		avail += tgt->ltd_qos.ltq_avail;
+		if (avail < rand)
+			continue;
+
+		goto unlock;
+	}
+
+	/* no proper target found */
+	tgt = ERR_PTR(-EAGAIN);
+unlock:
+	up_write(&lmv->lmv_qos.lq_rw_sem);
+
+	return tgt;
+}
+
 /* locate MDT by file name, for striped directory, the file name hash decides
  * which stripe its dirent is stored.
  */
@@ -1847,7 +1902,7 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data,
 	} else if (lmv_op_qos_mkdir(op_data)) {
 		struct lmv_tgt_desc *tmp = tgt;
 
-		tgt = lmv_locate_tgt_qos(lmv, &op_data->op_mds,
+		tgt = lmv_locate_tgt_qos(lmv, op_data->op_mds,
 					 op_data->op_dir_depth);
 		if (tgt == ERR_PTR(-EAGAIN)) {
 			if (ltd_qos_is_balanced(&lmv->lmv_mdt_descs) &&
@@ -1858,11 +1913,12 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data,
 				 */
 				tgt = tmp;
 			else
-				tgt = lmv_locate_tgt_rr(lmv, &op_data->op_mds);
+				tgt = lmv_locate_tgt_rr(lmv);
 		}
 		if (IS_ERR(tgt))
 			return PTR_ERR(tgt);
 
+		op_data->op_mds = tgt->ltd_index;
 		/*
 		 * only update statfs after QoS mkdir, this means the cached
 		 * statfs may be stale, and current mkdir may not follow QoS
@@ -2069,6 +2125,53 @@ static int lmv_link(struct obd_export *exp, struct md_op_data *op_data,
 	return md_link(tgt->ltd_exp, op_data, request);
 }
 
+/* migrate the top directory */
+static inline bool lmv_op_topdir_migrate(const struct md_op_data *op_data)
+{
+	if (!S_ISDIR(op_data->op_mode))
+		return false;
+
+	if (lmv_dir_layout_changing(op_data->op_mea1))
+		return false;
+
+	return true;
+}
+
+/* migrate top dir to specific MDTs */
+static inline bool lmv_topdir_specific_migrate(const struct md_op_data *op_data)
+{
+	const struct lmv_user_md *lum = op_data->op_data;
+
+	if (!lmv_op_topdir_migrate(op_data))
+		return false;
+
+	return le32_to_cpu(lum->lum_stripe_offset) != LMV_OFFSET_DEFAULT;
+}
+
+/* migrate top dir in QoS mode if user issued "lfs migrate -m -1..." */
+static inline bool lmv_topdir_qos_migrate(const struct md_op_data *op_data)
+{
+	const struct lmv_user_md *lum = op_data->op_data;
+
+	if (!lmv_op_topdir_migrate(op_data))
+		return false;
+
+	return le32_to_cpu(lum->lum_stripe_offset) == LMV_OFFSET_DEFAULT;
+}
+
+static inline bool lmv_subdir_specific_migrate(const struct md_op_data *op_data)
+{
+	const struct lmv_user_md *lum = op_data->op_data;
+
+	if (!S_ISDIR(op_data->op_mode))
+		return false;
+
+	if (!lmv_dir_layout_changing(op_data->op_mea1))
+		return false;
+
+	return le32_to_cpu(lum->lum_stripe_offset) != LMV_OFFSET_DEFAULT;
+}
+
 static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data,
 			const char *name, size_t namelen,
 			struct ptlrpc_request **request)
@@ -2133,19 +2236,56 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data,
 	if (IS_ERR(child_tgt))
 		return PTR_ERR(child_tgt);
 
-	/* for directory, migrate to MDT specified by lum_stripe_offset;
-	 * otherwise migrate to the target stripe of parent, but parent
-	 * directory may have finished migration (normally current file too),
-	 * allocate FID on MDT lum_stripe_offset, and server will check
-	 * whether file was migrated already.
-	 */
-	if (S_ISDIR(op_data->op_mode) || !tp_tgt) {
+	if (lmv_topdir_specific_migrate(op_data)) {
 		struct lmv_user_md *lum = op_data->op_data;
 
 		op_data->op_mds = le32_to_cpu(lum->lum_stripe_offset);
-	} else  {
+	} else if (lmv_topdir_qos_migrate(op_data)) {
+		tgt = lmv_locate_tgt_lf(lmv);
+		if (tgt == ERR_PTR(-EAGAIN))
+			tgt = lmv_locate_tgt_rr(lmv);
+		if (IS_ERR(tgt))
+			return PTR_ERR(tgt);
+
+		op_data->op_mds = tgt->ltd_index;
+	} else if (lmv_subdir_specific_migrate(op_data)) {
+		struct lmv_user_md *lum = op_data->op_data;
+		u32 i;
+
+		LASSERT(tp_tgt);
+		if (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC) {
+			/* adjust MDTs in lum, since subdir is located on where
+			 * its parent stripe is, not the first specified MDT.
+			 */
+			for (i = 0; i < le32_to_cpu(lum->lum_stripe_count);
+			     i++) {
+				if (le32_to_cpu(lum->lum_objects[i].lum_mds) ==
+				    tp_tgt->ltd_index)
+					break;
+			}
+
+			if (i == le32_to_cpu(lum->lum_stripe_count))
+				return -ENODEV;
+
+			lum->lum_objects[i].lum_mds =
+				lum->lum_objects[0].lum_mds;
+			lum->lum_objects[0].lum_mds =
+				cpu_to_le32(tp_tgt->ltd_index);
+		}
+		/* NB, the above adjusts subdir migration for command like
+		 * "lfs migrate -m 0,1,2 ...", but for migration like
+		 * "lfs migrate -m 0 -c 2 ...", the top dir is migrated to MDT0
+		 * and MDT1, however its subdir may be migrated to MDT1 and MDT2
+		 */
+
+		lum->lum_stripe_offset = cpu_to_le32(tp_tgt->ltd_index);
 		op_data->op_mds = tp_tgt->ltd_index;
+	} else if (tp_tgt) {
+		op_data->op_mds = tp_tgt->ltd_index;
+	} else {
+		op_data->op_mds = sp_tgt->ltd_index;
 	}
+
 	rc = lmv_fid_alloc(NULL, exp, &target_fid, op_data);
 	if (rc)
 		return rc;
-- 
1.8.3.1

_______________________________________________
lustre-devel mailing list
lustre-devel@lists.lustre.org
http://lists.lustre.org/listinfo.cgi/lustre-devel-lustre.org

  parent reply	other threads:[~2021-11-08 15:08 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-11-08 15:07 [lustre-devel] [PATCH 00/15] lustre: update to OpenSFS tree Nov 8, 2021 James Simmons
2021-11-08 15:07 ` [lustre-devel] [PATCH 01/15] lustre: sec: keep encryption context in xattr cache James Simmons
2021-11-08 15:07 ` [lustre-devel] [PATCH 02/15] lustre: mdc: add support for grant shrink James Simmons
2021-11-08 15:07 ` [lustre-devel] [PATCH 03/15] lnet: Fix reference leak in lnet_parse James Simmons
2021-11-08 15:07 ` [lustre-devel] [PATCH 04/15] lnet: socklnd: lock ksnc_tx_queue list processing James Simmons
2021-11-08 15:07 ` [lustre-devel] [PATCH 05/15] lustre: ptlrpc: align function names with param names James Simmons
2021-11-08 15:07 ` [lustre-devel] [PATCH 06/15] lnet: don't retry allocating router buffers James Simmons
2021-11-08 15:07 ` [lustre-devel] [PATCH 07/15] lustre: ptlrpc: recalc timer on EINPROGRESS reply James Simmons
2021-11-08 15:07 ` [lustre-devel] [PATCH 08/15] lustre: obdclass: add start time to stats files James Simmons
2021-11-08 15:07 ` James Simmons [this message]
2021-11-08 15:07 ` [lustre-devel] [PATCH 10/15] lustre: lov: fix error handling in lov_new_pool James Simmons
2021-11-08 15:07 ` [lustre-devel] [PATCH 11/15] lustre: vfs: set_nlink() is not race-safe James Simmons
2021-11-08 15:07 ` [lustre-devel] [PATCH 12/15] lustre: ptlrpc: remove LASSERT in nrs_polices debugfs handler James Simmons
2021-11-08 15:07 ` [lustre-devel] [PATCH 13/15] lnet: socklnd: default conns_per_peer to 0 James Simmons
2021-11-08 15:07 ` [lustre-devel] [PATCH 14/15] lnet: don't use hops to determine the route state James Simmons
2021-11-08 15:07 ` [lustre-devel] [PATCH 15/15] lustre: lmv: update default LMV upon any change James Simmons

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1636384063-13838-10-git-send-email-jsimmons@infradead.org \
    --to=jsimmons@infradead.org \
    --cc=adilger@whamcloud.com \
    --cc=green@whamcloud.com \
    --cc=lai.siyao@whamcloud.com \
    --cc=lustre-devel@lists.lustre.org \
    --cc=neilb@suse.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).