Linux RAID subsystem development

Linux RAID subsystem development
 help / color / mirror / Atom feed

* [PATCH 24/27] drbd: implement REQ_OP_WRITE_ZEROES
From: Christoph Hellwig @ 2017-04-05 17:21 UTC (permalink / raw)
  To: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg
  Cc: linux-block, linux-raid, dm-devel, linux-scsi, drbd-dev
In-Reply-To: <20170405172125.22600-1-hch@lst.de>

It seems like DRBD assumes its on the wire TRIM request always zeroes data.
Use that fact to implement REQ_OP_WRITE_ZEROES.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
---
 drivers/block/drbd/drbd_main.c     | 3 ++-
 drivers/block/drbd/drbd_nl.c       | 2 ++
 drivers/block/drbd/drbd_receiver.c | 6 +++---
 drivers/block/drbd/drbd_req.c      | 7 +++++--
 drivers/block/drbd/drbd_worker.c   | 4 +++-
 5 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 92c60cbd04ee..8e62d9f65510 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1668,7 +1668,8 @@ static u32 bio_flags_to_wire(struct drbd_connection *connection,
 			(bio->bi_opf & REQ_FUA ? DP_FUA : 0) |
 			(bio->bi_opf & REQ_PREFLUSH ? DP_FLUSH : 0) |
 			(bio_op(bio) == REQ_OP_WRITE_SAME ? DP_WSAME : 0) |
-			(bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0);
+			(bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0) |
+			(bio_op(bio) == REQ_OP_WRITE_ZEROES ? DP_DISCARD : 0);
 	else
 		return bio->bi_opf & REQ_SYNC ? DP_RW_SYNC : 0;
 }
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 908c704e20aa..e4516d3b971d 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1217,10 +1217,12 @@ static void decide_on_discard_support(struct drbd_device *device,
 		blk_queue_discard_granularity(q, 512);
 		q->limits.max_discard_sectors = drbd_max_discard_sectors(connection);
 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+		q->limits.max_write_zeroes_sectors = drbd_max_discard_sectors(connection);
 	} else {
 		queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
 		blk_queue_discard_granularity(q, 0);
 		q->limits.max_discard_sectors = 0;
+		q->limits.max_write_zeroes_sectors = 0;
 	}
 }
 
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index bc1d296581f9..1b0a2be24f39 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -2285,7 +2285,7 @@ static unsigned long wire_flags_to_bio_flags(u32 dpf)
 static unsigned long wire_flags_to_bio_op(u32 dpf)
 {
 	if (dpf & DP_DISCARD)
-		return REQ_OP_DISCARD;
+		return REQ_OP_WRITE_ZEROES;
 	else
 		return REQ_OP_WRITE;
 }
@@ -2476,7 +2476,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 	op_flags = wire_flags_to_bio_flags(dp_flags);
 	if (pi->cmd == P_TRIM) {
 		D_ASSERT(peer_device, peer_req->i.size > 0);
-		D_ASSERT(peer_device, op == REQ_OP_DISCARD);
+		D_ASSERT(peer_device, op == REQ_OP_WRITE_ZEROES);
 		D_ASSERT(peer_device, peer_req->pages == NULL);
 	} else if (peer_req->pages == NULL) {
 		D_ASSERT(device, peer_req->i.size == 0);
@@ -4789,7 +4789,7 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac
 
 	if (get_ldev(device)) {
 		struct drbd_peer_request *peer_req;
-		const int op = REQ_OP_DISCARD;
+		const int op = REQ_OP_WRITE_ZEROES;
 
 		peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector,
 					       size, 0, GFP_NOIO);
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 6da9ea8c48b6..b5730e17b455 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -59,6 +59,7 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio
 	drbd_req_make_private_bio(req, bio_src);
 	req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0)
 		      | (bio_op(bio_src) == REQ_OP_WRITE_SAME ? RQ_WSAME : 0)
+		      | (bio_op(bio_src) == REQ_OP_WRITE_ZEROES ? RQ_UNMAP : 0)
 		      | (bio_op(bio_src) == REQ_OP_DISCARD ? RQ_UNMAP : 0);
 	req->device = device;
 	req->master_bio = bio_src;
@@ -1180,7 +1181,8 @@ drbd_submit_req_private_bio(struct drbd_request *req)
 	if (get_ldev(device)) {
 		if (drbd_insert_fault(device, type))
 			bio_io_error(bio);
-		else if (bio_op(bio) == REQ_OP_DISCARD)
+		else if (bio_op(bio) == REQ_OP_WRITE_ZEROES ||
+			 bio_op(bio) == REQ_OP_DISCARD)
 			drbd_process_discard_req(req);
 		else
 			generic_make_request(bio);
@@ -1234,7 +1236,8 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
 	_drbd_start_io_acct(device, req);
 
 	/* process discards always from our submitter thread */
-	if (bio_op(bio) & REQ_OP_DISCARD)
+	if ((bio_op(bio) & REQ_OP_WRITE_ZEROES) ||
+	    (bio_op(bio) & REQ_OP_DISCARD))
 		goto queue_for_submitter_thread;
 
 	if (rw == WRITE && req->private_bio && req->i.size
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 3bff33f21435..1afcb4e02d8d 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -174,7 +174,8 @@ void drbd_peer_request_endio(struct bio *bio)
 	struct drbd_peer_request *peer_req = bio->bi_private;
 	struct drbd_device *device = peer_req->peer_device->device;
 	bool is_write = bio_data_dir(bio) == WRITE;
-	bool is_discard = !!(bio_op(bio) == REQ_OP_DISCARD);
+	bool is_discard = bio_op(bio) == REQ_OP_WRITE_ZEROES ||
+			  bio_op(bio) == REQ_OP_DISCARD;
 
 	if (bio->bi_error && __ratelimit(&drbd_ratelimit_state))
 		drbd_warn(device, "%s: error=%d s=%llus\n",
@@ -249,6 +250,7 @@ void drbd_request_endio(struct bio *bio)
 	/* to avoid recursion in __req_mod */
 	if (unlikely(bio->bi_error)) {
 		switch (bio_op(bio)) {
+		case REQ_OP_WRITE_ZEROES:
 		case REQ_OP_DISCARD:
 			if (bio->bi_error == -EOPNOTSUPP)
 				what = DISCARD_COMPLETED_NOTSUPP;
-- 
2.11.0

^ permalink raw reply related

* [PATCH 25/27] block: remove the discard_zeroes_data flag
From: Christoph Hellwig @ 2017-04-05 17:21 UTC (permalink / raw)
  To: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170405172125.22600-1-hch@lst.de>

Now that we use the proper REQ_OP_WRITE_ZEROES operation everywhere we can
kill this hack.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
---
 Documentation/ABI/testing/sysfs-block | 10 ++-----
 Documentation/block/queue-sysfs.txt   |  5 ----
 block/blk-lib.c                       |  7 +----
 block/blk-settings.c                  |  3 ---
 block/blk-sysfs.c                     |  2 +-
 block/compat_ioctl.c                  |  2 +-
 block/ioctl.c                         |  2 +-
 drivers/block/drbd/drbd_main.c        |  2 --
 drivers/block/drbd/drbd_nl.c          |  7 +----
 drivers/block/loop.c                  |  2 --
 drivers/block/mtip32xx/mtip32xx.c     |  1 -
 drivers/block/nbd.c                   |  1 -
 drivers/md/dm-cache-target.c          |  1 -
 drivers/md/dm-crypt.c                 |  1 -
 drivers/md/dm-raid.c                  |  6 ++---
 drivers/md/dm-raid1.c                 |  1 -
 drivers/md/dm-table.c                 | 19 -------------
 drivers/md/dm-thin.c                  |  2 --
 drivers/md/raid5.c                    | 50 +++++++++++------------------------
 drivers/scsi/sd.c                     |  5 ----
 drivers/target/target_core_device.c   |  2 +-
 include/linux/blkdev.h                | 15 -----------
 include/linux/device-mapper.h         |  5 ----
 23 files changed, 27 insertions(+), 124 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index 2da04ce6aeef..dea212db9df3 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -213,14 +213,8 @@ What:		/sys/block/<disk>/queue/discard_zeroes_data
 Date:		May 2011
 Contact:	Martin K. Petersen <martin.petersen@oracle.com>
 Description:
-		Devices that support discard functionality may return
-		stale or random data when a previously discarded block
-		is read back. This can cause problems if the filesystem
-		expects discarded blocks to be explicitly cleared. If a
-		device reports that it deterministically returns zeroes
-		when a discarded area is read the discard_zeroes_data
-		parameter will be set to one. Otherwise it will be 0 and
-		the result of reading a discarded area is undefined.
+		Will always return 0.  Don't rely on any specific behavior
+		for discards, and don't read this file.
 
 What:		/sys/block/<disk>/queue/write_same_max_bytes
 Date:		January 2012
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index b7f6bdc96d73..2c1e67058fd3 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -43,11 +43,6 @@ large discards are issued, setting this value lower will make Linux issue
 smaller discards and potentially help reduce latencies induced by large
 discard operations.
 
-discard_zeroes_data (RO)
-------------------------
-When read, this file will show if the discarded block are zeroed by the
-device or not. If its value is '1' the blocks are zeroed otherwise not.
-
 hw_sector_size (RO)
 -------------------
 This is the hardware sector size of the device, in bytes.
diff --git a/block/blk-lib.c b/block/blk-lib.c
index b0c6c4bcf441..e8caecd71688 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -37,17 +37,12 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		return -ENXIO;
 
 	if (flags & BLKDEV_DISCARD_SECURE) {
-		if (flags & BLKDEV_DISCARD_ZERO)
-			return -EOPNOTSUPP;
 		if (!blk_queue_secure_erase(q))
 			return -EOPNOTSUPP;
 		op = REQ_OP_SECURE_ERASE;
 	} else {
 		if (!blk_queue_discard(q))
 			return -EOPNOTSUPP;
-		if ((flags & BLKDEV_DISCARD_ZERO) &&
-		    !q->limits.discard_zeroes_data)
-			return -EOPNOTSUPP;
 		op = REQ_OP_DISCARD;
 	}
 
@@ -126,7 +121,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 			&bio);
 	if (!ret && bio) {
 		ret = submit_bio_wait(bio);
-		if (ret == -EOPNOTSUPP && !(flags & BLKDEV_DISCARD_ZERO))
+		if (ret == -EOPNOTSUPP)
 			ret = 0;
 		bio_put(bio);
 	}
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 1e7174ffc9d4..4fa81ed383ca 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -103,7 +103,6 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->discard_granularity = 0;
 	lim->discard_alignment = 0;
 	lim->discard_misaligned = 0;
-	lim->discard_zeroes_data = 0;
 	lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
 	lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
 	lim->alignment_offset = 0;
@@ -127,7 +126,6 @@ void blk_set_stacking_limits(struct queue_limits *lim)
 	blk_set_default_limits(lim);
 
 	/* Inherit limits from component devices */
-	lim->discard_zeroes_data = 1;
 	lim->max_segments = USHRT_MAX;
 	lim->max_discard_segments = 1;
 	lim->max_hw_sectors = UINT_MAX;
@@ -609,7 +607,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 	t->io_opt = lcm_not_zero(t->io_opt, b->io_opt);
 
 	t->cluster &= b->cluster;
-	t->discard_zeroes_data &= b->discard_zeroes_data;
 
 	/* Physical block size a multiple of the logical block size? */
 	if (t->physical_block_size & (t->logical_block_size - 1)) {
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 45854266e398..b65ce3c65ae8 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -208,7 +208,7 @@ static ssize_t queue_discard_max_store(struct request_queue *q,
 
 static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
 {
-	return queue_var_show(queue_discard_zeroes_data(q), page);
+	return queue_var_show(0, page);
 }
 
 static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 570021a0dc1c..04325b81c2b4 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -685,7 +685,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	case BLKALIGNOFF:
 		return compat_put_int(arg, bdev_alignment_offset(bdev));
 	case BLKDISCARDZEROES:
-		return compat_put_uint(arg, bdev_discard_zeroes_data(bdev));
+		return compat_put_uint(arg, 0);
 	case BLKFLSBUF:
 	case BLKROSET:
 	case BLKDISCARD:
diff --git a/block/ioctl.c b/block/ioctl.c
index 8ea00a41be01..0de02ee67eed 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -547,7 +547,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	case BLKALIGNOFF:
 		return put_int(arg, bdev_alignment_offset(bdev));
 	case BLKDISCARDZEROES:
-		return put_uint(arg, bdev_discard_zeroes_data(bdev));
+		return put_uint(arg, 0);
 	case BLKSECTGET:
 		max_sectors = min_t(unsigned int, USHRT_MAX,
 				    queue_max_sectors(bdev_get_queue(bdev)));
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 8e62d9f65510..84455c365f57 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -931,7 +931,6 @@ void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct r
 		p->qlim->io_min = cpu_to_be32(queue_io_min(q));
 		p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
 		p->qlim->discard_enabled = blk_queue_discard(q);
-		p->qlim->discard_zeroes_data = queue_discard_zeroes_data(q);
 		p->qlim->write_same_capable = !!q->limits.max_write_same_sectors;
 	} else {
 		q = device->rq_queue;
@@ -941,7 +940,6 @@ void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct r
 		p->qlim->io_min = cpu_to_be32(queue_io_min(q));
 		p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
 		p->qlim->discard_enabled = 0;
-		p->qlim->discard_zeroes_data = 0;
 		p->qlim->write_same_capable = 0;
 	}
 }
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index e4516d3b971d..02255a0d68b9 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1199,10 +1199,6 @@ static void decide_on_discard_support(struct drbd_device *device,
 	struct drbd_connection *connection = first_peer_device(device)->connection;
 	bool can_do = b ? blk_queue_discard(b) : true;
 
-	if (can_do && b && !b->limits.discard_zeroes_data && !discard_zeroes_if_aligned) {
-		can_do = false;
-		drbd_info(device, "discard_zeroes_data=0 and discard_zeroes_if_aligned=no: disabling discards\n");
-	}
 	if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_TRIM)) {
 		can_do = false;
 		drbd_info(connection, "peer DRBD too old, does not support TRIM: disabling discards\n");
@@ -1484,8 +1480,7 @@ static void sanitize_disk_conf(struct drbd_device *device, struct disk_conf *dis
 	if (disk_conf->al_extents > drbd_al_extents_max(nbc))
 		disk_conf->al_extents = drbd_al_extents_max(nbc);
 
-	if (!blk_queue_discard(q)
-	    || (!q->limits.discard_zeroes_data && !disk_conf->discard_zeroes_if_aligned)) {
+	if (!blk_queue_discard(q)) {
 		if (disk_conf->rs_discard_granularity) {
 			disk_conf->rs_discard_granularity = 0; /* disable feature */
 			drbd_info(device, "rs_discard_granularity feature disabled\n");
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 3bb04c1a4ba1..3081d83d2ea3 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -828,7 +828,6 @@ static void loop_config_discard(struct loop_device *lo)
 		q->limits.discard_alignment = 0;
 		blk_queue_max_discard_sectors(q, 0);
 		blk_queue_max_write_zeroes_sectors(q, 0);
-		q->limits.discard_zeroes_data = 0;
 		queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
 		return;
 	}
@@ -837,7 +836,6 @@ static void loop_config_discard(struct loop_device *lo)
 	q->limits.discard_alignment = 0;
 	blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
 	blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
-	q->limits.discard_zeroes_data = 1;
 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
 }
 
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 30076e7753bc..05e3e664ea1b 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -4025,7 +4025,6 @@ static int mtip_block_initialize(struct driver_data *dd)
 		dd->queue->limits.discard_granularity = 4096;
 		blk_queue_max_discard_sectors(dd->queue,
 			MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES);
-		dd->queue->limits.discard_zeroes_data = 0;
 	}
 
 	/* Set the capacity of the device in 512 byte sectors. */
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 03ae72985c79..b02f2362fdf7 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1110,7 +1110,6 @@ static int nbd_dev_add(int index)
 	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, disk->queue);
 	disk->queue->limits.discard_granularity = 512;
 	blk_queue_max_discard_sectors(disk->queue, UINT_MAX);
-	disk->queue->limits.discard_zeroes_data = 0;
 	blk_queue_max_hw_sectors(disk->queue, 65536);
 	disk->queue->limits.max_sectors = 256;
 
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 9c689b34e6e7..975922c8f231 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -2773,7 +2773,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 
 	ti->num_discard_bios = 1;
 	ti->discards_supported = true;
-	ti->discard_zeroes_data_unsupported = true;
 	ti->split_discard_bios = false;
 
 	cache->features = ca->features;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 389a3637ffcc..ef1d836bd81b 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -2030,7 +2030,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	wake_up_process(cc->write_thread);
 
 	ti->num_flush_bios = 1;
-	ti->discard_zeroes_data_unsupported = true;
 
 	return 0;
 
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index f8564d63982f..468f1380de1d 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -2813,7 +2813,9 @@ static void configure_discard_support(struct raid_set *rs)
 	/* Assume discards not supported until after checks below. */
 	ti->discards_supported = false;
 
-	/* RAID level 4,5,6 require discard_zeroes_data for data integrity! */
+	/*
+	 * XXX: RAID level 4,5,6 require zeroing for safety.
+	 */
 	raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6);
 
 	for (i = 0; i < rs->raid_disks; i++) {
@@ -2827,8 +2829,6 @@ static void configure_discard_support(struct raid_set *rs)
 			return;
 
 		if (raid456) {
-			if (!q->limits.discard_zeroes_data)
-				return;
 			if (!devices_handle_discard_safely) {
 				DMERR("raid456 discard support disabled due to discard_zeroes_data uncertainty.");
 				DMERR("Set dm-raid.devices_handle_discard_safely=Y to override.");
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 2ddc2d20e62d..a95cbb80fb34 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1124,7 +1124,6 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	ti->num_flush_bios = 1;
 	ti->num_discard_bios = 1;
 	ti->per_io_data_size = sizeof(struct dm_raid1_bio_record);
-	ti->discard_zeroes_data_unsupported = true;
 
 	ms->kmirrord_wq = alloc_workqueue("kmirrord", WQ_MEM_RECLAIM, 0);
 	if (!ms->kmirrord_wq) {
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 5cd665c91ead..958275aca008 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1449,22 +1449,6 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
 	return false;
 }
 
-static bool dm_table_discard_zeroes_data(struct dm_table *t)
-{
-	struct dm_target *ti;
-	unsigned i = 0;
-
-	/* Ensure that all targets supports discard_zeroes_data. */
-	while (i < dm_table_get_num_targets(t)) {
-		ti = dm_table_get_target(t, i++);
-
-		if (ti->discard_zeroes_data_unsupported)
-			return false;
-	}
-
-	return true;
-}
-
 static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
 			    sector_t start, sector_t len, void *data)
 {
@@ -1620,9 +1604,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 	}
 	blk_queue_write_cache(q, wc, fua);
 
-	if (!dm_table_discard_zeroes_data(t))
-		q->limits.discard_zeroes_data = 0;
-
 	/* Ensure that all underlying devices are non-rotational. */
 	if (dm_table_all_devices_attribute(t, device_is_nonrot))
 		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 2b266a2b5035..a5f1916f621a 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -3263,7 +3263,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	 * them down to the data device.  The thin device's discard
 	 * processing will cause mappings to be removed from the btree.
 	 */
-	ti->discard_zeroes_data_unsupported = true;
 	if (pf.discard_enabled && pf.discard_passdown) {
 		ti->num_discard_bios = 1;
 
@@ -4119,7 +4118,6 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	ti->per_io_data_size = sizeof(struct dm_thin_endio_hook);
 
 	/* In case the pool supports discards, pass them on. */
-	ti->discard_zeroes_data_unsupported = true;
 	if (tc->pool->pf.discard_enabled) {
 		ti->discards_supported = true;
 		ti->num_discard_bios = 1;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8cf1f86dcd05..d6ae8d22d461 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7229,7 +7229,6 @@ static int raid5_run(struct mddev *mddev)
 
 	if (mddev->queue) {
 		int chunk_size;
-		bool discard_supported = true;
 		/* read-ahead size must cover two whole stripes, which
 		 * is 2 * (datadisks) * chunksize where 'n' is the
 		 * number of raid devices
@@ -7265,12 +7264,6 @@ static int raid5_run(struct mddev *mddev)
 		blk_queue_max_discard_sectors(mddev->queue,
 					      0xfffe * STRIPE_SECTORS);
 
-		/*
-		 * unaligned part of discard request will be ignored, so can't
-		 * guarantee discard_zeroes_data
-		 */
-		mddev->queue->limits.discard_zeroes_data = 0;
-
 		blk_queue_max_write_same_sectors(mddev->queue, 0);
 		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
 
@@ -7279,35 +7272,24 @@ static int raid5_run(struct mddev *mddev)
 					  rdev->data_offset << 9);
 			disk_stack_limits(mddev->gendisk, rdev->bdev,
 					  rdev->new_data_offset << 9);
-			/*
-			 * discard_zeroes_data is required, otherwise data
-			 * could be lost. Consider a scenario: discard a stripe
-			 * (the stripe could be inconsistent if
-			 * discard_zeroes_data is 0); write one disk of the
-			 * stripe (the stripe could be inconsistent again
-			 * depending on which disks are used to calculate
-			 * parity); the disk is broken; The stripe data of this
-			 * disk is lost.
-			 */
-			if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
-			    !bdev_get_queue(rdev->bdev)->
-						limits.discard_zeroes_data)
-				discard_supported = false;
-			/* Unfortunately, discard_zeroes_data is not currently
-			 * a guarantee - just a hint.  So we only allow DISCARD
-			 * if the sysadmin has confirmed that only safe devices
-			 * are in use by setting a module parameter.
-			 */
-			if (!devices_handle_discard_safely) {
-				if (discard_supported) {
-					pr_info("md/raid456: discard support disabled due to uncertainty.\n");
-					pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n");
-				}
-				discard_supported = false;
-			}
 		}
 
-		if (discard_supported &&
+		/*
+		 * zeroing is required, otherwise data
+		 * could be lost. Consider a scenario: discard a stripe
+		 * (the stripe could be inconsistent if
+		 * discard_zeroes_data is 0); write one disk of the
+		 * stripe (the stripe could be inconsistent again
+		 * depending on which disks are used to calculate
+		 * parity); the disk is broken; The stripe data of this
+		 * disk is lost.
+		 *
+		 * We only allow DISCARD if the sysadmin has confirmed that
+		 * only safe devices are in use by setting a module parameter.
+		 * A better idea might be to turn DISCARD into WRITE_ZEROES
+		 * requests, as that is required to be safe.
+		 */
+		if (devices_handle_discard_safely &&
 		    mddev->queue->limits.max_discard_sectors >= (stripe >> 9) &&
 		    mddev->queue->limits.discard_granularity >= stripe)
 			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 001593ed0444..bcb0cb020fd2 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -644,8 +644,6 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
 	unsigned int logical_block_size = sdkp->device->sector_size;
 	unsigned int max_blocks = 0;
 
-	q->limits.discard_zeroes_data = 0;
-
 	/*
 	 * When LBPRZ is reported, discard alignment and granularity
 	 * must be fixed to the logical block size. Otherwise the block
@@ -681,19 +679,16 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
 	case SD_LBP_WS16:
 		max_blocks = min_not_zero(sdkp->max_ws_blocks,
 					  (u32)SD_MAX_WS16_BLOCKS);
-		q->limits.discard_zeroes_data = sdkp->lbprz;
 		break;
 
 	case SD_LBP_WS10:
 		max_blocks = min_not_zero(sdkp->max_ws_blocks,
 					  (u32)SD_MAX_WS10_BLOCKS);
-		q->limits.discard_zeroes_data = sdkp->lbprz;
 		break;
 
 	case SD_LBP_ZERO:
 		max_blocks = min_not_zero(sdkp->max_ws_blocks,
 					  (u32)SD_MAX_WS10_BLOCKS);
-		q->limits.discard_zeroes_data = 1;
 		break;
 	}
 
diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c
index c754ae33bf7b..d2f089cfa9ae 100644
--- a/drivers/target/target_core_device.c
+++ b/drivers/target/target_core_device.c
@@ -851,7 +851,7 @@ bool target_configure_unmap_from_queue(struct se_dev_attrib *attrib,
 	attrib->unmap_granularity = q->limits.discard_granularity / block_size;
 	attrib->unmap_granularity_alignment = q->limits.discard_alignment /
 								block_size;
-	attrib->unmap_zeroes_data = q->limits.discard_zeroes_data;
+	attrib->unmap_zeroes_data = 0;
 	return true;
 }
 EXPORT_SYMBOL(target_configure_unmap_from_queue);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a5055d760661..d5d9dd72418a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -339,7 +339,6 @@ struct queue_limits {
 	unsigned char		misaligned;
 	unsigned char		discard_misaligned;
 	unsigned char		cluster;
-	unsigned char		discard_zeroes_data;
 	unsigned char		raid_partial_stripes_expensive;
 	enum blk_zoned_model	zoned;
 };
@@ -1342,7 +1341,6 @@ extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct page *page);
 
 #define BLKDEV_DISCARD_SECURE	(1 << 0)	/* issue a secure erase */
-#define BLKDEV_DISCARD_ZERO	(1 << 1)	/* must reliably zero data */
 
 extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
@@ -1542,19 +1540,6 @@ static inline int bdev_discard_alignment(struct block_device *bdev)
 	return q->limits.discard_alignment;
 }
 
-static inline unsigned int queue_discard_zeroes_data(struct request_queue *q)
-{
-	if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1)
-		return 1;
-
-	return 0;
-}
-
-static inline unsigned int bdev_discard_zeroes_data(struct block_device *bdev)
-{
-	return queue_discard_zeroes_data(bdev_get_queue(bdev));
-}
-
 static inline unsigned int bdev_write_same(struct block_device *bdev)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 3829bee2302a..c7ea33e38fb9 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -296,11 +296,6 @@ struct dm_target {
 	 * on max_io_len boundary.
 	 */
 	bool split_discard_bios:1;
-
-	/*
-	 * Set if this target does not return zeroes on discarded blocks.
-	 */
-	bool discard_zeroes_data_unsupported:1;
 };
 
 /* Each target can link one of these into the table */
-- 
2.11.0

^ permalink raw reply related

* [PATCH 26/27] scsi: sd: Separate zeroout and discard command choices
From: Christoph Hellwig @ 2017-04-05 17:21 UTC (permalink / raw)
  To: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170405172125.22600-1-hch@lst.de>

From: "Martin K. Petersen" <martin.petersen@oracle.com>

Now that zeroout and discards are distinct operations we need to
separate the policy of choosing the appropriate command. Create a
zeroing_mode which can be one of:

write:			Zeroout assist not present, use regular WRITE
writesame:		Allow WRITE SAME(10/16) with a zeroed payload
writesame_16_unmap:	Allow WRITE SAME(16) with UNMAP
writesame_10_unmap:	Allow WRITE SAME(10) with UNMAP

The last two are conditional on the device being thin provisioned with
LBPRZ=1 and LBPWS=1 or LBPWS10=1 respectively.

Whether to set the UNMAP bit or not depends on the REQ_NOUNMAP flag. And
if none of the _unmap variants are supported, regular WRITE SAME will be
used if the device supports it.

The zeroout_mode is exported in sysfs and the detected mode for a given
device can be overridden using the string constants above.

With this change in place we can now issue WRITE SAME(16) with UNMAP set
for block zeroing applications that require hard guarantees and
logical_block_size granularity. And at the same time use the UNMAP
command with the device's preferred granulary and alignment for discard
operations.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/scsi/sd.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 drivers/scsi/sd.h |  8 ++++++++
 2 files changed, 61 insertions(+), 3 deletions(-)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index bcb0cb020fd2..acf9d17b05d8 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -418,6 +418,46 @@ provisioning_mode_store(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR_RW(provisioning_mode);
 
+static const char *zeroing_mode[] = {
+	[SD_ZERO_WRITE]		= "write",
+	[SD_ZERO_WS]		= "writesame",
+	[SD_ZERO_WS16_UNMAP]	= "writesame_16_unmap",
+	[SD_ZERO_WS10_UNMAP]	= "writesame_10_unmap",
+};
+
+static ssize_t
+zeroing_mode_show(struct device *dev, struct device_attribute *attr,
+		  char *buf)
+{
+	struct scsi_disk *sdkp = to_scsi_disk(dev);
+
+	return snprintf(buf, 20, "%s\n", zeroing_mode[sdkp->zeroing_mode]);
+}
+
+static ssize_t
+zeroing_mode_store(struct device *dev, struct device_attribute *attr,
+		   const char *buf, size_t count)
+{
+	struct scsi_disk *sdkp = to_scsi_disk(dev);
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (!strncmp(buf, zeroing_mode[SD_ZERO_WRITE], 20))
+		sdkp->zeroing_mode = SD_ZERO_WRITE;
+	else if (!strncmp(buf, zeroing_mode[SD_ZERO_WS], 20))
+		sdkp->zeroing_mode = SD_ZERO_WS;
+	else if (!strncmp(buf, zeroing_mode[SD_ZERO_WS16_UNMAP], 20))
+		sdkp->zeroing_mode = SD_ZERO_WS16_UNMAP;
+	else if (!strncmp(buf, zeroing_mode[SD_ZERO_WS10_UNMAP], 20))
+		sdkp->zeroing_mode = SD_ZERO_WS10_UNMAP;
+	else
+		return -EINVAL;
+
+	return count;
+}
+static DEVICE_ATTR_RW(zeroing_mode);
+
 static ssize_t
 max_medium_access_timeouts_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
@@ -496,6 +536,7 @@ static struct attribute *sd_disk_attrs[] = {
 	&dev_attr_app_tag_own.attr,
 	&dev_attr_thin_provisioning.attr,
 	&dev_attr_provisioning_mode.attr,
+	&dev_attr_zeroing_mode.attr,
 	&dev_attr_max_write_same_blocks.attr,
 	&dev_attr_max_medium_access_timeouts.attr,
 	NULL,
@@ -799,10 +840,10 @@ static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd)
 	u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9);
 
 	if (!(rq->cmd_flags & REQ_NOUNMAP)) {
-		switch (sdkp->provisioning_mode) {
-		case SD_LBP_WS16:
+		switch (sdkp->zeroing_mode) {
+		case SD_ZERO_WS16_UNMAP:
 			return sd_setup_write_same16_cmnd(cmd, true);
-		case SD_LBP_WS10:
+		case SD_ZERO_WS10_UNMAP:
 			return sd_setup_write_same10_cmnd(cmd, true);
 		}
 	}
@@ -840,6 +881,15 @@ static void sd_config_write_same(struct scsi_disk *sdkp)
 		sdkp->max_ws_blocks = 0;
 	}
 
+	if (sdkp->lbprz && sdkp->lbpws)
+		sdkp->zeroing_mode = SD_ZERO_WS16_UNMAP;
+	else if (sdkp->lbprz && sdkp->lbpws10)
+		sdkp->zeroing_mode = SD_ZERO_WS10_UNMAP;
+	else if (sdkp->max_ws_blocks)
+		sdkp->zeroing_mode = SD_ZERO_WS;
+	else
+		sdkp->zeroing_mode = SD_ZERO_WRITE;
+
 out:
 	blk_queue_max_write_same_sectors(q, sdkp->max_ws_blocks *
 					 (logical_block_size >> 9));
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 4dac35e96a75..a2c4b5c35379 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -59,6 +59,13 @@ enum {
 	SD_LBP_DISABLE,		/* Discard disabled due to failed cmd */
 };
 
+enum {
+	SD_ZERO_WRITE = 0,	/* Use WRITE(10/16) command */
+	SD_ZERO_WS,		/* Use WRITE SAME(10/16) command */
+	SD_ZERO_WS16_UNMAP,	/* Use WRITE SAME(16) with UNMAP */
+	SD_ZERO_WS10_UNMAP,	/* Use WRITE SAME(10) with UNMAP */
+};
+
 struct scsi_disk {
 	struct scsi_driver *driver;	/* always &sd_template */
 	struct scsi_device *device;
@@ -89,6 +96,7 @@ struct scsi_disk {
 	u8		write_prot;
 	u8		protection_type;/* Data Integrity Field */
 	u8		provisioning_mode;
+	u8		zeroing_mode;
 	unsigned	ATO : 1;	/* state of disk ATO bit */
 	unsigned	cache_override : 1; /* temp override of WCE,RCD */
 	unsigned	WCE : 1;	/* state of disk WCE bit */
-- 
2.11.0

^ permalink raw reply related

* [PATCH 27/27] scsi: sd: Remove LBPRZ dependency for discards
From: Christoph Hellwig @ 2017-04-05 17:21 UTC (permalink / raw)
  To: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170405172125.22600-1-hch@lst.de>

From: "Martin K. Petersen" <martin.petersen@oracle.com>

Separating discards and zeroout operations allows us to remove the LBPRZ
block zeroing constraints from discards and honor the device preferences
for UNMAP commands.

If supported by the device, we'll also choose UNMAP over one of the
WRITE SAME variants for discards.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/scsi/sd.c | 25 ++++++-------------------
 1 file changed, 6 insertions(+), 19 deletions(-)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index acf9d17b05d8..8cf34a8e3eea 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -685,24 +685,11 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
 	unsigned int logical_block_size = sdkp->device->sector_size;
 	unsigned int max_blocks = 0;
 
-	/*
-	 * When LBPRZ is reported, discard alignment and granularity
-	 * must be fixed to the logical block size. Otherwise the block
-	 * layer will drop misaligned portions of the request which can
-	 * lead to data corruption. If LBPRZ is not set, we honor the
-	 * device preference.
-	 */
-	if (sdkp->lbprz) {
-		q->limits.discard_alignment = 0;
-		q->limits.discard_granularity = logical_block_size;
-	} else {
-		q->limits.discard_alignment = sdkp->unmap_alignment *
-			logical_block_size;
-		q->limits.discard_granularity =
-			max(sdkp->physical_block_size,
-			    sdkp->unmap_granularity * logical_block_size);
-	}
-
+	q->limits.discard_alignment =
+		sdkp->unmap_alignment * logical_block_size;
+	q->limits.discard_granularity =
+		max(sdkp->physical_block_size,
+		    sdkp->unmap_granularity * logical_block_size);
 	sdkp->provisioning_mode = mode;
 
 	switch (mode) {
@@ -2842,7 +2829,7 @@ static void sd_read_block_limits(struct scsi_disk *sdkp)
 				sd_config_discard(sdkp, SD_LBP_WS16);
 
 		} else {	/* LBP VPD page tells us what to use */
-			if (sdkp->lbpu && sdkp->max_unmap_blocks && !sdkp->lbprz)
+			if (sdkp->lbpu && sdkp->max_unmap_blocks)
 				sd_config_discard(sdkp, SD_LBP_UNMAP);
 			else if (sdkp->lbpws)
 				sd_config_discard(sdkp, SD_LBP_WS16);
-- 
2.11.0

^ permalink raw reply related

* Can we deprecate ioctl(RAID_VERSION)?
From: jes.sorensen @ 2017-04-05 17:55 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid

Hi Neil,

Looking through the code in mdadm, I noticed a number of cases calling
ioctl(RAID_VERSION). At first I had it confused with metadata version,
but it looks like RAID_VERSION will always return 90000 if it's a valid
raid device.

In the cases we want to confirm the fd is a valid raid array,
ioctl(GET_ARRAY_INFO) should do, or sysfs_read(GET_VERSION).

Am I missing something obvious here, or do you see any reason for
leaving this around?

Thanks,
Jes

^ permalink raw reply

* Re: [PATCH 2/7] Makefile, x86, LLVM: disable unsupported optimization flags
From: Masahiro Yamada @ 2017-04-05 18:08 UTC (permalink / raw)
  To: Michael Davidson
  Cc: Michal Marek, Thomas Gleixner, Ingo Molnar, H. Peter Anvin,
	Herbert Xu, David S. Miller, Shaohua Li, Alexander Potapenko,
	Dmitry Vyukov, Matthias Kaehlcke, X86 ML,
	Linux Kbuild mailing list, Linux Kernel Mailing List,
	linux-crypto, linux-raid
In-Reply-To: <20170317001520.85223-3-md@google.com>

Hi Michael,

2017-03-17 9:15 GMT+09:00 Michael Davidson <md@google.com>:
> Unfortunately, while clang generates a warning about these flags
> being unsupported it still exits with a status of 0 so we have
> to explicitly disable them instead of just using a cc-option check.
>
> Signed-off-by: Michael Davidson <md@google.com>


Instead, does the following work for you?
https://patchwork.kernel.org/patch/9657285/


You need to use
$(call cc-option, ...)
for -falign-jumps=1 and -falign-loops=1



-- 
Best Regards
Masahiro Yamada

^ permalink raw reply

* Re: [PATCH 2/7] Makefile, x86, LLVM: disable unsupported optimization flags
From: Matthias Kaehlcke @ 2017-04-05 19:01 UTC (permalink / raw)
  To: Masahiro Yamada
  Cc: Michael Davidson, Michal Marek, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, Herbert Xu, David S. Miller, Shaohua Li,
	Alexander Potapenko, Dmitry Vyukov, X86 ML,
	Linux Kbuild mailing list, Linux Kernel Mailing List,
	linux-crypto, linux-raid
In-Reply-To: <CAK7LNAR2efi4e3-st-hP3T7v=Hkb81beKibgYeKzxs3Q3kP0uA@mail.gmail.com>

Hi Masahiro,

El Thu, Apr 06, 2017 at 03:08:26AM +0900 Masahiro Yamada ha dit:

> 2017-03-17 9:15 GMT+09:00 Michael Davidson <md@google.com>:
> > Unfortunately, while clang generates a warning about these flags
> > being unsupported it still exits with a status of 0 so we have
> > to explicitly disable them instead of just using a cc-option check.
> >
> > Signed-off-by: Michael Davidson <md@google.com>
> 
> 
> Instead, does the following work for you?
> https://patchwork.kernel.org/patch/9657285/

Thanks for the pointer, I was about to give this change (or rather its
ancestor) a rework myself :)

> You need to use
> $(call cc-option, ...)
> for -falign-jumps=1 and -falign-loops=1

I can confirm that this works.

Thanks

Matthias

^ permalink raw reply

* Re: Can we deprecate ioctl(RAID_VERSION)?
From: jes.sorensen @ 2017-04-05 19:02 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid
In-Reply-To: <wrfjvaqid9ih.fsf@gmail.com>

jes.sorensen@gmail.com writes:
> Hi Neil,
>
> Looking through the code in mdadm, I noticed a number of cases calling
> ioctl(RAID_VERSION). At first I had it confused with metadata version,
> but it looks like RAID_VERSION will always return 90000 if it's a valid
> raid device.
>
> In the cases we want to confirm the fd is a valid raid array,
> ioctl(GET_ARRAY_INFO) should do, or sysfs_read(GET_VERSION).
>
> Am I missing something obvious here, or do you see any reason for
> leaving this around?

Sorry the above is wrong, it will always return 900, not 90000. Some of
the code that stood out is in util.c:

int md_get_version(int fd)
{
        struct stat stb;
        mdu_version_t vers;

        if (fstat(fd, &stb)<0)
                return -1;
        if ((S_IFMT&stb.st_mode) != S_IFBLK)
                return -1;

        if (ioctl(fd, RAID_VERSION, &vers) == 0)
                return  (vers.major*10000) + (vers.minor*100) + vers.patchlevel;
        if (errno == EACCES)
                return -1;
        if (major(stb.st_rdev) == MD_MAJOR)
                return (3600);
        return -1;
}

....

int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info)
{
        /* Initialise kernel's knowledge of array.
         * This varies between externally managed arrays
         * and older kernels
         */
        int vers = md_get_version(mdfd);
        int rv;

#ifndef MDASSEMBLE
        if (st->ss->external)
                rv = sysfs_set_array(info, vers);
        else
#endif
                if ((vers % 100) >= 1) { /* can use different versions */
                mdu_array_info_t inf;
                memset(&inf, 0, sizeof(inf));
                inf.major_version = info->array.major_version;
                inf.minor_version = info->array.minor_version;
                rv = ioctl(mdfd, SET_ARRAY_INFO, &inf);
        } else
                rv = ioctl(mdfd, SET_ARRAY_INFO, NULL);
        return rv;
}

This has been around since at least 2008, the current code came in
f35f25259279573c6274e2783536c0b0a399bdd4, but it looks like even the
prior code made the same assumptions.

In either case, the above 'if ((vers % 100) >= 1)' will always trigger
since the kernel does #define MD_PATCHLEVEL_VERSION 3

It's not like we have been updating MD_PATCHLEVEL_VERSION for a
while. Was the code meant to be looking at the superblock minor version?
I've been staring at this for a while now, so please beat me over the
head if I missed something blatantly obvious.

Jes

^ permalink raw reply

* Re: [PATCH 2/7] Makefile, x86, LLVM: disable unsupported optimization flags
From: Michael Davidson @ 2017-04-05 19:11 UTC (permalink / raw)
  To: Matthias Kaehlcke
  Cc: Masahiro Yamada, Michal Marek, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, Herbert Xu, David S. Miller, Shaohua Li,
	Alexander Potapenko, Dmitry Vyukov, X86 ML,
	Linux Kbuild mailing list, Linux Kernel Mailing List,
	linux-crypto, linux-raid
In-Reply-To: <20170405190130.GD145051@google.com>

It "works" for the cases that I currently care about but I have to say
that I am uneasy about adding -Werror to the cc-option test in this
way.

Suppose that one of the *other* flags that is implicitly passed to the
compiler by cc-option - eg something that was explicitly specified in
$(KBUILD_CFLAGS) - triggers a warning. In that case all calls to
cc-option will silently fail because of the -Werror and valid options
will not be detected correctly.

If everyone is OK with that because "it shouldn't normally ever
happen" then that is fine, but if does result in a subtle change from
existing behavior (and a trap that I almost immediately fell into
after applying a similar patch).

On Wed, Apr 5, 2017 at 12:01 PM, Matthias Kaehlcke <mka@chromium.org> wrote:
> Hi Masahiro,
>
> El Thu, Apr 06, 2017 at 03:08:26AM +0900 Masahiro Yamada ha dit:
>
>> 2017-03-17 9:15 GMT+09:00 Michael Davidson <md@google.com>:
>> > Unfortunately, while clang generates a warning about these flags
>> > being unsupported it still exits with a status of 0 so we have
>> > to explicitly disable them instead of just using a cc-option check.
>> >
>> > Signed-off-by: Michael Davidson <md@google.com>
>>
>>
>> Instead, does the following work for you?
>> https://patchwork.kernel.org/patch/9657285/
>
> Thanks for the pointer, I was about to give this change (or rather its
> ancestor) a rework myself :)
>
>> You need to use
>> $(call cc-option, ...)
>> for -falign-jumps=1 and -falign-loops=1
>
> I can confirm that this works.
>
> Thanks
>
> Matthias

^ permalink raw reply

* Re: [md PATCH 08/10] md/raid5: make chunk_aligned_read() split bios more cleanly.
From: kbuild test robot @ 2017-04-05 22:15 UTC (permalink / raw)
  To: NeilBrown; +Cc: kbuild-all, Shaohua Li, linux-raid
In-Reply-To: <149136515124.25893.6650408840570246136.stgit@noble>

[-- Attachment #1: Type: text/plain, Size: 3188 bytes --]

Hi NeilBrown,

[auto build test WARNING on next-20170330]
[cannot apply to md/master md/for-next block/for-next linus/master v4.9-rc8 v4.9-rc7 v4.9-rc6 v4.11-rc5]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/NeilBrown/Simplify-bio-splitting-and-related-code/20170406-045741
config: x86_64-randconfig-x007-201714 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

Note: it may well be a FALSE warning. FWIW you are at least aware of it now.
http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings

All warnings (new ones prefixed by >>):

   drivers/md/raid5.c: In function 'raid5_make_request':
>> drivers/md/raid5.c:5554:6: warning: 'split' may be used uninitialized in this function [-Wmaybe-uninitialized]
      if (!bi)
         ^

vim +/split +5554 drivers/md/raid5.c

828cbe98 Shaohua Li           2015-09-02  5538  		/* ret == -EAGAIN, fallback */
3bddb7f8 Song Liu             2016-11-18  5539  		/*
3bddb7f8 Song Liu             2016-11-18  5540  		 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
3bddb7f8 Song Liu             2016-11-18  5541  		 * we need to flush journal device
3bddb7f8 Song Liu             2016-11-18  5542  		 */
3bddb7f8 Song Liu             2016-11-18  5543  		do_flush = bi->bi_opf & REQ_PREFLUSH;
828cbe98 Shaohua Li           2015-09-02  5544  	}
e5dcdd80 NeilBrown            2005-09-09  5545  
9ffc8f7c Eric Mei             2015-03-18  5546  	/*
9ffc8f7c Eric Mei             2015-03-18  5547  	 * If array is degraded, better not do chunk aligned read because
9ffc8f7c Eric Mei             2015-03-18  5548  	 * later we might have to read it again in order to reconstruct
9ffc8f7c Eric Mei             2015-03-18  5549  	 * data on failed drives.
9ffc8f7c Eric Mei             2015-03-18  5550  	 */
9ffc8f7c Eric Mei             2015-03-18  5551  	if (rw == READ && mddev->degraded == 0 &&
7ef6b12a Ming Lin             2015-05-06  5552  	    mddev->reshape_position == MaxSector) {
7ef6b12a Ming Lin             2015-05-06  5553  		bi = chunk_aligned_read(mddev, bi);
7ef6b12a Ming Lin             2015-05-06 @5554  		if (!bi)
5a7bbad2 Christoph Hellwig    2011-09-12  5555  			return;
7ef6b12a Ming Lin             2015-05-06  5556  	}
52488615 Raz Ben-Jehuda(caro  2006-12-10  5557) 
796a5cf0 Mike Christie        2016-06-05  5558  	if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) {
620125f2 Shaohua Li           2012-10-11  5559  		make_discard_request(mddev, bi);
620125f2 Shaohua Li           2012-10-11  5560  		return;
620125f2 Shaohua Li           2012-10-11  5561  	}
620125f2 Shaohua Li           2012-10-11  5562  

:::::: The code at line 5554 was first introduced by commit
:::::: 7ef6b12a1966f273afb750e19e1e8129bea48fec md/raid5: split bio for chunk_aligned_read

:::::: TO: Ming Lin <ming.l@ssi.samsung.com>
:::::: CC: Jens Axboe <axboe@fb.com>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 24343 bytes --]

^ permalink raw reply

* Re: Can we deprecate ioctl(RAID_VERSION)?
From: NeilBrown @ 2017-04-05 22:32 UTC (permalink / raw)
  To: jes.sorensen; +Cc: linux-raid
In-Reply-To: <wrfjr316d6f5.fsf@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 3647 bytes --]

On Thu, Apr 06 2017, jes.sorensen@gmail.com wrote:

> jes.sorensen@gmail.com writes:
>> Hi Neil,
>>
>> Looking through the code in mdadm, I noticed a number of cases calling
>> ioctl(RAID_VERSION). At first I had it confused with metadata version,
>> but it looks like RAID_VERSION will always return 90000 if it's a valid
>> raid device.
>>
>> In the cases we want to confirm the fd is a valid raid array,
>> ioctl(GET_ARRAY_INFO) should do, or sysfs_read(GET_VERSION).
>>
>> Am I missing something obvious here, or do you see any reason for
>> leaving this around?
>
> Sorry the above is wrong, it will always return 900, not 90000. Some of
> the code that stood out is in util.c:
>
> int md_get_version(int fd)
> {
>         struct stat stb;
>         mdu_version_t vers;
>
>         if (fstat(fd, &stb)<0)
>                 return -1;
>         if ((S_IFMT&stb.st_mode) != S_IFBLK)
>                 return -1;
>
>         if (ioctl(fd, RAID_VERSION, &vers) == 0)
>                 return  (vers.major*10000) + (vers.minor*100) + vers.patchlevel;
>         if (errno == EACCES)
>                 return -1;
>         if (major(stb.st_rdev) == MD_MAJOR)
>                 return (3600);
>         return -1;
> }
>
> ....
>
> int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info)
> {
>         /* Initialise kernel's knowledge of array.
>          * This varies between externally managed arrays
>          * and older kernels
>          */
>         int vers = md_get_version(mdfd);
>         int rv;
>
> #ifndef MDASSEMBLE
>         if (st->ss->external)
>                 rv = sysfs_set_array(info, vers);
>         else
> #endif
>                 if ((vers % 100) >= 1) { /* can use different versions */
>                 mdu_array_info_t inf;
>                 memset(&inf, 0, sizeof(inf));
>                 inf.major_version = info->array.major_version;
>                 inf.minor_version = info->array.minor_version;
>                 rv = ioctl(mdfd, SET_ARRAY_INFO, &inf);
>         } else
>                 rv = ioctl(mdfd, SET_ARRAY_INFO, NULL);
>         return rv;
> }
>
> This has been around since at least 2008, the current code came in
> f35f25259279573c6274e2783536c0b0a399bdd4, but it looks like even the
> prior code made the same assumptions.
>
> In either case, the above 'if ((vers % 100) >= 1)' will always trigger
> since the kernel does #define MD_PATCHLEVEL_VERSION 3
>
> It's not like we have been updating MD_PATCHLEVEL_VERSION for a
> while. Was the code meant to be looking at the superblock minor version?
> I've been staring at this for a while now, so please beat me over the
> head if I missed something blatantly obvious.
>
> Jes

It is hard to get versioning right...

The version returned by the RAID_VERSION ioctl is meant to reflect the
capabilities of the implementation.  We could use the kernel version
number for that (and sometimes do), but as distro's often backport
features, that isn't always reliable.

I've incremented the MD_PATCHLEVEL_VERSION when a change is made that
cannot easily be detected from user-space.  As you note, we are up to
three.  The last change was in 2.6.15.
I've never contemplated changing the other two numbers that RAID_VERSION
return.  They don't seem to mean anything useful.

What exactly do you mean by "deprecate" the ioctl?
If you remove the code in mdadm that calls it, mdadm will not work
correctly on kernels older than 2.6.15, and it will be harder to
and an future capability that is not easily visible from user space.
If you remove the code in the kernel that handles it, you'll break
mdadm.

What is the goal here?

Thanks,
NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

* Re: [md PATCH 08/10] md/raid5: make chunk_aligned_read() split bios more cleanly.
From: NeilBrown @ 2017-04-06  0:13 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid
In-Reply-To: <149136515124.25893.6650408840570246136.stgit@noble>

[-- Attachment #1: Type: text/plain, Size: 459 bytes --]


Missed a split->raid_bio conversion here

Signed-off-by: NeilBrown <neilb@suse.com>

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c523fd69a7bc..683fbf7f22a8 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5247,7 +5247,7 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
 	}
 
 	if (!raid5_read_one_chunk(mddev, raid_bio))
-		return split;
+		return raid_bio;
 
 	return NULL;
 }

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply related

* [PATCH V2] md/raid10: reset the 'first' at the end of loop
From: Guoqing Jiang @ 2017-04-06  1:12 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb
In-Reply-To: <1491017900-8150-1-git-send-email-gqjiang@suse.com>

We need to set "first = 0' at the end of rdev_for_each
loop, so we can get the array's min_offset_diff correctly
otherwise min_offset_diff just means the last rdev's
offset diff.

Suggested-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/raid10.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 0f13d57..e055ec9 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3769,6 +3769,7 @@ static int raid10_run(struct mddev *mddev)
 
 		if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
 			discard_supported = true;
+		first = 0;
 	}
 
 	if (mddev->queue) {
@@ -4172,6 +4173,7 @@ static int raid10_start_reshape(struct mddev *mddev)
 			if (first || diff < min_offset_diff)
 				min_offset_diff = diff;
 		}
+		first = 0;
 	}
 
 	if (max(before_length, after_length) > min_offset_diff)
-- 
2.6.6


^ permalink raw reply related

* Re: [RFC PATCH] raid1: reset 'bi_next' before reuse the bio
From: NeilBrown @ 2017-04-06  2:03 UTC (permalink / raw)
  To: Michael Wang, linux-raid, linux-kernel@vger.kernel.org
  Cc: Shaohua Li, Jinpu Wang
In-Reply-To: <465f2653-3afc-3329-dbf4-af13010113b7@profitbricks.com>

[-- Attachment #1: Type: text/plain, Size: 2743 bytes --]

On Wed, Apr 05 2017, Michael Wang wrote:

> On 04/05/2017 12:17 AM, NeilBrown wrote:
> [snip]
>>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>>> index 7d67235..0554110 100644
>>> --- a/drivers/md/raid1.c
>>> +++ b/drivers/md/raid1.c
>>> @@ -1986,11 +1986,13 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
>>>  		/* Don't try recovering from here - just fail it
>>>  		 * ... unless it is the last working device of course */
>>>  		md_error(mddev, rdev);
>>> -		if (test_bit(Faulty, &rdev->flags))
>>> +		if (test_bit(Faulty, &rdev->flags)) {
>>>  			/* Don't try to read from here, but make sure
>>>  			 * put_buf does it's thing
>>>  			 */
>>>  			bio->bi_end_io = end_sync_write;
>>> +			bio->bi_next = NULL;
>>> +		}
>>>  	}
>>>  
>>>  	while(sectors) {
>> 
>> 
>> Ah - I see what is happening now.  I was looking at the vanilla 4.4
>> code, which doesn't have the failfast changes.
>
> My bad to forgot mention... yes our md stuff is very much close to the
> upstream.
>
>> 
>> I don't think your patch is correct though.  We really shouldn't be
>> re-using that bio, and setting bi_next to NULL just hides the bug.  It
>> doesn't fix it.
>> As the rdev is now Faulty, it doesn't make sense for
>> sync_request_write() to submit a write request to it.
>
> Make sense, while still have concerns regarding the design:
>   * in this case since the read_disk already abandoned, is it fine to
>     keep r1_bio->read_disk recording the faulty device index?

I guess we could set it to -1.  I'm not sure that would help at all.


>   * we assign the 'end_sync_write' to the original read bio in this
>     case, but when is this supposed to be called?

It isn't called.  But the value of ->bi_end_io is tests a couple of
times.  Particularly in put_buf(), but also a little further down in
fix_sync_read_errors(). 

>
>> 
>> Can you confirm that this works please.
>
> Yes, it works.
>
> Tested-by: Michael Wang <yun.wang@profitbricks.com>

Thanks.  I'll add that and submit the patch.

Thanks,
NeilBrown

>
> Regards,
> Michael Wang
>
>> 
>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>> index d2d8b8a5bd56..219f1e1f1d1d 100644
>> --- a/drivers/md/raid1.c
>> +++ b/drivers/md/raid1.c
>> @@ -2180,6 +2180,8 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
>>  		     (i == r1_bio->read_disk ||
>>  		      !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
>>  			continue;
>> +		if (test_bit(Faulty, &conf->mirrors[i].rdev->flags))
>> +			continue;
>>  
>>  		bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
>>  		if (test_bit(FailFast, &conf->mirrors[i].rdev->flags))
>> 
>> 
>> Thanks,
>> NeilBrown
>> 

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

* [PATCH] md/raid1: avoid reusing a resync bio after error handling.
From: NeilBrown @ 2017-04-06  2:06 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Linux-RAID, Michael Wang

[-- Attachment #1: Type: text/plain, Size: 1634 bytes --]

fix_sync_read_error() modifies a bio on a newly faulty
device by setting bi_end_io to end_sync_write.
This ensure that put_buf() will still call rdev_dec_pending()
as required, but makes sure that subsequent code in
fix_sync_read_error() doesn't try to read from the device.

Unfortunately this interacts badly with sync_request_write()
which assumes that any bio with bi_end_io set to non-NULL
other than end_sync_read is safe to write to.

As the device is now faulty it doesn't make sense to write.
As the bio was recently used for a read, it is "dirty"
and not suitable for immediate submission.
In particular, ->bi_next might be non-NULL, which will cause
generic_make_request() to complain.

Break this interaction by refusing to write to devices
which are marked as Faulty.

Reported-and-tested-by: Michael Wang <yun.wang@profitbricks.com>
Fixes: 2e52d449bcec ("md/raid1: add failfast handling for reads.")
Cc: stable@vger.kernel.org (v4.10+)
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid1.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a70283753a35..9c1b2231d2db 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2185,6 +2185,8 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
 		     (i == r1_bio->read_disk ||
 		      !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
 			continue;
+		if (test_bit(Faulty, &conf->mirrors[i].rdev->flags))
+			continue;

 		bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
 		if (test_bit(FailFast, &conf->mirrors[i].rdev->flags))
-- 
2.12.2

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply related

* Re: [PATCH V2] md/raid10: reset the 'first' at the end of loop
From: NeilBrown @ 2017-04-06  2:27 UTC (permalink / raw)
  To: Guoqing Jiang, linux-raid; +Cc: shli
In-Reply-To: <1491441138-16155-1-git-send-email-gqjiang@suse.com>

[-- Attachment #1: Type: text/plain, Size: 1112 bytes --]

On Thu, Apr 06 2017, Guoqing Jiang wrote:

> We need to set "first = 0' at the end of rdev_for_each
> loop, so we can get the array's min_offset_diff correctly
> otherwise min_offset_diff just means the last rdev's
> offset diff.
>
> Suggested-by: NeilBrown <neilb@suse.com>
> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>

Reviewed-by: NeilBrown <neilb@suse.com>

Thanks,
NeilBrown

> ---
>  drivers/md/raid10.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index 0f13d57..e055ec9 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -3769,6 +3769,7 @@ static int raid10_run(struct mddev *mddev)
>  
>  		if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
>  			discard_supported = true;
> +		first = 0;
>  	}
>  
>  	if (mddev->queue) {
> @@ -4172,6 +4173,7 @@ static int raid10_start_reshape(struct mddev *mddev)
>  			if (first || diff < min_offset_diff)
>  				min_offset_diff = diff;
>  		}
> +		first = 0;
>  	}
>  
>  	if (max(before_length, after_length) > min_offset_diff)
> -- 
> 2.6.6

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

* [PATCH] md:MD_CLOSING needs to be cleared after called md_set_readonly or do_md_stop
From: Zhilong Liu @ 2017-04-06  3:16 UTC (permalink / raw)
  To: shli; +Cc: linux-raid, Zhilong Liu, NeilBrown

From: NeilBrown <neilb@suse.com>

if called md_set_readonly and set MD_CLOSING bit, the mddev cannot
be opened any more due to the MD_CLOING bit wasn't cleared. Thus it
needs to be cleared in md_ioctl after any call to md_set_readonly()
or do_md_stop().
Fixes: af8d8e6f0315 ("md: changes for MD_STILL_CLOSED flag")

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Zhilong Liu <zlliu@suse.com>
---

 drivers/md/md.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index f6ae1d6..906a4bf 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6776,6 +6776,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 	void __user *argp = (void __user *)arg;
 	struct mddev *mddev = NULL;
 	int ro;
+	bool did_set_md_closing = false;
 
 	if (!md_ioctl_valid(cmd))
 		return -ENOTTY;
@@ -6865,7 +6866,9 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 			err = -EBUSY;
 			goto out;
 		}
+		WARN_ON_ONCE(test_bit(MD_CLOSING, &mddev->flags));
 		set_bit(MD_CLOSING, &mddev->flags);
+		did_set_md_closing = true;
 		mutex_unlock(&mddev->open_mutex);
 		sync_blockdev(bdev);
 	}
@@ -7058,6 +7061,8 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 		mddev->hold_active = 0;
 	mddev_unlock(mddev);
 out:
+	if(did_set_md_closing)
+		clear_bit(MD_CLOSING, &mddev->flags);
 	return err;
 }
 #ifdef CONFIG_COMPAT
-- 
2.6.6


^ permalink raw reply related

* [v2] raid6/altivec: Add vpermxor implementation for raid6 Q syndrome
From: Matt Brown @ 2017-04-06  5:38 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: dja, linux-raid

The raid6 Q syndrome check has been optimised using the vpermxor
instruction.  This instruction was made available with POWER8, ISA version
2.07. It allows for both vperm and vxor instructions to be done in a single
instruction. This has been tested for correctness on a ppc64le vm with a
basic RAID6 setup containing 5 drives.

The performance benchmarks are from the raid6test in the /lib/raid6/test
directory. These results are from an IBM Firestone machine with ppc64le
architecture. The benchmark results show a 35% speed increase over the best
existing algorithm for powerpc (altivec). The raid6test has also been run
on a big-endian ppc64 vm to ensure it also works for big-endian
architectures.

Performance benchmarks:

	raid6: altivecx4 gen() 18773 MB/s
	raid6: altivecx8 gen() 19438 MB/s

	raid6: vpermxor4 gen() 25112 MB/s
	raid6: vpermxor8 gen() 26279 MB/s

Bugs fixed:
	- A small bug in pq.h regarding a missing and mismatched
	  ifdef statement
	- Fixed test/Makefile to correctly build test on ppc

Signed-off-by: Matt Brown <matthew.brown.dev@gmail.com>
---
mpe I assume you are ok to take this patch, most of the other ppc raid patches have gone through you.

Changelog:
v2
	- added reference to raid6 paper
	- shortened asm lines
	- removed redundant Makefile line
	- fixed test/Makefile bug
---
 include/linux/raid/pq.h |   4 ++
 lib/raid6/Makefile      |  27 ++++++++++++-
 lib/raid6/algos.c       |   4 ++
 lib/raid6/altivec.uc    |   3 ++
 lib/raid6/test/Makefile |  26 +++++++++---
 lib/raid6/vpermxor.uc   | 104 ++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 161 insertions(+), 7 deletions(-)
 create mode 100644 lib/raid6/vpermxor.uc

diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 4d57bba..3df9aa6 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -107,6 +107,10 @@ extern const struct raid6_calls raid6_avx512x2;
 extern const struct raid6_calls raid6_avx512x4;
 extern const struct raid6_calls raid6_tilegx8;
 extern const struct raid6_calls raid6_s390vx8;
+extern const struct raid6_calls raid6_vpermxor1;
+extern const struct raid6_calls raid6_vpermxor2;
+extern const struct raid6_calls raid6_vpermxor4;
+extern const struct raid6_calls raid6_vpermxor8;
 
 struct raid6_recov_calls {
 	void (*data2)(int, size_t, int, int, void **);
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 3057011..7775aad 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -4,7 +4,8 @@ raid6_pq-y	+= algos.o recov.o tables.o int1.o int2.o int4.o \
 		   int8.o int16.o int32.o
 
 raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o recov_avx512.o
-raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
+raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o \
+				vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o
 raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
 raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
 raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o
@@ -88,6 +89,30 @@ $(obj)/altivec8.c:   UNROLL := 8
 $(obj)/altivec8.c:   $(src)/altivec.uc $(src)/unroll.awk FORCE
 	$(call if_changed,unroll)
 
+CFLAGS_vpermxor1.o += $(altivec_flags)
+targets += vpermxor1.c
+$(obj)/vpermxor1.c: UNROLL := 1
+$(obj)/vpermxor1.c: $(src)/vpermxor.uc $(src)/unroll.awk FORCE
+	$(call if_changed,unroll)
+
+CFLAGS_vpermxor2.o += $(altivec_flags)
+targets += vpermxor2.c
+$(obj)/vpermxor2.c: UNROLL := 2
+$(obj)/vpermxor2.c: $(src)/vpermxor.uc $(src)/unroll.awk FORCE
+	$(call if_changed,unroll)
+
+CFLAGS_vpermxor4.o += $(altivec_flags)
+targets += vpermxor4.c
+$(obj)/vpermxor4.c: UNROLL := 4
+$(obj)/vpermxor4.c: $(src)/vpermxor.uc $(src)/unroll.awk FORCE
+	$(call if_changed,unroll)
+
+CFLAGS_vpermxor8.o += $(altivec_flags)
+targets += vpermxor8.c
+$(obj)/vpermxor8.c: UNROLL := 8
+$(obj)/vpermxor8.c: $(src)/vpermxor.uc $(src)/unroll.awk FORCE
+	$(call if_changed,unroll)
+
 CFLAGS_neon1.o += $(NEON_FLAGS)
 targets += neon1.c
 $(obj)/neon1.c:   UNROLL := 1
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 7857049..edd4f69 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -74,6 +74,10 @@ const struct raid6_calls * const raid6_algos[] = {
 	&raid6_altivec2,
 	&raid6_altivec4,
 	&raid6_altivec8,
+	&raid6_vpermxor1,
+	&raid6_vpermxor2,
+	&raid6_vpermxor4,
+	&raid6_vpermxor8,
 #endif
 #if defined(CONFIG_TILEGX)
 	&raid6_tilegx8,
diff --git a/lib/raid6/altivec.uc b/lib/raid6/altivec.uc
index 682aae8..d20ed0d 100644
--- a/lib/raid6/altivec.uc
+++ b/lib/raid6/altivec.uc
@@ -24,10 +24,13 @@
 
 #include <linux/raid/pq.h>
 
+#ifdef CONFIG_ALTIVEC
+
 #include <altivec.h>
 #ifdef __KERNEL__
 # include <asm/cputable.h>
 # include <asm/switch_to.h>
+#endif /* __KERNEL__ */
 
 /*
  * This is the C data type to use.  We use a vector of
diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile
index 2c7b60e..b9503a3 100644
--- a/lib/raid6/test/Makefile
+++ b/lib/raid6/test/Makefile
@@ -43,11 +43,13 @@ else ifeq ($(HAS_NEON),yes)
         OBJS   += neon.o neon1.o neon2.o neon4.o neon8.o
         CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1
 else
-        HAS_ALTIVEC := $(shell printf '\#include <altivec.h>\nvector int a;\n' |\
-                         gcc -c -x c - >&/dev/null && \
-                         rm ./-.o && echo yes)
-        ifeq ($(HAS_ALTIVEC),yes)
-                OBJS += altivec1.o altivec2.o altivec4.o altivec8.o
+	 HAS_ALTIVEC := $(shell printf '\#include <altivec.h>\nvector int a;\n' |\
+			 gcc -c -x c - >/dev/null && rm ./-.o && echo yes)
+	 ifeq ($(HAS_ALTIVEC),yes)
+		CFLAGS += -I../../../arch/powerpc/include
+		CFLAGS += -DCONFIG_ALTIVEC
+		OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \
+			vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o
         endif
 endif
 ifeq ($(ARCH),tilegx)
@@ -97,6 +99,18 @@ altivec4.c: altivec.uc ../unroll.awk
 altivec8.c: altivec.uc ../unroll.awk
 	$(AWK) ../unroll.awk -vN=8 < altivec.uc > $@
 
+vpermxor1.c: vpermxor.uc ../unroll.awk
+	$(AWK) ../unroll.awk -vN=1 < vpermxor.uc > $@
+
+vpermxor2.c: vpermxor.uc ../unroll.awk
+	$(AWK) ../unroll.awk -vN=2 < vpermxor.uc > $@
+
+vpermxor4.c: vpermxor.uc ../unroll.awk
+	$(AWK) ../unroll.awk -vN=4 < vpermxor.uc > $@
+
+vpermxor8.c: vpermxor.uc ../unroll.awk
+	$(AWK) ../unroll.awk -vN=8 < vpermxor.uc > $@
+
 int1.c: int.uc ../unroll.awk
 	$(AWK) ../unroll.awk -vN=1 < int.uc > $@
 
@@ -122,7 +136,7 @@ tables.c: mktables
 	./mktables > tables.c
 
 clean:
-	rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c neon*.c tables.c raid6test
+	rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c vpermxor*.c neon*.c tables.c raid6test
 	rm -f tilegx*.c
 
 spotless: clean
diff --git a/lib/raid6/vpermxor.uc b/lib/raid6/vpermxor.uc
new file mode 100644
index 0000000..cb7bed3
--- /dev/null
+++ b/lib/raid6/vpermxor.uc
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2017, Matt Brown, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * vpermxor$#.c
+ *
+ * Based on H. Peter Anvin's paper - The mathematics of RAID-6
+ *
+ * $#-way unrolled portable integer math RAID-6 instruction set
+ * This file is postprocessed using unroll.awk
+ *
+ * vpermxor$#.c makes use of the vpermxor opcode to optimise the RAID6 Q
+ * syndrome calculations.
+ * This can be run on systems which have both Altivec and the vpermxor opcode.
+ *
+ * This instruction was introduced in POWER8 - ISA v2.07.
+ */
+
+#include <linux/raid/pq.h>
+#ifdef CONFIG_ALTIVEC
+
+#include <altivec.h>
+#ifdef __KERNEL__
+#include <asm/cputable.h>
+#include <asm/switch_to.h>
+#endif
+
+typedef vector unsigned char unative_t;
+#define NSIZE sizeof(unative_t)
+
+static const vector unsigned char gf_low = {0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14,
+					    0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08,
+					    0x06, 0x04, 0x02,0x00};
+static const vector unsigned char gf_high = {0xfd, 0xdd, 0xbd, 0x9d, 0x7d, 0x5d,
+					     0x3d, 0x1d, 0xe0, 0xc0, 0xa0, 0x80,
+					     0x60, 0x40, 0x20, 0x00};
+
+static void noinline raid6_vpermxor$#_gen_syndrome_real(int disks, size_t bytes,
+							void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+	unative_t wp$$, wq$$, wd$$;
+
+	z0 = disks - 3;		/* Highest data disk */
+	p = dptr[z0+1];		/* XOR parity */
+	q = dptr[z0+2];		/* RS syndrome */
+
+	for (d = 0; d < bytes; d += NSIZE*$#) {
+		wp$$ = wq$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE];
+
+		for (z = z0-1; z>=0; z--) {
+			wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+			/* P syndrome */
+			wp$$ = vec_xor(wp$$, wd$$);
+
+			/*Q syndrome */
+			asm("vpermxor %0,%1,%2,%3":"=v"(wq$$):"v"(gf_high), "v"(gf_low), "v"(wq$$));
+			wq$$ = vec_xor(wq$$, wd$$);
+		}
+		*(unative_t *)&p[d+NSIZE*$$] = wp$$;
+		*(unative_t *)&q[d+NSIZE*$$] = wq$$;
+	}
+}
+
+static void raid6_vpermxor$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+	preempt_disable();
+	enable_kernel_altivec();
+
+	raid6_vpermxor$#_gen_syndrome_real(disks, bytes, ptrs);
+
+	disable_kernel_altivec();
+	preempt_enable();
+}
+
+int raid6_have_altivec_vpermxor(void);
+#if $# == 1
+int raid6_have_altivec_vpermxor(void)
+{
+	/* Check if CPU has both altivec and the vpermxor instruction*/
+# ifdef __KERNEL__
+	return (cpu_has_feature(CONFIG_ALTIVEC) &&
+		cpu_has_feature(CPU_FTR_ARCH_207S));
+# else
+	return 1;
+#endif
+
+}
+#endif
+
+const struct raid6_calls raid6_vpermxor$# = {
+	raid6_vpermxor$#_gen_syndrome,
+	NULL,
+	raid6_have_altivec_vpermxor,
+	"vpermxor$#",
+	0
+};
+#endif
-- 
2.9.3


^ permalink raw reply related

* Re: mdadm - dmraid
From: NeilBrown @ 2017-04-06  6:00 UTC (permalink / raw)
  To: gmx, linux-raid
In-Reply-To: <e8eb3b62-3b9e-e99f-f8c5-d92e176b9c0f@gmx.de>

[-- Attachment #1: Type: text/plain, Size: 658 bytes --]

On Wed, Apr 05 2017, gmx wrote:

> there exist a problem between packages dmraid and mdadm

What problem?


>
> incompatibility between both is known. but both packages can be 
> installed simultan without message

What incompatibility?  I don't know of one.
Please be more specific.

NeilBrown


>
> (result: problems - perhaps there are no problems in both packages but 
> together - even no kernel-problem)
>
> greetings hj
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

* Re: [v2] raid6/altivec: Add vpermxor implementation for raid6 Q syndrome
From: Daniel Axtens @ 2017-04-06  6:13 UTC (permalink / raw)
  To: Matt Brown, linuxppc-dev; +Cc: linux-raid
In-Reply-To: <20170406053816.5239-1-matthew.brown.dev@gmail.com>

Hi Matt,

Thanks for answering my questions and doing those fixes.


> Bugs fixed:
> 	- A small bug in pq.h regarding a missing and mismatched
> 	  ifdef statement
> 	- Fixed test/Makefile to correctly build test on ppc
>

I think this commit should be labelled:
Fixes: 4f8c55c5ad49 ("lib/raid6: build proper files on corresponding arch")

mpe can probably add that when he merges - no need to do a new version :)

>  else
> -        HAS_ALTIVEC := $(shell printf '\#include <altivec.h>\nvector int a;\n' |\
> -                         gcc -c -x c - >&/dev/null && \
> -                         rm ./-.o && echo yes)
> -        ifeq ($(HAS_ALTIVEC),yes)
> -                OBJS += altivec1.o altivec2.o altivec4.o altivec8.o
> +	 HAS_ALTIVEC := $(shell printf '\#include <altivec.h>\nvector int a;\n' |\
> +			 gcc -c -x c - >/dev/null && rm ./-.o && echo yes)
> +	 ifeq ($(HAS_ALTIVEC),yes)
> +		CFLAGS += -I../../../arch/powerpc/include
> +		CFLAGS += -DCONFIG_ALTIVEC
> +		OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \
> +			vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o
>          endif
>  endif
Looks like vim has replaced spaces with tabs here. Not sure how much we
care...

>  ifeq ($(ARCH),tilegx)
> @@ -97,6 +99,18 @@ altivec4.c: altivec.uc ../unroll.awk
>  altivec8.c: altivec.uc ../unroll.awk
>  	$(AWK) ../unroll.awk -vN=8 < altivec.uc > $@
>
... especially seeing as tabs are already used in the file here!

> +vpermxor1.c: vpermxor.uc ../unroll.awk
> +	$(AWK) ../unroll.awk -vN=1 < vpermxor.uc > $@
> +
> +vpermxor2.c: vpermxor.uc ../unroll.awk
> +	$(AWK) ../unroll.awk -vN=2 < vpermxor.uc > $@
> +
> +vpermxor4.c: vpermxor.uc ../unroll.awk
> +	$(AWK) ../unroll.awk -vN=4 < vpermxor.uc > $@
> +
> +vpermxor8.c: vpermxor.uc ../unroll.awk
> +	$(AWK) ../unroll.awk -vN=8 < vpermxor.uc > $@
> +
>  int1.c: int.uc ../unroll.awk
>  	$(AWK) ../unroll.awk -vN=1 < int.uc > $@
>  
> +			/*Q syndrome */
> +	/* Check if CPU has both altivec and the vpermxor instruction*/
Very trivial nit: for future patches please make sure you have spaces
between the beginning/end of a comment and the comment.

> +# ifdef __KERNEL__
> +	return (cpu_has_feature(CONFIG_ALTIVEC) &&
> +		cpu_has_feature(CPU_FTR_ARCH_207S));
I think CPU_FTR_ARCH_207S implies Altivec? Again, not a real problem,
I don't think it's really worth doing a respin for any of these, so:

Reviewed-by: Daniel Axtens <dja@axtens.net>

Thanks for making Power faster!

Regards,
Daniel


^ permalink raw reply

* Re: [PATCH 26/27] scsi: sd: Separate zeroout and discard command choices
From: Hannes Reinecke @ 2017-04-06  6:17 UTC (permalink / raw)
  To: Christoph Hellwig, axboe, martin.petersen, agk, snitzer, shli,
	philipp.reisner, lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170405172125.22600-27-hch@lst.de>

On 04/05/2017 07:21 PM, Christoph Hellwig wrote:
> From: "Martin K. Petersen" <martin.petersen@oracle.com>
> 
> Now that zeroout and discards are distinct operations we need to
> separate the policy of choosing the appropriate command. Create a
> zeroing_mode which can be one of:
> 
> write:			Zeroout assist not present, use regular WRITE
> writesame:		Allow WRITE SAME(10/16) with a zeroed payload
> writesame_16_unmap:	Allow WRITE SAME(16) with UNMAP
> writesame_10_unmap:	Allow WRITE SAME(10) with UNMAP
> 
> The last two are conditional on the device being thin provisioned with
> LBPRZ=1 and LBPWS=1 or LBPWS10=1 respectively.
> 
> Whether to set the UNMAP bit or not depends on the REQ_NOUNMAP flag. And
> if none of the _unmap variants are supported, regular WRITE SAME will be
> used if the device supports it.
> 
> The zeroout_mode is exported in sysfs and the detected mode for a given
> device can be overridden using the string constants above.
> 
> With this change in place we can now issue WRITE SAME(16) with UNMAP set
> for block zeroing applications that require hard guarantees and
> logical_block_size granularity. And at the same time use the UNMAP
> command with the device's preferred granulary and alignment for discard
> operations.
> 
> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  drivers/scsi/sd.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
>  drivers/scsi/sd.h |  8 ++++++++
>  2 files changed, 61 insertions(+), 3 deletions(-)
> 
Reviewed-by: Hannes Reinecke <hare@suse.com>

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply

* Re: [PATCH 27/27] scsi: sd: Remove LBPRZ dependency for discards
From: Hannes Reinecke @ 2017-04-06  6:18 UTC (permalink / raw)
  To: Christoph Hellwig, axboe, martin.petersen, agk, snitzer, shli,
	philipp.reisner, lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170405172125.22600-28-hch@lst.de>

On 04/05/2017 07:21 PM, Christoph Hellwig wrote:
> From: "Martin K. Petersen" <martin.petersen@oracle.com>
> 
> Separating discards and zeroout operations allows us to remove the LBPRZ
> block zeroing constraints from discards and honor the device preferences
> for UNMAP commands.
> 
> If supported by the device, we'll also choose UNMAP over one of the
> WRITE SAME variants for discards.
> 
> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  drivers/scsi/sd.c | 25 ++++++-------------------
>  1 file changed, 6 insertions(+), 19 deletions(-)
> 
Reviewed-by: Hannes Reinecke <hare@suse.com>

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply

* Re: mdadm:compiled warning in mdadm.c:1974:29 treats as errors
From: Liu Zhilong @ 2017-04-06  8:21 UTC (permalink / raw)
  To: jes.sorensen; +Cc: linux-raid
In-Reply-To: <wrfjd1cqeugb.fsf@gmail.com>

hi,

I found this compiling warning, and can reproduce on my Leap42.1, 
Leap42.2 and SLES12 SP2 with v4.8.5 version of gcc.

# gcc -v
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/lib64/gcc/x86_64-suse-linux/4.8/lto-wrapper
Target: x86_64-suse-linux
Configured with: ../configure --prefix=/usr --infodir=/usr/share/info 
--mandir=/usr/share/man --libdir=/usr/lib64 --libexecdir=/usr/lib64 
--enable-languages=c,c++,objc,fortran,obj-c++,java,ada 
--enable-checking=release --with-gxx-include-dir=/usr/include/c++/4.8 
--enable-ssp --disable-libssp --disable-plugin 
--with-bugurl=http://bugs.opensuse.org/ --with-pkgversion='SUSE Linux' 
--disable-libgcj --disable-libmudflap --with-slibdir=/lib64 
--with-system-zlib --enable-__cxa_atexit 
--enable-libstdcxx-allocator=new --disable-libstdcxx-pch 
--enable-version-specific-runtime-libs --enable-linker-build-id 
--enable-linux-futex --program-suffix=-4.8 --without-system-libunwind 
--with-arch-32=i586 --with-tune=generic --build=x86_64-suse-linux 
--host=x86_64-suse-linux
Thread model: posix
gcc version 4.8.5 (SUSE Linux)

# make everything
... ...
mdadm.c: In function ‘main’:
mdadm.c:1974:29: error: ‘mdfd’ may be used uninitialized in this 
function [-Werror=maybe-uninitialized]
    if (dv->devname[0] == '/' || mdfd < 0)
                              ^
mdadm.c:1914:7: note: ‘mdfd’ was declared here
    int mdfd;
        ^
cc1: all warnings being treated as errors
Makefile:206: recipe for target 'mdadm.Os' failed
make: *** [mdadm.Os] Error 1

Thanks,
-Zhilong

On 04/05/2017 11:37 PM, jes.sorensen@gmail.com wrote:
> Zhilong <zlliu@suse.com> writes:
>> Hi,jes;
>>    I have found that there is mdfd returns uninitialized warning in
>> mdadm.c. I cannot paste the logs due to I'm on holiday. Please try to
>> issue make everything, you should see it.
> I don't see anything here, if you have specifics, please post them.
>
> Jes


^ permalink raw reply

* Re: Can we deprecate ioctl(RAID_VERSION)?
From: Jes Sorensen @ 2017-04-06 15:31 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid, Hannes Reinecke, kernel-team
In-Reply-To: <87h922trit.fsf@notabene.neil.brown.name>

On 04/05/2017 06:32 PM, NeilBrown wrote:
> On Thu, Apr 06 2017, jes.sorensen@gmail.com wrote:
>
>> jes.sorensen@gmail.com writes:
>>> Hi Neil,
>>>
>>> Looking through the code in mdadm, I noticed a number of cases calling
>>> ioctl(RAID_VERSION). At first I had it confused with metadata version,
>>> but it looks like RAID_VERSION will always return 90000 if it's a valid
>>> raid device.
>>>
>>> In the cases we want to confirm the fd is a valid raid array,
>>> ioctl(GET_ARRAY_INFO) should do, or sysfs_read(GET_VERSION).
>>>
>>> Am I missing something obvious here, or do you see any reason for
>>> leaving this around?
>>
>> Sorry the above is wrong, it will always return 900, not 90000. Some of
>> the code that stood out is in util.c:
>>
>> int md_get_version(int fd)
>> {
>>         struct stat stb;
>>         mdu_version_t vers;
>>
>>         if (fstat(fd, &stb)<0)
>>                 return -1;
>>         if ((S_IFMT&stb.st_mode) != S_IFBLK)
>>                 return -1;
>>
>>         if (ioctl(fd, RAID_VERSION, &vers) == 0)
>>                 return  (vers.major*10000) + (vers.minor*100) + vers.patchlevel;
>>         if (errno == EACCES)
>>                 return -1;
>>         if (major(stb.st_rdev) == MD_MAJOR)
>>                 return (3600);
>>         return -1;
>> }
>>
>> ....
>>
>> int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info)
>> {
>>         /* Initialise kernel's knowledge of array.
>>          * This varies between externally managed arrays
>>          * and older kernels
>>          */
>>         int vers = md_get_version(mdfd);
>>         int rv;
>>
>> #ifndef MDASSEMBLE
>>         if (st->ss->external)
>>                 rv = sysfs_set_array(info, vers);
>>         else
>> #endif
>>                 if ((vers % 100) >= 1) { /* can use different versions */
>>                 mdu_array_info_t inf;
>>                 memset(&inf, 0, sizeof(inf));
>>                 inf.major_version = info->array.major_version;
>>                 inf.minor_version = info->array.minor_version;
>>                 rv = ioctl(mdfd, SET_ARRAY_INFO, &inf);
>>         } else
>>                 rv = ioctl(mdfd, SET_ARRAY_INFO, NULL);
>>         return rv;
>> }
>>
>> This has been around since at least 2008, the current code came in
>> f35f25259279573c6274e2783536c0b0a399bdd4, but it looks like even the
>> prior code made the same assumptions.
>>
>> In either case, the above 'if ((vers % 100) >= 1)' will always trigger
>> since the kernel does #define MD_PATCHLEVEL_VERSION 3
>>
>> It's not like we have been updating MD_PATCHLEVEL_VERSION for a
>> while. Was the code meant to be looking at the superblock minor version?
>> I've been staring at this for a while now, so please beat me over the
>> head if I missed something blatantly obvious.
>>
>> Jes
>
> It is hard to get versioning right...
>
> The version returned by the RAID_VERSION ioctl is meant to reflect the
> capabilities of the implementation.  We could use the kernel version
> number for that (and sometimes do), but as distro's often backport
> features, that isn't always reliable.
>
> I've incremented the MD_PATCHLEVEL_VERSION when a change is made that
> cannot easily be detected from user-space.  As you note, we are up to
> three.  The last change was in 2.6.15.
> I've never contemplated changing the other two numbers that RAID_VERSION
> return.  They don't seem to mean anything useful.
>
> What exactly do you mean by "deprecate" the ioctl?
> If you remove the code in mdadm that calls it, mdadm will not work
> correctly on kernels older than 2.6.15, and it will be harder to
> and an future capability that is not easily visible from user space.
> If you remove the code in the kernel that handles it, you'll break
> mdadm.

Neil,

I see, thanks for explaining.

The goal is to eventually get out of the ioctl() business and get to a 
state where we can do everything via sysfs/configfs. Right now we have a 
big mix between ioctl and sysfs where neither interface does everything. 
The recent issues with PPL (I think it was) showed that we had to add 
more ioctl support because the interfaces needed to do it for sysfs 
weren't quite there. My long term goal is to get that situation improved 
so we can avoid adding anymore ioctl interfaces and eventually allow for 
distros to build mdadm with ioctl support disabled. We had a discussion 
at LSF/MM in Boston about this (Hannes, Shaohua, Song, and myself).

Reading the code I found it confusing that it was so tied to the patch 
level, but didn't do anything with the version numbers. At least 
intuitively if I bumped the version number, I would reset the patchlevel 
which would break things.

I think it's fair to draw a line in the sand and say that mdadm-4.1+ 
will not support kernels older than 2.6.15. I am open to the kernel 
version we pick here, but I would like to start deprecating some of the 
really old code. I have patches that does this in my tree, but I need to 
add a check for kernel version > 2.6.15. I am not aware what SuSE's 
enterprise kernel versions look like, but checking RHEL/CentOS RHEL5 was 
2.6.18, while RHEL4 was 2.6.9 - and RHEL4 has been unsupported for quite 
a while. At least for RHEL/CentOS 2.6.15 as the line in the sand seems fine.

For the kernel to expose features to userland in the future, I would 
prefer to go with a feature-flag style interface exposed via sysfs. That 
way a distro could enable one feature, but not the other in their kernel 
without having to worry about actual version numbers.

Cheers,
Jes

^ permalink raw reply

* Re: Can we deprecate ioctl(RAID_VERSION)?
From: Coly Li @ 2017-04-06 18:10 UTC (permalink / raw)
  To: Jes Sorensen, NeilBrown; +Cc: linux-raid, Hannes Reinecke, kernel-team
In-Reply-To: <070d7f50-c8f0-d5df-89ed-adb8b7582d8a@gmail.com>

On 2017/4/6 下午11:31, Jes Sorensen wrote:
> On 04/05/2017 06:32 PM, NeilBrown wrote:
>> On Thu, Apr 06 2017, jes.sorensen@gmail.com wrote:
>>
>>> jes.sorensen@gmail.com writes:
>>>> Hi Neil,
>>>>
>>>> Looking through the code in mdadm, I noticed a number of cases calling
>>>> ioctl(RAID_VERSION). At first I had it confused with metadata version,
>>>> but it looks like RAID_VERSION will always return 90000 if it's a valid
>>>> raid device.
>>>>
>>>> In the cases we want to confirm the fd is a valid raid array,
>>>> ioctl(GET_ARRAY_INFO) should do, or sysfs_read(GET_VERSION).
>>>>
>>>> Am I missing something obvious here, or do you see any reason for
>>>> leaving this around?
>>>
>>> Sorry the above is wrong, it will always return 900, not 90000. Some of
>>> the code that stood out is in util.c:
>>>
>>> int md_get_version(int fd)
>>> {
>>>         struct stat stb;
>>>         mdu_version_t vers;
>>>
>>>         if (fstat(fd, &stb)<0)
>>>                 return -1;
>>>         if ((S_IFMT&stb.st_mode) != S_IFBLK)
>>>                 return -1;
>>>
>>>         if (ioctl(fd, RAID_VERSION, &vers) == 0)
>>>                 return  (vers.major*10000) + (vers.minor*100) +
>>> vers.patchlevel;
>>>         if (errno == EACCES)
>>>                 return -1;
>>>         if (major(stb.st_rdev) == MD_MAJOR)
>>>                 return (3600);
>>>         return -1;
>>> }
>>>
>>> ....
>>>
>>> int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info)
>>> {
>>>         /* Initialise kernel's knowledge of array.
>>>          * This varies between externally managed arrays
>>>          * and older kernels
>>>          */
>>>         int vers = md_get_version(mdfd);
>>>         int rv;
>>>
>>> #ifndef MDASSEMBLE
>>>         if (st->ss->external)
>>>                 rv = sysfs_set_array(info, vers);
>>>         else
>>> #endif
>>>                 if ((vers % 100) >= 1) { /* can use different
>>> versions */
>>>                 mdu_array_info_t inf;
>>>                 memset(&inf, 0, sizeof(inf));
>>>                 inf.major_version = info->array.major_version;
>>>                 inf.minor_version = info->array.minor_version;
>>>                 rv = ioctl(mdfd, SET_ARRAY_INFO, &inf);
>>>         } else
>>>                 rv = ioctl(mdfd, SET_ARRAY_INFO, NULL);
>>>         return rv;
>>> }
>>>
>>> This has been around since at least 2008, the current code came in
>>> f35f25259279573c6274e2783536c0b0a399bdd4, but it looks like even the
>>> prior code made the same assumptions.
>>>
>>> In either case, the above 'if ((vers % 100) >= 1)' will always trigger
>>> since the kernel does #define MD_PATCHLEVEL_VERSION 3
>>>
>>> It's not like we have been updating MD_PATCHLEVEL_VERSION for a
>>> while. Was the code meant to be looking at the superblock minor version?
>>> I've been staring at this for a while now, so please beat me over the
>>> head if I missed something blatantly obvious.
>>>
>>> Jes
>>
>> It is hard to get versioning right...
>>
>> The version returned by the RAID_VERSION ioctl is meant to reflect the
>> capabilities of the implementation.  We could use the kernel version
>> number for that (and sometimes do), but as distro's often backport
>> features, that isn't always reliable.
>>
>> I've incremented the MD_PATCHLEVEL_VERSION when a change is made that
>> cannot easily be detected from user-space.  As you note, we are up to
>> three.  The last change was in 2.6.15.
>> I've never contemplated changing the other two numbers that RAID_VERSION
>> return.  They don't seem to mean anything useful.
>>
>> What exactly do you mean by "deprecate" the ioctl?
>> If you remove the code in mdadm that calls it, mdadm will not work
>> correctly on kernels older than 2.6.15, and it will be harder to
>> and an future capability that is not easily visible from user space.
>> If you remove the code in the kernel that handles it, you'll break
>> mdadm.
> 
> Neil,
> 
> I see, thanks for explaining.
> 
> The goal is to eventually get out of the ioctl() business and get to a
> state where we can do everything via sysfs/configfs. Right now we have a
> big mix between ioctl and sysfs where neither interface does everything.
> The recent issues with PPL (I think it was) showed that we had to add
> more ioctl support because the interfaces needed to do it for sysfs
> weren't quite there. My long term goal is to get that situation improved
> so we can avoid adding anymore ioctl interfaces and eventually allow for
> distros to build mdadm with ioctl support disabled. We had a discussion
> at LSF/MM in Boston about this (Hannes, Shaohua, Song, and myself).
> 
> Reading the code I found it confusing that it was so tied to the patch
> level, but didn't do anything with the version numbers. At least
> intuitively if I bumped the version number, I would reset the patchlevel
> which would break things.
> 
> I think it's fair to draw a line in the sand and say that mdadm-4.1+
> will not support kernels older than 2.6.15. I am open to the kernel
> version we pick here, but I would like to start deprecating some of the
> really old code. I have patches that does this in my tree, but I need to
> add a check for kernel version > 2.6.15. I am not aware what SuSE's
> enterprise kernel versions look like, but checking RHEL/CentOS RHEL5 was
> 2.6.18, while RHEL4 was 2.6.9 - and RHEL4 has been unsupported for quite
> a while. At least for RHEL/CentOS 2.6.15 as the line in the sand seems
> fine.
> 
> For the kernel to expose features to userland in the future, I would
> prefer to go with a feature-flag style interface exposed via sysfs. That
> way a distro could enable one feature, but not the other in their kernel
> without having to worry about actual version numbers.

BTW, I'd like to take on the kernel side stuffs.
IMHO, it would be better that we also set a timeline, after the
timeline, we should add new interface via configfs, and keep all legancy
ioctl implementation before this timeline.

Coly


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox