LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 17/17] ibmvfc: provide modules parameters for MQ settings
From: Tyrel Datwyler @ 2020-12-02  0:53 UTC (permalink / raw)
  To: james.bottomley
  Cc: Tyrel Datwyler, martin.petersen, linux-scsi, linux-kernel, brking,
	linuxppc-dev
In-Reply-To: <20201202005329.4538-1-tyreld@linux.ibm.com>

Add the various module parameter toggles for adjusting the MQ
characteristics at boot/load time as well as a device attribute for
changing the client scsi channel request amount.

Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
---
 drivers/scsi/ibmvscsi/ibmvfc.c | 75 +++++++++++++++++++++++++++++-----
 1 file changed, 65 insertions(+), 10 deletions(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index 97e8eed04b01..bc7c2dcd902c 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -40,6 +40,12 @@ static unsigned int disc_threads = IBMVFC_MAX_DISC_THREADS;
 static unsigned int ibmvfc_debug = IBMVFC_DEBUG;
 static unsigned int log_level = IBMVFC_DEFAULT_LOG_LEVEL;
 static unsigned int cls3_error = IBMVFC_CLS3_ERROR;
+static unsigned int mq_enabled = IBMVFC_MQ;
+static unsigned int nr_scsi_hw_queues = IBMVFC_SCSI_HW_QUEUES;
+static unsigned int nr_scsi_channels = IBMVFC_SCSI_CHANNELS;
+static unsigned int mig_channels_only = IBMVFC_MIG_NO_SUB_TO_CRQ;
+static unsigned int mig_no_less_channels = IBMVFC_MIG_NO_N_TO_M;
+
 static LIST_HEAD(ibmvfc_head);
 static DEFINE_SPINLOCK(ibmvfc_driver_lock);
 static struct scsi_transport_template *ibmvfc_transport_template;
@@ -49,6 +55,22 @@ MODULE_AUTHOR("Brian King <brking@linux.vnet.ibm.com>");
 MODULE_LICENSE("GPL");
 MODULE_VERSION(IBMVFC_DRIVER_VERSION);
 
+module_param_named(mq, mq_enabled, uint, S_IRUGO);
+MODULE_PARM_DESC(mq, "Enable multiqueue support. "
+		 "[Default=" __stringify(IBMVFC_MQ) "]");
+module_param_named(scsi_host_queues, nr_scsi_hw_queues, uint, S_IRUGO);
+MODULE_PARM_DESC(scsi_host_queues, "Number of SCSI Host submission queues. "
+		 "[Default=" __stringify(IBMVFC_SCSI_HW_QUEUES) "]");
+module_param_named(scsi_hw_channels, nr_scsi_channels, uint, S_IRUGO);
+MODULE_PARM_DESC(scsi_hw_channels, "Number of hw scsi channels to request. "
+		 "[Default=" __stringify(IBMVFC_SCSI_CHANNELS) "]");
+module_param_named(mig_channels_only, mig_channels_only, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(mig_channels_only, "Prevent migration to non-channelized system. "
+		 "[Default=" __stringify(IBMVFC_MIG_NO_SUB_TO_CRQ) "]");
+module_param_named(mig_no_less_channels, mig_no_less_channels, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(mig_no_less_channels, "Prevent migration to system with less channels. "
+		 "[Default=" __stringify(IBMVFC_MIG_NO_N_TO_M) "]");
+
 module_param_named(init_timeout, init_timeout, uint, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(init_timeout, "Initialization timeout in seconds. "
 		 "[Default=" __stringify(IBMVFC_INIT_TIMEOUT) "]");
@@ -823,7 +845,7 @@ static int ibmvfc_reset_crq(struct ibmvfc_host *vhost)
 	crq->cur = 0;
 
 	if (vhost->scsi_scrqs.scrqs) {
-		for (i = 0; i < IBMVFC_SCSI_HW_QUEUES; i++) {
+		for (i = 0; i < nr_scsi_hw_queues; i++) {
 			scrq = &vhost->scsi_scrqs.scrqs[i];
 			memset(scrq->msgs, 0, PAGE_SIZE);
 			scrq->cur = 0;
@@ -3228,6 +3250,36 @@ static ssize_t ibmvfc_store_log_level(struct device *dev,
 	return strlen(buf);
 }
 
+static ssize_t ibmvfc_show_scsi_channels(struct device *dev,
+					 struct device_attribute *attr, char *buf)
+{
+	struct Scsi_Host *shost = class_to_shost(dev);
+	struct ibmvfc_host *vhost = shost_priv(shost);
+	unsigned long flags = 0;
+	int len;
+
+	spin_lock_irqsave(shost->host_lock, flags);
+	len = snprintf(buf, PAGE_SIZE, "%d\n", vhost->client_scsi_channels);
+	spin_unlock_irqrestore(shost->host_lock, flags);
+	return len;
+}
+
+static ssize_t ibmvfc_store_scsi_channels(struct device *dev,
+					 struct device_attribute *attr,
+					 const char *buf, size_t count)
+{
+	struct Scsi_Host *shost = class_to_shost(dev);
+	struct ibmvfc_host *vhost = shost_priv(shost);
+	unsigned long flags = 0;
+	unsigned int channels;
+
+	spin_lock_irqsave(shost->host_lock, flags);
+	channels = simple_strtoul(buf, NULL, 10);
+	vhost->client_scsi_channels = min(channels, nr_scsi_hw_queues);
+	spin_unlock_irqrestore(shost->host_lock, flags);
+	return strlen(buf);
+}
+
 static DEVICE_ATTR(partition_name, S_IRUGO, ibmvfc_show_host_partition_name, NULL);
 static DEVICE_ATTR(device_name, S_IRUGO, ibmvfc_show_host_device_name, NULL);
 static DEVICE_ATTR(port_loc_code, S_IRUGO, ibmvfc_show_host_loc_code, NULL);
@@ -3236,6 +3288,8 @@ static DEVICE_ATTR(npiv_version, S_IRUGO, ibmvfc_show_host_npiv_version, NULL);
 static DEVICE_ATTR(capabilities, S_IRUGO, ibmvfc_show_host_capabilities, NULL);
 static DEVICE_ATTR(log_level, S_IRUGO | S_IWUSR,
 		   ibmvfc_show_log_level, ibmvfc_store_log_level);
+static DEVICE_ATTR(nr_scsi_channels, S_IRUGO | S_IWUSR,
+		   ibmvfc_show_scsi_channels, ibmvfc_store_scsi_channels);
 
 #ifdef CONFIG_SCSI_IBMVFC_TRACE
 /**
@@ -3292,6 +3346,7 @@ static struct device_attribute *ibmvfc_attrs[] = {
 	&dev_attr_npiv_version,
 	&dev_attr_capabilities,
 	&dev_attr_log_level,
+	&dev_attr_nr_scsi_channels,
 	NULL
 };
 
@@ -4676,9 +4731,9 @@ static void ibmvfc_channel_enquiry(struct ibmvfc_host *vhost)
 	mad->common.opcode = cpu_to_be32(IBMVFC_CHANNEL_ENQUIRY);
 	mad->common.length = cpu_to_be16(sizeof(*mad));
 
-	if (IBMVFC_MIG_NO_SUB_TO_CRQ)
+	if (mig_channels_only)
 		mad->flags |= cpu_to_be32(IBMVFC_NO_CHANNELS_TO_CRQ_SUPPORT);
-	if (IBMVFC_MIG_NO_N_TO_M)
+	if (mig_no_less_channels)
 		mad->flags |= cpu_to_be32(IBMVFC_NO_N_TO_M_CHANNELS_SUPPORT);
 
 	ibmvfc_set_host_action(vhost, IBMVFC_HOST_ACTION_INIT_WAIT);
@@ -5416,13 +5471,13 @@ static int ibmvfc_init_sub_crqs(struct ibmvfc_host *vhost)
 
 	ENTER;
 
-	vhost->scsi_scrqs.scrqs = kcalloc(IBMVFC_SCSI_HW_QUEUES,
+	vhost->scsi_scrqs.scrqs = kcalloc(nr_scsi_hw_queues,
 					  sizeof(*vhost->scsi_scrqs.scrqs),
 					  GFP_KERNEL);
 	if (!vhost->scsi_scrqs.scrqs)
 		return -1;
 
-	for (i = 0; i < IBMVFC_SCSI_HW_QUEUES; i++) {
+	for (i = 0; i < nr_scsi_hw_queues; i++) {
 		if (ibmvfc_register_scsi_channel(vhost, i)) {
 			for (j = i; j > 0; j--)
 				ibmvfc_deregister_scsi_channel(vhost, j - 1);
@@ -5446,7 +5501,7 @@ static void ibmvfc_release_sub_crqs(struct ibmvfc_host *vhost)
 	if (!vhost->scsi_scrqs.scrqs)
 		return;
 
-	for (i = 0; i < IBMVFC_SCSI_HW_QUEUES; i++)
+	for (i = 0; i < nr_scsi_hw_queues; i++)
 		ibmvfc_deregister_scsi_channel(vhost, i);
 
 	kfree(vhost->scsi_scrqs.scrqs);
@@ -5658,13 +5713,13 @@ static int ibmvfc_probe(struct vio_dev *vdev, const struct vio_device_id *id)
 	}
 
 	shost->transportt = ibmvfc_transport_template;
-	shost->can_queue = (max_requests / IBMVFC_SCSI_HW_QUEUES);
+	shost->can_queue = (max_requests / nr_scsi_hw_queues);
 	shost->max_lun = max_lun;
 	shost->max_id = max_targets;
 	shost->max_sectors = IBMVFC_MAX_SECTORS;
 	shost->max_cmd_len = IBMVFC_MAX_CDB_LEN;
 	shost->unique_id = shost->host_no;
-	shost->nr_hw_queues = IBMVFC_SCSI_HW_QUEUES;
+	shost->nr_hw_queues = nr_scsi_hw_queues;
 
 	vhost = shost_priv(shost);
 	INIT_LIST_HEAD(&vhost->sent);
@@ -5677,8 +5732,8 @@ static int ibmvfc_probe(struct vio_dev *vdev, const struct vio_device_id *id)
 	vhost->log_level = log_level;
 	vhost->task_set = 1;
 
-	vhost->mq_enabled = IBMVFC_MQ;
-	vhost->client_scsi_channels = IBMVFC_SCSI_CHANNELS;
+	vhost->mq_enabled = mq_enabled;
+	vhost->client_scsi_channels = min(nr_scsi_hw_queues, nr_scsi_channels);
 	vhost->using_channels = 0;
 	vhost->do_enquiry = 1;
 
-- 
2.27.0


^ permalink raw reply related

* [PATCH v2 14/17] ibmvfc: add cancel mad initialization helper
From: Tyrel Datwyler @ 2020-12-02  0:53 UTC (permalink / raw)
  To: james.bottomley
  Cc: Tyrel Datwyler, martin.petersen, linux-scsi, linux-kernel, brking,
	linuxppc-dev
In-Reply-To: <20201202005329.4538-1-tyreld@linux.ibm.com>

Add a helper routine for initializing a Cancel MAD. This will be useful
for a channelized client that needs to send a Cancel commands down every
channel commands were sent for a particular LUN.

Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
---
 drivers/scsi/ibmvscsi/ibmvfc.c | 67 ++++++++++++++++++++--------------
 1 file changed, 39 insertions(+), 28 deletions(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index c1ac2acba5fd..0b6284020f06 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -2286,6 +2286,44 @@ static int ibmvfc_wait_for_ops(struct ibmvfc_host *vhost, void *device,
 	return SUCCESS;
 }
 
+static struct ibmvfc_event *ibmvfc_init_tmf(struct ibmvfc_host *vhost,
+				     struct scsi_device *sdev,
+				     int type)
+{
+	struct scsi_target *starget = scsi_target(sdev);
+	struct fc_rport *rport = starget_to_rport(starget);
+	struct ibmvfc_event *evt;
+	struct ibmvfc_tmf *tmf;
+
+	evt = ibmvfc_get_event(vhost);
+	ibmvfc_init_event(evt, ibmvfc_sync_completion, IBMVFC_MAD_FORMAT);
+
+	tmf = &evt->iu.tmf;
+	memset(tmf, 0, sizeof(*tmf));
+	if (ibmvfc_check_caps(vhost, IBMVFC_HANDLE_VF_WWPN)) {
+		tmf->common.version = cpu_to_be32(2);
+		tmf->target_wwpn = cpu_to_be64(rport->port_name);
+	} else {
+		tmf->common.version = cpu_to_be32(1);
+	}
+	tmf->common.opcode = cpu_to_be32(IBMVFC_TMF_MAD);
+	tmf->common.length = cpu_to_be16(sizeof(*tmf));
+	tmf->scsi_id = cpu_to_be64(rport->port_id);
+	int_to_scsilun(sdev->lun, &tmf->lun);
+	if (!ibmvfc_check_caps(vhost, IBMVFC_CAN_SUPPRESS_ABTS))
+		type &= ~IBMVFC_TMF_SUPPRESS_ABTS;
+	if (vhost->state == IBMVFC_ACTIVE)
+		tmf->flags = cpu_to_be32((type | IBMVFC_TMF_LUA_VALID));
+	else
+		tmf->flags = cpu_to_be32(((type & IBMVFC_TMF_SUPPRESS_ABTS) | IBMVFC_TMF_LUA_VALID));
+	tmf->cancel_key = cpu_to_be32((unsigned long)sdev->hostdata);
+	tmf->my_cancel_key = cpu_to_be32((unsigned long)starget->hostdata);
+
+	init_completion(&evt->comp);
+
+	return evt;
+}
+
 /**
  * ibmvfc_cancel_all - Cancel all outstanding commands to the device
  * @sdev:	scsi device to cancel commands
@@ -2300,9 +2338,6 @@ static int ibmvfc_wait_for_ops(struct ibmvfc_host *vhost, void *device,
 static int ibmvfc_cancel_all(struct scsi_device *sdev, int type)
 {
 	struct ibmvfc_host *vhost = shost_priv(sdev->host);
-	struct scsi_target *starget = scsi_target(sdev);
-	struct fc_rport *rport = starget_to_rport(starget);
-	struct ibmvfc_tmf *tmf;
 	struct ibmvfc_event *evt, *found_evt;
 	union ibmvfc_iu rsp;
 	int rsp_rc = -EBUSY;
@@ -2327,32 +2362,8 @@ static int ibmvfc_cancel_all(struct scsi_device *sdev, int type)
 	}
 
 	if (vhost->logged_in) {
-		evt = ibmvfc_get_event(vhost);
-		ibmvfc_init_event(evt, ibmvfc_sync_completion, IBMVFC_MAD_FORMAT);
-
-		tmf = &evt->iu.tmf;
-		memset(tmf, 0, sizeof(*tmf));
-		if (ibmvfc_check_caps(vhost, IBMVFC_HANDLE_VF_WWPN)) {
-			tmf->common.version = cpu_to_be32(2);
-			tmf->target_wwpn = cpu_to_be64(rport->port_name);
-		} else {
-			tmf->common.version = cpu_to_be32(1);
-		}
-		tmf->common.opcode = cpu_to_be32(IBMVFC_TMF_MAD);
-		tmf->common.length = cpu_to_be16(sizeof(*tmf));
-		tmf->scsi_id = cpu_to_be64(rport->port_id);
-		int_to_scsilun(sdev->lun, &tmf->lun);
-		if (!ibmvfc_check_caps(vhost, IBMVFC_CAN_SUPPRESS_ABTS))
-			type &= ~IBMVFC_TMF_SUPPRESS_ABTS;
-		if (vhost->state == IBMVFC_ACTIVE)
-			tmf->flags = cpu_to_be32((type | IBMVFC_TMF_LUA_VALID));
-		else
-			tmf->flags = cpu_to_be32(((type & IBMVFC_TMF_SUPPRESS_ABTS) | IBMVFC_TMF_LUA_VALID));
-		tmf->cancel_key = cpu_to_be32((unsigned long)sdev->hostdata);
-		tmf->my_cancel_key = cpu_to_be32((unsigned long)starget->hostdata);
-
+		evt = ibmvfc_init_tmf(vhost, sdev, type);
 		evt->sync_iu = &rsp;
-		init_completion(&evt->comp);
 		rsp_rc = ibmvfc_send_event(evt, vhost, default_timeout);
 	}
 
-- 
2.27.0


^ permalink raw reply related

* [PATCH v2 15/17] ibmvfc: send Cancel MAD down each hw scsi channel
From: Tyrel Datwyler @ 2020-12-02  0:53 UTC (permalink / raw)
  To: james.bottomley
  Cc: Tyrel Datwyler, martin.petersen, linux-scsi, linux-kernel, brking,
	linuxppc-dev
In-Reply-To: <20201202005329.4538-1-tyreld@linux.ibm.com>

In general the client needs to send Cancel MADs and task management
commands down the same channel as the command(s) intended to cancel or
abort. The client assigns cancel keys per LUN and thus must send a
Cancel down each channel commands were submitted for that LUN. Further,
the client then must wait for those cancel completions prior to
submitting a LUN RESET or ABORT TASK SET.

Allocate event pointers for each possible scsi channel and assign an
event for each channel that requires a cancel. Wait for completion each
submitted cancel.

Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
---
 drivers/scsi/ibmvscsi/ibmvfc.c | 106 +++++++++++++++++++++------------
 1 file changed, 68 insertions(+), 38 deletions(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index 0b6284020f06..97e8eed04b01 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -2339,32 +2339,52 @@ static int ibmvfc_cancel_all(struct scsi_device *sdev, int type)
 {
 	struct ibmvfc_host *vhost = shost_priv(sdev->host);
 	struct ibmvfc_event *evt, *found_evt;
-	union ibmvfc_iu rsp;
-	int rsp_rc = -EBUSY;
+	struct ibmvfc_event **evt_list;
+	union ibmvfc_iu *rsp;
+	int rsp_rc = 0;
 	unsigned long flags;
 	u16 status;
+	int num_hwq = 1;
+	int i;
+	int ret = 0;
 
 	ENTER;
 	spin_lock_irqsave(vhost->host->host_lock, flags);
-	found_evt = NULL;
-	list_for_each_entry(evt, &vhost->sent, queue) {
-		if (evt->cmnd && evt->cmnd->device == sdev) {
-			found_evt = evt;
-			break;
+	if (vhost->using_channels && vhost->scsi_scrqs.active_queues)
+		num_hwq = vhost->scsi_scrqs.active_queues;
+
+	evt_list = kcalloc(num_hwq, sizeof(*evt_list), GFP_KERNEL);
+	rsp = kcalloc(num_hwq, sizeof(*rsp), GFP_KERNEL);
+
+	for (i = 0; i < num_hwq; i++) {
+		sdev_printk(KERN_INFO, sdev, "Cancelling outstanding commands on queue %d.\n", i);
+
+		found_evt = NULL;
+		list_for_each_entry(evt, &vhost->sent, queue) {
+			if (evt->cmnd && evt->cmnd->device == sdev && evt->hwq == i) {
+				found_evt = evt;
+				break;
+			}
 		}
-	}
 
-	if (!found_evt) {
-		if (vhost->log_level > IBMVFC_DEFAULT_LOG_LEVEL)
-			sdev_printk(KERN_INFO, sdev, "No events found to cancel\n");
-		spin_unlock_irqrestore(vhost->host->host_lock, flags);
-		return 0;
-	}
+		if (!found_evt) {
+			if (vhost->log_level > IBMVFC_DEFAULT_LOG_LEVEL)
+				sdev_printk(KERN_INFO, sdev, "No events found to cancel on queue %d\n", i);
+			continue;
+		}
 
-	if (vhost->logged_in) {
-		evt = ibmvfc_init_tmf(vhost, sdev, type);
-		evt->sync_iu = &rsp;
-		rsp_rc = ibmvfc_send_event(evt, vhost, default_timeout);
+
+		if (vhost->logged_in) {
+			evt_list[i] = ibmvfc_init_tmf(vhost, sdev, type);
+			evt_list[i]->hwq = i;
+			evt_list[i]->sync_iu = &rsp[i];
+			rsp_rc = ibmvfc_send_event(evt_list[i], vhost, default_timeout);
+			if (rsp_rc)
+				break;
+		} else {
+			rsp_rc = -EBUSY;
+			break;
+		}
 	}
 
 	spin_unlock_irqrestore(vhost->host->host_lock, flags);
@@ -2374,32 +2394,42 @@ static int ibmvfc_cancel_all(struct scsi_device *sdev, int type)
 		/* If failure is received, the host adapter is most likely going
 		 through reset, return success so the caller will wait for the command
 		 being cancelled to get returned */
-		return 0;
+		goto free_mem;
 	}
 
-	sdev_printk(KERN_INFO, sdev, "Cancelling outstanding commands.\n");
-
-	wait_for_completion(&evt->comp);
-	status = be16_to_cpu(rsp.mad_common.status);
-	spin_lock_irqsave(vhost->host->host_lock, flags);
-	ibmvfc_free_event(evt);
-	spin_unlock_irqrestore(vhost->host->host_lock, flags);
+	for (i = 0; i < num_hwq; i++) {
+		if (!evt_list[i])
+			continue;
 
-	if (status != IBMVFC_MAD_SUCCESS) {
-		sdev_printk(KERN_WARNING, sdev, "Cancel failed with rc=%x\n", status);
-		switch (status) {
-		case IBMVFC_MAD_DRIVER_FAILED:
-		case IBMVFC_MAD_CRQ_ERROR:
-			/* Host adapter most likely going through reset, return success to
-			 the caller will wait for the command being cancelled to get returned */
-			return 0;
-		default:
-			return -EIO;
-		};
+		wait_for_completion(&evt_list[i]->comp);
+		status = be16_to_cpu(rsp[i].mad_common.status);
+
+		if (status != IBMVFC_MAD_SUCCESS) {
+			sdev_printk(KERN_WARNING, sdev, "Cancel failed with rc=%x\n", status);
+			switch (status) {
+			case IBMVFC_MAD_DRIVER_FAILED:
+			case IBMVFC_MAD_CRQ_ERROR:
+				/* Host adapter most likely going through reset, return success to
+				 the caller will wait for the command being cancelled to get returned */
+				goto free_mem;
+			default:
+				ret = -EIO;
+				goto free_mem;
+			};
+		}
 	}
 
 	sdev_printk(KERN_INFO, sdev, "Successfully cancelled outstanding commands\n");
-	return 0;
+free_mem:
+	spin_lock_irqsave(vhost->host->host_lock, flags);
+	for (i = 0; i < num_hwq; i++)
+		if (evt_list[i])
+			ibmvfc_free_event(evt_list[i]);
+	spin_unlock_irqrestore(vhost->host->host_lock, flags);
+	kfree(evt_list);
+	kfree(rsp);
+
+	return ret;
 }
 
 /**
-- 
2.27.0


^ permalink raw reply related

* [PATCH v2 02/17] ibmvfc: define hcall wrapper for registering a Sub-CRQ
From: Tyrel Datwyler @ 2020-12-02  0:53 UTC (permalink / raw)
  To: james.bottomley
  Cc: Tyrel Datwyler, martin.petersen, linux-scsi, linux-kernel,
	Brian King, brking, linuxppc-dev
In-Reply-To: <20201202005329.4538-1-tyreld@linux.ibm.com>

Sub-CRQs are registred with firmware via a hypercall. Abstract that
interface into a simpler helper function.

Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
Reviewed-by: Brian King <brking@linux.vnet.ibm.com>
---
 drivers/scsi/ibmvscsi/ibmvfc.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index f1d677a7423d..64674054dbae 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -138,6 +138,20 @@ static void ibmvfc_tgt_move_login(struct ibmvfc_target *);
 
 static const char *unknown_error = "unknown error";
 
+static long h_reg_sub_crq(unsigned long unit_address, unsigned long ioba,
+			  unsigned long length, unsigned long *cookie,
+			  unsigned long *irq)
+{
+	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+	long rc;
+
+	rc = plpar_hcall(H_REG_SUB_CRQ, retbuf, unit_address, ioba, length);
+	*cookie = retbuf[0];
+	*irq = retbuf[1];
+
+	return rc;
+}
+
 static int ibmvfc_check_caps(struct ibmvfc_host *vhost, unsigned long cap_flags)
 {
 	u64 host_caps = be64_to_cpu(vhost->login_buf->resp.capabilities);
-- 
2.27.0


^ permalink raw reply related

* [PATCH v2 01/17] ibmvfc: add vhost fields and defaults for MQ enablement
From: Tyrel Datwyler @ 2020-12-02  0:53 UTC (permalink / raw)
  To: james.bottomley
  Cc: Tyrel Datwyler, martin.petersen, linux-scsi, linux-kernel, brking,
	linuxppc-dev
In-Reply-To: <20201202005329.4538-1-tyreld@linux.ibm.com>

Introduce several new vhost fields for managing MQ state of the adapter
as well as initial defaults for MQ enablement.

Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
---
 drivers/scsi/ibmvscsi/ibmvfc.c |  9 ++++++++-
 drivers/scsi/ibmvscsi/ibmvfc.h | 13 +++++++++++--
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index 42e4d35e0d35..f1d677a7423d 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -5161,12 +5161,13 @@ static int ibmvfc_probe(struct vio_dev *vdev, const struct vio_device_id *id)
 	}
 
 	shost->transportt = ibmvfc_transport_template;
-	shost->can_queue = max_requests;
+	shost->can_queue = (max_requests / IBMVFC_SCSI_HW_QUEUES);
 	shost->max_lun = max_lun;
 	shost->max_id = max_targets;
 	shost->max_sectors = IBMVFC_MAX_SECTORS;
 	shost->max_cmd_len = IBMVFC_MAX_CDB_LEN;
 	shost->unique_id = shost->host_no;
+	shost->nr_hw_queues = IBMVFC_SCSI_HW_QUEUES;
 
 	vhost = shost_priv(shost);
 	INIT_LIST_HEAD(&vhost->sent);
@@ -5178,6 +5179,12 @@ static int ibmvfc_probe(struct vio_dev *vdev, const struct vio_device_id *id)
 	vhost->partition_number = -1;
 	vhost->log_level = log_level;
 	vhost->task_set = 1;
+
+	vhost->mq_enabled = IBMVFC_MQ;
+	vhost->client_scsi_channels = IBMVFC_SCSI_CHANNELS;
+	vhost->using_channels = 0;
+	vhost->do_enquiry = 1;
+
 	strcpy(vhost->partition_name, "UNKNOWN");
 	init_waitqueue_head(&vhost->work_wait_q);
 	init_waitqueue_head(&vhost->init_wait_q);
diff --git a/drivers/scsi/ibmvscsi/ibmvfc.h b/drivers/scsi/ibmvscsi/ibmvfc.h
index 9d58cfd774d3..e095daada70e 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.h
+++ b/drivers/scsi/ibmvscsi/ibmvfc.h
@@ -41,16 +41,21 @@
 #define IBMVFC_DEFAULT_LOG_LEVEL	2
 #define IBMVFC_MAX_CDB_LEN		16
 #define IBMVFC_CLS3_ERROR		0
+#define IBMVFC_MQ			0
+#define IBMVFC_SCSI_CHANNELS		0
+#define IBMVFC_SCSI_HW_QUEUES		1
+#define IBMVFC_MIG_NO_SUB_TO_CRQ	0
+#define IBMVFC_MIG_NO_N_TO_M		0
 
 /*
  * Ensure we have resources for ERP and initialization:
- * 1 for ERP
  * 1 for initialization
  * 1 for NPIV Logout
  * 2 for BSG passthru
  * 2 for each discovery thread
+ * 1 ERP for each possible HW Queue
  */
-#define IBMVFC_NUM_INTERNAL_REQ	(1 + 1 + 1 + 2 + (disc_threads * 2))
+#define IBMVFC_NUM_INTERNAL_REQ	(1 + 1 + 2 + (disc_threads * 2) + IBMVFC_SCSI_HW_QUEUES)
 
 #define IBMVFC_MAD_SUCCESS		0x00
 #define IBMVFC_MAD_NOT_SUPPORTED	0xF1
@@ -826,6 +831,10 @@ struct ibmvfc_host {
 	int delay_init;
 	int scan_complete;
 	int logged_in;
+	int mq_enabled;
+	int using_channels;
+	int do_enquiry;
+	int client_scsi_channels;
 	int aborting_passthru;
 	int events_to_log;
 #define IBMVFC_AE_LINKUP	0x0001
-- 
2.27.0


^ permalink raw reply related

* [PATCH v2 10/17] ibmvfc: advertise client support for using hardware channels
From: Tyrel Datwyler @ 2020-12-02  0:53 UTC (permalink / raw)
  To: james.bottomley
  Cc: Tyrel Datwyler, martin.petersen, linux-scsi, linux-kernel,
	Brian King, brking, linuxppc-dev
In-Reply-To: <20201202005329.4538-1-tyreld@linux.ibm.com>

Previous patches have plumbed the necessary Sub-CRQ interface and
channel negotiation MADs to fully channelized hardware queues.

Advertise client support via NPIV Login capability
IBMVFC_CAN_USE_CHANNELS when the client bits have MQ enabled via
vhost->mq_enabled, or when channels were already in use during a
subsequent NPIV Login. The later is required because channel support is
only renegotiated after a CRQ pair is broken. Simple NPIV Logout/Logins
require the client to continue to advertise the channel capability until
the CRQ pair between the client is broken.

Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
Reviewed-by: Brian King <brking@linux.vnet.ibm.com>
---
 drivers/scsi/ibmvscsi/ibmvfc.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index bfd3340eb0b6..0e6c9e55a221 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -1282,6 +1282,10 @@ static void ibmvfc_set_login_info(struct ibmvfc_host *vhost)
 
 	login_info->max_cmds = cpu_to_be32(max_requests + IBMVFC_NUM_INTERNAL_REQ);
 	login_info->capabilities = cpu_to_be64(IBMVFC_CAN_MIGRATE | IBMVFC_CAN_SEND_VF_WWPN);
+
+	if (vhost->mq_enabled || vhost->using_channels)
+		login_info->capabilities |= cpu_to_be64(IBMVFC_CAN_USE_CHANNELS);
+
 	login_info->async.va = cpu_to_be64(vhost->async_crq.msg_token);
 	login_info->async.len = cpu_to_be32(vhost->async_crq.size * sizeof(*vhost->async_crq.msgs));
 	strncpy(login_info->partition_name, vhost->partition_name, IBMVFC_MAX_NAME);
-- 
2.27.0


^ permalink raw reply related

* [PATCH v2 05/17] ibmvfc: add Sub-CRQ IRQ enable/disable routine
From: Tyrel Datwyler @ 2020-12-02  0:53 UTC (permalink / raw)
  To: james.bottomley
  Cc: Tyrel Datwyler, martin.petersen, linux-scsi, linux-kernel,
	Brian King, brking, linuxppc-dev
In-Reply-To: <20201202005329.4538-1-tyreld@linux.ibm.com>

Each Sub-CRQ has its own interrupt. A hypercall is required to toggle
the IRQ state. Provide the necessary mechanism via a helper function.

Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
Reviewed-by: Brian King <brking@linux.vnet.ibm.com>
---
 drivers/scsi/ibmvscsi/ibmvfc.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index 4860487c6779..97f00fefa809 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -3361,6 +3361,26 @@ static void ibmvfc_tasklet(void *data)
 	spin_unlock_irqrestore(vhost->host->host_lock, flags);
 }
 
+static int ibmvfc_toggle_scrq_irq(struct ibmvfc_sub_queue *scrq, int enable)
+{
+	struct device *dev = scrq->vhost->dev;
+	struct vio_dev *vdev = to_vio_dev(dev);
+	unsigned long rc;
+	int irq_action = H_ENABLE_VIO_INTERRUPT;
+
+	if (!enable)
+		irq_action = H_DISABLE_VIO_INTERRUPT;
+
+	rc = plpar_hcall_norets(H_VIOCTL, vdev->unit_address, irq_action,
+				scrq->hw_irq, 0, 0);
+
+	if (rc)
+		dev_err(dev, "Couldn't %s sub-crq[%lu] irq. rc=%ld\n",
+			enable ? "enable" : "disable", scrq->hwq_id, rc);
+
+	return rc;
+}
+
 /**
  * ibmvfc_init_tgt - Set the next init job step for the target
  * @tgt:		ibmvfc target struct
-- 
2.27.0


^ permalink raw reply related

* Re: [net-next PATCH] net: freescale: ucc_geth: remove unused SKB_ALLOC_TIMEOUT
From: patchwork-bot+netdevbpf @ 2020-12-02  1:00 UTC (permalink / raw)
  To: Chris Packham
  Cc: trivial, netdev, linux-kernel, leoyang.li, kuba, linuxppc-dev,
	davem
In-Reply-To: <20201130001010.28998-1-chris.packham@alliedtelesis.co.nz>

Hello:

This patch was applied to netdev/net-next.git (refs/heads/master):

On Mon, 30 Nov 2020 13:10:10 +1300 you wrote:
> This was added in commit ce973b141dfa ("[PATCH] Freescale QE UCC gigabit
> ethernet driver") but doesn't appear to have been used. Remove it now.
> 
> Signed-off-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
> ---
>  drivers/net/ethernet/freescale/ucc_geth.h | 1 -
>  1 file changed, 1 deletion(-)

Here is the summary with links:
  - [net-next] net: freescale: ucc_geth: remove unused SKB_ALLOC_TIMEOUT
    https://git.kernel.org/netdev/net-next/c/2bf7d3776b74

You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* [PATCH v2 03/17] ibmvfc: add Subordinate CRQ definitions
From: Tyrel Datwyler @ 2020-12-02  0:53 UTC (permalink / raw)
  To: james.bottomley
  Cc: Tyrel Datwyler, martin.petersen, linux-scsi, linux-kernel,
	Brian King, brking, linuxppc-dev
In-Reply-To: <20201202005329.4538-1-tyreld@linux.ibm.com>

Subordinate Command Response Queues (Sub CRQ) are used in conjunction
with the primary CRQ when more than one queue is needed by the virtual
IO adapter. Recent phyp firmware versions support Sub CRQ's with ibmvfc
adapters. This feature is a prerequisite for supporting multiple
hardware backed submission queues in the vfc adapter.

The Sub CRQ command element differs from the standard CRQ in that it is
32bytes long as opposed to 16bytes for the latter. Despite this extra
16bytes the ibmvfc protocol will use the original CRQ command element
mapped to the first 16bytes of the Sub CRQ element initially.

Add definitions for the Sub CRQ command element and queue.

Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
Reviewed-by: Brian King <brking@linux.vnet.ibm.com>
---
 drivers/scsi/ibmvscsi/ibmvfc.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.h b/drivers/scsi/ibmvscsi/ibmvfc.h
index e095daada70e..b3cd35cbf067 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.h
+++ b/drivers/scsi/ibmvscsi/ibmvfc.h
@@ -656,6 +656,29 @@ struct ibmvfc_crq_queue {
 	dma_addr_t msg_token;
 };
 
+struct ibmvfc_sub_crq {
+	struct ibmvfc_crq crq;
+	__be64 reserved[2];
+} __packed __aligned(8);
+
+struct ibmvfc_sub_queue {
+	struct ibmvfc_sub_crq *msgs;
+	dma_addr_t msg_token;
+	int size, cur;
+	struct ibmvfc_host *vhost;
+	unsigned long cookie;
+	unsigned long vios_cookie;
+	unsigned long hw_irq;
+	unsigned long irq;
+	unsigned long hwq_id;
+	char name[32];
+};
+
+struct ibmvfc_scsi_channels {
+	struct ibmvfc_sub_queue *scrqs;
+	unsigned int active_queues;
+};
+
 enum ibmvfc_ae_link_state {
 	IBMVFC_AE_LS_LINK_UP		= 0x01,
 	IBMVFC_AE_LS_LINK_BOUNCED	= 0x02,
-- 
2.27.0


^ permalink raw reply related

* [PATCH kernel] powerpc/kuap: Restore AMR after replaying soft interrupts
From: Alexey Kardashevskiy @ 2020-12-02  1:09 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Alexey Kardashevskiy, Nicholas Piggin

When interrupted in raw_copy_from_user()/... after user memory access
is enabled, a nested handler may also access user memory (perf is
one example) and when it does so, it calls prevent_read_from_user()
which prevents the upper handler from accessing user memory.

This saves/restores AMR when replaying interrupts. get_kuap/set_kuap have
stubs for disabled KUAP so no ifdefs.

Found by syzkaller.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---

This is an example:

------------[ cut here ]------------
Bug: Read fault blocked by AMR!
WARNING: CPU: 0 PID: 1603 at /home/aik/p/kernel/arch/powerpc/include/asm/book3s/64/kup-radix.h:145 __do_page_fau

Modules linked in:
CPU: 0 PID: 1603 Comm: amr Not tainted 5.10.0-rc6_v5.10-rc6_a+fstn1 #24
NIP:  c00000000009ece8 LR: c00000000009ece4 CTR: 0000000000000000
REGS: c00000000dc63560 TRAP: 0700   Not tainted  (5.10.0-rc6_v5.10-rc6_a+fstn1)
MSR:  8000000000021033 <SF,ME,IR,DR,RI,LE>  CR: 28002888  XER: 20040000
CFAR: c0000000001fa928 IRQMASK: 1
GPR00: c00000000009ece4 c00000000dc637f0 c000000002397600 000000000000001f
GPR04: c0000000020eb318 0000000000000000 c00000000dc63494 0000000000000027
GPR08: c00000007fe4de68 c00000000dfe9180 0000000000000000 0000000000000001
GPR12: 0000000000002000 c0000000030a0000 0000000000000000 0000000000000000
GPR16: 0000000000000000 0000000000000000 0000000000000000 bfffffffffffffff
GPR20: 0000000000000000 c0000000134a4020 c0000000019c2218 0000000000000fe0
GPR24: 0000000000000000 0000000000000000 c00000000d106200 0000000040000000
GPR28: 0000000000000000 0000000000000300 c00000000dc63910 c000000001946730
NIP [c00000000009ece8] __do_page_fault+0xb38/0xde0
LR [c00000000009ece4] __do_page_fault+0xb34/0xde0
Call Trace:
[c00000000dc637f0] [c00000000009ece4] __do_page_fault+0xb34/0xde0 (unreliable)
[c00000000dc638a0] [c00000000000c968] handle_page_fault+0x10/0x2c
--- interrupt: 300 at strncpy_from_user+0x290/0x440
    LR = strncpy_from_user+0x284/0x440
[c00000000dc63ba0] [c000000000c3dcb0] strncpy_from_user+0x2f0/0x440 (unreliable)
[c00000000dc63c30] [c00000000068b888] getname_flags+0x88/0x2c0
[c00000000dc63c90] [c000000000662a44] do_sys_openat2+0x2d4/0x5f0
[c00000000dc63d30] [c00000000066560c] do_sys_open+0xcc/0x140
[c00000000dc63dc0] [c000000000045e10] system_call_exception+0x160/0x240
[c00000000dc63e20] [c00000000000da60] system_call_common+0xf0/0x27c
Instruction dump:
409c0048 3fe2ff5b 3bfff128 fac10060 fae10068 482f7a85 60000000 3c62ff5b
7fe4fb78 3863f250 4815bbd9 60000000 <0fe00000> 3c62ff5b 3863f2b8 4815c8b5
irq event stamp: 254
hardirqs last  enabled at (253): [<c000000000019550>] arch_local_irq_restore+0xa0/0x150
hardirqs last disabled at (254): [<c000000000008a10>] data_access_common_virt+0x1b0/0x1d0
softirqs last  enabled at (0): [<c0000000001f6d5c>] copy_process+0x78c/0x2120
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace ba98aec5151f3aeb ]---
---
 arch/powerpc/kernel/irq.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 7d0f7682d01d..915123d861d0 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -221,6 +221,7 @@ void replay_soft_interrupts(void)
 	 */
 	unsigned char happened = local_paca->irq_happened;
 	struct pt_regs regs;
+	unsigned long kuap_state = get_kuap();
 
 	ppc_save_regs(&regs);
 	regs.softe = IRQS_ENABLED;
@@ -309,6 +310,7 @@ void replay_soft_interrupts(void)
 		trace_hardirqs_off();
 		goto again;
 	}
+	set_kuap(kuap_state);
 }
 
 notrace void arch_local_irq_restore(unsigned long mask)
-- 
2.17.1


^ permalink raw reply related

* Re: [PATCH v2 2/2] kbuild: Disable CONFIG_LD_ORPHAN_WARN for ld.lld 10.0.1
From: Masahiro Yamada @ 2020-12-02  2:37 UTC (permalink / raw)
  To: Kees Cook
  Cc: Michal Marek, kernelci . org bot, Linux Kbuild mailing list,
	Catalin Marinas, Mark Brown,
	maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT), Nick Desaulniers,
	Russell King, LKML, linuxppc-dev, Arvind Sankar, Ingo Molnar,
	Borislav Petkov, clang-built-linux, Nathan Chancellor,
	Will Deacon, Thomas Gleixner, Linux ARM
In-Reply-To: <202012011255.9D677ED3@keescook>

On Wed, Dec 2, 2020 at 5:56 AM Kees Cook <keescook@chromium.org> wrote:
>
> On Tue, Dec 01, 2020 at 10:31:37PM +0900, Masahiro Yamada wrote:
> > On Wed, Nov 25, 2020 at 7:22 AM Kees Cook <keescook@chromium.org> wrote:
> > >
> > > On Thu, Nov 19, 2020 at 01:13:27PM -0800, Nick Desaulniers wrote:
> > > > On Thu, Nov 19, 2020 at 12:57 PM Nathan Chancellor
> > > > <natechancellor@gmail.com> wrote:
> > > > >
> > > > > ld.lld 10.0.1 spews a bunch of various warnings about .rela sections,
> > > > > along with a few others. Newer versions of ld.lld do not have these
> > > > > warnings. As a result, do not add '--orphan-handling=warn' to
> > > > > LDFLAGS_vmlinux if ld.lld's version is not new enough.
> > > > >
> > > > > Link: https://github.com/ClangBuiltLinux/linux/issues/1187
> > > > > Link: https://github.com/ClangBuiltLinux/linux/issues/1193
> > > > > Reported-by: Arvind Sankar <nivedita@alum.mit.edu>
> > > > > Reported-by: kernelci.org bot <bot@kernelci.org>
> > > > > Reported-by: Mark Brown <broonie@kernel.org>
> > > > > Reviewed-by: Kees Cook <keescook@chromium.org>
> > > > > Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
> > > >
> > > > Thanks for the additions in v2.
> > > > Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
> > >
> > > I'm going to carry this for a few days in -next, and if no one screams,
> > > ask Linus to pull it for v5.10-rc6.
> > >
> > > Thanks!
> > >
> > > --
> > > Kees Cook
> >
> >
> > Sorry for the delay.
> > Applied to linux-kbuild.
>
> Great, thanks!
>
> > But, I already see this in linux-next.
> > Please let me know if I should drop it from my tree.
>
> My intention was to get this to Linus this week. Do you want to do that
> yourself, or Ack the patches in my tree and I'll send it?
>
> -Kees
>
> --
> Kees Cook


I will send a kbuild pull request myself this week.




-- 
Best Regards
Masahiro Yamada

^ permalink raw reply

* Re: [PATCH 5/8] lazy tlb: allow lazy tlb mm switching to be configurable
From: Nicholas Piggin @ 2020-12-02  2:49 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, X86 ML, LKML, Linux-MM,
	Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <CALCETrWz3hqptsmTHAu1Qb=E8FPhYRVfcO1nhTVHwOpTNq6w1w@mail.gmail.com>

Excerpts from Andy Lutomirski's message of November 29, 2020 10:36 am:
> On Sat, Nov 28, 2020 at 8:02 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>>
>> NOMMU systems could easily go without this and save a bit of code
>> and the refcount atomics, because their mm switch is a no-op. I
>> haven't flipped them over because haven't audited all arch code to
>> convert over to using the _lazy_tlb refcounting.
>>
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>>  arch/Kconfig             | 11 +++++++
>>  include/linux/sched/mm.h | 13 ++++++--
>>  kernel/sched/core.c      | 68 +++++++++++++++++++++++++++++-----------
>>  kernel/sched/sched.h     |  4 ++-
>>  4 files changed, 75 insertions(+), 21 deletions(-)
>>
>> diff --git a/arch/Kconfig b/arch/Kconfig
>> index 56b6ccc0e32d..596bf589d74b 100644
>> --- a/arch/Kconfig
>> +++ b/arch/Kconfig
>> @@ -430,6 +430,17 @@ config ARCH_WANT_IRQS_OFF_ACTIVATE_MM
>>           irqs disabled over activate_mm. Architectures that do IPI based TLB
>>           shootdowns should enable this.
>>
>> +# Should make this depend on MMU, because there is little use for lazy mm switching
>> +# with NOMMU. Must audit NOMMU architecture code for lazy mm refcounting first.
>> +config MMU_LAZY_TLB
>> +       def_bool y
>> +       help
>> +         Enable "lazy TLB" mmu context switching for kernel threads.
>> +
>> +config MMU_LAZY_TLB_REFCOUNT
>> +       def_bool y
>> +       depends on MMU_LAZY_TLB
>> +
> 
> This could use some documentation as to what "no" means.

Sure I can add a bit more.

> 
>>  config ARCH_HAVE_NMI_SAFE_CMPXCHG
>>         bool
>>
>> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
>> index 7157c0f6fef8..bd0f27402d4b 100644
>> --- a/include/linux/sched/mm.h
>> +++ b/include/linux/sched/mm.h
>> @@ -51,12 +51,21 @@ static inline void mmdrop(struct mm_struct *mm)
>>  /* Helpers for lazy TLB mm refcounting */
>>  static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
>>  {
>> -       mmgrab(mm);
>> +       if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
>> +               mmgrab(mm);
>>  }
>>
>>  static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
>>  {
>> -       mmdrop(mm);
>> +       if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
>> +               mmdrop(mm);
>> +       } else {
>> +               /*
>> +                * mmdrop_lazy_tlb must provide a full memory barrier, see the
>> +                * membarrier comment finish_task_switch.
> 
> "membarrier comment in finish_task_switch()", perhaps?

Sure.

Thanks,
Nick


^ permalink raw reply

* Re: [PATCH 2/8] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
From: Nicholas Piggin @ 2020-12-02  2:49 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, X86 ML, LKML, Linux-MM,
	Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <CALCETrXYkbuJJnDv9ijfT+5tLQ2FOvvN1Ugoh5NwOy+zHp9HXA@mail.gmail.com>

Excerpts from Andy Lutomirski's message of November 29, 2020 3:55 am:
> On Sat, Nov 28, 2020 at 8:02 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>>
>> And get rid of the generic sync_core_before_usermode facility. This is
>> functionally a no-op in the core scheduler code, but it also catches
>>
>> This helper is the wrong way around I think. The idea that membarrier
>> state requires a core sync before returning to user is the easy one
>> that does not need hiding behind membarrier calls. The gap in core
>> synchronization due to x86's sysret/sysexit and lazy tlb mode, is the
>> tricky detail that is better put in x86 lazy tlb code.
>>
>> Consider if an arch did not synchronize core in switch_mm either, then
>> membarrier_mm_sync_core_before_usermode would be in the wrong place
>> but arch specific mmu context functions would still be the right place.
>> There is also a exit_lazy_tlb case that is not covered by this call, which
>> could be a bugs (kthread use mm the membarrier process's mm then context
>> switch back to the process without switching mm or lazy mm switch).
>>
>> This makes lazy tlb code a bit more modular.
> 
> I have a couple of membarrier fixes that I want to send out today or
> tomorrow, and they might eliminate the need for this patch.  Let me
> think about this a little bit.  I'll cc you.  The existing code is way
> to subtle and the comments are far too confusing for me to be quickly
> confident about any of my conclusions :)
> 

Thanks for the head's up. I'll have to have a better look through them 
but I don't know that it eliminates the need for this entirely although
it might close some gaps and make this not a bug fix. The problem here 
is x86 code wanted something to be called when a lazy mm is unlazied,
but it missed some spots and also the core scheduler doesn't need to 
know about those x86 details if it has this generic call that annotates
the lazy handling better.

I'll go through the wording again and look at your patches a bit better
but I think they are somewhat orthogonal.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH 1/8] lazy tlb: introduce exit_lazy_tlb
From: Nicholas Piggin @ 2020-12-02  2:49 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, X86 ML, LKML, Linux-MM,
	Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <CALCETrVbFm7gZ7G_5DWa6UGYtCzZTQvC_CPRVDZ0Lb-tiMnjSg@mail.gmail.com>

Excerpts from Andy Lutomirski's message of November 29, 2020 10:38 am:
> On Sat, Nov 28, 2020 at 8:01 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>>
>> This is called at points where a lazy mm is switched away or made not
>> lazy (by its owner switching back).
>>
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>>  arch/arm/mach-rpc/ecard.c            |  1 +
>>  arch/powerpc/mm/book3s64/radix_tlb.c |  1 +
>>  fs/exec.c                            |  6 ++++--
>>  include/asm-generic/mmu_context.h    | 21 +++++++++++++++++++++
>>  kernel/kthread.c                     |  1 +
>>  kernel/sched/core.c                  |  2 ++
>>  6 files changed, 30 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm/mach-rpc/ecard.c b/arch/arm/mach-rpc/ecard.c
>> index 827b50f1c73e..43eb1bfba466 100644
>> --- a/arch/arm/mach-rpc/ecard.c
>> +++ b/arch/arm/mach-rpc/ecard.c
>> @@ -253,6 +253,7 @@ static int ecard_init_mm(void)
>>         current->mm = mm;
>>         current->active_mm = mm;
>>         activate_mm(active_mm, mm);
>> +       exit_lazy_tlb(active_mm, current);
>>         mmdrop(active_mm);
>>         ecard_init_pgtables(mm);
>>         return 0;
>> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
>> index b487b489d4b6..ac3fec03926a 100644
>> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
>> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
>> @@ -661,6 +661,7 @@ static void do_exit_flush_lazy_tlb(void *arg)
>>                 mmgrab(&init_mm);
>>                 current->active_mm = &init_mm;
>>                 switch_mm_irqs_off(mm, &init_mm, current);
>> +               exit_lazy_tlb(mm, current);
>>                 mmdrop(mm);
>>         }
>>
>> diff --git a/fs/exec.c b/fs/exec.c
>> index 547a2390baf5..4b4dea1bb7ba 100644
>> --- a/fs/exec.c
>> +++ b/fs/exec.c
>> @@ -1017,6 +1017,8 @@ static int exec_mmap(struct mm_struct *mm)
>>         if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
>>                 local_irq_enable();
>>         activate_mm(active_mm, mm);
>> +       if (!old_mm)
>> +               exit_lazy_tlb(active_mm, tsk);
>>         if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
>>                 local_irq_enable();
>>         tsk->mm->vmacache_seqnum = 0;
>> @@ -1028,9 +1030,9 @@ static int exec_mmap(struct mm_struct *mm)
>>                 setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
>>                 mm_update_next_owner(old_mm);
>>                 mmput(old_mm);
>> -               return 0;
>> +       } else {
>> +               mmdrop(active_mm);
>>         }
>> -       mmdrop(active_mm);
> 
> This looks like an unrelated change.

I thought the old style was pointless and made me look twice to make 
sure we weren't mmdrop()ing the lazy.

> 
>>         return 0;
>>  }
>>
>> diff --git a/include/asm-generic/mmu_context.h b/include/asm-generic/mmu_context.h
>> index 91727065bacb..4626d0020e65 100644
>> --- a/include/asm-generic/mmu_context.h
>> +++ b/include/asm-generic/mmu_context.h
>> @@ -24,6 +24,27 @@ static inline void enter_lazy_tlb(struct mm_struct *mm,
>>  }
>>  #endif
>>
>> +/*
>> + * exit_lazy_tlb - Called after switching away from a lazy TLB mode mm.
>> + *
>> + * mm:  the lazy mm context that was switched
>> + * tsk: the task that was switched to (with a non-lazy mm)
>> + *
>> + * mm may equal tsk->mm.
>> + * mm and tsk->mm will not be NULL.
>> + *
>> + * Note this is not symmetrical to enter_lazy_tlb, this is not
>> + * called when tasks switch into the lazy mm, it's called after the
>> + * lazy mm becomes non-lazy (either switched to a different mm or the
>> + * owner of the mm returns).
>> + */
>> +#ifndef exit_lazy_tlb
>> +static inline void exit_lazy_tlb(struct mm_struct *mm,
> 
> Maybe name this parameter prev_lazy_mm?
> 

mm is better because it's the mm that we're "exiting lazy" from, the 
function name gives the context.

prev might suggest it was the previous but it's the current one, or
that we're switching to another mm but we may not be at all.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH kernel] powerpc/perf: Stop crashing with generic_compat_pmu
From: Alexey Kardashevskiy @ 2020-12-02  3:01 UTC (permalink / raw)
  To: Madhavan Srinivasan, linuxppc-dev; +Cc: Madhavan Srinivasan
In-Reply-To: <c3852667-210c-48de-7f89-a06250b4df05@linux.ibm.com>

Hi Maddy,

I just noticed that I still have "powerpc/perf: Add checks for reserved 
values" in my pile (pushed here 
https://github.com/aik/linux/commit/61e1bc3f2e19d450e2e2d39174d422160b21957b 
), do we still need it? The lockups I saw were fixed by 
https://github.com/aik/linux/commit/17899eaf88d689 but it is hardly a 
replacement. Thanks,


On 04/06/2020 02:34, Madhavan Srinivasan wrote:
> 
> 
> On 6/2/20 8:26 AM, Alexey Kardashevskiy wrote:
>> The bhrb_filter_map ("The  Branch  History  Rolling  Buffer") callback is
>> only defined in raw CPUs' power_pmu structs. The "architected" CPUs use
>> generic_compat_pmu which does not have this callback and crashed occur.
>>
>> This add a NULL pointer check for bhrb_filter_map() which behaves as if
>> the callback returned an error.
>>
>> This does not add the same check for config_bhrb() as the only caller
>> checks for cpuhw->bhrb_users which remains zero if bhrb_filter_map==0.
> 
> Changes looks fine.
> Reviewed-by: Madhavan Srinivasan <maddy@linux.ibm.com>
> 
> The commit be80e758d0c2e ('powerpc/perf: Add generic compat mode pmu 
> driver')
> which introduced generic_compat_pmu was merged in v5.2.  So we need to
> CC stable starting from 5.2 :( .  My bad,  sorry.
> 
> Maddy
> 
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>   arch/powerpc/perf/core-book3s.c | 19 ++++++++++++++-----
>>   1 file changed, 14 insertions(+), 5 deletions(-)
>>
>> diff --git a/arch/powerpc/perf/core-book3s.c 
>> b/arch/powerpc/perf/core-book3s.c
>> index 3dcfecf858f3..36870569bf9c 100644
>> --- a/arch/powerpc/perf/core-book3s.c
>> +++ b/arch/powerpc/perf/core-book3s.c
>> @@ -1515,9 +1515,16 @@ static int power_pmu_add(struct perf_event 
>> *event, int ef_flags)
>>       ret = 0;
>>    out:
>>       if (has_branch_stack(event)) {
>> -        power_pmu_bhrb_enable(event);
>> -        cpuhw->bhrb_filter = ppmu->bhrb_filter_map(
>> -                    event->attr.branch_sample_type);
>> +        u64 bhrb_filter = -1;
>> +
>> +        if (ppmu->bhrb_filter_map)
>> +            bhrb_filter = ppmu->bhrb_filter_map(
>> +                event->attr.branch_sample_type);
>> +
>> +        if (bhrb_filter != -1) {
>> +            cpuhw->bhrb_filter = bhrb_filter;
>> +            power_pmu_bhrb_enable(event); /* Does bhrb_users++ */
>> +        }
>>       }
>>
>>       perf_pmu_enable(event->pmu);
>> @@ -1839,7 +1846,6 @@ static int power_pmu_event_init(struct 
>> perf_event *event)
>>       int n;
>>       int err;
>>       struct cpu_hw_events *cpuhw;
>> -    u64 bhrb_filter;
>>
>>       if (!ppmu)
>>           return -ENOENT;
>> @@ -1945,7 +1951,10 @@ static int power_pmu_event_init(struct 
>> perf_event *event)
>>       err = power_check_constraints(cpuhw, events, cflags, n + 1);
>>
>>       if (has_branch_stack(event)) {
>> -        bhrb_filter = ppmu->bhrb_filter_map(
>> +        u64 bhrb_filter = -1;
>> +
>> +        if (ppmu->bhrb_filter_map)
>> +            bhrb_filter = ppmu->bhrb_filter_map(
>>                       event->attr.branch_sample_type);
>>
>>           if (bhrb_filter == -1) {
> 

-- 
Alexey

^ permalink raw reply

* Re: [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
From: Nicholas Piggin @ 2020-12-02  3:09 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, X86 ML, LKML, Linux-MM,
	Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <CALCETrVXUbe8LfNn-Qs+DzrOQaiw+sFUg1J047yByV31SaTOZw@mail.gmail.com>

Excerpts from Andy Lutomirski's message of November 29, 2020 1:54 pm:
> On Sat, Nov 28, 2020 at 8:02 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>>
>> On big systems, the mm refcount can become highly contented when doing
>> a lot of context switching with threaded applications (particularly
>> switching between the idle thread and an application thread).
>>
>> Abandoning lazy tlb slows switching down quite a bit in the important
>> user->idle->user cases, so so instead implement a non-refcounted scheme
>> that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
>> any remaining lazy ones.
>>
>> Shootdown IPIs are some concern, but they have not been observed to be
>> a big problem with this scheme (the powerpc implementation generated
>> 314 additional interrupts on a 144 CPU system during a kernel compile).
>> There are a number of strategies that could be employed to reduce IPIs
>> if they turn out to be a problem for some workload.
> 
> I'm still wondering whether we can do even better.

We probably can, for some values of better / more complex. This came up 
last time I posted, there was a big concern about IPIs etc, but it just 
wasn't an issue at all even when I tried to coax them to happen a bit.

The thing is they are faily self-limiting, it's not actually all that 
frequent that you have an mm get taken for a lazy *and* move between 
CPUs. Perhaps more often with threaded apps, but in that case you're 
eating various IPI costs anyway (e.g., when moving the task to another
CPU, on TLB shootdowns, etc).

So from last time I did measure and I did document some possible 
improvements that could be made in comments, but I decided to keep it 
simple before adding complexity to it.

> 
> The IPIs you're doing aren't really necessary -- we don't
> fundamentally need to free the pagetables immediately when all
> non-lazy users are done with them (and current kernels don't) -- what
> we need to do is to synchronize all the bookkeeping.  So, with
> adequate locking (famous last words), a couple of alternative schemes
> ought to be possible.

It's not freeing the page tables, those are freed by this point already 
I think (at least on powerpc they are). It's releasing the lazy mm.

> 
> a) Instead of sending an IPI, increment mm_count on behalf of the
> remote CPU and do something to make sure that the remote CPU knows we
> did this on its behalf.  Then free the mm when mm_count hits zero.
> 
> b) Treat mm_cpumask as part of the refcount.  Add one to mm_count when
> an mm is created.  Once mm_users hits zero, whoever clears the last
> bit in mm_cpumask is responsible for decrementing a single reference
> from mm_count, and whoever sets it to zero frees the mm.

Right, these were some possible avenues to explore, thing is it's 
complexity and more synchronisation costs, and in the fast (context 
switch) path too. The IPI actually avoids all fast path work, atomic
or not.

> Version (b) seems fairly straightforward to implement -- add RCU
> protection and a atomic_t special_ref_cleared (initially 0) to struct
> mm_struct itself.  After anyone clears a bit to mm_cpumask (which is
> already a barrier), they read mm_users.  If it's zero, then they scan
> mm_cpumask and see if it's empty.  If it is, they atomically swap
> special_ref_cleared to 1.  If it was zero before the swap, they do
> mmdrop().  I can imagine some tweaks that could make this a big
> faster, at least in the limit of a huge number of CPUs.
> 
> Version (a) seems a bit harder to reason about.  Maybe it could be
> done like this.  Add a percpu variable mm_with_extra_count.  This
> variable can be NULL, but it can also be an mm that has an extra
> reference on behalf of the cpu in question.
> 
> __mmput scans mm_cpumask and, for each cpu in the mask, mmgrabs the mm
> and cmpxchgs that cpu's mm_with_extra_count from NULL to mm.  If it
> succeeds, then we win.  If it fails, further thought is required, and
> maybe we have to send an IPI, although maybe some other cleverness is
> possible.  Any time a CPU switches mms, it does atomic swaps
> mm_with_extra_count to NULL and mmdrops whatever the mm was.  (Maybe
> it needs to check the mm isn't equal to the new mm, although it would
> be quite bizarre for this to happen.)  Other than these mmgrab and
> mmdrop calls, the mm switching code doesn't mmgrab or mmdrop at all.
> 
> 
> Version (a) seems like it could have excellent performance.

That said, if x86 wanted to explore something like this, the code to do 
it is a bit modular (I don't think a proliferation of lazy refcounting 
config options is a good idea of course, but 2 versions one for powrepc
style set-and-forget mm_cpumask and one for x86 set-and-clear would
be okay.

> *However*, I think we should consider whether we want to do something
> even bigger first.  Even with any of these changes, we still need to
> maintain mm_cpumask(), and that itself can be a scalability problem.
> I wonder if we can solve this problem too.  Perhaps the switch_mm()
> paths could only ever set mm_cpumask bits,

Powerpc does this.

> and anyone who would send
> an IPI because a bit is set in mm_cpumask would first check some
> percpu variable (cpu_rq(cpu)->something? 

This is a suggested possible optimization to the IPI scheme (you would
check if it's active).

There's pros and cons to it. You get more IPIs and cross TLB shootdowns
and jitter cleaning up behind you rather than cleaning up as you go.

> an entirely new variable) to
> see if the bit in mm_cpumask is spurious.  Or perhaps mm_cpumask could
> be split up across multiple cachelines, one per node.

IIRC Peter or someone mentioned this was something that was looked at 
for x86.

> We should keep the recent lessons from Apple in mind, though: x86 is a
> dinosaur.

Wow. What is the recent lesson from Apple?? I'm completely out of the 
loop here.

> The future of atomics is going to look a lot more like
> ARM's LSE than x86's rather anemic set.  This means that mm_cpumask
> operations won't need to be full barriers forever, and we might not
> want to take the implied full barriers in set_bit() and clear_bit()
> for granted.

Sure, set_bit / clear_bit aren't full barriers in terms of Linux 
semantics, so generic code doesn't assume that. What x86 does is
add the smp_mb__after_blah or before_blah to avoid an added barrier
because of it's heavier-than-required set_bit.

I'm not quite sure what you were getting at though. The atomic itself is 
really quite a small cost of the exit() operation (or even a context 
switch operation) _most_ of the time (x86 CPUs seem to have very fast
atomics so might be even smaller cost for you). It's just when you 
happen to bounce a cache line, which hurts no matter what you do.

Thanks,
Nick



^ permalink raw reply

* Re: [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
From: Nicholas Piggin @ 2020-12-02  3:47 UTC (permalink / raw)
  To: Christian Borntraeger, Catalin Marinas, Dave Hansen,
	Vasily Gorbik, Heiko Carstens, Andy Lutomirski, Will Deacon
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, X86 ML, LKML, Linux-MM,
	Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <CALCETrXAR_9EGaOF8ymVkZycxgZkYk0dR+NjEpTfVzdcS3sOVw@mail.gmail.com>

Excerpts from Andy Lutomirski's message of December 1, 2020 4:31 am:
> other arch folk: there's some background here:
> 
> https://lkml.kernel.org/r/CALCETrVXUbe8LfNn-Qs+DzrOQaiw+sFUg1J047yByV31SaTOZw@mail.gmail.com
> 
> On Sun, Nov 29, 2020 at 12:16 PM Andy Lutomirski <luto@kernel.org> wrote:
>>
>> On Sat, Nov 28, 2020 at 7:54 PM Andy Lutomirski <luto@kernel.org> wrote:
>> >
>> > On Sat, Nov 28, 2020 at 8:02 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>> > >
>> > > On big systems, the mm refcount can become highly contented when doing
>> > > a lot of context switching with threaded applications (particularly
>> > > switching between the idle thread and an application thread).
>> > >
>> > > Abandoning lazy tlb slows switching down quite a bit in the important
>> > > user->idle->user cases, so so instead implement a non-refcounted scheme
>> > > that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
>> > > any remaining lazy ones.
>> > >
>> > > Shootdown IPIs are some concern, but they have not been observed to be
>> > > a big problem with this scheme (the powerpc implementation generated
>> > > 314 additional interrupts on a 144 CPU system during a kernel compile).
>> > > There are a number of strategies that could be employed to reduce IPIs
>> > > if they turn out to be a problem for some workload.
>> >
>> > I'm still wondering whether we can do even better.
>> >
>>
>> Hold on a sec.. __mmput() unmaps VMAs, frees pagetables, and flushes
>> the TLB.  On x86, this will shoot down all lazies as long as even a
>> single pagetable was freed.  (Or at least it will if we don't have a
>> serious bug, but the code seems okay.  We'll hit pmd_free_tlb, which
>> sets tlb->freed_tables, which will trigger the IPI.)  So, on
>> architectures like x86, the shootdown approach should be free.  The
>> only way it ought to have any excess IPIs is if we have CPUs in
>> mm_cpumask() that don't need IPI to free pagetables, which could
>> happen on paravirt.
> 
> Indeed, on x86, we do this:
> 
> [   11.558844]  flush_tlb_mm_range.cold+0x18/0x1d
> [   11.559905]  tlb_finish_mmu+0x10e/0x1a0
> [   11.561068]  exit_mmap+0xc8/0x1a0
> [   11.561932]  mmput+0x29/0xd0
> [   11.562688]  do_exit+0x316/0xa90
> [   11.563588]  do_group_exit+0x34/0xb0
> [   11.564476]  __x64_sys_exit_group+0xf/0x10
> [   11.565512]  do_syscall_64+0x34/0x50
> 
> and we have info->freed_tables set.
> 
> What are the architectures that have large systems like?
> 
> x86: we already zap lazies, so it should cost basically nothing to do

This is not zapping lazies, this is freeing the user page tables.

"lazy mm" is where a switch to a kernel thread takes on the
previous mm for its kernel mapping rather than switch to init_mm.

> a little loop at the end of __mmput() to make sure that no lazies are
> left.  If we care about paravirt performance, we could implement one
> of the optimizations I mentioned above to fix up the refcounts instead
> of sending an IPI to any remaining lazies.

It might be possible x86's scheme you could scan mm_cpumask
carefully synchronized or something when the last user reference
gets dropped that frees the lazy at that point, but I don't know
what that would buy you because you're still having to maintain
the mm_cpumask on switches. powerpc's characteristics are just
different here so it makes sense whereas I don't know if it
would on x86.

> 
> arm64: AFAICT arm64's flush uses magic arm64 hardware support for
> remote flushes, so any lazy mm references will still exist after
> exit_mmap().  (arm64 uses lazy TLB, right?)  So this is kind of like
> the x86 paravirt case.  Are there large enough arm64 systems that any
> of this matters?
> 
> s390x: The code has too many acronyms for me to understand it fully,
> but I think it's more or less the same situation as arm64.  How big do
> s390x systems come?
> 
> power: Ridiculously complicated, seems to vary by system and kernel config.
> 
> So, Nick, your unconditional IPI scheme is apparently a big
> improvement for power, and it should be an improvement and have low
> cost for x86.

As said, the tradeoffs are different, I'm not so sure. It was a big 
improvement on a very big system with the powerpc mm_cpumask switching
model on a microbenchmark designed to stress this, which is about all
I can say for it.

> On arm64 and s390x it will add more IPIs on process
> exit but reduce contention on context switching depending on how lazy
> TLB works.  I suppose we could try it for all architectures without
> any further optimizations.

It will remain opt-in but certainly try it out and see. There are some
requirements as documented in the config option text.

> Or we could try one of the perhaps
> excessively clever improvements I linked above.  arm64, s390x people,
> what do you think?
> 

I'm not against improvements to the scheme. e.g., from the patch

+               /*
+                * IPI overheads have not found to be expensive, but they could
+                * be reduced in a number of possible ways, for example (in
+                * roughly increasing order of complexity):
+                * - A batch of mms requiring IPIs could be gathered and freed
+                *   at once.
+                * - CPUs could store their active mm somewhere that can be
+                *   remotely checked without a lock, to filter out
+                *   false-positives in the cpumask.
+                * - After mm_users or mm_count reaches zero, switching away
+                *   from the mm could clear mm_cpumask to reduce some IPIs
+                *   (some batching or delaying would help).
+                * - A delayed freeing and RCU-like quiescing sequence based on
+                *   mm switching to avoid IPIs completely.
+                */

But would like to have numbers before being too clever.

Thanks,
Nick

^ permalink raw reply

* [PATCH v7 updated 21/22 ] powerpc/book3s64/kup: Check max key supported before enabling kup
From: Aneesh Kumar K.V @ 2020-12-02  4:38 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Aneesh Kumar K.V
In-Reply-To: <20201127044424.40686-22-aneesh.kumar@linux.ibm.com>

Don't enable KUEP/KUAP if we support less than or equal to 3 keys.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/kup.h   |  3 +++
 arch/powerpc/mm/book3s64/pkeys.c | 33 ++++++++++++++++++++------------
 arch/powerpc/mm/init-common.c    |  4 ++--
 3 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h
index 952be0414f43..f8ec679bd2de 100644
--- a/arch/powerpc/include/asm/kup.h
+++ b/arch/powerpc/include/asm/kup.h
@@ -44,6 +44,9 @@
 
 #else /* !__ASSEMBLY__ */
 
+extern bool disable_kuep;
+extern bool disable_kuap;
+
 #include <linux/pgtable.h>
 
 void setup_kup(void);
diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
index 4a3aeddbe0c7..2b7ded396db4 100644
--- a/arch/powerpc/mm/book3s64/pkeys.c
+++ b/arch/powerpc/mm/book3s64/pkeys.c
@@ -185,6 +185,27 @@ void __init pkey_early_init_devtree(void)
 		default_uamor &= ~(0x3ul << pkeyshift(execute_only_key));
 	}
 
+	if (unlikely(num_pkey <= 3)) {
+		/*
+		 * Insufficient number of keys to support
+		 * KUAP/KUEP feature.
+		 */
+		disable_kuep = true;
+		disable_kuap = true;
+		WARN(1, "Disabling kernel user protection due to low (%d) max supported keys\n", num_pkey);
+	} else {
+		/*  handle key which is used by kernel for KAUP */
+		reserved_allocation_mask |= (0x1 << 3);
+		/*
+		 * Mark access for kup_key in default amr so that
+		 * we continue to operate with that AMR in
+		 * copy_to/from_user().
+		 */
+		default_amr   &= ~(0x3ul << pkeyshift(3));
+		default_iamr  &= ~(0x1ul << pkeyshift(3));
+		default_uamor &= ~(0x3ul << pkeyshift(3));
+	}
+
 	/*
 	 * Allow access for only key 0. And prevent any other modification.
 	 */
@@ -205,18 +226,6 @@ void __init pkey_early_init_devtree(void)
 	reserved_allocation_mask |= (0x1 << 1);
 	default_uamor &= ~(0x3ul << pkeyshift(1));
 
-	/*  handle key which is used by kernel for KAUP */
-	reserved_allocation_mask |= (0x1 << 3);
-	/*
-	 * Mark access for KUAP key in default amr so that
-	 * we continue to operate with that AMR in
-	 * copy_to/from_user().
-	 */
-	default_amr   &= ~(0x3ul << pkeyshift(3));
-	default_iamr  &= ~(0x1ul << pkeyshift(3));
-	default_uamor &= ~(0x3ul << pkeyshift(3));
-
-
 	/*
 	 * Prevent the usage of OS reserved keys. Update UAMOR
 	 * for those keys. Also mark the rest of the bits in the
diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
index 8e0d792ac296..afdebb95bcae 100644
--- a/arch/powerpc/mm/init-common.c
+++ b/arch/powerpc/mm/init-common.c
@@ -28,8 +28,8 @@ EXPORT_SYMBOL_GPL(kernstart_addr);
 unsigned long kernstart_virt_addr __ro_after_init = KERNELBASE;
 EXPORT_SYMBOL_GPL(kernstart_virt_addr);
 
-static bool disable_kuep = !IS_ENABLED(CONFIG_PPC_KUEP);
-static bool disable_kuap = !IS_ENABLED(CONFIG_PPC_KUAP);
+bool disable_kuep = !IS_ENABLED(CONFIG_PPC_KUEP);
+bool disable_kuap = !IS_ENABLED(CONFIG_PPC_KUAP);
 
 static int __init parse_nosmep(char *p)
 {
-- 
2.28.0


^ permalink raw reply related

* [PATCH v2 0/4] Powerpc: Better preemption for shared processor
From: Srikar Dronamraju @ 2020-12-02  5:04 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Phil Auld, Srikar Dronamraju,
	Juri Lelli, Peter Zijlstra, LKML, Nicholas Piggin, Waiman Long,
	linuxppc-dev, Valentin Schneider

Currently, vcpu_is_preempted will return the yield_count for
shared_processor. On a PowerVM LPAR, Phyp schedules at SMT8 core boundary
i.e all CPUs belonging to a core are either group scheduled in or group
scheduled out. This can be used to better predict non-preempted CPUs on
PowerVM shared LPARs.

perf stat -r 5 -a perf bench sched pipe -l 10000000 (lesser time is better)

powerpc/next
     35,107,951.20 msec cpu-clock                 #  255.898 CPUs utilized            ( +-  0.31% )
        23,655,348      context-switches          #    0.674 K/sec                    ( +-  3.72% )
            14,465      cpu-migrations            #    0.000 K/sec                    ( +-  5.37% )
            82,463      page-faults               #    0.002 K/sec                    ( +-  8.40% )
 1,127,182,328,206      cycles                    #    0.032 GHz                      ( +-  1.60% )  (66.67%)
    78,587,300,622      stalled-cycles-frontend   #    6.97% frontend cycles idle     ( +-  0.08% )  (50.01%)
   654,124,218,432      stalled-cycles-backend    #   58.03% backend cycles idle      ( +-  1.74% )  (50.01%)
   834,013,059,242      instructions              #    0.74  insn per cycle
                                                  #    0.78  stalled cycles per insn  ( +-  0.73% )  (66.67%)
   132,911,454,387      branches                  #    3.786 M/sec                    ( +-  0.59% )  (50.00%)
     2,890,882,143      branch-misses             #    2.18% of all branches          ( +-  0.46% )  (50.00%)

           137.195 +- 0.419 seconds time elapsed  ( +-  0.31% )

powerpc/next + patchset
     29,981,702.64 msec cpu-clock                 #  255.881 CPUs utilized            ( +-  1.30% )
        40,162,456      context-switches          #    0.001 M/sec                    ( +-  0.01% )
             1,110      cpu-migrations            #    0.000 K/sec                    ( +-  5.20% )
            62,616      page-faults               #    0.002 K/sec                    ( +-  3.93% )
 1,430,030,626,037      cycles                    #    0.048 GHz                      ( +-  1.41% )  (66.67%)
    83,202,707,288      stalled-cycles-frontend   #    5.82% frontend cycles idle     ( +-  0.75% )  (50.01%)
   744,556,088,520      stalled-cycles-backend    #   52.07% backend cycles idle      ( +-  1.39% )  (50.01%)
   940,138,418,674      instructions              #    0.66  insn per cycle
                                                  #    0.79  stalled cycles per insn  ( +-  0.51% )  (66.67%)
   146,452,852,283      branches                  #    4.885 M/sec                    ( +-  0.80% )  (50.00%)
     3,237,743,996      branch-misses             #    2.21% of all branches          ( +-  1.18% )  (50.01%)

            117.17 +- 1.52 seconds time elapsed  ( +-  1.30% )

This is around 14.6% improvement in performance.

Changelog:
v1->v2:
v1: https://lore.kernel.org/linuxppc-dev/20201028123512.871051-1-srikar@linux.vnet.ibm.com/t/#u
 - Rebased to 27th Nov linuxppc/merge tree.
 - Moved a hunk to fix a no previous prototype warning reported by: lkp@intel.com
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org/thread/C6PTRPHWMC7VV4OTYN3ISYKDHTDQS6YI/

Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: LKML <linux-kernel@vger.kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Valentin Schneider <valentin.schneider@arm.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Waiman Long <longman@redhat.com>
Cc: Phil Auld <pauld@redhat.com>

Srikar Dronamraju (4):
  powerpc: Refactor is_kvm_guest declaration to new header
  powerpc: Rename is_kvm_guest to check_kvm_guest
  powerpc: Reintroduce is_kvm_guest
  powerpc/paravirt: Use is_kvm_guest in vcpu_is_preempted

 arch/powerpc/include/asm/firmware.h  |  6 ------
 arch/powerpc/include/asm/kvm_guest.h | 25 +++++++++++++++++++++++++
 arch/powerpc/include/asm/kvm_para.h  |  2 +-
 arch/powerpc/include/asm/paravirt.h  | 18 ++++++++++++++++++
 arch/powerpc/kernel/firmware.c       |  5 ++++-
 arch/powerpc/platforms/pseries/smp.c |  3 ++-
 6 files changed, 50 insertions(+), 9 deletions(-)
 create mode 100644 arch/powerpc/include/asm/kvm_guest.h

-- 
2.18.4


^ permalink raw reply

* [PATCH v2 4/4] powerpc/paravirt: Use is_kvm_guest in vcpu_is_preempted
From: Srikar Dronamraju @ 2020-12-02  5:04 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Phil Auld, Srikar Dronamraju,
	Juri Lelli, Peter Zijlstra, LKML, Nicholas Piggin, Waiman Long,
	linuxppc-dev, Valentin Schneider
In-Reply-To: <20201202050456.164005-1-srikar@linux.vnet.ibm.com>

If its a shared lpar but not a KVM guest, then see if the vCPU is
related to the calling vCPU. On PowerVM, only cores can be preempted.
So if one vCPU is a non-preempted state, we can decipher that all other
vCPUs sharing the same core are in non-preempted state.

Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: LKML <linux-kernel@vger.kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Valentin Schneider <valentin.schneider@arm.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Waiman Long <longman@redhat.com>
Cc: Phil Auld <pauld@redhat.com>
Acked-by: Waiman Long <longman@redhat.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/paravirt.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h
index 9362c94fe3aa..edc08f04aef7 100644
--- a/arch/powerpc/include/asm/paravirt.h
+++ b/arch/powerpc/include/asm/paravirt.h
@@ -10,6 +10,9 @@
 #endif
 
 #ifdef CONFIG_PPC_SPLPAR
+#include <asm/kvm_guest.h>
+#include <asm/cputhreads.h>
+
 DECLARE_STATIC_KEY_FALSE(shared_processor);
 
 static inline bool is_shared_processor(void)
@@ -74,6 +77,21 @@ static inline bool vcpu_is_preempted(int cpu)
 {
 	if (!is_shared_processor())
 		return false;
+
+#ifdef CONFIG_PPC_SPLPAR
+	if (!is_kvm_guest()) {
+		int first_cpu = cpu_first_thread_sibling(smp_processor_id());
+
+		/*
+		 * Preemption can only happen at core granularity. This CPU
+		 * is not preempted if one of the CPU of this core is not
+		 * preempted.
+		 */
+		if (cpu_first_thread_sibling(cpu) == first_cpu)
+			return false;
+	}
+#endif
+
 	if (yield_count_of(cpu) & 1)
 		return true;
 	return false;
-- 
2.18.4


^ permalink raw reply related

* [PATCH v2 2/4] powerpc: Rename is_kvm_guest to check_kvm_guest
From: Srikar Dronamraju @ 2020-12-02  5:04 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Phil Auld, Srikar Dronamraju,
	Juri Lelli, Peter Zijlstra, LKML, Nicholas Piggin, Waiman Long,
	linuxppc-dev, Valentin Schneider
In-Reply-To: <20201202050456.164005-1-srikar@linux.vnet.ibm.com>

is_kvm_guest() will be reused in subsequent patch in a new avatar.  Hence
rename is_kvm_guest to check_kvm_guest. No additional changes.

Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: LKML <linux-kernel@vger.kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Valentin Schneider <valentin.schneider@arm.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Waiman Long <longman@redhat.com>
Cc: Phil Auld <pauld@redhat.com>
Acked-by: Waiman Long <longman@redhat.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/kvm_guest.h | 4 ++--
 arch/powerpc/include/asm/kvm_para.h  | 2 +-
 arch/powerpc/kernel/firmware.c       | 2 +-
 arch/powerpc/platforms/pseries/smp.c | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_guest.h b/arch/powerpc/include/asm/kvm_guest.h
index c0ace884a0e8..ba8291e02ba9 100644
--- a/arch/powerpc/include/asm/kvm_guest.h
+++ b/arch/powerpc/include/asm/kvm_guest.h
@@ -7,9 +7,9 @@
 #define __POWERPC_KVM_GUEST_H__
 
 #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_KVM_GUEST)
-bool is_kvm_guest(void);
+bool check_kvm_guest(void);
 #else
-static inline bool is_kvm_guest(void) { return false; }
+static inline bool check_kvm_guest(void) { return false; }
 #endif
 
 #endif /* __POWERPC_KVM_GUEST_H__ */
diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index abe1b5e82547..6fba06b6cfdb 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -14,7 +14,7 @@
 
 static inline int kvm_para_available(void)
 {
-	return IS_ENABLED(CONFIG_KVM_GUEST) && is_kvm_guest();
+	return IS_ENABLED(CONFIG_KVM_GUEST) && check_kvm_guest();
 }
 
 static inline unsigned int kvm_arch_para_features(void)
diff --git a/arch/powerpc/kernel/firmware.c b/arch/powerpc/kernel/firmware.c
index 5f48e5ad24cd..0aeb6a5b1a9e 100644
--- a/arch/powerpc/kernel/firmware.c
+++ b/arch/powerpc/kernel/firmware.c
@@ -22,7 +22,7 @@ EXPORT_SYMBOL_GPL(powerpc_firmware_features);
 #endif
 
 #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_KVM_GUEST)
-bool is_kvm_guest(void)
+bool check_kvm_guest(void)
 {
 	struct device_node *hyper_node;
 
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
index d578732c545d..c70b4be9f0a5 100644
--- a/arch/powerpc/platforms/pseries/smp.c
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -211,7 +211,7 @@ static __init void pSeries_smp_probe(void)
 	if (!cpu_has_feature(CPU_FTR_SMT))
 		return;
 
-	if (is_kvm_guest()) {
+	if (check_kvm_guest()) {
 		/*
 		 * KVM emulates doorbells by disabling FSCR[MSGP] so msgsndp
 		 * faults to the hypervisor which then reads the instruction
-- 
2.18.4


^ permalink raw reply related

* [PATCH v2 1/4] powerpc: Refactor is_kvm_guest declaration to new header
From: Srikar Dronamraju @ 2020-12-02  5:04 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Phil Auld, Srikar Dronamraju,
	Juri Lelli, Peter Zijlstra, LKML, Nicholas Piggin, Waiman Long,
	linuxppc-dev, Valentin Schneider
In-Reply-To: <20201202050456.164005-1-srikar@linux.vnet.ibm.com>

Only code/declaration movement, in anticipation of doing a kvm-aware
vcpu_is_preempted. No additional changes.

Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: LKML <linux-kernel@vger.kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Valentin Schneider <valentin.schneider@arm.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Waiman Long <longman@redhat.com>
Cc: Phil Auld <pauld@redhat.com>
Acked-by: Waiman Long <longman@redhat.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
Changelog:
v1->v2:
v1: https://lore.kernel.org/linuxppc-dev/20201028123512.871051-1-srikar@linux.vnet.ibm.com/t/#u
 - Moved a hunk to fix a no previous prototype warning reported by: lkp@intel.com
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org/thread/C6PTRPHWMC7VV4OTYN3ISYKDHTDQS6YI/

 arch/powerpc/include/asm/firmware.h  |  6 ------
 arch/powerpc/include/asm/kvm_guest.h | 15 +++++++++++++++
 arch/powerpc/include/asm/kvm_para.h  |  2 +-
 arch/powerpc/kernel/firmware.c       |  1 +
 arch/powerpc/platforms/pseries/smp.c |  1 +
 5 files changed, 18 insertions(+), 7 deletions(-)
 create mode 100644 arch/powerpc/include/asm/kvm_guest.h

diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h
index 0b295bdb201e..aa6a5ef5d483 100644
--- a/arch/powerpc/include/asm/firmware.h
+++ b/arch/powerpc/include/asm/firmware.h
@@ -134,12 +134,6 @@ extern int ibm_nmi_interlock_token;
 
 extern unsigned int __start___fw_ftr_fixup, __stop___fw_ftr_fixup;
 
-#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_KVM_GUEST)
-bool is_kvm_guest(void);
-#else
-static inline bool is_kvm_guest(void) { return false; }
-#endif
-
 #ifdef CONFIG_PPC_PSERIES
 void pseries_probe_fw_features(void);
 #else
diff --git a/arch/powerpc/include/asm/kvm_guest.h b/arch/powerpc/include/asm/kvm_guest.h
new file mode 100644
index 000000000000..c0ace884a0e8
--- /dev/null
+++ b/arch/powerpc/include/asm/kvm_guest.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020  IBM Corporation
+ */
+
+#ifndef __POWERPC_KVM_GUEST_H__
+#define __POWERPC_KVM_GUEST_H__
+
+#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_KVM_GUEST)
+bool is_kvm_guest(void);
+#else
+static inline bool is_kvm_guest(void) { return false; }
+#endif
+
+#endif /* __POWERPC_KVM_GUEST_H__ */
diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index 744612054c94..abe1b5e82547 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -8,7 +8,7 @@
 #ifndef __POWERPC_KVM_PARA_H__
 #define __POWERPC_KVM_PARA_H__
 
-#include <asm/firmware.h>
+#include <asm/kvm_guest.h>
 
 #include <uapi/asm/kvm_para.h>
 
diff --git a/arch/powerpc/kernel/firmware.c b/arch/powerpc/kernel/firmware.c
index fe48d319d490..5f48e5ad24cd 100644
--- a/arch/powerpc/kernel/firmware.c
+++ b/arch/powerpc/kernel/firmware.c
@@ -14,6 +14,7 @@
 #include <linux/of.h>
 
 #include <asm/firmware.h>
+#include <asm/kvm_guest.h>
 
 #ifdef CONFIG_PPC64
 unsigned long powerpc_firmware_features __read_mostly;
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
index 92922491a81c..d578732c545d 100644
--- a/arch/powerpc/platforms/pseries/smp.c
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -42,6 +42,7 @@
 #include <asm/plpar_wrappers.h>
 #include <asm/code-patching.h>
 #include <asm/svm.h>
+#include <asm/kvm_guest.h>
 
 #include "pseries.h"
 
-- 
2.18.4


^ permalink raw reply related

* [PATCH v2 3/4] powerpc: Reintroduce is_kvm_guest in a new avatar
From: Srikar Dronamraju @ 2020-12-02  5:04 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Phil Auld, Srikar Dronamraju,
	Juri Lelli, Peter Zijlstra, LKML, Nicholas Piggin, Waiman Long,
	linuxppc-dev, Valentin Schneider
In-Reply-To: <20201202050456.164005-1-srikar@linux.vnet.ibm.com>

Introduce a static branch that would be set during boot if the OS
happens to be a KVM guest. Subsequent checks to see if we are on KVM
will rely on this static branch. This static branch would be used in
vcpu_is_preempted in a subsequent patch.

Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: LKML <linux-kernel@vger.kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Valentin Schneider <valentin.schneider@arm.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Waiman Long <longman@redhat.com>
Cc: Phil Auld <pauld@redhat.com>
Acked-by: Waiman Long <longman@redhat.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/kvm_guest.h | 10 ++++++++++
 arch/powerpc/include/asm/kvm_para.h  |  2 +-
 arch/powerpc/kernel/firmware.c       |  2 ++
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/kvm_guest.h b/arch/powerpc/include/asm/kvm_guest.h
index ba8291e02ba9..627ba272e781 100644
--- a/arch/powerpc/include/asm/kvm_guest.h
+++ b/arch/powerpc/include/asm/kvm_guest.h
@@ -7,8 +7,18 @@
 #define __POWERPC_KVM_GUEST_H__
 
 #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_KVM_GUEST)
+#include <linux/jump_label.h>
+
+DECLARE_STATIC_KEY_FALSE(kvm_guest);
+
+static inline bool is_kvm_guest(void)
+{
+	return static_branch_unlikely(&kvm_guest);
+}
+
 bool check_kvm_guest(void);
 #else
+static inline bool is_kvm_guest(void) { return false; }
 static inline bool check_kvm_guest(void) { return false; }
 #endif
 
diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index 6fba06b6cfdb..abe1b5e82547 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -14,7 +14,7 @@
 
 static inline int kvm_para_available(void)
 {
-	return IS_ENABLED(CONFIG_KVM_GUEST) && check_kvm_guest();
+	return IS_ENABLED(CONFIG_KVM_GUEST) && is_kvm_guest();
 }
 
 static inline unsigned int kvm_arch_para_features(void)
diff --git a/arch/powerpc/kernel/firmware.c b/arch/powerpc/kernel/firmware.c
index 0aeb6a5b1a9e..28498fc573f2 100644
--- a/arch/powerpc/kernel/firmware.c
+++ b/arch/powerpc/kernel/firmware.c
@@ -22,6 +22,7 @@ EXPORT_SYMBOL_GPL(powerpc_firmware_features);
 #endif
 
 #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_KVM_GUEST)
+DEFINE_STATIC_KEY_FALSE(kvm_guest);
 bool check_kvm_guest(void)
 {
 	struct device_node *hyper_node;
@@ -33,6 +34,7 @@ bool check_kvm_guest(void)
 	if (!of_device_is_compatible(hyper_node, "linux,kvm"))
 		return 0;
 
+	static_branch_enable(&kvm_guest);
 	return 1;
 }
 #endif
-- 
2.18.4


^ permalink raw reply related

* [powerpc:fixes-test] BUILD SUCCESS f54db39fbe40731c40aefdd3bc26e7d56d668c64
From: kernel test robot @ 2020-12-02  5:40 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git  fixes-test
branch HEAD: f54db39fbe40731c40aefdd3bc26e7d56d668c64  KVM: PPC: Book3S HV: XIVE: Fix vCPU id sanity check

elapsed time: 724m

configs tested: 157
configs skipped: 2

The following configs have been built successfully.
More configs may be tested in the coming days.

gcc tested configs:
arm                                 defconfig
arm64                            allyesconfig
arm64                               defconfig
arm                              allyesconfig
arm                              allmodconfig
arm                        clps711x_defconfig
sh                        edosk7760_defconfig
powerpc                        cell_defconfig
mips                          ath79_defconfig
openrisc                         alldefconfig
sh                               j2_defconfig
mips                       capcella_defconfig
arm                           viper_defconfig
c6x                        evmc6474_defconfig
arm                          pxa3xx_defconfig
c6x                        evmc6457_defconfig
m68k                            q40_defconfig
arc                        nsim_700_defconfig
riscv                    nommu_k210_defconfig
arc                         haps_hs_defconfig
riscv                    nommu_virt_defconfig
mips                         tb0226_defconfig
arm                            dove_defconfig
m68k                        m5272c3_defconfig
sh                           se7705_defconfig
m68k                         apollo_defconfig
powerpc               mpc834x_itxgp_defconfig
arm                        realview_defconfig
microblaze                          defconfig
mips                malta_kvm_guest_defconfig
sh                ecovec24-romimage_defconfig
powerpc                         wii_defconfig
arm                      integrator_defconfig
s390                       zfcpdump_defconfig
xtensa                generic_kc705_defconfig
powerpc                    klondike_defconfig
mips                     loongson1c_defconfig
arc                        nsimosci_defconfig
ia64                                defconfig
mips                           gcw0_defconfig
xtensa                         virt_defconfig
c6x                        evmc6678_defconfig
sh                             shx3_defconfig
mips                  maltasmvp_eva_defconfig
powerpc                     tqm5200_defconfig
arc                 nsimosci_hs_smp_defconfig
s390                                defconfig
sh                         ap325rxa_defconfig
m68k                       m5475evb_defconfig
c6x                                 defconfig
powerpc                     ep8248e_defconfig
arm                          pcm027_defconfig
mips                           ip22_defconfig
sh                        dreamcast_defconfig
arm                            mps2_defconfig
sparc                               defconfig
ia64                        generic_defconfig
arm                         s3c6400_defconfig
powerpc                     rainier_defconfig
powerpc                     taishan_defconfig
powerpc                       eiger_defconfig
powerpc                        fsp2_defconfig
powerpc                      ppc40x_defconfig
h8300                               defconfig
powerpc                      ppc44x_defconfig
arm                              alldefconfig
arm                          moxart_defconfig
powerpc                    amigaone_defconfig
mips                        maltaup_defconfig
arc                              alldefconfig
microblaze                      mmu_defconfig
parisc                           alldefconfig
arm                           h3600_defconfig
mips                           jazz_defconfig
arm                     davinci_all_defconfig
powerpc                 mpc834x_itx_defconfig
powerpc                    mvme5100_defconfig
sh                        apsh4ad0a_defconfig
sh                  sh7785lcr_32bit_defconfig
arm                          imote2_defconfig
mips                      fuloong2e_defconfig
sparc                       sparc64_defconfig
arm                        vexpress_defconfig
powerpc                      pasemi_defconfig
powerpc                  storcenter_defconfig
parisc                generic-32bit_defconfig
mips                    maltaup_xpa_defconfig
mips                            e55_defconfig
mips                           xway_defconfig
xtensa                  audio_kc705_defconfig
powerpc                 canyonlands_defconfig
sh                            hp6xx_defconfig
powerpc                     skiroot_defconfig
ia64                             allmodconfig
ia64                             allyesconfig
m68k                             allmodconfig
m68k                                defconfig
m68k                             allyesconfig
nios2                               defconfig
arc                              allyesconfig
nds32                             allnoconfig
c6x                              allyesconfig
nds32                               defconfig
nios2                            allyesconfig
csky                                defconfig
alpha                               defconfig
alpha                            allyesconfig
xtensa                           allyesconfig
h8300                            allyesconfig
arc                                 defconfig
sh                               allmodconfig
parisc                              defconfig
s390                             allyesconfig
parisc                           allyesconfig
i386                             allyesconfig
sparc                            allyesconfig
i386                                defconfig
mips                             allyesconfig
mips                             allmodconfig
powerpc                          allyesconfig
powerpc                          allmodconfig
powerpc                           allnoconfig
i386                 randconfig-a004-20201201
i386                 randconfig-a005-20201201
i386                 randconfig-a001-20201201
i386                 randconfig-a002-20201201
i386                 randconfig-a006-20201201
i386                 randconfig-a003-20201201
x86_64               randconfig-a016-20201201
x86_64               randconfig-a012-20201201
x86_64               randconfig-a014-20201201
x86_64               randconfig-a013-20201201
x86_64               randconfig-a015-20201201
x86_64               randconfig-a011-20201201
i386                 randconfig-a014-20201201
i386                 randconfig-a013-20201201
i386                 randconfig-a011-20201201
i386                 randconfig-a015-20201201
i386                 randconfig-a012-20201201
i386                 randconfig-a016-20201201
riscv                            allyesconfig
riscv                             allnoconfig
riscv                               defconfig
riscv                          rv32_defconfig
riscv                            allmodconfig
x86_64                                   rhel
x86_64                           allyesconfig
x86_64                    rhel-7.6-kselftests
x86_64                              defconfig
x86_64                               rhel-8.3
x86_64                                  kexec

clang tested configs:
x86_64               randconfig-a004-20201201
x86_64               randconfig-a006-20201201
x86_64               randconfig-a001-20201201
x86_64               randconfig-a002-20201201
x86_64               randconfig-a005-20201201
x86_64               randconfig-a003-20201201

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply

* [powerpc:next-test] BUILD SUCCESS 72e886545963b33dd5e1d92ee9c77dadb51adc4e
From: kernel test robot @ 2020-12-02  5:40 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git  next-test
branch HEAD: 72e886545963b33dd5e1d92ee9c77dadb51adc4e  powerpc/pseries: Define PCI bus speed for Gen4 and Gen5

elapsed time: 724m

configs tested: 147
configs skipped: 2

The following configs have been built successfully.
More configs may be tested in the coming days.

gcc tested configs:
arm                                 defconfig
arm64                            allyesconfig
arm64                               defconfig
arm                              allyesconfig
arm                              allmodconfig
arm                        clps711x_defconfig
sh                        edosk7760_defconfig
powerpc                        cell_defconfig
mips                          ath79_defconfig
openrisc                         alldefconfig
sh                               j2_defconfig
mips                       capcella_defconfig
arm                           viper_defconfig
c6x                        evmc6474_defconfig
arm                          pxa3xx_defconfig
c6x                        evmc6457_defconfig
m68k                            q40_defconfig
arc                        nsim_700_defconfig
arc                         haps_hs_defconfig
riscv                    nommu_virt_defconfig
mips                         tb0226_defconfig
arm                            dove_defconfig
m68k                        m5272c3_defconfig
mips                malta_kvm_guest_defconfig
sh                ecovec24-romimage_defconfig
powerpc                         wii_defconfig
arm                      integrator_defconfig
s390                       zfcpdump_defconfig
xtensa                generic_kc705_defconfig
ia64                                defconfig
powerpc                    klondike_defconfig
mips                     loongson1c_defconfig
arc                        nsimosci_defconfig
mips                           gcw0_defconfig
xtensa                         virt_defconfig
c6x                        evmc6678_defconfig
sh                             shx3_defconfig
mips                  maltasmvp_eva_defconfig
powerpc                     tqm5200_defconfig
arc                 nsimosci_hs_smp_defconfig
s390                                defconfig
sh                         ap325rxa_defconfig
m68k                       m5475evb_defconfig
c6x                                 defconfig
powerpc                     ep8248e_defconfig
arm                          pcm027_defconfig
mips                           ip22_defconfig
ia64                        generic_defconfig
sh                        dreamcast_defconfig
arm                            mps2_defconfig
arm                         s3c6400_defconfig
powerpc                     rainier_defconfig
powerpc                     taishan_defconfig
powerpc                       eiger_defconfig
powerpc                        fsp2_defconfig
powerpc                      ppc40x_defconfig
h8300                               defconfig
powerpc                      ppc44x_defconfig
arm                              alldefconfig
arm                          moxart_defconfig
powerpc                    amigaone_defconfig
mips                        maltaup_defconfig
arc                              alldefconfig
microblaze                      mmu_defconfig
parisc                           alldefconfig
arm                           h3600_defconfig
mips                           jazz_defconfig
arm                     davinci_all_defconfig
powerpc                 mpc834x_itx_defconfig
powerpc                    mvme5100_defconfig
sh                        apsh4ad0a_defconfig
sh                  sh7785lcr_32bit_defconfig
arm                          imote2_defconfig
mips                      fuloong2e_defconfig
sparc                       sparc64_defconfig
arm                        vexpress_defconfig
mips                           xway_defconfig
xtensa                  audio_kc705_defconfig
powerpc                 canyonlands_defconfig
sh                            hp6xx_defconfig
powerpc                     skiroot_defconfig
ia64                             allmodconfig
ia64                             allyesconfig
m68k                             allmodconfig
m68k                                defconfig
m68k                             allyesconfig
nds32                               defconfig
nios2                            allyesconfig
csky                                defconfig
alpha                               defconfig
alpha                            allyesconfig
nios2                               defconfig
arc                              allyesconfig
nds32                             allnoconfig
c6x                              allyesconfig
xtensa                           allyesconfig
h8300                            allyesconfig
arc                                 defconfig
sh                               allmodconfig
parisc                              defconfig
s390                             allyesconfig
parisc                           allyesconfig
i386                             allyesconfig
sparc                            allyesconfig
sparc                               defconfig
i386                                defconfig
mips                             allyesconfig
mips                             allmodconfig
powerpc                          allyesconfig
powerpc                          allmodconfig
powerpc                           allnoconfig
i386                 randconfig-a004-20201201
i386                 randconfig-a005-20201201
i386                 randconfig-a001-20201201
i386                 randconfig-a002-20201201
i386                 randconfig-a006-20201201
i386                 randconfig-a003-20201201
x86_64               randconfig-a016-20201201
x86_64               randconfig-a012-20201201
x86_64               randconfig-a014-20201201
x86_64               randconfig-a013-20201201
x86_64               randconfig-a015-20201201
x86_64               randconfig-a011-20201201
i386                 randconfig-a014-20201201
i386                 randconfig-a013-20201201
i386                 randconfig-a011-20201201
i386                 randconfig-a015-20201201
i386                 randconfig-a012-20201201
i386                 randconfig-a016-20201201
riscv                    nommu_k210_defconfig
riscv                            allyesconfig
riscv                             allnoconfig
riscv                               defconfig
riscv                          rv32_defconfig
riscv                            allmodconfig
x86_64                                   rhel
x86_64                           allyesconfig
x86_64                    rhel-7.6-kselftests
x86_64                              defconfig
x86_64                               rhel-8.3
x86_64                                  kexec

clang tested configs:
x86_64               randconfig-a004-20201201
x86_64               randconfig-a006-20201201
x86_64               randconfig-a001-20201201
x86_64               randconfig-a002-20201201
x86_64               randconfig-a005-20201201
x86_64               randconfig-a003-20201201

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox