LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 3/6] ibmvfc: add new fields for version 2 of several MADs
From: Tyrel Datwyler @ 2020-11-12  1:04 UTC (permalink / raw)
  To: james.bottomley
  Cc: Tyrel Datwyler, martin.petersen, linux-scsi, linux-kernel, brking,
	linuxppc-dev
In-Reply-To: <20201112010442.102589-1-tyreld@linux.ibm.com>

Introduce a targetWWPN field to several MADs. Its possible that a scsi
ID of a target can change due to some fabric changes. The WWPN of the
scsi target provides a better way to identify the target. Also, add
flags for receiving MAD versioning information and advertising client
support for targetWWPN with the VIOS. This latter capability flag will
be required for future clients capable of requesting multiple hardware
queues from the host adapter.

Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
---
 drivers/scsi/ibmvscsi/ibmvfc.c | 58 ++++++++++++++++++----------------
 drivers/scsi/ibmvscsi/ibmvfc.h | 28 +++++++++++++---
 2 files changed, 55 insertions(+), 31 deletions(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index 7b25789dba9a..aa3445bec42c 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -149,6 +149,7 @@ static void ibmvfc_trc_start(struct ibmvfc_event *evt)
 	struct ibmvfc_host *vhost = evt->vhost;
 	struct ibmvfc_cmd *vfc_cmd = &evt->iu.cmd;
 	struct ibmvfc_mad_common *mad = &evt->iu.mad_common;
+	struct ibmvfc_fcp_cmd_iu *iu = &vfc_cmd->v1.iu;
 	struct ibmvfc_trace_entry *entry;
 
 	entry = &vhost->trace[vhost->trace_index++];
@@ -159,11 +160,11 @@ static void ibmvfc_trc_start(struct ibmvfc_event *evt)
 
 	switch (entry->fmt) {
 	case IBMVFC_CMD_FORMAT:
-		entry->op_code = vfc_cmd->iu.cdb[0];
+		entry->op_code = iu->cdb[0];
 		entry->scsi_id = be64_to_cpu(vfc_cmd->tgt_scsi_id);
-		entry->lun = scsilun_to_int(&vfc_cmd->iu.lun);
-		entry->tmf_flags = vfc_cmd->iu.tmf_flags;
-		entry->u.start.xfer_len = be32_to_cpu(vfc_cmd->iu.xfer_len);
+		entry->lun = scsilun_to_int(&iu->lun);
+		entry->tmf_flags = iu->tmf_flags;
+		entry->u.start.xfer_len = be32_to_cpu(iu->xfer_len);
 		break;
 	case IBMVFC_MAD_FORMAT:
 		entry->op_code = be32_to_cpu(mad->opcode);
@@ -183,6 +184,8 @@ static void ibmvfc_trc_end(struct ibmvfc_event *evt)
 	struct ibmvfc_host *vhost = evt->vhost;
 	struct ibmvfc_cmd *vfc_cmd = &evt->xfer_iu->cmd;
 	struct ibmvfc_mad_common *mad = &evt->xfer_iu->mad_common;
+	struct ibmvfc_fcp_cmd_iu *iu = &vfc_cmd->v1.iu;
+	struct ibmvfc_fcp_rsp *rsp = &vfc_cmd->v1.rsp;
 	struct ibmvfc_trace_entry *entry = &vhost->trace[vhost->trace_index++];
 
 	entry->evt = evt;
@@ -192,15 +195,15 @@ static void ibmvfc_trc_end(struct ibmvfc_event *evt)
 
 	switch (entry->fmt) {
 	case IBMVFC_CMD_FORMAT:
-		entry->op_code = vfc_cmd->iu.cdb[0];
+		entry->op_code = iu->cdb[0];
 		entry->scsi_id = be64_to_cpu(vfc_cmd->tgt_scsi_id);
-		entry->lun = scsilun_to_int(&vfc_cmd->iu.lun);
-		entry->tmf_flags = vfc_cmd->iu.tmf_flags;
+		entry->lun = scsilun_to_int(&iu->lun);
+		entry->tmf_flags = iu->tmf_flags;
 		entry->u.end.status = be16_to_cpu(vfc_cmd->status);
 		entry->u.end.error = be16_to_cpu(vfc_cmd->error);
-		entry->u.end.fcp_rsp_flags = vfc_cmd->rsp.flags;
-		entry->u.end.rsp_code = vfc_cmd->rsp.data.info.rsp_code;
-		entry->u.end.scsi_status = vfc_cmd->rsp.scsi_status;
+		entry->u.end.fcp_rsp_flags = rsp->flags;
+		entry->u.end.rsp_code = rsp->data.info.rsp_code;
+		entry->u.end.scsi_status = rsp->scsi_status;
 		break;
 	case IBMVFC_MAD_FORMAT:
 		entry->op_code = be32_to_cpu(mad->opcode);
@@ -263,7 +266,7 @@ static const char *ibmvfc_get_cmd_error(u16 status, u16 error)
 static int ibmvfc_get_err_result(struct ibmvfc_cmd *vfc_cmd)
 {
 	int err;
-	struct ibmvfc_fcp_rsp *rsp = &vfc_cmd->rsp;
+	struct ibmvfc_fcp_rsp *rsp = &vfc_cmd->v1.rsp;
 	int fc_rsp_len = be32_to_cpu(rsp->fcp_rsp_len);
 
 	if ((rsp->flags & FCP_RSP_LEN_VALID) &&
@@ -1378,6 +1381,7 @@ static int ibmvfc_map_sg_data(struct scsi_cmnd *scmd,
 	int sg_mapped;
 	struct srp_direct_buf *data = &vfc_cmd->ioba;
 	struct ibmvfc_host *vhost = dev_get_drvdata(dev);
+	struct ibmvfc_fcp_cmd_iu *iu = &vfc_cmd->v1.iu;
 
 	if (cls3_error)
 		vfc_cmd->flags |= cpu_to_be16(IBMVFC_CLASS_3_ERR);
@@ -1394,10 +1398,10 @@ static int ibmvfc_map_sg_data(struct scsi_cmnd *scmd,
 
 	if (scmd->sc_data_direction == DMA_TO_DEVICE) {
 		vfc_cmd->flags |= cpu_to_be16(IBMVFC_WRITE);
-		vfc_cmd->iu.add_cdb_len |= IBMVFC_WRDATA;
+		iu->add_cdb_len |= IBMVFC_WRDATA;
 	} else {
 		vfc_cmd->flags |= cpu_to_be16(IBMVFC_READ);
-		vfc_cmd->iu.add_cdb_len |= IBMVFC_RDDATA;
+		iu->add_cdb_len |= IBMVFC_RDDATA;
 	}
 
 	if (sg_mapped == 1) {
@@ -1516,7 +1520,7 @@ static void ibmvfc_log_error(struct ibmvfc_event *evt)
 {
 	struct ibmvfc_cmd *vfc_cmd = &evt->xfer_iu->cmd;
 	struct ibmvfc_host *vhost = evt->vhost;
-	struct ibmvfc_fcp_rsp *rsp = &vfc_cmd->rsp;
+	struct ibmvfc_fcp_rsp *rsp = &vfc_cmd->v1.rsp;
 	struct scsi_cmnd *cmnd = evt->cmnd;
 	const char *err = unknown_error;
 	int index = ibmvfc_get_err_index(be16_to_cpu(vfc_cmd->status), be16_to_cpu(vfc_cmd->error));
@@ -1570,7 +1574,7 @@ static void ibmvfc_relogin(struct scsi_device *sdev)
 static void ibmvfc_scsi_done(struct ibmvfc_event *evt)
 {
 	struct ibmvfc_cmd *vfc_cmd = &evt->xfer_iu->cmd;
-	struct ibmvfc_fcp_rsp *rsp = &vfc_cmd->rsp;
+	struct ibmvfc_fcp_rsp *rsp = &vfc_cmd->v1.rsp;
 	struct scsi_cmnd *cmnd = evt->cmnd;
 	u32 rsp_len = 0;
 	u32 sense_len = be32_to_cpu(rsp->fcp_sense_len);
@@ -1652,14 +1656,14 @@ static struct ibmvfc_cmd *ibmvfc_init_vfc_cmd(struct ibmvfc_event *evt, struct s
 	struct ibmvfc_cmd *vfc_cmd = &evt->iu.cmd;
 
 	memset(vfc_cmd, 0, sizeof(*vfc_cmd));
-	vfc_cmd->resp.va = cpu_to_be64(be64_to_cpu(evt->crq.ioba) + offsetof(struct ibmvfc_cmd, rsp));
-	vfc_cmd->resp.len = cpu_to_be32(sizeof(vfc_cmd->rsp));
+	vfc_cmd->resp.va = cpu_to_be64(be64_to_cpu(evt->crq.ioba) + offsetof(struct ibmvfc_cmd, v1.rsp));
+	vfc_cmd->resp.len = cpu_to_be32(sizeof(vfc_cmd->v1.rsp));
 	vfc_cmd->frame_type = cpu_to_be32(IBMVFC_SCSI_FCP_TYPE);
-	vfc_cmd->payload_len = cpu_to_be32(sizeof(vfc_cmd->iu));
-	vfc_cmd->resp_len = cpu_to_be32(sizeof(vfc_cmd->rsp));
+	vfc_cmd->payload_len = cpu_to_be32(sizeof(vfc_cmd->v1.iu));
+	vfc_cmd->resp_len = cpu_to_be32(sizeof(vfc_cmd->v1.rsp));
 	vfc_cmd->cancel_key = cpu_to_be32((unsigned long)sdev->hostdata);
 	vfc_cmd->tgt_scsi_id = cpu_to_be64(rport->port_id);
-	int_to_scsilun(sdev->lun, &vfc_cmd->iu.lun);
+	int_to_scsilun(sdev->lun, &vfc_cmd->v1.iu.lun);
 
 	return vfc_cmd;
 }
@@ -1696,12 +1700,12 @@ static int ibmvfc_queuecommand_lck(struct scsi_cmnd *cmnd,
 
 	vfc_cmd = ibmvfc_init_vfc_cmd(evt, cmnd->device);
 
-	vfc_cmd->iu.xfer_len = cpu_to_be32(scsi_bufflen(cmnd));
-	memcpy(vfc_cmd->iu.cdb, cmnd->cmnd, cmnd->cmd_len);
+	vfc_cmd->v1.iu.xfer_len = cpu_to_be32(scsi_bufflen(cmnd));
+	memcpy(vfc_cmd->v1.iu.cdb, cmnd->cmnd, cmnd->cmd_len);
 
 	if (cmnd->flags & SCMD_TAGGED) {
 		vfc_cmd->task_tag = cpu_to_be64(cmnd->tag);
-		vfc_cmd->iu.pri_task_attr = IBMVFC_SIMPLE_TASK;
+		vfc_cmd->v1.iu.pri_task_attr = IBMVFC_SIMPLE_TASK;
 	}
 
 	if (likely(!(rc = ibmvfc_map_sg_data(cmnd, evt, vfc_cmd, vhost->dev))))
@@ -2026,7 +2030,7 @@ static int ibmvfc_reset_device(struct scsi_device *sdev, int type, char *desc)
 	struct ibmvfc_cmd *tmf;
 	struct ibmvfc_event *evt = NULL;
 	union ibmvfc_iu rsp_iu;
-	struct ibmvfc_fcp_rsp *fc_rsp = &rsp_iu.cmd.rsp;
+	struct ibmvfc_fcp_rsp *fc_rsp = &rsp_iu.cmd.v1.rsp;
 	int rsp_rc = -EBUSY;
 	unsigned long flags;
 	int rsp_code = 0;
@@ -2038,7 +2042,7 @@ static int ibmvfc_reset_device(struct scsi_device *sdev, int type, char *desc)
 		tmf = ibmvfc_init_vfc_cmd(evt, sdev);
 
 		tmf->flags = cpu_to_be16((IBMVFC_NO_MEM_DESC | IBMVFC_TMF));
-		tmf->iu.tmf_flags = type;
+		tmf->v1.iu.tmf_flags = type;
 		evt->sync_iu = &rsp_iu;
 
 		init_completion(&evt->comp);
@@ -2331,7 +2335,7 @@ static int ibmvfc_abort_task_set(struct scsi_device *sdev)
 	struct ibmvfc_cmd *tmf;
 	struct ibmvfc_event *evt, *found_evt;
 	union ibmvfc_iu rsp_iu;
-	struct ibmvfc_fcp_rsp *fc_rsp = &rsp_iu.cmd.rsp;
+	struct ibmvfc_fcp_rsp *fc_rsp = &rsp_iu.cmd.v1.rsp;
 	int rc, rsp_rc = -EBUSY;
 	unsigned long flags, timeout = IBMVFC_ABORT_TIMEOUT;
 	int rsp_code = 0;
@@ -2358,7 +2362,7 @@ static int ibmvfc_abort_task_set(struct scsi_device *sdev)
 		tmf = ibmvfc_init_vfc_cmd(evt, sdev);
 
 		tmf->flags = cpu_to_be16((IBMVFC_NO_MEM_DESC | IBMVFC_TMF));
-		tmf->iu.tmf_flags = IBMVFC_ABORT_TASK_SET;
+		tmf->v1.iu.tmf_flags = IBMVFC_ABORT_TASK_SET;
 		evt->sync_iu = &rsp_iu;
 
 		init_completion(&evt->comp);
diff --git a/drivers/scsi/ibmvscsi/ibmvfc.h b/drivers/scsi/ibmvscsi/ibmvfc.h
index 34debccfb142..65092812bd4a 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.h
+++ b/drivers/scsi/ibmvscsi/ibmvfc.h
@@ -54,6 +54,7 @@
 
 #define IBMVFC_MAD_SUCCESS		0x00
 #define IBMVFC_MAD_NOT_SUPPORTED	0xF1
+#define IBMVFC_MAD_VERSION_NOT_SUPP	0xF2
 #define IBMVFC_MAD_FAILED		0xF7
 #define IBMVFC_MAD_DRIVER_FAILED	0xEE
 #define IBMVFC_MAD_CRQ_ERROR		0xEF
@@ -168,6 +169,8 @@ struct ibmvfc_npiv_login {
 #define IBMVFC_CAN_MIGRATE		0x01
 #define IBMVFC_CAN_USE_CHANNELS		0x02
 #define IBMVFC_CAN_HANDLE_FPIN		0x04
+#define IBMVFC_CAN_USE_MAD_VERSION	0x08
+#define IBMVFC_CAN_SEND_VF_WWPN		0x10
 	__be64 node_name;
 	struct srp_direct_buf async;
 	u8 partition_name[IBMVFC_MAX_NAME];
@@ -211,7 +214,9 @@ struct ibmvfc_npiv_login_resp {
 	__be64 capabilities;
 #define IBMVFC_CAN_FLUSH_ON_HALT	0x08
 #define IBMVFC_CAN_SUPPRESS_ABTS	0x10
-#define IBMVFC_CAN_SUPPORT_CHANNELS	0x20
+#define IBMVFC_MAD_VERSION_CAP		0x20
+#define IBMVFC_HANDLE_VF_WWPN		0x40
+#define IBMVFC_CAN_SUPPORT_CHANNELS	0x80
 	__be32 max_cmds;
 	__be32 scsi_id_sz;
 	__be64 max_dma_len;
@@ -293,6 +298,7 @@ struct ibmvfc_port_login {
 	__be32 reserved2;
 	struct ibmvfc_service_parms service_parms;
 	struct ibmvfc_service_parms service_parms_change;
+	__be64 targetWWPN;
 	__be64 reserved3[2];
 } __packed __aligned(8);
 
@@ -344,6 +350,7 @@ struct ibmvfc_process_login {
 	__be16 status;
 	__be16 error;			/* also fc_reason */
 	__be32 reserved2;
+	__be64 targetWWPN;
 	__be64 reserved3[2];
 } __packed __aligned(8);
 
@@ -378,6 +385,8 @@ struct ibmvfc_tmf {
 	__be32 cancel_key;
 	__be32 my_cancel_key;
 	__be32 pad;
+	__be64 targetWWPN;
+	__be64 taskTag;
 	__be64 reserved[2];
 } __packed __aligned(8);
 
@@ -474,9 +483,19 @@ struct ibmvfc_cmd {
 	__be64 correlation;
 	__be64 tgt_scsi_id;
 	__be64 tag;
-	__be64 reserved3[2];
-	struct ibmvfc_fcp_cmd_iu iu;
-	struct ibmvfc_fcp_rsp rsp;
+	__be64 targetWWPN;
+	__be64 reserved3;
+	union {
+		struct {
+			struct ibmvfc_fcp_cmd_iu iu;
+			struct ibmvfc_fcp_rsp rsp;
+		} v1;
+		struct {
+			__be64 reserved4;
+			struct ibmvfc_fcp_cmd_iu iu;
+			struct ibmvfc_fcp_rsp rsp;
+		} v2;
+	};
 } __packed __aligned(8);
 
 struct ibmvfc_passthru_fc_iu {
@@ -503,6 +522,7 @@ struct ibmvfc_passthru_iu {
 	__be64 correlation;
 	__be64 scsi_id;
 	__be64 tag;
+	__be64 targetWWPN;
 	__be64 reserved2[2];
 } __packed __aligned(8);
 
-- 
2.27.0


^ permalink raw reply related

* [PATCH 2/6] ibmvfc: deduplicate common ibmvfc_cmd init code
From: Tyrel Datwyler @ 2020-11-12  1:04 UTC (permalink / raw)
  To: james.bottomley
  Cc: Tyrel Datwyler, martin.petersen, linux-scsi, linux-kernel, brking,
	linuxppc-dev
In-Reply-To: <20201112010442.102589-1-tyreld@linux.ibm.com>

The virtual FC frame command exchaned with the VIOS is used for device
reset and command abort TMF as well as normally queued commands. When
initializing the ibmvfc_cmd there several elements of the command that
are set the same way regardless of the command type.

Deduplicate code by moving these commonally set fields into a
initialization helper routine, namely ibmvfc_init_vfc_cmd().

Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
---
 drivers/scsi/ibmvscsi/ibmvfc.c | 55 ++++++++++++++--------------------
 1 file changed, 23 insertions(+), 32 deletions(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index 01fe65de9086..7b25789dba9a 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -1646,6 +1646,24 @@ static inline int ibmvfc_host_chkready(struct ibmvfc_host *vhost)
 	return result;
 }
 
+static struct ibmvfc_cmd *ibmvfc_init_vfc_cmd(struct ibmvfc_event *evt, struct scsi_device *sdev)
+{
+	struct fc_rport *rport = starget_to_rport(scsi_target(sdev));
+	struct ibmvfc_cmd *vfc_cmd = &evt->iu.cmd;
+
+	memset(vfc_cmd, 0, sizeof(*vfc_cmd));
+	vfc_cmd->resp.va = cpu_to_be64(be64_to_cpu(evt->crq.ioba) + offsetof(struct ibmvfc_cmd, rsp));
+	vfc_cmd->resp.len = cpu_to_be32(sizeof(vfc_cmd->rsp));
+	vfc_cmd->frame_type = cpu_to_be32(IBMVFC_SCSI_FCP_TYPE);
+	vfc_cmd->payload_len = cpu_to_be32(sizeof(vfc_cmd->iu));
+	vfc_cmd->resp_len = cpu_to_be32(sizeof(vfc_cmd->rsp));
+	vfc_cmd->cancel_key = cpu_to_be32((unsigned long)sdev->hostdata);
+	vfc_cmd->tgt_scsi_id = cpu_to_be64(rport->port_id);
+	int_to_scsilun(sdev->lun, &vfc_cmd->iu.lun);
+
+	return vfc_cmd;
+}
+
 /**
  * ibmvfc_queuecommand - The queuecommand function of the scsi template
  * @cmnd:	struct scsi_cmnd to be executed
@@ -1675,17 +1693,10 @@ static int ibmvfc_queuecommand_lck(struct scsi_cmnd *cmnd,
 	ibmvfc_init_event(evt, ibmvfc_scsi_done, IBMVFC_CMD_FORMAT);
 	evt->cmnd = cmnd;
 	cmnd->scsi_done = done;
-	vfc_cmd = &evt->iu.cmd;
-	memset(vfc_cmd, 0, sizeof(*vfc_cmd));
-	vfc_cmd->resp.va = cpu_to_be64(be64_to_cpu(evt->crq.ioba) + offsetof(struct ibmvfc_cmd, rsp));
-	vfc_cmd->resp.len = cpu_to_be32(sizeof(vfc_cmd->rsp));
-	vfc_cmd->frame_type = cpu_to_be32(IBMVFC_SCSI_FCP_TYPE);
-	vfc_cmd->payload_len = cpu_to_be32(sizeof(vfc_cmd->iu));
-	vfc_cmd->resp_len = cpu_to_be32(sizeof(vfc_cmd->rsp));
-	vfc_cmd->cancel_key = cpu_to_be32((unsigned long)cmnd->device->hostdata);
-	vfc_cmd->tgt_scsi_id = cpu_to_be64(rport->port_id);
+
+	vfc_cmd = ibmvfc_init_vfc_cmd(evt, cmnd->device);
+
 	vfc_cmd->iu.xfer_len = cpu_to_be32(scsi_bufflen(cmnd));
-	int_to_scsilun(cmnd->device->lun, &vfc_cmd->iu.lun);
 	memcpy(vfc_cmd->iu.cdb, cmnd->cmnd, cmnd->cmd_len);
 
 	if (cmnd->flags & SCMD_TAGGED) {
@@ -2012,7 +2023,6 @@ static int ibmvfc_bsg_request(struct bsg_job *job)
 static int ibmvfc_reset_device(struct scsi_device *sdev, int type, char *desc)
 {
 	struct ibmvfc_host *vhost = shost_priv(sdev->host);
-	struct fc_rport *rport = starget_to_rport(scsi_target(sdev));
 	struct ibmvfc_cmd *tmf;
 	struct ibmvfc_event *evt = NULL;
 	union ibmvfc_iu rsp_iu;
@@ -2025,17 +2035,8 @@ static int ibmvfc_reset_device(struct scsi_device *sdev, int type, char *desc)
 	if (vhost->state == IBMVFC_ACTIVE) {
 		evt = ibmvfc_get_event(vhost);
 		ibmvfc_init_event(evt, ibmvfc_sync_completion, IBMVFC_CMD_FORMAT);
+		tmf = ibmvfc_init_vfc_cmd(evt, sdev);
 
-		tmf = &evt->iu.cmd;
-		memset(tmf, 0, sizeof(*tmf));
-		tmf->resp.va = cpu_to_be64(be64_to_cpu(evt->crq.ioba) + offsetof(struct ibmvfc_cmd, rsp));
-		tmf->resp.len = cpu_to_be32(sizeof(tmf->rsp));
-		tmf->frame_type = cpu_to_be32(IBMVFC_SCSI_FCP_TYPE);
-		tmf->payload_len = cpu_to_be32(sizeof(tmf->iu));
-		tmf->resp_len = cpu_to_be32(sizeof(tmf->rsp));
-		tmf->cancel_key = cpu_to_be32((unsigned long)sdev->hostdata);
-		tmf->tgt_scsi_id = cpu_to_be64(rport->port_id);
-		int_to_scsilun(sdev->lun, &tmf->iu.lun);
 		tmf->flags = cpu_to_be16((IBMVFC_NO_MEM_DESC | IBMVFC_TMF));
 		tmf->iu.tmf_flags = type;
 		evt->sync_iu = &rsp_iu;
@@ -2327,7 +2328,6 @@ static int ibmvfc_match_evt(struct ibmvfc_event *evt, void *match)
 static int ibmvfc_abort_task_set(struct scsi_device *sdev)
 {
 	struct ibmvfc_host *vhost = shost_priv(sdev->host);
-	struct fc_rport *rport = starget_to_rport(scsi_target(sdev));
 	struct ibmvfc_cmd *tmf;
 	struct ibmvfc_event *evt, *found_evt;
 	union ibmvfc_iu rsp_iu;
@@ -2355,17 +2355,8 @@ static int ibmvfc_abort_task_set(struct scsi_device *sdev)
 	if (vhost->state == IBMVFC_ACTIVE) {
 		evt = ibmvfc_get_event(vhost);
 		ibmvfc_init_event(evt, ibmvfc_sync_completion, IBMVFC_CMD_FORMAT);
+		tmf = ibmvfc_init_vfc_cmd(evt, sdev);
 
-		tmf = &evt->iu.cmd;
-		memset(tmf, 0, sizeof(*tmf));
-		tmf->resp.va = cpu_to_be64(be64_to_cpu(evt->crq.ioba) + offsetof(struct ibmvfc_cmd, rsp));
-		tmf->resp.len = cpu_to_be32(sizeof(tmf->rsp));
-		tmf->frame_type = cpu_to_be32(IBMVFC_SCSI_FCP_TYPE);
-		tmf->payload_len = cpu_to_be32(sizeof(tmf->iu));
-		tmf->resp_len = cpu_to_be32(sizeof(tmf->rsp));
-		tmf->cancel_key = cpu_to_be32((unsigned long)sdev->hostdata);
-		tmf->tgt_scsi_id = cpu_to_be64(rport->port_id);
-		int_to_scsilun(sdev->lun, &tmf->iu.lun);
 		tmf->flags = cpu_to_be16((IBMVFC_NO_MEM_DESC | IBMVFC_TMF));
 		tmf->iu.tmf_flags = IBMVFC_ABORT_TASK_SET;
 		evt->sync_iu = &rsp_iu;
-- 
2.27.0


^ permalink raw reply related

* [PATCH 6/6] ibmvfc: advertise client support for targetWWPN using v2 commands
From: Tyrel Datwyler @ 2020-11-12  1:04 UTC (permalink / raw)
  To: james.bottomley
  Cc: Tyrel Datwyler, martin.petersen, linux-scsi, linux-kernel, brking,
	linuxppc-dev
In-Reply-To: <20201112010442.102589-1-tyreld@linux.ibm.com>

The previous patch added support for the targetWWPN field in version 2
MADs and vfcFrame structures.

Set the IBMVFC_CAN_SEND_VF_WWPN bit in our capabailites flag during NPIV
Login to inform the VIOS that this client supports this feature.

Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
---
 drivers/scsi/ibmvscsi/ibmvfc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index 604bccebf7d2..3c72f5a009b8 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -1246,7 +1246,7 @@ static void ibmvfc_set_login_info(struct ibmvfc_host *vhost)
 		login_info->flags |= cpu_to_be16(IBMVFC_CLIENT_MIGRATED);
 
 	login_info->max_cmds = cpu_to_be32(max_requests + IBMVFC_NUM_INTERNAL_REQ);
-	login_info->capabilities = cpu_to_be64(IBMVFC_CAN_MIGRATE);
+	login_info->capabilities = cpu_to_be64(IBMVFC_CAN_MIGRATE | IBMVFC_CAN_SEND_VF_WWPN);
 	login_info->async.va = cpu_to_be64(vhost->async_crq.msg_token);
 	login_info->async.len = cpu_to_be32(vhost->async_crq.size * sizeof(*vhost->async_crq.msgs));
 	strncpy(login_info->partition_name, vhost->partition_name, IBMVFC_MAX_NAME);
-- 
2.27.0


^ permalink raw reply related

* [PATCH 1/6] ibmvfc: byte swap login_buf.resp values in attribute show functions
From: Tyrel Datwyler @ 2020-11-12  1:04 UTC (permalink / raw)
  To: james.bottomley
  Cc: Tyrel Datwyler, martin.petersen, linux-scsi, linux-kernel, brking,
	linuxppc-dev

Both ibmvfc_show_host_(capabilities|npiv_version) functions retrieve
values from vhost->login_buf.resp buffer. This is the MAD response
buffer from the VIOS and as such any multi-byte non-string values are in
big endian format.

Byte swap these values to host cpu endian format for better human
readability.

Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
---
 drivers/scsi/ibmvscsi/ibmvfc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index 070cf516b98f..01fe65de9086 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -3025,7 +3025,7 @@ static ssize_t ibmvfc_show_host_npiv_version(struct device *dev,
 {
 	struct Scsi_Host *shost = class_to_shost(dev);
 	struct ibmvfc_host *vhost = shost_priv(shost);
-	return snprintf(buf, PAGE_SIZE, "%d\n", vhost->login_buf->resp.version);
+	return snprintf(buf, PAGE_SIZE, "%d\n", be32_to_cpu(vhost->login_buf->resp.version));
 }
 
 static ssize_t ibmvfc_show_host_capabilities(struct device *dev,
@@ -3033,7 +3033,7 @@ static ssize_t ibmvfc_show_host_capabilities(struct device *dev,
 {
 	struct Scsi_Host *shost = class_to_shost(dev);
 	struct ibmvfc_host *vhost = shost_priv(shost);
-	return snprintf(buf, PAGE_SIZE, "%llx\n", vhost->login_buf->resp.capabilities);
+	return snprintf(buf, PAGE_SIZE, "%llx\n", be64_to_cpu(vhost->login_buf->resp.capabilities));
 }
 
 /**
-- 
2.27.0


^ permalink raw reply related

* [PATCH 4/6] ibmvfc: add FC payload retrieval routines for versioned vfcFrames
From: Tyrel Datwyler @ 2020-11-12  1:04 UTC (permalink / raw)
  To: james.bottomley
  Cc: Tyrel Datwyler, martin.petersen, linux-scsi, linux-kernel, brking,
	linuxppc-dev
In-Reply-To: <20201112010442.102589-1-tyreld@linux.ibm.com>

The FC iu and response payloads are located at different offsets
depending on the ibmvfc_cmd version. This is a result of the version 2
vfcFrame definition adding an extra 64bytes of reserved space to the
structure prior to the payloads.

Add helper routines to determine the current vfcFrame version and
returning pointers to the proper iu or response structures within that
ibmvfc_cmd.

Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
---
 drivers/scsi/ibmvscsi/ibmvfc.c | 76 ++++++++++++++++++++++++----------
 1 file changed, 53 insertions(+), 23 deletions(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index aa3445bec42c..5e666f7c9266 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -138,6 +138,22 @@ static void ibmvfc_tgt_move_login(struct ibmvfc_target *);
 
 static const char *unknown_error = "unknown error";
 
+static struct ibmvfc_fcp_cmd_iu *ibmvfc_get_fcp_iu(struct ibmvfc_host *vhost, struct ibmvfc_cmd *vfc_cmd)
+{
+	if (be64_to_cpu(vhost->login_buf->resp.capabilities) & IBMVFC_HANDLE_VF_WWPN)
+		return &vfc_cmd->v2.iu;
+	else
+		return &vfc_cmd->v1.iu;
+}
+
+static struct ibmvfc_fcp_rsp *ibmvfc_get_fcp_rsp(struct ibmvfc_host *vhost, struct ibmvfc_cmd *vfc_cmd)
+{
+	if (be64_to_cpu(vhost->login_buf->resp.capabilities) & IBMVFC_HANDLE_VF_WWPN)
+		return &vfc_cmd->v2.rsp;
+	else
+		return &vfc_cmd->v1.rsp;
+}
+
 #ifdef CONFIG_SCSI_IBMVFC_TRACE
 /**
  * ibmvfc_trc_start - Log a start trace entry
@@ -149,7 +165,7 @@ static void ibmvfc_trc_start(struct ibmvfc_event *evt)
 	struct ibmvfc_host *vhost = evt->vhost;
 	struct ibmvfc_cmd *vfc_cmd = &evt->iu.cmd;
 	struct ibmvfc_mad_common *mad = &evt->iu.mad_common;
-	struct ibmvfc_fcp_cmd_iu *iu = &vfc_cmd->v1.iu;
+	struct ibmvfc_fcp_cmd_iu *iu = ibmvfc_get_fcp_iu(vhost, vfc_cmd);
 	struct ibmvfc_trace_entry *entry;
 
 	entry = &vhost->trace[vhost->trace_index++];
@@ -184,8 +200,8 @@ static void ibmvfc_trc_end(struct ibmvfc_event *evt)
 	struct ibmvfc_host *vhost = evt->vhost;
 	struct ibmvfc_cmd *vfc_cmd = &evt->xfer_iu->cmd;
 	struct ibmvfc_mad_common *mad = &evt->xfer_iu->mad_common;
-	struct ibmvfc_fcp_cmd_iu *iu = &vfc_cmd->v1.iu;
-	struct ibmvfc_fcp_rsp *rsp = &vfc_cmd->v1.rsp;
+	struct ibmvfc_fcp_cmd_iu *iu = ibmvfc_get_fcp_iu(vhost, vfc_cmd);
+	struct ibmvfc_fcp_rsp *rsp = ibmvfc_get_fcp_rsp(vhost, vfc_cmd);
 	struct ibmvfc_trace_entry *entry = &vhost->trace[vhost->trace_index++];
 
 	entry->evt = evt;
@@ -263,10 +279,10 @@ static const char *ibmvfc_get_cmd_error(u16 status, u16 error)
  * Return value:
  *	SCSI result value to return for completed command
  **/
-static int ibmvfc_get_err_result(struct ibmvfc_cmd *vfc_cmd)
+static int ibmvfc_get_err_result(struct ibmvfc_host *vhost, struct ibmvfc_cmd *vfc_cmd)
 {
 	int err;
-	struct ibmvfc_fcp_rsp *rsp = &vfc_cmd->v1.rsp;
+	struct ibmvfc_fcp_rsp *rsp = ibmvfc_get_fcp_rsp(vhost, vfc_cmd);
 	int fc_rsp_len = be32_to_cpu(rsp->fcp_rsp_len);
 
 	if ((rsp->flags & FCP_RSP_LEN_VALID) &&
@@ -1381,7 +1397,7 @@ static int ibmvfc_map_sg_data(struct scsi_cmnd *scmd,
 	int sg_mapped;
 	struct srp_direct_buf *data = &vfc_cmd->ioba;
 	struct ibmvfc_host *vhost = dev_get_drvdata(dev);
-	struct ibmvfc_fcp_cmd_iu *iu = &vfc_cmd->v1.iu;
+	struct ibmvfc_fcp_cmd_iu *iu = ibmvfc_get_fcp_iu(evt->vhost, vfc_cmd);
 
 	if (cls3_error)
 		vfc_cmd->flags |= cpu_to_be16(IBMVFC_CLASS_3_ERR);
@@ -1520,7 +1536,7 @@ static void ibmvfc_log_error(struct ibmvfc_event *evt)
 {
 	struct ibmvfc_cmd *vfc_cmd = &evt->xfer_iu->cmd;
 	struct ibmvfc_host *vhost = evt->vhost;
-	struct ibmvfc_fcp_rsp *rsp = &vfc_cmd->v1.rsp;
+	struct ibmvfc_fcp_rsp *rsp = ibmvfc_get_fcp_rsp(vhost, vfc_cmd);
 	struct scsi_cmnd *cmnd = evt->cmnd;
 	const char *err = unknown_error;
 	int index = ibmvfc_get_err_index(be16_to_cpu(vfc_cmd->status), be16_to_cpu(vfc_cmd->error));
@@ -1574,7 +1590,7 @@ static void ibmvfc_relogin(struct scsi_device *sdev)
 static void ibmvfc_scsi_done(struct ibmvfc_event *evt)
 {
 	struct ibmvfc_cmd *vfc_cmd = &evt->xfer_iu->cmd;
-	struct ibmvfc_fcp_rsp *rsp = &vfc_cmd->v1.rsp;
+	struct ibmvfc_fcp_rsp *rsp = ibmvfc_get_fcp_rsp(evt->vhost, vfc_cmd);
 	struct scsi_cmnd *cmnd = evt->cmnd;
 	u32 rsp_len = 0;
 	u32 sense_len = be32_to_cpu(rsp->fcp_sense_len);
@@ -1588,7 +1604,7 @@ static void ibmvfc_scsi_done(struct ibmvfc_event *evt)
 			scsi_set_resid(cmnd, 0);
 
 		if (vfc_cmd->status) {
-			cmnd->result = ibmvfc_get_err_result(vfc_cmd);
+			cmnd->result = ibmvfc_get_err_result(evt->vhost, vfc_cmd);
 
 			if (rsp->flags & FCP_RSP_LEN_VALID)
 				rsp_len = be32_to_cpu(rsp->fcp_rsp_len);
@@ -1653,17 +1669,25 @@ static inline int ibmvfc_host_chkready(struct ibmvfc_host *vhost)
 static struct ibmvfc_cmd *ibmvfc_init_vfc_cmd(struct ibmvfc_event *evt, struct scsi_device *sdev)
 {
 	struct fc_rport *rport = starget_to_rport(scsi_target(sdev));
+	struct ibmvfc_host *vhost = evt->vhost;
 	struct ibmvfc_cmd *vfc_cmd = &evt->iu.cmd;
+	struct ibmvfc_fcp_cmd_iu *iu = ibmvfc_get_fcp_iu(vhost, vfc_cmd);
+	struct ibmvfc_fcp_rsp *rsp = ibmvfc_get_fcp_rsp(vhost, vfc_cmd);
+	size_t offset;
 
 	memset(vfc_cmd, 0, sizeof(*vfc_cmd));
-	vfc_cmd->resp.va = cpu_to_be64(be64_to_cpu(evt->crq.ioba) + offsetof(struct ibmvfc_cmd, v1.rsp));
-	vfc_cmd->resp.len = cpu_to_be32(sizeof(vfc_cmd->v1.rsp));
+	if (be64_to_cpu(vhost->login_buf->resp.capabilities) & IBMVFC_HANDLE_VF_WWPN)
+		offset = offsetof(struct ibmvfc_cmd, v2.rsp);
+	else
+		offset = offsetof(struct ibmvfc_cmd, v1.rsp);
+	vfc_cmd->resp.va = cpu_to_be64(be64_to_cpu(evt->crq.ioba) + offset);
+	vfc_cmd->resp.len = cpu_to_be32(sizeof(*rsp));
 	vfc_cmd->frame_type = cpu_to_be32(IBMVFC_SCSI_FCP_TYPE);
-	vfc_cmd->payload_len = cpu_to_be32(sizeof(vfc_cmd->v1.iu));
-	vfc_cmd->resp_len = cpu_to_be32(sizeof(vfc_cmd->v1.rsp));
+	vfc_cmd->payload_len = cpu_to_be32(sizeof(*iu));
+	vfc_cmd->resp_len = cpu_to_be32(sizeof(*rsp));
 	vfc_cmd->cancel_key = cpu_to_be32((unsigned long)sdev->hostdata);
 	vfc_cmd->tgt_scsi_id = cpu_to_be64(rport->port_id);
-	int_to_scsilun(sdev->lun, &vfc_cmd->v1.iu.lun);
+	int_to_scsilun(sdev->lun, &iu->lun);
 
 	return vfc_cmd;
 }
@@ -1682,6 +1706,7 @@ static int ibmvfc_queuecommand_lck(struct scsi_cmnd *cmnd,
 	struct ibmvfc_host *vhost = shost_priv(cmnd->device->host);
 	struct fc_rport *rport = starget_to_rport(scsi_target(cmnd->device));
 	struct ibmvfc_cmd *vfc_cmd;
+	struct ibmvfc_fcp_cmd_iu *iu;
 	struct ibmvfc_event *evt;
 	int rc;
 
@@ -1699,13 +1724,14 @@ static int ibmvfc_queuecommand_lck(struct scsi_cmnd *cmnd,
 	cmnd->scsi_done = done;
 
 	vfc_cmd = ibmvfc_init_vfc_cmd(evt, cmnd->device);
+	iu = ibmvfc_get_fcp_iu(vhost, vfc_cmd);
 
-	vfc_cmd->v1.iu.xfer_len = cpu_to_be32(scsi_bufflen(cmnd));
-	memcpy(vfc_cmd->v1.iu.cdb, cmnd->cmnd, cmnd->cmd_len);
+	iu->xfer_len = cpu_to_be32(scsi_bufflen(cmnd));
+	memcpy(iu->cdb, cmnd->cmnd, cmnd->cmd_len);
 
 	if (cmnd->flags & SCMD_TAGGED) {
 		vfc_cmd->task_tag = cpu_to_be64(cmnd->tag);
-		vfc_cmd->v1.iu.pri_task_attr = IBMVFC_SIMPLE_TASK;
+		iu->pri_task_attr = IBMVFC_SIMPLE_TASK;
 	}
 
 	if (likely(!(rc = ibmvfc_map_sg_data(cmnd, evt, vfc_cmd, vhost->dev))))
@@ -2030,7 +2056,8 @@ static int ibmvfc_reset_device(struct scsi_device *sdev, int type, char *desc)
 	struct ibmvfc_cmd *tmf;
 	struct ibmvfc_event *evt = NULL;
 	union ibmvfc_iu rsp_iu;
-	struct ibmvfc_fcp_rsp *fc_rsp = &rsp_iu.cmd.v1.rsp;
+	struct ibmvfc_fcp_cmd_iu *iu;
+	struct ibmvfc_fcp_rsp *fc_rsp = ibmvfc_get_fcp_rsp(vhost, &rsp_iu.cmd);
 	int rsp_rc = -EBUSY;
 	unsigned long flags;
 	int rsp_code = 0;
@@ -2040,9 +2067,10 @@ static int ibmvfc_reset_device(struct scsi_device *sdev, int type, char *desc)
 		evt = ibmvfc_get_event(vhost);
 		ibmvfc_init_event(evt, ibmvfc_sync_completion, IBMVFC_CMD_FORMAT);
 		tmf = ibmvfc_init_vfc_cmd(evt, sdev);
+		iu = ibmvfc_get_fcp_iu(vhost, tmf);
 
 		tmf->flags = cpu_to_be16((IBMVFC_NO_MEM_DESC | IBMVFC_TMF));
-		tmf->v1.iu.tmf_flags = type;
+		iu->tmf_flags = type;
 		evt->sync_iu = &rsp_iu;
 
 		init_completion(&evt->comp);
@@ -2060,7 +2088,7 @@ static int ibmvfc_reset_device(struct scsi_device *sdev, int type, char *desc)
 	wait_for_completion(&evt->comp);
 
 	if (rsp_iu.cmd.status)
-		rsp_code = ibmvfc_get_err_result(&rsp_iu.cmd);
+		rsp_code = ibmvfc_get_err_result(vhost, &rsp_iu.cmd);
 
 	if (rsp_code) {
 		if (fc_rsp->flags & FCP_RSP_LEN_VALID)
@@ -2335,7 +2363,8 @@ static int ibmvfc_abort_task_set(struct scsi_device *sdev)
 	struct ibmvfc_cmd *tmf;
 	struct ibmvfc_event *evt, *found_evt;
 	union ibmvfc_iu rsp_iu;
-	struct ibmvfc_fcp_rsp *fc_rsp = &rsp_iu.cmd.v1.rsp;
+	struct ibmvfc_fcp_cmd_iu *iu;
+	struct ibmvfc_fcp_rsp *fc_rsp = ibmvfc_get_fcp_rsp(vhost, &rsp_iu.cmd);
 	int rc, rsp_rc = -EBUSY;
 	unsigned long flags, timeout = IBMVFC_ABORT_TIMEOUT;
 	int rsp_code = 0;
@@ -2360,9 +2389,10 @@ static int ibmvfc_abort_task_set(struct scsi_device *sdev)
 		evt = ibmvfc_get_event(vhost);
 		ibmvfc_init_event(evt, ibmvfc_sync_completion, IBMVFC_CMD_FORMAT);
 		tmf = ibmvfc_init_vfc_cmd(evt, sdev);
+		iu = ibmvfc_get_fcp_iu(vhost, tmf);
 
+		iu->tmf_flags = IBMVFC_ABORT_TASK_SET;
 		tmf->flags = cpu_to_be16((IBMVFC_NO_MEM_DESC | IBMVFC_TMF));
-		tmf->v1.iu.tmf_flags = IBMVFC_ABORT_TASK_SET;
 		evt->sync_iu = &rsp_iu;
 
 		init_completion(&evt->comp);
@@ -2409,7 +2439,7 @@ static int ibmvfc_abort_task_set(struct scsi_device *sdev)
 	}
 
 	if (rsp_iu.cmd.status)
-		rsp_code = ibmvfc_get_err_result(&rsp_iu.cmd);
+		rsp_code = ibmvfc_get_err_result(vhost, &rsp_iu.cmd);
 
 	if (rsp_code) {
 		if (fc_rsp->flags & FCP_RSP_LEN_VALID)
-- 
2.27.0


^ permalink raw reply related

* [PATCH 5/6] ibmvfc: add support for targetWWPN field in v2 MADs and vfcFrame
From: Tyrel Datwyler @ 2020-11-12  1:04 UTC (permalink / raw)
  To: james.bottomley
  Cc: Tyrel Datwyler, martin.petersen, linux-scsi, linux-kernel, brking,
	linuxppc-dev
In-Reply-To: <20201112010442.102589-1-tyreld@linux.ibm.com>

Several version 2 MADs and the version 2 vfcFrame structures introduced
a new targetWWPN field for better identification of the target then
simply the scsi_id.

Set this field and MAD versioning fields when the VIOS advertises the
IBMVFC_HANDLE_VF_WWPN capability.

Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
---
 drivers/scsi/ibmvscsi/ibmvfc.c | 39 ++++++++++++++++++++++++++++------
 1 file changed, 33 insertions(+), 6 deletions(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index 5e666f7c9266..604bccebf7d2 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -1676,9 +1676,10 @@ static struct ibmvfc_cmd *ibmvfc_init_vfc_cmd(struct ibmvfc_event *evt, struct s
 	size_t offset;
 
 	memset(vfc_cmd, 0, sizeof(*vfc_cmd));
-	if (be64_to_cpu(vhost->login_buf->resp.capabilities) & IBMVFC_HANDLE_VF_WWPN)
+	if (be64_to_cpu(vhost->login_buf->resp.capabilities) & IBMVFC_HANDLE_VF_WWPN) {
 		offset = offsetof(struct ibmvfc_cmd, v2.rsp);
-	else
+		vfc_cmd->targetWWPN = cpu_to_be64(rport->port_name);
+	} else
 		offset = offsetof(struct ibmvfc_cmd, v1.rsp);
 	vfc_cmd->resp.va = cpu_to_be64(be64_to_cpu(evt->crq.ioba) + offset);
 	vfc_cmd->resp.len = cpu_to_be32(sizeof(*rsp));
@@ -2053,6 +2054,7 @@ static int ibmvfc_bsg_request(struct bsg_job *job)
 static int ibmvfc_reset_device(struct scsi_device *sdev, int type, char *desc)
 {
 	struct ibmvfc_host *vhost = shost_priv(sdev->host);
+	struct fc_rport *rport = starget_to_rport(scsi_target(sdev));
 	struct ibmvfc_cmd *tmf;
 	struct ibmvfc_event *evt = NULL;
 	union ibmvfc_iu rsp_iu;
@@ -2070,6 +2072,8 @@ static int ibmvfc_reset_device(struct scsi_device *sdev, int type, char *desc)
 		iu = ibmvfc_get_fcp_iu(vhost, tmf);
 
 		tmf->flags = cpu_to_be16((IBMVFC_NO_MEM_DESC | IBMVFC_TMF));
+		if (be64_to_cpu(vhost->login_buf->resp.capabilities) & IBMVFC_HANDLE_VF_WWPN)
+			tmf->targetWWPN = cpu_to_be64(rport->port_name);
 		iu->tmf_flags = type;
 		evt->sync_iu = &rsp_iu;
 
@@ -2260,7 +2264,12 @@ static int ibmvfc_cancel_all(struct scsi_device *sdev, int type)
 
 		tmf = &evt->iu.tmf;
 		memset(tmf, 0, sizeof(*tmf));
-		tmf->common.version = cpu_to_be32(1);
+		if (be64_to_cpu(vhost->login_buf->resp.capabilities) & IBMVFC_HANDLE_VF_WWPN) {
+			tmf->common.version = cpu_to_be32(2);
+			tmf->targetWWPN = cpu_to_be64(found_evt->tgt->wwpn);
+		} else {
+			tmf->common.version = cpu_to_be32(1);
+		}
 		tmf->common.opcode = cpu_to_be32(IBMVFC_TMF_MAD);
 		tmf->common.length = cpu_to_be16(sizeof(*tmf));
 		tmf->scsi_id = cpu_to_be64(rport->port_id);
@@ -2360,6 +2369,7 @@ static int ibmvfc_match_evt(struct ibmvfc_event *evt, void *match)
 static int ibmvfc_abort_task_set(struct scsi_device *sdev)
 {
 	struct ibmvfc_host *vhost = shost_priv(sdev->host);
+	struct fc_rport *rport = starget_to_rport(scsi_target(sdev));
 	struct ibmvfc_cmd *tmf;
 	struct ibmvfc_event *evt, *found_evt;
 	union ibmvfc_iu rsp_iu;
@@ -2391,6 +2401,8 @@ static int ibmvfc_abort_task_set(struct scsi_device *sdev)
 		tmf = ibmvfc_init_vfc_cmd(evt, sdev);
 		iu = ibmvfc_get_fcp_iu(vhost, tmf);
 
+		if (be64_to_cpu(vhost->login_buf->resp.capabilities) & IBMVFC_HANDLE_VF_WWPN)
+			tmf->targetWWPN = cpu_to_be64(rport->port_name);
 		iu->tmf_flags = IBMVFC_ABORT_TASK_SET;
 		tmf->flags = cpu_to_be16((IBMVFC_NO_MEM_DESC | IBMVFC_TMF));
 		evt->sync_iu = &rsp_iu;
@@ -3470,7 +3482,12 @@ static void ibmvfc_tgt_send_prli(struct ibmvfc_target *tgt)
 	evt->tgt = tgt;
 	prli = &evt->iu.prli;
 	memset(prli, 0, sizeof(*prli));
-	prli->common.version = cpu_to_be32(1);
+	if (be64_to_cpu(vhost->login_buf->resp.capabilities) & IBMVFC_HANDLE_VF_WWPN) {
+		prli->common.version = cpu_to_be32(2);
+		prli->targetWWPN = cpu_to_be64(tgt->wwpn);
+	} else {
+		prli->common.version = cpu_to_be32(1);
+	}
 	prli->common.opcode = cpu_to_be32(IBMVFC_PROCESS_LOGIN);
 	prli->common.length = cpu_to_be16(sizeof(*prli));
 	prli->scsi_id = cpu_to_be64(tgt->scsi_id);
@@ -3573,7 +3590,12 @@ static void ibmvfc_tgt_send_plogi(struct ibmvfc_target *tgt)
 	evt->tgt = tgt;
 	plogi = &evt->iu.plogi;
 	memset(plogi, 0, sizeof(*plogi));
-	plogi->common.version = cpu_to_be32(1);
+	if(be64_to_cpu(vhost->login_buf->resp.capabilities) & IBMVFC_HANDLE_VF_WWPN) {
+		plogi->common.version = cpu_to_be32(2);
+		plogi->targetWWPN = cpu_to_be64(tgt->wwpn);
+	} else {
+		plogi->common.version = cpu_to_be32(1);
+	}
 	plogi->common.opcode = cpu_to_be32(IBMVFC_PORT_LOGIN);
 	plogi->common.length = cpu_to_be16(sizeof(*plogi));
 	plogi->scsi_id = cpu_to_be64(tgt->scsi_id);
@@ -3973,7 +3995,12 @@ static void ibmvfc_adisc_timeout(struct timer_list *t)
 	evt->tgt = tgt;
 	tmf = &evt->iu.tmf;
 	memset(tmf, 0, sizeof(*tmf));
-	tmf->common.version = cpu_to_be32(1);
+	if (be64_to_cpu(vhost->login_buf->resp.capabilities) & IBMVFC_HANDLE_VF_WWPN) {
+		tmf->common.version = cpu_to_be32(2);
+		tmf->targetWWPN = cpu_to_be64(tgt->wwpn);
+	} else {
+		tmf->common.version = cpu_to_be32(1);
+	}
 	tmf->common.opcode = cpu_to_be32(IBMVFC_TMF_MAD);
 	tmf->common.length = cpu_to_be16(sizeof(*tmf));
 	tmf->scsi_id = cpu_to_be64(tgt->scsi_id);
-- 
2.27.0


^ permalink raw reply related

* [for-next][PATCH 11/17] ftrace: Add recording of functions that caused recursion
From: Steven Rostedt @ 2020-11-12  0:32 UTC (permalink / raw)
  To: linux-kernel
  Cc: x86, linux-doc, Peter Zijlstra, Sebastian Andrzej Siewior,
	Kamalesh Babulal, James E.J. Bottomley, Guo Ren, H. Peter Anvin,
	live-patching, Miroslav Benes, Ingo Molnar, linux-s390,
	Joe Lawrence, Jonathan Corbet, Mauro Carvalho Chehab,
	Helge Deller, Anton Vorontsov, linux-csky, Christian Borntraeger,
	Petr Mladek, Kees Cook, Vasily Gorbik, Heiko Carstens,
	Jiri Kosina, Borislav Petkov, Josh Poimboeuf, Thomas Gleixner,
	Tony Luck, linux-parisc, Masami Hiramatsu, Colin Cross,
	Paul Mackerras, Andrew Morton, linuxppc-dev
In-Reply-To: <20201112003244.764326960@goodmis.org>

From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>

This adds CONFIG_FTRACE_RECORD_RECURSION that will record to a file
"recursed_functions" all the functions that caused recursion while a
callback to the function tracer was running.

Link: https://lkml.kernel.org/r/20201106023548.102375687@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Guo Ren <guoren@kernel.org>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Helge Deller <deller@gmx.de>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: x86@kernel.org
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Anton Vorontsov <anton@enomsg.org>
Cc: Colin Cross <ccross@android.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Miroslav Benes <mbenes@suse.cz>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Joe Lawrence <joe.lawrence@redhat.com>
Cc: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Cc: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: linux-doc@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-csky@vger.kernel.org
Cc: linux-parisc@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-s390@vger.kernel.org
Cc: live-patching@vger.kernel.org
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 Documentation/trace/ftrace-uses.rst   |   6 +-
 arch/csky/kernel/probes/ftrace.c      |   2 +-
 arch/parisc/kernel/ftrace.c           |   2 +-
 arch/powerpc/kernel/kprobes-ftrace.c  |   2 +-
 arch/s390/kernel/ftrace.c             |   2 +-
 arch/x86/kernel/kprobes/ftrace.c      |   2 +-
 fs/pstore/ftrace.c                    |   2 +-
 include/linux/trace_recursion.h       |  29 +++-
 kernel/livepatch/patch.c              |   2 +-
 kernel/trace/Kconfig                  |  25 +++
 kernel/trace/Makefile                 |   1 +
 kernel/trace/ftrace.c                 |   4 +-
 kernel/trace/trace_event_perf.c       |   2 +-
 kernel/trace/trace_functions.c        |   2 +-
 kernel/trace/trace_output.c           |   6 +-
 kernel/trace/trace_output.h           |   1 +
 kernel/trace/trace_recursion_record.c | 236 ++++++++++++++++++++++++++
 17 files changed, 306 insertions(+), 20 deletions(-)
 create mode 100644 kernel/trace/trace_recursion_record.c

diff --git a/Documentation/trace/ftrace-uses.rst b/Documentation/trace/ftrace-uses.rst
index 86cd14b8e126..5981d5691745 100644
--- a/Documentation/trace/ftrace-uses.rst
+++ b/Documentation/trace/ftrace-uses.rst
@@ -118,7 +118,7 @@ can help in this regard. If you start your code with:
 
 	int bit;
 
-	bit = ftrace_test_recursion_trylock();
+	bit = ftrace_test_recursion_trylock(ip, parent_ip);
 	if (bit < 0)
 		return;
 
@@ -130,7 +130,9 @@ The code in between will be safe to use, even if it ends up calling a
 function that the callback is tracing. Note, on success,
 ftrace_test_recursion_trylock() will disable preemption, and the
 ftrace_test_recursion_unlock() will enable it again (if it was previously
-enabled).
+enabled). The instruction pointer (ip) and its parent (parent_ip) is passed to
+ftrace_test_recursion_trylock() to record where the recursion happened
+(if CONFIG_FTRACE_RECORD_RECURSION is set).
 
 Alternatively, if the FTRACE_OPS_FL_RECURSION flag is set on the ftrace_ops
 (as explained below), then a helper trampoline will be used to test
diff --git a/arch/csky/kernel/probes/ftrace.c b/arch/csky/kernel/probes/ftrace.c
index 5eb2604fdf71..f30b179924ef 100644
--- a/arch/csky/kernel/probes/ftrace.c
+++ b/arch/csky/kernel/probes/ftrace.c
@@ -18,7 +18,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
 	struct kprobe *p;
 	struct kprobe_ctlblk *kcb;
 
-	bit = ftrace_test_recursion_trylock();
+	bit = ftrace_test_recursion_trylock(ip, parent_ip);
 	if (bit < 0)
 		return;
 
diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c
index 13d85042810a..1c5d3732bda2 100644
--- a/arch/parisc/kernel/ftrace.c
+++ b/arch/parisc/kernel/ftrace.c
@@ -210,7 +210,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
 	struct kprobe *p;
 	int bit;
 
-	bit = ftrace_test_recursion_trylock();
+	bit = ftrace_test_recursion_trylock(ip, parent_ip);
 	if (bit < 0)
 		return;
 
diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c
index 5df8d50c65ae..fdfee39938ea 100644
--- a/arch/powerpc/kernel/kprobes-ftrace.c
+++ b/arch/powerpc/kernel/kprobes-ftrace.c
@@ -20,7 +20,7 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip,
 	struct kprobe_ctlblk *kcb;
 	int bit;
 
-	bit = ftrace_test_recursion_trylock();
+	bit = ftrace_test_recursion_trylock(nip, parent_nip);
 	if (bit < 0)
 		return;
 
diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c
index 8f31c726537a..657c1ab45408 100644
--- a/arch/s390/kernel/ftrace.c
+++ b/arch/s390/kernel/ftrace.c
@@ -204,7 +204,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
 	struct kprobe *p;
 	int bit;
 
-	bit = ftrace_test_recursion_trylock();
+	bit = ftrace_test_recursion_trylock(ip, parent_ip);
 	if (bit < 0)
 		return;
 
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index a40a6cdfcca3..954d930a7127 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -20,7 +20,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
 	struct kprobe_ctlblk *kcb;
 	int bit;
 
-	bit = ftrace_test_recursion_trylock();
+	bit = ftrace_test_recursion_trylock(ip, parent_ip);
 	if (bit < 0)
 		return;
 
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index 816210fc5d3a..adb0935eb062 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -41,7 +41,7 @@ static void notrace pstore_ftrace_call(unsigned long ip,
 	if (unlikely(oops_in_progress))
 		return;
 
-	bit = ftrace_test_recursion_trylock();
+	bit = ftrace_test_recursion_trylock(ip, parent_ip);
 	if (bit < 0)
 		return;
 
diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index ac3d73484cb2..228cc56ed66e 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -91,6 +91,9 @@ enum {
 	 * not be correct. Allow for a single recursion to cover this case.
 	 */
 	TRACE_TRANSITION_BIT,
+
+	/* Used to prevent recursion recording from recursing. */
+	TRACE_RECORD_RECURSION_BIT,
 };
 
 #define trace_recursion_set(bit)	do { (current)->trace_recursion |= (1<<(bit)); } while (0)
@@ -142,7 +145,22 @@ static __always_inline int trace_get_context_bit(void)
 			pc & HARDIRQ_MASK ? TRACE_CTX_IRQ : TRACE_CTX_SOFTIRQ;
 }
 
-static __always_inline int trace_test_and_set_recursion(int start, int max)
+#ifdef CONFIG_FTRACE_RECORD_RECURSION
+extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip);
+# define do_ftrace_record_recursion(ip, pip)				\
+	do {								\
+		if (!trace_recursion_test(TRACE_RECORD_RECURSION_BIT)) { \
+			trace_recursion_set(TRACE_RECORD_RECURSION_BIT); \
+			ftrace_record_recursion(ip, pip);		\
+			trace_recursion_clear(TRACE_RECORD_RECURSION_BIT); \
+		}							\
+	} while (0)
+#else
+# define do_ftrace_record_recursion(ip, pip)	do { } while (0)
+#endif
+
+static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsigned long pip,
+							int start, int max)
 {
 	unsigned int val = current->trace_recursion;
 	int bit;
@@ -158,8 +176,10 @@ static __always_inline int trace_test_and_set_recursion(int start, int max)
 		 * a switch between contexts. Allow for a single recursion.
 		 */
 		bit = TRACE_TRANSITION_BIT;
-		if (trace_recursion_test(bit))
+		if (trace_recursion_test(bit)) {
+			do_ftrace_record_recursion(ip, pip);
 			return -1;
+		}
 		trace_recursion_set(bit);
 		barrier();
 		return bit + 1;
@@ -199,9 +219,10 @@ static __always_inline void trace_clear_recursion(int bit)
  * Returns: -1 if a recursion happened.
  *           >= 0 if no recursion
  */
-static __always_inline int ftrace_test_recursion_trylock(void)
+static __always_inline int ftrace_test_recursion_trylock(unsigned long ip,
+							 unsigned long parent_ip)
 {
-	return trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
+	return trace_test_and_set_recursion(ip, parent_ip, TRACE_FTRACE_START, TRACE_FTRACE_MAX);
 }
 
 /**
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index 15480bf3ce88..875c5dbbdd33 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -49,7 +49,7 @@ static void notrace klp_ftrace_handler(unsigned long ip,
 
 	ops = container_of(fops, struct klp_ops, fops);
 
-	bit = ftrace_test_recursion_trylock();
+	bit = ftrace_test_recursion_trylock(ip, parent_ip);
 	if (WARN_ON_ONCE(bit < 0))
 		return;
 	/*
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a4020c0b4508..9b11c096d139 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -727,6 +727,31 @@ config TRACE_EVAL_MAP_FILE
 
 	If unsure, say N.
 
+config FTRACE_RECORD_RECURSION
+	bool "Record functions that recurse in function tracing"
+	depends on FUNCTION_TRACER
+	help
+	  All callbacks that attach to the function tracing have some sort
+	  of protection against recursion. Even though the protection exists,
+	  it adds overhead. This option will create a file in the tracefs
+	  file system called "recursed_functions" that will list the functions
+	  that triggered a recursion.
+
+	  This will add more overhead to cases that have recursion.
+
+	  If unsure, say N
+
+config FTRACE_RECORD_RECURSION_SIZE
+	int "Max number of recursed functions to record"
+	default	128
+	depends on FTRACE_RECORD_RECURSION
+	help
+	  This defines the limit of number of functions that can be
+	  listed in the "recursed_functions" file, that lists all
+	  the functions that caused a recursion to happen.
+	  This file can be reset, but the limit can not change in
+	  size at runtime.
+
 config GCOV_PROFILE_FTRACE
 	bool "Enable GCOV profiling on ftrace subsystem"
 	depends on GCOV_KERNEL
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index e153be351548..7e44cea89fdc 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -92,6 +92,7 @@ obj-$(CONFIG_DYNAMIC_EVENTS) += trace_dynevent.o
 obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
 obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o
 obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o
+obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o
 
 obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 39f2bba89b76..03aad2b5cd5e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6918,7 +6918,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
 	struct ftrace_ops *op;
 	int bit;
 
-	bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
+	bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX);
 	if (bit < 0)
 		return;
 
@@ -6993,7 +6993,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
 {
 	int bit;
 
-	bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
+	bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX);
 	if (bit < 0)
 		return;
 
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index a2b9fddb8148..1b202e28dfaa 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -447,7 +447,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
 	if ((unsigned long)ops->private != smp_processor_id())
 		return;
 
-	bit = ftrace_test_recursion_trylock();
+	bit = ftrace_test_recursion_trylock(ip, parent_ip);
 	if (bit < 0)
 		return;
 
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 89c414ce1388..646eda6c44a5 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -141,7 +141,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
 	if (unlikely(!tr->function_enabled))
 		return;
 
-	bit = ftrace_test_recursion_trylock();
+	bit = ftrace_test_recursion_trylock(ip, parent_ip);
 	if (bit < 0)
 		return;
 
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 000e9dc224c6..92b1575ae0ca 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -353,8 +353,8 @@ static inline const char *kretprobed(const char *name)
 }
 #endif /* CONFIG_KRETPROBES */
 
-static void
-seq_print_sym(struct trace_seq *s, unsigned long address, bool offset)
+void
+trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset)
 {
 #ifdef CONFIG_KALLSYMS
 	char str[KSYM_SYMBOL_LEN];
@@ -420,7 +420,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
 		goto out;
 	}
 
-	seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET);
+	trace_seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET);
 
 	if (sym_flags & TRACE_ITER_SYM_ADDR)
 		trace_seq_printf(s, " <" IP_FMT ">", ip);
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 2f742b74e7e6..4c954636caf0 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -16,6 +16,7 @@ extern int
 seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
 		unsigned long sym_flags);
 
+extern void trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset);
 extern int trace_print_context(struct trace_iterator *iter);
 extern int trace_print_lat_context(struct trace_iterator *iter);
 
diff --git a/kernel/trace/trace_recursion_record.c b/kernel/trace/trace_recursion_record.c
new file mode 100644
index 000000000000..b2edac1fe156
--- /dev/null
+++ b/kernel/trace/trace_recursion_record.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/fs.h>
+
+#include "trace_output.h"
+
+struct recursed_functions {
+	unsigned long		ip;
+	unsigned long		parent_ip;
+};
+
+static struct recursed_functions recursed_functions[CONFIG_FTRACE_RECORD_RECURSION_SIZE];
+static atomic_t nr_records;
+
+/*
+ * Cache the last found function. Yes, updates to this is racey, but
+ * so is memory cache ;-)
+ */
+static unsigned long cached_function;
+
+void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip)
+{
+	int index = 0;
+	int i;
+	unsigned long old;
+
+ again:
+	/* First check the last one recorded */
+	if (ip == cached_function)
+		return;
+
+	i = atomic_read(&nr_records);
+	/* nr_records is -1 when clearing records */
+	smp_mb__after_atomic();
+	if (i < 0)
+		return;
+
+	/*
+	 * If there's two writers and this writer comes in second,
+	 * the cmpxchg() below to update the ip will fail. Then this
+	 * writer will try again. It is possible that index will now
+	 * be greater than nr_records. This is because the writer
+	 * that succeeded has not updated the nr_records yet.
+	 * This writer could keep trying again until the other writer
+	 * updates nr_records. But if the other writer takes an
+	 * interrupt, and that interrupt locks up that CPU, we do
+	 * not want this CPU to lock up due to the recursion protection,
+	 * and have a bug report showing this CPU as the cause of
+	 * locking up the computer. To not lose this record, this
+	 * writer will simply use the next position to update the
+	 * recursed_functions, and it will update the nr_records
+	 * accordingly.
+	 */
+	if (index < i)
+		index = i;
+	if (index >= CONFIG_FTRACE_RECORD_RECURSION_SIZE)
+		return;
+
+	for (i = index - 1; i >= 0; i--) {
+		if (recursed_functions[i].ip == ip) {
+			cached_function = ip;
+			return;
+		}
+	}
+
+	cached_function = ip;
+
+	/*
+	 * We only want to add a function if it hasn't been added before.
+	 * Add to the current location before incrementing the count.
+	 * If it fails to add, then increment the index (save in i)
+	 * and try again.
+	 */
+	old = cmpxchg(&recursed_functions[index].ip, 0, ip);
+	if (old != 0) {
+		/* Did something else already added this for us? */
+		if (old == ip)
+			return;
+		/* Try the next location (use i for the next index) */
+		index++;
+		goto again;
+	}
+
+	recursed_functions[index].parent_ip = parent_ip;
+
+	/*
+	 * It's still possible that we could race with the clearing
+	 *    CPU0                                    CPU1
+	 *    ----                                    ----
+	 *                                       ip = func
+	 *  nr_records = -1;
+	 *  recursed_functions[0] = 0;
+	 *                                       i = -1
+	 *                                       if (i < 0)
+	 *  nr_records = 0;
+	 *  (new recursion detected)
+	 *      recursed_functions[0] = func
+	 *                                            cmpxchg(recursed_functions[0],
+	 *                                                    func, 0)
+	 *
+	 * But the worse that could happen is that we get a zero in
+	 * the recursed_functions array, and it's likely that "func" will
+	 * be recorded again.
+	 */
+	i = atomic_read(&nr_records);
+	smp_mb__after_atomic();
+	if (i < 0)
+		cmpxchg(&recursed_functions[index].ip, ip, 0);
+	else if (i <= index)
+		atomic_cmpxchg(&nr_records, i, index + 1);
+}
+EXPORT_SYMBOL_GPL(ftrace_record_recursion);
+
+static DEFINE_MUTEX(recursed_function_lock);
+static struct trace_seq *tseq;
+
+static void *recursed_function_seq_start(struct seq_file *m, loff_t *pos)
+{
+	void *ret = NULL;
+	int index;
+
+	mutex_lock(&recursed_function_lock);
+	index = atomic_read(&nr_records);
+	if (*pos < index) {
+		ret = &recursed_functions[*pos];
+	}
+
+	tseq = kzalloc(sizeof(*tseq), GFP_KERNEL);
+	if (!tseq)
+		return ERR_PTR(-ENOMEM);
+
+	trace_seq_init(tseq);
+
+	return ret;
+}
+
+static void *recursed_function_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	int index;
+	int p;
+
+	index = atomic_read(&nr_records);
+	p = ++(*pos);
+
+	return p < index ? &recursed_functions[p] : NULL;
+}
+
+static void recursed_function_seq_stop(struct seq_file *m, void *v)
+{
+	kfree(tseq);
+	mutex_unlock(&recursed_function_lock);
+}
+
+static int recursed_function_seq_show(struct seq_file *m, void *v)
+{
+	struct recursed_functions *record = v;
+	int ret = 0;
+
+	if (record) {
+		trace_seq_print_sym(tseq, record->parent_ip, true);
+		trace_seq_puts(tseq, ":\t");
+		trace_seq_print_sym(tseq, record->ip, true);
+		trace_seq_putc(tseq, '\n');
+		ret = trace_print_seq(m, tseq);
+	}
+
+	return ret;
+}
+
+static const struct seq_operations recursed_function_seq_ops = {
+	.start  = recursed_function_seq_start,
+	.next   = recursed_function_seq_next,
+	.stop   = recursed_function_seq_stop,
+	.show   = recursed_function_seq_show
+};
+
+static int recursed_function_open(struct inode *inode, struct file *file)
+{
+	int ret = 0;
+
+	mutex_lock(&recursed_function_lock);
+	/* If this file was opened for write, then erase contents */
+	if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
+		/* disable updating records */
+		atomic_set(&nr_records, -1);
+		smp_mb__after_atomic();
+		memset(recursed_functions, 0, sizeof(recursed_functions));
+		smp_wmb();
+		/* enable them again */
+		atomic_set(&nr_records, 0);
+	}
+	if (file->f_mode & FMODE_READ)
+		ret = seq_open(file, &recursed_function_seq_ops);
+	mutex_unlock(&recursed_function_lock);
+
+	return ret;
+}
+
+static ssize_t recursed_function_write(struct file *file,
+				       const char __user *buffer,
+				       size_t count, loff_t *ppos)
+{
+	return count;
+}
+
+static int recursed_function_release(struct inode *inode, struct file *file)
+{
+	if (file->f_mode & FMODE_READ)
+		seq_release(inode, file);
+	return 0;
+}
+
+static const struct file_operations recursed_functions_fops = {
+	.open           = recursed_function_open,
+	.write		= recursed_function_write,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = recursed_function_release,
+};
+
+__init static int create_recursed_functions(void)
+{
+	struct dentry *dentry;
+
+	dentry = trace_create_file("recursed_functions", 0644, NULL, NULL,
+				   &recursed_functions_fops);
+	if (!dentry)
+		pr_warn("WARNING: Failed to create recursed_functions\n");
+	return 0;
+}
+
+fs_initcall(create_recursed_functions);
-- 
2.28.0



^ permalink raw reply related

* [for-next][PATCH 05/17] kprobes/ftrace: Add recursion protection to the ftrace callback
From: Steven Rostedt @ 2020-11-12  0:32 UTC (permalink / raw)
  To: linux-kernel
  Cc: Peter Zijlstra, James E.J. Bottomley, Guo Ren, linux-csky,
	H. Peter Anvin, Miroslav Benes, Ingo Molnar, linux-s390,
	Helge Deller, x86, Anil S Keshavamurthy, Christian Borntraeger,
	Naveen N. Rao, Petr Mladek, Vasily Gorbik, Heiko Carstens,
	Jiri Kosina, Borislav Petkov, Josh Poimboeuf, Thomas Gleixner,
	linux-parisc, Masami Hiramatsu, Paul Mackerras, Andrew Morton,
	linuxppc-dev, David S. Miller
In-Reply-To: <20201112003244.764326960@goodmis.org>

From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>

If a ftrace callback does not supply its own recursion protection and
does not set the RECURSION_SAFE flag in its ftrace_ops, then ftrace will
make a helper trampoline to do so before calling the callback instead of
just calling the callback directly.

The default for ftrace_ops is going to change. It will expect that handlers
provide their own recursion protection, unless its ftrace_ops states
otherwise.

Link: https://lkml.kernel.org/r/20201028115613.140212174@goodmis.org
Link: https://lkml.kernel.org/r/20201106023546.944907560@goodmis.org

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Josh  Poimboeuf <jpoimboe@redhat.com>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Miroslav Benes <mbenes@suse.cz>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Helge Deller <deller@gmx.de>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: x86@kernel.org
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: linux-csky@vger.kernel.org
Cc: linux-parisc@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-s390@vger.kernel.org
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 arch/csky/kernel/probes/ftrace.c     | 12 ++++++++++--
 arch/parisc/kernel/ftrace.c          | 16 +++++++++++++---
 arch/powerpc/kernel/kprobes-ftrace.c | 11 ++++++++++-
 arch/s390/kernel/ftrace.c            | 16 +++++++++++++---
 arch/x86/kernel/kprobes/ftrace.c     | 12 ++++++++++--
 5 files changed, 56 insertions(+), 11 deletions(-)

diff --git a/arch/csky/kernel/probes/ftrace.c b/arch/csky/kernel/probes/ftrace.c
index 5264763d05be..5eb2604fdf71 100644
--- a/arch/csky/kernel/probes/ftrace.c
+++ b/arch/csky/kernel/probes/ftrace.c
@@ -13,16 +13,21 @@ int arch_check_ftrace_location(struct kprobe *p)
 void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
 			   struct ftrace_ops *ops, struct pt_regs *regs)
 {
+	int bit;
 	bool lr_saver = false;
 	struct kprobe *p;
 	struct kprobe_ctlblk *kcb;
 
-	/* Preempt is disabled by ftrace */
+	bit = ftrace_test_recursion_trylock();
+	if (bit < 0)
+		return;
+
+	preempt_disable_notrace();
 	p = get_kprobe((kprobe_opcode_t *)ip);
 	if (!p) {
 		p = get_kprobe((kprobe_opcode_t *)(ip - MCOUNT_INSN_SIZE));
 		if (unlikely(!p) || kprobe_disabled(p))
-			return;
+			goto out;
 		lr_saver = true;
 	}
 
@@ -56,6 +61,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
 		 */
 		__this_cpu_write(current_kprobe, NULL);
 	}
+out:
+	preempt_enable_notrace();
+	ftrace_test_recursion_unlock(bit);
 }
 NOKPROBE_SYMBOL(kprobe_ftrace_handler);
 
diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c
index 63e3ecb9da81..13d85042810a 100644
--- a/arch/parisc/kernel/ftrace.c
+++ b/arch/parisc/kernel/ftrace.c
@@ -207,14 +207,21 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
 			   struct ftrace_ops *ops, struct pt_regs *regs)
 {
 	struct kprobe_ctlblk *kcb;
-	struct kprobe *p = get_kprobe((kprobe_opcode_t *)ip);
+	struct kprobe *p;
+	int bit;
 
-	if (unlikely(!p) || kprobe_disabled(p))
+	bit = ftrace_test_recursion_trylock();
+	if (bit < 0)
 		return;
 
+	preempt_disable_notrace();
+	p = get_kprobe((kprobe_opcode_t *)ip);
+	if (unlikely(!p) || kprobe_disabled(p))
+		goto out;
+
 	if (kprobe_running()) {
 		kprobes_inc_nmissed_count(p);
-		return;
+		goto out;
 	}
 
 	__this_cpu_write(current_kprobe, p);
@@ -235,6 +242,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
 		}
 	}
 	__this_cpu_write(current_kprobe, NULL);
+out:
+	preempt_enable_notrace();
+	ftrace_test_recursion_unlock(bit);
 }
 NOKPROBE_SYMBOL(kprobe_ftrace_handler);
 
diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c
index 972cb28174b2..5df8d50c65ae 100644
--- a/arch/powerpc/kernel/kprobes-ftrace.c
+++ b/arch/powerpc/kernel/kprobes-ftrace.c
@@ -18,10 +18,16 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip,
 {
 	struct kprobe *p;
 	struct kprobe_ctlblk *kcb;
+	int bit;
 
+	bit = ftrace_test_recursion_trylock();
+	if (bit < 0)
+		return;
+
+	preempt_disable_notrace();
 	p = get_kprobe((kprobe_opcode_t *)nip);
 	if (unlikely(!p) || kprobe_disabled(p))
-		return;
+		goto out;
 
 	kcb = get_kprobe_ctlblk();
 	if (kprobe_running()) {
@@ -52,6 +58,9 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip,
 		 */
 		__this_cpu_write(current_kprobe, NULL);
 	}
+out:
+	preempt_enable_notrace();
+	ftrace_test_recursion_unlock(bit);
 }
 NOKPROBE_SYMBOL(kprobe_ftrace_handler);
 
diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c
index b388e87a08bf..8f31c726537a 100644
--- a/arch/s390/kernel/ftrace.c
+++ b/arch/s390/kernel/ftrace.c
@@ -201,14 +201,21 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
 		struct ftrace_ops *ops, struct pt_regs *regs)
 {
 	struct kprobe_ctlblk *kcb;
-	struct kprobe *p = get_kprobe((kprobe_opcode_t *)ip);
+	struct kprobe *p;
+	int bit;
 
-	if (unlikely(!p) || kprobe_disabled(p))
+	bit = ftrace_test_recursion_trylock();
+	if (bit < 0)
 		return;
 
+	preempt_disable_notrace();
+	p = get_kprobe((kprobe_opcode_t *)ip);
+	if (unlikely(!p) || kprobe_disabled(p))
+		goto out;
+
 	if (kprobe_running()) {
 		kprobes_inc_nmissed_count(p);
-		return;
+		goto out;
 	}
 
 	__this_cpu_write(current_kprobe, p);
@@ -228,6 +235,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
 		}
 	}
 	__this_cpu_write(current_kprobe, NULL);
+out:
+	preempt_enable_notrace();
+	ftrace_test_recursion_unlock(bit);
 }
 NOKPROBE_SYMBOL(kprobe_ftrace_handler);
 
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 681a4b36e9bb..a40a6cdfcca3 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -18,11 +18,16 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
 {
 	struct kprobe *p;
 	struct kprobe_ctlblk *kcb;
+	int bit;
 
-	/* Preempt is disabled by ftrace */
+	bit = ftrace_test_recursion_trylock();
+	if (bit < 0)
+		return;
+
+	preempt_disable_notrace();
 	p = get_kprobe((kprobe_opcode_t *)ip);
 	if (unlikely(!p) || kprobe_disabled(p))
-		return;
+		goto out;
 
 	kcb = get_kprobe_ctlblk();
 	if (kprobe_running()) {
@@ -52,6 +57,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
 		 */
 		__this_cpu_write(current_kprobe, NULL);
 	}
+out:
+	preempt_enable_notrace();
+	ftrace_test_recursion_unlock(bit);
 }
 NOKPROBE_SYMBOL(kprobe_ftrace_handler);
 
-- 
2.28.0



^ permalink raw reply related

* Re: [PATCH] powerpc/powernv/memtrace: Fake non-memblock aligned sized traces
From: Michael Neuling @ 2020-11-11 21:03 UTC (permalink / raw)
  To: Jordan Niethe, linuxppc-dev; +Cc: Rashmica Gupta
In-Reply-To: <20201111055524.2458-1-jniethe5@gmail.com>

CC Rashmica Gupta <rashmicy@gmail.com>

On Wed, 2020-11-11 at 16:55 +1100, Jordan Niethe wrote:
> The hardware trace macros which use the memory provided by memtrace are
> able to use trace sizes as small as 16MB. Only memblock aligned values
> can be removed from each NUMA node by writing that value to
> memtrace/enable in debugfs.  This means setting up, say, a 16MB trace is
> not possible.  To allow such a trace size, instead align whatever value
> is written to memtrace/enable to the memblock size for the purpose of
> removing it from each NUMA node but report the written value from
> memtrace/enable and memtrace/x/size in debugfs.
> 
> Signed-off-by: Jordan Niethe <jniethe5@gmail.com>
> ---
>  arch/powerpc/platforms/powernv/memtrace.c | 20 ++++++--------------
>  1 file changed, 6 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/memtrace.c
> b/arch/powerpc/platforms/powernv/memtrace.c
> index 6828108486f8..1188bc8fd090 100644
> --- a/arch/powerpc/platforms/powernv/memtrace.c
> +++ b/arch/powerpc/platforms/powernv/memtrace.c
> @@ -191,7 +191,7 @@ static int memtrace_init_debugfs(void)
>  		ent->dir = dir;
>  		debugfs_create_file("trace", 0400, dir, ent, &memtrace_fops);
>  		debugfs_create_x64("start", 0400, dir, &ent->start);
> -		debugfs_create_x64("size", 0400, dir, &ent->size);
> +		debugfs_create_x64("size", 0400, dir, &memtrace_size);
>  	}
>  
>  	return ret;
> @@ -259,33 +259,25 @@ static int memtrace_enable_set(void *data, u64 val)
>  {
>  	u64 bytes;
>  
> -	/*
> -	 * Don't attempt to do anything if size isn't aligned to a memory
> -	 * block or equal to zero.
> -	 */
> -	bytes = memory_block_size_bytes();
> -	if (val & (bytes - 1)) {
> -		pr_err("Value must be aligned with 0x%llx\n", bytes);
> -		return -EINVAL;
> -	}
> -
>  	/* Re-add/online previously removed/offlined memory */
>  	if (memtrace_size) {
>  		if (memtrace_online())
>  			return -EAGAIN;
>  	}
>  
> +	memtrace_size = val;
> +
>  	if (!val)
>  		return 0;
>  
> -	/* Offline and remove memory */
> -	if (memtrace_init_regions_runtime(val))
> +	/* Offline and remove memory aligned to memory blocks */
> +	bytes = memory_block_size_bytes();
> +	if (memtrace_init_regions_runtime(ALIGN(val, bytes)))
>  		return -EINVAL;
>  
>  	if (memtrace_init_debugfs())
>  		return -EINVAL;
>  
> -	memtrace_size = val;
>  
>  	return 0;
>  }

^ permalink raw reply

* Re: [PATCH v4 10/18] dt-bindings: usb: Convert DWC USB3 bindings to DT schema
From: Rob Herring @ 2020-11-11 20:14 UTC (permalink / raw)
  To: Serge Semin
  Cc: Neil Armstrong, Bjorn Andersson, Pavel Parkhomenko, Kevin Hilman,
	Krzysztof Kozlowski, Andy Gross, Chunfeng Yun, linux-snps-arc,
	devicetree, Mathias Nyman, Martin Blumenstingl, Lad Prabhakar,
	Alexey Malahov, linux-arm-kernel, Roger Quadros, Felipe Balbi,
	Greg Kroah-Hartman, Yoshihiro Shimoda, linux-usb, linux-mips,
	Serge Semin, linux-kernel, Manu Gautam, linuxppc-dev
In-Reply-To: <20201111090853.14112-11-Sergey.Semin@baikalelectronics.ru>

On Wed, Nov 11, 2020 at 12:08:45PM +0300, Serge Semin wrote:
> DWC USB3 DT node is supposed to be compliant with the Generic xHCI
> Controller schema, but with additional vendor-specific properties, the
> controller-specific reference clocks and PHYs. So let's convert the
> currently available legacy text-based DWC USB3 bindings to the DT schema
> and make sure the DWC USB3 nodes are also validated against the
> usb-xhci.yaml schema.
> 
> Note we have to discard the nodename restriction of being prefixed with
> "dwc3@" string, since in accordance with the usb-hcd.yaml schema USB nodes
> are supposed to be named as "^usb(@.*)".
> 
> Signed-off-by: Serge Semin <Sergey.Semin@baikalelectronics.ru>
> 
> ---
> 
> Changelog v2:
> - Discard '|' from the descriptions, since we don't need to preserve
>   the text formatting in any of them.
> - Drop quotes from around the string constants.
> - Fix the "clock-names" prop description to be referring the enumerated
>   clock-names instead of the ones from the Databook.
> 
> Changelog v3:
> - Apply usb-xhci.yaml# schema only if the controller is supposed to work
>   as either host or otg.
> 
> Changelog v4:
> - Apply usb-drd.yaml schema first. If the controller is configured
>   to work in a gadget mode only, then apply the usb.yaml schema too,
>   otherwise apply the usb-xhci.yaml schema.
> - Discard the Rob'es Reviewed-by tag. Please review the patch one more
>   time.
> ---
>  .../devicetree/bindings/usb/dwc3.txt          | 125 --------
>  .../devicetree/bindings/usb/snps,dwc3.yaml    | 303 ++++++++++++++++++
>  2 files changed, 303 insertions(+), 125 deletions(-)
>  delete mode 100644 Documentation/devicetree/bindings/usb/dwc3.txt
>  create mode 100644 Documentation/devicetree/bindings/usb/snps,dwc3.yaml
> 
> diff --git a/Documentation/devicetree/bindings/usb/dwc3.txt b/Documentation/devicetree/bindings/usb/dwc3.txt
> deleted file mode 100644
> index d03edf9d3935..000000000000
> --- a/Documentation/devicetree/bindings/usb/dwc3.txt
> +++ /dev/null
> @@ -1,125 +0,0 @@
> -synopsys DWC3 CORE
> -
> -DWC3- USB3 CONTROLLER. Complies to the generic USB binding properties
> -      as described in 'usb/generic.txt'
> -
> -Required properties:
> - - compatible: must be "snps,dwc3"
> - - reg : Address and length of the register set for the device
> - - interrupts: Interrupts used by the dwc3 controller.
> - - clock-names: list of clock names. Ideally should be "ref",
> -                "bus_early", "suspend" but may be less or more.
> - - clocks: list of phandle and clock specifier pairs corresponding to
> -           entries in the clock-names property.
> -
> -Exception for clocks:
> -  clocks are optional if the parent node (i.e. glue-layer) is compatible to
> -  one of the following:
> -    "cavium,octeon-7130-usb-uctl"
> -    "qcom,dwc3"
> -    "samsung,exynos5250-dwusb3"
> -    "samsung,exynos5433-dwusb3"
> -    "samsung,exynos7-dwusb3"
> -    "sprd,sc9860-dwc3"
> -    "st,stih407-dwc3"
> -    "ti,am437x-dwc3"
> -    "ti,dwc3"
> -    "ti,keystone-dwc3"
> -    "rockchip,rk3399-dwc3"
> -    "xlnx,zynqmp-dwc3"
> -
> -Optional properties:
> - - usb-phy : array of phandle for the PHY device.  The first element
> -   in the array is expected to be a handle to the USB2/HS PHY and
> -   the second element is expected to be a handle to the USB3/SS PHY
> - - phys: from the *Generic PHY* bindings
> - - phy-names: from the *Generic PHY* bindings; supported names are "usb2-phy"
> -	or "usb3-phy".
> - - resets: set of phandle and reset specifier pairs
> - - snps,usb2-lpm-disable: indicate if we don't want to enable USB2 HW LPM
> - - snps,usb3_lpm_capable: determines if platform is USB3 LPM capable
> - - snps,dis-start-transfer-quirk: when set, disable isoc START TRANSFER command
> -			failure SW work-around for DWC_usb31 version 1.70a-ea06
> -			and prior.
> - - snps,disable_scramble_quirk: true when SW should disable data scrambling.
> -	Only really useful for FPGA builds.
> - - snps,has-lpm-erratum: true when DWC3 was configured with LPM Erratum enabled
> - - snps,lpm-nyet-threshold: LPM NYET threshold
> - - snps,u2exit_lfps_quirk: set if we want to enable u2exit lfps quirk
> - - snps,u2ss_inp3_quirk: set if we enable P3 OK for U2/SS Inactive quirk
> - - snps,req_p1p2p3_quirk: when set, the core will always request for
> -			P1/P2/P3 transition sequence.
> - - snps,del_p1p2p3_quirk: when set core will delay P1/P2/P3 until a certain
> -			amount of 8B10B errors occur.
> - - snps,del_phy_power_chg_quirk: when set core will delay PHY power change
> -			from P0 to P1/P2/P3.
> - - snps,lfps_filter_quirk: when set core will filter LFPS reception.
> - - snps,rx_detect_poll_quirk: when set core will disable a 400us delay to start
> -			Polling LFPS after RX.Detect.
> - - snps,tx_de_emphasis_quirk: when set core will set Tx de-emphasis value.
> - - snps,tx_de_emphasis: the value driven to the PHY is controlled by the
> -			LTSSM during USB3 Compliance mode.
> - - snps,dis_u3_susphy_quirk: when set core will disable USB3 suspend phy.
> - - snps,dis_u2_susphy_quirk: when set core will disable USB2 suspend phy.
> - - snps,dis_enblslpm_quirk: when set clears the enblslpm in GUSB2PHYCFG,
> -			disabling the suspend signal to the PHY.
> - - snps,dis-u1-entry-quirk: set if link entering into U1 needs to be disabled.
> - - snps,dis-u2-entry-quirk: set if link entering into U2 needs to be disabled.
> - - snps,dis_rxdet_inp3_quirk: when set core will disable receiver detection
> -			in PHY P3 power state.
> - - snps,dis-u2-freeclk-exists-quirk: when set, clear the u2_freeclk_exists
> -			in GUSB2PHYCFG, specify that USB2 PHY doesn't provide
> -			a free-running PHY clock.
> - - snps,dis-del-phy-power-chg-quirk: when set core will change PHY power
> -			from P0 to P1/P2/P3 without delay.
> - - snps,dis-tx-ipgap-linecheck-quirk: when set, disable u2mac linestate check
> -			during HS transmit.
> - - snps,parkmode-disable-ss-quirk: when set, all SuperSpeed bus instances in
> -			park mode are disabled.
> - - snps,dis_metastability_quirk: when set, disable metastability workaround.
> -			CAUTION: use only if you are absolutely sure of it.
> - - snps,is-utmi-l1-suspend: true when DWC3 asserts output signal
> -			utmi_l1_suspend_n, false when asserts utmi_sleep_n
> - - snps,hird-threshold: HIRD threshold
> - - snps,hsphy_interface: High-Speed PHY interface selection between "utmi" for
> -   UTMI+ and "ulpi" for ULPI when the DWC_USB3_HSPHY_INTERFACE has value 3.
> - - snps,quirk-frame-length-adjustment: Value for GFLADJ_30MHZ field of GFLADJ
> -	register for post-silicon frame length adjustment when the
> -	fladj_30mhz_sdbnd signal is invalid or incorrect.
> - - snps,rx-thr-num-pkt-prd: periodic ESS RX packet threshold count - host mode
> -			only. Set this and rx-max-burst-prd to a valid,
> -			non-zero value 1-16 (DWC_usb31 programming guide
> -			section 1.2.4) to enable periodic ESS RX threshold.
> - - snps,rx-max-burst-prd: max periodic ESS RX burst size - host mode only. Set
> -			this and rx-thr-num-pkt-prd to a valid, non-zero value
> -			1-16 (DWC_usb31 programming guide section 1.2.4) to
> -			enable periodic ESS RX threshold.
> - - snps,tx-thr-num-pkt-prd: periodic ESS TX packet threshold count - host mode
> -			only. Set this and tx-max-burst-prd to a valid,
> -			non-zero value 1-16 (DWC_usb31 programming guide
> -			section 1.2.3) to enable periodic ESS TX threshold.
> - - snps,tx-max-burst-prd: max periodic ESS TX burst size - host mode only. Set
> -			this and tx-thr-num-pkt-prd to a valid, non-zero value
> -			1-16 (DWC_usb31 programming guide section 1.2.3) to
> -			enable periodic ESS TX threshold.
> -
> - - <DEPRECATED> tx-fifo-resize: determines if the FIFO *has* to be reallocated.
> - - snps,incr-burst-type-adjustment: Value for INCR burst type of GSBUSCFG0
> -			register, undefined length INCR burst type enable and INCRx type.
> -			When just one value, which means INCRX burst mode enabled. When
> -			more than one value, which means undefined length INCR burst type
> -			enabled. The values can be 1, 4, 8, 16, 32, 64, 128 and 256.
> -
> - - in addition all properties from usb-xhci.txt from the current directory are
> -   supported as well
> -
> -
> -This is usually a subnode to DWC3 glue to which it is connected.
> -
> -dwc3@4a030000 {
> -	compatible = "snps,dwc3";
> -	reg = <0x4a030000 0xcfff>;
> -	interrupts = <0 92 4>
> -	usb-phy = <&usb2_phy>, <&usb3,phy>;
> -	snps,incr-burst-type-adjustment = <1>, <4>, <8>, <16>;
> -};
> diff --git a/Documentation/devicetree/bindings/usb/snps,dwc3.yaml b/Documentation/devicetree/bindings/usb/snps,dwc3.yaml
> new file mode 100644
> index 000000000000..079617891da6
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/usb/snps,dwc3.yaml
> @@ -0,0 +1,303 @@
> +# SPDX-License-Identifier: GPL-2.0
> +%YAML 1.2
> +---
> +$id: http://devicetree.org/schemas/usb/snps,dwc3.yaml#
> +$schema: http://devicetree.org/meta-schemas/core.yaml#
> +
> +title: Synopsys DesignWare USB3 Controller
> +
> +maintainers:
> +  - Felipe Balbi <balbi@kernel.org>
> +
> +description:
> +  This is usually a subnode to DWC3 glue to which it is connected, but can also
> +  be presented as a standalone DT node with an optional vendor-specific
> +  compatible string.
> +
> +allOf:
> +  - $ref: usb-drd.yaml#
> +  - if:
> +      properties:
> +        dr_mode:
> +          const: peripheral
> +    then:
> +      $ref: usb.yaml#

This part could be done in usb-drd.yaml?

> +    else:
> +      $ref: usb-xhci.yaml#

I'd really prefer if all the schema can just be applied unconditionally. 
Shouldn't someone (like a bootloader) be able to change dr_mode without 
changing anything else to set the mode? That would imply all the 
schemas can be applied.

Rob

^ permalink raw reply

* Re: [PATCH v4 01/18] dt-bindings: usb: usb-hcd: Detach generic USB controller properties
From: Serge Semin @ 2020-11-11 19:30 UTC (permalink / raw)
  To: Rob Herring
  Cc: Neil Armstrong, linux-mips, Pavel Parkhomenko, Kevin Hilman,
	Krzysztof Kozlowski, Andy Gross, Chunfeng Yun, linux-snps-arc,
	devicetree, Mathias Nyman, Martin Blumenstingl, Lad Prabhakar,
	Alexey Malahov, Rob Herring, Bjorn Andersson, linux-arm-kernel,
	Roger Quadros, Felipe Balbi, Greg Kroah-Hartman,
	Yoshihiro Shimoda, linux-usb, linux-kernel, Serge Semin,
	Manu Gautam, linuxppc-dev
In-Reply-To: <20201111191640.GA1857205@bogus>

On Wed, Nov 11, 2020 at 01:16:40PM -0600, Rob Herring wrote:
> On Wed, 11 Nov 2020 12:08:36 +0300, Serge Semin wrote:
> > There can be three distinctive types of the USB controllers: USB hosts,
> > USB peripherals/gadgets and USB OTG, which can switch from one role to
> > another. In order to have that hierarchy handled in the DT binding files,
> > we need to collect common properties in a common DT schema and specific
> > properties in dedicated schemas. Seeing the usb-hcd.yaml DT schema is
> > dedicated for the USB host controllers only, let's move some common
> > properties from there into the usb.yaml schema. So the later would be
> > available to evaluate all currently supported types of the USB
> > controllers.
> > 
> > While at it add an explicit "additionalProperties: true" into the
> > usb-hcd.yaml as setting the additionalProperties/unevaluateProperties
> > properties is going to be get mandatory soon.
> > 
> > Signed-off-by: Serge Semin <Sergey.Semin@baikalelectronics.ru>
> > 
> > ---
> > 
> > Changelog v4:
> > - This is a new patch created as a result of the comment left
> >   by Chunfeng Yun in v3
> > ---
> >  .../devicetree/bindings/usb/usb-hcd.yaml      | 14 ++-------
> >  .../devicetree/bindings/usb/usb.yaml          | 29 +++++++++++++++++++
> >  2 files changed, 32 insertions(+), 11 deletions(-)
> >  create mode 100644 Documentation/devicetree/bindings/usb/usb.yaml
> > 
> 
> 
> My bot found errors running 'make dt_binding_check' on your patch:
> 

> yamllint warnings/errors:
> ./Documentation/devicetree/bindings/usb/usb-hcd.yaml:17:1: [error] duplication of key "additionalProperties" in mapping (key-duplicates)

Oh my. Don't know how this has slipped in. It's even more weird given
that I've performed dt_binding_check before sending the patches out.
Anyway I'll fix the duplication in v5. Please proceed with the series
review.

-Sergey

> 
> dtschema/dtc warnings/errors:
> Traceback (most recent call last):
>   File "/usr/local/bin/dt-extract-example", line 45, in <module>
>     binding = yaml.load(open(args.yamlfile, encoding='utf-8').read())
>   File "/usr/local/lib/python3.8/dist-packages/ruamel/yaml/main.py", line 343, in load
>     return constructor.get_single_data()
>   File "/usr/local/lib/python3.8/dist-packages/ruamel/yaml/constructor.py", line 113, in get_single_data
>     return self.construct_document(node)
>   File "/usr/local/lib/python3.8/dist-packages/ruamel/yaml/constructor.py", line 123, in construct_document
>     for _dummy in generator:
>   File "/usr/local/lib/python3.8/dist-packages/ruamel/yaml/constructor.py", line 723, in construct_yaml_map
>     value = self.construct_mapping(node)
>   File "/usr/local/lib/python3.8/dist-packages/ruamel/yaml/constructor.py", line 440, in construct_mapping
>     return BaseConstructor.construct_mapping(self, node, deep=deep)
>   File "/usr/local/lib/python3.8/dist-packages/ruamel/yaml/constructor.py", line 257, in construct_mapping
>     if self.check_mapping_key(node, key_node, mapping, key, value):
>   File "/usr/local/lib/python3.8/dist-packages/ruamel/yaml/constructor.py", line 295, in check_mapping_key
>     raise DuplicateKeyError(*args)
> ruamel.yaml.constructor.DuplicateKeyError: while constructing a mapping
>   in "<unicode string>", line 4, column 1
> found duplicate key "additionalProperties" with value "True" (original value: "True")
>   in "<unicode string>", line 17, column 1
> 
> To suppress this check see:
>     http://yaml.readthedocs.io/en/latest/api.html#duplicate-keys
> 
> Duplicate keys will become an error in future releases, and are errors
> by default when using the new API.
> 
> make[1]: *** [Documentation/devicetree/bindings/Makefile:20: Documentation/devicetree/bindings/usb/usb-hcd.example.dts] Error 1
> make[1]: *** Deleting file 'Documentation/devicetree/bindings/usb/usb-hcd.example.dts'
> make[1]: *** Waiting for unfinished jobs....
> make[1]: *** [Documentation/devicetree/bindings/Makefile:59: Documentation/devicetree/bindings/processed-schema-examples.json] Error 123
> make: *** [Makefile:1364: dt_binding_check] Error 2
> 
> 
> See https://patchwork.ozlabs.org/patch/1398034
> 
> The base for the patch is generally the last rc1. Any dependencies
> should be noted.
> 
> If you already ran 'make dt_binding_check' and didn't see the above
> error(s), then make sure 'yamllint' is installed and dt-schema is up to
> date:
> 
> pip3 install dtschema --upgrade
> 
> Please check and re-submit.
> 

^ permalink raw reply

* Re: [PATCH v4 01/18] dt-bindings: usb: usb-hcd: Detach generic USB controller properties
From: Rob Herring @ 2020-11-11 19:16 UTC (permalink / raw)
  To: Serge Semin
  Cc: Neil Armstrong, linux-mips, Pavel Parkhomenko, Kevin Hilman,
	Krzysztof Kozlowski, Andy Gross, Chunfeng Yun, linux-snps-arc,
	devicetree, Mathias Nyman, Martin Blumenstingl, Lad Prabhakar,
	Alexey Malahov, Rob Herring, Bjorn Andersson, linux-arm-kernel,
	Roger Quadros, Felipe Balbi, Greg Kroah-Hartman,
	Yoshihiro Shimoda, linux-usb, linux-kernel, Serge Semin,
	Manu Gautam, linuxppc-dev
In-Reply-To: <20201111090853.14112-2-Sergey.Semin@baikalelectronics.ru>

On Wed, 11 Nov 2020 12:08:36 +0300, Serge Semin wrote:
> There can be three distinctive types of the USB controllers: USB hosts,
> USB peripherals/gadgets and USB OTG, which can switch from one role to
> another. In order to have that hierarchy handled in the DT binding files,
> we need to collect common properties in a common DT schema and specific
> properties in dedicated schemas. Seeing the usb-hcd.yaml DT schema is
> dedicated for the USB host controllers only, let's move some common
> properties from there into the usb.yaml schema. So the later would be
> available to evaluate all currently supported types of the USB
> controllers.
> 
> While at it add an explicit "additionalProperties: true" into the
> usb-hcd.yaml as setting the additionalProperties/unevaluateProperties
> properties is going to be get mandatory soon.
> 
> Signed-off-by: Serge Semin <Sergey.Semin@baikalelectronics.ru>
> 
> ---
> 
> Changelog v4:
> - This is a new patch created as a result of the comment left
>   by Chunfeng Yun in v3
> ---
>  .../devicetree/bindings/usb/usb-hcd.yaml      | 14 ++-------
>  .../devicetree/bindings/usb/usb.yaml          | 29 +++++++++++++++++++
>  2 files changed, 32 insertions(+), 11 deletions(-)
>  create mode 100644 Documentation/devicetree/bindings/usb/usb.yaml
> 


My bot found errors running 'make dt_binding_check' on your patch:

yamllint warnings/errors:
./Documentation/devicetree/bindings/usb/usb-hcd.yaml:17:1: [error] duplication of key "additionalProperties" in mapping (key-duplicates)

dtschema/dtc warnings/errors:
Traceback (most recent call last):
  File "/usr/local/bin/dt-extract-example", line 45, in <module>
    binding = yaml.load(open(args.yamlfile, encoding='utf-8').read())
  File "/usr/local/lib/python3.8/dist-packages/ruamel/yaml/main.py", line 343, in load
    return constructor.get_single_data()
  File "/usr/local/lib/python3.8/dist-packages/ruamel/yaml/constructor.py", line 113, in get_single_data
    return self.construct_document(node)
  File "/usr/local/lib/python3.8/dist-packages/ruamel/yaml/constructor.py", line 123, in construct_document
    for _dummy in generator:
  File "/usr/local/lib/python3.8/dist-packages/ruamel/yaml/constructor.py", line 723, in construct_yaml_map
    value = self.construct_mapping(node)
  File "/usr/local/lib/python3.8/dist-packages/ruamel/yaml/constructor.py", line 440, in construct_mapping
    return BaseConstructor.construct_mapping(self, node, deep=deep)
  File "/usr/local/lib/python3.8/dist-packages/ruamel/yaml/constructor.py", line 257, in construct_mapping
    if self.check_mapping_key(node, key_node, mapping, key, value):
  File "/usr/local/lib/python3.8/dist-packages/ruamel/yaml/constructor.py", line 295, in check_mapping_key
    raise DuplicateKeyError(*args)
ruamel.yaml.constructor.DuplicateKeyError: while constructing a mapping
  in "<unicode string>", line 4, column 1
found duplicate key "additionalProperties" with value "True" (original value: "True")
  in "<unicode string>", line 17, column 1

To suppress this check see:
    http://yaml.readthedocs.io/en/latest/api.html#duplicate-keys

Duplicate keys will become an error in future releases, and are errors
by default when using the new API.

make[1]: *** [Documentation/devicetree/bindings/Makefile:20: Documentation/devicetree/bindings/usb/usb-hcd.example.dts] Error 1
make[1]: *** Deleting file 'Documentation/devicetree/bindings/usb/usb-hcd.example.dts'
make[1]: *** Waiting for unfinished jobs....
make[1]: *** [Documentation/devicetree/bindings/Makefile:59: Documentation/devicetree/bindings/processed-schema-examples.json] Error 123
make: *** [Makefile:1364: dt_binding_check] Error 2


See https://patchwork.ozlabs.org/patch/1398034

The base for the patch is generally the last rc1. Any dependencies
should be noted.

If you already ran 'make dt_binding_check' and didn't see the above
error(s), then make sure 'yamllint' is installed and dt-schema is up to
date:

pip3 install dtschema --upgrade

Please check and re-submit.


^ permalink raw reply

* Re: [PATCH 3/3] powerpc: rewrite atomics to use ARCH_ATOMIC
From: kernel test robot @ 2020-11-11 19:07 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev
  Cc: Christophe Leroy, kbuild-all, Arnd Bergmann, Peter Zijlstra,
	Boqun Feng, linux-kernel, Nicholas Piggin, Alexey Kardashevskiy,
	Will Deacon
In-Reply-To: <20201111110723.3148665-4-npiggin@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 13242 bytes --]

Hi Nicholas,

I love your patch! Perhaps something to improve:

[auto build test WARNING on powerpc/next]
[also build test WARNING on asm-generic/master linus/master v5.10-rc3 next-20201111]
[cannot apply to scottwood/next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Nicholas-Piggin/powerpc-convert-to-use-ARCH_ATOMIC/20201111-190941
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-allyesconfig (attached as .config)
compiler: powerpc64-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/9e1bec8fe216b0745c647e52c40d1f0033fb4efd
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Nicholas-Piggin/powerpc-convert-to-use-ARCH_ATOMIC/20201111-190941
        git checkout 9e1bec8fe216b0745c647e52c40d1f0033fb4efd
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=powerpc 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   In file included from arch/powerpc/include/asm/atomic.h:11,
                    from include/linux/atomic.h:7,
                    from include/linux/rcupdate.h:25,
                    from include/linux/rculist.h:11,
                    from include/linux/sched/signal.h:5,
                    from drivers/gpu/drm/drm_lock.c:37:
   drivers/gpu/drm/drm_lock.c: In function 'drm_lock_take':
>> arch/powerpc/include/asm/cmpxchg.h:463:41: warning: passing argument 1 of '__cmpxchg_relaxed' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers]
     463 |  (__typeof__(*(ptr))) __cmpxchg_relaxed((ptr),   \
         |                                         ^~~~~
   include/linux/atomic.h:73:9: note: in expansion of macro 'arch_cmpxchg_relaxed'
      73 |  typeof(op##_relaxed(args)) __ret;    \
         |         ^~
   include/linux/atomic-arch-fallback.h:52:2: note: in expansion of macro '__atomic_op_fence'
      52 |  __atomic_op_fence(arch_cmpxchg, __VA_ARGS__)
         |  ^~~~~~~~~~~~~~~~~
   include/asm-generic/atomic-instrumented.h:1685:2: note: in expansion of macro 'arch_cmpxchg'
    1685 |  arch_cmpxchg(__ai_ptr, __VA_ARGS__);    \
         |  ^~~~~~~~~~~~
   drivers/gpu/drm/drm_lock.c:75:10: note: in expansion of macro 'cmpxchg'
      75 |   prev = cmpxchg(lock, old, new);
         |          ^~~~~~~
   arch/powerpc/include/asm/cmpxchg.h:432:25: note: expected 'void *' but argument is of type 'volatile unsigned int *'
     432 | __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
         |                   ~~~~~~^~~
>> arch/powerpc/include/asm/cmpxchg.h:463:41: warning: passing argument 1 of '__cmpxchg_relaxed' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers]
     463 |  (__typeof__(*(ptr))) __cmpxchg_relaxed((ptr),   \
         |                                         ^~~~~
   include/linux/atomic.h:75:10: note: in expansion of macro 'arch_cmpxchg_relaxed'
      75 |  __ret = op##_relaxed(args);     \
         |          ^~
   include/linux/atomic-arch-fallback.h:52:2: note: in expansion of macro '__atomic_op_fence'
      52 |  __atomic_op_fence(arch_cmpxchg, __VA_ARGS__)
         |  ^~~~~~~~~~~~~~~~~
   include/asm-generic/atomic-instrumented.h:1685:2: note: in expansion of macro 'arch_cmpxchg'
    1685 |  arch_cmpxchg(__ai_ptr, __VA_ARGS__);    \
         |  ^~~~~~~~~~~~
   drivers/gpu/drm/drm_lock.c:75:10: note: in expansion of macro 'cmpxchg'
      75 |   prev = cmpxchg(lock, old, new);
         |          ^~~~~~~
   arch/powerpc/include/asm/cmpxchg.h:432:25: note: expected 'void *' but argument is of type 'volatile unsigned int *'
     432 | __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
         |                   ~~~~~~^~~
   drivers/gpu/drm/drm_lock.c: In function 'drm_lock_transfer':
>> arch/powerpc/include/asm/cmpxchg.h:463:41: warning: passing argument 1 of '__cmpxchg_relaxed' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers]
     463 |  (__typeof__(*(ptr))) __cmpxchg_relaxed((ptr),   \
         |                                         ^~~~~
   include/linux/atomic.h:73:9: note: in expansion of macro 'arch_cmpxchg_relaxed'
      73 |  typeof(op##_relaxed(args)) __ret;    \
         |         ^~
   include/linux/atomic-arch-fallback.h:52:2: note: in expansion of macro '__atomic_op_fence'
      52 |  __atomic_op_fence(arch_cmpxchg, __VA_ARGS__)
         |  ^~~~~~~~~~~~~~~~~
   include/asm-generic/atomic-instrumented.h:1685:2: note: in expansion of macro 'arch_cmpxchg'
    1685 |  arch_cmpxchg(__ai_ptr, __VA_ARGS__);    \
         |  ^~~~~~~~~~~~
   drivers/gpu/drm/drm_lock.c:118:10: note: in expansion of macro 'cmpxchg'
     118 |   prev = cmpxchg(lock, old, new);
         |          ^~~~~~~
   arch/powerpc/include/asm/cmpxchg.h:432:25: note: expected 'void *' but argument is of type 'volatile unsigned int *'
     432 | __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
         |                   ~~~~~~^~~
>> arch/powerpc/include/asm/cmpxchg.h:463:41: warning: passing argument 1 of '__cmpxchg_relaxed' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers]
     463 |  (__typeof__(*(ptr))) __cmpxchg_relaxed((ptr),   \
         |                                         ^~~~~
   include/linux/atomic.h:75:10: note: in expansion of macro 'arch_cmpxchg_relaxed'
      75 |  __ret = op##_relaxed(args);     \
         |          ^~
   include/linux/atomic-arch-fallback.h:52:2: note: in expansion of macro '__atomic_op_fence'
      52 |  __atomic_op_fence(arch_cmpxchg, __VA_ARGS__)
         |  ^~~~~~~~~~~~~~~~~
   include/asm-generic/atomic-instrumented.h:1685:2: note: in expansion of macro 'arch_cmpxchg'
    1685 |  arch_cmpxchg(__ai_ptr, __VA_ARGS__);    \
         |  ^~~~~~~~~~~~
   drivers/gpu/drm/drm_lock.c:118:10: note: in expansion of macro 'cmpxchg'
     118 |   prev = cmpxchg(lock, old, new);
         |          ^~~~~~~
   arch/powerpc/include/asm/cmpxchg.h:432:25: note: expected 'void *' but argument is of type 'volatile unsigned int *'
     432 | __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
         |                   ~~~~~~^~~
   drivers/gpu/drm/drm_lock.c: In function 'drm_legacy_lock_free':
>> arch/powerpc/include/asm/cmpxchg.h:463:41: warning: passing argument 1 of '__cmpxchg_relaxed' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers]
     463 |  (__typeof__(*(ptr))) __cmpxchg_relaxed((ptr),   \
         |                                         ^~~~~
   include/linux/atomic.h:73:9: note: in expansion of macro 'arch_cmpxchg_relaxed'
      73 |  typeof(op##_relaxed(args)) __ret;    \
         |         ^~
   include/linux/atomic-arch-fallback.h:52:2: note: in expansion of macro '__atomic_op_fence'
      52 |  __atomic_op_fence(arch_cmpxchg, __VA_ARGS__)
         |  ^~~~~~~~~~~~~~~~~
   include/asm-generic/atomic-instrumented.h:1685:2: note: in expansion of macro 'arch_cmpxchg'
    1685 |  arch_cmpxchg(__ai_ptr, __VA_ARGS__);    \
         |  ^~~~~~~~~~~~
   drivers/gpu/drm/drm_lock.c:141:10: note: in expansion of macro 'cmpxchg'
     141 |   prev = cmpxchg(lock, old, new);
         |          ^~~~~~~
   arch/powerpc/include/asm/cmpxchg.h:432:25: note: expected 'void *' but argument is of type 'volatile unsigned int *'
     432 | __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
         |                   ~~~~~~^~~
>> arch/powerpc/include/asm/cmpxchg.h:463:41: warning: passing argument 1 of '__cmpxchg_relaxed' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers]
     463 |  (__typeof__(*(ptr))) __cmpxchg_relaxed((ptr),   \
         |                                         ^~~~~
   include/linux/atomic.h:75:10: note: in expansion of macro 'arch_cmpxchg_relaxed'
      75 |  __ret = op##_relaxed(args);     \
         |          ^~
   include/linux/atomic-arch-fallback.h:52:2: note: in expansion of macro '__atomic_op_fence'
      52 |  __atomic_op_fence(arch_cmpxchg, __VA_ARGS__)
         |  ^~~~~~~~~~~~~~~~~
   include/asm-generic/atomic-instrumented.h:1685:2: note: in expansion of macro 'arch_cmpxchg'
    1685 |  arch_cmpxchg(__ai_ptr, __VA_ARGS__);    \
         |  ^~~~~~~~~~~~
   drivers/gpu/drm/drm_lock.c:141:10: note: in expansion of macro 'cmpxchg'
     141 |   prev = cmpxchg(lock, old, new);
         |          ^~~~~~~
   arch/powerpc/include/asm/cmpxchg.h:432:25: note: expected 'void *' but argument is of type 'volatile unsigned int *'
     432 | __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
         |                   ~~~~~~^~~
   drivers/gpu/drm/drm_lock.c: In function 'drm_legacy_idlelock_release':
>> arch/powerpc/include/asm/cmpxchg.h:463:41: warning: passing argument 1 of '__cmpxchg_relaxed' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers]
     463 |  (__typeof__(*(ptr))) __cmpxchg_relaxed((ptr),   \
         |                                         ^~~~~
   include/linux/atomic.h:73:9: note: in expansion of macro 'arch_cmpxchg_relaxed'
      73 |  typeof(op##_relaxed(args)) __ret;    \
         |         ^~
   include/linux/atomic-arch-fallback.h:52:2: note: in expansion of macro '__atomic_op_fence'
      52 |  __atomic_op_fence(arch_cmpxchg, __VA_ARGS__)
         |  ^~~~~~~~~~~~~~~~~
   include/asm-generic/atomic-instrumented.h:1685:2: note: in expansion of macro 'arch_cmpxchg'
    1685 |  arch_cmpxchg(__ai_ptr, __VA_ARGS__);    \
         |  ^~~~~~~~~~~~
   drivers/gpu/drm/drm_lock.c:319:12: note: in expansion of macro 'cmpxchg'
     319 |     prev = cmpxchg(lock, old, DRM_KERNEL_CONTEXT);
         |            ^~~~~~~
   arch/powerpc/include/asm/cmpxchg.h:432:25: note: expected 'void *' but argument is of type 'volatile unsigned int *'
     432 | __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
         |                   ~~~~~~^~~
>> arch/powerpc/include/asm/cmpxchg.h:463:41: warning: passing argument 1 of '__cmpxchg_relaxed' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers]
     463 |  (__typeof__(*(ptr))) __cmpxchg_relaxed((ptr),   \
         |                                         ^~~~~
   include/linux/atomic.h:75:10: note: in expansion of macro 'arch_cmpxchg_relaxed'
      75 |  __ret = op##_relaxed(args);     \
         |          ^~
   include/linux/atomic-arch-fallback.h:52:2: note: in expansion of macro '__atomic_op_fence'
      52 |  __atomic_op_fence(arch_cmpxchg, __VA_ARGS__)
         |  ^~~~~~~~~~~~~~~~~
   include/asm-generic/atomic-instrumented.h:1685:2: note: in expansion of macro 'arch_cmpxchg'
    1685 |  arch_cmpxchg(__ai_ptr, __VA_ARGS__);    \
         |  ^~~~~~~~~~~~
   drivers/gpu/drm/drm_lock.c:319:12: note: in expansion of macro 'cmpxchg'
     319 |     prev = cmpxchg(lock, old, DRM_KERNEL_CONTEXT);
         |            ^~~~~~~
   arch/powerpc/include/asm/cmpxchg.h:432:25: note: expected 'void *' but argument is of type 'volatile unsigned int *'
     432 | __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
         |                   ~~~~~~^~~

vim +463 arch/powerpc/include/asm/cmpxchg.h

56c08e6d226c860 Boqun Feng      2015-12-15  450  
9e1bec8fe216b07 Nicholas Piggin 2020-11-11  451  #define arch_cmpxchg_local(ptr, o, n)					 \
ae3a197e3d0bfe3 David Howells   2012-03-28  452    ({									 \
ae3a197e3d0bfe3 David Howells   2012-03-28  453       __typeof__(*(ptr)) _o_ = (o);					 \
ae3a197e3d0bfe3 David Howells   2012-03-28  454       __typeof__(*(ptr)) _n_ = (n);					 \
ae3a197e3d0bfe3 David Howells   2012-03-28  455       (__typeof__(*(ptr))) __cmpxchg_local((ptr), (unsigned long)_o_,	 \
ae3a197e3d0bfe3 David Howells   2012-03-28  456  				    (unsigned long)_n_, sizeof(*(ptr))); \
ae3a197e3d0bfe3 David Howells   2012-03-28  457    })
ae3a197e3d0bfe3 David Howells   2012-03-28  458  
9e1bec8fe216b07 Nicholas Piggin 2020-11-11  459  #define arch_cmpxchg_relaxed(ptr, o, n)					\
56c08e6d226c860 Boqun Feng      2015-12-15  460  ({									\
56c08e6d226c860 Boqun Feng      2015-12-15  461  	__typeof__(*(ptr)) _o_ = (o);					\
56c08e6d226c860 Boqun Feng      2015-12-15  462  	__typeof__(*(ptr)) _n_ = (n);					\
56c08e6d226c860 Boqun Feng      2015-12-15 @463  	(__typeof__(*(ptr))) __cmpxchg_relaxed((ptr),			\
56c08e6d226c860 Boqun Feng      2015-12-15  464  			(unsigned long)_o_, (unsigned long)_n_,		\
56c08e6d226c860 Boqun Feng      2015-12-15  465  			sizeof(*(ptr)));				\
56c08e6d226c860 Boqun Feng      2015-12-15  466  })
56c08e6d226c860 Boqun Feng      2015-12-15  467  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 71460 bytes --]

^ permalink raw reply

* [PATCH v2 8/8] powernv/memtrace: don't abuse memory hot(un)plug infrastructure for memory allocations
From: David Hildenbrand @ 2020-11-11 14:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: Michal Hocko, Wei Yang, David Hildenbrand, Michal Hocko, linux-mm,
	Paul Mackerras, Rashmica Gupta, linuxppc-dev, Andrew Morton,
	Mike Rapoport, Oscar Salvador
In-Reply-To: <20201111145322.15793-1-david@redhat.com>

Let's use alloc_contig_pages() for allocating memory and remove the
linear mapping manually via arch_remove_linear_mapping(). Mark all pages
PG_offline, such that they will definitely not get touched - e.g.,
when hibernating. When freeing memory, try to revert what we did.

The original idea was discussed in:
 https://lkml.kernel.org/r/48340e96-7e6b-736f-9e23-d3111b915b6e@redhat.com

This is similar to CONFIG_DEBUG_PAGEALLOC handling on other
architectures, whereby only single pages are unmapped from the linear
mapping. Let's mimic what memory hot(un)plug would do with the linear
mapping.

We now need MEMORY_HOTPLUG and CONTIG_ALLOC as dependencies. Add a TODO
that we want to use __GFP_ZERO for clearing once alloc_contig_pages()
understands that.

Tested with in QEMU/TCG with 10 GiB of main memory:
  [root@localhost ~]# echo 0x40000000 > /sys/kernel/debug/powerpc/memtrace/enable
  [  105.903043][ T1080] memtrace: Allocated trace memory on node 0 at 0x0000000080000000
  [root@localhost ~]# echo 0x40000000 > /sys/kernel/debug/powerpc/memtrace/enable
  [  145.042493][ T1080] radix-mmu: Mapped 0x0000000080000000-0x00000000c0000000 with 64.0 KiB pages
  [  145.049019][ T1080] memtrace: Freed trace memory back on node 0
  [  145.333960][ T1080] memtrace: Allocated trace memory on node 0 at 0x0000000080000000
  [root@localhost ~]# echo 0x80000000 > /sys/kernel/debug/powerpc/memtrace/enable
  [  213.606916][ T1080] radix-mmu: Mapped 0x0000000080000000-0x00000000c0000000 with 64.0 KiB pages
  [  213.613855][ T1080] memtrace: Freed trace memory back on node 0
  [  214.185094][ T1080] memtrace: Allocated trace memory on node 0 at 0x0000000080000000
  [root@localhost ~]# echo 0x100000000 > /sys/kernel/debug/powerpc/memtrace/enable
  [  234.874872][ T1080] radix-mmu: Mapped 0x0000000080000000-0x0000000100000000 with 64.0 KiB pages
  [  234.886974][ T1080] memtrace: Freed trace memory back on node 0
  [  234.890153][ T1080] memtrace: Failed to allocate trace memory on node 0
  [root@localhost ~]# echo 0x40000000 > /sys/kernel/debug/powerpc/memtrace/enable
  [  259.490196][ T1080] memtrace: Allocated trace memory on node 0 at 0x0000000080000000

I also made sure allocated memory is properly zeroed.

Note 1: We currently won't be allocating from ZONE_MOVABLE - because our
	pages are not movable. However, as we don't run with any memory
	hot(un)plug mechanism around, we could make an exception to
	increase the chance of allocations succeeding.

Note 2: PG_reserved isn't sufficient. E.g., kernel_page_present() used
	along PG_reserved in hibernation code will always return "true"
	on powerpc, resulting in the pages getting touched. It's too
	generic - e.g., indicates boot allocations.

Note 3: For now, we keep using memory_block_size_bytes() as minimum
	granularity.

Suggested-by: Michal Hocko <mhocko@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Rashmica Gupta <rashmica.g@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
---
 arch/powerpc/platforms/powernv/Kconfig    |   8 +-
 arch/powerpc/platforms/powernv/memtrace.c | 163 ++++++++--------------
 2 files changed, 62 insertions(+), 109 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index 938803eab0ad..619b093a0657 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -27,11 +27,11 @@ config OPAL_PRD
 	  recovery diagnostics on OpenPower machines
 
 config PPC_MEMTRACE
-	bool "Enable removal of RAM from kernel mappings for tracing"
-	depends on PPC_POWERNV && MEMORY_HOTREMOVE
+	bool "Enable runtime allocation of RAM for tracing"
+	depends on PPC_POWERNV && MEMORY_HOTPLUG && CONTIG_ALLOC
 	help
-	  Enabling this option allows for the removal of memory (RAM)
-	  from the kernel mappings to be used for hardware tracing.
+	  Enabling this option allows for runtime allocation of memory (RAM)
+	  for hardware tracing.
 
 config PPC_VAS
 	bool "IBM Virtual Accelerator Switchboard (VAS)"
diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
index 0e42fe2d7b6a..5fc9408bb0b3 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -51,33 +51,12 @@ static const struct file_operations memtrace_fops = {
 	.open	= simple_open,
 };
 
-static int check_memblock_online(struct memory_block *mem, void *arg)
-{
-	if (mem->state != MEM_ONLINE)
-		return -1;
-
-	return 0;
-}
-
-static int change_memblock_state(struct memory_block *mem, void *arg)
-{
-	unsigned long state = (unsigned long)arg;
-
-	mem->state = state;
-
-	return 0;
-}
-
 static void memtrace_clear_range(unsigned long start_pfn,
 				 unsigned long nr_pages)
 {
 	unsigned long pfn;
 
-	/*
-	 * As pages are offline, we cannot trust the memmap anymore. As HIGHMEM
-	 * does not apply, avoid passing around "struct page" and use
-	 * clear_page() instead directly.
-	 */
+	/* As HIGHMEM does not apply, use clear_page() directly. */
 	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
 		if (IS_ALIGNED(pfn, PAGES_PER_SECTION))
 			cond_resched();
@@ -85,72 +64,39 @@ static void memtrace_clear_range(unsigned long start_pfn,
 	}
 }
 
-/* called with device_hotplug_lock held */
-static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
-{
-	const unsigned long start = PFN_PHYS(start_pfn);
-	const unsigned long size = PFN_PHYS(nr_pages);
-
-	if (walk_memory_blocks(start, size, NULL, check_memblock_online))
-		return false;
-
-	walk_memory_blocks(start, size, (void *)MEM_GOING_OFFLINE,
-			   change_memblock_state);
-
-	if (offline_pages(start_pfn, nr_pages)) {
-		walk_memory_blocks(start, size, (void *)MEM_ONLINE,
-				   change_memblock_state);
-		return false;
-	}
-
-	walk_memory_blocks(start, size, (void *)MEM_OFFLINE,
-			   change_memblock_state);
-
-
-	return true;
-}
-
 static u64 memtrace_alloc_node(u32 nid, u64 size)
 {
-	u64 start_pfn, end_pfn, nr_pages, pfn;
-	u64 base_pfn;
-	u64 bytes = memory_block_size_bytes();
+	const unsigned long nr_pages = PHYS_PFN(size);
+	unsigned long pfn, start_pfn;
+	struct page *page;
 
-	if (!node_spanned_pages(nid))
+	/*
+	 * Trace memory needs to be aligned to the size, which is guaranteed
+	 * by alloc_contig_pages().
+	 */
+	page = alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_THISNODE |
+				  __GFP_NOWARN, nid, NULL);
+	if (!page)
 		return 0;
+	start_pfn = page_to_pfn(page);
 
-	start_pfn = node_start_pfn(nid);
-	end_pfn = node_end_pfn(nid);
-	nr_pages = size >> PAGE_SHIFT;
-
-	/* Trace memory needs to be aligned to the size */
-	end_pfn = round_down(end_pfn - nr_pages, nr_pages);
-
-	lock_device_hotplug();
-	for (base_pfn = end_pfn; base_pfn > start_pfn; base_pfn -= nr_pages) {
-		if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true) {
-			/*
-			 * Clear the range while we still have a linear
-			 * mapping.
-			 */
-			memtrace_clear_range(base_pfn, nr_pages);
-			/*
-			 * Remove memory in memory block size chunks so that
-			 * iomem resources are always split to the same size and
-			 * we never try to remove memory that spans two iomem
-			 * resources.
-			 */
-			end_pfn = base_pfn + nr_pages;
-			for (pfn = base_pfn; pfn < end_pfn; pfn += bytes>> PAGE_SHIFT) {
-				__remove_memory(nid, pfn << PAGE_SHIFT, bytes);
-			}
-			unlock_device_hotplug();
-			return base_pfn << PAGE_SHIFT;
-		}
-	}
-	unlock_device_hotplug();
+	/*
+	 * Clear the range while we still have a linear mapping.
+	 *
+	 * TODO: use __GFP_ZERO with alloc_contig_pages() once supported.
+	 */
+	memtrace_clear_range(start_pfn, nr_pages);
 
-	return 0;
+	/*
+	 * Set pages PageOffline(), to indicate that nobody (e.g., hibernation,
+	 * dumping, ...) should be touching these pages.
+	 */
+	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++)
+		__SetPageOffline(pfn_to_page(pfn));
+
+	arch_remove_linear_mapping(PFN_PHYS(start_pfn), size);
+
+	return PFN_PHYS(start_pfn);
 }
 
 static int memtrace_init_regions_runtime(u64 size)
@@ -220,16 +166,30 @@ static int memtrace_init_debugfs(void)
 	return ret;
 }
 
-static int online_mem_block(struct memory_block *mem, void *arg)
+static int memtrace_free(int nid, u64 start, u64 size)
 {
-	return device_online(&mem->dev);
+	struct mhp_params params = { .pgprot = PAGE_KERNEL };
+	const unsigned long nr_pages = PHYS_PFN(size);
+	const unsigned long start_pfn = PHYS_PFN(start);
+	unsigned long pfn;
+	int ret;
+
+	ret = arch_create_linear_mapping(nid, start, size, &params);
+	if (ret)
+		return ret;
+
+	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++)
+		__ClearPageOffline(pfn_to_page(pfn));
+
+	free_contig_range(start_pfn, nr_pages);
+	return 0;
 }
 
 /*
- * Iterate through the chunks of memory we have removed from the kernel
- * and attempt to add them back to the kernel.
+ * Iterate through the chunks of memory we allocated and attempt to expose
+ * them back to the kernel.
  */
-static int memtrace_online(void)
+static int memtrace_free_regions(void)
 {
 	int i, ret = 0;
 	struct memtrace_entry *ent;
@@ -237,7 +197,7 @@ static int memtrace_online(void)
 	for (i = memtrace_array_nr - 1; i >= 0; i--) {
 		ent = &memtrace_array[i];
 
-		/* We have onlined this chunk previously */
+		/* We have freed this chunk previously */
 		if (ent->nid == NUMA_NO_NODE)
 			continue;
 
@@ -247,30 +207,25 @@ static int memtrace_online(void)
 			ent->mem = 0;
 		}
 
-		if (add_memory(ent->nid, ent->start, ent->size, MHP_NONE)) {
-			pr_err("Failed to add trace memory to node %d\n",
+		if (memtrace_free(ent->nid, ent->start, ent->size)) {
+			pr_err("Failed to free trace memory on node %d\n",
 				ent->nid);
 			ret += 1;
 			continue;
 		}
 
-		lock_device_hotplug();
-		walk_memory_blocks(ent->start, ent->size, NULL,
-				   online_mem_block);
-		unlock_device_hotplug();
-
 		/*
-		 * Memory was added successfully so clean up references to it
-		 * so on reentry we can tell that this chunk was added.
+		 * Memory was freed successfully so clean up references to it
+		 * so on reentry we can tell that this chunk was freed.
 		 */
 		debugfs_remove_recursive(ent->dir);
-		pr_info("Added trace memory back to node %d\n", ent->nid);
+		pr_info("Freed trace memory back on node %d\n", ent->nid);
 		ent->size = ent->start = ent->nid = NUMA_NO_NODE;
 	}
 	if (ret)
 		return ret;
 
-	/* If all chunks of memory were added successfully, reset globals */
+	/* If all chunks of memory were freed successfully, reset globals */
 	kfree(memtrace_array);
 	memtrace_array = NULL;
 	memtrace_size = 0;
@@ -295,18 +250,16 @@ static int memtrace_enable_set(void *data, u64 val)
 
 	mutex_lock(&memtrace_mutex);
 
-	/* Re-add/online previously removed/offlined memory */
-	if (memtrace_size) {
-		if (memtrace_online())
-			goto out_unlock;
-	}
+	/* Free all previously allocated memory. */
+	if (memtrace_size && memtrace_free_regions())
+		goto out_unlock;
 
 	if (!val) {
 		rc = 0;
 		goto out_unlock;
 	}
 
-	/* Offline and remove memory */
+	/* Allocate memory. */
 	if (memtrace_init_regions_runtime(val))
 		goto out_unlock;
 
-- 
2.26.2


^ permalink raw reply related

* [PATCH v2 7/8] powerpc/mm: remove linear mapping if __add_pages() fails in arch_add_memory()
From: David Hildenbrand @ 2020-11-11 14:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: Michal Hocko, Wei Yang, David Hildenbrand, linux-mm,
	Paul Mackerras, Rashmica Gupta, linuxppc-dev, Andrew Morton,
	Mike Rapoport, Oscar Salvador
In-Reply-To: <20201111145322.15793-1-david@redhat.com>

Let's revert what we did in case seomthing goes wrong and we return an
error - as already done on arm64 and s390x.

Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Rashmica Gupta <rashmica.g@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
---
 arch/powerpc/mm/mem.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index c5755b9efb64..8b946ec68d1b 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -170,7 +170,10 @@ int __ref arch_add_memory(int nid, u64 start, u64 size,
 	rc = arch_create_linear_mapping(nid, start, size, params);
 	if (rc)
 		return rc;
-	return __add_pages(nid, start_pfn, nr_pages, params);
+	rc = __add_pages(nid, start_pfn, nr_pages, params);
+	if (rc)
+		arch_remove_linear_mapping(start, size);
+	return rc;
 }
 
 void __ref arch_remove_memory(int nid, u64 start, u64 size,
-- 
2.26.2


^ permalink raw reply related

* [PATCH v2 6/8] powerepc/book3s64/hash: drop WARN_ON in hash__remove_section_mapping
From: David Hildenbrand @ 2020-11-11 14:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: Michal Hocko, Wei Yang, David Hildenbrand, Nicholas Piggin,
	linux-mm, Paul Mackerras, Aneesh Kumar K.V, Rashmica Gupta,
	linuxppc-dev, Andrew Morton, Mike Rapoport, Oscar Salvador
In-Reply-To: <20201111145322.15793-1-david@redhat.com>

The single caller (arch_remove_linear_mapping()) prints a proper warning
when this function fails. No need to eventually crash the kernel - let's
drop this WARN_ON.

Suggested-by: Oscar Salvador <osalvador@suse.de>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Rashmica Gupta <rashmica.g@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
---
 arch/powerpc/mm/book3s64/hash_utils.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index 24702c0a92e0..d2dcb7757c68 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -845,7 +845,6 @@ int hash__remove_section_mapping(unsigned long start, unsigned long end)
 {
 	int rc = htab_remove_mapping(start, end, mmu_linear_psize,
 				     mmu_kernel_ssize);
-	WARN_ON(rc < 0);
 
 	if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC)
 		pr_warn("Hash collision while resizing HPT\n");
-- 
2.26.2


^ permalink raw reply related

* [PATCH v2 5/8] powerpc/mm: print warning in arch_remove_linear_mapping()
From: David Hildenbrand @ 2020-11-11 14:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: Michal Hocko, Wei Yang, David Hildenbrand, linux-mm,
	Paul Mackerras, Rashmica Gupta, linuxppc-dev, Andrew Morton,
	Mike Rapoport, Oscar Salvador
In-Reply-To: <20201111145322.15793-1-david@redhat.com>

Let's print a warning similar to in arch_add_linear_mapping() instead of
WARN_ON_ONCE() and eventually crashing the kernel.

Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Rashmica Gupta <rashmica.g@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
---
 arch/powerpc/mm/mem.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index ca5c4b54c366..c5755b9efb64 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -150,7 +150,9 @@ void __ref arch_remove_linear_mapping(u64 start, u64 size)
 	mutex_lock(&linear_mapping_mutex);
 	ret = remove_section_mapping(start, start + size);
 	mutex_unlock(&linear_mapping_mutex);
-	WARN_ON_ONCE(ret);
+	if (ret)
+		pr_warn("Unable to remove linear mapping for 0x%llx..0x%llx: %d\n",
+			start, start + size, ret);
 
 	/* Ensure all vmalloc mappings are flushed in case they also
 	 * hit that section of memory
-- 
2.26.2


^ permalink raw reply related

* [PATCH v2 4/8] powerpc/mm: protect linear mapping modifications by a mutex
From: David Hildenbrand @ 2020-11-11 14:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: Michal Hocko, Wei Yang, David Hildenbrand, linux-mm,
	Paul Mackerras, Rashmica Gupta, linuxppc-dev, Andrew Morton,
	Mike Rapoport, Oscar Salvador
In-Reply-To: <20201111145322.15793-1-david@redhat.com>

This code currently relies on mem_hotplug_begin()/mem_hotplug_done() -
create_section_mapping()/remove_section_mapping() implementations
cannot tollerate getting called concurrently.

Let's prepare for callers (memtrace) not holding any such locks (and
don't force them to mess with memory hotplug locks).

Other parts in these functions don't seem to rely on external locking.

Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Rashmica Gupta <rashmica.g@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
---
 arch/powerpc/mm/mem.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 8a86d81f8df0..ca5c4b54c366 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -58,6 +58,7 @@
 #define CPU_FTR_NOEXECUTE	0
 #endif
 
+static DEFINE_MUTEX(linear_mapping_mutex);
 unsigned long long memory_limit;
 bool init_mem_is_free;
 
@@ -126,8 +127,10 @@ int __ref arch_create_linear_mapping(int nid, u64 start, u64 size,
 	int rc;
 
 	start = (unsigned long)__va(start);
+	mutex_lock(&linear_mapping_mutex);
 	rc = create_section_mapping(start, start + size, nid,
 				    params->pgprot);
+	mutex_unlock(&linear_mapping_mutex);
 	if (rc) {
 		pr_warn("Unable to create linear mapping for 0x%llx..0x%llx: %d\n",
 			start, start + size, rc);
@@ -144,7 +147,9 @@ void __ref arch_remove_linear_mapping(u64 start, u64 size)
 	start = (unsigned long)__va(start);
 	flush_dcache_range_chunked(start, start + size, FLUSH_CHUNK_SIZE);
 
+	mutex_lock(&linear_mapping_mutex);
 	ret = remove_section_mapping(start, start + size);
+	mutex_unlock(&linear_mapping_mutex);
 	WARN_ON_ONCE(ret);
 
 	/* Ensure all vmalloc mappings are flushed in case they also
-- 
2.26.2


^ permalink raw reply related

* [PATCH v2 3/8] powerpc/mm: factor out creating/removing linear mapping
From: David Hildenbrand @ 2020-11-11 14:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: Michal Hocko, Wei Yang, David Hildenbrand, linux-mm,
	Paul Mackerras, Rashmica Gupta, linuxppc-dev, Andrew Morton,
	Mike Rapoport, Oscar Salvador
In-Reply-To: <20201111145322.15793-1-david@redhat.com>

We want to stop abusing memory hotplug infrastructure in memtrace code
to perform allocations and remove the linear mapping. Instead we will use
alloc_contig_pages() and remove the linear mapping manually.

Let's factor out creating/removing the linear mapping into
arch_create_linear_mapping() / arch_remove_linear_mapping() - so in the
future, we might be able to have whole arch_add_memory() /
arch_remove_memory() be implemented in common code.

Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Rashmica Gupta <rashmica.g@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
---
 arch/powerpc/mm/mem.c          | 41 +++++++++++++++++++++++-----------
 include/linux/memory_hotplug.h |  3 +++
 2 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 01ec2a252f09..8a86d81f8df0 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -120,34 +120,26 @@ static void flush_dcache_range_chunked(unsigned long start, unsigned long stop,
 	}
 }
 
-int __ref arch_add_memory(int nid, u64 start, u64 size,
-			  struct mhp_params *params)
+int __ref arch_create_linear_mapping(int nid, u64 start, u64 size,
+				     struct mhp_params *params)
 {
-	unsigned long start_pfn = start >> PAGE_SHIFT;
-	unsigned long nr_pages = size >> PAGE_SHIFT;
 	int rc;
 
 	start = (unsigned long)__va(start);
 	rc = create_section_mapping(start, start + size, nid,
 				    params->pgprot);
 	if (rc) {
-		pr_warn("Unable to create mapping for hot added memory 0x%llx..0x%llx: %d\n",
+		pr_warn("Unable to create linear mapping for 0x%llx..0x%llx: %d\n",
 			start, start + size, rc);
 		return -EFAULT;
 	}
-
-	return __add_pages(nid, start_pfn, nr_pages, params);
+	return 0;
 }
 
-void __ref arch_remove_memory(int nid, u64 start, u64 size,
-			     struct vmem_altmap *altmap)
+void __ref arch_remove_linear_mapping(u64 start, u64 size)
 {
-	unsigned long start_pfn = start >> PAGE_SHIFT;
-	unsigned long nr_pages = size >> PAGE_SHIFT;
 	int ret;
 
-	__remove_pages(start_pfn, nr_pages, altmap);
-
 	/* Remove htab bolted mappings for this section of memory */
 	start = (unsigned long)__va(start);
 	flush_dcache_range_chunked(start, start + size, FLUSH_CHUNK_SIZE);
@@ -160,6 +152,29 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size,
 	 */
 	vm_unmap_aliases();
 }
+
+int __ref arch_add_memory(int nid, u64 start, u64 size,
+			  struct mhp_params *params)
+{
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	int rc;
+
+	rc = arch_create_linear_mapping(nid, start, size, params);
+	if (rc)
+		return rc;
+	return __add_pages(nid, start_pfn, nr_pages, params);
+}
+
+void __ref arch_remove_memory(int nid, u64 start, u64 size,
+			      struct vmem_altmap *altmap)
+{
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+
+	__remove_pages(start_pfn, nr_pages, altmap);
+	arch_remove_linear_mapping(start, size);
+}
 #endif
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index d65c6fdc5cfc..00b9e9bd3850 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -375,6 +375,9 @@ extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
 					  unsigned long pnum);
 extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
 		unsigned long nr_pages);
+extern int arch_create_linear_mapping(int nid, u64 start, u64 size,
+				      struct mhp_params *params);
+void arch_remove_linear_mapping(u64 start, u64 size);
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
-- 
2.26.2


^ permalink raw reply related

* [PATCH v2 2/8] powernv/memtrace: fix crashing the kernel when enabling concurrently
From: David Hildenbrand @ 2020-11-11 14:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Hildenbrand, stable, linux-mm, Paul Mackerras,
	Rashmica Gupta, linuxppc-dev
In-Reply-To: <20201111145322.15793-1-david@redhat.com>

It's very easy to crash the kernel right now by simply trying to enable
memtrace concurrently, hammering on the "enable" interface

loop.sh:
  #!/bin/bash

  dmesg --console-off

  while true; do
          echo 0x40000000 > /sys/kernel/debug/powerpc/memtrace/enable
  done

[root@localhost ~]# loop.sh &
[root@localhost ~]# loop.sh &

Resulting quickly in a kernel crash. Let's properly protect using a
mutex.

Fixes: 9d5171a8f248 ("powerpc/powernv: Enable removal of memory for in memory tracing")
Cc: stable@vger.kernel.org# v4.14+
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Rashmica Gupta <rashmica.g@gmail.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
---
 arch/powerpc/platforms/powernv/memtrace.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
index eea1f94482ff..0e42fe2d7b6a 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -30,6 +30,7 @@ struct memtrace_entry {
 	char name[16];
 };
 
+static DEFINE_MUTEX(memtrace_mutex);
 static u64 memtrace_size;
 
 static struct memtrace_entry *memtrace_array;
@@ -279,6 +280,7 @@ static int memtrace_online(void)
 
 static int memtrace_enable_set(void *data, u64 val)
 {
+	int rc = -EAGAIN;
 	u64 bytes;
 
 	/*
@@ -291,25 +293,31 @@ static int memtrace_enable_set(void *data, u64 val)
 		return -EINVAL;
 	}
 
+	mutex_lock(&memtrace_mutex);
+
 	/* Re-add/online previously removed/offlined memory */
 	if (memtrace_size) {
 		if (memtrace_online())
-			return -EAGAIN;
+			goto out_unlock;
 	}
 
-	if (!val)
-		return 0;
+	if (!val) {
+		rc = 0;
+		goto out_unlock;
+	}
 
 	/* Offline and remove memory */
 	if (memtrace_init_regions_runtime(val))
-		return -EINVAL;
+		goto out_unlock;
 
 	if (memtrace_init_debugfs())
-		return -EINVAL;
+		goto out_unlock;
 
 	memtrace_size = val;
-
-	return 0;
+	rc = 0;
+out_unlock:
+	mutex_unlock(&memtrace_mutex);
+	return rc;
 }
 
 static int memtrace_enable_get(void *data, u64 *val)
-- 
2.26.2


^ permalink raw reply related

* [PATCH v2 1/8] powernv/memtrace: don't leak kernel memory to user space
From: David Hildenbrand @ 2020-11-11 14:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: Michal Hocko, Wei Yang, David Hildenbrand, stable, linux-mm,
	Paul Mackerras, Rashmica Gupta, linuxppc-dev, Andrew Morton,
	Mike Rapoport, Oscar Salvador
In-Reply-To: <20201111145322.15793-1-david@redhat.com>

We currently leak kernel memory to user space, because memory offlining
doesn't do any implicit clearing of memory and we are missing explicit
clearing of memory.

Let's keep it simple and clear pages before removing the linear mapping.

Reproduced in QEMU/TCG with 10 GiB of main memory:
  [root@localhost ~]# dd obs=9G if=/dev/urandom of=/dev/null
  [... wait until "free -m" used counter no longer changes and cancel]
  19665802+0 records in
  1+0 records out
  9663676416 bytes (9.7 GB, 9.0 GiB) copied, 135.548 s, 71.3 MB/s
  [root@localhost ~]# cat /sys/devices/system/memory/block_size_bytes
  40000000
  [root@localhost ~]# echo 0x40000000 > /sys/kernel/debug/powerpc/memtrace/enable
  [  402.978663][ T1086] page:000000001bc4bc74 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x24900
  [  402.980063][ T1086] flags: 0x7ffff000001000(reserved)
  [  402.980415][ T1086] raw: 007ffff000001000 c00c000000924008 c00c000000924008 0000000000000000
  [  402.980627][ T1086] raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000
  [  402.980845][ T1086] page dumped because: unmovable page
  [  402.989608][ T1086] Offlined Pages 16384
  [  403.324155][ T1086] memtrace: Allocated trace memory on node 0 at 0x0000000200000000

Before this patch:
  [root@localhost ~]# hexdump -C /sys/kernel/debug/powerpc/memtrace/00000000/trace  | head
  00000000  c8 25 72 51 4d 26 36 c5  5c c2 56 15 d5 1a cd 10  |.%rQM&6.\.V.....|
  00000010  19 b9 50 b2 cb e3 60 b8  ec 0a f3 ec 4b 3c 39 f0  |..P...`.....K<9.|$
  00000020  4e 5a 4c cf bd 26 19 ff  37 79 13 67 24 b7 b8 57  |NZL..&..7y.g$..W|$
  00000030  98 3e f5 be 6f 14 6a bd  a4 52 bc 6e e9 e0 c1 5d  |.>..o.j..R.n...]|$
  00000040  76 b3 ae b5 88 d7 da e3  64 23 85 2c 10 88 07 b6  |v.......d#.,....|$
  00000050  9a d8 91 de f7 50 27 69  2e 64 9c 6f d3 19 45 79  |.....P'i.d.o..Ey|$
  00000060  6a 6f 8a 61 71 19 1f c7  f1 df 28 26 ca 0f 84 55  |jo.aq.....(&...U|$
  00000070  01 3f be e4 e2 e1 da ff  7b 8c 8e 32 37 b4 24 53  |.?......{..27.$S|$
  00000080  1b 70 30 45 56 e6 8c c4  0e b5 4c fb 9f dd 88 06  |.p0EV.....L.....|$
  00000090  ef c4 18 79 f1 60 b1 5c  79 59 4d f4 36 d7 4a 5c  |...y.`.\yYM.6.J\|$

After this patch:
  [root@localhost ~]# hexdump -C /sys/kernel/debug/powerpc/memtrace/00000000/trace  | head
  00000000  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
  *
  40000000

Reported-by: Michael Ellerman <mpe@ellerman.id.au>
Fixes: 9d5171a8f248 ("powerpc/powernv: Enable removal of memory for in memory tracing")
Cc: stable@vger.kernel.org # v4.14+
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Rashmica Gupta <rashmica.g@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
---
 arch/powerpc/platforms/powernv/memtrace.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
index 6828108486f8..eea1f94482ff 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -67,6 +67,23 @@ static int change_memblock_state(struct memory_block *mem, void *arg)
 	return 0;
 }
 
+static void memtrace_clear_range(unsigned long start_pfn,
+				 unsigned long nr_pages)
+{
+	unsigned long pfn;
+
+	/*
+	 * As pages are offline, we cannot trust the memmap anymore. As HIGHMEM
+	 * does not apply, avoid passing around "struct page" and use
+	 * clear_page() instead directly.
+	 */
+	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
+		if (IS_ALIGNED(pfn, PAGES_PER_SECTION))
+			cond_resched();
+		clear_page(__va(PFN_PHYS(pfn)));
+	}
+}
+
 /* called with device_hotplug_lock held */
 static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
 {
@@ -111,6 +128,11 @@ static u64 memtrace_alloc_node(u32 nid, u64 size)
 	lock_device_hotplug();
 	for (base_pfn = end_pfn; base_pfn > start_pfn; base_pfn -= nr_pages) {
 		if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true) {
+			/*
+			 * Clear the range while we still have a linear
+			 * mapping.
+			 */
+			memtrace_clear_range(base_pfn, nr_pages);
 			/*
 			 * Remove memory in memory block size chunks so that
 			 * iomem resources are always split to the same size and
-- 
2.26.2


^ permalink raw reply related

* [PATCH v2 0/8] powernv/memtrace: don't abuse memory hot(un)plug infrastructure for memory allocations
From: David Hildenbrand @ 2020-11-11 14:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: Michal Hocko, Wei Yang, David Hildenbrand, Nicholas Piggin,
	Michal Hocko, linux-mm, Paul Mackerras, Aneesh Kumar K.V,
	Andrew Morton, linuxppc-dev, Rashmica Gupta, Mike Rapoport,
	Oscar Salvador

Based on latest linux/master

powernv/memtrace is the only in-kernel user that rips out random memory
it never added (doesn't own) in order to allocate memory without a
linear mapping. Let's stop abusing memory hot(un)plug infrastructure for
that - use alloc_contig_pages() for allocating memory and remove the
linear mapping manually.

The original idea was discussed in:
 https://lkml.kernel.org/r/48340e96-7e6b-736f-9e23-d3111b915b6e@redhat.com

I only tested via QEMU TCG with a single NUMA node- see patch #8 for more
details.

Error handling and cleanup handling in memtrace code is a mess - that
should definitely get cleaned up sooner or later. Once we have __GFP_ZERO
support for alloc_contig_pages(), we can drop manual clearing. I added
a TODO for now, so this series can go via the powerpc tree - the __GFP_ZERO
change is then better suited via the mm tree, along with support for
__GFP_ZERO.

v1 -> v2:
- Tweaks to patch descriptions
- "powernv/memtrace: don't leak kernel memory to user space"
-- Added. Reported by Michael.
- "powernv/memtrace: fix crashing the kernel when enabling concurrently"
-- Added, discovered while testing.
- "powerpc/mm: protect linear mapping modifications by a mutex"
-- Added. Although we currently won't have concurrency, this is cleaner and
   future-proof.
- "powerepc/book3s64/hash: drop WARN_ON in hash__remove_section_mapping"
-- Added. Suggested by Oscar
- "powernv/memtrace: don't abuse memory hot(un)plug infrastructure for
   memory allocations"
-- Reshuffle the code to make review easier.
-- Add a TODO regarding __GFP_ZERO. Adapt to changed page clearing code.
-- Use GFP_KERNEL | __GFP_THISNODE | __GFP_NOWARN for allocations.


David Hildenbrand (8):
  powernv/memtrace: don't leak kernel memory to user space
  powernv/memtrace: fix crashing the kernel when enabling concurrently
  powerpc/mm: factor out creating/removing linear mapping
  powerpc/mm: protect linear mapping modifications by a mutex
  powerpc/mm: print warning in arch_remove_linear_mapping()
  powerepc/book3s64/hash: drop WARN_ON in hash__remove_section_mapping
  powerpc/mm: remove linear mapping if __add_pages() fails in
    arch_add_memory()
  powernv/memtrace: don't abuse memory hot(un)plug infrastructure for
    memory allocations

 arch/powerpc/mm/book3s64/hash_utils.c     |   1 -
 arch/powerpc/mm/mem.c                     |  53 +++++--
 arch/powerpc/platforms/powernv/Kconfig    |   8 +-
 arch/powerpc/platforms/powernv/memtrace.c | 175 ++++++++++------------
 include/linux/memory_hotplug.h            |   3 +
 5 files changed, 125 insertions(+), 115 deletions(-)

-- 
2.26.2


^ permalink raw reply

* Re: [PATCH 1/3] asm-generic/atomic64: Add support for ARCH_ATOMIC
From: Peter Zijlstra @ 2020-11-11 13:44 UTC (permalink / raw)
  To: Christophe Leroy
  Cc: Christophe Leroy, linux-arch, Arnd Bergmann, Alexey Kardashevskiy,
	linuxppc-dev, Boqun Feng, linux-kernel, Nicholas Piggin,
	Will Deacon
In-Reply-To: <3086114c-8af6-3863-0cbf-5d3956fcda95@csgroup.eu>

On Wed, Nov 11, 2020 at 02:39:01PM +0100, Christophe Leroy wrote:
> Hello,
> 
> Le 11/11/2020 à 12:07, Nicholas Piggin a écrit :
> > This passes atomic64 selftest on ppc32 on qemu (uniprocessor only)
> > both before and after powerpc is converted to use ARCH_ATOMIC.
> 
> Can you explain what this change does and why it is needed ?

That certainly should've been in the Changelog. This enables atomic
instrumentation, see asm-generic/atomic-instrumented.h. IOW, it makes
atomic ops visible to K*SAN.

^ permalink raw reply

* Re: [PATCH 1/3] asm-generic/atomic64: Add support for ARCH_ATOMIC
From: Christophe Leroy @ 2020-11-11 13:39 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev
  Cc: Christophe Leroy, linux-arch, Arnd Bergmann, Peter Zijlstra,
	Boqun Feng, linux-kernel, Alexey Kardashevskiy, Will Deacon
In-Reply-To: <20201111110723.3148665-2-npiggin@gmail.com>

Hello,

Le 11/11/2020 à 12:07, Nicholas Piggin a écrit :
> This passes atomic64 selftest on ppc32 on qemu (uniprocessor only)
> both before and after powerpc is converted to use ARCH_ATOMIC.

Can you explain what this change does and why it is needed ?

Christophe

> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>   include/asm-generic/atomic64.h | 70 +++++++++++++++++++++++++++-------
>   lib/atomic64.c                 | 36 ++++++++---------
>   2 files changed, 75 insertions(+), 31 deletions(-)
> 
> diff --git a/include/asm-generic/atomic64.h b/include/asm-generic/atomic64.h
> index 370f01d4450f..2b1ecb591bb9 100644
> --- a/include/asm-generic/atomic64.h
> +++ b/include/asm-generic/atomic64.h
> @@ -15,19 +15,17 @@ typedef struct {
>   
>   #define ATOMIC64_INIT(i)	{ (i) }
>   
> -extern s64 atomic64_read(const atomic64_t *v);
> -extern void atomic64_set(atomic64_t *v, s64 i);
> -
> -#define atomic64_set_release(v, i)	atomic64_set((v), (i))
> +extern s64 __atomic64_read(const atomic64_t *v);
> +extern void __atomic64_set(atomic64_t *v, s64 i);
>   
>   #define ATOMIC64_OP(op)							\
> -extern void	 atomic64_##op(s64 a, atomic64_t *v);
> +extern void	 __atomic64_##op(s64 a, atomic64_t *v);
>   
>   #define ATOMIC64_OP_RETURN(op)						\
> -extern s64 atomic64_##op##_return(s64 a, atomic64_t *v);
> +extern s64 __atomic64_##op##_return(s64 a, atomic64_t *v);
>   
>   #define ATOMIC64_FETCH_OP(op)						\
> -extern s64 atomic64_fetch_##op(s64 a, atomic64_t *v);
> +extern s64 __atomic64_fetch_##op(s64 a, atomic64_t *v);
>   
>   #define ATOMIC64_OPS(op)	ATOMIC64_OP(op) ATOMIC64_OP_RETURN(op) ATOMIC64_FETCH_OP(op)
>   
> @@ -46,11 +44,57 @@ ATOMIC64_OPS(xor)
>   #undef ATOMIC64_OP_RETURN
>   #undef ATOMIC64_OP
>   
> -extern s64 atomic64_dec_if_positive(atomic64_t *v);
> -#define atomic64_dec_if_positive atomic64_dec_if_positive
> -extern s64 atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n);
> -extern s64 atomic64_xchg(atomic64_t *v, s64 new);
> -extern s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u);
> -#define atomic64_fetch_add_unless atomic64_fetch_add_unless
> +extern s64 __atomic64_dec_if_positive(atomic64_t *v);
> +extern s64 __atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n);
> +extern s64 __atomic64_xchg(atomic64_t *v, s64 new);
> +extern s64 __atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u);
> +
> +#ifdef ARCH_ATOMIC
> +#define arch_atomic64_read __atomic64_read
> +#define arch_atomic64_set __atomic64_set
> +#define arch_atomic64_add __atomic64_add
> +#define arch_atomic64_add_return __atomic64_add_return
> +#define arch_atomic64_fetch_add __atomic64_fetch_add
> +#define arch_atomic64_sub __atomic64_sub
> +#define arch_atomic64_sub_return __atomic64_sub_return
> +#define arch_atomic64_fetch_sub __atomic64_fetch_sub
> +#define arch_atomic64_and __atomic64_and
> +#define arch_atomic64_and_return __atomic64_and_return
> +#define arch_atomic64_fetch_and __atomic64_fetch_and
> +#define arch_atomic64_or __atomic64_or
> +#define arch_atomic64_or_return __atomic64_or_return
> +#define arch_atomic64_fetch_or __atomic64_fetch_or
> +#define arch_atomic64_xor __atomic64_xor
> +#define arch_atomic64_xor_return __atomic64_xor_return
> +#define arch_atomic64_fetch_xor __atomic64_fetch_xor
> +#define arch_atomic64_xchg __atomic64_xchg
> +#define arch_atomic64_cmpxchg __atomic64_cmpxchg
> +#define arch_atomic64_set_release(v, i)	__atomic64_set((v), (i))
> +#define arch_atomic64_dec_if_positive __atomic64_dec_if_positive
> +#define arch_atomic64_fetch_add_unless __atomic64_fetch_add_unless
> +#else
> +#define atomic64_read __atomic64_read
> +#define atomic64_set __atomic64_set
> +#define atomic64_add __atomic64_add
> +#define atomic64_add_return __atomic64_add_return
> +#define atomic64_fetch_add __atomic64_fetch_add
> +#define atomic64_sub __atomic64_sub
> +#define atomic64_sub_return __atomic64_sub_return
> +#define atomic64_fetch_sub __atomic64_fetch_sub
> +#define atomic64_and __atomic64_and
> +#define atomic64_and_return __atomic64_and_return
> +#define atomic64_fetch_and __atomic64_fetch_and
> +#define atomic64_or __atomic64_or
> +#define atomic64_or_return __atomic64_or_return
> +#define atomic64_fetch_or __atomic64_fetch_or
> +#define atomic64_xor __atomic64_xor
> +#define atomic64_xor_return __atomic64_xor_return
> +#define atomic64_fetch_xor __atomic64_fetch_xor
> +#define atomic64_xchg __atomic64_xchg
> +#define atomic64_cmpxchg __atomic64_cmpxchg
> +#define atomic64_set_release(v, i)	__atomic64_set((v), (i))
> +#define atomic64_dec_if_positive __atomic64_dec_if_positive
> +#define atomic64_fetch_add_unless __atomic64_fetch_add_unless
> +#endif
>   
>   #endif  /*  _ASM_GENERIC_ATOMIC64_H  */
> diff --git a/lib/atomic64.c b/lib/atomic64.c
> index e98c85a99787..05aba5e3268f 100644
> --- a/lib/atomic64.c
> +++ b/lib/atomic64.c
> @@ -42,7 +42,7 @@ static inline raw_spinlock_t *lock_addr(const atomic64_t *v)
>   	return &atomic64_lock[addr & (NR_LOCKS - 1)].lock;
>   }
>   
> -s64 atomic64_read(const atomic64_t *v)
> +s64 __atomic64_read(const atomic64_t *v)
>   {
>   	unsigned long flags;
>   	raw_spinlock_t *lock = lock_addr(v);
> @@ -53,9 +53,9 @@ s64 atomic64_read(const atomic64_t *v)
>   	raw_spin_unlock_irqrestore(lock, flags);
>   	return val;
>   }
> -EXPORT_SYMBOL(atomic64_read);
> +EXPORT_SYMBOL(__atomic64_read);
>   
> -void atomic64_set(atomic64_t *v, s64 i)
> +void __atomic64_set(atomic64_t *v, s64 i)
>   {
>   	unsigned long flags;
>   	raw_spinlock_t *lock = lock_addr(v);
> @@ -64,10 +64,10 @@ void atomic64_set(atomic64_t *v, s64 i)
>   	v->counter = i;
>   	raw_spin_unlock_irqrestore(lock, flags);
>   }
> -EXPORT_SYMBOL(atomic64_set);
> +EXPORT_SYMBOL(__atomic64_set);
>   
>   #define ATOMIC64_OP(op, c_op)						\
> -void atomic64_##op(s64 a, atomic64_t *v)				\
> +void __atomic64_##op(s64 a, atomic64_t *v)				\
>   {									\
>   	unsigned long flags;						\
>   	raw_spinlock_t *lock = lock_addr(v);				\
> @@ -76,10 +76,10 @@ void atomic64_##op(s64 a, atomic64_t *v)				\
>   	v->counter c_op a;						\
>   	raw_spin_unlock_irqrestore(lock, flags);			\
>   }									\
> -EXPORT_SYMBOL(atomic64_##op);
> +EXPORT_SYMBOL(__atomic64_##op);
>   
>   #define ATOMIC64_OP_RETURN(op, c_op)					\
> -s64 atomic64_##op##_return(s64 a, atomic64_t *v)			\
> +s64 __atomic64_##op##_return(s64 a, atomic64_t *v)			\
>   {									\
>   	unsigned long flags;						\
>   	raw_spinlock_t *lock = lock_addr(v);				\
> @@ -90,10 +90,10 @@ s64 atomic64_##op##_return(s64 a, atomic64_t *v)			\
>   	raw_spin_unlock_irqrestore(lock, flags);			\
>   	return val;							\
>   }									\
> -EXPORT_SYMBOL(atomic64_##op##_return);
> +EXPORT_SYMBOL(__atomic64_##op##_return);
>   
>   #define ATOMIC64_FETCH_OP(op, c_op)					\
> -s64 atomic64_fetch_##op(s64 a, atomic64_t *v)				\
> +s64 __atomic64_fetch_##op(s64 a, atomic64_t *v)				\
>   {									\
>   	unsigned long flags;						\
>   	raw_spinlock_t *lock = lock_addr(v);				\
> @@ -105,7 +105,7 @@ s64 atomic64_fetch_##op(s64 a, atomic64_t *v)				\
>   	raw_spin_unlock_irqrestore(lock, flags);			\
>   	return val;							\
>   }									\
> -EXPORT_SYMBOL(atomic64_fetch_##op);
> +EXPORT_SYMBOL(__atomic64_fetch_##op);
>   
>   #define ATOMIC64_OPS(op, c_op)						\
>   	ATOMIC64_OP(op, c_op)						\
> @@ -130,7 +130,7 @@ ATOMIC64_OPS(xor, ^=)
>   #undef ATOMIC64_OP_RETURN
>   #undef ATOMIC64_OP
>   
> -s64 atomic64_dec_if_positive(atomic64_t *v)
> +s64 __atomic64_dec_if_positive(atomic64_t *v)
>   {
>   	unsigned long flags;
>   	raw_spinlock_t *lock = lock_addr(v);
> @@ -143,9 +143,9 @@ s64 atomic64_dec_if_positive(atomic64_t *v)
>   	raw_spin_unlock_irqrestore(lock, flags);
>   	return val;
>   }
> -EXPORT_SYMBOL(atomic64_dec_if_positive);
> +EXPORT_SYMBOL(__atomic64_dec_if_positive);
>   
> -s64 atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n)
> +s64 __atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n)
>   {
>   	unsigned long flags;
>   	raw_spinlock_t *lock = lock_addr(v);
> @@ -158,9 +158,9 @@ s64 atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n)
>   	raw_spin_unlock_irqrestore(lock, flags);
>   	return val;
>   }
> -EXPORT_SYMBOL(atomic64_cmpxchg);
> +EXPORT_SYMBOL(__atomic64_cmpxchg);
>   
> -s64 atomic64_xchg(atomic64_t *v, s64 new)
> +s64 __atomic64_xchg(atomic64_t *v, s64 new)
>   {
>   	unsigned long flags;
>   	raw_spinlock_t *lock = lock_addr(v);
> @@ -172,9 +172,9 @@ s64 atomic64_xchg(atomic64_t *v, s64 new)
>   	raw_spin_unlock_irqrestore(lock, flags);
>   	return val;
>   }
> -EXPORT_SYMBOL(atomic64_xchg);
> +EXPORT_SYMBOL(__atomic64_xchg);
>   
> -s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
> +s64 __atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
>   {
>   	unsigned long flags;
>   	raw_spinlock_t *lock = lock_addr(v);
> @@ -188,4 +188,4 @@ s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
>   
>   	return val;
>   }
> -EXPORT_SYMBOL(atomic64_fetch_add_unless);
> +EXPORT_SYMBOL(__atomic64_fetch_add_unless);
> 

^ permalink raw reply

* [PATCH seccomp v2 8/8] seccomp/cache: Report cache data through /proc/pid/seccomp_cache
From: YiFei Zhu @ 2020-11-11 13:33 UTC (permalink / raw)
  To: containers
  Cc: linux-sh, Tobin Feldman-Fitzthum, Hubertus Franke, Jack Chen,
	linux-riscv, Andrea Arcangeli, linux-s390, YiFei Zhu, linux-csky,
	Tianyin Xu, linux-xtensa, Kees Cook, Jann Horn, Valentin Rothberg,
	Aleksa Sarai, Josep Torrellas, Will Drewry, linux-parisc,
	linux-kernel, Andy Lutomirski, Dimitrios Skarlatos, David Laight,
	Giuseppe Scrivano, linuxppc-dev, Tycho Andersen
In-Reply-To: <cover.1605101222.git.yifeifz2@illinois.edu>

From: YiFei Zhu <yifeifz2@illinois.edu>

Currently the kernel does not provide an infrastructure to translate
architecture numbers to a human-readable name. Translating syscall
numbers to syscall names is possible through FTRACE_SYSCALL
infrastructure but it does not provide support for compat syscalls.

This will create a file for each PID as /proc/pid/seccomp_cache.
The file will be empty when no seccomp filters are loaded, or be
in the format of:
<arch name> <decimal syscall number> <ALLOW | FILTER>
where ALLOW means the cache is guaranteed to allow the syscall,
and filter means the cache will pass the syscall to the BPF filter.

For the docker default profile on x86_64 it looks like:
x86_64 0 ALLOW
x86_64 1 ALLOW
x86_64 2 ALLOW
x86_64 3 ALLOW
[...]
x86_64 132 ALLOW
x86_64 133 ALLOW
x86_64 134 FILTER
x86_64 135 FILTER
x86_64 136 FILTER
x86_64 137 ALLOW
x86_64 138 ALLOW
x86_64 139 FILTER
x86_64 140 ALLOW
x86_64 141 ALLOW
[...]

This file is guarded by CONFIG_SECCOMP_CACHE_DEBUG with a default
of N because I think certain users of seccomp might not want the
application to know which syscalls are definitely usable. For
the same reason, it is also guarded by CAP_SYS_ADMIN.

Suggested-by: Jann Horn <jannh@google.com>
Link: https://lore.kernel.org/lkml/CAG48ez3Ofqp4crXGksLmZY6=fGrF_tWyUCg7PBkAetvbbOPeOA@mail.gmail.com/
Signed-off-by: YiFei Zhu <yifeifz2@illinois.edu>
---
 arch/Kconfig            | 15 +++++++++++
 fs/proc/base.c          |  6 +++++
 include/linux/seccomp.h |  7 +++++
 kernel/seccomp.c        | 59 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 87 insertions(+)

diff --git a/arch/Kconfig b/arch/Kconfig
index 56b6ccc0e32d..6e2eb7171da0 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -514,6 +514,21 @@ config SECCOMP_FILTER
 
 	  See Documentation/userspace-api/seccomp_filter.rst for details.
 
+config SECCOMP_CACHE_DEBUG
+	bool "Show seccomp filter cache status in /proc/pid/seccomp_cache"
+	depends on SECCOMP
+	depends on SECCOMP_FILTER && !HAVE_SPARSE_SYSCALL_NR
+	depends on PROC_FS
+	help
+	  This enables the /proc/pid/seccomp_cache interface to monitor
+	  seccomp cache data. The file format is subject to change. Reading
+	  the file requires CAP_SYS_ADMIN.
+
+	  This option is for debugging only. Enabling presents the risk that
+	  an adversary may be able to infer the seccomp filter logic.
+
+	  If unsure, say N.
+
 config HAVE_ARCH_STACKLEAK
 	bool
 	help
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 0f707003dda5..d652f9dbaecc 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3261,6 +3261,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_PROC_PID_ARCH_STATUS
 	ONE("arch_status", S_IRUGO, proc_pid_arch_status),
 #endif
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+	ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
+#endif
 };
 
 static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3590,6 +3593,9 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_PROC_PID_ARCH_STATUS
 	ONE("arch_status", S_IRUGO, proc_pid_arch_status),
 #endif
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+	ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
+#endif
 };
 
 static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 02aef2844c38..76963ec4641a 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -121,4 +121,11 @@ static inline long seccomp_get_metadata(struct task_struct *task,
 	return -EINVAL;
 }
 #endif /* CONFIG_SECCOMP_FILTER && CONFIG_CHECKPOINT_RESTORE */
+
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+struct seq_file;
+
+int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
+			   struct pid *pid, struct task_struct *task);
+#endif
 #endif /* _LINUX_SECCOMP_H */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index d8cf468dbe1e..76f524e320b1 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -553,6 +553,9 @@ void seccomp_filter_release(struct task_struct *tsk)
 {
 	struct seccomp_filter *orig = tsk->seccomp.filter;
 
+	/* We are effectively holding the siglock by not having any sighand. */
+	WARN_ON(tsk->sighand != NULL);
+
 	/* Detach task from its filter tree. */
 	tsk->seccomp.filter = NULL;
 	__seccomp_filter_release(orig);
@@ -2335,3 +2338,59 @@ static int __init seccomp_sysctl_init(void)
 device_initcall(seccomp_sysctl_init)
 
 #endif /* CONFIG_SYSCTL */
+
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+/* Currently CONFIG_SECCOMP_CACHE_DEBUG implies SECCOMP_ARCH_NATIVE */
+static void proc_pid_seccomp_cache_arch(struct seq_file *m, const char *name,
+					const void *bitmap, size_t bitmap_size)
+{
+	int nr;
+
+	for (nr = 0; nr < bitmap_size; nr++) {
+		bool cached = test_bit(nr, bitmap);
+		char *status = cached ? "ALLOW" : "FILTER";
+
+		seq_printf(m, "%s %d %s\n", name, nr, status);
+	}
+}
+
+int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
+			   struct pid *pid, struct task_struct *task)
+{
+	struct seccomp_filter *f;
+	unsigned long flags;
+
+	/*
+	 * We don't want some sandboxed process to know what their seccomp
+	 * filters consist of.
+	 */
+	if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (!lock_task_sighand(task, &flags))
+		return -ESRCH;
+
+	f = READ_ONCE(task->seccomp.filter);
+	if (!f) {
+		unlock_task_sighand(task, &flags);
+		return 0;
+	}
+
+	/* prevent filter from being freed while we are printing it */
+	__get_seccomp_filter(f);
+	unlock_task_sighand(task, &flags);
+
+	proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_NATIVE_NAME,
+				    f->cache.allow_native,
+				    SECCOMP_ARCH_NATIVE_NR);
+
+#ifdef SECCOMP_ARCH_COMPAT
+	proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_COMPAT_NAME,
+				    f->cache.allow_compat,
+				    SECCOMP_ARCH_COMPAT_NR);
+#endif /* SECCOMP_ARCH_COMPAT */
+
+	__put_seccomp_filter(f);
+	return 0;
+}
+#endif /* CONFIG_SECCOMP_CACHE_DEBUG */
-- 
2.29.2


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox