LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v11 01/13] MODSIGN: Export module signature definitions
From: Thiago Jung Bauermann @ 2019-06-11  6:28 UTC (permalink / raw)
  To: linux-integrity
  Cc: Herbert Xu, linux-doc, Dmitry Kasatkin, David S. Miller,
	Jonathan Corbet, linux-kernel, Mimi Zohar, James Morris,
	David Howells, AKASHI, Takahiro, linux-security-module, keyrings,
	linux-crypto, Jessica Yu, linuxppc-dev, David Woodhouse,
	Thiago Jung Bauermann, Serge E. Hallyn
In-Reply-To: <20190611062817.18412-1-bauerman@linux.ibm.com>

IMA will use the module_signature format for append signatures, so export
the relevant definitions and factor out the code which verifies that the
appended signature trailer is valid.

Also, create a CONFIG_MODULE_SIG_FORMAT option so that IMA can select it
and be able to use mod_check_sig() without having to depend on either
CONFIG_MODULE_SIG or CONFIG_MODULES.

Signed-off-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Reviewed-by: Mimi Zohar <zohar@linux.ibm.com>
Cc: Jessica Yu <jeyu@kernel.org>
---
 include/linux/module.h           |  3 --
 include/linux/module_signature.h | 44 +++++++++++++++++++++++++
 init/Kconfig                     |  6 +++-
 kernel/Makefile                  |  1 +
 kernel/module.c                  |  1 +
 kernel/module_signature.c        | 46 ++++++++++++++++++++++++++
 kernel/module_signing.c          | 56 +++++---------------------------
 scripts/Makefile                 |  2 +-
 8 files changed, 106 insertions(+), 53 deletions(-)

diff --git a/include/linux/module.h b/include/linux/module.h
index 188998d3dca9..aa56f531cf1e 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -25,9 +25,6 @@
 #include <linux/percpu.h>
 #include <asm/module.h>
 
-/* In stripped ARM and x86-64 modules, ~ is surprisingly rare. */
-#define MODULE_SIG_STRING "~Module signature appended~\n"
-
 /* Not Yet Implemented */
 #define MODULE_SUPPORTED_DEVICE(name)
 
diff --git a/include/linux/module_signature.h b/include/linux/module_signature.h
new file mode 100644
index 000000000000..523617fc5b6a
--- /dev/null
+++ b/include/linux/module_signature.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Module signature handling.
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#ifndef _LINUX_MODULE_SIGNATURE_H
+#define _LINUX_MODULE_SIGNATURE_H
+
+/* In stripped ARM and x86-64 modules, ~ is surprisingly rare. */
+#define MODULE_SIG_STRING "~Module signature appended~\n"
+
+enum pkey_id_type {
+	PKEY_ID_PGP,		/* OpenPGP generated key ID */
+	PKEY_ID_X509,		/* X.509 arbitrary subjectKeyIdentifier */
+	PKEY_ID_PKCS7,		/* Signature in PKCS#7 message */
+};
+
+/*
+ * Module signature information block.
+ *
+ * The constituents of the signature section are, in order:
+ *
+ *	- Signer's name
+ *	- Key identifier
+ *	- Signature data
+ *	- Information block
+ */
+struct module_signature {
+	u8	algo;		/* Public-key crypto algorithm [0] */
+	u8	hash;		/* Digest algorithm [0] */
+	u8	id_type;	/* Key identifier type [PKEY_ID_PKCS7] */
+	u8	signer_len;	/* Length of signer's name [0] */
+	u8	key_id_len;	/* Length of key identifier [0] */
+	u8	__pad[3];
+	__be32	sig_len;	/* Length of signature data */
+};
+
+int mod_check_sig(const struct module_signature *ms, size_t file_len,
+		  const char *name);
+
+#endif /* _LINUX_MODULE_SIGNATURE_H */
diff --git a/init/Kconfig b/init/Kconfig
index 8b9ffe236e4f..c2286a3c74c5 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1852,6 +1852,10 @@ config BASE_SMALL
 	default 0 if BASE_FULL
 	default 1 if !BASE_FULL
 
+config MODULE_SIG_FORMAT
+	def_bool n
+	select SYSTEM_DATA_VERIFICATION
+
 menuconfig MODULES
 	bool "Enable loadable module support"
 	option modules
@@ -1929,7 +1933,7 @@ config MODULE_SRCVERSION_ALL
 config MODULE_SIG
 	bool "Module signature verification"
 	depends on MODULES
-	select SYSTEM_DATA_VERIFICATION
+	select MODULE_SIG_FORMAT
 	help
 	  Check modules for valid signatures upon load: the signature
 	  is simply appended to the module. For more information see
diff --git a/kernel/Makefile b/kernel/Makefile
index 33824f0385b3..f29ae2997a43 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -58,6 +58,7 @@ endif
 obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_MODULE_SIG) += module_signing.o
+obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_CRASH_CORE) += crash_core.o
diff --git a/kernel/module.c b/kernel/module.c
index 6e6712b3aaf5..2712f4d217f5 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -19,6 +19,7 @@
 #include <linux/export.h>
 #include <linux/extable.h>
 #include <linux/moduleloader.h>
+#include <linux/module_signature.h>
 #include <linux/trace_events.h>
 #include <linux/init.h>
 #include <linux/kallsyms.h>
diff --git a/kernel/module_signature.c b/kernel/module_signature.c
new file mode 100644
index 000000000000..4224a1086b7d
--- /dev/null
+++ b/kernel/module_signature.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Module signature checker
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/errno.h>
+#include <linux/printk.h>
+#include <linux/module_signature.h>
+#include <asm/byteorder.h>
+
+/**
+ * mod_check_sig - check that the given signature is sane
+ *
+ * @ms:		Signature to check.
+ * @file_len:	Size of the file to which @ms is appended.
+ * @name:	What is being checked. Used for error messages.
+ */
+int mod_check_sig(const struct module_signature *ms, size_t file_len,
+		  const char *name)
+{
+	if (be32_to_cpu(ms->sig_len) >= file_len - sizeof(*ms))
+		return -EBADMSG;
+
+	if (ms->id_type != PKEY_ID_PKCS7) {
+		pr_err("%s: Module is not signed with expected PKCS#7 message\n",
+		       name);
+		return -ENOPKG;
+	}
+
+	if (ms->algo != 0 ||
+	    ms->hash != 0 ||
+	    ms->signer_len != 0 ||
+	    ms->key_id_len != 0 ||
+	    ms->__pad[0] != 0 ||
+	    ms->__pad[1] != 0 ||
+	    ms->__pad[2] != 0) {
+		pr_err("%s: PKCS#7 signature info has unexpected non-zero params\n",
+		       name);
+		return -EBADMSG;
+	}
+
+	return 0;
+}
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index 6b9a926fd86b..cdd04a6b8074 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -11,37 +11,13 @@
 
 #include <linux/kernel.h>
 #include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/module_signature.h>
 #include <linux/string.h>
 #include <linux/verification.h>
 #include <crypto/public_key.h>
 #include "module-internal.h"
 
-enum pkey_id_type {
-	PKEY_ID_PGP,		/* OpenPGP generated key ID */
-	PKEY_ID_X509,		/* X.509 arbitrary subjectKeyIdentifier */
-	PKEY_ID_PKCS7,		/* Signature in PKCS#7 message */
-};
-
-/*
- * Module signature information block.
- *
- * The constituents of the signature section are, in order:
- *
- *	- Signer's name
- *	- Key identifier
- *	- Signature data
- *	- Information block
- */
-struct module_signature {
-	u8	algo;		/* Public-key crypto algorithm [0] */
-	u8	hash;		/* Digest algorithm [0] */
-	u8	id_type;	/* Key identifier type [PKEY_ID_PKCS7] */
-	u8	signer_len;	/* Length of signer's name [0] */
-	u8	key_id_len;	/* Length of key identifier [0] */
-	u8	__pad[3];
-	__be32	sig_len;	/* Length of signature data */
-};
-
 /*
  * Verify the signature on a module.
  */
@@ -49,6 +25,7 @@ int mod_verify_sig(const void *mod, struct load_info *info)
 {
 	struct module_signature ms;
 	size_t sig_len, modlen = info->len;
+	int ret;
 
 	pr_devel("==>%s(,%zu)\n", __func__, modlen);
 
@@ -56,32 +33,15 @@ int mod_verify_sig(const void *mod, struct load_info *info)
 		return -EBADMSG;
 
 	memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
-	modlen -= sizeof(ms);
+
+	ret = mod_check_sig(&ms, modlen, info->name);
+	if (ret)
+		return ret;
 
 	sig_len = be32_to_cpu(ms.sig_len);
-	if (sig_len >= modlen)
-		return -EBADMSG;
-	modlen -= sig_len;
+	modlen -= sig_len + sizeof(ms);
 	info->len = modlen;
 
-	if (ms.id_type != PKEY_ID_PKCS7) {
-		pr_err("%s: Module is not signed with expected PKCS#7 message\n",
-		       info->name);
-		return -ENOPKG;
-	}
-
-	if (ms.algo != 0 ||
-	    ms.hash != 0 ||
-	    ms.signer_len != 0 ||
-	    ms.key_id_len != 0 ||
-	    ms.__pad[0] != 0 ||
-	    ms.__pad[1] != 0 ||
-	    ms.__pad[2] != 0) {
-		pr_err("%s: PKCS#7 signature info has unexpected non-zero params\n",
-		       info->name);
-		return -EBADMSG;
-	}
-
 	return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
 				      VERIFY_USE_SECONDARY_KEYRING,
 				      VERIFYING_MODULE_SIGNATURE,
diff --git a/scripts/Makefile b/scripts/Makefile
index 9d442ee050bd..52098b080ab7 100644
--- a/scripts/Makefile
+++ b/scripts/Makefile
@@ -17,7 +17,7 @@ hostprogs-$(CONFIG_VT)           += conmakehash
 hostprogs-$(BUILD_C_RECORDMCOUNT) += recordmcount
 hostprogs-$(CONFIG_BUILDTIME_EXTABLE_SORT) += sortextable
 hostprogs-$(CONFIG_ASN1)	 += asn1_compiler
-hostprogs-$(CONFIG_MODULE_SIG)	 += sign-file
+hostprogs-$(CONFIG_MODULE_SIG_FORMAT) += sign-file
 hostprogs-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += extract-cert
 hostprogs-$(CONFIG_SYSTEM_EXTRA_CERTIFICATE) += insert-sys-cert
 


^ permalink raw reply related

* [PATCH v11 00/13] Appended signatures support for IMA appraisal
From: Thiago Jung Bauermann @ 2019-06-11  6:28 UTC (permalink / raw)
  To: linux-integrity
  Cc: Herbert Xu, linux-doc, Dmitry Kasatkin, David S. Miller,
	Jonathan Corbet, linux-kernel, Mimi Zohar, James Morris,
	David Howells, AKASHI, Takahiro, linux-security-module, keyrings,
	linux-crypto, Jessica Yu, linuxppc-dev, David Woodhouse,
	Thiago Jung Bauermann, Serge E. Hallyn

Hello,

Nothing big in this version. Noteworthy changes are:

1. Fixes for two bugs in ima_appraise_measurements() which were spotted and
resolved by Mimi Zohar. The changelog points them out.

2. One bugfix in process_measurement() which would cause all files
appraised with modsig to be measured as well, even if the policy didn't
request it.

3. Adapted to work with per policy rule template formats.

Plus small cosmetic changes in some places. The changelog has the details.

This has been tested with signed modules and with signed kernels loaded via
kexec_file_load().

Many thanks to Mimi Zohar for her help with the development of this patch
series.

The patches apply on today's linux-integrity/next-queued-testing.

Original cover letter:

On the OpenPOWER platform, secure boot and trusted boot are being
implemented using IMA for taking measurements and verifying signatures.
Since the kernel image on Power servers is an ELF binary, kernels are
signed using the scripts/sign-file tool and thus use the same signature
format as signed kernel modules.

This patch series adds support in IMA for verifying those signatures.
It adds flexibility to OpenPOWER secure boot, because it allows it to boot
kernels with the signature appended to them as well as kernels where the
signature is stored in the IMA extended attribute.

Changes since v10:

- Patch "MODSIGN: Export module signature definitions"
  - Moved config MODULE_SIG_FORMAT definition before its use. Suggested by
    Mimi Zohar.
  - Added missing kerneldoc for @name parameter. Suggested by Mimi Zohar.

- Patch "ima: Implement support for module-style appended signatures"
  - Bugfix: don't check status variable when deciding whether to verify
    modsig in ima_appraise_measurement(). Suggested by Mimi Zohar.
  - Bugfix: verify the modsig in ima_appraise_measurement() if the xattr
    contains a digest. Suggested by Mimi Zohar.

- Patch "ima: Define ima-modsig template"
  - Renamed ima_modsig_serialize() to ima_get_raw_modsig().
  - Renamed check_current_template_modsig() to check_template_modsig().
  - Fixed outdated comment in ima_eventmodsig_init(). Suggested by Mimi
    Zohar.
  - Check either the global or the per-rule template when an appraisal rule
    allows modsig. Suggested by Mimi Zohar.

- Patch "ima: Store the measurement again when appraising a modsig"
  - Bugfix: Only re-measure file containing modsig if it was measured
    before.
  - Check for modsig-related fields in the template_desc obtained in
    process_measurement() which can be a per-rule template. Suggested by Mimi
    Zohar.

- Patch "ima: Allow template= option for appraise rules as well"
  - New patch. Suggested by Mimi Zohar.

Changes since v9:

- Patch "MODSIGN: Export module signature definitions"
  - Moved mod_check_sig() to a new file so that CONFIG_IMA_APPRAISE_MODSIG
    doesn't have to depend on CONFIG_MODULES.
  - Changed scripts/Makefile to build sign-file if CONFIG_MODULE_SIG_FORMAT
    is set.
  - Removed Mimi's Reviewed-by because of the changes in this version.

- Patch "PKCS#7: Refactor verify_pkcs7_signature()"
  - Don't add function pkcs7_get_message_sig() anymore, since it's not
    needed in the current version.

- Patch "PKCS#7: Introduce pkcs7_get_digest()"
  - Changed 'len' argument from 'u8 *' to 'u32 *'.
  - Added 'hash_algo' argument to obtain the algo used for the digest.
  - Don't check whether 'buf', 'len' and 'hash_algo' output arguments are NULL,
    since the function's only caller always sets them.
  - Removed Mimi's Reviewed-by because of the changes in this version.

- Patch "integrity: Introduce asymmetric_sig_has_known_key()"
  - Dropped.

- Patch "integrity: Introduce integrity_keyring_from_id"
  - Squashed into "ima: Implement support for module-style appended signatures"
  - Changed integrity_keyring_from_id() to a static function (suggested by Mimi
    Zohar).

- Patch "ima: Introduce is_signed()"
  - Dropped.

- Patch "ima: Export func_tokens"
  - Squashed into "ima: Implement support for module-style appended signatures"

- Patch "ima: Use designated initializers for struct ima_event_data"
  - New patch.

- Patch "ima: Factor xattr_verify() out of ima_appraise_measurement()"
  - New patch.

- Patch "ima: Implement support for module-style appended signatures"
  - Renamed 'struct modsig_hdr' to 'struct modsig'.
  - Added integrity_modsig_verify() to integrity/digsig.c so that it's not
    necessary to export integrity_keyring_from_id() (Suggested by Mimi Zohar).
  - Don't add functions ima_xattr_sig_known_key() and
    modsig_has_known_key() since they're not necessary anymore.
  - Added modsig argument to ima_appraise_measurement().
  - Verify modsig in a separate function called by ima_appraise_measurement().
  - Renamed ima_read_collect_modsig() to ima_read_modsig(), with a separate
    collect function added in patch "ima: Collect modsig" (suggested by Mimi
    Zohar).
  - In ima_read_modsig(), moved code saving of raw PKCS7 data to 'struct
    modsig' to patch "ima: Collect modsig".
  - In ima_read_modsig(), moved all parts related to the modsig hash to
    patch "ima: Collect modsig".
  - In ima_read_modsig(), don't check if the buf pointer is NULL since it's
    never supposed to happen.
  - Renamed ima_free_xattr_data() to ima_free_modsig().
  - No need to check for modsig in ima_read_xattr() and
    ima_inode_set_xattr() anymore.
  - In ima_modsig_verify(), don't check if the modsig pointer is NULL since
    it's not supposed to happen.
  - Don't define IMA_MODSIG element in enum evm_ima_xattr_type.

- Patch "ima: Collect modsig"
  - New patch.

- Patch "ima: Define ima-modsig template"
  - Patch renamed from "ima: Add new "d-sig" template field"
  - Renamed 'd-sig' template field to 'd-modsig'.
  - Added 'modsig' template field.
  - Added 'ima-modsig' defined template descriptor.
  - Renamed ima_modsig_serialize_data() to ima_modsig_serialize().
  - Renamed ima_get_modsig_hash() to ima_get_modsig_digest(). Also the
    function is a lot simpler now since what it used to do is now done in
    ima_collect_modsig() and pkcs7_get_digest().
  - Added check for failed modsig collection in ima_eventdigest_modsig_init().
  - Added modsig argument to ima_store_measurement().
  - Added 'modsig' field to struct ima_event_data.
  - Removed check for modsig == NULL in ima_get_modsig_digest() and in
    ima_modsig_serialize_data() since their callers already perform that
    check.
  - Moved check_current_template_modsig() to this patch, previously was in
    "ima: Store the measurement again when appraising a modsig".

- Patch "ima: Store the measurement again when appraising a modsig"
  - Renamed ima_template_has_sig() to ima_template_has_modsig().
  - Added a change to ima_collect_measurement(), making it call
    ima_collect_modsig() even if IMA_COLLECT is set in iint->flags.
  - Removed IMA_READ_MEASURE flag.
  - Renamed template_has_sig global variable to template_has_modsig.
  - Renamed find_sig_in_template() to find_modsig_in_template().


Thiago Jung Bauermann (13):
  MODSIGN: Export module signature definitions
  PKCS#7: Refactor verify_pkcs7_signature()
  PKCS#7: Introduce pkcs7_get_digest()
  integrity: Introduce struct evm_xattr
  integrity: Select CONFIG_KEYS instead of depending on it
  ima: Use designated initializers for struct ima_event_data
  ima: Add modsig appraise_type option for module-style appended
    signatures
  ima: Factor xattr_verify() out of ima_appraise_measurement()
  ima: Implement support for module-style appended signatures
  ima: Collect modsig
  ima: Define ima-modsig template
  ima: Store the measurement again when appraising a modsig
  ima: Allow template= option for appraise rules as well

 Documentation/ABI/testing/ima_policy      |   6 +-
 Documentation/security/IMA-templates.rst  |   7 +-
 certs/system_keyring.c                    |  61 +++++--
 crypto/asymmetric_keys/pkcs7_verify.c     |  33 ++++
 include/crypto/pkcs7.h                    |   4 +
 include/linux/module.h                    |   3 -
 include/linux/module_signature.h          |  44 +++++
 include/linux/verification.h              |  10 ++
 init/Kconfig                              |   6 +-
 kernel/Makefile                           |   1 +
 kernel/module.c                           |   1 +
 kernel/module_signature.c                 |  46 +++++
 kernel/module_signing.c                   |  56 +-----
 scripts/Makefile                          |   2 +-
 security/integrity/Kconfig                |   2 +-
 security/integrity/digsig.c               |  43 ++++-
 security/integrity/evm/evm_main.c         |   8 +-
 security/integrity/ima/Kconfig            |  13 ++
 security/integrity/ima/Makefile           |   1 +
 security/integrity/ima/ima.h              |  60 ++++++-
 security/integrity/ima/ima_api.c          |  34 +++-
 security/integrity/ima/ima_appraise.c     | 199 ++++++++++++++--------
 security/integrity/ima/ima_init.c         |   4 +-
 security/integrity/ima/ima_main.c         |  24 ++-
 security/integrity/ima/ima_modsig.c       | 169 ++++++++++++++++++
 security/integrity/ima/ima_policy.c       |  68 +++++++-
 security/integrity/ima/ima_template.c     |  26 ++-
 security/integrity/ima/ima_template_lib.c |  60 ++++++-
 security/integrity/ima/ima_template_lib.h |   4 +
 security/integrity/integrity.h            |  26 +++
 30 files changed, 840 insertions(+), 181 deletions(-)
 create mode 100644 include/linux/module_signature.h
 create mode 100644 kernel/module_signature.c
 create mode 100644 security/integrity/ima/ima_modsig.c


^ permalink raw reply

* Re: [PATCH 1/3] powerpc/64: __ioremap_at clean up in the error case
From: Christophe Leroy @ 2019-06-11  6:28 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev
In-Reply-To: <20190610030818.17965-1-npiggin@gmail.com>



Le 10/06/2019 à 05:08, Nicholas Piggin a écrit :
> __ioremap_at error handling is wonky, it requires caller to clean up
> after it. Implement a helper that does the map and error cleanup and
> remove the requirement from the caller.
> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
> 
> This series is a different approach to the problem, using the generic
> ioremap_page_range directly which reduces added code, and moves
> the radix specific code into radix files. Thanks to Christophe for
> pointing out various problems with the previous patch.
> 
>   arch/powerpc/mm/pgtable_64.c | 27 ++++++++++++++++++++-------
>   1 file changed, 20 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
> index d2d976ff8a0e..6bd3660388aa 100644
> --- a/arch/powerpc/mm/pgtable_64.c
> +++ b/arch/powerpc/mm/pgtable_64.c
> @@ -108,14 +108,30 @@ unsigned long ioremap_bot;
>   unsigned long ioremap_bot = IOREMAP_BASE;
>   #endif
>   
> +static int ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot, int nid)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < size; i += PAGE_SIZE) {
> +		int err = map_kernel_page(ea + i, pa + i, prot);

Missing a blank line

> +		if (err) {

I'd have done the following to reduce indentation depth

		if (!err)
			continue

> +			if (slab_is_available())
> +				unmap_kernel_range(ea, size);

Shouldn't it be unmap_kernel_range(ea, i) ?

Christophe

> +			else
> +				WARN_ON_ONCE(1); /* Should clean up */
> +			return err;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>   /**
>    * __ioremap_at - Low level function to establish the page tables
>    *                for an IO mapping
>    */
>   void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_t prot)
>   {
> -	unsigned long i;
> -
>   	/* We don't support the 4K PFN hack with ioremap */
>   	if (pgprot_val(prot) & H_PAGE_4K_PFN)
>   		return NULL;
> @@ -129,9 +145,8 @@ void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_
>   	WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
>   	WARN_ON(size & ~PAGE_MASK);
>   
> -	for (i = 0; i < size; i += PAGE_SIZE)
> -		if (map_kernel_page((unsigned long)ea + i, pa + i, prot))
> -			return NULL;
> +	if (ioremap_range((unsigned long)ea, pa, size, prot, NUMA_NO_NODE))
> +		return NULL;
>   
>   	return (void __iomem *)ea;
>   }
> @@ -182,8 +197,6 @@ void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size,
>   
>   		area->phys_addr = paligned;
>   		ret = __ioremap_at(paligned, area->addr, size, prot);
> -		if (!ret)
> -			vunmap(area->addr);
>   	} else {
>   		ret = __ioremap_at(paligned, (void *)ioremap_bot, size, prot);
>   		if (ret)
> 

^ permalink raw reply

* Re: [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings
From: Anshuman Khandual @ 2019-06-11  6:17 UTC (permalink / raw)
  To: Nicholas Piggin, Mark Rutland; +Cc: linux-mm, linuxppc-dev, linux-arm-kernel
In-Reply-To: <1560177786.t6c5cn5hw4.astroid@bobo.none>



On 06/10/2019 08:14 PM, Nicholas Piggin wrote:
> Mark Rutland's on June 11, 2019 12:10 am:
>> Hi,
>>
>> On Mon, Jun 10, 2019 at 02:38:38PM +1000, Nicholas Piggin wrote:
>>> For platforms that define HAVE_ARCH_HUGE_VMAP, have vmap allow vmalloc to
>>> allocate huge pages and map them
>>>
>>> This brings dTLB misses for linux kernel tree `git diff` from 45,000 to
>>> 8,000 on a Kaby Lake KVM guest with 8MB dentry hash and mitigations=off
>>> (performance is in the noise, under 1% difference, page tables are likely
>>> to be well cached for this workload). Similar numbers are seen on POWER9.
>>
>> Do you happen to know which vmalloc mappings these get used for in the
>> above case? Where do we see vmalloc mappings that large?
> 
> Large module vmalloc could be subject to huge mappings.
> 
>> I'm worried as to how this would interact with the set_memory_*()
>> functions, as on arm64 those can only operate on page-granular mappings.
>> Those may need fixing up to handle huge mappings; certainly if the above
>> is all for modules.
> 
> Good point, that looks like it would break on arm64 at least. I'll
> work on it. We may have to make this opt in beyond HUGE_VMAP.

This is another reason we might need to have an arch opt-in like the one
I mentioned before.

^ permalink raw reply

* Re: [BISECTED REGRESSION] b43legacy broken on G4 PowerBook
From: Christoph Hellwig @ 2019-06-11  6:08 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Aaro Koskinen, linux-wireless, linux-kernel, Christian Zigotzky,
	linuxppc-dev, Christoph Hellwig, Larry Finger
In-Reply-To: <c91ccbddd6a58dbee5705f10ed1d98fb44bd8f8d.camel@kernel.crashing.org>

On Tue, Jun 11, 2019 at 03:56:33PM +1000, Benjamin Herrenschmidt wrote:
> The reason I think it sort-of-mostly-worked is that to get more than
> 1GB of RAM, those machines use CONFIG_HIGHMEM. And *most* network
> buffers aren't allocated in Highmem.... so you got lucky.
> 
> That said, there is such a thing as no-copy send on network, so I
> wouldn't be surprised if some things would still have failed, just not
> frequent enough for you to notice.

Unless NETIF_F_HIGHDMA is set on a netdev, the core networking code
will bounce buffer highmem pages for the driver under all circumstances.

^ permalink raw reply

* Re: [BISECTED REGRESSION] b43legacy broken on G4 PowerBook
From: Christoph Hellwig @ 2019-06-11  6:05 UTC (permalink / raw)
  To: Larry Finger
  Cc: Aaro Koskinen, linux-wireless, linux-kernel, Christian Zigotzky,
	linuxppc-dev, Christoph Hellwig
In-Reply-To: <153c13f5-a829-1eab-a3c5-fecfb84127ff@lwfinger.net>

On Mon, Jun 10, 2019 at 11:09:47AM -0500, Larry Finger wrote:
>>>                  return -EIO;
>>>
>>> For b43legacy, dev->dma_mask is 0xc265684800000000.
>>>      dma_supported(dev, mask) is 0xc08b000000000000, mask is 0x3fffffff, and
>>> the routine returns -EIO.
>>>
>>> For b43,       dev->dma_mask is 0xc265684800000001,
>>>      dma_supported(dev, mask) is 0xc08b000000000000, mask is 0x77777777, and
>>> the routine returns 0.
>>
>> I don't fully understand what values the above map to.  Can you send
>> me your actual debugging patch as well?
>
> I do not understand why the if statement returns true as neither of the 
> values is zero. After seeing the x86 output shown below, I also do not 
> understand all the trailing zeros.
>
> My entire patch is attached. That output came from this section:

What might be confusing in your output is that dev->dma_mask is a pointer,
and we are setting it in dma_set_mask.  That is before we only check
if the pointer is set, and later we override it.  Of course this doesn't
actually explain the failure.  But what is even more strange to me
is that you get a return value from dma_supported() that isn't 0 or 1,
as that function is supposed to return a boolean, and I really can't see
how mask >= __phys_to_dma(dev, min_mask), would return anything but 0
or 1.  Does the output change if you use the correct printk specifiers?

i.e. with a debug patch like this:


diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 2c2772e9702a..9e5b30b12b10 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -378,6 +378,7 @@ EXPORT_SYMBOL(dma_direct_map_resource);
 int dma_direct_supported(struct device *dev, u64 mask)
 {
 	u64 min_mask;
+	bool ret;
 
 	if (IS_ENABLED(CONFIG_ZONE_DMA))
 		min_mask = DMA_BIT_MASK(ARCH_ZONE_DMA_BITS);
@@ -391,7 +392,12 @@ int dma_direct_supported(struct device *dev, u64 mask)
 	 * use __phys_to_dma() here so that the SME encryption mask isn't
 	 * part of the check.
 	 */
-	return mask >= __phys_to_dma(dev, min_mask);
+	ret = (mask >= __phys_to_dma(dev, min_mask));
+	if (!ret)
+		dev_info(dev,
+			"%s: failed (mask = 0x%llx, min_mask = 0x%llx/0x%llx, dma bits = %d\n",
+			__func__, mask, min_mask, __phys_to_dma(dev, min_mask), ARCH_ZONE_DMA_BITS);
+	return ret;
 }
 
 size_t dma_direct_max_mapping_size(struct device *dev)
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index f7afdadb6770..6c57ccdee2ae 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -317,8 +317,14 @@ void arch_dma_set_mask(struct device *dev, u64 mask);
 
 int dma_set_mask(struct device *dev, u64 mask)
 {
-	if (!dev->dma_mask || !dma_supported(dev, mask))
+	if (!dev->dma_mask) {
+		dev_info(dev, "no DMA mask set!\n");
 		return -EIO;
+	}
+	if (!dma_supported(dev, mask)) {
+		printk("DMA not supported\n");
+		return -EIO;
+	}
 
 	arch_dma_set_mask(dev, mask);
 	dma_check_mask(dev, mask);

^ permalink raw reply related

* Re: [BISECTED REGRESSION] b43legacy broken on G4 PowerBook
From: Benjamin Herrenschmidt @ 2019-06-11  5:56 UTC (permalink / raw)
  To: Larry Finger, Aaro Koskinen, Christoph Hellwig,
	Christian Zigotzky, Michael Ellerman
  Cc: linux-wireless, linuxppc-dev, linux-kernel
In-Reply-To: <3ed1ccfe-d7ca-11b9-17b3-303d1ae1bb0f@lwfinger.net>

On Mon, 2019-06-10 at 13:44 -0500, Larry Finger wrote:
> On 6/7/19 11:21 PM, Benjamin Herrenschmidt wrote:
> > 
> > > Please try the attached patch. I'm not really pleased with it and I will
> > > continue to determine why the fallback to a 30-bit mask fails, but at least this
> > > one works for me.
> > 
> > Your patch only makes sense if the device is indeed capable of
> > addressing 31-bits.
> > 
> > So either the driver is buggy and asks for a too small mask in which
> > case your patch is ok, or it's not and you're just going to cause all
> > sort of interesting random problems including possible memory
> > corruption.
> 
> Of course the driver may be buggy, but it asks for the correct mask.
> 
> This particular device is not capable of handling 32-bit DMA. The driver detects 
> the 32-bit failure and falls back to 30 bits. It works on x86, and did on PPC32 
> until 5.1. As Christoph said, it should always be possible to use fewer bits 
> than the maximum.

No, I don't think it *worked* on ppc32 before Christoph's patch. I think
it "mostly sort-of worked" :-)

The reason I'm saying that is if your system has more than 1GB of RAM,
then you'll have chunks of memory that the device simply cannot
address.

Before Christoph's patches, we had no ZONE_DMA or ZONE_DMA32 covering the
30-bit limited space, so any memory allocation could in theory land
above 30-bits, causing all sorts of horrible things to happen with that
driver.

The reason I think it sort-of-mostly-worked is that to get more than
1GB of RAM, those machines use CONFIG_HIGHMEM. And *most* network
buffers aren't allocated in Highmem.... so you got lucky.

That said, there is such a thing as no-copy send on network, so I
wouldn't be surprised if some things would still have failed, just not
frequently enough for you to notice.

> Similar devices that are new enough to use b43 rather than b43legacy work with 
> new kernels; however, they have and use 32-bit DMA.

Cheers,
Ben.

^ permalink raw reply

* Re: [PATCH v2 3/6] powerpc/eeh: Improve debug messages around device addition
From: Alexey Kardashevskiy @ 2019-06-11  5:47 UTC (permalink / raw)
  To: Sam Bobroff, linuxppc-dev; +Cc: oohall, tyreld
In-Reply-To: <8deaedffad8ed3327f296a561c2a31c930c65f88.1557203383.git.sbobroff@linux.ibm.com>



On 07/05/2019 14:30, Sam Bobroff wrote:
> Also remove useless comment.
> 
> Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  arch/powerpc/kernel/eeh.c                    |  2 +-
>  arch/powerpc/platforms/powernv/eeh-powernv.c | 14 ++++++++----
>  arch/powerpc/platforms/pseries/eeh_pseries.c | 23 +++++++++++++++-----
>  3 files changed, 28 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
> index 8d3c36a1f194..b14d89547895 100644
> --- a/arch/powerpc/kernel/eeh.c
> +++ b/arch/powerpc/kernel/eeh.c
> @@ -1291,7 +1291,7 @@ void eeh_add_device_late(struct pci_dev *dev)
>  	pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn);
>  	edev = pdn_to_eeh_dev(pdn);
>  	if (edev->pdev == dev) {
> -		pr_debug("EEH: Already referenced !\n");
> +		pr_debug("EEH: Device %s already referenced!\n", pci_name(dev));
>  		return;
>  	}
>  
> diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
> index 6fc1a463b796..0e374cdba961 100644
> --- a/arch/powerpc/platforms/powernv/eeh-powernv.c
> +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
> @@ -50,10 +50,7 @@ void pnv_pcibios_bus_add_device(struct pci_dev *pdev)
>  	if (!pdev->is_virtfn)
>  		return;
>  
> -	/*
> -	 * The following operations will fail if VF's sysfs files
> -	 * aren't created or its resources aren't finalized.
> -	 */
> +	pr_debug("%s: EEH: Setting up device %s.\n", __func__, pci_name(pdev));


dev_dbg() seems more appropriate.


>  	eeh_add_device_early(pdn);
>  	eeh_add_device_late(pdev);
>  	eeh_sysfs_add_device(pdev);
> @@ -397,6 +394,10 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
>  	int ret;
>  	int config_addr = (pdn->busno << 8) | (pdn->devfn);
>  
> +	pr_debug("%s: probing %04x:%02x:%02x.%01x\n",
> +		__func__, hose->global_number, pdn->busno,
> +		PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
> +
>  	/*
>  	 * When probing the root bridge, which doesn't have any
>  	 * subordinate PCI devices. We don't have OF node for
> @@ -491,6 +492,11 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
>  	/* Save memory bars */
>  	eeh_save_bars(edev);
>  
> +	pr_debug("%s: EEH enabled on %02x:%02x.%01x PHB#%x-PE#%x\n",
> +		__func__, pdn->busno, PCI_SLOT(pdn->devfn),
> +		PCI_FUNC(pdn->devfn), edev->pe->phb->global_number,
> +		edev->pe->addr);
> +
>  	return NULL;
>  }
>  
> diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
> index 7aa50258dd42..ae06878fbdea 100644
> --- a/arch/powerpc/platforms/pseries/eeh_pseries.c
> +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
> @@ -65,6 +65,8 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev)
>  	if (!pdev->is_virtfn)
>  		return;
>  
> +	pr_debug("%s: EEH: Setting up device %s.\n", __func__, pci_name(pdev));
> +
>  	pdn->device_id  =  pdev->device;
>  	pdn->vendor_id  =  pdev->vendor;
>  	pdn->class_code =  pdev->class;
> @@ -251,6 +253,10 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
>  	int enable = 0;
>  	int ret;
>  
> +	pr_debug("%s: probing %04x:%02x:%02x.%01x\n",
> +		__func__, pdn->phb->global_number, pdn->busno,
> +		PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
> +
>  	/* Retrieve OF node and eeh device */
>  	edev = pdn_to_eeh_dev(pdn);
>  	if (!edev || edev->pe)
> @@ -294,7 +300,12 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
>  
>  	/* Enable EEH on the device */
>  	ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE);
> -	if (!ret) {
> +	if (ret) {
> +		pr_debug("%s: EEH failed to enable on %02x:%02x.%01x PHB#%x-PE#%x (code %d)\n",
> +			__func__, pdn->busno, PCI_SLOT(pdn->devfn),
> +			PCI_FUNC(pdn->devfn), pe.phb->global_number,
> +			pe.addr, ret);
> +	} else {


edev!=NULL here so you could do dev_dbg(&edev->pdev->dev,...) and skip
PCI_SLOT/PCI_FUNC. Or is (edev!=NULL && edev->pdev==NULL) possible (it
could be, just asking)?


>  		/* Retrieve PE address */
>  		edev->pe_config_addr = eeh_ops->get_pe_addr(&pe);
>  		pe.addr = edev->pe_config_addr;
> @@ -310,11 +321,6 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
>  		if (enable) {
>  			eeh_add_flag(EEH_ENABLED);
>  			eeh_add_to_parent_pe(edev);
> -
> -			pr_debug("%s: EEH enabled on %02x:%02x.%01x PHB#%x-PE#%x\n",
> -				__func__, pdn->busno, PCI_SLOT(pdn->devfn),
> -				PCI_FUNC(pdn->devfn), pe.phb->global_number,
> -				pe.addr);
>  		} else if (pdn->parent && pdn_to_eeh_dev(pdn->parent) &&
>  			   (pdn_to_eeh_dev(pdn->parent))->pe) {
>  			/* This device doesn't support EEH, but it may have an
> @@ -323,6 +329,11 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
>  			edev->pe_config_addr = pdn_to_eeh_dev(pdn->parent)->pe_config_addr;
>  			eeh_add_to_parent_pe(edev);
>  		}
> +		pr_debug("%s: EEH %s on %02x:%02x.%01x PHB#%x-PE#%x (code %d)\n",
> +			__func__, (enable ? "enabled" : "unsupported"),
> +			pdn->busno, PCI_SLOT(pdn->devfn),
> +			PCI_FUNC(pdn->devfn), pe.phb->global_number,
> +			pe.addr, ret);

Same here. I understand though this one is a cut-n-paste :)


>  	}
>  
>  	/* Save memory bars */
> 

-- 
Alexey

^ permalink raw reply

* Re: [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings
From: Christophe Leroy @ 2019-06-11  5:39 UTC (permalink / raw)
  To: Nicholas Piggin, linux-mm, Russell Currey; +Cc: linuxppc-dev, linux-arm-kernel
In-Reply-To: <20190610043838.27916-4-npiggin@gmail.com>



Le 10/06/2019 à 06:38, Nicholas Piggin a écrit :
> For platforms that define HAVE_ARCH_HUGE_VMAP, have vmap allow vmalloc to
> allocate huge pages and map them

Will this be compatible with Russell's series 
https://patchwork.ozlabs.org/patch/1099857/ for the implementation of 
STRICT_MODULE_RWX ?
I see that apply_to_page_range() has things like BUG_ON(pud_huge(*pud));

Might also be an issue for arm64 as I think Russell's implementation 
comes from there.

> 
> This brings dTLB misses for linux kernel tree `git diff` from 45,000 to
> 8,000 on a Kaby Lake KVM guest with 8MB dentry hash and mitigations=off
> (performance is in the noise, under 1% difference, page tables are likely
> to be well cached for this workload). Similar numbers are seen on POWER9.
> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>   include/asm-generic/4level-fixup.h |   1 +
>   include/asm-generic/5level-fixup.h |   1 +
>   include/linux/vmalloc.h            |   1 +
>   mm/vmalloc.c                       | 132 +++++++++++++++++++++++------
>   4 files changed, 107 insertions(+), 28 deletions(-)
> 
> diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h
> index e3667c9a33a5..3cc65a4dd093 100644
> --- a/include/asm-generic/4level-fixup.h
> +++ b/include/asm-generic/4level-fixup.h
> @@ -20,6 +20,7 @@
>   #define pud_none(pud)			0
>   #define pud_bad(pud)			0
>   #define pud_present(pud)		1
> +#define pud_large(pud)			0
>   #define pud_ERROR(pud)			do { } while (0)
>   #define pud_clear(pud)			pgd_clear(pud)
>   #define pud_val(pud)			pgd_val(pud)
> diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h
> index bb6cb347018c..c4377db09a4f 100644
> --- a/include/asm-generic/5level-fixup.h
> +++ b/include/asm-generic/5level-fixup.h
> @@ -22,6 +22,7 @@
>   #define p4d_none(p4d)			0
>   #define p4d_bad(p4d)			0
>   #define p4d_present(p4d)		1
> +#define p4d_large(p4d)			0
>   #define p4d_ERROR(p4d)			do { } while (0)
>   #define p4d_clear(p4d)			pgd_clear(p4d)
>   #define p4d_val(p4d)			pgd_val(p4d)
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index 812bea5866d6..4c92dc608928 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -42,6 +42,7 @@ struct vm_struct {
>   	unsigned long		size;
>   	unsigned long		flags;
>   	struct page		**pages;
> +	unsigned int		page_shift;
>   	unsigned int		nr_pages;
>   	phys_addr_t		phys_addr;
>   	const void		*caller;
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index dd27cfb29b10..0cf8e861caeb 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -36,6 +36,7 @@
>   #include <linux/rbtree_augmented.h>
>   
>   #include <linux/uaccess.h>
> +#include <asm/pgtable.h>
>   #include <asm/tlbflush.h>
>   #include <asm/shmparam.h>
>   
> @@ -440,6 +441,41 @@ static int vmap_pages_range(unsigned long start, unsigned long end,
>   	return ret;
>   }
>   
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> +static int vmap_hpages_range(unsigned long start, unsigned long end,
> +				   pgprot_t prot, struct page **pages,
> +				   unsigned int page_shift)
> +{
> +	unsigned long addr = start;
> +	unsigned int i, nr = (end - start) >> (PAGE_SHIFT + page_shift);
> +
> +	for (i = 0; i < nr; i++) {
> +		int err;
> +
> +		err = vmap_range_noflush(addr,
> +					addr + (PAGE_SIZE << page_shift),
> +					__pa(page_address(pages[i])), prot,
> +					PAGE_SHIFT + page_shift);
> +		if (err)
> +			return err;
> +
> +		addr += PAGE_SIZE << page_shift;
> +	}
> +	flush_cache_vmap(start, end);
> +
> +	return nr;
> +}
> +#else
> +static int vmap_hpages_range(unsigned long start, unsigned long end,
> +			   pgprot_t prot, struct page **pages,
> +			   unsigned int page_shift)
> +{
> +	BUG_ON(page_shift != PAGE_SIZE);

Do we really need a BUG_ON() there ? What happens if this condition is 
true ?

> +	return vmap_pages_range(start, end, prot, pages);
> +}
> +#endif
> +
> +
>   int is_vmalloc_or_module_addr(const void *x)
>   {
>   	/*
> @@ -462,7 +498,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>   {
>   	unsigned long addr = (unsigned long) vmalloc_addr;
>   	struct page *page = NULL;
> -	pgd_t *pgd = pgd_offset_k(addr);
> +	pgd_t *pgd;
>   	p4d_t *p4d;
>   	pud_t *pud;
>   	pmd_t *pmd;
> @@ -474,27 +510,38 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>   	 */
>   	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
>   
> +	pgd = pgd_offset_k(addr);
>   	if (pgd_none(*pgd))
>   		return NULL;
> +
>   	p4d = p4d_offset(pgd, addr);
>   	if (p4d_none(*p4d))
>   		return NULL;
> -	pud = pud_offset(p4d, addr);
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP

Do we really need that ifdef ? Won't p4d_large() always return 0 when
CONFIG_HAVE_ARCH_HUGE_VMAP is not set ?
Otherwise, could we use IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP) instead ?

Same several places below.

> +	if (p4d_large(*p4d))
> +		return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
> +#endif
> +	if (WARN_ON_ONCE(p4d_bad(*p4d)))
> +		return NULL;
>   
> -	/*
> -	 * Don't dereference bad PUD or PMD (below) entries. This will also
> -	 * identify huge mappings, which we may encounter on architectures
> -	 * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
> -	 * identified as vmalloc addresses by is_vmalloc_addr(), but are
> -	 * not [unambiguously] associated with a struct page, so there is
> -	 * no correct value to return for them.
> -	 */
> -	WARN_ON_ONCE(pud_bad(*pud));
> -	if (pud_none(*pud) || pud_bad(*pud))
> +	pud = pud_offset(p4d, addr);
> +	if (pud_none(*pud))
> +		return NULL;
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> +	if (pud_large(*pud))
> +		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
> +#endif
> +	if (WARN_ON_ONCE(pud_bad(*pud)))
>   		return NULL;
> +
>   	pmd = pmd_offset(pud, addr);
> -	WARN_ON_ONCE(pmd_bad(*pmd));
> -	if (pmd_none(*pmd) || pmd_bad(*pmd))
> +	if (pmd_none(*pmd))
> +		return NULL;
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> +	if (pmd_large(*pmd))
> +		return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
> +#endif
> +	if (WARN_ON_ONCE(pmd_bad(*pmd)))
>   		return NULL;
>   
>   	ptep = pte_offset_map(pmd, addr);
> @@ -502,6 +549,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>   	if (pte_present(pte))
>   		page = pte_page(pte);
>   	pte_unmap(ptep);
> +
>   	return page;
>   }
>   EXPORT_SYMBOL(vmalloc_to_page);
> @@ -2185,8 +2233,9 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
>   		return NULL;
>   
>   	if (flags & VM_IOREMAP)
> -		align = 1ul << clamp_t(int, get_count_order_long(size),
> -				       PAGE_SHIFT, IOREMAP_MAX_ORDER);
> +		align = max(align,
> +				1ul << clamp_t(int, get_count_order_long(size),
> +				       PAGE_SHIFT, IOREMAP_MAX_ORDER));
>   
>   	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
>   	if (unlikely(!area))
> @@ -2398,7 +2447,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
>   			struct page *page = area->pages[i];
>   
>   			BUG_ON(!page);
> -			__free_pages(page, 0);
> +			__free_pages(page, area->page_shift);
>   		}
>   
>   		kvfree(area->pages);
> @@ -2541,14 +2590,17 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>   				 pgprot_t prot, int node)
>   {
>   	struct page **pages;
> +	unsigned long addr = (unsigned long)area->addr;
> +	unsigned long size = get_vm_area_size(area);
> +	unsigned int page_shift = area->page_shift;
> +	unsigned int shift = page_shift + PAGE_SHIFT;
>   	unsigned int nr_pages, array_size, i;
>   	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
>   	const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
>   	const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
> -					0 :
> -					__GFP_HIGHMEM;
> +					0 : __GFP_HIGHMEM;

This patch is already quite big, shouldn't this kind of unrelated 
cleanup be in another patch ?

>   
> -	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
> +	nr_pages = size >> shift;
>   	array_size = (nr_pages * sizeof(struct page *));
>   
>   	area->nr_pages = nr_pages;
> @@ -2569,10 +2621,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>   	for (i = 0; i < area->nr_pages; i++) {
>   		struct page *page;
>   
> -		if (node == NUMA_NO_NODE)
> -			page = alloc_page(alloc_mask|highmem_mask);
> -		else
> -			page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
> +		page = alloc_pages_node(node,
> +				alloc_mask|highmem_mask, page_shift);

This is also a nice cleanup, but does it really belong to this patch ?

>   
>   		if (unlikely(!page)) {
>   			/* Successfully allocated i pages, free them in __vunmap() */
> @@ -2584,8 +2634,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>   			cond_resched();
>   	}
>   
> -	if (map_vm_area(area, prot, pages))
> +	if (vmap_hpages_range(addr, addr + size, prot, pages, page_shift) < 0)
>   		goto fail;
> +

Cleanup ?

>   	return area->addr;
>   
>   fail:
> @@ -2619,22 +2670,39 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
>   			pgprot_t prot, unsigned long vm_flags, int node,
>   			const void *caller)
>   {
> -	struct vm_struct *area;
> +	struct vm_struct *area = NULL;
>   	void *addr;
>   	unsigned long real_size = size;
> +	unsigned long real_align = align;
> +	unsigned int shift = PAGE_SHIFT;
>   
>   	size = PAGE_ALIGN(size);
>   	if (!size || (size >> PAGE_SHIFT) > totalram_pages())
>   		goto fail;
>   
> +	if (IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) {
> +		unsigned long size_per_node;
> +
> +		size_per_node = size;
> +		if (node == NUMA_NO_NODE)
> +			size_per_node /= num_online_nodes();
> +		if (size_per_node >= PMD_SIZE)
> +			shift = PMD_SHIFT;
> +	}
> +again:
> +	align = max(real_align, 1UL << shift);
> +	size = ALIGN(real_size, align);
> +
>   	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
>   				vm_flags, start, end, node, gfp_mask, caller);
>   	if (!area)
>   		goto fail;
>   
> +	area->page_shift = shift - PAGE_SHIFT;
> +
>   	addr = __vmalloc_area_node(area, gfp_mask, prot, node);
>   	if (!addr)
> -		return NULL;
> +		goto fail;
>   
>   	/*
>   	 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
> @@ -2648,8 +2716,16 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
>   	return addr;
>   
>   fail:
> -	warn_alloc(gfp_mask, NULL,
> +	if (shift == PMD_SHIFT) {
> +		shift = PAGE_SHIFT;
> +		goto again;
> +	}
> +
> +	if (!area) {
> +		/* Warn for area allocation, page allocations already warn */
> +		warn_alloc(gfp_mask, NULL,
>   			  "vmalloc: allocation failure: %lu bytes", real_size);
> +	}
>   	return NULL;
>   }
>   
> 

Christophe

^ permalink raw reply

* Re: [PATCH 1/4] mm: Move ioremap page table mapping function to mm/
From: Christophe Leroy @ 2019-06-11  5:24 UTC (permalink / raw)
  To: Nicholas Piggin, linux-mm; +Cc: linuxppc-dev, linux-arm-kernel
In-Reply-To: <20190610043838.27916-1-npiggin@gmail.com>



Le 10/06/2019 à 06:38, Nicholas Piggin a écrit :
> ioremap_page_range is a generic function to create a kernel virtual
> mapping, move it to mm/vmalloc.c and rename it vmap_range.
> 
> For clarity with this move, also:
> - Rename vunmap_page_range (vmap_range's inverse) to vunmap_range.
> - Rename vmap_page_range (which takes a page array) to vmap_pages.

Maybe it would be easier to follow the change if the renaming was 
done in a separate patch from the move.

> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
> 
> Fixed up the arm64 compile errors, fixed a few bugs, and tidied
> things up a bit more.
> 
> Have tested powerpc and x86 but not arm64, would appreciate a review
> and test of the arm64 patch if possible.
> 
>   include/linux/vmalloc.h |   3 +
>   lib/ioremap.c           | 173 +++---------------------------
>   mm/vmalloc.c            | 228 ++++++++++++++++++++++++++++++++++++----
>   3 files changed, 229 insertions(+), 175 deletions(-)
> 
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index 51e131245379..812bea5866d6 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -147,6 +147,9 @@ extern struct vm_struct *find_vm_area(const void *addr);
>   extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
>   			struct page **pages);
>   #ifdef CONFIG_MMU
> +extern int vmap_range(unsigned long addr,
> +		       unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
> +		       unsigned int max_page_shift);

Drop extern keyword here.

As checkpatch tells you, 'CHECK:AVOID_EXTERNS: extern prototypes should 
be avoided in .h files'

Christophe

>   extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
>   				    pgprot_t prot, struct page **pages);
>   extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size);
> diff --git a/lib/ioremap.c b/lib/ioremap.c
> index 063213685563..e13946da8ec3 100644
> --- a/lib/ioremap.c
> +++ b/lib/ioremap.c
> @@ -58,165 +58,24 @@ static inline int ioremap_pud_enabled(void) { return 0; }
>   static inline int ioremap_pmd_enabled(void) { return 0; }
>   #endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */
>   
> -static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
> -		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
> -{
> -	pte_t *pte;
> -	u64 pfn;
> -
> -	pfn = phys_addr >> PAGE_SHIFT;
> -	pte = pte_alloc_kernel(pmd, addr);
> -	if (!pte)
> -		return -ENOMEM;
> -	do {
> -		BUG_ON(!pte_none(*pte));
> -		set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
> -		pfn++;
> -	} while (pte++, addr += PAGE_SIZE, addr != end);
> -	return 0;
> -}
> -
> -static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr,
> -				unsigned long end, phys_addr_t phys_addr,
> -				pgprot_t prot)
> -{
> -	if (!ioremap_pmd_enabled())
> -		return 0;
> -
> -	if ((end - addr) != PMD_SIZE)
> -		return 0;
> -
> -	if (!IS_ALIGNED(phys_addr, PMD_SIZE))
> -		return 0;
> -
> -	if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
> -		return 0;
> -
> -	return pmd_set_huge(pmd, phys_addr, prot);
> -}
> -
> -static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
> -		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
> -{
> -	pmd_t *pmd;
> -	unsigned long next;
> -
> -	pmd = pmd_alloc(&init_mm, pud, addr);
> -	if (!pmd)
> -		return -ENOMEM;
> -	do {
> -		next = pmd_addr_end(addr, end);
> -
> -		if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot))
> -			continue;
> -
> -		if (ioremap_pte_range(pmd, addr, next, phys_addr, prot))
> -			return -ENOMEM;
> -	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
> -	return 0;
> -}
> -
> -static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr,
> -				unsigned long end, phys_addr_t phys_addr,
> -				pgprot_t prot)
> -{
> -	if (!ioremap_pud_enabled())
> -		return 0;
> -
> -	if ((end - addr) != PUD_SIZE)
> -		return 0;
> -
> -	if (!IS_ALIGNED(phys_addr, PUD_SIZE))
> -		return 0;
> -
> -	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
> -		return 0;
> -
> -	return pud_set_huge(pud, phys_addr, prot);
> -}
> -
> -static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
> -		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
> -{
> -	pud_t *pud;
> -	unsigned long next;
> -
> -	pud = pud_alloc(&init_mm, p4d, addr);
> -	if (!pud)
> -		return -ENOMEM;
> -	do {
> -		next = pud_addr_end(addr, end);
> -
> -		if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot))
> -			continue;
> -
> -		if (ioremap_pmd_range(pud, addr, next, phys_addr, prot))
> -			return -ENOMEM;
> -	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
> -	return 0;
> -}
> -
> -static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr,
> -				unsigned long end, phys_addr_t phys_addr,
> -				pgprot_t prot)
> -{
> -	if (!ioremap_p4d_enabled())
> -		return 0;
> -
> -	if ((end - addr) != P4D_SIZE)
> -		return 0;
> -
> -	if (!IS_ALIGNED(phys_addr, P4D_SIZE))
> -		return 0;
> -
> -	if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
> -		return 0;
> -
> -	return p4d_set_huge(p4d, phys_addr, prot);
> -}
> -
> -static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr,
> -		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
> -{
> -	p4d_t *p4d;
> -	unsigned long next;
> -
> -	p4d = p4d_alloc(&init_mm, pgd, addr);
> -	if (!p4d)
> -		return -ENOMEM;
> -	do {
> -		next = p4d_addr_end(addr, end);
> -
> -		if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot))
> -			continue;
> -
> -		if (ioremap_pud_range(p4d, addr, next, phys_addr, prot))
> -			return -ENOMEM;
> -	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
> -	return 0;
> -}
> -
>   int ioremap_page_range(unsigned long addr,
>   		       unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
>   {
> -	pgd_t *pgd;
> -	unsigned long start;
> -	unsigned long next;
> -	int err;
> -
> -	might_sleep();
> -	BUG_ON(addr >= end);
> -
> -	start = addr;
> -	pgd = pgd_offset_k(addr);
> -	do {
> -		next = pgd_addr_end(addr, end);
> -		err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot);
> -		if (err)
> -			break;
> -	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
> -
> -	flush_cache_vmap(start, end);
> +	unsigned int max_page_shift = PAGE_SHIFT;
> +
> +	/*
> +	 * Due to the max_page_shift parameter to vmap_range, platforms must
> +	 * enable all smaller sizes to take advantage of a given size,
> +	 * otherwise fall back to small pages.
> +	 */
> +	if (ioremap_pmd_enabled()) {
> +		max_page_shift = PMD_SHIFT;
> +		if (ioremap_pud_enabled()) {
> +			max_page_shift = PUD_SHIFT;
> +			if (ioremap_p4d_enabled())
> +				max_page_shift = P4D_SHIFT;
> +		}
> +	}
>   
> -	return err;
> +	return vmap_range(addr, end, phys_addr, prot, max_page_shift);
>   }
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 233af6936c93..dd27cfb29b10 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -119,7 +119,7 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end)
>   	} while (p4d++, addr = next, addr != end);
>   }
>   
> -static void vunmap_page_range(unsigned long addr, unsigned long end)
> +static void vunmap_range(unsigned long addr, unsigned long end)
>   {
>   	pgd_t *pgd;
>   	unsigned long next;
> @@ -135,6 +135,198 @@ static void vunmap_page_range(unsigned long addr, unsigned long end)
>   }
>   
>   static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
> +			unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
> +{
> +	pte_t *pte;
> +	u64 pfn;
> +
> +	pfn = phys_addr >> PAGE_SHIFT;
> +	pte = pte_alloc_kernel(pmd, addr);
> +	if (!pte)
> +		return -ENOMEM;
> +	do {
> +		BUG_ON(!pte_none(*pte));
> +		set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
> +		pfn++;
> +	} while (pte++, addr += PAGE_SIZE, addr != end);
> +	return 0;
> +}
> +
> +static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr,
> +			unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
> +			unsigned int max_page_shift)
> +{
> +	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
> +		return 0;
> +
> +	if (max_page_shift < PMD_SHIFT)
> +		return 0;
> +
> +	if ((end - addr) != PMD_SIZE)
> +		return 0;
> +
> +	if (!IS_ALIGNED(phys_addr, PMD_SIZE))
> +		return 0;
> +
> +	if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
> +		return 0;
> +
> +	return pmd_set_huge(pmd, phys_addr, prot);
> +}
> +
> +static inline int vmap_pmd_range(pud_t *pud, unsigned long addr,
> +			unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
> +			unsigned int max_page_shift)
> +{
> +	pmd_t *pmd;
> +	unsigned long next;
> +
> +	pmd = pmd_alloc(&init_mm, pud, addr);
> +	if (!pmd)
> +		return -ENOMEM;
> +	do {
> +		next = pmd_addr_end(addr, end);
> +
> +		if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
> +					max_page_shift))
> +			continue;
> +
> +		if (vmap_pte_range(pmd, addr, next, phys_addr, prot))
> +			return -ENOMEM;
> +	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
> +	return 0;
> +}
> +
> +static int vmap_try_huge_pud(pud_t *pud, unsigned long addr,
> +			unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
> +			unsigned int max_page_shift)
> +{
> +	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
> +		return 0;
> +
> +	if (max_page_shift < PUD_SHIFT)
> +		return 0;
> +
> +	if ((end - addr) != PUD_SIZE)
> +		return 0;
> +
> +	if (!IS_ALIGNED(phys_addr, PUD_SIZE))
> +		return 0;
> +
> +	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
> +		return 0;
> +
> +	return pud_set_huge(pud, phys_addr, prot);
> +}
> +
> +static inline int vmap_pud_range(p4d_t *p4d, unsigned long addr,
> +			unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
> +			unsigned int max_page_shift)
> +{
> +	pud_t *pud;
> +	unsigned long next;
> +
> +	pud = pud_alloc(&init_mm, p4d, addr);
> +	if (!pud)
> +		return -ENOMEM;
> +	do {
> +		next = pud_addr_end(addr, end);
> +
> +		if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
> +					max_page_shift))
> +			continue;
> +
> +		if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
> +					max_page_shift))
> +			return -ENOMEM;
> +	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
> +	return 0;
> +}
> +
> +static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr,
> +			unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
> +			unsigned int max_page_shift)
> +{
> +	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
> +		return 0;
> +
> +	if (max_page_shift < P4D_SHIFT)
> +		return 0;
> +
> +	if ((end - addr) != P4D_SIZE)
> +		return 0;
> +
> +	if (!IS_ALIGNED(phys_addr, P4D_SIZE))
> +		return 0;
> +
> +	if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
> +		return 0;
> +
> +	return p4d_set_huge(p4d, phys_addr, prot);
> +}
> +
> +static inline int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
> +			unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
> +			unsigned int max_page_shift)
> +{
> +	p4d_t *p4d;
> +	unsigned long next;
> +
> +	p4d = p4d_alloc(&init_mm, pgd, addr);
> +	if (!p4d)
> +		return -ENOMEM;
> +	do {
> +		next = p4d_addr_end(addr, end);
> +
> +		if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
> +					max_page_shift))
> +			continue;
> +
> +		if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
> +					max_page_shift))
> +			return -ENOMEM;
> +	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
> +	return 0;
> +}
> +
> +static int vmap_range_noflush(unsigned long addr,
> +			unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
> +			unsigned int max_page_shift)
> +{
> +	pgd_t *pgd;
> +	unsigned long start;
> +	unsigned long next;
> +	int err;
> +
> +	might_sleep();
> +	BUG_ON(addr >= end);
> +
> +	start = addr;
> +	pgd = pgd_offset_k(addr);
> +	do {
> +		next = pgd_addr_end(addr, end);
> +		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
> +					max_page_shift);
> +		if (err)
> +			break;
> +	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
> +
> +	return err;
> +}
> +
> +int vmap_range(unsigned long addr,
> +		       unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
> +		       unsigned int max_page_shift)
> +{
> +	int ret;
> +
> +	ret = vmap_range_noflush(addr, end, phys_addr, prot, max_page_shift);
> +	flush_cache_vmap(addr, end);
> +
> +	return ret;
> +}
> +
> +static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
>   		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
>   {
>   	pte_t *pte;
> @@ -160,7 +352,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
>   	return 0;
>   }
>   
> -static int vmap_pmd_range(pud_t *pud, unsigned long addr,
> +static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
>   		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
>   {
>   	pmd_t *pmd;
> @@ -171,13 +363,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr,
>   		return -ENOMEM;
>   	do {
>   		next = pmd_addr_end(addr, end);
> -		if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
> +		if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr))
>   			return -ENOMEM;
>   	} while (pmd++, addr = next, addr != end);
>   	return 0;
>   }
>   
> -static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
> +static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
>   		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
>   {
>   	pud_t *pud;
> @@ -188,13 +380,13 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
>   		return -ENOMEM;
>   	do {
>   		next = pud_addr_end(addr, end);
> -		if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
> +		if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr))
>   			return -ENOMEM;
>   	} while (pud++, addr = next, addr != end);
>   	return 0;
>   }
>   
> -static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
> +static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
>   		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
>   {
>   	p4d_t *p4d;
> @@ -205,7 +397,7 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
>   		return -ENOMEM;
>   	do {
>   		next = p4d_addr_end(addr, end);
> -		if (vmap_pud_range(p4d, addr, next, prot, pages, nr))
> +		if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr))
>   			return -ENOMEM;
>   	} while (p4d++, addr = next, addr != end);
>   	return 0;
> @@ -217,7 +409,7 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
>    *
>    * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
>    */
> -static int vmap_page_range_noflush(unsigned long start, unsigned long end,
> +static int vmap_pages_range_noflush(unsigned long start, unsigned long end,
>   				   pgprot_t prot, struct page **pages)
>   {
>   	pgd_t *pgd;
> @@ -230,7 +422,7 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end,
>   	pgd = pgd_offset_k(addr);
>   	do {
>   		next = pgd_addr_end(addr, end);
> -		err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr);
> +		err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr);
>   		if (err)
>   			return err;
>   	} while (pgd++, addr = next, addr != end);
> @@ -238,12 +430,12 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end,
>   	return nr;
>   }
>   
> -static int vmap_page_range(unsigned long start, unsigned long end,
> +static int vmap_pages_range(unsigned long start, unsigned long end,
>   			   pgprot_t prot, struct page **pages)
>   {
>   	int ret;
>   
> -	ret = vmap_page_range_noflush(start, end, prot, pages);
> +	ret = vmap_pages_range_noflush(start, end, prot, pages);
>   	flush_cache_vmap(start, end);
>   	return ret;
>   }
> @@ -1148,7 +1340,7 @@ static void free_vmap_area(struct vmap_area *va)
>    */
>   static void unmap_vmap_area(struct vmap_area *va)
>   {
> -	vunmap_page_range(va->va_start, va->va_end);
> +	vunmap_range(va->va_start, va->va_end);
>   }
>   
>   /*
> @@ -1586,7 +1778,7 @@ static void vb_free(const void *addr, unsigned long size)
>   	rcu_read_unlock();
>   	BUG_ON(!vb);
>   
> -	vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
> +	vunmap_range((unsigned long)addr, (unsigned long)addr + size);
>   
>   	if (debug_pagealloc_enabled())
>   		flush_tlb_kernel_range((unsigned long)addr,
> @@ -1736,7 +1928,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
>   		addr = va->va_start;
>   		mem = (void *)addr;
>   	}
> -	if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
> +	if (vmap_pages_range(addr, addr + size, prot, pages) < 0) {
>   		vm_unmap_ram(mem, count);
>   		return NULL;
>   	}
> @@ -1903,7 +2095,7 @@ void __init vmalloc_init(void)
>   int map_kernel_range_noflush(unsigned long addr, unsigned long size,
>   			     pgprot_t prot, struct page **pages)
>   {
> -	return vmap_page_range_noflush(addr, addr + size, prot, pages);
> +	return vmap_pages_range_noflush(addr, addr + size, prot, pages);
>   }
>   
>   /**
> @@ -1922,7 +2114,7 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size,
>    */
>   void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
>   {
> -	vunmap_page_range(addr, addr + size);
> +	vunmap_range(addr, addr + size);
>   }
>   EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
>   
> @@ -1939,7 +2131,7 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
>   	unsigned long end = addr + size;
>   
>   	flush_cache_vunmap(addr, end);
> -	vunmap_page_range(addr, end);
> +	vunmap_range(addr, end);
>   	flush_tlb_kernel_range(addr, end);
>   }
>   EXPORT_SYMBOL_GPL(unmap_kernel_range);
> @@ -1950,7 +2142,7 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
>   	unsigned long end = addr + get_vm_area_size(area);
>   	int err;
>   
> -	err = vmap_page_range(addr, end, prot, pages);
> +	err = vmap_pages_range(addr, end, prot, pages);
>   
>   	return err > 0 ? 0 : err;
>   }
> 

^ permalink raw reply

* Re: [PATCH v3 3/3] powerpc: Add support to initialize ima policy rules
From: Satheesh Rajendran @ 2019-06-11  5:19 UTC (permalink / raw)
  To: Nayna Jain
  Cc: linux-efi, Ard Biesheuvel, linux-kernel, Mimi Zohar,
	Claudio Carvalho, Matthew Garret, linuxppc-dev, Paul Mackerras,
	Jeremy Kerr, linux-integrity
In-Reply-To: <1560198837-18857-4-git-send-email-nayna@linux.ibm.com>

On Mon, Jun 10, 2019 at 04:33:57PM -0400, Nayna Jain wrote:
> PowerNV secure boot relies on the kernel IMA security subsystem to
> perform the OS kernel image signature verification. Since each secure
> boot mode has different IMA policy requirements, dynamic definition of
> the policy rules based on the runtime secure boot mode of the system is
> required. On systems that support secure boot, but have it disabled,
> only measurement policy rules of the kernel image and modules are
> defined.
> 
> This patch defines the arch-specific implementation to retrieve the
> secure boot mode of the system and accordingly configures the IMA policy
> rules.
> 
> This patch provides arch-specific IMA policies if PPC_SECURE_BOOT
> config is enabled.
> 
> Signed-off-by: Nayna Jain <nayna@linux.ibm.com>
> ---
>  arch/powerpc/Kconfig           | 14 +++++++++
>  arch/powerpc/kernel/Makefile   |  1 +
>  arch/powerpc/kernel/ima_arch.c | 54 ++++++++++++++++++++++++++++++++++
>  include/linux/ima.h            |  3 +-
>  4 files changed, 71 insertions(+), 1 deletion(-)
>  create mode 100644 arch/powerpc/kernel/ima_arch.c

Hi,

This series failed to build against linuxppc/merge tree with `ppc64le_defconfig`,

arch/powerpc/platforms/powernv/secboot.c:14:6: error: redefinition of 'get_powerpc_sb_mode'
   14 | bool get_powerpc_sb_mode(void)
      |      ^~~~~~~~~~~~~~~~~~~
In file included from arch/powerpc/platforms/powernv/secboot.c:11:
./arch/powerpc/include/asm/secboot.h:15:20: note: previous definition of 'get_powerpc_sb_mode' was here
   15 | static inline bool get_powerpc_sb_mode(void)
      |                    ^~~~~~~~~~~~~~~~~~~
make[3]: *** [scripts/Makefile.build:278: arch/powerpc/platforms/powernv/secboot.o] Error 1
make[3]: *** Waiting for unfinished jobs....
make[2]: *** [scripts/Makefile.build:489: arch/powerpc/platforms/powernv] Error 2
make[1]: *** [scripts/Makefile.build:489: arch/powerpc/platforms] Error 2
make: *** [Makefile:1071: arch/powerpc] Error 2
make: *** Waiting for unfinished jobs....

Regards,
-Satheesh

> 
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 8c1c636308c8..9de77bb14f54 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -902,6 +902,20 @@ config PPC_MEM_KEYS
> 
>  	  If unsure, say y.
> 
> +config PPC_SECURE_BOOT
> +	prompt "Enable PowerPC Secure Boot"
> +	bool
> +	default n
> +	depends on PPC64
> +	depends on OPAL_SECVAR
> +	depends on IMA
> +	depends on IMA_ARCH_POLICY
> +	help
> +	  Linux on POWER with firmware secure boot enabled needs to define
> +	  security policies to extend secure boot to the OS.This config
> +	  allows user to enable OS Secure Boot on PowerPC systems that
> +	  have firmware secure boot support.
> +
>  endmenu
> 
>  config ISA_DMA_API
> diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
> index 0ea6c4aa3a20..75c929b41341 100644
> --- a/arch/powerpc/kernel/Makefile
> +++ b/arch/powerpc/kernel/Makefile
> @@ -131,6 +131,7 @@ ifdef CONFIG_IMA
>  obj-y				+= ima_kexec.o
>  endif
>  endif
> +obj-$(CONFIG_PPC_SECURE_BOOT)	+= ima_arch.o
> 
>  obj-$(CONFIG_AUDIT)		+= audit.o
>  obj64-$(CONFIG_AUDIT)		+= compat_audit.o
> diff --git a/arch/powerpc/kernel/ima_arch.c b/arch/powerpc/kernel/ima_arch.c
> new file mode 100644
> index 000000000000..1767bf6e6550
> --- /dev/null
> +++ b/arch/powerpc/kernel/ima_arch.c
> @@ -0,0 +1,54 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2019 IBM Corporation
> + * Author: Nayna Jain <nayna@linux.ibm.com>
> + *
> + * ima_arch.c
> + *      - initialize ima policies for PowerPC Secure Boot
> + */
> +
> +#include <linux/ima.h>
> +#include <asm/secboot.h>
> +
> +bool arch_ima_get_secureboot(void)
> +{
> +	bool sb_mode;
> +
> +	sb_mode = get_powerpc_sb_mode();
> +	if (sb_mode)
> +		return true;
> +	else
> +		return false;
> +}
> +
> +/*
> + * File signature verification is not needed, include only measurements
> + */
> +static const char *const default_arch_rules[] = {
> +	"measure func=KEXEC_KERNEL_CHECK template=ima-modsig",
> +	"measure func=MODULE_CHECK template=ima-modsig",
> +	NULL
> +};
> +
> +/* Both file signature verification and measurements are needed */
> +static const char *const sb_arch_rules[] = {
> +	"measure func=KEXEC_KERNEL_CHECK template=ima-modsig",
> +	"measure func=MODULE_CHECK template=ima-modsig",
> +	"appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig|modsig template=ima-modsig",
> +#if !IS_ENABLED(CONFIG_MODULE_SIG)
> +	"appraise func=MODULE_CHECK appraise_type=imasig|modsig template=ima-modsig",
> +#endif
> +	NULL
> +};
> +
> +/*
> + * On PowerPC, file measurements are to be added to the IMA measurement list
> + * irrespective of the secure boot state of the system. Signature verification
> + * is conditionally enabled based on the secure boot state.
> + */
> +const char *const *arch_get_ima_policy(void)
> +{
> +	if (IS_ENABLED(CONFIG_IMA_ARCH_POLICY) && arch_ima_get_secureboot())
> +		return sb_arch_rules;
> +	return default_arch_rules;
> +}
> diff --git a/include/linux/ima.h b/include/linux/ima.h
> index fd9f7cf4cdf5..a01df076ecae 100644
> --- a/include/linux/ima.h
> +++ b/include/linux/ima.h
> @@ -31,7 +31,8 @@ extern void ima_post_path_mknod(struct dentry *dentry);
>  extern void ima_add_kexec_buffer(struct kimage *image);
>  #endif
> 
> -#if (defined(CONFIG_X86) && defined(CONFIG_EFI)) || defined(CONFIG_S390)
> +#if (defined(CONFIG_X86) && defined(CONFIG_EFI)) || defined(CONFIG_S390) \
> +	|| defined(CONFIG_PPC_SECURE_BOOT)
>  extern bool arch_ima_get_secureboot(void);
>  extern const char * const *arch_get_ima_policy(void);
>  #else
> -- 
> 2.20.1
> 


^ permalink raw reply

* Re: [RFC V3] mm: Generalize and rename notify_page_fault() as kprobe_page_fault()
From: Anshuman Khandual @ 2019-06-11  5:15 UTC (permalink / raw)
  To: Christophe Leroy, linux-kernel, linux-mm
  Cc: Mark Rutland, Michal Hocko, linux-ia64, linux-sh, Peter Zijlstra,
	Catalin Marinas, Dave Hansen, Heiko Carstens, Paul Mackerras,
	Matthew Wilcox, sparclinux, linux-s390, Yoshinori Sato, x86,
	Russell King, Will Deacon, Ingo Molnar, Fenghua Yu,
	Stephen Rothwell, Andrey Konovalov, Andy Lutomirski,
	Thomas Gleixner, linux-arm-kernel, Tony Luck, Martin Schwidefsky,
	Andrew Morton, linuxppc-dev, David S. Miller
In-Reply-To: <f6d295c8-574d-3e64-79ae-2f7d3ff4c9f0@c-s.fr>



On 06/11/2019 10:16 AM, Christophe Leroy wrote:
> 
> 
> Le 10/06/2019 à 04:39, Anshuman Khandual a écrit :
>>
>>
>> On 06/07/2019 09:01 PM, Christophe Leroy wrote:
>>>
>>>
>>> Le 07/06/2019 à 12:34, Anshuman Khandual a écrit :
>>>> Very similar definitions for notify_page_fault() are being used by multiple
>>>> architectures duplicating much of the same code. This attempts to unify all
>>>> of them into a generic implementation, rename it as kprobe_page_fault() and
>>>> then move it to a common header.
>>>>
>>>> kprobes_built_in() can detect CONFIG_KPROBES, hence new kprobe_page_fault()
>>>> need not be wrapped again within CONFIG_KPROBES. Trap number argument can
>>>> now contain up to an 'unsigned int' accommodating all possible platforms.
>>>>
>>>> kprobe_page_fault() goes the x86 way while dealing with preemption context.
>>>> As explained in these following commits the invoking context in itself must
>>>> be non-preemptible for kprobes processing context irrespective of whether
>>>> kprobe_running() or perhaps smp_processor_id() is safe or not. It does not
>>>> make much sense to continue when original context is preemptible. Instead
>>>> just bail out earlier.
>>>>
>>>> commit a980c0ef9f6d
>>>> ("x86/kprobes: Refactor kprobes_fault() like kprobe_exceptions_notify()")
>>>>
>>>> commit b506a9d08bae ("x86: code clarification patch to Kprobes arch code")
>>>>
>>>> Cc: linux-arm-kernel@lists.infradead.org
>>>> Cc: linux-ia64@vger.kernel.org
>>>> Cc: linuxppc-dev@lists.ozlabs.org
>>>> Cc: linux-s390@vger.kernel.org
>>>> Cc: linux-sh@vger.kernel.org
>>>> Cc: sparclinux@vger.kernel.org
>>>> Cc: x86@kernel.org
>>>> Cc: Andrew Morton <akpm@linux-foundation.org>
>>>> Cc: Michal Hocko <mhocko@suse.com>
>>>> Cc: Matthew Wilcox <willy@infradead.org>
>>>> Cc: Mark Rutland <mark.rutland@arm.com>
>>>> Cc: Christophe Leroy <christophe.leroy@c-s.fr>
>>>> Cc: Stephen Rothwell <sfr@canb.auug.org.au>
>>>> Cc: Andrey Konovalov <andreyknvl@google.com>
>>>> Cc: Michael Ellerman <mpe@ellerman.id.au>
>>>> Cc: Paul Mackerras <paulus@samba.org>
>>>> Cc: Russell King <linux@armlinux.org.uk>
>>>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>>>> Cc: Will Deacon <will.deacon@arm.com>
>>>> Cc: Tony Luck <tony.luck@intel.com>
>>>> Cc: Fenghua Yu <fenghua.yu@intel.com>
>>>> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
>>>> Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
>>>> Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
>>>> Cc: "David S. Miller" <davem@davemloft.net>
>>>> Cc: Thomas Gleixner <tglx@linutronix.de>
>>>> Cc: Peter Zijlstra <peterz@infradead.org>
>>>> Cc: Ingo Molnar <mingo@redhat.com>
>>>> Cc: Andy Lutomirski <luto@kernel.org>
>>>> Cc: Dave Hansen <dave.hansen@linux.intel.com>
>>>>
>>>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
>>>> ---
>>>> Testing:
>>>>
>>>> - Build and boot tested on arm64 and x86
>>>> - Build tested on some other archs (arm, sparc64, alpha, powerpc etc)
>>>>
>>>> Changes in RFC V3:
>>>>
>>>> - Updated the commit message with an explanation for new preemption behaviour
>>>> - Moved notify_page_fault() to kprobes.h with 'static nokprobe_inline' per Matthew
>>>> - Changed notify_page_fault() return type from int to bool per Michael Ellerman
>>>> - Renamed notify_page_fault() as kprobe_page_fault() per Peterz
>>>>
>>>> Changes in RFC V2: (https://patchwork.kernel.org/patch/10974221/)
>>>>
>>>> - Changed generic notify_page_fault() per Matthew Wilcox
>>>> - Changed x86 to use new generic notify_page_fault()
>>>> - s/must not/need not/ in commit message per Matthew Wilcox
>>>>
>>>> Changes in RFC V1: (https://patchwork.kernel.org/patch/10968273/)
>>>>
>>>>    arch/arm/mm/fault.c      | 24 +-----------------------
>>>>    arch/arm64/mm/fault.c    | 24 +-----------------------
>>>>    arch/ia64/mm/fault.c     | 24 +-----------------------
>>>>    arch/powerpc/mm/fault.c  | 23 ++---------------------
>>>>    arch/s390/mm/fault.c     | 16 +---------------
>>>>    arch/sh/mm/fault.c       | 18 ++----------------
>>>>    arch/sparc/mm/fault_64.c | 16 +---------------
>>>>    arch/x86/mm/fault.c      | 21 ++-------------------
>>>>    include/linux/kprobes.h  | 16 ++++++++++++++++
>>>>    9 files changed, 27 insertions(+), 155 deletions(-)
>>>>
>>>
>>> [...]
>>>
>>>> diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
>>>> index 443d980..064dd15 100644
>>>> --- a/include/linux/kprobes.h
>>>> +++ b/include/linux/kprobes.h
>>>> @@ -458,4 +458,20 @@ static inline bool is_kprobe_optinsn_slot(unsigned long addr)
>>>>    }
>>>>    #endif
>>>>    +static nokprobe_inline bool kprobe_page_fault(struct pt_regs *regs,
>>>> +                          unsigned int trap)
>>>> +{
>>>> +    int ret = 0;
>>>
>>> ret is pointless.
>>>
>>>> +
>>>> +    /*
>>>> +     * To be potentially processing a kprobe fault and to be allowed
>>>> +     * to call kprobe_running(), we have to be non-preemptible.
>>>> +     */
>>>> +    if (kprobes_built_in() && !preemptible() && !user_mode(regs)) {
>>>> +        if (kprobe_running() && kprobe_fault_handler(regs, trap))
>>>
>>> don't need an 'if A if B', can do 'if A && B'
>>
>> Which will make it a very lengthy condition check.
> 
> Yes. But is that a problem at all ?

Probably not.

> 
> For me the following would be easier to read.
> 
> if (kprobes_built_in() && !preemptible() && !user_mode(regs) &&
>     kprobe_running() && kprobe_fault_handler(regs, trap))
>     ret = 1;

As mentioned before will stick with current x86 implementation. 

^ permalink raw reply

* Re: [RFC V3] mm: Generalize and rename notify_page_fault() as kprobe_page_fault()
From: Anshuman Khandual @ 2019-06-11  5:14 UTC (permalink / raw)
  To: Leonardo Bras, Christophe Leroy, linux-kernel, linux-mm
  Cc: Mark Rutland, Michal Hocko, linux-ia64, linux-sh, Peter Zijlstra,
	Catalin Marinas, Dave Hansen, Heiko Carstens, Paul Mackerras,
	sparclinux, linux-s390, Yoshinori Sato, x86, Russell King,
	Matthew Wilcox, Ingo Molnar, Andrey Konovalov, Fenghua Yu,
	Stephen Rothwell, Will Deacon, Andy Lutomirski, Thomas Gleixner,
	linux-arm-kernel, Tony Luck, Martin Schwidefsky, Andrew Morton,
	linuxppc-dev, David S. Miller
In-Reply-To: <8dd6168592437378ff4a7c204e0f2962d002b44f.camel@linux.ibm.com>



On 06/10/2019 08:57 PM, Leonardo Bras wrote:
> On Mon, 2019-06-10 at 08:09 +0530, Anshuman Khandual wrote:
>>>> +    /*
>>>> +     * To be potentially processing a kprobe fault and to be allowed
>>>> +     * to call kprobe_running(), we have to be non-preemptible.
>>>> +     */
>>>> +    if (kprobes_built_in() && !preemptible() && !user_mode(regs)) {
>>>> +        if (kprobe_running() && kprobe_fault_handler(regs, trap))
>>>
>>> don't need an 'if A if B', can do 'if A && B'
>>
>> Which will make it a very lengthy condition check.
> 
> Well, is there any problem line-breaking the if condition?
> 
> if (A && B && C &&
>     D && E )
> 
> Also, if it's used only to decide the return value, maybe it would be fine
> to do something like that:
> 
> return (A && B && C &&
>         D && E ); 

Got it. But as Dave and Matthew had pointed out earlier, the current x86
implementation has better readability. Hence will probably stick with it.

^ permalink raw reply

* [PATCH kernel v2] powerpc/powernv/ioda: Fix race in TCE level allocation
From: Alexey Kardashevskiy @ 2019-06-11  5:08 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Jose Ricardo Ziviani, Alexey Kardashevskiy, Alistair Popple,
	Daniel Henrique Barboza, kvm-ppc, Sam Bobroff, Paul Mackerras,
	stable, Oliver O'Halloran, Reza Arbab, David Gibson

pnv_tce() returns a pointer to a TCE entry and originally a TCE table
would be pre-allocated. For the default case of 2GB window the table
needs only a single level and that is fine. However if more levels are
requested, it is possible to get a race when 2 threads want a pointer
to a TCE entry from the same page of TCEs.

This adds cmpxchg to handle the race. Note that once a TCE is non-zero,
it cannot become zero again.

CC: stable@vger.kernel.org # v4.19+
Fixes: a68bd1267b72 ("powerpc/powernv/ioda: Allocate indirect TCE levels on demand")
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---

The race occurs about 30 times in the first 3 minutes of copying files
via rsync and that's about it.

This fixes EEH's from
https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=110810

---
Changes:
v2:
* replaced spin_lock with cmpxchg+readonce
---
 arch/powerpc/platforms/powernv/pci-ioda-tce.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index e28f03e1eb5e..8d6569590161 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -48,6 +48,9 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
 	return addr;
 }
 
+static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
+		unsigned long size, unsigned int levels);
+
 static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
 {
 	__be64 *tmp = user ? tbl->it_userspace : (__be64 *) tbl->it_base;
@@ -57,9 +60,9 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
 
 	while (level) {
 		int n = (idx & mask) >> (level * shift);
-		unsigned long tce;
+		unsigned long oldtce, tce = be64_to_cpu(READ_ONCE(tmp[n]));
 
-		if (tmp[n] == 0) {
+		if (!tce) {
 			__be64 *tmp2;
 
 			if (!alloc)
@@ -70,10 +73,15 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
 			if (!tmp2)
 				return NULL;
 
-			tmp[n] = cpu_to_be64(__pa(tmp2) |
-					TCE_PCI_READ | TCE_PCI_WRITE);
+			tce = __pa(tmp2) | TCE_PCI_READ | TCE_PCI_WRITE;
+			oldtce = be64_to_cpu(cmpxchg(&tmp[n], 0,
+					cpu_to_be64(tce)));
+			if (oldtce) {
+				pnv_pci_ioda2_table_do_free_pages(tmp2,
+					ilog2(tbl->it_level_size) + 3, 1);
+				tce = oldtce;
+			}
 		}
-		tce = be64_to_cpu(tmp[n]);
 
 		tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE));
 		idx &= ~mask;
-- 
2.17.1


^ permalink raw reply related

* Re: [RFC V3] mm: Generalize and rename notify_page_fault() as kprobe_page_fault()
From: Christophe Leroy @ 2019-06-11  4:46 UTC (permalink / raw)
  To: Anshuman Khandual, linux-kernel, linux-mm
  Cc: Mark Rutland, Michal Hocko, linux-ia64, linux-sh, Peter Zijlstra,
	Catalin Marinas, Dave Hansen, Heiko Carstens, Paul Mackerras,
	Matthew Wilcox, sparclinux, linux-s390, Yoshinori Sato, x86,
	Russell King, Will Deacon, Ingo Molnar, Fenghua Yu,
	Stephen Rothwell, Andrey Konovalov, Andy Lutomirski,
	Thomas Gleixner, linux-arm-kernel, Tony Luck, Martin Schwidefsky,
	Andrew Morton, linuxppc-dev, David S. Miller
In-Reply-To: <97e9c9b3-89c8-d378-4730-841a900e6800@arm.com>



Le 10/06/2019 à 04:39, Anshuman Khandual a écrit :
> 
> 
> On 06/07/2019 09:01 PM, Christophe Leroy wrote:
>>
>>
>> Le 07/06/2019 à 12:34, Anshuman Khandual a écrit :
>>> Very similar definitions for notify_page_fault() are being used by multiple
>>> architectures duplicating much of the same code. This attempts to unify all
>>> of them into a generic implementation, rename it as kprobe_page_fault() and
>>> then move it to a common header.
>>>
>>> kprobes_built_in() can detect CONFIG_KPROBES, hence new kprobe_page_fault()
>>> need not be wrapped again within CONFIG_KPROBES. Trap number argument can
>>> now contain up to an 'unsigned int' accommodating all possible platforms.
>>>
>>> kprobe_page_fault() goes the x86 way while dealing with preemption context.
>>> As explained in these following commits the invoking context in itself must
>>> be non-preemptible for kprobes processing context irrespective of whether
>>> kprobe_running() or perhaps smp_processor_id() is safe or not. It does not
>>> make much sense to continue when original context is preemptible. Instead
>>> just bail out earlier.
>>>
>>> commit a980c0ef9f6d
>>> ("x86/kprobes: Refactor kprobes_fault() like kprobe_exceptions_notify()")
>>>
>>> commit b506a9d08bae ("x86: code clarification patch to Kprobes arch code")
>>>
>>> Cc: linux-arm-kernel@lists.infradead.org
>>> Cc: linux-ia64@vger.kernel.org
>>> Cc: linuxppc-dev@lists.ozlabs.org
>>> Cc: linux-s390@vger.kernel.org
>>> Cc: linux-sh@vger.kernel.org
>>> Cc: sparclinux@vger.kernel.org
>>> Cc: x86@kernel.org
>>> Cc: Andrew Morton <akpm@linux-foundation.org>
>>> Cc: Michal Hocko <mhocko@suse.com>
>>> Cc: Matthew Wilcox <willy@infradead.org>
>>> Cc: Mark Rutland <mark.rutland@arm.com>
>>> Cc: Christophe Leroy <christophe.leroy@c-s.fr>
>>> Cc: Stephen Rothwell <sfr@canb.auug.org.au>
>>> Cc: Andrey Konovalov <andreyknvl@google.com>
>>> Cc: Michael Ellerman <mpe@ellerman.id.au>
>>> Cc: Paul Mackerras <paulus@samba.org>
>>> Cc: Russell King <linux@armlinux.org.uk>
>>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>>> Cc: Will Deacon <will.deacon@arm.com>
>>> Cc: Tony Luck <tony.luck@intel.com>
>>> Cc: Fenghua Yu <fenghua.yu@intel.com>
>>> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
>>> Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
>>> Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
>>> Cc: "David S. Miller" <davem@davemloft.net>
>>> Cc: Thomas Gleixner <tglx@linutronix.de>
>>> Cc: Peter Zijlstra <peterz@infradead.org>
>>> Cc: Ingo Molnar <mingo@redhat.com>
>>> Cc: Andy Lutomirski <luto@kernel.org>
>>> Cc: Dave Hansen <dave.hansen@linux.intel.com>
>>>
>>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
>>> ---
>>> Testing:
>>>
>>> - Build and boot tested on arm64 and x86
>>> - Build tested on some other archs (arm, sparc64, alpha, powerpc etc)
>>>
>>> Changes in RFC V3:
>>>
>>> - Updated the commit message with an explanation for new preemption behaviour
>>> - Moved notify_page_fault() to kprobes.h with 'static nokprobe_inline' per Matthew
>>> - Changed notify_page_fault() return type from int to bool per Michael Ellerman
>>> - Renamed notify_page_fault() as kprobe_page_fault() per Peterz
>>>
>>> Changes in RFC V2: (https://patchwork.kernel.org/patch/10974221/)
>>>
>>> - Changed generic notify_page_fault() per Matthew Wilcox
>>> - Changed x86 to use new generic notify_page_fault()
>>> - s/must not/need not/ in commit message per Matthew Wilcox
>>>
>>> Changes in RFC V1: (https://patchwork.kernel.org/patch/10968273/)
>>>
>>>    arch/arm/mm/fault.c      | 24 +-----------------------
>>>    arch/arm64/mm/fault.c    | 24 +-----------------------
>>>    arch/ia64/mm/fault.c     | 24 +-----------------------
>>>    arch/powerpc/mm/fault.c  | 23 ++---------------------
>>>    arch/s390/mm/fault.c     | 16 +---------------
>>>    arch/sh/mm/fault.c       | 18 ++----------------
>>>    arch/sparc/mm/fault_64.c | 16 +---------------
>>>    arch/x86/mm/fault.c      | 21 ++-------------------
>>>    include/linux/kprobes.h  | 16 ++++++++++++++++
>>>    9 files changed, 27 insertions(+), 155 deletions(-)
>>>
>>
>> [...]
>>
>>> diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
>>> index 443d980..064dd15 100644
>>> --- a/include/linux/kprobes.h
>>> +++ b/include/linux/kprobes.h
>>> @@ -458,4 +458,20 @@ static inline bool is_kprobe_optinsn_slot(unsigned long addr)
>>>    }
>>>    #endif
>>>    +static nokprobe_inline bool kprobe_page_fault(struct pt_regs *regs,
>>> +                          unsigned int trap)
>>> +{
>>> +    int ret = 0;
>>
>> ret is pointless.
>>
>>> +
>>> +    /*
>>> +     * To be potentially processing a kprobe fault and to be allowed
>>> +     * to call kprobe_running(), we have to be non-preemptible.
>>> +     */
>>> +    if (kprobes_built_in() && !preemptible() && !user_mode(regs)) {
>>> +        if (kprobe_running() && kprobe_fault_handler(regs, trap))
>>
>> don't need an 'if A if B', can do 'if A && B'
> 
> Which will make it a very lengthy condition check.

Yes. But is that a problem at all ?

For me the following would be easier to read.

if (kprobes_built_in() && !preemptible() && !user_mode(regs) &&
     kprobe_running() && kprobe_fault_handler(regs, trap))
	ret = 1;

Christophe

> 
>>
>>> +            ret = 1;
>>
>> can do 'return true;' directly here
>>
>>> +    }
>>> +    return ret;
>>
>> And 'return false' here.
> 
> Makes sense, will drop ret.
> 

^ permalink raw reply

* Re: [PATCH kernel] powerpc/powernv/ioda: Fix race in TCE level allocation
From: Alexey Kardashevskiy @ 2019-06-11  4:29 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Jose Ricardo Ziviani, Sam Bobroff, Alistair Popple,
	Daniel Henrique Barboza, kvm-ppc, Paul Mackerras, stable,
	Oliver O'Halloran, David Gibson
In-Reply-To: <20190611023103.86977-1-aik@ozlabs.ru>

Please ignore this (causes lockdep warnings), v2 is coming.


On 11/06/2019 12:31, Alexey Kardashevskiy wrote:
> pnv_tce() returns a pointer to a TCE entry and originally a TCE table
> would be pre-allocated. For the default case of 2GB window the table
> needs only a single level and that is fine. However if more levels are
> requested, it is possible to get a race when 2 threads want a pointer
> to a TCE entry from the same page of TCEs.
> 
> This adds a spinlock to handle the race. The alloc==true case is not
> possible in the real mode so spinlock is safe for KVM as well.
> 
> CC: stable@vger.kernel.org # v4.19+
> Fixes: a68bd1267b72 ("powerpc/powernv/ioda: Allocate indirect TCE levels on demand")
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> 
> This fixes EEH's from
> https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=110810
> 
> 
> ---
>  arch/powerpc/include/asm/iommu.h              |  1 +
>  arch/powerpc/platforms/powernv/pci-ioda-tce.c | 21 ++++++++++++-------
>  2 files changed, 14 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index 2c1845e5e851..1825b4cc0097 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -111,6 +111,7 @@ struct iommu_table {
>  	struct iommu_table_ops *it_ops;
>  	struct kref    it_kref;
>  	int it_nid;
> +	spinlock_t it_lock;
>  };
>  
>  #define IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry) \
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
> index e28f03e1eb5e..9a19d61e2b12 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
> @@ -29,6 +29,7 @@ void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
>  	tbl->it_size = tce_size >> 3;
>  	tbl->it_busno = 0;
>  	tbl->it_type = TCE_PCI;
> +	spin_lock_init(&tbl->it_lock);
>  }
>  
>  static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
> @@ -60,18 +61,22 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
>  		unsigned long tce;
>  
>  		if (tmp[n] == 0) {
> -			__be64 *tmp2;
> -
>  			if (!alloc)
>  				return NULL;
>  
> -			tmp2 = pnv_alloc_tce_level(tbl->it_nid,
> -					ilog2(tbl->it_level_size) + 3);
> -			if (!tmp2)
> -				return NULL;
> +			spin_lock(&tbl->it_lock);
> +			if (tmp[n] == 0) {
> +				__be64 *tmp2;
>  
> -			tmp[n] = cpu_to_be64(__pa(tmp2) |
> -					TCE_PCI_READ | TCE_PCI_WRITE);
> +				tmp2 = pnv_alloc_tce_level(tbl->it_nid,
> +						ilog2(tbl->it_level_size) + 3);
> +				if (tmp2)
> +					tmp[n] = cpu_to_be64(__pa(tmp2) |
> +						TCE_PCI_READ | TCE_PCI_WRITE);
> +			}
> +			spin_unlock(&tbl->it_lock);
> +			if (tmp[n] == 0)
> +				return NULL;
>  		}
>  		tce = be64_to_cpu(tmp[n]);
>  
> 

-- 
Alexey

^ permalink raw reply

* [PATCH RESEND] Powerpc/Watchpoint: Restore nvgprs while returning from exception
From: Ravi Bangoria @ 2019-06-11  3:34 UTC (permalink / raw)
  To: mpe
  Cc: mikey, linux-kernel, npiggin, paulus, mahesh, naveen.n.rao,
	linuxppc-dev
In-Reply-To: <a2696037-539c-2f37-3b2f-7288a58fbfe7@linux.ibm.com>

Powerpc hw triggers a watchpoint before executing the instruction. To
provide trigger-after-execute behavior, the kernel emulates the instruction.
If the instruction is 'load something into non-volatile register', the
exception handler should restore the emulated register state while
returning, otherwise there will be register state corruption.
E.g., adding a watchpoint on a list can corrupt the list:

  # cat /proc/kallsyms | grep kthread_create_list
  c00000000121c8b8 d kthread_create_list

Add watchpoint on kthread_create_list->prev:

  # perf record -e mem:0xc00000000121c8c0

Run some workload such that new kthread gets invoked. Ex, I just
logged out from console:

  list_add corruption. next->prev should be prev (c000000001214e00), \
	but was c00000000121c8b8. (next=c00000000121c8b8).
  WARNING: CPU: 59 PID: 309 at lib/list_debug.c:25 __list_add_valid+0xb4/0xc0
  CPU: 59 PID: 309 Comm: kworker/59:0 Kdump: loaded Not tainted 5.1.0-rc7+ #69
  ...
  NIP __list_add_valid+0xb4/0xc0
  LR __list_add_valid+0xb0/0xc0
  Call Trace:
  __list_add_valid+0xb0/0xc0 (unreliable)
  __kthread_create_on_node+0xe0/0x260
  kthread_create_on_node+0x34/0x50
  create_worker+0xe8/0x260
  worker_thread+0x444/0x560
  kthread+0x160/0x1a0
  ret_from_kernel_thread+0x5c/0x70

List corruption happened because it uses 'load into non-volatile
register' instruction:

Snippet from __kthread_create_on_node:

  c000000000136be8:     addis   r29,r2,-19
  c000000000136bec:     ld      r29,31424(r29)
        if (!__list_add_valid(new, prev, next))
  c000000000136bf0:     mr      r3,r30
  c000000000136bf4:     mr      r5,r28
  c000000000136bf8:     mr      r4,r29
  c000000000136bfc:     bl      c00000000059a2f8 <__list_add_valid+0x8>

Register state from WARN_ON():

  GPR00: c00000000059a3a0 c000007ff23afb50 c000000001344e00 0000000000000075
  GPR04: 0000000000000000 0000000000000000 0000001852af8bc1 0000000000000000
  GPR08: 0000000000000001 0000000000000007 0000000000000006 00000000000004aa
  GPR12: 0000000000000000 c000007ffffeb080 c000000000137038 c000005ff62aaa00
  GPR16: 0000000000000000 0000000000000000 c000007fffbe7600 c000007fffbe7370
  GPR20: c000007fffbe7320 c000007fffbe7300 c000000001373a00 0000000000000000
  GPR24: fffffffffffffef7 c00000000012e320 c000007ff23afcb0 c000000000cb8628
  GPR28: c00000000121c8b8 c000000001214e00 c000007fef5b17e8 c000007fef5b17c0

Watchpoint hit at 0xc000000000136bec.

  addis   r29,r2,-19
   => r29 = 0xc000000001344e00 + (-19 << 16)
   => r29 = 0xc000000001214e00

  ld      r29,31424(r29)
   => r29 = *(0xc000000001214e00 + 31424)
   => r29 = *(0xc00000000121c8c0)

0xc00000000121c8c0 is where we placed a watchpoint and thus this
instruction was emulated by emulate_step. But because handle_dabr_fault
did not restore emulated register state, r29 still contains stale
value in above register state.

Fixes: 5aae8a5370802 ("powerpc, hw_breakpoints: Implement hw_breakpoints for 64-bit server processors") 
Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Cc: stable@vger.kernel.org # 2.6.36+
Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/exceptions-64s.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 6b86055e5251..0e649d980ec3 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1761,7 +1761,7 @@ handle_dabr_fault:
 	ld      r5,_DSISR(r1)
 	addi    r3,r1,STACK_FRAME_OVERHEAD
 	bl      do_break
-12:	b       ret_from_except_lite
+12:	b       ret_from_except
 
 
 #ifdef CONFIG_PPC_BOOK3S_64
-- 
2.20.1


^ permalink raw reply related

* [PATCH kernel] powerpc/powernv/ioda: Fix race in TCE level allocation
From: Alexey Kardashevskiy @ 2019-06-11  2:31 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Jose Ricardo Ziviani, Alexey Kardashevskiy, Alistair Popple,
	Daniel Henrique Barboza, kvm-ppc, Sam Bobroff, Paul Mackerras,
	stable, Oliver O'Halloran, David Gibson

pnv_tce() returns a pointer to a TCE entry and originally a TCE table
would be pre-allocated. For the default case of 2GB window the table
needs only a single level and that is fine. However if more levels are
requested, it is possible to get a race when 2 threads want a pointer
to a TCE entry from the same page of TCEs.

This adds a spinlock to handle the race. The alloc==true case is not
possible in real mode, so the spinlock is safe for KVM as well.

CC: stable@vger.kernel.org # v4.19+
Fixes: a68bd1267b72 ("powerpc/powernv/ioda: Allocate indirect TCE levels on demand")
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---

This fixes EEH's from
https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=110810


---
 arch/powerpc/include/asm/iommu.h              |  1 +
 arch/powerpc/platforms/powernv/pci-ioda-tce.c | 21 ++++++++++++-------
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 2c1845e5e851..1825b4cc0097 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -111,6 +111,7 @@ struct iommu_table {
 	struct iommu_table_ops *it_ops;
 	struct kref    it_kref;
 	int it_nid;
+	spinlock_t it_lock;
 };
 
 #define IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry) \
diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index e28f03e1eb5e..9a19d61e2b12 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -29,6 +29,7 @@ void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
 	tbl->it_size = tce_size >> 3;
 	tbl->it_busno = 0;
 	tbl->it_type = TCE_PCI;
+	spin_lock_init(&tbl->it_lock);
 }
 
 static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
@@ -60,18 +61,22 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
 		unsigned long tce;
 
 		if (tmp[n] == 0) {
-			__be64 *tmp2;
-
 			if (!alloc)
 				return NULL;
 
-			tmp2 = pnv_alloc_tce_level(tbl->it_nid,
-					ilog2(tbl->it_level_size) + 3);
-			if (!tmp2)
-				return NULL;
+			spin_lock(&tbl->it_lock);
+			if (tmp[n] == 0) {
+				__be64 *tmp2;
 
-			tmp[n] = cpu_to_be64(__pa(tmp2) |
-					TCE_PCI_READ | TCE_PCI_WRITE);
+				tmp2 = pnv_alloc_tce_level(tbl->it_nid,
+						ilog2(tbl->it_level_size) + 3);
+				if (tmp2)
+					tmp[n] = cpu_to_be64(__pa(tmp2) |
+						TCE_PCI_READ | TCE_PCI_WRITE);
+			}
+			spin_unlock(&tbl->it_lock);
+			if (tmp[n] == 0)
+				return NULL;
 		}
 		tce = be64_to_cpu(tmp[n]);
 
-- 
2.17.1


^ permalink raw reply related

* Re: [PATCH RESEND 1/2] tools/perf: Add arch neutral function to choose event for perf kvm record
From: Ravi Bangoria @ 2019-06-11  2:16 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo, Anju T Sudhakar
  Cc: Ravi Bangoria, maddy, peterz, linuxppc-dev, linux-kernel,
	alexander.shishkin, namhyung, jolsa
In-Reply-To: <20190610151642.GT21245@kernel.org>



On 6/10/19 8:46 PM, Arnaldo Carvalho de Melo wrote:
> Em Mon, Jun 10, 2019 at 12:15:17PM +0530, Anju T Sudhakar escreveu:
>> 'perf kvm record' uses 'cycles'(if the user did not specify any event) as
>> the default event to profile the guest.
>> This will not provide any proper samples from the guest incase of
>> powerpc architecture, since in powerpc the PMUs are controlled by
>> the guest rather than the host.
>>
>> Patch adds a function to pick an arch specific event for 'perf kvm record',
>> instead of selecting 'cycles' as a default event for all architectures.
>>
>> For powerpc this function checks for any user specified event, and if there
>> isn't any it returns invalid instead of proceeding with 'cycles' event.
> 
> Michael, Ravi, Maddy, could you please provide an Acked-by, Reviewed-by
> or Tested-by?

Code looks fine to me but cross-build fails for aarch64:

  builtin-kvm.c:1513:12: error: no previous prototype for 'kvm_add_default_arch_event' [-Werror=missing-prototypes]
   int __weak kvm_add_default_arch_event(int *argc __maybe_unused,
              ^~~~~~~~~~~~~~~~~~~~~~~~~~
  cc1: all warnings being treated as errors
  mv: cannot stat './.builtin-kvm.o.tmp': No such file or directory

With the build fix:
Acked-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>


^ permalink raw reply

* [PATCH] crypto: vmx - Document CTR mode counter width quirks
From: Daniel Axtens @ 2019-06-11  1:54 UTC (permalink / raw)
  To: mpe, ebiggers, linux-crypto, Herbert Xu
  Cc: leo.barbosa, Stephan Mueller, nayna, omosnacek, leitao, pfsmorigo,
	marcelo.cerri, gcwilson, linuxppc-dev

The CTR code comes from OpenSSL, where it does a 32-bit counter.
The kernel has a 128-bit counter. This difference has led to
issues.

Document it.

Signed-off-by: Daniel Axtens <dja@axtens.net>
---
 drivers/crypto/vmx/aesp8-ppc.pl | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/drivers/crypto/vmx/aesp8-ppc.pl b/drivers/crypto/vmx/aesp8-ppc.pl
index 9c6b5c1d6a1a..db874367b602 100644
--- a/drivers/crypto/vmx/aesp8-ppc.pl
+++ b/drivers/crypto/vmx/aesp8-ppc.pl
@@ -1286,6 +1286,24 @@ ___
 
 #########################################################################
 {{{	# CTR procedure[s]						#
+
+####################### WARNING: Here be dragons! #######################
+#
+# This code is written as 'ctr32', based on a 32-bit counter used
+# upstream. The kernel does *not* use a 32-bit counter. The kernel uses
+# a 128-bit counter.
+#
+# This leads to subtle changes from the upstream code: the counter
+# is incremented with vaddu_q_m rather than vaddu_w_m. This occurs in
+# both the bulk (8 blocks at a time) path, and in the individual block
+# path. Be aware of this when doing updates.
+#
+# See:
+# 1d4aa0b4c181 ("crypto: vmx - Fixing AES-CTR counter bug")
+# 009b30ac7444 ("crypto: vmx - CTR: always increment IV as quadword")
+# https://github.com/openssl/openssl/pull/8942
+#
+#########################################################################
 my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
 my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
 my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
@@ -1357,7 +1375,7 @@ Loop_ctr32_enc:
 	addi		$idx,$idx,16
 	bdnz		Loop_ctr32_enc
 
-	vadduqm		$ivec,$ivec,$one
+	vadduqm		$ivec,$ivec,$one	# Kernel change for 128-bit
 	 vmr		$dat,$inptail
 	 lvx		$inptail,0,$inp
 	 addi		$inp,$inp,16
@@ -1501,7 +1519,7 @@ Load_ctr32_enc_key:
 	$SHL		$len,$len,4
 
 	vadduqm		$out1,$ivec,$one	# counter values ...
-	vadduqm		$out2,$ivec,$two
+	vadduqm		$out2,$ivec,$two	# (do all ctr adds as 128-bit)
 	vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
 	 le?li		$idx,8
 	vadduqm		$out3,$out1,$two
-- 
2.20.1


^ permalink raw reply related

* Re: [PATCH v2] powerpc: Add force enable of DAWR on P9 option
From: Cédric Le Goater @ 2019-06-10 17:31 UTC (permalink / raw)
  To: Michael Neuling, mpe; +Cc: linuxppc-dev, Cameron Kaiser
In-Reply-To: <20190401060312.22670-1-mikey@neuling.org>

Hello Michael,


> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -822,18 +822,21 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
>  	mtspr	SPRN_IAMR, r5
>  	mtspr	SPRN_PSPB, r6
>  	mtspr	SPRN_FSCR, r7
> -	ld	r5, VCPU_DAWR(r4)
> -	ld	r6, VCPU_DAWRX(r4)
> -	ld	r7, VCPU_CIABR(r4)
> -	ld	r8, VCPU_TAR(r4)
>  	/*
>  	 * Handle broken DAWR case by not writing it. This means we
>  	 * can still store the DAWR register for migration.
>  	 */
> -BEGIN_FTR_SECTION
> +	LOAD_REG_ADDR(r5, dawr_force_enable)
> +	lbz	r5, 0(r5)
> +	cmpdi	r5, 0
> +	beq	1f
> +	ld	r5, VCPU_DAWR(r4)
> +	ld	r6, VCPU_DAWRX(r4)
>  	mtspr	SPRN_DAWR, r5
>  	mtspr	SPRN_DAWRX, r6
> -END_FTR_SECTION_IFSET(CPU_FTR_DAWR)
> +1:
> +	ld	r7, VCPU_CIABR(r4)
> +	ld	r8, VCPU_TAR(r4)
>  	mtspr	SPRN_CIABR, r7
>  	mtspr	SPRN_TAR, r8
>  	ld	r5, VCPU_IC(r4)
> @@ -2513,11 +2516,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
>  	blr
>  
>  2:
> -BEGIN_FTR_SECTION
> -	/* POWER9 with disabled DAWR */
> +	LOAD_REG_ADDR(r11, dawr_force_enable)
> +	lbz	r11, 0(r11)
> +	cmpdi	r11, 0
>  	li	r3, H_HARDWARE
> -	blr
> -END_FTR_SECTION_IFCLR(CPU_FTR_DAWR)
> +	beqlr

Why is this a 'beqlr' ? Shouldn't it be a blr ? 

C.

>  	/* Emulate H_SET_DABR/X on P8 for the sake of compat mode guests */
>  	rlwimi	r5, r4, 5, DAWRX_DR | DAWRX_DW
>  	rlwimi	r5, r4, 2, DAWRX_WT
> 


^ permalink raw reply

* Re: [RFC PATCH] powerpc/book3e: KASAN Full support for 64bit
From: Daniel Axtens @ 2019-06-11  1:21 UTC (permalink / raw)
  To: Christophe Leroy, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <a0c04cb8-a19b-2d73-5725-4868556e2b47@c-s.fr>

Christophe Leroy <christophe.leroy@c-s.fr> writes:

> On 06/03/2019 11:50 PM, Daniel Axtens wrote:
>> Christophe Leroy <christophe.leroy@c-s.fr> writes:
>> 
>>> Hi,
>>>
>>> Ok, can you share your .config ?
>> 
>> Sure! This one is with kasan off as the last build I did was testing to
>> see if the code reorgisation was the cause of the issues. (it was not)
>> 
>> 
>> 
>> 
>> This was the kasan-enabled config that failed to boot:
>> 
>> 
>
> Same issue with your .config under QEMU:
>
> A go with gdb shows:
>
> Breakpoint 3, 0xc000000000027b6c in exc_0x700_common ()
> => 0xc000000000027b6c <exc_0x700_common+0>:	f8 01 00 70	std     r0,112(r1)
> (gdb) bt
> #0  0xc000000000027b6c in exc_0x700_common ()
> #1  0xc00000000136f80c in .udbg_init_memcons ()
>

Thanks for debugging this!

> Without CONFIG_PPC_EARLY_DEBUG, it boots fine for me. Can you check on 
> your side ?

Yes, that works on my side.

> Deactivating KASAN for arch/powerpc/kernel/udbg.o and 
> arch/powerpc/sysdev/udbg_memcons.o is not enough, we hit a call to 
> strstr() in register_early_udbg_console(), and once we get rid of it (in 
> the same way as in prom_init.c) the next issue is register_console() and 
> I don't know what to do about that one.

Disabling early debug seems like a reasonable restriction to add.

I'll have a look at modules across this and book3s next.

Regards,
Daniel

>
> Christophe
>
>> 
>> 
>> Regards,
>> Daniel
>> 
>>>
>>> Christophe
>>>
>>> Le 31/05/2019 à 03:29, Daniel Axtens a écrit :
>>>> Hi Christophe,
>>>>
>>>> I tried this on the t4240rdb and it fails to boot if KASAN is
>>>> enabled. It does boot with the patch applied but KASAN disabled, so that
>>>> narrows it down a little bit.
>>>>
>>>> I need to focus on 3s first so I'll just drop 3e from my patch set for
>>>> now.
>>>>
>>>> Regards,
>>>> Daniel
>>>>
>>>>> The KASAN shadow area is mapped into vmemmap space:
>>>>> 0x8000 0400 0000 0000 to 0x8000 0600 0000 0000.
>>>>> For this vmemmap has to be disabled.
>>>>>
>>>>> Cc: Daniel Axtens <dja@axtens.net>
>>>>> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
>>>>> ---
>>>>>    arch/powerpc/Kconfig                  |   1 +
>>>>>    arch/powerpc/Kconfig.debug            |   3 +-
>>>>>    arch/powerpc/include/asm/kasan.h      |  11 +++
>>>>>    arch/powerpc/kernel/Makefile          |   2 +
>>>>>    arch/powerpc/kernel/head_64.S         |   3 +
>>>>>    arch/powerpc/kernel/setup_64.c        |  20 +++---
>>>>>    arch/powerpc/mm/kasan/Makefile        |   1 +
>>>>>    arch/powerpc/mm/kasan/kasan_init_64.c | 129 ++++++++++++++++++++++++++++++++++
>>>>>    8 files changed, 159 insertions(+), 11 deletions(-)
>>>>>    create mode 100644 arch/powerpc/mm/kasan/kasan_init_64.c
>>>>>
>>>>> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
>>>>> index 1a2fb50126b2..e0b7c45e4dc7 100644
>>>>> --- a/arch/powerpc/Kconfig
>>>>> +++ b/arch/powerpc/Kconfig
>>>>> @@ -174,6 +174,7 @@ config PPC
>>>>>    	select HAVE_ARCH_AUDITSYSCALL
>>>>>    	select HAVE_ARCH_JUMP_LABEL
>>>>>    	select HAVE_ARCH_KASAN			if PPC32
>>>>> +	select HAVE_ARCH_KASAN			if PPC_BOOK3E_64 && !SPARSEMEM_VMEMMAP
>>>>>    	select HAVE_ARCH_KGDB
>>>>>    	select HAVE_ARCH_MMAP_RND_BITS
>>>>>    	select HAVE_ARCH_MMAP_RND_COMPAT_BITS	if COMPAT
>>>>> diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
>>>>> index 61febbbdd02b..b4140dd6b4e4 100644
>>>>> --- a/arch/powerpc/Kconfig.debug
>>>>> +++ b/arch/powerpc/Kconfig.debug
>>>>> @@ -370,4 +370,5 @@ config PPC_FAST_ENDIAN_SWITCH
>>>>>    config KASAN_SHADOW_OFFSET
>>>>>    	hex
>>>>>    	depends on KASAN
>>>>> -	default 0xe0000000
>>>>> +	default 0xe0000000 if PPC32
>>>>> +	default 0x6800040000000000 if PPC64
>>>>> diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h
>>>>> index 296e51c2f066..756b3d58f921 100644
>>>>> --- a/arch/powerpc/include/asm/kasan.h
>>>>> +++ b/arch/powerpc/include/asm/kasan.h
>>>>> @@ -23,10 +23,21 @@
>>>>>    
>>>>>    #define KASAN_SHADOW_OFFSET	ASM_CONST(CONFIG_KASAN_SHADOW_OFFSET)
>>>>>    
>>>>> +#ifdef CONFIG_PPC32
>>>>>    #define KASAN_SHADOW_END	0UL
>>>>>    
>>>>>    #define KASAN_SHADOW_SIZE	(KASAN_SHADOW_END - KASAN_SHADOW_START)
>>>>>    
>>>>> +#else
>>>>> +
>>>>> +#include <asm/pgtable.h>
>>>>> +
>>>>> +#define KASAN_SHADOW_SIZE	(KERN_VIRT_SIZE >> KASAN_SHADOW_SCALE_SHIFT)
>>>>> +
>>>>> +#define KASAN_SHADOW_END	(KASAN_SHADOW_START + KASAN_SHADOW_SIZE)
>>>>> +
>>>>> +#endif /* CONFIG_PPC32 */
>>>>> +
>>>>>    #ifdef CONFIG_KASAN
>>>>>    void kasan_early_init(void);
>>>>>    void kasan_mmu_init(void);
>>>>> diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
>>>>> index 0ea6c4aa3a20..7f232c06f11d 100644
>>>>> --- a/arch/powerpc/kernel/Makefile
>>>>> +++ b/arch/powerpc/kernel/Makefile
>>>>> @@ -35,6 +35,8 @@ KASAN_SANITIZE_early_32.o := n
>>>>>    KASAN_SANITIZE_cputable.o := n
>>>>>    KASAN_SANITIZE_prom_init.o := n
>>>>>    KASAN_SANITIZE_btext.o := n
>>>>> +KASAN_SANITIZE_paca.o := n
>>>>> +KASAN_SANITIZE_setup_64.o := n
>>>>>    
>>>>>    ifdef CONFIG_KASAN
>>>>>    CFLAGS_early_32.o += -DDISABLE_BRANCH_PROFILING
>>>>> diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
>>>>> index 3fad8d499767..80fbd8024fb2 100644
>>>>> --- a/arch/powerpc/kernel/head_64.S
>>>>> +++ b/arch/powerpc/kernel/head_64.S
>>>>> @@ -966,6 +966,9 @@ start_here_multiplatform:
>>>>>    	 * and SLB setup before we turn on relocation.
>>>>>    	 */
>>>>>    
>>>>> +#ifdef CONFIG_KASAN
>>>>> +	bl	kasan_early_init
>>>>> +#endif
>>>>>    	/* Restore parameters passed from prom_init/kexec */
>>>>>    	mr	r3,r31
>>>>>    	bl	early_setup		/* also sets r13 and SPRG_PACA */
>>>>> diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
>>>>> index ba404dd9ce1d..d2bf860dd966 100644
>>>>> --- a/arch/powerpc/kernel/setup_64.c
>>>>> +++ b/arch/powerpc/kernel/setup_64.c
>>>>> @@ -311,6 +311,16 @@ void __init early_setup(unsigned long dt_ptr)
>>>>>     	DBG(" -> early_setup(), dt_ptr: 0x%lx\n", dt_ptr);
>>>>>    
>>>>>    	/*
>>>>> +	 * Configure exception handlers. This include setting up trampolines
>>>>> +	 * if needed, setting exception endian mode, etc...
>>>>> +	 */
>>>>> +	configure_exceptions();
>>>>> +
>>>>> +	/* Apply all the dynamic patching */
>>>>> +	apply_feature_fixups();
>>>>> +	setup_feature_keys();
>>>>> +
>>>>> +	/*
>>>>>    	 * Do early initialization using the flattened device
>>>>>    	 * tree, such as retrieving the physical memory map or
>>>>>    	 * calculating/retrieving the hash table size.
>>>>> @@ -325,16 +335,6 @@ void __init early_setup(unsigned long dt_ptr)
>>>>>    	setup_paca(paca_ptrs[boot_cpuid]);
>>>>>    	fixup_boot_paca();
>>>>>    
>>>>> -	/*
>>>>> -	 * Configure exception handlers. This include setting up trampolines
>>>>> -	 * if needed, setting exception endian mode, etc...
>>>>> -	 */
>>>>> -	configure_exceptions();
>>>>> -
>>>>> -	/* Apply all the dynamic patching */
>>>>> -	apply_feature_fixups();
>>>>> -	setup_feature_keys();
>>>>> -
>>>>>    	/* Initialize the hash table or TLB handling */
>>>>>    	early_init_mmu();
>>>>>    
>>>>> diff --git a/arch/powerpc/mm/kasan/Makefile b/arch/powerpc/mm/kasan/Makefile
>>>>> index 6577897673dd..0bfbe3892808 100644
>>>>> --- a/arch/powerpc/mm/kasan/Makefile
>>>>> +++ b/arch/powerpc/mm/kasan/Makefile
>>>>> @@ -3,3 +3,4 @@
>>>>>    KASAN_SANITIZE := n
>>>>>    
>>>>>    obj-$(CONFIG_PPC32)           += kasan_init_32.o
>>>>> +obj-$(CONFIG_PPC64)	+= kasan_init_64.o
>>>>> diff --git a/arch/powerpc/mm/kasan/kasan_init_64.c b/arch/powerpc/mm/kasan/kasan_init_64.c
>>>>> new file mode 100644
>>>>> index 000000000000..7fd71b8e883b
>>>>> --- /dev/null
>>>>> +++ b/arch/powerpc/mm/kasan/kasan_init_64.c
>>>>> @@ -0,0 +1,129 @@
>>>>> +// SPDX-License-Identifier: GPL-2.0
>>>>> +
>>>>> +#define DISABLE_BRANCH_PROFILING
>>>>> +
>>>>> +#include <linux/kasan.h>
>>>>> +#include <linux/printk.h>
>>>>> +#include <linux/memblock.h>
>>>>> +#include <linux/sched/task.h>
>>>>> +#include <asm/pgalloc.h>
>>>>> +
>>>>> +static void __init kasan_populate_pte(pte_t *ptep, pgprot_t prot)
>>>>> +{
>>>>> +	unsigned long va = (unsigned long)kasan_early_shadow_page;
>>>>> +	phys_addr_t pa = __pa(kasan_early_shadow_page);
>>>>> +	int i;
>>>>> +
>>>>> +	for (i = 0; i < PTRS_PER_PTE; i++, ptep++)
>>>>> +		__set_pte_at(&init_mm, va, ptep, pfn_pte(PHYS_PFN(pa), prot), 0);
>>>>> +}
>>>>> +
>>>>> +static void __init kasan_populate_pmd(pmd_t *pmdp)
>>>>> +{
>>>>> +	int i;
>>>>> +
>>>>> +	for (i = 0; i < PTRS_PER_PMD; i++)
>>>>> +		pmd_populate_kernel(&init_mm, pmdp + i, kasan_early_shadow_pte);
>>>>> +}
>>>>> +
>>>>> +static void __init kasan_populate_pud(pud_t *pudp)
>>>>> +{
>>>>> +	int i;
>>>>> +
>>>>> +	for (i = 0; i < PTRS_PER_PUD; i++)
>>>>> +		pud_populate(&init_mm, pudp + i, kasan_early_shadow_pmd);
>>>>> +}
>>>>> +
>>>>> +static void __init *kasan_alloc_pgtable(unsigned long size)
>>>>> +{
>>>>> +	void *ptr = memblock_alloc_try_nid(size, size, MEMBLOCK_LOW_LIMIT,
>>>>> +					   __pa(MAX_DMA_ADDRESS), NUMA_NO_NODE);
>>>>> +
>>>>> +	if (!ptr)
>>>>> +		panic("%s: Failed to allocate %lu bytes align=0x%lx max_addr=%lx\n",
>>>>> +		      __func__, size, size, __pa(MAX_DMA_ADDRESS));
>>>>> +
>>>>> +	return ptr;
>>>>> +}
>>>>> +
>>>>> +static int __init kasan_map_page(unsigned long va, unsigned long pa, pgprot_t prot)
>>>>> +{
>>>>> +	pgd_t *pgdp = pgd_offset_k(va);
>>>>> +	pud_t *pudp;
>>>>> +	pmd_t *pmdp;
>>>>> +	pte_t *ptep;
>>>>> +
>>>>> +	if (pgd_none(*pgdp) || (void *)pgd_page_vaddr(*pgdp) == kasan_early_shadow_pud) {
>>>>> +		pudp = kasan_alloc_pgtable(PUD_TABLE_SIZE);
>>>>> +		kasan_populate_pud(pudp);
>>>>> +		pgd_populate(&init_mm, pgdp, pudp);
>>>>> +	}
>>>>> +	pudp = pud_offset(pgdp, va);
>>>>> +	if (pud_none(*pudp) || (void *)pud_page_vaddr(*pudp) == kasan_early_shadow_pmd) {
>>>>> +		pmdp = kasan_alloc_pgtable(PMD_TABLE_SIZE);
>>>>> +		kasan_populate_pmd(pmdp);
>>>>> +		pud_populate(&init_mm, pudp, pmdp);
>>>>> +	}
>>>>> +	pmdp = pmd_offset(pudp, va);
>>>>> +	if (!pmd_present(*pmdp) || (void *)pmd_page_vaddr(*pmdp) == kasan_early_shadow_pte) {
>>>>> +		ptep = kasan_alloc_pgtable(PTE_TABLE_SIZE);
>>>>> +		kasan_populate_pte(ptep, PAGE_KERNEL);
>>>>> +		pmd_populate_kernel(&init_mm, pmdp, ptep);
>>>>> +	}
>>>>> +	ptep = pte_offset_kernel(pmdp, va);
>>>>> +
>>>>> +	__set_pte_at(&init_mm, va, ptep, pfn_pte(pa >> PAGE_SHIFT, prot), 0);
>>>>> +
>>>>> +	return 0;
>>>>> +}
>>>>> +
>>>>> +static void __init kasan_init_region(struct memblock_region *reg)
>>>>> +{
>>>>> +	void *start = __va(reg->base);
>>>>> +	void *end = __va(reg->base + reg->size);
>>>>> +	unsigned long k_start, k_end, k_cur;
>>>>> +
>>>>> +	if (start >= end)
>>>>> +		return;
>>>>> +
>>>>> +	k_start = (unsigned long)kasan_mem_to_shadow(start);
>>>>> +	k_end = (unsigned long)kasan_mem_to_shadow(end);
>>>>> +
>>>>> +	for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE) {
>>>>> +		void *va = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
>>>>> +
>>>>> +		kasan_map_page(k_cur, __pa(va), PAGE_KERNEL);
>>>>> +	}
>>>>> +	flush_tlb_kernel_range(k_start, k_end);
>>>>> +}
>>>>> +
>>>>> +void __init kasan_init(void)
>>>>> +{
>>>>> +	struct memblock_region *reg;
>>>>> +
>>>>> +	for_each_memblock(memory, reg)
>>>>> +		kasan_init_region(reg);
>>>>> +
>>>>> +	/* It's too early to use clear_page() ! */
>>>>> +	memset(kasan_early_shadow_page, 0, sizeof(kasan_early_shadow_page));
>>>>> +
>>>>> +	/* Enable error messages */
>>>>> +	init_task.kasan_depth = 0;
>>>>> +	pr_info("KASAN init done\n");
>>>>> +}
>>>>> +
>>>>> +/* The early shadow maps everything to a single page of zeroes */
>>>>> +asmlinkage void __init kasan_early_init(void)
>>>>> +{
>>>>> +	unsigned long addr = KASAN_SHADOW_START;
>>>>> +	unsigned long end = KASAN_SHADOW_END;
>>>>> +	pgd_t *pgdp = pgd_offset_k(addr);
>>>>> +
>>>>> +	kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL);
>>>>> +	kasan_populate_pmd(kasan_early_shadow_pmd);
>>>>> +	kasan_populate_pud(kasan_early_shadow_pud);
>>>>> +
>>>>> +	do {
>>>>> +		pgd_populate(&init_mm, pgdp, kasan_early_shadow_pud);
>>>>> +	} while (pgdp++, addr = pgd_addr_end(addr, end), addr != end);
>>>>> +}
>>>>> -- 
>>>>> 2.13.3

^ permalink raw reply

* Re: crash after NX error
From: Haren Myneni @ 2019-06-11  0:44 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev, Stewart Smith
In-Reply-To: <87zhmwmgv7.fsf@concordia.ellerman.id.au>

On 06/05/2019 04:06 AM, Michael Ellerman wrote:
> Stewart Smith <stewart@linux.ibm.com> writes:
>> On my two socket POWER9 system (powernv) with 842 zwap set up, I
>> recently got a crash with the Ubuntu kernel (I haven't tried with
>> upstream, and this is the first time the system has died like this, so
>> I'm not sure how repeatable it is).
>>
>> [    2.891463] zswap: loaded using pool 842-nx/zbud
>> ...
>> [15626.124646] nx_compress_powernv: ERROR: CSB still not valid after 5000000 us, giving up : 00 00 00 00 00000000
>> [16868.932913] Unable to handle kernel paging request for data at address 0x6655f67da816cdb8
>> [16868.933726] Faulting instruction address: 0xc000000000391600
>>
>>
>> cpu 0x68: Vector: 380 (Data Access Out of Range) at [c000001c9d98b9a0]
>>     pc: c000000000391600: kmem_cache_alloc+0x2e0/0x340
>>     lr: c0000000003915ec: kmem_cache_alloc+0x2cc/0x340
>>     sp: c000001c9d98bc20
>>    msr: 900000000280b033
>>    dar: 6655f67da816cdb8
>>   current = 0xc000001ad43cb400
>>   paca    = 0xc00000000fac7800   softe: 0        irq_happened: 0x01
>>     pid   = 8319, comm = make
>> Linux version 4.15.0-50-generic (buildd@bos02-ppc64el-006) (gcc version 7.3.0 (Ubuntu 7.3.0-16ubuntu3)) #54-Ubuntu SMP Mon May 6 18:55:18 UTC 2019 (Ubuntu 4.15.0-50.54-generic 4.15.18)
>>
>> 68:mon> t
>> [c000001c9d98bc20] c0000000003914d4 kmem_cache_alloc+0x1b4/0x340 (unreliable)
>> [c000001c9d98bc80] c0000000003b1e14 __khugepaged_enter+0x54/0x220
>> [c000001c9d98bcc0] c00000000010f0ec copy_process.isra.5.part.6+0xebc/0x1a10
>> [c000001c9d98bda0] c00000000010fe4c _do_fork+0xec/0x510
>> [c000001c9d98be30] c00000000000b584 ppc_clone+0x8/0xc
>> --- Exception: c00 (System Call) at 00007afe9daf87f4
>> SP (7fffca606880) is in userspace
>>
>> So, it looks like there could be a problem in the error path, plausibly
>> fixed by this patch:
>>
>> commit 656ecc16e8fc2ab44b3d70e3fcc197a7020d0ca5
>> Author: Haren Myneni <haren@linux.vnet.ibm.com>
>> Date:   Wed Jun 13 00:32:40 2018 -0700
>>
>>     crypto/nx: Initialize 842 high and normal RxFIFO control registers
>>     
>>     NX increments readOffset by FIFO size in receive FIFO control register
>>     when CRB is read. But the index in RxFIFO has to match with the
>>     corresponding entry in FIFO maintained by VAS in kernel. Otherwise NX
>>     may be processing incorrect CRBs and can cause CRB timeout.
>>     
>>     VAS FIFO offset is 0 when the receive window is opened during
>>     initialization. When the module is reloaded or in kexec boot, readOffset
>>     in FIFO control register may not match with VAS entry. This patch adds
>>     nx_coproc_init OPAL call to reset readOffset and queued entries in FIFO
>>     control register for both high and normal FIFOs.
>>     
>>     Signed-off-by: Haren Myneni <haren@us.ibm.com>
>>     [mpe: Fixup uninitialized variable warning]
>>     Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
>>
>> $ git describe --contains 656ecc16e8fc2ab44b3d70e3fcc197a7020d0ca5
>> v4.19-rc1~24^2~50
>>
>>
>> Which was never backported to any stable release, so probably needs to
>> be for v4.14 through v4.18.
> 
> Yeah the P9 NX support went in in:
>   b0d6c9bab5e4 ("crypto/nx: Add P9 NX support for 842 compression engine")
> 
> Which was: v4.14-rc1~119^2~21, so first released in v4.14.
> 
> 
> I'm actually less interested in that and more interested in the
> subsequent crash. The time stamps are miles apart though, did we just
> leave some corrupted memory after the NX failed and then hit it later?
> Or did we not correctly signal to the upper level APIs that the request
> failed.
> 
> I think we need to do some testing with errors injected into the
> wait_for_csb() path, to ensure that failures there are not causing
> corrupting in zswap. Haren have you done any testing of error injection?

The code path properly returns the error code from wait_for_csb() to the upper-level APIs. In the decompression case, upon failure the request will fall back to SW 842.

If NX is involved in this crash, the compression request may succeed with an invalid CRB (mismatched FIFO entries in NX and VAS). SW 842 may then decompress invalid data, which might cause corruption later when it is accessed.

I will try to reproduce the issue with 4.14 kernel,

Thanks
Haren
  
> 
> cheers
> 


^ permalink raw reply

* [Bug 203839] Kernel 5.2-rc3 fails to boot on a PowerMac G4 3,6: systemd[1]: Failed to bump fs.file-max, ignoring: invalid argument
From: bugzilla-daemon @ 2019-06-11  0:34 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <bug-203839-206035@https.bugzilla.kernel.org/>

https://bugzilla.kernel.org/show_bug.cgi?id=203839

Erhard F. (erhard_f@mailbox.org) changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
 Attachment #283139|0                           |1
        is obsolete|                            |

--- Comment #7 from Erhard F. (erhard_f@mailbox.org) ---
Created attachment 283185
  --> https://bugzilla.kernel.org/attachment.cgi?id=283185&action=edit
kernel .config (5.1.0-rc3+, G4 MDD)

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

^ permalink raw reply

* [Bug 203839] Kernel 5.2-rc3 fails to boot on a PowerMac G4 3,6: systemd[1]: Failed to bump fs.file-max, ignoring: invalid argument
From: bugzilla-daemon @ 2019-06-11  0:32 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <bug-203839-206035@https.bugzilla.kernel.org/>

https://bugzilla.kernel.org/show_bug.cgi?id=203839

--- Comment #6 from Erhard F. (erhard_f@mailbox.org) ---
Created attachment 283183
  --> https://bugzilla.kernel.org/attachment.cgi?id=283183&action=edit
bisect.log

bisect took me a while due to quite some skips. Cherry-picking
397d2300b08cdee052053e362018cdb6dd65eea2 and
305d60012304684bd59ea1f67703e51662e4906a helped me complete it.

# git bisect good | tee -a /root/bisect02.log
215b823707ce4e8e52b106915f70357fa474c669 is the first bad commit
commit 215b823707ce4e8e52b106915f70357fa474c669
Author: Christophe Leroy <christophe.leroy@c-s.fr>
Date:   Fri Apr 26 16:23:36 2019 +0000

    powerpc/32s: set up an early static hash table for KASAN.

    KASAN requires early activation of hash table, before memblock()
    functions are available.

    This patch implements an early hash_table statically defined in
    __initdata.

    During early boot, a single page table is used.

    For hash32, when doing the final init, one page table is allocated
    for each PGD entry because of the _PAGE_HASHPTE flag which can't be
    common to several virt pages. This is done after memblock get
    available but before switching to the final hash table, otherwise
    there are issues with TLB flushing due to the shared entries.

    Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
    Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

:040000 040000 abc24eb3c4ad3e4f2b1eb7b52c295c8b95d79a78
c3b6114c26eb8e181abb3f1abc9b6ecc12292f4d M      arch

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox