Linux Documentation
 help / color / mirror / Atom feed
* [PATCH v5 2/2] cpuset: Add cpuset.flags control knob to v2
From: Waiman Long @ 2018-03-15 21:20 UTC (permalink / raw)
  To: Tejun Heo, Li Zefan, Johannes Weiner, Peter Zijlstra, Ingo Molnar
  Cc: cgroups, linux-kernel, linux-doc, kernel-team, pjt, luto, efault,
	torvalds, Roman Gushchin, Waiman Long
In-Reply-To: <1521148842-15486-1-git-send-email-longman@redhat.com>

In cgroup v1, there are a bunch of cpuset control knobs that are just
boolean flags. To reduce the proliteration of cpuset control knobs
in v2 where multiple controllers will be active in a single cgroup,
all the boolean flags are now consolidated into a single cpuset.flags
control knob.

Currently, the cpuset.flags control knob just has one flag -
sched_load_balance. This flag is needed to enable CPU isolation
similar to what can be done with the "isolcpus" kernel boot parameter.
More flags can be added in the future if desired.

When the "cpuset.flags" file is read, it shows what flags have been
currently enabled. Turning on or off specific flags are done in a
way similar to what controllers can be enabled or disabled in the
cgroup.subtree_control control knob.

To enable the sched_load_balance flag, use

  # echo +sched_load_balance > cpuset.flags

To disable it, use

  # echo -sched_load_balance > cpuset.flags

Signed-off-by: Waiman Long <longman@redhat.com>
---
 Documentation/cgroup-v2.txt | 32 +++++++++++++++
 kernel/cgroup/cpuset.c      | 98 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 130 insertions(+)

diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index b91fd5d..362026a 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -1318,6 +1318,38 @@ Cpuset Interface Files
 	of "cpuset.mems".  Its value will be affected by memory nodes
 	hotplug events.
 
+  cpuset.flags
+	A read-write multiple values file which exists on non-root
+	cgroups.
+
+	On read, it lists the flags that are currently enabled.  On
+	write, the '+' or '-' prefix with the flag name is used to
+	enable and disable the flag respectively.
+
+	The currently supported flag is:
+
+	  sched_load_balance
+		When it is not set, there will be no load balancing
+		among CPUs on this cpuset.  Tasks will stay in the
+		CPUs they are running on and will not be moved to
+		other CPUs.
+
+		When it is set, tasks within this cpuset will be
+		load-balanced by the kernel scheduler.  Tasks will be
+		moved from CPUs with high load to other CPUs within
+		the same cpuset with less load periodically.
+
+		This flag is on by default and will have to be
+		explicitly turned off.
+
+	To set a flag, use the '+' prefix:
+
+	  # echo +sched_load_balance > cpuset.flags
+
+	To clear a flag, use the '-' prefix:
+
+	  # echo -sched_load_balance > cpuset.flags
+
 
 IO
 --
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 7837d1f..3145dc0 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1601,6 +1601,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 	FILE_MEMORY_PRESSURE,
 	FILE_SPREAD_PAGE,
 	FILE_SPREAD_SLAB,
+	FILE_FLAGS,
 } cpuset_filetype_t;
 
 static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
@@ -1823,6 +1824,97 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
 	return 0;
 }
 
+static const struct {
+	char *name;
+	int  cs_bit;
+} cpuset_flags[] = {
+	{ "sched_load_balance", CS_SCHED_LOAD_BALANCE },
+};
+
+static int cpuset_read_flags(struct seq_file *sf, void *v)
+{
+	struct cpuset *cs = css_cs(seq_css(sf));
+	unsigned long enabled = READ_ONCE(cs->flags);
+	int i, cnt;
+
+	for (i = cnt = 0; i < ARRAY_SIZE(cpuset_flags); i++) {
+		if (!test_bit(cpuset_flags[i].cs_bit, &enabled))
+			continue;
+
+		if (cnt++)
+			seq_putc(sf, ' ');
+
+		seq_printf(sf, "%s", cpuset_flags[i].name);
+	}
+	seq_putc(sf, '\n');
+	return 0;
+}
+
+static ssize_t cpuset_write_flags(struct kernfs_open_file *of, char *buf,
+				  size_t nbytes, loff_t off)
+{
+	struct cpuset *cs = css_cs(of_css(of));
+	unsigned long enable = 0, disable = 0;
+	char *tok;
+	int i;
+
+	mutex_lock(&cpuset_mutex);
+	if (!is_cpuset_online(cs)) {
+		nbytes = -ENODEV;
+		goto out_unlock;
+	}
+
+	/*
+	 * Parse input - space seperated list of feature names prefixed
+	 * with either + or -.
+	 */
+	buf = strstrip(buf);
+	while ((tok = strsep(&buf, " "))) {
+		if (tok[0] == '\0')
+			continue;
+		for (i = 0; i < ARRAY_SIZE(cpuset_flags); i++)
+			if (!strcmp(tok + 1, cpuset_flags[i].name))
+				break;
+		if (i == ARRAY_SIZE(cpuset_flags)) {
+			nbytes = -EINVAL;
+			goto out_unlock;
+		}
+		if (*tok == '+') {
+			set_bit(cpuset_flags[i].cs_bit, &cs->flags);
+			enable  |= 1UL << cpuset_flags[i].cs_bit;
+			disable &= 1UL << cpuset_flags[i].cs_bit;
+		} else if (*tok == '-') {
+			disable |= 1UL << cpuset_flags[i].cs_bit;
+			enable  &= 1UL << cpuset_flags[i].cs_bit;
+		} else {
+			nbytes = -EINVAL;
+			goto out_unlock;
+		}
+	}
+
+	/*
+	 * Set/clear the flags individually. It is possible an error may
+	 * happen before all the flags are processed, but it should be rare.
+	 */
+	for  (i = 0; i < ARRAY_SIZE(cpuset_flags); i++) {
+		unsigned long flag = 1UL << cpuset_flags[i].cs_bit;
+		int retval;
+
+		if (flag & (enable|disable)) {
+			retval = update_flag(cpuset_flags[i].cs_bit, cs,
+					     enable & flag);
+			if (retval) {
+				nbytes = retval;
+				goto out_unlock;
+			}
+		}
+	}
+
+out_unlock:
+	mutex_unlock(&cpuset_mutex);
+	return nbytes;
+}
+
 /*
  * for the common functions, 'private' gives the type of file
  */
@@ -1962,6 +2054,12 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
 		.private = FILE_EFFECTIVE_MEMLIST,
 	},
 
+	{
+		.name = "flags",
+		.seq_show = cpuset_read_flags,
+		.write = cpuset_write_flags,
+		.private = FILE_FLAGS,
+	},
 	{ }	/* terminate */
 };
 
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH v5 0/2] cpuset: Enable cpuset controller in default hierarchy
From: Waiman Long @ 2018-03-15 21:20 UTC (permalink / raw)
  To: Tejun Heo, Li Zefan, Johannes Weiner, Peter Zijlstra, Ingo Molnar
  Cc: cgroups, linux-kernel, linux-doc, kernel-team, pjt, luto, efault,
	torvalds, Roman Gushchin, Waiman Long

v5:
 - Add patch 2 to provide the cpuset.flags control knob for the
   sched_load_balance flag which should be the only feature that is
   essential as a replacement of the "isolcpus" kernel boot parameter.

v4:
 - Further minimize the feature set by removing the flags control knob.

v3:
 - Further trim the additional features down to just memory_migrate.
 - Update Documentation/cgroup-v2.txt.

The purpose of this patchset is to provide a minimal set of cpuset
features for cgroup v2. That minimal set includes the cpus, mems and
their effective_* counterparts as well as a new flags control knob
that currently supports only the sched_load_balance flag.

This patchset does not exclude the possibility of adding more flags
and features in the future after careful consideration.

Patch 1 enables cpuset in cgroup v2 with cpus, mems and their
effective_* counterparts.

Patch 2 adds flags with support for the sched_load_balance only.

Waiman Long (2):
  cpuset: Enable cpuset controller in default hierarchy
  cpuset: Add cpuset.flags control knob to v2

 Documentation/cgroup-v2.txt | 128 ++++++++++++++++++++++++++++++++++++----
 kernel/cgroup/cpuset.c      | 140 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 256 insertions(+), 12 deletions(-)

-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v9 5/7] arm64: kvm: Introduce KVM_ARM_SET_SERROR_ESR ioctl
From: James Morse @ 2018-03-15 20:46 UTC (permalink / raw)
  To: gengdongjiu, drjones
  Cc: gengdongjiu, linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, corbet@lwn.net,
	marc.zyngier@arm.com, catalin.marinas@arm.com,
	linux-doc@vger.kernel.org, rjw@rjwysocki.net,
	linux@armlinux.org.uk, will.deacon@arm.com,
	robert.moore@intel.com, linux-acpi@vger.kernel.org, bp@alien8.de,
	lv.zheng@intel.com, Huangshaoyu, kvmarm@lists.cs.columbia.edu,
	devel@acpica.org
In-Reply-To: <CAMj-D2COh4xDKreZXPHeiJK34mrPqDbHszDWY-fz_SfxTRaHQA@mail.gmail.com>

Hi gengdongjiu,

On 08/03/18 06:18, gengdongjiu wrote:
> Hi James,
>    sorry for my late response due to chines new year.

Happy new year,


> 2018-02-16 1:55 GMT+08:00 James Morse <james.morse@arm.com>:
>> On 12/02/18 10:19, gengdongjiu wrote:
>>> On 2018/2/10 1:44, James Morse wrote:
>>>> The point? We can't know what a CPU without the RAS extensions puts in there.
>>>>
>>>> Why Does this matter? When migrating a pending SError we have to know the
>>>> difference between 'use this 64bit value', and 'the CPU will generate it'.
>>>> If I make an SError pending with ESR=0 on a CPU with VSESR, I can't migrated to
>>>> a system that generates an impdef SError-ESR, because I can't know it will be 0.
>>
>>> For the target system, before taking the SError, no one can know whether its syndrome value
>>> is IMPLEMENTATION DEFINED or architecturally defined.
>>
>> For a virtual-SError, the hypervisor knows what it generated. (do I have
>> VSESR_EL2? What did I put in there?).
>>
>>
>>> when the virtual SError is taken, the ESR_ELx.IDS will be updated, then we can know
>>> whether the ESR value is impdef or architecturally defined.
>>
>> True, the guest can't know anything about a pending virtual SError until it
>> takes it. Why is this a problem?
>>
>>
>>> It seems migration is only allowed only when target system and source system all support
>>> RAS extension, because we do not know whether its syndrome is IMPLEMENTATION DEFINED or
>>> architecturally defined.
>>
>> I don't think Qemu allows migration between hosts with differing guest-ID
>> registers. But we shouldn't depend on this, and we may want to hide the v8.2 RAS
>> features from the guest's ID register, but still use them from the host.
>>
>> The way I imagined it working was we would pack the following information into
>> that events struct:
>> {
>>         bool serror_pending;
>>         bool serror_has_esr;
>>         u64  serror_esr;
>> }
> 
> I have used your suggestion struct

Ah! This is where it came from. Sorry, this was just to illustrate the
information/sizes we wanted to transfer.... I didn't mean it literally.

I should have said "64 bits of ESR, so that we can transfer anything that is
added to VSESR_EL2 in the future, a flag somewhere to indicate an serror is
pending, and another flag to indicate the ESR has a value we should use".


Thanks/Sorry!

James


>> The problem I was trying to describe is because there is no value of serror_esr
>> we can use to mean 'Ignore this, I'm a v8.0 CPU'. VSESR_EL2 is a 64bit register,
>> any bits we abuse may get a meaning we want to use in the future.
>>
>> When it comes to migration, v8.{0,1} systems can only GET/SET events where
>> serror_has_esr == false, they can't use the serror_esr. On v8.2 systems we
>> should require serror_has_esr to be true.
> yes, I agreed.
> 
>>
>> If we need to support migration from v8.{0,1} to v8.2, we can make up an impdef
>> serror_esr.
> 
> For the Qemu migration, I need to check more the QEMU code.


> Hi Andrew,
>       I use KVM_GET/SET_VCPU_EVENTS IOCTL to migrate the Serror
> exception status of VM,
> The even struct is shown below:
> 
> {
>       bool serror_pending;
>       bool serror_has_esr;
>      u64  serror_esr;
> }
> 
> Only when the target machine is armv8.2, it needs to set the
> serror_esr(SError Exception Syndrome Register).
> for the armv8.0,  software can not set the serror_esr(SError Exception
> Syndrome Register).
> so when migration from v8.{0,1} to v8.2, QEMU should make up an impdef
> serror_esr for the v8.2 target.
> can you give me some suggestion how to set that register in the QEMU?
> I do not familar with the QEMU migration.
> Thanks very much.

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v10 2/5] arm64: KVM: export the capability to set guest SError syndrome
From: James Morse @ 2018-03-15 20:39 UTC (permalink / raw)
  To: Dongjiu Geng
  Cc: rkrcmar, corbet, christoffer.dall, marc.zyngier, linux,
	catalin.marinas, rjw, bp, lenb, kvm, linux-doc, linux-kernel,
	linux-arm-kernel, kvmarm, linux-acpi, devel, huangshaoyu
In-Reply-To: <1520093380-42577-3-git-send-email-gengdongjiu@huawei.com>

Hi Dongjiu Geng,

On 03/03/18 16:09, Dongjiu Geng wrote:
> Before user space injects a SError, it needs to know whether it can
> specify the guest Exception Syndrome, so KVM should tell user space
> whether it has such capability.

> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index fc3ae95..8a3d708 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -4415,3 +4415,14 @@ Parameters: none
>  This capability indicates if the flic device will be able to get/set the
>  AIS states for migration via the KVM_DEV_FLIC_AISM_ALL attribute and allows
>  to discover this without having to create a flic device.
> +
> +8.14 KVM_CAP_ARM_SET_SERROR_ESR
> +
> +Architectures: arm, arm64
> +
> +This capability indicates that userspace can specify syndrome value reported to
> +guest OS when guest takes a virtual SError interrupt exception.

"when userspace triggers a virtual SError"... how?


> +If KVM has this capability, userspace can only specify the ISS field for the ESR
> +syndrome, can not specify the EC field which is not under control by KVM.

Where do I put the ESR?
If you re-order this after the patch that adds the API, you can describe how
this can be used.


Thanks,

James



> +If this virtual SError is taken to EL1 using AArch64, this value will be reported
> +into ISS filed of ESR_EL1.
> diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
> index 3256b92..38c8a64 100644
> --- a/arch/arm64/kvm/reset.c
> +++ b/arch/arm64/kvm/reset.c
> @@ -77,6 +77,9 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
>  	case KVM_CAP_ARM_PMU_V3:
>  		r = kvm_arm_support_pmu_v3();
>  		break;
> +	case KVM_CAP_ARM_INJECT_SERROR_ESR:
> +		r = cpus_have_const_cap(ARM64_HAS_RAS_EXTN);
> +		break;
>  	case KVM_CAP_SET_GUEST_DEBUG:
>  	case KVM_CAP_VCPU_ATTRIBUTES:
>  		r = 1;
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 8fb90a0..3587b33 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -934,6 +934,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_S390_AIS_MIGRATION 150
>  #define KVM_CAP_PPC_GET_CPU_CHAR 151
>  #define KVM_CAP_S390_BPB 152
> +#define KVM_CAP_ARM_INJECT_SERROR_ESR 153
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v10 3/5] arm/arm64: KVM: Introduce set and get per-vcpu event
From: James Morse @ 2018-03-15 20:38 UTC (permalink / raw)
  To: Dongjiu Geng
  Cc: rkrcmar, corbet, christoffer.dall, marc.zyngier, linux,
	catalin.marinas, rjw, bp, lenb, kvm, linux-doc, linux-kernel,
	linux-arm-kernel, kvmarm, linux-acpi, devel, huangshaoyu
In-Reply-To: <1520093380-42577-4-git-send-email-gengdongjiu@huawei.com>

Hi Dongjiu Geng,

On 03/03/18 16:09, Dongjiu Geng wrote:
> RAS Extension provides VSESR_EL2 register to specify
> virtual SError syndrome value, this patch adds a new
> IOCTL to export user-invisible states related to
> SError exceptions. User space can setup the
> kvm_vcpu_events to inject specified SError, also it
> can support live migration.

> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index 8a3d708..26ae151 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -819,11 +819,13 @@ struct kvm_clock_data {
>  
>  Capability: KVM_CAP_VCPU_EVENTS
>  Extended by: KVM_CAP_INTR_SHADOW
> -Architectures: x86
> +Architectures: x86, arm, arm64
>  Type: vm ioctl
>  Parameters: struct kvm_vcpu_event (out)
>  Returns: 0 on success, -1 on error
>  
> +X86:
> +
>  Gets currently pending exceptions, interrupts, and NMIs as well as related
>  states of the vcpu.
>  
> @@ -865,15 +867,29 @@ Only two fields are defined in the flags field:
>  - KVM_VCPUEVENT_VALID_SMM may be set in the flags field to signal that
>    smi contains a valid state.
>  
> +ARM, ARM64:
> +
> +Gets currently pending SError exceptions as well as related states of the vcpu.
> +
> +struct kvm_vcpu_events {
> +       struct {
> +               bool serror_pending;
> +               bool serror_has_esr;
> +               u64 serror_esr;
> +       } exception;
> +};

Don't put bool in an ABI struct. The encoding is up to the compiler.
The compiler will insert padding in this struct to make serror_esr naturally
aligned. Different compilers may do it differently. You'll see that the existing
struct kvm_vcpu_events has 'pad' fields to ensure each element in the struct is
naturally aligned.

serror_pending and serror_has_esr need to be in a flags field.

I thought the logic for re-using the CAP was so user-space could re-use
save/restore code to transfer whatever we put in here during migration. If the
struct is a different size the code has to be different anyway.
My understanding of Drew and Christoffer's comments was that we should re-use
the existing struct. (but now that I look at it, its not so clear).

(If we reuse the struct, we can put the esr in exception.error_code, if we can
get away with it: It would be good to union exception up with a u64, then use
that. This would let us transfer anything we need in those RES0 bits of the
64bit VSESR_EL2).


>  4.32 KVM_SET_VCPU_EVENTS
>  
>  Capability: KVM_CAP_VCPU_EVENTS
>  Extended by: KVM_CAP_INTR_SHADOW
> -Architectures: x86
> +Architectures: x86, arm, arm64
>  Type: vm ioctl
>  Parameters: struct kvm_vcpu_event (in)
>  Returns: 0 on success, -1 on error
>  
> +X86:
> +
>  Set pending exceptions, interrupts, and NMIs as well as related states of the
>  vcpu.
>  
> @@ -894,6 +910,12 @@ shall be written into the VCPU.
>  
>  KVM_VCPUEVENT_VALID_SMM can only be set if KVM_CAP_X86_SMM is available.
>  
> +ARM, ARM64:
> +
> +Set pending SError exceptions as well as related states of the vcpu.
> +
> +See KVM_GET_VCPU_EVENTS for the data structure.
> +
>  
>  4.33 KVM_GET_DEBUGREGS
>  

> diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
> index 9abbf30..32c0eae 100644
> --- a/arch/arm64/include/uapi/asm/kvm.h
> +++ b/arch/arm64/include/uapi/asm/kvm.h
> @@ -39,6 +39,7 @@
>  #define __KVM_HAVE_GUEST_DEBUG
>  #define __KVM_HAVE_IRQ_LINE
>  #define __KVM_HAVE_READONLY_MEM
> +#define __KVM_HAVE_VCPU_EVENTS
>  
>  #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
>  
> @@ -153,6 +154,15 @@ struct kvm_sync_regs {
>  struct kvm_arch_memory_slot {
>  };
>  
> +/* for KVM_GET/SET_VCPU_EVENTS */
> +struct kvm_vcpu_events {
> +	struct {
> +		bool serror_pending;
> +		bool serror_has_esr;
> +		u64 serror_esr;
> +	} exception;
> +};
> +

>  /* If you need to interpret the index values, here is the key: */
>  #define KVM_REG_ARM_COPROC_MASK		0x000000000FFF0000
>  #define KVM_REG_ARM_COPROC_SHIFT	16
> diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
> index 5c7f657..62d49c2 100644
> --- a/arch/arm64/kvm/guest.c
> +++ b/arch/arm64/kvm/guest.c
> @@ -277,6 +277,32 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
>  	return -EINVAL;
>  }
>  
> +int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
> +			struct kvm_vcpu_events *events)
> +{
> +	events->exception.serror_pending = (vcpu_get_hcr(vcpu) & HCR_VSE);
> +	events->exception.serror_has_esr =
> +			cpus_have_const_cap(ARM64_HAS_RAS_EXTN) &&
> +					(!!vcpu_get_vsesr(vcpu));
> +	events->exception.serror_esr = vcpu_get_vsesr(vcpu);
> +
> +	return 0;

Nothing checks the return value. Why is it here?

> +}
> +
> +int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
> +			struct kvm_vcpu_events *events)
> +{
> +	bool injected = events->exception.serror_pending;
> +	bool has_esr = events->exception.serror_has_esr;

Could you validate 'events' describes something we support. What if
cpus_have_const_cap(ARM64_HAS_RAS_EXTN) is false, we still call kvm_set_sei_esr().

Please check any parts of the struct that should be zero, are zero. This lets us
add new features, and reject attempts to migrate them (instead of silently
ignoring them).


> +	if (injected && has_esr)
> +		kvm_set_sei_esr(vcpu, events->exception.serror_esr);
> +	else if (injected)
> +		kvm_inject_vabt(vcpu);
> +
> +	return 0;

Nothing checks the return value. Why is it here?


> +}
> +
>  int __attribute_const__ kvm_target_cpu(void)
>  {
>  	unsigned long implementor = read_cpuid_implementor();


> diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
> index 7e3941f..30c56e0 100644
> --- a/virt/kvm/arm/arm.c
> +++ b/virt/kvm/arm/arm.c
> @@ -1051,6 +1051,24 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>  			return -EFAULT;
>  		return kvm_arm_vcpu_has_attr(vcpu, &attr);
>  	}
> +	case KVM_GET_VCPU_EVENTS: {
> +		struct kvm_vcpu_events events;

Please initialise events to 0 so that padding transferred to user-space doesn't
contain kernel stack.


> +		kvm_arm_vcpu_get_events(vcpu, &events);
> +
> +		if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
> +			return -EFAULT;
> +
> +		return 0;
> +	}
> +	case KVM_SET_VCPU_EVENTS: {
> +		struct kvm_vcpu_events events;
> +
> +		if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
> +			return -EFAULT;
> +
> +		return kvm_arm_vcpu_set_events(vcpu, &events);
> +	}
>  	default:
>  		return -EINVAL;
>  	}
> 

Thanks,

James

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v10 1/5] arm64: KVM: Prepare set virtual SEI syndrome value
From: James Morse @ 2018-03-15 20:37 UTC (permalink / raw)
  To: Dongjiu Geng
  Cc: rkrcmar, corbet, christoffer.dall, marc.zyngier, linux,
	catalin.marinas, rjw, bp, lenb, kvm, linux-doc, linux-kernel,
	linux-arm-kernel, kvmarm, linux-acpi, devel, huangshaoyu
In-Reply-To: <1520093380-42577-2-git-send-email-gengdongjiu@huawei.com>

Hi Dongjiu Geng,

On 03/03/18 16:09, Dongjiu Geng wrote:
> Export one API to specify virtual SEI syndrome value
> for guest, and add a helper to get the VSESR_EL2 value.

This patch adds two helpers that nothing calls... its not big, please merge it
with the patch that uses these.


> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> index 413dc82..3294885 100644
> --- a/arch/arm64/include/asm/kvm_emulate.h
> +++ b/arch/arm64/include/asm/kvm_emulate.h
> @@ -71,6 +71,11 @@ static inline void vcpu_set_hcr(struct kvm_vcpu *vcpu, unsigned long hcr)
>  	vcpu->arch.hcr_el2 = hcr;
>  }
>  
> +static inline unsigned long vcpu_get_vsesr(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.vsesr_el2;
> +}
> +
>  static inline void vcpu_set_vsesr(struct kvm_vcpu *vcpu, u64 vsesr)
>  {
>  	vcpu->arch.vsesr_el2 = vsesr;
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index a73f63a..3dc49b7 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -354,6 +354,8 @@ void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run,
>  int kvm_perf_init(void);
>  int kvm_perf_teardown(void);
>  
> +void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome);
> +
>  struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
>  
>  static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
> diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
> index 60666a0..78ecb28 100644
> --- a/arch/arm64/kvm/inject_fault.c
> +++ b/arch/arm64/kvm/inject_fault.c
> @@ -186,3 +186,8 @@ void kvm_inject_vabt(struct kvm_vcpu *vcpu)
>  {
>  	pend_guest_serror(vcpu, ESR_ELx_ISV);
>  }
> +
> +void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome)
> +{
> +	pend_guest_serror(vcpu, syndrome & ESR_ELx_ISS_MASK);

If you move the ISS_MASK into pend_guest_serror(), you wouldn't need this at all.

It would be better if any validation were in the user-space helpers so we can
check user-space hasn't put something funny in the top bits.

> +}
> 


Thanks,

James
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH v2] Documentation/CodingStyle: Add an example for braces
From: Gary R Hook @ 2018-03-15 20:04 UTC (permalink / raw)
  To: linux-doc; +Cc: linux-kernel, corbet

Add another example of required braces when using a compound statements.

Signed-off-by: Gary R Hook <gary.hook@amd.com>
---

Changes since v1:
- Move the new example up, and make it more generic

 Documentation/process/coding-style.rst |    9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/Documentation/process/coding-style.rst b/Documentation/process/coding-style.rst
index a20b44a40ec4..fcef0b4b59d0 100644
--- a/Documentation/process/coding-style.rst
+++ b/Documentation/process/coding-style.rst
@@ -188,6 +188,15 @@ and
 	else
 		do_that();
 
+Do use braces when a body is more complex than a single simple statement:
+
+.. code-block:: c
+
+	if (condition) {
+		if (another_condition)
+			do_something();
+	}
+
 This does not apply if only one branch of a conditional statement is a single
 statement; in the latter case use braces in both branches:
 

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* Re: [PATCH] Documentation/CodingStyle: Add an example for braces
From: Gary R Hook @ 2018-03-15 19:54 UTC (permalink / raw)
  To: Jani Nikula, linux-doc; +Cc: linux-kernel, corbet
In-Reply-To: <87zi39ljni.fsf@intel.com>

On 03/15/2018 05:26 AM, Jani Nikula wrote:
> On Wed, 14 Mar 2018, Gary R Hook <gary.hook@amd.com> wrote:
>> Add another example of required braces when using a compound statement in
>> a loop.
>>
>> Signed-off-by: Gary R Hook <gary.hook@amd.com>
>> ---
>>   Documentation/process/coding-style.rst |    9 +++++++++
>>   1 file changed, 9 insertions(+)
>>
>> diff --git a/Documentation/process/coding-style.rst b/Documentation/process/coding-style.rst
>> index a20b44a40ec4..d98deb62c400 100644
>> --- a/Documentation/process/coding-style.rst
>> +++ b/Documentation/process/coding-style.rst
>> @@ -200,6 +200,15 @@ statement; in the latter case use braces in both branches:
>>   		otherwise();
>>   	}
>>   
>> +Also, use braces when a loop contains more than a single simple statement:
> 
> Personally, I'd not limit this to loops.
> 
> 	if (condition) {
>          	if (another_condition)
>                  	action();
>          }
> 
> You could argue the existing rule already covers these cases by
> excluding selection and iteration statements from the "single statement"
> in "Do not unnecessarily use braces where a single statement will do."

Define "statement"? There's a school of thought that uses semicolons to 
indicate a statement. I'm trying to eliminate any ambiguity by calling 
out compound statements as "more than one statement". Sure, semantics,
but in the interest of clarity. An additional sentence and example 
doesn't really cost much.

Thank you for your time. I've made some changes, and a v2 to follow shortly.



--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: rfc: remove print_vma_addr ? (was Re: [PATCH 00/16] remove eight obsolete architectures)
From: Joe Perches @ 2018-03-15 17:13 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Geert Uytterhoeven, David Howells, Arnd Bergmann, Linux-Arch,
	Linux Kernel Mailing List, linux-doc, linux-block, linux-ide,
	linux-input, netdev, linux-wireless, Linux PWM List, linux-rtc,
	linux-spi, USB list, DRI Development,
	Linux Fbdev development list, Linux Watchdog Mailing List,
	Linux FS Devel, Linux MM
In-Reply-To: <20180315170830.GA17574@bombadil.infradead.org>

On Thu, 2018-03-15 at 10:08 -0700, Matthew Wilcox wrote:
> On Thu, Mar 15, 2018 at 09:56:46AM -0700, Joe Perches wrote:
> > I have a patchset that creates a vsprintf extension for
> > print_vma_addr and removes all the uses similar to the
> > print_symbol() removal.
> > 
> > This now avoids any possible printk interleaving.
> > 
> > Unfortunately, without some #ifdef in vsprintf, which
> > I would like to avoid, it increases the nommu kernel
> > size by ~500 bytes.
> > 
> > Anyone think this is acceptable?
[]
> This doesn't feel like a huge win since it's only called ~once per
> architecture.  I'd be more excited if it made the printing of the whole
> thing standardised; eg we have a print_fault() function in mm/memory.c
> which takes a suitable set of arguments.

Sure but perhaps that's not feasible as the surrounding output
is per-arch specific.

What could be a standardized fault message here?

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: rfc: remove print_vma_addr ? (was Re: [PATCH 00/16] remove eight obsolete architectures)
From: Matthew Wilcox @ 2018-03-15 17:08 UTC (permalink / raw)
  To: Joe Perches
  Cc: Geert Uytterhoeven, David Howells, Arnd Bergmann, Linux-Arch,
	Linux Kernel Mailing List, linux-doc, linux-block, linux-ide,
	linux-input, netdev, linux-wireless, Linux PWM List, linux-rtc,
	linux-spi, USB list, DRI Development,
	Linux Fbdev development list, Linux Watchdog Mailing List,
	Linux FS Devel, Linux MM
In-Reply-To: <1521133006.22221.35.camel@perches.com>

On Thu, Mar 15, 2018 at 09:56:46AM -0700, Joe Perches wrote:
> I have a patchset that creates a vsprintf extension for
> print_vma_addr and removes all the uses similar to the
> print_symbol() removal.
> 
> This now avoids any possible printk interleaving.
> 
> Unfortunately, without some #ifdef in vsprintf, which
> I would like to avoid, it increases the nommu kernel
> size by ~500 bytes.
> 
> Anyone think this is acceptable?
> 
> Here's the overall patch, but I have it as a series
> ---
>  Documentation/core-api/printk-formats.rst |  9 +++++
>  arch/arm64/kernel/traps.c                 | 13 +++----
>  arch/mips/mm/fault.c                      | 16 ++++-----
>  arch/parisc/mm/fault.c                    | 15 ++++----
>  arch/riscv/kernel/traps.c                 | 11 +++---
>  arch/s390/mm/fault.c                      |  7 ++--
>  arch/sparc/mm/fault_32.c                  |  8 ++---
>  arch/sparc/mm/fault_64.c                  |  8 ++---
>  arch/tile/kernel/signal.c                 |  9 ++---
>  arch/um/kernel/trap.c                     | 13 +++----
>  arch/x86/kernel/signal.c                  | 10 ++----
>  arch/x86/kernel/traps.c                   | 18 ++++------
>  arch/x86/mm/fault.c                       | 12 +++----
>  include/linux/mm.h                        |  1 -
>  lib/vsprintf.c                            | 58 ++++++++++++++++++++++++++-----
>  mm/memory.c                               | 33 ------------------
>  16 files changed, 112 insertions(+), 129 deletions(-)

This doesn't feel like a huge win since it's only called ~once per
architecture.  I'd be more excited if it made the printing of the whole
thing standardised; eg we have a print_fault() function in mm/memory.c
which takes a suitable set of arguments.
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* rfc: remove print_vma_addr ? (was Re: [PATCH 00/16] remove eight obsolete architectures)
From: Joe Perches @ 2018-03-15 16:56 UTC (permalink / raw)
  To: Geert Uytterhoeven, David Howells
  Cc: Arnd Bergmann, Linux-Arch, Linux Kernel Mailing List, linux-doc,
	linux-block, linux-ide, linux-input, netdev, linux-wireless,
	Linux PWM List, linux-rtc, linux-spi, USB list, DRI Development,
	Linux Fbdev development list, Linux Watchdog Mailing List,
	Linux FS Devel, Linux MM
In-Reply-To: <CAMuHMdXcxuzCOnFCNm4NXDv-wfYJDO5GQpB_ECu7j=2BjMhNpA@mail.gmail.com>

On Thu, 2018-03-15 at 10:48 +0100, Geert Uytterhoeven wrote:
> Hi David,
> 
> On Thu, Mar 15, 2018 at 10:42 AM, David Howells <dhowells@redhat.com> wrote:
> > Do we have anything left that still implements NOMMU?
> 
> Sure: arm, c6x, m68k, microblaze, and  sh.

I have a patchset that creates a vsprintf extension for
print_vma_addr and removes all the uses similar to the
print_symbol() removal.

This now avoids any possible printk interleaving.

Unfortunately, without some #ifdef in vsprintf, which
I would like to avoid, it increases the nommu kernel
size by ~500 bytes.

Anyone think this is acceptable?

Here's the overall patch, but I have it as a series
---
 Documentation/core-api/printk-formats.rst |  9 +++++
 arch/arm64/kernel/traps.c                 | 13 +++----
 arch/mips/mm/fault.c                      | 16 ++++-----
 arch/parisc/mm/fault.c                    | 15 ++++----
 arch/riscv/kernel/traps.c                 | 11 +++---
 arch/s390/mm/fault.c                      |  7 ++--
 arch/sparc/mm/fault_32.c                  |  8 ++---
 arch/sparc/mm/fault_64.c                  |  8 ++---
 arch/tile/kernel/signal.c                 |  9 ++---
 arch/um/kernel/trap.c                     | 13 +++----
 arch/x86/kernel/signal.c                  | 10 ++----
 arch/x86/kernel/traps.c                   | 18 ++++------
 arch/x86/mm/fault.c                       | 12 +++----
 include/linux/mm.h                        |  1 -
 lib/vsprintf.c                            | 58 ++++++++++++++++++++++++++-----
 mm/memory.c                               | 33 ------------------
 16 files changed, 112 insertions(+), 129 deletions(-)

diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst
index 934559b3c130..10a91da1bc83 100644
--- a/Documentation/core-api/printk-formats.rst
+++ b/Documentation/core-api/printk-formats.rst
@@ -157,6 +157,15 @@ DMA address types dma_addr_t
 For printing a dma_addr_t type which can vary based on build options,
 regardless of the width of the CPU data path.
 
+VMA name and address
+----------------------------
+
+::
+
+	%pav	<name>[hexstart+hexsize] or ?[0+0] if unavailable
+
+For any address, print the vma's name and its starting address and size
+
 Passed by reference.
 
 Raw buffer as an escaped string
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 2b478565d774..48edf812ce8b 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -242,13 +242,14 @@ void arm64_force_sig_info(struct siginfo *info, const char *str,
 	if (!show_unhandled_signals_ratelimited())
 		goto send_sig;
 
-	pr_info("%s[%d]: unhandled exception: ", tsk->comm, task_pid_nr(tsk));
 	if (esr)
-		pr_cont("%s, ESR 0x%08x, ", esr_get_class_string(esr), esr);
-
-	pr_cont("%s", str);
-	print_vma_addr(KERN_CONT " in ", regs->pc);
-	pr_cont("\n");
+		pr_info("%s[%d]: unhandled exception: %s, ESR 0x%08x, %s in %pav\n",
+			tsk->comm, task_pid_nr(tsk),
+			esr_get_class_string(esr), esr,
+			str, &regs->pc);
+	else
+		pr_info("%s[%d]: unhandled exception: %s in %pav\n",
+			tsk->comm, task_pid_nr(tsk), str, &regs->pc);
 	__show_regs(regs);
 
 send_sig:
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index 4f8f5bf46977..ce7bf077a0f5 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -213,14 +213,14 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
 				tsk->comm,
 				write ? "write access to" : "read access from",
 				field, address);
-			pr_info("epc = %0*lx in", field,
-				(unsigned long) regs->cp0_epc);
-			print_vma_addr(KERN_CONT " ", regs->cp0_epc);
-			pr_cont("\n");
-			pr_info("ra  = %0*lx in", field,
-				(unsigned long) regs->regs[31]);
-			print_vma_addr(KERN_CONT " ", regs->regs[31]);
-			pr_cont("\n");
+			pr_info("epc = %0*lx in %pav\n",
+				field,
+				(unsigned long)regs->cp0_epc,
+				&regs->cp0_epc);
+			pr_info("ra  = %0*lx in %pav\n",
+				field,
+				(unsigned long)regs->regs[31],
+				&regs->regs[31]);
 		}
 		current->thread.trap_nr = (regs->cp0_cause >> 2) & 0x1f;
 		info.si_signo = SIGSEGV;
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c
index e247edbca68e..877cea702714 100644
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -240,17 +240,14 @@ show_signal_msg(struct pt_regs *regs, unsigned long code,
 	if (!printk_ratelimit())
 		return;
 
-	pr_warn("\n");
-	pr_warn("do_page_fault() command='%s' type=%lu address=0x%08lx",
-	    tsk->comm, code, address);
-	print_vma_addr(KERN_CONT " in ", regs->iaoq[0]);
-
-	pr_cont("\ntrap #%lu: %s%c", code, trap_name(code),
-		vma ? ',':'\n');
+	pr_warn("do_page_fault() command='%s' type=%lu address=0x%08lx in %pav\n",
+		tsk->comm, code, address, &regs->iaoq[0]);
 
 	if (vma)
-		pr_cont(" vm_start = 0x%08lx, vm_end = 0x%08lx\n",
-			vma->vm_start, vma->vm_end);
+		pr_warn("trap #%lu: %s%c, vm_start = 0x%08lx, vm_end = 0x%08lx\n",
+			code, trap_name(code), vma->vm_start, vma->vm_end);
+	else
+		pr_warn("trap #%lu: %s%c\n", code, trap_name(code));
 
 	show_regs(regs);
 }
diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c
index 93132cb59184..16609dcb2546 100644
--- a/arch/riscv/kernel/traps.c
+++ b/arch/riscv/kernel/traps.c
@@ -78,12 +78,11 @@ static inline void do_trap_siginfo(int signo, int code,
 void do_trap(struct pt_regs *regs, int signo, int code,
 	unsigned long addr, struct task_struct *tsk)
 {
-	if (show_unhandled_signals && unhandled_signal(tsk, signo)
-	    && printk_ratelimit()) {
-		pr_info("%s[%d]: unhandled signal %d code 0x%x at 0x" REG_FMT,
-			tsk->comm, task_pid_nr(tsk), signo, code, addr);
-		print_vma_addr(KERN_CONT " in ", GET_IP(regs));
-		pr_cont("\n");
+	if (show_unhandled_signals && unhandled_signal(tsk, signo) &&
+	    printk_ratelimit()) {
+		pr_info("%s[%d]: unhandled signal %d code 0x%x at 0x" REG_FMT " in %pav\n",
+			tsk->comm, task_pid_nr(tsk), signo, code, addr,
+			&GET_IP(regs));
 		show_regs(regs);
 	}
 
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 93faeca52284..3b1d6d618af2 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -250,10 +250,9 @@ void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault)
 		return;
 	if (!printk_ratelimit())
 		return;
-	printk(KERN_ALERT "User process fault: interruption code %04x ilc:%d ",
-	       regs->int_code & 0xffff, regs->int_code >> 17);
-	print_vma_addr(KERN_CONT "in ", regs->psw.addr);
-	printk(KERN_CONT "\n");
+	printk(KERN_ALERT "User process fault: interruption code %04x ilc:%d in %pav\n",
+	       regs->int_code & 0xffff, regs->int_code >> 17,
+	       &regs->psw.addr);
 	if (is_mm_fault)
 		dump_fault_info(regs);
 	show_regs(regs);
diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c
index a8103a84b4ac..206ec5a1c915 100644
--- a/arch/sparc/mm/fault_32.c
+++ b/arch/sparc/mm/fault_32.c
@@ -113,15 +113,11 @@ show_signal_msg(struct pt_regs *regs, int sig, int code,
 	if (!printk_ratelimit())
 		return;
 
-	printk("%s%s[%d]: segfault at %lx ip %px (rpc %px) sp %px error %x",
+	printk("%s%s[%d]: segfault at %lx ip %px (rpc %px) sp %px error %x in %pav\n",
 	       task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
 	       tsk->comm, task_pid_nr(tsk), address,
 	       (void *)regs->pc, (void *)regs->u_regs[UREG_I7],
-	       (void *)regs->u_regs[UREG_FP], code);
-
-	print_vma_addr(KERN_CONT " in ", regs->pc);
-
-	printk(KERN_CONT "\n");
+	       (void *)regs->u_regs[UREG_FP], code, &regs->pc);
 }
 
 static void __do_fault_siginfo(int code, int sig, struct pt_regs *regs,
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index 41363f46797b..a21199329ebe 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -154,15 +154,11 @@ show_signal_msg(struct pt_regs *regs, int sig, int code,
 	if (!printk_ratelimit())
 		return;
 
-	printk("%s%s[%d]: segfault at %lx ip %px (rpc %px) sp %px error %x",
+	printk("%s%s[%d]: segfault at %lx ip %px (rpc %px) sp %px error %x in %pav\b",
 	       task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
 	       tsk->comm, task_pid_nr(tsk), address,
 	       (void *)regs->tpc, (void *)regs->u_regs[UREG_I7],
-	       (void *)regs->u_regs[UREG_FP], code);
-
-	print_vma_addr(KERN_CONT " in ", regs->tpc);
-
-	printk(KERN_CONT "\n");
+	       (void *)regs->u_regs[UREG_FP], code, &regs->tpc);
 }
 
 static void do_fault_siginfo(int code, int sig, struct pt_regs *regs,
diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c
index f2bf557bb005..0556106dfe8a 100644
--- a/arch/tile/kernel/signal.c
+++ b/arch/tile/kernel/signal.c
@@ -383,13 +383,10 @@ void trace_unhandled_signal(const char *type, struct pt_regs *regs,
 	if (show_unhandled_signals <= 1 && !printk_ratelimit())
 		return;
 
-	printk("%s%s[%d]: %s at %lx pc "REGFMT" signal %d",
+	printk("%s%s[%d]: %s at %lx pc " REGFMT " signal %d in %pav\n",
 	       task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
-	       tsk->comm, task_pid_nr(tsk), type, address, regs->pc, sig);
-
-	print_vma_addr(KERN_CONT " in ", regs->pc);
-
-	printk(KERN_CONT "\n");
+	       tsk->comm, task_pid_nr(tsk), type, address, regs->pc, sig,
+	       &regs->pc);
 
 	if (show_unhandled_signals > 1) {
 		switch (sig) {
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index b2b02df9896e..9281248972c0 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -150,14 +150,11 @@ static void show_segv_info(struct uml_pt_regs *regs)
 	if (!printk_ratelimit())
 		return;
 
-	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %x",
-		task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
-		tsk->comm, task_pid_nr(tsk), FAULT_ADDRESS(*fi),
-		(void *)UPT_IP(regs), (void *)UPT_SP(regs),
-		fi->error_code);
-
-	print_vma_addr(KERN_CONT " in ", UPT_IP(regs));
-	printk(KERN_CONT "\n");
+	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %x in %pav\n",
+	       task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
+	       tsk->comm, task_pid_nr(tsk), FAULT_ADDRESS(*fi),
+	       (void *)UPT_IP(regs), (void *)UPT_SP(regs),
+	       fi->error_code, &UPT_IP(regs));
 }
 
 static void bad_segv(struct faultinfo fi, unsigned long ip)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 4cdc0b27ec82..9ab0c5c50b29 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -841,15 +841,11 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
 {
 	struct task_struct *me = current;
 
-	if (show_unhandled_signals && printk_ratelimit()) {
-		printk("%s"
-		       "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
+	if (show_unhandled_signals && printk_ratelimit())
+		printk("%s%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx in %pav\n",
 		       task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
 		       me->comm, me->pid, where, frame,
-		       regs->ip, regs->sp, regs->orig_ax);
-		print_vma_addr(KERN_CONT " in ", regs->ip);
-		pr_cont("\n");
-	}
+		       regs->ip, regs->sp, regs->orig_ax, &regs->ip);
 
 	force_sig(SIGSEGV, me);
 }
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 3d9b2308e7fa..c6e3d02759e5 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -270,13 +270,10 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
 	tsk->thread.trap_nr = trapnr;
 
 	if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
-	    printk_ratelimit()) {
-		pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx",
+	    printk_ratelimit())
+		pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx in %pav\n",
 			tsk->comm, tsk->pid, str,
-			regs->ip, regs->sp, error_code);
-		print_vma_addr(KERN_CONT " in ", regs->ip);
-		pr_cont("\n");
-	}
+			regs->ip, regs->sp, error_code, &regs->ip);
 
 	force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk);
 }
@@ -565,13 +562,10 @@ do_general_protection(struct pt_regs *regs, long error_code)
 	tsk->thread.trap_nr = X86_TRAP_GP;
 
 	if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
-			printk_ratelimit()) {
-		pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx",
+			printk_ratelimit())
+		pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx in %pav\n",
 			tsk->comm, task_pid_nr(tsk),
-			regs->ip, regs->sp, error_code);
-		print_vma_addr(KERN_CONT " in ", regs->ip);
-		pr_cont("\n");
-	}
+			regs->ip, regs->sp, error_code, &regs->ip);
 
 	force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
 }
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index e6af2b464c3d..b629319e621a 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -857,14 +857,10 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 	if (!printk_ratelimit())
 		return;
 
-	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
-		task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
-		tsk->comm, task_pid_nr(tsk), address,
-		(void *)regs->ip, (void *)regs->sp, error_code);
-
-	print_vma_addr(KERN_CONT " in ", regs->ip);
-
-	printk(KERN_CONT "\n");
+	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx in %pav\n",
+	       task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
+	       tsk->comm, task_pid_nr(tsk), address,
+	       (void *)regs->ip, (void *)regs->sp, error_code, &regs->ip);
 }
 
 static void
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9f1270360983..9584bd3e8c25 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2537,7 +2537,6 @@ extern int randomize_va_space;
 #endif
 
 const char * arch_vma_name(struct vm_area_struct *vma);
-void print_vma_addr(char *prefix, unsigned long rip);
 
 void sparse_mem_maps_populate_node(struct page **map_map,
 				   unsigned long pnum_begin,
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 942b5234a59b..9081476ea4ea 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -35,6 +35,8 @@
 #include <net/addrconf.h>
 #include <linux/siphash.h>
 #include <linux/compiler.h>
+#include <linux/mm_types.h>
+
 #ifdef CONFIG_BLOCK
 #include <linux/blkdev.h>
 #endif
@@ -407,6 +409,11 @@ struct printf_spec {
 #define FIELD_WIDTH_MAX ((1 << 23) - 1)
 #define PRECISION_MAX ((1 << 15) - 1)
 
+static const struct printf_spec strspec = {
+	.field_width = -1,
+	.precision = -1,
+};
+
 static noinline_for_stack
 char *number(char *buf, char *end, unsigned long long num,
 	     struct printf_spec spec)
@@ -1427,6 +1434,45 @@ char *netdev_bits(char *buf, char *end, const void *addr, const char *fmt)
 	return special_hex_number(buf, end, num, size);
 }
 
+static noinline_for_stack
+char *vma_addr(char *buf, char *end, const void *addr)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	char *page;
+	char tbuf[2 * sizeof(unsigned long) * 2 + 4];
+	const char *output = "?[0+0]";
+
+	/*
+	 * we might be running from an atomic context so we cannot sleep
+	 */
+	if (!down_read_trylock(&mm->mmap_sem))
+		goto output;
+
+	vma = find_vma(mm, *(unsigned long *)addr);
+	if (!vma || !vma->vm_file)
+		goto up_read;
+
+	page = (char *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN);
+	if (page) {
+		char *fp;
+
+		fp = file_path(vma->vm_file, page, PAGE_SIZE);
+		if (IS_ERR(fp))
+			fp = "?";
+		buf = string(buf, end, kbasename(fp), strspec);
+		sprintf(tbuf, "[%lx+%lx]",
+			vma->vm_start, vma->vm_end - vma->vm_start);
+		output = tbuf;
+		free_page((unsigned long)page);
+	}
+
+up_read:
+	up_read(&mm->mmap_sem);
+output:
+	return string(buf, end, output, strspec);
+}
+
 static noinline_for_stack
 char *address_val(char *buf, char *end, const void *addr, const char *fmt)
 {
@@ -1434,6 +1480,8 @@ char *address_val(char *buf, char *end, const void *addr, const char *fmt)
 	int size;
 
 	switch (fmt[1]) {
+	case 'v':
+		return vma_addr(buf, end, addr);
 	case 'd':
 		num = *(const dma_addr_t *)addr;
 		size = sizeof(dma_addr_t);
@@ -1474,11 +1522,7 @@ char *format_flags(char *buf, char *end, unsigned long flags,
 					const struct trace_print_flags *names)
 {
 	unsigned long mask;
-	const struct printf_spec strspec = {
-		.field_width = -1,
-		.precision = -1,
-	};
-	const struct printf_spec numspec = {
+	static const struct printf_spec numspec = {
 		.flags = SPECIAL|SMALL,
 		.field_width = -1,
 		.precision = -1,
@@ -1548,10 +1592,6 @@ char *device_node_gen_full_name(const struct device_node *np, char *buf, char *e
 {
 	int depth;
 	const struct device_node *parent = np->parent;
-	static const struct printf_spec strspec = {
-		.field_width = -1,
-		.precision = -1,
-	};
 
 	/* special case for root node */
 	if (!parent)
diff --git a/mm/memory.c b/mm/memory.c
index bc760df8a7f4..f1f922421bde 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4502,39 +4502,6 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr,
 }
 EXPORT_SYMBOL_GPL(access_process_vm);
 
-/*
- * Print the name of a VMA.
- */
-void print_vma_addr(char *prefix, unsigned long ip)
-{
-	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma;
-
-	/*
-	 * we might be running from an atomic context so we cannot sleep
-	 */
-	if (!down_read_trylock(&mm->mmap_sem))
-		return;
-
-	vma = find_vma(mm, ip);
-	if (vma && vma->vm_file) {
-		struct file *f = vma->vm_file;
-		char *buf = (char *)__get_free_page(GFP_NOWAIT);
-		if (buf) {
-			char *p;
-
-			p = file_path(f, buf, PAGE_SIZE);
-			if (IS_ERR(p))
-				p = "?";
-			printk("%s%s[%lx+%lx]", prefix, kbasename(p),
-					vma->vm_start,
-					vma->vm_end - vma->vm_start);
-			free_page((unsigned long)buf);
-		}
-	}
-	up_read(&mm->mmap_sem);
-}
-
 #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
 void __might_fault(const char *file, int line)
 {


--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* Re: [PATCH 5/8] trace_uprobe: Support SDT markers having reference count (semaphore)
From: Steven Rostedt @ 2018-03-15 16:48 UTC (permalink / raw)
  To: Ravi Bangoria
  Cc: mhiramat, oleg, peterz, srikar, acme, ananth, akpm,
	alexander.shishkin, alexis.berlemont, corbet, dan.j.williams,
	gregkh, huawei.libin, hughd, jack, jglisse, jolsa, kan.liang,
	kirill.shutemov, kjlx, kstewart, linux-doc, linux-kernel,
	linux-mm, mhocko, milian.wolff, mingo, namhyung, naveen.n.rao, pc,
	pombredanne, tglx, tmricht, willy, yao.jin, fengguang.wu
In-Reply-To: <20180313125603.19819-6-ravi.bangoria@linux.vnet.ibm.com>

On Tue, 13 Mar 2018 18:26:00 +0530
Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> wrote:

> +static void sdt_increment_ref_ctr(struct trace_uprobe *tu)
> +{
> +	struct uprobe_map_info *info;
> +	struct vm_area_struct *vma;
> +	unsigned long vaddr;
> +
> +	uprobe_start_dup_mmap();

Please add a comment here that this function ups the mm ref count for
each info returned. Otherwise it's hard to know what that mmput() below
matches.

-- Steve

> +	info = uprobe_build_map_info(tu->inode->i_mapping,
> +				tu->ref_ctr_offset, false);
> +	if (IS_ERR(info))
> +		goto out;
> +
> +	while (info) {
> +		down_write(&info->mm->mmap_sem);
> +
> +		vma = sdt_find_vma(info->mm, tu);
> +		vaddr = vma_offset_to_vaddr(vma, tu->ref_ctr_offset);
> +		sdt_update_ref_ctr(info->mm, vaddr, 1);
> +
> +		up_write(&info->mm->mmap_sem);
> +		mmput(info->mm);
> +		info = uprobe_free_map_info(info);
> +	}
> +
> +out:
> +	uprobe_end_dup_mmap();
> +}
> +
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 3/8] Uprobe: Rename map_info to uprobe_map_info
From: Steven Rostedt @ 2018-03-15 16:44 UTC (permalink / raw)
  To: Ravi Bangoria
  Cc: mhiramat, oleg, peterz, srikar, acme, ananth, akpm,
	alexander.shishkin, alexis.berlemont, corbet, dan.j.williams,
	gregkh, huawei.libin, hughd, jack, jglisse, jolsa, kan.liang,
	kirill.shutemov, kjlx, kstewart, linux-doc, linux-kernel,
	linux-mm, mhocko, milian.wolff, mingo, namhyung, naveen.n.rao, pc,
	pombredanne, tglx, tmricht, willy, yao.jin, fengguang.wu
In-Reply-To: <20180313125603.19819-4-ravi.bangoria@linux.vnet.ibm.com>

On Tue, 13 Mar 2018 18:25:58 +0530
Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> wrote:
> -static inline struct map_info *free_map_info(struct map_info *info)
> +static inline struct uprobe_map_info *
> +uprobe_free_map_info(struct uprobe_map_info *info)
>  {
> -	struct map_info *next = info->next;
> +	struct uprobe_map_info *next = info->next;
>  	kfree(info);
>  	return next;
>  }
>  
> -static struct map_info *
> -build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
> +static struct uprobe_map_info *
> +uprobe_build_map_info(struct address_space *mapping, loff_t offset,

Also, as these functions have side effects (like you need to perform a
mmput(info->mm), you need to add kerneldoc type comments to these
functions, explaining how to use them.

When you upgrade a function from static to use cases outside the file,
it requires documenting that function for future users.

-- Steve


> +		      bool is_register)
>  {
>  	unsigned long pgoff = offset >> PAGE_SHIFT;
>  	struct vm_area_struct *vma;
> -	struct map_info *curr = NULL;
> -	struct map_info *prev = NULL;
> -	struct map_info *info;
> +	struct uprobe_map_info *curr = NULL;
> +	struct uprobe_map_info *prev = NULL;
> +	struct uprobe_map_info *info;
>  	int more = 0;
>  
>   again:
> @@ -729,7 +731,7 @@ static inline struct map_info *free_map_info(struct map_info *info)
>  			 * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
>  			 * reclaim. This is optimistic, no harm done if it fails.
>  			 */
> -			prev = kmalloc(sizeof(struct map_info),
> +			prev = kmalloc(sizeof(struct uprobe_map_info),
>  					GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
>  			if (prev)
>  				prev->next = NULL;
> @@ -762,7 +764,7 @@ static inline struct map_info *free_map_info(struct map_info *info)
>  	}
>  
>  	do {
> -		info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
> +		info = kmalloc(sizeof(struct uprobe_map_info), GFP_KERNEL);
>  		if (!info) {
>  			curr = ERR_PTR(-ENOMEM);
>  			goto out;
> @@ -774,7 +776,7 @@ static inline struct map_info *free_map_info(struct map_info *info)
>  	goto again;
>   out:
>  	while (prev)
> -		prev = free_map_info(prev);
> +		prev = uprobe_free_map_info(prev);
>  	return curr;
>  }
>  
> @@ -782,11 +784,11 @@ static inline struct map_info *free_map_info(struct map_info *info)
>  register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
>  {
>  	bool is_register = !!new;
> -	struct map_info *info;
> +	struct uprobe_map_info *info;
>  	int err = 0;
>  
>  	percpu_down_write(&dup_mmap_sem);
> -	info = build_map_info(uprobe->inode->i_mapping,
> +	info = uprobe_build_map_info(uprobe->inode->i_mapping,
>  					uprobe->offset, is_register);
>  	if (IS_ERR(info)) {
>  		err = PTR_ERR(info);
> @@ -825,7 +827,7 @@ static inline struct map_info *free_map_info(struct map_info *info)
>  		up_write(&mm->mmap_sem);
>   free:
>  		mmput(mm);
> -		info = free_map_info(info);
> +		info = uprobe_free_map_info(info);
>  	}
>   out:
>  	percpu_up_write(&dup_mmap_sem);
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 4/8] Uprobe: Export uprobe_map_info along with uprobe_{build/free}_map_info()
From: Steven Rostedt @ 2018-03-15 16:32 UTC (permalink / raw)
  To: Ravi Bangoria
  Cc: mhiramat, oleg, peterz, srikar, acme, ananth, akpm,
	alexander.shishkin, alexis.berlemont, corbet, dan.j.williams,
	gregkh, huawei.libin, hughd, jack, jglisse, jolsa, kan.liang,
	kirill.shutemov, kjlx, kstewart, linux-doc, linux-kernel,
	linux-mm, mhocko, milian.wolff, mingo, namhyung, naveen.n.rao, pc,
	pombredanne, tglx, tmricht, willy, yao.jin, fengguang.wu
In-Reply-To: <20180313125603.19819-5-ravi.bangoria@linux.vnet.ibm.com>

On Tue, 13 Mar 2018 18:25:59 +0530
Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> wrote:

> These exported data structure and functions will be used by other
> files in later patches.

I'm reluctantly OK with the above.

> 
> No functionality changes.

Please remove this line. There are functionality changes. Turning a
static inline into an exported function *is* a functionality change.

-- Steve

> 
> Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com>
> ---
>  include/linux/uprobes.h |  9 +++++++++
>  kernel/events/uprobes.c | 14 +++-----------
>  2 files changed, 12 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> index 0a294e9..7bd2760 100644
> --- a/include/linux/uprobes.h
> +++ b/include/linux/uprobes.h
> @@ -109,12 +109,19 @@ enum rp_check {
>  	RP_CHECK_RET,
>  };
>  
> +struct address_space;
>  struct xol_area;
>  
>  struct uprobes_state {
>  	struct xol_area		*xol_area;
>  };
>  
> +struct uprobe_map_info {
> +	struct uprobe_map_info *next;
> +	struct mm_struct *mm;
> +	unsigned long vaddr;
> +};
> +
>  extern int set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
>  extern int set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
>  extern bool is_swbp_insn(uprobe_opcode_t *insn);
> @@ -149,6 +156,8 @@ struct uprobes_state {
>  extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs);
>  extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
>  					 void *src, unsigned long len);
> +extern struct uprobe_map_info *uprobe_free_map_info(struct uprobe_map_info *info);
> +extern struct uprobe_map_info *uprobe_build_map_info(struct address_space *mapping, loff_t offset, bool is_register);
>  #else /* !CONFIG_UPROBES */
>  struct uprobes_state {
>  };
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 081b88c1..e7830b8 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -695,23 +695,15 @@ static void delete_uprobe(struct uprobe *uprobe)
>  	put_uprobe(uprobe);
>  }
>  
> -struct uprobe_map_info {
> -	struct uprobe_map_info *next;
> -	struct mm_struct *mm;
> -	unsigned long vaddr;
> -};
> -
> -static inline struct uprobe_map_info *
> -uprobe_free_map_info(struct uprobe_map_info *info)
> +struct uprobe_map_info *uprobe_free_map_info(struct uprobe_map_info *info)
>  {
>  	struct uprobe_map_info *next = info->next;
>  	kfree(info);
>  	return next;
>  }
>  
> -static struct uprobe_map_info *
> -uprobe_build_map_info(struct address_space *mapping, loff_t offset,
> -		      bool is_register)
> +struct uprobe_map_info *uprobe_build_map_info(struct address_space *mapping,
> +					      loff_t offset, bool is_register)
>  {
>  	unsigned long pgoff = offset >> PAGE_SHIFT;
>  	struct vm_area_struct *vma;

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 2/8] mm: Prefix vma_ to vaddr_to_offset() and offset_to_vaddr()
From: Steven Rostedt @ 2018-03-15 16:28 UTC (permalink / raw)
  To: Ravi Bangoria
  Cc: mhiramat, oleg, peterz, srikar, acme, ananth, akpm,
	alexander.shishkin, alexis.berlemont, corbet, dan.j.williams,
	gregkh, huawei.libin, hughd, jack, jglisse, jolsa, kan.liang,
	kirill.shutemov, kjlx, kstewart, linux-doc, linux-kernel,
	linux-mm, mhocko, milian.wolff, mingo, namhyung, naveen.n.rao, pc,
	pombredanne, tglx, tmricht, willy, yao.jin, fengguang.wu
In-Reply-To: <20180313125603.19819-3-ravi.bangoria@linux.vnet.ibm.com>

On Tue, 13 Mar 2018 18:25:57 +0530
Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> wrote:

> No functionality changes.

Again, please add an explanation to why this patch is done.

-- Steve

> 
> Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com>
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 1/8] Uprobe: Export vaddr <-> offset conversion functions
From: Steven Rostedt @ 2018-03-15 16:27 UTC (permalink / raw)
  To: Ravi Bangoria
  Cc: mhiramat, oleg, peterz, srikar, acme, ananth, akpm,
	alexander.shishkin, alexis.berlemont, corbet, dan.j.williams,
	gregkh, huawei.libin, hughd, jack, jglisse, jolsa, kan.liang,
	kirill.shutemov, kjlx, kstewart, linux-doc, linux-kernel,
	linux-mm, mhocko, milian.wolff, mingo, namhyung, naveen.n.rao, pc,
	pombredanne, tglx, tmricht, willy, yao.jin, fengguang.wu
In-Reply-To: <20180313125603.19819-2-ravi.bangoria@linux.vnet.ibm.com>

On Tue, 13 Mar 2018 18:25:56 +0530
Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> wrote:

> No functionality changes.

Please add a detailed explanation why this patch is needed. All commits
should be self sufficient and stand on their own. If I were to come up
to this patch via a git blame, I would be clueless to why it was done.

-- Steve


> 
> Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com>
> ---
>
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 5/8] trace_uprobe: Support SDT markers having reference count (semaphore)
From: Oleg Nesterov @ 2018-03-15 15:01 UTC (permalink / raw)
  To: Ravi Bangoria
  Cc: mhiramat, peterz, srikar, acme, ananth, akpm, alexander.shishkin,
	alexis.berlemont, corbet, dan.j.williams, gregkh, huawei.libin,
	hughd, jack, jglisse, jolsa, kan.liang, kirill.shutemov, kjlx,
	kstewart, linux-doc, linux-kernel, linux-mm, mhocko, milian.wolff,
	mingo, namhyung, naveen.n.rao, pc, pombredanne, rostedt, tglx,
	tmricht, willy, yao.jin, fengguang.wu
In-Reply-To: <20180313125603.19819-6-ravi.bangoria@linux.vnet.ibm.com>

On 03/13, Ravi Bangoria wrote:
>
> +sdt_update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
> +{
> +	void *kaddr;
> +	struct page *page;
> +	struct vm_area_struct *vma;
> +	int ret = 0;
> +	unsigned short orig = 0;
> +
> +	if (vaddr == 0)
> +		return -EINVAL;
> +
> +	ret = get_user_pages_remote(NULL, mm, vaddr, 1,
> +		FOLL_FORCE | FOLL_WRITE, &page, &vma, NULL);
> +	if (ret <= 0)
> +		return ret;
> +
> +	kaddr = kmap_atomic(page);
> +	memcpy(&orig, kaddr + (vaddr & ~PAGE_MASK), sizeof(orig));
> +	orig += d;
> +	memcpy(kaddr + (vaddr & ~PAGE_MASK), &orig, sizeof(orig));
> +	kunmap_atomic(kaddr);

Hmm. Why memcpy? You could simply do

	kaddr = kmap_atomic();
	unsigned short *ptr = kaddr + (vaddr & ~PAGE_MASK);
	*ptr += d;
	kunmap_atomic();

Oleg.

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 6/8] trace_uprobe/sdt: Fix multiple update of same reference counter
From: Oleg Nesterov @ 2018-03-15 14:49 UTC (permalink / raw)
  To: Ravi Bangoria
  Cc: mhiramat, peterz, srikar, acme, ananth, akpm, alexander.shishkin,
	alexis.berlemont, corbet, dan.j.williams, gregkh, huawei.libin,
	hughd, jack, jglisse, jolsa, kan.liang, kirill.shutemov, kjlx,
	kstewart, linux-doc, linux-kernel, linux-mm, mhocko, milian.wolff,
	mingo, namhyung, naveen.n.rao, pc, pombredanne, rostedt, tglx,
	tmricht, willy, yao.jin, fengguang.wu
In-Reply-To: <20180313125603.19819-7-ravi.bangoria@linux.vnet.ibm.com>

On 03/13, Ravi Bangoria wrote:
>
> For tiny binaries/libraries, different mmap regions points to the
> same file portion. In such cases, we may increment reference counter
> multiple times.

Yes,

> But while de-registration, reference counter will get
> decremented only by once

could you explain why this happens? sdt_increment_ref_ctr() and
sdt_decrement_ref_ctr() look symmetrical, _decrement_ should see
the same mappings?

Ether way, this patch doesn't look right at first glance... Just
for example,

> +static bool sdt_check_mm_list(struct trace_uprobe *tu, struct mm_struct *mm)
> +{
> +	struct sdt_mm_list *tmp = tu->sml;
> +
> +	if (!tu->sml || !mm)
> +		return false;
> +
> +	while (tmp) {
> +		if (tmp->mm == mm)
> +			return true;
> +		tmp = tmp->next;
> +	}
> +
> +	return false;

...

> +}
> +
> +static void sdt_add_mm_list(struct trace_uprobe *tu, struct mm_struct *mm)
> +{
> +	struct sdt_mm_list *tmp;
> +
> +	tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
> +	if (!tmp)
> +		return;
> +
> +	tmp->mm = mm;
> +	tmp->next = tu->sml;
> +	tu->sml = tmp;
> +}
> +

...

> @@ -1020,8 +1104,16 @@ void trace_uprobe_mmap_callback(struct vm_area_struct *vma)
>  		    !trace_probe_is_enabled(&tu->tp))
>  			continue;
>
> +		down_write(&tu->sml_rw_sem);
> +		if (sdt_check_mm_list(tu, vma->vm_mm))
> +			goto cont;
> +
>  		vaddr = vma_offset_to_vaddr(vma, tu->ref_ctr_offset);
> -		sdt_update_ref_ctr(vma->vm_mm, vaddr, 1);
> +		if (!sdt_update_ref_ctr(vma->vm_mm, vaddr, 1))
> +			sdt_add_mm_list(tu, vma->vm_mm);
> +
> +cont:
> +		up_write(&tu->sml_rw_sem);

To simplify, suppose that tu->sml is empty.

Some process calls this function, increments the counter and adds its ->mm into
the list.

Then it exits, ->mm is freed.

The next fork/exec allocates the same memory for the new ->mm, the new process
calls trace_uprobe_mmap_callback() and sdt_check_mm_list() returns T?

Oleg.

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 5/8] trace_uprobe: Support SDT markers having reference count (semaphore)
From: Oleg Nesterov @ 2018-03-15 14:30 UTC (permalink / raw)
  To: Ravi Bangoria
  Cc: mhiramat, peterz, srikar, acme, ananth, akpm, alexander.shishkin,
	alexis.berlemont, corbet, dan.j.williams, gregkh, huawei.libin,
	hughd, jack, jglisse, jolsa, kan.liang, kirill.shutemov, kjlx,
	kstewart, linux-doc, linux-kernel, linux-mm, mhocko, milian.wolff,
	mingo, namhyung, naveen.n.rao, pc, pombredanne, rostedt, tglx,
	tmricht, willy, yao.jin, fengguang.wu
In-Reply-To: <20180315142120.GA19218@redhat.com>

On 03/15, Oleg Nesterov wrote:
>
> > +static struct vm_area_struct *
> > +sdt_find_vma(struct mm_struct *mm, struct trace_uprobe *tu)
> > +{
> > +	struct vm_area_struct *tmp;
> > +
> > +	for (tmp = mm->mmap; tmp != NULL; tmp = tmp->vm_next)
> > +		if (sdt_valid_vma(tu, tmp))
> > +			return tmp;
> > +
> > +	return NULL;
>
> I can't understand the logic... Lets ignore sdt_valid_vma() for now.
> The caller has uprobe_map_info, why it can't simply do
> vma = find_vma(uprobe_map_info->vaddr)? and then check sdt_valid_vma().

Note to mention that sdt_find_vma() can return NULL but the callers do
vma_offset_to_vaddr(vma) without any check.

Oleg.

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 5/8] trace_uprobe: Support SDT markers having reference count (semaphore)
From: Oleg Nesterov @ 2018-03-15 14:21 UTC (permalink / raw)
  To: Ravi Bangoria
  Cc: mhiramat, peterz, srikar, acme, ananth, akpm, alexander.shishkin,
	alexis.berlemont, corbet, dan.j.williams, gregkh, huawei.libin,
	hughd, jack, jglisse, jolsa, kan.liang, kirill.shutemov, kjlx,
	kstewart, linux-doc, linux-kernel, linux-mm, mhocko, milian.wolff,
	mingo, namhyung, naveen.n.rao, pc, pombredanne, rostedt, tglx,
	tmricht, willy, yao.jin, fengguang.wu
In-Reply-To: <20180313125603.19819-6-ravi.bangoria@linux.vnet.ibm.com>

On 03/13, Ravi Bangoria wrote:
>
> @@ -1053,6 +1056,9 @@ int uprobe_mmap(struct vm_area_struct *vma)
>  	struct uprobe *uprobe, *u;
>  	struct inode *inode;
>
> +	if (uprobe_mmap_callback)
> +		uprobe_mmap_callback(vma);
> +
>  	if (no_uprobe_events() || !valid_vma(vma, true))
>  		return 0;

probe_event_enable() does

	uprobe_register();
	/* WINDOW */
	sdt_increment_ref_ctr();

what if uprobe_mmap() is called in between? The counter(s) in this vma
will be incremented twice, no?

> +static struct vm_area_struct *
> +sdt_find_vma(struct mm_struct *mm, struct trace_uprobe *tu)
> +{
> +	struct vm_area_struct *tmp;
> +
> +	for (tmp = mm->mmap; tmp != NULL; tmp = tmp->vm_next)
> +		if (sdt_valid_vma(tu, tmp))
> +			return tmp;
> +
> +	return NULL;

I can't understand the logic... Lets ignore sdt_valid_vma() for now.
The caller has uprobe_map_info, why it can't simply do
vma = find_vma(uprobe_map_info->vaddr)? and then check sdt_valid_vma().

Oleg.

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 00/16] remove eight obsolete architectures
From: Christoph Hellwig @ 2018-03-15 14:17 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Hannes Reinecke, David Howells, linux-arch,
	Linux Kernel Mailing List, open list:DOCUMENTATION, linux-block,
	IDE-ML, open list:HID CORE LAYER, Networking, linux-wireless,
	linux-pwm, linux-rtc, linux-spi, linux-usb, dri-devel,
	linux-fbdev, linux-watchdog, Linux FS-devel Mailing List,
	Linux-MM
In-Reply-To: <CAK8P3a01pfvsdM1mR8raU9dA7p4H-jRJz2Y8-+KEY76W_Mukpg@mail.gmail.com>

On Thu, Mar 15, 2018 at 11:42:25AM +0100, Arnd Bergmann wrote:
> Is anyone producing a chip that includes enough of the Privileged ISA spec
> to have things like system calls, but not the MMU parts?

Various SiFive SOCs seem to support M and U mode, but no S mode or
iommu.  That should be enough for nommu Linux running in M mode if
someone cares enough to actually port it.
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] Improve mutex documentation
From: Kirill Tkhai @ 2018-03-15 13:23 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Andrew Morton, tj, cl, linux-mm, linux-kernel, linux-doc,
	Jonathan Corbet, Mauro Carvalho Chehab, Peter Zijlstra,
	Ingo Molnar
In-Reply-To: <20180315131832.GC9949@bombadil.infradead.org>

On 15.03.2018 16:18, Matthew Wilcox wrote:
> On Thu, Mar 15, 2018 at 03:12:30PM +0300, Kirill Tkhai wrote:
>>> +/**
>>> + * mutex_lock_killable() - Acquire the mutex, interruptible by fatal signals.
>>
>> Shouldn't we clarify that fatal signals are SIGKILL only?
> 
> It's more complicated than it might seem (... welcome to signal handling!)
> If you send SIGINT to a task that's waiting on a mutex_killable(), it will
> still die.  I *think* that's due to the code in complete_signal():
> 
>         if (sig_fatal(p, sig) &&
>             !(signal->flags & SIGNAL_GROUP_EXIT) &&
>             !sigismember(&t->real_blocked, sig) &&
>             (sig == SIGKILL || !p->ptrace)) {
> ...
>                                 sigaddset(&t->pending.signal, SIGKILL);
> 
> You're correct that this code only checks for SIGKILL, but any fatal
> signal will result in the signal group receiving SIGKILL.
> 
> Unless I've misunderstood, and it wouldn't be the first time I've
> misunderstood signal handling.

Sure, thanks for the explanation.

Kirill
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] Improve mutex documentation
From: Matthew Wilcox @ 2018-03-15 13:18 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: Andrew Morton, tj, cl, linux-mm, linux-kernel, linux-doc,
	Jonathan Corbet, Mauro Carvalho Chehab, Peter Zijlstra,
	Ingo Molnar
In-Reply-To: <2397831d-71b5-3cc8-9dc4-ce06e2eddfde@virtuozzo.com>

On Thu, Mar 15, 2018 at 03:12:30PM +0300, Kirill Tkhai wrote:
> > +/**
> > + * mutex_lock_killable() - Acquire the mutex, interruptible by fatal signals.
> 
> Shouldn't we clarify that fatal signals are SIGKILL only?

It's more complicated than it might seem (... welcome to signal handling!)
If you send SIGINT to a task that's waiting on a mutex_killable(), it will
still die.  I *think* that's due to the code in complete_signal():

        if (sig_fatal(p, sig) &&
            !(signal->flags & SIGNAL_GROUP_EXIT) &&
            !sigismember(&t->real_blocked, sig) &&
            (sig == SIGKILL || !p->ptrace)) {
...
                                sigaddset(&t->pending.signal, SIGKILL);

You're correct that this code only checks for SIGKILL, but any fatal
signal will result in the signal group receiving SIGKILL.

Unless I've misunderstood, and it wouldn't be the first time I've
misunderstood signal handling.
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 8/8] trace_uprobe/sdt: Document about reference counter
From: Masami Hiramatsu @ 2018-03-15 12:47 UTC (permalink / raw)
  To: Ravi Bangoria
  Cc: oleg, peterz, srikar, acme, ananth, akpm, alexander.shishkin,
	alexis.berlemont, corbet, dan.j.williams, gregkh, huawei.libin,
	hughd, jack, jglisse, jolsa, kan.liang, kirill.shutemov, kjlx,
	kstewart, linux-doc, linux-kernel, linux-mm, mhocko, milian.wolff,
	mingo, namhyung, naveen.n.rao, pc, pombredanne, rostedt, tglx,
	tmricht, willy, yao.jin, fengguang.wu
In-Reply-To: <ec9c4ef7-0117-7c7c-64bc-f6bf4261721d@linux.vnet.ibm.com>

Hi Ravi,

On Wed, 14 Mar 2018 20:52:59 +0530
Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> wrote:

> On 03/14/2018 07:20 PM, Masami Hiramatsu wrote:
> > On Tue, 13 Mar 2018 18:26:03 +0530
> > Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> wrote:
> >
> >> No functionality changes.
> > Please consider to describe what is this change and why, here.
> 
> Will add in next version.

Thanks, and could you also move this before perf-probe patch?
Also Could you make perf-probe check the tracing/README whether
the kernel supports reference counter syntax or not?

perf-tool can be used on older (or stable) kernel.

Thank you,

> 
> >> Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com>
> >> ---
> >>  Documentation/trace/uprobetracer.txt | 16 +++++++++++++---
> >>  kernel/trace/trace.c                 |  2 +-
> >>  2 files changed, 14 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/Documentation/trace/uprobetracer.txt b/Documentation/trace/uprobetracer.txt
> >> index bf526a7c..8fb13b0 100644
> >> --- a/Documentation/trace/uprobetracer.txt
> >> +++ b/Documentation/trace/uprobetracer.txt
> >> @@ -19,15 +19,25 @@ user to calculate the offset of the probepoint in the object.
> >>  
> >>  Synopsis of uprobe_tracer
> >>  -------------------------
> >> -  p[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] : Set a uprobe
> >> -  r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] : Set a return uprobe (uretprobe)
> >> -  -:[GRP/]EVENT                           : Clear uprobe or uretprobe event
> >> +  p[:[GRP/]EVENT] PATH:OFFSET[(REF_CTR_OFFSET)] [FETCHARGS]
> >> +  r[:[GRP/]EVENT] PATH:OFFSET[(REF_CTR_OFFSET)] [FETCHARGS]
> > Ah, OK in this context, [] means optional syntax :)
> 
> Correct.
> 
> Thanks,
> Ravi
> 


-- 
Masami Hiramatsu <mhiramat@kernel.org>
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] Improve mutex documentation
From: Kirill Tkhai @ 2018-03-15 12:12 UTC (permalink / raw)
  To: Matthew Wilcox, Andrew Morton
  Cc: tj, cl, linux-mm, linux-kernel, linux-doc, Jonathan Corbet,
	Mauro Carvalho Chehab, Peter Zijlstra, Ingo Molnar
In-Reply-To: <20180315115812.GA9949@bombadil.infradead.org>

Hi, Matthew,

On 15.03.2018 14:58, Matthew Wilcox wrote:
> On Wed, Mar 14, 2018 at 01:56:31PM -0700, Andrew Morton wrote:
>> My memory is weak and our documentation is awful.  What does
>> mutex_lock_killable() actually do and how does it differ from
>> mutex_lock_interruptible()?
> 
> From: Matthew Wilcox <mawilcox@microsoft.com>
> 
> Add kernel-doc for mutex_lock_killable() and mutex_lock_io().  Reword the
> kernel-doc for mutex_lock_interruptible().
> 
> Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
> 
> diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
> index 858a07590e39..2048359f33d2 100644
> --- a/kernel/locking/mutex.c
> +++ b/kernel/locking/mutex.c
> @@ -1082,15 +1082,16 @@ static noinline int __sched
>  __mutex_lock_interruptible_slowpath(struct mutex *lock);
>  
>  /**
> - * mutex_lock_interruptible - acquire the mutex, interruptible
> - * @lock: the mutex to be acquired
> + * mutex_lock_interruptible() - Acquire the mutex, interruptible by signals.
> + * @lock: The mutex to be acquired.
>   *
> - * Lock the mutex like mutex_lock(), and return 0 if the mutex has
> - * been acquired or sleep until the mutex becomes available. If a
> - * signal arrives while waiting for the lock then this function
> - * returns -EINTR.
> + * Lock the mutex like mutex_lock().  If a signal is delivered while the
> + * process is sleeping, this function will return without acquiring the
> + * mutex.
>   *
> - * This function is similar to (but not equivalent to) down_interruptible().
> + * Context: Process context.
> + * Return: 0 if the lock was successfully acquired or %-EINTR if a
> + * signal arrived.
>   */
>  int __sched mutex_lock_interruptible(struct mutex *lock)
>  {
> @@ -1104,6 +1105,18 @@ int __sched mutex_lock_interruptible(struct mutex *lock)
>  
>  EXPORT_SYMBOL(mutex_lock_interruptible);
>  
> +/**
> + * mutex_lock_killable() - Acquire the mutex, interruptible by fatal signals.

Shouldn't we clarify that fatal signals are SIGKILL only?

> + * @lock: The mutex to be acquired.
> + *
> + * Lock the mutex like mutex_lock().  If a signal which will be fatal to
> + * the current process is delivered while the process is sleeping, this
> + * function will return without acquiring the mutex.
> + *
> + * Context: Process context.
> + * Return: 0 if the lock was successfully acquired or %-EINTR if a
> + * fatal signal arrived.
> + */
>  int __sched mutex_lock_killable(struct mutex *lock)
>  {
>  	might_sleep();
> @@ -1115,6 +1128,16 @@ int __sched mutex_lock_killable(struct mutex *lock)
>  }
>  EXPORT_SYMBOL(mutex_lock_killable);
>  
> +/**
> + * mutex_lock_io() - Acquire the mutex and mark the process as waiting for I/O
> + * @lock: The mutex to be acquired.
> + *
> + * Lock the mutex like mutex_lock().  While the task is waiting for this
> + * mutex, it will be accounted as being in the IO wait state by the
> + * scheduler.
> + *
> + * Context: Process context.
> + */
>  void __sched mutex_lock_io(struct mutex *lock)
>  {
>  	int token;
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox