The Linux Kernel Mailing List

The Linux Kernel Mailing List
 help / color / mirror / Atom feed

* [RFC][PATCH 2/2] PM / Domains: Add default power off governor function (v2)
From: Rafael J. Wysocki @ 2011-10-21 23:11 UTC (permalink / raw)
  To: Linux PM list; +Cc: LKML, Linux-sh list, Jean Pihet, Magnus Damm
In-Reply-To: <201110220109.52991.rjw@sisk.pl>

From: Rafael J. Wysocki <rjw@sisk.pl>

Add a function deciding whether or not a given PM domain should
be powered off on the basis of that domain's devices' PM QoS
constraints.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 drivers/base/power/domain_governor.c |   96 +++++++++++++++++++++++++++++++++++
 include/linux/pm_domain.h            |    7 ++
 2 files changed, 103 insertions(+)

Index: linux/include/linux/pm_domain.h
===================================================================
--- linux.orig/include/linux/pm_domain.h
+++ linux/include/linux/pm_domain.h
@@ -49,6 +49,10 @@ struct generic_pm_domain {
 	int (*start_device)(struct device *dev);
 	int (*stop_device)(struct device *dev);
 	bool (*active_wakeup)(struct device *dev);
+	ktime_t power_off_latency;
+	ktime_t power_on_latency;
+	s64 break_even_ns;
+	s64 min_delta_ns;
 };
 
 static inline struct generic_pm_domain *pd_to_genpd(struct dev_pm_domain *pd)
@@ -64,6 +68,9 @@ struct gpd_link {
 };
 
 struct gpd_gov_dev_data {
+	ktime_t start_latency;
+	ktime_t suspend_latency;
+	ktime_t resume_latency;
 	s64 break_even_ns;
 };
 
Index: linux/drivers/base/power/domain_governor.c
===================================================================
--- linux.orig/drivers/base/power/domain_governor.c
+++ linux/drivers/base/power/domain_governor.c
@@ -35,6 +35,102 @@ bool default_stop_ok(struct device *dev)
 	return constraint_ns > gov_data->break_even_ns;
 }
 
+/* This routine must be executed under the PM domain's lock. */
+static bool default_power_down_ok(struct dev_pm_domain *pd)
+{
+	struct generic_pm_domain *genpd = pd_to_genpd(pd);
+	struct gpd_link *link;
+	struct pm_domain_data *pdd;
+	ktime_t off_time, on_time;
+	s64 delta_ns, min_delta_ns;
+
+	on_time = genpd->power_on_latency;
+	/* Check if slave domains can be off for enough time. */
+	delta_ns = ktime_to_ns(ktime_add(genpd->power_off_latency, on_time));
+	min_delta_ns = 0;
+	/* All slave domains have been powered off at this point. */
+	list_for_each_entry(link, &genpd->master_links, master_node) {
+		if (delta_ns > link->slave->min_delta_ns)
+			return false;
+
+		delta_ns = link->slave->min_delta_ns - delta_ns;
+		if (delta_ns < min_delta_ns)
+			min_delta_ns = delta_ns;
+	}
+
+	genpd->min_delta_ns = min_delta_ns;
+
+	/* Compute the total time needed to power off the domain. */
+	off_time = ktime_set(0, 0);
+	/* All devices have been stopped at this point. */
+	list_for_each_entry(pdd, &genpd->dev_list, list_node) {
+		struct gpd_gov_dev_data *gov_data;
+
+		if (!pdd->dev->driver)
+			continue;
+
+		gov_data = to_gpd_data(pdd)->gov_data;
+		if (!gov_data)
+			continue;
+
+		off_time = ktime_add(off_time, gov_data->suspend_latency);
+	}
+	off_time = ktime_add(off_time, genpd->power_off_latency);
+
+	/*
+	 * For each device in the domain compute the difference between the
+	 * QoS value and the total time required to bring the device back
+	 * assuming that the domain will be powered off and compute the minimum
+	 * of those.
+	 */
+	min_delta_ns = 0;
+	on_time = ktime_add(on_time, off_time);
+	list_for_each_entry(pdd, &genpd->dev_list, list_node) {
+		struct gpd_gov_dev_data *gov_data;
+		struct device *dev = pdd->dev;
+		ktime_t dev_up_time;
+		s32 constraint;
+		s64 constraint_ns;
+
+		if (!dev->driver)
+			continue;
+
+		gov_data = to_gpd_data(pdd)->gov_data;
+		if (gov_data) {
+			dev_up_time = ktime_add(on_time,
+						gov_data->resume_latency);
+			dev_up_time = ktime_add(dev_up_time,
+						gov_data->start_latency);
+		} else {
+			dev_up_time = on_time;
+		}
+
+		constraint = dev_pm_qos_read_value(dev);
+		if (constraint < 0)
+			return false;
+		else if (constraint == 0) /* 0 means "don't care" */
+			continue;
+
+		constraint_ns = constraint;
+		constraint_ns *= NSEC_PER_USEC;
+		delta_ns = constraint_ns - ktime_to_ns(dev_up_time);
+		if (min_delta_ns > delta_ns)
+			min_delta_ns = delta_ns;
+	}
+
+	/* Compare the computed delta with the break even value. */
+	if (min_delta_ns < genpd->break_even_ns)
+		return false;
+
+	/* Store the computed value for the masters to use. */
+	if (genpd->min_delta_ns > min_delta_ns)
+		genpd->min_delta_ns = min_delta_ns;
+
+	/* The domain can be powered off. */
+	return true;
+}
+
 struct dev_power_governor simple_qos_governor = {
 	.stop_ok = default_stop_ok,
+	.power_down_ok = default_power_down_ok,
 };


^ permalink raw reply

* [RFC][PATCH 1/2] PM / Domains: Add device stop governor function (v2)
From: Rafael J. Wysocki @ 2011-10-21 23:10 UTC (permalink / raw)
  To: Linux PM list; +Cc: LKML, Linux-sh list, Jean Pihet, Magnus Damm
In-Reply-To: <201110220109.52991.rjw@sisk.pl>

From: Rafael J. Wysocki <rjw@sisk.pl>

Add a function deciding whether or not devices should be
stopped in pm_genpd_runtime_suspend() depending on their
PM QoS values.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 arch/arm/mach-shmobile/pm-sh7372.c   |    4 ++-
 drivers/base/power/Makefile          |    2 -
 drivers/base/power/domain.c          |   29 ++++++++++++++++++----
 drivers/base/power/domain_governor.c |   40 +++++++++++++++++++++++++++++++
 include/linux/pm_domain.h            |   45 +++++++++++++++++++++++++++++++----
 5 files changed, 109 insertions(+), 11 deletions(-)

Index: linux/include/linux/pm_domain.h
===================================================================
--- linux.orig/include/linux/pm_domain.h
+++ linux/include/linux/pm_domain.h
@@ -21,6 +21,7 @@ enum gpd_status {
 
 struct dev_power_governor {
 	bool (*power_down_ok)(struct dev_pm_domain *domain);
+	bool (*stop_ok)(struct device *dev);
 };
 
 struct generic_pm_domain {
@@ -62,8 +63,13 @@ struct gpd_link {
 	struct list_head slave_node;
 };
 
+struct gpd_gov_dev_data {
+	s64 break_even_ns;
+};
+
 struct generic_pm_domain_data {
 	struct pm_domain_data base;
+	struct gpd_gov_dev_data *gov_data;
 	bool need_restore;
 };
 
@@ -73,8 +79,19 @@ static inline struct generic_pm_domain_d
 }
 
 #ifdef CONFIG_PM_GENERIC_DOMAINS
-extern int pm_genpd_add_device(struct generic_pm_domain *genpd,
-			       struct device *dev);
+extern struct dev_power_governor simple_qos_governor;
+
+extern struct generic_pm_domain *dev_to_genpd(struct device *dev);
+extern int __pm_genpd_add_device(struct generic_pm_domain *genpd,
+				 struct device *dev,
+				 struct gpd_gov_dev_data *gov_data);
+
+static inline int pm_genpd_add_device(struct generic_pm_domain *genpd,
+				      struct device *dev)
+{
+	return __pm_genpd_add_device(genpd, dev, NULL);
+}
+
 extern int pm_genpd_remove_device(struct generic_pm_domain *genpd,
 				  struct device *dev);
 extern int pm_genpd_add_subdomain(struct generic_pm_domain *genpd,
@@ -83,8 +100,23 @@ extern int pm_genpd_remove_subdomain(str
 				     struct generic_pm_domain *target);
 extern void pm_genpd_init(struct generic_pm_domain *genpd,
 			  struct dev_power_governor *gov, bool is_off);
+
 extern int pm_genpd_poweron(struct generic_pm_domain *genpd);
+
+extern bool default_stop_ok(struct device *dev);
+
 #else
+
+static inline struct generic_pm_domain *dev_to_genpd(struct device *dev)
+{
+	return ERR_PTR(-ENOSYS);
+}
+static inline int __pm_genpd_add_device(struct generic_pm_domain *genpd,
+					struct device *dev,
+					struct gpd_gov_dev_data *gov_data)
+{
+	return -ENOSYS;
+}
 static inline int pm_genpd_add_device(struct generic_pm_domain *genpd,
 				      struct device *dev)
 {
@@ -105,12 +137,17 @@ static inline int pm_genpd_remove_subdom
 {
 	return -ENOSYS;
 }
-static inline void pm_genpd_init(struct generic_pm_domain *genpd,
-				 struct dev_power_governor *gov, bool is_off) {}
+static inline void pm_genpd_init(struct generic_pm_domain *genpd, bool is_off)
+{
+}
 static inline int pm_genpd_poweron(struct generic_pm_domain *genpd)
 {
 	return -ENOSYS;
 }
+static inline bool default_stop_ok(struct device *dev)
+{
+	return false;
+}
 #endif
 
 #ifdef CONFIG_PM_GENERIC_DOMAINS_RUNTIME
Index: linux/drivers/base/power/domain.c
===================================================================
--- linux.orig/drivers/base/power/domain.c
+++ linux/drivers/base/power/domain.c
@@ -21,7 +21,7 @@ static DEFINE_MUTEX(gpd_list_lock);
 
 #ifdef CONFIG_PM
 
-static struct generic_pm_domain *dev_to_genpd(struct device *dev)
+struct generic_pm_domain *dev_to_genpd(struct device *dev)
 {
 	if (IS_ERR_OR_NULL(dev->pm_domain))
 		return ERR_PTR(-EINVAL);
@@ -403,6 +403,22 @@ static void genpd_power_off_work_fn(stru
 }
 
 /**
+ * genpd_stop_dev - Stop a given device if that's beneficial.
+ * @genpd: PM domain the device belongs to.
+ * @dev: Device to stop.
+ */
+static int genpd_stop_dev(struct generic_pm_domain *genpd, struct device *dev)
+{
+	bool (*stop_ok)(struct device *dev);
+
+	stop_ok = genpd->gov ? genpd->gov->stop_ok : NULL;
+	if (stop_ok && !stop_ok(dev))
+		return -EBUSY;
+
+	return genpd->stop_device(dev);
+}
+
+/**
  * pm_genpd_runtime_suspend - Suspend a device belonging to I/O PM domain.
  * @dev: Device to suspend.
  *
@@ -423,7 +439,7 @@ static int pm_genpd_runtime_suspend(stru
 	might_sleep_if(!genpd->dev_irq_safe);
 
 	if (genpd->stop_device) {
-		int ret = genpd->stop_device(dev);
+		int ret = genpd_stop_dev(genpd, dev);
 		if (ret)
 			return ret;
 	}
@@ -495,7 +511,7 @@ static int pm_genpd_runtime_resume(struc
 		mutex_lock(&genpd->lock);
 	}
 	finish_wait(&genpd->status_wait_queue, &wait);
-	__pm_genpd_restore_device(dev->power.subsys_data->domain_data, genpd);
+	__pm_genpd_restore_device(dev_to_psd(dev)->domain_data, genpd);
 	genpd->resume_count--;
 	genpd_set_active(genpd);
 	wake_up_all(&genpd->status_wait_queue);
@@ -1076,11 +1092,13 @@ static void pm_genpd_complete(struct dev
 #endif /* CONFIG_PM_SLEEP */
 
 /**
- * pm_genpd_add_device - Add a device to an I/O PM domain.
+ * __pm_genpd_add_device - Add a device to an I/O PM domain.
  * @genpd: PM domain to add the device to.
  * @dev: Device to be added.
+ * @gov_data: Set of PM QoS parameters to attach to the device.
  */
-int pm_genpd_add_device(struct generic_pm_domain *genpd, struct device *dev)
+int __pm_genpd_add_device(struct generic_pm_domain *genpd, struct device *dev,
+			  struct gpd_gov_dev_data *gov_data)
 {
 	struct generic_pm_domain_data *gpd_data;
 	struct pm_domain_data *pdd;
@@ -1123,6 +1141,7 @@ int pm_genpd_add_device(struct generic_p
 	gpd_data->base.dev = dev;
 	gpd_data->need_restore = false;
 	list_add_tail(&gpd_data->base.list_node, &genpd->dev_list);
+	gpd_data->gov_data = gov_data;
 
  out:
 	genpd_release_lock(genpd);
Index: linux/drivers/base/power/Makefile
===================================================================
--- linux.orig/drivers/base/power/Makefile
+++ linux/drivers/base/power/Makefile
@@ -3,7 +3,7 @@ obj-$(CONFIG_PM_SLEEP)	+= main.o wakeup.
 obj-$(CONFIG_PM_RUNTIME)	+= runtime.o
 obj-$(CONFIG_PM_TRACE_RTC)	+= trace.o
 obj-$(CONFIG_PM_OPP)	+= opp.o
-obj-$(CONFIG_PM_GENERIC_DOMAINS)	+=  domain.o
+obj-$(CONFIG_PM_GENERIC_DOMAINS)	+=  domain.o domain_governor.o
 obj-$(CONFIG_HAVE_CLK)	+= clock_ops.o
 
 ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG
Index: linux/drivers/base/power/domain_governor.c
===================================================================
--- /dev/null
+++ linux/drivers/base/power/domain_governor.c
@@ -0,0 +1,40 @@
+/*
+ * drivers/base/power/domain_governor.c - Governors for device PM domains.
+ *
+ * Copyright (C) 2011 Rafael J. Wysocki <rjw@sisk.pl>, Renesas Electronics Corp.
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/pm_domain.h>
+#include <linux/pm_qos.h>
+
+bool default_stop_ok(struct device *dev)
+{
+	struct gpd_gov_dev_data *gov_data;
+	s64 constraint_ns;
+	s32 constraint;
+
+	dev_dbg(dev, "%s()\n", __func__);
+
+	gov_data = to_gpd_data(dev_to_psd(dev)->domain_data)->gov_data;
+	if (!gov_data)
+		return true;
+
+	constraint = dev_pm_qos_read_value(dev);
+	if (constraint < 0)
+		return false;
+	else if (constraint == 0) /* 0 means "don't care" */
+		return true;
+
+	constraint_ns = constraint;
+	constraint_ns *= NSEC_PER_USEC;
+
+	return constraint_ns > gov_data->break_even_ns;
+}
+
+struct dev_power_governor simple_qos_governor = {
+	.stop_ok = default_stop_ok,
+};
Index: linux/arch/arm/mach-shmobile/pm-sh7372.c
===================================================================
--- linux.orig/arch/arm/mach-shmobile/pm-sh7372.c
+++ linux/arch/arm/mach-shmobile/pm-sh7372.c
@@ -161,13 +161,15 @@ static bool sh7372_power_down_forbidden(
 
 struct dev_power_governor sh7372_always_on_gov = {
 	.power_down_ok = sh7372_power_down_forbidden,
+	.stop_ok = default_stop_ok,
 };
 
 void sh7372_init_pm_domain(struct sh7372_pm_domain *sh7372_pd)
 {
 	struct generic_pm_domain *genpd = &sh7372_pd->genpd;
+	struct dev_power_governor *gov = sh7372_pd->gov;
 
-	pm_genpd_init(genpd, sh7372_pd->gov, false);
+	pm_genpd_init(genpd, gov ? : &simple_qos_governor, false);
 	genpd->stop_device = pm_clk_suspend;
 	genpd->start_device = pm_clk_resume;
 	genpd->dev_irq_safe = true;


^ permalink raw reply

* [RFC][PATCH 0/2] PM: Generic PM domains and device PM QoS (v2)
From: Rafael J. Wysocki @ 2011-10-21 23:09 UTC (permalink / raw)
  To: Linux PM list; +Cc: LKML, Linux-sh list, Jean Pihet, Magnus Damm

Hi,

This patchset is a refresh of patches introducing simple PM QoS governor
functions for PM domains.  The patches still haven't been tested properly,
although they should build.  The purpose of them is mainly to illustrate
how PM QoS may be used with PM domains to control device runtime PM.

Thanks,
Rafael

^ permalink raw reply

* Re: Linux as Bootloader
From: Geoff Levand @ 2011-10-21 23:07 UTC (permalink / raw)
  To: Christian Gmeiner; +Cc: linux-kernel
In-Reply-To: <CAH9NwWenTowswTCrrosA09D=LNF7Ej-JGNQjVxT20puA_KQ_XQ@mail.gmail.com>

Hi Christian,

On 09/16/2011 04:02 AM, Christian Gmeiner wrote:
> 2011/9/11 Geoff Levand <geoff@infradead.org>:
>> On 09/05/2011 01:24 AM, Christian Gmeiner wrote:
>>> I am using the linux kernel in combination with busybox and a separate
>>> bootloader app to
>>> startup a x86 based embedded device.
>>
>> Just FYI, Petitboot, a kexec base bootloader, does the same
>> thing as what you have done.
>>
>> I'm now working on adding a twin GUI program and an x86 OpenWRT build.
> 
> Petitboot sounds nice, but kernel.org is down. Is there an other way to get
> access to it via git?

Just FYI, my petitboot repo at kernel.org is back:

  http://git.kernel.org/?p=linux/kernel/git/geoff/petitboot.git

Also, I put a preliminary version of new Web pages I'm
working on here:

  http://bombadil.infradead.org/~geoff/petitboot/

-Geoff

  


^ permalink raw reply

* Re: kernel 3.0: BUG: soft lockup: find_get_pages+0x51/0x110
From: Andrea Arcangeli @ 2011-10-21 22:50 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Nai Xia, Hugh Dickins, Pawel Sikora, Andrew Morton, linux-mm,
	jpiszcz, arekm, linux-kernel
In-Reply-To: <20111021174120.GJ608@redhat.com>

On Fri, Oct 21, 2011 at 07:41:20PM +0200, Andrea Arcangeli wrote:
> We have two options:
> 
> 1) we remove the vma_merge call from copy_vma and we do the vma_merge
> manually after mremap succeed (so then we're as safe as fork is and we
> relay on the ordering). No locks but we'll just do 1 more allocation
> for one addition temporary vma that will be removed after mremap
> completed.
> 
> 2) Hugh's original fix.

3) put the src vma at the tail if vma_merge succeeds and the src vma
and dst vma aren't the same

I tried to implement this but I'm still wondering about the safety of
this with concurrent processes all calling mremap at the same time on
the same anon_vma same_anon_vma list, the reasoning I think it may be
safe is in the comment. I run a few mremap with my benchmark where the
THP aware mremap in -mm gets a x10 boost and moves 5G and it didn't
crash but that's about it and not conclusive, if you review please
comment...

I've to pack luggage and prepare to fly to KS tomorrow so I may not be
responsive in the next few days.

===
>From f2898ff06b5a9a14b9d957c7696137f42a2438e9 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Sat, 22 Oct 2011 00:11:49 +0200
Subject: [PATCH] mremap: enforce rmap src/dst vma ordering in case of
 vma_merge succeeding in copy_vma

migrate was doing a rmap_walk with speculative lock-less access on
pagetables. That could lead it to not serialize properly against
mremap PT locks. But a second problem remains in the order of vmas in
the same_anon_vma list used by the rmap_walk.

If vma_merge would succeed in copy_vma, the src vma could be placed
after the dst vma in the same_anon_vma list. That could still lead
migrate to miss some pte.

This patch adds a anon_vma_order_tail() function to force the dst vma
at the end of the list before mremap starts to solve the problem.

If the mremap is very large and there are a lots of parents or childs
sharing the anon_vma root lock, this should still scale better than
taking the anon_vma root lock around every pte copy practically for
the whole duration of mremap.
---
 include/linux/rmap.h |    1 +
 mm/mmap.c            |    8 ++++++++
 mm/rmap.c            |   43 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 52 insertions(+), 0 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 2148b12..45eb098 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -120,6 +120,7 @@ void anon_vma_init(void);	/* create anon_vma_cachep */
 int  anon_vma_prepare(struct vm_area_struct *);
 void unlink_anon_vmas(struct vm_area_struct *);
 int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
+void anon_vma_order_tail(struct vm_area_struct *);
 int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
 void __anon_vma_link(struct vm_area_struct *);
 
diff --git a/mm/mmap.c b/mm/mmap.c
index a65efd4..a5858dc 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2339,7 +2339,15 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 		 */
 		if (vma_start >= new_vma->vm_start &&
 		    vma_start < new_vma->vm_end)
+			/*
+			 * No need to call anon_vma_order_tail() in
+			 * this case because the same PT lock will
+			 * serialize the rmap_walk against both src
+			 * and dst vmas.
+			 */
 			*vmap = new_vma;
+		else
+			anon_vma_order_tail(new_vma);
 	} else {
 		new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 		if (new_vma) {
diff --git a/mm/rmap.c b/mm/rmap.c
index 8005080..170cece 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -272,6 +272,49 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
 }
 
 /*
+ * Some rmap walk that needs to find all ptes/hugepmds without false
+ * negatives (like migrate and split_huge_page) running concurrent
+ * with operations that copy or move pagetables (like mremap() and
+ * fork()) to be safe depends the anon_vma "same_anon_vma" list to be
+ * in a certain order: the dst_vma must be placed after the src_vma in
+ * the list. This is always guaranteed by fork() but mremap() needs to
+ * call this function to enforce it in case the dst_vma isn't newly
+ * allocated and chained with the anon_vma_clone() function but just
+ * an extension of a pre-existing vma through vma_merge.
+ *
+ * NOTE: the same_anon_vma list can still changed by other processes
+ * while mremap runs because mremap doesn't hold the anon_vma mutex to
+ * prevent modifications to the list while it runs. All we need to
+ * enforce is that the relative order of this process vmas isn't
+ * changing (we don't care about other vmas order). Each vma
+ * corresponds to an anon_vma_chain structure so there's no risk that
+ * other processes calling anon_vma_order_tail() and changing the
+ * same_anon_vma list under mremap() will screw with the relative
+ * order of this process vmas in the list, because we won't alter the
+ * order of any vma that isn't belonging to this process. And there
+ * can't be another anon_vma_order_tail running concurrently with
+ * mremap() coming from this process because we hold the mmap_sem for
+ * the whole mremap(). fork() ordering dependency also shouldn't be
+ * affected because we only care that the parent vmas are placed in
+ * the list before the child vmas and anon_vma_order_tail won't reorder
+ * vmas from either the fork parent or child.
+ */
+void anon_vma_order_tail(struct vm_area_struct *dst)
+{
+	struct anon_vma_chain *pavc;
+	struct anon_vma *root = NULL;
+
+	list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) {
+		struct anon_vma *anon_vma = pavc->anon_vma;
+		VM_BUG_ON(pavc->vma != dst);
+		root = lock_anon_vma_root(root, anon_vma);
+		list_del(&pavc->same_anon_vma);
+		list_add_tail(&pavc->same_anon_vma, &anon_vma->head);
+	}
+	unlock_anon_vma_root(root);
+}
+
+/*
  * Attach vma to its own anon_vma, as well as to the anon_vmas that
  * the corresponding VMA in the parent process is attached to.
  * Returns 0 on success, non-zero on failure.

^ permalink raw reply related

* [PATCH] PCI / ACPI: Make acpiphp ignore root bridges using PCIe native hotplug
From: Rafael J. Wysocki @ 2011-10-21 22:43 UTC (permalink / raw)
  To: Linux PCI
  Cc: Jesse Barnes, LKML, Linux PM list, ACPI Devel Mailing List,
	Matthew Garrett

From: Rafael J. Wysocki <rjw@sisk.pl>

If the kernel has requested control of the PCIe native hotplug
feature for a given root complex, the acpiphp driver should not try
to handle that root complex and it should leave it to pciehp.
Failing to do so causes problems to happen if acpiphp is loaded
before pciehp on such systems.

To address this issue make find_root_bridges() ignore PCIe root
complexes with PCIe native hotplug enabled and make add_bridge()
return error code if PCIe native hotplug is enabled for the given
root port.  This causes acpiphp to refuse to load if PCIe native
hotplug is enabled for all complexes and to refuse binding to
the root complexes with PCIe native hotplug is enabled.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 drivers/pci/hotplug/acpiphp_glue.c |   29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

Index: linux/drivers/pci/hotplug/acpiphp_glue.c
===================================================================
--- linux.orig/drivers/pci/hotplug/acpiphp_glue.c
+++ linux/drivers/pci/hotplug/acpiphp_glue.c
@@ -458,8 +458,17 @@ static int add_bridge(acpi_handle handle
 {
 	acpi_status status;
 	unsigned long long tmp;
+	struct acpi_pci_root *root;
 	acpi_handle dummy_handle;
 
+	/*
+	 * We shouldn't use this bridge if PCIe native hotplug control has been
+	 * granted by the BIOS for it.
+	 */
+	root = acpi_pci_find_root(handle);
+	if (root && (root->osc_control_set & OSC_PCI_EXPRESS_NATIVE_HP_CONTROL))
+		return -ENODEV;
+
 	/* if the bridge doesn't have _STA, we assume it is always there */
 	status = acpi_get_handle(handle, "_STA", &dummy_handle);
 	if (ACPI_SUCCESS(status)) {
@@ -1297,13 +1306,23 @@ static void handle_hotplug_event_func(ac
 static acpi_status
 find_root_bridges(acpi_handle handle, u32 lvl, void *context, void **rv)
 {
+	struct acpi_pci_root *root;
 	int *count = (int *)context;
 
-	if (acpi_is_root_bridge(handle)) {
-		acpi_install_notify_handler(handle, ACPI_SYSTEM_NOTIFY,
-				handle_hotplug_event_bridge, NULL);
-			(*count)++;
-	}
+	if (!acpi_is_root_bridge(handle))
+		return AE_OK;
+
+	root = acpi_pci_find_root(handle);
+	if (!root)
+		return AE_OK;
+
+	if (root->osc_control_set & OSC_PCI_EXPRESS_NATIVE_HP_CONTROL)
+		return AE_OK;
+
+	(*count)++;
+	acpi_install_notify_handler(handle, ACPI_SYSTEM_NOTIFY,
+				    handle_hotplug_event_bridge, NULL);
+
 	return AE_OK ;
 }
 

^ permalink raw reply

* Re: [PATCH 0/4] iommu: iommu_ops group interface
From: Woodhouse, David @ 2011-10-21 22:39 UTC (permalink / raw)
  To: Alex Williamson
  Cc: joerg.roedel@amd.com, iommu@lists.linux-foundation.org,
	linux-kernel@vger.kernel.org, chrisw@redhat.com, agraf@suse.de,
	dwg@au1.ibm.com, scottwood@freescale.com, B08248@freescale.com,
	benh@kernel.crashing.org
In-Reply-To: <1319231795.19108.22.camel@bling.home>

[-- Attachment #1: Type: text/plain, Size: 707 bytes --]

On Fri, 2011-10-21 at 15:16 -0600, Alex Williamson wrote:
> 
> 
> drivers/pci/quirks.c: int pci_iommu_group_mf_quirk(struct pci_dev *)
> 
> That's used just like iommu_group_mf to always blacklist specific
> devices.  Let me know VID/DID if you have them.  Thanks, 

Don't remember the ID offhand, but there are reports in Red Hat bugzilla
of Ricoh multifunction devices on some (HP, I think) laptops which cause
streams of DMA errors unless the sdhci driver is blacklisted...

-- 
                   Sent with MeeGo's ActiveSync support.

David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation



[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 4370 bytes --]

^ permalink raw reply

* Re: [PATCH 0/4] iommu: iommu_ops group interface
From: Alex Williamson @ 2011-10-21 22:34 UTC (permalink / raw)
  To: Woodhouse, David
  Cc: joerg.roedel@amd.com, iommu@lists.linux-foundation.org,
	linux-kernel@vger.kernel.org, chrisw@redhat.com, agraf@suse.de,
	dwg@au1.ibm.com, scottwood@freescale.com, B08248@freescale.com,
	benh@kernel.crashing.org
In-Reply-To: <1319229248.19448.6.camel@shinybook.infradead.org>

On Fri, 2011-10-21 at 15:16 -0600, Alex Williamson wrote:
> On Fri, 2011-10-21 at 20:34 +0000, Woodhouse, David wrote:
> > On Fri, 2011-10-21 at 13:55 -0600, Alex Williamson wrote:
> > > IOMMUs can't always distiguish transactions from each individual
> > > device in a system.  Sometimes this is by design (such as powerpc
> > > partitionable endpoints), other times by topology (PCIe-to-PCI
> > > bridges masking downstream devices).  We call these sets of
> > > indistinguishable devices "groups". 
> > 
> > Other times it's because a multi-function PCIe device is broken and does
> > all its DMA from function zero.... like some Ricoh devices seen in
> > laptops. Can you handle quirks to "group" those too?
> 
> I could add:
> 
> drivers/pci/quirks.c: int pci_iommu_group_mf_quirk(struct pci_dev *)
> 
> That's used just like iommu_group_mf to always blacklist specific
> devices.  Let me know VID/DID if you have them.  Thanks,

I guess just a bit and a quirk is easiest, something like below
(quirking an 82576 just as an example).  Thanks,

Alex

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 6f75536..de158d9 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -2782,11 +2782,11 @@ static int amd_iommu_device_group(struct device *dev, unsigned int *groupid)
 	if (!dev_data)
 		return -ENODEV;
 
-	if (pdev->is_virtfn || !iommu_group_mf)
-		devid = dev_data->devid;
-	else
+	if (pdev->group_mf || (iommu_group_mf && !pdev->is_virtfn))
 		devid = calc_devid(pdev->bus->number,
 				   PCI_DEVFN(PCI_SLOT(pdev->devfn), 0));
+	else
+		devid = dev_data->devid;
 
 	*groupid = amd_iommu_alias_table[devid];
 
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index a5e1a2f..fba0256 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -3939,7 +3939,7 @@ static int intel_iommu_device_group(struct device *dev, unsigned int *groupid)
 		}
 	}
 
-	if (!pdev->is_virtfn && iommu_group_mf)
+	if (pdev->group_mf || (iommu_group_mf && !pdev->is_virtfn))
 		id.pci.devfn = PCI_DEVFN(PCI_SLOT(id.pci.devfn), 0);
 
 	*groupid = id.group;
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 1196f61..4764645 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -3009,3 +3009,9 @@ int pci_dev_specific_reset(struct pci_dev *dev, int probe)
 
 	return -ENOTTY;
 }
+
+static void __devinit quirk_iommu_group_mf(struct pci_dev* dev)
+{
+	dev->group_mf = 1;
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x10c9, quirk_iommu_group_mf);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8c230cb..d763027 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -322,6 +322,7 @@ struct pci_dev {
 	unsigned int    is_hotplug_bridge:1;
 	unsigned int    __aer_firmware_first_valid:1;
 	unsigned int	__aer_firmware_first:1;
+	unsigned int	group_mf:1;	/* per device iommu=group_mf */
 	pci_dev_flags_t dev_flags;
 	atomic_t	enable_cnt;	/* pci_enable_device has been called */
 




^ permalink raw reply related

* Re: lsusd - The Linux SUSpend Daemon
From: NeilBrown @ 2011-10-21 22:34 UTC (permalink / raw)
  To: Alan Stern
  Cc: John Stultz, Rafael J. Wysocki, mark gross, Linux PM list, LKML
In-Reply-To: <Pine.LNX.4.44L0.1110211139420.2131-100000@iolanthe.rowland.org>

[-- Attachment #1: Type: text/plain, Size: 9586 bytes --]

On Fri, 21 Oct 2011 12:07:07 -0400 (EDT) Alan Stern
<stern@rowland.harvard.edu> wrote:

> On Fri, 21 Oct 2011, NeilBrown wrote:
> 
> > Hi,
> > 
> > I wasn't going to do this... but then I did.  I think that sometimes coding is
> > a bit like chocolate.
> 
> Getting started is always a big hurdle, for me anyway.
> 
> > At:
> >     git://neil.brown.name/lsusd
> > or
> >     http://neil.brown.name/git/lsusd
> > 
> > you can find a bunch of proof-of-concept sample code that implements a
> > "Linux SUSpend Daemon" with client support library and test programs.
> > 
> > I haven't actually tested it as root and had it actually suspend and resume
> > and definitely haven't had it even close to a race condition, but the
> > various bits seem to work with each other properly when I run them under
> > strace and watch.
> > 
> > It didn't turn out quite the way I imagined, but then cold harsh reality has
> > a way of destroying our dreams, doesn't it :-)
> > 
> > 
> > Below is the README file.  Comment welcome as always.
> > I'm happy for patches too, but I'm equally happy for someone to re-write it
> > completely and make something really useful and maintainable.
> > 
> > NeilBrown
> > 
> > -----------------------------------------------------------------
> > 
> > This directory contains a prototype proof-of-concept system
> > for managing suspend in Linux.
> > Thus the Linux SUSpend Daemon.
> > 
> > It contains:
> > 
> >  lsusd:
> 
> This name is no good; it's too much like "lsusb".  In fact, anything 
> starting with "ls" is going to be confusing.  Not that I have any 
> better suggestions at the moment...
> 
> >     The main daemon.  It is written to run a tight loop and blocks as
> >      required.  It obeys the wakeup_count protocol to get race-free
> >      suspend and allows clients to register to find out about
> >      suspend and to block it either briefly or longer term.
> >     It uses files in /var/run/suspend for all communication.
> 
> I'm not so keen on using files for communication.  At best, they are
> rather awkward for two-way messaging.  If you really want to use them,
> then at least put them on a non-backed filesystem, like something under
> /dev.

Isn't /var/run a tmpfs filesystem?  It should be.
Surely /run is, so in the new world order the files should probably go
there.   But that is just a detail.

I like files...  I particularly like 'flock' to block suspend.   The
rest.... whatever..
With files, you only need a context switch when there is real communication.
With sockets, every message sent must be read so there will be a context
switch.

Maybe we could do something with futexes...

> 
> >     File are:
> > 
> >       disabled:  This file always exists.  If any process holds a
> >         shared flock(), suspend will not happen.
> >       immediate:  If this file exists, lsusd will try to suspend whenever
> >         possible.
> >       request:  If this is created, then lsusd will try to suspend
> >         once, and will remove the file when suspend completes or aborts.
> >       watching:  This is normally empty.  Any process wanting to know
> >         about suspend should take a shared flock and check the file is
> >         still empty, and should watch for modification.
> >         When suspend is imminent, lsusd creates 'watching-next', writes
> >          a byte to 'watching' and waits for an exclusive lock on 'watching'.
> >         Clients should move their lock to 'watching-next' when ready for
> >         suspend.
> >         When suspend completes, another byte (or 2) is written to
> >         'watching', and 'watching-next' is renamed over it.  Clients can
> >         use either of these to know that resume has happened.
> > 
> >       watching-next: The file that will be 'watching' in the next awake cycle.
> > 
> >     lsusd does not try to be event-loop based because:
> >       - /sys/power/wakeup_count is not pollable.  This could probably be
> >         'fixed' but I want code to work with today's kernel.  It will probably
> 
> Why does this matter?

In my mind an event based program should never block.  Every action should be
non-blocking and only taken when 'poll' says it can.
Reading /sys/power/wakeup_count can be read non-blocking, but you cannot find
out when it is sensible to try to read it again.  So it doesn't fit.

> 
> >         only block 100msec at most, but that might be too long???
> 
> Too long for what?

For some other process to connect to some socket and have to wait for the
connection to be accepted.
(When reading from wakeup_count in the current code it will block for a
multiple of 100ms.  The multiplier might be 0 or 1, possibly more, though
that is unlikely).

> 
> >       - I cannot get an event notification when a lock is removed from a
> >         file. :-(  And I think locks are an excellent light-weight
> >         mechanism for blocking suspend.
> 
> Except for this one drawback.  Socket connections are superior in that 
> regard.

I'm very happy for someone else write an all-socket based daemon.
Or just use my two deamons together.

> 
> >   lsused:
> >       This is an event-loop based daemon that can therefore easily handle
> >       socket connections and client protocols which need prompt
> >       response.  It communicates with lsusd and provides extra
> >       services to client.
> > 
> >       lsused (which needs a better name) listens on the socket
> >             /var/run/suspend/registration
> >       A client may connect and send a list of file descriptors.
> 
> Including an empty list?

With current code an empty list will mean no callback ever so it would be
pointless.
It is probably this interfaces could be improved.  I just wanted something
that worked.

> 
> >       When a suspend is immanent, if any file descriptor is readable,
> 
> Or if no file descriptors were sent?

Not with current code, but that does fit the design we discussed previously.

> 
> >       lsused will send a 'S' message to the client and await an 'R' reply
> >       (S == suspend, R == ready).  When all replies are in, lsused will
> >       allow the suspend to complete.  When it does (or aborts), it will send
> >       'A' (awake) to those clients to which it sent 'S'.
> 
> But not to the client which failed to send an 'R'?

Every client must send an R before suspend can continue.  I don't currently
have an special handling for clients that misbehave.  I'm not even certain
that I correctly hand the case where the client dies and the socket closes.

> 
> >       This allows a client to get a chance to handle any wakeup events,
> >       but not to be woken unnecessarily on every suspend.
> 
> In practice, it may be best for clients that handle a large number of 
> wakeup events to avoid using the fd mechanism.  Clients that handle 
> only occasional wakeups may be better off using it.
> 
> You left out an important element: A client must be allowed to send
> 'A' at any time, indicating that it does not want to suspend now.  Of 
> course, this will work reliably only if the client uses the fd 
> mechanism.

I did leave that out because client can always use "suspend_block()" to get a
lock on the lockfile which will block suspend.
But I have no objections to it going in.

> 
> I'm not sure it's such a good idea to separate this from the main 
> daemon.  A crucial point of the protocol is that the daemon reads 
> /sys/power/wakeup_count before sending all the 'S' messages, and waits 
> for all the 'R' replies before writing wakeup_count.  The two-program 
> approach would make this difficult.

I think it already works correctly with the two programs so it doesn't seem
that difficult.
The second daemon is a client to the first, and a server to other clients.
It is a multiplexer if you like - talking one (file-based) protocol to the
central server and another (socket-based) protocol to an arbitrary number of
clients.

> 
> >    wakealarmd:
> >       This allows clients to register on the socket.
> >              /var/run/suspend/wakealarm
> >       They write a timestamp in seconds since epoch, and will receive
> >       a 'Now' message when that time arrives.
> >       Between the time the connection is made and the time a "seconds"
> >       number is written, suspend will be blocked.
> >       Also between the time that "Now" is sent and when the socket is
> >       closed, suspend is also blocked.
> 
> In theory, this could be integrated with the previous program.

True.  Keeping it separate just reduced my cognitive load during development,
and provided more sample code of what a client would look like.

I'm not even sure it is entirely race-free.  It uses a 2-second margin to
ensure there is no race between suspending and the alarm-clock wakeup, but it
isn't really close enough to the suspend call to be certain that any
particular amount of time is enough.
Unless we get a counted wakeup_source for the RTC alarm, the RTC handling
will really need to be in the main daemon immediately before the write to
'state'.

Thanks for your review.

My original plan was to have a single daemon with a main loop and a bunch of
loadable modules that provided different protocols to clients: simple-socket,
file-based, dbus, "suspend.d" script directory etc.   That might still be fun
but it won't be a priority for a while.

NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply

* Re: lsusd - The Linux SUSpend Daemon
From: NeilBrown @ 2011-10-21 22:09 UTC (permalink / raw)
  To: david
  Cc: Alan Stern, John Stultz, Rafael J. Wysocki, mark gross,
	Linux PM list, LKML
In-Reply-To: <alpine.DEB.2.02.1110211308130.13493@asgard.lang.hm>

[-- Attachment #1: Type: text/plain, Size: 1631 bytes --]

On Fri, 21 Oct 2011 13:10:10 -0700 (PDT) david@lang.hm wrote:

> On Fri, 21 Oct 2011, NeilBrown wrote:
> 
> > Hi,
> >
> > I wasn't going to do this... but then I did.  I think that sometimes coding is
> > a bit like chocolate.
> >
> > At:
> >    git://neil.brown.name/lsusd
> > or
> >    http://neil.brown.name/git/lsusd
> >
> > you can find a bunch of proof-of-concept sample code that implements a
> > "Linux SUSpend Daemon" with client support library and test programs.
> >
> > I haven't actually tested it as root and had it actually suspend and resume
> > and definitely haven't had it even close to a race condition, but the
> > various bits seem to work with each other properly when I run them under
> > strace and watch.
> >
> > It didn't turn out quite the way I imagined, but then cold harsh reality has
> > a way of destroying our dreams, doesn't it :-)
> >
> >
> > Below is the README file.  Comment welcome as always.
> > I'm happy for patches too, but I'm equally happy for someone to re-write it
> > completely and make something really useful and maintainable.
> 
> have you put any thought into the idea of extending this slightly to 
> handle the userspace wakelock interface to potentially allow this to run 
> android userspace on a stock kernel?
> 
> I realize that there are other things that would be needed as well, but 
> the wakelock interface is a biggie.
> 
> David Lang

I have certainly thought of someone else doing it :-)
I only have a high-level understanding of Android interfaces and don't really
want to go any deeper than that.

Thanks,
NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply

* Re: kernel 3.0: BUG: soft lockup: find_get_pages+0x51/0x110
From: Paweł Sikora @ 2011-10-21 21:36 UTC (permalink / raw)
  To: Nai Xia
  Cc: Hugh Dickins, arekm, Linus Torvalds, linux-mm, Mel Gorman,
	jpiszcz, linux-kernel, Andrew Morton, Andrea Arcangeli
In-Reply-To: <CAPQyPG4SE8DyzuqwG74sE2zuZbDgfDoGDir1xHC3zdED+k=qLA@mail.gmail.com>

On Friday 21 of October 2011 11:07:56 Nai Xia wrote:
> On Fri, Oct 21, 2011 at 4:07 PM, Pawel Sikora <pluto@agmk.net> wrote:
> > On Friday 21 of October 2011 14:22:37 Nai Xia wrote:
> >
> >> And as a side note. Since I notice that Pawel's workload may include OOM,
> >
> > my last tests on patched (3.0.4 + migrate.c fix + vserver) kernel produce full cpu load
> > on dual 8-cores opterons like on this htop screenshot -> http://pluto.agmk.net/kernel/screen1.png
> > afaics all userspace applications usualy don't use more than half of physical memory
> > and so called "cache" on htop bar doesn't reach the 100%.
> 
> OK，did you logged any OOM killing if there was some memory usage burst?
> But, well my above OOM reasoning is a direct short cut to imagined
> root cause of "adjacent VMAs which
> should have been merged but in fact not merged" case.
> Maybe there are other cases that can lead to this or maybe it's
> totally another bug....

i don't see any OOM killing with my conservative settings
(vm.overcommit_memory=2, vm.overcommit_ratio=100).

> But still I think if my reasoning is good, similar bad things will
> happen again some time in the future,
> even if it was not your case here...
> 
> >
> > the patched kernel with disabled CONFIG_TRANSPARENT_HUGEPAGE (new thing in 2.6.38)
> > died at night, so now i'm going to disable also CONFIG_COMPACTION/MIGRATION in next
> > steps and stress this machine again...
> 
> OK, it's smart to narrow down the range first....

disabling hugepage/compacting didn't help but disabling hugepage/compacting/migration keeps
opterons stable for ~9h so far. userspace uses ~40GB (from 64) ram, caches reach 100% on htop bar,
average load ~16. i wonder if it survive weekend...

^ permalink raw reply

* [PATCH 2/2] SELinux: Do not apply MMAP_ZERO check to PROT_NONE mappings
From: Roland McGrath @ 2011-10-21 21:39 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton
  Cc: James Morris, Eric Paris, Stephen Smalley, selinux, John Johansen,
	linux-security-module, linux-kernel
In-Reply-To: <20111021213916.914462C0A5@topped-with-meat.com>

An mmap with PROT_NONE is done specifically to ensure that an address
will fault.  So doing this on addresses below CONFIG_LSM_MMAP_MIN_ADDR
is not seeking a "dangerous" operation.  Conversely, it's an attempt
to ensure robustness in case CONFIG_LSM_MMAP_MIN_ADDR or vm.mmap_min_addr
is less restrictive than the user wants to be.

Since we might let a low mapping exist at all without a check, we
add another check to prevent mprotect from granting access to such
a mapping without passing an MMAP_ZERO security check.

Signed-off-by: Roland McGrath <roland@hack.frob.com>
---
 security/selinux/hooks.c |   17 ++++++++++++++++-
 1 files changed, 16 insertions(+), 1 deletions(-)

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 76e6f04..1e3657b 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3062,7 +3062,8 @@ static int selinux_file_mmap(struct file *file, unsigned long reqprot,
 	 * at bad behaviour/exploit that we always want to get the AVC, even
 	 * if DAC would have also denied the operation.
 	 */
-	if (addr < CONFIG_LSM_MMAP_MIN_ADDR) {
+	if (addr < CONFIG_LSM_MMAP_MIN_ADDR &&
+	    (addr_only || prot != PROT_NONE)) {
 		rc = avc_has_perm(sid, sid, SECCLASS_MEMPROTECT,
 				  MEMPROTECT__MMAP_ZERO, NULL);
 		if (rc)
@@ -3091,6 +3092,20 @@ static int selinux_file_mprotect(struct vm_area_struct *vma,
 	if (selinux_checkreqprot)
 		prot = reqprot;
 
+	/*
+	 * Notice that we are intentionally putting the SELinux check before
+	 * the secondary cap_file_mprotect check.  This is such a likely attempt
+	 * at bad behaviour/exploit that we always want to get the AVC, even
+	 * if DAC would have also denied the operation.
+	 */
+	if (addr < CONFIG_LSM_MMAP_MIN_ADDR && prot != PROT_NONE) {
+		u32 sid = current_sid();
+		rc = avc_has_perm(sid, sid, SECCLASS_MEMPROTECT,
+				  MEMPROTECT__MMAP_ZERO, NULL);
+		if (rc)
+			return rc;
+	}
+
 	/* do DAC check on address space usage */
 	rc = cap_file_mprotect(vma, reqprot, prot)
 	if (rc)

^ permalink raw reply related

* [PATCH 1/2] LSM: Do not apply mmap_min_addr check to PROT_NONE mappings
From: Roland McGrath @ 2011-10-21 21:39 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton
  Cc: James Morris, Eric Paris, Stephen Smalley, selinux, John Johansen,
	linux-security-module, linux-kernel

An mmap with PROT_NONE is done specifically to ensure that an address will
fault.  So doing this on addresses below mmap_min_addr is not seeking a
"dangerous" operation.  Conversely, it's an attempt to ensure robustness in
case mmap_min_addr is less restrictive than the user wants to be.

Since we might let a low mapping exist at all without a check, we add
another check to prevent mprotect from granting access to such a mapping.

Signed-off-by: Roland McGrath <roland@hack.frob.com>
---
 include/linux/security.h |    5 ++-
 security/apparmor/lsm.c  |    7 ++++++
 security/capability.c    |    6 -----
 security/commoncap.c     |   48 ++++++++++++++++++++++++++++++++++++---------
 security/selinux/hooks.c |    8 ++++++-
 5 files changed, 55 insertions(+), 19 deletions(-)

diff --git a/include/linux/security.h b/include/linux/security.h
index ebd2a53..aba8071 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -73,6 +73,8 @@ extern int cap_inode_killpriv(struct dentry *dentry);
 extern int cap_file_mmap(struct file *file, unsigned long reqprot,
 			 unsigned long prot, unsigned long flags,
 			 unsigned long addr, unsigned long addr_only);
+extern int cap_file_mprotect(struct vm_area_struct *vma,
+			     unsigned long reqprot, unsigned long prot);
 extern int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags);
 extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 			  unsigned long arg4, unsigned long arg5);
@@ -2213,7 +2215,7 @@ static inline int security_file_mprotect(struct vm_area_struct *vma,
 					 unsigned long reqprot,
 					 unsigned long prot)
 {
-	return 0;
+	return cap_file_mprotect(vma, reqprot, prot);
 }
 
 static inline int security_file_lock(struct file *file, unsigned int cmd)
@@ -3044,4 +3046,3 @@ static inline void free_secdata(void *secdata)
 #endif /* CONFIG_SECURITY */
 
 #endif /* ! __LINUX_SECURITY_H */
-
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index 3783202..d2a9693 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -508,6 +508,13 @@ static int apparmor_file_mmap(struct file *file, unsigned long reqprot,
 static int apparmor_file_mprotect(struct vm_area_struct *vma,
 				  unsigned long reqprot, unsigned long prot)
 {
+        int rc;
+
+	/* do DAC check */
+	rc = cap_file_mprotect(vma, reqprot, prot);
+	if (rc)
+		return rc;
+
 	return common_mmap(OP_FMPROT, vma->vm_file, prot,
 			   !(vma->vm_flags & VM_SHARED) ? MAP_PRIVATE : 0);
 }
diff --git a/security/capability.c b/security/capability.c
index 2984ea4..3c60f07 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -316,12 +316,6 @@ static int cap_file_ioctl(struct file *file, unsigned int command,
 	return 0;
 }
 
-static int cap_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
-			     unsigned long prot)
-{
-	return 0;
-}
-
 static int cap_file_lock(struct file *file, unsigned int cmd)
 {
 	return 0;
diff --git a/security/commoncap.c b/security/commoncap.c
index a93b3b7..0d4685a 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -942,11 +942,26 @@ int cap_vm_enough_memory(struct mm_struct *mm, long pages)
 	return __vm_enough_memory(mm, pages, cap_sys_admin);
 }
 
+static int cap_mmap_min_addr(unsigned long addr)
+{
+	int ret = 0;
+
+	if (addr < dac_mmap_min_addr) {
+		ret = cap_capable(current, current_cred(), &init_user_ns,
+				  CAP_SYS_RAWIO, SECURITY_CAP_AUDIT);
+		/* set PF_SUPERPRIV if it turns out we allow the low mmap */
+		if (ret == 0)
+			current->flags |= PF_SUPERPRIV;
+	}
+
+	return ret;
+}
+
 /*
  * cap_file_mmap - check if able to map given addr
  * @file: unused
  * @reqprot: unused
- * @prot: unused
+ * @prot: protection being requested
  * @flags: unused
  * @addr: address attempting to be mapped
  * @addr_only: unused
@@ -960,14 +975,27 @@ int cap_file_mmap(struct file *file, unsigned long reqprot,
 		  unsigned long prot, unsigned long flags,
 		  unsigned long addr, unsigned long addr_only)
 {
-	int ret = 0;
+	if (addr_only || prot != PROT_NONE)
+		return cap_mmap_min_addr(addr);
+	return 0;
+}
 
-	if (addr < dac_mmap_min_addr) {
-		ret = cap_capable(current, current_cred(), &init_user_ns, CAP_SYS_RAWIO,
-				  SECURITY_CAP_AUDIT);
-		/* set PF_SUPERPRIV if it turns out we allow the low mmap */
-		if (ret == 0)
-			current->flags |= PF_SUPERPRIV;
-	}
-	return ret;
+/*
+ * cap_file_mprotect - check if able to mprotect given addr
+ * @vma: entry being changed
+ * @reqprot: unused
+ * @prot: protection being changed to
+ *
+ * If the process is attempting to change memory below dac_mmap_min_addr to
+ * anything but PROT_NONE, they need CAP_SYS_RAWIO.  The other parameters
+ * to this function are unused by the capability security module.  Returns
+ * 0 if this mapping should be allowed -EPERM if not.
+ */
+int cap_file_mprotect(struct vm_area_struct *vma,
+		      unsigned long reqprot,
+		      unsigned long prot)
+{
+	if (prot != PROT_NONE)
+		return cap_mmap_min_addr(vma->vm_start);
+	return 0;
 }
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 266a229..76e6f04 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3086,13 +3086,19 @@ static int selinux_file_mprotect(struct vm_area_struct *vma,
 				 unsigned long prot)
 {
 	const struct cred *cred = current_cred();
+	int rc;
 
 	if (selinux_checkreqprot)
 		prot = reqprot;
 
+	/* do DAC check on address space usage */
+	rc = cap_file_mprotect(vma, reqprot, prot)
+	if (rc)
+		return rc;
+
 	if (default_noexec &&
 	    (prot & PROT_EXEC) && !(vma->vm_flags & VM_EXEC)) {
-		int rc = 0;
+		rc = 0;
 		if (vma->vm_start >= vma->vm_mm->start_brk &&
 		    vma->vm_end <= vma->vm_mm->brk) {
 			rc = cred_has_perm(cred, cred, PROCESS__EXECHEAP);

^ permalink raw reply related

* [RFC][PATCH v2 -next 2/2] Adding lock operations to kmsg_dump()/pstore_dump()
From: Seiji Aguchi @ 2011-10-21 21:21 UTC (permalink / raw)
  To: Andrew Morton, Chen, Gong, linux-kernel@vger.kernel.org,
	Luck, Tony, Matthew Garrett, Vivek Goyal, len.brown@intel.com,
	ying.huang@intel.com, ak@linux.intel.com, hughd@chromium.org,
	mingo@elte.hu, jmorris@namei.org, a.p.zijlstra@chello.nl,
	namhyung@gmail.com, Don Zickus
  Cc: dle-develop@lists.sourceforge.net, Satoru Moriya

pstore_dump()/kmsg_dump() may be called everywhere in kernel.
So we have to care about following cases.

  - Panic path
    In this case, Logging message process is serialized via smp_send_stop().
    So, we can bust spin_locks.

    Currently, kmsg_dump() may be called twice (KMSG_DUMP_PANIC and KMSG_DUMP_EMERGY)
    So, for avoiding deadlock, I suggest to bust locks rather than skipping them.

  - NMI context 
     While a cpu is in NMI handler, other cpus may be running.
     So, trylock should be called so that lockdep cheking works.
 
  - Process context
    In this case, we can simply take locks.

Patch Descriptions:
  Adding a lock operation below in pstore_dump()
  - busting psinfo->buf_lock of pstore_dump() in panic path for avoiding deadlock.

  Adding lock operations below in kmsg_dump()
  - busting logbuf_lock of kmsg_dump() in panic path for avoiding deadlock.
  - Adding trylock for taking logbuf_lock of kmsg_dump() in NMI context. 
  

 Signed-off-by: Seiji Aguchi <seiji.aguchi@hds.com>

---
 fs/pstore/platform.c |   11 +++++++++++
 kernel/printk.c      |   28 +++++++++++++++++++++++++---
 2 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 2bd620f..91ef164 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -97,6 +97,17 @@ static void pstore_dump(struct kmsg_dumper *dumper,
 	else
 		why = "Unknown";
 
+	/*
+	 * pstore_dump() is called after smp_send_stop() in panic path.
+	 * So, spin_lock should be bust for avoiding deadlock.
+	 */
+	if (reason == KMSG_DUMP_PANIC)
+		spin_lock_init(&psinfo->buf_lock);
+
+	/*
+	 * While a cpu is in NMI handler, other cpus may be running.
+	 * So, trylock should be called so that lockdep checking works.
+	 */
 	if (in_nmi()) {
 		is_locked = spin_trylock(&psinfo->buf_lock);
 		if (!is_locked)
diff --git a/kernel/printk.c b/kernel/printk.c
index 1455a0d..f51f547 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1730,15 +1730,37 @@ void kmsg_dump(enum kmsg_dump_reason reason)
 	struct kmsg_dumper *dumper;
 	const char *s1, *s2;
 	unsigned long l1, l2;
-	unsigned long flags;
+	unsigned long flags = 0;
+	int is_locked = 0;
 
 	/* Theoretically, the log could move on after we do this, but
 	   there's not a lot we can do about that. The new messages
 	   will overwrite the start of what we dump. */
-	raw_spin_lock_irqsave(&logbuf_lock, flags);
+
+	/*
+	 * kmsg_dump() is called after smp_send_stop() in panic path.
+	 * So, spin_lock should be bust for avoiding deadlock.
+	 */
+	if (reason == KMSG_DUMP_PANIC)
+		raw_spin_lock_init(&logbuf_lock);
+	/*
+	 * While a cpu is in NMI handler, other cpus may be running.
+	 * So, trylock should be called so that lockdep checking works.
+	 */
+	if (in_nmi())
+		is_locked = raw_spin_trylock(&logbuf_lock);
+	else
+		raw_spin_lock_irqsave(&logbuf_lock, flags);
+
 	end = log_end & LOG_BUF_MASK;
 	chars = logged_chars;
-	raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+
+	if (in_nmi()) {
+		if (is_locked)
+			raw_spin_unlock(&logbuf_lock);
+	} else
+		raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+
 
 	if (chars > end) {
 		s1 = log_buf + log_buf_len - chars + end;
-- 1.7.1

^ permalink raw reply related

* [RFC][PATCH v2 -next 1/2] Move kmsg_dump(KMSG_DUMP_PANIC) below smp_send_stop()
From: Seiji Aguchi @ 2011-10-21 21:20 UTC (permalink / raw)
  To: Andrew Morton, Chen, Gong, linux-kernel@vger.kernel.org,
	Luck, Tony, Matthew Garrett, Vivek Goyal, len.brown@intel.com,
	ying.huang@intel.com, ak@linux.intel.com, hughd@chromium.org,
	mingo@elte.hu, jmorris@namei.org, a.p.zijlstra@chello.nl,
	namhyung@gmail.com, Don Zickus
  Cc: dle-develop@lists.sourceforge.net, Satoru Moriya

This patch is just moving kmsg_dump(KMSG_DUMP_PANIC) below smp_send_stop
for serializing logging process via smp_send_stop.

 Signed-off-by: Seiji Aguchi <seiji.aguchi@hds.com>

---
 kernel/panic.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/panic.c b/kernel/panic.c
index d7bb697..41bf6ad 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -88,8 +88,6 @@ NORET_TYPE void panic(const char * fmt, ...)
 	 */
 	crash_kexec(NULL);
 
-	kmsg_dump(KMSG_DUMP_PANIC);
-
 	/*
 	 * Note smp_send_stop is the usual smp shutdown function, which
 	 * unfortunately means it may not be hardened to work in a panic
@@ -97,6 +95,8 @@ NORET_TYPE void panic(const char * fmt, ...)
 	 */
 	smp_send_stop();
 
+	kmsg_dump(KMSG_DUMP_PANIC);
+
 	atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
 
 	bust_spinlocks(0);
-- 1.7.1

^ permalink raw reply related

* [RFC][PATCH v2 -next 0/2] make pstore/kmsg_dump run after stopping other cpus in panic path
From: Seiji Aguchi @ 2011-10-21 21:19 UTC (permalink / raw)
  To: Andrew Morton, Chen, Gong, linux-kernel@vger.kernel.org,
	Luck, Tony, Matthew Garrett, Vivek Goyal, len.brown@intel.com,
	ying.huang@intel.com, ak@linux.intel.com, hughd@chromium.org,
	mingo@elte.hu, jmorris@namei.org, a.p.zijlstra@chello.nl,
	namhyung@gmail.com, Don Zickus
  Cc: dle-develop@lists.sourceforge.net, Satoru Moriya

Hi,

 As Don mentioned in following thread, it would be nice for pstore/kmsg_dump to serialize 
 panic path and have one cpu running  because they can log messages reliably. 
 
 https://lkml.org/lkml/2011/10/13/427
 
 For realizing this idea, we have to move kmsg_dump below smp_send_stop() and bust some locks of 
 kmsg_dump/pstore in panic path. Also, we have to care about NMI interrupt.

Changelog:
 v2
  - Adding trylocks to kmsg_dump()/pstore_dump() so that they can work in NMI context.  
  - Dividing a patch into two. 
     First one is just moving kmsg_dump(KMSG_DUMP_PANIC) below smp_send_stop().
     Second one is changing lock operations in kmsg_dump()/pstore_dump().
 v1
  - moving kmsg_dump(KMSG_DUMP_PANIC) below smp_send_stop.
  - busting logbuf_lock of kmsg_dump() in panic path for avoiding deadlock.
  - busting psinfo->buf_lock of pstore_dump() in panic path for avoiding deadlock.

Patch Descriptions:
[RFC][PATCH -next v2 1/2] Moving kmsg_dump(KMSG_DUMP_PANIC) below smp_send_stop()
  - moving kmsg_dump(KMSG_DUMP_PANIC) below smp_send_stop for serializing logging 
    process via smp_send_stop.

[RFC][PATCH v2 -next 2/2] Adding lock operations to kmsg_dump()/pstore_dump()
  Adding a lock operation below in pstore_dump()
  - busting psinfo->buf_lock of pstore_dump() in panic path for avoiding deadlock.

  Adding lock operations below in kmsg_dump()
  - busting logbuf_lock of kmsg_dump() in panic path for avoiding deadlock.
  - Adding trylock for taking logbuf_lock of kmsg_dump() in NMI context. 

Seiji

^ permalink raw reply

* [PATCH 3/5] ext4: ext4_mkdir should dirty dir_block with the parent inode
From: Darrick J. Wong @ 2011-10-21 21:18 UTC (permalink / raw)
  To: Tao Ma, Theodore Tso; +Cc: linux-ext4, linux-kernel
In-Reply-To: <20111021211759.10784.17257.stgit@elm3c44.beaverton.ibm.com>

ext4_mkdir calls ext4_handle_dirty_metadata with dir_block and the inode "dir".
Unfortunately, dir_block belongs to the newly created directory (which is
"inode"), not the parent directory (which is "dir").  Fix the incorrect
association.

Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
---
 fs/ext4/namei.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6d3fab4..50c7294 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1863,7 +1863,7 @@ retry:
 	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
 	inode->i_nlink = 2;
 	BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
-	err = ext4_handle_dirty_metadata(handle, dir, dir_block);
+	err = ext4_handle_dirty_metadata(handle, inode, dir_block);
 	if (err)
 		goto out_clear_inode;
 	err = ext4_mark_inode_dirty(handle, inode);

^ permalink raw reply related

* [PATCH 2/5] ext4: ext4_rename should dirty dir_bh with the correct directory
From: Darrick J. Wong @ 2011-10-21 21:18 UTC (permalink / raw)
  To: Tao Ma, Theodore Tso; +Cc: linux-ext4, linux-kernel
In-Reply-To: <20111021211759.10784.17257.stgit@elm3c44.beaverton.ibm.com>

When ext4_rename performs a directory rename (move), dir_bh is a buffer that is
modified to update the '..' link in the directory being moved (old_inode).
However, ext4_handle_dirty_metadata is called with the old parent directory
inode (old_dir) and dir_bh, which is incorrect because dir_bh does not belong
to the parent inode.  Fix this error.

Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
---
 fs/ext4/namei.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 310b356..6d3fab4 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2530,7 +2530,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 		PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
 						cpu_to_le32(new_dir->i_ino);
 		BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
-		retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
+		retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh);
 		if (retval) {
 			ext4_std_error(old_dir->i_sb, retval);
 			goto end_rename;

^ permalink raw reply related

* [PATCH 5/5] ext4: Fix endian problem in MMP initialization
From: Darrick J. Wong @ 2011-10-21 21:18 UTC (permalink / raw)
  To: Tao Ma, Theodore Tso; +Cc: linux-ext4, linux-kernel
In-Reply-To: <20111021211759.10784.17257.stgit@elm3c44.beaverton.ibm.com>

As part of startup, the MMP initialization code does this:

mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq());

Next, mmp->mmp_seq is written out to disk, a delay happens, and then the MMP
block is read back in and the sequence value is tested:

if (seq != le32_to_cpu(mmp->mmp_seq)) {
	/* fail the mount */

On a LE system such as x86, the *le32* functions do nothing and this works.
Unfortunately, on a BE system such as ppc64, this comparison becomes:

if (cpu_to_le32(new_seq) != le32_to_cpu(cpu_to_le32(new_seq)) {
	/* fail the mount */

Except for a few palindromic sequence numbers, this test always causes the
mount to fail, which makes MMP filesystems generally unmountable on ppc64.  The
attached patch fixes this situation.

Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
---
 fs/ext4/mmp.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 9bdef3f..a7a4986 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -295,7 +295,8 @@ skip:
 	/*
 	 * write a new random sequence number.
 	 */
-	mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq());
+	seq = mmp_new_seq();
+	mmp->mmp_seq = cpu_to_le32(seq);

 	retval = write_mmp_block(bh);
 	if (retval)

^ permalink raw reply related

* [PATCH 4/5] ext4: Prevent stack overrun in ext4_file_open when recording last known mountpoint
From: Darrick J. Wong @ 2011-10-21 21:18 UTC (permalink / raw)
  To: Tao Ma, Theodore Tso; +Cc: linux-ext4, linux-kernel
In-Reply-To: <20111021211759.10784.17257.stgit@elm3c44.beaverton.ibm.com>

In ext4_file_open, the filesystem records the mountpoint of the first file that
is opened after mounting the filesystem.  It does this by allocating a 64-byte
stack buffer, calling d_path() to grab the mount point through which this file
was accessed, and then memcpy()ing 64 bytes into the superblock's
s_last_mounted field, starting from the return value of d_path(), which is
stored as "cp".  However, if cp > buf (which it frequently is since path
components are prepended starting at the end of buf) then we can end up copying
stack data into the superblock.

Writing stack variables into the superblock doesn't sound like a great idea, so
use strlcpy instead.  Andi Kleen suggested using strlcpy instead of strncpy.

Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
---
 fs/ext4/file.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index e4095e9..9781099 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -181,8 +181,8 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
 		path.dentry = mnt->mnt_root;
 		cp = d_path(&path, buf, sizeof(buf));
 		if (!IS_ERR(cp)) {
-			memcpy(sbi->s_es->s_last_mounted, cp,
-			       sizeof(sbi->s_es->s_last_mounted));
+			strlcpy(sbi->s_es->s_last_mounted, cp,
+				sizeof(sbi->s_es->s_last_mounted));
 			ext4_mark_super_dirty(sb);
 		}
 	}

^ permalink raw reply related

* [PATCH 1/5] ext4: ext4_dx_add_entry should dirty directory metadata with the directory inode
From: Darrick J. Wong @ 2011-10-21 21:17 UTC (permalink / raw)
  To: Tao Ma, Theodore Tso; +Cc: linux-ext4, linux-kernel

ext4_dx_add_entry manipulates bh2 and frames[0].bh, which are two buffer_heads
that point to directory blocks assigned to the directory inode.  However, the
function calls ext4_handle_dirty_metadata with the inode of the file that's
being added to the directory, not the directory inode itself.  Therefore,
correct the code to dirty the directory buffers with the directory inode, not
the file inode.

Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
---
 fs/ext4/namei.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 1c924fa..310b356 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1586,7 +1586,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			dxtrace(dx_show_index("node", frames[1].entries));
 			dxtrace(dx_show_index("node",
 			       ((struct dx_node *) bh2->b_data)->entries));
-			err = ext4_handle_dirty_metadata(handle, inode, bh2);
+			err = ext4_handle_dirty_metadata(handle, dir, bh2);
 			if (err)
 				goto journal_error;
 			brelse (bh2);
@@ -1612,7 +1612,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			if (err)
 				goto journal_error;
 		}
-		err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
+		err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh);
 		if (err) {
 			ext4_std_error(inode->i_sb, err);
 			goto cleanup;

^ permalink raw reply related

* Re: [PATCH 0/4] iommu: iommu_ops group interface
From: Alex Williamson @ 2011-10-21 21:16 UTC (permalink / raw)
  To: Woodhouse, David
  Cc: joerg.roedel@amd.com, iommu@lists.linux-foundation.org,
	linux-kernel@vger.kernel.org, chrisw@redhat.com, agraf@suse.de,
	dwg@au1.ibm.com, scottwood@freescale.com, B08248@freescale.com,
	benh@kernel.crashing.org
In-Reply-To: <1319229248.19448.6.camel@shinybook.infradead.org>

On Fri, 2011-10-21 at 20:34 +0000, Woodhouse, David wrote:
> On Fri, 2011-10-21 at 13:55 -0600, Alex Williamson wrote:
> > IOMMUs can't always distiguish transactions from each individual
> > device in a system.  Sometimes this is by design (such as powerpc
> > partitionable endpoints), other times by topology (PCIe-to-PCI
> > bridges masking downstream devices).  We call these sets of
> > indistinguishable devices "groups". 
> 
> Other times it's because a multi-function PCIe device is broken and does
> all its DMA from function zero.... like some Ricoh devices seen in
> laptops. Can you handle quirks to "group" those too?

I could add:

drivers/pci/quirks.c: int pci_iommu_group_mf_quirk(struct pci_dev *)

That's used just like iommu_group_mf to always blacklist specific
devices.  Let me know VID/DID if you have them.  Thanks,

Alex


^ permalink raw reply

* [PATCH] x86-64: Set siginfo and context on vsyscall emulation faults
From: Andy Lutomirski @ 2011-10-21 21:01 UTC (permalink / raw)
  To: Linus Torvalds, x86
  Cc: Ingo Molnar, richard -rw- weinberger, Adrian Bunk, H. Peter Anvin,
	Thomas Gleixner, Ingo Molnar, linux-kernel, Andy Lutomirski
In-Reply-To: <CA+55aFx3-F1LzVwN_gSqXcsU7uWda41bHyJxGxHNODWURu0T_g@mail.gmail.com>

To make this work, we teach the page fault handler how to send
signals on failed uaccess.  This only works for user addresses
(kernel addresses will never hit the page fault handler in the
first place), so we need to generate signals for those
separately.

This gets the tricky case right: if the user buffer spans
multiple pages and only the second page is invalid, we set
cr2 and si_addr correctly.  UML relies on this behavior to
"fault in" pages as needed.

We steal a bit from thread_info.uaccess_err to enable this.
Before this change, uaccess_err was a 32-bit boolean value.

This fixes issues with UML when vsyscall=emulate.

Reported-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
---

I've tested this briefly on the UML image that used to blow it.  It seems
to work.  It also passes my little sigcontext test.

 arch/x86/include/asm/thread_info.h |    3 +-
 arch/x86/include/asm/uaccess.h     |    2 +-
 arch/x86/kernel/vsyscall_64.c      |   67 +++++++++++++++++++++++++++++++----
 arch/x86/mm/extable.c              |    2 +-
 arch/x86/mm/fault.c                |   22 ++++++++---
 5 files changed, 79 insertions(+), 17 deletions(-)

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index a1fe5c1..25ebd79 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -40,7 +40,8 @@ struct thread_info {
 						*/
 	__u8			supervisor_stack[0];
 #endif
-	int			uaccess_err;
+	int			sig_on_uaccess_error:1;
+	int			uaccess_err:1;	/* uaccess failed */
 };
 
 #define INIT_THREAD_INFO(tsk)			\
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 36361bf..8be5f54 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -462,7 +462,7 @@ struct __large_struct { unsigned long buf[100]; };
 	barrier();
 
 #define uaccess_catch(err)						\
-	(err) |= current_thread_info()->uaccess_err;			\
+	(err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0);	\
 	current_thread_info()->uaccess_err = prev_err;			\
 } while (0)
 
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 18ae83d..c6dd0e6 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -139,11 +139,38 @@ static int addr_to_vsyscall_nr(unsigned long addr)
 	return nr;
 }
 
+static bool write_ok_or_segv(unsigned long ptr, size_t size)
+{
+	if (ptr == 0)
+		return true;
+
+	if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
+		siginfo_t info;
+		struct thread_struct *thread = &current->thread;
+
+		thread->error_code	= 6;  /* user fault, no page, write */
+		thread->cr2		= ptr;
+		thread->trap_no		= 14;
+
+		memset(&info, 0, sizeof(info));
+		info.si_signo		= SIGSEGV;
+		info.si_errno		= 0;
+		info.si_code		= SEGV_MAPERR;
+		info.si_addr		= (void __user *)ptr;
+
+		force_sig_info(SIGSEGV, &info, current);
+		return false;
+	} else {
+		return true;
+	}
+}
+
 bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 {
 	struct task_struct *tsk;
 	unsigned long caller;
 	int vsyscall_nr;
+	int prev_sig_on_uaccess_error;
 	long ret;
 
 	/*
@@ -179,35 +206,59 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 	if (seccomp_mode(&tsk->seccomp))
 		do_exit(SIGKILL);
 
+	/*
+	 * With a real vsyscall, page faults cause SIGSEGV.  We want to
+	 * preserve that behavior to make writing exploits harder.
+	 */
+	prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
+	current_thread_info()->sig_on_uaccess_error = 1;
+
+	ret = -EFAULT;
 	switch (vsyscall_nr) {
 	case 0:
+		if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
+		    !write_ok_or_segv(regs->si, sizeof(struct timezone)))
+			break;
+
 		ret = sys_gettimeofday(
 			(struct timeval __user *)regs->di,
 			(struct timezone __user *)regs->si);
 		break;
 
 	case 1:
+		if (!write_ok_or_segv(regs->di, sizeof(time_t)))
+			break;
+
 		ret = sys_time((time_t __user *)regs->di);
 		break;
 
 	case 2:
+		if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
+		    !write_ok_or_segv(regs->si, sizeof(unsigned)))
+			break;
+
 		ret = sys_getcpu((unsigned __user *)regs->di,
 				 (unsigned __user *)regs->si,
 				 0);
 		break;
 	}
 
+	current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
+
 	if (ret == -EFAULT) {
-		/*
-		 * Bad news -- userspace fed a bad pointer to a vsyscall.
-		 *
-		 * With a real vsyscall, that would have caused SIGSEGV.
-		 * To make writing reliable exploits using the emulated
-		 * vsyscalls harder, generate SIGSEGV here as well.
-		 */
+		/* Bad news -- userspace fed a bad pointer to a vsyscall. */
 		warn_bad_vsyscall(KERN_INFO, regs,
 				  "vsyscall fault (exploit attempt?)");
-		goto sigsegv;
+
+		/*
+		 * If we failed to generate a signal for any reason,
+		 * generate one here.  (This should be impossible.)
+		 */
+		if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
+				 !sigismember(&tsk->pending.signal, SIGSEGV)))
+			goto sigsegv;
+
+		return true;  /* Don't emulate the ret. */
 	}
 
 	regs->ax = ret;
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index d0474ad..1fb85db 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -25,7 +25,7 @@ int fixup_exception(struct pt_regs *regs)
 	if (fixup) {
 		/* If fixup is less than 16, it means uaccess error */
 		if (fixup->fixup < 16) {
-			current_thread_info()->uaccess_err = -EFAULT;
+			current_thread_info()->uaccess_err = 1;
 			regs->ip += fixup->fixup;
 			return 1;
 		}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 0d17c8c..85bec26 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -620,7 +620,7 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code,
 
 static noinline void
 no_context(struct pt_regs *regs, unsigned long error_code,
-	   unsigned long address)
+	   unsigned long address, int signal, int si_code)
 {
 	struct task_struct *tsk = current;
 	unsigned long *stackend;
@@ -628,8 +628,17 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 	int sig;
 
 	/* Are we prepared to handle this kernel fault? */
-	if (fixup_exception(regs))
+	if (fixup_exception(regs)) {
+		if (current_thread_info()->sig_on_uaccess_error && signal) {
+			tsk->thread.trap_no = 14;
+			tsk->thread.error_code = error_code | PF_USER;
+			tsk->thread.cr2 = address;
+
+			/* XXX: hwpoison faults will set the wrong code. */
+			force_sig_info_fault(signal, si_code, address, tsk, 0);
+		}
 		return;
+	}
 
 	/*
 	 * 32-bit:
@@ -749,7 +758,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 	if (is_f00f_bug(regs, address))
 		return;
 
-	no_context(regs, error_code, address);
+	no_context(regs, error_code, address, SIGSEGV, si_code);
 }
 
 static noinline void
@@ -813,7 +822,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 
 	/* Kernel mode? Handle exceptions or die: */
 	if (!(error_code & PF_USER)) {
-		no_context(regs, error_code, address);
+		no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
 		return;
 	}
 
@@ -848,7 +857,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 		if (!(fault & VM_FAULT_RETRY))
 			up_read(&current->mm->mmap_sem);
 		if (!(error_code & PF_USER))
-			no_context(regs, error_code, address);
+			no_context(regs, error_code, address, 0, 0);
 		return 1;
 	}
 	if (!(fault & VM_FAULT_ERROR))
@@ -858,7 +867,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 		/* Kernel mode? Handle exceptions or die: */
 		if (!(error_code & PF_USER)) {
 			up_read(&current->mm->mmap_sem);
-			no_context(regs, error_code, address);
+			no_context(regs, error_code, address,
+				   SIGSEGV, SEGV_MAPERR);
 			return 1;
 		}
 
-- 
1.7.6.4


^ permalink raw reply related

* Re: [PATCH 0/4] iommu: iommu_ops group interface
From: Woodhouse, David @ 2011-10-21 20:34 UTC (permalink / raw)
  To: Alex Williamson
  Cc: joerg.roedel@amd.com, iommu@lists.linux-foundation.org,
	linux-kernel@vger.kernel.org, chrisw@redhat.com, agraf@suse.de,
	dwg@au1.ibm.com, scottwood@freescale.com, B08248@freescale.com,
	benh@kernel.crashing.org
In-Reply-To: <20111021195412.8438.9951.stgit@s20.home>

[-- Attachment #1: Type: text/plain, Size: 782 bytes --]

On Fri, 2011-10-21 at 13:55 -0600, Alex Williamson wrote:
> IOMMUs can't always distiguish transactions from each individual
> device in a system.  Sometimes this is by design (such as powerpc
> partitionable endpoints), other times by topology (PCIe-to-PCI
> bridges masking downstream devices).  We call these sets of
> indistinguishable devices "groups". 

Other times it's because a multi-function PCIe device is broken and does
all its DMA from function zero.... like some Ricoh devices seen in
laptops. Can you handle quirks to "group" those too?

-- 
                   Sent with MeeGo's ActiveSync support.

David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation



[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 4370 bytes --]

^ permalink raw reply

* [PATCH] x86: Fix compilation bug in kprobes' twobyte_is_boostable
From: Josh Stone @ 2011-10-21 20:33 UTC (permalink / raw)
  To: linux-kernel
  Cc: Josh Stone, Masami Hiramatsu, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, x86, Srikar Dronamraju, Linus Torvalds,
	Jakub Jelinek

When compiling an i386_defconfig kernel with gcc-4.6.1-9.fc15.i686, I
noticed a warning about the asm operand for test_bit in kprobes'
can_boost.  I discovered that this caused only the first long of
twobyte_is_boostable[] to be output.

Jakub filed and fixed gcc PR50571 to correct the warning and this output
issue.  But to solve it for less current gcc, we can make kprobes'
twobyte_is_boostable[] volatile, and it won't be optimized out.

Before:

    CC      arch/x86/kernel/kprobes.o
  In file included from include/linux/bitops.h:22:0,
                   from include/linux/kernel.h:17,
                   from [...]/arch/x86/include/asm/percpu.h:44,
                   from [...]/arch/x86/include/asm/current.h:5,
                   from [...]/arch/x86/include/asm/processor.h:15,
                   from [...]/arch/x86/include/asm/atomic.h:6,
                   from include/linux/atomic.h:4,
                   from include/linux/mutex.h:18,
                   from include/linux/notifier.h:13,
                   from include/linux/kprobes.h:34,
                   from arch/x86/kernel/kprobes.c:43:
  [...]/arch/x86/include/asm/bitops.h: In function ‘can_boost.part.1’:
  [...]/arch/x86/include/asm/bitops.h:319:2: warning: use of memory input
        without lvalue in asm operand 1 is deprecated [enabled by default]

  $ objdump -rd arch/x86/kernel/kprobes.o | grep -A1 -w bt
       551:	0f a3 05 00 00 00 00 	bt     %eax,0x0
                          554: R_386_32	.rodata.cst4

  $ objdump -s -j .rodata.cst4 -j .data arch/x86/kernel/kprobes.o

  arch/x86/kernel/kprobes.o:     file format elf32-i386

  Contents of section .data:
   0000 48000000 00000000 00000000 00000000  H...............
  Contents of section .rodata.cst4:
   0000 4c030000                             L...

Only a single long of twobyte_is_boostable[] is in the object file.

After, with volatile:

  $ objdump -rd arch/x86/kernel/kprobes.o | grep -A1 -w bt
       551:	0f a3 05 20 00 00 00 	bt     %eax,0x20
                          554: R_386_32	.data

  $ objdump -s -j .rodata.cst4 -j .data arch/x86/kernel/kprobes.o

  arch/x86/kernel/kprobes.o:     file format elf32-i386

  Contents of section .data:
   0000 48000000 00000000 00000000 00000000  H...............
   0010 00000000 00000000 00000000 00000000  ................
   0020 4c030000 0f000200 ffff0000 ffcff0c0  L...............
   0030 0000ffff 3bbbfff8 03ff2ebb 26bb2e77  ....;.......&..w

Now all 32 bytes are output into .data instead.

Signed-off-by: Josh Stone <jistone@redhat.com>
Acked-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Jakub Jelinek <jakub@redhat.com>

---

Note, this is a repost of https://lkml.org/lkml/2011/10/17/500

See also my more general fix, https://lkml.org/lkml/2011/10/6/412

---
 arch/x86/kernel/kprobes.c |    4 +++-
 1 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index f1a6244..c0ed3d9 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -75,8 +75,10 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
 	/*
 	 * Undefined/reserved opcodes, conditional jump, Opcode Extension
 	 * Groups, and some special opcodes can not boost.
+	 * This is volatile to keep gcc from statically optimizing it out, as
+	 * variable_test_bit makes gcc think only *(unsigned long*) is used.
 	 */
-static const u32 twobyte_is_boostable[256 / 32] = {
+static volatile const u32 twobyte_is_boostable[256 / 32] = {
 	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
 	/*      ----------------------------------------------          */
 	W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */
-- 
1.7.6.4


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox