LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: [RFC PATCH] PPC: KVM: vfio kvm device: support spapr tce
From: Alexey Kardashevskiy @ 2013-10-05  1:52 UTC (permalink / raw)
  To: Alex Williamson; +Cc: linuxppc-dev, kvm-ppc, kvm, linux-kernel
In-Reply-To: <1380902702.25705.11.camel@ul30vt.home>

On 05.10.2013 2:05, Alex Williamson wrote:
> On Fri, 2013-10-04 at 22:24 +1000, Alexey Kardashevskiy wrote:
>> This is a very rough change set required for ppc64 to use this KVM device.
>>
>> vfio_rm.c is a piece of code which is going to be called from the realmode (MMU off),
>> and I will put everything spapr-related under #ifdef CONFIG_SPAPR_TCE_IOMMU,
>> it is just friday and I have to run :)
>>
>> This is an RFC but it works.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>  arch/powerpc/kvm/Kconfig  |  1 +
>>  arch/powerpc/kvm/Makefile |  4 ++++
>>  include/linux/kvm_host.h  |  8 ++++---
>>  include/linux/vfio.h      |  3 +++
>>  include/uapi/linux/kvm.h  |  1 +
>>  virt/kvm/vfio.c           | 46 ++++++++++++++++++++++++++++++++++++++++
>>  virt/kvm/vfio_rm.c        | 54 +++++++++++++++++++++++++++++++++++++++++++++++
>>  7 files changed, 114 insertions(+), 3 deletions(-)
>>  create mode 100644 virt/kvm/vfio_rm.c
>>
>> diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
>> index 61b3535..d1b7f64 100644
>> --- a/arch/powerpc/kvm/Kconfig
>> +++ b/arch/powerpc/kvm/Kconfig
>> @@ -60,6 +60,7 @@ config KVM_BOOK3S_64
>>  	select KVM_BOOK3S_64_HANDLER
>>  	select KVM
>>  	select SPAPR_TCE_IOMMU
>> +	select KVM_VFIO
>>  	---help---
>>  	  Support running unmodified book3s_64 and book3s_32 guest kernels
>>  	  in virtual machines on book3s_64 host processors.
>> diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
>> index 6646c95..fc2878b 100644
>> --- a/arch/powerpc/kvm/Makefile
>> +++ b/arch/powerpc/kvm/Makefile
>> @@ -55,6 +55,8 @@ kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs)
>>  
>>  kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \
>>  	$(KVM)/coalesced_mmio.o \
>> +	$(KVM)/vfio.o \
>> +	$(KVM)/vfio_rm.o \
>>  	fpu.o \
>>  	book3s_paired_singles.o \
>>  	book3s_pr.o \
>> @@ -76,6 +78,7 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
>>  kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \
>>  	book3s_hv_rm_xics.o
>>  kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
>> +	$(KVM)/vfio_rm.o \
>>  	book3s_hv_rmhandlers.o \
>>  	book3s_hv_rm_mmu.o \
>>  	book3s_64_vio_hv.o \
>> @@ -89,6 +92,7 @@ kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
>>  
>>  kvm-book3s_64-module-objs := \
>>  	$(KVM)/kvm_main.o \
>> +	$(KVM)/vfio.o \
>>  	$(KVM)/eventfd.o \
>>  	powerpc.o \
>>  	emulate.o \
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index ad2b581..43c0290 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -407,6 +407,8 @@ struct kvm {
>>  #endif
>>  	long tlbs_dirty;
>>  	struct list_head devices;
>> +
>> +	struct kvm_vfio *vfio;
> 
> 
> <cringe> can't this be on kvm->arch?

It can, I just thought since it is valid for more than just one
platform, it can go here.


>>  };
>>  
>>  #define kvm_err(fmt, ...) \
>> @@ -677,15 +679,15 @@ void kvm_arch_register_noncoherent_dma(struct kvm *kvm);
>>  void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm);
>>  bool kvm_arch_has_noncoherent_dma(struct kvm *kvm);
>>  #else
>> -static inline void kvm_arch_register_noncoherent_dma(void)
>> +static inline void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
>>  {
>>  }
>>  
>> -static inline void kvm_arch_unregister_noncoherent_dma(void)
>> +static inline void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
>>  {
>>  }
>>  
>> -static inline bool kvm_arch_has_noncoherent_dma(void)
>> +static inline bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
>>  {
>>  	return false;
>>  }
> 
> Will fix in my series.
> 
>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
>> index 24579a0..681e19b 100644
>> --- a/include/linux/vfio.h
>> +++ b/include/linux/vfio.h
>> @@ -97,4 +97,7 @@ extern struct vfio_group *vfio_group_get_external_user(struct file *filep);
>>  extern void vfio_group_put_external_user(struct vfio_group *group);
>>  extern int vfio_external_user_iommu_id(struct vfio_group *group);
>>  
>> +extern struct iommu_group *vfio_find_group_by_liobn(struct kvm *kvm,
>> +		unsigned long liobn);
>> +
> 
> Wrong header file.
> 
>>  #endif /* VFIO_H */
>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> index 7c1a349..a74ad16 100644
>> --- a/include/uapi/linux/kvm.h
>> +++ b/include/uapi/linux/kvm.h
>> @@ -847,6 +847,7 @@ struct kvm_device_attr {
>>  #define  KVM_DEV_VFIO_GROUP			1
>>  #define   KVM_DEV_VFIO_GROUP_ADD			1
>>  #define   KVM_DEV_VFIO_GROUP_DEL			2
>> +#define  KVM_DEV_VFIO_SPAPR_TCE_LIOBN		2
>>  
>>  /*
>>   * ioctls for VM fds
>> diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
>> index 2e336a7..39dea9f 100644
>> --- a/virt/kvm/vfio.c
>> +++ b/virt/kvm/vfio.c
>> @@ -22,6 +22,7 @@
>>  struct kvm_vfio_group {
>>  	struct list_head node;
>>  	struct vfio_group *vfio_group;
>> +	uint64_t liobn; /* sPAPR */
> 
> Perhaps an arch pointer or at least a union.
> 
>>  };
>>  
>>  struct kvm_vfio {
>> @@ -188,12 +189,52 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
>>  	return -ENXIO;
>>  }
>>  
>> +static int kvm_vfio_set_spapr_tce_liobn(struct kvm_device *dev,
>> +		long attr, u64 arg)
>> +{
>> +	struct kvm_vfio *kv = dev->private;
>> +	struct vfio_group *vfio_group;
>> +	struct kvm_vfio_group *kvg;
>> +	void __user *argp = (void __user *)arg;
>> +	struct fd f;
>> +	int32_t fd;
>> +	uint64_t liobn = attr;
>> +
>> +	if (get_user(fd, (int32_t __user *)argp))
>> +		return -EFAULT;
>> +
>> +	f = fdget(fd);
>> +	if (!f.file)
>> +		return -EBADF;
>> +
>> +	vfio_group = kvm_vfio_group_get_external_user(f.file);
>> +	fdput(f);
>> +
>> +	list_for_each_entry(kvg, &kv->group_list, node) {
>> +		if (kvg->vfio_group == vfio_group) {
>> +			WARN_ON(kvg->liobn);
> 
> Users shouldn't be able to trigger WARN_ON so easily, return -EBUSY,
> allow it to be unset and re-set, or just allow the overwrite.
> 
>> +			kvg->liobn = liobn;
>> +			kvm_vfio_group_put_external_user(vfio_group);
>> +			return 0;
>> +		}
>> +	}
>> +
>> +	kvm_vfio_group_put_external_user(vfio_group);
>> +
>> +	return -ENXIO;
>> +}
>> +
>>  static int kvm_vfio_set_attr(struct kvm_device *dev,
>>  			     struct kvm_device_attr *attr)
>>  {
>>  	switch (attr->group) {
>>  	case KVM_DEV_VFIO_GROUP:
>>  		return kvm_vfio_set_group(dev, attr->attr, attr->addr);
>> +#ifdef CONFIG_SPAPR_TCE_IOMMU
>> +	case KVM_DEV_VFIO_SPAPR_TCE_LIOBN:
>> +		return kvm_vfio_set_spapr_tce_liobn(dev, attr->attr,
>> +				attr->addr);
>> +#endif
>>  	}
>>  
>>  	return -ENXIO;
>> @@ -211,6 +252,10 @@ static int kvm_vfio_has_attr(struct kvm_device *dev,
>>  		}
>>  
>>  		break;
>> +#ifdef CONFIG_SPAPR_TCE_IOMMU
>> +	case KVM_DEV_VFIO_SPAPR_TCE_LIOBN:
>> +		return 0;
>> +#endif
>>  	}
>>  
>>  	return -ENXIO;
>> @@ -250,6 +295,7 @@ static int kvm_vfio_create(struct kvm_device *dev, u32 type)
>>  	mutex_init(&kv->lock);
>>  
>>  	dev->private = kv;
>> +	dev->kvm->vfio = kv;
>>  
>>  	return 0;
>>  }
>> diff --git a/virt/kvm/vfio_rm.c b/virt/kvm/vfio_rm.c
>> new file mode 100644
>> index 0000000..ee9fd96
>> --- /dev/null
>> +++ b/virt/kvm/vfio_rm.c
>> @@ -0,0 +1,54 @@
>> +#include <linux/errno.h>
>> +#include <linux/file.h>
>> +#include <linux/kvm_host.h>
>> +#include <linux/list.h>
>> +#include <linux/module.h>
>> +#include <linux/mutex.h>
>> +#include <linux/slab.h>
>> +#include <linux/uaccess.h>
>> +#include <linux/vfio.h>
>> +
>> +struct kvm_vfio_group {
>> +	struct list_head node;
>> +	struct vfio_group *vfio_group;
>> +	uint64_t liobn; /* sPAPR */
>> +};
>> +
>> +struct kvm_vfio {
>> +	struct list_head group_list;
>> +	struct mutex lock;
>> +	bool noncoherent;
>> +};
>> +
>> +struct vfio_group {
>> +	struct kref			kref;
>> +	int				minor;
>> +	atomic_t			container_users;
>> +	struct iommu_group		*iommu_group;
>> +	struct vfio_container		*container;
>> +	struct list_head		device_list;
>> +	struct mutex			device_lock;
>> +	struct device			*dev;
>> +	struct notifier_block		nb;
>> +	struct list_head		vfio_next;
>> +	struct list_head		container_next;
>> +	atomic_t			opened;
>> +};
>> +
>> +struct iommu_group *vfio_find_group_by_liobn(struct kvm *kvm, unsigned long liobn)
>> +{
>> +	struct kvm_vfio_group *kvg;
>> +
>> +	if (!kvm->vfio)
>> +		return NULL;
>> +
>> +	list_for_each_entry(kvg, &kvm->vfio->group_list, node) {
>> +		if (kvg->liobn == liobn)
>> +			return kvg->vfio_group->iommu_group;
>> +	}
>> +
>> +	return NULL;
>> +}
>> +EXPORT_SYMBOL_GPL(vfio_find_group_by_liobn);
>> +
>> +
> 
> You're kidding, right?  These are intentionally private data structures
> that are blatantly copied so that you can extract what you want.  NACK.
> The iommu_group is available off struct device, do you even need vfio or
> this kvm-vfio device to get from liobn to iommu_group?  Thanks,


This is an RFC. I am not saying this is what can go to upstream or
anything. I am not kidding (why everyone assumes that?), I am showing
what API I would like to have in the VFIO KVM device. I need the way to
get iommu_table (which is in a private data of iommu_group) by LIOBN and
the VFIO KVM device is the _only_ entity which will know about this
connection (LIOBN is made up by the qemu and told to the guest) and it
cannot go to the kvm.ko - and the patch like this is the best way to
show it as my english obviously sucks.



-- 
With best regards

Alexey Kardashevskiy -- icq: 52150396

^ permalink raw reply

* Re: [PATCH] kvm: powerpc: book3s: Fix build break for BOOK3S_32
From: Alexander Graf @ 2013-10-04 23:59 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, Aneesh Kumar K.V, kvm-ppc, kvm
In-Reply-To: <20131004234509.GB2256@iris.ozlabs.ibm.com>


On 05.10.2013, at 01:45, Paul Mackerras wrote:

> On Fri, Oct 04, 2013 at 03:00:11PM +0200, Alexander Graf wrote:
>>=20
>> On 04.10.2013, at 14:35, Paul Mackerras wrote:
>>=20
>>> On Fri, Oct 04, 2013 at 02:27:02PM +0200, Alexander Graf wrote:
>>>>=20
>>>> On 04.10.2013, at 14:23, Alexander Graf wrote:
>>>>=20
>>>>>=20
>>>>> On 03.10.2013, at 06:14, Paul Mackerras wrote:
>>>>>=20
>>>>>> On Wed, Oct 02, 2013 at 08:08:44PM +0530, Aneesh Kumar K.V wrote:
>>>>>>> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
>>>>>>>=20
>>>>>>> This was introduced by 85a0d845d8bb5df5d2669416212f56cbe1474c6b
>>>>>>=20
>>>>>> It's a good idea to give the headline of the commit as well as =
the ID.
>>>>>> I also like to trim the ID to 10 characters or so.  So it should =
look
>>>>>> like this:
>>>>>>=20
>>>>>> This was introduced by 85a0d845d8 ("KVM: PPC: Book3S PR: Allocate
>>>>>> kvm_vcpu structs from kvm_vcpu_cache").
>>>>>>=20
>>>>>>> arch/powerpc/kvm/book3s_pr.c: In function =
'kvmppc_core_vcpu_create':
>>>>>>> arch/powerpc/kvm/book3s_pr.c:1182:30: error: 'struct =
kvmppc_vcpu_book3s' has no member named 'shadow_vcpu'
>>>>>>> make[1]: *** [arch/powerpc/kvm/book3s_pr.o] Error 1
>>>>>>>=20
>>>>>>> Signed-off-by: Aneesh Kumar K.V =
<aneesh.kumar@linux.vnet.ibm.com>
>>>>>>=20
>>>>>> Acked-by: Paul Mackerras <paulus@samba.org>
>>>>>=20
>>>>> Would you guys mind if I merge this into the offending patch? It's =
not trickled into -next yet, so rebasing should work.
>>>>>=20
>>>>> If not, please resend with the fixed commit message.
>>>>=20
>>>> Eh - I must've missed v2 :). So that leaves only the question on =
whether you'd be ok to squash the patch instead. It'd help =
bisectability.
>>>=20
>>> I'm OK with that.  If you do, why don't you squash the first of the
>>> two patches that I just sent into the commit it fixes as well?
>>=20
>> Because patch 1/2 spans two separate commits it would have to get =
squashed into (6aa82e, 70afec) and patch 2/2 doesn't make sense to get =
squashed anywhere :).
>=20
> Actually 70afec is fine, if you look at it, it's only 6aa82e that
> needs fixing.

True. Squashed them :).


Alex

^ permalink raw reply

* [PATCH] powerpc: fix exception clearing in e500 SPE float emulation
From: Joseph S. Myers @ 2013-10-04 22:50 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: linux-kernel

From: Joseph Myers <joseph@codesourcery.com>

The e500 SPE floating-point emulation code clears existing exceptions
(__FPU_FPSCR &= ~FP_EX_MASK;) before ORing in the exceptions from the
emulated operation.  However, these exception bits are the "sticky",
cumulative exception bits, and should only be cleared by the user
program setting SPEFSCR, not implicitly by any floating-point
instruction (whether executed purely by the hardware or emulated).
The spurious clearing of these bits shows up as missing exceptions in
glibc testing.

Fixing this, however, is not as simple as just not clearing the bits,
because while the bits may be from previous floating-point operations
(in which case they should not be cleared), the processor can also set
the sticky bits itself before the interrupt for an exception occurs,
and this can happen in cases when IEEE 754 semantics are that the
sticky bit should not be set.  Specifically, the "invalid" sticky bit
is set in various cases with non-finite operands, where IEEE 754
semantics do not involve raising such an exception, and the
"underflow" sticky bit is set in cases of exact underflow, whereas
IEEE 754 semantics are that this flag is set only for inexact
underflow.  Thus, for correct emulation the kernel needs to know the
setting of these two sticky bits before the instruction being
emulated.

When a floating-point operation raises an exception, the kernel can
note the state of the sticky bits immediately afterwards.  Some
<fenv.h> functions that affect the state of these bits, such as
fesetenv and feholdexcept, need to use prctl with PR_GET_FPEXC and
PR_SET_FPEXC anyway, and so it is natural to record the state of those
bits during that call into the kernel and so avoid any need for a
separate call into the kernel to inform it of a change to those bits.
Thus, the interface I chose to use (in this patch and the glibc port
on which I am working; the old EGLIBC port, based on Aldy's SPE
add-on, has many deficiencies in its <fenv.h> functions
implementation, so that they do not interact correctly with any Linux
kernel version) is that one of those prctl calls must be made after
any userspace change to those sticky bits, other than through a
floating-point operation that traps into the kernel anyway.
feclearexcept and fesetexceptflag duly make those calls, which would
not be required were it not for this issue.

Signed-off-by: Joseph Myers <joseph@codesourcery.com>

---

diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index ce4de5a..0b02e23 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -237,6 +237,8 @@ struct thread_struct {
 	unsigned long	evr[32];	/* upper 32-bits of SPE regs */
 	u64		acc;		/* Accumulator */
 	unsigned long	spefscr;	/* SPE & eFP status */
+	unsigned long	spefscr_last;	/* SPEFSCR value on last prctl
+					   call or trap return */
 	int		used_spe;	/* set if process has used spe */
 #endif /* CONFIG_SPE */
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
@@ -303,7 +305,9 @@ struct thread_struct {
 	(_ALIGN_UP(sizeof(init_thread_info), 16) + (unsigned long) &init_stack)

 #ifdef CONFIG_SPE
-#define SPEFSCR_INIT .spefscr = SPEFSCR_FINVE | SPEFSCR_FDBZE | SPEFSCR_FUNFE | SPEFSCR_FOVFE,
+#define SPEFSCR_INIT \
+	.spefscr = SPEFSCR_FINVE | SPEFSCR_FDBZE | SPEFSCR_FUNFE | SPEFSCR_FOVFE, \
+	.spefscr_last = SPEFSCR_FINVE | SPEFSCR_FDBZE | SPEFSCR_FUNFE | SPEFSCR_FOVFE,
 #else
 #define SPEFSCR_INIT
 #endif
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 96d2fdf..e3b91f1 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1151,6 +1151,7 @@ int set_fpexc_mode(struct task_struct *tsk, unsigned int val)
 	if (val & PR_FP_EXC_SW_ENABLE) {
 #ifdef CONFIG_SPE
 		if (cpu_has_feature(CPU_FTR_SPE)) {
+			tsk->thread.spefscr_last = mfspr(SPRN_SPEFSCR);
 			tsk->thread.fpexc_mode = val &
 				(PR_FP_EXC_SW_ENABLE | PR_FP_ALL_EXCEPT);
 			return 0;
@@ -1182,9 +1183,10 @@ int get_fpexc_mode(struct task_struct *tsk, unsigned long adr)

 	if (tsk->thread.fpexc_mode & PR_FP_EXC_SW_ENABLE)
 #ifdef CONFIG_SPE
-		if (cpu_has_feature(CPU_FTR_SPE))
+		if (cpu_has_feature(CPU_FTR_SPE)) {
+			tsk->thread.spefscr_last = mfspr(SPRN_SPEFSCR);
 			val = tsk->thread.fpexc_mode;
-		else
+		} else
 			return -EINVAL;
 #else
 		return -EINVAL;
diff --git a/arch/powerpc/math-emu/math_efp.c b/arch/powerpc/math-emu/math_efp.c
index a73f088..59835c6 100644
--- a/arch/powerpc/math-emu/math_efp.c
+++ b/arch/powerpc/math-emu/math_efp.c
@@ -630,9 +630,27 @@ update_ccr:
 	regs->ccr |= (IR << ((7 - ((speinsn >> 23) & 0x7)) << 2));

 update_regs:
-	__FPU_FPSCR &= ~FP_EX_MASK;
+	/*
+	 * If the "invalid" exception sticky bit was set by the
+	 * processor for non-finite input, but was not set before the
+	 * instruction being emulated, clear it.  Likewise for the
+	 * "underflow" bit, which may have been set by the processor
+	 * for exact underflow, not just inexact underflow when the
+	 * flag should be set for IEEE 754 semantics.  Other sticky
+	 * exceptions will only be set by the processor when they are
+	 * correct according to IEEE 754 semantics, and we must not
+	 * clear sticky bits that were already set before the emulated
+	 * instruction as they represent the user-visible sticky
+	 * exception status.  "inexact" traps to kernel are not
+	 * required for IEEE semantics and are not enabled by default,
+	 * so the "inexact" sticky bit may have been set by a previous
+	 * instruction without the kernel being aware of it.
+	 */
+	__FPU_FPSCR
+	  &= ~(FP_EX_INVALID | FP_EX_UNDERFLOW) | current->thread.spefscr_last;
 	__FPU_FPSCR |= (FP_CUR_EXCEPTIONS & FP_EX_MASK);
 	mtspr(SPRN_SPEFSCR, __FPU_FPSCR);
+	current->thread.spefscr_last = __FPU_FPSCR;

 	current->thread.evr[fc] = vc.wp[0];
 	regs->gpr[fc] = vc.wp[1];

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply related

* Re: [PATCH] kvm: powerpc: book3s: Fix build break for BOOK3S_32
From: Paul Mackerras @ 2013-10-04 23:45 UTC (permalink / raw)
  To: Alexander Graf; +Cc: linuxppc-dev, Aneesh Kumar K.V, kvm-ppc, kvm
In-Reply-To: <1945837D-825F-4509-9D7C-D839917863A9@suse.de>

On Fri, Oct 04, 2013 at 03:00:11PM +0200, Alexander Graf wrote:
> 
> On 04.10.2013, at 14:35, Paul Mackerras wrote:
> 
> > On Fri, Oct 04, 2013 at 02:27:02PM +0200, Alexander Graf wrote:
> >> 
> >> On 04.10.2013, at 14:23, Alexander Graf wrote:
> >> 
> >>> 
> >>> On 03.10.2013, at 06:14, Paul Mackerras wrote:
> >>> 
> >>>> On Wed, Oct 02, 2013 at 08:08:44PM +0530, Aneesh Kumar K.V wrote:
> >>>>> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> >>>>> 
> >>>>> This was introduced by 85a0d845d8bb5df5d2669416212f56cbe1474c6b
> >>>> 
> >>>> It's a good idea to give the headline of the commit as well as the ID.
> >>>> I also like to trim the ID to 10 characters or so.  So it should look
> >>>> like this:
> >>>> 
> >>>> This was introduced by 85a0d845d8 ("KVM: PPC: Book3S PR: Allocate
> >>>> kvm_vcpu structs from kvm_vcpu_cache").
> >>>> 
> >>>>> arch/powerpc/kvm/book3s_pr.c: In function 'kvmppc_core_vcpu_create':
> >>>>> arch/powerpc/kvm/book3s_pr.c:1182:30: error: 'struct kvmppc_vcpu_book3s' has no member named 'shadow_vcpu'
> >>>>> make[1]: *** [arch/powerpc/kvm/book3s_pr.o] Error 1
> >>>>> 
> >>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> >>>> 
> >>>> Acked-by: Paul Mackerras <paulus@samba.org>
> >>> 
> >>> Would you guys mind if I merge this into the offending patch? It's not trickled into -next yet, so rebasing should work.
> >>> 
> >>> If not, please resend with the fixed commit message.
> >> 
> >> Eh - I must've missed v2 :). So that leaves only the question on whether you'd be ok to squash the patch instead. It'd help bisectability.
> > 
> > I'm OK with that.  If you do, why don't you squash the first of the
> > two patches that I just sent into the commit it fixes as well?
> 
> Because patch 1/2 spans two separate commits it would have to get squashed into (6aa82e, 70afec) and patch 2/2 doesn't make sense to get squashed anywhere :).

Actually 70afec is fine, if you look at it, it's only 6aa82e that
needs fixing.

Paul.

^ permalink raw reply

* [PATCH] powerpc: fix e500 SPE float rounding inexactness detection
From: Joseph S. Myers @ 2013-10-04 22:51 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: linux-kernel

From: Joseph Myers <joseph@codesourcery.com>

The e500 SPE floating-point emulation code for the rounding modes
rounding to positive or negative infinity (which may not be
implemented in hardware) tries to avoid emulating rounding if the
result was inexact.  However, it tests inexactness using the sticky
bit with the cumulative result of previous operations, rather than
with the non-sticky bits relating to the operation that generated the
interrupt.  Furthermore, when a vector operation generates the
interrupt, it's possible that only one of the low and high parts is
inexact, and so only that part should have rounding emulated.  This
results in incorrect rounding of exact results in these modes when the
sticky bit is set from a previous operation.

(I'm not sure why the rounding interrupts are generated at all when
the result is exact, but empirically the hardware does generate them.)

This patch checks for inexactness using the correct bits of SPEFSCR,
and ensures that rounding only occurs when the relevant part of the
result was actually inexact.

Signed-off-by: Joseph Myers <joseph@codesourcery.com>

---

diff --git a/arch/powerpc/math-emu/math_efp.c b/arch/powerpc/math-emu/math_efp.c
index a73f088..ecdf35d 100644
--- a/arch/powerpc/math-emu/math_efp.c
+++ b/arch/powerpc/math-emu/math_efp.c
@@ -662,7 +680,8 @@ int speround_handler(struct pt_regs *regs)
 {
 	union dw_union fgpr;
 	int s_lo, s_hi;
-	unsigned long speinsn, type, fc;
+	int lo_inexact, hi_inexact;
+	unsigned long speinsn, type, fc, fptype;
 
 	if (get_user(speinsn, (unsigned int __user *) regs->nip))
 		return -EFAULT;
@@ -675,8 +694,12 @@ int speround_handler(struct pt_regs *regs)
 	__FPU_FPSCR = mfspr(SPRN_SPEFSCR);
 	pr_debug("speinsn:%08lx spefscr:%08lx\n", speinsn, __FPU_FPSCR);
 
+	fptype = (speinsn >> 5) & 0x7;
+
 	/* No need to round if the result is exact */
-	if (!(__FPU_FPSCR & FP_EX_INEXACT))
+	lo_inexact = __FPU_FPSCR & (SPEFSCR_FG | SPEFSCR_FX);
+	hi_inexact = __FPU_FPSCR & (SPEFSCR_FGH | SPEFSCR_FXH);
+	if (!(lo_inexact || (hi_inexact && fptype == VCT)))
 		return 0;
 
 	fc = (speinsn >> 21) & 0x1f;
@@ -687,7 +710,7 @@ int speround_handler(struct pt_regs *regs)
 
 	pr_debug("round fgpr: %08x  %08x\n", fgpr.wp[0], fgpr.wp[1]);
 
-	switch ((speinsn >> 5) & 0x7) {
+	switch (fptype) {
 	/* Since SPE instructions on E500 core can handle round to nearest
 	 * and round toward zero with IEEE-754 complied, we just need
 	 * to handle round toward +Inf and round toward -Inf by software.
@@ -710,11 +733,15 @@ int speround_handler(struct pt_regs *regs)
 
 	case VCT:
 		if (FP_ROUNDMODE == FP_RND_PINF) {
-			if (!s_lo) fgpr.wp[1]++; /* Z_low > 0, choose Z1 */
-			if (!s_hi) fgpr.wp[0]++; /* Z_high word > 0, choose Z1 */
+			if (lo_inexact && !s_lo)
+				fgpr.wp[1]++; /* Z_low > 0, choose Z1 */
+			if (hi_inexact && !s_hi)
+				fgpr.wp[0]++; /* Z_high word > 0, choose Z1 */
 		} else { /* round to -Inf */
-			if (s_lo) fgpr.wp[1]++; /* Z_low < 0, choose Z2 */
-			if (s_hi) fgpr.wp[0]++; /* Z_high < 0, choose Z2 */
+			if (lo_inexact && s_lo)
+				fgpr.wp[1]++; /* Z_low < 0, choose Z2 */
+			if (hi_inexact && s_hi)
+				fgpr.wp[0]++; /* Z_high < 0, choose Z2 */
 		}
 		break;
 

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply related

* Re: [PATCH RFC 00/77] Re-design MSI/MSI-X interrupts enablement pattern
From: Ben Hutchings @ 2013-10-04 21:29 UTC (permalink / raw)
  To: Alexander Gordeev
  Cc: linux-mips, VMware, Inc., linux-nvme, linux-ide, linux-s390,
	Andy King, linux-scsi, linux-rdma, x86, Ingo Molnar, linux-pci,
	iss_storagedev, linux-driver, Tejun Heo, Bjorn Helgaas,
	Dan Williams, Jon Mason, Solarflare linux maintainers, netdev,
	linux-kernel, Ralf Baechle, e1000-devel, Martin Schwidefsky,
	linux390, linuxppc-dev
In-Reply-To: <20131004082920.GA4536@dhcp-26-207.brq.redhat.com>

On Fri, 2013-10-04 at 10:29 +0200, Alexander Gordeev wrote:
> On Thu, Oct 03, 2013 at 11:49:45PM +0100, Ben Hutchings wrote:
> > On Wed, 2013-10-02 at 12:48 +0200, Alexander Gordeev wrote:
> > > This update converts pci_enable_msix() and pci_enable_msi_block()
> > > interfaces to canonical kernel functions and makes them return a
> > > error code in case of failure or 0 in case of success.
> > [...]
> > 
> > I think this is fundamentally flawed: pci_msix_table_size() and
> > pci_get_msi_cap() can only report the limits of the *device* (which the
> > driver usually already knows), whereas MSI allocation can also be
> > constrained due to *global* limits on the number of distinct IRQs.
> 
> Even the current implementation by no means addresses it. Although it
> might seem a case for architectures to report the number of IRQs available
> for a driver to retry, in fact they all just fail. The same applies to
> *any* other type of resource involved: irq_desc's, CPU interrupt vector
> space, msi_desc's etc. No platform cares about it and just bails out once
> a constrain met (please correct me if I am wrong here). Given that Linux
> has been doing well even on embedded I think we should not change it.
>
> The only exception to the above is pSeries platform which takes advantage
> of the current design (to implement MSI quota). There are indications we
> can satisfy pSeries requirements, but the design proposed in this RFC
> is not going to change drastically anyway. The start of the discusstion
> is here: https://lkml.org/lkml/2013/9/5/293

All I can see there is that Tejun didn't think that the global limits
and positive return values were implemented by any architecture.  But
you have a counter-example, so I'm not sure what your point is.

It has been quite a while since I saw this happen on x86.  But I just
checked on a test system running RHEL 5 i386 (Linux 2.6.18).  If I ask
for 16 MSI-X vectors on a device that supports 1024, the return value is
8, and indeed I can then successfully allocate 8.

Now that's going quite a way back, and it may be that global limits
aren't a significant problem any more.  With the x86_64 build of RHEL 5
on an identical system, I can allocate 16 or even 32, so this is
apparently not a hardware limit in this case.

> > Currently pci_enable_msix() will report a positive value if it fails due
> > to the global limit.  Your patch 7 removes that.  pci_enable_msi_block()
> > unfortunately doesn't appear to do this.
> 
> pci_enable_msi_block() can do more than one MSI only on x86 (with IOMMU),
> but it does not bother to return positive numbers, indeed.
> 
> > It seems to me that a more useful interface would take a minimum and
> > maximum number of vectors from the driver.  This wouldn't allow the
> > driver to specify that it could only accept, say, any even number within
> > a certain range, but you could still leave the current functions
> > available for any driver that needs that.
> 
> Mmmm.. I am not sure I am getting it. Could you please rephrase?

Most drivers seem to either:
(a) require exactly a certain number of MSI vectors, or
(b) require a minimum number of MSI vectors, usually want to allocate
more, and work with any number in between

We can support drivers in both classes by adding new allocation
functions that allow specifying a minimum (required) and maximum
(wanted) number of MSI vectors.  Those in class (a) would just specify
the same value for both.  These new functions can take account of any
global limit or allocation policy without any further changes to the
drivers that use them.

The few drivers with more specific requirements would still need to
implement the currently recommended loop, using the old allocation
functions.

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* Re: [PATCH 2/7] iommu: add api to get iommu_domain of a device
From: Alex Williamson @ 2013-10-04 18:12 UTC (permalink / raw)
  To: Bhushan Bharat-R65777
  Cc: agraf@suse.de, Wood Scott-B07421, linux-pci@vger.kernel.org,
	joro@8bytes.org, linux-kernel@vger.kernel.org,
	iommu@lists.linux-foundation.org, linuxppc-dev@lists.ozlabs.org
In-Reply-To: <6A3DF150A5B70D4F9B66A25E3F7C888D07191143@039-SN2MPN1-011.039d.mgd.msft.net>

On Fri, 2013-10-04 at 17:23 +0000, Bhushan Bharat-R65777 wrote:
> 
> > -----Original Message-----
> > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > Sent: Friday, October 04, 2013 10:43 PM
> > To: Bhushan Bharat-R65777
> > Cc: joro@8bytes.org; benh@kernel.crashing.org; galak@kernel.crashing.org; linux-
> > kernel@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; linux-
> > pci@vger.kernel.org; agraf@suse.de; Wood Scott-B07421; iommu@lists.linux-
> > foundation.org
> > Subject: Re: [PATCH 2/7] iommu: add api to get iommu_domain of a device
> > 
> > On Fri, 2013-10-04 at 16:47 +0000, Bhushan Bharat-R65777 wrote:
> > >
> > > > -----Original Message-----
> > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > Sent: Friday, October 04, 2013 9:15 PM
> > > > To: Bhushan Bharat-R65777
> > > > Cc: joro@8bytes.org; benh@kernel.crashing.org;
> > > > galak@kernel.crashing.org; linux- kernel@vger.kernel.org;
> > > > linuxppc-dev@lists.ozlabs.org; linux- pci@vger.kernel.org;
> > > > agraf@suse.de; Wood Scott-B07421; iommu@lists.linux- foundation.org
> > > > Subject: Re: [PATCH 2/7] iommu: add api to get iommu_domain of a
> > > > device
> > > >
> > > > On Fri, 2013-10-04 at 09:54 +0000, Bhushan Bharat-R65777 wrote:
> > > > >
> > > > > > -----Original Message-----
> > > > > > From: linux-pci-owner@vger.kernel.org
> > > > > > [mailto:linux-pci-owner@vger.kernel.org]
> > > > > > On Behalf Of Alex Williamson
> > > > > > Sent: Wednesday, September 25, 2013 10:16 PM
> > > > > > To: Bhushan Bharat-R65777
> > > > > > Cc: joro@8bytes.org; benh@kernel.crashing.org;
> > > > > > galak@kernel.crashing.org; linux- kernel@vger.kernel.org;
> > > > > > linuxppc-dev@lists.ozlabs.org; linux- pci@vger.kernel.org;
> > > > > > agraf@suse.de; Wood Scott-B07421; iommu@lists.linux-
> > > > > > foundation.org; Bhushan Bharat-R65777
> > > > > > Subject: Re: [PATCH 2/7] iommu: add api to get iommu_domain of a
> > > > > > device
> > > > > >
> > > > > > On Thu, 2013-09-19 at 12:59 +0530, Bharat Bhushan wrote:
> > > > > > > This api return the iommu domain to which the device is attached.
> > > > > > > The iommu_domain is required for making API calls related to iommu.
> > > > > > > Follow up patches which use this API to know iommu maping.
> > > > > > >
> > > > > > > Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
> > > > > > > ---
> > > > > > >  drivers/iommu/iommu.c |   10 ++++++++++
> > > > > > >  include/linux/iommu.h |    7 +++++++
> > > > > > >  2 files changed, 17 insertions(+), 0 deletions(-)
> > > > > > >
> > > > > > > diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> > > > > > > index
> > > > > > > fbe9ca7..6ac5f50 100644
> > > > > > > --- a/drivers/iommu/iommu.c
> > > > > > > +++ b/drivers/iommu/iommu.c
> > > > > > > @@ -696,6 +696,16 @@ void iommu_detach_device(struct
> > > > > > > iommu_domain *domain, struct device *dev)  }
> > > > > > > EXPORT_SYMBOL_GPL(iommu_detach_device);
> > > > > > >
> > > > > > > +struct iommu_domain *iommu_get_dev_domain(struct device *dev) {
> > > > > > > +	struct iommu_ops *ops = dev->bus->iommu_ops;
> > > > > > > +
> > > > > > > +	if (unlikely(ops == NULL || ops->get_dev_iommu_domain == NULL))
> > > > > > > +		return NULL;
> > > > > > > +
> > > > > > > +	return ops->get_dev_iommu_domain(dev); }
> > > > > > > +EXPORT_SYMBOL_GPL(iommu_get_dev_domain);
> > > > > >
> > > > > > What prevents this from racing iommu_domain_free()?  There's no
> > > > > > references acquired, so there's no reason for the caller to
> > > > > > assume the
> > > > pointer is valid.
> > > > >
> > > > > Sorry for late query, somehow this email went into a folder and
> > > > > escaped;
> > > > >
> > > > > Just to be sure, there is not lock at generic "struct
> > > > > iommu_domain", but IP
> > > > specific structure (link FSL domain) linked in iommu_domain->priv
> > > > have a lock, so we need to ensure this race in FSL iommu code (say
> > > > drivers/iommu/fsl_pamu_domain.c), right?
> > > >
> > > > No, it's not sufficient to make sure that your use of the interface
> > > > is race free.  The interface itself needs to be designed so that
> > > > it's difficult to use incorrectly.
> > >
> > > So we can define iommu_get_dev_domain()/iommu_put_dev_domain();
> > > iommu_get_dev_domain() will return domain with the lock held, and
> > > iommu_put_dev_domain() will release the lock? And
> > > iommu_get_dev_domain() must always be followed by
> > > iommu_get_dev_domain().
> > 
> > What lock?  get/put are generally used for reference counting, not locking in
> > the kernel.
> > 
> > > > That's not the case here.  This is a backdoor to get the iommu
> > > > domain from the iommu driver regardless of who is using it or how.
> > > > The iommu domain is created and managed by vfio, so shouldn't we be
> > > > looking at how to do this through vfio?
> > >
> > > Let me first describe what we are doing here:
> > > During initialization:-
> > >  - vfio talks to MSI system to know the MSI-page and size
> > >  - vfio then interacts with iommu to map the MSI-page in iommu (IOVA
> > > is decided by userspace and physical address is the MSI-page)
> > >  - So the IOVA subwindow mapping is created in iommu and yes VFIO know about
> > this mapping.
> > >
> > > Now do SET_IRQ(MSI/MSIX) ioctl:
> > >  - calls pci_enable_msix()/pci_enable_msi_block(): which is supposed to set
> > MSI address/data in device.
> > >  - So in current implementation (this patchset) msi-subsystem gets the IOVA
> > from iommu via this defined interface.
> > >  - Are you saying that rather than getting this from iommu, we should get this
> > from vfio? What difference does this make?
> > 
> > Yes, you just said above that vfio knows the msi to iova mapping, so why go
> > outside of vfio to find it later?  The difference is one case you can have a
> > proper reference to data structures to make sure the pointer you get back
> > actually has meaning at the time you're using it vs the code here where you're
> > defining an API that returns a meaningless value
> 
> With FSL-PAMU we will always get consistant data from iommu or vfio-data structure.

Great, but you're trying to add a generic API to the IOMMU subsystem
that's difficult to use correctly.  The fact that you use it correctly
does not justify the API.

> > because you can't check or
> > enforce that an arbitrary caller is using it correctly.
> 
> I am not sure what is arbitrary caller? pdev is known to vfio, so vfio
> will only make pci_enable_msix()/pci_enable_msi_block() for this pdev.
> If anyother code makes then it is some other unexpectedly thing
> happening in system, no?

What's proposed here is a generic IOMMU API.  Anybody can call this.
What if the host SCSI driver decides to go get the iommu domain for it's
device (or any other device)?  Does that fit your usage model?

> >  It's not maintainable.
> > Thanks,
> 
> I do not have any issue with this as well, can you also describe the type of API you are envisioning;
> I can think of defining some function in vfio.c/vfio_iommu*.c, make them global and declare then in include/Linux/vfio.h
> And include <Linux/vfio.h> in caller file (arch/powerpc/kernel/msi.c)

Do you really want module dependencies between vfio and your core kernel
MSI setup?  Look at the vfio external user interface that we've already
defined.  That allows other components of the kernel to get a proper
reference to a vfio group.  From there you can work out how to get what
you want.  Another alternative is that vfio could register an MSI to
IOVA mapping with architecture code when the mapping is created.  The
MSI setup path could then do a lookup in architecture code for the
mapping.  You could even store the MSI to IOVA mapping in VFIO and
create an interface where SET_IRQ passes that mapping into setup code.
Thanks,

Alex

^ permalink raw reply

* Re: [PATCH v3 net-next] fix unsafe set_memory_rw from softirq
From: Ingo Molnar @ 2013-10-04 18:00 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Heiko Carstens, Eric Dumazet, Paul Mackerras, H. Peter Anvin,
	sparclinux, Nicolas Dichtel, linux-s390, Russell King, x86,
	James Morris, Ingo Molnar, Alexey Kuznetsov, Paul E. McKenney,
	Xi Wang, Matt Evans, Thomas Gleixner, linux-arm-kernel,
	Stelian Nirlu, Nicolas Schichan, Hideaki YOSHIFUJI, netdev,
	linux-kernel, David S. Miller, Mircea Gherzan, Daniel Borkmann,
	Martin Schwidefsky, linux390, linuxppc-dev, Patrick McHardy
In-Reply-To: <CAMEtUuy0mpdUHJjztK31mHjNVahnoZ8CHo2sLLyJmt=p5+gq0w@mail.gmail.com>


* Alexei Starovoitov <ast@plumgrid.com> wrote:

> >>  #else
> >> +#include <linux/slab.h>
> >
> > Inlines in the middle of header files are generally frowned upon.
> >
> > The standard pattern is to put them at the top, that way it's easier to
> > see the dependencies and there's also less .config dependent inclusion,
> > which makes header hell cleanup work easier.
> 
> Agree. I only followed the style that is already in filter.h 20 lines above.
>
> #ifdef CONFIG_BPF_JIT
> #include <stdarg.h>
> #include <linux/linkage.h>
> #include <linux/printk.h>
> 
> as part of the cleanup can move all of them to the top. In the separate commit?

Yeah, sure, that's fine.

> >>       struct sk_filter *fp;
> >>       unsigned int fsize = sizeof(struct sock_filter) * fprog->len;
> >> +     unsigned int sk_fsize = max_t(u32, fsize, sizeof(struct work_struct))
> >> +             + sizeof(*fp);
> >
> > Using the structure definition I suggested, this could be replaced with
> > the more obvious:
> >
> >         unsigned int sk_fsize = max(fsize, sizeof(*fp));
> 
> with helper function it's even cleaner. Fixed in V4

So my thought was that the helper function is perhaps too trivial and 
somewhat obscures the allocation pattern, but yeah. Either way is fine 
with me.

> > A couple of questions/suggestions:
> >
> > 1)
> >
> > I took a brief look at arch/x86/net/bpf_jit_comp.c while reviewing this
> > patch.
> >
> > You need to split up bpf_jit_compile(), it's an obscenely large, ~600
> > lines long function. We don't do that in modern, maintainable kernel code.
> 
> I had the same thought, therefore in my proposed generalization of bpf:
> http://patchwork.ozlabs.org/patch/279280/
> It is split into two. do_jit() is still a bit large at 400 lines. Can
> split it further.

Yeah, I think as long as you split out the loop iterator into a separate 
function it gets much better.

The actual instruction generation code within the iterator looks good in a 
single chunk - splitting it up further than that might in fact make it 
less readable.

> > 3)
> >
> > It's nice code altogether! Are there any plans to generalize its 
> > interfaces, to allow arbitrary bytecode to be used by other kernel 
> > subsystems as well? In particular tracing filters could make use of 
> > it, but it would also allow safe probe points.
> 
> That was exactly the reasons to generalize bpf as I proposed.

Ok, cool :-)

For the x86 bits:

Acked-by: Ingo Molnar <mingo@kernel.org>

Thanks,

	Ingo

^ permalink raw reply

* Re: [PATCH v3 net-next] fix unsafe set_memory_rw from softirq
From: Alexei Starovoitov @ 2013-10-04 17:49 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Heiko Carstens, Eric Dumazet, Paul Mackerras, H. Peter Anvin,
	sparclinux, Nicolas Dichtel, linux-s390, Russell King, x86,
	James Morris, Ingo Molnar, Alexey Kuznetsov, Paul E. McKenney,
	Xi Wang, Matt Evans, Thomas Gleixner, linux-arm-kernel,
	Stelian Nirlu, Nicolas Schichan, Hideaki YOSHIFUJI, netdev,
	linux-kernel, David S. Miller, Mircea Gherzan, Daniel Borkmann,
	Martin Schwidefsky, linux390, linuxppc-dev, Patrick McHardy
In-Reply-To: <20131004075133.GA12313@gmail.com>

On Fri, Oct 4, 2013 at 12:51 AM, Ingo Molnar <mingo@kernel.org> wrote:
>> +static void bpf_jit_free_deferred(struct work_struct *work)
>> +{
>> +     struct sk_filter *fp = container_of((void *)work, struct sk_filter,
>> +                                         insns);
>> +     unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK;
>> +     struct bpf_binary_header *header = (void *)addr;
>> +
>> +     set_memory_rw(addr, header->pages);
>> +     module_free(NULL, header);
>> +     kfree(fp);
>> +}
>
> Using the data type suggestions I make further below, this could be
> written in a simpler form, as:
>
>         struct sk_filter *fp = container_of(work, struct sk_filter, work);

yes. I've made it already as part of V4

> Also, a question, why do you mask with PAGE_MASK here:
>
>         unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK;
>
> ?
>
> AFAICS bpf_func is the module_alloc() result - and module code is page
> aligned. So ->bpf_func is always page aligned here. (The sk_run_filter
> special case cannot happen here.)

randomization of bpf_func start is a prevention of jit spraying
attacks as Eric pointed out.

>>  void bpf_jit_free(struct sk_filter *fp)
>>  {
>>       if (fp->bpf_func != sk_run_filter) {
>> -             unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK;
>> -             struct bpf_binary_header *header = (void *)addr;
>> -
>> -             set_memory_rw(addr, header->pages);
>> -             module_free(NULL, header);
>> +             struct work_struct *work = (struct work_struct *)fp->insns;
>> +             INIT_WORK(work, bpf_jit_free_deferred);
>
> Missing newline between local variables and statements.

yes. noted and fixed in V4.

>>       unsigned int            (*bpf_func)(const struct sk_buff *skb,
>>                                           const struct sock_filter *filter);
>> -     struct rcu_head         rcu;
>> +     /* insns start right after bpf_func, so that sk_run_filter() fetches
>> +      * first insn from the same cache line that was used to call into
>> +      * sk_run_filter()
>> +      */
>>       struct sock_filter      insns[0];
>
> Please use the customary (multi-line) comment style:
>
>   /*
>    * Comment .....
>    * ...... goes here.
>    */
>
> specified in Documentation/CodingStyle.

I believe filter.h belongs to networking comment style, that's why
checkpatch didn't complain.
But I removed the comment in V4

>>  static inline unsigned int sk_filter_len(const struct sk_filter *fp)
>>  {
>> -     return fp->len * sizeof(struct sock_filter) + sizeof(*fp);
>> +     return max(fp->len * sizeof(struct sock_filter),
>> +                sizeof(struct work_struct)) + sizeof(*fp);
>
> So, "sizeof(struct work_struct)) + sizeof(*fp)" is a pattern that repeats
> a couple of times. Might make sense to stick that into a helper of its own
> - but in general this open coded overlay allocation method looks a bit
> fragile and not very obvious in isolation.
>
> So it could be done a bit cleaner, using an anonymous union:
>
>         /*
>          * These two overlay, the work struct is used during workqueue
>          * driven teardown, when the instructions are not used anymore:
>          */
>         union {
>                 struct sock_filter      insns[0];
>                 struct work_struct      work;
>         };
>
> And then all the sizeof() calculations become obvious and sk_filter_len()
> could be eliminated - I've marked the conversions in the code further
> below.

Eric made exactly the same comment. Agreed and fixed in V4

>>  #else
>> +#include <linux/slab.h>
>
> Inlines in the middle of header files are generally frowned upon.
>
> The standard pattern is to put them at the top, that way it's easier to
> see the dependencies and there's also less .config dependent inclusion,
> which makes header hell cleanup work easier.

Agree. I only followed the style that is already in filter.h 20 lines above.
#ifdef CONFIG_BPF_JIT
#include <stdarg.h>
#include <linux/linkage.h>
#include <linux/printk.h>

as part of the cleanup can move all of them to the top. In the separate commit?

>>       struct sk_filter *fp;
>>       unsigned int fsize = sizeof(struct sock_filter) * fprog->len;
>> +     unsigned int sk_fsize = max_t(u32, fsize, sizeof(struct work_struct))
>> +             + sizeof(*fp);
>
> Using the structure definition I suggested, this could be replaced with
> the more obvious:
>
>         unsigned int sk_fsize = max(fsize, sizeof(*fp));

with helper function it's even cleaner. Fixed in V4

> A couple of questions/suggestions:
>
> 1)
>
> I took a brief look at arch/x86/net/bpf_jit_comp.c while reviewing this
> patch.
>
> You need to split up bpf_jit_compile(), it's an obscenely large, ~600
> lines long function. We don't do that in modern, maintainable kernel code.

I had the same thought, therefore in my proposed generalization of bpf:
http://patchwork.ozlabs.org/patch/279280/
It is split into two. do_jit() is still a bit large at 400 lines. Can
split it further.

> 3)
>
> It's nice code altogether! Are there any plans to generalize its
> interfaces, to allow arbitrary bytecode to be used by other kernel
> subsystems as well? In particular tracing filters could make use of it,
> but it would also allow safe probe points.

That was exactly the reasons to generalize bpf as I proposed.
"extended BPF is a set of pseudo instructions that stitch kernel provided
data in the form of bpf_context with kernel provided set of functions in a safe
and deterministic way with minimal performance overhead vs native code"
Not sure what 'tracing filters' you have in mind, but I'm happy to try
them with extended bpf model.
imo existing bpf with two registers is too limiting for performance
and argument passing.
That's why going to 10 made it a lot more flexible and generically usable.
Sorry to hijack the thread.

Thanks
Alexei

^ permalink raw reply

* Re: [PATCH v3 net-next] fix unsafe set_memory_rw from softirq
From: Ingo Molnar @ 2013-10-04 17:35 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Heiko Carstens, Paul Mackerras, H. Peter Anvin, sparclinux,
	Nicolas Dichtel, Alexei Starovoitov, linux-s390, Russell King,
	x86, James Morris, Ingo Molnar, Alexey Kuznetsov,
	Paul E. McKenney, Xi Wang, Matt Evans, Thomas Gleixner,
	linux-arm-kernel, Stelian Nirlu, Nicolas Schichan,
	Hideaki YOSHIFUJI, netdev, linux-kernel, David S. Miller,
	Mircea Gherzan, Daniel Borkmann, Martin Schwidefsky, linux390,
	linuxppc-dev, Patrick McHardy
In-Reply-To: <CANn89iKkbR0_HaofvC_OVvqRv_Hqj3rATx-Z_4xXeusOasa56g@mail.gmail.com>


* Eric Dumazet <edumazet@google.com> wrote:

> 1)
> >
> > I took a brief look at arch/x86/net/bpf_jit_comp.c while reviewing this
> > patch.
> >
> > You need to split up bpf_jit_compile(), it's an obscenely large, ~600
> > lines long function. We don't do that in modern, maintainable kernel code.
> >
> > 2)
> >
> > This 128 bytes extra padding:
> >
> >         /* Most of BPF filters are really small,
> >          * but if some of them fill a page, allow at least
> >          * 128 extra bytes to insert a random section of int3
> >          */
> >         sz = round_up(proglen + sizeof(*header) + 128, PAGE_SIZE);
> >
> > why is it done? It's not clear to me from the comment.
> >
> 
> commit 314beb9bcabfd6b4542ccbced2402af2c6f6142a
> Author: Eric Dumazet <edumazet@google.com>
> Date:   Fri May 17 16:37:03 2013 +0000
> 
>     x86: bpf_jit_comp: secure bpf jit against spraying attacks
> 
>     hpa bringed into my attention some security related issues
>     with BPF JIT on x86.
> 
>     This patch makes sure the bpf generated code is marked read only,
>     as other kernel text sections.
> 
>     It also splits the unused space (we vmalloc() and only use a fraction of
>     the page) in two parts, so that the generated bpf code not starts at a
>     known offset in the page, but a pseudo random one.

Thanks for the explanation - that makes sense.

	Ingo

^ permalink raw reply

* RE: [PATCH 2/7] iommu: add api to get iommu_domain of a device
From: Bhushan Bharat-R65777 @ 2013-10-04 17:23 UTC (permalink / raw)
  To: Alex Williamson
  Cc: agraf@suse.de, Wood Scott-B07421, linux-pci@vger.kernel.org,
	joro@8bytes.org, linux-kernel@vger.kernel.org,
	iommu@lists.linux-foundation.org, linuxppc-dev@lists.ozlabs.org
In-Reply-To: <1380906772.25705.19.camel@ul30vt.home>

DQoNCj4gLS0tLS1PcmlnaW5hbCBNZXNzYWdlLS0tLS0NCj4gRnJvbTogQWxleCBXaWxsaWFtc29u
IFttYWlsdG86YWxleC53aWxsaWFtc29uQHJlZGhhdC5jb21dDQo+IFNlbnQ6IEZyaWRheSwgT2N0
b2JlciAwNCwgMjAxMyAxMDo0MyBQTQ0KPiBUbzogQmh1c2hhbiBCaGFyYXQtUjY1Nzc3DQo+IENj
OiBqb3JvQDhieXRlcy5vcmc7IGJlbmhAa2VybmVsLmNyYXNoaW5nLm9yZzsgZ2FsYWtAa2VybmVs
LmNyYXNoaW5nLm9yZzsgbGludXgtDQo+IGtlcm5lbEB2Z2VyLmtlcm5lbC5vcmc7IGxpbnV4cHBj
LWRldkBsaXN0cy5vemxhYnMub3JnOyBsaW51eC0NCj4gcGNpQHZnZXIua2VybmVsLm9yZzsgYWdy
YWZAc3VzZS5kZTsgV29vZCBTY290dC1CMDc0MjE7IGlvbW11QGxpc3RzLmxpbnV4LQ0KPiBmb3Vu
ZGF0aW9uLm9yZw0KPiBTdWJqZWN0OiBSZTogW1BBVENIIDIvN10gaW9tbXU6IGFkZCBhcGkgdG8g
Z2V0IGlvbW11X2RvbWFpbiBvZiBhIGRldmljZQ0KPiANCj4gT24gRnJpLCAyMDEzLTEwLTA0IGF0
IDE2OjQ3ICswMDAwLCBCaHVzaGFuIEJoYXJhdC1SNjU3Nzcgd3JvdGU6DQo+ID4NCj4gPiA+IC0t
LS0tT3JpZ2luYWwgTWVzc2FnZS0tLS0tDQo+ID4gPiBGcm9tOiBBbGV4IFdpbGxpYW1zb24gW21h
aWx0bzphbGV4LndpbGxpYW1zb25AcmVkaGF0LmNvbV0NCj4gPiA+IFNlbnQ6IEZyaWRheSwgT2N0
b2JlciAwNCwgMjAxMyA5OjE1IFBNDQo+ID4gPiBUbzogQmh1c2hhbiBCaGFyYXQtUjY1Nzc3DQo+
ID4gPiBDYzogam9yb0A4Ynl0ZXMub3JnOyBiZW5oQGtlcm5lbC5jcmFzaGluZy5vcmc7DQo+ID4g
PiBnYWxha0BrZXJuZWwuY3Jhc2hpbmcub3JnOyBsaW51eC0ga2VybmVsQHZnZXIua2VybmVsLm9y
ZzsNCj4gPiA+IGxpbnV4cHBjLWRldkBsaXN0cy5vemxhYnMub3JnOyBsaW51eC0gcGNpQHZnZXIu
a2VybmVsLm9yZzsNCj4gPiA+IGFncmFmQHN1c2UuZGU7IFdvb2QgU2NvdHQtQjA3NDIxOyBpb21t
dUBsaXN0cy5saW51eC0gZm91bmRhdGlvbi5vcmcNCj4gPiA+IFN1YmplY3Q6IFJlOiBbUEFUQ0gg
Mi83XSBpb21tdTogYWRkIGFwaSB0byBnZXQgaW9tbXVfZG9tYWluIG9mIGENCj4gPiA+IGRldmlj
ZQ0KPiA+ID4NCj4gPiA+IE9uIEZyaSwgMjAxMy0xMC0wNCBhdCAwOTo1NCArMDAwMCwgQmh1c2hh
biBCaGFyYXQtUjY1Nzc3IHdyb3RlOg0KPiA+ID4gPg0KPiA+ID4gPiA+IC0tLS0tT3JpZ2luYWwg
TWVzc2FnZS0tLS0tDQo+ID4gPiA+ID4gRnJvbTogbGludXgtcGNpLW93bmVyQHZnZXIua2VybmVs
Lm9yZw0KPiA+ID4gPiA+IFttYWlsdG86bGludXgtcGNpLW93bmVyQHZnZXIua2VybmVsLm9yZ10N
Cj4gPiA+ID4gPiBPbiBCZWhhbGYgT2YgQWxleCBXaWxsaWFtc29uDQo+ID4gPiA+ID4gU2VudDog
V2VkbmVzZGF5LCBTZXB0ZW1iZXIgMjUsIDIwMTMgMTA6MTYgUE0NCj4gPiA+ID4gPiBUbzogQmh1
c2hhbiBCaGFyYXQtUjY1Nzc3DQo+ID4gPiA+ID4gQ2M6IGpvcm9AOGJ5dGVzLm9yZzsgYmVuaEBr
ZXJuZWwuY3Jhc2hpbmcub3JnOw0KPiA+ID4gPiA+IGdhbGFrQGtlcm5lbC5jcmFzaGluZy5vcmc7
IGxpbnV4LSBrZXJuZWxAdmdlci5rZXJuZWwub3JnOw0KPiA+ID4gPiA+IGxpbnV4cHBjLWRldkBs
aXN0cy5vemxhYnMub3JnOyBsaW51eC0gcGNpQHZnZXIua2VybmVsLm9yZzsNCj4gPiA+ID4gPiBh
Z3JhZkBzdXNlLmRlOyBXb29kIFNjb3R0LUIwNzQyMTsgaW9tbXVAbGlzdHMubGludXgtDQo+ID4g
PiA+ID4gZm91bmRhdGlvbi5vcmc7IEJodXNoYW4gQmhhcmF0LVI2NTc3Nw0KPiA+ID4gPiA+IFN1
YmplY3Q6IFJlOiBbUEFUQ0ggMi83XSBpb21tdTogYWRkIGFwaSB0byBnZXQgaW9tbXVfZG9tYWlu
IG9mIGENCj4gPiA+ID4gPiBkZXZpY2UNCj4gPiA+ID4gPg0KPiA+ID4gPiA+IE9uIFRodSwgMjAx
My0wOS0xOSBhdCAxMjo1OSArMDUzMCwgQmhhcmF0IEJodXNoYW4gd3JvdGU6DQo+ID4gPiA+ID4g
PiBUaGlzIGFwaSByZXR1cm4gdGhlIGlvbW11IGRvbWFpbiB0byB3aGljaCB0aGUgZGV2aWNlIGlz
IGF0dGFjaGVkLg0KPiA+ID4gPiA+ID4gVGhlIGlvbW11X2RvbWFpbiBpcyByZXF1aXJlZCBmb3Ig
bWFraW5nIEFQSSBjYWxscyByZWxhdGVkIHRvIGlvbW11Lg0KPiA+ID4gPiA+ID4gRm9sbG93IHVw
IHBhdGNoZXMgd2hpY2ggdXNlIHRoaXMgQVBJIHRvIGtub3cgaW9tbXUgbWFwaW5nLg0KPiA+ID4g
PiA+ID4NCj4gPiA+ID4gPiA+IFNpZ25lZC1vZmYtYnk6IEJoYXJhdCBCaHVzaGFuIDxiaGFyYXQu
Ymh1c2hhbkBmcmVlc2NhbGUuY29tPg0KPiA+ID4gPiA+ID4gLS0tDQo+ID4gPiA+ID4gPiAgZHJp
dmVycy9pb21tdS9pb21tdS5jIHwgICAxMCArKysrKysrKysrDQo+ID4gPiA+ID4gPiAgaW5jbHVk
ZS9saW51eC9pb21tdS5oIHwgICAgNyArKysrKysrDQo+ID4gPiA+ID4gPiAgMiBmaWxlcyBjaGFu
Z2VkLCAxNyBpbnNlcnRpb25zKCspLCAwIGRlbGV0aW9ucygtKQ0KPiA+ID4gPiA+ID4NCj4gPiA+
ID4gPiA+IGRpZmYgLS1naXQgYS9kcml2ZXJzL2lvbW11L2lvbW11LmMgYi9kcml2ZXJzL2lvbW11
L2lvbW11LmMNCj4gPiA+ID4gPiA+IGluZGV4DQo+ID4gPiA+ID4gPiBmYmU5Y2E3Li42YWM1ZjUw
IDEwMDY0NA0KPiA+ID4gPiA+ID4gLS0tIGEvZHJpdmVycy9pb21tdS9pb21tdS5jDQo+ID4gPiA+
ID4gPiArKysgYi9kcml2ZXJzL2lvbW11L2lvbW11LmMNCj4gPiA+ID4gPiA+IEBAIC02OTYsNiAr
Njk2LDE2IEBAIHZvaWQgaW9tbXVfZGV0YWNoX2RldmljZShzdHJ1Y3QNCj4gPiA+ID4gPiA+IGlv
bW11X2RvbWFpbiAqZG9tYWluLCBzdHJ1Y3QgZGV2aWNlICpkZXYpICB9DQo+ID4gPiA+ID4gPiBF
WFBPUlRfU1lNQk9MX0dQTChpb21tdV9kZXRhY2hfZGV2aWNlKTsNCj4gPiA+ID4gPiA+DQo+ID4g
PiA+ID4gPiArc3RydWN0IGlvbW11X2RvbWFpbiAqaW9tbXVfZ2V0X2Rldl9kb21haW4oc3RydWN0
IGRldmljZSAqZGV2KSB7DQo+ID4gPiA+ID4gPiArCXN0cnVjdCBpb21tdV9vcHMgKm9wcyA9IGRl
di0+YnVzLT5pb21tdV9vcHM7DQo+ID4gPiA+ID4gPiArDQo+ID4gPiA+ID4gPiArCWlmICh1bmxp
a2VseShvcHMgPT0gTlVMTCB8fCBvcHMtPmdldF9kZXZfaW9tbXVfZG9tYWluID09IE5VTEwpKQ0K
PiA+ID4gPiA+ID4gKwkJcmV0dXJuIE5VTEw7DQo+ID4gPiA+ID4gPiArDQo+ID4gPiA+ID4gPiAr
CXJldHVybiBvcHMtPmdldF9kZXZfaW9tbXVfZG9tYWluKGRldik7IH0NCj4gPiA+ID4gPiA+ICtF
WFBPUlRfU1lNQk9MX0dQTChpb21tdV9nZXRfZGV2X2RvbWFpbik7DQo+ID4gPiA+ID4NCj4gPiA+
ID4gPiBXaGF0IHByZXZlbnRzIHRoaXMgZnJvbSByYWNpbmcgaW9tbXVfZG9tYWluX2ZyZWUoKT8g
IFRoZXJlJ3Mgbm8NCj4gPiA+ID4gPiByZWZlcmVuY2VzIGFjcXVpcmVkLCBzbyB0aGVyZSdzIG5v
IHJlYXNvbiBmb3IgdGhlIGNhbGxlciB0bw0KPiA+ID4gPiA+IGFzc3VtZSB0aGUNCj4gPiA+IHBv
aW50ZXIgaXMgdmFsaWQuDQo+ID4gPiA+DQo+ID4gPiA+IFNvcnJ5IGZvciBsYXRlIHF1ZXJ5LCBz
b21laG93IHRoaXMgZW1haWwgd2VudCBpbnRvIGEgZm9sZGVyIGFuZA0KPiA+ID4gPiBlc2NhcGVk
Ow0KPiA+ID4gPg0KPiA+ID4gPiBKdXN0IHRvIGJlIHN1cmUsIHRoZXJlIGlzIG5vdCBsb2NrIGF0
IGdlbmVyaWMgInN0cnVjdA0KPiA+ID4gPiBpb21tdV9kb21haW4iLCBidXQgSVANCj4gPiA+IHNw
ZWNpZmljIHN0cnVjdHVyZSAobGluayBGU0wgZG9tYWluKSBsaW5rZWQgaW4gaW9tbXVfZG9tYWlu
LT5wcml2DQo+ID4gPiBoYXZlIGEgbG9jaywgc28gd2UgbmVlZCB0byBlbnN1cmUgdGhpcyByYWNl
IGluIEZTTCBpb21tdSBjb2RlIChzYXkNCj4gPiA+IGRyaXZlcnMvaW9tbXUvZnNsX3BhbXVfZG9t
YWluLmMpLCByaWdodD8NCj4gPiA+DQo+ID4gPiBObywgaXQncyBub3Qgc3VmZmljaWVudCB0byBt
YWtlIHN1cmUgdGhhdCB5b3VyIHVzZSBvZiB0aGUgaW50ZXJmYWNlDQo+ID4gPiBpcyByYWNlIGZy
ZWUuICBUaGUgaW50ZXJmYWNlIGl0c2VsZiBuZWVkcyB0byBiZSBkZXNpZ25lZCBzbyB0aGF0DQo+
ID4gPiBpdCdzIGRpZmZpY3VsdCB0byB1c2UgaW5jb3JyZWN0bHkuDQo+ID4NCj4gPiBTbyB3ZSBj
YW4gZGVmaW5lIGlvbW11X2dldF9kZXZfZG9tYWluKCkvaW9tbXVfcHV0X2Rldl9kb21haW4oKTsN
Cj4gPiBpb21tdV9nZXRfZGV2X2RvbWFpbigpIHdpbGwgcmV0dXJuIGRvbWFpbiB3aXRoIHRoZSBs
b2NrIGhlbGQsIGFuZA0KPiA+IGlvbW11X3B1dF9kZXZfZG9tYWluKCkgd2lsbCByZWxlYXNlIHRo
ZSBsb2NrPyBBbmQNCj4gPiBpb21tdV9nZXRfZGV2X2RvbWFpbigpIG11c3QgYWx3YXlzIGJlIGZv
bGxvd2VkIGJ5DQo+ID4gaW9tbXVfZ2V0X2Rldl9kb21haW4oKS4NCj4gDQo+IFdoYXQgbG9jaz8g
IGdldC9wdXQgYXJlIGdlbmVyYWxseSB1c2VkIGZvciByZWZlcmVuY2UgY291bnRpbmcsIG5vdCBs
b2NraW5nIGluDQo+IHRoZSBrZXJuZWwuDQo+IA0KPiA+ID4gVGhhdCdzIG5vdCB0aGUgY2FzZSBo
ZXJlLiAgVGhpcyBpcyBhIGJhY2tkb29yIHRvIGdldCB0aGUgaW9tbXUNCj4gPiA+IGRvbWFpbiBm
cm9tIHRoZSBpb21tdSBkcml2ZXIgcmVnYXJkbGVzcyBvZiB3aG8gaXMgdXNpbmcgaXQgb3IgaG93
Lg0KPiA+ID4gVGhlIGlvbW11IGRvbWFpbiBpcyBjcmVhdGVkIGFuZCBtYW5hZ2VkIGJ5IHZmaW8s
IHNvIHNob3VsZG4ndCB3ZSBiZQ0KPiA+ID4gbG9va2luZyBhdCBob3cgdG8gZG8gdGhpcyB0aHJv
dWdoIHZmaW8/DQo+ID4NCj4gPiBMZXQgbWUgZmlyc3QgZGVzY3JpYmUgd2hhdCB3ZSBhcmUgZG9p
bmcgaGVyZToNCj4gPiBEdXJpbmcgaW5pdGlhbGl6YXRpb246LQ0KPiA+ICAtIHZmaW8gdGFsa3Mg
dG8gTVNJIHN5c3RlbSB0byBrbm93IHRoZSBNU0ktcGFnZSBhbmQgc2l6ZQ0KPiA+ICAtIHZmaW8g
dGhlbiBpbnRlcmFjdHMgd2l0aCBpb21tdSB0byBtYXAgdGhlIE1TSS1wYWdlIGluIGlvbW11IChJ
T1ZBDQo+ID4gaXMgZGVjaWRlZCBieSB1c2Vyc3BhY2UgYW5kIHBoeXNpY2FsIGFkZHJlc3MgaXMg
dGhlIE1TSS1wYWdlKQ0KPiA+ICAtIFNvIHRoZSBJT1ZBIHN1YndpbmRvdyBtYXBwaW5nIGlzIGNy
ZWF0ZWQgaW4gaW9tbXUgYW5kIHllcyBWRklPIGtub3cgYWJvdXQNCj4gdGhpcyBtYXBwaW5nLg0K
PiA+DQo+ID4gTm93IGRvIFNFVF9JUlEoTVNJL01TSVgpIGlvY3RsOg0KPiA+ICAtIGNhbGxzIHBj
aV9lbmFibGVfbXNpeCgpL3BjaV9lbmFibGVfbXNpX2Jsb2NrKCk6IHdoaWNoIGlzIHN1cHBvc2Vk
IHRvIHNldA0KPiBNU0kgYWRkcmVzcy9kYXRhIGluIGRldmljZS4NCj4gPiAgLSBTbyBpbiBjdXJy
ZW50IGltcGxlbWVudGF0aW9uICh0aGlzIHBhdGNoc2V0KSBtc2ktc3Vic3lzdGVtIGdldHMgdGhl
IElPVkENCj4gZnJvbSBpb21tdSB2aWEgdGhpcyBkZWZpbmVkIGludGVyZmFjZS4NCj4gPiAgLSBB
cmUgeW91IHNheWluZyB0aGF0IHJhdGhlciB0aGFuIGdldHRpbmcgdGhpcyBmcm9tIGlvbW11LCB3
ZSBzaG91bGQgZ2V0IHRoaXMNCj4gZnJvbSB2ZmlvPyBXaGF0IGRpZmZlcmVuY2UgZG9lcyB0aGlz
IG1ha2U/DQo+IA0KPiBZZXMsIHlvdSBqdXN0IHNhaWQgYWJvdmUgdGhhdCB2ZmlvIGtub3dzIHRo
ZSBtc2kgdG8gaW92YSBtYXBwaW5nLCBzbyB3aHkgZ28NCj4gb3V0c2lkZSBvZiB2ZmlvIHRvIGZp
bmQgaXQgbGF0ZXI/ICBUaGUgZGlmZmVyZW5jZSBpcyBvbmUgY2FzZSB5b3UgY2FuIGhhdmUgYQ0K
PiBwcm9wZXIgcmVmZXJlbmNlIHRvIGRhdGEgc3RydWN0dXJlcyB0byBtYWtlIHN1cmUgdGhlIHBv
aW50ZXIgeW91IGdldCBiYWNrDQo+IGFjdHVhbGx5IGhhcyBtZWFuaW5nIGF0IHRoZSB0aW1lIHlv
dSdyZSB1c2luZyBpdCB2cyB0aGUgY29kZSBoZXJlIHdoZXJlIHlvdSdyZQ0KPiBkZWZpbmluZyBh
biBBUEkgdGhhdCByZXR1cm5zIGEgbWVhbmluZ2xlc3MgdmFsdWUNCg0KV2l0aCBGU0wtUEFNVSB3
ZSB3aWxsIGFsd2F5cyBnZXQgY29uc2lzdGFudCBkYXRhIGZyb20gaW9tbXUgb3IgdmZpby1kYXRh
IHN0cnVjdHVyZS4NCg0KPiBiZWNhdXNlIHlvdSBjYW4ndCBjaGVjayBvcg0KPiBlbmZvcmNlIHRo
YXQgYW4gYXJiaXRyYXJ5IGNhbGxlciBpcyB1c2luZyBpdCBjb3JyZWN0bHkuDQoNCkkgYW0gbm90
IHN1cmUgd2hhdCBpcyBhcmJpdHJhcnkgY2FsbGVyPyBwZGV2IGlzIGtub3duIHRvIHZmaW8sIHNv
IHZmaW8gd2lsbCBvbmx5IG1ha2UgcGNpX2VuYWJsZV9tc2l4KCkvcGNpX2VuYWJsZV9tc2lfYmxv
Y2soKSBmb3IgdGhpcyBwZGV2LiBJZiBhbnlvdGhlciBjb2RlIG1ha2VzIHRoZW4gaXQgaXMgc29t
ZSBvdGhlciB1bmV4cGVjdGVkbHkgdGhpbmcgaGFwcGVuaW5nIGluIHN5c3RlbSwgbm8/DQoNCg0K
PiAgSXQncyBub3QgbWFpbnRhaW5hYmxlLg0KPiBUaGFua3MsDQoNCkkgZG8gbm90IGhhdmUgYW55
IGlzc3VlIHdpdGggdGhpcyBhcyB3ZWxsLCBjYW4geW91IGFsc28gZGVzY3JpYmUgdGhlIHR5cGUg
b2YgQVBJIHlvdSBhcmUgZW52aXNpb25pbmc7DQpJIGNhbiB0aGluayBvZiBkZWZpbmluZyBzb21l
IGZ1bmN0aW9uIGluIHZmaW8uYy92ZmlvX2lvbW11Ki5jLCBtYWtlIHRoZW0gZ2xvYmFsIGFuZCBk
ZWNsYXJlIHRoZW4gaW4gaW5jbHVkZS9MaW51eC92ZmlvLmgNCkFuZCBpbmNsdWRlIDxMaW51eC92
ZmlvLmg+IGluIGNhbGxlciBmaWxlIChhcmNoL3Bvd2VycGMva2VybmVsL21zaS5jKQ0KDQpUaGFu
a3MNCi1CaGFyYXQNCj4gDQo+IEFsZXgNCj4gDQo+ID4gPiBJdCBzZWVtcyBsaWtlIHlvdSdkIHdh
bnQgdG8gdXNlIHlvdXIgZGV2aWNlIHRvIGdldCBhIHZmaW8gZ3JvdXANCj4gPiA+IHJlZmVyZW5j
ZSwgZnJvbSB3aGljaCB5b3UgY291bGQgZG8gc29tZXRoaW5nIHdpdGggdGhlIHZmaW8gZXh0ZXJu
YWwNCj4gPiA+IHVzZXIgaW50ZXJmYWNlIGFuZCBnZXQgdGhlIGlvbW11IGRvbWFpbiByZWZlcmVu
Y2UuICBUaGFua3MsDQo+ID4gPg0KPiA+ID4gQWxleA0KPiA+ID4NCj4gPiA+ID4gPiA+ICAvKg0K
PiA+ID4gPiA+ID4gICAqIElPTU1VIGdyb3VwcyBhcmUgcmVhbGx5IHRoZSBuYXRydWFsIHdvcmtp
bmcgdW5pdCBvZiB0aGUgSU9NTVUsIGJ1dA0KPiA+ID4gPiA+ID4gICAqIHRoZSBJT01NVSBBUEkg
d29ya3Mgb24gZG9tYWlucyBhbmQgZGV2aWNlcy4gIEJyaWRnZSB0aGF0DQo+ID4gPiA+ID4gPiBn
YXAgYnkgZGlmZiAtLWdpdCBhL2luY2x1ZGUvbGludXgvaW9tbXUuaA0KPiA+ID4gPiA+ID4gYi9p
bmNsdWRlL2xpbnV4L2lvbW11LmggaW5kZXggN2VhMzE5ZS4uZmEwNDZiZCAxMDA2NDQNCj4gPiA+
ID4gPiA+IC0tLSBhL2luY2x1ZGUvbGludXgvaW9tbXUuaA0KPiA+ID4gPiA+ID4gKysrIGIvaW5j
bHVkZS9saW51eC9pb21tdS5oDQo+ID4gPiA+ID4gPiBAQCAtMTI3LDYgKzEyNyw3IEBAIHN0cnVj
dCBpb21tdV9vcHMgew0KPiA+ID4gPiA+ID4gIAlpbnQgKCpkb21haW5fc2V0X3dpbmRvd3MpKHN0
cnVjdCBpb21tdV9kb21haW4gKmRvbWFpbiwgdTMyDQo+ID4gPiB3X2NvdW50KTsNCj4gPiA+ID4g
PiA+ICAJLyogR2V0IHRoZSBudW1lciBvZiB3aW5kb3cgcGVyIGRvbWFpbiAqLw0KPiA+ID4gPiA+
ID4gIAl1MzIgKCpkb21haW5fZ2V0X3dpbmRvd3MpKHN0cnVjdCBpb21tdV9kb21haW4gKmRvbWFp
bik7DQo+ID4gPiA+ID4gPiArCXN0cnVjdCBpb21tdV9kb21haW4gKigqZ2V0X2Rldl9pb21tdV9k
b21haW4pKHN0cnVjdCBkZXZpY2UNCj4gPiA+ID4gPiA+ICsqZGV2KTsNCj4gPiA+ID4gPiA+DQo+
ID4gPiA+ID4gPiAgCXVuc2lnbmVkIGxvbmcgcGdzaXplX2JpdG1hcDsgIH07IEBAIC0xOTAsNiAr
MTkxLDcgQEAgZXh0ZXJuDQo+ID4gPiA+ID4gPiBpbnQgaW9tbXVfZG9tYWluX3dpbmRvd19lbmFi
bGUoc3RydWN0IGlvbW11X2RvbWFpbg0KPiA+ID4gPiA+ICpkb21haW4sIHUzMiB3bmRfbnIsDQo+
ID4gPiA+ID4gPiAgCQkJCSAgICAgIHBoeXNfYWRkcl90IG9mZnNldCwgdTY0IHNpemUsDQo+ID4g
PiA+ID4gPiAgCQkJCSAgICAgIGludCBwcm90KTsNCj4gPiA+ID4gPiA+ICBleHRlcm4gdm9pZCBp
b21tdV9kb21haW5fd2luZG93X2Rpc2FibGUoc3RydWN0IGlvbW11X2RvbWFpbg0KPiA+ID4gPiA+
ID4gKmRvbWFpbiwNCj4gPiA+ID4gPiA+IHUzMiB3bmRfbnIpOw0KPiA+ID4gPiA+ID4gK2V4dGVy
biBzdHJ1Y3QgaW9tbXVfZG9tYWluICppb21tdV9nZXRfZGV2X2RvbWFpbihzdHJ1Y3QNCj4gPiA+
ID4gPiA+ICtkZXZpY2UgKmRldik7DQo+ID4gPiA+ID4gPiAgLyoqDQo+ID4gPiA+ID4gPiAgICog
cmVwb3J0X2lvbW11X2ZhdWx0KCkgLSByZXBvcnQgYWJvdXQgYW4gSU9NTVUgZmF1bHQgdG8gdGhl
DQo+ID4gPiA+ID4gPiBJT01NVQ0KPiA+ID4gZnJhbWV3b3JrDQo+ID4gPiA+ID4gPiAgICogQGRv
bWFpbjogdGhlIGlvbW11IGRvbWFpbiB3aGVyZSB0aGUgZmF1bHQgaGFzIGhhcHBlbmVkIEBADQo+
ID4gPiA+ID4gPiAtMjg0LDYNCj4gPiA+ID4gPiA+ICsyODYsMTEgQEAgc3RhdGljIGlubGluZSB2
b2lkDQo+ID4gPiA+ID4gPiAraW9tbXVfZG9tYWluX3dpbmRvd19kaXNhYmxlKHN0cnVjdA0KPiA+
ID4gPiA+ID4gaW9tbXVfZG9tYWluICpkb21haW4sICB7ICB9DQo+ID4gPiA+ID4gPg0KPiA+ID4g
PiA+ID4gK3N0YXRpYyBpbmxpbmUgc3RydWN0IGlvbW11X2RvbWFpbg0KPiA+ID4gPiA+ID4gKypp
b21tdV9nZXRfZGV2X2RvbWFpbihzdHJ1Y3QgZGV2aWNlDQo+ID4gPiA+ID4gPiArKmRldikgew0K
PiA+ID4gPiA+ID4gKwlyZXR1cm4gTlVMTDsNCj4gPiA+ID4gPiA+ICt9DQo+ID4gPiA+ID4gPiAr
DQo+ID4gPiA+ID4gPiAgc3RhdGljIGlubGluZSBwaHlzX2FkZHJfdCBpb21tdV9pb3ZhX3RvX3Bo
eXMoc3RydWN0DQo+ID4gPiA+ID4gPiBpb21tdV9kb21haW4gKmRvbWFpbiwgZG1hX2FkZHJfdCBp
b3ZhKSAgew0KPiA+ID4gPiA+ID4gIAlyZXR1cm4gMDsNCj4gPiA+ID4gPg0KPiA+ID4gPiA+DQo+
ID4gPiA+ID4NCj4gPiA+ID4gPiAtLQ0KPiA+ID4gPiA+IFRvIHVuc3Vic2NyaWJlIGZyb20gdGhp
cyBsaXN0OiBzZW5kIHRoZSBsaW5lICJ1bnN1YnNjcmliZSBsaW51eC1wY2kiDQo+ID4gPiA+ID4g
aW4gdGhlIGJvZHkgb2YgYSBtZXNzYWdlIHRvIG1ham9yZG9tb0B2Z2VyLmtlcm5lbC5vcmcgTW9y
ZQ0KPiA+ID4gPiA+IG1ham9yZG9tbyBpbmZvIGF0IGh0dHA6Ly92Z2VyLmtlcm5lbC5vcmcvbWFq
b3Jkb21vLWluZm8uaHRtbA0KPiA+ID4gPg0KPiA+ID4NCj4gPiA+DQo+ID4gPg0KPiA+DQo+IA0K
PiANCj4gDQoNCg==

^ permalink raw reply

* Re: [PATCH 2/7] iommu: add api to get iommu_domain of a device
From: Alex Williamson @ 2013-10-04 17:12 UTC (permalink / raw)
  To: Bhushan Bharat-R65777
  Cc: agraf@suse.de, Wood Scott-B07421, linux-pci@vger.kernel.org,
	joro@8bytes.org, linux-kernel@vger.kernel.org,
	iommu@lists.linux-foundation.org, linuxppc-dev@lists.ozlabs.org
In-Reply-To: <6A3DF150A5B70D4F9B66A25E3F7C888D07190F35@039-SN2MPN1-011.039d.mgd.msft.net>

On Fri, 2013-10-04 at 16:47 +0000, Bhushan Bharat-R65777 wrote:
> 
> > -----Original Message-----
> > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > Sent: Friday, October 04, 2013 9:15 PM
> > To: Bhushan Bharat-R65777
> > Cc: joro@8bytes.org; benh@kernel.crashing.org; galak@kernel.crashing.org; linux-
> > kernel@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; linux-
> > pci@vger.kernel.org; agraf@suse.de; Wood Scott-B07421; iommu@lists.linux-
> > foundation.org
> > Subject: Re: [PATCH 2/7] iommu: add api to get iommu_domain of a device
> > 
> > On Fri, 2013-10-04 at 09:54 +0000, Bhushan Bharat-R65777 wrote:
> > >
> > > > -----Original Message-----
> > > > From: linux-pci-owner@vger.kernel.org
> > > > [mailto:linux-pci-owner@vger.kernel.org]
> > > > On Behalf Of Alex Williamson
> > > > Sent: Wednesday, September 25, 2013 10:16 PM
> > > > To: Bhushan Bharat-R65777
> > > > Cc: joro@8bytes.org; benh@kernel.crashing.org;
> > > > galak@kernel.crashing.org; linux- kernel@vger.kernel.org;
> > > > linuxppc-dev@lists.ozlabs.org; linux- pci@vger.kernel.org;
> > > > agraf@suse.de; Wood Scott-B07421; iommu@lists.linux- foundation.org;
> > > > Bhushan Bharat-R65777
> > > > Subject: Re: [PATCH 2/7] iommu: add api to get iommu_domain of a
> > > > device
> > > >
> > > > On Thu, 2013-09-19 at 12:59 +0530, Bharat Bhushan wrote:
> > > > > This api return the iommu domain to which the device is attached.
> > > > > The iommu_domain is required for making API calls related to iommu.
> > > > > Follow up patches which use this API to know iommu maping.
> > > > >
> > > > > Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
> > > > > ---
> > > > >  drivers/iommu/iommu.c |   10 ++++++++++
> > > > >  include/linux/iommu.h |    7 +++++++
> > > > >  2 files changed, 17 insertions(+), 0 deletions(-)
> > > > >
> > > > > diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index
> > > > > fbe9ca7..6ac5f50 100644
> > > > > --- a/drivers/iommu/iommu.c
> > > > > +++ b/drivers/iommu/iommu.c
> > > > > @@ -696,6 +696,16 @@ void iommu_detach_device(struct iommu_domain
> > > > > *domain, struct device *dev)  }
> > > > > EXPORT_SYMBOL_GPL(iommu_detach_device);
> > > > >
> > > > > +struct iommu_domain *iommu_get_dev_domain(struct device *dev) {
> > > > > +	struct iommu_ops *ops = dev->bus->iommu_ops;
> > > > > +
> > > > > +	if (unlikely(ops == NULL || ops->get_dev_iommu_domain == NULL))
> > > > > +		return NULL;
> > > > > +
> > > > > +	return ops->get_dev_iommu_domain(dev); }
> > > > > +EXPORT_SYMBOL_GPL(iommu_get_dev_domain);
> > > >
> > > > What prevents this from racing iommu_domain_free()?  There's no
> > > > references acquired, so there's no reason for the caller to assume the
> > pointer is valid.
> > >
> > > Sorry for late query, somehow this email went into a folder and
> > > escaped;
> > >
> > > Just to be sure, there is not lock at generic "struct iommu_domain", but IP
> > specific structure (link FSL domain) linked in iommu_domain->priv have a lock,
> > so we need to ensure this race in FSL iommu code (say
> > drivers/iommu/fsl_pamu_domain.c), right?
> > 
> > No, it's not sufficient to make sure that your use of the interface is race
> > free.  The interface itself needs to be designed so that it's difficult to use
> > incorrectly.
> 
> So we can define iommu_get_dev_domain()/iommu_put_dev_domain();
> iommu_get_dev_domain() will return domain with the lock held, and
> iommu_put_dev_domain() will release the lock? And
> iommu_get_dev_domain() must always be followed by
> iommu_get_dev_domain().

What lock?  get/put are generally used for reference counting, not
locking in the kernel.

> > That's not the case here.  This is a backdoor to get the iommu
> > domain from the iommu driver regardless of who is using it or how.  The iommu
> > domain is created and managed by vfio, so shouldn't we be looking at how to do
> > this through vfio?
> 
> Let me first describe what we are doing here:
> During initialization:-
>  - vfio talks to MSI system to know the MSI-page and size
>  - vfio then interacts with iommu to map the MSI-page in iommu (IOVA is decided by userspace and physical address is the MSI-page)
>  - So the IOVA subwindow mapping is created in iommu and yes VFIO know about this mapping.
> 
> Now do SET_IRQ(MSI/MSIX) ioctl:
>  - calls pci_enable_msix()/pci_enable_msi_block(): which is supposed to set MSI address/data in device.
>  - So in current implementation (this patchset) msi-subsystem gets the IOVA from iommu via this defined interface.
>  - Are you saying that rather than getting this from iommu, we should get this from vfio? What difference does this make?

Yes, you just said above that vfio knows the msi to iova mapping, so why
go outside of vfio to find it later?  The difference is one case you can
have a proper reference to data structures to make sure the pointer you
get back actually has meaning at the time you're using it vs the code
here where you're defining an API that returns a meaningless value
because you can't check or enforce that an arbitrary caller is using it
correctly.  It's not maintainable.  Thanks,

Alex

> > It seems like you'd want to use your device to get a vfio
> > group reference, from which you could do something with the vfio external user
> > interface and get the iommu domain reference.  Thanks,
> > 
> > Alex
> > 
> > > > >  /*
> > > > >   * IOMMU groups are really the natrual working unit of the IOMMU, but
> > > > >   * the IOMMU API works on domains and devices.  Bridge that gap
> > > > > by diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> > > > > index 7ea319e..fa046bd 100644
> > > > > --- a/include/linux/iommu.h
> > > > > +++ b/include/linux/iommu.h
> > > > > @@ -127,6 +127,7 @@ struct iommu_ops {
> > > > >  	int (*domain_set_windows)(struct iommu_domain *domain, u32
> > w_count);
> > > > >  	/* Get the numer of window per domain */
> > > > >  	u32 (*domain_get_windows)(struct iommu_domain *domain);
> > > > > +	struct iommu_domain *(*get_dev_iommu_domain)(struct device
> > > > > +*dev);
> > > > >
> > > > >  	unsigned long pgsize_bitmap;
> > > > >  };
> > > > > @@ -190,6 +191,7 @@ extern int iommu_domain_window_enable(struct
> > > > > iommu_domain
> > > > *domain, u32 wnd_nr,
> > > > >  				      phys_addr_t offset, u64 size,
> > > > >  				      int prot);
> > > > >  extern void iommu_domain_window_disable(struct iommu_domain
> > > > > *domain,
> > > > > u32 wnd_nr);
> > > > > +extern struct iommu_domain *iommu_get_dev_domain(struct device
> > > > > +*dev);
> > > > >  /**
> > > > >   * report_iommu_fault() - report about an IOMMU fault to the IOMMU
> > framework
> > > > >   * @domain: the iommu domain where the fault has happened @@
> > > > > -284,6
> > > > > +286,11 @@ static inline void iommu_domain_window_disable(struct
> > > > > iommu_domain *domain,  {  }
> > > > >
> > > > > +static inline struct iommu_domain *iommu_get_dev_domain(struct
> > > > > +device
> > > > > +*dev) {
> > > > > +	return NULL;
> > > > > +}
> > > > > +
> > > > >  static inline phys_addr_t iommu_iova_to_phys(struct iommu_domain
> > > > > *domain, dma_addr_t iova)  {
> > > > >  	return 0;
> > > >
> > > >
> > > >
> > > > --
> > > > To unsubscribe from this list: send the line "unsubscribe linux-pci"
> > > > in the body of a message to majordomo@vger.kernel.org More majordomo
> > > > info at http://vger.kernel.org/majordomo-info.html
> > >
> > 
> > 
> > 
> 

^ permalink raw reply

* RE: [PATCH 2/7] iommu: add api to get iommu_domain of a device
From: Bhushan Bharat-R65777 @ 2013-10-04 16:47 UTC (permalink / raw)
  To: Alex Williamson
  Cc: agraf@suse.de, Wood Scott-B07421, linux-pci@vger.kernel.org,
	joro@8bytes.org, linux-kernel@vger.kernel.org,
	iommu@lists.linux-foundation.org, linuxppc-dev@lists.ozlabs.org
In-Reply-To: <1380901507.2673.204.camel@ul30vt.home>

DQoNCj4gLS0tLS1PcmlnaW5hbCBNZXNzYWdlLS0tLS0NCj4gRnJvbTogQWxleCBXaWxsaWFtc29u
IFttYWlsdG86YWxleC53aWxsaWFtc29uQHJlZGhhdC5jb21dDQo+IFNlbnQ6IEZyaWRheSwgT2N0
b2JlciAwNCwgMjAxMyA5OjE1IFBNDQo+IFRvOiBCaHVzaGFuIEJoYXJhdC1SNjU3NzcNCj4gQ2M6
IGpvcm9AOGJ5dGVzLm9yZzsgYmVuaEBrZXJuZWwuY3Jhc2hpbmcub3JnOyBnYWxha0BrZXJuZWwu
Y3Jhc2hpbmcub3JnOyBsaW51eC0NCj4ga2VybmVsQHZnZXIua2VybmVsLm9yZzsgbGludXhwcGMt
ZGV2QGxpc3RzLm96bGFicy5vcmc7IGxpbnV4LQ0KPiBwY2lAdmdlci5rZXJuZWwub3JnOyBhZ3Jh
ZkBzdXNlLmRlOyBXb29kIFNjb3R0LUIwNzQyMTsgaW9tbXVAbGlzdHMubGludXgtDQo+IGZvdW5k
YXRpb24ub3JnDQo+IFN1YmplY3Q6IFJlOiBbUEFUQ0ggMi83XSBpb21tdTogYWRkIGFwaSB0byBn
ZXQgaW9tbXVfZG9tYWluIG9mIGEgZGV2aWNlDQo+IA0KPiBPbiBGcmksIDIwMTMtMTAtMDQgYXQg
MDk6NTQgKzAwMDAsIEJodXNoYW4gQmhhcmF0LVI2NTc3NyB3cm90ZToNCj4gPg0KPiA+ID4gLS0t
LS1PcmlnaW5hbCBNZXNzYWdlLS0tLS0NCj4gPiA+IEZyb206IGxpbnV4LXBjaS1vd25lckB2Z2Vy
Lmtlcm5lbC5vcmcNCj4gPiA+IFttYWlsdG86bGludXgtcGNpLW93bmVyQHZnZXIua2VybmVsLm9y
Z10NCj4gPiA+IE9uIEJlaGFsZiBPZiBBbGV4IFdpbGxpYW1zb24NCj4gPiA+IFNlbnQ6IFdlZG5l
c2RheSwgU2VwdGVtYmVyIDI1LCAyMDEzIDEwOjE2IFBNDQo+ID4gPiBUbzogQmh1c2hhbiBCaGFy
YXQtUjY1Nzc3DQo+ID4gPiBDYzogam9yb0A4Ynl0ZXMub3JnOyBiZW5oQGtlcm5lbC5jcmFzaGlu
Zy5vcmc7DQo+ID4gPiBnYWxha0BrZXJuZWwuY3Jhc2hpbmcub3JnOyBsaW51eC0ga2VybmVsQHZn
ZXIua2VybmVsLm9yZzsNCj4gPiA+IGxpbnV4cHBjLWRldkBsaXN0cy5vemxhYnMub3JnOyBsaW51
eC0gcGNpQHZnZXIua2VybmVsLm9yZzsNCj4gPiA+IGFncmFmQHN1c2UuZGU7IFdvb2QgU2NvdHQt
QjA3NDIxOyBpb21tdUBsaXN0cy5saW51eC0gZm91bmRhdGlvbi5vcmc7DQo+ID4gPiBCaHVzaGFu
IEJoYXJhdC1SNjU3NzcNCj4gPiA+IFN1YmplY3Q6IFJlOiBbUEFUQ0ggMi83XSBpb21tdTogYWRk
IGFwaSB0byBnZXQgaW9tbXVfZG9tYWluIG9mIGENCj4gPiA+IGRldmljZQ0KPiA+ID4NCj4gPiA+
IE9uIFRodSwgMjAxMy0wOS0xOSBhdCAxMjo1OSArMDUzMCwgQmhhcmF0IEJodXNoYW4gd3JvdGU6
DQo+ID4gPiA+IFRoaXMgYXBpIHJldHVybiB0aGUgaW9tbXUgZG9tYWluIHRvIHdoaWNoIHRoZSBk
ZXZpY2UgaXMgYXR0YWNoZWQuDQo+ID4gPiA+IFRoZSBpb21tdV9kb21haW4gaXMgcmVxdWlyZWQg
Zm9yIG1ha2luZyBBUEkgY2FsbHMgcmVsYXRlZCB0byBpb21tdS4NCj4gPiA+ID4gRm9sbG93IHVw
IHBhdGNoZXMgd2hpY2ggdXNlIHRoaXMgQVBJIHRvIGtub3cgaW9tbXUgbWFwaW5nLg0KPiA+ID4g
Pg0KPiA+ID4gPiBTaWduZWQtb2ZmLWJ5OiBCaGFyYXQgQmh1c2hhbiA8YmhhcmF0LmJodXNoYW5A
ZnJlZXNjYWxlLmNvbT4NCj4gPiA+ID4gLS0tDQo+ID4gPiA+ICBkcml2ZXJzL2lvbW11L2lvbW11
LmMgfCAgIDEwICsrKysrKysrKysNCj4gPiA+ID4gIGluY2x1ZGUvbGludXgvaW9tbXUuaCB8ICAg
IDcgKysrKysrKw0KPiA+ID4gPiAgMiBmaWxlcyBjaGFuZ2VkLCAxNyBpbnNlcnRpb25zKCspLCAw
IGRlbGV0aW9ucygtKQ0KPiA+ID4gPg0KPiA+ID4gPiBkaWZmIC0tZ2l0IGEvZHJpdmVycy9pb21t
dS9pb21tdS5jIGIvZHJpdmVycy9pb21tdS9pb21tdS5jIGluZGV4DQo+ID4gPiA+IGZiZTljYTcu
LjZhYzVmNTAgMTAwNjQ0DQo+ID4gPiA+IC0tLSBhL2RyaXZlcnMvaW9tbXUvaW9tbXUuYw0KPiA+
ID4gPiArKysgYi9kcml2ZXJzL2lvbW11L2lvbW11LmMNCj4gPiA+ID4gQEAgLTY5Niw2ICs2OTYs
MTYgQEAgdm9pZCBpb21tdV9kZXRhY2hfZGV2aWNlKHN0cnVjdCBpb21tdV9kb21haW4NCj4gPiA+
ID4gKmRvbWFpbiwgc3RydWN0IGRldmljZSAqZGV2KSAgfQ0KPiA+ID4gPiBFWFBPUlRfU1lNQk9M
X0dQTChpb21tdV9kZXRhY2hfZGV2aWNlKTsNCj4gPiA+ID4NCj4gPiA+ID4gK3N0cnVjdCBpb21t
dV9kb21haW4gKmlvbW11X2dldF9kZXZfZG9tYWluKHN0cnVjdCBkZXZpY2UgKmRldikgew0KPiA+
ID4gPiArCXN0cnVjdCBpb21tdV9vcHMgKm9wcyA9IGRldi0+YnVzLT5pb21tdV9vcHM7DQo+ID4g
PiA+ICsNCj4gPiA+ID4gKwlpZiAodW5saWtlbHkob3BzID09IE5VTEwgfHwgb3BzLT5nZXRfZGV2
X2lvbW11X2RvbWFpbiA9PSBOVUxMKSkNCj4gPiA+ID4gKwkJcmV0dXJuIE5VTEw7DQo+ID4gPiA+
ICsNCj4gPiA+ID4gKwlyZXR1cm4gb3BzLT5nZXRfZGV2X2lvbW11X2RvbWFpbihkZXYpOyB9DQo+
ID4gPiA+ICtFWFBPUlRfU1lNQk9MX0dQTChpb21tdV9nZXRfZGV2X2RvbWFpbik7DQo+ID4gPg0K
PiA+ID4gV2hhdCBwcmV2ZW50cyB0aGlzIGZyb20gcmFjaW5nIGlvbW11X2RvbWFpbl9mcmVlKCk/
ICBUaGVyZSdzIG5vDQo+ID4gPiByZWZlcmVuY2VzIGFjcXVpcmVkLCBzbyB0aGVyZSdzIG5vIHJl
YXNvbiBmb3IgdGhlIGNhbGxlciB0byBhc3N1bWUgdGhlDQo+IHBvaW50ZXIgaXMgdmFsaWQuDQo+
ID4NCj4gPiBTb3JyeSBmb3IgbGF0ZSBxdWVyeSwgc29tZWhvdyB0aGlzIGVtYWlsIHdlbnQgaW50
byBhIGZvbGRlciBhbmQNCj4gPiBlc2NhcGVkOw0KPiA+DQo+ID4gSnVzdCB0byBiZSBzdXJlLCB0
aGVyZSBpcyBub3QgbG9jayBhdCBnZW5lcmljICJzdHJ1Y3QgaW9tbXVfZG9tYWluIiwgYnV0IElQ
DQo+IHNwZWNpZmljIHN0cnVjdHVyZSAobGluayBGU0wgZG9tYWluKSBsaW5rZWQgaW4gaW9tbXVf
ZG9tYWluLT5wcml2IGhhdmUgYSBsb2NrLA0KPiBzbyB3ZSBuZWVkIHRvIGVuc3VyZSB0aGlzIHJh
Y2UgaW4gRlNMIGlvbW11IGNvZGUgKHNheQ0KPiBkcml2ZXJzL2lvbW11L2ZzbF9wYW11X2RvbWFp
bi5jKSwgcmlnaHQ/DQo+IA0KPiBObywgaXQncyBub3Qgc3VmZmljaWVudCB0byBtYWtlIHN1cmUg
dGhhdCB5b3VyIHVzZSBvZiB0aGUgaW50ZXJmYWNlIGlzIHJhY2UNCj4gZnJlZS4gIFRoZSBpbnRl
cmZhY2UgaXRzZWxmIG5lZWRzIHRvIGJlIGRlc2lnbmVkIHNvIHRoYXQgaXQncyBkaWZmaWN1bHQg
dG8gdXNlDQo+IGluY29ycmVjdGx5Lg0KDQpTbyB3ZSBjYW4gZGVmaW5lIGlvbW11X2dldF9kZXZf
ZG9tYWluKCkvaW9tbXVfcHV0X2Rldl9kb21haW4oKTsgIGlvbW11X2dldF9kZXZfZG9tYWluKCkg
d2lsbCByZXR1cm4gZG9tYWluIHdpdGggdGhlIGxvY2sgaGVsZCwgYW5kIGlvbW11X3B1dF9kZXZf
ZG9tYWluKCkgd2lsbCByZWxlYXNlIHRoZSBsb2NrPyBBbmQgaW9tbXVfZ2V0X2Rldl9kb21haW4o
KSBtdXN0IGFsd2F5cyBiZSBmb2xsb3dlZCBieSBpb21tdV9nZXRfZGV2X2RvbWFpbigpLg0KDQoN
Cj4gVGhhdCdzIG5vdCB0aGUgY2FzZSBoZXJlLiAgVGhpcyBpcyBhIGJhY2tkb29yIHRvIGdldCB0
aGUgaW9tbXUNCj4gZG9tYWluIGZyb20gdGhlIGlvbW11IGRyaXZlciByZWdhcmRsZXNzIG9mIHdo
byBpcyB1c2luZyBpdCBvciBob3cuICBUaGUgaW9tbXUNCj4gZG9tYWluIGlzIGNyZWF0ZWQgYW5k
IG1hbmFnZWQgYnkgdmZpbywgc28gc2hvdWxkbid0IHdlIGJlIGxvb2tpbmcgYXQgaG93IHRvIGRv
DQo+IHRoaXMgdGhyb3VnaCB2ZmlvPw0KDQpMZXQgbWUgZmlyc3QgZGVzY3JpYmUgd2hhdCB3ZSBh
cmUgZG9pbmcgaGVyZToNCkR1cmluZyBpbml0aWFsaXphdGlvbjotDQogLSB2ZmlvIHRhbGtzIHRv
IE1TSSBzeXN0ZW0gdG8ga25vdyB0aGUgTVNJLXBhZ2UgYW5kIHNpemUNCiAtIHZmaW8gdGhlbiBp
bnRlcmFjdHMgd2l0aCBpb21tdSB0byBtYXAgdGhlIE1TSS1wYWdlIGluIGlvbW11IChJT1ZBIGlz
IGRlY2lkZWQgYnkgdXNlcnNwYWNlIGFuZCBwaHlzaWNhbCBhZGRyZXNzIGlzIHRoZSBNU0ktcGFn
ZSkNCiAtIFNvIHRoZSBJT1ZBIHN1YndpbmRvdyBtYXBwaW5nIGlzIGNyZWF0ZWQgaW4gaW9tbXUg
YW5kIHllcyBWRklPIGtub3cgYWJvdXQgdGhpcyBtYXBwaW5nLg0KDQpOb3cgZG8gU0VUX0lSUShN
U0kvTVNJWCkgaW9jdGw6DQogLSBjYWxscyBwY2lfZW5hYmxlX21zaXgoKS9wY2lfZW5hYmxlX21z
aV9ibG9jaygpOiB3aGljaCBpcyBzdXBwb3NlZCB0byBzZXQgTVNJIGFkZHJlc3MvZGF0YSBpbiBk
ZXZpY2UuDQogLSBTbyBpbiBjdXJyZW50IGltcGxlbWVudGF0aW9uICh0aGlzIHBhdGNoc2V0KSBt
c2ktc3Vic3lzdGVtIGdldHMgdGhlIElPVkEgZnJvbSBpb21tdSB2aWEgdGhpcyBkZWZpbmVkIGlu
dGVyZmFjZS4NCiAtIEFyZSB5b3Ugc2F5aW5nIHRoYXQgcmF0aGVyIHRoYW4gZ2V0dGluZyB0aGlz
IGZyb20gaW9tbXUsIHdlIHNob3VsZCBnZXQgdGhpcyBmcm9tIHZmaW8/IFdoYXQgZGlmZmVyZW5j
ZSBkb2VzIHRoaXMgbWFrZT8NCg0KVGhhbmtzDQotQmhhcmF0DQoNCj4gSXQgc2VlbXMgbGlrZSB5
b3UnZCB3YW50IHRvIHVzZSB5b3VyIGRldmljZSB0byBnZXQgYSB2ZmlvDQo+IGdyb3VwIHJlZmVy
ZW5jZSwgZnJvbSB3aGljaCB5b3UgY291bGQgZG8gc29tZXRoaW5nIHdpdGggdGhlIHZmaW8gZXh0
ZXJuYWwgdXNlcg0KPiBpbnRlcmZhY2UgYW5kIGdldCB0aGUgaW9tbXUgZG9tYWluIHJlZmVyZW5j
ZS4gIFRoYW5rcywNCj4gDQo+IEFsZXgNCj4gDQo+ID4gPiA+ICAvKg0KPiA+ID4gPiAgICogSU9N
TVUgZ3JvdXBzIGFyZSByZWFsbHkgdGhlIG5hdHJ1YWwgd29ya2luZyB1bml0IG9mIHRoZSBJT01N
VSwgYnV0DQo+ID4gPiA+ICAgKiB0aGUgSU9NTVUgQVBJIHdvcmtzIG9uIGRvbWFpbnMgYW5kIGRl
dmljZXMuICBCcmlkZ2UgdGhhdCBnYXANCj4gPiA+ID4gYnkgZGlmZiAtLWdpdCBhL2luY2x1ZGUv
bGludXgvaW9tbXUuaCBiL2luY2x1ZGUvbGludXgvaW9tbXUuaA0KPiA+ID4gPiBpbmRleCA3ZWEz
MTllLi5mYTA0NmJkIDEwMDY0NA0KPiA+ID4gPiAtLS0gYS9pbmNsdWRlL2xpbnV4L2lvbW11LmgN
Cj4gPiA+ID4gKysrIGIvaW5jbHVkZS9saW51eC9pb21tdS5oDQo+ID4gPiA+IEBAIC0xMjcsNiAr
MTI3LDcgQEAgc3RydWN0IGlvbW11X29wcyB7DQo+ID4gPiA+ICAJaW50ICgqZG9tYWluX3NldF93
aW5kb3dzKShzdHJ1Y3QgaW9tbXVfZG9tYWluICpkb21haW4sIHUzMg0KPiB3X2NvdW50KTsNCj4g
PiA+ID4gIAkvKiBHZXQgdGhlIG51bWVyIG9mIHdpbmRvdyBwZXIgZG9tYWluICovDQo+ID4gPiA+
ICAJdTMyICgqZG9tYWluX2dldF93aW5kb3dzKShzdHJ1Y3QgaW9tbXVfZG9tYWluICpkb21haW4p
Ow0KPiA+ID4gPiArCXN0cnVjdCBpb21tdV9kb21haW4gKigqZ2V0X2Rldl9pb21tdV9kb21haW4p
KHN0cnVjdCBkZXZpY2UNCj4gPiA+ID4gKypkZXYpOw0KPiA+ID4gPg0KPiA+ID4gPiAgCXVuc2ln
bmVkIGxvbmcgcGdzaXplX2JpdG1hcDsNCj4gPiA+ID4gIH07DQo+ID4gPiA+IEBAIC0xOTAsNiAr
MTkxLDcgQEAgZXh0ZXJuIGludCBpb21tdV9kb21haW5fd2luZG93X2VuYWJsZShzdHJ1Y3QNCj4g
PiA+ID4gaW9tbXVfZG9tYWluDQo+ID4gPiAqZG9tYWluLCB1MzIgd25kX25yLA0KPiA+ID4gPiAg
CQkJCSAgICAgIHBoeXNfYWRkcl90IG9mZnNldCwgdTY0IHNpemUsDQo+ID4gPiA+ICAJCQkJICAg
ICAgaW50IHByb3QpOw0KPiA+ID4gPiAgZXh0ZXJuIHZvaWQgaW9tbXVfZG9tYWluX3dpbmRvd19k
aXNhYmxlKHN0cnVjdCBpb21tdV9kb21haW4NCj4gPiA+ID4gKmRvbWFpbiwNCj4gPiA+ID4gdTMy
IHduZF9ucik7DQo+ID4gPiA+ICtleHRlcm4gc3RydWN0IGlvbW11X2RvbWFpbiAqaW9tbXVfZ2V0
X2Rldl9kb21haW4oc3RydWN0IGRldmljZQ0KPiA+ID4gPiArKmRldik7DQo+ID4gPiA+ICAvKioN
Cj4gPiA+ID4gICAqIHJlcG9ydF9pb21tdV9mYXVsdCgpIC0gcmVwb3J0IGFib3V0IGFuIElPTU1V
IGZhdWx0IHRvIHRoZSBJT01NVQ0KPiBmcmFtZXdvcmsNCj4gPiA+ID4gICAqIEBkb21haW46IHRo
ZSBpb21tdSBkb21haW4gd2hlcmUgdGhlIGZhdWx0IGhhcyBoYXBwZW5lZCBAQA0KPiA+ID4gPiAt
Mjg0LDYNCj4gPiA+ID4gKzI4NiwxMSBAQCBzdGF0aWMgaW5saW5lIHZvaWQgaW9tbXVfZG9tYWlu
X3dpbmRvd19kaXNhYmxlKHN0cnVjdA0KPiA+ID4gPiBpb21tdV9kb21haW4gKmRvbWFpbiwgIHsg
IH0NCj4gPiA+ID4NCj4gPiA+ID4gK3N0YXRpYyBpbmxpbmUgc3RydWN0IGlvbW11X2RvbWFpbiAq
aW9tbXVfZ2V0X2Rldl9kb21haW4oc3RydWN0DQo+ID4gPiA+ICtkZXZpY2UNCj4gPiA+ID4gKypk
ZXYpIHsNCj4gPiA+ID4gKwlyZXR1cm4gTlVMTDsNCj4gPiA+ID4gK30NCj4gPiA+ID4gKw0KPiA+
ID4gPiAgc3RhdGljIGlubGluZSBwaHlzX2FkZHJfdCBpb21tdV9pb3ZhX3RvX3BoeXMoc3RydWN0
IGlvbW11X2RvbWFpbg0KPiA+ID4gPiAqZG9tYWluLCBkbWFfYWRkcl90IGlvdmEpICB7DQo+ID4g
PiA+ICAJcmV0dXJuIDA7DQo+ID4gPg0KPiA+ID4NCj4gPiA+DQo+ID4gPiAtLQ0KPiA+ID4gVG8g
dW5zdWJzY3JpYmUgZnJvbSB0aGlzIGxpc3Q6IHNlbmQgdGhlIGxpbmUgInVuc3Vic2NyaWJlIGxp
bnV4LXBjaSINCj4gPiA+IGluIHRoZSBib2R5IG9mIGEgbWVzc2FnZSB0byBtYWpvcmRvbW9Admdl
ci5rZXJuZWwub3JnIE1vcmUgbWFqb3Jkb21vDQo+ID4gPiBpbmZvIGF0IGh0dHA6Ly92Z2VyLmtl
cm5lbC5vcmcvbWFqb3Jkb21vLWluZm8uaHRtbA0KPiA+DQo+IA0KPiANCj4gDQoNCg==

^ permalink raw reply

* Re: [RFC PATCH] PPC: KVM: vfio kvm device: support spapr tce
From: Alex Williamson @ 2013-10-04 16:05 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: linuxppc-dev, kvm-ppc, kvm, linux-kernel
In-Reply-To: <1380889450-25550-1-git-send-email-aik@ozlabs.ru>

On Fri, 2013-10-04 at 22:24 +1000, Alexey Kardashevskiy wrote:
> This is a very rough change set required for ppc64 to use this KVM device.
> 
> vfio_rm.c is a piece of code which is going to be called from the realmode (MMU off),
> and I will put everything spapr-related under #ifdef CONFIG_SPAPR_TCE_IOMMU,
> it is just friday and I have to run :)
> 
> This is an RFC but it works.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  arch/powerpc/kvm/Kconfig  |  1 +
>  arch/powerpc/kvm/Makefile |  4 ++++
>  include/linux/kvm_host.h  |  8 ++++---
>  include/linux/vfio.h      |  3 +++
>  include/uapi/linux/kvm.h  |  1 +
>  virt/kvm/vfio.c           | 46 ++++++++++++++++++++++++++++++++++++++++
>  virt/kvm/vfio_rm.c        | 54 +++++++++++++++++++++++++++++++++++++++++++++++
>  7 files changed, 114 insertions(+), 3 deletions(-)
>  create mode 100644 virt/kvm/vfio_rm.c
> 
> diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
> index 61b3535..d1b7f64 100644
> --- a/arch/powerpc/kvm/Kconfig
> +++ b/arch/powerpc/kvm/Kconfig
> @@ -60,6 +60,7 @@ config KVM_BOOK3S_64
>  	select KVM_BOOK3S_64_HANDLER
>  	select KVM
>  	select SPAPR_TCE_IOMMU
> +	select KVM_VFIO
>  	---help---
>  	  Support running unmodified book3s_64 and book3s_32 guest kernels
>  	  in virtual machines on book3s_64 host processors.
> diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
> index 6646c95..fc2878b 100644
> --- a/arch/powerpc/kvm/Makefile
> +++ b/arch/powerpc/kvm/Makefile
> @@ -55,6 +55,8 @@ kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs)
>  
>  kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \
>  	$(KVM)/coalesced_mmio.o \
> +	$(KVM)/vfio.o \
> +	$(KVM)/vfio_rm.o \
>  	fpu.o \
>  	book3s_paired_singles.o \
>  	book3s_pr.o \
> @@ -76,6 +78,7 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
>  kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \
>  	book3s_hv_rm_xics.o
>  kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
> +	$(KVM)/vfio_rm.o \
>  	book3s_hv_rmhandlers.o \
>  	book3s_hv_rm_mmu.o \
>  	book3s_64_vio_hv.o \
> @@ -89,6 +92,7 @@ kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
>  
>  kvm-book3s_64-module-objs := \
>  	$(KVM)/kvm_main.o \
> +	$(KVM)/vfio.o \
>  	$(KVM)/eventfd.o \
>  	powerpc.o \
>  	emulate.o \
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index ad2b581..43c0290 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -407,6 +407,8 @@ struct kvm {
>  #endif
>  	long tlbs_dirty;
>  	struct list_head devices;
> +
> +	struct kvm_vfio *vfio;


<cringe> can't this be on kvm->arch?

>  };
>  
>  #define kvm_err(fmt, ...) \
> @@ -677,15 +679,15 @@ void kvm_arch_register_noncoherent_dma(struct kvm *kvm);
>  void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm);
>  bool kvm_arch_has_noncoherent_dma(struct kvm *kvm);
>  #else
> -static inline void kvm_arch_register_noncoherent_dma(void)
> +static inline void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
>  {
>  }
>  
> -static inline void kvm_arch_unregister_noncoherent_dma(void)
> +static inline void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
>  {
>  }
>  
> -static inline bool kvm_arch_has_noncoherent_dma(void)
> +static inline bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
>  {
>  	return false;
>  }

Will fix in my series.

> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 24579a0..681e19b 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -97,4 +97,7 @@ extern struct vfio_group *vfio_group_get_external_user(struct file *filep);
>  extern void vfio_group_put_external_user(struct vfio_group *group);
>  extern int vfio_external_user_iommu_id(struct vfio_group *group);
>  
> +extern struct iommu_group *vfio_find_group_by_liobn(struct kvm *kvm,
> +		unsigned long liobn);
> +

Wrong header file.

>  #endif /* VFIO_H */
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 7c1a349..a74ad16 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -847,6 +847,7 @@ struct kvm_device_attr {
>  #define  KVM_DEV_VFIO_GROUP			1
>  #define   KVM_DEV_VFIO_GROUP_ADD			1
>  #define   KVM_DEV_VFIO_GROUP_DEL			2
> +#define  KVM_DEV_VFIO_SPAPR_TCE_LIOBN		2
>  
>  /*
>   * ioctls for VM fds
> diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
> index 2e336a7..39dea9f 100644
> --- a/virt/kvm/vfio.c
> +++ b/virt/kvm/vfio.c
> @@ -22,6 +22,7 @@
>  struct kvm_vfio_group {
>  	struct list_head node;
>  	struct vfio_group *vfio_group;
> +	uint64_t liobn; /* sPAPR */

Perhaps an arch pointer or at least a union.

>  };
>  
>  struct kvm_vfio {
> @@ -188,12 +189,52 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
>  	return -ENXIO;
>  }
>  
> +static int kvm_vfio_set_spapr_tce_liobn(struct kvm_device *dev,
> +		long attr, u64 arg)
> +{
> +	struct kvm_vfio *kv = dev->private;
> +	struct vfio_group *vfio_group;
> +	struct kvm_vfio_group *kvg;
> +	void __user *argp = (void __user *)arg;
> +	struct fd f;
> +	int32_t fd;
> +	uint64_t liobn = attr;
> +
> +	if (get_user(fd, (int32_t __user *)argp))
> +		return -EFAULT;
> +
> +	f = fdget(fd);
> +	if (!f.file)
> +		return -EBADF;
> +
> +	vfio_group = kvm_vfio_group_get_external_user(f.file);
> +	fdput(f);
> +
> +	list_for_each_entry(kvg, &kv->group_list, node) {
> +		if (kvg->vfio_group == vfio_group) {
> +			WARN_ON(kvg->liobn);

Users shouldn't be able to trigger WARN_ON so easily, return -EBUSY,
allow it to be unset and re-set, or just allow the overwrite.

> +			kvg->liobn = liobn;
> +			kvm_vfio_group_put_external_user(vfio_group);
> +			return 0;
> +		}
> +	}
> +
> +	kvm_vfio_group_put_external_user(vfio_group);
> +
> +	return -ENXIO;
> +}
> +
>  static int kvm_vfio_set_attr(struct kvm_device *dev,
>  			     struct kvm_device_attr *attr)
>  {
>  	switch (attr->group) {
>  	case KVM_DEV_VFIO_GROUP:
>  		return kvm_vfio_set_group(dev, attr->attr, attr->addr);
> +#ifdef CONFIG_SPAPR_TCE_IOMMU
> +	case KVM_DEV_VFIO_SPAPR_TCE_LIOBN:
> +		return kvm_vfio_set_spapr_tce_liobn(dev, attr->attr,
> +				attr->addr);
> +#endif
>  	}
>  
>  	return -ENXIO;
> @@ -211,6 +252,10 @@ static int kvm_vfio_has_attr(struct kvm_device *dev,
>  		}
>  
>  		break;
> +#ifdef CONFIG_SPAPR_TCE_IOMMU
> +	case KVM_DEV_VFIO_SPAPR_TCE_LIOBN:
> +		return 0;
> +#endif
>  	}
>  
>  	return -ENXIO;
> @@ -250,6 +295,7 @@ static int kvm_vfio_create(struct kvm_device *dev, u32 type)
>  	mutex_init(&kv->lock);
>  
>  	dev->private = kv;
> +	dev->kvm->vfio = kv;
>  
>  	return 0;
>  }
> diff --git a/virt/kvm/vfio_rm.c b/virt/kvm/vfio_rm.c
> new file mode 100644
> index 0000000..ee9fd96
> --- /dev/null
> +++ b/virt/kvm/vfio_rm.c
> @@ -0,0 +1,54 @@
> +#include <linux/errno.h>
> +#include <linux/file.h>
> +#include <linux/kvm_host.h>
> +#include <linux/list.h>
> +#include <linux/module.h>
> +#include <linux/mutex.h>
> +#include <linux/slab.h>
> +#include <linux/uaccess.h>
> +#include <linux/vfio.h>
> +
> +struct kvm_vfio_group {
> +	struct list_head node;
> +	struct vfio_group *vfio_group;
> +	uint64_t liobn; /* sPAPR */
> +};
> +
> +struct kvm_vfio {
> +	struct list_head group_list;
> +	struct mutex lock;
> +	bool noncoherent;
> +};
> +
> +struct vfio_group {
> +	struct kref			kref;
> +	int				minor;
> +	atomic_t			container_users;
> +	struct iommu_group		*iommu_group;
> +	struct vfio_container		*container;
> +	struct list_head		device_list;
> +	struct mutex			device_lock;
> +	struct device			*dev;
> +	struct notifier_block		nb;
> +	struct list_head		vfio_next;
> +	struct list_head		container_next;
> +	atomic_t			opened;
> +};
> +
> +struct iommu_group *vfio_find_group_by_liobn(struct kvm *kvm, unsigned long liobn)
> +{
> +	struct kvm_vfio_group *kvg;
> +
> +	if (!kvm->vfio)
> +		return NULL;
> +
> +	list_for_each_entry(kvg, &kvm->vfio->group_list, node) {
> +		if (kvg->liobn == liobn)
> +			return kvg->vfio_group->iommu_group;
> +	}
> +
> +	return NULL;
> +}
> +EXPORT_SYMBOL_GPL(vfio_find_group_by_liobn);
> +
> +

You're kidding, right?  These are intentionally private data structures
that are blatantly copied so that you can extract what you want.  NACK.
The iommu_group is available off struct device, do you even need vfio or
this kvm-vfio device to get from liobn to iommu_group?  Thanks,

Alex

^ permalink raw reply

* Re: [PATCH 2/7] iommu: add api to get iommu_domain of a device
From: Alex Williamson @ 2013-10-04 15:45 UTC (permalink / raw)
  To: Bhushan Bharat-R65777
  Cc: agraf@suse.de, Wood Scott-B07421, linux-pci@vger.kernel.org,
	joro@8bytes.org, linux-kernel@vger.kernel.org,
	iommu@lists.linux-foundation.org, linuxppc-dev@lists.ozlabs.org
In-Reply-To: <6A3DF150A5B70D4F9B66A25E3F7C888D0718FD1C@039-SN2MPN1-011.039d.mgd.msft.net>

On Fri, 2013-10-04 at 09:54 +0000, Bhushan Bharat-R65777 wrote:
> 
> > -----Original Message-----
> > From: linux-pci-owner@vger.kernel.org [mailto:linux-pci-owner@vger.kernel.org]
> > On Behalf Of Alex Williamson
> > Sent: Wednesday, September 25, 2013 10:16 PM
> > To: Bhushan Bharat-R65777
> > Cc: joro@8bytes.org; benh@kernel.crashing.org; galak@kernel.crashing.org; linux-
> > kernel@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; linux-
> > pci@vger.kernel.org; agraf@suse.de; Wood Scott-B07421; iommu@lists.linux-
> > foundation.org; Bhushan Bharat-R65777
> > Subject: Re: [PATCH 2/7] iommu: add api to get iommu_domain of a device
> > 
> > On Thu, 2013-09-19 at 12:59 +0530, Bharat Bhushan wrote:
> > > This api return the iommu domain to which the device is attached.
> > > The iommu_domain is required for making API calls related to iommu.
> > > Follow up patches which use this API to know iommu maping.
> > >
> > > Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
> > > ---
> > >  drivers/iommu/iommu.c |   10 ++++++++++
> > >  include/linux/iommu.h |    7 +++++++
> > >  2 files changed, 17 insertions(+), 0 deletions(-)
> > >
> > > diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index
> > > fbe9ca7..6ac5f50 100644
> > > --- a/drivers/iommu/iommu.c
> > > +++ b/drivers/iommu/iommu.c
> > > @@ -696,6 +696,16 @@ void iommu_detach_device(struct iommu_domain
> > > *domain, struct device *dev)  }
> > > EXPORT_SYMBOL_GPL(iommu_detach_device);
> > >
> > > +struct iommu_domain *iommu_get_dev_domain(struct device *dev) {
> > > +	struct iommu_ops *ops = dev->bus->iommu_ops;
> > > +
> > > +	if (unlikely(ops == NULL || ops->get_dev_iommu_domain == NULL))
> > > +		return NULL;
> > > +
> > > +	return ops->get_dev_iommu_domain(dev); }
> > > +EXPORT_SYMBOL_GPL(iommu_get_dev_domain);
> > 
> > What prevents this from racing iommu_domain_free()?  There's no references
> > acquired, so there's no reason for the caller to assume the pointer is valid.
> 
> Sorry for late query, somehow this email went into a folder and escaped;
> 
> Just to be sure, there is not lock at generic "struct iommu_domain", but IP specific structure (link FSL domain) linked in iommu_domain->priv have a lock, so we need to ensure this race in FSL iommu code (say drivers/iommu/fsl_pamu_domain.c), right?

No, it's not sufficient to make sure that your use of the interface is
race free.  The interface itself needs to be designed so that it's
difficult to use incorrectly.  That's not the case here.  This is a
backdoor to get the iommu domain from the iommu driver regardless of who
is using it or how.  The iommu domain is created and managed by vfio, so
shouldn't we be looking at how to do this through vfio?  It seems like
you'd want to use your device to get a vfio group reference, from which
you could do something with the vfio external user interface and get the
iommu domain reference.  Thanks,

Alex
 
> > >  /*
> > >   * IOMMU groups are really the natrual working unit of the IOMMU, but
> > >   * the IOMMU API works on domains and devices.  Bridge that gap by
> > > diff --git a/include/linux/iommu.h b/include/linux/iommu.h index
> > > 7ea319e..fa046bd 100644
> > > --- a/include/linux/iommu.h
> > > +++ b/include/linux/iommu.h
> > > @@ -127,6 +127,7 @@ struct iommu_ops {
> > >  	int (*domain_set_windows)(struct iommu_domain *domain, u32 w_count);
> > >  	/* Get the numer of window per domain */
> > >  	u32 (*domain_get_windows)(struct iommu_domain *domain);
> > > +	struct iommu_domain *(*get_dev_iommu_domain)(struct device *dev);
> > >
> > >  	unsigned long pgsize_bitmap;
> > >  };
> > > @@ -190,6 +191,7 @@ extern int iommu_domain_window_enable(struct iommu_domain
> > *domain, u32 wnd_nr,
> > >  				      phys_addr_t offset, u64 size,
> > >  				      int prot);
> > >  extern void iommu_domain_window_disable(struct iommu_domain *domain,
> > > u32 wnd_nr);
> > > +extern struct iommu_domain *iommu_get_dev_domain(struct device *dev);
> > >  /**
> > >   * report_iommu_fault() - report about an IOMMU fault to the IOMMU framework
> > >   * @domain: the iommu domain where the fault has happened @@ -284,6
> > > +286,11 @@ static inline void iommu_domain_window_disable(struct
> > > iommu_domain *domain,  {  }
> > >
> > > +static inline struct iommu_domain *iommu_get_dev_domain(struct device
> > > +*dev) {
> > > +	return NULL;
> > > +}
> > > +
> > >  static inline phys_addr_t iommu_iova_to_phys(struct iommu_domain
> > > *domain, dma_addr_t iova)  {
> > >  	return 0;
> > 
> > 
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-pci" in the body
> > of a message to majordomo@vger.kernel.org More majordomo info at
> > http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply

* Re: [PATCH] Kind of revert "powerpc: 52xx: provide a default in mpc52xx_irqhost_map()"
From: Sebastian Andrzej Siewior @ 2013-10-04 15:38 UTC (permalink / raw)
  To: Wolfram Sang; +Cc: linuxppc-dev, Anatolij Gustschin, linux-rt-users
In-Reply-To: <1380901029-3548-1-git-send-email-wsa@the-dreams.de>

On 10/04/2013 05:37 PM, Wolfram Sang wrote:
> This more or less reverts commit 6391f697d4892a6f233501beea553e13f7745a23.
> Instead of adding an unneeded 'default', mark the variable to prevent
> the false positive 'uninitialized var'. The other change (fixing the
> printout) needs revert, too. We want to know WHICH critical irq failed,
> not which level it had.
> 
> Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
> Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> Cc: Anatolij Gustschin <agust@denx.de>

Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>

Sebastian

^ permalink raw reply

* [PATCH] Kind of revert "powerpc: 52xx: provide a default in mpc52xx_irqhost_map()"
From: Wolfram Sang @ 2013-10-04 15:37 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Sebastian Andrzej Siewior, Anatolij Gustschin, linux-rt-users,
	Wolfram Sang

This more or less reverts commit 6391f697d4892a6f233501beea553e13f7745a23.
Instead of adding an unneeded 'default', mark the variable to prevent
the false positive 'uninitialized var'. The other change (fixing the
printout) needs revert, too. We want to know WHICH critical irq failed,
not which level it had.

Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Anatolij Gustschin <agust@denx.de>
---
 arch/powerpc/platforms/52xx/mpc52xx_pic.c |    5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/52xx/mpc52xx_pic.c b/arch/powerpc/platforms/52xx/mpc52xx_pic.c
index b69221b..2898b73 100644
--- a/arch/powerpc/platforms/52xx/mpc52xx_pic.c
+++ b/arch/powerpc/platforms/52xx/mpc52xx_pic.c
@@ -340,7 +340,7 @@ static int mpc52xx_irqhost_map(struct irq_domain *h, unsigned int virq,
 {
 	int l1irq;
 	int l2irq;
-	struct irq_chip *irqchip;
+	struct irq_chip *uninitialized_var(irqchip);
 	void *hndlr;
 	int type;
 	u32 reg;
@@ -373,9 +373,8 @@ static int mpc52xx_irqhost_map(struct irq_domain *h, unsigned int virq,
 	case MPC52xx_IRQ_L1_PERP: irqchip = &mpc52xx_periph_irqchip; break;
 	case MPC52xx_IRQ_L1_SDMA: irqchip = &mpc52xx_sdma_irqchip; break;
 	case MPC52xx_IRQ_L1_CRIT:
-	default:
 		pr_warn("%s: Critical IRQ #%d is unsupported! Nopping it.\n",
-			__func__, l1irq);
+			__func__, l2irq);
 		irq_set_chip(virq, &no_irq_chip);
 		return 0;
 	}
-- 
1.7.10.4

^ permalink raw reply related

* Re: [PATCH -V2] kvm: powerpc: book3s: Fix build break for BOOK3S_32
From: Alexander Graf @ 2013-10-04 14:43 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: paulus, linuxppc-dev, kvm-ppc, kvm
In-Reply-To: <1380787123-32646-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>


On 03.10.2013, at 09:58, Aneesh Kumar K.V wrote:

> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
>=20
> This was introduced by 85a0d845d8 ("KVM: PPC: Book3S PR: Allocate
> kvm_vcpu structs from kvm_vcpu_cache").
>=20
> arch/powerpc/kvm/book3s_pr.c: In function 'kvmppc_core_vcpu_create':
> arch/powerpc/kvm/book3s_pr.c:1182:30: error: 'struct =
kvmppc_vcpu_book3s' has no member named 'shadow_vcpu'
> make[1]: *** [arch/powerpc/kvm/book3s_pr.o] Error 1
>=20
> Acked-by: Paul Mackerras <paulus@samba.org>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

Thanks, squashed into the offending commit in kvm-ppc-queue.


Alex

> ---
> arch/powerpc/kvm/book3s_pr.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>=20
> diff --git a/arch/powerpc/kvm/book3s_pr.c =
b/arch/powerpc/kvm/book3s_pr.c
> index 8941885..6075dbd 100644
> --- a/arch/powerpc/kvm/book3s_pr.c
> +++ b/arch/powerpc/kvm/book3s_pr.c
> @@ -1179,7 +1179,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct =
kvm *kvm, unsigned int id)
>=20
> #ifdef CONFIG_KVM_BOOK3S_32
> 	vcpu->arch.shadow_vcpu =3D
> -		kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL);
> +		kzalloc(sizeof(*vcpu->arch.shadow_vcpu), GFP_KERNEL);
> 	if (!vcpu->arch.shadow_vcpu)
> 		goto free_vcpu3s;
> #endif
> --=20
> 1.8.1.2
>=20
> --
> To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 0/2] powerpc: allow kvm to use kerel debug framework
From: Alexander Graf @ 2013-10-04 14:40 UTC (permalink / raw)
  To: Michael Neuling
  Cc: linux-kernel, Bharat Bhushan, Bharat Bhushan, scottwood,
	linuxppc-dev
In-Reply-To: <21895.1373441122@ale.ozlabs.ibm.com>


On 10.07.2013, at 09:25, Michael Neuling wrote:

> Alexander Graf <agraf@suse.de> wrote:
>=20
>>=20
>> On 09.07.2013, at 06:24, Michael Neuling wrote:
>>=20
>>> Alexander Graf <agraf@suse.de> wrote:
>>>=20
>>>>=20
>>>> On 04.07.2013, at 08:15, Bharat Bhushan wrote:
>>>>=20
>>>>> From: Bharat Bhushan <bharat.bhushan@freescale.com>
>>>>>=20
>>>>> This patchset moves the debug registers in a structure, which =
allows
>>>>> kvm to use same structure for debug emulation.
>>>>>=20
>>>>> Note: Earilier a patchset =
"https://lists.ozlabs.org/pipermail/linuxppc-dev/2013-June/108132.html"
>>>>> was sent which is a bunch of six patches. That patchset is divided =
into two parts:
>>>>> 	1) powerpc specific changes (These 2 patches are actually have =
those changes)
>>>>> 	2) KVM specific changes (will send separate patch on agraf =
repository)
>>>>=20
>>>> Mikey, if you like those could you please apply the into a topic
>>>> branch and get that one merged with Ben? I'd also pull it into my =
tree
>>>> then.
>>>=20
>>> benh would pull these directly. =20
>>>=20
>>> I'll have a chat with him to see if he wants my ACK before he does =
that.
>>=20
>> I have a bunch of patches that I need to apply on top, so I need a =
topic branch.
>=20
> I've acked the PPC specific bits of the v6 version of these patches.
>=20
> benh said he'll open a topic branch for you sometime next week and =
he'll
> stick them in there. =20

After talking to Ben about it, I've just put these into kvm-ppc-queue, =
so they will hit Linus' tree through the kvm tree.


Alex

^ permalink raw reply

* Re: [PATCH] kvm: powerpc: book3s: Fix build break for BOOK3S_32
From: Aneesh Kumar K.V @ 2013-10-04 14:24 UTC (permalink / raw)
  To: Alexander Graf, Paul Mackerras; +Cc: linuxppc-dev, kvm, kvm-ppc
In-Reply-To: <36F00CBE-E313-465A-BD93-A8E1B96FD2A9@suse.de>

Alexander Graf <agraf@suse.de> writes:

> On 04.10.2013, at 14:23, Alexander Graf wrote:
>
>> 
>> On 03.10.2013, at 06:14, Paul Mackerras wrote:
>> 
>>> On Wed, Oct 02, 2013 at 08:08:44PM +0530, Aneesh Kumar K.V wrote:
>>>> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
>>>> 
>>>> This was introduced by 85a0d845d8bb5df5d2669416212f56cbe1474c6b
>>> 
>>> It's a good idea to give the headline of the commit as well as the ID.
>>> I also like to trim the ID to 10 characters or so.  So it should look
>>> like this:
>>> 
>>> This was introduced by 85a0d845d8 ("KVM: PPC: Book3S PR: Allocate
>>> kvm_vcpu structs from kvm_vcpu_cache").
>>> 
>>>> arch/powerpc/kvm/book3s_pr.c: In function 'kvmppc_core_vcpu_create':
>>>> arch/powerpc/kvm/book3s_pr.c:1182:30: error: 'struct kvmppc_vcpu_book3s' has no member named 'shadow_vcpu'
>>>> make[1]: *** [arch/powerpc/kvm/book3s_pr.o] Error 1
>>>> 
>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
>>> 
>>> Acked-by: Paul Mackerras <paulus@samba.org>
>> 
>> Would you guys mind if I merge this into the offending patch? It's not trickled into -next yet, so rebasing should work.
>> 
>> If not, please resend with the fixed commit message.
>
> Eh - I must've missed v2 :). So that leaves only the question on whether you'd be ok to squash the patch instead. It'd help bisectability.

Yes, it should be squashed if we can do that.

-aneesh

^ permalink raw reply

* Re: [PATCH 0/6 v5] kvm: powerpc: use cache attributes from linux pte
From: Alexander Graf @ 2013-10-04 14:03 UTC (permalink / raw)
  To: Bharat Bhushan
  Cc: kvm, kvm-ppc, Bharat Bhushan, paulus, scottwood, linuxppc-dev
In-Reply-To: <1379570566-3715-1-git-send-email-Bharat.Bhushan@freescale.com>


On 19.09.2013, at 08:02, Bharat Bhushan wrote:

> From: Bharat Bhushan <bharat.bhushan@freescale.com>
>=20
> First patch is a typo fix where book3e define _PAGE_LENDIAN while it
> should be defined as _PAGE_ENDIAN. This seems to show that this is =
never exercised :-)
>=20
> Second and third patch is to allow guest controlling "G"-Guarded and =
"E"-Endian TLB attributes respectively.
>=20
> Fourth patch is moving functions/logic in common code so they can be =
used on booke also.
>=20
> Fifth and Sixth patch is actually setting caching attributes =
(TLB.WIMGE) using corresponding Linux pte.

Thanks, applied 1/6 - 3/6 to kvm-ppc-queue.


Alex

>=20
> v3->v5
> - Fix tlb-reference-flag clearing issue (patch 4/6)
> - There was a patch (4/6 powerpc: move linux pte/hugepte search to =
more generic file)
>   in the last series of this patchset which was moving pte/hugepte =
search functions to
>   generic file. That patch is no more needed as some other patch is =
already applied to fix that :)
>=20
> v2->v3
> - now lookup_linux_pte() only have pte search logic and it does not
>   set any access flags in pte. There is already a function for setting
>   access flag which will be called explicitly where needed.
>   On booke we only need to search for pte to get WIMG.
>=20
> v1->v2
> - Earlier caching attributes (WIMGE) were set based of page is RAM or =
not
>   But now we get these attributes from corresponding Linux PTE.
>=20
> Bharat Bhushan (6):
>  powerpc: book3e: _PAGE_LENDIAN must be _PAGE_ENDIAN
>  kvm: powerpc: allow guest control "E" attribute in mas2
>  kvm: powerpc: allow guest control "G" attribute in mas2
>  kvm: powerpc: keep only pte search logic in lookup_linux_pte
>  kvm: booke: clear host tlb reference flag on guest tlb invalidation
>  kvm: powerpc: use caching attributes as per linux pte
>=20
> arch/powerpc/include/asm/kvm_host.h   |    2 +-
> arch/powerpc/include/asm/pgtable.h    |   24 ++++++++++++++++
> arch/powerpc/include/asm/pte-book3e.h |    2 +-
> arch/powerpc/kvm/book3s_hv_rm_mmu.c   |   36 ++++++++----------------
> arch/powerpc/kvm/booke.c              |    2 +-
> arch/powerpc/kvm/e500.h               |   10 ++++--
> arch/powerpc/kvm/e500_mmu_host.c      |   50 =
+++++++++++++++++++--------------
> 7 files changed, 74 insertions(+), 52 deletions(-)
>=20
>=20

^ permalink raw reply

* Re: [PATCH 0/4 v6] KVM :PPC: Userspace Debug support
From: Alexander Graf @ 2013-10-04 14:02 UTC (permalink / raw)
  To: Bharat Bhushan
  Cc: sfr, mikey, kvm, kvm-ppc, Bharat Bhushan, tiejun.chen, scottwood,
	linuxppc-dev
In-Reply-To: <1372921067-19678-1-git-send-email-Bharat.Bhushan@freescale.com>


On 04.07.2013, at 08:57, Bharat Bhushan wrote:

> From: Bharat Bhushan <bharat.bhushan@freescale.com>
>=20
> Note: These patches depends on https://lkml.org/lkml/2013/7/4/49.
>=20
> This patchset adds the userspace debug support for booke/bookehv.
> this is tested on powerpc e500v2/e500mc devices.

Thanks, applied to kvm-ppc-queue.


Alex

>=20
> We are now assuming that debug resource will not be used by kernel for
> its own debugging. It will be used for only kernel user process =
debugging.
> So the kernel debug load interface during context_to is used to load
> debug conext for that selected process.
>=20
> v5->v6
> - Earlier it was a patchset of six patches. Now this is devided into
>   two parts: 1) powerpc spcific changes 2) kvm specific changes.
>   This patchset now contains only KVM specific changes.
> - using "ehpriv 1" (earlier using "ehpriv") for software breakpoint
> -
>=20
> v4->v5
> - Some comments reworded and other cleanup (like change of function =
name etc)
> - Added a function for setting MSRP rather than inline
>=20
> v3->v4
> - 4 out of 7 patches of initial patchset were applied.
>   This patchset is on and above those 4 patches
> - KVM local "struct kvmppc_booke_debug_reg" is replaced by
>   powerpc global "struct debug_reg"
> - use switch_booke_debug_regs() for debug register context switch.
> - Save DBSR before kernel pre-emption is enabled.
> - Some more cleanup
>=20
> v2->v3
> - We are now assuming that debug resource will not be used by
>   kernel for its own debugging.
>   It will be used for only kernel user process debugging.
>   So the kernel debug load interface during context_to is
>   used to load debug conext for that selected process.
>=20
> v1->v2
> - Debug registers are save/restore in vcpu_put/vcpu_get.
>   Earlier the debug registers are saved/restored in guest entry/exit
>=20
> Bharat Bhushan (4):
>  powerpc: export debug registers save function for KVM
>  KVM: PPC: exit to user space on "ehpriv 1" instruction
>  KVM: PPC: Using "struct debug_reg"
>  KVM: PPC: Add userspace debug stub support
>=20
> arch/powerpc/include/asm/disassemble.h |    4 +
> arch/powerpc/include/asm/kvm_booke.h   |    7 +-
> arch/powerpc/include/asm/kvm_host.h    |   16 +--
> arch/powerpc/include/asm/switch_to.h   |    1 +
> arch/powerpc/include/uapi/asm/kvm.h    |   22 ++-
> arch/powerpc/kernel/process.c          |    3 +-
> arch/powerpc/kvm/booke.c               |  275 =
++++++++++++++++++++++++++++----
> arch/powerpc/kvm/booke.h               |    5 +
> arch/powerpc/kvm/e500_emulate.c        |   26 +++
> 9 files changed, 312 insertions(+), 47 deletions(-)
>=20
>=20

^ permalink raw reply

* Re: [PATCH v3 net-next] fix unsafe set_memory_rw from softirq
From: Eric Dumazet @ 2013-10-04 13:56 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Heiko Carstens, Paul Mackerras, H. Peter Anvin, sparclinux,
	Nicolas Dichtel, Alexei Starovoitov, linux-s390, Russell King,
	x86, James Morris, Ingo Molnar, Alexey Kuznetsov,
	Paul E. McKenney, Xi Wang, Matt Evans, Thomas Gleixner,
	linux-arm-kernel, Stelian Nirlu, Nicolas Schichan,
	Hideaki YOSHIFUJI, netdev, linux-kernel, David S. Miller,
	Mircea Gherzan, Daniel Borkmann, Martin Schwidefsky, linux390,
	linuxppc-dev, Patrick McHardy
In-Reply-To: <20131004075133.GA12313@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 1306 bytes --]

1)
>
> I took a brief look at arch/x86/net/bpf_jit_comp.c while reviewing this
> patch.
>
> You need to split up bpf_jit_compile(), it's an obscenely large, ~600
> lines long function. We don't do that in modern, maintainable kernel code.
>
> 2)
>
> This 128 bytes extra padding:
>
>         /* Most of BPF filters are really small,
>          * but if some of them fill a page, allow at least
>          * 128 extra bytes to insert a random section of int3
>          */
>         sz = round_up(proglen + sizeof(*header) + 128, PAGE_SIZE);
>
> why is it done? It's not clear to me from the comment.
>

commit 314beb9bcabfd6b4542ccbced2402af2c6f6142a
Author: Eric Dumazet <edumazet@google.com>
Date:   Fri May 17 16:37:03 2013 +0000

    x86: bpf_jit_comp: secure bpf jit against spraying attacks

    hpa bringed into my attention some security related issues
    with BPF JIT on x86.

    This patch makes sure the bpf generated code is marked read only,
    as other kernel text sections.

    It also splits the unused space (we vmalloc() and only use a fraction of
    the page) in two parts, so that the generated bpf code not starts at a
    known offset in the page, but a pseudo random one.

    Refs:

http://mainisusuallyafunction.blogspot.com/2012/11/attacking-hardened-linux-systems-with.html

[-- Attachment #2: Type: text/html, Size: 2059 bytes --]

^ permalink raw reply

* Re: Gianfar driver crashes in Kernel v3.10
From: Claudiu Manoil @ 2013-10-04 13:45 UTC (permalink / raw)
  To: Thomas Hühn; +Cc: linuxppc-dev@lists.ozlabs.org
In-Reply-To: <8EF35D6A-A132-458C-A3B4-80D4D2C5BA4C@dai-labor.de>

On 10/4/2013 3:03 PM, Thomas H=FChn wrote:
> Hi all,
>
> We are several Openwrt users based on the TPlink 4900 device and suffer=
 from a crashing gianfar driver.
> We troubleshooted the problem down to the fact, that a 3.8er Linux kern=
el is working, and a v3.10 crashes, but there is
> no reproducable case yet.

I'll have some low traffic tests with the upstream 3.10 kernel on a
p1010rdb, but since you say there's no "reproducibility case" the crash
may not be easy to spot on my side.  I wouldn't jump to conclusions
too fast, the fact that it manifests now doesn't neccesarily mean
that a bug was introduced between v3.8 and v3.10.
I'll let you know should I find something.  Meanwhile, if you have
additional information about how to reproduce it, please share.

Thanks,
Claudiu

^ permalink raw reply

* RE: [PATCH 4/6 v5] kvm: powerpc: keep only pte search logic in lookup_linux_pte
From: Bhushan Bharat-R65777 @ 2013-10-04 13:44 UTC (permalink / raw)
  To: Alexander Graf
  Cc: Wood Scott-B07421, kvm@vger.kernel.org, kvm-ppc@vger.kernel.org,
	paulus@samba.org, linuxppc-dev@lists.ozlabs.org
In-Reply-To: <7DEA9422-69D4-4470-BE28-A3B0A623D000@suse.de>



> -----Original Message-----
> From: Alexander Graf [mailto:agraf@suse.de]
> Sent: Friday, October 04, 2013 6:57 PM
> To: Bhushan Bharat-R65777
> Cc: benh@kernel.crashing.org; paulus@samba.org; kvm@vger.kernel.org; kvm-
> ppc@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; Wood Scott-B07421; Bh=
ushan
> Bharat-R65777
> Subject: Re: [PATCH 4/6 v5] kvm: powerpc: keep only pte search logic in
> lookup_linux_pte
>=20
>=20
> On 19.09.2013, at 08:02, Bharat Bhushan wrote:
>=20
> > lookup_linux_pte() was searching for a pte and also sets access flags
> > is writable. This function now searches only pte while access flag
> > setting is done explicitly.
> >
> > This pte lookup is not kvm specific, so moved to common code
> > (asm/pgtable.h) My Followup patch will use this on booke.
> >
> > Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
> > ---
> > v4->v5
> > - No change
> >
> > arch/powerpc/include/asm/pgtable.h  |   24 +++++++++++++++++++++++
> > arch/powerpc/kvm/book3s_hv_rm_mmu.c |   36 +++++++++++-----------------=
------
> > 2 files changed, 36 insertions(+), 24 deletions(-)
> >
> > diff --git a/arch/powerpc/include/asm/pgtable.h
> > b/arch/powerpc/include/asm/pgtable.h
> > index 7d6eacf..3a5de5c 100644
> > --- a/arch/powerpc/include/asm/pgtable.h
> > +++ b/arch/powerpc/include/asm/pgtable.h
> > @@ -223,6 +223,30 @@ extern int gup_hugepte(pte_t *ptep, unsigned long
> > sz, unsigned long addr, #endif pte_t *find_linux_pte_or_hugepte(pgd_t
> > *pgdir, unsigned long ea,
> > 				 unsigned *shift);
> > +
> > +static inline pte_t *lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
> > +				     unsigned long *pte_sizep)
> > +{
> > +	pte_t *ptep;
> > +	unsigned long ps =3D *pte_sizep;
> > +	unsigned int shift;
> > +
> > +	ptep =3D find_linux_pte_or_hugepte(pgdir, hva, &shift);
> > +	if (!ptep)
> > +		return __pte(0);
>=20
> This returns a struct pte_t, but your return value of the function is a s=
truct
> pte_t *. So this code will fail compiling with STRICT_MM_TYPECHECKS set. =
Any
> reason you don't just return NULL here?

I want to return the ptep (pte pointer) , so yes this should be NULL.
Will correct this.

Thanks
-Bharat

>=20
> That way callers could simply check on if (ptep) ... or you leave the ret=
urn
> value as struct pte_t.
>=20
>=20
> Alex
>=20
> > +	if (shift)
> > +		*pte_sizep =3D 1ul << shift;
> > +	else
> > +		*pte_sizep =3D PAGE_SIZE;
> > +
> > +	if (ps > *pte_sizep)
> > +		return __pte(0);
> > +
> > +	if (!pte_present(*ptep))
> > +		return __pte(0);
>=20
> > +
> > +	return ptep;
> > +}
> > #endif /* __ASSEMBLY__ */
> >
> > #endif /* __KERNEL__ */
> > diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> > b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> > index 45e30d6..74fa7f8 100644
> > --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> > +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> > @@ -134,25 +134,6 @@ static void remove_revmap_chain(struct kvm *kvm, l=
ong
> pte_index,
> > 	unlock_rmap(rmap);
> > }
> >
> > -static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
> > -			      int writing, unsigned long *pte_sizep)
> > -{
> > -	pte_t *ptep;
> > -	unsigned long ps =3D *pte_sizep;
> > -	unsigned int hugepage_shift;
> > -
> > -	ptep =3D find_linux_pte_or_hugepte(pgdir, hva, &hugepage_shift);
> > -	if (!ptep)
> > -		return __pte(0);
> > -	if (hugepage_shift)
> > -		*pte_sizep =3D 1ul << hugepage_shift;
> > -	else
> > -		*pte_sizep =3D PAGE_SIZE;
> > -	if (ps > *pte_sizep)
> > -		return __pte(0);
> > -	return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift);
> > -}
> > -
> > static inline void unlock_hpte(unsigned long *hpte, unsigned long
> > hpte_v) {
> > 	asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); @@ -173,6 +154,7
> > @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
> > 	unsigned long is_io;
> > 	unsigned long *rmap;
> > 	pte_t pte;
> > +	pte_t *ptep;
> > 	unsigned int writing;
> > 	unsigned long mmu_seq;
> > 	unsigned long rcbits;
> > @@ -231,8 +213,9 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned
> > long flags,
> >
> > 		/* Look up the Linux PTE for the backing page */
> > 		pte_size =3D psize;
> > -		pte =3D lookup_linux_pte(pgdir, hva, writing, &pte_size);
> > -		if (pte_present(pte)) {
> > +		ptep =3D lookup_linux_pte(pgdir, hva, &pte_size);
> > +		if (pte_present(pte_val(*ptep))) {
> > +			pte =3D kvmppc_read_update_linux_pte(ptep, writing);
> > 			if (writing && !pte_write(pte))
> > 				/* make the actual HPTE be read-only */
> > 				ptel =3D hpte_make_readonly(ptel);
> > @@ -661,15 +644,20 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsi=
gned
> long flags,
> > 			struct kvm_memory_slot *memslot;
> > 			pgd_t *pgdir =3D vcpu->arch.pgdir;
> > 			pte_t pte;
> > +			pte_t *ptep;
> >
> > 			psize =3D hpte_page_size(v, r);
> > 			gfn =3D ((r & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT;
> > 			memslot =3D __gfn_to_memslot(kvm_memslots(kvm), gfn);
> > 			if (memslot) {
> > 				hva =3D __gfn_to_hva_memslot(memslot, gfn);
> > -				pte =3D lookup_linux_pte(pgdir, hva, 1, &psize);
> > -				if (pte_present(pte) && !pte_write(pte))
> > -					r =3D hpte_make_readonly(r);
> > +				ptep =3D lookup_linux_pte(pgdir, hva, &psize);
> > +				if (pte_present(pte_val(*ptep))) {
> > +					pte =3D kvmppc_read_update_linux_pte(ptep,
> > +									   1);
> > +					if (pte_present(pte) && !pte_write(pte))
> > +						r =3D hpte_make_readonly(r);
> > +				}
> > 			}
> > 		}
> > 	}
> > --
> > 1.7.0.4
> >
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
> > the body of a message to majordomo@vger.kernel.org More majordomo info
> > at  http://vger.kernel.org/majordomo-info.html
>=20

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox