* [RFC PATCH 02/19] drivers/vhost: Convert to use vm_account
2023-01-24 5:42 ` Alistair Popple
@ 2023-01-24 5:42 ` Alistair Popple
-1 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-24 5:42 UTC (permalink / raw)
To: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, jgg-DDmLM1+adcrQT0dZR+AlfA,
jhubbard-DDmLM1+adcrQT0dZR+AlfA, tjmercier-hpIqsD4AKlfQT0dZR+AlfA,
hannes-druUgvl0LCNAfugRpC6u6w, surenb-hpIqsD4AKlfQT0dZR+AlfA,
mkoutny-IBi9RG/b67k, daniel-/w4YWyX8dFk, Alistair Popple,
Michael S. Tsirkin, Jason Wang, kvm-u79uwXL29TY76Z2rM5mHXA,
virtualization-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
netdev-u79uwXL29TY76Z2rM5mHXA
Convert vhost to use the new vm_account structure and associated
vm_account_pinned()/vm_unaccount_pinned() functions.
Signed-off-by: Alistair Popple <apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org>
Cc: "Michael S. Tsirkin" <mst-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Cc: Jason Wang <jasowang-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Cc: kvm-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: virtualization-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
Cc: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
---
drivers/vhost/vdpa.c | 9 +++++----
drivers/vhost/vhost.c | 2 ++
drivers/vhost/vhost.h | 1 +
3 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index ec32f78..a31dd53 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -716,7 +716,7 @@ static void vhost_vdpa_pa_unmap(struct vhost_vdpa *v, struct vhost_iotlb *iotlb,
set_page_dirty_lock(page);
unpin_user_page(page);
}
- atomic64_sub(PFN_DOWN(map->size), &dev->mm->pinned_vm);
+ vm_unaccount_pinned(&dev->vm_account, PFN_DOWN(map->size));
vhost_vdpa_general_unmap(v, map, asid);
vhost_iotlb_map_free(iotlb, map);
}
@@ -780,6 +780,10 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, struct vhost_iotlb *iotlb,
u32 asid = iotlb_to_asid(iotlb);
int r = 0;
+ if (!vdpa->use_va)
+ if (vm_account_pinned(&dev->vm_account, PFN_DOWN(size)))
+ return -ENOMEM;
+
r = vhost_iotlb_add_range_ctx(iotlb, iova, iova + size - 1,
pa, perm, opaque);
if (r)
@@ -799,9 +803,6 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, struct vhost_iotlb *iotlb,
return r;
}
- if (!vdpa->use_va)
- atomic64_add(PFN_DOWN(size), &dev->mm->pinned_vm);
-
return 0;
}
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index cbe72bf..5645c26 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -556,6 +556,7 @@ static void vhost_attach_mm(struct vhost_dev *dev)
dev->mm = current->mm;
mmgrab(dev->mm);
}
+ vm_account_init_current(&dev->vm_account);
}
static void vhost_detach_mm(struct vhost_dev *dev)
@@ -569,6 +570,7 @@ static void vhost_detach_mm(struct vhost_dev *dev)
mmdrop(dev->mm);
dev->mm = NULL;
+ vm_account_release(&dev->vm_account);
}
/* Caller should have device mutex */
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index d910910..3a9aed8 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -144,6 +144,7 @@ struct vhost_msg_node {
struct vhost_dev {
struct mm_struct *mm;
struct mutex mutex;
+ struct vm_account vm_account;
struct vhost_virtqueue **vqs;
int nvqs;
struct eventfd_ctx *log_ctx;
--
git-series 0.9.1
^ permalink raw reply related [flat|nested] 108+ messages in thread
* [RFC PATCH 02/19] drivers/vhost: Convert to use vm_account
@ 2023-01-24 5:42 ` Alistair Popple
0 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-24 5:42 UTC (permalink / raw)
To: linux-mm, cgroups
Cc: linux-kernel, jgg, jhubbard, tjmercier, hannes, surenb, mkoutny,
daniel, Alistair Popple, Michael S. Tsirkin, Jason Wang, kvm,
virtualization, netdev
Convert vhost to use the new vm_account structure and associated
vm_account_pinned()/vm_unaccount_pinned() functions.
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: kvm@vger.kernel.org
Cc: virtualization@lists.linux-foundation.org
Cc: netdev@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
drivers/vhost/vdpa.c | 9 +++++----
drivers/vhost/vhost.c | 2 ++
drivers/vhost/vhost.h | 1 +
3 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index ec32f78..a31dd53 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -716,7 +716,7 @@ static void vhost_vdpa_pa_unmap(struct vhost_vdpa *v, struct vhost_iotlb *iotlb,
set_page_dirty_lock(page);
unpin_user_page(page);
}
- atomic64_sub(PFN_DOWN(map->size), &dev->mm->pinned_vm);
+ vm_unaccount_pinned(&dev->vm_account, PFN_DOWN(map->size));
vhost_vdpa_general_unmap(v, map, asid);
vhost_iotlb_map_free(iotlb, map);
}
@@ -780,6 +780,10 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, struct vhost_iotlb *iotlb,
u32 asid = iotlb_to_asid(iotlb);
int r = 0;
+ if (!vdpa->use_va)
+ if (vm_account_pinned(&dev->vm_account, PFN_DOWN(size)))
+ return -ENOMEM;
+
r = vhost_iotlb_add_range_ctx(iotlb, iova, iova + size - 1,
pa, perm, opaque);
if (r)
@@ -799,9 +803,6 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, struct vhost_iotlb *iotlb,
return r;
}
- if (!vdpa->use_va)
- atomic64_add(PFN_DOWN(size), &dev->mm->pinned_vm);
-
return 0;
}
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index cbe72bf..5645c26 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -556,6 +556,7 @@ static void vhost_attach_mm(struct vhost_dev *dev)
dev->mm = current->mm;
mmgrab(dev->mm);
}
+ vm_account_init_current(&dev->vm_account);
}
static void vhost_detach_mm(struct vhost_dev *dev)
@@ -569,6 +570,7 @@ static void vhost_detach_mm(struct vhost_dev *dev)
mmdrop(dev->mm);
dev->mm = NULL;
+ vm_account_release(&dev->vm_account);
}
/* Caller should have device mutex */
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index d910910..3a9aed8 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -144,6 +144,7 @@ struct vhost_msg_node {
struct vhost_dev {
struct mm_struct *mm;
struct mutex mutex;
+ struct vm_account vm_account;
struct vhost_virtqueue **vqs;
int nvqs;
struct eventfd_ctx *log_ctx;
--
git-series 0.9.1
^ permalink raw reply related [flat|nested] 108+ messages in thread
* Re: [RFC PATCH 02/19] drivers/vhost: Convert to use vm_account
2023-01-24 5:42 ` Alistair Popple
(?)
@ 2023-01-24 5:55 ` Michael S. Tsirkin
-1 siblings, 0 replies; 108+ messages in thread
From: Michael S. Tsirkin @ 2023-01-24 5:55 UTC (permalink / raw)
To: Alistair Popple
Cc: daniel, kvm, jhubbard, linux-kernel, virtualization, linux-mm,
netdev, mkoutny, jgg, hannes, cgroups, surenb, tjmercier
On Tue, Jan 24, 2023 at 04:42:31PM +1100, Alistair Popple wrote:
> diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
> index ec32f78..a31dd53 100644
> --- a/drivers/vhost/vdpa.c
> +++ b/drivers/vhost/vdpa.c
...
> @@ -780,6 +780,10 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, struct vhost_iotlb *iotlb,
> u32 asid = iotlb_to_asid(iotlb);
> int r = 0;
>
> + if (!vdpa->use_va)
> + if (vm_account_pinned(&dev->vm_account, PFN_DOWN(size)))
> + return -ENOMEM;
> +
> r = vhost_iotlb_add_range_ctx(iotlb, iova, iova + size - 1,
> pa, perm, opaque);
> if (r)
I suspect some error handling will have to be reworked then, no?
> --
> git-series 0.9.1
^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [RFC PATCH 02/19] drivers/vhost: Convert to use vm_account
@ 2023-01-24 5:55 ` Michael S. Tsirkin
0 siblings, 0 replies; 108+ messages in thread
From: Michael S. Tsirkin @ 2023-01-24 5:55 UTC (permalink / raw)
To: Alistair Popple
Cc: linux-mm, cgroups, linux-kernel, jgg, jhubbard, tjmercier, hannes,
surenb, mkoutny, daniel, Jason Wang, kvm, virtualization, netdev
On Tue, Jan 24, 2023 at 04:42:31PM +1100, Alistair Popple wrote:
> diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
> index ec32f78..a31dd53 100644
> --- a/drivers/vhost/vdpa.c
> +++ b/drivers/vhost/vdpa.c
...
> @@ -780,6 +780,10 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, struct vhost_iotlb *iotlb,
> u32 asid = iotlb_to_asid(iotlb);
> int r = 0;
>
> + if (!vdpa->use_va)
> + if (vm_account_pinned(&dev->vm_account, PFN_DOWN(size)))
> + return -ENOMEM;
> +
> r = vhost_iotlb_add_range_ctx(iotlb, iova, iova + size - 1,
> pa, perm, opaque);
> if (r)
I suspect some error handling will have to be reworked then, no?
> --
> git-series 0.9.1
^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [RFC PATCH 02/19] drivers/vhost: Convert to use vm_account
@ 2023-01-24 5:55 ` Michael S. Tsirkin
0 siblings, 0 replies; 108+ messages in thread
From: Michael S. Tsirkin @ 2023-01-24 5:55 UTC (permalink / raw)
To: Alistair Popple
Cc: daniel, kvm, jhubbard, linux-kernel, virtualization, linux-mm,
netdev, mkoutny, jgg, hannes, cgroups, surenb, tjmercier
On Tue, Jan 24, 2023 at 04:42:31PM +1100, Alistair Popple wrote:
> diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
> index ec32f78..a31dd53 100644
> --- a/drivers/vhost/vdpa.c
> +++ b/drivers/vhost/vdpa.c
...
> @@ -780,6 +780,10 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, struct vhost_iotlb *iotlb,
> u32 asid = iotlb_to_asid(iotlb);
> int r = 0;
>
> + if (!vdpa->use_va)
> + if (vm_account_pinned(&dev->vm_account, PFN_DOWN(size)))
> + return -ENOMEM;
> +
> r = vhost_iotlb_add_range_ctx(iotlb, iova, iova + size - 1,
> pa, perm, opaque);
> if (r)
I suspect some error handling will have to be reworked then, no?
> --
> git-series 0.9.1
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply [flat|nested] 108+ messages in thread
[parent not found: <20230124005356-mutt-send-email-mst-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>]
* Re: [RFC PATCH 02/19] drivers/vhost: Convert to use vm_account
2023-01-24 5:55 ` Michael S. Tsirkin
@ 2023-01-30 10:43 ` Alistair Popple
-1 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-30 10:43 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA, jgg-DDmLM1+adcrQT0dZR+AlfA,
jhubbard-DDmLM1+adcrQT0dZR+AlfA, tjmercier-hpIqsD4AKlfQT0dZR+AlfA,
hannes-druUgvl0LCNAfugRpC6u6w, surenb-hpIqsD4AKlfQT0dZR+AlfA,
mkoutny-IBi9RG/b67k, daniel-/w4YWyX8dFk, Jason Wang,
kvm-u79uwXL29TY76Z2rM5mHXA,
virtualization-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
netdev-u79uwXL29TY76Z2rM5mHXA
"Michael S. Tsirkin" <mst-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:
> On Tue, Jan 24, 2023 at 04:42:31PM +1100, Alistair Popple wrote:
>> diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
>> index ec32f78..a31dd53 100644
>> --- a/drivers/vhost/vdpa.c
>> +++ b/drivers/vhost/vdpa.c
>
> ...
>
>> @@ -780,6 +780,10 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, struct vhost_iotlb *iotlb,
>> u32 asid = iotlb_to_asid(iotlb);
>> int r = 0;
>>
>> + if (!vdpa->use_va)
>> + if (vm_account_pinned(&dev->vm_account, PFN_DOWN(size)))
>> + return -ENOMEM;
>> +
>> r = vhost_iotlb_add_range_ctx(iotlb, iova, iova + size - 1,
>> pa, perm, opaque);
>> if (r)
>
> I suspect some error handling will have to be reworked then, no?
Thanks. I had meant to go back and double check some of these driver
conversions. Will add something like below:
@@ -787,7 +787,7 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, struct vhost_iotlb *iotlb,
r = vhost_iotlb_add_range_ctx(iotlb, iova, iova + size - 1,
pa, perm, opaque);
if (r)
- return r;
+ goto out_unaccount;
if (ops->dma_map) {
r = ops->dma_map(vdpa, asid, iova, size, pa, perm, opaque);
@@ -798,12 +798,14 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, struct vhost_iotlb *iotlb,
r = iommu_map(v->domain, iova, pa, size,
perm_to_iommu_flags(perm));
}
- if (r) {
+ if (r)
vhost_iotlb_del_range(iotlb, iova, iova + size - 1);
- return r;
- }
- return 0;
+out_unaccount:
+ if (!vdpa->use_va)
+ vm_unaccount_pinned(&dev->vm_account, PFN_DOWN(size));
+
+ return r;
}
static void vhost_vdpa_unmap(struct vhost_vdpa *v,
>> --
>> git-series 0.9.1
^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [RFC PATCH 02/19] drivers/vhost: Convert to use vm_account
@ 2023-01-30 10:43 ` Alistair Popple
0 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-30 10:43 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: linux-mm, cgroups, linux-kernel, jgg, jhubbard, tjmercier, hannes,
surenb, mkoutny, daniel, Jason Wang, kvm, virtualization, netdev
"Michael S. Tsirkin" <mst@redhat.com> writes:
> On Tue, Jan 24, 2023 at 04:42:31PM +1100, Alistair Popple wrote:
>> diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
>> index ec32f78..a31dd53 100644
>> --- a/drivers/vhost/vdpa.c
>> +++ b/drivers/vhost/vdpa.c
>
> ...
>
>> @@ -780,6 +780,10 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, struct vhost_iotlb *iotlb,
>> u32 asid = iotlb_to_asid(iotlb);
>> int r = 0;
>>
>> + if (!vdpa->use_va)
>> + if (vm_account_pinned(&dev->vm_account, PFN_DOWN(size)))
>> + return -ENOMEM;
>> +
>> r = vhost_iotlb_add_range_ctx(iotlb, iova, iova + size - 1,
>> pa, perm, opaque);
>> if (r)
>
> I suspect some error handling will have to be reworked then, no?
Thanks. I had meant to go back and double check some of these driver
conversions. Will add something like below:
@@ -787,7 +787,7 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, struct vhost_iotlb *iotlb,
r = vhost_iotlb_add_range_ctx(iotlb, iova, iova + size - 1,
pa, perm, opaque);
if (r)
- return r;
+ goto out_unaccount;
if (ops->dma_map) {
r = ops->dma_map(vdpa, asid, iova, size, pa, perm, opaque);
@@ -798,12 +798,14 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, struct vhost_iotlb *iotlb,
r = iommu_map(v->domain, iova, pa, size,
perm_to_iommu_flags(perm));
}
- if (r) {
+ if (r)
vhost_iotlb_del_range(iotlb, iova, iova + size - 1);
- return r;
- }
- return 0;
+out_unaccount:
+ if (!vdpa->use_va)
+ vm_unaccount_pinned(&dev->vm_account, PFN_DOWN(size));
+
+ return r;
}
static void vhost_vdpa_unmap(struct vhost_vdpa *v,
>> --
>> git-series 0.9.1
^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [RFC PATCH 02/19] drivers/vhost: Convert to use vm_account
2023-01-24 5:42 ` Alistair Popple
(?)
(?)
@ 2023-01-24 14:34 ` Jason Gunthorpe
-1 siblings, 0 replies; 108+ messages in thread
From: Jason Gunthorpe @ 2023-01-24 14:34 UTC (permalink / raw)
To: Alistair Popple
Cc: linux-mm, cgroups, linux-kernel, jhubbard, tjmercier, hannes,
surenb, mkoutny, daniel, Michael S. Tsirkin, Jason Wang, kvm,
virtualization, netdev
On Tue, Jan 24, 2023 at 04:42:31PM +1100, Alistair Popple wrote:
> @@ -799,9 +803,6 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, struct vhost_iotlb *iotlb,
> return r;
> }
>
> - if (!vdpa->use_va)
> - atomic64_add(PFN_DOWN(size), &dev->mm->pinned_vm);
Mention in the commit message this fixes a "bug" where vhost didn't
respect the limits
Jason
^ permalink raw reply [flat|nested] 108+ messages in thread
* [RFC PATCH 03/19] drivers/vdpa: Convert vdpa to use the new vm_structure
2023-01-24 5:42 ` Alistair Popple
@ 2023-01-24 5:42 ` Alistair Popple
-1 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-24 5:42 UTC (permalink / raw)
To: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, jgg-DDmLM1+adcrQT0dZR+AlfA,
jhubbard-DDmLM1+adcrQT0dZR+AlfA, tjmercier-hpIqsD4AKlfQT0dZR+AlfA,
hannes-druUgvl0LCNAfugRpC6u6w, surenb-hpIqsD4AKlfQT0dZR+AlfA,
mkoutny-IBi9RG/b67k, daniel-/w4YWyX8dFk, Alistair Popple,
Michael S. Tsirkin, Jason Wang,
virtualization-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
Convert vdpa to use the new vm_account structure and associated
vm_account_pinned()/vm_unaccount_pinned() functions.
Signed-off-by: Alistair Popple <apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org>
Cc: "Michael S. Tsirkin" <mst-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Cc: Jason Wang <jasowang-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Cc: virtualization-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
---
drivers/vdpa/vdpa_user/vduse_dev.c | 20 +++++++++-----------
1 file changed, 9 insertions(+), 11 deletions(-)
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index 0c3b486..bd87b58 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -70,7 +70,7 @@ struct vduse_umem {
unsigned long iova;
unsigned long npages;
struct page **pages;
- struct mm_struct *mm;
+ struct vm_account vm_account;
};
struct vduse_dev {
@@ -950,8 +950,7 @@ static int vduse_dev_dereg_umem(struct vduse_dev *dev,
vduse_domain_remove_user_bounce_pages(dev->domain);
unpin_user_pages_dirty_lock(dev->umem->pages,
dev->umem->npages, true);
- atomic64_sub(dev->umem->npages, &dev->umem->mm->pinned_vm);
- mmdrop(dev->umem->mm);
+ vm_unaccount_pinned(&dev->umem->vm_account, dev->umem->npages);
vfree(dev->umem->pages);
kfree(dev->umem);
dev->umem = NULL;
@@ -967,7 +966,7 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev,
struct page **page_list = NULL;
struct vduse_umem *umem = NULL;
long pinned = 0;
- unsigned long npages, lock_limit;
+ unsigned long npages;
int ret;
if (!dev->domain->bounce_map ||
@@ -990,8 +989,8 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev,
mmap_read_lock(current->mm);
- lock_limit = PFN_DOWN(rlimit(RLIMIT_MEMLOCK));
- if (npages + atomic64_read(&current->mm->pinned_vm) > lock_limit)
+ vm_account_init_current(&umem->vm_account);
+ if (vm_account_pinned(&umem->vm_account, npages))
goto out;
pinned = pin_user_pages(uaddr, npages, FOLL_LONGTERM | FOLL_WRITE,
@@ -1006,22 +1005,21 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev,
if (ret)
goto out;
- atomic64_add(npages, &current->mm->pinned_vm);
-
umem->pages = page_list;
umem->npages = pinned;
umem->iova = iova;
- umem->mm = current->mm;
- mmgrab(current->mm);
dev->umem = umem;
out:
- if (ret && pinned > 0)
+ if (ret && pinned > 0) {
unpin_user_pages(page_list, pinned);
+ vm_unaccount_pinned(&umem->vm_account, npages);
+ }
mmap_read_unlock(current->mm);
unlock:
if (ret) {
+ vm_account_release(&umem->vm_account);
vfree(page_list);
kfree(umem);
}
--
git-series 0.9.1
^ permalink raw reply related [flat|nested] 108+ messages in thread
* [RFC PATCH 03/19] drivers/vdpa: Convert vdpa to use the new vm_structure
@ 2023-01-24 5:42 ` Alistair Popple
0 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-24 5:42 UTC (permalink / raw)
To: linux-mm, cgroups
Cc: linux-kernel, jgg, jhubbard, tjmercier, hannes, surenb, mkoutny,
daniel, Alistair Popple, Michael S. Tsirkin, Jason Wang,
virtualization
Convert vdpa to use the new vm_account structure and associated
vm_account_pinned()/vm_unaccount_pinned() functions.
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: virtualization@lists.linux-foundation.org
Cc: linux-kernel@vger.kernel.org
---
drivers/vdpa/vdpa_user/vduse_dev.c | 20 +++++++++-----------
1 file changed, 9 insertions(+), 11 deletions(-)
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index 0c3b486..bd87b58 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -70,7 +70,7 @@ struct vduse_umem {
unsigned long iova;
unsigned long npages;
struct page **pages;
- struct mm_struct *mm;
+ struct vm_account vm_account;
};
struct vduse_dev {
@@ -950,8 +950,7 @@ static int vduse_dev_dereg_umem(struct vduse_dev *dev,
vduse_domain_remove_user_bounce_pages(dev->domain);
unpin_user_pages_dirty_lock(dev->umem->pages,
dev->umem->npages, true);
- atomic64_sub(dev->umem->npages, &dev->umem->mm->pinned_vm);
- mmdrop(dev->umem->mm);
+ vm_unaccount_pinned(&dev->umem->vm_account, dev->umem->npages);
vfree(dev->umem->pages);
kfree(dev->umem);
dev->umem = NULL;
@@ -967,7 +966,7 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev,
struct page **page_list = NULL;
struct vduse_umem *umem = NULL;
long pinned = 0;
- unsigned long npages, lock_limit;
+ unsigned long npages;
int ret;
if (!dev->domain->bounce_map ||
@@ -990,8 +989,8 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev,
mmap_read_lock(current->mm);
- lock_limit = PFN_DOWN(rlimit(RLIMIT_MEMLOCK));
- if (npages + atomic64_read(&current->mm->pinned_vm) > lock_limit)
+ vm_account_init_current(&umem->vm_account);
+ if (vm_account_pinned(&umem->vm_account, npages))
goto out;
pinned = pin_user_pages(uaddr, npages, FOLL_LONGTERM | FOLL_WRITE,
@@ -1006,22 +1005,21 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev,
if (ret)
goto out;
- atomic64_add(npages, &current->mm->pinned_vm);
-
umem->pages = page_list;
umem->npages = pinned;
umem->iova = iova;
- umem->mm = current->mm;
- mmgrab(current->mm);
dev->umem = umem;
out:
- if (ret && pinned > 0)
+ if (ret && pinned > 0) {
unpin_user_pages(page_list, pinned);
+ vm_unaccount_pinned(&umem->vm_account, npages);
+ }
mmap_read_unlock(current->mm);
unlock:
if (ret) {
+ vm_account_release(&umem->vm_account);
vfree(page_list);
kfree(umem);
}
--
git-series 0.9.1
^ permalink raw reply related [flat|nested] 108+ messages in thread
[parent not found: <f2e4696380f6678527a14e885556cb1bbd314737.1674538665.git-series.apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org>]
* Re: [RFC PATCH 03/19] drivers/vdpa: Convert vdpa to use the new vm_structure
2023-01-24 5:42 ` Alistair Popple
@ 2023-01-24 14:35 ` Jason Gunthorpe
-1 siblings, 0 replies; 108+ messages in thread
From: Jason Gunthorpe @ 2023-01-24 14:35 UTC (permalink / raw)
To: Alistair Popple
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
jhubbard-DDmLM1+adcrQT0dZR+AlfA, tjmercier-hpIqsD4AKlfQT0dZR+AlfA,
hannes-druUgvl0LCNAfugRpC6u6w, surenb-hpIqsD4AKlfQT0dZR+AlfA,
mkoutny-IBi9RG/b67k, daniel-/w4YWyX8dFk, Michael S. Tsirkin,
Jason Wang,
virtualization-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
On Tue, Jan 24, 2023 at 04:42:32PM +1100, Alistair Popple wrote:
> @@ -990,8 +989,8 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev,
>
> mmap_read_lock(current->mm);
>
> - lock_limit = PFN_DOWN(rlimit(RLIMIT_MEMLOCK));
> - if (npages + atomic64_read(&current->mm->pinned_vm) > lock_limit)
> + vm_account_init_current(&umem->vm_account);
> + if (vm_account_pinned(&umem->vm_account, npages))
> goto out;
>
> pinned = pin_user_pages(uaddr, npages, FOLL_LONGTERM | FOLL_WRITE,
> @@ -1006,22 +1005,21 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev,
> if (ret)
> goto out;
>
> - atomic64_add(npages, &current->mm->pinned_vm);
Mention in the commit message that this fixes a bug where vdpa would
race the update of mm->pinned_vm and might go past the limit.
Jason
^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [RFC PATCH 03/19] drivers/vdpa: Convert vdpa to use the new vm_structure
@ 2023-01-24 14:35 ` Jason Gunthorpe
0 siblings, 0 replies; 108+ messages in thread
From: Jason Gunthorpe @ 2023-01-24 14:35 UTC (permalink / raw)
To: Alistair Popple
Cc: linux-mm, cgroups, linux-kernel, jhubbard, tjmercier, hannes,
surenb, mkoutny, daniel, Michael S. Tsirkin, Jason Wang,
virtualization
On Tue, Jan 24, 2023 at 04:42:32PM +1100, Alistair Popple wrote:
> @@ -990,8 +989,8 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev,
>
> mmap_read_lock(current->mm);
>
> - lock_limit = PFN_DOWN(rlimit(RLIMIT_MEMLOCK));
> - if (npages + atomic64_read(&current->mm->pinned_vm) > lock_limit)
> + vm_account_init_current(&umem->vm_account);
> + if (vm_account_pinned(&umem->vm_account, npages))
> goto out;
>
> pinned = pin_user_pages(uaddr, npages, FOLL_LONGTERM | FOLL_WRITE,
> @@ -1006,22 +1005,21 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev,
> if (ret)
> goto out;
>
> - atomic64_add(npages, &current->mm->pinned_vm);
Mention in the commit message that this fixes a bug where vdpa would
race the update of mm->pinned_vm and might go past the limit.
Jason
^ permalink raw reply [flat|nested] 108+ messages in thread
* [RFC PATCH 04/19] infiniband/umem: Convert to use vm_account
2023-01-24 5:42 ` Alistair Popple
@ 2023-01-24 5:42 ` Alistair Popple
-1 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-24 5:42 UTC (permalink / raw)
To: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, jgg-DDmLM1+adcrQT0dZR+AlfA,
jhubbard-DDmLM1+adcrQT0dZR+AlfA, tjmercier-hpIqsD4AKlfQT0dZR+AlfA,
hannes-druUgvl0LCNAfugRpC6u6w, surenb-hpIqsD4AKlfQT0dZR+AlfA,
mkoutny-IBi9RG/b67k, daniel-/w4YWyX8dFk, Alistair Popple,
Jason Gunthorpe, Leon Romanovsky,
linux-rdma-u79uwXL29TY76Z2rM5mHXA
Converts the infiniband core umem code to use the vm_account structure
so that pinned pages can be charged to the correct cgroup with
vm_account_pinned().
Signed-off-by: Alistair Popple <apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org>
Cc: Jason Gunthorpe <jgg-uk2M96/98Pc@public.gmane.org>
Cc: Leon Romanovsky <leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
---
drivers/infiniband/core/umem.c | 16 ++++++----------
drivers/infiniband/core/umem_odp.c | 6 ++++++
include/rdma/ib_umem.h | 1 +
3 files changed, 13 insertions(+), 10 deletions(-)
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 755a9c5..479b7f0 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -149,8 +149,6 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
{
struct ib_umem *umem;
struct page **page_list;
- unsigned long lock_limit;
- unsigned long new_pinned;
unsigned long cur_base;
unsigned long dma_attr = 0;
struct mm_struct *mm;
@@ -186,6 +184,7 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
umem->writable = ib_access_writable(access);
umem->owning_mm = mm = current->mm;
mmgrab(mm);
+ vm_account_init_current(&umem->vm_account);
page_list = (struct page **) __get_free_page(GFP_KERNEL);
if (!page_list) {
@@ -199,11 +198,7 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
goto out;
}
- lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
- new_pinned = atomic64_add_return(npages, &mm->pinned_vm);
- if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
- atomic64_sub(npages, &mm->pinned_vm);
+ if (vm_account_pinned(&umem->vm_account, npages)) {
ret = -ENOMEM;
goto out;
}
@@ -248,12 +243,13 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
umem_release:
__ib_umem_release(device, umem, 0);
- atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
+ vm_unaccount_pinned(&umem->vm_account, ib_umem_num_pages(umem));
out:
free_page((unsigned long) page_list);
umem_kfree:
if (ret) {
mmdrop(umem->owning_mm);
+ vm_account_release(&umem->vm_account);
kfree(umem);
}
return ret ? ERR_PTR(ret) : umem;
@@ -275,8 +271,8 @@ void ib_umem_release(struct ib_umem *umem)
__ib_umem_release(umem->ibdev, umem, 1);
- atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
- mmdrop(umem->owning_mm);
+ vm_unaccount_pinned(&umem->vm_account, ib_umem_num_pages(umem));
+ vm_account_release(&umem->vm_account);
kfree(umem);
}
EXPORT_SYMBOL(ib_umem_release);
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index e9fa22d..4fbca3e 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -130,6 +130,7 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
umem->ibdev = device;
umem->writable = ib_access_writable(access);
umem->owning_mm = current->mm;
+ vm_account_init_current(&umem->vm_account);
umem_odp->is_implicit_odp = 1;
umem_odp->page_shift = PAGE_SHIFT;
@@ -137,6 +138,7 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
ret = ib_init_umem_odp(umem_odp, NULL);
if (ret) {
put_pid(umem_odp->tgid);
+ vm_account_release(&umem->vm_account);
kfree(umem_odp);
return ERR_PTR(ret);
}
@@ -179,6 +181,7 @@ ib_umem_odp_alloc_child(struct ib_umem_odp *root, unsigned long addr,
umem->address = addr;
umem->writable = root->umem.writable;
umem->owning_mm = root->umem.owning_mm;
+ umem->vm_account = root->umem.vm_account;
odp_data->page_shift = PAGE_SHIFT;
odp_data->notifier.ops = ops;
@@ -239,6 +242,7 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device,
umem_odp->umem.address = addr;
umem_odp->umem.writable = ib_access_writable(access);
umem_odp->umem.owning_mm = current->mm;
+ vm_account_init_current(&umem_odp->umem.vm_account);
umem_odp->notifier.ops = ops;
umem_odp->page_shift = PAGE_SHIFT;
@@ -255,6 +259,7 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device,
err_put_pid:
put_pid(umem_odp->tgid);
+ vm_account_release(&umem_odp->umem.vm_account);
kfree(umem_odp);
return ERR_PTR(ret);
}
@@ -278,6 +283,7 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
kvfree(umem_odp->pfn_list);
}
put_pid(umem_odp->tgid);
+ vm_account_release(&umem_odp->umem.vm_account);
kfree(umem_odp);
}
EXPORT_SYMBOL(ib_umem_odp_release);
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 92a673c..de51406 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -19,6 +19,7 @@ struct dma_buf_attach_ops;
struct ib_umem {
struct ib_device *ibdev;
struct mm_struct *owning_mm;
+ struct vm_account vm_account;
u64 iova;
size_t length;
unsigned long address;
--
git-series 0.9.1
^ permalink raw reply related [flat|nested] 108+ messages in thread
* [RFC PATCH 04/19] infiniband/umem: Convert to use vm_account
@ 2023-01-24 5:42 ` Alistair Popple
0 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-24 5:42 UTC (permalink / raw)
To: linux-mm, cgroups
Cc: linux-kernel, jgg, jhubbard, tjmercier, hannes, surenb, mkoutny,
daniel, Alistair Popple, Jason Gunthorpe, Leon Romanovsky,
linux-rdma
Converts the infiniband core umem code to use the vm_account structure
so that pinned pages can be charged to the correct cgroup with
vm_account_pinned().
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: linux-rdma@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
drivers/infiniband/core/umem.c | 16 ++++++----------
drivers/infiniband/core/umem_odp.c | 6 ++++++
include/rdma/ib_umem.h | 1 +
3 files changed, 13 insertions(+), 10 deletions(-)
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 755a9c5..479b7f0 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -149,8 +149,6 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
{
struct ib_umem *umem;
struct page **page_list;
- unsigned long lock_limit;
- unsigned long new_pinned;
unsigned long cur_base;
unsigned long dma_attr = 0;
struct mm_struct *mm;
@@ -186,6 +184,7 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
umem->writable = ib_access_writable(access);
umem->owning_mm = mm = current->mm;
mmgrab(mm);
+ vm_account_init_current(&umem->vm_account);
page_list = (struct page **) __get_free_page(GFP_KERNEL);
if (!page_list) {
@@ -199,11 +198,7 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
goto out;
}
- lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
- new_pinned = atomic64_add_return(npages, &mm->pinned_vm);
- if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
- atomic64_sub(npages, &mm->pinned_vm);
+ if (vm_account_pinned(&umem->vm_account, npages)) {
ret = -ENOMEM;
goto out;
}
@@ -248,12 +243,13 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
umem_release:
__ib_umem_release(device, umem, 0);
- atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
+ vm_unaccount_pinned(&umem->vm_account, ib_umem_num_pages(umem));
out:
free_page((unsigned long) page_list);
umem_kfree:
if (ret) {
mmdrop(umem->owning_mm);
+ vm_account_release(&umem->vm_account);
kfree(umem);
}
return ret ? ERR_PTR(ret) : umem;
@@ -275,8 +271,8 @@ void ib_umem_release(struct ib_umem *umem)
__ib_umem_release(umem->ibdev, umem, 1);
- atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
- mmdrop(umem->owning_mm);
+ vm_unaccount_pinned(&umem->vm_account, ib_umem_num_pages(umem));
+ vm_account_release(&umem->vm_account);
kfree(umem);
}
EXPORT_SYMBOL(ib_umem_release);
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index e9fa22d..4fbca3e 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -130,6 +130,7 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
umem->ibdev = device;
umem->writable = ib_access_writable(access);
umem->owning_mm = current->mm;
+ vm_account_init_current(&umem->vm_account);
umem_odp->is_implicit_odp = 1;
umem_odp->page_shift = PAGE_SHIFT;
@@ -137,6 +138,7 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
ret = ib_init_umem_odp(umem_odp, NULL);
if (ret) {
put_pid(umem_odp->tgid);
+ vm_account_release(&umem->vm_account);
kfree(umem_odp);
return ERR_PTR(ret);
}
@@ -179,6 +181,7 @@ ib_umem_odp_alloc_child(struct ib_umem_odp *root, unsigned long addr,
umem->address = addr;
umem->writable = root->umem.writable;
umem->owning_mm = root->umem.owning_mm;
+ umem->vm_account = root->umem.vm_account;
odp_data->page_shift = PAGE_SHIFT;
odp_data->notifier.ops = ops;
@@ -239,6 +242,7 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device,
umem_odp->umem.address = addr;
umem_odp->umem.writable = ib_access_writable(access);
umem_odp->umem.owning_mm = current->mm;
+ vm_account_init_current(&umem_odp->umem.vm_account);
umem_odp->notifier.ops = ops;
umem_odp->page_shift = PAGE_SHIFT;
@@ -255,6 +259,7 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device,
err_put_pid:
put_pid(umem_odp->tgid);
+ vm_account_release(&umem_odp->umem.vm_account);
kfree(umem_odp);
return ERR_PTR(ret);
}
@@ -278,6 +283,7 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
kvfree(umem_odp->pfn_list);
}
put_pid(umem_odp->tgid);
+ vm_account_release(&umem_odp->umem.vm_account);
kfree(umem_odp);
}
EXPORT_SYMBOL(ib_umem_odp_release);
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 92a673c..de51406 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -19,6 +19,7 @@ struct dma_buf_attach_ops;
struct ib_umem {
struct ib_device *ibdev;
struct mm_struct *owning_mm;
+ struct vm_account vm_account;
u64 iova;
size_t length;
unsigned long address;
--
git-series 0.9.1
^ permalink raw reply related [flat|nested] 108+ messages in thread
* [RFC PATCH 05/19] RDMA/siw: Convert to use vm_account
2023-01-24 5:42 ` Alistair Popple
@ 2023-01-24 5:42 ` Alistair Popple
-1 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-24 5:42 UTC (permalink / raw)
To: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, jgg-DDmLM1+adcrQT0dZR+AlfA,
jhubbard-DDmLM1+adcrQT0dZR+AlfA, tjmercier-hpIqsD4AKlfQT0dZR+AlfA,
hannes-druUgvl0LCNAfugRpC6u6w, surenb-hpIqsD4AKlfQT0dZR+AlfA,
mkoutny-IBi9RG/b67k, daniel-/w4YWyX8dFk, Alistair Popple,
Bernard Metzler, Jason Gunthorpe, Leon Romanovsky,
linux-rdma-u79uwXL29TY76Z2rM5mHXA
Convert to using a vm_account structure to account pinned memory to
both the mm and the pins cgroup.
Signed-off-by: Alistair Popple <apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org>
Cc: Bernard Metzler <bmt-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>
Cc: Jason Gunthorpe <jgg-uk2M96/98Pc@public.gmane.org>
Cc: Leon Romanovsky <leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
---
drivers/infiniband/sw/siw/siw.h | 2 +-
drivers/infiniband/sw/siw/siw_mem.c | 20 ++++++--------------
drivers/infiniband/sw/siw/siw_verbs.c | 15 ---------------
3 files changed, 7 insertions(+), 30 deletions(-)
diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h
index 2f3a9cd..0c4a3ec 100644
--- a/drivers/infiniband/sw/siw/siw.h
+++ b/drivers/infiniband/sw/siw/siw.h
@@ -124,7 +124,7 @@ struct siw_umem {
int num_pages;
bool writable;
u64 fp_addr; /* First page base address */
- struct mm_struct *owning_mm;
+ struct vm_account vm_account;
};
struct siw_pble {
diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c
index b2b33dd..9c53fc3 100644
--- a/drivers/infiniband/sw/siw/siw_mem.c
+++ b/drivers/infiniband/sw/siw/siw_mem.c
@@ -68,7 +68,6 @@ static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages,
void siw_umem_release(struct siw_umem *umem, bool dirty)
{
- struct mm_struct *mm_s = umem->owning_mm;
int i, num_pages = umem->num_pages;
for (i = 0; num_pages; i++) {
@@ -79,9 +78,9 @@ void siw_umem_release(struct siw_umem *umem, bool dirty)
kfree(umem->page_chunk[i].plist);
num_pages -= to_free;
}
- atomic64_sub(umem->num_pages, &mm_s->pinned_vm);
+ vm_unaccount_pinned(&umem->vm_account, umem->num_pages);
+ vm_account_release(&umem->vm_account);
- mmdrop(mm_s);
kfree(umem->page_chunk);
kfree(umem);
}
@@ -365,9 +364,7 @@ struct siw_pbl *siw_pbl_alloc(u32 num_buf)
struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
{
struct siw_umem *umem;
- struct mm_struct *mm_s;
u64 first_page_va;
- unsigned long mlock_limit;
unsigned int foll_flags = FOLL_LONGTERM;
int num_pages, num_chunks, i, rv = 0;
@@ -385,20 +382,16 @@ struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
if (!umem)
return ERR_PTR(-ENOMEM);
- mm_s = current->mm;
- umem->owning_mm = mm_s;
umem->writable = writable;
- mmgrab(mm_s);
+ vm_account_init_current(&umem->vm_account);
if (writable)
foll_flags |= FOLL_WRITE;
- mmap_read_lock(mm_s);
+ mmap_read_lock(current->mm);
- mlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
- if (num_pages + atomic64_read(&mm_s->pinned_vm) > mlock_limit) {
+ if (vm_account_pinned(&umem->vm_account, num_pages)) {
rv = -ENOMEM;
goto out_sem_up;
}
@@ -429,7 +422,6 @@ struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
goto out_sem_up;
umem->num_pages += rv;
- atomic64_add(rv, &mm_s->pinned_vm);
first_page_va += rv * PAGE_SIZE;
nents -= rv;
got += rv;
@@ -437,7 +429,7 @@ struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
num_pages -= got;
}
out_sem_up:
- mmap_read_unlock(mm_s);
+ mmap_read_unlock(current->mm);
if (rv > 0)
return umem;
diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
index 906fde1..8fab009 100644
--- a/drivers/infiniband/sw/siw/siw_verbs.c
+++ b/drivers/infiniband/sw/siw/siw_verbs.c
@@ -1321,8 +1321,6 @@ struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
struct siw_umem *umem = NULL;
struct siw_ureq_reg_mr ureq;
struct siw_device *sdev = to_siw_dev(pd->device);
-
- unsigned long mem_limit = rlimit(RLIMIT_MEMLOCK);
int rv;
siw_dbg_pd(pd, "start: 0x%pK, va: 0x%pK, len: %llu\n",
@@ -1338,19 +1336,6 @@ struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
rv = -EINVAL;
goto err_out;
}
- if (mem_limit != RLIM_INFINITY) {
- unsigned long num_pages =
- (PAGE_ALIGN(len + (start & ~PAGE_MASK))) >> PAGE_SHIFT;
- mem_limit >>= PAGE_SHIFT;
-
- if (num_pages > mem_limit - current->mm->locked_vm) {
- siw_dbg_pd(pd, "pages req %lu, max %lu, lock %lu\n",
- num_pages, mem_limit,
- current->mm->locked_vm);
- rv = -ENOMEM;
- goto err_out;
- }
- }
umem = siw_umem_get(start, len, ib_access_writable(rights));
if (IS_ERR(umem)) {
rv = PTR_ERR(umem);
--
git-series 0.9.1
^ permalink raw reply related [flat|nested] 108+ messages in thread
* [RFC PATCH 05/19] RDMA/siw: Convert to use vm_account
@ 2023-01-24 5:42 ` Alistair Popple
0 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-24 5:42 UTC (permalink / raw)
To: linux-mm, cgroups
Cc: linux-kernel, jgg, jhubbard, tjmercier, hannes, surenb, mkoutny,
daniel, Alistair Popple, Bernard Metzler, Jason Gunthorpe,
Leon Romanovsky, linux-rdma
Convert to using a vm_account structure to account pinned memory to
both the mm and the pins cgroup.
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Cc: Bernard Metzler <bmt@zurich.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: linux-rdma@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
drivers/infiniband/sw/siw/siw.h | 2 +-
drivers/infiniband/sw/siw/siw_mem.c | 20 ++++++--------------
drivers/infiniband/sw/siw/siw_verbs.c | 15 ---------------
3 files changed, 7 insertions(+), 30 deletions(-)
diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h
index 2f3a9cd..0c4a3ec 100644
--- a/drivers/infiniband/sw/siw/siw.h
+++ b/drivers/infiniband/sw/siw/siw.h
@@ -124,7 +124,7 @@ struct siw_umem {
int num_pages;
bool writable;
u64 fp_addr; /* First page base address */
- struct mm_struct *owning_mm;
+ struct vm_account vm_account;
};
struct siw_pble {
diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c
index b2b33dd..9c53fc3 100644
--- a/drivers/infiniband/sw/siw/siw_mem.c
+++ b/drivers/infiniband/sw/siw/siw_mem.c
@@ -68,7 +68,6 @@ static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages,
void siw_umem_release(struct siw_umem *umem, bool dirty)
{
- struct mm_struct *mm_s = umem->owning_mm;
int i, num_pages = umem->num_pages;
for (i = 0; num_pages; i++) {
@@ -79,9 +78,9 @@ void siw_umem_release(struct siw_umem *umem, bool dirty)
kfree(umem->page_chunk[i].plist);
num_pages -= to_free;
}
- atomic64_sub(umem->num_pages, &mm_s->pinned_vm);
+ vm_unaccount_pinned(&umem->vm_account, umem->num_pages);
+ vm_account_release(&umem->vm_account);
- mmdrop(mm_s);
kfree(umem->page_chunk);
kfree(umem);
}
@@ -365,9 +364,7 @@ struct siw_pbl *siw_pbl_alloc(u32 num_buf)
struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
{
struct siw_umem *umem;
- struct mm_struct *mm_s;
u64 first_page_va;
- unsigned long mlock_limit;
unsigned int foll_flags = FOLL_LONGTERM;
int num_pages, num_chunks, i, rv = 0;
@@ -385,20 +382,16 @@ struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
if (!umem)
return ERR_PTR(-ENOMEM);
- mm_s = current->mm;
- umem->owning_mm = mm_s;
umem->writable = writable;
- mmgrab(mm_s);
+ vm_account_init_current(&umem->vm_account);
if (writable)
foll_flags |= FOLL_WRITE;
- mmap_read_lock(mm_s);
+ mmap_read_lock(current->mm);
- mlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
- if (num_pages + atomic64_read(&mm_s->pinned_vm) > mlock_limit) {
+ if (vm_account_pinned(&umem->vm_account, num_pages)) {
rv = -ENOMEM;
goto out_sem_up;
}
@@ -429,7 +422,6 @@ struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
goto out_sem_up;
umem->num_pages += rv;
- atomic64_add(rv, &mm_s->pinned_vm);
first_page_va += rv * PAGE_SIZE;
nents -= rv;
got += rv;
@@ -437,7 +429,7 @@ struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
num_pages -= got;
}
out_sem_up:
- mmap_read_unlock(mm_s);
+ mmap_read_unlock(current->mm);
if (rv > 0)
return umem;
diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
index 906fde1..8fab009 100644
--- a/drivers/infiniband/sw/siw/siw_verbs.c
+++ b/drivers/infiniband/sw/siw/siw_verbs.c
@@ -1321,8 +1321,6 @@ struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
struct siw_umem *umem = NULL;
struct siw_ureq_reg_mr ureq;
struct siw_device *sdev = to_siw_dev(pd->device);
-
- unsigned long mem_limit = rlimit(RLIMIT_MEMLOCK);
int rv;
siw_dbg_pd(pd, "start: 0x%pK, va: 0x%pK, len: %llu\n",
@@ -1338,19 +1336,6 @@ struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
rv = -EINVAL;
goto err_out;
}
- if (mem_limit != RLIM_INFINITY) {
- unsigned long num_pages =
- (PAGE_ALIGN(len + (start & ~PAGE_MASK))) >> PAGE_SHIFT;
- mem_limit >>= PAGE_SHIFT;
-
- if (num_pages > mem_limit - current->mm->locked_vm) {
- siw_dbg_pd(pd, "pages req %lu, max %lu, lock %lu\n",
- num_pages, mem_limit,
- current->mm->locked_vm);
- rv = -ENOMEM;
- goto err_out;
- }
- }
umem = siw_umem_get(start, len, ib_access_writable(rights));
if (IS_ERR(umem)) {
rv = PTR_ERR(umem);
--
git-series 0.9.1
^ permalink raw reply related [flat|nested] 108+ messages in thread
* Re: [RFC PATCH 05/19] RDMA/siw: Convert to use vm_account
2023-01-24 5:42 ` Alistair Popple
(?)
@ 2023-01-24 14:37 ` Jason Gunthorpe
[not found] ` <Y8/tGIeg5mI9bDOa-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org>
-1 siblings, 1 reply; 108+ messages in thread
From: Jason Gunthorpe @ 2023-01-24 14:37 UTC (permalink / raw)
To: Alistair Popple
Cc: linux-mm, cgroups, linux-kernel, jhubbard, tjmercier, hannes,
surenb, mkoutny, daniel, Bernard Metzler, Leon Romanovsky,
linux-rdma
On Tue, Jan 24, 2023 at 04:42:34PM +1100, Alistair Popple wrote:
> @@ -385,20 +382,16 @@ struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
> if (!umem)
> return ERR_PTR(-ENOMEM);
>
> - mm_s = current->mm;
> - umem->owning_mm = mm_s;
> umem->writable = writable;
>
> - mmgrab(mm_s);
> + vm_account_init_current(&umem->vm_account);
>
> if (writable)
> foll_flags |= FOLL_WRITE;
>
> - mmap_read_lock(mm_s);
> + mmap_read_lock(current->mm);
>
> - mlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> -
> - if (num_pages + atomic64_read(&mm_s->pinned_vm) > mlock_limit) {
> + if (vm_account_pinned(&umem->vm_account, num_pages)) {
> rv = -ENOMEM;
> goto out_sem_up;
> }
> @@ -429,7 +422,6 @@ struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
> goto out_sem_up;
>
> umem->num_pages += rv;
> - atomic64_add(rv, &mm_s->pinned_vm);
Also fixes the race bug
Jason
^ permalink raw reply [flat|nested] 108+ messages in thread
* [RFC PATCH 06/19] RDMA/usnic: convert to use vm_account
2023-01-24 5:42 ` Alistair Popple
@ 2023-01-24 5:42 ` Alistair Popple
-1 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-24 5:42 UTC (permalink / raw)
To: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, jgg-DDmLM1+adcrQT0dZR+AlfA,
jhubbard-DDmLM1+adcrQT0dZR+AlfA, tjmercier-hpIqsD4AKlfQT0dZR+AlfA,
hannes-druUgvl0LCNAfugRpC6u6w, surenb-hpIqsD4AKlfQT0dZR+AlfA,
mkoutny-IBi9RG/b67k, daniel-/w4YWyX8dFk, Alistair Popple,
Christian Benvenuti, Nelson Escobar, Jason Gunthorpe,
Leon Romanovsky, linux-rdma-u79uwXL29TY76Z2rM5mHXA
Convert to using a vm_account structure to account pinned memory to
both the mm and the pins cgroup.
Signed-off-by: Alistair Popple <apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org>
Cc: Christian Benvenuti <benve-FYB4Gu1CFyUAvxtiuMwx3w@public.gmane.org>
Cc: Nelson Escobar <neescoba-FYB4Gu1CFyUAvxtiuMwx3w@public.gmane.org>
Cc: Jason Gunthorpe <jgg-uk2M96/98Pc@public.gmane.org>
Cc: Leon Romanovsky <leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
---
drivers/infiniband/hw/usnic/usnic_uiom.c | 13 +++++--------
drivers/infiniband/hw/usnic/usnic_uiom.h | 1 +
2 files changed, 6 insertions(+), 8 deletions(-)
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
index c301b3b..250276e 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -89,8 +89,6 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
struct page **page_list;
struct scatterlist *sg;
struct usnic_uiom_chunk *chunk;
- unsigned long locked;
- unsigned long lock_limit;
unsigned long cur_base;
unsigned long npages;
int ret;
@@ -123,10 +121,8 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
uiomr->owning_mm = mm = current->mm;
mmap_read_lock(mm);
- locked = atomic64_add_return(npages, &current->mm->pinned_vm);
- lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
- if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+ vm_account_init_current(&uiomr->vm_account);
+ if (vm_account_pinned(&uiomr->vm_account, npages)) {
ret = -ENOMEM;
goto out;
}
@@ -178,7 +174,8 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
out:
if (ret < 0) {
usnic_uiom_put_pages(chunk_list, 0);
- atomic64_sub(npages, &current->mm->pinned_vm);
+ vm_unaccount_pinned(&uiomr->vm_account, npages);
+ vm_account_release(&uiomr->vm_account);
} else
mmgrab(uiomr->owning_mm);
@@ -430,7 +427,7 @@ void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr)
{
__usnic_uiom_reg_release(uiomr->pd, uiomr, 1);
- atomic64_sub(usnic_uiom_num_pages(uiomr), &uiomr->owning_mm->pinned_vm);
+ vm_unaccount_pinned(&uiomr->vm_account, usnic_uiom_num_pages(uiomr));
__usnic_uiom_release_tail(uiomr);
}
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.h b/drivers/infiniband/hw/usnic/usnic_uiom.h
index 5a9acf9..5c296a7 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.h
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.h
@@ -72,6 +72,7 @@ struct usnic_uiom_reg {
struct list_head chunk_list;
struct work_struct work;
struct mm_struct *owning_mm;
+ struct vm_account vm_account;
};
struct usnic_uiom_chunk {
--
git-series 0.9.1
^ permalink raw reply related [flat|nested] 108+ messages in thread
* [RFC PATCH 06/19] RDMA/usnic: convert to use vm_account
@ 2023-01-24 5:42 ` Alistair Popple
0 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-24 5:42 UTC (permalink / raw)
To: linux-mm, cgroups
Cc: linux-kernel, jgg, jhubbard, tjmercier, hannes, surenb, mkoutny,
daniel, Alistair Popple, Christian Benvenuti, Nelson Escobar,
Jason Gunthorpe, Leon Romanovsky, linux-rdma
Convert to using a vm_account structure to account pinned memory to
both the mm and the pins cgroup.
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Cc: Christian Benvenuti <benve@cisco.com>
Cc: Nelson Escobar <neescoba@cisco.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: linux-rdma@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
drivers/infiniband/hw/usnic/usnic_uiom.c | 13 +++++--------
drivers/infiniband/hw/usnic/usnic_uiom.h | 1 +
2 files changed, 6 insertions(+), 8 deletions(-)
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
index c301b3b..250276e 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -89,8 +89,6 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
struct page **page_list;
struct scatterlist *sg;
struct usnic_uiom_chunk *chunk;
- unsigned long locked;
- unsigned long lock_limit;
unsigned long cur_base;
unsigned long npages;
int ret;
@@ -123,10 +121,8 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
uiomr->owning_mm = mm = current->mm;
mmap_read_lock(mm);
- locked = atomic64_add_return(npages, &current->mm->pinned_vm);
- lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
- if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+ vm_account_init_current(&uiomr->vm_account);
+ if (vm_account_pinned(&uiomr->vm_account, npages)) {
ret = -ENOMEM;
goto out;
}
@@ -178,7 +174,8 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
out:
if (ret < 0) {
usnic_uiom_put_pages(chunk_list, 0);
- atomic64_sub(npages, &current->mm->pinned_vm);
+ vm_unaccount_pinned(&uiomr->vm_account, npages);
+ vm_account_release(&uiomr->vm_account);
} else
mmgrab(uiomr->owning_mm);
@@ -430,7 +427,7 @@ void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr)
{
__usnic_uiom_reg_release(uiomr->pd, uiomr, 1);
- atomic64_sub(usnic_uiom_num_pages(uiomr), &uiomr->owning_mm->pinned_vm);
+ vm_unaccount_pinned(&uiomr->vm_account, usnic_uiom_num_pages(uiomr));
__usnic_uiom_release_tail(uiomr);
}
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.h b/drivers/infiniband/hw/usnic/usnic_uiom.h
index 5a9acf9..5c296a7 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.h
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.h
@@ -72,6 +72,7 @@ struct usnic_uiom_reg {
struct list_head chunk_list;
struct work_struct work;
struct mm_struct *owning_mm;
+ struct vm_account vm_account;
};
struct usnic_uiom_chunk {
--
git-series 0.9.1
^ permalink raw reply related [flat|nested] 108+ messages in thread
[parent not found: <03ed2d166826cd7055810c66a175e20fa2153c52.1674538665.git-series.apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org>]
* Re: [RFC PATCH 06/19] RDMA/usnic: convert to use vm_account
2023-01-24 5:42 ` Alistair Popple
@ 2023-01-24 14:41 ` Jason Gunthorpe
-1 siblings, 0 replies; 108+ messages in thread
From: Jason Gunthorpe @ 2023-01-24 14:41 UTC (permalink / raw)
To: Alistair Popple
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
jhubbard-DDmLM1+adcrQT0dZR+AlfA, tjmercier-hpIqsD4AKlfQT0dZR+AlfA,
hannes-druUgvl0LCNAfugRpC6u6w, surenb-hpIqsD4AKlfQT0dZR+AlfA,
mkoutny-IBi9RG/b67k, daniel-/w4YWyX8dFk, Christian Benvenuti,
Nelson Escobar, Leon Romanovsky,
linux-rdma-u79uwXL29TY76Z2rM5mHXA
On Tue, Jan 24, 2023 at 04:42:35PM +1100, Alistair Popple wrote:
> diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
> index c301b3b..250276e 100644
> --- a/drivers/infiniband/hw/usnic/usnic_uiom.c
> +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
> @@ -89,8 +89,6 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
> struct page **page_list;
> struct scatterlist *sg;
> struct usnic_uiom_chunk *chunk;
> - unsigned long locked;
> - unsigned long lock_limit;
> unsigned long cur_base;
> unsigned long npages;
> int ret;
> @@ -123,10 +121,8 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
> uiomr->owning_mm = mm = current->mm;
> mmap_read_lock(mm);
>
> - locked = atomic64_add_return(npages, &current->mm->pinned_vm);
> - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> -
> - if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
> + vm_account_init_current(&uiomr->vm_account);
> + if (vm_account_pinned(&uiomr->vm_account, npages)) {
> ret = -ENOMEM;
> goto out;
> }
Is this error handling right? This driver tried to avoid the race by
using atomic64_add_return() but it means that the out label undoes the add:
> @@ -178,7 +174,8 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
> out:
> if (ret < 0) {
> usnic_uiom_put_pages(chunk_list, 0);
> - atomic64_sub(npages, &current->mm->pinned_vm);
Here
> + vm_unaccount_pinned(&uiomr->vm_account, npages);
> + vm_account_release(&uiomr->vm_account);
But with the new API we shouldn't call vm_unaccount_pinned() if
vm_account_pinned() doesn't succeed?
Jason
^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [RFC PATCH 06/19] RDMA/usnic: convert to use vm_account
@ 2023-01-24 14:41 ` Jason Gunthorpe
0 siblings, 0 replies; 108+ messages in thread
From: Jason Gunthorpe @ 2023-01-24 14:41 UTC (permalink / raw)
To: Alistair Popple
Cc: linux-mm, cgroups, linux-kernel, jhubbard, tjmercier, hannes,
surenb, mkoutny, daniel, Christian Benvenuti, Nelson Escobar,
Leon Romanovsky, linux-rdma
On Tue, Jan 24, 2023 at 04:42:35PM +1100, Alistair Popple wrote:
> diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
> index c301b3b..250276e 100644
> --- a/drivers/infiniband/hw/usnic/usnic_uiom.c
> +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
> @@ -89,8 +89,6 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
> struct page **page_list;
> struct scatterlist *sg;
> struct usnic_uiom_chunk *chunk;
> - unsigned long locked;
> - unsigned long lock_limit;
> unsigned long cur_base;
> unsigned long npages;
> int ret;
> @@ -123,10 +121,8 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
> uiomr->owning_mm = mm = current->mm;
> mmap_read_lock(mm);
>
> - locked = atomic64_add_return(npages, &current->mm->pinned_vm);
> - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> -
> - if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
> + vm_account_init_current(&uiomr->vm_account);
> + if (vm_account_pinned(&uiomr->vm_account, npages)) {
> ret = -ENOMEM;
> goto out;
> }
Is this error handling right? This driver tried to avoid the race by
using atomic64_add_return() but it means that the out label undoes the add:
> @@ -178,7 +174,8 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
> out:
> if (ret < 0) {
> usnic_uiom_put_pages(chunk_list, 0);
> - atomic64_sub(npages, &current->mm->pinned_vm);
Here
> + vm_unaccount_pinned(&uiomr->vm_account, npages);
> + vm_account_release(&uiomr->vm_account);
But with the new API we shouldn't call vm_unaccount_pinned() if
vm_account_pinned() doesn't succeed?
Jason
^ permalink raw reply [flat|nested] 108+ messages in thread
[parent not found: <Y8/uGL+TA7ow4Zmu-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org>]
* Re: [RFC PATCH 06/19] RDMA/usnic: convert to use vm_account
2023-01-24 14:41 ` Jason Gunthorpe
@ 2023-01-30 11:10 ` Alistair Popple
-1 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-30 11:10 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
jhubbard-DDmLM1+adcrQT0dZR+AlfA, tjmercier-hpIqsD4AKlfQT0dZR+AlfA,
hannes-druUgvl0LCNAfugRpC6u6w, surenb-hpIqsD4AKlfQT0dZR+AlfA,
mkoutny-IBi9RG/b67k, daniel-/w4YWyX8dFk, Christian Benvenuti,
Nelson Escobar, Leon Romanovsky,
linux-rdma-u79uwXL29TY76Z2rM5mHXA
Jason Gunthorpe <jgg-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org> writes:
> On Tue, Jan 24, 2023 at 04:42:35PM +1100, Alistair Popple wrote:
>> diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
>> index c301b3b..250276e 100644
>> --- a/drivers/infiniband/hw/usnic/usnic_uiom.c
>> +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
>> @@ -89,8 +89,6 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
>> struct page **page_list;
>> struct scatterlist *sg;
>> struct usnic_uiom_chunk *chunk;
>> - unsigned long locked;
>> - unsigned long lock_limit;
>> unsigned long cur_base;
>> unsigned long npages;
>> int ret;
>> @@ -123,10 +121,8 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
>> uiomr->owning_mm = mm = current->mm;
>> mmap_read_lock(mm);
>>
>> - locked = atomic64_add_return(npages, &current->mm->pinned_vm);
>> - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>> -
>> - if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
>> + vm_account_init_current(&uiomr->vm_account);
>> + if (vm_account_pinned(&uiomr->vm_account, npages)) {
>> ret = -ENOMEM;
>> goto out;
>> }
>
> Is this error handling right? This driver tried to avoid the race by
> using atomic64_add_return() but it means that the out label undoes the add:
>
>
>> @@ -178,7 +174,8 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
>> out:
>> if (ret < 0) {
>> usnic_uiom_put_pages(chunk_list, 0);
>> - atomic64_sub(npages, &current->mm->pinned_vm);
>
> Here
>
>> + vm_unaccount_pinned(&uiomr->vm_account, npages);
>> + vm_account_release(&uiomr->vm_account);
>
> But with the new API we shouldn't call vm_unaccount_pinned() if
> vm_account_pinned() doesn't succeed?
Good point. Will add the following fix:
@@ -123,6 +123,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
vm_account_init_current(&uiomr->vm_account);
if (vm_account_pinned(&uiomr->vm_account, npages)) {
+ npages = 0;
ret = -ENOMEM;
goto out;
}
>
> Jason
^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [RFC PATCH 06/19] RDMA/usnic: convert to use vm_account
@ 2023-01-30 11:10 ` Alistair Popple
0 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-30 11:10 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: linux-mm, cgroups, linux-kernel, jhubbard, tjmercier, hannes,
surenb, mkoutny, daniel, Christian Benvenuti, Nelson Escobar,
Leon Romanovsky, linux-rdma
Jason Gunthorpe <jgg@nvidia.com> writes:
> On Tue, Jan 24, 2023 at 04:42:35PM +1100, Alistair Popple wrote:
>> diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
>> index c301b3b..250276e 100644
>> --- a/drivers/infiniband/hw/usnic/usnic_uiom.c
>> +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
>> @@ -89,8 +89,6 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
>> struct page **page_list;
>> struct scatterlist *sg;
>> struct usnic_uiom_chunk *chunk;
>> - unsigned long locked;
>> - unsigned long lock_limit;
>> unsigned long cur_base;
>> unsigned long npages;
>> int ret;
>> @@ -123,10 +121,8 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
>> uiomr->owning_mm = mm = current->mm;
>> mmap_read_lock(mm);
>>
>> - locked = atomic64_add_return(npages, &current->mm->pinned_vm);
>> - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>> -
>> - if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
>> + vm_account_init_current(&uiomr->vm_account);
>> + if (vm_account_pinned(&uiomr->vm_account, npages)) {
>> ret = -ENOMEM;
>> goto out;
>> }
>
> Is this error handling right? This driver tried to avoid the race by
> using atomic64_add_return() but it means that the out label undoes the add:
>
>
>> @@ -178,7 +174,8 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
>> out:
>> if (ret < 0) {
>> usnic_uiom_put_pages(chunk_list, 0);
>> - atomic64_sub(npages, &current->mm->pinned_vm);
>
> Here
>
>> + vm_unaccount_pinned(&uiomr->vm_account, npages);
>> + vm_account_release(&uiomr->vm_account);
>
> But with the new API we shouldn't call vm_unaccount_pinned() if
> vm_account_pinned() doesn't succeed?
Good point. Will add the following fix:
@@ -123,6 +123,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
vm_account_init_current(&uiomr->vm_account);
if (vm_account_pinned(&uiomr->vm_account, npages)) {
+ npages = 0;
ret = -ENOMEM;
goto out;
}
>
> Jason
^ permalink raw reply [flat|nested] 108+ messages in thread
* [RFC PATCH 10/19] net: skb: Switch to using vm_account
2023-01-24 5:42 ` Alistair Popple
@ 2023-01-24 5:42 ` Alistair Popple
-1 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-24 5:42 UTC (permalink / raw)
To: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, jgg-DDmLM1+adcrQT0dZR+AlfA,
jhubbard-DDmLM1+adcrQT0dZR+AlfA, tjmercier-hpIqsD4AKlfQT0dZR+AlfA,
hannes-druUgvl0LCNAfugRpC6u6w, surenb-hpIqsD4AKlfQT0dZR+AlfA,
mkoutny-IBi9RG/b67k, daniel-/w4YWyX8dFk, Alistair Popple,
netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
rds-devel-N0ozoZBvEnrZJqsBc5GL+g
Switch to using vm_account to charge pinned pages. This will allow a
future change to charge the pinned pages to a cgroup to limit the
overall number of pinned pages in the system.
Signed-off-by: Alistair Popple <apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org>
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: rds-devel-N0ozoZBvEnrZJqsBc5GL+g@public.gmane.org
---
include/linux/skbuff.h | 6 ++---
include/net/sock.h | 2 ++-
net/core/skbuff.c | 47 +++++++++++++++----------------------------
net/rds/message.c | 9 +++++---
4 files changed, 28 insertions(+), 36 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4c84924..c956405 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -554,7 +554,6 @@ struct ubuf_info_msgzc {
};
struct mmpin {
- struct user_struct *user;
unsigned int num_pg;
} mmp;
};
@@ -563,8 +562,9 @@ struct ubuf_info_msgzc {
#define uarg_to_msgzc(ubuf_ptr) container_of((ubuf_ptr), struct ubuf_info_msgzc, \
ubuf)
-int mm_account_pinned_pages(struct mmpin *mmp, size_t size);
-void mm_unaccount_pinned_pages(struct mmpin *mmp);
+int mm_account_pinned_pages(struct vm_account *vm_account, struct mmpin *mmp,
+ size_t size);
+void mm_unaccount_pinned_pages(struct vm_account *vm_account, struct mmpin *mmp);
/* This data is invariant across clones and lives at
* the end of the header data, ie. at skb->end.
diff --git a/include/net/sock.h b/include/net/sock.h
index dcd72e6..bc3a868 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -334,6 +334,7 @@ struct sk_filter;
* @sk_security: used by security modules
* @sk_mark: generic packet mark
* @sk_cgrp_data: cgroup data for this cgroup
+ * @sk_vm_account: data for pinned memory accounting
* @sk_memcg: this socket's memory cgroup association
* @sk_write_pending: a write to stream socket waits to start
* @sk_state_change: callback to indicate change in the state of the sock
@@ -523,6 +524,7 @@ struct sock {
void *sk_security;
#endif
struct sock_cgroup_data sk_cgrp_data;
+ struct vm_account sk_vm_account;
struct mem_cgroup *sk_memcg;
void (*sk_state_change)(struct sock *sk);
void (*sk_data_ready)(struct sock *sk);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4a0eb55..bed3fc9 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1309,42 +1309,25 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
}
EXPORT_SYMBOL_GPL(skb_morph);
-int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
+int mm_account_pinned_pages(struct vm_account *vm_account, struct mmpin *mmp,
+ size_t size)
{
- unsigned long max_pg, num_pg, new_pg, old_pg;
- struct user_struct *user;
-
- if (capable(CAP_IPC_LOCK) || !size)
- return 0;
+ unsigned int num_pg;
num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */
- max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- user = mmp->user ? : current_user();
+ if (vm_account_pinned(vm_account, num_pg))
+ return -ENOBUFS;
- old_pg = atomic_long_read(&user->locked_vm);
- do {
- new_pg = old_pg + num_pg;
- if (new_pg > max_pg)
- return -ENOBUFS;
- } while (!atomic_long_try_cmpxchg(&user->locked_vm, &old_pg, new_pg));
-
- if (!mmp->user) {
- mmp->user = get_uid(user);
- mmp->num_pg = num_pg;
- } else {
- mmp->num_pg += num_pg;
- }
+ mmp->num_pg += num_pg;
return 0;
}
EXPORT_SYMBOL_GPL(mm_account_pinned_pages);
-void mm_unaccount_pinned_pages(struct mmpin *mmp)
+void mm_unaccount_pinned_pages(struct vm_account *vm_account, struct mmpin *mmp)
{
- if (mmp->user) {
- atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
- free_uid(mmp->user);
- }
+ vm_unaccount_pinned(vm_account, mmp->num_pg);
+ vm_account_release(vm_account);
}
EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
@@ -1361,9 +1344,12 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb));
uarg = (void *)skb->cb;
- uarg->mmp.user = NULL;
+ uarg->mmp.num_pg = 0;
+ vm_account_init(&sk->sk_vm_account, current,
+ current_user(), VM_ACCOUNT_USER);
- if (mm_account_pinned_pages(&uarg->mmp, size)) {
+ if (mm_account_pinned_pages(&sk->sk_vm_account, &uarg->mmp, size)) {
+ vm_account_release(&sk->sk_vm_account);
kfree_skb(skb);
return NULL;
}
@@ -1416,7 +1402,8 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
next = (u32)atomic_read(&sk->sk_zckey);
if ((u32)(uarg_zc->id + uarg_zc->len) == next) {
- if (mm_account_pinned_pages(&uarg_zc->mmp, size))
+ if (mm_account_pinned_pages(&sk->sk_vm_account,
+ &uarg_zc->mmp, size))
return NULL;
uarg_zc->len++;
uarg_zc->bytelen = bytelen;
@@ -1466,7 +1453,7 @@ static void __msg_zerocopy_callback(struct ubuf_info_msgzc *uarg)
u32 lo, hi;
u16 len;
- mm_unaccount_pinned_pages(&uarg->mmp);
+ mm_unaccount_pinned_pages(&sk->sk_vm_account, &uarg->mmp);
/* if !len, there was only 1 call, and it was aborted
* so do not queue a completion notification
diff --git a/net/rds/message.c b/net/rds/message.c
index b47e4f0..2138a70 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -99,7 +99,7 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs,
struct list_head *head;
unsigned long flags;
- mm_unaccount_pinned_pages(&znotif->z_mmp);
+ mm_unaccount_pinned_pages(&rs->rs_sk.sk_vm_account, &znotif->z_mmp);
q = &rs->rs_zcookie_queue;
spin_lock_irqsave(&q->lock, flags);
head = &q->zcookie_head;
@@ -367,6 +367,7 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *
int ret = 0;
int length = iov_iter_count(from);
struct rds_msg_zcopy_info *info;
+ struct vm_account *vm_account = &rm->m_rs->rs_sk.sk_vm_account;
rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from));
@@ -380,7 +381,9 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *
return -ENOMEM;
INIT_LIST_HEAD(&info->rs_zcookie_next);
rm->data.op_mmp_znotifier = &info->znotif;
- if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp,
+ vm_account_init(vm_account, current, current_user(), VM_ACCOUNT_USER);
+ if (mm_account_pinned_pages(vm_account,
+ &rm->data.op_mmp_znotifier->z_mmp,
length)) {
ret = -ENOMEM;
goto err;
@@ -399,7 +402,7 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *
for (i = 0; i < rm->data.op_nents; i++)
put_page(sg_page(&rm->data.op_sg[i]));
mmp = &rm->data.op_mmp_znotifier->z_mmp;
- mm_unaccount_pinned_pages(mmp);
+ mm_unaccount_pinned_pages(vm_account, mmp);
ret = -EFAULT;
goto err;
}
--
git-series 0.9.1
^ permalink raw reply related [flat|nested] 108+ messages in thread
* [RFC PATCH 10/19] net: skb: Switch to using vm_account
@ 2023-01-24 5:42 ` Alistair Popple
0 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-24 5:42 UTC (permalink / raw)
To: linux-mm, cgroups
Cc: linux-kernel, jgg, jhubbard, tjmercier, hannes, surenb, mkoutny,
daniel, Alistair Popple, netdev, linux-rdma, rds-devel
Switch to using vm_account to charge pinned pages. This will allow a
future change to charge the pinned pages to a cgroup to limit the
overall number of pinned pages in the system.
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Cc: linux-kernel@vger.kernel.org
Cc: netdev@vger.kernel.org
Cc: linux-rdma@vger.kernel.org
Cc: rds-devel@oss.oracle.com
---
include/linux/skbuff.h | 6 ++---
include/net/sock.h | 2 ++-
net/core/skbuff.c | 47 +++++++++++++++----------------------------
net/rds/message.c | 9 +++++---
4 files changed, 28 insertions(+), 36 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4c84924..c956405 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -554,7 +554,6 @@ struct ubuf_info_msgzc {
};
struct mmpin {
- struct user_struct *user;
unsigned int num_pg;
} mmp;
};
@@ -563,8 +562,9 @@ struct ubuf_info_msgzc {
#define uarg_to_msgzc(ubuf_ptr) container_of((ubuf_ptr), struct ubuf_info_msgzc, \
ubuf)
-int mm_account_pinned_pages(struct mmpin *mmp, size_t size);
-void mm_unaccount_pinned_pages(struct mmpin *mmp);
+int mm_account_pinned_pages(struct vm_account *vm_account, struct mmpin *mmp,
+ size_t size);
+void mm_unaccount_pinned_pages(struct vm_account *vm_account, struct mmpin *mmp);
/* This data is invariant across clones and lives at
* the end of the header data, ie. at skb->end.
diff --git a/include/net/sock.h b/include/net/sock.h
index dcd72e6..bc3a868 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -334,6 +334,7 @@ struct sk_filter;
* @sk_security: used by security modules
* @sk_mark: generic packet mark
* @sk_cgrp_data: cgroup data for this cgroup
+ * @sk_vm_account: data for pinned memory accounting
* @sk_memcg: this socket's memory cgroup association
* @sk_write_pending: a write to stream socket waits to start
* @sk_state_change: callback to indicate change in the state of the sock
@@ -523,6 +524,7 @@ struct sock {
void *sk_security;
#endif
struct sock_cgroup_data sk_cgrp_data;
+ struct vm_account sk_vm_account;
struct mem_cgroup *sk_memcg;
void (*sk_state_change)(struct sock *sk);
void (*sk_data_ready)(struct sock *sk);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4a0eb55..bed3fc9 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1309,42 +1309,25 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
}
EXPORT_SYMBOL_GPL(skb_morph);
-int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
+int mm_account_pinned_pages(struct vm_account *vm_account, struct mmpin *mmp,
+ size_t size)
{
- unsigned long max_pg, num_pg, new_pg, old_pg;
- struct user_struct *user;
-
- if (capable(CAP_IPC_LOCK) || !size)
- return 0;
+ unsigned int num_pg;
num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */
- max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- user = mmp->user ? : current_user();
+ if (vm_account_pinned(vm_account, num_pg))
+ return -ENOBUFS;
- old_pg = atomic_long_read(&user->locked_vm);
- do {
- new_pg = old_pg + num_pg;
- if (new_pg > max_pg)
- return -ENOBUFS;
- } while (!atomic_long_try_cmpxchg(&user->locked_vm, &old_pg, new_pg));
-
- if (!mmp->user) {
- mmp->user = get_uid(user);
- mmp->num_pg = num_pg;
- } else {
- mmp->num_pg += num_pg;
- }
+ mmp->num_pg += num_pg;
return 0;
}
EXPORT_SYMBOL_GPL(mm_account_pinned_pages);
-void mm_unaccount_pinned_pages(struct mmpin *mmp)
+void mm_unaccount_pinned_pages(struct vm_account *vm_account, struct mmpin *mmp)
{
- if (mmp->user) {
- atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
- free_uid(mmp->user);
- }
+ vm_unaccount_pinned(vm_account, mmp->num_pg);
+ vm_account_release(vm_account);
}
EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
@@ -1361,9 +1344,12 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb));
uarg = (void *)skb->cb;
- uarg->mmp.user = NULL;
+ uarg->mmp.num_pg = 0;
+ vm_account_init(&sk->sk_vm_account, current,
+ current_user(), VM_ACCOUNT_USER);
- if (mm_account_pinned_pages(&uarg->mmp, size)) {
+ if (mm_account_pinned_pages(&sk->sk_vm_account, &uarg->mmp, size)) {
+ vm_account_release(&sk->sk_vm_account);
kfree_skb(skb);
return NULL;
}
@@ -1416,7 +1402,8 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
next = (u32)atomic_read(&sk->sk_zckey);
if ((u32)(uarg_zc->id + uarg_zc->len) == next) {
- if (mm_account_pinned_pages(&uarg_zc->mmp, size))
+ if (mm_account_pinned_pages(&sk->sk_vm_account,
+ &uarg_zc->mmp, size))
return NULL;
uarg_zc->len++;
uarg_zc->bytelen = bytelen;
@@ -1466,7 +1453,7 @@ static void __msg_zerocopy_callback(struct ubuf_info_msgzc *uarg)
u32 lo, hi;
u16 len;
- mm_unaccount_pinned_pages(&uarg->mmp);
+ mm_unaccount_pinned_pages(&sk->sk_vm_account, &uarg->mmp);
/* if !len, there was only 1 call, and it was aborted
* so do not queue a completion notification
diff --git a/net/rds/message.c b/net/rds/message.c
index b47e4f0..2138a70 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -99,7 +99,7 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs,
struct list_head *head;
unsigned long flags;
- mm_unaccount_pinned_pages(&znotif->z_mmp);
+ mm_unaccount_pinned_pages(&rs->rs_sk.sk_vm_account, &znotif->z_mmp);
q = &rs->rs_zcookie_queue;
spin_lock_irqsave(&q->lock, flags);
head = &q->zcookie_head;
@@ -367,6 +367,7 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *
int ret = 0;
int length = iov_iter_count(from);
struct rds_msg_zcopy_info *info;
+ struct vm_account *vm_account = &rm->m_rs->rs_sk.sk_vm_account;
rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from));
@@ -380,7 +381,9 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *
return -ENOMEM;
INIT_LIST_HEAD(&info->rs_zcookie_next);
rm->data.op_mmp_znotifier = &info->znotif;
- if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp,
+ vm_account_init(vm_account, current, current_user(), VM_ACCOUNT_USER);
+ if (mm_account_pinned_pages(vm_account,
+ &rm->data.op_mmp_znotifier->z_mmp,
length)) {
ret = -ENOMEM;
goto err;
@@ -399,7 +402,7 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *
for (i = 0; i < rm->data.op_nents; i++)
put_page(sg_page(&rm->data.op_sg[i]));
mmp = &rm->data.op_mmp_znotifier->z_mmp;
- mm_unaccount_pinned_pages(mmp);
+ mm_unaccount_pinned_pages(vm_account, mmp);
ret = -EFAULT;
goto err;
}
--
git-series 0.9.1
^ permalink raw reply related [flat|nested] 108+ messages in thread
[parent not found: <9b54eef0b41b678cc5f318bd5ae0917bba5b8e21.1674538665.git-series.apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org>]
* Re: [RFC PATCH 10/19] net: skb: Switch to using vm_account
2023-01-24 5:42 ` Alistair Popple
@ 2023-01-24 14:51 ` Jason Gunthorpe
-1 siblings, 0 replies; 108+ messages in thread
From: Jason Gunthorpe @ 2023-01-24 14:51 UTC (permalink / raw)
To: Alistair Popple
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
jhubbard-DDmLM1+adcrQT0dZR+AlfA, tjmercier-hpIqsD4AKlfQT0dZR+AlfA,
hannes-druUgvl0LCNAfugRpC6u6w, surenb-hpIqsD4AKlfQT0dZR+AlfA,
mkoutny-IBi9RG/b67k, daniel-/w4YWyX8dFk,
netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
rds-devel-N0ozoZBvEnrZJqsBc5GL+g
On Tue, Jan 24, 2023 at 04:42:39PM +1100, Alistair Popple wrote:
> diff --git a/include/net/sock.h b/include/net/sock.h
> index dcd72e6..bc3a868 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -334,6 +334,7 @@ struct sk_filter;
> * @sk_security: used by security modules
> * @sk_mark: generic packet mark
> * @sk_cgrp_data: cgroup data for this cgroup
> + * @sk_vm_account: data for pinned memory accounting
> * @sk_memcg: this socket's memory cgroup association
> * @sk_write_pending: a write to stream socket waits to start
> * @sk_state_change: callback to indicate change in the state of the sock
> @@ -523,6 +524,7 @@ struct sock {
> void *sk_security;
> #endif
> struct sock_cgroup_data sk_cgrp_data;
> + struct vm_account sk_vm_account;
> struct mem_cgroup *sk_memcg;
> void (*sk_state_change)(struct sock *sk);
> void (*sk_data_ready)(struct sock *sk);
I'm not sure this makes sense in a sock - each sock can be shared with
different processes..
> diff --git a/net/rds/message.c b/net/rds/message.c
> index b47e4f0..2138a70 100644
> --- a/net/rds/message.c
> +++ b/net/rds/message.c
> @@ -99,7 +99,7 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs,
> struct list_head *head;
> unsigned long flags;
>
> - mm_unaccount_pinned_pages(&znotif->z_mmp);
> + mm_unaccount_pinned_pages(&rs->rs_sk.sk_vm_account, &znotif->z_mmp);
> q = &rs->rs_zcookie_queue;
> spin_lock_irqsave(&q->lock, flags);
> head = &q->zcookie_head;
> @@ -367,6 +367,7 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *
> int ret = 0;
> int length = iov_iter_count(from);
> struct rds_msg_zcopy_info *info;
> + struct vm_account *vm_account = &rm->m_rs->rs_sk.sk_vm_account;
>
> rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from));
>
> @@ -380,7 +381,9 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *
> return -ENOMEM;
> INIT_LIST_HEAD(&info->rs_zcookie_next);
> rm->data.op_mmp_znotifier = &info->znotif;
> - if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp,
> + vm_account_init(vm_account, current, current_user(), VM_ACCOUNT_USER);
> + if (mm_account_pinned_pages(vm_account,
> + &rm->data.op_mmp_znotifier->z_mmp,
> length)) {
> ret = -ENOMEM;
> goto err;
> @@ -399,7 +402,7 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *
> for (i = 0; i < rm->data.op_nents; i++)
> put_page(sg_page(&rm->data.op_sg[i]));
> mmp = &rm->data.op_mmp_znotifier->z_mmp;
> - mm_unaccount_pinned_pages(mmp);
> + mm_unaccount_pinned_pages(vm_account, mmp);
> ret = -EFAULT;
> goto err;
> }
I wonder if RDS should just not be doing accounting? Usually things
related to iov_iter are short term and we don't account for them.
But then I don't really know how RDS works, Santos?
Regardless, maybe the vm_account should be stored in the
rds_msg_zcopy_info ?
Jason
^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [RFC PATCH 10/19] net: skb: Switch to using vm_account
@ 2023-01-24 14:51 ` Jason Gunthorpe
0 siblings, 0 replies; 108+ messages in thread
From: Jason Gunthorpe @ 2023-01-24 14:51 UTC (permalink / raw)
To: Alistair Popple
Cc: linux-mm, cgroups, linux-kernel, jhubbard, tjmercier, hannes,
surenb, mkoutny, daniel, netdev, linux-rdma, rds-devel
On Tue, Jan 24, 2023 at 04:42:39PM +1100, Alistair Popple wrote:
> diff --git a/include/net/sock.h b/include/net/sock.h
> index dcd72e6..bc3a868 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -334,6 +334,7 @@ struct sk_filter;
> * @sk_security: used by security modules
> * @sk_mark: generic packet mark
> * @sk_cgrp_data: cgroup data for this cgroup
> + * @sk_vm_account: data for pinned memory accounting
> * @sk_memcg: this socket's memory cgroup association
> * @sk_write_pending: a write to stream socket waits to start
> * @sk_state_change: callback to indicate change in the state of the sock
> @@ -523,6 +524,7 @@ struct sock {
> void *sk_security;
> #endif
> struct sock_cgroup_data sk_cgrp_data;
> + struct vm_account sk_vm_account;
> struct mem_cgroup *sk_memcg;
> void (*sk_state_change)(struct sock *sk);
> void (*sk_data_ready)(struct sock *sk);
I'm not sure this makes sense in a sock - each sock can be shared with
different processes..
> diff --git a/net/rds/message.c b/net/rds/message.c
> index b47e4f0..2138a70 100644
> --- a/net/rds/message.c
> +++ b/net/rds/message.c
> @@ -99,7 +99,7 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs,
> struct list_head *head;
> unsigned long flags;
>
> - mm_unaccount_pinned_pages(&znotif->z_mmp);
> + mm_unaccount_pinned_pages(&rs->rs_sk.sk_vm_account, &znotif->z_mmp);
> q = &rs->rs_zcookie_queue;
> spin_lock_irqsave(&q->lock, flags);
> head = &q->zcookie_head;
> @@ -367,6 +367,7 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *
> int ret = 0;
> int length = iov_iter_count(from);
> struct rds_msg_zcopy_info *info;
> + struct vm_account *vm_account = &rm->m_rs->rs_sk.sk_vm_account;
>
> rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from));
>
> @@ -380,7 +381,9 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *
> return -ENOMEM;
> INIT_LIST_HEAD(&info->rs_zcookie_next);
> rm->data.op_mmp_znotifier = &info->znotif;
> - if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp,
> + vm_account_init(vm_account, current, current_user(), VM_ACCOUNT_USER);
> + if (mm_account_pinned_pages(vm_account,
> + &rm->data.op_mmp_znotifier->z_mmp,
> length)) {
> ret = -ENOMEM;
> goto err;
> @@ -399,7 +402,7 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *
> for (i = 0; i < rm->data.op_nents; i++)
> put_page(sg_page(&rm->data.op_sg[i]));
> mmp = &rm->data.op_mmp_znotifier->z_mmp;
> - mm_unaccount_pinned_pages(mmp);
> + mm_unaccount_pinned_pages(vm_account, mmp);
> ret = -EFAULT;
> goto err;
> }
I wonder if RDS should just not be doing accounting? Usually things
related to iov_iter are short term and we don't account for them.
But then I don't really know how RDS works, Santos?
Regardless, maybe the vm_account should be stored in the
rds_msg_zcopy_info ?
Jason
^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [RFC PATCH 10/19] net: skb: Switch to using vm_account
2023-01-24 14:51 ` Jason Gunthorpe
(?)
@ 2023-01-30 11:17 ` Alistair Popple
[not found] ` <87pmawz2ma.fsf-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org>
-1 siblings, 1 reply; 108+ messages in thread
From: Alistair Popple @ 2023-01-30 11:17 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: linux-mm, cgroups, linux-kernel, jhubbard, tjmercier, hannes,
surenb, mkoutny, daniel, netdev, linux-rdma, rds-devel
Jason Gunthorpe <jgg@nvidia.com> writes:
> On Tue, Jan 24, 2023 at 04:42:39PM +1100, Alistair Popple wrote:
>> diff --git a/include/net/sock.h b/include/net/sock.h
>> index dcd72e6..bc3a868 100644
>> --- a/include/net/sock.h
>> +++ b/include/net/sock.h
>> @@ -334,6 +334,7 @@ struct sk_filter;
>> * @sk_security: used by security modules
>> * @sk_mark: generic packet mark
>> * @sk_cgrp_data: cgroup data for this cgroup
>> + * @sk_vm_account: data for pinned memory accounting
>> * @sk_memcg: this socket's memory cgroup association
>> * @sk_write_pending: a write to stream socket waits to start
>> * @sk_state_change: callback to indicate change in the state of the sock
>> @@ -523,6 +524,7 @@ struct sock {
>> void *sk_security;
>> #endif
>> struct sock_cgroup_data sk_cgrp_data;
>> + struct vm_account sk_vm_account;
>> struct mem_cgroup *sk_memcg;
>> void (*sk_state_change)(struct sock *sk);
>> void (*sk_data_ready)(struct sock *sk);
>
> I'm not sure this makes sense in a sock - each sock can be shared with
> different processes..
TBH it didn't feel right to me either so was hoping for some
feedback. Will try your suggestion below.
>> diff --git a/net/rds/message.c b/net/rds/message.c
>> index b47e4f0..2138a70 100644
>> --- a/net/rds/message.c
>> +++ b/net/rds/message.c
>> @@ -99,7 +99,7 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs,
>> struct list_head *head;
>> unsigned long flags;
>>
>> - mm_unaccount_pinned_pages(&znotif->z_mmp);
>> + mm_unaccount_pinned_pages(&rs->rs_sk.sk_vm_account, &znotif->z_mmp);
>> q = &rs->rs_zcookie_queue;
>> spin_lock_irqsave(&q->lock, flags);
>> head = &q->zcookie_head;
>> @@ -367,6 +367,7 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *
>> int ret = 0;
>> int length = iov_iter_count(from);
>> struct rds_msg_zcopy_info *info;
>> + struct vm_account *vm_account = &rm->m_rs->rs_sk.sk_vm_account;
>>
>> rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from));
>>
>> @@ -380,7 +381,9 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *
>> return -ENOMEM;
>> INIT_LIST_HEAD(&info->rs_zcookie_next);
>> rm->data.op_mmp_znotifier = &info->znotif;
>> - if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp,
>> + vm_account_init(vm_account, current, current_user(), VM_ACCOUNT_USER);
>> + if (mm_account_pinned_pages(vm_account,
>> + &rm->data.op_mmp_znotifier->z_mmp,
>> length)) {
>> ret = -ENOMEM;
>> goto err;
>> @@ -399,7 +402,7 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *
>> for (i = 0; i < rm->data.op_nents; i++)
>> put_page(sg_page(&rm->data.op_sg[i]));
>> mmp = &rm->data.op_mmp_znotifier->z_mmp;
>> - mm_unaccount_pinned_pages(mmp);
>> + mm_unaccount_pinned_pages(vm_account, mmp);
>> ret = -EFAULT;
>> goto err;
>> }
>
> I wonder if RDS should just not be doing accounting? Usually things
> related to iov_iter are short term and we don't account for them.
Yeah, I couldn't easily figure out why these were accounted for in the
first place either.
> But then I don't really know how RDS works, Santos?
>
> Regardless, maybe the vm_account should be stored in the
> rds_msg_zcopy_info ?
On first glance that looks like a better spot. Thanks for the
idea.
> Jason
^ permalink raw reply [flat|nested] 108+ messages in thread
* [RFC PATCH 13/19] fpga: dfl: afu: convert to use vm_account
2023-01-24 5:42 ` Alistair Popple
@ 2023-01-24 5:42 ` Alistair Popple
-1 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-24 5:42 UTC (permalink / raw)
To: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, jgg-DDmLM1+adcrQT0dZR+AlfA,
jhubbard-DDmLM1+adcrQT0dZR+AlfA, tjmercier-hpIqsD4AKlfQT0dZR+AlfA,
hannes-druUgvl0LCNAfugRpC6u6w, surenb-hpIqsD4AKlfQT0dZR+AlfA,
mkoutny-IBi9RG/b67k, daniel-/w4YWyX8dFk, Alistair Popple, Wu Hao,
Tom Rix, Moritz Fischer, Xu Yilun,
linux-fpga-u79uwXL29TY76Z2rM5mHXA
To charge pinned pages against the pins cgroup, drivers must use the
vm_account_pinned() functions, which require initialisation of a
struct vm_account. Convert the dfl-afu-region code to do this and
charge any pins to the pins cgroup.
Signed-off-by: Alistair Popple <apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org>
Cc: Wu Hao <hao.wu-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Cc: Tom Rix <trix-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Cc: Moritz Fischer <mdf-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
Cc: Xu Yilun <yilun.xu-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Cc: linux-fpga-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
---
drivers/fpga/dfl-afu-dma-region.c | 11 ++++++++---
drivers/fpga/dfl-afu.h | 1 +
2 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/drivers/fpga/dfl-afu-dma-region.c b/drivers/fpga/dfl-afu-dma-region.c
index 02b60fd..3b99784 100644
--- a/drivers/fpga/dfl-afu-dma-region.c
+++ b/drivers/fpga/dfl-afu-dma-region.c
@@ -38,7 +38,7 @@ static int afu_dma_pin_pages(struct dfl_feature_platform_data *pdata,
struct device *dev = &pdata->dev->dev;
int ret, pinned;
- ret = account_locked_vm(current->mm, npages, true);
+ ret = vm_account_pinned(&region->vm_account, npages);
if (ret)
return ret;
@@ -67,7 +67,7 @@ static int afu_dma_pin_pages(struct dfl_feature_platform_data *pdata,
free_pages:
kfree(region->pages);
unlock_vm:
- account_locked_vm(current->mm, npages, false);
+ vm_unaccount_pinned(&region->vm_account, npages);
return ret;
}
@@ -87,7 +87,7 @@ static void afu_dma_unpin_pages(struct dfl_feature_platform_data *pdata,
unpin_user_pages(region->pages, npages);
kfree(region->pages);
- account_locked_vm(current->mm, npages, false);
+ vm_unaccount_pinned(&region->vm_account, npages);
dev_dbg(dev, "%ld pages unpinned\n", npages);
}
@@ -223,6 +223,7 @@ void afu_dma_region_destroy(struct dfl_feature_platform_data *pdata)
afu_dma_unpin_pages(pdata, region);
node = rb_next(node);
+ vm_account_release(&region->vm_account);
kfree(region);
}
}
@@ -322,6 +323,8 @@ int afu_dma_map_region(struct dfl_feature_platform_data *pdata,
region->user_addr = user_addr;
region->length = length;
+ vm_account_init_current(&region->vm_account);
+
/* Pin the user memory region */
ret = afu_dma_pin_pages(pdata, region);
if (ret) {
@@ -365,6 +368,7 @@ int afu_dma_map_region(struct dfl_feature_platform_data *pdata,
unpin_pages:
afu_dma_unpin_pages(pdata, region);
free_region:
+ vm_account_release(&region->vm_account);
kfree(region);
return ret;
}
@@ -399,6 +403,7 @@ int afu_dma_unmap_region(struct dfl_feature_platform_data *pdata, u64 iova)
dma_unmap_page(dfl_fpga_pdata_to_parent(pdata),
region->iova, region->length, DMA_BIDIRECTIONAL);
afu_dma_unpin_pages(pdata, region);
+ vm_account_release(&region->vm_account);
kfree(region);
return 0;
diff --git a/drivers/fpga/dfl-afu.h b/drivers/fpga/dfl-afu.h
index e5020e2..b1554e0 100644
--- a/drivers/fpga/dfl-afu.h
+++ b/drivers/fpga/dfl-afu.h
@@ -51,6 +51,7 @@ struct dfl_afu_mmio_region {
* @in_use: flag to indicate if this region is in_use.
*/
struct dfl_afu_dma_region {
+ struct vm_account vm_account;
u64 user_addr;
u64 length;
u64 iova;
--
git-series 0.9.1
^ permalink raw reply related [flat|nested] 108+ messages in thread
* [RFC PATCH 13/19] fpga: dfl: afu: convert to use vm_account
@ 2023-01-24 5:42 ` Alistair Popple
0 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-24 5:42 UTC (permalink / raw)
To: linux-mm, cgroups
Cc: linux-kernel, jgg, jhubbard, tjmercier, hannes, surenb, mkoutny,
daniel, Alistair Popple, Wu Hao, Tom Rix, Moritz Fischer,
Xu Yilun, linux-fpga
To charge pinned pages against the pins cgroup, drivers must use the
vm_account_pinned() functions, which require initialisation of a
struct vm_account. Convert the dfl-afu-region code to do this and
charge any pins to the pins cgroup.
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Cc: Wu Hao <hao.wu@intel.com>
Cc: Tom Rix <trix@redhat.com>
Cc: Moritz Fischer <mdf@kernel.org>
Cc: Xu Yilun <yilun.xu@intel.com>
Cc: linux-fpga@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
drivers/fpga/dfl-afu-dma-region.c | 11 ++++++++---
drivers/fpga/dfl-afu.h | 1 +
2 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/drivers/fpga/dfl-afu-dma-region.c b/drivers/fpga/dfl-afu-dma-region.c
index 02b60fd..3b99784 100644
--- a/drivers/fpga/dfl-afu-dma-region.c
+++ b/drivers/fpga/dfl-afu-dma-region.c
@@ -38,7 +38,7 @@ static int afu_dma_pin_pages(struct dfl_feature_platform_data *pdata,
struct device *dev = &pdata->dev->dev;
int ret, pinned;
- ret = account_locked_vm(current->mm, npages, true);
+ ret = vm_account_pinned(&region->vm_account, npages);
if (ret)
return ret;
@@ -67,7 +67,7 @@ static int afu_dma_pin_pages(struct dfl_feature_platform_data *pdata,
free_pages:
kfree(region->pages);
unlock_vm:
- account_locked_vm(current->mm, npages, false);
+ vm_unaccount_pinned(&region->vm_account, npages);
return ret;
}
@@ -87,7 +87,7 @@ static void afu_dma_unpin_pages(struct dfl_feature_platform_data *pdata,
unpin_user_pages(region->pages, npages);
kfree(region->pages);
- account_locked_vm(current->mm, npages, false);
+ vm_unaccount_pinned(&region->vm_account, npages);
dev_dbg(dev, "%ld pages unpinned\n", npages);
}
@@ -223,6 +223,7 @@ void afu_dma_region_destroy(struct dfl_feature_platform_data *pdata)
afu_dma_unpin_pages(pdata, region);
node = rb_next(node);
+ vm_account_release(&region->vm_account);
kfree(region);
}
}
@@ -322,6 +323,8 @@ int afu_dma_map_region(struct dfl_feature_platform_data *pdata,
region->user_addr = user_addr;
region->length = length;
+ vm_account_init_current(&region->vm_account);
+
/* Pin the user memory region */
ret = afu_dma_pin_pages(pdata, region);
if (ret) {
@@ -365,6 +368,7 @@ int afu_dma_map_region(struct dfl_feature_platform_data *pdata,
unpin_pages:
afu_dma_unpin_pages(pdata, region);
free_region:
+ vm_account_release(&region->vm_account);
kfree(region);
return ret;
}
@@ -399,6 +403,7 @@ int afu_dma_unmap_region(struct dfl_feature_platform_data *pdata, u64 iova)
dma_unmap_page(dfl_fpga_pdata_to_parent(pdata),
region->iova, region->length, DMA_BIDIRECTIONAL);
afu_dma_unpin_pages(pdata, region);
+ vm_account_release(&region->vm_account);
kfree(region);
return 0;
diff --git a/drivers/fpga/dfl-afu.h b/drivers/fpga/dfl-afu.h
index e5020e2..b1554e0 100644
--- a/drivers/fpga/dfl-afu.h
+++ b/drivers/fpga/dfl-afu.h
@@ -51,6 +51,7 @@ struct dfl_afu_mmio_region {
* @in_use: flag to indicate if this region is in_use.
*/
struct dfl_afu_dma_region {
+ struct vm_account vm_account;
u64 user_addr;
u64 length;
u64 iova;
--
git-series 0.9.1
^ permalink raw reply related [flat|nested] 108+ messages in thread
* [RFC PATCH 14/19] mm: Introduce a cgroup for pinned memory
2023-01-24 5:42 ` Alistair Popple
@ 2023-01-24 5:42 ` Alistair Popple
-1 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-24 5:42 UTC (permalink / raw)
To: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, jgg-DDmLM1+adcrQT0dZR+AlfA,
jhubbard-DDmLM1+adcrQT0dZR+AlfA, tjmercier-hpIqsD4AKlfQT0dZR+AlfA,
hannes-druUgvl0LCNAfugRpC6u6w, surenb-hpIqsD4AKlfQT0dZR+AlfA,
mkoutny-IBi9RG/b67k, daniel-/w4YWyX8dFk, Alistair Popple,
Tejun Heo, Zefan Li, Andrew Morton
If too much memory in a system is pinned or locked, it can lead to
problems such as performance degradation or, in the worst case,
out-of-memory errors, as such memory cannot be moved or paged out.
In order to prevent users without CAP_IPC_LOCK from causing these
issues the amount of memory that can be pinned is typically limited by
RLIMIT_MEMLOCK. However this is inflexible as limits can't be shared
between tasks and the enforcement of these limits is inconsistent
between in-kernel users of pinned memory such as mlock() and device
drivers which may also pin pages with pin_user_pages().
To allow for a single limit to be set, introduce a cgroup controller
which can be used to limit the number of pages being pinned by all
tasks in the cgroup.
Signed-off-by: Alistair Popple <apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org>
Cc: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
Cc: Zefan Li <lizefan.x-EC8Uxl6Npydl57MIdRCFDg@public.gmane.org>
Cc: Johannes Weiner <hannes-druUgvl0LCNAfugRpC6u6w@public.gmane.org>
Cc: Andrew Morton <akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: cgroups-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org
---
MAINTAINERS | 7 +-
include/linux/cgroup.h | 20 +++-
include/linux/cgroup_subsys.h | 4 +-
mm/Kconfig | 11 +-
mm/Makefile | 1 +-
mm/pins_cgroup.c | 273 +++++++++++++++++++++++++++++++++++-
6 files changed, 316 insertions(+)
create mode 100644 mm/pins_cgroup.c
diff --git a/MAINTAINERS b/MAINTAINERS
index f781f93..f8526e2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5381,6 +5381,13 @@ F: tools/testing/selftests/cgroup/memcg_protection.m
F: tools/testing/selftests/cgroup/test_kmem.c
F: tools/testing/selftests/cgroup/test_memcontrol.c
+CONTROL GROUP - PINNED AND LOCKED MEMORY
+M: Alistair Popple <apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org>
+L: cgroups-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
+L: linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org
+S: Maintained
+F: mm/pins_cgroup.c
+
CORETEMP HARDWARE MONITORING DRIVER
M: Fenghua Yu <fenghua.yu-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
L: linux-hwmon-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 3410aec..440f299 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -857,4 +857,24 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
#endif /* CONFIG_CGROUP_BPF */
+#ifdef CONFIG_CGROUP_PINS
+
+struct pins_cgroup *get_pins_cg(struct task_struct *task);
+void put_pins_cg(struct pins_cgroup *cg);
+void pins_uncharge(struct pins_cgroup *pins, int num);
+int pins_try_charge(struct pins_cgroup *pins, int num);
+
+#else /* CONFIG_CGROUP_PINS */
+
+static inline struct pins_cgroup *get_pins_cg(struct task_struct *task)
+{
+ return NULL;
+}
+
+static inline void put_pins_cg(struct pins_cgroup *cg) {}
+static inline void pins_uncharge(struct pins_cgroup *pins, int num) {}
+static inline int pins_try_charge(struct pins_cgroup *pins, int num) { return 0; }
+
+#endif /* CONFIG_CGROUP_PINS */
+
#endif /* _LINUX_CGROUP_H */
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 4452354..c1b4aab 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -65,6 +65,10 @@ SUBSYS(rdma)
SUBSYS(misc)
#endif
+#if IS_ENABLED(CONFIG_CGROUP_PINS)
+SUBSYS(pins)
+#endif
+
/*
* The following subsystems are not supported on the default hierarchy.
*/
diff --git a/mm/Kconfig b/mm/Kconfig
index ff7b209..7a32b98 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1183,6 +1183,17 @@ config LRU_GEN_STATS
This option has a per-memcg and per-node memory overhead.
# }
+config CGROUP_PINS
+	bool "Cgroup for pinned and locked memory"
+	depends on CGROUPS
+	default y
+	help
+	  Having too much memory pinned or locked can lead to system
+	  instability due to increased likelihood of encountering
+	  out-of-memory conditions. Select this option to enable a cgroup
+	  which can be used to limit the overall number of pages locked or
+	  pinned by drivers.
+
source "mm/damon/Kconfig"
endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 8e105e5..81db189 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -138,3 +138,4 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o
obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
+obj-$(CONFIG_CGROUP_PINS) += pins_cgroup.o
diff --git a/mm/pins_cgroup.c b/mm/pins_cgroup.c
new file mode 100644
index 0000000..cc310d5
--- /dev/null
+++ b/mm/pins_cgroup.c
@@ -0,0 +1,273 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Controller for cgroups limiting number of pages pinned for FOLL_LONGTERM.
+ *
+ * Copyright (C) 2022 Alistair Popple <apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/threads.h>
+#include <linux/atomic.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/sched/task.h>
+
+#define PINS_MAX (-1ULL)
+#define PINS_MAX_STR "max"
+
+struct pins_cgroup {
+ struct cgroup_subsys_state css;
+
+ atomic64_t counter;
+ atomic64_t limit;
+
+ struct cgroup_file events_file;
+ atomic64_t events_limit;
+};
+
+static struct pins_cgroup *css_pins(struct cgroup_subsys_state *css)
+{
+ return container_of(css, struct pins_cgroup, css);
+}
+
+static struct pins_cgroup *parent_pins(struct pins_cgroup *pins)
+{
+ return css_pins(pins->css.parent);
+}
+
+struct pins_cgroup *get_pins_cg(struct task_struct *task)
+{
+ return css_pins(task_get_css(task, pins_cgrp_id));
+}
+
+void put_pins_cg(struct pins_cgroup *cg)
+{
+ css_put(&cg->css);
+}
+
+static struct cgroup_subsys_state *
+pins_css_alloc(struct cgroup_subsys_state *parent)
+{
+ struct pins_cgroup *pins;
+
+ pins = kzalloc(sizeof(struct pins_cgroup), GFP_KERNEL);
+ if (!pins)
+ return ERR_PTR(-ENOMEM);
+
+ atomic64_set(&pins->counter, 0);
+ atomic64_set(&pins->limit, PINS_MAX);
+ atomic64_set(&pins->events_limit, 0);
+ return &pins->css;
+}
+
+static void pins_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css_pins(css));
+}
+
+/**
+ * pins_cancel - uncharge the local pin count
+ * @pins: the pin cgroup state
+ * @num: the number of pins to cancel
+ *
+ * Only @pins itself is adjusted; ancestors are left alone. This function will
+ * WARN if the pin count goes under 0, as that is a bug in the pins controller.
+ */
+static void pins_cancel(struct pins_cgroup *pins, int num)
+{
+	/*
+	 * File-local helper, hence static: it has no prototype in any header.
+	 * A negative resulting count is invalid and is reported once.
+	 */
+	WARN_ON_ONCE(atomic64_add_negative(-num, &pins->counter));
+}
+
+/**
+ * pins_uncharge - hierarchically uncharge the pin count
+ * @pins: the pin cgroup state
+ * @num: the number of pins to uncharge
+ */
+void pins_uncharge(struct pins_cgroup *pins, int num)
+{
+ struct pins_cgroup *p;
+
+ for (p = pins; parent_pins(p); p = parent_pins(p))
+ pins_cancel(p, num);
+}
+
+/**
+ * pins_charge - hierarchically charge the pin count
+ * @pins: the pin cgroup state
+ * @num: the number of pins to charge
+ *
+ * Adds @num to every cgroup from @pins upwards that has a parent, without
+ * checking any limit, so it cannot fail and counts may exceed the limit. Used
+ * by pins_can_attach() and pins_cancel_attach() to move a task's charge.
+ */
+static void pins_charge(struct pins_cgroup *pins, int num)
+{
+ struct pins_cgroup *p;
+
+ for (p = pins; parent_pins(p); p = parent_pins(p))
+ atomic64_add(num, &p->counter);
+}
+
+/**
+ * pins_try_charge - hierarchically try to charge the pin count
+ * @pins: the pin cgroup state
+ * @num: the number of pins to charge
+ *
+ * This function follows the set limit. It will fail if the charge would cause
+ * the new value to exceed the hierarchical limit. Returns 0 if the charge
+ * succeeded, otherwise -EAGAIN.
+ */
+int pins_try_charge(struct pins_cgroup *pins, int num)
+{
+ struct pins_cgroup *p, *q;
+
+ for (p = pins; parent_pins(p); p = parent_pins(p)) {
+ uint64_t new = atomic64_add_return(num, &p->counter);
+ uint64_t limit = atomic64_read(&p->limit);
+
+ if (limit != PINS_MAX && new > limit)
+ goto revert;
+ }
+
+ return 0;
+
+revert:
+ for (q = pins; q != p; q = parent_pins(q))
+ pins_cancel(q, num);
+ pins_cancel(p, num);
+
+ return -EAGAIN;
+}
+
+/* Move each migrating task's locked_vm charge to the destination cgroup. */
+static int pins_can_attach(struct cgroup_taskset *tset)
+{
+	struct cgroup_subsys_state *dst_css;
+	struct task_struct *task;
+
+	cgroup_taskset_for_each(task, dst_css, tset) {
+		struct pins_cgroup *old_pins = css_pins(task_css(task, pins_cgrp_id));
+		struct mm_struct *mm = task->mm;
+
+		/* Kernel threads and exiting tasks have no mm to account. */
+		if (!mm)
+			continue;
+		pins_charge(css_pins(dst_css), mm->locked_vm);
+		pins_uncharge(old_pins, mm->locked_vm);
+	}
+
+	return 0;
+}
+
+/* Undo pins_can_attach(): give each task's locked_vm charge back. */
+static void pins_cancel_attach(struct cgroup_taskset *tset)
+{
+	struct cgroup_subsys_state *dst_css;
+	struct task_struct *task;
+
+	cgroup_taskset_for_each(task, dst_css, tset) {
+		struct pins_cgroup *old_pins = css_pins(task_css(task, pins_cgrp_id));
+		struct mm_struct *mm = task->mm;
+
+		/* Tasks without an mm (e.g. kernel threads) have no locked_vm. */
+		if (!mm)
+			continue;
+		pins_charge(old_pins, mm->locked_vm);
+		pins_uncharge(css_pins(dst_css), mm->locked_vm);
+	}
+}
+
+
+/*
+ * Write handler for the "max" control file: accepts "max" (no limit) or a
+ * page count, parsed as unsigned so that negative input is rejected.
+ */
+static ssize_t pins_max_write(struct kernfs_open_file *of, char *buf,
+			      size_t nbytes, loff_t off)
+{
+	struct pins_cgroup *pins = css_pins(of_css(of));
+	uint64_t limit;
+	int err;
+
+	buf = strstrip(buf);
+	if (!strcmp(buf, PINS_MAX_STR)) {
+		limit = PINS_MAX;
+		goto set_limit;
+	}
+
+	err = kstrtou64(buf, 0, &limit);
+	if (err)
+		return err;
+
+	if (limit > S64_MAX)	/* the counter is an s64; it can never get there */
+		return -EINVAL;
+
+set_limit:
+	/* No lock needed: a racing charge may simply see the old limit. */
+	atomic64_set(&pins->limit, limit);
+	return nbytes;
+}
+
+/* Show the "max" control file: the pin limit, or "max" when unlimited. */
+static int pins_max_show(struct seq_file *sf, void *v)
+{
+	struct pins_cgroup *pins = css_pins(seq_css(sf));
+	uint64_t limit = atomic64_read(&pins->limit);
+
+	if (limit >= PINS_MAX)
+		seq_printf(sf, "%s\n", PINS_MAX_STR);
+	else
+		seq_printf(sf, "%llu\n", limit);	/* limit is unsigned */
+
+	return 0;
+}
+
+static s64 pins_current_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct pins_cgroup *pins = css_pins(css);
+
+ return atomic64_read(&pins->counter);
+}
+
+static int pins_events_show(struct seq_file *sf, void *v)
+{
+ struct pins_cgroup *pins = css_pins(seq_css(sf));
+
+ seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pins->events_limit));
+ return 0;
+}
+
+static struct cftype pins_files[] = {
+ {
+ .name = "max",
+ .write = pins_max_write,
+ .seq_show = pins_max_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "current",
+ .read_s64 = pins_current_read,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "events",
+ .seq_show = pins_events_show,
+ .file_offset = offsetof(struct pins_cgroup, events_file),
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys pins_cgrp_subsys = {
+ .css_alloc = pins_css_alloc,
+ .css_free = pins_css_free,
+ .legacy_cftypes = pins_files,
+ .dfl_cftypes = pins_files,
+ .can_attach = pins_can_attach,
+ .cancel_attach = pins_cancel_attach,
+};
--
git-series 0.9.1
^ permalink raw reply related [flat|nested] 108+ messages in thread* [RFC PATCH 14/19] mm: Introduce a cgroup for pinned memory
@ 2023-01-24 5:42 ` Alistair Popple
0 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-24 5:42 UTC (permalink / raw)
To: linux-mm, cgroups
Cc: linux-kernel, jgg, jhubbard, tjmercier, hannes, surenb, mkoutny,
daniel, Alistair Popple, Tejun Heo, Zefan Li, Andrew Morton
If too much memory in a system is pinned or locked it can lead to
problems such as performance degradation or, in the worst case,
out-of-memory errors, as such memory cannot be moved or paged out.
In order to prevent users without CAP_IPC_LOCK from causing these
issues the amount of memory that can be pinned is typically limited by
RLIMIT_MEMLOCK. However this is inflexible as limits can't be shared
between tasks and the enforcement of these limits is inconsistent
between in-kernel users of pinned memory such as mlock() and device
drivers which may also pin pages with pin_user_pages().
To allow for a single limit to be set introduce a cgroup controller
which can be used to limit the number of pages being pinned by all
tasks in the cgroup.
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zefan Li <lizefan.x@bytedance.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Cc: cgroups@vger.kernel.org
Cc: linux-mm@kvack.org
---
MAINTAINERS | 7 +-
include/linux/cgroup.h | 20 +++-
include/linux/cgroup_subsys.h | 4 +-
mm/Kconfig | 11 +-
mm/Makefile | 1 +-
mm/pins_cgroup.c | 273 +++++++++++++++++++++++++++++++++++-
6 files changed, 316 insertions(+)
create mode 100644 mm/pins_cgroup.c
diff --git a/MAINTAINERS b/MAINTAINERS
index f781f93..f8526e2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5381,6 +5381,13 @@ F: tools/testing/selftests/cgroup/memcg_protection.m
F: tools/testing/selftests/cgroup/test_kmem.c
F: tools/testing/selftests/cgroup/test_memcontrol.c
+CONTROL GROUP - PINNED AND LOCKED MEMORY
+M: Alistair Popple <apopple@nvidia.com>
+L: cgroups@vger.kernel.org
+L: linux-mm@kvack.org
+S: Maintained
+F: mm/pins_cgroup.c
+
CORETEMP HARDWARE MONITORING DRIVER
M: Fenghua Yu <fenghua.yu@intel.com>
L: linux-hwmon@vger.kernel.org
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 3410aec..440f299 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -857,4 +857,24 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
#endif /* CONFIG_CGROUP_BPF */
+#ifdef CONFIG_CGROUP_PINS
+
+struct pins_cgroup *get_pins_cg(struct task_struct *task);
+void put_pins_cg(struct pins_cgroup *cg);
+void pins_uncharge(struct pins_cgroup *pins, int num);
+int pins_try_charge(struct pins_cgroup *pins, int num);
+
+#else /* CONFIG_CGROUP_PINS */
+
+static inline struct pins_cgroup *get_pins_cg(struct task_struct *task)
+{
+ return NULL;
+}
+
+static inline void put_pins_cg(struct pins_cgroup *cg) {}
+static inline void pins_uncharge(struct pins_cgroup *pins, int num) {}
+static inline int pins_try_charge(struct pins_cgroup *pins, int num) { return 0; }
+
+#endif /* CONFIG_CGROUP_PINS */
+
#endif /* _LINUX_CGROUP_H */
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 4452354..c1b4aab 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -65,6 +65,10 @@ SUBSYS(rdma)
SUBSYS(misc)
#endif
+#if IS_ENABLED(CONFIG_CGROUP_PINS)
+SUBSYS(pins)
+#endif
+
/*
* The following subsystems are not supported on the default hierarchy.
*/
diff --git a/mm/Kconfig b/mm/Kconfig
index ff7b209..7a32b98 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1183,6 +1183,17 @@ config LRU_GEN_STATS
This option has a per-memcg and per-node memory overhead.
# }
+config CGROUP_PINS
+	bool "Cgroup for pinned and locked memory"
+	depends on CGROUPS
+	default y
+	help
+	  Having too much memory pinned or locked can lead to system
+	  instability due to increased likelihood of encountering
+	  out-of-memory conditions. Select this option to enable a cgroup
+	  which can be used to limit the overall number of pages locked or
+	  pinned by drivers.
+
source "mm/damon/Kconfig"
endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 8e105e5..81db189 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -138,3 +138,4 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o
obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
+obj-$(CONFIG_CGROUP_PINS) += pins_cgroup.o
diff --git a/mm/pins_cgroup.c b/mm/pins_cgroup.c
new file mode 100644
index 0000000..cc310d5
--- /dev/null
+++ b/mm/pins_cgroup.c
@@ -0,0 +1,273 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Controller for cgroups limiting number of pages pinned for FOLL_LONGTERM.
+ *
+ * Copyright (C) 2022 Alistair Popple <apopple@nvidia.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/threads.h>
+#include <linux/atomic.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/sched/task.h>
+
+#define PINS_MAX (-1ULL)
+#define PINS_MAX_STR "max"
+
+struct pins_cgroup {
+ struct cgroup_subsys_state css;
+
+ atomic64_t counter;
+ atomic64_t limit;
+
+ struct cgroup_file events_file;
+ atomic64_t events_limit;
+};
+
+static struct pins_cgroup *css_pins(struct cgroup_subsys_state *css)
+{
+ return container_of(css, struct pins_cgroup, css);
+}
+
+static struct pins_cgroup *parent_pins(struct pins_cgroup *pins)
+{
+ return css_pins(pins->css.parent);
+}
+
+struct pins_cgroup *get_pins_cg(struct task_struct *task)
+{
+ return css_pins(task_get_css(task, pins_cgrp_id));
+}
+
+void put_pins_cg(struct pins_cgroup *cg)
+{
+ css_put(&cg->css);
+}
+
+static struct cgroup_subsys_state *
+pins_css_alloc(struct cgroup_subsys_state *parent)
+{
+ struct pins_cgroup *pins;
+
+ pins = kzalloc(sizeof(struct pins_cgroup), GFP_KERNEL);
+ if (!pins)
+ return ERR_PTR(-ENOMEM);
+
+ atomic64_set(&pins->counter, 0);
+ atomic64_set(&pins->limit, PINS_MAX);
+ atomic64_set(&pins->events_limit, 0);
+ return &pins->css;
+}
+
+static void pins_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css_pins(css));
+}
+
+/**
+ * pins_cancel - uncharge the local pin count
+ * @pins: the pin cgroup state
+ * @num: the number of pins to cancel
+ *
+ * Only @pins itself is adjusted; ancestors are left alone. This function will
+ * WARN if the pin count goes under 0, as that is a bug in the pins controller.
+ */
+static void pins_cancel(struct pins_cgroup *pins, int num)
+{
+	/*
+	 * File-local helper, hence static: it has no prototype in any header.
+	 * A negative resulting count is invalid and is reported once.
+	 */
+	WARN_ON_ONCE(atomic64_add_negative(-num, &pins->counter));
+}
+
+/**
+ * pins_uncharge - hierarchically uncharge the pin count
+ * @pins: the pin cgroup state
+ * @num: the number of pins to uncharge
+ */
+void pins_uncharge(struct pins_cgroup *pins, int num)
+{
+ struct pins_cgroup *p;
+
+ for (p = pins; parent_pins(p); p = parent_pins(p))
+ pins_cancel(p, num);
+}
+
+/**
+ * pins_charge - hierarchically charge the pin count
+ * @pins: the pin cgroup state
+ * @num: the number of pins to charge
+ *
+ * Adds @num to every cgroup from @pins upwards that has a parent, without
+ * checking any limit, so it cannot fail and counts may exceed the limit. Used
+ * by pins_can_attach() and pins_cancel_attach() to move a task's charge.
+ */
+static void pins_charge(struct pins_cgroup *pins, int num)
+{
+ struct pins_cgroup *p;
+
+ for (p = pins; parent_pins(p); p = parent_pins(p))
+ atomic64_add(num, &p->counter);
+}
+
+/**
+ * pins_try_charge - hierarchically try to charge the pin count
+ * @pins: the pin cgroup state
+ * @num: the number of pins to charge
+ *
+ * This function follows the set limit. It will fail if the charge would cause
+ * the new value to exceed the hierarchical limit. Returns 0 if the charge
+ * succeeded, otherwise -EAGAIN.
+ */
+int pins_try_charge(struct pins_cgroup *pins, int num)
+{
+ struct pins_cgroup *p, *q;
+
+ for (p = pins; parent_pins(p); p = parent_pins(p)) {
+ uint64_t new = atomic64_add_return(num, &p->counter);
+ uint64_t limit = atomic64_read(&p->limit);
+
+ if (limit != PINS_MAX && new > limit)
+ goto revert;
+ }
+
+ return 0;
+
+revert:
+ for (q = pins; q != p; q = parent_pins(q))
+ pins_cancel(q, num);
+ pins_cancel(p, num);
+
+ return -EAGAIN;
+}
+
+/* Move each migrating task's locked_vm charge to the destination cgroup. */
+static int pins_can_attach(struct cgroup_taskset *tset)
+{
+	struct cgroup_subsys_state *dst_css;
+	struct task_struct *task;
+
+	cgroup_taskset_for_each(task, dst_css, tset) {
+		struct pins_cgroup *old_pins = css_pins(task_css(task, pins_cgrp_id));
+		struct mm_struct *mm = task->mm;
+
+		/* Kernel threads and exiting tasks have no mm to account. */
+		if (!mm)
+			continue;
+		pins_charge(css_pins(dst_css), mm->locked_vm);
+		pins_uncharge(old_pins, mm->locked_vm);
+	}
+
+	return 0;
+}
+
+/* Undo pins_can_attach(): give each task's locked_vm charge back. */
+static void pins_cancel_attach(struct cgroup_taskset *tset)
+{
+	struct cgroup_subsys_state *dst_css;
+	struct task_struct *task;
+
+	cgroup_taskset_for_each(task, dst_css, tset) {
+		struct pins_cgroup *old_pins = css_pins(task_css(task, pins_cgrp_id));
+		struct mm_struct *mm = task->mm;
+
+		/* Tasks without an mm (e.g. kernel threads) have no locked_vm. */
+		if (!mm)
+			continue;
+		pins_charge(old_pins, mm->locked_vm);
+		pins_uncharge(css_pins(dst_css), mm->locked_vm);
+	}
+}
+
+
+/*
+ * Write handler for the "max" control file: accepts "max" (no limit) or a
+ * page count, parsed as unsigned so that negative input is rejected.
+ */
+static ssize_t pins_max_write(struct kernfs_open_file *of, char *buf,
+			      size_t nbytes, loff_t off)
+{
+	struct pins_cgroup *pins = css_pins(of_css(of));
+	uint64_t limit;
+	int err;
+
+	buf = strstrip(buf);
+	if (!strcmp(buf, PINS_MAX_STR)) {
+		limit = PINS_MAX;
+		goto set_limit;
+	}
+
+	err = kstrtou64(buf, 0, &limit);
+	if (err)
+		return err;
+
+	if (limit > S64_MAX)	/* the counter is an s64; it can never get there */
+		return -EINVAL;
+
+set_limit:
+	/* No lock needed: a racing charge may simply see the old limit. */
+	atomic64_set(&pins->limit, limit);
+	return nbytes;
+}
+
+/* Show the "max" control file: the pin limit, or "max" when unlimited. */
+static int pins_max_show(struct seq_file *sf, void *v)
+{
+	struct pins_cgroup *pins = css_pins(seq_css(sf));
+	uint64_t limit = atomic64_read(&pins->limit);
+
+	if (limit >= PINS_MAX)
+		seq_printf(sf, "%s\n", PINS_MAX_STR);
+	else
+		seq_printf(sf, "%llu\n", limit);	/* limit is unsigned */
+
+	return 0;
+}
+
+static s64 pins_current_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct pins_cgroup *pins = css_pins(css);
+
+ return atomic64_read(&pins->counter);
+}
+
+static int pins_events_show(struct seq_file *sf, void *v)
+{
+ struct pins_cgroup *pins = css_pins(seq_css(sf));
+
+ seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pins->events_limit));
+ return 0;
+}
+
+static struct cftype pins_files[] = {
+ {
+ .name = "max",
+ .write = pins_max_write,
+ .seq_show = pins_max_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "current",
+ .read_s64 = pins_current_read,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "events",
+ .seq_show = pins_events_show,
+ .file_offset = offsetof(struct pins_cgroup, events_file),
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys pins_cgrp_subsys = {
+ .css_alloc = pins_css_alloc,
+ .css_free = pins_css_free,
+ .legacy_cftypes = pins_files,
+ .dfl_cftypes = pins_files,
+ .can_attach = pins_can_attach,
+ .cancel_attach = pins_cancel_attach,
+};
--
git-series 0.9.1
^ permalink raw reply related [flat|nested] 108+ messages in thread* Re: [RFC PATCH 14/19] mm: Introduce a cgroup for pinned memory
2023-01-24 5:42 ` Alistair Popple
(?)
@ 2023-01-24 8:20 ` kernel test robot
-1 siblings, 0 replies; 108+ messages in thread
From: kernel test robot @ 2023-01-24 8:20 UTC (permalink / raw)
To: Alistair Popple; +Cc: oe-kbuild-all
Hi Alistair,
[FYI, it's a private test report for your RFC patch.]
[auto build test ERROR on 2241ab53cbb5cdb08a6b2d4688feb13971058f65]
url: https://github.com/intel-lab-lkp/linux/commits/Alistair-Popple/mm-Introduce-vm_account/20230124-135027
base: 2241ab53cbb5cdb08a6b2d4688feb13971058f65
patch link: https://lore.kernel.org/r/183372b80aac73e640d9f5ac3c742d505fc6c1f2.1674538665.git-series.apopple%40nvidia.com
patch subject: [RFC PATCH 14/19] mm: Introduce a cgroup for pinned memory
config: arc-defconfig (https://download.01.org/0day-ci/archive/20230124/202301241632.opMeQ3t6-lkp@intel.com/config)
compiler: arc-elf-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/4eba1da312a889b27469e42f20c216183d19cd4d
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Alistair-Popple/mm-Introduce-vm_account/20230124-135027
git checkout 4eba1da312a889b27469e42f20c216183d19cd4d
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=arc olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=arc SHELL=/bin/bash
If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
All error/warnings (new ones prefixed by >>):
>> mm/pins_cgroup.c:19:41: error: field 'css' has incomplete type
19 | struct cgroup_subsys_state css;
| ^~~
>> mm/pins_cgroup.c:24:41: error: field 'events_file' has incomplete type
24 | struct cgroup_file events_file;
| ^~~~~~~~~~~
In file included from include/linux/container_of.h:5,
from include/linux/kernel.h:21,
from mm/pins_cgroup.c:8:
mm/pins_cgroup.c: In function 'css_pins':
include/linux/compiler_types.h:299:27: error: expression in static assertion is not an integer
299 | #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
include/linux/build_bug.h:78:56: note: in definition of macro '__static_assert'
78 | #define __static_assert(expr, msg, ...) _Static_assert(expr, msg)
| ^~~~
include/linux/container_of.h:20:9: note: in expansion of macro 'static_assert'
20 | static_assert(__same_type(*(ptr), ((type *)0)->member) || \
| ^~~~~~~~~~~~~
include/linux/container_of.h:20:23: note: in expansion of macro '__same_type'
20 | static_assert(__same_type(*(ptr), ((type *)0)->member) || \
| ^~~~~~~~~~~
mm/pins_cgroup.c:30:16: note: in expansion of macro 'container_of'
30 | return container_of(css, struct pins_cgroup, css);
| ^~~~~~~~~~~~
mm/pins_cgroup.c: In function 'get_pins_cg':
>> mm/pins_cgroup.c:40:25: error: implicit declaration of function 'task_get_css' [-Werror=implicit-function-declaration]
40 | return css_pins(task_get_css(task, pins_cgrp_id));
| ^~~~~~~~~~~~
>> mm/pins_cgroup.c:40:44: error: 'pins_cgrp_id' undeclared (first use in this function); did you mean 'pins_cgroup'?
40 | return css_pins(task_get_css(task, pins_cgrp_id));
| ^~~~~~~~~~~~
| pins_cgroup
mm/pins_cgroup.c:40:44: note: each undeclared identifier is reported only once for each function it appears in
mm/pins_cgroup.c: At top level:
mm/pins_cgroup.c:76:6: warning: no previous prototype for 'pins_cancel' [-Wmissing-prototypes]
76 | void pins_cancel(struct pins_cgroup *pins, int num)
| ^~~~~~~~~~~
>> mm/pins_cgroup.c:146:35: warning: 'struct cgroup_taskset' declared inside parameter list will not be visible outside of this definition or declaration
146 | static int pins_can_attach(struct cgroup_taskset *tset)
| ^~~~~~~~~~~~~~
mm/pins_cgroup.c: In function 'pins_can_attach':
>> mm/pins_cgroup.c:151:9: error: implicit declaration of function 'cgroup_taskset_for_each'; did you mean 'cgroup_task_frozen'? [-Werror=implicit-function-declaration]
151 | cgroup_taskset_for_each(task, dst_css, tset) {
| ^~~~~~~~~~~~~~~~~~~~~~~
| cgroup_task_frozen
>> mm/pins_cgroup.c:151:53: error: expected ';' before '{' token
151 | cgroup_taskset_for_each(task, dst_css, tset) {
| ^~
| ;
mm/pins_cgroup.c:164:1: error: no return statement in function returning non-void [-Werror=return-type]
164 | }
| ^
mm/pins_cgroup.c: At top level:
mm/pins_cgroup.c:166:39: warning: 'struct cgroup_taskset' declared inside parameter list will not be visible outside of this definition or declaration
166 | static void pins_cancel_attach(struct cgroup_taskset *tset)
| ^~~~~~~~~~~~~~
mm/pins_cgroup.c: In function 'pins_cancel_attach':
mm/pins_cgroup.c:171:53: error: expected ';' before '{' token
171 | cgroup_taskset_for_each(task, dst_css, tset) {
| ^~
| ;
mm/pins_cgroup.c: In function 'pins_max_write':
>> mm/pins_cgroup.c:188:43: error: implicit declaration of function 'of_css' [-Werror=implicit-function-declaration]
188 | struct cgroup_subsys_state *css = of_css(of);
| ^~~~~~
>> mm/pins_cgroup.c:188:43: warning: initialization of 'struct cgroup_subsys_state *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
mm/pins_cgroup.c: In function 'pins_max_show':
>> mm/pins_cgroup.c:217:43: error: implicit declaration of function 'seq_css' [-Werror=implicit-function-declaration]
217 | struct cgroup_subsys_state *css = seq_css(sf);
| ^~~~~~~
mm/pins_cgroup.c:217:43: warning: initialization of 'struct cgroup_subsys_state *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
mm/pins_cgroup.c: At top level:
>> mm/pins_cgroup.c:230:37: warning: 'struct cftype' declared inside parameter list will not be visible outside of this definition or declaration
230 | struct cftype *cft)
| ^~~~~~
mm/pins_cgroup.c: In function 'pins_events_show':
>> mm/pins_cgroup.c:239:45: warning: passing argument 1 of 'css_pins' makes pointer from integer without a cast [-Wint-conversion]
239 | struct pins_cgroup *pins = css_pins(seq_css(sf));
| ^~~~~~~~~~~
| |
| int
mm/pins_cgroup.c:28:65: note: expected 'struct cgroup_subsys_state *' but argument is of type 'int'
28 | static struct pins_cgroup *css_pins(struct cgroup_subsys_state *css)
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~
mm/pins_cgroup.c: At top level:
>> mm/pins_cgroup.c:245:22: error: array type has incomplete element type 'struct cftype'
245 | static struct cftype pins_files[] = {
| ^~~~~~~~~~
>> mm/pins_cgroup.c:250:26: error: 'CFTYPE_NOT_ON_ROOT' undeclared here (not in a function)
250 | .flags = CFTYPE_NOT_ON_ROOT,
| ^~~~~~~~~~~~~~~~~~
>> mm/pins_cgroup.c:266:8: error: variable 'pins_cgrp_subsys' has initializer but incomplete type
266 | struct cgroup_subsys pins_cgrp_subsys = {
| ^~~~~~~~~~~~~
>> mm/pins_cgroup.c:267:10: error: 'struct cgroup_subsys' has no member named 'css_alloc'
267 | .css_alloc = pins_css_alloc,
| ^~~~~~~~~
>> mm/pins_cgroup.c:267:22: warning: excess elements in struct initializer
267 | .css_alloc = pins_css_alloc,
| ^~~~~~~~~~~~~~
mm/pins_cgroup.c:267:22: note: (near initialization for 'pins_cgrp_subsys')
>> mm/pins_cgroup.c:268:10: error: 'struct cgroup_subsys' has no member named 'css_free'
268 | .css_free = pins_css_free,
| ^~~~~~~~
mm/pins_cgroup.c:268:21: warning: excess elements in struct initializer
268 | .css_free = pins_css_free,
| ^~~~~~~~~~~~~
mm/pins_cgroup.c:268:21: note: (near initialization for 'pins_cgrp_subsys')
>> mm/pins_cgroup.c:269:10: error: 'struct cgroup_subsys' has no member named 'legacy_cftypes'
269 | .legacy_cftypes = pins_files,
| ^~~~~~~~~~~~~~
mm/pins_cgroup.c:269:27: warning: excess elements in struct initializer
269 | .legacy_cftypes = pins_files,
| ^~~~~~~~~~
mm/pins_cgroup.c:269:27: note: (near initialization for 'pins_cgrp_subsys')
>> mm/pins_cgroup.c:270:10: error: 'struct cgroup_subsys' has no member named 'dfl_cftypes'
270 | .dfl_cftypes = pins_files,
| ^~~~~~~~~~~
mm/pins_cgroup.c:270:24: warning: excess elements in struct initializer
270 | .dfl_cftypes = pins_files,
| ^~~~~~~~~~
mm/pins_cgroup.c:270:24: note: (near initialization for 'pins_cgrp_subsys')
mm/pins_cgroup.c:271:10: error: 'struct cgroup_subsys' has no member named 'can_attach'
271 | .can_attach = pins_can_attach,
| ^~~~~~~~~~
mm/pins_cgroup.c:271:23: warning: excess elements in struct initializer
271 | .can_attach = pins_can_attach,
| ^~~~~~~~~~~~~~~
mm/pins_cgroup.c:271:23: note: (near initialization for 'pins_cgrp_subsys')
mm/pins_cgroup.c:272:10: error: 'struct cgroup_subsys' has no member named 'cancel_attach'
272 | .cancel_attach = pins_cancel_attach,
| ^~~~~~~~~~~~~
mm/pins_cgroup.c:272:26: warning: excess elements in struct initializer
272 | .cancel_attach = pins_cancel_attach,
| ^~~~~~~~~~~~~~~~~~
mm/pins_cgroup.c:272:26: note: (near initialization for 'pins_cgrp_subsys')
mm/pins_cgroup.c:266:22: error: storage size of 'pins_cgrp_subsys' isn't known
266 | struct cgroup_subsys pins_cgrp_subsys = {
| ^~~~~~~~~~~~~~~~
mm/pins_cgroup.c: In function 'get_pins_cg':
mm/pins_cgroup.c:41:1: error: control reaches end of non-void function [-Werror=return-type]
41 | }
| ^
mm/pins_cgroup.c: In function 'parent_pins':
mm/pins_cgroup.c:36:1: error: control reaches end of non-void function [-Werror=return-type]
36 | }
| ^
mm/pins_cgroup.c: At top level:
mm/pins_cgroup.c:245:22: warning: 'pins_files' defined but not used [-Wunused-variable]
245 | static struct cftype pins_files[] = {
| ^~~~~~~~~~
mm/pins_cgroup.c:107:13: warning: 'pins_charge' defined but not used [-Wunused-function]
107 | static void pins_charge(struct pins_cgroup *pins, int num)
| ^~~~~~~~~~~
cc1: some warnings being treated as errors
vim +/css +19 mm/pins_cgroup.c
17
18 struct pins_cgroup {
> 19 struct cgroup_subsys_state css;
20
21 atomic64_t counter;
22 atomic64_t limit;
23
> 24 struct cgroup_file events_file;
25 atomic64_t events_limit;
26 };
27
28 static struct pins_cgroup *css_pins(struct cgroup_subsys_state *css)
29 {
30 return container_of(css, struct pins_cgroup, css);
31 }
32
33 static struct pins_cgroup *parent_pins(struct pins_cgroup *pins)
34 {
35 return css_pins(pins->css.parent);
36 }
37
38 struct pins_cgroup *get_pins_cg(struct task_struct *task)
39 {
> 40 return css_pins(task_get_css(task, pins_cgrp_id));
41 }
42
43 void put_pins_cg(struct pins_cgroup *cg)
44 {
45 css_put(&cg->css);
46 }
47
48 static struct cgroup_subsys_state *
49 pins_css_alloc(struct cgroup_subsys_state *parent)
50 {
51 struct pins_cgroup *pins;
52
53 pins = kzalloc(sizeof(struct pins_cgroup), GFP_KERNEL);
54 if (!pins)
55 return ERR_PTR(-ENOMEM);
56
57 atomic64_set(&pins->counter, 0);
58 atomic64_set(&pins->limit, PINS_MAX);
59 atomic64_set(&pins->events_limit, 0);
60 return &pins->css;
61 }
62
63 static void pins_css_free(struct cgroup_subsys_state *css)
64 {
65 kfree(css_pins(css));
66 }
67
68 /**
69 * pins_cancel - uncharge the local pin count
70 * @pins: the pin cgroup state
71 * @num: the number of pins to cancel
72 *
73 * This function will WARN if the pin count goes under 0, because such a case is
74 * a bug in the pins controller proper.
75 */
76 void pins_cancel(struct pins_cgroup *pins, int num)
77 {
78 /*
79 * A negative count (or overflow for that matter) is invalid,
80 * and indicates a bug in the `pins` controller proper.
81 */
82 WARN_ON_ONCE(atomic64_add_negative(-num, &pins->counter));
83 }
84
85 /**
86 * pins_uncharge - hierarchically uncharge the pin count
87 * @pins: the pin cgroup state
88 * @num: the number of pins to uncharge
89 */
90 void pins_uncharge(struct pins_cgroup *pins, int num)
91 {
92 struct pins_cgroup *p;
93
94 for (p = pins; parent_pins(p); p = parent_pins(p))
95 pins_cancel(p, num);
96 }
97
98 /**
99 * pins_charge - hierarchically charge the pin count
100 * @pins: the pin cgroup state
101 * @num: the number of pins to charge
102 *
103 * This function does *not* follow the pin limit set. It cannot fail and the new
104 * pin count may exceed the limit. This is only used for reverting failed
105 * attaches, where there is no other way out than violating the limit.
106 */
107 static void pins_charge(struct pins_cgroup *pins, int num)
108 {
109 struct pins_cgroup *p;
110
111 for (p = pins; parent_pins(p); p = parent_pins(p))
112 atomic64_add(num, &p->counter);
113 }
114
115 /**
116 * pins_try_charge - hierarchically try to charge the pin count
117 * @pins: the pin cgroup state
118 * @num: the number of pins to charge
119 *
120 * This function follows the set limit. It will fail if the charge would cause
121 * the new value to exceed the hierarchical limit. Returns 0 if the charge
122 * succeeded, otherwise -EAGAIN.
123 */
124 int pins_try_charge(struct pins_cgroup *pins, int num)
125 {
126 struct pins_cgroup *p, *q;
127
128 for (p = pins; parent_pins(p); p = parent_pins(p)) {
129 uint64_t new = atomic64_add_return(num, &p->counter);
130 uint64_t limit = atomic64_read(&p->limit);
131
132 if (limit != PINS_MAX && new > limit)
133 goto revert;
134 }
135
136 return 0;
137
138 revert:
139 for (q = pins; q != p; q = parent_pins(q))
140 pins_cancel(q, num);
141 pins_cancel(p, num);
142
143 return -EAGAIN;
144 }
145
> 146 static int pins_can_attach(struct cgroup_taskset *tset)
147 {
148 struct cgroup_subsys_state *dst_css;
149 struct task_struct *task;
150
> 151 cgroup_taskset_for_each(task, dst_css, tset) {
152 struct pins_cgroup *pins = css_pins(dst_css);
153 struct cgroup_subsys_state *old_css;
154 struct pins_cgroup *old_pins;
155
156 old_css = task_css(task, pins_cgrp_id);
157 old_pins = css_pins(old_css);
158
159 pins_charge(pins, task->mm->locked_vm);
160 pins_uncharge(old_pins, task->mm->locked_vm);
161 }
162
163 return 0;
164 }
165
166 static void pins_cancel_attach(struct cgroup_taskset *tset)
167 {
168 struct cgroup_subsys_state *dst_css;
169 struct task_struct *task;
170
171 cgroup_taskset_for_each(task, dst_css, tset) {
172 struct pins_cgroup *pins = css_pins(dst_css);
173 struct cgroup_subsys_state *old_css;
174 struct pins_cgroup *old_pins;
175
176 old_css = task_css(task, pins_cgrp_id);
177 old_pins = css_pins(old_css);
178
179 pins_charge(old_pins, task->mm->locked_vm);
180 pins_uncharge(pins, task->mm->locked_vm);
181 }
182 }
183
184
185 static ssize_t pins_max_write(struct kernfs_open_file *of, char *buf,
186 size_t nbytes, loff_t off)
187 {
> 188 struct cgroup_subsys_state *css = of_css(of);
189 struct pins_cgroup *pins = css_pins(css);
190 uint64_t limit;
191 int err;
192
193 buf = strstrip(buf);
194 if (!strcmp(buf, PINS_MAX_STR)) {
195 limit = PINS_MAX;
196 goto set_limit;
197 }
198
199 err = kstrtoll(buf, 0, &limit);
200 if (err)
201 return err;
202
203 if (limit < 0 || limit >= PINS_MAX)
204 return -EINVAL;
205
206 set_limit:
207 /*
208 * Limit updates don't need to be mutex'd, since it isn't
209 * critical that any racing fork()s follow the new limit.
210 */
211 atomic64_set(&pins->limit, limit);
212 return nbytes;
213 }
214
215 static int pins_max_show(struct seq_file *sf, void *v)
216 {
> 217 struct cgroup_subsys_state *css = seq_css(sf);
218 struct pins_cgroup *pins = css_pins(css);
219 uint64_t limit = atomic64_read(&pins->limit);
220
221 if (limit >= PINS_MAX)
222 seq_printf(sf, "%s\n", PINS_MAX_STR);
223 else
224 seq_printf(sf, "%lld\n", limit);
225
226 return 0;
227 }
228
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests
^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [RFC PATCH 14/19] mm: Introduce a cgroup for pinned memory
2023-01-24 5:42 ` Alistair Popple
(?)
(?)
@ 2023-01-24 15:00 ` kernel test robot
-1 siblings, 0 replies; 108+ messages in thread
From: kernel test robot @ 2023-01-24 15:00 UTC (permalink / raw)
To: Alistair Popple; +Cc: llvm, oe-kbuild-all
Hi Alistair,
[FYI, it's a private test report for your RFC patch.]
[auto build test ERROR on 2241ab53cbb5cdb08a6b2d4688feb13971058f65]
url: https://github.com/intel-lab-lkp/linux/commits/Alistair-Popple/mm-Introduce-vm_account/20230124-135027
base: 2241ab53cbb5cdb08a6b2d4688feb13971058f65
patch link: https://lore.kernel.org/r/183372b80aac73e640d9f5ac3c742d505fc6c1f2.1674538665.git-series.apopple%40nvidia.com
patch subject: [RFC PATCH 14/19] mm: Introduce a cgroup for pinned memory
config: hexagon-randconfig-r041-20230123 (https://download.01.org/0day-ci/archive/20230124/202301242242.dYYk1wqy-lkp@intel.com/config)
compiler: clang version 16.0.0 (https://github.com/llvm/llvm-project 4196ca3278f78c6e19246e54ab0ecb364e37d66a)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/4eba1da312a889b27469e42f20c216183d19cd4d
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Alistair-Popple/mm-Introduce-vm_account/20230124-135027
git checkout 4eba1da312a889b27469e42f20c216183d19cd4d
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=hexagon olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=hexagon SHELL=/bin/bash
If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
All error/warnings (new ones prefixed by >>):
In file included from mm/pins_cgroup.c:11:
In file included from include/linux/cgroup.h:26:
In file included from include/linux/kernel_stat.h:9:
In file included from include/linux/interrupt.h:11:
In file included from include/linux/hardirq.h:11:
In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
In file included from include/asm-generic/hardirq.h:17:
In file included from include/linux/irq.h:20:
In file included from include/linux/io.h:13:
In file included from arch/hexagon/include/asm/io.h:334:
include/asm-generic/io.h:547:31: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
val = __raw_readb(PCI_IOBASE + addr);
~~~~~~~~~~ ^
include/asm-generic/io.h:560:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
val = __le16_to_cpu((__le16 __force)__raw_readw(PCI_IOBASE + addr));
~~~~~~~~~~ ^
include/uapi/linux/byteorder/little_endian.h:37:51: note: expanded from macro '__le16_to_cpu'
#define __le16_to_cpu(x) ((__force __u16)(__le16)(x))
^
In file included from mm/pins_cgroup.c:11:
In file included from include/linux/cgroup.h:26:
In file included from include/linux/kernel_stat.h:9:
In file included from include/linux/interrupt.h:11:
In file included from include/linux/hardirq.h:11:
In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
In file included from include/asm-generic/hardirq.h:17:
In file included from include/linux/irq.h:20:
In file included from include/linux/io.h:13:
In file included from arch/hexagon/include/asm/io.h:334:
include/asm-generic/io.h:573:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + addr));
~~~~~~~~~~ ^
include/uapi/linux/byteorder/little_endian.h:35:51: note: expanded from macro '__le32_to_cpu'
#define __le32_to_cpu(x) ((__force __u32)(__le32)(x))
^
In file included from mm/pins_cgroup.c:11:
In file included from include/linux/cgroup.h:26:
In file included from include/linux/kernel_stat.h:9:
In file included from include/linux/interrupt.h:11:
In file included from include/linux/hardirq.h:11:
In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
In file included from include/asm-generic/hardirq.h:17:
In file included from include/linux/irq.h:20:
In file included from include/linux/io.h:13:
In file included from arch/hexagon/include/asm/io.h:334:
include/asm-generic/io.h:584:33: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
__raw_writeb(value, PCI_IOBASE + addr);
~~~~~~~~~~ ^
include/asm-generic/io.h:594:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
__raw_writew((u16 __force)cpu_to_le16(value), PCI_IOBASE + addr);
~~~~~~~~~~ ^
include/asm-generic/io.h:604:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
__raw_writel((u32 __force)cpu_to_le32(value), PCI_IOBASE + addr);
~~~~~~~~~~ ^
>> mm/pins_cgroup.c:19:29: error: field has incomplete type 'struct cgroup_subsys_state'
struct cgroup_subsys_state css;
^
include/linux/kthread.h:218:8: note: forward declaration of 'struct cgroup_subsys_state'
struct cgroup_subsys_state;
^
>> mm/pins_cgroup.c:24:22: error: field has incomplete type 'struct cgroup_file'
struct cgroup_file events_file;
^
mm/pins_cgroup.c:24:9: note: forward declaration of 'struct cgroup_file'
struct cgroup_file events_file;
^
>> mm/pins_cgroup.c:40:18: error: call to undeclared function 'task_get_css'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
return css_pins(task_get_css(task, pins_cgrp_id));
^
>> mm/pins_cgroup.c:40:37: error: use of undeclared identifier 'pins_cgrp_id'
return css_pins(task_get_css(task, pins_cgrp_id));
^
mm/pins_cgroup.c:76:6: warning: no previous prototype for function 'pins_cancel' [-Wmissing-prototypes]
void pins_cancel(struct pins_cgroup *pins, int num)
^
mm/pins_cgroup.c:76:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
void pins_cancel(struct pins_cgroup *pins, int num)
^
static
>> mm/pins_cgroup.c:146:35: warning: declaration of 'struct cgroup_taskset' will not be visible outside of this function [-Wvisibility]
static int pins_can_attach(struct cgroup_taskset *tset)
^
>> mm/pins_cgroup.c:151:2: error: call to undeclared function 'cgroup_taskset_for_each'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
cgroup_taskset_for_each(task, dst_css, tset) {
^
>> mm/pins_cgroup.c:151:46: error: expected ';' after expression
cgroup_taskset_for_each(task, dst_css, tset) {
^
;
>> mm/pins_cgroup.c:156:13: error: call to undeclared function 'task_css'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
old_css = task_css(task, pins_cgrp_id);
^
mm/pins_cgroup.c:156:13: note: did you mean 'task_cpu'?
include/linux/sched.h:2231:28: note: 'task_cpu' declared here
static inline unsigned int task_cpu(const struct task_struct *p)
^
mm/pins_cgroup.c:156:28: error: use of undeclared identifier 'pins_cgrp_id'
old_css = task_css(task, pins_cgrp_id);
^
mm/pins_cgroup.c:166:39: warning: declaration of 'struct cgroup_taskset' will not be visible outside of this function [-Wvisibility]
static void pins_cancel_attach(struct cgroup_taskset *tset)
^
mm/pins_cgroup.c:171:2: error: call to undeclared function 'cgroup_taskset_for_each'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
cgroup_taskset_for_each(task, dst_css, tset) {
^
mm/pins_cgroup.c:171:46: error: expected ';' after expression
cgroup_taskset_for_each(task, dst_css, tset) {
^
;
mm/pins_cgroup.c:176:13: error: call to undeclared function 'task_css'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
old_css = task_css(task, pins_cgrp_id);
^
mm/pins_cgroup.c:176:28: error: use of undeclared identifier 'pins_cgrp_id'
old_css = task_css(task, pins_cgrp_id);
^
>> mm/pins_cgroup.c:188:36: error: call to undeclared function 'of_css'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
struct cgroup_subsys_state *css = of_css(of);
^
>> mm/pins_cgroup.c:188:30: error: incompatible integer to pointer conversion initializing 'struct cgroup_subsys_state *' with an expression of type 'int' [-Wint-conversion]
struct cgroup_subsys_state *css = of_css(of);
^ ~~~~~~~~~~
>> mm/pins_cgroup.c:217:36: error: call to undeclared function 'seq_css'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
struct cgroup_subsys_state *css = seq_css(sf);
^
mm/pins_cgroup.c:217:30: error: incompatible integer to pointer conversion initializing 'struct cgroup_subsys_state *' with an expression of type 'int' [-Wint-conversion]
struct cgroup_subsys_state *css = seq_css(sf);
^ ~~~~~~~~~~~
>> mm/pins_cgroup.c:230:16: warning: declaration of 'struct cftype' will not be visible outside of this function [-Wvisibility]
struct cftype *cft)
^
mm/pins_cgroup.c:239:38: error: call to undeclared function 'seq_css'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
struct pins_cgroup *pins = css_pins(seq_css(sf));
^
>> mm/pins_cgroup.c:239:38: error: incompatible integer to pointer conversion passing 'int' to parameter of type 'struct cgroup_subsys_state *' [-Wint-conversion]
struct pins_cgroup *pins = css_pins(seq_css(sf));
^~~~~~~~~~~
mm/pins_cgroup.c:28:65: note: passing argument to parameter 'css' here
static struct pins_cgroup *css_pins(struct cgroup_subsys_state *css)
^
>> mm/pins_cgroup.c:245:32: error: array has incomplete element type 'struct cftype'
static struct cftype pins_files[] = {
^
mm/pins_cgroup.c:245:15: note: forward declaration of 'struct cftype'
static struct cftype pins_files[] = {
^
fatal error: too many errors emitted, stopping now [-ferror-limit=]
10 warnings and 20 errors generated.
vim +19 mm/pins_cgroup.c
17
18 struct pins_cgroup {
> 19 struct cgroup_subsys_state css;
20
21 atomic64_t counter;
22 atomic64_t limit;
23
> 24 struct cgroup_file events_file;
25 atomic64_t events_limit;
26 };
27
28 static struct pins_cgroup *css_pins(struct cgroup_subsys_state *css)
29 {
30 return container_of(css, struct pins_cgroup, css);
31 }
32
33 static struct pins_cgroup *parent_pins(struct pins_cgroup *pins)
34 {
35 return css_pins(pins->css.parent);
36 }
37
38 struct pins_cgroup *get_pins_cg(struct task_struct *task)
39 {
> 40 return css_pins(task_get_css(task, pins_cgrp_id));
41 }
42
43 void put_pins_cg(struct pins_cgroup *cg)
44 {
45 css_put(&cg->css);
46 }
47
48 static struct cgroup_subsys_state *
49 pins_css_alloc(struct cgroup_subsys_state *parent)
50 {
51 struct pins_cgroup *pins;
52
53 pins = kzalloc(sizeof(struct pins_cgroup), GFP_KERNEL);
54 if (!pins)
55 return ERR_PTR(-ENOMEM);
56
57 atomic64_set(&pins->counter, 0);
58 atomic64_set(&pins->limit, PINS_MAX);
59 atomic64_set(&pins->events_limit, 0);
60 return &pins->css;
61 }
62
63 static void pins_css_free(struct cgroup_subsys_state *css)
64 {
65 kfree(css_pins(css));
66 }
67
68 /**
69 * pins_cancel - uncharge the local pin count
70 * @pins: the pin cgroup state
71 * @num: the number of pins to cancel
72 *
73 * This function will WARN if the pin count goes under 0, because such a case is
74 * a bug in the pins controller proper.
75 */
76 void pins_cancel(struct pins_cgroup *pins, int num)
77 {
78 /*
79 * A negative count (or overflow for that matter) is invalid,
80 * and indicates a bug in the `pins` controller proper.
81 */
82 WARN_ON_ONCE(atomic64_add_negative(-num, &pins->counter));
83 }
84
85 /**
86 * pins_uncharge - hierarchically uncharge the pin count
87 * @pins: the pin cgroup state
88 * @num: the number of pins to uncharge
89 */
90 void pins_uncharge(struct pins_cgroup *pins, int num)
91 {
92 struct pins_cgroup *p;
93
94 for (p = pins; parent_pins(p); p = parent_pins(p))
95 pins_cancel(p, num);
96 }
97
98 /**
99 * pins_charge - hierarchically charge the pin count
100 * @pins: the pin cgroup state
101 * @num: the number of pins to charge
102 *
103 * This function does *not* follow the pin limit set. It cannot fail and the new
104 * pin count may exceed the limit. This is only used for reverting failed
105 * attaches, where there is no other way out than violating the limit.
106 */
107 static void pins_charge(struct pins_cgroup *pins, int num)
108 {
109 struct pins_cgroup *p;
110
111 for (p = pins; parent_pins(p); p = parent_pins(p))
112 atomic64_add(num, &p->counter);
113 }
114
115 /**
116 * pins_try_charge - hierarchically try to charge the pin count
117 * @pins: the pin cgroup state
118 * @num: the number of pins to charge
119 *
120 * This function follows the set limit. It will fail if the charge would cause
121 * the new value to exceed the hierarchical limit. Returns 0 if the charge
122 * succeeded, otherwise -EAGAIN.
123 */
124 int pins_try_charge(struct pins_cgroup *pins, int num)
125 {
126 struct pins_cgroup *p, *q;
127
128 for (p = pins; parent_pins(p); p = parent_pins(p)) {
129 uint64_t new = atomic64_add_return(num, &p->counter);
130 uint64_t limit = atomic64_read(&p->limit);
131
132 if (limit != PINS_MAX && new > limit)
133 goto revert;
134 }
135
136 return 0;
137
138 revert:
139 for (q = pins; q != p; q = parent_pins(q))
140 pins_cancel(q, num);
141 pins_cancel(p, num);
142
143 return -EAGAIN;
144 }
145
> 146 static int pins_can_attach(struct cgroup_taskset *tset)
147 {
148 struct cgroup_subsys_state *dst_css;
149 struct task_struct *task;
150
> 151 cgroup_taskset_for_each(task, dst_css, tset) {
152 struct pins_cgroup *pins = css_pins(dst_css);
153 struct cgroup_subsys_state *old_css;
154 struct pins_cgroup *old_pins;
155
> 156 old_css = task_css(task, pins_cgrp_id);
157 old_pins = css_pins(old_css);
158
159 pins_charge(pins, task->mm->locked_vm);
160 pins_uncharge(old_pins, task->mm->locked_vm);
161 }
162
163 return 0;
164 }
165
166 static void pins_cancel_attach(struct cgroup_taskset *tset)
167 {
168 struct cgroup_subsys_state *dst_css;
169 struct task_struct *task;
170
171 cgroup_taskset_for_each(task, dst_css, tset) {
172 struct pins_cgroup *pins = css_pins(dst_css);
173 struct cgroup_subsys_state *old_css;
174 struct pins_cgroup *old_pins;
175
176 old_css = task_css(task, pins_cgrp_id);
177 old_pins = css_pins(old_css);
178
179 pins_charge(old_pins, task->mm->locked_vm);
180 pins_uncharge(pins, task->mm->locked_vm);
181 }
182 }
183
184
185 static ssize_t pins_max_write(struct kernfs_open_file *of, char *buf,
186 size_t nbytes, loff_t off)
187 {
> 188 struct cgroup_subsys_state *css = of_css(of);
189 struct pins_cgroup *pins = css_pins(css);
190 uint64_t limit;
191 int err;
192
193 buf = strstrip(buf);
194 if (!strcmp(buf, PINS_MAX_STR)) {
195 limit = PINS_MAX;
196 goto set_limit;
197 }
198
199 err = kstrtoll(buf, 0, &limit);
200 if (err)
201 return err;
202
203 if (limit < 0 || limit >= PINS_MAX)
204 return -EINVAL;
205
206 set_limit:
207 /*
208 * Limit updates don't need to be mutex'd, since it isn't
209 * critical that any racing fork()s follow the new limit.
210 */
211 atomic64_set(&pins->limit, limit);
212 return nbytes;
213 }
214
215 static int pins_max_show(struct seq_file *sf, void *v)
216 {
> 217 struct cgroup_subsys_state *css = seq_css(sf);
218 struct pins_cgroup *pins = css_pins(css);
219 uint64_t limit = atomic64_read(&pins->limit);
220
221 if (limit >= PINS_MAX)
222 seq_printf(sf, "%s\n", PINS_MAX_STR);
223 else
224 seq_printf(sf, "%lld\n", limit);
225
226 return 0;
227 }
228
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests
^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [RFC PATCH 14/19] mm: Introduce a cgroup for pinned memory
2023-01-24 5:42 ` Alistair Popple
` (2 preceding siblings ...)
(?)
@ 2023-01-24 15:41 ` kernel test robot
-1 siblings, 0 replies; 108+ messages in thread
From: kernel test robot @ 2023-01-24 15:41 UTC (permalink / raw)
To: Alistair Popple; +Cc: llvm, oe-kbuild-all
Hi Alistair,
[FYI, it's a private test report for your RFC patch.]
[auto build test ERROR on 2241ab53cbb5cdb08a6b2d4688feb13971058f65]
url: https://github.com/intel-lab-lkp/linux/commits/Alistair-Popple/mm-Introduce-vm_account/20230124-135027
base: 2241ab53cbb5cdb08a6b2d4688feb13971058f65
patch link: https://lore.kernel.org/r/183372b80aac73e640d9f5ac3c742d505fc6c1f2.1674538665.git-series.apopple%40nvidia.com
patch subject: [RFC PATCH 14/19] mm: Introduce a cgroup for pinned memory
config: arm-randconfig-r036-20230123 (https://download.01.org/0day-ci/archive/20230124/202301242339.dF4Cw6vY-lkp@intel.com/config)
compiler: clang version 16.0.0 (https://github.com/llvm/llvm-project 4196ca3278f78c6e19246e54ab0ecb364e37d66a)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# install arm cross compiling tool for clang build
# apt-get install binutils-arm-linux-gnueabi
# https://github.com/intel-lab-lkp/linux/commit/4eba1da312a889b27469e42f20c216183d19cd4d
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Alistair-Popple/mm-Introduce-vm_account/20230124-135027
git checkout 4eba1da312a889b27469e42f20c216183d19cd4d
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=arm olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=arm SHELL=/bin/bash
If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
All error/warnings (new ones prefixed by >>):
>> mm/pins_cgroup.c:19:29: error: field has incomplete type 'struct cgroup_subsys_state'
struct cgroup_subsys_state css;
^
include/linux/kthread.h:218:8: note: forward declaration of 'struct cgroup_subsys_state'
struct cgroup_subsys_state;
^
>> mm/pins_cgroup.c:24:22: error: field has incomplete type 'struct cgroup_file'
struct cgroup_file events_file;
^
mm/pins_cgroup.c:24:9: note: forward declaration of 'struct cgroup_file'
struct cgroup_file events_file;
^
>> mm/pins_cgroup.c:40:18: error: call to undeclared function 'task_get_css'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
return css_pins(task_get_css(task, pins_cgrp_id));
^
>> mm/pins_cgroup.c:40:37: error: use of undeclared identifier 'pins_cgrp_id'
return css_pins(task_get_css(task, pins_cgrp_id));
^
>> mm/pins_cgroup.c:76:6: warning: no previous prototype for function 'pins_cancel' [-Wmissing-prototypes]
void pins_cancel(struct pins_cgroup *pins, int num)
^
mm/pins_cgroup.c:76:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
void pins_cancel(struct pins_cgroup *pins, int num)
^
static
>> mm/pins_cgroup.c:146:35: warning: declaration of 'struct cgroup_taskset' will not be visible outside of this function [-Wvisibility]
static int pins_can_attach(struct cgroup_taskset *tset)
^
>> mm/pins_cgroup.c:151:2: error: call to undeclared function 'cgroup_taskset_for_each'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
cgroup_taskset_for_each(task, dst_css, tset) {
^
>> mm/pins_cgroup.c:151:46: error: expected ';' after expression
cgroup_taskset_for_each(task, dst_css, tset) {
^
;
>> mm/pins_cgroup.c:156:13: error: call to undeclared function 'task_css'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
old_css = task_css(task, pins_cgrp_id);
^
mm/pins_cgroup.c:156:13: note: did you mean 'task_cpu'?
include/linux/sched.h:2240:28: note: 'task_cpu' declared here
static inline unsigned int task_cpu(const struct task_struct *p)
^
mm/pins_cgroup.c:156:28: error: use of undeclared identifier 'pins_cgrp_id'
old_css = task_css(task, pins_cgrp_id);
^
mm/pins_cgroup.c:166:39: warning: declaration of 'struct cgroup_taskset' will not be visible outside of this function [-Wvisibility]
static void pins_cancel_attach(struct cgroup_taskset *tset)
^
mm/pins_cgroup.c:171:2: error: call to undeclared function 'cgroup_taskset_for_each'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
cgroup_taskset_for_each(task, dst_css, tset) {
^
mm/pins_cgroup.c:171:46: error: expected ';' after expression
cgroup_taskset_for_each(task, dst_css, tset) {
^
;
mm/pins_cgroup.c:176:13: error: call to undeclared function 'task_css'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
old_css = task_css(task, pins_cgrp_id);
^
mm/pins_cgroup.c:176:28: error: use of undeclared identifier 'pins_cgrp_id'
old_css = task_css(task, pins_cgrp_id);
^
>> mm/pins_cgroup.c:188:36: error: call to undeclared function 'of_css'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
struct cgroup_subsys_state *css = of_css(of);
^
>> mm/pins_cgroup.c:188:30: error: incompatible integer to pointer conversion initializing 'struct cgroup_subsys_state *' with an expression of type 'int' [-Wint-conversion]
struct cgroup_subsys_state *css = of_css(of);
^ ~~~~~~~~~~
>> mm/pins_cgroup.c:217:36: error: call to undeclared function 'seq_css'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
struct cgroup_subsys_state *css = seq_css(sf);
^
mm/pins_cgroup.c:217:30: error: incompatible integer to pointer conversion initializing 'struct cgroup_subsys_state *' with an expression of type 'int' [-Wint-conversion]
struct cgroup_subsys_state *css = seq_css(sf);
^ ~~~~~~~~~~~
>> mm/pins_cgroup.c:230:16: warning: declaration of 'struct cftype' will not be visible outside of this function [-Wvisibility]
struct cftype *cft)
^
mm/pins_cgroup.c:239:38: error: call to undeclared function 'seq_css'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
struct pins_cgroup *pins = css_pins(seq_css(sf));
^
>> mm/pins_cgroup.c:239:38: error: incompatible integer to pointer conversion passing 'int' to parameter of type 'struct cgroup_subsys_state *' [-Wint-conversion]
struct pins_cgroup *pins = css_pins(seq_css(sf));
^~~~~~~~~~~
mm/pins_cgroup.c:28:65: note: passing argument to parameter 'css' here
static struct pins_cgroup *css_pins(struct cgroup_subsys_state *css)
^
>> mm/pins_cgroup.c:245:32: error: array has incomplete element type 'struct cftype'
static struct cftype pins_files[] = {
^
mm/pins_cgroup.c:245:15: note: forward declaration of 'struct cftype'
static struct cftype pins_files[] = {
^
fatal error: too many errors emitted, stopping now [-ferror-limit=]
4 warnings and 20 errors generated.
vim +19 mm/pins_cgroup.c
17
18 struct pins_cgroup {
> 19 struct cgroup_subsys_state css;
20
21 atomic64_t counter;
22 atomic64_t limit;
23
> 24 struct cgroup_file events_file;
25 atomic64_t events_limit;
26 };
27
28 static struct pins_cgroup *css_pins(struct cgroup_subsys_state *css)
29 {
30 return container_of(css, struct pins_cgroup, css);
31 }
32
33 static struct pins_cgroup *parent_pins(struct pins_cgroup *pins)
34 {
35 return css_pins(pins->css.parent);
36 }
37
38 struct pins_cgroup *get_pins_cg(struct task_struct *task)
39 {
> 40 return css_pins(task_get_css(task, pins_cgrp_id));
41 }
42
43 void put_pins_cg(struct pins_cgroup *cg)
44 {
45 css_put(&cg->css);
46 }
47
48 static struct cgroup_subsys_state *
49 pins_css_alloc(struct cgroup_subsys_state *parent)
50 {
51 struct pins_cgroup *pins;
52
53 pins = kzalloc(sizeof(struct pins_cgroup), GFP_KERNEL);
54 if (!pins)
55 return ERR_PTR(-ENOMEM);
56
57 atomic64_set(&pins->counter, 0);
58 atomic64_set(&pins->limit, PINS_MAX);
59 atomic64_set(&pins->events_limit, 0);
60 return &pins->css;
61 }
62
63 static void pins_css_free(struct cgroup_subsys_state *css)
64 {
65 kfree(css_pins(css));
66 }
67
68 /**
69 * pins_cancel - uncharge the local pin count
70 * @pins: the pin cgroup state
71 * @num: the number of pins to cancel
72 *
73 * This function will WARN if the pin count goes under 0, because such a case is
74 * a bug in the pins controller proper.
75 */
> 76 void pins_cancel(struct pins_cgroup *pins, int num)
77 {
78 /*
79 * A negative count (or overflow for that matter) is invalid,
80 * and indicates a bug in the `pins` controller proper.
81 */
82 WARN_ON_ONCE(atomic64_add_negative(-num, &pins->counter));
83 }
84
85 /**
86 * pins_uncharge - hierarchically uncharge the pin count
87 * @pins: the pin cgroup state
88 * @num: the number of pins to uncharge
89 */
90 void pins_uncharge(struct pins_cgroup *pins, int num)
91 {
92 struct pins_cgroup *p;
93
94 for (p = pins; parent_pins(p); p = parent_pins(p))
95 pins_cancel(p, num);
96 }
97
98 /**
99 * pins_charge - hierarchically charge the pin count
100 * @pins: the pin cgroup state
101 * @num: the number of pins to charge
102 *
103 * This function does *not* follow the pin limit set. It cannot fail and the new
104 * pin count may exceed the limit. This is only used for reverting failed
105 * attaches, where there is no other way out than violating the limit.
106 */
107 static void pins_charge(struct pins_cgroup *pins, int num)
108 {
109 struct pins_cgroup *p;
110
111 for (p = pins; parent_pins(p); p = parent_pins(p))
112 atomic64_add(num, &p->counter);
113 }
114
115 /**
116 * pins_try_charge - hierarchically try to charge the pin count
117 * @pins: the pin cgroup state
118 * @num: the number of pins to charge
119 *
120 * This function follows the set limit. It will fail if the charge would cause
121 * the new value to exceed the hierarchical limit. Returns 0 if the charge
122 * succeeded, otherwise -EAGAIN.
123 */
124 int pins_try_charge(struct pins_cgroup *pins, int num)
125 {
126 struct pins_cgroup *p, *q;
127
128 for (p = pins; parent_pins(p); p = parent_pins(p)) {
129 uint64_t new = atomic64_add_return(num, &p->counter);
130 uint64_t limit = atomic64_read(&p->limit);
131
132 if (limit != PINS_MAX && new > limit)
133 goto revert;
134 }
135
136 return 0;
137
138 revert:
139 for (q = pins; q != p; q = parent_pins(q))
140 pins_cancel(q, num);
141 pins_cancel(p, num);
142
143 return -EAGAIN;
144 }
145
> 146 static int pins_can_attach(struct cgroup_taskset *tset)
147 {
148 struct cgroup_subsys_state *dst_css;
149 struct task_struct *task;
150
> 151 cgroup_taskset_for_each(task, dst_css, tset) {
152 struct pins_cgroup *pins = css_pins(dst_css);
153 struct cgroup_subsys_state *old_css;
154 struct pins_cgroup *old_pins;
155
> 156 old_css = task_css(task, pins_cgrp_id);
157 old_pins = css_pins(old_css);
158
159 pins_charge(pins, task->mm->locked_vm);
160 pins_uncharge(old_pins, task->mm->locked_vm);
161 }
162
163 return 0;
164 }
165
166 static void pins_cancel_attach(struct cgroup_taskset *tset)
167 {
168 struct cgroup_subsys_state *dst_css;
169 struct task_struct *task;
170
171 cgroup_taskset_for_each(task, dst_css, tset) {
172 struct pins_cgroup *pins = css_pins(dst_css);
173 struct cgroup_subsys_state *old_css;
174 struct pins_cgroup *old_pins;
175
176 old_css = task_css(task, pins_cgrp_id);
177 old_pins = css_pins(old_css);
178
179 pins_charge(old_pins, task->mm->locked_vm);
180 pins_uncharge(pins, task->mm->locked_vm);
181 }
182 }
183
184
185 static ssize_t pins_max_write(struct kernfs_open_file *of, char *buf,
186 size_t nbytes, loff_t off)
187 {
> 188 struct cgroup_subsys_state *css = of_css(of);
189 struct pins_cgroup *pins = css_pins(css);
190 uint64_t limit;
191 int err;
192
193 buf = strstrip(buf);
194 if (!strcmp(buf, PINS_MAX_STR)) {
195 limit = PINS_MAX;
196 goto set_limit;
197 }
198
199 err = kstrtoll(buf, 0, &limit);
200 if (err)
201 return err;
202
203 if (limit < 0 || limit >= PINS_MAX)
204 return -EINVAL;
205
206 set_limit:
207 /*
208 * Limit updates don't need to be mutex'd, since it isn't
209 * critical that any racing fork()s follow the new limit.
210 */
211 atomic64_set(&pins->limit, limit);
212 return nbytes;
213 }
214
215 static int pins_max_show(struct seq_file *sf, void *v)
216 {
> 217 struct cgroup_subsys_state *css = seq_css(sf);
218 struct pins_cgroup *pins = css_pins(css);
219 uint64_t limit = atomic64_read(&pins->limit);
220
221 if (limit >= PINS_MAX)
222 seq_printf(sf, "%s\n", PINS_MAX_STR);
223 else
224 seq_printf(sf, "%lld\n", limit);
225
226 return 0;
227 }
228
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests
^ permalink raw reply [flat|nested] 108+ messages in thread
[parent not found: <183372b80aac73e640d9f5ac3c742d505fc6c1f2.1674538665.git-series.apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org>]
* Re: [RFC PATCH 14/19] mm: Introduce a cgroup for pinned memory
2023-01-24 5:42 ` Alistair Popple
@ 2023-01-27 21:44 ` Tejun Heo
-1 siblings, 0 replies; 108+ messages in thread
From: Tejun Heo @ 2023-01-27 21:44 UTC (permalink / raw)
To: Alistair Popple
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA, jgg-DDmLM1+adcrQT0dZR+AlfA,
jhubbard-DDmLM1+adcrQT0dZR+AlfA, tjmercier-hpIqsD4AKlfQT0dZR+AlfA,
hannes-druUgvl0LCNAfugRpC6u6w, surenb-hpIqsD4AKlfQT0dZR+AlfA,
mkoutny-IBi9RG/b67k, daniel-/w4YWyX8dFk, Zefan Li, Andrew Morton
On Tue, Jan 24, 2023 at 04:42:43PM +1100, Alistair Popple wrote:
> If too much memory in a system is pinned or locked it can lead to
> problems such as performance degradation or in the worst case
> out-of-memory errors as such memory cannot be moved or paged out.
>
> In order to prevent users without CAP_IPC_LOCK from causing these
> issues the amount of memory that can be pinned is typically limited by
> RLIMIT_MEMLOCK. However this is inflexible as limits can't be shared
> between tasks and the enforcement of these limits is inconsistent
> between in-kernel users of pinned memory such as mlock() and device
> drivers which may also pin pages with pin_user_pages().
>
> To allow for a single limit to be set introduce a cgroup controller
> which can be used to limit the number of pages being pinned by all
> tasks in the cgroup.
The use case makes some sense to me but I wonder whether this'd fit a lot
better in memcg rather than being its own controller.
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [RFC PATCH 14/19] mm: Introduce a cgroup for pinned memory
@ 2023-01-27 21:44 ` Tejun Heo
0 siblings, 0 replies; 108+ messages in thread
From: Tejun Heo @ 2023-01-27 21:44 UTC (permalink / raw)
To: Alistair Popple
Cc: linux-mm, cgroups, linux-kernel, jgg, jhubbard, tjmercier, hannes,
surenb, mkoutny, daniel, Zefan Li, Andrew Morton
On Tue, Jan 24, 2023 at 04:42:43PM +1100, Alistair Popple wrote:
> If too much memory in a system is pinned or locked it can lead to
> problems such as performance degradation or in the worst case
> out-of-memory errors as such memory cannot be moved or paged out.
>
> In order to prevent users without CAP_IPC_LOCK from causing these
> issues the amount of memory that can be pinned is typically limited by
> RLIMIT_MEMLOCK. However this is inflexible as limits can't be shared
> between tasks and the enforcement of these limits is inconsistent
> between in-kernel users of pinned memory such as mlock() and device
> drivers which may also pin pages with pin_user_pages().
>
> To allow for a single limit to be set introduce a cgroup controller
> which can be used to limit the number of pages being pinned by all
> tasks in the cgroup.
The use case makes some sense to me but I wonder whether this'd fit a lot
better in memcg rather than being its own controller.
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 108+ messages in thread
[parent not found: <Y9RFs+90TyzVMs83-NiLfg/pYEd1N0TnZuCh8vA@public.gmane.org>]
* Re: [RFC PATCH 14/19] mm: Introduce a cgroup for pinned memory
2023-01-27 21:44 ` Tejun Heo
@ 2023-01-30 13:20 ` Jason Gunthorpe
-1 siblings, 0 replies; 108+ messages in thread
From: Jason Gunthorpe @ 2023-01-30 13:20 UTC (permalink / raw)
To: Tejun Heo, Daniel P. Berrange, Alex Williamson
Cc: Alistair Popple, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
cgroups-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
jhubbard-DDmLM1+adcrQT0dZR+AlfA, tjmercier-hpIqsD4AKlfQT0dZR+AlfA,
hannes-druUgvl0LCNAfugRpC6u6w, surenb-hpIqsD4AKlfQT0dZR+AlfA,
mkoutny-IBi9RG/b67k, daniel-/w4YWyX8dFk, Zefan Li, Andrew Morton,
libvir-list-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org, Laine Stump
On Fri, Jan 27, 2023 at 11:44:19AM -1000, Tejun Heo wrote:
> On Tue, Jan 24, 2023 at 04:42:43PM +1100, Alistair Popple wrote:
> > If too much memory in a system is pinned or locked it can lead to
> > problems such as performance degradation or in the worst case
> > out-of-memory errors as such memory cannot be moved or paged out.
> >
> > In order to prevent users without CAP_IPC_LOCK from causing these
> > issues the amount of memory that can be pinned is typically limited by
> > RLIMIT_MEMLOCK. However this is inflexible as limits can't be shared
> > between tasks and the enforcement of these limits is inconsistent
> > between in-kernel users of pinned memory such as mlock() and device
> > drivers which may also pin pages with pin_user_pages().
> >
> > To allow for a single limit to be set introduce a cgroup controller
> > which can be used to limit the number of pages being pinned by all
> > tasks in the cgroup.
>
> The use case makes some sense to me but I wonder whether this'd fit a lot
> better in memcg rather than being its own controller.
As long as the pinned limitation has its own bucket it is probably
fine? The underlying memory allocations should have already been
charged to the memcg - so we don't want to double account.
Alex and Daniel were looking at this from the qemu/libvirt
perspective, perhaps they have some insight what they would like to
see?
Thanks,
Jason
^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [RFC PATCH 14/19] mm: Introduce a cgroup for pinned memory
@ 2023-01-30 13:20 ` Jason Gunthorpe
0 siblings, 0 replies; 108+ messages in thread
From: Jason Gunthorpe @ 2023-01-30 13:20 UTC (permalink / raw)
To: Tejun Heo, Daniel P. Berrange, Alex Williamson
Cc: Alistair Popple, linux-mm, cgroups, linux-kernel, jhubbard,
tjmercier, hannes, surenb, mkoutny, daniel, Zefan Li,
Andrew Morton, libvir-list@redhat.com, Laine Stump
On Fri, Jan 27, 2023 at 11:44:19AM -1000, Tejun Heo wrote:
> On Tue, Jan 24, 2023 at 04:42:43PM +1100, Alistair Popple wrote:
> > If too much memory in a system is pinned or locked it can lead to
> > problems such as performance degradation or in the worst case
> > out-of-memory errors as such memory cannot be moved or paged out.
> >
> > In order to prevent users without CAP_IPC_LOCK from causing these
> > issues the amount of memory that can be pinned is typically limited by
> > RLIMIT_MEMLOCK. However this is inflexible as limits can't be shared
> > between tasks and the enforcement of these limits is inconsistent
> > between in-kernel users of pinned memory such as mlock() and device
> > drivers which may also pin pages with pin_user_pages().
> >
> > To allow for a single limit to be set introduce a cgroup controller
> > which can be used to limit the number of pages being pinned by all
> > tasks in the cgroup.
>
> The use case makes some sense to me but I wonder whether this'd fit a lot
> better in memcg rather than being its own controller.
As long as the pinned limitation has its own bucket it is probably
fine? The underlying memory allocations should have already been
charged to the memcg - so we don't want to double account.
Alex and Daniel were looking at this from the qemu/libvirt
perspective, perhaps they have some insight what they would like to
see?
Thanks,
Jason
^ permalink raw reply [flat|nested] 108+ messages in thread
* [RFC PATCH 15/19] mm/util: Extend vm_account to charge pages against the pin cgroup
2023-01-24 5:42 ` Alistair Popple
@ 2023-01-24 5:42 ` Alistair Popple
-1 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-24 5:42 UTC (permalink / raw)
To: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, jgg-DDmLM1+adcrQT0dZR+AlfA,
jhubbard-DDmLM1+adcrQT0dZR+AlfA, tjmercier-hpIqsD4AKlfQT0dZR+AlfA,
hannes-druUgvl0LCNAfugRpC6u6w, surenb-hpIqsD4AKlfQT0dZR+AlfA,
mkoutny-IBi9RG/b67k, daniel-/w4YWyX8dFk, Alistair Popple
The vm_account_pinned() functions currently only account pages against
pinned_vm/locked_vm and enforce limits against RLIMIT_MEMLOCK. Extend
these to account pages and enforce limits using the pin count cgroup.
Accounting of pages will fail if either RLIMIT_MEMLOCK or the cgroup
limit is exceeded. Unlike rlimit enforcement, which can be bypassed if
the user has CAP_IPC_LOCK, cgroup limits cannot be bypassed.
Signed-off-by: Alistair Popple <apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org>
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org
---
include/linux/mm_types.h | 1 +
mm/util.c | 22 ++++++++++++++++++----
2 files changed, 19 insertions(+), 4 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7de2168..4adf8dc 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1116,6 +1116,7 @@ struct vm_account {
struct mm_struct *mm;
struct user_struct *user;
} a;
+ struct pins_cgroup *pins_cg;
enum vm_account_flags flags;
};
diff --git a/mm/util.c b/mm/util.c
index af40b1e..e5fb01a 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -442,6 +442,7 @@ void vm_account_init(struct vm_account *vm_account, struct task_struct *task,
vm_account->a.mm = task->mm;
}
+ vm_account->pins_cg = get_pins_cg(task);
vm_account->flags = flags;
}
EXPORT_SYMBOL_GPL(vm_account_init);
@@ -459,6 +460,7 @@ void vm_account_release(struct vm_account *vm_account)
free_uid(vm_account->a.user);
else
mmdrop(vm_account->a.mm);
+ put_pins_cg(vm_account->pins_cg);
}
EXPORT_SYMBOL_GPL(vm_account_release);
@@ -489,6 +491,15 @@ static int vm_account_cmpxchg(struct vm_account *vm_account,
}
}
+static void vm_unaccount_legacy(struct vm_account *vm_account,
+ unsigned long npages)
+{
+ if (vm_account->flags & VM_ACCOUNT_USER)
+ atomic_long_sub(npages, &vm_account->a.user->locked_vm);
+ else
+ atomic64_sub(npages, &vm_account->a.mm->pinned_vm);
+}
+
int vm_account_pinned(struct vm_account *vm_account, unsigned long npages)
{
unsigned long lock_limit = RLIM_INFINITY;
@@ -506,16 +517,19 @@ int vm_account_pinned(struct vm_account *vm_account, unsigned long npages)
return ret;
}
+ if (pins_try_charge(vm_account->pins_cg, npages)) {
+ vm_unaccount_legacy(vm_account, npages);
+ return -ENOMEM;
+ }
+
return 0;
}
EXPORT_SYMBOL_GPL(vm_account_pinned);
void vm_unaccount_pinned(struct vm_account *vm_account, unsigned long npages)
{
- if (vm_account->flags & VM_ACCOUNT_USER)
- atomic_long_sub(npages, &vm_account->a.user->locked_vm);
- else
- atomic64_sub(npages, &vm_account->a.mm->pinned_vm);
+ vm_unaccount_legacy(vm_account, npages);
+ pins_uncharge(vm_account->pins_cg, npages);
}
EXPORT_SYMBOL_GPL(vm_unaccount_pinned);
--
git-series 0.9.1
^ permalink raw reply related [flat|nested] 108+ messages in thread
* [RFC PATCH 15/19] mm/util: Extend vm_account to charge pages against the pin cgroup
@ 2023-01-24 5:42 ` Alistair Popple
0 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-24 5:42 UTC (permalink / raw)
To: linux-mm, cgroups
Cc: linux-kernel, jgg, jhubbard, tjmercier, hannes, surenb, mkoutny,
daniel, Alistair Popple
The vm_account_pinned() functions currently only account pages against
pinned_vm/locked_vm and enforce limits against RLIMIT_MEMLOCK. Extend
these to account pages and enforce limits using the pin count cgroup.
Accounting of pages will fail if either RLIMIT_MEMLOCK or the cgroup
limit is exceeded. Unlike rlimit enforcement, which can be bypassed if
the user has CAP_IPC_LOCK, cgroup limits cannot be bypassed.
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Cc: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org
---
include/linux/mm_types.h | 1 +
mm/util.c | 22 ++++++++++++++++++----
2 files changed, 19 insertions(+), 4 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7de2168..4adf8dc 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1116,6 +1116,7 @@ struct vm_account {
struct mm_struct *mm;
struct user_struct *user;
} a;
+ struct pins_cgroup *pins_cg;
enum vm_account_flags flags;
};
diff --git a/mm/util.c b/mm/util.c
index af40b1e..e5fb01a 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -442,6 +442,7 @@ void vm_account_init(struct vm_account *vm_account, struct task_struct *task,
vm_account->a.mm = task->mm;
}
+ vm_account->pins_cg = get_pins_cg(task);
vm_account->flags = flags;
}
EXPORT_SYMBOL_GPL(vm_account_init);
@@ -459,6 +460,7 @@ void vm_account_release(struct vm_account *vm_account)
free_uid(vm_account->a.user);
else
mmdrop(vm_account->a.mm);
+ put_pins_cg(vm_account->pins_cg);
}
EXPORT_SYMBOL_GPL(vm_account_release);
@@ -489,6 +491,15 @@ static int vm_account_cmpxchg(struct vm_account *vm_account,
}
}
+static void vm_unaccount_legacy(struct vm_account *vm_account,
+ unsigned long npages)
+{
+ if (vm_account->flags & VM_ACCOUNT_USER)
+ atomic_long_sub(npages, &vm_account->a.user->locked_vm);
+ else
+ atomic64_sub(npages, &vm_account->a.mm->pinned_vm);
+}
+
int vm_account_pinned(struct vm_account *vm_account, unsigned long npages)
{
unsigned long lock_limit = RLIM_INFINITY;
@@ -506,16 +517,19 @@ int vm_account_pinned(struct vm_account *vm_account, unsigned long npages)
return ret;
}
+ if (pins_try_charge(vm_account->pins_cg, npages)) {
+ vm_unaccount_legacy(vm_account, npages);
+ return -ENOMEM;
+ }
+
return 0;
}
EXPORT_SYMBOL_GPL(vm_account_pinned);
void vm_unaccount_pinned(struct vm_account *vm_account, unsigned long npages)
{
- if (vm_account->flags & VM_ACCOUNT_USER)
- atomic_long_sub(npages, &vm_account->a.user->locked_vm);
- else
- atomic64_sub(npages, &vm_account->a.mm->pinned_vm);
+ vm_unaccount_legacy(vm_account, npages);
+ pins_uncharge(vm_account->pins_cg, npages);
}
EXPORT_SYMBOL_GPL(vm_unaccount_pinned);
--
git-series 0.9.1
^ permalink raw reply related [flat|nested] 108+ messages in thread
* [RFC PATCH 16/19] mm/util: Refactor account_locked_vm
2023-01-24 5:42 ` Alistair Popple
@ 2023-01-24 5:42 ` Alistair Popple
-1 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-24 5:42 UTC (permalink / raw)
To: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, jgg-DDmLM1+adcrQT0dZR+AlfA,
jhubbard-DDmLM1+adcrQT0dZR+AlfA, tjmercier-hpIqsD4AKlfQT0dZR+AlfA,
hannes-druUgvl0LCNAfugRpC6u6w, surenb-hpIqsD4AKlfQT0dZR+AlfA,
mkoutny-IBi9RG/b67k, daniel-/w4YWyX8dFk, Alistair Popple
account_locked_vm() takes a flag to indicate if pages are being
accounted or unaccounted for. A flag is also provided to bypass
rlimits. However, unaccounting of pages always succeeds, so the flag to
bypass the limits has no effect on that path. The flags make calling
code harder to understand, so refactor the accounting and unaccounting
paths into separate functions.
Signed-off-by: Alistair Popple <apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org>
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
---
include/linux/mm.h | 5 +--
mm/util.c | 73 +++++++++++++++++++++++++++++++++--------------
2 files changed, 55 insertions(+), 23 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8f85716..126b756 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2090,9 +2090,10 @@ int get_user_pages_fast(unsigned long start, int nr_pages,
int pin_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages);
-int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc);
-int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
+int account_locked_vm(struct mm_struct *mm, unsigned long pages);
+int __account_locked_vm(struct mm_struct *mm, unsigned long pages,
struct task_struct *task, bool bypass_rlim);
+void __unaccount_locked_vm(struct mm_struct *mm, unsigned long pages);
struct kvec;
int get_kernel_pages(const struct kvec *iov, int nr_pages, int write,
diff --git a/mm/util.c b/mm/util.c
index e5fb01a..78b060d 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -537,7 +537,6 @@ EXPORT_SYMBOL_GPL(vm_unaccount_pinned);
* __account_locked_vm - account locked pages to an mm's locked_vm
* @mm: mm to account against
* @pages: number of pages to account
- * @inc: %true if @pages should be considered positive, %false if not
* @task: task used to check RLIMIT_MEMLOCK
* @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
*
@@ -548,7 +547,7 @@ EXPORT_SYMBOL_GPL(vm_unaccount_pinned);
* * 0 on success
* * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
*/
-int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
+int __account_locked_vm(struct mm_struct *mm, unsigned long pages,
struct task_struct *task, bool bypass_rlim)
{
unsigned long locked_vm, limit;
@@ -557,33 +556,44 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
mmap_assert_write_locked(mm);
locked_vm = mm->locked_vm;
- if (inc) {
- if (!bypass_rlim) {
- limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- if (locked_vm + pages > limit)
- ret = -ENOMEM;
- }
- if (!ret)
- mm->locked_vm = locked_vm + pages;
- } else {
- WARN_ON_ONCE(pages > locked_vm);
- mm->locked_vm = locked_vm - pages;
+ if (!bypass_rlim) {
+ limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ if (locked_vm + pages > limit)
+ ret = -ENOMEM;
}
- pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
- (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
- locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
- ret ? " - exceeded" : "");
+ if (!ret)
+ mm->locked_vm = locked_vm + pages;
+
+ pr_debug("%s: [%d] caller %ps %lu %lu/%lu%s\n", __func__, task->pid,
+ (void *)_RET_IP_, pages << PAGE_SHIFT, locked_vm << PAGE_SHIFT,
+ task_rlimit(task, RLIMIT_MEMLOCK), ret ? " - exceeded" : "");
return ret;
}
EXPORT_SYMBOL_GPL(__account_locked_vm);
/**
+ * __unaccount_locked_vm - unaccount locked pages from an mm's locked_vm
+ * @mm: mm to unaccount from
+ * @pages: number of pages to unaccount
+ *
+ * Assumes @mm is valid and that mmap_lock is held as writer.
+ */
+void __unaccount_locked_vm(struct mm_struct *mm, unsigned long pages)
+{
+ unsigned long locked_vm = mm->locked_vm;
+
+ mmap_assert_write_locked(mm);
+ WARN_ON_ONCE(pages > locked_vm);
+ mm->locked_vm = locked_vm - pages;
+}
+EXPORT_SYMBOL_GPL(__unaccount_locked_vm);
+
+/**
* account_locked_vm - account locked pages to an mm's locked_vm
* @mm: mm to account against, may be NULL
* @pages: number of pages to account
- * @inc: %true if @pages should be considered positive, %false if not
*
* Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
*
@@ -591,7 +601,7 @@ EXPORT_SYMBOL_GPL(__account_locked_vm);
* * 0 on success, or if mm is NULL
* * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
*/
-int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
+int account_locked_vm(struct mm_struct *mm, unsigned long pages)
{
int ret;
@@ -599,14 +609,35 @@ int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
return 0;
mmap_write_lock(mm);
- ret = __account_locked_vm(mm, pages, inc, current,
- capable(CAP_IPC_LOCK));
+ ret = __account_locked_vm(mm, pages, current, capable(CAP_IPC_LOCK));
mmap_write_unlock(mm);
return ret;
}
EXPORT_SYMBOL_GPL(account_locked_vm);
+/**
+ * unaccount_locked_vm - unaccount locked pages from an mm's locked_vm
+ * @mm: mm to unaccount from, may be NULL
+ * @pages: number of pages to unaccount
+ *
+ * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
+ *
+ * Takes mmap_lock for writing around the update. Does nothing if @mm is
+ * NULL or @pages is 0. Unaccounting cannot fail, so there is no return
+ * value.
+ */
+void unaccount_locked_vm(struct mm_struct *mm, unsigned long pages)
+{
+ if (pages == 0 || !mm)
+ return;
+
+ mmap_write_lock(mm);
+ __unaccount_locked_vm(mm, pages);
+ mmap_write_unlock(mm);
+}
+EXPORT_SYMBOL_GPL(unaccount_locked_vm);
+
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long pgoff)
--
git-series 0.9.1
^ permalink raw reply related [flat|nested] 108+ messages in thread
* [RFC PATCH 16/19] mm/util: Refactor account_locked_vm
@ 2023-01-24 5:42 ` Alistair Popple
0 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-24 5:42 UTC (permalink / raw)
To: linux-mm, cgroups
Cc: linux-kernel, jgg, jhubbard, tjmercier, hannes, surenb, mkoutny,
daniel, Alistair Popple
account_locked_vm() takes a flag to indicate if pages are being
accounted or unaccounted for. A flag is also provided to bypass
rlimits. However, unaccounting of pages always succeeds, so the flag to
bypass the limits has no effect on that path. The flags make calling
code harder to understand, so refactor the accounting and unaccounting
paths into separate functions.
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Cc: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org
---
include/linux/mm.h | 5 +--
mm/util.c | 73 +++++++++++++++++++++++++++++++++--------------
2 files changed, 55 insertions(+), 23 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8f85716..126b756 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2090,9 +2090,10 @@ int get_user_pages_fast(unsigned long start, int nr_pages,
int pin_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages);
-int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc);
-int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
+int account_locked_vm(struct mm_struct *mm, unsigned long pages);
+int __account_locked_vm(struct mm_struct *mm, unsigned long pages,
struct task_struct *task, bool bypass_rlim);
+void __unaccount_locked_vm(struct mm_struct *mm, unsigned long pages);
struct kvec;
int get_kernel_pages(const struct kvec *iov, int nr_pages, int write,
diff --git a/mm/util.c b/mm/util.c
index e5fb01a..78b060d 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -537,7 +537,6 @@ EXPORT_SYMBOL_GPL(vm_unaccount_pinned);
* __account_locked_vm - account locked pages to an mm's locked_vm
* @mm: mm to account against
* @pages: number of pages to account
- * @inc: %true if @pages should be considered positive, %false if not
* @task: task used to check RLIMIT_MEMLOCK
* @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
*
@@ -548,7 +547,7 @@ EXPORT_SYMBOL_GPL(vm_unaccount_pinned);
* * 0 on success
* * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
*/
-int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
+int __account_locked_vm(struct mm_struct *mm, unsigned long pages,
struct task_struct *task, bool bypass_rlim)
{
unsigned long locked_vm, limit;
@@ -557,33 +556,44 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
mmap_assert_write_locked(mm);
locked_vm = mm->locked_vm;
- if (inc) {
- if (!bypass_rlim) {
- limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- if (locked_vm + pages > limit)
- ret = -ENOMEM;
- }
- if (!ret)
- mm->locked_vm = locked_vm + pages;
- } else {
- WARN_ON_ONCE(pages > locked_vm);
- mm->locked_vm = locked_vm - pages;
+ if (!bypass_rlim) {
+ limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ if (locked_vm + pages > limit)
+ ret = -ENOMEM;
}
- pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
- (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
- locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
- ret ? " - exceeded" : "");
+ if (!ret)
+ mm->locked_vm = locked_vm + pages;
+
+ pr_debug("%s: [%d] caller %ps %lu %lu/%lu%s\n", __func__, task->pid,
+ (void *)_RET_IP_, pages << PAGE_SHIFT, locked_vm << PAGE_SHIFT,
+ task_rlimit(task, RLIMIT_MEMLOCK), ret ? " - exceeded" : "");
return ret;
}
EXPORT_SYMBOL_GPL(__account_locked_vm);
/**
+ * __unaccount_locked_vm - unaccount locked pages from an mm's locked_vm
+ * @mm: mm to unaccount from
+ * @pages: number of pages to unaccount
+ *
+ * Assumes @mm is valid and that mmap_lock is held as writer.
+ */
+void __unaccount_locked_vm(struct mm_struct *mm, unsigned long pages)
+{
+ unsigned long locked_vm = mm->locked_vm;
+
+ mmap_assert_write_locked(mm);
+ WARN_ON_ONCE(pages > locked_vm);
+ mm->locked_vm = locked_vm - pages;
+}
+EXPORT_SYMBOL_GPL(__unaccount_locked_vm);
+
+/**
* account_locked_vm - account locked pages to an mm's locked_vm
* @mm: mm to account against, may be NULL
* @pages: number of pages to account
- * @inc: %true if @pages should be considered positive, %false if not
*
* Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
*
@@ -591,7 +601,7 @@ EXPORT_SYMBOL_GPL(__account_locked_vm);
* * 0 on success, or if mm is NULL
* * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
*/
-int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
+int account_locked_vm(struct mm_struct *mm, unsigned long pages)
{
int ret;
@@ -599,14 +609,35 @@ int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
return 0;
mmap_write_lock(mm);
- ret = __account_locked_vm(mm, pages, inc, current,
- capable(CAP_IPC_LOCK));
+ ret = __account_locked_vm(mm, pages, current, capable(CAP_IPC_LOCK));
mmap_write_unlock(mm);
return ret;
}
EXPORT_SYMBOL_GPL(account_locked_vm);
+/**
+ * unaccount_locked_vm - unaccount locked pages from an mm's locked_vm
+ * @mm: mm to unaccount from, may be NULL
+ * @pages: number of pages to unaccount
+ *
+ * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
+ *
+ * Takes mmap_lock for writing around the update. Does nothing if @mm is
+ * NULL or @pages is 0. Unaccounting cannot fail, so there is no return
+ * value.
+ */
+void unaccount_locked_vm(struct mm_struct *mm, unsigned long pages)
+{
+ if (pages == 0 || !mm)
+ return;
+
+ mmap_write_lock(mm);
+ __unaccount_locked_vm(mm, pages);
+ mmap_write_unlock(mm);
+}
+EXPORT_SYMBOL_GPL(unaccount_locked_vm);
+
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long pgoff)
--
git-series 0.9.1
^ permalink raw reply related [flat|nested] 108+ messages in thread
* Re: [RFC PATCH 16/19] mm/util: Refactor account_locked_vm
2023-01-24 5:42 ` Alistair Popple
(?)
@ 2023-01-24 9:52 ` kernel test robot
-1 siblings, 0 replies; 108+ messages in thread
From: kernel test robot @ 2023-01-24 9:52 UTC (permalink / raw)
To: Alistair Popple; +Cc: oe-kbuild-all
Hi Alistair,
[FYI, it's a private test report for your RFC patch.]
[auto build test ERROR on 2241ab53cbb5cdb08a6b2d4688feb13971058f65]
url: https://github.com/intel-lab-lkp/linux/commits/Alistair-Popple/mm-Introduce-vm_account/20230124-135027
base: 2241ab53cbb5cdb08a6b2d4688feb13971058f65
patch link: https://lore.kernel.org/r/e72f46bd6be4f3e3c85ba6a560201cf23bdb39a9.1674538665.git-series.apopple%40nvidia.com
patch subject: [RFC PATCH 16/19] mm/util: Refactor account_locked_vm
config: s390-allmodconfig (https://download.01.org/0day-ci/archive/20230124/202301241747.9HloKRGF-lkp@intel.com/config)
compiler: s390-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/c089a7b505673ff339d4e034218c0d1207d26f5e
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Alistair-Popple/mm-Introduce-vm_account/20230124-135027
git checkout c089a7b505673ff339d4e034218c0d1207d26f5e
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=s390 olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=s390 SHELL=/bin/bash drivers/iommu/iommufd/
If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
All errors (new ones prefixed by >>):
drivers/iommu/iommufd/pages.c: In function 'update_mm_locked_vm':
>> drivers/iommu/iommufd/pages.c:846:60: error: incompatible type for argument 3 of '__account_locked_vm'
846 | rc = __account_locked_vm(pages->source_mm, npages, inc,
| ^~~
| |
| bool {aka _Bool}
In file included from include/linux/scatterlist.h:8,
from include/linux/iommu.h:10,
from drivers/iommu/iommufd/pages.c:50:
include/linux/mm.h:2095:45: note: expected 'struct task_struct *' but argument is of type 'bool' {aka '_Bool'}
2095 | struct task_struct *task, bool bypass_rlim);
| ~~~~~~~~~~~~~~~~~~~~^~~~
>> drivers/iommu/iommufd/pages.c:846:14: error: too many arguments to function '__account_locked_vm'
846 | rc = __account_locked_vm(pages->source_mm, npages, inc,
| ^~~~~~~~~~~~~~~~~~~
include/linux/mm.h:2094:5: note: declared here
2094 | int __account_locked_vm(struct mm_struct *mm, unsigned long pages,
| ^~~~~~~~~~~~~~~~~~~
vim +/__account_locked_vm +846 drivers/iommu/iommufd/pages.c
f394576eb11dbc Jason Gunthorpe 2022-11-29 826
f394576eb11dbc Jason Gunthorpe 2022-11-29 827 /* This is the accounting method used for compatibility with VFIO */
f394576eb11dbc Jason Gunthorpe 2022-11-29 828 static int update_mm_locked_vm(struct iopt_pages *pages, unsigned long npages,
f394576eb11dbc Jason Gunthorpe 2022-11-29 829 bool inc, struct pfn_reader_user *user)
f394576eb11dbc Jason Gunthorpe 2022-11-29 830 {
f394576eb11dbc Jason Gunthorpe 2022-11-29 831 bool do_put = false;
f394576eb11dbc Jason Gunthorpe 2022-11-29 832 int rc;
f394576eb11dbc Jason Gunthorpe 2022-11-29 833
f394576eb11dbc Jason Gunthorpe 2022-11-29 834 if (user && user->locked) {
f394576eb11dbc Jason Gunthorpe 2022-11-29 835 mmap_read_unlock(pages->source_mm);
f394576eb11dbc Jason Gunthorpe 2022-11-29 836 user->locked = 0;
f394576eb11dbc Jason Gunthorpe 2022-11-29 837 /* If we had the lock then we also have a get */
f394576eb11dbc Jason Gunthorpe 2022-11-29 838 } else if ((!user || !user->upages) &&
f394576eb11dbc Jason Gunthorpe 2022-11-29 839 pages->source_mm != current->mm) {
f394576eb11dbc Jason Gunthorpe 2022-11-29 840 if (!mmget_not_zero(pages->source_mm))
f394576eb11dbc Jason Gunthorpe 2022-11-29 841 return -EINVAL;
f394576eb11dbc Jason Gunthorpe 2022-11-29 842 do_put = true;
f394576eb11dbc Jason Gunthorpe 2022-11-29 843 }
f394576eb11dbc Jason Gunthorpe 2022-11-29 844
f394576eb11dbc Jason Gunthorpe 2022-11-29 845 mmap_write_lock(pages->source_mm);
f394576eb11dbc Jason Gunthorpe 2022-11-29 @846 rc = __account_locked_vm(pages->source_mm, npages, inc,
f394576eb11dbc Jason Gunthorpe 2022-11-29 847 pages->source_task, false);
f394576eb11dbc Jason Gunthorpe 2022-11-29 848 mmap_write_unlock(pages->source_mm);
f394576eb11dbc Jason Gunthorpe 2022-11-29 849
f394576eb11dbc Jason Gunthorpe 2022-11-29 850 if (do_put)
f394576eb11dbc Jason Gunthorpe 2022-11-29 851 mmput(pages->source_mm);
f394576eb11dbc Jason Gunthorpe 2022-11-29 852 return rc;
f394576eb11dbc Jason Gunthorpe 2022-11-29 853 }
f394576eb11dbc Jason Gunthorpe 2022-11-29 854
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests
^ permalink raw reply [flat|nested] 108+ messages in thread
* [RFC PATCH 17/19] mm: Convert mmap and mlock to use account_locked_vm
2023-01-24 5:42 ` Alistair Popple
@ 2023-01-24 5:42 ` Alistair Popple
-1 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-24 5:42 UTC (permalink / raw)
To: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, jgg-DDmLM1+adcrQT0dZR+AlfA,
jhubbard-DDmLM1+adcrQT0dZR+AlfA, tjmercier-hpIqsD4AKlfQT0dZR+AlfA,
hannes-druUgvl0LCNAfugRpC6u6w, surenb-hpIqsD4AKlfQT0dZR+AlfA,
mkoutny-IBi9RG/b67k, daniel-/w4YWyX8dFk, Alistair Popple
A future change introduces a cgroup to control the amount of
locked/pinned memory on the system. To ensure memory pinned via mlock
and mmap is accounted for, use the common account_locked_vm()
function.
As cgroups can outlive individual processes, also unaccount the
locked memory during process teardown.
This patch should introduce no user visible change.
Signed-off-by: Alistair Popple <apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org>
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
---
mm/internal.h | 2 +-
mm/mlock.c | 76 ++++++++++-----------------------------------------
mm/mmap.c | 76 +++++++++++++++++++++++++--------------------------
mm/mremap.c | 54 ++++++++++++++++++++++++++----------
mm/secretmem.c | 6 +---
5 files changed, 95 insertions(+), 119 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index bcf75a8..7c8c3f2 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -489,8 +489,6 @@ extern long populate_vma_page_range(struct vm_area_struct *vma,
extern long faultin_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
bool write, int *locked);
-extern int mlock_future_check(struct mm_struct *mm, unsigned long flags,
- unsigned long len);
/*
* mlock_vma_page() and munlock_vma_page():
* should be called with vma's mmap_lock held for read or write,
diff --git a/mm/mlock.c b/mm/mlock.c
index 7032f6d..a97c8c5 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -416,6 +416,20 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
goto out;
+ /*
+ * Keep track of amount of locked VM.
+ */
+ nr_pages = (end - start) >> PAGE_SHIFT;
+ if (!(newflags & VM_LOCKED)) {
+ __unaccount_locked_vm(mm, nr_pages);
+ } else if (!(oldflags & VM_LOCKED)) {
+ if (__account_locked_vm(mm, nr_pages, current,
+ capable(CAP_IPC_LOCK))) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma),
@@ -439,16 +453,6 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
success:
/*
- * Keep track of amount of locked VM.
- */
- nr_pages = (end - start) >> PAGE_SHIFT;
- if (!(newflags & VM_LOCKED))
- nr_pages = -nr_pages;
- else if (oldflags & VM_LOCKED)
- nr_pages = 0;
- mm->locked_vm += nr_pages;
-
- /*
* vm_flags is protected by the mmap_lock held in write mode.
* It's okay if try_to_unmap_one unmaps a page just after we
* set VM_LOCKED, populate_vma_page_range will bring it back.
@@ -517,42 +521,6 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
}
/*
- * Go through vma areas and sum size of mlocked
- * vma pages, as return value.
- * Note deferred memory locking case(mlock2(,,MLOCK_ONFAULT)
- * is also counted.
- * Return value: previously mlocked page counts
- */
-static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
- unsigned long start, size_t len)
-{
- struct vm_area_struct *vma;
- unsigned long count = 0;
- unsigned long end;
- VMA_ITERATOR(vmi, mm, start);
-
- /* Don't overflow past ULONG_MAX */
- if (unlikely(ULONG_MAX - len < start))
- end = ULONG_MAX;
- else
- end = start + len;
-
- for_each_vma_range(vmi, vma, end) {
- if (vma->vm_flags & VM_LOCKED) {
- if (start > vma->vm_start)
- count -= (start - vma->vm_start);
- if (end < vma->vm_end) {
- count += end - vma->vm_start;
- break;
- }
- count += vma->vm_end - vma->vm_start;
- }
- }
-
- return count >> PAGE_SHIFT;
-}
-
-/*
* convert get_user_pages() return value to posix mlock() error
*/
static int __mlock_posix_error_return(long retval)
@@ -585,21 +553,7 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla
if (mmap_write_lock_killable(current->mm))
return -EINTR;
- locked += current->mm->locked_vm;
- if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
- /*
- * It is possible that the regions requested intersect with
- * previously mlocked areas, that part area in "mm->locked_vm"
- * should not be counted to new mlock increment count. So check
- * and adjust locked count if necessary.
- */
- locked -= count_mm_mlocked_page_nr(current->mm,
- start, len);
- }
-
- /* check against resource limits */
- if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
- error = apply_vma_lock_flags(start, len, flags);
+ error = apply_vma_lock_flags(start, len, flags);
mmap_write_unlock(current->mm);
if (error)
diff --git a/mm/mmap.c b/mm/mmap.c
index 425a934..2c05582 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -160,7 +160,7 @@ static int check_brk_limits(unsigned long addr, unsigned long len)
if (IS_ERR_VALUE(mapped_addr))
return mapped_addr;
- return mlock_future_check(current->mm, current->mm->def_flags, len);
+ return 0;
}
static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
unsigned long newbrk, unsigned long oldbrk,
@@ -1184,23 +1184,6 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
return hint;
}
-int mlock_future_check(struct mm_struct *mm, unsigned long flags,
- unsigned long len)
-{
- unsigned long locked, lock_limit;
-
- /* mlock MCL_FUTURE? */
- if (flags & VM_LOCKED) {
- locked = len >> PAGE_SHIFT;
- locked += mm->locked_vm;
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- lock_limit >>= PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- return -EAGAIN;
- }
- return 0;
-}
-
static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
{
if (S_ISREG(inode->i_mode))
@@ -1310,9 +1293,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (!can_do_mlock())
return -EPERM;
- if (mlock_future_check(mm, vm_flags, len))
- return -EAGAIN;
-
if (file) {
struct inode *inode = file_inode(file);
unsigned long flags_mask;
@@ -1882,22 +1862,27 @@ static int acct_stack_growth(struct vm_area_struct *vma,
if (size > rlimit(RLIMIT_STACK))
return -ENOMEM;
- /* mlock limit tests */
- if (mlock_future_check(mm, vma->vm_flags, grow << PAGE_SHIFT))
- return -ENOMEM;
-
/* Check to ensure the stack will not grow into a hugetlb-only region */
new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
vma->vm_end - size;
if (is_hugepage_only_range(vma->vm_mm, new_start, size))
return -EFAULT;
+ /* mlock limit tests */
+ if (vma->vm_flags & VM_LOCKED)
+ if (__account_locked_vm(mm, grow << PAGE_SHIFT, current,
+ capable(CAP_IPC_LOCK)))
+ return -ENOMEM;
+
/*
* Overcommit.. This must be the final test, as it will
* update security statistics.
*/
- if (security_vm_enough_memory_mm(mm, grow))
+ if (security_vm_enough_memory_mm(mm, grow)) {
+ if (vma->vm_flags & VM_LOCKED)
+ __unaccount_locked_vm(mm, grow << PAGE_SHIFT);
return -ENOMEM;
+ }
return 0;
}
@@ -1975,8 +1960,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
* to guard against concurrent vma expansions.
*/
spin_lock(&mm->page_table_lock);
- if (vma->vm_flags & VM_LOCKED)
- mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
@@ -2056,8 +2039,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
* to guard against concurrent vma expansions.
*/
spin_lock(&mm->page_table_lock);
- if (vma->vm_flags & VM_LOCKED)
- mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_start = address;
@@ -2281,7 +2262,7 @@ static inline int munmap_sidetree(struct vm_area_struct *vma,
return -ENOMEM;
if (vma->vm_flags & VM_LOCKED)
- vma->vm_mm->locked_vm -= vma_pages(vma);
+ __unaccount_locked_vm(vma->vm_mm, vma_pages(vma));
return 0;
}
@@ -2525,6 +2506,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
struct vm_area_struct *next, *prev, *merge;
pgoff_t pglen = len >> PAGE_SHIFT;
unsigned long charged = 0;
+ unsigned long locked = 0;
unsigned long end = addr + len;
unsigned long merge_start = addr, merge_end = end;
pgoff_t vm_pgoff;
@@ -2560,6 +2542,15 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vm_flags |= VM_ACCOUNT;
}
+ if (vm_flags & VM_LOCKED) {
+ locked = len >> PAGE_SHIFT;
+ if (__account_locked_vm(mm, locked, current,
+ capable(CAP_IPC_LOCK))) {
+ error = -ENOMEM;
+ goto unacct_error;
+ }
+ }
+
next = mas_next(&mas, ULONG_MAX);
prev = mas_prev(&mas, 0);
if (vm_flags & VM_SPECIAL)
@@ -2605,7 +2596,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vma = vm_area_alloc(mm);
if (!vma) {
error = -ENOMEM;
- goto unacct_error;
+ goto unlock_error;
}
vma->vm_start = addr;
@@ -2725,8 +2716,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current->mm))
vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
- else
- mm->locked_vm += (len >> PAGE_SHIFT);
}
if (file)
@@ -2759,6 +2748,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
mapping_unmap_writable(file->f_mapping);
free_vma:
vm_area_free(vma);
+unlock_error:
+ if (locked)
+ __unaccount_locked_vm(mm, locked);
unacct_error:
if (charged)
vm_unacct_memory(charged);
@@ -2942,8 +2934,13 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;
+ if (flags & VM_LOCKED)
+ if (__account_locked_vm(mm, len >> PAGE_SHIFT, current,
+ capable(CAP_IPC_LOCK)))
+ return -ENOMEM;
+
if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
- return -ENOMEM;
+ goto unacct_locked;
/*
* Expand the existing vma if possible; Note that singular lists do not
@@ -2993,8 +2990,6 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
mm->data_vm += len >> PAGE_SHIFT;
- if (flags & VM_LOCKED)
- mm->locked_vm += (len >> PAGE_SHIFT);
vma->vm_flags |= VM_SOFTDIRTY;
validate_mm(mm);
return 0;
@@ -3003,6 +2998,8 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
vm_area_free(vma);
unacct_fail:
vm_unacct_memory(len >> PAGE_SHIFT);
+unacct_locked:
+ __unaccount_locked_vm(mm, len >> PAGE_SHIFT);
return -ENOMEM;
}
@@ -3064,7 +3061,7 @@ void exit_mmap(struct mm_struct *mm)
{
struct mmu_gather tlb;
struct vm_area_struct *vma;
- unsigned long nr_accounted = 0;
+ unsigned long nr_accounted = 0, nr_locked = 0;
MA_STATE(mas, &mm->mm_mt, 0, 0);
int count = 0;
@@ -3107,6 +3104,8 @@ void exit_mmap(struct mm_struct *mm)
do {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
+ if (vma->vm_flags & VM_LOCKED)
+ nr_locked += vma_pages(vma);
remove_vma(vma);
count++;
cond_resched();
@@ -3116,6 +3115,7 @@ void exit_mmap(struct mm_struct *mm)
trace_exit_mmap(mm);
__mt_destroy(&mm->mm_mt);
+ __unaccount_locked_vm(mm, nr_locked);
mmap_write_unlock(mm);
vm_unacct_memory(nr_accounted);
}
diff --git a/mm/mremap.c b/mm/mremap.c
index fe587c5..67cc4f3 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -574,7 +574,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
bool *locked, unsigned long flags,
struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
{
- long to_account = new_len - old_len;
+ long to_account = (new_len - old_len) >> PAGE_SHIFT;
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma;
unsigned long vm_flags = vma->vm_flags;
@@ -594,7 +594,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
return -ENOMEM;
if (unlikely(flags & MREMAP_DONTUNMAP))
- to_account = new_len;
+ to_account = new_len >> PAGE_SHIFT;
if (vma->vm_ops && vma->vm_ops->may_split) {
if (vma->vm_start != old_addr)
@@ -618,16 +618,36 @@ static unsigned long move_vma(struct vm_area_struct *vma,
return err;
if (vm_flags & VM_ACCOUNT) {
- if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
+ if (security_vm_enough_memory_mm(mm, to_account))
return -ENOMEM;
}
+ /*
+ * MREMAP_DONTUNMAP clears VM_LOCKED on the old vma and
+ * implies new_len == old_len so no need to account locked
+ * pages.
+ */
+ if ((vm_flags & VM_LOCKED) && likely(!(flags & MREMAP_DONTUNMAP))) {
+ if (__account_locked_vm(mm, to_account, current,
+ capable(CAP_IPC_LOCK))) {
+ if (vm_flags & VM_ACCOUNT)
+ vm_unacct_memory(to_account);
+ return -ENOMEM;
+ }
+ *locked = true;
+ }
+
new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
&need_rmap_locks);
if (!new_vma) {
if (vm_flags & VM_ACCOUNT)
- vm_unacct_memory(to_account >> PAGE_SHIFT);
+ vm_unacct_memory(to_account);
+ if ((vm_flags & VM_LOCKED) &&
+ likely(!(flags & MREMAP_DONTUNMAP))) {
+ __unaccount_locked_vm(mm, to_account);
+ *locked = false;
+ }
return -ENOMEM;
}
@@ -696,10 +716,11 @@ static unsigned long move_vma(struct vm_area_struct *vma,
vma->vm_end == (old_addr + old_len))
unlink_anon_vmas(vma);
- /* Because we won't unmap we don't need to touch locked_vm */
return new_addr;
}
+ /* Make sure do_munmap() doesn't unaccount locked pages */
+ vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
/* OOM: unable to split vma, just get accounts right */
if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
@@ -707,15 +728,11 @@ static unsigned long move_vma(struct vm_area_struct *vma,
excess = 0;
}
- if (vm_flags & VM_LOCKED) {
- mm->locked_vm += new_len >> PAGE_SHIFT;
- *locked = true;
- }
-
mm->hiwater_vm = hiwater_vm;
/* Restore VM_ACCOUNT if one or two pieces of vma left */
if (excess) {
+ vma->vm_flags = vm_flags;
vma->vm_flags |= VM_ACCOUNT;
if (split)
find_vma(mm, vma->vm_end)->vm_flags |= VM_ACCOUNT;
@@ -768,9 +785,6 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
return ERR_PTR(-EFAULT);
- if (mlock_future_check(mm, vma->vm_flags, new_len - old_len))
- return ERR_PTR(-EAGAIN);
-
if (!may_expand_vm(mm, vma->vm_flags,
(new_len - old_len) >> PAGE_SHIFT))
return ERR_PTR(-ENOMEM);
@@ -1026,6 +1040,16 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
}
}
+ if (vma->vm_flags & VM_LOCKED) {
+ if (__account_locked_vm(mm, pages, current,
+ capable(CAP_IPC_LOCK))) {
+ if (vma->vm_flags & VM_ACCOUNT)
+ vm_unacct_memory(pages);
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
/*
* Function vma_merge() is called on the extension we are adding to
* the already existing vma, vma_merge() will merge this extension with
@@ -1038,14 +1062,16 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
extension_pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (!vma) {
+ // TODO: We always unacct memory
+ // regardless of VM_ACCOUNT flags?
vm_unacct_memory(pages);
+ __unaccount_locked_vm(mm, pages);
ret = -ENOMEM;
goto out;
}
vm_stat_account(mm, vma->vm_flags, pages);
if (vma->vm_flags & VM_LOCKED) {
- mm->locked_vm += pages;
locked = true;
new_addr = addr;
}
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 04c3ac9..4515eb4 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -120,13 +120,11 @@ static int secretmem_release(struct inode *inode, struct file *file)
static int secretmem_mmap(struct file *file, struct vm_area_struct *vma)
{
- unsigned long len = vma->vm_end - vma->vm_start;
-
if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
return -EINVAL;
- if (mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, len))
- return -EAGAIN;
+ if (account_locked_vm(vma->vm_mm, vma->vm_end - vma->vm_start))
+ return -ENOMEM;
vma->vm_flags |= VM_LOCKED | VM_DONTDUMP;
vma->vm_ops = &secretmem_vm_ops;
--
git-series 0.9.1
^ permalink raw reply related [flat|nested] 108+ messages in thread
* [RFC PATCH 17/19] mm: Convert mmap and mlock to use account_locked_vm
@ 2023-01-24 5:42 ` Alistair Popple
0 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-24 5:42 UTC (permalink / raw)
To: linux-mm, cgroups
Cc: linux-kernel, jgg, jhubbard, tjmercier, hannes, surenb, mkoutny,
daniel, Alistair Popple
A future change introduces a cgroup to control the amount of
locked/pinned memory on the system. To ensure memory pinned via mlock
and mmap is accounted for, use the common account_locked_vm()
function.
As cgroups can outlive individual processes, also unaccount for the
locked memory during process teardown.
This patch should introduce no user visible change.
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Cc: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org
---
mm/internal.h | 2 +-
mm/mlock.c | 76 ++++++++++-----------------------------------------
mm/mmap.c | 76 +++++++++++++++++++++++++--------------------------
mm/mremap.c | 54 ++++++++++++++++++++++++++----------
mm/secretmem.c | 6 +---
5 files changed, 95 insertions(+), 119 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index bcf75a8..7c8c3f2 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -489,8 +489,6 @@ extern long populate_vma_page_range(struct vm_area_struct *vma,
extern long faultin_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
bool write, int *locked);
-extern int mlock_future_check(struct mm_struct *mm, unsigned long flags,
- unsigned long len);
/*
* mlock_vma_page() and munlock_vma_page():
* should be called with vma's mmap_lock held for read or write,
diff --git a/mm/mlock.c b/mm/mlock.c
index 7032f6d..a97c8c5 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -416,6 +416,20 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
goto out;
+ /*
+ * Keep track of amount of locked VM.
+ */
+ nr_pages = (end - start) >> PAGE_SHIFT;
+ if (!(newflags & VM_LOCKED)) {
+ __unaccount_locked_vm(mm, nr_pages);
+ } else if (!(oldflags & VM_LOCKED)) {
+ if (__account_locked_vm(mm, nr_pages, current,
+ capable(CAP_IPC_LOCK))) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma),
@@ -439,16 +453,6 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
success:
/*
- * Keep track of amount of locked VM.
- */
- nr_pages = (end - start) >> PAGE_SHIFT;
- if (!(newflags & VM_LOCKED))
- nr_pages = -nr_pages;
- else if (oldflags & VM_LOCKED)
- nr_pages = 0;
- mm->locked_vm += nr_pages;
-
- /*
* vm_flags is protected by the mmap_lock held in write mode.
* It's okay if try_to_unmap_one unmaps a page just after we
* set VM_LOCKED, populate_vma_page_range will bring it back.
@@ -517,42 +521,6 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
}
/*
- * Go through vma areas and sum size of mlocked
- * vma pages, as return value.
- * Note deferred memory locking case(mlock2(,,MLOCK_ONFAULT)
- * is also counted.
- * Return value: previously mlocked page counts
- */
-static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
- unsigned long start, size_t len)
-{
- struct vm_area_struct *vma;
- unsigned long count = 0;
- unsigned long end;
- VMA_ITERATOR(vmi, mm, start);
-
- /* Don't overflow past ULONG_MAX */
- if (unlikely(ULONG_MAX - len < start))
- end = ULONG_MAX;
- else
- end = start + len;
-
- for_each_vma_range(vmi, vma, end) {
- if (vma->vm_flags & VM_LOCKED) {
- if (start > vma->vm_start)
- count -= (start - vma->vm_start);
- if (end < vma->vm_end) {
- count += end - vma->vm_start;
- break;
- }
- count += vma->vm_end - vma->vm_start;
- }
- }
-
- return count >> PAGE_SHIFT;
-}
-
-/*
* convert get_user_pages() return value to posix mlock() error
*/
static int __mlock_posix_error_return(long retval)
@@ -585,21 +553,7 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla
if (mmap_write_lock_killable(current->mm))
return -EINTR;
- locked += current->mm->locked_vm;
- if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
- /*
- * It is possible that the regions requested intersect with
- * previously mlocked areas, that part area in "mm->locked_vm"
- * should not be counted to new mlock increment count. So check
- * and adjust locked count if necessary.
- */
- locked -= count_mm_mlocked_page_nr(current->mm,
- start, len);
- }
-
- /* check against resource limits */
- if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
- error = apply_vma_lock_flags(start, len, flags);
+ error = apply_vma_lock_flags(start, len, flags);
mmap_write_unlock(current->mm);
if (error)
diff --git a/mm/mmap.c b/mm/mmap.c
index 425a934..2c05582 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -160,7 +160,7 @@ static int check_brk_limits(unsigned long addr, unsigned long len)
if (IS_ERR_VALUE(mapped_addr))
return mapped_addr;
- return mlock_future_check(current->mm, current->mm->def_flags, len);
+ return 0;
}
static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
unsigned long newbrk, unsigned long oldbrk,
@@ -1184,23 +1184,6 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
return hint;
}
-int mlock_future_check(struct mm_struct *mm, unsigned long flags,
- unsigned long len)
-{
- unsigned long locked, lock_limit;
-
- /* mlock MCL_FUTURE? */
- if (flags & VM_LOCKED) {
- locked = len >> PAGE_SHIFT;
- locked += mm->locked_vm;
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- lock_limit >>= PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- return -EAGAIN;
- }
- return 0;
-}
-
static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
{
if (S_ISREG(inode->i_mode))
@@ -1310,9 +1293,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (!can_do_mlock())
return -EPERM;
- if (mlock_future_check(mm, vm_flags, len))
- return -EAGAIN;
-
if (file) {
struct inode *inode = file_inode(file);
unsigned long flags_mask;
@@ -1882,22 +1862,27 @@ static int acct_stack_growth(struct vm_area_struct *vma,
if (size > rlimit(RLIMIT_STACK))
return -ENOMEM;
- /* mlock limit tests */
- if (mlock_future_check(mm, vma->vm_flags, grow << PAGE_SHIFT))
- return -ENOMEM;
-
/* Check to ensure the stack will not grow into a hugetlb-only region */
new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
vma->vm_end - size;
if (is_hugepage_only_range(vma->vm_mm, new_start, size))
return -EFAULT;
+ /* mlock limit tests */
+ if (vma->vm_flags & VM_LOCKED)
+ if (__account_locked_vm(mm, grow << PAGE_SHIFT, current,
+ capable(CAP_IPC_LOCK)))
+ return -ENOMEM;
+
/*
* Overcommit.. This must be the final test, as it will
* update security statistics.
*/
- if (security_vm_enough_memory_mm(mm, grow))
+ if (security_vm_enough_memory_mm(mm, grow)) {
+ if (vma->vm_flags & VM_LOCKED)
+ __unaccount_locked_vm(mm, grow << PAGE_SHIFT);
return -ENOMEM;
+ }
return 0;
}
@@ -1975,8 +1960,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
* to guard against concurrent vma expansions.
*/
spin_lock(&mm->page_table_lock);
- if (vma->vm_flags & VM_LOCKED)
- mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
@@ -2056,8 +2039,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
* to guard against concurrent vma expansions.
*/
spin_lock(&mm->page_table_lock);
- if (vma->vm_flags & VM_LOCKED)
- mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_start = address;
@@ -2281,7 +2262,7 @@ static inline int munmap_sidetree(struct vm_area_struct *vma,
return -ENOMEM;
if (vma->vm_flags & VM_LOCKED)
- vma->vm_mm->locked_vm -= vma_pages(vma);
+ __unaccount_locked_vm(vma->vm_mm, vma_pages(vma));
return 0;
}
@@ -2525,6 +2506,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
struct vm_area_struct *next, *prev, *merge;
pgoff_t pglen = len >> PAGE_SHIFT;
unsigned long charged = 0;
+ unsigned long locked = 0;
unsigned long end = addr + len;
unsigned long merge_start = addr, merge_end = end;
pgoff_t vm_pgoff;
@@ -2560,6 +2542,15 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vm_flags |= VM_ACCOUNT;
}
+ if (vm_flags & VM_LOCKED) {
+ locked = len >> PAGE_SHIFT;
+ if (__account_locked_vm(mm, locked, current,
+ capable(CAP_IPC_LOCK))) {
+ error = -ENOMEM;
+ goto unacct_error;
+ }
+ }
+
next = mas_next(&mas, ULONG_MAX);
prev = mas_prev(&mas, 0);
if (vm_flags & VM_SPECIAL)
@@ -2605,7 +2596,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vma = vm_area_alloc(mm);
if (!vma) {
error = -ENOMEM;
- goto unacct_error;
+ goto unlock_error;
}
vma->vm_start = addr;
@@ -2725,8 +2716,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current->mm))
vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
- else
- mm->locked_vm += (len >> PAGE_SHIFT);
}
if (file)
@@ -2759,6 +2748,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
mapping_unmap_writable(file->f_mapping);
free_vma:
vm_area_free(vma);
+unlock_error:
+ if (locked)
+ __unaccount_locked_vm(mm, locked);
unacct_error:
if (charged)
vm_unacct_memory(charged);
@@ -2942,8 +2934,13 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;
+ if (flags & VM_LOCKED)
+ if (__account_locked_vm(mm, len >> PAGE_SHIFT, current,
+ capable(CAP_IPC_LOCK)))
+ return -ENOMEM;
+
if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
- return -ENOMEM;
+ goto unacct_locked;
/*
* Expand the existing vma if possible; Note that singular lists do not
@@ -2993,8 +2990,6 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
mm->data_vm += len >> PAGE_SHIFT;
- if (flags & VM_LOCKED)
- mm->locked_vm += (len >> PAGE_SHIFT);
vma->vm_flags |= VM_SOFTDIRTY;
validate_mm(mm);
return 0;
@@ -3003,6 +2998,8 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
vm_area_free(vma);
unacct_fail:
vm_unacct_memory(len >> PAGE_SHIFT);
+unacct_locked:
+ __unaccount_locked_vm(mm, len >> PAGE_SHIFT);
return -ENOMEM;
}
@@ -3064,7 +3061,7 @@ void exit_mmap(struct mm_struct *mm)
{
struct mmu_gather tlb;
struct vm_area_struct *vma;
- unsigned long nr_accounted = 0;
+ unsigned long nr_accounted = 0, nr_locked = 0;
MA_STATE(mas, &mm->mm_mt, 0, 0);
int count = 0;
@@ -3107,6 +3104,8 @@ void exit_mmap(struct mm_struct *mm)
do {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
+ if (vma->vm_flags & VM_LOCKED)
+ nr_locked += vma_pages(vma);
remove_vma(vma);
count++;
cond_resched();
@@ -3116,6 +3115,7 @@ void exit_mmap(struct mm_struct *mm)
trace_exit_mmap(mm);
__mt_destroy(&mm->mm_mt);
+ __unaccount_locked_vm(mm, nr_locked);
mmap_write_unlock(mm);
vm_unacct_memory(nr_accounted);
}
diff --git a/mm/mremap.c b/mm/mremap.c
index fe587c5..67cc4f3 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -574,7 +574,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
bool *locked, unsigned long flags,
struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
{
- long to_account = new_len - old_len;
+ long to_account = (new_len - old_len) >> PAGE_SHIFT;
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma;
unsigned long vm_flags = vma->vm_flags;
@@ -594,7 +594,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
return -ENOMEM;
if (unlikely(flags & MREMAP_DONTUNMAP))
- to_account = new_len;
+ to_account = new_len >> PAGE_SHIFT;
if (vma->vm_ops && vma->vm_ops->may_split) {
if (vma->vm_start != old_addr)
@@ -618,16 +618,36 @@ static unsigned long move_vma(struct vm_area_struct *vma,
return err;
if (vm_flags & VM_ACCOUNT) {
- if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
+ if (security_vm_enough_memory_mm(mm, to_account))
return -ENOMEM;
}
+ /*
+ * MREMAP_DONTUNMAP clears VM_LOCKED on the old vma and
+ * implies new_len == old_len so no need to account locked
+ * pages.
+ */
+ if ((vm_flags & VM_LOCKED) && likely(!(flags & MREMAP_DONTUNMAP))) {
+ if (__account_locked_vm(mm, to_account, current,
+ capable(CAP_IPC_LOCK))) {
+ if (vm_flags & VM_ACCOUNT)
+ vm_unacct_memory(to_account);
+ return -ENOMEM;
+ }
+ *locked = true;
+ }
+
new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
&need_rmap_locks);
if (!new_vma) {
if (vm_flags & VM_ACCOUNT)
- vm_unacct_memory(to_account >> PAGE_SHIFT);
+ vm_unacct_memory(to_account);
+ if ((vm_flags & VM_LOCKED) &&
+ likely(!(flags & MREMAP_DONTUNMAP))) {
+ __unaccount_locked_vm(mm, to_account);
+ *locked = false;
+ }
return -ENOMEM;
}
@@ -696,10 +716,11 @@ static unsigned long move_vma(struct vm_area_struct *vma,
vma->vm_end == (old_addr + old_len))
unlink_anon_vmas(vma);
- /* Because we won't unmap we don't need to touch locked_vm */
return new_addr;
}
+ /* Make sure do_munmap() doesn't unaccount locked pages */
+ vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
/* OOM: unable to split vma, just get accounts right */
if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
@@ -707,15 +728,11 @@ static unsigned long move_vma(struct vm_area_struct *vma,
excess = 0;
}
- if (vm_flags & VM_LOCKED) {
- mm->locked_vm += new_len >> PAGE_SHIFT;
- *locked = true;
- }
-
mm->hiwater_vm = hiwater_vm;
/* Restore VM_ACCOUNT if one or two pieces of vma left */
if (excess) {
+ vma->vm_flags = vm_flags;
vma->vm_flags |= VM_ACCOUNT;
if (split)
find_vma(mm, vma->vm_end)->vm_flags |= VM_ACCOUNT;
@@ -768,9 +785,6 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
return ERR_PTR(-EFAULT);
- if (mlock_future_check(mm, vma->vm_flags, new_len - old_len))
- return ERR_PTR(-EAGAIN);
-
if (!may_expand_vm(mm, vma->vm_flags,
(new_len - old_len) >> PAGE_SHIFT))
return ERR_PTR(-ENOMEM);
@@ -1026,6 +1040,16 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
}
}
+ if (vma->vm_flags & VM_LOCKED) {
+ if (__account_locked_vm(mm, pages, current,
+ capable(CAP_IPC_LOCK))) {
+ if (vma->vm_flags & VM_ACCOUNT)
+ vm_unacct_memory(pages);
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
/*
* Function vma_merge() is called on the extension we are adding to
* the already existing vma, vma_merge() will merge this extension with
@@ -1038,14 +1062,16 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
extension_pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (!vma) {
+ // TODO: We always unacct memory
+ // regardless of VM_ACCOUNT flags?
vm_unacct_memory(pages);
+ __unaccount_locked_vm(mm, pages);
ret = -ENOMEM;
goto out;
}
vm_stat_account(mm, vma->vm_flags, pages);
if (vma->vm_flags & VM_LOCKED) {
- mm->locked_vm += pages;
locked = true;
new_addr = addr;
}
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 04c3ac9..4515eb4 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -120,13 +120,11 @@ static int secretmem_release(struct inode *inode, struct file *file)
static int secretmem_mmap(struct file *file, struct vm_area_struct *vma)
{
- unsigned long len = vma->vm_end - vma->vm_start;
-
if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
return -EINVAL;
- if (mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, len))
- return -EAGAIN;
+ if (account_locked_vm(vma->vm_mm, vma->vm_end - vma->vm_start))
+ return -ENOMEM;
vma->vm_flags |= VM_LOCKED | VM_DONTDUMP;
vma->vm_ops = &secretmem_vm_ops;
--
git-series 0.9.1
^ permalink raw reply related [flat|nested] 108+ messages in thread
* [RFC PATCH 18/19] mm/mmap: Charge locked memory to pins cgroup
2023-01-24 5:42 ` Alistair Popple
@ 2023-01-24 5:42 ` Alistair Popple
-1 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-24 5:42 UTC (permalink / raw)
To: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, jgg-DDmLM1+adcrQT0dZR+AlfA,
jhubbard-DDmLM1+adcrQT0dZR+AlfA, tjmercier-hpIqsD4AKlfQT0dZR+AlfA,
hannes-druUgvl0LCNAfugRpC6u6w, surenb-hpIqsD4AKlfQT0dZR+AlfA,
mkoutny-IBi9RG/b67k, daniel-/w4YWyX8dFk, Alistair Popple
account_locked_vm() is used to account memory to mm->locked_vm. This
adds accounting to the pins cgroup as it behaves similarly and should
be accounted against the same global limit if set.
This means memory must now be unaccounted for correctly, as the cgroup
typically outlives both the mm and the task. It is assumed that
callers of account_locked_vm() only do accounting against the current
task. Callers that need to do accounting against remote tasks should
use account_pinned_vm() and associated struct vm_account to hold
references to the cgroup.
Signed-off-by: Alistair Popple <apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org>
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
---
mm/util.c | 24 +++++++++++++++++++++++-
1 file changed, 23 insertions(+), 1 deletion(-)
diff --git a/mm/util.c b/mm/util.c
index 78b060d..d6159e3 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -551,15 +551,21 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages,
struct task_struct *task, bool bypass_rlim)
{
unsigned long locked_vm, limit;
+ struct pins_cgroup *pins_cg = get_pins_cg(task);
int ret = 0;
mmap_assert_write_locked(mm);
+ if (pins_cg && pins_try_charge(pins_cg, pages))
+ return -ENOMEM;
+
locked_vm = mm->locked_vm;
if (!bypass_rlim) {
limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- if (locked_vm + pages > limit)
+ if (locked_vm + pages > limit) {
+ pins_uncharge(pins_cg, pages);
ret = -ENOMEM;
+ }
}
if (!ret)
@@ -569,6 +575,12 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages,
(void *)_RET_IP_, pages << PAGE_SHIFT, locked_vm << PAGE_SHIFT,
task_rlimit(task, RLIMIT_MEMLOCK), ret ? " - exceeded" : "");
+ pr_debug("%s: [%d] caller %ps %lu %lu/%lu%s\n", __func__, task->pid,
+ (void *)_RET_IP_, pages << PAGE_SHIFT, locked_vm << PAGE_SHIFT,
+ task_rlimit(task, RLIMIT_MEMLOCK), ret ? " - exceeded" : "");
+
+ if (pins_cg)
+ put_pins_cg(pins_cg);
return ret;
}
EXPORT_SYMBOL_GPL(__account_locked_vm);
@@ -584,8 +596,18 @@ void __unaccount_locked_vm(struct mm_struct *mm, unsigned long pages)
{
unsigned long locked_vm = mm->locked_vm;
+ /*
+ * TODO: Convert book3s vio to use pinned vm to ensure
+ * unaccounting happens to the correct cgroup.
+ */
+ struct pins_cgroup *pins_cg = get_pins_cg(current);
+
mmap_assert_write_locked(mm);
WARN_ON_ONCE(pages > locked_vm);
+ if (pins_cg) {
+ pins_uncharge(pins_cg, pages);
+ put_pins_cg(pins_cg);
+ }
mm->locked_vm = locked_vm - pages;
}
EXPORT_SYMBOL_GPL(__unaccount_locked_vm);
--
git-series 0.9.1
^ permalink raw reply related [flat|nested] 108+ messages in thread* [RFC PATCH 18/19] mm/mmap: Charge locked memory to pins cgroup
@ 2023-01-24 5:42 ` Alistair Popple
0 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-24 5:42 UTC (permalink / raw)
To: linux-mm, cgroups
Cc: linux-kernel, jgg, jhubbard, tjmercier, hannes, surenb, mkoutny,
daniel, Alistair Popple
account_locked_vm() is used to account memory to mm->locked_vm. This
adds accounting to the pins cgroup as it behaves similarly and should
be accounted against the same global limit if set.
This means memory must now be unaccounted for correctly, as the cgroup
typically outlives both the mm and the task. It is assumed that
callers of account_locked_vm() only do accounting against the current
task. Callers that need to do accounting against remote tasks should
use account_pinned_vm() and associated struct vm_account to hold
references to the cgroup.
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Cc: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org
---
mm/util.c | 24 +++++++++++++++++++++++-
1 file changed, 23 insertions(+), 1 deletion(-)
diff --git a/mm/util.c b/mm/util.c
index 78b060d..d6159e3 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -551,15 +551,21 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages,
struct task_struct *task, bool bypass_rlim)
{
unsigned long locked_vm, limit;
+ struct pins_cgroup *pins_cg = get_pins_cg(task);
int ret = 0;
mmap_assert_write_locked(mm);
+ if (pins_cg && pins_try_charge(pins_cg, pages))
+ return -ENOMEM;
+
locked_vm = mm->locked_vm;
if (!bypass_rlim) {
limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- if (locked_vm + pages > limit)
+ if (locked_vm + pages > limit) {
+ pins_uncharge(pins_cg, pages);
ret = -ENOMEM;
+ }
}
if (!ret)
@@ -569,6 +575,12 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages,
(void *)_RET_IP_, pages << PAGE_SHIFT, locked_vm << PAGE_SHIFT,
task_rlimit(task, RLIMIT_MEMLOCK), ret ? " - exceeded" : "");
+ pr_debug("%s: [%d] caller %ps %lu %lu/%lu%s\n", __func__, task->pid,
+ (void *)_RET_IP_, pages << PAGE_SHIFT, locked_vm << PAGE_SHIFT,
+ task_rlimit(task, RLIMIT_MEMLOCK), ret ? " - exceeded" : "");
+
+ if (pins_cg)
+ put_pins_cg(pins_cg);
return ret;
}
EXPORT_SYMBOL_GPL(__account_locked_vm);
@@ -584,8 +596,18 @@ void __unaccount_locked_vm(struct mm_struct *mm, unsigned long pages)
{
unsigned long locked_vm = mm->locked_vm;
+ /*
+ * TODO: Convert book3s vio to use pinned vm to ensure
+ * unaccounting happens to the correct cgroup.
+ */
+ struct pins_cgroup *pins_cg = get_pins_cg(current);
+
mmap_assert_write_locked(mm);
WARN_ON_ONCE(pages > locked_vm);
+ if (pins_cg) {
+ pins_uncharge(pins_cg, pages);
+ put_pins_cg(pins_cg);
+ }
mm->locked_vm = locked_vm - pages;
}
EXPORT_SYMBOL_GPL(__unaccount_locked_vm);
--
git-series 0.9.1
^ permalink raw reply related [flat|nested] 108+ messages in thread
* [RFC PATCH 19/19] selftests/vm: Add pins-cgroup selftest for mlock/mmap
2023-01-24 5:42 ` Alistair Popple
@ 2023-01-24 5:42 ` Alistair Popple
-1 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-24 5:42 UTC (permalink / raw)
To: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, jgg-DDmLM1+adcrQT0dZR+AlfA,
jhubbard-DDmLM1+adcrQT0dZR+AlfA, tjmercier-hpIqsD4AKlfQT0dZR+AlfA,
hannes-druUgvl0LCNAfugRpC6u6w, surenb-hpIqsD4AKlfQT0dZR+AlfA,
mkoutny-IBi9RG/b67k, daniel-/w4YWyX8dFk, Alistair Popple,
Shuah Khan, linux-kselftest-u79uwXL29TY76Z2rM5mHXA
Add some basic tests of mlock/mmap cgroup accounting for pinned
memory.
Signed-off-by: Alistair Popple <apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org>
Cc: Shuah Khan <shuah-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org
Cc: linux-kselftest-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: cgroups-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
---
MAINTAINERS | 1 +-
tools/testing/selftests/vm/Makefile | 1 +-
tools/testing/selftests/vm/pins-cgroup.c | 271 ++++++++++++++++++++++++-
3 files changed, 273 insertions(+)
create mode 100644 tools/testing/selftests/vm/pins-cgroup.c
diff --git a/MAINTAINERS b/MAINTAINERS
index f8526e2..4c4eed9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5387,6 +5387,7 @@ L: cgroups-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
L: linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org
S: Maintained
F: mm/pins_cgroup.c
+F: tools/testing/selftests/vm/pins-cgroup.c
CORETEMP HARDWARE MONITORING DRIVER
M: Fenghua Yu <fenghua.yu-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 89c14e4..0653720 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -56,6 +56,7 @@ TEST_GEN_PROGS += soft-dirty
TEST_GEN_PROGS += split_huge_page_test
TEST_GEN_FILES += ksm_tests
TEST_GEN_PROGS += ksm_functional_tests
+TEST_GEN_FILES += pins-cgroup
ifeq ($(MACHINE),x86_64)
CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32)
diff --git a/tools/testing/selftests/vm/pins-cgroup.c b/tools/testing/selftests/vm/pins-cgroup.c
new file mode 100644
index 0000000..c2eabc2
--- /dev/null
+++ b/tools/testing/selftests/vm/pins-cgroup.c
@@ -0,0 +1,271 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "../kselftest_harness.h"
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/prctl.h>
+#include <sys/resource.h>
+#include <sys/capability.h>
+#include <unistd.h>
+
+#define CGROUP_TEMP "/sys/fs/cgroup/pins_XXXXXX"
+#define PINS_MAX (-1UL)
+
+FIXTURE(pins_cg)
+{
+ char *cg_path;
+ long page_size;
+};
+
+static char *cgroup_new(void)
+{
+ char *cg;
+
+ cg = malloc(sizeof(CGROUP_TEMP));
+ strcpy(cg, CGROUP_TEMP);
+ if (!mkdtemp(cg)) {
+ perror("Failed to create cgroup");
+ return NULL;
+ }
+
+ return cg;
+}
+
+static int cgroup_add_proc(char *cg, pid_t pid)
+{
+ char *cg_proc;
+ FILE *f;
+ int ret = 0;
+
+ if (asprintf(&cg_proc, "%s/cgroup.procs", cg) < 0)
+ return -1;
+
+ f = fopen(cg_proc, "w");
+ free(cg_proc);
+ if (!f)
+ return -1;
+
+ if (fprintf(f, "%ld\n", (long) pid) < 0)
+ ret = -1;
+
+ fclose(f);
+ return ret;
+}
+
+static int cgroup_set_limit(char *cg, unsigned long limit)
+{
+ char *cg_pins_max;
+ FILE *f;
+ int ret = 0;
+
+ if (asprintf(&cg_pins_max, "%s/pins.max", cg) < 0)
+ return -1;
+
+ f = fopen(cg_pins_max, "w");
+ free(cg_pins_max);
+ if (!f)
+ return -1;
+
+ if (limit != PINS_MAX) {
+ if (fprintf(f, "%ld\n", limit) < 0)
+ ret = -1;
+ } else {
+ if (fprintf(f, "max\n") < 0)
+ ret = -1;
+ }
+
+ fclose(f);
+ return ret;
+}
+
+FIXTURE_SETUP(pins_cg)
+{
+ char *cg_subtree_control;
+ FILE *f;
+
+ if (asprintf(&cg_subtree_control,
+ "/sys/fs/cgroup/cgroup.subtree_control") < 0)
+ return;
+
+ f = fopen(cg_subtree_control, "w");
+ free(cg_subtree_control);
+ if (!f)
+ return;
+
+ fprintf(f, "+pins\n");
+ fclose(f);
+
+ self->cg_path = cgroup_new();
+ self->page_size = sysconf(_SC_PAGE_SIZE);
+}
+
+FIXTURE_TEARDOWN(pins_cg)
+{
+ cgroup_add_proc("/sys/fs/cgroup", getpid());
+
+ rmdir(self->cg_path);
+ free(self->cg_path);
+}
+
+static long cgroup_pins(char *cg)
+{
+ long pin_count;
+ char *cg_pins_current;
+ FILE *f;
+ int ret;
+
+ if (asprintf(&cg_pins_current, "%s/pins.current", cg) < 0)
+ return -1;
+
+ f = fopen(cg_pins_current, "r");
+ if (!f) {
+ printf("Can't open %s\n", cg_pins_current);
+ getchar();
+ free(cg_pins_current);
+ return -2;
+ }
+
+ free(cg_pins_current);
+
+ if (fscanf(f, "%ld", &pin_count) == EOF)
+ ret = -3;
+ else
+ ret = pin_count;
+
+ fclose(f);
+ return ret;
+}
+
+static int set_rlim_memlock(unsigned long size)
+{
+ struct rlimit rlim_memlock = {
+ .rlim_cur = size,
+ .rlim_max = size,
+ };
+ cap_t cap;
+ cap_value_t capability[1] = { CAP_IPC_LOCK };
+
+ /*
+ * Many of the rlimit checks are skipped if a process has
+ * CAP_IPC_LOCK. As this test should be run as root we need to
+ * explicitly drop it.
+ */
+ cap = cap_get_proc();
+ if (!cap)
+ return -1;
+ if (cap_set_flag(cap, CAP_EFFECTIVE, 1, capability, CAP_CLEAR))
+ return -1;
+ if (cap_set_proc(cap))
+ return -1;
+ return setrlimit(RLIMIT_MEMLOCK, &rlim_memlock);
+}
+
+TEST_F(pins_cg, basic)
+{
+ pid_t child_pid;
+ long page_size = self->page_size;
+ char *p;
+
+ ASSERT_EQ(cgroup_add_proc(self->cg_path, getpid()), 0);
+ p = mmap(NULL, 32*page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(p, MAP_FAILED);
+
+ ASSERT_EQ(cgroup_pins(self->cg_path), 0);
+ memset(p, 0, 16*page_size);
+ ASSERT_EQ(mlock(p, page_size), 0);
+ ASSERT_EQ(cgroup_pins(self->cg_path), 1);
+ ASSERT_EQ(mlock(p + page_size, page_size), 0);
+ ASSERT_EQ(cgroup_pins(self->cg_path), 2);
+ ASSERT_EQ(mlock(p, page_size), 0);
+ ASSERT_EQ(cgroup_pins(self->cg_path), 2);
+ ASSERT_EQ(mlock(p, 4*page_size), 0);
+ ASSERT_EQ(cgroup_pins(self->cg_path), 4);
+ ASSERT_EQ(munlock(p + 2*page_size, 2*page_size), 0);
+ ASSERT_EQ(cgroup_pins(self->cg_path), 2);
+ ASSERT_EQ(cgroup_set_limit(self->cg_path, 8), 0);
+ ASSERT_EQ(mlock(p, 16*page_size), -1);
+ ASSERT_EQ(errno, ENOMEM);
+ ASSERT_EQ(cgroup_pins(self->cg_path), 2);
+ ASSERT_EQ(cgroup_set_limit(self->cg_path, PINS_MAX), 0);
+
+ /* check mremap() a locked region correctly accounts locked pages */
+ ASSERT_EQ(mlock(p, 32*page_size), 0);
+ ASSERT_EQ(cgroup_pins(self->cg_path), 32);
+ p = mremap(p, 32*page_size, 64*page_size, MREMAP_MAYMOVE);
+ ASSERT_NE(p, MAP_FAILED);
+ ASSERT_EQ(cgroup_pins(self->cg_path), 64);
+ ASSERT_EQ(munmap(p + 32*page_size, 32*page_size), 0);
+ ASSERT_EQ(cgroup_pins(self->cg_path), 32);
+ p = mremap(p, 32*page_size, 32*page_size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP);
+ ASSERT_NE(p, MAP_FAILED);
+ ASSERT_EQ(cgroup_pins(self->cg_path), 32);
+ ASSERT_EQ(munlock(p, 32*page_size), 0);
+
+ /* mremap() a locked region should fail if limit exceeded */
+ ASSERT_EQ(set_rlim_memlock(32*page_size), 0);
+ ASSERT_EQ(mlock(p, 32*page_size), 0);
+ ASSERT_EQ(mremap(p, 32*page_size, 64*page_size, 0), MAP_FAILED);
+ ASSERT_EQ(munlock(p, 32*page_size), 0);
+
+ /* Exceeds rlimit, expected to fail */
+ ASSERT_EQ(set_rlim_memlock(16*page_size), 0);
+ ASSERT_EQ(mlock(p, 32*page_size), -1);
+ ASSERT_EQ(errno, ENOMEM);
+
+ /* memory in the child isn't locked so shouldn't increase pin_cg count */
+ ASSERT_EQ(mlock(p, 16*page_size), 0);
+ child_pid = fork();
+ if (!child_pid) {
+ ASSERT_EQ(cgroup_pins(self->cg_path), 16);
+ ASSERT_EQ(mlock(p, 16*page_size), 0);
+ ASSERT_EQ(cgroup_pins(self->cg_path), 32);
+ return;
+
+ }
+ waitpid(child_pid, NULL, 0);
+
+ /* check that child exit uncharged the pins */
+ ASSERT_EQ(cgroup_pins(self->cg_path), 16);
+}
+
+TEST_F(pins_cg, mmap)
+{
+ char *p;
+
+ ASSERT_EQ(cgroup_add_proc(self->cg_path, getpid()), 0);
+ p = mmap(NULL, 4*self->page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE | MAP_LOCKED, -1, 0);
+ ASSERT_NE(p, MAP_FAILED);
+ ASSERT_EQ(cgroup_pins(self->cg_path), 4);
+}
+
+/*
+ * Test moving to a different cgroup.
+ */
+TEST_F(pins_cg, move_cg)
+{
+ char *p, *new_cg;
+
+ ASSERT_EQ(cgroup_add_proc(self->cg_path, getpid()), 0);
+ p = mmap(NULL, 16*self->page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(p, MAP_FAILED);
+ memset(p, 0, 16*self->page_size);
+ ASSERT_EQ(mlock(p, 16*self->page_size), 0);
+ ASSERT_EQ(cgroup_pins(self->cg_path), 16);
+ ASSERT_NE(new_cg = cgroup_new(), NULL);
+ ASSERT_EQ(cgroup_add_proc(new_cg, getpid()), 0);
+ ASSERT_EQ(cgroup_pins(new_cg), 16);
+ ASSERT_EQ(cgroup_add_proc(self->cg_path, getpid()), 0);
+ rmdir(new_cg);
+}
+TEST_HARNESS_MAIN
--
git-series 0.9.1
^ permalink raw reply related [flat|nested] 108+ messages in thread* [RFC PATCH 19/19] selftests/vm: Add pins-cgroup selftest for mlock/mmap
@ 2023-01-24 5:42 ` Alistair Popple
0 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-24 5:42 UTC (permalink / raw)
To: linux-mm, cgroups
Cc: linux-kernel, jgg, jhubbard, tjmercier, hannes, surenb, mkoutny,
daniel, Alistair Popple, Shuah Khan, linux-kselftest
Add some basic tests of mlock/mmap cgroup accounting for pinned
memory.
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: linux-kselftest@vger.kernel.org
Cc: cgroups@vger.kernel.org
---
MAINTAINERS | 1 +-
tools/testing/selftests/vm/Makefile | 1 +-
tools/testing/selftests/vm/pins-cgroup.c | 271 ++++++++++++++++++++++++-
3 files changed, 273 insertions(+)
create mode 100644 tools/testing/selftests/vm/pins-cgroup.c
diff --git a/MAINTAINERS b/MAINTAINERS
index f8526e2..4c4eed9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5387,6 +5387,7 @@ L: cgroups@vger.kernel.org
L: linux-mm@kvack.org
S: Maintained
F: mm/pins_cgroup.c
+F: tools/testing/selftests/vm/pins-cgroup.c
CORETEMP HARDWARE MONITORING DRIVER
M: Fenghua Yu <fenghua.yu@intel.com>
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 89c14e4..0653720 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -56,6 +56,7 @@ TEST_GEN_PROGS += soft-dirty
TEST_GEN_PROGS += split_huge_page_test
TEST_GEN_FILES += ksm_tests
TEST_GEN_PROGS += ksm_functional_tests
+TEST_GEN_FILES += pins-cgroup
ifeq ($(MACHINE),x86_64)
CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32)
diff --git a/tools/testing/selftests/vm/pins-cgroup.c b/tools/testing/selftests/vm/pins-cgroup.c
new file mode 100644
index 0000000..c2eabc2
--- /dev/null
+++ b/tools/testing/selftests/vm/pins-cgroup.c
@@ -0,0 +1,271 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "../kselftest_harness.h"
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/prctl.h>
+#include <sys/resource.h>
+#include <sys/capability.h>
+#include <unistd.h>
+
+#define CGROUP_TEMP "/sys/fs/cgroup/pins_XXXXXX"
+#define PINS_MAX (-1UL)
+
+FIXTURE(pins_cg)
+{
+ char *cg_path;
+ long page_size;
+};
+
+static char *cgroup_new(void)
+{
+ char *cg;
+
+ cg = malloc(sizeof(CGROUP_TEMP));
+ strcpy(cg, CGROUP_TEMP);
+ if (!mkdtemp(cg)) {
+ perror("Failed to create cgroup");
+ return NULL;
+ }
+
+ return cg;
+}
+
+static int cgroup_add_proc(char *cg, pid_t pid)
+{
+ char *cg_proc;
+ FILE *f;
+ int ret = 0;
+
+ if (asprintf(&cg_proc, "%s/cgroup.procs", cg) < 0)
+ return -1;
+
+ f = fopen(cg_proc, "w");
+ free(cg_proc);
+ if (!f)
+ return -1;
+
+ if (fprintf(f, "%ld\n", (long) pid) < 0)
+ ret = -1;
+
+ fclose(f);
+ return ret;
+}
+
+static int cgroup_set_limit(char *cg, unsigned long limit)
+{
+ char *cg_pins_max;
+ FILE *f;
+ int ret = 0;
+
+ if (asprintf(&cg_pins_max, "%s/pins.max", cg) < 0)
+ return -1;
+
+ f = fopen(cg_pins_max, "w");
+ free(cg_pins_max);
+ if (!f)
+ return -1;
+
+ if (limit != PINS_MAX) {
+ if (fprintf(f, "%ld\n", limit) < 0)
+ ret = -1;
+ } else {
+ if (fprintf(f, "max\n") < 0)
+ ret = -1;
+ }
+
+ fclose(f);
+ return ret;
+}
+
+FIXTURE_SETUP(pins_cg)
+{
+ char *cg_subtree_control;
+ FILE *f;
+
+ if (asprintf(&cg_subtree_control,
+ "/sys/fs/cgroup/cgroup.subtree_control") < 0)
+ return;
+
+ f = fopen(cg_subtree_control, "w");
+ free(cg_subtree_control);
+ if (!f)
+ return;
+
+ fprintf(f, "+pins\n");
+ fclose(f);
+
+ self->cg_path = cgroup_new();
+ self->page_size = sysconf(_SC_PAGE_SIZE);
+}
+
+FIXTURE_TEARDOWN(pins_cg)
+{
+ cgroup_add_proc("/sys/fs/cgroup", getpid());
+
+ rmdir(self->cg_path);
+ free(self->cg_path);
+}
+
+static long cgroup_pins(char *cg)
+{
+ long pin_count;
+ char *cg_pins_current;
+ FILE *f;
+ int ret;
+
+ if (asprintf(&cg_pins_current, "%s/pins.current", cg) < 0)
+ return -1;
+
+ f = fopen(cg_pins_current, "r");
+ if (!f) {
+ printf("Can't open %s\n", cg_pins_current);
+ getchar();
+ free(cg_pins_current);
+ return -2;
+ }
+
+ free(cg_pins_current);
+
+ if (fscanf(f, "%ld", &pin_count) == EOF)
+ ret = -3;
+ else
+ ret = pin_count;
+
+ fclose(f);
+ return ret;
+}
+
+static int set_rlim_memlock(unsigned long size)
+{
+ struct rlimit rlim_memlock = {
+ .rlim_cur = size,
+ .rlim_max = size,
+ };
+ cap_t cap;
+ cap_value_t capability[1] = { CAP_IPC_LOCK };
+
+ /*
+ * Many of the rlimit checks are skipped if a process has
+ * CAP_IPC_LOCK. As this test should be run as root we need to
+ * explicitly drop it.
+ */
+ cap = cap_get_proc();
+ if (!cap)
+ return -1;
+ if (cap_set_flag(cap, CAP_EFFECTIVE, 1, capability, CAP_CLEAR))
+ return -1;
+ if (cap_set_proc(cap))
+ return -1;
+ return setrlimit(RLIMIT_MEMLOCK, &rlim_memlock);
+}
+
+TEST_F(pins_cg, basic)
+{
+ pid_t child_pid;
+ long page_size = self->page_size;
+ char *p;
+
+ ASSERT_EQ(cgroup_add_proc(self->cg_path, getpid()), 0);
+ p = mmap(NULL, 32*page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(p, MAP_FAILED);
+
+ ASSERT_EQ(cgroup_pins(self->cg_path), 0);
+ memset(p, 0, 16*page_size);
+ ASSERT_EQ(mlock(p, page_size), 0);
+ ASSERT_EQ(cgroup_pins(self->cg_path), 1);
+ ASSERT_EQ(mlock(p + page_size, page_size), 0);
+ ASSERT_EQ(cgroup_pins(self->cg_path), 2);
+ ASSERT_EQ(mlock(p, page_size), 0);
+ ASSERT_EQ(cgroup_pins(self->cg_path), 2);
+ ASSERT_EQ(mlock(p, 4*page_size), 0);
+ ASSERT_EQ(cgroup_pins(self->cg_path), 4);
+ ASSERT_EQ(munlock(p + 2*page_size, 2*page_size), 0);
+ ASSERT_EQ(cgroup_pins(self->cg_path), 2);
+ ASSERT_EQ(cgroup_set_limit(self->cg_path, 8), 0);
+ ASSERT_EQ(mlock(p, 16*page_size), -1);
+ ASSERT_EQ(errno, ENOMEM);
+ ASSERT_EQ(cgroup_pins(self->cg_path), 2);
+ ASSERT_EQ(cgroup_set_limit(self->cg_path, PINS_MAX), 0);
+
+ /* check mremap() a locked region correctly accounts locked pages */
+ ASSERT_EQ(mlock(p, 32*page_size), 0);
+ ASSERT_EQ(cgroup_pins(self->cg_path), 32);
+ p = mremap(p, 32*page_size, 64*page_size, MREMAP_MAYMOVE);
+ ASSERT_NE(p, MAP_FAILED);
+ ASSERT_EQ(cgroup_pins(self->cg_path), 64);
+ ASSERT_EQ(munmap(p + 32*page_size, 32*page_size), 0);
+ ASSERT_EQ(cgroup_pins(self->cg_path), 32);
+ p = mremap(p, 32*page_size, 32*page_size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP);
+ ASSERT_NE(p, MAP_FAILED);
+ ASSERT_EQ(cgroup_pins(self->cg_path), 32);
+ ASSERT_EQ(munlock(p, 32*page_size), 0);
+
+ /* mremap() a locked region should fail if limit exceeded */
+ ASSERT_EQ(set_rlim_memlock(32*page_size), 0);
+ ASSERT_EQ(mlock(p, 32*page_size), 0);
+ ASSERT_EQ(mremap(p, 32*page_size, 64*page_size, 0), MAP_FAILED);
+ ASSERT_EQ(munlock(p, 32*page_size), 0);
+
+ /* Exceeds rlimit, expected to fail */
+ ASSERT_EQ(set_rlim_memlock(16*page_size), 0);
+ ASSERT_EQ(mlock(p, 32*page_size), -1);
+ ASSERT_EQ(errno, ENOMEM);
+
+ /* memory in the child isn't locked so shouldn't increase pin_cg count */
+ ASSERT_EQ(mlock(p, 16*page_size), 0);
+ child_pid = fork();
+ if (!child_pid) {
+ ASSERT_EQ(cgroup_pins(self->cg_path), 16);
+ ASSERT_EQ(mlock(p, 16*page_size), 0);
+ ASSERT_EQ(cgroup_pins(self->cg_path), 32);
+ return;
+
+ }
+ waitpid(child_pid, NULL, 0);
+
+ /* check that child exit uncharged the pins */
+ ASSERT_EQ(cgroup_pins(self->cg_path), 16);
+}
+
+TEST_F(pins_cg, mmap)
+{
+ char *p;
+
+ ASSERT_EQ(cgroup_add_proc(self->cg_path, getpid()), 0);
+ p = mmap(NULL, 4*self->page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE | MAP_LOCKED, -1, 0);
+ ASSERT_NE(p, MAP_FAILED);
+ ASSERT_EQ(cgroup_pins(self->cg_path), 4);
+}
+
+/*
+ * Test moving to a different cgroup.
+ */
+TEST_F(pins_cg, move_cg)
+{
+ char *p, *new_cg;
+
+ ASSERT_EQ(cgroup_add_proc(self->cg_path, getpid()), 0);
+ p = mmap(NULL, 16*self->page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(p, MAP_FAILED);
+ memset(p, 0, 16*self->page_size);
+ ASSERT_EQ(mlock(p, 16*self->page_size), 0);
+ ASSERT_EQ(cgroup_pins(self->cg_path), 16);
+ ASSERT_NE(new_cg = cgroup_new(), NULL);
+ ASSERT_EQ(cgroup_add_proc(new_cg, getpid()), 0);
+ ASSERT_EQ(cgroup_pins(new_cg), 16);
+ ASSERT_EQ(cgroup_add_proc(self->cg_path, getpid()), 0);
+ rmdir(new_cg);
+}
+TEST_HARNESS_MAIN
--
git-series 0.9.1
^ permalink raw reply related [flat|nested] 108+ messages in thread
* Re: [RFC PATCH 00/19] mm: Introduce a cgroup to limit the amount of locked and pinned memory
2023-01-24 5:42 ` Alistair Popple
@ 2023-01-24 18:26 ` Yosry Ahmed
-1 siblings, 0 replies; 108+ messages in thread
From: Yosry Ahmed @ 2023-01-24 18:26 UTC (permalink / raw)
To: Alistair Popple
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA, jgg-DDmLM1+adcrQT0dZR+AlfA,
jhubbard-DDmLM1+adcrQT0dZR+AlfA, tjmercier-hpIqsD4AKlfQT0dZR+AlfA,
hannes-druUgvl0LCNAfugRpC6u6w, surenb-hpIqsD4AKlfQT0dZR+AlfA,
mkoutny-IBi9RG/b67k, daniel-/w4YWyX8dFk
On Mon, Jan 23, 2023 at 9:43 PM Alistair Popple <apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org> wrote:
>
> Having large amounts of unmovable or unreclaimable memory in a system
> can lead to system instability due to increasing the likelihood of
> encountering out-of-memory conditions. Therefore it is desirable to
> limit the amount of memory users can lock or pin.
>
> From userspace such limits can be enforced by setting
> RLIMIT_MEMLOCK. However there is no standard method that drivers and
> other in-kernel users can use to check and enforce this limit.
>
> This has led to a large number of inconsistencies in how limits are
> enforced. For example some drivers will use mm->locked_vm while others
> will use mm->pinned_vm or user->locked_vm. It is therefore possible to
> have up to three times RLIMIT_MEMLOCK pinned.
>
> Having pinned memory limited per-task also makes it easy for users to
> exceed the limit. For example, memory that drivers pin with
> pin_user_pages() tends to remain pinned after fork. To deal with
> this and other issues this series introduces a cgroup for tracking and
> limiting the number of pages pinned or locked by tasks in the group.
>
> However the existing behaviour with regards to the rlimit needs to be
> maintained. Therefore the lesser of the two limits is
> enforced. Furthermore having CAP_IPC_LOCK usually bypasses the rlimit,
> but this bypass is not allowed for the cgroup.
>
> The first part of this series converts existing drivers which
> open-code the use of locked_mm/pinned_mm over to a common interface
> which manages the refcounts of the associated task/mm/user
> structs. This ensures accounting of pages is consistent and makes it
> easier to add charging of the cgroup.
>
> The second part of the series adds the cgroup and converts core mm
> code such as mlock over to charging the cgroup before finally
> introducing some selftests.
I didn't go through the entire series, so apologies if this was
mentioned somewhere, but do you mind elaborating on why this is added
as a separate cgroup controller rather than an extension of the memory
cgroup controller?
>
>
> As I don't have access to systems with all the various devices I
> haven't been able to test all driver changes. Any help there would be
> appreciated.
>
> Alistair Popple (19):
> mm: Introduce vm_account
> drivers/vhost: Convert to use vm_account
> drivers/vdpa: Convert vdpa to use the new vm_structure
> infiniband/umem: Convert to use vm_account
> RMDA/siw: Convert to use vm_account
> RDMA/usnic: convert to use vm_account
> vfio/type1: Charge pinned pages to pinned_vm instead of locked_vm
> vfio/spapr_tce: Convert accounting to pinned_vm
> io_uring: convert to use vm_account
> net: skb: Switch to using vm_account
> xdp: convert to use vm_account
> kvm/book3s_64_vio: Convert account_locked_vm() to vm_account_pinned()
> fpga: dfl: afu: convert to use vm_account
> mm: Introduce a cgroup for pinned memory
> mm/util: Extend vm_account to charge pages against the pin cgroup
> mm/util: Refactor account_locked_vm
> mm: Convert mmap and mlock to use account_locked_vm
> mm/mmap: Charge locked memory to pins cgroup
> selftests/vm: Add pins-cgroup selftest for mlock/mmap
>
> MAINTAINERS | 8 +-
> arch/powerpc/kvm/book3s_64_vio.c | 10 +-
> arch/powerpc/mm/book3s64/iommu_api.c | 29 +--
> drivers/fpga/dfl-afu-dma-region.c | 11 +-
> drivers/fpga/dfl-afu.h | 1 +-
> drivers/infiniband/core/umem.c | 16 +-
> drivers/infiniband/core/umem_odp.c | 6 +-
> drivers/infiniband/hw/usnic/usnic_uiom.c | 13 +-
> drivers/infiniband/hw/usnic/usnic_uiom.h | 1 +-
> drivers/infiniband/sw/siw/siw.h | 2 +-
> drivers/infiniband/sw/siw/siw_mem.c | 20 +--
> drivers/infiniband/sw/siw/siw_verbs.c | 15 +-
> drivers/vdpa/vdpa_user/vduse_dev.c | 20 +--
> drivers/vfio/vfio_iommu_spapr_tce.c | 15 +-
> drivers/vfio/vfio_iommu_type1.c | 59 +----
> drivers/vhost/vdpa.c | 9 +-
> drivers/vhost/vhost.c | 2 +-
> drivers/vhost/vhost.h | 1 +-
> include/linux/cgroup.h | 20 ++-
> include/linux/cgroup_subsys.h | 4 +-
> include/linux/io_uring_types.h | 3 +-
> include/linux/kvm_host.h | 1 +-
> include/linux/mm.h | 5 +-
> include/linux/mm_types.h | 88 ++++++++-
> include/linux/skbuff.h | 6 +-
> include/net/sock.h | 2 +-
> include/net/xdp_sock.h | 2 +-
> include/rdma/ib_umem.h | 1 +-
> io_uring/io_uring.c | 20 +--
> io_uring/notif.c | 4 +-
> io_uring/notif.h | 10 +-
> io_uring/rsrc.c | 38 +---
> io_uring/rsrc.h | 9 +-
> mm/Kconfig | 11 +-
> mm/Makefile | 1 +-
> mm/internal.h | 2 +-
> mm/mlock.c | 76 +------
> mm/mmap.c | 76 +++----
> mm/mremap.c | 54 +++--
> mm/pins_cgroup.c | 273 ++++++++++++++++++++++++-
> mm/secretmem.c | 6 +-
> mm/util.c | 196 +++++++++++++++--
> net/core/skbuff.c | 47 +---
> net/rds/message.c | 9 +-
> net/xdp/xdp_umem.c | 38 +--
> tools/testing/selftests/vm/Makefile | 1 +-
> tools/testing/selftests/vm/pins-cgroup.c | 271 ++++++++++++++++++++++++-
> virt/kvm/kvm_main.c | 3 +-
> 48 files changed, 1114 insertions(+), 401 deletions(-)
> create mode 100644 mm/pins_cgroup.c
> create mode 100644 tools/testing/selftests/vm/pins-cgroup.c
>
> base-commit: 2241ab53cbb5cdb08a6b2d4688feb13971058f65
> --
> git-series 0.9.1
>
^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [RFC PATCH 00/19] mm: Introduce a cgroup to limit the amount of locked and pinned memory
@ 2023-01-24 18:26 ` Yosry Ahmed
0 siblings, 0 replies; 108+ messages in thread
From: Yosry Ahmed @ 2023-01-24 18:26 UTC (permalink / raw)
To: Alistair Popple
Cc: linux-mm, cgroups, linux-kernel, jgg, jhubbard, tjmercier, hannes,
surenb, mkoutny, daniel
On Mon, Jan 23, 2023 at 9:43 PM Alistair Popple <apopple@nvidia.com> wrote:
>
> Having large amounts of unmovable or unreclaimable memory in a system
> can lead to system instability due to increasing the likelihood of
> encountering out-of-memory conditions. Therefore it is desirable to
> limit the amount of memory users can lock or pin.
>
> From userspace such limits can be enforced by setting
> RLIMIT_MEMLOCK. However there is no standard method that drivers and
> other in-kernel users can use to check and enforce this limit.
>
> This has led to a large number of inconsistencies in how limits are
> enforced. For example some drivers will use mm->locked_vm while others
> will use mm->pinned_vm or user->locked_vm. It is therefore possible to
> have up to three times RLIMIT_MEMLOCK pinned.
>
> Having pinned memory limited per-task also makes it easy for users to
> exceed the limit. For example, memory that drivers pin with
> pin_user_pages() tends to remain pinned after fork. To deal with
> this and other issues this series introduces a cgroup for tracking and
> limiting the number of pages pinned or locked by tasks in the group.
>
> However the existing behaviour with regards to the rlimit needs to be
> maintained. Therefore the lesser of the two limits is
> enforced. Furthermore having CAP_IPC_LOCK usually bypasses the rlimit,
> but this bypass is not allowed for the cgroup.
>
> The first part of this series converts existing drivers which
> open-code the use of locked_mm/pinned_mm over to a common interface
> which manages the refcounts of the associated task/mm/user
> structs. This ensures accounting of pages is consistent and makes it
> easier to add charging of the cgroup.
>
> The second part of the series adds the cgroup and converts core mm
> code such as mlock over to charging the cgroup before finally
> introducing some selftests.
I didn't go through the entire series, so apologies if this was
mentioned somewhere, but do you mind elaborating on why this is added
as a separate cgroup controller rather than an extension of the memory
cgroup controller?
>
>
> As I don't have access to systems with all the various devices I
> haven't been able to test all driver changes. Any help there would be
> appreciated.
>
> Alistair Popple (19):
> mm: Introduce vm_account
> drivers/vhost: Convert to use vm_account
> drivers/vdpa: Convert vdpa to use the new vm_structure
> infiniband/umem: Convert to use vm_account
> RMDA/siw: Convert to use vm_account
> RDMA/usnic: convert to use vm_account
> vfio/type1: Charge pinned pages to pinned_vm instead of locked_vm
> vfio/spapr_tce: Convert accounting to pinned_vm
> io_uring: convert to use vm_account
> net: skb: Switch to using vm_account
> xdp: convert to use vm_account
> kvm/book3s_64_vio: Convert account_locked_vm() to vm_account_pinned()
> fpga: dfl: afu: convert to use vm_account
> mm: Introduce a cgroup for pinned memory
> mm/util: Extend vm_account to charge pages against the pin cgroup
> mm/util: Refactor account_locked_vm
> mm: Convert mmap and mlock to use account_locked_vm
> mm/mmap: Charge locked memory to pins cgroup
> selftests/vm: Add pins-cgroup selftest for mlock/mmap
>
> MAINTAINERS | 8 +-
> arch/powerpc/kvm/book3s_64_vio.c | 10 +-
> arch/powerpc/mm/book3s64/iommu_api.c | 29 +--
> drivers/fpga/dfl-afu-dma-region.c | 11 +-
> drivers/fpga/dfl-afu.h | 1 +-
> drivers/infiniband/core/umem.c | 16 +-
> drivers/infiniband/core/umem_odp.c | 6 +-
> drivers/infiniband/hw/usnic/usnic_uiom.c | 13 +-
> drivers/infiniband/hw/usnic/usnic_uiom.h | 1 +-
> drivers/infiniband/sw/siw/siw.h | 2 +-
> drivers/infiniband/sw/siw/siw_mem.c | 20 +--
> drivers/infiniband/sw/siw/siw_verbs.c | 15 +-
> drivers/vdpa/vdpa_user/vduse_dev.c | 20 +--
> drivers/vfio/vfio_iommu_spapr_tce.c | 15 +-
> drivers/vfio/vfio_iommu_type1.c | 59 +----
> drivers/vhost/vdpa.c | 9 +-
> drivers/vhost/vhost.c | 2 +-
> drivers/vhost/vhost.h | 1 +-
> include/linux/cgroup.h | 20 ++-
> include/linux/cgroup_subsys.h | 4 +-
> include/linux/io_uring_types.h | 3 +-
> include/linux/kvm_host.h | 1 +-
> include/linux/mm.h | 5 +-
> include/linux/mm_types.h | 88 ++++++++-
> include/linux/skbuff.h | 6 +-
> include/net/sock.h | 2 +-
> include/net/xdp_sock.h | 2 +-
> include/rdma/ib_umem.h | 1 +-
> io_uring/io_uring.c | 20 +--
> io_uring/notif.c | 4 +-
> io_uring/notif.h | 10 +-
> io_uring/rsrc.c | 38 +---
> io_uring/rsrc.h | 9 +-
> mm/Kconfig | 11 +-
> mm/Makefile | 1 +-
> mm/internal.h | 2 +-
> mm/mlock.c | 76 +------
> mm/mmap.c | 76 +++----
> mm/mremap.c | 54 +++--
> mm/pins_cgroup.c | 273 ++++++++++++++++++++++++-
> mm/secretmem.c | 6 +-
> mm/util.c | 196 +++++++++++++++--
> net/core/skbuff.c | 47 +---
> net/rds/message.c | 9 +-
> net/xdp/xdp_umem.c | 38 +--
> tools/testing/selftests/vm/Makefile | 1 +-
> tools/testing/selftests/vm/pins-cgroup.c | 271 ++++++++++++++++++++++++-
> virt/kvm/kvm_main.c | 3 +-
> 48 files changed, 1114 insertions(+), 401 deletions(-)
> create mode 100644 mm/pins_cgroup.c
> create mode 100644 tools/testing/selftests/vm/pins-cgroup.c
>
> base-commit: 2241ab53cbb5cdb08a6b2d4688feb13971058f65
> --
> git-series 0.9.1
>
^ permalink raw reply [flat|nested] 108+ messages in thread
[parent not found: <CAJD7tkavoSu9WOnw4Nbxz41nq+Rm6Sq5EeOjh3CTyA=AT5=ujg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>]
* Re: [RFC PATCH 00/19] mm: Introduce a cgroup to limit the amount of locked and pinned memory
2023-01-24 18:26 ` Yosry Ahmed
@ 2023-01-31 0:54 ` Alistair Popple
-1 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-31 0:54 UTC (permalink / raw)
To: Yosry Ahmed
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA, jgg-DDmLM1+adcrQT0dZR+AlfA,
jhubbard-DDmLM1+adcrQT0dZR+AlfA, tjmercier-hpIqsD4AKlfQT0dZR+AlfA,
hannes-druUgvl0LCNAfugRpC6u6w, surenb-hpIqsD4AKlfQT0dZR+AlfA,
mkoutny-IBi9RG/b67k, daniel-/w4YWyX8dFk
Yosry Ahmed <yosryahmed-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> writes:
> On Mon, Jan 23, 2023 at 9:43 PM Alistair Popple <apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org> wrote:
>>
>> Having large amounts of unmovable or unreclaimable memory in a system
>> can lead to system instability due to increasing the likelihood of
>> encountering out-of-memory conditions. Therefore it is desirable to
>> limit the amount of memory users can lock or pin.
>>
>> From userspace such limits can be enforced by setting
>> RLIMIT_MEMLOCK. However there is no standard method that drivers and
>> other in-kernel users can use to check and enforce this limit.
>>
>> This has led to a large number of inconsistencies in how limits are
>> enforced. For example some drivers will use mm->locked_vm while others
>> will use mm->pinned_vm or user->locked_vm. It is therefore possible to
>> have up to three times RLIMIT_MEMLOCK pinned.
>>
>> Having pinned memory limited per-task also makes it easy for users to
>> exceed the limit. For example drivers that pin memory with
>> pin_user_pages() it tends to remain pinned after fork. To deal with
>> this and other issues this series introduces a cgroup for tracking and
>> limiting the number of pages pinned or locked by tasks in the group.
>>
>> However the existing behaviour with regards to the rlimit needs to be
>> maintained. Therefore the lesser of the two limits is
>> enforced. Furthermore having CAP_IPC_LOCK usually bypasses the rlimit,
>> but this bypass is not allowed for the cgroup.
>>
>> The first part of this series converts existing drivers which
>> open-code the use of locked_mm/pinned_mm over to a common interface
>> which manages the refcounts of the associated task/mm/user
>> structs. This ensures accounting of pages is consistent and makes it
>> easier to add charging of the cgroup.
>>
>> The second part of the series adds the cgroup and converts core mm
>> code such as mlock over to charging the cgroup before finally
>> introducing some selftests.
>
>
> I didn't go through the entire series, so apologies if this was
> mentioned somewhere, but do you mind elaborating on why this is added
> as a separate cgroup controller rather than an extension of the memory
> cgroup controller?
One of my early prototypes actually did add this to the memcg
controller. However pinned pages fall under their own limit, and we
wanted to always account pages to the cgroup of the task using the
driver rather than say folio_memcg(). So adding it to memcg didn't seem
to have much benefit as we didn't end up using any of the infrastructure
provided by memcg. Hence I thought it was clearer to just add it as its
own controller.
- Alistair
>>
>>
>> As I don't have access to systems with all the various devices I
>> haven't been able to test all driver changes. Any help there would be
>> appreciated.
>>
>> Alistair Popple (19):
>> mm: Introduce vm_account
>> drivers/vhost: Convert to use vm_account
>> drivers/vdpa: Convert vdpa to use the new vm_structure
>> infiniband/umem: Convert to use vm_account
>> RMDA/siw: Convert to use vm_account
>> RDMA/usnic: convert to use vm_account
>> vfio/type1: Charge pinned pages to pinned_vm instead of locked_vm
>> vfio/spapr_tce: Convert accounting to pinned_vm
>> io_uring: convert to use vm_account
>> net: skb: Switch to using vm_account
>> xdp: convert to use vm_account
>> kvm/book3s_64_vio: Convert account_locked_vm() to vm_account_pinned()
>> fpga: dfl: afu: convert to use vm_account
>> mm: Introduce a cgroup for pinned memory
>> mm/util: Extend vm_account to charge pages against the pin cgroup
>> mm/util: Refactor account_locked_vm
>> mm: Convert mmap and mlock to use account_locked_vm
>> mm/mmap: Charge locked memory to pins cgroup
>> selftests/vm: Add pins-cgroup selftest for mlock/mmap
>>
>> MAINTAINERS | 8 +-
>> arch/powerpc/kvm/book3s_64_vio.c | 10 +-
>> arch/powerpc/mm/book3s64/iommu_api.c | 29 +--
>> drivers/fpga/dfl-afu-dma-region.c | 11 +-
>> drivers/fpga/dfl-afu.h | 1 +-
>> drivers/infiniband/core/umem.c | 16 +-
>> drivers/infiniband/core/umem_odp.c | 6 +-
>> drivers/infiniband/hw/usnic/usnic_uiom.c | 13 +-
>> drivers/infiniband/hw/usnic/usnic_uiom.h | 1 +-
>> drivers/infiniband/sw/siw/siw.h | 2 +-
>> drivers/infiniband/sw/siw/siw_mem.c | 20 +--
>> drivers/infiniband/sw/siw/siw_verbs.c | 15 +-
>> drivers/vdpa/vdpa_user/vduse_dev.c | 20 +--
>> drivers/vfio/vfio_iommu_spapr_tce.c | 15 +-
>> drivers/vfio/vfio_iommu_type1.c | 59 +----
>> drivers/vhost/vdpa.c | 9 +-
>> drivers/vhost/vhost.c | 2 +-
>> drivers/vhost/vhost.h | 1 +-
>> include/linux/cgroup.h | 20 ++-
>> include/linux/cgroup_subsys.h | 4 +-
>> include/linux/io_uring_types.h | 3 +-
>> include/linux/kvm_host.h | 1 +-
>> include/linux/mm.h | 5 +-
>> include/linux/mm_types.h | 88 ++++++++-
>> include/linux/skbuff.h | 6 +-
>> include/net/sock.h | 2 +-
>> include/net/xdp_sock.h | 2 +-
>> include/rdma/ib_umem.h | 1 +-
>> io_uring/io_uring.c | 20 +--
>> io_uring/notif.c | 4 +-
>> io_uring/notif.h | 10 +-
>> io_uring/rsrc.c | 38 +---
>> io_uring/rsrc.h | 9 +-
>> mm/Kconfig | 11 +-
>> mm/Makefile | 1 +-
>> mm/internal.h | 2 +-
>> mm/mlock.c | 76 +------
>> mm/mmap.c | 76 +++----
>> mm/mremap.c | 54 +++--
>> mm/pins_cgroup.c | 273 ++++++++++++++++++++++++-
>> mm/secretmem.c | 6 +-
>> mm/util.c | 196 +++++++++++++++--
>> net/core/skbuff.c | 47 +---
>> net/rds/message.c | 9 +-
>> net/xdp/xdp_umem.c | 38 +--
>> tools/testing/selftests/vm/Makefile | 1 +-
>> tools/testing/selftests/vm/pins-cgroup.c | 271 ++++++++++++++++++++++++-
>> virt/kvm/kvm_main.c | 3 +-
>> 48 files changed, 1114 insertions(+), 401 deletions(-)
>> create mode 100644 mm/pins_cgroup.c
>> create mode 100644 tools/testing/selftests/vm/pins-cgroup.c
>>
>> base-commit: 2241ab53cbb5cdb08a6b2d4688feb13971058f65
>> --
>> git-series 0.9.1
>>
^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [RFC PATCH 00/19] mm: Introduce a cgroup to limit the amount of locked and pinned memory
@ 2023-01-31 0:54 ` Alistair Popple
0 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-31 0:54 UTC (permalink / raw)
To: Yosry Ahmed
Cc: linux-mm, cgroups, linux-kernel, jgg, jhubbard, tjmercier, hannes,
surenb, mkoutny, daniel
Yosry Ahmed <yosryahmed@google.com> writes:
> On Mon, Jan 23, 2023 at 9:43 PM Alistair Popple <apopple@nvidia.com> wrote:
>>
>> Having large amounts of unmovable or unreclaimable memory in a system
>> can lead to system instability due to increasing the likelihood of
>> encountering out-of-memory conditions. Therefore it is desirable to
>> limit the amount of memory users can lock or pin.
>>
>> From userspace such limits can be enforced by setting
>> RLIMIT_MEMLOCK. However there is no standard method that drivers and
>> other in-kernel users can use to check and enforce this limit.
>>
>> This has led to a large number of inconsistencies in how limits are
>> enforced. For example some drivers will use mm->locked_vm while others
>> will use mm->pinned_vm or user->locked_vm. It is therefore possible to
>> have up to three times RLIMIT_MEMLOCK pinned.
>>
>> Having pinned memory limited per-task also makes it easy for users to
>> exceed the limit. For example drivers that pin memory with
>> pin_user_pages() it tends to remain pinned after fork. To deal with
>> this and other issues this series introduces a cgroup for tracking and
>> limiting the number of pages pinned or locked by tasks in the group.
>>
>> However the existing behaviour with regards to the rlimit needs to be
>> maintained. Therefore the lesser of the two limits is
>> enforced. Furthermore having CAP_IPC_LOCK usually bypasses the rlimit,
>> but this bypass is not allowed for the cgroup.
>>
>> The first part of this series converts existing drivers which
>> open-code the use of locked_mm/pinned_mm over to a common interface
>> which manages the refcounts of the associated task/mm/user
>> structs. This ensures accounting of pages is consistent and makes it
>> easier to add charging of the cgroup.
>>
>> The second part of the series adds the cgroup and converts core mm
>> code such as mlock over to charging the cgroup before finally
>> introducing some selftests.
>
>
> I didn't go through the entire series, so apologies if this was
> mentioned somewhere, but do you mind elaborating on why this is added
> as a separate cgroup controller rather than an extension of the memory
> cgroup controller?
One of my early prototypes actually did add this to the memcg
controller. However pinned pages fall under their own limit, and we
wanted to always account pages to the cgroup of the task using the
driver rather than say folio_memcg(). So adding it to memcg didn't seem
to have much benefit as we didn't end up using any of the infrastructure
provided by memcg. Hence I thought it was clearer to just add it as its
own controller.
- Alistair
>>
>>
>> As I don't have access to systems with all the various devices I
>> haven't been able to test all driver changes. Any help there would be
>> appreciated.
>>
>> Alistair Popple (19):
>> mm: Introduce vm_account
>> drivers/vhost: Convert to use vm_account
>> drivers/vdpa: Convert vdpa to use the new vm_structure
>> infiniband/umem: Convert to use vm_account
>> RMDA/siw: Convert to use vm_account
>> RDMA/usnic: convert to use vm_account
>> vfio/type1: Charge pinned pages to pinned_vm instead of locked_vm
>> vfio/spapr_tce: Convert accounting to pinned_vm
>> io_uring: convert to use vm_account
>> net: skb: Switch to using vm_account
>> xdp: convert to use vm_account
>> kvm/book3s_64_vio: Convert account_locked_vm() to vm_account_pinned()
>> fpga: dfl: afu: convert to use vm_account
>> mm: Introduce a cgroup for pinned memory
>> mm/util: Extend vm_account to charge pages against the pin cgroup
>> mm/util: Refactor account_locked_vm
>> mm: Convert mmap and mlock to use account_locked_vm
>> mm/mmap: Charge locked memory to pins cgroup
>> selftests/vm: Add pins-cgroup selftest for mlock/mmap
>>
>> MAINTAINERS | 8 +-
>> arch/powerpc/kvm/book3s_64_vio.c | 10 +-
>> arch/powerpc/mm/book3s64/iommu_api.c | 29 +--
>> drivers/fpga/dfl-afu-dma-region.c | 11 +-
>> drivers/fpga/dfl-afu.h | 1 +-
>> drivers/infiniband/core/umem.c | 16 +-
>> drivers/infiniband/core/umem_odp.c | 6 +-
>> drivers/infiniband/hw/usnic/usnic_uiom.c | 13 +-
>> drivers/infiniband/hw/usnic/usnic_uiom.h | 1 +-
>> drivers/infiniband/sw/siw/siw.h | 2 +-
>> drivers/infiniband/sw/siw/siw_mem.c | 20 +--
>> drivers/infiniband/sw/siw/siw_verbs.c | 15 +-
>> drivers/vdpa/vdpa_user/vduse_dev.c | 20 +--
>> drivers/vfio/vfio_iommu_spapr_tce.c | 15 +-
>> drivers/vfio/vfio_iommu_type1.c | 59 +----
>> drivers/vhost/vdpa.c | 9 +-
>> drivers/vhost/vhost.c | 2 +-
>> drivers/vhost/vhost.h | 1 +-
>> include/linux/cgroup.h | 20 ++-
>> include/linux/cgroup_subsys.h | 4 +-
>> include/linux/io_uring_types.h | 3 +-
>> include/linux/kvm_host.h | 1 +-
>> include/linux/mm.h | 5 +-
>> include/linux/mm_types.h | 88 ++++++++-
>> include/linux/skbuff.h | 6 +-
>> include/net/sock.h | 2 +-
>> include/net/xdp_sock.h | 2 +-
>> include/rdma/ib_umem.h | 1 +-
>> io_uring/io_uring.c | 20 +--
>> io_uring/notif.c | 4 +-
>> io_uring/notif.h | 10 +-
>> io_uring/rsrc.c | 38 +---
>> io_uring/rsrc.h | 9 +-
>> mm/Kconfig | 11 +-
>> mm/Makefile | 1 +-
>> mm/internal.h | 2 +-
>> mm/mlock.c | 76 +------
>> mm/mmap.c | 76 +++----
>> mm/mremap.c | 54 +++--
>> mm/pins_cgroup.c | 273 ++++++++++++++++++++++++-
>> mm/secretmem.c | 6 +-
>> mm/util.c | 196 +++++++++++++++--
>> net/core/skbuff.c | 47 +---
>> net/rds/message.c | 9 +-
>> net/xdp/xdp_umem.c | 38 +--
>> tools/testing/selftests/vm/Makefile | 1 +-
>> tools/testing/selftests/vm/pins-cgroup.c | 271 ++++++++++++++++++++++++-
>> virt/kvm/kvm_main.c | 3 +-
>> 48 files changed, 1114 insertions(+), 401 deletions(-)
>> create mode 100644 mm/pins_cgroup.c
>> create mode 100644 tools/testing/selftests/vm/pins-cgroup.c
>>
>> base-commit: 2241ab53cbb5cdb08a6b2d4688feb13971058f65
>> --
>> git-series 0.9.1
>>
^ permalink raw reply [flat|nested] 108+ messages in thread
[parent not found: <874js7zf38.fsf-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org>]
* Re: [RFC PATCH 00/19] mm: Introduce a cgroup to limit the amount of locked and pinned memory
2023-01-31 0:54 ` Alistair Popple
@ 2023-01-31 5:14 ` Yosry Ahmed
-1 siblings, 0 replies; 108+ messages in thread
From: Yosry Ahmed @ 2023-01-31 5:14 UTC (permalink / raw)
To: Alistair Popple
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA, jgg-DDmLM1+adcrQT0dZR+AlfA,
jhubbard-DDmLM1+adcrQT0dZR+AlfA, tjmercier-hpIqsD4AKlfQT0dZR+AlfA,
hannes-druUgvl0LCNAfugRpC6u6w, surenb-hpIqsD4AKlfQT0dZR+AlfA,
mkoutny-IBi9RG/b67k, daniel-/w4YWyX8dFk
On Mon, Jan 30, 2023 at 5:07 PM Alistair Popple <apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org> wrote:
>
>
> Yosry Ahmed <yosryahmed-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> writes:
>
> > On Mon, Jan 23, 2023 at 9:43 PM Alistair Popple <apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org> wrote:
> >>
> >> Having large amounts of unmovable or unreclaimable memory in a system
> >> can lead to system instability due to increasing the likelihood of
> >> encountering out-of-memory conditions. Therefore it is desirable to
> >> limit the amount of memory users can lock or pin.
> >>
> >> From userspace such limits can be enforced by setting
> >> RLIMIT_MEMLOCK. However there is no standard method that drivers and
> >> other in-kernel users can use to check and enforce this limit.
> >>
> >> This has led to a large number of inconsistencies in how limits are
> >> enforced. For example some drivers will use mm->locked_vm while others
> >> will use mm->pinned_vm or user->locked_vm. It is therefore possible to
> >> have up to three times RLIMIT_MEMLOCK pinned.
> >>
> >> Having pinned memory limited per-task also makes it easy for users to
> >> exceed the limit. For example drivers that pin memory with
> >> pin_user_pages() it tends to remain pinned after fork. To deal with
> >> this and other issues this series introduces a cgroup for tracking and
> >> limiting the number of pages pinned or locked by tasks in the group.
> >>
> >> However the existing behaviour with regards to the rlimit needs to be
> >> maintained. Therefore the lesser of the two limits is
> >> enforced. Furthermore having CAP_IPC_LOCK usually bypasses the rlimit,
> >> but this bypass is not allowed for the cgroup.
> >>
> >> The first part of this series converts existing drivers which
> >> open-code the use of locked_mm/pinned_mm over to a common interface
> >> which manages the refcounts of the associated task/mm/user
> >> structs. This ensures accounting of pages is consistent and makes it
> >> easier to add charging of the cgroup.
> >>
> >> The second part of the series adds the cgroup and converts core mm
> >> code such as mlock over to charging the cgroup before finally
> >> introducing some selftests.
> >
> >
> > I didn't go through the entire series, so apologies if this was
> > mentioned somewhere, but do you mind elaborating on why this is added
> > as a separate cgroup controller rather than an extension of the memory
> > cgroup controller?
>
> One of my early prototypes actually did add this to the memcg
> controller. However pinned pages fall under their own limit, and we
> wanted to always account pages to the cgroup of the task using the
> driver rather than say folio_memcg(). So adding it to memcg didn't seem
> to have much benefit as we didn't end up using any of the infrastructure
> provided by memcg. Hence I thought it was clearer to just add it as its
> own controller.
To clarify, you account and limit pinned memory based on the cgroup of
the process pinning the pages, not based on the cgroup that the pages
are actually charged to? Is my understanding correct?
IOW, you limit the amount of memory that processes in a cgroup can
pin, not the amount of memory charged to a cgroup that can be pinned?
>
> - Alistair
>
> >>
> >>
> >> As I don't have access to systems with all the various devices I
> >> haven't been able to test all driver changes. Any help there would be
> >> appreciated.
> >>
> >> Alistair Popple (19):
> >> mm: Introduce vm_account
> >> drivers/vhost: Convert to use vm_account
> >> drivers/vdpa: Convert vdpa to use the new vm_structure
> >> infiniband/umem: Convert to use vm_account
> >> RMDA/siw: Convert to use vm_account
> >> RDMA/usnic: convert to use vm_account
> >> vfio/type1: Charge pinned pages to pinned_vm instead of locked_vm
> >> vfio/spapr_tce: Convert accounting to pinned_vm
> >> io_uring: convert to use vm_account
> >> net: skb: Switch to using vm_account
> >> xdp: convert to use vm_account
> >> kvm/book3s_64_vio: Convert account_locked_vm() to vm_account_pinned()
> >> fpga: dfl: afu: convert to use vm_account
> >> mm: Introduce a cgroup for pinned memory
> >> mm/util: Extend vm_account to charge pages against the pin cgroup
> >> mm/util: Refactor account_locked_vm
> >> mm: Convert mmap and mlock to use account_locked_vm
> >> mm/mmap: Charge locked memory to pins cgroup
> >> selftests/vm: Add pins-cgroup selftest for mlock/mmap
> >>
> >> MAINTAINERS | 8 +-
> >> arch/powerpc/kvm/book3s_64_vio.c | 10 +-
> >> arch/powerpc/mm/book3s64/iommu_api.c | 29 +--
> >> drivers/fpga/dfl-afu-dma-region.c | 11 +-
> >> drivers/fpga/dfl-afu.h | 1 +-
> >> drivers/infiniband/core/umem.c | 16 +-
> >> drivers/infiniband/core/umem_odp.c | 6 +-
> >> drivers/infiniband/hw/usnic/usnic_uiom.c | 13 +-
> >> drivers/infiniband/hw/usnic/usnic_uiom.h | 1 +-
> >> drivers/infiniband/sw/siw/siw.h | 2 +-
> >> drivers/infiniband/sw/siw/siw_mem.c | 20 +--
> >> drivers/infiniband/sw/siw/siw_verbs.c | 15 +-
> >> drivers/vdpa/vdpa_user/vduse_dev.c | 20 +--
> >> drivers/vfio/vfio_iommu_spapr_tce.c | 15 +-
> >> drivers/vfio/vfio_iommu_type1.c | 59 +----
> >> drivers/vhost/vdpa.c | 9 +-
> >> drivers/vhost/vhost.c | 2 +-
> >> drivers/vhost/vhost.h | 1 +-
> >> include/linux/cgroup.h | 20 ++-
> >> include/linux/cgroup_subsys.h | 4 +-
> >> include/linux/io_uring_types.h | 3 +-
> >> include/linux/kvm_host.h | 1 +-
> >> include/linux/mm.h | 5 +-
> >> include/linux/mm_types.h | 88 ++++++++-
> >> include/linux/skbuff.h | 6 +-
> >> include/net/sock.h | 2 +-
> >> include/net/xdp_sock.h | 2 +-
> >> include/rdma/ib_umem.h | 1 +-
> >> io_uring/io_uring.c | 20 +--
> >> io_uring/notif.c | 4 +-
> >> io_uring/notif.h | 10 +-
> >> io_uring/rsrc.c | 38 +---
> >> io_uring/rsrc.h | 9 +-
> >> mm/Kconfig | 11 +-
> >> mm/Makefile | 1 +-
> >> mm/internal.h | 2 +-
> >> mm/mlock.c | 76 +------
> >> mm/mmap.c | 76 +++----
> >> mm/mremap.c | 54 +++--
> >> mm/pins_cgroup.c | 273 ++++++++++++++++++++++++-
> >> mm/secretmem.c | 6 +-
> >> mm/util.c | 196 +++++++++++++++--
> >> net/core/skbuff.c | 47 +---
> >> net/rds/message.c | 9 +-
> >> net/xdp/xdp_umem.c | 38 +--
> >> tools/testing/selftests/vm/Makefile | 1 +-
> >> tools/testing/selftests/vm/pins-cgroup.c | 271 ++++++++++++++++++++++++-
> >> virt/kvm/kvm_main.c | 3 +-
> >> 48 files changed, 1114 insertions(+), 401 deletions(-)
> >> create mode 100644 mm/pins_cgroup.c
> >> create mode 100644 tools/testing/selftests/vm/pins-cgroup.c
> >>
> >> base-commit: 2241ab53cbb5cdb08a6b2d4688feb13971058f65
> >> --
> >> git-series 0.9.1
> >>
>
^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [RFC PATCH 00/19] mm: Introduce a cgroup to limit the amount of locked and pinned memory
@ 2023-01-31 5:14 ` Yosry Ahmed
0 siblings, 0 replies; 108+ messages in thread
From: Yosry Ahmed @ 2023-01-31 5:14 UTC (permalink / raw)
To: Alistair Popple
Cc: linux-mm, cgroups, linux-kernel, jgg, jhubbard, tjmercier, hannes,
surenb, mkoutny, daniel
On Mon, Jan 30, 2023 at 5:07 PM Alistair Popple <apopple@nvidia.com> wrote:
>
>
> Yosry Ahmed <yosryahmed@google.com> writes:
>
> > On Mon, Jan 23, 2023 at 9:43 PM Alistair Popple <apopple@nvidia.com> wrote:
> >>
> >> Having large amounts of unmovable or unreclaimable memory in a system
> >> can lead to system instability due to increasing the likelihood of
> >> encountering out-of-memory conditions. Therefore it is desirable to
> >> limit the amount of memory users can lock or pin.
> >>
> >> From userspace such limits can be enforced by setting
> >> RLIMIT_MEMLOCK. However there is no standard method that drivers and
> >> other in-kernel users can use to check and enforce this limit.
> >>
> >> This has led to a large number of inconsistencies in how limits are
> >> enforced. For example some drivers will use mm->locked_vm while others
> >> will use mm->pinned_vm or user->locked_vm. It is therefore possible to
> >> have up to three times RLIMIT_MEMLOCK pinned.
> >>
> >> Having pinned memory limited per-task also makes it easy for users to
> >> exceed the limit. For example drivers that pin memory with
> >> pin_user_pages() it tends to remain pinned after fork. To deal with
> >> this and other issues this series introduces a cgroup for tracking and
> >> limiting the number of pages pinned or locked by tasks in the group.
> >>
> >> However the existing behaviour with regards to the rlimit needs to be
> >> maintained. Therefore the lesser of the two limits is
> >> enforced. Furthermore having CAP_IPC_LOCK usually bypasses the rlimit,
> >> but this bypass is not allowed for the cgroup.
> >>
> >> The first part of this series converts existing drivers which
> >> open-code the use of locked_mm/pinned_mm over to a common interface
> >> which manages the refcounts of the associated task/mm/user
> >> structs. This ensures accounting of pages is consistent and makes it
> >> easier to add charging of the cgroup.
> >>
> >> The second part of the series adds the cgroup and converts core mm
> >> code such as mlock over to charging the cgroup before finally
> >> introducing some selftests.
> >
> >
> > I didn't go through the entire series, so apologies if this was
> > mentioned somewhere, but do you mind elaborating on why this is added
> > as a separate cgroup controller rather than an extension of the memory
> > cgroup controller?
>
> One of my early prototypes actually did add this to the memcg
> controller. However pinned pages fall under their own limit, and we
> wanted to always account pages to the cgroup of the task using the
> driver rather than say folio_memcg(). So adding it to memcg didn't seem
> to have much benefit as we didn't end up using any of the infrastructure
> provided by memcg. Hence I thought it was clearer to just add it as its
> own controller.
To clarify, you account and limit pinned memory based on the cgroup of
the process pinning the pages, not based on the cgroup that the pages
are actually charged to? Is my understanding correct?
IOW, you limit the amount of memory that processes in a cgroup can
pin, not the amount of memory charged to a cgroup that can be pinned?
>
> - Alistair
>
> >>
> >>
> >> As I don't have access to systems with all the various devices I
> >> haven't been able to test all driver changes. Any help there would be
> >> appreciated.
> >>
> >> Alistair Popple (19):
> >> mm: Introduce vm_account
> >> drivers/vhost: Convert to use vm_account
> >> drivers/vdpa: Convert vdpa to use the new vm_structure
> >> infiniband/umem: Convert to use vm_account
> >> RMDA/siw: Convert to use vm_account
> >> RDMA/usnic: convert to use vm_account
> >> vfio/type1: Charge pinned pages to pinned_vm instead of locked_vm
> >> vfio/spapr_tce: Convert accounting to pinned_vm
> >> io_uring: convert to use vm_account
> >> net: skb: Switch to using vm_account
> >> xdp: convert to use vm_account
> >> kvm/book3s_64_vio: Convert account_locked_vm() to vm_account_pinned()
> >> fpga: dfl: afu: convert to use vm_account
> >> mm: Introduce a cgroup for pinned memory
> >> mm/util: Extend vm_account to charge pages against the pin cgroup
> >> mm/util: Refactor account_locked_vm
> >> mm: Convert mmap and mlock to use account_locked_vm
> >> mm/mmap: Charge locked memory to pins cgroup
> >> selftests/vm: Add pins-cgroup selftest for mlock/mmap
> >>
> >> MAINTAINERS | 8 +-
> >> arch/powerpc/kvm/book3s_64_vio.c | 10 +-
> >> arch/powerpc/mm/book3s64/iommu_api.c | 29 +--
> >> drivers/fpga/dfl-afu-dma-region.c | 11 +-
> >> drivers/fpga/dfl-afu.h | 1 +-
> >> drivers/infiniband/core/umem.c | 16 +-
> >> drivers/infiniband/core/umem_odp.c | 6 +-
> >> drivers/infiniband/hw/usnic/usnic_uiom.c | 13 +-
> >> drivers/infiniband/hw/usnic/usnic_uiom.h | 1 +-
> >> drivers/infiniband/sw/siw/siw.h | 2 +-
> >> drivers/infiniband/sw/siw/siw_mem.c | 20 +--
> >> drivers/infiniband/sw/siw/siw_verbs.c | 15 +-
> >> drivers/vdpa/vdpa_user/vduse_dev.c | 20 +--
> >> drivers/vfio/vfio_iommu_spapr_tce.c | 15 +-
> >> drivers/vfio/vfio_iommu_type1.c | 59 +----
> >> drivers/vhost/vdpa.c | 9 +-
> >> drivers/vhost/vhost.c | 2 +-
> >> drivers/vhost/vhost.h | 1 +-
> >> include/linux/cgroup.h | 20 ++-
> >> include/linux/cgroup_subsys.h | 4 +-
> >> include/linux/io_uring_types.h | 3 +-
> >> include/linux/kvm_host.h | 1 +-
> >> include/linux/mm.h | 5 +-
> >> include/linux/mm_types.h | 88 ++++++++-
> >> include/linux/skbuff.h | 6 +-
> >> include/net/sock.h | 2 +-
> >> include/net/xdp_sock.h | 2 +-
> >> include/rdma/ib_umem.h | 1 +-
> >> io_uring/io_uring.c | 20 +--
> >> io_uring/notif.c | 4 +-
> >> io_uring/notif.h | 10 +-
> >> io_uring/rsrc.c | 38 +---
> >> io_uring/rsrc.h | 9 +-
> >> mm/Kconfig | 11 +-
> >> mm/Makefile | 1 +-
> >> mm/internal.h | 2 +-
> >> mm/mlock.c | 76 +------
> >> mm/mmap.c | 76 +++----
> >> mm/mremap.c | 54 +++--
> >> mm/pins_cgroup.c | 273 ++++++++++++++++++++++++-
> >> mm/secretmem.c | 6 +-
> >> mm/util.c | 196 +++++++++++++++--
> >> net/core/skbuff.c | 47 +---
> >> net/rds/message.c | 9 +-
> >> net/xdp/xdp_umem.c | 38 +--
> >> tools/testing/selftests/vm/Makefile | 1 +-
> >> tools/testing/selftests/vm/pins-cgroup.c | 271 ++++++++++++++++++++++++-
> >> virt/kvm/kvm_main.c | 3 +-
> >> 48 files changed, 1114 insertions(+), 401 deletions(-)
> >> create mode 100644 mm/pins_cgroup.c
> >> create mode 100644 tools/testing/selftests/vm/pins-cgroup.c
> >>
> >> base-commit: 2241ab53cbb5cdb08a6b2d4688feb13971058f65
> >> --
> >> git-series 0.9.1
> >>
>
^ permalink raw reply [flat|nested] 108+ messages in thread
[parent not found: <CAJD7tkZTvXjoNZYC99yekbA0zHkD4iFj0J3+8dsOMht6rxrRcQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>]
* Re: [RFC PATCH 00/19] mm: Introduce a cgroup to limit the amount of locked and pinned memory
2023-01-31 5:14 ` Yosry Ahmed
@ 2023-01-31 11:22 ` Alistair Popple
-1 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-31 11:22 UTC (permalink / raw)
To: Yosry Ahmed
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA, jgg-DDmLM1+adcrQT0dZR+AlfA,
jhubbard-DDmLM1+adcrQT0dZR+AlfA, tjmercier-hpIqsD4AKlfQT0dZR+AlfA,
hannes-druUgvl0LCNAfugRpC6u6w, surenb-hpIqsD4AKlfQT0dZR+AlfA,
mkoutny-IBi9RG/b67k, daniel-/w4YWyX8dFk
Yosry Ahmed <yosryahmed-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> writes:
> On Mon, Jan 30, 2023 at 5:07 PM Alistair Popple <apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org> wrote:
>>
>>
>> Yosry Ahmed <yosryahmed-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> writes:
>>
>> > On Mon, Jan 23, 2023 at 9:43 PM Alistair Popple <apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org> wrote:
>> >>
>> >> Having large amounts of unmovable or unreclaimable memory in a system
>> >> can lead to system instability due to increasing the likelihood of
>> >> encountering out-of-memory conditions. Therefore it is desirable to
>> >> limit the amount of memory users can lock or pin.
>> >>
>> >> From userspace such limits can be enforced by setting
>> >> RLIMIT_MEMLOCK. However there is no standard method that drivers and
>> >> other in-kernel users can use to check and enforce this limit.
>> >>
>> >> This has led to a large number of inconsistencies in how limits are
>> >> enforced. For example some drivers will use mm->locked_vm while others
>> >> will use mm->pinned_vm or user->locked_vm. It is therefore possible to
>> >> have up to three times RLIMIT_MEMLOCK pinned.
>> >>
>> >> Having pinned memory limited per-task also makes it easy for users to
>> >> exceed the limit. For example drivers that pin memory with
>> >> pin_user_pages() it tends to remain pinned after fork. To deal with
>> >> this and other issues this series introduces a cgroup for tracking and
>> >> limiting the number of pages pinned or locked by tasks in the group.
>> >>
>> >> However the existing behaviour with regards to the rlimit needs to be
>> >> maintained. Therefore the lesser of the two limits is
>> >> enforced. Furthermore having CAP_IPC_LOCK usually bypasses the rlimit,
>> >> but this bypass is not allowed for the cgroup.
>> >>
>> >> The first part of this series converts existing drivers which
>> >> open-code the use of locked_mm/pinned_mm over to a common interface
>> >> which manages the refcounts of the associated task/mm/user
>> >> structs. This ensures accounting of pages is consistent and makes it
>> >> easier to add charging of the cgroup.
>> >>
>> >> The second part of the series adds the cgroup and converts core mm
>> >> code such as mlock over to charging the cgroup before finally
>> >> introducing some selftests.
>> >
>> >
>> > I didn't go through the entire series, so apologies if this was
>> > mentioned somewhere, but do you mind elaborating on why this is added
>> > as a separate cgroup controller rather than an extension of the memory
>> > cgroup controller?
>>
>> One of my early prototypes actually did add this to the memcg
>> controller. However pinned pages fall under their own limit, and we
>> wanted to always account pages to the cgroup of the task using the
>> driver rather than say folio_memcg(). So adding it to memcg didn't seem
>> to have much benefit as we didn't end up using any of the infrastructure
>> provided by memcg. Hence I thought it was clearer to just add it as its
>> own controller.
>
> To clarify, you account and limit pinned memory based on the cgroup of
> the process pinning the pages, not based on the cgroup that the pages
> are actually charged to? Is my understanding correct?
That's correct.
> IOW, you limit the amount of memory that processes in a cgroup can
> pin, not the amount of memory charged to a cgroup that can be pinned?
Right, that's a good clarification which I might steal and add to the
cover letter.
>>
>> - Alistair
>>
>> >>
>> >>
>> >> As I don't have access to systems with all the various devices I
>> >> haven't been able to test all driver changes. Any help there would be
>> >> appreciated.
>> >>
>> >> Alistair Popple (19):
>> >> mm: Introduce vm_account
>> >> drivers/vhost: Convert to use vm_account
>> >> drivers/vdpa: Convert vdpa to use the new vm_structure
>> >> infiniband/umem: Convert to use vm_account
>> >> RMDA/siw: Convert to use vm_account
>> >> RDMA/usnic: convert to use vm_account
>> >> vfio/type1: Charge pinned pages to pinned_vm instead of locked_vm
>> >> vfio/spapr_tce: Convert accounting to pinned_vm
>> >> io_uring: convert to use vm_account
>> >> net: skb: Switch to using vm_account
>> >> xdp: convert to use vm_account
>> >> kvm/book3s_64_vio: Convert account_locked_vm() to vm_account_pinned()
>> >> fpga: dfl: afu: convert to use vm_account
>> >> mm: Introduce a cgroup for pinned memory
>> >> mm/util: Extend vm_account to charge pages against the pin cgroup
>> >> mm/util: Refactor account_locked_vm
>> >> mm: Convert mmap and mlock to use account_locked_vm
>> >> mm/mmap: Charge locked memory to pins cgroup
>> >> selftests/vm: Add pins-cgroup selftest for mlock/mmap
>> >>
>> >> MAINTAINERS | 8 +-
>> >> arch/powerpc/kvm/book3s_64_vio.c | 10 +-
>> >> arch/powerpc/mm/book3s64/iommu_api.c | 29 +--
>> >> drivers/fpga/dfl-afu-dma-region.c | 11 +-
>> >> drivers/fpga/dfl-afu.h | 1 +-
>> >> drivers/infiniband/core/umem.c | 16 +-
>> >> drivers/infiniband/core/umem_odp.c | 6 +-
>> >> drivers/infiniband/hw/usnic/usnic_uiom.c | 13 +-
>> >> drivers/infiniband/hw/usnic/usnic_uiom.h | 1 +-
>> >> drivers/infiniband/sw/siw/siw.h | 2 +-
>> >> drivers/infiniband/sw/siw/siw_mem.c | 20 +--
>> >> drivers/infiniband/sw/siw/siw_verbs.c | 15 +-
>> >> drivers/vdpa/vdpa_user/vduse_dev.c | 20 +--
>> >> drivers/vfio/vfio_iommu_spapr_tce.c | 15 +-
>> >> drivers/vfio/vfio_iommu_type1.c | 59 +----
>> >> drivers/vhost/vdpa.c | 9 +-
>> >> drivers/vhost/vhost.c | 2 +-
>> >> drivers/vhost/vhost.h | 1 +-
>> >> include/linux/cgroup.h | 20 ++-
>> >> include/linux/cgroup_subsys.h | 4 +-
>> >> include/linux/io_uring_types.h | 3 +-
>> >> include/linux/kvm_host.h | 1 +-
>> >> include/linux/mm.h | 5 +-
>> >> include/linux/mm_types.h | 88 ++++++++-
>> >> include/linux/skbuff.h | 6 +-
>> >> include/net/sock.h | 2 +-
>> >> include/net/xdp_sock.h | 2 +-
>> >> include/rdma/ib_umem.h | 1 +-
>> >> io_uring/io_uring.c | 20 +--
>> >> io_uring/notif.c | 4 +-
>> >> io_uring/notif.h | 10 +-
>> >> io_uring/rsrc.c | 38 +---
>> >> io_uring/rsrc.h | 9 +-
>> >> mm/Kconfig | 11 +-
>> >> mm/Makefile | 1 +-
>> >> mm/internal.h | 2 +-
>> >> mm/mlock.c | 76 +------
>> >> mm/mmap.c | 76 +++----
>> >> mm/mremap.c | 54 +++--
>> >> mm/pins_cgroup.c | 273 ++++++++++++++++++++++++-
>> >> mm/secretmem.c | 6 +-
>> >> mm/util.c | 196 +++++++++++++++--
>> >> net/core/skbuff.c | 47 +---
>> >> net/rds/message.c | 9 +-
>> >> net/xdp/xdp_umem.c | 38 +--
>> >> tools/testing/selftests/vm/Makefile | 1 +-
>> >> tools/testing/selftests/vm/pins-cgroup.c | 271 ++++++++++++++++++++++++-
>> >> virt/kvm/kvm_main.c | 3 +-
>> >> 48 files changed, 1114 insertions(+), 401 deletions(-)
>> >> create mode 100644 mm/pins_cgroup.c
>> >> create mode 100644 tools/testing/selftests/vm/pins-cgroup.c
>> >>
>> >> base-commit: 2241ab53cbb5cdb08a6b2d4688feb13971058f65
>> >> --
>> >> git-series 0.9.1
>> >>
>>
^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [RFC PATCH 00/19] mm: Introduce a cgroup to limit the amount of locked and pinned memory
@ 2023-01-31 11:22 ` Alistair Popple
0 siblings, 0 replies; 108+ messages in thread
From: Alistair Popple @ 2023-01-31 11:22 UTC (permalink / raw)
To: Yosry Ahmed
Cc: linux-mm, cgroups, linux-kernel, jgg, jhubbard, tjmercier, hannes,
surenb, mkoutny, daniel
Yosry Ahmed <yosryahmed@google.com> writes:
> On Mon, Jan 30, 2023 at 5:07 PM Alistair Popple <apopple@nvidia.com> wrote:
>>
>>
>> Yosry Ahmed <yosryahmed@google.com> writes:
>>
>> > On Mon, Jan 23, 2023 at 9:43 PM Alistair Popple <apopple@nvidia.com> wrote:
>> >>
>> >> Having large amounts of unmovable or unreclaimable memory in a system
>> >> can lead to system instability due to increasing the likelihood of
>> >> encountering out-of-memory conditions. Therefore it is desirable to
>> >> limit the amount of memory users can lock or pin.
>> >>
>> >> From userspace such limits can be enforced by setting
>> >> RLIMIT_MEMLOCK. However there is no standard method that drivers and
>> >> other in-kernel users can use to check and enforce this limit.
>> >>
>> >> This has led to a large number of inconsistencies in how limits are
>> >> enforced. For example some drivers will use mm->locked_vm while others
>> >> will use mm->pinned_vm or user->locked_vm. It is therefore possible to
>> >> have up to three times RLIMIT_MEMLOCK pinned.
>> >>
>> >> Having pinned memory limited per-task also makes it easy for users to
>> >> exceed the limit. For example, memory that drivers pin with
>> >> pin_user_pages() tends to remain pinned after fork. To deal with
>> >> this and other issues this series introduces a cgroup for tracking and
>> >> limiting the number of pages pinned or locked by tasks in the group.
>> >>
>> >> However the existing behaviour with regards to the rlimit needs to be
>> >> maintained. Therefore the lesser of the two limits is
>> >> enforced. Furthermore having CAP_IPC_LOCK usually bypasses the rlimit,
>> >> but this bypass is not allowed for the cgroup.
>> >>
>> >> The first part of this series converts existing drivers which
>> >> open-code the use of locked_mm/pinned_mm over to a common interface
>> >> which manages the refcounts of the associated task/mm/user
>> >> structs. This ensures accounting of pages is consistent and makes it
>> >> easier to add charging of the cgroup.
>> >>
>> >> The second part of the series adds the cgroup and converts core mm
>> >> code such as mlock over to charging the cgroup before finally
>> >> introducing some selftests.
>> >
>> >
>> > I didn't go through the entire series, so apologies if this was
>> > mentioned somewhere, but do you mind elaborating on why this is added
>> > as a separate cgroup controller rather than an extension of the memory
>> > cgroup controller?
>>
>> One of my early prototypes actually did add this to the memcg
>> controller. However pinned pages fall under their own limit, and we
>> wanted to always account pages to the cgroup of the task using the
>> driver rather than say folio_memcg(). So adding it to memcg didn't seem
>> to have much benefit as we didn't end up using any of the infrastructure
>> provided by memcg. Hence I thought it was clearer to just add it as its
>> own controller.
>
> To clarify, you account and limit pinned memory based on the cgroup of
> the process pinning the pages, not based on the cgroup that the pages
> are actually charged to? Is my understanding correct?
That's correct.
> IOW, you limit the amount of memory that processes in a cgroup can
> pin, not the amount of memory charged to a cgroup that can be pinned?
Right, that's a good clarification which I might steal and add to the
cover letter.
>>
>> - Alistair
>>
>> >>
>> >>
>> >> As I don't have access to systems with all the various devices I
>> >> haven't been able to test all driver changes. Any help there would be
>> >> appreciated.
>> >>
>> >> Alistair Popple (19):
>> >> mm: Introduce vm_account
>> >> drivers/vhost: Convert to use vm_account
>> >> drivers/vdpa: Convert vdpa to use the new vm_structure
>> >> infiniband/umem: Convert to use vm_account
>> >> RMDA/siw: Convert to use vm_account
>> >> RDMA/usnic: convert to use vm_account
>> >> vfio/type1: Charge pinned pages to pinned_vm instead of locked_vm
>> >> vfio/spapr_tce: Convert accounting to pinned_vm
>> >> io_uring: convert to use vm_account
>> >> net: skb: Switch to using vm_account
>> >> xdp: convert to use vm_account
>> >> kvm/book3s_64_vio: Convert account_locked_vm() to vm_account_pinned()
>> >> fpga: dfl: afu: convert to use vm_account
>> >> mm: Introduce a cgroup for pinned memory
>> >> mm/util: Extend vm_account to charge pages against the pin cgroup
>> >> mm/util: Refactor account_locked_vm
>> >> mm: Convert mmap and mlock to use account_locked_vm
>> >> mm/mmap: Charge locked memory to pins cgroup
>> >> selftests/vm: Add pins-cgroup selftest for mlock/mmap
>> >>
>> >> MAINTAINERS | 8 +-
>> >> arch/powerpc/kvm/book3s_64_vio.c | 10 +-
>> >> arch/powerpc/mm/book3s64/iommu_api.c | 29 +--
>> >> drivers/fpga/dfl-afu-dma-region.c | 11 +-
>> >> drivers/fpga/dfl-afu.h | 1 +-
>> >> drivers/infiniband/core/umem.c | 16 +-
>> >> drivers/infiniband/core/umem_odp.c | 6 +-
>> >> drivers/infiniband/hw/usnic/usnic_uiom.c | 13 +-
>> >> drivers/infiniband/hw/usnic/usnic_uiom.h | 1 +-
>> >> drivers/infiniband/sw/siw/siw.h | 2 +-
>> >> drivers/infiniband/sw/siw/siw_mem.c | 20 +--
>> >> drivers/infiniband/sw/siw/siw_verbs.c | 15 +-
>> >> drivers/vdpa/vdpa_user/vduse_dev.c | 20 +--
>> >> drivers/vfio/vfio_iommu_spapr_tce.c | 15 +-
>> >> drivers/vfio/vfio_iommu_type1.c | 59 +----
>> >> drivers/vhost/vdpa.c | 9 +-
>> >> drivers/vhost/vhost.c | 2 +-
>> >> drivers/vhost/vhost.h | 1 +-
>> >> include/linux/cgroup.h | 20 ++-
>> >> include/linux/cgroup_subsys.h | 4 +-
>> >> include/linux/io_uring_types.h | 3 +-
>> >> include/linux/kvm_host.h | 1 +-
>> >> include/linux/mm.h | 5 +-
>> >> include/linux/mm_types.h | 88 ++++++++-
>> >> include/linux/skbuff.h | 6 +-
>> >> include/net/sock.h | 2 +-
>> >> include/net/xdp_sock.h | 2 +-
>> >> include/rdma/ib_umem.h | 1 +-
>> >> io_uring/io_uring.c | 20 +--
>> >> io_uring/notif.c | 4 +-
>> >> io_uring/notif.h | 10 +-
>> >> io_uring/rsrc.c | 38 +---
>> >> io_uring/rsrc.h | 9 +-
>> >> mm/Kconfig | 11 +-
>> >> mm/Makefile | 1 +-
>> >> mm/internal.h | 2 +-
>> >> mm/mlock.c | 76 +------
>> >> mm/mmap.c | 76 +++----
>> >> mm/mremap.c | 54 +++--
>> >> mm/pins_cgroup.c | 273 ++++++++++++++++++++++++-
>> >> mm/secretmem.c | 6 +-
>> >> mm/util.c | 196 +++++++++++++++--
>> >> net/core/skbuff.c | 47 +---
>> >> net/rds/message.c | 9 +-
>> >> net/xdp/xdp_umem.c | 38 +--
>> >> tools/testing/selftests/vm/Makefile | 1 +-
>> >> tools/testing/selftests/vm/pins-cgroup.c | 271 ++++++++++++++++++++++++-
>> >> virt/kvm/kvm_main.c | 3 +-
>> >> 48 files changed, 1114 insertions(+), 401 deletions(-)
>> >> create mode 100644 mm/pins_cgroup.c
>> >> create mode 100644 tools/testing/selftests/vm/pins-cgroup.c
>> >>
>> >> base-commit: 2241ab53cbb5cdb08a6b2d4688feb13971058f65
>> >> --
>> >> git-series 0.9.1
>> >>
>>
^ permalink raw reply [flat|nested] 108+ messages in thread
[parent not found: <87r0vblzf3.fsf-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org>]
* Re: [RFC PATCH 00/19] mm: Introduce a cgroup to limit the amount of locked and pinned memory
2023-01-31 11:22 ` Alistair Popple
@ 2023-01-31 19:49 ` Yosry Ahmed
-1 siblings, 0 replies; 108+ messages in thread
From: Yosry Ahmed @ 2023-01-31 19:49 UTC (permalink / raw)
To: Alistair Popple
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA, jgg-DDmLM1+adcrQT0dZR+AlfA,
jhubbard-DDmLM1+adcrQT0dZR+AlfA, tjmercier-hpIqsD4AKlfQT0dZR+AlfA,
hannes-druUgvl0LCNAfugRpC6u6w, surenb-hpIqsD4AKlfQT0dZR+AlfA,
mkoutny-IBi9RG/b67k, daniel-/w4YWyX8dFk
On Tue, Jan 31, 2023 at 3:24 AM Alistair Popple <apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org> wrote:
>
>
> Yosry Ahmed <yosryahmed-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> writes:
>
> > On Mon, Jan 30, 2023 at 5:07 PM Alistair Popple <apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org> wrote:
> >>
> >>
> >> Yosry Ahmed <yosryahmed-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> writes:
> >>
> >> > On Mon, Jan 23, 2023 at 9:43 PM Alistair Popple <apopple-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org> wrote:
> >> >>
> >> >> Having large amounts of unmovable or unreclaimable memory in a system
> >> >> can lead to system instability due to increasing the likelihood of
> >> >> encountering out-of-memory conditions. Therefore it is desirable to
> >> >> limit the amount of memory users can lock or pin.
> >> >>
> >> >> From userspace such limits can be enforced by setting
> >> >> RLIMIT_MEMLOCK. However there is no standard method that drivers and
> >> >> other in-kernel users can use to check and enforce this limit.
> >> >>
> >> >> This has led to a large number of inconsistencies in how limits are
> >> >> enforced. For example some drivers will use mm->locked_vm while others
> >> >> will use mm->pinned_vm or user->locked_vm. It is therefore possible to
> >> >> have up to three times RLIMIT_MEMLOCK pinned.
> >> >>
> >> >> Having pinned memory limited per-task also makes it easy for users to
> >> >> exceed the limit. For example, memory that drivers pin with
> >> >> pin_user_pages() tends to remain pinned after fork. To deal with
> >> >> this and other issues this series introduces a cgroup for tracking and
> >> >> limiting the number of pages pinned or locked by tasks in the group.
> >> >>
> >> >> However the existing behaviour with regards to the rlimit needs to be
> >> >> maintained. Therefore the lesser of the two limits is
> >> >> enforced. Furthermore having CAP_IPC_LOCK usually bypasses the rlimit,
> >> >> but this bypass is not allowed for the cgroup.
> >> >>
> >> >> The first part of this series converts existing drivers which
> >> >> open-code the use of locked_mm/pinned_mm over to a common interface
> >> >> which manages the refcounts of the associated task/mm/user
> >> >> structs. This ensures accounting of pages is consistent and makes it
> >> >> easier to add charging of the cgroup.
> >> >>
> >> >> The second part of the series adds the cgroup and converts core mm
> >> >> code such as mlock over to charging the cgroup before finally
> >> >> introducing some selftests.
> >> >
> >> >
> >> > I didn't go through the entire series, so apologies if this was
> >> > mentioned somewhere, but do you mind elaborating on why this is added
> >> > as a separate cgroup controller rather than an extension of the memory
> >> > cgroup controller?
> >>
> >> One of my early prototypes actually did add this to the memcg
> >> controller. However pinned pages fall under their own limit, and we
> >> wanted to always account pages to the cgroup of the task using the
> >> driver rather than say folio_memcg(). So adding it to memcg didn't seem
> >> to have much benefit as we didn't end up using any of the infrastructure
> >> provided by memcg. Hence I thought it was clearer to just add it as its
> >> own controller.
> >
> > To clarify, you account and limit pinned memory based on the cgroup of
> > the process pinning the pages, not based on the cgroup that the pages
> > are actually charged to? Is my understanding correct?
>
> That's correct.
Interesting.
>
> > IOW, you limit the amount of memory that processes in a cgroup can
> > pin, not the amount of memory charged to a cgroup that can be pinned?
>
> Right, that's a good clarification which I might steal and add to the
> cover letter.
Feel free to :)
Please also clarify this in the code/docs. Glancing through the
patches I was asking myself multiple times why this is not
"memory.pinned.[current/max]" or similar.
>
> >>
> >> - Alistair
> >>
> >> >>
> >> >>
> >> >> As I don't have access to systems with all the various devices I
> >> >> haven't been able to test all driver changes. Any help there would be
> >> >> appreciated.
> >> >>
> >> >> Alistair Popple (19):
> >> >> mm: Introduce vm_account
> >> >> drivers/vhost: Convert to use vm_account
> >> >> drivers/vdpa: Convert vdpa to use the new vm_structure
> >> >> infiniband/umem: Convert to use vm_account
> >> >> RMDA/siw: Convert to use vm_account
> >> >> RDMA/usnic: convert to use vm_account
> >> >> vfio/type1: Charge pinned pages to pinned_vm instead of locked_vm
> >> >> vfio/spapr_tce: Convert accounting to pinned_vm
> >> >> io_uring: convert to use vm_account
> >> >> net: skb: Switch to using vm_account
> >> >> xdp: convert to use vm_account
> >> >> kvm/book3s_64_vio: Convert account_locked_vm() to vm_account_pinned()
> >> >> fpga: dfl: afu: convert to use vm_account
> >> >> mm: Introduce a cgroup for pinned memory
> >> >> mm/util: Extend vm_account to charge pages against the pin cgroup
> >> >> mm/util: Refactor account_locked_vm
> >> >> mm: Convert mmap and mlock to use account_locked_vm
> >> >> mm/mmap: Charge locked memory to pins cgroup
> >> >> selftests/vm: Add pins-cgroup selftest for mlock/mmap
> >> >>
> >> >> MAINTAINERS | 8 +-
> >> >> arch/powerpc/kvm/book3s_64_vio.c | 10 +-
> >> >> arch/powerpc/mm/book3s64/iommu_api.c | 29 +--
> >> >> drivers/fpga/dfl-afu-dma-region.c | 11 +-
> >> >> drivers/fpga/dfl-afu.h | 1 +-
> >> >> drivers/infiniband/core/umem.c | 16 +-
> >> >> drivers/infiniband/core/umem_odp.c | 6 +-
> >> >> drivers/infiniband/hw/usnic/usnic_uiom.c | 13 +-
> >> >> drivers/infiniband/hw/usnic/usnic_uiom.h | 1 +-
> >> >> drivers/infiniband/sw/siw/siw.h | 2 +-
> >> >> drivers/infiniband/sw/siw/siw_mem.c | 20 +--
> >> >> drivers/infiniband/sw/siw/siw_verbs.c | 15 +-
> >> >> drivers/vdpa/vdpa_user/vduse_dev.c | 20 +--
> >> >> drivers/vfio/vfio_iommu_spapr_tce.c | 15 +-
> >> >> drivers/vfio/vfio_iommu_type1.c | 59 +----
> >> >> drivers/vhost/vdpa.c | 9 +-
> >> >> drivers/vhost/vhost.c | 2 +-
> >> >> drivers/vhost/vhost.h | 1 +-
> >> >> include/linux/cgroup.h | 20 ++-
> >> >> include/linux/cgroup_subsys.h | 4 +-
> >> >> include/linux/io_uring_types.h | 3 +-
> >> >> include/linux/kvm_host.h | 1 +-
> >> >> include/linux/mm.h | 5 +-
> >> >> include/linux/mm_types.h | 88 ++++++++-
> >> >> include/linux/skbuff.h | 6 +-
> >> >> include/net/sock.h | 2 +-
> >> >> include/net/xdp_sock.h | 2 +-
> >> >> include/rdma/ib_umem.h | 1 +-
> >> >> io_uring/io_uring.c | 20 +--
> >> >> io_uring/notif.c | 4 +-
> >> >> io_uring/notif.h | 10 +-
> >> >> io_uring/rsrc.c | 38 +---
> >> >> io_uring/rsrc.h | 9 +-
> >> >> mm/Kconfig | 11 +-
> >> >> mm/Makefile | 1 +-
> >> >> mm/internal.h | 2 +-
> >> >> mm/mlock.c | 76 +------
> >> >> mm/mmap.c | 76 +++----
> >> >> mm/mremap.c | 54 +++--
> >> >> mm/pins_cgroup.c | 273 ++++++++++++++++++++++++-
> >> >> mm/secretmem.c | 6 +-
> >> >> mm/util.c | 196 +++++++++++++++--
> >> >> net/core/skbuff.c | 47 +---
> >> >> net/rds/message.c | 9 +-
> >> >> net/xdp/xdp_umem.c | 38 +--
> >> >> tools/testing/selftests/vm/Makefile | 1 +-
> >> >> tools/testing/selftests/vm/pins-cgroup.c | 271 ++++++++++++++++++++++++-
> >> >> virt/kvm/kvm_main.c | 3 +-
> >> >> 48 files changed, 1114 insertions(+), 401 deletions(-)
> >> >> create mode 100644 mm/pins_cgroup.c
> >> >> create mode 100644 tools/testing/selftests/vm/pins-cgroup.c
> >> >>
> >> >> base-commit: 2241ab53cbb5cdb08a6b2d4688feb13971058f65
> >> >> --
> >> >> git-series 0.9.1
> >> >>
> >>
>
^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [RFC PATCH 00/19] mm: Introduce a cgroup to limit the amount of locked and pinned memory
@ 2023-01-31 19:49 ` Yosry Ahmed
0 siblings, 0 replies; 108+ messages in thread
From: Yosry Ahmed @ 2023-01-31 19:49 UTC (permalink / raw)
To: Alistair Popple
Cc: linux-mm, cgroups, linux-kernel, jgg, jhubbard, tjmercier, hannes,
surenb, mkoutny, daniel
On Tue, Jan 31, 2023 at 3:24 AM Alistair Popple <apopple@nvidia.com> wrote:
>
>
> Yosry Ahmed <yosryahmed@google.com> writes:
>
> > On Mon, Jan 30, 2023 at 5:07 PM Alistair Popple <apopple@nvidia.com> wrote:
> >>
> >>
> >> Yosry Ahmed <yosryahmed@google.com> writes:
> >>
> >> > On Mon, Jan 23, 2023 at 9:43 PM Alistair Popple <apopple@nvidia.com> wrote:
> >> >>
> >> >> Having large amounts of unmovable or unreclaimable memory in a system
> >> >> can lead to system instability due to increasing the likelihood of
> >> >> encountering out-of-memory conditions. Therefore it is desirable to
> >> >> limit the amount of memory users can lock or pin.
> >> >>
> >> >> From userspace such limits can be enforced by setting
> >> >> RLIMIT_MEMLOCK. However there is no standard method that drivers and
> >> >> other in-kernel users can use to check and enforce this limit.
> >> >>
> >> >> This has led to a large number of inconsistencies in how limits are
> >> >> enforced. For example some drivers will use mm->locked_vm while others
> >> >> will use mm->pinned_vm or user->locked_vm. It is therefore possible to
> >> >> have up to three times RLIMIT_MEMLOCK pinned.
> >> >>
> >> >> Having pinned memory limited per-task also makes it easy for users to
> >> >> exceed the limit. For example, memory that drivers pin with
> >> >> pin_user_pages() tends to remain pinned after fork. To deal with
> >> >> this and other issues this series introduces a cgroup for tracking and
> >> >> limiting the number of pages pinned or locked by tasks in the group.
> >> >>
> >> >> However the existing behaviour with regards to the rlimit needs to be
> >> >> maintained. Therefore the lesser of the two limits is
> >> >> enforced. Furthermore having CAP_IPC_LOCK usually bypasses the rlimit,
> >> >> but this bypass is not allowed for the cgroup.
> >> >>
> >> >> The first part of this series converts existing drivers which
> >> >> open-code the use of locked_mm/pinned_mm over to a common interface
> >> >> which manages the refcounts of the associated task/mm/user
> >> >> structs. This ensures accounting of pages is consistent and makes it
> >> >> easier to add charging of the cgroup.
> >> >>
> >> >> The second part of the series adds the cgroup and converts core mm
> >> >> code such as mlock over to charging the cgroup before finally
> >> >> introducing some selftests.
> >> >
> >> >
> >> > I didn't go through the entire series, so apologies if this was
> >> > mentioned somewhere, but do you mind elaborating on why this is added
> >> > as a separate cgroup controller rather than an extension of the memory
> >> > cgroup controller?
> >>
> >> One of my early prototypes actually did add this to the memcg
> >> controller. However pinned pages fall under their own limit, and we
> >> wanted to always account pages to the cgroup of the task using the
> >> driver rather than say folio_memcg(). So adding it to memcg didn't seem
> >> to have much benefit as we didn't end up using any of the infrastructure
> >> provided by memcg. Hence I thought it was clearer to just add it as its
> >> own controller.
> >
> > To clarify, you account and limit pinned memory based on the cgroup of
> > the process pinning the pages, not based on the cgroup that the pages
> > are actually charged to? Is my understanding correct?
>
> That's correct.
Interesting.
>
> > IOW, you limit the amount of memory that processes in a cgroup can
> > pin, not the amount of memory charged to a cgroup that can be pinned?
>
> Right, that's a good clarification which I might steal and add to the
> cover letter.
Feel free to :)
Please also clarify this in the code/docs. Glancing through the
patches I was asking myself multiple times why this is not
"memory.pinned.[current/max]" or similar.
>
> >>
> >> - Alistair
> >>
> >> >>
> >> >>
> >> >> As I don't have access to systems with all the various devices I
> >> >> haven't been able to test all driver changes. Any help there would be
> >> >> appreciated.
> >> >>
> >> >> Alistair Popple (19):
> >> >> mm: Introduce vm_account
> >> >> drivers/vhost: Convert to use vm_account
> >> >> drivers/vdpa: Convert vdpa to use the new vm_structure
> >> >> infiniband/umem: Convert to use vm_account
> >> >> RMDA/siw: Convert to use vm_account
> >> >> RDMA/usnic: convert to use vm_account
> >> >> vfio/type1: Charge pinned pages to pinned_vm instead of locked_vm
> >> >> vfio/spapr_tce: Convert accounting to pinned_vm
> >> >> io_uring: convert to use vm_account
> >> >> net: skb: Switch to using vm_account
> >> >> xdp: convert to use vm_account
> >> >> kvm/book3s_64_vio: Convert account_locked_vm() to vm_account_pinned()
> >> >> fpga: dfl: afu: convert to use vm_account
> >> >> mm: Introduce a cgroup for pinned memory
> >> >> mm/util: Extend vm_account to charge pages against the pin cgroup
> >> >> mm/util: Refactor account_locked_vm
> >> >> mm: Convert mmap and mlock to use account_locked_vm
> >> >> mm/mmap: Charge locked memory to pins cgroup
> >> >> selftests/vm: Add pins-cgroup selftest for mlock/mmap
> >> >>
> >> >> MAINTAINERS | 8 +-
> >> >> arch/powerpc/kvm/book3s_64_vio.c | 10 +-
> >> >> arch/powerpc/mm/book3s64/iommu_api.c | 29 +--
> >> >> drivers/fpga/dfl-afu-dma-region.c | 11 +-
> >> >> drivers/fpga/dfl-afu.h | 1 +-
> >> >> drivers/infiniband/core/umem.c | 16 +-
> >> >> drivers/infiniband/core/umem_odp.c | 6 +-
> >> >> drivers/infiniband/hw/usnic/usnic_uiom.c | 13 +-
> >> >> drivers/infiniband/hw/usnic/usnic_uiom.h | 1 +-
> >> >> drivers/infiniband/sw/siw/siw.h | 2 +-
> >> >> drivers/infiniband/sw/siw/siw_mem.c | 20 +--
> >> >> drivers/infiniband/sw/siw/siw_verbs.c | 15 +-
> >> >> drivers/vdpa/vdpa_user/vduse_dev.c | 20 +--
> >> >> drivers/vfio/vfio_iommu_spapr_tce.c | 15 +-
> >> >> drivers/vfio/vfio_iommu_type1.c | 59 +----
> >> >> drivers/vhost/vdpa.c | 9 +-
> >> >> drivers/vhost/vhost.c | 2 +-
> >> >> drivers/vhost/vhost.h | 1 +-
> >> >> include/linux/cgroup.h | 20 ++-
> >> >> include/linux/cgroup_subsys.h | 4 +-
> >> >> include/linux/io_uring_types.h | 3 +-
> >> >> include/linux/kvm_host.h | 1 +-
> >> >> include/linux/mm.h | 5 +-
> >> >> include/linux/mm_types.h | 88 ++++++++-
> >> >> include/linux/skbuff.h | 6 +-
> >> >> include/net/sock.h | 2 +-
> >> >> include/net/xdp_sock.h | 2 +-
> >> >> include/rdma/ib_umem.h | 1 +-
> >> >> io_uring/io_uring.c | 20 +--
> >> >> io_uring/notif.c | 4 +-
> >> >> io_uring/notif.h | 10 +-
> >> >> io_uring/rsrc.c | 38 +---
> >> >> io_uring/rsrc.h | 9 +-
> >> >> mm/Kconfig | 11 +-
> >> >> mm/Makefile | 1 +-
> >> >> mm/internal.h | 2 +-
> >> >> mm/mlock.c | 76 +------
> >> >> mm/mmap.c | 76 +++----
> >> >> mm/mremap.c | 54 +++--
> >> >> mm/pins_cgroup.c | 273 ++++++++++++++++++++++++-
> >> >> mm/secretmem.c | 6 +-
> >> >> mm/util.c | 196 +++++++++++++++--
> >> >> net/core/skbuff.c | 47 +---
> >> >> net/rds/message.c | 9 +-
> >> >> net/xdp/xdp_umem.c | 38 +--
> >> >> tools/testing/selftests/vm/Makefile | 1 +-
> >> >> tools/testing/selftests/vm/pins-cgroup.c | 271 ++++++++++++++++++++++++-
> >> >> virt/kvm/kvm_main.c | 3 +-
> >> >> 48 files changed, 1114 insertions(+), 401 deletions(-)
> >> >> create mode 100644 mm/pins_cgroup.c
> >> >> create mode 100644 tools/testing/selftests/vm/pins-cgroup.c
> >> >>
> >> >> base-commit: 2241ab53cbb5cdb08a6b2d4688feb13971058f65
> >> >> --
> >> >> git-series 0.9.1
> >> >>
> >>
>
^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [RFC PATCH 00/19] mm: Introduce a cgroup to limit the amount of locked and pinned memory
2023-01-24 5:42 ` Alistair Popple
@ 2023-01-24 20:12 ` Jason Gunthorpe
-1 siblings, 0 replies; 108+ messages in thread
From: Jason Gunthorpe @ 2023-01-24 20:12 UTC (permalink / raw)
To: Alistair Popple
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
jhubbard-DDmLM1+adcrQT0dZR+AlfA, tjmercier-hpIqsD4AKlfQT0dZR+AlfA,
hannes-druUgvl0LCNAfugRpC6u6w, surenb-hpIqsD4AKlfQT0dZR+AlfA,
mkoutny-IBi9RG/b67k, daniel-/w4YWyX8dFk
On Tue, Jan 24, 2023 at 04:42:29PM +1100, Alistair Popple wrote:
> Having large amounts of unmovable or unreclaimable memory in a system
> can lead to system instability due to increasing the likelihood of
> encountering out-of-memory conditions. Therefore it is desirable to
> limit the amount of memory users can lock or pin.
>
> From userspace such limits can be enforced by setting
> RLIMIT_MEMLOCK. However there is no standard method that drivers and
> other in-kernel users can use to check and enforce this limit.
>
> This has led to a large number of inconsistencies in how limits are
> enforced. For example some drivers will use mm->locked_vm while others
> will use mm->pinned_vm or user->locked_vm. It is therefore possible to
> have up to three times RLIMIT_MEMLOCK pinned.
>
> Having pinned memory limited per-task also makes it easy for users to
> exceed the limit. For example, memory that drivers pin with
> pin_user_pages() tends to remain pinned after fork. To deal with
> this and other issues this series introduces a cgroup for tracking and
> limiting the number of pages pinned or locked by tasks in the group.
>
> However the existing behaviour with regards to the rlimit needs to be
> maintained. Therefore the lesser of the two limits is
> enforced. Furthermore having CAP_IPC_LOCK usually bypasses the rlimit,
> but this bypass is not allowed for the cgroup.
>
> The first part of this series converts existing drivers which
> open-code the use of locked_mm/pinned_mm over to a common interface
> which manages the refcounts of the associated task/mm/user
> structs. This ensures accounting of pages is consistent and makes it
> easier to add charging of the cgroup.
>
> The second part of the series adds the cgroup and converts core mm
> code such as mlock over to charging the cgroup before finally
> introducing some selftests.
>
> As I don't have access to systems with all the various devices I
> haven't been able to test all driver changes. Any help there would be
> appreciated.
I'm excited by this series, thanks for making it.
The pin accounting has been a long standing problem and cgroups will
really help!
Jason
^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [RFC PATCH 00/19] mm: Introduce a cgroup to limit the amount of locked and pinned memory
@ 2023-01-24 20:12 ` Jason Gunthorpe
0 siblings, 0 replies; 108+ messages in thread
From: Jason Gunthorpe @ 2023-01-24 20:12 UTC (permalink / raw)
To: Alistair Popple
Cc: linux-mm, cgroups, linux-kernel, jhubbard, tjmercier, hannes,
surenb, mkoutny, daniel
On Tue, Jan 24, 2023 at 04:42:29PM +1100, Alistair Popple wrote:
> Having large amounts of unmovable or unreclaimable memory in a system
> can lead to system instability due to increasing the likelihood of
> encountering out-of-memory conditions. Therefore it is desirable to
> limit the amount of memory users can lock or pin.
>
> From userspace such limits can be enforced by setting
> RLIMIT_MEMLOCK. However there is no standard method that drivers and
> other in-kernel users can use to check and enforce this limit.
>
> This has led to a large number of inconsistencies in how limits are
> enforced. For example some drivers will use mm->locked_vm while others
> will use mm->pinned_vm or user->locked_vm. It is therefore possible to
> have up to three times RLIMIT_MEMLOCK pinned.
>
> Having pinned memory limited per-task also makes it easy for users to
> exceed the limit. For example, memory that drivers pin with
> pin_user_pages() tends to remain pinned after fork. To deal with
> this and other issues this series introduces a cgroup for tracking and
> limiting the number of pages pinned or locked by tasks in the group.
>
> However the existing behaviour with regards to the rlimit needs to be
> maintained. Therefore the lesser of the two limits is
> enforced. Furthermore having CAP_IPC_LOCK usually bypasses the rlimit,
> but this bypass is not allowed for the cgroup.
>
> The first part of this series converts existing drivers which
> open-code the use of locked_mm/pinned_mm over to a common interface
> which manages the refcounts of the associated task/mm/user
> structs. This ensures accounting of pages is consistent and makes it
> easier to add charging of the cgroup.
>
> The second part of the series adds the cgroup and converts core mm
> code such as mlock over to charging the cgroup before finally
> introducing some selftests.
>
> As I don't have access to systems with all the various devices I
> haven't been able to test all driver changes. Any help there would be
> appreciated.
I'm excited by this series, thanks for making it.
The pin accounting has been a long standing problem and cgroups will
really help!
Jason
^ permalink raw reply [flat|nested] 108+ messages in thread
* Re: [RFC PATCH 00/19] mm: Introduce a cgroup to limit the amount of locked and pinned memory
2023-01-24 20:12 ` Jason Gunthorpe
(?)
@ 2023-01-31 13:57 ` David Hildenbrand
[not found] ` <6369225e-3522-341b-cd20-d95b1f11ea71-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
-1 siblings, 1 reply; 108+ messages in thread
From: David Hildenbrand @ 2023-01-31 13:57 UTC (permalink / raw)
To: Jason Gunthorpe, Alistair Popple
Cc: linux-mm, cgroups, linux-kernel, jhubbard, tjmercier, hannes,
surenb, mkoutny, daniel
On 24.01.23 21:12, Jason Gunthorpe wrote:
> On Tue, Jan 24, 2023 at 04:42:29PM +1100, Alistair Popple wrote:
>> Having large amounts of unmovable or unreclaimable memory in a system
>> can lead to system instability due to increasing the likelihood of
>> encountering out-of-memory conditions. Therefore it is desirable to
>> limit the amount of memory users can lock or pin.
>>
>> From userspace such limits can be enforced by setting
>> RLIMIT_MEMLOCK. However there is no standard method that drivers and
>> other in-kernel users can use to check and enforce this limit.
>>
>> This has led to a large number of inconsistencies in how limits are
>> enforced. For example some drivers will use mm->locked_vm while others
>> will use mm->pinned_vm or user->locked_vm. It is therefore possible to
>> have up to three times RLIMIT_MEMLOCK pinned.
>>
>> Having pinned memory limited per-task also makes it easy for users to
>> exceed the limit. For example, memory that drivers pin with
>> pin_user_pages() tends to remain pinned after fork. To deal with
>> this and other issues this series introduces a cgroup for tracking and
>> limiting the number of pages pinned or locked by tasks in the group.
>>
>> However the existing behaviour with regards to the rlimit needs to be
>> maintained. Therefore the lesser of the two limits is
>> enforced. Furthermore having CAP_IPC_LOCK usually bypasses the rlimit,
>> but this bypass is not allowed for the cgroup.
>>
>> The first part of this series converts existing drivers which
>> open-code the use of locked_mm/pinned_mm over to a common interface
>> which manages the refcounts of the associated task/mm/user
>> structs. This ensures accounting of pages is consistent and makes it
>> easier to add charging of the cgroup.
>>
>> The second part of the series adds the cgroup and converts core mm
>> code such as mlock over to charging the cgroup before finally
>> introducing some selftests.
>>
>> As I don't have access to systems with all the various devices I
>> haven't been able to test all driver changes. Any help there would be
>> appreciated.
>
> I'm excited by this series, thanks for making it.
>
> The pin accounting has been a long standing problem and cgroups will
> really help!
Indeed. I'm curious how GUP-fast, pinning the same page multiple times,
and pinning subpages of larger folios are handled :)
--
Thanks,
David / dhildenb
^ permalink raw reply [flat|nested] 108+ messages in thread