* Re: [PATCH 3/5] VSOCK: support receive mergeable rx buffer in guest
From: Jason Wang @ 2018-11-06 4:00 UTC (permalink / raw)
To: jiangyiwen, stefanha; +Cc: netdev, kvm, virtualization
In-Reply-To: <5BDFF57C.5020106@huawei.com>
On 2018/11/5 下午3:47, jiangyiwen wrote:
> Guest receive mergeable rx buffer, it can merge
> scatter rx buffer into a big buffer and then copy
> to user space.
>
> Signed-off-by: Yiwen Jiang <jiangyiwen@huawei.com>
> ---
> include/linux/virtio_vsock.h | 9 ++++
> net/vmw_vsock/virtio_transport.c | 75 +++++++++++++++++++++++++++++----
> net/vmw_vsock/virtio_transport_common.c | 59 ++++++++++++++++++++++----
> 3 files changed, 127 insertions(+), 16 deletions(-)
>
> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
> index da9e1fe..6be3cd7 100644
> --- a/include/linux/virtio_vsock.h
> +++ b/include/linux/virtio_vsock.h
> @@ -13,6 +13,8 @@
> #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4)
> #define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL
> #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64)
> +/* virtio_vsock_pkt + max_pkt_len(default MAX_PKT_BUF_SIZE) */
> +#define VIRTIO_VSOCK_MAX_MRG_BUF_NUM ((VIRTIO_VSOCK_MAX_PKT_BUF_SIZE / PAGE_SIZE) + 1)
>
> /* Virtio-vsock feature */
> #define VIRTIO_VSOCK_F_MRG_RXBUF 0 /* Host can merge receive buffers. */
> @@ -48,6 +50,11 @@ struct virtio_vsock_sock {
> struct list_head rx_queue;
> };
>
> +struct virtio_vsock_mrg_rxbuf {
> + void *buf;
> + u32 len;
> +};
> +
> struct virtio_vsock_pkt {
> struct virtio_vsock_hdr hdr;
> struct virtio_vsock_mrg_rxbuf_hdr mrg_rxbuf_hdr;
> @@ -59,6 +66,8 @@ struct virtio_vsock_pkt {
> u32 len;
> u32 off;
> bool reply;
> + bool mergeable;
> + struct virtio_vsock_mrg_rxbuf mrg_rxbuf[VIRTIO_VSOCK_MAX_MRG_BUF_NUM];
> };
It's better to use iov here I think, and drop buf completely.
And this is better to be done in an independent patch.
>
> struct virtio_vsock_pkt_info {
> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
> index 2040a9e..3557ad3 100644
> --- a/net/vmw_vsock/virtio_transport.c
> +++ b/net/vmw_vsock/virtio_transport.c
> @@ -359,11 +359,62 @@ static bool virtio_transport_more_replies(struct virtio_vsock *vsock)
> return val < virtqueue_get_vring_size(vq);
> }
>
> +static struct virtio_vsock_pkt *receive_mergeable(struct virtqueue *vq,
> + struct virtio_vsock *vsock, unsigned int *total_len)
> +{
> + struct virtio_vsock_pkt *pkt;
> + u16 num_buf;
> + void *page;
> + unsigned int len;
> + int i = 0;
> +
> + page = virtqueue_get_buf(vq, &len);
> + if (!page)
> + return NULL;
> +
> + *total_len = len;
> + vsock->rx_buf_nr--;
> +
> + pkt = page;
> + num_buf = le16_to_cpu(pkt->mrg_rxbuf_hdr.num_buffers);
> + if (!num_buf || num_buf > VIRTIO_VSOCK_MAX_MRG_BUF_NUM)
> + goto err;
> +
> + pkt->mergeable = true;
> + if (!le32_to_cpu(pkt->hdr.len))
> + return pkt;
> +
> + len -= sizeof(struct virtio_vsock_pkt);
> + pkt->mrg_rxbuf[i].buf = page + sizeof(struct virtio_vsock_pkt);
> + pkt->mrg_rxbuf[i].len = len;
> + i++;
> +
> + while (--num_buf) {
> + page = virtqueue_get_buf(vq, &len);
> + if (!page)
> + goto err;
> +
> + *total_len += len;
> + vsock->rx_buf_nr--;
> +
> + pkt->mrg_rxbuf[i].buf = page;
> + pkt->mrg_rxbuf[i].len = len;
> + i++;
> + }
> +
> + return pkt;
> +err:
> + virtio_transport_free_pkt(pkt);
> + return NULL;
> +}
Similar logic could be found at virtio-net driver.
Haven't thought this deeply, but it looks to me use virtio-net driver is
also possible, e.g for data-path, just register vsock specific callbacks.
> +
> static void virtio_transport_rx_work(struct work_struct *work)
> {
> struct virtio_vsock *vsock =
> container_of(work, struct virtio_vsock, rx_work);
> struct virtqueue *vq;
> + size_t vsock_hlen = vsock->mergeable ? sizeof(struct virtio_vsock_pkt) :
> + sizeof(struct virtio_vsock_hdr);
>
> vq = vsock->vqs[VSOCK_VQ_RX];
>
> @@ -383,21 +434,29 @@ static void virtio_transport_rx_work(struct work_struct *work)
> goto out;
> }
>
> - pkt = virtqueue_get_buf(vq, &len);
> - if (!pkt) {
> - break;
> - }
> + if (likely(vsock->mergeable)) {
> + pkt = receive_mergeable(vq, vsock, &len);
> + if (!pkt)
> + break;
>
> - vsock->rx_buf_nr--;
> + pkt->len = le32_to_cpu(pkt->hdr.len);
> + } else {
> + pkt = virtqueue_get_buf(vq, &len);
> + if (!pkt) {
> + break;
> + }
> +
> + vsock->rx_buf_nr--;
> + }
>
> /* Drop short/long packets */
> - if (unlikely(len < sizeof(pkt->hdr) ||
> - len > sizeof(pkt->hdr) + pkt->len)) {
> + if (unlikely(len < vsock_hlen ||
> + len > vsock_hlen + pkt->len)) {
> virtio_transport_free_pkt(pkt);
> continue;
> }
>
> - pkt->len = len - sizeof(pkt->hdr);
> + pkt->len = len - vsock_hlen;
> virtio_transport_deliver_tap_pkt(pkt);
> virtio_transport_recv_pkt(pkt);
> }
> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
> index 3ae3a33..7bef1d5 100644
> --- a/net/vmw_vsock/virtio_transport_common.c
> +++ b/net/vmw_vsock/virtio_transport_common.c
> @@ -272,14 +272,49 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
> */
> spin_unlock_bh(&vvs->rx_lock);
>
> - err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes);
> - if (err)
> - goto out;
> + if (pkt->mergeable) {
> + struct virtio_vsock_mrg_rxbuf *buf = pkt->mrg_rxbuf;
> + size_t mrg_copy_bytes, last_buf_total = 0, rxbuf_off;
> + size_t tmp_bytes = bytes;
> + int i;
> +
> + for (i = 0; i < le16_to_cpu(pkt->mrg_rxbuf_hdr.num_buffers); i++) {
> + if (pkt->off > last_buf_total + buf[i].len) {
> + last_buf_total += buf[i].len;
> + continue;
> + }
> +
> + rxbuf_off = pkt->off - last_buf_total;
> + mrg_copy_bytes = min(buf[i].len - rxbuf_off, tmp_bytes);
> + err = memcpy_to_msg(msg, buf[i].buf + rxbuf_off, mrg_copy_bytes);
> + if (err)
> + goto out;
> +
> + tmp_bytes -= mrg_copy_bytes;
> + pkt->off += mrg_copy_bytes;
> + last_buf_total += buf[i].len;
> + if (!tmp_bytes)
> + break;
> + }
After switch to use iov, you can user iov_iter helper to avoid the above
open-coding I believe.
And you can also drop the if (mergeable) condition.
Thanks
> +
> + if (tmp_bytes) {
> + printk(KERN_WARNING "WARNING! bytes = %llu, "
> + "bytes = %llu\n",
> + (unsigned long long)bytes,
> + (unsigned long long)tmp_bytes);
> + }
> +
> + total += (bytes - tmp_bytes);
> + } else {
> + err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes);
> + if (err)
> + goto out;
> +
> + total += bytes;
> + pkt->off += bytes;
> + }
>
> spin_lock_bh(&vvs->rx_lock);
> -
> - total += bytes;
> - pkt->off += bytes;
> if (pkt->off == pkt->len) {
> virtio_transport_dec_rx_pkt(vvs, pkt);
> list_del(&pkt->list);
> @@ -1050,8 +1085,16 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
>
> void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt)
> {
> - kfree(pkt->buf);
> - kfree(pkt);
> + int i;
> +
> + if (pkt->mergeable) {
> + for (i = 1; i < le16_to_cpu(pkt->mrg_rxbuf_hdr.num_buffers); i++)
> + free_page((unsigned long)pkt->mrg_rxbuf[i].buf);
> + free_page((unsigned long)(void *)pkt);
> + } else {
> + kfree(pkt->buf);
> + kfree(pkt);
> + }
> }
> EXPORT_SYMBOL_GPL(virtio_transport_free_pkt);
>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [RFC PATCH] net/mlx5e: Temp software stats variable is not required
From: Saeed Mahameed @ 2018-11-05 19:13 UTC (permalink / raw)
To: davem@davemloft.net
Cc: Jason Gunthorpe, netdev@vger.kernel.org, eric.dumazet@gmail.com,
Eran Ben Elisha, Leon Romanovsky, arnd@arndb.de,
akpm@linux-foundation.org
In-Reply-To: <20181103.193617.810293775666516890.davem@davemloft.net>
On Sat, 2018-11-03 at 19:36 -0700, David Miller wrote:
> From: Saeed Mahameed <saeedm@mellanox.com>
> Date: Fri, 2 Nov 2018 18:54:22 -0700
>
> > +static void mlx5e_fold_sw_stats(struct mlx5e_priv *priv, struct
> > rtnl_link_stats64 *s)
> > +{
> > + int i;
> > +
> > + /* not required ? */
> > + memset(s, 0, sizeof(*s));
>
> Why wouldn't this be required?
>
I just checked it is already done by the stack @dev_get_stats()
> I can see that perhaps you can only zero out the statistics that are
> used in
>
> the ndo_get_stats64() code path, but that's different.
mlx5e_fold_sw_stats can only be called from ndo_get_stats64().
The "s" pointer i am trying to zero out here is the same pointer we
receive from ndo_get_stats64().
^ permalink raw reply
* Re: [PATCH v3] arm64: dts: stratix10: fix multicast filtering
From: Dinh Nguyen @ 2018-11-05 19:14 UTC (permalink / raw)
To: Aaro Koskinen, Thor Thayer, Rob Herring, Mark Rutland
Cc: devicetree, linux-kernel, netdev, Aaro Koskinen
In-Reply-To: <20181102191048.22657-1-aaro.koskinen@iki.fi>
On 11/2/18 2:10 PM, Aaro Koskinen wrote:
> From: Aaro Koskinen <aaro.koskinen@nokia.com>
>
> On Stratix 10, the EMAC has 256 hash buckets for multicast filtering. This
> needs to be specified in DTS, otherwise the stmmac driver defaults to 64
> buckets and initializes the filter incorrectly. As a result, e.g. valid
> IPv6 multicast traffic ends up being dropped.
>
> Fixes: 78cd6a9d8e15 ("arm64: dts: Add base stratix 10 dtsi")
> Signed-off-by: Aaro Koskinen <aaro.koskinen@nokia.com>
> ---
> arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi | 3 +++
> 1 file changed, 3 insertions(+)
>
Applied!
Thanks,
Dinh
^ permalink raw reply
* Re: [RFC PATCH] net/mlx5e: Temp software stats variable is not required
From: David Miller @ 2018-11-05 19:27 UTC (permalink / raw)
To: saeedm; +Cc: jgg, netdev, eric.dumazet, eranbe, leonro, arnd, akpm
In-Reply-To: <002da677314734dedd31be1e1b199dd4f1ae8457.camel@mellanox.com>
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Mon, 5 Nov 2018 19:13:59 +0000
> On Sat, 2018-11-03 at 19:36 -0700, David Miller wrote:
>> From: Saeed Mahameed <saeedm@mellanox.com>
>> Date: Fri, 2 Nov 2018 18:54:22 -0700
>>
>> > +static void mlx5e_fold_sw_stats(struct mlx5e_priv *priv, struct
>> > rtnl_link_stats64 *s)
>> > +{
>> > + int i;
>> > +
>> > + /* not required ? */
>> > + memset(s, 0, sizeof(*s));
>>
>> Why wouldn't this be required?
>>
>
> I just checked it is already done by the stack @dev_get_stats()
Then please document this in the commit message or similar.
^ permalink raw reply
* Can NFS work with VRF?
From: Ben Greear @ 2018-11-05 19:42 UTC (permalink / raw)
To: netdev
Hello,
I was trying to improve my old series of patches that binds NFS to
a particular source IP address so that it could work with VRF in a 4.16
kernel. But, it seems a huge tangle to try to make NFS (and rpc, etc) able to bind to
a local netdevice, which I think is what would be needed to make it work with VRF.
Has anyone already worked on VRF support for NFS?
Thanks,
Ben
--
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc http://www.candelatech.com
^ permalink raw reply
* [PATCH v2 1/2] mm/page_alloc: free order-0 pages through PCP in page_frag_free()
From: Aaron Lu @ 2018-11-06 5:28 UTC (permalink / raw)
To: linux-mm, linux-kernel, netdev
Cc: Andrew Morton, Paweł Staszewski, Jesper Dangaard Brouer,
Eric Dumazet, Tariq Toukan, Ilias Apalodimas, Yoel Caspersen,
Mel Gorman, Saeed Mahameed, Michal Hocko, Vlastimil Babka,
Dave Hansen, Alexander Duyck
In-Reply-To: <20181105085820.6341-1-aaron.lu@intel.com>
page_frag_free() calls __free_pages_ok() to free the page back to
Buddy. This is OK for high order page, but for order-0 pages, it
misses the optimization opportunity of using Per-Cpu-Pages and can
cause zone lock contention when called frequently.
Paweł Staszewski recently shared his result of 'how Linux kernel
handles normal traffic'[1] and from perf data, Jesper Dangaard Brouer
found the lock contention comes from page allocator:
mlx5e_poll_tx_cq
|
--16.34%--napi_consume_skb
|
|--12.65%--__free_pages_ok
| |
| --11.86%--free_one_page
| |
| |--10.10%--queued_spin_lock_slowpath
| |
| --0.65%--_raw_spin_lock
|
|--1.55%--page_frag_free
|
--1.44%--skb_release_data
Jesper explained how it happened: mlx5 driver RX-page recycle
mechanism is not effective in this workload and pages have to go
through the page allocator. The lock contention happens during
mlx5 DMA TX completion cycle. And the page allocator cannot keep
up at these speeds.[2]
I thought that __free_pages_ok() are mostly freeing high order
pages and thought this is an lock contention for high order pages
but Jesper explained in detail that __free_pages_ok() here are
actually freeing order-0 pages because mlx5 is using order-0 pages
to satisfy its page pool allocation request.[3]
The free path as pointed out by Jesper is:
skb_free_head()
-> skb_free_frag()
-> page_frag_free()
And the pages being freed on this path are order-0 pages.
Fix this by doing similar things as in __page_frag_cache_drain() -
send the being freed page to PCP if it's an order-0 page, or
directly to Buddy if it is a high order page.
With this change, Paweł hasn't noticed lock contention yet in
his workload and Jesper has noticed a 7% performance improvement
using a micro benchmark and lock contention is gone. Ilias' test
on a 'low' speed 1Gbit interface on an cortex-a53 shows ~11%
performance boost testing with 64byte packets and __free_pages_ok()
disappeared from perf top.
[1]: https://www.spinics.net/lists/netdev/msg531362.html
[2]: https://www.spinics.net/lists/netdev/msg531421.html
[3]: https://www.spinics.net/lists/netdev/msg531556.html
Reported-by: Paweł Staszewski <pstaszewski@itcare.pl>
Analysed-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Tested-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Acked-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Signed-off-by: Aaron Lu <aaron.lu@intel.com>
---
v2: only changelog changes:
- remove the duplicated skb_free_frag() as pointed by Jesper;
- add Ilias' test result;
- add people's ack/test tag.
mm/page_alloc.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ae31839874b8..91a9a6af41a2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4555,8 +4555,14 @@ void page_frag_free(void *addr)
{
struct page *page = virt_to_head_page(addr);
- if (unlikely(put_page_testzero(page)))
- __free_pages_ok(page, compound_order(page));
+ if (unlikely(put_page_testzero(page))) {
+ unsigned int order = compound_order(page);
+
+ if (order == 0)
+ free_unref_page(page);
+ else
+ __free_pages_ok(page, order);
+ }
}
EXPORT_SYMBOL(page_frag_free);
--
2.17.2
^ permalink raw reply related
* [PATCH v2 2/2] mm/page_alloc: use a single function to free page
From: Aaron Lu @ 2018-11-06 5:30 UTC (permalink / raw)
To: linux-mm, linux-kernel, netdev
Cc: Andrew Morton, Paweł Staszewski, Jesper Dangaard Brouer,
Eric Dumazet, Tariq Toukan, Ilias Apalodimas, Yoel Caspersen,
Mel Gorman, Saeed Mahameed, Michal Hocko, Vlastimil Babka,
Dave Hansen, Alexander Duyck
In-Reply-To: <20181105085820.6341-2-aaron.lu@intel.com>
We have multiple places of freeing a page, most of them doing similar
things and a common function can be used to reduce code duplicate.
It also avoids bug fixed in one function but left in another.
Signed-off-by: Aaron Lu <aaron.lu@intel.com>
---
v2: move comments close to code as suggested by Dave.
mm/page_alloc.c | 36 ++++++++++++++++--------------------
1 file changed, 16 insertions(+), 20 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 91a9a6af41a2..4faf6b7bf225 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4425,9 +4425,17 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)
}
EXPORT_SYMBOL(get_zeroed_page);
-void __free_pages(struct page *page, unsigned int order)
+static inline void free_the_page(struct page *page, unsigned int order, int nr)
{
- if (put_page_testzero(page)) {
+ VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
+
+ /*
+ * Free a page by reducing its ref count by @nr.
+ * If its refcount reaches 0, then according to its order:
+ * order0: send to PCP;
+ * high order: directly send to Buddy.
+ */
+ if (page_ref_sub_and_test(page, nr)) {
if (order == 0)
free_unref_page(page);
else
@@ -4435,6 +4443,10 @@ void __free_pages(struct page *page, unsigned int order)
}
}
+void __free_pages(struct page *page, unsigned int order)
+{
+ free_the_page(page, order, 1);
+}
EXPORT_SYMBOL(__free_pages);
void free_pages(unsigned long addr, unsigned int order)
@@ -4481,16 +4493,7 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
void __page_frag_cache_drain(struct page *page, unsigned int count)
{
- VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
-
- if (page_ref_sub_and_test(page, count)) {
- unsigned int order = compound_order(page);
-
- if (order == 0)
- free_unref_page(page);
- else
- __free_pages_ok(page, order);
- }
+ free_the_page(page, compound_order(page), count);
}
EXPORT_SYMBOL(__page_frag_cache_drain);
@@ -4555,14 +4558,7 @@ void page_frag_free(void *addr)
{
struct page *page = virt_to_head_page(addr);
- if (unlikely(put_page_testzero(page))) {
- unsigned int order = compound_order(page);
-
- if (order == 0)
- free_unref_page(page);
- else
- __free_pages_ok(page, order);
- }
+ free_the_page(page, compound_order(page), 1);
}
EXPORT_SYMBOL(page_frag_free);
--
2.17.2
^ permalink raw reply related
* Re: Kernel 4.19 network performance - forwarding/routing normal users traffic
From: Jesper Dangaard Brouer @ 2018-11-05 20:17 UTC (permalink / raw)
To: Paweł Staszewski; +Cc: David Ahern, netdev, Yoel Caspersen, brouer
In-Reply-To: <394a0bf2-fa97-1085-2eda-98ddf476895c@itcare.pl>
On Sun, 4 Nov 2018 01:24:03 +0100 Paweł Staszewski <pstaszewski@itcare.pl> wrote:
> And today again after allpy patch for page allocator - reached again
> 64/64 Gbit/s
>
> with only 50-60% cpu load
Great.
> today no slowpath hit for netwoking :)
>
> But again dropped pckt at 64GbitRX and 64TX ....
> And as it should not be pcie express limit -i think something more is
Well, this does sounds like a PCIe bandwidth limit to me.
See the PCIe BW here: https://en.wikipedia.org/wiki/PCI_Express
You likely have PCIe v3, where 1-lane have 984.6 MBytes/s or 7.87 Gbit/s
Thus, x16-lanes have 15.75 GBytes or 126 Gbit/s. It does say "in each
direction", but you are also forwarding this RX->TX on both (dual) ports
NIC that is sharing the same PCIe slot.
> going on there - and hard to catch - cause perf top doestn chenged
> besides there is no queued slowpath hit now
>
> I ordered now also intel cards to compare - but 3 weeks eta
> Faster - cause 3 days - i will have mellanox connectx 5 - so can
> separate traffic to two different x16 pcie busses
I do think you need to separate traffic to two different x16 PCIe
slots. I have found that the ConnectX-5 is significantly faster
packet-per-sec performance than ConnectX-4, but that is not your
use-case (max BW). I've not tested these NICs for maximum
_bidirectional_ bandwidth limits, I've only made sure I can do 100G
unidirectional, which can hit some funny motherboard memory limits
(remember to equip motherboard with 4 RAM blocks for full memory BW).
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
LinkedIn: http://www.linkedin.com/in/brouer
^ permalink raw reply
* Re: [PATCH 0/5] VSOCK: support mergeable rx buffer in vhost-vsock
From: jiangyiwen @ 2018-11-06 5:53 UTC (permalink / raw)
To: Jason Wang, stefanha; +Cc: netdev, kvm, virtualization
In-Reply-To: <229559d5-1787-09b1-6c26-57b535f20006@redhat.com>
On 2018/11/6 11:32, Jason Wang wrote:
>
> On 2018/11/6 上午11:17, jiangyiwen wrote:
>> On 2018/11/6 10:41, Jason Wang wrote:
>>> On 2018/11/6 上午10:17, jiangyiwen wrote:
>>>> On 2018/11/5 17:21, Jason Wang wrote:
>>>>> On 2018/11/5 下午3:43, jiangyiwen wrote:
>>>>>> Now vsock only support send/receive small packet, it can't achieve
>>>>>> high performance. As previous discussed with Jason Wang, I revisit the
>>>>>> idea of vhost-net about mergeable rx buffer and implement the mergeable
>>>>>> rx buffer in vhost-vsock, it can allow big packet to be scattered in
>>>>>> into different buffers and improve performance obviously.
>>>>>>
>>>>>> I write a tool to test the vhost-vsock performance, mainly send big
>>>>>> packet(64K) included guest->Host and Host->Guest. The result as
>>>>>> follows:
>>>>>>
>>>>>> Before performance:
>>>>>> Single socket Multiple sockets(Max Bandwidth)
>>>>>> Guest->Host ~400MB/s ~480MB/s
>>>>>> Host->Guest ~1450MB/s ~1600MB/s
>>>>>>
>>>>>> After performance:
>>>>>> Single socket Multiple sockets(Max Bandwidth)
>>>>>> Guest->Host ~1700MB/s ~2900MB/s
>>>>>> Host->Guest ~1700MB/s ~2900MB/s
>>>>>>
>>>>>> From the test results, the performance is improved obviously, and guest
>>>>>> memory will not be wasted.
>>>>> Hi:
>>>>>
>>>>> Thanks for the patches and the numbers are really impressive.
>>>>>
>>>>> But instead of duplicating codes between sock and net. I was considering to use virtio-net as a transport of vsock. Then we may have all existed features likes batching, mergeable rx buffers and multiqueue. Want to consider this idea? Thoughts?
>>>>>
>>>>>
>>>> Hi Jason,
>>>>
>>>> I am not very familiar with virtio-net, so I am afraid I can't give too
>>>> much effective advice. Then I have several problems:
>>>>
>>>> 1. If use virtio-net as a transport, guest should see a virtio-net
>>>> device instead of virtio-vsock device, right? Is vsock only as a
>>>> transport between socket and net_device? User should still use
>>>> AF_VSOCK type to create socket, right?
>>>
>>> Well, there're many choices. What you need is just to keep the socket API and hide the implementation. For example, you can keep the vosck device in guest and switch to use vhost-net in host. We probably need a new feature bit or header to let vhost know we are passing vsock packet. And vhost-net could forward the packet to vsock core on host.
>>>
>>>
>>>> 2. I want to know if this idea has already started, and how is
>>>> the current progress?
>>>
>>> Not yet started. Just want to listen from the community. If this sounds good, do you have interest in implementing this?
>>>
>>>
>>>> 3. And what is stefan's idea?
>>>
>>> Talk with Stefan a little on this during KVM Forum. I think he tends to agree on this idea. Anyway, let's wait for his reply.
>>>
>>>
>>> Thanks
>>>
>>>
>> Hi Jason,
>>
>> Thanks your reply, what you want is try to avoid duplicate code, and still
>> use the existed features with virtio-net.
>
>
> Yes, technically we can use virtio-net driver is guest as well but we could do it step by step.
>
>
>> Yes, if this sounds good and most people can recognize this idea, I am very
>> happy to implement this.
>
>
> Cool, thanks.
>
>
>>
>> In addition, I hope you can review these patches before the new idea is
>> implemented, after all the performance can be improved. :-)
>
>
> Ok.
>
>
> So the patch actually did three things:
>
> - mergeable buffer implementation
>
> - increase the default rx buffer size
>
> - add used and signal guest in a batch
>
> It would be helpful if you can measure the performance improvement independently. This can give reviewer a better understanding on how much did each part help.
>
> Thanks
>
>
Great, I will test the performance independently in the later version.
Thanks,
Yiwen.
>>
>> Thanks,
>> Yiwen.
>>
>>>> Thanks,
>>>> Yiwen.
>>>>
>>> .
>>>
>>
>
> .
>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* [PATCH net] net/ipv6: Move anycast init/cleanup functions out of CONFIG_PROC_FS
From: Jeff Barnhill @ 2018-11-05 20:36 UTC (permalink / raw)
To: netdev; +Cc: davem, kuznet, yoshfuji, Jeff Barnhill
Move the anycast.c init and cleanup functions which were inadvertently
added inside the CONFIG_PROC_FS definition.
Fixes: 2384d02520ff ("net/ipv6: Add anycast addresses to a global hashtable")
Signed-off-by: Jeff Barnhill <0xeffeff@gmail.com>
---
net/ipv6/anycast.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 7698637cf827..94999058e110 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -590,6 +590,7 @@ void ac6_proc_exit(struct net *net)
{
remove_proc_entry("anycast6", net->proc_net);
}
+#endif
/* Init / cleanup code
*/
@@ -611,4 +612,3 @@ void ipv6_anycast_cleanup(void)
WARN_ON(!hlist_empty(&inet6_acaddr_lst[i]));
spin_unlock(&acaddr_hash_lock);
}
-#endif
--
2.14.1
^ permalink raw reply related
* Re: [PATCH 1/5] VSOCK: support fill mergeable rx buffer in guest
From: jiangyiwen @ 2018-11-06 6:22 UTC (permalink / raw)
To: Jason Wang, stefanha; +Cc: netdev, kvm, virtualization
In-Reply-To: <1801c013-6cfb-1d07-3401-49f536e01983@redhat.com>
On 2018/11/6 11:38, Jason Wang wrote:
>
> On 2018/11/5 下午3:45, jiangyiwen wrote:
>> In driver probing, if virtio has VIRTIO_VSOCK_F_MRG_RXBUF feature,
>> it will fill mergeable rx buffer, support for host send mergeable
>> rx buffer. It will fill a page everytime to compact with small
>> packet and big packet.
>>
>> Signed-off-by: Yiwen Jiang <jiangyiwen@huawei.com>
>> ---
>> include/linux/virtio_vsock.h | 3 ++
>> net/vmw_vsock/virtio_transport.c | 72 +++++++++++++++++++++++++++++-----------
>> 2 files changed, 56 insertions(+), 19 deletions(-)
>>
>> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
>> index e223e26..bf84418 100644
>> --- a/include/linux/virtio_vsock.h
>> +++ b/include/linux/virtio_vsock.h
>> @@ -14,6 +14,9 @@
>> #define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL
>> #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64)
>>
>> +/* Virtio-vsock feature */
>> +#define VIRTIO_VSOCK_F_MRG_RXBUF 0 /* Host can merge receive buffers. */
>> +
>> enum {
>> VSOCK_VQ_RX = 0, /* for host to guest data */
>> VSOCK_VQ_TX = 1, /* for guest to host data */
>> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
>> index 5d3cce9..2040a9e 100644
>> --- a/net/vmw_vsock/virtio_transport.c
>> +++ b/net/vmw_vsock/virtio_transport.c
>> @@ -64,6 +64,7 @@ struct virtio_vsock {
>> struct virtio_vsock_event event_list[8];
>>
>> u32 guest_cid;
>> + bool mergeable;
>> };
>>
>> static struct virtio_vsock *virtio_vsock_get(void)
>> @@ -256,6 +257,25 @@ static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock,
>> return 0;
>> }
>>
>> +static int fill_mergeable_rx_buff(struct virtqueue *vq)
>> +{
>> + void *page = NULL;
>> + struct scatterlist sg;
>> + int err;
>> +
>> + page = (void *)get_zeroed_page(GFP_KERNEL);
>
>
> Any reason to use zeroed page?
In previous version, the entire structure of virtio_vsock_pkt is preallocated
in guest use kzalloc, it is a contiguous zeroed physical memory, but host only
need to fill virtio_vsock_hdr size.
However, in mergeable rx buffer version, we only fill a page in vring descriptor
in guest, and I will reserve size of virtio_vsock_pkt in host instead of write
the total size of virtio_vsock_pkt, for the correctness of structure value,
we should set zeroed page in advance.
>
>
>> + if (!page)
>> + return -ENOMEM;
>> +
>> + sg_init_one(&sg, page, PAGE_SIZE);
>
>
> FYI, for virtio-net we have several optimizations for mergeable rx buffer:
>
> - skb_page_frag_refill() which can use high order page and reduce the stress of page allocator
>
You're right, initially I want to use a memory poll to manage the rx buffer,
and then use this in the later optimized patch. Your advice is very great.
> - we don't use fixed buffer size, instead we use EWMA to estimate the possible rx buffer size to avoid internal fragmentation
>
Ok, I analysis the feature and consider add it into my patches.
>
> If we can try to reuse virtio-net driver, we will get those nice features.
>
Yes, after all virtio-net has a very good ecological environment, and it also
do many performance optimization, it is actually a good idea.
>
> Thanks
>
>
>> +
>> + err = virtqueue_add_inbuf(vq, &sg, 1, page, GFP_KERNEL);
>> + if (err < 0)
>> + free_page((unsigned long) page);
>> +
>> + return err;
>> +}
>> +
>> static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
>> {
>> int buf_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
>> @@ -267,27 +287,33 @@ static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
>> vq = vsock->vqs[VSOCK_VQ_RX];
>>
>> do {
>> - pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
>> - if (!pkt)
>> - break;
>> + if (vsock->mergeable) {
>> + ret = fill_mergeable_rx_buff(vq);
>> + if (ret)
>> + break;
>> + } else {
>> + pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
>> + if (!pkt)
>> + break;
>>
>> - pkt->buf = kmalloc(buf_len, GFP_KERNEL);
>> - if (!pkt->buf) {
>> - virtio_transport_free_pkt(pkt);
>> - break;
>> - }
>> + pkt->buf = kmalloc(buf_len, GFP_KERNEL);
>> + if (!pkt->buf) {
>> + virtio_transport_free_pkt(pkt);
>> + break;
>> + }
>>
>> - pkt->len = buf_len;
>> + pkt->len = buf_len;
>>
>> - sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
>> - sgs[0] = &hdr;
>> + sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
>> + sgs[0] = &hdr;
>>
>> - sg_init_one(&buf, pkt->buf, buf_len);
>> - sgs[1] = &buf;
>> - ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL);
>> - if (ret) {
>> - virtio_transport_free_pkt(pkt);
>> - break;
>> + sg_init_one(&buf, pkt->buf, buf_len);
>> + sgs[1] = &buf;
>> + ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL);
>> + if (ret) {
>> + virtio_transport_free_pkt(pkt);
>> + break;
>> + }
>> }
>> vsock->rx_buf_nr++;
>> } while (vq->num_free);
>> @@ -588,6 +614,9 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
>> if (ret < 0)
>> goto out_vqs;
>>
>> + if (virtio_has_feature(vdev, VIRTIO_VSOCK_F_MRG_RXBUF))
>> + vsock->mergeable = true;
>> +
>> vsock->rx_buf_nr = 0;
>> vsock->rx_buf_max_nr = 0;
>> atomic_set(&vsock->queued_replies, 0);
>> @@ -640,8 +669,12 @@ static void virtio_vsock_remove(struct virtio_device *vdev)
>> vdev->config->reset(vdev);
>>
>> mutex_lock(&vsock->rx_lock);
>> - while ((pkt = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_RX])))
>> - virtio_transport_free_pkt(pkt);
>> + while ((pkt = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_RX]))) {
>> + if (vsock->mergeable)
>> + free_page((unsigned long)(void *)pkt);
>> + else
>> + virtio_transport_free_pkt(pkt);
>> + }
>> mutex_unlock(&vsock->rx_lock);
>>
>> mutex_lock(&vsock->tx_lock);
>> @@ -683,6 +716,7 @@ static void virtio_vsock_remove(struct virtio_device *vdev)
>> };
>>
>> static unsigned int features[] = {
>> + VIRTIO_VSOCK_F_MRG_RXBUF,
>> };
>>
>> static struct virtio_driver virtio_vsock_driver = {
>
> .
>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH net] net/ipv6: Move anycast init/cleanup functions out of CONFIG_PROC_FS
From: David Miller @ 2018-11-05 21:37 UTC (permalink / raw)
To: 0xeffeff; +Cc: netdev, kuznet, yoshfuji
In-Reply-To: <20181105203645.4993-1-0xeffeff@gmail.com>
From: Jeff Barnhill <0xeffeff@gmail.com>
Date: Mon, 5 Nov 2018 20:36:45 +0000
> Move the anycast.c init and cleanup functions which were inadvertently
> added inside the CONFIG_PROC_FS definition.
>
> Fixes: 2384d02520ff ("net/ipv6: Add anycast addresses to a global hashtable")
> Signed-off-by: Jeff Barnhill <0xeffeff@gmail.com>
Applied, thanks Jeff.
^ permalink raw reply
* Re: [PATCH net] net/ipv6: Move anycast init/cleanup functions out of CONFIG_PROC_FS
From: Jeff Barnhill @ 2018-11-05 22:02 UTC (permalink / raw)
To: davem; +Cc: netdev, Alexey Kuznetsov, yoshfuji
In-Reply-To: <20181105.133702.1214041828910910455.davem@davemloft.net>
Thanks, David. Sorry for missing that in the original patch.
Jeff
On Mon, Nov 5, 2018 at 4:55 PM David Miller <davem@davemloft.net> wrote:
>
> From: Jeff Barnhill <0xeffeff@gmail.com>
> Date: Mon, 5 Nov 2018 20:36:45 +0000
>
> > Move the anycast.c init and cleanup functions which were inadvertently
> > added inside the CONFIG_PROC_FS definition.
> >
> > Fixes: 2384d02520ff ("net/ipv6: Add anycast addresses to a global hashtable")
> > Signed-off-by: Jeff Barnhill <0xeffeff@gmail.com>
>
> Applied, thanks Jeff.
^ permalink raw reply
* Re: [PATCH v2 2/2] mm/page_alloc: use a single function to free page
From: Vlastimil Babka @ 2018-11-06 8:16 UTC (permalink / raw)
To: Aaron Lu, linux-mm, linux-kernel, netdev
Cc: Andrew Morton, Paweł Staszewski, Jesper Dangaard Brouer,
Eric Dumazet, Tariq Toukan, Ilias Apalodimas, Yoel Caspersen,
Mel Gorman, Saeed Mahameed, Michal Hocko, Dave Hansen,
Alexander Duyck
In-Reply-To: <20181106053037.GD6203@intel.com>
On 11/6/18 6:30 AM, Aaron Lu wrote:
> We have multiple places of freeing a page, most of them doing similar
> things and a common function can be used to reduce code duplicate.
>
> It also avoids bug fixed in one function but left in another.
>
> Signed-off-by: Aaron Lu <aaron.lu@intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
I assume there's no arch that would run page_ref_sub_and_test(1) slower
than put_page_testzero(), for the critical __free_pages() case?
> ---
> v2: move comments close to code as suggested by Dave.
>
> mm/page_alloc.c | 36 ++++++++++++++++--------------------
> 1 file changed, 16 insertions(+), 20 deletions(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 91a9a6af41a2..4faf6b7bf225 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -4425,9 +4425,17 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)
> }
> EXPORT_SYMBOL(get_zeroed_page);
>
> -void __free_pages(struct page *page, unsigned int order)
> +static inline void free_the_page(struct page *page, unsigned int order, int nr)
> {
> - if (put_page_testzero(page)) {
> + VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
> +
> + /*
> + * Free a page by reducing its ref count by @nr.
> + * If its refcount reaches 0, then according to its order:
> + * order0: send to PCP;
> + * high order: directly send to Buddy.
> + */
> + if (page_ref_sub_and_test(page, nr)) {
> if (order == 0)
> free_unref_page(page);
> else
> @@ -4435,6 +4443,10 @@ void __free_pages(struct page *page, unsigned int order)
> }
> }
>
> +void __free_pages(struct page *page, unsigned int order)
> +{
> + free_the_page(page, order, 1);
> +}
> EXPORT_SYMBOL(__free_pages);
>
> void free_pages(unsigned long addr, unsigned int order)
> @@ -4481,16 +4493,7 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>
> void __page_frag_cache_drain(struct page *page, unsigned int count)
> {
> - VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
> -
> - if (page_ref_sub_and_test(page, count)) {
> - unsigned int order = compound_order(page);
> -
> - if (order == 0)
> - free_unref_page(page);
> - else
> - __free_pages_ok(page, order);
> - }
> + free_the_page(page, compound_order(page), count);
> }
> EXPORT_SYMBOL(__page_frag_cache_drain);
>
> @@ -4555,14 +4558,7 @@ void page_frag_free(void *addr)
> {
> struct page *page = virt_to_head_page(addr);
>
> - if (unlikely(put_page_testzero(page))) {
> - unsigned int order = compound_order(page);
> -
> - if (order == 0)
> - free_unref_page(page);
> - else
> - __free_pages_ok(page, order);
> - }
> + free_the_page(page, compound_order(page), 1);
> }
> EXPORT_SYMBOL(page_frag_free);
>
>
^ permalink raw reply
* Re: [PATCH v2 2/2] mm/page_alloc: use a single function to free page
From: Aaron Lu @ 2018-11-06 8:47 UTC (permalink / raw)
To: Vlastimil Babka
Cc: linux-mm, linux-kernel, netdev, Andrew Morton,
Paweł Staszewski, Jesper Dangaard Brouer, Eric Dumazet,
Tariq Toukan, Ilias Apalodimas, Yoel Caspersen, Mel Gorman,
Saeed Mahameed, Michal Hocko, Dave Hansen, Alexander Duyck
In-Reply-To: <d6b4890c-0def-6114-2dcf-3ed120dea82c@suse.cz>
On Tue, Nov 06, 2018 at 09:16:20AM +0100, Vlastimil Babka wrote:
> On 11/6/18 6:30 AM, Aaron Lu wrote:
> > We have multiple places of freeing a page, most of them doing similar
> > things and a common function can be used to reduce code duplicate.
> >
> > It also avoids bug fixed in one function but left in another.
> >
> > Signed-off-by: Aaron Lu <aaron.lu@intel.com>
>
> Acked-by: Vlastimil Babka <vbabka@suse.cz>
Thanks.
> I assume there's no arch that would run page_ref_sub_and_test(1) slower
> than put_page_testzero(), for the critical __free_pages() case?
Good question.
I followed the non-arch specific calls and found that:
page_ref_sub_and_test() ends up calling atomic_sub_return(i, v) while
put_page_testzero() ends up calling atomic_sub_return(1, v). So they
should be same for archs that do not have their own implementations.
Back to your question: I don't know either.
If this is deemed unsafe, we can probably keep the ref modify part in
their original functions and only take the free part into a common
function.
Regards,
Aaron
> > ---
> > v2: move comments close to code as suggested by Dave.
> >
> > mm/page_alloc.c | 36 ++++++++++++++++--------------------
> > 1 file changed, 16 insertions(+), 20 deletions(-)
> >
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 91a9a6af41a2..4faf6b7bf225 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -4425,9 +4425,17 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)
> > }
> > EXPORT_SYMBOL(get_zeroed_page);
> >
> > -void __free_pages(struct page *page, unsigned int order)
> > +static inline void free_the_page(struct page *page, unsigned int order, int nr)
> > {
> > - if (put_page_testzero(page)) {
> > + VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
> > +
> > + /*
> > + * Free a page by reducing its ref count by @nr.
> > + * If its refcount reaches 0, then according to its order:
> > + * order0: send to PCP;
> > + * high order: directly send to Buddy.
> > + */
> > + if (page_ref_sub_and_test(page, nr)) {
> > if (order == 0)
> > free_unref_page(page);
> > else
> > @@ -4435,6 +4443,10 @@ void __free_pages(struct page *page, unsigned int order)
> > }
> > }
> >
> > +void __free_pages(struct page *page, unsigned int order)
> > +{
> > + free_the_page(page, order, 1);
> > +}
> > EXPORT_SYMBOL(__free_pages);
> >
> > void free_pages(unsigned long addr, unsigned int order)
> > @@ -4481,16 +4493,7 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
> >
> > void __page_frag_cache_drain(struct page *page, unsigned int count)
> > {
> > - VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
> > -
> > - if (page_ref_sub_and_test(page, count)) {
> > - unsigned int order = compound_order(page);
> > -
> > - if (order == 0)
> > - free_unref_page(page);
> > - else
> > - __free_pages_ok(page, order);
> > - }
> > + free_the_page(page, compound_order(page), count);
> > }
> > EXPORT_SYMBOL(__page_frag_cache_drain);
> >
> > @@ -4555,14 +4558,7 @@ void page_frag_free(void *addr)
> > {
> > struct page *page = virt_to_head_page(addr);
> >
> > - if (unlikely(put_page_testzero(page))) {
> > - unsigned int order = compound_order(page);
> > -
> > - if (order == 0)
> > - free_unref_page(page);
> > - else
> > - __free_pages_ok(page, order);
> > - }
> > + free_the_page(page, compound_order(page), 1);
> > }
> > EXPORT_SYMBOL(page_frag_free);
> >
> >
>
^ permalink raw reply
* [PATCH 00/14] Netfilter fixes for net
From: Pablo Neira Ayuso @ 2018-11-05 23:28 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
Hi David,
The following patchset contains the first batch of Netfilter fixes for
your net tree:
1) Fix splat with IPv6 defragmenting locally generated fragments,
from Florian Westphal.
2) Fix Incorrect check for missing attribute in nft_osf.
3) Missing INT_MIN & INT_MAX definition for netfilter bridge uapi
header, from Jiri Slaby.
4) Revert map lookup in nft_numgen, this is already possible with
the existing infrastructure without this extension.
5) Fix wrong listing of set reference counter, make counter
synchronous again, from Stefano Brivio.
6) Fix CIDR 0 in hash:net,port,net, from Eric Westbrook.
7) Fix allocation failure with large set, use kvcalloc().
From Andrey Ryabinin.
8) No need to disable BH when fetch ip set comment, patch from
Jozsef Kadlecsik.
9) Sanity check for valid sysfs entry in xt_IDLETIMER, from
Taehee Yoo.
10) Fix suspicious rcu usage via ip_set() macro at netlink dump,
from Jozsef Kadlecsik.
11) Fix setting default timeout via nfnetlink_cttimeout, this
comes with preparation patch to add nf_{tcp,udp,...}_pernet()
helper.
12) Allow ebtables table nat to be of filter type via nft_compat.
From Florian Westphal.
13) Incorrect calculation of next bucket in early_drop, do no bump
hash value, update bucket counter instead. From Vasily Khoruzhick.
You can pull these changes from:
git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git
Thanks!
----------------------------------------------------------------
The following changes since commit 4f3ebb04d05fe36f74ef17c6ee06559626d47964:
Merge branch '100GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/net-queue (2018-10-24 16:27:33 -0700)
are available in the git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git HEAD
for you to fetch changes up to f393808dc64149ccd0e5a8427505ba2974a59854:
netfilter: conntrack: fix calculation of next bucket number in early_drop (2018-11-03 14:16:28 +0100)
----------------------------------------------------------------
Andrey Ryabinin (1):
netfilter: ipset: fix ip_set_list allocation failure
Eric Westbrook (1):
netfilter: ipset: actually allow allowable CIDR 0 in hash:net,port,net
Florian Westphal (2):
netfilter: ipv6: fix oops when defragmenting locally generated fragments
netfilter: nft_compat: ebtables 'nat' table is normal chain type
Jiri Slaby (1):
netfilter: bridge: define INT_MIN & INT_MAX in userspace
Jozsef Kadlecsik (2):
netfilter: ipset: Correct rcu_dereference() call in ip_set_put_comment()
netfilter: ipset: Fix calling ip_set() macro at dumping
Pablo Neira Ayuso (4):
netfilter: nft_osf: check if attribute is present
Revert "netfilter: nft_numgen: add map lookups for numgen random operations"
netfilter: conntrack: add nf_{tcp,udp,sctp,icmp,dccp,icmpv6,generic}_pernet()
netfilter: nfnetlink_cttimeout: pass default timeout policy to obj_to_nlattr
Stefano Brivio (1):
netfilter: ipset: list:set: Decrease refcount synchronously on deletion and replace
Taehee Yoo (1):
netfilter: xt_IDLETIMER: add sysfs filename checking routine
Vasily Khoruzhick (1):
netfilter: conntrack: fix calculation of next bucket number in early_drop
include/linux/netfilter/ipset/ip_set.h | 2 +-
include/linux/netfilter/ipset/ip_set_comment.h | 4 +-
include/net/netfilter/nf_conntrack_l4proto.h | 39 ++++++++
include/uapi/linux/netfilter/nf_tables.h | 4 +-
include/uapi/linux/netfilter_bridge.h | 4 +
net/ipv6/netfilter/nf_conntrack_reasm.c | 13 ++-
net/netfilter/ipset/ip_set_core.c | 43 +++++----
net/netfilter/ipset/ip_set_hash_netportnet.c | 8 +-
net/netfilter/ipset/ip_set_list_set.c | 17 ++--
net/netfilter/nf_conntrack_core.c | 13 ++-
net/netfilter/nf_conntrack_proto_dccp.c | 13 +--
net/netfilter/nf_conntrack_proto_generic.c | 11 +--
net/netfilter/nf_conntrack_proto_icmp.c | 11 +--
net/netfilter/nf_conntrack_proto_icmpv6.c | 11 +--
net/netfilter/nf_conntrack_proto_sctp.c | 11 +--
net/netfilter/nf_conntrack_proto_tcp.c | 15 +--
net/netfilter/nf_conntrack_proto_udp.c | 11 +--
net/netfilter/nfnetlink_cttimeout.c | 47 +++++++--
net/netfilter/nft_compat.c | 21 ++--
net/netfilter/nft_numgen.c | 127 -------------------------
net/netfilter/nft_osf.c | 2 +-
net/netfilter/xt_IDLETIMER.c | 20 ++++
22 files changed, 200 insertions(+), 247 deletions(-)
^ permalink raw reply
* [PATCH 01/14] netfilter: ipv6: fix oops when defragmenting locally generated fragments
From: Pablo Neira Ayuso @ 2018-11-05 23:28 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20181105232832.21896-1-pablo@netfilter.org>
From: Florian Westphal <fw@strlen.de>
Unlike ipv4 and normal ipv6 defrag, netfilter ipv6 defragmentation did
not save/restore skb->dst.
This causes oops when handling locally generated ipv6 fragments, as
output path needs a valid dst.
Reported-by: Maciej Żenczykowski <zenczykowski@gmail.com>
Fixes: 84379c9afe01 ("netfilter: ipv6: nf_defrag: drop skb dst before queueing")
Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/ipv6/netfilter/nf_conntrack_reasm.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index b8ac369f98ad..d219979c3e52 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -587,11 +587,16 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
*/
ret = -EINPROGRESS;
if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
- fq->q.meat == fq->q.len &&
- nf_ct_frag6_reasm(fq, skb, dev))
- ret = 0;
- else
+ fq->q.meat == fq->q.len) {
+ unsigned long orefdst = skb->_skb_refdst;
+
+ skb->_skb_refdst = 0UL;
+ if (nf_ct_frag6_reasm(fq, skb, dev))
+ ret = 0;
+ skb->_skb_refdst = orefdst;
+ } else {
skb_dst_drop(skb);
+ }
out_unlock:
spin_unlock_bh(&fq->q.lock);
--
2.11.0
^ permalink raw reply related
* [PATCH 02/14] netfilter: nft_osf: check if attribute is present
From: Pablo Neira Ayuso @ 2018-11-05 23:28 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20181105232832.21896-1-pablo@netfilter.org>
If the attribute is not sent, eg. old libnftnl binary, then
tb[NFTA_OSF_TTL] is NULL and kernel crashes from the _init path.
Fixes: a218dc82f0b5 ("netfilter: nft_osf: Add ttl option support")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nft_osf.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
index ca5e5d8c5ef8..b13618c764ec 100644
--- a/net/netfilter/nft_osf.c
+++ b/net/netfilter/nft_osf.c
@@ -50,7 +50,7 @@ static int nft_osf_init(const struct nft_ctx *ctx,
int err;
u8 ttl;
- if (nla_get_u8(tb[NFTA_OSF_TTL])) {
+ if (tb[NFTA_OSF_TTL]) {
ttl = nla_get_u8(tb[NFTA_OSF_TTL]);
if (ttl > 2)
return -EINVAL;
--
2.11.0
^ permalink raw reply related
* [PATCH 05/14] netfilter: ipset: list:set: Decrease refcount synchronously on deletion and replace
From: Pablo Neira Ayuso @ 2018-11-05 23:28 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20181105232832.21896-1-pablo@netfilter.org>
From: Stefano Brivio <sbrivio@redhat.com>
Commit 45040978c899 ("netfilter: ipset: Fix set:list type crash
when flush/dump set in parallel") postponed decreasing set
reference counters to the RCU callback.
An 'ipset del' command can terminate before the RCU grace period
is elapsed, and if sets are listed before then, the reference
counter shown in userspace will be wrong:
# ipset create h hash:ip; ipset create l list:set; ipset add l
# ipset del l h; ipset list h
Name: h
Type: hash:ip
Revision: 4
Header: family inet hashsize 1024 maxelem 65536
Size in memory: 88
References: 1
Number of entries: 0
Members:
# sleep 1; ipset list h
Name: h
Type: hash:ip
Revision: 4
Header: family inet hashsize 1024 maxelem 65536
Size in memory: 88
References: 0
Number of entries: 0
Members:
Fix this by making the reference count update synchronous again.
As a result, when sets are listed, ip_set_name_byindex() might
now fetch a set whose reference count is already zero. Instead
of relying on the reference count to protect against concurrent
set renaming, grab ip_set_ref_lock as reader and copy the name,
while holding the same lock in ip_set_rename() as writer
instead.
Reported-by: Li Shuang <shuali@redhat.com>
Fixes: 45040978c899 ("netfilter: ipset: Fix set:list type crash when flush/dump set in parallel")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
include/linux/netfilter/ipset/ip_set.h | 2 +-
net/netfilter/ipset/ip_set_core.c | 23 +++++++++++------------
net/netfilter/ipset/ip_set_list_set.c | 17 +++++++++++------
3 files changed, 23 insertions(+), 19 deletions(-)
diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 34fc80f3eb90..1d100efe74ec 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -314,7 +314,7 @@ enum {
extern ip_set_id_t ip_set_get_byname(struct net *net,
const char *name, struct ip_set **set);
extern void ip_set_put_byindex(struct net *net, ip_set_id_t index);
-extern const char *ip_set_name_byindex(struct net *net, ip_set_id_t index);
+extern void ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name);
extern ip_set_id_t ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index);
extern void ip_set_nfnl_put(struct net *net, ip_set_id_t index);
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index bc4bd247bb7d..fa15a831aeee 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -693,21 +693,20 @@ ip_set_put_byindex(struct net *net, ip_set_id_t index)
EXPORT_SYMBOL_GPL(ip_set_put_byindex);
/* Get the name of a set behind a set index.
- * We assume the set is referenced, so it does exist and
- * can't be destroyed. The set cannot be renamed due to
- * the referencing either.
- *
+ * Set itself is protected by RCU, but its name isn't: to protect against
+ * renaming, grab ip_set_ref_lock as reader (see ip_set_rename()) and copy the
+ * name.
*/
-const char *
-ip_set_name_byindex(struct net *net, ip_set_id_t index)
+void
+ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name)
{
- const struct ip_set *set = ip_set_rcu_get(net, index);
+ struct ip_set *set = ip_set_rcu_get(net, index);
BUG_ON(!set);
- BUG_ON(set->ref == 0);
- /* Referenced, so it's safe */
- return set->name;
+ read_lock_bh(&ip_set_ref_lock);
+ strncpy(name, set->name, IPSET_MAXNAMELEN);
+ read_unlock_bh(&ip_set_ref_lock);
}
EXPORT_SYMBOL_GPL(ip_set_name_byindex);
@@ -1153,7 +1152,7 @@ static int ip_set_rename(struct net *net, struct sock *ctnl,
if (!set)
return -ENOENT;
- read_lock_bh(&ip_set_ref_lock);
+ write_lock_bh(&ip_set_ref_lock);
if (set->ref != 0) {
ret = -IPSET_ERR_REFERENCED;
goto out;
@@ -1170,7 +1169,7 @@ static int ip_set_rename(struct net *net, struct sock *ctnl,
strncpy(set->name, name2, IPSET_MAXNAMELEN);
out:
- read_unlock_bh(&ip_set_ref_lock);
+ write_unlock_bh(&ip_set_ref_lock);
return ret;
}
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index 072a658fde04..4eef55da0878 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -148,9 +148,7 @@ __list_set_del_rcu(struct rcu_head * rcu)
{
struct set_elem *e = container_of(rcu, struct set_elem, rcu);
struct ip_set *set = e->set;
- struct list_set *map = set->data;
- ip_set_put_byindex(map->net, e->id);
ip_set_ext_destroy(set, e);
kfree(e);
}
@@ -158,15 +156,21 @@ __list_set_del_rcu(struct rcu_head * rcu)
static inline void
list_set_del(struct ip_set *set, struct set_elem *e)
{
+ struct list_set *map = set->data;
+
set->elements--;
list_del_rcu(&e->list);
+ ip_set_put_byindex(map->net, e->id);
call_rcu(&e->rcu, __list_set_del_rcu);
}
static inline void
-list_set_replace(struct set_elem *e, struct set_elem *old)
+list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old)
{
+ struct list_set *map = set->data;
+
list_replace_rcu(&old->list, &e->list);
+ ip_set_put_byindex(map->net, old->id);
call_rcu(&old->rcu, __list_set_del_rcu);
}
@@ -298,7 +302,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
INIT_LIST_HEAD(&e->list);
list_set_init_extensions(set, ext, e);
if (n)
- list_set_replace(e, n);
+ list_set_replace(set, e, n);
else if (next)
list_add_tail_rcu(&e->list, &next->list);
else if (prev)
@@ -486,6 +490,7 @@ list_set_list(const struct ip_set *set,
const struct list_set *map = set->data;
struct nlattr *atd, *nested;
u32 i = 0, first = cb->args[IPSET_CB_ARG0];
+ char name[IPSET_MAXNAMELEN];
struct set_elem *e;
int ret = 0;
@@ -504,8 +509,8 @@ list_set_list(const struct ip_set *set,
nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
if (!nested)
goto nla_put_failure;
- if (nla_put_string(skb, IPSET_ATTR_NAME,
- ip_set_name_byindex(map->net, e->id)))
+ ip_set_name_byindex(map->net, e->id, name);
+ if (nla_put_string(skb, IPSET_ATTR_NAME, name))
goto nla_put_failure;
if (ip_set_put_extensions(skb, set, e, true))
goto nla_put_failure;
--
2.11.0
^ permalink raw reply related
* [PATCH 04/14] Revert "netfilter: nft_numgen: add map lookups for numgen random operations"
From: Pablo Neira Ayuso @ 2018-11-05 23:28 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20181105232832.21896-1-pablo@netfilter.org>
Laura found a better way to do this from userspace without requiring
kernel infrastructure, revert this.
Fixes: 978d8f9055c3 ("netfilter: nft_numgen: add map lookups for numgen random operations")
Signed-off-by: Laura Garcia Liebana <nevola@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
include/uapi/linux/netfilter/nf_tables.h | 4 +-
net/netfilter/nft_numgen.c | 127 -------------------------------
2 files changed, 2 insertions(+), 129 deletions(-)
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 579974b0bf0d..7de4f1bdaf06 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1635,8 +1635,8 @@ enum nft_ng_attributes {
NFTA_NG_MODULUS,
NFTA_NG_TYPE,
NFTA_NG_OFFSET,
- NFTA_NG_SET_NAME,
- NFTA_NG_SET_ID,
+ NFTA_NG_SET_NAME, /* deprecated */
+ NFTA_NG_SET_ID, /* deprecated */
__NFTA_NG_MAX
};
#define NFTA_NG_MAX (__NFTA_NG_MAX - 1)
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index 649d1700ec5b..3cc1b3dc3c3c 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -24,7 +24,6 @@ struct nft_ng_inc {
u32 modulus;
atomic_t counter;
u32 offset;
- struct nft_set *map;
};
static u32 nft_ng_inc_gen(struct nft_ng_inc *priv)
@@ -48,34 +47,11 @@ static void nft_ng_inc_eval(const struct nft_expr *expr,
regs->data[priv->dreg] = nft_ng_inc_gen(priv);
}
-static void nft_ng_inc_map_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
-{
- struct nft_ng_inc *priv = nft_expr_priv(expr);
- const struct nft_set *map = priv->map;
- const struct nft_set_ext *ext;
- u32 result;
- bool found;
-
- result = nft_ng_inc_gen(priv);
- found = map->ops->lookup(nft_net(pkt), map, &result, &ext);
-
- if (!found)
- return;
-
- nft_data_copy(®s->data[priv->dreg],
- nft_set_ext_data(ext), map->dlen);
-}
-
static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = {
[NFTA_NG_DREG] = { .type = NLA_U32 },
[NFTA_NG_MODULUS] = { .type = NLA_U32 },
[NFTA_NG_TYPE] = { .type = NLA_U32 },
[NFTA_NG_OFFSET] = { .type = NLA_U32 },
- [NFTA_NG_SET_NAME] = { .type = NLA_STRING,
- .len = NFT_SET_MAXNAMELEN - 1 },
- [NFTA_NG_SET_ID] = { .type = NLA_U32 },
};
static int nft_ng_inc_init(const struct nft_ctx *ctx,
@@ -101,22 +77,6 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
NFT_DATA_VALUE, sizeof(u32));
}
-static int nft_ng_inc_map_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
-{
- struct nft_ng_inc *priv = nft_expr_priv(expr);
- u8 genmask = nft_genmask_next(ctx->net);
-
- nft_ng_inc_init(ctx, expr, tb);
-
- priv->map = nft_set_lookup_global(ctx->net, ctx->table,
- tb[NFTA_NG_SET_NAME],
- tb[NFTA_NG_SET_ID], genmask);
-
- return PTR_ERR_OR_ZERO(priv->map);
-}
-
static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg,
u32 modulus, enum nft_ng_types type, u32 offset)
{
@@ -143,27 +103,10 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr)
priv->offset);
}
-static int nft_ng_inc_map_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
-{
- const struct nft_ng_inc *priv = nft_expr_priv(expr);
-
- if (nft_ng_dump(skb, priv->dreg, priv->modulus,
- NFT_NG_INCREMENTAL, priv->offset) ||
- nla_put_string(skb, NFTA_NG_SET_NAME, priv->map->name))
- goto nla_put_failure;
-
- return 0;
-
-nla_put_failure:
- return -1;
-}
-
struct nft_ng_random {
enum nft_registers dreg:8;
u32 modulus;
u32 offset;
- struct nft_set *map;
};
static u32 nft_ng_random_gen(struct nft_ng_random *priv)
@@ -183,25 +126,6 @@ static void nft_ng_random_eval(const struct nft_expr *expr,
regs->data[priv->dreg] = nft_ng_random_gen(priv);
}
-static void nft_ng_random_map_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
-{
- struct nft_ng_random *priv = nft_expr_priv(expr);
- const struct nft_set *map = priv->map;
- const struct nft_set_ext *ext;
- u32 result;
- bool found;
-
- result = nft_ng_random_gen(priv);
- found = map->ops->lookup(nft_net(pkt), map, &result, &ext);
- if (!found)
- return;
-
- nft_data_copy(®s->data[priv->dreg],
- nft_set_ext_data(ext), map->dlen);
-}
-
static int nft_ng_random_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
@@ -226,21 +150,6 @@ static int nft_ng_random_init(const struct nft_ctx *ctx,
NFT_DATA_VALUE, sizeof(u32));
}
-static int nft_ng_random_map_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
-{
- struct nft_ng_random *priv = nft_expr_priv(expr);
- u8 genmask = nft_genmask_next(ctx->net);
-
- nft_ng_random_init(ctx, expr, tb);
- priv->map = nft_set_lookup_global(ctx->net, ctx->table,
- tb[NFTA_NG_SET_NAME],
- tb[NFTA_NG_SET_ID], genmask);
-
- return PTR_ERR_OR_ZERO(priv->map);
-}
-
static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
const struct nft_ng_random *priv = nft_expr_priv(expr);
@@ -249,22 +158,6 @@ static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
priv->offset);
}
-static int nft_ng_random_map_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
-{
- const struct nft_ng_random *priv = nft_expr_priv(expr);
-
- if (nft_ng_dump(skb, priv->dreg, priv->modulus,
- NFT_NG_RANDOM, priv->offset) ||
- nla_put_string(skb, NFTA_NG_SET_NAME, priv->map->name))
- goto nla_put_failure;
-
- return 0;
-
-nla_put_failure:
- return -1;
-}
-
static struct nft_expr_type nft_ng_type;
static const struct nft_expr_ops nft_ng_inc_ops = {
.type = &nft_ng_type,
@@ -274,14 +167,6 @@ static const struct nft_expr_ops nft_ng_inc_ops = {
.dump = nft_ng_inc_dump,
};
-static const struct nft_expr_ops nft_ng_inc_map_ops = {
- .type = &nft_ng_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_inc)),
- .eval = nft_ng_inc_map_eval,
- .init = nft_ng_inc_map_init,
- .dump = nft_ng_inc_map_dump,
-};
-
static const struct nft_expr_ops nft_ng_random_ops = {
.type = &nft_ng_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_ng_random)),
@@ -290,14 +175,6 @@ static const struct nft_expr_ops nft_ng_random_ops = {
.dump = nft_ng_random_dump,
};
-static const struct nft_expr_ops nft_ng_random_map_ops = {
- .type = &nft_ng_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_random)),
- .eval = nft_ng_random_map_eval,
- .init = nft_ng_random_map_init,
- .dump = nft_ng_random_map_dump,
-};
-
static const struct nft_expr_ops *
nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
{
@@ -312,12 +189,8 @@ nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
switch (type) {
case NFT_NG_INCREMENTAL:
- if (tb[NFTA_NG_SET_NAME])
- return &nft_ng_inc_map_ops;
return &nft_ng_inc_ops;
case NFT_NG_RANDOM:
- if (tb[NFTA_NG_SET_NAME])
- return &nft_ng_random_map_ops;
return &nft_ng_random_ops;
}
--
2.11.0
^ permalink raw reply related
* [PATCH 03/14] netfilter: bridge: define INT_MIN & INT_MAX in userspace
From: Pablo Neira Ayuso @ 2018-11-05 23:28 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20181105232832.21896-1-pablo@netfilter.org>
From: Jiri Slaby <jslaby@suse.cz>
With 4.19, programs like ebtables fail to build when they include
"linux/netfilter_bridge.h". It is caused by commit 94276fa8a2a4 which
added a use of INT_MIN and INT_MAX to the header:
: In file included from /usr/include/linux/netfilter_bridge/ebtables.h:18,
: from include/ebtables_u.h:28,
: from communication.c:23:
: /usr/include/linux/netfilter_bridge.h:30:20: error: 'INT_MIN' undeclared here (not in a function)
: NF_BR_PRI_FIRST = INT_MIN,
: ^~~~~~~
Define these constants by including "limits.h" when !__KERNEL__ (the
same way as for other netfilter_* headers).
Fixes: 94276fa8a2a4 ("netfilter: bridge: Expose nf_tables bridge hook priorities through uapi")
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Acked-by: Máté Eckl <ecklm94@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
include/uapi/linux/netfilter_bridge.h | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/include/uapi/linux/netfilter_bridge.h b/include/uapi/linux/netfilter_bridge.h
index 156ccd089df1..1610fdbab98d 100644
--- a/include/uapi/linux/netfilter_bridge.h
+++ b/include/uapi/linux/netfilter_bridge.h
@@ -11,6 +11,10 @@
#include <linux/if_vlan.h>
#include <linux/if_pppox.h>
+#ifndef __KERNEL__
+#include <limits.h> /* for INT_MIN, INT_MAX */
+#endif
+
/* Bridge Hooks */
/* After promisc drops, checksum checks. */
#define NF_BR_PRE_ROUTING 0
--
2.11.0
^ permalink raw reply related
* [PATCH 07/14] netfilter: ipset: fix ip_set_list allocation failure
From: Pablo Neira Ayuso @ 2018-11-05 23:28 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20181105232832.21896-1-pablo@netfilter.org>
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
ip_set_create() and ip_set_net_init() attempt to allocate physically
contiguous memory for ip_set_list. If memory is fragmented, the
allocations could easily fail:
vzctl: page allocation failure: order:7, mode:0xc0d0
Call Trace:
dump_stack+0x19/0x1b
warn_alloc_failed+0x110/0x180
__alloc_pages_nodemask+0x7bf/0xc60
alloc_pages_current+0x98/0x110
kmalloc_order+0x18/0x40
kmalloc_order_trace+0x26/0xa0
__kmalloc+0x279/0x290
ip_set_net_init+0x4b/0x90 [ip_set]
ops_init+0x3b/0xb0
setup_net+0xbb/0x170
copy_net_ns+0xf1/0x1c0
create_new_namespaces+0xf9/0x180
copy_namespaces+0x8e/0xd0
copy_process+0xb61/0x1a00
do_fork+0x91/0x320
Use kvcalloc() to fallback to 0-order allocations if high order
page isn't available.
Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/ipset/ip_set_core.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index fa15a831aeee..68db946df151 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -960,7 +960,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl,
/* Wraparound */
goto cleanup;
- list = kcalloc(i, sizeof(struct ip_set *), GFP_KERNEL);
+ list = kvcalloc(i, sizeof(struct ip_set *), GFP_KERNEL);
if (!list)
goto cleanup;
/* nfnl mutex is held, both lists are valid */
@@ -972,7 +972,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl,
/* Use new list */
index = inst->ip_set_max;
inst->ip_set_max = i;
- kfree(tmp);
+ kvfree(tmp);
ret = 0;
} else if (ret) {
goto cleanup;
@@ -2058,7 +2058,7 @@ ip_set_net_init(struct net *net)
if (inst->ip_set_max >= IPSET_INVALID_ID)
inst->ip_set_max = IPSET_INVALID_ID - 1;
- list = kcalloc(inst->ip_set_max, sizeof(struct ip_set *), GFP_KERNEL);
+ list = kvcalloc(inst->ip_set_max, sizeof(struct ip_set *), GFP_KERNEL);
if (!list)
return -ENOMEM;
inst->is_deleted = false;
@@ -2086,7 +2086,7 @@ ip_set_net_exit(struct net *net)
}
}
nfnl_unlock(NFNL_SUBSYS_IPSET);
- kfree(rcu_dereference_protected(inst->ip_set_list, 1));
+ kvfree(rcu_dereference_protected(inst->ip_set_list, 1));
}
static struct pernet_operations ip_set_net_ops = {
--
2.11.0
^ permalink raw reply related
* [PATCH 09/14] netfilter: xt_IDLETIMER: add sysfs filename checking routine
From: Pablo Neira Ayuso @ 2018-11-05 23:28 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20181105232832.21896-1-pablo@netfilter.org>
From: Taehee Yoo <ap420073@gmail.com>
When IDLETIMER rule is added, sysfs file is created under
/sys/class/xt_idletimer/timers/
But some label name shouldn't be used.
".", "..", "power", "uevent", "subsystem", etc...
So that sysfs filename checking routine is needed.
test commands:
%iptables -I INPUT -j IDLETIMER --timeout 1 --label "power"
splat looks like:
[95765.423132] sysfs: cannot create duplicate filename '/devices/virtual/xt_idletimer/timers/power'
[95765.433418] CPU: 0 PID: 8446 Comm: iptables Not tainted 4.19.0-rc6+ #20
[95765.449755] Call Trace:
[95765.449755] dump_stack+0xc9/0x16b
[95765.449755] ? show_regs_print_info+0x5/0x5
[95765.449755] sysfs_warn_dup+0x74/0x90
[95765.449755] sysfs_add_file_mode_ns+0x352/0x500
[95765.449755] sysfs_create_file_ns+0x179/0x270
[95765.449755] ? sysfs_add_file_mode_ns+0x500/0x500
[95765.449755] ? idletimer_tg_checkentry+0x3e5/0xb1b [xt_IDLETIMER]
[95765.449755] ? rcu_read_lock_sched_held+0x114/0x130
[95765.449755] ? __kmalloc_track_caller+0x211/0x2b0
[95765.449755] ? memcpy+0x34/0x50
[95765.449755] idletimer_tg_checkentry+0x4e2/0xb1b [xt_IDLETIMER]
[ ... ]
Fixes: 0902b469bd25 ("netfilter: xtables: idletimer target implementation")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/xt_IDLETIMER.c | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index c6acfc2d9c84..eb4cbd244c3d 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -114,6 +114,22 @@ static void idletimer_tg_expired(struct timer_list *t)
schedule_work(&timer->work);
}
+static int idletimer_check_sysfs_name(const char *name, unsigned int size)
+{
+ int ret;
+
+ ret = xt_check_proc_name(name, size);
+ if (ret < 0)
+ return ret;
+
+ if (!strcmp(name, "power") ||
+ !strcmp(name, "subsystem") ||
+ !strcmp(name, "uevent"))
+ return -EINVAL;
+
+ return 0;
+}
+
static int idletimer_tg_create(struct idletimer_tg_info *info)
{
int ret;
@@ -124,6 +140,10 @@ static int idletimer_tg_create(struct idletimer_tg_info *info)
goto out;
}
+ ret = idletimer_check_sysfs_name(info->label, sizeof(info->label));
+ if (ret < 0)
+ goto out_free_timer;
+
sysfs_attr_init(&info->timer->attr.attr);
info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL);
if (!info->timer->attr.attr.name) {
--
2.11.0
^ permalink raw reply related
* [PATCH 08/14] netfilter: ipset: Correct rcu_dereference() call in ip_set_put_comment()
From: Pablo Neira Ayuso @ 2018-11-05 23:28 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20181105232832.21896-1-pablo@netfilter.org>
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
The function is called when rcu_read_lock() is held and not
when rcu_read_lock_bh() is held.
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
| 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--git a/include/linux/netfilter/ipset/ip_set_comment.h b/include/linux/netfilter/ipset/ip_set_comment.h
index 8e2bab1e8e90..70877f8de7e9 100644
--- a/include/linux/netfilter/ipset/ip_set_comment.h
+++ b/include/linux/netfilter/ipset/ip_set_comment.h
@@ -43,11 +43,11 @@ ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment,
rcu_assign_pointer(comment->c, c);
}
-/* Used only when dumping a set, protected by rcu_read_lock_bh() */
+/* Used only when dumping a set, protected by rcu_read_lock() */
static inline int
ip_set_put_comment(struct sk_buff *skb, const struct ip_set_comment *comment)
{
- struct ip_set_comment_rcu *c = rcu_dereference_bh(comment->c);
+ struct ip_set_comment_rcu *c = rcu_dereference(comment->c);
if (!c)
return 0;
--
2.11.0
^ permalink raw reply related
* [PATCH 06/14] netfilter: ipset: actually allow allowable CIDR 0 in hash:net,port,net
From: Pablo Neira Ayuso @ 2018-11-05 23:28 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20181105232832.21896-1-pablo@netfilter.org>
From: Eric Westbrook <eric@westbrook.io>
Allow /0 as advertised for hash:net,port,net sets.
For "hash:net,port,net", ipset(8) says that "either subnet
is permitted to be a /0 should you wish to match port
between all destinations."
Make that statement true.
Before:
# ipset create cidrzero hash:net,port,net
# ipset add cidrzero 0.0.0.0/0,12345,0.0.0.0/0
ipset v6.34: The value of the CIDR parameter of the IP address is invalid
# ipset create cidrzero6 hash:net,port,net family inet6
# ipset add cidrzero6 ::/0,12345,::/0
ipset v6.34: The value of the CIDR parameter of the IP address is invalid
After:
# ipset create cidrzero hash:net,port,net
# ipset add cidrzero 0.0.0.0/0,12345,0.0.0.0/0
# ipset test cidrzero 192.168.205.129,12345,172.16.205.129
192.168.205.129,tcp:12345,172.16.205.129 is in set cidrzero.
# ipset create cidrzero6 hash:net,port,net family inet6
# ipset add cidrzero6 ::/0,12345,::/0
# ipset test cidrzero6 fe80::1,12345,ff00::1
fe80::1,tcp:12345,ff00::1 is in set cidrzero6.
See also:
https://bugzilla.kernel.org/show_bug.cgi?id=200897
https://github.com/ewestbrook/linux/commit/df7ff6efb0934ab6acc11f003ff1a7580d6c1d9c
Signed-off-by: Eric Westbrook <linux@westbrook.io>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/ipset/ip_set_hash_netportnet.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c
index d391485a6acd..613e18e720a4 100644
--- a/net/netfilter/ipset/ip_set_hash_netportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -213,13 +213,13 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
if (tb[IPSET_ATTR_CIDR]) {
e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (!e.cidr[0] || e.cidr[0] > HOST_MASK)
+ if (e.cidr[0] > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
}
if (tb[IPSET_ATTR_CIDR2]) {
e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
- if (!e.cidr[1] || e.cidr[1] > HOST_MASK)
+ if (e.cidr[1] > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
}
@@ -493,13 +493,13 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
if (tb[IPSET_ATTR_CIDR]) {
e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (!e.cidr[0] || e.cidr[0] > HOST_MASK)
+ if (e.cidr[0] > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
}
if (tb[IPSET_ATTR_CIDR2]) {
e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
- if (!e.cidr[1] || e.cidr[1] > HOST_MASK)
+ if (e.cidr[1] > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
}
--
2.11.0
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox