Linux virtualization list

Linux virtualization list
 help / color / mirror / Atom feed

* Re: [PATCH v2 02/17] compat_ioctl: move drivers to generic_compat_ioctl_ptrarg
From: Greg Kroah-Hartman @ 2018-09-12 18:13 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: kvm, Alexander Shishkin, Jarkko Sakkinen, virtualization,
	Benjamin Tissoires, linux-mtd, Peter Huewe, linux1394-devel,
	devel, Jason Gunthorpe, Marek Vasut, linux-input, Tomas Winkler,
	Jiri Kosina, viro, Artem Bityutskiy, netdev, linux-usb,
	linux-kernel, Sudip Mukherjee, Stefan Richter, linux-fsdevel,
	linux-integrity, David S. Miller
In-Reply-To: <20180912150142.157913-2-arnd@arndb.de>

On Wed, Sep 12, 2018 at 05:01:03PM +0200, Arnd Bergmann wrote:
> Each of these drivers has a copy of the same trivial helper function to
> convert the pointer argument and then call the native ioctl handler.
> 
> We now have a generic implementation of that, so use it.
> 
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>

Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

^ permalink raw reply

* [PATCH net-next V2] virtio_net: ethtool tx napi configuration
From: Jason Wang @ 2018-09-13  5:35 UTC (permalink / raw)
  To: mst, jasowang
  Cc: netdev, Willem de Bruijn, davem, linux-kernel, virtualization

Implement ethtool .set_coalesce (-C) and .get_coalesce (-c) handlers.
Interrupt moderation is currently not supported, so these accept and
display the default settings of 0 usec and 1 frame.

Toggle tx napi through a bit in tx-frames. So as to not interfere
with possible future interrupt moderation, value 1 means tx napi while
value 0 means not.

To properly synchronize with the data path, tx napi is disabled and
tx lock is held when changing the value of napi weight. And two more
places that can access tx napi weight:

- speculative tx polling in rx napi, we can leave it as is since it
  is not a must for correctness.
- skb_xmit_done(), one more check of napi weight is added before
  trying to enable tx, to avoid tx being disabled forever if napi is
  disabled after skb_xmit_done() but before the napi poll runs.

Link: https://patchwork.ozlabs.org/patch/948149/
Suggested-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
Changes from V1:
- try to synchronize with datapath to allow changing mode when
  interface is up.
- use tx-frames 0 to disable tx napi and tx-frames 1 to enable tx napi
---
 drivers/net/virtio_net.c | 64 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 63 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 765920905226..6e70864f5899 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -66,6 +66,8 @@ DECLARE_EWMA(pkt_len, 0, 64)
 
 #define VIRTNET_DRIVER_VERSION "1.0.0"
 
+static const u32 ethtool_coalesce_napi_mask = (1UL << 10);
+
 static const unsigned long guest_offloads[] = {
 	VIRTIO_NET_F_GUEST_TSO4,
 	VIRTIO_NET_F_GUEST_TSO6,
@@ -1444,7 +1446,10 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
 
 	virtqueue_napi_complete(napi, sq->vq, 0);
 
-	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
+	/* Check napi.weight to avoid tx stall since it could be set
+	 * to zero by ethtool after skb_xmit_done().
+	 */
+	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS || !sq->napi.weight)
 		netif_tx_wake_queue(txq);
 
 	return 0;
@@ -2181,6 +2186,61 @@ static int virtnet_get_link_ksettings(struct net_device *dev,
 	return 0;
 }
 
+static int virtnet_set_coalesce(struct net_device *dev,
+				struct ethtool_coalesce *ec)
+{
+	struct ethtool_coalesce ec_default = {
+		.cmd = ETHTOOL_SCOALESCE,
+		.rx_max_coalesced_frames = 1,
+	};
+	struct virtnet_info *vi = netdev_priv(dev);
+	int i, napi_weight;
+
+	if (ec->tx_max_coalesced_frames > 1)
+		return -EINVAL;
+
+	ec_default.tx_max_coalesced_frames = ec->tx_max_coalesced_frames;
+	napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
+
+	/* disallow changes to fields not explicitly tested above */
+	if (memcmp(ec, &ec_default, sizeof(ec_default)))
+		return -EINVAL;
+
+	if (napi_weight ^ vi->sq[0].napi.weight) {
+		for (i = 0; i < vi->max_queue_pairs; i++) {
+			struct netdev_queue *txq =
+			       netdev_get_tx_queue(vi->dev, i);
+
+			virtnet_napi_tx_disable(&vi->sq[i].napi);
+			__netif_tx_lock_bh(txq);
+			vi->sq[i].napi.weight = napi_weight;
+			__netif_tx_unlock_bh(txq);
+			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
+					       &vi->sq[i].napi);
+		}
+	}
+
+	return 0;
+}
+
+static int virtnet_get_coalesce(struct net_device *dev,
+				struct ethtool_coalesce *ec)
+{
+	struct ethtool_coalesce ec_default = {
+		.cmd = ETHTOOL_GCOALESCE,
+		.rx_max_coalesced_frames = 1,
+		.tx_max_coalesced_frames = 0,
+	};
+	struct virtnet_info *vi = netdev_priv(dev);
+
+	memcpy(ec, &ec_default, sizeof(ec_default));
+
+	if (vi->sq[0].napi.weight)
+		ec->tx_max_coalesced_frames = 1;
+
+	return 0;
+}
+
 static void virtnet_init_settings(struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
@@ -2219,6 +2279,8 @@ static const struct ethtool_ops virtnet_ethtool_ops = {
 	.get_ts_info = ethtool_op_get_ts_info,
 	.get_link_ksettings = virtnet_get_link_ksettings,
 	.set_link_ksettings = virtnet_set_link_ksettings,
+	.set_coalesce = virtnet_set_coalesce,
+	.get_coalesce = virtnet_get_coalesce,
 };
 
 static void virtnet_freeze_down(struct virtio_device *vdev)
-- 
2.17.1

^ permalink raw reply related

* [PATCH 2/3] virtio-gpu: add VIRTIO_GPU_F_EDID feature
From: Gerd Hoffmann @ 2018-09-13  8:12 UTC (permalink / raw)
  To: virtio-dev, dri-devel
  Cc: Michael S. Tsirkin, David Airlie, open list,
	open list:VIRTIO CORE, NET AND BLOCK DRIVERS
In-Reply-To: <20180913081259.27457-1-kraxel@redhat.com>

The feature allows the guest to request an EDID blob (describing monitor
capabilities) for a given scanout (aka virtual monitor connector).

It brings a new command message, which has just a scanout field (beside
the standard virtio-gpu header) and a response message which carries the
EDID data.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 include/uapi/linux/virtio_gpu.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/include/uapi/linux/virtio_gpu.h b/include/uapi/linux/virtio_gpu.h
index f43c3c6171..7267c3d2d6 100644
--- a/include/uapi/linux/virtio_gpu.h
+++ b/include/uapi/linux/virtio_gpu.h
@@ -41,6 +41,7 @@
 #include <linux/types.h>
 
 #define VIRTIO_GPU_F_VIRGL 0
+#define VIRTIO_GPU_F_EDID  1
 
 enum virtio_gpu_ctrl_type {
 	VIRTIO_GPU_UNDEFINED = 0,
@@ -56,6 +57,7 @@ enum virtio_gpu_ctrl_type {
 	VIRTIO_GPU_CMD_RESOURCE_DETACH_BACKING,
 	VIRTIO_GPU_CMD_GET_CAPSET_INFO,
 	VIRTIO_GPU_CMD_GET_CAPSET,
+	VIRTIO_GPU_CMD_GET_EDID,
 
 	/* 3d commands */
 	VIRTIO_GPU_CMD_CTX_CREATE = 0x0200,
@@ -76,6 +78,7 @@ enum virtio_gpu_ctrl_type {
 	VIRTIO_GPU_RESP_OK_DISPLAY_INFO,
 	VIRTIO_GPU_RESP_OK_CAPSET_INFO,
 	VIRTIO_GPU_RESP_OK_CAPSET,
+	VIRTIO_GPU_RESP_OK_EDID,
 
 	/* error responses */
 	VIRTIO_GPU_RESP_ERR_UNSPEC = 0x1200,
@@ -291,6 +294,20 @@ struct virtio_gpu_resp_capset {
 	__u8 capset_data[];
 };
 
+/* VIRTIO_GPU_CMD_GET_EDID */
+struct virtio_gpu_get_edid {
+	struct virtio_gpu_ctrl_hdr hdr;
+	__le32 scanout;
+};
+
+/* VIRTIO_GPU_RESP_OK_EDID */
+struct virtio_gpu_resp_edid {
+	struct virtio_gpu_ctrl_hdr hdr;
+	__le32 scanout;
+	__le32 size;
+	__u8 edid[512];
+};
+
 #define VIRTIO_GPU_EVENT_DISPLAY (1 << 0)
 
 struct virtio_gpu_config {
-- 
2.9.3

^ permalink raw reply related

* [PATCH 3/3] drm/virtio: add edid support
From: Gerd Hoffmann @ 2018-09-13  8:12 UTC (permalink / raw)
  To: virtio-dev, dri-devel
  Cc: David Airlie, open list, open list:VIRTIO GPU DRIVER
In-Reply-To: <20180913081259.27457-1-kraxel@redhat.com>

linux guest driver implementation of the VIRTIO_GPU_F_EDID feature.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 drivers/gpu/drm/virtio/virtgpu_drv.h     |  3 ++
 drivers/gpu/drm/virtio/virtgpu_display.c |  6 ++++
 drivers/gpu/drm/virtio/virtgpu_drv.c     |  1 +
 drivers/gpu/drm/virtio/virtgpu_kms.c     |  8 +++++
 drivers/gpu/drm/virtio/virtgpu_vq.c      | 59 ++++++++++++++++++++++++++++++++
 5 files changed, 77 insertions(+)

diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.h b/drivers/gpu/drm/virtio/virtgpu_drv.h
index 65605e207b..58358a48df 100644
--- a/drivers/gpu/drm/virtio/virtgpu_drv.h
+++ b/drivers/gpu/drm/virtio/virtgpu_drv.h
@@ -112,6 +112,7 @@ struct virtio_gpu_output {
 	struct drm_encoder enc;
 	struct virtio_gpu_display_one info;
 	struct virtio_gpu_update_cursor cursor;
+	struct edid *edid;
 	int cur_x;
 	int cur_y;
 };
@@ -194,6 +195,7 @@ struct virtio_gpu_device {
 	spinlock_t ctx_id_idr_lock;
 
 	bool has_virgl_3d;
+	bool has_edid;
 
 	struct work_struct config_changed_work;
 
@@ -287,6 +289,7 @@ int virtio_gpu_cmd_get_capset_info(struct virtio_gpu_device *vgdev, int idx);
 int virtio_gpu_cmd_get_capset(struct virtio_gpu_device *vgdev,
 			      int idx, int version,
 			      struct virtio_gpu_drv_cap_cache **cache_p);
+int virtio_gpu_cmd_get_edids(struct virtio_gpu_device *vgdev);
 void virtio_gpu_cmd_context_create(struct virtio_gpu_device *vgdev, uint32_t id,
 				   uint32_t nlen, const char *name);
 void virtio_gpu_cmd_context_destroy(struct virtio_gpu_device *vgdev,
diff --git a/drivers/gpu/drm/virtio/virtgpu_display.c b/drivers/gpu/drm/virtio/virtgpu_display.c
index 25503b9335..e573eab1f1 100644
--- a/drivers/gpu/drm/virtio/virtgpu_display.c
+++ b/drivers/gpu/drm/virtio/virtgpu_display.c
@@ -168,6 +168,12 @@ static int virtio_gpu_conn_get_modes(struct drm_connector *connector)
 	struct drm_display_mode *mode = NULL;
 	int count, width, height;
 
+	if (output->edid) {
+		count = drm_add_edid_modes(connector, output->edid);
+		if (count)
+			return count;
+	}
+
 	width  = le32_to_cpu(output->info.r.width);
 	height = le32_to_cpu(output->info.r.height);
 	count = drm_add_modes_noedid(connector, XRES_MAX, YRES_MAX);
diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.c b/drivers/gpu/drm/virtio/virtgpu_drv.c
index d9287c144f..f7f32a885a 100644
--- a/drivers/gpu/drm/virtio/virtgpu_drv.c
+++ b/drivers/gpu/drm/virtio/virtgpu_drv.c
@@ -80,6 +80,7 @@ static unsigned int features[] = {
 	 */
 	VIRTIO_GPU_F_VIRGL,
 #endif
+	VIRTIO_GPU_F_EDID,
 };
 static struct virtio_driver virtio_gpu_driver = {
 	.feature_table = features,
diff --git a/drivers/gpu/drm/virtio/virtgpu_kms.c b/drivers/gpu/drm/virtio/virtgpu_kms.c
index 65060c0852..f6cbbb27e4 100644
--- a/drivers/gpu/drm/virtio/virtgpu_kms.c
+++ b/drivers/gpu/drm/virtio/virtgpu_kms.c
@@ -44,6 +44,8 @@ static void virtio_gpu_config_changed_work_func(struct work_struct *work)
 	virtio_cread(vgdev->vdev, struct virtio_gpu_config,
 		     events_read, &events_read);
 	if (events_read & VIRTIO_GPU_EVENT_DISPLAY) {
+		if (vgdev->has_edid)
+			virtio_gpu_cmd_get_edids(vgdev);
 		virtio_gpu_cmd_get_display_info(vgdev);
 		drm_helper_hpd_irq_event(vgdev->ddev);
 		events_clear |= VIRTIO_GPU_EVENT_DISPLAY;
@@ -174,6 +176,10 @@ int virtio_gpu_driver_load(struct drm_device *dev, unsigned long flags)
 #else
 	DRM_INFO("virgl 3d acceleration not supported by guest\n");
 #endif
+	if (virtio_has_feature(vgdev->vdev, VIRTIO_GPU_F_EDID)) {
+		vgdev->has_edid = true;
+		DRM_INFO("EDID support available.\n");
+	}
 
 	ret = virtio_find_vqs(vgdev->vdev, 2, vqs, callbacks, names, NULL);
 	if (ret) {
@@ -219,6 +225,8 @@ int virtio_gpu_driver_load(struct drm_device *dev, unsigned long flags)
 
 	if (num_capsets)
 		virtio_gpu_get_capsets(vgdev, num_capsets);
+	if (vgdev->has_edid)
+		virtio_gpu_cmd_get_edids(vgdev);
 	virtio_gpu_cmd_get_display_info(vgdev);
 	wait_event_timeout(vgdev->resp_wq, !vgdev->display_info_pending,
 			   5 * HZ);
diff --git a/drivers/gpu/drm/virtio/virtgpu_vq.c b/drivers/gpu/drm/virtio/virtgpu_vq.c
index 020070d483..7d79211d1f 100644
--- a/drivers/gpu/drm/virtio/virtgpu_vq.c
+++ b/drivers/gpu/drm/virtio/virtgpu_vq.c
@@ -596,6 +596,37 @@ static void virtio_gpu_cmd_capset_cb(struct virtio_gpu_device *vgdev,
 	wake_up(&vgdev->resp_wq);
 }
 
+static void virtio_gpu_cmd_get_edid_cb(struct virtio_gpu_device *vgdev,
+				       struct virtio_gpu_vbuffer *vbuf)
+{
+	struct virtio_gpu_resp_edid *resp =
+		(struct virtio_gpu_resp_edid *)vbuf->resp_buf;
+	uint32_t scanout = le32_to_cpu(resp->scanout);
+	uint32_t size = le32_to_cpu(resp->size);
+	struct virtio_gpu_output *output;
+	struct edid *new_edid, *old_edid;
+
+	if (scanout >= vgdev->num_scanouts)
+		return;
+	output = vgdev->outputs + scanout;
+
+	if (drm_edid_is_valid((struct edid *)resp->edid)) {
+		new_edid = kmalloc(size, GFP_KERNEL);
+		memcpy(new_edid, resp->edid, size);
+	} else {
+		new_edid = NULL;
+	}
+
+	spin_lock(&vgdev->display_info_lock);
+	old_edid = output->edid;
+	output->edid = new_edid;
+	drm_connector_update_edid_property(&output->conn, output->edid);
+	spin_unlock(&vgdev->display_info_lock);
+
+	kfree(old_edid);
+	wake_up(&vgdev->resp_wq);
+}
+
 int virtio_gpu_cmd_get_display_info(struct virtio_gpu_device *vgdev)
 {
 	struct virtio_gpu_ctrl_hdr *cmd_p;
@@ -697,6 +728,34 @@ int virtio_gpu_cmd_get_capset(struct virtio_gpu_device *vgdev,
 	return 0;
 }
 
+int virtio_gpu_cmd_get_edids(struct virtio_gpu_device *vgdev)
+{
+	struct virtio_gpu_get_edid *cmd_p;
+	struct virtio_gpu_vbuffer *vbuf;
+	void *resp_buf;
+	int scanout;
+
+	if (WARN_ON(!vgdev->has_edid))
+		return -EINVAL;
+
+	for (scanout = 0; scanout < vgdev->num_scanouts; scanout++) {
+		resp_buf = kzalloc(sizeof(struct virtio_gpu_resp_edid),
+				   GFP_KERNEL);
+		if (!resp_buf)
+			return -ENOMEM;
+
+		cmd_p = virtio_gpu_alloc_cmd_resp
+			(vgdev, &virtio_gpu_cmd_get_edid_cb, &vbuf,
+			 sizeof(*cmd_p), sizeof(struct virtio_gpu_resp_edid),
+			 resp_buf);
+		cmd_p->hdr.type = cpu_to_le32(VIRTIO_GPU_CMD_GET_EDID);
+		cmd_p->scanout = cpu_to_le32(scanout);
+		virtio_gpu_queue_ctrl_buffer(vgdev, vbuf);
+	}
+
+	return 0;
+}
+
 void virtio_gpu_cmd_context_create(struct virtio_gpu_device *vgdev, uint32_t id,
 				   uint32_t nlen, const char *name)
 {
-- 
2.9.3

^ permalink raw reply related

* Re: [virtio-dev] Re: [PATCH net-next v2 0/5] virtio: support packed ring
From: Tiwei Bie @ 2018-09-13  8:59 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: virtio-dev, netdev, linux-kernel, virtualization, wexu
In-Reply-To: <20180912121457-mutt-send-email-mst@kernel.org>

On Wed, Sep 12, 2018 at 12:16:32PM -0400, Michael S. Tsirkin wrote:
> On Tue, Sep 11, 2018 at 01:37:26PM +0800, Tiwei Bie wrote:
> > On Mon, Sep 10, 2018 at 11:33:17AM +0800, Jason Wang wrote:
> > > On 2018年09月10日 11:00, Tiwei Bie wrote:
> > > > On Fri, Sep 07, 2018 at 09:00:49AM -0400, Michael S. Tsirkin wrote:
> > > > > On Fri, Sep 07, 2018 at 09:22:25AM +0800, Tiwei Bie wrote:
> > > > > > On Mon, Aug 27, 2018 at 05:00:40PM +0300, Michael S. Tsirkin wrote:
> > > > > > > Are there still plans to test the performance with vost pmd?
> > > > > > > vhost doesn't seem to show a performance gain ...
> > > > > > > 
> > > > > > I tried some performance tests with vhost PMD. In guest, the
> > > > > > XDP program will return XDP_DROP directly. And in host, testpmd
> > > > > > will do txonly fwd.
> > > > > > 
> > > > > > When burst size is 1 and packet size is 64 in testpmd and
> > > > > > testpmd needs to iterate 5 Tx queues (but only the first two
> > > > > > queues are enabled) to prepare and inject packets, I got ~12%
> > > > > > performance boost (5.7Mpps -> 6.4Mpps). And if the vhost PMD
> > > > > > is faster (e.g. just need to iterate the first two queues to
> > > > > > prepare and inject packets), then I got similar performance
> > > > > > for both rings (~9.9Mpps) (packed ring's performance can be
> > > > > > lower, because it's more complicated in driver.)
> > > > > > 
> > > > > > I think packed ring makes vhost PMD faster, but it doesn't make
> > > > > > the driver faster. In packed ring, the ring is simplified, and
> > > > > > the handling of the ring in vhost (device) is also simplified,
> > > > > > but things are not simplified in driver, e.g. although there is
> > > > > > no desc table in the virtqueue anymore, driver still needs to
> > > > > > maintain a private desc state table (which is still managed as
> > > > > > a list in this patch set) to support the out-of-order desc
> > > > > > processing in vhost (device).
> > > > > > 
> > > > > > I think this patch set is mainly to make the driver have a full
> > > > > > functional support for the packed ring, which makes it possible
> > > > > > to leverage the packed ring feature in vhost (device). But I'm
> > > > > > not sure whether there is any other better idea, I'd like to
> > > > > > hear your thoughts. Thanks!
> > > > > Just this: Jens seems to report a nice gain with virtio and
> > > > > vhost pmd across the board. Try to compare virtio and
> > > > > virtio pmd to see what does pmd do better?
> > > > The virtio PMD (drivers/net/virtio) in DPDK doesn't need to share
> > > > the virtio ring operation code with other drivers and is highly
> > > > optimized for network. E.g. in Rx, the Rx burst function won't
> > > > chain descs. So the ID management for the Rx ring can be quite
> > > > simple and straightforward, we just need to initialize these IDs
> > > > when initializing the ring and don't need to change these IDs
> > > > in data path anymore (the mergable Rx code in that patch set
> > > > assumes the descs will be written back in order, which should be
> > > > fixed. I.e., the ID in the desc should be used to index vq->descx[]).
> > > > The Tx code in that patch set also assumes the descs will be
> > > > written back by device in order, which should be fixed.
> > > 
> > > Yes it is. I think I've pointed it out in some early version of pmd patch.
> > > So I suspect part (or all) of the boost may come from in order feature.
> > > 
> > > > 
> > > > But in kernel virtio driver, the virtio_ring.c is very generic.
> > > > The enqueue (virtqueue_add()) and dequeue (virtqueue_get_buf_ctx())
> > > > functions need to support all the virtio devices and should be
> > > > able to handle all the possible cases that may happen. So although
> > > > the packed ring can be very efficient in some cases, currently
> > > > the room to optimize the performance in kernel's virtio_ring.c
> > > > isn't that much. If we want to take the fully advantage of the
> > > > packed ring's efficiency, we need some further e.g. API changes
> > > > in virtio_ring.c, which shouldn't be part of this patch set.
> > > 
> > > Could you please share more thoughts on this e.g how to improve the API?
> > > Notice since the API is shared by both split ring and packed ring, it may
> > > improve the performance of split ring as well. One can easily imagine a
> > > batching API, but it does not have many real users now, the only case is the
> > > XDP transmission which can accept an array of XDP frames.
> > 
> > I don't have detailed thoughts on this yet. But kernel's
> > virtio_ring.c is quite generic compared with what we did
> > in virtio PMD.
> 
> In what way? What are some things that aren't implemented there?

Below is the code corresponding to the virtqueue_add()
for Rx ring in virtio PMD:

https://github.com/DPDK/dpdk/blob/3605968c2fa7/drivers/net/virtio/virtio_rxtx.c#L278-L304

And below is the code of virtqueue_add() in Linux:

https://github.com/torvalds/linux/blob/54eda9df17f3/drivers/virtio/virtio_ring.c#L275-L417

In virtio PMD, for each packet (mbuf), the code is pretty
straightforward, it will just check whether there is one
available desc. If it's true, it will just fill this desc
directly.

But in virtqueue_add(), it's obvious that the logic is
much more complicated and generic. It's supposed to be
able to handle an sglist (which may consist of multiple IN
buffers and multiple OUT buffers at the same time), and
it will try to use indirect descriptors. It then needs
several loops to parse the sglist. That's why I said
it's quite generic.

> 
> If what you say is true then we should take a careful look
> and not supporting these generic things with packed layout.
> Once we do support them it will be too late and we won't
> be able to get performance back.

I think it's a good point that we don't need to support
everything in packed ring (especially those which would
hurt the performance), as the packed ring aims at high
performance. I'm also wondering about the features. Is
there any possibility that we won't support the out of
order processing (at least not by default) in packed ring?
If I didn't miss anything, the need to support out of order
processing in packed ring will make the data structure
inside the driver not cache friendly which is similar to
the case of the descriptor table in the split ring (the
difference is that, it only happens in driver now).


> 
> 
> 
> > > 
> > > > So
> > > > I still think this patch set is mainly to make the kernel virtio
> > > > driver to have a full functional support of the packed ring, and
> > > > we can't expect impressive performance boost with it.
> > > 
> > > We can only gain when virtio ring layout is the bottleneck. If there're
> > > bottlenecks elsewhere, we probably won't see any increasing in the numbers.
> > > Vhost-net is an example, and lots of optimizations have proved that virtio
> > > ring is not the main bottleneck for the current codes. I suspect it also the
> > > case of virtio driver. Did perf tell us any interesting things in virtio
> > > driver?
> > > 
> > > Thanks
> > > 
> > > > 
> > > > > 
> > > > > > > On Wed, Jul 11, 2018 at 10:27:06AM +0800, Tiwei Bie wrote:
> > > > > > > > Hello everyone,
> > > > > > > > 
> > > > > > > > This patch set implements packed ring support in virtio driver.
> > > > > > > > 
> > > > > > > > Some functional tests have been done with Jason's
> > > > > > > > packed ring implementation in vhost:
> > > > > > > > 
> > > > > > > > https://lkml.org/lkml/2018/7/3/33
> > > > > > > > 
> > > > > > > > Both of ping and netperf worked as expected.
> > > > > > > > 
> > > > > > > > v1 -> v2:
> > > > > > > > - Use READ_ONCE() to read event off_wrap and flags together (Jason);
> > > > > > > > - Add comments related to ccw (Jason);
> > > > > > > > 
> > > > > > > > RFC (v6) -> v1:
> > > > > > > > - Avoid extra virtio_wmb() in virtqueue_enable_cb_delayed_packed()
> > > > > > > >    when event idx is off (Jason);
> > > > > > > > - Fix bufs calculation in virtqueue_enable_cb_delayed_packed() (Jason);
> > > > > > > > - Test the state of the desc at used_idx instead of last_used_idx
> > > > > > > >    in virtqueue_enable_cb_delayed_packed() (Jason);
> > > > > > > > - Save wrap counter (as part of queue state) in the return value
> > > > > > > >    of virtqueue_enable_cb_prepare_packed();
> > > > > > > > - Refine the packed ring definitions in uapi;
> > > > > > > > - Rebase on the net-next tree;
> > > > > > > > 
> > > > > > > > RFC v5 -> RFC v6:
> > > > > > > > - Avoid tracking addr/len/flags when DMA API isn't used (MST/Jason);
> > > > > > > > - Define wrap counter as bool (Jason);
> > > > > > > > - Use ALIGN() in vring_init_packed() (Jason);
> > > > > > > > - Avoid using pointer to track `next` in detach_buf_packed() (Jason);
> > > > > > > > - Add comments for barriers (Jason);
> > > > > > > > - Don't enable RING_PACKED on ccw for now (noticed by Jason);
> > > > > > > > - Refine the memory barrier in virtqueue_poll();
> > > > > > > > - Add a missing memory barrier in virtqueue_enable_cb_delayed_packed();
> > > > > > > > - Remove the hacks in virtqueue_enable_cb_prepare_packed();
> > > > > > > > 
> > > > > > > > RFC v4 -> RFC v5:
> > > > > > > > - Save DMA addr, etc in desc state (Jason);
> > > > > > > > - Track used wrap counter;
> > > > > > > > 
> > > > > > > > RFC v3 -> RFC v4:
> > > > > > > > - Make ID allocation support out-of-order (Jason);
> > > > > > > > - Various fixes for EVENT_IDX support;
> > > > > > > > 
> > > > > > > > RFC v2 -> RFC v3:
> > > > > > > > - Split into small patches (Jason);
> > > > > > > > - Add helper virtqueue_use_indirect() (Jason);
> > > > > > > > - Just set id for the last descriptor of a list (Jason);
> > > > > > > > - Calculate the prev in virtqueue_add_packed() (Jason);
> > > > > > > > - Fix/improve desc suppression code (Jason/MST);
> > > > > > > > - Refine the code layout for XXX_split/packed and wrappers (MST);
> > > > > > > > - Fix the comments and API in uapi (MST);
> > > > > > > > - Remove the BUG_ON() for indirect (Jason);
> > > > > > > > - Some other refinements and bug fixes;
> > > > > > > > 
> > > > > > > > RFC v1 -> RFC v2:
> > > > > > > > - Add indirect descriptor support - compile test only;
> > > > > > > > - Add event suppression supprt - compile test only;
> > > > > > > > - Move vring_packed_init() out of uapi (Jason, MST);
> > > > > > > > - Merge two loops into one in virtqueue_add_packed() (Jason);
> > > > > > > > - Split vring_unmap_one() for packed ring and split ring (Jason);
> > > > > > > > - Avoid using '%' operator (Jason);
> > > > > > > > - Rename free_head -> next_avail_idx (Jason);
> > > > > > > > - Add comments for virtio_wmb() in virtqueue_add_packed() (Jason);
> > > > > > > > - Some other refinements and bug fixes;
> > > > > > > > 
> > > > > > > > Thanks!
> > > > > > > > 
> > > > > > > > Tiwei Bie (5):
> > > > > > > >    virtio: add packed ring definitions
> > > > > > > >    virtio_ring: support creating packed ring
> > > > > > > >    virtio_ring: add packed ring support
> > > > > > > >    virtio_ring: add event idx support in packed ring
> > > > > > > >    virtio_ring: enable packed ring
> > > > > > > > 
> > > > > > > >   drivers/s390/virtio/virtio_ccw.c   |   14 +
> > > > > > > >   drivers/virtio/virtio_ring.c       | 1365 ++++++++++++++++++++++------
> > > > > > > >   include/linux/virtio_ring.h        |    8 +-
> > > > > > > >   include/uapi/linux/virtio_config.h |    3 +
> > > > > > > >   include/uapi/linux/virtio_ring.h   |   43 +
> > > > > > > >   5 files changed, 1157 insertions(+), 276 deletions(-)
> > > > > > > > 
> > > > > > > > -- 
> > > > > > > > 2.18.0
> > > > > > > ---------------------------------------------------------------------
> > > > > > > To unsubscribe, e-mail: virtio-dev-unsubscribe@lists.oasis-open.org
> > > > > > > For additional commands, e-mail: virtio-dev-help@lists.oasis-open.org
> > > > > > > 
> > > 
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next V2] virtio_net: ethtool tx napi configuration
From: Jason Wang @ 2018-09-13  9:07 UTC (permalink / raw)
  To: mst; +Cc: netdev, Willem de Bruijn, davem, linux-kernel, virtualization
In-Reply-To: <20180913053545.18585-1-jasowang@redhat.com>



On 2018年09月13日 13:35, Jason Wang wrote:
> Implement ethtool .set_coalesce (-C) and .get_coalesce (-c) handlers.
> Interrupt moderation is currently not supported, so these accept and
> display the default settings of 0 usec and 1 frame.
>
> Toggle tx napi through a bit in tx-frames. So as to not interfere
> with possible future interrupt moderation, value 1 means tx napi while
> value 0 means not.
>
> To properly synchronize with the data path, tx napi is disabled and
> tx lock is held when changing the value of napi weight. And two more
> places that can access tx napi weight:
>
> - speculative tx polling in rx napi, we can leave it as is since it
>    not a must for correctness.
> - skb_xmit_done(), one more check of napi weight is added before
>    trying to enable tx to avoid tx to be disabled forever if napi is
>    disabled after skb_xmit_done() but before the napi
>
> Link: https://patchwork.ozlabs.org/patch/948149/
> Suggested-by: Jason Wang <jasowang@redhat.com>
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> Changes from V1:
> - try to synchronize with datapath to allow changing mode when
>    interface is up.
> - use tx-frames 0 as to disable tx napi while tx-frames 1 to enable tx napi
> ---
>   drivers/net/virtio_net.c | 64 +++++++++++++++++++++++++++++++++++++++-
>   1 file changed, 63 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 765920905226..6e70864f5899 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -66,6 +66,8 @@ DECLARE_EWMA(pkt_len, 0, 64)
>   
>   #define VIRTNET_DRIVER_VERSION "1.0.0"
>   
> +static const u32 ethtool_coalesce_napi_mask = (1UL << 10);
> +
>   static const unsigned long guest_offloads[] = {
>   	VIRTIO_NET_F_GUEST_TSO4,
>   	VIRTIO_NET_F_GUEST_TSO6,
> @@ -1444,7 +1446,10 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
>   
>   	virtqueue_napi_complete(napi, sq->vq, 0);
>   
> -	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
> +	/* Check napi.weight to avoid tx stall since it could be set
> +	 * to zero by ethtool after skb_xmit_done().
> +	 */
> +	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS || !sq->napi.weight)
>   		netif_tx_wake_queue(txq);
>   
>   	return 0;
> @@ -2181,6 +2186,61 @@ static int virtnet_get_link_ksettings(struct net_device *dev,
>   	return 0;
>   }
>   
> +static int virtnet_set_coalesce(struct net_device *dev,
> +				struct ethtool_coalesce *ec)
> +{
> +	struct ethtool_coalesce ec_default = {
> +		.cmd = ETHTOOL_SCOALESCE,
> +		.rx_max_coalesced_frames = 1,
> +	};
> +	struct virtnet_info *vi = netdev_priv(dev);
> +	int i, napi_weight;
> +
> +	if (ec->tx_max_coalesced_frames > 1)
> +		return -EINVAL;
> +
> +	ec_default.tx_max_coalesced_frames = ec->tx_max_coalesced_frames;
> +	napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
> +
> +	/* disallow changes to fields not explicitly tested above */
> +	if (memcmp(ec, &ec_default, sizeof(ec_default)))
> +		return -EINVAL;
> +
> +	if (napi_weight ^ vi->sq[0].napi.weight) {
> +		for (i = 0; i < vi->max_queue_pairs; i++) {
> +			struct netdev_queue *txq =
> +			       netdev_get_tx_queue(vi->dev, i);
> +
> +			virtnet_napi_tx_disable(&vi->sq[i].napi);
> +			__netif_tx_lock_bh(txq);

Need to check netif_running() before disabling napi. Otherwise we may 
have an infinite loop here.

The discussion is still ongoing; if we decide to go with set_coalesce, I will 
post a V3.

Thanks

> +			vi->sq[i].napi.weight = napi_weight;
> +			__netif_tx_unlock_bh(txq);
> +			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
> +					       &vi->sq[i].napi);
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static int virtnet_get_coalesce(struct net_device *dev,
> +				struct ethtool_coalesce *ec)
> +{
> +	struct ethtool_coalesce ec_default = {
> +		.cmd = ETHTOOL_GCOALESCE,
> +		.rx_max_coalesced_frames = 1,
> +		.tx_max_coalesced_frames = 0,
> +	};
> +	struct virtnet_info *vi = netdev_priv(dev);
> +
> +	memcpy(ec, &ec_default, sizeof(ec_default));
> +
> +	if (vi->sq[0].napi.weight)
> +		ec->tx_max_coalesced_frames = 1;
> +
> +	return 0;
> +}
> +
>   static void virtnet_init_settings(struct net_device *dev)
>   {
>   	struct virtnet_info *vi = netdev_priv(dev);
> @@ -2219,6 +2279,8 @@ static const struct ethtool_ops virtnet_ethtool_ops = {
>   	.get_ts_info = ethtool_op_get_ts_info,
>   	.get_link_ksettings = virtnet_get_link_ksettings,
>   	.set_link_ksettings = virtnet_set_link_ksettings,
> +	.set_coalesce = virtnet_set_coalesce,
> +	.get_coalesce = virtnet_get_coalesce,
>   };
>   
>   static void virtnet_freeze_down(struct virtio_device *vdev)

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [virtio-dev] Re: [PATCH net-next v2 0/5] virtio: support packed ring
From: Jason Wang @ 2018-09-13  9:47 UTC (permalink / raw)
  To: Tiwei Bie, Michael S. Tsirkin
  Cc: virtio-dev, netdev, linux-kernel, virtualization, wexu
In-Reply-To: <20180913085919.GA42049@fbsd1.sh.intel.com>

On 2018年09月13日 16:59, Tiwei Bie wrote:
>> If what you say is true then we should take a careful look
>> and not supporting these generic things with packed layout.
>> Once we do support them it will be too late and we won't
>> be able to get performance back.
> I think it's a good point that we don't need to support
> everything in packed ring (especially these which would
> hurt the performance), as the packed ring aims at high
> performance. I'm also wondering about the features. Is
> there any possibility that we won't support the out of
> order processing (at least not by default) in packed ring?
> If I didn't miss anything, the need to support out of order
> processing in packed ring will make the data structure
> inside the driver not cache friendly which is similar to
> the case of the descriptor table in the split ring (the
> difference is that, it only happens in driver now).

Out of order is not the only user; DMA is another one. We don't have
used ring(len), so we need to maintain the buffer length somewhere even for
an in-order device. But if it's not too late, I second an OUT_OF_ORDER
feature. Starting from in-order allows much simpler code in the driver.

Thanks

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next V2] virtio_net: ethtool tx napi configuration
From: Willem de Bruijn @ 2018-09-13 15:20 UTC (permalink / raw)
  To: Jason Wang
  Cc: Willem de Bruijn, Michael S. Tsirkin, Network Development, LKML,
	virtualization, David Miller
In-Reply-To: <20180913053545.18585-1-jasowang@redhat.com>

On Thu, Sep 13, 2018 at 1:40 AM Jason Wang <jasowang@redhat.com> wrote:
>
> Implement ethtool .set_coalesce (-C) and .get_coalesce (-c) handlers.
> Interrupt moderation is currently not supported, so these accept and
> display the default settings of 0 usec and 1 frame.
>
> Toggle tx napi through a bit in tx-frames. So as to not interfere
> with possible future interrupt moderation, value 1 means tx napi while
> value 0 means not.
>
> To properly synchronize with the data path, tx napi is disabled and
> tx lock is held when changing the value of napi weight. And two more
> places that can access tx napi weight:
>
> - speculative tx polling in rx napi, we can leave it as is since it
>   not a must for correctness.
> - skb_xmit_done(), one more check of napi weight is added before
>   trying to enable tx to avoid tx to be disabled forever if napi is
>   disabled after skb_xmit_done() but before the napi
>
> Link: https://patchwork.ozlabs.org/patch/948149/
> Suggested-by: Jason Wang <jasowang@redhat.com>
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> Changes from V1:
> - try to synchronize with datapath to allow changing mode when
>   interface is up.
> - use tx-frames 0 as to disable tx napi while tx-frames 1 to enable tx napi
> ---
>  drivers/net/virtio_net.c | 64 +++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 63 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 765920905226..6e70864f5899 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -66,6 +66,8 @@ DECLARE_EWMA(pkt_len, 0, 64)
>
>  #define VIRTNET_DRIVER_VERSION "1.0.0"
>
> +static const u32 ethtool_coalesce_napi_mask = (1UL << 10);
> +

This is no longer needed

>  static const unsigned long guest_offloads[] = {
>         VIRTIO_NET_F_GUEST_TSO4,
>         VIRTIO_NET_F_GUEST_TSO6,
> @@ -1444,7 +1446,10 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
>
>         virtqueue_napi_complete(napi, sq->vq, 0);
>
> -       if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
> +       /* Check napi.weight to avoid tx stall since it could be set
> +        * to zero by ethtool after skb_xmit_done().
> +        */
> +       if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS || !sq->napi.weight)
>                 netif_tx_wake_queue(txq);

I see. This assumes that the napi handler will always be called on
conversion from napi to no-napi mode.

That is safe to assume because if it isn't called (and will not call
netif_tx_wake_queue) that implies that napi was not scheduled, and
thus the tx interrupt was not suppressed and thus there was no tx
completion work to be scheduled?

>
>         return 0;
> @@ -2181,6 +2186,61 @@ static int virtnet_get_link_ksettings(struct net_device *dev,
>         return 0;
>  }
>
> +static int virtnet_set_coalesce(struct net_device *dev,
> +                               struct ethtool_coalesce *ec)
> +{
> +       struct ethtool_coalesce ec_default = {
> +               .cmd = ETHTOOL_SCOALESCE,
> +               .rx_max_coalesced_frames = 1,
> +       };
> +       struct virtnet_info *vi = netdev_priv(dev);
> +       int i, napi_weight;
> +
> +       if (ec->tx_max_coalesced_frames > 1)
> +               return -EINVAL;
> +
> +       ec_default.tx_max_coalesced_frames = ec->tx_max_coalesced_frames;
> +       napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
> +
> +       /* disallow changes to fields not explicitly tested above */
> +       if (memcmp(ec, &ec_default, sizeof(ec_default)))
> +               return -EINVAL;
> +
> +       if (napi_weight ^ vi->sq[0].napi.weight) {
> +               for (i = 0; i < vi->max_queue_pairs; i++) {
> +                       struct netdev_queue *txq =
> +                              netdev_get_tx_queue(vi->dev, i);
> +
> +                       virtnet_napi_tx_disable(&vi->sq[i].napi);
> +                       __netif_tx_lock_bh(txq);
> +                       vi->sq[i].napi.weight = napi_weight;
> +                       __netif_tx_unlock_bh(txq);
> +                       virtnet_napi_tx_enable(vi, vi->sq[i].vq,
> +                                              &vi->sq[i].napi);
> +               }
> +       }
> +
> +       return 0;
> +}
> +
> +static int virtnet_get_coalesce(struct net_device *dev,
> +                               struct ethtool_coalesce *ec)
> +{
> +       struct ethtool_coalesce ec_default = {
> +               .cmd = ETHTOOL_GCOALESCE,
> +               .rx_max_coalesced_frames = 1,
> +               .tx_max_coalesced_frames = 0,

no need to explicitly initialize to 0 (unless you did this for
documentation purposes, which is fine).

> +       };
> +       struct virtnet_info *vi = netdev_priv(dev);
> +
> +       memcpy(ec, &ec_default, sizeof(ec_default));
> +
> +       if (vi->sq[0].napi.weight)
> +               ec->tx_max_coalesced_frames = 1;
> +
> +       return 0;
> +}
> +
>  static void virtnet_init_settings(struct net_device *dev)
>  {
>         struct virtnet_info *vi = netdev_priv(dev);
> @@ -2219,6 +2279,8 @@ static const struct ethtool_ops virtnet_ethtool_ops = {
>         .get_ts_info = ethtool_op_get_ts_info,
>         .get_link_ksettings = virtnet_get_link_ksettings,
>         .set_link_ksettings = virtnet_set_link_ksettings,
> +       .set_coalesce = virtnet_set_coalesce,
> +       .get_coalesce = virtnet_get_coalesce,
>  };
>
>  static void virtnet_freeze_down(struct virtio_device *vdev)
> --
> 2.17.1
>

^ permalink raw reply

* Re: [PATCH net-next V2 00/11] vhost_net TX batching
From: David Miller @ 2018-09-13 16:28 UTC (permalink / raw)
  To: jasowang; +Cc: netdev, mst, linux-kernel, kvm, virtualization
In-Reply-To: <20180912031709.14112-1-jasowang@redhat.com>

From: Jason Wang <jasowang@redhat.com>
Date: Wed, 12 Sep 2018 11:16:58 +0800

> This series tries to batch submitting packets to underlayer socket
> through msg_control during sendmsg(). This is done by:
 ...

Series applied, thanks Jason.

^ permalink raw reply

* Re: [PATCH net-next V2 00/11] vhost_net TX batching
From: Michael S. Tsirkin @ 2018-09-13 18:02 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180913.092819.2099750169529223026.davem@davemloft.net>

On Thu, Sep 13, 2018 at 09:28:19AM -0700, David Miller wrote:
> From: Jason Wang <jasowang@redhat.com>
> Date: Wed, 12 Sep 2018 11:16:58 +0800
> 
> > This series tries to batch submitting packets to underlayer socket
> > through msg_control during sendmsg(). This is done by:
>  ...
> 
> Series applied, thanks Jason.

Going over it now I don't see a lot to complain about, but I'd
appreciate a bit more time for review in the future. 3 days ok?

-- 
MST

^ permalink raw reply

* Re: [PATCH net-next V2 11/11] vhost_net: batch submitting XDP buffers to underlayer sockets
From: Michael S. Tsirkin @ 2018-09-13 18:04 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180912031709.14112-12-jasowang@redhat.com>

On Wed, Sep 12, 2018 at 11:17:09AM +0800, Jason Wang wrote:
> +static void vhost_tx_batch(struct vhost_net *net,
> +			   struct vhost_net_virtqueue *nvq,
> +			   struct socket *sock,
> +			   struct msghdr *msghdr)
> +{
> +	struct tun_msg_ctl ctl = {
> +		.type = TUN_MSG_PTR,
> +		.num = nvq->batched_xdp,
> +		.ptr = nvq->xdp,
> +	};
> +	int err;
> +
> +	if (nvq->batched_xdp == 0)
> +		goto signal_used;
> +
> +	msghdr->msg_control = &ctl;
> +	err = sock->ops->sendmsg(sock, msghdr, 0);
> +	if (unlikely(err < 0)) {
> +		vq_err(&nvq->vq, "Fail to batch sending packets\n");
> +		return;
> +	}
> +
> +signal_used:
> +	vhost_net_signal_used(nvq);
> +	nvq->batched_xdp = 0;
> +}
> +

Given it's all tun-specific now, how about just exporting tap_sendmsg
and calling that? Will get rid of some indirection too.

-- 
MST

^ permalink raw reply

* Re: [PATCH net-next V2 00/11] vhost_net TX batching
From: David Miller @ 2018-09-13 18:05 UTC (permalink / raw)
  To: mst; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180913135833-mutt-send-email-mst@kernel.org>

From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 13 Sep 2018 14:02:13 -0400

> On Thu, Sep 13, 2018 at 09:28:19AM -0700, David Miller wrote:
>> From: Jason Wang <jasowang@redhat.com>
>> Date: Wed, 12 Sep 2018 11:16:58 +0800
>> 
>> > This series tries to batch submitting packets to underlayer socket
>> > through msg_control during sendmsg(). This is done by:
>>  ...
>> 
>> Series applied, thanks Jason.
> 
> Going over it now I don't see a lot to complain about, but I'd
> appreciate a bit more time for review in the future. 3 days ok?

I try to handle every patchwork entry within 48 hours, more
specifically 2 business days.

Given the rate at which patches flow through networking, I think
this is both reasonable and necessary.

One always has the option to send a quick reply to the list saying
"Please wait for my review, I'll get to it in the next day or two."
which I will certainly honor.

Thank you.

^ permalink raw reply

* Re: [PATCH net-next V2] virtio_net: ethtool tx napi configuration
From: David Miller @ 2018-09-13 22:52 UTC (permalink / raw)
  To: jasowang; +Cc: netdev, willemb, virtualization, linux-kernel, mst
In-Reply-To: <20180913053545.18585-1-jasowang@redhat.com>

From: Jason Wang <jasowang@redhat.com>
Date: Thu, 13 Sep 2018 13:35:45 +0800

> Toggle tx napi through a bit in tx-frames.

This is not what the code implements as the interface any more.

Please fix the commit message to match the code.

Thanks.

^ permalink raw reply

* Re: [PATCH net-next V2 11/11] vhost_net: batch submitting XDP buffers to underlayer sockets
From: Jason Wang @ 2018-09-14  3:11 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180913140332-mutt-send-email-mst@kernel.org>



On 2018年09月14日 02:04, Michael S. Tsirkin wrote:
> On Wed, Sep 12, 2018 at 11:17:09AM +0800, Jason Wang wrote:
>> +static void vhost_tx_batch(struct vhost_net *net,
>> +			   struct vhost_net_virtqueue *nvq,
>> +			   struct socket *sock,
>> +			   struct msghdr *msghdr)
>> +{
>> +	struct tun_msg_ctl ctl = {
>> +		.type = TUN_MSG_PTR,
>> +		.num = nvq->batched_xdp,
>> +		.ptr = nvq->xdp,
>> +	};
>> +	int err;
>> +
>> +	if (nvq->batched_xdp == 0)
>> +		goto signal_used;
>> +
>> +	msghdr->msg_control = &ctl;
>> +	err = sock->ops->sendmsg(sock, msghdr, 0);
>> +	if (unlikely(err < 0)) {
>> +		vq_err(&nvq->vq, "Fail to batch sending packets\n");
>> +		return;
>> +	}
>> +
>> +signal_used:
>> +	vhost_net_signal_used(nvq);
>> +	nvq->batched_xdp = 0;
>> +}
>> +
> Given it's all tun-specific now, how about just exporting tap_sendmsg
> and calling that? Will get rid of some indirection too.
>

Yes we can do it on top in the future.

Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next V2] virtio_net: ethtool tx napi configuration
From: Jason Wang @ 2018-09-14  3:29 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, willemb, virtualization, linux-kernel, mst
In-Reply-To: <20180913.155253.713713547323242272.davem@davemloft.net>



On 2018年09月14日 06:52, David Miller wrote:
> From: Jason Wang <jasowang@redhat.com>
> Date: Thu, 13 Sep 2018 13:35:45 +0800
>
>> Toggle tx napi through a bit in tx-frames.
> This is not what the code implements as the interface any more.
>
> Please fix the commit message to match the code.
>
> Thanks.

Will fix this.

Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: Sources of initialized memory in virtio?
From: Jason Wang @ 2018-09-14  3:50 UTC (permalink / raw)
  To: Alexander Potapenko, stefanha, Michael S. Tsirkin, kraxel
  Cc: kvm, virtualization
In-Reply-To: <CAG_fn=URm9Bv0sTy6Ex8ohrLVStaxoZtwH_BOEao6sqGCzQMiA@mail.gmail.com>



On 2018年09月13日 21:00, Alexander Potapenko wrote:
> Hi mighty virtio maintainers,
>
> I'm working on KMSAN, a new runtime detector of uninitialized memory
> based on compiler instrumentation (https://github.com/google/kmsan)
> KMSAN is mostly being tested on QEMU with KVM enabled, so my kernel
> interacts a lot with various virtio drivers, that's why I'm seeking
> your help.
>
> By default KMSAN treats kernel memory allocated by kmalloc() and
> alloc_page() as uninitialized. Writing a constant to memory or using
> it in copy_from_user() makes that memory initialized.
> Unfortunately a lot of writes to memory from KVM (mostly in the disk
> and network drivers) remain unnoticed by the tool, therefore we're
> seeing a lot of false positive reports (along with actual bugs, like
> CVE-2018-1118).
>
> KMSAN has an API function `kmsan_unpoison_shadow(void *buf, int len)`,
> which means "from now on, till this memory is freed or written to,
> mark it as initialized".
> I've tried playing Whack-a-Mole adding it to various places where the
> data comes from KVM, but failed to find them all. In fact, some of my
> annotations were wrong, so I ended up with the following two patches:
>
> https://github.com/google/kmsan/commit/76c671199a4de5bbe73cd13210a5e28848211bd1
> https://github.com/google/kmsan/commit/40ba1c8e2a3c6bbe8f34037413e253894251a405
>
> But I'm far from being sure this is the complete list of places where
> the memory is initialized by virtio drivers.
> May I ask you to help me find the places where we actually need to
> annotate the memory in virtio?

It looks to me like another one is the used ring, to which the device
writes back the completed descriptor id and length. It (vr->used) is part
of a page which is allocated in vring_alloc_queue() through
alloc_pages_exact() with __GFP_ZERO. So I'm not sure we need to care about it.

Thanks

>
> Thanks in advance,

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next V2] virtio_net: ethtool tx napi configuration
From: Jason Wang @ 2018-09-14  8:24 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Willem de Bruijn, Michael S. Tsirkin, Network Development, LKML,
	virtualization, David Miller
In-Reply-To: <CAF=yD-LSgCEnL_kTH44A8-FH3nut3y=bonf4Bgf75=jd=D64hw@mail.gmail.com>



On 2018年09月13日 23:20, Willem de Bruijn wrote:
> On Thu, Sep 13, 2018 at 1:40 AM Jason Wang <jasowang@redhat.com> wrote:
>> Implement ethtool .set_coalesce (-C) and .get_coalesce (-c) handlers.
>> Interrupt moderation is currently not supported, so these accept and
>> display the default settings of 0 usec and 1 frame.
>>
>> Toggle tx napi through a bit in tx-frames. So as to not interfere
>> with possible future interrupt moderation, value 1 means tx napi while
>> value 0 means not.
>>
>> To properly synchronize with the data path, tx napi is disabled and
>> tx lock is held when changing the value of napi weight. And two more
>> places that can access tx napi weight:
>>
>> - speculative tx polling in rx napi, we can leave it as is since it
>>    not a must for correctness.
>> - skb_xmit_done(), one more check of napi weight is added before
>>    trying to enable tx to avoid tx to be disabled forever if napi is
>>    disabled after skb_xmit_done() but before the napi
>>
>> Link: https://patchwork.ozlabs.org/patch/948149/
>> Suggested-by: Jason Wang <jasowang@redhat.com>
>> Signed-off-by: Willem de Bruijn <willemb@google.com>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>> Changes from V1:
>> - try to synchronize with datapath to allow changing mode when
>>    interface is up.
>> - use tx-frames 0 as to disable tx napi while tx-frames 1 to enable tx napi
>> ---
>>   drivers/net/virtio_net.c | 64 +++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 63 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index 765920905226..6e70864f5899 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -66,6 +66,8 @@ DECLARE_EWMA(pkt_len, 0, 64)
>>
>>   #define VIRTNET_DRIVER_VERSION "1.0.0"
>>
>> +static const u32 ethtool_coalesce_napi_mask = (1UL << 10);
>> +
> This is no longer needed

Yes, will remove this.

>
>>   static const unsigned long guest_offloads[] = {
>>          VIRTIO_NET_F_GUEST_TSO4,
>>          VIRTIO_NET_F_GUEST_TSO6,
>> @@ -1444,7 +1446,10 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
>>
>>          virtqueue_napi_complete(napi, sq->vq, 0);
>>
>> -       if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
>> +       /* Check napi.weight to avoid tx stall since it could be set
>> +        * to zero by ethtool after skb_xmit_done().
>> +        */
>> +       if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS || !sq->napi.weight)
>>                  netif_tx_wake_queue(txq);
> I see. This assumes that the napi handler will always be called on
> conversion from napi to no-napi mode.
>
> That is safe to assume because if it isn't called (and will not call
> netif_tx_wake_queue) that implies that napi was not scheduled, and
> thus the tx interrupt was not suppressed and thus there was no tx
> completion work to be scheduled?

If it isn't called, it means skb_xmit_done() woke up tx directly instead
of scheduling tx. This could be a little bit early since there may
still be lots of pending tx packets. But it does no harm: start_xmit() can
handle this by re-enabling a delayed tx interrupt and disabling TX.

But there's a bug: it looks like I need to remove the check of
(!sq->napi.weight) at the beginning of the function.

>
>>          return 0;
>> @@ -2181,6 +2186,61 @@ static int virtnet_get_link_ksettings(struct net_device *dev,
>>          return 0;
>>   }
>>
>> +static int virtnet_set_coalesce(struct net_device *dev,
>> +                               struct ethtool_coalesce *ec)
>> +{
>> +       struct ethtool_coalesce ec_default = {
>> +               .cmd = ETHTOOL_SCOALESCE,
>> +               .rx_max_coalesced_frames = 1,
>> +       };
>> +       struct virtnet_info *vi = netdev_priv(dev);
>> +       int i, napi_weight;
>> +
>> +       if (ec->tx_max_coalesced_frames > 1)
>> +               return -EINVAL;
>> +
>> +       ec_default.tx_max_coalesced_frames = ec->tx_max_coalesced_frames;
>> +       napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
>> +
>> +       /* disallow changes to fields not explicitly tested above */
>> +       if (memcmp(ec, &ec_default, sizeof(ec_default)))
>> +               return -EINVAL;
>> +
>> +       if (napi_weight ^ vi->sq[0].napi.weight) {
>> +               for (i = 0; i < vi->max_queue_pairs; i++) {
>> +                       struct netdev_queue *txq =
>> +                              netdev_get_tx_queue(vi->dev, i);
>> +
>> +                       virtnet_napi_tx_disable(&vi->sq[i].napi);
>> +                       __netif_tx_lock_bh(txq);
>> +                       vi->sq[i].napi.weight = napi_weight;
>> +                       __netif_tx_unlock_bh(txq);
>> +                       virtnet_napi_tx_enable(vi, vi->sq[i].vq,
>> +                                              &vi->sq[i].napi);
>> +               }
>> +       }
>> +
>> +       return 0;
>> +}
>> +
>> +static int virtnet_get_coalesce(struct net_device *dev,
>> +                               struct ethtool_coalesce *ec)
>> +{
>> +       struct ethtool_coalesce ec_default = {
>> +               .cmd = ETHTOOL_GCOALESCE,
>> +               .rx_max_coalesced_frames = 1,
>> +               .tx_max_coalesced_frames = 0,
> no need to explicitly initialize to 0 (unless you did this for
> documentation purposes, which is fine).

Yes.

Thanks

>> +       };
>> +       struct virtnet_info *vi = netdev_priv(dev);
>> +
>> +       memcpy(ec, &ec_default, sizeof(ec_default));
>> +
>> +       if (vi->sq[0].napi.weight)
>> +               ec->tx_max_coalesced_frames = 1;
>> +
>> +       return 0;
>> +}
>> +
>>   static void virtnet_init_settings(struct net_device *dev)
>>   {
>>          struct virtnet_info *vi = netdev_priv(dev);
>> @@ -2219,6 +2279,8 @@ static const struct ethtool_ops virtnet_ethtool_ops = {
>>          .get_ts_info = ethtool_op_get_ts_info,
>>          .get_link_ksettings = virtnet_get_link_ksettings,
>>          .set_link_ksettings = virtnet_set_link_ksettings,
>> +       .set_coalesce = virtnet_set_coalesce,
>> +       .get_coalesce = virtnet_get_coalesce,
>>   };
>>
>>   static void virtnet_freeze_down(struct virtio_device *vdev)
>> --
>> 2.17.1
>>

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* ICITS'19 - Quito, Ecuador | Extended deadline: September 23
From: Marle @ 2018-09-14 11:13 UTC (permalink / raw)
  To: virtualization

[-- Attachment #1.1: Type: text/plain, Size: 6043 bytes --]

* Proceedings by Springer, indexed in Scopus, ISI, etc.

***********************************************************

ICITS'19 - The 2019 International Conference on Information Technology & Systems

Quito, Ecuador, 6 - 8 February 2019

http://www.icits.me <http://www.icits.me/>

********************************************************************************

ICITS'19 - The 2019 International Conference on Information Technology & Systems, to be held at Quito, Ecuador, 6 - 8 February 2019, is an international forum for researchers and practitioners to present and discuss the most recent innovations, trends, results, experiences and concerns in the several perspectives of Information Technology & Systems.

We are pleased to invite you to submit your papers to ICITS'19. They can be written in English, Spanish or Portuguese. All submissions will be reviewed on the basis of relevance, originality, importance and clarity.

Topics

Submitted papers should be related with one or more of the main themes proposed for the Conference:

A) Information and Knowledge Management (IKM);

B) Organizational Models and Information Systems (OMIS);

C) Software and Systems Modeling (SSM);

D) Software Systems, Architectures, Applications and Tools (SSAAT);

E) Multimedia Systems and Applications (MSA);

F) Computer Networks, Mobility and Pervasive Systems (CNMPS);

G) Intelligent and Decision Support Systems (IDSS);

H) Big Data Analytics and Applications (BDAA);

I) Human-Computer Interaction (HCI);

J) Ethics, Computers and Security (ECS)

K) Health Informatics (HIS);

L) Information Technologies in Education (ITE);

M) Cybersecurity and Cyber-defense;

N) Electromagnetics, Sensors and Antennas for Security.

Submission and Decision

Submitted papers written in English (until 10-page limit) must comply with the format of Advances in Intelligent Systems and Computing series (see Instructions for Authors at Springer Website <http://www.springer.com/series/11156> or download a DOC example <http://www.icits.me/springerformat.doc>), must not have been published before, not be under review for any other conference or publication and not include any information leading to the authors’ identification. Therefore, the authors’ names, affiliations and bibliographic references should not be included in the version for evaluation by the Scientific Committee. This information should only be included in the camera-ready version, saved in Word or Latex format and also in PDF format. These files must be accompanied by the Consent to Publish form <http://www.icits.me/copyright.pdf> filled out, in a ZIP file, and uploaded at the conference management system.

Submitted papers written in Spanish or Portuguese (until 15-page limit) must comply with the format of RISTI <http://www.risti.xyz/> - Revista Ibérica de Sistemas e Tecnologias de Informação (download instructions/template for authors in Spanish <http://www.risti.xyz/formato-es.doc> or Portuguese <http://www.risti.xyz/formato-pt.doc>), must not have been published before, not be under review for any other conference or publication and not include any information leading to the authors’ identification. Therefore, the authors’ names, affiliations and bibliographic references should not be included in the version for evaluation by the Scientific Committee. This information should only be included in the camera-ready version, saved in Word. These file must be uploaded at the conference management system in a ZIP file.

All papers will be subjected to a “double-blind review” by at least two members of the Scientific Committee.

Based on Scientific Committee evaluation, a paper can be rejected or accepted by the Conference Chairs. In the latter case, it can be accepted as paper or poster.

The authors of papers accepted as posters must build and print a poster to be exhibited during the Conference. This poster must follow an A1 or A2 vertical format. The Conference can include Work Sessions where these posters are presented and orally discussed, with a 7 minute limit per poster.

The authors of accepted papers will have 15 minutes to present their work in a Conference Work Session; approximately 5 minutes of discussion will follow each presentation.

Publication and Indexing

To ensure that an accepted paper is published, at least one of the authors must be fully registered by the 9th of November 2018, and the paper must comply with the suggested layout and page-limit. Additionally, all recommended changes must be addressed by the authors before they submit the camera-ready version.

No more than one paper per registration will be published. An extra fee must be paid for publication of additional papers, with a maximum of one additional paper per registration. One registration permits only the participation of one author in the conference.

Papers written in English and accepted and registered will be published in Proceedings by Springer, in a book of the Advances in Intelligent Systems and Computing <http://www.springer.com/series/11156>series, will  be submitted for indexation by ISI, EI-Compendex, SCOPUS and DBLP, among others, and will be available in the SpringerLink Digital Library <http://link.springer.com/>.

Papers written in Spanish or Portuguese and accepted and registered will be published in a Special Issue of RISTI <http://www.risti.xyz/index.php?option=com_content&view=article&id=3&Itemid=104&lang=es> and will be submitted for indexation by SCOPUS, among others.

Important Dates

Paper Submission: September 23, 2018 (extended from September 16)

Notification of Acceptance: October 21, 2018

Payment of Registration, to ensure the inclusion of an accepted paper in the conference proceedings: November 9, 2018.

Camera-ready Submission: November 9, 2018

ICITS'19 website: http://www.icits.me <http://www.icits.me/>

---
Este e-mail foi verificado em termos de vírus pelo AVG.
http://www.avg.com

[-- Attachment #1.2: Type: text/html, Size: 8378 bytes --]

[-- Attachment #2: Type: text/plain, Size: 183 bytes --]

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* [patch 00/11] x86/vdso: Cleanups, simmplifications and CLOCK_TAI support
From: Thomas Gleixner @ 2018-09-14 12:50 UTC (permalink / raw)
  To: LKML
  Cc: Florian Weimer, Juergen Gross, Arnd Bergmann, Peter Zijlstra, x86,
	virtualization, Stephen Boyd, John Stultz, Andy Lutomirski,
	Paolo Bonzini, devel, Matt Rickard

Matt attempted to add CLOCK_TAI support to the VDSO clock_gettime()
implementation, which extended the clockid switch case and added yet
another slightly different copy of the same code.

Especially the extended switch case is problematic as the compiler tends to
generate a jump table which then requires the use of retpolines. If jump
tables are disabled it adds yet another conditional to the existing maze.

This series takes a different approach by consolidating the almost
identical functions into one implementation for high resolution clocks and
one for the coarse grained clock ids by storing the base data for each
clock id in an array which is indexed by the clock id.

This completely eliminates the switch case and allows further
simplifications of the code base, which all together gain a few cycles
of performance or at least stay on par with today's code. The resulting
performance depends heavily on the micro architecture and the compiler.

Thanks,

	tglx

8<-------------------
 arch/x86/Kconfig                        |    1 
 arch/x86/entry/vdso/vclock_gettime.c    |  199 ++++++++------------------------
 arch/x86/entry/vsyscall/vsyscall_gtod.c |   55 ++++----
 arch/x86/include/asm/vgtod.h            |   46 ++++---
 arch/x86/kernel/time.c                  |   22 +++
 include/linux/clocksource.h             |    5 
 kernel/time/Kconfig                     |    4 
 kernel/time/clocksource.c               |    2 
 8 files changed, 144 insertions(+), 190 deletions(-)

^ permalink raw reply

* [patch 01/11] clocksource: Provide clocksource_arch_init()
From: Thomas Gleixner @ 2018-09-14 12:50 UTC (permalink / raw)
  To: LKML
  Cc: Florian Weimer, Juergen Gross, Arnd Bergmann, Peter Zijlstra, x86,
	virtualization, Stephen Boyd, John Stultz, Andy Lutomirski,
	Paolo Bonzini, devel, Matt Rickard
In-Reply-To: <20180914125006.349747096@linutronix.de>

[-- Attachment #1: clocksource--Provide-clocksource_arch_init--.patch --]
[-- Type: text/plain, Size: 1492 bytes --]

Architectures have extra archdata in the clocksource, e.g. for VDSO
support. There are no sanity checks or general initializations for this
available. Add support for that.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/clocksource.h |    5 +++++
 kernel/time/Kconfig         |    4 ++++
 kernel/time/clocksource.c   |    2 ++
 3 files changed, 11 insertions(+)

--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -241,6 +241,11 @@ static inline void __clocksource_update_
 	__clocksource_update_freq_scale(cs, 1000, khz);
 }
 
+#ifdef CONFIG_ARCH_CLOCKSOURCE_INIT
+extern void clocksource_arch_init(struct clocksource *cs);
+#else
+static inline void clocksource_arch_init(struct clocksource *cs) { }
+#endif
 
 extern int timekeeping_notify(struct clocksource *clock);
 
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -12,6 +12,10 @@ config CLOCKSOURCE_WATCHDOG
 config ARCH_CLOCKSOURCE_DATA
 	bool
 
+# Architecture has extra clocksource init called from registration
+config ARCH_CLOCKSOURCE_INIT
+	bool
+
 # Clocksources require validation of the clocksource against the last
 # cycle update - x86/TSC misfeature
 config CLOCKSOURCE_VALIDATE_LAST_CYCLE
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -937,6 +937,8 @@ int __clocksource_register_scale(struct
 {
 	unsigned long flags;
 
+	clocksource_arch_init(cs);
+
 	/* Initialize mult/shift and max_idle_ns */
 	__clocksource_update_freq_scale(cs, scale, freq);

^ permalink raw reply

* [patch 02/11] x86/time: Implement clocksource_arch_init()
From: Thomas Gleixner @ 2018-09-14 12:50 UTC (permalink / raw)
  To: LKML
  Cc: Florian Weimer, Juergen Gross, Arnd Bergmann, Peter Zijlstra, x86,
	virtualization, Stephen Boyd, John Stultz, Andy Lutomirski,
	Paolo Bonzini, devel, Matt Rickard
In-Reply-To: <20180914125006.349747096@linutronix.de>

[-- Attachment #1: x86-time--Implement-clocksource_arch_init--.patch --]
[-- Type: text/plain, Size: 1337 bytes --]

Runtime validate the VCLOCK_MODE in clocksource::archdata and disable
VCLOCK if invalid, which disables the VDSO but keeps the system running.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/Kconfig       |    1 +
 arch/x86/kernel/time.c |   16 ++++++++++++++++
 2 files changed, 17 insertions(+)

--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -48,6 +48,7 @@ config X86
 	select ACPI_SYSTEM_POWER_STATES_SUPPORT	if ACPI
 	select ANON_INODES
 	select ARCH_CLOCKSOURCE_DATA
+	select ARCH_CLOCKSOURCE_INIT
 	select ARCH_DISCARD_MEMBLOCK
 	select ARCH_HAS_ACPI_TABLE_UPGRADE	if ACPI
 	select ARCH_HAS_DEBUG_VIRTUAL
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -10,6 +10,7 @@
  *
  */
 
+#include <linux/clocksource.h>
 #include <linux/clockchips.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
@@ -105,3 +106,18 @@ void __init time_init(void)
 {
 	late_time_init = x86_late_time_init;
 }
+
+/*
+ * Sanity check the vdso related archdata content.
+ */
+void clocksource_arch_init(struct clocksource *cs)
+{
+	if (cs->archdata.vclock_mode == VCLOCK_NONE)
+		return;
+
+	if (cs->archdata.vclock_mode >= VCLOCK_MAX) {
+		pr_warn("clocksource %s registered with invalid vclock_mode %d. Disabling vclock.\n",
+			cs->name, cs->archdata.vclock_mode);
+		cs->archdata.vclock_mode = VCLOCK_NONE;
+	}
+}

^ permalink raw reply

* [patch 03/11] x86/vdso: Enforce 64bit clocksource
From: Thomas Gleixner @ 2018-09-14 12:50 UTC (permalink / raw)
  To: LKML
  Cc: Florian Weimer, Juergen Gross, Arnd Bergmann, Peter Zijlstra, x86,
	virtualization, Stephen Boyd, John Stultz, Andy Lutomirski,
	Paolo Bonzini, devel, Matt Rickard
In-Reply-To: <20180914125006.349747096@linutronix.de>

[-- Attachment #1: x86-vdso--Enforce-64bit-clocksource.patch --]
[-- Type: text/plain, Size: 1074 bytes --]

All VDSO clock sources are TSC based and use CLOCKSOURCE_MASK(64). There is
no point in masking with all FF. Get rid of it and enforce the mask in the
sanity checker.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/entry/vdso/vclock_gettime.c |    2 +-
 arch/x86/kernel/time.c               |    6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -199,7 +199,7 @@ notrace static inline u64 vgetsns(int *m
 #endif
 	else
 		return 0;
-	v = (cycles - gtod->cycle_last) & gtod->mask;
+	v = cycles - gtod->cycle_last;
 	return v * gtod->mult;
 }
 
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -120,4 +120,10 @@ void clocksource_arch_init(struct clocks
 			cs->name, cs->archdata.vclock_mode);
 		cs->archdata.vclock_mode = VCLOCK_NONE;
 	}
+
+	if (cs->mask != CLOCKSOURCE_MASK(64)) {
+		pr_warn("clocksource %s registered with invalid mask %016llx. Disabling vclock.\n",
+			cs->name, cs->mask);
+		cs->archdata.vclock_mode = VCLOCK_NONE;
+	}
 }

^ permalink raw reply

* [patch 04/11] x86/vdso: Use unsigned int consistently for vsyscall_gtod_data::seq
From: Thomas Gleixner @ 2018-09-14 12:50 UTC (permalink / raw)
  To: LKML
  Cc: Florian Weimer, Juergen Gross, Arnd Bergmann, Peter Zijlstra, x86,
	virtualization, Stephen Boyd, John Stultz, Andy Lutomirski,
	Paolo Bonzini, devel, Matt Rickard
In-Reply-To: <20180914125006.349747096@linutronix.de>

[-- Attachment #1: x86-vdso--Mintor-cleanups.patch --]
[-- Type: text/plain, Size: 2383 bytes --]

The sequence count in vgtod_data is unsigned int, but the call sites use
unsigned long, which is a pointless exercise. Fix the call sites and
replace 'unsigned' with 'unsigned int' while at it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/entry/vdso/vclock_gettime.c |    8 ++++----
 arch/x86/include/asm/vgtod.h         |   10 +++++-----
 2 files changed, 9 insertions(+), 9 deletions(-)

--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -206,7 +206,7 @@ notrace static inline u64 vgetsns(int *m
 /* Code size doesn't matter (vdso is 4k anyway) and this is faster. */
 notrace static int __always_inline do_realtime(struct timespec *ts)
 {
-	unsigned long seq;
+	unsigned int seq;
 	u64 ns;
 	int mode;
 
@@ -227,7 +227,7 @@ notrace static int __always_inline do_re
 
 notrace static int __always_inline do_monotonic(struct timespec *ts)
 {
-	unsigned long seq;
+	unsigned int seq;
 	u64 ns;
 	int mode;
 
@@ -248,7 +248,7 @@ notrace static int __always_inline do_mo
 
 notrace static void do_realtime_coarse(struct timespec *ts)
 {
-	unsigned long seq;
+	unsigned int seq;
 	do {
 		seq = gtod_read_begin(gtod);
 		ts->tv_sec = gtod->wall_time_coarse_sec;
@@ -258,7 +258,7 @@ notrace static void do_realtime_coarse(s
 
 notrace static void do_monotonic_coarse(struct timespec *ts)
 {
-	unsigned long seq;
+	unsigned int seq;
 	do {
 		seq = gtod_read_begin(gtod);
 		ts->tv_sec = gtod->monotonic_time_coarse_sec;
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -15,9 +15,9 @@ typedef unsigned long gtod_long_t;
  * so be carefull by modifying this structure.
  */
 struct vsyscall_gtod_data {
-	unsigned seq;
+	unsigned int seq;
 
-	int vclock_mode;
+	int	vclock_mode;
 	u64	cycle_last;
 	u64	mask;
 	u32	mult;
@@ -44,9 +44,9 @@ static inline bool vclock_was_used(int v
 	return READ_ONCE(vclocks_used) & (1 << vclock);
 }
 
-static inline unsigned gtod_read_begin(const struct vsyscall_gtod_data *s)
+static inline unsigned int gtod_read_begin(const struct vsyscall_gtod_data *s)
 {
-	unsigned ret;
+	unsigned int ret;
 
 repeat:
 	ret = READ_ONCE(s->seq);
@@ -59,7 +59,7 @@ static inline unsigned gtod_read_begin(c
 }
 
 static inline int gtod_read_retry(const struct vsyscall_gtod_data *s,
-					unsigned start)
+				  unsigned int start)
 {
 	smp_rmb();
 	return unlikely(s->seq != start);

^ permalink raw reply

* [patch 05/11] x86/vdso: Introduce and use vgtod_ts
From: Thomas Gleixner @ 2018-09-14 12:50 UTC (permalink / raw)
  To: LKML
  Cc: Florian Weimer, Juergen Gross, Arnd Bergmann, Peter Zijlstra, x86,
	virtualization, Stephen Boyd, John Stultz, Andy Lutomirski,
	Paolo Bonzini, devel, Matt Rickard
In-Reply-To: <20180914125006.349747096@linutronix.de>

[-- Attachment #1: x86-vdso--Introduce-and-use-vgtod_ts.patch --]
[-- Type: text/plain, Size: 7483 bytes --]

It's desired to support more clocks in the VDSO, e.g. CLOCK_TAI. This
results either in indirect calls due to the larger switch case, which then
require retpolines, or, when the compiler is forced to avoid jump tables,
in even more conditionals.

To avoid both variants, which are bad for performance, the high resolution
functions and the coarse grained functions will be collapsed into one
function each. That requires storing the clock specific base time in an
array.

Introduce struct vgtod_ts for storage and convert the data store, the
update function and the individual clock functions over to use it.

The new storage no longer uses gtod_long_t for seconds depending on 32
or 64 bit compile because this needs to be the full 64bit value even for
32bit when a Y2038 function is added. No point in keeping the distinction
alive in the internal representation.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/entry/vdso/vclock_gettime.c    |   24 +++++++++------
 arch/x86/entry/vsyscall/vsyscall_gtod.c |   51 ++++++++++++++++----------------
 arch/x86/include/asm/vgtod.h            |   36 ++++++++++++----------
 3 files changed, 61 insertions(+), 50 deletions(-)

--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -206,6 +206,7 @@ notrace static inline u64 vgetsns(int *m
 /* Code size doesn't matter (vdso is 4k anyway) and this is faster. */
 notrace static int __always_inline do_realtime(struct timespec *ts)
 {
+	struct vgtod_ts *base = &gtod->basetime[CLOCK_REALTIME];
 	unsigned int seq;
 	u64 ns;
 	int mode;
@@ -213,8 +214,8 @@ notrace static int __always_inline do_re
 	do {
 		seq = gtod_read_begin(gtod);
 		mode = gtod->vclock_mode;
-		ts->tv_sec = gtod->wall_time_sec;
-		ns = gtod->wall_time_snsec;
+		ts->tv_sec = base->sec;
+		ns = base->nsec;
 		ns += vgetsns(&mode);
 		ns >>= gtod->shift;
 	} while (unlikely(gtod_read_retry(gtod, seq)));
@@ -227,6 +228,7 @@ notrace static int __always_inline do_re
 
 notrace static int __always_inline do_monotonic(struct timespec *ts)
 {
+	struct vgtod_ts *base = &gtod->basetime[CLOCK_MONOTONIC];
 	unsigned int seq;
 	u64 ns;
 	int mode;
@@ -234,8 +236,8 @@ notrace static int __always_inline do_mo
 	do {
 		seq = gtod_read_begin(gtod);
 		mode = gtod->vclock_mode;
-		ts->tv_sec = gtod->monotonic_time_sec;
-		ns = gtod->monotonic_time_snsec;
+		ts->tv_sec = base->sec;
+		ns = base->nsec;
 		ns += vgetsns(&mode);
 		ns >>= gtod->shift;
 	} while (unlikely(gtod_read_retry(gtod, seq)));
@@ -248,21 +250,25 @@ notrace static int __always_inline do_mo
 
 notrace static void do_realtime_coarse(struct timespec *ts)
 {
+	struct vgtod_ts *base = &gtod->basetime[CLOCK_REALTIME_COARSE];
 	unsigned int seq;
+
 	do {
 		seq = gtod_read_begin(gtod);
-		ts->tv_sec = gtod->wall_time_coarse_sec;
-		ts->tv_nsec = gtod->wall_time_coarse_nsec;
+		ts->tv_sec = base->sec;
+		ts->tv_nsec = base->nsec;
 	} while (unlikely(gtod_read_retry(gtod, seq)));
 }
 
 notrace static void do_monotonic_coarse(struct timespec *ts)
 {
+	struct vgtod_ts *base = &gtod->basetime[CLOCK_MONOTONIC_COARSE];
 	unsigned int seq;
+
 	do {
 		seq = gtod_read_begin(gtod);
-		ts->tv_sec = gtod->monotonic_time_coarse_sec;
-		ts->tv_nsec = gtod->monotonic_time_coarse_nsec;
+		ts->tv_sec = base->sec;
+		ts->tv_nsec = base->nsec;
 	} while (unlikely(gtod_read_retry(gtod, seq)));
 }
 
@@ -318,7 +324,7 @@ int gettimeofday(struct timeval *, struc
 notrace time_t __vdso_time(time_t *t)
 {
 	/* This is atomic on x86 so we don't need any locks. */
-	time_t result = READ_ONCE(gtod->wall_time_sec);
+	time_t result = READ_ONCE(gtod->basetime[CLOCK_REALTIME].sec);
 
 	if (t)
 		*t = result;
--- a/arch/x86/entry/vsyscall/vsyscall_gtod.c
+++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c
@@ -31,6 +31,8 @@ void update_vsyscall(struct timekeeper *
 {
 	int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
 	struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
+	struct vgtod_ts *base;
+	u64 nsec;
 
 	/* Mark the new vclock used. */
 	BUILD_BUG_ON(VCLOCK_MAX >= 32);
@@ -45,34 +47,33 @@ void update_vsyscall(struct timekeeper *
 	vdata->mult		= tk->tkr_mono.mult;
 	vdata->shift		= tk->tkr_mono.shift;
 
-	vdata->wall_time_sec		= tk->xtime_sec;
-	vdata->wall_time_snsec		= tk->tkr_mono.xtime_nsec;
-
-	vdata->monotonic_time_sec	= tk->xtime_sec
-					+ tk->wall_to_monotonic.tv_sec;
-	vdata->monotonic_time_snsec	= tk->tkr_mono.xtime_nsec
-					+ ((u64)tk->wall_to_monotonic.tv_nsec
-						<< tk->tkr_mono.shift);
-	while (vdata->monotonic_time_snsec >=
-					(((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
-		vdata->monotonic_time_snsec -=
-					((u64)NSEC_PER_SEC) << tk->tkr_mono.shift;
-		vdata->monotonic_time_sec++;
+	base = &vdata->basetime[CLOCK_REALTIME];
+	base->sec = tk->xtime_sec;
+	base->nsec = tk->tkr_mono.xtime_nsec;
+
+	base = &vdata->basetime[CLOCK_MONOTONIC];
+	base->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
+	nsec = tk->tkr_mono.xtime_nsec;
+	nsec +=	((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift);
+	while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
+		nsec -= ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift;
+		base->sec++;
 	}
+	base->nsec = nsec;
 
-	vdata->wall_time_coarse_sec	= tk->xtime_sec;
-	vdata->wall_time_coarse_nsec	= (long)(tk->tkr_mono.xtime_nsec >>
-						 tk->tkr_mono.shift);
-
-	vdata->monotonic_time_coarse_sec =
-		vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec;
-	vdata->monotonic_time_coarse_nsec =
-		vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec;
-
-	while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) {
-		vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC;
-		vdata->monotonic_time_coarse_sec++;
+	base = &vdata->basetime[CLOCK_REALTIME_COARSE];
+	base->sec = tk->xtime_sec;
+	base->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+
+	base = &vdata->basetime[CLOCK_MONOTONIC_COARSE];
+	base->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
+	nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+	nsec += tk->wall_to_monotonic.tv_nsec;
+	while (nsec >= NSEC_PER_SEC) {
+		nsec -= NSEC_PER_SEC;
+		base->sec++;
 	}
+	base->nsec = nsec;
 
 	gtod_write_end(vdata);
 }
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -5,33 +5,37 @@
 #include <linux/compiler.h>
 #include <linux/clocksource.h>
 
+#include <uapi/linux/time.h>
+
 #ifdef BUILD_VDSO32_64
 typedef u64 gtod_long_t;
 #else
 typedef unsigned long gtod_long_t;
 #endif
+
+struct vgtod_ts {
+	u64		sec;
+	u64		nsec;
+};
+
+#define VGTOD_BASES	(CLOCK_MONOTONIC_COARSE + 1)
+#define VGTOD_HRES	(BIT(CLOCK_REALTIME) | BIT(CLOCK_MONOTONIC))
+#define VGTOD_COARSE	(BIT(CLOCK_REALTIME_COARSE) | BIT(CLOCK_MONOTONIC_COARSE))
+
 /*
  * vsyscall_gtod_data will be accessed by 32 and 64 bit code at the same time
  * so be carefull by modifying this structure.
  */
 struct vsyscall_gtod_data {
-	unsigned int seq;
+	unsigned int	seq;
+
+	int		vclock_mode;
+	u64		cycle_last;
+	u64		mask;
+	u32		mult;
+	u32		shift;
 
-	int	vclock_mode;
-	u64	cycle_last;
-	u64	mask;
-	u32	mult;
-	u32	shift;
-
-	/* open coded 'struct timespec' */
-	u64		wall_time_snsec;
-	gtod_long_t	wall_time_sec;
-	gtod_long_t	monotonic_time_sec;
-	u64		monotonic_time_snsec;
-	gtod_long_t	wall_time_coarse_sec;
-	gtod_long_t	wall_time_coarse_nsec;
-	gtod_long_t	monotonic_time_coarse_sec;
-	gtod_long_t	monotonic_time_coarse_nsec;
+	struct vgtod_ts	basetime[VGTOD_BASES];
 
 	int		tz_minuteswest;
 	int		tz_dsttime;

^ permalink raw reply

* [patch 06/11] x86/vdso: Collapse high resolution functions
From: Thomas Gleixner @ 2018-09-14 12:50 UTC (permalink / raw)
  To: LKML
  Cc: Florian Weimer, Juergen Gross, Arnd Bergmann, Peter Zijlstra, x86,
	virtualization, Stephen Boyd, John Stultz, Andy Lutomirski,
	Paolo Bonzini, devel, Matt Rickard
In-Reply-To: <20180914125006.349747096@linutronix.de>

[-- Attachment #1: x86-vdso--Collapse-high-res-functions.patch --]
[-- Type: text/plain, Size: 2221 bytes --]

do_realtime() and do_monotonic() are now the same except for the storage
array index. Hand the index in as an argument and collapse the functions.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/entry/vdso/vclock_gettime.c |   35 +++++++----------------------------
 1 file changed, 7 insertions(+), 28 deletions(-)

--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -203,35 +203,12 @@ notrace static inline u64 vgetsns(int *m
 	return v * gtod->mult;
 }
 
-/* Code size doesn't matter (vdso is 4k anyway) and this is faster. */
-notrace static int __always_inline do_realtime(struct timespec *ts)
+notrace static int do_hres(clockid_t clk, struct timespec *ts)
 {
-	struct vgtod_ts *base = &gtod->basetime[CLOCK_REALTIME];
+	struct vgtod_ts *base = &gtod->basetime[clk];
 	unsigned int seq;
-	u64 ns;
 	int mode;
-
-	do {
-		seq = gtod_read_begin(gtod);
-		mode = gtod->vclock_mode;
-		ts->tv_sec = base->sec;
-		ns = base->nsec;
-		ns += vgetsns(&mode);
-		ns >>= gtod->shift;
-	} while (unlikely(gtod_read_retry(gtod, seq)));
-
-	ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
-	ts->tv_nsec = ns;
-
-	return mode;
-}
-
-notrace static int __always_inline do_monotonic(struct timespec *ts)
-{
-	struct vgtod_ts *base = &gtod->basetime[CLOCK_MONOTONIC];
-	unsigned int seq;
 	u64 ns;
-	int mode;
 
 	do {
 		seq = gtod_read_begin(gtod);
@@ -276,11 +253,11 @@ notrace int __vdso_clock_gettime(clockid
 {
 	switch (clock) {
 	case CLOCK_REALTIME:
-		if (do_realtime(ts) == VCLOCK_NONE)
+		if (do_hres(CLOCK_REALTIME, ts) == VCLOCK_NONE)
 			goto fallback;
 		break;
 	case CLOCK_MONOTONIC:
-		if (do_monotonic(ts) == VCLOCK_NONE)
+		if (do_hres(CLOCK_MONOTONIC, ts) == VCLOCK_NONE)
 			goto fallback;
 		break;
 	case CLOCK_REALTIME_COARSE:
@@ -303,7 +280,9 @@ int clock_gettime(clockid_t, struct time
 notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
 {
 	if (likely(tv != NULL)) {
-		if (unlikely(do_realtime((struct timespec *)tv) == VCLOCK_NONE))
+		struct timespec *ts = (struct timespec *) tv;
+
+		if (unlikely(do_hres(CLOCK_REALTIME, ts) == VCLOCK_NONE))
 			return vdso_fallback_gtod(tv, tz);
 		tv->tv_usec /= 1000;
 	}

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox