* [PATCH V2,net-next] net: mana: Add page pool for RX buffers
From: Haiyang Zhang @ 2023-07-18 21:48 UTC (permalink / raw)
To: linux-hyperv@vger.kernel.org, netdev@vger.kernel.org
Cc: Haiyang Zhang, Dexuan Cui, KY Srinivasan, Paul Rosswurm,
olaf@aepfle.de, vkuznets@redhat.com, davem@davemloft.net,
wei.liu@kernel.org, edumazet@google.com, kuba@kernel.org,
pabeni@redhat.com, leon@kernel.org, Long Li,
ssengar@linux.microsoft.com, linux-rdma@vger.kernel.org,
daniel@iogearbox.net, john.fastabend@gmail.com,
bpf@vger.kernel.org, ast@kernel.org, Ajay Sharma, hawk@kernel.org,
tglx@linutronix.de, shradhagupta@linux.microsoft.com,
linux-kernel@vger.kernel.org
Add a page pool for RX buffers to speed up buffer cycling and reduce
CPU usage.
The standard page pool API is used.
Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
---
V2:
Use the standard page pool API as suggested by Jesper Dangaard Brouer
---
drivers/net/ethernet/microsoft/mana/mana_en.c | 101 +++++++++++++++---
include/net/mana/mana.h | 3 +
2 files changed, 89 insertions(+), 15 deletions(-)
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index a499e460594b..0b557b70cd45 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1414,8 +1414,8 @@ static struct sk_buff *mana_build_skb(struct mana_rxq *rxq, void *buf_va,
return skb;
}
-static void mana_rx_skb(void *buf_va, struct mana_rxcomp_oob *cqe,
- struct mana_rxq *rxq)
+static void mana_rx_skb(void *buf_va, bool from_pool,
+ struct mana_rxcomp_oob *cqe, struct mana_rxq *rxq)
{
struct mana_stats_rx *rx_stats = &rxq->stats;
struct net_device *ndev = rxq->ndev;
@@ -1437,8 +1437,12 @@ static void mana_rx_skb(void *buf_va, struct mana_rxcomp_oob *cqe,
act = mana_run_xdp(ndev, rxq, &xdp, buf_va, pkt_len);
- if (act == XDP_REDIRECT && !rxq->xdp_rc)
+ if (act == XDP_REDIRECT && !rxq->xdp_rc) {
+ if (from_pool)
+ page_pool_release_page(rxq->page_pool,
+ virt_to_head_page(buf_va));
return;
+ }
if (act != XDP_PASS && act != XDP_TX)
goto drop_xdp;
@@ -1448,6 +1452,9 @@ static void mana_rx_skb(void *buf_va, struct mana_rxcomp_oob *cqe,
if (!skb)
goto drop;
+ if (from_pool)
+ skb_mark_for_recycle(skb);
+
skb->dev = napi->dev;
skb->protocol = eth_type_trans(skb, ndev);
@@ -1498,9 +1505,14 @@ static void mana_rx_skb(void *buf_va, struct mana_rxcomp_oob *cqe,
u64_stats_update_end(&rx_stats->syncp);
drop:
- WARN_ON_ONCE(rxq->xdp_save_va);
- /* Save for reuse */
- rxq->xdp_save_va = buf_va;
+ if (from_pool) {
+ page_pool_recycle_direct(rxq->page_pool,
+ virt_to_head_page(buf_va));
+ } else {
+ WARN_ON_ONCE(rxq->xdp_save_va);
+ /* Save for reuse */
+ rxq->xdp_save_va = buf_va;
+ }
++ndev->stats.rx_dropped;
@@ -1508,11 +1520,13 @@ static void mana_rx_skb(void *buf_va, struct mana_rxcomp_oob *cqe,
}
static void *mana_get_rxfrag(struct mana_rxq *rxq, struct device *dev,
- dma_addr_t *da, bool is_napi)
+ dma_addr_t *da, bool *from_pool, bool is_napi)
{
struct page *page;
void *va;
+ *from_pool = false;
+
/* Reuse XDP dropped page if available */
if (rxq->xdp_save_va) {
va = rxq->xdp_save_va;
@@ -1533,7 +1547,13 @@ static void *mana_get_rxfrag(struct mana_rxq *rxq, struct device *dev,
return NULL;
}
} else {
- page = dev_alloc_page();
+ if (is_napi) {
+ page = page_pool_dev_alloc_pages(rxq->page_pool);
+ *from_pool = true;
+ } else {
+ page = dev_alloc_page();
+ }
+
if (!page)
return NULL;
@@ -1543,7 +1563,11 @@ static void *mana_get_rxfrag(struct mana_rxq *rxq, struct device *dev,
*da = dma_map_single(dev, va + rxq->headroom, rxq->datasize,
DMA_FROM_DEVICE);
if (dma_mapping_error(dev, *da)) {
- put_page(virt_to_head_page(va));
+ if (*from_pool)
+ page_pool_put_full_page(rxq->page_pool, page, true);
+ else
+ put_page(virt_to_head_page(va));
+
return NULL;
}
@@ -1552,21 +1576,25 @@ static void *mana_get_rxfrag(struct mana_rxq *rxq, struct device *dev,
/* Allocate frag for rx buffer, and save the old buf */
static void mana_refill_rx_oob(struct device *dev, struct mana_rxq *rxq,
- struct mana_recv_buf_oob *rxoob, void **old_buf)
+ struct mana_recv_buf_oob *rxoob, void **old_buf,
+ bool *old_fp)
{
+ bool from_pool;
dma_addr_t da;
void *va;
- va = mana_get_rxfrag(rxq, dev, &da, true);
+ va = mana_get_rxfrag(rxq, dev, &da, &from_pool, true);
if (!va)
return;
dma_unmap_single(dev, rxoob->sgl[0].address, rxq->datasize,
DMA_FROM_DEVICE);
*old_buf = rxoob->buf_va;
+ *old_fp = rxoob->from_pool;
rxoob->buf_va = va;
rxoob->sgl[0].address = da;
+ rxoob->from_pool = from_pool;
}
static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
@@ -1580,6 +1608,7 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
struct device *dev = gc->dev;
void *old_buf = NULL;
u32 curr, pktlen;
+ bool old_fp;
apc = netdev_priv(ndev);
@@ -1622,12 +1651,12 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
rxbuf_oob = &rxq->rx_oobs[curr];
WARN_ON_ONCE(rxbuf_oob->wqe_inf.wqe_size_in_bu != 1);
- mana_refill_rx_oob(dev, rxq, rxbuf_oob, &old_buf);
+ mana_refill_rx_oob(dev, rxq, rxbuf_oob, &old_buf, &old_fp);
/* Unsuccessful refill will have old_buf == NULL.
* In this case, mana_rx_skb() will drop the packet.
*/
- mana_rx_skb(old_buf, oob, rxq);
+ mana_rx_skb(old_buf, old_fp, oob, rxq);
drop:
mana_move_wq_tail(rxq->gdma_rq, rxbuf_oob->wqe_inf.wqe_size_in_bu);
@@ -1659,6 +1688,8 @@ static void mana_poll_rx_cq(struct mana_cq *cq)
if (rxq->xdp_flush)
xdp_do_flush();
+
+ page_pool_nid_changed(rxq->page_pool, numa_mem_id());
}
static int mana_cq_handler(void *context, struct gdma_queue *gdma_queue)
@@ -1881,6 +1912,7 @@ static void mana_destroy_rxq(struct mana_port_context *apc,
struct mana_recv_buf_oob *rx_oob;
struct device *dev = gc->dev;
struct napi_struct *napi;
+ struct page *page;
int i;
if (!rxq)
@@ -1913,10 +1945,18 @@ static void mana_destroy_rxq(struct mana_port_context *apc,
dma_unmap_single(dev, rx_oob->sgl[0].address,
rx_oob->sgl[0].size, DMA_FROM_DEVICE);
- put_page(virt_to_head_page(rx_oob->buf_va));
+ page = virt_to_head_page(rx_oob->buf_va);
+
+ if (rx_oob->from_pool)
+ page_pool_put_full_page(rxq->page_pool, page, false);
+ else
+ put_page(page);
+
rx_oob->buf_va = NULL;
}
+ page_pool_destroy(rxq->page_pool);
+
if (rxq->gdma_rq)
mana_gd_destroy_queue(gc, rxq->gdma_rq);
@@ -1927,18 +1967,20 @@ static int mana_fill_rx_oob(struct mana_recv_buf_oob *rx_oob, u32 mem_key,
struct mana_rxq *rxq, struct device *dev)
{
struct mana_port_context *mpc = netdev_priv(rxq->ndev);
+ bool from_pool = false;
dma_addr_t da;
void *va;
if (mpc->rxbufs_pre)
va = mana_get_rxbuf_pre(rxq, &da);
else
- va = mana_get_rxfrag(rxq, dev, &da, false);
+ va = mana_get_rxfrag(rxq, dev, &da, &from_pool, false);
if (!va)
return -ENOMEM;
rx_oob->buf_va = va;
+ rx_oob->from_pool = from_pool;
rx_oob->sgl[0].address = da;
rx_oob->sgl[0].size = rxq->datasize;
@@ -2008,6 +2050,28 @@ static int mana_push_wqe(struct mana_rxq *rxq)
return 0;
}
+static int mana_create_page_pool(struct gdma_context *gc, struct mana_cq *cq,
+ struct mana_rxq *rxq)
+{
+ struct page_pool_params pprm = {};
+ int ret;
+
+ pprm.pool_size = RX_BUFFERS_PER_QUEUE;
+ pprm.napi = &cq->napi;
+ pprm.dev = gc->dev;
+ pprm.dma_dir = DMA_FROM_DEVICE;
+
+ rxq->page_pool = page_pool_create(&pprm);
+
+ if (IS_ERR(rxq->page_pool)) {
+ ret = PTR_ERR(rxq->page_pool);
+ rxq->page_pool = NULL;
+ return ret;
+ }
+
+ return 0;
+}
+
static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
u32 rxq_idx, struct mana_eq *eq,
struct net_device *ndev)
@@ -2106,6 +2170,13 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
netif_napi_add_weight(ndev, &cq->napi, mana_poll, 1);
+ /* Create page pool for RX queue */
+ err = mana_create_page_pool(gc, cq, rxq);
+ if (err) {
+ netdev_err(ndev, "Create page pool err:%d\n", err);
+ goto out;
+ }
+
WARN_ON(xdp_rxq_info_reg(&rxq->xdp_rxq, ndev, rxq_idx,
cq->napi.napi_id));
WARN_ON(xdp_rxq_info_reg_mem_model(&rxq->xdp_rxq,
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 024ad8ddb27e..b12859511839 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -280,6 +280,7 @@ struct mana_recv_buf_oob {
struct gdma_wqe_request wqe_req;
void *buf_va;
+ bool from_pool; /* allocated from a page pool */
/* SGL of the buffer going to be sent has part of the work request. */
u32 num_sge;
@@ -330,6 +331,8 @@ struct mana_rxq {
bool xdp_flush;
int xdp_rc; /* XDP redirect return code */
+ struct page_pool *page_pool;
+
/* MUST BE THE LAST MEMBER:
* Each receive buffer has an associated mana_recv_buf_oob.
*/
--
2.25.1
^ permalink raw reply related
* [PATCH V4 net-next] net: mana: Configure hwc timeout from hardware
From: Souradeep Chakrabarti @ 2023-07-18 18:00 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, davem, edumazet, kuba, pabeni,
longli, sharmaajay, leon, cai.huoqing, ssengar, vkuznets, tglx,
linux-hyperv, netdev, linux-kernel, linux-rdma
Cc: schakrabarti, Souradeep Chakrabarti
At present hwc timeout value is a fixed value. This patch sets the hwc
timeout from the hardware. It now uses a new hardware capability
GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG to query and set the value
in hwc_timeout.
Signed-off-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
---
V3 -> V4:
* Changing branch to net-next.
* Changed the commit message to 75 chars per line.
---
.../net/ethernet/microsoft/mana/gdma_main.c | 30 ++++++++++++++++++-
.../net/ethernet/microsoft/mana/hw_channel.c | 25 +++++++++++++++-
include/net/mana/gdma.h | 20 ++++++++++++-
include/net/mana/hw_channel.h | 5 ++++
4 files changed, 77 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 8f3f78b68592..4537a70e30d4 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -106,6 +106,25 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev)
return 0;
}
+static int mana_gd_query_hwc_timeout(struct pci_dev *pdev, u32 *timeout_val)
+{
+ struct gdma_context *gc = pci_get_drvdata(pdev);
+ struct gdma_query_hwc_timeout_resp resp = {};
+ struct gdma_query_hwc_timeout_req req = {};
+ int err;
+
+ mana_gd_init_req_hdr(&req.hdr, GDMA_QUERY_HWC_TIMEOUT,
+ sizeof(req), sizeof(resp));
+ req.timeout_ms = *timeout_val;
+ err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
+ if (err || resp.hdr.status)
+ return err ? err : -EPROTO;
+
+ *timeout_val = resp.timeout_ms;
+
+ return 0;
+}
+
static int mana_gd_detect_devices(struct pci_dev *pdev)
{
struct gdma_context *gc = pci_get_drvdata(pdev);
@@ -879,8 +898,11 @@ int mana_gd_verify_vf_version(struct pci_dev *pdev)
struct gdma_context *gc = pci_get_drvdata(pdev);
struct gdma_verify_ver_resp resp = {};
struct gdma_verify_ver_req req = {};
+ struct hw_channel_context *hwc;
int err;
+ hwc = gc->hwc.driver_data;
+
mana_gd_init_req_hdr(&req.hdr, GDMA_VERIFY_VF_DRIVER_VERSION,
sizeof(req), sizeof(resp));
@@ -907,7 +929,13 @@ int mana_gd_verify_vf_version(struct pci_dev *pdev)
err, resp.hdr.status);
return err ? err : -EPROTO;
}
-
+ if (resp.pf_cap_flags1 & GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG) {
+ err = mana_gd_query_hwc_timeout(pdev, &hwc->hwc_timeout);
+ if (err) {
+ dev_err(gc->dev, "Failed to set the hwc timeout %d\n", err);
+ return err;
+ }
+ }
return 0;
}
diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c
index 2bd1d74021f7..db433501e5e6 100644
--- a/drivers/net/ethernet/microsoft/mana/hw_channel.c
+++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c
@@ -174,7 +174,25 @@ static void mana_hwc_init_event_handler(void *ctx, struct gdma_queue *q_self,
complete(&hwc->hwc_init_eqe_comp);
break;
+ case GDMA_EQE_HWC_SOC_RECONFIG_DATA:
+ type_data.as_uint32 = event->details[0];
+ type = type_data.type;
+ val = type_data.value;
+
+ switch (type) {
+ case HWC_DATA_CFG_HWC_TIMEOUT:
+ hwc->hwc_timeout = val;
+ break;
+
+ default:
+ dev_warn(hwc->dev, "Received unknown reconfig type %u\n", type);
+ break;
+ }
+
+ break;
+
default:
+ dev_warn(hwc->dev, "Received unknown gdma event %u\n", event->type);
/* Ignore unknown events, which should never happen. */
break;
}
@@ -704,6 +722,7 @@ int mana_hwc_create_channel(struct gdma_context *gc)
gd->pdid = INVALID_PDID;
gd->doorbell = INVALID_DOORBELL;
+ hwc->hwc_timeout = HW_CHANNEL_WAIT_RESOURCE_TIMEOUT_MS;
/* mana_hwc_init_queues() only creates the required data structures,
* and doesn't touch the HWC device.
*/
@@ -770,6 +789,8 @@ void mana_hwc_destroy_channel(struct gdma_context *gc)
hwc->gdma_dev->doorbell = INVALID_DOORBELL;
hwc->gdma_dev->pdid = INVALID_PDID;
+ hwc->hwc_timeout = 0;
+
kfree(hwc);
gc->hwc.driver_data = NULL;
gc->hwc.gdma_context = NULL;
@@ -818,6 +839,7 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
dest_vrq = hwc->pf_dest_vrq_id;
dest_vrcq = hwc->pf_dest_vrcq_id;
}
+ dev_err(hwc->dev, "HWC: timeout %u ms\n", hwc->hwc_timeout);
err = mana_hwc_post_tx_wqe(txq, tx_wr, dest_vrq, dest_vrcq, false);
if (err) {
@@ -825,7 +847,8 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
goto out;
}
- if (!wait_for_completion_timeout(&ctx->comp_event, 30 * HZ)) {
+ if (!wait_for_completion_timeout(&ctx->comp_event,
+ (hwc->hwc_timeout / 1000) * HZ)) {
dev_err(hwc->dev, "HWC: Request timed out!\n");
err = -ETIMEDOUT;
goto out;
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 96c120160f15..88b6ef7ce1a6 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -33,6 +33,7 @@ enum gdma_request_type {
GDMA_DESTROY_PD = 30,
GDMA_CREATE_MR = 31,
GDMA_DESTROY_MR = 32,
+ GDMA_QUERY_HWC_TIMEOUT = 84, /* 0x54 */
};
#define GDMA_RESOURCE_DOORBELL_PAGE 27
@@ -57,6 +58,8 @@ enum gdma_eqe_type {
GDMA_EQE_HWC_INIT_EQ_ID_DB = 129,
GDMA_EQE_HWC_INIT_DATA = 130,
GDMA_EQE_HWC_INIT_DONE = 131,
+ GDMA_EQE_HWC_SOC_RECONFIG = 132,
+ GDMA_EQE_HWC_SOC_RECONFIG_DATA = 133,
};
enum {
@@ -531,10 +534,12 @@ enum {
* so the driver is able to reliably support features like busy_poll.
*/
#define GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX BIT(2)
+#define GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG BIT(3)
#define GDMA_DRV_CAP_FLAGS1 \
(GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \
- GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX)
+ GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \
+ GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG)
#define GDMA_DRV_CAP_FLAGS2 0
@@ -664,6 +669,19 @@ struct gdma_disable_queue_req {
u32 alloc_res_id_on_creation;
}; /* HW DATA */
+/* GDMA_QUERY_HWC_TIMEOUT */
+struct gdma_query_hwc_timeout_req {
+ struct gdma_req_hdr hdr;
+ u32 timeout_ms;
+ u32 reserved;
+};
+
+struct gdma_query_hwc_timeout_resp {
+ struct gdma_resp_hdr hdr;
+ u32 timeout_ms;
+ u32 reserved;
+};
+
enum atb_page_size {
ATB_PAGE_SIZE_4K,
ATB_PAGE_SIZE_8K,
diff --git a/include/net/mana/hw_channel.h b/include/net/mana/hw_channel.h
index 6a757a6e2732..3d3b5c881bc1 100644
--- a/include/net/mana/hw_channel.h
+++ b/include/net/mana/hw_channel.h
@@ -23,6 +23,10 @@
#define HWC_INIT_DATA_PF_DEST_RQ_ID 10
#define HWC_INIT_DATA_PF_DEST_CQ_ID 11
+#define HWC_DATA_CFG_HWC_TIMEOUT 1
+
+#define HW_CHANNEL_WAIT_RESOURCE_TIMEOUT_MS 30000
+
/* Structures labeled with "HW DATA" are exchanged with the hardware. All of
* them are naturally aligned and hence don't need __packed.
*/
@@ -182,6 +186,7 @@ struct hw_channel_context {
u32 pf_dest_vrq_id;
u32 pf_dest_vrcq_id;
+ u32 hwc_timeout;
struct hwc_caller_ctx *caller_ctx;
};
--
2.34.1
^ permalink raw reply related
* Re: [PATCH 07/12] arch/x86: Declare edid_info in <asm/screen_info.h>
From: Arnd Bergmann @ 2023-07-18 14:47 UTC (permalink / raw)
To: Thomas Zimmermann, Helge Deller, Daniel Vetter, Dave Airlie
Cc: linux-hyperv, linux-efi, linux-ia64, linux-sh, Peter Zijlstra,
Dave Hansen, linux-fbdev, dri-devel, linux-mips, H. Peter Anvin,
sparclinux, linux-riscv, Ard Biesheuvel, Linux-Arch,
linux-hexagon, linux-staging, linux-csky@vger.kernel.org,
Ingo Molnar, Sami Tolvanen, Kees Cook, Paul E. McKenney,
Frederic Weisbecker, Nicholas Piggin, Borislav Petkov, loongarch,
Thomas Gleixner, linux-arm-kernel, x86, linux-kernel,
Juerg Haefliger, linux-alpha, Andrew Morton, linuxppc-dev
In-Reply-To: <150c0fa2-bff2-0644-d6e5-c4dab7f79048@suse.de>
On Wed, Jul 5, 2023, at 10:18, Thomas Zimmermann wrote:
> Am 30.06.23 um 13:53 schrieb Arnd Bergmann:
>> On Fri, Jun 30, 2023, at 09:46, Thomas Zimmermann wrote:
>>> Am 29.06.23 um 15:21 schrieb Arnd Bergmann:
>>
>> I definitely get it for the screen_info, which needs the complexity.
>> For ARCH_HAS_EDID_INFO I would hope that it's never selected by
>> anything other than x86, so I would still go with just a dependency
>> on x86 for simplicity, but I don't mind having the extra symbol if that
>> keeps it more consistent with how the screen_info is handled.
>
> Well, I'd like to add edid_info to platforms with EFI. That would be
> arm/arm64 and loongarch, I guess. See below for the future plans.
To be clear: I don't mind using a 'struct edid_info' being passed
around between subsystems, that is clearly an improvement over
'struct screen_info'. It's the global variable that seems like
an artifact of linux-2.4 days, and I think we can do better than that.
>>>> I suppose you could use FIRMWARE_EDID on EFI or OF systems without
>>>> the need for a global edid_info structure, but that would not
>>>> share any code with the current fb_firmware_edid() function.
>>>
>>> The current code is built on top of screen_info and edid_info. I'd
>>> preferably not replace that, if possible.
>>
>> One way I could imagine this looking in the end would be
>> something like
>>
>> struct screen_info *fb_screen_info(struct device *dev)
>> {
>> struct screen_info *si = NULL;
>>
>> if (IS_ENABLED(CONFIG_EFI))
>> si = efi_get_screen_info(dev);
>>
>> if (IS_ENABLED(CONFIG_ARCH_HAS_SCREEN_INFO) && !si)
>> si = screen_info;
>>
>> return si;
>> }
>>
>> corresponding to fb_firmware_edid(). With this, any driver
>> that wants to access screen_info would call this function
>> instead of using the global pointer, plus either NULL pointer
>> check or a CONFIG_ARCH_HAS_SCREEN_INFO dependency.
>>
>> This way we could completely eliminate the global screen_info
>> on arm64, riscv, and loongarch but still use the efi and
>> hyperv framebuffer/drm drivers.
>
> If possible, I'd like to remove global screen_info and edid_info
> entirely from fbdev and the various consoles.
ok
> We currently use screen_info to set up the generic framebuffer device in
> drivers/firmware/sysfb.c. I'd like to use edid_info here as well, so
> that the generic graphics drivers can get EDID information.
>
> For the few fbdev drivers and consoles that require the global
> screen_info/edid_info, I'd rather provide lookup functions in sysfb
> (e.g., sysfb_get_screen_info(), sysfb_get_edid_info()). The global
> screen_info/edid_info state would then become an internal artifact of
> the sysfb code.
>
> Hopefully that explains some of the decisions made in this patchset.
I spent some more time looking at the screen_info side, after my
first set of patches to refine the #ifdefs, and I think we don't
even need to make screen_info available to non-x86 drivers at all:
- All the vgacon users except for x86 can just register a static
screen_info (or simplified into a simpler structure) with the
driver itself. This even includes ia64, which does not support
EFI framebuffers.
- The VESA, vga16, SIS, Intel and HyperV framebuffer drivers only
need access to screen_info on x86. HyperV is the only driver that
can currently access the data from EFI firmware on arm64, but
that is only used for 'gen 1' guests, which I'm pretty sure
only exist on x86.
- All the other references to screen_info are specific to EFI
firmware, so we can move the global definition from arm,
arm64, loongarch, riscv and ia64 into the EFI firmware
code itself. It is still accessed by efifb and efi-earlycon
at this point.
I have uploaded version 2 of my series to
https://git.kernel.org/pub/scm/linux/kernel/git/arnd/playground.git/log/?h=screen-info-v2
and will send it out after I get the green light from build
bots.
Arnd
^ permalink raw reply
* RE: [PATCH] hv_netvsc: support a new host capability AllowRscDisabledStatus
From: Haiyang Zhang @ 2023-07-18 13:52 UTC (permalink / raw)
To: Shradha Gupta
Cc: linux-kernel@vger.kernel.org, linux-hyperv@vger.kernel.org,
netdev@vger.kernel.org, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
KY Srinivasan, Wei Liu, Dexuan Cui, Long Li,
Michael Kelley (LINUX), David S. Miller
In-Reply-To: <20230718101845.GA24931@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net>
> -----Original Message-----
> From: Shradha Gupta <shradhagupta@linux.microsoft.com>
> Sent: Tuesday, July 18, 2023 6:19 AM
> To: Haiyang Zhang <haiyangz@microsoft.com>
> Cc: linux-kernel@vger.kernel.org; linux-hyperv@vger.kernel.org;
> netdev@vger.kernel.org; Eric Dumazet <edumazet@google.com>; Jakub Kicinski
> <kuba@kernel.org>; Paolo Abeni <pabeni@redhat.com>; KY Srinivasan
> <kys@microsoft.com>; Wei Liu <wei.liu@kernel.org>; Dexuan Cui
> <decui@microsoft.com>; Long Li <longli@microsoft.com>; Michael Kelley
> (LINUX) <mikelley@microsoft.com>; David S. Miller <davem@davemloft.net>
> Subject: Re: [PATCH] hv_netvsc: support a new host capability
> AllowRscDisabledStatus
>
> On Sun, Jul 02, 2023 at 09:37:42PM -0700, Shradha Gupta wrote:
> > On Thu, Jun 29, 2023 at 12:44:26PM +0000, Haiyang Zhang wrote:
> > >
> > >
> > > > -----Original Message-----
> > > > From: Shradha Gupta <shradhagupta@linux.microsoft.com>
> > > > Sent: Thursday, June 29, 2023 5:59 AM
> > > > To: linux-kernel@vger.kernel.org; linux-hyperv@vger.kernel.org;
> > > > netdev@vger.kernel.org
> > > > Cc: Shradha Gupta <shradhagupta@linux.microsoft.com>; Eric Dumazet
> > > > <edumazet@google.com>; Jakub Kicinski <kuba@kernel.org>; Paolo Abeni
> > > > <pabeni@redhat.com>; KY Srinivasan <kys@microsoft.com>; Haiyang
> Zhang
> > > > <haiyangz@microsoft.com>; Wei Liu <wei.liu@kernel.org>; Dexuan Cui
> > > > <decui@microsoft.com>; Long Li <longli@microsoft.com>; Michael Kelley
> > > > (LINUX) <mikelley@microsoft.com>; David S. Miller
> <davem@davemloft.net>
> > > > Subject: [PATCH] hv_netvsc: support a new host capability
> > > > AllowRscDisabledStatus
> > > >
> > > > A future Azure host update has the potential to change RSC behavior
> > > > > in the VMs. To avoid this invisible change, Vswitch will check the
> > > > netvsc version of a VM before sending its RSC capabilities, and will
> > > > always indicate that the host performs RSC if the VM doesn't have an
> > > > updated netvsc driver regardless of the actual host RSC capabilities.
> > > > Netvsc now advertises a new capability: AllowRscDisabledStatus
> > > > The host will check for this capability before sending RSC status,
> > > > and if a VM does not have this capability it will send RSC enabled
> > > > status regardless of host RSC settings
> > > >
> > > > Signed-off-by: Shradha Gupta <shradhagupta@linux.microsoft.com>
> > > > ---
> > > > drivers/net/hyperv/hyperv_net.h | 3 +++
> > > > drivers/net/hyperv/netvsc.c | 8 ++++++++
> > > > 2 files changed, 11 insertions(+)
> > > >
> > > > diff --git a/drivers/net/hyperv/hyperv_net.h
> b/drivers/net/hyperv/hyperv_net.h
> > > > index dd5919ec408b..218e0f31dd66 100644
> > > > --- a/drivers/net/hyperv/hyperv_net.h
> > > > +++ b/drivers/net/hyperv/hyperv_net.h
> > > > @@ -572,6 +572,9 @@ struct nvsp_2_vsc_capability {
> > > > u64 teaming:1;
> > > > u64 vsubnetid:1;
> > > > u64 rsc:1;
> > > > + u64 timestamp:1;
> > > > + u64 reliablecorrelationid:1;
> > > > + u64 allowrscdisabledstatus:1;
> > > > };
> > > > };
> > > > } __packed;
> > > > diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
> > > > index da737d959e81..2eb1e85ba940 100644
> > > > --- a/drivers/net/hyperv/netvsc.c
> > > > +++ b/drivers/net/hyperv/netvsc.c
> > > > @@ -619,6 +619,14 @@ static int negotiate_nvsp_ver(struct hv_device
> > > > *device,
> > > > init_packet->msg.v2_msg.send_ndis_config.mtu = ndev->mtu +
> > > > ETH_HLEN;
> > > > init_packet->msg.v2_msg.send_ndis_config.capability.ieee8021q = 1;
> > > >
> > > > + /* Don't need a version check while setting this bit because if we
> > > > + * have a New VM on an old host, the VM will set the bit but the host
> > > > + * won't check it. If we have an old VM on a new host, the host will
> > > > + * check the bit, see its zero, and it'll know the VM has an
> > > > + * older NetVsc
> > > > + */
> > > > + init_packet-
> > > > >msg.v2_msg.send_ndis_config.capability.allowrscdisabledstatus = 1;
> > >
> > > Have you tested on the new host to verify: Before this patch, the host shows
> > > RSC status on, and after this patch the host shows it's off?
> > I have completed the patch sanity tests. We are working on an upgraded host
> setup
> > to test the rsc specific changes, will update with results soon.
> > >
> > > Thanks,
> > > - Haiyang
>
> Completed this testing, rsc status reflects properly with the patch.
Thanks for the update.
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
^ permalink raw reply
* Re: [PATCH v4 11/18] media: Remove flag FBINFO_FLAG_DEFAULT from fbdev drivers
From: Hans Verkuil @ 2023-07-18 10:58 UTC (permalink / raw)
To: Thomas Zimmermann, deller, javierm, geert, dan.carpenter
Cc: linux-sh, dri-devel, linux-kernel, amd-gfx, linux-input,
linux-media, linux-fbdev, linux-staging, linux-arm-kernel,
linux-geode, linux-hyperv, linux-omap, linuxppc-dev, kvm,
Sam Ravnborg, Andy Walls, Mauro Carvalho Chehab
In-Reply-To: <20230715185343.7193-12-tzimmermann@suse.de>
Hi Thomas,
On 15/07/2023 20:51, Thomas Zimmermann wrote:
> The flag FBINFO_FLAG_DEFAULT is 0 and has no effect, as struct
> fbinfo.flags has been allocated to zero by kzalloc(). So do not
> set it.
>
> Flags should signal differences from the default values. After cleaning
> up all occurrences of FBINFO_DEFAULT, the token will be removed.
>
> v2:
> * fix commit message (Miguel)
>
> Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
> Acked-by: Sam Ravnborg <sam@ravnborg.org>
> Cc: Andy Walls <awalls@md.metrocast.net>
> Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
> Cc: Hans Verkuil <hverkuil@xs4all.nl>
> ---
> drivers/media/pci/ivtv/ivtvfb.c | 1 -
> drivers/media/test-drivers/vivid/vivid-osd.c | 1 -
> 2 files changed, 2 deletions(-)
I can take this patch for 6.6, unless you prefer to have this whole series
merged in one go?
In that case you can use my:
Reviewed-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Regards,
Hans
>
> diff --git a/drivers/media/pci/ivtv/ivtvfb.c b/drivers/media/pci/ivtv/ivtvfb.c
> index 0aeb9daaee4c..23c8c094e791 100644
> --- a/drivers/media/pci/ivtv/ivtvfb.c
> +++ b/drivers/media/pci/ivtv/ivtvfb.c
> @@ -1048,7 +1048,6 @@ static int ivtvfb_init_vidmode(struct ivtv *itv)
> /* Generate valid fb_info */
>
> oi->ivtvfb_info.node = -1;
> - oi->ivtvfb_info.flags = FBINFO_FLAG_DEFAULT;
> oi->ivtvfb_info.par = itv;
> oi->ivtvfb_info.var = oi->ivtvfb_defined;
> oi->ivtvfb_info.fix = oi->ivtvfb_fix;
> diff --git a/drivers/media/test-drivers/vivid/vivid-osd.c b/drivers/media/test-drivers/vivid/vivid-osd.c
> index ec25edc679b3..051f1805a16d 100644
> --- a/drivers/media/test-drivers/vivid/vivid-osd.c
> +++ b/drivers/media/test-drivers/vivid/vivid-osd.c
> @@ -310,7 +310,6 @@ static int vivid_fb_init_vidmode(struct vivid_dev *dev)
> /* Generate valid fb_info */
>
> dev->fb_info.node = -1;
> - dev->fb_info.flags = FBINFO_FLAG_DEFAULT;
> dev->fb_info.par = dev;
> dev->fb_info.var = dev->fb_defined;
> dev->fb_info.fix = dev->fb_fix;
^ permalink raw reply
* Re: [PATCH] hv_netvsc: support a new host capability AllowRscDisabledStatus
From: Shradha Gupta @ 2023-07-18 10:18 UTC (permalink / raw)
To: Haiyang Zhang
Cc: linux-kernel@vger.kernel.org, linux-hyperv@vger.kernel.org,
netdev@vger.kernel.org, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
KY Srinivasan, Wei Liu, Dexuan Cui, Long Li,
Michael Kelley (LINUX), David S. Miller
In-Reply-To: <20230703043742.GA9533@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net>
On Sun, Jul 02, 2023 at 09:37:42PM -0700, Shradha Gupta wrote:
> On Thu, Jun 29, 2023 at 12:44:26PM +0000, Haiyang Zhang wrote:
> >
> >
> > > -----Original Message-----
> > > From: Shradha Gupta <shradhagupta@linux.microsoft.com>
> > > Sent: Thursday, June 29, 2023 5:59 AM
> > > To: linux-kernel@vger.kernel.org; linux-hyperv@vger.kernel.org;
> > > netdev@vger.kernel.org
> > > Cc: Shradha Gupta <shradhagupta@linux.microsoft.com>; Eric Dumazet
> > > <edumazet@google.com>; Jakub Kicinski <kuba@kernel.org>; Paolo Abeni
> > > <pabeni@redhat.com>; KY Srinivasan <kys@microsoft.com>; Haiyang Zhang
> > > <haiyangz@microsoft.com>; Wei Liu <wei.liu@kernel.org>; Dexuan Cui
> > > <decui@microsoft.com>; Long Li <longli@microsoft.com>; Michael Kelley
> > > (LINUX) <mikelley@microsoft.com>; David S. Miller <davem@davemloft.net>
> > > Subject: [PATCH] hv_netvsc: support a new host capability
> > > AllowRscDisabledStatus
> > >
> > > A future Azure host update has the potential to change RSC behavior
> > > > in the VMs. To avoid this invisible change, Vswitch will check the
> > > netvsc version of a VM before sending its RSC capabilities, and will
> > > always indicate that the host performs RSC if the VM doesn't have an
> > > updated netvsc driver regardless of the actual host RSC capabilities.
> > > Netvsc now advertises a new capability: AllowRscDisabledStatus
> > > The host will check for this capability before sending RSC status,
> > > and if a VM does not have this capability it will send RSC enabled
> > > status regardless of host RSC settings
> > >
> > > Signed-off-by: Shradha Gupta <shradhagupta@linux.microsoft.com>
> > > ---
> > > drivers/net/hyperv/hyperv_net.h | 3 +++
> > > drivers/net/hyperv/netvsc.c | 8 ++++++++
> > > 2 files changed, 11 insertions(+)
> > >
> > > diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
> > > index dd5919ec408b..218e0f31dd66 100644
> > > --- a/drivers/net/hyperv/hyperv_net.h
> > > +++ b/drivers/net/hyperv/hyperv_net.h
> > > @@ -572,6 +572,9 @@ struct nvsp_2_vsc_capability {
> > > u64 teaming:1;
> > > u64 vsubnetid:1;
> > > u64 rsc:1;
> > > + u64 timestamp:1;
> > > + u64 reliablecorrelationid:1;
> > > + u64 allowrscdisabledstatus:1;
> > > };
> > > };
> > > } __packed;
> > > diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
> > > index da737d959e81..2eb1e85ba940 100644
> > > --- a/drivers/net/hyperv/netvsc.c
> > > +++ b/drivers/net/hyperv/netvsc.c
> > > @@ -619,6 +619,14 @@ static int negotiate_nvsp_ver(struct hv_device
> > > *device,
> > > init_packet->msg.v2_msg.send_ndis_config.mtu = ndev->mtu +
> > > ETH_HLEN;
> > > init_packet->msg.v2_msg.send_ndis_config.capability.ieee8021q = 1;
> > >
> > > + /* Don't need a version check while setting this bit because if we
> > > + * have a New VM on an old host, the VM will set the bit but the host
> > > + * won't check it. If we have an old VM on a new host, the host will
> > > + * check the bit, see its zero, and it'll know the VM has an
> > > + * older NetVsc
> > > + */
> > > + init_packet-
> > > >msg.v2_msg.send_ndis_config.capability.allowrscdisabledstatus = 1;
> >
> > Have you tested on the new host to verify: Before this patch, the host shows
> > RSC status on, and after this patch the host shows it's off?
> I have completed the patch sanity tests. We are working on an upgraded host setup
> to test the rsc specific changes, will update with results soon.
> >
> > Thanks,
> > - Haiyang
Completed this testing, rsc status reflects properly with the patch.
^ permalink raw reply
* Re: [PATCH 1/9] x86/hyperv: Add sev-snp enlightened guest static key
From: Tianyu Lan @ 2023-07-18 5:52 UTC (permalink / raw)
To: Vitaly Kuznetsov
Cc: Tianyu Lan, linux-arch, linux-hyperv, linux-kernel, kys, haiyangz,
wei.liu, decui, tglx, mingo, bp, dave.hansen, x86, hpa,
daniel.lezcano, arnd, michael.h.kelley
In-Reply-To: <874jnmkt4p.fsf@redhat.com>
On 6/5/2023 8:09 PM, Vitaly Kuznetsov wrote:
> Tianyu Lan <ltykernel@gmail.com> writes:
>> int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages);
>> diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
>> index c7969e806c64..9186453251f7 100644
>> --- a/arch/x86/kernel/cpu/mshyperv.c
>> +++ b/arch/x86/kernel/cpu/mshyperv.c
>> @@ -402,8 +402,12 @@ static void __init ms_hyperv_init_platform(void)
>> pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
>> ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
>>
>> - if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP)
>> +
>> + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) {
>> + static_branch_enable(&isolation_type_en_snp);
>> + } else if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) {
>> static_branch_enable(&isolation_type_snp);
>
> Nitpick: In case 'isolation_type_snp' and 'isolation_type_en_snp' are
> mutually exclusive, I'd suggest we rename the former: it is quite
> un-intuitive that for an enlightened SNP guest '&isolation_type_snp' is
> NOT enabled. E.g. we can use
>
> 'isol_type_snp_paravisor'
> and
> 'isol_type_snp_enlightened'
>
> (I also don't like 'isolation_type_en_snp' name as 'en' normally stands
> for 'enabled')
>
Hi Vitaly:
I will rename these in the following patchset; this will not affect
SEV-SNP functionality.
^ permalink raw reply
* [PATCH V3 9/9] x86/hyperv: Add hyperv-specific handling for VMMCALL under SEV-ES
From: Tianyu Lan @ 2023-07-18 3:23 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, tglx, mingo, bp, dave.hansen, x86,
hpa, daniel.lezcano, arnd, michael.h.kelley
Cc: Tianyu Lan, linux-arch, linux-hyperv, linux-kernel, vkuznets,
Michael Kelley
In-Reply-To: <20230718032304.136888-1-ltykernel@gmail.com>
From: Tianyu Lan <tiala@microsoft.com>
Add Hyperv-specific handling for faults caused by VMMCALL
instructions.
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Signed-off-by: Tianyu Lan <tiala@microsoft.com>
---
arch/x86/kernel/cpu/mshyperv.c | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 8e1d9ed6a1e0..ba9a3a65f664 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -32,6 +32,7 @@
#include <asm/nmi.h>
#include <clocksource/hyperv_timer.h>
#include <asm/numa.h>
+#include <asm/svm.h>
/* Is Linux running as the root partition? */
bool hv_root_partition;
@@ -577,6 +578,20 @@ static bool __init ms_hyperv_msi_ext_dest_id(void)
return eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE;
}
+static void hv_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ /* RAX and CPL are already in the GHCB */
+ ghcb_set_rcx(ghcb, regs->cx);
+ ghcb_set_rdx(ghcb, regs->dx);
+ ghcb_set_r8(ghcb, regs->r8);
+}
+
+static bool hv_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ /* No checking of the return state needed */
+ return true;
+}
+
const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
.name = "Microsoft Hyper-V",
.detect = ms_hyperv_platform,
@@ -584,4 +599,6 @@ const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
.init.x2apic_available = ms_hyperv_x2apic_available,
.init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id,
.init.init_platform = ms_hyperv_init_platform,
+ .runtime.sev_es_hcall_prepare = hv_sev_es_hcall_prepare,
+ .runtime.sev_es_hcall_finish = hv_sev_es_hcall_finish,
};
--
2.25.1
^ permalink raw reply related
* [PATCH V3 8/9] x86/hyperv: Add smp support for SEV-SNP guest
From: Tianyu Lan @ 2023-07-18 3:23 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, tglx, mingo, bp, dave.hansen, x86,
hpa, daniel.lezcano, arnd, michael.h.kelley
Cc: Tianyu Lan, linux-arch, linux-hyperv, linux-kernel, vkuznets,
Michael Kelley
In-Reply-To: <20230718032304.136888-1-ltykernel@gmail.com>
From: Tianyu Lan <tiala@microsoft.com>
In an AMD SEV-SNP guest, an AP needs to be started up via the SEV-ES
save area, and Hyper-V requires the guest to call the HVCALL_START_VP
hypercall to pass the GPA of the SEV-ES save area together with the
AP's VP index and VTL (Virtual Trust Level) parameters. Override the
wakeup_secondary_cpu_64 callback with hv_snp_boot_ap.
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Signed-off-by: Tianyu Lan <tiala@microsoft.com>
---
arch/x86/hyperv/ivm.c | 95 +++++++++++++++++++++++++++++++
arch/x86/include/asm/mshyperv.h | 9 +++
arch/x86/kernel/cpu/mshyperv.c | 13 ++++-
include/asm-generic/hyperv-tlfs.h | 1 +
4 files changed, 116 insertions(+), 2 deletions(-)
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index ede47c8264e0..2eda4e69849d 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -23,11 +23,15 @@
#include <asm/sev.h>
#include <asm/realmode.h>
#include <asm/e820/api.h>
+#include <asm/desc.h>
#ifdef CONFIG_AMD_MEM_ENCRYPT
#define GHCB_USAGE_HYPERV_CALL 1
+static u8 ap_start_input_arg[PAGE_SIZE] __bss_decrypted __aligned(PAGE_SIZE);
+static u8 ap_start_stack[PAGE_SIZE] __aligned(PAGE_SIZE);
+
union hv_ghcb {
struct ghcb ghcb;
struct {
@@ -450,6 +454,97 @@ __init void hv_sev_init_mem_and_cpu(void)
}
}
+#define hv_populate_vmcb_seg(seg, gdtr_base) \
+do { \
+ if (seg.selector) { \
+ seg.base = 0; \
+ seg.limit = HV_AP_SEGMENT_LIMIT; \
+ seg.attrib = *(u16 *)(gdtr_base + seg.selector + 5); \
+ seg.attrib = (seg.attrib & 0xFF) | ((seg.attrib >> 4) & 0xF00); \
+ } \
+} while (0) \
+
+int hv_snp_boot_ap(int cpu, unsigned long start_ip)
+{
+ struct sev_es_save_area *vmsa = (struct sev_es_save_area *)
+ __get_free_page(GFP_KERNEL | __GFP_ZERO);
+ struct desc_ptr gdtr;
+ u64 ret, rmp_adjust, retry = 5;
+ struct hv_enable_vp_vtl *start_vp_input;
+ unsigned long flags;
+
+ native_store_gdt(&gdtr);
+
+ vmsa->gdtr.base = gdtr.address;
+ vmsa->gdtr.limit = gdtr.size;
+
+ asm volatile("movl %%es, %%eax;" : "=a" (vmsa->es.selector));
+ hv_populate_vmcb_seg(vmsa->es, vmsa->gdtr.base);
+
+ asm volatile("movl %%cs, %%eax;" : "=a" (vmsa->cs.selector));
+ hv_populate_vmcb_seg(vmsa->cs, vmsa->gdtr.base);
+
+ asm volatile("movl %%ss, %%eax;" : "=a" (vmsa->ss.selector));
+ hv_populate_vmcb_seg(vmsa->ss, vmsa->gdtr.base);
+
+ asm volatile("movl %%ds, %%eax;" : "=a" (vmsa->ds.selector));
+ hv_populate_vmcb_seg(vmsa->ds, vmsa->gdtr.base);
+
+ vmsa->efer = native_read_msr(MSR_EFER);
+
+ asm volatile("movq %%cr4, %%rax;" : "=a" (vmsa->cr4));
+ asm volatile("movq %%cr3, %%rax;" : "=a" (vmsa->cr3));
+ asm volatile("movq %%cr0, %%rax;" : "=a" (vmsa->cr0));
+
+ vmsa->xcr0 = 1;
+ vmsa->g_pat = HV_AP_INIT_GPAT_DEFAULT;
+ vmsa->rip = (u64)secondary_startup_64_no_verify;
+ vmsa->rsp = (u64)&ap_start_stack[PAGE_SIZE];
+
+ /*
+ * Set the SNP-specific fields for this VMSA:
+ * VMPL level
+ * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits)
+ */
+ vmsa->vmpl = 0;
+ vmsa->sev_features = sev_status >> 2;
+
+ /*
+ * Running at VMPL0 allows the kernel to change the VMSA bit for a page
+ * using the RMPADJUST instruction. However, for the instruction to
+ * succeed it must target the permissions of a lesser privileged
+ * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST
+ * instruction in the AMD64 APM Volume 3).
+ */
+ rmp_adjust = RMPADJUST_VMSA_PAGE_BIT | 1;
+ ret = rmpadjust((unsigned long)vmsa, RMP_PG_SIZE_4K,
+ rmp_adjust);
+ if (ret != 0) {
+ pr_err("RMPADJUST(%llx) failed: %llx\n", (u64)vmsa, ret);
+ return ret;
+ }
+
+ local_irq_save(flags);
+ start_vp_input =
+ (struct hv_enable_vp_vtl *)ap_start_input_arg;
+ memset(start_vp_input, 0, sizeof(*start_vp_input));
+ start_vp_input->partition_id = -1;
+ start_vp_input->vp_index = cpu;
+ start_vp_input->target_vtl.target_vtl = ms_hyperv.vtl;
+ *(u64 *)&start_vp_input->vp_context = __pa(vmsa) | 1;
+
+ do {
+ ret = hv_do_hypercall(HVCALL_START_VP,
+ start_vp_input, NULL);
+ } while (hv_result(ret) == HV_STATUS_TIME_OUT && retry--);
+
+ local_irq_restore(flags);
+
+ if (!hv_result_success(ret))
+ pr_err("HvCallStartVirtualProcessor failed: %llx\n", ret);
+ return ret;
+}
+
void __init hv_vtom_init(void)
{
/*
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index e57df590846a..c5a3c29fad01 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -65,6 +65,13 @@ struct memory_map_entry {
u32 reserved;
};
+/*
+ * DEFAULT INIT GPAT and SEGMENT LIMIT value in struct VMSA
+ * to start AP in enlightened SEV guest.
+ */
+#define HV_AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL
+#define HV_AP_SEGMENT_LIMIT 0xffffffff
+
int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages);
int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id);
int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags);
@@ -250,6 +257,7 @@ bool hv_ghcb_negotiate_protocol(void);
void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason);
void hv_vtom_init(void);
void hv_sev_init_mem_and_cpu(void);
+int hv_snp_boot_ap(int cpu, unsigned long start_ip);
#else
static inline void hv_ghcb_msr_write(u64 msr, u64 value) {}
static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {}
@@ -257,6 +265,7 @@ static inline bool hv_ghcb_negotiate_protocol(void) { return false; }
static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {}
static inline void hv_vtom_init(void) {}
static inline void hv_sev_init_mem_and_cpu(void) {}
+static int hv_snp_boot_ap(int cpu, unsigned long start_ip) {}
#endif
extern bool hv_isolation_type_snp(void);
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index d3bb921ee7fe..8e1d9ed6a1e0 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -295,6 +295,16 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus)
native_smp_prepare_cpus(max_cpus);
+ /*
+ * Override wakeup_secondary_cpu_64 callback for SEV-SNP
+ * enlightened guest.
+ */
+ if (hv_isolation_type_en_snp())
+ apic->wakeup_secondary_cpu_64 = hv_snp_boot_ap;
+
+ if (!hv_root_partition)
+ return;
+
#ifdef CONFIG_X86_64
for_each_present_cpu(i) {
if (i == 0)
@@ -502,8 +512,7 @@ static void __init ms_hyperv_init_platform(void)
# ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
- if (hv_root_partition)
- smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus;
+ smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus;
# endif
/*
diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h
index f4e4cc4f965f..fdac4a1714ec 100644
--- a/include/asm-generic/hyperv-tlfs.h
+++ b/include/asm-generic/hyperv-tlfs.h
@@ -223,6 +223,7 @@ enum HV_GENERIC_SET_FORMAT {
#define HV_STATUS_INVALID_PORT_ID 17
#define HV_STATUS_INVALID_CONNECTION_ID 18
#define HV_STATUS_INSUFFICIENT_BUFFERS 19
+#define HV_STATUS_TIME_OUT 120
#define HV_STATUS_VTL_ALREADY_ENABLED 134
/*
--
2.25.1
^ permalink raw reply related
* [PATCH V3 7/9] x86/hyperv: Initialize cpu and memory for SEV-SNP enlightened guest
From: Tianyu Lan @ 2023-07-18 3:23 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, tglx, mingo, bp, dave.hansen, x86,
hpa, daniel.lezcano, arnd, michael.h.kelley
Cc: Tianyu Lan, linux-arch, linux-hyperv, linux-kernel, vkuznets,
Michael Kelley
In-Reply-To: <20230718032304.136888-1-ltykernel@gmail.com>
From: Tianyu Lan <tiala@microsoft.com>
A Hyper-V enlightened guest doesn't have boot loader support.
The hypervisor boots the Linux kernel directly, with the data (kernel
image, initrd and parameter page) and the memory for boot-up
initialized via the AMD SEV PSP protocol (please refer to
Section 4.5, "Launching a Guest", of [1]).
The kernel needs to read processor and memory info from the
EN_SEV_SNP_PROCESSOR_INFO_ADDR and EN_SEV_SNP_MEM_INFO_ADDR
addresses, which are populated by Hyper-V. The data is prepared by
the hypervisor via SNP_LAUNCH_UPDATE with page type
SNP_PAGE_TYPE_UNMEASURED. Initialize the SMP CPU related ops,
validate system memory and add it to the e820 table.
[1]: https://www.amd.com/system/files/TechDocs/56860.pdf
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Signed-off-by: Tianyu Lan <tiala@microsoft.com>
---
Change since v2:
* Update change log.
---
arch/x86/hyperv/ivm.c | 93 +++++++++++++++++++++++++++++++++
arch/x86/include/asm/mshyperv.h | 17 ++++++
arch/x86/kernel/cpu/mshyperv.c | 3 ++
3 files changed, 113 insertions(+)
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index b2b5cb19fac9..ede47c8264e0 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -18,6 +18,11 @@
#include <asm/mshyperv.h>
#include <asm/hypervisor.h>
#include <asm/mtrr.h>
+#include <asm/coco.h>
+#include <asm/io_apic.h>
+#include <asm/sev.h>
+#include <asm/realmode.h>
+#include <asm/e820/api.h>
#ifdef CONFIG_AMD_MEM_ENCRYPT
@@ -58,6 +63,8 @@ union hv_ghcb {
static u16 hv_ghcb_version __ro_after_init;
+static u32 processor_count;
+
u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size)
{
union hv_ghcb *hv_ghcb;
@@ -357,6 +364,92 @@ static bool hv_is_private_mmio(u64 addr)
return false;
}
+static __init void hv_snp_get_smp_config(unsigned int early)
+{
+ /*
+ * The "early" parameter can be true only if old-style AMD
+ * Opteron NUMA detection is enabled, which should never be
+ * the case for an SEV-SNP guest. See CONFIG_AMD_NUMA.
+ * For safety, just do nothing if "early" is true.
+ */
+ if (early)
+ return;
+
+ /*
+ * There is no firmware and no ACPI MADT table support
+ * in the Hyper-V SEV-SNP enlightened guest. Set the SMP
+ * related config variables here.
+ */
+ while (num_processors < processor_count) {
+ early_per_cpu(x86_cpu_to_apicid, num_processors) = num_processors;
+ early_per_cpu(x86_bios_cpu_apicid, num_processors) = num_processors;
+ physid_set(num_processors, phys_cpu_present_map);
+ set_cpu_possible(num_processors, true);
+ set_cpu_present(num_processors, true);
+ num_processors++;
+ }
+}
+
+__init void hv_sev_init_mem_and_cpu(void)
+{
+ struct memory_map_entry *entry;
+ struct e820_entry *e820_entry;
+ u64 e820_end;
+ u64 ram_end;
+ u64 page;
+
+ /*
+ * The Hyper-V enlightened SNP guest boots the kernel
+ * directly without a bootloader, so ROMs, BIOS
+ * regions and reserved resources are not available.
+ * Disable these features or set their callbacks to no-ops.
+ */
+ x86_platform.legacy.rtc = 0;
+ x86_platform.legacy.reserve_bios_regions = 0;
+ x86_platform.set_wallclock = set_rtc_noop;
+ x86_platform.get_wallclock = get_rtc_noop;
+ x86_init.resources.probe_roms = x86_init_noop;
+ x86_init.resources.reserve_resources = x86_init_noop;
+ x86_init.mpparse.find_smp_config = x86_init_noop;
+ x86_init.mpparse.get_smp_config = hv_snp_get_smp_config;
+
+ /*
+ * Hyper-V SEV-SNP enlightened guest doesn't support ioapic
+ * and legacy APIC page read/write. Switch to hv apic here.
+ */
+ disable_ioapic_support();
+
+ /* Get processor and mem info. */
+ processor_count = *(u32 *)__va(EN_SEV_SNP_PROCESSOR_INFO_ADDR);
+ entry = (struct memory_map_entry *)__va(EN_SEV_SNP_MEM_INFO_ADDR);
+
+ /*
+ * There is no bootloader/EFI firmware in the SEV SNP guest.
+ * E820 table in the memory just describes memory for kernel,
+ * ACPI table, cmdline, boot params and ramdisk. The dynamic
+ * data(e.g, vcpu number and the rest memory layout) needs to
+ * be read from EN_SEV_SNP_PROCESSOR_INFO_ADDR.
+ */
+ for (; entry->numpages != 0; entry++) {
+ e820_entry = &e820_table->entries[
+ e820_table->nr_entries - 1];
+ e820_end = e820_entry->addr + e820_entry->size;
+ ram_end = (entry->starting_gpn +
+ entry->numpages) * PAGE_SIZE;
+
+ if (e820_end < entry->starting_gpn * PAGE_SIZE)
+ e820_end = entry->starting_gpn * PAGE_SIZE;
+
+ if (e820_end < ram_end) {
+ pr_info("Hyper-V: add e820 entry [mem %#018Lx-%#018Lx]\n", e820_end, ram_end - 1);
+ e820__range_add(e820_end, ram_end - e820_end,
+ E820_TYPE_RAM);
+ for (page = e820_end; page < ram_end; page += PAGE_SIZE)
+ pvalidate((unsigned long)__va(page), RMP_PG_SIZE_4K, true);
+ }
+ }
+}
+
void __init hv_vtom_init(void)
{
/*
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 025eda129d99..e57df590846a 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -50,6 +50,21 @@ extern bool hv_isolation_type_en_snp(void);
extern union hv_ghcb * __percpu *hv_ghcb_pg;
+/*
+ * Hyper-V puts processor and memory layout info
+ * to this address in SEV-SNP enlightened guest.
+ */
+#define EN_SEV_SNP_PROCESSOR_INFO_ADDR 0x802000
+#define EN_SEV_SNP_MEM_INFO_ADDR 0x802018
+
+struct memory_map_entry {
+ u64 starting_gpn;
+ u64 numpages;
+ u16 type;
+ u16 flags;
+ u32 reserved;
+};
+
int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages);
int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id);
int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags);
@@ -234,12 +249,14 @@ void hv_ghcb_msr_read(u64 msr, u64 *value);
bool hv_ghcb_negotiate_protocol(void);
void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason);
void hv_vtom_init(void);
+void hv_sev_init_mem_and_cpu(void);
#else
static inline void hv_ghcb_msr_write(u64 msr, u64 value) {}
static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {}
static inline bool hv_ghcb_negotiate_protocol(void) { return false; }
static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {}
static inline void hv_vtom_init(void) {}
+static inline void hv_sev_init_mem_and_cpu(void) {}
#endif
extern bool hv_isolation_type_snp(void);
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 5398fb2f4d39..d3bb921ee7fe 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -529,6 +529,9 @@ static void __init ms_hyperv_init_platform(void)
if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT))
mark_tsc_unstable("running on Hyper-V");
+ if (hv_isolation_type_en_snp())
+ hv_sev_init_mem_and_cpu();
+
hardlockup_detector_disable();
}
--
2.25.1
^ permalink raw reply related
* [PATCH V3 6/9] clocksource: hyper-v: Mark hyperv tsc page unencrypted in sev-snp enlightened guest
From: Tianyu Lan @ 2023-07-18 3:23 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, tglx, mingo, bp, dave.hansen, x86,
hpa, daniel.lezcano, arnd, michael.h.kelley
Cc: Tianyu Lan, linux-arch, linux-hyperv, linux-kernel, vkuznets,
Michael Kelley
In-Reply-To: <20230718032304.136888-1-ltykernel@gmail.com>
From: Tianyu Lan <tiala@microsoft.com>
The Hyper-V TSC page is shared with the hypervisor, so mark the page
unencrypted in the SEV-SNP enlightened guest when it is used.
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Signed-off-by: Tianyu Lan <tiala@microsoft.com>
---
drivers/clocksource/hyperv_timer.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c
index e56307a81f4d..8ff7cd4e20bb 100644
--- a/drivers/clocksource/hyperv_timer.c
+++ b/drivers/clocksource/hyperv_timer.c
@@ -390,7 +390,7 @@ static __always_inline u64 read_hv_clock_msr(void)
static union {
struct ms_hyperv_tsc_page page;
u8 reserved[PAGE_SIZE];
-} tsc_pg __aligned(PAGE_SIZE);
+} tsc_pg __bss_decrypted __aligned(PAGE_SIZE);
static struct ms_hyperv_tsc_page *tsc_page = &tsc_pg.page;
static unsigned long tsc_pfn;
--
2.25.1
^ permalink raw reply related
* [PATCH V3 5/9] x86/hyperv: Use vmmcall to implement Hyper-V hypercall in sev-snp enlightened guest
From: Tianyu Lan @ 2023-07-18 3:22 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, tglx, mingo, bp, dave.hansen, x86,
hpa, daniel.lezcano, arnd, michael.h.kelley
Cc: Tianyu Lan, linux-arch, linux-hyperv, linux-kernel, vkuznets
In-Reply-To: <20230718032304.136888-1-ltykernel@gmail.com>
From: Tianyu Lan <tiala@microsoft.com>
In an SEV-SNP enlightened guest, a Hyper-V hypercall needs
to use VMMCALL to trigger a VM exit and notify the hypervisor
to handle the hypercall request.
Signed-off-by: Tianyu Lan <tiala@microsoft.com>
---
arch/x86/include/asm/mshyperv.h | 27 ++++++++++++++-------------
1 file changed, 14 insertions(+), 13 deletions(-)
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 2fa38e9f6207..025eda129d99 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -64,12 +64,12 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
if (!hv_hypercall_pg)
return U64_MAX;
- __asm__ __volatile__("mov %4, %%r8\n"
- CALL_NOSPEC
+ __asm__ __volatile__("mov %[output], %%r8\n"
+ ALTERNATIVE("vmmcall", CALL_NOSPEC, X86_FEATURE_SEV_ES)
: "=a" (hv_status), ASM_CALL_CONSTRAINT,
- "+c" (control), "+d" (input_address)
- : "r" (output_address),
- THUNK_TARGET(hv_hypercall_pg)
+ "+c" (control), "+d" (input_address)
+ : [output] "r" (output_address),
+ THUNK_TARGET(hv_hypercall_pg)
: "cc", "memory", "r8", "r9", "r10", "r11");
#else
u32 input_address_hi = upper_32_bits(input_address);
@@ -105,7 +105,8 @@ static inline u64 _hv_do_fast_hypercall8(u64 control, u64 input1)
#ifdef CONFIG_X86_64
{
- __asm__ __volatile__(CALL_NOSPEC
+ __asm__ __volatile__("mov %[thunk_target], %%r8\n"
+ ALTERNATIVE("vmmcall", CALL_NOSPEC, X86_FEATURE_SEV_ES)
: "=a" (hv_status), ASM_CALL_CONSTRAINT,
"+c" (control), "+d" (input1)
: THUNK_TARGET(hv_hypercall_pg)
@@ -150,13 +151,13 @@ static inline u64 _hv_do_fast_hypercall16(u64 control, u64 input1, u64 input2)
#ifdef CONFIG_X86_64
{
- __asm__ __volatile__("mov %4, %%r8\n"
- CALL_NOSPEC
- : "=a" (hv_status), ASM_CALL_CONSTRAINT,
- "+c" (control), "+d" (input1)
- : "r" (input2),
- THUNK_TARGET(hv_hypercall_pg)
- : "cc", "r8", "r9", "r10", "r11");
+ __asm__ __volatile__("mov %[output], %%r8\n"
+ ALTERNATIVE("vmmcall", CALL_NOSPEC, X86_FEATURE_SEV_ES)
+ : "=a" (hv_status), ASM_CALL_CONSTRAINT,
+ "+c" (control), "+d" (input1)
+ : [output] "r" (input2),
+ THUNK_TARGET(hv_hypercall_pg)
+ : "cc", "r8", "r9", "r10", "r11");
}
#else
{
--
2.25.1
^ permalink raw reply related
* [PATCH V3 4/9] drivers: hv: Mark percpu hvcall input arg page unencrypted in SEV-SNP enlightened guest
From: Tianyu Lan @ 2023-07-18 3:22 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, tglx, mingo, bp, dave.hansen, x86,
hpa, daniel.lezcano, arnd, michael.h.kelley
Cc: Tianyu Lan, linux-arch, linux-hyperv, linux-kernel, vkuznets,
Michael Kelley
In-Reply-To: <20230718032304.136888-1-ltykernel@gmail.com>
From: Tianyu Lan <tiala@microsoft.com>
The hypervisor needs to access the input arg, VMBus synic event and
message pages. Mark these pages unencrypted in the SEV-SNP
guest and free them only if they have been successfully marked
encrypted again.
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Signed-off-by: Tianyu Lan <tiala@microsoft.com>
---
drivers/hv/hv.c | 57 +++++++++++++++++++++++++++++++++++++++---
drivers/hv/hv_common.c | 13 ++++++++++
2 files changed, 67 insertions(+), 3 deletions(-)
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index de6708dbe0df..ec6e35a0d9bf 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -20,6 +20,7 @@
#include <linux/interrupt.h>
#include <clocksource/hyperv_timer.h>
#include <asm/mshyperv.h>
+#include <linux/set_memory.h>
#include "hyperv_vmbus.h"
/* The one and only */
@@ -78,7 +79,7 @@ int hv_post_message(union hv_connection_id connection_id,
int hv_synic_alloc(void)
{
- int cpu;
+ int cpu, ret = -ENOMEM;
struct hv_per_cpu_context *hv_cpu;
/*
@@ -123,26 +124,76 @@ int hv_synic_alloc(void)
goto err;
}
}
+
+ if (hv_isolation_type_en_snp()) {
+ ret = set_memory_decrypted((unsigned long)
+ hv_cpu->synic_message_page, 1);
+ if (ret) {
+ pr_err("Failed to decrypt SYNIC msg page: %d\n", ret);
+ hv_cpu->synic_message_page = NULL;
+
+ /*
+ * Free the event page here so that hv_synic_free()
+ * won't later try to re-encrypt it.
+ */
+ free_page((unsigned long)hv_cpu->synic_event_page);
+ hv_cpu->synic_event_page = NULL;
+ goto err;
+ }
+
+ ret = set_memory_decrypted((unsigned long)
+ hv_cpu->synic_event_page, 1);
+ if (ret) {
+ pr_err("Failed to decrypt SYNIC event page: %d\n", ret);
+ hv_cpu->synic_event_page = NULL;
+ goto err;
+ }
+
+ memset(hv_cpu->synic_message_page, 0, PAGE_SIZE);
+ memset(hv_cpu->synic_event_page, 0, PAGE_SIZE);
+ }
}
return 0;
+
err:
/*
* Any memory allocations that succeeded will be freed when
* the caller cleans up by calling hv_synic_free()
*/
- return -ENOMEM;
+ return ret;
}
void hv_synic_free(void)
{
- int cpu;
+ int cpu, ret;
for_each_present_cpu(cpu) {
struct hv_per_cpu_context *hv_cpu
= per_cpu_ptr(hv_context.cpu_context, cpu);
+ /* It's better to leak the page if the encryption fails. */
+ if (hv_isolation_type_en_snp()) {
+ if (hv_cpu->synic_message_page) {
+ ret = set_memory_encrypted((unsigned long)
+ hv_cpu->synic_message_page, 1);
+ if (ret) {
+ pr_err("Failed to encrypt SYNIC msg page: %d\n", ret);
+ hv_cpu->synic_message_page = NULL;
+ }
+ }
+
+ if (hv_cpu->synic_event_page) {
+ ret = set_memory_encrypted((unsigned long)
+ hv_cpu->synic_event_page, 1);
+ if (ret) {
+ pr_err("Failed to encrypt SYNIC event page: %d\n", ret);
+ hv_cpu->synic_event_page = NULL;
+ }
+ }
+ }
+
free_page((unsigned long)hv_cpu->synic_event_page);
free_page((unsigned long)hv_cpu->synic_message_page);
}
diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index 4b4aa53c34c2..2d43ba2bc925 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -24,6 +24,7 @@
#include <linux/kmsg_dump.h>
#include <linux/slab.h>
#include <linux/dma-map-ops.h>
+#include <linux/set_memory.h>
#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>
@@ -359,6 +360,7 @@ int hv_common_cpu_init(unsigned int cpu)
u64 msr_vp_index;
gfp_t flags;
int pgcount = hv_root_partition ? 2 : 1;
+ int ret;
/* hv_cpu_init() can be called with IRQs disabled from hv_resume() */
flags = irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL;
@@ -378,6 +380,17 @@ int hv_common_cpu_init(unsigned int cpu)
outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
*outputarg = (char *)(*inputarg) + HV_HYP_PAGE_SIZE;
}
+
+ if (hv_isolation_type_en_snp()) {
+ ret = set_memory_decrypted((unsigned long)*inputarg, pgcount);
+ if (ret) {
+ kfree(*inputarg);
+ *inputarg = NULL;
+ return ret;
+ }
+
+ memset(*inputarg, 0x00, pgcount * PAGE_SIZE);
+ }
}
msr_vp_index = hv_get_register(HV_REGISTER_VP_INDEX);
--
2.25.1
^ permalink raw reply related
* [PATCH V3 3/9] x86/hyperv: Mark Hyper-V vp assist page unencrypted in SEV-SNP enlightened guest
From: Tianyu Lan @ 2023-07-18 3:22 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, tglx, mingo, bp, dave.hansen, x86,
hpa, daniel.lezcano, arnd, michael.h.kelley
Cc: Tianyu Lan, linux-arch, linux-hyperv, linux-kernel, vkuznets,
Michael Kelley
In-Reply-To: <20230718032304.136888-1-ltykernel@gmail.com>
From: Tianyu Lan <tiala@microsoft.com>
The Hyper-V VP assist page needs to be shared between the SEV-SNP guest
and Hyper-V, so mark the page unencrypted in the SEV-SNP guest.
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Signed-off-by: Tianyu Lan <tiala@microsoft.com>
---
arch/x86/hyperv/hv_init.c | 16 +++++++++++++++-
1 file changed, 15 insertions(+), 1 deletion(-)
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 1ba367a9686e..b004370d3b01 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -18,6 +18,7 @@
#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>
#include <asm/idtentry.h>
+#include <asm/set_memory.h>
#include <linux/kexec.h>
#include <linux/version.h>
#include <linux/vmalloc.h>
@@ -106,8 +107,21 @@ static int hv_cpu_init(unsigned int cpu)
* in hv_cpu_die(), otherwise a CPU may not be stopped in the
* case of CPU offlining and the VM will hang.
*/
- if (!*hvp)
+ if (!*hvp) {
*hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);
+
+ /*
+ * Hyper-V should never specify a VM that is a Confidential
+ * VM and also running in the root partition. Root partition
+ * is blocked to run in Confidential VM. So only decrypt assist
+ * page in non-root partition here.
+ */
+ if (*hvp && hv_isolation_type_en_snp()) {
+ WARN_ON_ONCE(set_memory_decrypted((unsigned long)(*hvp), 1));
+ memset(*hvp, 0, PAGE_SIZE);
+ }
+ }
+
if (*hvp)
msr.pfn = vmalloc_to_pfn(*hvp);
--
2.25.1
^ permalink raw reply related
* [PATCH V3 2/9] x86/hyperv: Set Virtual Trust Level in VMBus init message
From: Tianyu Lan @ 2023-07-18 3:22 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, tglx, mingo, bp, dave.hansen, x86,
hpa, daniel.lezcano, arnd, michael.h.kelley
Cc: Tianyu Lan, linux-arch, linux-hyperv, linux-kernel, vkuznets
In-Reply-To: <20230718032304.136888-1-ltykernel@gmail.com>
From: Tianyu Lan <tiala@microsoft.com>
SEV-SNP guests on Hyper-V can run at multiple Virtual Trust
Levels (VTL). During boot, get the VTL at which we're running
using the HVCALL_GET_VP_REGISTERS hypercall, and save the value
for future use. Then during VMBus initialization, set the VTL
with the saved value as required in the VMBus init message.
Signed-off-by: Tianyu Lan <tiala@microsoft.com>
---
* Change since v2:
Update the change log.
---
arch/x86/hyperv/hv_init.c | 36 ++++++++++++++++++++++++++++++
arch/x86/include/asm/hyperv-tlfs.h | 7 ++++++
drivers/hv/connection.c | 1 +
include/asm-generic/mshyperv.h | 1 +
include/linux/hyperv.h | 4 ++--
5 files changed, 47 insertions(+), 2 deletions(-)
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 6c04b52f139b..1ba367a9686e 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -378,6 +378,40 @@ static void __init hv_get_partition_id(void)
local_irq_restore(flags);
}
+static u8 __init get_vtl(void)
+{
+ u64 control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_REGISTERS;
+ struct hv_get_vp_registers_input *input;
+ struct hv_get_vp_registers_output *output;
+ u64 vtl = 0;
+ u64 ret;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = (struct hv_get_vp_registers_output *)input;
+ if (!input) {
+ local_irq_restore(flags);
+ goto done;
+ }
+
+ memset(input, 0, struct_size(input, element, 1));
+ input->header.partitionid = HV_PARTITION_ID_SELF;
+ input->header.vpindex = HV_VP_INDEX_SELF;
+ input->header.inputvtl = 0;
+ input->element[0].name0 = HV_X64_REGISTER_VSM_VP_STATUS;
+
+ ret = hv_do_hypercall(control, input, output);
+ if (hv_result_success(ret))
+ vtl = output->as64.low & HV_X64_VTL_MASK;
+ else
+ pr_err("Hyper-V: failed to get VTL! %lld", ret);
+ local_irq_restore(flags);
+
+done:
+ return vtl;
+}
+
/*
* This function is to be invoked early in the boot sequence after the
* hypervisor has been detected.
@@ -506,6 +540,8 @@ void __init hyperv_init(void)
/* Query the VMs extended capability once, so that it can be cached. */
hv_query_ext_cap(0);
+ /* Find the VTL */
+ ms_hyperv.vtl = get_vtl();
return;
clean_guest_os_id:
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index cea95dcd27c2..4bf0b315b0ce 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -301,6 +301,13 @@ enum hv_isolation_type {
#define HV_X64_MSR_TIME_REF_COUNT HV_REGISTER_TIME_REF_COUNT
#define HV_X64_MSR_REFERENCE_TSC HV_REGISTER_REFERENCE_TSC
+/*
+ * These registers are accessible only via the HVCALL_GET_VP_REGISTERS
+ * hvcall; there is no associated MSR address.
+ */
+#define HV_X64_REGISTER_VSM_VP_STATUS 0x000D0003
+#define HV_X64_VTL_MASK GENMASK(3, 0)
+
/* Hyper-V memory host visibility */
enum hv_mem_host_visibility {
VMBUS_PAGE_NOT_VISIBLE = 0,
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 5978e9dbc286..02b54f85dc60 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -98,6 +98,7 @@ int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version)
*/
if (version >= VERSION_WIN10_V5) {
msg->msg_sint = VMBUS_MESSAGE_SINT;
+ msg->msg_vtl = ms_hyperv.vtl;
vmbus_connection.msg_conn_id = VMBUS_MESSAGE_CONNECTION_ID_4;
} else {
msg->interrupt_page = virt_to_phys(vmbus_connection.int_page);
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index 6b5c41f90398..f73a044ecaa7 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -54,6 +54,7 @@ struct ms_hyperv_info {
};
};
u64 shared_gpa_boundary;
+ u8 vtl;
};
extern struct ms_hyperv_info ms_hyperv;
extern bool hv_nested;
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index bfbc37ce223b..1f2bfec4abde 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -665,8 +665,8 @@ struct vmbus_channel_initiate_contact {
u64 interrupt_page;
struct {
u8 msg_sint;
- u8 padding1[3];
- u32 padding2;
+ u8 msg_vtl;
+ u8 reserved[6];
};
};
u64 monitor_page1;
--
2.25.1
^ permalink raw reply related
* [PATCH V3 1/9] x86/hyperv: Add sev-snp enlightened guest static key
From: Tianyu Lan @ 2023-07-18 3:22 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, tglx, mingo, bp, dave.hansen, x86,
hpa, daniel.lezcano, arnd, michael.h.kelley
Cc: Tianyu Lan, linux-arch, linux-hyperv, linux-kernel, vkuznets
In-Reply-To: <20230718032304.136888-1-ltykernel@gmail.com>
From: Tianyu Lan <tiala@microsoft.com>
Introduce the static key isolation_type_en_snp to check for an
enlightened SEV-SNP guest.
Signed-off-by: Tianyu Lan <tiala@microsoft.com>
---
arch/x86/hyperv/ivm.c | 11 +++++++++++
arch/x86/include/asm/mshyperv.h | 3 +++
arch/x86/kernel/cpu/mshyperv.c | 9 +++++++--
drivers/hv/hv_common.c | 6 ++++++
include/asm-generic/mshyperv.h | 12 +++++++++---
5 files changed, 36 insertions(+), 5 deletions(-)
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index 14f46ad2ca64..b2b5cb19fac9 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -413,3 +413,14 @@ bool hv_isolation_type_snp(void)
{
return static_branch_unlikely(&isolation_type_snp);
}
+
+DEFINE_STATIC_KEY_FALSE(isolation_type_en_snp);
+/*
+ * hv_isolation_type_en_snp - Check system runs in the AMD SEV-SNP based
+ * isolation enlightened VM.
+ */
+bool hv_isolation_type_en_snp(void)
+{
+ return static_branch_unlikely(&isolation_type_en_snp);
+}
+
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 88d9ef98e087..2fa38e9f6207 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -26,6 +26,7 @@
union hv_ghcb;
DECLARE_STATIC_KEY_FALSE(isolation_type_snp);
+DECLARE_STATIC_KEY_FALSE(isolation_type_en_snp);
typedef int (*hyperv_fill_flush_list_func)(
struct hv_guest_mapping_flush_list *flush,
@@ -45,6 +46,8 @@ extern void *hv_hypercall_pg;
extern u64 hv_current_partition_id;
+extern bool hv_isolation_type_en_snp(void);
+
extern union hv_ghcb * __percpu *hv_ghcb_pg;
int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages);
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index c7969e806c64..5398fb2f4d39 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -402,8 +402,12 @@ static void __init ms_hyperv_init_platform(void)
pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
- if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP)
+
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) {
+ static_branch_enable(&isolation_type_en_snp);
+ } else if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) {
static_branch_enable(&isolation_type_snp);
+ }
}
if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) {
@@ -473,7 +477,8 @@ static void __init ms_hyperv_init_platform(void)
#if IS_ENABLED(CONFIG_HYPERV)
if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) ||
- (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP))
+ ((hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) &&
+ ms_hyperv.paravisor_present))
hv_vtom_init();
/*
* Setup the hook to get control post apic initialization.
diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index 542a1d53b303..4b4aa53c34c2 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -502,6 +502,12 @@ bool __weak hv_isolation_type_snp(void)
}
EXPORT_SYMBOL_GPL(hv_isolation_type_snp);
+bool __weak hv_isolation_type_en_snp(void)
+{
+ return false;
+}
+EXPORT_SYMBOL_GPL(hv_isolation_type_en_snp);
+
void __weak hv_setup_vmbus_handler(void (*handler)(void))
{
}
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index 402a8c1c202d..6b5c41f90398 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -36,15 +36,21 @@ struct ms_hyperv_info {
u32 nested_features;
u32 max_vp_index;
u32 max_lp_index;
- u32 isolation_config_a;
+ union {
+ u32 isolation_config_a;
+ struct {
+ u32 paravisor_present : 1;
+ u32 reserved_a1 : 31;
+ };
+ };
union {
u32 isolation_config_b;
struct {
u32 cvm_type : 4;
- u32 reserved1 : 1;
+ u32 reserved_b1 : 1;
u32 shared_gpa_boundary_active : 1;
u32 shared_gpa_boundary_bits : 6;
- u32 reserved2 : 20;
+ u32 reserved_b2 : 20;
};
};
u64 shared_gpa_boundary;
--
2.25.1
^ permalink raw reply related
* [PATCH V3 0/9] x86/hyperv: Add AMD sev-snp enlightened guest support on hyperv
From: Tianyu Lan @ 2023-07-18 3:22 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, tglx, mingo, bp, dave.hansen, x86,
hpa, daniel.lezcano, arnd, michael.h.kelley
Cc: Tianyu Lan, linux-arch, linux-hyperv, linux-kernel, vkuznets
From: Tianyu Lan <tiala@microsoft.com>
Hyper-V provides two modes for running SEV-SNP VMs:
1) In vTOM mode with a paravisor (see Section 15.36.8 of [1])
2) In "fully enlightened" mode with normal "C" bit control
over page encryption, and no paravisor
For #1, the paravisor runs in VMPL 0, while Linux runs in VMPL 2
(see Section 15.36.7 of [1]). The paravisor is typically provided
by Hyper-V and handles most of the SNP-related functionality. As
such, most of the SNP functionality in the Linux guest is bypassed.
The guest operates in vTOM mode, where encryption is enabled by default.
The guest must still request page transitions between private and shared,
but there is relatively less SNP machinery required in the guest. Support
for this mode of operation first went upstream in the 5.15 kernel.
For #2, this patch set provides the initial support. The existing
SEV-SNP machinery in the kernel is fully used, but Hyper-V specific
updates are required to properly share Hyper-V communication pages
between the guest and host and to start APs at boot time.
In either mode, Hyper-V requires that the guest implement the SEV-SNP
Restricted Interrupt Injection feature (see Section 15.36.16 of [1],
and Section 5 of [2]). Without this feature, the guest is subject to
attack by a compromised hypervisor that can inject any exception at
any time, such as injecting an interrupt while the guest has interrupts
disabled. In vTOM mode, Restricted Interrupt Injection is implemented
by the paravisor, so no Linux guest changes are required. But in fully
enlightened mode, the Linux guest must provide the implementation.
This patch set is derived from an earlier patch set that includes both
the Hyper-V specific changes and Restricted Interrupt Injection support.[3]
But it is now limited to only the Hyper-V specific changes. The Restricted
Interrupt Injection support will come later in a separate patch set.
[1] https://www.amd.com/system/files/TechDocs/24593.pdf
[2] https://www.amd.com/system/files/TechDocs/56421-guest-hypervisor-communication-block-standardization.pdf
[3] https://lore.kernel.org/lkml/20230515165917.1306922-1-ltykernel@gmail.com/
Changes since v2:
* Update change logs.
* Rework Hyper-V hypercall implementation.
Changes since v1:
* vTOM case uses the paravisor_present flag and the
HV_ISOLATION_TYPE_SNP type.
* Rework some patches' change logs.
* Fix some comments in the patches.
Tianyu Lan (9):
x86/hyperv: Add sev-snp enlightened guest static key
x86/hyperv: Set Virtual Trust Level in VMBus init message
x86/hyperv: Mark Hyper-V vp assist page unencrypted in SEV-SNP
enlightened guest
drivers: hv: Mark percpu hvcall input arg page unencrypted in SEV-SNP
enlightened guest
x86/hyperv: Use vmmcall to implement Hyper-V hypercall in sev-snp
enlightened guest
clocksource: hyper-v: Mark hyperv tsc page unencrypted in sev-snp
enlightened guest
x86/hyperv: Initialize cpu and memory for SEV-SNP enlightened guest
x86/hyperv: Add smp support for SEV-SNP guest
x86/hyperv: Add hyperv-specific handling for VMMCALL under SEV-ES
arch/x86/hyperv/hv_init.c | 52 +++++++-
arch/x86/hyperv/ivm.c | 199 +++++++++++++++++++++++++++++
arch/x86/include/asm/hyperv-tlfs.h | 7 +
arch/x86/include/asm/mshyperv.h | 56 ++++++--
arch/x86/kernel/cpu/mshyperv.c | 42 +++++-
drivers/clocksource/hyperv_timer.c | 2 +-
drivers/hv/connection.c | 1 +
drivers/hv/hv.c | 57 ++++++++-
drivers/hv/hv_common.c | 19 +++
include/asm-generic/hyperv-tlfs.h | 1 +
include/asm-generic/mshyperv.h | 13 +-
include/linux/hyperv.h | 4 +-
12 files changed, 426 insertions(+), 27 deletions(-)
--
2.25.1
^ permalink raw reply
* Re: RE: [PATCH net-next] net: mana: Add page pool for RX buffers
From: Zhu Yanjun @ 2023-07-17 23:59 UTC (permalink / raw)
To: Haiyang Zhang, Jesper Dangaard Brouer, Jakub Kicinski
Cc: brouer@redhat.com, linux-hyperv@vger.kernel.org,
netdev@vger.kernel.org, Dexuan Cui, KY Srinivasan, Paul Rosswurm,
olaf@aepfle.de, vkuznets@redhat.com, davem@davemloft.net,
wei.liu@kernel.org, edumazet@google.com, pabeni@redhat.com,
leon@kernel.org, Long Li, ssengar@linux.microsoft.com,
linux-rdma@vger.kernel.org, daniel@iogearbox.net,
john.fastabend@gmail.com, bpf@vger.kernel.org, ast@kernel.org,
Ajay Sharma, hawk@kernel.org, tglx@linutronix.de,
shradhagupta@linux.microsoft.com, linux-kernel@vger.kernel.org,
Ilias Apalodimas
In-Reply-To: <PH7PR21MB31166EF9DB2F453999D2E92ECA34A@PH7PR21MB3116.namprd21.prod.outlook.com>
在 2023/7/14 20:51, Haiyang Zhang 写道:
>
>
>> -----Original Message-----
>> From: Jesper Dangaard Brouer <jbrouer@redhat.com>
>> On 14/07/2023 05.53, Jakub Kicinski wrote:
>>> On Thu, 13 Jul 2023 14:48:45 +0000 Haiyang Zhang wrote:
>>>> Add page pool for RX buffers for faster buffer cycle and reduce CPU
>>>> usage.
>>>>
>>>> Get an extra ref count of a page after allocation, so after upper
>>>> layers put the page, it's still referenced by the pool. We can reuse
>>>> it as RX buffer without alloc a new page.
>>>
>>> Please use the real page_pool API from include/net/page_pool.h
>>> We've moved past every driver reinventing the wheel, sorry.
>>
>> +1
>>
>> Quoting[1]: Documentation/networking/page_pool.rst
>>
>> Basic use involves replacing alloc_pages() calls with the
>> page_pool_alloc_pages() call.
>> Drivers should use page_pool_dev_alloc_pages() replacing
>> dev_alloc_pages().
>
> Thank Jakub and Jesper for the reviews.
> I'm aware of the page_pool.rst doc, and actually tried it before this
> patch, but I got lower perf. If I understand correctly, we should call
> page_pool_release_page() before passing the SKB to napi_gro_receive().
>
If I understand this commit correctly, its purpose is to use a page pool
to get better performance.
IIRC, folios are meant as a memory optimization. According to the
performance results, using folios gives a performance gain of about 10%.
So I am not sure whether folios can be used in this commit to get better
performance.
That is my 2 cents.
Zhu Yanjun
> I found the page_pool_dev_alloc_pages() goes through the slow path,
> because the page_pool_release_page() let the page leave the pool.
>
> Do we have to call page_pool_release_page() before passing the SKB
> to napi_gro_receive()? Any better way to recycle the pages from the
> upper layer of non-XDP case?
>
> Thanks,
> - Haiyang
>
^ permalink raw reply
* [PATCH net-next v5 2/2] net: mana: Use the correct WQE count for ringing RQ doorbell
From: longli @ 2023-07-17 19:35 UTC (permalink / raw)
To: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Leon Romanovsky, Shradha Gupta, Ajay Sharma, Shachar Raindel,
Stephen Hemminger, linux-hyperv, netdev, linux-kernel
Cc: linux-rdma, Long Li
In-Reply-To: <1689622539-5334-1-git-send-email-longli@linuxonhyperv.com>
From: Long Li <longli@microsoft.com>
The hardware specification specifies that WQE_COUNT should be set to 0 for
the Receive Queue. Although the hardware doesn't currently enforce this
check, future releases may check this value.
Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)")
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Dexuan Cui <decui@microsoft.com>
Signed-off-by: Long Li <longli@microsoft.com>
---
Change log:
v4:
Split the original patch into two: one for batching doorbell, one for setting the correct wqe count
v5:
Drop Cc: stable and use net-next
drivers/net/ethernet/microsoft/mana/gdma_main.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 8f3f78b68592..3765d3389a9a 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -300,8 +300,11 @@ static void mana_gd_ring_doorbell(struct gdma_context *gc, u32 db_index,
void mana_gd_wq_ring_doorbell(struct gdma_context *gc, struct gdma_queue *queue)
{
+ /* Hardware Spec specifies that software client should set 0 for
+ * wqe_cnt for Receive Queues. This value is not used in Send Queues.
+ */
mana_gd_ring_doorbell(gc, queue->gdma_dev->doorbell, queue->type,
- queue->id, queue->head * GDMA_WQE_BU_SIZE, 1);
+ queue->id, queue->head * GDMA_WQE_BU_SIZE, 0);
}
void mana_gd_ring_cq(struct gdma_queue *cq, u8 arm_bit)
--
2.34.1
^ permalink raw reply related
* [PATCH net-next v5 1/2] net: mana: Batch ringing RX queue doorbell on receiving packets
From: longli @ 2023-07-17 19:35 UTC (permalink / raw)
To: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Leon Romanovsky, Shradha Gupta, Ajay Sharma, Shachar Raindel,
Stephen Hemminger, linux-hyperv, netdev, linux-kernel
Cc: linux-rdma, Long Li
In-Reply-To: <1689622539-5334-1-git-send-email-longli@linuxonhyperv.com>
From: Long Li <longli@microsoft.com>
It's inefficient to ring the doorbell page every time a WQE is posted to
the receive queue. Excessive MMIO writes result in the CPU spending more
time waiting on LOCK instructions (atomic operations), resulting in
poor scaling performance.
Move the code for ringing the doorbell page to after all WQEs have been
posted to the receive queue during a callback from napi_poll().
With this change, tests showed an improvement from 120G/s to 160G/s on a
200G physical link, with 16 or 32 hardware queues.
Tests showed no regression in network latency benchmarks on single
connection.
Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)")
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Dexuan Cui <decui@microsoft.com>
Signed-off-by: Long Li <longli@microsoft.com>
---
Change log:
v2:
Check for comp_read > 0 as it might be negative on completion error.
Set rq.wqe_cnt to 0 according to BNIC spec.
v3:
Add details in the commit on the reason of performance increase and test numbers.
Add details in the commit on why rq.wqe_cnt should be set to 0 according to hardware spec.
Add "Reviewed-by" from Haiyang and Dexuan.
v4:
Split the original patch into two: one for batching doorbell, one for setting the correct wqe count
v5:
drop Cc: stable and use net-next
drivers/net/ethernet/microsoft/mana/mana_en.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index cd4d5ceb9f2d..1d8abe63fcb8 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1383,8 +1383,8 @@ static void mana_post_pkt_rxq(struct mana_rxq *rxq)
recv_buf_oob = &rxq->rx_oobs[curr_index];
- err = mana_gd_post_and_ring(rxq->gdma_rq, &recv_buf_oob->wqe_req,
- &recv_buf_oob->wqe_inf);
+ err = mana_gd_post_work_request(rxq->gdma_rq, &recv_buf_oob->wqe_req,
+ &recv_buf_oob->wqe_inf);
if (WARN_ON_ONCE(err))
return;
@@ -1654,6 +1654,12 @@ static void mana_poll_rx_cq(struct mana_cq *cq)
mana_process_rx_cqe(rxq, cq, &comp[i]);
}
+ if (comp_read > 0) {
+ struct gdma_context *gc = rxq->gdma_rq->gdma_dev->gdma_context;
+
+ mana_gd_wq_ring_doorbell(gc, rxq->gdma_rq);
+ }
+
if (rxq->xdp_flush)
xdp_do_flush();
}
--
2.34.1
^ permalink raw reply related
* [PATCH net-next v5 0/2] net: mana: Fix doorbell access for receive queues
From: longli @ 2023-07-17 19:35 UTC (permalink / raw)
To: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Leon Romanovsky, Shradha Gupta, Ajay Sharma, Shachar Raindel,
Stephen Hemminger, linux-hyperv, netdev, linux-kernel
Cc: linux-rdma, Long Li
From: Long Li <longli@microsoft.com>
This patchset fixes the issues discovered during 200G physical link
tests. It fixes doorbell usage and WQE format for receive queues.
Long Li (2):
net: mana: Batch ringing RX queue doorbell on receiving packets
net: mana: Use the correct WQE count for ringing RQ doorbell
drivers/net/ethernet/microsoft/mana/gdma_main.c | 5 ++++-
drivers/net/ethernet/microsoft/mana/mana_en.c | 10 ++++++++--
2 files changed, 12 insertions(+), 3 deletions(-)
--
2.34.1
^ permalink raw reply
* RE: [PATCH net-next] net: mana: Add page pool for RX buffers
From: Haiyang Zhang @ 2023-07-17 18:26 UTC (permalink / raw)
To: Jesper Dangaard Brouer, Jakub Kicinski
Cc: brouer@redhat.com, linux-hyperv@vger.kernel.org,
netdev@vger.kernel.org, Dexuan Cui, KY Srinivasan, Paul Rosswurm,
olaf@aepfle.de, vkuznets@redhat.com, davem@davemloft.net,
wei.liu@kernel.org, edumazet@google.com, pabeni@redhat.com,
leon@kernel.org, Long Li, ssengar@linux.microsoft.com,
linux-rdma@vger.kernel.org, daniel@iogearbox.net,
john.fastabend@gmail.com, bpf@vger.kernel.org, ast@kernel.org,
Ajay Sharma, hawk@kernel.org, tglx@linutronix.de,
shradhagupta@linux.microsoft.com, linux-kernel@vger.kernel.org,
Ilias Apalodimas
In-Reply-To: <3b043a95-a4bc-bbaf-c8e0-240e8ddea62f@redhat.com>
> -----Original Message-----
> From: Jesper Dangaard Brouer <jbrouer@redhat.com>
> Sent: Friday, July 14, 2023 9:13 AM
> To: Haiyang Zhang <haiyangz@microsoft.com>; Jesper Dangaard Brouer
> <jbrouer@redhat.com>; Jakub Kicinski <kuba@kernel.org>
> Cc: brouer@redhat.com; linux-hyperv@vger.kernel.org; netdev@vger.kernel.org;
> Dexuan Cui <decui@microsoft.com>; KY Srinivasan <kys@microsoft.com>; Paul
> Rosswurm <paulros@microsoft.com>; olaf@aepfle.de; vkuznets@redhat.com;
> davem@davemloft.net; wei.liu@kernel.org; edumazet@google.com;
> pabeni@redhat.com; leon@kernel.org; Long Li <longli@microsoft.com>;
> ssengar@linux.microsoft.com; linux-rdma@vger.kernel.org;
> daniel@iogearbox.net; john.fastabend@gmail.com; bpf@vger.kernel.org;
> ast@kernel.org; Ajay Sharma <sharmaajay@microsoft.com>; hawk@kernel.org;
> tglx@linutronix.de; shradhagupta@linux.microsoft.com; linux-
> kernel@vger.kernel.org; Ilias Apalodimas <ilias.apalodimas@linaro.org>
> Subject: Re: [PATCH net-next] net: mana: Add page pool for RX buffers
>
> [You don't often get email from jbrouer@redhat.com. Learn why this is
> important at https://aka.ms/LearnAboutSenderIdentification ]
>
> On 14/07/2023 14.51, Haiyang Zhang wrote:
> >
> >
> >> -----Original Message-----
> >> From: Jesper Dangaard Brouer <jbrouer@redhat.com>
> >> On 14/07/2023 05.53, Jakub Kicinski wrote:
> >>> On Thu, 13 Jul 2023 14:48:45 +0000 Haiyang Zhang wrote:
> >>>> Add page pool for RX buffers for faster buffer cycle and reduce CPU
> >>>> usage.
> >>>>
> >>>> Get an extra ref count of a page after allocation, so after upper
> >>>> layers put the page, it's still referenced by the pool. We can reuse
> >>>> it as RX buffer without alloc a new page.
> >>>
> >>> Please use the real page_pool API from include/net/page_pool.h
> >>> We've moved past every driver reinventing the wheel, sorry.
> >>
> >> +1
> >>
> >> Quoting[1]: Documentation/networking/page_pool.rst
> >>
> >> Basic use involves replacing alloc_pages() calls with the
> >> page_pool_alloc_pages() call.
> >> Drivers should use page_pool_dev_alloc_pages() replacing
> >> dev_alloc_pages().
> >
> > Thank Jakub and Jesper for the reviews.
> > I'm aware of the page_pool.rst doc, and actually tried it before this
> > patch, but I got lower perf. If I understand correctly, we should call
> > page_pool_release_page() before passing the SKB to napi_gro_receive().
> >
> > I found the page_pool_dev_alloc_pages() goes through the slow path,
> > because the page_pool_release_page() let the page leave the pool.
> >
> > Do we have to call page_pool_release_page() before passing the SKB
> > to napi_gro_receive()? Any better way to recycle the pages from the
> > upper layer of non-XDP case?
> >
>
> Today SKB "upper layers" can recycle page_pool backed packet data/page.
>
> Just use skb_mark_for_recycle(skb), then you don't need
> page_pool_release_page().
Will do. Thanks a lot!
- Haiyang
^ permalink raw reply
* Re: [PATCH] HID: hyperv: avoid struct memcpy overrun warning
From: Arnd Bergmann @ 2023-07-17 10:44 UTC (permalink / raw)
To: Andy Shevchenko, Arnd Bergmann
Cc: Jiri Kosina, Benjamin Tissoires, K. Y. Srinivasan, Haiyang Zhang,
Wei Liu, Dexuan Cui, Paulo Miguel Almeida, Michael Kelley (LINUX),
Dawei Li, Yang Yingliang, Thomas Weißschuh, linux-hyperv,
linux-input, linux-kernel
In-Reply-To: <ZLULjoePQaF+nSk2@smile.fi.intel.com>
On Mon, Jul 17, 2023, at 11:36, Andy Shevchenko wrote:
> On Wed, Jul 05, 2023 at 04:02:24PM +0200, Arnd Bergmann wrote:
>> From: Arnd Bergmann <arnd@arndb.de>
>>
>> A previous patch addressed the fortified memcpy warning for most
>> builds, but I still see this one with gcc-9:
>
> JFYI: as of today I have run Linux Next with `make W=1 allmodconfig` on x86_64
> and it seems there are still tons of similar issues which break the build.
It's a bit more complex:
- yes, there are lots of warnings for memcpy() read overflow when you
build allmodconfig kernels with W=1. I have patches for all of these and
plan to submit them over time.
- This particular one is a memcpy /write/ overflow, i.e. the
destination type overflows with the length according to gcc's
understanding of the structures. This warning is enabled even
without W=1, though it may only show up in certain configurations
or compiler versions.
Arnd
^ permalink raw reply
* Re: [PATCH] HID: hyperv: avoid struct memcpy overrun warning
From: Andy Shevchenko @ 2023-07-17 9:36 UTC (permalink / raw)
To: Arnd Bergmann
Cc: Jiri Kosina, Benjamin Tissoires, Arnd Bergmann, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Paulo Miguel Almeida,
Michael Kelley, Dawei Li, Yang Yingliang, Thomas Weißschuh,
linux-hyperv, linux-input, linux-kernel
In-Reply-To: <20230705140242.844167-1-arnd@kernel.org>
On Wed, Jul 05, 2023 at 04:02:24PM +0200, Arnd Bergmann wrote:
> From: Arnd Bergmann <arnd@arndb.de>
>
> A previous patch addressed the fortified memcpy warning for most
> builds, but I still see this one with gcc-9:
JFYI: as of today I have run Linux Next with `make W=1 allmodconfig` on x86_64
and it seems there are still tons of similar issues which break the build.
--
With Best Regards,
Andy Shevchenko
^ permalink raw reply
* Re: [PATCH] vmbus_testing: fix wrong python syntax for integer value comparison
From: Ani Sinha @ 2023-07-17 7:43 UTC (permalink / raw)
To: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui
Cc: linux-hyperv, linux-kernel
In-Reply-To: <20230705134408.6302-1-anisinha@redhat.com>
> On 05-Jul-2023, at 7:14 PM, Ani Sinha <anisinha@redhat.com> wrote:
>
> It is incorrect in python to compare integer values using the "is" keyword.
> The "is" keyword in python is used to compare references to two objects,
> not their values. Newer version of python3 (version 3.8) throws a warning
> when such incorrect comparison is made. For value comparison, "==" should
> be used.
>
> Fix this in the code and suppress the following warning:
>
> /usr/sbin/vmbus_testing:167: SyntaxWarning: "is" with a literal. Did you mean "=="?
Ping …
>
> Signed-off-by: Ani Sinha <anisinha@redhat.com>
> ---
> tools/hv/vmbus_testing | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/tools/hv/vmbus_testing b/tools/hv/vmbus_testing
> index e7212903dd1d..4467979d8f69 100755
> --- a/tools/hv/vmbus_testing
> +++ b/tools/hv/vmbus_testing
> @@ -164,7 +164,7 @@ def recursive_file_lookup(path, file_map):
> def get_all_devices_test_status(file_map):
>
> for device in file_map:
> - if (get_test_state(locate_state(device, file_map)) is 1):
> + if (get_test_state(locate_state(device, file_map)) == 1):
> print("Testing = ON for: {}"
> .format(device.split("/")[5]))
> else:
> @@ -203,7 +203,7 @@ def write_test_files(path, value):
> def set_test_state(state_path, state_value, quiet):
>
> write_test_files(state_path, state_value)
> - if (get_test_state(state_path) is 1):
> + if (get_test_state(state_path) == 1):
> if (not quiet):
> print("Testing = ON for device: {}"
> .format(state_path.split("/")[5]))
> --
> 2.39.1
>
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox