* [PATCH net-next,v3, 5/6] net/mlx5: Add HV VHCA control agent
From: Haiyang Zhang @ 2019-08-21 0:23 UTC (permalink / raw)
To: sashal@kernel.org, davem@davemloft.net, saeedm@mellanox.com,
leon@kernel.org, eranbe@mellanox.com, lorenzo.pieralisi@arm.com,
bhelgaas@google.com, linux-pci@vger.kernel.org,
linux-hyperv@vger.kernel.org, netdev@vger.kernel.org
Cc: Haiyang Zhang, KY Srinivasan, Stephen Hemminger,
linux-kernel@vger.kernel.org
In-Reply-To: <1566346948-69497-1-git-send-email-haiyangz@microsoft.com>
From: Eran Ben Elisha <eranbe@mellanox.com>
The control agent is responsible for the control block (ID 0). It should
update the PF via this block about every capability change. In addition,
upon block 0 invalidate, it should activate all other supported agents
with data requests from the PF.
Upon agent create/destroy, the invalidate callback of the control agent
is called in order to update the PF driver about this change.
The control agent is an integral part of HV VHCA and will be created
and destroyed as part of the HV VHCA init/cleanup flow.
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
.../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c | 122 ++++++++++++++++++++-
.../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h | 1 +
2 files changed, 121 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
index 84d1d75..4047629 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
@@ -109,22 +109,131 @@ void mlx5_hv_vhca_invalidate(void *context, u64 block_mask)
queue_work(hv_vhca->work_queue, &work->invalidate_work);
}
+#define AGENT_MASK(type) (type ? BIT(type - 1) : 0 /* control */)
+
+static void mlx5_hv_vhca_agents_control(struct mlx5_hv_vhca *hv_vhca,
+ struct mlx5_hv_vhca_control_block *block)
+{
+ int i;
+
+ for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++) {
+ struct mlx5_hv_vhca_agent *agent = hv_vhca->agents[i];
+
+ if (!agent || !agent->control)
+ continue;
+
+ if (!(AGENT_MASK(agent->type) & block->control))
+ continue;
+
+ agent->control(agent, block);
+ }
+}
+
+static void mlx5_hv_vhca_capabilities(struct mlx5_hv_vhca *hv_vhca,
+ u32 *capabilities)
+{
+ int i;
+
+ for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++) {
+ struct mlx5_hv_vhca_agent *agent = hv_vhca->agents[i];
+
+ if (agent)
+ *capabilities |= AGENT_MASK(agent->type);
+ }
+}
+
+static void
+mlx5_hv_vhca_control_agent_invalidate(struct mlx5_hv_vhca_agent *agent,
+ u64 block_mask)
+{
+ struct mlx5_hv_vhca *hv_vhca = agent->hv_vhca;
+ struct mlx5_core_dev *dev = hv_vhca->dev;
+ struct mlx5_hv_vhca_control_block *block;
+ u32 capabilities = 0;
+ int err;
+
+ block = kzalloc(sizeof(*block), GFP_KERNEL);
+ if (!block)
+ return;
+
+ err = mlx5_hv_read_config(dev, block, sizeof(*block), 0);
+ if (err)
+ goto free_block;
+
+ mlx5_hv_vhca_capabilities(hv_vhca, &capabilities);
+
+ /* If there are no capabilities, send an empty block in return */
+ if (!capabilities) {
+ memset(block, 0, sizeof(*block));
+ goto write;
+ }
+
+ if (block->capabilities != capabilities)
+ block->capabilities = capabilities;
+
+ if (block->control & ~capabilities)
+ goto free_block;
+
+ mlx5_hv_vhca_agents_control(hv_vhca, block);
+ block->command_ack = block->command;
+
+write:
+ mlx5_hv_write_config(dev, block, sizeof(*block), 0);
+
+free_block:
+ kfree(block);
+}
+
+static struct mlx5_hv_vhca_agent *
+mlx5_hv_vhca_control_agent_create(struct mlx5_hv_vhca *hv_vhca)
+{
+ return mlx5_hv_vhca_agent_create(hv_vhca, MLX5_HV_VHCA_AGENT_CONTROL,
+ NULL,
+ mlx5_hv_vhca_control_agent_invalidate,
+ NULL, NULL);
+}
+
+static void mlx5_hv_vhca_control_agent_destroy(struct mlx5_hv_vhca_agent *agent)
+{
+ mlx5_hv_vhca_agent_destroy(agent);
+}
+
int mlx5_hv_vhca_init(struct mlx5_hv_vhca *hv_vhca)
{
+ struct mlx5_hv_vhca_agent *agent;
+ int err;
+
if (IS_ERR_OR_NULL(hv_vhca))
return IS_ERR_OR_NULL(hv_vhca);
- return mlx5_hv_register_invalidate(hv_vhca->dev, hv_vhca,
- mlx5_hv_vhca_invalidate);
+ err = mlx5_hv_register_invalidate(hv_vhca->dev, hv_vhca,
+ mlx5_hv_vhca_invalidate);
+ if (err)
+ return err;
+
+ agent = mlx5_hv_vhca_control_agent_create(hv_vhca);
+ if (IS_ERR_OR_NULL(agent)) {
+ mlx5_hv_unregister_invalidate(hv_vhca->dev);
+ return IS_ERR_OR_NULL(agent);
+ }
+
+ hv_vhca->agents[MLX5_HV_VHCA_AGENT_CONTROL] = agent;
+
+ return 0;
}
void mlx5_hv_vhca_cleanup(struct mlx5_hv_vhca *hv_vhca)
{
+ struct mlx5_hv_vhca_agent *agent;
int i;
if (IS_ERR_OR_NULL(hv_vhca))
return;
+ agent = hv_vhca->agents[MLX5_HV_VHCA_AGENT_CONTROL];
+ if (agent)
+ mlx5_hv_vhca_control_agent_destroy(agent);
+
mutex_lock(&hv_vhca->agents_lock);
for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++)
WARN_ON(hv_vhca->agents[i]);
@@ -134,6 +243,11 @@ void mlx5_hv_vhca_cleanup(struct mlx5_hv_vhca *hv_vhca)
mlx5_hv_unregister_invalidate(hv_vhca->dev);
}
+static void mlx5_hv_vhca_agents_update(struct mlx5_hv_vhca *hv_vhca)
+{
+ mlx5_hv_vhca_invalidate(hv_vhca, BIT(MLX5_HV_VHCA_AGENT_CONTROL));
+}
+
struct mlx5_hv_vhca_agent *
mlx5_hv_vhca_agent_create(struct mlx5_hv_vhca *hv_vhca,
enum mlx5_hv_vhca_agent_type type,
@@ -174,6 +288,8 @@ struct mlx5_hv_vhca_agent *
hv_vhca->agents[type] = agent;
mutex_unlock(&hv_vhca->agents_lock);
+ mlx5_hv_vhca_agents_update(hv_vhca);
+
return agent;
}
@@ -195,6 +311,8 @@ void mlx5_hv_vhca_agent_destroy(struct mlx5_hv_vhca_agent *agent)
agent->cleanup(agent);
kfree(agent);
+
+ mlx5_hv_vhca_agents_update(hv_vhca);
}
static int mlx5_hv_vhca_data_block_prepare(struct mlx5_hv_vhca_agent *agent,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
index cdf1303..984e7ad 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
@@ -12,6 +12,7 @@
struct mlx5_hv_vhca_control_block;
enum mlx5_hv_vhca_agent_type {
+ MLX5_HV_VHCA_AGENT_CONTROL = 0,
MLX5_HV_VHCA_AGENT_MAX = 32,
};
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next,v3, 1/6] PCI: hv: Add a paravirtual backchannel in software
From: Haiyang Zhang @ 2019-08-21 0:23 UTC (permalink / raw)
To: sashal@kernel.org, davem@davemloft.net, saeedm@mellanox.com,
leon@kernel.org, eranbe@mellanox.com, lorenzo.pieralisi@arm.com,
bhelgaas@google.com, linux-pci@vger.kernel.org,
linux-hyperv@vger.kernel.org, netdev@vger.kernel.org
Cc: Haiyang Zhang, KY Srinivasan, Stephen Hemminger,
linux-kernel@vger.kernel.org, Dexuan Cui, Jake Oshins
In-Reply-To: <1566346948-69497-1-git-send-email-haiyangz@microsoft.com>
From: Dexuan Cui <decui@microsoft.com>
Windows SR-IOV provides a backchannel mechanism in software for communication
between a VF driver and a PF driver. These "configuration blocks" are
similar in concept to PCI configuration space, but instead of doing reads and
writes in 32-bit chunks through a very slow path, packets of up to 128 bytes
can be sent or received asynchronously.
Nearly every SR-IOV device contains just such a communications channel in
hardware, so using this one in software is usually optional. Using the
software channel, however, allows driver implementers to leverage software
tools that fuzz the communications channel looking for vulnerabilities.
The usage model for these packets puts the responsibility for reading or
writing on the VF driver. The VF driver sends a read or a write packet,
indicating which "block" is being referred to by number.
If the PF driver wishes to initiate communication, it can "invalidate" one or
more of the first 64 blocks. This invalidation is delivered by this driver
via a callback supplied by the VF driver.
No protocol is implied, except that supplied by the PF and VF drivers.
Signed-off-by: Jake Oshins <jakeo@microsoft.com>
Signed-off-by: Dexuan Cui <decui@microsoft.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: K. Y. Srinivasan <kys@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
---
drivers/pci/controller/pci-hyperv.c | 302 ++++++++++++++++++++++++++++++++++++
include/linux/hyperv.h | 15 ++
2 files changed, 317 insertions(+)
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 40b6254..57adeca 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -365,6 +365,39 @@ struct pci_delete_interrupt {
struct tran_int_desc int_desc;
} __packed;
+/*
+ * Note: the VM must pass a valid block id, wslot and bytes_requested.
+ */
+struct pci_read_block {
+ struct pci_message message_type;
+ u32 block_id;
+ union win_slot_encoding wslot;
+ u32 bytes_requested;
+} __packed;
+
+struct pci_read_block_response {
+ struct vmpacket_descriptor hdr;
+ u32 status;
+ u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
+} __packed;
+
+/*
+ * Note: the VM must pass a valid block id, wslot and byte_count.
+ */
+struct pci_write_block {
+ struct pci_message message_type;
+ u32 block_id;
+ union win_slot_encoding wslot;
+ u32 byte_count;
+ u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
+} __packed;
+
+struct pci_dev_inval_block {
+ struct pci_incoming_message incoming;
+ union win_slot_encoding wslot;
+ u64 block_mask;
+} __packed;
+
struct pci_dev_incoming {
struct pci_incoming_message incoming;
union win_slot_encoding wslot;
@@ -499,6 +532,9 @@ struct hv_pci_dev {
struct hv_pcibus_device *hbus;
struct work_struct wrk;
+ void (*block_invalidate)(void *context, u64 block_mask);
+ void *invalidate_context;
+
/*
* What would be observed if one wrote 0xFFFFFFFF to a BAR and then
* read it back, for each of the BAR offsets within config space.
@@ -817,6 +853,256 @@ static int hv_pcifront_write_config(struct pci_bus *bus, unsigned int devfn,
.write = hv_pcifront_write_config,
};
+/*
+ * Paravirtual backchannel
+ *
+ * Hyper-V SR-IOV provides a backchannel mechanism in software for
+ * communication between a VF driver and a PF driver. These
+ * "configuration blocks" are similar in concept to PCI configuration space,
+ * but instead of doing reads and writes in 32-bit chunks through a very slow
+ * path, packets of up to 128 bytes can be sent or received asynchronously.
+ *
+ * Nearly every SR-IOV device contains just such a communications channel in
+ * hardware, so using this one in software is usually optional. Using the
+ * software channel, however, allows driver implementers to leverage software
+ * tools that fuzz the communications channel looking for vulnerabilities.
+ *
+ * The usage model for these packets puts the responsibility for reading or
+ * writing on the VF driver. The VF driver sends a read or a write packet,
+ * indicating which "block" is being referred to by number.
+ *
+ * If the PF driver wishes to initiate communication, it can "invalidate" one or
+ * more of the first 64 blocks. This invalidation is delivered by this driver
+ * via a callback supplied by the VF driver.
+ *
+ * No protocol is implied, except that supplied by the PF and VF drivers.
+ */
+
+struct hv_read_config_compl {
+ struct hv_pci_compl comp_pkt;
+ void *buf;
+ unsigned int len;
+ unsigned int bytes_returned;
+};
+
+/**
+ * hv_pci_read_config_compl() - Invoked when a response packet
+ * for a read config block operation arrives.
+ * @context: Identifies the read config operation
+ * @resp: The response packet itself
+ * @resp_packet_size: Size in bytes of the response packet
+ */
+static void hv_pci_read_config_compl(void *context, struct pci_response *resp,
+ int resp_packet_size)
+{
+ struct hv_read_config_compl *comp = context;
+ struct pci_read_block_response *read_resp =
+ (struct pci_read_block_response *)resp;
+ unsigned int data_len, hdr_len;
+
+ hdr_len = offsetof(struct pci_read_block_response, bytes);
+ if (resp_packet_size < hdr_len) {
+ comp->comp_pkt.completion_status = -1;
+ goto out;
+ }
+
+ data_len = resp_packet_size - hdr_len;
+ if (data_len > 0 && read_resp->status == 0) {
+ comp->bytes_returned = min(comp->len, data_len);
+ memcpy(comp->buf, read_resp->bytes, comp->bytes_returned);
+ } else {
+ comp->bytes_returned = 0;
+ }
+
+ comp->comp_pkt.completion_status = read_resp->status;
+out:
+ complete(&comp->comp_pkt.host_event);
+}
+
+/**
+ * hv_read_config_block() - Sends a read config block request to
+ * the back-end driver running in the Hyper-V parent partition.
+ * @pdev: The PCI driver's representation for this device.
+ * @buf: Buffer into which the config block will be copied.
+ * @len: Size in bytes of buf.
+ * @block_id: Identifies the config block which has been requested.
+ * @bytes_returned: Size which came back from the back-end driver.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int hv_read_config_block(struct pci_dev *pdev, void *buf, unsigned int len,
+ unsigned int block_id, unsigned int *bytes_returned)
+{
+ struct hv_pcibus_device *hbus =
+ container_of(pdev->bus->sysdata, struct hv_pcibus_device,
+ sysdata);
+ struct {
+ struct pci_packet pkt;
+ char buf[sizeof(struct pci_read_block)];
+ } pkt;
+ struct hv_read_config_compl comp_pkt;
+ struct pci_read_block *read_blk;
+ int ret;
+
+ if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX)
+ return -EINVAL;
+
+ init_completion(&comp_pkt.comp_pkt.host_event);
+ comp_pkt.buf = buf;
+ comp_pkt.len = len;
+
+ memset(&pkt, 0, sizeof(pkt));
+ pkt.pkt.completion_func = hv_pci_read_config_compl;
+ pkt.pkt.compl_ctxt = &comp_pkt;
+ read_blk = (struct pci_read_block *)&pkt.pkt.message;
+ read_blk->message_type.type = PCI_READ_BLOCK;
+ read_blk->wslot.slot = devfn_to_wslot(pdev->devfn);
+ read_blk->block_id = block_id;
+ read_blk->bytes_requested = len;
+
+ ret = vmbus_sendpacket(hbus->hdev->channel, read_blk,
+ sizeof(*read_blk), (unsigned long)&pkt.pkt,
+ VM_PKT_DATA_INBAND,
+ VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
+ if (ret)
+ return ret;
+
+ ret = wait_for_response(hbus->hdev, &comp_pkt.comp_pkt.host_event);
+ if (ret)
+ return ret;
+
+ if (comp_pkt.comp_pkt.completion_status != 0 ||
+ comp_pkt.bytes_returned == 0) {
+ dev_err(&hbus->hdev->device,
+ "Read Config Block failed: 0x%x, bytes_returned=%d\n",
+ comp_pkt.comp_pkt.completion_status,
+ comp_pkt.bytes_returned);
+ return -EIO;
+ }
+
+ *bytes_returned = comp_pkt.bytes_returned;
+ return 0;
+}
+EXPORT_SYMBOL(hv_read_config_block);
+
+/**
+ * hv_pci_write_config_compl() - Invoked when a response packet for a write
+ * config block operation arrives.
+ * @context: Identifies the write config operation
+ * @resp: The response packet itself
+ * @resp_packet_size: Size in bytes of the response packet
+ */
+static void hv_pci_write_config_compl(void *context, struct pci_response *resp,
+ int resp_packet_size)
+{
+ struct hv_pci_compl *comp_pkt = context;
+
+ comp_pkt->completion_status = resp->status;
+ complete(&comp_pkt->host_event);
+}
+
+/**
+ * hv_write_config_block() - Sends a write config block request to the
+ * back-end driver running in the Hyper-V parent partition.
+ * @pdev: The PCI driver's representation for this device.
+ * @buf: Buffer from which the config block will be copied.
+ * @len: Size in bytes of buf.
+ * @block_id: Identifies the config block which is being written.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int hv_write_config_block(struct pci_dev *pdev, void *buf, unsigned int len,
+ unsigned int block_id)
+{
+ struct hv_pcibus_device *hbus =
+ container_of(pdev->bus->sysdata, struct hv_pcibus_device,
+ sysdata);
+ struct {
+ struct pci_packet pkt;
+ char buf[sizeof(struct pci_write_block)];
+ u32 reserved;
+ } pkt;
+ struct hv_pci_compl comp_pkt;
+ struct pci_write_block *write_blk;
+ u32 pkt_size;
+ int ret;
+
+ if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX)
+ return -EINVAL;
+
+ init_completion(&comp_pkt.host_event);
+
+ memset(&pkt, 0, sizeof(pkt));
+ pkt.pkt.completion_func = hv_pci_write_config_compl;
+ pkt.pkt.compl_ctxt = &comp_pkt;
+ write_blk = (struct pci_write_block *)&pkt.pkt.message;
+ write_blk->message_type.type = PCI_WRITE_BLOCK;
+ write_blk->wslot.slot = devfn_to_wslot(pdev->devfn);
+ write_blk->block_id = block_id;
+ write_blk->byte_count = len;
+ memcpy(write_blk->bytes, buf, len);
+ pkt_size = offsetof(struct pci_write_block, bytes) + len;
+ /*
+ * This quirk is required on some hosts shipped around 2018, because
+ * these hosts don't check the pkt_size correctly (new hosts have been
+ * fixed since early 2019). The quirk is also safe on very old hosts
+ * and new hosts, because, on them, what really matters is the length
+ * specified in write_blk->byte_count.
+ */
+ pkt_size += sizeof(pkt.reserved);
+
+ ret = vmbus_sendpacket(hbus->hdev->channel, write_blk, pkt_size,
+ (unsigned long)&pkt.pkt, VM_PKT_DATA_INBAND,
+ VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
+ if (ret)
+ return ret;
+
+ ret = wait_for_response(hbus->hdev, &comp_pkt.host_event);
+ if (ret)
+ return ret;
+
+ if (comp_pkt.completion_status != 0) {
+ dev_err(&hbus->hdev->device,
+ "Write Config Block failed: 0x%x\n",
+ comp_pkt.completion_status);
+ return -EIO;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(hv_write_config_block);
+
+/**
+ * hv_register_block_invalidate() - Registers the callback to invoke when a
+ * config block invalidation arrives from the back-end driver.
+ * @pdev: The PCI driver's representation for this device.
+ * @context: Identifies the device; passed back to @block_invalidate.
+ * @block_invalidate: Callback which receives the mask of invalidated blocks.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int hv_register_block_invalidate(struct pci_dev *pdev, void *context,
+ void (*block_invalidate)(void *context,
+ u64 block_mask))
+{
+ struct hv_pcibus_device *hbus =
+ container_of(pdev->bus->sysdata, struct hv_pcibus_device,
+ sysdata);
+ struct hv_pci_dev *hpdev;
+
+ hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
+ if (!hpdev)
+ return -ENODEV;
+
+ hpdev->block_invalidate = block_invalidate;
+ hpdev->invalidate_context = context;
+
+ put_pcichild(hpdev);
+ return 0;
+
+}
+EXPORT_SYMBOL(hv_register_block_invalidate);
+
/* Interrupt management hooks */
static void hv_int_desc_free(struct hv_pci_dev *hpdev,
struct tran_int_desc *int_desc)
@@ -1968,6 +2254,7 @@ static void hv_pci_onchannelcallback(void *context)
struct pci_response *response;
struct pci_incoming_message *new_message;
struct pci_bus_relations *bus_rel;
+ struct pci_dev_inval_block *inval;
struct pci_dev_incoming *dev_message;
struct hv_pci_dev *hpdev;
@@ -2045,6 +2332,21 @@ static void hv_pci_onchannelcallback(void *context)
}
break;
+ case PCI_INVALIDATE_BLOCK:
+
+ inval = (struct pci_dev_inval_block *)buffer;
+ hpdev = get_pcichild_wslot(hbus,
+ inval->wslot.slot);
+ if (hpdev) {
+ if (hpdev->block_invalidate) {
+ hpdev->block_invalidate(
+ hpdev->invalidate_context,
+ inval->block_mask);
+ }
+ put_pcichild(hpdev);
+ }
+ break;
+
default:
dev_warn(&hbus->hdev->device,
"Unimplemented protocol message %x\n",
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 6256cc3..9d37f8c 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1578,4 +1578,19 @@ struct vmpacket_descriptor *
for (pkt = hv_pkt_iter_first(channel); pkt; \
pkt = hv_pkt_iter_next(channel, pkt))
+/*
+ * Functions for passing data between SR-IOV PF and VF drivers. The VF driver
+ * sends requests to read and write blocks. Each block must be 128 bytes or
+ * smaller. Optionally, the VF driver can register a callback function which
+ * will be invoked when the host says that one or more of the first 64 block
+ * IDs is "invalid" which means that the VF driver should reread them.
+ */
+#define HV_CONFIG_BLOCK_SIZE_MAX 128
+int hv_read_config_block(struct pci_dev *dev, void *buf, unsigned int buf_len,
+ unsigned int block_id, unsigned int *bytes_returned);
+int hv_write_config_block(struct pci_dev *dev, void *buf, unsigned int len,
+ unsigned int block_id);
+int hv_register_block_invalidate(struct pci_dev *dev, void *context,
+ void (*block_invalidate)(void *context,
+ u64 block_mask));
#endif /* _HYPERV_H */
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next,v3, 0/6] Add software backchannel and mlx5e HV VHCA stats
From: Haiyang Zhang @ 2019-08-21 0:23 UTC (permalink / raw)
To: sashal@kernel.org, davem@davemloft.net, saeedm@mellanox.com,
leon@kernel.org, eranbe@mellanox.com, lorenzo.pieralisi@arm.com,
bhelgaas@google.com, linux-pci@vger.kernel.org,
linux-hyperv@vger.kernel.org, netdev@vger.kernel.org
Cc: Haiyang Zhang, KY Srinivasan, Stephen Hemminger,
linux-kernel@vger.kernel.org
This patch set adds a paravirtual software backchannel to pci_hyperv,
which is required by the mlx5e driver HV VHCA stats agent.
The stats agent is responsible for running a periodic rx/tx packets/bytes
stats update.
Dexuan Cui (1):
PCI: hv: Add a paravirtual backchannel in software
Eran Ben Elisha (4):
net/mlx5: Add wrappers for HyperV PCIe operations
net/mlx5: Add HV VHCA infrastructure
net/mlx5: Add HV VHCA control agent
net/mlx5e: Add mlx5e HV VHCA stats agent
Haiyang Zhang (1):
PCI: hv: Add a Hyper-V PCI interface driver for software backchannel
interface
MAINTAINERS | 1 +
drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 +
drivers/net/ethernet/mellanox/mlx5/core/en.h | 13 +
.../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c | 162 +++++++++
.../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h | 25 ++
drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 3 +
drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c | 64 ++++
drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h | 22 ++
.../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c | 371 +++++++++++++++++++++
.../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h | 104 ++++++
drivers/net/ethernet/mellanox/mlx5/core/main.c | 7 +
drivers/pci/Kconfig | 1 +
drivers/pci/controller/Kconfig | 7 +
drivers/pci/controller/Makefile | 1 +
drivers/pci/controller/pci-hyperv-intf.c | 67 ++++
drivers/pci/controller/pci-hyperv.c | 308 +++++++++++++++++
include/linux/hyperv.h | 29 ++
include/linux/mlx5/driver.h | 2 +
18 files changed, 1189 insertions(+)
create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h
create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h
create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
create mode 100644 drivers/pci/controller/pci-hyperv-intf.c
--
1.8.3.1
^ permalink raw reply
* Re: [PATCH v2 3/4] bpf: clarify when bpf_trace_printk discards lines
From: Peter Wu @ 2019-08-21 0:04 UTC (permalink / raw)
To: Alexei Starovoitov; +Cc: Alexei Starovoitov, Daniel Borkmann, netdev, bpf
In-Reply-To: <20190820232221.vzxemergvzy3bg4j@ast-mbp>
On Tue, Aug 20, 2019 at 04:22:23PM -0700, Alexei Starovoitov wrote:
> On Wed, Aug 21, 2019 at 12:08:59AM +0100, Peter Wu wrote:
> > I opened /sys/kernel/tracing/trace once and kept reading from it.
> > bpf_trace_printk somehow did not seem to work, no entries were appended
> > to that trace file. It turns out that tracing is disabled when that file
> > is open. Save the next person some time and document this.
> >
> > The trace file is described in Documentation/trace/ftrace.rst, however
> > the implication "tracing is disabled" did not immediate translate to
> > "bpf_trace_printk silently discards entries".
> >
> > Signed-off-by: Peter Wu <peter@lekensteyn.nl>
> > ---
> > include/uapi/linux/bpf.h | 2 ++
> > 1 file changed, 2 insertions(+)
> >
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 9ca333c3ce91..e4236e357ed9 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -575,6 +575,8 @@ union bpf_attr {
> > * limited to five).
> > *
> > * Each time the helper is called, it appends a line to the trace.
> > + * Lines are discarded while *\/sys/kernel/debug/tracing/trace* is
> > + * open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this.
>
> that's not quite correct.
> Having 'trace' file open doesn't discard lines.
> I think this type of comment in uapi header makes more confusion than helps.
Having the 'trace' file open for reading results in discarding lines. It
took me a while to figure that out. At first I was not even sure whether
my eBPF program was executed or not due to lack of entries in the
'trace' file.
I ended up setting a breakpoint and got this call stack:
- bpf_trace_printk
- ____bpf_trace_printk
- __trace_printk
- trace_vprintk
- trace_array_vprintk
- __trace_array_vprintk
- __trace_array_vprintk
- __trace_buffer_lock_reserve
- ring_buffer_lock_reserve
The function ends up skipping the event because record_disabled == 1:
if (unlikely(atomic_read(&buffer->record_disabled)))
goto out;
Why is that? Well, I guessed that ring_buffer_record_disable and
ring_buffer_record_enable would be related. Sure enough, the first one
was hit when the 'trace' file is opened for reading while the latter is
called when the file is closed.
The relevant code from kernel/trace/trace.c (__tracing_open), "snapshot"
is true when "trace" is opened, and "false" when "trace_pipe" is used:
/* stop the trace while dumping if we are not opening "snapshot" */
if (!iter->snapshot)
tracing_stop_tr(tr);
So I think this supports the claim that lines are discarded. Do you
think this is not the case?
--
Kind regards,
Peter Wu
https://lekensteyn.nl
^ permalink raw reply
* Re: [net-next v3 00/14][pull request] 100GbE Intel Wired LAN Driver Updates 2019-08-20
From: David Miller @ 2019-08-21 0:02 UTC (permalink / raw)
To: jeffrey.t.kirsher; +Cc: netdev, nhorman, sassmann
In-Reply-To: <20190820215048.14377-1-jeffrey.t.kirsher@intel.com>
From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Tue, 20 Aug 2019 14:50:34 -0700
> This series contains updates to ice driver only.
Pulled, thanks Jeff.
^ permalink raw reply
* Re: [PATCH net-next v2 6/9] net: macsec: hardware offloading infrastructure
From: Andrew Lunn @ 2019-08-21 0:01 UTC (permalink / raw)
To: Sabrina Dubroca
Cc: Antoine Tenart, Igor Russkikh, davem@davemloft.net,
f.fainelli@gmail.com, hkallweit1@gmail.com,
netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
thomas.petazzoni@bootlin.com, alexandre.belloni@bootlin.com,
allan.nielsen@microchip.com, camelia.groza@nxp.com,
Simon Edelhaus, Pavel Belous
In-Reply-To: <20190820144119.GA28714@bistromath.localdomain>
> If you look at IPsec offloading, the networking stack builds up the
> ESP header, and passes the unencrypted data down to the driver. I'm
> wondering if the same would be possible with MACsec offloading: the
> macsec virtual interface adds the header (and maybe a dummy ICV), and
> then the HW does the encryption. In case of HW that needs to add the
> sectag itself, the driver would first strip the headers that the stack
> created. On receive, the driver would recreate a sectag and the macsec
> interface would just skip all verification (decrypt, PN).
Hi Sabrina
I assume the software implementation cannot make use of TSO or GSO,
letting the hardware segment a big buffer up into Ethernet frames?
When using hardware MACsec, is it possible to enable these? By the
time the frames have reached the PHY, GSO has been done. So it sees a
stream of frames it needs to encode/decode.
But if you are suggesting the extra headers are added by the virtual
interface, I don't think GSO can be used? My guess would be, we get a
performance boost from using hardware MACsec, but there will also be
a performance boost if GSO can be enabled when it was disabled before.
Andrew
^ permalink raw reply
* Re: [PATCH 29/38] cls_flower: Convert handle_idr to XArray
From: David Miller @ 2019-08-20 23:58 UTC (permalink / raw)
To: willy; +Cc: netdev
In-Reply-To: <20190820223259.22348-30-willy@infradead.org>
From: Matthew Wilcox <willy@infradead.org>
Date: Tue, 20 Aug 2019 15:32:50 -0700
> - idr_replace(&head->handle_idr, fnew, fnew->handle);
> + xa_store(&head->filters, fnew->handle, fnew, 0);
Passing a gfp_t of zero? :-)
^ permalink raw reply
* Re: [PATCH 23/38] cls_api: Convert tcf_net to XArray
From: David Miller @ 2019-08-20 23:57 UTC (permalink / raw)
To: willy; +Cc: netdev
In-Reply-To: <20190820223259.22348-24-willy@infradead.org>
From: Matthew Wilcox <willy@infradead.org>
Date: Tue, 20 Aug 2019 15:32:44 -0700
> From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
>
> This module doesn't use the allocating functionality; convert it to a
> plain XArray instead of an allocating one. I've left struct tcf_net
> in place in case more objects are added to it in future, although
> it now only contains an XArray. We don't need to call xa_destroy()
> if the array is empty, so I've removed the contents of tcf_net_exit()
> -- if it can be called with entries still in place, then it shoud call
> xa_destroy() instead.
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
I don't know if the net exit can be invoked with entries still in place,
however if the tcf_net_exit() function is made empty it should be removed
along with the assignment to the per-netns ops.
^ permalink raw reply
* Re: [PATCH net-next 6/6] net: dsa: tag_8021q: Restore bridge pvid when enabling vlan_filtering
From: Florian Fainelli @ 2019-08-20 23:24 UTC (permalink / raw)
To: Vladimir Oltean, vivien.didelot, andrew, idosch, roopa, nikolay,
davem
Cc: netdev
In-Reply-To: <20190820000002.9776-7-olteanv@gmail.com>
On 8/19/19 5:00 PM, Vladimir Oltean wrote:
> The bridge core assumes that enabling/disabling vlan_filtering will
> translate into the simple toggling of a flag for switchdev drivers.
>
> That is clearly not the case for sja1105, which alters the VLAN table
> and the pvids in order to obtain port separation in standalone mode.
>
> So, since the bridge will not call any vlan operation through switchdev
> after enabling vlan_filtering, we need to ensure we're in a functional
> state ourselves.
>
> Hence read the pvid that the bridge is aware of, and program that into
> our ports.
>
> Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
OK, after reading how
drivers/net/dsa/sja1105/sja1105_main.c::sja1105_vlan_filtering makes use
of that functionality, that looks like the correct thing to do.
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
--
Florian
^ permalink raw reply
* Re: [PATCH v2 3/4] bpf: clarify when bpf_trace_printk discards lines
From: Alexei Starovoitov @ 2019-08-20 23:22 UTC (permalink / raw)
To: Peter Wu; +Cc: Alexei Starovoitov, Daniel Borkmann, netdev, bpf
In-Reply-To: <20190820230900.23445-4-peter@lekensteyn.nl>
On Wed, Aug 21, 2019 at 12:08:59AM +0100, Peter Wu wrote:
> I opened /sys/kernel/tracing/trace once and kept reading from it.
> bpf_trace_printk somehow did not seem to work, no entries were appended
> to that trace file. It turns out that tracing is disabled when that file
> is open. Save the next person some time and document this.
>
> The trace file is described in Documentation/trace/ftrace.rst, however
> the implication "tracing is disabled" did not immediate translate to
> "bpf_trace_printk silently discards entries".
>
> Signed-off-by: Peter Wu <peter@lekensteyn.nl>
> ---
> include/uapi/linux/bpf.h | 2 ++
> 1 file changed, 2 insertions(+)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 9ca333c3ce91..e4236e357ed9 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -575,6 +575,8 @@ union bpf_attr {
> * limited to five).
> *
> * Each time the helper is called, it appends a line to the trace.
> + * Lines are discarded while *\/sys/kernel/debug/tracing/trace* is
> + * open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this.
that's not quite correct.
Having 'trace' file open doesn't discard lines.
I think this type of comment in uapi header makes more confusion than helps.
^ permalink raw reply
* Re: [PATCH] bonding: force enable lacp port after link state recovery for 802.3ad
From: Jay Vosburgh @ 2019-08-20 23:14 UTC (permalink / raw)
To: zhangsha.zhang
Cc: vfalico, andy, davem, netdev, linux-kernel, yuehaibing, hunongda,
alex.chen
In-Reply-To: <20190820133822.2508-1-zhangsha.zhang@huawei.com>
<zhangsha.zhang@huawei.com> wrote:
>From: Sha Zhang <zhangsha.zhang@huawei.com>
>
>After the commit 334031219a84 ("bonding/802.3ad: fix slave link
>initialization transition states") merged,
>the slave's link status will be changed to BOND_LINK_FAIL
>from BOND_LINK_DOWN in the following scenario:
>- Driver reports loss of carrier and
> bonding driver receives NETDEV_CHANGE notifier
>- slave's duplex and speed is zerod and
> its port->is_enabled is cleard to 'false';
>- Driver reports link recovery and
> bonding driver receives NETDEV_UP notifier;
>- If speed/duplex getting failed here, the link status
> will be changed to BOND_LINK_FAIL;
>- The MII monotor later recover the slave's speed/duplex
> and set link status to BOND_LINK_UP, but remains
> the 'port->is_enabled' to 'false'.
>
>In this scenario, the lacp port will not be enabled even its speed
>and duplex are valid. The bond will not send LACPDU's, and its
>state is 'AD_STATE_DEFAULTED' forever. The simplest fix I think
>is to force enable lacp after port slave speed check in
>bond_miimon_commit. As enabled, the lacp port can run its state machine
>normally after link recovery.
>
>Signed-off-by: Sha Zhang <zhangsha.zhang@huawei.com>
>---
> drivers/net/bonding/bond_main.c | 8 +++++++-
> 1 file changed, 7 insertions(+), 1 deletion(-)
>
>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>index 931d9d9..379253a 100644
>--- a/drivers/net/bonding/bond_main.c
>+++ b/drivers/net/bonding/bond_main.c
>@@ -2194,6 +2194,7 @@ static void bond_miimon_commit(struct bonding *bond)
> {
> struct list_head *iter;
> struct slave *slave, *primary;
>+ struct port *port;
>
> bond_for_each_slave(bond, slave, iter) {
> switch (slave->new_link) {
>@@ -2205,8 +2206,13 @@ static void bond_miimon_commit(struct bonding *bond)
> * link status
> */
> if (BOND_MODE(bond) == BOND_MODE_8023AD &&
>- slave->link == BOND_LINK_UP)
>+ slave->link == BOND_LINK_UP) {
> bond_3ad_adapter_speed_duplex_changed(slave);
>+ if (slave->duplex == DUPLEX_FULL) {
>+ port = &(SLAVE_AD_INFO(slave)->port);
>+ port->is_enabled = true;
>+ }
>+ }
I don't believe that testing duplex here is correct; is_enabled
is not controlled by duplex, but by carrier state. Duplex does affect
whether or not a port is permitted to aggregate, but that's entirely
separate logic (the AD_PORT_LACP_ENABLED flag).
Would it be better to call bond_3ad_handle_link_change() here,
instead of manually testing duplex and setting is_enabled?
-J
> continue;
>
> case BOND_LINK_UP:
>--
>1.8.3.1
>
---
-Jay Vosburgh, jay.vosburgh@canonical.com
^ permalink raw reply
* [PATCH v2 2/4] bpf: fix 'struct pt_reg' typo in documentation
From: Peter Wu @ 2019-08-20 23:08 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann; +Cc: netdev, bpf
In-Reply-To: <20190820230900.23445-1-peter@lekensteyn.nl>
There is no 'struct pt_reg'.
Signed-off-by: Peter Wu <peter@lekensteyn.nl>
---
include/uapi/linux/bpf.h | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index fa1c753dcdbc..9ca333c3ce91 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1013,7 +1013,7 @@ union bpf_attr {
* The realm of the route for the packet associated to *skb*, or 0
* if none was found.
*
- * int bpf_perf_event_output(struct pt_reg *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ * int bpf_perf_event_output(struct pt_regs *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
* Description
* Write raw *data* blob into a special BPF perf event held by
* *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
@@ -1075,7 +1075,7 @@ union bpf_attr {
* Return
* 0 on success, or a negative error in case of failure.
*
- * int bpf_get_stackid(struct pt_reg *ctx, struct bpf_map *map, u64 flags)
+ * int bpf_get_stackid(struct pt_regs *ctx, struct bpf_map *map, u64 flags)
* Description
* Walk a user or a kernel stack and return its id. To achieve
* this, the helper needs *ctx*, which is a pointer to the context
@@ -1724,7 +1724,7 @@ union bpf_attr {
* Return
* 0 on success, or a negative error in case of failure.
*
- * int bpf_override_return(struct pt_reg *regs, u64 rc)
+ * int bpf_override_return(struct pt_regs *regs, u64 rc)
* Description
* Used for error injection, this helper uses kprobes to override
* the return value of the probed function, and to set it to *rc*.
--
2.22.0
^ permalink raw reply related
* [PATCH v2 4/4] bpf: sync bpf.h to tools/
From: Peter Wu @ 2019-08-20 23:09 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann; +Cc: netdev, bpf
In-Reply-To: <20190820230900.23445-1-peter@lekensteyn.nl>
Fix a 'struct pt_reg' typo and clarify when bpf_trace_printk discards
lines. Affects documentation only.
Signed-off-by: Peter Wu <peter@lekensteyn.nl>
---
tools/include/uapi/linux/bpf.h | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4e455018da65..58bdb89599c9 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -575,6 +575,8 @@ union bpf_attr {
* limited to five).
*
* Each time the helper is called, it appends a line to the trace.
+ * Lines are discarded while *\/sys/kernel/debug/tracing/trace* is
+ * open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this.
* The format of the trace is customizable, and the exact output
* one will get depends on the options set in
* *\/sys/kernel/debug/tracing/trace_options* (see also the
@@ -1013,7 +1015,7 @@ union bpf_attr {
* The realm of the route for the packet associated to *skb*, or 0
* if none was found.
*
- * int bpf_perf_event_output(struct pt_reg *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ * int bpf_perf_event_output(struct pt_regs *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
* Description
* Write raw *data* blob into a special BPF perf event held by
* *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
@@ -1075,7 +1077,7 @@ union bpf_attr {
* Return
* 0 on success, or a negative error in case of failure.
*
- * int bpf_get_stackid(struct pt_reg *ctx, struct bpf_map *map, u64 flags)
+ * int bpf_get_stackid(struct pt_regs *ctx, struct bpf_map *map, u64 flags)
* Description
* Walk a user or a kernel stack and return its id. To achieve
* this, the helper needs *ctx*, which is a pointer to the context
@@ -1721,7 +1723,7 @@ union bpf_attr {
* Return
* 0 on success, or a negative error in case of failure.
*
- * int bpf_override_return(struct pt_reg *regs, u64 rc)
+ * int bpf_override_return(struct pt_regs *regs, u64 rc)
* Description
* Used for error injection, this helper uses kprobes to override
* the return value of the probed function, and to set it to *rc*.
--
2.22.0
^ permalink raw reply related
* [PATCH v2 3/4] bpf: clarify when bpf_trace_printk discards lines
From: Peter Wu @ 2019-08-20 23:08 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann; +Cc: netdev, bpf
In-Reply-To: <20190820230900.23445-1-peter@lekensteyn.nl>
I opened /sys/kernel/tracing/trace once and kept reading from it.
bpf_trace_printk somehow did not seem to work, no entries were appended
to that trace file. It turns out that tracing is disabled when that file
is open. Save the next person some time and document this.
The trace file is described in Documentation/trace/ftrace.rst, however
the implication "tracing is disabled" did not immediately translate to
"bpf_trace_printk silently discards entries".
Signed-off-by: Peter Wu <peter@lekensteyn.nl>
---
include/uapi/linux/bpf.h | 2 ++
1 file changed, 2 insertions(+)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 9ca333c3ce91..e4236e357ed9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -575,6 +575,8 @@ union bpf_attr {
* limited to five).
*
* Each time the helper is called, it appends a line to the trace.
+ * Lines are discarded while *\/sys/kernel/debug/tracing/trace* is
+ * open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this.
* The format of the trace is customizable, and the exact output
* one will get depends on the options set in
* *\/sys/kernel/debug/tracing/trace_options* (see also the
--
2.22.0
^ permalink raw reply related
* [PATCH v2 1/4] bpf: clarify description for CONFIG_BPF_EVENTS
From: Peter Wu @ 2019-08-20 23:08 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann; +Cc: netdev, bpf
In-Reply-To: <20190820230900.23445-1-peter@lekensteyn.nl>
PERF_EVENT_IOC_SET_BPF supports uprobes since v4.3, and tracepoints
since v4.7 via commit 04a22fae4cbc ("tracing, perf: Implement BPF
programs attached to uprobes"), and commit 98b5c2c65c29 ("perf, bpf:
allow bpf programs attach to tracepoints") respectively.
Signed-off-by: Peter Wu <peter@lekensteyn.nl>
---
kernel/trace/Kconfig | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 98da8998c25c..b09d7b1ffffd 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -520,7 +520,8 @@ config BPF_EVENTS
bool
default y
help
- This allows the user to attach BPF programs to kprobe events.
+ This allows the user to attach BPF programs to kprobe, uprobe, and
+ tracepoint events.
config DYNAMIC_EVENTS
def_bool n
--
2.22.0
^ permalink raw reply related
* [PATCH v2 0/4] BPF-related documentation fixes
From: Peter Wu @ 2019-08-20 23:08 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann; +Cc: netdev, bpf
Hi,
Here are some small doc updates that should hopefully save the next
eBPF/uprobe user some time. Based on v5.3-rc2, but net-next appears to
have no conflicts.
Changes since the v1[1]:
- Split bpf.h patch for kernel and userspace tools (requested by Alexei)
- Add new 'bpf: clarify when bpf_trace_printk discards lines' patch.
Kind regards,
Peter
[1]: https://lkml.kernel.org/r/20190819212122.10286-1-peter@lekensteyn.nl
Peter Wu (4):
bpf: clarify description for CONFIG_BPF_EVENTS
bpf: fix 'struct pt_reg' typo in documentation
bpf: clarify when bpf_trace_printk discards lines
bpf: sync bpf.h to tools/
include/uapi/linux/bpf.h | 8 +++++---
kernel/trace/Kconfig | 3 ++-
tools/include/uapi/linux/bpf.h | 8 +++++---
3 files changed, 12 insertions(+), 7 deletions(-)
--
2.22.0
^ permalink raw reply
* various TLS bug fixes...
From: David Miller @ 2019-08-20 23:05 UTC (permalink / raw)
To: netdev; +Cc: jakub.kicinski
Jakub,
I just did a batch of networking -stable submissions, however I ran
into some troubles with the various TLS backports.
I was able to backport commit 414776621d10 ("net/tls: prevent
skb_orphan() from leaking TLS plain text with offload") to v5.2
but not to v4.19.
I was able to backport neither d85f01775850 ("net: tls, fix
sk_write_space NULL write when tx disabled") nor commit 57c722e932cf
("net/tls: swap sk_write_space on close") to any release. It seems
like there are a bunch of dependencies and perhaps other fixes.
I suspect you've triaged through this already on your side for other
reasons, so perhaps you could help come up with a sane set of TLS
bug fix backports that would be appropriate for -stable?
Thanks!
^ permalink raw reply
* RE: [PATCH net-next,v2 2/6] PCI: hv: Add a Hyper-V PCI interface driver for software backchannel interface
From: Haiyang Zhang @ 2019-08-20 23:00 UTC (permalink / raw)
To: David Miller
Cc: sashal@kernel.org, saeedm@mellanox.com, leon@kernel.org,
eranbe@mellanox.com, lorenzo.pieralisi@arm.com,
bhelgaas@google.com, linux-pci@vger.kernel.org,
linux-hyperv@vger.kernel.org, netdev@vger.kernel.org,
KY Srinivasan, Stephen Hemminger, linux-kernel@vger.kernel.org
In-Reply-To: <20190820.122925.1080288470348205792.davem@davemloft.net>
> -----Original Message-----
> From: David Miller <davem@davemloft.net>
> Sent: Tuesday, August 20, 2019 3:29 PM
> To: Haiyang Zhang <haiyangz@microsoft.com>
> Cc: sashal@kernel.org; saeedm@mellanox.com; leon@kernel.org;
> eranbe@mellanox.com; lorenzo.pieralisi@arm.com; bhelgaas@google.com;
> linux-pci@vger.kernel.org; linux-hyperv@vger.kernel.org;
> netdev@vger.kernel.org; KY Srinivasan <kys@microsoft.com>; Stephen
> Hemminger <sthemmin@microsoft.com>; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH net-next,v2 2/6] PCI: hv: Add a Hyper-V PCI interface
> driver for software backchannel interface
>
> From: Haiyang Zhang <haiyangz@microsoft.com>
> Date: Mon, 19 Aug 2019 19:30:47 +0000
>
> > +static void __exit exit_hv_pci_intf(void) {
> > + pr_info("unloaded\n");
> > +}
> > +
> > +static int __init init_hv_pci_intf(void) {
> > + pr_info("loaded\n");
> > +
>
> Clogging up the logs with useless messages like this is inappropriate.
> Please remove these pr_info() calls.
>
> Also, all of these symbols should probably be GPL exported.
I will update the patch -- remove the pr_info, and use EXPORT_SYMBOL_GPL()
for the symbols.
Thanks,
- Haiyang
^ permalink raw reply
* Re: [PATCH net-next 4/6] net: dsa: Don't program the VLAN as pvid on the upstream port
From: Florian Fainelli @ 2019-08-20 22:43 UTC (permalink / raw)
To: Vladimir Oltean
Cc: Vivien Didelot, Andrew Lunn, Ido Schimmel, Roopa Prabhu, nikolay,
David S. Miller, netdev
In-Reply-To: <CA+h21hpCP2KpTnCuki1M6tkQ1Qv-ex5MfKHbwQXsqotoh3ndKw@mail.gmail.com>
On 8/20/19 5:09 AM, Vladimir Oltean wrote:
> Hi Florian,
>
> On Tue, 20 Aug 2019 at 06:15, Florian Fainelli <f.fainelli@gmail.com> wrote:
>>
>>
>>
>> On 8/19/2019 5:00 PM, Vladimir Oltean wrote:
>>> Commit b2f81d304cee ("net: dsa: add CPU and DSA ports as VLAN members")
>>> programs the VLAN from the bridge into the specified port as well as the
>>> upstream port, with the same set of flags.
>>>
>>> Consider the typical case of installing pvid 1 on user port 1, pvid 2 on
>>> user port 2, etc. The upstream port would end up having a pvid equal to
>>> the last user port whose pvid was programmed from the bridge. Less than
>>> useful.
>>>
>>> So just don't change the pvid of the upstream port and let it be
>>> whatever the driver set it internally to be.
>>
>> This patch should allow removing the !dsa_is_cpu_port() checks from
>> b53_common.c:b53_vlan_add, about time :)
>>
>> It seems to me that the fundamental issue here is that because we do not
>> have a user visible network device that 1:1 maps with the CPU (or DSA)
>> ports for that matter (and for valid reasons, they would represent two
>> ends of the same pipe), we do not have a good way to control the CPU
>> port VLAN attributes.
>>
>> There was a prior attempt at allowing using the bridge master device to
>> program the CPU port's VLAN attributes, see [1], but I did not follow up
>> with that until [2] and then life caught me. If you can/want, that would
>> be great (not asking for TPS reports).
>>
>> [1]:
>> https://lists.linuxfoundation.org/pipermail/bridge/2016-November/010112.html
>> [2]:
>> https://lore.kernel.org/lkml/20180624153339.13572-1-f.fainelli@gmail.com/T/
>>
>
> So what was the conclusion of that discussion? Should you or should
> you not add the check for vlan->flags & BRIDGE_VLAN_INFO_BRENTRY?
I was not able to test that change, and got distracted for months
(years?) doing "other stuff" that is not DSA related.
> I don't exactly handle the meaning of 'master' and 'self' options from
> a user perspective.
> Right now (no patches applied) I get the following behavior in DSA
> (swp2 is already member of br0):
>
> $ echo 1 | sudo tee /sys/class/net/br0/bridge/vlan_filtering
> $ sudo bridge vlan add vid 100 dev swp2
> $ sudo bridge vlan add vid 101 dev swp2 self
> RTNETLINK answers: Operation not supported
> $ sudo bridge vlan add vid 102 dev swp2 master
> $ sudo bridge vlan add vid 103 dev br0
> RTNETLINK answers: Operation not supported
> $ sudo bridge vlan add vid 104 dev br0 self
> $ sudo bridge vlan add vid 105 dev br0 master
> RTNETLINK answers: Operation not supported
>
> $ bridge vlan
> port vlan ids
> eth0 1 PVID Egress Untagged
>
> swp5 1 PVID Egress Untagged
>
> swp2 1 PVID Egress Untagged
> 100
> 102
>
> swp3 1 PVID Egress Untagged
>
> swp4 1 PVID Egress Untagged
>
> br0 1 PVID Egress Untagged
> 104
>
> Who returns EOPNOTSUPP for VID 101 and why?
> Why is VID 102 not installed in br0? This part I don't understand from
> your patchset. Does it mean that the CPU port (br0) will have to be
> explicitly configured from now on, even if I run the commands on swp2
> with 'master'?
This does not really answer your questions, but maybe let's agree on the
user visible behavior. My expectations would be the following should be
happening with this patch applied:
- when the VLAN is created for the first time and is configured on either the
bridge master device (br0) or an user port (swp2), it gets programmed
into the switch for the CPU port and respectively CPU port and swp2 port
- when you change the bridge master device VLAN attributes, or
add/delete a new one, the programming targets only the CPU port with the
proper operation
That way, there would be no change in how the initial VLAN programming
is done, in that the CPU port still gets programmed, but later on you
could change it, e.g. if you want your CPU port to be untagged rather
than tagged in a particular VLAN.
My upcoming weeks don't look great in terms of resuming active or semi
active DSA work, but working with the DSA mock-up driver might be an
option to avoid spending too much time testing on real HW.
--
Florian
^ permalink raw reply
* Re: [PATCH 2/6] dt-bindings: net: sun8i-a83t-emac: Add phy-io-supply property
From: Ondřej Jirman @ 2019-08-20 22:36 UTC (permalink / raw)
To: Rob Herring
Cc: David S. Miller, Mark Rutland, Maxime Ripard, Chen-Yu Tsai,
Giuseppe Cavallaro, Alexandre Torgue, Jose Abreu, Maxime Coquelin,
netdev, devicetree,
moderated list:ARM/FREESCALE IMX / MXC ARM ARCHITECTURE,
linux-kernel@vger.kernel.org, linux-stm32
In-Reply-To: <CAL_JsqJHNL91KMAP5ya97eiyTypGniCJ+tbP=NchPJK502i5FQ@mail.gmail.com>
On Tue, Aug 20, 2019 at 11:57:06AM -0500, Rob Herring wrote:
> On Tue, Aug 20, 2019 at 11:34 AM Ondřej Jirman <megous@megous.com> wrote:
> >
> > On Tue, Aug 20, 2019 at 11:20:22AM -0500, Rob Herring wrote:
> > > On Tue, Aug 20, 2019 at 9:53 AM <megous@megous.com> wrote:
> > > >
> > > > From: Ondrej Jirman <megous@megous.com>
> > > >
> > > > Some PHYs require separate power supply for I/O pins in some modes
> > > > of operation. Add phy-io-supply property, to allow enabling this
> > > > power supply.
> > >
> > > Perhaps since this is new, such phys should have *-supply in their nodes.
> >
> > Yes, I just don't understand, since external ethernet phys are so common,
> > and they require power, how there's no fairly generic mechanism for this
> > already in the PHY subsystem, or somewhere?
>
> Because generic mechanisms for this don't work. For example, what
> happens when the 2 supplies need to be turned on in a certain order
> and with certain timings? And then add in reset or control lines into
> the mix... You can see in the bindings we already have some of that.
I've looked at the emac bindings that have phy-supply, and don't see a reason
why this can't be generic for the phy. Just like there's generic reset
properties for phys, now. Some bindings, like fsl-fec.txt even list
custom reset properties for phy as deprecated, and recommend using
generic ones.
From the point of view of the emac driver, it just wants to power on/power
off the phy, and wait until it's ready to be communicated with.
It's probably better to have power supplies of the phy covered by generic
phy code, because then you don't have to duplicate all this special power
up logic in every emac driver, whenever a HW designer decides to combine
such emac with external phy that requires some special handling on powerup.
At the moment, this lack of flexibility is hacked around by adding multiple
regulators to the DTS, and making them dependent on each other (even if one
doesn't supply the other), just because this makes the regulator core driver
enable them all. Power up delays for the PHY are described as enable-ramp-delays
on the regulators (actual regulator ramp delay + wait time for PHY to initialize).
Basically just hacking the DT so that the Linux kernel in the end does what's
necessary, instead of DT describing the actual HW.
Adding a single supply property to the phy node, as you suggest will do nothing
to help this situation. It will just result in a more complicated dwmac-sun8i
driver and will not help anyone in the future.
So I think, maybe phy powerup should be moved to generic code, just like the
phy reset code was. Generic code can have multiple supplies and some generic
way to specify power up order and timings.
But I guess, this patch series is a dead end.
> > It looks like other ethernet mac drivers also implement supplies on phys
> > on the EMAC nodes. Just grep phy-supply through dt-bindings/net.
> >
> > Historical reasons, or am I missing something? It almost seems like I must
> > be missing something, since putting these properties to phy nodes
> > seems so obvious.
>
> Things get added one by one and one new property isn't that
> controversial. We've generally learned the lesson and avoid this
> pattern now, but ethernet phys are one of the older bindings.
Understood. So maybe the solution suggested above would improve the situation
eventually?
regards,
o.
> Rob
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
^ permalink raw reply
* [PATCH 08/38] nfp: Convert to XArray
From: Matthew Wilcox @ 2019-08-20 22:32 UTC (permalink / raw)
To: netdev; +Cc: Matthew Wilcox (Oracle)
In-Reply-To: <20190820223259.22348-1-willy@infradead.org>
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
A minor change in semantics where we simply store into the XArray rather
than insert; this only matters if there could already be something stored
at that index, and from my reading of the code that can't happen.
Use xa_for_each() rather than xas_for_each() as none of these loops
appear to be performance-critical.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
drivers/net/ethernet/netronome/nfp/abm/main.c | 4 +--
drivers/net/ethernet/netronome/nfp/abm/main.h | 4 +--
.../net/ethernet/netronome/nfp/abm/qdisc.c | 33 +++++++------------
3 files changed, 15 insertions(+), 26 deletions(-)
diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.c b/drivers/net/ethernet/netronome/nfp/abm/main.c
index 9183b3e85d21..2a06a3012e39 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.c
@@ -345,7 +345,7 @@ nfp_abm_vnic_alloc(struct nfp_app *app, struct nfp_net *nn, unsigned int id)
netif_keep_dst(nn->dp.netdev);
nfp_abm_vnic_set_mac(app->pf, abm, nn, id);
- INIT_RADIX_TREE(&alink->qdiscs, GFP_KERNEL);
+ xa_init(&alink->qdiscs);
return 0;
@@ -361,7 +361,7 @@ static void nfp_abm_vnic_free(struct nfp_app *app, struct nfp_net *nn)
struct nfp_abm_link *alink = nn->app_priv;
nfp_abm_kill_reprs(alink->abm, alink);
- WARN(!radix_tree_empty(&alink->qdiscs), "left over qdiscs\n");
+ WARN(!xa_empty(&alink->qdiscs), "left over qdiscs\n");
kfree(alink->prio_map);
kfree(alink);
}
diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.h b/drivers/net/ethernet/netronome/nfp/abm/main.h
index 48746c9c6224..2b78abe606d9 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.h
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.h
@@ -6,7 +6,7 @@
#include <linux/bits.h>
#include <linux/list.h>
-#include <linux/radix-tree.h>
+#include <linux/xarray.h>
#include <net/devlink.h>
#include <net/pkt_cls.h>
#include <net/pkt_sched.h>
@@ -219,7 +219,7 @@ struct nfp_abm_link {
struct list_head dscp_map;
struct nfp_qdisc *root_qdisc;
- struct radix_tree_root qdiscs;
+ struct xarray qdiscs;
};
static inline bool nfp_abm_has_prio(struct nfp_abm *abm)
diff --git a/drivers/net/ethernet/netronome/nfp/abm/qdisc.c b/drivers/net/ethernet/netronome/nfp/abm/qdisc.c
index 2473fb5f75e5..b40ee2f5e1c1 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/qdisc.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/qdisc.c
@@ -24,11 +24,6 @@ static bool nfp_abm_qdisc_child_valid(struct nfp_qdisc *qdisc, unsigned int id)
qdisc->children[id] != NFP_QDISC_UNTRACKED;
}
-static void *nfp_abm_qdisc_tree_deref_slot(void __rcu **slot)
-{
- return rtnl_dereference(*slot);
-}
-
static void
nfp_abm_stats_propagate(struct nfp_alink_stats *parent,
struct nfp_alink_stats *child)
@@ -245,10 +240,8 @@ nfp_abm_offload_compile_mq(struct nfp_abm_link *alink, struct nfp_qdisc *qdisc)
void nfp_abm_qdisc_offload_update(struct nfp_abm_link *alink)
{
struct nfp_abm *abm = alink->abm;
- struct radix_tree_iter iter;
struct nfp_qdisc *qdisc;
- void __rcu **slot;
- size_t i;
+ unsigned long i;
/* Mark all thresholds as unconfigured */
for (i = 0; i < abm->num_bands; i++)
@@ -257,17 +250,14 @@ void nfp_abm_qdisc_offload_update(struct nfp_abm_link *alink)
alink->total_queues);
/* Clear offload marks */
- radix_tree_for_each_slot(slot, &alink->qdiscs, &iter, 0) {
- qdisc = nfp_abm_qdisc_tree_deref_slot(slot);
+ xa_for_each(&alink->qdiscs, i, qdisc)
qdisc->offload_mark = false;
- }
if (alink->root_qdisc)
nfp_abm_offload_compile_mq(alink, alink->root_qdisc);
/* Refresh offload status */
- radix_tree_for_each_slot(slot, &alink->qdiscs, &iter, 0) {
- qdisc = nfp_abm_qdisc_tree_deref_slot(slot);
+ xa_for_each(&alink->qdiscs, i, qdisc) {
if (!qdisc->offload_mark && qdisc->offloaded)
nfp_abm_qdisc_offload_stop(alink, qdisc);
qdisc->offloaded = qdisc->offload_mark;
@@ -285,9 +275,9 @@ static void
nfp_abm_qdisc_clear_mq(struct net_device *netdev, struct nfp_abm_link *alink,
struct nfp_qdisc *qdisc)
{
- struct radix_tree_iter iter;
unsigned int mq_refs = 0;
- void __rcu **slot;
+ unsigned long index;
+ struct nfp_qdisc *mq;
if (!qdisc->use_cnt)
return;
@@ -300,8 +290,7 @@ nfp_abm_qdisc_clear_mq(struct net_device *netdev, struct nfp_abm_link *alink,
return;
/* Count refs held by MQ instances and clear pointers */
- radix_tree_for_each_slot(slot, &alink->qdiscs, &iter, 0) {
- struct nfp_qdisc *mq = nfp_abm_qdisc_tree_deref_slot(slot);
+ xa_for_each(&alink->qdiscs, index, mq) {
unsigned int i;
if (mq->type != NFP_QDISC_MQ || mq->netdev != netdev)
@@ -326,8 +315,7 @@ nfp_abm_qdisc_free(struct net_device *netdev, struct nfp_abm_link *alink,
if (!qdisc)
return;
nfp_abm_qdisc_clear_mq(netdev, alink, qdisc);
- WARN_ON(radix_tree_delete(&alink->qdiscs,
- TC_H_MAJ(qdisc->handle)) != qdisc);
+ WARN_ON(xa_erase(&alink->qdiscs, TC_H_MAJ(qdisc->handle)) != qdisc);
kfree(qdisc->children);
kfree(qdisc);
@@ -360,10 +348,11 @@ nfp_abm_qdisc_alloc(struct net_device *netdev, struct nfp_abm_link *alink,
qdisc->handle = handle;
qdisc->num_children = children;
- err = radix_tree_insert(&alink->qdiscs, TC_H_MAJ(qdisc->handle), qdisc);
+ err = xa_err(xa_store(&alink->qdiscs, TC_H_MAJ(qdisc->handle), qdisc,
+ GFP_KERNEL));
if (err) {
nfp_err(alink->abm->app->cpp,
- "Qdisc insertion into radix tree failed: %d\n", err);
+ "Qdisc insertion failed: %d\n", err);
goto err_free_child_tbl;
}
@@ -380,7 +369,7 @@ nfp_abm_qdisc_alloc(struct net_device *netdev, struct nfp_abm_link *alink,
static struct nfp_qdisc *
nfp_abm_qdisc_find(struct nfp_abm_link *alink, u32 handle)
{
- return radix_tree_lookup(&alink->qdiscs, TC_H_MAJ(handle));
+ return xa_load(&alink->qdiscs, TC_H_MAJ(handle));
}
static int
--
2.23.0.rc1
^ permalink raw reply related
* [PATCH 15/38] nfp: Convert internal ports to XArray
From: Matthew Wilcox @ 2019-08-20 22:32 UTC (permalink / raw)
To: netdev; +Cc: Matthew Wilcox (Oracle)
In-Reply-To: <20190820223259.22348-1-willy@infradead.org>
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Since nfp_fl_internal_ports was only an IDR and the lock to protect it,
replace the entire data structure with an XArray (which has an embedded
lock).
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
.../net/ethernet/netronome/nfp/flower/main.c | 44 +++++++------------
.../net/ethernet/netronome/nfp/flower/main.h | 12 +----
2 files changed, 17 insertions(+), 39 deletions(-)
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c
index 7a20447cca19..706ae41645f5 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.c
@@ -40,35 +40,31 @@ nfp_flower_lookup_internal_port_id(struct nfp_flower_priv *priv,
struct net_device *netdev)
{
struct net_device *entry;
- int i, id = 0;
+ unsigned long i;
- rcu_read_lock();
- idr_for_each_entry(&priv->internal_ports.port_ids, entry, i)
- if (entry == netdev) {
- id = i;
- break;
- }
- rcu_read_unlock();
+ xa_for_each(&priv->internal_ports, i, entry) {
+ if (entry == netdev)
+ return i;
+ }
- return id;
+ return 0;
}
static int
nfp_flower_get_internal_port_id(struct nfp_app *app, struct net_device *netdev)
{
struct nfp_flower_priv *priv = app->priv;
- int id;
+ int err, id;
id = nfp_flower_lookup_internal_port_id(priv, netdev);
if (id > 0)
return id;
- idr_preload(GFP_ATOMIC);
- spin_lock_bh(&priv->internal_ports.lock);
- id = idr_alloc(&priv->internal_ports.port_ids, netdev,
- NFP_MIN_INT_PORT_ID, NFP_MAX_INT_PORT_ID, GFP_ATOMIC);
- spin_unlock_bh(&priv->internal_ports.lock);
- idr_preload_end();
+ err = xa_alloc_bh(&priv->internal_ports, &id, netdev,
+ XA_LIMIT(NFP_MIN_INT_PORT_ID, NFP_MAX_INT_PORT_ID),
+ GFP_ATOMIC);
+ if (err < 0)
+ return err;
return id;
}
@@ -95,13 +91,8 @@ static struct net_device *
nfp_flower_get_netdev_from_internal_port_id(struct nfp_app *app, int port_id)
{
struct nfp_flower_priv *priv = app->priv;
- struct net_device *netdev;
-
- rcu_read_lock();
- netdev = idr_find(&priv->internal_ports.port_ids, port_id);
- rcu_read_unlock();
- return netdev;
+ return xa_load(&priv->internal_ports, port_id);
}
static void
@@ -114,9 +105,7 @@ nfp_flower_free_internal_port_id(struct nfp_app *app, struct net_device *netdev)
if (!id)
return;
- spin_lock_bh(&priv->internal_ports.lock);
- idr_remove(&priv->internal_ports.port_ids, id);
- spin_unlock_bh(&priv->internal_ports.lock);
+ xa_erase_bh(&priv->internal_ports, id);
}
static int
@@ -133,13 +122,12 @@ nfp_flower_internal_port_event_handler(struct nfp_app *app,
static void nfp_flower_internal_port_init(struct nfp_flower_priv *priv)
{
- spin_lock_init(&priv->internal_ports.lock);
- idr_init(&priv->internal_ports.port_ids);
+ xa_init_flags(&priv->internal_ports, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_BH);
}
static void nfp_flower_internal_port_cleanup(struct nfp_flower_priv *priv)
{
- idr_destroy(&priv->internal_ports.port_ids);
+ xa_destroy(&priv->internal_ports);
}
static struct nfp_flower_non_repr_priv *
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h
index 31d94592a7c0..735e995ae740 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.h
@@ -119,16 +119,6 @@ struct nfp_fl_lag {
struct sk_buff_head retrans_skbs;
};
-/**
- * struct nfp_fl_internal_ports - Flower APP priv data for additional ports
- * @port_ids: Assignment of ids to any additional ports
- * @lock: Lock for extra ports list
- */
-struct nfp_fl_internal_ports {
- struct idr port_ids;
- spinlock_t lock;
-};
-
/**
* struct nfp_flower_priv - Flower APP per-vNIC priv data
* @app: Back pointer to app
@@ -191,7 +181,7 @@ struct nfp_flower_priv {
struct list_head non_repr_priv;
unsigned int active_mem_unit;
unsigned int total_mem_units;
- struct nfp_fl_internal_ports internal_ports;
+ struct xarray internal_ports;
struct delayed_work qos_stats_work;
unsigned int qos_rate_limiters;
spinlock_t qos_stats_lock; /* Protect the qos stats */
--
2.23.0.rc1
^ permalink raw reply related
* [PATCH 03/38] mlx4: Convert qp_table_tree to XArray
From: Matthew Wilcox @ 2019-08-20 22:32 UTC (permalink / raw)
To: netdev; +Cc: Matthew Wilcox (Oracle)
In-Reply-To: <20190820223259.22348-1-willy@infradead.org>
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
This XArray appears to be modifiable from interrupt context, so we have
to be a little more careful with the locking. However, the lookup can
be done without the spinlock held. I cannot determine whether
mlx4_qp_alloc() is allowed to sleep, so I've retained the GFP_ATOMIC
there, but it could be turned into GFP_KERNEL if the callers can
tolerate it sleeping.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
drivers/net/ethernet/mellanox/mlx4/mlx4.h | 3 +-
drivers/net/ethernet/mellanox/mlx4/qp.c | 37 ++++++-----------------
include/linux/mlx4/device.h | 4 +--
include/linux/mlx4/qp.h | 2 +-
4 files changed, 14 insertions(+), 32 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index b6fe22bee9f4..aaece8480da7 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -38,7 +38,7 @@
#define MLX4_H
#include <linux/mutex.h>
-#include <linux/radix-tree.h>
+#include <linux/xarray.h>
#include <linux/rbtree.h>
#include <linux/timer.h>
#include <linux/semaphore.h>
@@ -716,7 +716,6 @@ struct mlx4_qp_table {
u32 zones_uids[MLX4_QP_TABLE_ZONE_NUM];
u32 rdmarc_base;
int rdmarc_shift;
- spinlock_t lock;
struct mlx4_icm_table qp_table;
struct mlx4_icm_table auxc_table;
struct mlx4_icm_table altc_table;
diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c
index 427e7a31862c..4659ecec12c1 100644
--- a/drivers/net/ethernet/mellanox/mlx4/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx4/qp.c
@@ -48,16 +48,13 @@
void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type)
{
- struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
struct mlx4_qp *qp;
- spin_lock(&qp_table->lock);
-
+ xa_lock(&dev->qp_table);
qp = __mlx4_qp_lookup(dev, qpn);
if (qp)
refcount_inc(&qp->refcount);
-
- spin_unlock(&qp_table->lock);
+ xa_unlock(&dev->qp_table);
if (!qp) {
mlx4_dbg(dev, "Async event for none existent QP %08x\n", qpn);
@@ -390,21 +387,11 @@ static void mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn)
struct mlx4_qp *mlx4_qp_lookup(struct mlx4_dev *dev, u32 qpn)
{
- struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
- struct mlx4_qp *qp;
-
- spin_lock_irq(&qp_table->lock);
-
- qp = __mlx4_qp_lookup(dev, qpn);
-
- spin_unlock_irq(&qp_table->lock);
- return qp;
+ return __mlx4_qp_lookup(dev, qpn);
}
int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp)
{
- struct mlx4_priv *priv = mlx4_priv(dev);
- struct mlx4_qp_table *qp_table = &priv->qp_table;
int err;
if (!qpn)
@@ -416,10 +403,9 @@ int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp)
if (err)
return err;
- spin_lock_irq(&qp_table->lock);
- err = radix_tree_insert(&dev->qp_table_tree, qp->qpn &
- (dev->caps.num_qps - 1), qp);
- spin_unlock_irq(&qp_table->lock);
+ err = xa_err(xa_store_irq(&dev->qp_table,
+ qp->qpn & (dev->caps.num_qps - 1),
+ qp, GFP_ATOMIC));
if (err)
goto err_icm;
@@ -512,12 +498,11 @@ EXPORT_SYMBOL_GPL(mlx4_update_qp);
void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp)
{
- struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
unsigned long flags;
- spin_lock_irqsave(&qp_table->lock, flags);
- radix_tree_delete(&dev->qp_table_tree, qp->qpn & (dev->caps.num_qps - 1));
- spin_unlock_irqrestore(&qp_table->lock, flags);
+ xa_lock_irqsave(&dev->qp_table, flags);
+ __xa_erase(&dev->qp_table, qp->qpn & (dev->caps.num_qps - 1));
+ xa_unlock_irqrestore(&dev->qp_table, flags);
}
EXPORT_SYMBOL_GPL(mlx4_qp_remove);
@@ -760,7 +745,6 @@ static void mlx4_cleanup_qp_zones(struct mlx4_dev *dev)
int mlx4_init_qp_table(struct mlx4_dev *dev)
{
- struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
int err;
int reserved_from_top = 0;
int reserved_from_bot;
@@ -770,8 +754,7 @@ int mlx4_init_qp_table(struct mlx4_dev *dev)
u32 max_table_offset = dev->caps.dmfs_high_rate_qpn_base +
dev->caps.dmfs_high_rate_qpn_range;
- spin_lock_init(&qp_table->lock);
- INIT_RADIX_TREE(&dev->qp_table_tree, GFP_ATOMIC);
+ xa_init_flags(&dev->qp_table, XA_FLAGS_LOCK_IRQ);
if (mlx4_is_slave(dev))
return 0;
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 36e412c3d657..acffca7d9f00 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -36,7 +36,7 @@
#include <linux/if_ether.h>
#include <linux/pci.h>
#include <linux/completion.h>
-#include <linux/radix-tree.h>
+#include <linux/xarray.h>
#include <linux/cpu_rmap.h>
#include <linux/crash_dump.h>
@@ -889,7 +889,7 @@ struct mlx4_dev {
struct mlx4_caps caps;
struct mlx4_phys_caps phys_caps;
struct mlx4_quotas quotas;
- struct radix_tree_root qp_table_tree;
+ struct xarray qp_table;
u8 rev_id;
u8 port_random_macs;
char board_id[MLX4_BOARD_ID_LEN];
diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h
index 8e2828d48d7f..6c3ec3197a10 100644
--- a/include/linux/mlx4/qp.h
+++ b/include/linux/mlx4/qp.h
@@ -488,7 +488,7 @@ int mlx4_qp_to_ready(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
static inline struct mlx4_qp *__mlx4_qp_lookup(struct mlx4_dev *dev, u32 qpn)
{
- return radix_tree_lookup(&dev->qp_table_tree, qpn & (dev->caps.num_qps - 1));
+ return xa_load(&dev->qp_table, qpn & (dev->caps.num_qps - 1));
}
void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp);
--
2.23.0.rc1
^ permalink raw reply related
* [PATCH 06/38] mlx5: Convert counters_idr to XArray
From: Matthew Wilcox @ 2019-08-20 22:32 UTC (permalink / raw)
To: netdev; +Cc: Matthew Wilcox (Oracle)
In-Reply-To: <20190820223259.22348-1-willy@infradead.org>
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
This IDR wasn't using the allocation functionality, so convert it to a
plain XArray. I also suspect it could be used to replace the list_head
'counters', but I'm not willing to do that work right now.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
.../ethernet/mellanox/mlx5/core/fs_counters.c | 31 +++++--------------
include/linux/mlx5/driver.h | 3 +-
2 files changed, 9 insertions(+), 25 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
index 1804cf3c3814..5ee20d285c5e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
@@ -108,18 +108,14 @@ static struct list_head *mlx5_fc_counters_lookup_next(struct mlx5_core_dev *dev,
u32 id)
{
struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
- unsigned long next_id = (unsigned long)id + 1;
struct mlx5_fc *counter;
- unsigned long tmp;
+ unsigned long next_id;
- rcu_read_lock();
- /* skip counters that are in idr, but not yet in counters list */
- idr_for_each_entry_continue_ul(&fc_stats->counters_idr,
- counter, tmp, next_id) {
+ /* skip counters that are not yet in counters list */
+ xa_for_each_start(&fc_stats->counters_xa, next_id, counter, id + 1) {
if (!list_empty(&counter->list))
break;
}
- rcu_read_unlock();
return counter ? &counter->list : &fc_stats->counters;
}
@@ -139,9 +135,7 @@ static void mlx5_fc_stats_remove(struct mlx5_core_dev *dev,
list_del(&counter->list);
- spin_lock(&fc_stats->counters_idr_lock);
- WARN_ON(!idr_remove(&fc_stats->counters_idr, counter->id));
- spin_unlock(&fc_stats->counters_idr_lock);
+ WARN_ON(!xa_erase(&fc_stats->counters_xa, counter->id));
}
static int get_max_bulk_query_len(struct mlx5_core_dev *dev)
@@ -309,20 +303,12 @@ struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging)
counter->aging = aging;
if (aging) {
- u32 id = counter->id;
-
counter->cache.lastuse = jiffies;
counter->lastbytes = counter->cache.bytes;
counter->lastpackets = counter->cache.packets;
- idr_preload(GFP_KERNEL);
- spin_lock(&fc_stats->counters_idr_lock);
-
- err = idr_alloc_u32(&fc_stats->counters_idr, counter, &id, id,
- GFP_NOWAIT);
-
- spin_unlock(&fc_stats->counters_idr_lock);
- idr_preload_end();
+ err = xa_insert(&fc_stats->counters_xa, counter->id, counter,
+ GFP_KERNEL);
if (err)
goto err_out_alloc;
@@ -368,8 +354,7 @@ int mlx5_init_fc_stats(struct mlx5_core_dev *dev)
int max_bulk_len;
int max_out_len;
- spin_lock_init(&fc_stats->counters_idr_lock);
- idr_init(&fc_stats->counters_idr);
+ xa_init(&fc_stats->counters_xa);
INIT_LIST_HEAD(&fc_stats->counters);
init_llist_head(&fc_stats->addlist);
init_llist_head(&fc_stats->dellist);
@@ -409,7 +394,7 @@ void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev)
kfree(fc_stats->bulk_query_out);
- idr_destroy(&fc_stats->counters_idr);
+ xa_destroy(&fc_stats->counters_xa);
tmplist = llist_del_all(&fc_stats->addlist);
llist_for_each_entry_safe(counter, tmp, tmplist, addlist)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index ba8f59b11920..b8b66cdb8357 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -477,8 +477,7 @@ struct mlx5_fc_pool {
};
struct mlx5_fc_stats {
- spinlock_t counters_idr_lock; /* protects counters_idr */
- struct idr counters_idr;
+ struct xarray counters_xa;
struct list_head counters;
struct llist_head addlist;
struct llist_head dellist;
--
2.23.0.rc1
^ permalink raw reply related
* [PATCH 04/38] mlx5: Convert cq_table to XArray
From: Matthew Wilcox @ 2019-08-20 22:32 UTC (permalink / raw)
To: netdev; +Cc: Matthew Wilcox (Oracle)
In-Reply-To: <20190820223259.22348-1-willy@infradead.org>
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Since mlx5_cq_table would have shrunk down to just the xarray, eliminate
it and embed the xarray directly into mlx5_eq.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
drivers/net/ethernet/mellanox/mlx5/core/eq.c | 27 ++++---------------
.../net/ethernet/mellanox/mlx5/core/lib/eq.h | 7 +----
2 files changed, 6 insertions(+), 28 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 09d4c64b6e73..c5953f6e0a69 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -113,11 +113,10 @@ static int mlx5_cmd_destroy_eq(struct mlx5_core_dev *dev, u8 eqn)
/* caller must eventually call mlx5_cq_put on the returned cq */
static struct mlx5_core_cq *mlx5_eq_cq_get(struct mlx5_eq *eq, u32 cqn)
{
- struct mlx5_cq_table *table = &eq->cq_table;
- struct mlx5_core_cq *cq = NULL;
+ struct mlx5_core_cq *cq;
rcu_read_lock();
- cq = radix_tree_lookup(&table->tree, cqn);
+ cq = xa_load(&eq->cq_table, cqn);
if (likely(cq))
mlx5_cq_hold(cq);
rcu_read_unlock();
@@ -243,7 +242,6 @@ static int
create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
struct mlx5_eq_param *param)
{
- struct mlx5_cq_table *cq_table = &eq->cq_table;
u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {0};
struct mlx5_priv *priv = &dev->priv;
u8 vecidx = param->irq_index;
@@ -254,11 +252,7 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
int err;
int i;
- /* Init CQ table */
- memset(cq_table, 0, sizeof(*cq_table));
- spin_lock_init(&cq_table->lock);
- INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC);
-
+ xa_init_flags(&eq->cq_table, XA_FLAGS_LOCK_IRQ);
eq->nent = roundup_pow_of_two(param->nent + MLX5_NUM_SPARE_EQE);
eq->cons_index = 0;
err = mlx5_buf_alloc(dev, eq->nent * MLX5_EQE_SIZE, &eq->buf);
@@ -378,25 +372,14 @@ static int destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
int mlx5_eq_add_cq(struct mlx5_eq *eq, struct mlx5_core_cq *cq)
{
- struct mlx5_cq_table *table = &eq->cq_table;
- int err;
-
- spin_lock(&table->lock);
- err = radix_tree_insert(&table->tree, cq->cqn, cq);
- spin_unlock(&table->lock);
-
- return err;
+ return xa_err(xa_store(&eq->cq_table, cq->cqn, cq, GFP_KERNEL));
}
void mlx5_eq_del_cq(struct mlx5_eq *eq, struct mlx5_core_cq *cq)
{
- struct mlx5_cq_table *table = &eq->cq_table;
struct mlx5_core_cq *tmp;
- spin_lock(&table->lock);
- tmp = radix_tree_delete(&table->tree, cq->cqn);
- spin_unlock(&table->lock);
-
+ tmp = xa_erase(&eq->cq_table, cq->cqn);
if (!tmp) {
mlx5_core_dbg(eq->dev, "cq 0x%x not found in eq 0x%x tree\n",
eq->eqn, cq->cqn);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
index 4be4d2d36218..a342cf78120e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
@@ -16,14 +16,9 @@ struct mlx5_eq_tasklet {
spinlock_t lock; /* lock completion tasklet list */
};
-struct mlx5_cq_table {
- spinlock_t lock; /* protect radix tree */
- struct radix_tree_root tree;
-};
-
struct mlx5_eq {
struct mlx5_core_dev *dev;
- struct mlx5_cq_table cq_table;
+ struct xarray cq_table;
__be32 __iomem *doorbell;
u32 cons_index;
struct mlx5_frag_buf buf;
--
2.23.0.rc1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.